hoglah 0.2.2__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
hoglah/__init__.py ADDED
@@ -0,0 +1,33 @@
1
+ """Hoglah — lightweight local-first Ollama job queue manager.
2
+
3
+ A simple, persistent job queue and orchestration layer for running LLM
4
+ inference (via Ollama) in resource-constrained environments.
5
+
6
+ See docs/requirements-v1.0.md and the README for the full specification.
7
+ """
8
+
9
+ from importlib.metadata import PackageNotFoundError, version as _pkg_version
10
+
11
+ from .adapters import BaseAdapter, OllamaAdapter, StubAdapter
12
+ from .client import Hoglah, HoglahConfig
13
+ from .models import JobResult, JobStatus, JobRequest
14
+
15
# Single source of truth is pyproject.toml; read it from the installed
# package metadata so __version__ can never drift from the wheel again.
try:
    __version__ = _pkg_version("hoglah")
except PackageNotFoundError:  # not installed (e.g. raw source tree)
    # No distribution metadata is available, so expose a placeholder
    # version string instead of failing at import time.
    __version__ = "0.0.0+source"
# Drop the lookup helpers so they do not linger in the package namespace.
del _pkg_version, PackageNotFoundError

# Names exported by ``from hoglah import *``: the version string plus the
# adapter, client and model classes imported at the top of this module.
__all__ = [
    "__version__",
    "BaseAdapter",
    "Hoglah",
    "HoglahConfig",
    "JobResult",
    "JobStatus",
    "JobRequest",
    "OllamaAdapter",
    "StubAdapter",
]
hoglah/adapters.py ADDED
@@ -0,0 +1,306 @@
1
+ """Execution adapters for Hoglah.
2
+
3
+ Default is StubAdapter (no network calls) — safe for resource-constrained or
4
+ shared environments.
5
+
6
+ Real execution is available via OllamaAdapter (uses the official `ollama` package).
7
+ Pass adapter=OllamaAdapter(host=...) to Hoglah(...) or configure via CLI flags
8
+ when supported.
9
+
10
+ The BaseAdapter protocol also exposes list_models() for discovery.
11
+ """
12
+
13
+ from __future__ import annotations
14
+
15
+ import asyncio
16
+ from abc import ABC, abstractmethod
17
+ from typing import Any
18
+
19
+ import ollama # official client (declared dep)
20
+
21
+ from .models import JobRequest
22
+
23
+
24
class BaseAdapter(ABC):
    """Abstract contract shared by all job executors (generate or chat style).

    Subclasses must implement :meth:`run`. The model-management hooks
    (:meth:`list_models`, :meth:`pull_model`, :meth:`show_model`) come with
    inert defaults so a minimal adapter only overrides what it supports.
    """

    @abstractmethod
    async def run(self, request: JobRequest) -> tuple[str, dict[str, int], dict[str, Any]]:
        """Execute *request* and report the outcome.

        Returns:
            A ``(output_text, usage_dict, metadata_dict)`` triple.

            ``usage_dict`` should contain at least
            ``{"prompt_tokens": int, "completion_tokens": int, "total": int}``.

            ``metadata_dict`` can carry ``"truncated"``,
            ``"truncation_reason"``, etc.

        Raises:
            NotImplementedError: when a subclass delegates to this base body.
        """
        raise NotImplementedError

    async def list_models(self) -> list[dict[str, Any]]:
        """Return the available models; the base implementation knows of none."""
        no_models: list[dict[str, Any]] = []
        return no_models

    async def pull_model(self, model: str) -> None:
        """Ensure *model* is available, pulling it if missing.

        The base implementation is a no-op; adapters backed by a real server
        override it to perform the pull.
        """
        return None

    async def show_model(self, model: str) -> dict[str, Any]:
        """Return details for *model*; the base implementation has none to give."""
        no_details: dict[str, Any] = {}
        return no_details
51
+
52
+
53
class StubAdapter(BaseAdapter):
    """Safe no-op / simulation adapter.

    - Does NOT call Ollama.
    - Returns deterministic fake output.
    - Simulates truncation reporting when the request looks "large" relative
      to ``num_ctx``.
    - Adds a small artificial delay so the worker loop has something to do.
    """

    async def run(self, request: JobRequest) -> tuple[str, dict[str, int], dict[str, Any]]:
        """Fabricate a response for *request* without touching the network.

        Returns:
            ``(output_text, usage, metadata)``. ``usage`` holds
            ``prompt_tokens`` (whitespace word count of the prompt, or of the
            stringified messages when no prompt is set), a fixed
            ``completion_tokens`` of 25, and their ``total``. ``metadata``
            always carries ``effective_num_ctx`` and, when the estimated
            prompt size exceeds 90% of it, ``truncated`` /
            ``truncation_reason``.
        """
        # Tiny simulated "thinking" time.
        await asyncio.sleep(0.03)

        if request.messages:
            # Chat style: echo (a prefix of) the most recent message.
            last = request.messages[-1]
            if isinstance(last, dict):
                content = last.get("content")
                # "content" may be present but None, or a non-string
                # payload; normalise to text so the slice below cannot
                # raise or produce a non-string preview.
                text = "" if content is None else str(content)
            else:
                text = str(last)
            base = f"[STUB-CHAT] Responded to: {text[:60]}..."
        else:
            prompt = request.prompt or ""
            base = f"[STUB] Generated response for: {prompt[:60]}..."

        # Crude token estimation: whitespace-separated words.
        prompt_tokens = len((request.prompt or str(request.messages or "")).split())
        num_ctx = request.num_ctx or 4096
        completion_tokens = 25

        output = base
        meta: dict[str, Any] = {}

        # Simulate the truncation behavior requested in the spec.
        if prompt_tokens > num_ctx * 0.9:
            meta["truncated"] = True
            meta["truncation_reason"] = "simulated_context_limit_in_stub"
            output = base[:80] + " ... [truncated in stub for testing]"

        meta["effective_num_ctx"] = num_ctx

        usage = {
            "prompt_tokens": prompt_tokens,
            "completion_tokens": completion_tokens,
            "total": prompt_tokens + completion_tokens,
        }
        return output, usage, meta

    async def list_models(self) -> list[dict[str, Any]]:
        """Return two fixed fake model entries."""
        return [
            {"name": "stub-model:1b", "size": 123456, "digest": "stub", "details": {"family": "stub"}},
            {"name": "stub-model:7b", "size": 4567890, "digest": "stub", "details": {"family": "stub"}},
        ]

    async def show_model(self, model: str) -> dict[str, Any]:
        """Return fixed fake details, echoing *model* back as the name."""
        return {
            "name": model,
            "size": 123456,
            "digest": "stub",
            "details": {"family": "stub", "parameter_size": "1B", "quantization_level": "Q4_0"},
            "parameters": "num_ctx 4096",
            "template": "{{ .System }} {{ .Prompt }}",
        }
117
+
118
+
119
class OllamaAdapter(BaseAdapter):
    """Real adapter using the official ollama Python client.

    Talks to a local or remote Ollama server.

    - Supports both prompt (generate) and messages (chat) submission styles.
    - Maps common generation params (temperature, num_ctx, etc.) into the
      request options.
    - Returns usage counts from Ollama (prompt_eval_count / eval_count) when
      present.
    - Context truncation: a ``done_reason`` of ``"length"`` is reported as
      truncation; errors raised by the client propagate to the caller, which
      decides on retries / final failure.
    """

    # JobRequest attributes copied verbatim into the Ollama ``options`` dict.
    _OPTION_KEYS = (
        "temperature",
        "top_p",
        "top_k",
        "repeat_penalty",
        "seed",
        "num_predict",
        "stop",
        "num_ctx",
    )
    # Attributes copied when flattening a model-listing entry into a dict.
    _MODEL_ATTRS = ("name", "model", "size", "digest", "details", "modified_at")
    # Attributes copied when flattening a ``show`` response into a dict.
    _SHOW_ATTRS = (
        "name",
        "model",
        "size",
        "digest",
        "details",
        "parameters",
        "template",
        "modified_at",
    )

    def __init__(self, host: str | None = None):
        """Remember *host*; the client itself is created lazily per event loop."""
        self.host = host
        self._client: ollama.AsyncClient | None = None
        self._client_loop: asyncio.AbstractEventLoop | None = None

    def _get_client(self) -> ollama.AsyncClient:
        """Return an AsyncClient bound to the currently running event loop."""
        # ollama.AsyncClient wraps an httpx.AsyncClient whose primitives are
        # bound to the event loop that first uses them; reusing it from a
        # different loop raises "bound to a different event loop". The
        # background worker runs in one loop (cached, fast path), but the
        # sync facades run each call in a fresh loop, so cache PER LOOP and
        # recreate when the running loop changes.
        loop = asyncio.get_running_loop()
        if self._client is None or self._client_loop is not loop:
            self._client = ollama.AsyncClient(host=self.host)
            self._client_loop = loop
        return self._client

    @staticmethod
    def _field(obj: Any, name: str, default: Any = None) -> Any:
        """Read *name* from a response that may be a dict or an object."""
        if isinstance(obj, dict):
            return obj.get(name, default)
        return getattr(obj, name, default)

    @staticmethod
    def _as_dict(obj: Any, attrs: tuple[str, ...]) -> dict[str, Any]:
        """Copy the attributes in *attrs* that *obj* actually has into a dict."""
        return {attr: getattr(obj, attr) for attr in attrs if hasattr(obj, attr)}

    @staticmethod
    def _parse_num_ctx(parameters: Any) -> int | None:
        """Extract ``num_ctx`` from a ``show`` parameters text, if present.

        Scans line by line for one mentioning ``num_ctx`` whose last
        whitespace-separated token parses as an int.
        """
        for line in str(parameters or "").splitlines():
            if "num_ctx" not in line.lower():
                continue
            parts = line.split()
            if len(parts) < 2:
                continue
            try:
                return int(parts[-1])
            except ValueError:
                continue
        return None

    def _build_options(self, request: JobRequest) -> dict[str, Any]:
        """Merge ``request.options`` with the explicit per-request parameters.

        Explicit attributes (temperature, num_ctx, ...) that are not ``None``
        override same-named keys coming from ``request.options``.
        """
        opts: dict[str, Any] = {}
        if request.options:
            opts.update(request.options)
        for key in self._OPTION_KEYS:
            val = getattr(request, key, None)
            if val is not None:
                opts[key] = val
        return opts

    async def run(self, request: JobRequest) -> tuple[str, dict[str, int], dict[str, Any]]:
        """Execute *request* against Ollama via chat or generate.

        Returns:
            ``(output_text, usage, metadata)`` where ``usage`` has
            ``prompt_tokens`` / ``completion_tokens`` / ``total`` and
            ``metadata`` may carry ``effective_num_ctx``, ``done_reason``,
            ``truncated`` and ``truncation_reason``.

        Raises:
            Whatever the Ollama client raises for the pull, chat or generate
            call; the caller classifies errors and decides on retries.
        """
        client = self._get_client()
        options = self._build_options(request)
        meta: dict[str, Any] = {}

        # Auto-pull if the model is not present (makes real mode much more ergonomic).
        await self.pull_model(request.model)

        # Smart context handling. Prefer the num_ctx that will actually be
        # sent (explicit attribute or request.options); only ask the server
        # for the model's configured value when the caller gave none.
        effective_num_ctx = options.get("num_ctx")
        if not effective_num_ctx:
            try:
                model_info = await self.show_model(request.model)
                effective_num_ctx = self._parse_num_ctx(model_info.get("parameters"))
            except Exception:
                # Best effort: failing to discover num_ctx must not fail the job.
                effective_num_ctx = None
        if effective_num_ctx:
            meta["effective_num_ctx"] = effective_num_ctx
            options.setdefault("num_ctx", effective_num_ctx)

        if request.messages:
            # Chat path.
            # NOTE(review): request.system_prompt is only forwarded on the
            # generate path; presumably chat callers put the system message
            # into ``messages`` themselves — confirm.
            resp = await client.chat(
                model=request.model,
                messages=request.messages,
                options=options or None,
                format=request.format,
                keep_alive=request.keep_alive,
            )
            msg = self._field(resp, "message")
            output = self._field(msg, "content", "") if msg else str(resp)
        else:
            # Generate path.
            resp = await client.generate(
                model=request.model,
                prompt=request.prompt or "",
                system=request.system_prompt,
                options=options or None,
                format=request.format,
                keep_alive=request.keep_alive,
            )
            output = self._field(resp, "response")
            # Only fall back to the raw repr when the field is truly absent;
            # an empty string is a legitimate (empty) completion.
            if output is None and not isinstance(resp, dict):
                output = str(resp)

        prompt_tokens = int(self._field(resp, "prompt_eval_count") or 0)
        completion_tokens = int(self._field(resp, "eval_count") or 0)
        usage: dict[str, int] = {
            "prompt_tokens": prompt_tokens,
            "completion_tokens": completion_tokens,
            "total": prompt_tokens + completion_tokens,
        }

        text = str(output or "")

        # Best-effort truncation / completion info from real Ollama responses.
        done_reason = self._field(resp, "done_reason")
        if done_reason == "length":
            meta["truncated"] = True
            meta["truncation_reason"] = "length"  # hit max tokens / context window
        elif done_reason:
            meta["done_reason"] = done_reason
        else:
            # Fallback heuristic, used ONLY when the server gave no
            # done_reason; otherwise any ordinary answer that mentions
            # "context" would be misreported as truncated.
            lowered = text.lower()
            if "context" in lowered or "truncat" in lowered:
                meta["truncated"] = True
                meta["truncation_reason"] = "possible_context_truncation_from_model"

        return text, usage, meta

    async def list_models(self) -> list[dict[str, Any]]:
        """Return the server's models as plain dicts."""
        client = self._get_client()
        resp = await client.list()
        # resp is usually a ListResponse with a .models list of objects or dicts.
        models = self._field(resp, "models") or []
        result: list[dict[str, Any]] = []
        for entry in models:
            if isinstance(entry, dict):
                result.append(entry)
                continue
            flattened = self._as_dict(entry, self._MODEL_ATTRS)
            if flattened:
                result.append(flattened)
        return result

    async def pull_model(self, model: str) -> None:
        """Pull the model if it is not already present locally.

        Raises:
            ollama.ResponseError: when the presence check fails for a reason
                other than "not found" (HTTP 404), or when the pull fails.
        """
        client = self._get_client()
        try:
            await client.show(model=model)
            return  # already present
        except ollama.ResponseError as exc:
            # Only a 404 means "model missing"; anything else is a real
            # server-side failure that a pull would not fix.
            if getattr(exc, "status_code", None) != 404:
                raise
        # Perform the pull (can be long-running; no stream for simplicity).
        await client.pull(model=model)

    async def show_model(self, model: str) -> dict[str, Any]:
        """Return details for *model* as a plain dict.

        Falls back to ``{"name": model}`` when the response exposes none of
        the known attributes.
        """
        client = self._get_client()
        resp = await client.show(model=model)
        if isinstance(resp, dict):
            return resp
        return self._as_dict(resp, self._SHOW_ATTRS) or {"name": model}