hoglah 0.2.2__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- hoglah/__init__.py +33 -0
- hoglah/adapters.py +306 -0
- hoglah/cli.py +613 -0
- hoglah/client.py +725 -0
- hoglah/config.py +86 -0
- hoglah/models.py +107 -0
- hoglah/py.typed +1 -0
- hoglah/store.py +354 -0
- hoglah-0.2.2.dist-info/METADATA +520 -0
- hoglah-0.2.2.dist-info/RECORD +14 -0
- hoglah-0.2.2.dist-info/WHEEL +5 -0
- hoglah-0.2.2.dist-info/entry_points.txt +2 -0
- hoglah-0.2.2.dist-info/licenses/LICENSE +201 -0
- hoglah-0.2.2.dist-info/top_level.txt +1 -0
hoglah/__init__.py
ADDED
|
@@ -0,0 +1,33 @@
|
|
|
1
|
+
"""Hoglah — lightweight local-first Ollama job queue manager.
|
|
2
|
+
|
|
3
|
+
A simple, persistent job queue and orchestration layer for running LLM
|
|
4
|
+
inference (via Ollama) in resource-constrained environments.
|
|
5
|
+
|
|
6
|
+
See docs/requirements-v1.0.md and the README for the full specification.
|
|
7
|
+
"""
|
|
8
|
+
|
|
9
|
+
from importlib.metadata import PackageNotFoundError, version as _pkg_version
|
|
10
|
+
|
|
11
|
+
from .adapters import BaseAdapter, OllamaAdapter, StubAdapter
|
|
12
|
+
from .client import Hoglah, HoglahConfig
|
|
13
|
+
from .models import JobResult, JobStatus, JobRequest
|
|
14
|
+
|
|
15
|
+
# pyproject.toml is the only place the version is written down. Ask the
# installed distribution's metadata for it so this module never carries a
# second, possibly stale, copy.
try:
    _installed_version = _pkg_version("hoglah")
except PackageNotFoundError:
    # No installed distribution (e.g. imported from a raw source tree).
    _installed_version = None

__version__ = "0.0.0+source" if _installed_version is None else _installed_version

# Keep the helper names out of the package namespace.
del _installed_version, _pkg_version, PackageNotFoundError
|
|
22
|
+
|
|
23
|
+
# Names re-exported by ``from hoglah import *``; order is kept as published.
__all__ = [
    # package metadata
    "__version__",
    # adapter base class, client facade, and job models
    "BaseAdapter", "Hoglah", "HoglahConfig",
    "JobResult", "JobStatus", "JobRequest",
    # concrete adapters
    "OllamaAdapter", "StubAdapter",
]
|
hoglah/adapters.py
ADDED
|
@@ -0,0 +1,306 @@
|
|
|
1
|
+
"""Execution adapters for Hoglah.
|
|
2
|
+
|
|
3
|
+
Default is StubAdapter (no network calls) — safe for resource-constrained or
|
|
4
|
+
shared environments.
|
|
5
|
+
|
|
6
|
+
Real execution is available via OllamaAdapter (uses the official `ollama` package).
|
|
7
|
+
Pass adapter=OllamaAdapter(host=...) to Hoglah(...) or configure via CLI flags
|
|
8
|
+
when supported.
|
|
9
|
+
|
|
10
|
+
The BaseAdapter interface also exposes list_models(), show_model() and pull_model() for model discovery and management.
|
|
11
|
+
"""
|
|
12
|
+
|
|
13
|
+
from __future__ import annotations
|
|
14
|
+
|
|
15
|
+
import asyncio
|
|
16
|
+
from abc import ABC, abstractmethod
|
|
17
|
+
from typing import Any
|
|
18
|
+
|
|
19
|
+
import ollama # official client (declared dep)
|
|
20
|
+
|
|
21
|
+
from .models import JobRequest
|
|
22
|
+
|
|
23
|
+
|
|
24
|
+
class BaseAdapter(ABC):
    """Abstract contract every job executor implements (generate or chat style).

    Only ``run`` is mandatory. The model-management coroutines ship with
    inert defaults so a minimal adapter has to implement a single method.
    """

    @abstractmethod
    async def run(self, request: JobRequest) -> tuple[str, dict[str, int], dict[str, Any]]:
        """Execute ``request`` and return ``(output_text, usage, metadata)``.

        ``usage`` should contain at least::

            {"prompt_tokens": int, "completion_tokens": int, "total": int}

        ``metadata`` can carry keys such as "truncated" and
        "truncation_reason".
        """
        raise NotImplementedError

    async def list_models(self) -> list[dict[str, Any]]:
        """Return the available models; the base implementation reports none."""
        no_models: list[dict[str, Any]] = []
        return no_models

    async def pull_model(self, model: str) -> None:
        """Make sure ``model`` is available; the base implementation does nothing."""
        return None

    async def show_model(self, model: str) -> dict[str, Any]:
        """Return details for ``model``; the base implementation returns an empty dict."""
        return dict()
|
|
51
|
+
|
|
52
|
+
|
|
53
|
+
class StubAdapter(BaseAdapter):
    """
    Safe no-op / simulation adapter.

    - Does NOT call Ollama.
    - Returns deterministic fake output.
    - Simulates truncation reporting when the request looks "large" relative to num_ctx.
    - Small artificial delay so the worker loop has something to do.
    """

    async def run(self, request: JobRequest) -> tuple[str, dict[str, int], dict[str, Any]]:
        """Return a canned response for ``request`` without any network call.

        Returns:
            ``(output_text, usage, metadata)`` where ``usage`` has the keys
            "prompt_tokens", "completion_tokens" and "total", and
            ``metadata`` always carries "effective_num_ctx" plus
            "truncated" / "truncation_reason" when truncation is simulated.
        """
        # Tiny simulated "thinking" time
        await asyncio.sleep(0.03)

        if request.messages:
            # chat style: echo (a prefix of) the last message
            last = request.messages[-1]
            if isinstance(last, dict):
                # "content" may be missing or None (e.g. a tool-call message);
                # normalise to text so the slice below cannot fail.
                content = str(last.get("content") or "")
            else:
                content = str(last)
            base = f"[STUB-CHAT] Responded to: {content[:60]}..."
        else:
            prompt = request.prompt or ""
            base = f"[STUB] Generated response for: {prompt[:60]}..."

        # crude token estimation: whitespace-separated words of the prompt, or
        # of the stringified message list when there is no prompt
        prompt_tokens = len((request.prompt or str(request.messages or "")).split())
        num_ctx = request.num_ctx or 4096  # always truthy after the fallback

        completion_tokens = 25
        total = prompt_tokens + completion_tokens

        output = base
        meta: dict[str, Any] = {}

        # Simulate the truncation behavior requested in the spec
        if prompt_tokens > num_ctx * 0.9:
            meta["truncated"] = True
            meta["truncation_reason"] = "simulated_context_limit_in_stub"
            output = base[:80] + " ... [truncated in stub for testing]"

        meta["effective_num_ctx"] = num_ctx

        usage = {
            "prompt_tokens": prompt_tokens,
            "completion_tokens": completion_tokens,
            "total": total,
        }

        return output, usage, meta

    async def list_models(self) -> list[dict[str, Any]]:
        """Return two fixed fake model entries."""
        return [
            {"name": "stub-model:1b", "size": 123456, "digest": "stub", "details": {"family": "stub"}},
            {"name": "stub-model:7b", "size": 4567890, "digest": "stub", "details": {"family": "stub"}},
        ]

    async def show_model(self, model: str) -> dict[str, Any]:
        """Return fixed fake details, echoing ``model`` back as the name."""
        return {
            "name": model,
            "size": 123456,
            "digest": "stub",
            "details": {"family": "stub", "parameter_size": "1B", "quantization_level": "Q4_0"},
            "parameters": "num_ctx 4096",
            "template": "{{ .System }} {{ .Prompt }}",
        }
|
|
117
|
+
|
|
118
|
+
|
|
119
|
+
class OllamaAdapter(BaseAdapter):
    """
    Real adapter using the official ollama Python client against a local/remote
    Ollama server.

    - Supports both prompt (generate) and messages (chat) submission styles.
    - Maps common generation params (temperature, num_ctx, etc.) into the
      request options.
    - Returns usage counts from Ollama (prompt_eval_count / eval_count) when
      present.
    - Truncation: reported in the metadata when Ollama finishes with
      done_reason == "length". A keyword heuristic on the output text is used
      only when the response carries no done_reason at all.
    - Errors raised by the generate/chat call propagate out of run() so the
      caller can classify them and decide on retries.
    """

    # JobRequest attributes copied into the Ollama ``options`` mapping when set.
    _OPTION_KEYS = (
        "temperature",
        "top_p",
        "top_k",
        "repeat_penalty",
        "seed",
        "num_predict",
        "stop",
        "num_ctx",
    )
    # Attributes copied out of non-dict response objects.
    _LIST_ATTRS = ("name", "model", "size", "digest", "details", "modified_at")
    _SHOW_ATTRS = ("name", "model", "size", "digest", "details", "parameters", "template", "modified_at")

    def __init__(self, host: str | None = None):
        """Remember ``host``; the client itself is created lazily per event loop."""
        self.host = host
        self._client: ollama.AsyncClient | None = None
        self._client_loop: asyncio.AbstractEventLoop | None = None

    def _get_client(self) -> ollama.AsyncClient:
        """Return the client cached for the running event loop, creating it if needed."""
        # ollama.AsyncClient wraps an httpx.AsyncClient whose primitives are
        # bound to the event loop that first uses them; reusing it from a
        # different loop raises "bound to a different event loop". The
        # background worker runs in one loop (cached, fast path), but the
        # sync facades run each call in a fresh loop, so cache PER LOOP and
        # recreate when the running loop changes.
        loop = asyncio.get_running_loop()
        if self._client is None or self._client_loop is not loop:
            self._client = ollama.AsyncClient(host=self.host)
            self._client_loop = loop
        return self._client

    @staticmethod
    def _field(obj: Any, name: str, default: Any = None) -> Any:
        """Read ``name`` from a dict or an object; ``default`` when absent or None."""
        value = obj.get(name) if isinstance(obj, dict) else getattr(obj, name, None)
        return default if value is None else value

    @staticmethod
    def _to_dict(obj: Any, attrs: tuple[str, ...]) -> dict[str, Any]:
        """Return ``obj`` itself if it is a dict, else a dict of the ``attrs`` it has."""
        if isinstance(obj, dict):
            return obj
        return {attr: getattr(obj, attr) for attr in attrs if hasattr(obj, attr)}

    @staticmethod
    def _parse_num_ctx(parameters: Any) -> int | None:
        """Extract num_ctx from a parameters text block (lines like "num_ctx 4096")."""
        for line in str(parameters or "").splitlines():
            if "num_ctx" not in line.lower():
                continue
            parts = line.split()
            if len(parts) >= 2:
                try:
                    return int(parts[-1])
                except ValueError:
                    continue  # malformed value; keep scanning
        return None

    def _build_options(self, request: JobRequest) -> dict[str, Any]:
        """Merge ``request.options`` with the request's explicit generation params.

        Explicit attributes that are not None override same-named keys from
        ``request.options``.
        """
        opts: dict[str, Any] = {}
        if request.options:
            opts.update(request.options)
        for key in self._OPTION_KEYS:
            val = getattr(request, key, None)
            if val is not None:
                opts[key] = val
        return opts

    async def run(self, request: JobRequest) -> tuple[str, dict[str, int], dict[str, Any]]:
        """Execute ``request`` via chat (when messages are given) or generate.

        Returns:
            ``(output_text, usage, metadata)``. ``usage`` has "prompt_tokens",
            "completion_tokens" and "total". ``metadata`` may carry
            "effective_num_ctx", "truncated", "truncation_reason" and
            "done_reason".

        Raises:
            Whatever pull_model() or the chat/generate call raises; nothing is
            swallowed here so the caller can classify the failure.
        """
        client = self._get_client()
        options = self._build_options(request)
        meta: dict[str, Any] = {}

        # Auto-pull if the model is not present (makes real mode much more ergonomic)
        await self.pull_model(request.model)

        # Context size actually in effect: whatever the caller supplied (as
        # request.num_ctx or inside request.options), otherwise the value
        # declared in the model's parameters, looked up best-effort.
        effective_num_ctx = options.get("num_ctx")
        if not effective_num_ctx:
            try:
                model_info = await self.show_model(request.model)
                effective_num_ctx = self._parse_num_ctx(model_info.get("parameters"))
            except Exception:
                # Deliberately best effort: a failed lookup must not fail the job.
                effective_num_ctx = None
        if effective_num_ctx:
            meta["effective_num_ctx"] = effective_num_ctx
            options.setdefault("num_ctx", effective_num_ctx)

        if request.messages:
            # chat path
            resp = await client.chat(
                model=request.model,
                messages=request.messages,
                options=options or None,
                format=request.format,
                keep_alive=request.keep_alive,
            )
            message = self._field(resp, "message")
            if message is None:
                output = str(resp)
            else:
                output = self._field(message, "content", "")
        else:
            # generate path
            resp = await client.generate(
                model=request.model,
                prompt=request.prompt or "",
                system=request.system_prompt,
                options=options or None,
                format=request.format,
                keep_alive=request.keep_alive,
            )
            output = self._field(resp, "response")
            if output is None and not isinstance(resp, dict):
                # Unknown response shape: keep something inspectable. An empty
                # string is a valid completion and must NOT take this branch.
                output = str(resp)

        prompt_tokens = int(self._field(resp, "prompt_eval_count", 0) or 0)
        completion_tokens = int(self._field(resp, "eval_count", 0) or 0)
        usage: dict[str, int] = {
            "prompt_tokens": prompt_tokens,
            "completion_tokens": completion_tokens,
            "total": prompt_tokens + completion_tokens,
        }

        # Best-effort truncation / completion info from real Ollama responses
        done_reason = self._field(resp, "done_reason")
        if done_reason == "length":
            meta["truncated"] = True
            meta["truncation_reason"] = "length"  # hit max tokens / context window
        elif done_reason:
            meta["done_reason"] = done_reason
        else:
            # Fallback heuristic, only when no done_reason was reported.
            text = str(output or "").lower()
            if "context" in text or "truncat" in text:
                meta["truncated"] = True
                meta["truncation_reason"] = "possible_context_truncation_from_model"

        return str(output or ""), usage, meta

    async def list_models(self) -> list[dict[str, Any]]:
        """Return the server's models as plain dicts."""
        resp = await self._get_client().list()
        # resp is usually a ListResponse with .models (objects or dicts)
        result: list[dict[str, Any]] = []
        for m in self._field(resp, "models", []):
            entry = self._to_dict(m, self._LIST_ATTRS)
            # dict entries are passed through as-is; objects exposing none of
            # the known attributes are skipped
            if isinstance(m, dict) or entry:
                result.append(entry)
        return result

    async def pull_model(self, model: str) -> None:
        """Pull the model if it is not already present locally.

        Raises:
            Transport-level failures from the presence probe, and any error
            from the pull itself.
        """
        client = self._get_client()
        try:
            await client.show(model=model)
            return  # already present
        except ollama.ResponseError:
            # The server answered with an error for this model: treat it as
            # "not available" and pull. Other exceptions propagate.
            pass
        # Perform the pull (can be long-running; no stream for simplicity)
        await client.pull(model=model)

    async def show_model(self, model: str) -> dict[str, Any]:
        """Return details for ``model`` as a dict; ``{"name": model}`` if none could be read."""
        resp = await self._get_client().show(model=model)
        if isinstance(resp, dict):
            return resp
        return self._to_dict(resp, self._SHOW_ATTRS) or {"name": model}
|