dacli-ai 0.4.0__tar.gz

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -0,0 +1,18 @@
1
+ Metadata-Version: 2.4
2
+ Name: dacli-ai
3
+ Version: 0.4.0
4
+ Summary: Provider LLM client and token/pricing accounting for dacli
5
+ Author-email: Mouad Jaouhari <github@mj-dev.net>
6
+ Project-URL: Homepage, https://github.com/mouadja02/dacli
7
+ Keywords: llm,anthropic,openai,token accounting
8
+ Requires-Python: >=3.10
9
+ Description-Content-Type: text/markdown
10
+ Requires-Dist: anthropic<1,>=0.40
11
+ Requires-Dist: openai<3,>=1.40
12
+ Requires-Dist: httpx<1,>=0.27
13
+
14
+ # dacli-ai
15
+
16
+ Provider LLM client (anthropic / openai / openrouter) and token/pricing accounting
17
+ for [dacli](https://github.com/mouadja02/dacli). The leaf wheel — no dacli-core
18
+ dependency. Embed it with `dacli-core` for a headless agent.
@@ -0,0 +1,5 @@
1
+ # dacli-ai
2
+
3
+ Provider LLM client (anthropic / openai / openrouter) and token/pricing accounting
4
+ for [dacli](https://github.com/mouadja02/dacli). The leaf wheel — no dacli-core
5
+ dependency. Embed it with `dacli-core` for a headless agent.
@@ -0,0 +1,30 @@
1
+ [build-system]
2
+ requires = ["setuptools>=68"]
3
+ build-backend = "setuptools.build_meta"
4
+
5
+ [project]
6
+ name = "dacli-ai"
7
+ dynamic = ["version"]
8
+ description = "Provider LLM client and token/pricing accounting for dacli"
9
+ readme = "README.md"
10
+ requires-python = ">=3.10"
11
+ authors = [{ name = "Mouad Jaouhari", email = "github@mj-dev.net" }]
12
+ keywords = ["llm", "anthropic", "openai", "token accounting"]
13
+ # The leaf wheel: provider SDKs + the pricing fetch. The SDKs are base — the
14
+ # client is inert without one. httpx backs the models.dev pricing lookup.
15
+ dependencies = [
16
+ "anthropic>=0.40,<1",
17
+ "openai>=1.40,<3",
18
+ "httpx>=0.27,<1",
19
+ ]
20
+
21
+ [project.urls]
22
+ Homepage = "https://github.com/mouadja02/dacli"
23
+
24
+ [tool.setuptools.dynamic]
25
+ version = { attr = "dacli.ai.__version__" }
26
+
27
+ [tool.setuptools.packages.find]
28
+ where = ["src"]
29
+ include = ["dacli*"]
30
+ namespaces = true
@@ -0,0 +1,4 @@
1
+ [egg_info]
2
+ tag_build =
3
+ tag_date = 0
4
+
@@ -0,0 +1,8 @@
1
+ # dacli-ai: the provider-agnostic LLM client (ℛ) and token/pricing accounting.
2
+ # The leaf wheel — no dacli-core import at runtime.
3
+
4
+ from dacli.ai.llm import LLMClient
5
+
6
+ __version__ = "0.4.0"
7
+
8
+ __all__ = ["LLMClient"]
@@ -0,0 +1,213 @@
1
+ from __future__ import annotations
2
+
3
+ import random
4
+ import asyncio
5
+ import logging
6
+ import contextlib
7
+ from typing import TYPE_CHECKING, Any, TypeAlias
8
+ from collections.abc import Awaitable, Callable
9
+
10
+ from dacli.ai.providers import Provider, create_provider, unsupported_tools_error
11
+
12
+ if TYPE_CHECKING:
13
+ # ai is the leaf wheel; it must not import core at runtime. Settings is only a
14
+ # type annotation here — the client reads it by attribute (duck-typed).
15
+ from dacli.config.settings import Settings
16
+
17
+ logger = logging.getLogger(__name__)
18
+
19
# Type of the optional streaming callback: receives each text delta as it
# arrives. It returns None, is presentation-only, and must not raise into the
# generate path (the UI guards its own rendering).
OnText: TypeAlias = Callable[[str], None] | None

# Type of the optional retry-status callback. Invoked once per *retry* (not on
# the final failure) with the keyword arguments ``attempt`` (1-based number of
# the attempt that just failed), ``delay`` (the backoff about to be slept, in
# seconds) and ``error`` (the transient error that triggered the retry), so the
# TUI/logger can render "⟳ retrying in 2.1s (429)".
OnRetry: TypeAlias = Callable[..., None] | None
29
+
30
+
31
class LLMClient:
    """Multi-provider LLM client: a thin facade over one ``Provider``.

    The provider is selected in :meth:`initialize` and owns the request
    mechanics (A-2). The public surface (``generate`` / ``classify`` /
    ``last_usage``) is provider-agnostic; the shared retry/backoff lives here
    so no provider duplicates it.
    """

    def __init__(self, settings: Settings):
        """Store ``settings``; no SDK client is built until :meth:`initialize`."""
        self.settings = settings
        # The concrete SDK client (AsyncOpenAI / AsyncAnthropic) owned by the
        # active provider; mirrored here so callers (and tests) can inspect or
        # inject it on the facade. Typed Any so provider-specific attribute
        # access type-checks without a fragile union.
        self._client: Any = None
        self._provider = settings.llm.provider
        # The active Provider implementation, selected in initialize().
        self._provider_impl: Provider | None = None
        # Provider-normalized token usage of the most recent generate() call,
        # read by the kernel for cost tracking. Reset on each generate().
        self.last_usage: dict[str, int] = {}

    async def initialize(self) -> None:
        """Select and initialize the configured provider.

        ``supports_tools`` is checked here so a provider that cannot do tool
        calling (which every real agent turn requires) fails fast at
        configuration time, not deep inside the first turn (P02, Option B —
        honest removal).

        Raises:
            Whatever ``unsupported_tools_error`` builds, when the provider
            declares ``supports_tools = False``.
        """
        provider = create_provider(
            self._provider.lower(), self.settings, retry=self._with_retry
        )
        if not provider.supports_tools:
            raise unsupported_tools_error(provider.name)
        await provider.initialize()
        self._provider_impl = provider
        self._client = provider.client

    def _impl(self) -> Provider:
        """Return the active provider, creating it lazily if needed.

        Covers paths that bypass initialize(). A fake SDK client injected on
        the facade (tests) is pushed through to the provider so both always
        see the same client.
        """
        impl = getattr(self, "_provider_impl", None)
        if impl is None:
            impl = create_provider(
                self._provider.lower(), self.settings, retry=self._with_retry
            )
            self._provider_impl = impl
        if impl.client is not self._client:
            impl.client = self._client
        return impl

    async def generate(self, messages: list[dict[str, str]], tools: list[dict] | None = None, system_prompt: str | None = None, on_text: OnText = None, model: str | None = None, on_retry: OnRetry = None) -> tuple[str, list[dict]]:
        """
        Generate a response from the LLM.

        Args:
            messages: Conversation messages
            tools: Available tool definitions
            system_prompt: System prompt to use
            on_text: Optional callback invoked with each text delta as it is
                generated. When provided (and the provider supports it) the
                response is streamed; the return value is unchanged. Providers
                without streaming call it once with the full text instead, so
                the UI behaves identically.
            model: Optional per-call model override (model tiering, ℛ).
                When None the configured ``settings.llm.model`` is used, so the
                default single-model path is byte-for-byte unchanged. The
                ``ModelRouter`` passes the cheap or strong tier id here.
            on_retry: Optional retry-status callback, forwarded to the
                provider. It is called with keyword arguments ``attempt``,
                ``delay`` and ``error`` before each backoff sleep.

        Returns:
            Tuple of (response content, tool calls)
        """
        if not self._client:
            await self.initialize()

        self.last_usage = {}  # populated from the provider below
        model = model or self.settings.llm.model
        impl = self._impl()
        content, tool_calls = await impl.generate(
            messages, tools, system_prompt, on_text=on_text, model=model, on_retry=on_retry
        )
        self.last_usage = impl.last_usage
        return content, tool_calls

    def _retryable_exceptions(self) -> tuple[type, ...]:
        """Return the active provider's declared transient-error classes.

        These are the errors worth retrying (429 rate limit, dropped
        connection, 5xx). An unknown provider retries nothing, so a transient
        blip simply surfaces unchanged.
        """
        try:
            return self._impl().retryable_exceptions()
        except ValueError:
            return ()

    @staticmethod
    def _default_on_retry(*, attempt: int, delay: float, error: Exception) -> None:
        """Log a retry when no ``on_retry`` sink is wired (e.g. P13 TUI absent).

        Logs the transient failure and the backoff so a retried turn is never
        silent.
        """
        logger.warning(
            "LLM call failed (%s); retrying in %.1fs (attempt %d)",
            type(error).__name__,
            delay,
            attempt,
        )

    async def _with_retry(
        self,
        fn: Callable[[], Awaitable],
        *,
        attempts: int | None = None,
        base: float | None = None,
        on_retry: OnRetry = None,
        retryable: tuple[type[BaseException], ...] | None = None,
    ):
        """Run ``fn`` with bounded, jittered exponential backoff (P05).

        ``fn`` is an argument-free coroutine factory invoked once per attempt;
        for streaming paths it re-establishes the stream from scratch on retry
        (at-most-once-token caveat: any partial tokens already emitted to the UI
        are discarded when the stream is restarted). Only ``retryable`` classes
        are retried — everything else propagates immediately (fail fast).

        Args:
            fn: Coroutine factory to call.
            attempts: Maximum number of calls; falls back to
                ``settings.llm.retry_attempts``. Clamped to at least 1.
            base: Backoff base in seconds; falls back to
                ``settings.llm.retry_base_delay``.
            on_retry: Status sink called before each backoff sleep; errors it
                raises are suppressed. Defaults to a warning log.
            retryable: Exception classes to retry; falls back to the active
                provider's declared set.

        Returns:
            Whatever ``fn`` returns on the first successful attempt.

        Raises:
            The last retryable error once attempts are exhausted, or any
            non-retryable error immediately.
        """
        configured = attempts or self.settings.llm.retry_attempts
        # Always make at least one call: a configured count of 0 (or a missing
        # or negative one) must not skip ``fn`` and fall through to the
        # "unreachable" error below.
        attempts = max(1, configured or 1)
        base = base if base is not None else self.settings.llm.retry_base_delay
        retryable = retryable if retryable is not None else self._retryable_exceptions()
        on_retry = on_retry or self._default_on_retry
        for i in range(attempts):
            try:
                return await fn()
            except retryable as e:
                if i == attempts - 1:
                    raise
                # Exponential backoff plus up to 0.3s of jitter.
                delay = base * 2 ** i + random.random() * 0.3
                # a status sink must never break the retry loop
                with contextlib.suppress(Exception):
                    on_retry(attempt=i + 1, delay=delay, error=e)
                await asyncio.sleep(delay)
        # Unreachable: attempts >= 1, so the loop always returns or raises. Kept
        # so the function provably never returns None.
        raise RuntimeError("_with_retry exhausted without returning or raising")

    async def _stream_openai(self, request_kwargs: dict, on_text: OnText, on_retry: OnRetry = None) -> tuple[str, list[dict]]:
        """Back-compat seam for the OpenAI-compatible streaming path.

        The streaming path is also driven directly through the facade (tests
        inject a fake SDK client on ``_client``). Delegates to the provider's
        ``_stream`` and mirrors its ``last_usage`` onto the facade.
        """
        impl = self._impl()
        result = await impl._stream(request_kwargs, on_text, on_retry=on_retry)
        self.last_usage = impl.last_usage
        return result

    async def classify(self, text: str, labels: list[str], instructions: str | None = None, model: str | None = None) -> str:
        """
        Thin classification helper used by the router.

        Sends a tool-free completion asking the model to pick exactly one label
        from ``labels`` and normalizes the answer back onto that set when possible.
        ``model`` lets the caller force the cheap tier — classification
        is the canonical cheap-model job.

        Returns:
            The matching entry of ``labels`` (original casing), or the model's
            stripped raw answer when no label can be recognized in it.
        """
        system = instructions or (
            "You are a classifier. Respond with exactly one of the allowed labels "
            "and nothing else."
        )
        label_list = ", ".join(labels)
        prompt = (
            f"Allowed labels: {label_list}\n\n"
            f"Text to classify:\n{text}\n\n"
            "Respond with exactly one label from the allowed list."
        )
        content, _ = await self.generate(
            messages=[{"role": "user", "content": prompt}],
            tools=None,
            system_prompt=system,
            model=model,
        )
        answer = (content or "").strip()
        folded = answer.lower()

        # Exact (case-insensitive) match first.
        for label in labels:
            if folded == label.lower():
                return label
        # Substring fallback, longest label first: otherwise a short label that
        # happens to sit inside a longer one ("no" in "unknown") would win just
        # because it is listed earlier. The sort is stable, so equal-length
        # labels keep their original priority. Empty labels are skipped because
        # "" is a substring of every answer.
        for label in sorted(labels, key=len, reverse=True):
            if label and label.lower() in folded:
                return label
        return answer
@@ -0,0 +1,325 @@
1
+ """Model pricing via the models.dev API, plus token-usage accounting.
2
+
3
+ dacli tracks how many tokens each LLM call consumes and what it costs. Pricing
4
+ is looked up from https://models.dev/api.json (a community database of model
5
+ specs/pricing), filtered to the configured provider + model. The payload is
6
+ cached on disk with a TTL so we don't hit the network every turn, and we degrade
7
+ gracefully when offline (tokens are still tracked; cost is reported as unknown).
8
+ """
9
+
10
+ from __future__ import annotations
11
+
12
+ import json
13
+ import logging
14
+ import os
15
+ import re
16
+ import tempfile
17
+ import time
18
+ from dataclasses import dataclass
19
+ from difflib import SequenceMatcher
20
+ from pathlib import Path
21
+ from typing import Any
22
+
23
+ log = logging.getLogger(__name__)
24
+
25
+
26
+ def _write_json_atomic(path: Path, obj: Any) -> None:
27
+ # ai is the leaf wheel and can't reach core.atomicio. The cache is best-effort
28
+ # (a torn write just forces a re-fetch), but cheap crash-safety is still worth
29
+ # it: write a sibling temp, fsync, os.replace (atomic on POSIX + Windows).
30
+ path.parent.mkdir(parents=True, exist_ok=True)
31
+ fd, tmp = tempfile.mkstemp(dir=path.parent, suffix=".tmp")
32
+ try:
33
+ with os.fdopen(fd, "w", encoding="utf-8") as f:
34
+ f.write(json.dumps(obj))
35
+ f.flush()
36
+ os.fsync(f.fileno())
37
+ os.replace(tmp, path)
38
+ finally:
39
+ if os.path.exists(tmp):
40
+ os.unlink(tmp)
41
+
42
+ MODELS_DEV_URL = "https://models.dev/api.json"
43
+ CACHE_TTL_SECONDS = 24 * 60 * 60 # refresh pricing at most once a day
44
+ # Short network timeout: pricing is a startup nicety, not a blocker. A
45
+ # first-run offline user (no cache yet) must not wait long before we fall back to
46
+ # "cost unknown". Keep it well under a human's patience threshold.
47
+ HTTP_TIMEOUT_SECONDS = 5.0
48
+
49
+ # Minimum similarity score for a fuzzy model match to be trusted. Below this we
50
+ # return no pricing (better an honest "unknown" than a wrong, confident price).
51
+ SIMILARITY_THRESHOLD = 0.62
52
+
53
+
54
@dataclass
class TokenUsage:
    """Provider-normalized token counters covering one or many LLM calls."""

    input: int = 0
    output: int = 0
    cache_read: int = 0
    cache_creation: int = 0

    def add(self, other: TokenUsage) -> None:
        """Fold ``other``'s counters into this record, in place."""
        for counter in ("input", "output", "cache_read", "cache_creation"):
            setattr(self, counter, getattr(self, counter) + getattr(other, counter))

    @property
    def total(self) -> int:
        """Sum of all four counters."""
        running = self.input
        running = running + self.output
        running = running + self.cache_read
        return running + self.cache_creation

    def as_dict(self) -> dict[str, int]:
        """Return the counters as a plain dict keyed by counter name."""
        return {
            counter: getattr(self, counter)
            for counter in ("input", "output", "cache_read", "cache_creation")
        }

    @classmethod
    def from_dict(cls, d: dict[str, Any] | None) -> TokenUsage:
        """Build a record from a (possibly missing or partial) usage dict.

        Missing or falsy entries count as 0; present values are coerced
        with ``int``.
        """
        source = d or {}

        def count(key: str) -> int:
            return int(source.get(key, 0) or 0)

        return cls(
            input=count("input"),
            output=count("output"),
            cache_read=count("cache_read"),
            cache_creation=count("cache_creation"),
        )
90
+
91
+
92
@dataclass
class ModelPricing:
    """Per-model price sheet: USD per 1M tokens (models.dev `cost` block)."""

    provider: str
    model: str
    input: float = 0.0
    output: float = 0.0
    cache_read: float = 0.0
    cache_write: float = 0.0
    # The models.dev entry the prices were taken from. Same as ``model`` on an
    # exact hit; on a fuzzy hit it is the closest catalog model (e.g.
    # ``openai/gpt-oss-120b`` for a requested ``openai/gpt-oss-120b:nitro``).
    resolved_model: str = ""
    resolved_provider: str = ""
    match: str = "exact"  # "exact" | "normalized" | "similar"
    similarity: float = 1.0

    @property
    def is_fuzzy(self) -> bool:
        """True when the price came from anything other than an exact id hit."""
        exact_hit = self.match == "exact"
        return not exact_hit

    def cost_for(self, usage: TokenUsage) -> float:
        """Return the USD cost of ``usage`` (prices are per 1M tokens)."""
        weighted = usage.input * self.input
        weighted = weighted + usage.output * self.output
        weighted = weighted + usage.cache_read * self.cache_read
        # Cache-creation tokens are billed at the cache-write rate.
        weighted = weighted + usage.cache_creation * self.cache_write
        return weighted / 1_000_000
122
+
123
+
124
+ # ----------------------------------------------------------------------------
125
+ # models.dev lookup
126
+ # ----------------------------------------------------------------------------
127
+ def _ci_get(d: Any, key: str) -> Any:
128
+ """Case-insensitive dict lookup (model/provider ids vary in casing)."""
129
+ if not isinstance(d, dict) or not key:
130
+ return None
131
+ if key in d:
132
+ return d[key]
133
+ lowered = key.lower()
134
+ for k, v in d.items():
135
+ if isinstance(k, str) and k.lower() == lowered:
136
+ return v
137
+ return None
138
+
139
+
140
+ # Provider-routing variant suffixes (OpenRouter et al.) that don't change the
141
+ # underlying model's price — stripped before matching, e.g.
142
+ # ``openai/gpt-oss-120b:nitro`` -> ``openai/gpt-oss-120b``.
143
+ _VARIANT_SUFFIX_RE = re.compile(r":[^:/]+$")
144
+
145
+
146
+ def _normalize_model_id(model: str) -> str:
147
+ """Lowercase + drop the routing-variant suffix for matching."""
148
+ s = (model or "").strip().lower()
149
+ # Strip a trailing ``:variant`` (nitro/floor/free/beta/extended/online/...).
150
+ # models.dev ids never carry a ``:`` so this only removes routing noise.
151
+ s = _VARIANT_SUFFIX_RE.sub("", s)
152
+ return s.strip()
153
+
154
+
155
+ def _basename(model_id: str) -> str:
156
+ # The vendor/model -> model part ("openai/gpt-oss-120b" -> "gpt-oss-120b").
157
+ return model_id.rsplit("/", 1)[-1]
158
+
159
+
160
def _iter_models(payload: dict, provider: str):
    """Yield ``(provider_id, provider_entry, model_id, model_entry)`` tuples.

    Models of the configured provider are produced first, so a routed model is
    priced against *that* provider's catalog (e.g. OpenRouter pricing for an
    OpenRouter-routed model) before any other provider is considered. Entries
    that are not dicts are skipped.
    """

    def dict_models(provider_entry):
        # Only dict-shaped model entries are usable downstream.
        for model_id, model_entry in (provider_entry.get("models", {}) or {}).items():
            if isinstance(model_entry, dict):
                yield model_id, model_entry

    primary_id = None
    primary = _ci_get(payload, provider)
    if isinstance(primary, dict):
        # Recover the catalog's own spelling of the provider key so it can be
        # excluded from the second pass below.
        primary_id = next(
            (key for key in payload if key.lower() == (provider or "").lower()),
            provider,
        )
        for model_id, model_entry in dict_models(primary):
            yield primary_id, primary, model_id, model_entry

    for other_id, other in payload.items():
        if other_id != primary_id and isinstance(other, dict):
            for model_id, model_entry in dict_models(other):
                yield other_id, other, model_id, model_entry
180
+
181
+
182
def _score(query_norm: str, candidate_id: str) -> float:
    """Score how closely a catalog id matches a normalized query, in [0, 1].

    1.0 means the normalized ids are identical and 0.97 means only the part
    after the vendor prefix is identical. Otherwise the best of three signals
    is returned: full-id ratio, basename ratio, and a flat 0.9 when one
    basename contains the other.
    """
    candidate_norm = _normalize_model_id(candidate_id)
    if candidate_norm == query_norm:
        return 1.0

    # A matching basename is a strong signal even if the vendor prefix differs.
    query_base = _basename(query_norm)
    candidate_base = _basename(candidate_norm)
    if query_base == candidate_base:
        return 0.97

    whole_ratio = SequenceMatcher(None, query_norm, candidate_norm).ratio()
    base_ratio = SequenceMatcher(None, query_base, candidate_base).ratio()

    # Reward containment (e.g. "gpt-oss-120b" inside "openai/gpt-oss-120b").
    if (query_base and query_base in candidate_base) or (
        candidate_base and candidate_base in query_base
    ):
        containment = 0.9
    else:
        containment = 0.0
    return max(whole_ratio, base_ratio, containment)
196
+
197
+
198
def _find_model(payload: Any, provider: str, model: str) -> tuple[dict, dict, str, str, str, float] | None:
    """Find the best catalog entry for ``provider`` + ``model``.

    Returns ``(provider_entry, model_entry, resolved_provider_id,
    resolved_model_id, match_kind, similarity)`` or ``None``. Resolution order:

    1. **exact** case-insensitive id in the named provider, then any provider;
    2. **normalized** exact (after stripping the routing-variant suffix);
    3. **similar** — the closest catalog id by similarity, above a threshold,
       preferring the configured provider when its best match ties.
    """
    if not isinstance(payload, dict) or not model:
        return None

    # 1. exact (preserves the original behavior + tests). The ids reported
    # here are the requested ones, not the catalog's spelling.
    named = _ci_get(payload, provider)
    if isinstance(named, dict):
        hit = _ci_get(named.get("models", {}), model)
        if isinstance(hit, dict):
            return named, hit, provider, model, "exact", 1.0
    for provider_id, provider_entry in payload.items():
        if isinstance(provider_entry, dict):
            hit = _ci_get(provider_entry.get("models", {}), model)
            if isinstance(hit, dict):
                return provider_entry, hit, provider_id, model, "exact", 1.0

    # 2 & 3. normalized-exact + similarity over all candidates, provider-first.
    wanted = _normalize_model_id(model)
    if not wanted:
        return None

    configured_id = next(
        (key for key in payload if key.lower() == (provider or "").lower()),
        None,
    )
    # Best candidate so far:
    # (score, in_configured_provider, provider_id, provider_entry, model_id, model_entry)
    top = None
    for provider_id, provider_entry, model_id, model_entry in _iter_models(payload, provider):
        similarity = _score(wanted, model_id)
        in_configured = provider_id == configured_id
        # A normalized-exact hit (score 1.0) in the configured provider wins
        # immediately within the provider-first ordering.
        if similarity >= 1.0 and in_configured:
            return provider_entry, model_entry, provider_id, model_id, "normalized", 1.0
        # Higher score wins; on equal score the configured provider wins; on a
        # full tie the earlier candidate is kept.
        if top is None or (similarity, in_configured) > (top[0], top[1]):
            top = (similarity, in_configured, provider_id, provider_entry, model_id, model_entry)

    if top and top[0] >= SIMILARITY_THRESHOLD:
        similarity, _in_configured, provider_id, provider_entry, model_id, model_entry = top
        match_kind = "normalized" if similarity >= 1.0 else "similar"
        return provider_entry, model_entry, provider_id, model_id, match_kind, round(similarity, 3)
    return None
248
+
249
+
250
def pricing_from_payload(payload: Any, provider: str, model: str) -> ModelPricing | None:
    """Derive a :class:`ModelPricing` from an in-memory api.json payload (pure).

    When the exact id is not in the catalog a similarity search is used, so a
    routed/variant model (``…:nitro``) is priced against its closest match.
    Returns ``None`` when nothing suitable is found.
    """
    match = _find_model(payload, (provider or "").strip(), (model or "").strip())
    if not match:
        return None

    _, model_entry, resolved_provider, resolved_model, kind, similarity = match
    rates = model_entry.get("cost") or {}

    def rate(key: str) -> float:
        # Missing or falsy prices count as 0.
        return float(rates.get(key, 0) or 0)

    return ModelPricing(
        provider=provider,
        model=model,
        input=rate("input"),
        output=rate("output"),
        cache_read=rate("cache_read"),
        cache_write=rate("cache_write"),
        resolved_model=resolved_model,
        resolved_provider=resolved_provider,
        match=kind,
        similarity=similarity,
    )
273
+
274
+
275
+ # ----------------------------------------------------------------------------
276
+ # cached fetch
277
+ # ----------------------------------------------------------------------------
278
+ def _cache_path(cache_dir: str) -> Path:
279
+ return Path(cache_dir) / "models_cache.json"
280
+
281
+
282
def _load_cache(cache_dir: str) -> tuple[float, Any]:
    """Read the cached ``(fetched_at, payload)`` pair.

    Any failure (missing file, bad JSON, unexpected shape) yields
    ``(0.0, None)`` so the caller treats the cache as absent.
    """
    try:
        raw = _cache_path(cache_dir).read_text(encoding="utf-8")
        record = json.loads(raw)
        stamp = float(record.get("fetched_at", 0))
        return stamp, record.get("payload")
    except Exception:
        return 0.0, None
288
+
289
+
290
def _save_cache(cache_dir: str, payload: Any) -> None:
    """Persist ``payload`` with the current timestamp; failures are only logged."""
    try:
        destination = _cache_path(cache_dir)
        record = {"fetched_at": time.time(), "payload": payload}
        _write_json_atomic(destination, record)
    except Exception:
        # best-effort: an unwritable cache just means a re-fetch next time
        log.debug("pricing cache write failed", exc_info=True)
296
+
297
+
298
def fetch_api_json(cache_dir: str = ".dacli", force_refresh: bool = False) -> Any:
    """Return the models.dev payload, using a TTL cache and offline fallback.

    Args:
        cache_dir: Directory holding the on-disk cache file.
        force_refresh: When true, attempt a network fetch even if the cache is
            still within its TTL.

    Returns:
        The fresh payload on a successful fetch; otherwise the cached payload
        (possibly stale), or ``None`` when there is no cache and the fetch
        failed.
    """
    fetched_at, payload = _load_cache(cache_dir)
    age = time.time() - fetched_at
    # A negative age means the cache was stamped "in the future" (clock
    # change); treat that as stale so such an entry cannot live forever.
    fresh = payload is not None and 0 <= age < CACHE_TTL_SECONDS
    if fresh and not force_refresh:
        return payload

    try:
        import httpx

        resp = httpx.get(MODELS_DEV_URL, timeout=HTTP_TIMEOUT_SECONDS)
        resp.raise_for_status()
        fetched = resp.json()
    except Exception:
        # Offline / timeout / bad response: degrade to whatever the cache had,
        # but leave a trace so the fallback is diagnosable.
        log.debug("pricing fetch failed; falling back to cached payload", exc_info=True)
        return payload  # stale cache or None when offline

    if not isinstance(fetched, dict):
        # The catalog is looked up as a provider-keyed dict; anything else is
        # unusable and must not overwrite a good cache.
        log.debug("pricing fetch returned a non-dict payload; ignoring it")
        return payload

    _save_cache(cache_dir, fetched)
    return fetched
315
+
316
+
317
def fetch_pricing(
    provider: str,
    model: str,
    cache_dir: str = ".dacli",
    force_refresh: bool = False,
) -> ModelPricing | None:
    """Look up pricing for provider+model; ``None`` if unavailable/offline."""
    catalog = fetch_api_json(cache_dir, force_refresh=force_refresh)
    pricing = pricing_from_payload(catalog, provider, model)
    return pricing
@@ -0,0 +1,376 @@
1
+ """Per-provider LLM implementations behind the :class:`Provider` protocol (A-2).
2
+
3
+ Each provider owns its SDK client, request shaping, streaming reassembly, and
4
+ usage normalization, and **declares** its capabilities (``supports_tools``) so
5
+ "does this provider support tools?" is a configure-time property, not a
6
+ turn-time surprise. The :class:`~dacli.ai.llm.LLMClient` facade selects
7
+ a provider in ``initialize()`` and delegates; bounded retry/backoff stays in
8
+ the facade and is injected as ``retry`` so no provider duplicates it.
9
+ """
10
+
11
+ from __future__ import annotations
12
+
13
+ import json
14
+ import contextlib
15
+ from abc import ABC, abstractmethod
16
+ from typing import TYPE_CHECKING, Any, ClassVar
17
+
18
+ if TYPE_CHECKING:
19
+ from dacli.config.settings import Settings
20
+ from dacli.ai.llm import OnRetry, OnText
21
+
22
+
23
def emit(on_text: OnText, delta: str) -> None:
    # Forward one text delta to the UI callback. A missing callback or an
    # empty delta is a no-op, and anything the callback raises is swallowed so
    # a rendering error can never break generation (reliability-first).
    if not on_text or not delta:
        return
    with contextlib.suppress(Exception):
        on_text(delta)
29
+
30
+
31
class Provider(ABC):
    """Base class for one LLM provider: capabilities plus request mechanics.

    ``retry`` is the facade's ``_with_retry`` (bounded, jittered exponential
    backoff) — shared, never reimplemented per provider. ``client`` holds the
    provider's SDK client, built lazily in :meth:`initialize` with the SDK's
    own retries disabled so the configured count is authoritative.
    """

    name: ClassVar[str] = ""
    supports_tools: ClassVar[bool] = True

    def __init__(self, settings: Settings, *, retry):
        # No SDK client yet; initialize() builds it.
        self.client: Any = None
        # Provider-normalized token usage of the most recent generate() call;
        # the facade copies it onto its own ``last_usage`` after delegating.
        self.last_usage: dict[str, int] = {}
        self._retry = retry
        self.settings = settings

    @abstractmethod
    async def initialize(self) -> None:
        """Build the SDK client (``max_retries=0`` — retrying is done by us)."""

    @abstractmethod
    async def generate(
        self,
        messages: list[dict[str, str]],
        tools: list[dict] | None = None,
        system_prompt: str | None = None,
        on_text: OnText = None,
        model: str | None = None,
        on_retry: OnRetry = None,
    ) -> tuple[str, list[dict]]:
        """Run one completion and return ``(content, tool_calls)``."""

    @abstractmethod
    def normalize_usage(self, raw) -> dict[str, int]:
        """Translate the provider's raw usage object into the shared usage dict."""

    def retryable_exceptions(self) -> tuple[type, ...]:
        # Hook for the provider-specific *transient* error classes that are
        # safe to retry (429 rate limit, dropped connection, 5xx). Auth /
        # 4xx-validation errors are deliberately excluded so they fail fast.
        # The base answer is "retry nothing": a transient blip then simply
        # surfaces unchanged.
        return tuple()
78
+
79
+
80
+ class OpenAIProvider(Provider):
81
+ """OpenAI (and any OpenAI-compatible endpoint via ``base_url``)."""
82
+
83
+ name = "openai"
84
+
85
+ async def initialize(self) -> None:
86
+ from openai import AsyncOpenAI
87
+ self.client = AsyncOpenAI(
88
+ api_key=self.settings.llm.api_key,
89
+ base_url=self.settings.llm.base_url,
90
+ timeout=self.settings.llm.timeout,
91
+ max_retries=0,
92
+ )
93
+
94
+ def retryable_exceptions(self) -> tuple[type, ...]:
95
+ try:
96
+ from openai import (
97
+ RateLimitError,
98
+ APIConnectionError,
99
+ InternalServerError,
100
+ )
101
+ except ImportError:
102
+ return ()
103
+ return (RateLimitError, APIConnectionError, InternalServerError)
104
+
105
+ async def generate(self, messages: list[dict[str, str]], tools: list[dict] | None = None, system_prompt: str | None = None, on_text: OnText = None, model: str | None = None, on_retry: OnRetry = None) -> tuple[str, list[dict]]:
106
+ # Generate using OpenAI-compatibile API
107
+
108
+ # Prepare messages includes system prompt
109
+ full_messages = []
110
+ if system_prompt:
111
+ full_messages.append({"role": "system", "content": system_prompt})
112
+ full_messages.extend(messages)
113
+
114
+ # Prepare request
115
+ request_kwargs = {
116
+ "model": model or self.settings.llm.model,
117
+ "messages": full_messages,
118
+ "temperature": self.settings.llm.temperature,
119
+ "max_tokens": self.settings.llm.max_tokens,
120
+ }
121
+
122
+ # Add tools if provided
123
+ if tools:
124
+ request_kwargs["tools"] = tools
125
+ request_kwargs["tool_choice"] = "auto"
126
+
127
+ if on_text is not None:
128
+ return await self._stream(request_kwargs, on_text, on_retry=on_retry)
129
+
130
+ # Make request (retried on transient errors; permanent errors fail fast).
131
+ response = await self._retry(
132
+ lambda: self.client.chat.completions.create(**request_kwargs),
133
+ on_retry=on_retry,
134
+ retryable=self.retryable_exceptions(),
135
+ )
136
+
137
+ # Extract response
138
+ choice = response.choices[0]
139
+ content = choice.message.content or ""
140
+
141
+ # Extract tool calls
142
+ tool_calls = []
143
+ if hasattr(choice.message, "tool_calls") and choice.message.tool_calls:
144
+ tool_calls.extend(
145
+ {"id": tc.id, "name": tc.function.name, "arguments": json.loads(tc.function.arguments)}
146
+ for tc in choice.message.tool_calls
147
+ )
148
+
149
+ self.last_usage = self.normalize_usage(getattr(response, "usage", None))
150
+ return content, tool_calls
151
+
152
async def _stream(self, request_kwargs: dict, on_text: OnText, on_retry: OnRetry = None) -> tuple[str, list[dict]]:
    """Stream a chat completion, emitting text live and rebuilding tool calls.

    Args:
        request_kwargs: Keyword arguments for ``chat.completions.create``.
            The caller's dict is not mutated; a copy with streaming enabled
            and usage reporting requested is sent instead.
        on_text: Presentation hook handed to ``emit`` for every text delta.
        on_retry: Forwarded unchanged to ``self._retry``.

    Returns:
        ``(content, tool_calls)`` where ``content`` is the concatenated text
        and each tool call is ``{"id", "name", "arguments"}`` with
        ``arguments`` already JSON-decoded.
    """
    streaming_kwargs = dict(request_kwargs)
    streaming_kwargs["stream"] = True
    streaming_kwargs["stream_options"] = {"include_usage": True}

    # One attempt == one full stream. All accumulators live inside the
    # attempt, so a retry starts from empty state and calls ``on_text`` again
    # from the first delta; text already handed to the hook by a failed
    # attempt is not retracted here.
    async def _attempt() -> tuple[str, list[dict]]:
        stream = await self.client.chat.completions.create(**streaming_kwargs)

        content = ""
        final_usage = None
        # Tool calls arrive as fragments keyed by ``index``; each slot
        # collects the id, the name and the raw (still string) arguments.
        fragments: dict[int, dict[str, str]] = {}

        async for chunk in stream:
            chunk_usage = getattr(chunk, "usage", None)
            if chunk_usage:
                # Usage chunk requested through ``include_usage``.
                final_usage = chunk_usage
            if not chunk.choices:
                continue

            delta = chunk.choices[0].delta
            piece = getattr(delta, "content", None)
            if piece:
                content += piece
                emit(on_text, piece)

            for call in getattr(delta, "tool_calls", None) or []:
                slot = fragments.setdefault(call.index, {"id": "", "name": "", "arguments": ""})
                call_id = getattr(call, "id", None)
                if call_id:
                    slot["id"] = call_id
                function = getattr(call, "function", None)
                if function is None:
                    continue
                function_name = getattr(function, "name", None)
                if function_name:
                    slot["name"] = function_name
                function_args = getattr(function, "arguments", None)
                if function_args:
                    slot["arguments"] += function_args

        tool_calls = []
        for position in sorted(fragments):
            slot = fragments[position]
            if slot["name"]:
                try:
                    parsed = json.loads(slot["arguments"] or "{}")
                except json.JSONDecodeError:
                    # Best effort: undecodable arguments become an empty dict.
                    parsed = {}
                tool_calls.append({"id": slot["id"], "name": slot["name"], "arguments": parsed})

        self.last_usage = self.normalize_usage(final_usage)
        return content, tool_calls

    return await self._retry(_attempt, on_retry=on_retry, retryable=self.retryable_exceptions())
204
+
205
def normalize_usage(self, raw) -> dict[str, int]:
    """Convert an OpenAI usage object into the normalized token counters.

    ``prompt_tokens`` already contains the cached tokens, so the cached share
    is subtracted from ``input`` and reported under ``cache_read`` instead;
    otherwise the cost would be counted twice.

    Args:
        raw: The provider usage object, or ``None`` when none was returned.

    Returns:
        ``{}`` for ``None``; otherwise a dict with ``input``, ``output``,
        ``cache_read`` and ``cache_creation`` (always ``0`` here). Missing or
        ``None`` fields count as ``0`` and ``input`` never goes negative.
    """
    if raw is None:
        return {}

    prompt_details = getattr(raw, "prompt_tokens_details", None)
    cache_hits = 0
    if prompt_details is not None:
        cache_hits = getattr(prompt_details, "cached_tokens", 0) or 0

    prompt_total = getattr(raw, "prompt_tokens", 0) or 0
    completion_total = getattr(raw, "completion_tokens", 0) or 0

    return {
        "input": max(0, prompt_total - cache_hits),
        "output": completion_total,
        "cache_read": cache_hits,
        "cache_creation": 0,
    }
218
+
219
+
220
class OpenRouterProvider(OpenAIProvider):
    """OpenAI-compatible provider that targets OpenRouter by default."""

    name = "openrouter"

    async def initialize(self) -> None:
        """Create the async client, defaulting the endpoint to OpenRouter.

        A configured ``base_url`` wins; only a falsy one falls back to the
        public OpenRouter endpoint. The SDK's own retries are disabled
        (``max_retries=0``).
        """
        from openai import AsyncOpenAI

        client_options = {
            "api_key": self.settings.llm.api_key,
            "base_url": self.settings.llm.base_url or "https://openrouter.ai/api/v1",
            "timeout": self.settings.llm.timeout,
            "max_retries": 0,
        }
        self.client = AsyncOpenAI(**client_options)
233
+
234
+
235
class AnthropicProvider(Provider):
    """Anthropic Messages API."""

    name = "anthropic"

    async def initialize(self) -> None:
        """Create the async Anthropic client from the LLM settings.

        The SDK's own retries are disabled (``max_retries=0``).
        """
        from anthropic import AsyncAnthropic
        self.client = AsyncAnthropic(
            api_key=self.settings.llm.api_key,
            base_url=self.settings.llm.base_url,
            timeout=self.settings.llm.timeout,
            max_retries=0,
        )

    def retryable_exceptions(self) -> tuple[type, ...]:
        """Return the SDK exception types treated as transient.

        Returns an empty tuple when the ``anthropic`` package cannot be
        imported, so nothing is classified as retryable in that case.
        """
        try:
            from anthropic import (
                RateLimitError,
                APIConnectionError,
                InternalServerError,
            )
        except ImportError:
            return ()
        return (RateLimitError, APIConnectionError, InternalServerError)

    @staticmethod
    def _to_anthropic_tool(tool: dict) -> dict:
        """Convert one OpenAI-style function tool to Anthropic's tool shape.

        ``description`` and ``parameters`` are optional in the OpenAI format.
        A missing description is omitted, and missing parameters become an
        empty object schema, so such a tool no longer raises ``KeyError``.
        Tools that carry both fields convert exactly as before.
        """
        function = tool["function"]
        converted = {"name": function["name"]}
        description = function.get("description")
        if description is not None:
            converted["description"] = description
        schema = function.get("parameters")
        if schema is None:
            # A tool without declared parameters takes no arguments.
            schema = {"type": "object", "properties": {}}
        converted["input_schema"] = schema
        return converted

    async def generate(self, messages: list[dict[str, str]], tools: list[dict] | None = None, system_prompt: str | None = None, on_text: OnText = None, model: str | None = None, on_retry: OnRetry = None) -> tuple[str, list[dict]]:
        """Run one Messages API call and return ``(content, tool_calls)``.

        Args:
            messages: Conversation messages, passed through unchanged.
            tools: Optional OpenAI-style function tools; converted to the
                Anthropic ``name`` / ``description`` / ``input_schema`` shape.
            system_prompt: Sent as ``system`` when truthy.
            on_text: When not ``None`` the request is streamed and text is
                emitted through this hook as it arrives.
            model: Overrides ``settings.llm.model`` when truthy.
            on_retry: Forwarded unchanged to ``self._retry``.

        Returns:
            The concatenated text and the list of
            ``{"id", "name", "arguments"}`` tool calls.
        """
        request_kwargs = {
            "model": model or self.settings.llm.model,
            "max_tokens": self.settings.llm.max_tokens,
            "messages": messages,
        }

        if system_prompt:
            request_kwargs["system"] = system_prompt

        if tools:
            request_kwargs["tools"] = [self._to_anthropic_tool(tool) for tool in tools]

        if on_text is not None:
            return await self._stream(request_kwargs, on_text, on_retry=on_retry)

        async def _do() -> tuple[str, list[dict]]:
            response = await self.client.messages.create(**request_kwargs)
            self.last_usage = self.normalize_usage(getattr(response, "usage", None))
            return self._extract(response.content)

        return await self._retry(_do, on_retry=on_retry, retryable=self.retryable_exceptions())

    async def _stream(self, request_kwargs: dict, on_text: OnText, on_retry: OnRetry = None) -> tuple[str, list[dict]]:
        """Streaming variant of :meth:`generate`.

        Text is emitted through ``on_text`` as it arrives; the assembled final
        message is then read for the content and tool_use blocks. The whole
        stream is one retry unit, so a retried attempt opens a new stream and
        calls ``on_text`` again from the first text event.
        """
        async def _do() -> tuple[str, list[dict]]:
            async with self.client.messages.stream(**request_kwargs) as stream:
                async for text in stream.text_stream:
                    emit(on_text, text)
                final = await stream.get_final_message()
            self.last_usage = self.normalize_usage(getattr(final, "usage", None))
            return self._extract(final.content)

        return await self._retry(_do, on_retry=on_retry, retryable=self.retryable_exceptions())

    def normalize_usage(self, raw) -> dict[str, int]:
        """Convert an Anthropic usage object into the normalized counters.

        Cache tokens are read from fields separate from ``input_tokens``, so
        nothing is subtracted. Returns ``{}`` when ``raw`` is ``None``;
        missing or ``None`` fields count as ``0``.
        """
        if raw is None:
            return {}
        return {
            "input": getattr(raw, "input_tokens", 0) or 0,
            "output": getattr(raw, "output_tokens", 0) or 0,
            "cache_read": getattr(raw, "cache_read_input_tokens", 0) or 0,
            "cache_creation": getattr(raw, "cache_creation_input_tokens", 0) or 0,
        }

    @staticmethod
    def _extract(blocks) -> tuple[str, list[dict]]:
        """Split response content blocks into text and tool calls.

        ``text`` blocks are concatenated in order; ``tool_use`` blocks become
        ``{"id", "name", "arguments"}`` dicts. Other block types are ignored.
        """
        content = ""
        tool_calls = []
        for block in blocks:
            if block.type == "text":
                content += block.text
            elif block.type == "tool_use":
                tool_calls.append({"id": block.id, "name": block.name, "arguments": block.input})
        return content, tool_calls
329
+
330
+
331
class GoogleProvider(Provider):
    """Gemini placeholder: registered, but without tool-calling support.

    The important part is ``supports_tools = False``. Per the original design
    note (P02, Option B), the facade is expected to reject this provider at
    configure time with a clear error rather than fail mid-turn with
    ``NotImplementedError``. The methods below only guard direct use.
    """

    name = "google"
    supports_tools = False

    async def initialize(self) -> None:
        """Always raise the unsupported-tools error for this provider."""
        raise unsupported_tools_error(self.name)

    async def generate(
        self,
        messages: list[dict[str, str]],
        tools: list[dict] | None = None,
        system_prompt: str | None = None,
        on_text: OnText = None,
        model: str | None = None,
        on_retry: OnRetry = None,
    ) -> tuple[str, list[dict]]:
        """Always raise the unsupported-tools error; no request is made."""
        raise unsupported_tools_error(self.name)

    def normalize_usage(self, raw) -> dict[str, int]:
        """Return an empty usage dict regardless of ``raw``."""
        return {}
350
+
351
+
352
# Provider registry, keyed by the lowercase provider name. Insertion order
# matters: it is also the order in which alternatives are listed in errors.
PROVIDERS: dict[str, type[Provider]] = {
    "openai": OpenAIProvider,
    "anthropic": AnthropicProvider,
    "google": GoogleProvider,
    "openrouter": OpenRouterProvider,
}
359
+
360
+
361
def unsupported_tools_error(name: str) -> ValueError:
    """Build the configure-time error for a provider without tool support.

    The tool-capable providers in ``PROVIDERS`` are listed as alternatives in
    registry order. The sentence reads correctly for any number of them:
    three or more use a serial comma (``'a', 'b', or 'c'``), two are joined
    with ``or``, one stands alone, and with none the ``Use ...`` hint is left
    out instead of raising ``IndexError``.

    Args:
        name: The provider name to mention in the message.

    Returns:
        The ``ValueError`` instance; the caller is responsible for raising it.
    """
    capable = [f"'{n}'" for n, cls in PROVIDERS.items() if cls.supports_tools]
    message = f"The '{name}' provider does not yet support tool use, which dacli requires."
    if len(capable) > 2:
        alternatives = ", ".join(capable[:-1]) + f", or {capable[-1]}"
    else:
        # Zero, one or two names: "", "'a'" or "'a' or 'b'".
        alternatives = " or ".join(capable)
    if alternatives:
        message += f" Use {alternatives}."
    return ValueError(message)
369
+
370
+
371
def create_provider(name: str, settings: Settings, *, retry) -> Provider:
    """Build the provider registered under ``name``.

    Args:
        name: Registry key; expected to be lowercased by the caller already.
        settings: Passed positionally to the provider constructor.
        retry: Passed to the provider constructor as the ``retry`` keyword.

    Raises:
        ValueError: If ``name`` is not a key of ``PROVIDERS``.
    """
    provider_cls = PROVIDERS.get(name)
    if provider_cls is not None:
        return provider_cls(settings, retry=retry)
    raise ValueError(f"Unsupported LLM provider: {name}")
@@ -0,0 +1,79 @@
1
+ """Deterministic, offline stand-in for :class:`reasoning.llm.LLMClient`.
2
+
3
+ Driven by an ordered list of scripted *responses*; each ``generate()`` call pops
4
+ the next one and returns it in the exact shape the kernel parses
5
+ (``core/kernel.py``): ``(content, tool_calls)`` where each tool call is
6
+ ``{"id", "name", "arguments"}``. An empty ``tool_calls`` ends the agent loop
7
+ (final answer). Running past the end raises :class:`ScriptExhausted` — a real
8
+ signal that the agent looped more than the scenario anticipated.
9
+
10
+ A scripted response is a dict::
11
+
12
+ {
13
+ "text": "optional assistant text",
14
+ "tool_calls": [ {"name": "update_plan", "arguments": {...}} ], # optional
15
+ "usage": {"input": 100, "output": 20}, # optional
16
+ }
17
+ """
18
+
19
+ from __future__ import annotations
20
+
21
+ from typing import Any
22
+ import contextlib
23
+
24
+
25
class ScriptExhausted(RuntimeError):
    """Signals a ``generate()`` call made after the last scripted response."""
27
+
28
+
29
class ScriptedLLM:
    """An offline LLM double satisfying the kernel's LLM contract."""

    def __init__(self, responses: list[dict[str, Any]]):
        """Store the script.

        Args:
            responses: Ordered scripted responses; ``None`` or an empty list
                yields a double that is exhausted on its first call. The list
                is copied shallowly, the response dicts themselves are shared.
        """
        self._responses: list[dict[str, Any]] = list(responses or [])
        # Index of the next response to serve.
        self._i = 0
        #: Provider-normalized usage of the most recent generate() call.
        self.last_usage: dict[str, int] = {}
        # Set to True once generate() has been called past the end.
        self.exhausted: bool = False

    async def initialize(self) -> None:
        """Do nothing: there is no network and nothing to set up."""
        return None

    async def generate(
        self,
        messages: list[dict[str, Any]] | None = None,
        tools: list[dict[str, Any]] | None = None,
        system_prompt: str | None = None,
        on_text: Any | None = None,
        model: str | None = None,
        on_retry: Any | None = None,
    ) -> tuple[str, list[dict[str, Any]]]:
        """Serve the next scripted response as ``(content, tool_calls)``.

        ``messages``, ``tools``, ``system_prompt``, ``model`` and ``on_retry``
        are accepted for signature parity with the real providers and are
        ignored. ``on_retry`` in particular is never called, since a scripted
        response cannot fail transiently.

        Args:
            on_text: Optional hook called once with the full text when the
                text is non-empty; any exception it raises is suppressed.

        Returns:
            The response text (``""`` when absent) and its tool calls, each
            ``{"id", "name", "arguments"}``. A missing id is generated as
            ``call_<response number>_<call number>``, both 1-based.

        Raises:
            ScriptExhausted: If every scripted response was already served.
            KeyError: If a scripted tool call has no ``"name"``.
        """
        if self._i >= len(self._responses):
            self.exhausted = True
            raise ScriptExhausted(
                f"ScriptedLLM exhausted after {len(self._responses)} response(s): "
                "the agent requested another generation the scenario did not script."
            )
        spec = self._responses[self._i]
        self._i += 1

        text = spec.get("text") or ""
        self.last_usage = dict(spec.get("usage") or {})

        tool_calls: list[dict[str, Any]] = []
        for j, tc in enumerate(spec.get("tool_calls") or [], start=1):
            tool_calls.append(
                {
                    "id": tc.get("id") or f"call_{self._i}_{j}",
                    "name": tc["name"],
                    "arguments": tc.get("arguments") or {},
                }
            )

        # Presentation parity with streaming providers (headless on_text is a
        # no-op; the chat UI streams). Never let a presentation hook break us.
        if on_text and text:
            with contextlib.suppress(Exception):
                on_text(text)

        return text, tool_calls
@@ -0,0 +1,18 @@
1
+ Metadata-Version: 2.4
2
+ Name: dacli-ai
3
+ Version: 0.4.0
4
+ Summary: Provider LLM client and token/pricing accounting for dacli
5
+ Author-email: Mouad Jaouhari <github@mj-dev.net>
6
+ Project-URL: Homepage, https://github.com/mouadja02/dacli
7
+ Keywords: llm,anthropic,openai,token accounting
8
+ Requires-Python: >=3.10
9
+ Description-Content-Type: text/markdown
10
+ Requires-Dist: anthropic<1,>=0.40
11
+ Requires-Dist: openai<3,>=1.40
12
+ Requires-Dist: httpx<1,>=0.27
13
+
14
+ # dacli-ai
15
+
16
+ Provider LLM client (anthropic / openai / openrouter) and token/pricing accounting
17
+ for [dacli](https://github.com/mouadja02/dacli). The leaf wheel — no dacli-core
18
+ dependency. Embed it with `dacli-core` for a headless agent.
@@ -0,0 +1,12 @@
1
+ README.md
2
+ pyproject.toml
3
+ src/dacli/ai/__init__.py
4
+ src/dacli/ai/llm.py
5
+ src/dacli/ai/pricing.py
6
+ src/dacli/ai/providers.py
7
+ src/dacli/ai/scripted.py
8
+ src/dacli_ai.egg-info/PKG-INFO
9
+ src/dacli_ai.egg-info/SOURCES.txt
10
+ src/dacli_ai.egg-info/dependency_links.txt
11
+ src/dacli_ai.egg-info/requires.txt
12
+ src/dacli_ai.egg-info/top_level.txt
@@ -0,0 +1,3 @@
1
+ anthropic<1,>=0.40
2
+ openai<3,>=1.40
3
+ httpx<1,>=0.27
@@ -0,0 +1 @@
1
+ dacli