dacli-ai 0.4.0__tar.gz
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- dacli_ai-0.4.0/PKG-INFO +18 -0
- dacli_ai-0.4.0/README.md +5 -0
- dacli_ai-0.4.0/pyproject.toml +30 -0
- dacli_ai-0.4.0/setup.cfg +4 -0
- dacli_ai-0.4.0/src/dacli/ai/__init__.py +8 -0
- dacli_ai-0.4.0/src/dacli/ai/llm.py +213 -0
- dacli_ai-0.4.0/src/dacli/ai/pricing.py +325 -0
- dacli_ai-0.4.0/src/dacli/ai/providers.py +376 -0
- dacli_ai-0.4.0/src/dacli/ai/scripted.py +79 -0
- dacli_ai-0.4.0/src/dacli_ai.egg-info/PKG-INFO +18 -0
- dacli_ai-0.4.0/src/dacli_ai.egg-info/SOURCES.txt +12 -0
- dacli_ai-0.4.0/src/dacli_ai.egg-info/dependency_links.txt +1 -0
- dacli_ai-0.4.0/src/dacli_ai.egg-info/requires.txt +3 -0
- dacli_ai-0.4.0/src/dacli_ai.egg-info/top_level.txt +1 -0
dacli_ai-0.4.0/PKG-INFO
ADDED
|
@@ -0,0 +1,18 @@
|
|
|
1
|
+
Metadata-Version: 2.4
|
|
2
|
+
Name: dacli-ai
|
|
3
|
+
Version: 0.4.0
|
|
4
|
+
Summary: Provider LLM client and token/pricing accounting for dacli
|
|
5
|
+
Author-email: Mouad Jaouhari <github@mj-dev.net>
|
|
6
|
+
Project-URL: Homepage, https://github.com/mouadja02/dacli
|
|
7
|
+
Keywords: llm,anthropic,openai,token accounting
|
|
8
|
+
Requires-Python: >=3.10
|
|
9
|
+
Description-Content-Type: text/markdown
|
|
10
|
+
Requires-Dist: anthropic<1,>=0.40
|
|
11
|
+
Requires-Dist: openai<3,>=1.40
|
|
12
|
+
Requires-Dist: httpx<1,>=0.27
|
|
13
|
+
|
|
14
|
+
# dacli-ai
|
|
15
|
+
|
|
16
|
+
Provider LLM client (anthropic / openai / openrouter) and token/pricing accounting
|
|
17
|
+
for [dacli](https://github.com/mouadja02/dacli). The leaf wheel — no dacli-core
|
|
18
|
+
dependency. Embed it with `dacli-core` for a headless agent.
|
dacli_ai-0.4.0/README.md
ADDED
|
@@ -0,0 +1,30 @@
|
|
|
1
|
+
[build-system]
|
|
2
|
+
requires = ["setuptools>=68"]
|
|
3
|
+
build-backend = "setuptools.build_meta"
|
|
4
|
+
|
|
5
|
+
[project]
|
|
6
|
+
name = "dacli-ai"
|
|
7
|
+
dynamic = ["version"]
|
|
8
|
+
description = "Provider LLM client and token/pricing accounting for dacli"
|
|
9
|
+
readme = "README.md"
|
|
10
|
+
requires-python = ">=3.10"
|
|
11
|
+
authors = [{ name = "Mouad Jaouhari", email = "github@mj-dev.net" }]
|
|
12
|
+
keywords = ["llm", "anthropic", "openai", "token accounting"]
|
|
13
|
+
# The leaf wheel: provider SDKs + the pricing fetch. The SDKs are base — the
|
|
14
|
+
# client is inert without one. httpx backs the models.dev pricing lookup.
|
|
15
|
+
dependencies = [
|
|
16
|
+
"anthropic>=0.40,<1",
|
|
17
|
+
"openai>=1.40,<3",
|
|
18
|
+
"httpx>=0.27,<1",
|
|
19
|
+
]
|
|
20
|
+
|
|
21
|
+
[project.urls]
|
|
22
|
+
Homepage = "https://github.com/mouadja02/dacli"
|
|
23
|
+
|
|
24
|
+
[tool.setuptools.dynamic]
|
|
25
|
+
version = { attr = "dacli.ai.__version__" }
|
|
26
|
+
|
|
27
|
+
[tool.setuptools.packages.find]
|
|
28
|
+
where = ["src"]
|
|
29
|
+
include = ["dacli*"]
|
|
30
|
+
namespaces = true
|
dacli_ai-0.4.0/setup.cfg
ADDED
|
@@ -0,0 +1,213 @@
|
|
|
1
|
+
from __future__ import annotations
|
|
2
|
+
|
|
3
|
+
import random
|
|
4
|
+
import asyncio
|
|
5
|
+
import logging
|
|
6
|
+
import contextlib
|
|
7
|
+
from typing import TYPE_CHECKING, Any, TypeAlias
|
|
8
|
+
from collections.abc import Awaitable, Callable
|
|
9
|
+
|
|
10
|
+
from dacli.ai.providers import Provider, create_provider, unsupported_tools_error
|
|
11
|
+
|
|
12
|
+
if TYPE_CHECKING:
|
|
13
|
+
# ai is the leaf wheel; it must not import core at runtime. Settings is only a
|
|
14
|
+
# type annotation here — the client reads it by attribute (duck-typed).
|
|
15
|
+
from dacli.config.settings import Settings
|
|
16
|
+
|
|
17
|
+
logger = logging.getLogger(__name__)
|
|
18
|
+
|
|
19
|
+
# Type of the optional streaming callback: receives each text delta as it
# arrives and returns None. It is presentation-only and must not raise into the
# generate path (the UI guards its own rendering).
OnText: TypeAlias = Callable[[str], None] | None

# Type of the optional retry-status callback. Invoked once per *retry* (not on
# the final failure) with the upcoming attempt number, the backoff delay about
# to be slept, and the transient error that triggered it, so the TUI/logger can
# render "⟳ retrying in 2.1s (429)". The retry loop passes these as keyword
# arguments (attempt=, delay=, error=), hence the open ``...`` parameter list.
OnRetry: TypeAlias = Callable[..., None] | None
|
|
29
|
+
|
|
30
|
+
|
|
31
|
+
class LLMClient:
|
|
32
|
+
# Multi-provider LLM client: a thin facade that selects a Provider in
|
|
33
|
+
# initialize() and delegates request mechanics to it (A-2). The public
|
|
34
|
+
# surface (generate / classify / last_usage) is provider-agnostic; the
|
|
35
|
+
# shared retry/backoff lives here so no provider duplicates it.
|
|
36
|
+
|
|
37
|
+
def __init__(self, settings: Settings):
|
|
38
|
+
# Initialize LLM client with settings
|
|
39
|
+
self.settings = settings
|
|
40
|
+
# The concrete SDK client (AsyncOpenAI / AsyncAnthropic) owned by the
|
|
41
|
+
# active provider; mirrored here so callers (and tests) can inspect or
|
|
42
|
+
# inject it on the facade. Typed Any so provider-specific attribute
|
|
43
|
+
# access type-checks without a fragile union.
|
|
44
|
+
self._client: Any = None
|
|
45
|
+
self._provider = settings.llm.provider
|
|
46
|
+
# The active Provider implementation, selected in initialize().
|
|
47
|
+
self._provider_impl: Provider | None = None
|
|
48
|
+
# Provider-normalized token usage of the most recent generate() call,
|
|
49
|
+
# read by the kernel for cost tracking. Reset on each generate().
|
|
50
|
+
self.last_usage: dict[str, int] = {}
|
|
51
|
+
|
|
52
|
+
async def initialize(self) -> None:
|
|
53
|
+
# Select and initialize the provider. ``supports_tools`` is checked
|
|
54
|
+
# here so a provider that cannot do tool calling (which every real
|
|
55
|
+
# agent turn requires) fails fast at configuration time, not deep
|
|
56
|
+
# inside the first turn (P02, Option B — honest removal).
|
|
57
|
+
provider = create_provider(
|
|
58
|
+
self._provider.lower(), self.settings, retry=self._with_retry
|
|
59
|
+
)
|
|
60
|
+
if not provider.supports_tools:
|
|
61
|
+
raise unsupported_tools_error(provider.name)
|
|
62
|
+
await provider.initialize()
|
|
63
|
+
self._provider_impl = provider
|
|
64
|
+
self._client = provider.client
|
|
65
|
+
|
|
66
|
+
def _impl(self) -> Provider:
|
|
67
|
+
# The active provider, created lazily for paths that bypass
|
|
68
|
+
# initialize(). A fake SDK client injected on the facade (tests) is
|
|
69
|
+
# pushed through to the provider so both always see the same client.
|
|
70
|
+
impl = getattr(self, "_provider_impl", None)
|
|
71
|
+
if impl is None:
|
|
72
|
+
impl = create_provider(
|
|
73
|
+
self._provider.lower(), self.settings, retry=self._with_retry
|
|
74
|
+
)
|
|
75
|
+
self._provider_impl = impl
|
|
76
|
+
if impl.client is not self._client:
|
|
77
|
+
impl.client = self._client
|
|
78
|
+
return impl
|
|
79
|
+
|
|
80
|
+
async def generate(self, messages: list[dict[str, str]], tools: list[dict] | None = None, system_prompt: str | None = None, on_text: OnText = None, model: str | None = None, on_retry: OnRetry = None) -> tuple[str, list[dict]]:
|
|
81
|
+
"""
|
|
82
|
+
Generate a response from the LLM.
|
|
83
|
+
|
|
84
|
+
Args:
|
|
85
|
+
messages: Conversation messages
|
|
86
|
+
tools: Available tool definitions
|
|
87
|
+
system_prompt: System prompt to use
|
|
88
|
+
on_text: Optional callback invoked with each text delta as it is
|
|
89
|
+
generated. When provided (and the provider supports it) the
|
|
90
|
+
response is streamed; the return value is unchanged. Providers
|
|
91
|
+
without streaming call it once with the full text instead, so
|
|
92
|
+
the UI behaves identically.
|
|
93
|
+
model: Optional per-call model override (model tiering, ℛ).
|
|
94
|
+
When None the configured ``settings.llm.model`` is used, so the
|
|
95
|
+
default single-model path is byte-for-byte unchanged. The
|
|
96
|
+
``ModelRouter`` passes the cheap or strong tier id here.
|
|
97
|
+
|
|
98
|
+
Returns:
|
|
99
|
+
Tuple of (response content, tool calls)
|
|
100
|
+
"""
|
|
101
|
+
if not self._client:
|
|
102
|
+
await self.initialize()
|
|
103
|
+
|
|
104
|
+
self.last_usage = {} # populated from the provider below
|
|
105
|
+
model = model or self.settings.llm.model
|
|
106
|
+
impl = self._impl()
|
|
107
|
+
content, tool_calls = await impl.generate(
|
|
108
|
+
messages, tools, system_prompt, on_text=on_text, model=model, on_retry=on_retry
|
|
109
|
+
)
|
|
110
|
+
self.last_usage = impl.last_usage
|
|
111
|
+
return content, tool_calls
|
|
112
|
+
|
|
113
|
+
def _retryable_exceptions(self) -> tuple[type, ...]:
|
|
114
|
+
# The active provider's declared transient-error classes (429 rate
|
|
115
|
+
# limit, dropped connection, 5xx); an unknown provider retries nothing,
|
|
116
|
+
# so a transient blip simply surfaces unchanged.
|
|
117
|
+
try:
|
|
118
|
+
return self._impl().retryable_exceptions()
|
|
119
|
+
except ValueError:
|
|
120
|
+
return ()
|
|
121
|
+
|
|
122
|
+
@staticmethod
|
|
123
|
+
def _default_on_retry(*, attempt: int, delay: float, error: Exception) -> None:
|
|
124
|
+
# Fallback status sink when no on_retry is wired (e.g. P13 TUI absent):
|
|
125
|
+
# log the transient failure + backoff so a retried turn is never silent.
|
|
126
|
+
logger.warning(
|
|
127
|
+
"LLM call failed (%s); retrying in %.1fs (attempt %d)",
|
|
128
|
+
type(error).__name__,
|
|
129
|
+
delay,
|
|
130
|
+
attempt,
|
|
131
|
+
)
|
|
132
|
+
|
|
133
|
+
async def _with_retry(
|
|
134
|
+
self,
|
|
135
|
+
fn: Callable[[], Awaitable],
|
|
136
|
+
*,
|
|
137
|
+
attempts: int | None = None,
|
|
138
|
+
base: float | None = None,
|
|
139
|
+
on_retry: OnRetry = None,
|
|
140
|
+
retryable: tuple[type[BaseException], ...] | None = None,
|
|
141
|
+
):
|
|
142
|
+
"""Run ``fn`` with bounded, jittered exponential backoff (P05).
|
|
143
|
+
|
|
144
|
+
``fn`` is an argument-free coroutine factory invoked once per attempt;
|
|
145
|
+
for streaming paths it re-establishes the stream from scratch on retry
|
|
146
|
+
(at-most-once-token caveat: any partial tokens already emitted to the UI
|
|
147
|
+
are discarded when the stream is restarted). Only ``retryable`` classes
|
|
148
|
+
are retried — everything else propagates immediately (fail fast).
|
|
149
|
+
"""
|
|
150
|
+
attempts = attempts or self.settings.llm.retry_attempts
|
|
151
|
+
base = base if base is not None else self.settings.llm.retry_base_delay
|
|
152
|
+
retryable = retryable if retryable is not None else self._retryable_exceptions()
|
|
153
|
+
on_retry = on_retry or self._default_on_retry
|
|
154
|
+
for i in range(attempts):
|
|
155
|
+
try:
|
|
156
|
+
return await fn()
|
|
157
|
+
except retryable as e:
|
|
158
|
+
if i == attempts - 1:
|
|
159
|
+
raise
|
|
160
|
+
delay = base * 2 ** i + random.random() * 0.3
|
|
161
|
+
# a status sink must never break the retry loop
|
|
162
|
+
with contextlib.suppress(Exception):
|
|
163
|
+
on_retry(attempt=i + 1, delay=delay, error=e)
|
|
164
|
+
await asyncio.sleep(delay)
|
|
165
|
+
# Unreachable: attempts >= 1, so the loop always returns or raises. Kept
|
|
166
|
+
# so the function provably never returns None.
|
|
167
|
+
raise RuntimeError("_with_retry exhausted without returning or raising")
|
|
168
|
+
|
|
169
|
+
async def _stream_openai(self, request_kwargs: dict, on_text: OnText, on_retry: OnRetry = None) -> tuple[str, list[dict]]:
|
|
170
|
+
# Back-compat seam: the OpenAI-compatible streaming path is also driven
|
|
171
|
+
# directly through the facade (tests inject a fake SDK client on
|
|
172
|
+
# ``_client``). Delegates to the provider; identical to generate()'s
|
|
173
|
+
# streaming path.
|
|
174
|
+
impl = self._impl()
|
|
175
|
+
result = await impl._stream(request_kwargs, on_text, on_retry=on_retry)
|
|
176
|
+
self.last_usage = impl.last_usage
|
|
177
|
+
return result
|
|
178
|
+
|
|
179
|
+
async def classify(self, text: str, labels: list[str], instructions: str | None = None, model: str | None = None) -> str:
|
|
180
|
+
"""
|
|
181
|
+
Thin classification helper used by the router.
|
|
182
|
+
|
|
183
|
+
Sends a tool-free completion asking the model to pick exactly one label
|
|
184
|
+
from ``labels`` and normalizes the answer back onto that set when possible.
|
|
185
|
+
``model`` lets the caller force the cheap tier — classification
|
|
186
|
+
is the canonical cheap-model job.
|
|
187
|
+
"""
|
|
188
|
+
system = instructions or (
|
|
189
|
+
"You are a classifier. Respond with exactly one of the allowed labels "
|
|
190
|
+
"and nothing else."
|
|
191
|
+
)
|
|
192
|
+
label_list = ", ".join(labels)
|
|
193
|
+
prompt = (
|
|
194
|
+
f"Allowed labels: {label_list}\n\n"
|
|
195
|
+
f"Text to classify:\n{text}\n\n"
|
|
196
|
+
"Respond with exactly one label from the allowed list."
|
|
197
|
+
)
|
|
198
|
+
content, _ = await self.generate(
|
|
199
|
+
messages=[{"role": "user", "content": prompt}],
|
|
200
|
+
tools=None,
|
|
201
|
+
system_prompt=system,
|
|
202
|
+
model=model,
|
|
203
|
+
)
|
|
204
|
+
answer = (content or "").strip()
|
|
205
|
+
|
|
206
|
+
# Exact (case-insensitive) match first, then substring fallback.
|
|
207
|
+
for label in labels:
|
|
208
|
+
if answer.lower() == label.lower():
|
|
209
|
+
return label
|
|
210
|
+
for label in labels:
|
|
211
|
+
if label.lower() in answer.lower():
|
|
212
|
+
return label
|
|
213
|
+
return answer
|
|
@@ -0,0 +1,325 @@
|
|
|
1
|
+
"""Model pricing via the models.dev API, plus token-usage accounting.
|
|
2
|
+
|
|
3
|
+
dacli tracks how many tokens each LLM call consumes and what it costs. Pricing
|
|
4
|
+
is looked up from https://models.dev/api.json (a community database of model
|
|
5
|
+
specs/pricing), filtered to the configured provider + model. The payload is
|
|
6
|
+
cached on disk with a TTL so we don't hit the network every turn, and we degrade
|
|
7
|
+
gracefully when offline (tokens are still tracked; cost is reported as unknown).
|
|
8
|
+
"""
|
|
9
|
+
|
|
10
|
+
from __future__ import annotations
|
|
11
|
+
|
|
12
|
+
import json
|
|
13
|
+
import logging
|
|
14
|
+
import os
|
|
15
|
+
import re
|
|
16
|
+
import tempfile
|
|
17
|
+
import time
|
|
18
|
+
from dataclasses import dataclass
|
|
19
|
+
from difflib import SequenceMatcher
|
|
20
|
+
from pathlib import Path
|
|
21
|
+
from typing import Any
|
|
22
|
+
|
|
23
|
+
log = logging.getLogger(__name__)
|
|
24
|
+
|
|
25
|
+
|
|
26
|
+
def _write_json_atomic(path: Path, obj: Any) -> None:
    """Serialize ``obj`` as JSON into ``path`` via a sibling temp file.

    The parent directory is created on demand. The data is written to a
    ``.tmp`` file in the same directory, flushed and fsynced, then moved over
    ``path`` with ``os.replace``. The temp file is removed if it is still
    around when the function exits.
    """
    # ai is the leaf wheel and can't reach core.atomicio. The cache is
    # best-effort (a torn write just forces a re-fetch), but cheap crash-safety
    # is still worth having.
    folder = path.parent
    folder.mkdir(parents=True, exist_ok=True)
    handle, scratch = tempfile.mkstemp(dir=folder, suffix=".tmp")
    try:
        with os.fdopen(handle, "w", encoding="utf-8") as sink:
            sink.write(json.dumps(obj))
            sink.flush()
            os.fsync(sink.fileno())
        os.replace(scratch, path)
    finally:
        # After a successful replace the temp name no longer exists; this only
        # cleans up when something above failed.
        if os.path.exists(scratch):
            os.remove(scratch)
|
|
41
|
+
|
|
42
|
+
# Endpoint the pricing payload is downloaded from.
MODELS_DEV_URL = "https://models.dev/api.json"
# Maximum age, in seconds, of a cached payload before a re-fetch is attempted.
CACHE_TTL_SECONDS = 24 * 60 * 60  # refresh pricing at most once a day
# Short network timeout: pricing is a startup nicety, not a blocker. A
# first-run offline user (no cache yet) must not wait long before we fall back to
# "cost unknown". Keep it well under a human's patience threshold.
HTTP_TIMEOUT_SECONDS = 5.0

# Minimum similarity score for a fuzzy model match to be trusted. Below this we
# return no pricing (better an honest "unknown" than a wrong, confident price).
SIMILARITY_THRESHOLD = 0.62
|
|
52
|
+
|
|
53
|
+
|
|
54
|
+
@dataclass
class TokenUsage:
    """Provider-normalized token counters for a single call or a running sum."""

    input: int = 0
    output: int = 0
    cache_read: int = 0
    cache_creation: int = 0

    def add(self, other: TokenUsage) -> None:
        """Fold ``other``'s counters into this record, in place."""
        for counter in ("input", "output", "cache_read", "cache_creation"):
            setattr(self, counter, getattr(self, counter) + getattr(other, counter))

    @property
    def total(self) -> int:
        """Sum of all four counters."""
        return self.input + self.output + self.cache_read + self.cache_creation

    def as_dict(self) -> dict[str, int]:
        """Return the counters as a plain dict keyed by field name."""
        return {
            counter: getattr(self, counter)
            for counter in ("input", "output", "cache_read", "cache_creation")
        }

    @classmethod
    def from_dict(cls, d: dict[str, Any] | None) -> TokenUsage:
        """Build a record from a dict; missing or falsy entries count as 0.

        ``None`` is accepted and yields an all-zero record. Each present value
        is passed through ``int``.
        """
        source = d or {}
        counts = {
            counter: int(source.get(counter, 0) or 0)
            for counter in ("input", "output", "cache_read", "cache_creation")
        }
        return cls(**counts)
|
|
90
|
+
|
|
91
|
+
|
|
92
|
+
@dataclass
class ModelPricing:
    """Per-model USD rates, quoted per 1M tokens (models.dev `cost` block)."""

    # The provider/model the caller asked about.
    provider: str
    model: str
    # Rates in USD per one million tokens.
    input: float = 0.0
    output: float = 0.0
    cache_read: float = 0.0
    cache_write: float = 0.0
    # The models.dev entry we actually priced against. Equal to ``model`` on an
    # exact hit; on a fuzzy hit it names the closest catalog model (e.g.
    # ``openai/gpt-oss-120b`` for a requested ``openai/gpt-oss-120b:nitro``).
    resolved_model: str = ""
    resolved_provider: str = ""
    match: str = "exact"  # "exact" | "normalized" | "similar"
    similarity: float = 1.0

    @property
    def is_fuzzy(self) -> bool:
        """True for any match kind other than ``"exact"``."""
        return self.match != "exact"

    def cost_for(self, usage: TokenUsage) -> float:
        """Return the USD cost of ``usage`` at these per-1M-token rates."""
        # Accumulate in a fixed order: input, output, cache read, cache write.
        weighted = usage.input * self.input
        weighted = weighted + usage.output * self.output
        weighted = weighted + usage.cache_read * self.cache_read
        weighted = weighted + usage.cache_creation * self.cache_write
        return weighted / 1_000_000
|
|
122
|
+
|
|
123
|
+
|
|
124
|
+
# ----------------------------------------------------------------------------
|
|
125
|
+
# models.dev lookup
|
|
126
|
+
# ----------------------------------------------------------------------------
|
|
127
|
+
def _ci_get(d: Any, key: str) -> Any:
    """Look ``key`` up in ``d``, ignoring case (model/provider ids vary in casing).

    An exact-case hit wins; otherwise the first string key that matches
    case-insensitively is used. Returns ``None`` when ``d`` is not a dict,
    ``key`` is empty, or nothing matches.
    """
    if not isinstance(d, dict) or not key:
        return None
    if key in d:
        return d[key]
    wanted = key.lower()
    return next(
        (
            value
            for candidate, value in d.items()
            if isinstance(candidate, str) and candidate.lower() == wanted
        ),
        None,
    )
|
|
138
|
+
|
|
139
|
+
|
|
140
|
+
# Routing-variant suffixes appended by providers such as OpenRouter do not
# change the underlying model's price, so they are removed before matching,
# e.g. ``openai/gpt-oss-120b:nitro`` -> ``openai/gpt-oss-120b``.
_VARIANT_SUFFIX_RE = re.compile(r":[^:/]+$")


def _normalize_model_id(model: str) -> str:
    """Return ``model`` trimmed, lowercased and without its ``:variant`` tail."""
    lowered = (model or "").strip().lower()
    # Only a trailing ``:variant`` (nitro/floor/free/beta/extended/online/...)
    # is dropped. models.dev ids never carry a ``:`` so this only removes
    # routing noise.
    without_variant = _VARIANT_SUFFIX_RE.sub("", lowered)
    return without_variant.strip()
|
|
153
|
+
|
|
154
|
+
|
|
155
|
+
def _basename(model_id: str) -> str:
    """Return the part after the last ``/`` (the whole id when there is none)."""
    # "openai/gpt-oss-120b" -> "gpt-oss-120b"
    _vendor, _slash, leaf = model_id.rpartition("/")
    return leaf
|
|
158
|
+
|
|
159
|
+
|
|
160
|
+
def _iter_models(payload: dict, provider: str):
    """Yield ``(provider_id, provider_entry, model_id, model_entry)`` tuples.

    Models of the configured provider are produced first, so a routed model is
    priced against *that* provider's catalog (e.g. OpenRouter pricing for an
    OpenRouter-routed model); every other provider follows in payload order.
    Provider or model entries that are not dicts are skipped.
    """

    def _dict_models(provider_entry):
        # (model_id, entry) pairs of one provider, dict entries only.
        catalog = provider_entry.get("models", {}) or {}
        return (
            (model_id, model_entry)
            for model_id, model_entry in catalog.items()
            if isinstance(model_entry, dict)
        )

    configured_id = None
    configured = _ci_get(payload, provider)
    if isinstance(configured, dict):
        # Recover the payload's own spelling of the provider key so the second
        # pass can skip it.
        configured_id = next(
            (key for key in payload if key.lower() == (provider or "").lower()),
            provider,
        )
        for model_id, model_entry in _dict_models(configured):
            yield configured_id, configured, model_id, model_entry

    for provider_id, provider_entry in payload.items():
        if provider_id != configured_id and isinstance(provider_entry, dict):
            for model_id, model_entry in _dict_models(provider_entry):
                yield provider_id, provider_entry, model_id, model_entry
|
|
180
|
+
|
|
181
|
+
|
|
182
|
+
def _score(query_norm: str, candidate_id: str) -> float:
    """Rate how well ``candidate_id`` matches an already-normalized query.

    Returns 1.0 when the normalized ids are equal, 0.97 when only the parts
    after the last ``/`` are equal, and otherwise the largest of: the
    full-id similarity ratio, the basename similarity ratio, and a flat 0.9
    when one non-empty basename contains the other.
    """
    candidate = _normalize_model_id(candidate_id)
    if candidate == query_norm:
        return 1.0

    # A matching basename is a strong signal even if the vendor prefix differs.
    query_leaf = _basename(query_norm)
    candidate_leaf = _basename(candidate)
    if candidate_leaf == query_leaf:
        return 0.97

    whole_ratio = SequenceMatcher(None, query_norm, candidate).ratio()
    leaf_ratio = SequenceMatcher(None, query_leaf, candidate_leaf).ratio()

    # Reward containment (e.g. "gpt-oss-120b" inside "openai/gpt-oss-120b").
    if (query_leaf and query_leaf in candidate_leaf) or (
        candidate_leaf and candidate_leaf in query_leaf
    ):
        containment = 0.9
    else:
        containment = 0.0
    return max(whole_ratio, leaf_ratio, containment)
|
|
196
|
+
|
|
197
|
+
|
|
198
|
+
def _find_model(payload: Any, provider: str, model: str) -> tuple[dict, dict, str, str, str, float] | None:
    """Pick the catalog entry that best matches ``provider`` + ``model``.

    Returns ``(provider_entry, model_entry, resolved_provider_id,
    resolved_model_id, match_kind, similarity)`` or ``None``. Candidates are
    tried in this order:

    1. **exact** case-insensitive id in the named provider, then any provider;
    2. **normalized** exact (after stripping the routing-variant suffix);
    3. **similar** — the closest catalog id by similarity, above a threshold,
       preferring the configured provider when its best match ties.
    """
    if not (isinstance(payload, dict) and model):
        return None

    # 1. Exact id: the named provider first, then every provider in payload
    #    order. The caller's own ``model`` spelling is reported back.
    named = _ci_get(payload, provider)
    if isinstance(named, dict):
        hit = _ci_get(named.get("models", {}), model)
        if isinstance(hit, dict):
            return named, hit, provider, model, "exact", 1.0
    for provider_id, provider_entry in payload.items():
        if isinstance(provider_entry, dict):
            hit = _ci_get(provider_entry.get("models", {}), model)
            if isinstance(hit, dict):
                return provider_entry, hit, provider_id, model, "exact", 1.0

    # 2 & 3. Normalized-exact and similarity over all candidates,
    #        configured provider first.
    wanted = _normalize_model_id(model)
    if not wanted:
        return None

    configured_id = next(
        (key for key in payload if key.lower() == (provider or "").lower()), None
    )
    top_rank = None  # (score, in_configured_provider)
    top_hit = None  # (provider_entry, model_entry, provider_id, model_id)
    for provider_id, provider_entry, model_id, model_entry in _iter_models(payload, provider):
        score = _score(wanted, model_id)
        in_configured = provider_id == configured_id
        # A perfect score inside the configured provider cannot be beaten.
        if in_configured and score >= 1.0:
            return provider_entry, model_entry, provider_id, model_id, "normalized", 1.0
        rank = (score, in_configured)
        # Strict ">" keeps the earliest candidate on a full tie.
        if top_rank is None or rank > top_rank:
            top_rank = rank
            top_hit = (provider_entry, model_entry, provider_id, model_id)

    if top_rank is not None and top_rank[0] >= SIMILARITY_THRESHOLD:
        best_score = top_rank[0]
        provider_entry, model_entry, provider_id, model_id = top_hit
        kind = "similar" if best_score < 1.0 else "normalized"
        return provider_entry, model_entry, provider_id, model_id, kind, round(best_score, 3)
    return None
|
|
248
|
+
|
|
249
|
+
|
|
250
|
+
def pricing_from_payload(payload: Any, provider: str, model: str) -> ModelPricing | None:
    """Turn an in-memory api.json payload into a :class:`ModelPricing` (pure).

    When the exact id is not in the catalog the lookup falls back to a
    similarity search, so a routed/variant model (``…:nitro``) is priced
    against its closest match. Returns ``None`` when nothing suitable is
    found. Rates missing from the entry's ``cost`` block default to 0.
    """
    hit = _find_model(payload, (provider or "").strip(), (model or "").strip())
    if not hit:
        return None
    _provider_entry, model_entry, found_provider, found_model, match_kind, score = hit
    rates = model_entry.get("cost") or {}

    def _rate(field: str) -> float:
        # Missing, None or otherwise falsy rates are treated as 0.
        return float(rates.get(field, 0) or 0)

    return ModelPricing(
        provider=provider,
        model=model,
        input=_rate("input"),
        output=_rate("output"),
        cache_read=_rate("cache_read"),
        cache_write=_rate("cache_write"),
        resolved_model=found_model,
        resolved_provider=found_provider,
        match=match_kind,
        similarity=score,
    )
|
|
273
|
+
|
|
274
|
+
|
|
275
|
+
# ----------------------------------------------------------------------------
|
|
276
|
+
# cached fetch
|
|
277
|
+
# ----------------------------------------------------------------------------
|
|
278
|
+
def _cache_path(cache_dir: str) -> Path:
    """Return the location of the pricing cache file inside ``cache_dir``."""
    return Path(cache_dir).joinpath("models_cache.json")
|
|
280
|
+
|
|
281
|
+
|
|
282
|
+
def _load_cache(cache_dir: str) -> tuple[float, Any]:
    """Read the cache file and return ``(fetched_at, payload)``.

    Any failure — missing file, unreadable JSON, unexpected structure — yields
    ``(0.0, None)`` instead of raising.
    """
    try:
        raw = _cache_path(cache_dir).read_text(encoding="utf-8")
        record = json.loads(raw)
        stamp = float(record.get("fetched_at", 0))
        return stamp, record.get("payload")
    except Exception:
        return 0.0, None
|
|
288
|
+
|
|
289
|
+
|
|
290
|
+
def _save_cache(cache_dir: str, payload: Any) -> None:
    """Persist ``payload`` with the current timestamp; never raises.

    A failed write is only logged at debug level.
    """
    try:
        target = _cache_path(cache_dir)
        record = {"fetched_at": time.time(), "payload": payload}
        _write_json_atomic(target, record)
    except Exception:
        log.debug("pricing cache write failed", exc_info=True)  # best-effort
|
|
296
|
+
|
|
297
|
+
|
|
298
|
+
def fetch_api_json(cache_dir: str = ".dacli", force_refresh: bool = False) -> Any:
    """Return the models.dev payload, using a TTL cache and offline fallback.

    Args:
        cache_dir: Directory holding the on-disk cache file.
        force_refresh: When true, try the network even if the cache is fresh.

    Returns:
        The freshly fetched payload on success; otherwise whatever the cache
        held (possibly stale), or ``None`` when there is no cache either.
        Never raises on network, import or decoding problems.
    """
    fetched_at, cached = _load_cache(cache_dir)
    fresh = cached is not None and (time.time() - fetched_at) < CACHE_TTL_SECONDS
    if fresh and not force_refresh:
        return cached

    try:
        # Imported lazily so a missing httpx degrades to "cost unknown".
        import httpx

        resp = httpx.get(MODELS_DEV_URL, timeout=HTTP_TIMEOUT_SECONDS)
        resp.raise_for_status()
        fetched = resp.json()
        # The lookup only understands a provider -> entry mapping. Anything
        # else must not overwrite a usable (if stale) cache for a whole TTL.
        if not isinstance(fetched, dict):
            raise ValueError(
                f"unexpected models.dev payload type: {type(fetched).__name__}"
            )
    except Exception:
        # Best-effort: record why the refresh failed, then fall back.
        log.debug("models.dev pricing fetch failed; using cached payload", exc_info=True)
        return cached  # stale cache or None when offline

    _save_cache(cache_dir, fetched)
    return fetched
|
|
315
|
+
|
|
316
|
+
|
|
317
|
+
def fetch_pricing(
    provider: str,
    model: str,
    cache_dir: str = ".dacli",
    force_refresh: bool = False,
) -> ModelPricing | None:
    """Look up pricing for ``provider`` + ``model``.

    Loads the catalog through the cached fetch and resolves the model in it;
    ``None`` means the pricing is unavailable (e.g. offline with no cache, or
    no acceptable match).
    """
    catalog = fetch_api_json(cache_dir, force_refresh=force_refresh)
    pricing = pricing_from_payload(catalog, provider, model)
    return pricing
|
|
@@ -0,0 +1,376 @@
|
|
|
1
|
+
"""Per-provider LLM implementations behind the :class:`Provider` protocol (A-2).
|
|
2
|
+
|
|
3
|
+
Each provider owns its SDK client, request shaping, streaming reassembly, and
|
|
4
|
+
usage normalization, and **declares** its capabilities (``supports_tools``) so
|
|
5
|
+
"does this provider support tools?" is a configure-time property, not a
|
|
6
|
+
turn-time surprise. The :class:`~dacli.ai.llm.LLMClient` facade selects
|
|
7
|
+
a provider in ``initialize()`` and delegates; bounded retry/backoff stays in
|
|
8
|
+
the facade and is injected as ``retry`` so no provider duplicates it.
|
|
9
|
+
"""
|
|
10
|
+
|
|
11
|
+
from __future__ import annotations
|
|
12
|
+
|
|
13
|
+
import json
|
|
14
|
+
import contextlib
|
|
15
|
+
from abc import ABC, abstractmethod
|
|
16
|
+
from typing import TYPE_CHECKING, Any, ClassVar
|
|
17
|
+
|
|
18
|
+
if TYPE_CHECKING:
|
|
19
|
+
from dacli.config.settings import Settings
|
|
20
|
+
from dacli.ai.llm import OnRetry, OnText
|
|
21
|
+
|
|
22
|
+
|
|
23
|
+
def emit(on_text: OnText, delta: str) -> None:
    """Forward ``delta`` to ``on_text``, swallowing any error it raises.

    Nothing happens when there is no callback or the delta is empty. A
    rendering error must never break generation (reliability-first).
    """
    if not (on_text and delta):
        return
    with contextlib.suppress(Exception):
        on_text(delta)
|
|
29
|
+
|
|
30
|
+
|
|
31
|
+
class Provider(ABC):
    """Base class for one LLM provider: capabilities plus request mechanics.

    ``retry`` is the facade's ``_with_retry`` (bounded, jittered exponential
    backoff); it is shared and never reimplemented per provider. ``client``
    holds the provider's SDK client, which :meth:`initialize` builds lazily
    with the SDK's own retries disabled so the configured count is
    authoritative.
    """

    # Provider identifier; concrete providers override it.
    name: ClassVar[str] = ""
    # Whether the provider can do tool calling.
    supports_tools: ClassVar[bool] = True

    def __init__(self, settings: Settings, *, retry):
        # Provider-normalized token usage of the most recent generate() call;
        # the facade copies it onto its own ``last_usage`` after delegating.
        self.last_usage: dict[str, int] = {}
        # No SDK client until initialize() runs (or one is injected).
        self.client: Any = None
        self._retry = retry
        self.settings = settings

    @abstractmethod
    async def initialize(self) -> None:
        """Build the SDK client (``max_retries=0`` — retrying is ours)."""

    @abstractmethod
    async def generate(self, messages: list[dict[str, str]], tools: list[dict] | None = None, system_prompt: str | None = None, on_text: OnText = None, model: str | None = None, on_retry: OnRetry = None) -> tuple[str, list[dict]]:
        """Run one completion and return ``(content, tool_calls)``."""

    @abstractmethod
    def normalize_usage(self, raw) -> dict[str, int]:
        """Translate the provider's raw usage object into the shared usage dict."""

    def retryable_exceptions(self) -> tuple[type, ...]:
        """Return the transient error classes that are safe to retry.

        Meant for provider-specific errors such as a 429 rate limit, a dropped
        connection or a 5xx; auth / 4xx-validation errors are deliberately
        excluded so they fail fast. The base implementation returns an empty
        tuple, meaning "retry nothing", so a transient blip simply surfaces
        unchanged.
        """
        return tuple()
|
|
78
|
+
|
|
79
|
+
|
|
80
|
+
class OpenAIProvider(Provider):
    """OpenAI (and any OpenAI-compatible endpoint via ``base_url``)."""

    name = "openai"

    async def initialize(self) -> None:
        """Create the async OpenAI client from ``settings.llm``.

        ``max_retries=0`` switches off the SDK's own retries; retrying is
        done by the retry helper given to the constructor.
        """
        from openai import AsyncOpenAI

        self.client = AsyncOpenAI(
            api_key=self.settings.llm.api_key,
            base_url=self.settings.llm.base_url,
            timeout=self.settings.llm.timeout,
            max_retries=0,
        )

    def retryable_exceptions(self) -> tuple[type, ...]:
        """Return the OpenAI SDK's transient error classes.

        Returns an empty tuple (retry nothing) when the SDK is not installed.
        """
        try:
            from openai import (
                RateLimitError,
                APIConnectionError,
                InternalServerError,
            )
        except ImportError:
            return ()
        return (RateLimitError, APIConnectionError, InternalServerError)

    @staticmethod
    def _parse_arguments(raw: str | None):
        """Decode a tool call's JSON argument string.

        An empty/missing string is treated as ``"{}"`` and malformed JSON
        falls back to ``{}``. Both the streaming and the non-streaming path
        use this helper so they handle bad arguments the same way.
        """
        try:
            return json.loads(raw or "{}")
        except json.JSONDecodeError:
            return {}

    async def generate(
        self,
        messages: list[dict[str, str]],
        tools: list[dict] | None = None,
        system_prompt: str | None = None,
        on_text: OnText = None,
        model: str | None = None,
        on_retry: OnRetry = None,
    ) -> tuple[str, list[dict]]:
        """Run one chat completion against the OpenAI-compatible API.

        Args:
            messages: Conversation messages, sent after the system prompt.
            tools: Tool definitions forwarded unchanged; when given,
                ``tool_choice`` is set to ``"auto"``.
            system_prompt: Prepended as a ``system`` message when truthy.
            on_text: When not ``None``, the request is streamed and text
                deltas are forwarded to it as they arrive.
            model: Overrides ``settings.llm.model`` for this call.
            on_retry: Passed through to the retry helper.

        Returns:
            ``(content, tool_calls)``; each tool call is a dict with the
            keys ``"id"``, ``"name"`` and ``"arguments"``.
        """
        # Build the message list, with the system prompt first when present.
        full_messages = []
        if system_prompt:
            full_messages.append({"role": "system", "content": system_prompt})
        full_messages.extend(messages)

        request_kwargs = {
            "model": model or self.settings.llm.model,
            "messages": full_messages,
            "temperature": self.settings.llm.temperature,
            "max_tokens": self.settings.llm.max_tokens,
        }

        if tools:
            request_kwargs["tools"] = tools
            request_kwargs["tool_choice"] = "auto"

        if on_text is not None:
            return await self._stream(request_kwargs, on_text, on_retry=on_retry)

        # Make request (retried on transient errors; permanent errors fail fast).
        response = await self._retry(
            lambda: self.client.chat.completions.create(**request_kwargs),
            on_retry=on_retry,
            retryable=self.retryable_exceptions(),
        )

        choice = response.choices[0]
        content = choice.message.content or ""

        # Arguments arrive as a JSON string; decode them with the same
        # tolerant helper the streaming path uses, so an empty or malformed
        # string yields {} instead of raising JSONDecodeError.
        tool_calls = [
            {
                "id": tc.id,
                "name": tc.function.name,
                "arguments": self._parse_arguments(tc.function.arguments),
            }
            for tc in (getattr(choice.message, "tool_calls", None) or [])
        ]

        self.last_usage = self.normalize_usage(getattr(response, "usage", None))
        return content, tool_calls

    async def _stream(
        self,
        request_kwargs: dict,
        on_text: OnText,
        on_retry: OnRetry = None,
    ) -> tuple[str, list[dict]]:
        """Streaming variant of :meth:`generate`.

        Text deltas are accumulated and emitted live; tool calls arrive as
        indexed fragments across chunks and are reassembled at the end.
        """
        request_kwargs = {**request_kwargs, "stream": True, "stream_options": {"include_usage": True}}

        # The whole stream is retried as a unit: a transient error while
        # establishing *or* consuming the stream restarts it from scratch.
        # At-most-once-token caveat: partial deltas already emitted to the UI
        # on a failed attempt are discarded; the restart re-emits from the top.
        async def _do() -> tuple[str, list[dict]]:
            stream = await self.client.chat.completions.create(**request_kwargs)

            content = ""
            usage_obj = None
            # index -> {"id", "name", "arguments"(str)}
            acc: dict[int, dict[str, str]] = {}

            async for chunk in stream:
                if getattr(chunk, "usage", None):
                    usage_obj = chunk.usage  # final usage chunk (include_usage)
                if not chunk.choices:
                    continue
                delta = chunk.choices[0].delta
                if getattr(delta, "content", None):
                    content += delta.content
                    emit(on_text, delta.content)
                for tc in (getattr(delta, "tool_calls", None) or []):
                    slot = acc.setdefault(tc.index, {"id": "", "name": "", "arguments": ""})
                    if getattr(tc, "id", None):
                        slot["id"] = tc.id
                    fn = getattr(tc, "function", None)
                    if fn is not None:
                        if getattr(fn, "name", None):
                            slot["name"] = fn.name
                        if getattr(fn, "arguments", None):
                            slot["arguments"] += fn.arguments

            tool_calls = []
            for index in sorted(acc):
                slot = acc[index]
                if not slot["name"]:
                    # A fragment that never received a function name is dropped.
                    continue
                tool_calls.append(
                    {
                        "id": slot["id"],
                        "name": slot["name"],
                        "arguments": self._parse_arguments(slot["arguments"]),
                    }
                )

            self.last_usage = self.normalize_usage(usage_obj)
            return content, tool_calls

        return await self._retry(_do, on_retry=on_retry, retryable=self.retryable_exceptions())

    def normalize_usage(self, raw) -> dict[str, int]:
        """Map an OpenAI usage object onto the shared usage dict.

        OpenAI's ``prompt_tokens`` includes cached tokens, so they are
        subtracted from ``"input"`` and reported as ``"cache_read"`` to keep
        cost from being double-counted. Returns ``{}`` when ``raw`` is ``None``.
        """
        if raw is None:
            return {}
        details = getattr(raw, "prompt_tokens_details", None)
        cached = (getattr(details, "cached_tokens", 0) or 0) if details is not None else 0
        return {
            "input": max(0, (getattr(raw, "prompt_tokens", 0) or 0) - cached),
            "output": getattr(raw, "completion_tokens", 0) or 0,
            "cache_read": cached,
            "cache_creation": 0,
        }
|
|
218
|
+
|
|
219
|
+
|
|
220
|
+
class OpenRouterProvider(OpenAIProvider):
    """OpenAI-compatible provider whose endpoint defaults to OpenRouter."""

    name = "openrouter"

    async def initialize(self) -> None:
        """Create the client, using the OpenRouter URL when ``base_url`` is unset.

        As in the parent class, the SDK's own retries are disabled with
        ``max_retries=0``.
        """
        from openai import AsyncOpenAI

        llm = self.settings.llm
        options = {
            "api_key": llm.api_key,
            "base_url": llm.base_url or "https://openrouter.ai/api/v1",
            "timeout": llm.timeout,
            "max_retries": 0,
        }
        self.client = AsyncOpenAI(**options)
|
|
233
|
+
|
|
234
|
+
|
|
235
|
+
class AnthropicProvider(Provider):
    """Anthropic Messages API."""

    name = "anthropic"

    async def initialize(self) -> None:
        """Create the async Anthropic client from ``settings.llm``.

        ``max_retries=0`` switches off the SDK's own retries; retrying is
        done by the retry helper given to the constructor.
        """
        from anthropic import AsyncAnthropic

        self.client = AsyncAnthropic(
            api_key=self.settings.llm.api_key,
            base_url=self.settings.llm.base_url,
            timeout=self.settings.llm.timeout,
            max_retries=0,
        )

    def retryable_exceptions(self) -> tuple[type, ...]:
        """Return the Anthropic SDK's transient error classes.

        Returns an empty tuple (retry nothing) when the SDK is not installed.
        """
        try:
            from anthropic import (
                RateLimitError,
                APIConnectionError,
                InternalServerError,
            )
        except ImportError:
            return ()
        return (RateLimitError, APIConnectionError, InternalServerError)

    @staticmethod
    def _convert_tools(tools: list[dict]) -> list[dict]:
        """Convert OpenAI-style function tools to Anthropic's tool format.

        ``description`` and ``parameters`` are read defensively so that a
        tool definition lacking them does not raise ``KeyError``: a missing
        description is omitted, and a missing schema becomes an empty object
        schema.
        """
        converted = []
        for tool in tools:
            fn = tool["function"]
            spec = {"name": fn["name"]}
            if "description" in fn:
                spec["description"] = fn["description"]
            spec["input_schema"] = fn.get("parameters") or {"type": "object", "properties": {}}
            converted.append(spec)
        return converted

    async def generate(
        self,
        messages: list[dict[str, str]],
        tools: list[dict] | None = None,
        system_prompt: str | None = None,
        on_text: OnText = None,
        model: str | None = None,
        on_retry: OnRetry = None,
    ) -> tuple[str, list[dict]]:
        """Run one completion against the Anthropic Messages API.

        Args:
            messages: Conversation messages, sent as-is.
            tools: OpenAI-style function tool definitions; converted to
                Anthropic's ``name`` / ``description`` / ``input_schema`` form.
            system_prompt: Sent as the ``system`` field when truthy.
            on_text: When not ``None``, the request is streamed and text is
                forwarded to it as it arrives.
            model: Overrides ``settings.llm.model`` for this call.
            on_retry: Passed through to the retry helper.

        Returns:
            ``(content, tool_calls)``; each tool call is a dict with the
            keys ``"id"``, ``"name"`` and ``"arguments"``.
        """
        request_kwargs = {
            "model": model or self.settings.llm.model,
            "max_tokens": self.settings.llm.max_tokens,
            "messages": messages,
        }

        if system_prompt:
            request_kwargs["system"] = system_prompt

        if tools:
            request_kwargs["tools"] = self._convert_tools(tools)

        if on_text is not None:
            return await self._stream(request_kwargs, on_text, on_retry=on_retry)

        async def _do() -> tuple[str, list[dict]]:
            response = await self.client.messages.create(**request_kwargs)
            self.last_usage = self.normalize_usage(getattr(response, "usage", None))
            return self._extract(response.content)

        return await self._retry(_do, on_retry=on_retry, retryable=self.retryable_exceptions())

    async def _stream(
        self,
        request_kwargs: dict,
        on_text: OnText,
        on_retry: OnRetry = None,
    ) -> tuple[str, list[dict]]:
        """Streaming variant of :meth:`generate`.

        Text events are emitted live; the assembled final message is then
        read for the authoritative content and ``tool_use`` blocks. The
        whole stream is retried as a unit, so text already emitted by a
        failed attempt is emitted again on restart.
        """
        async def _do() -> tuple[str, list[dict]]:
            async with self.client.messages.stream(**request_kwargs) as stream:
                async for text in stream.text_stream:
                    emit(on_text, text)
                final = await stream.get_final_message()
            self.last_usage = self.normalize_usage(getattr(final, "usage", None))
            return self._extract(final.content)

        return await self._retry(_do, on_retry=on_retry, retryable=self.retryable_exceptions())

    def normalize_usage(self, raw) -> dict[str, int]:
        """Map an Anthropic usage object onto the shared usage dict.

        Anthropic reports cache tokens in fields separate from
        ``input_tokens``, so each value is copied directly. Returns ``{}``
        when ``raw`` is ``None``.
        """
        if raw is None:
            return {}
        return {
            "input": getattr(raw, "input_tokens", 0) or 0,
            "output": getattr(raw, "output_tokens", 0) or 0,
            "cache_read": getattr(raw, "cache_read_input_tokens", 0) or 0,
            "cache_creation": getattr(raw, "cache_creation_input_tokens", 0) or 0,
        }

    @staticmethod
    def _extract(blocks) -> tuple[str, list[dict]]:
        """Split response content blocks into text and tool calls.

        ``text`` blocks are concatenated in order; each ``tool_use`` block
        becomes an ``{"id", "name", "arguments"}`` dict. Other block types
        are ignored.
        """
        content = ""
        tool_calls = []
        for block in blocks:
            if block.type == "text":
                content += block.text
            elif block.type == "tool_use":
                tool_calls.append({"id": block.id, "name": block.name, "arguments": block.input})
        return content, tool_calls
|
|
329
|
+
|
|
330
|
+
|
|
331
|
+
class GoogleProvider(Provider):
    """Gemini: registered, but without tool-calling support (P02, Option B).

    The property that matters is ``supports_tools = False``: it lets the
    facade reject this provider at configure time with a clear error rather
    than failing with ``NotImplementedError`` during a turn. The methods
    below are only a safety net for code that uses the class directly.
    """

    name = "google"
    supports_tools = False

    async def initialize(self) -> None:
        """Always raise the "tool use not supported" error."""
        error = unsupported_tools_error(self.name)
        raise error

    async def generate(
        self,
        messages: list[dict[str, str]],
        tools: list[dict] | None = None,
        system_prompt: str | None = None,
        on_text: OnText = None,
        model: str | None = None,
        on_retry: OnRetry = None,
    ) -> tuple[str, list[dict]]:
        """Always raise the "tool use not supported" error."""
        error = unsupported_tools_error(self.name)
        raise error

    def normalize_usage(self, raw) -> dict[str, int]:
        """Return an empty usage dict; ``raw`` is ignored."""
        return dict()
|
|
350
|
+
|
|
351
|
+
|
|
352
|
+
# Provider registry, keyed by provider name. Insertion order matters: it is
# also the order in which alternatives are suggested in error messages.
PROVIDERS: dict[str, type[Provider]] = dict(
    openai=OpenAIProvider,
    anthropic=AnthropicProvider,
    google=GoogleProvider,
    openrouter=OpenRouterProvider,
)
|
|
359
|
+
|
|
360
|
+
|
|
361
|
+
def unsupported_tools_error(name: str) -> ValueError:
    """Build the configure-time error for a provider without tool support.

    The exception is returned, not raised, so callers can ``raise`` it
    themselves. The message lists every registered provider whose
    ``supports_tools`` is true, in registry order.

    Args:
        name: Name of the provider that was rejected.

    Returns:
        A ``ValueError`` naming the provider and suggesting alternatives.
    """
    quoted = [f"'{key}'" for key, cls in PROVIDERS.items() if cls.supports_tools]

    if len(quoted) > 2:
        # Three or more: comma-separated list with ", or" before the last.
        alternatives = ", ".join(quoted[:-1]) + f", or {quoted[-1]}"
    else:
        # Zero, one or two names: no dangling comma, no IndexError on an
        # empty list.
        alternatives = " or ".join(quoted)

    message = f"The '{name}' provider does not yet support tool use, which dacli requires."
    if alternatives:
        message += f" Use {alternatives}."
    return ValueError(message)
|
|
369
|
+
|
|
370
|
+
|
|
371
|
+
def create_provider(name: str, settings: Settings, *, retry) -> Provider:
    """Build the provider registered under ``name``.

    ``name`` is looked up as given; it is expected to be lowercased already.

    Raises:
        ValueError: If no provider is registered under ``name``.
    """
    if name not in PROVIDERS:
        raise ValueError(f"Unsupported LLM provider: {name}")
    provider_cls = PROVIDERS[name]
    return provider_cls(settings, retry=retry)
|
|
@@ -0,0 +1,79 @@
|
|
|
1
|
+
"""Deterministic, offline stand-in for :class:`reasoning.llm.LLMClient`.
|
|
2
|
+
|
|
3
|
+
Driven by an ordered list of scripted *responses*; each ``generate()`` call pops
|
|
4
|
+
the next one and returns it in the exact shape the kernel parses
|
|
5
|
+
(``core/kernel.py``): ``(content, tool_calls)`` where each tool call is
|
|
6
|
+
``{"id", "name", "arguments"}``. An empty ``tool_calls`` ends the agent loop
|
|
7
|
+
(final answer). Running past the end raises :class:`ScriptExhausted` — a real
|
|
8
|
+
signal that the agent looped more than the scenario anticipated.
|
|
9
|
+
|
|
10
|
+
A scripted response is a dict::
|
|
11
|
+
|
|
12
|
+
{
|
|
13
|
+
"text": "optional assistant text",
|
|
14
|
+
"tool_calls": [ {"name": "update_plan", "arguments": {...}} ], # optional
|
|
15
|
+
"usage": {"input": 100, "output": 20}, # optional
|
|
16
|
+
}
|
|
17
|
+
"""
|
|
18
|
+
|
|
19
|
+
from __future__ import annotations
|
|
20
|
+
|
|
21
|
+
from typing import Any
|
|
22
|
+
import contextlib
|
|
23
|
+
|
|
24
|
+
|
|
25
|
+
class ScriptExhausted(RuntimeError):
    """Signals a ``generate()`` call made after the script has run out."""
|
|
27
|
+
|
|
28
|
+
|
|
29
|
+
class ScriptedLLM:
|
|
30
|
+
"""An offline LLM double satisfying the kernel's LLM contract."""
|
|
31
|
+
|
|
32
|
+
def __init__(self, responses: list[dict[str, Any]]):
|
|
33
|
+
self._responses: list[dict[str, Any]] = list(responses or [])
|
|
34
|
+
self._i = 0
|
|
35
|
+
#: Provider-normalized usage of the most recent generate() call.
|
|
36
|
+
self.last_usage: dict[str, int] = {}
|
|
37
|
+
self.exhausted: bool = False
|
|
38
|
+
|
|
39
|
+
async def initialize(self) -> None:
|
|
40
|
+
# No network, nothing to set up.
|
|
41
|
+
return None
|
|
42
|
+
|
|
43
|
+
async def generate(
|
|
44
|
+
self,
|
|
45
|
+
messages: list[dict[str, Any]] | None = None,
|
|
46
|
+
tools: list[dict[str, Any]] | None = None,
|
|
47
|
+
system_prompt: str | None = None,
|
|
48
|
+
on_text: Any | None = None,
|
|
49
|
+
model: str | None = None,
|
|
50
|
+
) -> tuple[str, list[dict[str, Any]]]:
|
|
51
|
+
if self._i >= len(self._responses):
|
|
52
|
+
self.exhausted = True
|
|
53
|
+
raise ScriptExhausted(
|
|
54
|
+
f"ScriptedLLM exhausted after {len(self._responses)} response(s): "
|
|
55
|
+
"the agent requested another generation the scenario did not script."
|
|
56
|
+
)
|
|
57
|
+
spec = self._responses[self._i]
|
|
58
|
+
self._i += 1
|
|
59
|
+
|
|
60
|
+
text = spec.get("text") or ""
|
|
61
|
+
self.last_usage = dict(spec.get("usage") or {})
|
|
62
|
+
|
|
63
|
+
tool_calls: list[dict[str, Any]] = []
|
|
64
|
+
for j, tc in enumerate(spec.get("tool_calls") or [], start=1):
|
|
65
|
+
tool_calls.append(
|
|
66
|
+
{
|
|
67
|
+
"id": tc.get("id") or f"call_{self._i}_{j}",
|
|
68
|
+
"name": tc["name"],
|
|
69
|
+
"arguments": tc.get("arguments") or {},
|
|
70
|
+
}
|
|
71
|
+
)
|
|
72
|
+
|
|
73
|
+
# Presentation parity with streaming providers (headless on_text is a
|
|
74
|
+
# no-op; the chat UI streams). Never let a presentation hook break us.
|
|
75
|
+
if on_text and text:
|
|
76
|
+
with contextlib.suppress(Exception):
|
|
77
|
+
on_text(text)
|
|
78
|
+
|
|
79
|
+
return text, tool_calls
|
|
@@ -0,0 +1,18 @@
|
|
|
1
|
+
Metadata-Version: 2.4
|
|
2
|
+
Name: dacli-ai
|
|
3
|
+
Version: 0.4.0
|
|
4
|
+
Summary: Provider LLM client and token/pricing accounting for dacli
|
|
5
|
+
Author-email: Mouad Jaouhari <github@mj-dev.net>
|
|
6
|
+
Project-URL: Homepage, https://github.com/mouadja02/dacli
|
|
7
|
+
Keywords: llm,anthropic,openai,token accounting
|
|
8
|
+
Requires-Python: >=3.10
|
|
9
|
+
Description-Content-Type: text/markdown
|
|
10
|
+
Requires-Dist: anthropic<1,>=0.40
|
|
11
|
+
Requires-Dist: openai<3,>=1.40
|
|
12
|
+
Requires-Dist: httpx<1,>=0.27
|
|
13
|
+
|
|
14
|
+
# dacli-ai
|
|
15
|
+
|
|
16
|
+
Provider LLM client (anthropic / openai / openrouter) and token/pricing accounting
|
|
17
|
+
for [dacli](https://github.com/mouadja02/dacli). The leaf wheel — no dacli-core
|
|
18
|
+
dependency. Embed it with `dacli-core` for a headless agent.
|
|
@@ -0,0 +1,12 @@
|
|
|
1
|
+
README.md
|
|
2
|
+
pyproject.toml
|
|
3
|
+
src/dacli/ai/__init__.py
|
|
4
|
+
src/dacli/ai/llm.py
|
|
5
|
+
src/dacli/ai/pricing.py
|
|
6
|
+
src/dacli/ai/providers.py
|
|
7
|
+
src/dacli/ai/scripted.py
|
|
8
|
+
src/dacli_ai.egg-info/PKG-INFO
|
|
9
|
+
src/dacli_ai.egg-info/SOURCES.txt
|
|
10
|
+
src/dacli_ai.egg-info/dependency_links.txt
|
|
11
|
+
src/dacli_ai.egg-info/requires.txt
|
|
12
|
+
src/dacli_ai.egg-info/top_level.txt
|
|
@@ -0,0 +1 @@
|
|
|
1
|
+
|
|
@@ -0,0 +1 @@
|
|
|
1
|
+
dacli
|