dispatch-relay 0.0.1__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- dispatch_relay/__init__.py +47 -0
- dispatch_relay/analytics.py +191 -0
- dispatch_relay/caching.py +168 -0
- dispatch_relay/core.py +138 -0
- dispatch_relay/cost.py +239 -0
- dispatch_relay/dspy_adapter.py +176 -0
- dispatch_relay/facade.py +352 -0
- dispatch_relay/interfaces.py +211 -0
- dispatch_relay/prompt_eval.py +209 -0
- dispatch_relay-0.0.1.dist-info/METADATA +81 -0
- dispatch_relay-0.0.1.dist-info/RECORD +15 -0
- dispatch_relay-0.0.1.dist-info/WHEEL +5 -0
- dispatch_relay-0.0.1.dist-info/licenses/LICENSE +21 -0
- dispatch_relay-0.0.1.dist-info/top_level.txt +2 -0
- omega_llm/__init__.py +23 -0
|
@@ -0,0 +1,47 @@
|
|
|
1
|
+
"""dispatch-relay — the swarph's canonical provider-agnostic LLM layer.
|
|
2
|
+
|
|
3
|
+
Pure core + 3 injected seams. The T1 contract, AI²-converged
|
|
4
|
+
with the peer 2026-06-08 (pending peer co-review).
|
|
5
|
+
|
|
6
|
+
Exports the three injected-interface seams (each a ``runtime_checkable`` Protocol +
|
|
7
|
+
a dependency-light default impl), the shared value types, the core-owned provider
|
|
8
|
+
facts, and the pure cost model:
|
|
9
|
+
|
|
10
|
+
- ConfigSource / DefaultConfigSource — resolve(key, role, default) → model_id
|
|
11
|
+
- UsageSink / NoOpUsageSink — record(...) usage (separate cache fields)
|
|
12
|
+
- DispatchBackend / DefaultDispatchBackend — supports(...) + dispatch(...) → LLMResponse
|
|
13
|
+
- LLMResponse / UsageRecord — shared value types
|
|
14
|
+
- DEFAULTS / extract_usage / resolve_usage — core-owned provider facts
|
|
15
|
+
- estimate_cost — pure pre-call cost estimator
|
|
16
|
+
"""
|
|
17
|
+
from __future__ import annotations
|
|
18
|
+
|
|
19
|
+
from .cost import estimate_cost
|
|
20
|
+
from .core import DEFAULTS, extract_usage, resolve_usage
|
|
21
|
+
from .interfaces import (
|
|
22
|
+
ConfigSource,
|
|
23
|
+
DefaultConfigSource,
|
|
24
|
+
UsageSink,
|
|
25
|
+
NoOpUsageSink,
|
|
26
|
+
DispatchBackend,
|
|
27
|
+
DefaultDispatchBackend,
|
|
28
|
+
LLMResponse,
|
|
29
|
+
UsageRecord,
|
|
30
|
+
)
|
|
31
|
+
|
|
32
|
+
__all__ = [
|
|
33
|
+
"ConfigSource",
|
|
34
|
+
"DefaultConfigSource",
|
|
35
|
+
"UsageSink",
|
|
36
|
+
"NoOpUsageSink",
|
|
37
|
+
"DispatchBackend",
|
|
38
|
+
"DefaultDispatchBackend",
|
|
39
|
+
"LLMResponse",
|
|
40
|
+
"UsageRecord",
|
|
41
|
+
"DEFAULTS",
|
|
42
|
+
"extract_usage",
|
|
43
|
+
"resolve_usage",
|
|
44
|
+
"estimate_cost",
|
|
45
|
+
]
|
|
46
|
+
|
|
47
|
+
__version__ = "0.0.1"
|
|
@@ -0,0 +1,191 @@
|
|
|
1
|
+
"""Token-usage analytics — pure aggregation over in-memory usage records.
|
|
2
|
+
|
|
3
|
+
This is the INVERTED form of the host's old fetch-then-aggregate helper: the
|
|
4
|
+
library owns the AGGREGATION LOGIC and takes the usage records AS INPUT; it does
|
|
5
|
+
NOT fetch. The host keeps a thin wrapper that reads its store (a usage-table
|
|
6
|
+
hypertable, etc.) and delegates the rows here. Inverting the dependency keeps
|
|
7
|
+
this module pure-stdlib + zero-dep so it lives in the dependency-light core
|
|
8
|
+
without re-opening the locked T1 seam contract (no 4th read-seam).
|
|
9
|
+
|
|
10
|
+
Each record is a mapping with the keys this module reads:
|
|
11
|
+
``provider`` / ``role`` / ``caller`` / ``model`` / ``day`` and the integer token
|
|
12
|
+
columns ``input`` / ``output`` / ``cached`` / ``thought`` plus a float ``cost``.
|
|
13
|
+
|
|
14
|
+
Schema-asymmetry note (load-bearing for the per-provider arithmetic in
|
|
15
|
+
``_row_total_tokens`` below):
|
|
16
|
+
|
|
17
|
+
- Anthropic: ``input`` is the FRESH remainder (``lc_input - cache_read -
|
|
18
|
+
cache_create``) and ``cached`` is SEPARATE. The two columns are disjoint.
|
|
19
|
+
True prompt size = ``input + cached``.
|
|
20
|
+
- Gemini: ``input`` = ``prompt_token_count`` (the FULL prompt) with
|
|
21
|
+
``cached`` as a SUBSET. True prompt size = ``input`` (``cached`` is
|
|
22
|
+
informational, already counted).
|
|
23
|
+
- OpenAI: ``input`` = ``prompt_tokens`` (FULL) with ``cached`` a subset.
|
|
24
|
+
Same convention as Gemini.
|
|
25
|
+
- ``thought`` carries reasoning tokens (Gemini Pro thinking mode, o1-style
|
|
26
|
+
models) — additive for total prompt cost regardless of provider, currently
|
|
27
|
+
0 for non-reasoning calls.
|
|
28
|
+
|
|
29
|
+
Summing ``input + output`` everywhere undercounts Anthropic by the cache_read
|
|
30
|
+
amount; summing ``input + output + cached`` everywhere double-counts
|
|
31
|
+
Gemini/OpenAI by the same. The fix is per-provider arithmetic.
|
|
32
|
+
"""
|
|
33
|
+
from __future__ import annotations
|
|
34
|
+
|
|
35
|
+
from collections import defaultdict
|
|
36
|
+
from typing import Optional
|
|
37
|
+
|
|
38
|
+
|
|
39
|
+
# Providers whose ``input`` column is the FULL prompt size: cached tokens are
# a subset of ``input`` rather than a separate bucket, so adding ``cached`` on
# top of ``input`` for these would count the cached tokens twice.
_INPUT_INCLUDES_CACHED = {"gemini", "openai"}


def _row_total_tokens(provider: str, in_tok: int, out_tok: int,
                      cached: int, thought: int) -> int:
    """Return the provider-aware token total for a single usage row.

    Args:
        provider: provider label of the row; matched case-sensitively
            against ``_INPUT_INCLUDES_CACHED``.
        in_tok: value of the row's ``input`` column.
        out_tok: value of the row's ``output`` column.
        cached: value of the row's ``cached`` column.
        thought: value of the row's ``thought`` (reasoning) column.

    Returns:
        ``in_tok + out_tok + thought`` for providers listed in
        ``_INPUT_INCLUDES_CACHED``; for every other provider (Anthropic and
        anything unrecognised) ``cached`` is added on top, because those rows
        are treated as storing cached tokens disjoint from ``input``.
    """
    total = in_tok + out_tok + thought
    if provider not in _INPUT_INCLUDES_CACHED:
        # Disjoint-cache schema: cached tokens are not yet part of in_tok.
        total += cached
    return total
|
|
53
|
+
|
|
54
|
+
|
|
55
|
+
def summarize_usage(records, *, days: int = 7, role: Optional[str] = None,
                    provider: Optional[str] = None,
                    caller: Optional[str] = None) -> dict:
    """Roll already-fetched usage ``records`` up into summary buckets.

    No filtering happens here: ``days`` / ``role`` / ``provider`` / ``caller``
    are copied verbatim into the result as metadata, and every row passed in
    is aggregated.

    Per-row normalisation (a missing, ``None`` or otherwise falsy value counts
    as absent):

    - ``provider`` falls back to ``"unknown"``
    - ``role`` falls back to ``"agents"``
    - ``caller`` falls back to ``"<unattributed>"`` for the ``by_caller``
      bucket key only; the ``by_model`` row keeps the raw value
    - ``cost`` falls back to ``0.0``; token columns fall back to ``0``

    Args:
        records: iterable of mappings exposing ``.get`` with the keys
            ``provider``, ``role``, ``caller``, ``model``, ``day``, ``input``,
            ``output``, ``cached``, ``thought`` and ``cost``.
        days: echoed into the result.
        role: echoed into the result.
        provider: echoed into the result.
        caller: echoed into the result.

    Returns:
        A dict with, in this key order::

            {
              "days", "role", "provider", "caller",   # echoed arguments
              "total_cost_usd": float,                 # rounded to 6 places
              "total_tokens": int,                     # provider-aware sum
              "by_provider": {provider: {input, output, cached, thought,
                                         cost, n_rows}},
              "by_model": [one dict per input row, highest cost first],
              "by_role": {role: {cost, tokens}},
              "by_caller": {caller: {cost, tokens, n_rows}},
            }

        With no rows the totals are ``0`` / ``0.0`` and the buckets are empty.
    """
    def _new_provider_bucket() -> dict:
        return {"input": 0, "output": 0, "cached": 0, "thought": 0,
                "cost": 0.0, "n_rows": 0}

    provider_buckets: dict[str, dict] = defaultdict(_new_provider_bucket)
    role_buckets: dict[str, dict] = defaultdict(
        lambda: {"cost": 0.0, "tokens": 0}
    )
    caller_buckets: dict[str, dict] = defaultdict(
        lambda: {"cost": 0.0, "tokens": 0, "n_rows": 0}
    )
    rows: list[dict] = []
    cost_sum = 0.0
    token_sum = 0

    for rec in records:
        prov_name = rec.get("provider") or "unknown"
        role_name = rec.get("role") or "agents"
        # Stable, non-null bucket key so consumers never see a null caller.
        caller_name = rec.get("caller") or "<unattributed>"
        row_cost = float(rec.get("cost") or 0.0)
        n_in = int(rec.get("input") or 0)
        n_out = int(rec.get("output") or 0)
        n_cached = int(rec.get("cached") or 0)
        n_thought = int(rec.get("thought") or 0)
        row_tokens = _row_total_tokens(
            prov_name, n_in, n_out, n_cached, n_thought
        )

        # Raw columns are summed per provider without any cache adjustment.
        p_bucket = provider_buckets[prov_name]
        p_bucket["input"] += n_in
        p_bucket["output"] += n_out
        p_bucket["cached"] += n_cached
        p_bucket["thought"] += n_thought
        p_bucket["cost"] += row_cost
        p_bucket["n_rows"] += 1

        r_bucket = role_buckets[role_name]
        r_bucket["cost"] += row_cost
        r_bucket["tokens"] += row_tokens

        c_bucket = caller_buckets[caller_name]
        c_bucket["cost"] += row_cost
        c_bucket["tokens"] += row_tokens
        c_bucket["n_rows"] += 1

        rows.append({
            "model": rec.get("model"),
            "provider": prov_name,
            "role": role_name,
            "caller": rec.get("caller"),  # raw value, None preserved
            "day": rec.get("day"),
            "input": n_in,
            "output": n_out,
            "cached": n_cached,
            "thought": n_thought,
            "cost": row_cost,
        })

        cost_sum += row_cost
        token_sum += row_tokens

    # Most expensive rows first; the sort is stable, so ties keep input order.
    rows = sorted(rows, key=lambda row: -row["cost"])

    return {
        "days": days,
        "role": role,
        "provider": provider,
        "caller": caller,
        "total_cost_usd": round(cost_sum, 6),
        "total_tokens": token_sum,
        # Copy each bucket into a plain dict for the returned structure.
        "by_provider": {
            name: dict(bucket) for name, bucket in provider_buckets.items()
        },
        "by_model": rows,
        "by_role": {
            name: dict(bucket) for name, bucket in role_buckets.items()
        },
        "by_caller": {
            name: dict(bucket) for name, bucket in caller_buckets.items()
        },
    }
|
|
154
|
+
|
|
155
|
+
|
|
156
|
+
def detect_anomalies(records, *, spike_factor: float = 2.0) -> list[dict]:
    """Flag (model, day) cells whose cost is >= ``spike_factor`` x baseline.

    Rows are first collapsed into one summed cost per (model, day) cell, so
    several rows for the same model on the same day (for example one per
    caller) count as a single day. For each cell the baseline is the mean
    cost of that model's OTHER days in ``records``; the day being checked is
    excluded from its own baseline.

    A model is only examined when it has at least three distinct days, so
    every baseline averages two or more days. Cells whose baseline is not
    positive are never flagged.

    Args:
        records: usage rows — mappings exposing ``.get`` with the keys
            ``model``, ``day`` and ``cost``. A missing or falsy ``model``
            is grouped under ``"unknown"``; a missing or falsy ``cost``
            counts as ``0.0``. ``day`` values are used as dict keys and must
            therefore be hashable.
        spike_factor: multiple of the baseline at or above which a cell is
            reported.

    Returns:
        A list of ``{"model", "day", "cost", "baseline_avg", "factor"}``
        dicts, highest cost first (``cost`` / ``baseline_avg`` rounded to 6
        places, ``factor`` to 2). Empty when ``records`` is empty or falsy.
    """
    if not records:
        return []

    # model -> {day: summed cost}. Plain dicts keep first-seen order for both
    # models and days, so equal-cost results stay deterministic.
    daily_cost: dict[str, dict] = defaultdict(dict)
    for r in records:
        model = r.get("model") or "unknown"
        day = r.get("day")
        per_day = daily_cost[model]
        per_day[day] = per_day.get(day, 0.0) + float(r.get("cost") or 0.0)

    spikes = []
    for model, per_day in daily_cost.items():
        if len(per_day) < 3:
            # Fewer than two comparison days: no meaningful baseline.
            continue
        for day, cost in per_day.items():
            others = [c for d, c in per_day.items() if d != day]
            baseline = sum(others) / len(others)
            if baseline > 0 and cost >= spike_factor * baseline:
                spikes.append({
                    "model": model,
                    "day": day,
                    "cost": round(cost, 6),
                    "baseline_avg": round(baseline, 6),
                    "factor": round(cost / baseline, 2),
                })
    spikes.sort(key=lambda s: -s["cost"])
    return spikes
|
|
@@ -0,0 +1,168 @@
|
|
|
1
|
+
"""Anthropic prompt-caching helper — wrap any LangChain-compatible LLM.
|
|
2
|
+
|
|
3
|
+
Wraps any LangChain-compatible LLM (typically a :class:`dispatch_relay.facade._BoundLLM`)
|
|
4
|
+
so every ``.invoke(messages)`` call prepends a SystemMessage carrying
|
|
5
|
+
``cache_control: {"type": "ephemeral", "ttl": ttl}`` in the correct
|
|
6
|
+
list-of-blocks shape.
|
|
7
|
+
|
|
8
|
+
Why list-of-blocks: ``langchain_anthropic`` SILENTLY DROPS the
|
|
9
|
+
``additional_kwargs={"cache_control": ...}`` shape. The only shape that
|
|
10
|
+
propagates to the wire is::
|
|
11
|
+
|
|
12
|
+
SystemMessage(content=[{"type": "text", "text": ...,
|
|
13
|
+
"cache_control": {"type": "ephemeral", "ttl": "1h"}}])
|
|
14
|
+
|
|
15
|
+
For non-Anthropic LLMs the SystemMessage is sent without cache_control (Gemini's
|
|
16
|
+
implicit caching handles long stable prefixes automatically; OpenAI auto-caches
|
|
17
|
+
prompt prefixes ≥1024 tokens at 50% off input price).
|
|
18
|
+
|
|
19
|
+
This module lives in the ``[facade]`` extra (it needs ``langchain_core``), but the
|
|
20
|
+
langchain import is LAZY (inside :func:`build_cached_system_message`) so importing
|
|
21
|
+
the module is cheap and the zero-dep core stays importable without it.
|
|
22
|
+
"""
|
|
23
|
+
from __future__ import annotations
|
|
24
|
+
|
|
25
|
+
from typing import Any, Optional
|
|
26
|
+
|
|
27
|
+
|
|
28
|
+
def build_cached_system_message(text: str, ttl: str, is_anthropic: bool):
    """Return a ``SystemMessage`` carrying ``text`` in the provider's shape.

    ``langchain_core`` is imported inside the function, so importing this
    module does not require it.

    Args:
        text: the system prompt text.
        ttl: value placed in the ``cache_control`` marker's ``"ttl"`` field;
            only used when ``is_anthropic`` is true.
        is_anthropic: selects the message shape.

    Returns:
        For Anthropic, a ``SystemMessage`` whose content is a one-element
        list holding a text block with
        ``cache_control={"type": "ephemeral", "ttl": ttl}``. Otherwise a
        ``SystemMessage`` with ``text`` as plain string content and no cache
        marker.
    """
    from langchain_core.messages import SystemMessage

    if not is_anthropic:
        return SystemMessage(content=text)

    cached_block = {
        "type": "text",
        "text": text,
        "cache_control": {"type": "ephemeral", "ttl": ttl},
    }
    return SystemMessage(content=[cached_block])


# Backward-compat alias for the private name (callers should use the public one).
_build_cached_system_message = build_cached_system_message
|
|
47
|
+
|
|
48
|
+
|
|
49
|
+
def _detect_anthropic(inner: Any) -> bool:
    """Report whether ``inner`` should be treated as an Anthropic model.

    Detection order:

    1. If ``inner`` has a non-``None`` ``_provider`` attribute, that value
       decides: the result is true only when it equals ``"anthropic"``.
    2. Otherwise the class name of ``inner`` is checked for the substring
       ``"Anthropic"`` (case-sensitive).

    Anything matching neither rule yields ``False``, which makes the caller
    send the system message without a ``cache_control`` marker.
    """
    declared = getattr(inner, "_provider", None)
    if declared is not None:
        return declared == "anthropic"
    return "Anthropic" in type(inner).__name__
|
|
60
|
+
|
|
61
|
+
|
|
62
|
+
class _CacheableLLM:
    """Proxy over a LangChain-style LLM that prepends a cached SystemMessage.

    ``invoke`` / ``ainvoke`` / ``stream`` / ``astream`` put the system
    message built by :func:`build_cached_system_message` in front of the
    caller's messages before delegating to the wrapped object.
    ``bind_tools`` / ``with_structured_output`` delegate and then re-wrap the
    result, so the prefix survives chained calls. Every other attribute is
    forwarded to the wrapped object via ``__getattr__``.

    The Anthropic flag is decided once, at the outermost wrap, and passed
    explicitly to every re-wrap. It is not re-detected on the chained inner
    object, whose class name and attributes may no longer reveal the
    provider.

    Args:
        inner: the object to wrap; calls are delegated to it.
        cached_text: text of the system message to prepend.
        ttl: cache TTL passed to :func:`build_cached_system_message`.
        is_anthropic: explicit provider flag; when ``None`` it is detected
            from ``inner`` with :func:`_detect_anthropic`.
    """

    def __init__(self, inner: Any, cached_text: str, ttl: str = "1h",
                 *, is_anthropic: Optional[bool] = None):
        self._inner = inner
        self._cached_text = cached_text
        self._ttl = ttl
        self._is_anthropic = (
            _detect_anthropic(inner) if is_anthropic is None else is_anthropic
        )

    def _prepend(self, messages):
        """Return a new list: the cached system message, then ``messages``.

        A list is spliced in element by element; any other input is appended
        as a single item.
        """
        sysmsg = build_cached_system_message(
            self._cached_text, self._ttl, self._is_anthropic,
        )
        if isinstance(messages, list):
            return [sysmsg, *messages]
        return [sysmsg, messages]

    def invoke(self, messages, *args, **kwargs):
        """Call the wrapped ``invoke`` with the cached prefix prepended."""
        return self._inner.invoke(self._prepend(messages), *args, **kwargs)

    async def ainvoke(self, messages, *args, **kwargs):
        """Await the wrapped ``ainvoke`` with the cached prefix prepended."""
        return await self._inner.ainvoke(self._prepend(messages), *args, **kwargs)

    def stream(self, messages, *args, **kwargs):
        """Call the wrapped ``stream`` with the cached prefix prepended."""
        return self._inner.stream(self._prepend(messages), *args, **kwargs)

    def astream(self, messages, *args, **kwargs):
        """Call the wrapped ``astream`` with the cached prefix prepended.

        Without this method ``astream`` would be forwarded by ``__getattr__``
        and silently skip the prefix. The wrapped object's return value is
        passed through unchanged (presumably an async iterator for LangChain
        runnables — confirm against the wrapped type).
        """
        return self._inner.astream(self._prepend(messages), *args, **kwargs)

    def bind_tools(self, *args, **kwargs):
        """Bind tools on the wrapped object and re-wrap the result."""
        bound = self._inner.bind_tools(*args, **kwargs)
        return _CacheableLLM(bound, self._cached_text, self._ttl,
                             is_anthropic=self._is_anthropic)

    def with_structured_output(self, *args, **kwargs):
        """Request structured output on the wrapped object and re-wrap it."""
        so = self._inner.with_structured_output(*args, **kwargs)
        return _CacheableLLM(so, self._cached_text, self._ttl,
                             is_anthropic=self._is_anthropic)

    def __getattr__(self, name):
        # Only reached when normal lookup fails. If ``_inner`` itself is the
        # missing attribute (instance made via ``__new__``, e.g. during
        # ``copy.copy`` or unpickling), reading ``self._inner`` below would
        # re-enter this method forever, so fail with a plain AttributeError.
        if name == "_inner":
            raise AttributeError(name)
        return getattr(self._inner, name)
|
|
114
|
+
|
|
115
|
+
|
|
116
|
+
def with_cache(llm: Any, cached_text: str, ttl: str = "1h") -> _CacheableLLM:
    """Return ``llm`` wrapped in a :class:`_CacheableLLM` proxy.

    The proxy prepends a system message containing ``cached_text`` on its
    ``invoke`` / ``ainvoke`` / ``stream`` calls. The provider is detected
    from ``llm`` when the proxy is built.

    Args:
        llm: the LLM object to wrap.
        cached_text: the long prefix to cache (e.g. a playbook or master
            prompt).
        ttl: cache TTL forwarded to the proxy; it only reaches the wire in
            the Anthropic ``cache_control`` marker. For other providers the
            system message is still prepended, just without the marker.

    Returns:
        The proxy, which can be chained via ``.bind_tools()`` and
        ``.with_structured_output()``.
    """
    proxy = _CacheableLLM(llm, cached_text, ttl=ttl)
    return proxy
|
|
131
|
+
|
|
132
|
+
|
|
133
|
+
# Fallback TTL for the ``.with_cache`` method attached to ``_BoundLLM`` when
# the caller passes no explicit ``ttl=``. It lives at module level because
# that attached method has no reference back to the Relay that produced the
# LLM. Being module-level, the value is shared by everything that imports
# this module.
DEFAULT_CACHE_TTL = "1h"


def set_default_cache_ttl(ttl: str) -> None:
    """Replace the module-wide default cache TTL.

    Rebinds ``DEFAULT_CACHE_TTL``. The value is stored as given, without
    validation. Per the original note this is called by
    ``Relay.__post_init__`` — the caller is not visible in this module.

    Args:
        ttl: the new default TTL string (e.g. ``"5m"`` or ``"1h"``).
    """
    global DEFAULT_CACHE_TTL
    DEFAULT_CACHE_TTL = ttl
|
|
145
|
+
|
|
146
|
+
|
|
147
|
+
def _attach_with_cache_method():
    """Install ``with_cache`` as a method on ``_BoundLLM`` if it is missing.

    Does nothing when ``dispatch_relay.facade`` cannot be imported, or when
    ``_BoundLLM`` already defines ``with_cache`` directly on the class, so
    calling it repeatedly is harmless.

    The installed method accepts ``(cached_text, ttl=None)`` and forwards to
    :func:`with_cache`. A missing or falsy ``ttl`` is replaced by the value
    of ``DEFAULT_CACHE_TTL`` at call time.
    """
    try:
        from dispatch_relay.facade import _BoundLLM
    except ImportError:
        # NOTE(review): an ImportError caused by a circular import with the
        # facade module would also land here and skip the attach — confirm
        # the import order in facade.py.
        return

    if "with_cache" not in vars(_BoundLLM):
        def _method(self, cached_text: str, ttl: Optional[str] = None):
            return with_cache(self, cached_text, ttl=ttl or DEFAULT_CACHE_TTL)

        _BoundLLM.with_cache = _method  # type: ignore[attr-defined]


# Run once at import time.
_attach_with_cache_method()
|
dispatch_relay/core.py
ADDED
|
@@ -0,0 +1,138 @@
|
|
|
1
|
+
"""Core-owned provider facts — the DEFAULTS table + usage extraction.
|
|
2
|
+
|
|
3
|
+
These are provider-facts that must live in exactly ONE place (never duplicated
|
|
4
|
+
per backend or per config source):
|
|
5
|
+
|
|
6
|
+
- :data:`DEFAULTS` — the 7-key abstract-key → model-id table. The core passes
|
|
7
|
+
``default=DEFAULTS[key]`` into :meth:`ConfigSource.resolve`.
|
|
8
|
+
- :func:`extract_usage` — the single place that knows each provider's
|
|
9
|
+
usage-from-raw shape, including the Anthropic dual-path (the Session-19 surface).
|
|
10
|
+
- :func:`resolve_usage` — the locked reconciliation rule between a backend's
|
|
11
|
+
optional pre-populated ``LLMResponse.usage`` and core extraction.
|
|
12
|
+
"""
|
|
13
|
+
from __future__ import annotations
|
|
14
|
+
|
|
15
|
+
import dataclasses
|
|
16
|
+
from typing import Any
|
|
17
|
+
|
|
18
|
+
from .interfaces import LLMResponse, UsageRecord
|
|
19
|
+
|
|
20
|
+
# ---------------------------------------------------------------------
# DEFAULTS: abstract model key -> concrete model id (seven entries).
#
# Per the original note, this table was moved verbatim from the old
# DefaultConfigSource.DEFAULTS; the core owns it and passes
# default=DEFAULTS[key] into resolve().
# ---------------------------------------------------------------------
DEFAULTS: dict[str, str] = {
    # Gemini family
    "gemini_flash": "gemini-2.5-flash",
    "gemini_flash_lite": "gemini-2.5-flash-lite",
    "gemini_pro": "gemini-2.5-pro",
    "gemini_deep_research": "deep-research-pro-preview-12-2025",
    # Claude family
    "claude_sonnet": "claude-sonnet-4-6",
    "claude_opus": "claude-opus-4-6",
    "claude_haiku": "claude-haiku-4-5-20251001",
}
|
|
34
|
+
|
|
35
|
+
|
|
36
|
+
def _get(obj: Any, key: str, default: Any = None) -> Any:
    """Look ``key`` up on ``obj`` as a dict entry or as an attribute.

    Lets callers treat attribute-style objects and plain dicts the same way.

    Args:
        obj: a dict, an arbitrary object, or ``None``.
        key: dict key or attribute name to read.
        default: value returned when the lookup finds nothing.

    Returns:
        ``obj.get(key, default)`` for dicts; ``default`` when ``obj`` is
        ``None``; ``getattr(obj, key, default)`` for anything else.
    """
    if isinstance(obj, dict):
        return obj.get(key, default)
    if obj is None:
        # Never resolve attributes that exist on None itself.
        return default
    return getattr(obj, key, default)
|
|
47
|
+
|
|
48
|
+
|
|
49
|
+
def extract_usage(provider: str, raw: Any) -> UsageRecord | None:
    """Build a :class:`UsageRecord` from a provider's raw response object.

    ``raw`` may expose its fields as attributes or as dict keys; both are
    read through :func:`_get`.

    Source selection:

    * ``provider == "anthropic"``: if ``raw.response_metadata["usage"]`` is
      present (not ``None``), it is used, reading ``input_tokens``,
      ``output_tokens``, ``cache_read_input_tokens`` and
      ``cache_creation_input_tokens``. Only when it is absent does the
      function fall back to ``raw.usage_metadata``.
    * Every other provider: ``raw.usage_metadata`` is read directly.

    From ``usage_metadata`` the fields are ``input_tokens`` /
    ``output_tokens``, plus ``cache_read`` / ``cache_creation`` taken from
    its ``input_token_details`` (treated as empty when missing).

    Any count that is missing, ``None`` or otherwise falsy becomes ``0``. The
    record's ``model`` is ``raw.response_metadata["model_name"]``, or ``""``
    when that is unavailable.

    Returns:
        The populated record, or ``None`` when no usable usage source exists
        on ``raw`` (for example when ``raw`` is a bare string).
    """
    def _count(source: Any, field: str) -> int:
        # Missing / None / zero values all collapse to 0.
        return int(_get(source, field, 0) or 0)

    # _get tolerates a None container, so no separate None checks are needed.
    metadata = _get(raw, "response_metadata")
    model_name = _get(metadata, "model_name", "") or ""

    if provider == "anthropic":
        native_usage = _get(metadata, "usage")
        if native_usage is not None:
            return UsageRecord(
                input_tokens=_count(native_usage, "input_tokens"),
                output_tokens=_count(native_usage, "output_tokens"),
                cache_read=_count(native_usage, "cache_read_input_tokens"),
                cache_creation=_count(
                    native_usage, "cache_creation_input_tokens"
                ),
                model=model_name,
            )
        # No native usage block: continue with the usage_metadata shape.

    # Shared path: the Anthropic fallback and all non-Anthropic providers.
    lc_usage = _get(raw, "usage_metadata")
    if lc_usage is None:
        return None

    token_details = _get(lc_usage, "input_token_details") or {}
    return UsageRecord(
        input_tokens=_count(lc_usage, "input_tokens"),
        output_tokens=_count(lc_usage, "output_tokens"),
        cache_read=_count(token_details, "cache_read"),
        cache_creation=_count(token_details, "cache_creation"),
        model=model_name,
    )
|
|
115
|
+
|
|
116
|
+
|
|
117
|
+
def resolve_usage(
    response: LLMResponse, provider: str, model: str
) -> UsageRecord | None:
    """Pick the usage record for a response and stamp the configured model.

    Resolution order:

    1. ``response.usage`` when the backend pre-populated it (not ``None``).
    2. Otherwise whatever :func:`extract_usage` derives from
       ``response.raw`` for ``provider``.

    Args:
        response: the backend's response; ``usage`` and ``raw`` are read.
        provider: provider label forwarded to :func:`extract_usage`.
        model: the model the dispatch call was configured with.

    Returns:
        A copy of the resolved record (made with ``dataclasses.replace``)
        whose ``model`` field is overwritten with ``model``, so the
        configured model wins over anything the raw response reported. The
        original record is not modified. ``None`` when neither source yields
        usage.
    """
    record = response.usage
    if record is None:
        record = extract_usage(provider, response.raw)
    if record is None:
        return None
    return dataclasses.replace(record, model=model)
|