dispatch-relay 0.0.1__tar.gz
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- dispatch_relay-0.0.1/LICENSE +21 -0
- dispatch_relay-0.0.1/PKG-INFO +81 -0
- dispatch_relay-0.0.1/README.md +60 -0
- dispatch_relay-0.0.1/pyproject.toml +35 -0
- dispatch_relay-0.0.1/setup.cfg +4 -0
- dispatch_relay-0.0.1/src/dispatch_relay/__init__.py +47 -0
- dispatch_relay-0.0.1/src/dispatch_relay/analytics.py +191 -0
- dispatch_relay-0.0.1/src/dispatch_relay/caching.py +168 -0
- dispatch_relay-0.0.1/src/dispatch_relay/core.py +138 -0
- dispatch_relay-0.0.1/src/dispatch_relay/cost.py +239 -0
- dispatch_relay-0.0.1/src/dispatch_relay/dspy_adapter.py +176 -0
- dispatch_relay-0.0.1/src/dispatch_relay/facade.py +352 -0
- dispatch_relay-0.0.1/src/dispatch_relay/interfaces.py +211 -0
- dispatch_relay-0.0.1/src/dispatch_relay/prompt_eval.py +209 -0
- dispatch_relay-0.0.1/src/dispatch_relay.egg-info/PKG-INFO +81 -0
- dispatch_relay-0.0.1/src/dispatch_relay.egg-info/SOURCES.txt +27 -0
- dispatch_relay-0.0.1/src/dispatch_relay.egg-info/dependency_links.txt +1 -0
- dispatch_relay-0.0.1/src/dispatch_relay.egg-info/requires.txt +13 -0
- dispatch_relay-0.0.1/src/dispatch_relay.egg-info/top_level.txt +2 -0
- dispatch_relay-0.0.1/src/omega_llm/__init__.py +23 -0
- dispatch_relay-0.0.1/tests/test_analytics.py +124 -0
- dispatch_relay-0.0.1/tests/test_caching.py +240 -0
- dispatch_relay-0.0.1/tests/test_caller.py +81 -0
- dispatch_relay-0.0.1/tests/test_core.py +202 -0
- dispatch_relay-0.0.1/tests/test_cost.py +210 -0
- dispatch_relay-0.0.1/tests/test_dspy_adapter.py +165 -0
- dispatch_relay-0.0.1/tests/test_facade.py +180 -0
- dispatch_relay-0.0.1/tests/test_interfaces.py +213 -0
- dispatch_relay-0.0.1/tests/test_prompt_eval.py +175 -0
|
@@ -0,0 +1,21 @@
|
|
|
1
|
+
MIT License
|
|
2
|
+
|
|
3
|
+
Copyright (c) 2026 Pierre Samson and Claude
|
|
4
|
+
|
|
5
|
+
Permission is hereby granted, free of charge, to any person obtaining a copy
|
|
6
|
+
of this software and associated documentation files (the "Software"), to deal
|
|
7
|
+
in the Software without restriction, including without limitation the rights
|
|
8
|
+
to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
|
|
9
|
+
copies of the Software, and to permit persons to whom the Software is
|
|
10
|
+
furnished to do so, subject to the following conditions:
|
|
11
|
+
|
|
12
|
+
The above copyright notice and this permission notice shall be included in all
|
|
13
|
+
copies or substantial portions of the Software.
|
|
14
|
+
|
|
15
|
+
THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
|
|
16
|
+
IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
|
|
17
|
+
FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
|
|
18
|
+
AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
|
|
19
|
+
LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
|
|
20
|
+
OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
|
|
21
|
+
SOFTWARE.
|
|
@@ -0,0 +1,81 @@
|
|
|
1
|
+
Metadata-Version: 2.4
|
|
2
|
+
Name: dispatch-relay
|
|
3
|
+
Version: 0.0.1
|
|
4
|
+
Summary: Provider-agnostic LLM dispatch layer: 3 injected seams (config / usage / dispatch) + a pure cost model. Relays usage to a sink rather than tracking it.
|
|
5
|
+
Author: Pierre Samson, Claude
|
|
6
|
+
License: MIT
|
|
7
|
+
Requires-Python: >=3.10
|
|
8
|
+
Description-Content-Type: text/markdown
|
|
9
|
+
License-File: LICENSE
|
|
10
|
+
Provides-Extra: facade
|
|
11
|
+
Requires-Dist: langchain-core>=0.3; extra == "facade"
|
|
12
|
+
Provides-Extra: dspy
|
|
13
|
+
Requires-Dist: langchain-core>=0.3; extra == "dspy"
|
|
14
|
+
Requires-Dist: dspy>=2.0; extra == "dspy"
|
|
15
|
+
Requires-Dist: litellm>=1.0; extra == "dspy"
|
|
16
|
+
Provides-Extra: all
|
|
17
|
+
Requires-Dist: langchain-core>=0.3; extra == "all"
|
|
18
|
+
Requires-Dist: dspy>=2.0; extra == "all"
|
|
19
|
+
Requires-Dist: litellm>=1.0; extra == "all"
|
|
20
|
+
Dynamic: license-file
|
|
21
|
+
|
|
22
|
+
# dispatch-relay
|
|
23
|
+
|
|
24
|
+
**A provider-agnostic LLM layer with three injected seams.** Resolve a model, dispatch a call across any provider, and *relay* usage to a sink your application owns — instead of the library tracking it for you. Pure-stdlib core, zero runtime dependencies.
|
|
25
|
+
|
|
26
|
+
```bash
|
|
27
|
+
pip install dispatch-relay
|
|
28
|
+
```
|
|
29
|
+
|
|
30
|
+
**Who it's for:** anyone running more than one LLM provider who wants one consistent dispatch + usage-attribution surface, with the host application in control of config resolution, usage recording, and the actual transport. The "relay, not track" name is the contract: usage is relayed to *your* sink (a database, a log, nothing) — the library never decides where it lands.
|
|
31
|
+
|
|
32
|
+
This is the **dependency-light foundation**: the three injected-interface seams + the pure cost model are pure-stdlib. (Caching, the higher-level façade, and the DSPy adapter need `langchain-core` etc.; those dependencies are pulled in only through the optional `facade` / `dspy` / `all` extras.)
|
|
33
|
+
|
|
34
|
+
> **Renamed from `omega-llm`.** `import omega_llm` still works as a deprecated alias that re-exports `dispatch_relay` (with a `DeprecationWarning`) — migrate to `import dispatch_relay`.
|
|
35
|
+
|
|
36
|
+
## The 3 injected seams (`dispatch_relay.interfaces`)
|
|
37
|
+
|
|
38
|
+
Each is a `@runtime_checkable typing.Protocol` (structural typing — a host satisfies the contract WITHOUT importing this library) + a dependency-light default impl.
|
|
39
|
+
|
|
40
|
+
| Seam | Method(s) | Default impl | A host can back it with |
|
|
41
|
+
|------|-----------|--------------|-------------------------|
|
|
42
|
+
| `ConfigSource` | `resolve(key, role, default) → model_id` | `DefaultConfigSource` (`os.getenv(f"{KEY}_MODEL") or default`) | a config store (role → global → env → default) |
|
|
43
|
+
| `UsageSink` | `record(*, provider, role, caller, model, tier, input_tokens, output_tokens, cache_read=0, cache_creation=0, cost_usd=0.0, cost_usd_raw=0.0, billing="metered", **extra) → None` | `NoOpUsageSink` (no-op) | a usage store / time-series table |
|
|
44
|
+
| `DispatchBackend` | `supports(*, provider, role, tier) → bool` + `dispatch(*, provider, model, messages, tier, role, caller, **kwargs) → LLMResponse` | `DefaultDispatchBackend` (direct SDK via injected `llm_factory`; `supports`→True) | subscription lanes / custom transports |
|
|
45
|
+
|
|
46
|
+
`cache_read` and `cache_creation` are **separate** fields on `UsageSink.record` and on `UsageRecord` — summing them undercounts Anthropic. `billing` marks the lane: `"metered"` ($-tracked SDK) vs `"subscription"` ($0).
|
|
47
|
+
|
|
48
|
+
## Value types & core-owned facts (`dispatch_relay.core`)
|
|
49
|
+
|
|
50
|
+
```python
|
|
51
|
+
@dataclass(frozen=True)
|
|
52
|
+
class UsageRecord: # input_tokens, output_tokens, cache_read=0, cache_creation=0, model=""
|
|
53
|
+
@dataclass(frozen=True)
|
|
54
|
+
class LLMResponse: # text, usage: UsageRecord | None, raw: Any
|
|
55
|
+
```
|
|
56
|
+
|
|
57
|
+
The provider-facts live in `dispatch_relay.core` (one place, never duplicated per backend):
|
|
58
|
+
|
|
59
|
+
- `DEFAULTS: dict[str, str]` — the abstract-key → model-id table. The core passes `default=DEFAULTS[key]` into `ConfigSource.resolve`.
|
|
60
|
+
- `extract_usage(provider, raw) → UsageRecord | None` — the single place that knows each provider's usage-from-raw shape. **Anthropic dual-path**: prefer `raw.response_metadata["usage"]` (the uncached remainder), fall back to `raw.usage_metadata` only if absent (using the wrong one double-counts). The **model name** comes from `raw.response_metadata["model_name"]` (both Anthropic and Gemini surface it there — a real LangChain `AIMessage` has no top-level `.model` attribute), falling back to `""`. Returns `None` when no usage metadata is present.
|
|
61
|
+
- `resolve_usage(response, provider, model) → UsageRecord | None` — the **locked reconciliation rule**: resolve `response.usage if response.usage is not None else extract_usage(provider, response.raw)`, then **stamp the authoritative `model`** — the dispatch call knows the configured `model`, so the dispatch-arg model always wins over whatever the raw echoed (via `dataclasses.replace`). Returns `None` unchanged when there's no usage (the subscription lane). `LLMResponse.usage` is a real escape hatch — a backend MAY pre-populate it; else the core extracts.
|
|
62
|
+
|
|
63
|
+
Both shipped backends return `LLMResponse(usage=None)`; the core extracts usage. The `DefaultDispatchBackend` derives `text` from `raw.content`: a `str` passes through; an Anthropic content **list** has its `type=="text"` blocks joined (non-text blocks skipped); anything else falls back to `str(raw)`. That fallback is only the default backend's degenerate case — real subscription backends (raws are **dicts**, not strings) construct `text` explicitly and pass `usage=None` with `billing="subscription"`.
|
|
64
|
+
|
|
65
|
+
## The pure cost model (`dispatch_relay.cost`)
|
|
66
|
+
|
|
67
|
+
`estimate_cost(*, prompt, tier="flash", provider="gemini", output_tokens_max=1024, cache_hit_ratio=0.0, role="agents") -> dict` — a single source of cost truth. Pricing tables for Gemini / Anthropic / OpenAI, the Gemini Flex 50% rebate gate, Anthropic + OpenAI cache-ratio math. Zero deps.
|
|
68
|
+
|
|
69
|
+
## Usage
|
|
70
|
+
|
|
71
|
+
```python
|
|
72
|
+
from dispatch_relay import estimate_cost, DefaultConfigSource, DEFAULTS
|
|
73
|
+
|
|
74
|
+
DefaultConfigSource().resolve("gemini_flash", "council", DEFAULTS["gemini_flash"])
|
|
75
|
+
# -> "gemini-2.5-flash" (env GEMINI_FLASH_MODEL wins if set)
|
|
76
|
+
estimate_cost(prompt=10_000, tier="sonnet", provider="anthropic", output_tokens_max=512)
|
|
77
|
+
```
|
|
78
|
+
|
|
79
|
+
## Authors
|
|
80
|
+
|
|
81
|
+
Pierre Samson and Claude. MIT licensed.
|
|
@@ -0,0 +1,60 @@
|
|
|
1
|
+
# dispatch-relay
|
|
2
|
+
|
|
3
|
+
**A provider-agnostic LLM layer with three injected seams.** Resolve a model, dispatch a call across any provider, and *relay* usage to a sink your application owns — instead of the library tracking it for you. Pure-stdlib core, zero runtime dependencies.
|
|
4
|
+
|
|
5
|
+
```bash
|
|
6
|
+
pip install dispatch-relay
|
|
7
|
+
```
|
|
8
|
+
|
|
9
|
+
**Who it's for:** anyone running more than one LLM provider who wants one consistent dispatch + usage-attribution surface, with the host application in control of config resolution, usage recording, and the actual transport. The "relay, not track" name is the contract: usage is relayed to *your* sink (a database, a log, nothing) — the library never decides where it lands.
|
|
10
|
+
|
|
11
|
+
This is the **dependency-light foundation**: the three injected-interface seams + the pure cost model are pure-stdlib. (Caching, the higher-level façade, and the DSPy adapter need `langchain-core` etc.; those dependencies are pulled in only through the optional `facade` / `dspy` / `all` extras.)
|
|
12
|
+
|
|
13
|
+
> **Renamed from `omega-llm`.** `import omega_llm` still works as a deprecated alias that re-exports `dispatch_relay` (with a `DeprecationWarning`) — migrate to `import dispatch_relay`.
|
|
14
|
+
|
|
15
|
+
## The 3 injected seams (`dispatch_relay.interfaces`)
|
|
16
|
+
|
|
17
|
+
Each is a `@runtime_checkable typing.Protocol` (structural typing — a host satisfies the contract WITHOUT importing this library) + a dependency-light default impl.
|
|
18
|
+
|
|
19
|
+
| Seam | Method(s) | Default impl | A host can back it with |
|
|
20
|
+
|------|-----------|--------------|-------------------------|
|
|
21
|
+
| `ConfigSource` | `resolve(key, role, default) → model_id` | `DefaultConfigSource` (`os.getenv(f"{KEY}_MODEL") or default`) | a config store (role → global → env → default) |
|
|
22
|
+
| `UsageSink` | `record(*, provider, role, caller, model, tier, input_tokens, output_tokens, cache_read=0, cache_creation=0, cost_usd=0.0, cost_usd_raw=0.0, billing="metered", **extra) → None` | `NoOpUsageSink` (no-op) | a usage store / time-series table |
|
|
23
|
+
| `DispatchBackend` | `supports(*, provider, role, tier) → bool` + `dispatch(*, provider, model, messages, tier, role, caller, **kwargs) → LLMResponse` | `DefaultDispatchBackend` (direct SDK via injected `llm_factory`; `supports`→True) | subscription lanes / custom transports |
|
|
24
|
+
|
|
25
|
+
`cache_read` and `cache_creation` are **separate** fields on `UsageSink.record` and on `UsageRecord` — summing them undercounts Anthropic. `billing` marks the lane: `"metered"` ($-tracked SDK) vs `"subscription"` ($0).
|
|
26
|
+
|
|
27
|
+
## Value types & core-owned facts (`dispatch_relay.core`)
|
|
28
|
+
|
|
29
|
+
```python
|
|
30
|
+
@dataclass(frozen=True)
|
|
31
|
+
class UsageRecord: # input_tokens, output_tokens, cache_read=0, cache_creation=0, model=""
|
|
32
|
+
@dataclass(frozen=True)
|
|
33
|
+
class LLMResponse: # text, usage: UsageRecord | None, raw: Any
|
|
34
|
+
```
|
|
35
|
+
|
|
36
|
+
The provider-facts live in `dispatch_relay.core` (one place, never duplicated per backend):
|
|
37
|
+
|
|
38
|
+
- `DEFAULTS: dict[str, str]` — the abstract-key → model-id table. The core passes `default=DEFAULTS[key]` into `ConfigSource.resolve`.
|
|
39
|
+
- `extract_usage(provider, raw) → UsageRecord | None` — the single place that knows each provider's usage-from-raw shape. **Anthropic dual-path**: prefer `raw.response_metadata["usage"]` (the uncached remainder), fall back to `raw.usage_metadata` only if absent (using the wrong one double-counts). The **model name** comes from `raw.response_metadata["model_name"]` (both Anthropic and Gemini surface it there — a real LangChain `AIMessage` has no top-level `.model` attribute), falling back to `""`. Returns `None` when no usage metadata is present.
|
|
40
|
+
- `resolve_usage(response, provider, model) → UsageRecord | None` — the **locked reconciliation rule**: resolve `response.usage if response.usage is not None else extract_usage(provider, response.raw)`, then **stamp the authoritative `model`** — the dispatch call knows the configured `model`, so the dispatch-arg model always wins over whatever the raw echoed (via `dataclasses.replace`). Returns `None` unchanged when there's no usage (the subscription lane). `LLMResponse.usage` is a real escape hatch — a backend MAY pre-populate it; else the core extracts.
|
|
41
|
+
|
|
42
|
+
Both shipped backends return `LLMResponse(usage=None)`; the core extracts usage. The `DefaultDispatchBackend` derives `text` from `raw.content`: a `str` passes through; an Anthropic content **list** has its `type=="text"` blocks joined (non-text blocks skipped); anything else falls back to `str(raw)`. That fallback is only the default backend's degenerate case — real subscription backends (raws are **dicts**, not strings) construct `text` explicitly and pass `usage=None` with `billing="subscription"`.
|
|
43
|
+
|
|
44
|
+
## The pure cost model (`dispatch_relay.cost`)
|
|
45
|
+
|
|
46
|
+
`estimate_cost(*, prompt, tier="flash", provider="gemini", output_tokens_max=1024, cache_hit_ratio=0.0, role="agents") -> dict` — a single source of cost truth. Pricing tables for Gemini / Anthropic / OpenAI, the Gemini Flex 50% rebate gate, Anthropic + OpenAI cache-ratio math. Zero deps.
|
|
47
|
+
|
|
48
|
+
## Usage
|
|
49
|
+
|
|
50
|
+
```python
|
|
51
|
+
from dispatch_relay import estimate_cost, DefaultConfigSource, DEFAULTS
|
|
52
|
+
|
|
53
|
+
DefaultConfigSource().resolve("gemini_flash", "council", DEFAULTS["gemini_flash"])
|
|
54
|
+
# -> "gemini-2.5-flash" (env GEMINI_FLASH_MODEL wins if set)
|
|
55
|
+
estimate_cost(prompt=10_000, tier="sonnet", provider="anthropic", output_tokens_max=512)
|
|
56
|
+
```
|
|
57
|
+
|
|
58
|
+
## Authors
|
|
59
|
+
|
|
60
|
+
Pierre Samson and Claude. MIT licensed.
|
|
@@ -0,0 +1,35 @@
|
|
|
1
|
+
[build-system]
|
|
2
|
+
requires = ["setuptools>=61.0"]
|
|
3
|
+
build-backend = "setuptools.build_meta"
|
|
4
|
+
|
|
5
|
+
[project]
|
|
6
|
+
name = "dispatch-relay"
|
|
7
|
+
version = "0.0.1"
|
|
8
|
+
description = "Provider-agnostic LLM dispatch layer: 3 injected seams (config / usage / dispatch) + a pure cost model. Relays usage to a sink rather than tracking it."
|
|
9
|
+
readme = "README.md"
|
|
10
|
+
requires-python = ">=3.10"
|
|
11
|
+
license = { text = "MIT" }
|
|
12
|
+
authors = [
|
|
13
|
+
{ name = "Pierre Samson" },
|
|
14
|
+
{ name = "Claude" },
|
|
15
|
+
]
|
|
16
|
+
|
|
17
|
+
# No runtime dependencies for THIS increment — the interfaces + cost model are
|
|
18
|
+
# pure-stdlib. langchain-core (and friends) are optional: they are pulled in
|
|
19
|
+
# only by the extras under [project.optional-dependencies] below.
|
|
20
|
+
dependencies = []
|
|
21
|
+
|
|
22
|
+
[project.optional-dependencies]
|
|
23
|
+
# The façade/caching/prompt_eval surface — needs langchain message types.
|
|
24
|
+
# The core (interfaces + cost + analytics) stays pure-stdlib without this.
|
|
25
|
+
facade = ["langchain-core>=0.3"]
|
|
26
|
+
# The DSPy adapter (TrackedLM) path — sits on top of [facade].
|
|
27
|
+
dspy = ["langchain-core>=0.3", "dspy>=2.0", "litellm>=1.0"]
|
|
28
|
+
# Everything.
|
|
29
|
+
all = ["langchain-core>=0.3", "dspy>=2.0", "litellm>=1.0"]
|
|
30
|
+
|
|
31
|
+
[tool.setuptools.packages.find]
|
|
32
|
+
where = ["src"]
|
|
33
|
+
|
|
34
|
+
[tool.pytest.ini_options]
|
|
35
|
+
testpaths = ["tests"]
|
|
@@ -0,0 +1,47 @@
|
|
|
1
|
+
"""dispatch-relay — the swarph's canonical provider-agnostic LLM layer.
|
|
2
|
+
|
|
3
|
+
Pure core + 3 injected seams. The T1 contract, AI²-converged
|
|
4
|
+
with the peer 2026-06-08 (pending peer co-review).
|
|
5
|
+
|
|
6
|
+
Exports the three injected-interface seams (each a ``runtime_checkable`` Protocol +
|
|
7
|
+
a dependency-light default impl), the shared value types, the core-owned provider
|
|
8
|
+
facts, and the pure cost model:
|
|
9
|
+
|
|
10
|
+
- ConfigSource / DefaultConfigSource — resolve(key, role, default) → model_id
|
|
11
|
+
- UsageSink / NoOpUsageSink — record(...) usage (separate cache fields)
|
|
12
|
+
- DispatchBackend / DefaultDispatchBackend — supports(...) + dispatch(...) → LLMResponse
|
|
13
|
+
- LLMResponse / UsageRecord — shared value types
|
|
14
|
+
- DEFAULTS / extract_usage / resolve_usage — core-owned provider facts
|
|
15
|
+
- estimate_cost — pure pre-call cost estimator
|
|
16
|
+
"""
|
|
17
|
+
from __future__ import annotations
|
|
18
|
+
|
|
19
|
+
from .cost import estimate_cost
|
|
20
|
+
from .core import DEFAULTS, extract_usage, resolve_usage
|
|
21
|
+
from .interfaces import (
|
|
22
|
+
ConfigSource,
|
|
23
|
+
DefaultConfigSource,
|
|
24
|
+
UsageSink,
|
|
25
|
+
NoOpUsageSink,
|
|
26
|
+
DispatchBackend,
|
|
27
|
+
DefaultDispatchBackend,
|
|
28
|
+
LLMResponse,
|
|
29
|
+
UsageRecord,
|
|
30
|
+
)
|
|
31
|
+
|
|
32
|
+
__all__ = [
|
|
33
|
+
"ConfigSource",
|
|
34
|
+
"DefaultConfigSource",
|
|
35
|
+
"UsageSink",
|
|
36
|
+
"NoOpUsageSink",
|
|
37
|
+
"DispatchBackend",
|
|
38
|
+
"DefaultDispatchBackend",
|
|
39
|
+
"LLMResponse",
|
|
40
|
+
"UsageRecord",
|
|
41
|
+
"DEFAULTS",
|
|
42
|
+
"extract_usage",
|
|
43
|
+
"resolve_usage",
|
|
44
|
+
"estimate_cost",
|
|
45
|
+
]
|
|
46
|
+
|
|
47
|
+
__version__ = "0.0.1"
|
|
@@ -0,0 +1,191 @@
|
|
|
1
|
+
"""Token-usage analytics — pure aggregation over in-memory usage records.
|
|
2
|
+
|
|
3
|
+
This is the INVERTED form of the host's old fetch-then-aggregate helper: the
|
|
4
|
+
library owns the AGGREGATION LOGIC and takes the usage records AS INPUT; it does
|
|
5
|
+
NOT fetch. The host keeps a thin wrapper that reads its store (a usage-table
|
|
6
|
+
hypertable, etc.) and delegates the rows here. Inverting the dependency keeps
|
|
7
|
+
this module pure-stdlib + zero-dep so it lives in the dependency-light core
|
|
8
|
+
without re-opening the locked T1 seam contract (no 4th read-seam).
|
|
9
|
+
|
|
10
|
+
Each record is a mapping with the keys this module reads:
|
|
11
|
+
``provider`` / ``role`` / ``caller`` / ``model`` / ``day`` and the integer token
|
|
12
|
+
columns ``input`` / ``output`` / ``cached`` / ``thought`` plus a float ``cost``.
|
|
13
|
+
|
|
14
|
+
Schema-asymmetry note (load-bearing for the per-provider arithmetic in
|
|
15
|
+
``_row_total_tokens`` below):
|
|
16
|
+
|
|
17
|
+
- Anthropic: ``input`` is the FRESH remainder (``lc_input - cache_read -
|
|
18
|
+
cache_create``) and ``cached`` is SEPARATE. The two columns are disjoint.
|
|
19
|
+
True prompt size = ``input + cached``.
|
|
20
|
+
- Gemini: ``input`` = ``prompt_token_count`` (the FULL prompt) with
|
|
21
|
+
``cached`` as a SUBSET. True prompt size = ``input`` (``cached`` is
|
|
22
|
+
informational, already counted).
|
|
23
|
+
- OpenAI: ``input`` = ``prompt_tokens`` (FULL) with ``cached`` a subset.
|
|
24
|
+
Same convention as Gemini.
|
|
25
|
+
- ``thought`` carries reasoning tokens (Gemini Pro thinking mode, o1-style
|
|
26
|
+
models) — additive for total prompt cost regardless of provider, currently
|
|
27
|
+
0 for non-reasoning calls.
|
|
28
|
+
|
|
29
|
+
Summing ``input + output`` everywhere undercounts Anthropic by the cache_read
|
|
30
|
+
amount; summing ``input + output + cached`` everywhere double-counts
|
|
31
|
+
Gemini/OpenAI by the same. The fix is per-provider arithmetic.
|
|
32
|
+
"""
|
|
33
|
+
from __future__ import annotations
|
|
34
|
+
|
|
35
|
+
from collections import defaultdict
|
|
36
|
+
from typing import Optional
|
|
37
|
+
|
|
38
|
+
|
|
39
|
+
# For these providers the ``input`` column is already the FULL prompt size:
# cached tokens are a subset of it rather than a separate bucket, so adding
# ``cached`` on top of ``input`` would count those tokens twice.
_INPUT_INCLUDES_CACHED = {"gemini", "openai"}


def _row_total_tokens(provider: str, in_tok: int, out_tok: int,
                      cached: int, thought: int) -> int:
    """Return one row's total token count (prompt + output + thinking).

    ``cached`` is added only for providers that are NOT listed in
    ``_INPUT_INCLUDES_CACHED``; for the listed ones it is already part of
    ``in_tok``. ``out_tok`` and ``thought`` are always added.
    """
    total = in_tok + out_tok + thought
    if provider not in _INPUT_INCLUDES_CACHED:
        # Cached tokens are stored disjoint from ``input`` (e.g. Anthropic).
        total += cached
    return total
|
|
53
|
+
|
|
54
|
+
|
|
55
|
+
def summarize_usage(records, *, days: int = 7, role: Optional[str] = None,
                    provider: Optional[str] = None,
                    caller: Optional[str] = None) -> dict:
    """Roll already-fetched usage ``records`` up into per-dimension buckets.

    No filtering happens here: ``days`` / ``role`` / ``provider`` / ``caller``
    are copied verbatim into the result as metadata (the host is expected to
    have applied them when fetching), and every supplied row is aggregated.

    Missing or falsy row fields fall back to: provider ``"unknown"``, role
    ``"agents"``, caller ``"<unattributed>"`` (bucket key only — the per-row
    ``by_model`` entry keeps the raw caller value), and ``0`` for ``cost`` and
    for each token column.

    Returns:
        {
          "days":           int,
          "role":           str | None,
          "provider":       str | None,
          "caller":         str | None,
          "total_cost_usd": float,   # rounded to 6 decimals
          "total_tokens":   int,     # provider-aware sum via _row_total_tokens
          "by_provider":    {provider: {input, output, cached, thought, cost, n_rows}},
          "by_model":       [{model, provider, role, caller, day, input,
                              output, cached, thought, cost}, …],  # cost desc
          "by_role":        {role: {cost, tokens}},
          "by_caller":      {caller_or_"<unattributed>": {cost, tokens, n_rows}},
        }

    With no rows the totals are 0 and the bucket containers are empty.
    """
    token_columns = ("input", "output", "cached", "thought")
    provider_rollup: dict[str, dict] = {}
    role_rollup: dict[str, dict] = {}
    caller_rollup: dict[str, dict] = {}
    model_rows: list[dict] = []
    cost_sum = 0.0
    token_sum = 0

    for rec in records:
        provider_key = rec.get("provider") or "unknown"
        role_key = rec.get("role") or "agents"
        # Falsy caller gets a stable string key so consumers never see null.
        caller_key = rec.get("caller") or "<unattributed>"
        row_cost = float(rec.get("cost") or 0.0)
        counts = {col: int(rec.get(col) or 0) for col in token_columns}
        row_tokens = _row_total_tokens(
            provider_key, counts["input"], counts["output"],
            counts["cached"], counts["thought"],
        )

        provider_bucket = provider_rollup.setdefault(provider_key, {
            "input": 0, "output": 0, "cached": 0, "thought": 0,
            "cost": 0.0, "n_rows": 0,
        })
        for col in token_columns:
            provider_bucket[col] += counts[col]
        provider_bucket["cost"] += row_cost
        provider_bucket["n_rows"] += 1

        role_bucket = role_rollup.setdefault(
            role_key, {"cost": 0.0, "tokens": 0},
        )
        role_bucket["cost"] += row_cost
        role_bucket["tokens"] += row_tokens

        caller_bucket = caller_rollup.setdefault(
            caller_key, {"cost": 0.0, "tokens": 0, "n_rows": 0},
        )
        caller_bucket["cost"] += row_cost
        caller_bucket["tokens"] += row_tokens
        caller_bucket["n_rows"] += 1

        model_rows.append({
            "model": rec.get("model"),
            "provider": provider_key,
            "role": role_key,
            "caller": rec.get("caller"),  # raw value kept on per-row records
            "day": rec.get("day"),
            **counts,
            "cost": row_cost,
        })

        cost_sum += row_cost
        token_sum += row_tokens

    # Most expensive rows first; ties keep their input order (stable sort).
    model_rows = sorted(model_rows, key=lambda row: -row["cost"])

    summary = {
        "days": days,
        "role": role,
        "provider": provider,
        "caller": caller,
        "total_cost_usd": round(cost_sum, 6),
        "total_tokens": token_sum,
        "by_provider": provider_rollup,
        "by_model": model_rows,
        "by_role": role_rollup,
        "by_caller": caller_rollup,
    }
    return summary
|
|
154
|
+
|
|
155
|
+
|
|
156
|
+
def detect_anomalies(records, *, spike_factor: float = 2.0) -> list[dict]:
    """Flag (model, day) cells whose cost >= spike_factor × baseline avg.

    Rows are first summed per (model, day), so a day whose spend is split
    across several rows (e.g. one row per caller or role) is judged on its
    total rather than row by row. The baseline for a cell is the mean of that
    model's per-day totals over the supplied records EXCLUDING the day being
    checked.

    A model is skipped when it has fewer than 3 distinct days, and a cell is
    never flagged when its baseline is 0 (no meaningful ratio exists).

    Args:
        records: iterable of mappings; reads ``model`` (falsy → ``"unknown"``),
            ``day`` and ``cost`` (falsy → ``0.0``).
        spike_factor: minimum cost / baseline ratio for a cell to be flagged.

    Returns:
        A list of ``{"model", "day", "cost", "baseline_avg", "factor"}`` dicts
        in descending cost order (``cost`` / ``baseline_avg`` rounded to 6
        decimals, ``factor`` to 2). Empty list when ``records`` is empty.
    """
    if not records:
        return []

    # model -> {day: summed cost}. Plain dicts keep first-seen day order, so
    # output order for equal costs stays deterministic.
    daily_cost: dict[str, dict] = defaultdict(dict)
    for r in records:
        model = r.get("model") or "unknown"
        day = r.get("day")
        cost = float(r.get("cost") or 0.0)
        per_day = daily_cost[model]
        per_day[day] = per_day.get(day, 0.0) + cost

    spikes = []
    for model, per_day in daily_cost.items():
        # Need the checked day plus at least two baseline days.
        if len(per_day) < 3:
            continue
        for day, cost in per_day.items():
            others = [c for d, c in per_day.items() if d != day]
            baseline = sum(others) / len(others)
            if baseline > 0 and cost >= spike_factor * baseline:
                spikes.append({
                    "model": model,
                    "day": day,
                    "cost": round(cost, 6),
                    "baseline_avg": round(baseline, 6),
                    "factor": round(cost / baseline, 2),
                })
    spikes.sort(key=lambda s: -s["cost"])
    return spikes
|
|
@@ -0,0 +1,168 @@
|
|
|
1
|
+
"""Anthropic prompt-caching helper — wrap any LangChain-compatible LLM.
|
|
2
|
+
|
|
3
|
+
Wraps any LangChain-compatible LLM (typically a :class:`dispatch_relay.facade._BoundLLM`)
|
|
4
|
+
so every ``.invoke(messages)`` call prepends a SystemMessage carrying
|
|
5
|
+
``cache_control: {"type": "ephemeral", "ttl": ttl}`` in the correct
|
|
6
|
+
list-of-blocks shape.
|
|
7
|
+
|
|
8
|
+
Why list-of-blocks: ``langchain_anthropic`` SILENTLY DROPS the
|
|
9
|
+
``additional_kwargs={"cache_control": ...}`` shape. The only shape that
|
|
10
|
+
propagates to the wire is::
|
|
11
|
+
|
|
12
|
+
SystemMessage(content=[{"type": "text", "text": ...,
|
|
13
|
+
"cache_control": {"type": "ephemeral", "ttl": "1h"}}])
|
|
14
|
+
|
|
15
|
+
For non-Anthropic LLMs the SystemMessage is sent without cache_control (Gemini's
|
|
16
|
+
implicit caching handles long stable prefixes automatically; OpenAI auto-caches
|
|
17
|
+
prompt prefixes ≥1024 tokens at 50% off input price).
|
|
18
|
+
|
|
19
|
+
This module lives in the ``[facade]`` extra (it needs ``langchain_core``), but the
|
|
20
|
+
langchain import is LAZY (inside :func:`build_cached_system_message`) so importing
|
|
21
|
+
the module is cheap and the zero-dep core stays importable without it.
|
|
22
|
+
"""
|
|
23
|
+
from __future__ import annotations
|
|
24
|
+
|
|
25
|
+
from typing import Any, Optional
|
|
26
|
+
|
|
27
|
+
|
|
28
|
+
def build_cached_system_message(text: str, ttl: str, is_anthropic: bool):
    """Return a ``SystemMessage`` carrying ``text`` in the provider's shape.

    When ``is_anthropic`` is true the content is a single-element list holding
    one text block with ``cache_control: {"type": "ephemeral", "ttl": ttl}``.
    Otherwise the content is the plain string and ``ttl`` is unused.

    ``langchain_core`` is imported inside the function, so importing this
    module does not require it.
    """
    from langchain_core.messages import SystemMessage

    if not is_anthropic:
        return SystemMessage(content=text)

    cached_block = {
        "type": "text",
        "text": text,
        "cache_control": {"type": "ephemeral", "ttl": ttl},
    }
    return SystemMessage(content=[cached_block])


# Old private spelling kept as an alias; new code should call the public name.
_build_cached_system_message = build_cached_system_message
|
|
47
|
+
|
|
48
|
+
|
|
49
|
+
def _detect_anthropic(inner: Any) -> bool:
    """Report whether ``inner`` should be treated as an Anthropic model.

    An object exposing a non-``None`` ``_provider`` attribute is taken at its
    word: the result is whether that value equals ``"anthropic"``. Without
    one, the check falls back to whether the object's class name contains
    ``"Anthropic"``. Anything else yields False, which disables cache_control
    rather than risking it on a non-Anthropic wire.
    """
    declared = getattr(inner, "_provider", None)
    if declared is not None:
        return declared == "anthropic"
    # No declared provider: sniff the class name (e.g. ``ChatAnthropic``).
    return "Anthropic" in type(inner).__name__
|
|
60
|
+
|
|
61
|
+
|
|
62
|
+
class _CacheableLLM:
    """Proxy over a LangChain-style LLM that prepends a cached SystemMessage.

    ``invoke`` / ``ainvoke`` / ``stream`` put the cached SystemMessage in
    front of the caller's messages before delegating to the wrapped LLM.
    ``bind_tools`` / ``with_structured_output`` delegate and re-wrap the
    result so the cached prefix survives chained calls. Every other attribute
    is forwarded to the wrapped LLM unchanged.

    The provider flag is captured ONCE at the outermost wrap and threaded
    through every chained re-wrap. Re-sniffing a chained inner is unsafe: a
    structured-output proxy may have no ``_provider`` attribute and a class
    name without "Anthropic", so a re-sniff would silently flip
    ``_is_anthropic`` to False and drop cache_control from the wire.
    """

    def __init__(self, inner: Any, cached_text: str, ttl: str = "1h",
                 *, is_anthropic: Optional[bool] = None):
        """Wrap ``inner``.

        Args:
            inner: the LLM (or chained proxy) to delegate to.
            cached_text: text placed in the prepended SystemMessage.
            ttl: cache TTL forwarded to ``build_cached_system_message``.
            is_anthropic: explicit provider flag; when ``None`` it is
                detected from ``inner`` via ``_detect_anthropic``.
        """
        self._inner = inner
        self._cached_text = cached_text
        self._ttl = ttl
        self._is_anthropic = (
            _detect_anthropic(inner) if is_anthropic is None else is_anthropic
        )

    def _prepend(self, messages):
        """Return a new list: the cached SystemMessage followed by ``messages``.

        A ``list`` is spliced in element by element; any other input (e.g. a
        single string) is appended as one item.
        """
        sysmsg = build_cached_system_message(
            self._cached_text, self._ttl, self._is_anthropic,
        )
        if isinstance(messages, list):
            return [sysmsg, *messages]
        return [sysmsg, messages]

    def invoke(self, messages, *args, **kwargs):
        """Call the wrapped ``invoke`` with the cached prefix prepended."""
        return self._inner.invoke(self._prepend(messages), *args, **kwargs)

    async def ainvoke(self, messages, *args, **kwargs):
        """Async counterpart of :meth:`invoke`."""
        return await self._inner.ainvoke(self._prepend(messages), *args, **kwargs)

    def stream(self, messages, *args, **kwargs):
        """Call the wrapped ``stream`` with the cached prefix prepended."""
        return self._inner.stream(self._prepend(messages), *args, **kwargs)

    def bind_tools(self, *args, **kwargs):
        """Bind tools on the wrapped LLM and re-wrap with the same settings."""
        bound = self._inner.bind_tools(*args, **kwargs)
        return _CacheableLLM(bound, self._cached_text, self._ttl,
                             is_anthropic=self._is_anthropic)

    def with_structured_output(self, *args, **kwargs):
        """Request structured output and re-wrap with the same settings."""
        so = self._inner.with_structured_output(*args, **kwargs)
        return _CacheableLLM(so, self._cached_text, self._ttl,
                             is_anthropic=self._is_anthropic)

    def __getattr__(self, name):
        """Forward attributes not found on the proxy to the wrapped LLM.

        Raises:
            AttributeError: if ``_inner`` itself is not set yet, or if the
                wrapped LLM lacks the attribute.
        """
        # __getattr__ runs only after normal lookup fails. If ``_inner`` is
        # what is missing (instance built via __new__ by copy/pickle before
        # its state is restored), delegating would re-enter this method
        # forever and surface as RecursionError — which hasattr() does not
        # swallow. Fail with the ordinary AttributeError instead.
        if name == "_inner":
            raise AttributeError(name)
        return getattr(self._inner, name)
|
|
114
|
+
|
|
115
|
+
|
|
116
|
+
def with_cache(llm: Any, cached_text: str, ttl: str = "1h") -> _CacheableLLM:
    """Return a proxy around ``llm`` that prepends a cached SystemMessage on each call.

    Args:
        llm: any LangChain-compatible LLM (typically a ``_BoundLLM``).
        cached_text: the long prefix to cache (e.g. a playbook / master prompt).
        ttl: Anthropic ephemeral cache TTL — ``"5m"`` or ``"1h"``. For
            non-Anthropic providers the SystemMessage is still prepended, just
            without the cache_control marker, so the TTL has no effect.

    Returns:
        A :class:`_CacheableLLM` proxy. It stays chainable: ``.bind_tools()`` and
        ``.with_structured_output()`` return proxies as well.
    """
    # The provider flag is left unset here so the proxy sniffs it from ``llm``.
    proxy = _CacheableLLM(llm, cached_text, ttl=ttl)
    return proxy
|
|
131
|
+
|
|
132
|
+
|
|
133
|
+
# Module-level default TTL — settable by the Relay façade so
|
|
134
|
+
# `relay(cache_ttl_default="5m").claude().with_cache(text)` honors the TTL (the
|
|
135
|
+
# attached `.with_cache` method has no reference back to the Relay that made it).
|
|
136
|
+
DEFAULT_CACHE_TTL = "1h"  # initial value only; rebound at runtime by set_default_cache_ttl()
|
|
137
|
+
|
|
138
|
+
|
|
139
|
+
def set_default_cache_ttl(ttl: str) -> None:
    """Replace the module-wide default TTL.

    The attached ``.with_cache`` method falls back to this value whenever it is
    called without an explicit ``ttl=``. ``Relay.__post_init__`` calls this so
    per-façade defaults stay in sync.

    Args:
        ttl: the new default, stored as given (no validation is performed).
    """
    # Rebinds the module global in place of a ``global`` statement.
    globals()["DEFAULT_CACHE_TTL"] = ttl
|
|
145
|
+
|
|
146
|
+
|
|
147
|
+
def _attach_with_cache_method():
|
|
148
|
+
"""Attach ``with_cache`` as a bound method on ``_BoundLLM``.
|
|
149
|
+
|
|
150
|
+
Idempotent — safe to call multiple times. Attached at import so any LLM
|
|
151
|
+
produced via ``relay(...).gemini()`` etc. exposes ``.with_cache(text)`` as if
|
|
152
|
+
native, without forcing callers to import :func:`with_cache`.
|
|
153
|
+
"""
|
|
154
|
+
try:
|
|
155
|
+
from dispatch_relay.facade import _BoundLLM
|
|
156
|
+
except ImportError:
|
|
157
|
+
return
|
|
158
|
+
|
|
159
|
+
if "with_cache" in _BoundLLM.__dict__:
|
|
160
|
+
return
|
|
161
|
+
|
|
162
|
+
def _method(self, cached_text: str, ttl: Optional[str] = None):
|
|
163
|
+
return with_cache(self, cached_text, ttl=ttl or DEFAULT_CACHE_TTL)
|
|
164
|
+
|
|
165
|
+
_BoundLLM.with_cache = _method # type: ignore[attr-defined]
|
|
166
|
+
|
|
167
|
+
|
|
168
|
+
# Runs at import time; does nothing when dispatch_relay.facade cannot be imported.
_attach_with_cache_method()
|