dispatch-relay 0.0.1__tar.gz

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (29) hide show
  1. dispatch_relay-0.0.1/LICENSE +21 -0
  2. dispatch_relay-0.0.1/PKG-INFO +81 -0
  3. dispatch_relay-0.0.1/README.md +60 -0
  4. dispatch_relay-0.0.1/pyproject.toml +35 -0
  5. dispatch_relay-0.0.1/setup.cfg +4 -0
  6. dispatch_relay-0.0.1/src/dispatch_relay/__init__.py +47 -0
  7. dispatch_relay-0.0.1/src/dispatch_relay/analytics.py +191 -0
  8. dispatch_relay-0.0.1/src/dispatch_relay/caching.py +168 -0
  9. dispatch_relay-0.0.1/src/dispatch_relay/core.py +138 -0
  10. dispatch_relay-0.0.1/src/dispatch_relay/cost.py +239 -0
  11. dispatch_relay-0.0.1/src/dispatch_relay/dspy_adapter.py +176 -0
  12. dispatch_relay-0.0.1/src/dispatch_relay/facade.py +352 -0
  13. dispatch_relay-0.0.1/src/dispatch_relay/interfaces.py +211 -0
  14. dispatch_relay-0.0.1/src/dispatch_relay/prompt_eval.py +209 -0
  15. dispatch_relay-0.0.1/src/dispatch_relay.egg-info/PKG-INFO +81 -0
  16. dispatch_relay-0.0.1/src/dispatch_relay.egg-info/SOURCES.txt +27 -0
  17. dispatch_relay-0.0.1/src/dispatch_relay.egg-info/dependency_links.txt +1 -0
  18. dispatch_relay-0.0.1/src/dispatch_relay.egg-info/requires.txt +13 -0
  19. dispatch_relay-0.0.1/src/dispatch_relay.egg-info/top_level.txt +2 -0
  20. dispatch_relay-0.0.1/src/omega_llm/__init__.py +23 -0
  21. dispatch_relay-0.0.1/tests/test_analytics.py +124 -0
  22. dispatch_relay-0.0.1/tests/test_caching.py +240 -0
  23. dispatch_relay-0.0.1/tests/test_caller.py +81 -0
  24. dispatch_relay-0.0.1/tests/test_core.py +202 -0
  25. dispatch_relay-0.0.1/tests/test_cost.py +210 -0
  26. dispatch_relay-0.0.1/tests/test_dspy_adapter.py +165 -0
  27. dispatch_relay-0.0.1/tests/test_facade.py +180 -0
  28. dispatch_relay-0.0.1/tests/test_interfaces.py +213 -0
  29. dispatch_relay-0.0.1/tests/test_prompt_eval.py +175 -0
@@ -0,0 +1,21 @@
1
+ MIT License
2
+
3
+ Copyright (c) 2026 Pierre Samson and Claude
4
+
5
+ Permission is hereby granted, free of charge, to any person obtaining a copy
6
+ of this software and associated documentation files (the "Software"), to deal
7
+ in the Software without restriction, including without limitation the rights
8
+ to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
9
+ copies of the Software, and to permit persons to whom the Software is
10
+ furnished to do so, subject to the following conditions:
11
+
12
+ The above copyright notice and this permission notice shall be included in all
13
+ copies or substantial portions of the Software.
14
+
15
+ THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
16
+ IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
17
+ FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
18
+ AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
19
+ LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
20
+ OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
21
+ SOFTWARE.
@@ -0,0 +1,81 @@
1
+ Metadata-Version: 2.4
2
+ Name: dispatch-relay
3
+ Version: 0.0.1
4
+ Summary: Provider-agnostic LLM dispatch layer: 3 injected seams (config / usage / dispatch) + a pure cost model. Relays usage to a sink rather than tracking it.
5
+ Author: Pierre Samson, Claude
6
+ License: MIT
7
+ Requires-Python: >=3.10
8
+ Description-Content-Type: text/markdown
9
+ License-File: LICENSE
10
+ Provides-Extra: facade
11
+ Requires-Dist: langchain-core>=0.3; extra == "facade"
12
+ Provides-Extra: dspy
13
+ Requires-Dist: langchain-core>=0.3; extra == "dspy"
14
+ Requires-Dist: dspy>=2.0; extra == "dspy"
15
+ Requires-Dist: litellm>=1.0; extra == "dspy"
16
+ Provides-Extra: all
17
+ Requires-Dist: langchain-core>=0.3; extra == "all"
18
+ Requires-Dist: dspy>=2.0; extra == "all"
19
+ Requires-Dist: litellm>=1.0; extra == "all"
20
+ Dynamic: license-file
21
+
22
+ # dispatch-relay
23
+
24
+ **A provider-agnostic LLM layer with three injected seams.** Resolve a model, dispatch a call across any provider, and *relay* usage to a sink your application owns — instead of the library tracking it for you. Pure-stdlib core, zero runtime dependencies.
25
+
26
+ ```bash
27
+ pip install dispatch-relay
28
+ ```
29
+
30
+ **Who it's for:** anyone running more than one LLM provider who wants one consistent dispatch + usage-attribution surface, with the host application in control of config resolution, usage recording, and the actual transport. The "relay, not track" name is the contract: usage is relayed to *your* sink (a database, a log, nothing) — the library never decides where it lands.
31
+
32
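+ This is the **dependency-light foundation increment**: the three injected-interface seams + the pure cost model, with zero required runtime dependencies. (Caching, the higher-level façade, prompt evaluation and the DSPy adapter also ship in this package, but they sit behind the optional `[facade]` / `[dspy]` extras, which bring in `langchain-core` etc.; a plain `pip install dispatch-relay` stays pure-stdlib.)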
33
+
34
+ > **Renamed from `omega-llm`.** `import omega_llm` still works as a deprecated alias that re-exports `dispatch_relay` (with a `DeprecationWarning`) — migrate to `import dispatch_relay`.
35
+
36
+ ## The 3 injected seams (`dispatch_relay.interfaces`)
37
+
38
+ Each is a `@runtime_checkable typing.Protocol` (structural typing — a host satisfies the contract WITHOUT importing this library) + a dependency-light default impl.
39
+
40
+ | Seam | Method(s) | Default impl | A host can back it with |
41
+ |------|-----------|--------------|-------------------------|
42
+ | `ConfigSource` | `resolve(key, role, default) → model_id` | `DefaultConfigSource` (`os.getenv(f"{KEY}_MODEL") or default`) | a config store (role → global → env → default) |
43
+ | `UsageSink` | `record(*, provider, role, caller, model, tier, input_tokens, output_tokens, cache_read=0, cache_creation=0, cost_usd=0.0, cost_usd_raw=0.0, billing="metered", **extra) → None` | `NoOpUsageSink` (no-op) | a usage store / time-series table |
44
+ | `DispatchBackend` | `supports(*, provider, role, tier) → bool` + `dispatch(*, provider, model, messages, tier, role, caller, **kwargs) → LLMResponse` | `DefaultDispatchBackend` (direct SDK via injected `llm_factory`; `supports`→True) | subscription lanes / custom transports |
45
+
46
+ `cache_read` and `cache_creation` are **separate** fields on `UsageSink.record` and on `UsageRecord` — summing them undercounts Anthropic. `billing` marks the lane: `"metered"` ($-tracked SDK) vs `"subscription"` ($0).
47
+
48
+ ## Value types & core-owned facts (`dispatch_relay.core`)
49
+
50
+ ```python
51
+ @dataclass(frozen=True)
52
+ class UsageRecord: # input_tokens, output_tokens, cache_read=0, cache_creation=0, model=""
53
+ @dataclass(frozen=True)
54
+ class LLMResponse: # text, usage: UsageRecord | None, raw: Any
55
+ ```
56
+
57
+ The provider-facts live in `dispatch_relay.core` (one place, never duplicated per backend):
58
+
59
+ - `DEFAULTS: dict[str, str]` — the abstract-key → model-id table. The core passes `default=DEFAULTS[key]` into `ConfigSource.resolve`.
60
+ - `extract_usage(provider, raw) → UsageRecord | None` — the single place that knows each provider's usage-from-raw shape. **Anthropic dual-path**: prefer `raw.response_metadata["usage"]` (the uncached remainder), fall back to `raw.usage_metadata` only if absent (using the wrong one double-counts). The **model name** comes from `raw.response_metadata["model_name"]` (both Anthropic and Gemini surface it there — a real LangChain `AIMessage` has no top-level `.model` attribute), falling back to `""`. Returns `None` when no usage metadata is present.
61
+ - `resolve_usage(response, provider, model) → UsageRecord | None` — the **locked reconciliation rule**: resolve `response.usage if response.usage is not None else extract_usage(provider, response.raw)`, then **stamp the authoritative `model`** — the dispatch call knows the configured `model`, so the dispatch-arg model always wins over whatever the raw echoed (via `dataclasses.replace`). Returns `None` unchanged when there's no usage (the subscription lane). `LLMResponse.usage` is a real escape hatch — a backend MAY pre-populate it; else the core extracts.
62
+
63
+ Both shipped backends return `LLMResponse(usage=None)`; the core extracts usage. The `DefaultDispatchBackend` derives `text` from `raw.content`: a `str` passes through; an Anthropic content **list** has its `type=="text"` blocks joined (non-text blocks skipped); anything else falls back to `str(raw)`. That fallback is only the default backend's degenerate case — real subscription backends (raws are **dicts**, not strings) construct `text` explicitly and pass `usage=None` with `billing="subscription"`.
64
+
65
+ ## The pure cost model (`dispatch_relay.cost`)
66
+
67
+ `estimate_cost(*, prompt, tier="flash", provider="gemini", output_tokens_max=1024, cache_hit_ratio=0.0, role="agents") -> dict` — a single source of cost truth. Pricing tables for Gemini / Anthropic / OpenAI, the Gemini Flex 50% rebate gate, Anthropic + OpenAI cache-ratio math. Zero deps.
68
+
69
+ ## Usage
70
+
71
+ ```python
72
+ from dispatch_relay import estimate_cost, DefaultConfigSource, DEFAULTS
73
+
74
+ DefaultConfigSource().resolve("gemini_flash", "council", DEFAULTS["gemini_flash"])
75
+ # -> "gemini-2.5-flash" (env GEMINI_FLASH_MODEL wins if set)
76
+ estimate_cost(prompt=10_000, tier="sonnet", provider="anthropic", output_tokens_max=512)
77
+ ```
78
+
79
+ ## Authors
80
+
81
+ Pierre Samson and Claude. MIT licensed.
@@ -0,0 +1,60 @@
1
+ # dispatch-relay
2
+
3
+ **A provider-agnostic LLM layer with three injected seams.** Resolve a model, dispatch a call across any provider, and *relay* usage to a sink your application owns — instead of the library tracking it for you. Pure-stdlib core, zero runtime dependencies.
4
+
5
+ ```bash
6
+ pip install dispatch-relay
7
+ ```
8
+
9
+ **Who it's for:** anyone running more than one LLM provider who wants one consistent dispatch + usage-attribution surface, with the host application in control of config resolution, usage recording, and the actual transport. The "relay, not track" name is the contract: usage is relayed to *your* sink (a database, a log, nothing) — the library never decides where it lands.
10
+
11
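+ This is the **dependency-light foundation increment**: the three injected-interface seams + the pure cost model, with zero required runtime dependencies. (Caching, the higher-level façade, prompt evaluation and the DSPy adapter also ship in this package, but they sit behind the optional `[facade]` / `[dspy]` extras, which bring in `langchain-core` etc.; a plain `pip install dispatch-relay` stays pure-stdlib.)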
12
+
13
+ > **Renamed from `omega-llm`.** `import omega_llm` still works as a deprecated alias that re-exports `dispatch_relay` (with a `DeprecationWarning`) — migrate to `import dispatch_relay`.
14
+
15
+ ## The 3 injected seams (`dispatch_relay.interfaces`)
16
+
17
+ Each is a `@runtime_checkable typing.Protocol` (structural typing — a host satisfies the contract WITHOUT importing this library) + a dependency-light default impl.
18
+
19
+ | Seam | Method(s) | Default impl | A host can back it with |
20
+ |------|-----------|--------------|-------------------------|
21
+ | `ConfigSource` | `resolve(key, role, default) → model_id` | `DefaultConfigSource` (`os.getenv(f"{KEY}_MODEL") or default`) | a config store (role → global → env → default) |
22
+ | `UsageSink` | `record(*, provider, role, caller, model, tier, input_tokens, output_tokens, cache_read=0, cache_creation=0, cost_usd=0.0, cost_usd_raw=0.0, billing="metered", **extra) → None` | `NoOpUsageSink` (no-op) | a usage store / time-series table |
23
+ | `DispatchBackend` | `supports(*, provider, role, tier) → bool` + `dispatch(*, provider, model, messages, tier, role, caller, **kwargs) → LLMResponse` | `DefaultDispatchBackend` (direct SDK via injected `llm_factory`; `supports`→True) | subscription lanes / custom transports |
24
+
25
+ `cache_read` and `cache_creation` are **separate** fields on `UsageSink.record` and on `UsageRecord` — summing them undercounts Anthropic. `billing` marks the lane: `"metered"` ($-tracked SDK) vs `"subscription"` ($0).
26
+
27
+ ## Value types & core-owned facts (`dispatch_relay.core`)
28
+
29
+ ```python
30
+ @dataclass(frozen=True)
31
+ class UsageRecord: # input_tokens, output_tokens, cache_read=0, cache_creation=0, model=""
32
+ @dataclass(frozen=True)
33
+ class LLMResponse: # text, usage: UsageRecord | None, raw: Any
34
+ ```
35
+
36
+ The provider-facts live in `dispatch_relay.core` (one place, never duplicated per backend):
37
+
38
+ - `DEFAULTS: dict[str, str]` — the abstract-key → model-id table. The core passes `default=DEFAULTS[key]` into `ConfigSource.resolve`.
39
+ - `extract_usage(provider, raw) → UsageRecord | None` — the single place that knows each provider's usage-from-raw shape. **Anthropic dual-path**: prefer `raw.response_metadata["usage"]` (the uncached remainder), fall back to `raw.usage_metadata` only if absent (using the wrong one double-counts). The **model name** comes from `raw.response_metadata["model_name"]` (both Anthropic and Gemini surface it there — a real LangChain `AIMessage` has no top-level `.model` attribute), falling back to `""`. Returns `None` when no usage metadata is present.
40
+ - `resolve_usage(response, provider, model) → UsageRecord | None` — the **locked reconciliation rule**: resolve `response.usage if response.usage is not None else extract_usage(provider, response.raw)`, then **stamp the authoritative `model`** — the dispatch call knows the configured `model`, so the dispatch-arg model always wins over whatever the raw echoed (via `dataclasses.replace`). Returns `None` unchanged when there's no usage (the subscription lane). `LLMResponse.usage` is a real escape hatch — a backend MAY pre-populate it; else the core extracts.
41
+
42
+ Both shipped backends return `LLMResponse(usage=None)`; the core extracts usage. The `DefaultDispatchBackend` derives `text` from `raw.content`: a `str` passes through; an Anthropic content **list** has its `type=="text"` blocks joined (non-text blocks skipped); anything else falls back to `str(raw)`. That fallback is only the default backend's degenerate case — real subscription backends (raws are **dicts**, not strings) construct `text` explicitly and pass `usage=None` with `billing="subscription"`.
43
+
44
+ ## The pure cost model (`dispatch_relay.cost`)
45
+
46
+ `estimate_cost(*, prompt, tier="flash", provider="gemini", output_tokens_max=1024, cache_hit_ratio=0.0, role="agents") -> dict` — a single source of cost truth. Pricing tables for Gemini / Anthropic / OpenAI, the Gemini Flex 50% rebate gate, Anthropic + OpenAI cache-ratio math. Zero deps.
47
+
48
+ ## Usage
49
+
50
+ ```python
51
+ from dispatch_relay import estimate_cost, DefaultConfigSource, DEFAULTS
52
+
53
+ DefaultConfigSource().resolve("gemini_flash", "council", DEFAULTS["gemini_flash"])
54
+ # -> "gemini-2.5-flash" (env GEMINI_FLASH_MODEL wins if set)
55
+ estimate_cost(prompt=10_000, tier="sonnet", provider="anthropic", output_tokens_max=512)
56
+ ```
57
+
58
+ ## Authors
59
+
60
+ Pierre Samson and Claude. MIT licensed.
@@ -0,0 +1,35 @@
1
+ [build-system]
2
+ requires = ["setuptools>=61.0"]
3
+ build-backend = "setuptools.build_meta"
4
+
5
+ [project]
6
+ name = "dispatch-relay"
7
+ version = "0.0.1"
8
+ description = "Provider-agnostic LLM dispatch layer: 3 injected seams (config / usage / dispatch) + a pure cost model. Relays usage to a sink rather than tracking it."
9
+ readme = "README.md"
10
+ requires-python = ">=3.10"
11
+ license = { text = "MIT" }
12
+ authors = [
13
+ { name = "Pierre Samson" },
14
+ { name = "Claude" },
15
+ ]
16
+
17
+ # No REQUIRED runtime dependencies — the interfaces + cost model + analytics are
18
+ # pure-stdlib. langchain-core (and friends) are pulled in only by the optional
19
+ # extras declared under [project.optional-dependencies] below.
20
+ dependencies = []
21
+
22
+ [project.optional-dependencies]
23
+ # The façade/caching/prompt_eval surface — needs langchain message types.
24
+ # The core (interfaces + cost + analytics) stays pure-stdlib without this.
25
+ facade = ["langchain-core>=0.3"]
26
+ # The DSPy adapter (TrackedLM) path — sits on top of [facade].
27
+ dspy = ["langchain-core>=0.3", "dspy>=2.0", "litellm>=1.0"]
28
+ # Everything.
29
+ all = ["langchain-core>=0.3", "dspy>=2.0", "litellm>=1.0"]
30
+
31
+ [tool.setuptools.packages.find]
32
+ where = ["src"]
33
+
34
+ [tool.pytest.ini_options]
35
+ testpaths = ["tests"]
@@ -0,0 +1,4 @@
1
+ [egg_info]
2
+ tag_build =
3
+ tag_date = 0
4
+
@@ -0,0 +1,47 @@
1
+ """dispatch-relay — the swarph's canonical provider-agnostic LLM layer.
2
+
3
+ Pure core + 3 injected seams. The T1 contract, AI²-converged
4
+ with the peer 2026-06-08 (pending peer co-review).
5
+
6
+ Exports the three injected-interface seams (each a ``runtime_checkable`` Protocol +
7
+ a dependency-light default impl), the shared value types, the core-owned provider
8
+ facts, and the pure cost model:
9
+
10
+ - ConfigSource / DefaultConfigSource — resolve(key, role, default) → model_id
11
+ - UsageSink / NoOpUsageSink — record(...) usage (separate cache fields)
12
+ - DispatchBackend / DefaultDispatchBackend — supports(...) + dispatch(...) → LLMResponse
13
+ - LLMResponse / UsageRecord — shared value types
14
+ - DEFAULTS / extract_usage / resolve_usage — core-owned provider facts
15
+ - estimate_cost — pure pre-call cost estimator
16
+ """
17
+ from __future__ import annotations
18
+
19
+ from .cost import estimate_cost
20
+ from .core import DEFAULTS, extract_usage, resolve_usage
21
+ from .interfaces import (
22
+ ConfigSource,
23
+ DefaultConfigSource,
24
+ UsageSink,
25
+ NoOpUsageSink,
26
+ DispatchBackend,
27
+ DefaultDispatchBackend,
28
+ LLMResponse,
29
+ UsageRecord,
30
+ )
31
+
32
# Public API of the package, grouped by the seam each name belongs to.
__all__ = [
    # config seam
    "ConfigSource", "DefaultConfigSource",
    # usage seam
    "UsageSink", "NoOpUsageSink",
    # dispatch seam
    "DispatchBackend", "DefaultDispatchBackend",
    # shared value types
    "LLMResponse", "UsageRecord",
    # core-owned provider facts
    "DEFAULTS", "extract_usage", "resolve_usage",
    # pure cost model
    "estimate_cost",
]

__version__ = "0.0.1"
@@ -0,0 +1,191 @@
1
+ """Token-usage analytics — pure aggregation over in-memory usage records.
2
+
3
+ This is the INVERTED form of the host's old fetch-then-aggregate helper: the
4
+ library owns the AGGREGATION LOGIC and takes the usage records AS INPUT; it does
5
+ NOT fetch. The host keeps a thin wrapper that reads its store (a usage-table
6
+ hypertable, etc.) and delegates the rows here. Inverting the dependency keeps
7
+ this module pure-stdlib + zero-dep so it lives in the dependency-light core
8
+ without re-opening the locked T1 seam contract (no 4th read-seam).
9
+
10
+ Each record is a mapping with the keys this module reads:
11
+ ``provider`` / ``role`` / ``caller`` / ``model`` / ``day`` and the integer token
12
+ columns ``input`` / ``output`` / ``cached`` / ``thought`` plus a float ``cost``.
13
+
14
+ Schema-asymmetry note (load-bearing for the per-provider arithmetic in
15
+ ``_row_total_tokens`` below):
16
+
17
+ - Anthropic: ``input`` is the FRESH remainder (``lc_input - cache_read -
18
+ cache_create``) and ``cached`` is SEPARATE. The two columns are disjoint.
19
+ True prompt size = ``input + cached``.
20
+ - Gemini: ``input`` = ``prompt_token_count`` (the FULL prompt) with
21
+ ``cached`` as a SUBSET. True prompt size = ``input`` (``cached`` is
22
+ informational, already counted).
23
+ - OpenAI: ``input`` = ``prompt_tokens`` (FULL) with ``cached`` a subset.
24
+ Same convention as Gemini.
25
+ - ``thought`` carries reasoning tokens (Gemini Pro thinking mode, o1-style
26
+ models) — additive for total prompt cost regardless of provider, currently
27
+ 0 for non-reasoning calls.
28
+
29
+ Summing ``input + output`` everywhere undercounts Anthropic by the cache_read
30
+ amount; summing ``input + output + cached`` everywhere double-counts
31
+ Gemini/OpenAI by the same. The fix is per-provider arithmetic.
32
+ """
33
+ from __future__ import annotations
34
+
35
+ from collections import defaultdict
36
+ from typing import Optional
37
+
38
+
39
# Providers whose ``input`` column is already the FULL prompt, with ``cached``
# counted inside it as a subset. Adding ``cached`` on top for these providers
# would count the cached tokens twice.
_INPUT_INCLUDES_CACHED = {"gemini", "openai"}


def _row_total_tokens(provider: str, in_tok: int, out_tok: int,
                      cached: int, thought: int) -> int:
    """Return the provider-aware token total (prompt + output + thinking) of a row.

    ``input``, ``output`` and ``thought`` are always counted. ``cached`` is
    added only for providers that are NOT listed in ``_INPUT_INCLUDES_CACHED``
    (Anthropic, and any other provider that stores cached tokens disjoint from
    ``input``). The membership test is an exact, case-sensitive string match.
    """
    total = in_tok + out_tok + thought
    if provider not in _INPUT_INCLUDES_CACHED:
        # Disjoint-cache schema: cached tokens are not part of ``input`` yet.
        total += cached
    return total
53
+
54
+
55
def summarize_usage(records, *, days: int = 7, role: Optional[str] = None,
                    provider: Optional[str] = None,
                    caller: Optional[str] = None) -> dict:
    """Roll already-fetched usage ``records`` up into per-dimension buckets.

    This function never filters: ``days`` / ``role`` / ``provider`` /
    ``caller`` are copied into the result verbatim as metadata describing the
    filters the host applied when fetching. Every row supplied is aggregated.

    Each record is read with ``.get`` for the keys ``provider``, ``role``,
    ``caller``, ``model``, ``day``, ``input``, ``output``, ``cached``,
    ``thought`` and ``cost``. Missing or falsy values fall back to
    ``"unknown"`` (provider), ``"agents"`` (role), ``"<unattributed>"``
    (caller bucket key), ``0`` (token columns) and ``0.0`` (cost).

    Returns:
        {
            "days": int,
            "role": str | None,
            "provider": str | None,
            "caller": str | None,
            "total_cost_usd": float,      # rounded to 6 decimals
            "total_tokens": int,          # provider-aware, via _row_total_tokens
            "by_provider": {provider: {input, output, cached, thought, cost, n_rows}},
            "by_model": [{model, provider, role, caller, day, input, output,
                          cached, thought, cost}, …],   # one per row, cost-descending
            "by_role": {role: {cost, tokens}},
            "by_caller": {caller_or_"<unattributed>": {cost, tokens, n_rows}},
        }

    Empty input yields zero totals and empty buckets, so consumers need no
    ``None`` guards.
    """
    provider_rollup: dict[str, dict] = {}
    role_rollup: dict[str, dict] = {}
    caller_rollup: dict[str, dict] = {}
    rows: list[dict] = []
    cost_sum = 0.0
    token_sum = 0

    for rec in records:
        prov_key = rec.get("provider") or "unknown"
        role_key = rec.get("role") or "agents"
        # A stable string key for missing callers keeps the JSON dashboard-friendly.
        caller_key = rec.get("caller") or "<unattributed>"
        spend = float(rec.get("cost") or 0.0)
        n_in = int(rec.get("input") or 0)
        n_out = int(rec.get("output") or 0)
        n_cached = int(rec.get("cached") or 0)
        n_thought = int(rec.get("thought") or 0)
        tokens = _row_total_tokens(prov_key, n_in, n_out, n_cached, n_thought)

        prov_bucket = provider_rollup.setdefault(
            prov_key,
            {"input": 0, "output": 0, "cached": 0, "thought": 0,
             "cost": 0.0, "n_rows": 0},
        )
        prov_bucket["input"] += n_in
        prov_bucket["output"] += n_out
        prov_bucket["cached"] += n_cached
        prov_bucket["thought"] += n_thought
        prov_bucket["cost"] += spend
        prov_bucket["n_rows"] += 1

        role_bucket = role_rollup.setdefault(role_key, {"cost": 0.0, "tokens": 0})
        role_bucket["cost"] += spend
        role_bucket["tokens"] += tokens

        caller_bucket = caller_rollup.setdefault(
            caller_key, {"cost": 0.0, "tokens": 0, "n_rows": 0},
        )
        caller_bucket["cost"] += spend
        caller_bucket["tokens"] += tokens
        caller_bucket["n_rows"] += 1

        rows.append({
            "model": rec.get("model"),
            "provider": prov_key,
            "role": role_key,
            # Per-row entries keep the raw caller (possibly None), unlike the bucket key.
            "caller": rec.get("caller"),
            "day": rec.get("day"),
            "input": n_in,
            "output": n_out,
            "cached": n_cached,
            "thought": n_thought,
            "cost": spend,
        })

        cost_sum += spend
        token_sum += tokens

    # Most expensive rows first; ties keep their input order (stable sort).
    rows.sort(key=lambda row: -row["cost"])

    summary = {
        "days": days,
        "role": role,
        "provider": provider,
        "caller": caller,
        "total_cost_usd": round(cost_sum, 6),
        "total_tokens": token_sum,
        "by_provider": provider_rollup,
        "by_model": rows,
        "by_role": role_rollup,
        "by_caller": caller_rollup,
    }
    return summary
154
+
155
+
156
def detect_anomalies(records, *, spike_factor: float = 2.0) -> list[dict]:
    """Flag (model, day) cells whose cost is at least ``spike_factor`` × baseline.

    Rows are first summed into one cost per (model, day) cell, so several rows
    for the same model and day (e.g. different callers or roles) count as a
    single daily total. For each cell the baseline is the mean daily cost of
    that model over every OTHER day in the supplied records. A cell is flagged
    when the baseline is positive and ``cost >= spike_factor * baseline``.

    Models with fewer than three distinct days are skipped: there is not
    enough history for a baseline.

    Args:
        records: mappings read via ``.get`` for ``model`` (falsy →
            ``"unknown"``), ``day`` (used as a dict key, so it must be
            hashable) and ``cost`` (falsy → ``0.0``).
        spike_factor: multiple of the baseline at or above which a cell is
            reported.

    Returns:
        A list of ``{"model", "day", "cost", "baseline_avg", "factor"}`` dicts
        in descending cost order (``cost`` / ``baseline_avg`` rounded to 6
        decimals, ``factor`` to 2). Empty list when ``records`` is empty.
    """
    if not records:
        return []

    # model -> {day: summed cost}. Dict insertion order keeps first-seen order,
    # so pre-aggregated input (one row per model/day) behaves exactly as before.
    daily_cost: dict[str, dict] = defaultdict(dict)
    for r in records:
        model = r.get("model") or "unknown"
        day = r.get("day")
        cost = float(r.get("cost") or 0.0)
        per_day = daily_cost[model]
        per_day[day] = per_day.get(day, 0.0) + cost

    spikes = []
    for model, per_day in daily_cost.items():
        if len(per_day) < 3:
            continue
        for day, cost in per_day.items():
            others = [c for d, c in per_day.items() if d != day]
            baseline = sum(others) / len(others)
            # A zero baseline gives no meaningful ratio; skip rather than divide by 0.
            if baseline > 0 and cost >= spike_factor * baseline:
                spikes.append({
                    "model": model,
                    "day": day,
                    "cost": round(cost, 6),
                    "baseline_avg": round(baseline, 6),
                    "factor": round(cost / baseline, 2),
                })
    spikes.sort(key=lambda s: -s["cost"])
    return spikes
@@ -0,0 +1,168 @@
1
+ """Anthropic prompt-caching helper — wrap any LangChain-compatible LLM.
2
+
3
+ Wraps any LangChain-compatible LLM (typically a :class:`dispatch_relay.facade._BoundLLM`)
4
+ so every ``.invoke(messages)`` call prepends a SystemMessage carrying
5
+ ``cache_control: {"type": "ephemeral", "ttl": ttl}`` in the correct
6
+ list-of-blocks shape.
7
+
8
+ Why list-of-blocks: ``langchain_anthropic`` SILENTLY DROPS the
9
+ ``additional_kwargs={"cache_control": ...}`` shape. The only shape that
10
+ propagates to the wire is::
11
+
12
+ SystemMessage(content=[{"type": "text", "text": ...,
13
+ "cache_control": {"type": "ephemeral", "ttl": "1h"}}])
14
+
15
+ For non-Anthropic LLMs the SystemMessage is sent without cache_control (Gemini's
16
+ implicit caching handles long stable prefixes automatically; OpenAI auto-caches
17
+ prompt prefixes ≥1024 tokens at 50% off input price).
18
+
19
+ This module lives in the ``[facade]`` extra (it needs ``langchain_core``), but the
20
+ langchain import is LAZY (inside :func:`build_cached_system_message`) so importing
21
+ the module is cheap and the zero-dep core stays importable without it.
22
+ """
23
+ from __future__ import annotations
24
+
25
+ from typing import Any, Optional
26
+
27
+
28
def build_cached_system_message(text: str, ttl: str, is_anthropic: bool):
    """Return a ``SystemMessage`` for ``text`` shaped for the target provider.

    Anthropic gets a one-element list-of-blocks content whose text block
    carries ``cache_control: {"type": "ephemeral", "ttl": ttl}``. Every other
    provider gets plain string content and ``ttl`` is unused.

    ``langchain_core`` is imported inside the function, so merely importing
    this module does not require it.
    """
    from langchain_core.messages import SystemMessage

    if not is_anthropic:
        return SystemMessage(content=text)

    cached_block = {
        "type": "text",
        "text": text,
        "cache_control": {"type": "ephemeral", "ttl": ttl},
    }
    return SystemMessage(content=[cached_block])


# Old private spelling, kept so existing callers keep working; prefer the
# public name above.
_build_cached_system_message = build_cached_system_message
47
+
48
+
49
def _detect_anthropic(inner: Any) -> bool:
    """Report whether ``inner`` should be treated as an Anthropic model.

    An explicit ``_provider`` attribute (anything other than ``None``) is
    authoritative: the result is simply whether it equals ``"anthropic"``.
    Without one, the object's class name is checked for the substring
    ``"Anthropic"`` (which matches a raw ``ChatAnthropic``). Anything else is
    reported as non-Anthropic, so cache_control is left off rather than
    injected on a wire that may not accept it.
    """
    declared = getattr(inner, "_provider", None)
    if declared is not None:
        return declared == "anthropic"
    return "Anthropic" in type(inner).__name__
60
+
61
+
62
class _CacheableLLM:
    """Proxy over a LangChain LLM that prepends a cached SystemMessage on invoke.

    ``invoke`` / ``ainvoke`` / ``stream`` put the cached SystemMessage in front
    of the caller's messages before delegating to the wrapped LLM.
    ``bind_tools`` / ``with_structured_output`` delegate and then re-wrap the
    result, so the cached prefix survives chained calls.

    Any other attribute is forwarded untouched to the wrapped LLM through
    ``__getattr__``. Entry points not defined here (e.g. ``astream`` or
    ``batch``) therefore reach the inner LLM WITHOUT the cached prefix.

    The provider flag is decided once, at the outermost wrap, and passed
    explicitly to every chained re-wrap. Re-detecting on a chained inner is
    unsafe: a structured-output proxy may expose neither ``_provider`` nor a
    class name containing "Anthropic", which would silently flip the flag to
    False and drop cache_control from the wire.
    """

    def __init__(self, inner: Any, cached_text: str, ttl: str = "1h",
                 *, is_anthropic: Optional[bool] = None):
        """Wrap ``inner``.

        Args:
            inner: the LLM (or chained runnable) to delegate to.
            cached_text: the prefix sent as the leading SystemMessage.
            ttl: cache TTL placed in the Anthropic cache_control marker.
            is_anthropic: explicit provider flag; ``None`` means detect it
                from ``inner``.
        """
        self._inner = inner
        self._cached_text = cached_text
        self._ttl = ttl
        self._is_anthropic = (
            _detect_anthropic(inner) if is_anthropic is None else is_anthropic
        )

    def _prepend(self, messages):
        """Return a new list with the cached SystemMessage in front.

        A list input is spliced in; any other input (e.g. a single message or
        a plain string) becomes the second element as-is.
        """
        sysmsg = build_cached_system_message(
            self._cached_text, self._ttl, self._is_anthropic,
        )
        if isinstance(messages, list):
            return [sysmsg, *messages]
        return [sysmsg, messages]

    def invoke(self, messages, *args, **kwargs):
        """Call the inner ``invoke`` with the cached prefix prepended."""
        return self._inner.invoke(self._prepend(messages), *args, **kwargs)

    async def ainvoke(self, messages, *args, **kwargs):
        """Await the inner ``ainvoke`` with the cached prefix prepended."""
        return await self._inner.ainvoke(self._prepend(messages), *args, **kwargs)

    def stream(self, messages, *args, **kwargs):
        """Call the inner ``stream`` with the cached prefix prepended."""
        return self._inner.stream(self._prepend(messages), *args, **kwargs)

    def bind_tools(self, *args, **kwargs):
        """Bind tools on the inner LLM and re-wrap, keeping the provider flag."""
        bound = self._inner.bind_tools(*args, **kwargs)
        return _CacheableLLM(bound, self._cached_text, self._ttl,
                             is_anthropic=self._is_anthropic)

    def with_structured_output(self, *args, **kwargs):
        """Request structured output on the inner LLM and re-wrap the result."""
        so = self._inner.with_structured_output(*args, **kwargs)
        return _CacheableLLM(so, self._cached_text, self._ttl,
                             is_anthropic=self._is_anthropic)

    def __getattr__(self, name):
        """Forward attributes not found on the proxy to the wrapped LLM.

        Raises:
            AttributeError: if ``_inner`` itself is not set yet, or the
                wrapped LLM lacks ``name``.
        """
        # __getattr__ only runs after normal lookup has failed. If ``_inner``
        # is the missing name (bare instance from __new__, as used by
        # copy.copy / unpickling), reading self._inner below would re-enter
        # this method forever and end in RecursionError.
        if name == "_inner":
            raise AttributeError(name)
        return getattr(self._inner, name)
114
+
115
+
116
def with_cache(llm: Any, cached_text: str, ttl: str = "1h") -> _CacheableLLM:
    """Return a proxy around ``llm`` that prepends a cached SystemMessage.

    Args:
        llm: any LangChain-compatible LLM (typically a ``_BoundLLM``).
        cached_text: the long, stable prefix to cache (e.g. a playbook or
            master prompt).
        ttl: Anthropic ephemeral cache TTL — ``"5m"`` or ``"1h"``. For
            non-Anthropic providers the SystemMessage is still prepended,
            just without a cache_control marker, so the TTL has no effect.

    Returns:
        A :class:`_CacheableLLM`; it stays chainable through
        ``.bind_tools()`` and ``.with_structured_output()``.
    """
    # No explicit provider flag: the proxy detects it from ``llm`` itself.
    proxy = _CacheableLLM(llm, cached_text, ttl=ttl)
    return proxy
131
+
132
+
133
# Module-wide fallback TTL. The ``.with_cache`` method attached to ``_BoundLLM``
# has no reference back to the Relay that produced the LLM, so the Relay façade
# publishes its ``cache_ttl_default`` here and the attached method reads it
# whenever no explicit ``ttl=`` is given.
DEFAULT_CACHE_TTL = "1h"


def set_default_cache_ttl(ttl: str) -> None:
    """Replace the module-wide default TTL used by the attached ``.with_cache``.

    Only calls made without an explicit ``ttl=`` are affected. Intended to be
    called from ``Relay.__post_init__`` so the façade's default stays in sync.
    The value is stored as given, with no validation.
    """
    global DEFAULT_CACHE_TTL
    DEFAULT_CACHE_TTL = ttl
145
+
146
+
147
def _attach_with_cache_method():
    """Install ``with_cache`` as a method on ``dispatch_relay.facade._BoundLLM``.

    Lets LLMs built by the façade expose ``.with_cache(text)`` directly,
    without the caller importing :func:`with_cache`.

    Does nothing when the façade cannot be imported, or when ``_BoundLLM``
    already defines ``with_cache`` in its own ``__dict__`` — so repeated calls
    are harmless.
    """
    try:
        from dispatch_relay.facade import _BoundLLM
    except ImportError:
        # Façade (or one of its dependencies) is unavailable: nothing to patch.
        return

    if "with_cache" not in vars(_BoundLLM):
        def _method(self, cached_text: str, ttl: Optional[str] = None):
            # The module default is read at call time, so later
            # set_default_cache_ttl() calls are honoured.
            return with_cache(self, cached_text, ttl=ttl or DEFAULT_CACHE_TTL)

        setattr(_BoundLLM, "with_cache", _method)


# Patch at import time so the method is available as soon as this module loads.
_attach_with_cache_method()