coderouter-cli 1.8.3__py3-none-any.whl → 1.9.0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -115,6 +115,24 @@ class RegistryCapabilities(BaseModel):
115
115
  "startup check)."
116
116
  ),
117
117
  )
118
+ cache_control: bool | None = Field(
119
+ default=None,
120
+ description=(
121
+ "v1.9-B: does the upstream preserve Anthropic ``cache_control`` "
122
+ "markers end-to-end on the wire? When True, the v0.5-B "
123
+ "translation-lossy gate stays quiet and the doctor cache probe "
124
+ "treats the upstream as eligible for hit-rate verification. "
125
+ "Independent from ``providers.yaml capabilities.prompt_cache`` "
126
+ "(provider-level explicit opt-in); the registry value carries "
127
+ "model-family defaults that match real-world wire support — "
128
+ "for Anthropic native (``claude-sonnet-*`` / ``claude-opus-*``) "
129
+ "and LM Studio's ``/v1/messages`` for ``qwen3.5-*`` / "
130
+ "``qwen3.6-*`` (verified live in v1.8.4, "
131
+ "``cache_read_input_tokens: 280`` observed). "
132
+ "``None`` (default) = no opinion → the v0.5-B gate falls back "
133
+ "to ``provider.kind == 'anthropic'`` for the answer."
134
+ ),
135
+ )
118
136
 
119
137
 
120
138
  class CapabilityRule(BaseModel):
@@ -182,6 +200,7 @@ class ResolvedCapabilities:
182
200
  tools: bool | None = None
183
201
  max_context_tokens: int | None = None
184
202
  claude_code_suitability: Literal["ok", "degraded"] | None = None
203
+ cache_control: bool | None = None
185
204
 
186
205
 
187
206
  # ---------------------------------------------------------------------------
@@ -233,12 +252,14 @@ class CapabilityRegistry:
233
252
  resolved_tools: bool | None = None
234
253
  resolved_max_ctx: int | None = None
235
254
  resolved_suitability: Literal["ok", "degraded"] | None = None
255
+ resolved_cache_control: bool | None = None
236
256
 
237
257
  thinking_locked = False
238
258
  reasoning_locked = False
239
259
  tools_locked = False
240
260
  max_ctx_locked = False
241
261
  suitability_locked = False
262
+ cache_control_locked = False
242
263
 
243
264
  for rule in self._rules:
244
265
  if not rule.kind_matches(kind):
@@ -261,12 +282,16 @@ class CapabilityRegistry:
261
282
  if not suitability_locked and caps.claude_code_suitability is not None:
262
283
  resolved_suitability = caps.claude_code_suitability
263
284
  suitability_locked = True
285
+ if not cache_control_locked and caps.cache_control is not None:
286
+ resolved_cache_control = caps.cache_control
287
+ cache_control_locked = True
264
288
  if (
265
289
  thinking_locked
266
290
  and reasoning_locked
267
291
  and tools_locked
268
292
  and max_ctx_locked
269
293
  and suitability_locked
294
+ and cache_control_locked
270
295
  ):
271
296
  break
272
297
 
@@ -276,6 +301,7 @@ class CapabilityRegistry:
276
301
  tools=resolved_tools,
277
302
  max_context_tokens=resolved_max_ctx,
278
303
  claude_code_suitability=resolved_suitability,
304
+ cache_control=resolved_cache_control,
279
305
  )
280
306
 
281
307
  # ------------------------------------------------------------------
@@ -49,6 +49,87 @@ class Capabilities(BaseModel):
49
49
  openai_compatible: bool = True
50
50
 
51
51
 
52
class CostConfig(BaseModel):
    """v1.9-D: unit prices a provider declares for cost aggregation.

    Every field is optional. A provider whose :attr:`ProviderConfig.cost`
    is left unset adds nothing to the cost dashboard but is still counted
    in the token totals, the same way a free local model is.

    How the four token classes are priced
    -------------------------------------

    The scheme mirrors Anthropic's prompt-cache pricing (verified 2026-04
    against docs.anthropic.com):

    * fresh input    = ``input_tokens_per_million`` x 1.0
    * output         = ``output_tokens_per_million`` x 1.0
    * cache read     = ``input_tokens_per_million`` x ``cache_read_discount``
    * cache creation = ``input_tokens_per_million`` x ``cache_creation_premium``

    v1.9-A's ``cache-observed`` log records a 4-class breakdown
    (cache_hit / cache_creation / no_cache / unknown), which is what
    lets the cost aggregator pick the right multiplier per token. The
    dashboard's "savings" number is
    ``cache_read_input_tokens x normal x (1 - cache_read_discount)``,
    i.e. the extra amount the operator *would have* paid had prompt
    caching not been in effect.

    For comparison, LiteLLM's cost tracker (verified 2026-04) has no
    cache-aware breakdown: it charges ``cache_read_input_tokens`` at the
    full input rate and therefore overstates spend on cache-heavy
    workloads. Getting this right is the CodeRouter dashboard's selling
    point.
    """

    # Reject unknown keys so a typo in the pricing section fails validation.
    model_config = ConfigDict(extra="forbid")

    # Base prices, USD per one million tokens; None means "not declared".
    input_tokens_per_million: float | None = Field(
        ge=0.0,
        default=None,
        description=(
            "USD per million input tokens at normal (uncached) rate. "
            "Anthropic Sonnet 4.x is around 3.00, Opus 4.x around 15.00 "
            "(check the upstream's pricing page — values change)."
        ),
    )
    output_tokens_per_million: float | None = Field(
        ge=0.0,
        default=None,
        description=(
            "USD per million output tokens. Output is invariably the "
            "expensive side of the meter — for coding workloads with "
            "large completions this dominates the bill."
        ),
    )

    # Multipliers on the input price for the two prompt-cache token classes.
    # Despite its name, cache_read_discount is the fraction *charged*
    # (0.10 = billed at 10%), not the fraction taken off.
    cache_read_discount: float = Field(
        ge=0.0,
        le=1.0,
        default=0.10,
        description=(
            "Multiplier applied to ``input_tokens_per_million`` for "
            "tokens served from prompt cache. Anthropic's 2026-04 "
            "pricing is 0.10 (i.e. cache reads are billed at 10% of "
            "normal input rate). LM Studio /v1/messages locally "
            "honors the cache_read field but local backends usually "
            "have ``input_tokens_per_million`` of 0.0, so this field "
            "is moot there."
        ),
    )
    cache_creation_premium: float = Field(
        ge=0.0,
        default=1.25,
        description=(
            "Multiplier applied to ``input_tokens_per_million`` for "
            "tokens *written* to the prompt cache on the first hit. "
            "Anthropic's 2026-04 pricing is 1.25 (cache writes cost "
            "25% more than normal input on the writeback call; "
            "subsequent reads then cost ``cache_read_discount`` x, "
            "amortizing the writeback). Above 1.0 means premium, "
            "1.0 = no premium, below 1.0 = discount on creation "
            "(unusual but theoretically supported by the schema)."
        ),
    )
131
+
132
+
52
133
  class ProviderConfig(BaseModel):
53
134
  """A single provider entry from providers.yaml.
54
135
 
@@ -116,6 +197,19 @@ class ProviderConfig(BaseModel):
116
197
 
117
198
  capabilities: Capabilities = Field(default_factory=Capabilities)
118
199
 
200
+ cost: CostConfig | None = Field(
201
+ default=None,
202
+ description=(
203
+ "v1.9-D: per-provider unit pricing for cost aggregation. "
204
+ "Unset = provider contributes zero to the cost dashboard "
205
+ "(typical for local models). Set on paid endpoints to "
206
+ "feed the ``/dashboard`` cost panel and the "
207
+ "``coderouter stats --cost`` TUI summary. Cache-aware "
208
+ "calculation differentiates cache_read (90% discount on "
209
+ "Anthropic) from normal input — see :class:`CostConfig`."
210
+ ),
211
+ )
212
+
119
213
  @model_validator(mode="after")
120
214
  def _check_output_filters_known(self) -> ProviderConfig:
121
215
  """v1.0-A: fail at config-load on a typo'd filter name.
@@ -173,6 +267,73 @@ class FallbackChain(BaseModel):
173
267
  "for this profile."
174
268
  ),
175
269
  )
270
+ # v1.9-E (L3): tool-loop detection guard.
271
+ #
272
+ # Long-running agent loops can fall into "tool stuck" states where
273
+ # the assistant repeatedly calls the same tool with identical args
274
+ # because it can't make progress. The guard inspects the assistant
275
+ # tool_use history in the inbound request and, when the same call
276
+ # repeats above the threshold, takes the configured action.
277
+ #
278
+ # Three actions trade off intervention against UX disruption:
279
+ # * ``warn`` — emit a structured ``tool-loop-detected`` log only.
280
+ # Diagnostic; default for v1.9-E.
281
+ # * ``inject`` — append a system message reminder ("you appear to
282
+ # be looping, try a different approach") so the
283
+ # next assistant turn has a chance to course-correct.
284
+ # * ``break`` — short-circuit the request with an error response.
285
+ # Use when downstream cost / context exhaustion is
286
+ # worse than telling the agent to stop.
287
+ tool_loop_window: int = Field(
288
+ default=5,
289
+ ge=2,
290
+ le=50,
291
+ description=(
292
+ "v1.9-E (L3): how many of the most recent assistant tool_use "
293
+ "blocks to inspect for a loop. Default 5 covers the typical "
294
+ "Claude Code agent step depth without false-positiving on "
295
+ "legitimate same-tool repetition (e.g. iterating Read on "
296
+ "different files)."
297
+ ),
298
+ )
299
+ tool_loop_threshold: int = Field(
300
+ default=3,
301
+ ge=2,
302
+ le=50,
303
+ description=(
304
+ "v1.9-E (L3): how many *consecutive identical* tool calls "
305
+ "(same name + same args) trigger a loop verdict. Default 3 "
306
+ "catches the most common stuck patterns (Read same file 3x, "
307
+ "Bash same command 3x) while leaving headroom for "
308
+ "intentional repetition with intermediate observations."
309
+ ),
310
+ )
311
+ tool_loop_action: Literal["warn", "inject", "break"] = Field(
312
+ default="warn",
313
+ description=(
314
+ "v1.9-E (L3): action when a loop is detected. ``warn`` (default) "
315
+ "emits a log line only; ``inject`` adds a ``you-are-looping`` "
316
+ "system message reminder to the request; ``break`` returns an "
317
+ "error response. See FallbackChain comment for trade-offs."
318
+ ),
319
+ )
320
+ adaptive: bool = Field(
321
+ default=False,
322
+ description=(
323
+ "v1.9-C: enable health-based dynamic chain reordering for "
324
+ "this profile. When True, the engine consults its "
325
+ "AdaptiveAdjuster and may demote providers whose rolling-"
326
+ "window median latency or error rate exceeds the configured "
327
+ "thresholds (1.5x global median / 10% errors). Demotions are "
328
+ "debounced (30 s minimum between rank changes per provider) "
329
+ "so a transient blip cannot oscillate the chain. When False "
330
+ "(default), the static ``providers`` order is honored "
331
+ "verbatim — no observation overhead. Orthogonal to L5 "
332
+ "(binary HEALTHY/UNHEALTHY backend swap, planned for "
333
+ "v1.9-E phase 3): C handles the gradient case during normal "
334
+ "operation, L5 handles hard crashes."
335
+ ),
336
+ )
176
337
 
177
338
 
178
339
  # ---------------------------------------------------------------------------
coderouter/cost.py ADDED
@@ -0,0 +1,154 @@
1
+ """Cost calculation utilities (v1.9-D Cost-aware Dashboard).
2
+
3
+ Pure functions for translating per-request token counts into USD
4
+ spend, accounting for Anthropic's prompt-cache pricing model
5
+ (``cache_read`` at 10% of normal input, ``cache_creation`` at 125%).
6
+
7
+ Where this fits
8
+ ===============
9
+
10
+ The engine's ``_emit_cache_observed`` site (v1.9-A) calls
11
+ :func:`compute_cost_for_attempt` to enrich the ``cache-observed``
12
+ log line with ``cost_usd`` + ``cost_savings_usd`` fields. The
13
+ MetricsCollector then aggregates per-provider totals over the
14
+ process lifetime, and the dashboard / ``coderouter stats --cost``
15
+ TUI render those aggregates.
16
+
17
+ Why a separate module
18
+ =====================
19
+
20
+ Pricing math is small, pure, and shared by:
21
+
22
+ * the engine's per-request cost calc
23
+ * the collector's snapshot rendering (recomputes a "what-if no
24
+ cache" total for the savings panel)
25
+ * the future ``coderouter stats --cost`` CLI
26
+
27
+ Keeping it as a leaf module with no engine / collector imports
28
+ prevents circular dependencies and makes the pricing semantics
29
+ trivially testable in isolation.
30
+
31
+ Anthropic pricing reference (verified 2026-04)
32
+ ==============================================
33
+
34
+ For Sonnet / Opus / Haiku 4.x:
35
+
36
+ * Normal input : ``input_tokens_per_million`` x 1.0
37
+ * Cache read : ``input_tokens_per_million`` x 0.10
38
+ * Cache creation: ``input_tokens_per_million`` x 1.25
39
+ * Normal output : ``output_tokens_per_million`` x 1.0
40
+
41
+ Tokens reported by the upstream:
42
+
43
+ * ``input_tokens`` — "fresh" input (cache reads / writes are
44
+ excluded from this count and reported via the cache fields).
45
+ * ``cache_read_input_tokens`` — served from prompt cache.
46
+ * ``cache_creation_input_tokens`` — written to prompt cache.
47
+ * ``output_tokens`` — completion.
48
+
49
+ So a single response's billable cost is the sum of the four buckets
50
+ billed at their respective rates. The "savings" figure is the
51
+ counterfactual: what the operator *would have* paid without prompt
52
+ caching, so it focuses on the cache_read tokens (those are the
53
+ ones that got the 90% discount). cache_creation is a premium, not
54
+ a savings, so it doesn't enter the savings figure even though it's
55
+ in the cost calc.
56
+ """
57
+
58
+ from __future__ import annotations
59
+
60
+ from dataclasses import dataclass
61
+
62
+ from coderouter.config.schemas import CostConfig
63
+
64
+
65
@dataclass(frozen=True)
class CostBreakdown:
    """Immutable USD cost record for a single attempt.

    Every component defaults to ``0.0``, so constructing the class with
    no arguments gives the "free / no pricing configured" result and
    callers never have to special-case ``None``.

    Attributes:
        total_usd: Full amount charged for the attempt, i.e. the four
            token buckets summed at their respective rates.
        savings_usd: Hypothetical "no-cache" delta: the cost of
            ``cache_read_input_tokens`` at the full input rate minus
            what was actually paid at the ``cache_read_discount`` rate.
            Non-negative as produced by ``compute_cost_for_attempt``.
        input_usd: Cost of fresh input only; the cache buckets are not
            included here.
        output_usd: Cost of the completion tokens.
        cache_read_usd: Post-discount cost of tokens served from cache.
        cache_creation_usd: Post-premium cost of tokens written to cache.

    The four per-bucket values feed the dashboard's stacked bar chart.
    """

    # Headline figures.
    total_usd: float = 0.0
    savings_usd: float = 0.0

    # Per-bucket components.
    input_usd: float = 0.0
    output_usd: float = 0.0
    cache_read_usd: float = 0.0
    cache_creation_usd: float = 0.0
93
+
94
+
95
+ _PER_MILLION: float = 1_000_000.0
96
+
97
+
98
def compute_cost_for_attempt(
    cost_config: CostConfig | None,
    *,
    input_tokens: int,
    output_tokens: int,
    cache_read_input_tokens: int,
    cache_creation_input_tokens: int,
) -> CostBreakdown:
    """Price one attempt's token usage as a USD :class:`CostBreakdown`.

    Args:
        cost_config: The provider's pricing, or ``None`` when the
            provider declares none (typical for local models).
        input_tokens: Fresh (uncached) input tokens.
        output_tokens: Completion tokens.
        cache_read_input_tokens: Input tokens served from prompt cache.
        cache_creation_input_tokens: Input tokens written to prompt cache.

    Returns:
        A zero-filled breakdown when ``cost_config`` is ``None``.
        Otherwise each bucket is priced at its own rate; a per-million
        price left unset counts as ``0.0``, so a partial declaration
        yields whatever the declared fields can compute (all zeros when
        both prices are unset).

    Token counts at or below zero contribute no cost. The clamp is
    purely defensive: it keeps a malformed value from pulling the
    collector's aggregate counters downwards.
    """
    # No pricing declared for this provider: the attempt is free.
    if cost_config is None:
        return CostBreakdown()

    # Convert "USD per million tokens" into "USD per token".
    per_input_token = (cost_config.input_tokens_per_million or 0.0) / _PER_MILLION
    per_output_token = (cost_config.output_tokens_per_million or 0.0) / _PER_MILLION

    # Clamp all four counts at zero in one pass.
    fresh, completion, cached, written = (
        max(count, 0)
        for count in (
            input_tokens,
            output_tokens,
            cache_read_input_tokens,
            cache_creation_input_tokens,
        )
    )

    fresh_usd = fresh * per_input_token
    completion_usd = completion * per_output_token

    # What the cache-read tokens would have cost as ordinary input; this
    # is both the base for the discounted price and the counterfactual
    # used for the savings figure below.
    undiscounted_read_usd = cached * per_input_token
    read_usd = undiscounted_read_usd * cost_config.cache_read_discount
    creation_usd = written * per_input_token * cost_config.cache_creation_premium

    # Only cache reads count towards savings. Cache creation is a
    # premium rather than a saving; folding it in would make a cache
    # miss surface as "negative savings" on the dashboard.
    saved_usd = undiscounted_read_usd - read_usd

    return CostBreakdown(
        total_usd=fresh_usd + completion_usd + read_usd + creation_usd,
        savings_usd=max(saved_usd, 0.0),
        input_usd=fresh_usd,
        output_usd=completion_usd,
        cache_read_usd=read_usd,
        cache_creation_usd=creation_usd,
    )
@@ -90,6 +90,61 @@ rules:
90
90
  capabilities:
91
91
  thinking: true
92
92
 
93
+ # ------------------------------------------------------------------
94
+ # Anthropic prompt caching — `cache_control` body field support (v1.9-B).
95
+ #
96
+ # Declares which (kind, model) pairs preserve the ``cache_control``
97
+ # marker end-to-end on the wire. The capability gate
98
+ # (``provider_supports_cache_control`` in coderouter/routing/capability.py)
99
+ # consults these declarations to decide whether to emit a
100
+ # ``capability-degraded reason=translation-lossy`` log when handing
101
+ # a cache_control-bearing request to the provider.
102
+ #
103
+ # Verification basis:
104
+ # * api.anthropic.com (kind=anthropic, claude-* models):
105
+ # verified live 2026-04-20 — 1321 tokens written on call 1,
106
+ # 1321 read on call 2. Native Anthropic shape passes through
107
+ # verbatim, no translation hop loses the marker.
108
+ # * LM Studio /v1/messages (kind=anthropic, qwen3.5-* / qwen3.6-*):
109
+ # verified live 2026-04-27 in v1.8.4 — `cache_read_input_tokens: 280`
110
+ # observed end-to-end through CodeRouter. LM Studio 0.4.12+ honors
111
+ # the marker on its Anthropic-compatible endpoint.
112
+ #
113
+ # All entries are kind=anthropic. openai_compat upstreams have no wire
114
+ # equivalent for cache_control (the OpenAI Chat Completions schema does
115
+ # not carry it), so they intentionally stay undeclared — the gate's
116
+ # fallback maps undeclared + kind=openai_compat to False.
117
+ # ------------------------------------------------------------------
118
+
119
+ - match: "claude-opus-4-*"
120
+ kind: anthropic
121
+ capabilities:
122
+ cache_control: true
123
+
124
+ - match: "claude-sonnet-4-*"
125
+ kind: anthropic
126
+ capabilities:
127
+ cache_control: true
128
+
129
+ - match: "claude-haiku-4-*"
130
+ kind: anthropic
131
+ capabilities:
132
+ cache_control: true
133
+
134
+ # LM Studio /v1/messages exposes Qwen3.5 / Qwen3.6 with Anthropic-shaped
135
+ # responses including ``cache_read_input_tokens``. The provider declares
136
+ # ``kind: anthropic`` even though the underlying model is open-weights;
137
+ # the registry rule keys off the model name pattern.
138
+ - match: "qwen3.5-*"
139
+ kind: anthropic
140
+ capabilities:
141
+ cache_control: true
142
+
143
+ - match: "qwen3.6-*"
144
+ kind: anthropic
145
+ capabilities:
146
+ cache_control: true
147
+
93
148
  # ------------------------------------------------------------------
94
149
  # Claude Code suitability — agentic harness compatibility hint (v1.7-B).
95
150
  #