coderouter-cli 1.8.5__py3-none-any.whl → 1.10.0__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- coderouter/config/capability_registry.py +26 -0
- coderouter/config/schemas.py +315 -1
- coderouter/cost.py +154 -0
- coderouter/data/model-capabilities.yaml +55 -0
- coderouter/doctor.py +243 -1
- coderouter/guards/__init__.py +18 -0
- coderouter/guards/backend_health.py +208 -0
- coderouter/guards/memory_pressure.py +210 -0
- coderouter/guards/tool_loop.py +339 -0
- coderouter/ingress/anthropic_routes.py +86 -0
- coderouter/logging.py +613 -0
- coderouter/metrics/collector.py +272 -0
- coderouter/metrics/prometheus.py +207 -0
- coderouter/routing/adaptive.py +495 -0
- coderouter/routing/auto_router.py +118 -13
- coderouter/routing/budget.py +191 -0
- coderouter/routing/capability.py +36 -18
- coderouter/routing/fallback.py +896 -18
- {coderouter_cli-1.8.5.dist-info → coderouter_cli-1.10.0.dist-info}/METADATA +27 -8
- {coderouter_cli-1.8.5.dist-info → coderouter_cli-1.10.0.dist-info}/RECORD +23 -16
- {coderouter_cli-1.8.5.dist-info → coderouter_cli-1.10.0.dist-info}/WHEEL +0 -0
- {coderouter_cli-1.8.5.dist-info → coderouter_cli-1.10.0.dist-info}/entry_points.txt +0 -0
- {coderouter_cli-1.8.5.dist-info → coderouter_cli-1.10.0.dist-info}/licenses/LICENSE +0 -0
|
@@ -115,6 +115,24 @@ class RegistryCapabilities(BaseModel):
|
|
|
115
115
|
"startup check)."
|
|
116
116
|
),
|
|
117
117
|
)
|
|
118
|
+
cache_control: bool | None = Field(
|
|
119
|
+
default=None,
|
|
120
|
+
description=(
|
|
121
|
+
"v1.9-B: does the upstream preserve Anthropic ``cache_control`` "
|
|
122
|
+
"markers end-to-end on the wire? When True, the v0.5-B "
|
|
123
|
+
"translation-lossy gate stays quiet and the doctor cache probe "
|
|
124
|
+
"treats the upstream as eligible for hit-rate verification. "
|
|
125
|
+
"Independent from ``providers.yaml capabilities.prompt_cache`` "
|
|
126
|
+
"(provider-level explicit opt-in); the registry value carries "
|
|
127
|
+
"model-family defaults that match real-world wire support — "
|
|
128
|
+
"for Anthropic native (``claude-sonnet-*`` / ``claude-opus-*``) "
|
|
129
|
+
"and LM Studio's ``/v1/messages`` for ``qwen3.5-*`` / "
|
|
130
|
+
"``qwen3.6-*`` (verified live in v1.8.4, "
|
|
131
|
+
"``cache_read_input_tokens: 280`` observed). "
|
|
132
|
+
"``None`` (default) = no opinion → the v0.5-B gate falls back "
|
|
133
|
+
"to ``provider.kind == 'anthropic'`` for the answer."
|
|
134
|
+
),
|
|
135
|
+
)
|
|
118
136
|
|
|
119
137
|
|
|
120
138
|
class CapabilityRule(BaseModel):
|
|
@@ -182,6 +200,7 @@ class ResolvedCapabilities:
|
|
|
182
200
|
tools: bool | None = None
|
|
183
201
|
max_context_tokens: int | None = None
|
|
184
202
|
claude_code_suitability: Literal["ok", "degraded"] | None = None
|
|
203
|
+
cache_control: bool | None = None
|
|
185
204
|
|
|
186
205
|
|
|
187
206
|
# ---------------------------------------------------------------------------
|
|
@@ -233,12 +252,14 @@ class CapabilityRegistry:
|
|
|
233
252
|
resolved_tools: bool | None = None
|
|
234
253
|
resolved_max_ctx: int | None = None
|
|
235
254
|
resolved_suitability: Literal["ok", "degraded"] | None = None
|
|
255
|
+
resolved_cache_control: bool | None = None
|
|
236
256
|
|
|
237
257
|
thinking_locked = False
|
|
238
258
|
reasoning_locked = False
|
|
239
259
|
tools_locked = False
|
|
240
260
|
max_ctx_locked = False
|
|
241
261
|
suitability_locked = False
|
|
262
|
+
cache_control_locked = False
|
|
242
263
|
|
|
243
264
|
for rule in self._rules:
|
|
244
265
|
if not rule.kind_matches(kind):
|
|
@@ -261,12 +282,16 @@ class CapabilityRegistry:
|
|
|
261
282
|
if not suitability_locked and caps.claude_code_suitability is not None:
|
|
262
283
|
resolved_suitability = caps.claude_code_suitability
|
|
263
284
|
suitability_locked = True
|
|
285
|
+
if not cache_control_locked and caps.cache_control is not None:
|
|
286
|
+
resolved_cache_control = caps.cache_control
|
|
287
|
+
cache_control_locked = True
|
|
264
288
|
if (
|
|
265
289
|
thinking_locked
|
|
266
290
|
and reasoning_locked
|
|
267
291
|
and tools_locked
|
|
268
292
|
and max_ctx_locked
|
|
269
293
|
and suitability_locked
|
|
294
|
+
and cache_control_locked
|
|
270
295
|
):
|
|
271
296
|
break
|
|
272
297
|
|
|
@@ -276,6 +301,7 @@ class CapabilityRegistry:
|
|
|
276
301
|
tools=resolved_tools,
|
|
277
302
|
max_context_tokens=resolved_max_ctx,
|
|
278
303
|
claude_code_suitability=resolved_suitability,
|
|
304
|
+
cache_control=resolved_cache_control,
|
|
279
305
|
)
|
|
280
306
|
|
|
281
307
|
# ------------------------------------------------------------------
|
coderouter/config/schemas.py
CHANGED
|
@@ -49,6 +49,108 @@ class Capabilities(BaseModel):
|
|
|
49
49
|
openai_compatible: bool = True
|
|
50
50
|
|
|
51
51
|
|
|
52
|
+
class CostConfig(BaseModel):
    """v1.9-D: per-provider unit pricing for cost aggregation.

    All fields are optional. When :attr:`ProviderConfig.cost` is unset,
    the provider contributes zero to the cost dashboard but still
    appears in token-count totals — same shape as a free local model.

    Pricing model
    -------------

    Anthropic's prompt-cache pricing (verified 2026-04 docs.anthropic.com):

    * Normal input  : 1.0x ``input_tokens_per_million``
    * Normal output : 1.0x ``output_tokens_per_million``
    * Cache read    : ``cache_read_discount`` x normal input
    * Cache creation: ``cache_creation_premium`` x normal input

    The 4-class breakdown (cache_hit / cache_creation / no_cache /
    unknown) recorded by v1.9-A's ``cache-observed`` log lets the
    cost aggregator apply the right multiplier per token, and the
    "savings" figure in the dashboard is computed as
    ``cache_read_input_tokens x normal x (1 - cache_read_discount)``
    — i.e. what the operator *would have* paid without prompt
    caching.

    LiteLLM's cost tracker (verified 2026-04) does not implement
    cache-aware breakdown; it bills ``cache_read_input_tokens`` at
    full input rate, overstating spend on cache-heavy workloads. The
    CodeRouter dashboard's selling point is correctness here.
    """

    # Unknown keys are rejected at validation time (pydantic ``extra="forbid"``),
    # so a typo'd pricing key fails config load instead of being ignored.
    model_config = ConfigDict(extra="forbid")

    # USD per 1M fresh (uncached) input tokens; must be >= 0 when set.
    # This is also the base rate that both cache multipliers below scale.
    input_tokens_per_million: float | None = Field(
        default=None,
        ge=0.0,
        description=(
            "USD per million input tokens at normal (uncached) rate. "
            "Anthropic Sonnet 4.x is around 3.00, Opus 4.x around 15.00 "
            "(check the upstream's pricing page — values change)."
        ),
    )
    # USD per 1M completion tokens; must be >= 0 when set.
    output_tokens_per_million: float | None = Field(
        default=None,
        ge=0.0,
        description=(
            "USD per million output tokens. Output is invariably the "
            "expensive side of the meter — for coding workloads with "
            "large completions this dominates the bill."
        ),
    )
    # Fraction of the input rate charged for cache reads, constrained to
    # [0.0, 1.0]. It is a multiplier, not "percent off": 0.10 means reads
    # are billed at 10% of the normal input rate.
    cache_read_discount: float = Field(
        default=0.10,
        ge=0.0,
        le=1.0,
        description=(
            "Multiplier applied to ``input_tokens_per_million`` for "
            "tokens served from prompt cache. Anthropic's 2026-04 "
            "pricing is 0.10 (i.e. cache reads are billed at 10% of "
            "normal input rate). LM Studio /v1/messages locally "
            "honors the cache_read field but local backends usually "
            "have ``input_tokens_per_million`` of 0.0, so this field "
            "is moot there."
        ),
    )
    # Multiplier on the input rate for cache writes. Only a lower bound
    # (>= 0) is enforced, so values below 1.0 (a creation discount) validate.
    cache_creation_premium: float = Field(
        default=1.25,
        ge=0.0,
        description=(
            "Multiplier applied to ``input_tokens_per_million`` for "
            "tokens *written* to the prompt cache on the first hit. "
            "Anthropic's 2026-04 pricing is 1.25 (cache writes cost "
            "25% more than normal input on the writeback call; "
            "subsequent reads then cost ``cache_read_discount`` x, "
            "amortizing the writeback). Above 1.0 means premium, "
            "1.0 = no premium, below 1.0 = discount on creation "
            "(unusual but theoretically supported by the schema)."
        ),
    )
    # Optional per-provider monthly spend cap in USD; must be >= 0 when set.
    # ``None`` (the default) means no cap. The skip / reset behavior described
    # below is implemented outside this class — this field only carries the limit.
    monthly_budget_usd: float | None = Field(
        default=None,
        ge=0.0,
        description=(
            "v1.10 (LiteLLM 由来 / v1.9-D の累積版): per-provider "
            "monthly USD spend cap. When set, the engine's chain "
            "resolver skips this provider and emits "
            "``skip-budget-exceeded`` once the running per-provider "
            "total for the current calendar month (UTC) reaches or "
            "exceeds this value. Unset (None) = no cap (default). "
            "\n\n"
            "Reset semantics: in-memory only — running totals zero "
            "out on process restart and on UTC calendar-month "
            "rollover. Operators who need durable budget state "
            "across restarts should pair this with external "
            "monitoring on the cost dashboard's ``cost_total_usd`` "
            "panel; persistent budget state is out of scope for "
            "v1.10 (no on-disk store, no Redis, etc., per the "
            "5-deps invariant in plan.md §5.4)."
        ),
    )
|
|
152
|
+
|
|
153
|
+
|
|
52
154
|
class ProviderConfig(BaseModel):
|
|
53
155
|
"""A single provider entry from providers.yaml.
|
|
54
156
|
|
|
@@ -116,6 +218,19 @@ class ProviderConfig(BaseModel):
|
|
|
116
218
|
|
|
117
219
|
capabilities: Capabilities = Field(default_factory=Capabilities)
|
|
118
220
|
|
|
221
|
+
cost: CostConfig | None = Field(
|
|
222
|
+
default=None,
|
|
223
|
+
description=(
|
|
224
|
+
"v1.9-D: per-provider unit pricing for cost aggregation. "
|
|
225
|
+
"Unset = provider contributes zero to the cost dashboard "
|
|
226
|
+
"(typical for local models). Set on paid endpoints to "
|
|
227
|
+
"feed the ``/dashboard`` cost panel and the "
|
|
228
|
+
"``coderouter stats --cost`` TUI summary. Cache-aware "
|
|
229
|
+
"calculation differentiates cache_read (90% discount on "
|
|
230
|
+
"Anthropic) from normal input — see :class:`CostConfig`."
|
|
231
|
+
),
|
|
232
|
+
)
|
|
233
|
+
|
|
119
234
|
@model_validator(mode="after")
|
|
120
235
|
def _check_output_filters_known(self) -> ProviderConfig:
|
|
121
236
|
"""v1.0-A: fail at config-load on a typo'd filter name.
|
|
@@ -173,6 +288,162 @@ class FallbackChain(BaseModel):
|
|
|
173
288
|
"for this profile."
|
|
174
289
|
),
|
|
175
290
|
)
|
|
291
|
+
# v1.9-E (L3): tool-loop detection guard.
|
|
292
|
+
#
|
|
293
|
+
# Long-running agent loops can fall into "tool stuck" states where
|
|
294
|
+
# the assistant repeatedly calls the same tool with identical args
|
|
295
|
+
# because it can't make progress. The guard inspects the assistant
|
|
296
|
+
# tool_use history in the inbound request and, when the same call
|
|
297
|
+
# repeats above the threshold, takes the configured action.
|
|
298
|
+
#
|
|
299
|
+
# Three actions trade off intervention against UX disruption:
|
|
300
|
+
# * ``warn`` — emit a structured ``tool-loop-detected`` log only.
|
|
301
|
+
# Diagnostic; default for v1.9-E.
|
|
302
|
+
# * ``inject`` — append a system message reminder ("you appear to
|
|
303
|
+
# be looping, try a different approach") so the
|
|
304
|
+
# next assistant turn has a chance to course-correct.
|
|
305
|
+
# * ``break`` — short-circuit the request with an error response.
|
|
306
|
+
# Use when downstream cost / context exhaustion is
|
|
307
|
+
# worse than telling the agent to stop.
|
|
308
|
+
tool_loop_window: int = Field(
|
|
309
|
+
default=5,
|
|
310
|
+
ge=2,
|
|
311
|
+
le=50,
|
|
312
|
+
description=(
|
|
313
|
+
"v1.9-E (L3): how many of the most recent assistant tool_use "
|
|
314
|
+
"blocks to inspect for a loop. Default 5 covers the typical "
|
|
315
|
+
"Claude Code agent step depth without false-positiving on "
|
|
316
|
+
"legitimate same-tool repetition (e.g. iterating Read on "
|
|
317
|
+
"different files)."
|
|
318
|
+
),
|
|
319
|
+
)
|
|
320
|
+
tool_loop_threshold: int = Field(
|
|
321
|
+
default=3,
|
|
322
|
+
ge=2,
|
|
323
|
+
le=50,
|
|
324
|
+
description=(
|
|
325
|
+
"v1.9-E (L3): how many *consecutive identical* tool calls "
|
|
326
|
+
"(same name + same args) trigger a loop verdict. Default 3 "
|
|
327
|
+
"catches the most common stuck patterns (Read same file 3x, "
|
|
328
|
+
"Bash same command 3x) while leaving headroom for "
|
|
329
|
+
"intentional repetition with intermediate observations."
|
|
330
|
+
),
|
|
331
|
+
)
|
|
332
|
+
tool_loop_action: Literal["warn", "inject", "break"] = Field(
|
|
333
|
+
default="warn",
|
|
334
|
+
description=(
|
|
335
|
+
"v1.9-E (L3): action when a loop is detected. ``warn`` (default) "
|
|
336
|
+
"emits a log line only; ``inject`` adds a ``you-are-looping`` "
|
|
337
|
+
"system message reminder to the request; ``break`` returns an "
|
|
338
|
+
"error response. See FallbackChain comment for trade-offs."
|
|
339
|
+
),
|
|
340
|
+
)
|
|
341
|
+
# v1.9-E phase 2 (L2): memory-pressure detection + cooldown.
|
|
342
|
+
#
|
|
343
|
+
# Local backends (Ollama / LM Studio / llama.cpp) report VRAM
|
|
344
|
+
# exhaustion via 5xx responses with bodies like "out of memory" /
|
|
345
|
+
# "CUDA out of memory" / "insufficient memory". When the chain
|
|
346
|
+
# encounters one of these, marking the provider as "pressured"
|
|
347
|
+
# for a cooldown window prevents the engine from re-hammering the
|
|
348
|
+
# same exhausted backend on the very next request — the chain
|
|
349
|
+
# falls through to the next provider, which is typically a
|
|
350
|
+
# lighter-weight model or a remote fallback that has the headroom.
|
|
351
|
+
#
|
|
352
|
+
# Three actions trade off intervention against operator preference:
|
|
353
|
+
# * ``off`` — no detection / no logging / no skip. Identical to v1.9.x behavior (not the default).
|
|
354
|
+
# * ``warn`` — emit ``memory-pressure-detected`` log when an OOM
|
|
355
|
+
# error is observed; do not skip on subsequent calls.
|
|
356
|
+
# * ``skip`` — ``warn`` + put the provider in a cooldown window;
|
|
357
|
+
# subsequent chain resolves filter it out and emit
|
|
358
|
+
# ``skip-memory-pressure`` until the cooldown expires.
|
|
359
|
+
memory_pressure_action: Literal["off", "warn", "skip"] = Field(
|
|
360
|
+
default="warn",
|
|
361
|
+
description=(
|
|
362
|
+
"v1.9-E (L2 phase 2): action on observed backend OOM "
|
|
363
|
+
"(provider failure with an out-of-memory error body). "
|
|
364
|
+
"``warn`` (default) logs only — diagnostic, no chain "
|
|
365
|
+
"behavior change. ``skip`` enters a cooldown window so "
|
|
366
|
+
"the next request's chain resolver filters the pressured "
|
|
367
|
+
"provider out and falls through to the next entry. "
|
|
368
|
+
"``off`` disables the detector entirely (zero "
|
|
369
|
+
"observation overhead, identical to v1.9.x behavior)."
|
|
370
|
+
),
|
|
371
|
+
)
|
|
372
|
+
memory_pressure_cooldown_s: int = Field(
|
|
373
|
+
default=120,
|
|
374
|
+
ge=10,
|
|
375
|
+
le=3600,
|
|
376
|
+
description=(
|
|
377
|
+
"v1.9-E (L2 phase 2): cooldown window in seconds applied "
|
|
378
|
+
"after an OOM detection when ``memory_pressure_action`` "
|
|
379
|
+
"is ``skip``. Default 120 s gives the local backend "
|
|
380
|
+
"enough time to release model state from VRAM before the "
|
|
381
|
+
"engine re-attempts. Capped at 3600 s (1 hour) — anything "
|
|
382
|
+
"longer is better expressed as marking the provider "
|
|
383
|
+
"``paid: true`` and bouncing the process."
|
|
384
|
+
),
|
|
385
|
+
)
|
|
386
|
+
# v1.9-E phase 2 (L5): backend health monitoring (passive).
|
|
387
|
+
#
|
|
388
|
+
# A consecutive-failure state machine per provider:
|
|
389
|
+
# * HEALTHY — no recent failures (initial state).
|
|
390
|
+
# * DEGRADED — ``backend_health_threshold`` consecutive failures
|
|
391
|
+
# observed; the provider has lost its "fresh" status
|
|
392
|
+
# but is still attempted in chain order.
|
|
393
|
+
# * UNHEALTHY — ``2 x backend_health_threshold`` consecutive
|
|
394
|
+
# failures; depending on the action, the provider
|
|
395
|
+
# is either left in place (``warn``) or demoted to chain end (``demote``).
|
|
396
|
+
# A single success on ``provider-ok`` resets the counter and the
|
|
397
|
+
# state to HEALTHY immediately — no rolling window, no debounce.
|
|
398
|
+
# Distinct from the v1.9-C ``adaptive`` gradient (continuous
|
|
399
|
+
# latency / error-rate buffer with debounce) which handles the
|
|
400
|
+
# "slow but alive" case; L5 handles the "hard crash" case.
|
|
401
|
+
backend_health_action: Literal["off", "warn", "demote"] = Field(
|
|
402
|
+
default="warn",
|
|
403
|
+
description=(
|
|
404
|
+
"v1.9-E (L5 phase 2): action when a provider transitions "
|
|
405
|
+
"to UNHEALTHY (consecutive failures crossed the threshold). "
|
|
406
|
+
"``warn`` (default) emits a state-change log line only — "
|
|
407
|
+
"diagnostic, no chain reorder. ``demote`` additionally "
|
|
408
|
+
"moves the UNHEALTHY provider to the back of the chain "
|
|
409
|
+
"for the next ``_resolve_chain`` (similar to v1.9-C "
|
|
410
|
+
"adaptive demotion but state-machine-based, not "
|
|
411
|
+
"rolling-window-based). ``off`` disables the monitor "
|
|
412
|
+
"entirely (zero observation overhead, identical to "
|
|
413
|
+
"v1.9.x behavior)."
|
|
414
|
+
),
|
|
415
|
+
)
|
|
416
|
+
backend_health_threshold: int = Field(
|
|
417
|
+
default=3,
|
|
418
|
+
ge=2,
|
|
419
|
+
le=20,
|
|
420
|
+
description=(
|
|
421
|
+
"v1.9-E (L5 phase 2): consecutive-failure count that "
|
|
422
|
+
"triggers the HEALTHY → DEGRADED transition. The "
|
|
423
|
+
"DEGRADED → UNHEALTHY transition fires at ``2x`` this "
|
|
424
|
+
"value. Default 3 catches "
|
|
425
|
+
"Ollama / LM Studio crashes (which produce a deterministic "
|
|
426
|
+
"5xx pattern on every retry) without flapping on transient "
|
|
427
|
+
"blips that the v1.9-C adaptive adjuster already handles."
|
|
428
|
+
),
|
|
429
|
+
)
|
|
430
|
+
adaptive: bool = Field(
|
|
431
|
+
default=False,
|
|
432
|
+
description=(
|
|
433
|
+
"v1.9-C: enable health-based dynamic chain reordering for "
|
|
434
|
+
"this profile. When True, the engine consults its "
|
|
435
|
+
"AdaptiveAdjuster and may demote providers whose rolling-"
|
|
436
|
+
"window median latency or error rate exceeds the configured "
|
|
437
|
+
"thresholds (1.5x global median / 10% errors). Demotions are "
|
|
438
|
+
"debounced (30 s minimum between rank changes per provider) "
|
|
439
|
+
"so a transient blip cannot oscillate the chain. When False "
|
|
440
|
+
"(default), the static ``providers`` order is honored "
|
|
441
|
+
"verbatim — no observation overhead. Orthogonal to L5 "
|
|
442
|
+
"(binary HEALTHY/UNHEALTHY backend swap, planned for "
|
|
443
|
+
"v1.9-E phase 3): C handles the gradient case during normal "
|
|
444
|
+
"operation, L5 handles hard crashes."
|
|
445
|
+
),
|
|
446
|
+
)
|
|
176
447
|
|
|
177
448
|
|
|
178
449
|
# ---------------------------------------------------------------------------
|
|
@@ -197,6 +468,36 @@ class RuleMatcher(BaseModel):
|
|
|
197
468
|
- ``content_contains: "foo"`` — substring match (case-sensitive).
|
|
198
469
|
- ``content_regex: r"..."`` — Python ``re.search``; compiled at
|
|
199
470
|
model-construction time so typos fail startup.
|
|
471
|
+
|
|
472
|
+
Variants ([Unreleased] / per-model auto-routing, free-claude-code 由来):
|
|
473
|
+
|
|
474
|
+
- ``model_pattern: r"claude-3-5-haiku.*"`` — Python ``re.fullmatch``
|
|
475
|
+
against the request body's ``model`` field. Lets clients route on
|
|
476
|
+
the model identifier the agent (Claude Code / Cursor) sent
|
|
477
|
+
(Opus / Sonnet / Haiku → different profiles) without needing an
|
|
478
|
+
explicit ``profile`` field on the wire. Compiled at load like
|
|
479
|
+
``content_regex``. ``fullmatch`` semantics (vs ``search`` for
|
|
480
|
+
``content_regex``) because model identifiers are structured tokens
|
|
481
|
+
— users typically describe the whole identifier with a wildcard
|
|
482
|
+
tail, not an arbitrary substring.
|
|
483
|
+
|
|
484
|
+
Variants ([Unreleased] / longContext auto-switch, claude-code-router
|
|
485
|
+
由来):
|
|
486
|
+
|
|
487
|
+
- ``content_token_count_min: 32000`` — char-count ÷ 4 heuristic
|
|
488
|
+
across **all** messages in the request body (not just the
|
|
489
|
+
latest user message — this matcher describes the request's
|
|
490
|
+
overall size). When the estimated token count is ``>=`` the
|
|
491
|
+
threshold, route to a long-context profile (typically pointing
|
|
492
|
+
at Gemini Flash 1M ctx, Haiku 200K, etc.). Distinct from the
|
|
493
|
+
other content matchers which operate on the latest user
|
|
494
|
+
message only — context-window pressure is a request-shape
|
|
495
|
+
property, not a per-turn property. The estimator deliberately
|
|
496
|
+
avoids tiktoken / SentencePiece (forbidden by the 5-deps
|
|
497
|
+
invariant in plan.md §5.4); operators with non-English-heavy
|
|
498
|
+
workloads can compensate by tuning the threshold, since the
|
|
499
|
+
char/4 heuristic is conservative for CJK and looser for
|
|
500
|
+
English code.
|
|
200
501
|
"""
|
|
201
502
|
|
|
202
503
|
model_config = ConfigDict(extra="forbid")
|
|
@@ -205,12 +506,16 @@ class RuleMatcher(BaseModel):
|
|
|
205
506
|
code_fence_ratio_min: float | None = Field(default=None, ge=0.0, le=1.0)
|
|
206
507
|
content_contains: str | None = None
|
|
207
508
|
content_regex: str | None = None
|
|
509
|
+
model_pattern: str | None = None
|
|
510
|
+
content_token_count_min: int | None = Field(default=None, ge=1)
|
|
208
511
|
|
|
209
512
|
_MATCHER_FIELDS: tuple[str, ...] = (
|
|
210
513
|
"has_image",
|
|
211
514
|
"code_fence_ratio_min",
|
|
212
515
|
"content_contains",
|
|
213
516
|
"content_regex",
|
|
517
|
+
"model_pattern",
|
|
518
|
+
"content_token_count_min",
|
|
214
519
|
)
|
|
215
520
|
|
|
216
521
|
@model_validator(mode="after")
|
|
@@ -227,7 +532,9 @@ class RuleMatcher(BaseModel):
|
|
|
227
532
|
|
|
228
533
|
@model_validator(mode="after")
|
|
229
534
|
def _compile_regex_eagerly(self) -> Self:
|
|
230
|
-
"""Compile ``content_regex`` at load so bad
|
|
535
|
+
"""Compile ``content_regex`` / ``model_pattern`` at load so bad
|
|
536
|
+
patterns fail startup rather than at first request.
|
|
537
|
+
"""
|
|
231
538
|
if self.content_regex is not None:
|
|
232
539
|
try:
|
|
233
540
|
re.compile(self.content_regex)
|
|
@@ -235,6 +542,13 @@ class RuleMatcher(BaseModel):
|
|
|
235
542
|
raise ValueError(
|
|
236
543
|
f"Invalid regex for content_regex {self.content_regex!r}: {exc}"
|
|
237
544
|
) from exc
|
|
545
|
+
if self.model_pattern is not None:
|
|
546
|
+
try:
|
|
547
|
+
re.compile(self.model_pattern)
|
|
548
|
+
except re.error as exc:
|
|
549
|
+
raise ValueError(
|
|
550
|
+
f"Invalid regex for model_pattern {self.model_pattern!r}: {exc}"
|
|
551
|
+
) from exc
|
|
238
552
|
return self
|
|
239
553
|
|
|
240
554
|
|
coderouter/cost.py
ADDED
|
@@ -0,0 +1,154 @@
|
|
|
1
|
+
"""Cost calculation utilities (v1.9-D Cost-aware Dashboard).
|
|
2
|
+
|
|
3
|
+
Pure functions for translating per-request token counts into USD
|
|
4
|
+
spend, accounting for Anthropic's prompt-cache pricing model
|
|
5
|
+
(``cache_read`` at 10% of normal input, ``cache_creation`` at 125%).
|
|
6
|
+
|
|
7
|
+
Where this fits
|
|
8
|
+
===============
|
|
9
|
+
|
|
10
|
+
The engine's ``_emit_cache_observed`` site (v1.9-A) calls
|
|
11
|
+
:func:`compute_cost_for_attempt` to enrich the ``cache-observed``
|
|
12
|
+
log line with ``cost_usd`` + ``cost_savings_usd`` fields. The
|
|
13
|
+
MetricsCollector then aggregates per-provider totals over the
|
|
14
|
+
process lifetime, and the dashboard / ``coderouter stats --cost``
|
|
15
|
+
TUI render those aggregates.
|
|
16
|
+
|
|
17
|
+
Why a separate module
|
|
18
|
+
=====================
|
|
19
|
+
|
|
20
|
+
Pricing math is small, pure, and shared by:
|
|
21
|
+
|
|
22
|
+
* the engine's per-request cost calc
|
|
23
|
+
* the collector's snapshot rendering (recomputes a "what-if no
|
|
24
|
+
cache" total for the savings panel)
|
|
25
|
+
* the future ``coderouter stats --cost`` CLI
|
|
26
|
+
|
|
27
|
+
Keeping it as a leaf module with no engine / collector imports
|
|
28
|
+
prevents circular dependencies and makes the pricing semantics
|
|
29
|
+
trivially testable in isolation.
|
|
30
|
+
|
|
31
|
+
Anthropic pricing reference (verified 2026-04)
|
|
32
|
+
==============================================
|
|
33
|
+
|
|
34
|
+
For Sonnet / Opus / Haiku 4.x:
|
|
35
|
+
|
|
36
|
+
* Normal input : ``input_tokens_per_million`` x 1.0
|
|
37
|
+
* Cache read : ``input_tokens_per_million`` x 0.10
|
|
38
|
+
* Cache creation: ``input_tokens_per_million`` x 1.25
|
|
39
|
+
* Normal output : ``output_tokens_per_million`` x 1.0
|
|
40
|
+
|
|
41
|
+
Tokens reported by the upstream:
|
|
42
|
+
|
|
43
|
+
* ``input_tokens`` — "fresh" input (cache reads / writes are
|
|
44
|
+
excluded from this count and reported via the cache fields).
|
|
45
|
+
* ``cache_read_input_tokens`` — served from prompt cache.
|
|
46
|
+
* ``cache_creation_input_tokens`` — written to prompt cache.
|
|
47
|
+
* ``output_tokens`` — completion.
|
|
48
|
+
|
|
49
|
+
So a single response's billable cost is the sum of the four buckets
|
|
50
|
+
billed at their respective rates. The "savings" figure is the
|
|
51
|
+
counterfactual: what the operator *would have* paid without prompt
|
|
52
|
+
caching, so it focuses on the cache_read tokens (those are the
|
|
53
|
+
ones that got the 90% discount). cache_creation is a premium, not
|
|
54
|
+
a savings, so it doesn't enter the savings figure even though it's
|
|
55
|
+
in the cost calc.
|
|
56
|
+
"""
|
|
57
|
+
|
|
58
|
+
from __future__ import annotations
|
|
59
|
+
|
|
60
|
+
from dataclasses import dataclass
|
|
61
|
+
|
|
62
|
+
from coderouter.config.schemas import CostConfig
|
|
63
|
+
|
|
64
|
+
|
|
65
|
+
@dataclass(frozen=True)
class CostBreakdown:
    """Per-attempt cost components, all in USD.

    All fields default to 0.0 so a free / unconfigured provider
    yields a zero breakdown without callers having to special-case
    None.

    Fields
        total_usd: full cost charged for this attempt (sum of the
            four token buckets at their respective rates).
        savings_usd: hypothetical "no-cache" delta — what the
            operator *would have* paid for ``cache_read_input_tokens``
            at full input rate, minus what they actually paid at
            ``cache_read_discount`` rate. Always >= 0.
        input_usd / output_usd / cache_read_usd / cache_creation_usd:
            per-bucket breakdown for the dashboard's stacked bar
            chart. ``input_usd`` is "fresh input only" (does not
            include cache buckets); cache_read_usd / cache_creation_usd
            are the post-discount / post-premium values.
    """

    total_usd: float = 0.0
    savings_usd: float = 0.0
    input_usd: float = 0.0
    output_usd: float = 0.0
    cache_read_usd: float = 0.0
    cache_creation_usd: float = 0.0


_PER_MILLION: float = 1_000_000.0


def _non_negative(count: int | None) -> int:
    """Return ``count`` clamped to ``>= 0``, treating ``None`` as zero."""
    if count is None:
        return 0
    return max(count, 0)


def compute_cost_for_attempt(
    cost_config: CostConfig | None,
    *,
    input_tokens: int | None,
    output_tokens: int | None,
    cache_read_input_tokens: int | None,
    cache_creation_input_tokens: int | None,
) -> CostBreakdown:
    """Translate per-attempt token counts into a USD :class:`CostBreakdown`.

    Args:
        cost_config: the provider's pricing, or ``None`` when the
            provider declares no pricing (typical for local models).
        input_tokens: fresh (uncached) input tokens.
        output_tokens: completion tokens.
        cache_read_input_tokens: tokens served from the prompt cache.
        cache_creation_input_tokens: tokens written to the prompt cache.

    Returns:
        A :class:`CostBreakdown`. It is zero-filled when ``cost_config``
        is ``None``. A partial declaration is permitted: an unset
        ``input_tokens_per_million`` / ``output_tokens_per_million``
        is treated as a 0.0 rate, so only the buckets priced by the
        set fields contribute (all zero when both are unset).

    Missing (``None``), negative, or zero token counts are accepted
    and contribute zero cost — this defensive handling keeps a
    malformed or incomplete usage record from raising or from
    corrupting the aggregate counters downstream.
    """
    if cost_config is None:
        return CostBreakdown()

    # Convert "USD per million tokens" into "USD per token"; unset -> 0.0.
    input_rate = (cost_config.input_tokens_per_million or 0.0) / _PER_MILLION
    output_rate = (cost_config.output_tokens_per_million or 0.0) / _PER_MILLION

    safe_input = _non_negative(input_tokens)
    safe_output = _non_negative(output_tokens)
    safe_read = _non_negative(cache_read_input_tokens)
    safe_create = _non_negative(cache_creation_input_tokens)

    input_usd = safe_input * input_rate
    output_usd = safe_output * output_rate
    # Both cache buckets are priced off the *input* rate, scaled by
    # their respective multipliers.
    cache_read_usd = safe_read * input_rate * cost_config.cache_read_discount
    cache_creation_usd = safe_create * input_rate * cost_config.cache_creation_premium

    total_usd = input_usd + output_usd + cache_read_usd + cache_creation_usd

    # Savings = what the operator would have paid at full input rate
    # for the cache_read tokens, minus what they actually paid at
    # the discounted rate. cache_creation is a *premium* (not a
    # savings) so it doesn't enter the savings figure — including
    # it would let a cache miss show up as "negative savings" which
    # is semantically wrong and would confuse the dashboard.
    full_rate_for_cache_read = safe_read * input_rate
    savings_usd = full_rate_for_cache_read - cache_read_usd

    return CostBreakdown(
        total_usd=total_usd,
        # Clamp so a multiplier above 1.0 can never report negative savings.
        savings_usd=max(savings_usd, 0.0),
        input_usd=input_usd,
        output_usd=output_usd,
        cache_read_usd=cache_read_usd,
        cache_creation_usd=cache_creation_usd,
    )
|
|
@@ -90,6 +90,61 @@ rules:
|
|
|
90
90
|
capabilities:
|
|
91
91
|
thinking: true
|
|
92
92
|
|
|
93
|
+
# ------------------------------------------------------------------
|
|
94
|
+
# Anthropic prompt caching — `cache_control` body field support (v1.9-B).
|
|
95
|
+
#
|
|
96
|
+
# Declares which (kind, model) pairs preserve the ``cache_control``
|
|
97
|
+
# marker end-to-end on the wire. The capability gate
|
|
98
|
+
# (``provider_supports_cache_control`` in coderouter/routing/capability.py)
|
|
99
|
+
# consults these declarations to decide whether to emit a
|
|
100
|
+
# ``capability-degraded reason=translation-lossy`` log when handing
|
|
101
|
+
# a cache_control-bearing request to the provider.
|
|
102
|
+
#
|
|
103
|
+
# Verification basis:
|
|
104
|
+
# * api.anthropic.com (kind=anthropic, claude-* models):
|
|
105
|
+
# verified live 2026-04-20 — 1321 tokens written on call 1,
|
|
106
|
+
# 1321 read on call 2. Native Anthropic shape passes through
|
|
107
|
+
# verbatim, no translation hop loses the marker.
|
|
108
|
+
# * LM Studio /v1/messages (kind=anthropic, qwen3.5-* / qwen3.6-*):
|
|
109
|
+
# verified live 2026-04-27 in v1.8.4 — `cache_read_input_tokens: 280`
|
|
110
|
+
# observed end-to-end through CodeRouter. LM Studio 0.4.12+ honors
|
|
111
|
+
# the marker on its Anthropic-compatible endpoint.
|
|
112
|
+
#
|
|
113
|
+
# All entries are kind=anthropic. openai_compat upstreams have no wire
|
|
114
|
+
# equivalent for cache_control (the OpenAI Chat Completions schema does
|
|
115
|
+
# not carry it), so they intentionally stay undeclared — the gate's
|
|
116
|
+
# fallback maps undeclared + kind=openai_compat to False.
|
|
117
|
+
# ------------------------------------------------------------------
|
|
118
|
+
|
|
119
|
+
- match: "claude-opus-4-*"
|
|
120
|
+
kind: anthropic
|
|
121
|
+
capabilities:
|
|
122
|
+
cache_control: true
|
|
123
|
+
|
|
124
|
+
- match: "claude-sonnet-4-*"
|
|
125
|
+
kind: anthropic
|
|
126
|
+
capabilities:
|
|
127
|
+
cache_control: true
|
|
128
|
+
|
|
129
|
+
- match: "claude-haiku-4-*"
|
|
130
|
+
kind: anthropic
|
|
131
|
+
capabilities:
|
|
132
|
+
cache_control: true
|
|
133
|
+
|
|
134
|
+
# LM Studio /v1/messages exposes Qwen3.5 / Qwen3.6 with Anthropic-shaped
|
|
135
|
+
# responses including ``cache_read_input_tokens``. The provider declares
|
|
136
|
+
# ``kind: anthropic`` even though the underlying model is open-weights;
|
|
137
|
+
# the registry rule keys off the model name pattern.
|
|
138
|
+
- match: "qwen3.5-*"
|
|
139
|
+
kind: anthropic
|
|
140
|
+
capabilities:
|
|
141
|
+
cache_control: true
|
|
142
|
+
|
|
143
|
+
- match: "qwen3.6-*"
|
|
144
|
+
kind: anthropic
|
|
145
|
+
capabilities:
|
|
146
|
+
cache_control: true
|
|
147
|
+
|
|
93
148
|
# ------------------------------------------------------------------
|
|
94
149
|
# Claude Code suitability — agentic harness compatibility hint (v1.7-B).
|
|
95
150
|
#
|