coderouter-cli 1.10.1__py3-none-any.whl → 2.1.0__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- coderouter/cli_stats.py +48 -1
- coderouter/config/schemas.py +189 -0
- coderouter/data/model-capabilities.yaml +79 -0
- coderouter/guards/context_budget.py +376 -0
- coderouter/guards/continuous_probe.py +349 -0
- coderouter/guards/drift_actions.py +111 -0
- coderouter/guards/drift_detection.py +308 -0
- coderouter/ingress/anthropic_routes.py +93 -12
- coderouter/ingress/app.py +39 -0
- coderouter/logging.py +351 -0
- coderouter/metrics/collector.py +142 -2
- coderouter/metrics/prometheus.py +212 -0
- coderouter/routing/adaptive.py +23 -0
- coderouter/routing/auto_router.py +2 -42
- coderouter/routing/fallback.py +481 -4
- coderouter/token_estimation.py +161 -0
- {coderouter_cli-1.10.1.dist-info → coderouter_cli-2.1.0.dist-info}/METADATA +11 -8
- {coderouter_cli-1.10.1.dist-info → coderouter_cli-2.1.0.dist-info}/RECORD +21 -16
- {coderouter_cli-1.10.1.dist-info → coderouter_cli-2.1.0.dist-info}/WHEEL +0 -0
- {coderouter_cli-1.10.1.dist-info → coderouter_cli-2.1.0.dist-info}/entry_points.txt +0 -0
- {coderouter_cli-1.10.1.dist-info → coderouter_cli-2.1.0.dist-info}/licenses/LICENSE +0 -0
coderouter/cli_stats.py
CHANGED
|
@@ -112,6 +112,10 @@ class GatesSummary:
|
|
|
112
112
|
degraded_breakdown: dict[str, int] # capability → count
|
|
113
113
|
filters_applied_total: int
|
|
114
114
|
filters_breakdown: dict[str, int] # filter name → count
|
|
115
|
+
# v2.0-F (L1): context budget guard summary
|
|
116
|
+
context_budget_warnings: int = 0
|
|
117
|
+
context_budget_trims: int = 0
|
|
118
|
+
context_budget_latest_ratio: dict[str, float] | None = None
|
|
115
119
|
|
|
116
120
|
|
|
117
121
|
@dataclass(frozen=True)
|
|
@@ -252,6 +256,8 @@ def build_gates_summary(snapshot: dict[str, Any]) -> GatesSummary:
|
|
|
252
256
|
)
|
|
253
257
|
degraded_breakdown = dict(counters.get("capability_degraded", {}) or {})
|
|
254
258
|
filters_breakdown = dict(counters.get("output_filter_applied", {}) or {})
|
|
259
|
+
# v2.0-F (L1): context budget guard counters
|
|
260
|
+
ctx_budget_latest = counters.get("context_budget_latest_ratio") or {}
|
|
255
261
|
return GatesSummary(
|
|
256
262
|
total_requests=total_requests,
|
|
257
263
|
total_failed=total_failed,
|
|
@@ -261,6 +267,13 @@ def build_gates_summary(snapshot: dict[str, Any]) -> GatesSummary:
|
|
|
261
267
|
degraded_breakdown=degraded_breakdown,
|
|
262
268
|
filters_applied_total=sum(filters_breakdown.values()),
|
|
263
269
|
filters_breakdown=filters_breakdown,
|
|
270
|
+
context_budget_warnings=int(
|
|
271
|
+
counters.get("context_budget_warnings_total", 0)
|
|
272
|
+
),
|
|
273
|
+
context_budget_trims=int(
|
|
274
|
+
counters.get("context_budget_trims_total", 0)
|
|
275
|
+
),
|
|
276
|
+
context_budget_latest_ratio=ctx_budget_latest if ctx_budget_latest else None,
|
|
264
277
|
)
|
|
265
278
|
|
|
266
279
|
|
|
@@ -397,6 +410,19 @@ def format_text(snapshot: dict[str, Any], *, width: int = 80) -> str:
|
|
|
397
410
|
else ""
|
|
398
411
|
)
|
|
399
412
|
)
|
|
413
|
+
# v2.0-F (L1): context budget guard stats
|
|
414
|
+
if gates.context_budget_warnings or gates.context_budget_trims:
|
|
415
|
+
ratio_str = ""
|
|
416
|
+
if gates.context_budget_latest_ratio:
|
|
417
|
+
top_profile = max(
|
|
418
|
+
gates.context_budget_latest_ratio,
|
|
419
|
+
key=gates.context_budget_latest_ratio.get, # type: ignore[arg-type]
|
|
420
|
+
)
|
|
421
|
+
ratio_str = f" (latest: {gates.context_budget_latest_ratio[top_profile]:.0%} {top_profile})"
|
|
422
|
+
lines.append(
|
|
423
|
+
f" context-budget warn: {gates.context_budget_warnings} "
|
|
424
|
+
f"trim: {gates.context_budget_trims}{ratio_str}"
|
|
425
|
+
)
|
|
400
426
|
lines.append("")
|
|
401
427
|
lines.append("Recent")
|
|
402
428
|
if not recent:
|
|
@@ -633,7 +659,28 @@ def _draw_frame( # pragma: no cover - curses-only
|
|
|
633
659
|
+ (f" ({_fmt_breakdown(gates.filters_breakdown)})" if gates.filters_breakdown else ""),
|
|
634
660
|
width,
|
|
635
661
|
)
|
|
636
|
-
row += 2
|
|
662
|
+
row += 1
|
|
663
|
+
# v2.0-F (L1): context budget guard line
|
|
664
|
+
if gates.context_budget_warnings or gates.context_budget_trims:
|
|
665
|
+
ratio_str = ""
|
|
666
|
+
if gates.context_budget_latest_ratio:
|
|
667
|
+
top_profile = max(
|
|
668
|
+
gates.context_budget_latest_ratio,
|
|
669
|
+
key=gates.context_budget_latest_ratio.get, # type: ignore[arg-type]
|
|
670
|
+
)
|
|
671
|
+
ratio_str = f" (latest: {gates.context_budget_latest_ratio[top_profile]:.0%} {top_profile})"
|
|
672
|
+
budget_line = (
|
|
673
|
+
f" context-budget warn: {gates.context_budget_warnings} "
|
|
674
|
+
f"trim: {gates.context_budget_trims}{ratio_str}"
|
|
675
|
+
)
|
|
676
|
+
budget_color = (
|
|
677
|
+
_COLOR_YELLOW_PAIR
|
|
678
|
+
if gates.context_budget_trims == 0
|
|
679
|
+
else _COLOR_RED_PAIR
|
|
680
|
+
)
|
|
681
|
+
stdscr.addnstr(row, 0, budget_line, width, int(curses.color_pair(budget_color)))
|
|
682
|
+
row += 1
|
|
683
|
+
row += 1
|
|
637
684
|
|
|
638
685
|
if row >= height - 2:
|
|
639
686
|
return
|
coderouter/config/schemas.py
CHANGED
|
@@ -230,6 +230,18 @@ class ProviderConfig(BaseModel):
|
|
|
230
230
|
"Anthropic) from normal input — see :class:`CostConfig`."
|
|
231
231
|
),
|
|
232
232
|
)
|
|
233
|
+
max_context_tokens: int | None = Field(
|
|
234
|
+
default=None,
|
|
235
|
+
ge=1,
|
|
236
|
+
description=(
|
|
237
|
+
"v2.0-F (L1): explicit declaration of this provider's "
|
|
238
|
+
"context window size in tokens. When set, takes precedence "
|
|
239
|
+
"over the ``model-capabilities.yaml`` registry lookup. "
|
|
240
|
+
"When both are unset, the context budget guard falls back "
|
|
241
|
+
"to 128000 (128K). Examples: Ollama Qwen3 32K → 32768, "
|
|
242
|
+
"LM Studio Qwen3.5 128K → 131072, Anthropic Claude → 200000."
|
|
243
|
+
),
|
|
244
|
+
)
|
|
233
245
|
|
|
234
246
|
@model_validator(mode="after")
|
|
235
247
|
def _check_output_filters_known(self) -> ProviderConfig:
|
|
@@ -444,6 +456,147 @@ class FallbackChain(BaseModel):
|
|
|
444
456
|
"operation, L5 handles hard crashes."
|
|
445
457
|
),
|
|
446
458
|
)
|
|
459
|
+
# v2.0-F (L1): context budget guard.
|
|
460
|
+
#
|
|
461
|
+
# Long-running agent sessions accumulate messages that eventually
|
|
462
|
+
# exceed the target model's context window. Without intervention,
|
|
463
|
+
# the backend returns a 400 (Anthropic) or silently truncates
|
|
464
|
+
# (Ollama), killing the agent session. The context budget guard
|
|
465
|
+
# estimates the request's token count (char/4 heuristic, shared
|
|
466
|
+
# with the auto_router longContext matcher) and compares it against
|
|
467
|
+
# the target provider's declared max_context_tokens.
|
|
468
|
+
#
|
|
469
|
+
# Three actions:
|
|
470
|
+
# * ``off`` — no detection, no logging. Backward-compat default.
|
|
471
|
+
# * ``warn`` — emit ``context-budget-warning`` log + attach
|
|
472
|
+
# ``X-CodeRouter-Context-Budget: warning`` response
|
|
473
|
+
# header. No request mutation.
|
|
474
|
+
# * ``trim`` — ``warn`` + remove oldest non-system messages until
|
|
475
|
+
# the estimated token count drops below
|
|
476
|
+
# ``context_budget_trim_target``. Recent messages
|
|
477
|
+
# (``context_budget_preserve_last_n``) are always
|
|
478
|
+
# kept, and tool_use / tool_result pairs are preserved
|
|
479
|
+
# atomically to avoid breaking agent loops.
|
|
480
|
+
context_budget_action: Literal["off", "warn", "trim"] = Field(
|
|
481
|
+
default="off",
|
|
482
|
+
description=(
|
|
483
|
+
"v2.0-F (L1): action when estimated request tokens approach "
|
|
484
|
+
"the target provider's context window. ``off`` (default) "
|
|
485
|
+
"disables the guard entirely. ``warn`` emits a log and "
|
|
486
|
+
"response header at ``context_budget_warn_threshold``. "
|
|
487
|
+
"``trim`` additionally removes old messages at "
|
|
488
|
+
"``context_budget_trim_threshold`` to reclaim context space."
|
|
489
|
+
),
|
|
490
|
+
)
|
|
491
|
+
context_budget_warn_threshold: float = Field(
|
|
492
|
+
default=0.80,
|
|
493
|
+
ge=0.1,
|
|
494
|
+
le=1.0,
|
|
495
|
+
description=(
|
|
496
|
+
"v2.0-F (L1): context usage ratio (estimated_tokens / "
|
|
497
|
+
"max_context_tokens) at which a warning is emitted. "
|
|
498
|
+
"Default 0.80 (80%) gives early notice before trim fires."
|
|
499
|
+
),
|
|
500
|
+
)
|
|
501
|
+
context_budget_trim_threshold: float = Field(
|
|
502
|
+
default=0.90,
|
|
503
|
+
ge=0.1,
|
|
504
|
+
le=1.0,
|
|
505
|
+
description=(
|
|
506
|
+
"v2.0-F (L1): context usage ratio at which trim fires "
|
|
507
|
+
"(only when ``context_budget_action`` is ``trim``). "
|
|
508
|
+
"Default 0.90 (90%) leaves a 10% margin for the backend's "
|
|
509
|
+
"own token counting to differ from the char/4 estimate."
|
|
510
|
+
),
|
|
511
|
+
)
|
|
512
|
+
context_budget_trim_target: float = Field(
|
|
513
|
+
default=0.75,
|
|
514
|
+
ge=0.1,
|
|
515
|
+
le=1.0,
|
|
516
|
+
description=(
|
|
517
|
+
"v2.0-F (L1): target context usage ratio after trim. "
|
|
518
|
+
"Messages are removed from the front until the estimate "
|
|
519
|
+
"drops below this ratio. Default 0.75 (75%) gives headroom "
|
|
520
|
+
"for several more turns before trim fires again."
|
|
521
|
+
),
|
|
522
|
+
)
|
|
523
|
+
context_budget_preserve_last_n: int = Field(
|
|
524
|
+
default=4,
|
|
525
|
+
ge=1,
|
|
526
|
+
le=100,
|
|
527
|
+
description=(
|
|
528
|
+
"v2.0-F (L1): minimum number of recent messages to always "
|
|
529
|
+
"preserve when trimming. Default 4 (2 user-assistant pairs) "
|
|
530
|
+
"keeps the agent's immediate working context intact."
|
|
531
|
+
),
|
|
532
|
+
)
|
|
533
|
+
|
|
534
|
+
# ------------------------------------------------------------------
|
|
535
|
+
# v2.0-G (L4): Drift detection — response quality degradation guard
|
|
536
|
+
# ------------------------------------------------------------------
|
|
537
|
+
#
|
|
538
|
+
# Long-running sessions on local LLMs can suffer gradual quality
|
|
539
|
+
# decay (KV cache pressure, thermal throttling, VRAM fragmentation)
|
|
540
|
+
# where the model "succeeds" but produces empty/short/toolless
|
|
541
|
+
# responses. This guard observes response quality signals in a
|
|
542
|
+
# rolling window and detects statistical drift.
|
|
543
|
+
#
|
|
544
|
+
# Four actions:
|
|
545
|
+
# * ``off`` — no detection (default).
|
|
546
|
+
# * ``warn`` — emit structured log + response header.
|
|
547
|
+
# * ``promote`` — ``warn`` + demote drifted provider in chain.
|
|
548
|
+
# * ``reload`` — ``promote`` + attempt KV cache flush (Ollama).
|
|
549
|
+
drift_detection_action: Literal["off", "warn", "promote", "reload"] = Field(
|
|
550
|
+
default="off",
|
|
551
|
+
description=(
|
|
552
|
+
"v2.0-G (L4): action on response quality drift detection. "
|
|
553
|
+
"``off`` (default) disables drift detection. ``warn`` emits "
|
|
554
|
+
"a log and response header. ``promote`` additionally demotes "
|
|
555
|
+
"the drifted provider in the chain. ``reload`` attempts to "
|
|
556
|
+
"flush the provider's KV cache (Ollama only) before promoting."
|
|
557
|
+
),
|
|
558
|
+
)
|
|
559
|
+
drift_detection_window_size: int = Field(
|
|
560
|
+
default=20,
|
|
561
|
+
ge=4,
|
|
562
|
+
le=200,
|
|
563
|
+
description=(
|
|
564
|
+
"v2.0-G (L4): number of recent responses to keep in the "
|
|
565
|
+
"rolling observation window per provider. Larger windows "
|
|
566
|
+
"are more robust to noise but slower to detect drift."
|
|
567
|
+
),
|
|
568
|
+
)
|
|
569
|
+
drift_detection_cooldown_s: int = Field(
|
|
570
|
+
default=300,
|
|
571
|
+
ge=10,
|
|
572
|
+
le=3600,
|
|
573
|
+
description=(
|
|
574
|
+
"v2.0-G (L4): seconds after a promote/reload action before "
|
|
575
|
+
"the drifted provider's rank is reset for recovery check. "
|
|
576
|
+
"Default 300s (5 min) gives the model time to stabilize."
|
|
577
|
+
),
|
|
578
|
+
)
|
|
579
|
+
drift_detection_sensitivity: Literal["low", "normal", "high"] = Field(
|
|
580
|
+
default="normal",
|
|
581
|
+
description=(
|
|
582
|
+
"v2.0-G (L4): threshold preset for drift signals. "
|
|
583
|
+
"``low`` tolerates more degradation before triggering, "
|
|
584
|
+
"``high`` is stricter (fewer bad responses needed)."
|
|
585
|
+
),
|
|
586
|
+
)
|
|
587
|
+
|
|
588
|
+
# --- v2.0-H (L6): Mid-stream partial stitching --------------------------
|
|
589
|
+
# * ``off`` — discard partial content on mid-stream failure (legacy).
|
|
590
|
+
# * ``surface`` — return partial content as a truncated-but-valid response.
|
|
591
|
+
partial_stitch_action: Literal["off", "surface"] = Field(
|
|
592
|
+
default="off",
|
|
593
|
+
description=(
|
|
594
|
+
"v2.0-H (L6): action when a streaming response fails mid-stream. "
|
|
595
|
+
"``off`` discards partial content (legacy error event). "
|
|
596
|
+
"``surface`` returns accumulated text as a graceful stream "
|
|
597
|
+
"termination with a ``coderouter_partial`` metadata event."
|
|
598
|
+
),
|
|
599
|
+
)
|
|
447
600
|
|
|
448
601
|
|
|
449
602
|
# ---------------------------------------------------------------------------
|
|
@@ -682,6 +835,42 @@ class CodeRouterConfig(BaseModel):
|
|
|
682
835
|
),
|
|
683
836
|
)
|
|
684
837
|
|
|
838
|
+
# v2.0-I: Continuous probing — background health checks for idle periods.
|
|
839
|
+
continuous_probe: Literal["off", "active"] = Field(
|
|
840
|
+
default="off",
|
|
841
|
+
description=(
|
|
842
|
+
"v2.0-I: enable background health probes. 'active' starts a "
|
|
843
|
+
"background task that periodically sends 1-token requests to "
|
|
844
|
+
"each provider, feeding results into the L5 backend health "
|
|
845
|
+
"state machine. 'off' = no probing (backward-compatible default)."
|
|
846
|
+
),
|
|
847
|
+
)
|
|
848
|
+
probe_interval_s: float = Field(
|
|
849
|
+
default=60.0,
|
|
850
|
+
ge=5.0,
|
|
851
|
+
le=3600.0,
|
|
852
|
+
description=(
|
|
853
|
+
"v2.0-I: seconds between probe rounds. Lower = faster detection "
|
|
854
|
+
"but more probe traffic. 60s is a good balance for local models."
|
|
855
|
+
),
|
|
856
|
+
)
|
|
857
|
+
probe_paid: bool = Field(
|
|
858
|
+
default=False,
|
|
859
|
+
description=(
|
|
860
|
+
"v2.0-I: whether to probe providers marked ``paid: true``. "
|
|
861
|
+
"Default false protects operators from accidental API charges."
|
|
862
|
+
),
|
|
863
|
+
)
|
|
864
|
+
probe_timeout_s: float = Field(
|
|
865
|
+
default=10.0,
|
|
866
|
+
ge=1.0,
|
|
867
|
+
le=60.0,
|
|
868
|
+
description=(
|
|
869
|
+
"v2.0-I: per-provider timeout for probe requests. A provider "
|
|
870
|
+
"that doesn't respond within this window is recorded as failed."
|
|
871
|
+
),
|
|
872
|
+
)
|
|
873
|
+
|
|
685
874
|
@model_validator(mode="after")
|
|
686
875
|
def _check_default_profile_exists(self) -> CodeRouterConfig:
|
|
687
876
|
"""v0.6-A: surface a typo'd ``default_profile`` at load time.
|
coderouter/data/model-capabilities.yaml
CHANGED
|
@@ -406,3 +406,82 @@ rules:
|
|
|
406
406
|
kind: openai_compat
|
|
407
407
|
capabilities:
|
|
408
408
|
tools: true
|
|
409
|
+
|
|
410
|
+
# ------------------------------------------------------------------
|
|
411
|
+
# Context window declarations — max_context_tokens (v2.0-F)
|
|
412
|
+
#
|
|
413
|
+
# Used by the context-budget guard (L1) to know when a conversation
|
|
414
|
+
# is approaching the model's context limit. These fall below the
|
|
415
|
+
# capability-specific rules above because first-match-per-flag means
|
|
416
|
+
# a model can declare both `thinking: true` and `max_context_tokens`
|
|
417
|
+
# from different rules — each flag resolves independently.
|
|
418
|
+
#
|
|
419
|
+
# Values are the *effective* context window the model reliably handles.
|
|
420
|
+
# For models with claimed but untested larger windows, the conservative
|
|
421
|
+
# value is declared. Operators can override via per-provider
|
|
422
|
+
# `max_context_tokens` in providers.yaml or user model-capabilities.yaml.
|
|
423
|
+
# ------------------------------------------------------------------
|
|
424
|
+
|
|
425
|
+
# Anthropic Claude — 200K context window (all 4.x families)
|
|
426
|
+
- match: "claude-*"
|
|
427
|
+
kind: anthropic
|
|
428
|
+
capabilities:
|
|
429
|
+
max_context_tokens: 200000
|
|
430
|
+
|
|
431
|
+
# Qwen3 (base, non-coder) — Ollama default: 32K context
|
|
432
|
+
- match: "qwen3:*"
|
|
433
|
+
capabilities:
|
|
434
|
+
max_context_tokens: 32768
|
|
435
|
+
|
|
436
|
+
# Qwen3-Coder — 256K declared, conservative 131K for GGUF quantized
|
|
437
|
+
- match: "qwen3-coder:*"
|
|
438
|
+
capabilities:
|
|
439
|
+
max_context_tokens: 131072
|
|
440
|
+
|
|
441
|
+
- match: "qwen/qwen3-coder-*"
|
|
442
|
+
capabilities:
|
|
443
|
+
max_context_tokens: 131072
|
|
444
|
+
|
|
445
|
+
# Qwen3.5 — 131K verified (LM Studio Anthropic endpoint)
|
|
446
|
+
- match: "qwen3.5*"
|
|
447
|
+
capabilities:
|
|
448
|
+
max_context_tokens: 131072
|
|
449
|
+
|
|
450
|
+
# Qwen3.6 — 256K declared, conservative 131K
|
|
451
|
+
- match: "qwen3.6*"
|
|
452
|
+
capabilities:
|
|
453
|
+
max_context_tokens: 131072
|
|
454
|
+
|
|
455
|
+
# Gemma 4 — 128K context (all variants)
|
|
456
|
+
- match: "gemma4:*"
|
|
457
|
+
capabilities:
|
|
458
|
+
max_context_tokens: 131072
|
|
459
|
+
|
|
460
|
+
- match: "google/gemma-4*"
|
|
461
|
+
capabilities:
|
|
462
|
+
max_context_tokens: 131072
|
|
463
|
+
|
|
464
|
+
# DeepSeek V3 — 128K context
|
|
465
|
+
- match: "deepseek*v3*"
|
|
466
|
+
capabilities:
|
|
467
|
+
max_context_tokens: 131072
|
|
468
|
+
|
|
469
|
+
# DeepSeek R1 — 128K context
|
|
470
|
+
- match: "deepseek*r1*"
|
|
471
|
+
capabilities:
|
|
472
|
+
max_context_tokens: 131072
|
|
473
|
+
|
|
474
|
+
# GPT-OSS — 131K context
|
|
475
|
+
- match: "*gpt-oss*"
|
|
476
|
+
capabilities:
|
|
477
|
+
max_context_tokens: 131072
|
|
478
|
+
|
|
479
|
+
# Devstral — 128K context (Mistral coding)
|
|
480
|
+
- match: "*devstral*"
|
|
481
|
+
capabilities:
|
|
482
|
+
max_context_tokens: 131072
|
|
483
|
+
|
|
484
|
+
# Kimi K2 — 128K context
|
|
485
|
+
- match: "*kimi-k2*"
|
|
486
|
+
capabilities:
|
|
487
|
+
max_context_tokens: 131072
|