coderouter-cli 1.10.1__py3-none-any.whl → 2.1.0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
coderouter/cli_stats.py CHANGED
@@ -112,6 +112,10 @@ class GatesSummary:
112
112
  degraded_breakdown: dict[str, int] # capability → count
113
113
  filters_applied_total: int
114
114
  filters_breakdown: dict[str, int] # filter name → count
115
+ # v2.0-F (L1): context budget guard summary
116
+ context_budget_warnings: int = 0
117
+ context_budget_trims: int = 0
118
+ context_budget_latest_ratio: dict[str, float] | None = None
115
119
 
116
120
 
117
121
  @dataclass(frozen=True)
@@ -252,6 +256,8 @@ def build_gates_summary(snapshot: dict[str, Any]) -> GatesSummary:
252
256
  )
253
257
  degraded_breakdown = dict(counters.get("capability_degraded", {}) or {})
254
258
  filters_breakdown = dict(counters.get("output_filter_applied", {}) or {})
259
+ # v2.0-F (L1): context budget guard counters
260
+ ctx_budget_latest = counters.get("context_budget_latest_ratio") or {}
255
261
  return GatesSummary(
256
262
  total_requests=total_requests,
257
263
  total_failed=total_failed,
@@ -261,6 +267,13 @@ def build_gates_summary(snapshot: dict[str, Any]) -> GatesSummary:
261
267
  degraded_breakdown=degraded_breakdown,
262
268
  filters_applied_total=sum(filters_breakdown.values()),
263
269
  filters_breakdown=filters_breakdown,
270
+ context_budget_warnings=int(
271
+ counters.get("context_budget_warnings_total", 0)
272
+ ),
273
+ context_budget_trims=int(
274
+ counters.get("context_budget_trims_total", 0)
275
+ ),
276
+ context_budget_latest_ratio=ctx_budget_latest if ctx_budget_latest else None,
264
277
  )
265
278
 
266
279
 
@@ -397,6 +410,19 @@ def format_text(snapshot: dict[str, Any], *, width: int = 80) -> str:
397
410
  else ""
398
411
  )
399
412
  )
413
+ # v2.0-F (L1): context budget guard stats
414
+ if gates.context_budget_warnings or gates.context_budget_trims:
415
+ ratio_str = ""
416
+ if gates.context_budget_latest_ratio:
417
+ top_profile = max(
418
+ gates.context_budget_latest_ratio,
419
+ key=gates.context_budget_latest_ratio.get, # type: ignore[arg-type]
420
+ )
421
+ ratio_str = f" (latest: {gates.context_budget_latest_ratio[top_profile]:.0%} {top_profile})"
422
+ lines.append(
423
+ f" context-budget warn: {gates.context_budget_warnings} "
424
+ f"trim: {gates.context_budget_trims}{ratio_str}"
425
+ )
400
426
  lines.append("")
401
427
  lines.append("Recent")
402
428
  if not recent:
@@ -633,7 +659,28 @@ def _draw_frame( # pragma: no cover - curses-only
633
659
  + (f" ({_fmt_breakdown(gates.filters_breakdown)})" if gates.filters_breakdown else ""),
634
660
  width,
635
661
  )
636
- row += 2
662
+ row += 1
663
+ # v2.0-F (L1): context budget guard line
664
+ if gates.context_budget_warnings or gates.context_budget_trims:
665
+ ratio_str = ""
666
+ if gates.context_budget_latest_ratio:
667
+ top_profile = max(
668
+ gates.context_budget_latest_ratio,
669
+ key=gates.context_budget_latest_ratio.get, # type: ignore[arg-type]
670
+ )
671
+ ratio_str = f" (latest: {gates.context_budget_latest_ratio[top_profile]:.0%} {top_profile})"
672
+ budget_line = (
673
+ f" context-budget warn: {gates.context_budget_warnings} "
674
+ f"trim: {gates.context_budget_trims}{ratio_str}"
675
+ )
676
+ budget_color = (
677
+ _COLOR_YELLOW_PAIR
678
+ if gates.context_budget_trims == 0
679
+ else _COLOR_RED_PAIR
680
+ )
681
+ stdscr.addnstr(row, 0, budget_line, width, int(curses.color_pair(budget_color)))
682
+ row += 1
683
+ row += 1
637
684
 
638
685
  if row >= height - 2:
639
686
  return
@@ -230,6 +230,18 @@ class ProviderConfig(BaseModel):
230
230
  "Anthropic) from normal input — see :class:`CostConfig`."
231
231
  ),
232
232
  )
233
+ max_context_tokens: int | None = Field(
234
+ default=None,
235
+ ge=1,
236
+ description=(
237
+ "v2.0-F (L1): explicit declaration of this provider's "
238
+ "context window size in tokens. When set, takes precedence "
239
+ "over the ``model-capabilities.yaml`` registry lookup. "
240
+ "When both are unset, the context budget guard falls back "
241
+ "to 128000 (128K). Examples: Ollama Qwen3 32K → 32768, "
242
+ "LM Studio Qwen3.5 128K → 131072, Anthropic Claude → 200000."
243
+ ),
244
+ )
233
245
 
234
246
  @model_validator(mode="after")
235
247
  def _check_output_filters_known(self) -> ProviderConfig:
@@ -444,6 +456,147 @@ class FallbackChain(BaseModel):
444
456
  "operation, L5 handles hard crashes."
445
457
  ),
446
458
  )
459
+ # v2.0-F (L1): context budget guard.
460
+ #
461
+ # Long-running agent sessions accumulate messages that eventually
462
+ # exceed the target model's context window. Without intervention,
463
+ # the backend returns a 400 (Anthropic) or silently truncates
464
+ # (Ollama), killing the agent session. The context budget guard
465
+ # estimates the request's token count (char/4 heuristic, shared
466
+ # with the auto_router longContext matcher) and compares it against
467
+ # the target provider's declared max_context_tokens.
468
+ #
469
+ # Three actions:
470
+ # * ``off`` — no detection, no logging. Backward-compat default.
471
+ # * ``warn`` — emit ``context-budget-warning`` log + attach
472
+ # ``X-CodeRouter-Context-Budget: warning`` response
473
+ # header. No request mutation.
474
+ # * ``trim`` — ``warn`` + remove oldest non-system messages until
475
+ # the estimated token count drops below
476
+ # ``context_budget_trim_target``. Recent messages
477
+ # (``context_budget_preserve_last_n``) are always
478
+ # kept, and tool_use / tool_result pairs are preserved
479
+ # atomically to avoid breaking agent loops.
480
+ context_budget_action: Literal["off", "warn", "trim"] = Field(
481
+ default="off",
482
+ description=(
483
+ "v2.0-F (L1): action when estimated request tokens approach "
484
+ "the target provider's context window. ``off`` (default) "
485
+ "disables the guard entirely. ``warn`` emits a log and "
486
+ "response header at ``context_budget_warn_threshold``. "
487
+ "``trim`` additionally removes old messages at "
488
+ "``context_budget_trim_threshold`` to reclaim context space."
489
+ ),
490
+ )
491
+ context_budget_warn_threshold: float = Field(
492
+ default=0.80,
493
+ ge=0.1,
494
+ le=1.0,
495
+ description=(
496
+ "v2.0-F (L1): context usage ratio (estimated_tokens / "
497
+ "max_context_tokens) at which a warning is emitted. "
498
+ "Default 0.80 (80%) gives early notice before trim fires."
499
+ ),
500
+ )
501
+ context_budget_trim_threshold: float = Field(
502
+ default=0.90,
503
+ ge=0.1,
504
+ le=1.0,
505
+ description=(
506
+ "v2.0-F (L1): context usage ratio at which trim fires "
507
+ "(only when ``context_budget_action`` is ``trim``). "
508
+ "Default 0.90 (90%) leaves a 10% margin for the backend's "
509
+ "own token counting to differ from the char/4 estimate."
510
+ ),
511
+ )
512
+ context_budget_trim_target: float = Field(
513
+ default=0.75,
514
+ ge=0.1,
515
+ le=1.0,
516
+ description=(
517
+ "v2.0-F (L1): target context usage ratio after trim. "
518
+ "Messages are removed from the front until the estimate "
519
+ "drops below this ratio. Default 0.75 (75%) gives headroom "
520
+ "for several more turns before trim fires again."
521
+ ),
522
+ )
523
+ context_budget_preserve_last_n: int = Field(
524
+ default=4,
525
+ ge=1,
526
+ le=100,
527
+ description=(
528
+ "v2.0-F (L1): minimum number of recent messages to always "
529
+ "preserve when trimming. Default 4 (2 user-assistant pairs) "
530
+ "keeps the agent's immediate working context intact."
531
+ ),
532
+ )
533
+
534
+ # ------------------------------------------------------------------
535
+ # v2.0-G (L4): Drift detection — response quality degradation guard
536
+ # ------------------------------------------------------------------
537
+ #
538
+ # Long-running sessions on local LLMs can suffer gradual quality
539
+ # decay (KV cache pressure, thermal throttling, VRAM fragmentation)
540
+ # where the model "succeeds" but produces empty/short/toolless
541
+ # responses. This guard observes response quality signals in a
542
+ # rolling window and detects statistical drift.
543
+ #
544
+ # Four actions:
545
+ # * ``off`` — no detection (default).
546
+ # * ``warn`` — emit structured log + response header.
547
+ # * ``promote`` — ``warn`` + demote drifted provider in chain.
548
+ # * ``reload`` — ``promote`` + attempt KV cache flush (Ollama).
549
+ drift_detection_action: Literal["off", "warn", "promote", "reload"] = Field(
550
+ default="off",
551
+ description=(
552
+ "v2.0-G (L4): action on response quality drift detection. "
553
+ "``off`` (default) disables drift detection. ``warn`` emits "
554
+ "a log and response header. ``promote`` additionally demotes "
555
+ "the drifted provider in the chain. ``reload`` attempts to "
556
+ "flush the provider's KV cache (Ollama only) before promoting."
557
+ ),
558
+ )
559
+ drift_detection_window_size: int = Field(
560
+ default=20,
561
+ ge=4,
562
+ le=200,
563
+ description=(
564
+ "v2.0-G (L4): number of recent responses to keep in the "
565
+ "rolling observation window per provider. Larger windows "
566
+ "are more robust to noise but slower to detect drift."
567
+ ),
568
+ )
569
+ drift_detection_cooldown_s: int = Field(
570
+ default=300,
571
+ ge=10,
572
+ le=3600,
573
+ description=(
574
+ "v2.0-G (L4): seconds after a promote/reload action before "
575
+ "the drifted provider's rank is reset for recovery check. "
576
+ "Default 300s (5 min) gives the model time to stabilize."
577
+ ),
578
+ )
579
+ drift_detection_sensitivity: Literal["low", "normal", "high"] = Field(
580
+ default="normal",
581
+ description=(
582
+ "v2.0-G (L4): threshold preset for drift signals. "
583
+ "``low`` tolerates more degradation before triggering, "
584
+ "``high`` is stricter (fewer bad responses needed)."
585
+ ),
586
+ )
587
+
588
+ # --- v2.0-H (L6): Mid-stream partial stitching --------------------------
589
+ # * ``off`` — discard partial content on mid-stream failure (legacy).
590
+ # * ``surface`` — return partial content as a truncated-but-valid response.
591
+ partial_stitch_action: Literal["off", "surface"] = Field(
592
+ default="off",
593
+ description=(
594
+ "v2.0-H (L6): action when a streaming response fails mid-stream. "
595
+ "``off`` discards partial content (legacy error event). "
596
+ "``surface`` returns accumulated text as a graceful stream "
597
+ "termination with a ``coderouter_partial`` metadata event."
598
+ ),
599
+ )
447
600
 
448
601
 
449
602
  # ---------------------------------------------------------------------------
@@ -682,6 +835,42 @@ class CodeRouterConfig(BaseModel):
682
835
  ),
683
836
  )
684
837
 
838
+ # v2.0-I: Continuous probing — background health checks for idle periods.
839
+ continuous_probe: Literal["off", "active"] = Field(
840
+ default="off",
841
+ description=(
842
+ "v2.0-I: enable background health probes. 'active' starts a "
843
+ "background task that periodically sends 1-token requests to "
844
+ "each provider, feeding results into the L5 backend health "
845
+ "state machine. 'off' = no probing (backward-compatible default)."
846
+ ),
847
+ )
848
+ probe_interval_s: float = Field(
849
+ default=60.0,
850
+ ge=5.0,
851
+ le=3600.0,
852
+ description=(
853
+ "v2.0-I: seconds between probe rounds. Lower = faster detection "
854
+ "but more probe traffic. 60s is a good balance for local models."
855
+ ),
856
+ )
857
+ probe_paid: bool = Field(
858
+ default=False,
859
+ description=(
860
+ "v2.0-I: whether to probe providers marked ``paid: true``. "
861
+ "Default false protects operators from accidental API charges."
862
+ ),
863
+ )
864
+ probe_timeout_s: float = Field(
865
+ default=10.0,
866
+ ge=1.0,
867
+ le=60.0,
868
+ description=(
869
+ "v2.0-I: per-provider timeout for probe requests. A provider "
870
+ "that doesn't respond within this window is recorded as failed."
871
+ ),
872
+ )
873
+
685
874
  @model_validator(mode="after")
686
875
  def _check_default_profile_exists(self) -> CodeRouterConfig:
687
876
  """v0.6-A: surface a typo'd ``default_profile`` at load time.
@@ -406,3 +406,82 @@ rules:
406
406
  kind: openai_compat
407
407
  capabilities:
408
408
  tools: true
409
+
410
+ # ------------------------------------------------------------------
411
+ # Context window declarations — max_context_tokens (v2.0-F)
412
+ #
413
+ # Used by the context-budget guard (L1) to know when a conversation
414
+ # is approaching the model's context limit. These rules are placed below the
415
+ # capability-specific rules above because first-match-per-flag means
416
+ # a model can declare both `thinking: true` and `max_context_tokens`
417
+ # from different rules — each flag resolves independently.
418
+ #
419
+ # Values are the *effective* context window the model reliably handles.
420
+ # For models with claimed but untested larger windows, the conservative
421
+ # value is declared. Operators can override via per-provider
422
+ # `max_context_tokens` in providers.yaml or user model-capabilities.yaml.
423
+ # ------------------------------------------------------------------
424
+
425
+ # Anthropic Claude — 200K context window (all 4.x families)
426
+ - match: "claude-*"
427
+ kind: anthropic
428
+ capabilities:
429
+ max_context_tokens: 200000
430
+
431
+ # Qwen3 (base, non-coder) — Ollama default: 32K context
432
+ - match: "qwen3:*"
433
+ capabilities:
434
+ max_context_tokens: 32768
435
+
436
+ # Qwen3-Coder — 256K declared, conservative 128K (131072 tokens) for GGUF quantized
437
+ - match: "qwen3-coder:*"
438
+ capabilities:
439
+ max_context_tokens: 131072
440
+
441
+ - match: "qwen/qwen3-coder-*"
442
+ capabilities:
443
+ max_context_tokens: 131072
444
+
445
+ # Qwen3.5 — 128K (131072 tokens) verified (LM Studio Anthropic endpoint)
446
+ - match: "qwen3.5*"
447
+ capabilities:
448
+ max_context_tokens: 131072
449
+
450
+ # Qwen3.6 — 256K declared, conservative 128K (131072 tokens)
451
+ - match: "qwen3.6*"
452
+ capabilities:
453
+ max_context_tokens: 131072
454
+
455
+ # Gemma 4 — 128K context (all variants)
456
+ - match: "gemma4:*"
457
+ capabilities:
458
+ max_context_tokens: 131072
459
+
460
+ - match: "google/gemma-4*"
461
+ capabilities:
462
+ max_context_tokens: 131072
463
+
464
+ # DeepSeek V3 — 128K context
465
+ - match: "deepseek*v3*"
466
+ capabilities:
467
+ max_context_tokens: 131072
468
+
469
+ # DeepSeek R1 — 128K context
470
+ - match: "deepseek*r1*"
471
+ capabilities:
472
+ max_context_tokens: 131072
473
+
474
+ # GPT-OSS — 128K (131072 tokens) context
475
+ - match: "*gpt-oss*"
476
+ capabilities:
477
+ max_context_tokens: 131072
478
+
479
+ # Devstral — 128K context (Mistral coding)
480
+ - match: "*devstral*"
481
+ capabilities:
482
+ max_context_tokens: 131072
483
+
484
+ # Kimi K2 — 128K context
485
+ - match: "*kimi-k2*"
486
+ capabilities:
487
+ max_context_tokens: 131072