PyPI - coderouter-cli - Versions diffs - 1.10.1__py3-none-any.whl → 2.1.0__py3-none-any.whl - Mend

coderouter-cli 1.10.1__py3-none-any.whl → 2.1.0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (21) hide show

coderouter/cli_stats.py +48 -1
coderouter/config/schemas.py +189 -0
coderouter/data/model-capabilities.yaml +79 -0
coderouter/guards/context_budget.py +376 -0
coderouter/guards/continuous_probe.py +349 -0
coderouter/guards/drift_actions.py +111 -0
coderouter/guards/drift_detection.py +308 -0
coderouter/ingress/anthropic_routes.py +93 -12
coderouter/ingress/app.py +39 -0
coderouter/logging.py +351 -0
coderouter/metrics/collector.py +142 -2
coderouter/metrics/prometheus.py +212 -0
coderouter/routing/adaptive.py +23 -0
coderouter/routing/auto_router.py +2 -42
coderouter/routing/fallback.py +481 -4
coderouter/token_estimation.py +161 -0
{coderouter_cli-1.10.1.dist-info → coderouter_cli-2.1.0.dist-info}/METADATA +11 -8
{coderouter_cli-1.10.1.dist-info → coderouter_cli-2.1.0.dist-info}/RECORD +21 -16
{coderouter_cli-1.10.1.dist-info → coderouter_cli-2.1.0.dist-info}/WHEEL +0 -0
{coderouter_cli-1.10.1.dist-info → coderouter_cli-2.1.0.dist-info}/entry_points.txt +0 -0
{coderouter_cli-1.10.1.dist-info → coderouter_cli-2.1.0.dist-info}/licenses/LICENSE +0 -0

coderouter/cli_stats.py CHANGED Viewed

@@ -112,6 +112,10 @@ class GatesSummary:
     degraded_breakdown: dict[str, int]  # capability → count
     filters_applied_total: int
     filters_breakdown: dict[str, int]  # filter name → count
+    # v2.0-F (L1): context budget guard summary
+    context_budget_warnings: int = 0
+    context_budget_trims: int = 0
+    context_budget_latest_ratio: dict[str, float] | None = None
 @dataclass(frozen=True)
@@ -252,6 +256,8 @@ def build_gates_summary(snapshot: dict[str, Any]) -> GatesSummary:
     )
     degraded_breakdown = dict(counters.get("capability_degraded", {}) or {})
     filters_breakdown = dict(counters.get("output_filter_applied", {}) or {})
+    # v2.0-F (L1): context budget guard counters
+    ctx_budget_latest = counters.get("context_budget_latest_ratio") or {}
     return GatesSummary(
         total_requests=total_requests,
         total_failed=total_failed,
@@ -261,6 +267,13 @@ def build_gates_summary(snapshot: dict[str, Any]) -> GatesSummary:
         degraded_breakdown=degraded_breakdown,
         filters_applied_total=sum(filters_breakdown.values()),
         filters_breakdown=filters_breakdown,
+        context_budget_warnings=int(
+            counters.get("context_budget_warnings_total", 0)
+        ),
+        context_budget_trims=int(
+            counters.get("context_budget_trims_total", 0)
+        ),
+        context_budget_latest_ratio=ctx_budget_latest if ctx_budget_latest else None,
     )
@@ -397,6 +410,19 @@ def format_text(snapshot: dict[str, Any], *, width: int = 80) -> str:
             else ""
         )
     )
+    # v2.0-F (L1): context budget guard stats
+    if gates.context_budget_warnings or gates.context_budget_trims:
+        ratio_str = ""
+        if gates.context_budget_latest_ratio:
+            top_profile = max(
+                gates.context_budget_latest_ratio,
+                key=gates.context_budget_latest_ratio.get,  # type: ignore[arg-type]
+            )
+            ratio_str = f"  (latest: {gates.context_budget_latest_ratio[top_profile]:.0%} {top_profile})"
+        lines.append(
+            f"  context-budget warn:   {gates.context_budget_warnings}  "
+            f"trim: {gates.context_budget_trims}{ratio_str}"
+        )
     lines.append("")
     lines.append("Recent")
     if not recent:
@@ -633,7 +659,28 @@ def _draw_frame(  # pragma: no cover - curses-only
         + (f"  ({_fmt_breakdown(gates.filters_breakdown)})" if gates.filters_breakdown else ""),
         width,
     )
-    row += 2
+    row += 1
+    # v2.0-F (L1): context budget guard line
+    if gates.context_budget_warnings or gates.context_budget_trims:
+        ratio_str = ""
+        if gates.context_budget_latest_ratio:
+            top_profile = max(
+                gates.context_budget_latest_ratio,
+                key=gates.context_budget_latest_ratio.get,  # type: ignore[arg-type]
+            )
+            ratio_str = f"  (latest: {gates.context_budget_latest_ratio[top_profile]:.0%} {top_profile})"
+        budget_line = (
+            f"  context-budget warn:   {gates.context_budget_warnings}  "
+            f"trim: {gates.context_budget_trims}{ratio_str}"
+        )
+        budget_color = (
+            _COLOR_YELLOW_PAIR
+            if gates.context_budget_trims == 0
+            else _COLOR_RED_PAIR
+        )
+        stdscr.addnstr(row, 0, budget_line, width, int(curses.color_pair(budget_color)))
+        row += 1
+    row += 1
     if row >= height - 2:
         return

coderouter/config/schemas.py CHANGED Viewed

@@ -230,6 +230,18 @@ class ProviderConfig(BaseModel):
             "Anthropic) from normal input — see :class:`CostConfig`."
         ),
     )
+    max_context_tokens: int | None = Field(
+        default=None,
+        ge=1,
+        description=(
+            "v2.0-F (L1): explicit declaration of this provider's "
+            "context window size in tokens. When set, takes precedence "
+            "over the ``model-capabilities.yaml`` registry lookup. "
+            "When both are unset, the context budget guard falls back "
+            "to 128000 (128K). Examples: Ollama Qwen3 32K → 32768, "
+            "LM Studio Qwen3.5 128K → 131072, Anthropic Claude → 200000."
+        ),
+    )
     @model_validator(mode="after")
     def _check_output_filters_known(self) -> ProviderConfig:
@@ -444,6 +456,147 @@ class FallbackChain(BaseModel):
             "operation, L5 handles hard crashes."
         ),
     )
+    # v2.0-F (L1): context budget guard.
+    #
+    # Long-running agent sessions accumulate messages that eventually
+    # exceed the target model's context window. Without intervention,
+    # the backend returns a 400 (Anthropic) or silently truncates
+    # (Ollama), killing the agent session. The context budget guard
+    # estimates the request's token count (char/4 heuristic, shared
+    # with the auto_router longContext matcher) and compares it against
+    # the target provider's declared max_context_tokens.
+    #
+    # Three actions:
+    #   * ``off``  — no detection, no logging. Backward-compat default.
+    #   * ``warn`` — emit ``context-budget-warning`` log + attach
+    #                ``X-CodeRouter-Context-Budget: warning`` response
+    #                header. No request mutation.
+    #   * ``trim`` — ``warn`` + remove oldest non-system messages until
+    #                the estimated token count drops below
+    #                ``context_budget_trim_target``. Recent messages
+    #                (``context_budget_preserve_last_n``) are always
+    #                kept, and tool_use / tool_result pairs are preserved
+    #                atomically to avoid breaking agent loops.
+    context_budget_action: Literal["off", "warn", "trim"] = Field(
+        default="off",
+        description=(
+            "v2.0-F (L1): action when estimated request tokens approach "
+            "the target provider's context window. ``off`` (default) "
+            "disables the guard entirely. ``warn`` emits a log and "
+            "response header at ``context_budget_warn_threshold``. "
+            "``trim`` additionally removes old messages at "
+            "``context_budget_trim_threshold`` to reclaim context space."
+        ),
+    )
+    context_budget_warn_threshold: float = Field(
+        default=0.80,
+        ge=0.1,
+        le=1.0,
+        description=(
+            "v2.0-F (L1): context usage ratio (estimated_tokens / "
+            "max_context_tokens) at which a warning is emitted. "
+            "Default 0.80 (80%) gives early notice before trim fires."
+        ),
+    )
+    context_budget_trim_threshold: float = Field(
+        default=0.90,
+        ge=0.1,
+        le=1.0,
+        description=(
+            "v2.0-F (L1): context usage ratio at which trim fires "
+            "(only when ``context_budget_action`` is ``trim``). "
+            "Default 0.90 (90%) leaves a 10% margin for the backend's "
+            "own token counting to differ from the char/4 estimate."
+        ),
+    )
+    context_budget_trim_target: float = Field(
+        default=0.75,
+        ge=0.1,
+        le=1.0,
+        description=(
+            "v2.0-F (L1): target context usage ratio after trim. "
+            "Messages are removed from the front until the estimate "
+            "drops below this ratio. Default 0.75 (75%) gives headroom "
+            "for several more turns before trim fires again."
+        ),
+    )
+    context_budget_preserve_last_n: int = Field(
+        default=4,
+        ge=1,
+        le=100,
+        description=(
+            "v2.0-F (L1): minimum number of recent messages to always "
+            "preserve when trimming. Default 4 (2 user-assistant pairs) "
+            "keeps the agent's immediate working context intact."
+        ),
+    )
+    # ------------------------------------------------------------------
+    # v2.0-G (L4): Drift detection — response quality degradation guard
+    # ------------------------------------------------------------------
+    #
+    # Long-running sessions on local LLMs can suffer gradual quality
+    # decay (KV cache pressure, thermal throttling, VRAM fragmentation)
+    # where the model "succeeds" but produces empty/short/toolless
+    # responses. This guard observes response quality signals in a
+    # rolling window and detects statistical drift.
+    #
+    # Four actions:
+    #   * ``off``     — no detection (default).
+    #   * ``warn``    — emit structured log + response header.
+    #   * ``promote`` — ``warn`` + demote drifted provider in chain.
+    #   * ``reload``  — ``promote`` + attempt KV cache flush (Ollama).
+    drift_detection_action: Literal["off", "warn", "promote", "reload"] = Field(
+        default="off",
+        description=(
+            "v2.0-G (L4): action on response quality drift detection. "
+            "``off`` (default) disables drift detection. ``warn`` emits "
+            "a log and response header. ``promote`` additionally demotes "
+            "the drifted provider in the chain. ``reload`` attempts to "
+            "flush the provider's KV cache (Ollama only) before promoting."
+        ),
+    )
+    drift_detection_window_size: int = Field(
+        default=20,
+        ge=4,
+        le=200,
+        description=(
+            "v2.0-G (L4): number of recent responses to keep in the "
+            "rolling observation window per provider. Larger windows "
+            "are more robust to noise but slower to detect drift."
+        ),
+    )
+    drift_detection_cooldown_s: int = Field(
+        default=300,
+        ge=10,
+        le=3600,
+        description=(
+            "v2.0-G (L4): seconds after a promote/reload action before "
+            "the drifted provider's rank is reset for recovery check. "
+            "Default 300s (5 min) gives the model time to stabilize."
+        ),
+    )
+    drift_detection_sensitivity: Literal["low", "normal", "high"] = Field(
+        default="normal",
+        description=(
+            "v2.0-G (L4): threshold preset for drift signals. "
+            "``low`` tolerates more degradation before triggering, "
+            "``high`` is stricter (fewer bad responses needed)."
+        ),
+    )
+    # --- v2.0-H (L6): Mid-stream partial stitching --------------------------
+    #   * ``off``      — discard partial content on mid-stream failure (legacy).
+    #   * ``surface``  — return partial content as a truncated-but-valid response.
+    partial_stitch_action: Literal["off", "surface"] = Field(
+        default="off",
+        description=(
+            "v2.0-H (L6): action when a streaming response fails mid-stream. "
+            "``off`` discards partial content (legacy error event). "
+            "``surface`` returns accumulated text as a graceful stream "
+            "termination with a ``coderouter_partial`` metadata event."
+        ),
+    )
 # ---------------------------------------------------------------------------
@@ -682,6 +835,42 @@ class CodeRouterConfig(BaseModel):
         ),
     )
+    # v2.0-I: Continuous probing — background health checks for idle periods.
+    continuous_probe: Literal["off", "active"] = Field(
+        default="off",
+        description=(
+            "v2.0-I: enable background health probes. 'active' starts a "
+            "background task that periodically sends 1-token requests to "
+            "each provider, feeding results into the L5 backend health "
+            "state machine. 'off' = no probing (backward-compatible default)."
+        ),
+    )
+    probe_interval_s: float = Field(
+        default=60.0,
+        ge=5.0,
+        le=3600.0,
+        description=(
+            "v2.0-I: seconds between probe rounds. Lower = faster detection "
+            "but more probe traffic. 60s is a good balance for local models."
+        ),
+    )
+    probe_paid: bool = Field(
+        default=False,
+        description=(
+            "v2.0-I: whether to probe providers marked ``paid: true``. "
+            "Default false protects operators from accidental API charges."
+        ),
+    )
+    probe_timeout_s: float = Field(
+        default=10.0,
+        ge=1.0,
+        le=60.0,
+        description=(
+            "v2.0-I: per-provider timeout for probe requests. A provider "
+            "that doesn't respond within this window is recorded as failed."
+        ),
+    )
     @model_validator(mode="after")
     def _check_default_profile_exists(self) -> CodeRouterConfig:
         """v0.6-A: surface a typo'd ``default_profile`` at load time.

coderouter/data/model-capabilities.yaml CHANGED Viewed

@@ -406,3 +406,82 @@ rules:
     kind: openai_compat
     capabilities:
       tools: true
+  # ------------------------------------------------------------------
+  # Context window declarations — max_context_tokens (v2.0-F)
+  #
+  # Used by the context-budget guard (L1) to know when a conversation
+  # is approaching the model's context limit. They are placed after the
+  # capability-specific rules above; since matching is first-match-per-flag,
+  # a model can take `thinking: true` and `max_context_tokens` from
+  # different rules — each flag resolves independently.
+  #
+  # Values are the *effective* context window the model reliably handles.
+  # For models with claimed but untested larger windows, the conservative
+  # value is declared. Operators can override via per-provider
+  # `max_context_tokens` in providers.yaml or user model-capabilities.yaml.
+  # ------------------------------------------------------------------
+  # Anthropic Claude — 200K context window (all 4.x families)
+  - match: "claude-*"
+    kind: anthropic
+    capabilities:
+      max_context_tokens: 200000
+  # Qwen3 (base, non-coder) — Ollama default: 32K context
+  - match: "qwen3:*"
+    capabilities:
+      max_context_tokens: 32768
+  # Qwen3-Coder — 256K declared, conservative 131K for GGUF quantized
+  - match: "qwen3-coder:*"
+    capabilities:
+      max_context_tokens: 131072
+  - match: "qwen/qwen3-coder-*"
+    capabilities:
+      max_context_tokens: 131072
+  # Qwen3.5 — 131K verified (LM Studio Anthropic endpoint)
+  - match: "qwen3.5*"
+    capabilities:
+      max_context_tokens: 131072
+  # Qwen3.6 — 256K declared, conservative 131K
+  - match: "qwen3.6*"
+    capabilities:
+      max_context_tokens: 131072
+  # Gemma 4 — 128K context (all variants)
+  - match: "gemma4:*"
+    capabilities:
+      max_context_tokens: 131072
+  - match: "google/gemma-4*"
+    capabilities:
+      max_context_tokens: 131072
+  # DeepSeek V3 — 128K context
+  - match: "deepseek*v3*"
+    capabilities:
+      max_context_tokens: 131072
+  # DeepSeek R1 — 128K context
+  - match: "deepseek*r1*"
+    capabilities:
+      max_context_tokens: 131072
+  # GPT-OSS — 131K context
+  - match: "*gpt-oss*"
+    capabilities:
+      max_context_tokens: 131072
+  # Devstral — 128K context (Mistral coding)
+  - match: "*devstral*"
+    capabilities:
+      max_context_tokens: 131072
+  # Kimi K2 — 128K context
+  - match: "*kimi-k2*"
+    capabilities:
+      max_context_tokens: 131072

coderouter-cli 1.10.1__py3-none-any.whl → 2.1.0__py3-none-any.whl

coderouter-cli 1.10.1__py3-none-any.whl → 2.1.0__py3-none-any.whl