PyPI - coderouter-cli - Versions diffs - 2.0.0__py3-none-any.whl → 2.2.0__py3-none-any.whl - Mend

coderouter-cli 2.0.0py3-none-any.whl → 2.2.0py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (30) hide show

coderouter/cli.py +219 -0
coderouter/config/schemas.py +235 -2
coderouter/guards/__init__.py +6 -4
coderouter/guards/backend_health.py +34 -0
coderouter/guards/continuous_probe.py +349 -0
coderouter/guards/drift_actions.py +111 -0
coderouter/guards/drift_detection.py +308 -0
coderouter/guards/self_healing.py +413 -0
coderouter/guards/tool_loop.py +71 -0
coderouter/ingress/anthropic_routes.py +106 -12
coderouter/ingress/app.py +129 -0
coderouter/logging.py +370 -0
coderouter/metrics/collector.py +168 -0
coderouter/metrics/prometheus.py +141 -0
coderouter/output_filters.py +95 -4
coderouter/routing/adaptive.py +23 -0
coderouter/routing/budget.py +35 -0
coderouter/routing/fallback.py +496 -5
coderouter/state/__init__.py +15 -0
coderouter/state/audit_log.py +269 -0
coderouter/state/replay.py +316 -0
coderouter/state/request_log.py +178 -0
coderouter/state/store.py +212 -0
coderouter/translation/tool_repair.py +42 -1
coderouter_cli-2.2.0.dist-info/METADATA +243 -0
{coderouter_cli-2.0.0.dist-info → coderouter_cli-2.2.0.dist-info}/RECORD +29 -20
coderouter_cli-2.0.0.dist-info/METADATA +0 -559
{coderouter_cli-2.0.0.dist-info → coderouter_cli-2.2.0.dist-info}/WHEEL +0 -0
{coderouter_cli-2.0.0.dist-info → coderouter_cli-2.2.0.dist-info}/entry_points.txt +0 -0
{coderouter_cli-2.0.0.dist-info → coderouter_cli-2.2.0.dist-info}/licenses/LICENSE +0 -0

coderouter/metrics/collector.py CHANGED Viewed

@@ -48,6 +48,13 @@ Event inventory (dispatch table in :meth:`MetricsCollector._dispatch`)
                                    + per-profile counter + latest_ratio gauge
     ``context-budget-trimmed`` (v2.0-F)→ ``context_budget_trims_total``
                                    + per-profile counter
+    ``drift-detected`` (v2.0-G)  → ``drift_detected_total`` + per-provider
+    ``drift-promoted`` (v2.0-G)  → ``drift_promoted_total``
+    ``drift-reload-attempted``   → ``drift_reload_total`` / success
+    ``partial-stitch-surfaced``  → ``partial_stitch_surfaced_total`` (v2.0-H)
+    ``probe-completed`` (v2.0-I) → ``probe_total`` / ``probe_success`` / ``probe_failure``
+                                   + per-provider latency gauge
+    ``probe-round-completed``    → ``probe_rounds_total`` (v2.0-I)
+    ``probe-capabilities-drift`` → ``probe_drift_detected`` per-provider (v2.0-I)
     ``coderouter-startup``       → ``startup_info`` (stored for the UI header)
     Unrecognized events are ignored (forward-compat: adding a new log
@@ -194,6 +201,29 @@ class MetricsCollector(logging.Handler):
         self._context_budget_trims_by_profile: Counter[str] = Counter()
         self._context_budget_latest_ratio: dict[str, float] = {}
+        # v2.0-G (L4): drift detection counters. Per-provider counts of
+        # drift events, promotions (rank demotions), and reload attempts.
+        self._drift_detected_total: int = 0
+        self._drift_detected_by_provider: Counter[str] = Counter()
+        self._drift_promoted_total: int = 0
+        self._drift_reload_total: int = 0
+        self._drift_reload_success_total: int = 0
+        # v2.0-H (L6): partial stitch surfaced counter. Tracks how often
+        # the mid-stream failure recovery delivered partial content to the
+        # client instead of a generic error event.
+        self._partial_stitch_surfaced_total: int = 0
+        # v2.0-I: continuous probe counters. Per-provider probe attempts
+        # and outcomes, plus a round counter for the dashboard's
+        # "probes/min" panel.
+        self._probe_total: Counter[str] = Counter()  # per-provider total probes
+        self._probe_success: Counter[str] = Counter()  # per-provider successes
+        self._probe_failure: Counter[str] = Counter()  # per-provider failures
+        self._probe_rounds_total: int = 0
+        self._probe_latency_ms: dict[str, float] = {}  # per-provider latest
+        self._probe_drift_detected: Counter[str] = Counter()  # per-provider drift
         # Last-error snapshot per provider (overwrites previous). Enables the
         # dashboard's "last error" column without scanning the ring.
         self._last_error: dict[str, dict[str, Any]] = {}
@@ -372,6 +402,43 @@ class MetricsCollector(logging.Handler):
                 profile = _str(extras.get("profile"))
                 self._context_budget_trims_total += 1
                 self._context_budget_trims_by_profile[profile] += 1
+            elif event == "drift-detected":
+                # v2.0-G (L4): drift detection fired.
+                provider = _str(extras.get("provider"))
+                self._drift_detected_total += 1
+                self._drift_detected_by_provider[provider] += 1
+                self._push_recent(event, extras, record)
+            elif event == "drift-promoted":
+                # v2.0-G (L4): drifted provider was demoted.
+                self._drift_promoted_total += 1
+                self._push_recent(event, extras, record)
+            elif event == "drift-reload-attempted":
+                # v2.0-G (L4): Ollama KV cache flush attempted.
+                self._drift_reload_total += 1
+                if extras.get("success"):
+                    self._drift_reload_success_total += 1
+            elif event == "partial-stitch-surfaced":
+                # v2.0-H (L6): mid-stream failure gracefully surfaced.
+                self._partial_stitch_surfaced_total += 1
+                self._push_recent(event, extras, record)
+            elif event == "probe-completed":
+                # v2.0-I: per-provider probe outcome.
+                provider = _str(extras.get("provider"))
+                self._probe_total[provider] += 1
+                if extras.get("success"):
+                    self._probe_success[provider] += 1
+                else:
+                    self._probe_failure[provider] += 1
+                latency_raw = extras.get("latency_ms")
+                if isinstance(latency_raw, int | float):
+                    self._probe_latency_ms[provider] = float(latency_raw)
+            elif event == "probe-round-completed":
+                # v2.0-I: round counter for the dashboard.
+                self._probe_rounds_total += 1
+            elif event == "probe-capabilities-drift":
+                # v2.0-I: model mismatch detected by probe.
+                provider = _str(extras.get("provider"))
+                self._probe_drift_detected[provider] += 1
             elif event == "coderouter-startup":
                 # Snapshot a subset — startup payload contains lists that are
                 # safe to surface to /metrics.json. Version / providers /
@@ -534,11 +601,103 @@ class MetricsCollector(logging.Handler):
                     "context_budget_latest_ratio": dict(
                         self._context_budget_latest_ratio
                     ),
+                    # v2.0-G (L4): drift detection aggregate counters.
+                    "drift_detected_total": self._drift_detected_total,
+                    "drift_detected_by_provider": dict(
+                        self._drift_detected_by_provider
+                    ),
+                    "drift_promoted_total": self._drift_promoted_total,
+                    "drift_reload_total": self._drift_reload_total,
+                    "drift_reload_success_total": self._drift_reload_success_total,
+                    # v2.0-H (L6): partial stitch surfaced.
+                    "partial_stitch_surfaced_total": self._partial_stitch_surfaced_total,
+                    # v2.0-I: continuous probe counters.
+                    "probe_total": dict(self._probe_total),
+                    "probe_success": dict(self._probe_success),
+                    "probe_failure": dict(self._probe_failure),
+                    "probe_rounds_total": self._probe_rounds_total,
+                    "probe_latency_ms": dict(self._probe_latency_ms),
+                    "probe_drift_detected": dict(self._probe_drift_detected),
                 },
                 "providers": provider_rows,
                 "recent": list(self._recent),
             }
+    # ------------------------------------------------------------------
+    # v2.0-K: Persistence
+    # ------------------------------------------------------------------
+    def save_state(self) -> dict[str, object]:
+        """Export key counters for cross-restart persistence.
+        Returns a JSON-safe dict of the most operationally-important
+        counters.  The ``recent`` ring and per-provider ``last_error``
+        are excluded (ephemeral by nature).
+        """
+        with self._lock:
+            return {
+                "requests_total": self._requests_total,
+                "provider_attempts": dict(self._provider_attempts),
+                "provider_outcomes": {
+                    k: dict(v) for k, v in self._provider_outcomes.items()
+                },
+                "cost_total_usd": dict(self._cost_total_usd),
+                "cost_savings_usd": dict(self._cost_savings_usd),
+                "cost_total_usd_aggregate": self._cost_total_usd_aggregate,
+                "cost_savings_usd_aggregate": self._cost_savings_usd_aggregate,
+                "chain_paid_gate_blocked_total": self._chain_paid_gate_blocked_total,
+                "chain_budget_exceeded_total": self._chain_budget_exceeded_total,
+                "chain_memory_pressure_blocked_total": self._chain_memory_pressure_blocked_total,
+                "chain_uniform_auth_failure_total": self._chain_uniform_auth_failure_total,
+                "probe_rounds_total": self._probe_rounds_total,
+            }
+    def load_state(self, state: dict[str, object]) -> None:
+        """Restore counters from a previously saved dict.
+        Additive: values from ``state`` are *added* to the current
+        (zeroed) counters, so calling ``load_state`` on a fresh
+        collector restores the prior session's totals.
+        """
+        if not isinstance(state, dict):
+            return
+        with self._lock:
+            self._requests_total += int(state.get("requests_total", 0))
+            for k, v in (state.get("provider_attempts") or {}).items():
+                self._provider_attempts[k] += int(v)
+            for prov, outcomes in (state.get("provider_outcomes") or {}).items():
+                if not isinstance(outcomes, dict):
+                    continue
+                if prov not in self._provider_outcomes:
+                    self._provider_outcomes[prov] = Counter()
+                for k, v in outcomes.items():
+                    self._provider_outcomes[prov][k] += int(v)
+            for k, v in (state.get("cost_total_usd") or {}).items():
+                self._cost_total_usd[k] = self._cost_total_usd.get(k, 0.0) + float(v)
+            for k, v in (state.get("cost_savings_usd") or {}).items():
+                self._cost_savings_usd[k] = self._cost_savings_usd.get(k, 0.0) + float(v)
+            self._cost_total_usd_aggregate += float(
+                state.get("cost_total_usd_aggregate", 0.0)
+            )
+            self._cost_savings_usd_aggregate += float(
+                state.get("cost_savings_usd_aggregate", 0.0)
+            )
+            self._chain_paid_gate_blocked_total += int(
+                state.get("chain_paid_gate_blocked_total", 0)
+            )
+            self._chain_budget_exceeded_total += int(
+                state.get("chain_budget_exceeded_total", 0)
+            )
+            self._chain_memory_pressure_blocked_total += int(
+                state.get("chain_memory_pressure_blocked_total", 0)
+            )
+            self._chain_uniform_auth_failure_total += int(
+                state.get("chain_uniform_auth_failure_total", 0)
+            )
+            self._probe_rounds_total += int(
+                state.get("probe_rounds_total", 0)
+            )
     # ------------------------------------------------------------------
     # Test hook
     # ------------------------------------------------------------------
@@ -578,6 +737,15 @@ class MetricsCollector(logging.Handler):
             self._cost_savings_usd.clear()
             self._cost_total_usd_aggregate = 0.0
             self._cost_savings_usd_aggregate = 0.0
+            # v2.0-H (L6)
+            self._partial_stitch_surfaced_total = 0
+            # v2.0-I
+            self._probe_total.clear()
+            self._probe_success.clear()
+            self._probe_failure.clear()
+            self._probe_rounds_total = 0
+            self._probe_latency_ms.clear()
+            self._probe_drift_detected.clear()
             # v2.0-F (L1)
             self._context_budget_warnings_total = 0
             self._context_budget_trims_total = 0

coderouter/metrics/prometheus.py CHANGED Viewed

@@ -381,6 +381,147 @@ def format_prometheus(snapshot: dict[str, Any]) -> str:
             samples=ratio_samples,
         )
     )
+    # ---- v2.0-G (L4): drift detection metrics ------------------------------
+    lines.extend(
+        _counter(
+            name="drift_detected_total",
+            help_text=(
+                "Drift detection events (quality degradation detected), "
+                "by provider."
+            ),
+            samples=[
+                ((("provider", p),), v)
+                for p, v in sorted(
+                    counters.get("drift_detected_by_provider", {}).items()
+                )
+            ],
+        )
+    )
+    drift_promoted = counters.get("drift_promoted_total", 0)
+    if drift_promoted:
+        lines.extend(
+            _counter(
+                name="drift_promoted_total",
+                help_text=(
+                    "Number of times a drifted provider was demoted "
+                    "(promote/reload action fired)."
+                ),
+                samples=[(((),), drift_promoted)],
+            )
+        )
+    drift_reload = counters.get("drift_reload_total", 0)
+    if drift_reload:
+        lines.extend(
+            _counter(
+                name="drift_reload_total",
+                help_text="Ollama KV cache flush attempts (reload action).",
+                samples=[(((),), drift_reload)],
+            )
+        )
+    drift_reload_ok = counters.get("drift_reload_success_total", 0)
+    if drift_reload_ok:
+        lines.extend(
+            _counter(
+                name="drift_reload_success_total",
+                help_text="Successful Ollama KV cache flush attempts.",
+                samples=[(((),), drift_reload_ok)],
+            )
+        )
+    # ---- v2.0-H (L6): partial stitch surfaced metric -----------------------
+    partial_stitch = counters.get("partial_stitch_surfaced_total", 0)
+    if partial_stitch:
+        lines.extend(
+            _counter(
+                name="partial_stitch_surfaced_total",
+                help_text=(
+                    "Mid-stream failures where partial content was delivered "
+                    "to the client (partial_stitch_action=surface)."
+                ),
+                samples=[((), partial_stitch)],
+            )
+        )
+    # ---- v2.0-I: continuous probe metrics ------------------------------------
+    probe_total_samples: list[tuple[tuple[tuple[str, str], ...], int]] = []
+    for provider, count in sorted(counters.get("probe_total", {}).items()):
+        probe_total_samples.append(
+            ((("provider", provider),), count)
+        )
+    if probe_total_samples:
+        lines.extend(
+            _counter(
+                name="probe_total",
+                help_text=(
+                    "Continuous health probe attempts, by provider. "
+                    "Each probe sends a 1-token completion to verify "
+                    "the full model pipeline."
+                ),
+                samples=probe_total_samples,
+            )
+        )
+    probe_outcome_samples: list[tuple[tuple[tuple[str, str], ...], int]] = []
+    for provider, count in sorted(counters.get("probe_success", {}).items()):
+        probe_outcome_samples.append(
+            ((("provider", provider), ("outcome", "success")), count)
+        )
+    for provider, count in sorted(counters.get("probe_failure", {}).items()):
+        probe_outcome_samples.append(
+            ((("provider", provider), ("outcome", "failure")), count)
+        )
+    if probe_outcome_samples:
+        lines.extend(
+            _counter(
+                name="probe_outcomes_total",
+                help_text=(
+                    "Continuous probe outcomes by provider and result "
+                    "(success | failure)."
+                ),
+                samples=probe_outcome_samples,
+            )
+        )
+    probe_rounds = counters.get("probe_rounds_total", 0)
+    if probe_rounds:
+        lines.extend(
+            _counter(
+                name="probe_rounds_total",
+                help_text="Completed probe sweep rounds (one round probes all eligible providers).",
+                samples=[((), probe_rounds)],
+            )
+        )
+    probe_drift_samples: list[tuple[tuple[tuple[str, str], ...], int]] = [
+        ((("provider", p),), v)
+        for p, v in sorted(counters.get("probe_drift_detected", {}).items())
+    ]
+    if probe_drift_samples:
+        lines.extend(
+            _counter(
+                name="probe_drift_detected_total",
+                help_text=(
+                    "Model-name mismatches detected by continuous probing "
+                    "(configured model != response model), by provider."
+                ),
+                samples=probe_drift_samples,
+            )
+        )
+    # Gauge: latest probe latency per provider (ms).
+    latency_samples: list[tuple[tuple[tuple[str, str], ...], float]] = [
+        ((("provider", p),), round(v, 1))
+        for p, v in sorted(counters.get("probe_latency_ms", {}).items())
+    ]
+    if latency_samples:
+        lines.extend(
+            _gauge_float(
+                name="probe_latency_ms",
+                help_text=(
+                    "Latest probe round-trip latency in milliseconds, by "
+                    "provider. Gauge (most recent value, not cumulative)."
+                ),
+                samples=latency_samples,
+            )
+        )
     return "\n".join(lines) + "\n"

coderouter/output_filters.py CHANGED Viewed

@@ -52,6 +52,7 @@ __all__ = [
     "OutputFilterChain",
     "StripStopMarkersFilter",
     "StripThinkingFilter",
+    "StripToolCallXmlFilter",
     "apply_output_filters",
     "validate_output_filters",
 ]
@@ -63,20 +64,28 @@ __all__ = [
 DEFAULT_STOP_MARKERS: tuple[str, ...] = (
+    # v1.0-A originals
     "<|turn|>",
     "<|end|>",
     "<|python_tag|>",
     "<|im_end|>",
     "<|eot_id|>",
     "<|channel>thought",
+    # v2.2: tool-call XML tags leaked by Qwen / Hermes / Llama tool-call
+    # formats. These appear when the model writes tool calls as XML
+    # instead of structured JSON, or when the tokenizer's special-token
+    # handling leaks through.
+    "<|tool▁call|>",
+    "<|tool▁sep|>",
 )
 """Default stop/harness markers stripped by ``strip_stop_markers``.
 Covers Llama 3.x (``<|python_tag|>``, ``<|eot_id|>``), ChatML / Qwen
-(``<|im_end|>``, ``<|end|>``), Gemma-ish (``<|turn|>``) and OpenAI-
-harmony (``<|channel>thought``). Extending this tuple is an ABI change
-— users who need a bespoke set can add a dedicated filter entry in
-a later minor; for v1.0-A the fixed list covers observed leaks.
+(``<|im_end|>``, ``<|end|>``), Gemma-ish (``<|turn|>``), OpenAI-
+harmony (``<|channel>thought``), and Qwen / Hermes tool-call markers
+(``<|tool▁call|>``, ``<|tool▁sep|>``). Extending this tuple is an ABI
+change — users who need a bespoke set can add a dedicated filter entry
+in a later minor.
 """
@@ -292,6 +301,87 @@ class StripStopMarkersFilter:
         return "".join(out_parts)
+# ---------------------------------------------------------------------------
+# strip_tool_call_xml (v2.2)
+# ---------------------------------------------------------------------------
+# Literal opening / closing delimiters that StripToolCallXmlFilter scans for.
+_TOOL_CALL_OPEN = "<tool_call>"
+_TOOL_CALL_CLOSE = "</tool_call>"
+class StripToolCallXmlFilter:
+    """Remove ``<tool_call>...</tool_call>`` XML blocks from assistant content.
+    Qwen / Hermes / Llama tool-call formats sometimes emit tool calls
+    as ``<tool_call>{"name": "Bash", ...}</tool_call>`` XML in the
+    content stream. When ``tool_repair`` has already extracted the
+    structured JSON from these blocks, the XML wrapper tags are
+    leftover debris that confuse downstream clients.
+    Architecture note: this filter should run AFTER ``tool_repair``
+    has had a chance to extract the JSON. The filter chain is applied
+    at the adapter boundary (post-repair), so ordering is naturally
+    correct.
+    Implementation mirrors ``StripThinkingFilter`` — the same
+    stateful open/close tag scanning, same chunk-boundary safety.
+    """
+    name = "strip_tool_call_xml"
+    def __init__(self) -> None:
+        """Initialize the per-request buffer + in-block state to empty."""
+        self.modified: bool = False
+        self._in_block: bool = False
+        self._buffer: str = ""
+    def feed(self, text: str, *, eof: bool = False) -> str:
+        """Consume ``text`` and return the portion safe to emit now.
+        Mirrors the ``StripThinkingFilter`` algorithm: greedy tag
+        matching with partial-prefix holdback across chunk boundaries.
+        """
+        self._buffer += text
+        out_parts: list[str] = []
+        while True:
+            if not self._in_block:
+                idx = self._buffer.find(_TOOL_CALL_OPEN)
+                if idx != -1:
+                    out_parts.append(self._buffer[:idx])
+                    self._buffer = self._buffer[idx + len(_TOOL_CALL_OPEN) :]
+                    self._in_block = True
+                    self.modified = True
+                    continue
+                # No open tag — emit all but a potential partial prefix.
+                overlap = _max_suffix_overlap(self._buffer, _TOOL_CALL_OPEN)
+                if overlap:
+                    out_parts.append(self._buffer[:-overlap])
+                    self._buffer = self._buffer[-overlap:]
+                else:
+                    out_parts.append(self._buffer)
+                    self._buffer = ""
+                break
+            # in_block: suppress until we find the close tag.
+            idx = self._buffer.find(_TOOL_CALL_CLOSE)
+            if idx != -1:
+                self._buffer = self._buffer[idx + len(_TOOL_CALL_CLOSE) :]
+                self._in_block = False
+                continue
+            # No close tag — retain potential partial suffix, drop the rest.
+            overlap = _max_suffix_overlap(self._buffer, _TOOL_CALL_CLOSE)
+            self._buffer = self._buffer[-overlap:] if overlap else ""
+            break
+        if eof:
+            if not self._in_block:
+                out_parts.append(self._buffer)
+            # If still in block at eof, silently drop the partial block.
+            self._buffer = ""
+        return "".join(out_parts)
 # ---------------------------------------------------------------------------
 # Registry + chain
 # ---------------------------------------------------------------------------
@@ -300,6 +390,7 @@ class StripStopMarkersFilter:
 KNOWN_FILTERS: dict[str, type[OutputFilter]] = {
     StripThinkingFilter.name: StripThinkingFilter,
     StripStopMarkersFilter.name: StripStopMarkersFilter,
+    StripToolCallXmlFilter.name: StripToolCallXmlFilter,
 }
 """Registry of string-name → filter class.

coderouter/routing/adaptive.py CHANGED Viewed

@@ -234,6 +234,29 @@ class AdaptiveAdjuster:
         while entry.observations and entry.observations[0].ts_monotonic < cutoff:
             entry.observations.popleft()
+    def demote(self, provider: str, *, steps: int = 2) -> None:
+        """Force-demote a provider by injecting synthetic failure observations.
+        Used by v2.0-G drift detection to push a provider's error rate above
+        the demotion threshold (``ERROR_RATE_DEMOTE_THRESHOLD``). Each step
+        injects one synthetic failure observation, so ``steps=2`` guarantees
+        the provider will be ranked lower on the next ``compute_effective_order``.
+        The injected observations carry no latency signal (``latency_ms=None``)
+        and expire naturally after ``ROLLING_WINDOW_S`` seconds.
+        """
+        ts = time.monotonic()
+        with self._lock:
+            entry = self._state.setdefault(provider, _AdjusterState())
+            for _ in range(steps):
+                entry.observations.append(
+                    _ProviderObservation(
+                        ts_monotonic=ts,
+                        latency_ms=None,
+                        success=False,
+                    )
+                )
     # ------------------------------------------------------------------
     # Stats
     # ------------------------------------------------------------------

coderouter/routing/budget.py CHANGED Viewed

@@ -187,5 +187,40 @@ class BudgetTracker:
             self._totals.clear()
             self._month = current
+    # ------------------------------------------------------------------
+    # v2.0-K: Persistence
+    # ------------------------------------------------------------------
+    def save_state(self) -> dict[str, object]:
+        """Export the current state as a JSON-safe dict.
+        Called by the engine to persist budget totals across restarts.
+        """
+        with self._lock:
+            return {
+                "month": self._month,
+                "totals": dict(self._totals),
+            }
+    def load_state(self, state: dict[str, object]) -> None:
+        """Restore state from a previously saved dict.
+        Only restores if the saved month matches the current month
+        (no point restoring last month's totals into a new month).
+        """
+        if not isinstance(state, dict):
+            return
+        saved_month = state.get("month", "")
+        with self._lock:
+            current = _utc_month_key()
+            if saved_month != current:
+                return  # stale month — skip
+            totals = state.get("totals", {})
+            if isinstance(totals, dict):
+                self._totals = {
+                    k: float(v) for k, v in totals.items() if isinstance(v, (int, float))
+                }
+                self._month = current
 __all__ = ["BudgetTracker"]

coderouter-cli 2.0.0__py3-none-any.whl → 2.2.0__py3-none-any.whl

coderouter-cli 2.0.0py3-none-any.whl → 2.2.0py3-none-any.whl