coderouter-cli 2.1.0__py3-none-any.whl → 2.2.0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -0,0 +1,413 @@
1
+ """Self-healing routing orchestrator (v2.0-J).
2
+
3
+ When the L5 :class:`BackendHealthMonitor` transitions a provider to
4
+ UNHEALTHY and the profile's ``backend_health_action`` is ``exclude``,
5
+ this orchestrator:
6
+
7
+ 1. **Excludes** the provider from the chain (complete removal, not
8
+ just demotion to the back).
9
+ 2. **Attempts restart** if the provider declares a ``restart_command``
10
+ in ``providers.yaml``.
11
+ 3. **Schedules recovery probes** with exponential backoff (default
12
+ 30 s → 60 s → 120 s → 300 s cap) to detect when the backend
13
+ comes back online.
14
+ 4. **Restores** the provider to its original chain position on the
15
+ first successful recovery probe.
16
+
17
+ Architecture
18
+ ============
19
+
20
+ ::
21
+
22
+ BackendHealthMonitor
23
+ └─ transition → UNHEALTHY
24
+ └─ engine calls orchestrator.on_unhealthy(provider)
25
+ ├─ add to _excluded set
26
+ ├─ try restart_command (if configured)
27
+ └─ schedule recovery probe (async)
28
+
29
+ recovery_probe_loop:
30
+ while provider in _excluded:
31
+ sleep(backoff interval)
32
+ probe_one(provider)
33
+ if success:
34
+ remove from _excluded
35
+ record_attempt(success=True) → snap to HEALTHY
36
+ log restore
37
+ break
38
+ else:
39
+ interval = min(interval * 2, max_interval)
40
+
41
+ _resolve_chain (engine):
42
+ Pass 4b: if action == "exclude":
43
+ filter out providers in orchestrator._excluded
44
+
45
+ Design choices
46
+ ==============
47
+
48
+ - **Thread-safe** via an internal ``RLock`` (same pattern as
49
+ ``BackendHealthMonitor``). The exclude set and restart lock
50
+ are guarded independently.
51
+ - **No new dependency** — subprocess (stdlib) for restart commands,
52
+ asyncio for recovery probe scheduling.
53
+ - **Restart is opt-in** — only providers with ``restart_command``
54
+ set get automatic restart. Others rely solely on recovery probes
55
+ (waiting for manual restart by the operator).
56
+ - **Double-restart prevention** — a per-provider ``_restart_lock``
57
+ prevents concurrent restart attempts (e.g. two profiles both
58
+ hitting UNHEALTHY on the same provider).
59
+ """
60
+
61
+ from __future__ import annotations
62
+
63
+ import asyncio
64
+ import contextlib
65
+ import subprocess
66
+ import threading
67
+ import time
68
+ from dataclasses import dataclass
69
+
70
+ from coderouter.config.schemas import ProviderConfig
71
+ from coderouter.logging import (
72
+ get_logger,
73
+ log_self_healing_exclude,
74
+ log_self_healing_recovery_probe,
75
+ log_self_healing_restart,
76
+ log_self_healing_restore,
77
+ )
78
+
79
+ logger = get_logger(__name__)
80
+
81
+
82
+ @dataclass(slots=True)
83
+ class _ExcludedProvider:
84
+ """Metadata for a provider currently excluded from the chain."""
85
+
86
+ provider: str
87
+ excluded_at: float
88
+ profile: str
89
+ consecutive_failures: int
90
+
91
+
92
class SelfHealingOrchestrator:
    """Coordinates provider exclusion, backend restart, and recovery probing.

    Thread-safe: one internal :class:`threading.RLock` guards both the
    excluded-provider map and the restart-lock table; each restart
    additionally holds a dedicated per-provider lock so two callers can
    never run the same backend's ``restart_command`` concurrently.

    Public API:

    - :meth:`on_unhealthy` — exclude a provider from the chain.
    - :meth:`on_recovered` — restore a previously excluded provider.
    - :meth:`is_excluded` / :meth:`excluded_providers` — query state.
    - :meth:`try_restart` — run a provider's configured restart command.
    - :meth:`save_state` / :meth:`load_state` — persistence hooks (v2.0-K).
    - :meth:`reset` — drop all state. Mainly for tests.
    """

    def __init__(self) -> None:
        self._lock: threading.RLock = threading.RLock()
        self._excluded: dict[str, _ExcludedProvider] = {}
        # One lock per provider name, created lazily in try_restart().
        self._restart_locks: dict[str, threading.Lock] = {}

    # ------------------------------------------------------------------
    # Exclusion management
    # ------------------------------------------------------------------

    def on_unhealthy(
        self,
        provider: str,
        *,
        profile: str,
        consecutive_failures: int,
    ) -> bool:
        """Exclude *provider* from the chain.

        Idempotent: returns True only when the provider was newly
        excluded, False when it was already in the excluded set.
        """
        with self._lock:
            if provider in self._excluded:
                return False
            record = _ExcludedProvider(
                provider=provider,
                excluded_at=time.monotonic(),
                profile=profile,
                consecutive_failures=consecutive_failures,
            )
            self._excluded[provider] = record
            log_self_healing_exclude(
                logger,
                provider=provider,
                profile=profile,
                consecutive_failures=consecutive_failures,
            )
            return True

    def on_recovered(self, provider: str, *, profile: str) -> float | None:
        """Restore *provider* to the chain after recovery.

        Returns how long (in seconds) the provider had been excluded, or
        None when it was not in the excluded set to begin with.
        """
        with self._lock:
            record = self._excluded.pop(provider, None)
            if record is None:
                return None
            elapsed = time.monotonic() - record.excluded_at
            log_self_healing_restore(
                logger,
                provider=provider,
                profile=profile,
                excluded_duration_s=elapsed,
            )
            return elapsed

    def is_excluded(self, provider: str) -> bool:
        """True iff *provider* is currently filtered out of the chain."""
        with self._lock:
            return provider in self._excluded

    def excluded_providers(self) -> set[str]:
        """Snapshot of the names of all currently excluded providers."""
        with self._lock:
            return set(self._excluded)

    def reset(self) -> None:
        """Forget everything — excluded set and restart locks alike."""
        with self._lock:
            self._excluded.clear()
            self._restart_locks.clear()

    # ------------------------------------------------------------------
    # v2.0-K: Persistence
    # ------------------------------------------------------------------

    def save_state(self) -> dict[str, object]:
        """Serialize the excluded-provider set for persistence.

        Only ``profile`` and ``consecutive_failures`` survive — the
        ``excluded_at`` timestamp is monotonic-clock based and therefore
        meaningless across process restarts.
        """
        with self._lock:
            snapshot: dict[str, object] = {}
            for name, record in self._excluded.items():
                snapshot[name] = {
                    "profile": record.profile,
                    "consecutive_failures": record.consecutive_failures,
                }
            return snapshot

    def load_state(self, state: dict[str, object]) -> None:
        """Re-exclude providers recorded by a previous :meth:`save_state`.

        ``excluded_at`` is reset to "now": the original exclude timestamp
        does not survive a restart, and what matters is only that the
        provider *stays* excluded until a recovery probe succeeds.
        Malformed or already-present entries are skipped silently.
        """
        if not isinstance(state, dict):
            return

        with self._lock:
            for name, data in state.items():
                if not isinstance(data, dict) or name in self._excluded:
                    continue
                failures = data.get("consecutive_failures", 0)
                if not isinstance(failures, int):
                    failures = 0
                self._excluded[name] = _ExcludedProvider(
                    provider=name,
                    excluded_at=time.monotonic(),
                    profile=str(data.get("profile", "")),
                    consecutive_failures=failures,
                )

    # ------------------------------------------------------------------
    # Restart helper
    # ------------------------------------------------------------------

    def try_restart(
        self,
        provider_config: ProviderConfig,
        *,
        timeout_s: float = 30.0,
    ) -> bool:
        """Run the provider's configured ``restart_command``.

        Returns True on exit code 0; False on a non-zero exit, timeout,
        OS-level failure, a missing ``restart_command``, or when another
        thread is already restarting the same provider (the per-provider
        lock is acquired non-blocking — we never wait).
        """
        command = provider_config.restart_command
        if not command:
            return False

        provider = provider_config.name

        # Lazily create the per-provider lock under the main lock.
        with self._lock:
            restart_lock = self._restart_locks.setdefault(
                provider, threading.Lock()
            )

        # A restart already in flight for this provider? Skip silently.
        if not restart_lock.acquire(blocking=False):
            return False

        try:
            # NOTE: shell=True runs an operator-supplied command from
            # providers.yaml — trusted configuration, not user input.
            completed = subprocess.run(
                command,
                shell=True,
                capture_output=True,
                text=True,
                timeout=timeout_s,
            )
            ok = completed.returncode == 0
            log_self_healing_restart(
                logger,
                provider=provider,
                command=command,
                success=ok,
                error=None if ok else completed.stderr.strip(),
            )
            return ok
        except subprocess.TimeoutExpired:
            log_self_healing_restart(
                logger,
                provider=provider,
                command=command,
                success=False,
                error=f"timeout after {timeout_s}s",
            )
            return False
        except OSError as exc:
            log_self_healing_restart(
                logger,
                provider=provider,
                command=command,
                success=False,
                error=str(exc),
            )
            return False
        finally:
            restart_lock.release()
306
+
307
+
308
+ # ---------------------------------------------------------------------------
309
+ # Recovery probe loop (async, runs as a background task)
310
+ # ---------------------------------------------------------------------------
311
+
312
+
313
async def recovery_probe_loop(
    provider_config: ProviderConfig,
    *,
    orchestrator: SelfHealingOrchestrator,
    record_fn: object | None = None,
    health_threshold: int = 3,
    initial_interval_s: float = 30.0,
    max_interval_s: float = 300.0,
    restart_timeout_s: float = 30.0,
    probe_timeout_s: float = 10.0,
    shutdown_event: asyncio.Event | None = None,
    profile: str = "",
) -> None:
    """Probe an excluded provider with exponential backoff until recovery.

    Runs as one long-lived asyncio task per excluded provider and exits
    when any of the following happens:

    - a recovery probe succeeds → the provider is restored to the chain;
    - ``shutdown_event`` is set → graceful exit;
    - the provider is no longer excluded (restored externally).

    On entry it attempts one restart (if ``restart_command`` is
    configured), then waits ``initial_interval_s`` before the first
    probe, doubling the interval after each failed probe up to the
    ``max_interval_s`` cap.

    Args:
        provider_config: Provider to probe (and possibly restart).
        orchestrator: Shared owner of the excluded-provider state.
        record_fn: Optional callable fed a successful probe so the
            backend health state machine snaps back to HEALTHY; called
            as ``record_fn(name, success=True, threshold=...)``.
        health_threshold: Passed through to ``record_fn``.
        initial_interval_s: First backoff interval, in seconds.
        max_interval_s: Backoff ceiling, in seconds.
        restart_timeout_s: Timeout for the restart command.
        probe_timeout_s: Timeout for each recovery probe.
        shutdown_event: Signals graceful termination when set.
        profile: Profile name, for restore bookkeeping/logging.
    """
    from coderouter.guards.continuous_probe import probe_one

    _shutdown = shutdown_event or asyncio.Event()
    provider_name = provider_config.name
    interval = initial_interval_s

    # Step 1: attempt restart if configured. try_restart blocks on
    # subprocess.run, so push it into the default executor to keep the
    # event loop responsive.
    if provider_config.restart_command:
        loop = asyncio.get_running_loop()
        await loop.run_in_executor(
            None,
            lambda: orchestrator.try_restart(
                provider_config,
                timeout_s=restart_timeout_s,
            ),
        )

    # Step 2: exponential backoff recovery probes.
    while not _shutdown.is_set():
        # External code (e.g. a successful live request) may already
        # have restored the provider — nothing left to do.
        if not orchestrator.is_excluded(provider_name):
            return

        # Sleep for the current interval, waking early on shutdown.
        try:
            await asyncio.wait_for(_shutdown.wait(), timeout=interval)
            return  # shutdown signalled
        except (TimeoutError, asyncio.TimeoutError):
            # Fix: on Python 3.10, asyncio.wait_for raises
            # asyncio.TimeoutError, which is NOT the builtin
            # TimeoutError (the two were unified only in 3.11).
            # Catching the builtin alone would let the timeout escape
            # and kill this task on 3.10. On 3.11+ both names are the
            # same class, so behavior there is unchanged.
            pass  # normal: interval elapsed

        # Re-check after sleeping; probe only if still excluded.
        if not orchestrator.is_excluded(provider_name):
            return

        result = await probe_one(provider_config, timeout_s=probe_timeout_s)

        if result.success:
            # Provider is back! Restore it to its chain position.
            orchestrator.on_recovered(provider_name, profile=profile)

            # Feed the success into the backend health state machine so
            # it snaps back to HEALTHY. Best-effort: a bookkeeping error
            # must never undo the restore.
            if record_fn is not None:
                with contextlib.suppress(Exception):
                    record_fn(  # type: ignore[operator]
                        provider_name,
                        success=True,
                        threshold=health_threshold,
                    )

            log_self_healing_recovery_probe(
                logger,
                provider=provider_name,
                success=True,
                next_interval_s=0,
                latency_ms=result.latency_ms,
            )
            return

        # Probe failed — double the interval, capped at max_interval_s.
        next_interval = min(interval * 2, max_interval_s)
        log_self_healing_recovery_probe(
            logger,
            provider=provider_name,
            success=False,
            next_interval_s=next_interval,
            latency_ms=result.latency_ms,
        )
        interval = next_interval
408
+
409
+
410
+ __all__ = [
411
+ "SelfHealingOrchestrator",
412
+ "recovery_probe_loop",
413
+ ]
@@ -148,6 +148,44 @@ class ToolLoopDetection:
148
148
  """
149
149
 
150
150
 
151
@dataclass(frozen=True)
class ToolCountExceeded:
    """Outcome of a total tool-call count check.

    Produced by :func:`check_total_tool_count` when the conversation's
    cumulative tool_use count passes the configured hard cap. This is a
    safety valve against runaway agents that call many *different* tools
    without looping (a pattern L3's identical-streak detector misses).
    """

    # How many tool_use blocks the conversation currently contains.
    total_count: int
    # The configured ceiling that was exceeded.
    max_allowed: int
165
+
166
+
167
class ToolCountExceededError(CodeRouterError):
    """Raised when the cumulative tool-call count passes the hard cap.

    The ingress layer translates this into a structured ``400`` response
    with ``error: "tool_count_exceeded"``, so clients receive a
    programmable failure rather than a 5xx.
    """

    def __init__(self, exceeded: ToolCountExceeded, profile: str) -> None:
        message = (
            f"tool count exceeded on profile={profile!r}: "
            f"{exceeded.total_count} tool calls exceed the limit of "
            f"{exceeded.max_allowed}."
        )
        super().__init__(message)
        # Keep the structured detection and profile around so the
        # ingress can build the JSON error body from them.
        self.exceeded = exceeded
        self.profile = profile
187
+
188
+
151
189
  class ToolLoopBreakError(CodeRouterError):
152
190
  """Raised when a loop is detected and the configured action is ``break``.
153
191
 
@@ -337,3 +375,36 @@ def inject_loop_break_hint(
337
375
  new_system = [*list(system), {"type": "text", "text": hint}]
338
376
 
339
377
  return request.model_copy(update={"system": new_system})
378
+
379
+
380
+ # ---------------------------------------------------------------------------
381
+ # Total tool-call count hard cap (v2.2)
382
+ # ---------------------------------------------------------------------------
383
+
384
+
385
+ def check_total_tool_count(
386
+ request: AnthropicRequest,
387
+ *,
388
+ max_calls: int,
389
+ ) -> ToolCountExceeded | None:
390
+ """Return a detection if total tool_use count exceeds ``max_calls``.
391
+
392
+ Unlike :func:`detect_tool_loop` which catches *identical*
393
+ consecutive calls, this is a blunt hard cap on the cumulative
394
+ number of tool_use blocks across the entire conversation. It
395
+ catches runaway agents that cycle through many *different* tools
396
+ without ever repeating the same (name, args) pair — a pattern
397
+ that the streak-based L3 detector cannot see.
398
+
399
+ Default ceiling is 50 (configurable per-profile). This is
400
+ deliberately more permissive than Unsloth Studio's 25 — Claude
401
+ Code's long-running agent sessions routinely reach 25+ tool calls
402
+ in normal operation.
403
+
404
+ Returns ``None`` when the count is within limits.
405
+ """
406
+ history = _extract_tool_use_history(request)
407
+ count = len(history)
408
+ if count > max_calls:
409
+ return ToolCountExceeded(total_count=count, max_allowed=max_calls)
410
+ return None
@@ -28,7 +28,7 @@ from typing import Any
28
28
  from fastapi import APIRouter, Header, HTTPException, Request
29
29
  from fastapi.responses import JSONResponse, StreamingResponse
30
30
 
31
- from coderouter.guards.tool_loop import ToolLoopBreakError
31
+ from coderouter.guards.tool_loop import ToolCountExceededError, ToolLoopBreakError
32
32
  from coderouter.logging import get_logger
33
33
  from coderouter.routing import (
34
34
  FallbackEngine,
@@ -173,6 +173,18 @@ async def messages(
173
173
  status_code=400,
174
174
  detail=_tool_loop_break_detail(exc),
175
175
  ) from exc
176
+ except ToolCountExceededError as exc:
177
+ # v2.2: total tool-call count exceeded — surface as a 400.
178
+ raise HTTPException(
179
+ status_code=400,
180
+ detail={
181
+ "error": "tool_count_exceeded",
182
+ "message": str(exc),
183
+ "total_count": exc.exceeded.total_count,
184
+ "max_allowed": exc.exceeded.max_allowed,
185
+ "profile": exc.profile,
186
+ },
187
+ ) from exc
176
188
 
177
189
  # v2.0-G: collect drift header after engine dispatch.
178
190
  drift_severity = engine.last_drift_severity
@@ -239,6 +251,24 @@ async def _anthropic_sse_iterator(
239
251
  },
240
252
  )
241
253
  yield _format_anthropic_sse(err_event)
254
+ except ToolCountExceededError as exc:
255
+ # v2.2: streaming counterpart of the tool-count-exceeded 400.
256
+ err_event = AnthropicStreamEvent(
257
+ type="error",
258
+ data={
259
+ "type": "error",
260
+ "error": {
261
+ "type": "invalid_request_error",
262
+ "message": str(exc),
263
+ "tool_count": {
264
+ "total_count": exc.exceeded.total_count,
265
+ "max_allowed": exc.exceeded.max_allowed,
266
+ "profile": exc.profile,
267
+ },
268
+ },
269
+ },
270
+ )
271
+ yield _format_anthropic_sse(err_event)
242
272
  except MidStreamError as exc:
243
273
  # v0.3-B: a provider failed AFTER emitting at least one event. We
244
274
  # cannot fall back (client already received partial content), so
coderouter/ingress/app.py CHANGED
@@ -73,6 +73,66 @@ def create_app(config_path: str | None = None) -> FastAPI:
73
73
  # potentially sub-optimally for the agentic harness.
74
74
  check_claude_code_chain_suitability(config, logger=logger)
75
75
 
76
+ # v2.0-K: attach persistent state store + audit/request log if configured.
77
+ state_store = None
78
+ audit_handler = None
79
+ request_log_handler = None
80
+ if config.state_dir:
81
+ import logging as _logging
82
+ from pathlib import Path
83
+
84
+ from coderouter.state.audit_log import AuditLogHandler
85
+ from coderouter.state.store import StateStore
86
+
87
+ state_path = Path(config.state_dir).expanduser()
88
+ state_store = StateStore(state_path / "coderouter.db")
89
+ engine.attach_state_store(state_store)
90
+
91
+ # Restore MetricsCollector state from the store.
92
+ from coderouter.metrics import get_collector
93
+
94
+ collector = get_collector()
95
+ if collector is not None:
96
+ metrics_state = state_store.get("metrics", "state")
97
+ if metrics_state is not None:
98
+ with contextlib.suppress(Exception):
99
+ collector.load_state(metrics_state) # type: ignore[arg-type]
100
+
101
+ logger.info(
102
+ "state-store-attached",
103
+ extra={"state_dir": str(state_path)},
104
+ )
105
+
106
+ if config.audit_log == "active":
107
+ audit_handler = AuditLogHandler(
108
+ state_path / "audit.jsonl",
109
+ max_bytes=config.audit_log_max_bytes,
110
+ )
111
+ _logging.getLogger().addHandler(audit_handler)
112
+ logger.info(
113
+ "audit-log-started",
114
+ extra={
115
+ "path": str(state_path / "audit.jsonl"),
116
+ "max_bytes": config.audit_log_max_bytes,
117
+ },
118
+ )
119
+
120
+ if config.request_log == "active":
121
+ from coderouter.state.request_log import RequestLogHandler
122
+
123
+ request_log_handler = RequestLogHandler(
124
+ state_path / "requests.jsonl",
125
+ max_bytes=config.request_log_max_bytes,
126
+ )
127
+ _logging.getLogger().addHandler(request_log_handler)
128
+ logger.info(
129
+ "request-log-started",
130
+ extra={
131
+ "path": str(state_path / "requests.jsonl"),
132
+ "max_bytes": config.request_log_max_bytes,
133
+ },
134
+ )
135
+
76
136
  # v2.0-I: launch continuous probe background task if configured.
77
137
  probe_task = None
78
138
  shutdown_event = None
@@ -111,6 +171,36 @@ def create_app(config_path: str | None = None) -> FastAPI:
111
171
  with contextlib.suppress(Exception):
112
172
  await probe_task
113
173
 
174
+ # v2.0-J: graceful shutdown of recovery probe tasks.
175
+ with contextlib.suppress(Exception):
176
+ await engine.shutdown_recovery_probes()
177
+
178
+ # v2.0-K: persist state and close audit log on shutdown.
179
+ if state_store is not None:
180
+ with contextlib.suppress(Exception):
181
+ engine.save_all_state()
182
+ # Save MetricsCollector state.
183
+ from coderouter.metrics import get_collector
184
+
185
+ collector = get_collector()
186
+ if collector is not None:
187
+ with contextlib.suppress(Exception):
188
+ state_store.put("metrics", "state", collector.save_state())
189
+ with contextlib.suppress(Exception):
190
+ state_store.close()
191
+ if audit_handler is not None:
192
+ import logging as _logging
193
+
194
+ with contextlib.suppress(Exception):
195
+ _logging.getLogger().removeHandler(audit_handler)
196
+ audit_handler.close()
197
+ if request_log_handler is not None:
198
+ import logging as _logging
199
+
200
+ with contextlib.suppress(Exception):
201
+ _logging.getLogger().removeHandler(request_log_handler)
202
+ request_log_handler.close()
203
+
114
204
  logger.info("coderouter-shutdown")
115
205
 
116
206
  app = FastAPI(