coderouter_cli-2.0.0-py3-none-any.whl → coderouter_cli-2.1.0-py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- coderouter/config/schemas.py +103 -0
- coderouter/guards/continuous_probe.py +349 -0
- coderouter/guards/drift_actions.py +111 -0
- coderouter/guards/drift_detection.py +308 -0
- coderouter/ingress/anthropic_routes.py +75 -11
- coderouter/ingress/app.py +39 -0
- coderouter/logging.py +262 -0
- coderouter/metrics/collector.py +93 -0
- coderouter/metrics/prometheus.py +141 -0
- coderouter/routing/adaptive.py +23 -0
- coderouter/routing/fallback.py +285 -4
- {coderouter_cli-2.0.0.dist-info → coderouter_cli-2.1.0.dist-info}/METADATA +7 -6
- {coderouter_cli-2.0.0.dist-info → coderouter_cli-2.1.0.dist-info}/RECORD +16 -13
- {coderouter_cli-2.0.0.dist-info → coderouter_cli-2.1.0.dist-info}/WHEEL +0 -0
- {coderouter_cli-2.0.0.dist-info → coderouter_cli-2.1.0.dist-info}/entry_points.txt +0 -0
- {coderouter_cli-2.0.0.dist-info → coderouter_cli-2.1.0.dist-info}/licenses/LICENSE +0 -0
coderouter/config/schemas.py
CHANGED
@@ -531,6 +531,73 @@ class FallbackChain(BaseModel):
         ),
     )

+    # ------------------------------------------------------------------
+    # v2.0-G (L4): Drift detection — response quality degradation guard
+    # ------------------------------------------------------------------
+    #
+    # Long-running sessions on local LLMs can suffer gradual quality
+    # decay (KV cache pressure, thermal throttling, VRAM fragmentation)
+    # where the model "succeeds" but produces empty/short/toolless
+    # responses. This guard observes response quality signals in a
+    # rolling window and detects statistical drift.
+    #
+    # Four actions:
+    # * ``off`` — no detection (default).
+    # * ``warn`` — emit structured log + response header.
+    # * ``promote`` — ``warn`` + demote drifted provider in chain.
+    # * ``reload`` — ``promote`` + attempt KV cache flush (Ollama).
+    drift_detection_action: Literal["off", "warn", "promote", "reload"] = Field(
+        default="off",
+        description=(
+            "v2.0-G (L4): action on response quality drift detection. "
+            "``off`` (default) disables drift detection. ``warn`` emits "
+            "a log and response header. ``promote`` additionally demotes "
+            "the drifted provider in the chain. ``reload`` attempts to "
+            "flush the provider's KV cache (Ollama only) before promoting."
+        ),
+    )
+    drift_detection_window_size: int = Field(
+        default=20,
+        ge=4,
+        le=200,
+        description=(
+            "v2.0-G (L4): number of recent responses to keep in the "
+            "rolling observation window per provider. Larger windows "
+            "are more robust to noise but slower to detect drift."
+        ),
+    )
+    drift_detection_cooldown_s: int = Field(
+        default=300,
+        ge=10,
+        le=3600,
+        description=(
+            "v2.0-G (L4): seconds after a promote/reload action before "
+            "the drifted provider's rank is reset for recovery check. "
+            "Default 300s (5 min) gives the model time to stabilize."
+        ),
+    )
+    drift_detection_sensitivity: Literal["low", "normal", "high"] = Field(
+        default="normal",
+        description=(
+            "v2.0-G (L4): threshold preset for drift signals. "
+            "``low`` tolerates more degradation before triggering, "
+            "``high`` is stricter (fewer bad responses needed)."
+        ),
+    )
+
+    # --- v2.0-H (L6): Mid-stream partial stitching --------------------------
+    # * ``off`` — discard partial content on mid-stream failure (legacy).
+    # * ``surface`` — return partial content as a truncated-but-valid response.
+    partial_stitch_action: Literal["off", "surface"] = Field(
+        default="off",
+        description=(
+            "v2.0-H (L6): action when a streaming response fails mid-stream. "
+            "``off`` discards partial content (legacy error event). "
+            "``surface`` returns accumulated text as a graceful stream "
+            "termination with a ``coderouter_partial`` metadata event."
+        ),
+    )
+

 # ---------------------------------------------------------------------------
 # v1.6-A: auto_router — declarative request-body classifier
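A minimal standalone sketch (not the real FallbackChain model) mirroring the new fields above, to show what the declared bounds accept and reject under pydantic v2::

    from typing import Literal
    from pydantic import BaseModel, Field, ValidationError

    class DriftSettings(BaseModel):
        drift_detection_action: Literal["off", "warn", "promote", "reload"] = "off"
        drift_detection_window_size: int = Field(default=20, ge=4, le=200)
        drift_detection_cooldown_s: int = Field(default=300, ge=10, le=3600)
        drift_detection_sensitivity: Literal["low", "normal", "high"] = "normal"
        partial_stitch_action: Literal["off", "surface"] = "off"

    DriftSettings(drift_detection_action="promote", drift_detection_window_size=30)  # accepted
    try:
        DriftSettings(drift_detection_window_size=2)  # below ge=4, rejected
    except ValidationError as exc:
        print(exc.errors()[0]["type"])  # greater_than_equal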
@@ -768,6 +835,42 @@ class CodeRouterConfig(BaseModel):
         ),
     )

+    # v2.0-I: Continuous probing — background health checks for idle periods.
+    continuous_probe: Literal["off", "active"] = Field(
+        default="off",
+        description=(
+            "v2.0-I: enable background health probes. 'active' starts a "
+            "background task that periodically sends 1-token requests to "
+            "each provider, feeding results into the L5 backend health "
+            "state machine. 'off' = no probing (backward-compatible default)."
+        ),
+    )
+    probe_interval_s: float = Field(
+        default=60.0,
+        ge=5.0,
+        le=3600.0,
+        description=(
+            "v2.0-I: seconds between probe rounds. Lower = faster detection "
+            "but more probe traffic. 60s is a good balance for local models."
+        ),
+    )
+    probe_paid: bool = Field(
+        default=False,
+        description=(
+            "v2.0-I: whether to probe providers marked ``paid: true``. "
+            "Default false protects operators from accidental API charges."
+        ),
+    )
+    probe_timeout_s: float = Field(
+        default=10.0,
+        ge=1.0,
+        le=60.0,
+        description=(
+            "v2.0-I: per-provider timeout for probe requests. A provider "
+            "that doesn't respond within this window is recorded as failed."
+        ),
+    )
+
     @model_validator(mode="after")
     def _check_default_profile_exists(self) -> CodeRouterConfig:
         """v0.6-A: surface a typo'd ``default_profile`` at load time.
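A hedged sketch of how these settings could drive the probe task at startup. The actual wiring lives in coderouter/ingress/app.py (changed in this release but not shown here) and may differ; ``cfg``, ``providers`` and ``record_fn`` are placeholders::

    import asyncio
    from coderouter.guards.continuous_probe import probe_loop

    async def start_probing(cfg, providers, record_fn):
        shutdown = asyncio.Event()
        task = None
        if cfg.continuous_probe == "active":
            task = asyncio.create_task(
                probe_loop(
                    providers,
                    record_fn=record_fn,
                    interval_s=cfg.probe_interval_s,
                    timeout_s=cfg.probe_timeout_s,
                    probe_paid=cfg.probe_paid,
                    shutdown_event=shutdown,
                )
            )
        # On lifespan exit: shutdown.set() and await task for a clean stop.
        return shutdown, task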
coderouter/guards/continuous_probe.py
ADDED
@@ -0,0 +1,349 @@
+"""Continuous health probing (v2.0-I).
+
+Background task that periodically sends minimal 1-token requests to each
+configured provider, feeding the results into the L5 backend health
+state machine. Detects provider crashes during idle periods (no user
+traffic) so the chain resolver knows to skip/demote a dead backend
+before the next real request hits it.
+
+Architecture
+============
+
+::
+
+    lifespan startup
+    └─ asyncio.create_task(probe_loop(...))
+
+    probe_loop:
+        while not shutdown:
+            sleep(interval_s)
+            for provider in providers:
+                result = await probe_one(provider)
+                backend_health.record_attempt(...)
+                emit log + metrics
+
+Design choices
+==============
+
+- **1-token completion** rather than ``/api/version`` or ``/api/tags``
+  because version endpoints are Ollama-only; a 1-token generate confirms
+  the entire model-serving pipeline is operational (model loaded, KV
+  allocated, inference works).
+- **Sequential** probing (not parallel) to avoid hammering backends and
+  to keep the implementation trivially correct without gather/semaphore.
+- **No new dependency** — uses httpx (already a runtime dep) + asyncio
+  (stdlib).
+- **Graceful shutdown** via an ``asyncio.Event`` set by the lifespan
+  exit path. The loop checks the event each iteration and breaks cleanly.
+"""
+
+from __future__ import annotations
+
+import asyncio
+import contextlib
+import time
+from dataclasses import dataclass, field
+from typing import Any
+
+import httpx
+
+from coderouter.config.schemas import ProviderConfig
+from coderouter.logging import (
+    get_logger,
+    log_probe_capabilities_drift,
+    log_probe_completed,
+    log_probe_round_completed,
+)
+
+logger = get_logger(__name__)
+
+
+# ---------------------------------------------------------------------------
+# ProbeResult
+# ---------------------------------------------------------------------------
+
+
+@dataclass(slots=True)
+class ProbeResult:
+    """Outcome of a single provider probe."""
+
+    provider: str
+    success: bool
+    latency_ms: float
+    error: str | None = None
+    model_name: str | None = None
+    timestamp: float = field(default_factory=time.time)
+
+
+# ---------------------------------------------------------------------------
+# probe_one: single-provider 1-token probe
+# ---------------------------------------------------------------------------
+
+
+async def probe_one(
+    provider: ProviderConfig,
+    *,
+    timeout_s: float = 10.0,
+) -> ProbeResult:
+    """Send a minimal 1-token completion request and measure response.
+
+    For ``kind: openai_compat``: POST /v1/chat/completions
+    For ``kind: anthropic``: POST /v1/messages
+
+    The request asks for ``max_tokens: 1`` so the probe is as cheap as
+    possible (a single output token is generated, exercising the full
+    model pipeline without producing meaningful output).
+
+    Never raises — all failures are captured in ProbeResult(success=False).
+    """
+    import os
+
+    start = time.monotonic()
+    provider_name = provider.name
+    base_url = str(provider.base_url).rstrip("/")
+
+    # Resolve API key from env (same logic as the adapters)
+    headers: dict[str, str] = {}
+    if provider.api_key_env:
+        api_key = os.environ.get(provider.api_key_env, "")
+        if api_key:
+            if provider.kind == "anthropic":
+                headers["x-api-key"] = api_key
+                headers["anthropic-version"] = "2023-06-01"
+            else:
+                headers["Authorization"] = f"Bearer {api_key}"
+
+    try:
+        async with httpx.AsyncClient(timeout=timeout_s) as client:
+            if provider.kind == "anthropic":
+                url = f"{base_url}/v1/messages"
+                body: dict[str, Any] = {
+                    "model": provider.model,
+                    "max_tokens": 1,
+                    "messages": [{"role": "user", "content": "hi"}],
+                }
+                resp = await client.post(url, json=body, headers=headers)
+            else:
+                # openai_compat: Ollama, LM Studio, OpenRouter, etc.
+                url = f"{base_url}/chat/completions"
+                body = {
+                    "model": provider.model,
+                    "max_tokens": 1,
+                    "messages": [{"role": "user", "content": "hi"}],
+                }
+                resp = await client.post(url, json=body, headers=headers)
+
+        latency_ms = (time.monotonic() - start) * 1000
+
+        if resp.status_code >= 400:
+            return ProbeResult(
+                provider=provider_name,
+                success=False,
+                latency_ms=latency_ms,
+                error=f"HTTP {resp.status_code}: {resp.text[:200]}",
+            )
+
+        # Extract model name from response (for capabilities drift check)
+        model_name: str | None = None
+        try:
+            data = resp.json()
+            model_name = data.get("model")
+        except Exception:
+            pass
+
+        return ProbeResult(
+            provider=provider_name,
+            success=True,
+            latency_ms=latency_ms,
+            model_name=model_name,
+        )
+
+    except httpx.TimeoutException:
+        latency_ms = (time.monotonic() - start) * 1000
+        return ProbeResult(
+            provider=provider_name,
+            success=False,
+            latency_ms=latency_ms,
+            error=f"timeout after {timeout_s}s",
+        )
+    except Exception as exc:
+        latency_ms = (time.monotonic() - start) * 1000
+        return ProbeResult(
+            provider=provider_name,
+            success=False,
+            latency_ms=latency_ms,
+            error=str(exc)[:200],
+        )
+
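A hedged usage sketch: probe a local OpenAI-compatible endpoint once. The ProviderConfig fields shown are the ones probe_one reads; the real schema may require additional fields, and the name, URL and model below are purely illustrative::

    import asyncio
    from coderouter.config.schemas import ProviderConfig
    from coderouter.guards.continuous_probe import probe_one

    async def main() -> None:
        provider = ProviderConfig(
            name="local-ollama",
            kind="openai_compat",
            base_url="http://localhost:11434/v1",
            model="qwen2.5-coder:14b",
        )
        result = await probe_one(provider, timeout_s=5.0)
        print(result.success, round(result.latency_ms), result.error)

    asyncio.run(main())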
+# ---------------------------------------------------------------------------
+# capabilities drift detection (Phase 3)
+# ---------------------------------------------------------------------------
+
+
+@dataclass(slots=True)
+class DriftReport:
+    """Report of a model-name mismatch between config and probe response."""
+
+    provider: str
+    configured_model: str
+    observed_model: str
+    in_registry: bool
+
+
+def check_probe_drift(
+    provider: ProviderConfig,
+    observed_model: str | None,
+    *,
+    registry: Any = None,
+) -> DriftReport | None:
+    """Compare the probe response model name against the configured model.
+
+    Returns a :class:`DriftReport` when the observed model differs from
+    ``provider.model``, or ``None`` when they match (or when no model
+    name was returned by the probe). The ``registry`` argument is an
+    optional :class:`CapabilityRegistry` instance used to check whether
+    the observed model has a known entry — when it doesn't, the report
+    sets ``in_registry=False`` as an extra signal for the operator.
+
+    Never raises — a missing registry or lookup error just defaults to
+    ``in_registry=True`` (conservative, avoids false positives).
+    """
+    if not observed_model:
+        return None
+
+    configured = provider.model or ""
+
+    # Normalize: some backends return the model with a prefix or
+    # formatting variation. We compare case-sensitively but strip
+    # whitespace.
+    if observed_model.strip() == configured.strip():
+        return None
+
+    # Check registry for the observed model
+    in_registry = True
+    if registry is not None:
+        try:
+            resolved = registry.lookup(kind=provider.kind, model=observed_model)
+            # If every resolved field is None, the model is unknown
+            if (
+                resolved.thinking is None
+                and resolved.tools is None
+                and resolved.max_context_tokens is None
+                and resolved.claude_code_suitability is None
+                and resolved.cache_control is None
+            ):
+                in_registry = False
+        except Exception:
+            pass  # defensive — never crash the probe loop
+
+    return DriftReport(
+        provider=provider.name,
+        configured_model=configured,
+        observed_model=observed_model,
+        in_registry=in_registry,
+    )
+
+
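For illustration, check_probe_drift only reads ``name``, ``kind`` and ``model`` from the provider object, so a plain namespace can stand in for ProviderConfig in this sketch (model names illustrative)::

    from types import SimpleNamespace
    from coderouter.guards.continuous_probe import check_probe_drift

    p = SimpleNamespace(name="local-ollama", kind="openai_compat", model="qwen2.5-coder:14b")

    assert check_probe_drift(p, None) is None                 # probe returned no model name
    assert check_probe_drift(p, "qwen2.5-coder:14b") is None  # match, no drift
    report = check_probe_drift(p, "qwen2.5-coder:7b")         # mismatch, DriftReport
    print(report.observed_model, report.in_registry)          # no registry given, so True (conservative)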
+# ---------------------------------------------------------------------------
+# probe_loop: background task
+# ---------------------------------------------------------------------------
+
+
+async def probe_loop(
+    providers: list[ProviderConfig],
+    *,
+    record_fn: Any = None,
+    interval_s: float = 60.0,
+    timeout_s: float = 10.0,
+    probe_paid: bool = False,
+    shutdown_event: asyncio.Event | None = None,
+    health_threshold: int = 3,
+    registry: Any = None,
+) -> None:
+    """Run continuous health probes in an infinite loop until shutdown.
+
+    Args:
+        providers: list of provider configs to probe.
+        record_fn: callable(provider_name, *, success, threshold) that
+            feeds the backend health state machine. When None, results
+            are only logged (useful for testing).
+        interval_s: seconds to sleep between probe rounds.
+        timeout_s: per-provider probe timeout.
+        probe_paid: if False, providers with ``paid=True`` are skipped.
+        shutdown_event: set this event to stop the loop gracefully.
+        health_threshold: consecutive-failure threshold passed to record_fn.
+        registry: optional CapabilityRegistry for model drift detection.
+    """
+    _shutdown = shutdown_event or asyncio.Event()
+
+    # Initial delay: let the server finish startup before first probe round.
+    try:
+        await asyncio.wait_for(_shutdown.wait(), timeout=interval_s)
+        return  # shutdown during initial delay
+    except TimeoutError:
+        pass  # normal: timeout means the delay elapsed without shutdown
+
+    while not _shutdown.is_set():
+        probed = 0
+        failures = 0
+
+        for provider in providers:
+            if _shutdown.is_set():
+                break
+            if provider.paid and not probe_paid:
+                continue
+
+            result = await probe_one(provider, timeout_s=timeout_s)
+            probed += 1
+
+            if not result.success:
+                failures += 1
+
+            # Feed into backend health state machine
+            if record_fn is not None:
+                with contextlib.suppress(Exception):
+                    record_fn(
+                        result.provider,
+                        success=result.success,
+                        threshold=health_threshold,
+                    )
+
+            # Log individual result
+            log_probe_completed(
+                logger,
+                provider=result.provider,
+                success=result.success,
+                latency_ms=result.latency_ms,
+                error=result.error,
+                model_name=result.model_name,
+            )
+
+            # Check for model-capabilities drift on success
+            if result.success and result.model_name:
+                drift = check_probe_drift(
+                    provider, result.model_name, registry=registry
+                )
+                if drift is not None:
+                    log_probe_capabilities_drift(
+                        logger,
+                        provider=drift.provider,
+                        configured_model=drift.configured_model,
+                        observed_model=drift.observed_model,
+                        in_registry=drift.in_registry,
+                    )
+
+        # Log round summary
+        if probed > 0:
+            log_probe_round_completed(
+                logger,
+                providers_probed=probed,
+                failures=failures,
+            )
+
+        # Wait for next interval or shutdown
+        try:
+            await asyncio.wait_for(_shutdown.wait(), timeout=interval_s)
+            break  # shutdown signaled
+        except TimeoutError:
+            pass  # normal: sleep elapsed, start next round
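A sketch of the shutdown contract described in the module docstring: run the loop with a stub ``record_fn`` and a short interval, then set the event so the loop exits at its next wait (``providers`` and the stub are assumptions for the example)::

    import asyncio
    from coderouter.guards.continuous_probe import probe_loop

    async def run_briefly(providers) -> list[tuple]:
        calls: list[tuple] = []

        def record(name: str, *, success: bool, threshold: int) -> None:
            calls.append((name, success, threshold))

        stop = asyncio.Event()
        task = asyncio.create_task(
            probe_loop(providers, record_fn=record, interval_s=0.2, shutdown_event=stop)
        )
        await asyncio.sleep(1.0)  # let a couple of rounds run
        stop.set()                # loop breaks at the next shutdown check
        await task
        return calls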
coderouter/guards/drift_actions.py
ADDED
@@ -0,0 +1,111 @@
+"""Drift detection corrective actions (v2.0-G, L4).
+
+Currently the only non-trivial action is ``reload`` — flush the KV cache
+on Ollama-shape providers by sending a ``keep_alive=0`` request to unload
+the model, forcing a fresh context window on the next request.
+
+The ``promote`` action is handled directly in fallback.py via
+``AdaptiveAdjuster.demote()``.
+
+Architecture
+============
+
+All functions are **best-effort**: failures are logged but never raised.
+The engine continues regardless — the worst case is that the model stays
+loaded with its existing (potentially degraded) KV cache and the adaptive
+demotion still routes traffic elsewhere until cooldown expires.
+"""
+
+from __future__ import annotations
+
+import httpx
+
+from coderouter.config.schemas import ProviderConfig
+from coderouter.logging import get_logger, log_drift_reload_attempted
+
+logger = get_logger(__name__)
+
+
+def _is_ollama_shape(provider_config: ProviderConfig) -> bool:
+    """Return True if the provider looks like Ollama (port 11434 or num_ctx declared)."""
+    if provider_config.kind != "openai_compat":
+        return False
+    base_url = str(provider_config.base_url)
+    if ":11434" in base_url:
+        return True
+    extra = provider_config.extra_body or {}
+    options = extra.get("options")
+    return isinstance(options, dict) and "num_ctx" in options
+
+
+def _ollama_base_url(provider_config: ProviderConfig) -> str:
+    """Derive the Ollama native API base URL from the OpenAI-compat base_url.
+
+    Typical patterns:
+    - ``http://localhost:11434/v1`` → ``http://localhost:11434``
+    - ``http://host:11434/v1/`` → ``http://host:11434``
+    """
+    url = str(provider_config.base_url).rstrip("/")
+    # Strip the /v1 suffix to get the Ollama native API root
+    if url.endswith("/v1"):
+        url = url[:-3]
+    return url
+
+
+async def attempt_reload(provider_config: ProviderConfig) -> bool:
+    """Attempt to flush the Ollama KV cache by unloading the model.
+
+    Sends ``POST /api/generate`` with ``keep_alive: "0"`` to the Ollama
+    native API. This causes Ollama to unload the model from memory; the
+    next inference request will reload it with a fresh KV cache.
+
+    Parameters
+    ----------
+    provider_config:
+        The provider's configuration from providers.yaml. Must be
+        Ollama-shape (``kind: openai_compat`` + port 11434 or num_ctx).
+
+    Returns
+    -------
+    True if the unload request succeeded (HTTP 200), False otherwise.
+    Non-Ollama providers return False immediately (no-op).
+    """
+    if not _is_ollama_shape(provider_config):
+        logger.debug(
+            "drift-reload-skip",
+            extra={
+                "provider": provider_config.name,
+                "reason": "not-ollama-shape",
+            },
+        )
+        return False
+
+    base_url = _ollama_base_url(provider_config)
+    model = provider_config.model
+
+    try:
+        async with httpx.AsyncClient(timeout=10.0) as client:
+            resp = await client.post(
+                f"{base_url}/api/generate",
+                json={
+                    "model": model,
+                    "keep_alive": 0,
+                },
+            )
+        success = resp.status_code == 200
+    except (httpx.HTTPError, OSError) as exc:
+        logger.debug(
+            "drift-reload-http-error",
+            extra={
+                "provider": provider_config.name,
+                "error": str(exc)[:200],
+            },
+        )
+        success = False
+
+    log_drift_reload_attempted(
+        logger,
+        provider=provider_config.name,
+        success=success,
+    )
+    return success
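The same unload request that attempt_reload sends can be reproduced standalone against a local Ollama using only httpx (the model name below is illustrative)::

    import asyncio
    import httpx

    async def unload(model: str, base_url: str = "http://localhost:11434") -> bool:
        # POST /api/generate with keep_alive=0 asks Ollama to unload the model;
        # the next inference request reloads it with a fresh KV cache.
        async with httpx.AsyncClient(timeout=10.0) as client:
            resp = await client.post(
                f"{base_url}/api/generate",
                json={"model": model, "keep_alive": 0},
            )
        return resp.status_code == 200

    print(asyncio.run(unload("qwen2.5-coder:14b")))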
|