coderouter-cli 1.10.1__py3-none-any.whl → 2.0.0__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- coderouter/cli_stats.py +48 -1
- coderouter/config/schemas.py +86 -0
- coderouter/data/model-capabilities.yaml +79 -0
- coderouter/guards/context_budget.py +376 -0
- coderouter/ingress/anthropic_routes.py +19 -2
- coderouter/logging.py +89 -0
- coderouter/metrics/collector.py +49 -2
- coderouter/metrics/prometheus.py +71 -0
- coderouter/routing/auto_router.py +2 -42
- coderouter/routing/fallback.py +196 -0
- coderouter/token_estimation.py +161 -0
- {coderouter_cli-1.10.1.dist-info → coderouter_cli-2.0.0.dist-info}/METADATA +10 -8
- {coderouter_cli-1.10.1.dist-info → coderouter_cli-2.0.0.dist-info}/RECORD +16 -14
- {coderouter_cli-1.10.1.dist-info → coderouter_cli-2.0.0.dist-info}/WHEEL +0 -0
- {coderouter_cli-1.10.1.dist-info → coderouter_cli-2.0.0.dist-info}/entry_points.txt +0 -0
- {coderouter_cli-1.10.1.dist-info → coderouter_cli-2.0.0.dist-info}/licenses/LICENSE +0 -0
coderouter/cli_stats.py
CHANGED

@@ -112,6 +112,10 @@ class GatesSummary:
     degraded_breakdown: dict[str, int]  # capability → count
     filters_applied_total: int
     filters_breakdown: dict[str, int]  # filter name → count
+    # v2.0-F (L1): context budget guard summary
+    context_budget_warnings: int = 0
+    context_budget_trims: int = 0
+    context_budget_latest_ratio: dict[str, float] | None = None


 @dataclass(frozen=True)
@@ -252,6 +256,8 @@ def build_gates_summary(snapshot: dict[str, Any]) -> GatesSummary:
     )
     degraded_breakdown = dict(counters.get("capability_degraded", {}) or {})
     filters_breakdown = dict(counters.get("output_filter_applied", {}) or {})
+    # v2.0-F (L1): context budget guard counters
+    ctx_budget_latest = counters.get("context_budget_latest_ratio") or {}
     return GatesSummary(
         total_requests=total_requests,
         total_failed=total_failed,
@@ -261,6 +267,13 @@ def build_gates_summary(snapshot: dict[str, Any]) -> GatesSummary:
         degraded_breakdown=degraded_breakdown,
         filters_applied_total=sum(filters_breakdown.values()),
         filters_breakdown=filters_breakdown,
+        context_budget_warnings=int(
+            counters.get("context_budget_warnings_total", 0)
+        ),
+        context_budget_trims=int(
+            counters.get("context_budget_trims_total", 0)
+        ),
+        context_budget_latest_ratio=ctx_budget_latest if ctx_budget_latest else None,
     )


@@ -397,6 +410,19 @@ def format_text(snapshot: dict[str, Any], *, width: int = 80) -> str:
             else ""
         )
     )
+    # v2.0-F (L1): context budget guard stats
+    if gates.context_budget_warnings or gates.context_budget_trims:
+        ratio_str = ""
+        if gates.context_budget_latest_ratio:
+            top_profile = max(
+                gates.context_budget_latest_ratio,
+                key=gates.context_budget_latest_ratio.get,  # type: ignore[arg-type]
+            )
+            ratio_str = f" (latest: {gates.context_budget_latest_ratio[top_profile]:.0%} {top_profile})"
+        lines.append(
+            f" context-budget warn: {gates.context_budget_warnings} "
+            f"trim: {gates.context_budget_trims}{ratio_str}"
+        )
     lines.append("")
     lines.append("Recent")
     if not recent:
@@ -633,7 +659,28 @@ def _draw_frame(  # pragma: no cover - curses-only
         + (f" ({_fmt_breakdown(gates.filters_breakdown)})" if gates.filters_breakdown else ""),
         width,
     )
-    row +=
+    row += 1
+    # v2.0-F (L1): context budget guard line
+    if gates.context_budget_warnings or gates.context_budget_trims:
+        ratio_str = ""
+        if gates.context_budget_latest_ratio:
+            top_profile = max(
+                gates.context_budget_latest_ratio,
+                key=gates.context_budget_latest_ratio.get,  # type: ignore[arg-type]
+            )
+            ratio_str = f" (latest: {gates.context_budget_latest_ratio[top_profile]:.0%} {top_profile})"
+        budget_line = (
+            f" context-budget warn: {gates.context_budget_warnings} "
+            f"trim: {gates.context_budget_trims}{ratio_str}"
+        )
+        budget_color = (
+            _COLOR_YELLOW_PAIR
+            if gates.context_budget_trims == 0
+            else _COLOR_RED_PAIR
+        )
+        stdscr.addnstr(row, 0, budget_line, width, int(curses.color_pair(budget_color)))
+        row += 1
+    row += 1

     if row >= height - 2:
         return
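The "latest" suffix in both displays is derived by taking the profile with the highest ratio in the latest-ratio dict. A standalone sketch of that formatting (the sample dict is hypothetical; the max-by-value lookup and percent formatting mirror the diff above):

    latest_ratio = {"local-qwen": 0.82, "claude-main": 0.41}  # hypothetical profiles
    top_profile = max(latest_ratio, key=latest_ratio.get)
    suffix = f" (latest: {latest_ratio[top_profile]:.0%} {top_profile})"
    print(f"context-budget warn: 3 trim: 1{suffix}")
    # -> context-budget warn: 3 trim: 1 (latest: 82% local-qwen)
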
coderouter/config/schemas.py
CHANGED

@@ -230,6 +230,18 @@ class ProviderConfig(BaseModel):
             "Anthropic) from normal input — see :class:`CostConfig`."
         ),
     )
+    max_context_tokens: int | None = Field(
+        default=None,
+        ge=1,
+        description=(
+            "v2.0-F (L1): explicit declaration of this provider's "
+            "context window size in tokens. When set, takes precedence "
+            "over the ``model-capabilities.yaml`` registry lookup. "
+            "When both are unset, the context budget guard falls back "
+            "to 128000 (128K). Examples: Ollama Qwen3 32K → 32768, "
+            "LM Studio Qwen3.5 128K → 131072, Anthropic Claude → 200000."
+        ),
+    )

     @model_validator(mode="after")
     def _check_output_filters_known(self) -> ProviderConfig:
@@ -444,6 +456,80 @@ class FallbackChain(BaseModel):
             "operation, L5 handles hard crashes."
         ),
     )
+    # v2.0-F (L1): context budget guard.
+    #
+    # Long-running agent sessions accumulate messages that eventually
+    # exceed the target model's context window. Without intervention,
+    # the backend returns a 400 (Anthropic) or silently truncates
+    # (Ollama), killing the agent session. The context budget guard
+    # estimates the request's token count (char/4 heuristic, shared
+    # with the auto_router longContext matcher) and compares it against
+    # the target provider's declared max_context_tokens.
+    #
+    # Three actions:
+    # * ``off``  — no detection, no logging. Backward-compat default.
+    # * ``warn`` — emit ``context-budget-warning`` log + attach
+    #              ``X-CodeRouter-Context-Budget: warning`` response
+    #              header. No request mutation.
+    # * ``trim`` — ``warn`` + remove oldest non-system messages until
+    #              the estimated token count drops below
+    #              ``context_budget_trim_target``. Recent messages
+    #              (``context_budget_preserve_last_n``) are always
+    #              kept, and tool_use / tool_result pairs are preserved
+    #              atomically to avoid breaking agent loops.
+    context_budget_action: Literal["off", "warn", "trim"] = Field(
+        default="off",
+        description=(
+            "v2.0-F (L1): action when estimated request tokens approach "
+            "the target provider's context window. ``off`` (default) "
+            "disables the guard entirely. ``warn`` emits a log and "
+            "response header at ``context_budget_warn_threshold``. "
+            "``trim`` additionally removes old messages at "
+            "``context_budget_trim_threshold`` to reclaim context space."
+        ),
+    )
+    context_budget_warn_threshold: float = Field(
+        default=0.80,
+        ge=0.1,
+        le=1.0,
+        description=(
+            "v2.0-F (L1): context usage ratio (estimated_tokens / "
+            "max_context_tokens) at which a warning is emitted. "
+            "Default 0.80 (80%) gives early notice before trim fires."
+        ),
+    )
+    context_budget_trim_threshold: float = Field(
+        default=0.90,
+        ge=0.1,
+        le=1.0,
+        description=(
+            "v2.0-F (L1): context usage ratio at which trim fires "
+            "(only when ``context_budget_action`` is ``trim``). "
+            "Default 0.90 (90%) leaves a 10% margin for the backend's "
+            "own token counting to differ from the char/4 estimate."
+        ),
+    )
+    context_budget_trim_target: float = Field(
+        default=0.75,
+        ge=0.1,
+        le=1.0,
+        description=(
+            "v2.0-F (L1): target context usage ratio after trim. "
+            "Messages are removed from the front until the estimate "
+            "drops below this ratio. Default 0.75 (75%) gives headroom "
+            "for several more turns before trim fires again."
+        ),
+    )
+    context_budget_preserve_last_n: int = Field(
+        default=4,
+        ge=1,
+        le=100,
+        description=(
+            "v2.0-F (L1): minimum number of recent messages to always "
+            "preserve when trimming. Default 4 (2 user-assistant pairs) "
+            "keeps the agent's immediate working context intact."
+        ),
+    )


 # ---------------------------------------------------------------------------
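For orientation, a hypothetical configuration fragment wiring the new fields together. The field names, defaults, and value ranges come from the schema above; the surrounding file layout and key nesting are assumptions, not the package's documented format:

    # providers.yaml (layout assumed for illustration)
    providers:
      local-qwen:
        max_context_tokens: 32768        # Ollama Qwen3 default window

    profiles:
      coding:
        context_budget_action: trim      # off | warn | trim (default: off)
        context_budget_warn_threshold: 0.80
        context_budget_trim_threshold: 0.90
        context_budget_trim_target: 0.75
        context_budget_preserve_last_n: 4
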
coderouter/data/model-capabilities.yaml
CHANGED

@@ -406,3 +406,82 @@ rules:
     kind: openai_compat
     capabilities:
       tools: true
+
+  # ------------------------------------------------------------------
+  # Context window declarations — max_context_tokens (v2.0-F)
+  #
+  # Used by the context-budget guard (L1) to know when a conversation
+  # is approaching the model's context limit. These fall below the
+  # capability-specific rules above because first-match-per-flag means
+  # a model can declare both `thinking: true` and `max_context_tokens`
+  # from different rules — each flag resolves independently.
+  #
+  # Values are the *effective* context window the model reliably handles.
+  # For models with claimed but untested larger windows, the conservative
+  # value is declared. Operators can override via per-provider
+  # `max_context_tokens` in providers.yaml or user model-capabilities.yaml.
+  # ------------------------------------------------------------------
+
+  # Anthropic Claude — 200K context window (all 4.x families)
+  - match: "claude-*"
+    kind: anthropic
+    capabilities:
+      max_context_tokens: 200000
+
+  # Qwen3 (base, non-coder) — Ollama default: 32K context
+  - match: "qwen3:*"
+    capabilities:
+      max_context_tokens: 32768
+
+  # Qwen3-Coder — 256K declared, conservative 131K for GGUF quantized
+  - match: "qwen3-coder:*"
+    capabilities:
+      max_context_tokens: 131072
+
+  - match: "qwen/qwen3-coder-*"
+    capabilities:
+      max_context_tokens: 131072
+
+  # Qwen3.5 — 131K verified (LM Studio Anthropic endpoint)
+  - match: "qwen3.5*"
+    capabilities:
+      max_context_tokens: 131072
+
+  # Qwen3.6 — 256K declared, conservative 131K
+  - match: "qwen3.6*"
+    capabilities:
+      max_context_tokens: 131072
+
+  # Gemma 4 — 128K context (all variants)
+  - match: "gemma4:*"
+    capabilities:
+      max_context_tokens: 131072
+
+  - match: "google/gemma-4*"
+    capabilities:
+      max_context_tokens: 131072
+
+  # DeepSeek V3 — 128K context
+  - match: "deepseek*v3*"
+    capabilities:
+      max_context_tokens: 131072
+
+  # DeepSeek R1 — 128K context
+  - match: "deepseek*r1*"
+    capabilities:
+      max_context_tokens: 131072
+
+  # GPT-OSS — 131K context
+  - match: "*gpt-oss*"
+    capabilities:
+      max_context_tokens: 131072
+
+  # Devstral — 128K context (Mistral coding)
+  - match: "*devstral*"
+    capabilities:
+      max_context_tokens: 131072
+
+  # Kimi K2 — 128K context
+  - match: "*kimi-k2*"
+    capabilities:
+      max_context_tokens: 131072
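To make the first-match-per-flag note above concrete, a hypothetical pair of rules (the earlier `thinking` rule is invented for illustration; the later rule is from this diff):

    rules:
      - match: "qwen3.5*"            # earlier capability rule (hypothetical)
        capabilities:
          thinking: true
      # ...
      - match: "qwen3.5*"            # context-window rule added in this diff
        capabilities:
          max_context_tokens: 131072

A model id matching "qwen3.5*" would resolve thinking: true from the first rule and max_context_tokens: 131072 from the second, since each capability flag is taken from the first rule that declares it.
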
coderouter/guards/context_budget.py
ADDED

@@ -0,0 +1,376 @@
+"""Context budget guard (v2.0-F, L1).
+
+Long-running agent sessions (Claude Code, Cline, OpenClaw, etc.)
+accumulate messages that eventually exceed the target model's context
+window. Without intervention, the backend returns a 400 error
+(Anthropic: ``max_tokens`` violation) or silently truncates the
+prompt (Ollama), killing the agent session.
+
+This module provides the engine with two pieces:
+
+1. A **stateless estimator** :func:`estimate_context_usage` that
+   computes the approximate context-window fill ratio for a given
+   Anthropic request against a declared ``max_context_tokens``.
+   Pure function, no I/O.
+2. A **stateless trimmer** :func:`trim_to_budget` that returns a
+   new request with old messages removed until the estimated usage
+   drops below a target ratio. Pure function, no mutation of the
+   input.
+
+Integration with the fallback engine
+====================================
+
+The engine calls these at the ``_apply_context_budget_guard`` site —
+**after** tool-loop detection but **before** chain dispatch. The
+guard reads the resolved profile's ``context_budget_action`` field:
+
+* ``off``  — guard is a no-op (default).
+* ``warn`` — compute estimate; if over warn threshold, emit a
+             structured log + attach a response header.
+* ``trim`` — ``warn`` behavior + if over trim threshold, call
+             :func:`trim_to_budget` and return the shortened
+             request to the engine.
+
+Token estimation
+================
+
+Uses the shared :func:`~coderouter.token_estimation.estimate_tokens_from_anthropic_request`
+(char/4 heuristic, 5-deps invariant). See that module's docstring
+for the CJK caveat and recommended threshold compensation.
+
+Trim algorithm
+==============
+
+1. Always preserve the system prompt (not counted toward removal).
+2. Always preserve the last ``preserve_last_n`` messages.
+3. Remove messages from the front (oldest first).
+4. Preserve tool_use / tool_result pairs atomically — if a kept
+   message contains a ``tool_result``, also keep the preceding
+   ``tool_use`` assistant message (and vice versa).
+5. After removal, re-estimate; if still over ``trim_target``,
+   reduce ``preserve_last_n`` by 1 and retry (minimum floor: 2).
+"""
+
+from __future__ import annotations
+
+from dataclasses import dataclass
+from typing import TYPE_CHECKING, Any
+
+from coderouter.token_estimation import (
+    estimate_tokens_from_anthropic_request,
+)
+
+if TYPE_CHECKING:
+    from coderouter.translation.anthropic import AnthropicRequest
+
+
+# ---------------------------------------------------------------------------
+# Result types
+# ---------------------------------------------------------------------------
+
+
+@dataclass(frozen=True, slots=True)
+class ContextBudgetEstimate:
+    """Result of a context-budget estimation check."""
+
+    #: Estimated token count for the full request (system + messages).
+    estimated_tokens: int
+    #: Declared maximum context window for the target provider.
+    max_context_tokens: int
+    #: Ratio: estimated_tokens / max_context_tokens (0.0 to ∞).
+    usage_ratio: float
+    #: True when usage_ratio >= the profile's warn threshold.
+    over_warn_threshold: bool
+    #: True when usage_ratio >= the profile's trim threshold.
+    over_trim_threshold: bool
+
+
+@dataclass(frozen=True, slots=True)
+class TrimResult:
+    """Metadata about a trim operation (for logging)."""
+
+    #: Number of messages before trim.
+    messages_before: int
+    #: Number of messages after trim.
+    messages_after: int
+    #: Number of messages removed.
+    messages_removed: int
+    #: Estimated tokens before trim.
+    estimated_tokens_before: int
+    #: Estimated tokens after trim.
+    estimated_tokens_after: int
+
+
+# ---------------------------------------------------------------------------
+# Public API: estimation
+# ---------------------------------------------------------------------------
+
+
+def estimate_context_usage(
+    request: AnthropicRequest,
+    *,
+    max_context_tokens: int,
+    warn_threshold: float = 0.80,
+    trim_threshold: float = 0.90,
+) -> ContextBudgetEstimate:
+    """Estimate how full the target provider's context window is.
+
+    Pure function. Does not mutate the request. Returns a
+    :class:`ContextBudgetEstimate` with precomputed threshold booleans
+    so callers can branch without re-computing ratios.
+
+    Parameters
+    ----------
+    request
+        The inbound Anthropic request to evaluate.
+    max_context_tokens
+        Declared context window of the target provider (from
+        ProviderConfig.max_context_tokens, registry, or fallback 128K).
+    warn_threshold
+        Ratio at or above which ``over_warn_threshold`` is True.
+    trim_threshold
+        Ratio at or above which ``over_trim_threshold`` is True.
+    """
+    estimated = estimate_tokens_from_anthropic_request(
+        system=request.system,
+        messages=request.messages,
+    )
+    ratio = estimated / max_context_tokens if max_context_tokens > 0 else 0.0
+    return ContextBudgetEstimate(
+        estimated_tokens=estimated,
+        max_context_tokens=max_context_tokens,
+        usage_ratio=ratio,
+        over_warn_threshold=ratio >= warn_threshold,
+        over_trim_threshold=ratio >= trim_threshold,
+    )
+
+
+# ---------------------------------------------------------------------------
+# Public API: trimming
+# ---------------------------------------------------------------------------
+
+
+def trim_to_budget(
+    request: AnthropicRequest,
+    *,
+    max_context_tokens: int,
+    trim_target: float = 0.75,
+    preserve_last_n: int = 4,
+) -> tuple[AnthropicRequest, TrimResult]:
+    """Return a new request with old messages removed to fit the budget.
+
+    Pure function — does NOT mutate the input request.
+
+    Algorithm:
+    1. Compute target token count = max_context_tokens * trim_target.
+    2. Identify messages that MUST be preserved:
+       - Last ``preserve_last_n`` messages.
+       - Any tool_use / tool_result pairs linked to preserved messages.
+    3. Remove messages from the front until estimated tokens ≤ target.
+    4. If still over target after removing all removable messages,
+       reduce preserve_last_n by 1 and retry (floor: 2 messages).
+
+    Returns
+    -------
+    tuple[AnthropicRequest, TrimResult]
+        The trimmed request (new instance) and metadata about the trim.
+    """
+    messages = list(request.messages)
+    estimated_before = estimate_tokens_from_anthropic_request(
+        system=request.system,
+        messages=messages,
+    )
+    target_tokens = int(max_context_tokens * trim_target)
+    effective_preserve = min(preserve_last_n, len(messages))
+
+    # Iteratively trim until under target or preserve floor reached
+    trimmed_messages = _do_trim(
+        messages=messages,
+        system=request.system,
+        target_tokens=target_tokens,
+        preserve_last_n=effective_preserve,
+    )
+
+    estimated_after = estimate_tokens_from_anthropic_request(
+        system=request.system,
+        messages=trimmed_messages,
+    )
+
+    result = TrimResult(
+        messages_before=len(messages),
+        messages_after=len(trimmed_messages),
+        messages_removed=len(messages) - len(trimmed_messages),
+        estimated_tokens_before=estimated_before,
+        estimated_tokens_after=estimated_after,
+    )
+
+    # Build new request with trimmed messages.
+    # Import here to avoid circular import at module level.
+    from coderouter.translation.anthropic import AnthropicMessage
+
+    new_request = request.model_copy(
+        update={"messages": [AnthropicMessage(**_msg_to_dict(m)) for m in trimmed_messages]},
+    )
+    return new_request, result
+
+
+# ---------------------------------------------------------------------------
+# Internal helpers
+# ---------------------------------------------------------------------------
+
+
+def _msg_to_dict(msg: Any) -> dict[str, Any]:
+    """Convert an AnthropicMessage (or dict) to a plain dict for reconstruction."""
+    if hasattr(msg, "model_dump"):
+        return msg.model_dump()
+    if isinstance(msg, dict):
+        return msg
+    return {"role": "user", "content": ""}
+
+
+def _get_content(msg: Any) -> Any:
+    """Extract the content field from a message (Pydantic model or dict)."""
+    if hasattr(msg, "content"):
+        return msg.content
+    if isinstance(msg, dict):
+        return msg.get("content")
+    return None
+
+
+def _extract_tool_use_ids(msg: Any) -> set[str]:
+    """Extract all tool_use IDs from a message's content blocks."""
+    content = _get_content(msg)
+    ids: set[str] = set()
+    if isinstance(content, list):
+        for block in content:
+            if isinstance(block, dict) and block.get("type") == "tool_use":
+                tid = block.get("id")
+                if isinstance(tid, str):
+                    ids.add(tid)
+    return ids
+
+
+def _extract_tool_result_ids(msg: Any) -> set[str]:
+    """Extract all tool_use_ids referenced by tool_result blocks."""
+    content = _get_content(msg)
+    ids: set[str] = set()
+    if isinstance(content, list):
+        for block in content:
+            if isinstance(block, dict) and block.get("type") == "tool_result":
+                tid = block.get("tool_use_id")
+                if isinstance(tid, str):
+                    ids.add(tid)
+    return ids
+
+
+def _has_tool_use(msg: Any) -> bool:
+    """True if the message contains a tool_use content block."""
+    return len(_extract_tool_use_ids(msg)) > 0
+
+
+def _has_tool_result(msg: Any) -> bool:
+    """True if the message contains a tool_result content block."""
+    return len(_extract_tool_result_ids(msg)) > 0
+
+
+def _compute_preserve_set(
+    messages: list[Any],
+    preserve_last_n: int,
+) -> set[int]:
+    """Compute indices of messages that must be preserved.
+
+    Preserves:
+    - The last ``preserve_last_n`` messages.
+    - tool_use/tool_result pair integrity via ``tool_use_id`` matching:
+      if a preserved message has a tool_result referencing an ID, also
+      preserve the message (anywhere in the conversation) that emitted
+      the matching tool_use. Conversely, if a preserved message has a
+      tool_use, also preserve the message carrying the matching
+      tool_result. This handles multi-tool calls (one assistant message
+      with N tool_use blocks, one user message with N tool_results) and
+      non-adjacent pairs.
+
+    The algorithm iterates until stable (fixpoint), handling chains of
+    dependencies (e.g., preserving a tool_result pulls in its tool_use
+    assistant message, which might have another tool_use whose tool_result
+    also needs preserving).
+    """
+    n = len(messages)
+    if preserve_last_n >= n:
+        return set(range(n))
+
+    # Pre-compute tool ID mappings for efficient lookup
+    # tool_use_id → index of the message containing that tool_use
+    tool_use_index: dict[str, int] = {}
+    # tool_use_id → index of the message containing the matching tool_result
+    tool_result_index: dict[str, int] = {}
+
+    for i, msg in enumerate(messages):
+        for tid in _extract_tool_use_ids(msg):
+            tool_use_index[tid] = i
+        for tid in _extract_tool_result_ids(msg):
+            tool_result_index[tid] = i
+
+    preserved: set[int] = set(range(n - preserve_last_n, n))
+
+    # Expand to cover tool pairs via ID matching — iterate until stable
+    changed = True
+    while changed:
+        changed = False
+        for idx in list(preserved):
+            msg = messages[idx]
+
+            # If this message has tool_results, preserve the messages
+            # that contain the matching tool_use blocks
+            for tid in _extract_tool_result_ids(msg):
+                use_idx = tool_use_index.get(tid)
+                if use_idx is not None and use_idx not in preserved:
+                    preserved.add(use_idx)
+                    changed = True
+
+            # If this message has tool_use blocks, preserve the messages
+            # that contain the matching tool_results
+            for tid in _extract_tool_use_ids(msg):
+                result_idx = tool_result_index.get(tid)
+                if result_idx is not None and result_idx not in preserved:
+                    preserved.add(result_idx)
+                    changed = True
+
+    return preserved
+
+
+def _do_trim(
+    messages: list[Any],
+    system: Any,
+    target_tokens: int,
+    preserve_last_n: int,
+) -> list[Any]:
+    """Core trim loop. Reduces preserve_last_n if needed (floor: 2)."""
+    current_preserve = preserve_last_n
+
+    while current_preserve >= 2:
+        preserved_indices = _compute_preserve_set(messages, current_preserve)
+        # Keep only preserved messages (maintain order)
+        trimmed = [messages[i] for i in sorted(preserved_indices)]
+
+        estimated = estimate_tokens_from_anthropic_request(
+            system=system,
+            messages=trimmed,
+        )
+        if estimated <= target_tokens:
+            return trimmed
+
+        # Still over target — reduce preserve count and retry
+        current_preserve -= 1
+
+    # Floor reached — return with minimum preservation (last 2)
+    preserved_indices = _compute_preserve_set(messages, 2)
+    return [messages[i] for i in sorted(preserved_indices)]
+
+
+__all__ = [
+    "ContextBudgetEstimate",
+    "TrimResult",
+    "estimate_context_usage",
+    "trim_to_budget",
+]
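A minimal usage sketch of the module's two public functions, using the defaults above. The request construction is hypothetical — it assumes AnthropicRequest is a Pydantic model that accepts plain-dict messages, as the model_copy/model_dump calls in the module suggest but do not guarantee:

    from coderouter.guards.context_budget import (
        estimate_context_usage,
        trim_to_budget,
    )
    from coderouter.translation.anthropic import AnthropicRequest

    req = AnthropicRequest(
        model="qwen3:32b",                              # hypothetical model id
        max_tokens=1024,
        messages=[{"role": "user", "content": "..."}],  # long history elided
    )

    est = estimate_context_usage(req, max_context_tokens=32768)
    if est.over_trim_threshold:        # usage_ratio >= 0.90 by default
        req, trim = trim_to_budget(req, max_context_tokens=32768)
        print(f"removed {trim.messages_removed} messages: "
              f"{trim.estimated_tokens_before} -> {trim.estimated_tokens_after} tokens")
    elif est.over_warn_threshold:      # usage_ratio >= 0.80 by default
        print(f"context at {est.usage_ratio:.0%} of the declared window")
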
coderouter/ingress/anthropic_routes.py
CHANGED

@@ -26,7 +26,7 @@ from collections.abc import AsyncIterator
 from typing import Any

 from fastapi import APIRouter, Header, HTTPException, Request
-from fastapi.responses import StreamingResponse
+from fastapi.responses import JSONResponse, StreamingResponse

 from coderouter.guards.tool_loop import ToolLoopBreakError
 from coderouter.logging import get_logger
@@ -48,6 +48,7 @@ _PROFILE_HEADER = "x-coderouter-profile"
 _MODE_HEADER = "x-coderouter-mode"
 _ANTHROPIC_VERSION_HEADER = "anthropic-version"
 _ANTHROPIC_BETA_HEADER = "anthropic-beta"
+_CTX_BUDGET_HEADER = "X-CodeRouter-Context-Budget"


 @router.post("/messages", response_model=None)
@@ -131,11 +132,22 @@ async def messages(
             detail=(f"unknown profile {anth_req.profile!r}. available: {available}"),
         ) from exc

+    # v2.0-F (L1): run context budget guard before dispatch so the
+    # response header can be set for both streaming and non-streaming.
+    # The engine's internal guard re-check is a cheap no-op.
+    anth_req, ctx_budget_status = engine.apply_context_budget(anth_req)
+
     if anth_req.stream:
+        stream_headers: dict[str, str] = {
+            "Cache-Control": "no-cache",
+            "X-Accel-Buffering": "no",
+        }
+        if ctx_budget_status:
+            stream_headers[_CTX_BUDGET_HEADER] = ctx_budget_status
         return StreamingResponse(
             _anthropic_sse_iterator(engine, anth_req),
             media_type="text/event-stream",
-            headers=
+            headers=stream_headers,
         )

     try:
@@ -155,6 +167,11 @@ async def messages(
             detail=_tool_loop_break_detail(exc),
         ) from exc

+    if ctx_budget_status:
+        return JSONResponse(
+            content=anth_resp.model_dump(exclude_none=True),
+            headers={_CTX_BUDGET_HEADER: ctx_budget_status},
+        )
     return anth_resp.model_dump(exclude_none=True)

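From a client's perspective, the new header is the only observable change while the guard is in warn mode. A hedged sketch using the requests library (URL, mount path, and payload shape are assumptions; the header name comes from the diff, and the "warning" value from the schema comments):

    import requests

    resp = requests.post(
        "http://localhost:8080/v1/messages",   # assumed mount point
        json={
            "model": "claude-sonnet-latest",   # hypothetical model id
            "max_tokens": 1024,
            "messages": [{"role": "user", "content": "..."}],
        },
    )
    # Absent when the guard is off or under threshold.
    status = resp.headers.get("X-CodeRouter-Context-Budget")
    if status:
        print(f"context budget guard fired: {status}")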