coderouter-cli 1.10.0__py3-none-any.whl → 2.0.0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
coderouter/cli_stats.py CHANGED
@@ -112,6 +112,10 @@ class GatesSummary:
112
112
  degraded_breakdown: dict[str, int] # capability → count
113
113
  filters_applied_total: int
114
114
  filters_breakdown: dict[str, int] # filter name → count
115
+ # v2.0-F (L1): context budget guard summary
116
+ context_budget_warnings: int = 0
117
+ context_budget_trims: int = 0
118
+ context_budget_latest_ratio: dict[str, float] | None = None
115
119
 
116
120
 
117
121
  @dataclass(frozen=True)
@@ -252,6 +256,8 @@ def build_gates_summary(snapshot: dict[str, Any]) -> GatesSummary:
252
256
  )
253
257
  degraded_breakdown = dict(counters.get("capability_degraded", {}) or {})
254
258
  filters_breakdown = dict(counters.get("output_filter_applied", {}) or {})
259
+ # v2.0-F (L1): context budget guard counters
260
+ ctx_budget_latest = counters.get("context_budget_latest_ratio") or {}
255
261
  return GatesSummary(
256
262
  total_requests=total_requests,
257
263
  total_failed=total_failed,
@@ -261,6 +267,13 @@ def build_gates_summary(snapshot: dict[str, Any]) -> GatesSummary:
261
267
  degraded_breakdown=degraded_breakdown,
262
268
  filters_applied_total=sum(filters_breakdown.values()),
263
269
  filters_breakdown=filters_breakdown,
270
+ context_budget_warnings=int(
271
+ counters.get("context_budget_warnings_total", 0)
272
+ ),
273
+ context_budget_trims=int(
274
+ counters.get("context_budget_trims_total", 0)
275
+ ),
276
+ context_budget_latest_ratio=ctx_budget_latest if ctx_budget_latest else None,
264
277
  )
265
278
 
266
279
 
@@ -397,6 +410,19 @@ def format_text(snapshot: dict[str, Any], *, width: int = 80) -> str:
397
410
  else ""
398
411
  )
399
412
  )
413
+ # v2.0-F (L1): context budget guard stats
414
+ if gates.context_budget_warnings or gates.context_budget_trims:
415
+ ratio_str = ""
416
+ if gates.context_budget_latest_ratio:
417
+ top_profile = max(
418
+ gates.context_budget_latest_ratio,
419
+ key=gates.context_budget_latest_ratio.get, # type: ignore[arg-type]
420
+ )
421
+ ratio_str = f" (latest: {gates.context_budget_latest_ratio[top_profile]:.0%} {top_profile})"
422
+ lines.append(
423
+ f" context-budget warn: {gates.context_budget_warnings} "
424
+ f"trim: {gates.context_budget_trims}{ratio_str}"
425
+ )
400
426
  lines.append("")
401
427
  lines.append("Recent")
402
428
  if not recent:
@@ -633,7 +659,28 @@ def _draw_frame( # pragma: no cover - curses-only
633
659
  + (f" ({_fmt_breakdown(gates.filters_breakdown)})" if gates.filters_breakdown else ""),
634
660
  width,
635
661
  )
636
- row += 2
662
+ row += 1
663
+ # v2.0-F (L1): context budget guard line
664
+ if gates.context_budget_warnings or gates.context_budget_trims:
665
+ ratio_str = ""
666
+ if gates.context_budget_latest_ratio:
667
+ top_profile = max(
668
+ gates.context_budget_latest_ratio,
669
+ key=gates.context_budget_latest_ratio.get, # type: ignore[arg-type]
670
+ )
671
+ ratio_str = f" (latest: {gates.context_budget_latest_ratio[top_profile]:.0%} {top_profile})"
672
+ budget_line = (
673
+ f" context-budget warn: {gates.context_budget_warnings} "
674
+ f"trim: {gates.context_budget_trims}{ratio_str}"
675
+ )
676
+ budget_color = (
677
+ _COLOR_YELLOW_PAIR
678
+ if gates.context_budget_trims == 0
679
+ else _COLOR_RED_PAIR
680
+ )
681
+ stdscr.addnstr(row, 0, budget_line, width, int(curses.color_pair(budget_color)))
682
+ row += 1
683
+ row += 1
637
684
 
638
685
  if row >= height - 2:
639
686
  return
@@ -230,6 +230,18 @@ class ProviderConfig(BaseModel):
230
230
  "Anthropic) from normal input — see :class:`CostConfig`."
231
231
  ),
232
232
  )
233
+ max_context_tokens: int | None = Field(
234
+ default=None,
235
+ ge=1,
236
+ description=(
237
+ "v2.0-F (L1): explicit declaration of this provider's "
238
+ "context window size in tokens. When set, takes precedence "
239
+ "over the ``model-capabilities.yaml`` registry lookup. "
240
+ "When both are unset, the context budget guard falls back "
241
+ "to 128000 (128K). Examples: Ollama Qwen3 32K → 32768, "
242
+ "LM Studio Qwen3.5 128K → 131072, Anthropic Claude → 200000."
243
+ ),
244
+ )
233
245
 
234
246
  @model_validator(mode="after")
235
247
  def _check_output_filters_known(self) -> ProviderConfig:
@@ -444,6 +456,80 @@ class FallbackChain(BaseModel):
444
456
  "operation, L5 handles hard crashes."
445
457
  ),
446
458
  )
459
+ # v2.0-F (L1): context budget guard.
460
+ #
461
+ # Long-running agent sessions accumulate messages that eventually
462
+ # exceed the target model's context window. Without intervention,
463
+ # the backend returns a 400 (Anthropic) or silently truncates
464
+ # (Ollama), killing the agent session. The context budget guard
465
+ # estimates the request's token count (char/4 heuristic, shared
466
+ # with the auto_router longContext matcher) and compares it against
467
+ # the target provider's declared max_context_tokens.
468
+ #
469
+ # Three actions:
470
+ # * ``off`` — no detection, no logging. Backward-compat default.
471
+ # * ``warn`` — emit ``context-budget-warning`` log + attach
472
+ # ``X-CodeRouter-Context-Budget: warning`` response
473
+ # header. No request mutation.
474
+ # * ``trim`` — ``warn`` + remove oldest non-system messages until
475
+ # the estimated token count drops below
476
+ # ``context_budget_trim_target``. Recent messages
477
+ # (``context_budget_preserve_last_n``) are always
478
+ # kept, and tool_use / tool_result pairs are preserved
479
+ # atomically to avoid breaking agent loops.
480
+ context_budget_action: Literal["off", "warn", "trim"] = Field(
481
+ default="off",
482
+ description=(
483
+ "v2.0-F (L1): action when estimated request tokens approach "
484
+ "the target provider's context window. ``off`` (default) "
485
+ "disables the guard entirely. ``warn`` emits a log and "
486
+ "response header at ``context_budget_warn_threshold``. "
487
+ "``trim`` additionally removes old messages at "
488
+ "``context_budget_trim_threshold`` to reclaim context space."
489
+ ),
490
+ )
491
+ context_budget_warn_threshold: float = Field(
492
+ default=0.80,
493
+ ge=0.1,
494
+ le=1.0,
495
+ description=(
496
+ "v2.0-F (L1): context usage ratio (estimated_tokens / "
497
+ "max_context_tokens) at which a warning is emitted. "
498
+ "Default 0.80 (80%) gives early notice before trim fires."
499
+ ),
500
+ )
501
+ context_budget_trim_threshold: float = Field(
502
+ default=0.90,
503
+ ge=0.1,
504
+ le=1.0,
505
+ description=(
506
+ "v2.0-F (L1): context usage ratio at which trim fires "
507
+ "(only when ``context_budget_action`` is ``trim``). "
508
+ "Default 0.90 (90%) leaves a 10% margin for the backend's "
509
+ "own token counting to differ from the char/4 estimate."
510
+ ),
511
+ )
512
+ context_budget_trim_target: float = Field(
513
+ default=0.75,
514
+ ge=0.1,
515
+ le=1.0,
516
+ description=(
517
+ "v2.0-F (L1): target context usage ratio after trim. "
518
+ "Messages are removed from the front until the estimate "
519
+ "drops below this ratio. Default 0.75 (75%) gives headroom "
520
+ "for several more turns before trim fires again."
521
+ ),
522
+ )
523
+ context_budget_preserve_last_n: int = Field(
524
+ default=4,
525
+ ge=1,
526
+ le=100,
527
+ description=(
528
+ "v2.0-F (L1): minimum number of recent messages to always "
529
+ "preserve when trimming. Default 4 (2 user-assistant pairs) "
530
+ "keeps the agent's immediate working context intact."
531
+ ),
532
+ )
447
533
 
448
534
 
449
535
  # ---------------------------------------------------------------------------
@@ -498,6 +584,23 @@ class RuleMatcher(BaseModel):
498
584
  workloads can compensate by tuning the threshold, since the
499
585
  char/4 heuristic is conservative for CJK and looser for
500
586
  English code.
587
+
588
+ Variants ([Unreleased] / tool-aware routing, originated from OpenClaw + Pi deployments):
589
+
590
+ - ``has_tools: True`` — the request body declares one or more
591
+ tools (OpenAI ``tools[]`` / Anthropic ``tools[]`` / OpenAI legacy
592
+ ``functions[]``). Lets operators send tool-laden requests to a
593
+ tool-capable cloud profile while keeping plain chat on a small
594
+ local model (typical Raspberry Pi / low-spec deployment shape:
595
+ a 1-4B local model that cannot reliably tool-call paired with a
596
+ free-tier cloud chain that can). Distinct from the
597
+ ``capabilities.tools`` flag on a provider — that flag is read by
598
+ ``coderouter doctor`` for diagnostics but does NOT gate the
599
+ fallback chain (the chain just iterates providers in order and
600
+ engages the v0.3-D tool-downgrade path on non-native ones with
601
+ ``request.tools`` set). The ``has_tools`` matcher is the
602
+ profile-level lever for steering tool-laden traffic to the right
603
+ chain entirely.
501
604
  """
502
605
 
503
606
  model_config = ConfigDict(extra="forbid")
@@ -508,6 +611,13 @@ class RuleMatcher(BaseModel):
508
611
  content_regex: str | None = None
509
612
  model_pattern: str | None = None
510
613
  content_token_count_min: int | None = Field(default=None, ge=1)
614
+ # [Unreleased]: tool-aware routing (originated from OpenClaw + Raspberry Pi use).
615
+ # See class docstring "Variants ([Unreleased] / tool-aware routing)"
616
+ # above for the full rationale. Boolean shape mirrors ``has_image`` —
617
+ # only the ``True`` value is meaningful (matches when the body
618
+ # declares any tools); ``False`` is rejected by ``_exactly_one``
619
+ # since a "no-tools" rule would shadow the default fall-through.
620
+ has_tools: bool | None = None
511
621
 
512
622
  _MATCHER_FIELDS: tuple[str, ...] = (
513
623
  "has_image",
@@ -516,6 +626,7 @@ class RuleMatcher(BaseModel):
516
626
  "content_regex",
517
627
  "model_pattern",
518
628
  "content_token_count_min",
629
+ "has_tools",
519
630
  )
520
631
 
521
632
  @model_validator(mode="after")
@@ -406,3 +406,82 @@ rules:
406
406
  kind: openai_compat
407
407
  capabilities:
408
408
  tools: true
409
+
410
+ # ------------------------------------------------------------------
411
+ # Context window declarations — max_context_tokens (v2.0-F)
412
+ #
413
+ # Used by the context-budget guard (L1) to know when a conversation
414
+ # is approaching the model's context limit. These fall below the
415
+ # capability-specific rules above because first-match-per-flag means
416
+ # a model can declare both `thinking: true` and `max_context_tokens`
417
+ # from different rules — each flag resolves independently.
418
+ #
419
+ # Values are the *effective* context window the model reliably handles.
420
+ # For models with claimed but untested larger windows, the conservative
421
+ # value is declared. Operators can override via per-provider
422
+ # `max_context_tokens` in providers.yaml or user model-capabilities.yaml.
423
+ # ------------------------------------------------------------------
424
+
425
+ # Anthropic Claude — 200K context window (all 4.x families)
426
+ - match: "claude-*"
427
+ kind: anthropic
428
+ capabilities:
429
+ max_context_tokens: 200000
430
+
431
+ # Qwen3 (base, non-coder) — Ollama default: 32K context
432
+ - match: "qwen3:*"
433
+ capabilities:
434
+ max_context_tokens: 32768
435
+
436
+ # Qwen3-Coder — 256K declared, conservative 131K for GGUF quantized
437
+ - match: "qwen3-coder:*"
438
+ capabilities:
439
+ max_context_tokens: 131072
440
+
441
+ - match: "qwen/qwen3-coder-*"
442
+ capabilities:
443
+ max_context_tokens: 131072
444
+
445
+ # Qwen3.5 — 131K verified (LM Studio Anthropic endpoint)
446
+ - match: "qwen3.5*"
447
+ capabilities:
448
+ max_context_tokens: 131072
449
+
450
+ # Qwen3.6 — 256K declared, conservative 131K
451
+ - match: "qwen3.6*"
452
+ capabilities:
453
+ max_context_tokens: 131072
454
+
455
+ # Gemma 4 — 128K context (all variants)
456
+ - match: "gemma4:*"
457
+ capabilities:
458
+ max_context_tokens: 131072
459
+
460
+ - match: "google/gemma-4*"
461
+ capabilities:
462
+ max_context_tokens: 131072
463
+
464
+ # DeepSeek V3 — 128K context
465
+ - match: "deepseek*v3*"
466
+ capabilities:
467
+ max_context_tokens: 131072
468
+
469
+ # DeepSeek R1 — 128K context
470
+ - match: "deepseek*r1*"
471
+ capabilities:
472
+ max_context_tokens: 131072
473
+
474
+ # GPT-OSS — 131K context
475
+ - match: "*gpt-oss*"
476
+ capabilities:
477
+ max_context_tokens: 131072
478
+
479
+ # Devstral — 128K context (Mistral coding)
480
+ - match: "*devstral*"
481
+ capabilities:
482
+ max_context_tokens: 131072
483
+
484
+ # Kimi K2 — 128K context
485
+ - match: "*kimi-k2*"
486
+ capabilities:
487
+ max_context_tokens: 131072
@@ -0,0 +1,376 @@
1
+ """Context budget guard (v2.0-F, L1).
2
+
3
+ Long-running agent sessions (Claude Code, Cline, OpenClaw, etc.)
4
+ accumulate messages that eventually exceed the target model's context
5
+ window. Without intervention, the backend returns a 400 error
6
+ (Anthropic: ``max_tokens`` violation) or silently truncates the
7
+ prompt (Ollama), killing the agent session.
8
+
9
+ This module provides the engine with two pieces:
10
+
11
+ 1. A **stateless estimator** :func:`estimate_context_usage` that
12
+ computes the approximate context-window fill ratio for a given
13
+ Anthropic request against a declared ``max_context_tokens``.
14
+ Pure function, no I/O.
15
+ 2. A **stateless trimmer** :func:`trim_to_budget` that returns a
16
+ new request with old messages removed until the estimated usage
17
+ drops below a target ratio. Pure function, no mutation of the
18
+ input.
19
+
20
+ Integration with the fallback engine
21
+ =====================================
22
+
23
+ The engine calls these at the ``_apply_context_budget_guard`` site —
24
+ **after** tool-loop detection but **before** chain dispatch. The
25
+ guard reads the resolved profile's ``context_budget_action`` field:
26
+
27
+ * ``off`` — guard is a no-op (default).
28
+ * ``warn`` — compute estimate; if over warn threshold, emit a
29
+ structured log + attach a response header.
30
+ * ``trim`` — ``warn`` behavior + if over trim threshold, call
31
+ :func:`trim_to_budget` and return the shortened
32
+ request to the engine.
33
+
34
+ Token estimation
35
+ ================
36
+
37
+ Uses the shared :func:`~coderouter.token_estimation.estimate_tokens_from_anthropic_request`
38
+ (char/4 heuristic, 5-deps invariant). See that module's docstring
39
+ for the CJK caveat and recommended threshold compensation.
40
+
41
+ Trim algorithm
42
+ ==============
43
+
44
+ 1. Always preserve the system prompt (not counted toward removal).
45
+ 2. Always preserve the last ``preserve_last_n`` messages.
46
+ 3. Remove messages from the front (oldest first).
47
+ 4. Preserve tool_use / tool_result pairs atomically — if a kept
48
+ message contains a ``tool_result``, also keep the preceding
49
+ ``tool_use`` assistant message (and vice versa).
50
+ 5. After removal, re-estimate; if still over ``trim_target``,
51
+ reduce ``preserve_last_n`` by 1 and retry (minimum floor: 2).
52
+ """
53
+
54
+ from __future__ import annotations
55
+
56
+ from dataclasses import dataclass
57
+ from typing import TYPE_CHECKING, Any
58
+
59
+ from coderouter.token_estimation import (
60
+ estimate_tokens_from_anthropic_request,
61
+ )
62
+
63
+ if TYPE_CHECKING:
64
+ from coderouter.translation.anthropic import AnthropicRequest
65
+
66
+
67
+ # ---------------------------------------------------------------------------
68
+ # Result types
69
+ # ---------------------------------------------------------------------------
70
+
71
+
72
@dataclass(frozen=True, slots=True)
class ContextBudgetEstimate:
    """Result of a context-budget estimation check.

    Produced by :func:`estimate_context_usage`. Immutable (frozen +
    slots) so it can be logged and passed between engine stages safely.
    """

    #: Estimated token count for the full request (system + messages),
    #: from the char/4 heuristic estimator.
    estimated_tokens: int
    #: Declared maximum context window for the target provider.
    max_context_tokens: int
    #: Ratio: estimated_tokens / max_context_tokens (0.0 to ∞).
    #: Also 0.0 when max_context_tokens is not positive (guard disabled).
    usage_ratio: float
    #: True when usage_ratio >= the profile's warn threshold.
    over_warn_threshold: bool
    #: True when usage_ratio >= the profile's trim threshold.
    over_trim_threshold: bool
86
+
87
+
88
@dataclass(frozen=True, slots=True)
class TrimResult:
    """Metadata about a trim operation (for logging).

    Produced by :func:`trim_to_budget` alongside the trimmed request.
    Token figures come from the char/4 heuristic estimator, not from
    the backend's exact token counter.
    """

    #: Number of messages before trim.
    messages_before: int
    #: Number of messages after trim.
    messages_after: int
    #: Number of messages removed (messages_before - messages_after).
    messages_removed: int
    #: Estimated tokens before trim.
    estimated_tokens_before: int
    #: Estimated tokens after trim.
    estimated_tokens_after: int
102
+
103
+
104
+ # ---------------------------------------------------------------------------
105
+ # Public API: estimation
106
+ # ---------------------------------------------------------------------------
107
+
108
+
109
def estimate_context_usage(
    request: AnthropicRequest,
    *,
    max_context_tokens: int,
    warn_threshold: float = 0.80,
    trim_threshold: float = 0.90,
) -> ContextBudgetEstimate:
    """Estimate how full the target provider's context window is.

    Pure function — the request is not mutated. The returned
    :class:`ContextBudgetEstimate` carries precomputed threshold
    booleans so callers can branch without recomputing ratios.

    Parameters
    ----------
    request
        The inbound Anthropic request to evaluate.
    max_context_tokens
        Declared context window of the target provider (from
        ProviderConfig.max_context_tokens, registry, or fallback 128K).
    warn_threshold
        Ratio at or above which ``over_warn_threshold`` is True.
    trim_threshold
        Ratio at or above which ``over_trim_threshold`` is True.
    """
    token_estimate = estimate_tokens_from_anthropic_request(
        system=request.system,
        messages=request.messages,
    )
    # Non-positive window → treat as "no budget declared": ratio 0.0,
    # so neither threshold fires.
    if max_context_tokens > 0:
        usage = token_estimate / max_context_tokens
    else:
        usage = 0.0
    return ContextBudgetEstimate(
        estimated_tokens=token_estimate,
        max_context_tokens=max_context_tokens,
        usage_ratio=usage,
        over_warn_threshold=usage >= warn_threshold,
        over_trim_threshold=usage >= trim_threshold,
    )
146
+
147
+
148
+ # ---------------------------------------------------------------------------
149
+ # Public API: trimming
150
+ # ---------------------------------------------------------------------------
151
+
152
+
153
def trim_to_budget(
    request: AnthropicRequest,
    *,
    max_context_tokens: int,
    trim_target: float = 0.75,
    preserve_last_n: int = 4,
) -> tuple[AnthropicRequest, TrimResult]:
    """Return a new request whose oldest messages were dropped to fit the budget.

    Pure function — the input request is never mutated.

    The system prompt and the last ``preserve_last_n`` messages (plus
    any tool_use/tool_result partners of kept messages) are preserved;
    older messages are removed until the estimated token count drops to
    ``max_context_tokens * trim_target``. If that is still not enough,
    the preserve count is lowered step by step (floor: 2 messages).

    Returns
    -------
    tuple[AnthropicRequest, TrimResult]
        The trimmed request (a new instance) and trim metadata for logging.
    """
    original = list(request.messages)
    budget = int(max_context_tokens * trim_target)
    tokens_before = estimate_tokens_from_anthropic_request(
        system=request.system,
        messages=original,
    )

    kept = _do_trim(
        messages=original,
        system=request.system,
        target_tokens=budget,
        preserve_last_n=min(preserve_last_n, len(original)),
    )

    tokens_after = estimate_tokens_from_anthropic_request(
        system=request.system,
        messages=kept,
    )

    # Local import: avoids a circular import at module load time.
    from coderouter.translation.anthropic import AnthropicMessage

    rebuilt = [AnthropicMessage(**_msg_to_dict(m)) for m in kept]
    trimmed_request = request.model_copy(update={"messages": rebuilt})

    return trimmed_request, TrimResult(
        messages_before=len(original),
        messages_after=len(kept),
        messages_removed=len(original) - len(kept),
        estimated_tokens_before=tokens_before,
        estimated_tokens_after=tokens_after,
    )
215
+
216
+
217
+ # ---------------------------------------------------------------------------
218
+ # Internal helpers
219
+ # ---------------------------------------------------------------------------
220
+
221
+
222
+ def _msg_to_dict(msg: Any) -> dict[str, Any]:
223
+ """Convert an AnthropicMessage (or dict) to a plain dict for reconstruction."""
224
+ if hasattr(msg, "model_dump"):
225
+ return msg.model_dump()
226
+ if isinstance(msg, dict):
227
+ return msg
228
+ return {"role": "user", "content": ""}
229
+
230
+
231
+ def _get_content(msg: Any) -> Any:
232
+ """Extract the content field from a message (Pydantic model or dict)."""
233
+ if hasattr(msg, "content"):
234
+ return msg.content
235
+ if isinstance(msg, dict):
236
+ return msg.get("content")
237
+ return None
238
+
239
+
240
+ def _extract_tool_use_ids(msg: Any) -> set[str]:
241
+ """Extract all tool_use IDs from a message's content blocks."""
242
+ content = _get_content(msg)
243
+ ids: set[str] = set()
244
+ if isinstance(content, list):
245
+ for block in content:
246
+ if isinstance(block, dict) and block.get("type") == "tool_use":
247
+ tid = block.get("id")
248
+ if isinstance(tid, str):
249
+ ids.add(tid)
250
+ return ids
251
+
252
+
253
+ def _extract_tool_result_ids(msg: Any) -> set[str]:
254
+ """Extract all tool_use_ids referenced by tool_result blocks."""
255
+ content = _get_content(msg)
256
+ ids: set[str] = set()
257
+ if isinstance(content, list):
258
+ for block in content:
259
+ if isinstance(block, dict) and block.get("type") == "tool_result":
260
+ tid = block.get("tool_use_id")
261
+ if isinstance(tid, str):
262
+ ids.add(tid)
263
+ return ids
264
+
265
+
266
def _has_tool_use(msg: Any) -> bool:
    """True if the message contains a tool_use content block."""
    return bool(_extract_tool_use_ids(msg))
269
+
270
+
271
def _has_tool_result(msg: Any) -> bool:
    """True if the message contains a tool_result content block."""
    return bool(_extract_tool_result_ids(msg))
274
+
275
+
276
def _compute_preserve_set(
    messages: list[Any],
    preserve_last_n: int,
) -> set[int]:
    """Compute indices of messages that must be preserved.

    Preserves:
    - The last ``preserve_last_n`` messages.
    - tool_use/tool_result pair integrity via ``tool_use_id`` matching:
      if a preserved message has a tool_result referencing an ID, also
      preserve the message (anywhere in the conversation) that emitted
      the matching tool_use. Conversely, if a preserved message has a
      tool_use, also preserve the message carrying the matching
      tool_result. This handles multi-tool calls (one assistant message
      with N tool_use blocks, one user message with N tool_results) and
      non-adjacent pairs.

    The algorithm iterates until stable (fixpoint), handling chains of
    dependencies (e.g., preserving a tool_result pulls in its tool_use
    assistant message, which might have another tool_use whose tool_result
    also needs preserving).
    """
    n = len(messages)
    # Fewer messages than the preserve count: nothing is removable.
    if preserve_last_n >= n:
        return set(range(n))

    # Pre-compute tool ID mappings for efficient lookup.
    # NOTE: duplicate IDs (which would be malformed input) resolve
    # last-writer-wins — the highest message index for that ID is kept.
    # tool_use_id → index of the message containing that tool_use
    tool_use_index: dict[str, int] = {}
    # tool_use_id → index of the message containing the matching tool_result
    tool_result_index: dict[str, int] = {}

    for i, msg in enumerate(messages):
        for tid in _extract_tool_use_ids(msg):
            tool_use_index[tid] = i
        for tid in _extract_tool_result_ids(msg):
            tool_result_index[tid] = i

    # Seed with the tail window; the fixpoint below only ever grows it.
    preserved: set[int] = set(range(n - preserve_last_n, n))

    # Expand to cover tool pairs via ID matching — iterate until stable.
    # Each pass scans a snapshot (list(preserved)) so mutating the set
    # mid-pass is safe; `preserved` grows monotonically, so the loop
    # terminates in at most n passes.
    changed = True
    while changed:
        changed = False
        for idx in list(preserved):
            msg = messages[idx]

            # If this message has tool_results, preserve the messages
            # that contain the matching tool_use blocks
            for tid in _extract_tool_result_ids(msg):
                use_idx = tool_use_index.get(tid)
                if use_idx is not None and use_idx not in preserved:
                    preserved.add(use_idx)
                    changed = True

            # If this message has tool_use blocks, preserve the messages
            # that contain the matching tool_results
            for tid in _extract_tool_use_ids(msg):
                result_idx = tool_result_index.get(tid)
                if result_idx is not None and result_idx not in preserved:
                    preserved.add(result_idx)
                    changed = True

    return preserved
340
+
341
+
342
def _do_trim(
    messages: list[Any],
    system: Any,
    target_tokens: int,
    preserve_last_n: int,
) -> list[Any]:
    """Core trim loop: drop the oldest removable messages until the
    estimate fits ``target_tokens``.

    Fixes a divergence from the documented algorithm: the previous
    implementation removed *all* non-preserved messages in one shot,
    over-trimming the conversation down to the last-N window even when
    dropping only a few old messages would have hit the target. Here
    messages are removed oldest-first, re-estimating after each drop,
    so no more context is discarded than needed.

    Messages linked by a shared ``tool_use_id`` (a tool_use and its
    tool_result) are dropped atomically so no orphaned tool blocks are
    sent to the backend. If even maximal removal is not enough, the
    preserve count is reduced and the loop retries (floor: 2).
    """

    def _estimate(kept_indices: set[int]) -> int:
        return estimate_tokens_from_anthropic_request(
            system=system,
            messages=[messages[i] for i in sorted(kept_indices)],
        )

    current_preserve = preserve_last_n
    while current_preserve >= 2:
        preserved_indices = _compute_preserve_set(messages, current_preserve)
        kept = set(range(len(messages)))

        # Already under target? Nothing to remove (callers normally only
        # invoke the trimmer when over threshold, but stay safe).
        if _estimate(kept) <= target_tokens:
            return [messages[i] for i in sorted(kept)]

        # Drop removable units oldest-first, stopping as soon as the
        # estimate fits. Re-estimating per drop is O(units × chars) —
        # acceptable for the char-count heuristic estimator.
        for unit in _removable_units(messages, preserved_indices):
            kept -= unit
            if _estimate(kept) <= target_tokens:
                return [messages[i] for i in sorted(kept)]

        # Even maximal removal at this preserve level is not enough
        # (kept now equals the preserve-set closure) — reduce the
        # preserve count and retry, matching the original fallback.
        current_preserve -= 1

    # Floor reached — return with minimum preservation (last 2).
    preserved_indices = _compute_preserve_set(messages, 2)
    return [messages[i] for i in sorted(preserved_indices)]


def _removable_units(
    messages: list[Any],
    preserved_indices: set[int],
) -> list[set[int]]:
    """Partition message indices into tool-pair-atomic units, oldest
    first, excluding any unit that touches a preserved message.

    Indices sharing any ``tool_use_id`` are unioned so a tool_use and
    its tool_result are always dropped together. Because
    :func:`_compute_preserve_set` closes over tool pairs, a unit is
    either fully preserved or fully removable.
    """
    parent = list(range(len(messages)))

    def _find(i: int) -> int:
        # Path-halving union-find lookup.
        while parent[i] != i:
            parent[i] = parent[parent[i]]
            i = parent[i]
        return i

    first_seen: dict[str, int] = {}
    for i, msg in enumerate(messages):
        for tid in _extract_tool_use_ids(msg) | _extract_tool_result_ids(msg):
            if tid in first_seen:
                ra, rb = _find(first_seen[tid]), _find(i)
                if ra != rb:
                    # Root at the smaller index so a component's root is
                    # its oldest member.
                    parent[max(ra, rb)] = min(ra, rb)
            else:
                first_seen[tid] = i

    units: dict[int, set[int]] = {}
    for i in range(len(messages)):
        units.setdefault(_find(i), set()).add(i)

    # Sorted by root index == oldest member first.
    return [
        unit
        for _root, unit in sorted(units.items())
        if not unit & preserved_indices
    ]
369
+
370
+
371
+ __all__ = [
372
+ "ContextBudgetEstimate",
373
+ "TrimResult",
374
+ "estimate_context_usage",
375
+ "trim_to_budget",
376
+ ]