aru-code 0.18.0__tar.gz → 0.19.1__tar.gz

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (59) hide show
  1. {aru_code-0.18.0/aru_code.egg-info → aru_code-0.19.1}/PKG-INFO +1 -1
  2. aru_code-0.19.1/aru/__init__.py +1 -0
  3. {aru_code-0.18.0 → aru_code-0.19.1}/aru/context.py +235 -176
  4. {aru_code-0.18.0 → aru_code-0.19.1}/aru/session.py +6 -3
  5. {aru_code-0.18.0 → aru_code-0.19.1/aru_code.egg-info}/PKG-INFO +1 -1
  6. {aru_code-0.18.0 → aru_code-0.19.1}/pyproject.toml +1 -1
  7. {aru_code-0.18.0 → aru_code-0.19.1}/tests/test_cli.py +5 -4
  8. {aru_code-0.18.0 → aru_code-0.19.1}/tests/test_confabulation_regression.py +34 -11
  9. {aru_code-0.18.0 → aru_code-0.19.1}/tests/test_context.py +119 -41
  10. aru_code-0.18.0/aru/__init__.py +0 -1
  11. {aru_code-0.18.0 → aru_code-0.19.1}/LICENSE +0 -0
  12. {aru_code-0.18.0 → aru_code-0.19.1}/README.md +0 -0
  13. {aru_code-0.18.0 → aru_code-0.19.1}/aru/agent_factory.py +0 -0
  14. {aru_code-0.18.0 → aru_code-0.19.1}/aru/agents/__init__.py +0 -0
  15. {aru_code-0.18.0 → aru_code-0.19.1}/aru/agents/base.py +0 -0
  16. {aru_code-0.18.0 → aru_code-0.19.1}/aru/agents/executor.py +0 -0
  17. {aru_code-0.18.0 → aru_code-0.19.1}/aru/agents/planner.py +0 -0
  18. {aru_code-0.18.0 → aru_code-0.19.1}/aru/cache_patch.py +0 -0
  19. {aru_code-0.18.0 → aru_code-0.19.1}/aru/cli.py +0 -0
  20. {aru_code-0.18.0 → aru_code-0.19.1}/aru/commands.py +0 -0
  21. {aru_code-0.18.0 → aru_code-0.19.1}/aru/completers.py +0 -0
  22. {aru_code-0.18.0 → aru_code-0.19.1}/aru/config.py +0 -0
  23. {aru_code-0.18.0 → aru_code-0.19.1}/aru/display.py +0 -0
  24. {aru_code-0.18.0 → aru_code-0.19.1}/aru/history_blocks.py +0 -0
  25. {aru_code-0.18.0 → aru_code-0.19.1}/aru/permissions.py +0 -0
  26. {aru_code-0.18.0 → aru_code-0.19.1}/aru/providers.py +0 -0
  27. {aru_code-0.18.0 → aru_code-0.19.1}/aru/runner.py +0 -0
  28. {aru_code-0.18.0 → aru_code-0.19.1}/aru/runtime.py +0 -0
  29. {aru_code-0.18.0 → aru_code-0.19.1}/aru/tools/__init__.py +0 -0
  30. {aru_code-0.18.0 → aru_code-0.19.1}/aru/tools/ast_tools.py +0 -0
  31. {aru_code-0.18.0 → aru_code-0.19.1}/aru/tools/codebase.py +0 -0
  32. {aru_code-0.18.0 → aru_code-0.19.1}/aru/tools/gitignore.py +0 -0
  33. {aru_code-0.18.0 → aru_code-0.19.1}/aru/tools/mcp_client.py +0 -0
  34. {aru_code-0.18.0 → aru_code-0.19.1}/aru/tools/ranker.py +0 -0
  35. {aru_code-0.18.0 → aru_code-0.19.1}/aru/tools/tasklist.py +0 -0
  36. {aru_code-0.18.0 → aru_code-0.19.1}/aru_code.egg-info/SOURCES.txt +0 -0
  37. {aru_code-0.18.0 → aru_code-0.19.1}/aru_code.egg-info/dependency_links.txt +0 -0
  38. {aru_code-0.18.0 → aru_code-0.19.1}/aru_code.egg-info/entry_points.txt +0 -0
  39. {aru_code-0.18.0 → aru_code-0.19.1}/aru_code.egg-info/requires.txt +0 -0
  40. {aru_code-0.18.0 → aru_code-0.19.1}/aru_code.egg-info/top_level.txt +0 -0
  41. {aru_code-0.18.0 → aru_code-0.19.1}/setup.cfg +0 -0
  42. {aru_code-0.18.0 → aru_code-0.19.1}/tests/test_agents_base.py +0 -0
  43. {aru_code-0.18.0 → aru_code-0.19.1}/tests/test_cli_advanced.py +0 -0
  44. {aru_code-0.18.0 → aru_code-0.19.1}/tests/test_cli_base.py +0 -0
  45. {aru_code-0.18.0 → aru_code-0.19.1}/tests/test_cli_completers.py +0 -0
  46. {aru_code-0.18.0 → aru_code-0.19.1}/tests/test_cli_new.py +0 -0
  47. {aru_code-0.18.0 → aru_code-0.19.1}/tests/test_cli_run_cli.py +0 -0
  48. {aru_code-0.18.0 → aru_code-0.19.1}/tests/test_cli_session.py +0 -0
  49. {aru_code-0.18.0 → aru_code-0.19.1}/tests/test_cli_shell.py +0 -0
  50. {aru_code-0.18.0 → aru_code-0.19.1}/tests/test_codebase.py +0 -0
  51. {aru_code-0.18.0 → aru_code-0.19.1}/tests/test_config.py +0 -0
  52. {aru_code-0.18.0 → aru_code-0.19.1}/tests/test_executor.py +0 -0
  53. {aru_code-0.18.0 → aru_code-0.19.1}/tests/test_gitignore.py +0 -0
  54. {aru_code-0.18.0 → aru_code-0.19.1}/tests/test_main.py +0 -0
  55. {aru_code-0.18.0 → aru_code-0.19.1}/tests/test_mcp_client.py +0 -0
  56. {aru_code-0.18.0 → aru_code-0.19.1}/tests/test_permissions.py +0 -0
  57. {aru_code-0.18.0 → aru_code-0.19.1}/tests/test_planner.py +0 -0
  58. {aru_code-0.18.0 → aru_code-0.19.1}/tests/test_providers.py +0 -0
  59. {aru_code-0.18.0 → aru_code-0.19.1}/tests/test_ranker.py +0 -0
@@ -1,6 +1,6 @@
1
1
  Metadata-Version: 2.4
2
2
  Name: aru-code
3
- Version: 0.18.0
3
+ Version: 0.19.1
4
4
  Summary: A Claude Code clone built with Agno agents
5
5
  Author-email: Estevao <estevaofon@gmail.com>
6
6
  License-Expression: MIT
@@ -0,0 +1 @@
1
+ __version__ = "0.19.1"
@@ -1,28 +1,40 @@
1
1
  """Context management for token optimization.
2
2
 
3
- Implements three layers of token reduction:
4
- 1. Pruning — evict old tool/assistant outputs from history
5
- 2. Truncation — universal cap on tool output size
6
- 3. Compaction — summarize entire conversation when approaching context limits
3
+ Mirrors opencode's two-layer approach:
4
+
5
+ 1. **Prune** (routine, lossy only on tool outputs): walks old tool_result
6
+ blocks and replaces their content with a placeholder. User/assistant
7
+ text is NEVER touched — it survives verbatim until real overflow.
8
+ This is the steady-state memory mechanism. Matches cache_patch.py's
9
+ strategy at the Agno message layer.
10
+
11
+ 2. **Compact** (rare, lossy full summary): triggers only when the per-call
12
+ context window actually approaches the model's limit. Runs a
13
+ compaction agent that produces a structured summary (Goal / Instructions
14
+ / Discoveries / Accomplished / File contents / Relevant files) and
15
+ marks the resulting assistant message with `summary: True` so
16
+ subsequent prunes stop at that checkpoint.
17
+
18
+ There is also a `truncate_output` layer used by individual tools to cap
19
+ their own output size before it ever reaches history.
7
20
  """
8
21
 
9
22
  from __future__ import annotations
10
23
 
11
24
  # ── Constants ──────────────────────────────────────────────────────
12
25
 
13
- # Pruning: minimum chars that must be freeable to justify a prune pass
14
- PRUNE_MINIMUM_CHARS = 12_000 # ~3.5K tokens
15
- # Placeholder that replaces evicted content
16
- PRUNED_PLACEHOLDER = "[cleared]"
17
- # User messages larger than this threshold are truncated when outside protection window
18
- PRUNE_USER_MSG_THRESHOLD = 2_000 # ~570 tokens
19
- # How many chars to keep from the start of a pruned user message
20
- PRUNE_USER_MSG_KEEP = 500 # ~140 tokens
26
+ # Pruning: minimum chars that must be freeable to justify a prune pass.
27
+ # Matches opencode's PRUNE_MINIMUM = 20_000 tokens (~80K chars @ 4 chars/token).
28
+ PRUNE_MINIMUM_CHARS = 80_000 # ~20K tokens
29
+ # Placeholder that replaces cleared tool_result content. Matches
30
+ # cache_patch.py's _PRUNED_PLACEHOLDER so both layers produce identical
31
+ # text when a tool output is cleared.
32
+ CLEARED_TOOL_RESULT = "[Old tool result cleared]"
21
33
  # Minimum number of recent user turns always protected (regardless of char budget)
22
34
  PRUNE_PROTECT_TURNS = 2
23
35
  # Tool result markers that should never be pruned (critical context)
24
36
  PRUNE_PROTECTED_MARKERS = {"[SubAgent-", "delegate_task"}
25
- # Tool names whose outputs should never be pruned (like OpenCode's PRUNE_PROTECTED_TOOLS)
37
+ # Tool names whose outputs should never be pruned (like opencode's PRUNE_PROTECTED_TOOLS)
26
38
  # These are checked as substrings in message content (tool results include the tool name)
27
39
  PRUNE_PROTECTED_TOOLS = {"delegate_task"}
28
40
 
@@ -32,17 +44,39 @@ TRUNCATE_MAX_BYTES = 15 * 1024 # 15 KB
32
44
  TRUNCATE_KEEP_START = 150 # lines to keep from the start
33
45
  TRUNCATE_KEEP_END = 60 # lines to keep from the end
34
46
  TRUNCATE_MAX_LINE_LENGTH = 1500 # chars per individual line (prevents minified files)
35
- # Directory for saving full truncated outputs (like OpenCode pattern)
47
+ # Directory for saving full truncated outputs (like opencode pattern)
36
48
  TRUNCATE_SAVE_DIR = ".aru/truncated"
37
49
 
38
- # Compaction: trigger when per-run input tokens exceed this fraction of model limit
39
- COMPACTION_THRESHOLD_RATIO = 0.70
40
- # Compaction: target post-compaction size as fraction of model context limit
41
- COMPACTION_TARGET_RATIO = 0.15
42
- # Compaction: also trigger after this many user turns (regardless of token count)
43
- COMPACTION_MAX_TURNS = 15
44
- # Compaction: reserve buffer for the compaction process itself (like OpenCode's 20K)
45
- COMPACTION_BUFFER_TOKENS = 20_000
50
+ # Compaction: chars of recent conversation preserved verbatim post-compact.
51
+ #
52
+ # Separate from the prune protect window (160K) because they measure
53
+ # different things:
54
+ # - Prune protect: "how much tool_result content stays intact"
55
+ # - Compact recent: "how much full-message history stays verbatim after
56
+ # the summary replaces the older portion"
57
+ #
58
+ # Set to 80K chars (~20K tokens) — half the prune window. Rationale:
59
+ # with the compactor now running on the main model (not a small one),
60
+ # summaries are faithful enough that we don't need 40K of recent overlap
61
+ # as a safety net. 20K still covers 3-6 recent turns verbatim, which
62
+ # mirrors the "last few exchanges" a human would re-read to resume work.
63
+ # Going to zero would match opencode exactly but requires the reactive
64
+ # overflow replay flow we haven't implemented yet.
65
+ COMPACT_RECENT_CHARS = 80_000
66
+
67
+ # Compaction: trigger when per-call input tokens approach real overflow.
68
+ # Matches opencode's philosophy: only fire near the model's actual context
69
+ # limit, not routinely. Routine context reduction is handled by prune_history
70
+ # (lossy only on tool outputs), so compaction is reserved for genuine
71
+ # overflow — where the next API call would otherwise exceed the model's
72
+ # input limit minus the reserved buffer.
73
+ #
74
+ # Opencode fires at `count >= limit.input - reserved` (overflow.ts:22) —
75
+ # no extra ratio. We mirror that here. The sole safety margin is
76
+ # COMPACTION_BUFFER_TOKENS, which is 30K (vs opencode's 20K) to give a bit
77
+ # more headroom for output + tool definitions + estimation noise, since
78
+ # we don't yet have a reactive overflow handler to catch the edge case.
79
+ COMPACTION_BUFFER_TOKENS = 30_000
46
80
  # Default model context limits (input tokens)
47
81
  MODEL_CONTEXT_LIMITS: dict[str, int] = {
48
82
  # Anthropic
@@ -114,61 +148,97 @@ Structured list of file paths relevant to continuing the work (one per line)."""
114
148
 
115
149
  # ── Layer 1: Pruning ──────────────────────────────────────────────
116
150
 
151
+ def _tool_result_content_len(msg: dict) -> int:
152
+ """Sum of content length of all non-cleared tool_result blocks in a message.
153
+
154
+ Mirrors opencode's prune walk, which accumulates only
155
+ `Token.estimate(part.state.output)` for `ToolPart`s (compaction.ts:119).
156
+ Text blocks and tool_use args are ignored — they are not the thing
157
+ being freed. This means pruning only "consumes budget" for real tool
158
+ output, so text-heavy conversations with few tool calls never trip
159
+ the prune path.
160
+
161
+ Already-cleared tool_results (content == CLEARED_TOOL_RESULT) are
162
+ skipped so a second pass doesn't double-count them.
163
+ """
164
+ from aru.history_blocks import is_tool_result
165
+ total = 0
166
+ for block in msg.get("content", []):
167
+ if is_tool_result(block):
168
+ content = block.get("content")
169
+ if content == CLEARED_TOOL_RESULT:
170
+ continue
171
+ if content is None:
172
+ continue
173
+ # tool_result content can be a string or a list of blocks —
174
+ # stringify to get a char count that roughly tracks tokens.
175
+ total += len(str(content))
176
+ return total
177
+
178
+
117
179
  def _get_prune_protect_chars(model_id: str = "default") -> int:
118
- """Scale protection window based on model context size.
119
-
120
- Returns the number of chars worth of recent history that should NEVER
121
- be pruned. The remaining history beyond this window is eligible for
122
- reversible pruning.
123
-
124
- Sizing rationale: the target is a steady-state per-call context
125
- window of ~20K tokens (what the user sees in the status bar), which
126
- means protected history should be ~17K tokens = ~60K chars. This
127
- floor is applied to every model; larger models get more protection
128
- scaled at ~7% of their context, capped at 200K chars (~57K tokens)
129
- to avoid protecting too much in 1M-context models where the extra
130
- history hurts prompt caching.
180
+ """Chars of recent history that must NEVER be pruned.
181
+
182
+ Flat value across all models, mirroring opencode's fixed
183
+ `PRUNE_PROTECT = 40_000` tokens (compaction.ts:36). At ~4 chars/token
184
+ that's 160K chars of tool-result content kept intact in the recent
185
+ window. Older tool_result blocks beyond this budget are eligible for
186
+ the lossy clear pass in `prune_history`.
187
+
188
+ Why flat (not scaled by model): opencode validated this in production
189
+ on contexts from 128K to 1M — scaling by ratio adds complexity without
190
+ improving behavior, and protecting too much in 1M-context models can
191
+ actually hurt prompt caching by keeping rarely-touched tail content warm.
192
+
193
+ The `model_id` parameter is retained for signature compatibility with
194
+ older call sites; it has no effect on the returned value.
131
195
  """
132
- limit = MODEL_CONTEXT_LIMITS.get(model_id, MODEL_CONTEXT_LIMITS["default"])
133
- # ~4 chars per token, protect ~7% of context as the ratio ceiling
134
- ratio_based = int(limit * 0.07 * 4)
135
- # Floor of 60K chars (~17K tokens) keeps the user-visible context
136
- # window around 20K tokens steady-state after system + cache + output
137
- # overheads. Applies to any model where 7% would be smaller.
138
- return max(60_000, min(ratio_based, 200_000))
196
+ del model_id # unused — kept for signature compatibility
197
+ return 160_000
139
198
 
140
199
 
141
200
  def prune_history(
142
201
  history: list[dict], model_id: str = "default"
143
202
  ) -> list[dict]:
144
- """Reduce history token footprint by dropping old content blocks.
145
-
146
- Operates on block-shaped history (see `aru.history_blocks`). The
147
- algorithm walks backward accumulating a char budget, and for any
148
- message that falls outside the protection window:
149
-
150
- - `text` blocks on assistant messages → replaced with `[cleared]`
151
- text block.
152
- - Large `text` blocks on user messages → truncated to first N chars.
153
- - `tool_use` blocks dropped **together with** their matching
154
- `tool_result` block in the subsequent tool/user message. Dropping
155
- them atomically is required: Anthropic's API rejects orphans with
156
- `400: tool_use_id not found`.
157
- - `tool_result` blocks dropped only when their paired `tool_use`
158
- is also dropped.
159
-
160
- Protection layers:
203
+ """Reduce history token footprint by clearing old tool result content.
204
+
205
+ Operates on block-shaped history (see `aru.history_blocks`). Matches
206
+ opencode's approach: the ONLY lossy operation is replacing the
207
+ content of old `tool_result` blocks with a short placeholder. Text
208
+ blocks (user and assistant), `tool_use` blocks, and block structure
209
+ are always preserved so the original ask survives verbatim until
210
+ real overflow forces a full compaction.
211
+
212
+ **Budget semantics** (opencode parity): the walk backward accumulates
213
+ **only tool_result content chars**, not whole-message chars. Text
214
+ blocks and tool_use args don't consume the protection budget, because
215
+ they aren't what prune can free. Consequences:
216
+ - Text-heavy conversations with few tool calls never trigger prune.
217
+ - Prune only fires when there is >= `protect_chars + PRUNE_MINIMUM_CHARS`
218
+ of tool_result content total — mirroring opencode's
219
+ `total > PRUNE_PROTECT + PRUNE_MINIMUM`.
220
+ - The "is it worth pruning?" dry-run check from opencode
221
+ (`pruned > PRUNE_MINIMUM`) is implicit: we cannot enter the loop
222
+ without enough prunable content, and once in the loop any walk
223
+ past `protect_chars` is guaranteed to be freeing real bytes.
224
+
225
+ Protection layers (applied on top of the budget walk):
161
226
  1. Turn-based: last `PRUNE_PROTECT_TURNS` user turns always kept
162
- intact, along with the assistant response right after each.
163
- 2. Char-based: recent content within the protection window is kept.
227
+ intact, plus the assistant response right after each. Index 0
228
+ (the original user ask) is also always protected.
229
+ 2. Budget-based: tool_result content within the 160K protect window
230
+ (~40K tokens, matching opencode) is kept.
164
231
  3. Content-based: messages whose stringified content contains any
165
232
  `PRUNE_PROTECTED_MARKERS` or `PRUNE_PROTECTED_TOOLS` never prune.
233
+ 4. Summary checkpoint: walking backward stops at any message marked
234
+ `summary: True` (a previous compaction's assistant output).
235
+ Everything before a summary was already consolidated and must
236
+ not be re-processed.
166
237
 
167
238
  Returns a new list (does not mutate the input).
168
239
  """
169
240
  from aru.history_blocks import (
170
- coerce_history_item, item_char_len, item_text,
171
- is_text, is_tool_use, is_tool_result, text_block,
241
+ coerce_history_item, item_text, is_tool_result,
172
242
  )
173
243
 
174
244
  if len(history) <= 2:
@@ -177,11 +247,15 @@ def prune_history(
177
247
  protect_chars = _get_prune_protect_chars(model_id)
178
248
  result = [coerce_history_item(m) for m in history]
179
249
 
180
- total_chars = sum(item_char_len(msg) for msg in result)
181
- if total_chars < protect_chars + PRUNE_MINIMUM_CHARS:
250
+ # Entry gate mirrors opencode: only proceed if total tool output
251
+ # exceeds protect + minimum. Text length is irrelevant.
252
+ total_tool_chars = sum(_tool_result_content_len(msg) for msg in result)
253
+ if total_tool_chars < protect_chars + PRUNE_MINIMUM_CHARS:
182
254
  return result
183
255
 
184
- # Identify indices of last N user turns (always protected)
256
+ # Identify indices of last N user turns (always protected) and index 0
257
+ # (the original user ask, protected defensively so the anchor never
258
+ # evaporates even if future edits change the budget calculus).
185
259
  turn_protected: set[int] = set()
186
260
  user_turns_seen = 0
187
261
  for i in range(len(result) - 1, -1, -1):
@@ -191,108 +265,60 @@ def prune_history(
191
265
  turn_protected.add(i)
192
266
  if i + 1 < len(result):
193
267
  turn_protected.add(i + 1)
194
-
195
- # Build a map of tool_use_id → (assistant_idx, user_idx) so we can
196
- # drop both halves of a pair atomically. The user_idx points to the
197
- # next message(s) after the assistant carrying the matching tool_result.
198
- tool_pair_loc: dict[str, tuple[int, int]] = {}
199
- for i, msg in enumerate(result):
200
- if msg["role"] != "assistant":
201
- continue
202
- for block in msg["content"]:
203
- if not is_tool_use(block):
204
- continue
205
- tu_id = block.get("id")
206
- if not tu_id:
207
- continue
208
- # Look forward for the matching tool_result (usually i+1)
209
- for j in range(i + 1, min(i + 3, len(result))):
210
- for rb in result[j]["content"]:
211
- if is_tool_result(rb) and rb.get("tool_use_id") == tu_id:
212
- tool_pair_loc[tu_id] = (i, j)
213
- break
214
- if tu_id in tool_pair_loc:
215
- break
216
-
217
- # Walk backward, protecting recent content
268
+ if result and result[0]["role"] == "user":
269
+ turn_protected.add(0)
270
+ if len(result) > 1:
271
+ turn_protected.add(1)
272
+
273
+ # Walk backward accumulating ONLY tool_result content chars into the
274
+ # protection budget. Messages with no tool_result (pure text, or just
275
+ # tool_use) consume zero budget and are skipped without pruning.
218
276
  protected = 0
219
- dropped_tool_use_ids: set[str] = set()
220
277
 
221
278
  for i in range(len(result) - 1, -1, -1):
222
279
  msg = result[i]
223
- msg_len = item_char_len(msg)
280
+
281
+ # Stop at the previous compaction summary marker — everything
282
+ # before it was already consolidated into the summary.
283
+ if msg.get("summary"):
284
+ break
285
+
286
+ tool_chars = _tool_result_content_len(msg)
287
+
288
+ # No prunable content here — nothing to clear, nothing to count.
289
+ if tool_chars == 0:
290
+ continue
224
291
 
225
292
  if i in turn_protected:
226
- protected += msg_len
293
+ protected += tool_chars
227
294
  continue
228
295
 
229
- if protected + msg_len <= protect_chars:
230
- protected += msg_len
296
+ if protected + tool_chars <= protect_chars:
297
+ protected += tool_chars
231
298
  continue
232
299
 
233
300
  # Outside protection window — check content-based protection
234
301
  text_view = item_text(msg)
235
302
  if (any(marker in text_view for marker in PRUNE_PROTECTED_MARKERS)
236
303
  or any(tool in text_view for tool in PRUNE_PROTECTED_TOOLS)):
237
- protected += msg_len
304
+ protected += tool_chars
238
305
  continue
239
306
 
240
- # Prune this message's blocks
307
+ # Clear any tool_result payloads in this message. Leave every
308
+ # other block (text, tool_use, thinking, etc.) untouched.
241
309
  new_blocks: list[dict] = []
242
310
  for block in msg["content"]:
243
- if is_text(block):
244
- if msg["role"] == "assistant":
245
- # Replace with a single placeholder (only if not already)
246
- if not new_blocks or new_blocks[-1].get("text") != PRUNED_PLACEHOLDER:
247
- new_blocks.append(text_block(PRUNED_PLACEHOLDER))
248
- elif msg["role"] == "user":
249
- text = block.get("text", "")
250
- if len(text) > PRUNE_USER_MSG_THRESHOLD:
251
- truncated = (
252
- text[:PRUNE_USER_MSG_KEEP]
253
- + f"\n\n[... {len(text) - PRUNE_USER_MSG_KEEP:,} "
254
- "chars pruned to save context ...]"
255
- )
256
- new_blocks.append(text_block(truncated))
257
- else:
258
- new_blocks.append(block)
259
- else:
260
- new_blocks.append(block)
261
- elif is_tool_use(block):
262
- # Drop the tool_use entirely and mark its id for paired removal
263
- tu_id = block.get("id")
264
- if tu_id:
265
- dropped_tool_use_ids.add(tu_id)
266
- # Do NOT add to new_blocks
267
- elif is_tool_result(block):
268
- # Drop only if its paired tool_use is also being dropped
269
- tu_id = block.get("tool_use_id")
270
- if tu_id in dropped_tool_use_ids:
271
- pass # drop
272
- else:
273
- new_blocks.append(block)
311
+ if is_tool_result(block) and block.get("content") != CLEARED_TOOL_RESULT:
312
+ new_blocks.append({
313
+ "type": "tool_result",
314
+ "tool_use_id": block.get("tool_use_id"),
315
+ "content": CLEARED_TOOL_RESULT,
316
+ })
274
317
  else:
275
318
  new_blocks.append(block)
276
319
 
277
320
  result[i] = {"role": msg["role"], "content": new_blocks}
278
321
 
279
- # Second pass: any tool_result blocks in user messages whose tool_use
280
- # was dropped on a previous pass (covers case where user msg was
281
- # inside protection but its paired assistant was outside).
282
- if dropped_tool_use_ids:
283
- for idx, msg in enumerate(result):
284
- if not msg["content"]:
285
- continue
286
- filtered = [
287
- b for b in msg["content"]
288
- if not (is_tool_result(b) and b.get("tool_use_id") in dropped_tool_use_ids)
289
- ]
290
- if len(filtered) != len(msg["content"]):
291
- result[idx] = {"role": msg["role"], "content": filtered}
292
-
293
- # Drop any messages that ended up with zero blocks (valid but useless)
294
- result = [m for m in result if m["content"]]
295
-
296
322
  return result
297
323
 
298
324
 
@@ -443,58 +469,78 @@ def should_compact(
443
469
  ) -> bool:
444
470
  """Check if the conversation should be compacted.
445
471
 
446
- Triggers on EITHER condition:
447
- 1. Token-based: tokens >= usable_context * threshold_ratio
448
- 2. Turn-based: user turns >= COMPACTION_MAX_TURNS (prevents slow token creep)
472
+ Fires when the per-call context window reaches real overflow:
473
+ `tokens >= limit - COMPACTION_BUFFER_TOKENS`.
474
+
475
+ Matches opencode's `isOverflow` in overflow.ts:22 — `count >= usable`,
476
+ no extra ratio. Routine context reduction is handled by `prune_history`
477
+ (lossy only on tool outputs), so compaction is reserved for genuine
478
+ overflow where the next API call would otherwise exceed the model's
479
+ input limit minus the reserved buffer.
449
480
 
450
481
  Accepts either an estimated token count (int) or the history list.
451
482
  """
452
483
  if isinstance(history_or_tokens, list):
453
- history = history_or_tokens
454
- tokens = estimate_history_tokens(history)
455
- # Turn-based trigger: count user messages
456
- user_turns = sum(1 for m in history if m["role"] == "user")
457
- if user_turns >= COMPACTION_MAX_TURNS:
458
- return True
484
+ tokens = estimate_history_tokens(history_or_tokens)
459
485
  else:
460
486
  tokens = history_or_tokens
461
487
 
462
488
  limit = MODEL_CONTEXT_LIMITS.get(model_id, MODEL_CONTEXT_LIMITS["default"])
463
489
  usable = limit - COMPACTION_BUFFER_TOKENS
464
- threshold = int(usable * COMPACTION_THRESHOLD_RATIO)
465
- return tokens >= threshold
490
+ return tokens >= usable
466
491
 
467
492
 
468
493
  def would_prune(history: list[dict], model_id: str = "default") -> bool:
469
494
  """Check if prune_history would discard content from this history.
470
495
 
471
- Uses the exact same criteria as prune_history: total chars exceed
472
- the protection window + minimum prunable threshold.
496
+ Uses the same entry gate as `prune_history`: total tool_result
497
+ content must exceed the protection window + minimum prunable
498
+ threshold. Text and tool_use args are not counted — only real
499
+ prunable output. Mirrors opencode's logic.
473
500
  """
474
- from aru.history_blocks import item_char_len
475
501
  if len(history) <= 2:
476
502
  return False
477
- total_chars = sum(item_char_len(msg) for msg in history)
503
+ total_tool_chars = sum(_tool_result_content_len(msg) for msg in history)
478
504
  protect_chars = _get_prune_protect_chars(model_id)
479
- return total_chars >= protect_chars + PRUNE_MINIMUM_CHARS
505
+ return total_tool_chars >= protect_chars + PRUNE_MINIMUM_CHARS
480
506
 
481
507
 
482
508
  def _split_history(history: list[dict], model_id: str = "default") -> tuple[list[dict], list[dict]]:
483
509
  """Split history into old (to summarize) and recent (to keep intact).
484
510
 
485
- Uses the same protection window as pruning.
511
+ Uses `COMPACT_RECENT_CHARS` (80K chars ≈ 20K tokens) as the "recent"
512
+ budget — half of the prune protect window. Rationale: the compactor
513
+ now runs on the main model and produces high-fidelity summaries, so
514
+ we don't need 40K of recent overlap as a safety net. 20K covers 3-6
515
+ recent turns verbatim, which is enough to absorb the gap between
516
+ the last summarized state and the next turn.
517
+
518
+ Defensively, the first user turn (index 0) is always pulled into
519
+ `recent` so the original ask survives literal even through a full
520
+ compaction — the compactor extracts it into the `## Goal` section
521
+ of the summary, but keeping it in recent too means the agent can
522
+ quote it verbatim afterward.
523
+
524
+ The `model_id` parameter is retained for signature compatibility;
525
+ the recent budget is a flat value not scaled by model context.
486
526
  """
527
+ del model_id # unused — recent budget is flat across models
487
528
  from aru.history_blocks import item_char_len
488
- protect_chars = _get_prune_protect_chars(model_id)
489
529
  protected = 0
490
530
  split_idx = len(history)
491
531
  for i in range(len(history) - 1, -1, -1):
492
532
  msg_len = item_char_len(history[i])
493
- if protected + msg_len <= protect_chars:
533
+ if protected + msg_len <= COMPACT_RECENT_CHARS:
494
534
  protected += msg_len
495
535
  split_idx = i
496
536
  else:
497
537
  break
538
+
539
+ # Defensive: force the first user turn into `recent` even if the
540
+ # protect budget would have sent it to `old`. The original ask is
541
+ # the session anchor and must stay literal.
542
+ if split_idx > 0 and history and history[0].get("role") == "user":
543
+ return history[1:split_idx], [history[0]] + history[split_idx:]
498
544
  return history[:split_idx], history[split_idx:]
499
545
 
500
546
 
@@ -563,12 +609,13 @@ def apply_compaction(
563
609
  The summary is emitted as a synthetic user→assistant exchange so that
564
610
  role alternation stays natural:
565
611
  [user: "Please summarize..."]
566
- [assistant: "<summary>"]
612
+ [assistant: "<summary>", summary=True]
567
613
  + recent messages as-is
568
614
 
569
- This shape avoids the `[user, user, ...]` sequence that previously
570
- biased the model toward describing actions rather than emitting
571
- structured tool calls.
615
+ The assistant summary is marked with `summary: True` as a checkpoint.
616
+ `prune_history` walks backward and stops at this marker, so content
617
+ already consolidated into the summary is never re-processed. Mirrors
618
+ opencode's `msg.info.summary` flag (see message-v2.ts:914).
572
619
  """
573
620
  from aru.history_blocks import text_block, coerce_history_item
574
621
  _, recent = _split_history(history, model_id)
@@ -581,6 +628,7 @@ def apply_compaction(
581
628
  {
582
629
  "role": "assistant",
583
630
  "content": [text_block(f"Prior conversation summary:\n\n{summary}")],
631
+ "summary": True,
584
632
  },
585
633
  ]
586
634
  compacted.extend(coerce_history_item(m) for m in recent)
@@ -596,10 +644,20 @@ async def compact_conversation(
596
644
  ) -> list[dict[str, str]]:
597
645
  """Run the compaction agent to summarize and replace history.
598
646
 
599
- Uses a small/fast model for the summarization to minimize cost.
600
- Falls back to simple truncation if the agent call fails.
647
+ Uses the **same model** as the main session (`model_ref`), not a
648
+ cheaper small model. Rationale:
649
+
650
+ - Compaction is rare (only on real overflow, ~0-2× per long session).
651
+ - The summary is the *only* persistent record of pre-window history.
652
+ - A weaker compactor risks dropping subtle decisions that the main
653
+ model would have caught — and once dropped, they cannot be recovered
654
+ mid-session.
655
+ - The marginal cost (Sonnet: ~$0.20-0.40 per session; Opus: a few
656
+ dollars) is justified by the fidelity gain on a non-recoverable
657
+ step.
658
+
659
+ Falls back to a mechanical summary if the agent call fails.
601
660
  """
602
- from aru.runtime import get_ctx
603
661
  from aru.providers import create_model
604
662
 
605
663
  prompt = build_compaction_prompt(history, plan_task, model_id=model_id)
@@ -607,16 +665,17 @@ async def compact_conversation(
607
665
  try:
608
666
  from agno.agent import Agent
609
667
 
610
- small_ref = get_ctx().small_model_ref
611
668
  compactor = Agent(
612
669
  name="Compactor",
613
- model=create_model(small_ref, max_tokens=4096),
670
+ model=create_model(model_ref, max_tokens=4096),
614
671
  instructions=(
615
672
  "You summarize coding conversations concisely. Output ONLY the requested sections, no preamble. "
616
673
  "Preserve: user goals, explicit instructions/preferences, file paths with line numbers, "
617
- "function/class names that were modified, what remains to be done, AND verbatim excerpts "
618
- "from any file contents shown in the conversation (signatures, critical constants, "
619
- "bug-related lines) under the '## File contents (key excerpts)' section. "
674
+ "function/class names that were modified, what remains to be done. "
675
+ "For the '## File contents (key excerpts)' section, use your judgment: "
676
+ "if a file was central to the work (being debugged, actively edited, or referenced "
677
+ "in a decision), include the critical lines verbatim; if a file was only briefly "
678
+ "read for context, just list the path. Do not mechanically copy everything. "
620
679
  "Drop: greetings, reasoning chains, redundant tool output, transient status messages."
621
680
  ),
622
681
  markdown=True,
@@ -402,9 +402,12 @@ class Session:
402
402
  self.history.append({"role": role, "content": blocks})
403
403
  # Hard cap as safety net — structured pruning/compaction in
404
404
  # aru/context.py handles the normal case; this only fires if
405
- # something bypasses them.
406
- if len(self.history) > 60:
407
- self.history = self.history[-60:]
405
+ # something bypasses them. Set high enough that long sessions
406
+ # (which now accumulate more messages because prune is
407
+ # non-destructive for text and compact rarely fires) don't hit
408
+ # this destructive path routinely.
409
+ if len(self.history) > 300:
410
+ self.history = self.history[-300:]
408
411
 
409
412
  def add_structured_message(self, role: str, blocks: list[dict]):
410
413
  """Explicitly add a message with pre-built content blocks.
@@ -1,6 +1,6 @@
1
1
  Metadata-Version: 2.4
2
2
  Name: aru-code
3
- Version: 0.18.0
3
+ Version: 0.19.1
4
4
  Summary: A Claude Code clone built with Agno agents
5
5
  Author-email: Estevao <estevaofon@gmail.com>
6
6
  License-Expression: MIT
@@ -4,7 +4,7 @@ build-backend = "setuptools.build_meta"
4
4
 
5
5
  [project]
6
6
  name = "aru-code"
7
- version = "0.18.0"
7
+ version = "0.19.1"
8
8
  description = "A Claude Code clone built with Agno agents"
9
9
  readme = "README.md"
10
10
  license = "MIT"
@@ -314,11 +314,12 @@ class TestSession:
314
314
 
315
315
  def test_add_message_caps_history(self):
316
316
  session = Session()
317
- for i in range(75):
317
+ for i in range(350):
318
318
  session.add_message("user", f"msg {i}")
319
- # History is bounded by a hard cap (structured compaction in
320
- # aru.context handles the normal-path token management).
321
- assert len(session.history) <= 60
319
+ # History is bounded by a hard safety cap (structured pruning/
320
+ # compaction in aru.context handles the normal-path token
321
+ # management; this cap only fires on pathological growth).
322
+ assert len(session.history) <= 300
322
323
 
323
324
  def test_set_plan(self):
324
325
  session = Session()
@@ -176,17 +176,26 @@ class TestPrunePreservesPairs:
176
176
  """Fix 6: pruning must never orphan tool_use / tool_result blocks."""
177
177
 
178
178
  def test_prune_drops_tool_pair_atomically(self):
179
- """An old assistant with tool_use must have its tool_result dropped too."""
180
- # Build a history large enough to force pruning
181
- filler = "x" * 50_000
179
+ """An old tool_result whose content gets cleared must still keep
180
+ its block (matching tool_use_id), so the tool_use/tool_result
181
+ pair is never orphaned.
182
+
183
+ Opencode-aligned budget: prune only counts tool_result content
184
+ chars, so the history needs multiple large tool_result payloads
185
+ to clear the 240K entry gate.
186
+ """
187
+ big_output = "old file line\n" * 8_000 # ~100K chars per result
182
188
  history = [
183
189
  {"role": "user", "content": "request 1"},
184
- _assistant_tool_turn("old_tu", "read_file", {"path": "old.py"}, filler),
185
- _tool_result_turn("old_tu", "old contents " * 1000),
190
+ _assistant_tool_turn("old_tu", "read_file", {"path": "old.py"}),
191
+ _tool_result_turn("old_tu", big_output),
186
192
  {"role": "user", "content": "request 2"},
187
- {"role": "assistant", "content": "response 2"},
188
- {"role": "user", "content": "recent request " * 5000},
189
- {"role": "assistant", "content": "recent response"},
193
+ _assistant_tool_turn("mid_tu", "read_file", {"path": "mid.py"}),
194
+ _tool_result_turn("mid_tu", big_output),
195
+ {"role": "user", "content": "request 3"},
196
+ _assistant_tool_turn("recent_tu", "read_file", {"path": "new.py"}),
197
+ _tool_result_turn("recent_tu", big_output),
198
+ {"role": "user", "content": "summarize"},
190
199
  ]
191
200
 
192
201
  pruned = prune_history(history, model_id="default")
@@ -208,10 +217,20 @@ class TestPrunePreservesPairs:
208
217
  )
209
218
 
210
219
  def test_prune_keeps_recent_tool_pair(self):
211
- """A tool_use/tool_result pair inside the protection window must be kept."""
220
+ """A tool_use/tool_result pair inside the protection window must be
221
+ kept with its content intact, even when older tool_results get cleared.
222
+
223
+ Builds a history with two big old tool_results (enough to trigger
224
+ prune) and one small recent pair that must survive verbatim.
225
+ """
226
+ big_old = "old file content\n" * 10_000 # ~170K chars each
212
227
  history = [
213
- {"role": "user", "content": "old stuff " * 50_000},
214
- {"role": "assistant", "content": "old response " * 10_000},
228
+ {"role": "user", "content": "req 1"},
229
+ _assistant_tool_turn("tu_old1", "read_file", {"path": "a.py"}, "reading"),
230
+ _tool_result_turn("tu_old1", big_old),
231
+ {"role": "user", "content": "req 2"},
232
+ _assistant_tool_turn("tu_old2", "read_file", {"path": "b.py"}, "reading"),
233
+ _tool_result_turn("tu_old2", big_old),
215
234
  {"role": "user", "content": "read foo"},
216
235
  _assistant_tool_turn("tu_recent", "read_file", {"path": "foo.py"}, "reading"),
217
236
  _tool_result_turn("tu_recent", "def foo(): pass"),
@@ -227,6 +246,10 @@ class TestPrunePreservesPairs:
227
246
 
228
247
  assert len(tool_uses) == 1, "Recent tool_use was incorrectly pruned"
229
248
  assert len(tool_results) == 1, "Recent tool_result was incorrectly pruned"
249
+ # Recent content must be intact (not cleared)
250
+ assert tool_results[0].get("content") == "def foo(): pass", (
251
+ "Recent tool_result content was cleared — should be inside protection window"
252
+ )
230
253
 
231
254
  def test_prune_with_no_pairs_still_works(self):
232
255
  """Pure text history should prune without errors."""
@@ -10,8 +10,16 @@ from aru.context import (
10
10
  apply_compaction,
11
11
  build_compaction_prompt,
12
12
  format_context_block,
13
+ CLEARED_TOOL_RESULT,
14
+ )
15
+ from aru.history_blocks import (
16
+ coerce_history,
17
+ item_text,
18
+ tool_use_block,
19
+ tool_result_block,
20
+ text_block,
21
+ is_tool_result,
13
22
  )
14
- from aru.history_blocks import coerce_history, item_text
15
23
 
16
24
 
17
25
  class TestPruneHistory:
@@ -27,37 +35,103 @@ class TestPruneHistory:
27
35
  # Input is auto-coerced to block form on return
28
36
  assert result == coerce_history(messages)
29
37
 
30
- def test_prunes_old_assistant_messages(self):
31
- """Should prune old assistant messages when over threshold."""
32
- old_content = "x" * 30000
33
- recent_content = "y" * 10000
38
+ def test_prunes_old_tool_results_when_over_threshold(self):
39
+ """Should clear old tool_result content when total tool output
40
+ exceeds protect + minimum (opencode-aligned budget semantics).
41
+
42
+ The budget walks backward over tool_result content chars only.
43
+ Text and tool_use args don't count, so this test uses large
44
+ tool_result payloads to actually trip the prune path.
45
+ """
46
+ # Three rounds of read_file-sized outputs. Total ~300K chars
47
+ # of tool_result content — clears the 240K entry gate, and
48
+ # the 160K protect budget will cover only the most recent one.
49
+ big_output = "line of code\n" * 8_000 # ~100K chars
34
50
  messages = [
35
- {"role": "user", "content": "First request"},
36
- {"role": "assistant", "content": old_content},
37
- {"role": "user", "content": "Second request"},
38
- {"role": "assistant", "content": recent_content},
51
+ {"role": "user", "content": "round 1"},
52
+ {
53
+ "role": "assistant",
54
+ "content": [
55
+ text_block("reading"),
56
+ tool_use_block("tu_old", "read_file", {"path": "a.py"}),
57
+ ],
58
+ },
59
+ {"role": "tool", "content": [tool_result_block("tu_old", big_output)]},
60
+ {"role": "user", "content": "round 2"},
61
+ {
62
+ "role": "assistant",
63
+ "content": [
64
+ text_block("reading"),
65
+ tool_use_block("tu_mid", "read_file", {"path": "b.py"}),
66
+ ],
67
+ },
68
+ {"role": "tool", "content": [tool_result_block("tu_mid", big_output)]},
69
+ {"role": "user", "content": "round 3"},
70
+ {
71
+ "role": "assistant",
72
+ "content": [
73
+ text_block("reading"),
74
+ tool_use_block("tu_recent", "read_file", {"path": "c.py"}),
75
+ ],
76
+ },
77
+ {"role": "tool", "content": [tool_result_block("tu_recent", big_output)]},
78
+ {"role": "user", "content": "what did you find?"},
39
79
  ]
40
80
  result = prune_history(messages)
41
- # Should have placeholder for pruned content
42
- assert len(result) <= len(messages)
43
- # Recent messages should be preserved
44
- assert any("Second request" in str(m) for m in result)
45
-
46
- def test_preserves_user_messages(self):
47
- """Should always preserve user messages."""
48
- old_user = {"role": "user", "content": "Old user message"}
49
- old_assistant = {"role": "assistant", "content": "Old assistant " * 10000}
50
- recent = {"role": "user", "content": "Recent request"}
51
-
52
- messages = [old_user, old_assistant, recent]
53
- result = prune_history(messages)
54
81
 
55
- # User messages should be preserved (as placeholders or original)
56
- recent_preserved = any(
57
- m.get("role") == "user" and "Recent" in item_text(m)
58
- for m in result
82
+ # Same number of messages (prune never drops structure)
83
+ assert len(result) == len(messages)
84
+
85
+ # Collect tool_result blocks by tool_use_id
86
+ by_id: dict[str, dict] = {}
87
+ for msg in result:
88
+ for block in msg.get("content", []):
89
+ if is_tool_result(block):
90
+ by_id[block.get("tool_use_id")] = block
91
+
92
+ # All three pairs preserved at the block level
93
+ assert set(by_id.keys()) == {"tu_old", "tu_mid", "tu_recent"}
94
+
95
+ # Recent tool_result kept verbatim
96
+ assert by_id["tu_recent"]["content"] == big_output
97
+
98
+ # At least one of the older tool_results (tu_old / tu_mid) must
99
+ # have been cleared and now hold the placeholder, since only
100
+ # 160K chars' worth fits inside the protect window.
101
+ cleared_count = sum(
102
+ 1 for tu_id in ("tu_old", "tu_mid")
103
+ if by_id[tu_id]["content"] == CLEARED_TOOL_RESULT
104
+ )
105
+ assert cleared_count >= 1, (
106
+ "Expected at least one old tool_result to be cleared once "
107
+ "total output exceeded protect + minimum"
59
108
  )
60
- assert recent_preserved
109
+
110
+ def test_text_heavy_history_is_not_pruned(self):
111
+ """Conversations dominated by text (not tool output) must NOT
112
+ trigger prune even if total chars are huge.
113
+
114
+ This is the opencode-aligned semantics: text blocks don't enter
115
+ the prune budget. A 500K-char text history with no tool_results
116
+ is a no-op for prune_history.
117
+ """
118
+ messages = [
119
+ {"role": "user", "content": "long planning discussion " * 10_000},
120
+ {"role": "assistant", "content": "detailed reasoning " * 10_000},
121
+ {"role": "user", "content": "what's next?"},
122
+ {"role": "assistant", "content": "here's the plan " * 10_000},
123
+ ]
124
+ result = prune_history(messages)
125
+
126
+ # No tool_results exist anywhere in result
127
+ tool_results = [
128
+ b for m in result for b in m.get("content", []) if is_tool_result(b)
129
+ ]
130
+ assert tool_results == []
131
+ # Length preserved
132
+ assert len(result) == len(messages)
133
+ # No message content was altered to CLEARED_TOOL_RESULT
134
+ assert all(CLEARED_TOOL_RESULT not in item_text(m) for m in result)
61
135
 
62
136
  def test_empty_history(self):
63
137
  """Should handle empty history."""
@@ -108,20 +182,21 @@ class TestShouldCompact:
108
182
  """Tests for should_compact function."""
109
183
 
110
184
  def test_no_compaction_under_threshold(self):
111
- """Should not compact when under 50% of context limit."""
112
- # Default 200K tokens * 0.5 = 100K threshold; 5 tokens is well under
185
+ """Should not compact when well under the overflow threshold."""
186
+ # claude-sonnet-4-5 has 200K context; usable = 170K (buffer 30K).
187
+ # 5 tokens is well under.
113
188
  result = should_compact(5, model_id="claude-sonnet-4-5-20250929")
114
189
  assert result is False
115
190
 
116
191
  def test_compaction_over_threshold(self):
117
- """Should compact when over threshold."""
118
- # 300K tokens is over 50% of a 200K-token context window
192
+ """Should compact when over the real-overflow threshold."""
193
+ # 300K tokens is well over the 170K threshold of a 200K-context model.
119
194
  result = should_compact(300000, model_id="claude-sonnet-4-5-20250929")
120
195
  assert result is True
121
196
 
122
197
  def test_custom_context_limit(self):
123
198
  """Should respect custom context limit."""
124
- # gpt-4o has 128K context, 50% = 64K; 50K is under threshold
199
+ # gpt-4o has 128K context; usable = 98K. 50K is under.
125
200
  result = should_compact(50000, model_id="gpt-4o")
126
201
  assert isinstance(result, bool)
127
202
 
@@ -145,7 +220,8 @@ class TestCompactionTriggerUsesPerCallMetric:
145
220
 
146
221
  def test_small_per_call_window_does_not_fire(self):
147
222
  """Reproduces the exact bug report: per-call ~20K on qwen3.6-plus
148
- (128K limit, ~75.6K threshold) must NOT trigger compaction."""
223
+ (128K limit, ~98K threshold with 30K buffer) must NOT
224
+ trigger compaction."""
149
225
  # Values taken from the real session where compaction fired incorrectly:
150
226
  # "context: 20,184 (in: 16,652 / out: 696 / cache_read: 2,836)"
151
227
  last_input = 16_652
@@ -158,7 +234,7 @@ class TestCompactionTriggerUsesPerCallMetric:
158
234
  )
159
235
  assert last_call_window == 20_184, "window computation changed"
160
236
 
161
- # 20K is ~3.7× below the 75.6K threshold for a 128K-context model
237
+ # 20K is far below the ~98K threshold for a 128K-context model
162
238
  assert should_compact(last_call_window, model_id="qwen3.6-plus") is False, (
163
239
  "Compaction fired on a small per-call window. The runner is "
164
240
  "probably passing cumulative tokens (run_output.metrics.input_tokens) "
@@ -169,7 +245,9 @@ class TestCompactionTriggerUsesPerCallMetric:
169
245
  def test_large_per_call_window_still_fires(self):
170
246
  """Positive case: compaction must still fire when the last-call
171
247
  window actually approaches the model's context limit."""
172
- last_input = 80_000
248
+ # qwen3.6-plus: 128K limit, usable = 98K (buffer 30K).
249
+ # 105K input + 2K output + 0 cache = 107K window → must fire.
250
+ last_input = 105_000
173
251
  last_output = 2_000
174
252
  last_cache_read = 0
175
253
  last_cache_write = 0
@@ -177,17 +255,17 @@ class TestCompactionTriggerUsesPerCallMetric:
177
255
  last_call_window = (
178
256
  last_input + last_output + last_cache_read + last_cache_write
179
257
  )
180
- assert last_call_window == 82_000
258
+ assert last_call_window == 107_000
181
259
 
182
- # 82K > 75.6K threshold → must fire
260
+ # 107K > 98K threshold → must fire
183
261
  assert should_compact(last_call_window, model_id="qwen3.6-plus") is True
184
262
 
185
263
  def test_cumulative_metric_is_the_wrong_signal(self):
186
264
  """Illustrates WHY the old approach was wrong: a cumulative sum of
187
- 5 API calls at 18K each is 90K (above threshold), but the actual
265
+ 6 API calls at 18K each is 108K (above threshold), but the actual
188
266
  per-call window each time is only 18K (well below)."""
189
267
  per_call_window = 18_000
190
- num_api_calls_in_turn = 5
268
+ num_api_calls_in_turn = 6
191
269
  cumulative_if_summed = per_call_window * num_api_calls_in_turn
192
270
 
193
271
  # Old (wrong) behavior: cumulative triggers compaction
@@ -196,8 +274,8 @@ class TestCompactionTriggerUsesPerCallMetric:
196
274
  # New (correct) behavior: per-call does NOT trigger compaction
197
275
  assert should_compact(per_call_window, model_id="qwen3.6-plus") is False
198
276
 
199
- # The difference is the entire bug
200
- assert cumulative_if_summed > 75_600 > per_call_window
277
+ # The difference is the entire bug (threshold is 98K for qwen3.6-plus)
278
+ assert cumulative_if_summed > 98_000 > per_call_window
201
279
 
202
280
  def test_runner_source_uses_per_call_metric(self):
203
281
  """Static check against silent regression.
@@ -1 +0,0 @@
1
- __version__ = "0.18.0"
File without changes
File without changes
File without changes
File without changes
File without changes
File without changes
File without changes
File without changes
File without changes
File without changes
File without changes
File without changes
File without changes
File without changes
File without changes
File without changes