aru-code 0.18.0__tar.gz → 0.19.0__tar.gz

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (59)
  1. {aru_code-0.18.0/aru_code.egg-info → aru_code-0.19.0}/PKG-INFO +1 -1
  2. aru_code-0.19.0/aru/__init__.py +1 -0
  3. {aru_code-0.18.0 → aru_code-0.19.0}/aru/context.py +187 -166
  4. {aru_code-0.18.0 → aru_code-0.19.0}/aru/session.py +6 -3
  5. {aru_code-0.18.0 → aru_code-0.19.0/aru_code.egg-info}/PKG-INFO +1 -1
  6. {aru_code-0.18.0 → aru_code-0.19.0}/pyproject.toml +1 -1
  7. {aru_code-0.18.0 → aru_code-0.19.0}/tests/test_cli.py +5 -4
  8. {aru_code-0.18.0 → aru_code-0.19.0}/tests/test_confabulation_regression.py +34 -11
  9. {aru_code-0.18.0 → aru_code-0.19.0}/tests/test_context.py +119 -41
  10. aru_code-0.18.0/aru/__init__.py +0 -1
  11. {aru_code-0.18.0 → aru_code-0.19.0}/LICENSE +0 -0
  12. {aru_code-0.18.0 → aru_code-0.19.0}/README.md +0 -0
  13. {aru_code-0.18.0 → aru_code-0.19.0}/aru/agent_factory.py +0 -0
  14. {aru_code-0.18.0 → aru_code-0.19.0}/aru/agents/__init__.py +0 -0
  15. {aru_code-0.18.0 → aru_code-0.19.0}/aru/agents/base.py +0 -0
  16. {aru_code-0.18.0 → aru_code-0.19.0}/aru/agents/executor.py +0 -0
  17. {aru_code-0.18.0 → aru_code-0.19.0}/aru/agents/planner.py +0 -0
  18. {aru_code-0.18.0 → aru_code-0.19.0}/aru/cache_patch.py +0 -0
  19. {aru_code-0.18.0 → aru_code-0.19.0}/aru/cli.py +0 -0
  20. {aru_code-0.18.0 → aru_code-0.19.0}/aru/commands.py +0 -0
  21. {aru_code-0.18.0 → aru_code-0.19.0}/aru/completers.py +0 -0
  22. {aru_code-0.18.0 → aru_code-0.19.0}/aru/config.py +0 -0
  23. {aru_code-0.18.0 → aru_code-0.19.0}/aru/display.py +0 -0
  24. {aru_code-0.18.0 → aru_code-0.19.0}/aru/history_blocks.py +0 -0
  25. {aru_code-0.18.0 → aru_code-0.19.0}/aru/permissions.py +0 -0
  26. {aru_code-0.18.0 → aru_code-0.19.0}/aru/providers.py +0 -0
  27. {aru_code-0.18.0 → aru_code-0.19.0}/aru/runner.py +0 -0
  28. {aru_code-0.18.0 → aru_code-0.19.0}/aru/runtime.py +0 -0
  29. {aru_code-0.18.0 → aru_code-0.19.0}/aru/tools/__init__.py +0 -0
  30. {aru_code-0.18.0 → aru_code-0.19.0}/aru/tools/ast_tools.py +0 -0
  31. {aru_code-0.18.0 → aru_code-0.19.0}/aru/tools/codebase.py +0 -0
  32. {aru_code-0.18.0 → aru_code-0.19.0}/aru/tools/gitignore.py +0 -0
  33. {aru_code-0.18.0 → aru_code-0.19.0}/aru/tools/mcp_client.py +0 -0
  34. {aru_code-0.18.0 → aru_code-0.19.0}/aru/tools/ranker.py +0 -0
  35. {aru_code-0.18.0 → aru_code-0.19.0}/aru/tools/tasklist.py +0 -0
  36. {aru_code-0.18.0 → aru_code-0.19.0}/aru_code.egg-info/SOURCES.txt +0 -0
  37. {aru_code-0.18.0 → aru_code-0.19.0}/aru_code.egg-info/dependency_links.txt +0 -0
  38. {aru_code-0.18.0 → aru_code-0.19.0}/aru_code.egg-info/entry_points.txt +0 -0
  39. {aru_code-0.18.0 → aru_code-0.19.0}/aru_code.egg-info/requires.txt +0 -0
  40. {aru_code-0.18.0 → aru_code-0.19.0}/aru_code.egg-info/top_level.txt +0 -0
  41. {aru_code-0.18.0 → aru_code-0.19.0}/setup.cfg +0 -0
  42. {aru_code-0.18.0 → aru_code-0.19.0}/tests/test_agents_base.py +0 -0
  43. {aru_code-0.18.0 → aru_code-0.19.0}/tests/test_cli_advanced.py +0 -0
  44. {aru_code-0.18.0 → aru_code-0.19.0}/tests/test_cli_base.py +0 -0
  45. {aru_code-0.18.0 → aru_code-0.19.0}/tests/test_cli_completers.py +0 -0
  46. {aru_code-0.18.0 → aru_code-0.19.0}/tests/test_cli_new.py +0 -0
  47. {aru_code-0.18.0 → aru_code-0.19.0}/tests/test_cli_run_cli.py +0 -0
  48. {aru_code-0.18.0 → aru_code-0.19.0}/tests/test_cli_session.py +0 -0
  49. {aru_code-0.18.0 → aru_code-0.19.0}/tests/test_cli_shell.py +0 -0
  50. {aru_code-0.18.0 → aru_code-0.19.0}/tests/test_codebase.py +0 -0
  51. {aru_code-0.18.0 → aru_code-0.19.0}/tests/test_config.py +0 -0
  52. {aru_code-0.18.0 → aru_code-0.19.0}/tests/test_executor.py +0 -0
  53. {aru_code-0.18.0 → aru_code-0.19.0}/tests/test_gitignore.py +0 -0
  54. {aru_code-0.18.0 → aru_code-0.19.0}/tests/test_main.py +0 -0
  55. {aru_code-0.18.0 → aru_code-0.19.0}/tests/test_mcp_client.py +0 -0
  56. {aru_code-0.18.0 → aru_code-0.19.0}/tests/test_permissions.py +0 -0
  57. {aru_code-0.18.0 → aru_code-0.19.0}/tests/test_planner.py +0 -0
  58. {aru_code-0.18.0 → aru_code-0.19.0}/tests/test_providers.py +0 -0
  59. {aru_code-0.18.0 → aru_code-0.19.0}/tests/test_ranker.py +0 -0
{aru_code-0.18.0/aru_code.egg-info → aru_code-0.19.0}/PKG-INFO
@@ -1,6 +1,6 @@
 Metadata-Version: 2.4
 Name: aru-code
-Version: 0.18.0
+Version: 0.19.0
 Summary: A Claude Code clone built with Agno agents
 Author-email: Estevao <estevaofon@gmail.com>
 License-Expression: MIT
aru_code-0.19.0/aru/__init__.py (new file)
@@ -0,0 +1 @@
+__version__ = "0.19.0"
{aru_code-0.18.0 → aru_code-0.19.0}/aru/context.py
@@ -1,28 +1,40 @@
 """Context management for token optimization.
 
-Implements three layers of token reduction:
-1. Pruning — evict old tool/assistant outputs from history
-2. Truncation — universal cap on tool output size
-3. Compaction — summarize entire conversation when approaching context limits
+Mirrors opencode's two-layer approach:
+
+1. **Prune** (routine, lossy only on tool outputs): walks old tool_result
+   blocks and replaces their content with a placeholder. User/assistant
+   text is NEVER touched — it survives verbatim until real overflow.
+   This is the steady-state memory mechanism. Matches cache_patch.py's
+   strategy at the Agno message layer.
+
+2. **Compact** (rare, lossy full summary): triggers only when the per-call
+   context window actually approaches the model's limit. Runs a
+   compaction agent that produces a structured summary (Goal / Instructions
+   / Discoveries / Accomplished / File contents / Relevant files) and
+   marks the resulting assistant message with `summary: True` so
+   subsequent prunes stop at that checkpoint.
+
+There is also a `truncate_output` layer used by individual tools to cap
+their own output size before it ever reaches history.
 """
 
 from __future__ import annotations
 
 # ── Constants ──────────────────────────────────────────────────────
 
-# Pruning: minimum chars that must be freeable to justify a prune pass
-PRUNE_MINIMUM_CHARS = 12_000  # ~3.5K tokens
-# Placeholder that replaces evicted content
-PRUNED_PLACEHOLDER = "[cleared]"
-# User messages larger than this threshold are truncated when outside protection window
-PRUNE_USER_MSG_THRESHOLD = 2_000  # ~570 tokens
-# How many chars to keep from the start of a pruned user message
-PRUNE_USER_MSG_KEEP = 500  # ~140 tokens
+# Pruning: minimum chars that must be freeable to justify a prune pass.
+# Matches opencode's PRUNE_MINIMUM = 20_000 tokens (~80K chars @ 4 chars/token).
+PRUNE_MINIMUM_CHARS = 80_000  # ~20K tokens
+# Placeholder that replaces cleared tool_result content. Matches
+# cache_patch.py's _PRUNED_PLACEHOLDER so both layers produce identical
+# text when a tool output is cleared.
+CLEARED_TOOL_RESULT = "[Old tool result cleared]"
 # Minimum number of recent user turns always protected (regardless of char budget)
 PRUNE_PROTECT_TURNS = 2
 # Tool result markers that should never be pruned (critical context)
 PRUNE_PROTECTED_MARKERS = {"[SubAgent-", "delegate_task"}
-# Tool names whose outputs should never be pruned (like OpenCode's PRUNE_PROTECTED_TOOLS)
+# Tool names whose outputs should never be pruned (like opencode's PRUNE_PROTECTED_TOOLS)
 # These are checked as substrings in message content (tool results include the tool name)
 PRUNE_PROTECTED_TOOLS = {"delegate_task"}
 
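As a sanity check on the new budget numbers, here is the arithmetic those comments rely on, written out. This is a sketch: `CHARS_PER_TOKEN`, `PRUNE_PROTECT_CHARS`, and `ENTRY_GATE_CHARS` are illustrative names, not constants in the module — the 160_000 protect value comes from `_get_prune_protect_chars` further down in this diff.

```python
CHARS_PER_TOKEN = 4  # rough heuristic used throughout the comments, not a tokenizer

PRUNE_MINIMUM_CHARS = 20_000 * CHARS_PER_TOKEN  # 80_000 (opencode's PRUNE_MINIMUM)
PRUNE_PROTECT_CHARS = 40_000 * CHARS_PER_TOKEN  # 160_000 (opencode's PRUNE_PROTECT)

# prune_history only starts clearing once total tool_result content
# exceeds protect + minimum:
ENTRY_GATE_CHARS = PRUNE_PROTECT_CHARS + PRUNE_MINIMUM_CHARS
assert ENTRY_GATE_CHARS == 240_000  # ~60K tokens of accumulated tool output
```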
@@ -32,17 +44,22 @@ TRUNCATE_MAX_BYTES = 15 * 1024  # 15 KB
 TRUNCATE_KEEP_START = 150  # lines to keep from the start
 TRUNCATE_KEEP_END = 60  # lines to keep from the end
 TRUNCATE_MAX_LINE_LENGTH = 1500  # chars per individual line (prevents minified files)
-# Directory for saving full truncated outputs (like OpenCode pattern)
+# Directory for saving full truncated outputs (like opencode pattern)
 TRUNCATE_SAVE_DIR = ".aru/truncated"
 
-# Compaction: trigger when per-run input tokens exceed this fraction of model limit
-COMPACTION_THRESHOLD_RATIO = 0.70
-# Compaction: target post-compaction size as fraction of model context limit
-COMPACTION_TARGET_RATIO = 0.15
-# Compaction: also trigger after this many user turns (regardless of token count)
-COMPACTION_MAX_TURNS = 15
-# Compaction: reserve buffer for the compaction process itself (like OpenCode's 20K)
-COMPACTION_BUFFER_TOKENS = 20_000
+# Compaction: trigger when per-call input tokens approach real overflow.
+# Matches opencode's philosophy: only fire near the model's actual context
+# limit, not routinely. Routine context reduction is handled by prune_history
+# (lossy only on tool outputs), so compaction is reserved for genuine
+# overflow where the next API call would otherwise exceed the model's
+# input limit minus the reserved buffer.
+#
+# Opencode fires at `count >= limit.input - reserved` (overflow.ts:22) —
+# no extra ratio. We mirror that here. The sole safety margin is
+# COMPACTION_BUFFER_TOKENS, which is 30K (vs opencode's 20K) to give a bit
+# more headroom for output + tool definitions + estimation noise, since
+# we don't yet have a reactive overflow handler to catch the edge case.
+COMPACTION_BUFFER_TOKENS = 30_000
 # Default model context limits (input tokens)
 MODEL_CONTEXT_LIMITS: dict[str, int] = {
     # Anthropic
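The truncate layer itself is unchanged in this release, but the constants describe its head-plus-tail shape. A minimal sketch of that behavior, assuming the keep-start/keep-end semantics the names imply — the shipped `truncate_output` also enforces TRUNCATE_MAX_LINE_LENGTH and saves the full output under TRUNCATE_SAVE_DIR, which this sketch omits:

```python
def truncate_head_tail(text: str, keep_start: int = 150, keep_end: int = 60) -> str:
    """Illustrative head+tail cap; not the shipped truncate_output."""
    lines = text.splitlines()
    if len(lines) <= keep_start + keep_end:
        return text
    omitted = len(lines) - keep_start - keep_end
    return "\n".join(
        lines[:keep_start]
        + [f"[... {omitted} lines omitted ...]"]
        + lines[-keep_end:]
    )
```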
@@ -114,61 +131,97 @@ Structured list of file paths relevant to continuing the work (one per line)."""
 
 # ── Layer 1: Pruning ──────────────────────────────────────────────
 
+def _tool_result_content_len(msg: dict) -> int:
+    """Sum of content length of all non-cleared tool_result blocks in a message.
+
+    Mirrors opencode's prune walk, which accumulates only
+    `Token.estimate(part.state.output)` for `ToolPart`s (compaction.ts:119).
+    Text blocks and tool_use args are ignored — they are not the thing
+    being freed. This means pruning only "consumes budget" for real tool
+    output, so text-heavy conversations with few tool calls never trip
+    the prune path.
+
+    Already-cleared tool_results (content == CLEARED_TOOL_RESULT) are
+    skipped so a second pass doesn't double-count them.
+    """
+    from aru.history_blocks import is_tool_result
+    total = 0
+    for block in msg.get("content", []):
+        if is_tool_result(block):
+            content = block.get("content")
+            if content == CLEARED_TOOL_RESULT:
+                continue
+            if content is None:
+                continue
+            # tool_result content can be a string or a list of blocks —
+            # stringify to get a char count that roughly tracks tokens.
+            total += len(str(content))
+    return total
+
+
 def _get_prune_protect_chars(model_id: str = "default") -> int:
-    """Scale protection window based on model context size.
-
-    Returns the number of chars worth of recent history that should NEVER
-    be pruned. The remaining history beyond this window is eligible for
-    reversible pruning.
-
-    Sizing rationale: the target is a steady-state per-call context
-    window of ~20K tokens (what the user sees in the status bar), which
-    means protected history should be ~17K tokens = ~60K chars. This
-    floor is applied to every model; larger models get more protection
-    scaled at ~7% of their context, capped at 200K chars (~57K tokens)
-    to avoid protecting too much in 1M-context models where the extra
-    history hurts prompt caching.
+    """Chars of recent history that must NEVER be pruned.
+
+    Flat value across all models, mirroring opencode's fixed
+    `PRUNE_PROTECT = 40_000` tokens (compaction.ts:36). At ~4 chars/token
+    that's 160K chars of tool-result content kept intact in the recent
+    window. Older tool_result blocks beyond this budget are eligible for
+    the lossy clear pass in `prune_history`.
+
+    Why flat (not scaled by model): opencode validated this in production
+    on contexts from 128K to 1M — scaling by ratio adds complexity without
+    improving behavior, and protecting too much in 1M-context models can
+    actually hurt prompt caching by keeping rarely-touched tail content warm.
+
+    The `model_id` parameter is retained for signature compatibility with
+    older call sites; it has no effect on the returned value.
     """
-    limit = MODEL_CONTEXT_LIMITS.get(model_id, MODEL_CONTEXT_LIMITS["default"])
-    # ~4 chars per token, protect ~7% of context as the ratio ceiling
-    ratio_based = int(limit * 0.07 * 4)
-    # Floor of 60K chars (~17K tokens) keeps the user-visible context
-    # window around 20K tokens steady-state after system + cache + output
-    # overheads. Applies to any model where 7% would be smaller.
-    return max(60_000, min(ratio_based, 200_000))
+    del model_id  # unused — kept for signature compatibility
+    return 160_000
 
 
 def prune_history(
     history: list[dict], model_id: str = "default"
 ) -> list[dict]:
-    """Reduce history token footprint by dropping old content blocks.
-
-    Operates on block-shaped history (see `aru.history_blocks`). The
-    algorithm walks backward accumulating a char budget, and for any
-    message that falls outside the protection window:
-
-    - `text` blocks on assistant messages → replaced with `[cleared]`
-      text block.
-    - Large `text` blocks on user messages → truncated to first N chars.
-    - `tool_use` blocks → dropped **together with** their matching
-      `tool_result` block in the subsequent tool/user message. Dropping
-      them atomically is required: Anthropic's API rejects orphans with
-      `400: tool_use_id not found`.
-    - `tool_result` blocks → dropped only when their paired `tool_use`
-      is also dropped.
-
-    Protection layers:
+    """Reduce history token footprint by clearing old tool result content.
+
+    Operates on block-shaped history (see `aru.history_blocks`). Matches
+    opencode's approach: the ONLY lossy operation is replacing the
+    content of old `tool_result` blocks with a short placeholder. Text
+    blocks (user and assistant), `tool_use` blocks, and block structure
+    are always preserved so the original ask survives verbatim until
+    real overflow forces a full compaction.
+
+    **Budget semantics** (opencode parity): the walk backward accumulates
+    **only tool_result content chars**, not whole-message chars. Text
+    blocks and tool_use args don't consume the protection budget, because
+    they aren't what prune can free. Consequences:
+    - Text-heavy conversations with few tool calls never trigger prune.
+    - Prune only fires when there is >= `protect_chars + PRUNE_MINIMUM_CHARS`
+      of tool_result content total — mirroring opencode's
+      `total > PRUNE_PROTECT + PRUNE_MINIMUM`.
+    - The "is it worth pruning?" dry-run check from opencode
+      (`pruned > PRUNE_MINIMUM`) is implicit: we cannot enter the loop
+      without enough prunable content, and once in the loop any walk
+      past `protect_chars` is guaranteed to be freeing real bytes.
+
+    Protection layers (applied on top of the budget walk):
     1. Turn-based: last `PRUNE_PROTECT_TURNS` user turns always kept
-       intact, along with the assistant response right after each.
-    2. Char-based: recent content within the protection window is kept.
+       intact, plus the assistant response right after each. Index 0
+       (the original user ask) is also always protected.
+    2. Budget-based: tool_result content within the 160K protect window
+       (~40K tokens, matching opencode) is kept.
     3. Content-based: messages whose stringified content contains any
        `PRUNE_PROTECTED_MARKERS` or `PRUNE_PROTECTED_TOOLS` never prune.
+    4. Summary checkpoint: walking backward stops at any message marked
+       `summary: True` (a previous compaction's assistant output).
+       Everything before a summary was already consolidated and must
+       not be re-processed.
 
     Returns a new list (does not mutate the input).
    """
     from aru.history_blocks import (
-        coerce_history_item, item_char_len, item_text,
-        is_text, is_tool_use, is_tool_result, text_block,
+        coerce_history_item, item_text, is_tool_result,
     )
 
     if len(history) <= 2:
@@ -177,11 +230,15 @@ def prune_history(
     protect_chars = _get_prune_protect_chars(model_id)
     result = [coerce_history_item(m) for m in history]
 
-    total_chars = sum(item_char_len(msg) for msg in result)
-    if total_chars < protect_chars + PRUNE_MINIMUM_CHARS:
+    # Entry gate mirrors opencode: only proceed if total tool output
+    # exceeds protect + minimum. Text length is irrelevant.
+    total_tool_chars = sum(_tool_result_content_len(msg) for msg in result)
+    if total_tool_chars < protect_chars + PRUNE_MINIMUM_CHARS:
         return result
 
-    # Identify indices of last N user turns (always protected)
+    # Identify indices of last N user turns (always protected) and index 0
+    # (the original user ask, protected defensively so the anchor never
+    # evaporates even if future edits change the budget calculus).
     turn_protected: set[int] = set()
     user_turns_seen = 0
     for i in range(len(result) - 1, -1, -1):
@@ -191,108 +248,60 @@ def prune_history(
             turn_protected.add(i)
             if i + 1 < len(result):
                 turn_protected.add(i + 1)
-
-    # Build a map of tool_use_id → (assistant_idx, user_idx) so we can
-    # drop both halves of a pair atomically. The user_idx points to the
-    # next message(s) after the assistant carrying the matching tool_result.
-    tool_pair_loc: dict[str, tuple[int, int]] = {}
-    for i, msg in enumerate(result):
-        if msg["role"] != "assistant":
-            continue
-        for block in msg["content"]:
-            if not is_tool_use(block):
-                continue
-            tu_id = block.get("id")
-            if not tu_id:
-                continue
-            # Look forward for the matching tool_result (usually i+1)
-            for j in range(i + 1, min(i + 3, len(result))):
-                for rb in result[j]["content"]:
-                    if is_tool_result(rb) and rb.get("tool_use_id") == tu_id:
-                        tool_pair_loc[tu_id] = (i, j)
-                        break
-                if tu_id in tool_pair_loc:
-                    break
-
-    # Walk backward, protecting recent content
+    if result and result[0]["role"] == "user":
+        turn_protected.add(0)
+        if len(result) > 1:
+            turn_protected.add(1)
+
+    # Walk backward accumulating ONLY tool_result content chars into the
+    # protection budget. Messages with no tool_result (pure text, or just
+    # tool_use) consume zero budget and are skipped without pruning.
     protected = 0
-    dropped_tool_use_ids: set[str] = set()
 
     for i in range(len(result) - 1, -1, -1):
         msg = result[i]
-        msg_len = item_char_len(msg)
+
+        # Stop at the previous compaction summary marker — everything
+        # before it was already consolidated into the summary.
+        if msg.get("summary"):
+            break
+
+        tool_chars = _tool_result_content_len(msg)
+
+        # No prunable content here — nothing to clear, nothing to count.
+        if tool_chars == 0:
+            continue
 
         if i in turn_protected:
-            protected += msg_len
+            protected += tool_chars
             continue
 
-        if protected + msg_len <= protect_chars:
-            protected += msg_len
+        if protected + tool_chars <= protect_chars:
+            protected += tool_chars
             continue
 
         # Outside protection window — check content-based protection
         text_view = item_text(msg)
         if (any(marker in text_view for marker in PRUNE_PROTECTED_MARKERS)
                 or any(tool in text_view for tool in PRUNE_PROTECTED_TOOLS)):
-            protected += msg_len
+            protected += tool_chars
             continue
 
-        # Prune this message's blocks
+        # Clear any tool_result payloads in this message. Leave every
+        # other block (text, tool_use, thinking, etc.) untouched.
         new_blocks: list[dict] = []
         for block in msg["content"]:
-            if is_text(block):
-                if msg["role"] == "assistant":
-                    # Replace with a single placeholder (only if not already)
-                    if not new_blocks or new_blocks[-1].get("text") != PRUNED_PLACEHOLDER:
-                        new_blocks.append(text_block(PRUNED_PLACEHOLDER))
-                elif msg["role"] == "user":
-                    text = block.get("text", "")
-                    if len(text) > PRUNE_USER_MSG_THRESHOLD:
-                        truncated = (
-                            text[:PRUNE_USER_MSG_KEEP]
-                            + f"\n\n[... {len(text) - PRUNE_USER_MSG_KEEP:,} "
-                            "chars pruned to save context ...]"
-                        )
-                        new_blocks.append(text_block(truncated))
-                    else:
-                        new_blocks.append(block)
-                else:
-                    new_blocks.append(block)
-            elif is_tool_use(block):
-                # Drop the tool_use entirely and mark its id for paired removal
-                tu_id = block.get("id")
-                if tu_id:
-                    dropped_tool_use_ids.add(tu_id)
-                # Do NOT add to new_blocks
-            elif is_tool_result(block):
-                # Drop only if its paired tool_use is also being dropped
-                tu_id = block.get("tool_use_id")
-                if tu_id in dropped_tool_use_ids:
-                    pass  # drop
-                else:
-                    new_blocks.append(block)
+            if is_tool_result(block) and block.get("content") != CLEARED_TOOL_RESULT:
+                new_blocks.append({
+                    "type": "tool_result",
+                    "tool_use_id": block.get("tool_use_id"),
+                    "content": CLEARED_TOOL_RESULT,
+                })
             else:
                 new_blocks.append(block)
 
         result[i] = {"role": msg["role"], "content": new_blocks}
 
-    # Second pass: any tool_result blocks in user messages whose tool_use
-    # was dropped on a previous pass (covers case where user msg was
-    # inside protection but its paired assistant was outside).
-    if dropped_tool_use_ids:
-        for idx, msg in enumerate(result):
-            if not msg["content"]:
-                continue
-            filtered = [
-                b for b in msg["content"]
-                if not (is_tool_result(b) and b.get("tool_use_id") in dropped_tool_use_ids)
-            ]
-            if len(filtered) != len(msg["content"]):
-                result[idx] = {"role": msg["role"], "content": filtered}
-
-    # Drop any messages that ended up with zero blocks (valid but useless)
-    result = [m for m in result if m["content"]]
-
     return result
 
 
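Putting the new budget semantics together, here is a worked example of what the rewritten `prune_history` does to a history whose only bulk is tool output. This is a sketch: block shapes follow the tests further down in this diff, and exactly which messages get cleared depends on the 160K protect walk.

```python
from aru.context import prune_history, CLEARED_TOOL_RESULT

big = "line\n" * 60_000  # ~300K chars of tool output; clears the 240K entry gate

history = [
    {"role": "user", "content": "original ask"},
    {"role": "assistant", "content": [
        {"type": "tool_use", "id": "tu1", "name": "read_file", "input": {"path": "a.py"}},
    ]},
    {"role": "tool", "content": [
        {"type": "tool_result", "tool_use_id": "tu1", "content": big},
    ]},
    {"role": "user", "content": "next step"},
    {"role": "assistant", "content": "done"},
]

pruned = prune_history(history)

# Structure is preserved: same message count, tool_use/tool_result pair intact.
assert len(pruned) == len(history)
# The oversized tool_result payload (outside the 160K protect window)
# is replaced with the placeholder...
assert pruned[2]["content"][0]["content"] == CLEARED_TOOL_RESULT
# ...while user/assistant text survives verbatim.
assert "original ask" in str(pruned[0])
```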
@@ -443,46 +452,50 @@ def should_compact(
 ) -> bool:
     """Check if the conversation should be compacted.
 
-    Triggers on EITHER condition:
-    1. Token-based: tokens >= usable_context * threshold_ratio
-    2. Turn-based: user turns >= COMPACTION_MAX_TURNS (prevents slow token creep)
+    Fires when the per-call context window reaches real overflow:
+    `tokens >= limit - COMPACTION_BUFFER_TOKENS`.
+
+    Matches opencode's `isOverflow` in overflow.ts:22 — `count >= usable`,
+    no extra ratio. Routine context reduction is handled by `prune_history`
+    (lossy only on tool outputs), so compaction is reserved for genuine
+    overflow where the next API call would otherwise exceed the model's
+    input limit minus the reserved buffer.
 
     Accepts either an estimated token count (int) or the history list.
     """
     if isinstance(history_or_tokens, list):
-        history = history_or_tokens
-        tokens = estimate_history_tokens(history)
-        # Turn-based trigger: count user messages
-        user_turns = sum(1 for m in history if m["role"] == "user")
-        if user_turns >= COMPACTION_MAX_TURNS:
-            return True
+        tokens = estimate_history_tokens(history_or_tokens)
     else:
         tokens = history_or_tokens
 
     limit = MODEL_CONTEXT_LIMITS.get(model_id, MODEL_CONTEXT_LIMITS["default"])
     usable = limit - COMPACTION_BUFFER_TOKENS
-    threshold = int(usable * COMPACTION_THRESHOLD_RATIO)
-    return tokens >= threshold
+    return tokens >= usable
 
 
 def would_prune(history: list[dict], model_id: str = "default") -> bool:
     """Check if prune_history would discard content from this history.
 
-    Uses the exact same criteria as prune_history: total chars exceed
-    the protection window + minimum prunable threshold.
+    Uses the same entry gate as `prune_history`: total tool_result
+    content must exceed the protection window + minimum prunable
+    threshold. Text and tool_use args are not counted — only real
+    prunable output. Mirrors opencode's logic.
     """
-    from aru.history_blocks import item_char_len
     if len(history) <= 2:
         return False
-    total_chars = sum(item_char_len(msg) for msg in history)
+    total_tool_chars = sum(_tool_result_content_len(msg) for msg in history)
     protect_chars = _get_prune_protect_chars(model_id)
-    return total_chars >= protect_chars + PRUNE_MINIMUM_CHARS
+    return total_tool_chars >= protect_chars + PRUNE_MINIMUM_CHARS
 
 
 def _split_history(history: list[dict], model_id: str = "default") -> tuple[list[dict], list[dict]]:
     """Split history into old (to summarize) and recent (to keep intact).
 
-    Uses the same protection window as pruning.
+    Uses the same protection window as pruning. Defensively, the first
+    user turn (index 0) is always pulled into `recent` so the original
+    ask survives intact even through a full compaction — the compactor
+    extracts it into the `## Goal` section of the summary, but keeping
+    it in recent too means the agent can quote it verbatim afterward.
     """
     from aru.history_blocks import item_char_len
     protect_chars = _get_prune_protect_chars(model_id)
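Before the rest of `_split_history`, the new overflow trigger is plain subtraction and easy to check by hand. A couple of worked values, assuming MODEL_CONTEXT_LIMITS entries of exactly 200_000 and 128_000 tokens (which is what the updated tests below imply):

```python
from aru.context import should_compact

# 200K-context model: usable = 200_000 - 30_000 = 170_000 tokens
assert should_compact(169_999, model_id="claude-sonnet-4-5-20250929") is False
assert should_compact(170_000, model_id="claude-sonnet-4-5-20250929") is True

# 128K-context model: usable = 128_000 - 30_000 = 98_000 tokens
assert should_compact(97_999, model_id="qwen3.6-plus") is False
assert should_compact(98_000, model_id="qwen3.6-plus") is True
```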
@@ -495,6 +508,12 @@ def _split_history(history: list[dict], model_id: str = "default") -> tuple[list
             split_idx = i
         else:
             break
+
+    # Defensive: force the first user turn into `recent` even if the
+    # protect budget would have sent it to `old`. The original ask is
+    # the session anchor and must stay literal.
+    if split_idx > 0 and history and history[0].get("role") == "user":
+        return history[1:split_idx], [history[0]] + history[split_idx:]
     return history[:split_idx], history[split_idx:]
 
 
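A small illustration of that defensive split, using a hypothetical four-message history (`_split_history` is private, so this only shows the shape of the returned tuple):

```python
# Suppose the protect budget puts split_idx at 3, so indices 0-2 would
# normally be summarized. The defensive branch pulls history[0] out of
# `old` and prepends it to `recent`:
history = [
    {"role": "user", "content": "original ask"},      # index 0 → forced into recent
    {"role": "assistant", "content": "old answer"},   # index 1 → old (summarized)
    {"role": "user", "content": "follow-up"},         # index 2 → old (summarized)
    {"role": "user", "content": "latest question"},   # index 3 → recent (kept intact)
]
split_idx = 3
old, recent = history[1:split_idx], [history[0]] + history[split_idx:]
assert [m["content"] for m in recent] == ["original ask", "latest question"]
```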
@@ -563,12 +582,13 @@ def apply_compaction(
     The summary is emitted as a synthetic user→assistant exchange so that
     role alternation stays natural:
         [user: "Please summarize..."]
-        [assistant: "<summary>"]
+        [assistant: "<summary>", summary=True]
         + recent messages as-is
 
-    This shape avoids the `[user, user, ...]` sequence that previously
-    biased the model toward describing actions rather than emitting
-    structured tool calls.
+    The assistant summary is marked with `summary: True` as a checkpoint.
+    `prune_history` walks backward and stops at this marker, so content
+    already consolidated into the summary is never re-processed. Mirrors
+    opencode's `msg.info.summary` flag (see message-v2.ts:914).
     """
     from aru.history_blocks import text_block, coerce_history_item
     _, recent = _split_history(history, model_id)
@@ -581,6 +601,7 @@ def apply_compaction(
         {
             "role": "assistant",
             "content": [text_block(f"Prior conversation summary:\n\n{summary}")],
+            "summary": True,
         },
     ]
     compacted.extend(coerce_history_item(m) for m in recent)
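After `apply_compaction`, the history therefore starts with this shape (sketched with placeholder text; the real user line comes from `build_compaction_prompt` and `<summary>` is whatever the compaction agent produced):

```python
compacted = [
    {"role": "user", "content": [{"type": "text", "text": "Please summarize..."}]},
    {
        "role": "assistant",
        "content": [{"type": "text", "text": "Prior conversation summary:\n\n<summary>"}],
        "summary": True,  # checkpoint: prune_history stops its backward walk here
    },
    # ...followed by the `recent` messages, coerced to block form.
]
```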
{aru_code-0.18.0 → aru_code-0.19.0}/aru/session.py
@@ -402,9 +402,12 @@ class Session:
         self.history.append({"role": role, "content": blocks})
         # Hard cap as safety net — structured pruning/compaction in
         # aru/context.py handles the normal case; this only fires if
-        # something bypasses them.
-        if len(self.history) > 60:
-            self.history = self.history[-60:]
+        # something bypasses them. Set high enough that long sessions
+        # (which now accumulate more messages because prune is
+        # non-destructive for text and compact rarely fires) don't hit
+        # this destructive path routinely.
+        if len(self.history) > 300:
+            self.history = self.history[-300:]
 
     def add_structured_message(self, role: str, blocks: list[dict]):
         """Explicitly add a message with pre-built content blocks.
{aru_code-0.18.0 → aru_code-0.19.0/aru_code.egg-info}/PKG-INFO
@@ -1,6 +1,6 @@
 Metadata-Version: 2.4
 Name: aru-code
-Version: 0.18.0
+Version: 0.19.0
 Summary: A Claude Code clone built with Agno agents
 Author-email: Estevao <estevaofon@gmail.com>
 License-Expression: MIT
{aru_code-0.18.0 → aru_code-0.19.0}/pyproject.toml
@@ -4,7 +4,7 @@ build-backend = "setuptools.build_meta"
 
 [project]
 name = "aru-code"
-version = "0.18.0"
+version = "0.19.0"
 description = "A Claude Code clone built with Agno agents"
 readme = "README.md"
 license = "MIT"
{aru_code-0.18.0 → aru_code-0.19.0}/tests/test_cli.py
@@ -314,11 +314,12 @@ class TestSession:
 
     def test_add_message_caps_history(self):
         session = Session()
-        for i in range(75):
+        for i in range(350):
             session.add_message("user", f"msg {i}")
-        # History is bounded by a hard cap (structured compaction in
-        # aru.context handles the normal-path token management).
-        assert len(session.history) <= 60
+        # History is bounded by a hard safety cap (structured pruning/
+        # compaction in aru.context handles the normal-path token
+        # management; this cap only fires on pathological growth).
+        assert len(session.history) <= 300
 
     def test_set_plan(self):
         session = Session()
{aru_code-0.18.0 → aru_code-0.19.0}/tests/test_confabulation_regression.py
@@ -176,17 +176,26 @@ class TestPrunePreservesPairs:
     """Fix 6: pruning must never orphan tool_use / tool_result blocks."""
 
     def test_prune_drops_tool_pair_atomically(self):
-        """An old assistant with tool_use must have its tool_result dropped too."""
-        # Build a history large enough to force pruning
-        filler = "x" * 50_000
+        """An old tool_result whose content gets cleared must still keep
+        its block (matching tool_use_id), so the tool_use/tool_result
+        pair is never orphaned.
+
+        Opencode-aligned budget: prune only counts tool_result content
+        chars, so the history needs multiple large tool_result payloads
+        to clear the 240K entry gate.
+        """
+        big_output = "old file line\n" * 8_000  # ~100K chars per result
         history = [
             {"role": "user", "content": "request 1"},
-            _assistant_tool_turn("old_tu", "read_file", {"path": "old.py"}, filler),
-            _tool_result_turn("old_tu", "old contents " * 1000),
+            _assistant_tool_turn("old_tu", "read_file", {"path": "old.py"}),
+            _tool_result_turn("old_tu", big_output),
             {"role": "user", "content": "request 2"},
-            {"role": "assistant", "content": "response 2"},
-            {"role": "user", "content": "recent request " * 5000},
-            {"role": "assistant", "content": "recent response"},
+            _assistant_tool_turn("mid_tu", "read_file", {"path": "mid.py"}),
+            _tool_result_turn("mid_tu", big_output),
+            {"role": "user", "content": "request 3"},
+            _assistant_tool_turn("recent_tu", "read_file", {"path": "new.py"}),
+            _tool_result_turn("recent_tu", big_output),
+            {"role": "user", "content": "summarize"},
         ]
 
         pruned = prune_history(history, model_id="default")
@@ -208,10 +217,20 @@ class TestPrunePreservesPairs:
         )
 
     def test_prune_keeps_recent_tool_pair(self):
-        """A tool_use/tool_result pair inside the protection window must be kept."""
+        """A tool_use/tool_result pair inside the protection window must be
+        kept with its content intact, even when older tool_results get cleared.
+
+        Builds a history with two big old tool_results (enough to trigger
+        prune) and one small recent pair that must survive verbatim.
+        """
+        big_old = "old file content\n" * 10_000  # ~170K chars each
         history = [
-            {"role": "user", "content": "old stuff " * 50_000},
-            {"role": "assistant", "content": "old response " * 10_000},
+            {"role": "user", "content": "req 1"},
+            _assistant_tool_turn("tu_old1", "read_file", {"path": "a.py"}, "reading"),
+            _tool_result_turn("tu_old1", big_old),
+            {"role": "user", "content": "req 2"},
+            _assistant_tool_turn("tu_old2", "read_file", {"path": "b.py"}, "reading"),
+            _tool_result_turn("tu_old2", big_old),
             {"role": "user", "content": "read foo"},
             _assistant_tool_turn("tu_recent", "read_file", {"path": "foo.py"}, "reading"),
             _tool_result_turn("tu_recent", "def foo(): pass"),
@@ -227,6 +246,10 @@ class TestPrunePreservesPairs:
 
         assert len(tool_uses) == 1, "Recent tool_use was incorrectly pruned"
         assert len(tool_results) == 1, "Recent tool_result was incorrectly pruned"
+        # Recent content must be intact (not cleared)
+        assert tool_results[0].get("content") == "def foo(): pass", (
+            "Recent tool_result content was cleared — should be inside protection window"
+        )
 
     def test_prune_with_no_pairs_still_works(self):
         """Pure text history should prune without errors."""
{aru_code-0.18.0 → aru_code-0.19.0}/tests/test_context.py
@@ -10,8 +10,16 @@ from aru.context import (
     apply_compaction,
     build_compaction_prompt,
     format_context_block,
+    CLEARED_TOOL_RESULT,
+)
+from aru.history_blocks import (
+    coerce_history,
+    item_text,
+    tool_use_block,
+    tool_result_block,
+    text_block,
+    is_tool_result,
 )
-from aru.history_blocks import coerce_history, item_text
 
 
 class TestPruneHistory:
@@ -27,37 +35,103 @@ class TestPruneHistory:
         # Input is auto-coerced to block form on return
         assert result == coerce_history(messages)
 
-    def test_prunes_old_assistant_messages(self):
-        """Should prune old assistant messages when over threshold."""
-        old_content = "x" * 30000
-        recent_content = "y" * 10000
+    def test_prunes_old_tool_results_when_over_threshold(self):
+        """Should clear old tool_result content when total tool output
+        exceeds protect + minimum (opencode-aligned budget semantics).
+
+        The budget walks backward over tool_result content chars only.
+        Text and tool_use args don't count, so this test uses large
+        tool_result payloads to actually trip the prune path.
+        """
+        # Three rounds of read_file-sized outputs. Total ~300K chars
+        # of tool_result content — clears the 240K entry gate, and
+        # the 160K protect budget will cover only the most recent one.
+        big_output = "line of code\n" * 8_000  # ~100K chars
         messages = [
-            {"role": "user", "content": "First request"},
-            {"role": "assistant", "content": old_content},
-            {"role": "user", "content": "Second request"},
-            {"role": "assistant", "content": recent_content},
+            {"role": "user", "content": "round 1"},
+            {
+                "role": "assistant",
+                "content": [
+                    text_block("reading"),
+                    tool_use_block("tu_old", "read_file", {"path": "a.py"}),
+                ],
+            },
+            {"role": "tool", "content": [tool_result_block("tu_old", big_output)]},
+            {"role": "user", "content": "round 2"},
+            {
+                "role": "assistant",
+                "content": [
+                    text_block("reading"),
+                    tool_use_block("tu_mid", "read_file", {"path": "b.py"}),
+                ],
+            },
+            {"role": "tool", "content": [tool_result_block("tu_mid", big_output)]},
+            {"role": "user", "content": "round 3"},
+            {
+                "role": "assistant",
+                "content": [
+                    text_block("reading"),
+                    tool_use_block("tu_recent", "read_file", {"path": "c.py"}),
+                ],
+            },
+            {"role": "tool", "content": [tool_result_block("tu_recent", big_output)]},
+            {"role": "user", "content": "what did you find?"},
         ]
         result = prune_history(messages)
-        # Should have placeholder for pruned content
-        assert len(result) <= len(messages)
-        # Recent messages should be preserved
-        assert any("Second request" in str(m) for m in result)
-
-    def test_preserves_user_messages(self):
-        """Should always preserve user messages."""
-        old_user = {"role": "user", "content": "Old user message"}
-        old_assistant = {"role": "assistant", "content": "Old assistant " * 10000}
-        recent = {"role": "user", "content": "Recent request"}
-
-        messages = [old_user, old_assistant, recent]
-        result = prune_history(messages)
 
-        # User messages should be preserved (as placeholders or original)
-        recent_preserved = any(
-            m.get("role") == "user" and "Recent" in item_text(m)
-            for m in result
+        # Same number of messages (prune never drops structure)
+        assert len(result) == len(messages)
+
+        # Collect tool_result blocks by tool_use_id
+        by_id: dict[str, dict] = {}
+        for msg in result:
+            for block in msg.get("content", []):
+                if is_tool_result(block):
+                    by_id[block.get("tool_use_id")] = block
+
+        # All three pairs preserved at the block level
+        assert set(by_id.keys()) == {"tu_old", "tu_mid", "tu_recent"}
+
+        # Recent tool_result kept verbatim
+        assert by_id["tu_recent"]["content"] == big_output
+
+        # The older tool_result must have been cleared — at least one
+        # of tu_old/tu_mid should now hold the placeholder, since only
+        # 160K chars worth fits inside the protect window.
+        cleared_count = sum(
+            1 for tu_id in ("tu_old", "tu_mid")
+            if by_id[tu_id]["content"] == CLEARED_TOOL_RESULT
+        )
+        assert cleared_count >= 1, (
+            "Expected at least one old tool_result to be cleared once "
+            "total output exceeded protect + minimum"
         )
-        assert recent_preserved
+
+    def test_text_heavy_history_is_not_pruned(self):
+        """Conversations dominated by text (not tool output) must NOT
+        trigger prune even if total chars are huge.
+
+        This is the opencode-aligned semantics: text blocks don't enter
+        the prune budget. A 500K-char text history with no tool_results
+        is a no-op for prune_history.
+        """
+        messages = [
+            {"role": "user", "content": "long planning discussion " * 10_000},
+            {"role": "assistant", "content": "detailed reasoning " * 10_000},
+            {"role": "user", "content": "what's next?"},
+            {"role": "assistant", "content": "here's the plan " * 10_000},
+        ]
+        result = prune_history(messages)
+
+        # No tool_results exist anywhere in result
+        tool_results = [
+            b for m in result for b in m.get("content", []) if is_tool_result(b)
+        ]
+        assert tool_results == []
+        # Length preserved
+        assert len(result) == len(messages)
+        # No message content was altered to CLEARED_TOOL_RESULT
+        assert all(CLEARED_TOOL_RESULT not in item_text(m) for m in result)
 
     def test_empty_history(self):
         """Should handle empty history."""
@@ -108,20 +182,21 @@ class TestShouldCompact:
     """Tests for should_compact function."""
 
     def test_no_compaction_under_threshold(self):
-        """Should not compact when under 50% of context limit."""
-        # Default 200K tokens * 0.5 = 100K threshold; 5 tokens is well under
+        """Should not compact when well under the overflow threshold."""
+        # claude-sonnet-4-5 has 200K context; usable = 170K (buffer 30K).
+        # 5 tokens is well under.
         result = should_compact(5, model_id="claude-sonnet-4-5-20250929")
         assert result is False
 
     def test_compaction_over_threshold(self):
-        """Should compact when over threshold."""
-        # 300K tokens is over 50% of a 200K-token context window
+        """Should compact when over the real-overflow threshold."""
+        # 300K tokens is well over the 170K threshold of a 200K-context model.
         result = should_compact(300000, model_id="claude-sonnet-4-5-20250929")
         assert result is True
 
     def test_custom_context_limit(self):
         """Should respect custom context limit."""
-        # gpt-4o has 128K context, 50% = 64K; 50K is under threshold
+        # gpt-4o has 128K context; usable = 98K. 50K is under.
         result = should_compact(50000, model_id="gpt-4o")
         assert isinstance(result, bool)
@@ -145,7 +220,8 @@ class TestCompactionTriggerUsesPerCallMetric:
 
     def test_small_per_call_window_does_not_fire(self):
         """Reproduces the exact bug report: per-call ~20K on qwen3.6-plus
-        (128K limit, ~75.6K threshold) must NOT trigger compaction."""
+        (128K limit, ~98K threshold with 30K buffer) must NOT
+        trigger compaction."""
         # Values taken from the real session where compaction fired incorrectly:
         # "context: 20,184 (in: 16,652 / out: 696 / cache_read: 2,836)"
         last_input = 16_652
@@ -158,7 +234,7 @@ class TestCompactionTriggerUsesPerCallMetric:
         )
         assert last_call_window == 20_184, "window computation changed"
 
-        # 20K is ~3.7× below the 75.6K threshold for a 128K-context model
+        # 20K is far below the ~98K threshold for a 128K-context model
         assert should_compact(last_call_window, model_id="qwen3.6-plus") is False, (
             "Compaction fired on a small per-call window. The runner is "
             "probably passing cumulative tokens (run_output.metrics.input_tokens) "
@@ -169,7 +245,9 @@ class TestCompactionTriggerUsesPerCallMetric:
     def test_large_per_call_window_still_fires(self):
         """Positive case: compaction must still fire when the last-call
         window actually approaches the model's context limit."""
-        last_input = 80_000
+        # qwen3.6-plus: 128K limit, usable = 98K (buffer 30K).
+        # 105K input + 2K output + 0 cache = 107K window → must fire.
+        last_input = 105_000
         last_output = 2_000
         last_cache_read = 0
         last_cache_write = 0
@@ -177,17 +255,17 @@ class TestCompactionTriggerUsesPerCallMetric:
         last_call_window = (
             last_input + last_output + last_cache_read + last_cache_write
         )
-        assert last_call_window == 82_000
+        assert last_call_window == 107_000
 
-        # 82K > 75.6K threshold → must fire
+        # 107K > 98K threshold → must fire
         assert should_compact(last_call_window, model_id="qwen3.6-plus") is True
 
     def test_cumulative_metric_is_the_wrong_signal(self):
         """Illustrates WHY the old approach was wrong: a cumulative sum of
-        5 API calls at 18K each is 90K (above threshold), but the actual
+        6 API calls at 18K each is 108K (above threshold), but the actual
         per-call window each time is only 18K (well below)."""
         per_call_window = 18_000
-        num_api_calls_in_turn = 5
+        num_api_calls_in_turn = 6
         cumulative_if_summed = per_call_window * num_api_calls_in_turn
 
         # Old (wrong) behavior: cumulative triggers compaction
@@ -196,8 +274,8 @@ class TestCompactionTriggerUsesPerCallMetric:
         # New (correct) behavior: per-call does NOT trigger compaction
         assert should_compact(per_call_window, model_id="qwen3.6-plus") is False
 
-        # The difference is the entire bug
-        assert cumulative_if_summed > 75_600 > per_call_window
+        # The difference is the entire bug (threshold is 98K for qwen3.6-plus)
+        assert cumulative_if_summed > 98_000 > per_call_window
 
     def test_runner_source_uses_per_call_metric(self):
         """Static check against silent regression.
aru_code-0.18.0/aru/__init__.py (deleted)
@@ -1 +0,0 @@
-__version__ = "0.18.0"