aru-code 0.18.0__tar.gz → 0.19.1__tar.gz
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- {aru_code-0.18.0/aru_code.egg-info → aru_code-0.19.1}/PKG-INFO +1 -1
- aru_code-0.19.1/aru/__init__.py +1 -0
- {aru_code-0.18.0 → aru_code-0.19.1}/aru/context.py +235 -176
- {aru_code-0.18.0 → aru_code-0.19.1}/aru/session.py +6 -3
- {aru_code-0.18.0 → aru_code-0.19.1/aru_code.egg-info}/PKG-INFO +1 -1
- {aru_code-0.18.0 → aru_code-0.19.1}/pyproject.toml +1 -1
- {aru_code-0.18.0 → aru_code-0.19.1}/tests/test_cli.py +5 -4
- {aru_code-0.18.0 → aru_code-0.19.1}/tests/test_confabulation_regression.py +34 -11
- {aru_code-0.18.0 → aru_code-0.19.1}/tests/test_context.py +119 -41
- aru_code-0.18.0/aru/__init__.py +0 -1
- {aru_code-0.18.0 → aru_code-0.19.1}/LICENSE +0 -0
- {aru_code-0.18.0 → aru_code-0.19.1}/README.md +0 -0
- {aru_code-0.18.0 → aru_code-0.19.1}/aru/agent_factory.py +0 -0
- {aru_code-0.18.0 → aru_code-0.19.1}/aru/agents/__init__.py +0 -0
- {aru_code-0.18.0 → aru_code-0.19.1}/aru/agents/base.py +0 -0
- {aru_code-0.18.0 → aru_code-0.19.1}/aru/agents/executor.py +0 -0
- {aru_code-0.18.0 → aru_code-0.19.1}/aru/agents/planner.py +0 -0
- {aru_code-0.18.0 → aru_code-0.19.1}/aru/cache_patch.py +0 -0
- {aru_code-0.18.0 → aru_code-0.19.1}/aru/cli.py +0 -0
- {aru_code-0.18.0 → aru_code-0.19.1}/aru/commands.py +0 -0
- {aru_code-0.18.0 → aru_code-0.19.1}/aru/completers.py +0 -0
- {aru_code-0.18.0 → aru_code-0.19.1}/aru/config.py +0 -0
- {aru_code-0.18.0 → aru_code-0.19.1}/aru/display.py +0 -0
- {aru_code-0.18.0 → aru_code-0.19.1}/aru/history_blocks.py +0 -0
- {aru_code-0.18.0 → aru_code-0.19.1}/aru/permissions.py +0 -0
- {aru_code-0.18.0 → aru_code-0.19.1}/aru/providers.py +0 -0
- {aru_code-0.18.0 → aru_code-0.19.1}/aru/runner.py +0 -0
- {aru_code-0.18.0 → aru_code-0.19.1}/aru/runtime.py +0 -0
- {aru_code-0.18.0 → aru_code-0.19.1}/aru/tools/__init__.py +0 -0
- {aru_code-0.18.0 → aru_code-0.19.1}/aru/tools/ast_tools.py +0 -0
- {aru_code-0.18.0 → aru_code-0.19.1}/aru/tools/codebase.py +0 -0
- {aru_code-0.18.0 → aru_code-0.19.1}/aru/tools/gitignore.py +0 -0
- {aru_code-0.18.0 → aru_code-0.19.1}/aru/tools/mcp_client.py +0 -0
- {aru_code-0.18.0 → aru_code-0.19.1}/aru/tools/ranker.py +0 -0
- {aru_code-0.18.0 → aru_code-0.19.1}/aru/tools/tasklist.py +0 -0
- {aru_code-0.18.0 → aru_code-0.19.1}/aru_code.egg-info/SOURCES.txt +0 -0
- {aru_code-0.18.0 → aru_code-0.19.1}/aru_code.egg-info/dependency_links.txt +0 -0
- {aru_code-0.18.0 → aru_code-0.19.1}/aru_code.egg-info/entry_points.txt +0 -0
- {aru_code-0.18.0 → aru_code-0.19.1}/aru_code.egg-info/requires.txt +0 -0
- {aru_code-0.18.0 → aru_code-0.19.1}/aru_code.egg-info/top_level.txt +0 -0
- {aru_code-0.18.0 → aru_code-0.19.1}/setup.cfg +0 -0
- {aru_code-0.18.0 → aru_code-0.19.1}/tests/test_agents_base.py +0 -0
- {aru_code-0.18.0 → aru_code-0.19.1}/tests/test_cli_advanced.py +0 -0
- {aru_code-0.18.0 → aru_code-0.19.1}/tests/test_cli_base.py +0 -0
- {aru_code-0.18.0 → aru_code-0.19.1}/tests/test_cli_completers.py +0 -0
- {aru_code-0.18.0 → aru_code-0.19.1}/tests/test_cli_new.py +0 -0
- {aru_code-0.18.0 → aru_code-0.19.1}/tests/test_cli_run_cli.py +0 -0
- {aru_code-0.18.0 → aru_code-0.19.1}/tests/test_cli_session.py +0 -0
- {aru_code-0.18.0 → aru_code-0.19.1}/tests/test_cli_shell.py +0 -0
- {aru_code-0.18.0 → aru_code-0.19.1}/tests/test_codebase.py +0 -0
- {aru_code-0.18.0 → aru_code-0.19.1}/tests/test_config.py +0 -0
- {aru_code-0.18.0 → aru_code-0.19.1}/tests/test_executor.py +0 -0
- {aru_code-0.18.0 → aru_code-0.19.1}/tests/test_gitignore.py +0 -0
- {aru_code-0.18.0 → aru_code-0.19.1}/tests/test_main.py +0 -0
- {aru_code-0.18.0 → aru_code-0.19.1}/tests/test_mcp_client.py +0 -0
- {aru_code-0.18.0 → aru_code-0.19.1}/tests/test_permissions.py +0 -0
- {aru_code-0.18.0 → aru_code-0.19.1}/tests/test_planner.py +0 -0
- {aru_code-0.18.0 → aru_code-0.19.1}/tests/test_providers.py +0 -0
- {aru_code-0.18.0 → aru_code-0.19.1}/tests/test_ranker.py +0 -0
|
@@ -0,0 +1 @@
|
|
|
1
|
+
__version__ = "0.19.1"
|
|
@@ -1,28 +1,40 @@
|
|
|
1
1
|
"""Context management for token optimization.
|
|
2
2
|
|
|
3
|
-
|
|
4
|
-
|
|
5
|
-
|
|
6
|
-
|
|
3
|
+
Mirrors opencode's two-layer approach:
|
|
4
|
+
|
|
5
|
+
1. **Prune** (routine, lossy only on tool outputs): walks old tool_result
|
|
6
|
+
blocks and replaces their content with a placeholder. User/assistant
|
|
7
|
+
text is NEVER touched — it survives verbatim until real overflow.
|
|
8
|
+
This is the steady-state memory mechanism. Matches cache_patch.py's
|
|
9
|
+
strategy at the Agno message layer.
|
|
10
|
+
|
|
11
|
+
2. **Compact** (rare, lossy full summary): triggers only when the per-call
|
|
12
|
+
context window actually approaches the model's limit. Runs a
|
|
13
|
+
compaction agent that produces a structured summary (Goal / Instructions
|
|
14
|
+
/ Discoveries / Accomplished / File contents / Relevant files) and
|
|
15
|
+
marks the resulting assistant message with `summary: True` so
|
|
16
|
+
subsequent prunes stop at that checkpoint.
|
|
17
|
+
|
|
18
|
+
There is also a `truncate_output` layer used by individual tools to cap
|
|
19
|
+
their own output size before it ever reaches history.
|
|
7
20
|
"""
|
|
8
21
|
|
|
9
22
|
from __future__ import annotations
|
|
10
23
|
|
|
11
24
|
# ── Constants ──────────────────────────────────────────────────────
|
|
12
25
|
|
|
13
|
-
# Pruning: minimum chars that must be freeable to justify a prune pass
|
|
14
|
-
|
|
15
|
-
|
|
16
|
-
|
|
17
|
-
#
|
|
18
|
-
|
|
19
|
-
|
|
20
|
-
PRUNE_USER_MSG_KEEP = 500 # ~140 tokens
|
|
26
|
+
# Pruning: minimum chars that must be freeable to justify a prune pass.
|
|
27
|
+
# Matches opencode's PRUNE_MINIMUM = 20_000 tokens (~80K chars @ 4 chars/token).
|
|
28
|
+
PRUNE_MINIMUM_CHARS = 80_000 # ~20K tokens
|
|
29
|
+
# Placeholder that replaces cleared tool_result content. Matches
|
|
30
|
+
# cache_patch.py's _PRUNED_PLACEHOLDER so both layers produce identical
|
|
31
|
+
# text when a tool output is cleared.
|
|
32
|
+
CLEARED_TOOL_RESULT = "[Old tool result cleared]"
|
|
21
33
|
# Minimum number of recent user turns always protected (regardless of char budget)
|
|
22
34
|
PRUNE_PROTECT_TURNS = 2
|
|
23
35
|
# Tool result markers that should never be pruned (critical context)
|
|
24
36
|
PRUNE_PROTECTED_MARKERS = {"[SubAgent-", "delegate_task"}
|
|
25
|
-
# Tool names whose outputs should never be pruned (like
|
|
37
|
+
# Tool names whose outputs should never be pruned (like opencode's PRUNE_PROTECTED_TOOLS)
|
|
26
38
|
# These are checked as substrings in message content (tool results include the tool name)
|
|
27
39
|
PRUNE_PROTECTED_TOOLS = {"delegate_task"}
|
|
28
40
|
|
|
@@ -32,17 +44,39 @@ TRUNCATE_MAX_BYTES = 15 * 1024 # 15 KB
|
|
|
32
44
|
TRUNCATE_KEEP_START = 150 # lines to keep from the start
|
|
33
45
|
TRUNCATE_KEEP_END = 60 # lines to keep from the end
|
|
34
46
|
TRUNCATE_MAX_LINE_LENGTH = 1500 # chars per individual line (prevents minified files)
|
|
35
|
-
# Directory for saving full truncated outputs (like
|
|
47
|
+
# Directory for saving full truncated outputs (like opencode pattern)
|
|
36
48
|
TRUNCATE_SAVE_DIR = ".aru/truncated"
|
|
37
49
|
|
|
38
|
-
# Compaction:
|
|
39
|
-
|
|
40
|
-
#
|
|
41
|
-
|
|
42
|
-
#
|
|
43
|
-
|
|
44
|
-
#
|
|
45
|
-
|
|
50
|
+
# Compaction: chars of recent conversation preserved verbatim post-compact.
|
|
51
|
+
#
|
|
52
|
+
# Separate from the prune protect window (160K) because they measure
|
|
53
|
+
# different things:
|
|
54
|
+
# - Prune protect: "how much tool_result content stays intact"
|
|
55
|
+
# - Compact recent: "how much full-message history stays verbatim after
|
|
56
|
+
# the summary replaces the older portion"
|
|
57
|
+
#
|
|
58
|
+
# Set to 80K chars (~20K tokens) — half the prune window. Rationale:
|
|
59
|
+
# with the compactor now running on the main model (not a small one),
|
|
60
|
+
# summaries are faithful enough that we don't need 40K of recent overlap
|
|
61
|
+
# as a safety net. 20K still covers 3-6 recent turns verbatim, which
|
|
62
|
+
# mirrors the "last few exchanges" a human would re-read to resume work.
|
|
63
|
+
# Going to zero would match opencode exactly but requires the reactive
|
|
64
|
+
# overflow replay flow we haven't implemented yet.
|
|
65
|
+
COMPACT_RECENT_CHARS = 80_000
|
|
66
|
+
|
|
67
|
+
# Compaction: trigger when per-call input tokens approach real overflow.
|
|
68
|
+
# Matches opencode's philosophy: only fire near the model's actual context
|
|
69
|
+
# limit, not routinely. Routine context reduction is handled by prune_history
|
|
70
|
+
# (lossy only on tool outputs), so compaction is reserved for genuine
|
|
71
|
+
# overflow — where the next API call would otherwise exceed the model's
|
|
72
|
+
# input limit minus the reserved buffer.
|
|
73
|
+
#
|
|
74
|
+
# Opencode fires at `count >= limit.input - reserved` (overflow.ts:22) —
|
|
75
|
+
# no extra ratio. We mirror that here. The sole safety margin is
|
|
76
|
+
# COMPACTION_BUFFER_TOKENS, which is 30K (vs opencode's 20K) to give a bit
|
|
77
|
+
# more headroom for output + tool definitions + estimation noise, since
|
|
78
|
+
# we don't yet have a reactive overflow handler to catch the edge case.
|
|
79
|
+
COMPACTION_BUFFER_TOKENS = 30_000
|
|
46
80
|
# Default model context limits (input tokens)
|
|
47
81
|
MODEL_CONTEXT_LIMITS: dict[str, int] = {
|
|
48
82
|
# Anthropic
|
|
@@ -114,61 +148,97 @@ Structured list of file paths relevant to continuing the work (one per line)."""
|
|
|
114
148
|
|
|
115
149
|
# ── Layer 1: Pruning ──────────────────────────────────────────────
|
|
116
150
|
|
|
151
|
+
def _tool_result_content_len(msg: dict) -> int:
|
|
152
|
+
"""Sum of content length of all non-cleared tool_result blocks in a message.
|
|
153
|
+
|
|
154
|
+
Mirrors opencode's prune walk, which accumulates only
|
|
155
|
+
`Token.estimate(part.state.output)` for `ToolPart`s (compaction.ts:119).
|
|
156
|
+
Text blocks and tool_use args are ignored — they are not the thing
|
|
157
|
+
being freed. This means pruning only "consumes budget" for real tool
|
|
158
|
+
output, so text-heavy conversations with few tool calls never trip
|
|
159
|
+
the prune path.
|
|
160
|
+
|
|
161
|
+
Already-cleared tool_results (content == CLEARED_TOOL_RESULT) are
|
|
162
|
+
skipped so a second pass doesn't double-count them.
|
|
163
|
+
"""
|
|
164
|
+
from aru.history_blocks import is_tool_result
|
|
165
|
+
total = 0
|
|
166
|
+
for block in msg.get("content", []):
|
|
167
|
+
if is_tool_result(block):
|
|
168
|
+
content = block.get("content")
|
|
169
|
+
if content == CLEARED_TOOL_RESULT:
|
|
170
|
+
continue
|
|
171
|
+
if content is None:
|
|
172
|
+
continue
|
|
173
|
+
# tool_result content can be a string or a list of blocks —
|
|
174
|
+
# stringify to get a char count that roughly tracks tokens.
|
|
175
|
+
total += len(str(content))
|
|
176
|
+
return total
|
|
177
|
+
|
|
178
|
+
|
|
117
179
|
def _get_prune_protect_chars(model_id: str = "default") -> int:
|
|
118
|
-
"""
|
|
119
|
-
|
|
120
|
-
|
|
121
|
-
|
|
122
|
-
|
|
123
|
-
|
|
124
|
-
|
|
125
|
-
|
|
126
|
-
|
|
127
|
-
|
|
128
|
-
|
|
129
|
-
|
|
130
|
-
|
|
180
|
+
"""Chars of recent history that must NEVER be pruned.
|
|
181
|
+
|
|
182
|
+
Flat value across all models, mirroring opencode's fixed
|
|
183
|
+
`PRUNE_PROTECT = 40_000` tokens (compaction.ts:36). At ~4 chars/token
|
|
184
|
+
that's 160K chars of tool-result content kept intact in the recent
|
|
185
|
+
window. Older tool_result blocks beyond this budget are eligible for
|
|
186
|
+
the lossy clear pass in `prune_history`.
|
|
187
|
+
|
|
188
|
+
Why flat (not scaled by model): opencode validated this in production
|
|
189
|
+
on contexts from 128K to 1M — scaling by ratio adds complexity without
|
|
190
|
+
improving behavior, and protecting too much in 1M-context models can
|
|
191
|
+
actually hurt prompt caching by keeping rarely-touched tail content warm.
|
|
192
|
+
|
|
193
|
+
The `model_id` parameter is retained for signature compatibility with
|
|
194
|
+
older call sites; it has no effect on the returned value.
|
|
131
195
|
"""
|
|
132
|
-
|
|
133
|
-
|
|
134
|
-
ratio_based = int(limit * 0.07 * 4)
|
|
135
|
-
# Floor of 60K chars (~17K tokens) keeps the user-visible context
|
|
136
|
-
# window around 20K tokens steady-state after system + cache + output
|
|
137
|
-
# overheads. Applies to any model where 7% would be smaller.
|
|
138
|
-
return max(60_000, min(ratio_based, 200_000))
|
|
196
|
+
del model_id # unused — kept for signature compatibility
|
|
197
|
+
return 160_000
|
|
139
198
|
|
|
140
199
|
|
|
141
200
|
def prune_history(
|
|
142
201
|
history: list[dict], model_id: str = "default"
|
|
143
202
|
) -> list[dict]:
|
|
144
|
-
"""Reduce history token footprint by
|
|
145
|
-
|
|
146
|
-
Operates on block-shaped history (see `aru.history_blocks`).
|
|
147
|
-
|
|
148
|
-
|
|
149
|
-
|
|
150
|
-
|
|
151
|
-
|
|
152
|
-
|
|
153
|
-
|
|
154
|
-
|
|
155
|
-
|
|
156
|
-
|
|
157
|
-
|
|
158
|
-
is
|
|
159
|
-
|
|
160
|
-
|
|
203
|
+
"""Reduce history token footprint by clearing old tool result content.
|
|
204
|
+
|
|
205
|
+
Operates on block-shaped history (see `aru.history_blocks`). Matches
|
|
206
|
+
opencode's approach: the ONLY lossy operation is replacing the
|
|
207
|
+
content of old `tool_result` blocks with a short placeholder. Text
|
|
208
|
+
blocks (user and assistant), `tool_use` blocks, and block structure
|
|
209
|
+
are always preserved — so the original ask survives verbatim until
|
|
210
|
+
real overflow forces a full compaction.
|
|
211
|
+
|
|
212
|
+
**Budget semantics** (opencode parity): the walk backward accumulates
|
|
213
|
+
**only tool_result content chars**, not whole-message chars. Text
|
|
214
|
+
blocks and tool_use args don't consume the protection budget, because
|
|
215
|
+
they aren't what prune can free. Consequences:
|
|
216
|
+
- Text-heavy conversations with few tool calls never trigger prune.
|
|
217
|
+
- Prune only fires when there is >= `protect_chars + PRUNE_MINIMUM_CHARS`
|
|
218
|
+
of tool_result content total — mirroring opencode's
|
|
219
|
+
`total > PRUNE_PROTECT + PRUNE_MINIMUM`.
|
|
220
|
+
- The "is it worth pruning?" dry-run check from opencode
|
|
221
|
+
(`pruned > PRUNE_MINIMUM`) is implicit: we cannot enter the loop
|
|
222
|
+
without enough prunable content, and once in the loop any walk
|
|
223
|
+
past `protect_chars` is guaranteed to be freeing real bytes.
|
|
224
|
+
|
|
225
|
+
Protection layers (applied on top of the budget walk):
|
|
161
226
|
1. Turn-based: last `PRUNE_PROTECT_TURNS` user turns always kept
|
|
162
|
-
intact,
|
|
163
|
-
|
|
227
|
+
intact, plus the assistant response right after each. Index 0
|
|
228
|
+
(the original user ask) is also always protected.
|
|
229
|
+
2. Budget-based: tool_result content within the 160K protect window
|
|
230
|
+
(~40K tokens, matching opencode) is kept.
|
|
164
231
|
3. Content-based: messages whose stringified content contains any
|
|
165
232
|
`PRUNE_PROTECTED_MARKERS` or `PRUNE_PROTECTED_TOOLS` never prune.
|
|
233
|
+
4. Summary checkpoint: walking backward stops at any message marked
|
|
234
|
+
`summary: True` (a previous compaction's assistant output).
|
|
235
|
+
Everything before a summary was already consolidated and must
|
|
236
|
+
not be re-processed.
|
|
166
237
|
|
|
167
238
|
Returns a new list (does not mutate the input).
|
|
168
239
|
"""
|
|
169
240
|
from aru.history_blocks import (
|
|
170
|
-
coerce_history_item,
|
|
171
|
-
is_text, is_tool_use, is_tool_result, text_block,
|
|
241
|
+
coerce_history_item, item_text, is_tool_result,
|
|
172
242
|
)
|
|
173
243
|
|
|
174
244
|
if len(history) <= 2:
|
|
@@ -177,11 +247,15 @@ def prune_history(
|
|
|
177
247
|
protect_chars = _get_prune_protect_chars(model_id)
|
|
178
248
|
result = [coerce_history_item(m) for m in history]
|
|
179
249
|
|
|
180
|
-
|
|
181
|
-
|
|
250
|
+
# Entry gate mirrors opencode: only proceed if total tool output
|
|
251
|
+
# exceeds protect + minimum. Text length is irrelevant.
|
|
252
|
+
total_tool_chars = sum(_tool_result_content_len(msg) for msg in result)
|
|
253
|
+
if total_tool_chars < protect_chars + PRUNE_MINIMUM_CHARS:
|
|
182
254
|
return result
|
|
183
255
|
|
|
184
|
-
# Identify indices of last N user turns (always protected)
|
|
256
|
+
# Identify indices of last N user turns (always protected) and index 0
|
|
257
|
+
# (the original user ask, protected defensively so the anchor never
|
|
258
|
+
# evaporates even if future edits change the budget calculus).
|
|
185
259
|
turn_protected: set[int] = set()
|
|
186
260
|
user_turns_seen = 0
|
|
187
261
|
for i in range(len(result) - 1, -1, -1):
|
|
@@ -191,108 +265,60 @@ def prune_history(
|
|
|
191
265
|
turn_protected.add(i)
|
|
192
266
|
if i + 1 < len(result):
|
|
193
267
|
turn_protected.add(i + 1)
|
|
194
|
-
|
|
195
|
-
|
|
196
|
-
|
|
197
|
-
|
|
198
|
-
|
|
199
|
-
|
|
200
|
-
|
|
201
|
-
|
|
202
|
-
for block in msg["content"]:
|
|
203
|
-
if not is_tool_use(block):
|
|
204
|
-
continue
|
|
205
|
-
tu_id = block.get("id")
|
|
206
|
-
if not tu_id:
|
|
207
|
-
continue
|
|
208
|
-
# Look forward for the matching tool_result (usually i+1)
|
|
209
|
-
for j in range(i + 1, min(i + 3, len(result))):
|
|
210
|
-
for rb in result[j]["content"]:
|
|
211
|
-
if is_tool_result(rb) and rb.get("tool_use_id") == tu_id:
|
|
212
|
-
tool_pair_loc[tu_id] = (i, j)
|
|
213
|
-
break
|
|
214
|
-
if tu_id in tool_pair_loc:
|
|
215
|
-
break
|
|
216
|
-
|
|
217
|
-
# Walk backward, protecting recent content
|
|
268
|
+
if result and result[0]["role"] == "user":
|
|
269
|
+
turn_protected.add(0)
|
|
270
|
+
if len(result) > 1:
|
|
271
|
+
turn_protected.add(1)
|
|
272
|
+
|
|
273
|
+
# Walk backward accumulating ONLY tool_result content chars into the
|
|
274
|
+
# protection budget. Messages with no tool_result (pure text, or just
|
|
275
|
+
# tool_use) consume zero budget and are skipped without pruning.
|
|
218
276
|
protected = 0
|
|
219
|
-
dropped_tool_use_ids: set[str] = set()
|
|
220
277
|
|
|
221
278
|
for i in range(len(result) - 1, -1, -1):
|
|
222
279
|
msg = result[i]
|
|
223
|
-
|
|
280
|
+
|
|
281
|
+
# Stop at the previous compaction summary marker — everything
|
|
282
|
+
# before it was already consolidated into the summary.
|
|
283
|
+
if msg.get("summary"):
|
|
284
|
+
break
|
|
285
|
+
|
|
286
|
+
tool_chars = _tool_result_content_len(msg)
|
|
287
|
+
|
|
288
|
+
# No prunable content here — nothing to clear, nothing to count.
|
|
289
|
+
if tool_chars == 0:
|
|
290
|
+
continue
|
|
224
291
|
|
|
225
292
|
if i in turn_protected:
|
|
226
|
-
protected +=
|
|
293
|
+
protected += tool_chars
|
|
227
294
|
continue
|
|
228
295
|
|
|
229
|
-
if protected +
|
|
230
|
-
protected +=
|
|
296
|
+
if protected + tool_chars <= protect_chars:
|
|
297
|
+
protected += tool_chars
|
|
231
298
|
continue
|
|
232
299
|
|
|
233
300
|
# Outside protection window — check content-based protection
|
|
234
301
|
text_view = item_text(msg)
|
|
235
302
|
if (any(marker in text_view for marker in PRUNE_PROTECTED_MARKERS)
|
|
236
303
|
or any(tool in text_view for tool in PRUNE_PROTECTED_TOOLS)):
|
|
237
|
-
protected +=
|
|
304
|
+
protected += tool_chars
|
|
238
305
|
continue
|
|
239
306
|
|
|
240
|
-
#
|
|
307
|
+
# Clear any tool_result payloads in this message. Leave every
|
|
308
|
+
# other block (text, tool_use, thinking, etc.) untouched.
|
|
241
309
|
new_blocks: list[dict] = []
|
|
242
310
|
for block in msg["content"]:
|
|
243
|
-
if
|
|
244
|
-
|
|
245
|
-
|
|
246
|
-
|
|
247
|
-
|
|
248
|
-
|
|
249
|
-
text = block.get("text", "")
|
|
250
|
-
if len(text) > PRUNE_USER_MSG_THRESHOLD:
|
|
251
|
-
truncated = (
|
|
252
|
-
text[:PRUNE_USER_MSG_KEEP]
|
|
253
|
-
+ f"\n\n[... {len(text) - PRUNE_USER_MSG_KEEP:,} "
|
|
254
|
-
"chars pruned to save context ...]"
|
|
255
|
-
)
|
|
256
|
-
new_blocks.append(text_block(truncated))
|
|
257
|
-
else:
|
|
258
|
-
new_blocks.append(block)
|
|
259
|
-
else:
|
|
260
|
-
new_blocks.append(block)
|
|
261
|
-
elif is_tool_use(block):
|
|
262
|
-
# Drop the tool_use entirely and mark its id for paired removal
|
|
263
|
-
tu_id = block.get("id")
|
|
264
|
-
if tu_id:
|
|
265
|
-
dropped_tool_use_ids.add(tu_id)
|
|
266
|
-
# Do NOT add to new_blocks
|
|
267
|
-
elif is_tool_result(block):
|
|
268
|
-
# Drop only if its paired tool_use is also being dropped
|
|
269
|
-
tu_id = block.get("tool_use_id")
|
|
270
|
-
if tu_id in dropped_tool_use_ids:
|
|
271
|
-
pass # drop
|
|
272
|
-
else:
|
|
273
|
-
new_blocks.append(block)
|
|
311
|
+
if is_tool_result(block) and block.get("content") != CLEARED_TOOL_RESULT:
|
|
312
|
+
new_blocks.append({
|
|
313
|
+
"type": "tool_result",
|
|
314
|
+
"tool_use_id": block.get("tool_use_id"),
|
|
315
|
+
"content": CLEARED_TOOL_RESULT,
|
|
316
|
+
})
|
|
274
317
|
else:
|
|
275
318
|
new_blocks.append(block)
|
|
276
319
|
|
|
277
320
|
result[i] = {"role": msg["role"], "content": new_blocks}
|
|
278
321
|
|
|
279
|
-
# Second pass: any tool_result blocks in user messages whose tool_use
|
|
280
|
-
# was dropped on a previous pass (covers case where user msg was
|
|
281
|
-
# inside protection but its paired assistant was outside).
|
|
282
|
-
if dropped_tool_use_ids:
|
|
283
|
-
for idx, msg in enumerate(result):
|
|
284
|
-
if not msg["content"]:
|
|
285
|
-
continue
|
|
286
|
-
filtered = [
|
|
287
|
-
b for b in msg["content"]
|
|
288
|
-
if not (is_tool_result(b) and b.get("tool_use_id") in dropped_tool_use_ids)
|
|
289
|
-
]
|
|
290
|
-
if len(filtered) != len(msg["content"]):
|
|
291
|
-
result[idx] = {"role": msg["role"], "content": filtered}
|
|
292
|
-
|
|
293
|
-
# Drop any messages that ended up with zero blocks (valid but useless)
|
|
294
|
-
result = [m for m in result if m["content"]]
|
|
295
|
-
|
|
296
322
|
return result
|
|
297
323
|
|
|
298
324
|
|
|
@@ -443,58 +469,78 @@ def should_compact(
|
|
|
443
469
|
) -> bool:
|
|
444
470
|
"""Check if the conversation should be compacted.
|
|
445
471
|
|
|
446
|
-
|
|
447
|
-
|
|
448
|
-
|
|
472
|
+
Fires when the per-call context window reaches real overflow:
|
|
473
|
+
`tokens >= limit - COMPACTION_BUFFER_TOKENS`.
|
|
474
|
+
|
|
475
|
+
Matches opencode's `isOverflow` in overflow.ts:22 — `count >= usable`,
|
|
476
|
+
no extra ratio. Routine context reduction is handled by `prune_history`
|
|
477
|
+
(lossy only on tool outputs), so compaction is reserved for genuine
|
|
478
|
+
overflow where the next API call would otherwise exceed the model's
|
|
479
|
+
input limit minus the reserved buffer.
|
|
449
480
|
|
|
450
481
|
Accepts either an estimated token count (int) or the history list.
|
|
451
482
|
"""
|
|
452
483
|
if isinstance(history_or_tokens, list):
|
|
453
|
-
|
|
454
|
-
tokens = estimate_history_tokens(history)
|
|
455
|
-
# Turn-based trigger: count user messages
|
|
456
|
-
user_turns = sum(1 for m in history if m["role"] == "user")
|
|
457
|
-
if user_turns >= COMPACTION_MAX_TURNS:
|
|
458
|
-
return True
|
|
484
|
+
tokens = estimate_history_tokens(history_or_tokens)
|
|
459
485
|
else:
|
|
460
486
|
tokens = history_or_tokens
|
|
461
487
|
|
|
462
488
|
limit = MODEL_CONTEXT_LIMITS.get(model_id, MODEL_CONTEXT_LIMITS["default"])
|
|
463
489
|
usable = limit - COMPACTION_BUFFER_TOKENS
|
|
464
|
-
|
|
465
|
-
return tokens >= threshold
|
|
490
|
+
return tokens >= usable
|
|
466
491
|
|
|
467
492
|
|
|
468
493
|
def would_prune(history: list[dict], model_id: str = "default") -> bool:
|
|
469
494
|
"""Check if prune_history would discard content from this history.
|
|
470
495
|
|
|
471
|
-
Uses the
|
|
472
|
-
the protection window + minimum prunable
|
|
496
|
+
Uses the same entry gate as `prune_history`: total tool_result
|
|
497
|
+
content must exceed the protection window + minimum prunable
|
|
498
|
+
threshold. Text and tool_use args are not counted — only real
|
|
499
|
+
prunable output. Mirrors opencode's logic.
|
|
473
500
|
"""
|
|
474
|
-
from aru.history_blocks import item_char_len
|
|
475
501
|
if len(history) <= 2:
|
|
476
502
|
return False
|
|
477
|
-
|
|
503
|
+
total_tool_chars = sum(_tool_result_content_len(msg) for msg in history)
|
|
478
504
|
protect_chars = _get_prune_protect_chars(model_id)
|
|
479
|
-
return
|
|
505
|
+
return total_tool_chars >= protect_chars + PRUNE_MINIMUM_CHARS
|
|
480
506
|
|
|
481
507
|
|
|
482
508
|
def _split_history(history: list[dict], model_id: str = "default") -> tuple[list[dict], list[dict]]:
|
|
483
509
|
"""Split history into old (to summarize) and recent (to keep intact).
|
|
484
510
|
|
|
485
|
-
Uses
|
|
511
|
+
Uses `COMPACT_RECENT_CHARS` (80K chars ≈ 20K tokens) as the "recent"
|
|
512
|
+
budget — half of the prune protect window. Rationale: the compactor
|
|
513
|
+
now runs on the main model and produces high-fidelity summaries, so
|
|
514
|
+
we don't need 40K of recent overlap as a safety net. 20K covers 3-6
|
|
515
|
+
recent turns verbatim, which is enough to absorb the gap between
|
|
516
|
+
the last summarized state and the next turn.
|
|
517
|
+
|
|
518
|
+
Defensively, the first user turn (index 0) is always pulled into
|
|
519
|
+
`recent` so the original ask survives literal even through a full
|
|
520
|
+
compaction — the compactor extracts it into the `## Goal` section
|
|
521
|
+
of the summary, but keeping it in recent too means the agent can
|
|
522
|
+
quote it verbatim afterward.
|
|
523
|
+
|
|
524
|
+
The `model_id` parameter is retained for signature compatibility;
|
|
525
|
+
the recent budget is a flat value not scaled by model context.
|
|
486
526
|
"""
|
|
527
|
+
del model_id # unused — recent budget is flat across models
|
|
487
528
|
from aru.history_blocks import item_char_len
|
|
488
|
-
protect_chars = _get_prune_protect_chars(model_id)
|
|
489
529
|
protected = 0
|
|
490
530
|
split_idx = len(history)
|
|
491
531
|
for i in range(len(history) - 1, -1, -1):
|
|
492
532
|
msg_len = item_char_len(history[i])
|
|
493
|
-
if protected + msg_len <=
|
|
533
|
+
if protected + msg_len <= COMPACT_RECENT_CHARS:
|
|
494
534
|
protected += msg_len
|
|
495
535
|
split_idx = i
|
|
496
536
|
else:
|
|
497
537
|
break
|
|
538
|
+
|
|
539
|
+
# Defensive: force the first user turn into `recent` even if the
|
|
540
|
+
# protect budget would have sent it to `old`. The original ask is
|
|
541
|
+
# the session anchor and must stay literal.
|
|
542
|
+
if split_idx > 0 and history and history[0].get("role") == "user":
|
|
543
|
+
return history[1:split_idx], [history[0]] + history[split_idx:]
|
|
498
544
|
return history[:split_idx], history[split_idx:]
|
|
499
545
|
|
|
500
546
|
|
|
@@ -563,12 +609,13 @@ def apply_compaction(
|
|
|
563
609
|
The summary is emitted as a synthetic user→assistant exchange so that
|
|
564
610
|
role alternation stays natural:
|
|
565
611
|
[user: "Please summarize..."]
|
|
566
|
-
[assistant: "<summary>"]
|
|
612
|
+
[assistant: "<summary>", summary=True]
|
|
567
613
|
+ recent messages as-is
|
|
568
614
|
|
|
569
|
-
|
|
570
|
-
|
|
571
|
-
|
|
615
|
+
The assistant summary is marked with `summary: True` as a checkpoint.
|
|
616
|
+
`prune_history` walks backward and stops at this marker, so content
|
|
617
|
+
already consolidated into the summary is never re-processed. Mirrors
|
|
618
|
+
opencode's `msg.info.summary` flag (see message-v2.ts:914).
|
|
572
619
|
"""
|
|
573
620
|
from aru.history_blocks import text_block, coerce_history_item
|
|
574
621
|
_, recent = _split_history(history, model_id)
|
|
@@ -581,6 +628,7 @@ def apply_compaction(
|
|
|
581
628
|
{
|
|
582
629
|
"role": "assistant",
|
|
583
630
|
"content": [text_block(f"Prior conversation summary:\n\n{summary}")],
|
|
631
|
+
"summary": True,
|
|
584
632
|
},
|
|
585
633
|
]
|
|
586
634
|
compacted.extend(coerce_history_item(m) for m in recent)
|
|
@@ -596,10 +644,20 @@ async def compact_conversation(
|
|
|
596
644
|
) -> list[dict[str, str]]:
|
|
597
645
|
"""Run the compaction agent to summarize and replace history.
|
|
598
646
|
|
|
599
|
-
Uses
|
|
600
|
-
|
|
647
|
+
Uses the **same model** as the main session (`model_ref`), not a
|
|
648
|
+
cheaper small model. Rationale:
|
|
649
|
+
|
|
650
|
+
- Compaction is rare (only on real overflow, ~0-2× per long session).
|
|
651
|
+
- The summary is the *only* persistent record of pre-window history.
|
|
652
|
+
- A weaker compactor risks dropping subtle decisions that the main
|
|
653
|
+
model would have caught — and once dropped, they cannot be recovered
|
|
654
|
+
mid-session.
|
|
655
|
+
- The marginal cost (Sonnet: ~$0.20-0.40 per session; Opus: a few
|
|
656
|
+
dollars) is justified by the fidelity gain on a non-recoverable
|
|
657
|
+
step.
|
|
658
|
+
|
|
659
|
+
Falls back to a mechanical summary if the agent call fails.
|
|
601
660
|
"""
|
|
602
|
-
from aru.runtime import get_ctx
|
|
603
661
|
from aru.providers import create_model
|
|
604
662
|
|
|
605
663
|
prompt = build_compaction_prompt(history, plan_task, model_id=model_id)
|
|
@@ -607,16 +665,17 @@ async def compact_conversation(
|
|
|
607
665
|
try:
|
|
608
666
|
from agno.agent import Agent
|
|
609
667
|
|
|
610
|
-
small_ref = get_ctx().small_model_ref
|
|
611
668
|
compactor = Agent(
|
|
612
669
|
name="Compactor",
|
|
613
|
-
model=create_model(
|
|
670
|
+
model=create_model(model_ref, max_tokens=4096),
|
|
614
671
|
instructions=(
|
|
615
672
|
"You summarize coding conversations concisely. Output ONLY the requested sections, no preamble. "
|
|
616
673
|
"Preserve: user goals, explicit instructions/preferences, file paths with line numbers, "
|
|
617
|
-
"function/class names that were modified, what remains to be done
|
|
618
|
-
"
|
|
619
|
-
"
|
|
674
|
+
"function/class names that were modified, what remains to be done. "
|
|
675
|
+
"For the '## File contents (key excerpts)' section, use your judgment: "
|
|
676
|
+
"if a file was central to the work (being debugged, actively edited, or referenced "
|
|
677
|
+
"in a decision), include the critical lines verbatim; if a file was only briefly "
|
|
678
|
+
"read for context, just list the path. Do not mechanically copy everything. "
|
|
620
679
|
"Drop: greetings, reasoning chains, redundant tool output, transient status messages."
|
|
621
680
|
),
|
|
622
681
|
markdown=True,
|
|
@@ -402,9 +402,12 @@ class Session:
|
|
|
402
402
|
self.history.append({"role": role, "content": blocks})
|
|
403
403
|
# Hard cap as safety net — structured pruning/compaction in
|
|
404
404
|
# aru/context.py handles the normal case; this only fires if
|
|
405
|
-
# something bypasses them.
|
|
406
|
-
|
|
407
|
-
|
|
405
|
+
# something bypasses them. Set high enough that long sessions
|
|
406
|
+
# (which now accumulate more messages because prune is
|
|
407
|
+
# non-destructive for text and compact rarely fires) don't hit
|
|
408
|
+
# this destructive path routinely.
|
|
409
|
+
if len(self.history) > 300:
|
|
410
|
+
self.history = self.history[-300:]
|
|
408
411
|
|
|
409
412
|
def add_structured_message(self, role: str, blocks: list[dict]):
|
|
410
413
|
"""Explicitly add a message with pre-built content blocks.
|
|
@@ -314,11 +314,12 @@ class TestSession:
|
|
|
314
314
|
|
|
315
315
|
def test_add_message_caps_history(self):
|
|
316
316
|
session = Session()
|
|
317
|
-
for i in range(
|
|
317
|
+
for i in range(350):
|
|
318
318
|
session.add_message("user", f"msg {i}")
|
|
319
|
-
# History is bounded by a hard cap (structured
|
|
320
|
-
# aru.context handles the normal-path token
|
|
321
|
-
|
|
319
|
+
# History is bounded by a hard safety cap (structured pruning/
|
|
320
|
+
# compaction in aru.context handles the normal-path token
|
|
321
|
+
# management; this cap only fires on pathological growth).
|
|
322
|
+
assert len(session.history) <= 300
|
|
322
323
|
|
|
323
324
|
def test_set_plan(self):
|
|
324
325
|
session = Session()
|
|
@@ -176,17 +176,26 @@ class TestPrunePreservesPairs:
|
|
|
176
176
|
"""Fix 6: pruning must never orphan tool_use / tool_result blocks."""
|
|
177
177
|
|
|
178
178
|
def test_prune_drops_tool_pair_atomically(self):
|
|
179
|
-
"""An old
|
|
180
|
-
|
|
181
|
-
|
|
179
|
+
"""An old tool_result whose content gets cleared must still keep
|
|
180
|
+
its block (matching tool_use_id), so the tool_use/tool_result
|
|
181
|
+
pair is never orphaned.
|
|
182
|
+
|
|
183
|
+
Opencode-aligned budget: prune only counts tool_result content
|
|
184
|
+
chars, so the history needs multiple large tool_result payloads
|
|
185
|
+
to clear the 240K entry gate.
|
|
186
|
+
"""
|
|
187
|
+
big_output = "old file line\n" * 8_000 # ~100K chars per result
|
|
182
188
|
history = [
|
|
183
189
|
{"role": "user", "content": "request 1"},
|
|
184
|
-
_assistant_tool_turn("old_tu", "read_file", {"path": "old.py"}
|
|
185
|
-
_tool_result_turn("old_tu",
|
|
190
|
+
_assistant_tool_turn("old_tu", "read_file", {"path": "old.py"}),
|
|
191
|
+
_tool_result_turn("old_tu", big_output),
|
|
186
192
|
{"role": "user", "content": "request 2"},
|
|
187
|
-
|
|
188
|
-
|
|
189
|
-
{"role": "
|
|
193
|
+
_assistant_tool_turn("mid_tu", "read_file", {"path": "mid.py"}),
|
|
194
|
+
_tool_result_turn("mid_tu", big_output),
|
|
195
|
+
{"role": "user", "content": "request 3"},
|
|
196
|
+
_assistant_tool_turn("recent_tu", "read_file", {"path": "new.py"}),
|
|
197
|
+
_tool_result_turn("recent_tu", big_output),
|
|
198
|
+
{"role": "user", "content": "summarize"},
|
|
190
199
|
]
|
|
191
200
|
|
|
192
201
|
pruned = prune_history(history, model_id="default")
|
|
@@ -208,10 +217,20 @@ class TestPrunePreservesPairs:
|
|
|
208
217
|
)
|
|
209
218
|
|
|
210
219
|
def test_prune_keeps_recent_tool_pair(self):
|
|
211
|
-
"""A tool_use/tool_result pair inside the protection window must be
|
|
220
|
+
"""A tool_use/tool_result pair inside the protection window must be
|
|
221
|
+
kept with its content intact, even when older tool_results get cleared.
|
|
222
|
+
|
|
223
|
+
Builds a history with two big old tool_results (enough to trigger
|
|
224
|
+
prune) and one small recent pair that must survive verbatim.
|
|
225
|
+
"""
|
|
226
|
+
big_old = "old file content\n" * 10_000 # ~170K chars each
|
|
212
227
|
history = [
|
|
213
|
-
{"role": "user", "content": "
|
|
214
|
-
|
|
228
|
+
{"role": "user", "content": "req 1"},
|
|
229
|
+
_assistant_tool_turn("tu_old1", "read_file", {"path": "a.py"}, "reading"),
|
|
230
|
+
_tool_result_turn("tu_old1", big_old),
|
|
231
|
+
{"role": "user", "content": "req 2"},
|
|
232
|
+
_assistant_tool_turn("tu_old2", "read_file", {"path": "b.py"}, "reading"),
|
|
233
|
+
_tool_result_turn("tu_old2", big_old),
|
|
215
234
|
{"role": "user", "content": "read foo"},
|
|
216
235
|
_assistant_tool_turn("tu_recent", "read_file", {"path": "foo.py"}, "reading"),
|
|
217
236
|
_tool_result_turn("tu_recent", "def foo(): pass"),
|
|
@@ -227,6 +246,10 @@ class TestPrunePreservesPairs:
|
|
|
227
246
|
|
|
228
247
|
assert len(tool_uses) == 1, "Recent tool_use was incorrectly pruned"
|
|
229
248
|
assert len(tool_results) == 1, "Recent tool_result was incorrectly pruned"
|
|
249
|
+
# Recent content must be intact (not cleared)
|
|
250
|
+
assert tool_results[0].get("content") == "def foo(): pass", (
|
|
251
|
+
"Recent tool_result content was cleared — should be inside protection window"
|
|
252
|
+
)
|
|
230
253
|
|
|
231
254
|
def test_prune_with_no_pairs_still_works(self):
|
|
232
255
|
"""Pure text history should prune without errors."""
|
|
@@ -10,8 +10,16 @@ from aru.context import (
|
|
|
10
10
|
apply_compaction,
|
|
11
11
|
build_compaction_prompt,
|
|
12
12
|
format_context_block,
|
|
13
|
+
CLEARED_TOOL_RESULT,
|
|
14
|
+
)
|
|
15
|
+
from aru.history_blocks import (
|
|
16
|
+
coerce_history,
|
|
17
|
+
item_text,
|
|
18
|
+
tool_use_block,
|
|
19
|
+
tool_result_block,
|
|
20
|
+
text_block,
|
|
21
|
+
is_tool_result,
|
|
13
22
|
)
|
|
14
|
-
from aru.history_blocks import coerce_history, item_text
|
|
15
23
|
|
|
16
24
|
|
|
17
25
|
class TestPruneHistory:
|
|
@@ -27,37 +35,103 @@ class TestPruneHistory:
|
|
|
27
35
|
# Input is auto-coerced to block form on return
|
|
28
36
|
assert result == coerce_history(messages)
|
|
29
37
|
|
|
30
|
-
def
|
|
31
|
-
"""Should
|
|
32
|
-
|
|
33
|
-
|
|
38
|
+
def test_prunes_old_tool_results_when_over_threshold(self):
|
|
39
|
+
"""Should clear old tool_result content when total tool output
|
|
40
|
+
exceeds protect + minimum (opencode-aligned budget semantics).
|
|
41
|
+
|
|
42
|
+
The budget walks backward over tool_result content chars only.
|
|
43
|
+
Text and tool_use args don't count, so this test uses large
|
|
44
|
+
tool_result payloads to actually trip the prune path.
|
|
45
|
+
"""
|
|
46
|
+
# Three rounds of read_file-sized outputs. Total ~300K chars
|
|
47
|
+
# of tool_result content — clears the 240K entry gate, and
|
|
48
|
+
# the 160K protect budget will cover only the most recent one.
|
|
49
|
+
big_output = "line of code\n" * 8_000 # ~100K chars
|
|
34
50
|
messages = [
|
|
35
|
-
{"role": "user", "content": "
|
|
36
|
-
{
|
|
37
|
-
|
|
38
|
-
|
|
51
|
+
{"role": "user", "content": "round 1"},
|
|
52
|
+
{
|
|
53
|
+
"role": "assistant",
|
|
54
|
+
"content": [
|
|
55
|
+
text_block("reading"),
|
|
56
|
+
tool_use_block("tu_old", "read_file", {"path": "a.py"}),
|
|
57
|
+
],
|
|
58
|
+
},
|
|
59
|
+
{"role": "tool", "content": [tool_result_block("tu_old", big_output)]},
|
|
60
|
+
{"role": "user", "content": "round 2"},
|
|
61
|
+
{
|
|
62
|
+
"role": "assistant",
|
|
63
|
+
"content": [
|
|
64
|
+
text_block("reading"),
|
|
65
|
+
tool_use_block("tu_mid", "read_file", {"path": "b.py"}),
|
|
66
|
+
],
|
|
67
|
+
},
|
|
68
|
+
{"role": "tool", "content": [tool_result_block("tu_mid", big_output)]},
|
|
69
|
+
{"role": "user", "content": "round 3"},
|
|
70
|
+
{
|
|
71
|
+
"role": "assistant",
|
|
72
|
+
"content": [
|
|
73
|
+
text_block("reading"),
|
|
74
|
+
tool_use_block("tu_recent", "read_file", {"path": "c.py"}),
|
|
75
|
+
],
|
|
76
|
+
},
|
|
77
|
+
{"role": "tool", "content": [tool_result_block("tu_recent", big_output)]},
|
|
78
|
+
{"role": "user", "content": "what did you find?"},
|
|
39
79
|
]
|
|
40
80
|
result = prune_history(messages)
|
|
41
|
-
# Should have placeholder for pruned content
|
|
42
|
-
assert len(result) <= len(messages)
|
|
43
|
-
# Recent messages should be preserved
|
|
44
|
-
assert any("Second request" in str(m) for m in result)
|
|
45
|
-
|
|
46
|
-
def test_preserves_user_messages(self):
|
|
47
|
-
"""Should always preserve user messages."""
|
|
48
|
-
old_user = {"role": "user", "content": "Old user message"}
|
|
49
|
-
old_assistant = {"role": "assistant", "content": "Old assistant " * 10000}
|
|
50
|
-
recent = {"role": "user", "content": "Recent request"}
|
|
51
|
-
|
|
52
|
-
messages = [old_user, old_assistant, recent]
|
|
53
|
-
result = prune_history(messages)
|
|
54
81
|
|
|
55
|
-
#
|
|
56
|
-
|
|
57
|
-
|
|
58
|
-
|
|
82
|
+
# Same number of messages (prune never drops structure)
|
|
83
|
+
assert len(result) == len(messages)
|
|
84
|
+
|
|
85
|
+
# Collect tool_result blocks by tool_use_id
|
|
86
|
+
by_id: dict[str, dict] = {}
|
|
87
|
+
for msg in result:
|
|
88
|
+
for block in msg.get("content", []):
|
|
89
|
+
if is_tool_result(block):
|
|
90
|
+
by_id[block.get("tool_use_id")] = block
|
|
91
|
+
|
|
92
|
+
# All three pairs preserved at the block level
|
|
93
|
+
assert set(by_id.keys()) == {"tu_old", "tu_mid", "tu_recent"}
|
|
94
|
+
|
|
95
|
+
# Recent tool_result kept verbatim
|
|
96
|
+
assert by_id["tu_recent"]["content"] == big_output
|
|
97
|
+
|
|
98
|
+
# The older tool_result must have been cleared — at least one
|
|
99
|
+
# of tu_old/tu_mid should now hold the placeholder, since only
|
|
100
|
+
# 160K chars worth fits inside the protect window.
|
|
101
|
+
cleared_count = sum(
|
|
102
|
+
1 for tu_id in ("tu_old", "tu_mid")
|
|
103
|
+
if by_id[tu_id]["content"] == CLEARED_TOOL_RESULT
|
|
104
|
+
)
|
|
105
|
+
assert cleared_count >= 1, (
|
|
106
|
+
"Expected at least one old tool_result to be cleared once "
|
|
107
|
+
"total output exceeded protect + minimum"
|
|
59
108
|
)
|
|
60
|
-
|
|
109
|
+
|
|
110
|
+
def test_text_heavy_history_is_not_pruned(self):
|
|
111
|
+
"""Conversations dominated by text (not tool output) must NOT
|
|
112
|
+
trigger prune even if total chars are huge.
|
|
113
|
+
|
|
114
|
+
This is the opencode-aligned semantics: text blocks don't enter
|
|
115
|
+
the prune budget. A 500K-char text history with no tool_results
|
|
116
|
+
is a no-op for prune_history.
|
|
117
|
+
"""
|
|
118
|
+
messages = [
|
|
119
|
+
{"role": "user", "content": "long planning discussion " * 10_000},
|
|
120
|
+
{"role": "assistant", "content": "detailed reasoning " * 10_000},
|
|
121
|
+
{"role": "user", "content": "what's next?"},
|
|
122
|
+
{"role": "assistant", "content": "here's the plan " * 10_000},
|
|
123
|
+
]
|
|
124
|
+
result = prune_history(messages)
|
|
125
|
+
|
|
126
|
+
# No tool_results exist anywhere in result
|
|
127
|
+
tool_results = [
|
|
128
|
+
b for m in result for b in m.get("content", []) if is_tool_result(b)
|
|
129
|
+
]
|
|
130
|
+
assert tool_results == []
|
|
131
|
+
# Length preserved
|
|
132
|
+
assert len(result) == len(messages)
|
|
133
|
+
# No message content was altered to CLEARED_TOOL_RESULT
|
|
134
|
+
assert all(CLEARED_TOOL_RESULT not in item_text(m) for m in result)
|
|
61
135
|
|
|
62
136
|
def test_empty_history(self):
|
|
63
137
|
"""Should handle empty history."""
|
|
@@ -108,20 +182,21 @@ class TestShouldCompact:
|
|
|
108
182
|
"""Tests for should_compact function."""
|
|
109
183
|
|
|
110
184
|
def test_no_compaction_under_threshold(self):
|
|
111
|
-
"""Should not compact when under
|
|
112
|
-
#
|
|
185
|
+
"""Should not compact when well under the overflow threshold."""
|
|
186
|
+
# claude-sonnet-4-5 has 200K context; usable = 170K (buffer 30K).
|
|
187
|
+
# 5 tokens is well under.
|
|
113
188
|
result = should_compact(5, model_id="claude-sonnet-4-5-20250929")
|
|
114
189
|
assert result is False
|
|
115
190
|
|
|
116
191
|
def test_compaction_over_threshold(self):
|
|
117
|
-
"""Should compact when over threshold."""
|
|
118
|
-
# 300K tokens is over
|
|
192
|
+
"""Should compact when over the real-overflow threshold."""
|
|
193
|
+
# 300K tokens is well over the 170K threshold of a 200K-context model.
|
|
119
194
|
result = should_compact(300000, model_id="claude-sonnet-4-5-20250929")
|
|
120
195
|
assert result is True
|
|
121
196
|
|
|
122
197
|
def test_custom_context_limit(self):
|
|
123
198
|
"""Should respect custom context limit."""
|
|
124
|
-
# gpt-4o has 128K context
|
|
199
|
+
# gpt-4o has 128K context; usable = 98K. 50K is under.
|
|
125
200
|
result = should_compact(50000, model_id="gpt-4o")
|
|
126
201
|
assert isinstance(result, bool)
|
|
127
202
|
|
|
@@ -145,7 +220,8 @@ class TestCompactionTriggerUsesPerCallMetric:
|
|
|
145
220
|
|
|
146
221
|
def test_small_per_call_window_does_not_fire(self):
|
|
147
222
|
"""Reproduces the exact bug report: per-call ~20K on qwen3.6-plus
|
|
148
|
-
(128K limit, ~
|
|
223
|
+
(128K limit, ~98K threshold with 30K buffer) must NOT
|
|
224
|
+
trigger compaction."""
|
|
149
225
|
# Values taken from the real session where compaction fired incorrectly:
|
|
150
226
|
# "context: 20,184 (in: 16,652 / out: 696 / cache_read: 2,836)"
|
|
151
227
|
last_input = 16_652
|
|
@@ -158,7 +234,7 @@ class TestCompactionTriggerUsesPerCallMetric:
|
|
|
158
234
|
)
|
|
159
235
|
assert last_call_window == 20_184, "window computation changed"
|
|
160
236
|
|
|
161
|
-
# 20K is
|
|
237
|
+
# 20K is far below the ~98K threshold for a 128K-context model
|
|
162
238
|
assert should_compact(last_call_window, model_id="qwen3.6-plus") is False, (
|
|
163
239
|
"Compaction fired on a small per-call window. The runner is "
|
|
164
240
|
"probably passing cumulative tokens (run_output.metrics.input_tokens) "
|
|
@@ -169,7 +245,9 @@ class TestCompactionTriggerUsesPerCallMetric:
|
|
|
169
245
|
def test_large_per_call_window_still_fires(self):
|
|
170
246
|
"""Positive case: compaction must still fire when the last-call
|
|
171
247
|
window actually approaches the model's context limit."""
|
|
172
|
-
|
|
248
|
+
# qwen3.6-plus: 128K limit, usable = 98K (buffer 30K).
|
|
249
|
+
# 105K input + 2K output + 0 cache = 107K window → must fire.
|
|
250
|
+
last_input = 105_000
|
|
173
251
|
last_output = 2_000
|
|
174
252
|
last_cache_read = 0
|
|
175
253
|
last_cache_write = 0
|
|
@@ -177,17 +255,17 @@ class TestCompactionTriggerUsesPerCallMetric:
|
|
|
177
255
|
last_call_window = (
|
|
178
256
|
last_input + last_output + last_cache_read + last_cache_write
|
|
179
257
|
)
|
|
180
|
-
assert last_call_window ==
|
|
258
|
+
assert last_call_window == 107_000
|
|
181
259
|
|
|
182
|
-
#
|
|
260
|
+
# 107K > 98K threshold → must fire
|
|
183
261
|
assert should_compact(last_call_window, model_id="qwen3.6-plus") is True
|
|
184
262
|
|
|
185
263
|
def test_cumulative_metric_is_the_wrong_signal(self):
|
|
186
264
|
"""Illustrates WHY the old approach was wrong: a cumulative sum of
|
|
187
|
-
|
|
265
|
+
6 API calls at 18K each is 108K (above threshold), but the actual
|
|
188
266
|
per-call window each time is only 18K (well below)."""
|
|
189
267
|
per_call_window = 18_000
|
|
190
|
-
num_api_calls_in_turn =
|
|
268
|
+
num_api_calls_in_turn = 6
|
|
191
269
|
cumulative_if_summed = per_call_window * num_api_calls_in_turn
|
|
192
270
|
|
|
193
271
|
# Old (wrong) behavior: cumulative triggers compaction
|
|
@@ -196,8 +274,8 @@ class TestCompactionTriggerUsesPerCallMetric:
|
|
|
196
274
|
# New (correct) behavior: per-call does NOT trigger compaction
|
|
197
275
|
assert should_compact(per_call_window, model_id="qwen3.6-plus") is False
|
|
198
276
|
|
|
199
|
-
# The difference is the entire bug
|
|
200
|
-
assert cumulative_if_summed >
|
|
277
|
+
# The difference is the entire bug (threshold is 98K for qwen3.6-plus)
|
|
278
|
+
assert cumulative_if_summed > 98_000 > per_call_window
|
|
201
279
|
|
|
202
280
|
def test_runner_source_uses_per_call_metric(self):
|
|
203
281
|
"""Static check against silent regression.
|
aru_code-0.18.0/aru/__init__.py
DELETED
|
@@ -1 +0,0 @@
|
|
|
1
|
-
__version__ = "0.18.0"
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|