aru-code 0.18.0__tar.gz → 0.19.0__tar.gz
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- {aru_code-0.18.0/aru_code.egg-info → aru_code-0.19.0}/PKG-INFO +1 -1
- aru_code-0.19.0/aru/__init__.py +1 -0
- {aru_code-0.18.0 → aru_code-0.19.0}/aru/context.py +187 -166
- {aru_code-0.18.0 → aru_code-0.19.0}/aru/session.py +6 -3
- {aru_code-0.18.0 → aru_code-0.19.0/aru_code.egg-info}/PKG-INFO +1 -1
- {aru_code-0.18.0 → aru_code-0.19.0}/pyproject.toml +1 -1
- {aru_code-0.18.0 → aru_code-0.19.0}/tests/test_cli.py +5 -4
- {aru_code-0.18.0 → aru_code-0.19.0}/tests/test_confabulation_regression.py +34 -11
- {aru_code-0.18.0 → aru_code-0.19.0}/tests/test_context.py +119 -41
- aru_code-0.18.0/aru/__init__.py +0 -1
- {aru_code-0.18.0 → aru_code-0.19.0}/LICENSE +0 -0
- {aru_code-0.18.0 → aru_code-0.19.0}/README.md +0 -0
- {aru_code-0.18.0 → aru_code-0.19.0}/aru/agent_factory.py +0 -0
- {aru_code-0.18.0 → aru_code-0.19.0}/aru/agents/__init__.py +0 -0
- {aru_code-0.18.0 → aru_code-0.19.0}/aru/agents/base.py +0 -0
- {aru_code-0.18.0 → aru_code-0.19.0}/aru/agents/executor.py +0 -0
- {aru_code-0.18.0 → aru_code-0.19.0}/aru/agents/planner.py +0 -0
- {aru_code-0.18.0 → aru_code-0.19.0}/aru/cache_patch.py +0 -0
- {aru_code-0.18.0 → aru_code-0.19.0}/aru/cli.py +0 -0
- {aru_code-0.18.0 → aru_code-0.19.0}/aru/commands.py +0 -0
- {aru_code-0.18.0 → aru_code-0.19.0}/aru/completers.py +0 -0
- {aru_code-0.18.0 → aru_code-0.19.0}/aru/config.py +0 -0
- {aru_code-0.18.0 → aru_code-0.19.0}/aru/display.py +0 -0
- {aru_code-0.18.0 → aru_code-0.19.0}/aru/history_blocks.py +0 -0
- {aru_code-0.18.0 → aru_code-0.19.0}/aru/permissions.py +0 -0
- {aru_code-0.18.0 → aru_code-0.19.0}/aru/providers.py +0 -0
- {aru_code-0.18.0 → aru_code-0.19.0}/aru/runner.py +0 -0
- {aru_code-0.18.0 → aru_code-0.19.0}/aru/runtime.py +0 -0
- {aru_code-0.18.0 → aru_code-0.19.0}/aru/tools/__init__.py +0 -0
- {aru_code-0.18.0 → aru_code-0.19.0}/aru/tools/ast_tools.py +0 -0
- {aru_code-0.18.0 → aru_code-0.19.0}/aru/tools/codebase.py +0 -0
- {aru_code-0.18.0 → aru_code-0.19.0}/aru/tools/gitignore.py +0 -0
- {aru_code-0.18.0 → aru_code-0.19.0}/aru/tools/mcp_client.py +0 -0
- {aru_code-0.18.0 → aru_code-0.19.0}/aru/tools/ranker.py +0 -0
- {aru_code-0.18.0 → aru_code-0.19.0}/aru/tools/tasklist.py +0 -0
- {aru_code-0.18.0 → aru_code-0.19.0}/aru_code.egg-info/SOURCES.txt +0 -0
- {aru_code-0.18.0 → aru_code-0.19.0}/aru_code.egg-info/dependency_links.txt +0 -0
- {aru_code-0.18.0 → aru_code-0.19.0}/aru_code.egg-info/entry_points.txt +0 -0
- {aru_code-0.18.0 → aru_code-0.19.0}/aru_code.egg-info/requires.txt +0 -0
- {aru_code-0.18.0 → aru_code-0.19.0}/aru_code.egg-info/top_level.txt +0 -0
- {aru_code-0.18.0 → aru_code-0.19.0}/setup.cfg +0 -0
- {aru_code-0.18.0 → aru_code-0.19.0}/tests/test_agents_base.py +0 -0
- {aru_code-0.18.0 → aru_code-0.19.0}/tests/test_cli_advanced.py +0 -0
- {aru_code-0.18.0 → aru_code-0.19.0}/tests/test_cli_base.py +0 -0
- {aru_code-0.18.0 → aru_code-0.19.0}/tests/test_cli_completers.py +0 -0
- {aru_code-0.18.0 → aru_code-0.19.0}/tests/test_cli_new.py +0 -0
- {aru_code-0.18.0 → aru_code-0.19.0}/tests/test_cli_run_cli.py +0 -0
- {aru_code-0.18.0 → aru_code-0.19.0}/tests/test_cli_session.py +0 -0
- {aru_code-0.18.0 → aru_code-0.19.0}/tests/test_cli_shell.py +0 -0
- {aru_code-0.18.0 → aru_code-0.19.0}/tests/test_codebase.py +0 -0
- {aru_code-0.18.0 → aru_code-0.19.0}/tests/test_config.py +0 -0
- {aru_code-0.18.0 → aru_code-0.19.0}/tests/test_executor.py +0 -0
- {aru_code-0.18.0 → aru_code-0.19.0}/tests/test_gitignore.py +0 -0
- {aru_code-0.18.0 → aru_code-0.19.0}/tests/test_main.py +0 -0
- {aru_code-0.18.0 → aru_code-0.19.0}/tests/test_mcp_client.py +0 -0
- {aru_code-0.18.0 → aru_code-0.19.0}/tests/test_permissions.py +0 -0
- {aru_code-0.18.0 → aru_code-0.19.0}/tests/test_planner.py +0 -0
- {aru_code-0.18.0 → aru_code-0.19.0}/tests/test_providers.py +0 -0
- {aru_code-0.18.0 → aru_code-0.19.0}/tests/test_ranker.py +0 -0
|
@@ -0,0 +1 @@
|
|
|
1
|
+
__version__ = "0.19.0"
|
|
@@ -1,28 +1,40 @@
|
|
|
1
1
|
"""Context management for token optimization.
|
|
2
2
|
|
|
3
|
-
|
|
4
|
-
|
|
5
|
-
|
|
6
|
-
|
|
3
|
+
Mirrors opencode's two-layer approach:
|
|
4
|
+
|
|
5
|
+
1. **Prune** (routine, lossy only on tool outputs): walks old tool_result
|
|
6
|
+
blocks and replaces their content with a placeholder. User/assistant
|
|
7
|
+
text is NEVER touched — it survives verbatim until real overflow.
|
|
8
|
+
This is the steady-state memory mechanism. Matches cache_patch.py's
|
|
9
|
+
strategy at the Agno message layer.
|
|
10
|
+
|
|
11
|
+
2. **Compact** (rare, lossy full summary): triggers only when the per-call
|
|
12
|
+
context window actually approaches the model's limit. Runs a
|
|
13
|
+
compaction agent that produces a structured summary (Goal / Instructions
|
|
14
|
+
/ Discoveries / Accomplished / File contents / Relevant files) and
|
|
15
|
+
marks the resulting assistant message with `summary: True` so
|
|
16
|
+
subsequent prunes stop at that checkpoint.
|
|
17
|
+
|
|
18
|
+
There is also a `truncate_output` layer used by individual tools to cap
|
|
19
|
+
their own output size before it ever reaches history.
|
|
7
20
|
"""
|
|
8
21
|
|
|
9
22
|
from __future__ import annotations
|
|
10
23
|
|
|
11
24
|
# ── Constants ──────────────────────────────────────────────────────
|
|
12
25
|
|
|
13
|
-
# Pruning: minimum chars that must be freeable to justify a prune pass
|
|
14
|
-
|
|
15
|
-
|
|
16
|
-
|
|
17
|
-
#
|
|
18
|
-
|
|
19
|
-
|
|
20
|
-
PRUNE_USER_MSG_KEEP = 500 # ~140 tokens
|
|
26
|
+
# Pruning: minimum chars that must be freeable to justify a prune pass.
|
|
27
|
+
# Matches opencode's PRUNE_MINIMUM = 20_000 tokens (~80K chars @ 4 chars/token).
|
|
28
|
+
PRUNE_MINIMUM_CHARS = 80_000 # ~20K tokens
|
|
29
|
+
# Placeholder that replaces cleared tool_result content. Matches
|
|
30
|
+
# cache_patch.py's _PRUNED_PLACEHOLDER so both layers produce identical
|
|
31
|
+
# text when a tool output is cleared.
|
|
32
|
+
CLEARED_TOOL_RESULT = "[Old tool result cleared]"
|
|
21
33
|
# Minimum number of recent user turns always protected (regardless of char budget)
|
|
22
34
|
PRUNE_PROTECT_TURNS = 2
|
|
23
35
|
# Tool result markers that should never be pruned (critical context)
|
|
24
36
|
PRUNE_PROTECTED_MARKERS = {"[SubAgent-", "delegate_task"}
|
|
25
|
-
# Tool names whose outputs should never be pruned (like
|
|
37
|
+
# Tool names whose outputs should never be pruned (like opencode's PRUNE_PROTECTED_TOOLS)
|
|
26
38
|
# These are checked as substrings in message content (tool results include the tool name)
|
|
27
39
|
PRUNE_PROTECTED_TOOLS = {"delegate_task"}
|
|
28
40
|
|
|
@@ -32,17 +44,22 @@ TRUNCATE_MAX_BYTES = 15 * 1024 # 15 KB
|
|
|
32
44
|
TRUNCATE_KEEP_START = 150 # lines to keep from the start
|
|
33
45
|
TRUNCATE_KEEP_END = 60 # lines to keep from the end
|
|
34
46
|
TRUNCATE_MAX_LINE_LENGTH = 1500 # chars per individual line (prevents minified files)
|
|
35
|
-
# Directory for saving full truncated outputs (like
|
|
47
|
+
# Directory for saving full truncated outputs (like opencode pattern)
|
|
36
48
|
TRUNCATE_SAVE_DIR = ".aru/truncated"
|
|
37
49
|
|
|
38
|
-
# Compaction: trigger when per-
|
|
39
|
-
|
|
40
|
-
#
|
|
41
|
-
|
|
42
|
-
#
|
|
43
|
-
|
|
44
|
-
#
|
|
45
|
-
|
|
50
|
+
# Compaction: trigger when per-call input tokens approach real overflow.
|
|
51
|
+
# Matches opencode's philosophy: only fire near the model's actual context
|
|
52
|
+
# limit, not routinely. Routine context reduction is handled by prune_history
|
|
53
|
+
# (lossy only on tool outputs), so compaction is reserved for genuine
|
|
54
|
+
# overflow — where the next API call would otherwise exceed the model's
|
|
55
|
+
# input limit minus the reserved buffer.
|
|
56
|
+
#
|
|
57
|
+
# Opencode fires at `count >= limit.input - reserved` (overflow.ts:22) —
|
|
58
|
+
# no extra ratio. We mirror that here. The sole safety margin is
|
|
59
|
+
# COMPACTION_BUFFER_TOKENS, which is 30K (vs opencode's 20K) to give a bit
|
|
60
|
+
# more headroom for output + tool definitions + estimation noise, since
|
|
61
|
+
# we don't yet have a reactive overflow handler to catch the edge case.
|
|
62
|
+
COMPACTION_BUFFER_TOKENS = 30_000
|
|
46
63
|
# Default model context limits (input tokens)
|
|
47
64
|
MODEL_CONTEXT_LIMITS: dict[str, int] = {
|
|
48
65
|
# Anthropic
|
|
@@ -114,61 +131,97 @@ Structured list of file paths relevant to continuing the work (one per line)."""
|
|
|
114
131
|
|
|
115
132
|
# ── Layer 1: Pruning ──────────────────────────────────────────────
|
|
116
133
|
|
|
134
|
+
def _tool_result_content_len(msg: dict) -> int:
|
|
135
|
+
"""Sum of content length of all non-cleared tool_result blocks in a message.
|
|
136
|
+
|
|
137
|
+
Mirrors opencode's prune walk, which accumulates only
|
|
138
|
+
`Token.estimate(part.state.output)` for `ToolPart`s (compaction.ts:119).
|
|
139
|
+
Text blocks and tool_use args are ignored — they are not the thing
|
|
140
|
+
being freed. This means pruning only "consumes budget" for real tool
|
|
141
|
+
output, so text-heavy conversations with few tool calls never trip
|
|
142
|
+
the prune path.
|
|
143
|
+
|
|
144
|
+
Already-cleared tool_results (content == CLEARED_TOOL_RESULT) are
|
|
145
|
+
skipped so a second pass doesn't double-count them.
|
|
146
|
+
"""
|
|
147
|
+
from aru.history_blocks import is_tool_result
|
|
148
|
+
total = 0
|
|
149
|
+
for block in msg.get("content", []):
|
|
150
|
+
if is_tool_result(block):
|
|
151
|
+
content = block.get("content")
|
|
152
|
+
if content == CLEARED_TOOL_RESULT:
|
|
153
|
+
continue
|
|
154
|
+
if content is None:
|
|
155
|
+
continue
|
|
156
|
+
# tool_result content can be a string or a list of blocks —
|
|
157
|
+
# stringify to get a char count that roughly tracks tokens.
|
|
158
|
+
total += len(str(content))
|
|
159
|
+
return total
|
|
160
|
+
|
|
161
|
+
|
|
117
162
|
def _get_prune_protect_chars(model_id: str = "default") -> int:
|
|
118
|
-
"""
|
|
119
|
-
|
|
120
|
-
|
|
121
|
-
|
|
122
|
-
|
|
123
|
-
|
|
124
|
-
|
|
125
|
-
|
|
126
|
-
|
|
127
|
-
|
|
128
|
-
|
|
129
|
-
|
|
130
|
-
|
|
163
|
+
"""Chars of recent history that must NEVER be pruned.
|
|
164
|
+
|
|
165
|
+
Flat value across all models, mirroring opencode's fixed
|
|
166
|
+
`PRUNE_PROTECT = 40_000` tokens (compaction.ts:36). At ~4 chars/token
|
|
167
|
+
that's 160K chars of tool-result content kept intact in the recent
|
|
168
|
+
window. Older tool_result blocks beyond this budget are eligible for
|
|
169
|
+
the lossy clear pass in `prune_history`.
|
|
170
|
+
|
|
171
|
+
Why flat (not scaled by model): opencode validated this in production
|
|
172
|
+
on contexts from 128K to 1M — scaling by ratio adds complexity without
|
|
173
|
+
improving behavior, and protecting too much in 1M-context models can
|
|
174
|
+
actually hurt prompt caching by keeping rarely-touched tail content warm.
|
|
175
|
+
|
|
176
|
+
The `model_id` parameter is retained for signature compatibility with
|
|
177
|
+
older call sites; it has no effect on the returned value.
|
|
131
178
|
"""
|
|
132
|
-
|
|
133
|
-
|
|
134
|
-
ratio_based = int(limit * 0.07 * 4)
|
|
135
|
-
# Floor of 60K chars (~17K tokens) keeps the user-visible context
|
|
136
|
-
# window around 20K tokens steady-state after system + cache + output
|
|
137
|
-
# overheads. Applies to any model where 7% would be smaller.
|
|
138
|
-
return max(60_000, min(ratio_based, 200_000))
|
|
179
|
+
del model_id # unused — kept for signature compatibility
|
|
180
|
+
return 160_000
|
|
139
181
|
|
|
140
182
|
|
|
141
183
|
def prune_history(
|
|
142
184
|
history: list[dict], model_id: str = "default"
|
|
143
185
|
) -> list[dict]:
|
|
144
|
-
"""Reduce history token footprint by
|
|
145
|
-
|
|
146
|
-
Operates on block-shaped history (see `aru.history_blocks`).
|
|
147
|
-
|
|
148
|
-
|
|
149
|
-
|
|
150
|
-
|
|
151
|
-
|
|
152
|
-
|
|
153
|
-
|
|
154
|
-
|
|
155
|
-
|
|
156
|
-
|
|
157
|
-
|
|
158
|
-
is
|
|
159
|
-
|
|
160
|
-
|
|
186
|
+
"""Reduce history token footprint by clearing old tool result content.
|
|
187
|
+
|
|
188
|
+
Operates on block-shaped history (see `aru.history_blocks`). Matches
|
|
189
|
+
opencode's approach: the ONLY lossy operation is replacing the
|
|
190
|
+
content of old `tool_result` blocks with a short placeholder. Text
|
|
191
|
+
blocks (user and assistant), `tool_use` blocks, and block structure
|
|
192
|
+
are always preserved — so the original ask survives verbatim until
|
|
193
|
+
real overflow forces a full compaction.
|
|
194
|
+
|
|
195
|
+
**Budget semantics** (opencode parity): the walk backward accumulates
|
|
196
|
+
**only tool_result content chars**, not whole-message chars. Text
|
|
197
|
+
blocks and tool_use args don't consume the protection budget, because
|
|
198
|
+
they aren't what prune can free. Consequences:
|
|
199
|
+
- Text-heavy conversations with few tool calls never trigger prune.
|
|
200
|
+
- Prune only fires when there is >= `protect_chars + PRUNE_MINIMUM_CHARS`
|
|
201
|
+
of tool_result content total — mirroring opencode's
|
|
202
|
+
`total > PRUNE_PROTECT + PRUNE_MINIMUM`.
|
|
203
|
+
- The "is it worth pruning?" dry-run check from opencode
|
|
204
|
+
(`pruned > PRUNE_MINIMUM`) is implicit: we cannot enter the loop
|
|
205
|
+
without enough prunable content, and once in the loop any walk
|
|
206
|
+
past `protect_chars` is guaranteed to be freeing real bytes.
|
|
207
|
+
|
|
208
|
+
Protection layers (applied on top of the budget walk):
|
|
161
209
|
1. Turn-based: last `PRUNE_PROTECT_TURNS` user turns always kept
|
|
162
|
-
intact,
|
|
163
|
-
|
|
210
|
+
intact, plus the assistant response right after each. Index 0
|
|
211
|
+
(the original user ask) is also always protected.
|
|
212
|
+
2. Budget-based: tool_result content within the 160K protect window
|
|
213
|
+
(~40K tokens, matching opencode) is kept.
|
|
164
214
|
3. Content-based: messages whose stringified content contains any
|
|
165
215
|
`PRUNE_PROTECTED_MARKERS` or `PRUNE_PROTECTED_TOOLS` never prune.
|
|
216
|
+
4. Summary checkpoint: walking backward stops at any message marked
|
|
217
|
+
`summary: True` (a previous compaction's assistant output).
|
|
218
|
+
Everything before a summary was already consolidated and must
|
|
219
|
+
not be re-processed.
|
|
166
220
|
|
|
167
221
|
Returns a new list (does not mutate the input).
|
|
168
222
|
"""
|
|
169
223
|
from aru.history_blocks import (
|
|
170
|
-
coerce_history_item,
|
|
171
|
-
is_text, is_tool_use, is_tool_result, text_block,
|
|
224
|
+
coerce_history_item, item_text, is_tool_result,
|
|
172
225
|
)
|
|
173
226
|
|
|
174
227
|
if len(history) <= 2:
|
|
@@ -177,11 +230,15 @@ def prune_history(
|
|
|
177
230
|
protect_chars = _get_prune_protect_chars(model_id)
|
|
178
231
|
result = [coerce_history_item(m) for m in history]
|
|
179
232
|
|
|
180
|
-
|
|
181
|
-
|
|
233
|
+
# Entry gate mirrors opencode: only proceed if total tool output
|
|
234
|
+
# exceeds protect + minimum. Text length is irrelevant.
|
|
235
|
+
total_tool_chars = sum(_tool_result_content_len(msg) for msg in result)
|
|
236
|
+
if total_tool_chars < protect_chars + PRUNE_MINIMUM_CHARS:
|
|
182
237
|
return result
|
|
183
238
|
|
|
184
|
-
# Identify indices of last N user turns (always protected)
|
|
239
|
+
# Identify indices of last N user turns (always protected) and index 0
|
|
240
|
+
# (the original user ask, protected defensively so the anchor never
|
|
241
|
+
# evaporates even if future edits change the budget calculus).
|
|
185
242
|
turn_protected: set[int] = set()
|
|
186
243
|
user_turns_seen = 0
|
|
187
244
|
for i in range(len(result) - 1, -1, -1):
|
|
@@ -191,108 +248,60 @@ def prune_history(
|
|
|
191
248
|
turn_protected.add(i)
|
|
192
249
|
if i + 1 < len(result):
|
|
193
250
|
turn_protected.add(i + 1)
|
|
194
|
-
|
|
195
|
-
|
|
196
|
-
|
|
197
|
-
|
|
198
|
-
|
|
199
|
-
|
|
200
|
-
|
|
201
|
-
|
|
202
|
-
for block in msg["content"]:
|
|
203
|
-
if not is_tool_use(block):
|
|
204
|
-
continue
|
|
205
|
-
tu_id = block.get("id")
|
|
206
|
-
if not tu_id:
|
|
207
|
-
continue
|
|
208
|
-
# Look forward for the matching tool_result (usually i+1)
|
|
209
|
-
for j in range(i + 1, min(i + 3, len(result))):
|
|
210
|
-
for rb in result[j]["content"]:
|
|
211
|
-
if is_tool_result(rb) and rb.get("tool_use_id") == tu_id:
|
|
212
|
-
tool_pair_loc[tu_id] = (i, j)
|
|
213
|
-
break
|
|
214
|
-
if tu_id in tool_pair_loc:
|
|
215
|
-
break
|
|
216
|
-
|
|
217
|
-
# Walk backward, protecting recent content
|
|
251
|
+
if result and result[0]["role"] == "user":
|
|
252
|
+
turn_protected.add(0)
|
|
253
|
+
if len(result) > 1:
|
|
254
|
+
turn_protected.add(1)
|
|
255
|
+
|
|
256
|
+
# Walk backward accumulating ONLY tool_result content chars into the
|
|
257
|
+
# protection budget. Messages with no tool_result (pure text, or just
|
|
258
|
+
# tool_use) consume zero budget and are skipped without pruning.
|
|
218
259
|
protected = 0
|
|
219
|
-
dropped_tool_use_ids: set[str] = set()
|
|
220
260
|
|
|
221
261
|
for i in range(len(result) - 1, -1, -1):
|
|
222
262
|
msg = result[i]
|
|
223
|
-
|
|
263
|
+
|
|
264
|
+
# Stop at the previous compaction summary marker — everything
|
|
265
|
+
# before it was already consolidated into the summary.
|
|
266
|
+
if msg.get("summary"):
|
|
267
|
+
break
|
|
268
|
+
|
|
269
|
+
tool_chars = _tool_result_content_len(msg)
|
|
270
|
+
|
|
271
|
+
# No prunable content here — nothing to clear, nothing to count.
|
|
272
|
+
if tool_chars == 0:
|
|
273
|
+
continue
|
|
224
274
|
|
|
225
275
|
if i in turn_protected:
|
|
226
|
-
protected +=
|
|
276
|
+
protected += tool_chars
|
|
227
277
|
continue
|
|
228
278
|
|
|
229
|
-
if protected +
|
|
230
|
-
protected +=
|
|
279
|
+
if protected + tool_chars <= protect_chars:
|
|
280
|
+
protected += tool_chars
|
|
231
281
|
continue
|
|
232
282
|
|
|
233
283
|
# Outside protection window — check content-based protection
|
|
234
284
|
text_view = item_text(msg)
|
|
235
285
|
if (any(marker in text_view for marker in PRUNE_PROTECTED_MARKERS)
|
|
236
286
|
or any(tool in text_view for tool in PRUNE_PROTECTED_TOOLS)):
|
|
237
|
-
protected +=
|
|
287
|
+
protected += tool_chars
|
|
238
288
|
continue
|
|
239
289
|
|
|
240
|
-
#
|
|
290
|
+
# Clear any tool_result payloads in this message. Leave every
|
|
291
|
+
# other block (text, tool_use, thinking, etc.) untouched.
|
|
241
292
|
new_blocks: list[dict] = []
|
|
242
293
|
for block in msg["content"]:
|
|
243
|
-
if
|
|
244
|
-
|
|
245
|
-
|
|
246
|
-
|
|
247
|
-
|
|
248
|
-
|
|
249
|
-
text = block.get("text", "")
|
|
250
|
-
if len(text) > PRUNE_USER_MSG_THRESHOLD:
|
|
251
|
-
truncated = (
|
|
252
|
-
text[:PRUNE_USER_MSG_KEEP]
|
|
253
|
-
+ f"\n\n[... {len(text) - PRUNE_USER_MSG_KEEP:,} "
|
|
254
|
-
"chars pruned to save context ...]"
|
|
255
|
-
)
|
|
256
|
-
new_blocks.append(text_block(truncated))
|
|
257
|
-
else:
|
|
258
|
-
new_blocks.append(block)
|
|
259
|
-
else:
|
|
260
|
-
new_blocks.append(block)
|
|
261
|
-
elif is_tool_use(block):
|
|
262
|
-
# Drop the tool_use entirely and mark its id for paired removal
|
|
263
|
-
tu_id = block.get("id")
|
|
264
|
-
if tu_id:
|
|
265
|
-
dropped_tool_use_ids.add(tu_id)
|
|
266
|
-
# Do NOT add to new_blocks
|
|
267
|
-
elif is_tool_result(block):
|
|
268
|
-
# Drop only if its paired tool_use is also being dropped
|
|
269
|
-
tu_id = block.get("tool_use_id")
|
|
270
|
-
if tu_id in dropped_tool_use_ids:
|
|
271
|
-
pass # drop
|
|
272
|
-
else:
|
|
273
|
-
new_blocks.append(block)
|
|
294
|
+
if is_tool_result(block) and block.get("content") != CLEARED_TOOL_RESULT:
|
|
295
|
+
new_blocks.append({
|
|
296
|
+
"type": "tool_result",
|
|
297
|
+
"tool_use_id": block.get("tool_use_id"),
|
|
298
|
+
"content": CLEARED_TOOL_RESULT,
|
|
299
|
+
})
|
|
274
300
|
else:
|
|
275
301
|
new_blocks.append(block)
|
|
276
302
|
|
|
277
303
|
result[i] = {"role": msg["role"], "content": new_blocks}
|
|
278
304
|
|
|
279
|
-
# Second pass: any tool_result blocks in user messages whose tool_use
|
|
280
|
-
# was dropped on a previous pass (covers case where user msg was
|
|
281
|
-
# inside protection but its paired assistant was outside).
|
|
282
|
-
if dropped_tool_use_ids:
|
|
283
|
-
for idx, msg in enumerate(result):
|
|
284
|
-
if not msg["content"]:
|
|
285
|
-
continue
|
|
286
|
-
filtered = [
|
|
287
|
-
b for b in msg["content"]
|
|
288
|
-
if not (is_tool_result(b) and b.get("tool_use_id") in dropped_tool_use_ids)
|
|
289
|
-
]
|
|
290
|
-
if len(filtered) != len(msg["content"]):
|
|
291
|
-
result[idx] = {"role": msg["role"], "content": filtered}
|
|
292
|
-
|
|
293
|
-
# Drop any messages that ended up with zero blocks (valid but useless)
|
|
294
|
-
result = [m for m in result if m["content"]]
|
|
295
|
-
|
|
296
305
|
return result
|
|
297
306
|
|
|
298
307
|
|
|
@@ -443,46 +452,50 @@ def should_compact(
|
|
|
443
452
|
) -> bool:
|
|
444
453
|
"""Check if the conversation should be compacted.
|
|
445
454
|
|
|
446
|
-
|
|
447
|
-
|
|
448
|
-
|
|
455
|
+
Fires when the per-call context window reaches real overflow:
|
|
456
|
+
`tokens >= limit - COMPACTION_BUFFER_TOKENS`.
|
|
457
|
+
|
|
458
|
+
Matches opencode's `isOverflow` in overflow.ts:22 — `count >= usable`,
|
|
459
|
+
no extra ratio. Routine context reduction is handled by `prune_history`
|
|
460
|
+
(lossy only on tool outputs), so compaction is reserved for genuine
|
|
461
|
+
overflow where the next API call would otherwise exceed the model's
|
|
462
|
+
input limit minus the reserved buffer.
|
|
449
463
|
|
|
450
464
|
Accepts either an estimated token count (int) or the history list.
|
|
451
465
|
"""
|
|
452
466
|
if isinstance(history_or_tokens, list):
|
|
453
|
-
|
|
454
|
-
tokens = estimate_history_tokens(history)
|
|
455
|
-
# Turn-based trigger: count user messages
|
|
456
|
-
user_turns = sum(1 for m in history if m["role"] == "user")
|
|
457
|
-
if user_turns >= COMPACTION_MAX_TURNS:
|
|
458
|
-
return True
|
|
467
|
+
tokens = estimate_history_tokens(history_or_tokens)
|
|
459
468
|
else:
|
|
460
469
|
tokens = history_or_tokens
|
|
461
470
|
|
|
462
471
|
limit = MODEL_CONTEXT_LIMITS.get(model_id, MODEL_CONTEXT_LIMITS["default"])
|
|
463
472
|
usable = limit - COMPACTION_BUFFER_TOKENS
|
|
464
|
-
|
|
465
|
-
return tokens >= threshold
|
|
473
|
+
return tokens >= usable
|
|
466
474
|
|
|
467
475
|
|
|
468
476
|
def would_prune(history: list[dict], model_id: str = "default") -> bool:
|
|
469
477
|
"""Check if prune_history would discard content from this history.
|
|
470
478
|
|
|
471
|
-
Uses the
|
|
472
|
-
the protection window + minimum prunable
|
|
479
|
+
Uses the same entry gate as `prune_history`: total tool_result
|
|
480
|
+
content must exceed the protection window + minimum prunable
|
|
481
|
+
threshold. Text and tool_use args are not counted — only real
|
|
482
|
+
prunable output. Mirrors opencode's logic.
|
|
473
483
|
"""
|
|
474
|
-
from aru.history_blocks import item_char_len
|
|
475
484
|
if len(history) <= 2:
|
|
476
485
|
return False
|
|
477
|
-
|
|
486
|
+
total_tool_chars = sum(_tool_result_content_len(msg) for msg in history)
|
|
478
487
|
protect_chars = _get_prune_protect_chars(model_id)
|
|
479
|
-
return
|
|
488
|
+
return total_tool_chars >= protect_chars + PRUNE_MINIMUM_CHARS
|
|
480
489
|
|
|
481
490
|
|
|
482
491
|
def _split_history(history: list[dict], model_id: str = "default") -> tuple[list[dict], list[dict]]:
|
|
483
492
|
"""Split history into old (to summarize) and recent (to keep intact).
|
|
484
493
|
|
|
485
|
-
Uses the same protection window as pruning.
|
|
494
|
+
Uses the same protection window as pruning. Defensively, the first
|
|
495
|
+
user turn (index 0) is always pulled into `recent` so the original
|
|
496
|
+
ask survives literal even through a full compaction — the compactor
|
|
497
|
+
extracts it into the `## Goal` section of the summary, but keeping
|
|
498
|
+
it in recent too means the agent can quote it verbatim afterward.
|
|
486
499
|
"""
|
|
487
500
|
from aru.history_blocks import item_char_len
|
|
488
501
|
protect_chars = _get_prune_protect_chars(model_id)
|
|
@@ -495,6 +508,12 @@ def _split_history(history: list[dict], model_id: str = "default") -> tuple[list
|
|
|
495
508
|
split_idx = i
|
|
496
509
|
else:
|
|
497
510
|
break
|
|
511
|
+
|
|
512
|
+
# Defensive: force the first user turn into `recent` even if the
|
|
513
|
+
# protect budget would have sent it to `old`. The original ask is
|
|
514
|
+
# the session anchor and must stay literal.
|
|
515
|
+
if split_idx > 0 and history and history[0].get("role") == "user":
|
|
516
|
+
return history[1:split_idx], [history[0]] + history[split_idx:]
|
|
498
517
|
return history[:split_idx], history[split_idx:]
|
|
499
518
|
|
|
500
519
|
|
|
@@ -563,12 +582,13 @@ def apply_compaction(
|
|
|
563
582
|
The summary is emitted as a synthetic user→assistant exchange so that
|
|
564
583
|
role alternation stays natural:
|
|
565
584
|
[user: "Please summarize..."]
|
|
566
|
-
[assistant: "<summary>"]
|
|
585
|
+
[assistant: "<summary>", summary=True]
|
|
567
586
|
+ recent messages as-is
|
|
568
587
|
|
|
569
|
-
|
|
570
|
-
|
|
571
|
-
|
|
588
|
+
The assistant summary is marked with `summary: True` as a checkpoint.
|
|
589
|
+
`prune_history` walks backward and stops at this marker, so content
|
|
590
|
+
already consolidated into the summary is never re-processed. Mirrors
|
|
591
|
+
opencode's `msg.info.summary` flag (see message-v2.ts:914).
|
|
572
592
|
"""
|
|
573
593
|
from aru.history_blocks import text_block, coerce_history_item
|
|
574
594
|
_, recent = _split_history(history, model_id)
|
|
@@ -581,6 +601,7 @@ def apply_compaction(
|
|
|
581
601
|
{
|
|
582
602
|
"role": "assistant",
|
|
583
603
|
"content": [text_block(f"Prior conversation summary:\n\n{summary}")],
|
|
604
|
+
"summary": True,
|
|
584
605
|
},
|
|
585
606
|
]
|
|
586
607
|
compacted.extend(coerce_history_item(m) for m in recent)
|
|
@@ -402,9 +402,12 @@ class Session:
|
|
|
402
402
|
self.history.append({"role": role, "content": blocks})
|
|
403
403
|
# Hard cap as safety net — structured pruning/compaction in
|
|
404
404
|
# aru/context.py handles the normal case; this only fires if
|
|
405
|
-
# something bypasses them.
|
|
406
|
-
|
|
407
|
-
|
|
405
|
+
# something bypasses them. Set high enough that long sessions
|
|
406
|
+
# (which now accumulate more messages because prune is
|
|
407
|
+
# non-destructive for text and compact rarely fires) don't hit
|
|
408
|
+
# this destructive path routinely.
|
|
409
|
+
if len(self.history) > 300:
|
|
410
|
+
self.history = self.history[-300:]
|
|
408
411
|
|
|
409
412
|
def add_structured_message(self, role: str, blocks: list[dict]):
|
|
410
413
|
"""Explicitly add a message with pre-built content blocks.
|
|
@@ -314,11 +314,12 @@ class TestSession:
|
|
|
314
314
|
|
|
315
315
|
def test_add_message_caps_history(self):
|
|
316
316
|
session = Session()
|
|
317
|
-
for i in range(
|
|
317
|
+
for i in range(350):
|
|
318
318
|
session.add_message("user", f"msg {i}")
|
|
319
|
-
# History is bounded by a hard cap (structured
|
|
320
|
-
# aru.context handles the normal-path token
|
|
321
|
-
|
|
319
|
+
# History is bounded by a hard safety cap (structured pruning/
|
|
320
|
+
# compaction in aru.context handles the normal-path token
|
|
321
|
+
# management; this cap only fires on pathological growth).
|
|
322
|
+
assert len(session.history) <= 300
|
|
322
323
|
|
|
323
324
|
def test_set_plan(self):
|
|
324
325
|
session = Session()
|
|
@@ -176,17 +176,26 @@ class TestPrunePreservesPairs:
|
|
|
176
176
|
"""Fix 6: pruning must never orphan tool_use / tool_result blocks."""
|
|
177
177
|
|
|
178
178
|
def test_prune_drops_tool_pair_atomically(self):
|
|
179
|
-
"""An old
|
|
180
|
-
|
|
181
|
-
|
|
179
|
+
"""An old tool_result whose content gets cleared must still keep
|
|
180
|
+
its block (matching tool_use_id), so the tool_use/tool_result
|
|
181
|
+
pair is never orphaned.
|
|
182
|
+
|
|
183
|
+
Opencode-aligned budget: prune only counts tool_result content
|
|
184
|
+
chars, so the history needs multiple large tool_result payloads
|
|
185
|
+
to clear the 240K entry gate.
|
|
186
|
+
"""
|
|
187
|
+
big_output = "old file line\n" * 8_000 # ~100K chars per result
|
|
182
188
|
history = [
|
|
183
189
|
{"role": "user", "content": "request 1"},
|
|
184
|
-
_assistant_tool_turn("old_tu", "read_file", {"path": "old.py"}
|
|
185
|
-
_tool_result_turn("old_tu",
|
|
190
|
+
_assistant_tool_turn("old_tu", "read_file", {"path": "old.py"}),
|
|
191
|
+
_tool_result_turn("old_tu", big_output),
|
|
186
192
|
{"role": "user", "content": "request 2"},
|
|
187
|
-
|
|
188
|
-
|
|
189
|
-
{"role": "
|
|
193
|
+
_assistant_tool_turn("mid_tu", "read_file", {"path": "mid.py"}),
|
|
194
|
+
_tool_result_turn("mid_tu", big_output),
|
|
195
|
+
{"role": "user", "content": "request 3"},
|
|
196
|
+
_assistant_tool_turn("recent_tu", "read_file", {"path": "new.py"}),
|
|
197
|
+
_tool_result_turn("recent_tu", big_output),
|
|
198
|
+
{"role": "user", "content": "summarize"},
|
|
190
199
|
]
|
|
191
200
|
|
|
192
201
|
pruned = prune_history(history, model_id="default")
|
|
@@ -208,10 +217,20 @@ class TestPrunePreservesPairs:
|
|
|
208
217
|
)
|
|
209
218
|
|
|
210
219
|
def test_prune_keeps_recent_tool_pair(self):
|
|
211
|
-
"""A tool_use/tool_result pair inside the protection window must be
|
|
220
|
+
"""A tool_use/tool_result pair inside the protection window must be
|
|
221
|
+
kept with its content intact, even when older tool_results get cleared.
|
|
222
|
+
|
|
223
|
+
Builds a history with two big old tool_results (enough to trigger
|
|
224
|
+
prune) and one small recent pair that must survive verbatim.
|
|
225
|
+
"""
|
|
226
|
+
big_old = "old file content\n" * 10_000 # ~170K chars each
|
|
212
227
|
history = [
|
|
213
|
-
{"role": "user", "content": "
|
|
214
|
-
|
|
228
|
+
{"role": "user", "content": "req 1"},
|
|
229
|
+
_assistant_tool_turn("tu_old1", "read_file", {"path": "a.py"}, "reading"),
|
|
230
|
+
_tool_result_turn("tu_old1", big_old),
|
|
231
|
+
{"role": "user", "content": "req 2"},
|
|
232
|
+
_assistant_tool_turn("tu_old2", "read_file", {"path": "b.py"}, "reading"),
|
|
233
|
+
_tool_result_turn("tu_old2", big_old),
|
|
215
234
|
{"role": "user", "content": "read foo"},
|
|
216
235
|
_assistant_tool_turn("tu_recent", "read_file", {"path": "foo.py"}, "reading"),
|
|
217
236
|
_tool_result_turn("tu_recent", "def foo(): pass"),
|
|
@@ -227,6 +246,10 @@ class TestPrunePreservesPairs:
|
|
|
227
246
|
|
|
228
247
|
assert len(tool_uses) == 1, "Recent tool_use was incorrectly pruned"
|
|
229
248
|
assert len(tool_results) == 1, "Recent tool_result was incorrectly pruned"
|
|
249
|
+
# Recent content must be intact (not cleared)
|
|
250
|
+
assert tool_results[0].get("content") == "def foo(): pass", (
|
|
251
|
+
"Recent tool_result content was cleared — should be inside protection window"
|
|
252
|
+
)
|
|
230
253
|
|
|
231
254
|
def test_prune_with_no_pairs_still_works(self):
|
|
232
255
|
"""Pure text history should prune without errors."""
|
|
@@ -10,8 +10,16 @@ from aru.context import (
|
|
|
10
10
|
apply_compaction,
|
|
11
11
|
build_compaction_prompt,
|
|
12
12
|
format_context_block,
|
|
13
|
+
CLEARED_TOOL_RESULT,
|
|
14
|
+
)
|
|
15
|
+
from aru.history_blocks import (
|
|
16
|
+
coerce_history,
|
|
17
|
+
item_text,
|
|
18
|
+
tool_use_block,
|
|
19
|
+
tool_result_block,
|
|
20
|
+
text_block,
|
|
21
|
+
is_tool_result,
|
|
13
22
|
)
|
|
14
|
-
from aru.history_blocks import coerce_history, item_text
|
|
15
23
|
|
|
16
24
|
|
|
17
25
|
class TestPruneHistory:
|
|
@@ -27,37 +35,103 @@ class TestPruneHistory:
|
|
|
27
35
|
# Input is auto-coerced to block form on return
|
|
28
36
|
assert result == coerce_history(messages)
|
|
29
37
|
|
|
30
|
-
def
|
|
31
|
-
"""Should
|
|
32
|
-
|
|
33
|
-
|
|
38
|
+
def test_prunes_old_tool_results_when_over_threshold(self):
|
|
39
|
+
"""Should clear old tool_result content when total tool output
|
|
40
|
+
exceeds protect + minimum (opencode-aligned budget semantics).
|
|
41
|
+
|
|
42
|
+
The budget walks backward over tool_result content chars only.
|
|
43
|
+
Text and tool_use args don't count, so this test uses large
|
|
44
|
+
tool_result payloads to actually trip the prune path.
|
|
45
|
+
"""
|
|
46
|
+
# Three rounds of read_file-sized outputs. Total ~300K chars
|
|
47
|
+
# of tool_result content — clears the 240K entry gate, and
|
|
48
|
+
# the 160K protect budget will cover only the most recent one.
|
|
49
|
+
big_output = "line of code\n" * 8_000 # ~100K chars
|
|
34
50
|
messages = [
|
|
35
|
-
{"role": "user", "content": "
|
|
36
|
-
{
|
|
37
|
-
|
|
38
|
-
|
|
51
|
+
{"role": "user", "content": "round 1"},
|
|
52
|
+
{
|
|
53
|
+
"role": "assistant",
|
|
54
|
+
"content": [
|
|
55
|
+
text_block("reading"),
|
|
56
|
+
tool_use_block("tu_old", "read_file", {"path": "a.py"}),
|
|
57
|
+
],
|
|
58
|
+
},
|
|
59
|
+
{"role": "tool", "content": [tool_result_block("tu_old", big_output)]},
|
|
60
|
+
{"role": "user", "content": "round 2"},
|
|
61
|
+
{
|
|
62
|
+
"role": "assistant",
|
|
63
|
+
"content": [
|
|
64
|
+
text_block("reading"),
|
|
65
|
+
tool_use_block("tu_mid", "read_file", {"path": "b.py"}),
|
|
66
|
+
],
|
|
67
|
+
},
|
|
68
|
+
{"role": "tool", "content": [tool_result_block("tu_mid", big_output)]},
|
|
69
|
+
{"role": "user", "content": "round 3"},
|
|
70
|
+
{
|
|
71
|
+
"role": "assistant",
|
|
72
|
+
"content": [
|
|
73
|
+
text_block("reading"),
|
|
74
|
+
tool_use_block("tu_recent", "read_file", {"path": "c.py"}),
|
|
75
|
+
],
|
|
76
|
+
},
|
|
77
|
+
{"role": "tool", "content": [tool_result_block("tu_recent", big_output)]},
|
|
78
|
+
{"role": "user", "content": "what did you find?"},
|
|
39
79
|
]
|
|
40
80
|
result = prune_history(messages)
|
|
41
|
-
# Should have placeholder for pruned content
|
|
42
|
-
assert len(result) <= len(messages)
|
|
43
|
-
# Recent messages should be preserved
|
|
44
|
-
assert any("Second request" in str(m) for m in result)
|
|
45
|
-
|
|
46
|
-
def test_preserves_user_messages(self):
|
|
47
|
-
"""Should always preserve user messages."""
|
|
48
|
-
old_user = {"role": "user", "content": "Old user message"}
|
|
49
|
-
old_assistant = {"role": "assistant", "content": "Old assistant " * 10000}
|
|
50
|
-
recent = {"role": "user", "content": "Recent request"}
|
|
51
|
-
|
|
52
|
-
messages = [old_user, old_assistant, recent]
|
|
53
|
-
result = prune_history(messages)
|
|
54
81
|
|
|
55
|
-
#
|
|
56
|
-
|
|
57
|
-
|
|
58
|
-
|
|
82
|
+
# Same number of messages (prune never drops structure)
|
|
83
|
+
assert len(result) == len(messages)
|
|
84
|
+
|
|
85
|
+
# Collect tool_result blocks by tool_use_id
|
|
86
|
+
by_id: dict[str, dict] = {}
|
|
87
|
+
for msg in result:
|
|
88
|
+
for block in msg.get("content", []):
|
|
89
|
+
if is_tool_result(block):
|
|
90
|
+
by_id[block.get("tool_use_id")] = block
|
|
91
|
+
|
|
92
|
+
# All three pairs preserved at the block level
|
|
93
|
+
assert set(by_id.keys()) == {"tu_old", "tu_mid", "tu_recent"}
|
|
94
|
+
|
|
95
|
+
# Recent tool_result kept verbatim
|
|
96
|
+
assert by_id["tu_recent"]["content"] == big_output
|
|
97
|
+
|
|
98
|
+
# The older tool_result must have been cleared — at least one
|
|
99
|
+
# of tu_old/tu_mid should now hold the placeholder, since only
|
|
100
|
+
# 160K chars worth fits inside the protect window.
|
|
101
|
+
cleared_count = sum(
|
|
102
|
+
1 for tu_id in ("tu_old", "tu_mid")
|
|
103
|
+
if by_id[tu_id]["content"] == CLEARED_TOOL_RESULT
|
|
104
|
+
)
|
|
105
|
+
assert cleared_count >= 1, (
|
|
106
|
+
"Expected at least one old tool_result to be cleared once "
|
|
107
|
+
"total output exceeded protect + minimum"
|
|
59
108
|
)
|
|
60
|
-
|
|
109
|
+
|
|
110
|
+
def test_text_heavy_history_is_not_pruned(self):
|
|
111
|
+
"""Conversations dominated by text (not tool output) must NOT
|
|
112
|
+
trigger prune even if total chars are huge.
|
|
113
|
+
|
|
114
|
+
This is the opencode-aligned semantics: text blocks don't enter
|
|
115
|
+
the prune budget. A 500K-char text history with no tool_results
|
|
116
|
+
is a no-op for prune_history.
|
|
117
|
+
"""
|
|
118
|
+
messages = [
|
|
119
|
+
{"role": "user", "content": "long planning discussion " * 10_000},
|
|
120
|
+
{"role": "assistant", "content": "detailed reasoning " * 10_000},
|
|
121
|
+
{"role": "user", "content": "what's next?"},
|
|
122
|
+
{"role": "assistant", "content": "here's the plan " * 10_000},
|
|
123
|
+
]
|
|
124
|
+
result = prune_history(messages)
|
|
125
|
+
|
|
126
|
+
# No tool_results exist anywhere in result
|
|
127
|
+
tool_results = [
|
|
128
|
+
b for m in result for b in m.get("content", []) if is_tool_result(b)
|
|
129
|
+
]
|
|
130
|
+
assert tool_results == []
|
|
131
|
+
# Length preserved
|
|
132
|
+
assert len(result) == len(messages)
|
|
133
|
+
# No message content was altered to CLEARED_TOOL_RESULT
|
|
134
|
+
assert all(CLEARED_TOOL_RESULT not in item_text(m) for m in result)
|
|
61
135
|
|
|
62
136
|
def test_empty_history(self):
|
|
63
137
|
"""Should handle empty history."""
|
|
@@ -108,20 +182,21 @@ class TestShouldCompact:
|
|
|
108
182
|
"""Tests for should_compact function."""
|
|
109
183
|
|
|
110
184
|
def test_no_compaction_under_threshold(self):
|
|
111
|
-
"""Should not compact when under
|
|
112
|
-
#
|
|
185
|
+
"""Should not compact when well under the overflow threshold."""
|
|
186
|
+
# claude-sonnet-4-5 has 200K context; usable = 170K (buffer 30K).
|
|
187
|
+
# 5 tokens is well under.
|
|
113
188
|
result = should_compact(5, model_id="claude-sonnet-4-5-20250929")
|
|
114
189
|
assert result is False
|
|
115
190
|
|
|
116
191
|
def test_compaction_over_threshold(self):
|
|
117
|
-
"""Should compact when over threshold."""
|
|
118
|
-
# 300K tokens is over
|
|
192
|
+
"""Should compact when over the real-overflow threshold."""
|
|
193
|
+
# 300K tokens is well over the 170K threshold of a 200K-context model.
|
|
119
194
|
result = should_compact(300000, model_id="claude-sonnet-4-5-20250929")
|
|
120
195
|
assert result is True
|
|
121
196
|
|
|
122
197
|
def test_custom_context_limit(self):
|
|
123
198
|
"""Should respect custom context limit."""
|
|
124
|
-
# gpt-4o has 128K context
|
|
199
|
+
# gpt-4o has 128K context; usable = 98K. 50K is under.
|
|
125
200
|
result = should_compact(50000, model_id="gpt-4o")
|
|
126
201
|
assert isinstance(result, bool)
|
|
127
202
|
|
|
@@ -145,7 +220,8 @@ class TestCompactionTriggerUsesPerCallMetric:
|
|
|
145
220
|
|
|
146
221
|
def test_small_per_call_window_does_not_fire(self):
|
|
147
222
|
"""Reproduces the exact bug report: per-call ~20K on qwen3.6-plus
|
|
148
|
-
(128K limit, ~
|
|
223
|
+
(128K limit, ~98K threshold with 30K buffer) must NOT
|
|
224
|
+
trigger compaction."""
|
|
149
225
|
# Values taken from the real session where compaction fired incorrectly:
|
|
150
226
|
# "context: 20,184 (in: 16,652 / out: 696 / cache_read: 2,836)"
|
|
151
227
|
last_input = 16_652
|
|
@@ -158,7 +234,7 @@ class TestCompactionTriggerUsesPerCallMetric:
|
|
|
158
234
|
)
|
|
159
235
|
assert last_call_window == 20_184, "window computation changed"
|
|
160
236
|
|
|
161
|
-
# 20K is
|
|
237
|
+
# 20K is far below the ~98K threshold for a 128K-context model
|
|
162
238
|
assert should_compact(last_call_window, model_id="qwen3.6-plus") is False, (
|
|
163
239
|
"Compaction fired on a small per-call window. The runner is "
|
|
164
240
|
"probably passing cumulative tokens (run_output.metrics.input_tokens) "
|
|
@@ -169,7 +245,9 @@ class TestCompactionTriggerUsesPerCallMetric:
|
|
|
169
245
|
def test_large_per_call_window_still_fires(self):
|
|
170
246
|
"""Positive case: compaction must still fire when the last-call
|
|
171
247
|
window actually approaches the model's context limit."""
|
|
172
|
-
|
|
248
|
+
# qwen3.6-plus: 128K limit, usable = 98K (buffer 30K).
|
|
249
|
+
# 105K input + 2K output + 0 cache = 107K window → must fire.
|
|
250
|
+
last_input = 105_000
|
|
173
251
|
last_output = 2_000
|
|
174
252
|
last_cache_read = 0
|
|
175
253
|
last_cache_write = 0
|
|
@@ -177,17 +255,17 @@ class TestCompactionTriggerUsesPerCallMetric:
|
|
|
177
255
|
last_call_window = (
|
|
178
256
|
last_input + last_output + last_cache_read + last_cache_write
|
|
179
257
|
)
|
|
180
|
-
assert last_call_window ==
|
|
258
|
+
assert last_call_window == 107_000
|
|
181
259
|
|
|
182
|
-
#
|
|
260
|
+
# 107K > 98K threshold → must fire
|
|
183
261
|
assert should_compact(last_call_window, model_id="qwen3.6-plus") is True
|
|
184
262
|
|
|
185
263
|
def test_cumulative_metric_is_the_wrong_signal(self):
|
|
186
264
|
"""Illustrates WHY the old approach was wrong: a cumulative sum of
|
|
187
|
-
|
|
265
|
+
6 API calls at 18K each is 108K (above threshold), but the actual
|
|
188
266
|
per-call window each time is only 18K (well below)."""
|
|
189
267
|
per_call_window = 18_000
|
|
190
|
-
num_api_calls_in_turn =
|
|
268
|
+
num_api_calls_in_turn = 6
|
|
191
269
|
cumulative_if_summed = per_call_window * num_api_calls_in_turn
|
|
192
270
|
|
|
193
271
|
# Old (wrong) behavior: cumulative triggers compaction
|
|
@@ -196,8 +274,8 @@ class TestCompactionTriggerUsesPerCallMetric:
|
|
|
196
274
|
# New (correct) behavior: per-call does NOT trigger compaction
|
|
197
275
|
assert should_compact(per_call_window, model_id="qwen3.6-plus") is False
|
|
198
276
|
|
|
199
|
-
# The difference is the entire bug
|
|
200
|
-
assert cumulative_if_summed >
|
|
277
|
+
# The difference is the entire bug (threshold is 98K for qwen3.6-plus)
|
|
278
|
+
assert cumulative_if_summed > 98_000 > per_call_window
|
|
201
279
|
|
|
202
280
|
def test_runner_source_uses_per_call_metric(self):
|
|
203
281
|
"""Static check against silent regression.
|
aru_code-0.18.0/aru/__init__.py
DELETED
|
@@ -1 +0,0 @@
|
|
|
1
|
-
__version__ = "0.18.0"
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|