PyPI - aru-code - Versions diffs - 0.18.0__tar.gz → 0.19.0__tar.gz - Mend

aru-code 0.18.0.tar.gz → 0.19.0.tar.gz

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (59) hide show

{aru_code-0.18.0/aru_code.egg-info → aru_code-0.19.0}/PKG-INFO RENAMED Viewed

@@ -1,6 +1,6 @@
 Metadata-Version: 2.4
 Name: aru-code
-Version: 0.18.0
+Version: 0.19.0
 Summary: A Claude Code clone built with Agno agents
 Author-email: Estevao <estevaofon@gmail.com>
 License-Expression: MIT

aru_code-0.19.0/aru/__init__.py ADDED Viewed

@@ -0,0 +1 @@
+__version__ = "0.19.0"

{aru_code-0.18.0 → aru_code-0.19.0}/aru/context.py RENAMED Viewed

@@ -1,28 +1,40 @@
 """Context management for token optimization.
-Implements three layers of token reduction:
-1. Pruning — evict old tool/assistant outputs from history
-2. Truncation — universal cap on tool output size
-3. Compaction — summarize entire conversation when approaching context limits
+Mirrors opencode's two-layer approach:
+1. **Prune** (routine, lossy only on tool outputs): walks old tool_result
+   blocks and replaces their content with a placeholder. User/assistant
+   text is NEVER touched — it survives verbatim until real overflow.
+   This is the steady-state memory mechanism. Matches cache_patch.py's
+   strategy at the Agno message layer.
+2. **Compact** (rare, lossy full summary): triggers only when the per-call
+   context window actually approaches the model's limit. Runs a
+   compaction agent that produces a structured summary (Goal / Instructions
+   / Discoveries / Accomplished / File contents / Relevant files) and
+   marks the resulting assistant message with `summary: True` so
+   subsequent prunes stop at that checkpoint.
+There is also a `truncate_output` layer used by individual tools to cap
+their own output size before it ever reaches history.
 """
 from __future__ import annotations
 # ── Constants ──────────────────────────────────────────────────────
-# Pruning: minimum chars that must be freeable to justify a prune pass
-PRUNE_MINIMUM_CHARS = 12_000  # ~3.5K tokens
-# Placeholder that replaces evicted content
-PRUNED_PLACEHOLDER = "[cleared]"
-# User messages larger than this threshold are truncated when outside protection window
-PRUNE_USER_MSG_THRESHOLD = 2_000  # ~570 tokens
-# How many chars to keep from the start of a pruned user message
-PRUNE_USER_MSG_KEEP = 500  # ~140 tokens
+# Pruning: minimum chars that must be freeable to justify a prune pass.
+# Matches opencode's PRUNE_MINIMUM = 20_000 tokens (~80K chars @ 4 chars/token).
+PRUNE_MINIMUM_CHARS = 80_000  # ~20K tokens
+# Placeholder that replaces cleared tool_result content. Matches
+# cache_patch.py's _PRUNED_PLACEHOLDER so both layers produce identical
+# text when a tool output is cleared.
+CLEARED_TOOL_RESULT = "[Old tool result cleared]"
 # Minimum number of recent user turns always protected (regardless of char budget)
 PRUNE_PROTECT_TURNS = 2
 # Tool result markers that should never be pruned (critical context)
 PRUNE_PROTECTED_MARKERS = {"[SubAgent-", "delegate_task"}
-# Tool names whose outputs should never be pruned (like OpenCode's PRUNE_PROTECTED_TOOLS)
+# Tool names whose outputs should never be pruned (like opencode's PRUNE_PROTECTED_TOOLS)
 # These are checked as substrings in message content (tool results include the tool name)
 PRUNE_PROTECTED_TOOLS = {"delegate_task"}
@@ -32,17 +44,22 @@ TRUNCATE_MAX_BYTES = 15 * 1024  # 15 KB
 TRUNCATE_KEEP_START = 150  # lines to keep from the start
 TRUNCATE_KEEP_END = 60  # lines to keep from the end
 TRUNCATE_MAX_LINE_LENGTH = 1500  # chars per individual line (prevents minified files)
-# Directory for saving full truncated outputs (like OpenCode pattern)
+# Directory for saving full truncated outputs (like opencode pattern)
 TRUNCATE_SAVE_DIR = ".aru/truncated"
-# Compaction: trigger when per-run input tokens exceed this fraction of model limit
-COMPACTION_THRESHOLD_RATIO = 0.70
-# Compaction: target post-compaction size as fraction of model context limit
-COMPACTION_TARGET_RATIO = 0.15
-# Compaction: also trigger after this many user turns (regardless of token count)
-COMPACTION_MAX_TURNS = 15
-# Compaction: reserve buffer for the compaction process itself (like OpenCode's 20K)
-COMPACTION_BUFFER_TOKENS = 20_000
+# Compaction: trigger when per-call input tokens approach real overflow.
+# Matches opencode's philosophy: only fire near the model's actual context
+# limit, not routinely. Routine context reduction is handled by prune_history
+# (lossy only on tool outputs), so compaction is reserved for genuine
+# overflow — where the next API call would otherwise exceed the model's
+# input limit minus the reserved buffer.
+#
+# Opencode fires at `count >= limit.input - reserved` (overflow.ts:22) —
+# no extra ratio. We mirror that here. The sole safety margin is
+# COMPACTION_BUFFER_TOKENS, which is 30K (vs opencode's 20K) to give a bit
+# more headroom for output + tool definitions + estimation noise, since
+# we don't yet have a reactive overflow handler to catch the edge case.
+COMPACTION_BUFFER_TOKENS = 30_000
 # Default model context limits (input tokens)
 MODEL_CONTEXT_LIMITS: dict[str, int] = {
     # Anthropic
@@ -114,61 +131,97 @@ Structured list of file paths relevant to continuing the work (one per line)."""
 # ── Layer 1: Pruning ──────────────────────────────────────────────
+def _tool_result_content_len(msg: dict) -> int:
+    """Sum of content length of all non-cleared tool_result blocks in a message.
+    Mirrors opencode's prune walk, which accumulates only
+    `Token.estimate(part.state.output)` for `ToolPart`s (compaction.ts:119).
+    Text blocks and tool_use args are ignored — they are not the thing
+    being freed. This means pruning only "consumes budget" for real tool
+    output, so text-heavy conversations with few tool calls never trip
+    the prune path.
+    Already-cleared tool_results (content == CLEARED_TOOL_RESULT) are
+    skipped so a second pass doesn't double-count them.
+    """
+    from aru.history_blocks import is_tool_result
+    total = 0
+    for block in msg.get("content", []):
+        if is_tool_result(block):
+            content = block.get("content")
+            if content == CLEARED_TOOL_RESULT:
+                continue
+            if content is None:
+                continue
+            # tool_result content can be a string or a list of blocks —
+            # stringify to get a char count that roughly tracks tokens.
+            total += len(str(content))
+    return total
 def _get_prune_protect_chars(model_id: str = "default") -> int:
-    """Scale protection window based on model context size.
-    Returns the number of chars worth of recent history that should NEVER
-    be pruned. The remaining history beyond this window is eligible for
-    reversible pruning.
-    Sizing rationale: the target is a steady-state per-call context
-    window of ~20K tokens (what the user sees in the status bar), which
-    means protected history should be ~17K tokens = ~60K chars. This
-    floor is applied to every model; larger models get more protection
-    scaled at ~7% of their context, capped at 200K chars (~57K tokens)
-    to avoid protecting too much in 1M-context models where the extra
-    history hurts prompt caching.
+    """Chars of recent history that must NEVER be pruned.
+    Flat value across all models, mirroring opencode's fixed
+    `PRUNE_PROTECT = 40_000` tokens (compaction.ts:36). At ~4 chars/token
+    that's 160K chars of tool-result content kept intact in the recent
+    window. Older tool_result blocks beyond this budget are eligible for
+    the lossy clear pass in `prune_history`.
+    Why flat (not scaled by model): opencode validated this in production
+    on contexts from 128K to 1M — scaling by ratio adds complexity without
+    improving behavior, and protecting too much in 1M-context models can
+    actually hurt prompt caching by keeping rarely-touched tail content warm.
+    The `model_id` parameter is retained for signature compatibility with
+    older call sites; it has no effect on the returned value.
     """
-    limit = MODEL_CONTEXT_LIMITS.get(model_id, MODEL_CONTEXT_LIMITS["default"])
-    # ~4 chars per token, protect ~7% of context as the ratio ceiling
-    ratio_based = int(limit * 0.07 * 4)
-    # Floor of 60K chars (~17K tokens) keeps the user-visible context
-    # window around 20K tokens steady-state after system + cache + output
-    # overheads. Applies to any model where 7% would be smaller.
-    return max(60_000, min(ratio_based, 200_000))
+    del model_id  # unused — kept for signature compatibility
+    return 160_000
 def prune_history(
     history: list[dict], model_id: str = "default"
 ) -> list[dict]:
-    """Reduce history token footprint by dropping old content blocks.
-    Operates on block-shaped history (see `aru.history_blocks`). The
-    algorithm walks backward accumulating a char budget, and for any
-    message that falls outside the protection window:
-    - `text` blocks on assistant messages → replaced with `[cleared]`
-      text block.
-    - Large `text` blocks on user messages → truncated to first N chars.
-    - `tool_use` blocks → dropped **together with** their matching
-      `tool_result` block in the subsequent tool/user message. Dropping
-      them atomically is required: Anthropic's API rejects orphans with
-      `400: tool_use_id not found`.
-    - `tool_result` blocks → dropped only when their paired `tool_use`
-      is also dropped.
-    Protection layers:
+    """Reduce history token footprint by clearing old tool result content.
+    Operates on block-shaped history (see `aru.history_blocks`). Matches
+    opencode's approach: the ONLY lossy operation is replacing the
+    content of old `tool_result` blocks with a short placeholder. Text
+    blocks (user and assistant), `tool_use` blocks, and block structure
+    are always preserved — so the original ask survives verbatim until
+    real overflow forces a full compaction.
+    **Budget semantics** (opencode parity): the walk backward accumulates
+    **only tool_result content chars**, not whole-message chars. Text
+    blocks and tool_use args don't consume the protection budget, because
+    they aren't what prune can free. Consequences:
+      - Text-heavy conversations with few tool calls never trigger prune.
+      - Prune only fires when there is >= `protect_chars + PRUNE_MINIMUM_CHARS`
+        of tool_result content total — mirroring opencode's
+        `total > PRUNE_PROTECT + PRUNE_MINIMUM`.
+      - The "is it worth pruning?" dry-run check from opencode
+        (`pruned > PRUNE_MINIMUM`) is implicit: we cannot enter the loop
+        without enough prunable content, and once in the loop any walk
+        past `protect_chars` is guaranteed to be freeing real bytes.
+    Protection layers (applied on top of the budget walk):
     1. Turn-based: last `PRUNE_PROTECT_TURNS` user turns always kept
-       intact, along with the assistant response right after each.
-    2. Char-based: recent content within the protection window is kept.
+       intact, plus the assistant response right after each. Index 0
+       (the original user ask) is also always protected.
+    2. Budget-based: tool_result content within the 160K protect window
+       (~40K tokens, matching opencode) is kept.
     3. Content-based: messages whose stringified content contains any
        `PRUNE_PROTECTED_MARKERS` or `PRUNE_PROTECTED_TOOLS` never prune.
+    4. Summary checkpoint: walking backward stops at any message marked
+       `summary: True` (a previous compaction's assistant output).
+       Everything before a summary was already consolidated and must
+       not be re-processed.
     Returns a new list (does not mutate the input).
     """
     from aru.history_blocks import (
-        coerce_history_item, item_char_len, item_text,
-        is_text, is_tool_use, is_tool_result, text_block,
+        coerce_history_item, item_text, is_tool_result,
     )
     if len(history) <= 2:
@@ -177,11 +230,15 @@ def prune_history(
     protect_chars = _get_prune_protect_chars(model_id)
     result = [coerce_history_item(m) for m in history]
-    total_chars = sum(item_char_len(msg) for msg in result)
-    if total_chars < protect_chars + PRUNE_MINIMUM_CHARS:
+    # Entry gate mirrors opencode: only proceed if total tool output
+    # exceeds protect + minimum. Text length is irrelevant.
+    total_tool_chars = sum(_tool_result_content_len(msg) for msg in result)
+    if total_tool_chars < protect_chars + PRUNE_MINIMUM_CHARS:
         return result
-    # Identify indices of last N user turns (always protected)
+    # Identify indices of last N user turns (always protected) and index 0
+    # (the original user ask, protected defensively so the anchor never
+    # evaporates even if future edits change the budget calculus).
     turn_protected: set[int] = set()
     user_turns_seen = 0
     for i in range(len(result) - 1, -1, -1):
@@ -191,108 +248,60 @@ def prune_history(
                 turn_protected.add(i)
                 if i + 1 < len(result):
                     turn_protected.add(i + 1)
-    # Build a map of tool_use_id → (assistant_idx, user_idx) so we can
-    # drop both halves of a pair atomically. The user_idx points to the
-    # next message(s) after the assistant carrying the matching tool_result.
-    tool_pair_loc: dict[str, tuple[int, int]] = {}
-    for i, msg in enumerate(result):
-        if msg["role"] != "assistant":
-            continue
-        for block in msg["content"]:
-            if not is_tool_use(block):
-                continue
-            tu_id = block.get("id")
-            if not tu_id:
-                continue
-            # Look forward for the matching tool_result (usually i+1)
-            for j in range(i + 1, min(i + 3, len(result))):
-                for rb in result[j]["content"]:
-                    if is_tool_result(rb) and rb.get("tool_use_id") == tu_id:
-                        tool_pair_loc[tu_id] = (i, j)
-                        break
-                if tu_id in tool_pair_loc:
-                    break
-    # Walk backward, protecting recent content
+    if result and result[0]["role"] == "user":
+        turn_protected.add(0)
+        if len(result) > 1:
+            turn_protected.add(1)
+    # Walk backward accumulating ONLY tool_result content chars into the
+    # protection budget. Messages with no tool_result (pure text, or just
+    # tool_use) consume zero budget and are skipped without pruning.
     protected = 0
-    dropped_tool_use_ids: set[str] = set()
     for i in range(len(result) - 1, -1, -1):
         msg = result[i]
-        msg_len = item_char_len(msg)
+        # Stop at the previous compaction summary marker — everything
+        # before it was already consolidated into the summary.
+        if msg.get("summary"):
+            break
+        tool_chars = _tool_result_content_len(msg)
+        # No prunable content here — nothing to clear, nothing to count.
+        if tool_chars == 0:
+            continue
         if i in turn_protected:
-            protected += msg_len
+            protected += tool_chars
             continue
-        if protected + msg_len <= protect_chars:
-            protected += msg_len
+        if protected + tool_chars <= protect_chars:
+            protected += tool_chars
             continue
         # Outside protection window — check content-based protection
         text_view = item_text(msg)
         if (any(marker in text_view for marker in PRUNE_PROTECTED_MARKERS)
                 or any(tool in text_view for tool in PRUNE_PROTECTED_TOOLS)):
-            protected += msg_len
+            protected += tool_chars
             continue
-        # Prune this message's blocks
+        # Clear any tool_result payloads in this message. Leave every
+        # other block (text, tool_use, thinking, etc.) untouched.
         new_blocks: list[dict] = []
         for block in msg["content"]:
-            if is_text(block):
-                if msg["role"] == "assistant":
-                    # Replace with a single placeholder (only if not already)
-                    if not new_blocks or new_blocks[-1].get("text") != PRUNED_PLACEHOLDER:
-                        new_blocks.append(text_block(PRUNED_PLACEHOLDER))
-                elif msg["role"] == "user":
-                    text = block.get("text", "")
-                    if len(text) > PRUNE_USER_MSG_THRESHOLD:
-                        truncated = (
-                            text[:PRUNE_USER_MSG_KEEP]
-                            + f"\n\n[... {len(text) - PRUNE_USER_MSG_KEEP:,} "
-                              "chars pruned to save context ...]"
-                        )
-                        new_blocks.append(text_block(truncated))
-                    else:
-                        new_blocks.append(block)
-                else:
-                    new_blocks.append(block)
-            elif is_tool_use(block):
-                # Drop the tool_use entirely and mark its id for paired removal
-                tu_id = block.get("id")
-                if tu_id:
-                    dropped_tool_use_ids.add(tu_id)
-                # Do NOT add to new_blocks
-            elif is_tool_result(block):
-                # Drop only if its paired tool_use is also being dropped
-                tu_id = block.get("tool_use_id")
-                if tu_id in dropped_tool_use_ids:
-                    pass  # drop
-                else:
-                    new_blocks.append(block)
+            if is_tool_result(block) and block.get("content") != CLEARED_TOOL_RESULT:
+                new_blocks.append({
+                    "type": "tool_result",
+                    "tool_use_id": block.get("tool_use_id"),
+                    "content": CLEARED_TOOL_RESULT,
+                })
             else:
                 new_blocks.append(block)
         result[i] = {"role": msg["role"], "content": new_blocks}
-    # Second pass: any tool_result blocks in user messages whose tool_use
-    # was dropped on a previous pass (covers case where user msg was
-    # inside protection but its paired assistant was outside).
-    if dropped_tool_use_ids:
-        for idx, msg in enumerate(result):
-            if not msg["content"]:
-                continue
-            filtered = [
-                b for b in msg["content"]
-                if not (is_tool_result(b) and b.get("tool_use_id") in dropped_tool_use_ids)
-            ]
-            if len(filtered) != len(msg["content"]):
-                result[idx] = {"role": msg["role"], "content": filtered}
-    # Drop any messages that ended up with zero blocks (valid but useless)
-    result = [m for m in result if m["content"]]
     return result
@@ -443,46 +452,50 @@ def should_compact(
 ) -> bool:
     """Check if the conversation should be compacted.
-    Triggers on EITHER condition:
-    1. Token-based: tokens >= usable_context * threshold_ratio
-    2. Turn-based: user turns >= COMPACTION_MAX_TURNS (prevents slow token creep)
+    Fires when the per-call context window reaches real overflow:
+    `tokens >= limit - COMPACTION_BUFFER_TOKENS`.
+    Matches opencode's `isOverflow` in overflow.ts:22 — `count >= usable`,
+    no extra ratio. Routine context reduction is handled by `prune_history`
+    (lossy only on tool outputs), so compaction is reserved for genuine
+    overflow where the next API call would otherwise exceed the model's
+    input limit minus the reserved buffer.
     Accepts either an estimated token count (int) or the history list.
     """
     if isinstance(history_or_tokens, list):
-        history = history_or_tokens
-        tokens = estimate_history_tokens(history)
-        # Turn-based trigger: count user messages
-        user_turns = sum(1 for m in history if m["role"] == "user")
-        if user_turns >= COMPACTION_MAX_TURNS:
-            return True
+        tokens = estimate_history_tokens(history_or_tokens)
     else:
         tokens = history_or_tokens
     limit = MODEL_CONTEXT_LIMITS.get(model_id, MODEL_CONTEXT_LIMITS["default"])
     usable = limit - COMPACTION_BUFFER_TOKENS
-    threshold = int(usable * COMPACTION_THRESHOLD_RATIO)
-    return tokens >= threshold
+    return tokens >= usable
 def would_prune(history: list[dict], model_id: str = "default") -> bool:
     """Check if prune_history would discard content from this history.
-    Uses the exact same criteria as prune_history: total chars exceed
-    the protection window + minimum prunable threshold.
+    Uses the same entry gate as `prune_history`: total tool_result
+    content must exceed the protection window + minimum prunable
+    threshold. Text and tool_use args are not counted — only real
+    prunable output. Mirrors opencode's logic.
     """
-    from aru.history_blocks import item_char_len
     if len(history) <= 2:
         return False
-    total_chars = sum(item_char_len(msg) for msg in history)
+    total_tool_chars = sum(_tool_result_content_len(msg) for msg in history)
     protect_chars = _get_prune_protect_chars(model_id)
-    return total_chars >= protect_chars + PRUNE_MINIMUM_CHARS
+    return total_tool_chars >= protect_chars + PRUNE_MINIMUM_CHARS
 def _split_history(history: list[dict], model_id: str = "default") -> tuple[list[dict], list[dict]]:
     """Split history into old (to summarize) and recent (to keep intact).
-    Uses the same protection window as pruning.
+    Uses the same protection window as pruning. Defensively, the first
+    user turn (index 0) is always pulled into `recent` so the original
+    ask survives literal even through a full compaction — the compactor
+    extracts it into the `## Goal` section of the summary, but keeping
+    it in recent too means the agent can quote it verbatim afterward.
     """
     from aru.history_blocks import item_char_len
     protect_chars = _get_prune_protect_chars(model_id)
@@ -495,6 +508,12 @@ def _split_history(history: list[dict], model_id: str = "default") -> tuple[list
             split_idx = i
         else:
             break
+    # Defensive: force the first user turn into `recent` even if the
+    # protect budget would have sent it to `old`. The original ask is
+    # the session anchor and must stay literal.
+    if split_idx > 0 and history and history[0].get("role") == "user":
+        return history[1:split_idx], [history[0]] + history[split_idx:]
     return history[:split_idx], history[split_idx:]
@@ -563,12 +582,13 @@ def apply_compaction(
     The summary is emitted as a synthetic user→assistant exchange so that
     role alternation stays natural:
         [user: "Please summarize..."]
-        [assistant: "<summary>"]
+        [assistant: "<summary>", summary=True]
         + recent messages as-is
-    This shape avoids the `[user, user, ...]` sequence that previously
-    biased the model toward describing actions rather than emitting
-    structured tool calls.
+    The assistant summary is marked with `summary: True` as a checkpoint.
+    `prune_history` walks backward and stops at this marker, so content
+    already consolidated into the summary is never re-processed. Mirrors
+    opencode's `msg.info.summary` flag (see message-v2.ts:914).
     """
     from aru.history_blocks import text_block, coerce_history_item
     _, recent = _split_history(history, model_id)
@@ -581,6 +601,7 @@ def apply_compaction(
         {
             "role": "assistant",
             "content": [text_block(f"Prior conversation summary:\n\n{summary}")],
+            "summary": True,
         },
     ]
     compacted.extend(coerce_history_item(m) for m in recent)

{aru_code-0.18.0 → aru_code-0.19.0}/aru/session.py RENAMED Viewed

@@ -402,9 +402,12 @@ class Session:
         self.history.append({"role": role, "content": blocks})
         # Hard cap as safety net — structured pruning/compaction in
         # aru/context.py handles the normal case; this only fires if
-        # something bypasses them.
-        if len(self.history) > 60:
-            self.history = self.history[-60:]
+        # something bypasses them. Set high enough that long sessions
+        # (which now accumulate more messages because prune is
+        # non-destructive for text and compact rarely fires) don't hit
+        # this destructive path routinely.
+        if len(self.history) > 300:
+            self.history = self.history[-300:]
     def add_structured_message(self, role: str, blocks: list[dict]):
         """Explicitly add a message with pre-built content blocks.

{aru_code-0.18.0 → aru_code-0.19.0/aru_code.egg-info}/PKG-INFO RENAMED Viewed

@@ -1,6 +1,6 @@
 Metadata-Version: 2.4
 Name: aru-code
-Version: 0.18.0
+Version: 0.19.0
 Summary: A Claude Code clone built with Agno agents
 Author-email: Estevao <estevaofon@gmail.com>
 License-Expression: MIT

{aru_code-0.18.0 → aru_code-0.19.0}/pyproject.toml RENAMED Viewed

@@ -4,7 +4,7 @@ build-backend = "setuptools.build_meta"
 [project]
 name = "aru-code"
-version = "0.18.0"
+version = "0.19.0"
 description = "A Claude Code clone built with Agno agents"
 readme = "README.md"
 license = "MIT"

{aru_code-0.18.0 → aru_code-0.19.0}/tests/test_cli.py RENAMED Viewed

@@ -314,11 +314,12 @@ class TestSession:
     def test_add_message_caps_history(self):
         session = Session()
-        for i in range(75):
+        for i in range(350):
             session.add_message("user", f"msg {i}")
-        # History is bounded by a hard cap (structured compaction in
-        # aru.context handles the normal-path token management).
-        assert len(session.history) <= 60
+        # History is bounded by a hard safety cap (structured pruning/
+        # compaction in aru.context handles the normal-path token
+        # management; this cap only fires on pathological growth).
+        assert len(session.history) <= 300
     def test_set_plan(self):
         session = Session()

{aru_code-0.18.0 → aru_code-0.19.0}/tests/test_confabulation_regression.py RENAMED Viewed

@@ -176,17 +176,26 @@ class TestPrunePreservesPairs:
     """Fix 6: pruning must never orphan tool_use / tool_result blocks."""
     def test_prune_drops_tool_pair_atomically(self):
-        """An old assistant with tool_use must have its tool_result dropped too."""
-        # Build a history large enough to force pruning
-        filler = "x" * 50_000
+        """An old tool_result whose content gets cleared must still keep
+        its block (matching tool_use_id), so the tool_use/tool_result
+        pair is never orphaned.
+        Opencode-aligned budget: prune only counts tool_result content
+        chars, so the history needs multiple large tool_result payloads
+        to clear the 240K entry gate.
+        """
+        big_output = "old file line\n" * 8_000  # ~100K chars per result
         history = [
             {"role": "user", "content": "request 1"},
-            _assistant_tool_turn("old_tu", "read_file", {"path": "old.py"}, filler),
-            _tool_result_turn("old_tu", "old contents " * 1000),
+            _assistant_tool_turn("old_tu", "read_file", {"path": "old.py"}),
+            _tool_result_turn("old_tu", big_output),
             {"role": "user", "content": "request 2"},
-            {"role": "assistant", "content": "response 2"},
-            {"role": "user", "content": "recent request " * 5000},
-            {"role": "assistant", "content": "recent response"},
+            _assistant_tool_turn("mid_tu", "read_file", {"path": "mid.py"}),
+            _tool_result_turn("mid_tu", big_output),
+            {"role": "user", "content": "request 3"},
+            _assistant_tool_turn("recent_tu", "read_file", {"path": "new.py"}),
+            _tool_result_turn("recent_tu", big_output),
+            {"role": "user", "content": "summarize"},
         ]
         pruned = prune_history(history, model_id="default")
@@ -208,10 +217,20 @@ class TestPrunePreservesPairs:
         )
     def test_prune_keeps_recent_tool_pair(self):
-        """A tool_use/tool_result pair inside the protection window must be kept."""
+        """A tool_use/tool_result pair inside the protection window must be
+        kept with its content intact, even when older tool_results get cleared.
+        Builds a history with two big old tool_results (enough to trigger
+        prune) and one small recent pair that must survive verbatim.
+        """
+        big_old = "old file content\n" * 10_000  # ~170K chars each
         history = [
-            {"role": "user", "content": "old stuff " * 50_000},
-            {"role": "assistant", "content": "old response " * 10_000},
+            {"role": "user", "content": "req 1"},
+            _assistant_tool_turn("tu_old1", "read_file", {"path": "a.py"}, "reading"),
+            _tool_result_turn("tu_old1", big_old),
+            {"role": "user", "content": "req 2"},
+            _assistant_tool_turn("tu_old2", "read_file", {"path": "b.py"}, "reading"),
+            _tool_result_turn("tu_old2", big_old),
             {"role": "user", "content": "read foo"},
             _assistant_tool_turn("tu_recent", "read_file", {"path": "foo.py"}, "reading"),
             _tool_result_turn("tu_recent", "def foo(): pass"),
@@ -227,6 +246,10 @@ class TestPrunePreservesPairs:
         assert len(tool_uses) == 1, "Recent tool_use was incorrectly pruned"
         assert len(tool_results) == 1, "Recent tool_result was incorrectly pruned"
+        # Recent content must be intact (not cleared)
+        assert tool_results[0].get("content") == "def foo(): pass", (
+            "Recent tool_result content was cleared — should be inside protection window"
+        )
     def test_prune_with_no_pairs_still_works(self):
         """Pure text history should prune without errors."""

{aru_code-0.18.0 → aru_code-0.19.0}/tests/test_context.py RENAMED Viewed

@@ -10,8 +10,16 @@ from aru.context import (
     apply_compaction,
     build_compaction_prompt,
     format_context_block,
+    CLEARED_TOOL_RESULT,
+)
+from aru.history_blocks import (
+    coerce_history,
+    item_text,
+    tool_use_block,
+    tool_result_block,
+    text_block,
+    is_tool_result,
 )
-from aru.history_blocks import coerce_history, item_text
 class TestPruneHistory:
@@ -27,37 +35,103 @@ class TestPruneHistory:
         # Input is auto-coerced to block form on return
         assert result == coerce_history(messages)
-    def test_prunes_old_assistant_messages(self):
-        """Should prune old assistant messages when over threshold."""
-        old_content = "x" * 30000
-        recent_content = "y" * 10000
+    def test_prunes_old_tool_results_when_over_threshold(self):
+        """Should clear old tool_result content when total tool output
+        exceeds protect + minimum (opencode-aligned budget semantics).
+        The budget walks backward over tool_result content chars only.
+        Text and tool_use args don't count, so this test uses large
+        tool_result payloads to actually trip the prune path.
+        """
+        # Three rounds of read_file-sized outputs. Total ~300K chars
+        # of tool_result content — exceeds the 240K entry gate, and
+        # the 160K protect budget will cover only the most recent one.
+        big_output = "line of code\n" * 8_000  # ~100K chars
         messages = [
-            {"role": "user", "content": "First request"},
-            {"role": "assistant", "content": old_content},
-            {"role": "user", "content": "Second request"},
-            {"role": "assistant", "content": recent_content},
+            {"role": "user", "content": "round 1"},
+            {
+                "role": "assistant",
+                "content": [
+                    text_block("reading"),
+                    tool_use_block("tu_old", "read_file", {"path": "a.py"}),
+                ],
+            },
+            {"role": "tool", "content": [tool_result_block("tu_old", big_output)]},
+            {"role": "user", "content": "round 2"},
+            {
+                "role": "assistant",
+                "content": [
+                    text_block("reading"),
+                    tool_use_block("tu_mid", "read_file", {"path": "b.py"}),
+                ],
+            },
+            {"role": "tool", "content": [tool_result_block("tu_mid", big_output)]},
+            {"role": "user", "content": "round 3"},
+            {
+                "role": "assistant",
+                "content": [
+                    text_block("reading"),
+                    tool_use_block("tu_recent", "read_file", {"path": "c.py"}),
+                ],
+            },
+            {"role": "tool", "content": [tool_result_block("tu_recent", big_output)]},
+            {"role": "user", "content": "what did you find?"},
         ]
         result = prune_history(messages)
-        # Should have placeholder for pruned content
-        assert len(result) <= len(messages)
-        # Recent messages should be preserved
-        assert any("Second request" in str(m) for m in result)
-    def test_preserves_user_messages(self):
-        """Should always preserve user messages."""
-        old_user = {"role": "user", "content": "Old user message"}
-        old_assistant = {"role": "assistant", "content": "Old assistant " * 10000}
-        recent = {"role": "user", "content": "Recent request"}
-        messages = [old_user, old_assistant, recent]
-        result = prune_history(messages)
-        # User messages should be preserved (as placeholders or original)
-        recent_preserved = any(
-            m.get("role") == "user" and "Recent" in item_text(m)
-            for m in result
+        # Same number of messages (prune never drops structure)
+        assert len(result) == len(messages)
+        # Collect tool_result blocks by tool_use_id
+        by_id: dict[str, dict] = {}
+        for msg in result:
+            for block in msg.get("content", []):
+                if is_tool_result(block):
+                    by_id[block.get("tool_use_id")] = block
+        # All three pairs preserved at the block level
+        assert set(by_id.keys()) == {"tu_old", "tu_mid", "tu_recent"}
+        # Recent tool_result kept verbatim
+        assert by_id["tu_recent"]["content"] == big_output
+        # Older tool_results must have been pruned — at least one
+        # of tu_old/tu_mid should now hold the placeholder, since only
+        # 160K chars worth fits inside the protect window.
+        cleared_count = sum(
+            1 for tu_id in ("tu_old", "tu_mid")
+            if by_id[tu_id]["content"] == CLEARED_TOOL_RESULT
+        )
+        assert cleared_count >= 1, (
+            "Expected at least one old tool_result to be cleared once "
+            "total output exceeded protect + minimum"
         )
-        assert recent_preserved
+    def test_text_heavy_history_is_not_pruned(self):
+        """Conversations dominated by text (not tool output) must NOT
+        trigger prune even if total chars are huge.
+        This is the opencode-aligned semantics: text blocks don't enter
+        the prune budget. A 500K-char text history with no tool_results
+        is a no-op for prune_history.
+        """
+        messages = [
+            {"role": "user", "content": "long planning discussion " * 10_000},
+            {"role": "assistant", "content": "detailed reasoning " * 10_000},
+            {"role": "user", "content": "what's next?"},
+            {"role": "assistant", "content": "here's the plan " * 10_000},
+        ]
+        result = prune_history(messages)
+        # No tool_results exist anywhere in result
+        tool_results = [
+            b for m in result for b in m.get("content", []) if is_tool_result(b)
+        ]
+        assert tool_results == []
+        # Length preserved
+        assert len(result) == len(messages)
+        # No message content was altered to CLEARED_TOOL_RESULT
+        assert all(CLEARED_TOOL_RESULT not in item_text(m) for m in result)
     def test_empty_history(self):
         """Should handle empty history."""
@@ -108,20 +182,21 @@ class TestShouldCompact:
     """Tests for should_compact function."""
     def test_no_compaction_under_threshold(self):
-        """Should not compact when under 50% of context limit."""
-        # Default 200K tokens * 0.5 = 100K threshold; 5 tokens is well under
+        """Should not compact when well under the overflow threshold."""
+        # claude-sonnet-4-5 has 200K context; usable = 170K (buffer 30K).
+        # 5 tokens is well under.
         result = should_compact(5, model_id="claude-sonnet-4-5-20250929")
         assert result is False
     def test_compaction_over_threshold(self):
-        """Should compact when over threshold."""
-        # 300K tokens is over 50% of a 200K-token context window
+        """Should compact when over the real-overflow threshold."""
+        # 300K tokens is well over the 170K threshold of a 200K-context model.
         result = should_compact(300000, model_id="claude-sonnet-4-5-20250929")
         assert result is True
     def test_custom_context_limit(self):
         """Should respect custom context limit."""
-        # gpt-4o has 128K context, 50% = 64K; 50K is under threshold
+        # gpt-4o has 128K context; usable = 98K. 50K is under.
         result = should_compact(50000, model_id="gpt-4o")
         assert isinstance(result, bool)
@@ -145,7 +220,8 @@ class TestCompactionTriggerUsesPerCallMetric:
     def test_small_per_call_window_does_not_fire(self):
         """Reproduces the exact bug report: per-call ~20K on qwen3.6-plus
-        (128K limit, ~75.6K threshold) must NOT trigger compaction."""
+        (128K limit, ~98K threshold with 30K buffer) must NOT
+        trigger compaction."""
         # Values taken from the real session where compaction fired incorrectly:
         # "context: 20,184 (in: 16,652 / out: 696 / cache_read: 2,836)"
         last_input = 16_652
@@ -158,7 +234,7 @@ class TestCompactionTriggerUsesPerCallMetric:
         )
         assert last_call_window == 20_184, "window computation changed"
-        # 20K is ~3.7× below the 75.6K threshold for a 128K-context model
+        # 20K is far below the ~98K threshold for a 128K-context model
         assert should_compact(last_call_window, model_id="qwen3.6-plus") is False, (
             "Compaction fired on a small per-call window. The runner is "
             "probably passing cumulative tokens (run_output.metrics.input_tokens) "
@@ -169,7 +245,9 @@ class TestCompactionTriggerUsesPerCallMetric:
     def test_large_per_call_window_still_fires(self):
         """Positive case: compaction must still fire when the last-call
         window actually approaches the model's context limit."""
-        last_input = 80_000
+        # qwen3.6-plus: 128K limit, usable = 98K (buffer 30K).
+        # 105K input + 2K output + 0 cache = 107K window → must fire.
+        last_input = 105_000
         last_output = 2_000
         last_cache_read = 0
         last_cache_write = 0
@@ -177,17 +255,17 @@ class TestCompactionTriggerUsesPerCallMetric:
         last_call_window = (
             last_input + last_output + last_cache_read + last_cache_write
         )
-        assert last_call_window == 82_000
+        assert last_call_window == 107_000
-        # 82K > 75.6K threshold → must fire
+        # 107K > 98K threshold → must fire
         assert should_compact(last_call_window, model_id="qwen3.6-plus") is True
     def test_cumulative_metric_is_the_wrong_signal(self):
         """Illustrates WHY the old approach was wrong: a cumulative sum of
-        5 API calls at 18K each is 90K (above threshold), but the actual
+        6 API calls at 18K each is 108K (above threshold), but the actual
         per-call window each time is only 18K (well below)."""
         per_call_window = 18_000
-        num_api_calls_in_turn = 5
+        num_api_calls_in_turn = 6
         cumulative_if_summed = per_call_window * num_api_calls_in_turn
         # Old (wrong) behavior: cumulative triggers compaction
@@ -196,8 +274,8 @@ class TestCompactionTriggerUsesPerCallMetric:
         # New (correct) behavior: per-call does NOT trigger compaction
         assert should_compact(per_call_window, model_id="qwen3.6-plus") is False
-        # The difference is the entire bug
-        assert cumulative_if_summed > 75_600 > per_call_window
+        # The difference is the entire bug (threshold is 98K for qwen3.6-plus)
+        assert cumulative_if_summed > 98_000 > per_call_window
     def test_runner_source_uses_per_call_metric(self):
         """Static check against silent regression.