aru-code 0.19.0__tar.gz → 0.19.2__tar.gz

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (59)
  1. {aru_code-0.19.0/aru_code.egg-info → aru_code-0.19.2}/PKG-INFO +1 -1
  2. aru_code-0.19.2/aru/__init__.py +1 -0
  3. {aru_code-0.19.0 → aru_code-0.19.2}/aru/context.py +53 -15
  4. {aru_code-0.19.0 → aru_code-0.19.2}/aru/providers.py +162 -21
  5. {aru_code-0.19.0 → aru_code-0.19.2/aru_code.egg-info}/PKG-INFO +1 -1
  6. {aru_code-0.19.0 → aru_code-0.19.2}/pyproject.toml +1 -1
  7. aru_code-0.19.0/aru/__init__.py +0 -1
  8. {aru_code-0.19.0 → aru_code-0.19.2}/LICENSE +0 -0
  9. {aru_code-0.19.0 → aru_code-0.19.2}/README.md +0 -0
  10. {aru_code-0.19.0 → aru_code-0.19.2}/aru/agent_factory.py +0 -0
  11. {aru_code-0.19.0 → aru_code-0.19.2}/aru/agents/__init__.py +0 -0
  12. {aru_code-0.19.0 → aru_code-0.19.2}/aru/agents/base.py +0 -0
  13. {aru_code-0.19.0 → aru_code-0.19.2}/aru/agents/executor.py +0 -0
  14. {aru_code-0.19.0 → aru_code-0.19.2}/aru/agents/planner.py +0 -0
  15. {aru_code-0.19.0 → aru_code-0.19.2}/aru/cache_patch.py +0 -0
  16. {aru_code-0.19.0 → aru_code-0.19.2}/aru/cli.py +0 -0
  17. {aru_code-0.19.0 → aru_code-0.19.2}/aru/commands.py +0 -0
  18. {aru_code-0.19.0 → aru_code-0.19.2}/aru/completers.py +0 -0
  19. {aru_code-0.19.0 → aru_code-0.19.2}/aru/config.py +0 -0
  20. {aru_code-0.19.0 → aru_code-0.19.2}/aru/display.py +0 -0
  21. {aru_code-0.19.0 → aru_code-0.19.2}/aru/history_blocks.py +0 -0
  22. {aru_code-0.19.0 → aru_code-0.19.2}/aru/permissions.py +0 -0
  23. {aru_code-0.19.0 → aru_code-0.19.2}/aru/runner.py +0 -0
  24. {aru_code-0.19.0 → aru_code-0.19.2}/aru/runtime.py +0 -0
  25. {aru_code-0.19.0 → aru_code-0.19.2}/aru/session.py +0 -0
  26. {aru_code-0.19.0 → aru_code-0.19.2}/aru/tools/__init__.py +0 -0
  27. {aru_code-0.19.0 → aru_code-0.19.2}/aru/tools/ast_tools.py +0 -0
  28. {aru_code-0.19.0 → aru_code-0.19.2}/aru/tools/codebase.py +0 -0
  29. {aru_code-0.19.0 → aru_code-0.19.2}/aru/tools/gitignore.py +0 -0
  30. {aru_code-0.19.0 → aru_code-0.19.2}/aru/tools/mcp_client.py +0 -0
  31. {aru_code-0.19.0 → aru_code-0.19.2}/aru/tools/ranker.py +0 -0
  32. {aru_code-0.19.0 → aru_code-0.19.2}/aru/tools/tasklist.py +0 -0
  33. {aru_code-0.19.0 → aru_code-0.19.2}/aru_code.egg-info/SOURCES.txt +0 -0
  34. {aru_code-0.19.0 → aru_code-0.19.2}/aru_code.egg-info/dependency_links.txt +0 -0
  35. {aru_code-0.19.0 → aru_code-0.19.2}/aru_code.egg-info/entry_points.txt +0 -0
  36. {aru_code-0.19.0 → aru_code-0.19.2}/aru_code.egg-info/requires.txt +0 -0
  37. {aru_code-0.19.0 → aru_code-0.19.2}/aru_code.egg-info/top_level.txt +0 -0
  38. {aru_code-0.19.0 → aru_code-0.19.2}/setup.cfg +0 -0
  39. {aru_code-0.19.0 → aru_code-0.19.2}/tests/test_agents_base.py +0 -0
  40. {aru_code-0.19.0 → aru_code-0.19.2}/tests/test_cli.py +0 -0
  41. {aru_code-0.19.0 → aru_code-0.19.2}/tests/test_cli_advanced.py +0 -0
  42. {aru_code-0.19.0 → aru_code-0.19.2}/tests/test_cli_base.py +0 -0
  43. {aru_code-0.19.0 → aru_code-0.19.2}/tests/test_cli_completers.py +0 -0
  44. {aru_code-0.19.0 → aru_code-0.19.2}/tests/test_cli_new.py +0 -0
  45. {aru_code-0.19.0 → aru_code-0.19.2}/tests/test_cli_run_cli.py +0 -0
  46. {aru_code-0.19.0 → aru_code-0.19.2}/tests/test_cli_session.py +0 -0
  47. {aru_code-0.19.0 → aru_code-0.19.2}/tests/test_cli_shell.py +0 -0
  48. {aru_code-0.19.0 → aru_code-0.19.2}/tests/test_codebase.py +0 -0
  49. {aru_code-0.19.0 → aru_code-0.19.2}/tests/test_confabulation_regression.py +0 -0
  50. {aru_code-0.19.0 → aru_code-0.19.2}/tests/test_config.py +0 -0
  51. {aru_code-0.19.0 → aru_code-0.19.2}/tests/test_context.py +0 -0
  52. {aru_code-0.19.0 → aru_code-0.19.2}/tests/test_executor.py +0 -0
  53. {aru_code-0.19.0 → aru_code-0.19.2}/tests/test_gitignore.py +0 -0
  54. {aru_code-0.19.0 → aru_code-0.19.2}/tests/test_main.py +0 -0
  55. {aru_code-0.19.0 → aru_code-0.19.2}/tests/test_mcp_client.py +0 -0
  56. {aru_code-0.19.0 → aru_code-0.19.2}/tests/test_permissions.py +0 -0
  57. {aru_code-0.19.0 → aru_code-0.19.2}/tests/test_planner.py +0 -0
  58. {aru_code-0.19.0 → aru_code-0.19.2}/tests/test_providers.py +0 -0
  59. {aru_code-0.19.0 → aru_code-0.19.2}/tests/test_ranker.py +0 -0
@@ -1,6 +1,6 @@
1
1
  Metadata-Version: 2.4
2
2
  Name: aru-code
3
- Version: 0.19.0
3
+ Version: 0.19.2
4
4
  Summary: A Claude Code clone built with Agno agents
5
5
  Author-email: Estevao <estevaofon@gmail.com>
6
6
  License-Expression: MIT
@@ -0,0 +1 @@
1
+ __version__ = "0.19.2"
@@ -47,6 +47,23 @@ TRUNCATE_MAX_LINE_LENGTH = 1500 # chars per individual line (prevents minified
47
47
  # Directory for saving full truncated outputs (like opencode pattern)
48
48
  TRUNCATE_SAVE_DIR = ".aru/truncated"
49
49
 
50
+ # Compaction: chars of recent conversation preserved verbatim post-compact.
51
+ #
52
+ # Separate from the prune protect window (160K) because they measure
53
+ # different things:
54
+ # - Prune protect: "how much tool_result content stays intact"
55
+ # - Compact recent: "how much full-message history stays verbatim after
56
+ # the summary replaces the older portion"
57
+ #
58
+ # Set to 80K chars (~20K tokens) — half the prune window. Rationale:
59
+ # with the compactor now running on the main model (not a small one),
60
+ # summaries are faithful enough that we don't need 40K of recent overlap
61
+ # as a safety net. 20K still covers 3-6 recent turns verbatim, which
62
+ # mirrors the "last few exchanges" a human would re-read to resume work.
63
+ # Going to zero would match opencode exactly but requires the reactive
64
+ # overflow replay flow we haven't implemented yet.
65
+ COMPACT_RECENT_CHARS = 80_000
66
+
50
67
  # Compaction: trigger when per-call input tokens approach real overflow.
51
68
  # Matches opencode's philosophy: only fire near the model's actual context
52
69
  # limit, not routinely. Routine context reduction is handled by prune_history
@@ -491,19 +508,29 @@ def would_prune(history: list[dict], model_id: str = "default") -> bool:
491
508
  def _split_history(history: list[dict], model_id: str = "default") -> tuple[list[dict], list[dict]]:
492
509
  """Split history into old (to summarize) and recent (to keep intact).
493
510
 
494
- Uses the same protection window as pruning. Defensively, the first
495
- user turn (index 0) is always pulled into `recent` so the original
496
- ask survives literal even through a full compaction the compactor
497
- extracts it into the `## Goal` section of the summary, but keeping
498
- it in recent too means the agent can quote it verbatim afterward.
511
+ Uses `COMPACT_RECENT_CHARS` (80K chars ≈ 20K tokens) as the "recent"
512
+ budget — half of the prune protect window. Rationale: the compactor
513
+ now runs on the main model and produces high-fidelity summaries, so
514
+ we don't need 40K of recent overlap as a safety net. 20K covers 3-6
515
+ recent turns verbatim, which is enough to absorb the gap between
516
+ the last summarized state and the next turn.
517
+
518
+ Defensively, the first user turn (index 0) is always pulled into
519
+ `recent` so the original ask survives literal even through a full
520
+ compaction — the compactor extracts it into the `## Goal` section
521
+ of the summary, but keeping it in recent too means the agent can
522
+ quote it verbatim afterward.
523
+
524
+ The `model_id` parameter is retained for signature compatibility;
525
+ the recent budget is a flat value not scaled by model context.
499
526
  """
527
+ del model_id # unused — recent budget is flat across models
500
528
  from aru.history_blocks import item_char_len
501
- protect_chars = _get_prune_protect_chars(model_id)
502
529
  protected = 0
503
530
  split_idx = len(history)
504
531
  for i in range(len(history) - 1, -1, -1):
505
532
  msg_len = item_char_len(history[i])
506
- if protected + msg_len <= protect_chars:
533
+ if protected + msg_len <= COMPACT_RECENT_CHARS:
507
534
  protected += msg_len
508
535
  split_idx = i
509
536
  else:
@@ -617,10 +644,20 @@ async def compact_conversation(
617
644
  ) -> list[dict[str, str]]:
618
645
  """Run the compaction agent to summarize and replace history.
619
646
 
620
- Uses a small/fast model for the summarization to minimize cost.
621
- Falls back to simple truncation if the agent call fails.
647
+ Uses the **same model** as the main session (`model_ref`), not a
648
+ cheaper small model. Rationale:
649
+
650
+ - Compaction is rare (only on real overflow, ~0-2× per long session).
651
+ - The summary is the *only* persistent record of pre-window history.
652
+ - A weaker compactor risks dropping subtle decisions that the main
653
+ model would have caught — and once dropped, they cannot be recovered
654
+ mid-session.
655
+ - The marginal cost (Sonnet: ~$0.20-0.40 per session; Opus: a few
656
+ dollars) is justified by the fidelity gain on a non-recoverable
657
+ step.
658
+
659
+ Falls back to a mechanical summary if the agent call fails.
622
660
  """
623
- from aru.runtime import get_ctx
624
661
  from aru.providers import create_model
625
662
 
626
663
  prompt = build_compaction_prompt(history, plan_task, model_id=model_id)
@@ -628,16 +665,17 @@ async def compact_conversation(
628
665
  try:
629
666
  from agno.agent import Agent
630
667
 
631
- small_ref = get_ctx().small_model_ref
632
668
  compactor = Agent(
633
669
  name="Compactor",
634
- model=create_model(small_ref, max_tokens=4096),
670
+ model=create_model(model_ref, max_tokens=4096),
635
671
  instructions=(
636
672
  "You summarize coding conversations concisely. Output ONLY the requested sections, no preamble. "
637
673
  "Preserve: user goals, explicit instructions/preferences, file paths with line numbers, "
638
- "function/class names that were modified, what remains to be done, AND verbatim excerpts "
639
- "from any file contents shown in the conversation (signatures, critical constants, "
640
- "bug-related lines) under the '## File contents (key excerpts)' section. "
674
+ "function/class names that were modified, what remains to be done. "
675
+ "For the '## File contents (key excerpts)' section, use your judgment: "
676
+ "if a file was central to the work (being debugged, actively edited, or referenced "
677
+ "in a decision), include the critical lines verbatim; if a file was only briefly "
678
+ "read for context, just list the path. Do not mechanically copy everything. "
641
679
  "Drop: greetings, reasoning chains, redundant tool output, transient status messages."
642
680
  ),
643
681
  markdown=True,
@@ -330,35 +330,162 @@ def create_model(
330
330
  )
331
331
 
332
332
 
333
- def _make_cached_openai_chat_class():
334
- """Create a CachedOpenAIChat subclass (lazy import to avoid top-level dependency)."""
333
+ def _apply_cache_control(formatted_msg: dict) -> bool:
334
+ """Attach `cache_control: ephemeral` to a formatted OpenAI message.
335
+
336
+ Returns True if the marker was applied (i.e., the message had cacheable
337
+ content and wasn't already tagged). Skips messages whose content is not
338
+ a string or block list, messages already marked, and empty content.
339
+
340
+ Used by `CachedOpenAIChat` to tag system + last N user/assistant messages
341
+ for providers that honor OpenAI-style content blocks with `cache_control`
342
+ (DashScope/Qwen, and any OpenAI-compatible endpoint that mirrors the
343
+ Anthropic cache_control convention).
344
+ """
345
+ content = formatted_msg.get("content")
346
+ cache_tag = {"type": "ephemeral"}
347
+ if isinstance(content, str):
348
+ if not content:
349
+ return False
350
+ formatted_msg["content"] = [
351
+ {"type": "text", "text": content, "cache_control": cache_tag}
352
+ ]
353
+ return True
354
+ if isinstance(content, list) and content:
355
+ last = content[-1]
356
+ if isinstance(last, dict) and "cache_control" not in last:
357
+ last["cache_control"] = cache_tag
358
+ return True
359
+ return False
360
+
361
+
362
+ def _make_cached_openai_chat_class(mark_recent_messages: bool = False):
363
+ """Create a CachedOpenAIChat subclass that injects prompt-cache markers.
364
+
365
+ DashScope (Qwen) and other OpenAI-compatible APIs support explicit prompt
366
+ caching via `cache_control: {"type": "ephemeral"}` on content blocks. This
367
+ subclass tags:
368
+
369
+ 1. The **system message** — always. This is the minimum cache coverage
370
+ and is safe for any OpenAI-compatible provider that supports the marker
371
+ (unknown fields are ignored by providers that don't).
372
+
373
+ 2. The **last 2 non-system / non-tool messages** — only when
374
+ `mark_recent_messages=True`. This unlocks prefix caching for the growing
375
+ conversation history (the big win: 5-8× cost reduction on multi-turn
376
+ sessions), but is gated because OpenAI's own API may not accept the
377
+ marker on user/assistant messages. The flag is wired from
378
+ `_create_provider_model` based on whether the provider has a custom
379
+ `base_url` — a strong signal that we're talking to a non-official
380
+ OpenAI endpoint (Qwen/DashScope/custom) that mirrors the Anthropic
381
+ convention.
382
+
383
+ Implementation: each of the 4 invoke methods (invoke/ainvoke plus stream
384
+ variants) pre-formats the full batch using the parent's `_format_message`,
385
+ tags the target messages via `_apply_cache_control`, stores the tagged
386
+ versions in `self._current_cache_tag_map` keyed by `id(original)`, and
387
+ then delegates to `super().<method>()`. The overridden `_format_message`
388
+ consults the map and returns the pre-tagged version when present.
389
+ """
335
390
  from agno.models.openai import OpenAIChat
336
391
  from agno.models.message import Message
337
392
 
338
393
  class CachedOpenAIChat(OpenAIChat):
339
- """OpenAIChat subclass that injects cache_control into system messages.
394
+ _cache_recent_messages: bool = mark_recent_messages
340
395
 
341
- DashScope (Qwen) and other OpenAI-compatible APIs support explicit prompt caching
342
- via cache_control: {"type": "ephemeral"} on content blocks. This subclass
343
- automatically adds that marker to system messages so the provider can cache
344
- the system prompt between turns (up to 90% cost reduction on cached tokens).
345
- """
396
+ # --- core hook ------------------------------------------------------
346
397
 
347
398
  def _format_message(self, message: Message, compress_tool_results: bool = False):
399
+ # If an invoke-level pre-tag map is active, use the tagged version
400
+ tag_map = getattr(self, "_current_cache_tag_map", None)
401
+ if tag_map is not None:
402
+ pre = tag_map.get(id(message))
403
+ if pre is not None:
404
+ return pre
405
+
406
+ # Otherwise fall back to parent format + always-tag system
348
407
  formatted = super()._format_message(message, compress_tool_results)
349
-
350
- if message.role == "system" and isinstance(formatted.get("content"), str):
351
- text = formatted["content"]
352
- formatted["content"] = [
353
- {
354
- "type": "text",
355
- "text": text,
356
- "cache_control": {"type": "ephemeral"},
357
- }
358
- ]
359
-
408
+ if message.role == "system":
409
+ _apply_cache_control(formatted)
360
410
  return formatted
361
411
 
412
+ # --- batch pre-tagging ---------------------------------------------
413
+
414
+ def _build_cache_tag_map(self, messages, compress_tool_results: bool) -> dict:
415
+ """Format all messages up-front and tag system + last 2 recent.
416
+
417
+ Returns id(original_message) -> tagged formatted dict so the
418
+ overridden `_format_message` can substitute during super's
419
+ inline list comprehension.
420
+
421
+ Note: `OpenAIChat._format_message` rewrites `system` → `developer`
422
+ for newer OpenAI models. We check `Message.role` on the ORIGINAL
423
+ message (not the formatted dict) so the logic works regardless of
424
+ that rewrite.
425
+ """
426
+ # Use OpenAIChat's format directly (not self's) so the tag_map
427
+ # we're building doesn't cause recursive substitution.
428
+ base = [
429
+ OpenAIChat._format_message(self, m, compress_tool_results)
430
+ for m in messages
431
+ ]
432
+
433
+ # Tag the first system message (first Message with role=="system")
434
+ for orig, fmt in zip(messages, base):
435
+ if orig.role == "system":
436
+ _apply_cache_control(fmt)
437
+ break
438
+
439
+ # Optionally tag the last 2 non-system / non-tool messages.
440
+ # Iterate original+formatted in reverse so role checks stay
441
+ # on the unmodified Message role.
442
+ if self._cache_recent_messages:
443
+ marked = 0
444
+ for orig, fmt in zip(reversed(messages), reversed(base)):
445
+ if marked >= 2:
446
+ break
447
+ if orig.role in ("system", "tool"):
448
+ continue
449
+ if _apply_cache_control(fmt):
450
+ marked += 1
451
+
452
+ return {id(orig): fmt for orig, fmt in zip(messages, base)}
453
+
454
+ # --- invoke overrides: set up tag map, delegate to parent -----------
455
+
456
+ def invoke(self, messages, assistant_message, **kwargs):
457
+ compress = kwargs.get("compress_tool_results", False)
458
+ self._current_cache_tag_map = self._build_cache_tag_map(messages, compress)
459
+ try:
460
+ return super().invoke(messages, assistant_message, **kwargs)
461
+ finally:
462
+ self._current_cache_tag_map = None
463
+
464
+ async def ainvoke(self, messages, assistant_message, **kwargs):
465
+ compress = kwargs.get("compress_tool_results", False)
466
+ self._current_cache_tag_map = self._build_cache_tag_map(messages, compress)
467
+ try:
468
+ return await super().ainvoke(messages, assistant_message, **kwargs)
469
+ finally:
470
+ self._current_cache_tag_map = None
471
+
472
+ def invoke_stream(self, messages, assistant_message, **kwargs):
473
+ compress = kwargs.get("compress_tool_results", False)
474
+ self._current_cache_tag_map = self._build_cache_tag_map(messages, compress)
475
+ try:
476
+ yield from super().invoke_stream(messages, assistant_message, **kwargs)
477
+ finally:
478
+ self._current_cache_tag_map = None
479
+
480
+ async def ainvoke_stream(self, messages, assistant_message, **kwargs):
481
+ compress = kwargs.get("compress_tool_results", False)
482
+ self._current_cache_tag_map = self._build_cache_tag_map(messages, compress)
483
+ try:
484
+ async for item in super().ainvoke_stream(messages, assistant_message, **kwargs):
485
+ yield item
486
+ finally:
487
+ self._current_cache_tag_map = None
488
+
362
489
  return CachedOpenAIChat
363
490
 
364
491
 
@@ -400,7 +527,14 @@ def _create_provider_model(
400
527
  }
401
528
  params.update(kwargs)
402
529
  if cache_system_prompt:
403
- CachedOpenAIChat = _make_cached_openai_chat_class()
530
+ # Only mark recent messages with cache_control when the provider
531
+ # has a custom base_url (DashScope/Qwen/custom OpenAI-compat).
532
+ # Official OpenAI's API may reject the marker on user/assistant
533
+ # messages — for them, keep system-only caching.
534
+ mark_recent = bool(provider.base_url)
535
+ CachedOpenAIChat = _make_cached_openai_chat_class(
536
+ mark_recent_messages=mark_recent
537
+ )
404
538
  return CachedOpenAIChat(**params)
405
539
  from agno.models.openai import OpenAIChat
406
540
  return OpenAIChat(**params)
@@ -463,7 +597,14 @@ def _create_provider_model(
463
597
  }
464
598
  params.update(kwargs)
465
599
  if cache_system_prompt:
466
- CachedOpenAIChat = _make_cached_openai_chat_class()
600
+ # Fallback branch always means "unknown OpenAI-compat provider"
601
+ # — if there's a base_url it's a custom endpoint that may honor
602
+ # the cache_control marker. Without base_url we're in an odd
603
+ # state (unknown type, no endpoint) — default to system-only.
604
+ mark_recent = bool(provider.base_url)
605
+ CachedOpenAIChat = _make_cached_openai_chat_class(
606
+ mark_recent_messages=mark_recent
607
+ )
467
608
  return CachedOpenAIChat(**params)
468
609
  from agno.models.openai import OpenAIChat
469
610
  return OpenAIChat(**params)
@@ -1,6 +1,6 @@
1
1
  Metadata-Version: 2.4
2
2
  Name: aru-code
3
- Version: 0.19.0
3
+ Version: 0.19.2
4
4
  Summary: A Claude Code clone built with Agno agents
5
5
  Author-email: Estevao <estevaofon@gmail.com>
6
6
  License-Expression: MIT
@@ -4,7 +4,7 @@ build-backend = "setuptools.build_meta"
4
4
 
5
5
  [project]
6
6
  name = "aru-code"
7
- version = "0.19.0"
7
+ version = "0.19.2"
8
8
  description = "A Claude Code clone built with Agno agents"
9
9
  readme = "README.md"
10
10
  license = "MIT"
@@ -1 +0,0 @@
1
- __version__ = "0.19.0"
File without changes
File without changes
File without changes
File without changes
File without changes
File without changes
File without changes
File without changes
File without changes
File without changes
File without changes
File without changes
File without changes
File without changes
File without changes
File without changes
File without changes