aru-code 0.16.0__tar.gz → 0.17.0__tar.gz

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (58)
  1. {aru_code-0.16.0/aru_code.egg-info → aru_code-0.17.0}/PKG-INFO +1 -1
  2. aru_code-0.17.0/aru/__init__.py +1 -0
  3. {aru_code-0.16.0 → aru_code-0.17.0}/aru/agent_factory.py +1 -6
  4. {aru_code-0.16.0 → aru_code-0.17.0}/aru/agents/base.py +5 -4
  5. aru_code-0.17.0/aru/cache_patch.py +201 -0
  6. {aru_code-0.16.0 → aru_code-0.17.0}/aru/cli.py +11 -1
  7. {aru_code-0.16.0 → aru_code-0.17.0}/aru/commands.py +1 -0
  8. {aru_code-0.16.0 → aru_code-0.17.0}/aru/context.py +12 -12
  9. {aru_code-0.16.0 → aru_code-0.17.0}/aru/runner.py +13 -18
  10. {aru_code-0.16.0 → aru_code-0.17.0}/aru/session.py +131 -8
  11. {aru_code-0.16.0 → aru_code-0.17.0}/aru/tools/codebase.py +1 -1
  12. {aru_code-0.16.0 → aru_code-0.17.0/aru_code.egg-info}/PKG-INFO +1 -1
  13. {aru_code-0.16.0 → aru_code-0.17.0}/pyproject.toml +1 -1
  14. {aru_code-0.16.0 → aru_code-0.17.0}/tests/test_cli.py +20 -4
  15. aru_code-0.16.0/aru/__init__.py +0 -1
  16. aru_code-0.16.0/aru/cache_patch.py +0 -133
  17. {aru_code-0.16.0 → aru_code-0.17.0}/LICENSE +0 -0
  18. {aru_code-0.16.0 → aru_code-0.17.0}/README.md +0 -0
  19. {aru_code-0.16.0 → aru_code-0.17.0}/aru/agents/__init__.py +0 -0
  20. {aru_code-0.16.0 → aru_code-0.17.0}/aru/agents/executor.py +0 -0
  21. {aru_code-0.16.0 → aru_code-0.17.0}/aru/agents/planner.py +0 -0
  22. {aru_code-0.16.0 → aru_code-0.17.0}/aru/completers.py +0 -0
  23. {aru_code-0.16.0 → aru_code-0.17.0}/aru/config.py +0 -0
  24. {aru_code-0.16.0 → aru_code-0.17.0}/aru/display.py +0 -0
  25. {aru_code-0.16.0 → aru_code-0.17.0}/aru/permissions.py +0 -0
  26. {aru_code-0.16.0 → aru_code-0.17.0}/aru/providers.py +0 -0
  27. {aru_code-0.16.0 → aru_code-0.17.0}/aru/runtime.py +0 -0
  28. {aru_code-0.16.0 → aru_code-0.17.0}/aru/tools/__init__.py +0 -0
  29. {aru_code-0.16.0 → aru_code-0.17.0}/aru/tools/ast_tools.py +0 -0
  30. {aru_code-0.16.0 → aru_code-0.17.0}/aru/tools/gitignore.py +0 -0
  31. {aru_code-0.16.0 → aru_code-0.17.0}/aru/tools/mcp_client.py +0 -0
  32. {aru_code-0.16.0 → aru_code-0.17.0}/aru/tools/ranker.py +0 -0
  33. {aru_code-0.16.0 → aru_code-0.17.0}/aru/tools/tasklist.py +0 -0
  34. {aru_code-0.16.0 → aru_code-0.17.0}/aru_code.egg-info/SOURCES.txt +0 -0
  35. {aru_code-0.16.0 → aru_code-0.17.0}/aru_code.egg-info/dependency_links.txt +0 -0
  36. {aru_code-0.16.0 → aru_code-0.17.0}/aru_code.egg-info/entry_points.txt +0 -0
  37. {aru_code-0.16.0 → aru_code-0.17.0}/aru_code.egg-info/requires.txt +0 -0
  38. {aru_code-0.16.0 → aru_code-0.17.0}/aru_code.egg-info/top_level.txt +0 -0
  39. {aru_code-0.16.0 → aru_code-0.17.0}/setup.cfg +0 -0
  40. {aru_code-0.16.0 → aru_code-0.17.0}/tests/test_agents_base.py +0 -0
  41. {aru_code-0.16.0 → aru_code-0.17.0}/tests/test_cli_advanced.py +0 -0
  42. {aru_code-0.16.0 → aru_code-0.17.0}/tests/test_cli_base.py +0 -0
  43. {aru_code-0.16.0 → aru_code-0.17.0}/tests/test_cli_completers.py +0 -0
  44. {aru_code-0.16.0 → aru_code-0.17.0}/tests/test_cli_new.py +0 -0
  45. {aru_code-0.16.0 → aru_code-0.17.0}/tests/test_cli_run_cli.py +0 -0
  46. {aru_code-0.16.0 → aru_code-0.17.0}/tests/test_cli_session.py +0 -0
  47. {aru_code-0.16.0 → aru_code-0.17.0}/tests/test_cli_shell.py +0 -0
  48. {aru_code-0.16.0 → aru_code-0.17.0}/tests/test_codebase.py +0 -0
  49. {aru_code-0.16.0 → aru_code-0.17.0}/tests/test_config.py +0 -0
  50. {aru_code-0.16.0 → aru_code-0.17.0}/tests/test_context.py +0 -0
  51. {aru_code-0.16.0 → aru_code-0.17.0}/tests/test_executor.py +0 -0
  52. {aru_code-0.16.0 → aru_code-0.17.0}/tests/test_gitignore.py +0 -0
  53. {aru_code-0.16.0 → aru_code-0.17.0}/tests/test_main.py +0 -0
  54. {aru_code-0.16.0 → aru_code-0.17.0}/tests/test_mcp_client.py +0 -0
  55. {aru_code-0.16.0 → aru_code-0.17.0}/tests/test_permissions.py +0 -0
  56. {aru_code-0.16.0 → aru_code-0.17.0}/tests/test_planner.py +0 -0
  57. {aru_code-0.16.0 → aru_code-0.17.0}/tests/test_providers.py +0 -0
  58. {aru_code-0.16.0 → aru_code-0.17.0}/tests/test_ranker.py +0 -0
@@ -1,6 +1,6 @@
  Metadata-Version: 2.4
  Name: aru-code
- Version: 0.16.0
+ Version: 0.17.0
  Summary: A Claude Code clone built with Agno agents
  Author-email: Estevao <estevaofon@gmail.com>
  License-Expression: MIT
@@ -0,0 +1 @@
+ __version__ = "0.17.0"
@@ -25,12 +25,7 @@ def create_general_agent(
      from aru.tools.codebase import GENERAL_TOOLS
      tools = GENERAL_TOOLS

-     # Only include AGENTS.md/project instructions on first turn to save ~1.6K tokens/turn
-     if config and not session.extra_instructions_sent:
-         extra = config.get_extra_instructions()
-         session.extra_instructions_sent = True
-     else:
-         extra = ""
+     extra = config.get_extra_instructions() if config else ""
      if env_context:
          extra = f"{extra}\n\n{env_context}" if extra else env_context
      model_ref = model_override or session.model_ref
@@ -48,12 +48,12 @@ Default shows 10 lines of context — use `context_lines=30` for full function b
  2. **Understand a file** → `read_file_smart(path, query)` — returns a concise answer, not raw content
  3. **Need raw content** → `read_file(path)` — returns first chunk + outline for large files

- **Stop early**: Once you have enough information to write the plan, STOP making tool calls \
- immediately. Do not exhaustively explore.
-
  **Batch independent tool calls**: When you need answers from multiple independent sources, \
  emit ALL those tool calls in a single response.

+ **Stop early**: Once you have enough information to write the plan, stop exploring and write it. \
+ Do not exhaustively read every file — batch what you need, then produce the plan. \
+
  ## Output format — STRICT

  Your ONLY output is the plan below. Do NOT write analysis, coverage reports, summaries of
@@ -182,7 +182,8 @@ Every tool call accumulates its result in your context window. Use the minimum n

  **Batch independent tool calls**: emit ALL independent tool calls in a single response.

- **Stop early**: Once you have enough information to do the work, STOP exploring and start working.
+ **Stop early**: Once you have enough information to act, stop exploring and start working. \
+ Batch what you need upfront, then execute.

  **When adding or modifying unit tests, ALWAYS run them to verify they pass before finishing.**

@@ -0,0 +1,201 @@
+ """Monkey-patch Agno's model layer to reduce token consumption.
+
+ Three optimizations:
+
+ 1. **Tool result pruning** (ALL providers): After each tool execution, old tool
+    results in the message list are truncated to a short summary. This prevents
+    O(n²) token growth where each API call re-sends all previous tool results.
+
+ 2. **Cache breakpoints** (Anthropic only): Marks the last 2 messages with
+    cache_control for Anthropic's prompt caching.
+
+ 3. **Per-call metrics** (ALL providers): Captures input/output tokens of the
+    last API call (context window size), exposed via get_last_call_metrics().
+
+ These patches intercept Agno's internal loop so they work transparently
+ regardless of which provider is used.
+ """
+
+ from __future__ import annotations
+
+ # Token-budget pruning (aligned with OpenCode's strategy):
+ # - Protect recent tool results within a token budget
+ # - Only prune if there's enough to free (avoid churn)
+ # - Walk backwards, protecting recent content first
+ # OpenCode uses 40K protect / 20K minimum; we use chars (~4 chars/token)
+ _PRUNE_PROTECT_CHARS = 160_000  # ~40K tokens — recent content always kept
+ _PRUNE_MINIMUM_CHARS = 80_000  # ~20K tokens — only prune if this much is freeable
+ _PRUNED_PLACEHOLDER = "[Old tool result cleared]"
+
+ # Last API call metrics (updated on every internal API call)
+ _last_call_input_tokens: int = 0
+ _last_call_output_tokens: int = 0
+ _last_call_cache_read: int = 0
+ _last_call_cache_write: int = 0
+
+
+ def get_last_call_metrics() -> tuple[int, int, int, int]:
+     """Return (input, output, cache_read, cache_write) from the most recent API call."""
+     return _last_call_input_tokens, _last_call_output_tokens, _last_call_cache_read, _last_call_cache_write
+
+
+ def _prune_tool_messages(messages):
+     """Clear old tool result content using a token-budget approach.
+
+     Walks backwards through messages, protecting recent content up to
+     PRUNE_PROTECT_CHARS. Older tool results beyond that budget are replaced
+     with a short placeholder. Only prunes if total freeable chars exceed
+     PRUNE_MINIMUM_CHARS (avoids unnecessary churn on small conversations).
+
+     Aligned with OpenCode's strategy: budget-based, not fixed-N.
+     """
+     # Collect tool message indices and their content sizes
+     tool_indices = []
+     for i, msg in enumerate(messages):
+         if getattr(msg, "role", None) == "tool":
+             content = getattr(msg, "content", None)
+             content_len = len(str(content)) if content is not None else 0
+             tool_indices.append((i, content_len))
+
+     if not tool_indices:
+         return
+
+     # Walk backwards, accumulating protected chars
+     protected_chars = 0
+     prune_candidates = []  # (index, content_len) of messages outside protection
+
+     for idx, content_len in reversed(tool_indices):
+         if protected_chars + content_len <= _PRUNE_PROTECT_CHARS:
+             protected_chars += content_len
+         else:
+             prune_candidates.append((idx, content_len))
+
+     # Only prune if there's enough to free
+     freeable = sum(cl for _, cl in prune_candidates)
+     if freeable < _PRUNE_MINIMUM_CHARS:
+         return
+
+     # Replace old tool results with placeholder
+     for idx, _ in prune_candidates:
+         msg = messages[idx]
+         content = getattr(msg, "content", None)
+         if content is None:
+             continue
+         # Skip if already pruned
+         if str(content) == _PRUNED_PLACEHOLDER:
+             continue
+         try:
+             msg.content = _PRUNED_PLACEHOLDER
+             if hasattr(msg, "compressed_content"):
+                 msg.compressed_content = None
+         except (AttributeError, TypeError):
+             pass
+
+
+ def apply_cache_patch():
+     """Apply all patches to reduce Agno's token consumption."""
+     _patch_tool_result_pruning()
+     _patch_claude_cache_breakpoints()
+     _patch_per_call_metrics()
+
+
+ def _patch_tool_result_pruning():
+     """Patch format_function_call_results to prune old tool results.
+
+     This is called after each tool execution, right before the next API call.
+     Works for ALL providers (Claude, OpenAI, Qwen, etc.) since it patches
+     the base Model class.
+     """
+     from agno.models.base import Model
+
+     _original_format_results = Model.format_function_call_results
+
+     def _patched_format_results(self, messages, function_call_results, **kwargs):
+         # First: prune old tool results already in messages
+         _prune_tool_messages(messages)
+         # Then: add new results normally
+         return _original_format_results(self, messages, function_call_results, **kwargs)
+
+     Model.format_function_call_results = _patched_format_results
+
+
+ def _patch_claude_cache_breakpoints():
+     """Patch Claude's format_messages to add cache breakpoints.
+
+     Marks the last 2 messages with cache_control for Anthropic's prompt
+     caching. Non-Anthropic providers ignore these fields.
+     """
+     try:
+         import agno.utils.models.claude as claude_utils
+     except ImportError:
+         return
+
+     _original_format = claude_utils.format_messages
+
+     def _patched_format_messages(messages, compress_tool_results=False):
+         chat_messages, system_message = _original_format(
+             messages, compress_tool_results=compress_tool_results
+         )
+
+         if not chat_messages:
+             return chat_messages, system_message
+
+         # Add cache_control to last 2 messages
+         cache_marker = {"type": "ephemeral"}
+         marked = 0
+         for msg in reversed(chat_messages):
+             if marked >= 2:
+                 break
+             content = msg.get("content")
+             if isinstance(content, list) and content:
+                 last_item = content[-1]
+                 if isinstance(last_item, dict):
+                     last_item["cache_control"] = cache_marker
+                     marked += 1
+                 elif hasattr(last_item, "type"):
+                     try:
+                         as_dict = last_item.model_dump() if hasattr(last_item, "model_dump") else dict(last_item)
+                         as_dict["cache_control"] = cache_marker
+                         content[-1] = as_dict
+                         marked += 1
+                     except Exception:
+                         pass
+             elif isinstance(content, str):
+                 msg["content"] = [{"type": "text", "text": content, "cache_control": cache_marker}]
+                 marked += 1
+
+         return chat_messages, system_message
+
+     claude_utils.format_messages = _patched_format_messages
+
+
+ def _patch_per_call_metrics():
+     """Patch accumulate_model_metrics to capture per-API-call token counts.
+
+     After each internal API call, Agno calls this function to sum tokens
+     into RunMetrics. We intercept it to snapshot the last call's tokens,
+     giving us the actual context window size (comparable to OpenCode/Claude Code).
+     """
+     from agno.metrics import accumulate_model_metrics as _original_accumulate
+
+     import agno.metrics as _metrics_module
+
+     def _patched_accumulate(model_response, model, model_type, run_metrics=None):
+         global _last_call_input_tokens, _last_call_output_tokens
+         global _last_call_cache_read, _last_call_cache_write
+         usage = getattr(model_response, "response_usage", None)
+         if usage is not None:
+             _last_call_input_tokens = getattr(usage, "input_tokens", 0) or 0
+             _last_call_output_tokens = getattr(usage, "output_tokens", 0) or 0
+             _last_call_cache_read = getattr(usage, "cache_read_tokens", 0) or 0
+             _last_call_cache_write = getattr(usage, "cache_write_tokens", 0) or 0
+         return _original_accumulate(model_response, model, model_type, run_metrics)
+
+     _metrics_module.accumulate_model_metrics = _patched_accumulate
+
+     # Also patch the reference in base.py since it may have imported directly
+     try:
+         import agno.models.base as _base_module
+         _base_module.accumulate_model_metrics = _patched_accumulate
+     except (ImportError, AttributeError):
+         pass
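The new module is applied once at startup and queried after each turn. A minimal usage sketch, not part of the diff (the surrounding call sites are hypothetical stand-ins for what aru/cli.py and aru/session.py do):

    from aru.cache_patch import apply_cache_patch, get_last_call_metrics

    apply_cache_patch()  # install all three patches before any model call

    # ... run one agent turn through Agno here ...

    inp, out, c_read, c_write = get_last_call_metrics()
    print(f"last call context: {inp + out + c_read + c_write:,} tokens")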
@@ -14,6 +14,7 @@ import os
  import sys

  from rich.markdown import Markdown
+ from rich.panel import Panel

  # ── Re-exports for backward compatibility ─────────────────────────────
  # Tests and external code import these from aru.cli; keep them accessible.
@@ -413,6 +414,15 @@ async def run_cli(skip_permissions: bool = False, resume_id: str | None = None):
              _show_help(config)
              continue

+         if user_input.lower() == "/cost":
+             console.print(Panel(
+                 session.cost_summary,
+                 title="[bold]Token Usage & Cost[/bold]",
+                 border_style="cyan",
+                 padding=(1, 2),
+             ))
+             continue
+
          if user_input.startswith("! "):
              cmd = user_input[2:].strip()
              if not cmd:
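For reference, a standalone sketch of the same Panel rendering (rich is already a dependency of this package; the summary string here is a placeholder for session.cost_summary):

    from rich.console import Console
    from rich.panel import Panel

    console = Console()
    summary = "Session cost: $0.01\nCumulative tokens: 1,500"  # placeholder text
    console.print(Panel(summary, title="[bold]Token Usage & Cost[/bold]",
                        border_style="cyan", padding=(1, 2)))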
@@ -518,7 +528,7 @@ async def run_cli(skip_permissions: bool = False, resume_id: str | None = None):
              session.add_message("assistant", run_result.with_tools_summary())
          else:
              console.print(f"[yellow]Unknown command: /{cmd_name}[/yellow]")
-             console.print(f"[dim]Built-in: /plan, /model, /sessions, /commands, /skills, /agents, /quit[/dim]")
+             console.print(f"[dim]Built-in: /plan, /model, /sessions, /commands, /skills, /agents, /cost, /quit[/dim]")
              if config.commands:
                  console.print(f"[dim]Custom: {', '.join(f'/{k}' for k in config.commands)}[/dim]")
              if config.skills:
@@ -21,6 +21,7 @@ SLASH_COMMANDS = [
      ("/skills", "List available skills", "/skills"),
      ("/agents", "List custom agents", "/agents"),
      ("/mcp", "List loaded MCP tools", "/mcp"),
+     ("/cost", "Show detailed token usage and cost", "/cost"),
      ("/quit", "Exit aru", "/quit"),
  ]

@@ -11,15 +11,15 @@ from __future__ import annotations
  # ── Constants ──────────────────────────────────────────────────────

  # Pruning: minimum chars that must be freeable to justify a prune pass
- PRUNE_MINIMUM_CHARS = 8_000  # ~2K tokens (was 12K — prune sooner)
+ PRUNE_MINIMUM_CHARS = 12_000  # ~3.5K tokens
  # Placeholder that replaces evicted content
  PRUNED_PLACEHOLDER = "[cleared]"
  # User messages larger than this threshold are truncated when outside protection window
- PRUNE_USER_MSG_THRESHOLD = 1_200  # ~340 tokens (was 2K — catch file contents earlier)
+ PRUNE_USER_MSG_THRESHOLD = 2_000  # ~570 tokens
  # How many chars to keep from the start of a pruned user message
- PRUNE_USER_MSG_KEEP = 300  # ~85 tokens (was 500 — enough for the request intent)
+ PRUNE_USER_MSG_KEEP = 500  # ~140 tokens
  # Minimum number of recent user turns always protected (regardless of char budget)
- PRUNE_PROTECT_TURNS = 1  # was 2 — only protect the very last turn
+ PRUNE_PROTECT_TURNS = 2
  # Tool result markers that should never be pruned (critical context)
  PRUNE_PROTECTED_MARKERS = {"[SubAgent-", "delegate_task"}
  # Tool names whose outputs should never be pruned (like OpenCode's PRUNE_PROTECTED_TOOLS)
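At the assumed ~4 chars/token ratio, the restored values mean a prune pass only runs once roughly 3K tokens of history are reclaimable, and an oversized user message (over ~570 tokens) keeps only its first ~500 chars. A standalone illustration of those two thresholds (mirrors the constants above; the placeholder shape is illustrative, not a call into aru.context):

    PRUNE_MINIMUM_CHARS = 12_000
    PRUNE_USER_MSG_THRESHOLD = 2_000
    PRUNE_USER_MSG_KEEP = 500

    def worth_pruning(freeable_chars: int) -> bool:
        # Skip the whole pass if too little would be reclaimed
        return freeable_chars >= PRUNE_MINIMUM_CHARS

    def truncate_user_msg(text: str) -> str:
        # Only messages beyond the threshold are cut; keep the head
        if len(text) <= PRUNE_USER_MSG_THRESHOLD:
            return text
        return text[:PRUNE_USER_MSG_KEEP] + " [cleared]"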
@@ -27,20 +27,20 @@ PRUNE_PROTECTED_MARKERS = {"[SubAgent-", "delegate_task"}
  PRUNE_PROTECTED_TOOLS = {"delegate_task"}

  # Truncation: universal limits for any tool output
- TRUNCATE_MAX_LINES = 200  # was 300 — tighter to save context
- TRUNCATE_MAX_BYTES = 10 * 1024  # 10 KB (was 15KB — save full to disk instead)
+ TRUNCATE_MAX_LINES = 300
+ TRUNCATE_MAX_BYTES = 15 * 1024  # 15 KB
  TRUNCATE_KEEP_START = 150  # lines to keep from the start
- TRUNCATE_KEEP_END = 30  # lines to keep from the end (was 60)
+ TRUNCATE_KEEP_END = 60  # lines to keep from the end
  TRUNCATE_MAX_LINE_LENGTH = 1500  # chars per individual line (prevents minified files)
  # Directory for saving full truncated outputs (like OpenCode pattern)
  TRUNCATE_SAVE_DIR = ".aru/truncated"

  # Compaction: trigger when per-run input tokens exceed this fraction of model limit
- COMPACTION_THRESHOLD_RATIO = 0.50  # was 0.70 — compact much earlier to stay lean
+ COMPACTION_THRESHOLD_RATIO = 0.70
  # Compaction: target post-compaction size as fraction of model context limit
- COMPACTION_TARGET_RATIO = 0.10  # was 0.15 — more aggressive compaction target
+ COMPACTION_TARGET_RATIO = 0.15
  # Compaction: also trigger after this many user turns (regardless of token count)
- COMPACTION_MAX_TURNS = 8
+ COMPACTION_MAX_TURNS = 15
  # Compaction: reserve buffer for the compaction process itself (like OpenCode's 20K)
  COMPACTION_BUFFER_TOKENS = 20_000
  # Default model context limits (input tokens)
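As a worked example of the restored ratios: for a model with a 200K-token input limit, compaction now triggers only once a run's input tokens exceed 0.70 × 200,000 = 140,000, and aims for roughly 0.15 × 200,000 = 30,000 tokens afterwards. A standalone check under those assumptions (the package's actual trigger logic is should_compact, elsewhere in aru/context.py):

    COMPACTION_THRESHOLD_RATIO = 0.70
    COMPACTION_TARGET_RATIO = 0.15

    def compaction_points(context_limit: int) -> tuple[int, int]:
        # (trigger threshold, post-compaction target), in input tokens
        return (round(context_limit * COMPACTION_THRESHOLD_RATIO),
                round(context_limit * COMPACTION_TARGET_RATIO))

    assert compaction_points(200_000) == (140_000, 30_000)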
@@ -115,8 +115,8 @@ def _get_prune_protect_chars(model_id: str = "default") -> int:
      to prevent context overflow. Returns ~7% of the model's context in chars.
      """
      limit = MODEL_CONTEXT_LIMITS.get(model_id, MODEL_CONTEXT_LIMITS["default"])
-     # ~4 chars per token, protect ~5% of context (was 7% — tighter budget)
-     protect = int(limit * 0.05 * 4)
+     # ~4 chars per token, protect ~7% of context
+     protect = int(limit * 0.07 * 4)
      # Clamp between 10K (minimum usable) and 40K (diminishing returns)
      return max(10_000, min(protect, 40_000))

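Plugging in numbers: a 200K-token model yields int(200_000 * 0.07 * 4) = 56_000 chars, clamped down to 40_000, while a 32K model yields 8_960, raised to the 10_000 floor. A standalone check mirroring the function above:

    def prune_protect_chars(limit: int) -> int:
        protect = int(limit * 0.07 * 4)
        return max(10_000, min(protect, 40_000))

    assert prune_protect_chars(200_000) == 40_000  # clamped from 56_000
    assert prune_protect_chars(32_000) == 10_000   # raised from 8_960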
@@ -115,9 +115,6 @@ async def run_agent_capture(agent, message: str, session=None, lightweight: bool
      run_message = message

      # Build conversation history as real messages for the LLM
-     # Compact BEFORE pruning: if the history is large enough that pruning
-     # would discard content, compact first to preserve context via summary
-     # instead of losing it to placeholders.
      from aru.context import prune_history, should_compact, compact_conversation, would_prune
      if session and session.history and not lightweight:
          if would_prune(session.history, model_id=session.model_id):
@@ -242,23 +239,21 @@ async def run_agent_capture(agent, message: str, session=None, lightweight: bool
      if run_output and session and hasattr(run_output, "metrics"):
          session.track_tokens(run_output.metrics)

-         # Reactive compaction: use per-run input_tokens (sum of all API
-         # calls within this arun) as a conservative proxy for context pressure.
-         # session.history doesn't include tool results, so char-based estimates
-         # would miss the bulk of the context sent to the model.
+         # Reactive compaction: runs with a visible spinner so the user
+         # sees progress instead of a frozen screen.
          run_input_tokens = getattr(run_output.metrics, "input_tokens", 0) or 0
          if should_compact(run_input_tokens, session.model_id):
-             try:
-                 # Always prune first to shrink history before compaction
-                 session.history = prune_history(session.history, model_id=session.model_id)
-                 session.history = await compact_conversation(
-                     session.history, session.model_ref, session.plan_task,
-                     model_id=session.model_id,
-                 )
-                 console.print("[dim]Context compacted to save tokens.[/dim]")
-             except Exception:
-                 # Even if compaction fails, keep the pruned history
-                 pass
+             from rich.status import Status
+             with Status("[dim]Compacting context...[/dim]", console=console, spinner="dots"):
+                 try:
+                     session.history = prune_history(session.history, model_id=session.model_id)
+                     session.history = await compact_conversation(
+                         session.history, session.model_ref, session.plan_task,
+                         model_id=session.model_id,
+                     )
+                     console.print("[dim]Context compacted to save tokens.[/dim]")
+                 except Exception:
+                     pass

      final_content = accumulated or final_content
      remaining = (final_content or "")[display._flushed_len:]
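The spinner is plain rich.status usage. A self-contained sketch of the same pattern (the sleep is a stand-in for the actual prune_history plus compact_conversation work):

    import time
    from rich.console import Console
    from rich.status import Status

    console = Console()
    with Status("[dim]Compacting context...[/dim]", console=console, spinner="dots"):
        time.sleep(1)  # stand-in for pruning and compaction
    console.print("[dim]Context compacted to save tokens.[/dim]")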
@@ -16,6 +16,46 @@ from aru.providers import MODEL_ALIASES, get_model_display, resolve_model_ref
  # Default model reference (provider/model format)
  DEFAULT_MODEL = "anthropic/claude-sonnet-4-5"

+ # Pricing per million tokens (USD). Cache read/write have separate rates.
+ # Format: {model_id_prefix: (input, output, cache_read, cache_write)}
+ # Prices as of 2025-05. Models not listed fall back to "default".
+ MODEL_PRICING: dict[str, tuple[float, float, float, float]] = {
+     # Anthropic (input, output, cache_read=10%, cache_write=125%)
+     "claude-sonnet-4-5": (3.00, 15.00, 0.30, 3.75),
+     "claude-sonnet-4-6": (3.00, 15.00, 0.30, 3.75),
+     "claude-opus-4": (15.00, 75.00, 1.50, 18.75),
+     "claude-opus-4-6": (15.00, 75.00, 1.50, 18.75),
+     "claude-haiku-3-5": (0.80, 4.00, 0.08, 1.00),
+     "claude-haiku-4-5": (1.00, 5.00, 0.10, 1.25),
+     # OpenAI
+     "gpt-4o": (2.50, 10.00, 1.25, 2.50),
+     "gpt-4o-mini": (0.15, 0.60, 0.075, 0.15),
+     "gpt-4.1": (2.00, 8.00, 0.50, 2.00),
+     "gpt-4.1-mini": (0.40, 1.60, 0.10, 0.40),
+     "gpt-4.1-nano": (0.10, 0.40, 0.025, 0.10),
+     "o3": (2.00, 8.00, 0.50, 2.00),
+     "o3-mini": (1.10, 4.40, 0.275, 1.10),
+     "o4-mini": (1.10, 4.40, 0.275, 1.10),
+     # Qwen / DashScope (<=256K tier, explicit cache: creation=125%, hit=10%)
+     "qwen3-plus": (0.50, 3.00, 0.05, 0.625),
+     "qwen3.6-plus": (0.50, 3.00, 0.05, 0.625),
+     "qwen-plus": (0.50, 3.00, 0.05, 0.625),
+     "qwen-max": (2.00, 6.00, 0.20, 2.50),
+     "qwen-turbo": (0.30, 0.60, 0.03, 0.375),
+     "qwen3-coder-plus": (0.50, 3.00, 0.05, 0.625),
+     # DeepSeek
+     "deepseek-chat": (0.27, 1.10, 0.07, 0.27),
+     "deepseek-reasoner": (0.55, 2.19, 0.14, 0.55),
+     # Google Gemini (via OpenRouter)
+     "gemini-2.5-pro": (1.25, 10.00, 0.315, 1.25),
+     "gemini-2.5-flash": (0.15, 0.60, 0.0375, 0.15),
+     # Groq (free tier / very cheap)
+     "llama-3.3-70b": (0.59, 0.79, 0.0, 0.0),
+     "llama-3.1": (0.05, 0.08, 0.0, 0.0),
+     # Fallback
+     "default": (3.00, 15.00, 0.30, 3.75),
+ }
+
  SESSIONS_DIR = os.path.join(".aru", "sessions")


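To see how these tuples are consumed by the estimated_cost property added later in this diff: with claude-sonnet-4-5 at (3.00, 15.00, 0.30, 3.75), a session totalling 1M input tokens (of which 800K were cache reads and 50K cache writes) plus 100K output tokens costs about $2.38. Standalone arithmetic, not package code:

    price_in, price_out, price_read, price_write = (3.00, 15.00, 0.30, 3.75)
    total_in, cache_read, cache_write, total_out = 1_000_000, 800_000, 50_000, 100_000

    base_input = max(0, total_in - cache_read - cache_write)  # 150_000 uncached
    cost = (base_input * price_in
            + total_out * price_out
            + cache_read * price_read
            + cache_write * price_write) / 1_000_000
    print(f"${cost:.4f}")  # -> $2.3775 = 0.45 + 1.50 + 0.24 + 0.1875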
@@ -141,12 +181,15 @@ class Session:
          self.total_cache_read_tokens: int = 0
          self.total_cache_write_tokens: int = 0
          self.api_calls: int = 0
+         # Per-call metrics: last API call's context window (set by cache_patch)
+         self.last_input_tokens: int = 0
+         self.last_output_tokens: int = 0
+         self.last_cache_read: int = 0
+         self.last_cache_write: int = 0
          # Context cache — invalidated on file mutations
          self._cached_tree: str | None = None
          self._cached_git_status: str | None = None
          self._context_dirty: bool = True
-         # Track whether AGENTS.md/extra instructions were already sent (skip on subsequent turns)
-         self.extra_instructions_sent: bool = False
          # Tree depth for env context (configurable via aru.json "tree_depth")
          self._tree_max_depth: int = 2
          # Token budget (0 = unlimited)
@@ -198,20 +241,100 @@ class Session:
          self.total_cache_read_tokens += getattr(metrics, "cache_read_tokens", 0) or 0
          self.total_cache_write_tokens += getattr(metrics, "cache_write_tokens", 0) or 0
          self.api_calls += 1
+         # Capture last API call's context window (set by cache_patch)
+         try:
+             from aru.cache_patch import get_last_call_metrics
+             self.last_input_tokens, self.last_output_tokens, self.last_cache_read, self.last_cache_write = get_last_call_metrics()
+         except ImportError:
+             self.last_input_tokens = getattr(metrics, "input_tokens", 0) or 0
+             self.last_output_tokens = getattr(metrics, "output_tokens", 0) or 0
+             self.last_cache_read = 0
+             self.last_cache_write = 0
+
+     def _get_pricing(self) -> tuple[float, float, float, float]:
+         """Get per-million-token pricing for the current model."""
+         model_id = self.model_id
+         # Try exact match, then prefix match, then fallback
+         for prefix, pricing in MODEL_PRICING.items():
+             if prefix == "default":
+                 continue
+             if model_id.startswith(prefix):
+                 return pricing
+         return MODEL_PRICING["default"]
+
+     @property
+     def estimated_cost(self) -> float:
+         """Estimate cumulative cost in USD based on token usage and model pricing.
+
+         For input tokens, subtracts cache_read (charged at cache rate) and
+         cache_write (charged at write rate) from the base input count.
+         """
+         price_in, price_out, price_cache_read, price_cache_write = self._get_pricing()
+         # Non-cached input = total input - cache_read - cache_write
+         base_input = max(0, self.total_input_tokens - self.total_cache_read_tokens - self.total_cache_write_tokens)
+         cost = (
+             base_input * price_in / 1_000_000
+             + self.total_output_tokens * price_out / 1_000_000
+             + self.total_cache_read_tokens * price_cache_read / 1_000_000
+             + self.total_cache_write_tokens * price_cache_write / 1_000_000
+         )
+         return cost

      @property
      def token_summary(self) -> str:
+         """One-line summary shown after each response: context window + cost."""
+         if self.last_input_tokens <= 0 and self.total_input_tokens == 0:
+             return ""
+         cost = self.estimated_cost
+         cost_str = f"${cost:.4f}" if cost < 0.01 else f"${cost:.2f}"
+         if self.last_input_tokens > 0:
+             ctx_total = self.last_input_tokens + self.last_output_tokens + self.last_cache_read + self.last_cache_write
+             parts = [f"in: {self.last_input_tokens:,}", f"out: {self.last_output_tokens:,}"]
+             if self.last_cache_read > 0:
+                 parts.append(f"cache_read: {self.last_cache_read:,}")
+             if self.last_cache_write > 0:
+                 parts.append(f"cache_write: {self.last_cache_write:,}")
+             return f"context: {ctx_total:,} ({' / '.join(parts)}) | cost: {cost_str}"
+         # Fallback when per-call metrics aren't available
+         total = self.total_input_tokens + self.total_output_tokens
+         return f"tokens: {total:,} | cost: {cost_str}"
+
+     @property
+     def cost_summary(self) -> str:
+         """Detailed cost breakdown for /cost command."""
          total = self.total_input_tokens + self.total_output_tokens
          if total == 0:
-             return ""
-         metrics_str = f"in: {self.total_input_tokens:,} / out: {self.total_output_tokens:,}"
+             return "No token usage yet."
+         cost = self.estimated_cost
+         cost_str = f"${cost:.4f}" if cost < 0.01 else f"${cost:.2f}"
+         lines = [
+             f"Session cost: {cost_str}",
+             f"",
+             f"Cumulative tokens:",
+             f" input: {self.total_input_tokens:,}",
+             f" output: {self.total_output_tokens:,}",
+         ]
          if self.total_cache_read_tokens > 0:
-             metrics_str += f" / cached: {self.total_cache_read_tokens:,}"
-         summary = f"tokens: {total:,} ({metrics_str}) | calls: {self.api_calls}"
+             lines.append(f" cache_read: {self.total_cache_read_tokens:,}")
+         if self.total_cache_write_tokens > 0:
+             lines.append(f" cache_write: {self.total_cache_write_tokens:,}")
+         lines.append(f" total: {total:,}")
+         lines.append(f" api calls: {self.api_calls}")
+         if self.last_input_tokens > 0:
+             ctx_total = self.last_input_tokens + self.last_output_tokens + self.last_cache_read + self.last_cache_write
+             lines.append(f"")
+             lines.append(f"Last context window: {ctx_total:,}")
+             lines.append(f" input: {self.last_input_tokens:,}")
+             lines.append(f" output: {self.last_output_tokens:,}")
+             if self.last_cache_read > 0:
+                 lines.append(f" cache_read: {self.last_cache_read:,}")
+             if self.last_cache_write > 0:
+                 lines.append(f" cache_write: {self.last_cache_write:,}")
          if self.token_budget > 0:
              pct = int(total / self.token_budget * 100)
-             summary += f" | budget: {pct}%"
-         return summary
+             lines.append(f"")
+             lines.append(f"Budget: {pct}% used")
+         return "\n".join(lines)

      def invalidate_context_cache(self):
          """Mark cached tree/git status as stale. Call after file mutations."""
@@ -55,7 +55,7 @@ def _format_diff(old_string: str, new_string: str) -> Group:


  # Hard ceiling per tool result (~7K tokens). Even max_size=0 respects this per chunk.
- _READ_HARD_CAP = 25_000  # bytes (was 40K — each tool result re-sent on next API call)
+ _READ_HARD_CAP = 40_000  # bytes (~11K tokens)

  def clear_read_cache():
      """Clear the read cache. Call after file mutations to avoid stale data."""
@@ -1,6 +1,6 @@
  Metadata-Version: 2.4
  Name: aru-code
- Version: 0.16.0
+ Version: 0.17.0
  Summary: A Claude Code clone built with Agno agents
  Author-email: Estevao <estevaofon@gmail.com>
  License-Expression: MIT
@@ -4,7 +4,7 @@ build-backend = "setuptools.build_meta"

  [project]
  name = "aru-code"
- version = "0.16.0"
+ version = "0.17.0"
  description = "A Claude Code clone built with Agno agents"
  readme = "README.md"
  license = "MIT"
@@ -286,16 +286,32 @@ class TestSession:
          session.total_output_tokens = 500
          session.api_calls = 3
          summary = session.token_summary
-         assert "1,000" in summary or "1000" in summary
-         assert "calls: 3" in summary
+         assert "tokens: 1,500" in summary
+         assert "cost:" in summary

-     def test_token_summary_with_cache(self):
+     def test_token_summary_with_context(self):
+         session = Session()
+         session.total_input_tokens = 1000
+         session.total_output_tokens = 500
+         session.last_input_tokens = 800
+         session.last_output_tokens = 200
+         session.last_cache_read = 100
+         session.api_calls = 1
+         summary = session.token_summary
+         assert "context:" in summary
+         assert "cache_read:" in summary
+         assert "cost:" in summary
+
+     def test_cost_summary(self):
          session = Session()
          session.total_input_tokens = 100
          session.total_output_tokens = 50
          session.total_cache_read_tokens = 200
          session.api_calls = 1
-         assert "cached" in session.token_summary
+         summary = session.cost_summary
+         assert "Session cost:" in summary
+         assert "input:" in summary
+         assert "cache_read:" in summary

      def test_to_dict_and_from_dict(self):
          session = Session(session_id="test123")
@@ -1 +0,0 @@
- __version__ = "0.16.0"
@@ -1,133 +0,0 @@
- """Monkey-patch Agno's model layer to reduce token consumption.
-
- Two optimizations:
-
- 1. **Tool result pruning** (ALL providers): After each tool execution, old tool
-    results in the message list are truncated to a short summary. This prevents
-    O(n²) token growth where each API call re-sends all previous tool results.
-
- 2. **Cache breakpoints** (Anthropic only): Marks the last 2 messages with
-    cache_control for Anthropic's prompt caching.
-
- These patches intercept Agno's internal loop so they work transparently
- regardless of which provider is used.
- """
-
- from __future__ import annotations
-
- # Max chars to keep from old tool results
- _TOOL_RESULT_KEEP_CHARS = 200
- # Number of recent tool results to keep in full
- _KEEP_RECENT_RESULTS = 1
-
-
- def _prune_tool_messages(messages):
-     """Truncate old tool result content in the message list.
-
-     Keeps only the last N tool results in full. Older ones are truncated
-     to a short preview. This runs BEFORE each API call, so accumulated
-     tool results don't bloat the context on every re-send.
-     """
-     # Find all tool message indices
-     tool_indices = [
-         i for i, msg in enumerate(messages)
-         if getattr(msg, "role", None) == "tool"
-     ]
-
-     if len(tool_indices) <= _KEEP_RECENT_RESULTS:
-         return
-
-     # Prune all except the last N
-     for idx in tool_indices[:-_KEEP_RECENT_RESULTS]:
-         msg = messages[idx]
-         content = getattr(msg, "content", None)
-         if content is None:
-             continue
-
-         content_str = str(content)
-         if len(content_str) <= _TOOL_RESULT_KEEP_CHARS:
-             continue
-
-         truncated = content_str[:_TOOL_RESULT_KEEP_CHARS] + "\n[...truncated]"
-         try:
-             msg.content = truncated
-             if hasattr(msg, "compressed_content"):
-                 msg.compressed_content = None
-         except (AttributeError, TypeError):
-             pass
-
-
- def apply_cache_patch():
-     """Apply all patches to reduce Agno's token consumption."""
-     _patch_tool_result_pruning()
-     _patch_claude_cache_breakpoints()
-
-
- def _patch_tool_result_pruning():
-     """Patch format_function_call_results to prune old tool results.
-
-     This is called after each tool execution, right before the next API call.
-     Works for ALL providers (Claude, OpenAI, Qwen, etc.) since it patches
-     the base Model class.
-     """
-     from agno.models.base import Model
-
-     _original_format_results = Model.format_function_call_results
-
-     def _patched_format_results(self, messages, function_call_results, **kwargs):
-         # First: prune old tool results already in messages
-         _prune_tool_messages(messages)
-         # Then: add new results normally
-         return _original_format_results(self, messages, function_call_results, **kwargs)
-
-     Model.format_function_call_results = _patched_format_results
-
-
- def _patch_claude_cache_breakpoints():
-     """Patch Claude's format_messages to add cache breakpoints.
-
-     Marks the last 2 messages with cache_control for Anthropic's prompt
-     caching. Non-Anthropic providers ignore these fields.
-     """
-     try:
-         import agno.utils.models.claude as claude_utils
-     except ImportError:
-         return
-
-     _original_format = claude_utils.format_messages
-
-     def _patched_format_messages(messages, compress_tool_results=False):
-         chat_messages, system_message = _original_format(
-             messages, compress_tool_results=compress_tool_results
-         )
-
-         if not chat_messages:
-             return chat_messages, system_message
-
-         # Add cache_control to last 2 messages
-         cache_marker = {"type": "ephemeral"}
-         marked = 0
-         for msg in reversed(chat_messages):
-             if marked >= 2:
-                 break
-             content = msg.get("content")
-             if isinstance(content, list) and content:
-                 last_item = content[-1]
-                 if isinstance(last_item, dict):
-                     last_item["cache_control"] = cache_marker
-                     marked += 1
-                 elif hasattr(last_item, "type"):
-                     try:
-                         as_dict = last_item.model_dump() if hasattr(last_item, "model_dump") else dict(last_item)
-                         as_dict["cache_control"] = cache_marker
-                         content[-1] = as_dict
-                         marked += 1
-                     except Exception:
-                         pass
-             elif isinstance(content, str):
-                 msg["content"] = [{"type": "text", "text": content, "cache_control": cache_marker}]
-                 marked += 1
-
-         return chat_messages, system_message
-
-     claude_utils.format_messages = _patched_format_messages