aru-code 0.16.0__tar.gz → 0.17.0__tar.gz

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (58)
  1. {aru_code-0.16.0/aru_code.egg-info → aru_code-0.17.0}/PKG-INFO +1 -1
  2. aru_code-0.17.0/aru/__init__.py +1 -0
  3. {aru_code-0.16.0 → aru_code-0.17.0}/aru/agent_factory.py +1 -6
  4. {aru_code-0.16.0 → aru_code-0.17.0}/aru/agents/base.py +5 -4
  5. aru_code-0.17.0/aru/cache_patch.py +201 -0
  6. {aru_code-0.16.0 → aru_code-0.17.0}/aru/cli.py +11 -1
  7. {aru_code-0.16.0 → aru_code-0.17.0}/aru/commands.py +1 -0
  8. {aru_code-0.16.0 → aru_code-0.17.0}/aru/context.py +12 -12
  9. {aru_code-0.16.0 → aru_code-0.17.0}/aru/runner.py +13 -18
  10. {aru_code-0.16.0 → aru_code-0.17.0}/aru/session.py +131 -8
  11. {aru_code-0.16.0 → aru_code-0.17.0}/aru/tools/codebase.py +1 -1
  12. {aru_code-0.16.0 → aru_code-0.17.0/aru_code.egg-info}/PKG-INFO +1 -1
  13. {aru_code-0.16.0 → aru_code-0.17.0}/pyproject.toml +1 -1
  14. {aru_code-0.16.0 → aru_code-0.17.0}/tests/test_cli.py +20 -4
  15. aru_code-0.16.0/aru/__init__.py +0 -1
  16. aru_code-0.16.0/aru/cache_patch.py +0 -133
  17. {aru_code-0.16.0 → aru_code-0.17.0}/LICENSE +0 -0
  18. {aru_code-0.16.0 → aru_code-0.17.0}/README.md +0 -0
  19. {aru_code-0.16.0 → aru_code-0.17.0}/aru/agents/__init__.py +0 -0
  20. {aru_code-0.16.0 → aru_code-0.17.0}/aru/agents/executor.py +0 -0
  21. {aru_code-0.16.0 → aru_code-0.17.0}/aru/agents/planner.py +0 -0
  22. {aru_code-0.16.0 → aru_code-0.17.0}/aru/completers.py +0 -0
  23. {aru_code-0.16.0 → aru_code-0.17.0}/aru/config.py +0 -0
  24. {aru_code-0.16.0 → aru_code-0.17.0}/aru/display.py +0 -0
  25. {aru_code-0.16.0 → aru_code-0.17.0}/aru/permissions.py +0 -0
  26. {aru_code-0.16.0 → aru_code-0.17.0}/aru/providers.py +0 -0
  27. {aru_code-0.16.0 → aru_code-0.17.0}/aru/runtime.py +0 -0
  28. {aru_code-0.16.0 → aru_code-0.17.0}/aru/tools/__init__.py +0 -0
  29. {aru_code-0.16.0 → aru_code-0.17.0}/aru/tools/ast_tools.py +0 -0
  30. {aru_code-0.16.0 → aru_code-0.17.0}/aru/tools/gitignore.py +0 -0
  31. {aru_code-0.16.0 → aru_code-0.17.0}/aru/tools/mcp_client.py +0 -0
  32. {aru_code-0.16.0 → aru_code-0.17.0}/aru/tools/ranker.py +0 -0
  33. {aru_code-0.16.0 → aru_code-0.17.0}/aru/tools/tasklist.py +0 -0
  34. {aru_code-0.16.0 → aru_code-0.17.0}/aru_code.egg-info/SOURCES.txt +0 -0
  35. {aru_code-0.16.0 → aru_code-0.17.0}/aru_code.egg-info/dependency_links.txt +0 -0
  36. {aru_code-0.16.0 → aru_code-0.17.0}/aru_code.egg-info/entry_points.txt +0 -0
  37. {aru_code-0.16.0 → aru_code-0.17.0}/aru_code.egg-info/requires.txt +0 -0
  38. {aru_code-0.16.0 → aru_code-0.17.0}/aru_code.egg-info/top_level.txt +0 -0
  39. {aru_code-0.16.0 → aru_code-0.17.0}/setup.cfg +0 -0
  40. {aru_code-0.16.0 → aru_code-0.17.0}/tests/test_agents_base.py +0 -0
  41. {aru_code-0.16.0 → aru_code-0.17.0}/tests/test_cli_advanced.py +0 -0
  42. {aru_code-0.16.0 → aru_code-0.17.0}/tests/test_cli_base.py +0 -0
  43. {aru_code-0.16.0 → aru_code-0.17.0}/tests/test_cli_completers.py +0 -0
  44. {aru_code-0.16.0 → aru_code-0.17.0}/tests/test_cli_new.py +0 -0
  45. {aru_code-0.16.0 → aru_code-0.17.0}/tests/test_cli_run_cli.py +0 -0
  46. {aru_code-0.16.0 → aru_code-0.17.0}/tests/test_cli_session.py +0 -0
  47. {aru_code-0.16.0 → aru_code-0.17.0}/tests/test_cli_shell.py +0 -0
  48. {aru_code-0.16.0 → aru_code-0.17.0}/tests/test_codebase.py +0 -0
  49. {aru_code-0.16.0 → aru_code-0.17.0}/tests/test_config.py +0 -0
  50. {aru_code-0.16.0 → aru_code-0.17.0}/tests/test_context.py +0 -0
  51. {aru_code-0.16.0 → aru_code-0.17.0}/tests/test_executor.py +0 -0
  52. {aru_code-0.16.0 → aru_code-0.17.0}/tests/test_gitignore.py +0 -0
  53. {aru_code-0.16.0 → aru_code-0.17.0}/tests/test_main.py +0 -0
  54. {aru_code-0.16.0 → aru_code-0.17.0}/tests/test_mcp_client.py +0 -0
  55. {aru_code-0.16.0 → aru_code-0.17.0}/tests/test_permissions.py +0 -0
  56. {aru_code-0.16.0 → aru_code-0.17.0}/tests/test_planner.py +0 -0
  57. {aru_code-0.16.0 → aru_code-0.17.0}/tests/test_providers.py +0 -0
  58. {aru_code-0.16.0 → aru_code-0.17.0}/tests/test_ranker.py +0 -0
@@ -1,6 +1,6 @@
  Metadata-Version: 2.4
  Name: aru-code
- Version: 0.16.0
+ Version: 0.17.0
  Summary: A Claude Code clone built with Agno agents
  Author-email: Estevao <estevaofon@gmail.com>
  License-Expression: MIT
@@ -0,0 +1 @@
+ __version__ = "0.17.0"
@@ -25,12 +25,7 @@ def create_general_agent(
      from aru.tools.codebase import GENERAL_TOOLS
      tools = GENERAL_TOOLS

-     # Only include AGENTS.md/project instructions on first turn to save ~1.6K tokens/turn
-     if config and not session.extra_instructions_sent:
-         extra = config.get_extra_instructions()
-         session.extra_instructions_sent = True
-     else:
-         extra = ""
+     extra = config.get_extra_instructions() if config else ""
      if env_context:
          extra = f"{extra}\n\n{env_context}" if extra else env_context
      model_ref = model_override or session.model_ref
@@ -48,12 +48,12 @@ Default shows 10 lines of context — use `context_lines=30` for full function b
  2. **Understand a file** → `read_file_smart(path, query)` — returns a concise answer, not raw content
  3. **Need raw content** → `read_file(path)` — returns first chunk + outline for large files

- **Stop early**: Once you have enough information to write the plan, STOP making tool calls \
- immediately. Do not exhaustively explore.
-
  **Batch independent tool calls**: When you need answers from multiple independent sources, \
  emit ALL those tool calls in a single response.

+ **Stop early**: Once you have enough information to write the plan, stop exploring and write it. \
+ Do not exhaustively read every file — batch what you need, then produce the plan. \
+
  ## Output format — STRICT

  Your ONLY output is the plan below. Do NOT write analysis, coverage reports, summaries of
@@ -182,7 +182,8 @@ Every tool call accumulates its result in your context window. Use the minimum n

  **Batch independent tool calls**: emit ALL independent tool calls in a single response.

- **Stop early**: Once you have enough information to do the work, STOP exploring and start working.
+ **Stop early**: Once you have enough information to act, stop exploring and start working. \
+ Batch what you need upfront, then execute.

  **When adding or modifying unit tests, ALWAYS run them to verify they pass before finishing.**

@@ -0,0 +1,201 @@
+ """Monkey-patch Agno's model layer to reduce token consumption.
+
+ Three optimizations:
+
+ 1. **Tool result pruning** (ALL providers): After each tool execution, old tool
+    results in the message list are truncated to a short summary. This prevents
+    O(n²) token growth where each API call re-sends all previous tool results.
+
+ 2. **Cache breakpoints** (Anthropic only): Marks the last 2 messages with
+    cache_control for Anthropic's prompt caching.
+
+ 3. **Per-call metrics** (ALL providers): Captures input/output tokens of the
+    last API call (context window size), exposed via get_last_call_metrics().
+
+ These patches intercept Agno's internal loop so they work transparently
+ regardless of which provider is used.
+ """
+
+ from __future__ import annotations
+
+ # Token-budget pruning (aligned with OpenCode's strategy):
+ # - Protect recent tool results within a token budget
+ # - Only prune if there's enough to free (avoid churn)
+ # - Walk backwards, protecting recent content first
+ # OpenCode uses 40K protect / 20K minimum; we use chars (~4 chars/token)
+ _PRUNE_PROTECT_CHARS = 160_000  # ~40K tokens — recent content always kept
+ _PRUNE_MINIMUM_CHARS = 80_000  # ~20K tokens — only prune if this much is freeable
+ _PRUNED_PLACEHOLDER = "[Old tool result cleared]"
+
+ # Last API call metrics (updated on every internal API call)
+ _last_call_input_tokens: int = 0
+ _last_call_output_tokens: int = 0
+ _last_call_cache_read: int = 0
+ _last_call_cache_write: int = 0
+
+
+ def get_last_call_metrics() -> tuple[int, int, int, int]:
+     """Return (input, output, cache_read, cache_write) from the most recent API call."""
+     return _last_call_input_tokens, _last_call_output_tokens, _last_call_cache_read, _last_call_cache_write
+
+
+ def _prune_tool_messages(messages):
+     """Clear old tool result content using a token-budget approach.
+
+     Walks backwards through messages, protecting recent content up to
+     PRUNE_PROTECT_CHARS. Older tool results beyond that budget are replaced
+     with a short placeholder. Only prunes if total freeable chars exceed
+     PRUNE_MINIMUM_CHARS (avoids unnecessary churn on small conversations).
+
+     Aligned with OpenCode's strategy: budget-based, not fixed-N.
+     """
+     # Collect tool message indices and their content sizes
+     tool_indices = []
+     for i, msg in enumerate(messages):
+         if getattr(msg, "role", None) == "tool":
+             content = getattr(msg, "content", None)
+             content_len = len(str(content)) if content is not None else 0
+             tool_indices.append((i, content_len))
+
+     if not tool_indices:
+         return
+
+     # Walk backwards, accumulating protected chars
+     protected_chars = 0
+     prune_candidates = []  # (index, content_len) of messages outside protection
+
+     for idx, content_len in reversed(tool_indices):
+         if protected_chars + content_len <= _PRUNE_PROTECT_CHARS:
+             protected_chars += content_len
+         else:
+             prune_candidates.append((idx, content_len))
+
+     # Only prune if there's enough to free
+     freeable = sum(cl for _, cl in prune_candidates)
+     if freeable < _PRUNE_MINIMUM_CHARS:
+         return
+
+     # Replace old tool results with placeholder
+     for idx, _ in prune_candidates:
+         msg = messages[idx]
+         content = getattr(msg, "content", None)
+         if content is None:
+             continue
+         # Skip if already pruned
+         if str(content) == _PRUNED_PLACEHOLDER:
+             continue
+         try:
+             msg.content = _PRUNED_PLACEHOLDER
+             if hasattr(msg, "compressed_content"):
+                 msg.compressed_content = None
+         except (AttributeError, TypeError):
+             pass
+
+
+ def apply_cache_patch():
+     """Apply all patches to reduce Agno's token consumption."""
+     _patch_tool_result_pruning()
+     _patch_claude_cache_breakpoints()
+     _patch_per_call_metrics()
+
+
+ def _patch_tool_result_pruning():
+     """Patch format_function_call_results to prune old tool results.
+
+     This is called after each tool execution, right before the next API call.
+     Works for ALL providers (Claude, OpenAI, Qwen, etc.) since it patches
+     the base Model class.
+     """
+     from agno.models.base import Model
+
+     _original_format_results = Model.format_function_call_results
+
+     def _patched_format_results(self, messages, function_call_results, **kwargs):
+         # First: prune old tool results already in messages
+         _prune_tool_messages(messages)
+         # Then: add new results normally
+         return _original_format_results(self, messages, function_call_results, **kwargs)
+
+     Model.format_function_call_results = _patched_format_results
+
+
+ def _patch_claude_cache_breakpoints():
+     """Patch Claude's format_messages to add cache breakpoints.
+
+     Marks the last 2 messages with cache_control for Anthropic's prompt
+     caching. Non-Anthropic providers ignore these fields.
+     """
+     try:
+         import agno.utils.models.claude as claude_utils
+     except ImportError:
+         return
+
+     _original_format = claude_utils.format_messages
+
+     def _patched_format_messages(messages, compress_tool_results=False):
+         chat_messages, system_message = _original_format(
+             messages, compress_tool_results=compress_tool_results
+         )
+
+         if not chat_messages:
+             return chat_messages, system_message
+
+         # Add cache_control to last 2 messages
+         cache_marker = {"type": "ephemeral"}
+         marked = 0
+         for msg in reversed(chat_messages):
+             if marked >= 2:
+                 break
+             content = msg.get("content")
+             if isinstance(content, list) and content:
+                 last_item = content[-1]
+                 if isinstance(last_item, dict):
+                     last_item["cache_control"] = cache_marker
+                     marked += 1
+                 elif hasattr(last_item, "type"):
+                     try:
+                         as_dict = last_item.model_dump() if hasattr(last_item, "model_dump") else dict(last_item)
+                         as_dict["cache_control"] = cache_marker
+                         content[-1] = as_dict
+                         marked += 1
+                     except Exception:
+                         pass
+             elif isinstance(content, str):
+                 msg["content"] = [{"type": "text", "text": content, "cache_control": cache_marker}]
+                 marked += 1
+
+         return chat_messages, system_message
+
+     claude_utils.format_messages = _patched_format_messages
+
+
+ def _patch_per_call_metrics():
+     """Patch accumulate_model_metrics to capture per-API-call token counts.
+
+     After each internal API call, Agno calls this function to sum tokens
+     into RunMetrics. We intercept it to snapshot the last call's tokens,
+     giving us the actual context window size (comparable to OpenCode/Claude Code).
+     """
+     from agno.metrics import accumulate_model_metrics as _original_accumulate
+
+     import agno.metrics as _metrics_module
+
+     def _patched_accumulate(model_response, model, model_type, run_metrics=None):
+         global _last_call_input_tokens, _last_call_output_tokens
+         global _last_call_cache_read, _last_call_cache_write
+         usage = getattr(model_response, "response_usage", None)
+         if usage is not None:
+             _last_call_input_tokens = getattr(usage, "input_tokens", 0) or 0
+             _last_call_output_tokens = getattr(usage, "output_tokens", 0) or 0
+             _last_call_cache_read = getattr(usage, "cache_read_tokens", 0) or 0
+             _last_call_cache_write = getattr(usage, "cache_write_tokens", 0) or 0
+         return _original_accumulate(model_response, model, model_type, run_metrics)
+
+     _metrics_module.accumulate_model_metrics = _patched_accumulate
+
+     # Also patch the reference in base.py since it may have imported directly
+     try:
+         import agno.models.base as _base_module
+         _base_module.accumulate_model_metrics = _patched_accumulate
+     except (ImportError, AttributeError):
+         pass
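The new module is applied once at startup and queried after each turn. A minimal usage sketch, not part of the diff (the surrounding call sites are hypothetical stand-ins for what aru/cli.py and aru/session.py do):

    from aru.cache_patch import apply_cache_patch, get_last_call_metrics

    apply_cache_patch()  # install all three patches before any model call

    # ... run one agent turn through Agno here ...

    inp, out, c_read, c_write = get_last_call_metrics()
    print(f"last call context: {inp + out + c_read + c_write:,} tokens")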
@@ -14,6 +14,7 @@ import os
  import sys

  from rich.markdown import Markdown
+ from rich.panel import Panel

  # ── Re-exports for backward compatibility ─────────────────────────────
  # Tests and external code import these from aru.cli; keep them accessible.
@@ -413,6 +414,15 @@ async def run_cli(skip_permissions: bool = False, resume_id: str | None = None):
              _show_help(config)
              continue

+         if user_input.lower() == "/cost":
+             console.print(Panel(
+                 session.cost_summary,
+                 title="[bold]Token Usage & Cost[/bold]",
+                 border_style="cyan",
+                 padding=(1, 2),
+             ))
+             continue
+
          if user_input.startswith("! "):
              cmd = user_input[2:].strip()
              if not cmd:
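For reference, a standalone sketch of the same Panel rendering (rich is already a dependency of this package; the summary string here is a placeholder for session.cost_summary):

    from rich.console import Console
    from rich.panel import Panel

    console = Console()
    summary = "Session cost: $0.01\nCumulative tokens: 1,500"  # placeholder text
    console.print(Panel(summary, title="[bold]Token Usage & Cost[/bold]",
                        border_style="cyan", padding=(1, 2)))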
@@ -518,7 +528,7 @@ async def run_cli(skip_permissions: bool = False, resume_id: str | None = None):
              session.add_message("assistant", run_result.with_tools_summary())
          else:
              console.print(f"[yellow]Unknown command: /{cmd_name}[/yellow]")
-             console.print(f"[dim]Built-in: /plan, /model, /sessions, /commands, /skills, /agents, /quit[/dim]")
+             console.print(f"[dim]Built-in: /plan, /model, /sessions, /commands, /skills, /agents, /cost, /quit[/dim]")
              if config.commands:
                  console.print(f"[dim]Custom: {', '.join(f'/{k}' for k in config.commands)}[/dim]")
              if config.skills:
@@ -21,6 +21,7 @@ SLASH_COMMANDS = [
      ("/skills", "List available skills", "/skills"),
      ("/agents", "List custom agents", "/agents"),
      ("/mcp", "List loaded MCP tools", "/mcp"),
+     ("/cost", "Show detailed token usage and cost", "/cost"),
      ("/quit", "Exit aru", "/quit"),
  ]

@@ -11,15 +11,15 @@ from __future__ import annotations
  # ── Constants ──────────────────────────────────────────────────────

  # Pruning: minimum chars that must be freeable to justify a prune pass
- PRUNE_MINIMUM_CHARS = 8_000  # ~2K tokens (was 12K — prune sooner)
+ PRUNE_MINIMUM_CHARS = 12_000  # ~3.5K tokens
  # Placeholder that replaces evicted content
  PRUNED_PLACEHOLDER = "[cleared]"
  # User messages larger than this threshold are truncated when outside protection window
- PRUNE_USER_MSG_THRESHOLD = 1_200  # ~340 tokens (was 2K — catch file contents earlier)
+ PRUNE_USER_MSG_THRESHOLD = 2_000  # ~570 tokens
  # How many chars to keep from the start of a pruned user message
- PRUNE_USER_MSG_KEEP = 300  # ~85 tokens (was 500 — enough for the request intent)
+ PRUNE_USER_MSG_KEEP = 500  # ~140 tokens
  # Minimum number of recent user turns always protected (regardless of char budget)
- PRUNE_PROTECT_TURNS = 1  # was 2 — only protect the very last turn
+ PRUNE_PROTECT_TURNS = 2
  # Tool result markers that should never be pruned (critical context)
  PRUNE_PROTECTED_MARKERS = {"[SubAgent-", "delegate_task"}
  # Tool names whose outputs should never be pruned (like OpenCode's PRUNE_PROTECTED_TOOLS)
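At the assumed ~4 chars/token ratio, the restored values mean a prune pass only runs once roughly 3K tokens of history are reclaimable, and an oversized user message (over ~570 tokens) keeps only its first ~500 chars. A standalone illustration of those two thresholds (mirrors the constants above; the placeholder shape is illustrative, not a call into aru.context):

    PRUNE_MINIMUM_CHARS = 12_000
    PRUNE_USER_MSG_THRESHOLD = 2_000
    PRUNE_USER_MSG_KEEP = 500

    def worth_pruning(freeable_chars: int) -> bool:
        # Skip the whole pass if too little would be reclaimed
        return freeable_chars >= PRUNE_MINIMUM_CHARS

    def truncate_user_msg(text: str) -> str:
        # Only messages beyond the threshold are cut; keep the head
        if len(text) <= PRUNE_USER_MSG_THRESHOLD:
            return text
        return text[:PRUNE_USER_MSG_KEEP] + " [cleared]"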
@@ -27,20 +27,20 @@ PRUNE_PROTECTED_MARKERS = {"[SubAgent-", "delegate_task"}
  PRUNE_PROTECTED_TOOLS = {"delegate_task"}

  # Truncation: universal limits for any tool output
- TRUNCATE_MAX_LINES = 200  # was 300 — tighter to save context
- TRUNCATE_MAX_BYTES = 10 * 1024  # 10 KB (was 15KB — save full to disk instead)
+ TRUNCATE_MAX_LINES = 300
+ TRUNCATE_MAX_BYTES = 15 * 1024  # 15 KB
  TRUNCATE_KEEP_START = 150  # lines to keep from the start
- TRUNCATE_KEEP_END = 30  # lines to keep from the end (was 60)
+ TRUNCATE_KEEP_END = 60  # lines to keep from the end
  TRUNCATE_MAX_LINE_LENGTH = 1500  # chars per individual line (prevents minified files)
  # Directory for saving full truncated outputs (like OpenCode pattern)
  TRUNCATE_SAVE_DIR = ".aru/truncated"

  # Compaction: trigger when per-run input tokens exceed this fraction of model limit
- COMPACTION_THRESHOLD_RATIO = 0.50  # was 0.70 — compact much earlier to stay lean
+ COMPACTION_THRESHOLD_RATIO = 0.70
  # Compaction: target post-compaction size as fraction of model context limit
- COMPACTION_TARGET_RATIO = 0.10  # was 0.15 — more aggressive compaction target
+ COMPACTION_TARGET_RATIO = 0.15
  # Compaction: also trigger after this many user turns (regardless of token count)
- COMPACTION_MAX_TURNS = 8
+ COMPACTION_MAX_TURNS = 15
  # Compaction: reserve buffer for the compaction process itself (like OpenCode's 20K)
  COMPACTION_BUFFER_TOKENS = 20_000
  # Default model context limits (input tokens)
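As a worked example of the restored ratios: for a model with a 200K-token input limit, compaction now triggers only once a run's input tokens exceed 0.70 × 200,000 = 140,000, and aims for roughly 0.15 × 200,000 = 30,000 tokens afterwards. A standalone check under those assumptions (the package's actual trigger logic is should_compact, elsewhere in aru/context.py):

    COMPACTION_THRESHOLD_RATIO = 0.70
    COMPACTION_TARGET_RATIO = 0.15

    def compaction_points(context_limit: int) -> tuple[int, int]:
        # (trigger threshold, post-compaction target), in input tokens
        return (round(context_limit * COMPACTION_THRESHOLD_RATIO),
                round(context_limit * COMPACTION_TARGET_RATIO))

    assert compaction_points(200_000) == (140_000, 30_000)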
@@ -115,8 +115,8 @@ def _get_prune_protect_chars(model_id: str = "default") -> int:
      to prevent context overflow. Returns ~7% of the model's context in chars.
      """
      limit = MODEL_CONTEXT_LIMITS.get(model_id, MODEL_CONTEXT_LIMITS["default"])
-     # ~4 chars per token, protect ~5% of context (was 7% — tighter budget)
-     protect = int(limit * 0.05 * 4)
+     # ~4 chars per token, protect ~7% of context
+     protect = int(limit * 0.07 * 4)
      # Clamp between 10K (minimum usable) and 40K (diminishing returns)
      return max(10_000, min(protect, 40_000))

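Plugging in numbers: a 200K-token model yields int(200_000 * 0.07 * 4) = 56_000 chars, clamped down to 40_000, while a 32K model yields 8_960, raised to the 10_000 floor. A standalone check mirroring the function above:

    def prune_protect_chars(limit: int) -> int:
        protect = int(limit * 0.07 * 4)
        return max(10_000, min(protect, 40_000))

    assert prune_protect_chars(200_000) == 40_000  # clamped from 56_000
    assert prune_protect_chars(32_000) == 10_000   # raised from 8_960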
@@ -115,9 +115,6 @@ async def run_agent_capture(agent, message: str, session=None, lightweight: bool
      run_message = message

      # Build conversation history as real messages for the LLM
-     # Compact BEFORE pruning: if the history is large enough that pruning
-     # would discard content, compact first to preserve context via summary
-     # instead of losing it to placeholders.
      from aru.context import prune_history, should_compact, compact_conversation, would_prune
      if session and session.history and not lightweight:
          if would_prune(session.history, model_id=session.model_id):
@@ -242,23 +239,21 @@ async def run_agent_capture(agent, message: str, session=None, lightweight: bool
      if run_output and session and hasattr(run_output, "metrics"):
          session.track_tokens(run_output.metrics)

-         # Reactive compaction: use per-run input_tokens (sum of all API
-         # calls within this arun) as a conservative proxy for context pressure.
-         # session.history doesn't include tool results, so char-based estimates
-         # would miss the bulk of the context sent to the model.
+         # Reactive compaction: runs with a visible spinner so the user
+         # sees progress instead of a frozen screen.
          run_input_tokens = getattr(run_output.metrics, "input_tokens", 0) or 0
          if should_compact(run_input_tokens, session.model_id):
-             try:
-                 # Always prune first to shrink history before compaction
-                 session.history = prune_history(session.history, model_id=session.model_id)
-                 session.history = await compact_conversation(
-                     session.history, session.model_ref, session.plan_task,
-                     model_id=session.model_id,
-                 )
-                 console.print("[dim]Context compacted to save tokens.[/dim]")
-             except Exception:
-                 # Even if compaction fails, keep the pruned history
-                 pass
+             from rich.status import Status
+             with Status("[dim]Compacting context...[/dim]", console=console, spinner="dots"):
+                 try:
+                     session.history = prune_history(session.history, model_id=session.model_id)
+                     session.history = await compact_conversation(
+                         session.history, session.model_ref, session.plan_task,
+                         model_id=session.model_id,
+                     )
+                     console.print("[dim]Context compacted to save tokens.[/dim]")
+                 except Exception:
+                     pass

      final_content = accumulated or final_content
      remaining = (final_content or "")[display._flushed_len:]
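The spinner is plain rich.status usage. A self-contained sketch of the same pattern (the sleep is a stand-in for the actual prune_history plus compact_conversation work):

    import time
    from rich.console import Console
    from rich.status import Status

    console = Console()
    with Status("[dim]Compacting context...[/dim]", console=console, spinner="dots"):
        time.sleep(1)  # stand-in for pruning and compaction
    console.print("[dim]Context compacted to save tokens.[/dim]")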
@@ -16,6 +16,46 @@ from aru.providers import MODEL_ALIASES, get_model_display, resolve_model_ref
  # Default model reference (provider/model format)
  DEFAULT_MODEL = "anthropic/claude-sonnet-4-5"

+ # Pricing per million tokens (USD). Cache read/write have separate rates.
+ # Format: {model_id_prefix: (input, output, cache_read, cache_write)}
+ # Prices as of 2025-05. Models not listed fall back to "default".
+ MODEL_PRICING: dict[str, tuple[float, float, float, float]] = {
+     # Anthropic (input, output, cache_read=10%, cache_write=125%)
+     "claude-sonnet-4-5": (3.00, 15.00, 0.30, 3.75),
+     "claude-sonnet-4-6": (3.00, 15.00, 0.30, 3.75),
+     "claude-opus-4": (15.00, 75.00, 1.50, 18.75),
+     "claude-opus-4-6": (15.00, 75.00, 1.50, 18.75),
+     "claude-haiku-3-5": (0.80, 4.00, 0.08, 1.00),
+     "claude-haiku-4-5": (1.00, 5.00, 0.10, 1.25),
+     # OpenAI
+     "gpt-4o": (2.50, 10.00, 1.25, 2.50),
+     "gpt-4o-mini": (0.15, 0.60, 0.075, 0.15),
+     "gpt-4.1": (2.00, 8.00, 0.50, 2.00),
+     "gpt-4.1-mini": (0.40, 1.60, 0.10, 0.40),
+     "gpt-4.1-nano": (0.10, 0.40, 0.025, 0.10),
+     "o3": (2.00, 8.00, 0.50, 2.00),
+     "o3-mini": (1.10, 4.40, 0.275, 1.10),
+     "o4-mini": (1.10, 4.40, 0.275, 1.10),
+     # Qwen / DashScope (<=256K tier, explicit cache: creation=125%, hit=10%)
+     "qwen3-plus": (0.50, 3.00, 0.05, 0.625),
+     "qwen3.6-plus": (0.50, 3.00, 0.05, 0.625),
+     "qwen-plus": (0.50, 3.00, 0.05, 0.625),
+     "qwen-max": (2.00, 6.00, 0.20, 2.50),
+     "qwen-turbo": (0.30, 0.60, 0.03, 0.375),
+     "qwen3-coder-plus": (0.50, 3.00, 0.05, 0.625),
+     # DeepSeek
+     "deepseek-chat": (0.27, 1.10, 0.07, 0.27),
+     "deepseek-reasoner": (0.55, 2.19, 0.14, 0.55),
+     # Google Gemini (via OpenRouter)
+     "gemini-2.5-pro": (1.25, 10.00, 0.315, 1.25),
+     "gemini-2.5-flash": (0.15, 0.60, 0.0375, 0.15),
+     # Groq (free tier / very cheap)
+     "llama-3.3-70b": (0.59, 0.79, 0.0, 0.0),
+     "llama-3.1": (0.05, 0.08, 0.0, 0.0),
+     # Fallback
+     "default": (3.00, 15.00, 0.30, 3.75),
+ }
+
  SESSIONS_DIR = os.path.join(".aru", "sessions")


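To see how these tuples are consumed by the estimated_cost property added later in this diff: with claude-sonnet-4-5 at (3.00, 15.00, 0.30, 3.75), a session totalling 1M input tokens (of which 800K were cache reads and 50K cache writes) plus 100K output tokens costs about $2.38. Standalone arithmetic, not package code:

    price_in, price_out, price_read, price_write = (3.00, 15.00, 0.30, 3.75)
    total_in, cache_read, cache_write, total_out = 1_000_000, 800_000, 50_000, 100_000

    base_input = max(0, total_in - cache_read - cache_write)  # 150_000 uncached
    cost = (base_input * price_in
            + total_out * price_out
            + cache_read * price_read
            + cache_write * price_write) / 1_000_000
    print(f"${cost:.4f}")  # -> $2.3775 = 0.45 + 1.50 + 0.24 + 0.1875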
@@ -141,12 +181,15 @@ class Session:
          self.total_cache_read_tokens: int = 0
          self.total_cache_write_tokens: int = 0
          self.api_calls: int = 0
+         # Per-call metrics: last API call's context window (set by cache_patch)
+         self.last_input_tokens: int = 0
+         self.last_output_tokens: int = 0
+         self.last_cache_read: int = 0
+         self.last_cache_write: int = 0
          # Context cache — invalidated on file mutations
          self._cached_tree: str | None = None
          self._cached_git_status: str | None = None
          self._context_dirty: bool = True
-         # Track whether AGENTS.md/extra instructions were already sent (skip on subsequent turns)
-         self.extra_instructions_sent: bool = False
          # Tree depth for env context (configurable via aru.json "tree_depth")
          self._tree_max_depth: int = 2
          # Token budget (0 = unlimited)
@@ -198,20 +241,100 @@ class Session:
          self.total_cache_read_tokens += getattr(metrics, "cache_read_tokens", 0) or 0
          self.total_cache_write_tokens += getattr(metrics, "cache_write_tokens", 0) or 0
          self.api_calls += 1
+         # Capture last API call's context window (set by cache_patch)
+         try:
+             from aru.cache_patch import get_last_call_metrics
+             self.last_input_tokens, self.last_output_tokens, self.last_cache_read, self.last_cache_write = get_last_call_metrics()
+         except ImportError:
+             self.last_input_tokens = getattr(metrics, "input_tokens", 0) or 0
+             self.last_output_tokens = getattr(metrics, "output_tokens", 0) or 0
+             self.last_cache_read = 0
+             self.last_cache_write = 0
+
+     def _get_pricing(self) -> tuple[float, float, float, float]:
+         """Get per-million-token pricing for the current model."""
+         model_id = self.model_id
+         # Try exact match, then prefix match, then fallback
+         for prefix, pricing in MODEL_PRICING.items():
+             if prefix == "default":
+                 continue
+             if model_id.startswith(prefix):
+                 return pricing
+         return MODEL_PRICING["default"]
+
+     @property
+     def estimated_cost(self) -> float:
+         """Estimate cumulative cost in USD based on token usage and model pricing.
+
+         For input tokens, subtracts cache_read (charged at cache rate) and
+         cache_write (charged at write rate) from the base input count.
+         """
+         price_in, price_out, price_cache_read, price_cache_write = self._get_pricing()
+         # Non-cached input = total input - cache_read - cache_write
+         base_input = max(0, self.total_input_tokens - self.total_cache_read_tokens - self.total_cache_write_tokens)
+         cost = (
+             base_input * price_in / 1_000_000
+             + self.total_output_tokens * price_out / 1_000_000
+             + self.total_cache_read_tokens * price_cache_read / 1_000_000
+             + self.total_cache_write_tokens * price_cache_write / 1_000_000
+         )
+         return cost

      @property
      def token_summary(self) -> str:
+         """One-line summary shown after each response: context window + cost."""
+         if self.last_input_tokens <= 0 and self.total_input_tokens == 0:
+             return ""
+         cost = self.estimated_cost
+         cost_str = f"${cost:.4f}" if cost < 0.01 else f"${cost:.2f}"
+         if self.last_input_tokens > 0:
+             ctx_total = self.last_input_tokens + self.last_output_tokens + self.last_cache_read + self.last_cache_write
+             parts = [f"in: {self.last_input_tokens:,}", f"out: {self.last_output_tokens:,}"]
+             if self.last_cache_read > 0:
+                 parts.append(f"cache_read: {self.last_cache_read:,}")
+             if self.last_cache_write > 0:
+                 parts.append(f"cache_write: {self.last_cache_write:,}")
+             return f"context: {ctx_total:,} ({' / '.join(parts)}) | cost: {cost_str}"
+         # Fallback when per-call metrics aren't available
+         total = self.total_input_tokens + self.total_output_tokens
+         return f"tokens: {total:,} | cost: {cost_str}"
+
+     @property
+     def cost_summary(self) -> str:
+         """Detailed cost breakdown for /cost command."""
          total = self.total_input_tokens + self.total_output_tokens
          if total == 0:
-             return ""
-         metrics_str = f"in: {self.total_input_tokens:,} / out: {self.total_output_tokens:,}"
+             return "No token usage yet."
+         cost = self.estimated_cost
+         cost_str = f"${cost:.4f}" if cost < 0.01 else f"${cost:.2f}"
+         lines = [
+             f"Session cost: {cost_str}",
+             f"",
+             f"Cumulative tokens:",
+             f" input: {self.total_input_tokens:,}",
+             f" output: {self.total_output_tokens:,}",
+         ]
          if self.total_cache_read_tokens > 0:
-             metrics_str += f" / cached: {self.total_cache_read_tokens:,}"
-         summary = f"tokens: {total:,} ({metrics_str}) | calls: {self.api_calls}"
+             lines.append(f" cache_read: {self.total_cache_read_tokens:,}")
+         if self.total_cache_write_tokens > 0:
+             lines.append(f" cache_write: {self.total_cache_write_tokens:,}")
+         lines.append(f" total: {total:,}")
+         lines.append(f" api calls: {self.api_calls}")
+         if self.last_input_tokens > 0:
+             ctx_total = self.last_input_tokens + self.last_output_tokens + self.last_cache_read + self.last_cache_write
+             lines.append(f"")
+             lines.append(f"Last context window: {ctx_total:,}")
+             lines.append(f" input: {self.last_input_tokens:,}")
+             lines.append(f" output: {self.last_output_tokens:,}")
+             if self.last_cache_read > 0:
+                 lines.append(f" cache_read: {self.last_cache_read:,}")
+             if self.last_cache_write > 0:
+                 lines.append(f" cache_write: {self.last_cache_write:,}")
          if self.token_budget > 0:
              pct = int(total / self.token_budget * 100)
-             summary += f" | budget: {pct}%"
-         return summary
+             lines.append(f"")
+             lines.append(f"Budget: {pct}% used")
+         return "\n".join(lines)

      def invalidate_context_cache(self):
          """Mark cached tree/git status as stale. Call after file mutations."""
@@ -55,7 +55,7 @@ def _format_diff(old_string: str, new_string: str) -> Group:


  # Hard ceiling per tool result (~7K tokens). Even max_size=0 respects this per chunk.
- _READ_HARD_CAP = 25_000  # bytes (was 40K — each tool result re-sent on next API call)
+ _READ_HARD_CAP = 40_000  # bytes (~11K tokens)

  def clear_read_cache():
      """Clear the read cache. Call after file mutations to avoid stale data."""
@@ -1,6 +1,6 @@
  Metadata-Version: 2.4
  Name: aru-code
- Version: 0.16.0
+ Version: 0.17.0
  Summary: A Claude Code clone built with Agno agents
  Author-email: Estevao <estevaofon@gmail.com>
  License-Expression: MIT
@@ -4,7 +4,7 @@ build-backend = "setuptools.build_meta"

  [project]
  name = "aru-code"
- version = "0.16.0"
+ version = "0.17.0"
  description = "A Claude Code clone built with Agno agents"
  readme = "README.md"
  license = "MIT"
@@ -286,16 +286,32 @@ class TestSession:
          session.total_output_tokens = 500
          session.api_calls = 3
          summary = session.token_summary
-         assert "1,000" in summary or "1000" in summary
-         assert "calls: 3" in summary
+         assert "tokens: 1,500" in summary
+         assert "cost:" in summary

-     def test_token_summary_with_cache(self):
+     def test_token_summary_with_context(self):
+         session = Session()
+         session.total_input_tokens = 1000
+         session.total_output_tokens = 500
+         session.last_input_tokens = 800
+         session.last_output_tokens = 200
+         session.last_cache_read = 100
+         session.api_calls = 1
+         summary = session.token_summary
+         assert "context:" in summary
+         assert "cache_read:" in summary
+         assert "cost:" in summary
+
+     def test_cost_summary(self):
          session = Session()
          session.total_input_tokens = 100
          session.total_output_tokens = 50
          session.total_cache_read_tokens = 200
          session.api_calls = 1
-         assert "cached" in session.token_summary
+         summary = session.cost_summary
+         assert "Session cost:" in summary
+         assert "input:" in summary
+         assert "cache_read:" in summary

      def test_to_dict_and_from_dict(self):
          session = Session(session_id="test123")
@@ -1 +0,0 @@
- __version__ = "0.16.0"
@@ -1,133 +0,0 @@
- """Monkey-patch Agno's model layer to reduce token consumption.
-
- Two optimizations:
-
- 1. **Tool result pruning** (ALL providers): After each tool execution, old tool
-    results in the message list are truncated to a short summary. This prevents
-    O(n²) token growth where each API call re-sends all previous tool results.
-
- 2. **Cache breakpoints** (Anthropic only): Marks the last 2 messages with
-    cache_control for Anthropic's prompt caching.
-
- These patches intercept Agno's internal loop so they work transparently
- regardless of which provider is used.
- """
-
- from __future__ import annotations
-
- # Max chars to keep from old tool results
- _TOOL_RESULT_KEEP_CHARS = 200
- # Number of recent tool results to keep in full
- _KEEP_RECENT_RESULTS = 1
-
-
- def _prune_tool_messages(messages):
-     """Truncate old tool result content in the message list.
-
-     Keeps only the last N tool results in full. Older ones are truncated
-     to a short preview. This runs BEFORE each API call, so accumulated
-     tool results don't bloat the context on every re-send.
-     """
-     # Find all tool message indices
-     tool_indices = [
-         i for i, msg in enumerate(messages)
-         if getattr(msg, "role", None) == "tool"
-     ]
-
-     if len(tool_indices) <= _KEEP_RECENT_RESULTS:
-         return
-
-     # Prune all except the last N
-     for idx in tool_indices[:-_KEEP_RECENT_RESULTS]:
-         msg = messages[idx]
-         content = getattr(msg, "content", None)
-         if content is None:
-             continue
-
-         content_str = str(content)
-         if len(content_str) <= _TOOL_RESULT_KEEP_CHARS:
-             continue
-
-         truncated = content_str[:_TOOL_RESULT_KEEP_CHARS] + "\n[...truncated]"
-         try:
-             msg.content = truncated
-             if hasattr(msg, "compressed_content"):
-                 msg.compressed_content = None
-         except (AttributeError, TypeError):
-             pass
-
-
- def apply_cache_patch():
-     """Apply all patches to reduce Agno's token consumption."""
-     _patch_tool_result_pruning()
-     _patch_claude_cache_breakpoints()
-
-
- def _patch_tool_result_pruning():
-     """Patch format_function_call_results to prune old tool results.
-
-     This is called after each tool execution, right before the next API call.
-     Works for ALL providers (Claude, OpenAI, Qwen, etc.) since it patches
-     the base Model class.
-     """
-     from agno.models.base import Model
-
-     _original_format_results = Model.format_function_call_results
-
-     def _patched_format_results(self, messages, function_call_results, **kwargs):
-         # First: prune old tool results already in messages
-         _prune_tool_messages(messages)
-         # Then: add new results normally
-         return _original_format_results(self, messages, function_call_results, **kwargs)
-
-     Model.format_function_call_results = _patched_format_results
-
-
- def _patch_claude_cache_breakpoints():
-     """Patch Claude's format_messages to add cache breakpoints.
-
-     Marks the last 2 messages with cache_control for Anthropic's prompt
-     caching. Non-Anthropic providers ignore these fields.
-     """
-     try:
-         import agno.utils.models.claude as claude_utils
-     except ImportError:
-         return
-
-     _original_format = claude_utils.format_messages
-
-     def _patched_format_messages(messages, compress_tool_results=False):
-         chat_messages, system_message = _original_format(
-             messages, compress_tool_results=compress_tool_results
-         )
-
-         if not chat_messages:
-             return chat_messages, system_message
-
-         # Add cache_control to last 2 messages
-         cache_marker = {"type": "ephemeral"}
-         marked = 0
-         for msg in reversed(chat_messages):
-             if marked >= 2:
-                 break
-             content = msg.get("content")
-             if isinstance(content, list) and content:
-                 last_item = content[-1]
-                 if isinstance(last_item, dict):
-                     last_item["cache_control"] = cache_marker
-                     marked += 1
-                 elif hasattr(last_item, "type"):
-                     try:
-                         as_dict = last_item.model_dump() if hasattr(last_item, "model_dump") else dict(last_item)
-                         as_dict["cache_control"] = cache_marker
-                         content[-1] = as_dict
-                         marked += 1
-                     except Exception:
-                         pass
-             elif isinstance(content, str):
-                 msg["content"] = [{"type": "text", "text": content, "cache_control": cache_marker}]
-                 marked += 1
-
-         return chat_messages, system_message
-
-     claude_utils.format_messages = _patched_format_messages