aru-code 0.16.0__tar.gz → 0.17.0__tar.gz
This diff shows the content of publicly released package versions as they appear in their respective public registries. It is provided for informational purposes only and reflects the changes between the two package versions.
- {aru_code-0.16.0/aru_code.egg-info → aru_code-0.17.0}/PKG-INFO +1 -1
- aru_code-0.17.0/aru/__init__.py +1 -0
- {aru_code-0.16.0 → aru_code-0.17.0}/aru/agent_factory.py +1 -6
- {aru_code-0.16.0 → aru_code-0.17.0}/aru/agents/base.py +5 -4
- aru_code-0.17.0/aru/cache_patch.py +201 -0
- {aru_code-0.16.0 → aru_code-0.17.0}/aru/cli.py +11 -1
- {aru_code-0.16.0 → aru_code-0.17.0}/aru/commands.py +1 -0
- {aru_code-0.16.0 → aru_code-0.17.0}/aru/context.py +12 -12
- {aru_code-0.16.0 → aru_code-0.17.0}/aru/runner.py +13 -18
- {aru_code-0.16.0 → aru_code-0.17.0}/aru/session.py +131 -8
- {aru_code-0.16.0 → aru_code-0.17.0}/aru/tools/codebase.py +1 -1
- {aru_code-0.16.0 → aru_code-0.17.0/aru_code.egg-info}/PKG-INFO +1 -1
- {aru_code-0.16.0 → aru_code-0.17.0}/pyproject.toml +1 -1
- {aru_code-0.16.0 → aru_code-0.17.0}/tests/test_cli.py +20 -4
- aru_code-0.16.0/aru/__init__.py +0 -1
- aru_code-0.16.0/aru/cache_patch.py +0 -133
- {aru_code-0.16.0 → aru_code-0.17.0}/LICENSE +0 -0
- {aru_code-0.16.0 → aru_code-0.17.0}/README.md +0 -0
- {aru_code-0.16.0 → aru_code-0.17.0}/aru/agents/__init__.py +0 -0
- {aru_code-0.16.0 → aru_code-0.17.0}/aru/agents/executor.py +0 -0
- {aru_code-0.16.0 → aru_code-0.17.0}/aru/agents/planner.py +0 -0
- {aru_code-0.16.0 → aru_code-0.17.0}/aru/completers.py +0 -0
- {aru_code-0.16.0 → aru_code-0.17.0}/aru/config.py +0 -0
- {aru_code-0.16.0 → aru_code-0.17.0}/aru/display.py +0 -0
- {aru_code-0.16.0 → aru_code-0.17.0}/aru/permissions.py +0 -0
- {aru_code-0.16.0 → aru_code-0.17.0}/aru/providers.py +0 -0
- {aru_code-0.16.0 → aru_code-0.17.0}/aru/runtime.py +0 -0
- {aru_code-0.16.0 → aru_code-0.17.0}/aru/tools/__init__.py +0 -0
- {aru_code-0.16.0 → aru_code-0.17.0}/aru/tools/ast_tools.py +0 -0
- {aru_code-0.16.0 → aru_code-0.17.0}/aru/tools/gitignore.py +0 -0
- {aru_code-0.16.0 → aru_code-0.17.0}/aru/tools/mcp_client.py +0 -0
- {aru_code-0.16.0 → aru_code-0.17.0}/aru/tools/ranker.py +0 -0
- {aru_code-0.16.0 → aru_code-0.17.0}/aru/tools/tasklist.py +0 -0
- {aru_code-0.16.0 → aru_code-0.17.0}/aru_code.egg-info/SOURCES.txt +0 -0
- {aru_code-0.16.0 → aru_code-0.17.0}/aru_code.egg-info/dependency_links.txt +0 -0
- {aru_code-0.16.0 → aru_code-0.17.0}/aru_code.egg-info/entry_points.txt +0 -0
- {aru_code-0.16.0 → aru_code-0.17.0}/aru_code.egg-info/requires.txt +0 -0
- {aru_code-0.16.0 → aru_code-0.17.0}/aru_code.egg-info/top_level.txt +0 -0
- {aru_code-0.16.0 → aru_code-0.17.0}/setup.cfg +0 -0
- {aru_code-0.16.0 → aru_code-0.17.0}/tests/test_agents_base.py +0 -0
- {aru_code-0.16.0 → aru_code-0.17.0}/tests/test_cli_advanced.py +0 -0
- {aru_code-0.16.0 → aru_code-0.17.0}/tests/test_cli_base.py +0 -0
- {aru_code-0.16.0 → aru_code-0.17.0}/tests/test_cli_completers.py +0 -0
- {aru_code-0.16.0 → aru_code-0.17.0}/tests/test_cli_new.py +0 -0
- {aru_code-0.16.0 → aru_code-0.17.0}/tests/test_cli_run_cli.py +0 -0
- {aru_code-0.16.0 → aru_code-0.17.0}/tests/test_cli_session.py +0 -0
- {aru_code-0.16.0 → aru_code-0.17.0}/tests/test_cli_shell.py +0 -0
- {aru_code-0.16.0 → aru_code-0.17.0}/tests/test_codebase.py +0 -0
- {aru_code-0.16.0 → aru_code-0.17.0}/tests/test_config.py +0 -0
- {aru_code-0.16.0 → aru_code-0.17.0}/tests/test_context.py +0 -0
- {aru_code-0.16.0 → aru_code-0.17.0}/tests/test_executor.py +0 -0
- {aru_code-0.16.0 → aru_code-0.17.0}/tests/test_gitignore.py +0 -0
- {aru_code-0.16.0 → aru_code-0.17.0}/tests/test_main.py +0 -0
- {aru_code-0.16.0 → aru_code-0.17.0}/tests/test_mcp_client.py +0 -0
- {aru_code-0.16.0 → aru_code-0.17.0}/tests/test_permissions.py +0 -0
- {aru_code-0.16.0 → aru_code-0.17.0}/tests/test_planner.py +0 -0
- {aru_code-0.16.0 → aru_code-0.17.0}/tests/test_providers.py +0 -0
- {aru_code-0.16.0 → aru_code-0.17.0}/tests/test_ranker.py +0 -0
aru_code-0.17.0/aru/__init__.py (NEW)

@@ -0,0 +1 @@
+__version__ = "0.17.0"

{aru_code-0.16.0 → aru_code-0.17.0}/aru/agent_factory.py

@@ -25,12 +25,7 @@ def create_general_agent(
     from aru.tools.codebase import GENERAL_TOOLS
     tools = GENERAL_TOOLS

-
-    if config and not session.extra_instructions_sent:
-        extra = config.get_extra_instructions()
-        session.extra_instructions_sent = True
-    else:
-        extra = ""
+    extra = config.get_extra_instructions() if config else ""
     if env_context:
         extra = f"{extra}\n\n{env_context}" if extra else env_context
     model_ref = model_override or session.model_ref

{aru_code-0.16.0 → aru_code-0.17.0}/aru/agents/base.py

@@ -48,12 +48,12 @@ Default shows 10 lines of context — use `context_lines=30` for full function b
 2. **Understand a file** → `read_file_smart(path, query)` — returns a concise answer, not raw content
 3. **Need raw content** → `read_file(path)` — returns first chunk + outline for large files

-**Stop early**: Once you have enough information to write the plan, STOP making tool calls \
-immediately. Do not exhaustively explore.
-
 **Batch independent tool calls**: When you need answers from multiple independent sources, \
 emit ALL those tool calls in a single response.

+**Stop early**: Once you have enough information to write the plan, stop exploring and write it. \
+Do not exhaustively read every file — batch what you need, then produce the plan.
+
 ## Output format — STRICT

 Your ONLY output is the plan below. Do NOT write analysis, coverage reports, summaries of

@@ -182,7 +182,8 @@ Every tool call accumulates its result in your context window. Use the minimum n

 **Batch independent tool calls**: emit ALL independent tool calls in a single response.

-**Stop early**: Once you have enough information to
+**Stop early**: Once you have enough information to act, stop exploring and start working. \
+Batch what you need upfront, then execute.

 **When adding or modifying unit tests, ALWAYS run them to verify they pass before finishing.**

aru_code-0.17.0/aru/cache_patch.py (NEW)

@@ -0,0 +1,201 @@
+"""Monkey-patch Agno's model layer to reduce token consumption.
+
+Three optimizations:
+
+1. **Tool result pruning** (ALL providers): After each tool execution, old tool
+   results in the message list are truncated to a short summary. This prevents
+   O(n²) token growth where each API call re-sends all previous tool results.
+
+2. **Cache breakpoints** (Anthropic only): Marks the last 2 messages with
+   cache_control for Anthropic's prompt caching.
+
+3. **Per-call metrics** (ALL providers): Captures input/output tokens of the
+   last API call (context window size), exposed via get_last_call_metrics().
+
+These patches intercept Agno's internal loop so they work transparently
+regardless of which provider is used.
+"""
+
+from __future__ import annotations
+
+# Token-budget pruning (aligned with OpenCode's strategy):
+# - Protect recent tool results within a token budget
+# - Only prune if there's enough to free (avoid churn)
+# - Walk backwards, protecting recent content first
+# OpenCode uses 40K protect / 20K minimum; we use chars (~4 chars/token)
+_PRUNE_PROTECT_CHARS = 160_000  # ~40K tokens — recent content always kept
+_PRUNE_MINIMUM_CHARS = 80_000  # ~20K tokens — only prune if this much is freeable
+_PRUNED_PLACEHOLDER = "[Old tool result cleared]"
+
+# Last API call metrics (updated on every internal API call)
+_last_call_input_tokens: int = 0
+_last_call_output_tokens: int = 0
+_last_call_cache_read: int = 0
+_last_call_cache_write: int = 0
+
+
+def get_last_call_metrics() -> tuple[int, int, int, int]:
+    """Return (input, output, cache_read, cache_write) from the most recent API call."""
+    return _last_call_input_tokens, _last_call_output_tokens, _last_call_cache_read, _last_call_cache_write
+
+
+def _prune_tool_messages(messages):
+    """Clear old tool result content using a token-budget approach.
+
+    Walks backwards through messages, protecting recent content up to
+    PRUNE_PROTECT_CHARS. Older tool results beyond that budget are replaced
+    with a short placeholder. Only prunes if total freeable chars exceed
+    PRUNE_MINIMUM_CHARS (avoids unnecessary churn on small conversations).
+
+    Aligned with OpenCode's strategy: budget-based, not fixed-N.
+    """
+    # Collect tool message indices and their content sizes
+    tool_indices = []
+    for i, msg in enumerate(messages):
+        if getattr(msg, "role", None) == "tool":
+            content = getattr(msg, "content", None)
+            content_len = len(str(content)) if content is not None else 0
+            tool_indices.append((i, content_len))
+
+    if not tool_indices:
+        return
+
+    # Walk backwards, accumulating protected chars
+    protected_chars = 0
+    prune_candidates = []  # (index, content_len) of messages outside protection
+
+    for idx, content_len in reversed(tool_indices):
+        if protected_chars + content_len <= _PRUNE_PROTECT_CHARS:
+            protected_chars += content_len
+        else:
+            prune_candidates.append((idx, content_len))
+
+    # Only prune if there's enough to free
+    freeable = sum(cl for _, cl in prune_candidates)
+    if freeable < _PRUNE_MINIMUM_CHARS:
+        return
+
+    # Replace old tool results with placeholder
+    for idx, _ in prune_candidates:
+        msg = messages[idx]
+        content = getattr(msg, "content", None)
+        if content is None:
+            continue
+        # Skip if already pruned
+        if str(content) == _PRUNED_PLACEHOLDER:
+            continue
+        try:
+            msg.content = _PRUNED_PLACEHOLDER
+            if hasattr(msg, "compressed_content"):
+                msg.compressed_content = None
+        except (AttributeError, TypeError):
+            pass
+
+
+def apply_cache_patch():
+    """Apply all patches to reduce Agno's token consumption."""
+    _patch_tool_result_pruning()
+    _patch_claude_cache_breakpoints()
+    _patch_per_call_metrics()
+
+
+def _patch_tool_result_pruning():
+    """Patch format_function_call_results to prune old tool results.
+
+    This is called after each tool execution, right before the next API call.
+    Works for ALL providers (Claude, OpenAI, Qwen, etc.) since it patches
+    the base Model class.
+    """
+    from agno.models.base import Model
+
+    _original_format_results = Model.format_function_call_results
+
+    def _patched_format_results(self, messages, function_call_results, **kwargs):
+        # First: prune old tool results already in messages
+        _prune_tool_messages(messages)
+        # Then: add new results normally
+        return _original_format_results(self, messages, function_call_results, **kwargs)
+
+    Model.format_function_call_results = _patched_format_results
+
+
+def _patch_claude_cache_breakpoints():
+    """Patch Claude's format_messages to add cache breakpoints.
+
+    Marks the last 2 messages with cache_control for Anthropic's prompt
+    caching. Non-Anthropic providers ignore these fields.
+    """
+    try:
+        import agno.utils.models.claude as claude_utils
+    except ImportError:
+        return
+
+    _original_format = claude_utils.format_messages
+
+    def _patched_format_messages(messages, compress_tool_results=False):
+        chat_messages, system_message = _original_format(
+            messages, compress_tool_results=compress_tool_results
+        )
+
+        if not chat_messages:
+            return chat_messages, system_message
+
+        # Add cache_control to last 2 messages
+        cache_marker = {"type": "ephemeral"}
+        marked = 0
+        for msg in reversed(chat_messages):
+            if marked >= 2:
+                break
+            content = msg.get("content")
+            if isinstance(content, list) and content:
+                last_item = content[-1]
+                if isinstance(last_item, dict):
+                    last_item["cache_control"] = cache_marker
+                    marked += 1
+                elif hasattr(last_item, "type"):
+                    try:
+                        as_dict = last_item.model_dump() if hasattr(last_item, "model_dump") else dict(last_item)
+                        as_dict["cache_control"] = cache_marker
+                        content[-1] = as_dict
+                        marked += 1
+                    except Exception:
+                        pass
+            elif isinstance(content, str):
+                msg["content"] = [{"type": "text", "text": content, "cache_control": cache_marker}]
+                marked += 1
+
+        return chat_messages, system_message
+
+    claude_utils.format_messages = _patched_format_messages
+
+
+def _patch_per_call_metrics():
+    """Patch accumulate_model_metrics to capture per-API-call token counts.
+
+    After each internal API call, Agno calls this function to sum tokens
+    into RunMetrics. We intercept it to snapshot the last call's tokens,
+    giving us the actual context window size (comparable to OpenCode/Claude Code).
+    """
+    from agno.metrics import accumulate_model_metrics as _original_accumulate
+
+    import agno.metrics as _metrics_module
+
+    def _patched_accumulate(model_response, model, model_type, run_metrics=None):
+        global _last_call_input_tokens, _last_call_output_tokens
+        global _last_call_cache_read, _last_call_cache_write
+        usage = getattr(model_response, "response_usage", None)
+        if usage is not None:
+            _last_call_input_tokens = getattr(usage, "input_tokens", 0) or 0
+            _last_call_output_tokens = getattr(usage, "output_tokens", 0) or 0
+            _last_call_cache_read = getattr(usage, "cache_read_tokens", 0) or 0
+            _last_call_cache_write = getattr(usage, "cache_write_tokens", 0) or 0
+        return _original_accumulate(model_response, model, model_type, run_metrics)
+
+    _metrics_module.accumulate_model_metrics = _patched_accumulate
+
+    # Also patch the reference in base.py since it may have imported directly
+    try:
+        import agno.models.base as _base_module
+        _base_module.accumulate_model_metrics = _patched_accumulate
+    except (ImportError, AttributeError):
+        pass

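The module is designed to be applied once at startup, before the first Agno model call; afterwards the per-call counters can be read at any time. A minimal usage sketch (the actual call site in aru's startup path is not shown in this diff):

    from aru.cache_patch import apply_cache_patch, get_last_call_metrics

    apply_cache_patch()  # patch Agno's Model class, claude utils, and metrics first

    # ... run an agent turn through Agno here ...

    inp, out, cache_read, cache_write = get_last_call_metrics()
    print(f"last call context: {inp + out + cache_read + cache_write:,} tokens "
          f"({cache_read:,} read from cache)")
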
{aru_code-0.16.0 → aru_code-0.17.0}/aru/cli.py

@@ -14,6 +14,7 @@ import os
 import sys

 from rich.markdown import Markdown
+from rich.panel import Panel

 # ── Re-exports for backward compatibility ─────────────────────────────
 # Tests and external code import these from aru.cli; keep them accessible.

@@ -413,6 +414,15 @@ async def run_cli(skip_permissions: bool = False, resume_id: str | None = None):
            _show_help(config)
            continue

+        if user_input.lower() == "/cost":
+            console.print(Panel(
+                session.cost_summary,
+                title="[bold]Token Usage & Cost[/bold]",
+                border_style="cyan",
+                padding=(1, 2),
+            ))
+            continue
+
        if user_input.startswith("! "):
            cmd = user_input[2:].strip()
            if not cmd:

@@ -518,7 +528,7 @@ async def run_cli(skip_permissions: bool = False, resume_id: str | None = None):
                session.add_message("assistant", run_result.with_tools_summary())
            else:
                console.print(f"[yellow]Unknown command: /{cmd_name}[/yellow]")
-                console.print(f"[dim]Built-in: /plan, /model, /sessions, /commands, /skills, /agents, /quit[/dim]")
+                console.print(f"[dim]Built-in: /plan, /model, /sessions, /commands, /skills, /agents, /cost, /quit[/dim]")
            if config.commands:
                console.print(f"[dim]Custom: {', '.join(f'/{k}' for k in config.commands)}[/dim]")
            if config.skills:

{aru_code-0.16.0 → aru_code-0.17.0}/aru/commands.py

@@ -21,6 +21,7 @@ SLASH_COMMANDS = [
     ("/skills", "List available skills", "/skills"),
     ("/agents", "List custom agents", "/agents"),
     ("/mcp", "List loaded MCP tools", "/mcp"),
+    ("/cost", "Show detailed token usage and cost", "/cost"),
     ("/quit", "Exit aru", "/quit"),
 ]

{aru_code-0.16.0 → aru_code-0.17.0}/aru/context.py

@@ -11,15 +11,15 @@ from __future__ import annotations
 # ── Constants ──────────────────────────────────────────────────────

 # Pruning: minimum chars that must be freeable to justify a prune pass
-PRUNE_MINIMUM_CHARS =
+PRUNE_MINIMUM_CHARS = 12_000  # ~3.5K tokens
 # Placeholder that replaces evicted content
 PRUNED_PLACEHOLDER = "[cleared]"
 # User messages larger than this threshold are truncated when outside protection window
-PRUNE_USER_MSG_THRESHOLD =
+PRUNE_USER_MSG_THRESHOLD = 2_000  # ~570 tokens
 # How many chars to keep from the start of a pruned user message
-PRUNE_USER_MSG_KEEP =
+PRUNE_USER_MSG_KEEP = 500  # ~140 tokens
 # Minimum number of recent user turns always protected (regardless of char budget)
-PRUNE_PROTECT_TURNS =
+PRUNE_PROTECT_TURNS = 2
 # Tool result markers that should never be pruned (critical context)
 PRUNE_PROTECTED_MARKERS = {"[SubAgent-", "delegate_task"}
 # Tool names whose outputs should never be pruned (like OpenCode's PRUNE_PROTECTED_TOOLS)

@@ -27,20 +27,20 @@ PRUNE_PROTECTED_MARKERS = {"[SubAgent-", "delegate_task"}
 PRUNE_PROTECTED_TOOLS = {"delegate_task"}

 # Truncation: universal limits for any tool output
-TRUNCATE_MAX_LINES =
-TRUNCATE_MAX_BYTES =
+TRUNCATE_MAX_LINES = 300
+TRUNCATE_MAX_BYTES = 15 * 1024  # 15 KB
 TRUNCATE_KEEP_START = 150  # lines to keep from the start
-TRUNCATE_KEEP_END =
+TRUNCATE_KEEP_END = 60  # lines to keep from the end
 TRUNCATE_MAX_LINE_LENGTH = 1500  # chars per individual line (prevents minified files)
 # Directory for saving full truncated outputs (like OpenCode pattern)
 TRUNCATE_SAVE_DIR = ".aru/truncated"

 # Compaction: trigger when per-run input tokens exceed this fraction of model limit
-COMPACTION_THRESHOLD_RATIO = 0.
+COMPACTION_THRESHOLD_RATIO = 0.70
 # Compaction: target post-compaction size as fraction of model context limit
-COMPACTION_TARGET_RATIO = 0.
+COMPACTION_TARGET_RATIO = 0.15
 # Compaction: also trigger after this many user turns (regardless of token count)
-COMPACTION_MAX_TURNS =
+COMPACTION_MAX_TURNS = 15
 # Compaction: reserve buffer for the compaction process itself (like OpenCode's 20K)
 COMPACTION_BUFFER_TOKENS = 20_000
 # Default model context limits (input tokens)

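Read together, these constants mean: compact when a single run's input exceeds 70% of the model's context limit (or after 15 user turns), aim for roughly 15% of the limit afterwards, and keep 20K tokens in reserve for the compaction call itself. The body of `should_compact` is not part of this diff; a plausible sketch of the token-based trigger, using an illustrative 200K limit rather than a real MODEL_CONTEXT_LIMITS entry:

    COMPACTION_THRESHOLD_RATIO = 0.70  # value from the hunk above

    # Sketch only; signature shape taken from the runner.py call site further down.
    def should_compact(run_input_tokens: int, model_id: str = "default", limit: int = 200_000) -> bool:
        # with a 200K-token limit this triggers above 140_000 input tokens
        return run_input_tokens > limit * COMPACTION_THRESHOLD_RATIO
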
@@ -115,8 +115,8 @@ def _get_prune_protect_chars(model_id: str = "default") -> int:
     to prevent context overflow. Returns ~7% of the model's context in chars.
     """
     limit = MODEL_CONTEXT_LIMITS.get(model_id, MODEL_CONTEXT_LIMITS["default"])
-    # ~4 chars per token, protect ~
-    protect = int(limit * 0.
+    # ~4 chars per token, protect ~7% of context
+    protect = int(limit * 0.07 * 4)
     # Clamp between 10K (minimum usable) and 40K (diminishing returns)
     return max(10_000, min(protect, 40_000))

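For concreteness, the formula evaluated at two illustrative context limits (hypothetical inputs, not entries from MODEL_CONTEXT_LIMITS):

    for limit in (200_000, 32_000):              # illustrative input-token limits
        protect = int(limit * 0.07 * 4)          # ~7% of context, ~4 chars per token
        print(limit, max(10_000, min(protect, 40_000)))
    # 200_000 -> 56_000 raw, clamped down to 40_000 chars (~10K tokens)
    #  32_000 ->  8_960 raw, clamped up to 10_000 chars (~2.5K tokens)
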
{aru_code-0.16.0 → aru_code-0.17.0}/aru/runner.py

@@ -115,9 +115,6 @@ async def run_agent_capture(agent, message: str, session=None, lightweight: bool
     run_message = message

     # Build conversation history as real messages for the LLM
-    # Compact BEFORE pruning: if the history is large enough that pruning
-    # would discard content, compact first to preserve context via summary
-    # instead of losing it to placeholders.
     from aru.context import prune_history, should_compact, compact_conversation, would_prune
     if session and session.history and not lightweight:
         if would_prune(session.history, model_id=session.model_id):

@@ -242,23 +239,21 @@ async def run_agent_capture(agent, message: str, session=None, lightweight: bool
     if run_output and session and hasattr(run_output, "metrics"):
         session.track_tokens(run_output.metrics)

-    # Reactive compaction:
-    #
-    # session.history doesn't include tool results, so char-based estimates
-    # would miss the bulk of the context sent to the model.
+    # Reactive compaction: runs with a visible spinner so the user
+    # sees progress instead of a frozen screen.
     run_input_tokens = getattr(run_output.metrics, "input_tokens", 0) or 0
     if should_compact(run_input_tokens, session.model_id):
-
-
-
-
-        session.history
-
-
-
-
-
+        from rich.status import Status
+        with Status("[dim]Compacting context...[/dim]", console=console, spinner="dots"):
+            try:
+                session.history = prune_history(session.history, model_id=session.model_id)
+                session.history = await compact_conversation(
+                    session.history, session.model_ref, session.plan_task,
+                    model_id=session.model_id,
+                )
+                console.print("[dim]Context compacted to save tokens.[/dim]")
+            except Exception:
+                pass

     final_content = accumulated or final_content
     remaining = (final_content or "")[display._flushed_len:]

{aru_code-0.16.0 → aru_code-0.17.0}/aru/session.py

@@ -16,6 +16,46 @@ from aru.providers import MODEL_ALIASES, get_model_display, resolve_model_ref
 # Default model reference (provider/model format)
 DEFAULT_MODEL = "anthropic/claude-sonnet-4-5"

+# Pricing per million tokens (USD). Cache read/write have separate rates.
+# Format: {model_id_prefix: (input, output, cache_read, cache_write)}
+# Prices as of 2025-05. Models not listed fall back to "default".
+MODEL_PRICING: dict[str, tuple[float, float, float, float]] = {
+    # Anthropic (input, output, cache_read=10%, cache_write=125%)
+    "claude-sonnet-4-5": (3.00, 15.00, 0.30, 3.75),
+    "claude-sonnet-4-6": (3.00, 15.00, 0.30, 3.75),
+    "claude-opus-4": (15.00, 75.00, 1.50, 18.75),
+    "claude-opus-4-6": (15.00, 75.00, 1.50, 18.75),
+    "claude-haiku-3-5": (0.80, 4.00, 0.08, 1.00),
+    "claude-haiku-4-5": (1.00, 5.00, 0.10, 1.25),
+    # OpenAI
+    "gpt-4o": (2.50, 10.00, 1.25, 2.50),
+    "gpt-4o-mini": (0.15, 0.60, 0.075, 0.15),
+    "gpt-4.1": (2.00, 8.00, 0.50, 2.00),
+    "gpt-4.1-mini": (0.40, 1.60, 0.10, 0.40),
+    "gpt-4.1-nano": (0.10, 0.40, 0.025, 0.10),
+    "o3": (2.00, 8.00, 0.50, 2.00),
+    "o3-mini": (1.10, 4.40, 0.275, 1.10),
+    "o4-mini": (1.10, 4.40, 0.275, 1.10),
+    # Qwen / DashScope (<=256K tier, explicit cache: creation=125%, hit=10%)
+    "qwen3-plus": (0.50, 3.00, 0.05, 0.625),
+    "qwen3.6-plus": (0.50, 3.00, 0.05, 0.625),
+    "qwen-plus": (0.50, 3.00, 0.05, 0.625),
+    "qwen-max": (2.00, 6.00, 0.20, 2.50),
+    "qwen-turbo": (0.30, 0.60, 0.03, 0.375),
+    "qwen3-coder-plus": (0.50, 3.00, 0.05, 0.625),
+    # DeepSeek
+    "deepseek-chat": (0.27, 1.10, 0.07, 0.27),
+    "deepseek-reasoner": (0.55, 2.19, 0.14, 0.55),
+    # Google Gemini (via OpenRouter)
+    "gemini-2.5-pro": (1.25, 10.00, 0.315, 1.25),
+    "gemini-2.5-flash": (0.15, 0.60, 0.0375, 0.15),
+    # Groq (free tier / very cheap)
+    "llama-3.3-70b": (0.59, 0.79, 0.0, 0.0),
+    "llama-3.1": (0.05, 0.08, 0.0, 0.0),
+    # Fallback
+    "default": (3.00, 15.00, 0.30, 3.75),
+}
+
 SESSIONS_DIR = os.path.join(".aru", "sessions")

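The keys are matched as prefixes (see `_get_pricing` later in this diff), so dated snapshot IDs resolve to their base row; since the first matching prefix wins, overlapping keys are order-sensitive. A standalone restatement of the lookup for illustration (not the package's code):

    # Illustrative subset of the table; first matching prefix wins, in insertion order.
    PRICING = {
        "o3": (2.00, 8.00, 0.50, 2.00),
        "o3-mini": (1.10, 4.40, 0.275, 1.10),
        "default": (3.00, 15.00, 0.30, 3.75),
    }

    def lookup(model_id: str) -> tuple[float, float, float, float]:
        for prefix, pricing in PRICING.items():
            if prefix != "default" and model_id.startswith(prefix):
                return pricing
        return PRICING["default"]

    lookup("o3-2025-04-16")  # dated snapshot resolves to the "o3" row
    lookup("o3-mini")        # also hits "o3" first, because "o3" precedes "o3-mini"
    lookup("mistral-large")  # no prefix match, falls back to "default"
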
@@ -141,12 +181,15 @@ class Session:
         self.total_cache_read_tokens: int = 0
         self.total_cache_write_tokens: int = 0
         self.api_calls: int = 0
+        # Per-call metrics: last API call's context window (set by cache_patch)
+        self.last_input_tokens: int = 0
+        self.last_output_tokens: int = 0
+        self.last_cache_read: int = 0
+        self.last_cache_write: int = 0
         # Context cache — invalidated on file mutations
         self._cached_tree: str | None = None
         self._cached_git_status: str | None = None
         self._context_dirty: bool = True
-        # Track whether AGENTS.md/extra instructions were already sent (skip on subsequent turns)
-        self.extra_instructions_sent: bool = False
         # Tree depth for env context (configurable via aru.json "tree_depth")
         self._tree_max_depth: int = 2
         # Token budget (0 = unlimited)

@@ -198,20 +241,100 @@ class Session:
         self.total_cache_read_tokens += getattr(metrics, "cache_read_tokens", 0) or 0
         self.total_cache_write_tokens += getattr(metrics, "cache_write_tokens", 0) or 0
         self.api_calls += 1
+        # Capture last API call's context window (set by cache_patch)
+        try:
+            from aru.cache_patch import get_last_call_metrics
+            self.last_input_tokens, self.last_output_tokens, self.last_cache_read, self.last_cache_write = get_last_call_metrics()
+        except ImportError:
+            self.last_input_tokens = getattr(metrics, "input_tokens", 0) or 0
+            self.last_output_tokens = getattr(metrics, "output_tokens", 0) or 0
+            self.last_cache_read = 0
+            self.last_cache_write = 0
+
+    def _get_pricing(self) -> tuple[float, float, float, float]:
+        """Get per-million-token pricing for the current model."""
+        model_id = self.model_id
+        # Try exact match, then prefix match, then fallback
+        for prefix, pricing in MODEL_PRICING.items():
+            if prefix == "default":
+                continue
+            if model_id.startswith(prefix):
+                return pricing
+        return MODEL_PRICING["default"]
+
+    @property
+    def estimated_cost(self) -> float:
+        """Estimate cumulative cost in USD based on token usage and model pricing.
+
+        For input tokens, subtracts cache_read (charged at cache rate) and
+        cache_write (charged at write rate) from the base input count.
+        """
+        price_in, price_out, price_cache_read, price_cache_write = self._get_pricing()
+        # Non-cached input = total input - cache_read - cache_write
+        base_input = max(0, self.total_input_tokens - self.total_cache_read_tokens - self.total_cache_write_tokens)
+        cost = (
+            base_input * price_in / 1_000_000
+            + self.total_output_tokens * price_out / 1_000_000
+            + self.total_cache_read_tokens * price_cache_read / 1_000_000
+            + self.total_cache_write_tokens * price_cache_write / 1_000_000
+        )
+        return cost

     @property
     def token_summary(self) -> str:
+        """One-line summary shown after each response: context window + cost."""
+        if self.last_input_tokens <= 0 and self.total_input_tokens == 0:
+            return ""
+        cost = self.estimated_cost
+        cost_str = f"${cost:.4f}" if cost < 0.01 else f"${cost:.2f}"
+        if self.last_input_tokens > 0:
+            ctx_total = self.last_input_tokens + self.last_output_tokens + self.last_cache_read + self.last_cache_write
+            parts = [f"in: {self.last_input_tokens:,}", f"out: {self.last_output_tokens:,}"]
+            if self.last_cache_read > 0:
+                parts.append(f"cache_read: {self.last_cache_read:,}")
+            if self.last_cache_write > 0:
+                parts.append(f"cache_write: {self.last_cache_write:,}")
+            return f"context: {ctx_total:,} ({' / '.join(parts)}) | cost: {cost_str}"
+        # Fallback when per-call metrics aren't available
+        total = self.total_input_tokens + self.total_output_tokens
+        return f"tokens: {total:,} | cost: {cost_str}"
+
+    @property
+    def cost_summary(self) -> str:
+        """Detailed cost breakdown for /cost command."""
         total = self.total_input_tokens + self.total_output_tokens
         if total == 0:
-            return ""
-
+            return "No token usage yet."
+        cost = self.estimated_cost
+        cost_str = f"${cost:.4f}" if cost < 0.01 else f"${cost:.2f}"
+        lines = [
+            f"Session cost: {cost_str}",
+            f"",
+            f"Cumulative tokens:",
+            f"  input: {self.total_input_tokens:,}",
+            f"  output: {self.total_output_tokens:,}",
+        ]
         if self.total_cache_read_tokens > 0:
-
-
+            lines.append(f"  cache_read: {self.total_cache_read_tokens:,}")
+        if self.total_cache_write_tokens > 0:
+            lines.append(f"  cache_write: {self.total_cache_write_tokens:,}")
+        lines.append(f"  total: {total:,}")
+        lines.append(f"  api calls: {self.api_calls}")
+        if self.last_input_tokens > 0:
+            ctx_total = self.last_input_tokens + self.last_output_tokens + self.last_cache_read + self.last_cache_write
+            lines.append(f"")
+            lines.append(f"Last context window: {ctx_total:,}")
+            lines.append(f"  input: {self.last_input_tokens:,}")
+            lines.append(f"  output: {self.last_output_tokens:,}")
+            if self.last_cache_read > 0:
+                lines.append(f"  cache_read: {self.last_cache_read:,}")
+            if self.last_cache_write > 0:
+                lines.append(f"  cache_write: {self.last_cache_write:,}")
         if self.token_budget > 0:
             pct = int(total / self.token_budget * 100)
-
-
+            lines.append(f"")
+            lines.append(f"Budget: {pct}% used")
+        return "\n".join(lines)

     def invalidate_context_cache(self):
         """Mark cached tree/git status as stale. Call after file mutations."""

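A worked example of `estimated_cost` with the claude-sonnet-4-5 row (3.00 / 15.00 / 0.30 / 3.75 USD per million tokens); the token counts are hypothetical:

    # 100K total input tokens, of which 60K were cache reads and 10K cache writes; 5K output.
    base_input = max(0, 100_000 - 60_000 - 10_000)  # 30_000 non-cached input tokens
    cost = (base_input * 3.00
            + 5_000 * 15.00
            + 60_000 * 0.30
            + 10_000 * 3.75) / 1_000_000
    # 0.0900 + 0.0750 + 0.0180 + 0.0375 = 0.2205, rendered by token_summary as "cost: $0.22"
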
{aru_code-0.16.0 → aru_code-0.17.0}/aru/tools/codebase.py

@@ -55,7 +55,7 @@ def _format_diff(old_string: str, new_string: str) -> Group:


 # Hard ceiling per tool result (~7K tokens). Even max_size=0 respects this per chunk.
-_READ_HARD_CAP =
+_READ_HARD_CAP = 40_000  # bytes (~11K tokens)

 def clear_read_cache():
     """Clear the read cache. Call after file mutations to avoid stale data."""

{aru_code-0.16.0 → aru_code-0.17.0}/tests/test_cli.py

@@ -286,16 +286,32 @@ class TestSession:
         session.total_output_tokens = 500
         session.api_calls = 3
         summary = session.token_summary
-        assert "1,
-        assert "
+        assert "tokens: 1,500" in summary
+        assert "cost:" in summary

-    def
+    def test_token_summary_with_context(self):
+        session = Session()
+        session.total_input_tokens = 1000
+        session.total_output_tokens = 500
+        session.last_input_tokens = 800
+        session.last_output_tokens = 200
+        session.last_cache_read = 100
+        session.api_calls = 1
+        summary = session.token_summary
+        assert "context:" in summary
+        assert "cache_read:" in summary
+        assert "cost:" in summary
+
+    def test_cost_summary(self):
         session = Session()
         session.total_input_tokens = 100
         session.total_output_tokens = 50
         session.total_cache_read_tokens = 200
         session.api_calls = 1
-
+        summary = session.cost_summary
+        assert "Session cost:" in summary
+        assert "input:" in summary
+        assert "cache_read:" in summary

     def test_to_dict_and_from_dict(self):
         session = Session(session_id="test123")

aru_code-0.16.0/aru/__init__.py (DELETED)

@@ -1 +0,0 @@
-__version__ = "0.16.0"

aru_code-0.16.0/aru/cache_patch.py (DELETED)

@@ -1,133 +0,0 @@
-"""Monkey-patch Agno's model layer to reduce token consumption.
-
-Two optimizations:
-
-1. **Tool result pruning** (ALL providers): After each tool execution, old tool
-   results in the message list are truncated to a short summary. This prevents
-   O(n²) token growth where each API call re-sends all previous tool results.
-
-2. **Cache breakpoints** (Anthropic only): Marks the last 2 messages with
-   cache_control for Anthropic's prompt caching.
-
-These patches intercept Agno's internal loop so they work transparently
-regardless of which provider is used.
-"""
-
-from __future__ import annotations
-
-# Max chars to keep from old tool results
-_TOOL_RESULT_KEEP_CHARS = 200
-# Number of recent tool results to keep in full
-_KEEP_RECENT_RESULTS = 1
-
-
-def _prune_tool_messages(messages):
-    """Truncate old tool result content in the message list.
-
-    Keeps only the last N tool results in full. Older ones are truncated
-    to a short preview. This runs BEFORE each API call, so accumulated
-    tool results don't bloat the context on every re-send.
-    """
-    # Find all tool message indices
-    tool_indices = [
-        i for i, msg in enumerate(messages)
-        if getattr(msg, "role", None) == "tool"
-    ]
-
-    if len(tool_indices) <= _KEEP_RECENT_RESULTS:
-        return
-
-    # Prune all except the last N
-    for idx in tool_indices[:-_KEEP_RECENT_RESULTS]:
-        msg = messages[idx]
-        content = getattr(msg, "content", None)
-        if content is None:
-            continue
-
-        content_str = str(content)
-        if len(content_str) <= _TOOL_RESULT_KEEP_CHARS:
-            continue
-
-        truncated = content_str[:_TOOL_RESULT_KEEP_CHARS] + "\n[...truncated]"
-        try:
-            msg.content = truncated
-            if hasattr(msg, "compressed_content"):
-                msg.compressed_content = None
-        except (AttributeError, TypeError):
-            pass
-
-
-def apply_cache_patch():
-    """Apply all patches to reduce Agno's token consumption."""
-    _patch_tool_result_pruning()
-    _patch_claude_cache_breakpoints()
-
-
-def _patch_tool_result_pruning():
-    """Patch format_function_call_results to prune old tool results.
-
-    This is called after each tool execution, right before the next API call.
-    Works for ALL providers (Claude, OpenAI, Qwen, etc.) since it patches
-    the base Model class.
-    """
-    from agno.models.base import Model
-
-    _original_format_results = Model.format_function_call_results
-
-    def _patched_format_results(self, messages, function_call_results, **kwargs):
-        # First: prune old tool results already in messages
-        _prune_tool_messages(messages)
-        # Then: add new results normally
-        return _original_format_results(self, messages, function_call_results, **kwargs)
-
-    Model.format_function_call_results = _patched_format_results
-
-
-def _patch_claude_cache_breakpoints():
-    """Patch Claude's format_messages to add cache breakpoints.
-
-    Marks the last 2 messages with cache_control for Anthropic's prompt
-    caching. Non-Anthropic providers ignore these fields.
-    """
-    try:
-        import agno.utils.models.claude as claude_utils
-    except ImportError:
-        return
-
-    _original_format = claude_utils.format_messages
-
-    def _patched_format_messages(messages, compress_tool_results=False):
-        chat_messages, system_message = _original_format(
-            messages, compress_tool_results=compress_tool_results
-        )
-
-        if not chat_messages:
-            return chat_messages, system_message
-
-        # Add cache_control to last 2 messages
-        cache_marker = {"type": "ephemeral"}
-        marked = 0
-        for msg in reversed(chat_messages):
-            if marked >= 2:
-                break
-            content = msg.get("content")
-            if isinstance(content, list) and content:
-                last_item = content[-1]
-                if isinstance(last_item, dict):
-                    last_item["cache_control"] = cache_marker
-                    marked += 1
-                elif hasattr(last_item, "type"):
-                    try:
-                        as_dict = last_item.model_dump() if hasattr(last_item, "model_dump") else dict(last_item)
-                        as_dict["cache_control"] = cache_marker
-                        content[-1] = as_dict
-                        marked += 1
-                    except Exception:
-                        pass
-            elif isinstance(content, str):
-                msg["content"] = [{"type": "text", "text": content, "cache_control": cache_marker}]
-                marked += 1
-
-        return chat_messages, system_message
-
-    claude_utils.format_messages = _patched_format_messages