aru-code 0.31.0__tar.gz → 0.33.0__tar.gz

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (99)
  1. {aru_code-0.31.0 → aru_code-0.33.0}/PKG-INFO +1 -1
  2. aru_code-0.33.0/aru/__init__.py +1 -0
  3. {aru_code-0.31.0 → aru_code-0.33.0}/aru/agent_factory.py +22 -3
  4. {aru_code-0.31.0 → aru_code-0.33.0}/aru/agents/base.py +94 -1
  5. aru_code-0.33.0/aru/agents/catalog.py +157 -0
  6. {aru_code-0.31.0 → aru_code-0.33.0}/aru/cache_patch.py +279 -19
  7. {aru_code-0.31.0 → aru_code-0.33.0}/aru/cli.py +57 -2
  8. {aru_code-0.31.0 → aru_code-0.33.0}/aru/commands.py +133 -0
  9. {aru_code-0.31.0 → aru_code-0.33.0}/aru/context.py +24 -1
  10. {aru_code-0.31.0 → aru_code-0.33.0}/aru/permissions.py +318 -21
  11. {aru_code-0.31.0 → aru_code-0.33.0}/aru/providers.py +214 -3
  12. {aru_code-0.31.0 → aru_code-0.33.0}/aru/runtime.py +78 -1
  13. {aru_code-0.31.0 → aru_code-0.33.0}/aru/session.py +115 -0
  14. {aru_code-0.31.0 → aru_code-0.33.0}/aru/tool_policy.py +75 -49
  15. {aru_code-0.31.0 → aru_code-0.33.0}/aru/tools/codebase.py +1 -1
  16. aru_code-0.33.0/aru/tools/delegate.py +602 -0
  17. aru_code-0.33.0/aru/tools/delegate_prompt.txt +34 -0
  18. {aru_code-0.31.0 → aru_code-0.33.0}/aru/tools/file_ops.py +2 -2
  19. {aru_code-0.31.0 → aru_code-0.33.0}/aru/tools/registry.py +10 -5
  20. {aru_code-0.31.0 → aru_code-0.33.0}/aru/tools/skill.py +1 -1
  21. {aru_code-0.31.0 → aru_code-0.33.0}/aru_code.egg-info/PKG-INFO +1 -1
  22. {aru_code-0.31.0 → aru_code-0.33.0}/aru_code.egg-info/SOURCES.txt +4 -0
  23. {aru_code-0.31.0 → aru_code-0.33.0}/pyproject.toml +4 -1
  24. {aru_code-0.31.0 → aru_code-0.33.0}/tests/test_catalog.py +8 -1
  25. aru_code-0.33.0/tests/test_delegate.py +1063 -0
  26. {aru_code-0.31.0 → aru_code-0.33.0}/tests/test_invoke_skill.py +4 -4
  27. aru_code-0.33.0/tests/test_microcompact.py +277 -0
  28. {aru_code-0.31.0 → aru_code-0.33.0}/tests/test_permissions.py +501 -0
  29. aru_code-0.33.0/tests/test_reasoning.py +455 -0
  30. {aru_code-0.31.0 → aru_code-0.33.0}/tests/test_tool_policy.py +88 -0
  31. aru_code-0.31.0/aru/__init__.py +0 -1
  32. aru_code-0.31.0/aru/agents/catalog.py +0 -92
  33. aru_code-0.31.0/aru/tools/delegate.py +0 -236
  34. {aru_code-0.31.0 → aru_code-0.33.0}/LICENSE +0 -0
  35. {aru_code-0.31.0 → aru_code-0.33.0}/README.md +0 -0
  36. {aru_code-0.31.0 → aru_code-0.33.0}/aru/agents/__init__.py +0 -0
  37. {aru_code-0.31.0 → aru_code-0.33.0}/aru/agents/planner.py +0 -0
  38. {aru_code-0.31.0 → aru_code-0.33.0}/aru/checkpoints.py +0 -0
  39. {aru_code-0.31.0 → aru_code-0.33.0}/aru/completers.py +0 -0
  40. {aru_code-0.31.0 → aru_code-0.33.0}/aru/config.py +0 -0
  41. {aru_code-0.31.0 → aru_code-0.33.0}/aru/display.py +0 -0
  42. {aru_code-0.31.0 → aru_code-0.33.0}/aru/history_blocks.py +0 -0
  43. {aru_code-0.31.0 → aru_code-0.33.0}/aru/plugin_cache.py +0 -0
  44. {aru_code-0.31.0 → aru_code-0.33.0}/aru/plugins/__init__.py +0 -0
  45. {aru_code-0.31.0 → aru_code-0.33.0}/aru/plugins/custom_tools.py +0 -0
  46. {aru_code-0.31.0 → aru_code-0.33.0}/aru/plugins/hooks.py +0 -0
  47. {aru_code-0.31.0 → aru_code-0.33.0}/aru/plugins/manager.py +0 -0
  48. {aru_code-0.31.0 → aru_code-0.33.0}/aru/plugins/tool_api.py +0 -0
  49. {aru_code-0.31.0 → aru_code-0.33.0}/aru/runner.py +0 -0
  50. {aru_code-0.31.0 → aru_code-0.33.0}/aru/select.py +0 -0
  51. {aru_code-0.31.0 → aru_code-0.33.0}/aru/tools/__init__.py +0 -0
  52. {aru_code-0.31.0 → aru_code-0.33.0}/aru/tools/_diff.py +0 -0
  53. {aru_code-0.31.0 → aru_code-0.33.0}/aru/tools/_shared.py +0 -0
  54. {aru_code-0.31.0 → aru_code-0.33.0}/aru/tools/ast_tools.py +0 -0
  55. {aru_code-0.31.0 → aru_code-0.33.0}/aru/tools/gitignore.py +0 -0
  56. {aru_code-0.31.0 → aru_code-0.33.0}/aru/tools/mcp_client.py +0 -0
  57. {aru_code-0.31.0 → aru_code-0.33.0}/aru/tools/plan_mode.py +0 -0
  58. {aru_code-0.31.0 → aru_code-0.33.0}/aru/tools/ranker.py +0 -0
  59. {aru_code-0.31.0 → aru_code-0.33.0}/aru/tools/search.py +0 -0
  60. {aru_code-0.31.0 → aru_code-0.33.0}/aru/tools/shell.py +0 -0
  61. {aru_code-0.31.0 → aru_code-0.33.0}/aru/tools/tasklist.py +0 -0
  62. {aru_code-0.31.0 → aru_code-0.33.0}/aru/tools/web.py +0 -0
  63. {aru_code-0.31.0 → aru_code-0.33.0}/aru_code.egg-info/dependency_links.txt +0 -0
  64. {aru_code-0.31.0 → aru_code-0.33.0}/aru_code.egg-info/entry_points.txt +0 -0
  65. {aru_code-0.31.0 → aru_code-0.33.0}/aru_code.egg-info/requires.txt +0 -0
  66. {aru_code-0.31.0 → aru_code-0.33.0}/aru_code.egg-info/top_level.txt +0 -0
  67. {aru_code-0.31.0 → aru_code-0.33.0}/setup.cfg +0 -0
  68. {aru_code-0.31.0 → aru_code-0.33.0}/tests/test_agents_base.py +0 -0
  69. {aru_code-0.31.0 → aru_code-0.33.0}/tests/test_agents_md_coverage.py +0 -0
  70. {aru_code-0.31.0 → aru_code-0.33.0}/tests/test_cache_patch_metrics.py +0 -0
  71. {aru_code-0.31.0 → aru_code-0.33.0}/tests/test_cache_patch_stop_reason.py +0 -0
  72. {aru_code-0.31.0 → aru_code-0.33.0}/tests/test_checkpoints.py +0 -0
  73. {aru_code-0.31.0 → aru_code-0.33.0}/tests/test_cli.py +0 -0
  74. {aru_code-0.31.0 → aru_code-0.33.0}/tests/test_cli_advanced.py +0 -0
  75. {aru_code-0.31.0 → aru_code-0.33.0}/tests/test_cli_base.py +0 -0
  76. {aru_code-0.31.0 → aru_code-0.33.0}/tests/test_cli_completers.py +0 -0
  77. {aru_code-0.31.0 → aru_code-0.33.0}/tests/test_cli_new.py +0 -0
  78. {aru_code-0.31.0 → aru_code-0.33.0}/tests/test_cli_run_cli.py +0 -0
  79. {aru_code-0.31.0 → aru_code-0.33.0}/tests/test_cli_session.py +0 -0
  80. {aru_code-0.31.0 → aru_code-0.33.0}/tests/test_cli_shell.py +0 -0
  81. {aru_code-0.31.0 → aru_code-0.33.0}/tests/test_codebase.py +0 -0
  82. {aru_code-0.31.0 → aru_code-0.33.0}/tests/test_confabulation_regression.py +0 -0
  83. {aru_code-0.31.0 → aru_code-0.33.0}/tests/test_config.py +0 -0
  84. {aru_code-0.31.0 → aru_code-0.33.0}/tests/test_context.py +0 -0
  85. {aru_code-0.31.0 → aru_code-0.33.0}/tests/test_gitignore.py +0 -0
  86. {aru_code-0.31.0 → aru_code-0.33.0}/tests/test_guardrails_scenarios.py +0 -0
  87. {aru_code-0.31.0 → aru_code-0.33.0}/tests/test_invoked_skills.py +0 -0
  88. {aru_code-0.31.0 → aru_code-0.33.0}/tests/test_main.py +0 -0
  89. {aru_code-0.31.0 → aru_code-0.33.0}/tests/test_mcp_client.py +0 -0
  90. {aru_code-0.31.0 → aru_code-0.33.0}/tests/test_plan_mode_refactor.py +0 -0
  91. {aru_code-0.31.0 → aru_code-0.33.0}/tests/test_plugin_cache.py +0 -0
  92. {aru_code-0.31.0 → aru_code-0.33.0}/tests/test_plugins.py +0 -0
  93. {aru_code-0.31.0 → aru_code-0.33.0}/tests/test_providers.py +0 -0
  94. {aru_code-0.31.0 → aru_code-0.33.0}/tests/test_ranker.py +0 -0
  95. {aru_code-0.31.0 → aru_code-0.33.0}/tests/test_runner_recovery.py +0 -0
  96. {aru_code-0.31.0 → aru_code-0.33.0}/tests/test_runtime.py +0 -0
  97. {aru_code-0.31.0 → aru_code-0.33.0}/tests/test_select.py +0 -0
  98. {aru_code-0.31.0 → aru_code-0.33.0}/tests/test_skill_disallowed_tools.py +0 -0
  99. {aru_code-0.31.0 → aru_code-0.33.0}/tests/test_tasklist.py +0 -0
@@ -1,6 +1,6 @@
1
1
  Metadata-Version: 2.4
2
2
  Name: aru-code
3
- Version: 0.31.0
3
+ Version: 0.33.0
4
4
  Summary: A Claude Code clone built with Agno agents
5
5
  Author-email: Estevao <estevaofon@gmail.com>
6
6
  License-Expression: MIT
@@ -0,0 +1 @@
1
+ __version__ = "0.33.0"
@@ -150,15 +150,30 @@ async def create_agent_from_spec(
150
150
  resolved_model = model_ref or session.model_ref
151
151
 
152
152
  tools = _wrap_tools_with_hooks(spec.tools_factory())
153
- instructions = _build_instructions(spec.role, extra_instructions)
153
+ # Merge spec-level extra instructions (static, agent-specific policy like
154
+ # "you are read-only, never call write tools") with caller-provided extras
155
+ # (dynamic, session-specific context like cwd or AGENTS.md). Spec text
156
+ # comes first so the agent's baseline policy is established before any
157
+ # session-specific text that might try to override it.
158
+ combined_extra = "\n\n".join(
159
+ part for part in (spec.extra_instructions, extra_instructions) if part
160
+ )
161
+ instructions = _build_instructions(spec.role, combined_extra)
154
162
 
155
163
  instructions, resolved_model, max_tokens = await _apply_chat_hooks(
156
164
  instructions, resolved_model, spec.name, max_tokens=spec.max_tokens,
157
165
  )
158
166
 
167
+ reasoning_override = session.reasoning_override if session is not None else None
168
+
159
169
  return Agent(
160
170
  name=spec.name,
161
- model=create_model(resolved_model, max_tokens=max_tokens),
171
+ model=create_model(
172
+ resolved_model,
173
+ max_tokens=max_tokens,
174
+ use_reasoning=spec.use_reasoning,
175
+ reasoning_override=reasoning_override,
176
+ ),
162
177
  tools=tools,
163
178
  instructions=instructions,
164
179
  markdown=True,
@@ -210,7 +225,11 @@ async def create_custom_agent_instance(agent_def: CustomAgent, session: Session,
210
225
 
211
226
  return Agent(
212
227
  name=agent_def.name,
213
- model=create_model(model_ref, max_tokens=max_tokens),
228
+ model=create_model(
229
+ model_ref,
230
+ max_tokens=max_tokens,
231
+ reasoning_override=session.reasoning_override,
232
+ ),
214
233
  tools=tools,
215
234
  instructions=instructions,
216
235
  markdown=True,
@@ -374,11 +374,101 @@ Complete the search request efficiently and report your findings clearly.\
374
374
  """
375
375
 
376
376
 
377
+ VERIFIER_ROLE = """\
378
+ You are a verification sub-agent. Your sole job is to review a recent batch
379
+ of edits for correctness and report issues.
380
+
381
+ === CRITICAL: READ-ONLY MODE — NO FILE MODIFICATIONS ===
382
+ You are STRICTLY PROHIBITED from creating, editing, deleting, or moving
383
+ files. You do not have access to edit tools; attempts will fail. No
384
+ state-changing bash commands (no git add/commit, no npm/pip install, no
385
+ mkdir/touch/rm/cp/mv).
386
+
387
+ Your workflow:
388
+ 1. Read each file mentioned in the task using `read_file` or `read_files`
389
+ 2. Search for call sites / references to changed APIs using `grep_search`
390
+ 3. Skim related tests using `glob_search` + `read_file`
391
+ 4. Report findings in this structure:
392
+ - Inconsistencies found (with file:line refs)
393
+ - Missing follow-up edits (call sites not updated, etc.)
394
+ - Suspicious patterns worth the caller's attention (even if uncertain)
395
+ - What looks correct (brief — don't pad the report)
396
+
397
+ Be concise. Skip nitpicks (formatting, naming preferences). Focus on
398
+ bugs, broken contracts, or outdated call sites the caller likely missed.
399
+
400
+ Return ONE final message. The caller is not able to ask follow-ups
401
+ without a resume — include everything they need to act.\
402
+ """
403
+
404
+
405
+ REVIEWER_ROLE = """\
406
+ You are a code-review sub-agent. Review the files mentioned in the task
407
+ against common quality heuristics and produce actionable findings.
408
+
409
+ === CRITICAL: READ-ONLY MODE — NO FILE MODIFICATIONS ===
410
+ You may only read and search. No edit/write/delete/move operations. No
411
+ state-changing bash.
412
+
413
+ For each file covered:
414
+
415
+ - Naming: are identifiers clear and consistent with the surrounding code?
416
+ - Error handling: are edge cases covered? Any swallowed exceptions?
417
+ - Testing: is there test coverage for the new/modified code paths?
418
+ - Security: obvious injection, path traversal, secret exposure, unchecked
419
+ user input, missing auth checks?
420
+ - Complexity: functions that should be split, duplicated logic, over-
421
+ engineered abstractions for simple cases?
422
+
423
+ Report format:
424
+ - One bullet per finding
425
+ - Include file:line
426
+ - Classify severity: (blocker) / (important) / (nit) — omit (nit) unless
427
+ asked for a thorough review
428
+ - If nothing is wrong, say so plainly — do not fabricate issues
429
+
430
+ Return ONE final message covering every file you looked at.\
431
+ """
432
+
433
+
434
+ GUIDE_ROLE = """\
435
+ You are the Aru user-guide sub-agent. You answer questions about how to
436
+ use and configure Aru itself — slash commands, permission config, skills,
437
+ plugins, tool catalog, session management.
438
+
439
+ The questions are about Aru, NOT about the user's own codebase. When in
440
+ doubt, treat the task as "explain how to do X with Aru" rather than "do X
441
+ in the user's project".
442
+
443
+ === CRITICAL: READ-ONLY MODE — NO FILE MODIFICATIONS ===
444
+ You may only read and search. No edit/write/delete/move operations.
445
+
446
+ Authoritative sources, in priority order:
447
+ 1. `AGENTS.md` at the project root — architectural reference
448
+ 2. `docs/*.md` — user-facing documentation
449
+ 3. `aru.json` examples in the codebase — config shape
450
+ 4. Reading the code under `aru/` directly (last resort — prefer docs)
451
+
452
+ Workflow:
453
+ 1. `read_file` AGENTS.md first
454
+ 2. `glob_search` + `read_file` relevant docs/*.md
455
+ 3. Search `aru.json` or permission config examples if the question is
456
+ configuration-related
457
+
458
+ Never invent features. If the docs do not cover the topic, say so and
459
+ suggest the closest available alternative. Cite file paths in your
460
+ response so the user can verify.
461
+
462
+ Return ONE final message.\
463
+ """
464
+
465
+
377
466
  def build_instructions(role: str, extra: str = "") -> str:
378
467
  """Build complete instructions for an agent role.
379
468
 
380
469
  Args:
381
- role: One of 'planner', 'executor', 'general', 'explorer'.
470
+ role: One of 'planner', 'executor', 'general', 'explorer', 'verifier',
471
+ 'reviewer', 'guide'.
382
472
  extra: Additional project-specific instructions (README, AGENTS.md, skills).
383
473
  """
384
474
  role_text = {
@@ -386,6 +476,9 @@ def build_instructions(role: str, extra: str = "") -> str:
386
476
  "executor": EXECUTOR_ROLE,
387
477
  "general": GENERAL_ROLE,
388
478
  "explorer": EXPLORER_ROLE,
479
+ "verifier": VERIFIER_ROLE,
480
+ "reviewer": REVIEWER_ROLE,
481
+ "guide": GUIDE_ROLE,
389
482
  }[role]
390
483
 
391
484
  parts = [role_text, BASE_INSTRUCTIONS]
@@ -0,0 +1,157 @@
1
+ """Native agent catalog — single source of truth for built-in agent specs.
2
+
3
+ Each AgentSpec describes a runtime-parameterized agent: prompt role, tool list,
4
+ mode (primary/subagent), and model sizing. The factory in agent_factory.py
5
+ consumes specs and builds Agno Agent instances. The runner in runner.py looks
6
+ up specs by name when handling runner.prompt(PromptInput).
7
+
8
+ Custom agents (defined via .agents/agents/*.md) follow a separate path through
9
+ create_custom_agent_instance and are NOT listed here.
10
+ """
11
+
12
+ from __future__ import annotations
13
+
14
+ from dataclasses import dataclass
15
+ from typing import Callable, Literal
16
+
17
+
18
+ @dataclass(frozen=True)
19
+ class AgentSpec:
20
+ """Static description of a native agent.
21
+
22
+ The tools_factory is a lazy callable so module load order does not force
23
+ aru.tools.codebase to be imported before this module.
24
+
25
+ `max_tokens=None` means "use the model's full cap" (see providers.py).
26
+ An explicit int caps the agent below that ceiling — providers.py always
27
+ clamps the final value to min(requested, model_cap) so specs can never
28
+ ask for more than the model supports.
29
+
30
+ `description` is the LLM-facing summary rendered into `delegate_task`'s
31
+ docstring. Only subagent specs need a meaningful description (primary
32
+ agents are never picked via `agent_name`). Keep it short (1-3 sentences)
33
+ and directive — the model uses it to decide when this agent fits.
34
+
35
+ `extra_instructions` is appended to the base role instructions when the
36
+ agent is built. Use it for agent-specific policy ("you are read-only,
37
+ never call write tools") that shouldn't leak into other roles.
38
+ """
39
+
40
+ name: str # display name passed to Agno
41
+ role: str # key into build_instructions(role, ...)
42
+ mode: Literal["primary", "subagent"]
43
+ tools_factory: Callable[[], list] # lazy resolver — invoked at agent creation
44
+ max_tokens: int | None
45
+ small_model: bool = False # if True, factory uses ctx.small_model_ref
46
+ use_reasoning: bool = True # False skips thinking params (e.g. explorer)
47
+ description: str = "" # LLM-facing summary for `delegate_task` docstring
48
+ extra_instructions: str = "" # appended to base role instructions on build
49
+
50
+
51
+ def _build_tools() -> list:
52
+ from aru.tools.registry import GENERAL_TOOLS
53
+ return GENERAL_TOOLS
54
+
55
+
56
+ def _plan_tools() -> list:
57
+ from aru.tools.registry import PLANNER_TOOLS
58
+ return PLANNER_TOOLS
59
+
60
+
61
+ def _exec_tools() -> list:
62
+ from aru.tools.registry import EXECUTOR_TOOLS
63
+ return EXECUTOR_TOOLS
64
+
65
+
66
+ def _explore_tools() -> list:
67
+ from aru.tools.registry import EXPLORER_TOOLS
68
+ return EXPLORER_TOOLS
69
+
70
+
71
+ AGENTS: dict[str, AgentSpec] = {
72
+ # Primary agents default to the model's full output cap (clamped by
73
+ # providers.create_model). Subagents keep a tight budget so a runaway
74
+ # explorer can't blow through the whole turn.
75
+ "build": AgentSpec(
76
+ name="Aru",
77
+ role="general",
78
+ mode="primary",
79
+ tools_factory=_build_tools,
80
+ max_tokens=None,
81
+ ),
82
+ "plan": AgentSpec(
83
+ name="Planner",
84
+ role="planner",
85
+ mode="primary",
86
+ tools_factory=_plan_tools,
87
+ max_tokens=4096,
88
+ ),
89
+ "executor": AgentSpec(
90
+ name="Executor",
91
+ role="executor",
92
+ mode="primary",
93
+ tools_factory=_exec_tools,
94
+ max_tokens=None,
95
+ ),
96
+ "explorer": AgentSpec(
97
+ name="Explorer",
98
+ role="explorer",
99
+ mode="subagent",
100
+ tools_factory=_explore_tools,
101
+ max_tokens=8192,
102
+ small_model=True,
103
+ use_reasoning=False, # fast read-only subagent — no thinking overhead
104
+ description=(
105
+ "Fast read-only codebase exploration agent. Use for searching "
106
+ "files, finding patterns, reading code, and understanding "
107
+ "structure. Specify thoroughness in the task text: \"quick\" "
108
+ "(basic searches), \"medium\" (moderate exploration), or "
109
+ "\"very thorough\" (comprehensive analysis)."
110
+ ),
111
+ ),
112
+ "verification": AgentSpec(
113
+ name="Verifier",
114
+ role="verifier",
115
+ mode="subagent",
116
+ tools_factory=_explore_tools, # read-only
117
+ max_tokens=4096,
118
+ small_model=True,
119
+ use_reasoning=False,
120
+ description=(
121
+ "Double-check a recent batch of edits for correctness. Reads "
122
+ "changed files, searches for call sites, reports inconsistencies "
123
+ "and missing follow-up edits. Read-only — never edits. Use after "
124
+ "non-trivial multi-file edits to catch issues before the user sees them."
125
+ ),
126
+ ),
127
+ "reviewer": AgentSpec(
128
+ name="Reviewer",
129
+ role="reviewer",
130
+ mode="subagent",
131
+ tools_factory=_explore_tools, # read-only
132
+ max_tokens=4096,
133
+ small_model=True,
134
+ use_reasoning=False,
135
+ description=(
136
+ "Code review against naming, error handling, test coverage, and "
137
+ "security heuristics. Read-only; produces bulleted findings with "
138
+ "file:line refs and severity tags. Use when you want a second "
139
+ "pair of eyes before finalising changes."
140
+ ),
141
+ ),
142
+ "guide": AgentSpec(
143
+ name="Guide",
144
+ role="guide",
145
+ mode="subagent",
146
+ tools_factory=_explore_tools, # read-only
147
+ max_tokens=4096,
148
+ small_model=True,
149
+ use_reasoning=False,
150
+ description=(
151
+ "Answer questions about using Aru itself — slash commands, "
152
+ "permission config, skills, plugins, tool catalog. Reads "
153
+ "AGENTS.md and docs/ to ground answers. Use when the user's "
154
+ "question is about Aru's features, not their own codebase."
155
+ ),
156
+ ),
157
+ }
@@ -43,6 +43,43 @@ _last_call_cache_write: int = 0
43
43
  # We normalize "length" → "max_tokens" so callers can check a single value.
44
44
  _last_call_stop_reason: str | None = None
45
45
 
46
+ # Micro-compaction metrics (process-wide, reset by tests via
47
+ # reset_microcompact_stats()). Recorded by _prune_tool_messages every time it
48
+ # fires from the format_function_call_results patch. Surfaced in /cost so
49
+ # users can see what the pre-API-call prune is actually doing — the basis
50
+ # for any future calibration of count/time-based triggers (Steps 3/4 of the
51
+ # plan, deferred until we have data here to justify them).
52
+ _microcompact_invocations: int = 0 # times _prune_tool_messages was called
53
+ _microcompact_clear_passes: int = 0 # times the prune actually cleared anything
54
+ _microcompact_results_cleared: int = 0 # cumulative tool_result blocks cleared
55
+
56
+ # Reactive overflow recovery: counts API calls where the provider rejected the
57
+ # request as too long and we wiped older tool_results then retried. Surfaced
58
+ # in /cost so users can tell when the recovery path is masking a chronically
59
+ # oversized context (suggests prune thresholds or model choice need attention).
60
+ _microcompact_overflow_recoveries: int = 0
61
+ # Aggressive prune keeps only the last N compactable tool_results, no matter
62
+ # the budget. Picked low because by definition we got here AFTER the regular
63
+ # prune (160K protect) failed to keep the context within model limits.
64
+ _OVERFLOW_RECOVERY_KEEP_RECENT = 3
65
+ # Substrings (case-insensitive) that mark a provider error as a context-too-long
66
+ # rejection. Anthropic / OpenAI / DashScope / DeepSeek / Groq all phrase it
67
+ # slightly differently; the union below covers the seen variants. Match is
68
+ # substring against str(exc) and its nested cause — wider than ideal, but a
69
+ # false positive at worst wipes older tool_results and replays the call
70
+ # once; when nothing can be pruned the original error is re-raised.
71
+ _OVERFLOW_ERROR_SIGNATURES = (
72
+ "prompt is too long",
73
+ "context length",
74
+ "context_length_exceeded",
75
+ "maximum context",
76
+ "exceeds the maximum",
77
+ "exceeds context",
78
+ "input is too long",
79
+ "too many tokens",
80
+ "request too large",
81
+ )
82
+
46
83
 
47
84
  def get_last_call_metrics() -> tuple[int, int, int, int]:
48
85
  """Return (input, output, cache_read, cache_write) from the most recent API call."""
@@ -68,6 +105,130 @@ def reset_last_stop_reason() -> None:
68
105
  _last_call_stop_reason = None
69
106
 
70
107
 
108
+ def get_microcompact_stats() -> dict:
109
+ """Return process-wide micro-compaction metrics.
110
+
111
+ Keys:
112
+ - invocations: total times _prune_tool_messages ran
113
+ - clear_passes: subset that actually cleared something
114
+ - results_cleared: cumulative tool_result blocks wiped
115
+
116
+ Used by /cost and tests. The ratio results_cleared/invocations is the
117
+ natural calibration signal for whether the budget-based trigger fires
118
+ often enough — if it's near zero across long sessions, the threshold
119
+ is too lax (or the protect window too generous).
120
+ """
121
+ return {
122
+ "invocations": _microcompact_invocations,
123
+ "clear_passes": _microcompact_clear_passes,
124
+ "results_cleared": _microcompact_results_cleared,
125
+ "overflow_recoveries": _microcompact_overflow_recoveries,
126
+ }
127
+
128
+
129
+ def reset_microcompact_stats() -> None:
130
+ """Zero the micro-compaction counters. Test-only helper."""
131
+ global _microcompact_invocations, _microcompact_clear_passes, _microcompact_results_cleared
132
+ global _microcompact_overflow_recoveries
133
+ _microcompact_invocations = 0
134
+ _microcompact_clear_passes = 0
135
+ _microcompact_results_cleared = 0
136
+ _microcompact_overflow_recoveries = 0
137
+
138
+
139
+ def _is_context_overflow_error(exc) -> bool:
140
+ """Return True iff `exc` looks like a provider context-too-long rejection.
141
+
142
+ Substring match (case-insensitive) against the str of the exception and any
143
+ nested `original_error` attribute. Wider than ideal but cheap; the recovery
144
+ path that consumes this is itself idempotent (re-running with no changes
145
+ after a no-op prune just hits the same error again and propagates).
146
+ """
147
+ msgs: list[str] = []
148
+ try:
149
+ msgs.append(str(exc))
150
+ except Exception:
151
+ pass
152
+ inner = getattr(exc, "original_error", None) or getattr(exc, "__cause__", None)
153
+ if inner is not None:
154
+ try:
155
+ msgs.append(str(inner))
156
+ except Exception:
157
+ pass
158
+ blob = " ".join(m.lower() for m in msgs if m)
159
+ return any(sig in blob for sig in _OVERFLOW_ERROR_SIGNATURES)
160
+
161
+
162
+ def _aggressive_prune(messages, keep_recent: int = _OVERFLOW_RECOVERY_KEEP_RECENT) -> int:
163
+ """Wipe content of all but the last `keep_recent` compactable tool_results.
164
+
165
+ Used reactively after a provider rejects a request as too long. Ignores the
166
+ budget walk entirely — by the time we get here, the budget-based prune
167
+ already failed to keep us under the model's context limit, so its answer
168
+ is wrong for this request.
169
+
170
+ Non-compactable tool_results (delegate_task etc.) are still preserved.
171
+ Returns the number of results actually cleared.
172
+ """
173
+ from aru.context import COMPACTABLE_TOOLS
174
+
175
+ id_to_name = _build_tool_id_to_name_map(messages)
176
+
177
+ # Collect compactable tool_result indices in encounter order.
178
+ compactable_indices: list[int] = []
179
+ for i, msg in enumerate(messages):
180
+ if getattr(msg, "role", None) != "tool":
181
+ continue
182
+ tc_id = getattr(msg, "tool_call_id", None)
183
+ tool_name = id_to_name.get(tc_id) if tc_id else None
184
+ if tool_name in COMPACTABLE_TOOLS:
185
+ compactable_indices.append(i)
186
+
187
+ if len(compactable_indices) <= keep_recent:
188
+ return 0
189
+
190
+ to_clear = compactable_indices[:-keep_recent] if keep_recent > 0 else compactable_indices
191
+ cleared = 0
192
+ for idx in to_clear:
193
+ msg = messages[idx]
194
+ content = getattr(msg, "content", None)
195
+ if content is None or str(content) == _PRUNED_PLACEHOLDER:
196
+ continue
197
+ try:
198
+ msg.content = _PRUNED_PLACEHOLDER
199
+ if hasattr(msg, "compressed_content"):
200
+ msg.compressed_content = None
201
+ cleared += 1
202
+ except (AttributeError, TypeError):
203
+ pass
204
+ return cleared
205
+
206
+
207
+ def _build_tool_id_to_name_map(messages) -> dict:
208
+ """Walk assistant messages forward, building tool_call_id → tool_name map.
209
+
210
+ Required because Agno's `role="tool"` Message carries `tool_call_id` but
211
+ not the originating tool name — the name lives on the matching
212
+ `assistant.tool_calls[i].function.name` in a previous message.
213
+ """
214
+ id_to_name: dict = {}
215
+ for msg in messages:
216
+ if getattr(msg, "role", None) != "assistant":
217
+ continue
218
+ tool_calls = getattr(msg, "tool_calls", None)
219
+ if not tool_calls:
220
+ continue
221
+ for tc in tool_calls:
222
+ tc_id = tc.get("id") if isinstance(tc, dict) else None
223
+ if not tc_id:
224
+ continue
225
+ fn = tc.get("function") if isinstance(tc, dict) else None
226
+ tc_name = fn.get("name") if isinstance(fn, dict) else None
227
+ if tc_name:
228
+ id_to_name[tc_id] = tc_name
229
+ return id_to_name
230
+
231
+
71
232
  def _prune_tool_messages(messages):
72
233
  """Clear old tool result content using a token-budget approach.
73
234
 
@@ -77,49 +238,81 @@ def _prune_tool_messages(messages):
77
238
  PRUNE_MINIMUM_CHARS (avoids unnecessary churn on small conversations).
78
239
 
79
240
  Aligned with OpenCode's strategy: budget-based, not fixed-N.
241
+
242
+ **Tool allowlist**: only outputs of tools in `COMPACTABLE_TOOLS` are
243
+ eligible for clearing. Non-compactable tools (delegate_task, invoke_skill,
244
+ tasklist mutators) still consume the protection budget but are never
245
+ pruned — their content is semantically load-bearing. The id→name map is
246
+ built from prior assistant `tool_calls` since `role="tool"` Messages carry
247
+ only the call id, not the tool name. Single source of truth lives in
248
+ `aru.context.COMPACTABLE_TOOLS`.
249
+
250
+ Returns the number of tool results actually cleared (0 if none) for
251
+ metrics consumption (exposed via `get_microcompact_stats`).
80
252
  """
81
- # Collect tool message indices and their content sizes
82
- tool_indices = []
83
- for i, msg in enumerate(messages):
84
- if getattr(msg, "role", None) == "tool":
85
- content = getattr(msg, "content", None)
86
- content_len = len(str(content)) if content is not None else 0
87
- tool_indices.append((i, content_len))
253
+ from aru.context import COMPACTABLE_TOOLS
88
254
 
89
- if not tool_indices:
90
- return
255
+ global _microcompact_invocations, _microcompact_clear_passes, _microcompact_results_cleared
256
+ _microcompact_invocations += 1
91
257
 
92
- # Walk backwards, accumulating protected chars
93
- protected_chars = 0
94
- prune_candidates = [] # (index, content_len) of messages outside protection
258
+ id_to_name = _build_tool_id_to_name_map(messages)
95
259
 
96
- for idx, content_len in reversed(tool_indices):
97
- if protected_chars + content_len <= _PRUNE_PROTECT_CHARS:
98
- protected_chars += content_len
99
- else:
260
+ # Collect tool message indices, their content sizes, and compactability.
261
+ tool_entries = [] # (index, content_len, is_compactable)
262
+ for i, msg in enumerate(messages):
263
+ if getattr(msg, "role", None) != "tool":
264
+ continue
265
+ content = getattr(msg, "content", None)
266
+ content_len = len(str(content)) if content is not None else 0
267
+ tc_id = getattr(msg, "tool_call_id", None)
268
+ tool_name = id_to_name.get(tc_id) if tc_id else None
269
+ # Defensive: if we can't resolve the name, treat as non-compactable.
270
+ # Better to leak budget than wipe a delegate_task result by mistake.
271
+ is_compactable = tool_name in COMPACTABLE_TOOLS if tool_name else False
272
+ tool_entries.append((i, content_len, is_compactable))
273
+
274
+ if not tool_entries:
275
+ return 0
276
+
277
+ # Walk backwards. ALL tool content (compactable or not) consumes the
278
+ # protection budget — the prompt carries it either way. Once the budget
279
+ # is exhausted, older entries are prune candidates ONLY if compactable;
280
+ # non-compactable old entries (delegate_task etc.) stay untouched.
281
+ running_total = 0
282
+ prune_candidates = [] # (index, content_len) of compactable messages outside protection
283
+
284
+ for idx, content_len, is_compactable in reversed(tool_entries):
285
+ in_recent_window = (running_total + content_len) <= _PRUNE_PROTECT_CHARS
286
+ running_total += content_len
287
+ if not in_recent_window and is_compactable:
100
288
  prune_candidates.append((idx, content_len))
101
289
 
102
290
  # Only prune if there's enough to free
103
291
  freeable = sum(cl for _, cl in prune_candidates)
104
292
  if freeable < _PRUNE_MINIMUM_CHARS:
105
- return
293
+ return 0
106
294
 
107
- # Replace old tool results with placeholder
295
+ cleared = 0
108
296
  for idx, _ in prune_candidates:
109
297
  msg = messages[idx]
110
298
  content = getattr(msg, "content", None)
111
299
  if content is None:
112
300
  continue
113
- # Skip if already pruned
114
301
  if str(content) == _PRUNED_PLACEHOLDER:
115
302
  continue
116
303
  try:
117
304
  msg.content = _PRUNED_PLACEHOLDER
118
305
  if hasattr(msg, "compressed_content"):
119
306
  msg.compressed_content = None
307
+ cleared += 1
120
308
  except (AttributeError, TypeError):
121
309
  pass
122
310
 
311
+ if cleared:
312
+ _microcompact_clear_passes += 1
313
+ _microcompact_results_cleared += cleared
314
+ return cleared
315
+
123
316
 
124
317
  def apply_cache_patch():
125
318
  """Apply all patches to reduce Agno's token consumption."""
@@ -127,6 +320,73 @@ def apply_cache_patch():
127
320
  _patch_claude_cache_breakpoints()
128
321
  _patch_per_call_metrics()
129
322
  _patch_stop_reason_capture()
323
+ _patch_overflow_recovery()
324
+
325
+
326
+ def _patch_overflow_recovery():
327
+ """Wrap Agno's retry loops to handle context-overflow rejections.
328
+
329
+ When the provider rejects a request as too long (after the regular pre-call
330
+ prune was insufficient), wipe content of all but the last
331
+ `_OVERFLOW_RECOVERY_KEEP_RECENT` compactable tool_results in the message
332
+ list, then call the original retry wrapper once more with the same kwargs
333
+ so the second attempt sees the now-shorter messages.
334
+
335
+ Patches both `_ainvoke_with_retry` (non-stream) and
336
+ `_ainvoke_stream_with_retry` (stream — what Aru's runner uses). Each is
337
+ wrapped to call `_aggressive_prune` once per turn before the underlying
338
+ retry fires; subsequent overflow errors propagate normally so we never
339
+ loop forever wiping the same messages.
340
+
341
+ No flag or loop is involved: the retry is a single direct call, so
342
+ recovery is attempted at most once per wrapped call — if even the
343
+ aggressive prune doesn't shrink the prompt enough, the error propagates
344
+ and the user sees it instead of a silent retry storm.
345
+ """
346
+ from agno.models.base import Model
347
+ from agno.exceptions import ModelProviderError
348
+
349
+ _orig_ainvoke = Model._ainvoke_with_retry
350
+ _orig_ainvoke_stream = Model._ainvoke_stream_with_retry
351
+
352
+ async def _patched_ainvoke_with_retry(self, **kwargs):
353
+ global _microcompact_overflow_recoveries
354
+ try:
355
+ return await _orig_ainvoke(self, **kwargs)
356
+ except ModelProviderError as e:
357
+ if not _is_context_overflow_error(e):
358
+ raise
359
+ messages = kwargs.get("messages")
360
+ if messages is None:
361
+ raise
362
+ cleared = _aggressive_prune(messages)
363
+ if cleared == 0:
364
+ raise
365
+ _microcompact_overflow_recoveries += 1
366
+ return await _orig_ainvoke(self, **kwargs)
367
+
368
+ async def _patched_ainvoke_stream_with_retry(self, **kwargs):
369
+ global _microcompact_overflow_recoveries
370
+ try:
371
+ async for response in _orig_ainvoke_stream(self, **kwargs):
372
+ yield response
373
+ return
374
+ except ModelProviderError as e:
375
+ if not _is_context_overflow_error(e):
376
+ raise
377
+ messages = kwargs.get("messages")
378
+ if messages is None:
379
+ raise
380
+ cleared = _aggressive_prune(messages)
381
+ if cleared == 0:
382
+ raise
383
+ _microcompact_overflow_recoveries += 1
384
+ # Retry once with the now-pruned messages. A second overflow propagates.
385
+ async for response in _orig_ainvoke_stream(self, **kwargs):
386
+ yield response
387
+
388
+ Model._ainvoke_with_retry = _patched_ainvoke_with_retry
389
+ Model._ainvoke_stream_with_retry = _patched_ainvoke_stream_with_retry
130
390
 
131
391
 
132
392
  def _patch_tool_result_pruning():