PyPI - aru-code - Versions diffs - 0.53.0__tar.gz → 0.55.0__tar.gz - Mend

aru-code 0.53.0__tar.gz → 0.55.0__tar.gz

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (213) hide show

{aru_code-0.53.0/aru_code.egg-info → aru_code-0.55.0}/PKG-INFO RENAMED Viewed

@@ -1,6 +1,6 @@
 Metadata-Version: 2.4
 Name: aru-code
-Version: 0.53.0
+Version: 0.55.0
 Summary: A Claude Code clone built with Agno agents
 Author-email: Estevao <estevaofon@gmail.com>
 License-Expression: MIT

aru_code-0.55.0/aru/__init__.py ADDED Viewed

	@@ -0,0 +1 @@
1	+ __version__ = "0.55.0"

{aru_code-0.53.0 → aru_code-0.55.0}/aru/cache_patch.py RENAMED Viewed

@@ -23,6 +23,9 @@ regardless of which provider is used.
 from __future__ import annotations
+import os as _os
+import time as _time
 # Token-budget pruning (aligned with OpenCode's strategy):
 # - Protect recent tool results within a token budget
 # - Only prune if there's enough to free (avoid churn)
@@ -43,6 +46,22 @@ _last_call_cache_write: int = 0
 # We normalize "length" → "max_tokens" so callers can check a single value.
 _last_call_stop_reason: str | None = None
+# Per-call observability ring buffer. Each accumulate_model_metrics fire
+# appends one record; the ring caps at _CALL_HISTORY_MAX so a long-running
+# session doesn't grow unbounded. Surfaced via /calls so users can see
+# *which* models / model_types / call sites produced each request — the
+# canonical "why are there N api_calls?" diagnosis surface.
+_CALL_HISTORY_MAX = 200
+_call_history: list[dict] = []
+# Pending request metadata captured by the request-side patch right before
+# the provider call goes out. Read by ``_patched_accumulate`` after the
+# response lands and merged into the matching call_history record so /calls
+# shows both the response usage AND a summary of what was sent. Single-
+# slot global is OK: aru runs requests sequentially per ctx, and the patch
+# captures-then-clears synchronously around each invocation.
+_pending_request_meta: dict | None = None
 # Micro-compaction metrics (process-wide, reset by tests via
 # reset_microcompact_stats()). Recorded by _prune_tool_messages every time it
 # fires from the format_function_call_results patch. Surfaced in /cost so
@@ -105,6 +124,92 @@ def reset_last_stop_reason() -> None:
     _last_call_stop_reason = None
+def _summarize_request(messages, tools=None) -> dict:
+    """Build a compact summary of an outgoing request for /calls.
+    Full message bodies are deliberately not stored — a single tool result
+    can be tens of KB and a long session would balloon memory. The summary
+    holds:
+      * ``n_messages`` and ``roles`` — message count and per-role tally
+      * ``total_chars`` — total characters across messages (prompt-size proxy)
+      * ``first_snippet`` / ``last_snippet`` — head of the first and of the
+        last message
+      * ``last_user_snippet`` — head of the most recent ``user`` message
+      * ``n_tools`` — number of tools passed along
+    Snippets are capped at 240 chars.
+    Args:
+        messages: iterable of message objects (``role`` / ``content`` /
+            ``text`` attributes) or dicts with the same keys; ``None`` or
+            empty yields an all-zero summary.
+        tools: optional sized collection of tool definitions.
+    Returns:
+        A dict with the keys listed above. Never raises: on any failure the
+        fields filled in so far are returned.
+    """
+    snippet_max = 240
+    def _field(msg, name):
+        # Accept both attribute-style message objects and plain dicts.
+        if isinstance(msg, dict):
+            return msg.get(name)
+        return getattr(msg, name, None)
+    def _text_of(msg) -> str:
+        # Single source of truth for "the text of a message": every summary
+        # field is derived from it, so first/last snippets and total_chars
+        # can never disagree about the same message.
+        body = _field(msg, "content")
+        if body is None:
+            body = _field(msg, "text")
+        if body is None:
+            return ""
+        if isinstance(body, str):
+            return body
+        try:
+            return str(body)
+        except Exception:
+            return ""
+    out = {
+        "n_messages": 0,
+        "roles": {},
+        "total_chars": 0,
+        "first_snippet": "",
+        "last_snippet": "",
+        "last_user_snippet": "",
+        "n_tools": 0,
+    }
+    try:
+        msgs = list(messages or [])
+        out["n_messages"] = len(msgs)
+        out["n_tools"] = len(tools or [])
+        for i, m in enumerate(msgs):
+            role = _field(m, "role") or "?"
+            out["roles"][role] = out["roles"].get(role, 0) + 1
+            text = _text_of(m)
+            out["total_chars"] += len(text)
+            snippet = text[:snippet_max]
+            if i == 0:
+                out["first_snippet"] = snippet
+            if role == "user":
+                out["last_user_snippet"] = snippet
+            # Overwritten every iteration, so it ends up as the final message.
+            out["last_snippet"] = snippet
+    except Exception:
+        # Deliberate best-effort: this is observability only and must never
+        # break the model call that triggered it.
+        pass
+    return out
+def _capture_request_meta(messages, tools=None) -> None:
+    """Summarize the outgoing request and park it in the pending slot.
+    The module-level ``_pending_request_meta`` is overwritten with the
+    result of ``_summarize_request(messages, tools)``.
+    """
+    global _pending_request_meta
+    summary = _summarize_request(messages, tools)
+    _pending_request_meta = summary
+def get_call_history() -> list[dict]:
+    """Return a shallow copy of the per-API-call ring buffer.
+    Each entry: ``{n, model_type, model_id, provider, input_tokens,
+    output_tokens, cache_read, cache_write, stop_reason, caller, ts,
+    request}``.
+    ``input_tokens`` is the *normalized* value (cache stripped for OpenAI-
+    style providers). ``caller`` is the ``file:line`` of the frame that
+    invoked accumulate_model_metrics — useful for telling main-model calls
+    apart from parser/output-model/memory/recovery calls. ``request`` is
+    the request summary captured before the call, or ``{}`` when none was
+    pending.
+    The returned list is new, but the entry dicts are the ones held by the
+    buffer.
+    """
+    return _call_history[:]
+def reset_call_history() -> None:
+    """Empty the call ring buffer in place (session start, tests)."""
+    del _call_history[:]
 def get_microcompact_stats() -> dict:
     """Return process-wide micro-compaction metrics.
@@ -317,6 +422,72 @@ def _prune_tool_messages(messages):
 _PATCH_APPLIED = False
+def _patch_request_capture():
+    """Wrap the agno methods that receive ``messages`` right before the
+    provider HTTP call so /calls can show what was actually sent.
+    The four ``Model._{a,}invoke{_stream,}_with_retry`` methods on
+    ``agno.models.base.Model`` are hooked (sync/async × stream/non-stream).
+    Each wrapper snapshots ``kwargs["messages"]`` / ``kwargs["tools"]`` via
+    ``_capture_request_meta`` immediately before delegating to the original.
+    Best-effort by design:
+      * if agno cannot be imported, nothing is patched;
+      * a hook method that this agno version does not define is skipped
+        instead of raising ``AttributeError`` (these are private methods of
+        a third-party library and may be renamed);
+      * any exception raised while snapshotting is swallowed so the real
+        model call is never broken.
+    Wrappers forward positional arguments untouched and keep the original
+    function metadata via ``functools.wraps``. The async stream wrapper
+    stays an async generator (``async for ... yield``) — collecting the
+    stream first would defeat streaming.
+    """
+    try:
+        from agno.models.base import Model
+    except ImportError:
+        return
+    import functools
+    def _snapshot(kwargs) -> None:
+        # Observability only: never let a summary failure reach the caller.
+        try:
+            _capture_request_meta(kwargs.get("messages"), kwargs.get("tools"))
+        except Exception:
+            pass
+    def _wrap_plain(orig):
+        # Used for the sync call AND the sync stream: the latter returns an
+        # iterator, which is handed back to the caller untouched.
+        @functools.wraps(orig)
+        def _wrapper(self, *args, **kwargs):
+            _snapshot(kwargs)
+            return orig(self, *args, **kwargs)
+        return _wrapper
+    def _wrap_coroutine(orig):
+        @functools.wraps(orig)
+        async def _wrapper(self, *args, **kwargs):
+            _snapshot(kwargs)
+            return await orig(self, *args, **kwargs)
+        return _wrapper
+    def _wrap_async_stream(orig):
+        # The original is an async generator — re-yield rather than return
+        # it (returning it from an ``async def`` would hand the caller a
+        # coroutine instead of something it can ``async for`` over).
+        @functools.wraps(orig)
+        async def _wrapper(self, *args, **kwargs):
+            _snapshot(kwargs)
+            async for chunk in orig(self, *args, **kwargs):
+                yield chunk
+        return _wrapper
+    hooks = (
+        ("_invoke_with_retry", _wrap_plain),
+        ("_ainvoke_with_retry", _wrap_coroutine),
+        ("_invoke_stream_with_retry", _wrap_plain),
+        ("_ainvoke_stream_with_retry", _wrap_async_stream),
+    )
+    for attr_name, make_wrapper in hooks:
+        orig = getattr(Model, attr_name, None)
+        if orig is None:
+            # This agno version has no such chokepoint; leave it alone.
+            continue
+        setattr(Model, attr_name, make_wrapper(orig))
 def apply_cache_patch():
     """Apply all patches to reduce Agno's token consumption.
@@ -334,6 +505,7 @@ def apply_cache_patch():
     _patch_per_call_metrics()
     _patch_stop_reason_capture()
     _patch_overflow_recovery()
+    _patch_request_capture()
     _PATCH_APPLIED = True
@@ -515,6 +687,10 @@ def _publish_live_metrics(
         session.total_output_tokens += output_tokens
         session.total_cache_read_tokens += cache_read
         session.total_cache_write_tokens += cache_write
+        # Count real API requests (one per accumulate call). track_tokens
+        # used to do this at turn-end (++1), which collapsed multi-tool
+        # turns — a turn with N tool calls = N+1 requests but counted as 1.
+        session.api_calls = (getattr(session, "api_calls", 0) or 0) + 1
         session._live_input_added = (
             getattr(session, "_live_input_added", 0) + input_tokens
         )
@@ -587,6 +763,16 @@ def _patch_per_call_metrics():
         global _last_call_input_tokens, _last_call_output_tokens
         global _last_call_cache_read, _last_call_cache_write
         usage = getattr(model_response, "response_usage", None)
+        # Capture the call site (agno file:line that invoked accumulate)
+        # cheaply — only when there's a usage object worth recording.
+        _caller_str = ""
+        if usage is not None:
+            try:
+                import sys as _sys
+                _frame = _sys._getframe(1)
+                _caller_str = f"{_os.path.basename(_frame.f_code.co_filename)}:{_frame.f_lineno}"
+            except Exception:
+                _caller_str = "?"
         if usage is not None:
             input_tokens = getattr(usage, "input_tokens", 0) or 0
             output_tokens = getattr(usage, "output_tokens", 0) or 0
@@ -603,12 +789,59 @@ def _patch_per_call_metrics():
             is_anthropic = "anthropic" in (provider_name or "").lower()
             if not is_anthropic and cache_read and input_tokens >= cache_read:
                 input_tokens -= cache_read
+                # Mutate the shared usage object so the downstream
+                # ``_original_accumulate`` writes the *normalized* value
+                # into Agno's RunMetrics. Without this, RunMetrics keeps
+                # the raw (cache-inclusive) input while ``_last_call_*``
+                # and the live publish hold the normalized one, and
+                # ``Session.track_tokens`` reconciliation re-adds the
+                # cached portion as a fake "missing delta" — exactly the
+                # cumulative-vs-last asymmetry users see in /cost.
+                try:
+                    usage.input_tokens = input_tokens
+                except (AttributeError, TypeError):
+                    pass
             _last_call_input_tokens = input_tokens
             _last_call_output_tokens = output_tokens
             _last_call_cache_read = cache_read
             _last_call_cache_write = cache_write
+            # Per-call observability: append to the ring buffer so /calls
+            # can show breakdown by model_type (MODEL vs PARSER_MODEL vs
+            # MEMORY_MODEL etc.) and call site. Bounded to _CALL_HISTORY_MAX
+            # so a long session doesn't grow unbounded.
+            _model_id = ""
+            try:
+                _model_id = getattr(model, "id", "") or ""
+            except Exception:
+                pass
+            _mt_str = (
+                model_type.value
+                if hasattr(model_type, "value")
+                else str(model_type)
+            )
+            global _pending_request_meta
+            _req_meta = _pending_request_meta or {}
+            _pending_request_meta = None
+            _call_history.append({
+                "n": len(_call_history) + 1,
+                "model_type": _mt_str,
+                "model_id": _model_id,
+                "provider": provider_name or "",
+                "input_tokens": input_tokens,
+                "output_tokens": output_tokens,
+                "cache_read": cache_read,
+                "cache_write": cache_write,
+                "stop_reason": _last_call_stop_reason,
+                "caller": _caller_str,
+                "ts": _time.time(),
+                "request": _req_meta,
+            })
+            if len(_call_history) > _CALL_HISTORY_MAX:
+                # Keep the most recent N — drop from the front.
+                del _call_history[: len(_call_history) - _CALL_HISTORY_MAX]
             # Intra-turn live session update + bus publish. Gated to the
             # primary agent (subagent_depth == 0) so subagent API calls
             # don't double-count — delegate_task adds subagent totals in

{aru_code-0.53.0 → aru_code-0.55.0}/aru/cli.py RENAMED Viewed

@@ -761,6 +761,15 @@ async def run_cli(skip_permissions: bool = False, resume_id: str | None = None):
             ))
             continue
+        if user_input.lower() == "/calls":
+            console.print(Panel(
+                session.calls_summary,
+                title="[bold]Per-API-Call Breakdown[/bold]",
+                border_style="cyan",
+                padding=(1, 2),
+            ))
+            continue
         if user_input.lower() == "/subagents":
             from aru.commands import handle_subagents_command
             handle_subagents_command(session)

{aru_code-0.53.0 → aru_code-0.55.0}/aru/commands.py RENAMED Viewed

@@ -31,6 +31,7 @@ SLASH_COMMANDS = [
     ("/debug", "Debug utilities (plugin-errors)", "/debug <subcommand>"),
     ("/undo", "Undo last turn — restore files and/or conversation", "/undo"),
     ("/cost", "Show detailed token usage and cost", "/cost"),
+    ("/calls", "Show per-API-call breakdown (model_type, tokens, stop_reason, caller)", "/calls"),
     ("/yolo", "Toggle DANGEROUSLY skip all permissions (YOLO mode)", "/yolo"),
     ("/quit", "Exit aru", "/quit"),
 ]

{aru_code-0.53.0 → aru_code-0.55.0}/aru/permissions.py RENAMED Viewed

@@ -27,10 +27,31 @@ from rich.console import Group
 from rich.panel import Panel
 from rich.text import Text
-from aru.runtime import get_ctx
+from aru.runtime import begin_permission_wait, end_permission_wait, get_ctx
 from aru.select import select_option
+@contextmanager
+def _permission_prompt_scope(ctx):
+    """Hold ``permission_lock`` while marking the tool-call permission gate.
+    The gate tells the surrounding ``_thread_tool`` wrapper to suspend its
+    execution-timeout for as long as we are blocked here — acquiring the lock
+    can wait on a sibling tool's open prompt, and the prompt itself waits on
+    the user. Without this, the tool could report a timeout mid-prompt and
+    then apply the mutation out-of-band once the user finally answered.
+    ``begin_permission_wait`` runs BEFORE the lock is acquired so the
+    lock-wait is covered too; it is a no-op when no gate is installed (async
+    tools, tests). See ``aru.runtime.PermissionWaitGate``.
+    Args:
+        ctx: object whose ``permission_lock`` attribute is usable in a
+            ``with`` statement.
+    Yields:
+        Nothing; the ``with`` body runs while ``ctx.permission_lock`` is held.
+    """
+    # Open the wait window first so time spent waiting for the lock counts too.
+    begin_permission_wait()
+    try:
+        with ctx.permission_lock:
+            yield
+    finally:
+        # Always close the window, even if the lock or the body raised.
+        end_permission_wait()
 def _resolve_ui(ctx):
     """Return ``ctx.ui`` or install a ``ReplUI`` on-the-fly.
@@ -891,8 +912,11 @@ def check_permission(
         except Exception:
             pass  # never let plugin errors block permissions
-    # action == "ask" -> prompt user
-    with ctx.permission_lock:
+    # action == "ask" -> prompt user. The scope holds permission_lock AND
+    # suspends the tool-execution timeout while we block on the user (see
+    # _permission_prompt_scope) — so a slow human decision can never let the
+    # tool time out and then apply the mutation out-of-band.
+    with _permission_prompt_scope(ctx):
         # Re-check after acquiring lock (another thread may have resolved it)
         results2 = _resolve_many(category, subjects)
         if any(action == "deny" for action, _ in results2):

{aru_code-0.53.0 → aru_code-0.55.0}/aru/runner.py RENAMED Viewed

@@ -9,6 +9,7 @@ from dataclasses import dataclass, field
 from rich.markdown import Markdown
 from aru.display import console
+from aru.session import Session
 # Categories of tools that modify files (for highlighting in history)
@@ -640,18 +641,30 @@ async def run_agent_capture(agent, message: str, session=None, lightweight: bool
         })
         # Tier 2 #4: auto-memory extraction (opt-in, fire-and-forget).
+        # ``turn_tokens`` here is the size of the *exchange* (user message +
+        # assistant reply) — NOT the API call's prompt size. Earlier this
+        # used ``last_input_tokens + last_output_tokens``, but
+        # ``last_input_tokens`` includes the entire system prompt (~8K on
+        # aru with 30+ tools), so ``min_turn_tokens=500`` always tripped
+        # even on "Olá"/"ok"-style turns and the extractor fired every
+        # turn — burning the curator budget on nothing. Estimating from
+        # user+assistant char length matches the docstring intent
+        # ("trivial turns 'ok'/'thanks' don't trigger").
         try:
             from aru.memory.extractor import schedule_extraction_task
             from aru.runtime import get_ctx as _get_ctx
             _cfg = getattr(_get_ctx(), "config", None)
             _cfg_memory = getattr(_cfg, "memory", None) or {}
             _project_root = getattr(session, "project_root", None) or os.getcwd()
+            _exchange_tokens = Session.estimate_tokens(
+                (run_message or "") + (final_content or "")
+            )
             schedule_extraction_task(
                 project_root=_project_root,
                 user_msg=run_message or "",
                 assistant_msg=final_content or "",
                 config_memory=_cfg_memory,
-                turn_tokens=_turn_tokens_in + _turn_tokens_out,
+                turn_tokens=_exchange_tokens,
             )
         except Exception:
             pass  # extractor guards internally; swallow any unexpected raise

{aru_code-0.53.0 → aru_code-0.55.0}/aru/runtime.py RENAMED Viewed

@@ -345,6 +345,101 @@ def is_aborted() -> bool:
         return False
+# ── Permission-wait gate (tool-timeout suspension) ───────────────────
+#
+# Safety-critical. A tool's execution timeout (see
+# ``aru.tools._shared._thread_tool``) must NOT count the time a human spends
+# deciding on a permission prompt. The danger is concrete: if the timeout
+# fired while a prompt was still open, ``asyncio`` reports a timeout to the
+# model — but the worker thread it ran on CANNOT be killed (a Python
+# limitation), so it keeps running, parked on the prompt. The moment the user
+# clicks "yes", that orphaned thread applies the mutation **out-of-band**,
+# after the tool already claimed it timed out. An edit (or a delete) then
+# lands that the user never knowingly approved in-context.
+#
+# To prevent that, ``check_permission`` marks a per-tool-call gate while it
+# blocks on the user, and ``_thread_tool`` suspends its timeout for exactly
+# as long as the gate is active. Decision time is the human's, not the
+# tool's budget.
+#
+# Cross-thread mechanics: the gate object is created per tool call by the
+# ``_thread_tool`` wrapper (on the event loop) and stored in a ContextVar.
+# ``asyncio.to_thread`` copies the context into the worker thread, so
+# ``check_permission`` running on that thread flips the SAME gate object the
+# wrapper polls. Concurrent tool calls each get their own gate, so a prompt
+# open for one call never accidentally exempts a sibling.
+class PermissionWaitGate:
+    """Per-tool-call counter of in-flight human permission decisions.
+    The state is a depth counter rather than a flag, so nested or repeated
+    permission checks inside one tool call balance out. ``active`` reports
+    whether at least one decision is still outstanding. All access to the
+    counter goes through an internal ``threading.Lock``.
+    """
+    __slots__ = ("_depth", "_lock")
+    def __init__(self) -> None:
+        self._lock = threading.Lock()
+        self._depth = 0
+    @property
+    def active(self) -> bool:
+        """True while the counter is above zero."""
+        with self._lock:
+            outstanding = self._depth
+        return outstanding > 0
+    def enter(self) -> None:
+        """Record one more outstanding decision."""
+        with self._lock:
+            self._depth = self._depth + 1
+    def leave(self) -> None:
+        """Record one finished decision; the counter never drops below zero."""
+        with self._lock:
+            self._depth = max(self._depth - 1, 0)
+_perm_wait_gate: contextvars.ContextVar[PermissionWaitGate | None] = (
+    contextvars.ContextVar("aru_perm_wait_gate", default=None)
+)
+def install_permission_wait_gate() -> tuple[PermissionWaitGate, contextvars.Token]:
+    """Bind a brand-new gate in the current context and return (gate, token).
+    Intended for the ``_thread_tool`` wrapper, which calls this before
+    offloading to a worker thread so the worker (running in a copy of this
+    context) sees the same gate. The token is what
+    ``reset_permission_wait_gate(token)`` needs to undo the binding —
+    call that in a ``finally``.
+    """
+    fresh_gate = PermissionWaitGate()
+    return fresh_gate, _perm_wait_gate.set(fresh_gate)
+def reset_permission_wait_gate(token: contextvars.Token) -> None:
+    """Undo ``install_permission_wait_gate`` by resetting the context variable.
+    ``token`` is the one returned alongside the gate at install time.
+    """
+    _perm_wait_gate.reset(token)
+def begin_permission_wait() -> None:
+    """Open a permission-wait window on the current context's gate.
+    Must be balanced by ``end_permission_wait()``. Does nothing when no gate
+    is installed (async tools like ``bash`` that aren't wrapped by
+    ``_thread_tool``, or direct test calls) — those paths have no execution
+    timeout to suspend.
+    """
+    current = _perm_wait_gate.get()
+    if current is None:
+        return
+    current.enter()
+def end_permission_wait() -> None:
+    """Close the window opened by ``begin_permission_wait()``.
+    Does nothing when no gate is installed in the current context.
+    """
+    current = _perm_wait_gate.get()
+    if current is None:
+        return
+    current.leave()
 # ── Shared-state helpers (Stage 4) ───────────────────────────────────
 #
 # Individual ``dict[k] = v``, ``dict.get(k)``, and ``list.append`` are atomic

aru-code 0.53.0__tar.gz → 0.55.0__tar.gz

aru-code 0.53.0__tar.gz → 0.55.0__tar.gz