stirrup 0.1.2-py3-none-any.whl → 0.1.3-py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
stirrup/__init__.py CHANGED
@@ -35,6 +35,7 @@ from stirrup.core.models import (
     AssistantMessage,
     AudioContentBlock,
     ChatMessage,
+    EmptyParams,
     ImageContentBlock,
     LLMClient,
     SubAgentMetadata,
@@ -58,6 +59,7 @@ __all__ = [
     "AudioContentBlock",
     "ChatMessage",
     "ContextOverflowError",
+    "EmptyParams",
     "ImageContentBlock",
     "LLMClient",
     "SubAgentMetadata",
@@ -67,7 +67,6 @@ class ChatCompletionsClient(LLMClient):
         *,
         base_url: str | None = None,
         api_key: str | None = None,
-        supports_audio_input: bool = False,
         reasoning_effort: str | None = None,
         timeout: float | None = None,
         max_retries: int = 2,
@@ -82,7 +81,6 @@ class ChatCompletionsClient(LLMClient):
                 Use for OpenAI-compatible providers (e.g., 'http://localhost:8000/v1').
             api_key: API key for authentication. If None, reads from OPENROUTER_API_KEY
                 environment variable.
-            supports_audio_input: Whether the model supports audio inputs. Defaults to False.
             reasoning_effort: Reasoning effort level for extended thinking models
                 (e.g., 'low', 'medium', 'high'). Only used with o1/o3 style models.
             timeout: Request timeout in seconds. If None, uses OpenAI SDK default.
@@ -92,7 +90,6 @@ class ChatCompletionsClient(LLMClient):
         """
         self._model = model
         self._max_tokens = max_tokens
-        self._supports_audio_input = supports_audio_input
         self._reasoning_effort = reasoning_effort
         self._kwargs = kwargs or {}

@@ -7,7 +7,7 @@ Requires the litellm extra: `pip install stirrup[litellm]`
 """

 import logging
-from typing import Any
+from typing import Any, Literal

 try:
     from litellm import acompletion
@@ -38,6 +38,8 @@ __all__ = [

 LOGGER = logging.getLogger(__name__)

+type ReasoningEffort = Literal["none", "minimal", "low", "medium", "high", "xhigh", "default"]
+

 class LiteLLMClient(LLMClient):
     """LiteLLM-based client supporting multiple LLM providers with unified interface.
@@ -49,8 +51,8 @@ class LiteLLMClient(LLMClient):
         self,
         model_slug: str,
         max_tokens: int,
-        supports_audio_input: bool = False,
-        reasoning_effort: str | None = None,
+        api_key: str | None = None,
+        reasoning_effort: ReasoningEffort | None = None,
         kwargs: dict[str, Any] | None = None,
     ) -> None:
         """Initialize LiteLLM client with model configuration and capabilities.
@@ -58,15 +60,13 @@ class LiteLLMClient(LLMClient):
         Args:
             model_slug: Model identifier for LiteLLM (e.g., 'anthropic/claude-3-5-sonnet-20241022')
             max_tokens: Maximum context window size in tokens
-            supports_audio_input: Whether the model supports audio inputs
             reasoning_effort: Reasoning effort level for extended thinking models (e.g., 'medium', 'high')
             kwargs: Additional arguments to pass to LiteLLM completion calls
         """
         self._model_slug = model_slug
-        self._supports_video_input = False
-        self._supports_audio_input = supports_audio_input
         self._max_tokens = max_tokens
-        self._reasoning_effort = reasoning_effort
+        self._reasoning_effort: ReasoningEffort | None = reasoning_effort
+        self._api_key = api_key
         self._kwargs = kwargs or {}

     @property
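For reference, a minimal construction sketch under the revised signature — the model slug is the docstring's own example, while the token count and key are placeholders, not values from this diff. (The `type` alias statement used for ReasoningEffort requires Python 3.12+.)

    # Hypothetical usage: api_key replaces the removed supports_audio_input flag,
    # and reasoning_effort is now constrained to the ReasoningEffort literals.
    client = LiteLLMClient(
        model_slug="anthropic/claude-3-5-sonnet-20241022",
        max_tokens=200_000,       # illustrative context window size
        api_key="sk-...",         # optional; forwarded to acompletion()
        reasoning_effort="high",  # one of: none, minimal, low, medium, high, xhigh, default
    )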
@@ -92,6 +92,8 @@ class LiteLLMClient(LLMClient):
             tools=to_openai_tools(tools) if tools else None,
             tool_choice="auto" if tools else None,
             max_tokens=self._max_tokens,
+            reasoning_effort=self._reasoning_effort,
+            api_key=self._api_key,
             **self._kwargs,
         )

@@ -103,14 +105,20 @@ class LiteLLMClient(LLMClient):
         )

         msg = choice["message"]
-
         reasoning: Reasoning | None = None
         if getattr(msg, "reasoning_content", None) is not None:
             reasoning = Reasoning(content=msg.reasoning_content)
         if getattr(msg, "thinking_blocks", None) is not None and len(msg.thinking_blocks) > 0:
-            reasoning = Reasoning(
-                signature=msg.thinking_blocks[0]["signature"], content=msg.thinking_blocks[0]["content"]
-            )
+            if len(msg.thinking_blocks) > 1:
+                raise ValueError("Found multiple thinking blocks in the response")
+
+            signature = msg.thinking_blocks[0].get("thinking_signature", None)
+            content = msg.thinking_blocks[0].get("thinking", None)
+
+            if signature is None and content is None:
+                raise ValueError("Signature and content not found in the thinking block response")
+
+            reasoning = Reasoning(signature=signature, content=content)

         usage = r["usage"]

@@ -119,6 +127,7 @@ class LiteLLMClient(LLMClient):
                 tool_call_id=tc.get("id"),
                 name=tc["function"]["name"],
                 arguments=tc["function"].get("arguments", "") or "",
+                signature=tc.get("provider_specific_fields", {}).get("thought_signature", None),
             )
             for tc in (msg.get("tool_calls") or [])
         ]
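Note that the thinking-block handling above also switches key names: the old code indexed "signature" and "content", while the new code reads "thinking_signature" and "thinking" via .get() and fails loudly on multiple blocks or a fully empty block. A self-contained sketch against a hand-written payload — the dict shape is an assumption based only on the keys the code reads:

    # Hypothetical thinking block mirroring the keys the new parser expects.
    block = {"thinking_signature": "sig-abc", "thinking": "step-by-step reasoning"}

    signature = block.get("thinking_signature")
    content = block.get("thinking")
    if signature is None and content is None:
        raise ValueError("Signature and content not found in the thinking block response")
    # -> Reasoning(signature="sig-abc", content="step-by-step reasoning")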
stirrup/clients/utils.py CHANGED
@@ -12,6 +12,7 @@ from stirrup.core.models import (
     AudioContentBlock,
     ChatMessage,
     Content,
+    EmptyParams,
     ImageContentBlock,
     SystemMessage,
     Tool,
@@ -47,7 +48,7 @@ def to_openai_tools(tools: dict[str, Tool]) -> list[dict[str, Any]]:
             "name": t.name,
             "description": t.description,
         }
-        if t.parameters is not None:
+        if t.parameters is not EmptyParams:
             function["parameters"] = t.parameters.model_json_schema()
         tool_payload: dict[str, Any] = {
             "type": "function",
@@ -139,6 +140,10 @@ def to_openai_messages(msgs: list[ChatMessage]) -> list[dict[str, Any]]:
             tool_dict = tool.model_dump()
             tool_dict["id"] = tool.tool_call_id
             tool_dict["type"] = "function"
+            if tool.signature is not None:
+                tool_dict["provider_specific_fields"] = {
+                    "thought_signature": tool.signature,
+                }
             tool_dict["function"] = {
                 "name": tool.name,
                 "arguments": tool.arguments,
stirrup/constants.py CHANGED
@@ -1,14 +1,18 @@
+from typing import Literal
+
 # Tool naming
-FINISH_TOOL_NAME = "finish"
+FINISH_TOOL_NAME: Literal["finish"] = "finish"

 # Agent execution limits
 AGENT_MAX_TURNS = 30  # Maximum agent turns before forced termination
 CONTEXT_SUMMARIZATION_CUTOFF = 0.7  # Context window usage threshold (0.0-1.0) that triggers message summarization
+TURNS_REMAINING_WARNING_THRESHOLD = 20

 # Media resolution limits
 RESOLUTION_1MP = 1_000_000  # 1 megapixel - default max resolution for images
 RESOLUTION_480P = 640 * 480  # 480p video resolution

 # Code execution
-SUBMISSION_SANDBOX_TIMEOUT = 60 * 10  # 10 minutes
+SANDBOX_TIMEOUT = 60 * 10  # 10 minutes
+SANDBOX_REQUEST_TIMEOUT = 60 * 3  # 3 minutes
 E2B_SANDBOX_TEMPLATE_ALIAS = "e2b-sandbox"
stirrup/core/agent.py CHANGED
@@ -2,9 +2,9 @@
 import contextvars
 import glob as glob_module
 import inspect
-import json
 import logging
 import re
+import signal
 from contextlib import AsyncExitStack
 from dataclasses import dataclass, field
 from itertools import chain, takewhile
@@ -19,7 +19,9 @@ from stirrup.constants import (
     AGENT_MAX_TURNS,
     CONTEXT_SUMMARIZATION_CUTOFF,
     FINISH_TOOL_NAME,
+    TURNS_REMAINING_WARNING_THRESHOLD,
 )
+from stirrup.core.cache import CacheManager, CacheState, compute_task_hash
 from stirrup.core.models import (
     AssistantMessage,
     ChatMessage,
@@ -72,6 +74,7 @@ class SessionState:
     depth: int = 0
     uploaded_file_paths: list[str] = field(default_factory=list)  # Paths of files uploaded to exec_env
     skills_metadata: list[SkillMetadata] = field(default_factory=list)  # Loaded skills metadata
+    logger: AgentLoggerBase | None = None  # Logger for pause/resume during user input


 _SESSION_STATE: contextvars.ContextVar[SessionState] = contextvars.ContextVar("session_state")
@@ -112,17 +115,19 @@ def _handle_text_only_tool_responses(tool_messages: list[ToolMessage]) -> tuple[
     return tool_messages, user_messages


-def _get_total_token_usage(messages: list[list[ChatMessage]]) -> TokenUsage:
-    """Aggregate token usage across all assistant messages in grouped conversation history.
+def _get_total_token_usage(messages: list[list[ChatMessage]]) -> list[TokenUsage]:
+    """
+    Returns a list of TokenUsage objects aggregated from all AssistantMessage
+    instances across the provided grouped message history.

     Args:
-        messages: List of message groups, where each group represents a segment of conversation.
+        messages: A list where each item is a list of ChatMessage objects representing a segment
+            or turn group of the conversation history.

+    Returns:
+        List of TokenUsage corresponding to each AssistantMessage in the flattened conversation history.
     """
-    return sum(
-        [msg.token_usage for msg in chain.from_iterable(messages) if isinstance(msg, AssistantMessage)],
-        start=TokenUsage(),
-    )
+    return [msg.token_usage for msg in chain.from_iterable(messages) if isinstance(msg, AssistantMessage)]


 class SubAgentParams(BaseModel):
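Callers that relied on the old aggregate can recover it by summing the returned list — assuming TokenUsage still supports addition with a zero-value start, as the removed sum(..., start=TokenUsage()) implies:

    per_message = _get_total_token_usage(full_msg_history)
    total = sum(per_message, start=TokenUsage())  # reproduces the old aggregate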
@@ -176,6 +181,7 @@ class Agent[FinishParams: BaseModel, FinishMeta]:
         finish_tool: Tool[FinishParams, FinishMeta] | None = None,
         # Agent options
         context_summarization_cutoff: float = CONTEXT_SUMMARIZATION_CUTOFF,
+        turns_remaining_warning_threshold: int = TURNS_REMAINING_WARNING_THRESHOLD,
         run_sync_in_thread: bool = True,
         text_only_tool_responses: bool = True,
         # Logging
@@ -215,6 +221,7 @@ class Agent[FinishParams: BaseModel, FinishMeta]:
         self._tools = tools if tools is not None else DEFAULT_TOOLS
         self._finish_tool: Tool = finish_tool if finish_tool is not None else SIMPLE_FINISH_TOOL
         self._context_summarization_cutoff = context_summarization_cutoff
+        self._turns_remaining_warning_threshold = turns_remaining_warning_threshold
         self._run_sync_in_thread = run_sync_in_thread
         self._text_only_tool_responses = text_only_tool_responses

@@ -225,6 +232,8 @@ class Agent[FinishParams: BaseModel, FinishMeta]:
         self._pending_output_dir: Path | None = None
         self._pending_input_files: str | Path | list[str | Path] | None = None
         self._pending_skills_dir: Path | None = None
+        self._resume: bool = False
+        self._clear_cache_on_success: bool = True

         # Instance-scoped state (populated during __aenter__, isolated per agent instance)
         self._active_tools: dict[str, Tool] = {}
@@ -232,6 +241,10 @@ class Agent[FinishParams: BaseModel, FinishMeta]:
         self._last_run_metadata: dict[str, list[Any]] = {}
         self._transferred_paths: list[str] = []  # Paths transferred to parent (for subagents)

+        # Cache state for resumption (set during run(), used in __aexit__ for caching on interrupt)
+        self._current_task_hash: str | None = None
+        self._current_run_state: CacheState | None = None
+
     @property
     def name(self) -> str:
         """The name of this agent."""
@@ -262,6 +275,8 @@ class Agent[FinishParams: BaseModel, FinishMeta]:
         output_dir: Path | str | None = None,
         input_files: str | Path | list[str | Path] | None = None,
         skills_dir: Path | str | None = None,
+        resume: bool = False,
+        clear_cache_on_success: bool = True,
     ) -> Self:
         """Configure a session and return self for use as async context manager.

@@ -277,6 +292,13 @@ class Agent[FinishParams: BaseModel, FinishMeta]:
             skills_dir: Directory containing skill definitions to load and make available
                 to the agent. Skills are uploaded to the execution environment
                 and their metadata is included in the system prompt.
+            resume: If True, attempt to resume from cached state if available.
+                The cache is identified by hashing the init_msgs passed to run().
+                Cached state includes message history, current turn, and execution
+                environment files from a previous interrupted run.
+            clear_cache_on_success: If True (default), automatically clear the cache
+                when the agent completes successfully. Set to False
+                to preserve caches for inspection or debugging.

         Returns:
             Self, for use with `async with agent.session(...) as session:`
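A sketch of how the new session flags compose with run(), based on the docstring above (the agent value and task string are placeholders):

    async with agent.session(output_dir="out", resume=True, clear_cache_on_success=False) as session:
        # If a cache keyed by the hash of init_msgs exists, run() resumes from the
        # saved turn; otherwise it starts fresh. The cache is kept for inspection.
        finish_params, history, metadata = await session.run("Summarize the input files")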
@@ -293,8 +315,18 @@ class Agent[FinishParams: BaseModel, FinishMeta]:
         self._pending_output_dir = Path(output_dir) if output_dir else None
         self._pending_input_files = input_files
         self._pending_skills_dir = Path(skills_dir) if skills_dir else None
+        self._resume = resume
+        self._clear_cache_on_success = clear_cache_on_success
         return self

+    def _handle_interrupt(self, _signum: int, _frame: object) -> None:
+        """Handle SIGINT to ensure caching before exit.
+
+        Converts the signal to a KeyboardInterrupt exception so that __aexit__
+        is properly called and can cache the state before cleanup.
+        """
+        raise KeyboardInterrupt("Agent interrupted - state will be cached")
+
     def _resolve_input_files(self, input_files: str | Path | list[str | Path]) -> list[Path]:
         """Resolve input file paths, expanding globs and normalizing to Path objects.

@@ -410,6 +442,15 @@ class Agent[FinishParams: BaseModel, FinishMeta]:
         # Base prompt with max_turns
         parts.append(BASE_SYSTEM_PROMPT_TEMPLATE.format(max_turns=self._max_turns))

+        # User interaction guidance based on whether user_input tool is available
+        if "user_input" in self._active_tools:
+            parts.append(
+                " You have access to the user_input tool which allows you to ask the user "
+                "questions when you need clarification or are uncertain about something."
+            )
+        else:
+            parts.append(" You are not able to interact with the user during the task.")
+
         # Input files section (if any were uploaded)
         state = _SESSION_STATE.get(None)
         if state and state.uploaded_file_paths:
@@ -514,6 +555,7 @@ class Agent[FinishParams: BaseModel, FinishMeta]:
             output_dir=str(self._pending_output_dir) if self._pending_output_dir else None,
             parent_exec_env=parent_state.exec_env if parent_state else None,
             depth=current_depth,
+            logger=self._logger,
         )
         _SESSION_STATE.set(state)

@@ -621,6 +663,11 @@ class Agent[FinishParams: BaseModel, FinishMeta]:
             # depth is already set (0 for main agent, passed in for sub-agents)
             self._logger.__enter__()

+            # Set up signal handler for graceful caching on interrupt (root agent only)
+            if current_depth == 0:
+                self._original_sigint = signal.getsignal(signal.SIGINT)
+                signal.signal(signal.SIGINT, self._handle_interrupt)
+
             return self

         except Exception:
@@ -642,6 +689,47 @@ class Agent[FinishParams: BaseModel, FinishMeta]:
         state = _SESSION_STATE.get()

         try:
+            # Cache state on non-success exit (only at root level)
+            should_cache = (
+                state.depth == 0
+                and (exc_type is not None or self._last_finish_params is None)
+                and self._current_task_hash is not None
+                and self._current_run_state is not None
+            )
+
+            logger.debug(
+                "[%s __aexit__] Cache decision: should_cache=%s, depth=%d, exc_type=%s, "
+                "finish_params=%s, task_hash=%s, run_state=%s",
+                self._name,
+                should_cache,
+                state.depth,
+                exc_type,
+                self._last_finish_params is not None,
+                self._current_task_hash,
+                self._current_run_state is not None,
+            )
+
+            if should_cache:
+                cache_manager = CacheManager(clear_on_success=self._clear_cache_on_success)
+
+                exec_env_dir = state.exec_env.temp_dir if state.exec_env else None
+
+                # Explicit checks to keep type checker happy - should_cache condition guarantees these
+                if self._current_task_hash is None or self._current_run_state is None:
+                    raise ValueError("Cache state is unexpectedly None after should_cache check")
+
+                # Temporarily block SIGINT during cache save to prevent interruption
+                original_handler = signal.getsignal(signal.SIGINT)
+                signal.signal(signal.SIGINT, signal.SIG_IGN)
+                try:
+                    cache_manager.save_state(
+                        self._current_task_hash,
+                        self._current_run_state,
+                        exec_env_dir,
+                    )
+                finally:
+                    signal.signal(signal.SIGINT, original_handler)
+                self._logger.info(f"Cached state for task {self._current_task_hash}")
             # Save files from finish_params.paths based on depth
             if state.output_dir and self._last_finish_params and state.exec_env:
                 paths = getattr(self._last_finish_params, "paths", None)
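The save runs inside a temporary SIG_IGN window so a second Ctrl-C cannot interrupt a half-written cache. The same pattern as a reusable helper — a generic sketch, not stirrup API:

    import signal
    from contextlib import contextmanager

    @contextmanager
    def sigint_blocked():
        """Ignore SIGINT for the duration of a critical section, then restore."""
        original = signal.getsignal(signal.SIGINT)
        signal.signal(signal.SIGINT, signal.SIG_IGN)
        try:
            yield
        finally:
            signal.signal(signal.SIGINT, original)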
@@ -696,6 +784,11 @@ class Agent[FinishParams: BaseModel, FinishMeta]:
                 state.depth,
             )
         finally:
+            # Restore original signal handler (root agent only)
+            if hasattr(self, "_original_sigint"):
+                signal.signal(signal.SIGINT, self._original_sigint)
+                del self._original_sigint
+
             # Exit logger context
             self._logger.finish_params = self._last_finish_params
             self._logger.run_metadata = self._last_run_metadata
@@ -721,10 +814,7 @@ class Agent[FinishParams: BaseModel, FinishMeta]:

         if tool:
             try:
-                # Parse parameters if the tool has them, otherwise use None
-                params = (
-                    tool.parameters.model_validate_json(tool_call.arguments) if tool.parameters is not None else None
-                )
+                params = tool.parameters.model_validate_json(tool_call.arguments)

                 # Set parent depth for sub-agent tools to read
                 prev_depth = _PARENT_DEPTH.set(self._logger.depth)
@@ -749,17 +839,18 @@ class Agent[FinishParams: BaseModel, FinishMeta]:
                     tool_call.name,
                     tool_call.arguments,
                 )
-                result = ToolResult(content="Tool arguments are not valid")
+                result = ToolResult(content="Tool arguments are not valid", success=False)
                 args_valid = False
         else:
             LOGGER.debug(f"LLMClient tried to use the tool {tool_call.name} which is not in the tools list")
-            result = ToolResult(content=f"{tool_call.name} is not a valid tool")
+            result = ToolResult(content=f"{tool_call.name} is not a valid tool", success=False)

         return ToolMessage(
             content=result.content,
             tool_call_id=tool_call.tool_call_id,
             name=tool_call.name,
             args_was_valid=args_valid,
+            success=result.success,
         )

     async def step(
@@ -768,7 +859,7 @@ class Agent[FinishParams: BaseModel, FinishMeta]:
         run_metadata: dict[str, list[Any]],
         turn: int = 0,
         max_turns: int = 0,
-    ) -> tuple[AssistantMessage, list[ToolMessage], ToolCall | None]:
+    ) -> tuple[AssistantMessage, list[ToolMessage], FinishParams | None]:
         """Execute one agent step: generate assistant message and run any requested tool calls.

         Args:
@@ -786,24 +877,21 @@ class Agent[FinishParams: BaseModel, FinishMeta]:
         if turn > 0:
             self._logger.assistant_message(turn, max_turns, assistant_message)

+        finish_params: FinishParams | None = None
         tool_messages: list[ToolMessage] = []
-        finish_call: ToolCall | None = None
-
         if assistant_message.tool_calls:
-            finish_call = next(
-                (tc for tc in assistant_message.tool_calls if tc.name == FINISH_TOOL_NAME),
-                None,
-            )
-
             tool_messages = []
             for tool_call in assistant_message.tool_calls:
                 tool_message = await self.run_tool(tool_call, run_metadata)
                 tool_messages.append(tool_message)

+                if tool_message.success and tool_message.name == FINISH_TOOL_NAME:
+                    finish_params = self._finish_tool.parameters.model_validate_json(tool_call.arguments)
+
                 # Log tool result immediately
                 self._logger.tool_result(tool_message)

-        return assistant_message, tool_messages, finish_call
+        return assistant_message, tool_messages, finish_params

     async def summarize_messages(self, messages: list[ChatMessage]) -> list[ChatMessage]:
         """Condense message history using LLM to stay within context window."""
@@ -829,7 +917,7 @@ class Agent[FinishParams: BaseModel, FinishMeta]:
         init_msgs: str | list[ChatMessage],
         *,
         depth: int | None = None,
-    ) -> tuple[FinishParams | None, list[list[ChatMessage]], dict[str, list[Any]]]:
+    ) -> tuple[FinishParams | None, list[list[ChatMessage]], dict[str, Any]]:
         """Execute the agent loop until finish tool is called or max_turns reached.

         A base system prompt is automatically prepended to all runs, including:
@@ -859,23 +947,59 @@ class Agent[FinishParams: BaseModel, FinishMeta]:
             ])

         """
-        msgs: list[ChatMessage] = []

-        # Build the complete system prompt (base + input files + user instructions)
-        full_system_prompt = self._build_system_prompt()
-        msgs.append(SystemMessage(content=full_system_prompt))
+        # Compute task hash for caching/resume
+        task_hash = compute_task_hash(init_msgs)
+        self._current_task_hash = task_hash
+
+        # Initialize cache manager
+        cache_manager = CacheManager(clear_on_success=self._clear_cache_on_success)
+        start_turn = 0
+        resumed = False
+
+        # Try to resume from cache if requested
+        if self._resume:
+            state = _SESSION_STATE.get()
+            cached = cache_manager.load_state(task_hash)
+            if cached:
+                # Restore files to exec env
+                if state.exec_env and state.exec_env.temp_dir:
+                    cache_manager.restore_files(task_hash, state.exec_env.temp_dir)
+
+                # Restore state
+                msgs = cached.msgs
+                full_msg_history = cached.full_msg_history
+                run_metadata = cached.run_metadata
+                start_turn = cached.turn
+                resumed = True
+                self._logger.info(f"Resuming from cached state at turn {start_turn}")
+            else:
+                self._logger.info(f"No cache found for task {task_hash}, starting fresh")

-        if isinstance(init_msgs, str):
-            msgs.append(UserMessage(content=init_msgs))
-        else:
-            msgs.extend(init_msgs)
+        if not resumed:
+            msgs: list[ChatMessage] = []
+
+            # Build the complete system prompt (base + input files + user instructions)
+            full_system_prompt = self._build_system_prompt()
+            msgs.append(SystemMessage(content=full_system_prompt))
+
+            if isinstance(init_msgs, str):
+                msgs.append(UserMessage(content=init_msgs))
+            else:
+                msgs.extend(init_msgs)
+
+            # Local metadata storage - isolated per run() invocation for thread safety
+            run_metadata: dict[str, list[Any]] = {}
+
+            full_msg_history: list[list[ChatMessage]] = []

         # Set logger depth if provided (for sub-agent runs)
         if depth is not None:
             self._logger.depth = depth

-        # Log the task at run start
-        self._logger.task_message(msgs[-1].content)
+        # Log the task at run start (only if not resuming)
+        if not resumed:
+            self._logger.task_message(msgs[-1].content)


         # Show warnings (top-level only, if logger supports it)
@@ -886,25 +1010,30 @@ class Agent[FinishParams: BaseModel, FinishMeta]:
         # Use logger callback if available and not overridden
         step_callback = self._logger.on_step

-        # Local metadata storage - isolated per run() invocation for thread safety
-        run_metadata: dict[str, list[Any]] = {}
-
         full_msg_history: list[list[ChatMessage]] = []
-        finish_params: FinishParams | None = None

         # Cumulative stats for spinner
         total_tool_calls = 0
         total_input_tokens = 0
         total_output_tokens = 0

-        for i in range(self._max_turns):
-            if self._max_turns - i <= 30 and i != 0:
+        for i in range(start_turn, self._max_turns):
+            # Capture current state for potential caching (before any async work)
+            self._current_run_state = CacheState(
+                msgs=list(msgs),
+                full_msg_history=[list(group) for group in full_msg_history],
+                turn=i,
+                run_metadata=dict(run_metadata),
+                task_hash=task_hash,
+                agent_name=self._name,
+            )
+            if self._max_turns - i <= self._turns_remaining_warning_threshold and i != 0:
                 num_turns_remaining_msg = _num_turns_remaining_msg(self._max_turns - i)
                 msgs.append(num_turns_remaining_msg)
                 self._logger.user_message(num_turns_remaining_msg)

             # Pass turn info to step() for real-time logging
-            assistant_message, tool_messages, finish_call = await self.step(
+            assistant_message, tool_messages, finish_params = await self.step(
                 msgs,
                 run_metadata,
                 turn=i + 1,
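stirrup/core/cache.py itself is not part of this diff; from the constructor call above, CacheState plausibly looks something like the following — field types are inferred from usage here, not confirmed:

    class CacheState(BaseModel):
        msgs: list[ChatMessage]
        full_msg_history: list[list[ChatMessage]]
        turn: int
        run_metadata: dict[str, Any]
        task_hash: str
        agent_name: str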
@@ -930,18 +1059,8 @@ class Agent[FinishParams: BaseModel, FinishMeta]:

             msgs.extend([assistant_message, *tool_messages, *user_messages])

-            if finish_call:
-                try:
-                    finish_arguments = json.loads(finish_call.arguments)
-                    if self._finish_tool.parameters is not None:
-                        finish_params = self._finish_tool.parameters.model_validate(finish_arguments)
-                        break
-                except (json.JSONDecodeError, ValidationError, TypeError):
-                    LOGGER.debug(
-                        "Agent tried to use the finish tool but the tool call is not valid: %r",
-                        finish_call.arguments,
-                    )
-                    # continue until the finish tool call is valid
+            if finish_params:
+                break

             pct_context_used = assistant_message.token_usage.total / self._client.max_tokens
             if pct_context_used >= self._context_summarization_cutoff and i + 1 != self._max_turns:
@@ -956,15 +1075,18 @@ class Agent[FinishParams: BaseModel, FinishMeta]:
         full_msg_history.append(msgs)

         # Add agent's own token usage to run_metadata under "token_usage" key
-        agent_token_usage = _get_total_token_usage(full_msg_history)
-        if "token_usage" not in run_metadata:
-            run_metadata["token_usage"] = []
-        run_metadata["token_usage"].append(agent_token_usage)
+        run_metadata["token_usage"] = _get_total_token_usage(full_msg_history)

         # Store for __aexit__ to access (on instance for this agent)
         self._last_finish_params = finish_params
         self._last_run_metadata = run_metadata

+        # Clear cache on successful completion (finish_params is set)
+        if finish_params is not None and cache_manager.clear_on_success:
+            cache_manager.clear_cache(task_hash)
+            self._current_task_hash = None
+            self._current_run_state = None
+
         return finish_params, full_msg_history, run_metadata

     def to_tool(
@@ -1092,6 +1214,7 @@ class Agent[FinishParams: BaseModel, FinishMeta]:
             )
             return ToolResult(
                 content=f"<sub_agent_result>\n<error>{e!s}</error>\n</sub_agent_result>",
+                success=False,
                 metadata=error_metadata,
             )
         finally: