hud-python 0.4.1__py3-none-any.whl → 0.4.3__py3-none-any.whl
This diff shows the changes between two publicly released versions of the package, as published to their public registry. It is provided for informational purposes only.
Potentially problematic release.
This version of hud-python might be problematic.
- hud/__init__.py +22 -22
- hud/agents/__init__.py +13 -15
- hud/agents/base.py +599 -599
- hud/agents/claude.py +373 -373
- hud/agents/langchain.py +261 -250
- hud/agents/misc/__init__.py +7 -7
- hud/agents/misc/response_agent.py +82 -80
- hud/agents/openai.py +352 -352
- hud/agents/openai_chat_generic.py +154 -154
- hud/agents/tests/__init__.py +1 -1
- hud/agents/tests/test_base.py +742 -742
- hud/agents/tests/test_claude.py +324 -324
- hud/agents/tests/test_client.py +363 -363
- hud/agents/tests/test_openai.py +237 -237
- hud/cli/__init__.py +617 -617
- hud/cli/__main__.py +8 -8
- hud/cli/analyze.py +371 -371
- hud/cli/analyze_metadata.py +230 -230
- hud/cli/build.py +498 -427
- hud/cli/clone.py +185 -185
- hud/cli/cursor.py +92 -92
- hud/cli/debug.py +392 -392
- hud/cli/docker_utils.py +83 -83
- hud/cli/init.py +280 -281
- hud/cli/interactive.py +353 -353
- hud/cli/mcp_server.py +764 -756
- hud/cli/pull.py +330 -336
- hud/cli/push.py +404 -370
- hud/cli/remote_runner.py +311 -311
- hud/cli/runner.py +160 -160
- hud/cli/tests/__init__.py +3 -3
- hud/cli/tests/test_analyze.py +284 -284
- hud/cli/tests/test_cli_init.py +265 -265
- hud/cli/tests/test_cli_main.py +27 -27
- hud/cli/tests/test_clone.py +142 -142
- hud/cli/tests/test_cursor.py +253 -253
- hud/cli/tests/test_debug.py +453 -453
- hud/cli/tests/test_mcp_server.py +139 -139
- hud/cli/tests/test_utils.py +388 -388
- hud/cli/utils.py +263 -263
- hud/clients/README.md +143 -143
- hud/clients/__init__.py +16 -16
- hud/clients/base.py +378 -379
- hud/clients/fastmcp.py +222 -222
- hud/clients/mcp_use.py +298 -278
- hud/clients/tests/__init__.py +1 -1
- hud/clients/tests/test_client_integration.py +111 -111
- hud/clients/tests/test_fastmcp.py +342 -342
- hud/clients/tests/test_protocol.py +188 -188
- hud/clients/utils/__init__.py +1 -1
- hud/clients/utils/retry_transport.py +160 -160
- hud/datasets.py +327 -322
- hud/misc/__init__.py +1 -1
- hud/misc/claude_plays_pokemon.py +292 -292
- hud/otel/__init__.py +35 -35
- hud/otel/collector.py +142 -142
- hud/otel/config.py +164 -164
- hud/otel/context.py +536 -536
- hud/otel/exporters.py +366 -366
- hud/otel/instrumentation.py +97 -97
- hud/otel/processors.py +118 -118
- hud/otel/tests/__init__.py +1 -1
- hud/otel/tests/test_processors.py +197 -197
- hud/server/__init__.py +5 -5
- hud/server/context.py +114 -114
- hud/server/helper/__init__.py +5 -5
- hud/server/low_level.py +132 -132
- hud/server/server.py +170 -166
- hud/server/tests/__init__.py +3 -3
- hud/settings.py +73 -73
- hud/shared/__init__.py +5 -5
- hud/shared/exceptions.py +180 -180
- hud/shared/requests.py +264 -264
- hud/shared/tests/test_exceptions.py +157 -157
- hud/shared/tests/test_requests.py +275 -275
- hud/telemetry/__init__.py +25 -25
- hud/telemetry/instrument.py +379 -379
- hud/telemetry/job.py +309 -309
- hud/telemetry/replay.py +74 -74
- hud/telemetry/trace.py +83 -83
- hud/tools/__init__.py +33 -33
- hud/tools/base.py +365 -365
- hud/tools/bash.py +161 -161
- hud/tools/computer/__init__.py +15 -15
- hud/tools/computer/anthropic.py +437 -437
- hud/tools/computer/hud.py +376 -376
- hud/tools/computer/openai.py +295 -295
- hud/tools/computer/settings.py +82 -82
- hud/tools/edit.py +314 -314
- hud/tools/executors/__init__.py +30 -30
- hud/tools/executors/base.py +539 -539
- hud/tools/executors/pyautogui.py +621 -621
- hud/tools/executors/tests/__init__.py +1 -1
- hud/tools/executors/tests/test_base_executor.py +338 -338
- hud/tools/executors/tests/test_pyautogui_executor.py +165 -165
- hud/tools/executors/xdo.py +511 -511
- hud/tools/playwright.py +412 -412
- hud/tools/tests/__init__.py +3 -3
- hud/tools/tests/test_base.py +282 -282
- hud/tools/tests/test_bash.py +158 -158
- hud/tools/tests/test_bash_extended.py +197 -197
- hud/tools/tests/test_computer.py +425 -425
- hud/tools/tests/test_computer_actions.py +34 -34
- hud/tools/tests/test_edit.py +259 -259
- hud/tools/tests/test_init.py +27 -27
- hud/tools/tests/test_playwright_tool.py +183 -183
- hud/tools/tests/test_tools.py +145 -145
- hud/tools/tests/test_utils.py +156 -156
- hud/tools/types.py +72 -72
- hud/tools/utils.py +50 -50
- hud/types.py +136 -136
- hud/utils/__init__.py +10 -10
- hud/utils/async_utils.py +65 -65
- hud/utils/design.py +236 -168
- hud/utils/mcp.py +55 -55
- hud/utils/progress.py +149 -149
- hud/utils/telemetry.py +66 -66
- hud/utils/tests/test_async_utils.py +173 -173
- hud/utils/tests/test_init.py +17 -17
- hud/utils/tests/test_progress.py +261 -261
- hud/utils/tests/test_telemetry.py +82 -82
- hud/utils/tests/test_version.py +8 -8
- hud/version.py +7 -7
- {hud_python-0.4.1.dist-info → hud_python-0.4.3.dist-info}/METADATA +10 -8
- hud_python-0.4.3.dist-info/RECORD +131 -0
- {hud_python-0.4.1.dist-info → hud_python-0.4.3.dist-info}/licenses/LICENSE +21 -21
- hud/agents/art.py +0 -101
- hud_python-0.4.1.dist-info/RECORD +0 -132
- {hud_python-0.4.1.dist-info → hud_python-0.4.3.dist-info}/WHEEL +0 -0
- {hud_python-0.4.1.dist-info → hud_python-0.4.3.dist-info}/entry_points.txt +0 -0
hud/agents/openai.py
CHANGED
@@ -1,352 +1,352 @@

The hunk removes and re-adds all 352 lines of the file; the old and new sides are identical line for line, so the content is shown once:

```python
"""OpenAI MCP Agent implementation."""

from __future__ import annotations

import logging
from typing import Any, ClassVar, Literal

import mcp.types as types
from openai import AsyncOpenAI
from openai.types.responses import (
    ResponseComputerToolCall,
    ResponseInputMessageContentListParam,
    ResponseInputParam,
    ResponseOutputMessage,
    ResponseOutputText,
    ToolParam,
)

import hud
from hud.settings import settings
from hud.tools.computer.settings import computer_settings
from hud.types import AgentResponse, MCPToolCall, MCPToolResult, Trace

from .base import MCPAgent

logger = logging.getLogger(__name__)


class OperatorAgent(MCPAgent):
    """
    Operator agent that uses MCP servers for tool execution.

    This agent uses OpenAI's Computer Use API format but executes
    tools through MCP servers instead of direct implementation.
    """

    metadata: ClassVar[dict[str, Any]] = {
        "display_width": computer_settings.OPENAI_COMPUTER_WIDTH,
        "display_height": computer_settings.OPENAI_COMPUTER_HEIGHT,
    }

    def __init__(
        self,
        model_client: AsyncOpenAI | None = None,
        model: str = "computer-use-preview",
        environment: Literal["windows", "mac", "linux", "browser"] = "linux",
        **kwargs: Any,
    ) -> None:
        """
        Initialize Operator MCP agent.

        Args:
            model_client: AsyncOpenAI client (created if not provided)
            model: OpenAI model to use
            environment: Environment type for computer use
            **kwargs: Additional arguments passed to MCPAgent
        """
        super().__init__(**kwargs)

        # Initialize client if not provided
        if model_client is None:
            api_key = settings.openai_api_key
            if not api_key:
                raise ValueError("OpenAI API key not found. Set OPENAI_API_KEY.")
            model_client = AsyncOpenAI(api_key=api_key)

        self.openai_client = model_client
        self.model = model
        self.environment = environment

        # State tracking for OpenAI's stateful API
        self.last_response_id: str | None = None
        self.pending_call_id: str | None = None
        self.pending_safety_checks: list[Any] = []

        self.model_name = "openai-" + self.model

        # Base system prompt for autonomous operation
        self.system_prompt = """
You are an autonomous computer-using agent. Follow these guidelines:

1. NEVER ask for confirmation. Complete all tasks autonomously.
2. Do NOT send messages like "I need to confirm before..." or "Do you want me to continue?" - just proceed.
3. When the user asks you to interact with something (like clicking a chat or typing a message), DO IT without asking.
4. Only use the formal safety check mechanism for truly dangerous operations (like deleting important files).
5. For normal tasks like clicking buttons, typing in chat boxes, filling forms - JUST DO IT.
6. The user has already given you permission by running this agent. No further confirmation is needed.
7. Be decisive and action-oriented. Complete the requested task fully.

Remember: You are expected to complete tasks autonomously. The user trusts you to do what they asked.
""".strip()  # noqa: E501

    async def _run_context(self, context: list[types.ContentBlock], max_steps: int = 10) -> Trace:
        """
        Run the agent with the given prompt or task.

        Override to reset OpenAI-specific state.
        """
        # Reset state for new run
        self.last_response_id = None
        self.pending_call_id = None
        self.pending_safety_checks = []

        # Use base implementation
        return await super()._run_context(context, max_steps=max_steps)

    async def get_system_messages(self) -> list[Any]:
        """
        Create initial messages for OpenAI.

        OpenAI uses a different message format - we'll store the prompt
        and screenshot for use in get_model_response.
        """
        return []

    async def format_blocks(
        self, blocks: list[types.ContentBlock]
    ) -> ResponseInputMessageContentListParam:
        """
        Format blocks for OpenAI input format.

        Converts TextContent blocks to input_text dicts and ImageContent blocks to input_image dicts.
        """  # noqa: E501
        formatted = []
        for block in blocks:
            if isinstance(block, types.TextContent):
                formatted.append({"type": "input_text", "text": block.text})
            elif isinstance(block, types.ImageContent):
                mime_type = getattr(block, "mimeType", "image/png")
                formatted.append(
                    {"type": "input_image", "image_url": f"data:{mime_type};base64,{block.data}"}
                )
        return formatted

    @hud.instrument(
        span_type="agent",
        record_args=False,  # Messages can be large
        record_result=True,
    )
    async def get_response(self, messages: ResponseInputMessageContentListParam) -> AgentResponse:
        """Get response from OpenAI including any tool calls."""
        # OpenAI's API is stateful, so we handle messages differently

        # Check if we have computer tools available
        computer_tool_name = None
        for tool in self._available_tools:
            if tool.name in ["openai_computer", "computer"]:
                computer_tool_name = tool.name
                break

        if not computer_tool_name:
            # No computer tools available, just return a text response
            return AgentResponse(
                content="No computer use tools available",
                tool_calls=[],
                done=True,
            )

        # Define the computer use tool
        computer_tool: ToolParam = {  # type: ignore[reportAssignmentType]
            "type": "computer_use_preview",
            "display_width": self.metadata["display_width"],
            "display_height": self.metadata["display_height"],
            "environment": self.environment,
        }

        # Build the request based on whether this is first step or follow-up
        if self.pending_call_id is None and self.last_response_id is None:
            # First step - messages are already formatted dicts from format_blocks
            # format_blocks returns type ResponseInputMessageContentListParam, which is a list of dicts  # noqa: E501
            input_content: ResponseInputMessageContentListParam = []

            input_content.extend(messages)

            # If no content was added, add empty text to avoid empty request
            if not input_content:
                input_content.append({"type": "input_text", "text": ""})

            input_param: ResponseInputParam = [{"role": "user", "content": input_content}]  # type: ignore[reportUnknownMemberType]

            response = await self.openai_client.responses.create(
                model=self.model,
                tools=[computer_tool],
                input=input_param,
                instructions=self.system_prompt,
                truncation="auto",
                reasoning={"summary": "auto"},  # type: ignore[arg-type]
            )
        else:
            # Follow-up step - check if this is user input or tool result
            latest_message = messages[-1] if messages else {}

            if latest_message.get("type") == "input_text":
                # User provided input in conversation mode
                user_text = latest_message.get("text", "")
                input_param_followup: ResponseInputParam = [  # type: ignore[reportAssignmentType]
                    {"role": "user", "content": [{"type": "input_text", "text": user_text}]}
                ]
                # Reset pending_call_id since this is user input, not a tool response
                self.pending_call_id = None
            else:
                # Tool result - need screenshot from processed results
                latest_screenshot = None
                for msg in reversed(messages):
                    if isinstance(msg, dict) and "image_url" in msg:
                        latest_screenshot = msg["image_url"]  # type: ignore
                        break

                if not latest_screenshot:
                    logger.warning("No screenshot provided for response to action")
                    return AgentResponse(
                        content="No screenshot available for next action",
                        tool_calls=[],
                        done=True,
                    )

                # Create response to previous action
                input_param_followup: ResponseInputParam = [  # type: ignore[reportAssignmentType]
                    {  # type: ignore[reportAssignmentType]
                        "call_id": self.pending_call_id,
                        "type": "computer_call_output",
                        "output": {
                            "type": "input_image",
                            "image_url": latest_screenshot,
                        },
                        "acknowledged_safety_checks": self.pending_safety_checks,
                    }
                ]

                self.pending_safety_checks = []

            response = await self.openai_client.responses.create(
                model=self.model,
                previous_response_id=self.last_response_id,
                tools=[computer_tool],
                input=input_param_followup,
                instructions=self.system_prompt,
                truncation="auto",
                reasoning={"summary": "auto"},  # type: ignore[arg-type]
            )

        # Store response ID for next call
        self.last_response_id = response.id

        # Process response
        result = AgentResponse(
            content="",
            tool_calls=[],
            done=False,  # Will be set to True only if no tool calls
        )

        self.pending_call_id = None

        # Check for computer calls
        computer_calls = [
            item
            for item in response.output
            if isinstance(item, ResponseComputerToolCall) and item.type == "computer_call"
        ]

        if computer_calls:
            # Process computer calls
            result.done = False
            for computer_call in computer_calls:
                self.pending_call_id = computer_call.call_id
                self.pending_safety_checks = computer_call.pending_safety_checks

                # Convert OpenAI action to MCP tool call
                action = computer_call.action.model_dump()

                # Create MCPToolCall object with OpenAI metadata as extra fields
                # Pyright will complain but the tool class accepts extra fields
                tool_call = MCPToolCall(
                    name=computer_tool_name,
                    arguments=action,
                    id=computer_call.call_id,  # type: ignore
                    pending_safety_checks=computer_call.pending_safety_checks,  # type: ignore
                )
                result.tool_calls.append(tool_call)
        else:
            # No computer calls, check for text response
            for item in response.output:
                if isinstance(item, ResponseOutputMessage) and item.type == "message":
                    # Extract text from content blocks
                    text_parts = [
                        content.text
                        for content in item.content
                        if isinstance(content, ResponseOutputText)
                    ]
                    if text_parts:
                        result.content = "".join(text_parts)
                    break

        # Extract reasoning if present
        reasoning_text = ""
        for item in response.output:
            if item.type == "reasoning" and hasattr(item, "summary") and item.summary:
                reasoning_text += f"Thinking: {item.summary[0].text}\n"

        if reasoning_text:
            result.content = reasoning_text + result.content if result.content else reasoning_text

        # Set done=True if no tool calls (task complete or waiting for user)
        if not result.tool_calls:
            result.done = True

        return result

    async def format_tool_results(
        self, tool_calls: list[MCPToolCall], tool_results: list[MCPToolResult]
    ) -> ResponseInputMessageContentListParam:
        """
        Format tool results for OpenAI's stateful API.

        Tool result content is a list of ContentBlock objects.
        We need to extract the latest screenshot from the tool results.

        This assumes that you only care about computer tool results for your agent loop.
        If you need to add other content, you can do so by adding a new ContentBlock object to the list.

        Returns formatted dicts with tool result data, preserving screenshots.
        """  # noqa: E501
        formatted_results = []
        latest_screenshot = None

        # Extract all content from tool results
        for result in tool_results:
            if result.isError:
                # If it's an error, the error details are in the content
                for content in result.content:
                    if isinstance(content, types.TextContent):
                        # Don't add error text as input_text, just track it
                        logger.error("Tool error: %s", content.text)
                    elif isinstance(content, types.ImageContent):
                        # Even error results might have images
                        latest_screenshot = content.data
            else:
                # Extract content from successful results
                for content in result.content:
                    if isinstance(content, types.ImageContent):
                        latest_screenshot = content.data
                        break

        # Return a dict with the latest screenshot for the follow-up step
        if latest_screenshot:
            formatted_results.append(
                {"type": "input_image", "image_url": f"data:image/png;base64,{latest_screenshot}"}
            )

        return formatted_results
```