hud-python 0.3.0__py3-none-any.whl → 0.3.1__py3-none-any.whl

This diff shows the changes between two publicly released versions of the package, as they appear in their public registry. It is provided for informational purposes only.

Potentially problematic release: this version of hud-python might be problematic.

Files changed (53)
  1. hud/__init__.py +7 -4
  2. hud/adapters/common/adapter.py +14 -3
  3. hud/adapters/common/tests/test_adapter.py +16 -4
  4. hud/datasets.py +188 -0
  5. hud/env/docker_client.py +14 -2
  6. hud/env/local_docker_client.py +28 -6
  7. hud/gym.py +0 -9
  8. hud/{mcp_agent → mcp}/__init__.py +2 -0
  9. hud/mcp/base.py +631 -0
  10. hud/{mcp_agent → mcp}/claude.py +52 -47
  11. hud/mcp/client.py +312 -0
  12. hud/{mcp_agent → mcp}/langchain.py +52 -33
  13. hud/{mcp_agent → mcp}/openai.py +56 -40
  14. hud/{mcp_agent → mcp}/tests/test_base.py +129 -54
  15. hud/mcp/tests/test_claude.py +294 -0
  16. hud/mcp/tests/test_client.py +324 -0
  17. hud/mcp/tests/test_openai.py +238 -0
  18. hud/settings.py +6 -0
  19. hud/task.py +1 -88
  20. hud/taskset.py +2 -23
  21. hud/telemetry/__init__.py +5 -0
  22. hud/telemetry/_trace.py +180 -17
  23. hud/telemetry/context.py +79 -0
  24. hud/telemetry/exporter.py +165 -6
  25. hud/telemetry/job.py +141 -0
  26. hud/telemetry/tests/test_trace.py +36 -25
  27. hud/tools/__init__.py +14 -1
  28. hud/tools/executors/__init__.py +19 -2
  29. hud/tools/executors/pyautogui.py +84 -50
  30. hud/tools/executors/tests/test_pyautogui_executor.py +4 -1
  31. hud/tools/playwright_tool.py +73 -67
  32. hud/tools/tests/test_edit.py +8 -1
  33. hud/tools/tests/test_tools.py +3 -0
  34. hud/trajectory.py +5 -1
  35. hud/utils/tests/test_version.py +1 -1
  36. hud/version.py +1 -1
  37. {hud_python-0.3.0.dist-info → hud_python-0.3.1.dist-info}/METADATA +20 -14
  38. {hud_python-0.3.0.dist-info → hud_python-0.3.1.dist-info}/RECORD +41 -46
  39. hud/evaluators/__init__.py +0 -9
  40. hud/evaluators/base.py +0 -32
  41. hud/evaluators/inspect.py +0 -24
  42. hud/evaluators/judge.py +0 -189
  43. hud/evaluators/match.py +0 -156
  44. hud/evaluators/remote.py +0 -65
  45. hud/evaluators/tests/__init__.py +0 -0
  46. hud/evaluators/tests/test_inspect.py +0 -12
  47. hud/evaluators/tests/test_judge.py +0 -231
  48. hud/evaluators/tests/test_match.py +0 -115
  49. hud/evaluators/tests/test_remote.py +0 -98
  50. hud/mcp_agent/base.py +0 -723
  51. /hud/{mcp_agent → mcp}/tests/__init__.py +0 -0
  52. {hud_python-0.3.0.dist-info → hud_python-0.3.1.dist-info}/WHEEL +0 -0
  53. {hud_python-0.3.0.dist-info → hud_python-0.3.1.dist-info}/licenses/LICENSE +0 -0
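Only two of the 53 file diffs are expanded in this view: hud/{mcp_agent → mcp}/openai.py and hud/{mcp_agent → mcp}/tests/test_base.py. The dominant change is the rename of the hud.mcp_agent package to hud.mcp (base.py was rewritten rather than moved), the new hud/datasets.py and hud/mcp/client.py modules, and the removal of the hud.evaluators package. For downstream code the most visible effect is the import move:

# 0.3.0
from hud.mcp_agent.base import BaseMCPAgent

# 0.3.1
from hud.mcp.base import BaseMCPAgent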
hud/{mcp_agent → mcp}/openai.py
@@ -3,8 +3,11 @@
 from __future__ import annotations
 
 import logging
-from typing import Any, Literal
+from typing import TYPE_CHECKING, Any, Literal
 
+import mcp.types as types
+from mcp.types import CallToolRequestParams as MCPToolCall
+from mcp.types import CallToolResult as MCPToolResult
 from openai import AsyncOpenAI
 from openai.types.responses import (
     ResponseComputerToolCall,
@@ -16,7 +19,10 @@ from openai.types.responses import (
 
 from hud.settings import settings
 
-from .base import BaseMCPAgent
+from .base import AgentResult, BaseMCPAgent, ModelResponse
+
+if TYPE_CHECKING:
+    from hud.datasets import TaskConfig
 
 logger = logging.getLogger(__name__)
 
@@ -69,6 +75,8 @@ class OpenAIMCPAgent(BaseMCPAgent):
         self.pending_call_id: str | None = None
         self.pending_safety_checks: list[Any] = []
 
+        self.model_name = "openai-" + self.model
+
         # Base system prompt for autonomous operation
         self.base_system_prompt = """
 You are an autonomous computer-using agent. Follow these guidelines:
@@ -84,11 +92,9 @@ class OpenAIMCPAgent(BaseMCPAgent):
 Remember: You are expected to complete tasks autonomously. The user trusts you to do what they asked.
 """  # noqa: E501
 
-    async def run(
-        self, prompt: str, max_steps: int = 10, conversation_mode: bool = False
-    ) -> dict[str, Any]:
+    async def run(self, prompt_or_task: str | TaskConfig, max_steps: int = 10) -> AgentResult:
         """
-        Run the agent with the given prompt.
+        Run the agent with the given prompt or task.
 
         Override to reset OpenAI-specific state.
         """
@@ -98,9 +104,11 @@ class OpenAIMCPAgent(BaseMCPAgent):
         self.pending_safety_checks = []
 
         # Use base implementation
-        return await super().run(prompt, max_steps, conversation_mode)
+        return await super().run(prompt_or_task, max_steps)
 
-    async def create_initial_messages(self, prompt: str, screenshot: str | None) -> list[Any]:
+    async def create_initial_messages(
+        self, prompt: str, screenshot: str | None = None
+    ) -> list[Any]:
         """
         Create initial messages for OpenAI.
 
@@ -111,7 +119,7 @@ class OpenAIMCPAgent(BaseMCPAgent):
         # Just return a list with the prompt and screenshot
         return [{"prompt": prompt, "screenshot": screenshot}]
 
-    async def get_model_response(self, messages: list[Any], step: int) -> dict[str, Any]:
+    async def get_model_response(self, messages: list[Any]) -> ModelResponse:
         """Get response from OpenAI including any tool calls."""
         # OpenAI's API is stateful, so we handle messages differently
 
@@ -124,11 +132,11 @@ class OpenAIMCPAgent(BaseMCPAgent):
 
         if not computer_tool_name:
             # No computer tools available, just return a text response
-            return {
-                "content": "No computer use tools available",
-                "tool_calls": [],
-                "done": True,
-            }
+            return ModelResponse(
+                content="No computer use tools available",
+                tool_calls=[],
+                done=True,
+            )
 
         # Define the computer use tool
         computer_tool: ToolParam = {  # type: ignore[reportAssignmentType]
@@ -193,11 +201,11 @@ class OpenAIMCPAgent(BaseMCPAgent):
 
             if not latest_screenshot:
                 logger.warning("No screenshot provided for response to action")
-                return {
-                    "content": "No screenshot available for next action",
-                    "tool_calls": [],
-                    "done": True,
-                }
+                return ModelResponse(
+                    content="No screenshot available for next action",
+                    tool_calls=[],
+                    done=True,
+                )
 
             # Create response to previous action
             input_param_followup: ResponseInputParam = [  # type: ignore[reportAssignmentType]
@@ -226,12 +234,11 @@ class OpenAIMCPAgent(BaseMCPAgent):
         self.last_response_id = response.id
 
         # Process response
-        result = {
-            "content": "",
-            "tool_calls": [],
-            "done": False,  # Will be set to True only if no tool calls
-            "raw_response": response.model_dump(),  # For debugging
-        }
+        result = ModelResponse(
+            content="",
+            tool_calls=[],
+            done=False,  # Will be set to True only if no tool calls
+        )
 
         self.pending_call_id = None
 
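ModelResponse replaces the ad-hoc result dict, and raw_response is no longer carried around. The class itself is defined in the new hud/mcp/base.py (+631 lines, not expanded here), so the following is a hypothetical reconstruction from usage alone; only content, tool_calls, and done are exercised in this file, and whether the real class is a dataclass or a pydantic model is not visible:

from mcp.types import CallToolRequestParams as MCPToolCall
from pydantic import BaseModel, Field

class ModelResponse(BaseModel):
    content: str = ""  # assistant text and/or reasoning summary
    tool_calls: list[MCPToolCall] = Field(default_factory=list)
    done: bool = False  # True once the model stops issuing tool calls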
@@ -244,7 +251,7 @@ class OpenAIMCPAgent(BaseMCPAgent):
 
         if computer_calls:
             # Process computer calls
-            result["done"] = False
+            result.done = False
             for computer_call in computer_calls:
                 self.pending_call_id = computer_call.call_id
                 self.pending_safety_checks = computer_call.pending_safety_checks
@@ -252,13 +259,15 @@ class OpenAIMCPAgent(BaseMCPAgent):
                 # Convert OpenAI action to MCP tool call
                 action = computer_call.action.model_dump()
 
-                # Map OpenAI action to MCP tool call format
-                tool_call = {
-                    "name": computer_tool_name,
-                    "arguments": action,
-                    "call_id": computer_call.call_id,  # Store for reference
-                }
-                result["tool_calls"].append(tool_call)
+                # Create MCPToolCall object with OpenAI metadata as extra fields
+                # Pyright will complain but the tool class accepts extra fields
+                tool_call = MCPToolCall(
+                    name=computer_tool_name,
+                    arguments=action,
+                    call_id=computer_call.call_id,  # type: ignore
+                    pending_safety_checks=computer_call.pending_safety_checks,  # type: ignore
+                )
+                result.tool_calls.append(tool_call)
         else:
             # No computer calls, check for text response
             for item in response.output:
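The # type: ignore comments exist because call_id and pending_safety_checks are not declared fields on CallToolRequestParams; the code relies on the model tolerating extra fields. In pydantic v2 terms that behavior looks like the following generic illustration (not the mcp package's actual class definition):

from typing import Any, Optional

from pydantic import BaseModel, ConfigDict

class ToolCall(BaseModel):
    # extra="allow" keeps undeclared fields instead of rejecting them
    model_config = ConfigDict(extra="allow")
    name: str
    arguments: Optional[dict[str, Any]] = None

call = ToolCall(name="computer", arguments={"type": "click"}, call_id="abc123")
print(call.call_id)  # "abc123": retained and attribute-accessible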
@@ -270,7 +279,7 @@ class OpenAIMCPAgent(BaseMCPAgent):
                         if isinstance(content, ResponseOutputText)
                     ]
                     if text_parts:
-                        result["content"] = "".join(text_parts)
+                        result.content = "".join(text_parts)
                     break
 
         # Extract reasoning if present
@@ -280,16 +289,16 @@ class OpenAIMCPAgent(BaseMCPAgent):
                 reasoning_text += f"Thinking: {item.summary[0].text}\n"
 
         if reasoning_text:
-            result["content"] = reasoning_text + result["content"]
+            result.content = reasoning_text + result.content if result.content else reasoning_text
 
         # Set done=True if no tool calls (task complete or waiting for user)
-        if not result["tool_calls"]:
-            result["done"] = True
+        if not result.tool_calls:
+            result.done = True
 
         return result
 
     async def format_tool_results(
-        self, processed_results: dict[str, Any], tool_calls: list[dict]
+        self, tool_calls: list[MCPToolCall], tool_results: list[MCPToolResult]
     ) -> list[Any]:
         """
         Format tool results for OpenAI's stateful API.
@@ -297,12 +306,19 @@ class OpenAIMCPAgent(BaseMCPAgent):
         OpenAI doesn't use a traditional message format - we just need to
         preserve the screenshot for the next step.
         """
-        # For OpenAI, we just need to track the latest screenshot
+        # Extract latest screenshot from results
+        latest_screenshot = None
+        for result in tool_results:
+            if not result.isError:
+                for content in result.content:
+                    if isinstance(content, types.ImageContent):
+                        latest_screenshot = content.data
+
         # Return a simple dict that get_model_response can use
         return [
             {
                 "type": "tool_result",
-                "screenshot": processed_results.get("screenshot"),
+                "screenshot": latest_screenshot,
             }
         ]
 
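The new format_tool_results scans MCPToolResult content for ImageContent instead of reading a pre-processed dict. A self-contained rerun of that loop with a fabricated result (the base64 payload is a placeholder):

import mcp.types as types
from mcp.types import CallToolResult

results = [
    CallToolResult(
        content=[types.ImageContent(type="image", data="iVBORw0KGgo=", mimeType="image/png")],
        isError=False,
    )
]

# Same scan as above: keep the last image from any successful result
latest_screenshot = None
for result in results:
    if not result.isError:
        for content in result.content:
            if isinstance(content, types.ImageContent):
                latest_screenshot = content.data

assert latest_screenshot == "iVBORw0KGgo="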
hud/{mcp_agent → mcp}/tests/test_base.py
@@ -5,10 +5,18 @@ from __future__ import annotations
 from typing import TYPE_CHECKING, Any
 from unittest.mock import MagicMock
 
+# Import AsyncMock from unittest.mock if available (Python 3.8+)
+try:
+    from unittest.mock import AsyncMock
+except ImportError:
+    # Fallback for older Python versions
+    from unittest.mock import MagicMock as AsyncMock
+
 import pytest
 from mcp import types
+from mcp.types import CallToolRequestParams as MCPToolCall
 
-from hud.mcp_agent.base import BaseMCPAgent
+from hud.mcp.base import BaseMCPAgent
 from hud.tools.executors.base import BaseExecutor
 
 if TYPE_CHECKING:
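A note on the try/except above: AsyncMock has been in the standard library since Python 3.8, so on any interpreter this package supports the fallback branch should never trigger. It matters because a plain MagicMock is not awaitable, which is exactly what the rewritten tests below need when they stub mcp_client.call_tool. A stdlib-only demonstration:

import asyncio
from unittest.mock import AsyncMock, MagicMock

client = MagicMock()
client.call_tool = AsyncMock(return_value={"ok": True})

async def main() -> None:
    # Calling an AsyncMock yields a coroutine, so it can be awaited
    result = await client.call_tool(name="test_tool")
    assert result == {"ok": True}
    client.call_tool.assert_awaited_once()

asyncio.run(main())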
@@ -18,8 +26,13 @@ if TYPE_CHECKING:
 class MockMCPAgent(BaseMCPAgent):
     """Concrete implementation of BaseMCPAgent for testing."""
 
-    def __init__(self, **kwargs: Any) -> None:
-        super().__init__(**kwargs)
+    def __init__(self, mcp_client: Any = None, **kwargs: Any) -> None:
+        if mcp_client is None:
+            # Create a mock client if none provided
+            mcp_client = MagicMock()
+            mcp_client.get_all_active_sessions = MagicMock(return_value={})
+            mcp_client.get_available_tools = MagicMock(return_value=[])
+        super().__init__(mcp_client=mcp_client, **kwargs)
         self.executor = BaseExecutor()  # Use simulated executor
         self._messages = []
 
@@ -66,46 +79,58 @@ class TestBaseMCPAgent:
         """Test initialization with default values."""
         agent = MockMCPAgent()
 
-        assert agent.client is not None
+        assert agent.mcp_client is not None
         assert agent.allowed_tools is None
         assert agent.disallowed_tools == []
         assert agent.initial_screenshot is False
         assert agent.max_screenshot_history == 3
         assert agent.append_tool_system_prompt is True
         assert agent.custom_system_prompt is None
-        assert agent.lifecycle_tools == {"setup": "setup", "evaluate": "evaluate"}
+        assert agent.lifecycle_tools == []
 
     def test_init_with_params(self):
         """Test initialization with custom parameters."""
         client = MagicMock()
         agent = MockMCPAgent(
-            client=client,
+            mcp_client=client,
             allowed_tools=["tool1", "tool2"],
             disallowed_tools=["bad_tool"],
             initial_screenshot=True,
             max_screenshot_history=5,
             append_tool_system_prompt=False,
             custom_system_prompt="Custom prompt",
-            lifecycle_tools={"setup": "custom_setup", "evaluate": "custom_eval"},
+            lifecycle_tools=["custom_setup", "custom_eval"],
         )
 
-        assert agent.client == client
+        assert agent.mcp_client == client
         assert agent.allowed_tools == ["tool1", "tool2"]
        assert agent.disallowed_tools == ["bad_tool"]
         assert agent.initial_screenshot is True
         assert agent.max_screenshot_history == 5
         assert agent.append_tool_system_prompt is False
         assert agent.custom_system_prompt == "Custom prompt"
-        assert agent.lifecycle_tools == {"setup": "custom_setup", "evaluate": "custom_eval"}
+        assert agent.lifecycle_tools == ["custom_setup", "custom_eval"]
 
-    @pytest.mark.asyncio
-    async def test_initialize_no_client(self):
-        """Test initialize fails without client."""
-        agent = MockMCPAgent()
-        agent.client = None
+    def test_init_no_client(self):
+        """Test init fails without client."""
+
+        # Create a minimal concrete implementation to test the ValueError
+        class TestAgent(BaseMCPAgent):
+            def create_initial_messages(
+                self, prompt: str, screenshot: str | None = None
+            ) -> list[dict[str, Any]]:
+                return []
 
-        with pytest.raises(ValueError, match="Client is not initialized"):
-            await agent.initialize()
+            def format_tool_results(
+                self, results: list[tuple[str, Any]], screenshot: str | None = None
+            ) -> list[dict[str, Any]]:
+                return []
+
+            async def get_model_response(self, messages: list[dict[str, Any]]) -> dict[str, Any]:
+                return {"content": "test"}
+
+        with pytest.raises(ValueError, match="MCPClient is required"):
+            TestAgent(mcp_client=None)
 
     @pytest.mark.asyncio
     async def test_initialize_with_sessions(self):
@@ -133,14 +158,31 @@ class TestBaseMCPAgent:
 
         mock_session.connector.client_session.list_tools = mock_list_tools
 
-        assert agent.client is not None
-        agent.client.get_all_active_sessions = MagicMock(return_value={"server1": mock_session})
+        assert agent.mcp_client is not None
+        agent.mcp_client.get_all_active_sessions = MagicMock(return_value={"server1": mock_session})
+
+        # Mock get_tool_map to return tools discovered from sessions
+        tool_map = {
+            "tool1": (
+                "server1",
+                types.Tool(name="tool1", description="Tool 1", inputSchema={"type": "object"}),
+            ),
+            "tool2": (
+                "server1",
+                types.Tool(name="tool2", description="Tool 2", inputSchema={"type": "object"}),
+            ),
+            "setup": (
+                "server1",
+                types.Tool(name="setup", description="Setup tool", inputSchema={"type": "object"}),
+            ),
+        }
+        agent.mcp_client.get_tool_map = MagicMock(return_value=tool_map)
 
         await agent.initialize()
 
         # Check available tools were populated (excludes lifecycle tools)
         tools = agent.get_available_tools()
-        assert len(tools) == 2  # tool1 and tool2 (setup is excluded as lifecycle tool)
+        assert len(tools) == 3  # All tools (setup is not in default lifecycle tools)
 
         # Check tool map was populated (includes all tools)
         tool_map = agent.get_tool_map()
@@ -173,15 +215,36 @@ class TestBaseMCPAgent:
 
         mock_session.connector.client_session.list_tools = mock_list_tools
 
-        assert agent.client is not None
-        agent.client.get_all_active_sessions = MagicMock(return_value={"server1": mock_session})
+        assert agent.mcp_client is not None
+        agent.mcp_client.get_all_active_sessions = MagicMock(return_value={"server1": mock_session})
+
+        # Mock get_tool_map to return tools discovered from sessions
+        tool_map = {
+            "tool1": (
+                "server1",
+                types.Tool(name="tool1", description="Tool 1", inputSchema={"type": "object"}),
+            ),
+            "tool2": (
+                "server1",
+                types.Tool(name="tool2", description="Tool 2", inputSchema={"type": "object"}),
+            ),
+            "tool3": (
+                "server1",
+                types.Tool(name="tool3", description="Tool 3", inputSchema={"type": "object"}),
+            ),
+            "setup": (
+                "server1",
+                types.Tool(name="setup", description="Setup", inputSchema={"type": "object"}),
+            ),
+        }
+        agent.mcp_client.get_tool_map = MagicMock(return_value=tool_map)
 
         await agent.initialize()
 
         # Check filtering worked - get_available_tools excludes lifecycle tools
         tools = agent.get_available_tools()
         tool_names = [t.name for t in tools]
-        assert len(tools) == 1  # Only tool1 (setup is excluded as lifecycle tool)
+        assert len(tools) == 1  # Only tool1 (tool2 and tool3 are filtered out)
         assert "tool1" in tool_names
         assert "setup" not in tool_names  # Lifecycle tool excluded from available tools
         assert "tool2" not in tool_names  # Not in allowed list
@@ -216,14 +279,26 @@ class TestBaseMCPAgent:
 
         mock_session.connector.client_session.call_tool = mock_call_tool
 
-        assert agent.client is not None
-        agent.client.get_all_active_sessions = MagicMock(return_value={"server1": mock_session})
-        agent.client.get_session = MagicMock(return_value=mock_session)
+        assert agent.mcp_client is not None
+        agent.mcp_client.get_all_active_sessions = MagicMock(return_value={"server1": mock_session})
+
+        # Mock get_tool_map to return tools discovered from sessions
+        tool_map = {
+            "test_tool": (
+                "server1",
+                types.Tool(name="test_tool", description="Test", inputSchema={"type": "object"}),
+            )
+        }
+        agent.mcp_client.get_tool_map = MagicMock(return_value=tool_map)
+
+        # Mock the client's call_tool method directly
+        agent.mcp_client.call_tool = AsyncMock(return_value=mock_result)
 
         await agent.initialize()
 
         # Call the tool
-        result = await agent.call_tool({"name": "test_tool", "arguments": {"param": "value"}})
+        tool_call = MCPToolCall(name="test_tool", arguments={"param": "value"})
+        result = await agent.call_tool(tool_call)
 
         assert result == mock_result
         assert not result.isError
@@ -240,22 +315,25 @@ class TestBaseMCPAgent:
             return types.ListToolsResult(tools=[])
 
         mock_session.list_tools = mock_list_tools
-        assert agent.client is not None
-        agent.client.get_all_active_sessions = MagicMock(return_value={"server1": mock_session})
+        assert agent.mcp_client is not None
+        agent.mcp_client.get_all_active_sessions = MagicMock(return_value={"server1": mock_session})
 
         await agent.initialize()
 
         # Try to call unknown tool
         with pytest.raises(ValueError, match="Tool 'unknown_tool' not found"):
-            await agent.call_tool({"name": "unknown_tool", "arguments": {}})
+            tool_call = MCPToolCall(name="unknown_tool", arguments={})
+            await agent.call_tool(tool_call)
 
     @pytest.mark.asyncio
     async def test_call_tool_no_name(self):
         """Test calling tool without name."""
+        # MCPToolCall accepts empty names, but the agent should validate
         agent = MockMCPAgent()
+        tool_call = MCPToolCall(name="", arguments={})
 
         with pytest.raises(ValueError, match="Tool call must have a 'name' field"):
-            await agent.call_tool({"arguments": {}})
+            await agent.call_tool(tool_call)
 
     def test_get_system_prompt_default(self):
         """Test get_system_prompt with default settings."""
@@ -307,6 +385,9 @@ class TestBaseMCPAgent:
         """Test getting tool schemas."""
         agent = MockMCPAgent()
 
+        # Add setup to lifecycle tools to test filtering
+        agent.lifecycle_tools = ["setup"]
+
         agent._available_tools = [
             types.Tool(name="tool1", description="Tool 1", inputSchema={"type": "object"}),
             types.Tool(name="setup", description="Setup", inputSchema={"type": "object"}),
360
441
 
361
442
  mock_session.connector.client_session.call_tool = mock_call_tool
362
443
 
363
- assert agent.client is not None
364
- agent.client.get_all_active_sessions = MagicMock(return_value={"server1": mock_session})
365
- agent.client.get_session = MagicMock(return_value=mock_session)
444
+ assert agent.mcp_client is not None
445
+ agent.mcp_client.get_all_active_sessions = MagicMock(return_value={"server1": mock_session})
446
+
447
+ # Mock get_tool_map to return tools discovered from sessions
448
+ tool_map = {
449
+ "screenshot": (
450
+ "server1",
451
+ types.Tool(
452
+ name="screenshot", description="Screenshot", inputSchema={"type": "object"}
453
+ ),
454
+ )
455
+ }
456
+ agent.mcp_client.get_tool_map = MagicMock(return_value=tool_map)
457
+
458
+ # Mock the client's call_tool method directly
459
+ agent.mcp_client.call_tool = AsyncMock(return_value=mock_result)
366
460
 
367
461
  await agent.initialize()
368
462
 
369
463
  screenshot = await agent.capture_screenshot()
370
464
  assert screenshot == "base64imagedata"
371
465
 
372
- def test_process_tool_results_extracts_text(self):
373
- """Test processing tool results extracts text content."""
374
- agent = MockMCPAgent()
375
-
376
- # Create a proper CallToolResult object
377
- result = types.CallToolResult(
378
- content=[
379
- types.TextContent(type="text", text="Result text"),
380
- types.ImageContent(type="image", data="imagedata", mimeType="image/png"),
381
- ],
382
- isError=False,
383
- )
384
-
385
- tool_results = [{"tool_name": "test_tool", "result": result}]
386
-
387
- processed = agent.process_tool_results(tool_results)
388
-
389
- assert "text" in processed
390
- assert "Result text" in processed["text"]
391
- assert "results" in processed
392
- assert len(processed["results"]) == 1
466
+ # process_tool_results method was removed from base class
467
+ # This functionality is now handled internally
393
468
 
394
469
  def test_get_tools_by_server(self):
395
470
  """Test getting tools grouped by server."""