PyPI - hud-python - Versions diffs - 0.4.20__tar.gz → 0.4.22__tar.gz - Mend

hud-python 0.4.20__tar.gz → 0.4.22__tar.gz

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release.

This version of hud-python might be problematic. Click here for more details.

Files changed (201) hide show

{hud_python-0.4.20 → hud_python-0.4.22}/PKG-INFO RENAMED Viewed

@@ -1,6 +1,6 @@
 Metadata-Version: 2.4
 Name: hud-python
-Version: 0.4.20
+Version: 0.4.22
 Summary: SDK for the HUD platform.
 Project-URL: Homepage, https://github.com/hud-evals/hud-python
 Project-URL: Bug Tracker, https://github.com/hud-evals/hud-python/issues
@@ -38,6 +38,7 @@ Requires-Python: <3.14,>=3.11
 Requires-Dist: httpx<1,>=0.23.0
 Requires-Dist: hud-fastmcp-python-sdk>=0.1.2
 Requires-Dist: hud-mcp-python-sdk>=3.13.2
+Requires-Dist: hud-mcp-use-python-sdk>=2.3.16
 Requires-Dist: opentelemetry-api>=1.34.1
 Requires-Dist: opentelemetry-exporter-otlp-proto-http>=1.34.1
 Requires-Dist: opentelemetry-instrumentation-mcp>=0.44.1
@@ -56,7 +57,6 @@ Provides-Extra: agent
 Requires-Dist: anthropic; extra == 'agent'
 Requires-Dist: datasets>=2.14.0; extra == 'agent'
 Requires-Dist: dotenv>=0.9.9; extra == 'agent'
-Requires-Dist: hud-mcp-use-python-sdk>=2.3.13; extra == 'agent'
 Requires-Dist: ipykernel; extra == 'agent'
 Requires-Dist: ipython<9; extra == 'agent'
 Requires-Dist: jupyter-client; extra == 'agent'
@@ -70,7 +70,6 @@ Provides-Extra: agents
 Requires-Dist: anthropic; extra == 'agents'
 Requires-Dist: datasets>=2.14.0; extra == 'agents'
 Requires-Dist: dotenv>=0.9.9; extra == 'agents'
-Requires-Dist: hud-mcp-use-python-sdk>=2.3.13; extra == 'agents'
 Requires-Dist: ipykernel; extra == 'agents'
 Requires-Dist: ipython<9; extra == 'agents'
 Requires-Dist: jupyter-client; extra == 'agents'
@@ -85,7 +84,6 @@ Requires-Dist: aiodocker>=0.24.0; extra == 'dev'
 Requires-Dist: anthropic; extra == 'dev'
 Requires-Dist: datasets>=2.14.0; extra == 'dev'
 Requires-Dist: dotenv>=0.9.9; extra == 'dev'
-Requires-Dist: hud-mcp-use-python-sdk>=2.3.13; extra == 'dev'
 Requires-Dist: inspect-ai>=0.3.80; extra == 'dev'
 Requires-Dist: ipykernel; extra == 'dev'
 Requires-Dist: ipython<9; extra == 'dev'

{hud_python-0.4.20 → hud_python-0.4.22}/hud/__init__.py RENAMED Viewed

@@ -20,3 +20,10 @@ try:
     from .version import __version__
 except ImportError:
     __version__ = "unknown"
+try:
+    from .utils.pretty_errors import install_pretty_errors
+    install_pretty_errors()
+except Exception:  # noqa: S110
+    pass

{hud_python-0.4.20 → hud_python-0.4.22}/hud/agents/base.py RENAMED Viewed

@@ -94,6 +94,8 @@ class MCPAgent(ABC):
         self.model_name = model_name
         self.design = HUDDesign(logger=logger)
+        self.metadata = {}
         # Set verbose mode if requested
         if verbose:
             self.design.set_verbose(True)
@@ -111,10 +113,12 @@ class MCPAgent(ABC):
         # Initialize these here so methods can be called before initialize()
         self._available_tools: list[types.Tool] = []
         self._tool_map: dict[str, types.Tool] = {}  # Simplified: just name to tool
-        self.screenshot_history: list[str] = []
+        self.response_tool_name = None
+        self.initialization_complete = False
+        # Trace
         self._auto_trace = auto_trace
         self._auto_trace_cm: Any | None = None  # Store auto-created trace context manager
-        self.initialization_complete = False
         # Response agent to automatically interact with the model
         self.response_agent = response_agent
@@ -530,6 +534,9 @@ class MCPAgent(ABC):
         self._available_tools = []
         self._tool_map = {}
+        # Track response tools by server
+        response_tools_by_server: dict[str, str] = {}  # server_name -> tool_name
         for tool in all_tools:
             # Check if tool should be included
             if self.allowed_tools and tool.name not in self.allowed_tools:
@@ -541,10 +548,36 @@ class MCPAgent(ABC):
             # Simplified mapping - just tool name to tool
             self._tool_map[tool.name] = tool
-            # Auto-detect response tool as a lifecycle tool
-            if tool.name == "response" and "response" not in self.lifecycle_tools:
-                self.design.debug("Auto-detected 'response' tool as a lifecycle tool")
-                self.lifecycle_tools.append("response")
+            # Track response tools
+            if "response" in tool.name or tool.name == "response":
+                # Extract server name from tool name (e.g., "grader_response" -> "grader")
+                if "_" in tool.name:
+                    server_name = tool.name.split("_", 1)[0]
+                    response_tools_by_server[server_name] = tool.name
+                else:
+                    response_tools_by_server["_default"] = tool.name
+        # Find the response tool to use (prioritize last server in config)
+        if response_tools_by_server and hasattr(self.mcp_client, "mcp_config"):
+            # Get server names in order from mcp_config
+            server_names = list(self.mcp_client.mcp_config.keys())
+            # Try to find response tool from last server first
+            response_tool_name = None
+            for server_name in reversed(server_names):
+                if server_name in response_tools_by_server:
+                    response_tool_name = response_tools_by_server[server_name]
+                    break
+            # Fallback to any response tool
+            if not response_tool_name and response_tools_by_server:
+                response_tool_name = next(iter(response_tools_by_server.values()))
+            # Add to lifecycle tools if found
+            if response_tool_name and response_tool_name not in self.lifecycle_tools:
+                self.design.debug(f"Auto-detected '{response_tool_name}' tool as a lifecycle tool")
+                self.response_tool_name = response_tool_name
+                self.lifecycle_tools.append(response_tool_name)
         # Check if all required tools are available
         if self.required_tools:
@@ -565,13 +598,12 @@ class MCPAgent(ABC):
             response: The agent's response
             messages: The current message history (will be modified in-place)
         """
-        # Check if we have a response lifecycle tool
-        if "response" in self.lifecycle_tools and "response" in self._tool_map:
-            self.design.debug("Calling response lifecycle tool")
+        if self.response_tool_name:
+            self.design.debug(f"Calling response lifecycle tool: {self.response_tool_name}")
             try:
                 # Call the response tool with the agent's response
                 response_tool_call = MCPToolCall(
-                    name="response", arguments={"response": response.content, "messages": messages}
+                    name=self.response_tool_name, arguments={"response": response.content}
                 )
                 response_results = await self.call_tools(response_tool_call)

{hud_python-0.4.20 → hud_python-0.4.22}/hud/agents/claude.py RENAMED Viewed

@@ -306,19 +306,20 @@ class ClaudeAgent(MCPAgent):
         """Convert MCP tools to Claude tool format."""
         claude_tools = []
         self._claude_to_mcp_tool_map = {}  # Reset mapping
         # Find computer tool by priority
         computer_tool_priority = ["anthropic_computer", "computer_anthropic", "computer"]
         selected_computer_tool = None
         for priority_name in computer_tool_priority:
             for tool in self._available_tools:
-                if tool.name == priority_name:
+                # Check both exact match and suffix match (for prefixed tools)
+                if tool.name == priority_name or tool.name.endswith(f"_{priority_name}"):
                     selected_computer_tool = tool
                     break
             if selected_computer_tool:
                 break
         # Add the selected computer tool if found
         if selected_computer_tool:
             claude_tool = {
@@ -330,14 +331,18 @@ class ClaudeAgent(MCPAgent):
             # Map Claude's "computer" back to the actual MCP tool name
             self._claude_to_mcp_tool_map["computer"] = selected_computer_tool.name
             claude_tools.append(claude_tool)
-            logger.debug(f"Using {selected_computer_tool.name} as computer tool for Claude")
+            logger.debug("Using %s as computer tool for Claude", selected_computer_tool.name)
         # Add other non-computer tools
         for tool in self._available_tools:
             # Skip computer tools (already handled) and lifecycle tools
-            if tool.name in computer_tool_priority or tool.name in self.lifecycle_tools:
+            is_computer_tool = any(
+                tool.name == priority_name or tool.name.endswith(f"_{priority_name}")
+                for priority_name in computer_tool_priority
+            )
+            if is_computer_tool or tool.name in self.lifecycle_tools:
                 continue
             claude_tool = {
                 "name": tool.name,
                 "description": tool.description or f"Execute {tool.name}",
@@ -359,16 +364,21 @@ class ClaudeAgent(MCPAgent):
         messages_cached = copy.deepcopy(messages)
         # Mark last user message with cache control
-        if messages_cached and messages_cached[-1].get("role") == "user":
+        if (
+            messages_cached
+            and isinstance(messages_cached[-1], dict)
+            and messages_cached[-1].get("role") == "user"
+        ):
             last_content = messages_cached[-1]["content"]
             # Content is formatted to be list of ContentBlock in format_blocks and format_message
             if isinstance(last_content, list):
                 for block in last_content:
-                    # Only add cache control to block types that support it
-                    block_type = block.get("type")
-                    if block_type in ["text", "image", "tool_use", "tool_result"]:
-                        cache_control: BetaCacheControlEphemeralParam = {"type": "ephemeral"}
-                        block["cache_control"] = cache_control  # type: ignore[reportGeneralTypeIssues]
+                    # Only add cache control to dict-like block types that support it
+                    if isinstance(block, dict):
+                        block_type = block.get("type")
+                        if block_type in ["text", "image", "tool_use", "tool_result"]:
+                            cache_control: BetaCacheControlEphemeralParam = {"type": "ephemeral"}
+                            block["cache_control"] = cache_control  # type: ignore[reportGeneralTypeIssues]
         return messages_cached

hud_python-0.4.22/hud/agents/grounded_openai.py ADDED Viewed

@@ -0,0 +1,280 @@
+"""Grounded OpenAI agent that separates visual grounding from reasoning."""
+from __future__ import annotations
+import json
+from typing import Any
+from hud import instrument
+from hud.tools.grounding import GroundedComputerTool, Grounder, GrounderConfig
+from hud.types import AgentResponse, MCPToolCall, MCPToolResult
+from .openai_chat_generic import GenericOpenAIChatAgent
+class GroundedOpenAIChatAgent(GenericOpenAIChatAgent):
+    """OpenAI agent that uses a separate grounding model for element detection.
+    This agent:
+    - Exposes only a synthetic "computer" tool to the planning model
+    - Intercepts tool calls to ground element descriptions to coordinates
+    - Converts grounded results to real computer tool calls
+    - Maintains screenshot state for grounding operations
+    The architecture separates concerns:
+    - Planning model (GPT-4o etc) focuses on high-level reasoning
+    - Grounding model (Qwen2-VL etc) handles visual element detection
+    """
+    def __init__(
+        self,
+        *,
+        grounder_config: GrounderConfig,
+        model_name: str = "gpt-4o-mini",
+        allowed_tools: list[str] | None = None,
+        append_setup_output: bool = False,
+        system_prompt: str | None = None,
+        **kwargs: Any,
+    ) -> None:
+        """Initialize the grounded OpenAI agent.
+        Args:
+            grounder_config: Configuration for the grounding model
+            openai_client: OpenAI client for the planning model
+            model: Name of the OpenAI model to use for planning (e.g., "gpt-4o", "gpt-4o-mini")
+            real_computer_tool_name: Name of the actual computer tool to execute
+            **kwargs: Additional arguments passed to GenericOpenAIChatAgent
+        """
+        # Set defaults for grounded agent
+        if allowed_tools is None:
+            allowed_tools = ["computer"]
+        if system_prompt is None:
+            system_prompt = (
+                "You are a helpful AI assistant that can control the computer "
+                "through visual interaction.\n\n"
+                "IMPORTANT: Always explain your reasoning and observations before taking actions:\n"
+                "1. First, describe what you see on the screen\n"
+                "2. Explain what you plan to do and why\n"
+                "3. Then use the computer tool with natural language descriptions\n\n"
+                "For example:\n"
+                "- 'I can see a login form with username and password fields. "
+                "I need to click on the username field first.'\n"
+                "- 'There's a blue submit button at the bottom. "
+                "I'll click on it to submit the form.'\n"
+                "- 'I notice a red close button in the top right corner. "
+                "I'll click it to close this dialog.'\n\n"
+                "Use descriptive element descriptions like:\n"
+                "- Colors: 'red button', 'blue link', 'green checkmark'\n"
+                "- Position: 'top right corner', 'bottom of the page', 'left sidebar'\n"
+                "- Text content: 'Submit button', 'Login link', 'Cancel option'\n"
+                "- Element type: 'text field', 'dropdown menu', 'checkbox'"
+            )
+        super().__init__(
+            model_name=model_name,
+            allowed_tools=allowed_tools,
+            append_setup_output=append_setup_output,
+            system_prompt=system_prompt,
+            **kwargs,
+        )
+        self.grounder = Grounder(grounder_config)
+        self.grounded_tool = None
+    async def initialize(self, task: Any = None) -> None:
+        """Initialize the agent and create the grounded tool with mcp_client."""
+        # Call parent initialization first
+        await super().initialize(task)
+        if self.mcp_client is None:
+            raise ValueError("mcp_client must be initialized before creating grounded tool")
+        self.grounded_tool = GroundedComputerTool(
+            grounder=self.grounder, mcp_client=self.mcp_client, computer_tool_name="computer"
+        )
+    def get_tool_schemas(self) -> list[Any]:
+        """Override to expose only the synthetic grounded tool.
+        The planning model only sees the synthetic "computer" tool,
+        which is provided by the grounded tool itself.
+        Returns:
+            List containing only the grounded computer tool schema
+        """
+        if self.grounded_tool is None:
+            return []
+        return [self.grounded_tool.get_openai_tool_schema()]
+    @instrument(
+        span_type="agent",
+        record_args=False,
+        record_result=True,
+    )
+    async def get_response(self, messages: Any) -> AgentResponse:
+        """Get response from the planning model and handle grounded tool calls.
+        This method:
+        1. Calls the planning model with the grounded tool schema
+        2. Executes any tool calls directly through the grounded tool
+        3. Returns the response
+        Args:
+            messages: Conversation messages
+        Returns:
+            AgentResponse with either content or tool calls for MCP execution
+        """
+        tool_schemas = self.get_tool_schemas()
+        # Take initial screenshot and add to messages if this is the first turn
+        has_image = any(
+            isinstance(m.get("content"), list)
+            and any(
+                block.get("type") == "image_url"
+                for block in m["content"]
+                if isinstance(block, dict)
+            )
+            for m in messages
+            if isinstance(m.get("content"), list)
+        )
+        if not has_image:
+            if self.mcp_client is None:
+                raise ValueError("mcp_client is not initialized")
+            screenshot_result = await self.mcp_client.call_tool(
+                MCPToolCall(name="computer", arguments={"action": "screenshot"})
+            )
+            for block in screenshot_result.content:
+                # Check for ImageContent type from MCP
+                if hasattr(block, "data") and hasattr(block, "mimeType"):
+                    mime_type = getattr(block, "mimeType", "image/png")
+                    data = getattr(block, "data", "")
+                    messages.append(
+                        {
+                            "role": "user",
+                            "content": [
+                                {
+                                    "type": "image_url",
+                                    "image_url": {"url": f"data:{mime_type};base64,{data}"},
+                                }
+                            ],
+                        }
+                    )
+                    break
+        protected_keys = {"model", "messages", "tools", "parallel_tool_calls"}
+        extra = {k: v for k, v in (self.completion_kwargs or {}).items() if k not in protected_keys}
+        response = await self.oai.chat.completions.create(
+            model=self.model_name,
+            messages=messages,
+            tools=tool_schemas,
+            parallel_tool_calls=False,
+            **extra,
+        )
+        choice = response.choices[0]
+        msg = choice.message
+        assistant_msg: dict[str, Any] = {"role": "assistant"}
+        if msg.content:
+            assistant_msg["content"] = msg.content
+        if msg.tool_calls:
+            assistant_msg["tool_calls"] = msg.tool_calls
+        messages.append(assistant_msg)
+        self.conversation_history = messages.copy()
+        if not msg.tool_calls:
+            return AgentResponse(
+                content=msg.content or "",
+                tool_calls=[],
+                done=choice.finish_reason in ("stop", "length"),
+                raw=response,
+            )
+        tc = msg.tool_calls[0]
+        if tc.function.name != "computer":
+            return AgentResponse(
+                content=f"Error: Model called unexpected tool '{tc.function.name}'",
+                tool_calls=[],
+                done=True,
+                raw=response,
+            )
+        # Parse the arguments
+        try:
+            args = json.loads(tc.function.arguments or "{}")
+        except json.JSONDecodeError:
+            return AgentResponse(
+                content="Error: Invalid tool arguments", tool_calls=[], done=True, raw=response
+            )
+        tool_call = MCPToolCall(name="computer", arguments=args, id=tc.id)
+        return AgentResponse(
+            content=msg.content or "", tool_calls=[tool_call], done=False, raw=response
+        )
+    async def call_tools(
+        self, tool_call: MCPToolCall | list[MCPToolCall] | None = None
+    ) -> list[MCPToolResult]:
+        """Override call_tools to intercept computer tool calls.
+        Execute them through grounded tool.
+        """
+        if tool_call is None:
+            return []
+        if isinstance(tool_call, MCPToolCall):
+            tool_call = [tool_call]
+        results: list[MCPToolResult] = []
+        for tc in tool_call:
+            if tc.name == "computer":
+                # Execute through grounded tool instead of MCP
+                try:
+                    # Extract latest screenshot from conversation history
+                    screenshot_b64 = None
+                    for m in reversed(self.conversation_history):
+                        if m.get("role") == "user" and isinstance(m.get("content"), list):
+                            for block in m["content"]:
+                                if (
+                                    isinstance(block, dict)
+                                    and block.get("type") == "image_url"
+                                    and isinstance(block.get("image_url"), dict)
+                                ):
+                                    url = block["image_url"].get("url", "")
+                                    if url.startswith("data:"):
+                                        screenshot_b64 = (
+                                            url.split(",", 1)[1] if "," in url else None
+                                        )
+                                        break
+                            if screenshot_b64:
+                                break
+                    # Pass screenshot to grounded tool
+                    args_with_screenshot = dict(tc.arguments) if tc.arguments else {}
+                    if screenshot_b64:
+                        args_with_screenshot["screenshot_b64"] = screenshot_b64
+                    if self.grounded_tool is None:
+                        raise ValueError("Grounded tool is not initialized")
+                    content_blocks = await self.grounded_tool(**args_with_screenshot)
+                    results.append(MCPToolResult(content=content_blocks, isError=False))
+                except Exception as e:
+                    # Create error result
+                    from mcp.types import TextContent
+                    error_content = TextContent(text=str(e), type="text")
+                    results.append(MCPToolResult(content=[error_content], isError=True))
+            else:
+                # For non-computer tools, use parent implementation
+                parent_results = await super().call_tools(tc)
+                results.extend(parent_results)
+        return results

{hud_python-0.4.20 → hud_python-0.4.22}/hud/agents/tests/test_client.py RENAMED Viewed

@@ -33,29 +33,6 @@ class TestMCPClient:
         with patch("mcp_use.client.MCPClient.from_dict", return_value=mock_instance):
             yield mock_instance
-    @pytest.mark.asyncio
-    async def test_init_with_config(self, mock_telemetry):
-        """Test client initialization with config dictionary."""
-        mcp_config = {
-            "test_server": {
-                "command": "python",
-                "args": ["-m", "test_server"],
-                "env": {"TEST": "true"},
-            }
-        }
-        with patch("mcp_use.client.MCPClient.from_dict") as mock_from_dict:
-            mock_instance = MagicMock()
-            mock_instance.create_all_sessions = AsyncMock(return_value={})
-            mock_from_dict.return_value = mock_instance
-            client = MCPClient(mcp_config=mcp_config, verbose=True)
-            # Initialize to trigger connection
-            await client.initialize()
-            assert client.verbose is True
-            # Verify MCPUseClient.from_dict was called with proper config
-            mock_from_dict.assert_called_once_with({"mcpServers": mcp_config})
     @pytest.mark.asyncio
     async def test_connect_single_server(self, mock_telemetry, mock_mcp_use_client):
         """Test connecting to a single server."""
@@ -146,10 +123,10 @@ class TestMCPClient:
         # Verify sessions were created
         mock_mcp_use_client.create_all_sessions.assert_called_once()
-        # Check tools from both servers
+        # Check tools from both servers - should be prefixed with server names
         tools = await client.list_tools()
         names = {t.name for t in tools}
-        assert names == {"tool1", "tool2"}
+        assert names == {"server1_tool1", "server2_tool2"}
     @pytest.mark.asyncio
     async def test_call_tool(self, mock_telemetry, mock_mcp_use_client):
@@ -220,8 +197,15 @@ class TestMCPClient:
         await client.initialize()
-        with pytest.raises(ValueError, match="Tool 'nonexistent' not found"):
-            await client.call_tool(name="nonexistent", arguments={})
+        # Calling a non-existent tool should return an error result
+        result = await client.call_tool(name="nonexistent", arguments={})
+        assert result.isError is True
+        # Check that the error message is in the text content
+        text_content = ""
+        for content in result.content:
+            if isinstance(content, types.TextContent):
+                text_content += content.text
+        assert "Tool 'nonexistent' not found" in text_content
     @pytest.mark.asyncio
     async def test_get_telemetry_data(self, mock_telemetry, mock_mcp_use_client):

hud-python 0.4.20__tar.gz → 0.4.22__tar.gz

Potentially problematic release.

hud-python 0.4.20__tar.gz → 0.4.22__tar.gz