hud-python 0.4.13__py3-none-any.whl → 0.4.15__py3-none-any.whl

This diff shows the changes between publicly released versions of the package as they appear in their respective public registries. It is provided for informational purposes only.

Potentially problematic release.



hud/agents/base.py CHANGED
@@ -6,11 +6,12 @@ import asyncio
 import json
 import logging
 from abc import ABC, abstractmethod
-from typing import TYPE_CHECKING, Any, Literal
+from typing import TYPE_CHECKING, Any, ClassVar, Literal
 
 import mcp.types as types
 
 from hud.types import AgentResponse, MCPToolCall, MCPToolResult, Trace
+from hud.utils.design import HUDDesign
 from hud.utils.mcp import MCPConfigPatch, patch_mcp_config, setup_hud_telemetry
 
 if TYPE_CHECKING:
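
Note: most of the remaining changes in this file replace direct module-logger calls with methods on the new HUDDesign helper. The facade's shape below is inferred from the call sites in this diff; the level semantics in the comments are an assumption, not documented API:

    design = HUDDesign(logger=logger)
    design.set_verbose(True)       # opt in to INFO-level output
    design.debug("...")            # verbose-only detail (steps, tool calls, messages)
    design.info_log("...")         # standard progress output
    design.progress_log("...")     # phase banners (setup/evaluate)
    design.success_log("...")      # completion messages
    design.warning_log("...")      # interruptions, cleanup failures
    design.error_log("...")        # failures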
@@ -35,6 +36,7 @@ class MCPAgent(ABC):
     """
 
     metadata: dict[str, Any]
+    required_tools: ClassVar[list[str]] = []  # Tools that must be available
 
     def __init__(
         self,
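
The new required_tools class variable lets a subclass declare tools that must be exposed by the connected MCP server; _filter_tools (later in this diff) raises ValueError when any are missing. A minimal sketch of how a subclass might use it (the class and tool names here are hypothetical):

    from typing import Any, ClassVar

    from hud.agents.base import MCPAgent

    class MyComputerAgent(MCPAgent):  # hypothetical subclass; abstract methods omitted
        # Initialization fails fast with ValueError if the server does not
        # expose these tools after filtering.
        required_tools: ClassVar[list[str]] = ["computer", "screenshot"]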
@@ -51,6 +53,7 @@ class MCPAgent(ABC):
         model_name: str = "mcp-agent",
         response_agent: ResponseAgent | None = None,
         auto_trace: bool = True,
+        verbose: bool = False,
     ) -> None:
         """
         Initialize the base MCP agent.
@@ -63,12 +66,18 @@ class MCPAgent(ABC):
             initial_screenshot: Whether to capture screenshot before first prompt
             system_prompt: System prompt to use
             append_setup_output: Whether to append setup tool output to initial messages
+            verbose: If True, sets logging level to INFO. If False, only WARNING and above.
         """
 
         self.mcp_client = mcp_client
         self._auto_created_client = False  # Track if we created the client
 
         self.model_name = model_name
+        self.design = HUDDesign(logger=logger)
+
+        # Set verbose mode if requested
+        if verbose:
+            self.design.set_verbose(True)
 
         # Filtering
         self.allowed_tools = allowed_tools
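
The verbose flag gives callers a one-line opt-in to INFO-level output. For example, assuming a concrete subclass such as ClaudeAgent forwards **kwargs to this constructor (its signature later in this diff suggests it does):

    from hud.agents.claude import ClaudeAgent

    agent = ClaudeAgent(verbose=True)   # step and tool logs at INFO
    quiet = ClaudeAgent()               # WARNING and above only (default)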
@@ -101,7 +110,7 @@ class MCPAgent(ABC):
 
             self.mcp_client = MCPClient(mcp_config=task.mcp_config)
             self._auto_created_client = True
-            logger.info("Auto-created MCPClient from task.mcp_config")
+            self.design.info_log("Auto-created MCPClient from task.mcp_config")
 
         # Ensure we have a client
         if self.mcp_client is None:
@@ -112,7 +121,10 @@ class MCPAgent(ABC):
         await self._setup_config(self.mcp_client.mcp_config)
 
         # Initialize client if needed
-        await self.mcp_client.initialize()
+        try:
+            await self.mcp_client.initialize()
+        except Exception as e:
+            self._handle_connection_error(e)
 
         # If task is provided, add lifecycle tools
         if isinstance(task, Task):
@@ -134,9 +146,9 @@ class MCPAgent(ABC):
         # Re-apply filtering with updated lifecycle tools
         await self._filter_tools()
 
-        logger.info(
-            "Agent initialized with %d available tools (after filtering)",
-            len(self._available_tools),
+        num_tools = len(self._available_tools)
+        self.design.success_log(
+            f"Agent initialized with {num_tools} available tools (after filtering)"
         )
 
     async def run(self, prompt_or_task: str | Task | dict[str, Any], max_steps: int = 10) -> Trace:
@@ -173,6 +185,16 @@ class MCPAgent(ABC):
 
             else:
                 raise TypeError(f"prompt_or_task must be str or Task, got {type(prompt_or_task)}")
+        except Exception as e:
+            if self._is_connection_error(e):
+                # Return error trace for connection failures
+                return Trace(
+                    reward=0.0,
+                    done=True,
+                    content=self._get_connection_error_message(e),
+                    isError=True,
+                )
+            raise
         finally:
             # Cleanup auto-created resources
             await self._cleanup()
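
With this hunk, run() converts recognized connection failures into an error Trace instead of letting the exception escape, so callers can branch on the result rather than wrapping every call in try/except. A sketch (the prompt and URL are illustrative):

    trace = await agent.run("Open the dashboard", max_steps=5)
    if trace.isError:
        # e.g. "Connection failed: Could not connect to http://localhost:8080.
        # Is your MCP client/server running?"
        print(trace.content)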
@@ -200,7 +222,7 @@ class MCPAgent(ABC):
 
         # Execute the setup tool and append the initial observation to the context
         if task.setup_tool is not None:
-            logger.info("Setting up tool phase: %s", task.setup_tool)
+            self.design.progress_log(f"Setting up tool phase: {task.setup_tool}")
             results = await self.call_tools(task.setup_tool)
             if any(result.isError for result in results):
                 raise RuntimeError(f"{results}")
@@ -214,7 +236,7 @@ class MCPAgent(ABC):
             prompt_result = await self._run_context(start_context, max_steps=max_steps)
 
         except Exception as e:
-            logger.error("Task execution failed: %s", e)
+            self.design.error_log(f"Task execution failed: {e}")
             # Create an error result but don't return yet - we still want to evaluate
             prompt_result = Trace(reward=0.0, done=True, content=str(e), isError=True)
             prompt_result.populate_from_context()
@@ -222,7 +244,7 @@ class MCPAgent(ABC):
         # Always evaluate if we have a prompt result and evaluate tool
         if prompt_result is not None and task.evaluate_tool is not None:
             try:
-                logger.info("Evaluating tool phase: %s", task.evaluate_tool)
+                self.design.progress_log(f"Evaluating tool phase: {task.evaluate_tool}")
                 results = await self.call_tools(task.evaluate_tool)
 
                 if any(result.isError for result in results):
@@ -245,7 +267,7 @@ class MCPAgent(ABC):
                     prompt_result.content = eval_content
 
             except Exception as e:
-                logger.error("Evaluation phase failed: %s", e)
+                self.design.error_log(f"Evaluation phase failed: {e}")
                 # Continue with the prompt result even if evaluation failed
 
         return (
@@ -276,21 +298,21 @@ class MCPAgent(ABC):
 
         # Add initial context
         messages.extend(await self.format_message(context))
-        logger.debug("Messages: %s", messages)
+        self.design.debug(f"Messages: {messages}")
 
         step_count = 0
         while max_steps == -1 or step_count < max_steps:
             step_count += 1
             if max_steps == -1:
-                logger.info("Step %s (unlimited)", step_count)
+                self.design.debug(f"Step {step_count} (unlimited)")
             else:
-                logger.info("Step %s/%s", step_count, max_steps)
+                self.design.debug(f"Step {step_count}/{max_steps}")
 
             try:
                 # 1. Get model response
                 response = await self.get_response(messages)
 
-                logger.info("Agent:\n%s", response)
+                self.design.debug(f"Agent:\n{response}")
 
                 # Check if we should stop
                 if response.done or not response.tool_calls:
@@ -302,16 +324,16 @@ class MCPAgent(ABC):
                             response.content
                         )
                     except Exception as e:
-                        logger.warning("ResponseAgent failed: %s", e)
+                        self.design.warning_log(f"ResponseAgent failed: {e}")
                     if decision == "STOP":
                         # Try to submit response through lifecycle tool
                         await self._maybe_submit_response(response, messages)
 
-                        logger.info("Stopping execution")
+                        self.design.debug("Stopping execution")
                         final_response = response
                         break
                     else:
-                        logger.info("Continuing execution")
+                        self.design.debug("Continuing execution")
                         messages.extend(await self.format_message(decision))
                         continue
 
@@ -323,19 +345,31 @@ class MCPAgent(ABC):
                 tool_messages = await self.format_tool_results(tool_calls, tool_results)
                 messages.extend(tool_messages)
 
+                # Compact step completion display
+                step_info = f"\n[bold]Step {step_count}"
+                if max_steps != -1:
+                    step_info += f"/{max_steps}"
+                step_info += "[/bold]"
+
+                # Show tool calls and results in compact format
+                for call, result in zip(tool_calls, tool_results, strict=False):
+                    step_info += f"\n{call}\n{result}"
+
+                self.design.info_log(step_info)
+
             except Exception as e:
-                logger.error("Step failed: %s", e)
+                self.design.error_log(f"Step failed: {e}")
                 error = str(e)
                 break
 
         except KeyboardInterrupt:
-            logger.info("Agent execution interrupted by user")
+            self.design.warning_log("Agent execution interrupted by user")
            error = "Interrupted by user"
         except asyncio.CancelledError:
-            logger.info("Agent execution cancelled")
+            self.design.warning_log("Agent execution cancelled")
            error = "Cancelled"
         except Exception as e:
-            logger.error("Unexpected error: %s", e)
+            self.design.error_log(f"Unexpected error: {e}")
            error = str(e)
 
         # Build result
@@ -376,17 +410,17 @@ class MCPAgent(ABC):
         results: list[MCPToolResult] = []
         for tc in tool_call:
             try:
-                logger.info("Calling tool: %s", tc)
+                self.design.debug(f"Calling tool: {tc}")
                 results.append(await self.mcp_client.call_tool(tc))
             except TimeoutError as e:
-                logger.error("Tool execution timed out: %s", e)
+                self.design.error_log(f"Tool execution timed out: {e}")
                 try:
                     await self.mcp_client.shutdown()
                 except Exception as close_err:
-                    logger.debug("Failed to close MCP client cleanly: %s", close_err)
+                    self.design.debug(f"Failed to close MCP client cleanly: {close_err}")
                 raise
             except Exception as e:
-                logger.error("Tool execution failed: %s", e)
+                self.design.error_log(f"Tool execution failed: {e}")
                 results.append(_format_error_result(str(e)))
         return results
 
@@ -490,9 +524,21 @@ class MCPAgent(ABC):
 
             # Auto-detect response tool as a lifecycle tool
             if tool.name == "response" and "response" not in self.lifecycle_tools:
-                logger.debug("Auto-detected 'response' tool as a lifecycle tool")
+                self.design.debug("Auto-detected 'response' tool as a lifecycle tool")
                 self.lifecycle_tools.append("response")
 
+        # Check if all required tools are available
+        if self.required_tools:
+            available_tool_names = {tool.name for tool in self._available_tools}
+            missing_tools = [
+                tool for tool in self.required_tools if tool not in available_tool_names
+            ]
+            if missing_tools:
+                raise ValueError(
+                    f"Required tools not available: {missing_tools}. "
+                    f"Available tools: {list(available_tool_names)}"
+                )
+
     async def _maybe_submit_response(self, response: AgentResponse, messages: list[Any]) -> None:
         """Submit response through lifecycle tool if available.
 
@@ -502,7 +548,7 @@ class MCPAgent(ABC):
         """
         # Check if we have a response lifecycle tool
         if "response" in self.lifecycle_tools and "response" in self._tool_map:
-            logger.debug("Calling response lifecycle tool")
+            self.design.debug("Calling response lifecycle tool")
             try:
                 # Call the response tool with the agent's response
                 response_tool_call = MCPToolCall(
@@ -517,9 +563,9 @@ class MCPAgent(ABC):
                 messages.extend(response_messages)
 
                 # Mark the task as done
-                logger.info("Response lifecycle tool executed, marking task as done")
+                self.design.debug("Response lifecycle tool executed, marking task as done")
             except Exception as e:
-                logger.error("Response lifecycle tool failed: %s", e)
+                self.design.error_log(f"Response lifecycle tool failed: {e}")
 
     async def _setup_config(self, mcp_config: dict[str, dict[str, Any]]) -> None:
         """Inject metadata into the metadata of the initialize request."""
@@ -573,9 +619,9 @@ class MCPAgent(ABC):
         if self._auto_trace_cm:
             try:
                 self._auto_trace_cm.__exit__(None, None, None)
-                logger.info("Closed auto-created trace")
+                self.design.debug("Closed auto-created trace")
             except Exception as e:
-                logger.warning("Failed to close auto-created trace: %s", e)
+                self.design.warning_log(f"Failed to close auto-created trace: {e}")
             finally:
                 self._auto_trace_cm = None
 
@@ -583,13 +629,52 @@ class MCPAgent(ABC):
         if self._auto_created_client and self.mcp_client:
             try:
                 await self.mcp_client.shutdown()
-                logger.info("Closed auto-created MCPClient")
+                self.design.debug("Closed auto-created MCPClient")
             except Exception as e:
-                logger.warning("Failed to close auto-created client: %s", e)
+                self.design.warning_log(f"Failed to close auto-created client: {e}")
             finally:
                 self.mcp_client = None
                 self._auto_created_client = False
 
+    def _is_connection_error(self, e: Exception) -> bool:
+        """Check if an exception is a connection error."""
+        error_msg = str(e).lower()
+        return any(
+            pattern in error_msg
+            for pattern in [
+                "connection",
+                "connect",
+                "refused",
+                "failed",
+                "could not connect",
+                "mcp server",
+            ]
+        )
+
+    def _get_connection_error_message(self, e: Exception) -> str:
+        """Extract a helpful connection error message."""
+        import re
+
+        url_match = re.search(r"https?://[^\s]+", str(e))
+        url = url_match.group(0) if url_match else "the MCP server"
+        return f"Connection failed: Could not connect to {url}. Is your MCP client/server running?"
+
+    def _handle_connection_error(self, e: Exception) -> None:
+        """Handle connection errors with helpful messages."""
+        if self._is_connection_error(e):
+            msg = self._get_connection_error_message(e)
+            # Always show connection errors, not just when logging is enabled
+            self.design.error(f"❌ {msg}")
+            self.design.info("💡 Make sure the MCP server is started before running the agent.")
+
+            # For localhost, provide specific instructions
+            error_str = str(e).lower()
+            if "localhost" in error_str or "127.0.0.1" in error_str:
+                self.design.info(" Run 'hud dev' in another terminal to start the MCP server")
+
+            raise RuntimeError(msg) from e
+        raise
+
 
 def _format_error_result(error_message: str) -> MCPToolResult:
     return MCPToolResult(content=text_to_blocks(error_message), isError=True)
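
Note that _is_connection_error is a substring heuristic over the exception text, so the broad "failed" pattern also matches errors that have nothing to do with connectivity. Illustrative behavior, under a plain reading of the code above:

    agent._is_connection_error(ConnectionRefusedError("Connection refused"))    # True ("refused")
    agent._is_connection_error(RuntimeError("model failed to produce output"))  # True ("failed")
    agent._is_connection_error(ValueError("invalid tool arguments"))            # False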
hud/agents/claude.py CHANGED
@@ -50,7 +50,7 @@ class ClaudeAgent(MCPAgent):
     def __init__(
         self,
         model_client: AsyncAnthropic | None = None,
-        model: str = "claude-3-7-sonnet-20250219",
+        model: str = "claude-sonnet-4-20250514",
         max_tokens: int = 4096,
         use_computer_beta: bool = True,
         **kwargs: Any,
hud/agents/openai.py CHANGED
@@ -38,6 +38,7 @@ class OperatorAgent(MCPAgent):
         "display_width": computer_settings.OPENAI_COMPUTER_WIDTH,
         "display_height": computer_settings.OPENAI_COMPUTER_HEIGHT,
     }
+    required_tools: ClassVar[list[str]] = ["openai_computer"]
 
     def __init__(
         self,
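
Declaring required_tools here moves the availability check from every get_response() call (see the removed fallback in the next hunk) to a single fail-fast ValueError during agent setup. A sketch of the new failure mode (client construction and prompt are illustrative):

    agent = OperatorAgent(mcp_client=client)
    # If the connected server does not expose "openai_computer", setup now raises
    # ValueError("Required tools not available: ['openai_computer']. ...")
    result = await agent.run("Click the submit button")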
@@ -143,20 +144,8 @@ class OperatorAgent(MCPAgent):
         """Get response from OpenAI including any tool calls."""
         # OpenAI's API is stateful, so we handle messages differently
 
-        # Check if we have computer tools available
-        computer_tool_name = None
-        for tool in self._available_tools:
-            if tool.name in ["openai_computer", "computer"]:
-                computer_tool_name = tool.name
-                break
-
-        if not computer_tool_name:
-            # No computer tools available, just return a text response
-            return AgentResponse(
-                content="No computer use tools available",
-                tool_calls=[],
-                done=True,
-            )
+        # Get the computer tool (guaranteed to exist due to required_tools)
+        computer_tool_name = "openai_computer"
 
         # Define the computer use tool
         computer_tool: ToolParam = {  # type: ignore[reportAssignmentType]
@@ -209,7 +198,7 @@ class OperatorAgent(MCPAgent):
                 break
 
         if not latest_screenshot:
-            logger.warning("No screenshot provided for response to action")
+            self.design.warning_log("No screenshot provided for response to action")
             return AgentResponse(
                 content="No screenshot available for next action",
                 tool_calls=[],
@@ -332,7 +321,7 @@ class OperatorAgent(MCPAgent):
         for content in result.content:
             if isinstance(content, types.TextContent):
                 # Don't add error text as input_text, just track it
-                logger.error("Tool error: %s", content.text)
+                self.design.error_log(f"Tool error: {content.text}")
             elif isinstance(content, types.ImageContent):
                 # Even error results might have images
                 latest_screenshot = content.data
@@ -20,6 +20,15 @@ class TestOperatorAgent:
         mcp_client = AsyncMock()
         # Set up the mcp_config attribute as a regular dict, not a coroutine
         mcp_client.mcp_config = {"test_server": {"url": "http://test"}}
+        # Mock list_tools to return the required openai_computer tool
+        mcp_client.list_tools = AsyncMock(
+            return_value=[
+                types.Tool(
+                    name="openai_computer", description="OpenAI computer use tool", inputSchema={}
+                )
+            ]
+        )
+        mcp_client.initialize = AsyncMock()
         return mcp_client
 
     @pytest.fixture
@@ -129,91 +138,27 @@ class TestOperatorAgent:
             types.Tool(name="computer_openai", description="Computer tool", inputSchema={})
         ]
 
-        # Since OpenAI checks isinstance() on response types, we need to mock that
-        # For now, let's just test that we get the expected "No computer use tools available"
-        # when there are no matching tools
-        agent._available_tools = [
-            types.Tool(name="other_tool", description="Other tool", inputSchema={})
-        ]
-
-        messages = [{"prompt": "What's on the screen?", "screenshot": None}]
-        response = await agent.get_response(messages)
-
-        assert response.content == "No computer use tools available"
-        assert response.tool_calls == []
-        assert response.done is True
-
-    @pytest.mark.asyncio
-    async def test_get_model_response_text_only(self, mock_mcp_client, mock_openai):
-        """Test getting text-only response when no computer tools available."""
-        agent = OperatorAgent(mcp_client=mock_mcp_client, model_client=mock_openai)
+        # Mock OpenAI API response for a successful computer use response
+        mock_response = MagicMock()
+        mock_response.id = "response_123"
+        mock_response.state = "completed"
+        # Mock the output message structure
+        mock_output_text = MagicMock()
+        mock_output_text.type = "output_text"
+        mock_output_text.text = "I can see the screen content."
+        mock_output_message = MagicMock()
+        mock_output_message.type = "message"
+        mock_output_message.content = [mock_output_text]
+        mock_response.output = [mock_output_message]
 
-        # Set up with no computer tools
-        agent._available_tools = []
+        mock_openai.responses.create = AsyncMock(return_value=mock_response)
 
-        messages = [{"prompt": "Hi", "screenshot": None}]
+        messages = [{"prompt": "What's on the screen?", "screenshot": None}]
         response = await agent.get_response(messages)
 
-        assert response.content == "No computer use tools available"
-        assert response.tool_calls == []
+        assert response.content == "I can see the screen content."
         assert response.done is True
 
-    @pytest.mark.asyncio
-    async def test_run_with_tools(self, mock_mcp_client, mock_openai):
-        """Test running agent with tool usage."""
-        agent = OperatorAgent(mcp_client=mock_mcp_client, model_client=mock_openai)
-
-        # Mock tool availability
-        agent._available_tools = [
-            types.Tool(name="search", description="Search tool", inputSchema={"type": "object"})
-        ]
-        # Base agent doesn't require server mapping for tool execution
-
-        # Mock initial response with tool use
-        initial_choice = MagicMock()
-        initial_choice.message = MagicMock(
-            content=None,
-            tool_calls=[
-                MagicMock(
-                    id="call_search",
-                    function=MagicMock(name="search", arguments='{"query": "OpenAI news"}'),
-                )
-            ],
-        )
-
-        initial_response = MagicMock()
-        initial_response.choices = [initial_choice]
-        initial_response.usage = MagicMock(prompt_tokens=10, completion_tokens=15, total_tokens=25)
-
-        # Mock follow-up response
-        final_choice = MagicMock()
-        final_choice.message = MagicMock(
-            content="Here are the latest OpenAI news...", tool_calls=None
-        )
-
-        final_response = MagicMock()
-        final_response.choices = [final_choice]
-        final_response.usage = MagicMock(prompt_tokens=20, completion_tokens=10, total_tokens=30)
-
-        mock_openai.chat.completions.create = AsyncMock(
-            side_effect=[initial_response, final_response]
-        )
-
-        # Mock tool execution
-        mock_mcp_client.call_tool = AsyncMock(
-            return_value=MCPToolResult(
-                content=[types.TextContent(type="text", text="Search results...")], isError=False
-            )
-        )
-
-        # Use a string prompt instead of a task
-        result = await agent.run("Search for OpenAI news")
-
-        # Since OpenAI integration currently returns "No computer use tools available"
-        # when the tool isn't a computer tool, we expect this
-        assert result.content == "No computer use tools available"
-        assert result.done is True
-
     @pytest.mark.asyncio
     async def test_handle_empty_response(self, mock_mcp_client, mock_openai):
         """Test handling empty response from API."""