hud-python 0.4.21__py3-none-any.whl → 0.4.22__py3-none-any.whl
This diff shows the changes between two publicly released versions of this package, as they appear in their public registry. It is provided for informational purposes only.
Potentially problematic release. This version of hud-python might be problematic.
- hud/agents/base.py +2 -0
- hud/agents/claude.py +11 -6
- hud/agents/grounded_openai.py +280 -0
- hud/agents/tests/test_client.py +6 -1
- hud/agents/tests/test_grounded_openai_agent.py +155 -0
- hud/cli/eval.py +2 -2
- hud/cli/utils/interactive.py +1 -1
- hud/settings.py +6 -0
- hud/tools/executors/tests/test_base_executor.py +1 -1
- hud/tools/executors/xdo.py +1 -1
- hud/tools/grounding/__init__.py +13 -0
- hud/tools/grounding/config.py +54 -0
- hud/tools/grounding/grounded_tool.py +314 -0
- hud/tools/grounding/grounder.py +301 -0
- hud/tools/grounding/tests/__init__.py +1 -0
- hud/tools/grounding/tests/test_grounded_tool.py +196 -0
- hud/tools/tests/test_playwright_tool.py +1 -1
- hud/tools/tests/test_tools_init.py +1 -1
- hud/tools/tests/test_utils.py +2 -2
- hud/utils/agent_factories.py +86 -0
- hud/utils/tests/test_version.py +1 -1
- hud/version.py +1 -1
- {hud_python-0.4.21.dist-info → hud_python-0.4.22.dist-info}/METADATA +1 -1
- {hud_python-0.4.21.dist-info → hud_python-0.4.22.dist-info}/RECORD +27 -18
- {hud_python-0.4.21.dist-info → hud_python-0.4.22.dist-info}/WHEEL +0 -0
- {hud_python-0.4.21.dist-info → hud_python-0.4.22.dist-info}/entry_points.txt +0 -0
- {hud_python-0.4.21.dist-info → hud_python-0.4.22.dist-info}/licenses/LICENSE +0 -0
hud/agents/base.py
CHANGED
hud/agents/claude.py
CHANGED
@@ -364,16 +364,21 @@ class ClaudeAgent(MCPAgent):
         messages_cached = copy.deepcopy(messages)
 
         # Mark last user message with cache control
-        if
+        if (
+            messages_cached
+            and isinstance(messages_cached[-1], dict)
+            and messages_cached[-1].get("role") == "user"
+        ):
             last_content = messages_cached[-1]["content"]
             # Content is formatted to be list of ContentBlock in format_blocks and format_message
             if isinstance(last_content, list):
                 for block in last_content:
-                    # Only add cache control to block types that support it
-
-
-
-
+                    # Only add cache control to dict-like block types that support it
+                    if isinstance(block, dict):
+                        block_type = block.get("type")
+                        if block_type in ["text", "image", "tool_use", "tool_result"]:
+                            cache_control: BetaCacheControlEphemeralParam = {"type": "ephemeral"}
+                            block["cache_control"] = cache_control  # type: ignore[reportGeneralTypeIssues]
 
         return messages_cached
 
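The rewritten block above only annotates dict-shaped content blocks, so an SDK object in the content list no longer breaks the caching pass. For orientation, a minimal sketch of the message shape this produces, assuming Anthropic-style prompt caching (illustrative payload, not taken from the package):

# Hypothetical last user message after the cache-control pass above.
# Anthropic prompt caching can reuse any prompt prefix that ends at a
# block carrying {"cache_control": {"type": "ephemeral"}}.
last_user_message = {
    "role": "user",
    "content": [
        {
            "type": "text",
            "text": "Click the submit button.",
            "cache_control": {"type": "ephemeral"},
        }
    ],
}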
hud/agents/grounded_openai.py
ADDED
@@ -0,0 +1,280 @@
+"""Grounded OpenAI agent that separates visual grounding from reasoning."""
+
+from __future__ import annotations
+
+import json
+from typing import Any
+
+from hud import instrument
+from hud.tools.grounding import GroundedComputerTool, Grounder, GrounderConfig
+from hud.types import AgentResponse, MCPToolCall, MCPToolResult
+
+from .openai_chat_generic import GenericOpenAIChatAgent
+
+
+class GroundedOpenAIChatAgent(GenericOpenAIChatAgent):
+    """OpenAI agent that uses a separate grounding model for element detection.
+
+    This agent:
+    - Exposes only a synthetic "computer" tool to the planning model
+    - Intercepts tool calls to ground element descriptions to coordinates
+    - Converts grounded results to real computer tool calls
+    - Maintains screenshot state for grounding operations
+
+    The architecture separates concerns:
+    - Planning model (GPT-4o etc) focuses on high-level reasoning
+    - Grounding model (Qwen2-VL etc) handles visual element detection
+    """
+
+    def __init__(
+        self,
+        *,
+        grounder_config: GrounderConfig,
+        model_name: str = "gpt-4o-mini",
+        allowed_tools: list[str] | None = None,
+        append_setup_output: bool = False,
+        system_prompt: str | None = None,
+        **kwargs: Any,
+    ) -> None:
+        """Initialize the grounded OpenAI agent.
+
+        Args:
+            grounder_config: Configuration for the grounding model
+            openai_client: OpenAI client for the planning model
+            model: Name of the OpenAI model to use for planning (e.g., "gpt-4o", "gpt-4o-mini")
+            real_computer_tool_name: Name of the actual computer tool to execute
+            **kwargs: Additional arguments passed to GenericOpenAIChatAgent
+        """
+        # Set defaults for grounded agent
+        if allowed_tools is None:
+            allowed_tools = ["computer"]
+
+        if system_prompt is None:
+            system_prompt = (
+                "You are a helpful AI assistant that can control the computer "
+                "through visual interaction.\n\n"
+                "IMPORTANT: Always explain your reasoning and observations before taking actions:\n"
+                "1. First, describe what you see on the screen\n"
+                "2. Explain what you plan to do and why\n"
+                "3. Then use the computer tool with natural language descriptions\n\n"
+                "For example:\n"
+                "- 'I can see a login form with username and password fields. "
+                "I need to click on the username field first.'\n"
+                "- 'There's a blue submit button at the bottom. "
+                "I'll click on it to submit the form.'\n"
+                "- 'I notice a red close button in the top right corner. "
+                "I'll click it to close this dialog.'\n\n"
+                "Use descriptive element descriptions like:\n"
+                "- Colors: 'red button', 'blue link', 'green checkmark'\n"
+                "- Position: 'top right corner', 'bottom of the page', 'left sidebar'\n"
+                "- Text content: 'Submit button', 'Login link', 'Cancel option'\n"
+                "- Element type: 'text field', 'dropdown menu', 'checkbox'"
+            )
+
+        super().__init__(
+            model_name=model_name,
+            allowed_tools=allowed_tools,
+            append_setup_output=append_setup_output,
+            system_prompt=system_prompt,
+            **kwargs,
+        )
+
+        self.grounder = Grounder(grounder_config)
+        self.grounded_tool = None
+
+    async def initialize(self, task: Any = None) -> None:
+        """Initialize the agent and create the grounded tool with mcp_client."""
+        # Call parent initialization first
+        await super().initialize(task)
+
+        if self.mcp_client is None:
+            raise ValueError("mcp_client must be initialized before creating grounded tool")
+        self.grounded_tool = GroundedComputerTool(
+            grounder=self.grounder, mcp_client=self.mcp_client, computer_tool_name="computer"
+        )
+
+    def get_tool_schemas(self) -> list[Any]:
+        """Override to expose only the synthetic grounded tool.
+
+        The planning model only sees the synthetic "computer" tool,
+        which is provided by the grounded tool itself.
+
+        Returns:
+            List containing only the grounded computer tool schema
+        """
+        if self.grounded_tool is None:
+            return []
+        return [self.grounded_tool.get_openai_tool_schema()]
+
+    @instrument(
+        span_type="agent",
+        record_args=False,
+        record_result=True,
+    )
+    async def get_response(self, messages: Any) -> AgentResponse:
+        """Get response from the planning model and handle grounded tool calls.
+
+        This method:
+        1. Calls the planning model with the grounded tool schema
+        2. Executes any tool calls directly through the grounded tool
+        3. Returns the response
+
+        Args:
+            messages: Conversation messages
+
+        Returns:
+            AgentResponse with either content or tool calls for MCP execution
+        """
+        tool_schemas = self.get_tool_schemas()
+
+        # Take initial screenshot and add to messages if this is the first turn
+        has_image = any(
+            isinstance(m.get("content"), list)
+            and any(
+                block.get("type") == "image_url"
+                for block in m["content"]
+                if isinstance(block, dict)
+            )
+            for m in messages
+            if isinstance(m.get("content"), list)
+        )
+
+        if not has_image:
+            if self.mcp_client is None:
+                raise ValueError("mcp_client is not initialized")
+            screenshot_result = await self.mcp_client.call_tool(
+                MCPToolCall(name="computer", arguments={"action": "screenshot"})
+            )
+
+            for block in screenshot_result.content:
+                # Check for ImageContent type from MCP
+                if hasattr(block, "data") and hasattr(block, "mimeType"):
+                    mime_type = getattr(block, "mimeType", "image/png")
+                    data = getattr(block, "data", "")
+                    messages.append(
+                        {
+                            "role": "user",
+                            "content": [
+                                {
+                                    "type": "image_url",
+                                    "image_url": {"url": f"data:{mime_type};base64,{data}"},
+                                }
+                            ],
+                        }
+                    )
+                    break
+
+        protected_keys = {"model", "messages", "tools", "parallel_tool_calls"}
+        extra = {k: v for k, v in (self.completion_kwargs or {}).items() if k not in protected_keys}
+
+        response = await self.oai.chat.completions.create(
+            model=self.model_name,
+            messages=messages,
+            tools=tool_schemas,
+            parallel_tool_calls=False,
+            **extra,
+        )
+
+        choice = response.choices[0]
+        msg = choice.message
+
+        assistant_msg: dict[str, Any] = {"role": "assistant"}
+        if msg.content:
+            assistant_msg["content"] = msg.content
+        if msg.tool_calls:
+            assistant_msg["tool_calls"] = msg.tool_calls
+
+        messages.append(assistant_msg)
+
+        self.conversation_history = messages.copy()
+
+        if not msg.tool_calls:
+            return AgentResponse(
+                content=msg.content or "",
+                tool_calls=[],
+                done=choice.finish_reason in ("stop", "length"),
+                raw=response,
+            )
+
+        tc = msg.tool_calls[0]
+
+        if tc.function.name != "computer":
+            return AgentResponse(
+                content=f"Error: Model called unexpected tool '{tc.function.name}'",
+                tool_calls=[],
+                done=True,
+                raw=response,
+            )
+
+        # Parse the arguments
+        try:
+            args = json.loads(tc.function.arguments or "{}")
+        except json.JSONDecodeError:
+            return AgentResponse(
+                content="Error: Invalid tool arguments", tool_calls=[], done=True, raw=response
+            )
+
+        tool_call = MCPToolCall(name="computer", arguments=args, id=tc.id)
+
+        return AgentResponse(
+            content=msg.content or "", tool_calls=[tool_call], done=False, raw=response
+        )
+
+    async def call_tools(
+        self, tool_call: MCPToolCall | list[MCPToolCall] | None = None
+    ) -> list[MCPToolResult]:
+        """Override call_tools to intercept computer tool calls.
+
+        Execute them through grounded tool.
+        """
+        if tool_call is None:
+            return []
+
+        if isinstance(tool_call, MCPToolCall):
+            tool_call = [tool_call]
+
+        results: list[MCPToolResult] = []
+        for tc in tool_call:
+            if tc.name == "computer":
+                # Execute through grounded tool instead of MCP
+                try:
+                    # Extract latest screenshot from conversation history
+                    screenshot_b64 = None
+                    for m in reversed(self.conversation_history):
+                        if m.get("role") == "user" and isinstance(m.get("content"), list):
+                            for block in m["content"]:
+                                if (
+                                    isinstance(block, dict)
+                                    and block.get("type") == "image_url"
+                                    and isinstance(block.get("image_url"), dict)
+                                ):
+                                    url = block["image_url"].get("url", "")
+                                    if url.startswith("data:"):
+                                        screenshot_b64 = (
+                                            url.split(",", 1)[1] if "," in url else None
+                                        )
+                                    break
+                            if screenshot_b64:
+                                break
+
+                    # Pass screenshot to grounded tool
+                    args_with_screenshot = dict(tc.arguments) if tc.arguments else {}
+                    if screenshot_b64:
+                        args_with_screenshot["screenshot_b64"] = screenshot_b64
+
+                    if self.grounded_tool is None:
+                        raise ValueError("Grounded tool is not initialized")
+                    content_blocks = await self.grounded_tool(**args_with_screenshot)
+                    results.append(MCPToolResult(content=content_blocks, isError=False))
+                except Exception as e:
+                    # Create error result
+                    from mcp.types import TextContent
+
+                    error_content = TextContent(text=str(e), type="text")
+                    results.append(MCPToolResult(content=[error_content], isError=True))
+            else:
+                # For non-computer tools, use parent implementation
+                parent_results = await super().call_tools(tc)
+                results.extend(parent_results)
+
+        return results
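For orientation, a hedged construction sketch for the new agent. The endpoint URL and model names are placeholders, and any constructor options beyond those visible in this diff are assumptions about what GenericOpenAIChatAgent accepts:

from hud.agents.grounded_openai import GroundedOpenAIChatAgent
from hud.tools.grounding import GrounderConfig

# Placeholder endpoint for a self-hosted grounding model such as Qwen2-VL.
cfg = GrounderConfig(api_base="http://localhost:8000/v1", model="qwen2-vl")

# The planning model only ever sees the synthetic "computer" tool;
# grounding of element descriptions happens inside call_tools().
agent = GroundedOpenAIChatAgent(grounder_config=cfg, model_name="gpt-4o-mini")

After `await agent.initialize(task)`, the GroundedComputerTool is created against the live mcp_client and `get_tool_schemas()` starts returning its schema.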
hud/agents/tests/test_client.py
CHANGED
@@ -200,7 +200,12 @@ class TestMCPClient:
         # Calling a non-existent tool should return an error result
         result = await client.call_tool(name="nonexistent", arguments={})
         assert result.isError is True
-
+        # Check that the error message is in the text content
+        text_content = ""
+        for content in result.content:
+            if isinstance(content, types.TextContent):
+                text_content += content.text
+        assert "Tool 'nonexistent' not found" in text_content
 
     @pytest.mark.asyncio
     async def test_get_telemetry_data(self, mock_telemetry, mock_mcp_use_client):
hud/agents/tests/test_grounded_openai_agent.py
ADDED
@@ -0,0 +1,155 @@
+from __future__ import annotations
+
+import json
+from typing import Any
+
+import mcp.types as types
+import pytest
+
+from hud.agents.grounded_openai import GroundedOpenAIChatAgent
+from hud.tools.grounding import GrounderConfig
+from hud.types import MCPToolCall, MCPToolResult
+
+
+class DummyOpenAI:
+    class chat:  # type: ignore[no-redef]
+        class completions:
+            @staticmethod
+            async def create(**kwargs: Any) -> Any:
+                # Return a minimal object mimicking OpenAI response
+                class Msg:
+                    def __init__(self) -> None:
+                        self.content = "Thinking..."
+                        self.tool_calls = [
+                            type(
+                                "ToolCall",
+                                (),
+                                {
+                                    "id": "call_1",
+                                    "function": type(
+                                        "Fn",
+                                        (),
+                                        {
+                                            "name": "computer",
+                                            "arguments": json.dumps(
+                                                {
+                                                    "action": "click",
+                                                    "element_description": "blue button",
+                                                }
+                                            ),
+                                        },
+                                    ),
+                                },
+                            )()
+                        ]
+
+                class Choice:
+                    def __init__(self) -> None:
+                        self.message = Msg()
+                        self.finish_reason = "tool_calls"
+
+                class Resp:
+                    def __init__(self) -> None:
+                        self.choices = [Choice()]
+
+                return Resp()
+
+
+class FakeMCPClient:
+    def __init__(self) -> None:
+        self.tools: list[types.Tool] = [
+            types.Tool(name="computer", description="", inputSchema={}),
+            types.Tool(name="setup", description="internal functions", inputSchema={}),
+        ]
+        self.called: list[MCPToolCall] = []
+
+    async def initialize(self, mcp_config: dict[str, dict[str, Any]] | None = None) -> None:
+        return None
+
+    async def list_tools(self) -> list[types.Tool]:
+        return self.tools
+
+    async def call_tool(self, tool_call: MCPToolCall) -> MCPToolResult:
+        self.called.append(tool_call)
+        return MCPToolResult(content=[types.TextContent(text="ok", type="text")], isError=False)
+
+    @property
+    def mcp_config(self) -> dict[str, dict[str, Any]]:
+        return {"local": {"command": "echo", "args": ["ok"]}}
+
+    async def shutdown(self) -> None:
+        return None
+
+    async def list_resources(self) -> list[types.Resource]:  # not used here
+        return []
+
+    async def read_resource(self, uri: str) -> types.ReadResourceResult | None:
+        return None
+
+
+class DummyGrounder:
+    async def predict_click(self, *, image_b64: str, instruction: str, max_retries: int = 3):
+        return (7, 9)
+
+
+class DummyGroundedTool:
+    def __init__(self) -> None:
+        self.last_args: dict[str, Any] | None = None
+
+    async def __call__(self, **kwargs: Any):
+        self.last_args = kwargs
+        return [types.TextContent(text="ok", type="text")]
+
+    def get_openai_tool_schema(self) -> dict:
+        return {
+            "type": "function",
+            "function": {"name": "computer", "parameters": {"type": "object"}},
+        }
+
+
+@pytest.mark.asyncio
+async def test_call_tools_injects_screenshot_and_delegates(monkeypatch: pytest.MonkeyPatch) -> None:
+    # Agent with fake OpenAI client and fake MCP client
+    grounder_cfg = GrounderConfig(api_base="http://example", model="qwen")
+    agent = GroundedOpenAIChatAgent(
+        grounder_config=grounder_cfg,
+        openai_client=DummyOpenAI(),
+        model_name="gpt-4o-mini",
+        mcp_client=FakeMCPClient(),
+        initial_screenshot=False,
+    )
+
+    # Inject a dummy grounded tool to observe args without full initialization
+    dummy_tool = DummyGroundedTool()
+    agent.grounded_tool = dummy_tool  # type: ignore
+
+    # Seed conversation history with a user image
+    png_b64 = (
+        "iVBORw0KGgoAAAANSUhEUgAAAAEAAAABCAQAAAC1HAwCAAAAC0lEQVR4nGMAAQAABQAB"
+        "J2n0mQAAAABJRU5ErkJggg=="
+    )
+    agent.conversation_history = [
+        {
+            "role": "user",
+            "content": [
+                {"type": "image_url", "image_url": {"url": f"data:image/png;base64,{png_b64}"}},
+            ],
+        }
+    ]
+
+    # Build a tool call as GroundedOpenAIChatAgent.get_response would produce
+    tool_call = MCPToolCall(
+        name="computer", arguments={"action": "click", "element_description": "blue button"}
+    )
+
+    results = await agent.call_tools(tool_call)
+
+    # One result returned
+    assert len(results) == 1 and not results[0].isError
+
+    # Grounded tool received screenshot_b64 injected
+    assert dummy_tool.last_args is not None
+    assert dummy_tool.last_args["action"] == "click"
+    assert dummy_tool.last_args["element_description"] == "blue button"
+    assert "screenshot_b64" in dummy_tool.last_args
+    assert isinstance(dummy_tool.last_args["screenshot_b64"], str)
hud/cli/eval.py
CHANGED
@@ -87,7 +87,7 @@ async def run_single_task(
     except ImportError as e:
         design.error(
             "Dataset dependencies are not installed. "
-            "Please install with: pip install 'hud-python\
+            "Please install with: pip install 'hud-python\u27e6agent\u27e7'"
         )
         raise typer.Exit(1) from e
 
@@ -111,7 +111,7 @@ async def run_single_task(
     except ImportError as e:
         design.error(
             "OpenAI agent dependencies are not installed. "
-            "Please install with: pip install 'hud-python\
+            "Please install with: pip install 'hud-python\u27e6agent\u27e7'"
        )
        raise typer.Exit(1) from e
 
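The replaced install hint appears to work around console markup: `design.error(...)` renders through Rich/Typer, where a literal `[agent]` is read as a markup tag rather than printed. A standalone sketch of the general problem using Rich's own `escape` helper (these names are Rich's API, not this package's):

from rich.console import Console
from rich.markup import escape

console = Console()
# With markup enabled, "[agent]" would be treated as a style tag, not literal text.
console.print(escape("pip install 'hud-python[agent]'"))  # prints the brackets as-is

The release instead substitutes the look-alike brackets U+27E6/U+27E7, which Rich does not parse as markup.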
hud/cli/utils/interactive.py
CHANGED
hud/settings.py
CHANGED
@@ -44,6 +44,12 @@ class Settings(BaseSettings):
         validation_alias="OPENAI_API_KEY",
     )
 
+    openrouter_api_key: str | None = Field(
+        default=None,
+        description="API key for OpenRouter models",
+        validation_alias="OPENROUTER_API_KEY",
+    )
+
     wandb_api_key: str | None = Field(
         default=None,
         description="API key for Weights & Biases",
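The new field follows the surrounding pydantic-settings pattern, so the value is picked up from the OPENROUTER_API_KEY environment variable when the settings object is constructed. A minimal check, assuming `Settings()` can be instantiated directly and no other required fields intervene:

import os

os.environ["OPENROUTER_API_KEY"] = "sk-or-placeholder"  # hypothetical value

from hud.settings import Settings

# validation_alias="OPENROUTER_API_KEY" maps the env var onto the field.
settings = Settings()
assert settings.openrouter_api_key == "sk-or-placeholder"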
hud/tools/executors/tests/test_base_executor.py
CHANGED
@@ -361,5 +361,5 @@ class TestLazyImports:
         """Test lazy import with invalid attribute name."""
         import hud.tools.executors as executors_module
 
-        with pytest.raises(AttributeError, match="module '.*' has no attribute 'InvalidExecutor'"):
+        with pytest.raises(AttributeError, match=r"module '.*' has no attribute 'InvalidExecutor'"):
             _ = executors_module.InvalidExecutor
hud/tools/executors/xdo.py
CHANGED
@@ -175,7 +175,7 @@ class XDOExecutor(BaseExecutor):
 
         screenshot_cmd = f"{self._display_prefix}scrot -p {screenshot_path}"
 
-        returncode, _,
+        returncode, _, _stderr = await run(screenshot_cmd)
 
         if returncode == 0 and screenshot_path.exists():
             try:
hud/tools/grounding/__init__.py
ADDED
@@ -0,0 +1,13 @@
+"""Grounding module for visual element detection and coordinate resolution."""
+
+from __future__ import annotations
+
+from .config import GrounderConfig
+from .grounded_tool import GroundedComputerTool
+from .grounder import Grounder
+
+__all__ = [
+    "GroundedComputerTool",
+    "Grounder",
+    "GrounderConfig",
+]
hud/tools/grounding/config.py
ADDED
@@ -0,0 +1,54 @@
+"""Configuration for grounding models."""
+
+from __future__ import annotations
+
+from dataclasses import dataclass, field
+from typing import Any
+
+SYSTEM_PROMPT = (
+    "You are a visual grounding model. Given an image and a description, "
+    "return ONLY the center pixel coordinates of the described element as a "
+    "single point in parentheses format: (x, y). Do not return bounding boxes "
+    "or multiple coordinates."
+)
+
+
+@dataclass
+class GrounderConfig:
+    """Configuration for grounding model clients.
+
+    Attributes:
+        api_base: Base URL for the grounding model API endpoint
+        model: Model identifier to use for grounding
+        api_key: API key for authentication (default: "EMPTY" for local models)
+        system_prompt: System prompt to guide the grounding model
+        output_format: Format for coordinate output ("pixels", "norm_0_1", "norm_0_999")
+        parser_regex: Regular expression to parse coordinates from model output
+        resize: Image resizing configuration dictionary
+    """
+
+    api_base: str
+    model: str
+    api_key: str = "EMPTY"
+    system_prompt: str = SYSTEM_PROMPT
+    output_format: str = "pixels"  # "pixels" | "norm_0_1" | "norm_0_999"
+    parser_regex: str = r"\((\d+),\s*(\d+)\)"
+    resize: dict[str, Any] = field(
+        default_factory=lambda: {
+            "enabled": True,
+            "min_pixels": 3136,
+            "max_pixels": 4096 * 2160,
+            "factor": 28,
+        }
+    )
+
+    def __post_init__(self) -> None:
+        """Validate configuration after initialization."""
+        if self.output_format not in ("pixels", "norm_0_1", "norm_0_999"):
+            raise ValueError(f"Invalid output_format: {self.output_format}")
+
+        if not self.api_base:
+            raise ValueError("api_base is required")
+
+        if not self.model:
+            raise ValueError("model is required")