PyPI - hud-python - Versions diffs - 0.4.21__py3-none-any.whl → 0.4.23__py3-none-any.whl - Mend

hud-python 0.4.21__py3-none-any.whl → 0.4.23__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release.

This version of hud-python might be problematic. Click here for more details.

Files changed (63)

hud/agents/base.py +37 -37
hud/agents/claude.py +11 -6
hud/agents/grounded_openai.py +282 -0
hud/agents/misc/response_agent.py +3 -2
hud/agents/openai.py +2 -2
hud/agents/openai_chat_generic.py +3 -1
hud/agents/tests/test_client.py +6 -1
hud/agents/tests/test_grounded_openai_agent.py +155 -0
hud/cli/__init__.py +34 -24
hud/cli/analyze.py +27 -26
hud/cli/build.py +50 -46
hud/cli/debug.py +7 -7
hud/cli/dev.py +107 -99
hud/cli/eval.py +33 -31
hud/cli/hf.py +53 -53
hud/cli/init.py +28 -28
hud/cli/list_func.py +22 -22
hud/cli/pull.py +36 -36
hud/cli/push.py +76 -74
hud/cli/remove.py +42 -40
hud/cli/rl/__init__.py +2 -2
hud/cli/rl/init.py +41 -41
hud/cli/rl/pod.py +97 -91
hud/cli/rl/ssh.py +42 -40
hud/cli/rl/train.py +75 -73
hud/cli/rl/utils.py +10 -10
hud/cli/tests/test_analyze.py +1 -1
hud/cli/tests/test_analyze_metadata.py +2 -2
hud/cli/tests/test_pull.py +45 -45
hud/cli/tests/test_push.py +31 -29
hud/cli/tests/test_registry.py +15 -15
hud/cli/utils/environment.py +11 -11
hud/cli/utils/interactive.py +18 -18
hud/cli/utils/logging.py +12 -12
hud/cli/utils/metadata.py +12 -12
hud/cli/utils/registry.py +5 -5
hud/cli/utils/runner.py +23 -23
hud/cli/utils/server.py +16 -16
hud/settings.py +6 -0
hud/shared/hints.py +7 -7
hud/tools/executors/tests/test_base_executor.py +1 -1
hud/tools/executors/xdo.py +1 -1
hud/tools/grounding/__init__.py +13 -0
hud/tools/grounding/config.py +54 -0
hud/tools/grounding/grounded_tool.py +314 -0
hud/tools/grounding/grounder.py +302 -0
hud/tools/grounding/tests/__init__.py +1 -0
hud/tools/grounding/tests/test_grounded_tool.py +196 -0
hud/tools/tests/test_playwright_tool.py +1 -1
hud/tools/tests/test_tools_init.py +1 -1
hud/tools/tests/test_utils.py +2 -2
hud/types.py +4 -4
hud/utils/__init__.py +3 -3
hud/utils/agent_factories.py +86 -0
hud/utils/{design.py → hud_console.py} +39 -33
hud/utils/pretty_errors.py +6 -6
hud/utils/tests/test_version.py +1 -1
hud/version.py +1 -1
{hud_python-0.4.21.dist-info → hud_python-0.4.23.dist-info}/METADATA +3 -1
{hud_python-0.4.21.dist-info → hud_python-0.4.23.dist-info}/RECORD +63 -54
{hud_python-0.4.21.dist-info → hud_python-0.4.23.dist-info}/WHEEL +0 -0
{hud_python-0.4.21.dist-info → hud_python-0.4.23.dist-info}/entry_points.txt +0 -0
{hud_python-0.4.21.dist-info → hud_python-0.4.23.dist-info}/licenses/LICENSE +0 -0

hud/cli/utils/server.py CHANGED Viewed

@@ -7,7 +7,7 @@ from typing import Any
 from fastmcp import FastMCP
-from hud.utils.design import HUDDesign
+from hud.utils.hud_console import HUDConsole
 from .docker import generate_container_name, remove_container
@@ -24,7 +24,7 @@ class MCPServerManager:
         """
         self.image = image
         self.docker_args = docker_args or []
-        self.design = HUDDesign()
+        self.console = HUDConsole()
         self.container_name = self._generate_container_name()
     def _generate_container_name(self) -> str:
@@ -155,7 +155,7 @@ class MCPServerManager:
             pass  # Normal cancellation
         except Exception as e:
             if verbose:
-                self.design.error(f"Server error: {e}")
+                self.console.error(f"Server error: {e}")
             raise
@@ -174,16 +174,16 @@ async def run_server_with_interactive(
     from .interactive import run_interactive_mode
     from .logging import find_free_port
-    design = HUDDesign()
+    hud_console = HUDConsole()
     # Find available port
     actual_port = find_free_port(port)
     if actual_port is None:
-        design.error(f"No available ports found starting from {port}")
+        hud_console.error(f"No available ports found starting from {port}")
         return
     if actual_port != port:
-        design.warning(f"Port {port} in use, using port {actual_port} instead")
+        hud_console.warning(f"Port {port} in use, using port {actual_port} instead")
     # Clean up any existing container
     server_manager.cleanup_container()
@@ -198,16 +198,16 @@ async def run_server_with_interactive(
     proxy = server_manager.create_proxy(config, f"HUD Interactive - {server_manager.image}")
     # Show header
-    design.info("")  # Empty line
-    design.header("HUD MCP Server - Interactive Mode", icon="🎮")
+    hud_console.info("")  # Empty line
+    hud_console.header("HUD MCP Server - Interactive Mode", icon="🎮")
     # Show configuration
-    design.section_title("Server Information")
-    design.info(f"Image: {server_manager.image}")
-    design.info(f"Port: {actual_port}")
-    design.info(f"URL: http://localhost:{actual_port}/mcp")
-    design.info(f"Container: {server_manager.container_name}")
-    design.info("")
+    hud_console.section_title("Server Information")
+    hud_console.info(f"Image: {server_manager.image}")
+    hud_console.info(f"Port: {actual_port}")
+    hud_console.info(f"URL: http://localhost:{actual_port}/mcp")
+    hud_console.info(f"Container: {server_manager.container_name}")
+    hud_console.info("")
     # Create event to signal server is ready
     server_ready = asyncio.Event()
@@ -236,7 +236,7 @@ async def run_server_with_interactive(
         await run_interactive_mode(server_url, verbose=verbose)
     except KeyboardInterrupt:
-        design.info("\n👋 Shutting down...")
+        hud_console.info("\n👋 Shutting down...")
     finally:
         # Cancel server task
         if server_task and not server_task.done():
@@ -244,7 +244,7 @@ async def run_server_with_interactive(
             try:
                 await server_task
             except asyncio.CancelledError:
-                design.error("Server task cancelled")
+                hud_console.error("Server task cancelled")
         # Clean up container
         server_manager.cleanup_container()

hud/settings.py CHANGED Viewed

@@ -44,6 +44,12 @@ class Settings(BaseSettings):
         validation_alias="OPENAI_API_KEY",
     )
+    openrouter_api_key: str | None = Field(
+        default=None,
+        description="API key for OpenRouter models",
+        validation_alias="OPENROUTER_API_KEY",
+    )
     wandb_api_key: str | None = Field(
         default=None,
         description="API key for Weights & Biases",

hud/shared/hints.py CHANGED Viewed

@@ -144,9 +144,9 @@ def render_hints(hints: Iterable[Hint] | None, *, design: Any | None = None) ->
     try:
         if design is None:
-            from hud.utils.design import design as default_design  # lazy import
+            from hud.utils.hud_console import hud_console as default_design  # lazy import
-            design = default_design
+            hud_console = default_design
     except Exception:
         # If design is unavailable (non-CLI contexts), silently skip rendering
         return
@@ -155,23 +155,23 @@ def render_hints(hints: Iterable[Hint] | None, *, design: Any | None = None) ->
         try:
             # Compact rendering - skip title if same as message
             if hint.title and hint.title != hint.message:
-                design.warning(f"{hint.title}: {hint.message}")
+                hud_console.warning(f"{hint.title}: {hint.message}")
             else:
-                design.warning(hint.message)
+                hud_console.warning(hint.message)
             # Tips as bullet points
             if hint.tips:
                 for tip in hint.tips:
-                    design.info(f"  • {tip}")
+                    hud_console.info(f"  • {tip}")
             # Only show command examples if provided
             if hint.command_examples:
                 for cmd in hint.command_examples:
-                    design.command_example(cmd)
+                    hud_console.command_example(cmd)
             # Only show docs URL if provided
             if hint.docs_url:
-                design.link(hint.docs_url)
+                hud_console.link(hint.docs_url)
         except Exception:
             logger.warning("Failed to render hint: %s", hint)
             continue

hud/tools/executors/tests/test_base_executor.py CHANGED Viewed

@@ -361,5 +361,5 @@ class TestLazyImports:
         """Test lazy import with invalid attribute name."""
         import hud.tools.executors as executors_module
-        with pytest.raises(AttributeError, match="module '.*' has no attribute 'InvalidExecutor'"):
+        with pytest.raises(AttributeError, match=r"module '.*' has no attribute 'InvalidExecutor'"):
             _ = executors_module.InvalidExecutor

hud/tools/executors/xdo.py CHANGED Viewed

@@ -175,7 +175,7 @@ class XDOExecutor(BaseExecutor):
         screenshot_cmd = f"{self._display_prefix}scrot -p {screenshot_path}"
-        returncode, _, stderr = await run(screenshot_cmd)
+        returncode, _, _stderr = await run(screenshot_cmd)
         if returncode == 0 and screenshot_path.exists():
             try:

hud/tools/grounding/__init__.py ADDED Viewed

@@ -0,0 +1,13 @@
+"""Grounding module for visual element detection and coordinate resolution."""
+from __future__ import annotations
+from .config import GrounderConfig
+from .grounded_tool import GroundedComputerTool
+from .grounder import Grounder
+__all__ = [
+    "GroundedComputerTool",
+    "Grounder",
+    "GrounderConfig",
+]

hud/tools/grounding/config.py ADDED Viewed

@@ -0,0 +1,54 @@
+"""Configuration for grounding models."""
+from __future__ import annotations
+from dataclasses import dataclass, field
+from typing import Any
+SYSTEM_PROMPT = (
+    "You are a visual grounding model. Given an image and a description, "
+    "return ONLY the center pixel coordinates of the described element as a "
+    "single point in parentheses format: (x, y). Do not return bounding boxes "
+    "or multiple coordinates."
+)
+@dataclass
+class GrounderConfig:
+    """Configuration for grounding model clients.
+    Attributes:
+        api_base: Base URL for the grounding model API endpoint
+        model: Model identifier to use for grounding
+        api_key: API key for authentication (default: "EMPTY" for local models)
+        system_prompt: System prompt to guide the grounding model
+        output_format: Format for coordinate output ("pixels", "norm_0_1", "norm_0_999")
+        parser_regex: Regular expression to parse coordinates from model output
+        resize: Image resizing configuration dictionary
+    """
+    api_base: str
+    model: str
+    api_key: str = "EMPTY"
+    system_prompt: str = SYSTEM_PROMPT
+    output_format: str = "pixels"  # "pixels" | "norm_0_1" | "norm_0_999"
+    parser_regex: str = r"\((\d+),\s*(\d+)\)"
+    resize: dict[str, Any] = field(
+        default_factory=lambda: {
+            "enabled": True,
+            "min_pixels": 3136,
+            "max_pixels": 4096 * 2160,
+            "factor": 28,
+        }
+    )
+    def __post_init__(self) -> None:
+        """Validate configuration after initialization."""
+        if self.output_format not in ("pixels", "norm_0_1", "norm_0_999"):
+            raise ValueError(f"Invalid output_format: {self.output_format}")
+        if not self.api_base:
+            raise ValueError("api_base is required")
+        if not self.model:
+            raise ValueError("model is required")

hud/tools/grounding/grounded_tool.py ADDED Viewed

@@ -0,0 +1,314 @@
+"""Grounded computer tool that resolves element descriptions to coordinates."""
+from __future__ import annotations
+import logging
+from typing import Any
+from mcp import ErrorData, McpError
+from mcp.types import INVALID_PARAMS, ContentBlock
+from hud.clients.base import AgentMCPClient  # noqa: TC001
+from hud.tools.grounding.grounder import Grounder  # noqa: TC001
+from hud.types import MCPToolCall
+logger = logging.getLogger(__name__)
+class GroundedComputerTool:
+    """Computer tool wrapper that grounds element descriptions to coordinates.
+    This tool acts as a local wrapper that:
+    1. Accepts natural language element descriptions from the agent
+    2. Calls the environment's computer tool via MCP to take screenshots
+    3. Uses a grounding model to resolve descriptions to coordinates
+    4. Calls the environment's computer tool via MCP with resolved coordinates
+    5. Returns the result to the agent
+    This allows the agent to use element descriptions while ensuring all
+    computer actions happen in the correct environment.
+    """
+    def __init__(
+        self,
+        *,
+        grounder: Grounder,
+        mcp_client: AgentMCPClient,
+        computer_tool_name: str = "computer",
+    ) -> None:
+        """Initialize the grounded computer tool.
+        Args:
+            grounder: Grounder instance for visual grounding
+            mcp_client: MCP client to call the environment's computer tool
+            computer_tool_name: Name of the computer tool in the environment
+        """
+        self._grounder = grounder
+        self._mcp_client = mcp_client
+        self._computer_tool_name = computer_tool_name
+    def get_openai_tool_schema(self) -> dict:
+        """Get the OpenAI tool schema for the grounded computer tool.
+        Returns:
+            Dictionary containing the tool schema in OpenAI format
+        """
+        return {
+            "type": "function",
+            "function": {
+                "name": "computer",
+                "description": (
+                    "Control a computer by interacting with UI elements. This tool uses "
+                    "element descriptions to locate and interact with UI elements on the "
+                    "screen (e.g., 'red submit button', 'search text field', 'hamburger menu "
+                    "icon', 'close button in top right corner')."
+                ),
+                "parameters": {
+                    "type": "object",
+                    "properties": {
+                        "action": {
+                            "type": "string",
+                            "enum": [
+                                "click",
+                                "double_click",
+                                "move",
+                                "scroll",
+                                "drag",
+                                "type",
+                                "keypress",
+                                "wait",
+                                "screenshot",
+                                "get_current_url",
+                                "get_dimensions",
+                                "get_environment",
+                            ],
+                            "description": "The action to perform",
+                        },
+                        "element_description": {
+                            "type": "string",
+                            "description": (
+                                "Natural language description of the element for "
+                                "click/move/scroll actions"
+                            ),
+                        },
+                        "start_element_description": {
+                            "type": "string",
+                            "description": "Description of the start element for drag actions",
+                        },
+                        "end_element_description": {
+                            "type": "string",
+                            "description": "Description of the end element for drag actions",
+                        },
+                        "text": {"type": "string", "description": "Text to type"},
+                        "keys": {
+                            "type": "array",
+                            "items": {"type": "string"},
+                            "description": "Keys to press (e.g., ['ctrl', 'a'] for Ctrl+A)",
+                        },
+                        "button": {
+                            "type": "string",
+                            "enum": ["left", "right", "middle"],
+                            "description": "Mouse button to use",
+                        },
+                        "scroll_x": {"type": "integer", "description": "Horizontal scroll amount"},
+                        "scroll_y": {"type": "integer", "description": "Vertical scroll amount"},
+                    },
+                    "required": ["action"],
+                },
+            },
+        }
+    async def __call__(
+        self,
+        action: str,
+        # Screenshot from conversation
+        screenshot_b64: str | None = None,
+        # Grounding-specific parameters
+        element_description: str | None = None,
+        start_element_description: str | None = None,
+        end_element_description: str | None = None,
+        # Pass-through parameters
+        text: str | None = None,
+        keys: list[str] | None = None,
+        button: str | None = None,
+        scroll_x: int | None = None,
+        scroll_y: int | None = None,
+        **kwargs: Any,
+    ) -> list[ContentBlock]:
+        """Execute a computer action, grounding element descriptions to coordinates first.
+        This method calls the environment's computer tool through MCP to ensure
+        actions happen in the correct environment.
+        Args:
+            action: The action to perform
+            element_description: Description of element for click/move/scroll actions
+            start_element_description: Start element for drag actions
+            end_element_description: End element for drag actions
+            text: Text to type for type actions
+            keys: Keys to press for keypress actions
+            button: Mouse button (left, right, middle)
+            scroll_x: Horizontal scroll amount
+            scroll_y: Vertical scroll amount
+            **kwargs: Additional arguments
+        Returns:
+            List of ContentBlocks with action results from the environment
+        """
+        try:
+            # For actions that don't need grounding, call environment tool directly
+            if action in (
+                "screenshot",
+                "type",
+                "keypress",
+                "wait",
+                "get_current_url",
+                "get_dimensions",
+                "get_environment",
+            ):
+                computer_args: dict[str, Any] = {"action": action}
+                if text is not None:
+                    computer_args["text"] = text
+                if keys is not None:
+                    computer_args["keys"] = keys
+                result = await self._mcp_client.call_tool(
+                    MCPToolCall(
+                        name=self._computer_tool_name, arguments={**computer_args, **kwargs}
+                    )
+                )
+                return result.content
+            # For actions that need coordinates, we need to ground element descriptions
+            if action in ("click", "double_click", "move", "scroll"):
+                if not element_description:
+                    raise McpError(
+                        ErrorData(
+                            code=INVALID_PARAMS,
+                            message=f"element_description is required for {action} action",
+                        )
+                    )
+                if not screenshot_b64:
+                    raise McpError(
+                        ErrorData(
+                            code=INVALID_PARAMS, message="No screenshot available for grounding"
+                        )
+                    )
+                # Ground the element description to coordinates
+                coords = await self._grounder.predict_click(
+                    image_b64=screenshot_b64, instruction=element_description
+                )
+                if not coords:
+                    raise McpError(
+                        ErrorData(
+                            code=INVALID_PARAMS,
+                            message=(
+                                f"Could not locate element: '{element_description}'. "
+                                "Try a more specific description or different identifying "
+                                "features (color, position, text, etc.)"
+                            ),
+                        )
+                    )
+                x, y = coords
+                # Execute action with resolved coordinates
+                computer_args: dict[str, Any] = {"action": action, "x": x, "y": y}
+                if button:
+                    computer_args["button"] = button
+                if scroll_x is not None:
+                    computer_args["scroll_x"] = scroll_x
+                if scroll_y is not None:
+                    computer_args["scroll_y"] = scroll_y
+                result = await self._mcp_client.call_tool(
+                    MCPToolCall(
+                        name=self._computer_tool_name, arguments={**computer_args, **kwargs}
+                    )
+                )
+                return result.content
+            elif action == "drag":
+                if not start_element_description or not end_element_description:
+                    raise McpError(
+                        ErrorData(
+                            code=INVALID_PARAMS,
+                            message=(
+                                "start_element_description and end_element_description "
+                                "are required for drag action"
+                            ),
+                        )
+                    )
+                if not screenshot_b64:
+                    raise McpError(
+                        ErrorData(
+                            code=INVALID_PARAMS, message="No screenshot available for grounding"
+                        )
+                    )
+                # Ground both start and end points
+                start_coords = await self._grounder.predict_click(
+                    image_b64=screenshot_b64, instruction=start_element_description
+                )
+                if not start_coords:
+                    raise McpError(
+                        ErrorData(
+                            code=INVALID_PARAMS,
+                            message=(
+                                f"Could not locate start element: '{start_element_description}'. "
+                                "Try a more specific description or different identifying features."
+                            ),
+                        )
+                    )
+                end_coords = await self._grounder.predict_click(
+                    image_b64=screenshot_b64, instruction=end_element_description
+                )
+                if not end_coords:
+                    raise McpError(
+                        ErrorData(
+                            code=INVALID_PARAMS,
+                            message=(
+                                f"Could not locate end element: '{end_element_description}'. "
+                                "Try a more specific description or different identifying features."
+                            ),
+                        )
+                    )
+                # Execute drag with resolved coordinates
+                computer_args: dict[str, Any] = {
+                    "action": "drag",
+                    "path": [
+                        (start_coords[0], start_coords[1]),
+                        (end_coords[0], end_coords[1]),
+                    ],
+                }
+                if button:
+                    computer_args["button"] = button
+                result = await self._mcp_client.call_tool(
+                    MCPToolCall(
+                        name=self._computer_tool_name, arguments={**computer_args, **kwargs}
+                    )
+                )
+                return result.content
+            else:
+                raise McpError(
+                    ErrorData(code=INVALID_PARAMS, message=f"Unsupported action: {action}")
+                )
+        except McpError:
+            # Re-raise MCP errors
+            raise
+        except Exception as e:
+            logger.error("Grounded tool failed: %s", e)
+            raise McpError(
+                ErrorData(code=INVALID_PARAMS, message=f"Grounding failed: {e!s}")
+            ) from e

hud-python 0.4.21__py3-none-any.whl → 0.4.23__py3-none-any.whl

Potentially problematic release.

hud-python 0.4.21__py3-none-any.whl → 0.4.23__py3-none-any.whl