PyPI - hud-python - Versions diffs - 0.4.57__py3-none-any.whl → 0.4.59__py3-none-any.whl - Mend

hud-python 0.4.57__py3-none-any.whl → 0.4.59__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release.

This version of hud-python might be problematic. Click here for more details.

Files changed (35) hide show

hud/agents/__init__.py +2 -0
hud/agents/gemini.py +492 -0
hud/agents/tests/test_gemini.py +372 -0
hud/cli/__init__.py +46 -31
hud/cli/dev.py +111 -1
hud/cli/eval.py +59 -3
hud/cli/flows/dev.py +5 -3
hud/cli/init.py +14 -18
hud/cli/push.py +2 -2
hud/cli/rl/__init__.py +1 -1
hud/cli/rl/celebrate.py +1 -1
hud/cli/rl/remote_runner.py +3 -3
hud/cli/tests/test_eval.py +20 -0
hud/clients/base.py +1 -1
hud/clients/fastmcp.py +1 -1
hud/otel/config.py +1 -1
hud/otel/context.py +2 -2
hud/server/server.py +283 -36
hud/settings.py +6 -0
hud/shared/hints.py +3 -3
hud/telemetry/job.py +2 -2
hud/tools/__init__.py +13 -2
hud/tools/computer/__init__.py +2 -0
hud/tools/computer/gemini.py +385 -0
hud/tools/computer/settings.py +21 -0
hud/tools/playwright.py +17 -2
hud/tools/types.py +9 -1
hud/types.py +2 -1
hud/utils/tests/test_version.py +1 -1
hud/version.py +1 -1
{hud_python-0.4.57.dist-info → hud_python-0.4.59.dist-info}/METADATA +2 -1
{hud_python-0.4.57.dist-info → hud_python-0.4.59.dist-info}/RECORD +35 -32
{hud_python-0.4.57.dist-info → hud_python-0.4.59.dist-info}/WHEEL +0 -0
{hud_python-0.4.57.dist-info → hud_python-0.4.59.dist-info}/entry_points.txt +0 -0
{hud_python-0.4.57.dist-info → hud_python-0.4.59.dist-info}/licenses/LICENSE +0 -0

hud/tools/computer/gemini.py ADDED Viewed

@@ -0,0 +1,385 @@
+from __future__ import annotations
+import logging
+import platform
+from typing import TYPE_CHECKING, Any, Literal
+from mcp import ErrorData, McpError
+from mcp.types import INVALID_PARAMS, ContentBlock
+from pydantic import Field
+from hud.tools.types import ContentResult
+from .hud import HudComputerTool
+from .settings import computer_settings
+if TYPE_CHECKING:
+    from hud.tools.executors.base import BaseExecutor
+logger = logging.getLogger(__name__)  # module-level logger, named after this module
+ACTION_FIELD = Field(..., description="Gemini Computer Use action to perform")  # `...` default: required
+X_FIELD = Field(None, description="X coordinate (pixels in agent space)")  # NOTE(review): __call__ maps 0-1000 as normalized, not pixels - confirm wording
+Y_FIELD = Field(None, description="Y coordinate (pixels in agent space)")  # NOTE(review): same 0-1000 handling as X_FIELD
+TEXT_FIELD = Field(None, description="Text to type")
+PRESS_ENTER_FIELD = Field(None, description="Whether to press Enter after typing (type_text_at)")
+CLEAR_BEFORE_TYPING_FIELD = Field(  # None is treated like True by type_text_at
+    None, description="Whether to select-all before typing (type_text_at)"
+)
+DIRECTION_FIELD = Field(None, description="Scroll direction for scroll_document/scroll_at")
+MAGNITUDE_FIELD = Field(None, description="Scroll magnitude (pixels in agent space)")  # the scroll actions fall back to 800 when None
+URL_FIELD = Field(None, description="Target URL for navigate")  # also read by the search action
+KEYS_FIELD = Field(None, description="Keys for key_combination")  # a list of keys or a "+"-joined string
+DESTINATION_X_FIELD = Field(None, description="Destination X for drag_and_drop (agent space)")
+DESTINATION_Y_FIELD = Field(None, description="Destination Y for drag_and_drop (agent space)")
+TAKE_SCREENSHOT_ON_CLICK_FIELD = Field(  # defaults to True; __call__ accepts the value but never reads it
+    True, description="Whether to include a screenshot for interactive actions"
+)
+class GeminiComputerTool(HudComputerTool):
+    """
+    Gemini Computer Use tool for interacting with a computer via MCP.
+    Maps Gemini's predefined function names (open_web_browser, click_at, hover_at,
+    type_text_at, scroll_document, scroll_at, wait_5_seconds, go_back, go_forward,
+    search, navigate, key_combination, drag_and_drop) to executor actions.
+    """
+    def __init__(
+        self,
+        # Define within environment based on platform
+        executor: BaseExecutor | None = None,
+        platform_type: Literal["auto", "xdo", "pyautogui"] = "auto",
+        display_num: int | None = None,
+        # Overrides for what dimensions the agent thinks it operates in
+        width: int = computer_settings.GEMINI_COMPUTER_WIDTH,
+        height: int = computer_settings.GEMINI_COMPUTER_HEIGHT,
+        rescale_images: bool = computer_settings.GEMINI_RESCALE_IMAGES,
+        # What the agent sees as the tool's name, title, and description
+        name: str | None = None,
+        title: str | None = None,
+        description: str | None = None,
+        **kwargs: Any,
+    ) -> None:
+        """
+        Initialize with Gemini's default dimensions.
+        """
+        super().__init__(
+            executor=executor,
+            platform_type=platform_type,
+            display_num=display_num,
+            width=width,
+            height=height,
+            rescale_images=rescale_images,
+            name=name or "gemini_computer",
+            title=title or "Gemini Computer Tool",
+            description=description or "Control computer with mouse, keyboard, and screenshots",
+            **kwargs,
+        )
+    async def __call__(
+        self,
+        action: str = ACTION_FIELD,
+        # Common coordinates
+        x: int | None = X_FIELD,
+        y: int | None = Y_FIELD,
+        # Text input
+        text: str | None = TEXT_FIELD,
+        press_enter: bool | None = PRESS_ENTER_FIELD,
+        clear_before_typing: bool | None = CLEAR_BEFORE_TYPING_FIELD,
+        # Scroll parameters
+        direction: Literal["up", "down", "left", "right"] | None = DIRECTION_FIELD,
+        magnitude: int | None = MAGNITUDE_FIELD,
+        # Navigation
+        url: str | None = URL_FIELD,
+        # Key combos
+        keys: list[str] | str | None = KEYS_FIELD,
+        # Drag parameters
+        destination_x: int | None = DESTINATION_X_FIELD,
+        destination_y: int | None = DESTINATION_Y_FIELD,
+        # Behavior
+        take_screenshot_on_click: bool = TAKE_SCREENSHOT_ON_CLICK_FIELD,
+    ) -> list[ContentBlock]:
+        """
+        Handle Gemini Computer Use API calls by mapping to executor actions.
+        Returns:
+            List of MCP content blocks
+        """
+        logger.info("GeminiComputerTool received action: %s", action)
+        # Helper to finalize ContentResult: rescale if requested and ensure URL metadata
+        async def _finalize(
+            result: ContentResult, requested_url: str | None = None
+        ) -> list[ContentBlock]:
+            if result.base64_image and self.rescale_images:
+                try:
+                    result.base64_image = await self._rescale_screenshot(result.base64_image)
+                except Exception as e:
+                    logger.warning("Failed to rescale screenshot: %s", e)
+            # Always include URL metadata if provided; otherwise default to about:blank
+            result.url = requested_url or result.url or "about:blank"
+            return result.to_content_blocks()
+        # Scale coordinates helper
+        def _scale(xv: int | None, yv: int | None) -> tuple[int | None, int | None]:
+            return self._scale_coordinates(xv, yv)
+        # Gemini emits coordinates/magnitudes in a 0-1000 normalized space.
+        def _denormalize(value: float | None, axis: Literal["x", "y"]) -> int | None:
+            if value is None:
+                return None
+            try:
+                numeric = float(value)
+            except (TypeError, ValueError):
+                try:
+                    return int(value)  # type: ignore[arg-type]
+                except (TypeError, ValueError):
+                    return None
+            # Treat values within the normalized range (including defaults like 800).
+            if 0 <= numeric <= 1000:
+                target = self.width if axis == "x" else self.height
+                numeric = numeric / 1000 * target
+            return round(numeric)
+        def _scale_distance(value: int | None, axis: Literal["x", "y"]) -> int | None:
+            if value is None:
+                return None
+            scale = self.scale_x if axis == "x" else self.scale_y
+            if scale != 1.0:
+                return round(value / scale)
+            return value
+        # Map actions
+        if action == "open_web_browser":
+            screenshot = await self.executor.screenshot()
+            if screenshot:
+                result = ContentResult(base64_image=screenshot, url="about:blank")
+            else:
+                result = ContentResult(error="Failed to take screenshot", url="about:blank")
+            return await _finalize(result)
+        elif action == "click_at":
+            if x is None or y is None:
+                raise McpError(ErrorData(code=INVALID_PARAMS, message="x and y are required"))
+            dx = _denormalize(x, "x")
+            dy = _denormalize(y, "y")
+            sx, sy = _scale(dx, dy)
+            result = await self.executor.click(x=sx, y=sy)
+            return await _finalize(result)
+        elif action == "hover_at":
+            if x is None or y is None:
+                raise McpError(ErrorData(code=INVALID_PARAMS, message="x and y are required"))
+            dx = _denormalize(x, "x")
+            dy = _denormalize(y, "y")
+            sx, sy = _scale(dx, dy)
+            result = await self.executor.move(x=sx, y=sy)
+            return await _finalize(result)
+        elif action == "type_text_at":
+            if x is None or y is None:
+                raise McpError(ErrorData(code=INVALID_PARAMS, message="x and y are required"))
+            if text is None:
+                raise McpError(ErrorData(code=INVALID_PARAMS, message="text is required"))
+            dx = _denormalize(x, "x")
+            dy = _denormalize(y, "y")
+            sx, sy = _scale(dx, dy)
+            # Focus the field
+            await self.executor.move(x=sx, y=sy, take_screenshot=False)
+            await self.executor.click(x=sx, y=sy, take_screenshot=False)
+            # Clear existing text if requested
+            if clear_before_typing is None or clear_before_typing:
+                is_mac = platform.system().lower() == "darwin"
+                combo = ["cmd", "a"] if is_mac else ["ctrl", "a"]
+                await self.executor.press(keys=combo, take_screenshot=False)
+                delete_key = "backspace" if is_mac else "delete"
+                await self.executor.press(keys=[delete_key], take_screenshot=False)
+            # Type (optionally press enter after)
+            result = await self.executor.write(text=text, enter_after=bool(press_enter))
+            return await _finalize(result)
+        elif action == "scroll_document":
+            if direction is None:
+                raise McpError(ErrorData(code=INVALID_PARAMS, message="direction is required"))
+            # Default magnitude similar to reference implementation
+            mag = magnitude if magnitude is not None else 800
+            # Convert to environment units while preserving sign
+            if direction in ("down", "up"):
+                distance = _denormalize(mag, "y")
+                if distance is None:
+                    raise McpError(
+                        ErrorData(
+                            code=INVALID_PARAMS, message="Unable to determine scroll magnitude"
+                        )
+                    )
+                distance = _scale_distance(distance, "y")
+                if distance is None:
+                    raise McpError(
+                        ErrorData(
+                            code=INVALID_PARAMS,
+                            message="Unable to determine scroll magnitude",
+                        )
+                    )
+                scroll_y = distance if direction == "down" else -distance
+                scroll_x = None
+            elif direction in ("right", "left"):
+                distance = _denormalize(mag, "x")
+                if distance is None:
+                    raise McpError(
+                        ErrorData(
+                            code=INVALID_PARAMS, message="Unable to determine scroll magnitude"
+                        )
+                    )
+                distance = _scale_distance(distance, "x")
+                if distance is None:
+                    raise McpError(
+                        ErrorData(
+                            code=INVALID_PARAMS,
+                            message="Unable to determine scroll magnitude",
+                        )
+                    )
+                scroll_x = distance if direction == "right" else -distance
+                scroll_y = None
+            else:
+                raise McpError(
+                    ErrorData(code=INVALID_PARAMS, message=f"Invalid direction: {direction}")
+                )
+            result = await self.executor.scroll(scroll_x=scroll_x, scroll_y=scroll_y)
+            return await _finalize(result)
+        elif action == "scroll_at":
+            if direction is None:
+                raise McpError(ErrorData(code=INVALID_PARAMS, message="direction is required"))
+            if x is None or y is None:
+                raise McpError(ErrorData(code=INVALID_PARAMS, message="x and y are required"))
+            mag = magnitude if magnitude is not None else 800
+            dx = _denormalize(x, "x")
+            dy = _denormalize(y, "y")
+            sx, sy = _scale(dx, dy)
+            if direction in ("down", "up"):
+                distance = _denormalize(mag, "y")
+                if distance is None:
+                    raise McpError(
+                        ErrorData(
+                            code=INVALID_PARAMS, message="Unable to determine scroll magnitude"
+                        )
+                    )
+                distance = _scale_distance(distance, "y")
+                if distance is None:
+                    raise McpError(
+                        ErrorData(
+                            code=INVALID_PARAMS,
+                            message="Unable to determine scroll magnitude",
+                        )
+                    )
+                scroll_y = distance if direction == "down" else -distance
+                scroll_x = None
+            elif direction in ("right", "left"):
+                distance = _denormalize(mag, "x")
+                if distance is None:
+                    raise McpError(
+                        ErrorData(
+                            code=INVALID_PARAMS, message="Unable to determine scroll magnitude"
+                        )
+                    )
+                distance = _scale_distance(distance, "x")
+                if distance is None:
+                    raise McpError(
+                        ErrorData(
+                            code=INVALID_PARAMS,
+                            message="Unable to determine scroll magnitude",
+                        )
+                    )
+                scroll_x = distance if direction == "right" else -distance
+                scroll_y = None
+            else:
+                raise McpError(
+                    ErrorData(code=INVALID_PARAMS, message=f"Invalid direction: {direction}")
+                )
+            result = await self.executor.scroll(x=sx, y=sy, scroll_x=scroll_x, scroll_y=scroll_y)
+            return await _finalize(result)
+        elif action == "wait_5_seconds":
+            result = await self.executor.wait(time=5000)
+            return await _finalize(result)
+        elif action == "go_back":
+            is_mac = platform.system().lower() == "darwin"
+            combo = ["cmd", "["] if is_mac else ["alt", "left"]
+            result = await self.executor.press(keys=combo)
+            return await _finalize(result)
+        elif action == "go_forward":
+            is_mac = platform.system().lower() == "darwin"
+            combo = ["cmd", "]"] if is_mac else ["alt", "right"]
+            result = await self.executor.press(keys=combo)
+            return await _finalize(result)
+        elif action == "search":
+            # Best-effort navigate to a default search page
+            target = url or "https://www.google.com"
+            is_mac = platform.system().lower() == "darwin"
+            await self.executor.press(
+                keys=["cmd", "l"] if is_mac else ["ctrl", "l"], take_screenshot=False
+            )
+            result = await self.executor.write(text=target, enter_after=True)
+            return await _finalize(result, requested_url=target)
+        elif action == "navigate":
+            if not url:
+                raise McpError(ErrorData(code=INVALID_PARAMS, message="url is required"))
+            is_mac = platform.system().lower() == "darwin"
+            await self.executor.press(
+                keys=["cmd", "l"] if is_mac else ["ctrl", "l"], take_screenshot=False
+            )
+            result = await self.executor.write(text=url, enter_after=True)
+            return await _finalize(result, requested_url=url)
+        elif action == "key_combination":
+            if keys is None:
+                raise McpError(ErrorData(code=INVALID_PARAMS, message="keys is required"))
+            if isinstance(keys, str):
+                # Accept formats like "ctrl+c" or "ctrl+shift+t"
+                key_list = [k.strip() for k in keys.split("+") if k.strip()]
+            else:
+                key_list = keys
+            result = await self.executor.press(keys=key_list)
+            return await _finalize(result)
+        elif action == "drag_and_drop":
+            if x is None or y is None or destination_x is None or destination_y is None:
+                raise McpError(
+                    ErrorData(
+                        code=INVALID_PARAMS,
+                        message="x, y, destination_x, and destination_y are required",
+                    )
+                )
+            sx_norm = _denormalize(x, "x")
+            sy_norm = _denormalize(y, "y")
+            dx_norm = _denormalize(destination_x, "x")
+            dy_norm = _denormalize(destination_y, "y")
+            sx, sy = _scale(sx_norm, sy_norm)
+            dx_scaled, dy_scaled = _scale(dx_norm, dy_norm)
+            # Build a two-point path
+            path = []  # type: list[tuple[int, int]]
+            if (
+                sx is not None
+                and sy is not None
+                and dx_scaled is not None
+                and dy_scaled is not None
+            ):
+                path = [(sx, sy), (dx_scaled, dy_scaled)]
+            result = await self.executor.drag(path=path)
+            return await _finalize(result)
+        else:
+            raise McpError(ErrorData(code=INVALID_PARAMS, message=f"Unknown action: {action}"))

hud/tools/computer/settings.py CHANGED Viewed

@@ -94,5 +94,26 @@ class ComputerSettings(BaseSettings):
         validation_alias="QWEN_RESCALE_IMAGES",
     )
+    GEMINI_COMPUTER_WIDTH: int = Field(
+        default=1440,
+        description="Width of the display to use for the Gemini computer tools",
+        validation_alias="GEMINI_COMPUTER_WIDTH",
+    )
+    GEMINI_COMPUTER_HEIGHT: int = Field(
+        default=900,
+        description="Height of the display to use for the Gemini computer tools",
+        validation_alias="GEMINI_COMPUTER_HEIGHT",
+    )
+    GEMINI_RESCALE_IMAGES: bool = Field(
+        default=True,
+        description="Whether to rescale images to the agent width and height",
+        validation_alias="GEMINI_RESCALE_IMAGES",
+    )
+    GEMINI_MAX_RECENT_TURN_WITH_SCREENSHOTS: int = Field(
+        default=3,
+        description="Maximum number of recent turns to keep screenshots for in Gemini agent",
+        validation_alias="GEMINI_MAX_RECENT_TURN_WITH_SCREENSHOTS",
+    )
 computer_settings = ComputerSettings()

hud/tools/playwright.py CHANGED Viewed

@@ -84,6 +84,9 @@ class PlaywrightTool(BaseTool):
                             code=INVALID_PARAMS, message="url parameter is required for navigate"
                         )
                     )
+                # Guard against pydantic FieldInfo default leaking through
+                if not isinstance(wait_for_load_state, str):
+                    wait_for_load_state = None
                 result = await self.navigate(url, wait_for_load_state or "networkidle")
             elif action == "screenshot":
@@ -179,11 +182,16 @@ class PlaywrightTool(BaseTool):
                 if self._browser is None:
                     raise RuntimeError("Failed to connect to remote browser")
-                # Use existing context or create new one
+                # Reuse existing context and page where possible to avoid spawning new windows
                 contexts = self._browser.contexts
                 if contexts:
                     self._browser_context = contexts[0]
+                    # Prefer the first existing page to keep using the already visible window/tab
+                    existing_pages = self._browser_context.pages
+                    if existing_pages:
+                        self.page = existing_pages[0]
                 else:
+                    # As a fallback, create a new context
                     self._browser_context = await self._browser.new_context(
                         viewport={"width": 1920, "height": 1080},
                         ignore_https_errors=True,
@@ -225,7 +233,14 @@ class PlaywrightTool(BaseTool):
             if self._browser_context is None:
                 raise RuntimeError("Browser context failed to initialize")
-            self.page = await self._browser_context.new_page()
+            # Reuse existing page if available (for CDP connections), otherwise create new one
+            pages = self._browser_context.pages
+            if pages:
+                self.page = pages[0]
+                logger.info("Reusing existing browser page")
+            else:
+                self.page = await self._browser_context.new_page()
+                logger.info("Created new browser page")
             logger.info("Playwright browser launched successfully")
     async def navigate(

hud/tools/types.py CHANGED Viewed

@@ -28,6 +28,7 @@ class ContentResult(BaseModel):
     error: str | None = Field(default=None, description="Error message")
     base64_image: str | None = Field(default=None, description="Base64-encoded image")
     system: str | None = Field(default=None, description="System message")
+    url: str | None = Field(default=None, description="Current page URL (for browser automation)")
     def __add__(self, other: ContentResult) -> ContentResult:
         def combine_fields(
@@ -44,6 +45,7 @@ class ContentResult(BaseModel):
             error=combine_fields(self.error, other.error),
             base64_image=combine_fields(self.base64_image, other.base64_image, False),
             system=combine_fields(self.system, other.system),
+            url=combine_fields(self.url, other.url, False),
         )
     def to_content_blocks(self) -> list[ContentBlock]:
@@ -55,7 +57,7 @@ class ContentResult(BaseModel):
             result: ContentResult to convert
         Returns:
-            List of ContentBlock
+            List of ContentBlock with URL embedded as metadata if available
         """
         blocks: list[ContentBlock] = []
@@ -65,6 +67,12 @@ class ContentResult(BaseModel):
             blocks.append(TextContent(text=self.error, type="text"))
         if self.base64_image:
             blocks.append(ImageContent(data=self.base64_image, mimeType="image/png", type="image"))
+        # Add URL as a special metadata text block (for Gemini Computer Use)
+        # Always include URL if set, even if it's a placeholder like "about:blank"
+        if self.url:
+            blocks.append(TextContent(text=f"__URL__:{self.url}", type="text"))
         return blocks

hud/types.py CHANGED Viewed

@@ -25,6 +25,7 @@ _missing_api_key_error_logged: bool = False
 class AgentType(str, Enum):
     CLAUDE = "claude"
     OPENAI = "openai"
+    GEMINI = "gemini"
     VLLM = "vllm"
     LITELLM = "litellm"
     INTEGRATION_TEST = "integration_test"
@@ -230,7 +231,7 @@ class AgentResponse(BaseModel):
     tool_calls: list[MCPToolCall] = Field(default_factory=list)
     done: bool = Field(default=False)
-    # --- TELEMETRY [hud.so] ---
+    # --- TELEMETRY [hud.ai] ---
     # Responses
     content: str | None = Field(default=None)
     reasoning: str | None = Field(default=None)

hud/utils/tests/test_version.py CHANGED Viewed

@@ -5,4 +5,4 @@ def test_import():
     """Test that the package can be imported."""
     import hud
-    assert hud.__version__ == "0.4.57"
+    assert hud.__version__ == "0.4.59"

hud/version.py CHANGED Viewed

@@ -4,4 +4,4 @@ Version information for the HUD SDK.
 from __future__ import annotations
-__version__ = "0.4.57"
+__version__ = "0.4.59"

{hud_python-0.4.57.dist-info → hud_python-0.4.59.dist-info}/METADATA RENAMED Viewed

@@ -1,6 +1,6 @@
 Metadata-Version: 2.4
 Name: hud-python
-Version: 0.4.57
+Version: 0.4.59
 Summary: SDK for the HUD platform.
 Project-URL: Homepage, https://github.com/hud-evals/hud-python
 Project-URL: Bug Tracker, https://github.com/hud-evals/hud-python/issues
@@ -38,6 +38,7 @@ Requires-Python: <3.13,>=3.11
 Requires-Dist: anthropic
 Requires-Dist: blessed>=1.20.0
 Requires-Dist: datasets>=2.14.0
+Requires-Dist: google-genai
 Requires-Dist: httpx<1,>=0.23.0
 Requires-Dist: hud-fastmcp-python-sdk>=0.1.2
 Requires-Dist: hud-mcp-python-sdk>=3.13.2

hud-python 0.4.57__py3-none-any.whl → 0.4.59__py3-none-any.whl

Potentially problematic release.

hud-python 0.4.57__py3-none-any.whl → 0.4.59__py3-none-any.whl