PyPI - hud-python - Versions diffs - 0.4.45__py3-none-any.whl → 0.5.1__py3-none-any.whl - Mend

hud-python 0.4.45__py3-none-any.whl → 0.5.1__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (274) hide show

hud/__init__.py +27 -7
hud/agents/__init__.py +11 -5
hud/agents/base.py +220 -500
hud/agents/claude.py +200 -240
hud/agents/gemini.py +275 -0
hud/agents/gemini_cua.py +335 -0
hud/agents/grounded_openai.py +98 -100
hud/agents/misc/integration_test_agent.py +51 -20
hud/agents/misc/response_agent.py +41 -36
hud/agents/openai.py +291 -292
hud/agents/{openai_chat_generic.py → openai_chat.py} +80 -34
hud/agents/operator.py +211 -0
hud/agents/tests/conftest.py +133 -0
hud/agents/tests/test_base.py +300 -622
hud/agents/tests/test_base_runtime.py +233 -0
hud/agents/tests/test_claude.py +379 -210
hud/agents/tests/test_client.py +9 -10
hud/agents/tests/test_gemini.py +369 -0
hud/agents/tests/test_grounded_openai_agent.py +65 -50
hud/agents/tests/test_openai.py +376 -140
hud/agents/tests/test_operator.py +362 -0
hud/agents/tests/test_run_eval.py +179 -0
hud/cli/__init__.py +461 -545
hud/cli/analyze.py +43 -5
hud/cli/build.py +664 -110
hud/cli/debug.py +8 -5
hud/cli/dev.py +882 -734
hud/cli/eval.py +782 -668
hud/cli/flows/dev.py +167 -0
hud/cli/flows/init.py +191 -0
hud/cli/flows/tasks.py +153 -56
hud/cli/flows/templates.py +151 -0
hud/cli/flows/tests/__init__.py +1 -0
hud/cli/flows/tests/test_dev.py +126 -0
hud/cli/init.py +60 -58
hud/cli/push.py +29 -11
hud/cli/rft.py +311 -0
hud/cli/rft_status.py +145 -0
hud/cli/tests/test_analyze.py +5 -5
hud/cli/tests/test_analyze_metadata.py +3 -2
hud/cli/tests/test_analyze_module.py +120 -0
hud/cli/tests/test_build.py +108 -6
hud/cli/tests/test_build_failure.py +41 -0
hud/cli/tests/test_build_module.py +50 -0
hud/cli/tests/test_cli_init.py +6 -1
hud/cli/tests/test_cli_more_wrappers.py +30 -0
hud/cli/tests/test_cli_root.py +140 -0
hud/cli/tests/test_convert.py +361 -0
hud/cli/tests/test_debug.py +12 -10
hud/cli/tests/test_dev.py +197 -0
hud/cli/tests/test_eval.py +251 -0
hud/cli/tests/test_eval_bedrock.py +51 -0
hud/cli/tests/test_init.py +124 -0
hud/cli/tests/test_main_module.py +11 -5
hud/cli/tests/test_mcp_server.py +12 -100
hud/cli/tests/test_push_happy.py +74 -0
hud/cli/tests/test_push_wrapper.py +23 -0
hud/cli/tests/test_registry.py +1 -1
hud/cli/tests/test_utils.py +1 -1
hud/cli/{rl → utils}/celebrate.py +14 -12
hud/cli/utils/config.py +18 -1
hud/cli/utils/docker.py +130 -4
hud/cli/utils/env_check.py +9 -9
hud/cli/utils/git.py +136 -0
hud/cli/utils/interactive.py +39 -5
hud/cli/utils/metadata.py +69 -0
hud/cli/utils/runner.py +1 -1
hud/cli/utils/server.py +2 -2
hud/cli/utils/source_hash.py +3 -3
hud/cli/utils/tasks.py +4 -1
hud/cli/utils/tests/__init__.py +0 -0
hud/cli/utils/tests/test_config.py +58 -0
hud/cli/utils/tests/test_docker.py +93 -0
hud/cli/utils/tests/test_docker_hints.py +71 -0
hud/cli/utils/tests/test_env_check.py +74 -0
hud/cli/utils/tests/test_environment.py +42 -0
hud/cli/utils/tests/test_git.py +142 -0
hud/cli/utils/tests/test_interactive_module.py +60 -0
hud/cli/utils/tests/test_local_runner.py +50 -0
hud/cli/utils/tests/test_logging_utils.py +23 -0
hud/cli/utils/tests/test_metadata.py +49 -0
hud/cli/utils/tests/test_package_runner.py +35 -0
hud/cli/utils/tests/test_registry_utils.py +49 -0
hud/cli/utils/tests/test_remote_runner.py +25 -0
hud/cli/utils/tests/test_runner_modules.py +52 -0
hud/cli/utils/tests/test_source_hash.py +36 -0
hud/cli/utils/tests/test_tasks.py +80 -0
hud/cli/utils/version_check.py +258 -0
hud/cli/{rl → utils}/viewer.py +2 -2
hud/clients/README.md +12 -11
hud/clients/__init__.py +4 -3
hud/clients/base.py +166 -26
hud/clients/environment.py +51 -0
hud/clients/fastmcp.py +13 -6
hud/clients/mcp_use.py +40 -15
hud/clients/tests/test_analyze_scenarios.py +206 -0
hud/clients/tests/test_protocol.py +9 -3
hud/datasets/__init__.py +23 -20
hud/datasets/loader.py +327 -0
hud/datasets/runner.py +192 -105
hud/datasets/tests/__init__.py +0 -0
hud/datasets/tests/test_loader.py +221 -0
hud/datasets/tests/test_utils.py +315 -0
hud/datasets/utils.py +270 -90
hud/environment/__init__.py +50 -0
hud/environment/connection.py +206 -0
hud/environment/connectors/__init__.py +33 -0
hud/environment/connectors/base.py +68 -0
hud/environment/connectors/local.py +177 -0
hud/environment/connectors/mcp_config.py +109 -0
hud/environment/connectors/openai.py +101 -0
hud/environment/connectors/remote.py +172 -0
hud/environment/environment.py +694 -0
hud/environment/integrations/__init__.py +45 -0
hud/environment/integrations/adk.py +67 -0
hud/environment/integrations/anthropic.py +196 -0
hud/environment/integrations/gemini.py +92 -0
hud/environment/integrations/langchain.py +82 -0
hud/environment/integrations/llamaindex.py +68 -0
hud/environment/integrations/openai.py +238 -0
hud/environment/mock.py +306 -0
hud/environment/router.py +112 -0
hud/environment/scenarios.py +493 -0
hud/environment/tests/__init__.py +1 -0
hud/environment/tests/test_connection.py +317 -0
hud/environment/tests/test_connectors.py +218 -0
hud/environment/tests/test_environment.py +161 -0
hud/environment/tests/test_integrations.py +257 -0
hud/environment/tests/test_local_connectors.py +201 -0
hud/environment/tests/test_scenarios.py +280 -0
hud/environment/tests/test_tools.py +208 -0
hud/environment/types.py +23 -0
hud/environment/utils/__init__.py +35 -0
hud/environment/utils/formats.py +215 -0
hud/environment/utils/schema.py +171 -0
hud/environment/utils/tool_wrappers.py +113 -0
hud/eval/__init__.py +67 -0
hud/eval/context.py +674 -0
hud/eval/display.py +299 -0
hud/eval/instrument.py +185 -0
hud/eval/manager.py +466 -0
hud/eval/parallel.py +268 -0
hud/eval/task.py +340 -0
hud/eval/tests/__init__.py +1 -0
hud/eval/tests/test_context.py +178 -0
hud/eval/tests/test_eval.py +210 -0
hud/eval/tests/test_manager.py +152 -0
hud/eval/tests/test_parallel.py +168 -0
hud/eval/tests/test_task.py +145 -0
hud/eval/types.py +63 -0
hud/eval/utils.py +183 -0
hud/patches/__init__.py +19 -0
hud/patches/mcp_patches.py +151 -0
hud/patches/warnings.py +54 -0
hud/samples/browser.py +4 -4
hud/server/__init__.py +2 -1
hud/server/low_level.py +2 -1
hud/server/router.py +164 -0
hud/server/server.py +567 -80
hud/server/tests/test_mcp_server_integration.py +11 -11
hud/server/tests/test_mcp_server_more.py +1 -1
hud/server/tests/test_server_extra.py +2 -0
hud/settings.py +45 -3
hud/shared/exceptions.py +36 -10
hud/shared/hints.py +26 -1
hud/shared/requests.py +15 -3
hud/shared/tests/test_exceptions.py +40 -31
hud/shared/tests/test_hints.py +167 -0
hud/telemetry/__init__.py +20 -19
hud/telemetry/exporter.py +201 -0
hud/telemetry/instrument.py +158 -253
hud/telemetry/tests/test_eval_telemetry.py +356 -0
hud/telemetry/tests/test_exporter.py +258 -0
hud/telemetry/tests/test_instrument.py +401 -0
hud/tools/__init__.py +16 -2
hud/tools/apply_patch.py +639 -0
hud/tools/base.py +54 -4
hud/tools/bash.py +2 -2
hud/tools/computer/__init__.py +4 -0
hud/tools/computer/anthropic.py +2 -2
hud/tools/computer/gemini.py +385 -0
hud/tools/computer/hud.py +23 -6
hud/tools/computer/openai.py +20 -21
hud/tools/computer/qwen.py +434 -0
hud/tools/computer/settings.py +37 -0
hud/tools/edit.py +3 -7
hud/tools/executors/base.py +4 -2
hud/tools/executors/pyautogui.py +1 -1
hud/tools/grounding/grounded_tool.py +13 -18
hud/tools/grounding/grounder.py +10 -31
hud/tools/grounding/tests/test_grounded_tool.py +26 -44
hud/tools/jupyter.py +330 -0
hud/tools/playwright.py +18 -3
hud/tools/shell.py +308 -0
hud/tools/tests/test_apply_patch.py +718 -0
hud/tools/tests/test_computer.py +4 -9
hud/tools/tests/test_computer_actions.py +24 -2
hud/tools/tests/test_jupyter_tool.py +181 -0
hud/tools/tests/test_shell.py +596 -0
hud/tools/tests/test_submit.py +85 -0
hud/tools/tests/test_types.py +193 -0
hud/tools/types.py +21 -1
hud/types.py +167 -57
hud/utils/__init__.py +2 -0
hud/utils/env.py +67 -0
hud/utils/hud_console.py +61 -3
hud/utils/mcp.py +15 -58
hud/utils/strict_schema.py +162 -0
hud/utils/tests/test_init.py +1 -2
hud/utils/tests/test_mcp.py +1 -28
hud/utils/tests/test_pretty_errors.py +186 -0
hud/utils/tests/test_tool_shorthand.py +154 -0
hud/utils/tests/test_version.py +1 -1
hud/utils/types.py +20 -0
hud/version.py +1 -1
hud_python-0.5.1.dist-info/METADATA +264 -0
hud_python-0.5.1.dist-info/RECORD +299 -0
{hud_python-0.4.45.dist-info → hud_python-0.5.1.dist-info}/WHEEL +1 -1
hud/agents/langchain.py +0 -261
hud/agents/lite_llm.py +0 -72
hud/cli/rl/__init__.py +0 -180
hud/cli/rl/config.py +0 -101
hud/cli/rl/display.py +0 -133
hud/cli/rl/gpu.py +0 -63
hud/cli/rl/gpu_utils.py +0 -321
hud/cli/rl/local_runner.py +0 -595
hud/cli/rl/presets.py +0 -96
hud/cli/rl/remote_runner.py +0 -463
hud/cli/rl/rl_api.py +0 -150
hud/cli/rl/vllm.py +0 -177
hud/cli/rl/wait_utils.py +0 -89
hud/datasets/parallel.py +0 -687
hud/misc/__init__.py +0 -1
hud/misc/claude_plays_pokemon.py +0 -292
hud/otel/__init__.py +0 -35
hud/otel/collector.py +0 -142
hud/otel/config.py +0 -181
hud/otel/context.py +0 -570
hud/otel/exporters.py +0 -369
hud/otel/instrumentation.py +0 -135
hud/otel/processors.py +0 -121
hud/otel/tests/__init__.py +0 -1
hud/otel/tests/test_processors.py +0 -197
hud/rl/README.md +0 -30
hud/rl/__init__.py +0 -1
hud/rl/actor.py +0 -176
hud/rl/buffer.py +0 -405
hud/rl/chat_template.jinja +0 -101
hud/rl/config.py +0 -192
hud/rl/distributed.py +0 -132
hud/rl/learner.py +0 -637
hud/rl/tests/__init__.py +0 -1
hud/rl/tests/test_learner.py +0 -186
hud/rl/train.py +0 -382
hud/rl/types.py +0 -101
hud/rl/utils/start_vllm_server.sh +0 -30
hud/rl/utils.py +0 -524
hud/rl/vllm_adapter.py +0 -143
hud/telemetry/job.py +0 -352
hud/telemetry/replay.py +0 -74
hud/telemetry/tests/test_replay.py +0 -40
hud/telemetry/tests/test_trace.py +0 -63
hud/telemetry/trace.py +0 -158
hud/utils/agent_factories.py +0 -86
hud/utils/async_utils.py +0 -65
hud/utils/group_eval.py +0 -223
hud/utils/progress.py +0 -149
hud/utils/tasks.py +0 -127
hud/utils/tests/test_async_utils.py +0 -173
hud/utils/tests/test_progress.py +0 -261
hud_python-0.4.45.dist-info/METADATA +0 -552
hud_python-0.4.45.dist-info/RECORD +0 -228
{hud_python-0.4.45.dist-info → hud_python-0.5.1.dist-info}/entry_points.txt +0 -0
{hud_python-0.4.45.dist-info → hud_python-0.5.1.dist-info}/licenses/LICENSE +0 -0

hud/tools/computer/openai.py CHANGED Viewed

@@ -9,7 +9,7 @@ from mcp.types import INTERNAL_ERROR, INVALID_PARAMS, ContentBlock, TextContent
 from pydantic import Field
 from hud.tools.computer.settings import computer_settings
-from hud.tools.types import ContentResult
+from hud.tools.types import ContentResult, Coordinate
 from .hud import HudComputerTool
@@ -18,6 +18,7 @@ if TYPE_CHECKING:
 logger = logging.getLogger(__name__)
 # Map OpenAI key names to CLA standard keys
 OPENAI_TO_CLA_KEYS = {
     # Common variations
@@ -95,14 +96,26 @@ class OpenAIComputerTool(HudComputerTool):
         # OpenAI uses lowercase key names
         return OPENAI_TO_CLA_KEYS.get(key.lower(), key.lower())
-    async def __call__(
+    async def __call__(  # type: ignore[override]
         self,
-        type: str = Field(..., description="The action type to perform"),
+        type: Literal[
+            "screenshot",
+            "click",
+            "double_click",
+            "scroll",
+            "type",
+            "wait",
+            "move",
+            "keypress",
+            "drag",
+            "response",
+            "custom",
+        ] = Field(..., description="The action type to perform"),
         # Coordinate parameters
         x: int | None = Field(None, description="X coordinate for click/move/scroll actions"),
         y: int | None = Field(None, description="Y coordinate for click/move/scroll actions"),
         # Button parameter
-        button: str | None = Field(
+        button: Literal["left", "right", "middle", "back", "forward"] | None = Field(
             None, description="Mouse button for click actions (left, right, middle, wheel)"
         ),
         # Text parameter
@@ -115,7 +128,7 @@ class OpenAIComputerTool(HudComputerTool):
         # Key press parameter
         keys: list[str] | None = Field(None, description="Keys to press"),
         # Drag parameter
-        path: list[dict[str, int]] | None = Field(
+        path: list[Coordinate] | None = Field(
             None, description="Path for drag actions as list of {x, y} dicts"
         ),
         # Custom action parameter
@@ -131,11 +144,6 @@ class OpenAIComputerTool(HudComputerTool):
         """
         logger.info("OpenAIComputerTool received type: %s", type)
-        # Map button names
-        button_map = {"wheel": "middle"}
-        if button:
-            button = button_map.get(button, button)
         # Process based on action type
         if type == "screenshot":
             screenshot_base64 = await self.executor.screenshot()
@@ -227,17 +235,8 @@ class OpenAIComputerTool(HudComputerTool):
                     )
                 )
-            # Convert path from list of dicts to list of tuples
-            drag_path = []
-            for point in path:
-                if "x" in point and "y" in point:
-                    drag_path.append((point["x"], point["y"]))
-                else:
-                    raise McpError(
-                        ErrorData(
-                            code=INVALID_PARAMS, message="Each point in path must have x and y"
-                        )
-                    )
+            # Convert path from list of Coordinate objects to list of tuples
+            drag_path = [(point.x, point.y) for point in path]
             scaled_path = self._scale_path(drag_path)
             result = await self.executor.drag(path=scaled_path)

hud/tools/computer/qwen.py ADDED Viewed

@@ -0,0 +1,434 @@
+# flake8: noqa: B008
+from __future__ import annotations
+import logging
+import re
+from typing import TYPE_CHECKING, Any, Literal
+from mcp import ErrorData, McpError
+from mcp.types import INTERNAL_ERROR, INVALID_PARAMS, ContentBlock
+from pydantic import Field
+from hud.tools.types import ContentResult
+from .hud import HudComputerTool
+from .settings import computer_settings
+if TYPE_CHECKING:
+    from hud.tools.executors.base import BaseExecutor
+logger = logging.getLogger(__name__)
+class QwenComputerTool(HudComputerTool):
+    """
+    Qwen Computer Use tool for interacting with the computer.
+    """
+    name: str = "computer_use"
+    api_type: str = "computer_use"
+    def __init__(
+        self,
+        # Define within environment based on platform
+        executor: BaseExecutor | None = None,
+        platform_type: Literal["auto", "xdo", "pyautogui"] = "auto",
+        display_num: int | None = None,
+        # Overrides for what dimensions the agent thinks it operates in
+        width: int = computer_settings.QWEN_COMPUTER_WIDTH,
+        height: int = computer_settings.QWEN_COMPUTER_HEIGHT,
+        rescale_images: bool = computer_settings.QWEN_RESCALE_IMAGES,
+        # What the agent sees as the tool's name, title, and description
+        name: str | None = None,
+        title: str | None = None,
+        description: str | None = None,
+        **kwargs: Any,
+    ) -> None:
+        """
+        Initialize with Qwen's default dimensions.
+        Args:
+            width: Target width for rescaling (None = use environment width)
+            height: Target height for rescaling (None = use environment height)
+            rescale_images: If True, rescale screenshots. If False, only rescale action coordinates
+            name: Tool name for MCP registration (auto-generated from class name if not provided)
+            title: Human-readable display name for the tool (auto-generated from class name)
+            description: Tool description (auto-generated from docstring if not provided)
+        """
+        # Store dimensions for description
+        self.display_width_px = width
+        self.display_height_px = height
+        # Build custom description with resolution info
+        custom_description = (
+            description
+            or f"""
+Use a mouse and keyboard to interact with a computer, and take screenshots.
+* This is an interface to a desktop GUI. You do not have access to a terminal or
+applications menu. You must click on desktop icons to start applications.
+* Some applications may take time to start or process actions, so you may need to
+wait and take successive screenshots to see the results of your actions. E.g. if you
+click on Firefox and a window doesn't open, try wait and taking another screenshot.
+* The screen's resolution is {width}x{height}.
+* Whenever you intend to move the cursor to click on an element like an icon, you
+should consult a screenshot to determine the coordinates of the element before
+moving the cursor.
+* If you tried clicking on a program or link but it failed to load, even after
+waiting, try adjusting your cursor position so that the tip of the cursor visually
+falls on the element that you want to click.
+* Make sure to click any buttons, links, icons, etc with the cursor tip in the
+center of the element. Don't click boxes on their edges.
+""".strip()
+        )
+        super().__init__(
+            executor=executor,
+            platform_type=platform_type,
+            display_num=display_num,
+            width=width,
+            height=height,
+            rescale_images=rescale_images,
+            name=name or "qwen_computer",
+            title=title or "Qwen Computer Tool",
+            description=custom_description,
+            **kwargs,
+        )
+    def to_params(self) -> dict:
+        """Convert to Qwen tool parameters."""
+        return {
+            "type": self.api_type,
+            "name": self.name,
+            "display_width_px": self.display_width_px,
+            "display_height_px": self.display_height_px,
+            "description": self.description,
+            "parameters": {
+                "properties": {
+                    "action": {
+                        "description": """
+The action to perform. The available actions are:
+* `key`: Performs key down presses on the arguments passed in order, then performs
+key releases in reverse order.
+* `type`: Type a string of text on the keyboard.
+* `mouse_move`: Move the cursor to a specified (x, y) pixel coordinate on the
+screen.
+* `left_click`: Click the left mouse button at a specified (x, y) pixel coordinate
+on the screen.
+* `left_click_drag`: Click and drag the cursor to a specified (x, y) pixel
+coordinate on the screen.
+* `right_click`: Click the right mouse button at a specified (x, y) pixel
+coordinate on the screen.
+* `middle_click`: Click the middle mouse button at a specified (x, y) pixel
+coordinate on the screen.
+* `double_click`: Double-click the left mouse button at a specified (x, y) pixel
+coordinate on the screen.
+* `triple_click`: Triple-click the left mouse button at a specified (x, y) pixel
+coordinate on the screen.
+* `scroll`: Performs a scroll of the mouse scroll wheel.
+* `hscroll`: Performs a horizontal scroll.
+* `wait`: Wait specified seconds for the change to happen.
+* `terminate`: Terminate the current task and report its completion status
+(NOT SUPPORTED).
+* `answer`: Answer a question (NOT SUPPORTED).
+""".strip(),
+                        "enum": [
+                            "key",
+                            "type",
+                            "mouse_move",
+                            "left_click",
+                            "left_click_drag",
+                            "right_click",
+                            "middle_click",
+                            "double_click",
+                            "triple_click",
+                            "scroll",
+                            "hscroll",
+                            "wait",
+                            "terminate",
+                            "answer",
+                        ],
+                        "type": "string",
+                    },
+                    "keys": {
+                        "description": "Required only by `action=key`.",
+                        "type": "array",
+                    },
+                    "text": {
+                        "description": "Required only by `action=type` and `action=answer`.",
+                        "type": "string",
+                    },
+                    "coordinate": {
+                        "description": (
+                            "(x, y): The x (pixels from the left edge) and y "
+                            "(pixels from the top edge) coordinates to move the mouse to."
+                        ),
+                        "type": "array",
+                    },
+                    "pixels": {
+                        "description": (
+                            "The amount of scrolling to perform. Positive values scroll up, "
+                            "negative values scroll down. Required only by `action=scroll` "
+                            "and `action=hscroll`."
+                        ),
+                        "type": "number",
+                    },
+                    "time": {
+                        "description": "The seconds to wait. Required only by `action=wait`.",
+                        "type": "number",
+                    },
+                    "status": {
+                        "description": (
+                            "The status of the task. Required only by `action=terminate`."
+                        ),
+                        "type": "string",
+                        "enum": ["success", "failure"],
+                    },
+                },
+                "required": ["action"],
+                "type": "object",
+            },
+        }
+    async def __call__(
+        self,
+        action: str = Field(..., description="The action to perform on the computer"),
+        keys: list[str] | None = Field(None, description="Keys for key action"),
+        text: str | None = Field(None, description="Text to type"),
+        coordinate: list[int] | None = Field(
+            None, description="The coordinate to interact with on the computer [x, y]"
+        ),
+        pixels: int | None = Field(None, description="Pixels to scroll"),
+        time: float | None = Field(None, description="Time to wait in seconds"),
+        status: str | None = Field(None, description="Status for terminate action"),
+    ) -> list[ContentBlock]:
+        """
+        Handle Qwen Computer Use API calls.
+        This converts Qwen's action format to HudComputerTool's format.
+        Returns:
+            List of MCP content blocks
+        """
+        logger.info("QwenComputerTool received action: %s", action)
+        # Handle non-computer actions that should raise errors
+        if action == "terminate":
+            raise McpError(
+                ErrorData(
+                    code=INVALID_PARAMS,
+                    message=(
+                        "terminate action is not supported for computer control. This is a no-op."
+                    ),
+                )
+            )
+        if action == "answer":
+            raise McpError(
+                ErrorData(
+                    code=INVALID_PARAMS,
+                    message="answer action is not supported for computer control. This is a no-op.",
+                )
+            )
+        # Convert lists to tuples if needed
+        coord_tuple = None
+        if coordinate:
+            coord_tuple = tuple(coordinate) if isinstance(coordinate, list) else coordinate
+        # Map Qwen actions to HudComputerTool actions
+        if action == "left_click":
+            if coord_tuple and len(coord_tuple) >= 2:
+                scaled_x, scaled_y = self._scale_coordinates(coord_tuple[0], coord_tuple[1])
+                logger.info("Scaled coordinates: %s, %s", scaled_x, scaled_y)
+                result = await self.executor.click(x=scaled_x, y=scaled_y)
+            else:
+                raise McpError(
+                    ErrorData(code=INVALID_PARAMS, message="coordinate is required for left_click")
+                )
+        elif action == "double_click":
+            if coord_tuple and len(coord_tuple) >= 2:
+                # Use pattern for double-click
+                scaled_x, scaled_y = self._scale_coordinates(coord_tuple[0], coord_tuple[1])
+                result = await self.executor.click(x=scaled_x, y=scaled_y, pattern=[100])
+            else:
+                raise McpError(
+                    ErrorData(
+                        code=INVALID_PARAMS, message="coordinate is required for double_click"
+                    )
+                )
+        elif action == "triple_click":
+            if coord_tuple and len(coord_tuple) >= 2:
+                # Use pattern for triple-click (simulated as double-click)
+                scaled_x, scaled_y = self._scale_coordinates(coord_tuple[0], coord_tuple[1])
+                # Note: triple-click simulated as double-click as per requirement
+                result = await self.executor.click(x=scaled_x, y=scaled_y, pattern=[100])
+            else:
+                raise McpError(
+                    ErrorData(
+                        code=INVALID_PARAMS, message="coordinate is required for triple_click"
+                    )
+                )
+        elif action == "right_click":
+            if coord_tuple and len(coord_tuple) >= 2:
+                scaled_x, scaled_y = self._scale_coordinates(coord_tuple[0], coord_tuple[1])
+                result = await self.executor.click(x=scaled_x, y=scaled_y, button="right")
+            else:
+                raise McpError(
+                    ErrorData(code=INVALID_PARAMS, message="coordinate is required for right_click")
+                )
+        elif action == "middle_click":
+            if coord_tuple and len(coord_tuple) >= 2:
+                scaled_x, scaled_y = self._scale_coordinates(coord_tuple[0], coord_tuple[1])
+                result = await self.executor.click(x=scaled_x, y=scaled_y, button="middle")
+            else:
+                raise McpError(
+                    ErrorData(
+                        code=INVALID_PARAMS, message="coordinate is required for middle_click"
+                    )
+                )
+        elif action == "mouse_move":
+            if coord_tuple and len(coord_tuple) >= 2:
+                scaled_x, scaled_y = self._scale_coordinates(coord_tuple[0], coord_tuple[1])
+                result = await self.executor.move(x=scaled_x, y=scaled_y)
+            else:
+                raise McpError(
+                    ErrorData(code=INVALID_PARAMS, message="coordinate is required for mouse_move")
+                )
+        elif action == "type":
+            if text:
+                result = await self.executor.write(text=text)
+            else:
+                raise McpError(ErrorData(code=INVALID_PARAMS, message="text is required for type"))
+        elif action == "key":
+            if keys:
+                # Qwen sends an array of keys to press
+                result = await self.executor.press(keys=keys)
+            else:
+                raise McpError(ErrorData(code=INVALID_PARAMS, message="keys is required for key"))
+        elif action == "scroll":
+            if pixels is None:
+                raise McpError(
+                    ErrorData(code=INVALID_PARAMS, message="pixels is required for scroll")
+                )
+            # Qwen's pixels: positive scrolls up, negative scrolls down
+            # HUD's scroll_y: positive scrolls down, negative scrolls up
+            # So we need to negate the value
+            scroll_y = -pixels
+            if coord_tuple and len(coord_tuple) >= 2:
+                scaled_x, scaled_y = self._scale_coordinates(coord_tuple[0], coord_tuple[1])
+                result = await self.executor.scroll(x=scaled_x, y=scaled_y, scroll_y=scroll_y)
+            else:
+                result = await self.executor.scroll(scroll_y=scroll_y)
+        elif action == "hscroll":
+            if pixels is None:
+                raise McpError(
+                    ErrorData(code=INVALID_PARAMS, message="pixels is required for hscroll")
+                )
+            # For horizontal scroll, positive values scroll right, negative scroll left
+            scroll_x = pixels
+            if coord_tuple and len(coord_tuple) >= 2:
+                scaled_x, scaled_y = self._scale_coordinates(coord_tuple[0], coord_tuple[1])
+                result = await self.executor.scroll(x=scaled_x, y=scaled_y, scroll_x=scroll_x)
+            else:
+                result = await self.executor.scroll(scroll_x=scroll_x)
+        elif action == "left_click_drag":
+            if coord_tuple and len(coord_tuple) >= 2:
+                # For drag, we need a path. Qwen provides the end coordinate.
+                # We'll get the current position and drag from there to the target
+                current_pos = await self.executor.position()
+                if isinstance(current_pos, ContentResult) and current_pos.output:
+                    # Parse the position from the output
+                    match = re.search(r"x=(\d+), y=(\d+)", current_pos.output)
+                    if match:
+                        # Current position is in screen coordinates
+                        screen_start_x, screen_start_y = int(match.group(1)), int(match.group(2))
+                        # End position is in agent coordinates, needs scaling
+                        scaled_end_x, scaled_end_y = self._scale_coordinates(
+                            coord_tuple[0], coord_tuple[1]
+                        )
+                        # Create path in screen coordinates
+                        path = [(screen_start_x, screen_start_y), (scaled_end_x, scaled_end_y)]
+                        # Path is already in screen coordinates, no need to scale again
+                        result = await self.executor.drag(path=path)
+                    else:
+                        raise McpError(
+                            ErrorData(
+                                code=INTERNAL_ERROR, message="Failed to parse current position"
+                            )
+                        )
+                else:
+                    raise McpError(
+                        ErrorData(code=INTERNAL_ERROR, message="Failed to get current position")
+                    )
+            else:
+                raise McpError(
+                    ErrorData(
+                        code=INVALID_PARAMS, message="coordinate is required for left_click_drag"
+                    )
+                )
+        elif action == "wait":
+            if time is None:
+                raise McpError(ErrorData(code=INVALID_PARAMS, message="time is required for wait"))
+            if time < 0:
+                raise McpError(ErrorData(code=INVALID_PARAMS, message="time must be non-negative"))
+            # Convert seconds to milliseconds for HudComputerTool
+            result = await self.executor.wait(time=int(time * 1000))
+        else:
+            # Unknown action
+            raise McpError(ErrorData(code=INTERNAL_ERROR, message=f"Invalid action: {action}"))
+        # Rescale screenshot in result if present
+        if isinstance(result, ContentResult) and result.base64_image and self.rescale_images:
+            rescaled_image = await self._rescale_screenshot(result.base64_image)
+            result.base64_image = rescaled_image
+        # Auto-add screenshot for interactive actions
+        interactive_actions = {
+            "left_click",
+            "double_click",
+            "triple_click",
+            "right_click",
+            "middle_click",
+            "mouse_move",
+            "type",
+            "key",
+            "scroll",
+            "hscroll",
+            "left_click_drag",
+        }
+        if (
+            action in interactive_actions
+            and isinstance(result, ContentResult)
+            and not result.base64_image
+        ):
+            screenshot_base64 = await self.executor.screenshot()
+            if screenshot_base64:
+                # Rescale screenshot if requested
+                screenshot_base64 = await self._rescale_screenshot(screenshot_base64)
+                result = ContentResult(
+                    # note: we suppress the output since it's not useful
+                    output="",
+                    error=result.error,
+                    base64_image=screenshot_base64,
+                )
+        # Convert to content blocks
+        return result.to_content_blocks()

hud/tools/computer/settings.py CHANGED Viewed

@@ -62,6 +62,17 @@ class ComputerSettings(BaseSettings):
         validation_alias="OPENAI_COMPUTER_HEIGHT",
     )
+    QWEN_COMPUTER_WIDTH: int = Field(
+        default=700,
+        description="Width of the display to use for the Qwen computer tools",
+        validation_alias="QWEN_COMPUTER_WIDTH",
+    )
+    QWEN_COMPUTER_HEIGHT: int = Field(
+        default=448,
+        description="Height of the display to use for the Qwen computer tools",
+        validation_alias="QWEN_COMPUTER_HEIGHT",
+    )
     HUD_RESCALE_IMAGES: bool = Field(
         default=False,
         description="Whether to rescale images to the agent width and height",
@@ -77,6 +88,32 @@ class ComputerSettings(BaseSettings):
         description="Whether to rescale images to the agent width and height",
         validation_alias="OPENAI_RESCALE_IMAGES",
     )
+    QWEN_RESCALE_IMAGES: bool = Field(
+        default=True,
+        description="Whether to rescale images to the agent width and height",
+        validation_alias="QWEN_RESCALE_IMAGES",
+    )
+    GEMINI_COMPUTER_WIDTH: int = Field(
+        default=1440,
+        description="Width of the display to use for the Gemini computer tools",
+        validation_alias="GEMINI_COMPUTER_WIDTH",
+    )
+    GEMINI_COMPUTER_HEIGHT: int = Field(
+        default=900,
+        description="Height of the display to use for the Gemini computer tools",
+        validation_alias="GEMINI_COMPUTER_HEIGHT",
+    )
+    GEMINI_RESCALE_IMAGES: bool = Field(
+        default=True,
+        description="Whether to rescale images to the agent width and height",
+        validation_alias="GEMINI_RESCALE_IMAGES",
+    )
+    GEMINI_MAX_RECENT_TURN_WITH_SCREENSHOTS: int = Field(
+        default=3,
+        description="Maximum number of recent turns to keep screenshots for in Gemini agent",
+        validation_alias="GEMINI_MAX_RECENT_TURN_WITH_SCREENSHOTS",
+    )
 computer_settings = ComputerSettings()

hud/tools/edit.py CHANGED Viewed

@@ -1,16 +1,13 @@
-from __future__ import annotations
 from collections import defaultdict
 from pathlib import Path
-from typing import TYPE_CHECKING, Any, Literal, get_args
+from typing import Literal, get_args
+from mcp.types import ContentBlock
 from .base import BaseTool
 from .types import ContentResult, ToolError
 from .utils import maybe_truncate, run
-if TYPE_CHECKING:
-    from mcp.types import ContentBlock
 Command = Literal[
     "view",
     "create",
@@ -56,7 +53,6 @@ class EditTool(BaseTool):
         old_str: str | None = None,
         new_str: str | None = None,
         insert_line: int | None = None,
-        **kwargs: Any,
     ) -> list[ContentBlock]:
         _path = Path(path)
         self.validate_path(command, _path)

hud/tools/executors/base.py CHANGED Viewed

@@ -280,7 +280,7 @@ class BaseExecutor:
     # ===== Utility Actions =====
-    async def wait(self, time: int) -> ContentResult:
+    async def wait(self, time: int, take_screenshot: bool = True) -> ContentResult:
         """
         Wait for specified time.
@@ -289,7 +289,9 @@ class BaseExecutor:
         """
         duration_seconds = time / 1000.0
         await asyncio.sleep(duration_seconds)
-        return ContentResult(output=f"Waited {time}ms")
+        # take screenshot
+        screenshot = await self.screenshot() if take_screenshot else None
+        return ContentResult(output=f"Waited {time}ms", base64_image=screenshot)
     async def screenshot(self) -> str | None:
         """

hud/tools/executors/pyautogui.py CHANGED Viewed

@@ -31,7 +31,7 @@ def _get_pyautogui() -> Any | None:
             try:
                 from hud.tools.computer import computer_settings
-                os.environ["DISPLAY"] = str(computer_settings.DISPLAY_NUM)
+                os.environ["DISPLAY"] = f":{computer_settings.DISPLAY_NUM}"
             except (ImportError, AttributeError):
                 os.environ["DISPLAY"] = ":0"

hud-python 0.4.45__py3-none-any.whl → 0.5.1__py3-none-any.whl

hud-python 0.4.45__py3-none-any.whl → 0.5.1__py3-none-any.whl