hud-python 0.4.45__py3-none-any.whl → 0.5.1__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- hud/__init__.py +27 -7
- hud/agents/__init__.py +11 -5
- hud/agents/base.py +220 -500
- hud/agents/claude.py +200 -240
- hud/agents/gemini.py +275 -0
- hud/agents/gemini_cua.py +335 -0
- hud/agents/grounded_openai.py +98 -100
- hud/agents/misc/integration_test_agent.py +51 -20
- hud/agents/misc/response_agent.py +41 -36
- hud/agents/openai.py +291 -292
- hud/agents/{openai_chat_generic.py → openai_chat.py} +80 -34
- hud/agents/operator.py +211 -0
- hud/agents/tests/conftest.py +133 -0
- hud/agents/tests/test_base.py +300 -622
- hud/agents/tests/test_base_runtime.py +233 -0
- hud/agents/tests/test_claude.py +379 -210
- hud/agents/tests/test_client.py +9 -10
- hud/agents/tests/test_gemini.py +369 -0
- hud/agents/tests/test_grounded_openai_agent.py +65 -50
- hud/agents/tests/test_openai.py +376 -140
- hud/agents/tests/test_operator.py +362 -0
- hud/agents/tests/test_run_eval.py +179 -0
- hud/cli/__init__.py +461 -545
- hud/cli/analyze.py +43 -5
- hud/cli/build.py +664 -110
- hud/cli/debug.py +8 -5
- hud/cli/dev.py +882 -734
- hud/cli/eval.py +782 -668
- hud/cli/flows/dev.py +167 -0
- hud/cli/flows/init.py +191 -0
- hud/cli/flows/tasks.py +153 -56
- hud/cli/flows/templates.py +151 -0
- hud/cli/flows/tests/__init__.py +1 -0
- hud/cli/flows/tests/test_dev.py +126 -0
- hud/cli/init.py +60 -58
- hud/cli/push.py +29 -11
- hud/cli/rft.py +311 -0
- hud/cli/rft_status.py +145 -0
- hud/cli/tests/test_analyze.py +5 -5
- hud/cli/tests/test_analyze_metadata.py +3 -2
- hud/cli/tests/test_analyze_module.py +120 -0
- hud/cli/tests/test_build.py +108 -6
- hud/cli/tests/test_build_failure.py +41 -0
- hud/cli/tests/test_build_module.py +50 -0
- hud/cli/tests/test_cli_init.py +6 -1
- hud/cli/tests/test_cli_more_wrappers.py +30 -0
- hud/cli/tests/test_cli_root.py +140 -0
- hud/cli/tests/test_convert.py +361 -0
- hud/cli/tests/test_debug.py +12 -10
- hud/cli/tests/test_dev.py +197 -0
- hud/cli/tests/test_eval.py +251 -0
- hud/cli/tests/test_eval_bedrock.py +51 -0
- hud/cli/tests/test_init.py +124 -0
- hud/cli/tests/test_main_module.py +11 -5
- hud/cli/tests/test_mcp_server.py +12 -100
- hud/cli/tests/test_push_happy.py +74 -0
- hud/cli/tests/test_push_wrapper.py +23 -0
- hud/cli/tests/test_registry.py +1 -1
- hud/cli/tests/test_utils.py +1 -1
- hud/cli/{rl → utils}/celebrate.py +14 -12
- hud/cli/utils/config.py +18 -1
- hud/cli/utils/docker.py +130 -4
- hud/cli/utils/env_check.py +9 -9
- hud/cli/utils/git.py +136 -0
- hud/cli/utils/interactive.py +39 -5
- hud/cli/utils/metadata.py +69 -0
- hud/cli/utils/runner.py +1 -1
- hud/cli/utils/server.py +2 -2
- hud/cli/utils/source_hash.py +3 -3
- hud/cli/utils/tasks.py +4 -1
- hud/cli/utils/tests/__init__.py +0 -0
- hud/cli/utils/tests/test_config.py +58 -0
- hud/cli/utils/tests/test_docker.py +93 -0
- hud/cli/utils/tests/test_docker_hints.py +71 -0
- hud/cli/utils/tests/test_env_check.py +74 -0
- hud/cli/utils/tests/test_environment.py +42 -0
- hud/cli/utils/tests/test_git.py +142 -0
- hud/cli/utils/tests/test_interactive_module.py +60 -0
- hud/cli/utils/tests/test_local_runner.py +50 -0
- hud/cli/utils/tests/test_logging_utils.py +23 -0
- hud/cli/utils/tests/test_metadata.py +49 -0
- hud/cli/utils/tests/test_package_runner.py +35 -0
- hud/cli/utils/tests/test_registry_utils.py +49 -0
- hud/cli/utils/tests/test_remote_runner.py +25 -0
- hud/cli/utils/tests/test_runner_modules.py +52 -0
- hud/cli/utils/tests/test_source_hash.py +36 -0
- hud/cli/utils/tests/test_tasks.py +80 -0
- hud/cli/utils/version_check.py +258 -0
- hud/cli/{rl → utils}/viewer.py +2 -2
- hud/clients/README.md +12 -11
- hud/clients/__init__.py +4 -3
- hud/clients/base.py +166 -26
- hud/clients/environment.py +51 -0
- hud/clients/fastmcp.py +13 -6
- hud/clients/mcp_use.py +40 -15
- hud/clients/tests/test_analyze_scenarios.py +206 -0
- hud/clients/tests/test_protocol.py +9 -3
- hud/datasets/__init__.py +23 -20
- hud/datasets/loader.py +327 -0
- hud/datasets/runner.py +192 -105
- hud/datasets/tests/__init__.py +0 -0
- hud/datasets/tests/test_loader.py +221 -0
- hud/datasets/tests/test_utils.py +315 -0
- hud/datasets/utils.py +270 -90
- hud/environment/__init__.py +50 -0
- hud/environment/connection.py +206 -0
- hud/environment/connectors/__init__.py +33 -0
- hud/environment/connectors/base.py +68 -0
- hud/environment/connectors/local.py +177 -0
- hud/environment/connectors/mcp_config.py +109 -0
- hud/environment/connectors/openai.py +101 -0
- hud/environment/connectors/remote.py +172 -0
- hud/environment/environment.py +694 -0
- hud/environment/integrations/__init__.py +45 -0
- hud/environment/integrations/adk.py +67 -0
- hud/environment/integrations/anthropic.py +196 -0
- hud/environment/integrations/gemini.py +92 -0
- hud/environment/integrations/langchain.py +82 -0
- hud/environment/integrations/llamaindex.py +68 -0
- hud/environment/integrations/openai.py +238 -0
- hud/environment/mock.py +306 -0
- hud/environment/router.py +112 -0
- hud/environment/scenarios.py +493 -0
- hud/environment/tests/__init__.py +1 -0
- hud/environment/tests/test_connection.py +317 -0
- hud/environment/tests/test_connectors.py +218 -0
- hud/environment/tests/test_environment.py +161 -0
- hud/environment/tests/test_integrations.py +257 -0
- hud/environment/tests/test_local_connectors.py +201 -0
- hud/environment/tests/test_scenarios.py +280 -0
- hud/environment/tests/test_tools.py +208 -0
- hud/environment/types.py +23 -0
- hud/environment/utils/__init__.py +35 -0
- hud/environment/utils/formats.py +215 -0
- hud/environment/utils/schema.py +171 -0
- hud/environment/utils/tool_wrappers.py +113 -0
- hud/eval/__init__.py +67 -0
- hud/eval/context.py +674 -0
- hud/eval/display.py +299 -0
- hud/eval/instrument.py +185 -0
- hud/eval/manager.py +466 -0
- hud/eval/parallel.py +268 -0
- hud/eval/task.py +340 -0
- hud/eval/tests/__init__.py +1 -0
- hud/eval/tests/test_context.py +178 -0
- hud/eval/tests/test_eval.py +210 -0
- hud/eval/tests/test_manager.py +152 -0
- hud/eval/tests/test_parallel.py +168 -0
- hud/eval/tests/test_task.py +145 -0
- hud/eval/types.py +63 -0
- hud/eval/utils.py +183 -0
- hud/patches/__init__.py +19 -0
- hud/patches/mcp_patches.py +151 -0
- hud/patches/warnings.py +54 -0
- hud/samples/browser.py +4 -4
- hud/server/__init__.py +2 -1
- hud/server/low_level.py +2 -1
- hud/server/router.py +164 -0
- hud/server/server.py +567 -80
- hud/server/tests/test_mcp_server_integration.py +11 -11
- hud/server/tests/test_mcp_server_more.py +1 -1
- hud/server/tests/test_server_extra.py +2 -0
- hud/settings.py +45 -3
- hud/shared/exceptions.py +36 -10
- hud/shared/hints.py +26 -1
- hud/shared/requests.py +15 -3
- hud/shared/tests/test_exceptions.py +40 -31
- hud/shared/tests/test_hints.py +167 -0
- hud/telemetry/__init__.py +20 -19
- hud/telemetry/exporter.py +201 -0
- hud/telemetry/instrument.py +158 -253
- hud/telemetry/tests/test_eval_telemetry.py +356 -0
- hud/telemetry/tests/test_exporter.py +258 -0
- hud/telemetry/tests/test_instrument.py +401 -0
- hud/tools/__init__.py +16 -2
- hud/tools/apply_patch.py +639 -0
- hud/tools/base.py +54 -4
- hud/tools/bash.py +2 -2
- hud/tools/computer/__init__.py +4 -0
- hud/tools/computer/anthropic.py +2 -2
- hud/tools/computer/gemini.py +385 -0
- hud/tools/computer/hud.py +23 -6
- hud/tools/computer/openai.py +20 -21
- hud/tools/computer/qwen.py +434 -0
- hud/tools/computer/settings.py +37 -0
- hud/tools/edit.py +3 -7
- hud/tools/executors/base.py +4 -2
- hud/tools/executors/pyautogui.py +1 -1
- hud/tools/grounding/grounded_tool.py +13 -18
- hud/tools/grounding/grounder.py +10 -31
- hud/tools/grounding/tests/test_grounded_tool.py +26 -44
- hud/tools/jupyter.py +330 -0
- hud/tools/playwright.py +18 -3
- hud/tools/shell.py +308 -0
- hud/tools/tests/test_apply_patch.py +718 -0
- hud/tools/tests/test_computer.py +4 -9
- hud/tools/tests/test_computer_actions.py +24 -2
- hud/tools/tests/test_jupyter_tool.py +181 -0
- hud/tools/tests/test_shell.py +596 -0
- hud/tools/tests/test_submit.py +85 -0
- hud/tools/tests/test_types.py +193 -0
- hud/tools/types.py +21 -1
- hud/types.py +167 -57
- hud/utils/__init__.py +2 -0
- hud/utils/env.py +67 -0
- hud/utils/hud_console.py +61 -3
- hud/utils/mcp.py +15 -58
- hud/utils/strict_schema.py +162 -0
- hud/utils/tests/test_init.py +1 -2
- hud/utils/tests/test_mcp.py +1 -28
- hud/utils/tests/test_pretty_errors.py +186 -0
- hud/utils/tests/test_tool_shorthand.py +154 -0
- hud/utils/tests/test_version.py +1 -1
- hud/utils/types.py +20 -0
- hud/version.py +1 -1
- hud_python-0.5.1.dist-info/METADATA +264 -0
- hud_python-0.5.1.dist-info/RECORD +299 -0
- {hud_python-0.4.45.dist-info → hud_python-0.5.1.dist-info}/WHEEL +1 -1
- hud/agents/langchain.py +0 -261
- hud/agents/lite_llm.py +0 -72
- hud/cli/rl/__init__.py +0 -180
- hud/cli/rl/config.py +0 -101
- hud/cli/rl/display.py +0 -133
- hud/cli/rl/gpu.py +0 -63
- hud/cli/rl/gpu_utils.py +0 -321
- hud/cli/rl/local_runner.py +0 -595
- hud/cli/rl/presets.py +0 -96
- hud/cli/rl/remote_runner.py +0 -463
- hud/cli/rl/rl_api.py +0 -150
- hud/cli/rl/vllm.py +0 -177
- hud/cli/rl/wait_utils.py +0 -89
- hud/datasets/parallel.py +0 -687
- hud/misc/__init__.py +0 -1
- hud/misc/claude_plays_pokemon.py +0 -292
- hud/otel/__init__.py +0 -35
- hud/otel/collector.py +0 -142
- hud/otel/config.py +0 -181
- hud/otel/context.py +0 -570
- hud/otel/exporters.py +0 -369
- hud/otel/instrumentation.py +0 -135
- hud/otel/processors.py +0 -121
- hud/otel/tests/__init__.py +0 -1
- hud/otel/tests/test_processors.py +0 -197
- hud/rl/README.md +0 -30
- hud/rl/__init__.py +0 -1
- hud/rl/actor.py +0 -176
- hud/rl/buffer.py +0 -405
- hud/rl/chat_template.jinja +0 -101
- hud/rl/config.py +0 -192
- hud/rl/distributed.py +0 -132
- hud/rl/learner.py +0 -637
- hud/rl/tests/__init__.py +0 -1
- hud/rl/tests/test_learner.py +0 -186
- hud/rl/train.py +0 -382
- hud/rl/types.py +0 -101
- hud/rl/utils/start_vllm_server.sh +0 -30
- hud/rl/utils.py +0 -524
- hud/rl/vllm_adapter.py +0 -143
- hud/telemetry/job.py +0 -352
- hud/telemetry/replay.py +0 -74
- hud/telemetry/tests/test_replay.py +0 -40
- hud/telemetry/tests/test_trace.py +0 -63
- hud/telemetry/trace.py +0 -158
- hud/utils/agent_factories.py +0 -86
- hud/utils/async_utils.py +0 -65
- hud/utils/group_eval.py +0 -223
- hud/utils/progress.py +0 -149
- hud/utils/tasks.py +0 -127
- hud/utils/tests/test_async_utils.py +0 -173
- hud/utils/tests/test_progress.py +0 -261
- hud_python-0.4.45.dist-info/METADATA +0 -552
- hud_python-0.4.45.dist-info/RECORD +0 -228
- {hud_python-0.4.45.dist-info → hud_python-0.5.1.dist-info}/entry_points.txt +0 -0
- {hud_python-0.4.45.dist-info → hud_python-0.5.1.dist-info}/licenses/LICENSE +0 -0
hud/tools/computer/openai.py
CHANGED
|
@@ -9,7 +9,7 @@ from mcp.types import INTERNAL_ERROR, INVALID_PARAMS, ContentBlock, TextContent
|
|
|
9
9
|
from pydantic import Field
|
|
10
10
|
|
|
11
11
|
from hud.tools.computer.settings import computer_settings
|
|
12
|
-
from hud.tools.types import ContentResult
|
|
12
|
+
from hud.tools.types import ContentResult, Coordinate
|
|
13
13
|
|
|
14
14
|
from .hud import HudComputerTool
|
|
15
15
|
|
|
@@ -18,6 +18,7 @@ if TYPE_CHECKING:
|
|
|
18
18
|
|
|
19
19
|
logger = logging.getLogger(__name__)
|
|
20
20
|
|
|
21
|
+
|
|
21
22
|
# Map OpenAI key names to CLA standard keys
|
|
22
23
|
OPENAI_TO_CLA_KEYS = {
|
|
23
24
|
# Common variations
|
|
@@ -95,14 +96,26 @@ class OpenAIComputerTool(HudComputerTool):
|
|
|
95
96
|
# OpenAI uses lowercase key names
|
|
96
97
|
return OPENAI_TO_CLA_KEYS.get(key.lower(), key.lower())
|
|
97
98
|
|
|
98
|
-
async def __call__(
|
|
99
|
+
async def __call__( # type: ignore[override]
|
|
99
100
|
self,
|
|
100
|
-
type:
|
|
101
|
+
type: Literal[
|
|
102
|
+
"screenshot",
|
|
103
|
+
"click",
|
|
104
|
+
"double_click",
|
|
105
|
+
"scroll",
|
|
106
|
+
"type",
|
|
107
|
+
"wait",
|
|
108
|
+
"move",
|
|
109
|
+
"keypress",
|
|
110
|
+
"drag",
|
|
111
|
+
"response",
|
|
112
|
+
"custom",
|
|
113
|
+
] = Field(..., description="The action type to perform"),
|
|
101
114
|
# Coordinate parameters
|
|
102
115
|
x: int | None = Field(None, description="X coordinate for click/move/scroll actions"),
|
|
103
116
|
y: int | None = Field(None, description="Y coordinate for click/move/scroll actions"),
|
|
104
117
|
# Button parameter
|
|
105
|
-
button:
|
|
118
|
+
button: Literal["left", "right", "middle", "back", "forward"] | None = Field(
|
|
106
119
|
None, description="Mouse button for click actions (left, right, middle, wheel)"
|
|
107
120
|
),
|
|
108
121
|
# Text parameter
|
|
@@ -115,7 +128,7 @@ class OpenAIComputerTool(HudComputerTool):
|
|
|
115
128
|
# Key press parameter
|
|
116
129
|
keys: list[str] | None = Field(None, description="Keys to press"),
|
|
117
130
|
# Drag parameter
|
|
118
|
-
path: list[
|
|
131
|
+
path: list[Coordinate] | None = Field(
|
|
119
132
|
None, description="Path for drag actions as list of {x, y} dicts"
|
|
120
133
|
),
|
|
121
134
|
# Custom action parameter
|
|
@@ -131,11 +144,6 @@ class OpenAIComputerTool(HudComputerTool):
|
|
|
131
144
|
"""
|
|
132
145
|
logger.info("OpenAIComputerTool received type: %s", type)
|
|
133
146
|
|
|
134
|
-
# Map button names
|
|
135
|
-
button_map = {"wheel": "middle"}
|
|
136
|
-
if button:
|
|
137
|
-
button = button_map.get(button, button)
|
|
138
|
-
|
|
139
147
|
# Process based on action type
|
|
140
148
|
if type == "screenshot":
|
|
141
149
|
screenshot_base64 = await self.executor.screenshot()
|
|
@@ -227,17 +235,8 @@ class OpenAIComputerTool(HudComputerTool):
|
|
|
227
235
|
)
|
|
228
236
|
)
|
|
229
237
|
|
|
230
|
-
# Convert path from list of
|
|
231
|
-
drag_path = []
|
|
232
|
-
for point in path:
|
|
233
|
-
if "x" in point and "y" in point:
|
|
234
|
-
drag_path.append((point["x"], point["y"]))
|
|
235
|
-
else:
|
|
236
|
-
raise McpError(
|
|
237
|
-
ErrorData(
|
|
238
|
-
code=INVALID_PARAMS, message="Each point in path must have x and y"
|
|
239
|
-
)
|
|
240
|
-
)
|
|
238
|
+
# Convert path from list of Coordinate objects to list of tuples
|
|
239
|
+
drag_path = [(point.x, point.y) for point in path]
|
|
241
240
|
|
|
242
241
|
scaled_path = self._scale_path(drag_path)
|
|
243
242
|
result = await self.executor.drag(path=scaled_path)
|
|
@@ -0,0 +1,434 @@
|
|
|
1
|
+
# flake8: noqa: B008
|
|
2
|
+
from __future__ import annotations
|
|
3
|
+
|
|
4
|
+
import logging
|
|
5
|
+
import re
|
|
6
|
+
from typing import TYPE_CHECKING, Any, Literal
|
|
7
|
+
|
|
8
|
+
from mcp import ErrorData, McpError
|
|
9
|
+
from mcp.types import INTERNAL_ERROR, INVALID_PARAMS, ContentBlock
|
|
10
|
+
from pydantic import Field
|
|
11
|
+
|
|
12
|
+
from hud.tools.types import ContentResult
|
|
13
|
+
|
|
14
|
+
from .hud import HudComputerTool
|
|
15
|
+
from .settings import computer_settings
|
|
16
|
+
|
|
17
|
+
if TYPE_CHECKING:
|
|
18
|
+
from hud.tools.executors.base import BaseExecutor
|
|
19
|
+
|
|
20
|
+
logger = logging.getLogger(__name__)
|
|
21
|
+
|
|
22
|
+
|
|
23
|
+
class QwenComputerTool(HudComputerTool):
|
|
24
|
+
"""
|
|
25
|
+
Qwen Computer Use tool for interacting with the computer.
|
|
26
|
+
"""
|
|
27
|
+
|
|
28
|
+
name: str = "computer_use"
|
|
29
|
+
api_type: str = "computer_use"
|
|
30
|
+
|
|
31
|
+
def __init__(
|
|
32
|
+
self,
|
|
33
|
+
# Define within environment based on platform
|
|
34
|
+
executor: BaseExecutor | None = None,
|
|
35
|
+
platform_type: Literal["auto", "xdo", "pyautogui"] = "auto",
|
|
36
|
+
display_num: int | None = None,
|
|
37
|
+
# Overrides for what dimensions the agent thinks it operates in
|
|
38
|
+
width: int = computer_settings.QWEN_COMPUTER_WIDTH,
|
|
39
|
+
height: int = computer_settings.QWEN_COMPUTER_HEIGHT,
|
|
40
|
+
rescale_images: bool = computer_settings.QWEN_RESCALE_IMAGES,
|
|
41
|
+
# What the agent sees as the tool's name, title, and description
|
|
42
|
+
name: str | None = None,
|
|
43
|
+
title: str | None = None,
|
|
44
|
+
description: str | None = None,
|
|
45
|
+
**kwargs: Any,
|
|
46
|
+
) -> None:
|
|
47
|
+
"""
|
|
48
|
+
Initialize with Qwen's default dimensions.
|
|
49
|
+
|
|
50
|
+
Args:
|
|
51
|
+
width: Target width for rescaling (None = use environment width)
|
|
52
|
+
height: Target height for rescaling (None = use environment height)
|
|
53
|
+
rescale_images: If True, rescale screenshots. If False, only rescale action coordinates
|
|
54
|
+
name: Tool name for MCP registration (auto-generated from class name if not provided)
|
|
55
|
+
title: Human-readable display name for the tool (auto-generated from class name)
|
|
56
|
+
description: Tool description (auto-generated from docstring if not provided)
|
|
57
|
+
"""
|
|
58
|
+
# Store dimensions for description
|
|
59
|
+
self.display_width_px = width
|
|
60
|
+
self.display_height_px = height
|
|
61
|
+
|
|
62
|
+
# Build custom description with resolution info
|
|
63
|
+
custom_description = (
|
|
64
|
+
description
|
|
65
|
+
or f"""
|
|
66
|
+
Use a mouse and keyboard to interact with a computer, and take screenshots.
|
|
67
|
+
* This is an interface to a desktop GUI. You do not have access to a terminal or
|
|
68
|
+
applications menu. You must click on desktop icons to start applications.
|
|
69
|
+
* Some applications may take time to start or process actions, so you may need to
|
|
70
|
+
wait and take successive screenshots to see the results of your actions. E.g. if you
|
|
71
|
+
click on Firefox and a window doesn't open, try wait and taking another screenshot.
|
|
72
|
+
* The screen's resolution is {width}x{height}.
|
|
73
|
+
* Whenever you intend to move the cursor to click on an element like an icon, you
|
|
74
|
+
should consult a screenshot to determine the coordinates of the element before
|
|
75
|
+
moving the cursor.
|
|
76
|
+
* If you tried clicking on a program or link but it failed to load, even after
|
|
77
|
+
waiting, try adjusting your cursor position so that the tip of the cursor visually
|
|
78
|
+
falls on the element that you want to click.
|
|
79
|
+
* Make sure to click any buttons, links, icons, etc with the cursor tip in the
|
|
80
|
+
center of the element. Don't click boxes on their edges.
|
|
81
|
+
""".strip()
|
|
82
|
+
)
|
|
83
|
+
|
|
84
|
+
super().__init__(
|
|
85
|
+
executor=executor,
|
|
86
|
+
platform_type=platform_type,
|
|
87
|
+
display_num=display_num,
|
|
88
|
+
width=width,
|
|
89
|
+
height=height,
|
|
90
|
+
rescale_images=rescale_images,
|
|
91
|
+
name=name or "qwen_computer",
|
|
92
|
+
title=title or "Qwen Computer Tool",
|
|
93
|
+
description=custom_description,
|
|
94
|
+
**kwargs,
|
|
95
|
+
)
|
|
96
|
+
|
|
97
|
+
def to_params(self) -> dict:
|
|
98
|
+
"""Convert to Qwen tool parameters."""
|
|
99
|
+
return {
|
|
100
|
+
"type": self.api_type,
|
|
101
|
+
"name": self.name,
|
|
102
|
+
"display_width_px": self.display_width_px,
|
|
103
|
+
"display_height_px": self.display_height_px,
|
|
104
|
+
"description": self.description,
|
|
105
|
+
"parameters": {
|
|
106
|
+
"properties": {
|
|
107
|
+
"action": {
|
|
108
|
+
"description": """
|
|
109
|
+
The action to perform. The available actions are:
|
|
110
|
+
* `key`: Performs key down presses on the arguments passed in order, then performs
|
|
111
|
+
key releases in reverse order.
|
|
112
|
+
* `type`: Type a string of text on the keyboard.
|
|
113
|
+
* `mouse_move`: Move the cursor to a specified (x, y) pixel coordinate on the
|
|
114
|
+
screen.
|
|
115
|
+
* `left_click`: Click the left mouse button at a specified (x, y) pixel coordinate
|
|
116
|
+
on the screen.
|
|
117
|
+
* `left_click_drag`: Click and drag the cursor to a specified (x, y) pixel
|
|
118
|
+
coordinate on the screen.
|
|
119
|
+
* `right_click`: Click the right mouse button at a specified (x, y) pixel
|
|
120
|
+
coordinate on the screen.
|
|
121
|
+
* `middle_click`: Click the middle mouse button at a specified (x, y) pixel
|
|
122
|
+
coordinate on the screen.
|
|
123
|
+
* `double_click`: Double-click the left mouse button at a specified (x, y) pixel
|
|
124
|
+
coordinate on the screen.
|
|
125
|
+
* `triple_click`: Triple-click the left mouse button at a specified (x, y) pixel
|
|
126
|
+
coordinate on the screen.
|
|
127
|
+
* `scroll`: Performs a scroll of the mouse scroll wheel.
|
|
128
|
+
* `hscroll`: Performs a horizontal scroll.
|
|
129
|
+
* `wait`: Wait specified seconds for the change to happen.
|
|
130
|
+
* `terminate`: Terminate the current task and report its completion status
|
|
131
|
+
(NOT SUPPORTED).
|
|
132
|
+
* `answer`: Answer a question (NOT SUPPORTED).
|
|
133
|
+
""".strip(),
|
|
134
|
+
"enum": [
|
|
135
|
+
"key",
|
|
136
|
+
"type",
|
|
137
|
+
"mouse_move",
|
|
138
|
+
"left_click",
|
|
139
|
+
"left_click_drag",
|
|
140
|
+
"right_click",
|
|
141
|
+
"middle_click",
|
|
142
|
+
"double_click",
|
|
143
|
+
"triple_click",
|
|
144
|
+
"scroll",
|
|
145
|
+
"hscroll",
|
|
146
|
+
"wait",
|
|
147
|
+
"terminate",
|
|
148
|
+
"answer",
|
|
149
|
+
],
|
|
150
|
+
"type": "string",
|
|
151
|
+
},
|
|
152
|
+
"keys": {
|
|
153
|
+
"description": "Required only by `action=key`.",
|
|
154
|
+
"type": "array",
|
|
155
|
+
},
|
|
156
|
+
"text": {
|
|
157
|
+
"description": "Required only by `action=type` and `action=answer`.",
|
|
158
|
+
"type": "string",
|
|
159
|
+
},
|
|
160
|
+
"coordinate": {
|
|
161
|
+
"description": (
|
|
162
|
+
"(x, y): The x (pixels from the left edge) and y "
|
|
163
|
+
"(pixels from the top edge) coordinates to move the mouse to."
|
|
164
|
+
),
|
|
165
|
+
"type": "array",
|
|
166
|
+
},
|
|
167
|
+
"pixels": {
|
|
168
|
+
"description": (
|
|
169
|
+
"The amount of scrolling to perform. Positive values scroll up, "
|
|
170
|
+
"negative values scroll down. Required only by `action=scroll` "
|
|
171
|
+
"and `action=hscroll`."
|
|
172
|
+
),
|
|
173
|
+
"type": "number",
|
|
174
|
+
},
|
|
175
|
+
"time": {
|
|
176
|
+
"description": "The seconds to wait. Required only by `action=wait`.",
|
|
177
|
+
"type": "number",
|
|
178
|
+
},
|
|
179
|
+
"status": {
|
|
180
|
+
"description": (
|
|
181
|
+
"The status of the task. Required only by `action=terminate`."
|
|
182
|
+
),
|
|
183
|
+
"type": "string",
|
|
184
|
+
"enum": ["success", "failure"],
|
|
185
|
+
},
|
|
186
|
+
},
|
|
187
|
+
"required": ["action"],
|
|
188
|
+
"type": "object",
|
|
189
|
+
},
|
|
190
|
+
}
|
|
191
|
+
|
|
192
|
+
async def __call__(
|
|
193
|
+
self,
|
|
194
|
+
action: str = Field(..., description="The action to perform on the computer"),
|
|
195
|
+
keys: list[str] | None = Field(None, description="Keys for key action"),
|
|
196
|
+
text: str | None = Field(None, description="Text to type"),
|
|
197
|
+
coordinate: list[int] | None = Field(
|
|
198
|
+
None, description="The coordinate to interact with on the computer [x, y]"
|
|
199
|
+
),
|
|
200
|
+
pixels: int | None = Field(None, description="Pixels to scroll"),
|
|
201
|
+
time: float | None = Field(None, description="Time to wait in seconds"),
|
|
202
|
+
status: str | None = Field(None, description="Status for terminate action"),
|
|
203
|
+
) -> list[ContentBlock]:
|
|
204
|
+
"""
|
|
205
|
+
Handle Qwen Computer Use API calls.
|
|
206
|
+
|
|
207
|
+
This converts Qwen's action format to HudComputerTool's format.
|
|
208
|
+
|
|
209
|
+
Returns:
|
|
210
|
+
List of MCP content blocks
|
|
211
|
+
"""
|
|
212
|
+
logger.info("QwenComputerTool received action: %s", action)
|
|
213
|
+
|
|
214
|
+
# Handle non-computer actions that should raise errors
|
|
215
|
+
if action == "terminate":
|
|
216
|
+
raise McpError(
|
|
217
|
+
ErrorData(
|
|
218
|
+
code=INVALID_PARAMS,
|
|
219
|
+
message=(
|
|
220
|
+
"terminate action is not supported for computer control. This is a no-op."
|
|
221
|
+
),
|
|
222
|
+
)
|
|
223
|
+
)
|
|
224
|
+
|
|
225
|
+
if action == "answer":
|
|
226
|
+
raise McpError(
|
|
227
|
+
ErrorData(
|
|
228
|
+
code=INVALID_PARAMS,
|
|
229
|
+
message="answer action is not supported for computer control. This is a no-op.",
|
|
230
|
+
)
|
|
231
|
+
)
|
|
232
|
+
|
|
233
|
+
# Convert lists to tuples if needed
|
|
234
|
+
coord_tuple = None
|
|
235
|
+
if coordinate:
|
|
236
|
+
coord_tuple = tuple(coordinate) if isinstance(coordinate, list) else coordinate
|
|
237
|
+
|
|
238
|
+
# Map Qwen actions to HudComputerTool actions
|
|
239
|
+
if action == "left_click":
|
|
240
|
+
if coord_tuple and len(coord_tuple) >= 2:
|
|
241
|
+
scaled_x, scaled_y = self._scale_coordinates(coord_tuple[0], coord_tuple[1])
|
|
242
|
+
logger.info("Scaled coordinates: %s, %s", scaled_x, scaled_y)
|
|
243
|
+
result = await self.executor.click(x=scaled_x, y=scaled_y)
|
|
244
|
+
else:
|
|
245
|
+
raise McpError(
|
|
246
|
+
ErrorData(code=INVALID_PARAMS, message="coordinate is required for left_click")
|
|
247
|
+
)
|
|
248
|
+
|
|
249
|
+
elif action == "double_click":
|
|
250
|
+
if coord_tuple and len(coord_tuple) >= 2:
|
|
251
|
+
# Use pattern for double-click
|
|
252
|
+
scaled_x, scaled_y = self._scale_coordinates(coord_tuple[0], coord_tuple[1])
|
|
253
|
+
result = await self.executor.click(x=scaled_x, y=scaled_y, pattern=[100])
|
|
254
|
+
else:
|
|
255
|
+
raise McpError(
|
|
256
|
+
ErrorData(
|
|
257
|
+
code=INVALID_PARAMS, message="coordinate is required for double_click"
|
|
258
|
+
)
|
|
259
|
+
)
|
|
260
|
+
|
|
261
|
+
elif action == "triple_click":
|
|
262
|
+
if coord_tuple and len(coord_tuple) >= 2:
|
|
263
|
+
# Use pattern for triple-click (simulated as double-click)
|
|
264
|
+
scaled_x, scaled_y = self._scale_coordinates(coord_tuple[0], coord_tuple[1])
|
|
265
|
+
# Note: triple-click simulated as double-click as per requirement
|
|
266
|
+
result = await self.executor.click(x=scaled_x, y=scaled_y, pattern=[100])
|
|
267
|
+
else:
|
|
268
|
+
raise McpError(
|
|
269
|
+
ErrorData(
|
|
270
|
+
code=INVALID_PARAMS, message="coordinate is required for triple_click"
|
|
271
|
+
)
|
|
272
|
+
)
|
|
273
|
+
|
|
274
|
+
elif action == "right_click":
|
|
275
|
+
if coord_tuple and len(coord_tuple) >= 2:
|
|
276
|
+
scaled_x, scaled_y = self._scale_coordinates(coord_tuple[0], coord_tuple[1])
|
|
277
|
+
result = await self.executor.click(x=scaled_x, y=scaled_y, button="right")
|
|
278
|
+
else:
|
|
279
|
+
raise McpError(
|
|
280
|
+
ErrorData(code=INVALID_PARAMS, message="coordinate is required for right_click")
|
|
281
|
+
)
|
|
282
|
+
|
|
283
|
+
elif action == "middle_click":
|
|
284
|
+
if coord_tuple and len(coord_tuple) >= 2:
|
|
285
|
+
scaled_x, scaled_y = self._scale_coordinates(coord_tuple[0], coord_tuple[1])
|
|
286
|
+
result = await self.executor.click(x=scaled_x, y=scaled_y, button="middle")
|
|
287
|
+
else:
|
|
288
|
+
raise McpError(
|
|
289
|
+
ErrorData(
|
|
290
|
+
code=INVALID_PARAMS, message="coordinate is required for middle_click"
|
|
291
|
+
)
|
|
292
|
+
)
|
|
293
|
+
|
|
294
|
+
elif action == "mouse_move":
|
|
295
|
+
if coord_tuple and len(coord_tuple) >= 2:
|
|
296
|
+
scaled_x, scaled_y = self._scale_coordinates(coord_tuple[0], coord_tuple[1])
|
|
297
|
+
result = await self.executor.move(x=scaled_x, y=scaled_y)
|
|
298
|
+
else:
|
|
299
|
+
raise McpError(
|
|
300
|
+
ErrorData(code=INVALID_PARAMS, message="coordinate is required for mouse_move")
|
|
301
|
+
)
|
|
302
|
+
|
|
303
|
+
elif action == "type":
|
|
304
|
+
if text:
|
|
305
|
+
result = await self.executor.write(text=text)
|
|
306
|
+
else:
|
|
307
|
+
raise McpError(ErrorData(code=INVALID_PARAMS, message="text is required for type"))
|
|
308
|
+
|
|
309
|
+
elif action == "key":
|
|
310
|
+
if keys:
|
|
311
|
+
# Qwen sends an array of keys to press
|
|
312
|
+
result = await self.executor.press(keys=keys)
|
|
313
|
+
else:
|
|
314
|
+
raise McpError(ErrorData(code=INVALID_PARAMS, message="keys is required for key"))
|
|
315
|
+
|
|
316
|
+
elif action == "scroll":
|
|
317
|
+
if pixels is None:
|
|
318
|
+
raise McpError(
|
|
319
|
+
ErrorData(code=INVALID_PARAMS, message="pixels is required for scroll")
|
|
320
|
+
)
|
|
321
|
+
|
|
322
|
+
# Qwen's pixels: positive scrolls up, negative scrolls down
|
|
323
|
+
# HUD's scroll_y: positive scrolls down, negative scrolls up
|
|
324
|
+
# So we need to negate the value
|
|
325
|
+
scroll_y = -pixels
|
|
326
|
+
|
|
327
|
+
if coord_tuple and len(coord_tuple) >= 2:
|
|
328
|
+
scaled_x, scaled_y = self._scale_coordinates(coord_tuple[0], coord_tuple[1])
|
|
329
|
+
result = await self.executor.scroll(x=scaled_x, y=scaled_y, scroll_y=scroll_y)
|
|
330
|
+
else:
|
|
331
|
+
result = await self.executor.scroll(scroll_y=scroll_y)
|
|
332
|
+
|
|
333
|
+
elif action == "hscroll":
|
|
334
|
+
if pixels is None:
|
|
335
|
+
raise McpError(
|
|
336
|
+
ErrorData(code=INVALID_PARAMS, message="pixels is required for hscroll")
|
|
337
|
+
)
|
|
338
|
+
|
|
339
|
+
# For horizontal scroll, positive values scroll right, negative scroll left
|
|
340
|
+
scroll_x = pixels
|
|
341
|
+
|
|
342
|
+
if coord_tuple and len(coord_tuple) >= 2:
|
|
343
|
+
scaled_x, scaled_y = self._scale_coordinates(coord_tuple[0], coord_tuple[1])
|
|
344
|
+
result = await self.executor.scroll(x=scaled_x, y=scaled_y, scroll_x=scroll_x)
|
|
345
|
+
else:
|
|
346
|
+
result = await self.executor.scroll(scroll_x=scroll_x)
|
|
347
|
+
|
|
348
|
+
elif action == "left_click_drag":
|
|
349
|
+
if coord_tuple and len(coord_tuple) >= 2:
|
|
350
|
+
# For drag, we need a path. Qwen provides the end coordinate.
|
|
351
|
+
# We'll get the current position and drag from there to the target
|
|
352
|
+
current_pos = await self.executor.position()
|
|
353
|
+
if isinstance(current_pos, ContentResult) and current_pos.output:
|
|
354
|
+
# Parse the position from the output
|
|
355
|
+
match = re.search(r"x=(\d+), y=(\d+)", current_pos.output)
|
|
356
|
+
if match:
|
|
357
|
+
# Current position is in screen coordinates
|
|
358
|
+
screen_start_x, screen_start_y = int(match.group(1)), int(match.group(2))
|
|
359
|
+
# End position is in agent coordinates, needs scaling
|
|
360
|
+
scaled_end_x, scaled_end_y = self._scale_coordinates(
|
|
361
|
+
coord_tuple[0], coord_tuple[1]
|
|
362
|
+
)
|
|
363
|
+
# Create path in screen coordinates
|
|
364
|
+
path = [(screen_start_x, screen_start_y), (scaled_end_x, scaled_end_y)]
|
|
365
|
+
# Path is already in screen coordinates, no need to scale again
|
|
366
|
+
result = await self.executor.drag(path=path)
|
|
367
|
+
else:
|
|
368
|
+
raise McpError(
|
|
369
|
+
ErrorData(
|
|
370
|
+
code=INTERNAL_ERROR, message="Failed to parse current position"
|
|
371
|
+
)
|
|
372
|
+
)
|
|
373
|
+
else:
|
|
374
|
+
raise McpError(
|
|
375
|
+
ErrorData(code=INTERNAL_ERROR, message="Failed to get current position")
|
|
376
|
+
)
|
|
377
|
+
else:
|
|
378
|
+
raise McpError(
|
|
379
|
+
ErrorData(
|
|
380
|
+
code=INVALID_PARAMS, message="coordinate is required for left_click_drag"
|
|
381
|
+
)
|
|
382
|
+
)
|
|
383
|
+
|
|
384
|
+
elif action == "wait":
|
|
385
|
+
if time is None:
|
|
386
|
+
raise McpError(ErrorData(code=INVALID_PARAMS, message="time is required for wait"))
|
|
387
|
+
if time < 0:
|
|
388
|
+
raise McpError(ErrorData(code=INVALID_PARAMS, message="time must be non-negative"))
|
|
389
|
+
|
|
390
|
+
# Convert seconds to milliseconds for HudComputerTool
|
|
391
|
+
result = await self.executor.wait(time=int(time * 1000))
|
|
392
|
+
|
|
393
|
+
else:
|
|
394
|
+
# Unknown action
|
|
395
|
+
raise McpError(ErrorData(code=INTERNAL_ERROR, message=f"Invalid action: {action}"))
|
|
396
|
+
|
|
397
|
+
# Rescale screenshot in result if present
|
|
398
|
+
if isinstance(result, ContentResult) and result.base64_image and self.rescale_images:
|
|
399
|
+
rescaled_image = await self._rescale_screenshot(result.base64_image)
|
|
400
|
+
result.base64_image = rescaled_image
|
|
401
|
+
|
|
402
|
+
# Auto-add screenshot for interactive actions
|
|
403
|
+
interactive_actions = {
|
|
404
|
+
"left_click",
|
|
405
|
+
"double_click",
|
|
406
|
+
"triple_click",
|
|
407
|
+
"right_click",
|
|
408
|
+
"middle_click",
|
|
409
|
+
"mouse_move",
|
|
410
|
+
"type",
|
|
411
|
+
"key",
|
|
412
|
+
"scroll",
|
|
413
|
+
"hscroll",
|
|
414
|
+
"left_click_drag",
|
|
415
|
+
}
|
|
416
|
+
|
|
417
|
+
if (
|
|
418
|
+
action in interactive_actions
|
|
419
|
+
and isinstance(result, ContentResult)
|
|
420
|
+
and not result.base64_image
|
|
421
|
+
):
|
|
422
|
+
screenshot_base64 = await self.executor.screenshot()
|
|
423
|
+
if screenshot_base64:
|
|
424
|
+
# Rescale screenshot if requested
|
|
425
|
+
screenshot_base64 = await self._rescale_screenshot(screenshot_base64)
|
|
426
|
+
result = ContentResult(
|
|
427
|
+
# note: we suppress the output since it's not useful
|
|
428
|
+
output="",
|
|
429
|
+
error=result.error,
|
|
430
|
+
base64_image=screenshot_base64,
|
|
431
|
+
)
|
|
432
|
+
|
|
433
|
+
# Convert to content blocks
|
|
434
|
+
return result.to_content_blocks()
|
hud/tools/computer/settings.py
CHANGED
|
@@ -62,6 +62,17 @@ class ComputerSettings(BaseSettings):
|
|
|
62
62
|
validation_alias="OPENAI_COMPUTER_HEIGHT",
|
|
63
63
|
)
|
|
64
64
|
|
|
65
|
+
QWEN_COMPUTER_WIDTH: int = Field(
|
|
66
|
+
default=700,
|
|
67
|
+
description="Width of the display to use for the Qwen computer tools",
|
|
68
|
+
validation_alias="QWEN_COMPUTER_WIDTH",
|
|
69
|
+
)
|
|
70
|
+
QWEN_COMPUTER_HEIGHT: int = Field(
|
|
71
|
+
default=448,
|
|
72
|
+
description="Height of the display to use for the Qwen computer tools",
|
|
73
|
+
validation_alias="QWEN_COMPUTER_HEIGHT",
|
|
74
|
+
)
|
|
75
|
+
|
|
65
76
|
HUD_RESCALE_IMAGES: bool = Field(
|
|
66
77
|
default=False,
|
|
67
78
|
description="Whether to rescale images to the agent width and height",
|
|
@@ -77,6 +88,32 @@ class ComputerSettings(BaseSettings):
|
|
|
77
88
|
description="Whether to rescale images to the agent width and height",
|
|
78
89
|
validation_alias="OPENAI_RESCALE_IMAGES",
|
|
79
90
|
)
|
|
91
|
+
QWEN_RESCALE_IMAGES: bool = Field(
|
|
92
|
+
default=True,
|
|
93
|
+
description="Whether to rescale images to the agent width and height",
|
|
94
|
+
validation_alias="QWEN_RESCALE_IMAGES",
|
|
95
|
+
)
|
|
96
|
+
|
|
97
|
+
GEMINI_COMPUTER_WIDTH: int = Field(
|
|
98
|
+
default=1440,
|
|
99
|
+
description="Width of the display to use for the Gemini computer tools",
|
|
100
|
+
validation_alias="GEMINI_COMPUTER_WIDTH",
|
|
101
|
+
)
|
|
102
|
+
GEMINI_COMPUTER_HEIGHT: int = Field(
|
|
103
|
+
default=900,
|
|
104
|
+
description="Height of the display to use for the Gemini computer tools",
|
|
105
|
+
validation_alias="GEMINI_COMPUTER_HEIGHT",
|
|
106
|
+
)
|
|
107
|
+
GEMINI_RESCALE_IMAGES: bool = Field(
|
|
108
|
+
default=True,
|
|
109
|
+
description="Whether to rescale images to the agent width and height",
|
|
110
|
+
validation_alias="GEMINI_RESCALE_IMAGES",
|
|
111
|
+
)
|
|
112
|
+
GEMINI_MAX_RECENT_TURN_WITH_SCREENSHOTS: int = Field(
|
|
113
|
+
default=3,
|
|
114
|
+
description="Maximum number of recent turns to keep screenshots for in Gemini agent",
|
|
115
|
+
validation_alias="GEMINI_MAX_RECENT_TURN_WITH_SCREENSHOTS",
|
|
116
|
+
)
|
|
80
117
|
|
|
81
118
|
|
|
82
119
|
computer_settings = ComputerSettings()
|
hud/tools/edit.py
CHANGED
|
@@ -1,16 +1,13 @@
|
|
|
1
|
-
from __future__ import annotations
|
|
2
|
-
|
|
3
1
|
from collections import defaultdict
|
|
4
2
|
from pathlib import Path
|
|
5
|
-
from typing import
|
|
3
|
+
from typing import Literal, get_args
|
|
4
|
+
|
|
5
|
+
from mcp.types import ContentBlock
|
|
6
6
|
|
|
7
7
|
from .base import BaseTool
|
|
8
8
|
from .types import ContentResult, ToolError
|
|
9
9
|
from .utils import maybe_truncate, run
|
|
10
10
|
|
|
11
|
-
if TYPE_CHECKING:
|
|
12
|
-
from mcp.types import ContentBlock
|
|
13
|
-
|
|
14
11
|
Command = Literal[
|
|
15
12
|
"view",
|
|
16
13
|
"create",
|
|
@@ -56,7 +53,6 @@ class EditTool(BaseTool):
|
|
|
56
53
|
old_str: str | None = None,
|
|
57
54
|
new_str: str | None = None,
|
|
58
55
|
insert_line: int | None = None,
|
|
59
|
-
**kwargs: Any,
|
|
60
56
|
) -> list[ContentBlock]:
|
|
61
57
|
_path = Path(path)
|
|
62
58
|
self.validate_path(command, _path)
|
hud/tools/executors/base.py
CHANGED
|
@@ -280,7 +280,7 @@ class BaseExecutor:
|
|
|
280
280
|
|
|
281
281
|
# ===== Utility Actions =====
|
|
282
282
|
|
|
283
|
-
async def wait(self, time: int) -> ContentResult:
|
|
283
|
+
async def wait(self, time: int, take_screenshot: bool = True) -> ContentResult:
|
|
284
284
|
"""
|
|
285
285
|
Wait for specified time.
|
|
286
286
|
|
|
@@ -289,7 +289,9 @@ class BaseExecutor:
|
|
|
289
289
|
"""
|
|
290
290
|
duration_seconds = time / 1000.0
|
|
291
291
|
await asyncio.sleep(duration_seconds)
|
|
292
|
-
|
|
292
|
+
# take screenshot
|
|
293
|
+
screenshot = await self.screenshot() if take_screenshot else None
|
|
294
|
+
return ContentResult(output=f"Waited {time}ms", base64_image=screenshot)
|
|
293
295
|
|
|
294
296
|
async def screenshot(self) -> str | None:
|
|
295
297
|
"""
|
hud/tools/executors/pyautogui.py
CHANGED
|
@@ -31,7 +31,7 @@ def _get_pyautogui() -> Any | None:
|
|
|
31
31
|
try:
|
|
32
32
|
from hud.tools.computer import computer_settings
|
|
33
33
|
|
|
34
|
-
os.environ["DISPLAY"] =
|
|
34
|
+
os.environ["DISPLAY"] = f":{computer_settings.DISPLAY_NUM}"
|
|
35
35
|
except (ImportError, AttributeError):
|
|
36
36
|
os.environ["DISPLAY"] = ":0"
|
|
37
37
|
|