hud-python 0.4.46__py3-none-any.whl → 0.4.48__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Potentially problematic release.
This version of hud-python might be problematic. Click here for more details.
- hud/agents/base.py +49 -142
- hud/agents/claude.py +5 -6
- hud/agents/misc/integration_test_agent.py +2 -0
- hud/agents/tests/test_base.py +2 -5
- hud/cli/__init__.py +2 -2
- hud/cli/eval.py +14 -9
- hud/cli/flows/tasks.py +2 -4
- hud/cli/rl/local_runner.py +25 -13
- hud/cli/rl/vllm.py +2 -0
- hud/cli/tests/test_analyze_metadata.py +3 -2
- hud/cli/tests/test_eval.py +525 -0
- hud/cli/tests/test_utils.py +1 -1
- hud/datasets/parallel.py +0 -12
- hud/datasets/runner.py +1 -4
- hud/rl/actor.py +4 -2
- hud/rl/distributed.py +1 -1
- hud/rl/learner.py +2 -1
- hud/rl/train.py +1 -1
- hud/telemetry/trace.py +1 -1
- hud/tools/base.py +11 -9
- hud/tools/computer/__init__.py +2 -0
- hud/tools/computer/qwen.py +431 -0
- hud/tools/computer/settings.py +16 -0
- hud/tools/executors/pyautogui.py +1 -1
- hud/tools/playwright.py +1 -1
- hud/types.py +2 -3
- hud/utils/tests/test_version.py +1 -1
- hud/version.py +1 -1
- {hud_python-0.4.46.dist-info → hud_python-0.4.48.dist-info}/METADATA +1 -1
- {hud_python-0.4.46.dist-info → hud_python-0.4.48.dist-info}/RECORD +33 -31
- {hud_python-0.4.46.dist-info → hud_python-0.4.48.dist-info}/WHEEL +0 -0
- {hud_python-0.4.46.dist-info → hud_python-0.4.48.dist-info}/entry_points.txt +0 -0
- {hud_python-0.4.46.dist-info → hud_python-0.4.48.dist-info}/licenses/LICENSE +0 -0
hud/tools/base.py
CHANGED
|
@@ -1,14 +1,15 @@
|
|
|
1
1
|
from __future__ import annotations
|
|
2
2
|
|
|
3
|
+
import logging
|
|
3
4
|
from abc import ABC, abstractmethod
|
|
4
|
-
from typing import TYPE_CHECKING, Any, cast
|
|
5
|
+
from typing import TYPE_CHECKING, Any, cast
|
|
5
6
|
|
|
6
7
|
from fastmcp import FastMCP
|
|
7
8
|
|
|
8
9
|
from hud.tools.types import ContentBlock, EvaluationResult
|
|
9
10
|
|
|
10
11
|
if TYPE_CHECKING:
|
|
11
|
-
from collections.abc import Callable
|
|
12
|
+
from collections.abc import Awaitable, Callable
|
|
12
13
|
|
|
13
14
|
from fastmcp.tools import FunctionTool
|
|
14
15
|
from fastmcp.tools.tool import Tool, ToolResult
|
|
@@ -16,9 +17,9 @@ if TYPE_CHECKING:
|
|
|
16
17
|
# Basic result types for tools
|
|
17
18
|
BaseResult = list[ContentBlock] | EvaluationResult
|
|
18
19
|
|
|
19
|
-
import logging
|
|
20
20
|
logger = logging.getLogger(__name__)
|
|
21
21
|
|
|
22
|
+
|
|
22
23
|
class BaseTool(ABC):
|
|
23
24
|
"""
|
|
24
25
|
Base helper class for all MCP tools to constrain their output.
|
|
@@ -106,9 +107,9 @@ class BaseTool(ABC):
|
|
|
106
107
|
)
|
|
107
108
|
return self._mcp_tool
|
|
108
109
|
|
|
109
|
-
def add_callback(self, event_type: str, callback: Callable[..., Awaitable[Any]]):
|
|
110
|
+
def add_callback(self, event_type: str, callback: Callable[..., Awaitable[Any]]) -> None:
|
|
110
111
|
"""Register a callback function for specific event
|
|
111
|
-
|
|
112
|
+
|
|
112
113
|
Args:
|
|
113
114
|
event_type: (Required) Specific event name to trigger callback
|
|
114
115
|
e.g. "after_click", "before_navigate"
|
|
@@ -118,7 +119,7 @@ class BaseTool(ABC):
|
|
|
118
119
|
self._callbacks[event_type] = []
|
|
119
120
|
self._callbacks[event_type].append(callback)
|
|
120
121
|
|
|
121
|
-
def remove_callback(self, event_type: str, callback: Callable[..., Awaitable[Any]]):
|
|
122
|
+
def remove_callback(self, event_type: str, callback: Callable[..., Awaitable[Any]]) -> None:
|
|
122
123
|
"""Remove a registered callback
|
|
123
124
|
Args:
|
|
124
125
|
event_type: (Required) Specific event name to trigger callback
|
|
@@ -127,15 +128,16 @@ class BaseTool(ABC):
|
|
|
127
128
|
"""
|
|
128
129
|
if (event_type in self._callbacks) and (callback in self._callbacks[event_type]):
|
|
129
130
|
self._callbacks[event_type].remove(callback)
|
|
130
|
-
|
|
131
|
-
async def _trigger_callbacks(self, event_type: str, **kwargs):
|
|
131
|
+
|
|
132
|
+
async def _trigger_callbacks(self, event_type: str, **kwargs: Any) -> None:
|
|
132
133
|
"""Trigger all registered callback functions of an event type"""
|
|
133
134
|
callback_list = self._callbacks.get(event_type, [])
|
|
134
135
|
for callback in callback_list:
|
|
135
136
|
try:
|
|
136
137
|
await callback(**kwargs)
|
|
137
138
|
except Exception as e:
|
|
138
|
-
logger.warning(
|
|
139
|
+
logger.warning("Callback failed for %s: %s", event_type, e)
|
|
140
|
+
|
|
139
141
|
|
|
140
142
|
# Prefix for internal tool names
|
|
141
143
|
_INTERNAL_PREFIX = "int_"
|
hud/tools/computer/__init__.py
CHANGED
|
@@ -5,11 +5,13 @@ from __future__ import annotations
|
|
|
5
5
|
from .anthropic import AnthropicComputerTool
|
|
6
6
|
from .hud import HudComputerTool
|
|
7
7
|
from .openai import OpenAIComputerTool
|
|
8
|
+
from .qwen import QwenComputerTool
|
|
8
9
|
from .settings import computer_settings
|
|
9
10
|
|
|
10
11
|
__all__ = [
|
|
11
12
|
"AnthropicComputerTool",
|
|
12
13
|
"HudComputerTool",
|
|
13
14
|
"OpenAIComputerTool",
|
|
15
|
+
"QwenComputerTool",
|
|
14
16
|
"computer_settings",
|
|
15
17
|
]
|
|
@@ -0,0 +1,431 @@
|
|
|
1
|
+
# flake8: noqa: B008
|
|
2
|
+
from __future__ import annotations
|
|
3
|
+
|
|
4
|
+
import logging
|
|
5
|
+
import re
|
|
6
|
+
from typing import TYPE_CHECKING, Any, Literal
|
|
7
|
+
|
|
8
|
+
from mcp import ErrorData, McpError
|
|
9
|
+
from mcp.types import INTERNAL_ERROR, INVALID_PARAMS, ContentBlock
|
|
10
|
+
from pydantic import Field
|
|
11
|
+
|
|
12
|
+
from hud.tools.types import ContentResult
|
|
13
|
+
|
|
14
|
+
from .hud import HudComputerTool
|
|
15
|
+
from .settings import computer_settings
|
|
16
|
+
|
|
17
|
+
if TYPE_CHECKING:
|
|
18
|
+
from hud.tools.executors.base import BaseExecutor
|
|
19
|
+
|
|
20
|
+
logger = logging.getLogger(__name__)
|
|
21
|
+
|
|
22
|
+
|
|
23
|
+
class QwenComputerTool(HudComputerTool):
|
|
24
|
+
"""
|
|
25
|
+
Qwen Computer Use tool for interacting with the computer.
|
|
26
|
+
"""
|
|
27
|
+
|
|
28
|
+
name: str = "computer_use"
|
|
29
|
+
api_type: str = "computer_use"
|
|
30
|
+
|
|
31
|
+
def __init__(
|
|
32
|
+
self,
|
|
33
|
+
# Define within environment based on platform
|
|
34
|
+
executor: BaseExecutor | None = None,
|
|
35
|
+
platform_type: Literal["auto", "xdo", "pyautogui"] = "auto",
|
|
36
|
+
display_num: int | None = None,
|
|
37
|
+
# Overrides for what dimensions the agent thinks it operates in
|
|
38
|
+
width: int = computer_settings.QWEN_COMPUTER_WIDTH,
|
|
39
|
+
height: int = computer_settings.QWEN_COMPUTER_HEIGHT,
|
|
40
|
+
rescale_images: bool = computer_settings.QWEN_RESCALE_IMAGES,
|
|
41
|
+
# What the agent sees as the tool's name, title, and description
|
|
42
|
+
name: str | None = None,
|
|
43
|
+
title: str | None = None,
|
|
44
|
+
description: str | None = None,
|
|
45
|
+
**kwargs: Any,
|
|
46
|
+
) -> None:
|
|
47
|
+
"""
|
|
48
|
+
Initialize with Qwen's default dimensions.
|
|
49
|
+
|
|
50
|
+
Args:
|
|
51
|
+
width: Target width for rescaling (None = use environment width)
|
|
52
|
+
height: Target height for rescaling (None = use environment height)
|
|
53
|
+
rescale_images: If True, rescale screenshots. If False, only rescale action coordinates
|
|
54
|
+
name: Tool name for MCP registration (auto-generated from class name if not provided)
|
|
55
|
+
title: Human-readable display name for the tool (auto-generated from class name)
|
|
56
|
+
description: Tool description (auto-generated from docstring if not provided)
|
|
57
|
+
"""
|
|
58
|
+
# Store dimensions for description
|
|
59
|
+
self.display_width_px = width
|
|
60
|
+
self.display_height_px = height
|
|
61
|
+
|
|
62
|
+
# Build custom description with resolution info
|
|
63
|
+
custom_description = (
|
|
64
|
+
description
|
|
65
|
+
or f"""
|
|
66
|
+
Use a mouse and keyboard to interact with a computer, and take screenshots.
|
|
67
|
+
* This is an interface to a desktop GUI. You do not have access to a terminal or
|
|
68
|
+
applications menu. You must click on desktop icons to start applications.
|
|
69
|
+
* Some applications may take time to start or process actions, so you may need to
|
|
70
|
+
wait and take successive screenshots to see the results of your actions. E.g. if you
|
|
71
|
+
click on Firefox and a window doesn't open, try wait and taking another screenshot.
|
|
72
|
+
* The screen's resolution is {width}x{height}.
|
|
73
|
+
* Whenever you intend to move the cursor to click on an element like an icon, you
|
|
74
|
+
should consult a screenshot to determine the coordinates of the element before
|
|
75
|
+
moving the cursor.
|
|
76
|
+
* If you tried clicking on a program or link but it failed to load, even after
|
|
77
|
+
waiting, try adjusting your cursor position so that the tip of the cursor visually
|
|
78
|
+
falls on the element that you want to click.
|
|
79
|
+
* Make sure to click any buttons, links, icons, etc with the cursor tip in the
|
|
80
|
+
center of the element. Don't click boxes on their edges.
|
|
81
|
+
""".strip()
|
|
82
|
+
)
|
|
83
|
+
|
|
84
|
+
super().__init__(
|
|
85
|
+
executor=executor,
|
|
86
|
+
platform_type=platform_type,
|
|
87
|
+
display_num=display_num,
|
|
88
|
+
width=width,
|
|
89
|
+
height=height,
|
|
90
|
+
rescale_images=rescale_images,
|
|
91
|
+
name=name or "qwen_computer",
|
|
92
|
+
title=title or "Qwen Computer Tool",
|
|
93
|
+
description=custom_description,
|
|
94
|
+
**kwargs,
|
|
95
|
+
)
|
|
96
|
+
|
|
97
|
+
def to_params(self) -> dict:
|
|
98
|
+
"""Convert to Qwen tool parameters."""
|
|
99
|
+
return {
|
|
100
|
+
"type": self.api_type,
|
|
101
|
+
"name": self.name,
|
|
102
|
+
"display_width_px": self.display_width_px,
|
|
103
|
+
"display_height_px": self.display_height_px,
|
|
104
|
+
"description": self.description,
|
|
105
|
+
"parameters": {
|
|
106
|
+
"properties": {
|
|
107
|
+
"action": {
|
|
108
|
+
"description": """
|
|
109
|
+
The action to perform. The available actions are:
|
|
110
|
+
* `key`: Performs key down presses on the arguments passed in order, then performs
|
|
111
|
+
key releases in reverse order.
|
|
112
|
+
* `type`: Type a string of text on the keyboard.
|
|
113
|
+
* `mouse_move`: Move the cursor to a specified (x, y) pixel coordinate on the
|
|
114
|
+
screen.
|
|
115
|
+
* `left_click`: Click the left mouse button at a specified (x, y) pixel coordinate
|
|
116
|
+
on the screen.
|
|
117
|
+
* `left_click_drag`: Click and drag the cursor to a specified (x, y) pixel
|
|
118
|
+
coordinate on the screen.
|
|
119
|
+
* `right_click`: Click the right mouse button at a specified (x, y) pixel
|
|
120
|
+
coordinate on the screen.
|
|
121
|
+
* `middle_click`: Click the middle mouse button at a specified (x, y) pixel
|
|
122
|
+
coordinate on the screen.
|
|
123
|
+
* `double_click`: Double-click the left mouse button at a specified (x, y) pixel
|
|
124
|
+
coordinate on the screen.
|
|
125
|
+
* `triple_click`: Triple-click the left mouse button at a specified (x, y) pixel
|
|
126
|
+
coordinate on the screen.
|
|
127
|
+
* `scroll`: Performs a scroll of the mouse scroll wheel.
|
|
128
|
+
* `hscroll`: Performs a horizontal scroll.
|
|
129
|
+
* `wait`: Wait specified seconds for the change to happen.
|
|
130
|
+
* `terminate`: Terminate the current task and report its completion status
|
|
131
|
+
(NOT SUPPORTED).
|
|
132
|
+
* `answer`: Answer a question (NOT SUPPORTED).
|
|
133
|
+
""".strip(),
|
|
134
|
+
"enum": [
|
|
135
|
+
"key",
|
|
136
|
+
"type",
|
|
137
|
+
"mouse_move",
|
|
138
|
+
"left_click",
|
|
139
|
+
"left_click_drag",
|
|
140
|
+
"right_click",
|
|
141
|
+
"middle_click",
|
|
142
|
+
"double_click",
|
|
143
|
+
"triple_click",
|
|
144
|
+
"scroll",
|
|
145
|
+
"hscroll",
|
|
146
|
+
"wait",
|
|
147
|
+
"terminate",
|
|
148
|
+
"answer",
|
|
149
|
+
],
|
|
150
|
+
"type": "string",
|
|
151
|
+
},
|
|
152
|
+
"keys": {
|
|
153
|
+
"description": "Required only by `action=key`.",
|
|
154
|
+
"type": "array",
|
|
155
|
+
},
|
|
156
|
+
"text": {
|
|
157
|
+
"description": "Required only by `action=type` and `action=answer`.",
|
|
158
|
+
"type": "string",
|
|
159
|
+
},
|
|
160
|
+
"coordinate": {
|
|
161
|
+
"description": (
|
|
162
|
+
"(x, y): The x (pixels from the left edge) and y "
|
|
163
|
+
"(pixels from the top edge) coordinates to move the mouse to."
|
|
164
|
+
),
|
|
165
|
+
"type": "array",
|
|
166
|
+
},
|
|
167
|
+
"pixels": {
|
|
168
|
+
"description": (
|
|
169
|
+
"The amount of scrolling to perform. Positive values scroll up, "
|
|
170
|
+
"negative values scroll down. Required only by `action=scroll` "
|
|
171
|
+
"and `action=hscroll`."
|
|
172
|
+
),
|
|
173
|
+
"type": "number",
|
|
174
|
+
},
|
|
175
|
+
"time": {
|
|
176
|
+
"description": "The seconds to wait. Required only by `action=wait`.",
|
|
177
|
+
"type": "number",
|
|
178
|
+
},
|
|
179
|
+
"status": {
|
|
180
|
+
"description": (
|
|
181
|
+
"The status of the task. Required only by `action=terminate`."
|
|
182
|
+
),
|
|
183
|
+
"type": "string",
|
|
184
|
+
"enum": ["success", "failure"],
|
|
185
|
+
},
|
|
186
|
+
},
|
|
187
|
+
"required": ["action"],
|
|
188
|
+
"type": "object",
|
|
189
|
+
},
|
|
190
|
+
}
|
|
191
|
+
|
|
192
|
+
async def __call__(
|
|
193
|
+
self,
|
|
194
|
+
action: str = Field(..., description="The action to perform on the computer"),
|
|
195
|
+
keys: list[str] | None = Field(None, description="Keys for key action"),
|
|
196
|
+
text: str | None = Field(None, description="Text to type"),
|
|
197
|
+
coordinate: list[int] | tuple[int, int] | None = Field(
|
|
198
|
+
None, description="The coordinate to interact with on the computer [x, y]"
|
|
199
|
+
),
|
|
200
|
+
pixels: int | None = Field(None, description="Pixels to scroll"),
|
|
201
|
+
time: float | None = Field(None, description="Time to wait in seconds"),
|
|
202
|
+
status: str | None = Field(None, description="Status for terminate action"),
|
|
203
|
+
) -> list[ContentBlock]:
|
|
204
|
+
"""
|
|
205
|
+
Handle Qwen Computer Use API calls.
|
|
206
|
+
|
|
207
|
+
This converts Qwen's action format to HudComputerTool's format.
|
|
208
|
+
|
|
209
|
+
Returns:
|
|
210
|
+
List of MCP content blocks
|
|
211
|
+
"""
|
|
212
|
+
logger.info("QwenComputerTool received action: %s", action)
|
|
213
|
+
|
|
214
|
+
# Handle non-computer actions that should raise errors
|
|
215
|
+
if action == "terminate":
|
|
216
|
+
raise McpError(
|
|
217
|
+
ErrorData(
|
|
218
|
+
code=INVALID_PARAMS,
|
|
219
|
+
message=(
|
|
220
|
+
"terminate action is not supported for computer control. This is a no-op."
|
|
221
|
+
),
|
|
222
|
+
)
|
|
223
|
+
)
|
|
224
|
+
|
|
225
|
+
if action == "answer":
|
|
226
|
+
raise McpError(
|
|
227
|
+
ErrorData(
|
|
228
|
+
code=INVALID_PARAMS,
|
|
229
|
+
message="answer action is not supported for computer control. This is a no-op.",
|
|
230
|
+
)
|
|
231
|
+
)
|
|
232
|
+
|
|
233
|
+
# Convert lists to tuples if needed
|
|
234
|
+
coord_tuple = None
|
|
235
|
+
if coordinate:
|
|
236
|
+
coord_tuple = tuple(coordinate) if isinstance(coordinate, list) else coordinate
|
|
237
|
+
|
|
238
|
+
# Map Qwen actions to HudComputerTool actions
|
|
239
|
+
if action == "left_click":
|
|
240
|
+
if coord_tuple and len(coord_tuple) >= 2:
|
|
241
|
+
scaled_x, scaled_y = self._scale_coordinates(coord_tuple[0], coord_tuple[1])
|
|
242
|
+
logger.info("Scaled coordinates: %s, %s", scaled_x, scaled_y)
|
|
243
|
+
result = await self.executor.click(x=scaled_x, y=scaled_y)
|
|
244
|
+
else:
|
|
245
|
+
raise McpError(
|
|
246
|
+
ErrorData(code=INVALID_PARAMS, message="coordinate is required for left_click")
|
|
247
|
+
)
|
|
248
|
+
|
|
249
|
+
elif action == "double_click":
|
|
250
|
+
if coord_tuple and len(coord_tuple) >= 2:
|
|
251
|
+
# Use pattern for double-click
|
|
252
|
+
scaled_x, scaled_y = self._scale_coordinates(coord_tuple[0], coord_tuple[1])
|
|
253
|
+
result = await self.executor.click(x=scaled_x, y=scaled_y, pattern=[100])
|
|
254
|
+
else:
|
|
255
|
+
raise McpError(
|
|
256
|
+
ErrorData(
|
|
257
|
+
code=INVALID_PARAMS, message="coordinate is required for double_click"
|
|
258
|
+
)
|
|
259
|
+
)
|
|
260
|
+
|
|
261
|
+
elif action == "triple_click":
|
|
262
|
+
if coord_tuple and len(coord_tuple) >= 2:
|
|
263
|
+
# Use pattern for triple-click (simulated as double-click)
|
|
264
|
+
scaled_x, scaled_y = self._scale_coordinates(coord_tuple[0], coord_tuple[1])
|
|
265
|
+
# Note: triple-click simulated as double-click as per requirement
|
|
266
|
+
result = await self.executor.click(x=scaled_x, y=scaled_y, pattern=[100])
|
|
267
|
+
else:
|
|
268
|
+
raise McpError(
|
|
269
|
+
ErrorData(
|
|
270
|
+
code=INVALID_PARAMS, message="coordinate is required for triple_click"
|
|
271
|
+
)
|
|
272
|
+
)
|
|
273
|
+
|
|
274
|
+
elif action == "right_click":
|
|
275
|
+
if coord_tuple and len(coord_tuple) >= 2:
|
|
276
|
+
scaled_x, scaled_y = self._scale_coordinates(coord_tuple[0], coord_tuple[1])
|
|
277
|
+
result = await self.executor.click(x=scaled_x, y=scaled_y, button="right")
|
|
278
|
+
else:
|
|
279
|
+
raise McpError(
|
|
280
|
+
ErrorData(code=INVALID_PARAMS, message="coordinate is required for right_click")
|
|
281
|
+
)
|
|
282
|
+
|
|
283
|
+
elif action == "middle_click":
|
|
284
|
+
if coord_tuple and len(coord_tuple) >= 2:
|
|
285
|
+
scaled_x, scaled_y = self._scale_coordinates(coord_tuple[0], coord_tuple[1])
|
|
286
|
+
result = await self.executor.click(x=scaled_x, y=scaled_y, button="middle")
|
|
287
|
+
else:
|
|
288
|
+
raise McpError(
|
|
289
|
+
ErrorData(
|
|
290
|
+
code=INVALID_PARAMS, message="coordinate is required for middle_click"
|
|
291
|
+
)
|
|
292
|
+
)
|
|
293
|
+
|
|
294
|
+
elif action == "mouse_move":
|
|
295
|
+
if coord_tuple and len(coord_tuple) >= 2:
|
|
296
|
+
scaled_x, scaled_y = self._scale_coordinates(coord_tuple[0], coord_tuple[1])
|
|
297
|
+
result = await self.executor.move(x=scaled_x, y=scaled_y)
|
|
298
|
+
else:
|
|
299
|
+
raise McpError(
|
|
300
|
+
ErrorData(code=INVALID_PARAMS, message="coordinate is required for mouse_move")
|
|
301
|
+
)
|
|
302
|
+
|
|
303
|
+
elif action == "type":
|
|
304
|
+
if text:
|
|
305
|
+
result = await self.executor.write(text=text)
|
|
306
|
+
else:
|
|
307
|
+
raise McpError(ErrorData(code=INVALID_PARAMS, message="text is required for type"))
|
|
308
|
+
|
|
309
|
+
elif action == "key":
|
|
310
|
+
if keys:
|
|
311
|
+
# Qwen sends an array of keys to press
|
|
312
|
+
result = await self.executor.press(keys=keys)
|
|
313
|
+
else:
|
|
314
|
+
raise McpError(ErrorData(code=INVALID_PARAMS, message="keys is required for key"))
|
|
315
|
+
|
|
316
|
+
elif action == "scroll":
|
|
317
|
+
if pixels is None:
|
|
318
|
+
raise McpError(
|
|
319
|
+
ErrorData(code=INVALID_PARAMS, message="pixels is required for scroll")
|
|
320
|
+
)
|
|
321
|
+
|
|
322
|
+
# Qwen's pixels: positive scrolls up, negative scrolls down
|
|
323
|
+
# HUD's scroll_y: positive scrolls down, negative scrolls up
|
|
324
|
+
# So we need to negate the value
|
|
325
|
+
scroll_y = -pixels
|
|
326
|
+
|
|
327
|
+
if coord_tuple and len(coord_tuple) >= 2:
|
|
328
|
+
scaled_x, scaled_y = self._scale_coordinates(coord_tuple[0], coord_tuple[1])
|
|
329
|
+
result = await self.executor.scroll(x=scaled_x, y=scaled_y, scroll_y=scroll_y)
|
|
330
|
+
else:
|
|
331
|
+
result = await self.executor.scroll(scroll_y=scroll_y)
|
|
332
|
+
|
|
333
|
+
elif action == "hscroll":
|
|
334
|
+
if pixels is None:
|
|
335
|
+
raise McpError(
|
|
336
|
+
ErrorData(code=INVALID_PARAMS, message="pixels is required for hscroll")
|
|
337
|
+
)
|
|
338
|
+
|
|
339
|
+
# For horizontal scroll, positive values scroll right, negative scroll left
|
|
340
|
+
scroll_x = pixels
|
|
341
|
+
|
|
342
|
+
if coord_tuple and len(coord_tuple) >= 2:
|
|
343
|
+
scaled_x, scaled_y = self._scale_coordinates(coord_tuple[0], coord_tuple[1])
|
|
344
|
+
result = await self.executor.scroll(x=scaled_x, y=scaled_y, scroll_x=scroll_x)
|
|
345
|
+
else:
|
|
346
|
+
result = await self.executor.scroll(scroll_x=scroll_x)
|
|
347
|
+
|
|
348
|
+
elif action == "left_click_drag":
|
|
349
|
+
if coord_tuple and len(coord_tuple) >= 2:
|
|
350
|
+
# For drag, we need a path. Qwen provides the end coordinate.
|
|
351
|
+
# We'll get the current position and drag from there to the target
|
|
352
|
+
current_pos = await self.executor.position()
|
|
353
|
+
if isinstance(current_pos, ContentResult) and current_pos.output:
|
|
354
|
+
# Parse the position from the output
|
|
355
|
+
match = re.search(r"x=(\d+), y=(\d+)", current_pos.output)
|
|
356
|
+
if match:
|
|
357
|
+
# Current position is in screen coordinates
|
|
358
|
+
screen_start_x, screen_start_y = int(match.group(1)), int(match.group(2))
|
|
359
|
+
# End position is in agent coordinates, needs scaling
|
|
360
|
+
scaled_end_x, scaled_end_y = self._scale_coordinates(
|
|
361
|
+
coord_tuple[0], coord_tuple[1]
|
|
362
|
+
)
|
|
363
|
+
# Create path in screen coordinates
|
|
364
|
+
path = [(screen_start_x, screen_start_y), (scaled_end_x, scaled_end_y)]
|
|
365
|
+
# Path is already in screen coordinates, no need to scale again
|
|
366
|
+
result = await self.executor.drag(path=path)
|
|
367
|
+
else:
|
|
368
|
+
raise McpError(
|
|
369
|
+
ErrorData(
|
|
370
|
+
code=INTERNAL_ERROR, message="Failed to parse current position"
|
|
371
|
+
)
|
|
372
|
+
)
|
|
373
|
+
else:
|
|
374
|
+
raise McpError(
|
|
375
|
+
ErrorData(code=INTERNAL_ERROR, message="Failed to get current position")
|
|
376
|
+
)
|
|
377
|
+
else:
|
|
378
|
+
raise McpError(
|
|
379
|
+
ErrorData(
|
|
380
|
+
code=INVALID_PARAMS, message="coordinate is required for left_click_drag"
|
|
381
|
+
)
|
|
382
|
+
)
|
|
383
|
+
|
|
384
|
+
elif action == "wait":
|
|
385
|
+
if time is None:
|
|
386
|
+
raise McpError(ErrorData(code=INVALID_PARAMS, message="time is required for wait"))
|
|
387
|
+
if time < 0:
|
|
388
|
+
raise McpError(ErrorData(code=INVALID_PARAMS, message="time must be non-negative"))
|
|
389
|
+
|
|
390
|
+
# Convert seconds to milliseconds for HudComputerTool
|
|
391
|
+
result = await self.executor.wait(time=int(time * 1000))
|
|
392
|
+
|
|
393
|
+
else:
|
|
394
|
+
# Unknown action
|
|
395
|
+
raise McpError(ErrorData(code=INTERNAL_ERROR, message=f"Invalid action: {action}"))
|
|
396
|
+
|
|
397
|
+
# Rescale screenshot in result if present
|
|
398
|
+
if isinstance(result, ContentResult) and result.base64_image and self.rescale_images:
|
|
399
|
+
rescaled_image = await self._rescale_screenshot(result.base64_image)
|
|
400
|
+
result.base64_image = rescaled_image
|
|
401
|
+
|
|
402
|
+
# Auto-add screenshot for interactive actions
|
|
403
|
+
interactive_actions = {
|
|
404
|
+
"left_click",
|
|
405
|
+
"double_click",
|
|
406
|
+
"triple_click",
|
|
407
|
+
"right_click",
|
|
408
|
+
"middle_click",
|
|
409
|
+
"mouse_move",
|
|
410
|
+
"type",
|
|
411
|
+
"key",
|
|
412
|
+
"scroll",
|
|
413
|
+
"hscroll",
|
|
414
|
+
"left_click_drag",
|
|
415
|
+
}
|
|
416
|
+
|
|
417
|
+
if (
|
|
418
|
+
action in interactive_actions
|
|
419
|
+
and isinstance(result, ContentResult)
|
|
420
|
+
and not result.base64_image
|
|
421
|
+
):
|
|
422
|
+
screenshot_base64 = await self.executor.screenshot()
|
|
423
|
+
if screenshot_base64:
|
|
424
|
+
# Rescale screenshot if requested
|
|
425
|
+
screenshot_base64 = await self._rescale_screenshot(screenshot_base64)
|
|
426
|
+
result = ContentResult(
|
|
427
|
+
output=result.output, error=result.error, base64_image=screenshot_base64
|
|
428
|
+
)
|
|
429
|
+
|
|
430
|
+
# Convert to content blocks
|
|
431
|
+
return result.to_content_blocks()
|
hud/tools/computer/settings.py
CHANGED
|
@@ -62,6 +62,17 @@ class ComputerSettings(BaseSettings):
|
|
|
62
62
|
validation_alias="OPENAI_COMPUTER_HEIGHT",
|
|
63
63
|
)
|
|
64
64
|
|
|
65
|
+
QWEN_COMPUTER_WIDTH: int = Field(
|
|
66
|
+
default=1920,
|
|
67
|
+
description="Width of the display to use for the Qwen computer tools",
|
|
68
|
+
validation_alias="QWEN_COMPUTER_WIDTH",
|
|
69
|
+
)
|
|
70
|
+
QWEN_COMPUTER_HEIGHT: int = Field(
|
|
71
|
+
default=1080,
|
|
72
|
+
description="Height of the display to use for the Qwen computer tools",
|
|
73
|
+
validation_alias="QWEN_COMPUTER_HEIGHT",
|
|
74
|
+
)
|
|
75
|
+
|
|
65
76
|
HUD_RESCALE_IMAGES: bool = Field(
|
|
66
77
|
default=False,
|
|
67
78
|
description="Whether to rescale images to the agent width and height",
|
|
@@ -77,6 +88,11 @@ class ComputerSettings(BaseSettings):
|
|
|
77
88
|
description="Whether to rescale images to the agent width and height",
|
|
78
89
|
validation_alias="OPENAI_RESCALE_IMAGES",
|
|
79
90
|
)
|
|
91
|
+
QWEN_RESCALE_IMAGES: bool = Field(
|
|
92
|
+
default=True,
|
|
93
|
+
description="Whether to rescale images to the agent width and height",
|
|
94
|
+
validation_alias="QWEN_RESCALE_IMAGES",
|
|
95
|
+
)
|
|
80
96
|
|
|
81
97
|
|
|
82
98
|
computer_settings = ComputerSettings()
|
hud/tools/executors/pyautogui.py
CHANGED
|
@@ -31,7 +31,7 @@ def _get_pyautogui() -> Any | None:
|
|
|
31
31
|
try:
|
|
32
32
|
from hud.tools.computer import computer_settings
|
|
33
33
|
|
|
34
|
-
os.environ["DISPLAY"] =
|
|
34
|
+
os.environ["DISPLAY"] = f":{computer_settings.DISPLAY_NUM}"
|
|
35
35
|
except (ImportError, AttributeError):
|
|
36
36
|
os.environ["DISPLAY"] = ":0"
|
|
37
37
|
|
hud/tools/playwright.py
CHANGED
|
@@ -280,7 +280,7 @@ class PlaywrightTool(BaseTool):
|
|
|
280
280
|
|
|
281
281
|
try:
|
|
282
282
|
# Always return base64 encoded screenshot as ToolResult
|
|
283
|
-
screenshot_bytes = await self.page.screenshot(full_page=
|
|
283
|
+
screenshot_bytes = await self.page.screenshot(full_page=False)
|
|
284
284
|
import base64
|
|
285
285
|
|
|
286
286
|
screenshot_b64 = base64.b64encode(screenshot_bytes).decode()
|
hud/types.py
CHANGED
|
@@ -43,11 +43,10 @@ class Task(BaseModel):
|
|
|
43
43
|
setup_tool: MCPToolCall | list[MCPToolCall] | None = None
|
|
44
44
|
evaluate_tool: MCPToolCall | list[MCPToolCall] | None = None
|
|
45
45
|
integration_test_tool: MCPToolCall | list[MCPToolCall] | None = None
|
|
46
|
-
|
|
47
|
-
system_prompt: str | None = None
|
|
46
|
+
agent_config: dict[str, Any] | None = None
|
|
48
47
|
metadata: dict[str, Any] = Field(default_factory=dict)
|
|
49
48
|
|
|
50
|
-
@field_validator("mcp_config", "metadata", mode="before")
|
|
49
|
+
@field_validator("mcp_config", "metadata", "agent_config", mode="before")
|
|
51
50
|
@classmethod
|
|
52
51
|
def parse_json_strings(cls, v: Any) -> Any:
|
|
53
52
|
"""Parse JSON strings into dictionaries."""
|
hud/utils/tests/test_version.py
CHANGED
hud/version.py
CHANGED