hud-python 0.4.47__py3-none-any.whl → 0.4.49__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Potentially problematic release.
This version of hud-python might be problematic. Click here for more details.
- hud/agents/base.py +55 -142
- hud/agents/claude.py +5 -6
- hud/agents/grounded_openai.py +1 -1
- hud/agents/misc/integration_test_agent.py +2 -0
- hud/agents/tests/test_base.py +2 -5
- hud/cli/__init__.py +80 -215
- hud/cli/build.py +105 -45
- hud/cli/dev.py +614 -743
- hud/cli/eval.py +14 -9
- hud/cli/flows/tasks.py +100 -21
- hud/cli/init.py +18 -14
- hud/cli/push.py +27 -9
- hud/cli/rl/local_runner.py +28 -16
- hud/cli/rl/vllm.py +2 -0
- hud/cli/tests/test_analyze_metadata.py +3 -2
- hud/cli/tests/test_eval.py +574 -0
- hud/cli/tests/test_mcp_server.py +6 -95
- hud/cli/tests/test_utils.py +1 -1
- hud/cli/utils/env_check.py +9 -9
- hud/cli/utils/source_hash.py +1 -1
- hud/datasets/parallel.py +0 -12
- hud/datasets/runner.py +1 -4
- hud/rl/actor.py +4 -2
- hud/rl/distributed.py +1 -1
- hud/rl/learner.py +2 -1
- hud/rl/train.py +1 -1
- hud/server/__init__.py +2 -1
- hud/server/router.py +160 -0
- hud/server/server.py +246 -79
- hud/telemetry/trace.py +1 -1
- hud/tools/base.py +20 -10
- hud/tools/computer/__init__.py +2 -0
- hud/tools/computer/qwen.py +431 -0
- hud/tools/computer/settings.py +16 -0
- hud/tools/executors/pyautogui.py +1 -1
- hud/tools/playwright.py +1 -1
- hud/types.py +2 -3
- hud/utils/hud_console.py +43 -0
- hud/utils/tests/test_version.py +1 -1
- hud/version.py +1 -1
- {hud_python-0.4.47.dist-info → hud_python-0.4.49.dist-info}/METADATA +1 -1
- {hud_python-0.4.47.dist-info → hud_python-0.4.49.dist-info}/RECORD +45 -42
- {hud_python-0.4.47.dist-info → hud_python-0.4.49.dist-info}/WHEEL +0 -0
- {hud_python-0.4.47.dist-info → hud_python-0.4.49.dist-info}/entry_points.txt +0 -0
- {hud_python-0.4.47.dist-info → hud_python-0.4.49.dist-info}/licenses/LICENSE +0 -0
|
@@ -0,0 +1,431 @@
|
|
|
1
|
+
# flake8: noqa: B008
|
|
2
|
+
from __future__ import annotations
|
|
3
|
+
|
|
4
|
+
import logging
|
|
5
|
+
import re
|
|
6
|
+
from typing import TYPE_CHECKING, Any, Literal
|
|
7
|
+
|
|
8
|
+
from mcp import ErrorData, McpError
|
|
9
|
+
from mcp.types import INTERNAL_ERROR, INVALID_PARAMS, ContentBlock
|
|
10
|
+
from pydantic import Field
|
|
11
|
+
|
|
12
|
+
from hud.tools.types import ContentResult
|
|
13
|
+
|
|
14
|
+
from .hud import HudComputerTool
|
|
15
|
+
from .settings import computer_settings
|
|
16
|
+
|
|
17
|
+
if TYPE_CHECKING:
|
|
18
|
+
from hud.tools.executors.base import BaseExecutor
|
|
19
|
+
|
|
20
|
+
logger = logging.getLogger(__name__)
|
|
21
|
+
|
|
22
|
+
|
|
23
|
+
class QwenComputerTool(HudComputerTool):
    """
    Qwen Computer Use tool for interacting with the computer.

    Translates Qwen ``computer_use`` actions into calls on the underlying
    executor. Coordinates supplied by the agent are passed through
    ``_scale_coordinates`` before use, and a screenshot is attached to the
    result of every interactive action.
    """

    name: str = "computer_use"
    api_type: str = "computer_use"

    def __init__(
        self,
        # Define within environment based on platform
        executor: BaseExecutor | None = None,
        platform_type: Literal["auto", "xdo", "pyautogui"] = "auto",
        display_num: int | None = None,
        # Overrides for what dimensions the agent thinks it operates in
        width: int = computer_settings.QWEN_COMPUTER_WIDTH,
        height: int = computer_settings.QWEN_COMPUTER_HEIGHT,
        rescale_images: bool = computer_settings.QWEN_RESCALE_IMAGES,
        # What the agent sees as the tool's name, title, and description
        name: str | None = None,
        title: str | None = None,
        description: str | None = None,
        **kwargs: Any,
    ) -> None:
        """
        Initialize with Qwen's default dimensions.

        Args:
            executor: Executor forwarded unchanged to ``HudComputerTool``.
            platform_type: Executor platform selector, forwarded unchanged.
            display_num: Display number, forwarded unchanged.
            width: Width the agent believes the screen has. Defaults to
                ``computer_settings.QWEN_COMPUTER_WIDTH``; reported in the
                default description and in ``to_params``.
            height: Height the agent believes the screen has. Defaults to
                ``computer_settings.QWEN_COMPUTER_HEIGHT``.
            rescale_images: If True, rescale screenshots. If False, only rescale
                action coordinates.
            name: Tool name for MCP registration ("qwen_computer" if not provided).
            title: Human-readable display name ("Qwen Computer Tool" if not provided).
            description: Tool description. When omitted, a default description
                that embeds the ``width`` x ``height`` resolution is used.
            **kwargs: Extra keyword arguments forwarded to ``HudComputerTool``.
        """
        # Stored so to_params() can report the dimensions the agent was told about.
        self.display_width_px = width
        self.display_height_px = height

        # Built line by line so source indentation can never leak into the prompt text.
        custom_description = description or "\n".join(
            (
                "Use a mouse and keyboard to interact with a computer, and take screenshots.",
                "* This is an interface to a desktop GUI. You do not have access to a terminal or",
                "applications menu. You must click on desktop icons to start applications.",
                "* Some applications may take time to start or process actions, so you may need to",
                "wait and take successive screenshots to see the results of your actions. "
                "E.g. if you",
                "click on Firefox and a window doesn't open, try wait and taking another "
                "screenshot.",
                f"* The screen's resolution is {width}x{height}.",
                "* Whenever you intend to move the cursor to click on an element like an icon, you",
                "should consult a screenshot to determine the coordinates of the element before",
                "moving the cursor.",
                "* If you tried clicking on a program or link but it failed to load, even after",
                "waiting, try adjusting your cursor position so that the tip of the cursor "
                "visually",
                "falls on the element that you want to click.",
                "* Make sure to click any buttons, links, icons, etc with the cursor tip in the",
                "center of the element. Don't click boxes on their edges.",
            )
        )

        super().__init__(
            executor=executor,
            platform_type=platform_type,
            display_num=display_num,
            width=width,
            height=height,
            rescale_images=rescale_images,
            name=name or "qwen_computer",
            title=title or "Qwen Computer Tool",
            description=custom_description,
            **kwargs,
        )

    def to_params(self) -> dict:
        """Convert to Qwen tool parameters.

        Returns:
            A dict with the tool ``type``/``name``, the display dimensions given
            at construction, the tool description, and a JSON-schema style
            ``parameters`` object describing every accepted argument.
        """
        action_description = "\n".join(
            (
                "The action to perform. The available actions are:",
                "* `key`: Performs key down presses on the arguments passed in order, "
                "then performs",
                "key releases in reverse order.",
                "* `type`: Type a string of text on the keyboard.",
                "* `mouse_move`: Move the cursor to a specified (x, y) pixel coordinate on the",
                "screen.",
                "* `left_click`: Click the left mouse button at a specified (x, y) pixel "
                "coordinate",
                "on the screen.",
                "* `left_click_drag`: Click and drag the cursor to a specified (x, y) pixel",
                "coordinate on the screen.",
                "* `right_click`: Click the right mouse button at a specified (x, y) pixel",
                "coordinate on the screen.",
                "* `middle_click`: Click the middle mouse button at a specified (x, y) pixel",
                "coordinate on the screen.",
                "* `double_click`: Double-click the left mouse button at a specified (x, y) pixel",
                "coordinate on the screen.",
                "* `triple_click`: Triple-click the left mouse button at a specified (x, y) pixel",
                "coordinate on the screen.",
                "* `scroll`: Performs a scroll of the mouse scroll wheel.",
                "* `hscroll`: Performs a horizontal scroll.",
                "* `wait`: Wait specified seconds for the change to happen.",
                "* `terminate`: Terminate the current task and report its completion status",
                "(NOT SUPPORTED).",
                "* `answer`: Answer a question (NOT SUPPORTED).",
            )
        )

        return {
            "type": self.api_type,
            "name": self.name,
            "display_width_px": self.display_width_px,
            "display_height_px": self.display_height_px,
            "description": self.description,
            "parameters": {
                "properties": {
                    "action": {
                        "description": action_description,
                        "enum": [
                            "key",
                            "type",
                            "mouse_move",
                            "left_click",
                            "left_click_drag",
                            "right_click",
                            "middle_click",
                            "double_click",
                            "triple_click",
                            "scroll",
                            "hscroll",
                            "wait",
                            "terminate",
                            "answer",
                        ],
                        "type": "string",
                    },
                    "keys": {
                        "description": "Required only by `action=key`.",
                        "type": "array",
                    },
                    "text": {
                        "description": "Required only by `action=type` and `action=answer`.",
                        "type": "string",
                    },
                    "coordinate": {
                        "description": (
                            "(x, y): The x (pixels from the left edge) and y "
                            "(pixels from the top edge) coordinates to move the mouse to."
                        ),
                        "type": "array",
                    },
                    "pixels": {
                        "description": (
                            "The amount of scrolling to perform. Positive values scroll up, "
                            "negative values scroll down. Required only by `action=scroll` "
                            "and `action=hscroll`."
                        ),
                        "type": "number",
                    },
                    "time": {
                        "description": "The seconds to wait. Required only by `action=wait`.",
                        "type": "number",
                    },
                    "status": {
                        "description": (
                            "The status of the task. Required only by `action=terminate`."
                        ),
                        "type": "string",
                        "enum": ["success", "failure"],
                    },
                },
                "required": ["action"],
                "type": "object",
            },
        }

    def _qwen_point(
        self, coordinate: list[int] | tuple[int, int] | None
    ) -> tuple[Any, Any] | None:
        """Scale an agent coordinate; return None when it is missing or has fewer than 2 items."""
        if not coordinate:
            return None
        point = tuple(coordinate) if isinstance(coordinate, list) else coordinate
        if len(point) < 2:
            return None
        scaled_x, scaled_y = self._scale_coordinates(point[0], point[1])
        logger.info("Scaled coordinates: %s, %s", scaled_x, scaled_y)
        return scaled_x, scaled_y

    def _qwen_required_point(
        self, action: str, coordinate: list[int] | tuple[int, int] | None
    ) -> tuple[Any, Any]:
        """Like ``_qwen_point`` but raise INVALID_PARAMS naming ``action`` when absent."""
        point = self._qwen_point(coordinate)
        if point is None:
            raise McpError(
                ErrorData(code=INVALID_PARAMS, message=f"coordinate is required for {action}")
            )
        return point

    async def _qwen_drag_to(self, target: tuple[Any, Any]) -> Any:
        """Drag from the executor-reported cursor position to ``target``.

        Qwen only supplies the end coordinate, so the start of the path is the
        position reported by ``executor.position()``. ``target`` must already
        be scaled; the path is handed to ``executor.drag`` without rescaling.
        """
        current_pos = await self.executor.position()
        if not (isinstance(current_pos, ContentResult) and current_pos.output):
            raise McpError(
                ErrorData(code=INTERNAL_ERROR, message="Failed to get current position")
            )

        # A leading minus sign is accepted so a negative position is not a parse failure.
        match = re.search(r"x=(-?\d+), y=(-?\d+)", current_pos.output)
        if match is None:
            raise McpError(
                ErrorData(code=INTERNAL_ERROR, message="Failed to parse current position")
            )

        start = (int(match.group(1)), int(match.group(2)))
        return await self.executor.drag(path=[start, target])

    async def _qwen_finalize(self, action: str, result: Any) -> Any:
        """Rescale or attach a screenshot on ``result`` as the action requires."""
        if not isinstance(result, ContentResult):
            return result

        # Rescale a screenshot the executor already attached.
        if result.base64_image:
            if self.rescale_images:
                result.base64_image = await self._rescale_screenshot(result.base64_image)
            return result

        # Every action except `wait` gets a fresh screenshot when none was attached.
        interactive_actions = {
            "left_click",
            "double_click",
            "triple_click",
            "right_click",
            "middle_click",
            "mouse_move",
            "type",
            "key",
            "scroll",
            "hscroll",
            "left_click_drag",
        }
        if action not in interactive_actions:
            return result

        screenshot_base64 = await self.executor.screenshot()
        if not screenshot_base64:
            return result

        # Same `rescale_images` gate as the branch above, so both paths agree.
        if self.rescale_images:
            screenshot_base64 = await self._rescale_screenshot(screenshot_base64)
        return ContentResult(
            output=result.output, error=result.error, base64_image=screenshot_base64
        )

    async def __call__(
        self,
        action: str = Field(..., description="The action to perform on the computer"),
        keys: list[str] | None = Field(None, description="Keys for key action"),
        text: str | None = Field(None, description="Text to type"),
        coordinate: list[int] | tuple[int, int] | None = Field(
            None, description="The coordinate to interact with on the computer [x, y]"
        ),
        pixels: int | None = Field(None, description="Pixels to scroll"),
        time: float | None = Field(None, description="Time to wait in seconds"),
        status: str | None = Field(None, description="Status for terminate action"),
    ) -> list[ContentBlock]:
        """
        Handle Qwen Computer Use API calls.

        This converts Qwen's action format to HudComputerTool's format.

        Args:
            action: One of the action names listed in ``to_params``.
            keys: Keys to press; required for ``key``.
            text: Text to type; required (non-empty) for ``type``.
            coordinate: ``[x, y]`` in agent coordinates; required for click,
                move and drag actions, optional for ``scroll``/``hscroll``.
            pixels: Scroll amount; required for ``scroll``/``hscroll``.
            time: Seconds to wait; required and non-negative for ``wait``.
            status: Accepted for ``terminate`` but unused, since that action is rejected.

        Returns:
            List of MCP content blocks

        Raises:
            McpError: INVALID_PARAMS for ``terminate``/``answer`` or a missing or
                invalid argument; INTERNAL_ERROR for an unknown action or when
                the cursor position cannot be read for a drag.
        """
        logger.info("QwenComputerTool received action: %s", action)

        # Non-computer actions are rejected outright.
        if action in ("terminate", "answer"):
            raise McpError(
                ErrorData(
                    code=INVALID_PARAMS,
                    message=(
                        f"{action} action is not supported for computer control. "
                        "This is a no-op."
                    ),
                )
            )

        # Extra keyword arguments for executor.click() per click-style action.
        click_kwargs: dict[str, dict[str, Any]] = {
            "left_click": {},
            "double_click": {"pattern": [100]},
            # Note: triple-click simulated as double-click as per requirement
            "triple_click": {"pattern": [100]},
            "right_click": {"button": "right"},
            "middle_click": {"button": "middle"},
        }

        if action in click_kwargs:
            scaled_x, scaled_y = self._qwen_required_point(action, coordinate)
            result = await self.executor.click(x=scaled_x, y=scaled_y, **click_kwargs[action])

        elif action == "mouse_move":
            scaled_x, scaled_y = self._qwen_required_point(action, coordinate)
            result = await self.executor.move(x=scaled_x, y=scaled_y)

        elif action == "type":
            if not text:
                raise McpError(ErrorData(code=INVALID_PARAMS, message="text is required for type"))
            result = await self.executor.write(text=text)

        elif action == "key":
            if not keys:
                raise McpError(ErrorData(code=INVALID_PARAMS, message="keys is required for key"))
            # Qwen sends an array of keys to press
            result = await self.executor.press(keys=keys)

        elif action in ("scroll", "hscroll"):
            if pixels is None:
                raise McpError(
                    ErrorData(code=INVALID_PARAMS, message=f"pixels is required for {action}")
                )
            if action == "scroll":
                # Qwen's pixels: positive scrolls up, negative scrolls down
                # HUD's scroll_y: positive scrolls down, negative scrolls up
                # So we need to negate the value
                scroll_kwargs: dict[str, Any] = {"scroll_y": -pixels}
            else:
                # For horizontal scroll, positive values scroll right, negative scroll left
                scroll_kwargs = {"scroll_x": pixels}

            # The coordinate is optional for scrolling; pass it only when usable.
            point = self._qwen_point(coordinate)
            if point is not None:
                scroll_kwargs.update(x=point[0], y=point[1])
            result = await self.executor.scroll(**scroll_kwargs)

        elif action == "left_click_drag":
            target = self._qwen_required_point(action, coordinate)
            result = await self._qwen_drag_to(target)

        elif action == "wait":
            if time is None:
                raise McpError(ErrorData(code=INVALID_PARAMS, message="time is required for wait"))
            if time < 0:
                raise McpError(ErrorData(code=INVALID_PARAMS, message="time must be non-negative"))
            # Convert seconds to milliseconds for HudComputerTool
            result = await self.executor.wait(time=int(time * 1000))

        else:
            # Unknown action
            raise McpError(ErrorData(code=INTERNAL_ERROR, message=f"Invalid action: {action}"))

        result = await self._qwen_finalize(action, result)

        # Convert to content blocks
        return result.to_content_blocks()
|
hud/tools/computer/settings.py
CHANGED
|
@@ -62,6 +62,17 @@ class ComputerSettings(BaseSettings):
|
|
|
62
62
|
validation_alias="OPENAI_COMPUTER_HEIGHT",
|
|
63
63
|
)
|
|
64
64
|
|
|
65
|
+
QWEN_COMPUTER_WIDTH: int = Field(
|
|
66
|
+
default=1920,
|
|
67
|
+
description="Width of the display to use for the Qwen computer tools",
|
|
68
|
+
validation_alias="QWEN_COMPUTER_WIDTH",
|
|
69
|
+
)
|
|
70
|
+
QWEN_COMPUTER_HEIGHT: int = Field(
|
|
71
|
+
default=1080,
|
|
72
|
+
description="Height of the display to use for the Qwen computer tools",
|
|
73
|
+
validation_alias="QWEN_COMPUTER_HEIGHT",
|
|
74
|
+
)
|
|
75
|
+
|
|
65
76
|
HUD_RESCALE_IMAGES: bool = Field(
|
|
66
77
|
default=False,
|
|
67
78
|
description="Whether to rescale images to the agent width and height",
|
|
@@ -77,6 +88,11 @@ class ComputerSettings(BaseSettings):
|
|
|
77
88
|
description="Whether to rescale images to the agent width and height",
|
|
78
89
|
validation_alias="OPENAI_RESCALE_IMAGES",
|
|
79
90
|
)
|
|
91
|
+
QWEN_RESCALE_IMAGES: bool = Field(
|
|
92
|
+
default=True,
|
|
93
|
+
description="Whether to rescale images to the agent width and height",
|
|
94
|
+
validation_alias="QWEN_RESCALE_IMAGES",
|
|
95
|
+
)
|
|
80
96
|
|
|
81
97
|
|
|
82
98
|
computer_settings = ComputerSettings()
|
hud/tools/executors/pyautogui.py
CHANGED
|
@@ -31,7 +31,7 @@ def _get_pyautogui() -> Any | None:
|
|
|
31
31
|
try:
|
|
32
32
|
from hud.tools.computer import computer_settings
|
|
33
33
|
|
|
34
|
-
os.environ["DISPLAY"] =
|
|
34
|
+
os.environ["DISPLAY"] = f":{computer_settings.DISPLAY_NUM}"
|
|
35
35
|
except (ImportError, AttributeError):
|
|
36
36
|
os.environ["DISPLAY"] = ":0"
|
|
37
37
|
|
hud/tools/playwright.py
CHANGED
|
@@ -280,7 +280,7 @@ class PlaywrightTool(BaseTool):
|
|
|
280
280
|
|
|
281
281
|
try:
|
|
282
282
|
# Always return base64 encoded screenshot as ToolResult
|
|
283
|
-
screenshot_bytes = await self.page.screenshot(full_page=
|
|
283
|
+
screenshot_bytes = await self.page.screenshot(full_page=False)
|
|
284
284
|
import base64
|
|
285
285
|
|
|
286
286
|
screenshot_b64 = base64.b64encode(screenshot_bytes).decode()
|
hud/types.py
CHANGED
|
@@ -43,11 +43,10 @@ class Task(BaseModel):
|
|
|
43
43
|
setup_tool: MCPToolCall | list[MCPToolCall] | None = None
|
|
44
44
|
evaluate_tool: MCPToolCall | list[MCPToolCall] | None = None
|
|
45
45
|
integration_test_tool: MCPToolCall | list[MCPToolCall] | None = None
|
|
46
|
-
|
|
47
|
-
system_prompt: str | None = None
|
|
46
|
+
agent_config: dict[str, Any] | None = None
|
|
48
47
|
metadata: dict[str, Any] = Field(default_factory=dict)
|
|
49
48
|
|
|
50
|
-
@field_validator("mcp_config", "metadata", mode="before")
|
|
49
|
+
@field_validator("mcp_config", "metadata", "agent_config", mode="before")
|
|
51
50
|
@classmethod
|
|
52
51
|
def parse_json_strings(cls, v: Any) -> Any:
|
|
53
52
|
"""Parse JSON strings into dictionaries."""
|
hud/utils/hud_console.py
CHANGED
|
@@ -38,9 +38,26 @@ TEXT = "bright_white" # Off-white that's readable on dark, not too bright on li
|
|
|
38
38
|
SECONDARY = "rgb(108,113,196)" # Muted blue-purple for secondary text
|
|
39
39
|
|
|
40
40
|
|
|
41
|
+
# HUD Symbol System - Minimal 3-category system with default colors
|
|
42
|
+
class Symbols:
    """Rich-markup symbol prefixes for CLI output, each pre-wrapped in its default color."""

    # Bullet for informational/item lines, colored GOLD.
    ITEM = "[{color}]•[/{color}]".format(color=GOLD)

    # Filled circle for state/completion lines, colored GREEN.
    SUCCESS = "[{color}]●[/{color}]".format(color=GREEN)

    # Squiggle arrow for transitions and important notes, colored GOLD.
    FLOW = "[{color}]⟿[/{color}]".format(color=GOLD)
|
|
53
|
+
|
|
54
|
+
|
|
41
55
|
class HUDConsole:
|
|
42
56
|
"""Design system for HUD CLI output."""
|
|
43
57
|
|
|
58
|
+
# Make symbols easily accessible
|
|
59
|
+
sym = Symbols
|
|
60
|
+
|
|
44
61
|
def __init__(self, logger: logging.Logger | None = None) -> None:
|
|
45
62
|
"""Initialize the design system.
|
|
46
63
|
|
|
@@ -547,6 +564,32 @@ class HUDConsole:
|
|
|
547
564
|
"""
|
|
548
565
|
return questionary.confirm(message, default=default).ask()
|
|
549
566
|
|
|
567
|
+
# Symbol-based output methods
|
|
568
|
+
def symbol(self, symbol: str, message: str, color: str = GOLD, stderr: bool = True) -> None:
    """Print ``message`` preceded by ``symbol`` wrapped in ``color`` markup.

    Args:
        symbol: Symbol text to print first (the Symbols.* constants are intended)
        message: Message text printed after the symbol and a single space
        color: Markup color wrapped around the symbol (default: GOLD)
        stderr: If True, print on the stderr console; otherwise on the stdout console
    """
    if stderr:
        target = self._stderr_console
    else:
        target = self._stdout_console

    prefix = f"[{color}]{symbol}[/{color}]"
    target.print(f"{prefix} {message}")
|
|
579
|
+
|
|
580
|
+
def detail(self, message: str, stderr: bool = True) -> None:
    """Print an indented detail line prefixed with the gold bullet (``Symbols.ITEM``).

    Args:
        message: Message text printed after the bullet
        stderr: If True, print on the stderr console; otherwise on the stdout console
    """
    console = self._stderr_console if stderr else self._stdout_console
    # Symbols.ITEM already carries its own gold markup, so it is not wrapped a second time.
    console.print(f" {Symbols.ITEM} {message}")
|
|
584
|
+
|
|
585
|
+
def flow(self, message: str, stderr: bool = True) -> None:
    """Print a flow/transition message prefixed with ``Symbols.FLOW``.

    Args:
        message: Message text printed after the symbol
        stderr: If True, print on the stderr console; otherwise on the stdout console
    """
    self.symbol(symbol=Symbols.FLOW, message=message, color=GOLD, stderr=stderr)
|
|
588
|
+
|
|
589
|
+
def note(self, message: str, stderr: bool = True) -> None:
    """Print an important note prefixed with the ``Symbols.ITEM`` bullet.

    Args:
        message: Message text printed after the bullet
        stderr: If True, print on the stderr console; otherwise on the stdout console
    """
    self.symbol(symbol=Symbols.ITEM, message=message, color=GOLD, stderr=stderr)
|
|
592
|
+
|
|
550
593
|
|
|
551
594
|
# Global design instance for convenience
|
|
552
595
|
class _ProgressContext:
|
hud/utils/tests/test_version.py
CHANGED
hud/version.py
CHANGED