hud-python 0.4.58__py3-none-any.whl → 0.4.59__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Potentially problematic release.
This version of hud-python might be problematic. Click here for more details.
- hud/agents/__init__.py +2 -0
- hud/agents/gemini.py +492 -0
- hud/agents/tests/test_gemini.py +372 -0
- hud/cli/__init__.py +26 -24
- hud/cli/eval.py +57 -1
- hud/cli/tests/test_eval.py +20 -0
- hud/settings.py +6 -0
- hud/tools/__init__.py +13 -2
- hud/tools/computer/__init__.py +2 -0
- hud/tools/computer/gemini.py +385 -0
- hud/tools/computer/settings.py +21 -0
- hud/tools/playwright.py +9 -1
- hud/tools/types.py +9 -1
- hud/types.py +1 -0
- hud/utils/tests/test_version.py +1 -1
- hud/version.py +1 -1
- {hud_python-0.4.58.dist-info → hud_python-0.4.59.dist-info}/METADATA +2 -1
- {hud_python-0.4.58.dist-info → hud_python-0.4.59.dist-info}/RECORD +21 -18
- {hud_python-0.4.58.dist-info → hud_python-0.4.59.dist-info}/WHEEL +0 -0
- {hud_python-0.4.58.dist-info → hud_python-0.4.59.dist-info}/entry_points.txt +0 -0
- {hud_python-0.4.58.dist-info → hud_python-0.4.59.dist-info}/licenses/LICENSE +0 -0
hud/cli/tests/test_eval.py
CHANGED
|
@@ -68,6 +68,26 @@ class TestBuildAgent:
|
|
|
68
68
|
)
|
|
69
69
|
assert result == mock_instance
|
|
70
70
|
|
|
71
|
+
def test_builds_gemini_agent(self) -> None:
|
|
72
|
+
"""Test building a Gemini agent."""
|
|
73
|
+
with patch("hud.agents.GeminiAgent") as mock_runner:
|
|
74
|
+
mock_instance = Mock()
|
|
75
|
+
mock_runner.return_value = mock_instance
|
|
76
|
+
|
|
77
|
+
result = build_agent(
|
|
78
|
+
AgentType.GEMINI,
|
|
79
|
+
model="gemini-test",
|
|
80
|
+
allowed_tools=["gemini_computer"],
|
|
81
|
+
verbose=True,
|
|
82
|
+
)
|
|
83
|
+
|
|
84
|
+
mock_runner.assert_called_once_with(
|
|
85
|
+
model="gemini-test",
|
|
86
|
+
verbose=True,
|
|
87
|
+
allowed_tools=["gemini_computer"],
|
|
88
|
+
)
|
|
89
|
+
assert result == mock_instance
|
|
90
|
+
|
|
71
91
|
|
|
72
92
|
class TestRunSingleTask:
|
|
73
93
|
"""Test the run_single_task function."""
|
hud/settings.py
CHANGED
|
@@ -94,6 +94,12 @@ class Settings(BaseSettings):
|
|
|
94
94
|
validation_alias="OPENAI_API_KEY",
|
|
95
95
|
)
|
|
96
96
|
|
|
97
|
+
gemini_api_key: str | None = Field(
|
|
98
|
+
default=None,
|
|
99
|
+
description="API key for Google Gemini models",
|
|
100
|
+
validation_alias="GEMINI_API_KEY",
|
|
101
|
+
)
|
|
102
|
+
|
|
97
103
|
openrouter_api_key: str | None = Field(
|
|
98
104
|
default=None,
|
|
99
105
|
description="API key for OpenRouter models",
|
hud/tools/__init__.py
CHANGED
|
@@ -12,7 +12,12 @@ from .response import ResponseTool
|
|
|
12
12
|
from .submit import SubmitTool
|
|
13
13
|
|
|
14
14
|
if TYPE_CHECKING:
|
|
15
|
-
from .computer import AnthropicComputerTool, HudComputerTool, OpenAIComputerTool
|
|
15
|
+
from .computer import (
|
|
16
|
+
AnthropicComputerTool,
|
|
17
|
+
GeminiComputerTool,
|
|
18
|
+
HudComputerTool,
|
|
19
|
+
OpenAIComputerTool,
|
|
20
|
+
)
|
|
16
21
|
|
|
17
22
|
__all__ = [
|
|
18
23
|
"AnthropicComputerTool",
|
|
@@ -20,6 +25,7 @@ __all__ = [
|
|
|
20
25
|
"BaseTool",
|
|
21
26
|
"BashTool",
|
|
22
27
|
"EditTool",
|
|
28
|
+
"GeminiComputerTool",
|
|
23
29
|
"HudComputerTool",
|
|
24
30
|
"OpenAIComputerTool",
|
|
25
31
|
"PlaywrightTool",
|
|
@@ -30,7 +36,12 @@ __all__ = [
|
|
|
30
36
|
|
|
31
37
|
def __getattr__(name: str) -> Any:
|
|
32
38
|
"""Lazy import computer tools to avoid importing pyautogui unless needed."""
|
|
33
|
-
if name in ("AnthropicComputerTool", "HudComputerTool", "OpenAIComputerTool"):
|
|
39
|
+
if name in (
|
|
40
|
+
"AnthropicComputerTool",
|
|
41
|
+
"HudComputerTool",
|
|
42
|
+
"OpenAIComputerTool",
|
|
43
|
+
"GeminiComputerTool",
|
|
44
|
+
):
|
|
34
45
|
from . import computer
|
|
35
46
|
|
|
36
47
|
return getattr(computer, name)
|
hud/tools/computer/__init__.py
CHANGED
|
@@ -3,6 +3,7 @@
|
|
|
3
3
|
from __future__ import annotations
|
|
4
4
|
|
|
5
5
|
from .anthropic import AnthropicComputerTool
|
|
6
|
+
from .gemini import GeminiComputerTool
|
|
6
7
|
from .hud import HudComputerTool
|
|
7
8
|
from .openai import OpenAIComputerTool
|
|
8
9
|
from .qwen import QwenComputerTool
|
|
@@ -10,6 +11,7 @@ from .settings import computer_settings
|
|
|
10
11
|
|
|
11
12
|
__all__ = [
|
|
12
13
|
"AnthropicComputerTool",
|
|
14
|
+
"GeminiComputerTool",
|
|
13
15
|
"HudComputerTool",
|
|
14
16
|
"OpenAIComputerTool",
|
|
15
17
|
"QwenComputerTool",
|
hud/tools/computer/gemini.py
ADDED
|
@@ -0,0 +1,385 @@
|
|
|
1
|
+
from __future__ import annotations
|
|
2
|
+
|
|
3
|
+
import logging
|
|
4
|
+
import platform
|
|
5
|
+
from typing import TYPE_CHECKING, Any, Literal
|
|
6
|
+
|
|
7
|
+
from mcp import ErrorData, McpError
|
|
8
|
+
from mcp.types import INVALID_PARAMS, ContentBlock
|
|
9
|
+
from pydantic import Field
|
|
10
|
+
|
|
11
|
+
from hud.tools.types import ContentResult
|
|
12
|
+
|
|
13
|
+
from .hud import HudComputerTool
|
|
14
|
+
from .settings import computer_settings
|
|
15
|
+
|
|
16
|
+
if TYPE_CHECKING:
|
|
17
|
+
from hud.tools.executors.base import BaseExecutor
|
|
18
|
+
|
|
19
|
+
logger = logging.getLogger(__name__)
|
|
20
|
+
|
|
21
|
+
|
|
22
|
+
ACTION_FIELD = Field(..., description="Gemini Computer Use action to perform")
|
|
23
|
+
X_FIELD = Field(None, description="X coordinate (pixels in agent space)")
|
|
24
|
+
Y_FIELD = Field(None, description="Y coordinate (pixels in agent space)")
|
|
25
|
+
TEXT_FIELD = Field(None, description="Text to type")
|
|
26
|
+
PRESS_ENTER_FIELD = Field(None, description="Whether to press Enter after typing (type_text_at)")
|
|
27
|
+
CLEAR_BEFORE_TYPING_FIELD = Field(
|
|
28
|
+
None, description="Whether to select-all before typing (type_text_at)"
|
|
29
|
+
)
|
|
30
|
+
DIRECTION_FIELD = Field(None, description="Scroll direction for scroll_document/scroll_at")
|
|
31
|
+
MAGNITUDE_FIELD = Field(None, description="Scroll magnitude (pixels in agent space)")
|
|
32
|
+
URL_FIELD = Field(None, description="Target URL for navigate")
|
|
33
|
+
KEYS_FIELD = Field(None, description="Keys for key_combination")
|
|
34
|
+
DESTINATION_X_FIELD = Field(None, description="Destination X for drag_and_drop (agent space)")
|
|
35
|
+
DESTINATION_Y_FIELD = Field(None, description="Destination Y for drag_and_drop (agent space)")
|
|
36
|
+
TAKE_SCREENSHOT_ON_CLICK_FIELD = Field(
|
|
37
|
+
True, description="Whether to include a screenshot for interactive actions"
|
|
38
|
+
)
|
|
39
|
+
|
|
40
|
+
|
|
41
|
+
class GeminiComputerTool(HudComputerTool):
|
|
42
|
+
"""
|
|
43
|
+
Gemini Computer Use tool for interacting with a computer via MCP.
|
|
44
|
+
|
|
45
|
+
Maps Gemini's predefined function names (open_web_browser, click_at, hover_at,
|
|
46
|
+
type_text_at, scroll_document, scroll_at, wait_5_seconds, go_back, go_forward,
|
|
47
|
+
search, navigate, key_combination, drag_and_drop) to executor actions.
|
|
48
|
+
"""
|
|
49
|
+
|
|
50
|
+
def __init__(
|
|
51
|
+
self,
|
|
52
|
+
# Define within environment based on platform
|
|
53
|
+
executor: BaseExecutor | None = None,
|
|
54
|
+
platform_type: Literal["auto", "xdo", "pyautogui"] = "auto",
|
|
55
|
+
display_num: int | None = None,
|
|
56
|
+
# Overrides for what dimensions the agent thinks it operates in
|
|
57
|
+
width: int = computer_settings.GEMINI_COMPUTER_WIDTH,
|
|
58
|
+
height: int = computer_settings.GEMINI_COMPUTER_HEIGHT,
|
|
59
|
+
rescale_images: bool = computer_settings.GEMINI_RESCALE_IMAGES,
|
|
60
|
+
# What the agent sees as the tool's name, title, and description
|
|
61
|
+
name: str | None = None,
|
|
62
|
+
title: str | None = None,
|
|
63
|
+
description: str | None = None,
|
|
64
|
+
**kwargs: Any,
|
|
65
|
+
) -> None:
|
|
66
|
+
"""
|
|
67
|
+
Initialize with Gemini's default dimensions.
|
|
68
|
+
"""
|
|
69
|
+
super().__init__(
|
|
70
|
+
executor=executor,
|
|
71
|
+
platform_type=platform_type,
|
|
72
|
+
display_num=display_num,
|
|
73
|
+
width=width,
|
|
74
|
+
height=height,
|
|
75
|
+
rescale_images=rescale_images,
|
|
76
|
+
name=name or "gemini_computer",
|
|
77
|
+
title=title or "Gemini Computer Tool",
|
|
78
|
+
description=description or "Control computer with mouse, keyboard, and screenshots",
|
|
79
|
+
**kwargs,
|
|
80
|
+
)
|
|
81
|
+
|
|
82
|
+
async def __call__(
|
|
83
|
+
self,
|
|
84
|
+
action: str = ACTION_FIELD,
|
|
85
|
+
# Common coordinates
|
|
86
|
+
x: int | None = X_FIELD,
|
|
87
|
+
y: int | None = Y_FIELD,
|
|
88
|
+
# Text input
|
|
89
|
+
text: str | None = TEXT_FIELD,
|
|
90
|
+
press_enter: bool | None = PRESS_ENTER_FIELD,
|
|
91
|
+
clear_before_typing: bool | None = CLEAR_BEFORE_TYPING_FIELD,
|
|
92
|
+
# Scroll parameters
|
|
93
|
+
direction: Literal["up", "down", "left", "right"] | None = DIRECTION_FIELD,
|
|
94
|
+
magnitude: int | None = MAGNITUDE_FIELD,
|
|
95
|
+
# Navigation
|
|
96
|
+
url: str | None = URL_FIELD,
|
|
97
|
+
# Key combos
|
|
98
|
+
keys: list[str] | str | None = KEYS_FIELD,
|
|
99
|
+
# Drag parameters
|
|
100
|
+
destination_x: int | None = DESTINATION_X_FIELD,
|
|
101
|
+
destination_y: int | None = DESTINATION_Y_FIELD,
|
|
102
|
+
# Behavior
|
|
103
|
+
take_screenshot_on_click: bool = TAKE_SCREENSHOT_ON_CLICK_FIELD,
|
|
104
|
+
) -> list[ContentBlock]:
|
|
105
|
+
"""
|
|
106
|
+
Handle Gemini Computer Use API calls by mapping to executor actions.
|
|
107
|
+
|
|
108
|
+
Returns:
|
|
109
|
+
List of MCP content blocks
|
|
110
|
+
"""
|
|
111
|
+
logger.info("GeminiComputerTool received action: %s", action)
|
|
112
|
+
|
|
113
|
+
# Helper to finalize ContentResult: rescale if requested and ensure URL metadata
|
|
114
|
+
async def _finalize(
|
|
115
|
+
result: ContentResult, requested_url: str | None = None
|
|
116
|
+
) -> list[ContentBlock]:
|
|
117
|
+
if result.base64_image and self.rescale_images:
|
|
118
|
+
try:
|
|
119
|
+
result.base64_image = await self._rescale_screenshot(result.base64_image)
|
|
120
|
+
except Exception as e:
|
|
121
|
+
logger.warning("Failed to rescale screenshot: %s", e)
|
|
122
|
+
# Always include URL metadata if provided; otherwise default to about:blank
|
|
123
|
+
result.url = requested_url or result.url or "about:blank"
|
|
124
|
+
return result.to_content_blocks()
|
|
125
|
+
|
|
126
|
+
# Scale coordinates helper
|
|
127
|
+
def _scale(xv: int | None, yv: int | None) -> tuple[int | None, int | None]:
|
|
128
|
+
return self._scale_coordinates(xv, yv)
|
|
129
|
+
|
|
130
|
+
# Gemini emits coordinates/magnitudes in a 0-1000 normalized space.
|
|
131
|
+
def _denormalize(value: float | None, axis: Literal["x", "y"]) -> int | None:
|
|
132
|
+
if value is None:
|
|
133
|
+
return None
|
|
134
|
+
try:
|
|
135
|
+
numeric = float(value)
|
|
136
|
+
except (TypeError, ValueError):
|
|
137
|
+
try:
|
|
138
|
+
return int(value) # type: ignore[arg-type]
|
|
139
|
+
except (TypeError, ValueError):
|
|
140
|
+
return None
|
|
141
|
+
|
|
142
|
+
# Treat values within the normalized range (including defaults like 800).
|
|
143
|
+
if 0 <= numeric <= 1000:
|
|
144
|
+
target = self.width if axis == "x" else self.height
|
|
145
|
+
numeric = numeric / 1000 * target
|
|
146
|
+
|
|
147
|
+
return round(numeric)
|
|
148
|
+
|
|
149
|
+
def _scale_distance(value: int | None, axis: Literal["x", "y"]) -> int | None:
|
|
150
|
+
if value is None:
|
|
151
|
+
return None
|
|
152
|
+
scale = self.scale_x if axis == "x" else self.scale_y
|
|
153
|
+
if scale != 1.0:
|
|
154
|
+
return round(value / scale)
|
|
155
|
+
return value
|
|
156
|
+
|
|
157
|
+
# Map actions
|
|
158
|
+
if action == "open_web_browser":
|
|
159
|
+
screenshot = await self.executor.screenshot()
|
|
160
|
+
if screenshot:
|
|
161
|
+
result = ContentResult(base64_image=screenshot, url="about:blank")
|
|
162
|
+
else:
|
|
163
|
+
result = ContentResult(error="Failed to take screenshot", url="about:blank")
|
|
164
|
+
return await _finalize(result)
|
|
165
|
+
|
|
166
|
+
elif action == "click_at":
|
|
167
|
+
if x is None or y is None:
|
|
168
|
+
raise McpError(ErrorData(code=INVALID_PARAMS, message="x and y are required"))
|
|
169
|
+
dx = _denormalize(x, "x")
|
|
170
|
+
dy = _denormalize(y, "y")
|
|
171
|
+
sx, sy = _scale(dx, dy)
|
|
172
|
+
result = await self.executor.click(x=sx, y=sy)
|
|
173
|
+
return await _finalize(result)
|
|
174
|
+
|
|
175
|
+
elif action == "hover_at":
|
|
176
|
+
if x is None or y is None:
|
|
177
|
+
raise McpError(ErrorData(code=INVALID_PARAMS, message="x and y are required"))
|
|
178
|
+
dx = _denormalize(x, "x")
|
|
179
|
+
dy = _denormalize(y, "y")
|
|
180
|
+
sx, sy = _scale(dx, dy)
|
|
181
|
+
result = await self.executor.move(x=sx, y=sy)
|
|
182
|
+
return await _finalize(result)
|
|
183
|
+
|
|
184
|
+
elif action == "type_text_at":
|
|
185
|
+
if x is None or y is None:
|
|
186
|
+
raise McpError(ErrorData(code=INVALID_PARAMS, message="x and y are required"))
|
|
187
|
+
if text is None:
|
|
188
|
+
raise McpError(ErrorData(code=INVALID_PARAMS, message="text is required"))
|
|
189
|
+
|
|
190
|
+
dx = _denormalize(x, "x")
|
|
191
|
+
dy = _denormalize(y, "y")
|
|
192
|
+
sx, sy = _scale(dx, dy)
|
|
193
|
+
|
|
194
|
+
# Focus the field
|
|
195
|
+
await self.executor.move(x=sx, y=sy, take_screenshot=False)
|
|
196
|
+
await self.executor.click(x=sx, y=sy, take_screenshot=False)
|
|
197
|
+
|
|
198
|
+
# Clear existing text if requested
|
|
199
|
+
if clear_before_typing is None or clear_before_typing:
|
|
200
|
+
is_mac = platform.system().lower() == "darwin"
|
|
201
|
+
combo = ["cmd", "a"] if is_mac else ["ctrl", "a"]
|
|
202
|
+
await self.executor.press(keys=combo, take_screenshot=False)
|
|
203
|
+
delete_key = "backspace" if is_mac else "delete"
|
|
204
|
+
await self.executor.press(keys=[delete_key], take_screenshot=False)
|
|
205
|
+
|
|
206
|
+
# Type (optionally press enter after)
|
|
207
|
+
result = await self.executor.write(text=text, enter_after=bool(press_enter))
|
|
208
|
+
return await _finalize(result)
|
|
209
|
+
|
|
210
|
+
elif action == "scroll_document":
|
|
211
|
+
if direction is None:
|
|
212
|
+
raise McpError(ErrorData(code=INVALID_PARAMS, message="direction is required"))
|
|
213
|
+
# Default magnitude similar to reference implementation
|
|
214
|
+
mag = magnitude if magnitude is not None else 800
|
|
215
|
+
# Convert to environment units while preserving sign
|
|
216
|
+
if direction in ("down", "up"):
|
|
217
|
+
distance = _denormalize(mag, "y")
|
|
218
|
+
if distance is None:
|
|
219
|
+
raise McpError(
|
|
220
|
+
ErrorData(
|
|
221
|
+
code=INVALID_PARAMS, message="Unable to determine scroll magnitude"
|
|
222
|
+
)
|
|
223
|
+
)
|
|
224
|
+
distance = _scale_distance(distance, "y")
|
|
225
|
+
if distance is None:
|
|
226
|
+
raise McpError(
|
|
227
|
+
ErrorData(
|
|
228
|
+
code=INVALID_PARAMS,
|
|
229
|
+
message="Unable to determine scroll magnitude",
|
|
230
|
+
)
|
|
231
|
+
)
|
|
232
|
+
scroll_y = distance if direction == "down" else -distance
|
|
233
|
+
scroll_x = None
|
|
234
|
+
elif direction in ("right", "left"):
|
|
235
|
+
distance = _denormalize(mag, "x")
|
|
236
|
+
if distance is None:
|
|
237
|
+
raise McpError(
|
|
238
|
+
ErrorData(
|
|
239
|
+
code=INVALID_PARAMS, message="Unable to determine scroll magnitude"
|
|
240
|
+
)
|
|
241
|
+
)
|
|
242
|
+
distance = _scale_distance(distance, "x")
|
|
243
|
+
if distance is None:
|
|
244
|
+
raise McpError(
|
|
245
|
+
ErrorData(
|
|
246
|
+
code=INVALID_PARAMS,
|
|
247
|
+
message="Unable to determine scroll magnitude",
|
|
248
|
+
)
|
|
249
|
+
)
|
|
250
|
+
scroll_x = distance if direction == "right" else -distance
|
|
251
|
+
scroll_y = None
|
|
252
|
+
else:
|
|
253
|
+
raise McpError(
|
|
254
|
+
ErrorData(code=INVALID_PARAMS, message=f"Invalid direction: {direction}")
|
|
255
|
+
)
|
|
256
|
+
result = await self.executor.scroll(scroll_x=scroll_x, scroll_y=scroll_y)
|
|
257
|
+
return await _finalize(result)
|
|
258
|
+
|
|
259
|
+
elif action == "scroll_at":
|
|
260
|
+
if direction is None:
|
|
261
|
+
raise McpError(ErrorData(code=INVALID_PARAMS, message="direction is required"))
|
|
262
|
+
if x is None or y is None:
|
|
263
|
+
raise McpError(ErrorData(code=INVALID_PARAMS, message="x and y are required"))
|
|
264
|
+
mag = magnitude if magnitude is not None else 800
|
|
265
|
+
dx = _denormalize(x, "x")
|
|
266
|
+
dy = _denormalize(y, "y")
|
|
267
|
+
sx, sy = _scale(dx, dy)
|
|
268
|
+
if direction in ("down", "up"):
|
|
269
|
+
distance = _denormalize(mag, "y")
|
|
270
|
+
if distance is None:
|
|
271
|
+
raise McpError(
|
|
272
|
+
ErrorData(
|
|
273
|
+
code=INVALID_PARAMS, message="Unable to determine scroll magnitude"
|
|
274
|
+
)
|
|
275
|
+
)
|
|
276
|
+
distance = _scale_distance(distance, "y")
|
|
277
|
+
if distance is None:
|
|
278
|
+
raise McpError(
|
|
279
|
+
ErrorData(
|
|
280
|
+
code=INVALID_PARAMS,
|
|
281
|
+
message="Unable to determine scroll magnitude",
|
|
282
|
+
)
|
|
283
|
+
)
|
|
284
|
+
scroll_y = distance if direction == "down" else -distance
|
|
285
|
+
scroll_x = None
|
|
286
|
+
elif direction in ("right", "left"):
|
|
287
|
+
distance = _denormalize(mag, "x")
|
|
288
|
+
if distance is None:
|
|
289
|
+
raise McpError(
|
|
290
|
+
ErrorData(
|
|
291
|
+
code=INVALID_PARAMS, message="Unable to determine scroll magnitude"
|
|
292
|
+
)
|
|
293
|
+
)
|
|
294
|
+
distance = _scale_distance(distance, "x")
|
|
295
|
+
if distance is None:
|
|
296
|
+
raise McpError(
|
|
297
|
+
ErrorData(
|
|
298
|
+
code=INVALID_PARAMS,
|
|
299
|
+
message="Unable to determine scroll magnitude",
|
|
300
|
+
)
|
|
301
|
+
)
|
|
302
|
+
scroll_x = distance if direction == "right" else -distance
|
|
303
|
+
scroll_y = None
|
|
304
|
+
else:
|
|
305
|
+
raise McpError(
|
|
306
|
+
ErrorData(code=INVALID_PARAMS, message=f"Invalid direction: {direction}")
|
|
307
|
+
)
|
|
308
|
+
result = await self.executor.scroll(x=sx, y=sy, scroll_x=scroll_x, scroll_y=scroll_y)
|
|
309
|
+
return await _finalize(result)
|
|
310
|
+
|
|
311
|
+
elif action == "wait_5_seconds":
|
|
312
|
+
result = await self.executor.wait(time=5000)
|
|
313
|
+
return await _finalize(result)
|
|
314
|
+
|
|
315
|
+
elif action == "go_back":
|
|
316
|
+
is_mac = platform.system().lower() == "darwin"
|
|
317
|
+
combo = ["cmd", "["] if is_mac else ["alt", "left"]
|
|
318
|
+
result = await self.executor.press(keys=combo)
|
|
319
|
+
return await _finalize(result)
|
|
320
|
+
|
|
321
|
+
elif action == "go_forward":
|
|
322
|
+
is_mac = platform.system().lower() == "darwin"
|
|
323
|
+
combo = ["cmd", "]"] if is_mac else ["alt", "right"]
|
|
324
|
+
result = await self.executor.press(keys=combo)
|
|
325
|
+
return await _finalize(result)
|
|
326
|
+
|
|
327
|
+
elif action == "search":
|
|
328
|
+
# Best-effort navigate to a default search page
|
|
329
|
+
target = url or "https://www.google.com"
|
|
330
|
+
is_mac = platform.system().lower() == "darwin"
|
|
331
|
+
await self.executor.press(
|
|
332
|
+
keys=["cmd", "l"] if is_mac else ["ctrl", "l"], take_screenshot=False
|
|
333
|
+
)
|
|
334
|
+
result = await self.executor.write(text=target, enter_after=True)
|
|
335
|
+
return await _finalize(result, requested_url=target)
|
|
336
|
+
|
|
337
|
+
elif action == "navigate":
|
|
338
|
+
if not url:
|
|
339
|
+
raise McpError(ErrorData(code=INVALID_PARAMS, message="url is required"))
|
|
340
|
+
is_mac = platform.system().lower() == "darwin"
|
|
341
|
+
await self.executor.press(
|
|
342
|
+
keys=["cmd", "l"] if is_mac else ["ctrl", "l"], take_screenshot=False
|
|
343
|
+
)
|
|
344
|
+
result = await self.executor.write(text=url, enter_after=True)
|
|
345
|
+
return await _finalize(result, requested_url=url)
|
|
346
|
+
|
|
347
|
+
elif action == "key_combination":
|
|
348
|
+
if keys is None:
|
|
349
|
+
raise McpError(ErrorData(code=INVALID_PARAMS, message="keys is required"))
|
|
350
|
+
if isinstance(keys, str):
|
|
351
|
+
# Accept formats like "ctrl+c" or "ctrl+shift+t"
|
|
352
|
+
key_list = [k.strip() for k in keys.split("+") if k.strip()]
|
|
353
|
+
else:
|
|
354
|
+
key_list = keys
|
|
355
|
+
result = await self.executor.press(keys=key_list)
|
|
356
|
+
return await _finalize(result)
|
|
357
|
+
|
|
358
|
+
elif action == "drag_and_drop":
|
|
359
|
+
if x is None or y is None or destination_x is None or destination_y is None:
|
|
360
|
+
raise McpError(
|
|
361
|
+
ErrorData(
|
|
362
|
+
code=INVALID_PARAMS,
|
|
363
|
+
message="x, y, destination_x, and destination_y are required",
|
|
364
|
+
)
|
|
365
|
+
)
|
|
366
|
+
sx_norm = _denormalize(x, "x")
|
|
367
|
+
sy_norm = _denormalize(y, "y")
|
|
368
|
+
dx_norm = _denormalize(destination_x, "x")
|
|
369
|
+
dy_norm = _denormalize(destination_y, "y")
|
|
370
|
+
sx, sy = _scale(sx_norm, sy_norm)
|
|
371
|
+
dx_scaled, dy_scaled = _scale(dx_norm, dy_norm)
|
|
372
|
+
# Build a two-point path
|
|
373
|
+
path = [] # type: list[tuple[int, int]]
|
|
374
|
+
if (
|
|
375
|
+
sx is not None
|
|
376
|
+
and sy is not None
|
|
377
|
+
and dx_scaled is not None
|
|
378
|
+
and dy_scaled is not None
|
|
379
|
+
):
|
|
380
|
+
path = [(sx, sy), (dx_scaled, dy_scaled)]
|
|
381
|
+
result = await self.executor.drag(path=path)
|
|
382
|
+
return await _finalize(result)
|
|
383
|
+
|
|
384
|
+
else:
|
|
385
|
+
raise McpError(ErrorData(code=INVALID_PARAMS, message=f"Unknown action: {action}"))
|
hud/tools/computer/settings.py
CHANGED
|
@@ -94,5 +94,26 @@ class ComputerSettings(BaseSettings):
|
|
|
94
94
|
validation_alias="QWEN_RESCALE_IMAGES",
|
|
95
95
|
)
|
|
96
96
|
|
|
97
|
+
GEMINI_COMPUTER_WIDTH: int = Field(
|
|
98
|
+
default=1440,
|
|
99
|
+
description="Width of the display to use for the Gemini computer tools",
|
|
100
|
+
validation_alias="GEMINI_COMPUTER_WIDTH",
|
|
101
|
+
)
|
|
102
|
+
GEMINI_COMPUTER_HEIGHT: int = Field(
|
|
103
|
+
default=900,
|
|
104
|
+
description="Height of the display to use for the Gemini computer tools",
|
|
105
|
+
validation_alias="GEMINI_COMPUTER_HEIGHT",
|
|
106
|
+
)
|
|
107
|
+
GEMINI_RESCALE_IMAGES: bool = Field(
|
|
108
|
+
default=True,
|
|
109
|
+
description="Whether to rescale images to the agent width and height",
|
|
110
|
+
validation_alias="GEMINI_RESCALE_IMAGES",
|
|
111
|
+
)
|
|
112
|
+
GEMINI_MAX_RECENT_TURN_WITH_SCREENSHOTS: int = Field(
|
|
113
|
+
default=3,
|
|
114
|
+
description="Maximum number of recent turns to keep screenshots for in Gemini agent",
|
|
115
|
+
validation_alias="GEMINI_MAX_RECENT_TURN_WITH_SCREENSHOTS",
|
|
116
|
+
)
|
|
117
|
+
|
|
97
118
|
|
|
98
119
|
computer_settings = ComputerSettings()
|
hud/tools/playwright.py
CHANGED
|
@@ -84,6 +84,9 @@ class PlaywrightTool(BaseTool):
|
|
|
84
84
|
code=INVALID_PARAMS, message="url parameter is required for navigate"
|
|
85
85
|
)
|
|
86
86
|
)
|
|
87
|
+
# Guard against pydantic FieldInfo default leaking through
|
|
88
|
+
if not isinstance(wait_for_load_state, str):
|
|
89
|
+
wait_for_load_state = None
|
|
87
90
|
result = await self.navigate(url, wait_for_load_state or "networkidle")
|
|
88
91
|
|
|
89
92
|
elif action == "screenshot":
|
|
@@ -179,11 +182,16 @@ class PlaywrightTool(BaseTool):
|
|
|
179
182
|
if self._browser is None:
|
|
180
183
|
raise RuntimeError("Failed to connect to remote browser")
|
|
181
184
|
|
|
182
|
-
#
|
|
185
|
+
# Reuse existing context and page where possible to avoid spawning new windows
|
|
183
186
|
contexts = self._browser.contexts
|
|
184
187
|
if contexts:
|
|
185
188
|
self._browser_context = contexts[0]
|
|
189
|
+
# Prefer the first existing page to keep using the already visible window/tab
|
|
190
|
+
existing_pages = self._browser_context.pages
|
|
191
|
+
if existing_pages:
|
|
192
|
+
self.page = existing_pages[0]
|
|
186
193
|
else:
|
|
194
|
+
# As a fallback, create a new context
|
|
187
195
|
self._browser_context = await self._browser.new_context(
|
|
188
196
|
viewport={"width": 1920, "height": 1080},
|
|
189
197
|
ignore_https_errors=True,
|
hud/tools/types.py
CHANGED
|
@@ -28,6 +28,7 @@ class ContentResult(BaseModel):
|
|
|
28
28
|
error: str | None = Field(default=None, description="Error message")
|
|
29
29
|
base64_image: str | None = Field(default=None, description="Base64-encoded image")
|
|
30
30
|
system: str | None = Field(default=None, description="System message")
|
|
31
|
+
url: str | None = Field(default=None, description="Current page URL (for browser automation)")
|
|
31
32
|
|
|
32
33
|
def __add__(self, other: ContentResult) -> ContentResult:
|
|
33
34
|
def combine_fields(
|
|
@@ -44,6 +45,7 @@ class ContentResult(BaseModel):
|
|
|
44
45
|
error=combine_fields(self.error, other.error),
|
|
45
46
|
base64_image=combine_fields(self.base64_image, other.base64_image, False),
|
|
46
47
|
system=combine_fields(self.system, other.system),
|
|
48
|
+
url=combine_fields(self.url, other.url, False),
|
|
47
49
|
)
|
|
48
50
|
|
|
49
51
|
def to_content_blocks(self) -> list[ContentBlock]:
|
|
@@ -55,7 +57,7 @@ class ContentResult(BaseModel):
|
|
|
55
57
|
result: ContentResult to convert
|
|
56
58
|
|
|
57
59
|
Returns:
|
|
58
|
-
List of ContentBlock
|
|
60
|
+
List of ContentBlock with URL embedded as metadata if available
|
|
59
61
|
"""
|
|
60
62
|
blocks: list[ContentBlock] = []
|
|
61
63
|
|
|
@@ -65,6 +67,12 @@ class ContentResult(BaseModel):
|
|
|
65
67
|
blocks.append(TextContent(text=self.error, type="text"))
|
|
66
68
|
if self.base64_image:
|
|
67
69
|
blocks.append(ImageContent(data=self.base64_image, mimeType="image/png", type="image"))
|
|
70
|
+
|
|
71
|
+
# Add URL as a special metadata text block (for Gemini Computer Use)
|
|
72
|
+
# Always include URL if set, even if it's a placeholder like "about:blank"
|
|
73
|
+
if self.url:
|
|
74
|
+
blocks.append(TextContent(text=f"__URL__:{self.url}", type="text"))
|
|
75
|
+
|
|
68
76
|
return blocks
|
|
69
77
|
|
|
70
78
|
|
hud/types.py
CHANGED
hud/utils/tests/test_version.py
CHANGED
hud/version.py
CHANGED
{hud_python-0.4.58.dist-info → hud_python-0.4.59.dist-info}/METADATA
CHANGED
|
@@ -1,6 +1,6 @@
|
|
|
1
1
|
Metadata-Version: 2.4
|
|
2
2
|
Name: hud-python
|
|
3
|
-
Version: 0.4.58
|
|
3
|
+
Version: 0.4.59
|
|
4
4
|
Summary: SDK for the HUD platform.
|
|
5
5
|
Project-URL: Homepage, https://github.com/hud-evals/hud-python
|
|
6
6
|
Project-URL: Bug Tracker, https://github.com/hud-evals/hud-python/issues
|
|
@@ -38,6 +38,7 @@ Requires-Python: <3.13,>=3.11
|
|
|
38
38
|
Requires-Dist: anthropic
|
|
39
39
|
Requires-Dist: blessed>=1.20.0
|
|
40
40
|
Requires-Dist: datasets>=2.14.0
|
|
41
|
+
Requires-Dist: google-genai
|
|
41
42
|
Requires-Dist: httpx<1,>=0.23.0
|
|
42
43
|
Requires-Dist: hud-fastmcp-python-sdk>=0.1.2
|
|
43
44
|
Requires-Dist: hud-mcp-python-sdk>=3.13.2
|