hud-python 0.4.57__py3-none-any.whl → 0.4.59__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release.


This version of hud-python might be problematic. Click here for more details.

@@ -0,0 +1,385 @@
1
+ from __future__ import annotations
2
+
3
+ import logging
4
+ import platform
5
+ from typing import TYPE_CHECKING, Any, Literal
6
+
7
+ from mcp import ErrorData, McpError
8
+ from mcp.types import INVALID_PARAMS, ContentBlock
9
+ from pydantic import Field
10
+
11
+ from hud.tools.types import ContentResult
12
+
13
+ from .hud import HudComputerTool
14
+ from .settings import computer_settings
15
+
16
+ if TYPE_CHECKING:
17
+ from hud.tools.executors.base import BaseExecutor
18
+
19
+ logger = logging.getLogger(__name__)
20
+
21
+
22
# Shared pydantic Field descriptors for GeminiComputerTool.__call__ parameters.
# They are declared once at module level and used as parameter defaults so the
# generated tool schema carries a description for every argument.

# The action name is the only required argument (Ellipsis marks it required).
ACTION_FIELD = Field(default=..., description="Gemini Computer Use action to perform")

# Pointer position.
X_FIELD = Field(default=None, description="X coordinate (pixels in agent space)")
Y_FIELD = Field(default=None, description="Y coordinate (pixels in agent space)")

# Text entry (type_text_at).
TEXT_FIELD = Field(default=None, description="Text to type")
PRESS_ENTER_FIELD = Field(
    default=None,
    description="Whether to press Enter after typing (type_text_at)",
)
CLEAR_BEFORE_TYPING_FIELD = Field(
    default=None,
    description="Whether to select-all before typing (type_text_at)",
)

# Scrolling (scroll_document / scroll_at).
DIRECTION_FIELD = Field(
    default=None,
    description="Scroll direction for scroll_document/scroll_at",
)
MAGNITUDE_FIELD = Field(default=None, description="Scroll magnitude (pixels in agent space)")

# Navigation and keyboard shortcuts.
URL_FIELD = Field(default=None, description="Target URL for navigate")
KEYS_FIELD = Field(default=None, description="Keys for key_combination")

# Drag target (drag_and_drop).
DESTINATION_X_FIELD = Field(
    default=None,
    description="Destination X for drag_and_drop (agent space)",
)
DESTINATION_Y_FIELD = Field(
    default=None,
    description="Destination Y for drag_and_drop (agent space)",
)

# Behaviour toggle; defaults to True.
TAKE_SCREENSHOT_ON_CLICK_FIELD = Field(
    default=True,
    description="Whether to include a screenshot for interactive actions",
)
39
+
40
+
41
class GeminiComputerTool(HudComputerTool):
    """
    Gemini Computer Use tool for interacting with a computer via MCP.

    Maps Gemini's predefined function names (open_web_browser, click_at, hover_at,
    type_text_at, scroll_document, scroll_at, wait_5_seconds, go_back, go_forward,
    search, navigate, key_combination, drag_and_drop) to executor actions.

    Coordinates and scroll magnitudes in the inclusive range 0-1000 are treated
    as Gemini's normalized space and mapped onto this tool's ``width``/``height``
    before the base tool's own scaling is applied; values outside that range are
    passed through as-is.
    """

    def __init__(
        self,
        # Define within environment based on platform
        executor: BaseExecutor | None = None,
        platform_type: Literal["auto", "xdo", "pyautogui"] = "auto",
        display_num: int | None = None,
        # Overrides for what dimensions the agent thinks it operates in
        width: int = computer_settings.GEMINI_COMPUTER_WIDTH,
        height: int = computer_settings.GEMINI_COMPUTER_HEIGHT,
        rescale_images: bool = computer_settings.GEMINI_RESCALE_IMAGES,
        # What the agent sees as the tool's name, title, and description
        name: str | None = None,
        title: str | None = None,
        description: str | None = None,
        **kwargs: Any,
    ) -> None:
        """
        Initialize with Gemini's default dimensions.

        All arguments are forwarded to ``HudComputerTool.__init__``; ``name``,
        ``title`` and ``description`` fall back to Gemini-specific defaults
        when not supplied (or supplied as empty strings).
        """
        super().__init__(
            executor=executor,
            platform_type=platform_type,
            display_num=display_num,
            width=width,
            height=height,
            rescale_images=rescale_images,
            name=name or "gemini_computer",
            title=title or "Gemini Computer Tool",
            description=description or "Control computer with mouse, keyboard, and screenshots",
            **kwargs,
        )

    async def __call__(
        self,
        action: str = ACTION_FIELD,
        # Common coordinates
        x: int | None = X_FIELD,
        y: int | None = Y_FIELD,
        # Text input
        text: str | None = TEXT_FIELD,
        press_enter: bool | None = PRESS_ENTER_FIELD,
        clear_before_typing: bool | None = CLEAR_BEFORE_TYPING_FIELD,
        # Scroll parameters
        direction: Literal["up", "down", "left", "right"] | None = DIRECTION_FIELD,
        magnitude: int | None = MAGNITUDE_FIELD,
        # Navigation
        url: str | None = URL_FIELD,
        # Key combos
        keys: list[str] | str | None = KEYS_FIELD,
        # Drag parameters
        destination_x: int | None = DESTINATION_X_FIELD,
        destination_y: int | None = DESTINATION_Y_FIELD,
        # Behavior
        take_screenshot_on_click: bool = TAKE_SCREENSHOT_ON_CLICK_FIELD,
    ) -> list[ContentBlock]:
        """
        Handle Gemini Computer Use API calls by mapping to executor actions.

        Args:
            action: Gemini function name to execute.
            x, y: Pointer position; required by click_at, hover_at, type_text_at,
                scroll_at and drag_and_drop.
            text: Text to type (type_text_at).
            press_enter: Press Enter after typing; ``None`` is treated as False.
            clear_before_typing: Select-all and delete before typing; ``None``
                is treated as True.
            direction: Scroll direction (scroll_document / scroll_at).
            magnitude: Scroll amount; defaults to 800 when omitted.
            url: Target for navigate; optional override for search.
            keys: Key list, or a "+"-separated string such as "ctrl+shift+t".
            destination_x, destination_y: Drop position for drag_and_drop.
            take_screenshot_on_click: Accepted as part of the tool schema.
                NOTE(review): no branch below currently reads this value.

        Returns:
            List of MCP content blocks

        Raises:
            McpError: with INVALID_PARAMS when a required argument is missing,
                a value cannot be interpreted, or the action is unknown.
        """
        logger.info("GeminiComputerTool received action: %s", action)

        def _invalid(message: str) -> McpError:
            # Single place that builds the INVALID_PARAMS error used everywhere below.
            return McpError(ErrorData(code=INVALID_PARAMS, message=message))

        # Helper to finalize ContentResult: rescale if requested and ensure URL metadata
        async def _finalize(
            result: ContentResult, requested_url: str | None = None
        ) -> list[ContentBlock]:
            if result.base64_image and self.rescale_images:
                try:
                    result.base64_image = await self._rescale_screenshot(result.base64_image)
                except Exception as e:
                    # Best effort: fall back to the unscaled screenshot.
                    logger.warning("Failed to rescale screenshot: %s", e)
            # Always include URL metadata if provided; otherwise default to about:blank
            result.url = requested_url or result.url or "about:blank"
            return result.to_content_blocks()

        # Scale coordinates helper
        def _scale(xv: int | None, yv: int | None) -> tuple[int | None, int | None]:
            return self._scale_coordinates(xv, yv)

        # Gemini emits coordinates/magnitudes in a 0-1000 normalized space.
        def _denormalize(value: float | None, axis: Literal["x", "y"]) -> int | None:
            if value is None:
                return None
            try:
                numeric = float(value)
            except (TypeError, ValueError):
                try:
                    return int(value)  # type: ignore[arg-type]
                except (TypeError, ValueError):
                    return None

            # Treat values within the normalized range (including defaults like 800).
            if 0 <= numeric <= 1000:
                target = self.width if axis == "x" else self.height
                numeric = numeric / 1000 * target

            return round(numeric)

        def _scale_distance(value: int | None, axis: Literal["x", "y"]) -> int | None:
            if value is None:
                return None
            scale = self.scale_x if axis == "x" else self.scale_y
            if scale != 1.0:
                return round(value / scale)
            return value

        def _scroll_deltas(
            scroll_direction: str, scroll_magnitude: int | None
        ) -> tuple[int | None, int | None]:
            """Return (scroll_x, scroll_y) for a direction/magnitude pair.

            Shared by scroll_document and scroll_at. Exactly one of the two
            values is set; "down"/"right" are positive, "up"/"left" negative.
            """
            # Default magnitude similar to reference implementation
            mag = scroll_magnitude if scroll_magnitude is not None else 800
            axis: Literal["x", "y"]
            if scroll_direction in ("down", "up"):
                axis = "y"
            elif scroll_direction in ("right", "left"):
                axis = "x"
            else:
                raise _invalid(f"Invalid direction: {scroll_direction}")

            # Convert to environment units while preserving sign
            distance = _scale_distance(_denormalize(mag, axis), axis)
            if distance is None:
                raise _invalid("Unable to determine scroll magnitude")

            if axis == "y":
                return None, (distance if scroll_direction == "down" else -distance)
            return (distance if scroll_direction == "right" else -distance), None

        # Keyboard shortcuts differ between macOS and other platforms; this is
        # evaluated on the machine running the tool.
        is_mac = platform.system().lower() == "darwin"
        primary_modifier = "cmd" if is_mac else "ctrl"

        # Map actions
        if action == "open_web_browser":
            screenshot = await self.executor.screenshot()
            if screenshot:
                result = ContentResult(base64_image=screenshot, url="about:blank")
            else:
                result = ContentResult(error="Failed to take screenshot", url="about:blank")
            return await _finalize(result)

        if action == "click_at":
            if x is None or y is None:
                raise _invalid("x and y are required")
            sx, sy = _scale(_denormalize(x, "x"), _denormalize(y, "y"))
            result = await self.executor.click(x=sx, y=sy)
            return await _finalize(result)

        if action == "hover_at":
            if x is None or y is None:
                raise _invalid("x and y are required")
            sx, sy = _scale(_denormalize(x, "x"), _denormalize(y, "y"))
            result = await self.executor.move(x=sx, y=sy)
            return await _finalize(result)

        if action == "type_text_at":
            if x is None or y is None:
                raise _invalid("x and y are required")
            if text is None:
                raise _invalid("text is required")

            sx, sy = _scale(_denormalize(x, "x"), _denormalize(y, "y"))

            # Focus the field
            await self.executor.move(x=sx, y=sy, take_screenshot=False)
            await self.executor.click(x=sx, y=sy, take_screenshot=False)

            # Clear existing text unless explicitly disabled (None means "clear")
            if clear_before_typing is None or clear_before_typing:
                await self.executor.press(keys=[primary_modifier, "a"], take_screenshot=False)
                delete_key = "backspace" if is_mac else "delete"
                await self.executor.press(keys=[delete_key], take_screenshot=False)

            # Type (optionally press enter after)
            result = await self.executor.write(text=text, enter_after=bool(press_enter))
            return await _finalize(result)

        if action == "scroll_document":
            if direction is None:
                raise _invalid("direction is required")
            scroll_x, scroll_y = _scroll_deltas(direction, magnitude)
            result = await self.executor.scroll(scroll_x=scroll_x, scroll_y=scroll_y)
            return await _finalize(result)

        if action == "scroll_at":
            if direction is None:
                raise _invalid("direction is required")
            if x is None or y is None:
                raise _invalid("x and y are required")
            sx, sy = _scale(_denormalize(x, "x"), _denormalize(y, "y"))
            scroll_x, scroll_y = _scroll_deltas(direction, magnitude)
            result = await self.executor.scroll(x=sx, y=sy, scroll_x=scroll_x, scroll_y=scroll_y)
            return await _finalize(result)

        if action == "wait_5_seconds":
            result = await self.executor.wait(time=5000)
            return await _finalize(result)

        if action == "go_back":
            combo = ["cmd", "["] if is_mac else ["alt", "left"]
            result = await self.executor.press(keys=combo)
            return await _finalize(result)

        if action == "go_forward":
            combo = ["cmd", "]"] if is_mac else ["alt", "right"]
            result = await self.executor.press(keys=combo)
            return await _finalize(result)

        if action == "search":
            # Best-effort navigate to a default search page
            target = url or "https://www.google.com"
            # Send the address-bar shortcut, then type the URL followed by Enter.
            await self.executor.press(keys=[primary_modifier, "l"], take_screenshot=False)
            result = await self.executor.write(text=target, enter_after=True)
            return await _finalize(result, requested_url=target)

        if action == "navigate":
            if not url:
                raise _invalid("url is required")
            await self.executor.press(keys=[primary_modifier, "l"], take_screenshot=False)
            result = await self.executor.write(text=url, enter_after=True)
            return await _finalize(result, requested_url=url)

        if action == "key_combination":
            if keys is None:
                raise _invalid("keys is required")
            if isinstance(keys, str):
                # Accept formats like "ctrl+c" or "ctrl+shift+t"
                key_list = [k.strip() for k in keys.split("+") if k.strip()]
            else:
                key_list = keys
            # An empty combination (e.g. "", "+", or []) is a caller error, not a
            # key press; reject it instead of handing the executor nothing.
            if not key_list:
                raise _invalid("keys must contain at least one key")
            result = await self.executor.press(keys=key_list)
            return await _finalize(result)

        if action == "drag_and_drop":
            if x is None or y is None or destination_x is None or destination_y is None:
                raise _invalid("x, y, destination_x, and destination_y are required")
            sx, sy = _scale(_denormalize(x, "x"), _denormalize(y, "y"))
            dx_scaled, dy_scaled = _scale(
                _denormalize(destination_x, "x"), _denormalize(destination_y, "y")
            )
            # A drag needs both endpoints; fail loudly instead of issuing a
            # drag over an empty path when any coordinate could not be resolved.
            if sx is None or sy is None or dx_scaled is None or dy_scaled is None:
                raise _invalid("Unable to determine drag coordinates")
            # Build a two-point path
            path: list[tuple[int, int]] = [(sx, sy), (dx_scaled, dy_scaled)]
            result = await self.executor.drag(path=path)
            return await _finalize(result)

        raise _invalid(f"Unknown action: {action}")
@@ -94,5 +94,26 @@ class ComputerSettings(BaseSettings):
94
94
  validation_alias="QWEN_RESCALE_IMAGES",
95
95
  )
96
96
 
97
+ GEMINI_COMPUTER_WIDTH: int = Field(
98
+ default=1440,
99
+ description="Width of the display to use for the Gemini computer tools",
100
+ validation_alias="GEMINI_COMPUTER_WIDTH",
101
+ )
102
+ GEMINI_COMPUTER_HEIGHT: int = Field(
103
+ default=900,
104
+ description="Height of the display to use for the Gemini computer tools",
105
+ validation_alias="GEMINI_COMPUTER_HEIGHT",
106
+ )
107
+ GEMINI_RESCALE_IMAGES: bool = Field(
108
+ default=True,
109
+ description="Whether to rescale images to the agent width and height",
110
+ validation_alias="GEMINI_RESCALE_IMAGES",
111
+ )
112
+ GEMINI_MAX_RECENT_TURN_WITH_SCREENSHOTS: int = Field(
113
+ default=3,
114
+ description="Maximum number of recent turns to keep screenshots for in Gemini agent",
115
+ validation_alias="GEMINI_MAX_RECENT_TURN_WITH_SCREENSHOTS",
116
+ )
117
+
97
118
 
98
119
  computer_settings = ComputerSettings()
hud/tools/playwright.py CHANGED
@@ -84,6 +84,9 @@ class PlaywrightTool(BaseTool):
84
84
  code=INVALID_PARAMS, message="url parameter is required for navigate"
85
85
  )
86
86
  )
87
+ # Guard against pydantic FieldInfo default leaking through
88
+ if not isinstance(wait_for_load_state, str):
89
+ wait_for_load_state = None
87
90
  result = await self.navigate(url, wait_for_load_state or "networkidle")
88
91
 
89
92
  elif action == "screenshot":
@@ -179,11 +182,16 @@ class PlaywrightTool(BaseTool):
179
182
  if self._browser is None:
180
183
  raise RuntimeError("Failed to connect to remote browser")
181
184
 
182
- # Use existing context or create new one
185
+ # Reuse existing context and page where possible to avoid spawning new windows
183
186
  contexts = self._browser.contexts
184
187
  if contexts:
185
188
  self._browser_context = contexts[0]
189
+ # Prefer the first existing page to keep using the already visible window/tab
190
+ existing_pages = self._browser_context.pages
191
+ if existing_pages:
192
+ self.page = existing_pages[0]
186
193
  else:
194
+ # As a fallback, create a new context
187
195
  self._browser_context = await self._browser.new_context(
188
196
  viewport={"width": 1920, "height": 1080},
189
197
  ignore_https_errors=True,
@@ -225,7 +233,14 @@ class PlaywrightTool(BaseTool):
225
233
  if self._browser_context is None:
226
234
  raise RuntimeError("Browser context failed to initialize")
227
235
 
228
- self.page = await self._browser_context.new_page()
236
+ # Reuse existing page if available (for CDP connections), otherwise create new one
237
+ pages = self._browser_context.pages
238
+ if pages:
239
+ self.page = pages[0]
240
+ logger.info("Reusing existing browser page")
241
+ else:
242
+ self.page = await self._browser_context.new_page()
243
+ logger.info("Created new browser page")
229
244
  logger.info("Playwright browser launched successfully")
230
245
 
231
246
  async def navigate(
hud/tools/types.py CHANGED
@@ -28,6 +28,7 @@ class ContentResult(BaseModel):
28
28
  error: str | None = Field(default=None, description="Error message")
29
29
  base64_image: str | None = Field(default=None, description="Base64-encoded image")
30
30
  system: str | None = Field(default=None, description="System message")
31
+ url: str | None = Field(default=None, description="Current page URL (for browser automation)")
31
32
 
32
33
  def __add__(self, other: ContentResult) -> ContentResult:
33
34
  def combine_fields(
@@ -44,6 +45,7 @@ class ContentResult(BaseModel):
44
45
  error=combine_fields(self.error, other.error),
45
46
  base64_image=combine_fields(self.base64_image, other.base64_image, False),
46
47
  system=combine_fields(self.system, other.system),
48
+ url=combine_fields(self.url, other.url, False),
47
49
  )
48
50
 
49
51
  def to_content_blocks(self) -> list[ContentBlock]:
@@ -55,7 +57,7 @@ class ContentResult(BaseModel):
55
57
  result: ContentResult to convert
56
58
 
57
59
  Returns:
58
- List of ContentBlock
60
+ List of ContentBlock with URL embedded as metadata if available
59
61
  """
60
62
  blocks: list[ContentBlock] = []
61
63
 
@@ -65,6 +67,12 @@ class ContentResult(BaseModel):
65
67
  blocks.append(TextContent(text=self.error, type="text"))
66
68
  if self.base64_image:
67
69
  blocks.append(ImageContent(data=self.base64_image, mimeType="image/png", type="image"))
70
+
71
+ # Add URL as a special metadata text block (for Gemini Computer Use)
72
+ # Always include URL if set, even if it's a placeholder like "about:blank"
73
+ if self.url:
74
+ blocks.append(TextContent(text=f"__URL__:{self.url}", type="text"))
75
+
68
76
  return blocks
69
77
 
70
78
 
hud/types.py CHANGED
@@ -25,6 +25,7 @@ _missing_api_key_error_logged: bool = False
25
25
  class AgentType(str, Enum):
26
26
  CLAUDE = "claude"
27
27
  OPENAI = "openai"
28
+ GEMINI = "gemini"
28
29
  VLLM = "vllm"
29
30
  LITELLM = "litellm"
30
31
  INTEGRATION_TEST = "integration_test"
@@ -230,7 +231,7 @@ class AgentResponse(BaseModel):
230
231
  tool_calls: list[MCPToolCall] = Field(default_factory=list)
231
232
  done: bool = Field(default=False)
232
233
 
233
- # --- TELEMETRY [hud.so] ---
234
+ # --- TELEMETRY [hud.ai] ---
234
235
  # Responses
235
236
  content: str | None = Field(default=None)
236
237
  reasoning: str | None = Field(default=None)
@@ -5,4 +5,4 @@ def test_import():
5
5
  """Test that the package can be imported."""
6
6
  import hud
7
7
 
8
- assert hud.__version__ == "0.4.57"
8
+ assert hud.__version__ == "0.4.59"
hud/version.py CHANGED
@@ -4,4 +4,4 @@ Version information for the HUD SDK.
4
4
 
5
5
  from __future__ import annotations
6
6
 
7
- __version__ = "0.4.57"
7
+ __version__ = "0.4.59"
@@ -1,6 +1,6 @@
1
1
  Metadata-Version: 2.4
2
2
  Name: hud-python
3
- Version: 0.4.57
3
+ Version: 0.4.59
4
4
  Summary: SDK for the HUD platform.
5
5
  Project-URL: Homepage, https://github.com/hud-evals/hud-python
6
6
  Project-URL: Bug Tracker, https://github.com/hud-evals/hud-python/issues
@@ -38,6 +38,7 @@ Requires-Python: <3.13,>=3.11
38
38
  Requires-Dist: anthropic
39
39
  Requires-Dist: blessed>=1.20.0
40
40
  Requires-Dist: datasets>=2.14.0
41
+ Requires-Dist: google-genai
41
42
  Requires-Dist: httpx<1,>=0.23.0
42
43
  Requires-Dist: hud-fastmcp-python-sdk>=0.1.2
43
44
  Requires-Dist: hud-mcp-python-sdk>=3.13.2