hud-python 0.2.10__py3-none-any.whl → 0.3.1__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release.


This version of hud-python might be problematic. Click here for more details.

Files changed (86)
  1. hud/__init__.py +20 -8
  2. hud/adapters/common/adapter.py +14 -3
  3. hud/adapters/common/tests/test_adapter.py +16 -4
  4. hud/datasets.py +188 -0
  5. hud/env/docker_client.py +15 -3
  6. hud/env/environment.py +10 -7
  7. hud/env/local_docker_client.py +29 -7
  8. hud/env/remote_client.py +1 -1
  9. hud/env/remote_docker_client.py +2 -2
  10. hud/exceptions.py +2 -1
  11. hud/gym.py +0 -9
  12. hud/mcp/__init__.py +17 -0
  13. hud/mcp/base.py +631 -0
  14. hud/mcp/claude.py +321 -0
  15. hud/mcp/client.py +312 -0
  16. hud/mcp/langchain.py +250 -0
  17. hud/mcp/openai.py +334 -0
  18. hud/mcp/tests/__init__.py +1 -0
  19. hud/mcp/tests/test_base.py +512 -0
  20. hud/mcp/tests/test_claude.py +294 -0
  21. hud/mcp/tests/test_client.py +324 -0
  22. hud/mcp/tests/test_openai.py +238 -0
  23. hud/settings.py +20 -2
  24. hud/task.py +5 -88
  25. hud/taskset.py +2 -23
  26. hud/telemetry/__init__.py +16 -7
  27. hud/telemetry/_trace.py +246 -72
  28. hud/telemetry/context.py +88 -27
  29. hud/telemetry/exporter.py +171 -11
  30. hud/telemetry/instrumentation/mcp.py +174 -410
  31. hud/telemetry/job.py +141 -0
  32. hud/telemetry/mcp_models.py +13 -74
  33. hud/telemetry/tests/test_context.py +9 -6
  34. hud/telemetry/tests/test_trace.py +120 -78
  35. hud/tools/__init__.py +34 -0
  36. hud/tools/base.py +65 -0
  37. hud/tools/bash.py +137 -0
  38. hud/tools/computer/__init__.py +13 -0
  39. hud/tools/computer/anthropic.py +411 -0
  40. hud/tools/computer/hud.py +315 -0
  41. hud/tools/computer/openai.py +283 -0
  42. hud/tools/edit.py +290 -0
  43. hud/tools/executors/__init__.py +30 -0
  44. hud/tools/executors/base.py +331 -0
  45. hud/tools/executors/pyautogui.py +619 -0
  46. hud/tools/executors/tests/__init__.py +1 -0
  47. hud/tools/executors/tests/test_base_executor.py +338 -0
  48. hud/tools/executors/tests/test_pyautogui_executor.py +165 -0
  49. hud/tools/executors/xdo.py +503 -0
  50. hud/tools/helper/README.md +56 -0
  51. hud/tools/helper/__init__.py +9 -0
  52. hud/tools/helper/mcp_server.py +78 -0
  53. hud/tools/helper/server_initialization.py +115 -0
  54. hud/tools/helper/utils.py +58 -0
  55. hud/tools/playwright_tool.py +379 -0
  56. hud/tools/tests/__init__.py +3 -0
  57. hud/tools/tests/test_bash.py +152 -0
  58. hud/tools/tests/test_computer.py +52 -0
  59. hud/tools/tests/test_computer_actions.py +34 -0
  60. hud/tools/tests/test_edit.py +240 -0
  61. hud/tools/tests/test_init.py +27 -0
  62. hud/tools/tests/test_playwright_tool.py +183 -0
  63. hud/tools/tests/test_tools.py +157 -0
  64. hud/tools/tests/test_utils.py +156 -0
  65. hud/tools/utils.py +50 -0
  66. hud/trajectory.py +5 -1
  67. hud/types.py +10 -1
  68. hud/utils/tests/test_init.py +21 -0
  69. hud/utils/tests/test_version.py +1 -1
  70. hud/version.py +1 -1
  71. {hud_python-0.2.10.dist-info → hud_python-0.3.1.dist-info}/METADATA +27 -18
  72. hud_python-0.3.1.dist-info/RECORD +119 -0
  73. hud/evaluators/__init__.py +0 -9
  74. hud/evaluators/base.py +0 -32
  75. hud/evaluators/inspect.py +0 -24
  76. hud/evaluators/judge.py +0 -189
  77. hud/evaluators/match.py +0 -156
  78. hud/evaluators/remote.py +0 -65
  79. hud/evaluators/tests/__init__.py +0 -0
  80. hud/evaluators/tests/test_inspect.py +0 -12
  81. hud/evaluators/tests/test_judge.py +0 -231
  82. hud/evaluators/tests/test_match.py +0 -115
  83. hud/evaluators/tests/test_remote.py +0 -98
  84. hud_python-0.2.10.dist-info/RECORD +0 -85
  85. {hud_python-0.2.10.dist-info → hud_python-0.3.1.dist-info}/WHEEL +0 -0
  86. {hud_python-0.2.10.dist-info → hud_python-0.3.1.dist-info}/licenses/LICENSE +0 -0
@@ -0,0 +1,315 @@
1
+ # flake8: noqa: B008
2
+ from __future__ import annotations
3
+
4
+ import logging
5
+ import platform
6
+ from typing import Literal
7
+
8
+ from mcp import ErrorData, McpError
9
+ from mcp.types import INVALID_PARAMS, ImageContent, TextContent
10
+ from pydantic import Field
11
+
12
+ from hud.tools.base import ToolError, ToolResult, tool_result_to_content_blocks
13
+ from hud.tools.executors.base import BaseExecutor
14
+ from hud.tools.executors.pyautogui import PyAutoGUIExecutor
15
+ from hud.tools.executors.xdo import XDOExecutor
16
+
17
+ logger = logging.getLogger(__name__)  # module-level logger, named after this module
18
+
19
+ BASE_SCREEN_WIDTH = 1920  # reference width: HudComputerTool divides its target width by this to get scale_x, and uses it as the default width
20
+ BASE_SCREEN_HEIGHT = 1080  # reference height: HudComputerTool divides its target height by this to get scale_y, and uses it as the default height
21
+
22
+
23
class HudComputerTool:
    """
    A tool that allows the agent to control the computer.

    Actions are dispatched by name to an executor that is selected once, at
    construction time. Coordinates received by the tool are expressed in a
    ``width`` x ``height`` target space and are converted to the
    ``BASE_SCREEN_WIDTH`` x ``BASE_SCREEN_HEIGHT`` space before they are handed
    to the executor.
    """

    def __init__(
        self,
        width: int | None = None,
        height: int | None = None,
        display_num: int | None = None,
        platform_type: Literal["auto", "xdo", "pyautogui"] = "auto",
        rescale_images: bool = False,
    ) -> None:
        """
        Initialize the HUD computer tool.

        Args:
            width: Target width for rescaling (None or 0 = BASE_SCREEN_WIDTH)
            height: Target height for rescaling (None or 0 = BASE_SCREEN_HEIGHT)
            display_num: X display number
            platform_type: Which executor to use:
                - "auto": Automatically detect based on platform
                - "xdo": Use XDOExecutor (Linux/X11 only)
                - "pyautogui": Use PyAutoGUIExecutor (cross-platform)
            rescale_images: If True, rescale screenshots. If False, only rescale action coordinates

        Raises:
            ValueError: If platform_type is not one of the supported values.
        """
        # Use provided dimensions or defaults
        self.width = width or BASE_SCREEN_WIDTH
        self.height = height or BASE_SCREEN_HEIGHT
        self.rescale_images = rescale_images

        logger.info("Width: %s, Height: %s", self.width, self.height)
        logger.info(
            "Base Screen Width: %s, Base Screen Height: %s",
            BASE_SCREEN_WIDTH,
            BASE_SCREEN_HEIGHT,
        )

        # Calculate scaling factors from base screen size to target size
        self.scale_x = self.width / BASE_SCREEN_WIDTH
        self.scale_y = self.height / BASE_SCREEN_HEIGHT

        logger.info("Scale X: %s, Scale Y: %s", self.scale_x, self.scale_y)
        self.scale = min(self.scale_x, self.scale_y)

        logger.info("Scaling factor: %s", self.scale)

        # Scaling is needed as soon as EITHER axis differs from the base size.
        # Testing only min(scale_x, scale_y) would miss targets where one axis
        # is unchanged and the other is larger (e.g. 1920x2160 -> min == 1.0).
        self.needs_scaling = self.scale_x != 1.0 or self.scale_y != 1.0

        # Choose executor based on platform_type
        if platform_type == "auto":
            # Auto-detect based on platform
            system = platform.system().lower()
            if system == "linux":
                # Try XDO first on Linux
                if XDOExecutor.is_available():
                    self.executor = XDOExecutor(display_num=display_num)
                    logger.info("Using XDOExecutor")
                elif PyAutoGUIExecutor.is_available():
                    self.executor = PyAutoGUIExecutor(display_num=display_num)
                    logger.info("Using PyAutoGUIExecutor")
                else:
                    self.executor = BaseExecutor(display_num=display_num)
                    logger.info("No display available, using BaseExecutor (simulation mode)")
            else:
                # Windows/macOS - try PyAutoGUI
                if PyAutoGUIExecutor.is_available():
                    self.executor = PyAutoGUIExecutor(display_num=display_num)
                    logger.info("Using PyAutoGUIExecutor")
                else:
                    self.executor = BaseExecutor(display_num=display_num)
                    logger.info("PyAutoGUI not available, using BaseExecutor (simulation mode)")

        elif platform_type == "xdo":
            if XDOExecutor.is_available():
                self.executor = XDOExecutor(display_num=display_num)
                logger.info("Using XDOExecutor")
            else:
                self.executor = BaseExecutor(display_num=display_num)
                logger.warning("XDO not available, using BaseExecutor (simulation mode)")

        elif platform_type == "pyautogui":
            if PyAutoGUIExecutor.is_available():
                self.executor = PyAutoGUIExecutor(display_num=display_num)
                logger.info("Using PyAutoGUIExecutor")
            else:
                self.executor = BaseExecutor(display_num=display_num)
                logger.warning("PyAutoGUI not available, using BaseExecutor (simulation mode)")
        else:
            raise ValueError(f"Invalid platform_type: {platform_type}")

    def _scale_coordinates(self, x: int | None, y: int | None) -> tuple[int | None, int | None]:
        """
        Scale coordinates from target space to screen space.

        Each non-None component is divided by its axis scale factor and
        truncated with ``int()``; None components are passed through unchanged.
        """
        if x is not None:
            x = int(x / self.scale_x)
        if y is not None:
            y = int(y / self.scale_y)

        return x, y

    def _scale_path(self, path: list[tuple[int, int]]) -> list[tuple[int, int]]:
        """
        Scale a path from target space to screen space.

        Points for which either scaled component is None are left out of the
        returned list.
        """
        scaled_points = (self._scale_coordinates(x, y) for x, y in path)
        return [
            (scaled_x, scaled_y)
            for scaled_x, scaled_y in scaled_points
            if scaled_x is not None and scaled_y is not None
        ]

    async def _rescale_screenshot(self, screenshot_base64: str) -> str:
        """
        Rescale a screenshot if rescale_images is True.

        Args:
            screenshot_base64: Base64-encoded image data.

        Returns:
            The image resized to ``(self.width, self.height)`` and re-encoded as
            base64 PNG. The input is returned unchanged when rescaling is
            disabled, not needed, or fails for any reason.
        """
        if not self.rescale_images or not self.needs_scaling:
            return screenshot_base64

        try:
            # Imported lazily and inside the try block so that a missing PIL
            # degrades to "return the original image" instead of raising.
            import base64
            from io import BytesIO

            from PIL import Image

            # Decode base64 to image
            image_data = base64.b64decode(screenshot_base64)
            image = Image.open(BytesIO(image_data))

            # Resize to exact target dimensions
            resized = image.resize((self.width, self.height), Image.Resampling.LANCZOS)

            # Convert back to base64
            buffer = BytesIO()
            resized.save(buffer, format="PNG")
            return base64.b64encode(buffer.getvalue()).decode("utf-8")
        except Exception as e:
            # Deliberate best-effort: an unscaled screenshot beats no screenshot.
            logger.warning("Failed to rescale screenshot: %s", e)
            return screenshot_base64

    async def __call__(
        self,
        action: str = Field(..., description="The action name (click, type, move, etc.)"),
        # Click parameters
        x: int | None = Field(None, description="X coordinate for click/move/scroll actions"),
        y: int | None = Field(None, description="Y coordinate for click/move/scroll actions"),
        button: Literal["left", "right", "middle", "back", "forward"] | None = Field(
            None, description="Mouse button for click actions"
        ),
        pattern: list[int] | None = Field(
            None, description="Click pattern for multi-clicks (e.g., [100] for double-click)"
        ),
        # Key/Type parameters
        text: str | None = Field(None, description="Text for type/response actions"),
        keys: list[str] | None = Field(None, description="Keys for press/keydown/keyup actions"),
        enter_after: bool | None = Field(None, description="Whether to press Enter after typing"),
        # Scroll parameters
        scroll_x: int | None = Field(
            None, description="Horizontal scroll amount (positive = right)"
        ),
        scroll_y: int | None = Field(None, description="Vertical scroll amount (positive = down)"),
        # Move parameters
        offset_x: int | None = Field(None, description="X offset for relative move"),
        offset_y: int | None = Field(None, description="Y offset for relative move"),
        # Drag parameters
        path: list[tuple[int, int]] | None = Field(
            None, description="Path for drag actions as list of (x, y) coordinates"
        ),
        # Wait parameter
        time: int | None = Field(None, description="Time in milliseconds for wait action"),
        # General parameters
        hold_keys: list[str] | None = Field(None, description="Keys to hold during action"),
        # hold_key specific
        duration: float | None = Field(None, description="Duration in seconds for hold_key action"),
    ) -> list[ImageContent | TextContent]:
        """
        Execute a computer control action by name.

        Coordinates (``x``/``y``, ``offset_x``/``offset_y`` and ``path``) are
        scaled to screen space before the executor is called. The ``response``
        action bypasses the executor and returns ``text`` directly.

        Returns:
            List of MCP content blocks

        Raises:
            ToolError: If a parameter required by the chosen action is missing.
            McpError: With code INVALID_PARAMS if the action name is unknown or
                a TypeError is raised while handling the action.
        """
        logger.info("HudComputerTool executing action: %s", action)

        try:
            # Delegate to executor based on action
            if action == "click":
                # Scale coordinates from client space to screen space
                scaled_x, scaled_y = self._scale_coordinates(x, y)
                result = await self.executor.click(
                    x=scaled_x,
                    y=scaled_y,
                    button=button or "left",
                    pattern=pattern,
                    hold_keys=hold_keys,
                )

            elif action == "press":
                if keys is None:
                    raise ToolError("keys parameter is required for press")
                result = await self.executor.press(keys=keys)

            elif action == "keydown":
                if keys is None:
                    raise ToolError("keys parameter is required for keydown")
                result = await self.executor.keydown(keys=keys)

            elif action == "keyup":
                if keys is None:
                    raise ToolError("keys parameter is required for keyup")
                result = await self.executor.keyup(keys=keys)

            elif action == "type":
                if text is None:
                    raise ToolError("text parameter is required for type")
                result = await self.executor.type(text=text, enter_after=enter_after or False)

            elif action == "scroll":
                # Scale coordinates from client space to screen space
                scaled_x, scaled_y = self._scale_coordinates(x, y)
                result = await self.executor.scroll(
                    x=scaled_x,
                    y=scaled_y,
                    scroll_x=scroll_x,
                    scroll_y=scroll_y,
                    hold_keys=hold_keys,
                )

            elif action == "move":
                # Scale coordinates from client space to screen space
                scaled_x, scaled_y = self._scale_coordinates(x, y)
                scaled_offset_x, scaled_offset_y = self._scale_coordinates(offset_x, offset_y)
                result = await self.executor.move(
                    x=scaled_x, y=scaled_y, offset_x=scaled_offset_x, offset_y=scaled_offset_y
                )

            elif action == "wait":
                if time is None:
                    raise ToolError("time parameter is required for wait")
                result = await self.executor.wait(time=time)

            elif action == "drag":
                if path is None:
                    raise ToolError("path parameter is required for drag")
                # Scale path from client space to screen space
                scaled_path = self._scale_path(path)
                result = await self.executor.drag(
                    path=scaled_path, pattern=pattern, hold_keys=hold_keys
                )

            elif action == "response":
                if text is None:
                    raise ToolError("text parameter is required for response")
                return [TextContent(text=text, type="text")]

            elif action == "screenshot":
                screenshot = await self.executor.screenshot()
                if screenshot:
                    # Keep the raw image here: the shared post-processing step
                    # below rescales it exactly once.
                    result = ToolResult(base64_image=screenshot)
                else:
                    result = ToolResult(error="Failed to take screenshot")

            elif action == "position":
                result = await self.executor.position()

            elif action == "hold_key":
                if text is None:
                    raise ToolError("text parameter is required for hold_key")
                if duration is None:
                    raise ToolError("duration parameter is required for hold_key")
                result = await self.executor.hold_key(key=text, duration=duration)

            elif action == "mouse_down":
                result = await self.executor.mouse_down(button=button or "left")

            elif action == "mouse_up":
                result = await self.executor.mouse_up(button=button or "left")

            else:
                raise McpError(ErrorData(code=INVALID_PARAMS, message=f"Unknown action: {action}"))

            # Rescale screenshot in result if present
            if isinstance(result, ToolResult) and result.base64_image and self.rescale_images:
                rescaled_image = await self._rescale_screenshot(result.base64_image)
                result = result.replace(base64_image=rescaled_image)

            # Convert result to content blocks
            return tool_result_to_content_blocks(result)

        except TypeError as e:
            raise McpError(
                ErrorData(code=INVALID_PARAMS, message=f"Invalid parameters for {action}: {e!s}")
            ) from e
@@ -0,0 +1,283 @@
1
+ # flake8: noqa: B008
2
+ from __future__ import annotations
3
+
4
+ import logging
5
+ from typing import Literal, cast
6
+
7
+ from mcp import ErrorData, McpError
8
+ from mcp.types import INTERNAL_ERROR, INVALID_PARAMS, ImageContent, TextContent
9
+ from pydantic import Field
10
+
11
+ from hud.tools.base import ToolResult, tool_result_to_content_blocks
12
+
13
+ from .hud import HudComputerTool
14
+
15
+ logger = logging.getLogger(__name__)  # module-level logger, named after this module
16
+
17
+ # Map OpenAI key names (lowercase) to CLA standard keys; names missing from this table are passed through lowercased by _map_openai_key_to_cla
18
+ OPENAI_TO_CLA_KEYS = {
19
+ # Common variations
20
+ "return": "enter",
21
+ "escape": "escape",  # identity entries like this one match the lowercase fallback; listed explicitly
22
+ "arrowup": "up",
23
+ "arrowdown": "down",
24
+ "arrowleft": "left",
25
+ "arrowright": "right",
26
+ "backspace": "backspace",
27
+ "delete": "delete",
28
+ "tab": "tab",
29
+ "space": "space",
30
+ "control": "ctrl",
31
+ "alt": "alt",
32
+ "shift": "shift",
33
+ "meta": "win",  # "meta" and "super" both map to "win"
34
+ "cmd": "cmd",
35
+ "command": "cmd",  # "cmd" and "command" both map to "cmd"
36
+ "super": "win",
37
+ "pageup": "pageup",
38
+ "pagedown": "pagedown",
39
+ "home": "home",
40
+ "end": "end",
41
+ "insert": "insert",
42
+ }
43
+
44
+
45
class OpenAIComputerTool(HudComputerTool):
    """
    OpenAI Computer Use tool for interacting with the computer.

    Translates OpenAI-style action calls (``type``, ``ms``, ``path`` as a list
    of ``{x, y}`` dicts, ...) into calls on the executor inherited from
    HudComputerTool.
    """

    # Action types whose result is supplemented with a fresh screenshot when
    # the executor result does not already carry an image.
    _FOLLOW_UP_SCREENSHOT_ACTIONS = frozenset(
        {
            "click",
            "double_click",
            "scroll",
            "type",
            "move",
            "keypress",
            "drag",
            "wait",
        }
    )

    def __init__(
        self,
        width: int = 1024,
        height: int = 768,
        display_num: int | None = None,
        platform_type: Literal["auto", "xdo", "pyautogui"] = "auto",
        rescale_images: bool = False,
    ) -> None:
        """
        Initialize with OpenAI's default dimensions.

        Args:
            width: Target width for rescaling (default: 1024 for OpenAI)
            height: Target height for rescaling (default: 768 for OpenAI)
            display_num: X display number
            platform_type: Which executor to use:
                - "auto": Automatically detect based on platform
                - "xdo": Use XDOExecutor (Linux/X11 only)
                - "pyautogui": Use PyAutoGUIExecutor (cross-platform)
            rescale_images: If True, rescale screenshots. If False, only rescale action coordinates
        """
        super().__init__(
            width=width,
            height=height,
            display_num=display_num,
            platform_type=platform_type,
            rescale_images=rescale_images,
        )

    def _map_openai_key_to_cla(self, key: str) -> str:
        """
        Map OpenAI key name to CLA standard key.

        The lookup is case-insensitive; a key that is not in
        OPENAI_TO_CLA_KEYS is returned lowercased.
        """
        lowered = key.lower()
        return OPENAI_TO_CLA_KEYS.get(lowered, lowered)

    async def __call__(
        self,
        *,
        type: str = Field(..., description="The action type to perform"),
        # Coordinate parameters
        x: int | None = Field(None, description="X coordinate for click/move/scroll actions"),
        y: int | None = Field(None, description="Y coordinate for click/move/scroll actions"),
        # Button parameter
        button: str | None = Field(
            None, description="Mouse button for click actions (left, right, middle, wheel)"
        ),
        # Text parameter
        text: str | None = Field(None, description="Text to type or response text"),
        # Scroll parameters
        scroll_x: int | None = Field(None, description="Horizontal scroll amount"),
        scroll_y: int | None = Field(None, description="Vertical scroll amount"),
        # Wait parameter
        ms: int | None = Field(None, description="Time to wait in milliseconds"),
        # Key press parameter
        keys: list[str] | None = Field(None, description="Keys to press"),
        # Drag parameter
        path: list[dict[str, int]] | None = Field(
            None, description="Path for drag actions as list of {x, y} dicts"
        ),
        # Custom action parameter
        action: str | None = Field(None, description="Custom action name"),
    ) -> list[ImageContent | TextContent]:
        """
        Handle OpenAI Computer Use API calls.

        This converts OpenAI's action format (based on OperatorAdapter) to HudComputerTool's format.

        For every action type except ``screenshot`` and ``response``, a
        screenshot is taken afterwards and attached to the result when the
        executor result has no image of its own.

        Returns:
            List of MCP content blocks

        Raises:
            McpError: With code INVALID_PARAMS when a parameter required by the
                action type is missing or malformed, or when ``type`` is
                "custom"; with code INTERNAL_ERROR when ``type`` is not a
                recognised action type.
        """
        logger.info("OpenAIComputerTool received type: %s", type)

        # Map button names
        button_map = {"wheel": "middle"}
        if button:
            button = button_map.get(button, button)

        # Process based on action type
        if type == "screenshot":
            screenshot_base64 = await self.executor.screenshot()
            if screenshot_base64:
                # Keep the raw image here: the shared rescale step below
                # processes it exactly once.
                result = ToolResult(base64_image=screenshot_base64)
            else:
                result = ToolResult(error="Failed to take screenshot")

        elif type == "click":
            if x is None or y is None:
                raise McpError(
                    ErrorData(code=INVALID_PARAMS, message="x and y coordinates required for click")
                )
            # Cast button to proper literal type
            button_literal = cast(
                "Literal['left', 'right', 'middle', 'back', 'forward']", button or "left"
            )
            scaled_x, scaled_y = self._scale_coordinates(x, y)
            logger.info("Scaled coordinates: %s, %s", scaled_x, scaled_y)
            result = await self.executor.click(x=scaled_x, y=scaled_y, button=button_literal)

        elif type == "double_click":
            if x is None or y is None:
                raise McpError(
                    ErrorData(
                        code=INVALID_PARAMS, message="x and y coordinates required for double_click"
                    )
                )
            # Use pattern for double-click
            scaled_x, scaled_y = self._scale_coordinates(x, y)
            result = await self.executor.click(
                x=scaled_x, y=scaled_y, button="left", pattern=[100]
            )

        elif type == "scroll":
            if x is None or y is None:
                raise McpError(
                    ErrorData(
                        code=INVALID_PARAMS, message="x and y coordinates required for scroll"
                    )
                )

            # scroll_x and scroll_y default to 0 if not provided
            scaled_x, scaled_y = self._scale_coordinates(x, y)
            result = await self.executor.scroll(
                x=scaled_x, y=scaled_y, scroll_x=scroll_x or 0, scroll_y=scroll_y or 0
            )

        elif type == "type":
            if text is None:
                raise McpError(ErrorData(code=INVALID_PARAMS, message="text is required for type"))
            result = await self.executor.type(text=text, enter_after=False)

        elif type == "wait":
            # Default to 1 second. NOTE: because of `or`, an explicit ms=0 is
            # also treated as "not provided" and waits 1000 ms.
            wait_time = ms or 1000
            result = await self.executor.wait(time=wait_time)

        elif type == "move":
            if x is None or y is None:
                raise McpError(
                    ErrorData(code=INVALID_PARAMS, message="x and y coordinates required for move")
                )
            scaled_x, scaled_y = self._scale_coordinates(x, y)
            result = await self.executor.move(x=scaled_x, y=scaled_y)

        elif type == "keypress":
            if not keys:
                raise McpError(
                    ErrorData(code=INVALID_PARAMS, message="keys is required for keypress")
                )

            # Map OpenAI keys to CLA standard
            cla_keys = [self._map_openai_key_to_cla(key) for key in keys]
            result = await self.executor.press(keys=cla_keys)

        elif type == "drag":
            if path is None or len(path) < 2:
                raise McpError(
                    ErrorData(
                        code=INVALID_PARAMS, message="path with at least 2 points required for drag"
                    )
                )

            # Convert path from list of dicts to list of tuples
            drag_path = []
            for point in path:
                if "x" not in point or "y" not in point:
                    raise McpError(
                        ErrorData(
                            code=INVALID_PARAMS, message="Each point in path must have x and y"
                        )
                    )
                drag_path.append((point["x"], point["y"]))

            scaled_path = self._scale_path(drag_path)
            result = await self.executor.drag(path=scaled_path)

        elif type == "response":
            if text is None:
                raise McpError(
                    ErrorData(code=INVALID_PARAMS, message="text is required for response")
                )
            # Response returns content blocks directly
            return [TextContent(text=text, type="text")]

        elif type == "custom":
            # For custom actions, we just return an error since HudComputerTool doesn't support them
            raise McpError(
                ErrorData(code=INVALID_PARAMS, message=f"Custom action not supported: {action}")
            )

        else:
            # NOTE(review): HudComputerTool reports unknown actions with
            # INVALID_PARAMS; this code is kept as-is for existing callers.
            raise McpError(ErrorData(code=INTERNAL_ERROR, message=f"Invalid action type: {type}"))

        # Rescale screenshot in result if present
        if isinstance(result, ToolResult) and result.base64_image and self.rescale_images:
            rescaled_image = await self._rescale_screenshot(result.base64_image)
            result = result.replace(base64_image=rescaled_image)

        # Attach a follow-up screenshot for actions that did not produce one
        if (
            type in self._FOLLOW_UP_SCREENSHOT_ACTIONS
            and isinstance(result, ToolResult)
            and not result.base64_image
        ):
            screenshot_base64 = await self.executor.screenshot()
            if screenshot_base64:
                # Rescale screenshot if requested
                screenshot_base64 = await self._rescale_screenshot(screenshot_base64)
                result = ToolResult(
                    output=result.output, error=result.error, base64_image=screenshot_base64
                )

        # Convert to content blocks
        return tool_result_to_content_blocks(result)