hud-python 0.2.10__py3-none-any.whl → 0.3.1__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release.


This version of hud-python might be problematic. Click here for more details.

Files changed (86) hide show
  1. hud/__init__.py +20 -8
  2. hud/adapters/common/adapter.py +14 -3
  3. hud/adapters/common/tests/test_adapter.py +16 -4
  4. hud/datasets.py +188 -0
  5. hud/env/docker_client.py +15 -3
  6. hud/env/environment.py +10 -7
  7. hud/env/local_docker_client.py +29 -7
  8. hud/env/remote_client.py +1 -1
  9. hud/env/remote_docker_client.py +2 -2
  10. hud/exceptions.py +2 -1
  11. hud/gym.py +0 -9
  12. hud/mcp/__init__.py +17 -0
  13. hud/mcp/base.py +631 -0
  14. hud/mcp/claude.py +321 -0
  15. hud/mcp/client.py +312 -0
  16. hud/mcp/langchain.py +250 -0
  17. hud/mcp/openai.py +334 -0
  18. hud/mcp/tests/__init__.py +1 -0
  19. hud/mcp/tests/test_base.py +512 -0
  20. hud/mcp/tests/test_claude.py +294 -0
  21. hud/mcp/tests/test_client.py +324 -0
  22. hud/mcp/tests/test_openai.py +238 -0
  23. hud/settings.py +20 -2
  24. hud/task.py +5 -88
  25. hud/taskset.py +2 -23
  26. hud/telemetry/__init__.py +16 -7
  27. hud/telemetry/_trace.py +246 -72
  28. hud/telemetry/context.py +88 -27
  29. hud/telemetry/exporter.py +171 -11
  30. hud/telemetry/instrumentation/mcp.py +174 -410
  31. hud/telemetry/job.py +141 -0
  32. hud/telemetry/mcp_models.py +13 -74
  33. hud/telemetry/tests/test_context.py +9 -6
  34. hud/telemetry/tests/test_trace.py +120 -78
  35. hud/tools/__init__.py +34 -0
  36. hud/tools/base.py +65 -0
  37. hud/tools/bash.py +137 -0
  38. hud/tools/computer/__init__.py +13 -0
  39. hud/tools/computer/anthropic.py +411 -0
  40. hud/tools/computer/hud.py +315 -0
  41. hud/tools/computer/openai.py +283 -0
  42. hud/tools/edit.py +290 -0
  43. hud/tools/executors/__init__.py +30 -0
  44. hud/tools/executors/base.py +331 -0
  45. hud/tools/executors/pyautogui.py +619 -0
  46. hud/tools/executors/tests/__init__.py +1 -0
  47. hud/tools/executors/tests/test_base_executor.py +338 -0
  48. hud/tools/executors/tests/test_pyautogui_executor.py +165 -0
  49. hud/tools/executors/xdo.py +503 -0
  50. hud/tools/helper/README.md +56 -0
  51. hud/tools/helper/__init__.py +9 -0
  52. hud/tools/helper/mcp_server.py +78 -0
  53. hud/tools/helper/server_initialization.py +115 -0
  54. hud/tools/helper/utils.py +58 -0
  55. hud/tools/playwright_tool.py +379 -0
  56. hud/tools/tests/__init__.py +3 -0
  57. hud/tools/tests/test_bash.py +152 -0
  58. hud/tools/tests/test_computer.py +52 -0
  59. hud/tools/tests/test_computer_actions.py +34 -0
  60. hud/tools/tests/test_edit.py +240 -0
  61. hud/tools/tests/test_init.py +27 -0
  62. hud/tools/tests/test_playwright_tool.py +183 -0
  63. hud/tools/tests/test_tools.py +157 -0
  64. hud/tools/tests/test_utils.py +156 -0
  65. hud/tools/utils.py +50 -0
  66. hud/trajectory.py +5 -1
  67. hud/types.py +10 -1
  68. hud/utils/tests/test_init.py +21 -0
  69. hud/utils/tests/test_version.py +1 -1
  70. hud/version.py +1 -1
  71. {hud_python-0.2.10.dist-info → hud_python-0.3.1.dist-info}/METADATA +27 -18
  72. hud_python-0.3.1.dist-info/RECORD +119 -0
  73. hud/evaluators/__init__.py +0 -9
  74. hud/evaluators/base.py +0 -32
  75. hud/evaluators/inspect.py +0 -24
  76. hud/evaluators/judge.py +0 -189
  77. hud/evaluators/match.py +0 -156
  78. hud/evaluators/remote.py +0 -65
  79. hud/evaluators/tests/__init__.py +0 -0
  80. hud/evaluators/tests/test_inspect.py +0 -12
  81. hud/evaluators/tests/test_judge.py +0 -231
  82. hud/evaluators/tests/test_match.py +0 -115
  83. hud/evaluators/tests/test_remote.py +0 -98
  84. hud_python-0.2.10.dist-info/RECORD +0 -85
  85. {hud_python-0.2.10.dist-info → hud_python-0.3.1.dist-info}/WHEEL +0 -0
  86. {hud_python-0.2.10.dist-info → hud_python-0.3.1.dist-info}/licenses/LICENSE +0 -0
hud/tools/__init__.py ADDED
@@ -0,0 +1,34 @@
1
+ """HUD tools for computer control, file editing, and bash commands."""
2
+
3
+ from __future__ import annotations
4
+
5
+ from typing import TYPE_CHECKING, Any
6
+
7
+ from .base import ToolError, ToolResult, tool_result_to_content_blocks
8
+ from .bash import BashTool
9
+ from .edit import EditTool
10
+ from .playwright_tool import PlaywrightTool
11
+
12
+ if TYPE_CHECKING:
13
+ from .computer import AnthropicComputerTool, HudComputerTool, OpenAIComputerTool
14
+
15
+ __all__ = [
16
+ "AnthropicComputerTool",
17
+ "BashTool",
18
+ "EditTool",
19
+ "HudComputerTool",
20
+ "OpenAIComputerTool",
21
+ "PlaywrightTool",
22
+ "ToolError",
23
+ "ToolResult",
24
+ "tool_result_to_content_blocks",
25
+ ]
26
+
27
+
28
def __getattr__(name: str) -> Any:
    """Resolve the computer tool classes on first attribute access.

    The computer tools are deliberately not imported at package import time so
    that pyautogui is only loaded when one of them is actually requested.

    Args:
        name: Attribute being looked up on this module.

    Returns:
        The requested class from the ``computer`` subpackage.

    Raises:
        AttributeError: If ``name`` is not one of the lazily exported classes.
    """
    lazy_exports = ("AnthropicComputerTool", "HudComputerTool", "OpenAIComputerTool")
    if name not in lazy_exports:
        raise AttributeError(f"module '{__name__}' has no attribute '{name}'")

    # Imported here, not at module level, to keep the package import cheap.
    from . import computer

    return getattr(computer, name)
hud/tools/base.py ADDED
@@ -0,0 +1,65 @@
1
+ from __future__ import annotations
2
+
3
+ from dataclasses import dataclass, fields, replace
4
+ from typing import Any
5
+
6
+ from mcp.types import ImageContent, TextContent
7
+
8
+
9
+ @dataclass(kw_only=True, frozen=True)
10
+ class ToolResult:
11
+ """Represents the result of a tool execution."""
12
+
13
+ output: str | None = None
14
+ error: str | None = None
15
+ base64_image: str | None = None
16
+ system: str | None = None
17
+
18
+ def __bool__(self) -> bool:
19
+ return any(getattr(self, field.name) for field in fields(self))
20
+
21
+ def __add__(self, other: ToolResult) -> ToolResult:
22
+ def combine_fields(
23
+ field: str | None, other_field: str | None, concatenate: bool = True
24
+ ) -> str | None:
25
+ if field and other_field:
26
+ if concatenate:
27
+ return field + other_field
28
+ raise ValueError("Cannot combine tool results")
29
+ return field or other_field
30
+
31
+ return ToolResult(
32
+ output=combine_fields(self.output, other.output),
33
+ error=combine_fields(self.error, other.error),
34
+ base64_image=combine_fields(self.base64_image, other.base64_image, False),
35
+ system=combine_fields(self.system, other.system),
36
+ )
37
+
38
+ def replace(self, **kwargs: Any) -> ToolResult:
39
+ """Returns a new ToolResult with the given fields replaced."""
40
+ return replace(self, **kwargs)
41
+
42
+
43
+ # Legacy alias for backward compatibility
44
+ CLIResult = ToolResult
45
+
46
+
47
class ToolError(Exception):
    """Exception type that tools raise to report a failure."""


# Old name retained so existing imports of ``CLIError`` keep working.
CLIError = ToolError
53
+
54
+
55
def tool_result_to_content_blocks(result: ToolResult) -> list[ImageContent | TextContent]:
    """Translate a ToolResult into a list of MCP content blocks.

    Blocks are emitted in a fixed order: ``output`` text, ``error`` text, then
    the image (always tagged ``image/png``). Fields that are empty or ``None``
    are skipped. The ``system`` field is not emitted.
    """
    text_blocks = [
        TextContent(text=text, type="text") for text in (result.output, result.error) if text
    ]
    image_blocks = (
        [ImageContent(data=result.base64_image, mimeType="image/png", type="image")]
        if result.base64_image
        else []
    )
    return text_blocks + image_blocks
hud/tools/bash.py ADDED
@@ -0,0 +1,137 @@
1
+ from __future__ import annotations
2
+
3
+ import asyncio
4
+ import os
5
+ import sys
6
+ from typing import Any
7
+
8
+ from .base import CLIResult, ToolError, ToolResult
9
+
10
+
11
class _BashSession:
    """A long-lived bash process that commands are piped into one at a time.

    Each command is written to the shell's stdin followed by an ``echo`` of a
    sentinel string; everything the shell prints on stdout up to that sentinel
    is returned as the command's output.
    """

    _started: bool
    _process: asyncio.subprocess.Process

    command: str = "/bin/bash"
    _output_delay: float = 0.2  # seconds
    _timeout: float = 120.0  # seconds
    _sentinel: str = "<<exit>>"

    def __init__(self) -> None:
        self._started = False
        self._timed_out = False

    async def start(self) -> None:
        """Spawn the shell process. Calling it again once started is a no-op."""
        if self._started:
            await asyncio.sleep(0)
            return

        # Platform-specific subprocess creation
        kwargs: dict[str, Any] = {
            "shell": True,
            "bufsize": 0,
            "stdin": asyncio.subprocess.PIPE,
            "stdout": asyncio.subprocess.PIPE,
            "stderr": asyncio.subprocess.PIPE,
        }

        # Only use setsid on Unix-like systems
        if sys.platform != "win32":
            kwargs["preexec_fn"] = os.setsid

        self._process = await asyncio.create_subprocess_shell(self.command, **kwargs)

        self._started = True

    def stop(self) -> None:
        """Terminate the bash shell.

        Raises:
            ToolError: If the session was never started.
        """
        if not self._started:
            raise ToolError("Session has not started.")
        if self._process.returncode is not None:
            # Already exited; nothing to terminate.
            return
        self._process.terminate()

    async def run(self, command: str) -> CLIResult:
        """Execute a command in the bash shell and collect its output.

        Args:
            command: Shell command line to run.

        Returns:
            A result carrying the command's stdout and whatever stderr was
            available right after it finished. If the shell has exited, or
            exits while the command runs, the result instead carries a
            ``system`` message asking for a restart.

        Raises:
            ToolError: If the session is not started, a pipe is missing, or
                the command did not finish within ``_timeout`` seconds (also
                raised on every later call until the session is replaced).
        """
        if not self._started:
            raise ToolError("Session has not started.")
        if self._process.returncode is not None:
            await asyncio.sleep(0)
            return ToolResult(
                system="tool must be restarted",
                error=f"bash has exited with returncode {self._process.returncode}",
            )
        if self._timed_out:
            raise ToolError(
                f"timed out: bash did not return in {self._timeout} seconds and must be restarted",
            ) from None

        stdin = self._process.stdin
        stdout = self._process.stdout
        stderr = self._process.stderr
        if stdin is None:
            raise ToolError("stdin is None")
        if stdout is None:
            raise ToolError("stdout is None")
        if stderr is None:
            raise ToolError("stderr is None")

        # Send command to the process, followed by the sentinel marker
        stdin.write(command.encode() + f"; echo '{self._sentinel}'\n".encode())
        await stdin.drain()

        # Read output from the process, until the sentinel is found
        sentinel_bytes = f"{self._sentinel}\n".encode()

        try:
            raw_out: bytes = await asyncio.wait_for(
                stdout.readuntil(sentinel_bytes),
                timeout=self._timeout,
            )
        except (asyncio.TimeoutError, asyncio.LimitOverrunError):
            # asyncio.TimeoutError is only an alias of the builtin TimeoutError
            # from Python 3.11 on; naming it explicitly also covers 3.10.
            # LimitOverrunError (output larger than the stream buffer) leaves
            # the stream mid-command, so it is handled like a timeout.
            self._timed_out = True
            raise ToolError(
                f"timed out: bash did not return in {self._timeout} seconds and must be restarted",
            ) from None
        except asyncio.IncompleteReadError as exc:
            # stdout reached EOF before the sentinel, e.g. the command was
            # `exit`. Report it the same way as an already-exited shell.
            return ToolResult(
                system="tool must be restarted",
                output=exc.partial.decode(errors="replace"),
                error="bash exited before the command completed",
            )

        # Strip the sentinel while still in bytes, then decode leniently so
        # non-UTF-8 output cannot abort the call.
        output = raw_out[: -len(sentinel_bytes)].decode(errors="replace")

        # Best-effort stderr fetch. A bounded read(n) returns as soon as any
        # bytes are buffered. An unbounded read() waits for EOF, which never
        # comes while the shell is alive, so it always timed out and dropped
        # whatever it had already consumed.
        try:
            error_bytes = await asyncio.wait_for(stderr.read(65536), timeout=0.01)
        except asyncio.TimeoutError:
            error = ""
        else:
            error = error_bytes.decode(errors="replace").rstrip("\n")

        return CLIResult(output=output, error=error)
106
+
107
+
108
class BashTool:
    """
    Lets an agent run bash commands in a persistent shell session.
    The tool parameters are defined by Anthropic and are not editable.
    """

    _session: _BashSession | None

    def __init__(self) -> None:
        # The shell is created on first use, not at construction time.
        self._session = None

    async def __call__(
        self, command: str | None = None, restart: bool = False, **kwargs: Any
    ) -> ToolResult:
        """Run ``command`` in the session, or replace the session.

        Args:
            command: Shell command line to execute.
            restart: When true, stop any existing session, start a new one and
                return without running ``command``.
            **kwargs: Accepted and ignored.

        Returns:
            The command's result, or a ``system`` notice after a restart.

        Raises:
            ToolError: If neither a restart nor a command was requested.
        """
        if restart:
            previous = self._session
            if previous:
                previous.stop()
            replacement = _BashSession()
            self._session = replacement
            await replacement.start()

            return ToolResult(system="tool has been restarted.")

        session = self._session
        if session is None:
            # First call: bring the shell up before looking at the command.
            session = _BashSession()
            self._session = session
            await session.start()

        if command is None:
            raise ToolError("no command provided.")

        return await session.run(command)
hud/tools/computer/__init__.py ADDED
@@ -0,0 +1,13 @@
1
+ """Computer control tools for different agent APIs."""
2
+
3
+ from __future__ import annotations
4
+
5
+ from .anthropic import AnthropicComputerTool
6
+ from .hud import HudComputerTool
7
+ from .openai import OpenAIComputerTool
8
+
9
+ __all__ = [
10
+ "AnthropicComputerTool",
11
+ "HudComputerTool",
12
+ "OpenAIComputerTool",
13
+ ]
hud/tools/computer/anthropic.py ADDED
@@ -0,0 +1,411 @@
1
+ # flake8: noqa: B008
2
+ from __future__ import annotations
3
+
4
+ import logging
5
+ from typing import TYPE_CHECKING, Literal, cast
6
+
7
+ from mcp import ErrorData, McpError
8
+ from mcp.types import INTERNAL_ERROR, INVALID_PARAMS, ImageContent, TextContent
9
+ from pydantic import Field
10
+
11
+ from hud.tools.base import ToolResult, tool_result_to_content_blocks
12
+
13
+ from .hud import HudComputerTool
14
+
15
+ if TYPE_CHECKING:
16
+ from anthropic.types.beta import BetaToolComputerUse20250124Param
17
+
18
+ logger = logging.getLogger(__name__)
19
+
20
# Lookup table translating Anthropic key names into CLA standard keys.
ANTHROPIC_TO_CLA_KEYS = {
    # Editing and arrow keys
    "Return": "enter",
    "Escape": "escape",
    "ArrowUp": "up",
    "ArrowDown": "down",
    "ArrowLeft": "left",
    "ArrowRight": "right",
    "Backspace": "backspace",
    "Delete": "delete",
    "Tab": "tab",
    "Space": "space",
    # Modifiers
    "Control": "ctrl",
    "Alt": "alt",
    "Shift": "shift",
    "Meta": "win",  # Windows key
    "Command": "cmd",  # macOS
    "Super": "win",  # Linux
    # Paging and document navigation
    "PageUp": "pageup",
    "PageDown": "pagedown",
    "Home": "home",
    "End": "end",
    "Insert": "insert",
    # Function keys F1..F12 map to their lower-case spelling
    **{f"F{number}": f"f{number}" for number in range(1, 13)},
}
57
+
58
+
59
class AnthropicComputerTool(HudComputerTool):
    """
    Anthropic Computer Use tool for interacting with the computer.

    Translates the Anthropic action vocabulary (``left_click``, ``key``,
    ``scroll``, ...) into calls on ``self.executor`` and returns the outcome
    as MCP content blocks.
    """

    name: str = "computer"
    api_type: str = "computer_20250124"

    def __init__(
        self,
        width: int = 1400,
        height: int = 850,
        display_num: int | None = None,
        platform_type: Literal["auto", "xdo", "pyautogui"] = "auto",
        rescale_images: bool = False,
    ) -> None:
        """
        Initialize the tool; every argument is forwarded to ``HudComputerTool``.

        Args:
            width: Target width for rescaling (default: 1400)
            height: Target height for rescaling (default: 850)
            display_num: X display number
            platform_type: Which executor to use:
                - "auto": Automatically detect based on platform
                - "xdo": Use XDOExecutor (Linux/X11 only)
                - "pyautogui": Use PyAutoGUIExecutor (cross-platform)
            rescale_images: If True, rescale screenshots. If False, only rescale action coordinates
        """
        super().__init__(
            width=width,
            height=height,
            display_num=display_num,
            platform_type=platform_type,
            rescale_images=rescale_images,
        )

    def to_params(self) -> BetaToolComputerUse20250124Param:
        """Convert to Anthropic tool parameters."""
        return cast(
            "BetaToolComputerUse20250124Param",
            {
                "type": self.api_type,
                "name": self.name,
                "display_width_px": self.width,
                "display_height_px": self.height,
            },
        )

    def _map_anthropic_key_to_cla(self, key: str) -> str:
        """Map an Anthropic key name, or a ``+``-joined combo, to CLA standard keys.

        Each part is resolved by exact name first, then ignoring case, and is
        otherwise lower-cased unchanged (e.g. ``"ctrl+A"`` -> ``"ctrl+a"``).
        """

        def map_part(part: str) -> str:
            exact = ANTHROPIC_TO_CLA_KEYS.get(part)
            if exact is not None:
                return exact
            lowered = part.lower()
            # Compare lower-cased names: str.capitalize() would turn "pageup"
            # into "Pageup" and miss mixed-case entries such as "PageUp".
            for anthropic_name, cla_name in ANTHROPIC_TO_CLA_KEYS.items():
                if anthropic_name.lower() == lowered:
                    return cla_name
            return lowered

        # A key without "+" splits into a single part, so both shapes share this path.
        return "+".join(map_part(part) for part in key.split("+"))

    def _scaled_point_kwargs(self, coord: tuple | None) -> dict:
        """Return ``{"x": ..., "y": ...}`` scaled for the executor, or ``{}`` without a usable point."""
        if coord and len(coord) >= 2:
            scaled_x, scaled_y = self._scale_coordinates(coord[0], coord[1])
            return {"x": scaled_x, "y": scaled_y}
        return {}

    @staticmethod
    def _require_duration(duration: float | None, action: str) -> float:
        """Validate the ``duration`` argument of ``wait`` / ``hold_key`` (0 to 100 seconds)."""
        if duration is None:
            raise McpError(
                ErrorData(code=INVALID_PARAMS, message=f"duration is required for {action}")
            )
        if duration < 0:
            raise McpError(ErrorData(code=INVALID_PARAMS, message="duration must be non-negative"))
        if duration > 100:
            raise McpError(ErrorData(code=INVALID_PARAMS, message="duration is too long"))
        return duration

    async def __call__(
        self,
        action: str = Field(..., description="The action to perform on the computer"),
        coordinate: list[int] | tuple[int, int] | None = Field(
            None, description="The coordinate to interact with on the computer [x, y]"
        ),
        text: str | None = Field(
            None, description="The text to type on the computer or key to press"
        ),
        start_coordinate: list[int] | tuple[int, int] | None = Field(
            None, description="The starting coordinate for drag actions [x, y]"
        ),
        scroll_direction: str | None = Field(
            None, description="The direction to scroll (up, down, left, right)"
        ),
        scroll_amount: int | None = Field(None, description="The amount to scroll"),
        duration: float | None = Field(None, description="The duration of the action in seconds"),
        take_screenshot_on_click: bool = Field(
            True, description="Whether to take a screenshot after clicking"
        ),
    ) -> list[ImageContent | TextContent]:
        """
        Handle Anthropic Computer Use API calls.

        This converts Anthropic's action format to HudComputerTool's format.

        Returns:
            List of MCP content blocks

        Raises:
            McpError: With ``INVALID_PARAMS`` when an action's required argument
                is missing or out of range, and with ``INTERNAL_ERROR`` when the
                action name is not recognised.
        """
        logger.info("AnthropicComputerTool received action: %s", action)

        # Convert lists to tuples if needed
        coord_tuple = None
        if coordinate:
            coord_tuple = tuple(coordinate) if isinstance(coordinate, list) else coordinate

        start_coord_tuple = None
        if start_coordinate:
            start_coord_tuple = (
                tuple(start_coordinate) if isinstance(start_coordinate, list) else start_coordinate
            )

        # Extra executor.click() arguments for each click-style action.
        click_variants: dict[str, dict] = {
            "left_click": {},
            "click": {},
            "double_click": {"pattern": [100]},
            "triple_click": {"pattern": [100, 100]},
            "right_click": {"button": "right"},
            "middle_click": {"button": "middle"},
        }

        # Map Anthropic actions to HudComputerTool actions
        if action == "screenshot":
            screenshot_base64 = await self.executor.screenshot()
            if screenshot_base64:
                # Rescale screenshot if requested
                screenshot_base64 = await self._rescale_screenshot(screenshot_base64)
                result = ToolResult(base64_image=screenshot_base64)
            else:
                result = ToolResult(error="Failed to take screenshot")

        elif action in click_variants:
            # Without a usable coordinate the executor is called with no position.
            point = self._scaled_point_kwargs(coord_tuple)
            if point and action in ("left_click", "click"):
                logger.info("Scaled coordinates: %s, %s", point["x"], point["y"])
            result = await self.executor.click(**point, **click_variants[action])

        elif action == "mouse_move" or action == "move":
            point = self._scaled_point_kwargs(coord_tuple)
            if not point:
                raise McpError(
                    ErrorData(code=INVALID_PARAMS, message="coordinate is required for mouse_move")
                )
            result = await self.executor.move(**point)

        elif action == "type":
            if not text:
                raise McpError(ErrorData(code=INVALID_PARAMS, message="text is required for type"))
            result = await self.executor.type(text=text)

        elif action == "key":
            if not text:
                raise McpError(ErrorData(code=INVALID_PARAMS, message="text is required for key"))
            # Anthropic sends single key or combo like "ctrl+a"
            # Map to CLA standard key format
            mapped_key = self._map_anthropic_key_to_cla(text)
            result = await self.executor.press(keys=[mapped_key])

        elif action == "scroll":
            # Original implementation validates scroll_direction and scroll_amount
            if scroll_direction not in ["up", "down", "left", "right"]:
                raise McpError(
                    ErrorData(
                        code=INVALID_PARAMS,
                        message="scroll_direction must be 'up', 'down', 'left', or 'right'",
                    )
                )

            if scroll_amount is None or scroll_amount < 0:
                raise McpError(
                    ErrorData(
                        code=INVALID_PARAMS, message="scroll_amount must be a non-negative int"
                    )
                )

            # Convert direction to signed amounts; "up" and "left" are negative.
            scroll_x = None
            scroll_y = None
            if scroll_direction == "down":
                scroll_y = scroll_amount
            elif scroll_direction == "up":
                scroll_y = -scroll_amount
            elif scroll_direction == "right":
                scroll_x = scroll_amount
            else:  # "left"; the direction was validated above
                scroll_x = -scroll_amount

            result = await self.executor.scroll(
                **self._scaled_point_kwargs(coord_tuple), scroll_x=scroll_x, scroll_y=scroll_y
            )

        elif action == "left_click_drag" or action == "drag":
            # Anthropic sends drag with start and end coordinates
            if not (coord_tuple and len(coord_tuple) >= 2):
                raise McpError(
                    ErrorData(
                        code=INVALID_PARAMS, message="coordinate is required for left_click_drag"
                    )
                )
            if start_coord_tuple and len(start_coord_tuple) >= 2:
                drag_start = (start_coord_tuple[0], start_coord_tuple[1])
            else:
                # NOTE: simplification - with no start coordinate the drag
                # begins at (0, 0), not at the current pointer position.
                drag_start = (0, 0)
            scaled_path = self._scale_path([drag_start, (coord_tuple[0], coord_tuple[1])])
            result = await self.executor.drag(path=scaled_path)

        elif action == "wait":
            # Anthropic supplies seconds; the executor is given milliseconds.
            seconds = self._require_duration(duration, "wait")
            result = await self.executor.wait(time=int(seconds * 1000))

        elif action == "hold_key":
            if text is None:
                raise McpError(
                    ErrorData(code=INVALID_PARAMS, message="text is required for hold_key")
                )
            seconds = self._require_duration(duration, "hold_key")
            result = await self.executor.hold_key(key=text, duration=seconds)

        elif action == "left_mouse_down":
            # These don't accept coordinates in original spec
            if coord_tuple is not None:
                raise McpError(
                    ErrorData(
                        code=INVALID_PARAMS,
                        message="coordinate is not accepted for left_mouse_down",
                    )
                )
            result = await self.executor.mouse_down(button="left")

        elif action == "left_mouse_up":
            # These don't accept coordinates in original spec
            if coord_tuple is not None:
                raise McpError(
                    ErrorData(
                        code=INVALID_PARAMS, message="coordinate is not accepted for left_mouse_up"
                    )
                )
            result = await self.executor.mouse_up(button="left")

        elif action == "cursor_position":
            result = await self.executor.position()

        else:
            # Unknown action
            raise McpError(ErrorData(code=INTERNAL_ERROR, message=f"Invalid action: {action}"))

        # Rescale screenshot in result if present
        # NOTE(review): for action == "screenshot" the image has already been
        # through _rescale_screenshot above - confirm the second pass is harmless.
        if isinstance(result, ToolResult) and result.base64_image and self.rescale_images:
            rescaled_image = await self._rescale_screenshot(result.base64_image)
            result = result.replace(base64_image=rescaled_image)

        # Actions that get a follow-up screenshot when the executor supplied none.
        # "screenshot" and "cursor_position" are intentionally not listed.
        follow_up_actions = {
            "left_click",
            "click",
            "double_click",
            "triple_click",
            "right_click",
            "middle_click",
            "mouse_move",
            "move",
            "type",
            "key",
            "scroll",
            "left_click_drag",
            "drag",
            "wait",
            "hold_key",
            "left_mouse_down",
            "left_mouse_up",
        }

        if (
            action in follow_up_actions
            and take_screenshot_on_click
            and isinstance(result, ToolResult)
            and not result.base64_image
        ):
            screenshot_base64 = await self.executor.screenshot()
            if screenshot_base64:
                # Rescale screenshot if requested
                screenshot_base64 = await self._rescale_screenshot(screenshot_base64)
                result = ToolResult(
                    output=result.output, error=result.error, base64_image=screenshot_base64
                )

        # Convert to content blocks
        return tool_result_to_content_blocks(result)