hud-python 0.4.1__py3-none-any.whl → 0.4.3__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Potentially problematic release.
This version of hud-python might be problematic. Click here for more details.
- hud/__init__.py +22 -22
- hud/agents/__init__.py +13 -15
- hud/agents/base.py +599 -599
- hud/agents/claude.py +373 -373
- hud/agents/langchain.py +261 -250
- hud/agents/misc/__init__.py +7 -7
- hud/agents/misc/response_agent.py +82 -80
- hud/agents/openai.py +352 -352
- hud/agents/openai_chat_generic.py +154 -154
- hud/agents/tests/__init__.py +1 -1
- hud/agents/tests/test_base.py +742 -742
- hud/agents/tests/test_claude.py +324 -324
- hud/agents/tests/test_client.py +363 -363
- hud/agents/tests/test_openai.py +237 -237
- hud/cli/__init__.py +617 -617
- hud/cli/__main__.py +8 -8
- hud/cli/analyze.py +371 -371
- hud/cli/analyze_metadata.py +230 -230
- hud/cli/build.py +498 -427
- hud/cli/clone.py +185 -185
- hud/cli/cursor.py +92 -92
- hud/cli/debug.py +392 -392
- hud/cli/docker_utils.py +83 -83
- hud/cli/init.py +280 -281
- hud/cli/interactive.py +353 -353
- hud/cli/mcp_server.py +764 -756
- hud/cli/pull.py +330 -336
- hud/cli/push.py +404 -370
- hud/cli/remote_runner.py +311 -311
- hud/cli/runner.py +160 -160
- hud/cli/tests/__init__.py +3 -3
- hud/cli/tests/test_analyze.py +284 -284
- hud/cli/tests/test_cli_init.py +265 -265
- hud/cli/tests/test_cli_main.py +27 -27
- hud/cli/tests/test_clone.py +142 -142
- hud/cli/tests/test_cursor.py +253 -253
- hud/cli/tests/test_debug.py +453 -453
- hud/cli/tests/test_mcp_server.py +139 -139
- hud/cli/tests/test_utils.py +388 -388
- hud/cli/utils.py +263 -263
- hud/clients/README.md +143 -143
- hud/clients/__init__.py +16 -16
- hud/clients/base.py +378 -379
- hud/clients/fastmcp.py +222 -222
- hud/clients/mcp_use.py +298 -278
- hud/clients/tests/__init__.py +1 -1
- hud/clients/tests/test_client_integration.py +111 -111
- hud/clients/tests/test_fastmcp.py +342 -342
- hud/clients/tests/test_protocol.py +188 -188
- hud/clients/utils/__init__.py +1 -1
- hud/clients/utils/retry_transport.py +160 -160
- hud/datasets.py +327 -322
- hud/misc/__init__.py +1 -1
- hud/misc/claude_plays_pokemon.py +292 -292
- hud/otel/__init__.py +35 -35
- hud/otel/collector.py +142 -142
- hud/otel/config.py +164 -164
- hud/otel/context.py +536 -536
- hud/otel/exporters.py +366 -366
- hud/otel/instrumentation.py +97 -97
- hud/otel/processors.py +118 -118
- hud/otel/tests/__init__.py +1 -1
- hud/otel/tests/test_processors.py +197 -197
- hud/server/__init__.py +5 -5
- hud/server/context.py +114 -114
- hud/server/helper/__init__.py +5 -5
- hud/server/low_level.py +132 -132
- hud/server/server.py +170 -166
- hud/server/tests/__init__.py +3 -3
- hud/settings.py +73 -73
- hud/shared/__init__.py +5 -5
- hud/shared/exceptions.py +180 -180
- hud/shared/requests.py +264 -264
- hud/shared/tests/test_exceptions.py +157 -157
- hud/shared/tests/test_requests.py +275 -275
- hud/telemetry/__init__.py +25 -25
- hud/telemetry/instrument.py +379 -379
- hud/telemetry/job.py +309 -309
- hud/telemetry/replay.py +74 -74
- hud/telemetry/trace.py +83 -83
- hud/tools/__init__.py +33 -33
- hud/tools/base.py +365 -365
- hud/tools/bash.py +161 -161
- hud/tools/computer/__init__.py +15 -15
- hud/tools/computer/anthropic.py +437 -437
- hud/tools/computer/hud.py +376 -376
- hud/tools/computer/openai.py +295 -295
- hud/tools/computer/settings.py +82 -82
- hud/tools/edit.py +314 -314
- hud/tools/executors/__init__.py +30 -30
- hud/tools/executors/base.py +539 -539
- hud/tools/executors/pyautogui.py +621 -621
- hud/tools/executors/tests/__init__.py +1 -1
- hud/tools/executors/tests/test_base_executor.py +338 -338
- hud/tools/executors/tests/test_pyautogui_executor.py +165 -165
- hud/tools/executors/xdo.py +511 -511
- hud/tools/playwright.py +412 -412
- hud/tools/tests/__init__.py +3 -3
- hud/tools/tests/test_base.py +282 -282
- hud/tools/tests/test_bash.py +158 -158
- hud/tools/tests/test_bash_extended.py +197 -197
- hud/tools/tests/test_computer.py +425 -425
- hud/tools/tests/test_computer_actions.py +34 -34
- hud/tools/tests/test_edit.py +259 -259
- hud/tools/tests/test_init.py +27 -27
- hud/tools/tests/test_playwright_tool.py +183 -183
- hud/tools/tests/test_tools.py +145 -145
- hud/tools/tests/test_utils.py +156 -156
- hud/tools/types.py +72 -72
- hud/tools/utils.py +50 -50
- hud/types.py +136 -136
- hud/utils/__init__.py +10 -10
- hud/utils/async_utils.py +65 -65
- hud/utils/design.py +236 -168
- hud/utils/mcp.py +55 -55
- hud/utils/progress.py +149 -149
- hud/utils/telemetry.py +66 -66
- hud/utils/tests/test_async_utils.py +173 -173
- hud/utils/tests/test_init.py +17 -17
- hud/utils/tests/test_progress.py +261 -261
- hud/utils/tests/test_telemetry.py +82 -82
- hud/utils/tests/test_version.py +8 -8
- hud/version.py +7 -7
- {hud_python-0.4.1.dist-info → hud_python-0.4.3.dist-info}/METADATA +10 -8
- hud_python-0.4.3.dist-info/RECORD +131 -0
- {hud_python-0.4.1.dist-info → hud_python-0.4.3.dist-info}/licenses/LICENSE +21 -21
- hud/agents/art.py +0 -101
- hud_python-0.4.1.dist-info/RECORD +0 -132
- {hud_python-0.4.1.dist-info → hud_python-0.4.3.dist-info}/WHEEL +0 -0
- {hud_python-0.4.1.dist-info → hud_python-0.4.3.dist-info}/entry_points.txt +0 -0
hud/tools/computer/anthropic.py
CHANGED
|
@@ -1,437 +1,437 @@
|
|
|
1
|
-
# flake8: noqa: B008
|
|
2
|
-
from __future__ import annotations
|
|
3
|
-
|
|
4
|
-
import logging
|
|
5
|
-
from typing import TYPE_CHECKING, Any, Literal, cast
|
|
6
|
-
|
|
7
|
-
from mcp import ErrorData, McpError
|
|
8
|
-
from mcp.types import INTERNAL_ERROR, INVALID_PARAMS, ContentBlock
|
|
9
|
-
from pydantic import Field
|
|
10
|
-
|
|
11
|
-
from hud.tools.types import ContentResult
|
|
12
|
-
|
|
13
|
-
from .hud import HudComputerTool
|
|
14
|
-
from .settings import computer_settings
|
|
15
|
-
|
|
16
|
-
if TYPE_CHECKING:
|
|
17
|
-
from anthropic.types.beta import BetaToolComputerUse20250124Param
|
|
18
|
-
|
|
19
|
-
from hud.tools.executors.base import BaseExecutor
|
|
20
|
-
|
|
21
|
-
logger = logging.getLogger(__name__)
|
|
22
|
-
|
|
23
|
-
# Map Anthropic key names to CLA standard keys
|
|
24
|
-
ANTHROPIC_TO_CLA_KEYS = {
    # Confirmation / cancellation
    "Return": "enter",
    "Escape": "escape",
    # Arrow keys
    "ArrowUp": "up",
    "ArrowDown": "down",
    "ArrowLeft": "left",
    "ArrowRight": "right",
    # Editing and whitespace
    "Backspace": "backspace",
    "Delete": "delete",
    "Tab": "tab",
    "Space": "space",
    # Modifiers
    "Control": "ctrl",
    "Alt": "alt",
    "Shift": "shift",
    "Meta": "win",  # Windows key
    "Command": "cmd",  # macOS
    "Super": "win",  # Linux
    # Navigation
    "PageUp": "pageup",
    "PageDown": "pagedown",
    "Home": "home",
    "End": "end",
    "Insert": "insert",
    # Function keys F1..F12 map to their lowercase spelling
    **{f"F{number}": f"f{number}" for number in range(1, 13)},
}
|
|
60
|
-
|
|
61
|
-
|
|
62
|
-
class AnthropicComputerTool(HudComputerTool):
    """
    Anthropic Computer Use tool for interacting with the computer.

    Accepts actions in the shape used by Anthropic's ``computer_20250124``
    tool (``left_click``, ``key``, ``scroll``, ...) and forwards them to
    ``self.executor``, scaling coordinates with the base-class helpers.
    """

    name: str = "computer"
    api_type: str = "computer_20250124"

    def __init__(
        self,
        # Define within environment based on platform
        executor: BaseExecutor | None = None,
        platform_type: Literal["auto", "xdo", "pyautogui"] = "auto",
        display_num: int | None = None,
        # Overrides for what dimensions the agent thinks it operates in
        width: int = computer_settings.ANTHROPIC_COMPUTER_WIDTH,
        height: int = computer_settings.ANTHROPIC_COMPUTER_HEIGHT,
        rescale_images: bool = computer_settings.ANTHROPIC_RESCALE_IMAGES,
        # What the agent sees as the tool's name, title, and description
        name: str | None = None,
        title: str | None = None,
        description: str | None = None,
        **kwargs: Any,
    ) -> None:
        """
        Initialize with Anthropic's default dimensions.

        Args:
            executor: Executor forwarded unchanged to ``HudComputerTool``.
            platform_type: Executor backend selector forwarded to the base class.
            display_num: Display number forwarded to the base class.
            width: Width the agent operates in; defaults to
                ``computer_settings.ANTHROPIC_COMPUTER_WIDTH``.
            height: Height the agent operates in; defaults to
                ``computer_settings.ANTHROPIC_COMPUTER_HEIGHT``.
            rescale_images: If True, rescale screenshots. If False, only rescale
                action coordinates.
            name: Tool name for MCP registration; falls back to
                ``"anthropic_computer"`` when not provided.
            title: Human-readable display name; falls back to
                ``"Anthropic Computer Tool"`` when not provided.
            description: Tool description; falls back to a fixed default string
                when not provided.
            **kwargs: Extra keyword arguments passed through to the base class.
        """
        super().__init__(
            executor=executor,
            platform_type=platform_type,
            display_num=display_num,
            width=width,
            height=height,
            rescale_images=rescale_images,
            name=name or "anthropic_computer",
            title=title or "Anthropic Computer Tool",
            description=description or "Control computer with mouse, keyboard, and screenshot",
            **kwargs,
        )

    def to_params(self) -> BetaToolComputerUse20250124Param:
        """Convert to Anthropic tool parameters."""
        return cast(
            "BetaToolComputerUse20250124Param",
            {
                "type": self.api_type,
                "name": self.name,
                "display_width_px": self.width,
                "display_height_px": self.height,
            },
        )

    def _map_anthropic_key_to_cla(self, key: str) -> str:
        """
        Map an Anthropic key name (or ``+``-joined combination) to CLA keys.

        Each part is looked up in ``ANTHROPIC_TO_CLA_KEYS`` by exact name first,
        then ignoring case and surrounding whitespace. Parts with no entry are
        returned lowercased.

        The case-insensitive step compares lowercased names. The earlier
        ``str.capitalize()`` fallback lowercased everything after the first
        character, so mixed-case entries such as ``ArrowUp`` could never match
        ``"arrowup"`` or ``"ARROWUP"``.

        Args:
            key: A single key (``"Return"``) or combination (``"ctrl+a"``).

        Returns:
            The mapped key, with combination parts re-joined by ``+``.
        """

        def map_one(part: str) -> str:
            if part in ANTHROPIC_TO_CLA_KEYS:
                return ANTHROPIC_TO_CLA_KEYS[part]
            # Ignore padding such as "Control + a", but keep a part that is
            # nothing but whitespace exactly as it was given.
            folded = (part.strip() or part).lower()
            for anthropic_name, cla_name in ANTHROPIC_TO_CLA_KEYS.items():
                if anthropic_name.lower() == folded:
                    return cla_name
            return folded

        # A key without "+" splits into a single part, so one path covers both
        # single keys and combinations.
        return "+".join(map_one(part) for part in key.split("+"))

    @staticmethod
    def _validate_duration(duration: float | None, action: str) -> None:
        """Raise ``McpError`` unless ``duration`` is present and within 0..100 seconds."""
        if duration is None:
            raise McpError(
                ErrorData(code=INVALID_PARAMS, message=f"duration is required for {action}")
            )
        if duration < 0:
            raise McpError(
                ErrorData(code=INVALID_PARAMS, message="duration must be non-negative")
            )
        if duration > 100:
            raise McpError(ErrorData(code=INVALID_PARAMS, message="duration is too long"))

    async def __call__(
        self,
        action: str = Field(..., description="The action to perform on the computer"),
        coordinate: list[int] | tuple[int, int] | None = Field(
            None, description="The coordinate to interact with on the computer [x, y]"
        ),
        text: str | None = Field(
            None, description="The text to type on the computer or key to press"
        ),
        start_coordinate: list[int] | tuple[int, int] | None = Field(
            None, description="The starting coordinate for drag actions [x, y]"
        ),
        scroll_direction: str | None = Field(
            None, description="The direction to scroll (up, down, left, right)"
        ),
        scroll_amount: int | None = Field(None, description="The amount to scroll"),
        duration: float | None = Field(None, description="The duration of the action in seconds"),
        take_screenshot_on_click: bool = Field(
            True, description="Whether to take a screenshot after clicking"
        ),
    ) -> list[ContentBlock]:
        """
        Handle Anthropic Computer Use API calls.

        This converts Anthropic's action format to executor calls. Supported
        actions: ``screenshot``, ``left_click``/``click``, ``double_click``,
        ``triple_click``, ``right_click``, ``middle_click``,
        ``mouse_move``/``move``, ``type``, ``key``, ``scroll``,
        ``left_click_drag``/``drag``, ``wait``, ``hold_key``,
        ``left_mouse_down``, ``left_mouse_up`` and ``cursor_position``.

        Raises:
            McpError: ``INVALID_PARAMS`` when a required argument is missing or
                out of range; ``INTERNAL_ERROR`` for an unknown action.

        Returns:
            List of MCP content blocks
        """
        logger.info("AnthropicComputerTool received action: %s", action)

        # Convert lists to tuples if needed
        coord_tuple = None
        if coordinate:
            coord_tuple = tuple(coordinate) if isinstance(coordinate, list) else coordinate

        start_coord_tuple = None
        if start_coordinate:
            start_coord_tuple = (
                tuple(start_coordinate) if isinstance(start_coordinate, list) else start_coordinate
            )

        def scaled_target():
            """Return the scaled (x, y) of ``coordinate``, or None when unusable."""
            # Evaluated lazily, only by the actions that consume a coordinate.
            if coord_tuple and len(coord_tuple) >= 2:
                return self._scale_coordinates(coord_tuple[0], coord_tuple[1])
            return None

        # Extra executor.click() keyword arguments for each click-style action.
        # Built per call so the pattern lists are never shared between calls.
        click_variants = {
            "left_click": {},
            "click": {},
            "double_click": {"pattern": [100]},
            "triple_click": {"pattern": [100, 100]},
            "right_click": {"button": "right"},
            "middle_click": {"button": "middle"},
        }

        # Map Anthropic actions to executor actions
        if action == "screenshot":
            screenshot_base64 = await self.executor.screenshot()
            if screenshot_base64:
                # Rescaling, if enabled, happens after the action dispatch below
                result = ContentResult(base64_image=screenshot_base64)
            else:
                result = ContentResult(error="Failed to take screenshot")

        elif action in click_variants:
            click_kwargs = click_variants[action]
            target = scaled_target()
            if target is not None:
                scaled_x, scaled_y = target
                if action in ("left_click", "click"):
                    logger.info("Scaled coordinates: %s, %s", scaled_x, scaled_y)
                result = await self.executor.click(x=scaled_x, y=scaled_y, **click_kwargs)
            else:
                # No usable coordinate: click without an explicit position
                result = await self.executor.click(**click_kwargs)

        elif action == "mouse_move" or action == "move":
            target = scaled_target()
            if target is None:
                raise McpError(
                    ErrorData(code=INVALID_PARAMS, message="coordinate is required for mouse_move")
                )
            scaled_x, scaled_y = target
            result = await self.executor.move(x=scaled_x, y=scaled_y)

        elif action == "type":
            if not text:
                raise McpError(ErrorData(code=INVALID_PARAMS, message="text is required for type"))
            result = await self.executor.write(text=text)

        elif action == "key":
            if not text:
                raise McpError(ErrorData(code=INVALID_PARAMS, message="text is required for key"))
            # Anthropic sends single key or combo like "ctrl+a";
            # map to CLA standard key format, then split into a list of keys
            mapped_key = self._map_anthropic_key_to_cla(text)
            if "+" in mapped_key:
                keys_list = [k.strip() for k in mapped_key.split("+")]
            else:
                keys_list = [mapped_key]
            result = await self.executor.press(keys=keys_list)

        elif action == "scroll":
            if scroll_direction not in ["up", "down", "left", "right"]:
                raise McpError(
                    ErrorData(
                        code=INVALID_PARAMS,
                        message="scroll_direction must be 'up', 'down', 'left', or 'right'",
                    )
                )

            if scroll_amount is None or scroll_amount < 0:
                raise McpError(
                    ErrorData(
                        code=INVALID_PARAMS, message="scroll_amount must be a non-negative int"
                    )
                )

            # Convert scroll amount from "clicks" to pixels
            # Anthropic's scroll_amount represents wheel clicks, not pixels
            # Standard conversion: 1 wheel click ≈ 100 pixels (3 lines of text)
            PIXELS_PER_WHEEL_CLICK = 100
            pixel_amount = scroll_amount * PIXELS_PER_WHEEL_CLICK

            # Only the axis being scrolled gets a value; the other stays None
            scroll_x = None
            scroll_y = None
            if scroll_direction == "down":
                scroll_y = pixel_amount
            elif scroll_direction == "up":
                scroll_y = -pixel_amount
            elif scroll_direction == "right":
                scroll_x = pixel_amount
            elif scroll_direction == "left":
                scroll_x = -pixel_amount

            target = scaled_target()
            if target is not None:
                scaled_x, scaled_y = target
                result = await self.executor.scroll(
                    x=scaled_x, y=scaled_y, scroll_x=scroll_x, scroll_y=scroll_y
                )
            else:
                result = await self.executor.scroll(scroll_x=scroll_x, scroll_y=scroll_y)

        elif action == "left_click_drag" or action == "drag":
            # Anthropic sends drag with start and end coordinates
            if not (coord_tuple and len(coord_tuple) >= 2):
                raise McpError(
                    ErrorData(
                        code=INVALID_PARAMS, message="coordinate is required for left_click_drag"
                    )
                )
            if start_coord_tuple and len(start_coord_tuple) >= 2:
                # Full drag path
                path = [
                    (start_coord_tuple[0], start_coord_tuple[1]),
                    (coord_tuple[0], coord_tuple[1]),
                ]
            else:
                # Only an end coordinate was given. The path starts at (0, 0),
                # a simplification: the current pointer position is not queried.
                path = [(0, 0), (coord_tuple[0], coord_tuple[1])]
            scaled_path = self._scale_path(path)
            result = await self.executor.drag(path=scaled_path)

        elif action == "wait":
            # Duration arrives in seconds
            self._validate_duration(duration, "wait")
            # Convert seconds to milliseconds for the executor
            result = await self.executor.wait(time=int(duration * 1000))

        elif action == "hold_key":
            if text is None:
                raise McpError(
                    ErrorData(code=INVALID_PARAMS, message="text is required for hold_key")
                )
            self._validate_duration(duration, "hold_key")
            # NOTE(review): unlike "key", the name is passed through without
            # _map_anthropic_key_to_cla — confirm the executor expects that.
            result = await self.executor.hold_key(key=text, duration=duration)

        elif action in ("left_mouse_down", "left_mouse_up"):
            # These actions act at the current pointer position only
            if coord_tuple is not None:
                raise McpError(
                    ErrorData(
                        code=INVALID_PARAMS,
                        message=f"coordinate is not accepted for {action}",
                    )
                )
            if action == "left_mouse_down":
                result = await self.executor.mouse_down(button="left")
            else:
                result = await self.executor.mouse_up(button="left")

        elif action == "cursor_position":
            result = await self.executor.position()

        else:
            # Unknown action
            raise McpError(ErrorData(code=INTERNAL_ERROR, message=f"Invalid action: {action}"))

        # Rescale screenshot in result if present
        if isinstance(result, ContentResult) and result.base64_image and self.rescale_images:
            rescaled_image = await self._rescale_screenshot(result.base64_image)
            result.base64_image = rescaled_image

        # Actions that get a follow-up screenshot when their result has none.
        # "screenshot" already carries its image and "cursor_position" is
        # excluded, matching the original action list.
        followup_screenshot_actions = {
            "left_click",
            "click",
            "double_click",
            "triple_click",
            "right_click",
            "middle_click",
            "mouse_move",
            "move",
            "type",
            "key",
            "scroll",
            "left_click_drag",
            "drag",
            "wait",
            "hold_key",
            "left_mouse_down",
            "left_mouse_up",
        }

        if (
            action in followup_screenshot_actions
            and take_screenshot_on_click
            and isinstance(result, ContentResult)
            and not result.base64_image
        ):
            screenshot_base64 = await self.executor.screenshot()
            if screenshot_base64:
                screenshot_base64 = await self._rescale_screenshot(screenshot_base64)
                result = ContentResult(
                    output=result.output, error=result.error, base64_image=screenshot_base64
                )

        # Convert to content blocks
        return result.to_content_blocks()
|
|
1
|
+
# flake8: noqa: B008
|
|
2
|
+
from __future__ import annotations
|
|
3
|
+
|
|
4
|
+
import logging
|
|
5
|
+
from typing import TYPE_CHECKING, Any, Literal, cast
|
|
6
|
+
|
|
7
|
+
from mcp import ErrorData, McpError
|
|
8
|
+
from mcp.types import INTERNAL_ERROR, INVALID_PARAMS, ContentBlock
|
|
9
|
+
from pydantic import Field
|
|
10
|
+
|
|
11
|
+
from hud.tools.types import ContentResult
|
|
12
|
+
|
|
13
|
+
from .hud import HudComputerTool
|
|
14
|
+
from .settings import computer_settings
|
|
15
|
+
|
|
16
|
+
if TYPE_CHECKING:
|
|
17
|
+
from anthropic.types.beta import BetaToolComputerUse20250124Param
|
|
18
|
+
|
|
19
|
+
from hud.tools.executors.base import BaseExecutor
|
|
20
|
+
|
|
21
|
+
logger = logging.getLogger(__name__)
|
|
22
|
+
|
|
23
|
+
# Map Anthropic key names to CLA standard keys
|
|
24
|
+
ANTHROPIC_TO_CLA_KEYS = {
    # Confirmation / cancellation
    "Return": "enter",
    "Escape": "escape",
    # Arrow keys
    "ArrowUp": "up",
    "ArrowDown": "down",
    "ArrowLeft": "left",
    "ArrowRight": "right",
    # Editing and whitespace
    "Backspace": "backspace",
    "Delete": "delete",
    "Tab": "tab",
    "Space": "space",
    # Modifiers
    "Control": "ctrl",
    "Alt": "alt",
    "Shift": "shift",
    "Meta": "win",  # Windows key
    "Command": "cmd",  # macOS
    "Super": "win",  # Linux
    # Navigation
    "PageUp": "pageup",
    "PageDown": "pagedown",
    "Home": "home",
    "End": "end",
    "Insert": "insert",
    # Function keys F1..F12 map to their lowercase spelling
    **{f"F{number}": f"f{number}" for number in range(1, 13)},
}
|
|
60
|
+
|
|
61
|
+
|
|
62
|
+
class AnthropicComputerTool(HudComputerTool):
|
|
63
|
+
"""
|
|
64
|
+
Anthropic Computer Use tool for interacting with the computer.
|
|
65
|
+
"""
|
|
66
|
+
|
|
67
|
+
name: str = "computer"
|
|
68
|
+
api_type: str = "computer_20250124"
|
|
69
|
+
|
|
70
|
+
def __init__(
    self,
    # Define within environment based on platform
    executor: BaseExecutor | None = None,
    platform_type: Literal["auto", "xdo", "pyautogui"] = "auto",
    display_num: int | None = None,
    # Overrides for what dimensions the agent thinks it operates in
    width: int = computer_settings.ANTHROPIC_COMPUTER_WIDTH,
    height: int = computer_settings.ANTHROPIC_COMPUTER_HEIGHT,
    rescale_images: bool = computer_settings.ANTHROPIC_RESCALE_IMAGES,
    # What the agent sees as the tool's name, title, and description
    name: str | None = None,
    title: str | None = None,
    description: str | None = None,
    **kwargs: Any,
) -> None:
    """
    Set up the tool using Anthropic's configured display dimensions.

    Every argument is handed to the parent initializer. The three
    presentation fields are replaced by Anthropic-specific defaults when the
    caller leaves them unset or empty.

    Args:
        executor: Executor instance passed through to the parent class.
        platform_type: One of "auto", "xdo" or "pyautogui"; passed through.
        display_num: Display number passed through to the parent class.
        width: Width the agent works in (default taken from computer_settings).
        height: Height the agent works in (default taken from computer_settings).
        rescale_images: If True, rescale screenshots. If False, only rescale
            action coordinates.
        name: Tool name; "anthropic_computer" is used when falsy.
        title: Display title; "Anthropic Computer Tool" is used when falsy.
        description: Tool description; a fixed default is used when falsy.
        **kwargs: Additional keyword arguments for the parent initializer.
    """
    # Resolve the presentation fields first so the parent call reads cleanly.
    resolved_name = name or "anthropic_computer"
    resolved_title = title or "Anthropic Computer Tool"
    resolved_description = (
        description or "Control computer with mouse, keyboard, and screenshot"
    )

    super().__init__(
        executor=executor,
        platform_type=platform_type,
        display_num=display_num,
        width=width,
        height=height,
        rescale_images=rescale_images,
        name=resolved_name,
        title=resolved_title,
        description=resolved_description,
        **kwargs,
    )
|
|
109
|
+
|
|
110
|
+
def to_params(self) -> BetaToolComputerUse20250124Param:
    """Build the Anthropic tool-definition mapping for this tool.

    The mapping carries the API type, the tool name, and the display size
    taken from ``self.width`` / ``self.height``.
    """
    tool_definition = {
        "type": self.api_type,
        "name": self.name,
        "display_width_px": self.width,
        "display_height_px": self.height,
    }
    # cast() only informs the type checker; the plain dict is returned as-is.
    return cast("BetaToolComputerUse20250124Param", tool_definition)
|
|
121
|
+
|
|
122
|
+
def _map_anthropic_key_to_cla(self, key: str) -> str:
    """
    Map an Anthropic key name (or ``+``-joined combination) to CLA keys.

    Each part is looked up in ``ANTHROPIC_TO_CLA_KEYS`` by exact name first,
    then ignoring case and surrounding whitespace. Parts with no entry are
    returned lowercased.

    The case-insensitive step compares lowercased names. The earlier
    ``str.capitalize()`` fallback lowercased everything after the first
    character, so mixed-case entries such as ``ArrowUp`` could never match
    ``"arrowup"`` or ``"ARROWUP"``.

    Args:
        key: A single key (``"Return"``) or combination (``"ctrl+a"``).

    Returns:
        The mapped key, with combination parts re-joined by ``+``.
    """

    def map_one(part: str) -> str:
        if part in ANTHROPIC_TO_CLA_KEYS:
            return ANTHROPIC_TO_CLA_KEYS[part]
        # Ignore padding such as "Control + a", but keep a part that is
        # nothing but whitespace exactly as it was given.
        folded = (part.strip() or part).lower()
        for anthropic_name, cla_name in ANTHROPIC_TO_CLA_KEYS.items():
            if anthropic_name.lower() == folded:
                return cla_name
        return folded

    # A key without "+" splits into a single part, so one path covers both
    # single keys and combinations.
    return "+".join(map_one(part) for part in key.split("+"))
|
|
140
|
+
|
|
141
|
+
async def __call__(
    self,
    action: str = Field(..., description="The action to perform on the computer"),
    coordinate: list[int] | tuple[int, int] | None = Field(
        None, description="The coordinate to interact with on the computer [x, y]"
    ),
    text: str | None = Field(
        None, description="The text to type on the computer or key to press"
    ),
    start_coordinate: list[int] | tuple[int, int] | None = Field(
        None, description="The starting coordinate for drag actions [x, y]"
    ),
    scroll_direction: str | None = Field(
        None, description="The direction to scroll (up, down, left, right)"
    ),
    scroll_amount: int | None = Field(None, description="The amount to scroll"),
    duration: float | None = Field(None, description="The duration of the action in seconds"),
    take_screenshot_on_click: bool = Field(
        True, description="Whether to take a screenshot after clicking"
    ),
) -> list[ContentBlock]:
    """
    Handle Anthropic Computer Use API calls.

    Translates one Anthropic-style action into the matching executor call and
    returns the outcome as MCP content blocks.

    Args:
        action: Action name. Recognized values are ``screenshot``,
            ``left_click``/``click``, ``double_click``, ``triple_click``,
            ``right_click``, ``middle_click``, ``mouse_move``/``move``,
            ``type``, ``key``, ``scroll``, ``left_click_drag``/``drag``,
            ``wait``, ``hold_key``, ``left_mouse_down``, ``left_mouse_up``
            and ``cursor_position``.
        coordinate: Target ``[x, y]``. It is passed through
            ``_scale_coordinates`` / ``_scale_path`` before reaching the
            executor. Required for ``mouse_move`` and ``left_click_drag``;
            rejected for ``left_mouse_down`` / ``left_mouse_up``.
        text: Text to write for ``type``; key or ``+``-joined combination for
            ``key``; key to hold for ``hold_key``.
        start_coordinate: Optional drag start ``[x, y]``. When omitted the
            drag starts wherever the pointer currently is.
        scroll_direction: One of ``up``, ``down``, ``left``, ``right``.
        scroll_amount: Non-negative number of wheel clicks; each click is
            converted to 100 pixels.
        duration: Seconds, in the range 0..100, for ``wait`` and ``hold_key``.
        take_screenshot_on_click: When true, a fresh screenshot is attached to
            the result of every action except ``screenshot`` and
            ``cursor_position`` if that result carries no image yet.

    Returns:
        List of MCP content blocks

    Raises:
        McpError: With ``INVALID_PARAMS`` when a required argument is missing
            or out of range, and with ``INTERNAL_ERROR`` when ``action`` is
            not recognized.
    """
    logger.info("AnthropicComputerTool received action: %s", action)

    # Convert lists to tuples if needed
    coord_tuple = None
    if coordinate:
        coord_tuple = tuple(coordinate) if isinstance(coordinate, list) else coordinate

    start_coord_tuple = None
    if start_coordinate:
        start_coord_tuple = (
            tuple(start_coordinate) if isinstance(start_coordinate, list) else start_coordinate
        )

    # Map Anthropic actions to HudComputerTool actions
    if action == "screenshot":
        screenshot_base64 = await self.executor.screenshot()
        if screenshot_base64:
            # Rescaling (if enabled) happens in the shared post-processing below.
            result = ContentResult(base64_image=screenshot_base64)
        else:
            result = ContentResult(error="Failed to take screenshot")

    elif action in ("left_click", "click"):
        if coord_tuple and len(coord_tuple) >= 2:
            scaled_x, scaled_y = self._scale_coordinates(coord_tuple[0], coord_tuple[1])
            logger.info("Scaled coordinates: %s, %s", scaled_x, scaled_y)
            result = await self.executor.click(x=scaled_x, y=scaled_y)
        else:
            result = await self.executor.click()

    elif action == "double_click":
        if coord_tuple and len(coord_tuple) >= 2:
            # Use pattern for double-click
            scaled_x, scaled_y = self._scale_coordinates(coord_tuple[0], coord_tuple[1])
            result = await self.executor.click(x=scaled_x, y=scaled_y, pattern=[100])
        else:
            result = await self.executor.click(pattern=[100])

    elif action == "triple_click":
        if coord_tuple and len(coord_tuple) >= 2:
            # Use pattern for triple-click
            scaled_x, scaled_y = self._scale_coordinates(coord_tuple[0], coord_tuple[1])
            result = await self.executor.click(x=scaled_x, y=scaled_y, pattern=[100, 100])
        else:
            result = await self.executor.click(pattern=[100, 100])

    elif action == "right_click":
        if coord_tuple and len(coord_tuple) >= 2:
            scaled_x, scaled_y = self._scale_coordinates(coord_tuple[0], coord_tuple[1])
            result = await self.executor.click(x=scaled_x, y=scaled_y, button="right")
        else:
            result = await self.executor.click(button="right")

    elif action == "middle_click":
        if coord_tuple and len(coord_tuple) >= 2:
            scaled_x, scaled_y = self._scale_coordinates(coord_tuple[0], coord_tuple[1])
            result = await self.executor.click(x=scaled_x, y=scaled_y, button="middle")
        else:
            result = await self.executor.click(button="middle")

    elif action in ("mouse_move", "move"):
        if coord_tuple and len(coord_tuple) >= 2:
            scaled_x, scaled_y = self._scale_coordinates(coord_tuple[0], coord_tuple[1])
            result = await self.executor.move(x=scaled_x, y=scaled_y)
        else:
            raise McpError(
                ErrorData(code=INVALID_PARAMS, message="coordinate is required for mouse_move")
            )

    elif action == "type":
        if text:
            result = await self.executor.write(text=text)
        else:
            raise McpError(ErrorData(code=INVALID_PARAMS, message="text is required for type"))

    elif action == "key":
        if text:
            # Anthropic sends single key or combo like "ctrl+a"
            # Map to CLA standard key format
            mapped_key = self._map_anthropic_key_to_cla(text)

            # Split key combination into list of keys
            if "+" in mapped_key:
                keys_list = [k.strip() for k in mapped_key.split("+")]
            else:
                keys_list = [mapped_key]

            result = await self.executor.press(keys=keys_list)
        else:
            raise McpError(ErrorData(code=INVALID_PARAMS, message="text is required for key"))

    elif action == "scroll":
        # Original implementation validates scroll_direction and scroll_amount
        if scroll_direction not in ["up", "down", "left", "right"]:
            raise McpError(
                ErrorData(
                    code=INVALID_PARAMS,
                    message="scroll_direction must be 'up', 'down', 'left', or 'right'",
                )
            )

        if scroll_amount is None or scroll_amount < 0:
            raise McpError(
                ErrorData(
                    code=INVALID_PARAMS, message="scroll_amount must be a non-negative int"
                )
            )

        # Convert scroll amount from "clicks" to pixels
        # Anthropic's scroll_amount represents wheel clicks, not pixels
        # Standard conversion: 1 wheel click ≈ 100 pixels (3 lines of text)
        pixels_per_wheel_click = 100
        pixel_amount = scroll_amount * pixels_per_wheel_click

        # Convert direction to signed scroll amounts; only one axis is set.
        scroll_x = None
        scroll_y = None
        if scroll_direction == "down":
            scroll_y = pixel_amount
        elif scroll_direction == "up":
            scroll_y = -pixel_amount
        elif scroll_direction == "right":
            scroll_x = pixel_amount
        elif scroll_direction == "left":
            scroll_x = -pixel_amount

        if coord_tuple and len(coord_tuple) >= 2:
            scaled_x, scaled_y = self._scale_coordinates(coord_tuple[0], coord_tuple[1])
            result = await self.executor.scroll(
                x=scaled_x, y=scaled_y, scroll_x=scroll_x, scroll_y=scroll_y
            )
        else:
            result = await self.executor.scroll(scroll_x=scroll_x, scroll_y=scroll_y)

    elif action in ("left_click_drag", "drag"):
        # Anthropic sends drag with an end coordinate and (optionally) a start one
        if not (coord_tuple and len(coord_tuple) >= 2):
            raise McpError(
                ErrorData(
                    code=INVALID_PARAMS, message="coordinate is required for left_click_drag"
                )
            )

        if start_coord_tuple and len(start_coord_tuple) >= 2:
            # Full drag path
            path = [
                (start_coord_tuple[0], start_coord_tuple[1]),
                (coord_tuple[0], coord_tuple[1]),
            ]
            scaled_path = self._scale_path(path)
            result = await self.executor.drag(path=scaled_path)
        else:
            # Only the end coordinate was given: drag from wherever the pointer
            # currently is. Press, move and release are issued separately so
            # that no start point has to be invented (a synthetic (0, 0) start
            # would begin the drag in the top-left corner of the screen).
            scaled_x, scaled_y = self._scale_coordinates(coord_tuple[0], coord_tuple[1])
            await self.executor.mouse_down(button="left")
            try:
                await self.executor.move(x=scaled_x, y=scaled_y)
            finally:
                # Always release, so a failed move cannot leave the button held.
                result = await self.executor.mouse_up(button="left")

    elif action == "wait":
        # Original spec expects duration in seconds
        if duration is None:
            raise McpError(
                ErrorData(code=INVALID_PARAMS, message="duration is required for wait")
            )
        if duration < 0:
            raise McpError(
                ErrorData(code=INVALID_PARAMS, message="duration must be non-negative")
            )
        if duration > 100:
            raise McpError(ErrorData(code=INVALID_PARAMS, message="duration is too long"))

        # Convert seconds to milliseconds for HudComputerTool
        result = await self.executor.wait(time=int(duration * 1000))

    elif action == "hold_key":
        # Original spec has hold_key action
        if text is None:
            raise McpError(
                ErrorData(code=INVALID_PARAMS, message="text is required for hold_key")
            )
        if duration is None:
            raise McpError(
                ErrorData(code=INVALID_PARAMS, message="duration is required for hold_key")
            )
        if duration < 0:
            raise McpError(
                ErrorData(code=INVALID_PARAMS, message="duration must be non-negative")
            )
        if duration > 100:
            raise McpError(ErrorData(code=INVALID_PARAMS, message="duration is too long"))

        # Hold key action
        # NOTE(review): unlike "key", the name is forwarded without going through
        # _map_anthropic_key_to_cla - confirm the executor accepts it as-is.
        result = await self.executor.hold_key(key=text, duration=duration)

    elif action == "left_mouse_down":
        # These don't accept coordinates in original spec
        if coord_tuple is not None:
            raise McpError(
                ErrorData(
                    code=INVALID_PARAMS,
                    message="coordinate is not accepted for left_mouse_down",
                )
            )
        # Use generic mouse_down method
        result = await self.executor.mouse_down(button="left")

    elif action == "left_mouse_up":
        # These don't accept coordinates in original spec
        if coord_tuple is not None:
            raise McpError(
                ErrorData(
                    code=INVALID_PARAMS, message="coordinate is not accepted for left_mouse_up"
                )
            )
        # Use generic mouse_up method
        result = await self.executor.mouse_up(button="left")

    elif action == "cursor_position":
        result = await self.executor.position()

    else:
        # Unknown action
        raise McpError(ErrorData(code=INTERNAL_ERROR, message=f"Invalid action: {action}"))

    # Rescale screenshot in result if present
    if isinstance(result, ContentResult) and result.base64_image and self.rescale_images:
        rescaled_image = await self._rescale_screenshot(result.base64_image)
        result.base64_image = rescaled_image

    # Actions whose result should carry a screenshot of the resulting screen
    screenshot_actions = {
        "screenshot",
        "left_click",
        "click",
        "double_click",
        "triple_click",
        "right_click",
        "middle_click",
        "mouse_move",
        "move",
        "type",
        "key",
        "scroll",
        "left_click_drag",
        "drag",
        "wait",
        "hold_key",
        "left_mouse_down",
        "left_mouse_up",
    }

    # Attach a follow-up screenshot only when the executor did not already
    # return one and the caller has not opted out.
    if (
        action in screenshot_actions
        and action != "screenshot"
        and take_screenshot_on_click
        and isinstance(result, ContentResult)
        and not result.base64_image
    ):
        screenshot_base64 = await self.executor.screenshot()
        if screenshot_base64:
            # Rescale screenshot if requested
            screenshot_base64 = await self._rescale_screenshot(screenshot_base64)
            result = ContentResult(
                output=result.output, error=result.error, base64_image=screenshot_base64
            )

    # Convert to content blocks
    return result.to_content_blocks()
|