hud-python 0.4.1__py3-none-any.whl → 0.4.3__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Potentially problematic release.
This version of hud-python might be problematic.
- hud/__init__.py +22 -22
- hud/agents/__init__.py +13 -15
- hud/agents/base.py +599 -599
- hud/agents/claude.py +373 -373
- hud/agents/langchain.py +261 -250
- hud/agents/misc/__init__.py +7 -7
- hud/agents/misc/response_agent.py +82 -80
- hud/agents/openai.py +352 -352
- hud/agents/openai_chat_generic.py +154 -154
- hud/agents/tests/__init__.py +1 -1
- hud/agents/tests/test_base.py +742 -742
- hud/agents/tests/test_claude.py +324 -324
- hud/agents/tests/test_client.py +363 -363
- hud/agents/tests/test_openai.py +237 -237
- hud/cli/__init__.py +617 -617
- hud/cli/__main__.py +8 -8
- hud/cli/analyze.py +371 -371
- hud/cli/analyze_metadata.py +230 -230
- hud/cli/build.py +498 -427
- hud/cli/clone.py +185 -185
- hud/cli/cursor.py +92 -92
- hud/cli/debug.py +392 -392
- hud/cli/docker_utils.py +83 -83
- hud/cli/init.py +280 -281
- hud/cli/interactive.py +353 -353
- hud/cli/mcp_server.py +764 -756
- hud/cli/pull.py +330 -336
- hud/cli/push.py +404 -370
- hud/cli/remote_runner.py +311 -311
- hud/cli/runner.py +160 -160
- hud/cli/tests/__init__.py +3 -3
- hud/cli/tests/test_analyze.py +284 -284
- hud/cli/tests/test_cli_init.py +265 -265
- hud/cli/tests/test_cli_main.py +27 -27
- hud/cli/tests/test_clone.py +142 -142
- hud/cli/tests/test_cursor.py +253 -253
- hud/cli/tests/test_debug.py +453 -453
- hud/cli/tests/test_mcp_server.py +139 -139
- hud/cli/tests/test_utils.py +388 -388
- hud/cli/utils.py +263 -263
- hud/clients/README.md +143 -143
- hud/clients/__init__.py +16 -16
- hud/clients/base.py +378 -379
- hud/clients/fastmcp.py +222 -222
- hud/clients/mcp_use.py +298 -278
- hud/clients/tests/__init__.py +1 -1
- hud/clients/tests/test_client_integration.py +111 -111
- hud/clients/tests/test_fastmcp.py +342 -342
- hud/clients/tests/test_protocol.py +188 -188
- hud/clients/utils/__init__.py +1 -1
- hud/clients/utils/retry_transport.py +160 -160
- hud/datasets.py +327 -322
- hud/misc/__init__.py +1 -1
- hud/misc/claude_plays_pokemon.py +292 -292
- hud/otel/__init__.py +35 -35
- hud/otel/collector.py +142 -142
- hud/otel/config.py +164 -164
- hud/otel/context.py +536 -536
- hud/otel/exporters.py +366 -366
- hud/otel/instrumentation.py +97 -97
- hud/otel/processors.py +118 -118
- hud/otel/tests/__init__.py +1 -1
- hud/otel/tests/test_processors.py +197 -197
- hud/server/__init__.py +5 -5
- hud/server/context.py +114 -114
- hud/server/helper/__init__.py +5 -5
- hud/server/low_level.py +132 -132
- hud/server/server.py +170 -166
- hud/server/tests/__init__.py +3 -3
- hud/settings.py +73 -73
- hud/shared/__init__.py +5 -5
- hud/shared/exceptions.py +180 -180
- hud/shared/requests.py +264 -264
- hud/shared/tests/test_exceptions.py +157 -157
- hud/shared/tests/test_requests.py +275 -275
- hud/telemetry/__init__.py +25 -25
- hud/telemetry/instrument.py +379 -379
- hud/telemetry/job.py +309 -309
- hud/telemetry/replay.py +74 -74
- hud/telemetry/trace.py +83 -83
- hud/tools/__init__.py +33 -33
- hud/tools/base.py +365 -365
- hud/tools/bash.py +161 -161
- hud/tools/computer/__init__.py +15 -15
- hud/tools/computer/anthropic.py +437 -437
- hud/tools/computer/hud.py +376 -376
- hud/tools/computer/openai.py +295 -295
- hud/tools/computer/settings.py +82 -82
- hud/tools/edit.py +314 -314
- hud/tools/executors/__init__.py +30 -30
- hud/tools/executors/base.py +539 -539
- hud/tools/executors/pyautogui.py +621 -621
- hud/tools/executors/tests/__init__.py +1 -1
- hud/tools/executors/tests/test_base_executor.py +338 -338
- hud/tools/executors/tests/test_pyautogui_executor.py +165 -165
- hud/tools/executors/xdo.py +511 -511
- hud/tools/playwright.py +412 -412
- hud/tools/tests/__init__.py +3 -3
- hud/tools/tests/test_base.py +282 -282
- hud/tools/tests/test_bash.py +158 -158
- hud/tools/tests/test_bash_extended.py +197 -197
- hud/tools/tests/test_computer.py +425 -425
- hud/tools/tests/test_computer_actions.py +34 -34
- hud/tools/tests/test_edit.py +259 -259
- hud/tools/tests/test_init.py +27 -27
- hud/tools/tests/test_playwright_tool.py +183 -183
- hud/tools/tests/test_tools.py +145 -145
- hud/tools/tests/test_utils.py +156 -156
- hud/tools/types.py +72 -72
- hud/tools/utils.py +50 -50
- hud/types.py +136 -136
- hud/utils/__init__.py +10 -10
- hud/utils/async_utils.py +65 -65
- hud/utils/design.py +236 -168
- hud/utils/mcp.py +55 -55
- hud/utils/progress.py +149 -149
- hud/utils/telemetry.py +66 -66
- hud/utils/tests/test_async_utils.py +173 -173
- hud/utils/tests/test_init.py +17 -17
- hud/utils/tests/test_progress.py +261 -261
- hud/utils/tests/test_telemetry.py +82 -82
- hud/utils/tests/test_version.py +8 -8
- hud/version.py +7 -7
- {hud_python-0.4.1.dist-info → hud_python-0.4.3.dist-info}/METADATA +10 -8
- hud_python-0.4.3.dist-info/RECORD +131 -0
- {hud_python-0.4.1.dist-info → hud_python-0.4.3.dist-info}/licenses/LICENSE +21 -21
- hud/agents/art.py +0 -101
- hud_python-0.4.1.dist-info/RECORD +0 -132
- {hud_python-0.4.1.dist-info → hud_python-0.4.3.dist-info}/WHEEL +0 -0
- {hud_python-0.4.1.dist-info → hud_python-0.4.3.dist-info}/entry_points.txt +0 -0
hud/tools/computer/openai.py
CHANGED
@@ -1,295 +1,295 @@
All 295 lines are marked as removed and re-added, but the old and new file contents are line-for-line identical:

# flake8: noqa: B008
from __future__ import annotations

import logging
from typing import TYPE_CHECKING, Any, Literal, cast

from mcp import ErrorData, McpError
from mcp.types import INTERNAL_ERROR, INVALID_PARAMS, ContentBlock, TextContent
from pydantic import Field

from hud.tools.computer.settings import computer_settings
from hud.tools.types import ContentResult

from .hud import HudComputerTool

if TYPE_CHECKING:
    from hud.tools.executors.base import BaseExecutor

logger = logging.getLogger(__name__)

# Map OpenAI key names to CLA standard keys
OPENAI_TO_CLA_KEYS = {
    # Common variations
    "return": "enter",
    "escape": "escape",
    "arrowup": "up",
    "arrowdown": "down",
    "arrowleft": "left",
    "arrowright": "right",
    "backspace": "backspace",
    "delete": "delete",
    "tab": "tab",
    "space": "space",
    "control": "ctrl",
    "alt": "alt",
    "shift": "shift",
    "meta": "win",
    "cmd": "cmd",
    "command": "cmd",
    "super": "win",
    "pageup": "pageup",
    "pagedown": "pagedown",
    "home": "home",
    "end": "end",
    "insert": "insert",
}


class OpenAIComputerTool(HudComputerTool):
    """
    OpenAI Computer Use tool for interacting with the computer.
    """

    def __init__(
        self,
        # Define within environment based on platform
        executor: BaseExecutor | None = None,
        platform_type: Literal["auto", "xdo", "pyautogui"] = "auto",
        display_num: int | None = None,
        # Overrides for what dimensions the agent thinks it operates in
        width: int = computer_settings.OPENAI_COMPUTER_WIDTH,
        height: int = computer_settings.OPENAI_COMPUTER_HEIGHT,
        rescale_images: bool = computer_settings.OPENAI_RESCALE_IMAGES,
        name: str | None = None,
        title: str | None = None,
        description: str | None = None,
        **kwargs: Any,
    ) -> None:
        """
        Initialize with OpenAI's default dimensions.

        Args:
            width: Target width for rescaling (default: 1024 for OpenAI)
            height: Target height for rescaling (default: 768 for OpenAI)
            rescale_images: If True, rescale screenshots. If False, only rescale action coordinates
            name: Tool name for MCP registration (auto-generated from class name if not provided)
            title: Human-readable display name for the tool (auto-generated from class name)
            description: Tool description (auto-generated from docstring if not provided)
        """
        super().__init__(
            executor=executor,
            platform_type=platform_type,
            display_num=display_num,
            width=width,
            height=height,
            rescale_images=rescale_images,
            name=name or "openai_computer",
            title=title or "OpenAI Computer Tool",
            description=description or "Control computer with mouse, keyboard, and screenshots",
            **kwargs,
        )

    def _map_openai_key_to_cla(self, key: str) -> str:
        """Map OpenAI key name to CLA standard key."""
        # OpenAI uses lowercase key names
        return OPENAI_TO_CLA_KEYS.get(key.lower(), key.lower())

    async def __call__(
        self,
        type: str = Field(..., description="The action type to perform"),
        # Coordinate parameters
        x: int | None = Field(None, description="X coordinate for click/move/scroll actions"),
        y: int | None = Field(None, description="Y coordinate for click/move/scroll actions"),
        # Button parameter
        button: str | None = Field(
            None, description="Mouse button for click actions (left, right, middle, wheel)"
        ),
        # Text parameter
        text: str | None = Field(None, description="Text to type or response text"),
        # Scroll parameters
        scroll_x: int | None = Field(None, description="Horizontal scroll amount"),
        scroll_y: int | None = Field(None, description="Vertical scroll amount"),
        # Wait parameter
        ms: int | None = Field(None, description="Time to wait in milliseconds"),
        # Key press parameter
        keys: list[str] | None = Field(None, description="Keys to press"),
        # Drag parameter
        path: list[dict[str, int]] | None = Field(
            None, description="Path for drag actions as list of {x, y} dicts"
        ),
        # Custom action parameter
        action: str | None = Field(None, description="Custom action name"),
    ) -> list[ContentBlock]:
        """
        Handle OpenAI Computer Use API calls.

        This converts OpenAI's action format (based on OperatorAdapter) to HudComputerTool's format.

        Returns:
            List of MCP content blocks
        """
        logger.info("OpenAIComputerTool received type: %s", type)

        # Map button names
        button_map = {"wheel": "middle"}
        if button:
            button = button_map.get(button, button)

        # Process based on action type
        if type == "screenshot":
            screenshot_base64 = await self.executor.screenshot()
            if screenshot_base64:
                # Rescale screenshot if requested
                result = ContentResult(base64_image=screenshot_base64)
            else:
                result = ContentResult(error="Failed to take screenshot")

        elif type == "click":
            if x is not None and y is not None:
                # Cast button to proper literal type
                button_literal = cast(
                    "Literal['left', 'right', 'middle', 'back', 'forward']", button or "left"
                )
                scaled_x, scaled_y = self._scale_coordinates(x, y)
                logger.info("Scaled coordinates: %s, %s", scaled_x, scaled_y)
                result = await self.executor.click(x=scaled_x, y=scaled_y, button=button_literal)
            else:
                raise McpError(
                    ErrorData(code=INVALID_PARAMS, message="x and y coordinates required for click")
                )

        elif type == "double_click":
            if x is not None and y is not None:
                # Use pattern for double-click
                scaled_x, scaled_y = self._scale_coordinates(x, y)
                result = await self.executor.click(
                    x=scaled_x, y=scaled_y, button="left", pattern=[100]
                )
            else:
                raise McpError(
                    ErrorData(
                        code=INVALID_PARAMS, message="x and y coordinates required for double_click"
                    )
                )

        elif type == "scroll":
            if x is None or y is None:
                raise McpError(
                    ErrorData(
                        code=INVALID_PARAMS, message="x and y coordinates required for scroll"
                    )
                )

            # scroll_x and scroll_y default to 0 if not provided
            scaled_x, scaled_y = self._scale_coordinates(x, y)
            result = await self.executor.scroll(
                x=scaled_x, y=scaled_y, scroll_x=scroll_x or 0, scroll_y=scroll_y or 0
            )

        elif type == "type":
            if text is None:
                raise McpError(ErrorData(code=INVALID_PARAMS, message="text is required for type"))
            result = await self.executor.write(text=text, enter_after=False)

        elif type == "wait":
            wait_time = ms or 1000  # Default to 1 second
            result = await self.executor.wait(time=wait_time)

        elif type == "move":
            if x is not None and y is not None:
                scaled_x, scaled_y = self._scale_coordinates(x, y)
                result = await self.executor.move(x=scaled_x, y=scaled_y)
            else:
                raise McpError(
                    ErrorData(code=INVALID_PARAMS, message="x and y coordinates required for move")
                )

        elif type == "keypress":
            if keys is None or len(keys) == 0:
                raise McpError(
                    ErrorData(code=INVALID_PARAMS, message="keys is required for keypress")
                )

            # Map OpenAI keys to CLA standard
            cla_keys = []
            for key in keys:
                cla_key = self._map_openai_key_to_cla(key)
                cla_keys.append(cla_key)

            result = await self.executor.press(keys=cla_keys)

        elif type == "drag":
            if path is None or len(path) < 2:
                raise McpError(
                    ErrorData(
                        code=INVALID_PARAMS, message="path with at least 2 points required for drag"
                    )
                )

            # Convert path from list of dicts to list of tuples
            drag_path = []
            for point in path:
                if "x" in point and "y" in point:
                    drag_path.append((point["x"], point["y"]))
                else:
                    raise McpError(
                        ErrorData(
                            code=INVALID_PARAMS, message="Each point in path must have x and y"
                        )
                    )

            scaled_path = self._scale_path(drag_path)
            result = await self.executor.drag(path=scaled_path)

        elif type == "response":
            if text is None:
                raise McpError(
                    ErrorData(code=INVALID_PARAMS, message="text is required for response")
                )
            # Response returns content blocks directly
            return [TextContent(text=text, type="text")]

        elif type == "custom":
            # For custom actions, we just return an error since HudComputerTool doesn't support them
            raise McpError(
                ErrorData(code=INVALID_PARAMS, message=f"Custom action not supported: {action}")
            )

        else:
            raise McpError(ErrorData(code=INTERNAL_ERROR, message=f"Invalid action type: {type}"))

        # Rescale screenshot in result if present
        if isinstance(result, ContentResult) and result.base64_image and self.rescale_images:
            rescaled_image = await self._rescale_screenshot(result.base64_image)
            result.base64_image = rescaled_image

        # Handle screenshot for actions that need it
        screenshot_actions = {
            "screenshot",
            "click",
            "double_click",
            "scroll",
            "type",
            "move",
            "keypress",
            "drag",
            "wait",
        }

        if (
            type in screenshot_actions
            and type != "screenshot"
            and isinstance(result, ContentResult)
            and not result.base64_image
        ):
            screenshot_base64 = await self.executor.screenshot()
            if screenshot_base64:
                # Rescale screenshot if requested
                screenshot_base64 = await self._rescale_screenshot(screenshot_base64)
            result = ContentResult(
                output=result.output, error=result.error, base64_image=screenshot_base64
            )

        # Convert to content blocks
        return result.to_content_blocks()
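
For reference, a minimal usage sketch of the OpenAIComputerTool shown above (not part of the package diff): it assumes that constructing the tool without an explicit executor lets HudComputerTool auto-detect one on the host, and the action arguments follow the __call__ parameter descriptions in the file.

import asyncio

from hud.tools.computer.openai import OpenAIComputerTool


async def main() -> None:
    # No executor passed; platform_type="auto" is assumed to pick a working
    # executor (xdo or pyautogui) for this host.
    tool = OpenAIComputerTool()

    # Take a screenshot; the result is a list of MCP content blocks.
    blocks = await tool(type="screenshot")

    # Click at agent-space coordinates; the tool rescales them to the real screen.
    blocks = await tool(type="click", x=512, y=384, button="left")

    # Press a key combination using OpenAI-style key names, mapped via OPENAI_TO_CLA_KEYS.
    blocks = await tool(type="keypress", keys=["control", "a"])
    print(blocks)


if __name__ == "__main__":
    asyncio.run(main())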