hud-python 0.4.1__py3-none-any.whl → 0.4.3__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Potentially problematic release.
This version of hud-python might be problematic.
- hud/__init__.py +22 -22
- hud/agents/__init__.py +13 -15
- hud/agents/base.py +599 -599
- hud/agents/claude.py +373 -373
- hud/agents/langchain.py +261 -250
- hud/agents/misc/__init__.py +7 -7
- hud/agents/misc/response_agent.py +82 -80
- hud/agents/openai.py +352 -352
- hud/agents/openai_chat_generic.py +154 -154
- hud/agents/tests/__init__.py +1 -1
- hud/agents/tests/test_base.py +742 -742
- hud/agents/tests/test_claude.py +324 -324
- hud/agents/tests/test_client.py +363 -363
- hud/agents/tests/test_openai.py +237 -237
- hud/cli/__init__.py +617 -617
- hud/cli/__main__.py +8 -8
- hud/cli/analyze.py +371 -371
- hud/cli/analyze_metadata.py +230 -230
- hud/cli/build.py +498 -427
- hud/cli/clone.py +185 -185
- hud/cli/cursor.py +92 -92
- hud/cli/debug.py +392 -392
- hud/cli/docker_utils.py +83 -83
- hud/cli/init.py +280 -281
- hud/cli/interactive.py +353 -353
- hud/cli/mcp_server.py +764 -756
- hud/cli/pull.py +330 -336
- hud/cli/push.py +404 -370
- hud/cli/remote_runner.py +311 -311
- hud/cli/runner.py +160 -160
- hud/cli/tests/__init__.py +3 -3
- hud/cli/tests/test_analyze.py +284 -284
- hud/cli/tests/test_cli_init.py +265 -265
- hud/cli/tests/test_cli_main.py +27 -27
- hud/cli/tests/test_clone.py +142 -142
- hud/cli/tests/test_cursor.py +253 -253
- hud/cli/tests/test_debug.py +453 -453
- hud/cli/tests/test_mcp_server.py +139 -139
- hud/cli/tests/test_utils.py +388 -388
- hud/cli/utils.py +263 -263
- hud/clients/README.md +143 -143
- hud/clients/__init__.py +16 -16
- hud/clients/base.py +378 -379
- hud/clients/fastmcp.py +222 -222
- hud/clients/mcp_use.py +298 -278
- hud/clients/tests/__init__.py +1 -1
- hud/clients/tests/test_client_integration.py +111 -111
- hud/clients/tests/test_fastmcp.py +342 -342
- hud/clients/tests/test_protocol.py +188 -188
- hud/clients/utils/__init__.py +1 -1
- hud/clients/utils/retry_transport.py +160 -160
- hud/datasets.py +327 -322
- hud/misc/__init__.py +1 -1
- hud/misc/claude_plays_pokemon.py +292 -292
- hud/otel/__init__.py +35 -35
- hud/otel/collector.py +142 -142
- hud/otel/config.py +164 -164
- hud/otel/context.py +536 -536
- hud/otel/exporters.py +366 -366
- hud/otel/instrumentation.py +97 -97
- hud/otel/processors.py +118 -118
- hud/otel/tests/__init__.py +1 -1
- hud/otel/tests/test_processors.py +197 -197
- hud/server/__init__.py +5 -5
- hud/server/context.py +114 -114
- hud/server/helper/__init__.py +5 -5
- hud/server/low_level.py +132 -132
- hud/server/server.py +170 -166
- hud/server/tests/__init__.py +3 -3
- hud/settings.py +73 -73
- hud/shared/__init__.py +5 -5
- hud/shared/exceptions.py +180 -180
- hud/shared/requests.py +264 -264
- hud/shared/tests/test_exceptions.py +157 -157
- hud/shared/tests/test_requests.py +275 -275
- hud/telemetry/__init__.py +25 -25
- hud/telemetry/instrument.py +379 -379
- hud/telemetry/job.py +309 -309
- hud/telemetry/replay.py +74 -74
- hud/telemetry/trace.py +83 -83
- hud/tools/__init__.py +33 -33
- hud/tools/base.py +365 -365
- hud/tools/bash.py +161 -161
- hud/tools/computer/__init__.py +15 -15
- hud/tools/computer/anthropic.py +437 -437
- hud/tools/computer/hud.py +376 -376
- hud/tools/computer/openai.py +295 -295
- hud/tools/computer/settings.py +82 -82
- hud/tools/edit.py +314 -314
- hud/tools/executors/__init__.py +30 -30
- hud/tools/executors/base.py +539 -539
- hud/tools/executors/pyautogui.py +621 -621
- hud/tools/executors/tests/__init__.py +1 -1
- hud/tools/executors/tests/test_base_executor.py +338 -338
- hud/tools/executors/tests/test_pyautogui_executor.py +165 -165
- hud/tools/executors/xdo.py +511 -511
- hud/tools/playwright.py +412 -412
- hud/tools/tests/__init__.py +3 -3
- hud/tools/tests/test_base.py +282 -282
- hud/tools/tests/test_bash.py +158 -158
- hud/tools/tests/test_bash_extended.py +197 -197
- hud/tools/tests/test_computer.py +425 -425
- hud/tools/tests/test_computer_actions.py +34 -34
- hud/tools/tests/test_edit.py +259 -259
- hud/tools/tests/test_init.py +27 -27
- hud/tools/tests/test_playwright_tool.py +183 -183
- hud/tools/tests/test_tools.py +145 -145
- hud/tools/tests/test_utils.py +156 -156
- hud/tools/types.py +72 -72
- hud/tools/utils.py +50 -50
- hud/types.py +136 -136
- hud/utils/__init__.py +10 -10
- hud/utils/async_utils.py +65 -65
- hud/utils/design.py +236 -168
- hud/utils/mcp.py +55 -55
- hud/utils/progress.py +149 -149
- hud/utils/telemetry.py +66 -66
- hud/utils/tests/test_async_utils.py +173 -173
- hud/utils/tests/test_init.py +17 -17
- hud/utils/tests/test_progress.py +261 -261
- hud/utils/tests/test_telemetry.py +82 -82
- hud/utils/tests/test_version.py +8 -8
- hud/version.py +7 -7
- {hud_python-0.4.1.dist-info → hud_python-0.4.3.dist-info}/METADATA +10 -8
- hud_python-0.4.3.dist-info/RECORD +131 -0
- {hud_python-0.4.1.dist-info → hud_python-0.4.3.dist-info}/licenses/LICENSE +21 -21
- hud/agents/art.py +0 -101
- hud_python-0.4.1.dist-info/RECORD +0 -132
- {hud_python-0.4.1.dist-info → hud_python-0.4.3.dist-info}/WHEEL +0 -0
- {hud_python-0.4.1.dist-info → hud_python-0.4.3.dist-info}/entry_points.txt +0 -0
hud/tools/computer/hud.py
CHANGED
@@ -1,376 +1,376 @@
The diff viewer re-emits the entire file: every line is shown as removed and then re-added, and the removed (-) and added (+) sides are line-for-line identical, so the file content is reproduced once below.

# flake8: noqa: B008
from __future__ import annotations

import logging
import platform
from typing import Literal

from mcp import ErrorData, McpError
from mcp.types import INVALID_PARAMS, ContentBlock, TextContent
from pydantic import Field

from hud.tools.base import BaseTool
from hud.tools.executors.base import BaseExecutor
from hud.tools.executors.pyautogui import PyAutoGUIExecutor
from hud.tools.executors.xdo import XDOExecutor
from hud.tools.types import ContentResult, ToolError

from .settings import computer_settings

logger = logging.getLogger(__name__)


class HudComputerTool(BaseTool):
    """
    A tool that allows the agent to control the computer.
    """

    def __init__(
        self,
        # Define within environment based on platform
        executor: BaseExecutor | None = None,
        platform_type: Literal["auto", "xdo", "pyautogui"] = "auto",
        display_num: int | None = None,
        # Overrides for what dimensions the agent thinks it operates in
        # Define per subclass (e.g., Anthropic, OpenAI)
        width: int | None = computer_settings.HUD_COMPUTER_WIDTH,
        height: int | None = computer_settings.HUD_COMPUTER_HEIGHT,
        rescale_images: bool = computer_settings.HUD_RESCALE_IMAGES,
        # What the agent sees as the tool's name, title, and description
        name: str | None = None,
        title: str | None = None,
        description: str | None = None,
    ) -> None:
        """
        Initialize the HUD computer tool.

        Args:
            executor: Executor to use for the tool
            platform_type: Which executor to use if executor not provided:
                - "auto": Automatically detect based on platform
                - "xdo": Use XDOExecutor (Linux/X11 only)
                - "pyautogui": Use PyAutoGUIExecutor (cross-platform)
            display_num: X display number
            width: Target width for rescaling (None = use environment width)
            height: Target height for rescaling (None = use environment height)
            rescale_images: If True, rescale screenshots. If False, only rescale action coordinates
            name: Tool name for MCP registration (auto-generated from class name if not provided)
            title: Human-readable display name for the tool (auto-generated from class name)
            description: Tool description (auto-generated from docstring if not provided)
        """
        # Initialize base tool with executor as env
        super().__init__(
            env=executor,
            name=name or "computer",
            title=title or "Computer Control",
            description=description or "Control computer with mouse, keyboard, and screenshots",
        )

        # This is the width and height the agent thinks it operates in
        # By default, use subclass's width and height
        # If specifically set to None, use environment width and height
        self.width = width or computer_settings.DISPLAY_WIDTH
        self.height = height or computer_settings.DISPLAY_HEIGHT

        # This is the static width and height of the environment screen
        # And the width and height of the screenshots taken by the tool
        self.environment_width = computer_settings.DISPLAY_WIDTH
        self.environment_height = computer_settings.DISPLAY_HEIGHT

        # Some APIs rescale screenshots automatically to the agent's width and height, some don't
        # Defined per subclass (e.g., Anthropic, OpenAI)
        # In case you need your agent to receive pre-formatted screenshots, set env variable True
        self.rescale_images = rescale_images

        logger.debug(
            "Agent Screen Width: %s, Agent Screen Height: %s",
            self.width,
            self.height,
            "Environment Screen Width: %s, Environment Screen Height: %s",
            self.environment_width,
            self.environment_height,
        )

        # Calculate scaling factors from base screen size to target size
        self.scale_x = self.width / self.environment_width
        self.scale_y = self.height / self.environment_height

        # Check if we need to scale
        self.needs_scaling = min(self.scale_x, self.scale_y) != 1.0

        # Use environment settings for display number
        self.display_num = display_num or computer_settings.DISPLAY_NUM

        logger.debug("Display number: %s", self.display_num)

        # If no executor provided, create one based on platform
        if self.env is None:
            self._choose_executor(platform_type, self.display_num)

    @property
    def executor(self) -> BaseExecutor:
        """Get the executor (alias for context)."""
        return self.env

    @executor.setter
    def executor(self, value: BaseExecutor) -> None:
        """Set the executor (alias for context)."""
        self.env = value

    def _choose_executor(
        self,
        platform_type: Literal["auto", "xdo", "pyautogui"],
        display_num: int | None,
    ) -> None:
        """Choose executor based on platform_type."""
        # Choose executor based on platform_type
        if platform_type == "auto":
            # Auto-detect based on platform
            system = platform.system().lower()
            if system == "linux":
                # Try XDO first on Linux
                if XDOExecutor.is_available():
                    self.executor = XDOExecutor(display_num=display_num)
                    logger.info("Using XDOExecutor")
                elif PyAutoGUIExecutor.is_available():
                    self.executor = PyAutoGUIExecutor(display_num=display_num)
                    logger.info("Using PyAutoGUIExecutor")
                else:
                    self.executor = BaseExecutor(display_num=display_num)
                    logger.info("No display available, using BaseExecutor (simulation mode)")
            else:
                # Windows/macOS - try PyAutoGUI
                if PyAutoGUIExecutor.is_available():
                    self.executor = PyAutoGUIExecutor(display_num=display_num)
                    logger.info("Using PyAutoGUIExecutor")
                else:
                    self.executor = BaseExecutor(display_num=display_num)
                    logger.info("PyAutoGUI not available, using BaseExecutor (simulation mode)")

        elif platform_type == "xdo":
            if XDOExecutor.is_available():
                self.executor = XDOExecutor(display_num=display_num)
                logger.info("Using XDOExecutor")
            else:
                self.executor = BaseExecutor(display_num=display_num)
                logger.warning("XDO not available, using BaseExecutor (simulation mode)")

        elif platform_type == "pyautogui":
            if PyAutoGUIExecutor.is_available():
                self.executor = PyAutoGUIExecutor(display_num=display_num)
                logger.info("Using PyAutoGUIExecutor")
            else:
                self.executor = BaseExecutor(display_num=display_num)
                logger.warning("PyAutoGUI not available, using BaseExecutor (simulation mode)")
        else:
            raise ValueError(f"Invalid platform_type: {platform_type}")

    def _scale_coordinates(self, x: int | None, y: int | None) -> tuple[int | None, int | None]:
        """Scale coordinates from target space to screen space."""
        if x is not None and self.scale_x != 1.0:
            x = int(x / self.scale_x)
        if y is not None and self.scale_y != 1.0:
            y = int(y / self.scale_y)

        return x, y

    def _scale_path(self, path: list[tuple[int, int]]) -> list[tuple[int, int]]:
        """Scale a path from target space to screen space."""
        scaled_path = []
        for x, y in path:
            scaled_x, scaled_y = self._scale_coordinates(x, y)
            if scaled_x is not None and scaled_y is not None:
                scaled_path.append((scaled_x, scaled_y))

        return scaled_path

    async def _rescale_screenshot(self, screenshot_base64: str) -> str:
        """Rescale a screenshot if rescale_images is True."""
        if not self.rescale_images or not self.needs_scaling:
            return screenshot_base64

        try:
            import base64
            from io import BytesIO

            from PIL import Image  # type: ignore[import-not-found]

            # Decode base64 to image
            image_data = base64.b64decode(screenshot_base64)
            image = Image.open(BytesIO(image_data))

            logger.info(
                "Resizing screenshot from %s x %s to %s x %s",
                image.width,
                image.height,
                self.width,
                self.height,
            )

            # Resize to exact target dimensions
            resized = image.resize((self.width, self.height), Image.Resampling.LANCZOS)

            # Convert back to base64
            buffer = BytesIO()
            resized.save(buffer, format="PNG")
            resized_base64 = base64.b64encode(buffer.getvalue()).decode("utf-8")

            return resized_base64
        except Exception as e:
            logger.warning("Failed to rescale screenshot: %s", e)
            return screenshot_base64

    async def __call__(
        self,
        action: str = Field(..., description="The action name (click, type, move, etc.)"),
        # Click parameters
        x: int | None = Field(None, description="X coordinate for click/move/scroll actions"),
        y: int | None = Field(None, description="Y coordinate for click/move/scroll actions"),
        button: Literal["left", "right", "middle", "back", "forward"] | None = Field(
            None, description="Mouse button for click actions"
        ),
        pattern: list[int] | None = Field(
            None, description="Click pattern for multi-clicks (e.g., [100] for double-click)"
        ),
        # Key/Type parameters
        text: str | None = Field(None, description="Text for type/response actions"),
        keys: list[str] | None = Field(None, description="Keys for press/keydown/keyup actions"),
        enter_after: bool | None = Field(None, description="Whether to press Enter after typing"),
        # Scroll parameters
        scroll_x: int | None = Field(
            None, description="Horizontal scroll amount (positive = right)"
        ),
        scroll_y: int | None = Field(None, description="Vertical scroll amount (positive = down)"),
        # Move parameters
        offset_x: int | None = Field(None, description="X offset for relative move"),
        offset_y: int | None = Field(None, description="Y offset for relative move"),
        # Drag parameters
        path: list[tuple[int, int]] | None = Field(
            None, description="Path for drag actions as list of (x, y) coordinates"
        ),
        # Wait parameter
        time: int | None = Field(None, description="Time in milliseconds for wait action"),
        # General parameters
        hold_keys: list[str] | None = Field(None, description="Keys to hold during action"),
        # hold_key specific
        duration: float | None = Field(None, description="Duration in seconds for hold_key action"),
    ) -> list[ContentBlock]:
        """
        Execute a computer control action by name.

        Returns:
            List of MCP content blocks
        """
        logger.info("HudComputerTool executing action: %s", action)

        try:
            # Delegate to executor based on action
            if action == "click":
                # Scale coordinates from client space to screen space
                scaled_x, scaled_y = self._scale_coordinates(x, y)
                result = await self.executor.click(
                    x=scaled_x,
                    y=scaled_y,
                    button=button or "left",
                    pattern=pattern,
                    hold_keys=hold_keys,
                )

            elif action == "press":
                if keys is None:
                    raise ToolError("keys parameter is required for press")
                result = await self.executor.press(keys=keys)

            elif action == "keydown":
                if keys is None:
                    raise ToolError("keys parameter is required for keydown")
                result = await self.executor.keydown(keys=keys)

            elif action == "keyup":
                if keys is None:
                    raise ToolError("keys parameter is required for keyup")
                result = await self.executor.keyup(keys=keys)

            elif action == "type":
                if text is None:
                    raise ToolError("text parameter is required for type")
                result = await self.executor.write(text=text, enter_after=enter_after or False)

            elif action == "scroll":
                # Scale coordinates from client space to screen space
                scaled_x, scaled_y = self._scale_coordinates(x, y)
                result = await self.executor.scroll(
                    x=scaled_x,
                    y=scaled_y,
                    scroll_x=scroll_x,
                    scroll_y=scroll_y,
                    hold_keys=hold_keys,
                )

            elif action == "move":
                # Scale coordinates from client space to screen space
                scaled_x, scaled_y = self._scale_coordinates(x, y)
                scaled_offset_x, scaled_offset_y = self._scale_coordinates(offset_x, offset_y)
                result = await self.executor.move(
                    x=scaled_x, y=scaled_y, offset_x=scaled_offset_x, offset_y=scaled_offset_y
                )

            elif action == "wait":
                if time is None:
                    raise ToolError("time parameter is required for wait")
                result = await self.executor.wait(time=time)

            elif action == "drag":
                if path is None:
                    raise ToolError("path parameter is required for drag")
                # Scale path from client space to screen space
                scaled_path = self._scale_path(path)
                result = await self.executor.drag(
                    path=scaled_path, pattern=pattern, hold_keys=hold_keys
                )

            elif action == "response":
                if text is None:
                    raise ToolError("text parameter is required for response")
                return [TextContent(text=text, type="text")]

            elif action == "screenshot":
                screenshot = await self.executor.screenshot()
                if screenshot:
                    # Rescale screenshot if requested
                    screenshot = await self._rescale_screenshot(screenshot)
                    result = ContentResult(base64_image=screenshot)
                else:
                    result = ContentResult(error="Failed to take screenshot")

            elif action == "position":
                result = await self.executor.position()

            elif action == "hold_key":
                if text is None:
                    raise ToolError("text parameter is required for hold_key")
                if duration is None:
                    raise ToolError("duration parameter is required for hold_key")
                result = await self.executor.hold_key(key=text, duration=duration)

            elif action == "mouse_down":
                result = await self.executor.mouse_down(button=button or "left")

            elif action == "mouse_up":
                result = await self.executor.mouse_up(button=button or "left")

            else:
                raise McpError(ErrorData(code=INVALID_PARAMS, message=f"Unknown action: {action}"))

            # Rescale screenshot in result if present
            if isinstance(result, ContentResult) and result.base64_image and self.rescale_images:
                rescaled_image = await self._rescale_screenshot(result.base64_image)
                result.base64_image = rescaled_image

            # Convert result to content blocks
            return result.to_content_blocks()

        except TypeError as e:
            raise McpError(
                ErrorData(code=INVALID_PARAMS, message=f"Invalid parameters for {action}: {e!s}")
            ) from e
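HudComputerTool keeps two coordinate spaces: the agent-facing width/height and the environment's DISPLAY_WIDTH/DISPLAY_HEIGHT, with scale_x and scale_y bridging them; _scale_coordinates divides agent coordinates by those factors before they reach the executor. A minimal standalone sketch of that arithmetic (hypothetical dimensions, not values from the package) illustrates the mapping:

# Illustrative sketch only; mirrors HudComputerTool._scale_coordinates with made-up dimensions.
agent_width, agent_height = 1280, 720    # what the agent believes the screen size is
env_width, env_height = 1920, 1080       # actual environment / screenshot size

scale_x = agent_width / env_width        # ~0.667
scale_y = agent_height / env_height      # ~0.667

def scale_coordinates(x: int | None, y: int | None) -> tuple[int | None, int | None]:
    """Map agent-space coordinates to environment screen space."""
    if x is not None and scale_x != 1.0:
        x = int(x / scale_x)
    if y is not None and scale_y != 1.0:
        y = int(y / scale_y)
    return x, y

print(scale_coordinates(640, 360))       # -> (960, 540): the agent-space center lands at the screen center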
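For orientation only, a hedged sketch of how the class in the listing might be driven directly rather than through an MCP server. It assumes direct keyword invocation of __call__ and that a PyAutoGUI (or simulated) executor is acceptable; the argument names mirror the signature above, but this is an illustration, not documented package usage.

# Hypothetical usage sketch, not part of the diffed package.
import asyncio

from hud.tools.computer.hud import HudComputerTool


async def main() -> None:
    # Force the cross-platform executor; "auto" would prefer XDO on Linux/X11.
    tool = HudComputerTool(platform_type="pyautogui")

    # The "click" branch only reads x, y, button, pattern and hold_keys, so pass
    # them explicitly instead of relying on the Field(...) defaults.
    blocks = await tool(
        action="click", x=100, y=200, button="left", pattern=None, hold_keys=None
    )
    print(blocks)

    # "screenshot" takes no extra parameters and returns image content blocks,
    # rescaled to the agent dimensions when rescale_images is enabled.
    blocks = await tool(action="screenshot")
    print(blocks)


if __name__ == "__main__":
    asyncio.run(main())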