hud-python 0.3.4__py3-none-any.whl → 0.4.0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release.


This version of hud-python might be problematic (the original page links to further details).

Files changed (192)
  1. hud/__init__.py +22 -89
  2. hud/agents/__init__.py +17 -0
  3. hud/agents/art.py +101 -0
  4. hud/agents/base.py +599 -0
  5. hud/{mcp → agents}/claude.py +373 -321
  6. hud/{mcp → agents}/langchain.py +250 -250
  7. hud/agents/misc/__init__.py +7 -0
  8. hud/{agent → agents}/misc/response_agent.py +80 -80
  9. hud/{mcp → agents}/openai.py +352 -334
  10. hud/agents/openai_chat_generic.py +154 -0
  11. hud/{mcp → agents}/tests/__init__.py +1 -1
  12. hud/agents/tests/test_base.py +742 -0
  13. hud/agents/tests/test_claude.py +324 -0
  14. hud/{mcp → agents}/tests/test_client.py +363 -324
  15. hud/{mcp → agents}/tests/test_openai.py +237 -238
  16. hud/cli/__init__.py +617 -0
  17. hud/cli/__main__.py +8 -0
  18. hud/cli/analyze.py +371 -0
  19. hud/cli/analyze_metadata.py +230 -0
  20. hud/cli/build.py +427 -0
  21. hud/cli/clone.py +185 -0
  22. hud/cli/cursor.py +92 -0
  23. hud/cli/debug.py +392 -0
  24. hud/cli/docker_utils.py +83 -0
  25. hud/cli/init.py +281 -0
  26. hud/cli/interactive.py +353 -0
  27. hud/cli/mcp_server.py +756 -0
  28. hud/cli/pull.py +336 -0
  29. hud/cli/push.py +379 -0
  30. hud/cli/remote_runner.py +311 -0
  31. hud/cli/runner.py +160 -0
  32. hud/cli/tests/__init__.py +3 -0
  33. hud/cli/tests/test_analyze.py +284 -0
  34. hud/cli/tests/test_cli_init.py +265 -0
  35. hud/cli/tests/test_cli_main.py +27 -0
  36. hud/cli/tests/test_clone.py +142 -0
  37. hud/cli/tests/test_cursor.py +253 -0
  38. hud/cli/tests/test_debug.py +453 -0
  39. hud/cli/tests/test_mcp_server.py +139 -0
  40. hud/cli/tests/test_utils.py +388 -0
  41. hud/cli/utils.py +263 -0
  42. hud/clients/README.md +143 -0
  43. hud/clients/__init__.py +16 -0
  44. hud/clients/base.py +354 -0
  45. hud/clients/fastmcp.py +202 -0
  46. hud/clients/mcp_use.py +278 -0
  47. hud/clients/tests/__init__.py +1 -0
  48. hud/clients/tests/test_client_integration.py +111 -0
  49. hud/clients/tests/test_fastmcp.py +342 -0
  50. hud/clients/tests/test_protocol.py +188 -0
  51. hud/clients/utils/__init__.py +1 -0
  52. hud/clients/utils/retry_transport.py +160 -0
  53. hud/datasets.py +322 -192
  54. hud/misc/__init__.py +1 -0
  55. hud/{agent → misc}/claude_plays_pokemon.py +292 -283
  56. hud/otel/__init__.py +35 -0
  57. hud/otel/collector.py +142 -0
  58. hud/otel/config.py +164 -0
  59. hud/otel/context.py +536 -0
  60. hud/otel/exporters.py +366 -0
  61. hud/otel/instrumentation.py +97 -0
  62. hud/otel/processors.py +118 -0
  63. hud/otel/tests/__init__.py +1 -0
  64. hud/otel/tests/test_processors.py +197 -0
  65. hud/server/__init__.py +5 -5
  66. hud/server/context.py +114 -0
  67. hud/server/helper/__init__.py +5 -0
  68. hud/server/low_level.py +132 -0
  69. hud/server/server.py +166 -0
  70. hud/server/tests/__init__.py +3 -0
  71. hud/settings.py +73 -79
  72. hud/shared/__init__.py +5 -0
  73. hud/{exceptions.py → shared/exceptions.py} +180 -180
  74. hud/{server → shared}/requests.py +264 -264
  75. hud/shared/tests/test_exceptions.py +157 -0
  76. hud/{server → shared}/tests/test_requests.py +275 -275
  77. hud/telemetry/__init__.py +25 -30
  78. hud/telemetry/instrument.py +379 -0
  79. hud/telemetry/job.py +309 -141
  80. hud/telemetry/replay.py +74 -0
  81. hud/telemetry/trace.py +83 -0
  82. hud/tools/__init__.py +33 -34
  83. hud/tools/base.py +365 -65
  84. hud/tools/bash.py +161 -137
  85. hud/tools/computer/__init__.py +15 -13
  86. hud/tools/computer/anthropic.py +437 -414
  87. hud/tools/computer/hud.py +376 -328
  88. hud/tools/computer/openai.py +295 -286
  89. hud/tools/computer/settings.py +82 -0
  90. hud/tools/edit.py +314 -290
  91. hud/tools/executors/__init__.py +30 -30
  92. hud/tools/executors/base.py +539 -532
  93. hud/tools/executors/pyautogui.py +621 -619
  94. hud/tools/executors/tests/__init__.py +1 -1
  95. hud/tools/executors/tests/test_base_executor.py +338 -338
  96. hud/tools/executors/tests/test_pyautogui_executor.py +165 -165
  97. hud/tools/executors/xdo.py +511 -503
  98. hud/tools/{playwright_tool.py → playwright.py} +412 -379
  99. hud/tools/tests/__init__.py +3 -3
  100. hud/tools/tests/test_base.py +282 -0
  101. hud/tools/tests/test_bash.py +158 -152
  102. hud/tools/tests/test_bash_extended.py +197 -0
  103. hud/tools/tests/test_computer.py +425 -52
  104. hud/tools/tests/test_computer_actions.py +34 -34
  105. hud/tools/tests/test_edit.py +259 -240
  106. hud/tools/tests/test_init.py +27 -27
  107. hud/tools/tests/test_playwright_tool.py +183 -183
  108. hud/tools/tests/test_tools.py +145 -157
  109. hud/tools/tests/test_utils.py +156 -156
  110. hud/tools/types.py +72 -0
  111. hud/tools/utils.py +50 -50
  112. hud/types.py +136 -89
  113. hud/utils/__init__.py +10 -16
  114. hud/utils/async_utils.py +65 -0
  115. hud/utils/design.py +168 -0
  116. hud/utils/mcp.py +55 -0
  117. hud/utils/progress.py +149 -149
  118. hud/utils/telemetry.py +66 -66
  119. hud/utils/tests/test_async_utils.py +173 -0
  120. hud/utils/tests/test_init.py +17 -21
  121. hud/utils/tests/test_progress.py +261 -225
  122. hud/utils/tests/test_telemetry.py +82 -37
  123. hud/utils/tests/test_version.py +8 -8
  124. hud/version.py +7 -7
  125. hud_python-0.4.0.dist-info/METADATA +474 -0
  126. hud_python-0.4.0.dist-info/RECORD +132 -0
  127. hud_python-0.4.0.dist-info/entry_points.txt +3 -0
  128. {hud_python-0.3.4.dist-info → hud_python-0.4.0.dist-info}/licenses/LICENSE +21 -21
  129. hud/adapters/__init__.py +0 -8
  130. hud/adapters/claude/__init__.py +0 -5
  131. hud/adapters/claude/adapter.py +0 -180
  132. hud/adapters/claude/tests/__init__.py +0 -1
  133. hud/adapters/claude/tests/test_adapter.py +0 -519
  134. hud/adapters/common/__init__.py +0 -6
  135. hud/adapters/common/adapter.py +0 -178
  136. hud/adapters/common/tests/test_adapter.py +0 -289
  137. hud/adapters/common/types.py +0 -446
  138. hud/adapters/operator/__init__.py +0 -5
  139. hud/adapters/operator/adapter.py +0 -108
  140. hud/adapters/operator/tests/__init__.py +0 -1
  141. hud/adapters/operator/tests/test_adapter.py +0 -370
  142. hud/agent/__init__.py +0 -19
  143. hud/agent/base.py +0 -126
  144. hud/agent/claude.py +0 -271
  145. hud/agent/langchain.py +0 -215
  146. hud/agent/misc/__init__.py +0 -3
  147. hud/agent/operator.py +0 -268
  148. hud/agent/tests/__init__.py +0 -1
  149. hud/agent/tests/test_base.py +0 -202
  150. hud/env/__init__.py +0 -11
  151. hud/env/client.py +0 -35
  152. hud/env/docker_client.py +0 -349
  153. hud/env/environment.py +0 -446
  154. hud/env/local_docker_client.py +0 -358
  155. hud/env/remote_client.py +0 -212
  156. hud/env/remote_docker_client.py +0 -292
  157. hud/gym.py +0 -130
  158. hud/job.py +0 -773
  159. hud/mcp/__init__.py +0 -17
  160. hud/mcp/base.py +0 -631
  161. hud/mcp/client.py +0 -312
  162. hud/mcp/tests/test_base.py +0 -512
  163. hud/mcp/tests/test_claude.py +0 -294
  164. hud/task.py +0 -149
  165. hud/taskset.py +0 -237
  166. hud/telemetry/_trace.py +0 -347
  167. hud/telemetry/context.py +0 -230
  168. hud/telemetry/exporter.py +0 -575
  169. hud/telemetry/instrumentation/__init__.py +0 -3
  170. hud/telemetry/instrumentation/mcp.py +0 -259
  171. hud/telemetry/instrumentation/registry.py +0 -59
  172. hud/telemetry/mcp_models.py +0 -270
  173. hud/telemetry/tests/__init__.py +0 -1
  174. hud/telemetry/tests/test_context.py +0 -210
  175. hud/telemetry/tests/test_trace.py +0 -312
  176. hud/tools/helper/README.md +0 -56
  177. hud/tools/helper/__init__.py +0 -9
  178. hud/tools/helper/mcp_server.py +0 -78
  179. hud/tools/helper/server_initialization.py +0 -115
  180. hud/tools/helper/utils.py +0 -58
  181. hud/trajectory.py +0 -94
  182. hud/utils/agent.py +0 -37
  183. hud/utils/common.py +0 -256
  184. hud/utils/config.py +0 -120
  185. hud/utils/deprecation.py +0 -115
  186. hud/utils/misc.py +0 -53
  187. hud/utils/tests/test_common.py +0 -277
  188. hud/utils/tests/test_config.py +0 -129
  189. hud_python-0.3.4.dist-info/METADATA +0 -284
  190. hud_python-0.3.4.dist-info/RECORD +0 -120
  191. /hud/{adapters/common → shared}/tests/__init__.py +0 -0
  192. {hud_python-0.3.4.dist-info → hud_python-0.4.0.dist-info}/WHEEL +0 -0
hud/tools/computer/hud.py CHANGED
@@ -1,328 +1,376 @@
1
- # flake8: noqa: B008
2
- from __future__ import annotations
3
-
4
- import logging
5
- import platform
6
- from typing import Literal
7
-
8
- from mcp import ErrorData, McpError
9
- from mcp.types import INVALID_PARAMS, ImageContent, TextContent
10
- from pydantic import Field
11
-
12
- from hud.tools.base import ToolError, ToolResult, tool_result_to_content_blocks
13
- from hud.tools.executors.base import BaseExecutor
14
- from hud.tools.executors.pyautogui import PyAutoGUIExecutor
15
- from hud.tools.executors.xdo import XDOExecutor
16
-
17
- logger = logging.getLogger(__name__)
18
-
19
- BASE_SCREEN_WIDTH = 1920
20
- BASE_SCREEN_HEIGHT = 1080
21
-
22
-
23
- class HudComputerTool:
24
- """
25
- A tool that allows the agent to control the computer.
26
- """
27
-
28
- def __init__(
29
- self,
30
- width: int | None = None,
31
- height: int | None = None,
32
- display_num: int | None = None,
33
- platform_type: Literal["auto", "xdo", "pyautogui"] = "auto",
34
- custom_executor: BaseExecutor | None = None,
35
- rescale_images: bool = False,
36
- ) -> None:
37
- """
38
- Initialize the HUD computer tool.
39
-
40
- Args:
41
- width: Target width for rescaling (None = use actual screen width)
42
- height: Target height for rescaling (None = use actual screen height)
43
- display_num: X display number
44
- platform_type: Which executor to use:
45
- - "auto": Automatically detect based on platform
46
- - "xdo": Use XDOExecutor (Linux/X11 only)
47
- - "pyautogui": Use PyAutoGUIExecutor (cross-platform)
48
- custom_executor: If None, executor class will be determined based on platform_type.
49
- rescale_images: If True, rescale screenshots. If False, only rescale action coordinates
50
- """
51
- # Use provided dimensions or defaults
52
- self.width = width or BASE_SCREEN_WIDTH
53
- self.height = height or BASE_SCREEN_HEIGHT
54
- self.rescale_images = rescale_images
55
-
56
- logger.info("Width: %s, Height: %s", self.width, self.height)
57
- logger.info(
58
- "Base Screen Width: %s, Base Screen Height: %s",
59
- BASE_SCREEN_WIDTH,
60
- BASE_SCREEN_HEIGHT,
61
- )
62
-
63
- # Calculate scaling factors from base screen size to target size
64
- self.scale_x = self.width / BASE_SCREEN_WIDTH
65
- self.scale_y = self.height / BASE_SCREEN_HEIGHT
66
-
67
- logger.info("Scale X: %s, Scale Y: %s", self.scale_x, self.scale_y)
68
- self.scale = min(self.scale_x, self.scale_y)
69
-
70
- logger.info("Scaling factor: %s", self.scale)
71
-
72
- # Check if we need to scale
73
- self.needs_scaling = self.scale != 1.0
74
-
75
- if custom_executor is None:
76
- self._choose_executor(platform_type, display_num)
77
- else:
78
- self.executor = custom_executor
79
-
80
- def _choose_executor(
81
- self,
82
- platform_type: Literal["auto", "xdo", "pyautogui"],
83
- display_num: int | None,
84
- ) -> None:
85
- """Choose executor based on platform_type."""
86
- # Choose executor based on platform_type
87
- if platform_type == "auto":
88
- # Auto-detect based on platform
89
- system = platform.system().lower()
90
- if system == "linux":
91
- # Try XDO first on Linux
92
- if XDOExecutor.is_available():
93
- self.executor = XDOExecutor(display_num=display_num)
94
- logger.info("Using XDOExecutor")
95
- elif PyAutoGUIExecutor.is_available():
96
- self.executor = PyAutoGUIExecutor(display_num=display_num)
97
- logger.info("Using PyAutoGUIExecutor")
98
- else:
99
- self.executor = BaseExecutor(display_num=display_num)
100
- logger.info("No display available, using BaseExecutor (simulation mode)")
101
- else:
102
- # Windows/macOS - try PyAutoGUI
103
- if PyAutoGUIExecutor.is_available():
104
- self.executor = PyAutoGUIExecutor(display_num=display_num)
105
- logger.info("Using PyAutoGUIExecutor")
106
- else:
107
- self.executor = BaseExecutor(display_num=display_num)
108
- logger.info("PyAutoGUI not available, using BaseExecutor (simulation mode)")
109
-
110
- elif platform_type == "xdo":
111
- if XDOExecutor.is_available():
112
- self.executor = XDOExecutor(display_num=display_num)
113
- logger.info("Using XDOExecutor")
114
- else:
115
- self.executor = BaseExecutor(display_num=display_num)
116
- logger.warning("XDO not available, using BaseExecutor (simulation mode)")
117
-
118
- elif platform_type == "pyautogui":
119
- if PyAutoGUIExecutor.is_available():
120
- self.executor = PyAutoGUIExecutor(display_num=display_num)
121
- logger.info("Using PyAutoGUIExecutor")
122
- else:
123
- self.executor = BaseExecutor(display_num=display_num)
124
- logger.warning("PyAutoGUI not available, using BaseExecutor (simulation mode)")
125
- else:
126
- raise ValueError(f"Invalid platform_type: {platform_type}")
127
-
128
- def _scale_coordinates(self, x: int | None, y: int | None) -> tuple[int | None, int | None]:
129
- """Scale coordinates from target space to screen space."""
130
- if x is not None:
131
- x = int(x / self.scale_x)
132
- if y is not None:
133
- y = int(y / self.scale_y)
134
-
135
- return x, y
136
-
137
- def _scale_path(self, path: list[tuple[int, int]]) -> list[tuple[int, int]]:
138
- """Scale a path from target space to screen space."""
139
- scaled_path = []
140
- for x, y in path:
141
- scaled_x, scaled_y = self._scale_coordinates(x, y)
142
- if scaled_x is not None and scaled_y is not None:
143
- scaled_path.append((scaled_x, scaled_y))
144
-
145
- return scaled_path
146
-
147
- async def _rescale_screenshot(self, screenshot_base64: str) -> str:
148
- """Rescale a screenshot if rescale_images is True."""
149
- if not self.rescale_images or not self.needs_scaling:
150
- return screenshot_base64
151
-
152
- try:
153
- import base64
154
- from io import BytesIO
155
-
156
- from PIL import Image
157
-
158
- # Decode base64 to image
159
- image_data = base64.b64decode(screenshot_base64)
160
- image = Image.open(BytesIO(image_data))
161
-
162
- # Resize to exact target dimensions
163
- resized = image.resize((self.width, self.height), Image.Resampling.LANCZOS)
164
-
165
- # Convert back to base64
166
- buffer = BytesIO()
167
- resized.save(buffer, format="PNG")
168
- resized_base64 = base64.b64encode(buffer.getvalue()).decode("utf-8")
169
-
170
- return resized_base64
171
- except Exception as e:
172
- logger.warning("Failed to rescale screenshot: %s", e)
173
- return screenshot_base64
174
-
175
- async def __call__(
176
- self,
177
- action: str = Field(..., description="The action name (click, type, move, etc.)"),
178
- # Click parameters
179
- x: int | None = Field(None, description="X coordinate for click/move/scroll actions"),
180
- y: int | None = Field(None, description="Y coordinate for click/move/scroll actions"),
181
- button: Literal["left", "right", "middle", "back", "forward"] | None = Field(
182
- None, description="Mouse button for click actions"
183
- ),
184
- pattern: list[int] | None = Field(
185
- None, description="Click pattern for multi-clicks (e.g., [100] for double-click)"
186
- ),
187
- # Key/Type parameters
188
- text: str | None = Field(None, description="Text for type/response actions"),
189
- keys: list[str] | None = Field(None, description="Keys for press/keydown/keyup actions"),
190
- enter_after: bool | None = Field(None, description="Whether to press Enter after typing"),
191
- # Scroll parameters
192
- scroll_x: int | None = Field(
193
- None, description="Horizontal scroll amount (positive = right)"
194
- ),
195
- scroll_y: int | None = Field(None, description="Vertical scroll amount (positive = down)"),
196
- # Move parameters
197
- offset_x: int | None = Field(None, description="X offset for relative move"),
198
- offset_y: int | None = Field(None, description="Y offset for relative move"),
199
- # Drag parameters
200
- path: list[tuple[int, int]] | None = Field(
201
- None, description="Path for drag actions as list of (x, y) coordinates"
202
- ),
203
- # Wait parameter
204
- time: int | None = Field(None, description="Time in milliseconds for wait action"),
205
- # General parameters
206
- hold_keys: list[str] | None = Field(None, description="Keys to hold during action"),
207
- # hold_key specific
208
- duration: float | None = Field(None, description="Duration in seconds for hold_key action"),
209
- ) -> list[ImageContent | TextContent]:
210
- """
211
- Execute a computer control action by name.
212
-
213
- Returns:
214
- List of MCP content blocks
215
- """
216
- logger.info("HudComputerTool executing action: %s", action)
217
-
218
- try:
219
- # Delegate to executor based on action
220
- if action == "click":
221
- # Scale coordinates from client space to screen space
222
- scaled_x, scaled_y = self._scale_coordinates(x, y)
223
- result = await self.executor.click(
224
- x=scaled_x,
225
- y=scaled_y,
226
- button=button or "left",
227
- pattern=pattern,
228
- hold_keys=hold_keys,
229
- )
230
-
231
- elif action == "press":
232
- if keys is None:
233
- raise ToolError("keys parameter is required for press")
234
- result = await self.executor.press(keys=keys)
235
-
236
- elif action == "keydown":
237
- if keys is None:
238
- raise ToolError("keys parameter is required for keydown")
239
- result = await self.executor.keydown(keys=keys)
240
-
241
- elif action == "keyup":
242
- if keys is None:
243
- raise ToolError("keys parameter is required for keyup")
244
- result = await self.executor.keyup(keys=keys)
245
-
246
- elif action == "type":
247
- if text is None:
248
- raise ToolError("text parameter is required for type")
249
- result = await self.executor.type(text=text, enter_after=enter_after or False)
250
-
251
- elif action == "scroll":
252
- # Scale coordinates from client space to screen space
253
- scaled_x, scaled_y = self._scale_coordinates(x, y)
254
- result = await self.executor.scroll(
255
- x=scaled_x,
256
- y=scaled_y,
257
- scroll_x=scroll_x,
258
- scroll_y=scroll_y,
259
- hold_keys=hold_keys,
260
- )
261
-
262
- elif action == "move":
263
- # Scale coordinates from client space to screen space
264
- scaled_x, scaled_y = self._scale_coordinates(x, y)
265
- scaled_offset_x, scaled_offset_y = self._scale_coordinates(offset_x, offset_y)
266
- result = await self.executor.move(
267
- x=scaled_x, y=scaled_y, offset_x=scaled_offset_x, offset_y=scaled_offset_y
268
- )
269
-
270
- elif action == "wait":
271
- if time is None:
272
- raise ToolError("time parameter is required for wait")
273
- result = await self.executor.wait(time=time)
274
-
275
- elif action == "drag":
276
- if path is None:
277
- raise ToolError("path parameter is required for drag")
278
- # Scale path from client space to screen space
279
- scaled_path = self._scale_path(path)
280
- result = await self.executor.drag(
281
- path=scaled_path, pattern=pattern, hold_keys=hold_keys
282
- )
283
-
284
- elif action == "response":
285
- if text is None:
286
- raise ToolError("text parameter is required for response")
287
- return [TextContent(text=text, type="text")]
288
-
289
- elif action == "screenshot":
290
- screenshot = await self.executor.screenshot()
291
- if screenshot:
292
- # Rescale screenshot if requested
293
- screenshot = await self._rescale_screenshot(screenshot)
294
- result = ToolResult(base64_image=screenshot)
295
- else:
296
- result = ToolResult(error="Failed to take screenshot")
297
-
298
- elif action == "position":
299
- result = await self.executor.position()
300
-
301
- elif action == "hold_key":
302
- if text is None:
303
- raise ToolError("text parameter is required for hold_key")
304
- if duration is None:
305
- raise ToolError("duration parameter is required for hold_key")
306
- result = await self.executor.hold_key(key=text, duration=duration)
307
-
308
- elif action == "mouse_down":
309
- result = await self.executor.mouse_down(button=button or "left")
310
-
311
- elif action == "mouse_up":
312
- result = await self.executor.mouse_up(button=button or "left")
313
-
314
- else:
315
- raise McpError(ErrorData(code=INVALID_PARAMS, message=f"Unknown action: {action}"))
316
-
317
- # Rescale screenshot in result if present
318
- if isinstance(result, ToolResult) and result.base64_image and self.rescale_images:
319
- rescaled_image = await self._rescale_screenshot(result.base64_image)
320
- result = result.replace(base64_image=rescaled_image)
321
-
322
- # Convert result to content blocks
323
- return tool_result_to_content_blocks(result)
324
-
325
- except TypeError as e:
326
- raise McpError(
327
- ErrorData(code=INVALID_PARAMS, message=f"Invalid parameters for {action}: {e!s}")
328
- ) from e
1
+ # flake8: noqa: B008
2
+ from __future__ import annotations
3
+
4
+ import logging
5
+ import platform
6
+ from typing import Literal
7
+
8
+ from mcp import ErrorData, McpError
9
+ from mcp.types import INVALID_PARAMS, ContentBlock, TextContent
10
+ from pydantic import Field
11
+
12
+ from hud.tools.base import BaseTool
13
+ from hud.tools.executors.base import BaseExecutor
14
+ from hud.tools.executors.pyautogui import PyAutoGUIExecutor
15
+ from hud.tools.executors.xdo import XDOExecutor
16
+ from hud.tools.types import ContentResult, ToolError
17
+
18
+ from .settings import computer_settings
19
+
20
+ logger = logging.getLogger(__name__)
21
+
22
+
23
+ class HudComputerTool(BaseTool):
24
+ """
25
+ A tool that allows the agent to control the computer.
26
+ """
27
+
28
+ def __init__(
29
+ self,
30
+ # Define within environment based on platform
31
+ executor: BaseExecutor | None = None,
32
+ platform_type: Literal["auto", "xdo", "pyautogui"] = "auto",
33
+ display_num: int | None = None,
34
+ # Overrides for what dimensions the agent thinks it operates in
35
+ # Define per subclass (e.g., Anthropic, OpenAI)
36
+ width: int | None = computer_settings.HUD_COMPUTER_WIDTH,
37
+ height: int | None = computer_settings.HUD_COMPUTER_HEIGHT,
38
+ rescale_images: bool = computer_settings.HUD_RESCALE_IMAGES,
39
+ # What the agent sees as the tool's name, title, and description
40
+ name: str | None = None,
41
+ title: str | None = None,
42
+ description: str | None = None,
43
+ ) -> None:
44
+ """
45
+ Initialize the HUD computer tool.
46
+
47
+ Args:
48
+ executor: Executor to use for the tool
49
+ platform_type: Which executor to use if executor not provided:
50
+ - "auto": Automatically detect based on platform
51
+ - "xdo": Use XDOExecutor (Linux/X11 only)
52
+ - "pyautogui": Use PyAutoGUIExecutor (cross-platform)
53
+ display_num: X display number
54
+ width: Target width for rescaling (None = use environment width)
55
+ height: Target height for rescaling (None = use environment height)
56
+ rescale_images: If True, rescale screenshots. If False, only rescale action coordinates
57
+ name: Tool name for MCP registration (auto-generated from class name if not provided)
58
+ title: Human-readable display name for the tool (auto-generated from class name)
59
+ description: Tool description (auto-generated from docstring if not provided)
60
+ """
61
+ # Initialize base tool with executor as env
62
+ super().__init__(
63
+ env=executor,
64
+ name=name or "computer",
65
+ title=title or "Computer Control",
66
+ description=description or "Control computer with mouse, keyboard, and screenshots",
67
+ )
68
+
69
+ # This is the width and height the agent thinks it operates in
70
+ # By default, use subclass's width and height
71
+ # If specifically set to None, use environment width and height
72
+ self.width = width or computer_settings.DISPLAY_WIDTH
73
+ self.height = height or computer_settings.DISPLAY_HEIGHT
74
+
75
+ # This is the static width and height of the environment screen
76
+ # And the width and height of the screenshots taken by the tool
77
+ self.environment_width = computer_settings.DISPLAY_WIDTH
78
+ self.environment_height = computer_settings.DISPLAY_HEIGHT
79
+
80
+ # Some APIs rescale screenshots automatically to the agent's width and height, some don't
81
+ # Defined per subclass (e.g., Anthropic, OpenAI)
82
+ # In case you need your agent to receive pre-formatted screenshots, set env variable True
83
+ self.rescale_images = rescale_images
84
+
85
+ logger.debug(
86
+ "Agent Screen Width: %s, Agent Screen Height: %s",
87
+ self.width,
88
+ self.height,
89
+ "Environment Screen Width: %s, Environment Screen Height: %s",
90
+ self.environment_width,
91
+ self.environment_height,
92
+ )
93
+
94
+ # Calculate scaling factors from base screen size to target size
95
+ self.scale_x = self.width / self.environment_width
96
+ self.scale_y = self.height / self.environment_height
97
+
98
+ # Check if we need to scale
99
+ self.needs_scaling = min(self.scale_x, self.scale_y) != 1.0
100
+
101
+ # Use environment settings for display number
102
+ self.display_num = display_num or computer_settings.DISPLAY_NUM
103
+
104
+ logger.debug("Display number: %s", self.display_num)
105
+
106
+ # If no executor provided, create one based on platform
107
+ if self.env is None:
108
+ self._choose_executor(platform_type, self.display_num)
109
+
110
+ @property
111
+ def executor(self) -> BaseExecutor:
112
+ """Get the executor (alias for context)."""
113
+ return self.env
114
+
115
+ @executor.setter
116
+ def executor(self, value: BaseExecutor) -> None:
117
+ """Set the executor (alias for context)."""
118
+ self.env = value
119
+
120
+ def _choose_executor(
121
+ self,
122
+ platform_type: Literal["auto", "xdo", "pyautogui"],
123
+ display_num: int | None,
124
+ ) -> None:
125
+ """Choose executor based on platform_type."""
126
+ # Choose executor based on platform_type
127
+ if platform_type == "auto":
128
+ # Auto-detect based on platform
129
+ system = platform.system().lower()
130
+ if system == "linux":
131
+ # Try XDO first on Linux
132
+ if XDOExecutor.is_available():
133
+ self.executor = XDOExecutor(display_num=display_num)
134
+ logger.info("Using XDOExecutor")
135
+ elif PyAutoGUIExecutor.is_available():
136
+ self.executor = PyAutoGUIExecutor(display_num=display_num)
137
+ logger.info("Using PyAutoGUIExecutor")
138
+ else:
139
+ self.executor = BaseExecutor(display_num=display_num)
140
+ logger.info("No display available, using BaseExecutor (simulation mode)")
141
+ else:
142
+ # Windows/macOS - try PyAutoGUI
143
+ if PyAutoGUIExecutor.is_available():
144
+ self.executor = PyAutoGUIExecutor(display_num=display_num)
145
+ logger.info("Using PyAutoGUIExecutor")
146
+ else:
147
+ self.executor = BaseExecutor(display_num=display_num)
148
+ logger.info("PyAutoGUI not available, using BaseExecutor (simulation mode)")
149
+
150
+ elif platform_type == "xdo":
151
+ if XDOExecutor.is_available():
152
+ self.executor = XDOExecutor(display_num=display_num)
153
+ logger.info("Using XDOExecutor")
154
+ else:
155
+ self.executor = BaseExecutor(display_num=display_num)
156
+ logger.warning("XDO not available, using BaseExecutor (simulation mode)")
157
+
158
+ elif platform_type == "pyautogui":
159
+ if PyAutoGUIExecutor.is_available():
160
+ self.executor = PyAutoGUIExecutor(display_num=display_num)
161
+ logger.info("Using PyAutoGUIExecutor")
162
+ else:
163
+ self.executor = BaseExecutor(display_num=display_num)
164
+ logger.warning("PyAutoGUI not available, using BaseExecutor (simulation mode)")
165
+ else:
166
+ raise ValueError(f"Invalid platform_type: {platform_type}")
167
+
168
+ def _scale_coordinates(self, x: int | None, y: int | None) -> tuple[int | None, int | None]:
169
+ """Scale coordinates from target space to screen space."""
170
+ if x is not None and self.scale_x != 1.0:
171
+ x = int(x / self.scale_x)
172
+ if y is not None and self.scale_y != 1.0:
173
+ y = int(y / self.scale_y)
174
+
175
+ return x, y
176
+
177
+ def _scale_path(self, path: list[tuple[int, int]]) -> list[tuple[int, int]]:
178
+ """Scale a path from target space to screen space."""
179
+ scaled_path = []
180
+ for x, y in path:
181
+ scaled_x, scaled_y = self._scale_coordinates(x, y)
182
+ if scaled_x is not None and scaled_y is not None:
183
+ scaled_path.append((scaled_x, scaled_y))
184
+
185
+ return scaled_path
186
+
187
+ async def _rescale_screenshot(self, screenshot_base64: str) -> str:
188
+ """Rescale a screenshot if rescale_images is True."""
189
+ if not self.rescale_images or not self.needs_scaling:
190
+ return screenshot_base64
191
+
192
+ try:
193
+ import base64
194
+ from io import BytesIO
195
+
196
+ from PIL import Image # type: ignore[import-not-found]
197
+
198
+ # Decode base64 to image
199
+ image_data = base64.b64decode(screenshot_base64)
200
+ image = Image.open(BytesIO(image_data))
201
+
202
+ logger.info(
203
+ "Resizing screenshot from %s x %s to %s x %s",
204
+ image.width,
205
+ image.height,
206
+ self.width,
207
+ self.height,
208
+ )
209
+
210
+ # Resize to exact target dimensions
211
+ resized = image.resize((self.width, self.height), Image.Resampling.LANCZOS)
212
+
213
+ # Convert back to base64
214
+ buffer = BytesIO()
215
+ resized.save(buffer, format="PNG")
216
+ resized_base64 = base64.b64encode(buffer.getvalue()).decode("utf-8")
217
+
218
+ return resized_base64
219
+ except Exception as e:
220
+ logger.warning("Failed to rescale screenshot: %s", e)
221
+ return screenshot_base64
222
+
223
+ async def __call__(
224
+ self,
225
+ action: str = Field(..., description="The action name (click, type, move, etc.)"),
226
+ # Click parameters
227
+ x: int | None = Field(None, description="X coordinate for click/move/scroll actions"),
228
+ y: int | None = Field(None, description="Y coordinate for click/move/scroll actions"),
229
+ button: Literal["left", "right", "middle", "back", "forward"] | None = Field(
230
+ None, description="Mouse button for click actions"
231
+ ),
232
+ pattern: list[int] | None = Field(
233
+ None, description="Click pattern for multi-clicks (e.g., [100] for double-click)"
234
+ ),
235
+ # Key/Type parameters
236
+ text: str | None = Field(None, description="Text for type/response actions"),
237
+ keys: list[str] | None = Field(None, description="Keys for press/keydown/keyup actions"),
238
+ enter_after: bool | None = Field(None, description="Whether to press Enter after typing"),
239
+ # Scroll parameters
240
+ scroll_x: int | None = Field(
241
+ None, description="Horizontal scroll amount (positive = right)"
242
+ ),
243
+ scroll_y: int | None = Field(None, description="Vertical scroll amount (positive = down)"),
244
+ # Move parameters
245
+ offset_x: int | None = Field(None, description="X offset for relative move"),
246
+ offset_y: int | None = Field(None, description="Y offset for relative move"),
247
+ # Drag parameters
248
+ path: list[tuple[int, int]] | None = Field(
249
+ None, description="Path for drag actions as list of (x, y) coordinates"
250
+ ),
251
+ # Wait parameter
252
+ time: int | None = Field(None, description="Time in milliseconds for wait action"),
253
+ # General parameters
254
+ hold_keys: list[str] | None = Field(None, description="Keys to hold during action"),
255
+ # hold_key specific
256
+ duration: float | None = Field(None, description="Duration in seconds for hold_key action"),
257
+ ) -> list[ContentBlock]:
258
+ """
259
+ Execute a computer control action by name.
260
+
261
+ Returns:
262
+ List of MCP content blocks
263
+ """
264
+ logger.info("HudComputerTool executing action: %s", action)
265
+
266
+ try:
267
+ # Delegate to executor based on action
268
+ if action == "click":
269
+ # Scale coordinates from client space to screen space
270
+ scaled_x, scaled_y = self._scale_coordinates(x, y)
271
+ result = await self.executor.click(
272
+ x=scaled_x,
273
+ y=scaled_y,
274
+ button=button or "left",
275
+ pattern=pattern,
276
+ hold_keys=hold_keys,
277
+ )
278
+
279
+ elif action == "press":
280
+ if keys is None:
281
+ raise ToolError("keys parameter is required for press")
282
+ result = await self.executor.press(keys=keys)
283
+
284
+ elif action == "keydown":
285
+ if keys is None:
286
+ raise ToolError("keys parameter is required for keydown")
287
+ result = await self.executor.keydown(keys=keys)
288
+
289
+ elif action == "keyup":
290
+ if keys is None:
291
+ raise ToolError("keys parameter is required for keyup")
292
+ result = await self.executor.keyup(keys=keys)
293
+
294
+ elif action == "type":
295
+ if text is None:
296
+ raise ToolError("text parameter is required for type")
297
+ result = await self.executor.write(text=text, enter_after=enter_after or False)
298
+
299
+ elif action == "scroll":
300
+ # Scale coordinates from client space to screen space
301
+ scaled_x, scaled_y = self._scale_coordinates(x, y)
302
+ result = await self.executor.scroll(
303
+ x=scaled_x,
304
+ y=scaled_y,
305
+ scroll_x=scroll_x,
306
+ scroll_y=scroll_y,
307
+ hold_keys=hold_keys,
308
+ )
309
+
310
+ elif action == "move":
311
+ # Scale coordinates from client space to screen space
312
+ scaled_x, scaled_y = self._scale_coordinates(x, y)
313
+ scaled_offset_x, scaled_offset_y = self._scale_coordinates(offset_x, offset_y)
314
+ result = await self.executor.move(
315
+ x=scaled_x, y=scaled_y, offset_x=scaled_offset_x, offset_y=scaled_offset_y
316
+ )
317
+
318
+ elif action == "wait":
319
+ if time is None:
320
+ raise ToolError("time parameter is required for wait")
321
+ result = await self.executor.wait(time=time)
322
+
323
+ elif action == "drag":
324
+ if path is None:
325
+ raise ToolError("path parameter is required for drag")
326
+ # Scale path from client space to screen space
327
+ scaled_path = self._scale_path(path)
328
+ result = await self.executor.drag(
329
+ path=scaled_path, pattern=pattern, hold_keys=hold_keys
330
+ )
331
+
332
+ elif action == "response":
333
+ if text is None:
334
+ raise ToolError("text parameter is required for response")
335
+ return [TextContent(text=text, type="text")]
336
+
337
+ elif action == "screenshot":
338
+ screenshot = await self.executor.screenshot()
339
+ if screenshot:
340
+ # Rescale screenshot if requested
341
+ screenshot = await self._rescale_screenshot(screenshot)
342
+ result = ContentResult(base64_image=screenshot)
343
+ else:
344
+ result = ContentResult(error="Failed to take screenshot")
345
+
346
+ elif action == "position":
347
+ result = await self.executor.position()
348
+
349
+ elif action == "hold_key":
350
+ if text is None:
351
+ raise ToolError("text parameter is required for hold_key")
352
+ if duration is None:
353
+ raise ToolError("duration parameter is required for hold_key")
354
+ result = await self.executor.hold_key(key=text, duration=duration)
355
+
356
+ elif action == "mouse_down":
357
+ result = await self.executor.mouse_down(button=button or "left")
358
+
359
+ elif action == "mouse_up":
360
+ result = await self.executor.mouse_up(button=button or "left")
361
+
362
+ else:
363
+ raise McpError(ErrorData(code=INVALID_PARAMS, message=f"Unknown action: {action}"))
364
+
365
+ # Rescale screenshot in result if present
366
+ if isinstance(result, ContentResult) and result.base64_image and self.rescale_images:
367
+ rescaled_image = await self._rescale_screenshot(result.base64_image)
368
+ result.base64_image = rescaled_image
369
+
370
+ # Convert result to content blocks
371
+ return result.to_content_blocks()
372
+
373
+ except TypeError as e:
374
+ raise McpError(
375
+ ErrorData(code=INVALID_PARAMS, message=f"Invalid parameters for {action}: {e!s}")
376
+ ) from e