hud-python 0.4.1__py3-none-any.whl → 0.4.3__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release.


This version of hud-python might be problematic. Click here for more details.

Files changed (130)
  1. hud/__init__.py +22 -22
  2. hud/agents/__init__.py +13 -15
  3. hud/agents/base.py +599 -599
  4. hud/agents/claude.py +373 -373
  5. hud/agents/langchain.py +261 -250
  6. hud/agents/misc/__init__.py +7 -7
  7. hud/agents/misc/response_agent.py +82 -80
  8. hud/agents/openai.py +352 -352
  9. hud/agents/openai_chat_generic.py +154 -154
  10. hud/agents/tests/__init__.py +1 -1
  11. hud/agents/tests/test_base.py +742 -742
  12. hud/agents/tests/test_claude.py +324 -324
  13. hud/agents/tests/test_client.py +363 -363
  14. hud/agents/tests/test_openai.py +237 -237
  15. hud/cli/__init__.py +617 -617
  16. hud/cli/__main__.py +8 -8
  17. hud/cli/analyze.py +371 -371
  18. hud/cli/analyze_metadata.py +230 -230
  19. hud/cli/build.py +498 -427
  20. hud/cli/clone.py +185 -185
  21. hud/cli/cursor.py +92 -92
  22. hud/cli/debug.py +392 -392
  23. hud/cli/docker_utils.py +83 -83
  24. hud/cli/init.py +280 -281
  25. hud/cli/interactive.py +353 -353
  26. hud/cli/mcp_server.py +764 -756
  27. hud/cli/pull.py +330 -336
  28. hud/cli/push.py +404 -370
  29. hud/cli/remote_runner.py +311 -311
  30. hud/cli/runner.py +160 -160
  31. hud/cli/tests/__init__.py +3 -3
  32. hud/cli/tests/test_analyze.py +284 -284
  33. hud/cli/tests/test_cli_init.py +265 -265
  34. hud/cli/tests/test_cli_main.py +27 -27
  35. hud/cli/tests/test_clone.py +142 -142
  36. hud/cli/tests/test_cursor.py +253 -253
  37. hud/cli/tests/test_debug.py +453 -453
  38. hud/cli/tests/test_mcp_server.py +139 -139
  39. hud/cli/tests/test_utils.py +388 -388
  40. hud/cli/utils.py +263 -263
  41. hud/clients/README.md +143 -143
  42. hud/clients/__init__.py +16 -16
  43. hud/clients/base.py +378 -379
  44. hud/clients/fastmcp.py +222 -222
  45. hud/clients/mcp_use.py +298 -278
  46. hud/clients/tests/__init__.py +1 -1
  47. hud/clients/tests/test_client_integration.py +111 -111
  48. hud/clients/tests/test_fastmcp.py +342 -342
  49. hud/clients/tests/test_protocol.py +188 -188
  50. hud/clients/utils/__init__.py +1 -1
  51. hud/clients/utils/retry_transport.py +160 -160
  52. hud/datasets.py +327 -322
  53. hud/misc/__init__.py +1 -1
  54. hud/misc/claude_plays_pokemon.py +292 -292
  55. hud/otel/__init__.py +35 -35
  56. hud/otel/collector.py +142 -142
  57. hud/otel/config.py +164 -164
  58. hud/otel/context.py +536 -536
  59. hud/otel/exporters.py +366 -366
  60. hud/otel/instrumentation.py +97 -97
  61. hud/otel/processors.py +118 -118
  62. hud/otel/tests/__init__.py +1 -1
  63. hud/otel/tests/test_processors.py +197 -197
  64. hud/server/__init__.py +5 -5
  65. hud/server/context.py +114 -114
  66. hud/server/helper/__init__.py +5 -5
  67. hud/server/low_level.py +132 -132
  68. hud/server/server.py +170 -166
  69. hud/server/tests/__init__.py +3 -3
  70. hud/settings.py +73 -73
  71. hud/shared/__init__.py +5 -5
  72. hud/shared/exceptions.py +180 -180
  73. hud/shared/requests.py +264 -264
  74. hud/shared/tests/test_exceptions.py +157 -157
  75. hud/shared/tests/test_requests.py +275 -275
  76. hud/telemetry/__init__.py +25 -25
  77. hud/telemetry/instrument.py +379 -379
  78. hud/telemetry/job.py +309 -309
  79. hud/telemetry/replay.py +74 -74
  80. hud/telemetry/trace.py +83 -83
  81. hud/tools/__init__.py +33 -33
  82. hud/tools/base.py +365 -365
  83. hud/tools/bash.py +161 -161
  84. hud/tools/computer/__init__.py +15 -15
  85. hud/tools/computer/anthropic.py +437 -437
  86. hud/tools/computer/hud.py +376 -376
  87. hud/tools/computer/openai.py +295 -295
  88. hud/tools/computer/settings.py +82 -82
  89. hud/tools/edit.py +314 -314
  90. hud/tools/executors/__init__.py +30 -30
  91. hud/tools/executors/base.py +539 -539
  92. hud/tools/executors/pyautogui.py +621 -621
  93. hud/tools/executors/tests/__init__.py +1 -1
  94. hud/tools/executors/tests/test_base_executor.py +338 -338
  95. hud/tools/executors/tests/test_pyautogui_executor.py +165 -165
  96. hud/tools/executors/xdo.py +511 -511
  97. hud/tools/playwright.py +412 -412
  98. hud/tools/tests/__init__.py +3 -3
  99. hud/tools/tests/test_base.py +282 -282
  100. hud/tools/tests/test_bash.py +158 -158
  101. hud/tools/tests/test_bash_extended.py +197 -197
  102. hud/tools/tests/test_computer.py +425 -425
  103. hud/tools/tests/test_computer_actions.py +34 -34
  104. hud/tools/tests/test_edit.py +259 -259
  105. hud/tools/tests/test_init.py +27 -27
  106. hud/tools/tests/test_playwright_tool.py +183 -183
  107. hud/tools/tests/test_tools.py +145 -145
  108. hud/tools/tests/test_utils.py +156 -156
  109. hud/tools/types.py +72 -72
  110. hud/tools/utils.py +50 -50
  111. hud/types.py +136 -136
  112. hud/utils/__init__.py +10 -10
  113. hud/utils/async_utils.py +65 -65
  114. hud/utils/design.py +236 -168
  115. hud/utils/mcp.py +55 -55
  116. hud/utils/progress.py +149 -149
  117. hud/utils/telemetry.py +66 -66
  118. hud/utils/tests/test_async_utils.py +173 -173
  119. hud/utils/tests/test_init.py +17 -17
  120. hud/utils/tests/test_progress.py +261 -261
  121. hud/utils/tests/test_telemetry.py +82 -82
  122. hud/utils/tests/test_version.py +8 -8
  123. hud/version.py +7 -7
  124. {hud_python-0.4.1.dist-info → hud_python-0.4.3.dist-info}/METADATA +10 -8
  125. hud_python-0.4.3.dist-info/RECORD +131 -0
  126. {hud_python-0.4.1.dist-info → hud_python-0.4.3.dist-info}/licenses/LICENSE +21 -21
  127. hud/agents/art.py +0 -101
  128. hud_python-0.4.1.dist-info/RECORD +0 -132
  129. {hud_python-0.4.1.dist-info → hud_python-0.4.3.dist-info}/WHEEL +0 -0
  130. {hud_python-0.4.1.dist-info → hud_python-0.4.3.dist-info}/entry_points.txt +0 -0
hud/tools/computer/hud.py CHANGED
@@ -1,376 +1,376 @@
1
- # flake8: noqa: B008
2
- from __future__ import annotations
3
-
4
- import logging
5
- import platform
6
- from typing import Literal
7
-
8
- from mcp import ErrorData, McpError
9
- from mcp.types import INVALID_PARAMS, ContentBlock, TextContent
10
- from pydantic import Field
11
-
12
- from hud.tools.base import BaseTool
13
- from hud.tools.executors.base import BaseExecutor
14
- from hud.tools.executors.pyautogui import PyAutoGUIExecutor
15
- from hud.tools.executors.xdo import XDOExecutor
16
- from hud.tools.types import ContentResult, ToolError
17
-
18
- from .settings import computer_settings
19
-
20
- logger = logging.getLogger(__name__)
21
-
22
-
23
- class HudComputerTool(BaseTool):
24
- """
25
- A tool that allows the agent to control the computer.
26
- """
27
-
28
- def __init__(
29
- self,
30
- # Define within environment based on platform
31
- executor: BaseExecutor | None = None,
32
- platform_type: Literal["auto", "xdo", "pyautogui"] = "auto",
33
- display_num: int | None = None,
34
- # Overrides for what dimensions the agent thinks it operates in
35
- # Define per subclass (e.g., Anthropic, OpenAI)
36
- width: int | None = computer_settings.HUD_COMPUTER_WIDTH,
37
- height: int | None = computer_settings.HUD_COMPUTER_HEIGHT,
38
- rescale_images: bool = computer_settings.HUD_RESCALE_IMAGES,
39
- # What the agent sees as the tool's name, title, and description
40
- name: str | None = None,
41
- title: str | None = None,
42
- description: str | None = None,
43
- ) -> None:
44
- """
45
- Initialize the HUD computer tool.
46
-
47
- Args:
48
- executor: Executor to use for the tool
49
- platform_type: Which executor to use if executor not provided:
50
- - "auto": Automatically detect based on platform
51
- - "xdo": Use XDOExecutor (Linux/X11 only)
52
- - "pyautogui": Use PyAutoGUIExecutor (cross-platform)
53
- display_num: X display number
54
- width: Target width for rescaling (None = use environment width)
55
- height: Target height for rescaling (None = use environment height)
56
- rescale_images: If True, rescale screenshots. If False, only rescale action coordinates
57
- name: Tool name for MCP registration (auto-generated from class name if not provided)
58
- title: Human-readable display name for the tool (auto-generated from class name)
59
- description: Tool description (auto-generated from docstring if not provided)
60
- """
61
- # Initialize base tool with executor as env
62
- super().__init__(
63
- env=executor,
64
- name=name or "computer",
65
- title=title or "Computer Control",
66
- description=description or "Control computer with mouse, keyboard, and screenshots",
67
- )
68
-
69
- # This is the width and height the agent thinks it operates in
70
- # By default, use subclass's width and height
71
- # If specifically set to None, use environment width and height
72
- self.width = width or computer_settings.DISPLAY_WIDTH
73
- self.height = height or computer_settings.DISPLAY_HEIGHT
74
-
75
- # This is the static width and height of the environment screen
76
- # And the width and height of the screenshots taken by the tool
77
- self.environment_width = computer_settings.DISPLAY_WIDTH
78
- self.environment_height = computer_settings.DISPLAY_HEIGHT
79
-
80
- # Some APIs rescale screenshots automatically to the agent's width and height, some don't
81
- # Defined per subclass (e.g., Anthropic, OpenAI)
82
- # In case you need your agent to receive pre-formatted screenshots, set env variable True
83
- self.rescale_images = rescale_images
84
-
85
- logger.debug(
86
- "Agent Screen Width: %s, Agent Screen Height: %s",
87
- self.width,
88
- self.height,
89
- "Environment Screen Width: %s, Environment Screen Height: %s",
90
- self.environment_width,
91
- self.environment_height,
92
- )
93
-
94
- # Calculate scaling factors from base screen size to target size
95
- self.scale_x = self.width / self.environment_width
96
- self.scale_y = self.height / self.environment_height
97
-
98
- # Check if we need to scale
99
- self.needs_scaling = min(self.scale_x, self.scale_y) != 1.0
100
-
101
- # Use environment settings for display number
102
- self.display_num = display_num or computer_settings.DISPLAY_NUM
103
-
104
- logger.debug("Display number: %s", self.display_num)
105
-
106
- # If no executor provided, create one based on platform
107
- if self.env is None:
108
- self._choose_executor(platform_type, self.display_num)
109
-
110
- @property
111
- def executor(self) -> BaseExecutor:
112
- """Get the executor (alias for context)."""
113
- return self.env
114
-
115
- @executor.setter
116
- def executor(self, value: BaseExecutor) -> None:
117
- """Set the executor (alias for context)."""
118
- self.env = value
119
-
120
- def _choose_executor(
121
- self,
122
- platform_type: Literal["auto", "xdo", "pyautogui"],
123
- display_num: int | None,
124
- ) -> None:
125
- """Choose executor based on platform_type."""
126
- # Choose executor based on platform_type
127
- if platform_type == "auto":
128
- # Auto-detect based on platform
129
- system = platform.system().lower()
130
- if system == "linux":
131
- # Try XDO first on Linux
132
- if XDOExecutor.is_available():
133
- self.executor = XDOExecutor(display_num=display_num)
134
- logger.info("Using XDOExecutor")
135
- elif PyAutoGUIExecutor.is_available():
136
- self.executor = PyAutoGUIExecutor(display_num=display_num)
137
- logger.info("Using PyAutoGUIExecutor")
138
- else:
139
- self.executor = BaseExecutor(display_num=display_num)
140
- logger.info("No display available, using BaseExecutor (simulation mode)")
141
- else:
142
- # Windows/macOS - try PyAutoGUI
143
- if PyAutoGUIExecutor.is_available():
144
- self.executor = PyAutoGUIExecutor(display_num=display_num)
145
- logger.info("Using PyAutoGUIExecutor")
146
- else:
147
- self.executor = BaseExecutor(display_num=display_num)
148
- logger.info("PyAutoGUI not available, using BaseExecutor (simulation mode)")
149
-
150
- elif platform_type == "xdo":
151
- if XDOExecutor.is_available():
152
- self.executor = XDOExecutor(display_num=display_num)
153
- logger.info("Using XDOExecutor")
154
- else:
155
- self.executor = BaseExecutor(display_num=display_num)
156
- logger.warning("XDO not available, using BaseExecutor (simulation mode)")
157
-
158
- elif platform_type == "pyautogui":
159
- if PyAutoGUIExecutor.is_available():
160
- self.executor = PyAutoGUIExecutor(display_num=display_num)
161
- logger.info("Using PyAutoGUIExecutor")
162
- else:
163
- self.executor = BaseExecutor(display_num=display_num)
164
- logger.warning("PyAutoGUI not available, using BaseExecutor (simulation mode)")
165
- else:
166
- raise ValueError(f"Invalid platform_type: {platform_type}")
167
-
168
- def _scale_coordinates(self, x: int | None, y: int | None) -> tuple[int | None, int | None]:
169
- """Scale coordinates from target space to screen space."""
170
- if x is not None and self.scale_x != 1.0:
171
- x = int(x / self.scale_x)
172
- if y is not None and self.scale_y != 1.0:
173
- y = int(y / self.scale_y)
174
-
175
- return x, y
176
-
177
- def _scale_path(self, path: list[tuple[int, int]]) -> list[tuple[int, int]]:
178
- """Scale a path from target space to screen space."""
179
- scaled_path = []
180
- for x, y in path:
181
- scaled_x, scaled_y = self._scale_coordinates(x, y)
182
- if scaled_x is not None and scaled_y is not None:
183
- scaled_path.append((scaled_x, scaled_y))
184
-
185
- return scaled_path
186
-
187
- async def _rescale_screenshot(self, screenshot_base64: str) -> str:
188
- """Rescale a screenshot if rescale_images is True."""
189
- if not self.rescale_images or not self.needs_scaling:
190
- return screenshot_base64
191
-
192
- try:
193
- import base64
194
- from io import BytesIO
195
-
196
- from PIL import Image # type: ignore[import-not-found]
197
-
198
- # Decode base64 to image
199
- image_data = base64.b64decode(screenshot_base64)
200
- image = Image.open(BytesIO(image_data))
201
-
202
- logger.info(
203
- "Resizing screenshot from %s x %s to %s x %s",
204
- image.width,
205
- image.height,
206
- self.width,
207
- self.height,
208
- )
209
-
210
- # Resize to exact target dimensions
211
- resized = image.resize((self.width, self.height), Image.Resampling.LANCZOS)
212
-
213
- # Convert back to base64
214
- buffer = BytesIO()
215
- resized.save(buffer, format="PNG")
216
- resized_base64 = base64.b64encode(buffer.getvalue()).decode("utf-8")
217
-
218
- return resized_base64
219
- except Exception as e:
220
- logger.warning("Failed to rescale screenshot: %s", e)
221
- return screenshot_base64
222
-
223
- async def __call__(
224
- self,
225
- action: str = Field(..., description="The action name (click, type, move, etc.)"),
226
- # Click parameters
227
- x: int | None = Field(None, description="X coordinate for click/move/scroll actions"),
228
- y: int | None = Field(None, description="Y coordinate for click/move/scroll actions"),
229
- button: Literal["left", "right", "middle", "back", "forward"] | None = Field(
230
- None, description="Mouse button for click actions"
231
- ),
232
- pattern: list[int] | None = Field(
233
- None, description="Click pattern for multi-clicks (e.g., [100] for double-click)"
234
- ),
235
- # Key/Type parameters
236
- text: str | None = Field(None, description="Text for type/response actions"),
237
- keys: list[str] | None = Field(None, description="Keys for press/keydown/keyup actions"),
238
- enter_after: bool | None = Field(None, description="Whether to press Enter after typing"),
239
- # Scroll parameters
240
- scroll_x: int | None = Field(
241
- None, description="Horizontal scroll amount (positive = right)"
242
- ),
243
- scroll_y: int | None = Field(None, description="Vertical scroll amount (positive = down)"),
244
- # Move parameters
245
- offset_x: int | None = Field(None, description="X offset for relative move"),
246
- offset_y: int | None = Field(None, description="Y offset for relative move"),
247
- # Drag parameters
248
- path: list[tuple[int, int]] | None = Field(
249
- None, description="Path for drag actions as list of (x, y) coordinates"
250
- ),
251
- # Wait parameter
252
- time: int | None = Field(None, description="Time in milliseconds for wait action"),
253
- # General parameters
254
- hold_keys: list[str] | None = Field(None, description="Keys to hold during action"),
255
- # hold_key specific
256
- duration: float | None = Field(None, description="Duration in seconds for hold_key action"),
257
- ) -> list[ContentBlock]:
258
- """
259
- Execute a computer control action by name.
260
-
261
- Returns:
262
- List of MCP content blocks
263
- """
264
- logger.info("HudComputerTool executing action: %s", action)
265
-
266
- try:
267
- # Delegate to executor based on action
268
- if action == "click":
269
- # Scale coordinates from client space to screen space
270
- scaled_x, scaled_y = self._scale_coordinates(x, y)
271
- result = await self.executor.click(
272
- x=scaled_x,
273
- y=scaled_y,
274
- button=button or "left",
275
- pattern=pattern,
276
- hold_keys=hold_keys,
277
- )
278
-
279
- elif action == "press":
280
- if keys is None:
281
- raise ToolError("keys parameter is required for press")
282
- result = await self.executor.press(keys=keys)
283
-
284
- elif action == "keydown":
285
- if keys is None:
286
- raise ToolError("keys parameter is required for keydown")
287
- result = await self.executor.keydown(keys=keys)
288
-
289
- elif action == "keyup":
290
- if keys is None:
291
- raise ToolError("keys parameter is required for keyup")
292
- result = await self.executor.keyup(keys=keys)
293
-
294
- elif action == "type":
295
- if text is None:
296
- raise ToolError("text parameter is required for type")
297
- result = await self.executor.write(text=text, enter_after=enter_after or False)
298
-
299
- elif action == "scroll":
300
- # Scale coordinates from client space to screen space
301
- scaled_x, scaled_y = self._scale_coordinates(x, y)
302
- result = await self.executor.scroll(
303
- x=scaled_x,
304
- y=scaled_y,
305
- scroll_x=scroll_x,
306
- scroll_y=scroll_y,
307
- hold_keys=hold_keys,
308
- )
309
-
310
- elif action == "move":
311
- # Scale coordinates from client space to screen space
312
- scaled_x, scaled_y = self._scale_coordinates(x, y)
313
- scaled_offset_x, scaled_offset_y = self._scale_coordinates(offset_x, offset_y)
314
- result = await self.executor.move(
315
- x=scaled_x, y=scaled_y, offset_x=scaled_offset_x, offset_y=scaled_offset_y
316
- )
317
-
318
- elif action == "wait":
319
- if time is None:
320
- raise ToolError("time parameter is required for wait")
321
- result = await self.executor.wait(time=time)
322
-
323
- elif action == "drag":
324
- if path is None:
325
- raise ToolError("path parameter is required for drag")
326
- # Scale path from client space to screen space
327
- scaled_path = self._scale_path(path)
328
- result = await self.executor.drag(
329
- path=scaled_path, pattern=pattern, hold_keys=hold_keys
330
- )
331
-
332
- elif action == "response":
333
- if text is None:
334
- raise ToolError("text parameter is required for response")
335
- return [TextContent(text=text, type="text")]
336
-
337
- elif action == "screenshot":
338
- screenshot = await self.executor.screenshot()
339
- if screenshot:
340
- # Rescale screenshot if requested
341
- screenshot = await self._rescale_screenshot(screenshot)
342
- result = ContentResult(base64_image=screenshot)
343
- else:
344
- result = ContentResult(error="Failed to take screenshot")
345
-
346
- elif action == "position":
347
- result = await self.executor.position()
348
-
349
- elif action == "hold_key":
350
- if text is None:
351
- raise ToolError("text parameter is required for hold_key")
352
- if duration is None:
353
- raise ToolError("duration parameter is required for hold_key")
354
- result = await self.executor.hold_key(key=text, duration=duration)
355
-
356
- elif action == "mouse_down":
357
- result = await self.executor.mouse_down(button=button or "left")
358
-
359
- elif action == "mouse_up":
360
- result = await self.executor.mouse_up(button=button or "left")
361
-
362
- else:
363
- raise McpError(ErrorData(code=INVALID_PARAMS, message=f"Unknown action: {action}"))
364
-
365
- # Rescale screenshot in result if present
366
- if isinstance(result, ContentResult) and result.base64_image and self.rescale_images:
367
- rescaled_image = await self._rescale_screenshot(result.base64_image)
368
- result.base64_image = rescaled_image
369
-
370
- # Convert result to content blocks
371
- return result.to_content_blocks()
372
-
373
- except TypeError as e:
374
- raise McpError(
375
- ErrorData(code=INVALID_PARAMS, message=f"Invalid parameters for {action}: {e!s}")
376
- ) from e
1
+ # flake8: noqa: B008
2
+ from __future__ import annotations
3
+
4
+ import logging
5
+ import platform
6
+ from typing import Literal
7
+
8
+ from mcp import ErrorData, McpError
9
+ from mcp.types import INVALID_PARAMS, ContentBlock, TextContent
10
+ from pydantic import Field
11
+
12
+ from hud.tools.base import BaseTool
13
+ from hud.tools.executors.base import BaseExecutor
14
+ from hud.tools.executors.pyautogui import PyAutoGUIExecutor
15
+ from hud.tools.executors.xdo import XDOExecutor
16
+ from hud.tools.types import ContentResult, ToolError
17
+
18
+ from .settings import computer_settings
19
+
20
+ logger = logging.getLogger(__name__)
21
+
22
+
23
class HudComputerTool(BaseTool):
    """
    A tool that allows the agent to control the computer.

    Actions are addressed by name through ``__call__`` and forwarded to an executor,
    which is stored on the base tool as ``self.env``. Coordinates supplied by the
    agent are expressed in the agent's screen size (``width`` x ``height``) and are
    converted to the environment's screen size before they reach the executor.
    """

    def __init__(
        self,
        # Define within environment based on platform
        executor: BaseExecutor | None = None,
        platform_type: Literal["auto", "xdo", "pyautogui"] = "auto",
        display_num: int | None = None,
        # Overrides for what dimensions the agent thinks it operates in
        # Define per subclass (e.g., Anthropic, OpenAI)
        width: int | None = computer_settings.HUD_COMPUTER_WIDTH,
        height: int | None = computer_settings.HUD_COMPUTER_HEIGHT,
        rescale_images: bool = computer_settings.HUD_RESCALE_IMAGES,
        # What the agent sees as the tool's name, title, and description
        name: str | None = None,
        title: str | None = None,
        description: str | None = None,
    ) -> None:
        """
        Initialize the HUD computer tool.

        Args:
            executor: Executor to use for the tool
            platform_type: Which executor to use if executor not provided:
                - "auto": Automatically detect based on platform
                - "xdo": Use XDOExecutor (Linux/X11 only)
                - "pyautogui": Use PyAutoGUIExecutor (cross-platform)
            display_num: X display number (None = use computer_settings.DISPLAY_NUM)
            width: Target width for rescaling (None = use environment width)
            height: Target height for rescaling (None = use environment height)
            rescale_images: If True, rescale screenshots. If False, only rescale action coordinates
            name: Tool name for MCP registration (defaults to "computer")
            title: Human-readable display name for the tool (defaults to "Computer Control")
            description: Tool description (defaults to a generic one-line summary)

        Raises:
            ValueError: If no executor is given and platform_type is not a known value.
        """
        # Initialize base tool with executor as env
        super().__init__(
            env=executor,
            name=name or "computer",
            title=title or "Computer Control",
            description=description or "Control computer with mouse, keyboard, and screenshots",
        )

        # This is the width and height the agent thinks it operates in
        # By default, use subclass's width and height
        # If specifically set to None, use environment width and height
        self.width = width or computer_settings.DISPLAY_WIDTH
        self.height = height or computer_settings.DISPLAY_HEIGHT

        # This is the static width and height of the environment screen
        # And the width and height of the screenshots taken by the tool
        self.environment_width = computer_settings.DISPLAY_WIDTH
        self.environment_height = computer_settings.DISPLAY_HEIGHT

        # Some APIs rescale screenshots automatically to the agent's width and height, some don't
        # Defined per subclass (e.g., Anthropic, OpenAI)
        # In case you need your agent to receive pre-formatted screenshots, set env variable True
        self.rescale_images = rescale_images

        # One format string with one placeholder per argument; passing a second format
        # string as a positional argument makes logging fail to render the record.
        logger.debug(
            "Agent Screen Width: %s, Agent Screen Height: %s, "
            "Environment Screen Width: %s, Environment Screen Height: %s",
            self.width,
            self.height,
            self.environment_width,
            self.environment_height,
        )

        # Calculate scaling factors from base screen size to target size
        self.scale_x = self.width / self.environment_width
        self.scale_y = self.height / self.environment_height

        # Scaling is needed as soon as either axis differs; min() would miss the case
        # where one factor is exactly 1.0 and the other is larger than 1.0.
        self.needs_scaling = self.scale_x != 1.0 or self.scale_y != 1.0

        # Use environment settings for display number only when none was passed;
        # an explicit 0 must be honoured rather than treated as "not provided".
        self.display_num = (
            display_num if display_num is not None else computer_settings.DISPLAY_NUM
        )

        logger.debug("Display number: %s", self.display_num)

        # If no executor provided, create one based on platform
        if self.env is None:
            self._choose_executor(platform_type, self.display_num)

    @property
    def executor(self) -> BaseExecutor:
        """Get the executor (alias for ``self.env``)."""
        return self.env

    @executor.setter
    def executor(self, value: BaseExecutor) -> None:
        """Set the executor (alias for ``self.env``)."""
        self.env = value

    def _choose_executor(
        self,
        platform_type: Literal["auto", "xdo", "pyautogui"],
        display_num: int | None,
    ) -> None:
        """
        Choose and install an executor based on platform_type.

        Each candidate is probed with ``is_available()``; when no candidate is
        available a plain ``BaseExecutor`` is installed instead (logged as
        "simulation mode").

        Args:
            platform_type: "auto", "xdo" or "pyautogui"
            display_num: Display number handed to the executor constructor

        Raises:
            ValueError: If platform_type is not one of the supported values.
        """
        if platform_type == "auto":
            # Auto-detect based on platform
            system = platform.system().lower()
            if system == "linux":
                # Try XDO first on Linux
                if XDOExecutor.is_available():
                    self.executor = XDOExecutor(display_num=display_num)
                    logger.info("Using XDOExecutor")
                elif PyAutoGUIExecutor.is_available():
                    self.executor = PyAutoGUIExecutor(display_num=display_num)
                    logger.info("Using PyAutoGUIExecutor")
                else:
                    self.executor = BaseExecutor(display_num=display_num)
                    logger.info("No display available, using BaseExecutor (simulation mode)")
            else:
                # Windows/macOS - try PyAutoGUI
                if PyAutoGUIExecutor.is_available():
                    self.executor = PyAutoGUIExecutor(display_num=display_num)
                    logger.info("Using PyAutoGUIExecutor")
                else:
                    self.executor = BaseExecutor(display_num=display_num)
                    logger.info("PyAutoGUI not available, using BaseExecutor (simulation mode)")

        elif platform_type == "xdo":
            if XDOExecutor.is_available():
                self.executor = XDOExecutor(display_num=display_num)
                logger.info("Using XDOExecutor")
            else:
                # An explicitly requested backend is missing, hence warning level
                self.executor = BaseExecutor(display_num=display_num)
                logger.warning("XDO not available, using BaseExecutor (simulation mode)")

        elif platform_type == "pyautogui":
            if PyAutoGUIExecutor.is_available():
                self.executor = PyAutoGUIExecutor(display_num=display_num)
                logger.info("Using PyAutoGUIExecutor")
            else:
                # An explicitly requested backend is missing, hence warning level
                self.executor = BaseExecutor(display_num=display_num)
                logger.warning("PyAutoGUI not available, using BaseExecutor (simulation mode)")
        else:
            raise ValueError(f"Invalid platform_type: {platform_type}")

    def _scale_coordinates(self, x: int | None, y: int | None) -> tuple[int | None, int | None]:
        """
        Scale coordinates from target (agent) space to screen (environment) space.

        Each axis is divided by its scale factor and truncated to int. ``None``
        values and axes whose factor is exactly 1.0 are returned unchanged.
        """
        if x is not None and self.scale_x != 1.0:
            x = int(x / self.scale_x)
        if y is not None and self.scale_y != 1.0:
            y = int(y / self.scale_y)

        return x, y

    def _scale_path(self, path: list[tuple[int, int]]) -> list[tuple[int, int]]:
        """
        Scale a path from target (agent) space to screen (environment) space.

        Points for which either scaled coordinate is ``None`` are dropped.
        """
        scaled_points = (self._scale_coordinates(x, y) for x, y in path)
        return [
            (scaled_x, scaled_y)
            for scaled_x, scaled_y in scaled_points
            if scaled_x is not None and scaled_y is not None
        ]

    async def _rescale_screenshot(self, screenshot_base64: str) -> str:
        """
        Rescale a base64 screenshot to the agent's width and height.

        The input is returned unchanged when ``rescale_images`` is False, when no
        scaling is needed, or when resizing fails for any reason (best effort; the
        failure is logged as a warning).

        Args:
            screenshot_base64: Base64-encoded image data

        Returns:
            Base64-encoded PNG of the resized image, or the original string.
        """
        if not self.rescale_images or not self.needs_scaling:
            return screenshot_base64

        try:
            import base64
            from io import BytesIO

            from PIL import Image  # type: ignore[import-not-found]

            # Decode base64 to image
            image_data = base64.b64decode(screenshot_base64)
            image = Image.open(BytesIO(image_data))

            logger.info(
                "Resizing screenshot from %s x %s to %s x %s",
                image.width,
                image.height,
                self.width,
                self.height,
            )

            # Resize to exact target dimensions
            resized = image.resize((self.width, self.height), Image.Resampling.LANCZOS)

            # Convert back to base64
            buffer = BytesIO()
            resized.save(buffer, format="PNG")
            return base64.b64encode(buffer.getvalue()).decode("utf-8")
        except Exception as e:
            # Deliberately broad: a failed resize must not fail the action itself
            logger.warning("Failed to rescale screenshot: %s", e)
            return screenshot_base64

    async def __call__(
        self,
        action: str = Field(..., description="The action name (click, type, move, etc.)"),
        # Click parameters
        x: int | None = Field(None, description="X coordinate for click/move/scroll actions"),
        y: int | None = Field(None, description="Y coordinate for click/move/scroll actions"),
        button: Literal["left", "right", "middle", "back", "forward"] | None = Field(
            None, description="Mouse button for click actions"
        ),
        pattern: list[int] | None = Field(
            None, description="Click pattern for multi-clicks (e.g., [100] for double-click)"
        ),
        # Key/Type parameters
        text: str | None = Field(None, description="Text for type/response actions"),
        keys: list[str] | None = Field(None, description="Keys for press/keydown/keyup actions"),
        enter_after: bool | None = Field(None, description="Whether to press Enter after typing"),
        # Scroll parameters
        scroll_x: int | None = Field(
            None, description="Horizontal scroll amount (positive = right)"
        ),
        scroll_y: int | None = Field(None, description="Vertical scroll amount (positive = down)"),
        # Move parameters
        offset_x: int | None = Field(None, description="X offset for relative move"),
        offset_y: int | None = Field(None, description="Y offset for relative move"),
        # Drag parameters
        path: list[tuple[int, int]] | None = Field(
            None, description="Path for drag actions as list of (x, y) coordinates"
        ),
        # Wait parameter
        time: int | None = Field(None, description="Time in milliseconds for wait action"),
        # General parameters
        hold_keys: list[str] | None = Field(None, description="Keys to hold during action"),
        # hold_key specific
        duration: float | None = Field(None, description="Duration in seconds for hold_key action"),
    ) -> list[ContentBlock]:
        """
        Execute a computer control action by name.

        Supported actions: click, press, keydown, keyup, type, scroll, move, wait,
        drag, response, screenshot, position, hold_key, mouse_down, mouse_up.
        Coordinates, offsets and drag paths are converted from agent space to
        screen space before being passed to the executor. The "response" action
        does not touch the executor; it returns ``text`` as a single text block.

        Returns:
            List of MCP content blocks

        Raises:
            ToolError: If a parameter required by the chosen action is missing.
            McpError: If the action is unknown, or a TypeError is raised while
                handling it (reported as INVALID_PARAMS).
        """
        logger.info("HudComputerTool executing action: %s", action)

        try:
            # Delegate to executor based on action
            if action == "click":
                # Scale coordinates from client space to screen space
                scaled_x, scaled_y = self._scale_coordinates(x, y)
                result = await self.executor.click(
                    x=scaled_x,
                    y=scaled_y,
                    button=button or "left",
                    pattern=pattern,
                    hold_keys=hold_keys,
                )

            elif action == "press":
                if keys is None:
                    raise ToolError("keys parameter is required for press")
                result = await self.executor.press(keys=keys)

            elif action == "keydown":
                if keys is None:
                    raise ToolError("keys parameter is required for keydown")
                result = await self.executor.keydown(keys=keys)

            elif action == "keyup":
                if keys is None:
                    raise ToolError("keys parameter is required for keyup")
                result = await self.executor.keyup(keys=keys)

            elif action == "type":
                if text is None:
                    raise ToolError("text parameter is required for type")
                result = await self.executor.write(text=text, enter_after=enter_after or False)

            elif action == "scroll":
                # Scale coordinates from client space to screen space
                scaled_x, scaled_y = self._scale_coordinates(x, y)
                result = await self.executor.scroll(
                    x=scaled_x,
                    y=scaled_y,
                    scroll_x=scroll_x,
                    scroll_y=scroll_y,
                    hold_keys=hold_keys,
                )

            elif action == "move":
                # Scale coordinates from client space to screen space
                scaled_x, scaled_y = self._scale_coordinates(x, y)
                scaled_offset_x, scaled_offset_y = self._scale_coordinates(offset_x, offset_y)
                result = await self.executor.move(
                    x=scaled_x, y=scaled_y, offset_x=scaled_offset_x, offset_y=scaled_offset_y
                )

            elif action == "wait":
                if time is None:
                    raise ToolError("time parameter is required for wait")
                result = await self.executor.wait(time=time)

            elif action == "drag":
                if path is None:
                    raise ToolError("path parameter is required for drag")
                # Scale path from client space to screen space
                scaled_path = self._scale_path(path)
                result = await self.executor.drag(
                    path=scaled_path, pattern=pattern, hold_keys=hold_keys
                )

            elif action == "response":
                if text is None:
                    raise ToolError("text parameter is required for response")
                return [TextContent(text=text, type="text")]

            elif action == "screenshot":
                screenshot = await self.executor.screenshot()
                if screenshot:
                    # Rescaling happens once, in the shared post-processing step below;
                    # doing it here as well would decode and re-encode the image twice.
                    result = ContentResult(base64_image=screenshot)
                else:
                    result = ContentResult(error="Failed to take screenshot")

            elif action == "position":
                result = await self.executor.position()

            elif action == "hold_key":
                if text is None:
                    raise ToolError("text parameter is required for hold_key")
                if duration is None:
                    raise ToolError("duration parameter is required for hold_key")
                result = await self.executor.hold_key(key=text, duration=duration)

            elif action == "mouse_down":
                result = await self.executor.mouse_down(button=button or "left")

            elif action == "mouse_up":
                result = await self.executor.mouse_up(button=button or "left")

            else:
                raise McpError(ErrorData(code=INVALID_PARAMS, message=f"Unknown action: {action}"))

            # Rescale screenshot in result if present
            if isinstance(result, ContentResult) and result.base64_image and self.rescale_images:
                result.base64_image = await self._rescale_screenshot(result.base64_image)

            # Convert result to content blocks
            return result.to_content_blocks()

        except TypeError as e:
            raise McpError(
                ErrorData(code=INVALID_PARAMS, message=f"Invalid parameters for {action}: {e!s}")
            ) from e