hud-python 0.3.5__py3-none-any.whl → 0.4.1__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release.


This version of hud-python might be problematic. Click here for more details.

Files changed (192)
  1. hud/__init__.py +22 -89
  2. hud/agents/__init__.py +15 -0
  3. hud/agents/art.py +101 -0
  4. hud/agents/base.py +599 -0
  5. hud/{mcp → agents}/claude.py +373 -321
  6. hud/{mcp → agents}/langchain.py +250 -250
  7. hud/agents/misc/__init__.py +7 -0
  8. hud/{agent → agents}/misc/response_agent.py +80 -80
  9. hud/{mcp → agents}/openai.py +352 -334
  10. hud/agents/openai_chat_generic.py +154 -0
  11. hud/{mcp → agents}/tests/__init__.py +1 -1
  12. hud/agents/tests/test_base.py +742 -0
  13. hud/agents/tests/test_claude.py +324 -0
  14. hud/{mcp → agents}/tests/test_client.py +363 -324
  15. hud/{mcp → agents}/tests/test_openai.py +237 -238
  16. hud/cli/__init__.py +617 -0
  17. hud/cli/__main__.py +8 -0
  18. hud/cli/analyze.py +371 -0
  19. hud/cli/analyze_metadata.py +230 -0
  20. hud/cli/build.py +427 -0
  21. hud/cli/clone.py +185 -0
  22. hud/cli/cursor.py +92 -0
  23. hud/cli/debug.py +392 -0
  24. hud/cli/docker_utils.py +83 -0
  25. hud/cli/init.py +281 -0
  26. hud/cli/interactive.py +353 -0
  27. hud/cli/mcp_server.py +756 -0
  28. hud/cli/pull.py +336 -0
  29. hud/cli/push.py +370 -0
  30. hud/cli/remote_runner.py +311 -0
  31. hud/cli/runner.py +160 -0
  32. hud/cli/tests/__init__.py +3 -0
  33. hud/cli/tests/test_analyze.py +284 -0
  34. hud/cli/tests/test_cli_init.py +265 -0
  35. hud/cli/tests/test_cli_main.py +27 -0
  36. hud/cli/tests/test_clone.py +142 -0
  37. hud/cli/tests/test_cursor.py +253 -0
  38. hud/cli/tests/test_debug.py +453 -0
  39. hud/cli/tests/test_mcp_server.py +139 -0
  40. hud/cli/tests/test_utils.py +388 -0
  41. hud/cli/utils.py +263 -0
  42. hud/clients/README.md +143 -0
  43. hud/clients/__init__.py +16 -0
  44. hud/clients/base.py +379 -0
  45. hud/clients/fastmcp.py +222 -0
  46. hud/clients/mcp_use.py +278 -0
  47. hud/clients/tests/__init__.py +1 -0
  48. hud/clients/tests/test_client_integration.py +111 -0
  49. hud/clients/tests/test_fastmcp.py +342 -0
  50. hud/clients/tests/test_protocol.py +188 -0
  51. hud/clients/utils/__init__.py +1 -0
  52. hud/clients/utils/retry_transport.py +160 -0
  53. hud/datasets.py +322 -192
  54. hud/misc/__init__.py +1 -0
  55. hud/{agent → misc}/claude_plays_pokemon.py +292 -283
  56. hud/otel/__init__.py +35 -0
  57. hud/otel/collector.py +142 -0
  58. hud/otel/config.py +164 -0
  59. hud/otel/context.py +536 -0
  60. hud/otel/exporters.py +366 -0
  61. hud/otel/instrumentation.py +97 -0
  62. hud/otel/processors.py +118 -0
  63. hud/otel/tests/__init__.py +1 -0
  64. hud/otel/tests/test_processors.py +197 -0
  65. hud/server/__init__.py +5 -5
  66. hud/server/context.py +114 -0
  67. hud/server/helper/__init__.py +5 -0
  68. hud/server/low_level.py +132 -0
  69. hud/server/server.py +166 -0
  70. hud/server/tests/__init__.py +3 -0
  71. hud/settings.py +73 -79
  72. hud/shared/__init__.py +5 -0
  73. hud/{exceptions.py → shared/exceptions.py} +180 -180
  74. hud/{server → shared}/requests.py +264 -264
  75. hud/shared/tests/test_exceptions.py +157 -0
  76. hud/{server → shared}/tests/test_requests.py +275 -275
  77. hud/telemetry/__init__.py +25 -30
  78. hud/telemetry/instrument.py +379 -0
  79. hud/telemetry/job.py +309 -141
  80. hud/telemetry/replay.py +74 -0
  81. hud/telemetry/trace.py +83 -0
  82. hud/tools/__init__.py +33 -34
  83. hud/tools/base.py +365 -65
  84. hud/tools/bash.py +161 -137
  85. hud/tools/computer/__init__.py +15 -13
  86. hud/tools/computer/anthropic.py +437 -420
  87. hud/tools/computer/hud.py +376 -334
  88. hud/tools/computer/openai.py +295 -292
  89. hud/tools/computer/settings.py +82 -0
  90. hud/tools/edit.py +314 -290
  91. hud/tools/executors/__init__.py +30 -30
  92. hud/tools/executors/base.py +539 -532
  93. hud/tools/executors/pyautogui.py +621 -619
  94. hud/tools/executors/tests/__init__.py +1 -1
  95. hud/tools/executors/tests/test_base_executor.py +338 -338
  96. hud/tools/executors/tests/test_pyautogui_executor.py +165 -165
  97. hud/tools/executors/xdo.py +511 -503
  98. hud/tools/{playwright_tool.py → playwright.py} +412 -379
  99. hud/tools/tests/__init__.py +3 -3
  100. hud/tools/tests/test_base.py +282 -0
  101. hud/tools/tests/test_bash.py +158 -152
  102. hud/tools/tests/test_bash_extended.py +197 -0
  103. hud/tools/tests/test_computer.py +425 -52
  104. hud/tools/tests/test_computer_actions.py +34 -34
  105. hud/tools/tests/test_edit.py +259 -240
  106. hud/tools/tests/test_init.py +27 -27
  107. hud/tools/tests/test_playwright_tool.py +183 -183
  108. hud/tools/tests/test_tools.py +145 -157
  109. hud/tools/tests/test_utils.py +156 -156
  110. hud/tools/types.py +72 -0
  111. hud/tools/utils.py +50 -50
  112. hud/types.py +136 -89
  113. hud/utils/__init__.py +10 -16
  114. hud/utils/async_utils.py +65 -0
  115. hud/utils/design.py +168 -0
  116. hud/utils/mcp.py +55 -0
  117. hud/utils/progress.py +149 -149
  118. hud/utils/telemetry.py +66 -66
  119. hud/utils/tests/test_async_utils.py +173 -0
  120. hud/utils/tests/test_init.py +17 -21
  121. hud/utils/tests/test_progress.py +261 -225
  122. hud/utils/tests/test_telemetry.py +82 -37
  123. hud/utils/tests/test_version.py +8 -8
  124. hud/version.py +7 -7
  125. hud_python-0.4.1.dist-info/METADATA +476 -0
  126. hud_python-0.4.1.dist-info/RECORD +132 -0
  127. hud_python-0.4.1.dist-info/entry_points.txt +3 -0
  128. {hud_python-0.3.5.dist-info → hud_python-0.4.1.dist-info}/licenses/LICENSE +21 -21
  129. hud/adapters/__init__.py +0 -8
  130. hud/adapters/claude/__init__.py +0 -5
  131. hud/adapters/claude/adapter.py +0 -180
  132. hud/adapters/claude/tests/__init__.py +0 -1
  133. hud/adapters/claude/tests/test_adapter.py +0 -519
  134. hud/adapters/common/__init__.py +0 -6
  135. hud/adapters/common/adapter.py +0 -178
  136. hud/adapters/common/tests/test_adapter.py +0 -289
  137. hud/adapters/common/types.py +0 -446
  138. hud/adapters/operator/__init__.py +0 -5
  139. hud/adapters/operator/adapter.py +0 -108
  140. hud/adapters/operator/tests/__init__.py +0 -1
  141. hud/adapters/operator/tests/test_adapter.py +0 -370
  142. hud/agent/__init__.py +0 -19
  143. hud/agent/base.py +0 -126
  144. hud/agent/claude.py +0 -271
  145. hud/agent/langchain.py +0 -215
  146. hud/agent/misc/__init__.py +0 -3
  147. hud/agent/operator.py +0 -268
  148. hud/agent/tests/__init__.py +0 -1
  149. hud/agent/tests/test_base.py +0 -202
  150. hud/env/__init__.py +0 -11
  151. hud/env/client.py +0 -35
  152. hud/env/docker_client.py +0 -349
  153. hud/env/environment.py +0 -446
  154. hud/env/local_docker_client.py +0 -358
  155. hud/env/remote_client.py +0 -212
  156. hud/env/remote_docker_client.py +0 -292
  157. hud/gym.py +0 -130
  158. hud/job.py +0 -773
  159. hud/mcp/__init__.py +0 -17
  160. hud/mcp/base.py +0 -631
  161. hud/mcp/client.py +0 -312
  162. hud/mcp/tests/test_base.py +0 -512
  163. hud/mcp/tests/test_claude.py +0 -294
  164. hud/task.py +0 -149
  165. hud/taskset.py +0 -237
  166. hud/telemetry/_trace.py +0 -347
  167. hud/telemetry/context.py +0 -230
  168. hud/telemetry/exporter.py +0 -575
  169. hud/telemetry/instrumentation/__init__.py +0 -3
  170. hud/telemetry/instrumentation/mcp.py +0 -259
  171. hud/telemetry/instrumentation/registry.py +0 -59
  172. hud/telemetry/mcp_models.py +0 -270
  173. hud/telemetry/tests/__init__.py +0 -1
  174. hud/telemetry/tests/test_context.py +0 -210
  175. hud/telemetry/tests/test_trace.py +0 -312
  176. hud/tools/helper/README.md +0 -56
  177. hud/tools/helper/__init__.py +0 -9
  178. hud/tools/helper/mcp_server.py +0 -78
  179. hud/tools/helper/server_initialization.py +0 -115
  180. hud/tools/helper/utils.py +0 -58
  181. hud/trajectory.py +0 -94
  182. hud/utils/agent.py +0 -37
  183. hud/utils/common.py +0 -256
  184. hud/utils/config.py +0 -120
  185. hud/utils/deprecation.py +0 -115
  186. hud/utils/misc.py +0 -53
  187. hud/utils/tests/test_common.py +0 -277
  188. hud/utils/tests/test_config.py +0 -129
  189. hud_python-0.3.5.dist-info/METADATA +0 -284
  190. hud_python-0.3.5.dist-info/RECORD +0 -120
  191. hud/{adapters/common → shared}/tests/__init__.py +0 -0
  192. {hud_python-0.3.5.dist-info → hud_python-0.4.1.dist-info}/WHEEL +0 -0
@@ -1,503 +1,511 @@
1
- from __future__ import annotations
2
-
3
- import asyncio
4
- import base64
5
- import logging
6
- import os
7
- import shlex
8
- from pathlib import Path
9
- from tempfile import gettempdir
10
- from typing import Literal
11
- from uuid import uuid4
12
-
13
- from hud.tools.base import ToolResult
14
- from hud.tools.utils import run
15
-
16
- from .base import BaseExecutor
17
-
18
- OUTPUT_DIR = os.environ.get("SCREENSHOT_DIR")
19
- logger = logging.getLogger(__name__)
20
-
21
- # Map CLA standard keys to X11/XDO key names
22
- CLA_TO_XDO = {
23
- "enter": "Return",
24
- "tab": "Tab",
25
- "space": "space",
26
- "backspace": "BackSpace",
27
- "delete": "Delete",
28
- "escape": "Escape",
29
- "esc": "Escape",
30
- "up": "Up",
31
- "down": "Down",
32
- "left": "Left",
33
- "right": "Right",
34
- "shift": "Shift_L",
35
- "shiftleft": "Shift_L",
36
- "shiftright": "Shift_R",
37
- "ctrl": "Control_L",
38
- "ctrlleft": "Control_L",
39
- "ctrlright": "Control_R",
40
- "alt": "Alt_L",
41
- "altleft": "Alt_L",
42
- "altright": "Alt_R",
43
- "win": "Super_L",
44
- "winleft": "Super_L",
45
- "winright": "Super_R",
46
- "cmd": "Control_L", # Map cmd to ctrl for Linux
47
- "command": "Control_L",
48
- "super": "Super_L",
49
- "pageup": "Page_Up",
50
- "pagedown": "Page_Down",
51
- "home": "Home",
52
- "end": "End",
53
- "insert": "Insert",
54
- "pause": "Pause",
55
- "capslock": "Caps_Lock",
56
- "numlock": "Num_Lock",
57
- "scrolllock": "Scroll_Lock",
58
- "printscreen": "Print",
59
- "prtsc": "Print",
60
- # Function keys
61
- **{f"f{i}": f"F{i}" for i in range(1, 25)},
62
- }
63
-
64
-
65
- class XDOExecutor(BaseExecutor):
66
- """
67
- Low-level executor for xdotool commands.
68
- Handles display management and screenshot capture on Linux/X11 systems.
69
-
70
- This executor should only be instantiated when X11 display is available.
71
- """
72
-
73
- def __init__(self, display_num: int | None = None) -> None:
74
- """Initialize with optional display number."""
75
- super().__init__(display_num)
76
-
77
- if display_num is not None:
78
- self._display_prefix = f"DISPLAY=:{display_num} "
79
- else:
80
- self._display_prefix = ""
81
-
82
- self.xdotool = f"{self._display_prefix}xdotool"
83
- logger.info("XDOExecutor initialized")
84
-
85
- def _map_key(self, key: str) -> str:
86
- """Map CLA standard key to XDO key."""
87
- return CLA_TO_XDO.get(key.lower(), key)
88
-
89
- def _map_keys(self, keys: list[str]) -> list[str]:
90
- """Map CLA standard keys to XDO keys."""
91
- mapped_keys = []
92
- for key in keys:
93
- # Handle key combinations like "ctrl+a"
94
- if "+" in key:
95
- parts = key.split("+")
96
- mapped_parts = [self._map_key(part) for part in parts]
97
- mapped_keys.append("+".join(mapped_parts))
98
- else:
99
- mapped_keys.append(self._map_key(key))
100
- return mapped_keys
101
-
102
- @classmethod
103
- def is_available(cls) -> bool:
104
- """
105
- Check if xdotool and X11 display are available.
106
-
107
- Returns:
108
- True if xdotool can be used, False otherwise
109
- """
110
- display = os.environ.get("DISPLAY")
111
- if not display:
112
- return False
113
-
114
- # Try a simple xdotool command to test availability
115
- try:
116
- import subprocess
117
-
118
- # Try without display prefix if DISPLAY is already set
119
- result = subprocess.run( # noqa: S603
120
- ["xdotool", "getdisplaygeometry"], # noqa: S607
121
- capture_output=True,
122
- timeout=2,
123
- )
124
- return result.returncode == 0
125
- except (subprocess.TimeoutExpired, FileNotFoundError, Exception):
126
- return False
127
-
128
- async def execute(self, command: str, take_screenshot: bool = True) -> ToolResult:
129
- """
130
- Execute an xdotool command.
131
-
132
- Args:
133
- command: The xdotool command (without xdotool prefix)
134
- take_screenshot: Whether to capture a screenshot after execution
135
-
136
- Returns:
137
- ToolResult with output, error, and optional screenshot
138
- """
139
- full_command = f"{self.xdotool} {command}"
140
-
141
- # Execute command
142
- returncode, stdout, stderr = await run(full_command)
143
-
144
- # Prepare result
145
- result = ToolResult(
146
- output=stdout if stdout else None, error=stderr if stderr or returncode != 0 else None
147
- )
148
-
149
- # Take screenshot if requested
150
- if take_screenshot:
151
- await asyncio.sleep(self._screenshot_delay)
152
- screenshot = await self.screenshot()
153
- if screenshot:
154
- result = ToolResult(
155
- output=result.output, error=result.error, base64_image=screenshot
156
- )
157
-
158
- return result
159
-
160
- async def screenshot(self) -> str | None:
161
- """
162
- Take a screenshot and return base64 encoded image.
163
-
164
- Returns:
165
- Base64 encoded PNG image or None if failed
166
- """
167
- # Real screenshot using scrot
168
- if OUTPUT_DIR:
169
- output_dir = Path(OUTPUT_DIR)
170
- output_dir.mkdir(parents=True, exist_ok=True)
171
- screenshot_path = output_dir / f"screenshot_{uuid4().hex}.png"
172
- else:
173
- # Generate a unique path in system temp dir without opening a file
174
- screenshot_path = Path(gettempdir()) / f"screenshot_{uuid4().hex}.png"
175
-
176
- screenshot_cmd = f"{self._display_prefix}scrot -p {screenshot_path}"
177
-
178
- returncode, _, stderr = await run(screenshot_cmd)
179
-
180
- if returncode == 0 and screenshot_path.exists():
181
- try:
182
- image_data = screenshot_path.read_bytes()
183
- # Remove the file unless user requested persistence via env var
184
- if not OUTPUT_DIR:
185
- screenshot_path.unlink(missing_ok=True)
186
- return base64.b64encode(image_data).decode()
187
- except Exception:
188
- return None
189
-
190
- return None
191
-
192
- # ===== Helper Methods =====
193
-
194
- async def _hold_keys_context(self, keys: list[str] | None) -> None:
195
- """
196
- Press and hold keys, to be used with try/finally.
197
-
198
- Args:
199
- keys: List of keys to hold
200
-
201
- Example:
202
- await self._hold_keys_context(['ctrl'])
203
- try:
204
- # Do action with ctrl held
205
- finally:
206
- await self._release_keys(['ctrl'])
207
- """
208
- if keys:
209
- for key in keys:
210
- escaped_key = shlex.quote(key)
211
- await self.execute(f"keydown {escaped_key}", take_screenshot=False)
212
-
213
- async def _release_keys(self, keys: list[str] | None) -> None:
214
- """Release held keys."""
215
- if keys:
216
- for key in reversed(keys): # Release in reverse order
217
- escaped_key = shlex.quote(key)
218
- await self.execute(f"keyup {escaped_key}", take_screenshot=False)
219
-
220
- # ===== CLA Action Implementations =====
221
-
222
- async def click(
223
- self,
224
- x: int | None = None,
225
- y: int | None = None,
226
- button: Literal["left", "right", "middle", "back", "forward"] = "left",
227
- pattern: list[int] | None = None,
228
- hold_keys: list[str] | None = None,
229
- take_screenshot: bool = True,
230
- ) -> ToolResult:
231
- """Click at specified coordinates or current position."""
232
- # Map button names to xdotool button numbers
233
- button_map = {"left": 1, "right": 3, "middle": 2, "back": 8, "forward": 9}
234
- button_num = button_map.get(button, 1)
235
-
236
- # Hold keys if specified
237
- await self._hold_keys_context(hold_keys)
238
-
239
- try:
240
- # Handle multi-clicks based on pattern
241
- if pattern:
242
- click_count = len(pattern) + 1
243
- delay = pattern[0] if pattern else 10 # Use first delay for all clicks
244
-
245
- if x is not None and y is not None:
246
- cmd = f"mousemove {x} {y} click --repeat {click_count} --delay {delay} {button_num}" # noqa: E501
247
- else:
248
- cmd = f"click --repeat {click_count} --delay {delay} {button_num}"
249
- else:
250
- # Single click
251
- if x is not None and y is not None:
252
- cmd = f"mousemove {x} {y} click {button_num}"
253
- else:
254
- cmd = f"click {button_num}"
255
-
256
- result = await self.execute(cmd, take_screenshot=take_screenshot)
257
- finally:
258
- # Release held keys
259
- await self._release_keys(hold_keys)
260
-
261
- return result
262
-
263
- async def type(
264
- self, text: str, enter_after: bool = False, delay: int = 12, take_screenshot: bool = True
265
- ) -> ToolResult:
266
- """Type text with specified delay between keystrokes."""
267
- # Escape text for shell
268
- escaped_text = shlex.quote(text)
269
- cmd = f"type --delay {delay} -- {escaped_text}"
270
- result = await self.execute(cmd, take_screenshot=False)
271
-
272
- if enter_after:
273
- enter_result = await self.key("Return", take_screenshot=False)
274
- # Combine outputs
275
- combined_output = (result.output or "") + "\n" + (enter_result.output or "")
276
- combined_error = None
277
- if result.error or enter_result.error:
278
- combined_error = (result.error or "") + "\n" + (enter_result.error or "")
279
- result = ToolResult(output=combined_output.strip(), error=combined_error)
280
-
281
- if take_screenshot:
282
- screenshot = await self.screenshot()
283
- if screenshot:
284
- result = ToolResult(
285
- output=result.output, error=result.error, base64_image=screenshot
286
- )
287
-
288
- return result
289
-
290
- async def key(self, key_sequence: str, take_screenshot: bool = True) -> ToolResult:
291
- """Press a key or key combination."""
292
- return await self.execute(f"key -- {key_sequence}", take_screenshot=take_screenshot)
293
-
294
- async def press(self, keys: list[str], take_screenshot: bool = True) -> ToolResult:
295
- """Press a key combination (hotkey)."""
296
- # Map CLA keys to XDO keys
297
- mapped_keys = self._map_keys(keys)
298
- # Convert list of keys to xdotool format
299
- key_combo = "+".join(mapped_keys)
300
- return await self.key(key_combo, take_screenshot=take_screenshot)
301
-
302
- async def keydown(self, keys: list[str], take_screenshot: bool = True) -> ToolResult:
303
- """Press and hold keys."""
304
- # Map CLA keys to XDO keys
305
- mapped_keys = self._map_keys(keys)
306
- last_result = None
307
- for key in mapped_keys:
308
- escaped_key = shlex.quote(key)
309
- last_result = await self.execute(f"keydown {escaped_key}", take_screenshot=False)
310
-
311
- if take_screenshot and last_result:
312
- screenshot = await self.screenshot()
313
- if screenshot:
314
- last_result = ToolResult(
315
- output=last_result.output, error=last_result.error, base64_image=screenshot
316
- )
317
-
318
- return last_result or ToolResult()
319
-
320
- async def keyup(self, keys: list[str], take_screenshot: bool = True) -> ToolResult:
321
- """Release held keys."""
322
- # Map CLA keys to XDO keys
323
- mapped_keys = self._map_keys(keys)
324
- last_result = None
325
- for key in mapped_keys:
326
- escaped_key = shlex.quote(key)
327
- last_result = await self.execute(f"keyup {escaped_key}", take_screenshot=False)
328
-
329
- if take_screenshot and last_result:
330
- screenshot = await self.screenshot()
331
- if screenshot:
332
- last_result = ToolResult(
333
- output=last_result.output, error=last_result.error, base64_image=screenshot
334
- )
335
-
336
- return last_result or ToolResult()
337
-
338
- async def scroll(
339
- self,
340
- x: int | None = None,
341
- y: int | None = None,
342
- scroll_x: int | None = None,
343
- scroll_y: int | None = None,
344
- hold_keys: list[str] | None = None,
345
- take_screenshot: bool = True,
346
- ) -> ToolResult:
347
- """Scroll at specified position."""
348
- # Convert scroll amounts to xdotool format
349
- scroll_button_map = {"up": 4, "down": 5, "left": 6, "right": 7}
350
-
351
- # Hold keys if specified
352
- await self._hold_keys_context(hold_keys)
353
-
354
- try:
355
- # Handle vertical scroll
356
- if scroll_y and scroll_y != 0:
357
- direction = "down" if scroll_y > 0 else "up"
358
- amount = abs(scroll_y)
359
- button = scroll_button_map.get(direction, 5)
360
-
361
- if x is not None and y is not None:
362
- cmd = f"mousemove {x} {y} click --repeat {amount} {button}"
363
- else:
364
- cmd = f"click --repeat {amount} {button}"
365
-
366
- result = await self.execute(cmd, take_screenshot=take_screenshot)
367
-
368
- # Handle horizontal scroll
369
- elif scroll_x and scroll_x != 0:
370
- direction = "right" if scroll_x > 0 else "left"
371
- amount = abs(scroll_x)
372
- button = scroll_button_map.get(direction, 7)
373
-
374
- if x is not None and y is not None:
375
- cmd = f"mousemove {x} {y} click --repeat {amount} {button}"
376
- else:
377
- cmd = f"click --repeat {amount} {button}"
378
-
379
- result = await self.execute(cmd, take_screenshot=take_screenshot)
380
-
381
- else:
382
- result = ToolResult(output="No scroll amount specified")
383
- finally:
384
- # Release held keys
385
- await self._release_keys(hold_keys)
386
-
387
- return result
388
-
389
- async def move(
390
- self,
391
- x: int | None = None,
392
- y: int | None = None,
393
- offset_x: int | None = None,
394
- offset_y: int | None = None,
395
- take_screenshot: bool = True,
396
- ) -> ToolResult:
397
- """Move mouse cursor."""
398
- if x is not None and y is not None:
399
- # Absolute move
400
- return await self.execute(f"mousemove {x} {y}", take_screenshot=take_screenshot)
401
- elif offset_x is not None or offset_y is not None:
402
- # Relative move
403
- offset_x = offset_x or 0
404
- offset_y = offset_y or 0
405
- return await self.execute(
406
- f"mousemove_relative -- {offset_x} {offset_y}", take_screenshot=take_screenshot
407
- )
408
- else:
409
- return ToolResult(output="No move coordinates specified")
410
-
411
- async def drag(
412
- self,
413
- path: list[tuple[int, int]],
414
- pattern: list[int] | None = None,
415
- hold_keys: list[str] | None = None,
416
- take_screenshot: bool = True,
417
- ) -> ToolResult:
418
- """Drag along a path."""
419
- if len(path) < 2:
420
- return ToolResult(error="Drag path must have at least 2 points")
421
-
422
- # Hold keys if specified
423
- await self._hold_keys_context(hold_keys)
424
-
425
- try:
426
- # Start drag
427
- start_x, start_y = path[0]
428
- await self.execute(f"mousemove {start_x} {start_y}", take_screenshot=False)
429
- await self.execute("mousedown 1", take_screenshot=False)
430
-
431
- # Move through intermediate points
432
- for i, (x, y) in enumerate(path[1:], 1):
433
- # Apply delay if pattern is specified
434
- if pattern and i - 1 < len(pattern):
435
- await asyncio.sleep(pattern[i - 1] / 1000.0) # Convert ms to seconds
436
-
437
- await self.execute(f"mousemove {x} {y}", take_screenshot=False)
438
-
439
- # End drag
440
- await self.execute("mouseup 1", take_screenshot=False)
441
-
442
- # Take final screenshot if requested
443
- if take_screenshot:
444
- screenshot = await self.screenshot()
445
- result = ToolResult(
446
- output=f"Dragged along {len(path)} points", base64_image=screenshot
447
- )
448
- else:
449
- result = ToolResult(output=f"Dragged along {len(path)} points")
450
-
451
- finally:
452
- # Release held keys
453
- await self._release_keys(hold_keys)
454
-
455
- return result
456
-
457
- async def mouse_down(
458
- self,
459
- button: Literal["left", "right", "middle", "back", "forward"] = "left",
460
- take_screenshot: bool = True,
461
- ) -> ToolResult:
462
- """Press and hold a mouse button."""
463
- button_map = {"left": 1, "right": 3, "middle": 2, "back": 8, "forward": 9}
464
- button_num = button_map.get(button, 1)
465
- return await self.execute(f"mousedown {button_num}", take_screenshot=take_screenshot)
466
-
467
- async def mouse_up(
468
- self,
469
- button: Literal["left", "right", "middle", "back", "forward"] = "left",
470
- take_screenshot: bool = True,
471
- ) -> ToolResult:
472
- """Release a mouse button."""
473
- button_map = {"left": 1, "right": 3, "middle": 2, "back": 8, "forward": 9}
474
- button_num = button_map.get(button, 1)
475
- return await self.execute(f"mouseup {button_num}", take_screenshot=take_screenshot)
476
-
477
- async def hold_key(self, key: str, duration: float, take_screenshot: bool = True) -> ToolResult:
478
- """Hold a key for a specified duration."""
479
- # Map CLA key to XDO key
480
- mapped_key = self._map_key(key)
481
- escaped_key = shlex.quote(mapped_key)
482
-
483
- # Press the key
484
- await self.execute(f"keydown {escaped_key}", take_screenshot=False)
485
-
486
- # Wait
487
- await asyncio.sleep(duration)
488
-
489
- # Release the key
490
- result = await self.execute(f"keyup {escaped_key}", take_screenshot=False)
491
-
492
- if take_screenshot:
493
- screenshot = await self.screenshot()
494
- if screenshot:
495
- result = ToolResult(
496
- output=result.output, error=result.error, base64_image=screenshot
497
- )
498
-
499
- return result
500
-
501
- async def position(self) -> ToolResult:
502
- """Get current cursor position."""
503
- return await self.execute("getmouselocation", take_screenshot=False)
1
+ from __future__ import annotations
2
+
3
+ import asyncio
4
+ import base64
5
+ import logging
6
+ import os
7
+ import shlex
8
+ from pathlib import Path
9
+ from tempfile import gettempdir
10
+ from typing import Literal
11
+ from uuid import uuid4
12
+
13
+ from hud.tools.types import ContentResult
14
+ from hud.tools.utils import run
15
+
16
+ from .base import BaseExecutor
17
+
18
+ OUTPUT_DIR = os.environ.get("SCREENSHOT_DIR")
19
+ logger = logging.getLogger(__name__)
20
+
21
+ # Map CLA standard keys to X11/XDO key names
22
+ CLA_TO_XDO = {
23
+ "enter": "Return",
24
+ "tab": "Tab",
25
+ "space": "space",
26
+ "backspace": "BackSpace",
27
+ "delete": "Delete",
28
+ "escape": "Escape",
29
+ "esc": "Escape",
30
+ "up": "Up",
31
+ "down": "Down",
32
+ "left": "Left",
33
+ "right": "Right",
34
+ "shift": "Shift_L",
35
+ "shiftleft": "Shift_L",
36
+ "shiftright": "Shift_R",
37
+ "ctrl": "Control_L",
38
+ "ctrlleft": "Control_L",
39
+ "ctrlright": "Control_R",
40
+ "alt": "Alt_L",
41
+ "altleft": "Alt_L",
42
+ "altright": "Alt_R",
43
+ "win": "Super_L",
44
+ "winleft": "Super_L",
45
+ "winright": "Super_R",
46
+ "cmd": "Control_L", # Map cmd to ctrl for Linux
47
+ "command": "Control_L",
48
+ "super": "Super_L",
49
+ "pageup": "Page_Up",
50
+ "pagedown": "Page_Down",
51
+ "home": "Home",
52
+ "end": "End",
53
+ "insert": "Insert",
54
+ "pause": "Pause",
55
+ "capslock": "Caps_Lock",
56
+ "numlock": "Num_Lock",
57
+ "scrolllock": "Scroll_Lock",
58
+ "printscreen": "Print",
59
+ "prtsc": "Print",
60
+ # Function keys
61
+ **{f"f{i}": f"F{i}" for i in range(1, 25)},
62
+ }
63
+
64
+
65
+ class XDOExecutor(BaseExecutor):
66
+ """
67
+ Low-level executor for xdotool commands.
68
+ Handles display management and screenshot capture on Linux/X11 systems.
69
+
70
+ This executor should only be instantiated when X11 display is available.
71
+ """
72
+
73
+ def __init__(self, display_num: int | None = None) -> None:
74
+ """Initialize with optional display number."""
75
+ super().__init__(display_num)
76
+
77
+ if display_num is not None:
78
+ self._display_prefix = f"DISPLAY=:{display_num} "
79
+ else:
80
+ self._display_prefix = ""
81
+
82
+ self.xdotool = f"{self._display_prefix}xdotool"
83
+ logger.info("XDOExecutor initialized")
84
+
85
+ def _map_key(self, key: str) -> str:
86
+ """Map CLA standard key to XDO key."""
87
+ return CLA_TO_XDO.get(key.lower(), key)
88
+
89
+ def _map_keys(self, keys: list[str]) -> list[str]:
90
+ """Map CLA standard keys to XDO keys."""
91
+ mapped_keys = []
92
+ for key in keys:
93
+ # Handle key combinations like "ctrl+a"
94
+ if "+" in key:
95
+ parts = key.split("+")
96
+ mapped_parts = [self._map_key(part) for part in parts]
97
+ mapped_keys.append("+".join(mapped_parts))
98
+ else:
99
+ mapped_keys.append(self._map_key(key))
100
+ return mapped_keys
101
+
102
+ @classmethod
103
+ def is_available(cls) -> bool:
104
+ """
105
+ Check if xdotool and X11 display are available.
106
+
107
+ Returns:
108
+ True if xdotool can be used, False otherwise
109
+ """
110
+ display = os.environ.get("DISPLAY")
111
+ if not display:
112
+ return False
113
+
114
+ # Try a simple xdotool command to test availability
115
+ try:
116
+ import subprocess
117
+
118
+ # Try without display prefix if DISPLAY is already set
119
+ result = subprocess.run(
120
+ ["xdotool", "getdisplaygeometry"], # noqa: S607
121
+ capture_output=True,
122
+ timeout=2,
123
+ )
124
+ return result.returncode == 0
125
+ except (subprocess.TimeoutExpired, FileNotFoundError, Exception):
126
+ return False
127
+
128
+ async def execute(self, command: str, take_screenshot: bool = True) -> ContentResult:
129
+ """
130
+ Execute an xdotool command.
131
+
132
+ Args:
133
+ command: The xdotool command (without xdotool prefix)
134
+ take_screenshot: Whether to capture a screenshot after execution
135
+
136
+ Returns:
137
+ ContentResult with output, error, and optional screenshot
138
+ """
139
+ full_command = f"{self.xdotool} {command}"
140
+
141
+ # Execute command
142
+ returncode, stdout, stderr = await run(full_command)
143
+
144
+ # Prepare result
145
+ result = ContentResult(
146
+ output=stdout if stdout else None, error=stderr if stderr or returncode != 0 else None
147
+ )
148
+
149
+ # Take screenshot if requested
150
+ if take_screenshot:
151
+ await asyncio.sleep(self._screenshot_delay)
152
+ screenshot = await self.screenshot()
153
+ if screenshot:
154
+ result = ContentResult(
155
+ output=result.output, error=result.error, base64_image=screenshot
156
+ )
157
+
158
+ return result
159
+
160
+ async def screenshot(self) -> str | None:
161
+ """
162
+ Take a screenshot and return base64 encoded image.
163
+
164
+ Returns:
165
+ Base64 encoded PNG image or None if failed
166
+ """
167
+ # Real screenshot using scrot
168
+ if OUTPUT_DIR:
169
+ output_dir = Path(OUTPUT_DIR)
170
+ output_dir.mkdir(parents=True, exist_ok=True)
171
+ screenshot_path = output_dir / f"screenshot_{uuid4().hex}.png"
172
+ else:
173
+ # Generate a unique path in system temp dir without opening a file
174
+ screenshot_path = Path(gettempdir()) / f"screenshot_{uuid4().hex}.png"
175
+
176
+ screenshot_cmd = f"{self._display_prefix}scrot -p {screenshot_path}"
177
+
178
+ returncode, _, stderr = await run(screenshot_cmd)
179
+
180
+ if returncode == 0 and screenshot_path.exists():
181
+ try:
182
+ image_data = screenshot_path.read_bytes()
183
+ # Remove the file unless user requested persistence via env var
184
+ if not OUTPUT_DIR:
185
+ screenshot_path.unlink(missing_ok=True)
186
+ return base64.b64encode(image_data).decode()
187
+ except Exception:
188
+ return None
189
+
190
+ return None
191
+
192
+ # ===== Helper Methods =====
193
+
194
+ async def _hold_keys_context(self, keys: list[str] | None) -> None:
195
+ """
196
+ Press and hold keys, to be used with try/finally.
197
+
198
+ Args:
199
+ keys: List of keys to hold
200
+
201
+ Example:
202
+ await self._hold_keys_context(['ctrl'])
203
+ try:
204
+ # Do action with ctrl held
205
+ finally:
206
+ await self._release_keys(['ctrl'])
207
+ """
208
+ if keys:
209
+ for key in keys:
210
+ escaped_key = shlex.quote(key)
211
+ await self.execute(f"keydown {escaped_key}", take_screenshot=False)
212
+
213
+ async def _release_keys(self, keys: list[str] | None) -> None:
214
+ """Release held keys."""
215
+ if keys:
216
+ for key in reversed(keys): # Release in reverse order
217
+ escaped_key = shlex.quote(key)
218
+ await self.execute(f"keyup {escaped_key}", take_screenshot=False)
219
+
220
+ # ===== CLA Action Implementations =====
221
+
222
async def click(
    self,
    x: int | None = None,
    y: int | None = None,
    button: Literal["left", "right", "middle", "back", "forward"] = "left",
    pattern: list[int] | None = None,
    hold_keys: list[str] | None = None,
    take_screenshot: bool = True,
) -> ContentResult:
    """
    Click at the given coordinates, or at the current pointer position.

    Args:
        x: Target X coordinate; the pointer is moved only when both x and y are given
        y: Target Y coordinate
        button: Mouse button to click; unknown names fall back to the left button
        pattern: Delays between clicks for a multi-click. ``len(pattern) + 1``
            clicks are sent and the first delay is used for all of them
        hold_keys: Keys held down while the click is performed
        take_screenshot: Forwarded to ``execute`` for the click command

    Returns:
        The result of executing the click command
    """
    # Map button names to xdotool button numbers
    button_map = {"left": 1, "right": 3, "middle": 2, "back": 8, "forward": 9}
    button_num = button_map.get(button, 1)

    # Build the command from its optional parts instead of four near-identical branches
    move_prefix = f"mousemove {x} {y} " if x is not None and y is not None else ""
    if pattern:
        # Use first delay for all clicks
        repeat_opts = f"--repeat {len(pattern) + 1} --delay {pattern[0]} "
    else:
        repeat_opts = ""
    cmd = f"{move_prefix}click {repeat_opts}{button_num}"

    try:
        # Holding inside the try means a failure partway through pressing the
        # keys still reaches the release below
        await self._hold_keys_context(hold_keys)
        result = await self.execute(cmd, take_screenshot=take_screenshot)
    finally:
        # Release held keys
        await self._release_keys(hold_keys)

    return result
262
+
263
async def write(
    self, text: str, enter_after: bool = False, delay: int = 12, take_screenshot: bool = True
) -> ContentResult:
    """
    Type text with the given delay between keystrokes.

    Args:
        text: Text to type; it is shell-quoted before being placed in the command
        enter_after: When true, press "Return" after typing and merge both results
        delay: Value passed to the ``--delay`` option of the ``type`` command
        take_screenshot: When true, one screenshot is taken after all input is sent

    Returns:
        The typing result, merged with the "Return" result when ``enter_after``
        is set, and carrying a screenshot when one was requested and captured
    """
    # Escape text for shell
    escaped_text = shlex.quote(text)
    result = await self.execute(f"type --delay {delay} -- {escaped_text}", take_screenshot=False)

    if enter_after:
        enter_result = await self.key("Return", take_screenshot=False)
        combined_output = (result.output or "") + "\n" + (enter_result.output or "")
        # Join only the errors that exist so a single error does not pick up a
        # stray leading or trailing newline
        errors = [err for err in (result.error, enter_result.error) if err]
        combined_error = "\n".join(errors) if errors else None
        result = ContentResult(output=combined_output.strip(), error=combined_error)

    if take_screenshot:
        screenshot = await self.screenshot()
        if screenshot:
            result = ContentResult(
                output=result.output, error=result.error, base64_image=screenshot
            )

    return result
289
+
290
async def key(self, key_sequence: str, take_screenshot: bool = True) -> ContentResult:
    """
    Press a key or key combination.

    Args:
        key_sequence: One or more whitespace-separated keys or combinations,
            e.g. "Return" or "ctrl+a"
        take_screenshot: Forwarded to ``execute``

    Returns:
        The result of executing the ``key`` command
    """
    # Quote each token separately, matching how the other methods embed
    # caller-supplied text; ordinary key names pass through unchanged
    quoted_sequence = " ".join(shlex.quote(token) for token in key_sequence.split())
    return await self.execute(f"key -- {quoted_sequence}", take_screenshot=take_screenshot)
293
+
294
async def press(self, keys: list[str], take_screenshot: bool = True) -> ContentResult:
    """Press several keys together as one combination (hotkey)."""
    # Map CLA keys to XDO keys, then chain them with "+" into a single combo
    combo = "+".join(self._map_keys(keys))
    return await self.key(combo, take_screenshot=take_screenshot)
301
+
302
async def keydown(self, keys: list[str], take_screenshot: bool = True) -> ContentResult:
    """
    Press the given keys and leave them held.

    Returns the result of the last ``keydown`` command (with a screenshot
    attached when requested and captured), or an empty ``ContentResult``
    when there was nothing to press.
    """
    outcome = None
    # Map CLA keys to XDO keys and press them one by one
    for xdo_key in self._map_keys(keys):
        outcome = await self.execute(f"keydown {shlex.quote(xdo_key)}", take_screenshot=False)

    if take_screenshot and outcome:
        image = await self.screenshot()
        if image:
            outcome = ContentResult(
                output=outcome.output, error=outcome.error, base64_image=image
            )

    return outcome or ContentResult()
319
+
320
async def keyup(self, keys: list[str], take_screenshot: bool = True) -> ContentResult:
    """
    Release the given keys.

    Returns the result of the last ``keyup`` command (with a screenshot
    attached when requested and captured), or an empty ``ContentResult``
    when there was nothing to release.
    """
    outcome = None
    # Map CLA keys to XDO keys and release them one by one
    for xdo_key in self._map_keys(keys):
        outcome = await self.execute(f"keyup {shlex.quote(xdo_key)}", take_screenshot=False)

    if take_screenshot and outcome:
        image = await self.screenshot()
        if image:
            outcome = ContentResult(
                output=outcome.output, error=outcome.error, base64_image=image
            )

    return outcome or ContentResult()
337
+
338
async def scroll(
    self,
    x: int | None = None,
    y: int | None = None,
    scroll_x: int | None = None,
    scroll_y: int | None = None,
    hold_keys: list[str] | None = None,
    take_screenshot: bool = True,
) -> ContentResult:
    """
    Scroll at the given position, or at the current pointer position.

    Args:
        x: X coordinate to scroll at; the pointer is moved only when both x and y are given
        y: Y coordinate to scroll at
        scroll_x: Horizontal amount in pixels; positive scrolls right, negative left
        scroll_y: Vertical amount in pixels; positive scrolls down, negative up
        hold_keys: Keys held down while scrolling
        take_screenshot: Forwarded to ``execute`` for the final scroll command

    Returns:
        The result of the scroll command. When both axes are requested the
        vertical scroll runs first and the two results are merged. When no
        amount is given, a result saying so.
    """
    # Convert pixels to wheel clicks: 1 wheel click is treated as 100 pixels
    PIXELS_PER_WHEEL_CLICK = 100

    # (wheel button, pixel distance) for every requested axis, vertical first.
    # Wheel buttons: 4 = up, 5 = down, 6 = left, 7 = right
    wheel_moves: list[tuple[int, int]] = []
    if scroll_y:
        wheel_moves.append((5 if scroll_y > 0 else 4, abs(scroll_y)))
    if scroll_x:
        # Previously ignored whenever scroll_y was also given
        wheel_moves.append((7 if scroll_x > 0 else 6, abs(scroll_x)))

    move_prefix = f"mousemove {x} {y} " if x is not None and y is not None else ""

    try:
        # Holding inside the try means a failure partway through pressing the
        # keys still reaches the release below
        await self._hold_keys_context(hold_keys)

        if not wheel_moves:
            return ContentResult(output="No scroll amount specified")

        results = []
        for index, (wheel_button, pixels) in enumerate(wheel_moves):
            # Always send at least one wheel click
            clicks = max(1, pixels // PIXELS_PER_WHEEL_CLICK)
            is_last = index == len(wheel_moves) - 1
            results.append(
                await self.execute(
                    f"{move_prefix}click --repeat {clicks} {wheel_button}",
                    # Only the final command needs to capture the screen
                    take_screenshot=take_screenshot and is_last,
                )
            )
    finally:
        # Release held keys
        await self._release_keys(hold_keys)

    if len(results) == 1:
        return results[0]

    # Both axes were scrolled: merge outputs/errors and keep the last screenshot
    return ContentResult(
        output="\n".join(r.output for r in results if r.output) or None,
        error="\n".join(r.error for r in results if r.error) or None,
        base64_image=results[-1].base64_image,
    )
394
+
395
async def move(
    self,
    x: int | None = None,
    y: int | None = None,
    offset_x: int | None = None,
    offset_y: int | None = None,
    take_screenshot: bool = True,
) -> ContentResult:
    """
    Move the mouse cursor to an absolute position or by a relative offset.

    The absolute position wins when both x and y are given; otherwise any
    supplied offset is used, with a missing component treated as 0.
    """
    has_absolute_target = x is not None and y is not None
    if has_absolute_target:
        return await self.execute(f"mousemove {x} {y}", take_screenshot=take_screenshot)

    if offset_x is None and offset_y is None:
        return ContentResult(output="No move coordinates specified")

    # Relative move; "--" keeps negative offsets from being read as options
    dx = offset_x or 0
    dy = offset_y or 0
    return await self.execute(
        f"mousemove_relative -- {dx} {dy}", take_screenshot=take_screenshot
    )
416
+
417
async def drag(
    self,
    path: list[tuple[int, int]],
    pattern: list[int] | None = None,
    hold_keys: list[str] | None = None,
    take_screenshot: bool = True,
) -> ContentResult:
    """
    Drag with mouse button 1 along a path of points.

    Args:
        path: Points to drag through; the first is where the button goes down
        pattern: Delays in milliseconds; ``pattern[i]`` is waited before moving
            to ``path[i + 1]``. Moves without a matching delay happen immediately
        hold_keys: Keys held down for the duration of the drag
        take_screenshot: When true, a screenshot is taken after the button is released

    Returns:
        A result describing the drag, or an error result when the path has
        fewer than 2 points
    """
    if len(path) < 2:
        return ContentResult(error="Drag path must have at least 2 points")

    delays = pattern or []

    try:
        # Holding inside the try means a failure partway through pressing the
        # keys still reaches the release below
        await self._hold_keys_context(hold_keys)

        # Start drag
        start_x, start_y = path[0]
        await self.execute(f"mousemove {start_x} {start_y}", take_screenshot=False)
        await self.execute("mousedown 1", take_screenshot=False)

        try:
            # Move through the remaining points
            for index, (x, y) in enumerate(path[1:]):
                if index < len(delays):
                    await asyncio.sleep(delays[index] / 1000.0)  # Convert ms to seconds
                await self.execute(f"mousemove {x} {y}", take_screenshot=False)
        finally:
            # End drag: always release the button, even if a move fails or the
            # wait is cancelled, so it is never left pressed
            await self.execute("mouseup 1", take_screenshot=False)

        summary = f"Dragged along {len(path)} points"
        if take_screenshot:
            screenshot = await self.screenshot()
            result = ContentResult(output=summary, base64_image=screenshot)
        else:
            result = ContentResult(output=summary)
    finally:
        # Release held keys
        await self._release_keys(hold_keys)

    return result
462
+
463
async def mouse_down(
    self,
    button: Literal["left", "right", "middle", "back", "forward"] = "left",
    take_screenshot: bool = True,
) -> ContentResult:
    """Press a mouse button and keep it held; unknown names fall back to button 1."""
    xdo_buttons = {"left": 1, "middle": 2, "right": 3, "back": 8, "forward": 9}
    number = xdo_buttons[button] if button in xdo_buttons else 1
    command = f"mousedown {number}"
    return await self.execute(command, take_screenshot=take_screenshot)
472
+
473
async def mouse_up(
    self,
    button: Literal["left", "right", "middle", "back", "forward"] = "left",
    take_screenshot: bool = True,
) -> ContentResult:
    """Release a mouse button; unknown names fall back to button 1."""
    xdo_buttons = {"left": 1, "middle": 2, "right": 3, "back": 8, "forward": 9}
    number = xdo_buttons[button] if button in xdo_buttons else 1
    command = f"mouseup {number}"
    return await self.execute(command, take_screenshot=take_screenshot)
482
+
483
async def hold_key(
    self, key: str, duration: float, take_screenshot: bool = True
) -> ContentResult:
    """
    Hold a key down for a given duration, then release it.

    Args:
        key: CLA key name; it is mapped to its XDO name before use
        duration: How long to keep the key held, in seconds
        take_screenshot: When true, a screenshot is taken after the key is released

    Returns:
        The result of the ``keyup`` command, carrying a screenshot when one
        was requested and captured
    """
    # Map CLA key to XDO key
    escaped_key = shlex.quote(self._map_key(key))

    # Press the key
    await self.execute(f"keydown {escaped_key}", take_screenshot=False)

    try:
        # Wait
        await asyncio.sleep(duration)
    finally:
        # Always release the key, even if the wait is cancelled or raises, so
        # it is never left pressed
        result = await self.execute(f"keyup {escaped_key}", take_screenshot=False)

    if take_screenshot:
        screenshot = await self.screenshot()
        if screenshot:
            result = ContentResult(
                output=result.output, error=result.error, base64_image=screenshot
            )

    return result
508
+
509
async def position(self) -> ContentResult:
    """Return the result of ``getmouselocation``; no screenshot is taken."""
    location = await self.execute("getmouselocation", take_screenshot=False)
    return location