hud-python 0.4.1__py3-none-any.whl → 0.4.3__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release.


This version of hud-python might be problematic. Click here for more details.

Files changed (130) hide show
  1. hud/__init__.py +22 -22
  2. hud/agents/__init__.py +13 -15
  3. hud/agents/base.py +599 -599
  4. hud/agents/claude.py +373 -373
  5. hud/agents/langchain.py +261 -250
  6. hud/agents/misc/__init__.py +7 -7
  7. hud/agents/misc/response_agent.py +82 -80
  8. hud/agents/openai.py +352 -352
  9. hud/agents/openai_chat_generic.py +154 -154
  10. hud/agents/tests/__init__.py +1 -1
  11. hud/agents/tests/test_base.py +742 -742
  12. hud/agents/tests/test_claude.py +324 -324
  13. hud/agents/tests/test_client.py +363 -363
  14. hud/agents/tests/test_openai.py +237 -237
  15. hud/cli/__init__.py +617 -617
  16. hud/cli/__main__.py +8 -8
  17. hud/cli/analyze.py +371 -371
  18. hud/cli/analyze_metadata.py +230 -230
  19. hud/cli/build.py +498 -427
  20. hud/cli/clone.py +185 -185
  21. hud/cli/cursor.py +92 -92
  22. hud/cli/debug.py +392 -392
  23. hud/cli/docker_utils.py +83 -83
  24. hud/cli/init.py +280 -281
  25. hud/cli/interactive.py +353 -353
  26. hud/cli/mcp_server.py +764 -756
  27. hud/cli/pull.py +330 -336
  28. hud/cli/push.py +404 -370
  29. hud/cli/remote_runner.py +311 -311
  30. hud/cli/runner.py +160 -160
  31. hud/cli/tests/__init__.py +3 -3
  32. hud/cli/tests/test_analyze.py +284 -284
  33. hud/cli/tests/test_cli_init.py +265 -265
  34. hud/cli/tests/test_cli_main.py +27 -27
  35. hud/cli/tests/test_clone.py +142 -142
  36. hud/cli/tests/test_cursor.py +253 -253
  37. hud/cli/tests/test_debug.py +453 -453
  38. hud/cli/tests/test_mcp_server.py +139 -139
  39. hud/cli/tests/test_utils.py +388 -388
  40. hud/cli/utils.py +263 -263
  41. hud/clients/README.md +143 -143
  42. hud/clients/__init__.py +16 -16
  43. hud/clients/base.py +378 -379
  44. hud/clients/fastmcp.py +222 -222
  45. hud/clients/mcp_use.py +298 -278
  46. hud/clients/tests/__init__.py +1 -1
  47. hud/clients/tests/test_client_integration.py +111 -111
  48. hud/clients/tests/test_fastmcp.py +342 -342
  49. hud/clients/tests/test_protocol.py +188 -188
  50. hud/clients/utils/__init__.py +1 -1
  51. hud/clients/utils/retry_transport.py +160 -160
  52. hud/datasets.py +327 -322
  53. hud/misc/__init__.py +1 -1
  54. hud/misc/claude_plays_pokemon.py +292 -292
  55. hud/otel/__init__.py +35 -35
  56. hud/otel/collector.py +142 -142
  57. hud/otel/config.py +164 -164
  58. hud/otel/context.py +536 -536
  59. hud/otel/exporters.py +366 -366
  60. hud/otel/instrumentation.py +97 -97
  61. hud/otel/processors.py +118 -118
  62. hud/otel/tests/__init__.py +1 -1
  63. hud/otel/tests/test_processors.py +197 -197
  64. hud/server/__init__.py +5 -5
  65. hud/server/context.py +114 -114
  66. hud/server/helper/__init__.py +5 -5
  67. hud/server/low_level.py +132 -132
  68. hud/server/server.py +170 -166
  69. hud/server/tests/__init__.py +3 -3
  70. hud/settings.py +73 -73
  71. hud/shared/__init__.py +5 -5
  72. hud/shared/exceptions.py +180 -180
  73. hud/shared/requests.py +264 -264
  74. hud/shared/tests/test_exceptions.py +157 -157
  75. hud/shared/tests/test_requests.py +275 -275
  76. hud/telemetry/__init__.py +25 -25
  77. hud/telemetry/instrument.py +379 -379
  78. hud/telemetry/job.py +309 -309
  79. hud/telemetry/replay.py +74 -74
  80. hud/telemetry/trace.py +83 -83
  81. hud/tools/__init__.py +33 -33
  82. hud/tools/base.py +365 -365
  83. hud/tools/bash.py +161 -161
  84. hud/tools/computer/__init__.py +15 -15
  85. hud/tools/computer/anthropic.py +437 -437
  86. hud/tools/computer/hud.py +376 -376
  87. hud/tools/computer/openai.py +295 -295
  88. hud/tools/computer/settings.py +82 -82
  89. hud/tools/edit.py +314 -314
  90. hud/tools/executors/__init__.py +30 -30
  91. hud/tools/executors/base.py +539 -539
  92. hud/tools/executors/pyautogui.py +621 -621
  93. hud/tools/executors/tests/__init__.py +1 -1
  94. hud/tools/executors/tests/test_base_executor.py +338 -338
  95. hud/tools/executors/tests/test_pyautogui_executor.py +165 -165
  96. hud/tools/executors/xdo.py +511 -511
  97. hud/tools/playwright.py +412 -412
  98. hud/tools/tests/__init__.py +3 -3
  99. hud/tools/tests/test_base.py +282 -282
  100. hud/tools/tests/test_bash.py +158 -158
  101. hud/tools/tests/test_bash_extended.py +197 -197
  102. hud/tools/tests/test_computer.py +425 -425
  103. hud/tools/tests/test_computer_actions.py +34 -34
  104. hud/tools/tests/test_edit.py +259 -259
  105. hud/tools/tests/test_init.py +27 -27
  106. hud/tools/tests/test_playwright_tool.py +183 -183
  107. hud/tools/tests/test_tools.py +145 -145
  108. hud/tools/tests/test_utils.py +156 -156
  109. hud/tools/types.py +72 -72
  110. hud/tools/utils.py +50 -50
  111. hud/types.py +136 -136
  112. hud/utils/__init__.py +10 -10
  113. hud/utils/async_utils.py +65 -65
  114. hud/utils/design.py +236 -168
  115. hud/utils/mcp.py +55 -55
  116. hud/utils/progress.py +149 -149
  117. hud/utils/telemetry.py +66 -66
  118. hud/utils/tests/test_async_utils.py +173 -173
  119. hud/utils/tests/test_init.py +17 -17
  120. hud/utils/tests/test_progress.py +261 -261
  121. hud/utils/tests/test_telemetry.py +82 -82
  122. hud/utils/tests/test_version.py +8 -8
  123. hud/version.py +7 -7
  124. {hud_python-0.4.1.dist-info → hud_python-0.4.3.dist-info}/METADATA +10 -8
  125. hud_python-0.4.3.dist-info/RECORD +131 -0
  126. {hud_python-0.4.1.dist-info → hud_python-0.4.3.dist-info}/licenses/LICENSE +21 -21
  127. hud/agents/art.py +0 -101
  128. hud_python-0.4.1.dist-info/RECORD +0 -132
  129. {hud_python-0.4.1.dist-info → hud_python-0.4.3.dist-info}/WHEEL +0 -0
  130. {hud_python-0.4.1.dist-info → hud_python-0.4.3.dist-info}/entry_points.txt +0 -0
@@ -1,511 +1,511 @@
1
- from __future__ import annotations
2
-
3
- import asyncio
4
- import base64
5
- import logging
6
- import os
7
- import shlex
8
- from pathlib import Path
9
- from tempfile import gettempdir
10
- from typing import Literal
11
- from uuid import uuid4
12
-
13
- from hud.tools.types import ContentResult
14
- from hud.tools.utils import run
15
-
16
- from .base import BaseExecutor
17
-
18
- OUTPUT_DIR = os.environ.get("SCREENSHOT_DIR")
19
- logger = logging.getLogger(__name__)
20
-
21
- # Map CLA standard keys to X11/XDO key names
22
- CLA_TO_XDO = {
23
- "enter": "Return",
24
- "tab": "Tab",
25
- "space": "space",
26
- "backspace": "BackSpace",
27
- "delete": "Delete",
28
- "escape": "Escape",
29
- "esc": "Escape",
30
- "up": "Up",
31
- "down": "Down",
32
- "left": "Left",
33
- "right": "Right",
34
- "shift": "Shift_L",
35
- "shiftleft": "Shift_L",
36
- "shiftright": "Shift_R",
37
- "ctrl": "Control_L",
38
- "ctrlleft": "Control_L",
39
- "ctrlright": "Control_R",
40
- "alt": "Alt_L",
41
- "altleft": "Alt_L",
42
- "altright": "Alt_R",
43
- "win": "Super_L",
44
- "winleft": "Super_L",
45
- "winright": "Super_R",
46
- "cmd": "Control_L", # Map cmd to ctrl for Linux
47
- "command": "Control_L",
48
- "super": "Super_L",
49
- "pageup": "Page_Up",
50
- "pagedown": "Page_Down",
51
- "home": "Home",
52
- "end": "End",
53
- "insert": "Insert",
54
- "pause": "Pause",
55
- "capslock": "Caps_Lock",
56
- "numlock": "Num_Lock",
57
- "scrolllock": "Scroll_Lock",
58
- "printscreen": "Print",
59
- "prtsc": "Print",
60
- # Function keys
61
- **{f"f{i}": f"F{i}" for i in range(1, 25)},
62
- }
63
-
64
-
65
- class XDOExecutor(BaseExecutor):
66
- """
67
- Low-level executor for xdotool commands.
68
- Handles display management and screenshot capture on Linux/X11 systems.
69
-
70
- This executor should only be instantiated when X11 display is available.
71
- """
72
-
73
- def __init__(self, display_num: int | None = None) -> None:
74
- """Initialize with optional display number."""
75
- super().__init__(display_num)
76
-
77
- if display_num is not None:
78
- self._display_prefix = f"DISPLAY=:{display_num} "
79
- else:
80
- self._display_prefix = ""
81
-
82
- self.xdotool = f"{self._display_prefix}xdotool"
83
- logger.info("XDOExecutor initialized")
84
-
85
- def _map_key(self, key: str) -> str:
86
- """Map CLA standard key to XDO key."""
87
- return CLA_TO_XDO.get(key.lower(), key)
88
-
89
- def _map_keys(self, keys: list[str]) -> list[str]:
90
- """Map CLA standard keys to XDO keys."""
91
- mapped_keys = []
92
- for key in keys:
93
- # Handle key combinations like "ctrl+a"
94
- if "+" in key:
95
- parts = key.split("+")
96
- mapped_parts = [self._map_key(part) for part in parts]
97
- mapped_keys.append("+".join(mapped_parts))
98
- else:
99
- mapped_keys.append(self._map_key(key))
100
- return mapped_keys
101
-
102
- @classmethod
103
- def is_available(cls) -> bool:
104
- """
105
- Check if xdotool and X11 display are available.
106
-
107
- Returns:
108
- True if xdotool can be used, False otherwise
109
- """
110
- display = os.environ.get("DISPLAY")
111
- if not display:
112
- return False
113
-
114
- # Try a simple xdotool command to test availability
115
- try:
116
- import subprocess
117
-
118
- # Try without display prefix if DISPLAY is already set
119
- result = subprocess.run(
120
- ["xdotool", "getdisplaygeometry"], # noqa: S607
121
- capture_output=True,
122
- timeout=2,
123
- )
124
- return result.returncode == 0
125
- except (subprocess.TimeoutExpired, FileNotFoundError, Exception):
126
- return False
127
-
128
- async def execute(self, command: str, take_screenshot: bool = True) -> ContentResult:
129
- """
130
- Execute an xdotool command.
131
-
132
- Args:
133
- command: The xdotool command (without xdotool prefix)
134
- take_screenshot: Whether to capture a screenshot after execution
135
-
136
- Returns:
137
- ContentResult with output, error, and optional screenshot
138
- """
139
- full_command = f"{self.xdotool} {command}"
140
-
141
- # Execute command
142
- returncode, stdout, stderr = await run(full_command)
143
-
144
- # Prepare result
145
- result = ContentResult(
146
- output=stdout if stdout else None, error=stderr if stderr or returncode != 0 else None
147
- )
148
-
149
- # Take screenshot if requested
150
- if take_screenshot:
151
- await asyncio.sleep(self._screenshot_delay)
152
- screenshot = await self.screenshot()
153
- if screenshot:
154
- result = ContentResult(
155
- output=result.output, error=result.error, base64_image=screenshot
156
- )
157
-
158
- return result
159
-
160
- async def screenshot(self) -> str | None:
161
- """
162
- Take a screenshot and return base64 encoded image.
163
-
164
- Returns:
165
- Base64 encoded PNG image or None if failed
166
- """
167
- # Real screenshot using scrot
168
- if OUTPUT_DIR:
169
- output_dir = Path(OUTPUT_DIR)
170
- output_dir.mkdir(parents=True, exist_ok=True)
171
- screenshot_path = output_dir / f"screenshot_{uuid4().hex}.png"
172
- else:
173
- # Generate a unique path in system temp dir without opening a file
174
- screenshot_path = Path(gettempdir()) / f"screenshot_{uuid4().hex}.png"
175
-
176
- screenshot_cmd = f"{self._display_prefix}scrot -p {screenshot_path}"
177
-
178
- returncode, _, stderr = await run(screenshot_cmd)
179
-
180
- if returncode == 0 and screenshot_path.exists():
181
- try:
182
- image_data = screenshot_path.read_bytes()
183
- # Remove the file unless user requested persistence via env var
184
- if not OUTPUT_DIR:
185
- screenshot_path.unlink(missing_ok=True)
186
- return base64.b64encode(image_data).decode()
187
- except Exception:
188
- return None
189
-
190
- return None
191
-
192
- # ===== Helper Methods =====
193
-
194
- async def _hold_keys_context(self, keys: list[str] | None) -> None:
195
- """
196
- Press and hold keys, to be used with try/finally.
197
-
198
- Args:
199
- keys: List of keys to hold
200
-
201
- Example:
202
- await self._hold_keys_context(['ctrl'])
203
- try:
204
- # Do action with ctrl held
205
- finally:
206
- await self._release_keys(['ctrl'])
207
- """
208
- if keys:
209
- for key in keys:
210
- escaped_key = shlex.quote(key)
211
- await self.execute(f"keydown {escaped_key}", take_screenshot=False)
212
-
213
- async def _release_keys(self, keys: list[str] | None) -> None:
214
- """Release held keys."""
215
- if keys:
216
- for key in reversed(keys): # Release in reverse order
217
- escaped_key = shlex.quote(key)
218
- await self.execute(f"keyup {escaped_key}", take_screenshot=False)
219
-
220
- # ===== CLA Action Implementations =====
221
-
222
- async def click(
223
- self,
224
- x: int | None = None,
225
- y: int | None = None,
226
- button: Literal["left", "right", "middle", "back", "forward"] = "left",
227
- pattern: list[int] | None = None,
228
- hold_keys: list[str] | None = None,
229
- take_screenshot: bool = True,
230
- ) -> ContentResult:
231
- """Click at specified coordinates or current position."""
232
- # Map button names to xdotool button numbers
233
- button_map = {"left": 1, "right": 3, "middle": 2, "back": 8, "forward": 9}
234
- button_num = button_map.get(button, 1)
235
-
236
- # Hold keys if specified
237
- await self._hold_keys_context(hold_keys)
238
-
239
- try:
240
- # Handle multi-clicks based on pattern
241
- if pattern:
242
- click_count = len(pattern) + 1
243
- delay = pattern[0] if pattern else 10 # Use first delay for all clicks
244
-
245
- if x is not None and y is not None:
246
- cmd = f"mousemove {x} {y} click --repeat {click_count} --delay {delay} {button_num}" # noqa: E501
247
- else:
248
- cmd = f"click --repeat {click_count} --delay {delay} {button_num}"
249
- else:
250
- # Single click
251
- if x is not None and y is not None:
252
- cmd = f"mousemove {x} {y} click {button_num}"
253
- else:
254
- cmd = f"click {button_num}"
255
-
256
- result = await self.execute(cmd, take_screenshot=take_screenshot)
257
- finally:
258
- # Release held keys
259
- await self._release_keys(hold_keys)
260
-
261
- return result
262
-
263
- async def write(
264
- self, text: str, enter_after: bool = False, delay: int = 12, take_screenshot: bool = True
265
- ) -> ContentResult:
266
- """Type text with specified delay between keystrokes."""
267
- # Escape text for shell
268
- escaped_text = shlex.quote(text)
269
- cmd = f"type --delay {delay} -- {escaped_text}"
270
- result = await self.execute(cmd, take_screenshot=False)
271
-
272
- if enter_after:
273
- enter_result = await self.key("Return", take_screenshot=False)
274
- # Combine outputs
275
- combined_output = (result.output or "") + "\n" + (enter_result.output or "")
276
- combined_error = None
277
- if result.error or enter_result.error:
278
- combined_error = (result.error or "") + "\n" + (enter_result.error or "")
279
- result = ContentResult(output=combined_output.strip(), error=combined_error)
280
-
281
- if take_screenshot:
282
- screenshot = await self.screenshot()
283
- if screenshot:
284
- result = ContentResult(
285
- output=result.output, error=result.error, base64_image=screenshot
286
- )
287
-
288
- return result
289
-
290
- async def key(self, key_sequence: str, take_screenshot: bool = True) -> ContentResult:
291
- """Press a key or key combination."""
292
- return await self.execute(f"key -- {key_sequence}", take_screenshot=take_screenshot)
293
-
294
- async def press(self, keys: list[str], take_screenshot: bool = True) -> ContentResult:
295
- """Press a key combination (hotkey)."""
296
- # Map CLA keys to XDO keys
297
- mapped_keys = self._map_keys(keys)
298
- # Convert list of keys to xdotool format
299
- key_combo = "+".join(mapped_keys)
300
- return await self.key(key_combo, take_screenshot=take_screenshot)
301
-
302
- async def keydown(self, keys: list[str], take_screenshot: bool = True) -> ContentResult:
303
- """Press and hold keys."""
304
- # Map CLA keys to XDO keys
305
- mapped_keys = self._map_keys(keys)
306
- last_result = None
307
- for key in mapped_keys:
308
- escaped_key = shlex.quote(key)
309
- last_result = await self.execute(f"keydown {escaped_key}", take_screenshot=False)
310
-
311
- if take_screenshot and last_result:
312
- screenshot = await self.screenshot()
313
- if screenshot:
314
- last_result = ContentResult(
315
- output=last_result.output, error=last_result.error, base64_image=screenshot
316
- )
317
-
318
- return last_result or ContentResult()
319
-
320
- async def keyup(self, keys: list[str], take_screenshot: bool = True) -> ContentResult:
321
- """Release held keys."""
322
- # Map CLA keys to XDO keys
323
- mapped_keys = self._map_keys(keys)
324
- last_result = None
325
- for key in mapped_keys:
326
- escaped_key = shlex.quote(key)
327
- last_result = await self.execute(f"keyup {escaped_key}", take_screenshot=False)
328
-
329
- if take_screenshot and last_result:
330
- screenshot = await self.screenshot()
331
- if screenshot:
332
- last_result = ContentResult(
333
- output=last_result.output, error=last_result.error, base64_image=screenshot
334
- )
335
-
336
- return last_result or ContentResult()
337
-
338
- async def scroll(
339
- self,
340
- x: int | None = None,
341
- y: int | None = None,
342
- scroll_x: int | None = None,
343
- scroll_y: int | None = None,
344
- hold_keys: list[str] | None = None,
345
- take_screenshot: bool = True,
346
- ) -> ContentResult:
347
- """Scroll at specified position."""
348
- # Convert scroll amounts to xdotool format
349
- scroll_button_map = {"up": 4, "down": 5, "left": 6, "right": 7}
350
-
351
- # Convert pixels to wheel clicks
352
- # Standard conversion: 1 wheel click ≈ 100 pixels
353
- PIXELS_PER_WHEEL_CLICK = 100
354
-
355
- # Hold keys if specified
356
- await self._hold_keys_context(hold_keys)
357
-
358
- try:
359
- # Handle vertical scroll
360
- if scroll_y and scroll_y != 0:
361
- direction = "down" if scroll_y > 0 else "up"
362
- # Convert pixels to clicks
363
- clicks = max(1, abs(scroll_y) // PIXELS_PER_WHEEL_CLICK)
364
- button = scroll_button_map.get(direction, 5)
365
-
366
- if x is not None and y is not None:
367
- cmd = f"mousemove {x} {y} click --repeat {clicks} {button}"
368
- else:
369
- cmd = f"click --repeat {clicks} {button}"
370
-
371
- result = await self.execute(cmd, take_screenshot=take_screenshot)
372
-
373
- # Handle horizontal scroll
374
- elif scroll_x and scroll_x != 0:
375
- direction = "right" if scroll_x > 0 else "left"
376
- # Convert pixels to clicks
377
- clicks = max(1, abs(scroll_x) // PIXELS_PER_WHEEL_CLICK)
378
- button = scroll_button_map.get(direction, 7)
379
-
380
- if x is not None and y is not None:
381
- cmd = f"mousemove {x} {y} click --repeat {clicks} {button}"
382
- else:
383
- cmd = f"click --repeat {clicks} {button}"
384
-
385
- result = await self.execute(cmd, take_screenshot=take_screenshot)
386
-
387
- else:
388
- result = ContentResult(output="No scroll amount specified")
389
- finally:
390
- # Release held keys
391
- await self._release_keys(hold_keys)
392
-
393
- return result
394
-
395
- async def move(
396
- self,
397
- x: int | None = None,
398
- y: int | None = None,
399
- offset_x: int | None = None,
400
- offset_y: int | None = None,
401
- take_screenshot: bool = True,
402
- ) -> ContentResult:
403
- """Move mouse cursor."""
404
- if x is not None and y is not None:
405
- # Absolute move
406
- return await self.execute(f"mousemove {x} {y}", take_screenshot=take_screenshot)
407
- elif offset_x is not None or offset_y is not None:
408
- # Relative move
409
- offset_x = offset_x or 0
410
- offset_y = offset_y or 0
411
- return await self.execute(
412
- f"mousemove_relative -- {offset_x} {offset_y}", take_screenshot=take_screenshot
413
- )
414
- else:
415
- return ContentResult(output="No move coordinates specified")
416
-
417
- async def drag(
418
- self,
419
- path: list[tuple[int, int]],
420
- pattern: list[int] | None = None,
421
- hold_keys: list[str] | None = None,
422
- take_screenshot: bool = True,
423
- ) -> ContentResult:
424
- """Drag along a path."""
425
- if len(path) < 2:
426
- return ContentResult(error="Drag path must have at least 2 points")
427
-
428
- # Hold keys if specified
429
- await self._hold_keys_context(hold_keys)
430
-
431
- try:
432
- # Start drag
433
- start_x, start_y = path[0]
434
- await self.execute(f"mousemove {start_x} {start_y}", take_screenshot=False)
435
- await self.execute("mousedown 1", take_screenshot=False)
436
-
437
- # Move through intermediate points
438
- for i, (x, y) in enumerate(path[1:], 1):
439
- # Apply delay if pattern is specified
440
- if pattern and i - 1 < len(pattern):
441
- await asyncio.sleep(pattern[i - 1] / 1000.0) # Convert ms to seconds
442
-
443
- await self.execute(f"mousemove {x} {y}", take_screenshot=False)
444
-
445
- # End drag
446
- await self.execute("mouseup 1", take_screenshot=False)
447
-
448
- # Take final screenshot if requested
449
- if take_screenshot:
450
- screenshot = await self.screenshot()
451
- result = ContentResult(
452
- output=f"Dragged along {len(path)} points", base64_image=screenshot
453
- )
454
- else:
455
- result = ContentResult(output=f"Dragged along {len(path)} points")
456
-
457
- finally:
458
- # Release held keys
459
- await self._release_keys(hold_keys)
460
-
461
- return result
462
-
463
- async def mouse_down(
464
- self,
465
- button: Literal["left", "right", "middle", "back", "forward"] = "left",
466
- take_screenshot: bool = True,
467
- ) -> ContentResult:
468
- """Press and hold a mouse button."""
469
- button_map = {"left": 1, "right": 3, "middle": 2, "back": 8, "forward": 9}
470
- button_num = button_map.get(button, 1)
471
- return await self.execute(f"mousedown {button_num}", take_screenshot=take_screenshot)
472
-
473
- async def mouse_up(
474
- self,
475
- button: Literal["left", "right", "middle", "back", "forward"] = "left",
476
- take_screenshot: bool = True,
477
- ) -> ContentResult:
478
- """Release a mouse button."""
479
- button_map = {"left": 1, "right": 3, "middle": 2, "back": 8, "forward": 9}
480
- button_num = button_map.get(button, 1)
481
- return await self.execute(f"mouseup {button_num}", take_screenshot=take_screenshot)
482
-
483
- async def hold_key(
484
- self, key: str, duration: float, take_screenshot: bool = True
485
- ) -> ContentResult:
486
- """Hold a key for a specified duration."""
487
- # Map CLA key to XDO key
488
- mapped_key = self._map_key(key)
489
- escaped_key = shlex.quote(mapped_key)
490
-
491
- # Press the key
492
- await self.execute(f"keydown {escaped_key}", take_screenshot=False)
493
-
494
- # Wait
495
- await asyncio.sleep(duration)
496
-
497
- # Release the key
498
- result = await self.execute(f"keyup {escaped_key}", take_screenshot=False)
499
-
500
- if take_screenshot:
501
- screenshot = await self.screenshot()
502
- if screenshot:
503
- result = ContentResult(
504
- output=result.output, error=result.error, base64_image=screenshot
505
- )
506
-
507
- return result
508
-
509
- async def position(self) -> ContentResult:
510
- """Get current cursor position."""
511
- return await self.execute("getmouselocation", take_screenshot=False)
1
+ from __future__ import annotations
2
+
3
+ import asyncio
4
+ import base64
5
+ import logging
6
+ import os
7
+ import shlex
8
+ from pathlib import Path
9
+ from tempfile import gettempdir
10
+ from typing import Literal
11
+ from uuid import uuid4
12
+
13
+ from hud.tools.types import ContentResult
14
+ from hud.tools.utils import run
15
+
16
+ from .base import BaseExecutor
17
+
18
+ OUTPUT_DIR = os.environ.get("SCREENSHOT_DIR")
19
+ logger = logging.getLogger(__name__)
20
+
21
+ # Map CLA standard keys to X11/XDO key names
22
+ CLA_TO_XDO = {
23
+ "enter": "Return",
24
+ "tab": "Tab",
25
+ "space": "space",
26
+ "backspace": "BackSpace",
27
+ "delete": "Delete",
28
+ "escape": "Escape",
29
+ "esc": "Escape",
30
+ "up": "Up",
31
+ "down": "Down",
32
+ "left": "Left",
33
+ "right": "Right",
34
+ "shift": "Shift_L",
35
+ "shiftleft": "Shift_L",
36
+ "shiftright": "Shift_R",
37
+ "ctrl": "Control_L",
38
+ "ctrlleft": "Control_L",
39
+ "ctrlright": "Control_R",
40
+ "alt": "Alt_L",
41
+ "altleft": "Alt_L",
42
+ "altright": "Alt_R",
43
+ "win": "Super_L",
44
+ "winleft": "Super_L",
45
+ "winright": "Super_R",
46
+ "cmd": "Control_L", # Map cmd to ctrl for Linux
47
+ "command": "Control_L",
48
+ "super": "Super_L",
49
+ "pageup": "Page_Up",
50
+ "pagedown": "Page_Down",
51
+ "home": "Home",
52
+ "end": "End",
53
+ "insert": "Insert",
54
+ "pause": "Pause",
55
+ "capslock": "Caps_Lock",
56
+ "numlock": "Num_Lock",
57
+ "scrolllock": "Scroll_Lock",
58
+ "printscreen": "Print",
59
+ "prtsc": "Print",
60
+ # Function keys
61
+ **{f"f{i}": f"F{i}" for i in range(1, 25)},
62
+ }
63
+
64
+
65
+ class XDOExecutor(BaseExecutor):
66
+ """
67
+ Low-level executor for xdotool commands.
68
+ Handles display management and screenshot capture on Linux/X11 systems.
69
+
70
+ This executor should only be instantiated when X11 display is available.
71
+ """
72
+
73
+ def __init__(self, display_num: int | None = None) -> None:
74
+ """Initialize with optional display number."""
75
+ super().__init__(display_num)
76
+
77
+ if display_num is not None:
78
+ self._display_prefix = f"DISPLAY=:{display_num} "
79
+ else:
80
+ self._display_prefix = ""
81
+
82
+ self.xdotool = f"{self._display_prefix}xdotool"
83
+ logger.info("XDOExecutor initialized")
84
+
85
+ def _map_key(self, key: str) -> str:
86
+ """Map CLA standard key to XDO key."""
87
+ return CLA_TO_XDO.get(key.lower(), key)
88
+
89
+ def _map_keys(self, keys: list[str]) -> list[str]:
90
+ """Map CLA standard keys to XDO keys."""
91
+ mapped_keys = []
92
+ for key in keys:
93
+ # Handle key combinations like "ctrl+a"
94
+ if "+" in key:
95
+ parts = key.split("+")
96
+ mapped_parts = [self._map_key(part) for part in parts]
97
+ mapped_keys.append("+".join(mapped_parts))
98
+ else:
99
+ mapped_keys.append(self._map_key(key))
100
+ return mapped_keys
101
+
102
+ @classmethod
103
+ def is_available(cls) -> bool:
104
+ """
105
+ Check if xdotool and X11 display are available.
106
+
107
+ Returns:
108
+ True if xdotool can be used, False otherwise
109
+ """
110
+ display = os.environ.get("DISPLAY")
111
+ if not display:
112
+ return False
113
+
114
+ # Try a simple xdotool command to test availability
115
+ try:
116
+ import subprocess
117
+
118
+ # Try without display prefix if DISPLAY is already set
119
+ result = subprocess.run(
120
+ ["xdotool", "getdisplaygeometry"], # noqa: S607
121
+ capture_output=True,
122
+ timeout=2,
123
+ )
124
+ return result.returncode == 0
125
+ except (subprocess.TimeoutExpired, FileNotFoundError, Exception):
126
+ return False
127
+
128
+ async def execute(self, command: str, take_screenshot: bool = True) -> ContentResult:
129
+ """
130
+ Execute an xdotool command.
131
+
132
+ Args:
133
+ command: The xdotool command (without xdotool prefix)
134
+ take_screenshot: Whether to capture a screenshot after execution
135
+
136
+ Returns:
137
+ ContentResult with output, error, and optional screenshot
138
+ """
139
+ full_command = f"{self.xdotool} {command}"
140
+
141
+ # Execute command
142
+ returncode, stdout, stderr = await run(full_command)
143
+
144
+ # Prepare result
145
+ result = ContentResult(
146
+ output=stdout if stdout else None, error=stderr if stderr or returncode != 0 else None
147
+ )
148
+
149
+ # Take screenshot if requested
150
+ if take_screenshot:
151
+ await asyncio.sleep(self._screenshot_delay)
152
+ screenshot = await self.screenshot()
153
+ if screenshot:
154
+ result = ContentResult(
155
+ output=result.output, error=result.error, base64_image=screenshot
156
+ )
157
+
158
+ return result
159
+
160
+ async def screenshot(self) -> str | None:
161
+ """
162
+ Take a screenshot and return base64 encoded image.
163
+
164
+ Returns:
165
+ Base64 encoded PNG image or None if failed
166
+ """
167
+ # Real screenshot using scrot
168
+ if OUTPUT_DIR:
169
+ output_dir = Path(OUTPUT_DIR)
170
+ output_dir.mkdir(parents=True, exist_ok=True)
171
+ screenshot_path = output_dir / f"screenshot_{uuid4().hex}.png"
172
+ else:
173
+ # Generate a unique path in system temp dir without opening a file
174
+ screenshot_path = Path(gettempdir()) / f"screenshot_{uuid4().hex}.png"
175
+
176
+ screenshot_cmd = f"{self._display_prefix}scrot -p {screenshot_path}"
177
+
178
+ returncode, _, stderr = await run(screenshot_cmd)
179
+
180
+ if returncode == 0 and screenshot_path.exists():
181
+ try:
182
+ image_data = screenshot_path.read_bytes()
183
+ # Remove the file unless user requested persistence via env var
184
+ if not OUTPUT_DIR:
185
+ screenshot_path.unlink(missing_ok=True)
186
+ return base64.b64encode(image_data).decode()
187
+ except Exception:
188
+ return None
189
+
190
+ return None
191
+
192
+ # ===== Helper Methods =====
193
+
194
+ async def _hold_keys_context(self, keys: list[str] | None) -> None:
195
+ """
196
+ Press and hold keys, to be used with try/finally.
197
+
198
+ Args:
199
+ keys: List of keys to hold
200
+
201
+ Example:
202
+ await self._hold_keys_context(['ctrl'])
203
+ try:
204
+ # Do action with ctrl held
205
+ finally:
206
+ await self._release_keys(['ctrl'])
207
+ """
208
+ if keys:
209
+ for key in keys:
210
+ escaped_key = shlex.quote(key)
211
+ await self.execute(f"keydown {escaped_key}", take_screenshot=False)
212
+
213
+ async def _release_keys(self, keys: list[str] | None) -> None:
214
+ """Release held keys."""
215
+ if keys:
216
+ for key in reversed(keys): # Release in reverse order
217
+ escaped_key = shlex.quote(key)
218
+ await self.execute(f"keyup {escaped_key}", take_screenshot=False)
219
+
220
+ # ===== CLA Action Implementations =====
221
+
222
+ async def click(
223
+ self,
224
+ x: int | None = None,
225
+ y: int | None = None,
226
+ button: Literal["left", "right", "middle", "back", "forward"] = "left",
227
+ pattern: list[int] | None = None,
228
+ hold_keys: list[str] | None = None,
229
+ take_screenshot: bool = True,
230
+ ) -> ContentResult:
231
+ """Click at specified coordinates or current position."""
232
+ # Map button names to xdotool button numbers
233
+ button_map = {"left": 1, "right": 3, "middle": 2, "back": 8, "forward": 9}
234
+ button_num = button_map.get(button, 1)
235
+
236
+ # Hold keys if specified
237
+ await self._hold_keys_context(hold_keys)
238
+
239
+ try:
240
+ # Handle multi-clicks based on pattern
241
+ if pattern:
242
+ click_count = len(pattern) + 1
243
+ delay = pattern[0] if pattern else 10 # Use first delay for all clicks
244
+
245
+ if x is not None and y is not None:
246
+ cmd = f"mousemove {x} {y} click --repeat {click_count} --delay {delay} {button_num}" # noqa: E501
247
+ else:
248
+ cmd = f"click --repeat {click_count} --delay {delay} {button_num}"
249
+ else:
250
+ # Single click
251
+ if x is not None and y is not None:
252
+ cmd = f"mousemove {x} {y} click {button_num}"
253
+ else:
254
+ cmd = f"click {button_num}"
255
+
256
+ result = await self.execute(cmd, take_screenshot=take_screenshot)
257
+ finally:
258
+ # Release held keys
259
+ await self._release_keys(hold_keys)
260
+
261
+ return result
262
+
263
async def write(
    self, text: str, enter_after: bool = False, delay: int = 12, take_screenshot: bool = True
) -> ContentResult:
    """Type text with ``xdotool type``, optionally pressing Return afterwards.

    Args:
        text: Text to type; it is shell-quoted before being put in the command.
        enter_after: When true, a ``Return`` key press is sent after typing.
        delay: Value passed to ``type --delay`` (delay between keystrokes).
        take_screenshot: When true, a single screenshot is taken after all
            input has been sent; the individual commands never take one.

    Returns:
        The result of the ``type`` command, or a combined result when
        ``enter_after`` is set. A screenshot is attached when requested and
        one could be captured.
    """
    quoted_text = shlex.quote(text)
    result = await self.execute(f"type --delay {delay} -- {quoted_text}", take_screenshot=False)

    if enter_after:
        enter_result = await self.key("Return", take_screenshot=False)
        combined_output = (result.output or "") + "\n" + (enter_result.output or "")
        # Join only the errors that are actually present, so a single failure
        # is not reported with a stray leading/trailing newline.
        errors = [err for err in (result.error, enter_result.error) if err]
        result = ContentResult(
            output=combined_output.strip(),
            error="\n".join(errors) if errors else None,
        )

    if take_screenshot:
        screenshot = await self.screenshot()
        if screenshot:
            result = ContentResult(
                output=result.output, error=result.error, base64_image=screenshot
            )

    return result
289
+
290
async def key(self, key_sequence: str, take_screenshot: bool = True) -> ContentResult:
    """Press a key or key combination with ``xdotool key``.

    Args:
        key_sequence: One or more whitespace-separated key names or
            ``+``-joined combinations (for example ``"ctrl+c"``).
        take_screenshot: Forwarded unchanged to ``execute``.

    Returns:
        Whatever ``execute`` returns for the generated command.
    """
    # Quote every token separately, matching how keydown/keyup quote their
    # keys: multi-key sequences still arrive as separate arguments, while
    # shell metacharacters in a token can no longer alter the command.
    quoted_keys = " ".join(shlex.quote(token) for token in key_sequence.split())
    return await self.execute(f"key -- {quoted_keys}", take_screenshot=take_screenshot)
293
+
294
async def press(self, keys: list[str], take_screenshot: bool = True) -> ContentResult:
    """Send the given keys as a single hotkey combination.

    The keys are first translated by ``_map_keys`` and then joined with
    ``+`` into one combination string that is handed to ``key``.
    """
    combo = "+".join(self._map_keys(keys))
    return await self.key(combo, take_screenshot=take_screenshot)
301
+
302
async def keydown(self, keys: list[str], take_screenshot: bool = True) -> ContentResult:
    """Press the given keys one by one and leave them held down.

    Each key is translated by ``_map_keys``, shell-quoted and sent as its own
    ``keydown`` command. Only the result of the last command is kept; an
    empty ``ContentResult`` is returned when there was nothing to press.
    A screenshot is attached to that last result when requested and captured.
    """
    result = None
    for xdo_key in self._map_keys(keys):
        result = await self.execute(f"keydown {shlex.quote(xdo_key)}", take_screenshot=False)

    if take_screenshot and result:
        image = await self.screenshot()
        if image:
            result = ContentResult(output=result.output, error=result.error, base64_image=image)

    return result or ContentResult()
319
+
320
async def keyup(self, keys: list[str], take_screenshot: bool = True) -> ContentResult:
    """Release the given keys one by one.

    Each key is translated by ``_map_keys``, shell-quoted and sent as its own
    ``keyup`` command. Only the result of the last command is kept; an empty
    ``ContentResult`` is returned when there was nothing to release.
    A screenshot is attached to that last result when requested and captured.
    """
    result = None
    for xdo_key in self._map_keys(keys):
        result = await self.execute(f"keyup {shlex.quote(xdo_key)}", take_screenshot=False)

    if take_screenshot and result:
        image = await self.screenshot()
        if image:
            result = ContentResult(output=result.output, error=result.error, base64_image=image)

    return result or ContentResult()
337
+
338
async def scroll(
    self,
    x: int | None = None,
    y: int | None = None,
    scroll_x: int | None = None,
    scroll_y: int | None = None,
    hold_keys: list[str] | None = None,
    take_screenshot: bool = True,
) -> ContentResult:
    """Scroll with the mouse wheel, optionally at a given position.

    Pixel amounts are converted to wheel clicks (100 pixels per click, with
    a minimum of one click) and sent as repeated clicks of the xdotool wheel
    buttons. When both ``scroll_y`` and ``scroll_x`` are non-zero the
    vertical scroll is performed first, then the horizontal one.

    Args:
        x, y: Position to move the pointer to before scrolling; used only
            when both are given.
        scroll_x: Horizontal amount in pixels; positive scrolls right.
        scroll_y: Vertical amount in pixels; positive scrolls down.
        hold_keys: Keys held for the duration of the scroll and released
            afterwards, even if a command raises.
        take_screenshot: Forwarded to the final command only, so at most one
            screenshot is taken.

    Returns:
        The result of the last command that ran (the first one reporting an
        error ends the sequence), or a result with the output
        "No scroll amount specified" when both amounts are zero or missing.
    """
    scroll_button_map = {"up": 4, "down": 5, "left": 6, "right": 7}

    # Standard conversion: 1 wheel click is roughly 100 pixels.
    PIXELS_PER_WHEEL_CLICK = 100

    move_prefix = f"mousemove {x} {y} " if x is not None and y is not None else ""

    # Build one command per requested axis; previously a horizontal amount
    # was silently dropped whenever a vertical amount was also given.
    commands = []
    for amount, positive, negative in ((scroll_y, "down", "up"), (scroll_x, "right", "left")):
        if not amount:
            continue
        button = scroll_button_map[positive if amount > 0 else negative]
        clicks = max(1, abs(amount) // PIXELS_PER_WHEEL_CLICK)
        commands.append(f"{move_prefix}click --repeat {clicks} {button}")

    await self._hold_keys_context(hold_keys)

    try:
        if not commands:
            result = ContentResult(output="No scroll amount specified")
        for index, cmd in enumerate(commands):
            is_last = index == len(commands) - 1
            result = await self.execute(cmd, take_screenshot=take_screenshot and is_last)
            if not is_last and result.error:
                # Surface the failure instead of overwriting it with the
                # result of the next axis.
                break
    finally:
        # Release held keys
        await self._release_keys(hold_keys)

    return result
394
+
395
async def move(
    self,
    x: int | None = None,
    y: int | None = None,
    offset_x: int | None = None,
    offset_y: int | None = None,
    take_screenshot: bool = True,
) -> ContentResult:
    """Move the mouse pointer to an absolute position or by a relative offset.

    An absolute move is used when both ``x`` and ``y`` are given. Otherwise,
    if at least one offset is given, a relative move is issued with the
    missing offset treated as 0. With neither, no command is run and a
    result carrying "No move coordinates specified" is returned.
    """
    if x is not None and y is not None:
        return await self.execute(f"mousemove {x} {y}", take_screenshot=take_screenshot)

    if offset_x is None and offset_y is None:
        return ContentResult(output="No move coordinates specified")

    dx = offset_x or 0
    dy = offset_y or 0
    return await self.execute(
        f"mousemove_relative -- {dx} {dy}", take_screenshot=take_screenshot
    )
416
+
417
async def drag(
    self,
    path: list[tuple[int, int]],
    pattern: list[int] | None = None,
    hold_keys: list[str] | None = None,
    take_screenshot: bool = True,
) -> ContentResult:
    """Drag with mouse button 1 along a path of points.

    The pointer is moved to the first point, button 1 is pressed, the
    pointer visits every remaining point in order, and the button is
    released.

    Args:
        path: Points to visit; at least two are required.
        pattern: Optional delays in milliseconds; ``pattern[i]`` is waited
            before moving to ``path[i + 1]``. Missing entries mean no delay.
        hold_keys: Keys held for the duration of the drag and released
            afterwards, even if a command raises.
        take_screenshot: When true, a screenshot taken after the drag is
            attached to the result.

    Returns:
        A result describing the drag, or an error result when ``path`` has
        fewer than two points (in which case nothing is executed).
    """
    if len(path) < 2:
        return ContentResult(error="Drag path must have at least 2 points")

    # Hold keys if specified
    await self._hold_keys_context(hold_keys)

    try:
        # Start drag
        start_x, start_y = path[0]
        await self.execute(f"mousemove {start_x} {start_y}", take_screenshot=False)
        await self.execute("mousedown 1", take_screenshot=False)

        try:
            # Move through the remaining points
            for i, (x, y) in enumerate(path[1:], 1):
                # Apply delay if pattern is specified
                if pattern and i - 1 < len(pattern):
                    await asyncio.sleep(pattern[i - 1] / 1000.0)  # Convert ms to seconds

                await self.execute(f"mousemove {x} {y}", take_screenshot=False)
        finally:
            # Always end the drag: a failed or cancelled step must not leave
            # the mouse button pressed.
            await self.execute("mouseup 1", take_screenshot=False)

        # Take final screenshot if requested
        if take_screenshot:
            screenshot = await self.screenshot()
            result = ContentResult(
                output=f"Dragged along {len(path)} points", base64_image=screenshot
            )
        else:
            result = ContentResult(output=f"Dragged along {len(path)} points")

    finally:
        # Release held keys
        await self._release_keys(hold_keys)

    return result
462
+
463
async def mouse_down(
    self,
    button: Literal["left", "right", "middle", "back", "forward"] = "left",
    take_screenshot: bool = True,
) -> ContentResult:
    """Press a mouse button and leave it held.

    The button name is translated to its xdotool button number; a name that
    is not in the table falls back to button 1.
    """
    xdo_buttons = dict(left=1, middle=2, right=3, back=8, forward=9)
    number = xdo_buttons.get(button, 1)
    command = f"mousedown {number}"
    return await self.execute(command, take_screenshot=take_screenshot)
472
+
473
async def mouse_up(
    self,
    button: Literal["left", "right", "middle", "back", "forward"] = "left",
    take_screenshot: bool = True,
) -> ContentResult:
    """Release a mouse button.

    The button name is translated to its xdotool button number; a name that
    is not in the table falls back to button 1.
    """
    xdo_buttons = dict(left=1, middle=2, right=3, back=8, forward=9)
    number = xdo_buttons.get(button, 1)
    command = f"mouseup {number}"
    return await self.execute(command, take_screenshot=take_screenshot)
482
+
483
async def hold_key(
    self, key: str, duration: float, take_screenshot: bool = True
) -> ContentResult:
    """Hold a key down for ``duration`` seconds, then release it.

    Args:
        key: Key name; translated by ``_map_key`` and shell-quoted.
        duration: Time to keep the key pressed, passed to ``asyncio.sleep``.
        take_screenshot: When true, a screenshot taken after the release is
            attached to the result if one could be captured.

    Returns:
        The result of the ``keyup`` command, with the screenshot attached
        when requested and available.
    """
    escaped_key = shlex.quote(self._map_key(key))

    # Press the key
    await self.execute(f"keydown {escaped_key}", take_screenshot=False)

    try:
        await asyncio.sleep(duration)
    finally:
        # Release even when the wait is cancelled or fails; otherwise the
        # key would stay pressed after this call has ended.
        result = await self.execute(f"keyup {escaped_key}", take_screenshot=False)

    if take_screenshot:
        screenshot = await self.screenshot()
        if screenshot:
            result = ContentResult(
                output=result.output, error=result.error, base64_image=screenshot
            )

    return result
508
+
509
async def position(self) -> ContentResult:
    """Query the current mouse location; no screenshot is taken."""
    location = await self.execute("getmouselocation", take_screenshot=False)
    return location