cua-agent 0.1.0__py3-none-any.whl

This diff shows the content of publicly available package versions that have been released to one of the supported registries. It is provided for informational purposes only and reflects the changes between package versions as they appear in their respective public registries.

Potentially problematic release.


This version of cua-agent might be problematic. Click here for more details.

Files changed (65) hide show
  1. agent/README.md +63 -0
  2. agent/__init__.py +10 -0
  3. agent/core/README.md +101 -0
  4. agent/core/__init__.py +34 -0
  5. agent/core/agent.py +284 -0
  6. agent/core/base_agent.py +164 -0
  7. agent/core/callbacks.py +147 -0
  8. agent/core/computer_agent.py +69 -0
  9. agent/core/experiment.py +222 -0
  10. agent/core/factory.py +102 -0
  11. agent/core/loop.py +244 -0
  12. agent/core/messages.py +230 -0
  13. agent/core/tools/__init__.py +21 -0
  14. agent/core/tools/base.py +74 -0
  15. agent/core/tools/bash.py +52 -0
  16. agent/core/tools/collection.py +46 -0
  17. agent/core/tools/computer.py +113 -0
  18. agent/core/tools/edit.py +67 -0
  19. agent/core/tools/manager.py +56 -0
  20. agent/providers/__init__.py +4 -0
  21. agent/providers/anthropic/__init__.py +6 -0
  22. agent/providers/anthropic/api/client.py +222 -0
  23. agent/providers/anthropic/api/logging.py +150 -0
  24. agent/providers/anthropic/callbacks/manager.py +55 -0
  25. agent/providers/anthropic/loop.py +521 -0
  26. agent/providers/anthropic/messages/manager.py +110 -0
  27. agent/providers/anthropic/prompts.py +20 -0
  28. agent/providers/anthropic/tools/__init__.py +33 -0
  29. agent/providers/anthropic/tools/base.py +88 -0
  30. agent/providers/anthropic/tools/bash.py +163 -0
  31. agent/providers/anthropic/tools/collection.py +34 -0
  32. agent/providers/anthropic/tools/computer.py +550 -0
  33. agent/providers/anthropic/tools/edit.py +326 -0
  34. agent/providers/anthropic/tools/manager.py +54 -0
  35. agent/providers/anthropic/tools/run.py +42 -0
  36. agent/providers/anthropic/types.py +16 -0
  37. agent/providers/omni/__init__.py +27 -0
  38. agent/providers/omni/callbacks.py +78 -0
  39. agent/providers/omni/clients/anthropic.py +99 -0
  40. agent/providers/omni/clients/base.py +44 -0
  41. agent/providers/omni/clients/groq.py +101 -0
  42. agent/providers/omni/clients/openai.py +159 -0
  43. agent/providers/omni/clients/utils.py +25 -0
  44. agent/providers/omni/experiment.py +273 -0
  45. agent/providers/omni/image_utils.py +106 -0
  46. agent/providers/omni/loop.py +961 -0
  47. agent/providers/omni/messages.py +168 -0
  48. agent/providers/omni/parser.py +252 -0
  49. agent/providers/omni/prompts.py +78 -0
  50. agent/providers/omni/tool_manager.py +91 -0
  51. agent/providers/omni/tools/__init__.py +13 -0
  52. agent/providers/omni/tools/bash.py +69 -0
  53. agent/providers/omni/tools/computer.py +216 -0
  54. agent/providers/omni/tools/manager.py +83 -0
  55. agent/providers/omni/types.py +30 -0
  56. agent/providers/omni/utils.py +155 -0
  57. agent/providers/omni/visualization.py +130 -0
  58. agent/types/__init__.py +26 -0
  59. agent/types/base.py +52 -0
  60. agent/types/messages.py +36 -0
  61. agent/types/tools.py +32 -0
  62. cua_agent-0.1.0.dist-info/METADATA +44 -0
  63. cua_agent-0.1.0.dist-info/RECORD +65 -0
  64. cua_agent-0.1.0.dist-info/WHEEL +4 -0
  65. cua_agent-0.1.0.dist-info/entry_points.txt +4 -0
@@ -0,0 +1,550 @@
1
+ import asyncio
2
+ import base64
3
+ import io
4
+ import logging
5
+ from enum import StrEnum
6
+ from pathlib import Path
7
+ from typing import Literal, TypedDict, Any, Dict
8
+ import subprocess
9
+ from PIL import Image
10
+ from datetime import datetime
11
+
12
+ from computer.computer import Computer
13
+
14
+ from .base import BaseAnthropicTool, ToolError, ToolResult
15
+ from .run import run
16
+ from ....core.tools.computer import BaseComputerTool
17
+
18
# Typing-related tuning constants.
# NOTE(review): neither constant is referenced anywhere in the visible code of
# this module; presumably a per-keystroke delay in milliseconds and a batch
# size in characters -- confirm before relying on either meaning.
TYPING_DELAY_MS = 12
TYPING_GROUP_SIZE = 50
20
+
21
# Closed set of action names used to annotate the ``action`` argument of
# ``ComputerTool.__call__``.
Action = Literal[
    "key",
    "type",
    "mouse_move",
    "left_click",
    "left_click_drag",
    "right_click",
    "middle_click",
    "double_click",
    "screenshot",
    "cursor_position",
    "scroll",
]
34
+
35
+
36
class Resolution(TypedDict):
    """Dictionary shape holding an integer ``width`` and ``height``.

    NOTE(review): not referenced in the visible code of this module; the
    values are presumably pixel counts -- confirm against callers.
    """

    width: int
    height: int
39
+
40
+
41
+ class ScalingSource(StrEnum):
42
+ COMPUTER = "computer"
43
+ API = "api"
44
+
45
+
46
+ class ComputerToolOptions(TypedDict):
47
+ display_height_px: int
48
+ display_width_px: int
49
+ display_number: int | None
50
+
51
+
52
def chunks(s: str, chunk_size: int) -> list[str]:
    """Split ``s`` into consecutive pieces of at most ``chunk_size`` characters.

    Args:
        s: The string to split.
        chunk_size: Maximum length of each piece; must be positive.

    Returns:
        The pieces in order. Every piece except possibly the last has exactly
        ``chunk_size`` characters; an empty ``s`` yields an empty list.

    Raises:
        ValueError: If ``chunk_size`` is zero or negative.
    """
    # A negative step would make ``range`` yield nothing and silently drop the
    # whole input, so reject non-positive sizes explicitly.
    if chunk_size <= 0:
        raise ValueError(f"chunk_size must be a positive integer, got {chunk_size}")
    return [s[start : start + chunk_size] for start in range(0, len(s), chunk_size)]
54
+
55
+
56
class ComputerTool(BaseComputerTool, BaseAnthropicTool):
    """
    A tool that allows the agent to interact with the screen, keyboard, and mouse of the current macOS computer.
    The tool parameters are defined by Anthropic and are not editable.

    Every action is forwarded to ``self.computer.interface``. Screenshots
    returned to the caller are PNG bytes, base64-encoded, and resized to the
    dimensions reported by ``interface.get_screen_size()`` whenever the raw
    capture has a different size.
    """

    name: Literal["computer"] = "computer"
    api_type: Literal["computer_20250124"] = "computer_20250124"
    width: int | None
    height: int | None
    display_num: int | None
    computer: Computer  # The CUA Computer instance
    logger = logging.getLogger(__name__)

    _screenshot_delay = 1.0  # macOS is generally faster than X11
    _scaling_enabled = True

    # Seconds to wait after an action before capturing the resulting screen.
    _ui_settle_delay = 0.5

    # Per-click-action wording: (verb used with coordinates, noun used without).
    _CLICK_WORDING = {
        "left_click": ("Clicking", "left click"),
        "right_click": ("Right clicking", "right click"),
        "double_click": ("Double clicking", "double click"),
    }

    # Key names that are translated to the macOS fn+arrow equivalents.
    _PAGE_KEY_ALIASES = {
        "pagedown": "down",
        "page_down": "down",
        "page down": "down",
        "pageup": "up",
        "page_up": "up",
        "page up": "up",
    }

    @property
    def options(self) -> ComputerToolOptions:
        """Display options merged into the tool parameters.

        Raises:
            RuntimeError: If the screen dimensions have not been initialized.
        """
        if self.width is None or self.height is None:
            raise RuntimeError(
                "Screen dimensions not initialized. Call initialize_dimensions() first."
            )
        return {
            "display_width_px": self.width,
            "display_height_px": self.height,
            "display_number": self.display_num,
        }

    def to_params(self) -> Dict[str, Any]:
        """Convert tool to API parameters.

        Returns:
            Dictionary with tool parameters
        """
        return {"name": self.name, "type": self.api_type, **self.options}

    def __init__(self, computer):
        # Initialize the base computer tool first
        BaseComputerTool.__init__(self, computer)
        # Then initialize the Anthropic tool
        BaseAnthropicTool.__init__(self)

        # Additional initialization
        self.width = None  # Will be initialized from computer interface
        self.height = None  # Will be initialized from computer interface
        self.display_num = None

    async def initialize_dimensions(self):
        """Initialize screen dimensions from the computer interface."""
        display_size = await self.computer.interface.get_screen_size()
        self.width = display_size["width"]
        self.height = display_size["height"]
        self.logger.info(f"Initialized screen dimensions to {self.width}x{self.height}")

    # ------------------------------------------------------------------
    # Shared helpers
    # ------------------------------------------------------------------

    @staticmethod
    def _validate_coordinate(coordinate) -> tuple[int, int]:
        """Return ``coordinate`` as an ``(x, y)`` pair or raise ``ToolError``."""
        if not isinstance(coordinate, (list, tuple)) or len(coordinate) != 2:
            raise ToolError(f"{coordinate} must be a tuple of length 2")
        if not all(isinstance(i, int) and i >= 0 for i in coordinate):
            raise ToolError(f"{coordinate} must be a tuple of non-negative ints")
        x, y = coordinate
        return x, y

    @staticmethod
    def _first_present(options: Dict[str, Any], *keys: str, default: Any) -> Any:
        """Return the first non-``None`` value found under ``keys``, else ``default``."""
        for key in keys:
            value = options.get(key)
            if value is not None:
                return value
        return default

    async def _capture_png(self, label: str = "image") -> bytes:
        """Capture the screen and return PNG bytes sized to ``(width, height)``.

        The raw capture is returned untouched when it already has the target
        size; otherwise it is resized with LANCZOS resampling and re-encoded.
        ``label`` only affects the log message emitted when scaling happens.
        """
        # Resizing to (None, None) would fail, so make sure the target size is
        # known even when this is reached before any action was dispatched.
        if self.width is None or self.height is None:
            await self.initialize_dimensions()

        raw = await self.computer.interface.screenshot()
        img = Image.open(io.BytesIO(raw))
        if img.size == (self.width, self.height):
            return raw

        self.logger.info(f"Scaling {label} from {img.size} to {self.width}x{self.height}")
        img = img.resize((self.width, self.height), Image.Resampling.LANCZOS)
        buffer = io.BytesIO()
        img.save(buffer, format="PNG")
        return buffer.getvalue()

    async def _result_with_screenshot(self, output: str) -> ToolResult:
        """Wait for the UI to settle, then wrap ``output`` with a fresh screenshot."""
        await asyncio.sleep(self._ui_settle_delay)
        png = await self._capture_png("post-action image")
        return ToolResult(output=output, base64_image=base64.b64encode(png).decode())

    # ------------------------------------------------------------------
    # Action dispatch
    # ------------------------------------------------------------------

    async def __call__(
        self,
        *,
        action: Action,
        text: str | None = None,
        coordinate: tuple[int, int] | None = None,
        **kwargs,
    ):
        """Execute a single computer action.

        Args:
            action: Name of the action to perform.
            text: Key name / key combination for ``key``, or the text to enter
                for ``type``. Rejected for ``mouse_move``, ``left_click_drag``,
                ``screenshot`` and ``cursor_position``.
            coordinate: Target ``(x, y)``. Required for ``mouse_move`` and
                ``left_click_drag``; optional for click actions (without it the
                click happens at the current cursor position); rejected for
                ``key``, ``type``, ``screenshot`` and ``cursor_position``.
            **kwargs: Extra action parameters. ``left_click_drag`` honours an
                optional ``start_coordinate``; ``scroll`` reads
                ``scroll_direction``/``direction`` (default ``"down"``) and
                ``scroll_amount``/``amount`` (default ``10``).

        Returns:
            A ``ToolResult``. All actions except ``cursor_position`` include a
            base64-encoded PNG screenshot taken after the action.

        Raises:
            ToolError: On invalid arguments, an unsupported action, or when the
                underlying interface call fails.
        """
        try:
            # Ensure dimensions are initialized
            if self.width is None or self.height is None:
                await self.initialize_dimensions()
        except Exception as e:
            raise ToolError(f"Failed to initialize dimensions: {e}") from e

        # Validate arguments per action family and pick the coroutine to run.
        # Each branch creates its coroutine as the very last step so that a
        # validation failure never leaves an un-awaited coroutine behind.
        if action in ("mouse_move", "left_click_drag"):
            if coordinate is None:
                raise ToolError(f"coordinate is required for {action}")
            if text is not None:
                raise ToolError(f"text is not accepted for {action}")
            target = self._validate_coordinate(coordinate)
            pending = self._move_or_drag(action, target, kwargs.get("start_coordinate"))

        elif action in ("left_click", "right_click", "double_click"):
            position = self._validate_coordinate(coordinate) if coordinate else None
            pending = self._click(action, position)

        elif action in ("key", "type"):
            if text is None:
                raise ToolError(f"text is required for {action}")
            if coordinate is not None:
                raise ToolError(f"coordinate is not accepted for {action}")
            if not isinstance(text, str):
                raise ToolError(f"{text} must be a string")
            pending = self._press_key(text) if action == "key" else self._type_text(text)

        elif action in ("screenshot", "cursor_position"):
            if text is not None:
                raise ToolError(f"text is not accepted for {action}")
            if coordinate is not None:
                raise ToolError(f"coordinate is not accepted for {action}")
            pending = self._query(action)

        elif action == "scroll":
            direction = self._first_present(
                kwargs, "scroll_direction", "direction", default="down"
            )
            amount = self._first_present(kwargs, "scroll_amount", "amount", default=10)
            if direction not in ["up", "down"]:
                raise ToolError(f"Invalid scroll direction: {direction}. Must be 'up' or 'down'.")
            # bool is a subclass of int, so it has to be excluded explicitly.
            if isinstance(amount, bool) or not isinstance(amount, int) or amount < 0:
                raise ToolError(f"Invalid scroll amount: {amount}. Must be a non-negative int.")
            pending = self._scroll(direction, amount)

        else:
            raise ToolError(f"Invalid action: {action}")

        try:
            return await pending
        except ToolError:
            # Already a well-formed tool failure; do not wrap it a second time.
            raise
        except Exception as e:
            self.logger.error(f"Error during {action} action: {str(e)}")
            raise ToolError(f"Failed to perform {action}: {str(e)}") from e

    # ------------------------------------------------------------------
    # Action handlers
    # ------------------------------------------------------------------

    async def _move_or_drag(self, action: str, target: tuple[int, int], start) -> ToolResult:
        """Move the cursor to ``target`` or drag to it with the button held.

        A drag begins at ``start`` when given, otherwise at the current cursor
        position.
        """
        x, y = target
        interface = self.computer.interface
        self.logger.info(f"Handling {action} action:")
        self.logger.info(f"  Coordinates: ({x}, {y})")

        if action == "mouse_move":
            self.logger.info(f"Moving cursor to ({x}, {y})")
            await interface.move_cursor(x, y)
            return await self._result_with_screenshot(f"Moved cursor to {x},{y}")

        if start is not None:
            start_x, start_y = self._validate_coordinate(start)
            self.logger.info(f"Moving to drag start ({start_x}, {start_y})")
            await interface.move_cursor(start_x, start_y)

        # The cursor must NOT be moved onto the target beforehand: the drag
        # would then start and end on the same point and do nothing.
        self.logger.info(f"Dragging to ({x}, {y})")
        try:
            if hasattr(interface, "drag_to"):
                await interface.drag_to(x, y)
            else:
                # Alternative approach: press mouse down, move, release
                await interface.mouse_down()
                await asyncio.sleep(0.2)
                await interface.move_cursor(x, y)
                await asyncio.sleep(0.2)
                await interface.mouse_up()
        except Exception as e:
            self.logger.error(f"Error during drag operation: {str(e)}")
            raise ToolError(f"Failed to perform drag: {str(e)}") from e

        return await self._result_with_screenshot(f"Dragged to {x},{y}")

    async def _click(self, action: str, position: tuple[int, int] | None) -> ToolResult:
        """Perform a left/right/double click at ``position`` or at the cursor."""
        interface = self.computer.interface
        verb, noun = self._CLICK_WORDING[action]
        # The interface method names coincide with the action names.
        do_click = getattr(interface, action)

        if position is None:
            self.logger.info(f"Performing {noun} at current position")
            await do_click()
            return await self._result_with_screenshot(f"Performed {action} at current position")

        x, y = position
        self.logger.info(f"Handling {action} action:")
        self.logger.info(f"  Coordinates: ({x}, {y})")
        self.logger.info(f"{verb} at ({x}, {y})")
        await interface.move_cursor(x, y)
        await do_click()
        return await self._result_with_screenshot(f"Performed {action} at ({x}, {y})")

    async def _press_key(self, text: str) -> ToolResult:
        """Press a single key or a ``+``-separated key combination."""
        interface = self.computer.interface
        arrow = self._PAGE_KEY_ALIASES.get(text.lower())

        if arrow is not None:
            # Special handling for page up/down on macOS
            pressed = f"fn+{arrow}"
            self.logger.info(f"Converting {text} to {pressed} for macOS")
            await interface.hotkey("fn", arrow)
        elif "+" in text:
            # Handle hotkey combinations
            pressed = text
            self.logger.info(f"Pressing hotkey combination: {text}")
            await interface.hotkey(*text.split("+"))
        else:
            # Handle single key press
            pressed = text
            self.logger.info(f"Pressing key: {text}")
            try:
                await interface.press(text)
            except ValueError as e:
                raise ToolError(f"Invalid key: {text}. {str(e)}") from e

        return await self._result_with_screenshot(f"Pressed key: {pressed}")

    async def _type_text(self, text: str) -> ToolResult:
        """Type ``text`` through the interface."""
        self.logger.info(f"Typing text: {text}")
        await self.computer.interface.type_text(text)
        return await self._result_with_screenshot(f"Typed text: {text}")

    async def _query(self, action: str) -> ToolResult:
        """Return a screenshot or the current cursor position."""
        if action == "screenshot":
            png = await self._capture_png()
            return ToolResult(base64_image=base64.b64encode(png).decode())

        pos = await self.computer.interface.get_cursor_position()
        return ToolResult(output=f"X={int(pos[0])},Y={int(pos[1])}")

    async def _scroll(self, direction: str, amount: int) -> ToolResult:
        """Scroll by sending fn+up / fn+down ``amount`` times."""
        self.logger.info(f"Scrolling {direction}, amount: {amount}")
        # Use fn+down / fn+up for page down / page up on macOS
        for _ in range(amount):
            await self.computer.interface.hotkey("fn", direction)
            await asyncio.sleep(0.1)
        return await self._result_with_screenshot(f"Scrolled {direction} by {amount} steps")

    # ------------------------------------------------------------------
    # Public utilities
    # ------------------------------------------------------------------

    async def screenshot(self):
        """Take a screenshot and return it as a base64-encoded string.

        Failures are reported through ``ToolResult.error`` rather than raised.
        """
        try:
            png = await self._capture_png()
            return ToolResult(base64_image=base64.b64encode(png).decode())
        except Exception as e:
            self.logger.error(f"Error taking screenshot: {str(e)}")
            return ToolResult(error=f"Failed to take screenshot: {str(e)}")

    async def shell(self, command: str, take_screenshot=False) -> ToolResult:
        """Run a shell command and return the output, error, and optionally a screenshot.

        Failures are reported through ``ToolResult.error`` rather than raised.
        """
        try:
            _, stdout, stderr = await run(command)
            base64_image = None

            if take_screenshot:
                # delay to let things settle before taking a screenshot
                await asyncio.sleep(self._screenshot_delay)
                screenshot_result = await self.screenshot()
                if screenshot_result.error:
                    return ToolResult(
                        output=stdout,
                        error=f"{stderr}\nScreenshot error: {screenshot_result.error}",
                    )
                base64_image = screenshot_result.base64_image

            return ToolResult(output=stdout, error=stderr, base64_image=base64_image)

        except Exception as e:
            return ToolResult(error=f"Shell command failed: {str(e)}")