cua-agent 0.3.1__py3-none-any.whl → 0.4.0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release.


This version of cua-agent might be problematic. Click here for more details.

Files changed (112)
  1. agent/__init__.py +21 -12
  2. agent/__main__.py +21 -0
  3. agent/adapters/__init__.py +9 -0
  4. agent/adapters/huggingfacelocal_adapter.py +229 -0
  5. agent/agent.py +594 -0
  6. agent/callbacks/__init__.py +19 -0
  7. agent/callbacks/base.py +153 -0
  8. agent/callbacks/budget_manager.py +44 -0
  9. agent/callbacks/image_retention.py +139 -0
  10. agent/callbacks/logging.py +247 -0
  11. agent/callbacks/pii_anonymization.py +259 -0
  12. agent/callbacks/telemetry.py +210 -0
  13. agent/callbacks/trajectory_saver.py +305 -0
  14. agent/cli.py +297 -0
  15. agent/computer_handler.py +107 -0
  16. agent/decorators.py +90 -0
  17. agent/loops/__init__.py +11 -0
  18. agent/loops/anthropic.py +728 -0
  19. agent/loops/omniparser.py +339 -0
  20. agent/loops/openai.py +95 -0
  21. agent/loops/uitars.py +688 -0
  22. agent/responses.py +207 -0
  23. agent/telemetry.py +135 -14
  24. agent/types.py +79 -0
  25. agent/ui/__init__.py +7 -1
  26. agent/ui/__main__.py +2 -13
  27. agent/ui/gradio/__init__.py +6 -19
  28. agent/ui/gradio/app.py +94 -1313
  29. agent/ui/gradio/ui_components.py +721 -0
  30. cua_agent-0.4.0.dist-info/METADATA +424 -0
  31. cua_agent-0.4.0.dist-info/RECORD +33 -0
  32. {cua_agent-0.3.1.dist-info → cua_agent-0.4.0.dist-info}/WHEEL +1 -1
  33. agent/core/__init__.py +0 -27
  34. agent/core/agent.py +0 -210
  35. agent/core/base.py +0 -217
  36. agent/core/callbacks.py +0 -200
  37. agent/core/experiment.py +0 -249
  38. agent/core/factory.py +0 -122
  39. agent/core/messages.py +0 -332
  40. agent/core/provider_config.py +0 -21
  41. agent/core/telemetry.py +0 -142
  42. agent/core/tools/__init__.py +0 -21
  43. agent/core/tools/base.py +0 -74
  44. agent/core/tools/bash.py +0 -52
  45. agent/core/tools/collection.py +0 -46
  46. agent/core/tools/computer.py +0 -113
  47. agent/core/tools/edit.py +0 -67
  48. agent/core/tools/manager.py +0 -56
  49. agent/core/tools.py +0 -32
  50. agent/core/types.py +0 -88
  51. agent/core/visualization.py +0 -197
  52. agent/providers/__init__.py +0 -4
  53. agent/providers/anthropic/__init__.py +0 -6
  54. agent/providers/anthropic/api/client.py +0 -360
  55. agent/providers/anthropic/api/logging.py +0 -150
  56. agent/providers/anthropic/api_handler.py +0 -140
  57. agent/providers/anthropic/callbacks/__init__.py +0 -5
  58. agent/providers/anthropic/callbacks/manager.py +0 -65
  59. agent/providers/anthropic/loop.py +0 -568
  60. agent/providers/anthropic/prompts.py +0 -23
  61. agent/providers/anthropic/response_handler.py +0 -226
  62. agent/providers/anthropic/tools/__init__.py +0 -33
  63. agent/providers/anthropic/tools/base.py +0 -88
  64. agent/providers/anthropic/tools/bash.py +0 -66
  65. agent/providers/anthropic/tools/collection.py +0 -34
  66. agent/providers/anthropic/tools/computer.py +0 -396
  67. agent/providers/anthropic/tools/edit.py +0 -326
  68. agent/providers/anthropic/tools/manager.py +0 -54
  69. agent/providers/anthropic/tools/run.py +0 -42
  70. agent/providers/anthropic/types.py +0 -16
  71. agent/providers/anthropic/utils.py +0 -367
  72. agent/providers/omni/__init__.py +0 -8
  73. agent/providers/omni/api_handler.py +0 -42
  74. agent/providers/omni/clients/anthropic.py +0 -103
  75. agent/providers/omni/clients/base.py +0 -35
  76. agent/providers/omni/clients/oaicompat.py +0 -195
  77. agent/providers/omni/clients/ollama.py +0 -122
  78. agent/providers/omni/clients/openai.py +0 -155
  79. agent/providers/omni/clients/utils.py +0 -25
  80. agent/providers/omni/image_utils.py +0 -34
  81. agent/providers/omni/loop.py +0 -990
  82. agent/providers/omni/parser.py +0 -307
  83. agent/providers/omni/prompts.py +0 -64
  84. agent/providers/omni/tools/__init__.py +0 -30
  85. agent/providers/omni/tools/base.py +0 -29
  86. agent/providers/omni/tools/bash.py +0 -74
  87. agent/providers/omni/tools/computer.py +0 -179
  88. agent/providers/omni/tools/manager.py +0 -61
  89. agent/providers/omni/utils.py +0 -236
  90. agent/providers/openai/__init__.py +0 -6
  91. agent/providers/openai/api_handler.py +0 -456
  92. agent/providers/openai/loop.py +0 -472
  93. agent/providers/openai/response_handler.py +0 -205
  94. agent/providers/openai/tools/__init__.py +0 -15
  95. agent/providers/openai/tools/base.py +0 -79
  96. agent/providers/openai/tools/computer.py +0 -326
  97. agent/providers/openai/tools/manager.py +0 -106
  98. agent/providers/openai/types.py +0 -36
  99. agent/providers/openai/utils.py +0 -98
  100. agent/providers/uitars/__init__.py +0 -1
  101. agent/providers/uitars/clients/base.py +0 -35
  102. agent/providers/uitars/clients/mlxvlm.py +0 -263
  103. agent/providers/uitars/clients/oaicompat.py +0 -214
  104. agent/providers/uitars/loop.py +0 -660
  105. agent/providers/uitars/prompts.py +0 -63
  106. agent/providers/uitars/tools/__init__.py +0 -1
  107. agent/providers/uitars/tools/computer.py +0 -283
  108. agent/providers/uitars/tools/manager.py +0 -60
  109. agent/providers/uitars/utils.py +0 -264
  110. cua_agent-0.3.1.dist-info/METADATA +0 -295
  111. cua_agent-0.3.1.dist-info/RECORD +0 -87
  112. {cua_agent-0.3.1.dist-info → cua_agent-0.4.0.dist-info}/entry_points.txt +0 -0
agent/providers/anthropic/tools/computer.py (file deleted)
@@ -1,396 +0,0 @@
1
- import asyncio
2
- import base64
3
- import io
4
- import logging
5
- from enum import StrEnum
6
- from pathlib import Path
7
- from typing import Literal, TypedDict, Any, Dict
8
- import subprocess
9
- from PIL import Image
10
- from datetime import datetime
11
-
12
- from computer.computer import Computer
13
-
14
- from .base import BaseAnthropicTool, ToolError, ToolResult
15
- from .run import run
16
- from ....core.tools.computer import BaseComputerTool
17
-
18
# NOTE(review): neither typing constant is referenced in the visible part of
# this module; presumably they were meant for throttled, chunked typing
# (delay per keystroke in ms / characters per group) — confirm before reuse.
TYPING_DELAY_MS = 12
TYPING_GROUP_SIZE = 50

# Action names accepted by ComputerTool.__call__ via its ``action`` keyword.
# NOTE(review): "middle_click" is listed here, but __call__ has no branch for
# it, so passing it ends in ToolError("Invalid action: middle_click").
Action = Literal[
    "key",
    "type",
    "mouse_move",
    "left_click",
    "left_click_drag",
    "right_click",
    "middle_click",
    "double_click",
    "screenshot",
    "cursor_position",
    "scroll",
]
34
-
35
-
36
class Resolution(TypedDict):
    """Typed dict holding a width/height pair.

    Not referenced by any other definition visible in this module.
    """

    # Horizontal size (units are not stated in this module).
    width: int
    # Vertical size (units are not stated in this module).
    height: int
39
-
40
-
41
class ScalingSource(StrEnum):
    """String-valued enum with the two members ``COMPUTER`` and ``API``.

    NOTE(review): presumably names which coordinate space a value comes from
    when scaling; nothing visible in this module uses it — confirm.
    """

    COMPUTER = "computer"
    API = "api"
44
-
45
-
46
class ComputerToolOptions(TypedDict):
    """Display options returned by ``ComputerTool.options``.

    ``ComputerTool.to_params`` spreads these keys into the tool's API
    parameter dictionary.
    """

    # Filled from ComputerTool.height.
    display_height_px: int
    # Filled from ComputerTool.width.
    display_width_px: int
    # Filled from ComputerTool.display_num; may be None.
    display_number: int | None
50
-
51
-
52
def chunks(s: str, chunk_size: int) -> list[str]:
    """Split ``s`` into consecutive slices of at most ``chunk_size`` characters.

    The final slice is shorter when ``len(s)`` is not a multiple of
    ``chunk_size``. An empty string yields an empty list.
    """
    pieces: list[str] = []
    for start in range(0, len(s), chunk_size):
        stop = start + chunk_size
        pieces.append(s[start:stop])
    return pieces
54
-
55
-
56
class ComputerTool(BaseComputerTool, BaseAnthropicTool):
    """
    A tool that allows the agent to interact with the screen, keyboard, and mouse of the current macOS computer.
    The tool parameters are defined by Anthropic and are not editable.

    Screen dimensions are fetched lazily from ``computer.interface`` the first
    time they are needed (by ``__call__`` or ``screenshot``) and cached on the
    instance. Screenshots returned to the caller are rescaled to those
    dimensions when the captured image has a different size.
    """

    name: Literal["computer"] = "computer"
    api_type: Literal["computer_20250124"] = "computer_20250124"
    width: int | None = None
    height: int | None = None
    display_num: int | None = None
    computer: Computer  # The CUA Computer instance
    logger = logging.getLogger(__name__)

    _screenshot_delay = 1.0  # macOS is generally faster than X11
    _scaling_enabled = True  # NOTE(review): not consulted anywhere in this class
    _ui_settle_delay = 0.5  # seconds to wait after an input action for UI changes

    def __init__(self, computer):
        """Initialize both base classes; dimensions stay unset until first use."""
        # Initialize the base computer tool first
        BaseComputerTool.__init__(self, computer)
        # Then initialize the Anthropic tool
        BaseAnthropicTool.__init__(self)

        # Will be initialized from the computer interface
        self.width = None
        self.height = None
        self.display_num = None

    @property
    def options(self) -> ComputerToolOptions:
        """Display options for the API parameters.

        Raises:
            RuntimeError: If the screen dimensions have not been initialized.
        """
        if self.width is None or self.height is None:
            raise RuntimeError(
                "Screen dimensions not initialized. Call initialize_dimensions() first."
            )
        return {
            "display_width_px": self.width,
            "display_height_px": self.height,
            "display_number": self.display_num,
        }

    def to_params(self) -> Dict[str, Any]:
        """Convert tool to API parameters.

        Returns:
            Dictionary with the tool name, API type, and the display options.

        Raises:
            RuntimeError: If the screen dimensions have not been initialized.
        """
        return {"name": self.name, "type": self.api_type, **self.options}

    async def initialize_dimensions(self):
        """Initialize screen dimensions from the computer interface.

        Raises:
            ToolError: If the interface reports a non-integer width or height.
        """
        display_size = await self.computer.interface.get_screen_size()
        width = display_size["width"]
        height = display_size["height"]
        # Explicit check instead of ``assert`` so it still runs under ``python -O``;
        # the attributes are only assigned once the values are known to be valid.
        if not isinstance(width, int) or not isinstance(height, int):
            raise ToolError(f"Screen dimensions must be integers, got {width!r}x{height!r}")
        self.width = width
        self.height = height
        self.logger.info(f"Initialized screen dimensions to {self.width}x{self.height}")

    async def _ensure_dimensions(self) -> None:
        """Fetch the screen dimensions once; raise ToolError if that fails."""
        if self.width is not None and self.height is not None:
            return
        try:
            await self.initialize_dimensions()
        except ToolError:
            # Already a tool-level error; do not wrap it a second time.
            raise
        except Exception as e:
            raise ToolError(f"Failed to initialize dimensions: {e}") from e
        if self.width is None or self.height is None:
            raise ToolError("Failed to initialize screen dimensions")

    def _action_failure(self, action: str, exc: Exception) -> ToolError:
        """Log an unexpected failure and build the ToolError reported for it."""
        self.logger.error(f"Error during {action} action: {str(exc)}")
        return ToolError(f"Failed to perform {action}: {str(exc)}")

    def _scale_to_screen(self, png: bytes, label: str = "image") -> bytes:
        """Return ``png`` re-encoded at the cached screen size, or unchanged if it already matches."""
        img = Image.open(io.BytesIO(png))
        if img.size == (self.width, self.height):
            return png
        self.logger.info(f"Scaling {label} from {img.size} to {self.width}x{self.height}")
        if not isinstance(self.width, int) or not isinstance(self.height, int):
            raise ToolError("Screen dimensions must be integers")
        resized = img.resize((self.width, self.height), Image.Resampling.LANCZOS)
        buffer = io.BytesIO()
        resized.save(buffer, format="PNG")
        return buffer.getvalue()

    async def __call__(
        self,
        *,
        action: Action,
        text: str | None = None,
        coordinate: tuple[int, int] | None = None,
        **kwargs,
    ):
        """Execute one computer action requested by the model.

        Args:
            action: Name of the action to perform.
            text: Key name / hotkey (for ``key``) or text to type (for ``type``).
            coordinate: Target ``(x, y)``; required for ``mouse_move`` and
                ``left_click_drag``, optional for the click actions.
            **kwargs: ``start_coordinate`` for ``left_click_drag``;
                ``direction`` (``"up"``/``"down"``, default ``"down"``) and
                ``amount`` (default 10) for ``scroll``.

        Returns:
            A ToolResult describing what was done; move/drag and screenshot
            results also carry a base64-encoded screenshot.

        Raises:
            ToolError: On invalid arguments, an unknown action, or a failure
                while performing the action.
        """
        await self._ensure_dimensions()

        if action in ("mouse_move", "left_click_drag"):
            if coordinate is None:
                raise ToolError(f"coordinate is required for {action}")
            if text is not None:
                raise ToolError(f"text is not accepted for {action}")
            if not isinstance(coordinate, (list, tuple)) or len(coordinate) != 2:
                raise ToolError(f"{coordinate} must be a tuple of length 2")
            if not all(isinstance(i, int) and i >= 0 for i in coordinate):
                raise ToolError(f"{coordinate} must be a tuple of non-negative ints")
            return await self._move_or_drag(action, coordinate, kwargs.get("start_coordinate"))

        if action in ("left_click", "right_click", "double_click"):
            return await self._click(action, coordinate)

        if action in ("key", "type"):
            if text is None:
                raise ToolError(f"text is required for {action}")
            if coordinate is not None:
                raise ToolError(f"coordinate is not accepted for {action}")
            if not isinstance(text, str):
                raise ToolError(f"{text} must be a string")
            return await self._keyboard(action, text)

        if action == "scroll":
            return await self._scroll(kwargs.get("direction", "down"), kwargs.get("amount", 10))

        if action == "screenshot":
            return await self.screenshot()

        if action == "cursor_position":
            x, y = await self.computer.interface.get_cursor_position()
            return ToolResult(output=f"X={int(x)},Y={int(y)}")

        raise ToolError(f"Invalid action: {action}")

    async def _move_or_drag(self, action: str, coordinate, start_coordinate) -> ToolResult:
        """Move the cursor (or drag from ``start_coordinate``) and return a post-action screenshot."""
        try:
            x, y = coordinate
            self.logger.info(f"Handling {action} action:")
            self.logger.info(f" Coordinates: ({x}, {y})")

            if action == "mouse_move":
                self.logger.info(f"Moving cursor to ({x}, {y})")
                await self.computer.interface.move_cursor(x, y)
                summary = "Moved cursor to"
            else:
                if not start_coordinate:
                    raise ToolError("start_coordinate is required for left_click_drag action")
                start_x, start_y = start_coordinate
                self.logger.info(f"Dragging from ({start_x}, {start_y}) to ({x}, {y})")
                await self.computer.interface.move_cursor(start_x, start_y)
                await self.computer.interface.drag_to(x, y)
                summary = "Dragged to"

            # Wait briefly for any UI changes
            await asyncio.sleep(self._ui_settle_delay)

            post_screenshot = self._scale_to_screen(
                await self.computer.interface.screenshot(), "post-action image"
            )
            return ToolResult(
                output=f"{summary} {x},{y}",
                base64_image=base64.b64encode(post_screenshot).decode(),
            )
        except ToolError:
            raise
        except Exception as e:
            raise self._action_failure(action, e) from e

    async def _click(self, action: str, coordinate) -> ToolResult:
        """Click at ``coordinate`` if given, otherwise at the current cursor position."""
        labels = {
            "left_click": ("Clicking", "left click"),
            "right_click": ("Right clicking", "right click"),
            "double_click": ("Double clicking", "double click"),
        }
        verb, noun = labels[action]

        if coordinate:
            x, y = coordinate
            self.logger.info(f"Handling {action} action:")
            self.logger.info(f" Coordinates: ({x}, {y})")
            where = f"({x}, {y})"
        else:
            where = "current position"

        try:
            interface = self.computer.interface
            if coordinate:
                self.logger.info(f"{verb} at ({x}, {y})")
                await interface.move_cursor(x, y)
            else:
                self.logger.info(f"Performing {noun} at current position")

            # The interface method names match the action names.
            await getattr(interface, action)()

            # Wait briefly for any UI changes
            await asyncio.sleep(self._ui_settle_delay)

            return ToolResult(output=f"Performed {action} at {where}")
        except ToolError:
            raise
        except Exception as e:
            raise self._action_failure(action, e) from e

    async def _keyboard(self, action: str, text: str) -> ToolResult:
        """Press a key / hotkey (``key``) or type a string (``type``)."""
        try:
            if action == "key":
                output_text = await self._press_key(text)
                result = ToolResult(output=f"Pressed key: {output_text}")
            else:
                self.logger.info(f"Typing text: {text}")
                await self.computer.interface.type_text(text)
                result = ToolResult(output=f"Typed text: {text}")

            # Wait briefly for UI changes
            await asyncio.sleep(self._ui_settle_delay)
            return result
        except ToolError:
            raise
        except Exception as e:
            raise self._action_failure(action, e) from e

    async def _press_key(self, text: str) -> str:
        """Send a single key or ``+``-separated hotkey; return the label to report."""
        interface = self.computer.interface
        lowered = text.lower()

        # Special handling for page up/down on macOS
        if lowered in ("pagedown", "page_down", "page down"):
            self.logger.info("Converting page down to fn+down for macOS")
            await interface.hotkey("fn", "down")
            return "fn+down"
        if lowered in ("pageup", "page_up", "page up"):
            self.logger.info("Converting page up to fn+up for macOS")
            await interface.hotkey("fn", "up")
            return "fn+up"
        if text in ("fn+down", "fn+up"):
            self.logger.info(f"Using {text} combination")
            await interface.hotkey(*text.split("+"))
            return text
        if "+" in text:
            # Handle hotkey combinations
            self.logger.info(f"Pressing hotkey combination: {text}")
            await interface.hotkey(*text.split("+"))
            return text

        # Handle single key press
        self.logger.info(f"Pressing key: {text}")
        try:
            await interface.press_key(text)
        except ValueError as e:
            raise ToolError(f"Invalid key: {text}. {str(e)}") from e
        return text

    async def _scroll(self, direction, amount) -> ToolResult:
        """Scroll up or down by ``amount`` steps."""
        if direction not in ["up", "down"]:
            raise ToolError(f"Invalid scroll direction: {direction}. Must be 'up' or 'down'.")

        try:
            if direction == "down":
                self.logger.info(f"Scrolling down, amount: {amount}")
                await self.computer.interface.scroll_down(amount)
            else:
                self.logger.info(f"Scrolling up, amount: {amount}")
                await self.computer.interface.scroll_up(amount)

            # Wait briefly for UI changes
            await asyncio.sleep(self._ui_settle_delay)

            return ToolResult(output=f"Scrolled {direction} by {amount} steps")
        except ToolError:
            raise
        except Exception as e:
            raise self._action_failure("scroll", e) from e

    async def screenshot(self):
        """Take a screenshot and return it as a base64-encoded string.

        Screen dimensions are initialized here if needed, so this method can be
        called (directly or through ``shell``) before any ``__call__``.

        Returns:
            ToolResult with ``base64_image`` set, or with ``error`` set if
            capturing or scaling failed (this method does not raise).
        """
        try:
            await self._ensure_dimensions()
            raw = await self.computer.interface.screenshot()
            scaled = self._scale_to_screen(raw)
            return ToolResult(base64_image=base64.b64encode(scaled).decode())
        except Exception as e:
            self.logger.error(f"Error taking screenshot: {str(e)}")
            return ToolResult(error=f"Failed to take screenshot: {str(e)}")

    async def shell(self, command: str, take_screenshot=False) -> ToolResult:
        """Run a shell command and return the output, error, and optionally a screenshot.

        Args:
            command: Command line passed to ``run``.
            take_screenshot: When true, wait ``_screenshot_delay`` seconds and
                attach a screenshot to the result.

        Returns:
            ToolResult with stdout/stderr; any failure is reported through the
            ``error`` field rather than raised.
        """
        try:
            _, stdout, stderr = await run(command)
            base64_image = None

            if take_screenshot:
                # delay to let things settle before taking a screenshot
                await asyncio.sleep(self._screenshot_delay)
                screenshot_result = await self.screenshot()
                if screenshot_result.error:
                    return ToolResult(
                        output=stdout,
                        error=f"{stderr}\nScreenshot error: {screenshot_result.error}",
                    )
                base64_image = screenshot_result.base64_image

            return ToolResult(output=stdout, error=stderr, base64_image=base64_image)

        except Exception as e:
            return ToolResult(error=f"Shell command failed: {str(e)}")