cua-agent: cua_agent-0.4.22-py3-none-any.whl → cua_agent-0.7.16-py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release.


This version of cua-agent might be problematic. Click here for more details.

Files changed (79)
  1. agent/__init__.py +4 -10
  2. agent/__main__.py +2 -1
  3. agent/adapters/__init__.py +4 -0
  4. agent/adapters/azure_ml_adapter.py +283 -0
  5. agent/adapters/cua_adapter.py +161 -0
  6. agent/adapters/huggingfacelocal_adapter.py +67 -125
  7. agent/adapters/human_adapter.py +116 -114
  8. agent/adapters/mlxvlm_adapter.py +110 -99
  9. agent/adapters/models/__init__.py +41 -0
  10. agent/adapters/models/generic.py +78 -0
  11. agent/adapters/models/internvl.py +290 -0
  12. agent/adapters/models/opencua.py +115 -0
  13. agent/adapters/models/qwen2_5_vl.py +78 -0
  14. agent/agent.py +337 -185
  15. agent/callbacks/__init__.py +9 -4
  16. agent/callbacks/base.py +45 -31
  17. agent/callbacks/budget_manager.py +22 -10
  18. agent/callbacks/image_retention.py +54 -98
  19. agent/callbacks/logging.py +55 -42
  20. agent/callbacks/operator_validator.py +35 -33
  21. agent/callbacks/otel.py +291 -0
  22. agent/callbacks/pii_anonymization.py +19 -16
  23. agent/callbacks/prompt_instructions.py +47 -0
  24. agent/callbacks/telemetry.py +99 -61
  25. agent/callbacks/trajectory_saver.py +95 -69
  26. agent/cli.py +269 -119
  27. agent/computers/__init__.py +14 -9
  28. agent/computers/base.py +32 -19
  29. agent/computers/cua.py +52 -25
  30. agent/computers/custom.py +78 -71
  31. agent/decorators.py +23 -14
  32. agent/human_tool/__init__.py +2 -7
  33. agent/human_tool/__main__.py +6 -2
  34. agent/human_tool/server.py +48 -37
  35. agent/human_tool/ui.py +359 -235
  36. agent/integrations/hud/__init__.py +38 -99
  37. agent/integrations/hud/agent.py +369 -0
  38. agent/integrations/hud/proxy.py +166 -52
  39. agent/loops/__init__.py +44 -14
  40. agent/loops/anthropic.py +579 -492
  41. agent/loops/base.py +19 -15
  42. agent/loops/composed_grounded.py +136 -150
  43. agent/loops/fara/__init__.py +8 -0
  44. agent/loops/fara/config.py +506 -0
  45. agent/loops/fara/helpers.py +357 -0
  46. agent/loops/fara/schema.py +143 -0
  47. agent/loops/gelato.py +183 -0
  48. agent/loops/gemini.py +935 -0
  49. agent/loops/generic_vlm.py +601 -0
  50. agent/loops/glm45v.py +140 -135
  51. agent/loops/gta1.py +48 -51
  52. agent/loops/holo.py +218 -0
  53. agent/loops/internvl.py +180 -0
  54. agent/loops/moondream3.py +493 -0
  55. agent/loops/omniparser.py +326 -226
  56. agent/loops/openai.py +50 -51
  57. agent/loops/opencua.py +134 -0
  58. agent/loops/uiins.py +175 -0
  59. agent/loops/uitars.py +247 -206
  60. agent/loops/uitars2.py +951 -0
  61. agent/playground/__init__.py +5 -0
  62. agent/playground/server.py +301 -0
  63. agent/proxy/examples.py +61 -57
  64. agent/proxy/handlers.py +46 -39
  65. agent/responses.py +447 -347
  66. agent/tools/__init__.py +24 -0
  67. agent/tools/base.py +253 -0
  68. agent/tools/browser_tool.py +423 -0
  69. agent/types.py +11 -5
  70. agent/ui/__init__.py +1 -1
  71. agent/ui/__main__.py +1 -1
  72. agent/ui/gradio/app.py +25 -22
  73. agent/ui/gradio/ui_components.py +314 -167
  74. cua_agent-0.7.16.dist-info/METADATA +85 -0
  75. cua_agent-0.7.16.dist-info/RECORD +79 -0
  76. {cua_agent-0.4.22.dist-info → cua_agent-0.7.16.dist-info}/WHEEL +1 -1
  77. cua_agent-0.4.22.dist-info/METADATA +0 -436
  78. cua_agent-0.4.22.dist-info/RECORD +0 -51
  79. {cua_agent-0.4.22.dist-info → cua_agent-0.7.16.dist-info}/entry_points.txt +0 -0
@@ -2,26 +2,28 @@
2
2
  Trajectory saving callback handler for ComputerAgent.
3
3
  """
4
4
 
5
- import os
5
+ import base64
6
+ import io
6
7
  import json
8
+ import os
7
9
  import uuid
10
+ from copy import deepcopy
8
11
  from datetime import datetime
9
- import base64
10
12
  from pathlib import Path
11
- from typing import List, Dict, Any, Optional, Union, override
13
+ from typing import Any, Dict, List, Optional, Union, override
14
+
12
15
  from PIL import Image, ImageDraw
13
- import io
14
- from copy import deepcopy
15
16
 
16
17
  from .base import AsyncCallbackHandler
17
18
 
19
+
18
20
  def sanitize_image_urls(data: Any) -> Any:
19
21
  """
20
22
  Recursively search for 'image_url' keys and set their values to '[omitted]'.
21
-
23
+
22
24
  Args:
23
25
  data: Any data structure (dict, list, or primitive type)
24
-
26
+
25
27
  Returns:
26
28
  A deep copy of the data with all 'image_url' values replaced with '[omitted]'
27
29
  """
@@ -35,17 +37,19 @@ def sanitize_image_urls(data: Any) -> Any:
35
37
  # Recursively sanitize the value
36
38
  sanitized[key] = sanitize_image_urls(value)
37
39
  return sanitized
38
-
40
+
39
41
  elif isinstance(data, list):
40
42
  # Recursively sanitize each item in the list
41
43
  return [sanitize_image_urls(item) for item in data]
42
-
44
+
43
45
  else:
44
46
  # For primitive types (str, int, bool, None, etc.), return as-is
45
47
  return data
46
48
 
47
49
 
48
- def extract_computer_call_outputs(items: List[Dict[str, Any]], screenshot_dir: Optional[Path]) -> List[Dict[str, Any]]:
50
+ def extract_computer_call_outputs(
51
+ items: List[Dict[str, Any]], screenshot_dir: Optional[Path]
52
+ ) -> List[Dict[str, Any]]:
49
53
  """
50
54
  Save any base64-encoded screenshots from computer_call_output entries to files and
51
55
  replace their image_url with the saved file path when a call_id is present.
@@ -103,18 +107,21 @@ def extract_computer_call_outputs(items: List[Dict[str, Any]], screenshot_dir: O
103
107
  updated.append(msg)
104
108
  return updated
105
109
 
110
+
106
111
  class TrajectorySaverCallback(AsyncCallbackHandler):
107
112
  """
108
113
  Callback handler that saves agent trajectories to disk.
109
-
114
+
110
115
  Saves each run as a separate trajectory with unique ID, and each turn
111
116
  within the trajectory gets its own folder with screenshots and responses.
112
117
  """
113
-
114
- def __init__(self, trajectory_dir: str, reset_on_run: bool = True, screenshot_dir: Optional[str] = None):
118
+
119
+ def __init__(
120
+ self, trajectory_dir: str, reset_on_run: bool = True, screenshot_dir: Optional[str] = None
121
+ ):
115
122
  """
116
123
  Initialize trajectory saver.
117
-
124
+
118
125
  Args:
119
126
  trajectory_dir: Base directory to save trajectories
120
127
  reset_on_run: If True, reset trajectory_id/turn/artifact on each run.
@@ -129,15 +136,19 @@ class TrajectorySaverCallback(AsyncCallbackHandler):
129
136
  self.reset_on_run = reset_on_run
130
137
  # Optional directory to store extracted screenshots from metadata/new_items
131
138
  self.screenshot_dir: Optional[Path] = Path(screenshot_dir) if screenshot_dir else None
132
-
139
+
133
140
  # Ensure trajectory directory exists
134
141
  self.trajectory_dir.mkdir(parents=True, exist_ok=True)
135
142
 
143
+ # Ensure screenshot directory exists if specified
144
+ if self.screenshot_dir:
145
+ self.screenshot_dir.mkdir(parents=True, exist_ok=True)
146
+
136
147
  def _get_turn_dir(self) -> Path:
137
148
  """Get the directory for the current turn."""
138
149
  if not self.trajectory_id:
139
150
  raise ValueError("Trajectory not initialized - call _on_run_start first")
140
-
151
+
141
152
  # format: trajectory_id/turn_000
142
153
  turn_dir = self.trajectory_dir / self.trajectory_id / f"turn_{self.current_turn:03d}"
143
154
  turn_dir.mkdir(parents=True, exist_ok=True)
@@ -166,6 +177,7 @@ class TrajectorySaverCallback(AsyncCallbackHandler):
166
177
 
167
178
  def _update_usage(self, usage: Dict[str, Any]) -> None:
168
179
  """Update total usage statistics."""
180
+
169
181
  def add_dicts(target: Dict[str, Any], source: Dict[str, Any]) -> None:
170
182
  for key, value in source.items():
171
183
  if isinstance(value, dict):
@@ -176,18 +188,21 @@ class TrajectorySaverCallback(AsyncCallbackHandler):
176
188
  if key not in target:
177
189
  target[key] = 0
178
190
  target[key] += value
191
+
179
192
  add_dicts(self.total_usage, usage)
180
-
193
+
181
194
  @override
182
195
  async def on_run_start(self, kwargs: Dict[str, Any], old_items: List[Dict[str, Any]]) -> None:
183
196
  """Initialize trajectory tracking for a new run."""
184
197
  model = kwargs.get("model", "unknown")
185
-
198
+
186
199
  # Only reset trajectory state if reset_on_run is True or no trajectory exists
187
200
  if self.reset_on_run or not self.trajectory_id:
188
201
  model_name_short = model.split("+")[-1].split("/")[-1].lower()[:16]
189
202
  if "+" in model:
190
203
  model_name_short = model.split("+")[0].lower()[:4] + "_" + model_name_short
204
+ # strip non-alphanumeric characters from model_name_short
205
+ model_name_short = "".join(c for c in model_name_short if c.isalnum() or c == "_")
191
206
 
192
207
  # id format: yyyy-mm-dd_model_hhmmss_uuid[:4]
193
208
  now = datetime.now()
@@ -196,11 +211,11 @@ class TrajectorySaverCallback(AsyncCallbackHandler):
196
211
  self.current_artifact = 0
197
212
  self.model = model
198
213
  self.total_usage = {}
199
-
214
+
200
215
  # Create trajectory directory
201
216
  trajectory_path = self.trajectory_dir / self.trajectory_id
202
217
  trajectory_path.mkdir(parents=True, exist_ok=True)
203
-
218
+
204
219
  # Save trajectory metadata (optionally extract screenshots to screenshot_dir)
205
220
  kwargs_to_save = kwargs.copy()
206
221
  try:
@@ -217,7 +232,7 @@ class TrajectorySaverCallback(AsyncCallbackHandler):
217
232
  "status": "running",
218
233
  "kwargs": kwargs_to_save,
219
234
  }
220
-
235
+
221
236
  with open(trajectory_path / "metadata.json", "w") as f:
222
237
  json.dump(metadata, f, indent=2)
223
238
  else:
@@ -225,22 +240,27 @@ class TrajectorySaverCallback(AsyncCallbackHandler):
225
240
  self.model = model
226
241
 
227
242
  @override
228
- async def on_run_end(self, kwargs: Dict[str, Any], old_items: List[Dict[str, Any]], new_items: List[Dict[str, Any]]) -> None:
243
+ async def on_run_end(
244
+ self,
245
+ kwargs: Dict[str, Any],
246
+ old_items: List[Dict[str, Any]],
247
+ new_items: List[Dict[str, Any]],
248
+ ) -> None:
229
249
  """Finalize run tracking by updating metadata with completion status, usage, and new items."""
230
250
  if not self.trajectory_id:
231
251
  return
232
-
252
+
233
253
  # Update metadata with completion status, total usage, and new items
234
254
  trajectory_path = self.trajectory_dir / self.trajectory_id
235
255
  metadata_path = trajectory_path / "metadata.json"
236
-
256
+
237
257
  # Read existing metadata
238
258
  if metadata_path.exists():
239
259
  with open(metadata_path, "r") as f:
240
260
  metadata = json.load(f)
241
261
  else:
242
262
  metadata = {}
243
-
263
+
244
264
  # Update metadata with completion info
245
265
  # Optionally extract screenshots from new_items before persisting
246
266
  new_items_to_save = new_items
@@ -249,32 +269,34 @@ class TrajectorySaverCallback(AsyncCallbackHandler):
249
269
  except Exception:
250
270
  pass
251
271
 
252
- metadata.update({
253
- "status": "completed",
254
- "completed_at": str(uuid.uuid1().time),
255
- "total_usage": self.total_usage,
256
- "new_items": new_items_to_save,
257
- "total_turns": self.current_turn
258
- })
259
-
272
+ metadata.update(
273
+ {
274
+ "status": "completed",
275
+ "completed_at": str(uuid.uuid1().time),
276
+ "total_usage": self.total_usage,
277
+ "new_items": new_items_to_save,
278
+ "total_turns": self.current_turn,
279
+ }
280
+ )
281
+
260
282
  # Save updated metadata
261
283
  with open(metadata_path, "w") as f:
262
284
  json.dump(metadata, f, indent=2)
263
-
264
- @override
285
+
286
+ @override
265
287
  async def on_api_start(self, kwargs: Dict[str, Any]) -> None:
266
288
  if not self.trajectory_id:
267
289
  return
268
-
269
- self._save_artifact("api_start", { "kwargs": kwargs })
270
-
290
+
291
+ self._save_artifact("api_start", {"kwargs": kwargs})
292
+
271
293
  @override
272
294
  async def on_api_end(self, kwargs: Dict[str, Any], result: Any) -> None:
273
295
  """Save API call result."""
274
296
  if not self.trajectory_id:
275
297
  return
276
-
277
- self._save_artifact("api_result", { "kwargs": kwargs, "result": result })
298
+
299
+ self._save_artifact("api_result", {"kwargs": kwargs, "result": result})
278
300
 
279
301
  @override
280
302
  async def on_screenshot(self, screenshot: Union[str, bytes], name: str = "screenshot") -> None:
@@ -293,77 +315,83 @@ class TrajectorySaverCallback(AsyncCallbackHandler):
293
315
  """Save responses to the current turn directory and update usage statistics."""
294
316
  if not self.trajectory_id:
295
317
  return
296
-
318
+
297
319
  # Save responses
298
320
  turn_dir = self._get_turn_dir()
299
321
  response_data = {
300
322
  "timestamp": str(uuid.uuid1().time),
301
323
  "model": self.model,
302
324
  "kwargs": kwargs,
303
- "response": responses
325
+ "response": responses,
304
326
  }
305
-
327
+
306
328
  self._save_artifact("agent_response", response_data)
307
-
329
+
308
330
  # Increment turn counter
309
331
  self.current_turn += 1
310
332
 
311
333
  def _draw_crosshair_on_image(self, image_bytes: bytes, x: int, y: int) -> bytes:
312
334
  """
313
335
  Draw a red dot and crosshair at the specified coordinates on the image.
314
-
336
+
315
337
  Args:
316
338
  image_bytes: The original image as bytes
317
339
  x: X coordinate for the crosshair
318
340
  y: Y coordinate for the crosshair
319
-
341
+
320
342
  Returns:
321
343
  Modified image as bytes with red dot and crosshair
322
344
  """
323
345
  # Open the image
324
346
  image = Image.open(io.BytesIO(image_bytes))
325
347
  draw = ImageDraw.Draw(image)
326
-
348
+
327
349
  # Draw crosshair lines (red, 2px thick)
328
350
  crosshair_size = 20
329
351
  line_width = 2
330
352
  color = "red"
331
-
353
+
332
354
  # Horizontal line
333
355
  draw.line([(x - crosshair_size, y), (x + crosshair_size, y)], fill=color, width=line_width)
334
356
  # Vertical line
335
357
  draw.line([(x, y - crosshair_size), (x, y + crosshair_size)], fill=color, width=line_width)
336
-
358
+
337
359
  # Draw center dot (filled circle)
338
360
  dot_radius = 3
339
- draw.ellipse([(x - dot_radius, y - dot_radius), (x + dot_radius, y + dot_radius)], fill=color)
340
-
361
+ draw.ellipse(
362
+ [(x - dot_radius, y - dot_radius), (x + dot_radius, y + dot_radius)], fill=color
363
+ )
364
+
341
365
  # Convert back to bytes
342
366
  output = io.BytesIO()
343
- image.save(output, format='PNG')
367
+ image.save(output, format="PNG")
344
368
  return output.getvalue()
345
369
 
346
370
  @override
347
- async def on_computer_call_end(self, item: Dict[str, Any], result: List[Dict[str, Any]]) -> None:
371
+ async def on_computer_call_end(
372
+ self, item: Dict[str, Any], result: List[Dict[str, Any]]
373
+ ) -> None:
348
374
  """
349
375
  Called when a computer call has completed.
350
376
  Saves screenshots and computer call output.
351
377
  """
352
378
  if not self.trajectory_id:
353
379
  return
354
-
355
- self._save_artifact("computer_call_result", { "item": item, "result": result })
356
-
380
+
381
+ self._save_artifact("computer_call_result", {"item": item, "result": result})
382
+
357
383
  # Check if action has x/y coordinates and there's a screenshot in the result
358
384
  action = item.get("action", {})
359
385
  if "x" in action and "y" in action:
360
386
  # Look for screenshot in the result
361
387
  for result_item in result:
362
- if (result_item.get("type") == "computer_call_output" and
363
- result_item.get("output", {}).get("type") == "input_image"):
364
-
388
+ if (
389
+ result_item.get("type") == "computer_call_output"
390
+ and result_item.get("output", {}).get("type") == "input_image"
391
+ ):
392
+
365
393
  image_url = result_item["output"]["image_url"]
366
-
394
+
367
395
  # Extract base64 image data
368
396
  if image_url.startswith("data:image/"):
369
397
  # Format: data:image/png;base64,<base64_data>
@@ -371,26 +399,24 @@ class TrajectorySaverCallback(AsyncCallbackHandler):
371
399
  else:
372
400
  # Assume it's just base64 data
373
401
  base64_data = image_url
374
-
402
+
375
403
  try:
376
404
  # Decode the image
377
405
  image_bytes = base64.b64decode(base64_data)
378
-
406
+
379
407
  # Draw crosshair at the action coordinates
380
408
  annotated_image = self._draw_crosshair_on_image(
381
- image_bytes,
382
- int(action["x"]),
383
- int(action["y"])
409
+ image_bytes, int(action["x"]), int(action["y"])
384
410
  )
385
-
411
+
386
412
  # Save as screenshot_action
387
413
  self._save_artifact("screenshot_action", annotated_image)
388
-
414
+
389
415
  except Exception as e:
390
416
  # If annotation fails, just log and continue
391
417
  print(f"Failed to annotate screenshot: {e}")
392
-
418
+
393
419
  break # Only process the first screenshot found
394
420
 
395
421
  # Increment turn counter
396
- self.current_turn += 1
422
+ self.current_turn += 1