cua-agent 0.4.14 (py3-none-any.whl) → 0.7.16 (py3-none-any.whl)

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release.


This version of cua-agent might be problematic. Click here for more details.

Files changed (82)
  1. agent/__init__.py +4 -19
  2. agent/__main__.py +2 -1
  3. agent/adapters/__init__.py +6 -0
  4. agent/adapters/azure_ml_adapter.py +283 -0
  5. agent/adapters/cua_adapter.py +161 -0
  6. agent/adapters/huggingfacelocal_adapter.py +67 -125
  7. agent/adapters/human_adapter.py +116 -114
  8. agent/adapters/mlxvlm_adapter.py +370 -0
  9. agent/adapters/models/__init__.py +41 -0
  10. agent/adapters/models/generic.py +78 -0
  11. agent/adapters/models/internvl.py +290 -0
  12. agent/adapters/models/opencua.py +115 -0
  13. agent/adapters/models/qwen2_5_vl.py +78 -0
  14. agent/agent.py +431 -241
  15. agent/callbacks/__init__.py +10 -3
  16. agent/callbacks/base.py +45 -31
  17. agent/callbacks/budget_manager.py +22 -10
  18. agent/callbacks/image_retention.py +54 -98
  19. agent/callbacks/logging.py +55 -42
  20. agent/callbacks/operator_validator.py +140 -0
  21. agent/callbacks/otel.py +291 -0
  22. agent/callbacks/pii_anonymization.py +19 -16
  23. agent/callbacks/prompt_instructions.py +47 -0
  24. agent/callbacks/telemetry.py +106 -69
  25. agent/callbacks/trajectory_saver.py +178 -70
  26. agent/cli.py +269 -119
  27. agent/computers/__init__.py +14 -9
  28. agent/computers/base.py +32 -19
  29. agent/computers/cua.py +52 -25
  30. agent/computers/custom.py +78 -71
  31. agent/decorators.py +23 -14
  32. agent/human_tool/__init__.py +2 -7
  33. agent/human_tool/__main__.py +6 -2
  34. agent/human_tool/server.py +48 -37
  35. agent/human_tool/ui.py +359 -235
  36. agent/integrations/hud/__init__.py +164 -74
  37. agent/integrations/hud/agent.py +338 -342
  38. agent/integrations/hud/proxy.py +297 -0
  39. agent/loops/__init__.py +44 -14
  40. agent/loops/anthropic.py +590 -492
  41. agent/loops/base.py +19 -15
  42. agent/loops/composed_grounded.py +142 -144
  43. agent/loops/fara/__init__.py +8 -0
  44. agent/loops/fara/config.py +506 -0
  45. agent/loops/fara/helpers.py +357 -0
  46. agent/loops/fara/schema.py +143 -0
  47. agent/loops/gelato.py +183 -0
  48. agent/loops/gemini.py +935 -0
  49. agent/loops/generic_vlm.py +601 -0
  50. agent/loops/glm45v.py +140 -135
  51. agent/loops/gta1.py +48 -51
  52. agent/loops/holo.py +218 -0
  53. agent/loops/internvl.py +180 -0
  54. agent/loops/moondream3.py +493 -0
  55. agent/loops/omniparser.py +326 -226
  56. agent/loops/openai.py +63 -56
  57. agent/loops/opencua.py +134 -0
  58. agent/loops/uiins.py +175 -0
  59. agent/loops/uitars.py +262 -212
  60. agent/loops/uitars2.py +951 -0
  61. agent/playground/__init__.py +5 -0
  62. agent/playground/server.py +301 -0
  63. agent/proxy/examples.py +196 -0
  64. agent/proxy/handlers.py +255 -0
  65. agent/responses.py +486 -339
  66. agent/tools/__init__.py +24 -0
  67. agent/tools/base.py +253 -0
  68. agent/tools/browser_tool.py +423 -0
  69. agent/types.py +20 -5
  70. agent/ui/__init__.py +1 -1
  71. agent/ui/__main__.py +1 -1
  72. agent/ui/gradio/app.py +25 -22
  73. agent/ui/gradio/ui_components.py +314 -167
  74. cua_agent-0.7.16.dist-info/METADATA +85 -0
  75. cua_agent-0.7.16.dist-info/RECORD +79 -0
  76. {cua_agent-0.4.14.dist-info → cua_agent-0.7.16.dist-info}/WHEEL +1 -1
  77. agent/integrations/hud/adapter.py +0 -121
  78. agent/integrations/hud/computer_handler.py +0 -187
  79. agent/telemetry.py +0 -142
  80. cua_agent-0.4.14.dist-info/METADATA +0 -436
  81. cua_agent-0.4.14.dist-info/RECORD +0 -50
  82. {cua_agent-0.4.14.dist-info → cua_agent-0.7.16.dist-info}/entry_points.txt +0 -0
@@ -2,24 +2,28 @@
2
2
  Trajectory saving callback handler for ComputerAgent.
3
3
  """
4
4
 
5
- import os
5
+ import base64
6
+ import io
6
7
  import json
8
+ import os
7
9
  import uuid
10
+ from copy import deepcopy
8
11
  from datetime import datetime
9
- import base64
10
12
  from pathlib import Path
11
- from typing import List, Dict, Any, Optional, Union, override
13
+ from typing import Any, Dict, List, Optional, Union, override
14
+
12
15
  from PIL import Image, ImageDraw
13
- import io
16
+
14
17
  from .base import AsyncCallbackHandler
15
18
 
19
+
16
20
  def sanitize_image_urls(data: Any) -> Any:
17
21
  """
18
22
  Recursively search for 'image_url' keys and set their values to '[omitted]'.
19
-
23
+
20
24
  Args:
21
25
  data: Any data structure (dict, list, or primitive type)
22
-
26
+
23
27
  Returns:
24
28
  A deep copy of the data with all 'image_url' values replaced with '[omitted]'
25
29
  """
@@ -33,28 +37,91 @@ def sanitize_image_urls(data: Any) -> Any:
33
37
  # Recursively sanitize the value
34
38
  sanitized[key] = sanitize_image_urls(value)
35
39
  return sanitized
36
-
40
+
37
41
  elif isinstance(data, list):
38
42
  # Recursively sanitize each item in the list
39
43
  return [sanitize_image_urls(item) for item in data]
40
-
44
+
41
45
  else:
42
46
  # For primitive types (str, int, bool, None, etc.), return as-is
43
47
  return data
44
48
 
45
49
 
50
+ def extract_computer_call_outputs(
51
+ items: List[Dict[str, Any]], screenshot_dir: Optional[Path]
52
+ ) -> List[Dict[str, Any]]:
53
+ """
54
+ Save any base64-encoded screenshots from computer_call_output entries to files and
55
+ replace their image_url with the saved file path when a call_id is present.
56
+
57
+ Only operates if screenshot_dir is provided and exists; otherwise returns items unchanged.
58
+
59
+ Args:
60
+ items: List of message/result dicts potentially containing computer_call_output entries
61
+ screenshot_dir: Directory to write screenshots into
62
+
63
+ Returns:
64
+ A new list with updated image_url fields when applicable.
65
+ """
66
+ if not items:
67
+ return items
68
+ if not screenshot_dir or not screenshot_dir.exists():
69
+ return items
70
+
71
+ updated: List[Dict[str, Any]] = []
72
+ for item in items:
73
+ # work on a shallow copy; deep copy nested 'output' if we modify it
74
+ msg = dict(item)
75
+ try:
76
+ if msg.get("type") == "computer_call_output":
77
+ call_id = msg.get("call_id")
78
+ output = msg.get("output", {})
79
+ image_url = output.get("image_url")
80
+ if call_id and isinstance(image_url, str) and image_url.startswith("data:"):
81
+ # derive extension from MIME type e.g. data:image/png;base64,
82
+ try:
83
+ ext = image_url.split(";", 1)[0].split("/")[-1]
84
+ if not ext:
85
+ ext = "png"
86
+ except Exception:
87
+ ext = "png"
88
+ out_path = screenshot_dir / f"{call_id}.{ext}"
89
+ # write file if it doesn't exist
90
+ if not out_path.exists():
91
+ try:
92
+ b64_payload = image_url.split(",", 1)[1]
93
+ img_bytes = base64.b64decode(b64_payload)
94
+ out_path.parent.mkdir(parents=True, exist_ok=True)
95
+ with open(out_path, "wb") as f:
96
+ f.write(img_bytes)
97
+ except Exception:
98
+ # if anything fails, skip modifying this message
99
+ pass
100
+ # update image_url to file path
101
+ new_output = dict(output)
102
+ new_output["image_url"] = str(out_path)
103
+ msg["output"] = new_output
104
+ except Exception:
105
+ # do not block on malformed entries; keep original
106
+ pass
107
+ updated.append(msg)
108
+ return updated
109
+
110
+
46
111
  class TrajectorySaverCallback(AsyncCallbackHandler):
47
112
  """
48
113
  Callback handler that saves agent trajectories to disk.
49
-
114
+
50
115
  Saves each run as a separate trajectory with unique ID, and each turn
51
116
  within the trajectory gets its own folder with screenshots and responses.
52
117
  """
53
-
54
- def __init__(self, trajectory_dir: str, reset_on_run: bool = True):
118
+
119
+ def __init__(
120
+ self, trajectory_dir: str, reset_on_run: bool = True, screenshot_dir: Optional[str] = None
121
+ ):
55
122
  """
56
123
  Initialize trajectory saver.
57
-
124
+
58
125
  Args:
59
126
  trajectory_dir: Base directory to save trajectories
60
127
  reset_on_run: If True, reset trajectory_id/turn/artifact on each run.
@@ -67,15 +134,21 @@ class TrajectorySaverCallback(AsyncCallbackHandler):
67
134
  self.model: Optional[str] = None
68
135
  self.total_usage: Dict[str, Any] = {}
69
136
  self.reset_on_run = reset_on_run
70
-
137
+ # Optional directory to store extracted screenshots from metadata/new_items
138
+ self.screenshot_dir: Optional[Path] = Path(screenshot_dir) if screenshot_dir else None
139
+
71
140
  # Ensure trajectory directory exists
72
141
  self.trajectory_dir.mkdir(parents=True, exist_ok=True)
73
-
142
+
143
+ # Ensure screenshot directory exists if specified
144
+ if self.screenshot_dir:
145
+ self.screenshot_dir.mkdir(parents=True, exist_ok=True)
146
+
74
147
  def _get_turn_dir(self) -> Path:
75
148
  """Get the directory for the current turn."""
76
149
  if not self.trajectory_id:
77
150
  raise ValueError("Trajectory not initialized - call _on_run_start first")
78
-
151
+
79
152
  # format: trajectory_id/turn_000
80
153
  turn_dir = self.trajectory_dir / self.trajectory_id / f"turn_{self.current_turn:03d}"
81
154
  turn_dir.mkdir(parents=True, exist_ok=True)
@@ -94,12 +167,17 @@ class TrajectorySaverCallback(AsyncCallbackHandler):
94
167
  # format: turn_000/0000_name.json
95
168
  artifact_filename = f"{self.current_artifact:04d}_{name}"
96
169
  artifact_path = turn_dir / f"{artifact_filename}.json"
170
+ # add created_at
171
+ if isinstance(artifact, dict):
172
+ artifact = artifact.copy()
173
+ artifact["created_at"] = str(uuid.uuid1().time)
97
174
  with open(artifact_path, "w") as f:
98
175
  json.dump(sanitize_image_urls(artifact), f, indent=2)
99
176
  self.current_artifact += 1
100
177
 
101
178
  def _update_usage(self, usage: Dict[str, Any]) -> None:
102
179
  """Update total usage statistics."""
180
+
103
181
  def add_dicts(target: Dict[str, Any], source: Dict[str, Any]) -> None:
104
182
  for key, value in source.items():
105
183
  if isinstance(value, dict):
@@ -110,18 +188,21 @@ class TrajectorySaverCallback(AsyncCallbackHandler):
110
188
  if key not in target:
111
189
  target[key] = 0
112
190
  target[key] += value
191
+
113
192
  add_dicts(self.total_usage, usage)
114
-
193
+
115
194
  @override
116
195
  async def on_run_start(self, kwargs: Dict[str, Any], old_items: List[Dict[str, Any]]) -> None:
117
196
  """Initialize trajectory tracking for a new run."""
118
197
  model = kwargs.get("model", "unknown")
119
-
198
+
120
199
  # Only reset trajectory state if reset_on_run is True or no trajectory exists
121
200
  if self.reset_on_run or not self.trajectory_id:
122
201
  model_name_short = model.split("+")[-1].split("/")[-1].lower()[:16]
123
202
  if "+" in model:
124
203
  model_name_short = model.split("+")[0].lower()[:4] + "_" + model_name_short
204
+ # strip non-alphanumeric characters from model_name_short
205
+ model_name_short = "".join(c for c in model_name_short if c.isalnum() or c == "_")
125
206
 
126
207
  # id format: yyyy-mm-dd_model_hhmmss_uuid[:4]
127
208
  now = datetime.now()
@@ -130,19 +211,28 @@ class TrajectorySaverCallback(AsyncCallbackHandler):
130
211
  self.current_artifact = 0
131
212
  self.model = model
132
213
  self.total_usage = {}
133
-
214
+
134
215
  # Create trajectory directory
135
216
  trajectory_path = self.trajectory_dir / self.trajectory_id
136
217
  trajectory_path.mkdir(parents=True, exist_ok=True)
137
-
138
- # Save trajectory metadata
218
+
219
+ # Save trajectory metadata (optionally extract screenshots to screenshot_dir)
220
+ kwargs_to_save = kwargs.copy()
221
+ try:
222
+ if "messages" in kwargs_to_save:
223
+ kwargs_to_save["messages"] = extract_computer_call_outputs(
224
+ kwargs_to_save["messages"], self.screenshot_dir
225
+ )
226
+ except Exception:
227
+ # If extraction fails, fall back to original messages
228
+ pass
139
229
  metadata = {
140
230
  "trajectory_id": self.trajectory_id,
141
231
  "created_at": str(uuid.uuid1().time),
142
232
  "status": "running",
143
- "kwargs": kwargs,
233
+ "kwargs": kwargs_to_save,
144
234
  }
145
-
235
+
146
236
  with open(trajectory_path / "metadata.json", "w") as f:
147
237
  json.dump(metadata, f, indent=2)
148
238
  else:
@@ -150,49 +240,63 @@ class TrajectorySaverCallback(AsyncCallbackHandler):
150
240
  self.model = model
151
241
 
152
242
  @override
153
- async def on_run_end(self, kwargs: Dict[str, Any], old_items: List[Dict[str, Any]], new_items: List[Dict[str, Any]]) -> None:
243
+ async def on_run_end(
244
+ self,
245
+ kwargs: Dict[str, Any],
246
+ old_items: List[Dict[str, Any]],
247
+ new_items: List[Dict[str, Any]],
248
+ ) -> None:
154
249
  """Finalize run tracking by updating metadata with completion status, usage, and new items."""
155
250
  if not self.trajectory_id:
156
251
  return
157
-
252
+
158
253
  # Update metadata with completion status, total usage, and new items
159
254
  trajectory_path = self.trajectory_dir / self.trajectory_id
160
255
  metadata_path = trajectory_path / "metadata.json"
161
-
256
+
162
257
  # Read existing metadata
163
258
  if metadata_path.exists():
164
259
  with open(metadata_path, "r") as f:
165
260
  metadata = json.load(f)
166
261
  else:
167
262
  metadata = {}
168
-
263
+
169
264
  # Update metadata with completion info
170
- metadata.update({
171
- "status": "completed",
172
- "completed_at": str(uuid.uuid1().time),
173
- "total_usage": self.total_usage,
174
- "new_items": sanitize_image_urls(new_items),
175
- "total_turns": self.current_turn
176
- })
177
-
265
+ # Optionally extract screenshots from new_items before persisting
266
+ new_items_to_save = new_items
267
+ try:
268
+ new_items_to_save = extract_computer_call_outputs(new_items, self.screenshot_dir)
269
+ except Exception:
270
+ pass
271
+
272
+ metadata.update(
273
+ {
274
+ "status": "completed",
275
+ "completed_at": str(uuid.uuid1().time),
276
+ "total_usage": self.total_usage,
277
+ "new_items": new_items_to_save,
278
+ "total_turns": self.current_turn,
279
+ }
280
+ )
281
+
178
282
  # Save updated metadata
179
283
  with open(metadata_path, "w") as f:
180
284
  json.dump(metadata, f, indent=2)
181
-
182
- @override
285
+
286
+ @override
183
287
  async def on_api_start(self, kwargs: Dict[str, Any]) -> None:
184
288
  if not self.trajectory_id:
185
289
  return
186
-
187
- self._save_artifact("api_start", { "kwargs": kwargs })
188
-
290
+
291
+ self._save_artifact("api_start", {"kwargs": kwargs})
292
+
189
293
  @override
190
294
  async def on_api_end(self, kwargs: Dict[str, Any], result: Any) -> None:
191
295
  """Save API call result."""
192
296
  if not self.trajectory_id:
193
297
  return
194
-
195
- self._save_artifact("api_result", { "kwargs": kwargs, "result": result })
298
+
299
+ self._save_artifact("api_result", {"kwargs": kwargs, "result": result})
196
300
 
197
301
  @override
198
302
  async def on_screenshot(self, screenshot: Union[str, bytes], name: str = "screenshot") -> None:
@@ -211,77 +315,83 @@ class TrajectorySaverCallback(AsyncCallbackHandler):
211
315
  """Save responses to the current turn directory and update usage statistics."""
212
316
  if not self.trajectory_id:
213
317
  return
214
-
318
+
215
319
  # Save responses
216
320
  turn_dir = self._get_turn_dir()
217
321
  response_data = {
218
322
  "timestamp": str(uuid.uuid1().time),
219
323
  "model": self.model,
220
324
  "kwargs": kwargs,
221
- "response": responses
325
+ "response": responses,
222
326
  }
223
-
327
+
224
328
  self._save_artifact("agent_response", response_data)
225
-
329
+
226
330
  # Increment turn counter
227
331
  self.current_turn += 1
228
332
 
229
333
  def _draw_crosshair_on_image(self, image_bytes: bytes, x: int, y: int) -> bytes:
230
334
  """
231
335
  Draw a red dot and crosshair at the specified coordinates on the image.
232
-
336
+
233
337
  Args:
234
338
  image_bytes: The original image as bytes
235
339
  x: X coordinate for the crosshair
236
340
  y: Y coordinate for the crosshair
237
-
341
+
238
342
  Returns:
239
343
  Modified image as bytes with red dot and crosshair
240
344
  """
241
345
  # Open the image
242
346
  image = Image.open(io.BytesIO(image_bytes))
243
347
  draw = ImageDraw.Draw(image)
244
-
348
+
245
349
  # Draw crosshair lines (red, 2px thick)
246
350
  crosshair_size = 20
247
351
  line_width = 2
248
352
  color = "red"
249
-
353
+
250
354
  # Horizontal line
251
355
  draw.line([(x - crosshair_size, y), (x + crosshair_size, y)], fill=color, width=line_width)
252
356
  # Vertical line
253
357
  draw.line([(x, y - crosshair_size), (x, y + crosshair_size)], fill=color, width=line_width)
254
-
358
+
255
359
  # Draw center dot (filled circle)
256
360
  dot_radius = 3
257
- draw.ellipse([(x - dot_radius, y - dot_radius), (x + dot_radius, y + dot_radius)], fill=color)
258
-
361
+ draw.ellipse(
362
+ [(x - dot_radius, y - dot_radius), (x + dot_radius, y + dot_radius)], fill=color
363
+ )
364
+
259
365
  # Convert back to bytes
260
366
  output = io.BytesIO()
261
- image.save(output, format='PNG')
367
+ image.save(output, format="PNG")
262
368
  return output.getvalue()
263
369
 
264
370
  @override
265
- async def on_computer_call_end(self, item: Dict[str, Any], result: List[Dict[str, Any]]) -> None:
371
+ async def on_computer_call_end(
372
+ self, item: Dict[str, Any], result: List[Dict[str, Any]]
373
+ ) -> None:
266
374
  """
267
375
  Called when a computer call has completed.
268
376
  Saves screenshots and computer call output.
269
377
  """
270
378
  if not self.trajectory_id:
271
379
  return
272
-
273
- self._save_artifact("computer_call_result", { "item": item, "result": result })
274
-
380
+
381
+ self._save_artifact("computer_call_result", {"item": item, "result": result})
382
+
275
383
  # Check if action has x/y coordinates and there's a screenshot in the result
276
384
  action = item.get("action", {})
277
385
  if "x" in action and "y" in action:
278
386
  # Look for screenshot in the result
279
387
  for result_item in result:
280
- if (result_item.get("type") == "computer_call_output" and
281
- result_item.get("output", {}).get("type") == "input_image"):
282
-
388
+ if (
389
+ result_item.get("type") == "computer_call_output"
390
+ and result_item.get("output", {}).get("type") == "input_image"
391
+ ):
392
+
283
393
  image_url = result_item["output"]["image_url"]
284
-
394
+
285
395
  # Extract base64 image data
286
396
  if image_url.startswith("data:image/"):
287
397
  # Format: data:image/png;base64,<base64_data>
@@ -289,26 +399,24 @@ class TrajectorySaverCallback(AsyncCallbackHandler):
289
399
  else:
290
400
  # Assume it's just base64 data
291
401
  base64_data = image_url
292
-
402
+
293
403
  try:
294
404
  # Decode the image
295
405
  image_bytes = base64.b64decode(base64_data)
296
-
406
+
297
407
  # Draw crosshair at the action coordinates
298
408
  annotated_image = self._draw_crosshair_on_image(
299
- image_bytes,
300
- int(action["x"]),
301
- int(action["y"])
409
+ image_bytes, int(action["x"]), int(action["y"])
302
410
  )
303
-
411
+
304
412
  # Save as screenshot_action
305
413
  self._save_artifact("screenshot_action", annotated_image)
306
-
414
+
307
415
  except Exception as e:
308
416
  # If annotation fails, just log and continue
309
417
  print(f"Failed to annotate screenshot: {e}")
310
-
418
+
311
419
  break # Only process the first screenshot found
312
420
 
313
421
  # Increment turn counter
314
- self.current_turn += 1
422
+ self.current_turn += 1