cua-agent 0.4.33__py3-none-any.whl → 0.4.35__py3-none-any.whl

This diff shows the changes between publicly available package versions that have been released to one of the supported registries. It is provided for informational purposes only and reflects the package contents as they appear in their respective public registries.

Potentially problematic release.


This version of cua-agent might be problematic. Click here for more details.

Files changed (61)
  1. agent/__init__.py +4 -10
  2. agent/__main__.py +2 -1
  3. agent/adapters/huggingfacelocal_adapter.py +54 -61
  4. agent/adapters/human_adapter.py +116 -114
  5. agent/adapters/mlxvlm_adapter.py +110 -99
  6. agent/adapters/models/__init__.py +14 -6
  7. agent/adapters/models/generic.py +7 -4
  8. agent/adapters/models/internvl.py +66 -30
  9. agent/adapters/models/opencua.py +23 -8
  10. agent/adapters/models/qwen2_5_vl.py +7 -4
  11. agent/agent.py +184 -158
  12. agent/callbacks/__init__.py +4 -4
  13. agent/callbacks/base.py +45 -31
  14. agent/callbacks/budget_manager.py +22 -10
  15. agent/callbacks/image_retention.py +18 -13
  16. agent/callbacks/logging.py +55 -42
  17. agent/callbacks/operator_validator.py +3 -1
  18. agent/callbacks/pii_anonymization.py +19 -16
  19. agent/callbacks/telemetry.py +67 -61
  20. agent/callbacks/trajectory_saver.py +90 -70
  21. agent/cli.py +115 -110
  22. agent/computers/__init__.py +13 -8
  23. agent/computers/base.py +26 -17
  24. agent/computers/cua.py +27 -23
  25. agent/computers/custom.py +72 -69
  26. agent/decorators.py +23 -14
  27. agent/human_tool/__init__.py +2 -7
  28. agent/human_tool/__main__.py +6 -2
  29. agent/human_tool/server.py +48 -37
  30. agent/human_tool/ui.py +235 -185
  31. agent/integrations/hud/__init__.py +15 -21
  32. agent/integrations/hud/agent.py +101 -83
  33. agent/integrations/hud/proxy.py +90 -57
  34. agent/loops/__init__.py +25 -21
  35. agent/loops/anthropic.py +537 -483
  36. agent/loops/base.py +13 -14
  37. agent/loops/composed_grounded.py +135 -149
  38. agent/loops/gemini.py +31 -12
  39. agent/loops/glm45v.py +135 -133
  40. agent/loops/gta1.py +47 -50
  41. agent/loops/holo.py +4 -2
  42. agent/loops/internvl.py +6 -11
  43. agent/loops/moondream3.py +49 -20
  44. agent/loops/omniparser.py +212 -209
  45. agent/loops/openai.py +49 -50
  46. agent/loops/opencua.py +29 -41
  47. agent/loops/qwen.py +475 -0
  48. agent/loops/uitars.py +237 -202
  49. agent/proxy/examples.py +54 -50
  50. agent/proxy/handlers.py +27 -34
  51. agent/responses.py +330 -330
  52. agent/types.py +11 -5
  53. agent/ui/__init__.py +1 -1
  54. agent/ui/__main__.py +1 -1
  55. agent/ui/gradio/app.py +23 -18
  56. agent/ui/gradio/ui_components.py +310 -161
  57. {cua_agent-0.4.33.dist-info → cua_agent-0.4.35.dist-info}/METADATA +22 -10
  58. cua_agent-0.4.35.dist-info/RECORD +64 -0
  59. cua_agent-0.4.33.dist-info/RECORD +0 -63
  60. {cua_agent-0.4.33.dist-info → cua_agent-0.4.35.dist-info}/WHEEL +0 -0
  61. {cua_agent-0.4.33.dist-info → cua_agent-0.4.35.dist-info}/entry_points.txt +0 -0
@@ -2,26 +2,28 @@
2
2
  Trajectory saving callback handler for ComputerAgent.
3
3
  """
4
4
 
5
- import os
5
+ import base64
6
+ import io
6
7
  import json
8
+ import os
7
9
  import uuid
10
+ from copy import deepcopy
8
11
  from datetime import datetime
9
- import base64
10
12
  from pathlib import Path
11
- from typing import List, Dict, Any, Optional, Union, override
13
+ from typing import Any, Dict, List, Optional, Union, override
14
+
12
15
  from PIL import Image, ImageDraw
13
- import io
14
- from copy import deepcopy
15
16
 
16
17
  from .base import AsyncCallbackHandler
17
18
 
19
+
18
20
  def sanitize_image_urls(data: Any) -> Any:
19
21
  """
20
22
  Recursively search for 'image_url' keys and set their values to '[omitted]'.
21
-
23
+
22
24
  Args:
23
25
  data: Any data structure (dict, list, or primitive type)
24
-
26
+
25
27
  Returns:
26
28
  A deep copy of the data with all 'image_url' values replaced with '[omitted]'
27
29
  """
@@ -35,17 +37,19 @@ def sanitize_image_urls(data: Any) -> Any:
35
37
  # Recursively sanitize the value
36
38
  sanitized[key] = sanitize_image_urls(value)
37
39
  return sanitized
38
-
40
+
39
41
  elif isinstance(data, list):
40
42
  # Recursively sanitize each item in the list
41
43
  return [sanitize_image_urls(item) for item in data]
42
-
44
+
43
45
  else:
44
46
  # For primitive types (str, int, bool, None, etc.), return as-is
45
47
  return data
46
48
 
47
49
 
48
- def extract_computer_call_outputs(items: List[Dict[str, Any]], screenshot_dir: Optional[Path]) -> List[Dict[str, Any]]:
50
+ def extract_computer_call_outputs(
51
+ items: List[Dict[str, Any]], screenshot_dir: Optional[Path]
52
+ ) -> List[Dict[str, Any]]:
49
53
  """
50
54
  Save any base64-encoded screenshots from computer_call_output entries to files and
51
55
  replace their image_url with the saved file path when a call_id is present.
@@ -103,18 +107,21 @@ def extract_computer_call_outputs(items: List[Dict[str, Any]], screenshot_dir: O
103
107
  updated.append(msg)
104
108
  return updated
105
109
 
110
+
106
111
  class TrajectorySaverCallback(AsyncCallbackHandler):
107
112
  """
108
113
  Callback handler that saves agent trajectories to disk.
109
-
114
+
110
115
  Saves each run as a separate trajectory with unique ID, and each turn
111
116
  within the trajectory gets its own folder with screenshots and responses.
112
117
  """
113
-
114
- def __init__(self, trajectory_dir: str, reset_on_run: bool = True, screenshot_dir: Optional[str] = None):
118
+
119
+ def __init__(
120
+ self, trajectory_dir: str, reset_on_run: bool = True, screenshot_dir: Optional[str] = None
121
+ ):
115
122
  """
116
123
  Initialize trajectory saver.
117
-
124
+
118
125
  Args:
119
126
  trajectory_dir: Base directory to save trajectories
120
127
  reset_on_run: If True, reset trajectory_id/turn/artifact on each run.
@@ -129,7 +136,7 @@ class TrajectorySaverCallback(AsyncCallbackHandler):
129
136
  self.reset_on_run = reset_on_run
130
137
  # Optional directory to store extracted screenshots from metadata/new_items
131
138
  self.screenshot_dir: Optional[Path] = Path(screenshot_dir) if screenshot_dir else None
132
-
139
+
133
140
  # Ensure trajectory directory exists
134
141
  self.trajectory_dir.mkdir(parents=True, exist_ok=True)
135
142
 
@@ -137,7 +144,7 @@ class TrajectorySaverCallback(AsyncCallbackHandler):
137
144
  """Get the directory for the current turn."""
138
145
  if not self.trajectory_id:
139
146
  raise ValueError("Trajectory not initialized - call _on_run_start first")
140
-
147
+
141
148
  # format: trajectory_id/turn_000
142
149
  turn_dir = self.trajectory_dir / self.trajectory_id / f"turn_{self.current_turn:03d}"
143
150
  turn_dir.mkdir(parents=True, exist_ok=True)
@@ -166,6 +173,7 @@ class TrajectorySaverCallback(AsyncCallbackHandler):
166
173
 
167
174
  def _update_usage(self, usage: Dict[str, Any]) -> None:
168
175
  """Update total usage statistics."""
176
+
169
177
  def add_dicts(target: Dict[str, Any], source: Dict[str, Any]) -> None:
170
178
  for key, value in source.items():
171
179
  if isinstance(value, dict):
@@ -176,20 +184,21 @@ class TrajectorySaverCallback(AsyncCallbackHandler):
176
184
  if key not in target:
177
185
  target[key] = 0
178
186
  target[key] += value
187
+
179
188
  add_dicts(self.total_usage, usage)
180
-
189
+
181
190
  @override
182
191
  async def on_run_start(self, kwargs: Dict[str, Any], old_items: List[Dict[str, Any]]) -> None:
183
192
  """Initialize trajectory tracking for a new run."""
184
193
  model = kwargs.get("model", "unknown")
185
-
194
+
186
195
  # Only reset trajectory state if reset_on_run is True or no trajectory exists
187
196
  if self.reset_on_run or not self.trajectory_id:
188
197
  model_name_short = model.split("+")[-1].split("/")[-1].lower()[:16]
189
198
  if "+" in model:
190
199
  model_name_short = model.split("+")[0].lower()[:4] + "_" + model_name_short
191
200
  # strip non-alphanumeric characters from model_name_short
192
- model_name_short = ''.join(c for c in model_name_short if c.isalnum() or c == '_')
201
+ model_name_short = "".join(c for c in model_name_short if c.isalnum() or c == "_")
193
202
 
194
203
  # id format: yyyy-mm-dd_model_hhmmss_uuid[:4]
195
204
  now = datetime.now()
@@ -198,11 +207,11 @@ class TrajectorySaverCallback(AsyncCallbackHandler):
198
207
  self.current_artifact = 0
199
208
  self.model = model
200
209
  self.total_usage = {}
201
-
210
+
202
211
  # Create trajectory directory
203
212
  trajectory_path = self.trajectory_dir / self.trajectory_id
204
213
  trajectory_path.mkdir(parents=True, exist_ok=True)
205
-
214
+
206
215
  # Save trajectory metadata (optionally extract screenshots to screenshot_dir)
207
216
  kwargs_to_save = kwargs.copy()
208
217
  try:
@@ -219,7 +228,7 @@ class TrajectorySaverCallback(AsyncCallbackHandler):
219
228
  "status": "running",
220
229
  "kwargs": kwargs_to_save,
221
230
  }
222
-
231
+
223
232
  with open(trajectory_path / "metadata.json", "w") as f:
224
233
  json.dump(metadata, f, indent=2)
225
234
  else:
@@ -227,22 +236,27 @@ class TrajectorySaverCallback(AsyncCallbackHandler):
227
236
  self.model = model
228
237
 
229
238
  @override
230
- async def on_run_end(self, kwargs: Dict[str, Any], old_items: List[Dict[str, Any]], new_items: List[Dict[str, Any]]) -> None:
239
+ async def on_run_end(
240
+ self,
241
+ kwargs: Dict[str, Any],
242
+ old_items: List[Dict[str, Any]],
243
+ new_items: List[Dict[str, Any]],
244
+ ) -> None:
231
245
  """Finalize run tracking by updating metadata with completion status, usage, and new items."""
232
246
  if not self.trajectory_id:
233
247
  return
234
-
248
+
235
249
  # Update metadata with completion status, total usage, and new items
236
250
  trajectory_path = self.trajectory_dir / self.trajectory_id
237
251
  metadata_path = trajectory_path / "metadata.json"
238
-
252
+
239
253
  # Read existing metadata
240
254
  if metadata_path.exists():
241
255
  with open(metadata_path, "r") as f:
242
256
  metadata = json.load(f)
243
257
  else:
244
258
  metadata = {}
245
-
259
+
246
260
  # Update metadata with completion info
247
261
  # Optionally extract screenshots from new_items before persisting
248
262
  new_items_to_save = new_items
@@ -251,32 +265,34 @@ class TrajectorySaverCallback(AsyncCallbackHandler):
251
265
  except Exception:
252
266
  pass
253
267
 
254
- metadata.update({
255
- "status": "completed",
256
- "completed_at": str(uuid.uuid1().time),
257
- "total_usage": self.total_usage,
258
- "new_items": new_items_to_save,
259
- "total_turns": self.current_turn
260
- })
261
-
268
+ metadata.update(
269
+ {
270
+ "status": "completed",
271
+ "completed_at": str(uuid.uuid1().time),
272
+ "total_usage": self.total_usage,
273
+ "new_items": new_items_to_save,
274
+ "total_turns": self.current_turn,
275
+ }
276
+ )
277
+
262
278
  # Save updated metadata
263
279
  with open(metadata_path, "w") as f:
264
280
  json.dump(metadata, f, indent=2)
265
-
266
- @override
281
+
282
+ @override
267
283
  async def on_api_start(self, kwargs: Dict[str, Any]) -> None:
268
284
  if not self.trajectory_id:
269
285
  return
270
-
271
- self._save_artifact("api_start", { "kwargs": kwargs })
272
-
286
+
287
+ self._save_artifact("api_start", {"kwargs": kwargs})
288
+
273
289
  @override
274
290
  async def on_api_end(self, kwargs: Dict[str, Any], result: Any) -> None:
275
291
  """Save API call result."""
276
292
  if not self.trajectory_id:
277
293
  return
278
-
279
- self._save_artifact("api_result", { "kwargs": kwargs, "result": result })
294
+
295
+ self._save_artifact("api_result", {"kwargs": kwargs, "result": result})
280
296
 
281
297
  @override
282
298
  async def on_screenshot(self, screenshot: Union[str, bytes], name: str = "screenshot") -> None:
@@ -295,77 +311,83 @@ class TrajectorySaverCallback(AsyncCallbackHandler):
295
311
  """Save responses to the current turn directory and update usage statistics."""
296
312
  if not self.trajectory_id:
297
313
  return
298
-
314
+
299
315
  # Save responses
300
316
  turn_dir = self._get_turn_dir()
301
317
  response_data = {
302
318
  "timestamp": str(uuid.uuid1().time),
303
319
  "model": self.model,
304
320
  "kwargs": kwargs,
305
- "response": responses
321
+ "response": responses,
306
322
  }
307
-
323
+
308
324
  self._save_artifact("agent_response", response_data)
309
-
325
+
310
326
  # Increment turn counter
311
327
  self.current_turn += 1
312
328
 
313
329
  def _draw_crosshair_on_image(self, image_bytes: bytes, x: int, y: int) -> bytes:
314
330
  """
315
331
  Draw a red dot and crosshair at the specified coordinates on the image.
316
-
332
+
317
333
  Args:
318
334
  image_bytes: The original image as bytes
319
335
  x: X coordinate for the crosshair
320
336
  y: Y coordinate for the crosshair
321
-
337
+
322
338
  Returns:
323
339
  Modified image as bytes with red dot and crosshair
324
340
  """
325
341
  # Open the image
326
342
  image = Image.open(io.BytesIO(image_bytes))
327
343
  draw = ImageDraw.Draw(image)
328
-
344
+
329
345
  # Draw crosshair lines (red, 2px thick)
330
346
  crosshair_size = 20
331
347
  line_width = 2
332
348
  color = "red"
333
-
349
+
334
350
  # Horizontal line
335
351
  draw.line([(x - crosshair_size, y), (x + crosshair_size, y)], fill=color, width=line_width)
336
352
  # Vertical line
337
353
  draw.line([(x, y - crosshair_size), (x, y + crosshair_size)], fill=color, width=line_width)
338
-
354
+
339
355
  # Draw center dot (filled circle)
340
356
  dot_radius = 3
341
- draw.ellipse([(x - dot_radius, y - dot_radius), (x + dot_radius, y + dot_radius)], fill=color)
342
-
357
+ draw.ellipse(
358
+ [(x - dot_radius, y - dot_radius), (x + dot_radius, y + dot_radius)], fill=color
359
+ )
360
+
343
361
  # Convert back to bytes
344
362
  output = io.BytesIO()
345
- image.save(output, format='PNG')
363
+ image.save(output, format="PNG")
346
364
  return output.getvalue()
347
365
 
348
366
  @override
349
- async def on_computer_call_end(self, item: Dict[str, Any], result: List[Dict[str, Any]]) -> None:
367
+ async def on_computer_call_end(
368
+ self, item: Dict[str, Any], result: List[Dict[str, Any]]
369
+ ) -> None:
350
370
  """
351
371
  Called when a computer call has completed.
352
372
  Saves screenshots and computer call output.
353
373
  """
354
374
  if not self.trajectory_id:
355
375
  return
356
-
357
- self._save_artifact("computer_call_result", { "item": item, "result": result })
358
-
376
+
377
+ self._save_artifact("computer_call_result", {"item": item, "result": result})
378
+
359
379
  # Check if action has x/y coordinates and there's a screenshot in the result
360
380
  action = item.get("action", {})
361
381
  if "x" in action and "y" in action:
362
382
  # Look for screenshot in the result
363
383
  for result_item in result:
364
- if (result_item.get("type") == "computer_call_output" and
365
- result_item.get("output", {}).get("type") == "input_image"):
366
-
384
+ if (
385
+ result_item.get("type") == "computer_call_output"
386
+ and result_item.get("output", {}).get("type") == "input_image"
387
+ ):
388
+
367
389
  image_url = result_item["output"]["image_url"]
368
-
390
+
369
391
  # Extract base64 image data
370
392
  if image_url.startswith("data:image/"):
371
393
  # Format: data:image/png;base64,<base64_data>
@@ -373,26 +395,24 @@ class TrajectorySaverCallback(AsyncCallbackHandler):
373
395
  else:
374
396
  # Assume it's just base64 data
375
397
  base64_data = image_url
376
-
398
+
377
399
  try:
378
400
  # Decode the image
379
401
  image_bytes = base64.b64decode(base64_data)
380
-
402
+
381
403
  # Draw crosshair at the action coordinates
382
404
  annotated_image = self._draw_crosshair_on_image(
383
- image_bytes,
384
- int(action["x"]),
385
- int(action["y"])
405
+ image_bytes, int(action["x"]), int(action["y"])
386
406
  )
387
-
407
+
388
408
  # Save as screenshot_action
389
409
  self._save_artifact("screenshot_action", annotated_image)
390
-
410
+
391
411
  except Exception as e:
392
412
  # If annotation fails, just log and continue
393
413
  print(f"Failed to annotate screenshot: {e}")
394
-
414
+
395
415
  break # Only process the first screenshot found
396
416
 
397
417
  # Increment turn counter
398
- self.current_turn += 1
418
+ self.current_turn += 1