cua-agent: cua_agent-0.4.34-py3-none-any.whl → cua_agent-0.4.36-py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Potentially problematic release.
This version of cua-agent might be problematic. Click here for more details.
- agent/__init__.py +4 -10
- agent/__main__.py +2 -1
- agent/adapters/huggingfacelocal_adapter.py +54 -61
- agent/adapters/human_adapter.py +116 -114
- agent/adapters/mlxvlm_adapter.py +110 -99
- agent/adapters/models/__init__.py +14 -6
- agent/adapters/models/generic.py +7 -4
- agent/adapters/models/internvl.py +66 -30
- agent/adapters/models/opencua.py +23 -8
- agent/adapters/models/qwen2_5_vl.py +7 -4
- agent/agent.py +184 -158
- agent/callbacks/__init__.py +4 -4
- agent/callbacks/base.py +45 -31
- agent/callbacks/budget_manager.py +22 -10
- agent/callbacks/image_retention.py +18 -13
- agent/callbacks/logging.py +55 -42
- agent/callbacks/operator_validator.py +3 -1
- agent/callbacks/pii_anonymization.py +19 -16
- agent/callbacks/telemetry.py +67 -61
- agent/callbacks/trajectory_saver.py +90 -70
- agent/cli.py +115 -110
- agent/computers/__init__.py +13 -8
- agent/computers/base.py +32 -19
- agent/computers/cua.py +33 -25
- agent/computers/custom.py +78 -71
- agent/decorators.py +23 -14
- agent/human_tool/__init__.py +2 -7
- agent/human_tool/__main__.py +6 -2
- agent/human_tool/server.py +48 -37
- agent/human_tool/ui.py +235 -185
- agent/integrations/hud/__init__.py +15 -21
- agent/integrations/hud/agent.py +101 -83
- agent/integrations/hud/proxy.py +90 -57
- agent/loops/__init__.py +25 -21
- agent/loops/anthropic.py +537 -483
- agent/loops/base.py +13 -14
- agent/loops/composed_grounded.py +135 -149
- agent/loops/gemini.py +31 -12
- agent/loops/glm45v.py +135 -133
- agent/loops/gta1.py +47 -50
- agent/loops/holo.py +4 -2
- agent/loops/internvl.py +6 -11
- agent/loops/moondream3.py +36 -12
- agent/loops/omniparser.py +215 -210
- agent/loops/openai.py +49 -50
- agent/loops/opencua.py +29 -41
- agent/loops/qwen.py +510 -0
- agent/loops/uitars.py +237 -202
- agent/proxy/examples.py +54 -50
- agent/proxy/handlers.py +27 -34
- agent/responses.py +330 -330
- agent/types.py +11 -5
- agent/ui/__init__.py +1 -1
- agent/ui/__main__.py +1 -1
- agent/ui/gradio/app.py +23 -18
- agent/ui/gradio/ui_components.py +310 -161
- {cua_agent-0.4.34.dist-info → cua_agent-0.4.36.dist-info}/METADATA +18 -10
- cua_agent-0.4.36.dist-info/RECORD +64 -0
- cua_agent-0.4.34.dist-info/RECORD +0 -63
- {cua_agent-0.4.34.dist-info → cua_agent-0.4.36.dist-info}/WHEEL +0 -0
- {cua_agent-0.4.34.dist-info → cua_agent-0.4.36.dist-info}/entry_points.txt +0 -0
|
@@ -2,26 +2,28 @@
|
|
|
2
2
|
Trajectory saving callback handler for ComputerAgent.
|
|
3
3
|
"""
|
|
4
4
|
|
|
5
|
-
import os
|
|
5
|
+
import base64
|
|
6
|
+
import io
|
|
6
7
|
import json
|
|
8
|
+
import os
|
|
7
9
|
import uuid
|
|
10
|
+
from copy import deepcopy
|
|
8
11
|
from datetime import datetime
|
|
9
|
-
import base64
|
|
10
12
|
from pathlib import Path
|
|
11
|
-
from typing import
|
|
13
|
+
from typing import Any, Dict, List, Optional, Union, override
|
|
14
|
+
|
|
12
15
|
from PIL import Image, ImageDraw
|
|
13
|
-
import io
|
|
14
|
-
from copy import deepcopy
|
|
15
16
|
|
|
16
17
|
from .base import AsyncCallbackHandler
|
|
17
18
|
|
|
19
|
+
|
|
18
20
|
def sanitize_image_urls(data: Any) -> Any:
|
|
19
21
|
"""
|
|
20
22
|
Recursively search for 'image_url' keys and set their values to '[omitted]'.
|
|
21
|
-
|
|
23
|
+
|
|
22
24
|
Args:
|
|
23
25
|
data: Any data structure (dict, list, or primitive type)
|
|
24
|
-
|
|
26
|
+
|
|
25
27
|
Returns:
|
|
26
28
|
A deep copy of the data with all 'image_url' values replaced with '[omitted]'
|
|
27
29
|
"""
|
|
@@ -35,17 +37,19 @@ def sanitize_image_urls(data: Any) -> Any:
|
|
|
35
37
|
# Recursively sanitize the value
|
|
36
38
|
sanitized[key] = sanitize_image_urls(value)
|
|
37
39
|
return sanitized
|
|
38
|
-
|
|
40
|
+
|
|
39
41
|
elif isinstance(data, list):
|
|
40
42
|
# Recursively sanitize each item in the list
|
|
41
43
|
return [sanitize_image_urls(item) for item in data]
|
|
42
|
-
|
|
44
|
+
|
|
43
45
|
else:
|
|
44
46
|
# For primitive types (str, int, bool, None, etc.), return as-is
|
|
45
47
|
return data
|
|
46
48
|
|
|
47
49
|
|
|
48
|
-
def extract_computer_call_outputs(
|
|
50
|
+
def extract_computer_call_outputs(
|
|
51
|
+
items: List[Dict[str, Any]], screenshot_dir: Optional[Path]
|
|
52
|
+
) -> List[Dict[str, Any]]:
|
|
49
53
|
"""
|
|
50
54
|
Save any base64-encoded screenshots from computer_call_output entries to files and
|
|
51
55
|
replace their image_url with the saved file path when a call_id is present.
|
|
@@ -103,18 +107,21 @@ def extract_computer_call_outputs(items: List[Dict[str, Any]], screenshot_dir: O
|
|
|
103
107
|
updated.append(msg)
|
|
104
108
|
return updated
|
|
105
109
|
|
|
110
|
+
|
|
106
111
|
class TrajectorySaverCallback(AsyncCallbackHandler):
|
|
107
112
|
"""
|
|
108
113
|
Callback handler that saves agent trajectories to disk.
|
|
109
|
-
|
|
114
|
+
|
|
110
115
|
Saves each run as a separate trajectory with unique ID, and each turn
|
|
111
116
|
within the trajectory gets its own folder with screenshots and responses.
|
|
112
117
|
"""
|
|
113
|
-
|
|
114
|
-
def __init__(
|
|
118
|
+
|
|
119
|
+
def __init__(
|
|
120
|
+
self, trajectory_dir: str, reset_on_run: bool = True, screenshot_dir: Optional[str] = None
|
|
121
|
+
):
|
|
115
122
|
"""
|
|
116
123
|
Initialize trajectory saver.
|
|
117
|
-
|
|
124
|
+
|
|
118
125
|
Args:
|
|
119
126
|
trajectory_dir: Base directory to save trajectories
|
|
120
127
|
reset_on_run: If True, reset trajectory_id/turn/artifact on each run.
|
|
@@ -129,7 +136,7 @@ class TrajectorySaverCallback(AsyncCallbackHandler):
|
|
|
129
136
|
self.reset_on_run = reset_on_run
|
|
130
137
|
# Optional directory to store extracted screenshots from metadata/new_items
|
|
131
138
|
self.screenshot_dir: Optional[Path] = Path(screenshot_dir) if screenshot_dir else None
|
|
132
|
-
|
|
139
|
+
|
|
133
140
|
# Ensure trajectory directory exists
|
|
134
141
|
self.trajectory_dir.mkdir(parents=True, exist_ok=True)
|
|
135
142
|
|
|
@@ -137,7 +144,7 @@ class TrajectorySaverCallback(AsyncCallbackHandler):
|
|
|
137
144
|
"""Get the directory for the current turn."""
|
|
138
145
|
if not self.trajectory_id:
|
|
139
146
|
raise ValueError("Trajectory not initialized - call _on_run_start first")
|
|
140
|
-
|
|
147
|
+
|
|
141
148
|
# format: trajectory_id/turn_000
|
|
142
149
|
turn_dir = self.trajectory_dir / self.trajectory_id / f"turn_{self.current_turn:03d}"
|
|
143
150
|
turn_dir.mkdir(parents=True, exist_ok=True)
|
|
@@ -166,6 +173,7 @@ class TrajectorySaverCallback(AsyncCallbackHandler):
|
|
|
166
173
|
|
|
167
174
|
def _update_usage(self, usage: Dict[str, Any]) -> None:
|
|
168
175
|
"""Update total usage statistics."""
|
|
176
|
+
|
|
169
177
|
def add_dicts(target: Dict[str, Any], source: Dict[str, Any]) -> None:
|
|
170
178
|
for key, value in source.items():
|
|
171
179
|
if isinstance(value, dict):
|
|
@@ -176,20 +184,21 @@ class TrajectorySaverCallback(AsyncCallbackHandler):
|
|
|
176
184
|
if key not in target:
|
|
177
185
|
target[key] = 0
|
|
178
186
|
target[key] += value
|
|
187
|
+
|
|
179
188
|
add_dicts(self.total_usage, usage)
|
|
180
|
-
|
|
189
|
+
|
|
181
190
|
@override
|
|
182
191
|
async def on_run_start(self, kwargs: Dict[str, Any], old_items: List[Dict[str, Any]]) -> None:
|
|
183
192
|
"""Initialize trajectory tracking for a new run."""
|
|
184
193
|
model = kwargs.get("model", "unknown")
|
|
185
|
-
|
|
194
|
+
|
|
186
195
|
# Only reset trajectory state if reset_on_run is True or no trajectory exists
|
|
187
196
|
if self.reset_on_run or not self.trajectory_id:
|
|
188
197
|
model_name_short = model.split("+")[-1].split("/")[-1].lower()[:16]
|
|
189
198
|
if "+" in model:
|
|
190
199
|
model_name_short = model.split("+")[0].lower()[:4] + "_" + model_name_short
|
|
191
200
|
# strip non-alphanumeric characters from model_name_short
|
|
192
|
-
model_name_short = ''.join(c for c in model_name_short if c.isalnum() or c == '_')
|
|
201
|
+
model_name_short = "".join(c for c in model_name_short if c.isalnum() or c == "_")
|
|
193
202
|
|
|
194
203
|
# id format: yyyy-mm-dd_model_hhmmss_uuid[:4]
|
|
195
204
|
now = datetime.now()
|
|
@@ -198,11 +207,11 @@ class TrajectorySaverCallback(AsyncCallbackHandler):
|
|
|
198
207
|
self.current_artifact = 0
|
|
199
208
|
self.model = model
|
|
200
209
|
self.total_usage = {}
|
|
201
|
-
|
|
210
|
+
|
|
202
211
|
# Create trajectory directory
|
|
203
212
|
trajectory_path = self.trajectory_dir / self.trajectory_id
|
|
204
213
|
trajectory_path.mkdir(parents=True, exist_ok=True)
|
|
205
|
-
|
|
214
|
+
|
|
206
215
|
# Save trajectory metadata (optionally extract screenshots to screenshot_dir)
|
|
207
216
|
kwargs_to_save = kwargs.copy()
|
|
208
217
|
try:
|
|
@@ -219,7 +228,7 @@ class TrajectorySaverCallback(AsyncCallbackHandler):
|
|
|
219
228
|
"status": "running",
|
|
220
229
|
"kwargs": kwargs_to_save,
|
|
221
230
|
}
|
|
222
|
-
|
|
231
|
+
|
|
223
232
|
with open(trajectory_path / "metadata.json", "w") as f:
|
|
224
233
|
json.dump(metadata, f, indent=2)
|
|
225
234
|
else:
|
|
@@ -227,22 +236,27 @@ class TrajectorySaverCallback(AsyncCallbackHandler):
|
|
|
227
236
|
self.model = model
|
|
228
237
|
|
|
229
238
|
@override
|
|
230
|
-
async def on_run_end(
|
|
239
|
+
async def on_run_end(
|
|
240
|
+
self,
|
|
241
|
+
kwargs: Dict[str, Any],
|
|
242
|
+
old_items: List[Dict[str, Any]],
|
|
243
|
+
new_items: List[Dict[str, Any]],
|
|
244
|
+
) -> None:
|
|
231
245
|
"""Finalize run tracking by updating metadata with completion status, usage, and new items."""
|
|
232
246
|
if not self.trajectory_id:
|
|
233
247
|
return
|
|
234
|
-
|
|
248
|
+
|
|
235
249
|
# Update metadata with completion status, total usage, and new items
|
|
236
250
|
trajectory_path = self.trajectory_dir / self.trajectory_id
|
|
237
251
|
metadata_path = trajectory_path / "metadata.json"
|
|
238
|
-
|
|
252
|
+
|
|
239
253
|
# Read existing metadata
|
|
240
254
|
if metadata_path.exists():
|
|
241
255
|
with open(metadata_path, "r") as f:
|
|
242
256
|
metadata = json.load(f)
|
|
243
257
|
else:
|
|
244
258
|
metadata = {}
|
|
245
|
-
|
|
259
|
+
|
|
246
260
|
# Update metadata with completion info
|
|
247
261
|
# Optionally extract screenshots from new_items before persisting
|
|
248
262
|
new_items_to_save = new_items
|
|
@@ -251,32 +265,34 @@ class TrajectorySaverCallback(AsyncCallbackHandler):
|
|
|
251
265
|
except Exception:
|
|
252
266
|
pass
|
|
253
267
|
|
|
254
|
-
metadata.update(
|
|
255
|
-
|
|
256
|
-
|
|
257
|
-
|
|
258
|
-
|
|
259
|
-
|
|
260
|
-
|
|
261
|
-
|
|
268
|
+
metadata.update(
|
|
269
|
+
{
|
|
270
|
+
"status": "completed",
|
|
271
|
+
"completed_at": str(uuid.uuid1().time),
|
|
272
|
+
"total_usage": self.total_usage,
|
|
273
|
+
"new_items": new_items_to_save,
|
|
274
|
+
"total_turns": self.current_turn,
|
|
275
|
+
}
|
|
276
|
+
)
|
|
277
|
+
|
|
262
278
|
# Save updated metadata
|
|
263
279
|
with open(metadata_path, "w") as f:
|
|
264
280
|
json.dump(metadata, f, indent=2)
|
|
265
|
-
|
|
266
|
-
@override
|
|
281
|
+
|
|
282
|
+
@override
|
|
267
283
|
async def on_api_start(self, kwargs: Dict[str, Any]) -> None:
|
|
268
284
|
if not self.trajectory_id:
|
|
269
285
|
return
|
|
270
|
-
|
|
271
|
-
self._save_artifact("api_start", {
|
|
272
|
-
|
|
286
|
+
|
|
287
|
+
self._save_artifact("api_start", {"kwargs": kwargs})
|
|
288
|
+
|
|
273
289
|
@override
|
|
274
290
|
async def on_api_end(self, kwargs: Dict[str, Any], result: Any) -> None:
|
|
275
291
|
"""Save API call result."""
|
|
276
292
|
if not self.trajectory_id:
|
|
277
293
|
return
|
|
278
|
-
|
|
279
|
-
self._save_artifact("api_result", {
|
|
294
|
+
|
|
295
|
+
self._save_artifact("api_result", {"kwargs": kwargs, "result": result})
|
|
280
296
|
|
|
281
297
|
@override
|
|
282
298
|
async def on_screenshot(self, screenshot: Union[str, bytes], name: str = "screenshot") -> None:
|
|
@@ -295,77 +311,83 @@ class TrajectorySaverCallback(AsyncCallbackHandler):
|
|
|
295
311
|
"""Save responses to the current turn directory and update usage statistics."""
|
|
296
312
|
if not self.trajectory_id:
|
|
297
313
|
return
|
|
298
|
-
|
|
314
|
+
|
|
299
315
|
# Save responses
|
|
300
316
|
turn_dir = self._get_turn_dir()
|
|
301
317
|
response_data = {
|
|
302
318
|
"timestamp": str(uuid.uuid1().time),
|
|
303
319
|
"model": self.model,
|
|
304
320
|
"kwargs": kwargs,
|
|
305
|
-
"response": responses
|
|
321
|
+
"response": responses,
|
|
306
322
|
}
|
|
307
|
-
|
|
323
|
+
|
|
308
324
|
self._save_artifact("agent_response", response_data)
|
|
309
|
-
|
|
325
|
+
|
|
310
326
|
# Increment turn counter
|
|
311
327
|
self.current_turn += 1
|
|
312
328
|
|
|
313
329
|
def _draw_crosshair_on_image(self, image_bytes: bytes, x: int, y: int) -> bytes:
|
|
314
330
|
"""
|
|
315
331
|
Draw a red dot and crosshair at the specified coordinates on the image.
|
|
316
|
-
|
|
332
|
+
|
|
317
333
|
Args:
|
|
318
334
|
image_bytes: The original image as bytes
|
|
319
335
|
x: X coordinate for the crosshair
|
|
320
336
|
y: Y coordinate for the crosshair
|
|
321
|
-
|
|
337
|
+
|
|
322
338
|
Returns:
|
|
323
339
|
Modified image as bytes with red dot and crosshair
|
|
324
340
|
"""
|
|
325
341
|
# Open the image
|
|
326
342
|
image = Image.open(io.BytesIO(image_bytes))
|
|
327
343
|
draw = ImageDraw.Draw(image)
|
|
328
|
-
|
|
344
|
+
|
|
329
345
|
# Draw crosshair lines (red, 2px thick)
|
|
330
346
|
crosshair_size = 20
|
|
331
347
|
line_width = 2
|
|
332
348
|
color = "red"
|
|
333
|
-
|
|
349
|
+
|
|
334
350
|
# Horizontal line
|
|
335
351
|
draw.line([(x - crosshair_size, y), (x + crosshair_size, y)], fill=color, width=line_width)
|
|
336
352
|
# Vertical line
|
|
337
353
|
draw.line([(x, y - crosshair_size), (x, y + crosshair_size)], fill=color, width=line_width)
|
|
338
|
-
|
|
354
|
+
|
|
339
355
|
# Draw center dot (filled circle)
|
|
340
356
|
dot_radius = 3
|
|
341
|
-
draw.ellipse(
|
|
342
|
-
|
|
357
|
+
draw.ellipse(
|
|
358
|
+
[(x - dot_radius, y - dot_radius), (x + dot_radius, y + dot_radius)], fill=color
|
|
359
|
+
)
|
|
360
|
+
|
|
343
361
|
# Convert back to bytes
|
|
344
362
|
output = io.BytesIO()
|
|
345
|
-
image.save(output, format='PNG')
|
|
363
|
+
image.save(output, format="PNG")
|
|
346
364
|
return output.getvalue()
|
|
347
365
|
|
|
348
366
|
@override
|
|
349
|
-
async def on_computer_call_end(
|
|
367
|
+
async def on_computer_call_end(
|
|
368
|
+
self, item: Dict[str, Any], result: List[Dict[str, Any]]
|
|
369
|
+
) -> None:
|
|
350
370
|
"""
|
|
351
371
|
Called when a computer call has completed.
|
|
352
372
|
Saves screenshots and computer call output.
|
|
353
373
|
"""
|
|
354
374
|
if not self.trajectory_id:
|
|
355
375
|
return
|
|
356
|
-
|
|
357
|
-
self._save_artifact("computer_call_result", {
|
|
358
|
-
|
|
376
|
+
|
|
377
|
+
self._save_artifact("computer_call_result", {"item": item, "result": result})
|
|
378
|
+
|
|
359
379
|
# Check if action has x/y coordinates and there's a screenshot in the result
|
|
360
380
|
action = item.get("action", {})
|
|
361
381
|
if "x" in action and "y" in action:
|
|
362
382
|
# Look for screenshot in the result
|
|
363
383
|
for result_item in result:
|
|
364
|
-
if (
|
|
365
|
-
result_item.get("
|
|
366
|
-
|
|
384
|
+
if (
|
|
385
|
+
result_item.get("type") == "computer_call_output"
|
|
386
|
+
and result_item.get("output", {}).get("type") == "input_image"
|
|
387
|
+
):
|
|
388
|
+
|
|
367
389
|
image_url = result_item["output"]["image_url"]
|
|
368
|
-
|
|
390
|
+
|
|
369
391
|
# Extract base64 image data
|
|
370
392
|
if image_url.startswith("data:image/"):
|
|
371
393
|
# Format: data:image/png;base64,<base64_data>
|
|
@@ -373,26 +395,24 @@ class TrajectorySaverCallback(AsyncCallbackHandler):
|
|
|
373
395
|
else:
|
|
374
396
|
# Assume it's just base64 data
|
|
375
397
|
base64_data = image_url
|
|
376
|
-
|
|
398
|
+
|
|
377
399
|
try:
|
|
378
400
|
# Decode the image
|
|
379
401
|
image_bytes = base64.b64decode(base64_data)
|
|
380
|
-
|
|
402
|
+
|
|
381
403
|
# Draw crosshair at the action coordinates
|
|
382
404
|
annotated_image = self._draw_crosshair_on_image(
|
|
383
|
-
image_bytes,
|
|
384
|
-
int(action["x"]),
|
|
385
|
-
int(action["y"])
|
|
405
|
+
image_bytes, int(action["x"]), int(action["y"])
|
|
386
406
|
)
|
|
387
|
-
|
|
407
|
+
|
|
388
408
|
# Save as screenshot_action
|
|
389
409
|
self._save_artifact("screenshot_action", annotated_image)
|
|
390
|
-
|
|
410
|
+
|
|
391
411
|
except Exception as e:
|
|
392
412
|
# If annotation fails, just log and continue
|
|
393
413
|
print(f"Failed to annotate screenshot: {e}")
|
|
394
|
-
|
|
414
|
+
|
|
395
415
|
break # Only process the first screenshot found
|
|
396
416
|
|
|
397
417
|
# Increment turn counter
|
|
398
|
-
self.current_turn += 1
|
|
418
|
+
self.current_turn += 1
|