cua-agent 0.4.22__py3-none-any.whl → 0.7.16__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Potentially problematic release.
This version of cua-agent might be problematic. Click here for more details.
- agent/__init__.py +4 -10
- agent/__main__.py +2 -1
- agent/adapters/__init__.py +4 -0
- agent/adapters/azure_ml_adapter.py +283 -0
- agent/adapters/cua_adapter.py +161 -0
- agent/adapters/huggingfacelocal_adapter.py +67 -125
- agent/adapters/human_adapter.py +116 -114
- agent/adapters/mlxvlm_adapter.py +110 -99
- agent/adapters/models/__init__.py +41 -0
- agent/adapters/models/generic.py +78 -0
- agent/adapters/models/internvl.py +290 -0
- agent/adapters/models/opencua.py +115 -0
- agent/adapters/models/qwen2_5_vl.py +78 -0
- agent/agent.py +337 -185
- agent/callbacks/__init__.py +9 -4
- agent/callbacks/base.py +45 -31
- agent/callbacks/budget_manager.py +22 -10
- agent/callbacks/image_retention.py +54 -98
- agent/callbacks/logging.py +55 -42
- agent/callbacks/operator_validator.py +35 -33
- agent/callbacks/otel.py +291 -0
- agent/callbacks/pii_anonymization.py +19 -16
- agent/callbacks/prompt_instructions.py +47 -0
- agent/callbacks/telemetry.py +99 -61
- agent/callbacks/trajectory_saver.py +95 -69
- agent/cli.py +269 -119
- agent/computers/__init__.py +14 -9
- agent/computers/base.py +32 -19
- agent/computers/cua.py +52 -25
- agent/computers/custom.py +78 -71
- agent/decorators.py +23 -14
- agent/human_tool/__init__.py +2 -7
- agent/human_tool/__main__.py +6 -2
- agent/human_tool/server.py +48 -37
- agent/human_tool/ui.py +359 -235
- agent/integrations/hud/__init__.py +38 -99
- agent/integrations/hud/agent.py +369 -0
- agent/integrations/hud/proxy.py +166 -52
- agent/loops/__init__.py +44 -14
- agent/loops/anthropic.py +579 -492
- agent/loops/base.py +19 -15
- agent/loops/composed_grounded.py +136 -150
- agent/loops/fara/__init__.py +8 -0
- agent/loops/fara/config.py +506 -0
- agent/loops/fara/helpers.py +357 -0
- agent/loops/fara/schema.py +143 -0
- agent/loops/gelato.py +183 -0
- agent/loops/gemini.py +935 -0
- agent/loops/generic_vlm.py +601 -0
- agent/loops/glm45v.py +140 -135
- agent/loops/gta1.py +48 -51
- agent/loops/holo.py +218 -0
- agent/loops/internvl.py +180 -0
- agent/loops/moondream3.py +493 -0
- agent/loops/omniparser.py +326 -226
- agent/loops/openai.py +50 -51
- agent/loops/opencua.py +134 -0
- agent/loops/uiins.py +175 -0
- agent/loops/uitars.py +247 -206
- agent/loops/uitars2.py +951 -0
- agent/playground/__init__.py +5 -0
- agent/playground/server.py +301 -0
- agent/proxy/examples.py +61 -57
- agent/proxy/handlers.py +46 -39
- agent/responses.py +447 -347
- agent/tools/__init__.py +24 -0
- agent/tools/base.py +253 -0
- agent/tools/browser_tool.py +423 -0
- agent/types.py +11 -5
- agent/ui/__init__.py +1 -1
- agent/ui/__main__.py +1 -1
- agent/ui/gradio/app.py +25 -22
- agent/ui/gradio/ui_components.py +314 -167
- cua_agent-0.7.16.dist-info/METADATA +85 -0
- cua_agent-0.7.16.dist-info/RECORD +79 -0
- {cua_agent-0.4.22.dist-info → cua_agent-0.7.16.dist-info}/WHEEL +1 -1
- cua_agent-0.4.22.dist-info/METADATA +0 -436
- cua_agent-0.4.22.dist-info/RECORD +0 -51
- {cua_agent-0.4.22.dist-info → cua_agent-0.7.16.dist-info}/entry_points.txt +0 -0
|
@@ -2,26 +2,28 @@
|
|
|
2
2
|
Trajectory saving callback handler for ComputerAgent.
|
|
3
3
|
"""
|
|
4
4
|
|
|
5
|
-
import os
|
|
5
|
+
import base64
|
|
6
|
+
import io
|
|
6
7
|
import json
|
|
8
|
+
import os
|
|
7
9
|
import uuid
|
|
10
|
+
from copy import deepcopy
|
|
8
11
|
from datetime import datetime
|
|
9
|
-
import base64
|
|
10
12
|
from pathlib import Path
|
|
11
|
-
from typing import
|
|
13
|
+
from typing import Any, Dict, List, Optional, Union, override
|
|
14
|
+
|
|
12
15
|
from PIL import Image, ImageDraw
|
|
13
|
-
import io
|
|
14
|
-
from copy import deepcopy
|
|
15
16
|
|
|
16
17
|
from .base import AsyncCallbackHandler
|
|
17
18
|
|
|
19
|
+
|
|
18
20
|
def sanitize_image_urls(data: Any) -> Any:
|
|
19
21
|
"""
|
|
20
22
|
Recursively search for 'image_url' keys and set their values to '[omitted]'.
|
|
21
|
-
|
|
23
|
+
|
|
22
24
|
Args:
|
|
23
25
|
data: Any data structure (dict, list, or primitive type)
|
|
24
|
-
|
|
26
|
+
|
|
25
27
|
Returns:
|
|
26
28
|
A deep copy of the data with all 'image_url' values replaced with '[omitted]'
|
|
27
29
|
"""
|
|
@@ -35,17 +37,19 @@ def sanitize_image_urls(data: Any) -> Any:
|
|
|
35
37
|
# Recursively sanitize the value
|
|
36
38
|
sanitized[key] = sanitize_image_urls(value)
|
|
37
39
|
return sanitized
|
|
38
|
-
|
|
40
|
+
|
|
39
41
|
elif isinstance(data, list):
|
|
40
42
|
# Recursively sanitize each item in the list
|
|
41
43
|
return [sanitize_image_urls(item) for item in data]
|
|
42
|
-
|
|
44
|
+
|
|
43
45
|
else:
|
|
44
46
|
# For primitive types (str, int, bool, None, etc.), return as-is
|
|
45
47
|
return data
|
|
46
48
|
|
|
47
49
|
|
|
48
|
-
def extract_computer_call_outputs(
|
|
50
|
+
def extract_computer_call_outputs(
|
|
51
|
+
items: List[Dict[str, Any]], screenshot_dir: Optional[Path]
|
|
52
|
+
) -> List[Dict[str, Any]]:
|
|
49
53
|
"""
|
|
50
54
|
Save any base64-encoded screenshots from computer_call_output entries to files and
|
|
51
55
|
replace their image_url with the saved file path when a call_id is present.
|
|
@@ -103,18 +107,21 @@ def extract_computer_call_outputs(items: List[Dict[str, Any]], screenshot_dir: O
|
|
|
103
107
|
updated.append(msg)
|
|
104
108
|
return updated
|
|
105
109
|
|
|
110
|
+
|
|
106
111
|
class TrajectorySaverCallback(AsyncCallbackHandler):
|
|
107
112
|
"""
|
|
108
113
|
Callback handler that saves agent trajectories to disk.
|
|
109
|
-
|
|
114
|
+
|
|
110
115
|
Saves each run as a separate trajectory with unique ID, and each turn
|
|
111
116
|
within the trajectory gets its own folder with screenshots and responses.
|
|
112
117
|
"""
|
|
113
|
-
|
|
114
|
-
def __init__(
|
|
118
|
+
|
|
119
|
+
def __init__(
|
|
120
|
+
self, trajectory_dir: str, reset_on_run: bool = True, screenshot_dir: Optional[str] = None
|
|
121
|
+
):
|
|
115
122
|
"""
|
|
116
123
|
Initialize trajectory saver.
|
|
117
|
-
|
|
124
|
+
|
|
118
125
|
Args:
|
|
119
126
|
trajectory_dir: Base directory to save trajectories
|
|
120
127
|
reset_on_run: If True, reset trajectory_id/turn/artifact on each run.
|
|
@@ -129,15 +136,19 @@ class TrajectorySaverCallback(AsyncCallbackHandler):
|
|
|
129
136
|
self.reset_on_run = reset_on_run
|
|
130
137
|
# Optional directory to store extracted screenshots from metadata/new_items
|
|
131
138
|
self.screenshot_dir: Optional[Path] = Path(screenshot_dir) if screenshot_dir else None
|
|
132
|
-
|
|
139
|
+
|
|
133
140
|
# Ensure trajectory directory exists
|
|
134
141
|
self.trajectory_dir.mkdir(parents=True, exist_ok=True)
|
|
135
142
|
|
|
143
|
+
# Ensure screenshot directory exists if specified
|
|
144
|
+
if self.screenshot_dir:
|
|
145
|
+
self.screenshot_dir.mkdir(parents=True, exist_ok=True)
|
|
146
|
+
|
|
136
147
|
def _get_turn_dir(self) -> Path:
|
|
137
148
|
"""Get the directory for the current turn."""
|
|
138
149
|
if not self.trajectory_id:
|
|
139
150
|
raise ValueError("Trajectory not initialized - call _on_run_start first")
|
|
140
|
-
|
|
151
|
+
|
|
141
152
|
# format: trajectory_id/turn_000
|
|
142
153
|
turn_dir = self.trajectory_dir / self.trajectory_id / f"turn_{self.current_turn:03d}"
|
|
143
154
|
turn_dir.mkdir(parents=True, exist_ok=True)
|
|
@@ -166,6 +177,7 @@ class TrajectorySaverCallback(AsyncCallbackHandler):
|
|
|
166
177
|
|
|
167
178
|
def _update_usage(self, usage: Dict[str, Any]) -> None:
|
|
168
179
|
"""Update total usage statistics."""
|
|
180
|
+
|
|
169
181
|
def add_dicts(target: Dict[str, Any], source: Dict[str, Any]) -> None:
|
|
170
182
|
for key, value in source.items():
|
|
171
183
|
if isinstance(value, dict):
|
|
@@ -176,18 +188,21 @@ class TrajectorySaverCallback(AsyncCallbackHandler):
|
|
|
176
188
|
if key not in target:
|
|
177
189
|
target[key] = 0
|
|
178
190
|
target[key] += value
|
|
191
|
+
|
|
179
192
|
add_dicts(self.total_usage, usage)
|
|
180
|
-
|
|
193
|
+
|
|
181
194
|
@override
|
|
182
195
|
async def on_run_start(self, kwargs: Dict[str, Any], old_items: List[Dict[str, Any]]) -> None:
|
|
183
196
|
"""Initialize trajectory tracking for a new run."""
|
|
184
197
|
model = kwargs.get("model", "unknown")
|
|
185
|
-
|
|
198
|
+
|
|
186
199
|
# Only reset trajectory state if reset_on_run is True or no trajectory exists
|
|
187
200
|
if self.reset_on_run or not self.trajectory_id:
|
|
188
201
|
model_name_short = model.split("+")[-1].split("/")[-1].lower()[:16]
|
|
189
202
|
if "+" in model:
|
|
190
203
|
model_name_short = model.split("+")[0].lower()[:4] + "_" + model_name_short
|
|
204
|
+
# strip non-alphanumeric characters from model_name_short
|
|
205
|
+
model_name_short = "".join(c for c in model_name_short if c.isalnum() or c == "_")
|
|
191
206
|
|
|
192
207
|
# id format: yyyy-mm-dd_model_hhmmss_uuid[:4]
|
|
193
208
|
now = datetime.now()
|
|
@@ -196,11 +211,11 @@ class TrajectorySaverCallback(AsyncCallbackHandler):
|
|
|
196
211
|
self.current_artifact = 0
|
|
197
212
|
self.model = model
|
|
198
213
|
self.total_usage = {}
|
|
199
|
-
|
|
214
|
+
|
|
200
215
|
# Create trajectory directory
|
|
201
216
|
trajectory_path = self.trajectory_dir / self.trajectory_id
|
|
202
217
|
trajectory_path.mkdir(parents=True, exist_ok=True)
|
|
203
|
-
|
|
218
|
+
|
|
204
219
|
# Save trajectory metadata (optionally extract screenshots to screenshot_dir)
|
|
205
220
|
kwargs_to_save = kwargs.copy()
|
|
206
221
|
try:
|
|
@@ -217,7 +232,7 @@ class TrajectorySaverCallback(AsyncCallbackHandler):
|
|
|
217
232
|
"status": "running",
|
|
218
233
|
"kwargs": kwargs_to_save,
|
|
219
234
|
}
|
|
220
|
-
|
|
235
|
+
|
|
221
236
|
with open(trajectory_path / "metadata.json", "w") as f:
|
|
222
237
|
json.dump(metadata, f, indent=2)
|
|
223
238
|
else:
|
|
@@ -225,22 +240,27 @@ class TrajectorySaverCallback(AsyncCallbackHandler):
|
|
|
225
240
|
self.model = model
|
|
226
241
|
|
|
227
242
|
@override
|
|
228
|
-
async def on_run_end(
|
|
243
|
+
async def on_run_end(
|
|
244
|
+
self,
|
|
245
|
+
kwargs: Dict[str, Any],
|
|
246
|
+
old_items: List[Dict[str, Any]],
|
|
247
|
+
new_items: List[Dict[str, Any]],
|
|
248
|
+
) -> None:
|
|
229
249
|
"""Finalize run tracking by updating metadata with completion status, usage, and new items."""
|
|
230
250
|
if not self.trajectory_id:
|
|
231
251
|
return
|
|
232
|
-
|
|
252
|
+
|
|
233
253
|
# Update metadata with completion status, total usage, and new items
|
|
234
254
|
trajectory_path = self.trajectory_dir / self.trajectory_id
|
|
235
255
|
metadata_path = trajectory_path / "metadata.json"
|
|
236
|
-
|
|
256
|
+
|
|
237
257
|
# Read existing metadata
|
|
238
258
|
if metadata_path.exists():
|
|
239
259
|
with open(metadata_path, "r") as f:
|
|
240
260
|
metadata = json.load(f)
|
|
241
261
|
else:
|
|
242
262
|
metadata = {}
|
|
243
|
-
|
|
263
|
+
|
|
244
264
|
# Update metadata with completion info
|
|
245
265
|
# Optionally extract screenshots from new_items before persisting
|
|
246
266
|
new_items_to_save = new_items
|
|
@@ -249,32 +269,34 @@ class TrajectorySaverCallback(AsyncCallbackHandler):
|
|
|
249
269
|
except Exception:
|
|
250
270
|
pass
|
|
251
271
|
|
|
252
|
-
metadata.update(
|
|
253
|
-
|
|
254
|
-
|
|
255
|
-
|
|
256
|
-
|
|
257
|
-
|
|
258
|
-
|
|
259
|
-
|
|
272
|
+
metadata.update(
|
|
273
|
+
{
|
|
274
|
+
"status": "completed",
|
|
275
|
+
"completed_at": str(uuid.uuid1().time),
|
|
276
|
+
"total_usage": self.total_usage,
|
|
277
|
+
"new_items": new_items_to_save,
|
|
278
|
+
"total_turns": self.current_turn,
|
|
279
|
+
}
|
|
280
|
+
)
|
|
281
|
+
|
|
260
282
|
# Save updated metadata
|
|
261
283
|
with open(metadata_path, "w") as f:
|
|
262
284
|
json.dump(metadata, f, indent=2)
|
|
263
|
-
|
|
264
|
-
@override
|
|
285
|
+
|
|
286
|
+
@override
|
|
265
287
|
async def on_api_start(self, kwargs: Dict[str, Any]) -> None:
|
|
266
288
|
if not self.trajectory_id:
|
|
267
289
|
return
|
|
268
|
-
|
|
269
|
-
self._save_artifact("api_start", {
|
|
270
|
-
|
|
290
|
+
|
|
291
|
+
self._save_artifact("api_start", {"kwargs": kwargs})
|
|
292
|
+
|
|
271
293
|
@override
|
|
272
294
|
async def on_api_end(self, kwargs: Dict[str, Any], result: Any) -> None:
|
|
273
295
|
"""Save API call result."""
|
|
274
296
|
if not self.trajectory_id:
|
|
275
297
|
return
|
|
276
|
-
|
|
277
|
-
self._save_artifact("api_result", {
|
|
298
|
+
|
|
299
|
+
self._save_artifact("api_result", {"kwargs": kwargs, "result": result})
|
|
278
300
|
|
|
279
301
|
@override
|
|
280
302
|
async def on_screenshot(self, screenshot: Union[str, bytes], name: str = "screenshot") -> None:
|
|
@@ -293,77 +315,83 @@ class TrajectorySaverCallback(AsyncCallbackHandler):
|
|
|
293
315
|
"""Save responses to the current turn directory and update usage statistics."""
|
|
294
316
|
if not self.trajectory_id:
|
|
295
317
|
return
|
|
296
|
-
|
|
318
|
+
|
|
297
319
|
# Save responses
|
|
298
320
|
turn_dir = self._get_turn_dir()
|
|
299
321
|
response_data = {
|
|
300
322
|
"timestamp": str(uuid.uuid1().time),
|
|
301
323
|
"model": self.model,
|
|
302
324
|
"kwargs": kwargs,
|
|
303
|
-
"response": responses
|
|
325
|
+
"response": responses,
|
|
304
326
|
}
|
|
305
|
-
|
|
327
|
+
|
|
306
328
|
self._save_artifact("agent_response", response_data)
|
|
307
|
-
|
|
329
|
+
|
|
308
330
|
# Increment turn counter
|
|
309
331
|
self.current_turn += 1
|
|
310
332
|
|
|
311
333
|
def _draw_crosshair_on_image(self, image_bytes: bytes, x: int, y: int) -> bytes:
|
|
312
334
|
"""
|
|
313
335
|
Draw a red dot and crosshair at the specified coordinates on the image.
|
|
314
|
-
|
|
336
|
+
|
|
315
337
|
Args:
|
|
316
338
|
image_bytes: The original image as bytes
|
|
317
339
|
x: X coordinate for the crosshair
|
|
318
340
|
y: Y coordinate for the crosshair
|
|
319
|
-
|
|
341
|
+
|
|
320
342
|
Returns:
|
|
321
343
|
Modified image as bytes with red dot and crosshair
|
|
322
344
|
"""
|
|
323
345
|
# Open the image
|
|
324
346
|
image = Image.open(io.BytesIO(image_bytes))
|
|
325
347
|
draw = ImageDraw.Draw(image)
|
|
326
|
-
|
|
348
|
+
|
|
327
349
|
# Draw crosshair lines (red, 2px thick)
|
|
328
350
|
crosshair_size = 20
|
|
329
351
|
line_width = 2
|
|
330
352
|
color = "red"
|
|
331
|
-
|
|
353
|
+
|
|
332
354
|
# Horizontal line
|
|
333
355
|
draw.line([(x - crosshair_size, y), (x + crosshair_size, y)], fill=color, width=line_width)
|
|
334
356
|
# Vertical line
|
|
335
357
|
draw.line([(x, y - crosshair_size), (x, y + crosshair_size)], fill=color, width=line_width)
|
|
336
|
-
|
|
358
|
+
|
|
337
359
|
# Draw center dot (filled circle)
|
|
338
360
|
dot_radius = 3
|
|
339
|
-
draw.ellipse(
|
|
340
|
-
|
|
361
|
+
draw.ellipse(
|
|
362
|
+
[(x - dot_radius, y - dot_radius), (x + dot_radius, y + dot_radius)], fill=color
|
|
363
|
+
)
|
|
364
|
+
|
|
341
365
|
# Convert back to bytes
|
|
342
366
|
output = io.BytesIO()
|
|
343
|
-
image.save(output, format='PNG')
|
|
367
|
+
image.save(output, format="PNG")
|
|
344
368
|
return output.getvalue()
|
|
345
369
|
|
|
346
370
|
@override
|
|
347
|
-
async def on_computer_call_end(
|
|
371
|
+
async def on_computer_call_end(
|
|
372
|
+
self, item: Dict[str, Any], result: List[Dict[str, Any]]
|
|
373
|
+
) -> None:
|
|
348
374
|
"""
|
|
349
375
|
Called when a computer call has completed.
|
|
350
376
|
Saves screenshots and computer call output.
|
|
351
377
|
"""
|
|
352
378
|
if not self.trajectory_id:
|
|
353
379
|
return
|
|
354
|
-
|
|
355
|
-
self._save_artifact("computer_call_result", {
|
|
356
|
-
|
|
380
|
+
|
|
381
|
+
self._save_artifact("computer_call_result", {"item": item, "result": result})
|
|
382
|
+
|
|
357
383
|
# Check if action has x/y coordinates and there's a screenshot in the result
|
|
358
384
|
action = item.get("action", {})
|
|
359
385
|
if "x" in action and "y" in action:
|
|
360
386
|
# Look for screenshot in the result
|
|
361
387
|
for result_item in result:
|
|
362
|
-
if (
|
|
363
|
-
result_item.get("
|
|
364
|
-
|
|
388
|
+
if (
|
|
389
|
+
result_item.get("type") == "computer_call_output"
|
|
390
|
+
and result_item.get("output", {}).get("type") == "input_image"
|
|
391
|
+
):
|
|
392
|
+
|
|
365
393
|
image_url = result_item["output"]["image_url"]
|
|
366
|
-
|
|
394
|
+
|
|
367
395
|
# Extract base64 image data
|
|
368
396
|
if image_url.startswith("data:image/"):
|
|
369
397
|
# Format: data:image/png;base64,<base64_data>
|
|
@@ -371,26 +399,24 @@ class TrajectorySaverCallback(AsyncCallbackHandler):
|
|
|
371
399
|
else:
|
|
372
400
|
# Assume it's just base64 data
|
|
373
401
|
base64_data = image_url
|
|
374
|
-
|
|
402
|
+
|
|
375
403
|
try:
|
|
376
404
|
# Decode the image
|
|
377
405
|
image_bytes = base64.b64decode(base64_data)
|
|
378
|
-
|
|
406
|
+
|
|
379
407
|
# Draw crosshair at the action coordinates
|
|
380
408
|
annotated_image = self._draw_crosshair_on_image(
|
|
381
|
-
image_bytes,
|
|
382
|
-
int(action["x"]),
|
|
383
|
-
int(action["y"])
|
|
409
|
+
image_bytes, int(action["x"]), int(action["y"])
|
|
384
410
|
)
|
|
385
|
-
|
|
411
|
+
|
|
386
412
|
# Save as screenshot_action
|
|
387
413
|
self._save_artifact("screenshot_action", annotated_image)
|
|
388
|
-
|
|
414
|
+
|
|
389
415
|
except Exception as e:
|
|
390
416
|
# If annotation fails, just log and continue
|
|
391
417
|
print(f"Failed to annotate screenshot: {e}")
|
|
392
|
-
|
|
418
|
+
|
|
393
419
|
break # Only process the first screenshot found
|
|
394
420
|
|
|
395
421
|
# Increment turn counter
|
|
396
|
-
self.current_turn += 1
|
|
422
|
+
self.current_turn += 1
|