cua-agent 0.4.14__py3-none-any.whl → 0.7.16__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Potentially problematic release.
This version of cua-agent might be problematic. Click here for more details.
- agent/__init__.py +4 -19
- agent/__main__.py +2 -1
- agent/adapters/__init__.py +6 -0
- agent/adapters/azure_ml_adapter.py +283 -0
- agent/adapters/cua_adapter.py +161 -0
- agent/adapters/huggingfacelocal_adapter.py +67 -125
- agent/adapters/human_adapter.py +116 -114
- agent/adapters/mlxvlm_adapter.py +370 -0
- agent/adapters/models/__init__.py +41 -0
- agent/adapters/models/generic.py +78 -0
- agent/adapters/models/internvl.py +290 -0
- agent/adapters/models/opencua.py +115 -0
- agent/adapters/models/qwen2_5_vl.py +78 -0
- agent/agent.py +431 -241
- agent/callbacks/__init__.py +10 -3
- agent/callbacks/base.py +45 -31
- agent/callbacks/budget_manager.py +22 -10
- agent/callbacks/image_retention.py +54 -98
- agent/callbacks/logging.py +55 -42
- agent/callbacks/operator_validator.py +140 -0
- agent/callbacks/otel.py +291 -0
- agent/callbacks/pii_anonymization.py +19 -16
- agent/callbacks/prompt_instructions.py +47 -0
- agent/callbacks/telemetry.py +106 -69
- agent/callbacks/trajectory_saver.py +178 -70
- agent/cli.py +269 -119
- agent/computers/__init__.py +14 -9
- agent/computers/base.py +32 -19
- agent/computers/cua.py +52 -25
- agent/computers/custom.py +78 -71
- agent/decorators.py +23 -14
- agent/human_tool/__init__.py +2 -7
- agent/human_tool/__main__.py +6 -2
- agent/human_tool/server.py +48 -37
- agent/human_tool/ui.py +359 -235
- agent/integrations/hud/__init__.py +164 -74
- agent/integrations/hud/agent.py +338 -342
- agent/integrations/hud/proxy.py +297 -0
- agent/loops/__init__.py +44 -14
- agent/loops/anthropic.py +590 -492
- agent/loops/base.py +19 -15
- agent/loops/composed_grounded.py +142 -144
- agent/loops/fara/__init__.py +8 -0
- agent/loops/fara/config.py +506 -0
- agent/loops/fara/helpers.py +357 -0
- agent/loops/fara/schema.py +143 -0
- agent/loops/gelato.py +183 -0
- agent/loops/gemini.py +935 -0
- agent/loops/generic_vlm.py +601 -0
- agent/loops/glm45v.py +140 -135
- agent/loops/gta1.py +48 -51
- agent/loops/holo.py +218 -0
- agent/loops/internvl.py +180 -0
- agent/loops/moondream3.py +493 -0
- agent/loops/omniparser.py +326 -226
- agent/loops/openai.py +63 -56
- agent/loops/opencua.py +134 -0
- agent/loops/uiins.py +175 -0
- agent/loops/uitars.py +262 -212
- agent/loops/uitars2.py +951 -0
- agent/playground/__init__.py +5 -0
- agent/playground/server.py +301 -0
- agent/proxy/examples.py +196 -0
- agent/proxy/handlers.py +255 -0
- agent/responses.py +486 -339
- agent/tools/__init__.py +24 -0
- agent/tools/base.py +253 -0
- agent/tools/browser_tool.py +423 -0
- agent/types.py +20 -5
- agent/ui/__init__.py +1 -1
- agent/ui/__main__.py +1 -1
- agent/ui/gradio/app.py +25 -22
- agent/ui/gradio/ui_components.py +314 -167
- cua_agent-0.7.16.dist-info/METADATA +85 -0
- cua_agent-0.7.16.dist-info/RECORD +79 -0
- {cua_agent-0.4.14.dist-info → cua_agent-0.7.16.dist-info}/WHEEL +1 -1
- agent/integrations/hud/adapter.py +0 -121
- agent/integrations/hud/computer_handler.py +0 -187
- agent/telemetry.py +0 -142
- cua_agent-0.4.14.dist-info/METADATA +0 -436
- cua_agent-0.4.14.dist-info/RECORD +0 -50
- {cua_agent-0.4.14.dist-info → cua_agent-0.7.16.dist-info}/entry_points.txt +0 -0
|
@@ -2,24 +2,28 @@
|
|
|
2
2
|
Trajectory saving callback handler for ComputerAgent.
|
|
3
3
|
"""
|
|
4
4
|
|
|
5
|
-
import
|
|
5
|
+
import base64
|
|
6
|
+
import io
|
|
6
7
|
import json
|
|
8
|
+
import os
|
|
7
9
|
import uuid
|
|
10
|
+
from copy import deepcopy
|
|
8
11
|
from datetime import datetime
|
|
9
|
-
import base64
|
|
10
12
|
from pathlib import Path
|
|
11
|
-
from typing import
|
|
13
|
+
from typing import Any, Dict, List, Optional, Union, override
|
|
14
|
+
|
|
12
15
|
from PIL import Image, ImageDraw
|
|
13
|
-
|
|
16
|
+
|
|
14
17
|
from .base import AsyncCallbackHandler
|
|
15
18
|
|
|
19
|
+
|
|
16
20
|
def sanitize_image_urls(data: Any) -> Any:
|
|
17
21
|
"""
|
|
18
22
|
Recursively search for 'image_url' keys and set their values to '[omitted]'.
|
|
19
|
-
|
|
23
|
+
|
|
20
24
|
Args:
|
|
21
25
|
data: Any data structure (dict, list, or primitive type)
|
|
22
|
-
|
|
26
|
+
|
|
23
27
|
Returns:
|
|
24
28
|
A deep copy of the data with all 'image_url' values replaced with '[omitted]'
|
|
25
29
|
"""
|
|
@@ -33,28 +37,91 @@ def sanitize_image_urls(data: Any) -> Any:
|
|
|
33
37
|
# Recursively sanitize the value
|
|
34
38
|
sanitized[key] = sanitize_image_urls(value)
|
|
35
39
|
return sanitized
|
|
36
|
-
|
|
40
|
+
|
|
37
41
|
elif isinstance(data, list):
|
|
38
42
|
# Recursively sanitize each item in the list
|
|
39
43
|
return [sanitize_image_urls(item) for item in data]
|
|
40
|
-
|
|
44
|
+
|
|
41
45
|
else:
|
|
42
46
|
# For primitive types (str, int, bool, None, etc.), return as-is
|
|
43
47
|
return data
|
|
44
48
|
|
|
45
49
|
|
|
50
|
+
def extract_computer_call_outputs(
    items: List[Dict[str, Any]], screenshot_dir: Optional[Path]
) -> List[Dict[str, Any]]:
    """
    Save base64-encoded screenshots from computer_call_output entries to files and
    replace their image_url with the saved file path when a call_id is present.

    Only operates if screenshot_dir is provided and exists; otherwise returns items unchanged.
    An entry is rewritten only when its screenshot is actually available on disk
    (either written by this call or already present from an earlier one). Entries that
    are malformed, are not base64 data URLs, or fail to decode/write are passed through
    with their original image_url so no image data is lost.

    Args:
        items: List of message/result dicts potentially containing computer_call_output entries
        screenshot_dir: Directory to write screenshots into

    Returns:
        A new list with updated image_url fields when applicable. Input dicts are never
        mutated: every dict entry is shallow-copied, and 'output' is copied before it is changed.
    """
    if not items:
        return items
    if not screenshot_dir or not screenshot_dir.exists():
        return items

    def _safe_component(text: str) -> str:
        # Keep only filename-safe characters so that neither call_id nor the MIME
        # subtype can smuggle a path separator and escape screenshot_dir.
        return "".join(c if c.isalnum() or c in "._-+" else "_" for c in text)

    updated: List[Dict[str, Any]] = []
    for item in items:
        if not isinstance(item, dict):
            # do not block on malformed entries; keep original
            updated.append(item)
            continue

        msg = dict(item)
        updated.append(msg)

        if msg.get("type") != "computer_call_output":
            continue
        call_id = msg.get("call_id")
        output = msg.get("output")
        if not call_id or not isinstance(output, dict):
            continue
        image_url = output.get("image_url")
        if not isinstance(image_url, str) or not image_url.startswith("data:"):
            continue

        # data URL layout: data:[<mime>][;param...];base64,<payload>
        header, comma, payload = image_url[len("data:"):].partition(",")
        if not comma:
            continue  # no payload separator: not a usable data URL
        mime_type, *params = header.split(";")
        if "base64" not in (param.strip().lower() for param in params):
            continue  # URL-encoded payloads are not screenshots we can decode

        # derive extension from MIME type e.g. image/png -> png; default to png
        ext = _safe_component(mime_type.rsplit("/", 1)[-1].strip()) or "png"
        out_path = screenshot_dir / f"{_safe_component(str(call_id))}.{ext}"

        # write file if it doesn't exist
        if not out_path.exists():
            try:
                out_path.write_bytes(base64.b64decode(payload))
            except (ValueError, OSError):
                # binascii.Error is a ValueError; on any decode/write failure keep
                # the original data URL rather than pointing at a missing file
                continue

        # update image_url to file path (on a copy of 'output')
        msg["output"] = {**output, "image_url": str(out_path)}

    return updated
|
|
109
|
+
|
|
110
|
+
|
|
46
111
|
class TrajectorySaverCallback(AsyncCallbackHandler):
|
|
47
112
|
"""
|
|
48
113
|
Callback handler that saves agent trajectories to disk.
|
|
49
|
-
|
|
114
|
+
|
|
50
115
|
Saves each run as a separate trajectory with unique ID, and each turn
|
|
51
116
|
within the trajectory gets its own folder with screenshots and responses.
|
|
52
117
|
"""
|
|
53
|
-
|
|
54
|
-
def __init__(
|
|
118
|
+
|
|
119
|
+
def __init__(
|
|
120
|
+
self, trajectory_dir: str, reset_on_run: bool = True, screenshot_dir: Optional[str] = None
|
|
121
|
+
):
|
|
55
122
|
"""
|
|
56
123
|
Initialize trajectory saver.
|
|
57
|
-
|
|
124
|
+
|
|
58
125
|
Args:
|
|
59
126
|
trajectory_dir: Base directory to save trajectories
|
|
60
127
|
reset_on_run: If True, reset trajectory_id/turn/artifact on each run.
|
|
@@ -67,15 +134,21 @@ class TrajectorySaverCallback(AsyncCallbackHandler):
|
|
|
67
134
|
self.model: Optional[str] = None
|
|
68
135
|
self.total_usage: Dict[str, Any] = {}
|
|
69
136
|
self.reset_on_run = reset_on_run
|
|
70
|
-
|
|
137
|
+
# Optional directory to store extracted screenshots from metadata/new_items
|
|
138
|
+
self.screenshot_dir: Optional[Path] = Path(screenshot_dir) if screenshot_dir else None
|
|
139
|
+
|
|
71
140
|
# Ensure trajectory directory exists
|
|
72
141
|
self.trajectory_dir.mkdir(parents=True, exist_ok=True)
|
|
73
|
-
|
|
142
|
+
|
|
143
|
+
# Ensure screenshot directory exists if specified
|
|
144
|
+
if self.screenshot_dir:
|
|
145
|
+
self.screenshot_dir.mkdir(parents=True, exist_ok=True)
|
|
146
|
+
|
|
74
147
|
def _get_turn_dir(self) -> Path:
|
|
75
148
|
"""Get the directory for the current turn."""
|
|
76
149
|
if not self.trajectory_id:
|
|
77
150
|
raise ValueError("Trajectory not initialized - call _on_run_start first")
|
|
78
|
-
|
|
151
|
+
|
|
79
152
|
# format: trajectory_id/turn_000
|
|
80
153
|
turn_dir = self.trajectory_dir / self.trajectory_id / f"turn_{self.current_turn:03d}"
|
|
81
154
|
turn_dir.mkdir(parents=True, exist_ok=True)
|
|
@@ -94,12 +167,17 @@ class TrajectorySaverCallback(AsyncCallbackHandler):
|
|
|
94
167
|
# format: turn_000/0000_name.json
|
|
95
168
|
artifact_filename = f"{self.current_artifact:04d}_{name}"
|
|
96
169
|
artifact_path = turn_dir / f"{artifact_filename}.json"
|
|
170
|
+
# add created_at
|
|
171
|
+
if isinstance(artifact, dict):
|
|
172
|
+
artifact = artifact.copy()
|
|
173
|
+
artifact["created_at"] = str(uuid.uuid1().time)
|
|
97
174
|
with open(artifact_path, "w") as f:
|
|
98
175
|
json.dump(sanitize_image_urls(artifact), f, indent=2)
|
|
99
176
|
self.current_artifact += 1
|
|
100
177
|
|
|
101
178
|
def _update_usage(self, usage: Dict[str, Any]) -> None:
|
|
102
179
|
"""Update total usage statistics."""
|
|
180
|
+
|
|
103
181
|
def add_dicts(target: Dict[str, Any], source: Dict[str, Any]) -> None:
|
|
104
182
|
for key, value in source.items():
|
|
105
183
|
if isinstance(value, dict):
|
|
@@ -110,18 +188,21 @@ class TrajectorySaverCallback(AsyncCallbackHandler):
|
|
|
110
188
|
if key not in target:
|
|
111
189
|
target[key] = 0
|
|
112
190
|
target[key] += value
|
|
191
|
+
|
|
113
192
|
add_dicts(self.total_usage, usage)
|
|
114
|
-
|
|
193
|
+
|
|
115
194
|
@override
|
|
116
195
|
async def on_run_start(self, kwargs: Dict[str, Any], old_items: List[Dict[str, Any]]) -> None:
|
|
117
196
|
"""Initialize trajectory tracking for a new run."""
|
|
118
197
|
model = kwargs.get("model", "unknown")
|
|
119
|
-
|
|
198
|
+
|
|
120
199
|
# Only reset trajectory state if reset_on_run is True or no trajectory exists
|
|
121
200
|
if self.reset_on_run or not self.trajectory_id:
|
|
122
201
|
model_name_short = model.split("+")[-1].split("/")[-1].lower()[:16]
|
|
123
202
|
if "+" in model:
|
|
124
203
|
model_name_short = model.split("+")[0].lower()[:4] + "_" + model_name_short
|
|
204
|
+
# strip non-alphanumeric characters from model_name_short
|
|
205
|
+
model_name_short = "".join(c for c in model_name_short if c.isalnum() or c == "_")
|
|
125
206
|
|
|
126
207
|
# id format: yyyy-mm-dd_model_hhmmss_uuid[:4]
|
|
127
208
|
now = datetime.now()
|
|
@@ -130,19 +211,28 @@ class TrajectorySaverCallback(AsyncCallbackHandler):
|
|
|
130
211
|
self.current_artifact = 0
|
|
131
212
|
self.model = model
|
|
132
213
|
self.total_usage = {}
|
|
133
|
-
|
|
214
|
+
|
|
134
215
|
# Create trajectory directory
|
|
135
216
|
trajectory_path = self.trajectory_dir / self.trajectory_id
|
|
136
217
|
trajectory_path.mkdir(parents=True, exist_ok=True)
|
|
137
|
-
|
|
138
|
-
# Save trajectory metadata
|
|
218
|
+
|
|
219
|
+
# Save trajectory metadata (optionally extract screenshots to screenshot_dir)
|
|
220
|
+
kwargs_to_save = kwargs.copy()
|
|
221
|
+
try:
|
|
222
|
+
if "messages" in kwargs_to_save:
|
|
223
|
+
kwargs_to_save["messages"] = extract_computer_call_outputs(
|
|
224
|
+
kwargs_to_save["messages"], self.screenshot_dir
|
|
225
|
+
)
|
|
226
|
+
except Exception:
|
|
227
|
+
# If extraction fails, fall back to original messages
|
|
228
|
+
pass
|
|
139
229
|
metadata = {
|
|
140
230
|
"trajectory_id": self.trajectory_id,
|
|
141
231
|
"created_at": str(uuid.uuid1().time),
|
|
142
232
|
"status": "running",
|
|
143
|
-
"kwargs":
|
|
233
|
+
"kwargs": kwargs_to_save,
|
|
144
234
|
}
|
|
145
|
-
|
|
235
|
+
|
|
146
236
|
with open(trajectory_path / "metadata.json", "w") as f:
|
|
147
237
|
json.dump(metadata, f, indent=2)
|
|
148
238
|
else:
|
|
@@ -150,49 +240,63 @@ class TrajectorySaverCallback(AsyncCallbackHandler):
|
|
|
150
240
|
self.model = model
|
|
151
241
|
|
|
152
242
|
@override
|
|
153
|
-
async def on_run_end(
|
|
243
|
+
async def on_run_end(
|
|
244
|
+
self,
|
|
245
|
+
kwargs: Dict[str, Any],
|
|
246
|
+
old_items: List[Dict[str, Any]],
|
|
247
|
+
new_items: List[Dict[str, Any]],
|
|
248
|
+
) -> None:
|
|
154
249
|
"""Finalize run tracking by updating metadata with completion status, usage, and new items."""
|
|
155
250
|
if not self.trajectory_id:
|
|
156
251
|
return
|
|
157
|
-
|
|
252
|
+
|
|
158
253
|
# Update metadata with completion status, total usage, and new items
|
|
159
254
|
trajectory_path = self.trajectory_dir / self.trajectory_id
|
|
160
255
|
metadata_path = trajectory_path / "metadata.json"
|
|
161
|
-
|
|
256
|
+
|
|
162
257
|
# Read existing metadata
|
|
163
258
|
if metadata_path.exists():
|
|
164
259
|
with open(metadata_path, "r") as f:
|
|
165
260
|
metadata = json.load(f)
|
|
166
261
|
else:
|
|
167
262
|
metadata = {}
|
|
168
|
-
|
|
263
|
+
|
|
169
264
|
# Update metadata with completion info
|
|
170
|
-
|
|
171
|
-
|
|
172
|
-
|
|
173
|
-
|
|
174
|
-
|
|
175
|
-
|
|
176
|
-
|
|
177
|
-
|
|
265
|
+
# Optionally extract screenshots from new_items before persisting
|
|
266
|
+
new_items_to_save = new_items
|
|
267
|
+
try:
|
|
268
|
+
new_items_to_save = extract_computer_call_outputs(new_items, self.screenshot_dir)
|
|
269
|
+
except Exception:
|
|
270
|
+
pass
|
|
271
|
+
|
|
272
|
+
metadata.update(
|
|
273
|
+
{
|
|
274
|
+
"status": "completed",
|
|
275
|
+
"completed_at": str(uuid.uuid1().time),
|
|
276
|
+
"total_usage": self.total_usage,
|
|
277
|
+
"new_items": new_items_to_save,
|
|
278
|
+
"total_turns": self.current_turn,
|
|
279
|
+
}
|
|
280
|
+
)
|
|
281
|
+
|
|
178
282
|
# Save updated metadata
|
|
179
283
|
with open(metadata_path, "w") as f:
|
|
180
284
|
json.dump(metadata, f, indent=2)
|
|
181
|
-
|
|
182
|
-
@override
|
|
285
|
+
|
|
286
|
+
@override
|
|
183
287
|
async def on_api_start(self, kwargs: Dict[str, Any]) -> None:
|
|
184
288
|
if not self.trajectory_id:
|
|
185
289
|
return
|
|
186
|
-
|
|
187
|
-
self._save_artifact("api_start", {
|
|
188
|
-
|
|
290
|
+
|
|
291
|
+
self._save_artifact("api_start", {"kwargs": kwargs})
|
|
292
|
+
|
|
189
293
|
@override
|
|
190
294
|
async def on_api_end(self, kwargs: Dict[str, Any], result: Any) -> None:
|
|
191
295
|
"""Save API call result."""
|
|
192
296
|
if not self.trajectory_id:
|
|
193
297
|
return
|
|
194
|
-
|
|
195
|
-
self._save_artifact("api_result", {
|
|
298
|
+
|
|
299
|
+
self._save_artifact("api_result", {"kwargs": kwargs, "result": result})
|
|
196
300
|
|
|
197
301
|
@override
|
|
198
302
|
async def on_screenshot(self, screenshot: Union[str, bytes], name: str = "screenshot") -> None:
|
|
@@ -211,77 +315,83 @@ class TrajectorySaverCallback(AsyncCallbackHandler):
|
|
|
211
315
|
"""Save responses to the current turn directory and update usage statistics."""
|
|
212
316
|
if not self.trajectory_id:
|
|
213
317
|
return
|
|
214
|
-
|
|
318
|
+
|
|
215
319
|
# Save responses
|
|
216
320
|
turn_dir = self._get_turn_dir()
|
|
217
321
|
response_data = {
|
|
218
322
|
"timestamp": str(uuid.uuid1().time),
|
|
219
323
|
"model": self.model,
|
|
220
324
|
"kwargs": kwargs,
|
|
221
|
-
"response": responses
|
|
325
|
+
"response": responses,
|
|
222
326
|
}
|
|
223
|
-
|
|
327
|
+
|
|
224
328
|
self._save_artifact("agent_response", response_data)
|
|
225
|
-
|
|
329
|
+
|
|
226
330
|
# Increment turn counter
|
|
227
331
|
self.current_turn += 1
|
|
228
332
|
|
|
229
333
|
def _draw_crosshair_on_image(self, image_bytes: bytes, x: int, y: int) -> bytes:
|
|
230
334
|
"""
|
|
231
335
|
Draw a red dot and crosshair at the specified coordinates on the image.
|
|
232
|
-
|
|
336
|
+
|
|
233
337
|
Args:
|
|
234
338
|
image_bytes: The original image as bytes
|
|
235
339
|
x: X coordinate for the crosshair
|
|
236
340
|
y: Y coordinate for the crosshair
|
|
237
|
-
|
|
341
|
+
|
|
238
342
|
Returns:
|
|
239
343
|
Modified image as bytes with red dot and crosshair
|
|
240
344
|
"""
|
|
241
345
|
# Open the image
|
|
242
346
|
image = Image.open(io.BytesIO(image_bytes))
|
|
243
347
|
draw = ImageDraw.Draw(image)
|
|
244
|
-
|
|
348
|
+
|
|
245
349
|
# Draw crosshair lines (red, 2px thick)
|
|
246
350
|
crosshair_size = 20
|
|
247
351
|
line_width = 2
|
|
248
352
|
color = "red"
|
|
249
|
-
|
|
353
|
+
|
|
250
354
|
# Horizontal line
|
|
251
355
|
draw.line([(x - crosshair_size, y), (x + crosshair_size, y)], fill=color, width=line_width)
|
|
252
356
|
# Vertical line
|
|
253
357
|
draw.line([(x, y - crosshair_size), (x, y + crosshair_size)], fill=color, width=line_width)
|
|
254
|
-
|
|
358
|
+
|
|
255
359
|
# Draw center dot (filled circle)
|
|
256
360
|
dot_radius = 3
|
|
257
|
-
draw.ellipse(
|
|
258
|
-
|
|
361
|
+
draw.ellipse(
|
|
362
|
+
[(x - dot_radius, y - dot_radius), (x + dot_radius, y + dot_radius)], fill=color
|
|
363
|
+
)
|
|
364
|
+
|
|
259
365
|
# Convert back to bytes
|
|
260
366
|
output = io.BytesIO()
|
|
261
|
-
image.save(output, format=
|
|
367
|
+
image.save(output, format="PNG")
|
|
262
368
|
return output.getvalue()
|
|
263
369
|
|
|
264
370
|
@override
|
|
265
|
-
async def on_computer_call_end(
|
|
371
|
+
async def on_computer_call_end(
|
|
372
|
+
self, item: Dict[str, Any], result: List[Dict[str, Any]]
|
|
373
|
+
) -> None:
|
|
266
374
|
"""
|
|
267
375
|
Called when a computer call has completed.
|
|
268
376
|
Saves screenshots and computer call output.
|
|
269
377
|
"""
|
|
270
378
|
if not self.trajectory_id:
|
|
271
379
|
return
|
|
272
|
-
|
|
273
|
-
self._save_artifact("computer_call_result", {
|
|
274
|
-
|
|
380
|
+
|
|
381
|
+
self._save_artifact("computer_call_result", {"item": item, "result": result})
|
|
382
|
+
|
|
275
383
|
# Check if action has x/y coordinates and there's a screenshot in the result
|
|
276
384
|
action = item.get("action", {})
|
|
277
385
|
if "x" in action and "y" in action:
|
|
278
386
|
# Look for screenshot in the result
|
|
279
387
|
for result_item in result:
|
|
280
|
-
if (
|
|
281
|
-
result_item.get("
|
|
282
|
-
|
|
388
|
+
if (
|
|
389
|
+
result_item.get("type") == "computer_call_output"
|
|
390
|
+
and result_item.get("output", {}).get("type") == "input_image"
|
|
391
|
+
):
|
|
392
|
+
|
|
283
393
|
image_url = result_item["output"]["image_url"]
|
|
284
|
-
|
|
394
|
+
|
|
285
395
|
# Extract base64 image data
|
|
286
396
|
if image_url.startswith("data:image/"):
|
|
287
397
|
# Format: data:image/png;base64,<base64_data>
|
|
@@ -289,26 +399,24 @@ class TrajectorySaverCallback(AsyncCallbackHandler):
|
|
|
289
399
|
else:
|
|
290
400
|
# Assume it's just base64 data
|
|
291
401
|
base64_data = image_url
|
|
292
|
-
|
|
402
|
+
|
|
293
403
|
try:
|
|
294
404
|
# Decode the image
|
|
295
405
|
image_bytes = base64.b64decode(base64_data)
|
|
296
|
-
|
|
406
|
+
|
|
297
407
|
# Draw crosshair at the action coordinates
|
|
298
408
|
annotated_image = self._draw_crosshair_on_image(
|
|
299
|
-
image_bytes,
|
|
300
|
-
int(action["x"]),
|
|
301
|
-
int(action["y"])
|
|
409
|
+
image_bytes, int(action["x"]), int(action["y"])
|
|
302
410
|
)
|
|
303
|
-
|
|
411
|
+
|
|
304
412
|
# Save as screenshot_action
|
|
305
413
|
self._save_artifact("screenshot_action", annotated_image)
|
|
306
|
-
|
|
414
|
+
|
|
307
415
|
except Exception as e:
|
|
308
416
|
# If annotation fails, just log and continue
|
|
309
417
|
print(f"Failed to annotate screenshot: {e}")
|
|
310
|
-
|
|
418
|
+
|
|
311
419
|
break # Only process the first screenshot found
|
|
312
420
|
|
|
313
421
|
# Increment turn counter
|
|
314
|
-
self.current_turn += 1
|
|
422
|
+
self.current_turn += 1
|