cua-agent 0.3.2__py3-none-any.whl → 0.4.0b1__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Potentially problematic release.
This version of cua-agent might be problematic. Click here for more details.
- agent/__init__.py +15 -51
- agent/__main__.py +21 -0
- agent/adapters/__init__.py +9 -0
- agent/adapters/huggingfacelocal_adapter.py +216 -0
- agent/agent.py +577 -0
- agent/callbacks/__init__.py +17 -0
- agent/callbacks/base.py +153 -0
- agent/callbacks/budget_manager.py +44 -0
- agent/callbacks/image_retention.py +139 -0
- agent/callbacks/logging.py +247 -0
- agent/callbacks/pii_anonymization.py +259 -0
- agent/callbacks/trajectory_saver.py +305 -0
- agent/cli.py +290 -0
- agent/computer_handler.py +107 -0
- agent/decorators.py +90 -0
- agent/loops/__init__.py +11 -0
- agent/loops/anthropic.py +728 -0
- agent/loops/omniparser.py +339 -0
- agent/loops/openai.py +95 -0
- agent/loops/uitars.py +688 -0
- agent/responses.py +207 -0
- agent/types.py +79 -0
- agent/ui/__init__.py +7 -1
- agent/ui/gradio/__init__.py +6 -19
- agent/ui/gradio/app.py +80 -1299
- agent/ui/gradio/ui_components.py +703 -0
- cua_agent-0.4.0b1.dist-info/METADATA +424 -0
- cua_agent-0.4.0b1.dist-info/RECORD +30 -0
- agent/core/__init__.py +0 -27
- agent/core/agent.py +0 -210
- agent/core/base.py +0 -217
- agent/core/callbacks.py +0 -200
- agent/core/experiment.py +0 -249
- agent/core/factory.py +0 -122
- agent/core/messages.py +0 -332
- agent/core/provider_config.py +0 -21
- agent/core/telemetry.py +0 -142
- agent/core/tools/__init__.py +0 -21
- agent/core/tools/base.py +0 -74
- agent/core/tools/bash.py +0 -52
- agent/core/tools/collection.py +0 -46
- agent/core/tools/computer.py +0 -113
- agent/core/tools/edit.py +0 -67
- agent/core/tools/manager.py +0 -56
- agent/core/tools.py +0 -32
- agent/core/types.py +0 -88
- agent/core/visualization.py +0 -197
- agent/providers/__init__.py +0 -4
- agent/providers/anthropic/__init__.py +0 -6
- agent/providers/anthropic/api/client.py +0 -360
- agent/providers/anthropic/api/logging.py +0 -150
- agent/providers/anthropic/api_handler.py +0 -140
- agent/providers/anthropic/callbacks/__init__.py +0 -5
- agent/providers/anthropic/callbacks/manager.py +0 -65
- agent/providers/anthropic/loop.py +0 -568
- agent/providers/anthropic/prompts.py +0 -23
- agent/providers/anthropic/response_handler.py +0 -226
- agent/providers/anthropic/tools/__init__.py +0 -33
- agent/providers/anthropic/tools/base.py +0 -88
- agent/providers/anthropic/tools/bash.py +0 -66
- agent/providers/anthropic/tools/collection.py +0 -34
- agent/providers/anthropic/tools/computer.py +0 -396
- agent/providers/anthropic/tools/edit.py +0 -326
- agent/providers/anthropic/tools/manager.py +0 -54
- agent/providers/anthropic/tools/run.py +0 -42
- agent/providers/anthropic/types.py +0 -16
- agent/providers/anthropic/utils.py +0 -381
- agent/providers/omni/__init__.py +0 -8
- agent/providers/omni/api_handler.py +0 -42
- agent/providers/omni/clients/anthropic.py +0 -103
- agent/providers/omni/clients/base.py +0 -35
- agent/providers/omni/clients/oaicompat.py +0 -195
- agent/providers/omni/clients/ollama.py +0 -122
- agent/providers/omni/clients/openai.py +0 -155
- agent/providers/omni/clients/utils.py +0 -25
- agent/providers/omni/image_utils.py +0 -34
- agent/providers/omni/loop.py +0 -990
- agent/providers/omni/parser.py +0 -307
- agent/providers/omni/prompts.py +0 -64
- agent/providers/omni/tools/__init__.py +0 -30
- agent/providers/omni/tools/base.py +0 -29
- agent/providers/omni/tools/bash.py +0 -74
- agent/providers/omni/tools/computer.py +0 -179
- agent/providers/omni/tools/manager.py +0 -61
- agent/providers/omni/utils.py +0 -236
- agent/providers/openai/__init__.py +0 -6
- agent/providers/openai/api_handler.py +0 -456
- agent/providers/openai/loop.py +0 -472
- agent/providers/openai/response_handler.py +0 -205
- agent/providers/openai/tools/__init__.py +0 -15
- agent/providers/openai/tools/base.py +0 -79
- agent/providers/openai/tools/computer.py +0 -326
- agent/providers/openai/tools/manager.py +0 -106
- agent/providers/openai/types.py +0 -36
- agent/providers/openai/utils.py +0 -98
- agent/providers/uitars/__init__.py +0 -1
- agent/providers/uitars/clients/base.py +0 -35
- agent/providers/uitars/clients/mlxvlm.py +0 -263
- agent/providers/uitars/clients/oaicompat.py +0 -214
- agent/providers/uitars/loop.py +0 -660
- agent/providers/uitars/prompts.py +0 -63
- agent/providers/uitars/tools/__init__.py +0 -1
- agent/providers/uitars/tools/computer.py +0 -283
- agent/providers/uitars/tools/manager.py +0 -60
- agent/providers/uitars/utils.py +0 -264
- agent/telemetry.py +0 -21
- agent/ui/__main__.py +0 -15
- cua_agent-0.3.2.dist-info/METADATA +0 -295
- cua_agent-0.3.2.dist-info/RECORD +0 -87
- {cua_agent-0.3.2.dist-info → cua_agent-0.4.0b1.dist-info}/WHEEL +0 -0
- {cua_agent-0.3.2.dist-info → cua_agent-0.4.0b1.dist-info}/entry_points.txt +0 -0
|
@@ -0,0 +1,259 @@
|
|
|
1
|
+
"""
|
|
2
|
+
PII anonymization callback handler using Microsoft Presidio for text and image redaction.
|
|
3
|
+
"""
|
|
4
|
+
|
|
5
|
+
from typing import List, Dict, Any, Optional, Tuple
|
|
6
|
+
from .base import AsyncCallbackHandler
|
|
7
|
+
import base64
|
|
8
|
+
import io
|
|
9
|
+
import logging
|
|
10
|
+
|
|
11
|
+
try:
|
|
12
|
+
from presidio_analyzer import AnalyzerEngine
|
|
13
|
+
from presidio_anonymizer import AnonymizerEngine, DeanonymizeEngine
|
|
14
|
+
from presidio_anonymizer.entities import RecognizerResult, OperatorConfig
|
|
15
|
+
from presidio_image_redactor import ImageRedactorEngine
|
|
16
|
+
from PIL import Image
|
|
17
|
+
PRESIDIO_AVAILABLE = True
|
|
18
|
+
except ImportError:
|
|
19
|
+
PRESIDIO_AVAILABLE = False
|
|
20
|
+
|
|
21
|
+
logger = logging.getLogger(__name__)
|
|
22
|
+
|
|
23
|
+
class PIIAnonymizationCallback(AsyncCallbackHandler):
    """
    Callback handler that anonymizes PII in text and images using Microsoft Presidio.

    This handler:
    1. Anonymizes PII in message text before the messages reach the agent loop
    2. Restores the original text in computer_call payloads after the agent loop,
       when a payload string exactly equals a previously anonymized text
    3. Redacts PII from images in computer_call_output messages
    """

    def __init__(
        self,
        anonymize_text: bool = True,
        anonymize_images: bool = True,
        entities_to_anonymize: Optional[List[str]] = None,
        anonymization_operator: str = "replace",
        image_redaction_color: Tuple[int, int, int] = (255, 192, 203)  # Pink
    ):
        """
        Initialize the PII anonymization callback.

        Args:
            anonymize_text: Whether to anonymize text content
            anonymize_images: Whether to redact images
            entities_to_anonymize: List of entity types to anonymize (None for all)
            anonymization_operator: Presidio operator to use ("replace", "mask", "redact", etc.)
            image_redaction_color: RGB color for image redaction

        Raises:
            ImportError: If the Presidio packages could not be imported.
        """
        if not PRESIDIO_AVAILABLE:
            raise ImportError(
                "Presidio is not available. Install with: "
                "pip install presidio-analyzer presidio-anonymizer presidio-image-redactor"
            )

        self.anonymize_text = anonymize_text
        self.anonymize_images = anonymize_images
        self.entities_to_anonymize = entities_to_anonymize
        self.anonymization_operator = anonymization_operator
        self.image_redaction_color = image_redaction_color

        # Initialize Presidio engines
        self.analyzer = AnalyzerEngine()
        self.anonymizer = AnonymizerEngine()
        self.deanonymizer = DeanonymizeEngine()
        self.image_redactor = ImageRedactorEngine()

        # Store anonymization mappings for deanonymization.
        # Keyed by str(hash(original_text)); each value holds the original text,
        # the anonymized text and the analyzer results.
        self.anonymization_mappings: Dict[str, Any] = {}

    async def on_llm_start(self, messages: List[Dict[str, Any]]) -> List[Dict[str, Any]]:
        """
        Anonymize PII in messages before sending to agent loop.

        Args:
            messages: List of message dictionaries

        Returns:
            List of messages with PII anonymized. The input list is returned
            unchanged when both text and image anonymization are disabled.
        """
        if not self.anonymize_text and not self.anonymize_images:
            return messages

        return [await self._anonymize_message(msg) for msg in messages]

    async def on_llm_end(self, output: List[Dict[str, Any]]) -> List[Dict[str, Any]]:
        """
        Deanonymize PII in tool calls and message outputs after agent loop.

        Args:
            output: List of output dictionaries

        Returns:
            List of output items; computer_call / computer_call_output items are
            passed through _deanonymize_item, everything else is kept as-is.
        """
        if not self.anonymize_text:
            return output

        deanonymized_output = []
        for item in output:
            # Only deanonymize tool calls and computer_call messages
            if item.get("type") in ["computer_call", "computer_call_output"]:
                deanonymized_output.append(await self._deanonymize_item(item))
            else:
                deanonymized_output.append(item)

        return deanonymized_output

    async def _anonymize_message(self, message: Dict[str, Any]) -> Dict[str, Any]:
        """Return a shallow copy of ``message`` with text anonymized and images redacted."""
        msg_copy = message.copy()

        # Anonymize text content
        if self.anonymize_text:
            msg_copy = await self._anonymize_text_content(msg_copy)

        # Redact images in computer_call_output
        if self.anonymize_images and msg_copy.get("type") == "computer_call_output":
            msg_copy = await self._redact_image_content(msg_copy)

        return msg_copy

    async def _anonymize_text_content(self, message: Dict[str, Any]) -> Dict[str, Any]:
        """
        Anonymize the ``content`` of a message.

        ``content`` may be a plain string or a list of parts; in a list only dict
        parts with ``type == "text"`` are rewritten, other parts are kept as-is.
        """
        msg_copy = message.copy()

        content = msg_copy.get("content", [])
        if isinstance(content, str):
            anonymized_text, _ = await self._anonymize_text(content)
            msg_copy["content"] = anonymized_text
        elif isinstance(content, list):
            anonymized_content = []
            for item in content:
                if isinstance(item, dict) and item.get("type") == "text":
                    anonymized_text, _ = await self._anonymize_text(item.get("text", ""))
                    item_copy = item.copy()
                    item_copy["text"] = anonymized_text
                    anonymized_content.append(item_copy)
                else:
                    anonymized_content.append(item)
            msg_copy["content"] = anonymized_content

        return msg_copy

    async def _redact_image_content(self, message: Dict[str, Any]) -> Dict[str, Any]:
        """
        Redact PII from the ``output.image_url`` data URL of a computer_call_output message.

        Only ``data:image/...`` URLs are processed. The redacted image is always
        re-encoded as PNG. On any failure a warning is logged and the message is
        returned with its original (unredacted) image.
        """
        msg_copy = message.copy()
        output = msg_copy.get("output", {})

        if isinstance(output, dict) and "image_url" in output:
            try:
                # Extract base64 image data
                image_url = output["image_url"]
                if image_url.startswith("data:image/"):
                    # Parse data URL: everything after the first comma is the payload
                    _, data = image_url.split(",", 1)
                    image_data = base64.b64decode(data)

                    # Load image with PIL
                    image = Image.open(io.BytesIO(image_data))

                    # Redact PII from image
                    redacted_image = self.image_redactor.redact(image, self.image_redaction_color)

                    # Convert back to base64
                    buffer = io.BytesIO()
                    redacted_image.save(buffer, format="PNG")
                    redacted_data = base64.b64encode(buffer.getvalue()).decode()

                    # Update image URL on a copy so the caller's dict is untouched
                    output_copy = output.copy()
                    output_copy["image_url"] = f"data:image/png;base64,{redacted_data}"
                    msg_copy["output"] = output_copy

            except Exception as e:
                # Best effort: NOTE(review) this fails open, the unredacted image is kept.
                logger.warning("Failed to redact image: %s", e)

        return msg_copy

    async def _deanonymize_item(self, item: Dict[str, Any]) -> Dict[str, Any]:
        """
        Deanonymize string values in the payload of a computer_call item.

        Both the ``args`` and the ``action`` payload keys are handled, and a key is
        only rewritten when the item actually carries it, so no new keys are added
        to the item. Items of any other type are returned as a shallow copy.
        """
        item_copy = item.copy()

        if item.get("type") == "computer_call":
            # NOTE(review): "action" is assumed to be the payload key used by
            # computer_call items in this package; "args" is kept for
            # backward compatibility - confirm against the agent loop schema.
            for field in ("args", "action"):
                payload = item_copy.get(field)
                if isinstance(payload, dict):
                    item_copy[field] = await self._deanonymize_mapping(payload)

        return item_copy

    async def _deanonymize_mapping(self, payload: Dict[str, Any]) -> Dict[str, Any]:
        """Return a copy of ``payload`` with every string value passed through _deanonymize_text."""
        restored: Dict[str, Any] = {}
        for key, value in payload.items():
            if isinstance(value, str):
                restored[key], _ = await self._deanonymize_text(value)
            else:
                restored[key] = value
        return restored

    async def _anonymize_text(self, text: str) -> Tuple[str, List["RecognizerResult"]]:
        """
        Anonymize PII in text and return the anonymized text and results.

        The original text is returned with an empty result list when the input is
        not a non-blank string, when the analyzer finds nothing, or when Presidio
        raises (a warning is logged in that case).
        """
        # The RecognizerResult annotation above is quoted on purpose: it must not be
        # evaluated at class-definition time, otherwise importing this module without
        # Presidio installed raises NameError instead of the ImportError in __init__.
        if not isinstance(text, str) or not text.strip():
            return text, []

        try:
            # Analyze text for PII
            analyzer_results = self.analyzer.analyze(
                text=text,
                entities=self.entities_to_anonymize,
                language="en"
            )

            if not analyzer_results:
                return text, []

            # Apply the configured operator to every entity type that was detected
            detected_types = {result.entity_type for result in analyzer_results}
            anonymized_result = self.anonymizer.anonymize(
                text=text,
                analyzer_results=analyzer_results,
                operators={
                    entity_type: OperatorConfig(self.anonymization_operator)
                    for entity_type in detected_types
                }
            )

            # Store mapping for deanonymization
            mapping_key = str(hash(text))
            self.anonymization_mappings[mapping_key] = {
                "original": text,
                "anonymized": anonymized_result.text,
                "results": analyzer_results
            }

            return anonymized_result.text, analyzer_results

        except Exception as e:
            logger.warning("Failed to anonymize text: %s", e)
            return text, []

    async def _deanonymize_text(self, text: str) -> Tuple[str, bool]:
        """
        Attempt to deanonymize text using stored mappings.

        Returns:
            ``(original, True)`` when ``text`` exactly equals a stored anonymized
            text, otherwise ``(text, False)``.
        """
        try:
            # Look for matching anonymized text in mappings (whole-string match only)
            for mapping in self.anonymization_mappings.values():
                if mapping["anonymized"] == text:
                    return mapping["original"], True

            # If no mapping found, return original text
            return text, False

        except Exception as e:
            logger.warning("Failed to deanonymize text: %s", e)
            return text, False
|
|
@@ -0,0 +1,305 @@
|
|
|
1
|
+
"""
|
|
2
|
+
Trajectory saving callback handler for ComputerAgent.
|
|
3
|
+
"""
|
|
4
|
+
|
|
5
|
+
import os
|
|
6
|
+
import json
|
|
7
|
+
import uuid
|
|
8
|
+
from datetime import datetime
|
|
9
|
+
import base64
|
|
10
|
+
from pathlib import Path
|
|
11
|
+
from typing import List, Dict, Any, Optional, Union, override
|
|
12
|
+
from PIL import Image, ImageDraw
|
|
13
|
+
import io
|
|
14
|
+
from .base import AsyncCallbackHandler
|
|
15
|
+
|
|
16
|
+
def sanitize_image_urls(data: Any) -> Any:
    """
    Replace the value of every 'image_url' key, at any depth, with '[omitted]'.

    Dicts and lists are rebuilt recursively; the input is never modified.

    Args:
        data: Any data structure (dict, list, or primitive type)

    Returns:
        A new structure with all 'image_url' values replaced with '[omitted]'.
        Values that are neither dict nor list are returned as-is.
    """
    if isinstance(data, dict):
        return {
            field: "[omitted]" if field == "image_url" else sanitize_image_urls(payload)
            for field, payload in data.items()
        }

    if isinstance(data, list):
        return [sanitize_image_urls(entry) for entry in data]

    # Primitive types (str, int, bool, None, etc.) pass through untouched
    return data
|
|
44
|
+
|
|
45
|
+
|
|
46
|
+
class TrajectorySaverCallback(AsyncCallbackHandler):
    """
    Callback handler that saves agent trajectories to disk.

    Saves each run as a separate trajectory with unique ID, and each turn
    within the trajectory gets its own folder with screenshots and responses.
    """

    def __init__(self, trajectory_dir: str):
        """
        Initialize trajectory saver.

        Args:
            trajectory_dir: Base directory to save trajectories (created if missing)
        """
        self.trajectory_dir = Path(trajectory_dir)
        self.trajectory_id: Optional[str] = None
        self.current_turn: int = 0
        self.current_artifact: int = 0
        self.model: Optional[str] = None
        self.total_usage: Dict[str, Any] = {}

        # Ensure trajectory directory exists
        self.trajectory_dir.mkdir(parents=True, exist_ok=True)

    def _get_turn_dir(self) -> Path:
        """
        Get (and create) the directory for the current turn.

        Raises:
            ValueError: If no trajectory has been started yet.
        """
        if not self.trajectory_id:
            raise ValueError("Trajectory not initialized - call _on_run_start first")

        # format: trajectory_id/turn_000
        turn_dir = self.trajectory_dir / self.trajectory_id / f"turn_{self.current_turn:03d}"
        turn_dir.mkdir(parents=True, exist_ok=True)
        return turn_dir

    def _save_artifact(self, name: str, artifact: Union[str, bytes, Dict[str, Any]]) -> None:
        """
        Save an artifact to the current turn directory.

        Bytes are written verbatim as ``NNNN_name.png``; anything else is dumped
        as JSON to ``NNNN_name.json`` after image URLs have been stripped. The
        artifact counter is advanced after a successful write.
        """
        turn_dir = self._get_turn_dir()
        artifact_filename = f"{self.current_artifact:04d}_{name}"
        if isinstance(artifact, bytes):
            # format: turn_000/0000_name.png
            artifact_path = turn_dir / f"{artifact_filename}.png"
            with open(artifact_path, "wb") as f:
                f.write(artifact)
        else:
            # format: turn_000/0000_name.json
            artifact_path = turn_dir / f"{artifact_filename}.json"
            with open(artifact_path, "w", encoding="utf-8") as f:
                json.dump(sanitize_image_urls(artifact), f, indent=2)
        self.current_artifact += 1

    def _update_usage(self, usage: Dict[str, Any]) -> None:
        """
        Accumulate ``usage`` into ``self.total_usage``.

        Nested dicts are merged recursively and numeric leaves are summed.
        Leaves that are not numbers (for example ``None``) are skipped, so a
        partially filled usage report does not abort the run.
        """
        def add_dicts(target: Dict[str, Any], source: Dict[str, Any]) -> None:
            for key, value in source.items():
                if isinstance(value, dict):
                    add_dicts(target.setdefault(key, {}), value)
                elif isinstance(value, (int, float)):
                    target[key] = target.get(key, 0) + value
                # Non-numeric leaves cannot be summed; ignore them.

        add_dicts(self.total_usage, usage)

    @override
    async def on_run_start(self, kwargs: Dict[str, Any], old_items: List[Dict[str, Any]]) -> None:
        """
        Initialize trajectory tracking for a new run.

        Generates a new trajectory id, resets the turn/artifact/usage counters,
        creates the trajectory directory and writes ``metadata.json``.
        """
        model = kwargs.get("model", "unknown")
        # Short model name: last path segment of the part after the final "+",
        # prefixed with the first 4 chars of the part before the first "+" for
        # composed models.
        model_name_short = model.split("+")[-1].split("/")[-1].lower()[:16]
        if "+" in model:
            model_name_short = model.split("+")[0].lower()[:4] + "_" + model_name_short

        # id format: yyyy-mm-dd_model_hhmmss_uuid[:4]
        now = datetime.now()
        self.trajectory_id = f"{now.strftime('%Y-%m-%d')}_{model_name_short}_{now.strftime('%H%M%S')}_{str(uuid.uuid4())[:4]}"
        self.current_turn = 0
        self.current_artifact = 0
        self.model = model
        self.total_usage = {}

        # Create trajectory directory
        trajectory_path = self.trajectory_dir / self.trajectory_id
        trajectory_path.mkdir(parents=True, exist_ok=True)

        # Save trajectory metadata. kwargs go through sanitize_image_urls like every
        # other JSON artifact so embedded screenshots do not end up in metadata.json.
        metadata = {
            "trajectory_id": self.trajectory_id,
            "created_at": str(uuid.uuid1().time),
            "status": "running",
            "kwargs": sanitize_image_urls(kwargs),
        }

        with open(trajectory_path / "metadata.json", "w", encoding="utf-8") as f:
            json.dump(metadata, f, indent=2)

    @override
    async def on_run_end(self, kwargs: Dict[str, Any], old_items: List[Dict[str, Any]], new_items: List[Dict[str, Any]]) -> None:
        """Finalize run tracking by updating metadata with completion status, usage, and new items."""
        if not self.trajectory_id:
            return

        # Update metadata with completion status, total usage, and new items
        trajectory_path = self.trajectory_dir / self.trajectory_id
        metadata_path = trajectory_path / "metadata.json"

        # Read existing metadata
        if metadata_path.exists():
            with open(metadata_path, "r", encoding="utf-8") as f:
                metadata = json.load(f)
        else:
            metadata = {}

        # Update metadata with completion info
        metadata.update({
            "status": "completed",
            "completed_at": str(uuid.uuid1().time),
            "total_usage": self.total_usage,
            "new_items": sanitize_image_urls(new_items),
            "total_turns": self.current_turn
        })

        # Save updated metadata
        with open(metadata_path, "w", encoding="utf-8") as f:
            json.dump(metadata, f, indent=2)

    @override
    async def on_api_start(self, kwargs: Dict[str, Any]) -> None:
        """Save the API call arguments."""
        if not self.trajectory_id:
            return

        self._save_artifact("api_start", { "kwargs": kwargs })

    @override
    async def on_api_end(self, kwargs: Dict[str, Any], result: Any) -> None:
        """Save API call result."""
        if not self.trajectory_id:
            return

        self._save_artifact("api_result", { "kwargs": kwargs, "result": result })

    @override
    async def on_screenshot(self, screenshot: Union[str, bytes], name: str = "screenshot") -> None:
        """
        Save a screenshot.

        Args:
            screenshot: Raw image bytes, a base64 string, or a ``data:`` URL
                with a base64 payload.
            name: Artifact name used in the file name.
        """
        # Same guard as the other callbacks: nothing to save into before a run starts.
        if not self.trajectory_id:
            return

        if isinstance(screenshot, str):
            if screenshot.startswith("data:"):
                # Drop the "data:image/...;base64," header; only the payload is base64.
                screenshot = screenshot.split(",", 1)[-1]
            screenshot = base64.b64decode(screenshot)
        self._save_artifact(name, screenshot)

    @override
    async def on_usage(self, usage: Dict[str, Any]) -> None:
        """Called when usage information is received."""
        self._update_usage(usage)

    @override
    async def on_responses(self, kwargs: Dict[str, Any], responses: Dict[str, Any]) -> None:
        """Save responses to the current turn directory and advance the turn counter."""
        if not self.trajectory_id:
            return

        # Save responses
        response_data = {
            "timestamp": str(uuid.uuid1().time),
            "model": self.model,
            "kwargs": kwargs,
            "response": responses
        }

        self._save_artifact("agent_response", response_data)

        # Increment turn counter
        self.current_turn += 1

    def _draw_crosshair_on_image(self, image_bytes: bytes, x: int, y: int) -> bytes:
        """
        Draw a red dot and crosshair at the specified coordinates on the image.

        Args:
            image_bytes: The original image as bytes
            x: X coordinate for the crosshair
            y: Y coordinate for the crosshair

        Returns:
            Modified image as PNG bytes with red dot and crosshair
        """
        # Open the image
        image = Image.open(io.BytesIO(image_bytes))
        draw = ImageDraw.Draw(image)

        # Draw crosshair lines (red, 2px thick)
        crosshair_size = 20
        line_width = 2
        color = "red"

        # Horizontal line
        draw.line([(x - crosshair_size, y), (x + crosshair_size, y)], fill=color, width=line_width)
        # Vertical line
        draw.line([(x, y - crosshair_size), (x, y + crosshair_size)], fill=color, width=line_width)

        # Draw center dot (filled circle)
        dot_radius = 3
        draw.ellipse([(x - dot_radius, y - dot_radius), (x + dot_radius, y + dot_radius)], fill=color)

        # Convert back to bytes
        output = io.BytesIO()
        image.save(output, format='PNG')
        return output.getvalue()

    @override
    async def on_computer_call_end(self, item: Dict[str, Any], result: List[Dict[str, Any]]) -> None:
        """
        Called when a computer call has completed.

        Saves the computer call output and, when the action carries x/y
        coordinates, an annotated copy of the first screenshot in the result.
        """
        if not self.trajectory_id:
            return

        self._save_artifact("computer_call_result", { "item": item, "result": result })

        # Check if action has x/y coordinates and there's a screenshot in the result
        action = item.get("action", {})
        if "x" in action and "y" in action:
            # Look for screenshot in the result
            for result_item in result:
                if (result_item.get("type") == "computer_call_output" and
                        result_item.get("output", {}).get("type") == "input_image"):

                    try:
                        image_url = result_item["output"]["image_url"]

                        # Extract base64 image data
                        if image_url.startswith("data:image/"):
                            # Format: data:image/png;base64,<base64_data>
                            base64_data = image_url.split(",", 1)[1]
                        else:
                            # Assume it's just base64 data
                            base64_data = image_url

                        # Decode the image
                        image_bytes = base64.b64decode(base64_data)

                        # Draw crosshair at the action coordinates
                        annotated_image = self._draw_crosshair_on_image(
                            image_bytes,
                            int(action["x"]),
                            int(action["y"])
                        )

                        # Save as screenshot_action
                        self._save_artifact("screenshot_action", annotated_image)

                    except Exception as e:
                        # If annotation fails, just log and continue
                        print(f"Failed to annotate screenshot: {e}")

                    break  # Only process the first screenshot found

        # Increment turn counter
        # NOTE(review): on_responses also increments current_turn, so a response and
        # the computer call that follows it land in different turn directories -
        # confirm this is intended.
        self.current_turn += 1
|