openadapt-ml 0.2.0__py3-none-any.whl → 0.2.1__py3-none-any.whl
This diff compares two publicly released versions of the package as they appear in their public registry. It is provided for informational purposes only.
- openadapt_ml/baselines/__init__.py +121 -0
- openadapt_ml/baselines/adapter.py +185 -0
- openadapt_ml/baselines/cli.py +314 -0
- openadapt_ml/baselines/config.py +448 -0
- openadapt_ml/baselines/parser.py +922 -0
- openadapt_ml/baselines/prompts.py +787 -0
- openadapt_ml/benchmarks/__init__.py +13 -115
- openadapt_ml/benchmarks/agent.py +265 -421
- openadapt_ml/benchmarks/azure.py +28 -19
- openadapt_ml/benchmarks/azure_ops_tracker.py +521 -0
- openadapt_ml/benchmarks/cli.py +1722 -4847
- openadapt_ml/benchmarks/trace_export.py +631 -0
- openadapt_ml/benchmarks/viewer.py +22 -5
- openadapt_ml/benchmarks/vm_monitor.py +530 -29
- openadapt_ml/benchmarks/waa_deploy/Dockerfile +47 -53
- openadapt_ml/benchmarks/waa_deploy/api_agent.py +21 -20
- openadapt_ml/cloud/azure_inference.py +3 -5
- openadapt_ml/cloud/lambda_labs.py +722 -307
- openadapt_ml/cloud/local.py +2038 -487
- openadapt_ml/cloud/ssh_tunnel.py +68 -26
- openadapt_ml/datasets/next_action.py +40 -30
- openadapt_ml/evals/grounding.py +8 -3
- openadapt_ml/evals/plot_eval_metrics.py +15 -13
- openadapt_ml/evals/trajectory_matching.py +41 -26
- openadapt_ml/experiments/demo_prompt/format_demo.py +16 -6
- openadapt_ml/experiments/demo_prompt/run_experiment.py +26 -16
- openadapt_ml/experiments/representation_shootout/__init__.py +70 -0
- openadapt_ml/experiments/representation_shootout/conditions.py +708 -0
- openadapt_ml/experiments/representation_shootout/config.py +390 -0
- openadapt_ml/experiments/representation_shootout/evaluator.py +659 -0
- openadapt_ml/experiments/representation_shootout/runner.py +687 -0
- openadapt_ml/experiments/waa_demo/runner.py +29 -14
- openadapt_ml/export/parquet.py +36 -24
- openadapt_ml/grounding/detector.py +18 -14
- openadapt_ml/ingest/__init__.py +8 -6
- openadapt_ml/ingest/capture.py +25 -22
- openadapt_ml/ingest/loader.py +7 -4
- openadapt_ml/ingest/synthetic.py +189 -100
- openadapt_ml/models/api_adapter.py +14 -4
- openadapt_ml/models/base_adapter.py +10 -2
- openadapt_ml/models/providers/__init__.py +288 -0
- openadapt_ml/models/providers/anthropic.py +266 -0
- openadapt_ml/models/providers/base.py +299 -0
- openadapt_ml/models/providers/google.py +376 -0
- openadapt_ml/models/providers/openai.py +342 -0
- openadapt_ml/models/qwen_vl.py +46 -19
- openadapt_ml/perception/__init__.py +35 -0
- openadapt_ml/perception/integration.py +399 -0
- openadapt_ml/retrieval/demo_retriever.py +50 -24
- openadapt_ml/retrieval/embeddings.py +9 -8
- openadapt_ml/retrieval/retriever.py +3 -1
- openadapt_ml/runtime/__init__.py +50 -0
- openadapt_ml/runtime/policy.py +18 -5
- openadapt_ml/runtime/safety_gate.py +471 -0
- openadapt_ml/schema/__init__.py +9 -0
- openadapt_ml/schema/converters.py +74 -27
- openadapt_ml/schema/episode.py +31 -18
- openadapt_ml/scripts/capture_screenshots.py +530 -0
- openadapt_ml/scripts/compare.py +85 -54
- openadapt_ml/scripts/demo_policy.py +4 -1
- openadapt_ml/scripts/eval_policy.py +15 -9
- openadapt_ml/scripts/make_gif.py +1 -1
- openadapt_ml/scripts/prepare_synthetic.py +3 -1
- openadapt_ml/scripts/train.py +21 -9
- openadapt_ml/segmentation/README.md +920 -0
- openadapt_ml/segmentation/__init__.py +97 -0
- openadapt_ml/segmentation/adapters/__init__.py +5 -0
- openadapt_ml/segmentation/adapters/capture_adapter.py +420 -0
- openadapt_ml/segmentation/annotator.py +610 -0
- openadapt_ml/segmentation/cache.py +290 -0
- openadapt_ml/segmentation/cli.py +674 -0
- openadapt_ml/segmentation/deduplicator.py +656 -0
- openadapt_ml/segmentation/frame_describer.py +788 -0
- openadapt_ml/segmentation/pipeline.py +340 -0
- openadapt_ml/segmentation/schemas.py +622 -0
- openadapt_ml/segmentation/segment_extractor.py +634 -0
- openadapt_ml/training/azure_ops_viewer.py +1097 -0
- openadapt_ml/training/benchmark_viewer.py +52 -41
- openadapt_ml/training/shared_ui.py +7 -7
- openadapt_ml/training/stub_provider.py +57 -35
- openadapt_ml/training/trainer.py +143 -86
- openadapt_ml/training/trl_trainer.py +70 -21
- openadapt_ml/training/viewer.py +323 -108
- openadapt_ml/training/viewer_components.py +180 -0
- {openadapt_ml-0.2.0.dist-info → openadapt_ml-0.2.1.dist-info}/METADATA +215 -14
- openadapt_ml-0.2.1.dist-info/RECORD +116 -0
- openadapt_ml/benchmarks/base.py +0 -366
- openadapt_ml/benchmarks/data_collection.py +0 -432
- openadapt_ml/benchmarks/live_tracker.py +0 -180
- openadapt_ml/benchmarks/runner.py +0 -418
- openadapt_ml/benchmarks/waa.py +0 -761
- openadapt_ml/benchmarks/waa_live.py +0 -619
- openadapt_ml-0.2.0.dist-info/RECORD +0 -86
- {openadapt_ml-0.2.0.dist-info → openadapt_ml-0.2.1.dist-info}/WHEEL +0 -0
- {openadapt_ml-0.2.0.dist-info → openadapt_ml-0.2.1.dist-info}/licenses/LICENSE +0 -0
@@ -0,0 +1,631 @@
"""Export WAA benchmark traces as training data.

This module provides functionality to filter and export successful WAA benchmark
traces in a format suitable for VLM fine-tuning. It converts benchmark execution
traces to the openadapt-ml Episode format.

Usage:
    # Via CLI
    uv run python -m openadapt_ml.benchmarks.cli export-traces --status passed --output training_data/

    # Via Python
    from openadapt_ml.benchmarks.trace_export import export_traces, TraceExporter

    # Export all passing traces
    exporter = TraceExporter(
        benchmark_dir=Path("benchmark_results/waa_eval_20241214"),
        output_dir=Path("training_data"),
        status_filter="passed",
    )
    episodes = exporter.export()

    # Or use convenience function
    episodes = export_traces(
        benchmark_dir="benchmark_results/waa_eval_20241214",
        output_dir="training_data",
        status_filter="passed",
    )

Directory structure created:
    training_data/
    |-- episodes/
    |   |-- episode_001.json       # Episode schema format
    |   |-- episode_002.json
    |   |-- ...
    |-- screenshots/
    |   |-- episode_001/
    |   |   |-- step_000.png
    |   |   |-- step_001.png
    |   |-- episode_002/
    |-- manifest.json              # Index of all exported episodes
    |-- training_samples.jsonl     # JSONL format for training
"""

from __future__ import annotations

import json
import logging
import shutil
from dataclasses import dataclass, field
from datetime import datetime
from pathlib import Path
from typing import Any, Literal

from openadapt_ml.benchmarks.viewer import (
    load_benchmark_metadata,
    load_benchmark_summary,
    load_task_results,
)
from openadapt_ml.schema import (
    Action,
    ActionType,
    BenchmarkSource,
    Coordinates,
    Episode,
    Observation,
    Step,
    save_episode,
)

logger = logging.getLogger(__name__)


StatusFilter = Literal["passed", "failed", "all"]


@dataclass
class ExportStats:
    """Statistics from a trace export operation."""

    total_tasks: int = 0
    exported_tasks: int = 0
    skipped_tasks: int = 0
    total_steps: int = 0
    exported_screenshots: int = 0
    errors: list[str] = field(default_factory=list)


@dataclass
class TraceExporter:
    """Export WAA benchmark traces as training data.

    Filters and converts benchmark execution traces to Episode format,
    copies screenshots, and creates training-ready data files.

    Args:
        benchmark_dir: Path to benchmark results directory containing metadata.json,
            summary.json, and tasks/ subdirectory.
        output_dir: Output directory for exported training data.
        status_filter: Filter by task status ("passed", "failed", "all").
        copy_screenshots: Whether to copy screenshots to output directory.
        create_jsonl: Whether to create training_samples.jsonl file.
        viewport_size: Default viewport size (width, height) for normalizing coordinates.
    """

    benchmark_dir: Path
    output_dir: Path
    status_filter: StatusFilter = "passed"
    copy_screenshots: bool = True
    create_jsonl: bool = True
    viewport_size: tuple[int, int] = (1920, 1200)

    def __post_init__(self):
        self.benchmark_dir = Path(self.benchmark_dir)
        self.output_dir = Path(self.output_dir)

    def export(self) -> list[Episode]:
        """Export traces according to configuration.

        Returns:
            List of Episode objects created from the traces.
        """
        # Load benchmark data
        metadata = load_benchmark_metadata(self.benchmark_dir)
        load_benchmark_summary(self.benchmark_dir)
        tasks = load_task_results(self.benchmark_dir)

        logger.info(
            f"Loaded {len(tasks)} tasks from {self.benchmark_dir.name} "
            f"(model: {metadata.get('model_id', 'unknown')})"
        )

        # Filter tasks
        filtered_tasks = self._filter_tasks(tasks)
        logger.info(
            f"Filtered to {len(filtered_tasks)} tasks with status={self.status_filter}"
        )

        if not filtered_tasks:
            logger.warning("No tasks match the filter criteria")
            return []

        # Create output directories
        self._setup_output_dirs()

        # Convert and export
        episodes = []
        stats = ExportStats(total_tasks=len(tasks))

        for i, task in enumerate(filtered_tasks):
            try:
                episode = self._convert_task_to_episode(task, i, metadata)
                episodes.append(episode)

                # Save episode JSON
                episode_path = (
                    self.output_dir / "episodes" / f"{episode.episode_id}.json"
                )
                save_episode(episode, episode_path)

                # Copy screenshots if enabled
                if self.copy_screenshots:
                    self._copy_task_screenshots(task, episode.episode_id)
                    stats.exported_screenshots += len(task.get("screenshots", []))

                stats.exported_tasks += 1
                stats.total_steps += len(episode.steps)

                logger.debug(
                    f"Exported episode {episode.episode_id}: "
                    f"{len(episode.steps)} steps, success={episode.success}"
                )

            except Exception as e:
                error_msg = (
                    f"Failed to export task {task.get('task_id', 'unknown')}: {e}"
                )
                logger.error(error_msg)
                stats.errors.append(error_msg)
                stats.skipped_tasks += 1

        # Create manifest
        self._create_manifest(episodes, metadata, stats)

        # Create JSONL training file
        if self.create_jsonl:
            self._create_training_jsonl(episodes)

        # Log summary
        logger.info(
            f"Export complete: {stats.exported_tasks}/{stats.total_tasks} tasks, "
            f"{stats.total_steps} steps, {stats.exported_screenshots} screenshots"
        )
        if stats.errors:
            logger.warning(f"{len(stats.errors)} errors during export")

        return episodes

    def _filter_tasks(self, tasks: list[dict[str, Any]]) -> list[dict[str, Any]]:
        """Filter tasks by status.

        Args:
            tasks: List of task dictionaries from load_task_results.

        Returns:
            Filtered list of tasks.
        """
        if self.status_filter == "all":
            return tasks

        filtered = []
        for task in tasks:
            execution = task.get("execution", {})
            success = execution.get("success", False)

            if self.status_filter == "passed" and success:
                filtered.append(task)
            elif self.status_filter == "failed" and not success:
                filtered.append(task)

        return filtered

    def _setup_output_dirs(self) -> None:
        """Create output directory structure."""
        self.output_dir.mkdir(parents=True, exist_ok=True)
        (self.output_dir / "episodes").mkdir(exist_ok=True)
        if self.copy_screenshots:
            (self.output_dir / "screenshots").mkdir(exist_ok=True)

    def _convert_task_to_episode(
        self,
        task: dict[str, Any],
        index: int,
        metadata: dict[str, Any],
    ) -> Episode:
        """Convert a benchmark task to Episode format.

        Args:
            task: Task dictionary from load_task_results.
            index: Task index for episode ID generation.
            metadata: Benchmark metadata.

        Returns:
            Episode instance.
        """
        definition = task.get("definition", {})
        execution = task.get("execution", {})
        screenshots = task.get("screenshots", [])
        execution_steps = execution.get("steps", [])

        task_id = task.get("task_id", f"task_{index:03d}")
        episode_id = f"waa_{task_id}"

        # Convert execution steps to Episode steps
        steps = []
        for step_idx, step_data in enumerate(execution_steps):
            step = self._convert_step(step_data, step_idx, screenshots)
            steps.append(step)

        return Episode(
            episode_id=episode_id,
            task_id=task_id,
            instruction=definition.get("instruction", ""),
            goal=definition.get("instruction", ""),
            steps=steps,
            success=execution.get("success", False),
            final_reward=execution.get("score", 0.0),
            source=BenchmarkSource.WAA,
            source_file=str(self.benchmark_dir / "tasks" / task_id),
            agent_model=metadata.get("model_id", "unknown"),
            environment="Windows 11",
            tags=[
                definition.get("domain", "unknown"),
                "waa",
                "benchmark",
            ],
            metadata={
                "benchmark_name": metadata.get("benchmark_name", "waa"),
                "run_name": metadata.get("run_name"),
                "domain": definition.get("domain"),
                "num_steps": execution.get("num_steps", len(steps)),
                "total_time_seconds": execution.get("total_time_seconds"),
                "error": execution.get("error"),
                "reason": execution.get("reason"),
                "evaluation_spec": definition.get("evaluation_spec"),
            },
        )

    def _convert_step(
        self,
        step_data: dict[str, Any],
        step_idx: int,
        screenshots: list[str],
    ) -> Step:
        """Convert a benchmark execution step to Episode Step format.

        Args:
            step_data: Step data from execution.json.
            step_idx: Step index.
            screenshots: List of screenshot paths.

        Returns:
            Step instance.
        """
        action_data = step_data.get("action", {})

        # Build observation
        screenshot_path = None
        if step_idx < len(screenshots):
            screenshot_path = screenshots[step_idx]
        elif step_data.get("screenshot_path"):
            screenshot_path = step_data["screenshot_path"]

        observation = Observation(
            screenshot_path=screenshot_path,
            screen_size=self.viewport_size,
        )

        # Convert action type
        action_type = self._map_action_type(action_data.get("type", "click"))

        # Build action with coordinates
        action_kwargs: dict[str, Any] = {
            "type": action_type,
            "raw": action_data,
        }

        # Handle coordinates - convert to normalized if pixel values
        x = action_data.get("x")
        y = action_data.get("y")
        if x is not None and y is not None:
            # Check if already normalized (0-1 range)
            if 0 <= x <= 1 and 0 <= y <= 1:
                action_kwargs["normalized_coordinates"] = (x, y)
            else:
                # Assume pixel coordinates, normalize
                norm_x = x / self.viewport_size[0]
                norm_y = y / self.viewport_size[1]
                action_kwargs["normalized_coordinates"] = (norm_x, norm_y)
                # Also store pixel coordinates
                action_kwargs["coordinates"] = Coordinates(x=int(x), y=int(y))

        # Handle text for type action
        if action_data.get("text"):
            action_kwargs["text"] = action_data["text"]

        # Handle key for key action
        if action_data.get("key"):
            action_kwargs["key"] = action_data["key"]

        # Handle modifiers
        if action_data.get("modifiers"):
            action_kwargs["modifiers"] = action_data["modifiers"]

        # Handle scroll
        if action_data.get("scroll_direction"):
            action_kwargs["scroll_direction"] = action_data["scroll_direction"]
        if action_data.get("scroll_amount"):
            action_kwargs["scroll_amount"] = int(action_data["scroll_amount"])

        # Handle drag end coordinates
        end_x = action_data.get("end_x")
        end_y = action_data.get("end_y")
        if end_x is not None and end_y is not None:
            if 0 <= end_x <= 1 and 0 <= end_y <= 1:
                action_kwargs["normalized_end"] = (end_x, end_y)
            else:
                norm_end_x = end_x / self.viewport_size[0]
                norm_end_y = end_y / self.viewport_size[1]
                action_kwargs["normalized_end"] = (norm_end_x, norm_end_y)
                action_kwargs["end_coordinates"] = Coordinates(
                    x=int(end_x), y=int(end_y)
                )

        # Handle element targeting
        if action_data.get("target_node_id"):
            from openadapt_ml.schema import UIElement

            action_kwargs["element"] = UIElement(
                element_id=action_data.get("target_node_id"),
                role=action_data.get("target_role"),
                name=action_data.get("target_name"),
            )

        action = Action(**action_kwargs)

        return Step(
            step_index=step_idx,
            observation=observation,
            action=action,
            reasoning=step_data.get("reasoning"),
            timestamp=step_data.get("timestamp"),
        )

    def _map_action_type(self, action_type_str: str) -> ActionType:
        """Map benchmark action type string to ActionType enum.

        Args:
            action_type_str: Action type string from benchmark.

        Returns:
            ActionType enum value.
        """
        mapping = {
            "click": ActionType.CLICK,
            "double_click": ActionType.DOUBLE_CLICK,
            "right_click": ActionType.RIGHT_CLICK,
            "type": ActionType.TYPE,
            "key": ActionType.KEY,
            "scroll": ActionType.SCROLL,
            "drag": ActionType.DRAG,
            "hover": ActionType.HOVER,
            "wait": ActionType.WAIT,
            "done": ActionType.DONE,
            "answer": ActionType.DONE,
            "failed": ActionType.FAIL,
            "fail": ActionType.FAIL,
        }
        return mapping.get(action_type_str.lower(), ActionType.CLICK)

    def _copy_task_screenshots(self, task: dict[str, Any], episode_id: str) -> None:
        """Copy task screenshots to output directory.

        Args:
            task: Task dictionary.
            episode_id: Episode ID for output subdirectory.
        """
        screenshots = task.get("screenshots", [])
        if not screenshots:
            return

        # Create episode screenshot directory
        episode_screenshots_dir = self.output_dir / "screenshots" / episode_id
        episode_screenshots_dir.mkdir(parents=True, exist_ok=True)

        for i, rel_path in enumerate(screenshots):
            src_path = self.benchmark_dir / rel_path
            if src_path.exists():
                dest_path = episode_screenshots_dir / f"step_{i:03d}.png"
                shutil.copy2(src_path, dest_path)

    def _create_manifest(
        self,
        episodes: list[Episode],
        metadata: dict[str, Any],
        stats: ExportStats,
    ) -> None:
        """Create manifest.json with export metadata.

        Args:
            episodes: List of exported episodes.
            metadata: Benchmark metadata.
            stats: Export statistics.
        """
        manifest = {
            "export_timestamp": datetime.utcnow().isoformat(),
            "source_benchmark": metadata.get("benchmark_name", "waa"),
            "source_run": metadata.get("run_name"),
            "source_model": metadata.get("model_id"),
            "status_filter": self.status_filter,
            "statistics": {
                "total_tasks": stats.total_tasks,
                "exported_tasks": stats.exported_tasks,
                "skipped_tasks": stats.skipped_tasks,
                "total_steps": stats.total_steps,
                "exported_screenshots": stats.exported_screenshots,
                "errors": len(stats.errors),
            },
            "episodes": [
                {
                    "episode_id": ep.episode_id,
                    "task_id": ep.task_id,
                    "instruction": ep.instruction,
                    "num_steps": len(ep.steps),
                    "success": ep.success,
                    "domain": ep.metadata.get("domain") if ep.metadata else None,
                }
                for ep in episodes
            ],
        }

        manifest_path = self.output_dir / "manifest.json"
        with open(manifest_path, "w") as f:
            json.dump(manifest, f, indent=2)

        logger.info(f"Created manifest: {manifest_path}")

    def _create_training_jsonl(self, episodes: list[Episode]) -> None:
        """Create JSONL file for training.

        Each line contains a training sample with:
        - instruction: Task instruction
        - screenshot_path: Path to screenshot
        - action: Action taken
        - reasoning: Optional reasoning

        Args:
            episodes: List of exported episodes.
        """
        jsonl_path = self.output_dir / "training_samples.jsonl"

        with open(jsonl_path, "w") as f:
            for episode in episodes:
                for step in episode.steps:
                    sample = {
                        "episode_id": episode.episode_id,
                        "task_id": episode.task_id,
                        "instruction": episode.instruction,
                        "step_index": step.step_index,
                        "screenshot_path": step.observation.screenshot_path,
                        "action_type": step.action.type.value,
                        "action": {
                            "type": step.action.type.value,
                            "coordinates": (
                                {
                                    "x": step.action.coordinates.x,
                                    "y": step.action.coordinates.y,
                                }
                                if step.action.coordinates
                                else None
                            ),
                            "normalized_coordinates": step.action.normalized_coordinates,
                            "text": step.action.text,
                            "key": step.action.key,
                            "modifiers": step.action.modifiers,
                            "scroll_direction": step.action.scroll_direction,
                            "scroll_amount": step.action.scroll_amount,
                        },
                        "reasoning": step.reasoning,
                        "domain": episode.metadata.get("domain")
                        if episode.metadata
                        else None,
                        "success": episode.success,
                    }
                    f.write(json.dumps(sample) + "\n")

        logger.info(f"Created training JSONL: {jsonl_path}")


def export_traces(
    benchmark_dir: str | Path,
    output_dir: str | Path,
    status_filter: StatusFilter = "passed",
    copy_screenshots: bool = True,
    create_jsonl: bool = True,
    viewport_size: tuple[int, int] = (1920, 1200),
) -> list[Episode]:
    """Convenience function to export benchmark traces.

    Args:
        benchmark_dir: Path to benchmark results directory.
        output_dir: Output directory for exported training data.
        status_filter: Filter by task status ("passed", "failed", "all").
        copy_screenshots: Whether to copy screenshots to output directory.
        create_jsonl: Whether to create training_samples.jsonl file.
        viewport_size: Default viewport size for normalizing coordinates.

    Returns:
        List of Episode objects created from the traces.

    Example:
        episodes = export_traces(
            benchmark_dir="benchmark_results/waa_eval_20241214",
            output_dir="training_data",
            status_filter="passed",
        )
        print(f"Exported {len(episodes)} episodes")
    """
    exporter = TraceExporter(
        benchmark_dir=Path(benchmark_dir),
        output_dir=Path(output_dir),
        status_filter=status_filter,
        copy_screenshots=copy_screenshots,
        create_jsonl=create_jsonl,
        viewport_size=viewport_size,
    )
    return exporter.export()


def list_available_runs(
    benchmark_results_dir: str | Path = "benchmark_results",
) -> list[dict[str, Any]]:
    """List available benchmark runs for export.

    Args:
        benchmark_results_dir: Base directory containing benchmark results.

    Returns:
        List of dictionaries with run information.
    """
    results_dir = Path(benchmark_results_dir)
    if not results_dir.exists():
        return []

    runs = []
    for run_dir in sorted(results_dir.iterdir()):
        if not run_dir.is_dir():
            continue

        metadata_path = run_dir / "metadata.json"
        summary_path = run_dir / "summary.json"

        run_info = {
            "run_name": run_dir.name,
            "path": str(run_dir),
        }

        if metadata_path.exists():
            with open(metadata_path) as f:
                metadata = json.load(f)
            run_info.update(
                {
                    "benchmark_name": metadata.get("benchmark_name"),
                    "model_id": metadata.get("model_id"),
                    "created_at": metadata.get("created_at"),
                }
            )

        if summary_path.exists():
            with open(summary_path) as f:
                summary = json.load(f)
            run_info.update(
                {
                    "num_tasks": summary.get("num_tasks", 0),
                    "num_success": summary.get("num_success", 0),
                    "success_rate": summary.get("success_rate", 0.0),
                }
            )

        runs.append(run_info)

    return runs
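The training_samples.jsonl written by _create_training_jsonl is plain JSON Lines, so downstream code can consume it without importing openadapt-ml. Below is a minimal sketch (not part of the package) of reading the file and mapping normalized coordinates back to pixels; the path and the load_training_samples helper are hypothetical, and the viewport is assumed to match the (1920, 1200) default used at export time.

import json
from pathlib import Path

VIEWPORT = (1920, 1200)  # assumption: same viewport_size used when exporting

def load_training_samples(path: str | Path) -> list[dict]:
    """Read training_samples.jsonl into a list of per-step sample dicts."""
    with open(path) as f:
        return [json.loads(line) for line in f if line.strip()]

samples = load_training_samples("training_data/training_samples.jsonl")  # hypothetical path
for sample in samples[:3]:
    action = sample["action"]
    coords = action.get("normalized_coordinates")
    if coords is not None:
        # Normalized (0-1) coordinates scale back to pixels via the viewport.
        x_px = int(coords[0] * VIEWPORT[0])
        y_px = int(coords[1] * VIEWPORT[1])
        print(sample["instruction"], action["type"], (x_px, y_px))
    else:
        # Non-pointer actions carry text or key payloads instead.
        print(sample["instruction"], action["type"], action.get("text") or action.get("key"))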