openadapt-ml 0.2.0__py3-none-any.whl → 0.2.1__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (95)
  1. openadapt_ml/baselines/__init__.py +121 -0
  2. openadapt_ml/baselines/adapter.py +185 -0
  3. openadapt_ml/baselines/cli.py +314 -0
  4. openadapt_ml/baselines/config.py +448 -0
  5. openadapt_ml/baselines/parser.py +922 -0
  6. openadapt_ml/baselines/prompts.py +787 -0
  7. openadapt_ml/benchmarks/__init__.py +13 -115
  8. openadapt_ml/benchmarks/agent.py +265 -421
  9. openadapt_ml/benchmarks/azure.py +28 -19
  10. openadapt_ml/benchmarks/azure_ops_tracker.py +521 -0
  11. openadapt_ml/benchmarks/cli.py +1722 -4847
  12. openadapt_ml/benchmarks/trace_export.py +631 -0
  13. openadapt_ml/benchmarks/viewer.py +22 -5
  14. openadapt_ml/benchmarks/vm_monitor.py +530 -29
  15. openadapt_ml/benchmarks/waa_deploy/Dockerfile +47 -53
  16. openadapt_ml/benchmarks/waa_deploy/api_agent.py +21 -20
  17. openadapt_ml/cloud/azure_inference.py +3 -5
  18. openadapt_ml/cloud/lambda_labs.py +722 -307
  19. openadapt_ml/cloud/local.py +2038 -487
  20. openadapt_ml/cloud/ssh_tunnel.py +68 -26
  21. openadapt_ml/datasets/next_action.py +40 -30
  22. openadapt_ml/evals/grounding.py +8 -3
  23. openadapt_ml/evals/plot_eval_metrics.py +15 -13
  24. openadapt_ml/evals/trajectory_matching.py +41 -26
  25. openadapt_ml/experiments/demo_prompt/format_demo.py +16 -6
  26. openadapt_ml/experiments/demo_prompt/run_experiment.py +26 -16
  27. openadapt_ml/experiments/representation_shootout/__init__.py +70 -0
  28. openadapt_ml/experiments/representation_shootout/conditions.py +708 -0
  29. openadapt_ml/experiments/representation_shootout/config.py +390 -0
  30. openadapt_ml/experiments/representation_shootout/evaluator.py +659 -0
  31. openadapt_ml/experiments/representation_shootout/runner.py +687 -0
  32. openadapt_ml/experiments/waa_demo/runner.py +29 -14
  33. openadapt_ml/export/parquet.py +36 -24
  34. openadapt_ml/grounding/detector.py +18 -14
  35. openadapt_ml/ingest/__init__.py +8 -6
  36. openadapt_ml/ingest/capture.py +25 -22
  37. openadapt_ml/ingest/loader.py +7 -4
  38. openadapt_ml/ingest/synthetic.py +189 -100
  39. openadapt_ml/models/api_adapter.py +14 -4
  40. openadapt_ml/models/base_adapter.py +10 -2
  41. openadapt_ml/models/providers/__init__.py +288 -0
  42. openadapt_ml/models/providers/anthropic.py +266 -0
  43. openadapt_ml/models/providers/base.py +299 -0
  44. openadapt_ml/models/providers/google.py +376 -0
  45. openadapt_ml/models/providers/openai.py +342 -0
  46. openadapt_ml/models/qwen_vl.py +46 -19
  47. openadapt_ml/perception/__init__.py +35 -0
  48. openadapt_ml/perception/integration.py +399 -0
  49. openadapt_ml/retrieval/demo_retriever.py +50 -24
  50. openadapt_ml/retrieval/embeddings.py +9 -8
  51. openadapt_ml/retrieval/retriever.py +3 -1
  52. openadapt_ml/runtime/__init__.py +50 -0
  53. openadapt_ml/runtime/policy.py +18 -5
  54. openadapt_ml/runtime/safety_gate.py +471 -0
  55. openadapt_ml/schema/__init__.py +9 -0
  56. openadapt_ml/schema/converters.py +74 -27
  57. openadapt_ml/schema/episode.py +31 -18
  58. openadapt_ml/scripts/capture_screenshots.py +530 -0
  59. openadapt_ml/scripts/compare.py +85 -54
  60. openadapt_ml/scripts/demo_policy.py +4 -1
  61. openadapt_ml/scripts/eval_policy.py +15 -9
  62. openadapt_ml/scripts/make_gif.py +1 -1
  63. openadapt_ml/scripts/prepare_synthetic.py +3 -1
  64. openadapt_ml/scripts/train.py +21 -9
  65. openadapt_ml/segmentation/README.md +920 -0
  66. openadapt_ml/segmentation/__init__.py +97 -0
  67. openadapt_ml/segmentation/adapters/__init__.py +5 -0
  68. openadapt_ml/segmentation/adapters/capture_adapter.py +420 -0
  69. openadapt_ml/segmentation/annotator.py +610 -0
  70. openadapt_ml/segmentation/cache.py +290 -0
  71. openadapt_ml/segmentation/cli.py +674 -0
  72. openadapt_ml/segmentation/deduplicator.py +656 -0
  73. openadapt_ml/segmentation/frame_describer.py +788 -0
  74. openadapt_ml/segmentation/pipeline.py +340 -0
  75. openadapt_ml/segmentation/schemas.py +622 -0
  76. openadapt_ml/segmentation/segment_extractor.py +634 -0
  77. openadapt_ml/training/azure_ops_viewer.py +1097 -0
  78. openadapt_ml/training/benchmark_viewer.py +52 -41
  79. openadapt_ml/training/shared_ui.py +7 -7
  80. openadapt_ml/training/stub_provider.py +57 -35
  81. openadapt_ml/training/trainer.py +143 -86
  82. openadapt_ml/training/trl_trainer.py +70 -21
  83. openadapt_ml/training/viewer.py +323 -108
  84. openadapt_ml/training/viewer_components.py +180 -0
  85. {openadapt_ml-0.2.0.dist-info → openadapt_ml-0.2.1.dist-info}/METADATA +215 -14
  86. openadapt_ml-0.2.1.dist-info/RECORD +116 -0
  87. openadapt_ml/benchmarks/base.py +0 -366
  88. openadapt_ml/benchmarks/data_collection.py +0 -432
  89. openadapt_ml/benchmarks/live_tracker.py +0 -180
  90. openadapt_ml/benchmarks/runner.py +0 -418
  91. openadapt_ml/benchmarks/waa.py +0 -761
  92. openadapt_ml/benchmarks/waa_live.py +0 -619
  93. openadapt_ml-0.2.0.dist-info/RECORD +0 -86
  94. {openadapt_ml-0.2.0.dist-info → openadapt_ml-0.2.1.dist-info}/WHEEL +0 -0
  95. {openadapt_ml-0.2.0.dist-info → openadapt_ml-0.2.1.dist-info}/licenses/LICENSE +0 -0
openadapt_ml/benchmarks/trace_export.py
@@ -0,0 +1,631 @@
+ """Export WAA benchmark traces as training data.
+
+ This module provides functionality to filter and export successful WAA benchmark
+ traces in a format suitable for VLM fine-tuning. It converts benchmark execution
+ traces to the openadapt-ml Episode format.
+
+ Usage:
+     # Via CLI
+     uv run python -m openadapt_ml.benchmarks.cli export-traces --status passed --output training_data/
+
+     # Via Python
+     from openadapt_ml.benchmarks.trace_export import export_traces, TraceExporter
+
+     # Export all passing traces
+     exporter = TraceExporter(
+         benchmark_dir=Path("benchmark_results/waa_eval_20241214"),
+         output_dir=Path("training_data"),
+         status_filter="passed",
+     )
+     episodes = exporter.export()
+
+     # Or use convenience function
+     episodes = export_traces(
+         benchmark_dir="benchmark_results/waa_eval_20241214",
+         output_dir="training_data",
+         status_filter="passed",
+     )
+
+ Directory structure created:
+     training_data/
+     |-- episodes/
+     |   |-- episode_001.json      # Episode schema format
+     |   |-- episode_002.json
+     |   |-- ...
+     |-- screenshots/
+     |   |-- episode_001/
+     |   |   |-- step_000.png
+     |   |   |-- step_001.png
+     |   |-- episode_002/
+     |-- manifest.json             # Index of all exported episodes
+     |-- training_samples.jsonl    # JSONL format for training
+ """
+
+ from __future__ import annotations
+
+ import json
+ import logging
+ import shutil
+ from dataclasses import dataclass, field
+ from datetime import datetime
+ from pathlib import Path
+ from typing import Any, Literal
+
+ from openadapt_ml.benchmarks.viewer import (
+     load_benchmark_metadata,
+     load_benchmark_summary,
+     load_task_results,
+ )
+ from openadapt_ml.schema import (
+     Action,
+     ActionType,
+     BenchmarkSource,
+     Coordinates,
+     Episode,
+     Observation,
+     Step,
+     save_episode,
+ )
+
+ logger = logging.getLogger(__name__)
+
+
+ StatusFilter = Literal["passed", "failed", "all"]
+
+
+ @dataclass
+ class ExportStats:
+     """Statistics from a trace export operation."""
+
+     total_tasks: int = 0
+     exported_tasks: int = 0
+     skipped_tasks: int = 0
+     total_steps: int = 0
+     exported_screenshots: int = 0
+     errors: list[str] = field(default_factory=list)
+
+
+ @dataclass
+ class TraceExporter:
+     """Export WAA benchmark traces as training data.
+
+     Filters and converts benchmark execution traces to Episode format,
+     copies screenshots, and creates training-ready data files.
+
+     Args:
+         benchmark_dir: Path to benchmark results directory containing metadata.json,
+             summary.json, and tasks/ subdirectory.
+         output_dir: Output directory for exported training data.
+         status_filter: Filter by task status ("passed", "failed", "all").
+         copy_screenshots: Whether to copy screenshots to output directory.
+         create_jsonl: Whether to create training_samples.jsonl file.
+         viewport_size: Default viewport size (width, height) for normalizing coordinates.
+     """
+
+     benchmark_dir: Path
+     output_dir: Path
+     status_filter: StatusFilter = "passed"
+     copy_screenshots: bool = True
+     create_jsonl: bool = True
+     viewport_size: tuple[int, int] = (1920, 1200)
+
+     def __post_init__(self):
+         self.benchmark_dir = Path(self.benchmark_dir)
+         self.output_dir = Path(self.output_dir)
+
+     def export(self) -> list[Episode]:
+         """Export traces according to configuration.
+
+         Returns:
+             List of Episode objects created from the traces.
+         """
+         # Load benchmark data
+         metadata = load_benchmark_metadata(self.benchmark_dir)
+         load_benchmark_summary(self.benchmark_dir)
+         tasks = load_task_results(self.benchmark_dir)
+
+         logger.info(
+             f"Loaded {len(tasks)} tasks from {self.benchmark_dir.name} "
+             f"(model: {metadata.get('model_id', 'unknown')})"
+         )
+
+         # Filter tasks
+         filtered_tasks = self._filter_tasks(tasks)
+         logger.info(
+             f"Filtered to {len(filtered_tasks)} tasks with status={self.status_filter}"
+         )
+
+         if not filtered_tasks:
+             logger.warning("No tasks match the filter criteria")
+             return []
+
+         # Create output directories
+         self._setup_output_dirs()
+
+         # Convert and export
+         episodes = []
+         stats = ExportStats(total_tasks=len(tasks))
+
+         for i, task in enumerate(filtered_tasks):
+             try:
+                 episode = self._convert_task_to_episode(task, i, metadata)
+                 episodes.append(episode)
+
+                 # Save episode JSON
+                 episode_path = (
+                     self.output_dir / "episodes" / f"{episode.episode_id}.json"
+                 )
+                 save_episode(episode, episode_path)
+
+                 # Copy screenshots if enabled
+                 if self.copy_screenshots:
+                     self._copy_task_screenshots(task, episode.episode_id)
+                     stats.exported_screenshots += len(task.get("screenshots", []))
+
+                 stats.exported_tasks += 1
+                 stats.total_steps += len(episode.steps)
+
+                 logger.debug(
+                     f"Exported episode {episode.episode_id}: "
+                     f"{len(episode.steps)} steps, success={episode.success}"
+                 )
+
+             except Exception as e:
+                 error_msg = (
+                     f"Failed to export task {task.get('task_id', 'unknown')}: {e}"
+                 )
+                 logger.error(error_msg)
+                 stats.errors.append(error_msg)
+                 stats.skipped_tasks += 1
+
+         # Create manifest
+         self._create_manifest(episodes, metadata, stats)
+
+         # Create JSONL training file
+         if self.create_jsonl:
+             self._create_training_jsonl(episodes)
+
+         # Log summary
+         logger.info(
+             f"Export complete: {stats.exported_tasks}/{stats.total_tasks} tasks, "
+             f"{stats.total_steps} steps, {stats.exported_screenshots} screenshots"
+         )
+         if stats.errors:
+             logger.warning(f"{len(stats.errors)} errors during export")
+
+         return episodes
+
+     def _filter_tasks(self, tasks: list[dict[str, Any]]) -> list[dict[str, Any]]:
+         """Filter tasks by status.
+
+         Args:
+             tasks: List of task dictionaries from load_task_results.
+
+         Returns:
+             Filtered list of tasks.
+         """
+         if self.status_filter == "all":
+             return tasks
+
+         filtered = []
+         for task in tasks:
+             execution = task.get("execution", {})
+             success = execution.get("success", False)
+
+             if self.status_filter == "passed" and success:
+                 filtered.append(task)
+             elif self.status_filter == "failed" and not success:
+                 filtered.append(task)
+
+         return filtered
+
+     def _setup_output_dirs(self) -> None:
+         """Create output directory structure."""
+         self.output_dir.mkdir(parents=True, exist_ok=True)
+         (self.output_dir / "episodes").mkdir(exist_ok=True)
+         if self.copy_screenshots:
+             (self.output_dir / "screenshots").mkdir(exist_ok=True)
+
+     def _convert_task_to_episode(
+         self,
+         task: dict[str, Any],
+         index: int,
+         metadata: dict[str, Any],
+     ) -> Episode:
+         """Convert a benchmark task to Episode format.
+
+         Args:
+             task: Task dictionary from load_task_results.
+             index: Task index for episode ID generation.
+             metadata: Benchmark metadata.
+
+         Returns:
+             Episode instance.
+         """
+         definition = task.get("definition", {})
+         execution = task.get("execution", {})
+         screenshots = task.get("screenshots", [])
+         execution_steps = execution.get("steps", [])
+
+         task_id = task.get("task_id", f"task_{index:03d}")
+         episode_id = f"waa_{task_id}"
+
+         # Convert execution steps to Episode steps
+         steps = []
+         for step_idx, step_data in enumerate(execution_steps):
+             step = self._convert_step(step_data, step_idx, screenshots)
+             steps.append(step)
+
+         return Episode(
+             episode_id=episode_id,
+             task_id=task_id,
+             instruction=definition.get("instruction", ""),
+             goal=definition.get("instruction", ""),
+             steps=steps,
+             success=execution.get("success", False),
+             final_reward=execution.get("score", 0.0),
+             source=BenchmarkSource.WAA,
+             source_file=str(self.benchmark_dir / "tasks" / task_id),
+             agent_model=metadata.get("model_id", "unknown"),
+             environment="Windows 11",
+             tags=[
+                 definition.get("domain", "unknown"),
+                 "waa",
+                 "benchmark",
+             ],
+             metadata={
+                 "benchmark_name": metadata.get("benchmark_name", "waa"),
+                 "run_name": metadata.get("run_name"),
+                 "domain": definition.get("domain"),
+                 "num_steps": execution.get("num_steps", len(steps)),
+                 "total_time_seconds": execution.get("total_time_seconds"),
+                 "error": execution.get("error"),
+                 "reason": execution.get("reason"),
+                 "evaluation_spec": definition.get("evaluation_spec"),
+             },
+         )
+
+     def _convert_step(
+         self,
+         step_data: dict[str, Any],
+         step_idx: int,
+         screenshots: list[str],
+     ) -> Step:
+         """Convert a benchmark execution step to Episode Step format.
+
+         Args:
+             step_data: Step data from execution.json.
+             step_idx: Step index.
+             screenshots: List of screenshot paths.
+
+         Returns:
+             Step instance.
+         """
+         action_data = step_data.get("action", {})
+
+         # Build observation
+         screenshot_path = None
+         if step_idx < len(screenshots):
+             screenshot_path = screenshots[step_idx]
+         elif step_data.get("screenshot_path"):
+             screenshot_path = step_data["screenshot_path"]
+
+         observation = Observation(
+             screenshot_path=screenshot_path,
+             screen_size=self.viewport_size,
+         )
+
+         # Convert action type
+         action_type = self._map_action_type(action_data.get("type", "click"))
+
+         # Build action with coordinates
+         action_kwargs: dict[str, Any] = {
+             "type": action_type,
+             "raw": action_data,
+         }
+
+         # Handle coordinates - convert to normalized if pixel values
+         x = action_data.get("x")
+         y = action_data.get("y")
+         if x is not None and y is not None:
+             # Check if already normalized (0-1 range)
+             if 0 <= x <= 1 and 0 <= y <= 1:
+                 action_kwargs["normalized_coordinates"] = (x, y)
+             else:
+                 # Assume pixel coordinates, normalize
+                 norm_x = x / self.viewport_size[0]
+                 norm_y = y / self.viewport_size[1]
+                 action_kwargs["normalized_coordinates"] = (norm_x, norm_y)
+                 # Also store pixel coordinates
+                 action_kwargs["coordinates"] = Coordinates(x=int(x), y=int(y))
+
+         # Handle text for type action
+         if action_data.get("text"):
+             action_kwargs["text"] = action_data["text"]
+
+         # Handle key for key action
+         if action_data.get("key"):
+             action_kwargs["key"] = action_data["key"]
+
+         # Handle modifiers
+         if action_data.get("modifiers"):
+             action_kwargs["modifiers"] = action_data["modifiers"]
+
+         # Handle scroll
+         if action_data.get("scroll_direction"):
+             action_kwargs["scroll_direction"] = action_data["scroll_direction"]
+         if action_data.get("scroll_amount"):
+             action_kwargs["scroll_amount"] = int(action_data["scroll_amount"])
+
+         # Handle drag end coordinates
+         end_x = action_data.get("end_x")
+         end_y = action_data.get("end_y")
+         if end_x is not None and end_y is not None:
+             if 0 <= end_x <= 1 and 0 <= end_y <= 1:
+                 action_kwargs["normalized_end"] = (end_x, end_y)
+             else:
+                 norm_end_x = end_x / self.viewport_size[0]
+                 norm_end_y = end_y / self.viewport_size[1]
+                 action_kwargs["normalized_end"] = (norm_end_x, norm_end_y)
+                 action_kwargs["end_coordinates"] = Coordinates(
+                     x=int(end_x), y=int(end_y)
+                 )
+
+         # Handle element targeting
+         if action_data.get("target_node_id"):
+             from openadapt_ml.schema import UIElement
+
+             action_kwargs["element"] = UIElement(
+                 element_id=action_data.get("target_node_id"),
+                 role=action_data.get("target_role"),
+                 name=action_data.get("target_name"),
+             )
+
+         action = Action(**action_kwargs)
+
+         return Step(
+             step_index=step_idx,
+             observation=observation,
+             action=action,
+             reasoning=step_data.get("reasoning"),
+             timestamp=step_data.get("timestamp"),
+         )
+
+     def _map_action_type(self, action_type_str: str) -> ActionType:
+         """Map benchmark action type string to ActionType enum.
+
+         Args:
+             action_type_str: Action type string from benchmark.
+
+         Returns:
+             ActionType enum value.
+         """
+         mapping = {
+             "click": ActionType.CLICK,
+             "double_click": ActionType.DOUBLE_CLICK,
+             "right_click": ActionType.RIGHT_CLICK,
+             "type": ActionType.TYPE,
+             "key": ActionType.KEY,
+             "scroll": ActionType.SCROLL,
+             "drag": ActionType.DRAG,
+             "hover": ActionType.HOVER,
+             "wait": ActionType.WAIT,
+             "done": ActionType.DONE,
+             "answer": ActionType.DONE,
+             "failed": ActionType.FAIL,
+             "fail": ActionType.FAIL,
+         }
+         return mapping.get(action_type_str.lower(), ActionType.CLICK)
+
+     def _copy_task_screenshots(self, task: dict[str, Any], episode_id: str) -> None:
+         """Copy task screenshots to output directory.
+
+         Args:
+             task: Task dictionary.
+             episode_id: Episode ID for output subdirectory.
+         """
+         screenshots = task.get("screenshots", [])
+         if not screenshots:
+             return
+
+         # Create episode screenshot directory
+         episode_screenshots_dir = self.output_dir / "screenshots" / episode_id
+         episode_screenshots_dir.mkdir(parents=True, exist_ok=True)
+
+         for i, rel_path in enumerate(screenshots):
+             src_path = self.benchmark_dir / rel_path
+             if src_path.exists():
+                 dest_path = episode_screenshots_dir / f"step_{i:03d}.png"
+                 shutil.copy2(src_path, dest_path)
+
+     def _create_manifest(
+         self,
+         episodes: list[Episode],
+         metadata: dict[str, Any],
+         stats: ExportStats,
+     ) -> None:
+         """Create manifest.json with export metadata.
+
+         Args:
+             episodes: List of exported episodes.
+             metadata: Benchmark metadata.
+             stats: Export statistics.
+         """
+         manifest = {
+             "export_timestamp": datetime.utcnow().isoformat(),
+             "source_benchmark": metadata.get("benchmark_name", "waa"),
+             "source_run": metadata.get("run_name"),
+             "source_model": metadata.get("model_id"),
+             "status_filter": self.status_filter,
+             "statistics": {
+                 "total_tasks": stats.total_tasks,
+                 "exported_tasks": stats.exported_tasks,
+                 "skipped_tasks": stats.skipped_tasks,
+                 "total_steps": stats.total_steps,
+                 "exported_screenshots": stats.exported_screenshots,
+                 "errors": len(stats.errors),
+             },
+             "episodes": [
+                 {
+                     "episode_id": ep.episode_id,
+                     "task_id": ep.task_id,
+                     "instruction": ep.instruction,
+                     "num_steps": len(ep.steps),
+                     "success": ep.success,
+                     "domain": ep.metadata.get("domain") if ep.metadata else None,
+                 }
+                 for ep in episodes
+             ],
+         }
+
+         manifest_path = self.output_dir / "manifest.json"
+         with open(manifest_path, "w") as f:
+             json.dump(manifest, f, indent=2)
+
+         logger.info(f"Created manifest: {manifest_path}")
+
+     def _create_training_jsonl(self, episodes: list[Episode]) -> None:
+         """Create JSONL file for training.
+
+         Each line contains a training sample with:
+         - instruction: Task instruction
+         - screenshot_path: Path to screenshot
+         - action: Action taken
+         - reasoning: Optional reasoning
+
+         Args:
+             episodes: List of exported episodes.
+         """
+         jsonl_path = self.output_dir / "training_samples.jsonl"
+
+         with open(jsonl_path, "w") as f:
+             for episode in episodes:
+                 for step in episode.steps:
+                     sample = {
+                         "episode_id": episode.episode_id,
+                         "task_id": episode.task_id,
+                         "instruction": episode.instruction,
+                         "step_index": step.step_index,
+                         "screenshot_path": step.observation.screenshot_path,
+                         "action_type": step.action.type.value,
+                         "action": {
+                             "type": step.action.type.value,
+                             "coordinates": (
+                                 {
+                                     "x": step.action.coordinates.x,
+                                     "y": step.action.coordinates.y,
+                                 }
+                                 if step.action.coordinates
+                                 else None
+                             ),
+                             "normalized_coordinates": step.action.normalized_coordinates,
+                             "text": step.action.text,
+                             "key": step.action.key,
+                             "modifiers": step.action.modifiers,
+                             "scroll_direction": step.action.scroll_direction,
+                             "scroll_amount": step.action.scroll_amount,
+                         },
+                         "reasoning": step.reasoning,
+                         "domain": episode.metadata.get("domain")
+                         if episode.metadata
+                         else None,
+                         "success": episode.success,
+                     }
+                     f.write(json.dumps(sample) + "\n")
+
+         logger.info(f"Created training JSONL: {jsonl_path}")
+
+
+ def export_traces(
+     benchmark_dir: str | Path,
+     output_dir: str | Path,
+     status_filter: StatusFilter = "passed",
+     copy_screenshots: bool = True,
+     create_jsonl: bool = True,
+     viewport_size: tuple[int, int] = (1920, 1200),
+ ) -> list[Episode]:
+     """Convenience function to export benchmark traces.
+
+     Args:
+         benchmark_dir: Path to benchmark results directory.
+         output_dir: Output directory for exported training data.
+         status_filter: Filter by task status ("passed", "failed", "all").
+         copy_screenshots: Whether to copy screenshots to output directory.
+         create_jsonl: Whether to create training_samples.jsonl file.
+         viewport_size: Default viewport size for normalizing coordinates.
+
+     Returns:
+         List of Episode objects created from the traces.
+
+     Example:
+         episodes = export_traces(
+             benchmark_dir="benchmark_results/waa_eval_20241214",
+             output_dir="training_data",
+             status_filter="passed",
+         )
+         print(f"Exported {len(episodes)} episodes")
+     """
+     exporter = TraceExporter(
+         benchmark_dir=Path(benchmark_dir),
+         output_dir=Path(output_dir),
+         status_filter=status_filter,
+         copy_screenshots=copy_screenshots,
+         create_jsonl=create_jsonl,
+         viewport_size=viewport_size,
+     )
+     return exporter.export()
+
+
+ def list_available_runs(
+     benchmark_results_dir: str | Path = "benchmark_results",
+ ) -> list[dict[str, Any]]:
+     """List available benchmark runs for export.
+
+     Args:
+         benchmark_results_dir: Base directory containing benchmark results.
+
+     Returns:
+         List of dictionaries with run information.
+     """
+     results_dir = Path(benchmark_results_dir)
+     if not results_dir.exists():
+         return []
+
+     runs = []
+     for run_dir in sorted(results_dir.iterdir()):
+         if not run_dir.is_dir():
+             continue
+
+         metadata_path = run_dir / "metadata.json"
+         summary_path = run_dir / "summary.json"
+
+         run_info = {
+             "run_name": run_dir.name,
+             "path": str(run_dir),
+         }
+
+         if metadata_path.exists():
+             with open(metadata_path) as f:
+                 metadata = json.load(f)
+             run_info.update(
+                 {
+                     "benchmark_name": metadata.get("benchmark_name"),
+                     "model_id": metadata.get("model_id"),
+                     "created_at": metadata.get("created_at"),
+                 }
+             )
+
+         if summary_path.exists():
+             with open(summary_path) as f:
+                 summary = json.load(f)
+             run_info.update(
+                 {
+                     "num_tasks": summary.get("num_tasks", 0),
+                     "num_success": summary.get("num_success", 0),
+                     "success_rate": summary.get("success_rate", 0.0),
+                 }
+             )
+
+         runs.append(run_info)
+
+     return runs
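
For context, a minimal consumer sketch (not part of the published package): it reads the manifest.json and training_samples.jsonl files that TraceExporter writes, assuming an export has already been run into a hypothetical training_data/ directory with the layout described in the module docstring above. Field names come from the manifest and JSONL structures shown in the diff.

import json
from pathlib import Path

output_dir = Path("training_data")  # hypothetical export location

# manifest.json indexes the exported episodes and overall export statistics.
manifest = json.loads((output_dir / "manifest.json").read_text())
print(manifest["statistics"]["exported_tasks"], "episodes exported")

# training_samples.jsonl holds one step-level training sample per line.
with open(output_dir / "training_samples.jsonl") as f:
    for line in f:
        sample = json.loads(line)
        print(sample["episode_id"], sample["step_index"], sample["action_type"])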