openadapt_ml-0.1.0-py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (55)
  1. openadapt_ml/__init__.py +0 -0
  2. openadapt_ml/benchmarks/__init__.py +125 -0
  3. openadapt_ml/benchmarks/agent.py +825 -0
  4. openadapt_ml/benchmarks/azure.py +761 -0
  5. openadapt_ml/benchmarks/base.py +366 -0
  6. openadapt_ml/benchmarks/cli.py +884 -0
  7. openadapt_ml/benchmarks/data_collection.py +432 -0
  8. openadapt_ml/benchmarks/runner.py +381 -0
  9. openadapt_ml/benchmarks/waa.py +704 -0
  10. openadapt_ml/cloud/__init__.py +5 -0
  11. openadapt_ml/cloud/azure_inference.py +441 -0
  12. openadapt_ml/cloud/lambda_labs.py +2445 -0
  13. openadapt_ml/cloud/local.py +790 -0
  14. openadapt_ml/config.py +56 -0
  15. openadapt_ml/datasets/__init__.py +0 -0
  16. openadapt_ml/datasets/next_action.py +507 -0
  17. openadapt_ml/evals/__init__.py +23 -0
  18. openadapt_ml/evals/grounding.py +241 -0
  19. openadapt_ml/evals/plot_eval_metrics.py +174 -0
  20. openadapt_ml/evals/trajectory_matching.py +486 -0
  21. openadapt_ml/grounding/__init__.py +45 -0
  22. openadapt_ml/grounding/base.py +236 -0
  23. openadapt_ml/grounding/detector.py +570 -0
  24. openadapt_ml/ingest/__init__.py +43 -0
  25. openadapt_ml/ingest/capture.py +312 -0
  26. openadapt_ml/ingest/loader.py +232 -0
  27. openadapt_ml/ingest/synthetic.py +1102 -0
  28. openadapt_ml/models/__init__.py +0 -0
  29. openadapt_ml/models/api_adapter.py +171 -0
  30. openadapt_ml/models/base_adapter.py +59 -0
  31. openadapt_ml/models/dummy_adapter.py +42 -0
  32. openadapt_ml/models/qwen_vl.py +426 -0
  33. openadapt_ml/runtime/__init__.py +0 -0
  34. openadapt_ml/runtime/policy.py +182 -0
  35. openadapt_ml/schemas/__init__.py +53 -0
  36. openadapt_ml/schemas/sessions.py +122 -0
  37. openadapt_ml/schemas/validation.py +252 -0
  38. openadapt_ml/scripts/__init__.py +0 -0
  39. openadapt_ml/scripts/compare.py +1490 -0
  40. openadapt_ml/scripts/demo_policy.py +62 -0
  41. openadapt_ml/scripts/eval_policy.py +287 -0
  42. openadapt_ml/scripts/make_gif.py +153 -0
  43. openadapt_ml/scripts/prepare_synthetic.py +43 -0
  44. openadapt_ml/scripts/run_qwen_login_benchmark.py +192 -0
  45. openadapt_ml/scripts/train.py +174 -0
  46. openadapt_ml/training/__init__.py +0 -0
  47. openadapt_ml/training/benchmark_viewer.py +1538 -0
  48. openadapt_ml/training/shared_ui.py +157 -0
  49. openadapt_ml/training/stub_provider.py +276 -0
  50. openadapt_ml/training/trainer.py +2446 -0
  51. openadapt_ml/training/viewer.py +2970 -0
  52. openadapt_ml-0.1.0.dist-info/METADATA +818 -0
  53. openadapt_ml-0.1.0.dist-info/RECORD +55 -0
  54. openadapt_ml-0.1.0.dist-info/WHEEL +4 -0
  55. openadapt_ml-0.1.0.dist-info/licenses/LICENSE +21 -0
openadapt_ml/benchmarks/data_collection.py
@@ -0,0 +1,432 @@
+"""Data collection for benchmark viewer integration.
+
+This module handles saving execution traces during benchmark runs for later
+replay in the benchmark viewer. It creates a structured directory layout with
+screenshots, metadata, and execution traces.
+
+Directory structure:
+    benchmark_results/
+    ├── waa_eval_YYYYMMDD_HHMMSS/
+    │   ├── metadata.json
+    │   ├── tasks/
+    │   │   ├── task_001/
+    │   │   │   ├── task.json
+    │   │   │   ├── screenshots/
+    │   │   │   │   ├── step_000.png
+    │   │   │   │   ├── step_001.png
+    │   │   │   │   └── ...
+    │   │   │   └── execution.json
+    │   │   └── task_002/
+    │   │       └── ...
+    │   └── summary.json
+
+Example:
+    from openadapt_ml.benchmarks.data_collection import ExecutionTraceCollector
+
+    collector = ExecutionTraceCollector(
+        benchmark_name="waa",
+        run_name="waa_eval_20241214",
+        model_id="qwen3vl-2b-epoch5"
+    )
+
+    # During evaluation
+    collector.start_task(task)
+    for step_idx, (obs, action) in enumerate(trajectory):
+        collector.record_step(step_idx, obs, action, reasoning="...")
+    collector.finish_task(result)
+"""
+
+from __future__ import annotations
+
+import json
+import logging
+from dataclasses import asdict, dataclass
+from datetime import datetime
+from pathlib import Path
+from typing import Any
+
+from openadapt_ml.benchmarks.base import (
+    BenchmarkAction,
+    BenchmarkObservation,
+    BenchmarkResult,
+    BenchmarkTask,
+)
+
+logger = logging.getLogger(__name__)
+
+
+@dataclass
+class ExecutionStep:
+    """Single step in execution trace.
+
+    Attributes:
+        step_idx: Step index in the trajectory.
+        screenshot_path: Relative path to screenshot image.
+        action: Action taken at this step.
+        reasoning: Optional reasoning/thought from the agent.
+        timestamp: Timestamp when step was recorded.
+    """
+
+    step_idx: int
+    screenshot_path: str | None
+    action: dict[str, Any]  # Serialized BenchmarkAction
+    reasoning: str | None = None
+    timestamp: float | None = None
+
+
+class ExecutionTraceCollector:
+    """Collects execution traces during benchmark runs.
+
+    This class handles:
+    - Creating the directory structure for a benchmark run
+    - Saving screenshots at each step
+    - Recording actions and reasoning
+    - Saving task results and metadata
+
+    Args:
+        benchmark_name: Name of the benchmark (e.g., "waa", "webarena").
+        run_name: Unique name for this evaluation run (e.g., "waa_eval_20241214").
+        model_id: Identifier for the model being evaluated.
+        output_dir: Base directory for benchmark results (default: "./benchmark_results").
+    """
+
+    def __init__(
+        self,
+        benchmark_name: str,
+        run_name: str | None = None,
+        model_id: str = "unknown",
+        output_dir: str | Path = "benchmark_results",
+    ):
+        self.benchmark_name = benchmark_name
+        self.model_id = model_id
+
+        # Auto-generate run_name if not provided
+        if run_name is None:
+            timestamp = datetime.now().strftime("%Y%m%d_%H%M%S")
+            run_name = f"{benchmark_name}_eval_{timestamp}"
+        self.run_name = run_name
+
+        # Set up directory structure
+        self.output_dir = Path(output_dir)
+        self.run_dir = self.output_dir / run_name
+        self.tasks_dir = self.run_dir / "tasks"
+
+        # Current task tracking
+        self._current_task: BenchmarkTask | None = None
+        self._current_task_dir: Path | None = None
+        self._current_screenshots_dir: Path | None = None
+        self._current_steps: list[ExecutionStep] = []
+
+        # Initialize run
+        self._initialize_run()
+
+    def _initialize_run(self) -> None:
+        """Initialize the benchmark run directory and metadata."""
+        self.run_dir.mkdir(parents=True, exist_ok=True)
+        self.tasks_dir.mkdir(exist_ok=True)
+
+        # Save run metadata
+        metadata = {
+            "benchmark_name": self.benchmark_name,
+            "run_name": self.run_name,
+            "model_id": self.model_id,
+            "created_at": datetime.now().isoformat(),
+        }
+
+        metadata_path = self.run_dir / "metadata.json"
+        with open(metadata_path, "w") as f:
+            json.dump(metadata, f, indent=2)
+
+        logger.info(f"Initialized benchmark run at: {self.run_dir}")
+
+    def start_task(self, task: BenchmarkTask) -> None:
+        """Start collecting data for a new task.
+
+        Args:
+            task: The benchmark task being executed.
+        """
+        if self._current_task is not None:
+            logger.warning(
+                f"Starting new task {task.task_id} without finishing {self._current_task.task_id}"
+            )
+
+        self._current_task = task
+        self._current_steps = []
+
+        # Create task directory
+        task_dir_name = self._sanitize_task_id(task.task_id)
+        self._current_task_dir = self.tasks_dir / task_dir_name
+        self._current_task_dir.mkdir(parents=True, exist_ok=True)
+
+        # Create screenshots directory
+        self._current_screenshots_dir = self._current_task_dir / "screenshots"
+        self._current_screenshots_dir.mkdir(exist_ok=True)
+
+        # Save task definition
+        task_data = {
+            "task_id": task.task_id,
+            "instruction": task.instruction,
+            "domain": task.domain,
+            "initial_state_ref": task.initial_state_ref,
+            "time_limit_steps": task.time_limit_steps,
+            "raw_config": task.raw_config,
+            "evaluation_spec": task.evaluation_spec,
+        }
+
+        task_path = self._current_task_dir / "task.json"
+        with open(task_path, "w") as f:
+            json.dump(task_data, f, indent=2)
+
+        logger.info(f"Started collecting data for task: {task.task_id}")
+
+    def record_step(
+        self,
+        step_idx: int,
+        observation: BenchmarkObservation,
+        action: BenchmarkAction,
+        reasoning: str | None = None,
+    ) -> None:
+        """Record a single step in the execution trace.
+
+        Args:
+            step_idx: Index of this step in the trajectory.
+            observation: Observation at this step.
+            action: Action taken at this step.
+            reasoning: Optional reasoning/thought from the agent.
+        """
+        if self._current_task is None:
+            raise RuntimeError("No task started. Call start_task() first.")
+
+        # Save screenshot if available
+        screenshot_path = None
+        if observation.screenshot is not None:
+            screenshot_path = self._save_screenshot(step_idx, observation.screenshot)
+        elif observation.screenshot_path is not None:
+            # Copy existing screenshot
+            screenshot_path = self._copy_screenshot(step_idx, observation.screenshot_path)
+
+        # Create execution step record
+        step = ExecutionStep(
+            step_idx=step_idx,
+            screenshot_path=screenshot_path,
+            action=self._serialize_action(action),
+            reasoning=reasoning,
+            timestamp=datetime.now().timestamp(),
+        )
+
+        self._current_steps.append(step)
+
+    def finish_task(self, result: BenchmarkResult) -> None:
+        """Finish collecting data for the current task and save execution trace.
+
+        Args:
+            result: The evaluation result for the task.
+        """
+        if self._current_task is None:
+            raise RuntimeError("No task started. Call start_task() first.")
+
+        # Save execution trace
+        execution_data = {
+            "task_id": result.task_id,
+            "model_id": self.model_id,
+            "success": result.success,
+            "score": result.score,
+            "num_steps": result.num_steps,
+            "total_time_seconds": result.total_time_seconds,
+            "error": result.error,
+            "reason": result.reason,
+            "steps": [asdict(step) for step in self._current_steps],
+        }
+
+        execution_path = self._current_task_dir / "execution.json"
+        with open(execution_path, "w") as f:
+            json.dump(execution_data, f, indent=2)
+
+        logger.info(
+            f"Saved execution trace for task {result.task_id}: "
+            f"{'SUCCESS' if result.success else 'FAIL'} ({result.num_steps} steps)"
+        )
+
+        # Clear current task
+        self._current_task = None
+        self._current_task_dir = None
+        self._current_screenshots_dir = None
+        self._current_steps = []
+
+    def save_summary(self, all_results: list[BenchmarkResult]) -> None:
+        """Save summary of all task results.
+
+        Args:
+            all_results: List of all BenchmarkResult objects from the run.
+        """
+        summary = {
+            "benchmark_name": self.benchmark_name,
+            "run_name": self.run_name,
+            "model_id": self.model_id,
+            "num_tasks": len(all_results),
+            "num_success": sum(1 for r in all_results if r.success),
+            "success_rate": sum(1 for r in all_results if r.success) / len(all_results) if all_results else 0.0,
+            "avg_score": sum(r.score for r in all_results) / len(all_results) if all_results else 0.0,
+            "avg_steps": sum(r.num_steps for r in all_results) / len(all_results) if all_results else 0.0,
+            "avg_time_seconds": sum(r.total_time_seconds for r in all_results) / len(all_results) if all_results else 0.0,
+            "tasks": [
+                {
+                    "task_id": r.task_id,
+                    "success": r.success,
+                    "score": r.score,
+                    "num_steps": r.num_steps,
+                    "error": r.error,
+                }
+                for r in all_results
+            ],
+        }
+
+        summary_path = self.run_dir / "summary.json"
+        with open(summary_path, "w") as f:
+            json.dump(summary, f, indent=2)
+
+        logger.info(
+            f"Saved summary: {summary['num_success']}/{summary['num_tasks']} tasks succeeded "
+            f"({summary['success_rate']:.1%})"
+        )
+
+    def _save_screenshot(self, step_idx: int, screenshot_bytes: bytes) -> str:
+        """Save screenshot bytes to file.
+
+        Args:
+            step_idx: Step index for naming the file.
+            screenshot_bytes: PNG image bytes.
+
+        Returns:
+            Relative path to the saved screenshot.
+        """
+        if self._current_screenshots_dir is None:
+            raise RuntimeError("No task started")
+
+        filename = f"step_{step_idx:03d}.png"
+        screenshot_path = self._current_screenshots_dir / filename
+
+        with open(screenshot_path, "wb") as f:
+            f.write(screenshot_bytes)
+
+        # Return relative path from task directory
+        return f"screenshots/{filename}"
+
+    def _copy_screenshot(self, step_idx: int, source_path: str) -> str:
+        """Copy screenshot from existing path.
+
+        Args:
+            step_idx: Step index for naming the file.
+            source_path: Path to existing screenshot.
+
+        Returns:
+            Relative path to the copied screenshot.
+        """
+        if self._current_screenshots_dir is None:
+            raise RuntimeError("No task started")
+
+        filename = f"step_{step_idx:03d}.png"
+        dest_path = self._current_screenshots_dir / filename
+
+        # Copy file
+        import shutil
+        shutil.copy2(source_path, dest_path)
+
+        return f"screenshots/{filename}"
+
+    def _serialize_action(self, action: BenchmarkAction) -> dict[str, Any]:
+        """Serialize BenchmarkAction to dict.
+
+        Args:
+            action: Action to serialize.
+
+        Returns:
+            Dictionary representation of the action.
+        """
+        return {
+            "type": action.type,
+            "x": action.x,
+            "y": action.y,
+            "target_node_id": action.target_node_id,
+            "target_bbox": action.target_bbox,
+            "target_role": action.target_role,
+            "target_name": action.target_name,
+            "text": action.text,
+            "key": action.key,
+            "modifiers": action.modifiers,
+            "scroll_direction": action.scroll_direction,
+            "scroll_amount": action.scroll_amount,
+            "end_x": action.end_x,
+            "end_y": action.end_y,
+            "answer": action.answer,
+            "raw_action": action.raw_action,
+        }
+
+    def _sanitize_task_id(self, task_id: str) -> str:
+        """Sanitize task ID for use as directory name.
+
+        Args:
+            task_id: Original task ID.
+
+        Returns:
+            Sanitized task ID safe for filesystem.
+        """
+        # Replace unsafe characters with underscores
+        safe_id = "".join(c if c.isalnum() or c in "-_" else "_" for c in task_id)
+        return safe_id
+
+
+def save_execution_trace(
+    task: BenchmarkTask,
+    result: BenchmarkResult,
+    trajectory: list[tuple[BenchmarkObservation, BenchmarkAction]],
+    benchmark_name: str,
+    model_id: str = "unknown",
+    output_dir: str | Path = "benchmark_results",
+    run_name: str | None = None,
+    reasoning_map: dict[int, str] | None = None,
+) -> Path:
+    """Convenience function to save a complete execution trace.
+
+    This is a simpler alternative to using ExecutionTraceCollector directly
+    when you have the complete trajectory available.
+
+    Args:
+        task: The benchmark task.
+        result: The evaluation result.
+        trajectory: List of (observation, action) pairs.
+        benchmark_name: Name of the benchmark.
+        model_id: Identifier for the model.
+        output_dir: Base directory for results.
+        run_name: Optional run name (auto-generated if None).
+        reasoning_map: Optional map of step_idx -> reasoning text.
+
+    Returns:
+        Path to the task directory.
+
+    Example:
+        save_execution_trace(
+            task=task,
+            result=result,
+            trajectory=trajectory,
+            benchmark_name="waa",
+            model_id="qwen3vl-2b-epoch5",
+            reasoning_map={0: "I should click the button", 1: "Now type the text"}
+        )
+    """
+    collector = ExecutionTraceCollector(
+        benchmark_name=benchmark_name,
+        run_name=run_name,
+        model_id=model_id,
+        output_dir=output_dir,
+    )
+
+    collector.start_task(task)
+
+    for step_idx, (obs, action) in enumerate(trajectory):
+        reasoning = reasoning_map.get(step_idx) if reasoning_map else None
+        collector.record_step(step_idx, obs, action, reasoning)
+
+    collector.finish_task(result)
+
+    # finish_task() clears _current_task_dir, so recompute the task directory path.
+    return collector.tasks_dir / collector._sanitize_task_id(task.task_id)
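The snippet below is not part of the package; it is a minimal sketch of how a downstream script might read back a run directory produced by ExecutionTraceCollector, using only the files and JSON keys written by _initialize_run(), finish_task(), and save_summary() above. The run directory name is a placeholder.

Example (illustrative only):

    import json
    from pathlib import Path

    # Placeholder path; point this at a real run directory created by ExecutionTraceCollector.
    run_dir = Path("benchmark_results") / "waa_eval_20241214_120000"

    # Run-level metadata and summary.
    metadata = json.loads((run_dir / "metadata.json").read_text())
    summary = json.loads((run_dir / "summary.json").read_text())
    print(f"{metadata['benchmark_name']} / {metadata['model_id']}: "
          f"{summary['num_success']}/{summary['num_tasks']} tasks succeeded")

    # Per-task execution traces.
    for task_dir in sorted((run_dir / "tasks").iterdir()):
        execution = json.loads((task_dir / "execution.json").read_text())
        status = "SUCCESS" if execution["success"] else "FAIL"
        print(f"{task_dir.name}: {status} in {execution['num_steps']} steps")
        for step in execution["steps"]:
            # screenshot_path is stored relative to the task directory; action is the serialized dict.
            screenshot = task_dir / step["screenshot_path"] if step["screenshot_path"] else None
            action_type = step["action"].get("type")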