openadapt_ml-0.1.0-py3-none-any.whl

This diff shows the content of publicly available package versions released to a supported registry. It is provided for informational purposes only and reflects the package contents as they appear in the public registry.
Files changed (55)
  1. openadapt_ml/__init__.py +0 -0
  2. openadapt_ml/benchmarks/__init__.py +125 -0
  3. openadapt_ml/benchmarks/agent.py +825 -0
  4. openadapt_ml/benchmarks/azure.py +761 -0
  5. openadapt_ml/benchmarks/base.py +366 -0
  6. openadapt_ml/benchmarks/cli.py +884 -0
  7. openadapt_ml/benchmarks/data_collection.py +432 -0
  8. openadapt_ml/benchmarks/runner.py +381 -0
  9. openadapt_ml/benchmarks/waa.py +704 -0
  10. openadapt_ml/cloud/__init__.py +5 -0
  11. openadapt_ml/cloud/azure_inference.py +441 -0
  12. openadapt_ml/cloud/lambda_labs.py +2445 -0
  13. openadapt_ml/cloud/local.py +790 -0
  14. openadapt_ml/config.py +56 -0
  15. openadapt_ml/datasets/__init__.py +0 -0
  16. openadapt_ml/datasets/next_action.py +507 -0
  17. openadapt_ml/evals/__init__.py +23 -0
  18. openadapt_ml/evals/grounding.py +241 -0
  19. openadapt_ml/evals/plot_eval_metrics.py +174 -0
  20. openadapt_ml/evals/trajectory_matching.py +486 -0
  21. openadapt_ml/grounding/__init__.py +45 -0
  22. openadapt_ml/grounding/base.py +236 -0
  23. openadapt_ml/grounding/detector.py +570 -0
  24. openadapt_ml/ingest/__init__.py +43 -0
  25. openadapt_ml/ingest/capture.py +312 -0
  26. openadapt_ml/ingest/loader.py +232 -0
  27. openadapt_ml/ingest/synthetic.py +1102 -0
  28. openadapt_ml/models/__init__.py +0 -0
  29. openadapt_ml/models/api_adapter.py +171 -0
  30. openadapt_ml/models/base_adapter.py +59 -0
  31. openadapt_ml/models/dummy_adapter.py +42 -0
  32. openadapt_ml/models/qwen_vl.py +426 -0
  33. openadapt_ml/runtime/__init__.py +0 -0
  34. openadapt_ml/runtime/policy.py +182 -0
  35. openadapt_ml/schemas/__init__.py +53 -0
  36. openadapt_ml/schemas/sessions.py +122 -0
  37. openadapt_ml/schemas/validation.py +252 -0
  38. openadapt_ml/scripts/__init__.py +0 -0
  39. openadapt_ml/scripts/compare.py +1490 -0
  40. openadapt_ml/scripts/demo_policy.py +62 -0
  41. openadapt_ml/scripts/eval_policy.py +287 -0
  42. openadapt_ml/scripts/make_gif.py +153 -0
  43. openadapt_ml/scripts/prepare_synthetic.py +43 -0
  44. openadapt_ml/scripts/run_qwen_login_benchmark.py +192 -0
  45. openadapt_ml/scripts/train.py +174 -0
  46. openadapt_ml/training/__init__.py +0 -0
  47. openadapt_ml/training/benchmark_viewer.py +1538 -0
  48. openadapt_ml/training/shared_ui.py +157 -0
  49. openadapt_ml/training/stub_provider.py +276 -0
  50. openadapt_ml/training/trainer.py +2446 -0
  51. openadapt_ml/training/viewer.py +2970 -0
  52. openadapt_ml-0.1.0.dist-info/METADATA +818 -0
  53. openadapt_ml-0.1.0.dist-info/RECORD +55 -0
  54. openadapt_ml-0.1.0.dist-info/WHEEL +4 -0
  55. openadapt_ml-0.1.0.dist-info/licenses/LICENSE +21 -0
openadapt_ml/benchmarks/runner.py
@@ -0,0 +1,381 @@
+ """Evaluation runner for benchmarks.
+
+ This module provides functions to run agents on benchmarks and collect results.
+
+ Example:
+     from openadapt_ml.benchmarks import WAAAdapter, PolicyAgent, evaluate_agent_on_benchmark
+
+     adapter = WAAAdapter(waa_repo_path="/path/to/WAA")
+     agent = PolicyAgent(policy)
+     results = evaluate_agent_on_benchmark(agent, adapter, max_steps=50)
+
+     print(f"Success rate: {sum(r.success for r in results) / len(results):.1%}")
+ """
+
+ from __future__ import annotations
+
+ import logging
+ import time
+ from concurrent.futures import ThreadPoolExecutor, as_completed
+ from dataclasses import dataclass
+ from typing import Callable
+
+ from openadapt_ml.benchmarks.agent import BenchmarkAgent
+ from openadapt_ml.benchmarks.base import (
+     BenchmarkAdapter,
+     BenchmarkAction,
+     BenchmarkObservation,
+     BenchmarkResult,
+     BenchmarkTask,
+ )
+ from openadapt_ml.benchmarks.data_collection import ExecutionTraceCollector
+
+ logger = logging.getLogger(__name__)
+
+
+ @dataclass
+ class EvaluationConfig:
+     """Configuration for benchmark evaluation.
+
+     Attributes:
+         max_steps: Maximum steps per task.
+         parallel: Number of parallel workers (if supported).
+         save_trajectories: Whether to save full trajectories in results.
+         verbose: Whether to print progress.
+         on_step: Optional callback called after each step.
+         on_task_complete: Optional callback called after each task.
+         save_execution_traces: Whether to save execution traces for viewer.
+         model_id: Model identifier for execution traces.
+         output_dir: Output directory for benchmark results.
+         run_name: Name for this evaluation run.
+     """
+
+     max_steps: int = 50
+     parallel: int = 1
+     save_trajectories: bool = True
+     verbose: bool = True
+     on_step: Callable[[BenchmarkObservation, BenchmarkAction, int], None] | None = None
+     on_task_complete: Callable[[BenchmarkResult], None] | None = None
+     save_execution_traces: bool = True
+     model_id: str = "unknown"
+     output_dir: str = "benchmark_results"
+     run_name: str | None = None
+
+
+ def evaluate_agent_on_benchmark(
+     agent: BenchmarkAgent,
+     adapter: BenchmarkAdapter,
+     task_ids: list[str] | None = None,
+     max_steps: int = 50,
+     parallel: int = 1,
+     config: EvaluationConfig | None = None,
+ ) -> list[BenchmarkResult]:
+     """Run agent on benchmark tasks and collect results.
+
+     Args:
+         agent: Agent to evaluate.
+         adapter: Benchmark adapter.
+         task_ids: Specific tasks to run (None = all tasks).
+         max_steps: Maximum steps per task (overridden by config if provided).
+         parallel: Number of parallel workers (overridden by config if provided).
+         config: Full evaluation configuration.
+
+     Returns:
+         List of BenchmarkResult for each task.
+     """
+     if config is None:
+         config = EvaluationConfig(max_steps=max_steps, parallel=parallel)
+
+     # Load tasks
+     if task_ids is not None:
+         tasks = [adapter.load_task(tid) for tid in task_ids]
+     else:
+         tasks = adapter.list_tasks()
+
+     if config.verbose:
+         logger.info(f"Evaluating {len(tasks)} tasks on {adapter.name}")
+
+     # Initialize execution trace collector if enabled
+     trace_collector = None
+     if config.save_execution_traces:
+         trace_collector = ExecutionTraceCollector(
+             benchmark_name=adapter.name,
+             run_name=config.run_name,
+             model_id=config.model_id,
+             output_dir=config.output_dir,
+         )
+         if config.verbose:
+             logger.info(f"Saving execution traces to: {trace_collector.run_dir}")
+
+     # Run evaluation
+     if config.parallel > 1 and adapter.supports_parallel:
+         results = _evaluate_parallel(agent, adapter, tasks, config, trace_collector)
+     else:
+         results = _evaluate_sequential(agent, adapter, tasks, config, trace_collector)
+
+     # Save summary if trace collection is enabled
+     if trace_collector is not None:
+         trace_collector.save_summary(results)
+
+     # Log summary
+     if config.verbose:
+         success_count = sum(1 for r in results if r.success)
+         success_rate = success_count / len(results) if results else 0
+         avg_steps = sum(r.num_steps for r in results) / len(results) if results else 0
+         logger.info(
+             f"Evaluation complete: {success_count}/{len(results)} "
+             f"({success_rate:.1%}) success, {avg_steps:.1f} avg steps"
+         )
+
+     return results
+
+
+ def _evaluate_sequential(
+     agent: BenchmarkAgent,
+     adapter: BenchmarkAdapter,
+     tasks: list[BenchmarkTask],
+     config: EvaluationConfig,
+     trace_collector: ExecutionTraceCollector | None = None,
+ ) -> list[BenchmarkResult]:
+     """Run evaluation sequentially.
+
+     Args:
+         agent: Agent to evaluate.
+         adapter: Benchmark adapter.
+         tasks: Tasks to evaluate.
+         config: Evaluation configuration.
+         trace_collector: Optional trace collector for saving execution data.
+
+     Returns:
+         List of results.
+     """
+     results = []
+     for i, task in enumerate(tasks):
+         if config.verbose:
+             logger.info(f"Task {i + 1}/{len(tasks)}: {task.task_id}")
+
+         result = _run_single_task(agent, adapter, task, config, trace_collector)
+         results.append(result)
+
+         if config.on_task_complete:
+             config.on_task_complete(result)
+
+     return results
+
+
+ def _evaluate_parallel(
+     agent: BenchmarkAgent,
+     adapter: BenchmarkAdapter,
+     tasks: list[BenchmarkTask],
+     config: EvaluationConfig,
+     trace_collector: ExecutionTraceCollector | None = None,
+ ) -> list[BenchmarkResult]:
+     """Run evaluation in parallel.
+
+     Note: This requires the adapter to support parallel execution
+     (e.g., via multiple VM instances).
+
+     Args:
+         agent: Agent to evaluate.
+         adapter: Benchmark adapter.
+         tasks: Tasks to evaluate.
+         config: Evaluation configuration.
+         trace_collector: Optional trace collector for saving execution data.
+
+     Returns:
+         List of results.
+     """
+     results = []
+
+     with ThreadPoolExecutor(max_workers=config.parallel) as executor:
+         # Submit all tasks
+         future_to_task = {
+             executor.submit(_run_single_task, agent, adapter, task, config, trace_collector): task
+             for task in tasks
+         }
+
+         # Collect results as they complete
+         for future in as_completed(future_to_task):
+             task = future_to_task[future]
+             try:
+                 result = future.result()
+                 results.append(result)
+
+                 if config.on_task_complete:
+                     config.on_task_complete(result)
+
+                 if config.verbose:
+                     status = "SUCCESS" if result.success else "FAIL"
+                     logger.info(f"Task {task.task_id}: {status}")
+
+             except Exception as e:
+                 logger.error(f"Task {task.task_id} failed with error: {e}")
+                 results.append(
+                     BenchmarkResult(
+                         task_id=task.task_id,
+                         success=False,
+                         score=0.0,
+                         error=str(e),
+                     )
+                 )
+
+     return results
+
+
+ def _run_single_task(
+     agent: BenchmarkAgent,
+     adapter: BenchmarkAdapter,
+     task: BenchmarkTask,
+     config: EvaluationConfig,
+     trace_collector: ExecutionTraceCollector | None = None,
+ ) -> BenchmarkResult:
+     """Run a single task and return result.
+
+     Args:
+         agent: Agent to evaluate.
+         adapter: Benchmark adapter.
+         task: Task to run.
+         config: Evaluation configuration.
+         trace_collector: Optional trace collector for saving execution data.
+
+     Returns:
+         BenchmarkResult.
+     """
+     start_time = time.perf_counter()
+     history: list[tuple[BenchmarkObservation, BenchmarkAction]] = []
+
+     # Start trace collection if enabled
+     if trace_collector is not None:
+         trace_collector.start_task(task)
+
+     try:
+         # Reset agent and environment
+         agent.reset()
+         obs = adapter.reset(task)
+
+         done = False
+         steps = 0
+         max_steps = task.time_limit_steps or config.max_steps
+
+         while not done and steps < max_steps:
+             # Get action from agent
+             action = agent.act(obs, task, history if config.save_trajectories else None)
+
+             # Extract reasoning if available from PolicyAgent
+             reasoning = None
+             if hasattr(action, "raw_action") and action.raw_action:
+                 reasoning = action.raw_action.get("thought")
+
+             # Record step in trace collector
+             if trace_collector is not None:
+                 trace_collector.record_step(steps, obs, action, reasoning)
+
+             # Record step in history
+             if config.save_trajectories:
+                 history.append((obs, action))
+
+             if config.on_step:
+                 config.on_step(obs, action, steps)
+
+             # Check for terminal action
+             if action.type == "done":
+                 done = True
+                 break
+
+             # Execute action
+             obs, done, info = adapter.step(action)
+             steps += 1
+
+         # Evaluate result
+         result = adapter.evaluate(task)
+
+         # Update result with trajectory info
+         result.steps = history if config.save_trajectories else []
+         result.num_steps = steps
+         result.total_time_seconds = time.perf_counter() - start_time
+
+         # Finish trace collection if enabled
+         if trace_collector is not None:
+             trace_collector.finish_task(result)
+
+         return result
+
+     except Exception as e:
+         logger.error(f"Error running task {task.task_id}: {e}")
+         result = BenchmarkResult(
+             task_id=task.task_id,
+             success=False,
+             score=0.0,
+             steps=history if config.save_trajectories else [],
+             num_steps=len(history),
+             error=str(e),
+             total_time_seconds=time.perf_counter() - start_time,
+         )
+
+         # Finish trace collection even on error
+         if trace_collector is not None:
+             trace_collector.finish_task(result)
+
+         return result
+
+
+ def compute_metrics(results: list[BenchmarkResult]) -> dict:
+     """Compute aggregate metrics from evaluation results.
+
+     Args:
+         results: List of BenchmarkResult from evaluation.
+
+     Returns:
+         Dict with aggregate metrics.
+     """
+     if not results:
+         return {
+             "num_tasks": 0,
+             "success_rate": 0.0,
+             "avg_score": 0.0,
+             "avg_steps": 0.0,
+             "avg_time_seconds": 0.0,
+         }
+
+     num_tasks = len(results)
+     success_count = sum(1 for r in results if r.success)
+     total_score = sum(r.score for r in results)
+     total_steps = sum(r.num_steps for r in results)
+     total_time = sum(r.total_time_seconds for r in results)
+
+     return {
+         "num_tasks": num_tasks,
+         "success_rate": success_count / num_tasks,
+         "avg_score": total_score / num_tasks,
+         "avg_steps": total_steps / num_tasks,
+         "avg_time_seconds": total_time / num_tasks,
+         "success_count": success_count,
+         "fail_count": num_tasks - success_count,
+     }
+
+
+ def compute_domain_metrics(
+     results: list[BenchmarkResult], tasks: list[BenchmarkTask]
+ ) -> dict[str, dict]:
+     """Compute per-domain metrics.
+
+     Args:
+         results: List of BenchmarkResult.
+         tasks: List of BenchmarkTask (to get domain info).
+
+     Returns:
+         Dict mapping domain to metrics dict.
+     """
+     # Build task_id -> domain mapping
+     task_domains = {t.task_id: t.domain for t in tasks}
+
+     # Group results by domain
+     domain_results: dict[str, list[BenchmarkResult]] = {}
+     for result in results:
+         domain = task_domains.get(result.task_id, "unknown")
+         if domain not in domain_results:
+             domain_results[domain] = []
+         domain_results[domain].append(result)
+
+     # Compute metrics per domain
+     return {domain: compute_metrics(res) for domain, res in domain_results.items()}
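
For reference, a minimal usage sketch of the runner API shown in the hunk above (not part of the package diff). It assumes the hunk is openadapt_ml/benchmarks/runner.py, per the +381 line count in the file list; WAAAdapter and PolicyAgent imports follow the module docstring; the `policy` object, the WAA repo path, and the model_id value are hypothetical placeholders.

from openadapt_ml.benchmarks import PolicyAgent, WAAAdapter, evaluate_agent_on_benchmark
from openadapt_ml.benchmarks.runner import (
    EvaluationConfig,
    compute_domain_metrics,
    compute_metrics,
)

# Hypothetical setup: the repo path and `policy` must exist in your environment.
adapter = WAAAdapter(waa_repo_path="/path/to/WAA")
agent = PolicyAgent(policy)  # `policy` is a placeholder for a trained policy object

config = EvaluationConfig(
    max_steps=50,
    parallel=1,                  # parallel > 1 is used only if adapter.supports_parallel
    save_execution_traces=True,  # traces are written under output_dir by ExecutionTraceCollector
    model_id="my-model",         # hypothetical identifier recorded in execution traces
    output_dir="benchmark_results",
)

results = evaluate_agent_on_benchmark(agent, adapter, config=config)

# Aggregate overall and per-domain metrics from the collected results.
overall = compute_metrics(results)
per_domain = compute_domain_metrics(results, adapter.list_tasks())
print(f"Success rate: {overall['success_rate']:.1%} over {overall['num_tasks']} tasks")
for domain, metrics in per_domain.items():
    print(f"  {domain}: {metrics['success_rate']:.1%} ({metrics['num_tasks']} tasks)")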