openadapt-ml 0.2.0__py3-none-any.whl → 0.2.1__py3-none-any.whl

This diff compares the contents of two publicly released versions of the package as they appear in their public registry, and is provided for informational purposes only.
Files changed (95)
  1. openadapt_ml/baselines/__init__.py +121 -0
  2. openadapt_ml/baselines/adapter.py +185 -0
  3. openadapt_ml/baselines/cli.py +314 -0
  4. openadapt_ml/baselines/config.py +448 -0
  5. openadapt_ml/baselines/parser.py +922 -0
  6. openadapt_ml/baselines/prompts.py +787 -0
  7. openadapt_ml/benchmarks/__init__.py +13 -115
  8. openadapt_ml/benchmarks/agent.py +265 -421
  9. openadapt_ml/benchmarks/azure.py +28 -19
  10. openadapt_ml/benchmarks/azure_ops_tracker.py +521 -0
  11. openadapt_ml/benchmarks/cli.py +1722 -4847
  12. openadapt_ml/benchmarks/trace_export.py +631 -0
  13. openadapt_ml/benchmarks/viewer.py +22 -5
  14. openadapt_ml/benchmarks/vm_monitor.py +530 -29
  15. openadapt_ml/benchmarks/waa_deploy/Dockerfile +47 -53
  16. openadapt_ml/benchmarks/waa_deploy/api_agent.py +21 -20
  17. openadapt_ml/cloud/azure_inference.py +3 -5
  18. openadapt_ml/cloud/lambda_labs.py +722 -307
  19. openadapt_ml/cloud/local.py +2038 -487
  20. openadapt_ml/cloud/ssh_tunnel.py +68 -26
  21. openadapt_ml/datasets/next_action.py +40 -30
  22. openadapt_ml/evals/grounding.py +8 -3
  23. openadapt_ml/evals/plot_eval_metrics.py +15 -13
  24. openadapt_ml/evals/trajectory_matching.py +41 -26
  25. openadapt_ml/experiments/demo_prompt/format_demo.py +16 -6
  26. openadapt_ml/experiments/demo_prompt/run_experiment.py +26 -16
  27. openadapt_ml/experiments/representation_shootout/__init__.py +70 -0
  28. openadapt_ml/experiments/representation_shootout/conditions.py +708 -0
  29. openadapt_ml/experiments/representation_shootout/config.py +390 -0
  30. openadapt_ml/experiments/representation_shootout/evaluator.py +659 -0
  31. openadapt_ml/experiments/representation_shootout/runner.py +687 -0
  32. openadapt_ml/experiments/waa_demo/runner.py +29 -14
  33. openadapt_ml/export/parquet.py +36 -24
  34. openadapt_ml/grounding/detector.py +18 -14
  35. openadapt_ml/ingest/__init__.py +8 -6
  36. openadapt_ml/ingest/capture.py +25 -22
  37. openadapt_ml/ingest/loader.py +7 -4
  38. openadapt_ml/ingest/synthetic.py +189 -100
  39. openadapt_ml/models/api_adapter.py +14 -4
  40. openadapt_ml/models/base_adapter.py +10 -2
  41. openadapt_ml/models/providers/__init__.py +288 -0
  42. openadapt_ml/models/providers/anthropic.py +266 -0
  43. openadapt_ml/models/providers/base.py +299 -0
  44. openadapt_ml/models/providers/google.py +376 -0
  45. openadapt_ml/models/providers/openai.py +342 -0
  46. openadapt_ml/models/qwen_vl.py +46 -19
  47. openadapt_ml/perception/__init__.py +35 -0
  48. openadapt_ml/perception/integration.py +399 -0
  49. openadapt_ml/retrieval/demo_retriever.py +50 -24
  50. openadapt_ml/retrieval/embeddings.py +9 -8
  51. openadapt_ml/retrieval/retriever.py +3 -1
  52. openadapt_ml/runtime/__init__.py +50 -0
  53. openadapt_ml/runtime/policy.py +18 -5
  54. openadapt_ml/runtime/safety_gate.py +471 -0
  55. openadapt_ml/schema/__init__.py +9 -0
  56. openadapt_ml/schema/converters.py +74 -27
  57. openadapt_ml/schema/episode.py +31 -18
  58. openadapt_ml/scripts/capture_screenshots.py +530 -0
  59. openadapt_ml/scripts/compare.py +85 -54
  60. openadapt_ml/scripts/demo_policy.py +4 -1
  61. openadapt_ml/scripts/eval_policy.py +15 -9
  62. openadapt_ml/scripts/make_gif.py +1 -1
  63. openadapt_ml/scripts/prepare_synthetic.py +3 -1
  64. openadapt_ml/scripts/train.py +21 -9
  65. openadapt_ml/segmentation/README.md +920 -0
  66. openadapt_ml/segmentation/__init__.py +97 -0
  67. openadapt_ml/segmentation/adapters/__init__.py +5 -0
  68. openadapt_ml/segmentation/adapters/capture_adapter.py +420 -0
  69. openadapt_ml/segmentation/annotator.py +610 -0
  70. openadapt_ml/segmentation/cache.py +290 -0
  71. openadapt_ml/segmentation/cli.py +674 -0
  72. openadapt_ml/segmentation/deduplicator.py +656 -0
  73. openadapt_ml/segmentation/frame_describer.py +788 -0
  74. openadapt_ml/segmentation/pipeline.py +340 -0
  75. openadapt_ml/segmentation/schemas.py +622 -0
  76. openadapt_ml/segmentation/segment_extractor.py +634 -0
  77. openadapt_ml/training/azure_ops_viewer.py +1097 -0
  78. openadapt_ml/training/benchmark_viewer.py +52 -41
  79. openadapt_ml/training/shared_ui.py +7 -7
  80. openadapt_ml/training/stub_provider.py +57 -35
  81. openadapt_ml/training/trainer.py +143 -86
  82. openadapt_ml/training/trl_trainer.py +70 -21
  83. openadapt_ml/training/viewer.py +323 -108
  84. openadapt_ml/training/viewer_components.py +180 -0
  85. {openadapt_ml-0.2.0.dist-info → openadapt_ml-0.2.1.dist-info}/METADATA +215 -14
  86. openadapt_ml-0.2.1.dist-info/RECORD +116 -0
  87. openadapt_ml/benchmarks/base.py +0 -366
  88. openadapt_ml/benchmarks/data_collection.py +0 -432
  89. openadapt_ml/benchmarks/live_tracker.py +0 -180
  90. openadapt_ml/benchmarks/runner.py +0 -418
  91. openadapt_ml/benchmarks/waa.py +0 -761
  92. openadapt_ml/benchmarks/waa_live.py +0 -619
  93. openadapt_ml-0.2.0.dist-info/RECORD +0 -86
  94. {openadapt_ml-0.2.0.dist-info → openadapt_ml-0.2.1.dist-info}/WHEEL +0 -0
  95. {openadapt_ml-0.2.0.dist-info → openadapt_ml-0.2.1.dist-info}/licenses/LICENSE +0 -0
@@ -1,418 +0,0 @@
- """Evaluation runner for benchmarks.
-
- This module provides functions to run agents on benchmarks and collect results.
-
- Example:
-     from openadapt_ml.benchmarks import WAAAdapter, PolicyAgent, evaluate_agent_on_benchmark
-
-     adapter = WAAAdapter(waa_repo_path="/path/to/WAA")
-     agent = PolicyAgent(policy)
-     results = evaluate_agent_on_benchmark(agent, adapter, max_steps=50)
-
-     print(f"Success rate: {sum(r.success for r in results) / len(results):.1%}")
- """
-
- from __future__ import annotations
-
- import logging
- import time
- from concurrent.futures import ThreadPoolExecutor, as_completed
- from dataclasses import dataclass
- from typing import Callable
-
- from openadapt_ml.benchmarks.agent import BenchmarkAgent
- from openadapt_ml.benchmarks.base import (
-     BenchmarkAdapter,
-     BenchmarkAction,
-     BenchmarkObservation,
-     BenchmarkResult,
-     BenchmarkTask,
- )
- from openadapt_ml.benchmarks.data_collection import ExecutionTraceCollector
- from openadapt_ml.benchmarks.live_tracker import LiveEvaluationTracker
-
- logger = logging.getLogger(__name__)
-
-
- @dataclass
- class EvaluationConfig:
-     """Configuration for benchmark evaluation.
-
-     Attributes:
-         max_steps: Maximum steps per task.
-         parallel: Number of parallel workers (if supported).
-         save_trajectories: Whether to save full trajectories in results.
-         verbose: Whether to print progress.
-         on_step: Optional callback called after each step.
-         on_task_complete: Optional callback called after each task.
-         save_execution_traces: Whether to save execution traces for viewer.
-         model_id: Model identifier for execution traces.
-         output_dir: Output directory for benchmark results.
-         run_name: Name for this evaluation run.
-         enable_live_tracking: Whether to enable live evaluation progress tracking.
-         live_tracking_file: Path to live tracking JSON file.
-     """
-
-     max_steps: int = 50
-     parallel: int = 1
-     save_trajectories: bool = True
-     verbose: bool = True
-     on_step: Callable[[BenchmarkObservation, BenchmarkAction, int], None] | None = None
-     on_task_complete: Callable[[BenchmarkResult], None] | None = None
-     save_execution_traces: bool = True
-     model_id: str = "unknown"
-     output_dir: str = "benchmark_results"
-     run_name: str | None = None
-     enable_live_tracking: bool = True
-     live_tracking_file: str = "benchmark_live.json"
-
-
- def evaluate_agent_on_benchmark(
-     agent: BenchmarkAgent,
-     adapter: BenchmarkAdapter,
-     task_ids: list[str] | None = None,
-     max_steps: int = 50,
-     parallel: int = 1,
-     config: EvaluationConfig | None = None,
- ) -> list[BenchmarkResult]:
-     """Run agent on benchmark tasks and collect results.
-
-     Args:
-         agent: Agent to evaluate.
-         adapter: Benchmark adapter.
-         task_ids: Specific tasks to run (None = all tasks).
-         max_steps: Maximum steps per task (overridden by config if provided).
-         parallel: Number of parallel workers (overridden by config if provided).
-         config: Full evaluation configuration.
-
-     Returns:
-         List of BenchmarkResult for each task.
-     """
-     if config is None:
-         config = EvaluationConfig(max_steps=max_steps, parallel=parallel)
-
-     # Load tasks
-     if task_ids is not None:
-         tasks = [adapter.load_task(tid) for tid in task_ids]
-     else:
-         tasks = adapter.list_tasks()
-
-     if config.verbose:
-         logger.info(f"Evaluating {len(tasks)} tasks on {adapter.name}")
-
-     # Initialize execution trace collector if enabled
-     trace_collector = None
-     if config.save_execution_traces:
-         trace_collector = ExecutionTraceCollector(
-             benchmark_name=adapter.name,
-             run_name=config.run_name,
-             model_id=config.model_id,
-             output_dir=config.output_dir,
-         )
-         if config.verbose:
-             logger.info(f"Saving execution traces to: {trace_collector.run_dir}")
-
-     # Initialize live evaluation tracker if enabled
-     live_tracker = None
-     if config.enable_live_tracking:
-         live_tracker = LiveEvaluationTracker(
-             output_file=config.live_tracking_file,
-             total_tasks=len(tasks),
-         )
-         if config.verbose:
-             logger.info(f"Live tracking enabled: {config.live_tracking_file}")
-
-     # Run evaluation
-     if config.parallel > 1 and adapter.supports_parallel:
-         results = _evaluate_parallel(agent, adapter, tasks, config, trace_collector, live_tracker)
-     else:
-         results = _evaluate_sequential(agent, adapter, tasks, config, trace_collector, live_tracker)
-
-     # Save summary if trace collection is enabled
-     if trace_collector is not None:
-         trace_collector.save_summary(results)
-
-     # Mark live tracking as complete
-     if live_tracker is not None:
-         live_tracker.finish()
-
-     # Log summary
-     if config.verbose:
-         success_count = sum(1 for r in results if r.success)
-         success_rate = success_count / len(results) if results else 0
-         avg_steps = sum(r.num_steps for r in results) / len(results) if results else 0
-         logger.info(
-             f"Evaluation complete: {success_count}/{len(results)} "
-             f"({success_rate:.1%}) success, {avg_steps:.1f} avg steps"
-         )
-
-     return results
-
-
- def _evaluate_sequential(
-     agent: BenchmarkAgent,
-     adapter: BenchmarkAdapter,
-     tasks: list[BenchmarkTask],
-     config: EvaluationConfig,
-     trace_collector: ExecutionTraceCollector | None = None,
-     live_tracker: LiveEvaluationTracker | None = None,
- ) -> list[BenchmarkResult]:
-     """Run evaluation sequentially.
-
-     Args:
-         agent: Agent to evaluate.
-         adapter: Benchmark adapter.
-         tasks: Tasks to evaluate.
-         config: Evaluation configuration.
-         trace_collector: Optional trace collector for saving execution data.
-         live_tracker: Optional live evaluation tracker.
-
-     Returns:
-         List of results.
-     """
-     results = []
-     for i, task in enumerate(tasks):
-         if config.verbose:
-             logger.info(f"Task {i + 1}/{len(tasks)}: {task.task_id}")
-
-         result = _run_single_task(agent, adapter, task, config, trace_collector, live_tracker)
-         results.append(result)
-
-         if config.on_task_complete:
-             config.on_task_complete(result)
-
-     return results
-
-
- def _evaluate_parallel(
-     agent: BenchmarkAgent,
-     adapter: BenchmarkAdapter,
-     tasks: list[BenchmarkTask],
-     config: EvaluationConfig,
-     trace_collector: ExecutionTraceCollector | None = None,
-     live_tracker: LiveEvaluationTracker | None = None,
- ) -> list[BenchmarkResult]:
-     """Run evaluation in parallel.
-
-     Note: This requires the adapter to support parallel execution
-     (e.g., via multiple VM instances).
-
-     Args:
-         agent: Agent to evaluate.
-         adapter: Benchmark adapter.
-         tasks: Tasks to evaluate.
-         config: Evaluation configuration.
-         trace_collector: Optional trace collector for saving execution data.
-         live_tracker: Optional live evaluation tracker.
-
-     Returns:
-         List of results.
-     """
-     results = []
-
-     with ThreadPoolExecutor(max_workers=config.parallel) as executor:
-         # Submit all tasks
-         future_to_task = {
-             executor.submit(_run_single_task, agent, adapter, task, config, trace_collector, live_tracker): task
-             for task in tasks
-         }
-
-         # Collect results as they complete
-         for future in as_completed(future_to_task):
-             task = future_to_task[future]
-             try:
-                 result = future.result()
-                 results.append(result)
-
-                 if config.on_task_complete:
-                     config.on_task_complete(result)
-
-                 if config.verbose:
-                     status = "SUCCESS" if result.success else "FAIL"
-                     logger.info(f"Task {task.task_id}: {status}")
-
-             except Exception as e:
-                 logger.error(f"Task {task.task_id} failed with error: {e}")
-                 results.append(
-                     BenchmarkResult(
-                         task_id=task.task_id,
-                         success=False,
-                         score=0.0,
-                         error=str(e),
-                     )
-                 )
-
-     return results
-
-
- def _run_single_task(
-     agent: BenchmarkAgent,
-     adapter: BenchmarkAdapter,
-     task: BenchmarkTask,
-     config: EvaluationConfig,
-     trace_collector: ExecutionTraceCollector | None = None,
-     live_tracker: LiveEvaluationTracker | None = None,
- ) -> BenchmarkResult:
-     """Run a single task and return result.
-
-     Args:
-         agent: Agent to evaluate.
-         adapter: Benchmark adapter.
-         task: Task to run.
-         config: Evaluation configuration.
-         trace_collector: Optional trace collector for saving execution data.
-         live_tracker: Optional live evaluation tracker.
-
-     Returns:
-         BenchmarkResult.
-     """
-     start_time = time.perf_counter()
-     history: list[tuple[BenchmarkObservation, BenchmarkAction]] = []
-
-     # Start trace collection if enabled
-     if trace_collector is not None:
-         trace_collector.start_task(task)
-
-     # Start live tracking if enabled
-     if live_tracker is not None:
-         live_tracker.start_task(task)
-
-     try:
-         # Reset agent and environment
-         agent.reset()
-         obs = adapter.reset(task)
-
-         done = False
-         steps = 0
-         max_steps = task.time_limit_steps or config.max_steps
-
-         while not done and steps < max_steps:
-             # Get action from agent
-             action = agent.act(obs, task, history if config.save_trajectories else None)
-
-             # Extract reasoning if available from PolicyAgent
-             reasoning = None
-             if hasattr(action, "raw_action") and action.raw_action:
-                 reasoning = action.raw_action.get("thought")
-
-             # Record step in trace collector
-             if trace_collector is not None:
-                 trace_collector.record_step(steps, obs, action, reasoning)
-
-             # Record step in live tracker
-             if live_tracker is not None:
-                 live_tracker.record_step(steps, obs, action, reasoning)
-
-             # Record step in history
-             if config.save_trajectories:
-                 history.append((obs, action))
-
-             if config.on_step:
-                 config.on_step(obs, action, steps)
-
-             # Check for terminal action
-             if action.type == "done":
-                 done = True
-                 break
-
-             # Execute action
-             obs, done, info = adapter.step(action)
-             steps += 1
-
-         # Evaluate result
-         result = adapter.evaluate(task)
-
-         # Update result with trajectory info
-         result.steps = history if config.save_trajectories else []
-         result.num_steps = steps
-         result.total_time_seconds = time.perf_counter() - start_time
-
-         # Finish trace collection if enabled
-         if trace_collector is not None:
-             trace_collector.finish_task(result)
-
-         # Finish live tracking if enabled
-         if live_tracker is not None:
-             live_tracker.finish_task(result)
-
-         return result
-
-     except Exception as e:
-         logger.error(f"Error running task {task.task_id}: {e}")
-         result = BenchmarkResult(
-             task_id=task.task_id,
-             success=False,
-             score=0.0,
-             steps=history if config.save_trajectories else [],
-             num_steps=len(history),
-             error=str(e),
-             total_time_seconds=time.perf_counter() - start_time,
-         )
-
-         # Finish trace collection even on error
-         if trace_collector is not None:
-             trace_collector.finish_task(result)
-
-         return result
-
-
- def compute_metrics(results: list[BenchmarkResult]) -> dict:
-     """Compute aggregate metrics from evaluation results.
-
-     Args:
-         results: List of BenchmarkResult from evaluation.
-
-     Returns:
-         Dict with aggregate metrics.
-     """
-     if not results:
-         return {
-             "num_tasks": 0,
-             "success_rate": 0.0,
-             "avg_score": 0.0,
-             "avg_steps": 0.0,
-             "avg_time_seconds": 0.0,
-         }
-
-     num_tasks = len(results)
-     success_count = sum(1 for r in results if r.success)
-     total_score = sum(r.score for r in results)
-     total_steps = sum(r.num_steps for r in results)
-     total_time = sum(r.total_time_seconds for r in results)
-
-     return {
-         "num_tasks": num_tasks,
-         "success_rate": success_count / num_tasks,
-         "avg_score": total_score / num_tasks,
-         "avg_steps": total_steps / num_tasks,
-         "avg_time_seconds": total_time / num_tasks,
-         "success_count": success_count,
-         "fail_count": num_tasks - success_count,
-     }
-
-
- def compute_domain_metrics(
-     results: list[BenchmarkResult], tasks: list[BenchmarkTask]
- ) -> dict[str, dict]:
-     """Compute per-domain metrics.
-
-     Args:
-         results: List of BenchmarkResult.
-         tasks: List of BenchmarkTask (to get domain info).
-
-     Returns:
-         Dict mapping domain to metrics dict.
-     """
-     # Build task_id -> domain mapping
-     task_domains = {t.task_id: t.domain for t in tasks}
-
-     # Group results by domain
-     domain_results: dict[str, list[BenchmarkResult]] = {}
-     for result in results:
-         domain = task_domains.get(result.task_id, "unknown")
-         if domain not in domain_results:
-             domain_results[domain] = []
-         domain_results[domain].append(result)
-
-     # Compute metrics per domain
-     return {domain: compute_metrics(res) for domain, res in domain_results.items()}