openadapt-ml 0.1.0__py3-none-any.whl
- openadapt_ml/__init__.py +0 -0
- openadapt_ml/benchmarks/__init__.py +125 -0
- openadapt_ml/benchmarks/agent.py +825 -0
- openadapt_ml/benchmarks/azure.py +761 -0
- openadapt_ml/benchmarks/base.py +366 -0
- openadapt_ml/benchmarks/cli.py +884 -0
- openadapt_ml/benchmarks/data_collection.py +432 -0
- openadapt_ml/benchmarks/runner.py +381 -0
- openadapt_ml/benchmarks/waa.py +704 -0
- openadapt_ml/cloud/__init__.py +5 -0
- openadapt_ml/cloud/azure_inference.py +441 -0
- openadapt_ml/cloud/lambda_labs.py +2445 -0
- openadapt_ml/cloud/local.py +790 -0
- openadapt_ml/config.py +56 -0
- openadapt_ml/datasets/__init__.py +0 -0
- openadapt_ml/datasets/next_action.py +507 -0
- openadapt_ml/evals/__init__.py +23 -0
- openadapt_ml/evals/grounding.py +241 -0
- openadapt_ml/evals/plot_eval_metrics.py +174 -0
- openadapt_ml/evals/trajectory_matching.py +486 -0
- openadapt_ml/grounding/__init__.py +45 -0
- openadapt_ml/grounding/base.py +236 -0
- openadapt_ml/grounding/detector.py +570 -0
- openadapt_ml/ingest/__init__.py +43 -0
- openadapt_ml/ingest/capture.py +312 -0
- openadapt_ml/ingest/loader.py +232 -0
- openadapt_ml/ingest/synthetic.py +1102 -0
- openadapt_ml/models/__init__.py +0 -0
- openadapt_ml/models/api_adapter.py +171 -0
- openadapt_ml/models/base_adapter.py +59 -0
- openadapt_ml/models/dummy_adapter.py +42 -0
- openadapt_ml/models/qwen_vl.py +426 -0
- openadapt_ml/runtime/__init__.py +0 -0
- openadapt_ml/runtime/policy.py +182 -0
- openadapt_ml/schemas/__init__.py +53 -0
- openadapt_ml/schemas/sessions.py +122 -0
- openadapt_ml/schemas/validation.py +252 -0
- openadapt_ml/scripts/__init__.py +0 -0
- openadapt_ml/scripts/compare.py +1490 -0
- openadapt_ml/scripts/demo_policy.py +62 -0
- openadapt_ml/scripts/eval_policy.py +287 -0
- openadapt_ml/scripts/make_gif.py +153 -0
- openadapt_ml/scripts/prepare_synthetic.py +43 -0
- openadapt_ml/scripts/run_qwen_login_benchmark.py +192 -0
- openadapt_ml/scripts/train.py +174 -0
- openadapt_ml/training/__init__.py +0 -0
- openadapt_ml/training/benchmark_viewer.py +1538 -0
- openadapt_ml/training/shared_ui.py +157 -0
- openadapt_ml/training/stub_provider.py +276 -0
- openadapt_ml/training/trainer.py +2446 -0
- openadapt_ml/training/viewer.py +2970 -0
- openadapt_ml-0.1.0.dist-info/METADATA +818 -0
- openadapt_ml-0.1.0.dist-info/RECORD +55 -0
- openadapt_ml-0.1.0.dist-info/WHEEL +4 -0
- openadapt_ml-0.1.0.dist-info/licenses/LICENSE +21 -0
openadapt_ml/benchmarks/runner.py

@@ -0,0 +1,381 @@
"""Evaluation runner for benchmarks.

This module provides functions to run agents on benchmarks and collect results.

Example:
    from openadapt_ml.benchmarks import WAAAdapter, PolicyAgent, evaluate_agent_on_benchmark

    adapter = WAAAdapter(waa_repo_path="/path/to/WAA")
    agent = PolicyAgent(policy)
    results = evaluate_agent_on_benchmark(agent, adapter, max_steps=50)

    print(f"Success rate: {sum(r.success for r in results) / len(results):.1%}")
"""

from __future__ import annotations

import logging
import time
from concurrent.futures import ThreadPoolExecutor, as_completed
from dataclasses import dataclass
from typing import Callable

from openadapt_ml.benchmarks.agent import BenchmarkAgent
from openadapt_ml.benchmarks.base import (
    BenchmarkAdapter,
    BenchmarkAction,
    BenchmarkObservation,
    BenchmarkResult,
    BenchmarkTask,
)
from openadapt_ml.benchmarks.data_collection import ExecutionTraceCollector

logger = logging.getLogger(__name__)


@dataclass
class EvaluationConfig:
    """Configuration for benchmark evaluation.

    Attributes:
        max_steps: Maximum steps per task.
        parallel: Number of parallel workers (if supported).
        save_trajectories: Whether to save full trajectories in results.
        verbose: Whether to print progress.
        on_step: Optional callback called after each step.
        on_task_complete: Optional callback called after each task.
        save_execution_traces: Whether to save execution traces for the viewer.
        model_id: Model identifier for execution traces.
        output_dir: Output directory for benchmark results.
        run_name: Name for this evaluation run.
    """

    max_steps: int = 50
    parallel: int = 1
    save_trajectories: bool = True
    verbose: bool = True
    on_step: Callable[[BenchmarkObservation, BenchmarkAction, int], None] | None = None
    on_task_complete: Callable[[BenchmarkResult], None] | None = None
    save_execution_traces: bool = True
    model_id: str = "unknown"
    output_dir: str = "benchmark_results"
    run_name: str | None = None
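
# Usage sketch (not part of the packaged file): a minimal way to wire the optional
# callbacks into EvaluationConfig. The callback signatures follow the Callable
# annotations above; the model_id and run_name values are placeholders.
def _log_step(obs: BenchmarkObservation, action: BenchmarkAction, step: int) -> None:
    # Called by the runner after each agent action.
    logger.info(f"step {step}: {action.type}")

def _log_task(result: BenchmarkResult) -> None:
    # Called once per task with the evaluated result.
    logger.info(f"{result.task_id}: success={result.success}, steps={result.num_steps}")

def _example_config() -> EvaluationConfig:
    return EvaluationConfig(
        max_steps=30,
        model_id="example-model",   # placeholder identifier recorded in traces
        run_name="smoke-test",      # placeholder run name
        on_step=_log_step,
        on_task_complete=_log_task,
    )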


def evaluate_agent_on_benchmark(
    agent: BenchmarkAgent,
    adapter: BenchmarkAdapter,
    task_ids: list[str] | None = None,
    max_steps: int = 50,
    parallel: int = 1,
    config: EvaluationConfig | None = None,
) -> list[BenchmarkResult]:
    """Run agent on benchmark tasks and collect results.

    Args:
        agent: Agent to evaluate.
        adapter: Benchmark adapter.
        task_ids: Specific tasks to run (None = all tasks).
        max_steps: Maximum steps per task (overridden by config if provided).
        parallel: Number of parallel workers (overridden by config if provided).
        config: Full evaluation configuration.

    Returns:
        List of BenchmarkResult for each task.
    """
    if config is None:
        config = EvaluationConfig(max_steps=max_steps, parallel=parallel)

    # Load tasks
    if task_ids is not None:
        tasks = [adapter.load_task(tid) for tid in task_ids]
    else:
        tasks = adapter.list_tasks()

    if config.verbose:
        logger.info(f"Evaluating {len(tasks)} tasks on {adapter.name}")

    # Initialize execution trace collector if enabled
    trace_collector = None
    if config.save_execution_traces:
        trace_collector = ExecutionTraceCollector(
            benchmark_name=adapter.name,
            run_name=config.run_name,
            model_id=config.model_id,
            output_dir=config.output_dir,
        )
        if config.verbose:
            logger.info(f"Saving execution traces to: {trace_collector.run_dir}")

    # Run evaluation
    if config.parallel > 1 and adapter.supports_parallel:
        results = _evaluate_parallel(agent, adapter, tasks, config, trace_collector)
    else:
        results = _evaluate_sequential(agent, adapter, tasks, config, trace_collector)

    # Save summary if trace collection is enabled
    if trace_collector is not None:
        trace_collector.save_summary(results)

    # Log summary
    if config.verbose:
        success_count = sum(1 for r in results if r.success)
        success_rate = success_count / len(results) if results else 0
        avg_steps = sum(r.num_steps for r in results) / len(results) if results else 0
        logger.info(
            f"Evaluation complete: {success_count}/{len(results)} "
            f"({success_rate:.1%}) success, {avg_steps:.1f} avg steps"
        )

    return results
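

# Usage sketch (not part of the packaged file): evaluating a subset of tasks with an
# explicit config, using the classes named in the module docstring. Assumes a trained
# `policy` object compatible with PolicyAgent; the task ids are placeholders.
def _example_run(policy) -> dict:
    from openadapt_ml.benchmarks import PolicyAgent, WAAAdapter

    adapter = WAAAdapter(waa_repo_path="/path/to/WAA")
    agent = PolicyAgent(policy)
    results = evaluate_agent_on_benchmark(
        agent,
        adapter,
        task_ids=["example_task_1", "example_task_2"],  # placeholder task ids
        config=EvaluationConfig(max_steps=25, save_execution_traces=False),
    )
    # compute_metrics is defined later in this module.
    return compute_metrics(results)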


def _evaluate_sequential(
    agent: BenchmarkAgent,
    adapter: BenchmarkAdapter,
    tasks: list[BenchmarkTask],
    config: EvaluationConfig,
    trace_collector: ExecutionTraceCollector | None = None,
) -> list[BenchmarkResult]:
    """Run evaluation sequentially.

    Args:
        agent: Agent to evaluate.
        adapter: Benchmark adapter.
        tasks: Tasks to evaluate.
        config: Evaluation configuration.
        trace_collector: Optional trace collector for saving execution data.

    Returns:
        List of results.
    """
    results = []
    for i, task in enumerate(tasks):
        if config.verbose:
            logger.info(f"Task {i + 1}/{len(tasks)}: {task.task_id}")

        result = _run_single_task(agent, adapter, task, config, trace_collector)
        results.append(result)

        if config.on_task_complete:
            config.on_task_complete(result)

    return results


def _evaluate_parallel(
    agent: BenchmarkAgent,
    adapter: BenchmarkAdapter,
    tasks: list[BenchmarkTask],
    config: EvaluationConfig,
    trace_collector: ExecutionTraceCollector | None = None,
) -> list[BenchmarkResult]:
    """Run evaluation in parallel.

    Note: This requires the adapter to support parallel execution
    (e.g., via multiple VM instances).

    Args:
        agent: Agent to evaluate.
        adapter: Benchmark adapter.
        tasks: Tasks to evaluate.
        config: Evaluation configuration.
        trace_collector: Optional trace collector for saving execution data.

    Returns:
        List of results.
    """
    results = []

    with ThreadPoolExecutor(max_workers=config.parallel) as executor:
        # Submit all tasks
        future_to_task = {
            executor.submit(_run_single_task, agent, adapter, task, config, trace_collector): task
            for task in tasks
        }

        # Collect results as they complete
        for future in as_completed(future_to_task):
            task = future_to_task[future]
            try:
                result = future.result()
                results.append(result)

                if config.on_task_complete:
                    config.on_task_complete(result)

                if config.verbose:
                    status = "SUCCESS" if result.success else "FAIL"
                    logger.info(f"Task {task.task_id}: {status}")

            except Exception as e:
                logger.error(f"Task {task.task_id} failed with error: {e}")
                results.append(
                    BenchmarkResult(
                        task_id=task.task_id,
                        success=False,
                        score=0.0,
                        error=str(e),
                    )
                )

    return results


def _run_single_task(
    agent: BenchmarkAgent,
    adapter: BenchmarkAdapter,
    task: BenchmarkTask,
    config: EvaluationConfig,
    trace_collector: ExecutionTraceCollector | None = None,
) -> BenchmarkResult:
    """Run a single task and return result.

    Args:
        agent: Agent to evaluate.
        adapter: Benchmark adapter.
        task: Task to run.
        config: Evaluation configuration.
        trace_collector: Optional trace collector for saving execution data.

    Returns:
        BenchmarkResult.
    """
    start_time = time.perf_counter()
    history: list[tuple[BenchmarkObservation, BenchmarkAction]] = []

    # Start trace collection if enabled
    if trace_collector is not None:
        trace_collector.start_task(task)

    try:
        # Reset agent and environment
        agent.reset()
        obs = adapter.reset(task)

        done = False
        steps = 0
        max_steps = task.time_limit_steps or config.max_steps

        while not done and steps < max_steps:
            # Get action from agent
            action = agent.act(obs, task, history if config.save_trajectories else None)

            # Extract reasoning if available from PolicyAgent
            reasoning = None
            if hasattr(action, "raw_action") and action.raw_action:
                reasoning = action.raw_action.get("thought")

            # Record step in trace collector
            if trace_collector is not None:
                trace_collector.record_step(steps, obs, action, reasoning)

            # Record step in history
            if config.save_trajectories:
                history.append((obs, action))

            if config.on_step:
                config.on_step(obs, action, steps)

            # Check for terminal action
            if action.type == "done":
                done = True
                break

            # Execute action
            obs, done, info = adapter.step(action)
            steps += 1

        # Evaluate result
        result = adapter.evaluate(task)

        # Update result with trajectory info
        result.steps = history if config.save_trajectories else []
        result.num_steps = steps
        result.total_time_seconds = time.perf_counter() - start_time

        # Finish trace collection if enabled
        if trace_collector is not None:
            trace_collector.finish_task(result)

        return result

    except Exception as e:
        logger.error(f"Error running task {task.task_id}: {e}")
        result = BenchmarkResult(
            task_id=task.task_id,
            success=False,
            score=0.0,
            steps=history if config.save_trajectories else [],
            num_steps=len(history),
            error=str(e),
            total_time_seconds=time.perf_counter() - start_time,
        )

        # Finish trace collection even on error
        if trace_collector is not None:
            trace_collector.finish_task(result)

        return result
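

# Interface sketch (not part of the packaged file): the loop above relies only on the
# agent/adapter calls shown here, so a minimal hand-rolled episode looks like this.
def _manual_episode(
    agent: BenchmarkAgent,
    adapter: BenchmarkAdapter,
    task: BenchmarkTask,
    max_steps: int = 10,
) -> BenchmarkResult:
    agent.reset()
    obs = adapter.reset(task)           # initial observation for the task
    done = False
    steps = 0
    while not done and steps < max_steps:
        action = agent.act(obs, task, None)
        if action.type == "done":       # terminal action ends the episode early
            break
        obs, done, _info = adapter.step(action)
        steps += 1
    return adapter.evaluate(task)       # benchmark-defined success/score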


def compute_metrics(results: list[BenchmarkResult]) -> dict:
    """Compute aggregate metrics from evaluation results.

    Args:
        results: List of BenchmarkResult from evaluation.

    Returns:
        Dict with aggregate metrics.
    """
    if not results:
        return {
            "num_tasks": 0,
            "success_rate": 0.0,
            "avg_score": 0.0,
            "avg_steps": 0.0,
            "avg_time_seconds": 0.0,
        }

    num_tasks = len(results)
    success_count = sum(1 for r in results if r.success)
    total_score = sum(r.score for r in results)
    total_steps = sum(r.num_steps for r in results)
    total_time = sum(r.total_time_seconds for r in results)

    return {
        "num_tasks": num_tasks,
        "success_rate": success_count / num_tasks,
        "avg_score": total_score / num_tasks,
        "avg_steps": total_steps / num_tasks,
        "avg_time_seconds": total_time / num_tasks,
        "success_count": success_count,
        "fail_count": num_tasks - success_count,
    }
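

# Usage sketch (not part of the packaged file): summarising a run. Only keys that
# compute_metrics returns in both the empty and non-empty cases are used here.
def _print_summary(results: list[BenchmarkResult]) -> None:
    m = compute_metrics(results)
    print(
        f"{m['num_tasks']} tasks, {m['success_rate']:.1%} success, "
        f"{m['avg_steps']:.1f} avg steps, {m['avg_time_seconds']:.1f}s avg time"
    )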


def compute_domain_metrics(
    results: list[BenchmarkResult], tasks: list[BenchmarkTask]
) -> dict[str, dict]:
    """Compute per-domain metrics.

    Args:
        results: List of BenchmarkResult.
        tasks: List of BenchmarkTask (to get domain info).

    Returns:
        Dict mapping domain to metrics dict.
    """
    # Build task_id -> domain mapping
    task_domains = {t.task_id: t.domain for t in tasks}

    # Group results by domain
    domain_results: dict[str, list[BenchmarkResult]] = {}
    for result in results:
        domain = task_domains.get(result.task_id, "unknown")
        if domain not in domain_results:
            domain_results[domain] = []
        domain_results[domain].append(result)

    # Compute metrics per domain
    return {domain: compute_metrics(res) for domain, res in domain_results.items()}
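

# Usage sketch (not part of the packaged file): per-domain breakdown. `tasks` must be
# the same BenchmarkTask objects that produced `results`, since domains are looked up
# by task_id.
def _print_domain_summary(results: list[BenchmarkResult], tasks: list[BenchmarkTask]) -> None:
    for domain, metrics in compute_domain_metrics(results, tasks).items():
        print(f"{domain}: {metrics['success_rate']:.1%} success over {metrics['num_tasks']} tasks")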