openadapt-ml 0.1.0-py3-none-any.whl → 0.2.0-py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- openadapt_ml/benchmarks/__init__.py +8 -0
- openadapt_ml/benchmarks/agent.py +90 -11
- openadapt_ml/benchmarks/azure.py +35 -6
- openadapt_ml/benchmarks/cli.py +4449 -201
- openadapt_ml/benchmarks/live_tracker.py +180 -0
- openadapt_ml/benchmarks/runner.py +41 -4
- openadapt_ml/benchmarks/viewer.py +1219 -0
- openadapt_ml/benchmarks/vm_monitor.py +610 -0
- openadapt_ml/benchmarks/waa.py +61 -4
- openadapt_ml/benchmarks/waa_deploy/Dockerfile +222 -0
- openadapt_ml/benchmarks/waa_deploy/__init__.py +10 -0
- openadapt_ml/benchmarks/waa_deploy/api_agent.py +539 -0
- openadapt_ml/benchmarks/waa_deploy/start_waa_server.bat +53 -0
- openadapt_ml/benchmarks/waa_live.py +619 -0
- openadapt_ml/cloud/local.py +1555 -1
- openadapt_ml/cloud/ssh_tunnel.py +553 -0
- openadapt_ml/datasets/next_action.py +87 -68
- openadapt_ml/evals/grounding.py +26 -8
- openadapt_ml/evals/trajectory_matching.py +84 -36
- openadapt_ml/experiments/demo_prompt/__init__.py +19 -0
- openadapt_ml/experiments/demo_prompt/format_demo.py +226 -0
- openadapt_ml/experiments/demo_prompt/results/experiment_20251231_002125.json +83 -0
- openadapt_ml/experiments/demo_prompt/results/experiment_n30_20251231_165958.json +1100 -0
- openadapt_ml/experiments/demo_prompt/results/multistep_20251231_025051.json +182 -0
- openadapt_ml/experiments/demo_prompt/run_experiment.py +531 -0
- openadapt_ml/experiments/waa_demo/__init__.py +10 -0
- openadapt_ml/experiments/waa_demo/demos.py +357 -0
- openadapt_ml/experiments/waa_demo/runner.py +717 -0
- openadapt_ml/experiments/waa_demo/tasks.py +151 -0
- openadapt_ml/export/__init__.py +9 -0
- openadapt_ml/export/__main__.py +6 -0
- openadapt_ml/export/cli.py +89 -0
- openadapt_ml/export/parquet.py +265 -0
- openadapt_ml/ingest/__init__.py +3 -4
- openadapt_ml/ingest/capture.py +89 -81
- openadapt_ml/ingest/loader.py +116 -68
- openadapt_ml/ingest/synthetic.py +221 -159
- openadapt_ml/retrieval/README.md +226 -0
- openadapt_ml/retrieval/USAGE.md +391 -0
- openadapt_ml/retrieval/__init__.py +91 -0
- openadapt_ml/retrieval/demo_retriever.py +817 -0
- openadapt_ml/retrieval/embeddings.py +629 -0
- openadapt_ml/retrieval/index.py +194 -0
- openadapt_ml/retrieval/retriever.py +160 -0
- openadapt_ml/runtime/policy.py +10 -10
- openadapt_ml/schema/__init__.py +104 -0
- openadapt_ml/schema/converters.py +541 -0
- openadapt_ml/schema/episode.py +457 -0
- openadapt_ml/scripts/compare.py +26 -16
- openadapt_ml/scripts/eval_policy.py +4 -5
- openadapt_ml/scripts/prepare_synthetic.py +14 -17
- openadapt_ml/scripts/train.py +81 -70
- openadapt_ml/training/benchmark_viewer.py +3225 -0
- openadapt_ml/training/trainer.py +120 -363
- openadapt_ml/training/trl_trainer.py +354 -0
- {openadapt_ml-0.1.0.dist-info → openadapt_ml-0.2.0.dist-info}/METADATA +102 -60
- openadapt_ml-0.2.0.dist-info/RECORD +86 -0
- openadapt_ml/schemas/__init__.py +0 -53
- openadapt_ml/schemas/sessions.py +0 -122
- openadapt_ml/schemas/validation.py +0 -252
- openadapt_ml-0.1.0.dist-info/RECORD +0 -55
- {openadapt_ml-0.1.0.dist-info → openadapt_ml-0.2.0.dist-info}/WHEEL +0 -0
- {openadapt_ml-0.1.0.dist-info → openadapt_ml-0.2.0.dist-info}/licenses/LICENSE +0 -0
openadapt_ml/benchmarks/live_tracker.py
@@ -0,0 +1,180 @@
+"""Live evaluation progress tracker for benchmark viewer.
+
+This module provides a tracker that writes real-time evaluation progress
+to a JSON file that the viewer can poll via /api/benchmark-live.
+"""
+
+from __future__ import annotations
+
+import json
+from dataclasses import asdict, dataclass
+from pathlib import Path
+from typing import Any
+
+from openadapt_ml.benchmarks.base import (
+    BenchmarkAction,
+    BenchmarkObservation,
+    BenchmarkResult,
+    BenchmarkTask,
+)
+
+
+@dataclass
+class LiveStepData:
+    """Data for a single step in live evaluation."""
+
+    step_idx: int
+    action: dict[str, Any]
+    reasoning: str | None = None
+    screenshot_url: str | None = None
+
+
+@dataclass
+class LiveTaskData:
+    """Data for current task being evaluated."""
+
+    task_id: str
+    instruction: str
+    domain: str
+    steps: list[LiveStepData]
+    result: dict[str, Any] | None = None
+
+
+class LiveEvaluationTracker:
+    """Tracks live evaluation progress and writes to benchmark_live.json.
+
+    This class is designed to be used alongside ExecutionTraceCollector
+    to provide real-time progress updates to the viewer.
+
+    Args:
+        output_file: Path to output JSON file (default: benchmark_live.json).
+        total_tasks: Total number of tasks to evaluate.
+    """
+
+    def __init__(
+        self,
+        output_file: str | Path = "benchmark_live.json",
+        total_tasks: int = 0,
+    ):
+        self.output_file = Path(output_file)
+        self.total_tasks = total_tasks
+        self.tasks_completed = 0
+        self.current_task: LiveTaskData | None = None
+
+        # Initialize with idle state
+        self._write_state({"status": "idle"})
+
+    def start_task(self, task: BenchmarkTask) -> None:
+        """Start tracking a new task.
+
+        Args:
+            task: The benchmark task being evaluated.
+        """
+        self.current_task = LiveTaskData(
+            task_id=task.task_id,
+            instruction=task.instruction,
+            domain=task.domain or "unknown",
+            steps=[],
+            result=None,
+        )
+
+        self._write_state({
+            "status": "running",
+            "total_tasks": self.total_tasks,
+            "tasks_completed": self.tasks_completed,
+            "current_task": asdict(self.current_task),
+        })
+
+    def record_step(
+        self,
+        step_idx: int,
+        observation: BenchmarkObservation,
+        action: BenchmarkAction,
+        reasoning: str | None = None,
+    ) -> None:
+        """Record a step in the current task.
+
+        Args:
+            step_idx: Index of this step.
+            observation: Observation at this step.
+            action: Action taken at this step.
+            reasoning: Optional reasoning/thought from agent.
+        """
+        if self.current_task is None:
+            raise RuntimeError("No task started. Call start_task() first.")
+
+        # Serialize action
+        action_data = {
+            "type": action.type,
+            "x": action.x,
+            "y": action.y,
+            "target_node_id": action.target_node_id,
+            "text": action.text,
+            "key": action.key,
+        }
+
+        # Create step data
+        step = LiveStepData(
+            step_idx=step_idx,
+            action=action_data,
+            reasoning=reasoning,
+            screenshot_url=None,  # Could be populated if we serve screenshots
+        )
+
+        self.current_task.steps.append(step)
+
+        # Write updated state
+        self._write_state({
+            "status": "running",
+            "total_tasks": self.total_tasks,
+            "tasks_completed": self.tasks_completed,
+            "current_task": asdict(self.current_task),
+        })
+
+    def finish_task(self, result: BenchmarkResult) -> None:
+        """Finish tracking the current task.
+
+        Args:
+            result: The evaluation result for the task.
+        """
+        if self.current_task is None:
+            raise RuntimeError("No task started. Call start_task() first.")
+
+        # Add result to current task
+        self.current_task.result = {
+            "success": result.success,
+            "score": result.score,
+            "num_steps": result.num_steps,
+            "total_time_seconds": result.total_time_seconds,
+        }
+
+        # Increment completed count
+        self.tasks_completed += 1
+
+        # Write updated state
+        self._write_state({
+            "status": "running",
+            "total_tasks": self.total_tasks,
+            "tasks_completed": self.tasks_completed,
+            "current_task": asdict(self.current_task),
+        })
+
+        # Clear current task
+        self.current_task = None
+
+    def finish(self) -> None:
+        """Mark evaluation as complete."""
+        self._write_state({
+            "status": "complete",
+            "total_tasks": self.total_tasks,
+            "tasks_completed": self.tasks_completed,
+        })
+
+    def _write_state(self, state: dict[str, Any]) -> None:
+        """Write current state to JSON file.
+
+        Args:
+            state: State dictionary to write.
+        """
+        with open(self.output_file, "w") as f:
+            json.dump(state, f, indent=2)
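
The tracker exposes a small lifecycle: start_task(), one record_step() per agent action, finish_task(), and a final finish(). Below is a minimal usage sketch of driving it directly, using SimpleNamespace stand-ins for the BenchmarkTask, BenchmarkAction, and BenchmarkResult objects (the tracker only reads the attributes shown; all values are illustrative, and in normal use the runner wires the tracker in automatically, as in the next diff).

# Usage sketch for LiveEvaluationTracker (illustrative values, not from the package).
from types import SimpleNamespace

from openadapt_ml.benchmarks.live_tracker import LiveEvaluationTracker

tracker = LiveEvaluationTracker(output_file="benchmark_live.json", total_tasks=1)

# Stand-ins for the real dataclasses; only these attributes are read by the tracker.
task = SimpleNamespace(task_id="demo-task", instruction="Save the open document", domain="office")
action = SimpleNamespace(type="click", x=120, y=340, target_node_id=None, text=None, key=None)
result = SimpleNamespace(success=True, score=1.0, num_steps=1, total_time_seconds=4.2)

tracker.start_task(task)       # writes status "running" with an empty step list
tracker.record_step(0, observation=None, action=action, reasoning="Click Save")  # observation is accepted but not serialized
tracker.finish_task(result)    # attaches the result and increments tasks_completed
tracker.finish()               # final write: status "complete"
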
openadapt_ml/benchmarks/runner.py
@@ -29,6 +29,7 @@ from openadapt_ml.benchmarks.base import (
     BenchmarkTask,
 )
 from openadapt_ml.benchmarks.data_collection import ExecutionTraceCollector
+from openadapt_ml.benchmarks.live_tracker import LiveEvaluationTracker
 
 logger = logging.getLogger(__name__)
 
@@ -48,6 +49,8 @@ class EvaluationConfig:
         model_id: Model identifier for execution traces.
         output_dir: Output directory for benchmark results.
         run_name: Name for this evaluation run.
+        enable_live_tracking: Whether to enable live evaluation progress tracking.
+        live_tracking_file: Path to live tracking JSON file.
     """
 
     max_steps: int = 50
@@ -60,6 +63,8 @@ class EvaluationConfig:
     model_id: str = "unknown"
     output_dir: str = "benchmark_results"
     run_name: str | None = None
+    enable_live_tracking: bool = True
+    live_tracking_file: str = "benchmark_live.json"
 
 
 def evaluate_agent_on_benchmark(
@@ -107,16 +112,30 @@ def evaluate_agent_on_benchmark(
     if config.verbose:
         logger.info(f"Saving execution traces to: {trace_collector.run_dir}")
 
+    # Initialize live evaluation tracker if enabled
+    live_tracker = None
+    if config.enable_live_tracking:
+        live_tracker = LiveEvaluationTracker(
+            output_file=config.live_tracking_file,
+            total_tasks=len(tasks),
+        )
+        if config.verbose:
+            logger.info(f"Live tracking enabled: {config.live_tracking_file}")
+
     # Run evaluation
     if config.parallel > 1 and adapter.supports_parallel:
-        results = _evaluate_parallel(agent, adapter, tasks, config, trace_collector)
+        results = _evaluate_parallel(agent, adapter, tasks, config, trace_collector, live_tracker)
     else:
-        results = _evaluate_sequential(agent, adapter, tasks, config, trace_collector)
+        results = _evaluate_sequential(agent, adapter, tasks, config, trace_collector, live_tracker)
 
     # Save summary if trace collection is enabled
     if trace_collector is not None:
         trace_collector.save_summary(results)
 
+    # Mark live tracking as complete
+    if live_tracker is not None:
+        live_tracker.finish()
+
     # Log summary
     if config.verbose:
         success_count = sum(1 for r in results if r.success)
@@ -136,6 +155,7 @@ def _evaluate_sequential(
     tasks: list[BenchmarkTask],
     config: EvaluationConfig,
     trace_collector: ExecutionTraceCollector | None = None,
+    live_tracker: LiveEvaluationTracker | None = None,
 ) -> list[BenchmarkResult]:
     """Run evaluation sequentially.
 
@@ -145,6 +165,7 @@ def _evaluate_sequential(
         tasks: Tasks to evaluate.
         config: Evaluation configuration.
         trace_collector: Optional trace collector for saving execution data.
+        live_tracker: Optional live evaluation tracker.
 
     Returns:
         List of results.
@@ -154,7 +175,7 @@ def _evaluate_sequential(
         if config.verbose:
             logger.info(f"Task {i + 1}/{len(tasks)}: {task.task_id}")
 
-        result = _run_single_task(agent, adapter, task, config, trace_collector)
+        result = _run_single_task(agent, adapter, task, config, trace_collector, live_tracker)
         results.append(result)
 
         if config.on_task_complete:
@@ -169,6 +190,7 @@ def _evaluate_parallel(
     tasks: list[BenchmarkTask],
     config: EvaluationConfig,
     trace_collector: ExecutionTraceCollector | None = None,
+    live_tracker: LiveEvaluationTracker | None = None,
 ) -> list[BenchmarkResult]:
     """Run evaluation in parallel.
 
@@ -181,6 +203,7 @@ def _evaluate_parallel(
         tasks: Tasks to evaluate.
        config: Evaluation configuration.
         trace_collector: Optional trace collector for saving execution data.
+        live_tracker: Optional live evaluation tracker.
 
     Returns:
         List of results.
@@ -190,7 +213,7 @@ def _evaluate_parallel(
     with ThreadPoolExecutor(max_workers=config.parallel) as executor:
         # Submit all tasks
         future_to_task = {
-            executor.submit(_run_single_task, agent, adapter, task, config, trace_collector): task
+            executor.submit(_run_single_task, agent, adapter, task, config, trace_collector, live_tracker): task
             for task in tasks
         }
 
@@ -228,6 +251,7 @@ def _run_single_task(
     task: BenchmarkTask,
     config: EvaluationConfig,
     trace_collector: ExecutionTraceCollector | None = None,
+    live_tracker: LiveEvaluationTracker | None = None,
 ) -> BenchmarkResult:
     """Run a single task and return result.
 
@@ -237,6 +261,7 @@ def _run_single_task(
         task: Task to run.
         config: Evaluation configuration.
         trace_collector: Optional trace collector for saving execution data.
+        live_tracker: Optional live evaluation tracker.
 
     Returns:
         BenchmarkResult.
@@ -248,6 +273,10 @@ def _run_single_task(
     if trace_collector is not None:
         trace_collector.start_task(task)
 
+    # Start live tracking if enabled
+    if live_tracker is not None:
+        live_tracker.start_task(task)
+
     try:
         # Reset agent and environment
        agent.reset()
@@ -270,6 +299,10 @@ def _run_single_task(
             if trace_collector is not None:
                 trace_collector.record_step(steps, obs, action, reasoning)
 
+            # Record step in live tracker
+            if live_tracker is not None:
+                live_tracker.record_step(steps, obs, action, reasoning)
+
             # Record step in history
             if config.save_trajectories:
                 history.append((obs, action))
@@ -298,6 +331,10 @@ def _run_single_task(
         if trace_collector is not None:
             trace_collector.finish_task(result)
 
+        # Finish live tracking if enabled
+        if live_tracker is not None:
+            live_tracker.finish_task(result)
+
         return result
 
     except Exception as e: