openadapt-ml 0.1.0__py3-none-any.whl → 0.2.0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (63)
  1. openadapt_ml/benchmarks/__init__.py +8 -0
  2. openadapt_ml/benchmarks/agent.py +90 -11
  3. openadapt_ml/benchmarks/azure.py +35 -6
  4. openadapt_ml/benchmarks/cli.py +4449 -201
  5. openadapt_ml/benchmarks/live_tracker.py +180 -0
  6. openadapt_ml/benchmarks/runner.py +41 -4
  7. openadapt_ml/benchmarks/viewer.py +1219 -0
  8. openadapt_ml/benchmarks/vm_monitor.py +610 -0
  9. openadapt_ml/benchmarks/waa.py +61 -4
  10. openadapt_ml/benchmarks/waa_deploy/Dockerfile +222 -0
  11. openadapt_ml/benchmarks/waa_deploy/__init__.py +10 -0
  12. openadapt_ml/benchmarks/waa_deploy/api_agent.py +539 -0
  13. openadapt_ml/benchmarks/waa_deploy/start_waa_server.bat +53 -0
  14. openadapt_ml/benchmarks/waa_live.py +619 -0
  15. openadapt_ml/cloud/local.py +1555 -1
  16. openadapt_ml/cloud/ssh_tunnel.py +553 -0
  17. openadapt_ml/datasets/next_action.py +87 -68
  18. openadapt_ml/evals/grounding.py +26 -8
  19. openadapt_ml/evals/trajectory_matching.py +84 -36
  20. openadapt_ml/experiments/demo_prompt/__init__.py +19 -0
  21. openadapt_ml/experiments/demo_prompt/format_demo.py +226 -0
  22. openadapt_ml/experiments/demo_prompt/results/experiment_20251231_002125.json +83 -0
  23. openadapt_ml/experiments/demo_prompt/results/experiment_n30_20251231_165958.json +1100 -0
  24. openadapt_ml/experiments/demo_prompt/results/multistep_20251231_025051.json +182 -0
  25. openadapt_ml/experiments/demo_prompt/run_experiment.py +531 -0
  26. openadapt_ml/experiments/waa_demo/__init__.py +10 -0
  27. openadapt_ml/experiments/waa_demo/demos.py +357 -0
  28. openadapt_ml/experiments/waa_demo/runner.py +717 -0
  29. openadapt_ml/experiments/waa_demo/tasks.py +151 -0
  30. openadapt_ml/export/__init__.py +9 -0
  31. openadapt_ml/export/__main__.py +6 -0
  32. openadapt_ml/export/cli.py +89 -0
  33. openadapt_ml/export/parquet.py +265 -0
  34. openadapt_ml/ingest/__init__.py +3 -4
  35. openadapt_ml/ingest/capture.py +89 -81
  36. openadapt_ml/ingest/loader.py +116 -68
  37. openadapt_ml/ingest/synthetic.py +221 -159
  38. openadapt_ml/retrieval/README.md +226 -0
  39. openadapt_ml/retrieval/USAGE.md +391 -0
  40. openadapt_ml/retrieval/__init__.py +91 -0
  41. openadapt_ml/retrieval/demo_retriever.py +817 -0
  42. openadapt_ml/retrieval/embeddings.py +629 -0
  43. openadapt_ml/retrieval/index.py +194 -0
  44. openadapt_ml/retrieval/retriever.py +160 -0
  45. openadapt_ml/runtime/policy.py +10 -10
  46. openadapt_ml/schema/__init__.py +104 -0
  47. openadapt_ml/schema/converters.py +541 -0
  48. openadapt_ml/schema/episode.py +457 -0
  49. openadapt_ml/scripts/compare.py +26 -16
  50. openadapt_ml/scripts/eval_policy.py +4 -5
  51. openadapt_ml/scripts/prepare_synthetic.py +14 -17
  52. openadapt_ml/scripts/train.py +81 -70
  53. openadapt_ml/training/benchmark_viewer.py +3225 -0
  54. openadapt_ml/training/trainer.py +120 -363
  55. openadapt_ml/training/trl_trainer.py +354 -0
  56. {openadapt_ml-0.1.0.dist-info → openadapt_ml-0.2.0.dist-info}/METADATA +102 -60
  57. openadapt_ml-0.2.0.dist-info/RECORD +86 -0
  58. openadapt_ml/schemas/__init__.py +0 -53
  59. openadapt_ml/schemas/sessions.py +0 -122
  60. openadapt_ml/schemas/validation.py +0 -252
  61. openadapt_ml-0.1.0.dist-info/RECORD +0 -55
  62. {openadapt_ml-0.1.0.dist-info → openadapt_ml-0.2.0.dist-info}/WHEEL +0 -0
  63. {openadapt_ml-0.1.0.dist-info → openadapt_ml-0.2.0.dist-info}/licenses/LICENSE +0 -0
openadapt_ml/benchmarks/live_tracker.py (new file)
@@ -0,0 +1,180 @@
+"""Live evaluation progress tracker for benchmark viewer.
+
+This module provides a tracker that writes real-time evaluation progress
+to a JSON file that the viewer can poll via /api/benchmark-live.
+"""
+
+from __future__ import annotations
+
+import json
+from dataclasses import asdict, dataclass
+from pathlib import Path
+from typing import Any
+
+from openadapt_ml.benchmarks.base import (
+    BenchmarkAction,
+    BenchmarkObservation,
+    BenchmarkResult,
+    BenchmarkTask,
+)
+
+
+@dataclass
+class LiveStepData:
+    """Data for a single step in live evaluation."""
+
+    step_idx: int
+    action: dict[str, Any]
+    reasoning: str | None = None
+    screenshot_url: str | None = None
+
+
+@dataclass
+class LiveTaskData:
+    """Data for current task being evaluated."""
+
+    task_id: str
+    instruction: str
+    domain: str
+    steps: list[LiveStepData]
+    result: dict[str, Any] | None = None
+
+
+class LiveEvaluationTracker:
+    """Tracks live evaluation progress and writes to benchmark_live.json.
+
+    This class is designed to be used alongside ExecutionTraceCollector
+    to provide real-time progress updates to the viewer.
+
+    Args:
+        output_file: Path to output JSON file (default: benchmark_live.json).
+        total_tasks: Total number of tasks to evaluate.
+    """
+
+    def __init__(
+        self,
+        output_file: str | Path = "benchmark_live.json",
+        total_tasks: int = 0,
+    ):
+        self.output_file = Path(output_file)
+        self.total_tasks = total_tasks
+        self.tasks_completed = 0
+        self.current_task: LiveTaskData | None = None
+
+        # Initialize with idle state
+        self._write_state({"status": "idle"})
+
+    def start_task(self, task: BenchmarkTask) -> None:
+        """Start tracking a new task.
+
+        Args:
+            task: The benchmark task being evaluated.
+        """
+        self.current_task = LiveTaskData(
+            task_id=task.task_id,
+            instruction=task.instruction,
+            domain=task.domain or "unknown",
+            steps=[],
+            result=None,
+        )
+
+        self._write_state({
+            "status": "running",
+            "total_tasks": self.total_tasks,
+            "tasks_completed": self.tasks_completed,
+            "current_task": asdict(self.current_task),
+        })
+
+    def record_step(
+        self,
+        step_idx: int,
+        observation: BenchmarkObservation,
+        action: BenchmarkAction,
+        reasoning: str | None = None,
+    ) -> None:
+        """Record a step in the current task.
+
+        Args:
+            step_idx: Index of this step.
+            observation: Observation at this step.
+            action: Action taken at this step.
+            reasoning: Optional reasoning/thought from agent.
+        """
+        if self.current_task is None:
+            raise RuntimeError("No task started. Call start_task() first.")
+
+        # Serialize action
+        action_data = {
+            "type": action.type,
+            "x": action.x,
+            "y": action.y,
+            "target_node_id": action.target_node_id,
+            "text": action.text,
+            "key": action.key,
+        }
+
+        # Create step data
+        step = LiveStepData(
+            step_idx=step_idx,
+            action=action_data,
+            reasoning=reasoning,
+            screenshot_url=None,  # Could be populated if we serve screenshots
+        )
+
+        self.current_task.steps.append(step)
+
+        # Write updated state
+        self._write_state({
+            "status": "running",
+            "total_tasks": self.total_tasks,
+            "tasks_completed": self.tasks_completed,
+            "current_task": asdict(self.current_task),
+        })
+
+    def finish_task(self, result: BenchmarkResult) -> None:
+        """Finish tracking the current task.
+
+        Args:
+            result: The evaluation result for the task.
+        """
+        if self.current_task is None:
+            raise RuntimeError("No task started. Call start_task() first.")
+
+        # Add result to current task
+        self.current_task.result = {
+            "success": result.success,
+            "score": result.score,
+            "num_steps": result.num_steps,
+            "total_time_seconds": result.total_time_seconds,
+        }
+
+        # Increment completed count
+        self.tasks_completed += 1
+
+        # Write updated state
+        self._write_state({
+            "status": "running",
+            "total_tasks": self.total_tasks,
+            "tasks_completed": self.tasks_completed,
+            "current_task": asdict(self.current_task),
+        })
+
+        # Clear current task
+        self.current_task = None
+
+    def finish(self) -> None:
+        """Mark evaluation as complete."""
+        self._write_state({
+            "status": "complete",
+            "total_tasks": self.total_tasks,
+            "tasks_completed": self.tasks_completed,
+        })
+
+    def _write_state(self, state: dict[str, Any]) -> None:
+        """Write current state to JSON file.
+
+        Args:
+            state: State dictionary to write.
+        """
+        with open(self.output_file, "w") as f:
+            json.dump(state, f, indent=2)
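
The tracker's lifecycle is start_task → record_step → finish_task → finish, each call rewriting the JSON file so the viewer can poll it. A minimal sketch of driving it directly, assuming the 0.2.0 wheel is installed; the SimpleNamespace objects below are illustrative stand-ins for the real BenchmarkTask/BenchmarkObservation/BenchmarkAction/BenchmarkResult instances that the adapter and agent would normally supply (only the attributes the tracker reads are set):

# Illustrative stand-ins; in real use these come from the benchmark adapter and agent.
from types import SimpleNamespace

from openadapt_ml.benchmarks.live_tracker import LiveEvaluationTracker

tracker = LiveEvaluationTracker(output_file="benchmark_live.json", total_tasks=1)

task = SimpleNamespace(task_id="demo-001", instruction="Open Notepad", domain="os")
tracker.start_task(task)  # writes {"status": "running", ...}

obs = SimpleNamespace(screenshot=None)  # observation is passed through but not serialized
action = SimpleNamespace(type="click", x=120, y=240, target_node_id=None, text=None, key=None)
tracker.record_step(step_idx=0, observation=obs, action=action, reasoning="Click Start menu")

result = SimpleNamespace(success=True, score=1.0, num_steps=1, total_time_seconds=4.2)
tracker.finish_task(result)

tracker.finish()  # writes {"status": "complete", ...}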
openadapt_ml/benchmarks/runner.py
@@ -29,6 +29,7 @@ from openadapt_ml.benchmarks.base import (
     BenchmarkTask,
 )
 from openadapt_ml.benchmarks.data_collection import ExecutionTraceCollector
+from openadapt_ml.benchmarks.live_tracker import LiveEvaluationTracker
 
 logger = logging.getLogger(__name__)
 
@@ -48,6 +49,8 @@ class EvaluationConfig:
         model_id: Model identifier for execution traces.
         output_dir: Output directory for benchmark results.
         run_name: Name for this evaluation run.
+        enable_live_tracking: Whether to enable live evaluation progress tracking.
+        live_tracking_file: Path to live tracking JSON file.
     """
 
     max_steps: int = 50
@@ -60,6 +63,8 @@
     model_id: str = "unknown"
     output_dir: str = "benchmark_results"
     run_name: str | None = None
+    enable_live_tracking: bool = True
+    live_tracking_file: str = "benchmark_live.json"
 
 
 def evaluate_agent_on_benchmark(
@@ -107,16 +112,30 @@ def evaluate_agent_on_benchmark(
     if config.verbose:
         logger.info(f"Saving execution traces to: {trace_collector.run_dir}")
 
+    # Initialize live evaluation tracker if enabled
+    live_tracker = None
+    if config.enable_live_tracking:
+        live_tracker = LiveEvaluationTracker(
+            output_file=config.live_tracking_file,
+            total_tasks=len(tasks),
+        )
+        if config.verbose:
+            logger.info(f"Live tracking enabled: {config.live_tracking_file}")
+
     # Run evaluation
     if config.parallel > 1 and adapter.supports_parallel:
-        results = _evaluate_parallel(agent, adapter, tasks, config, trace_collector)
+        results = _evaluate_parallel(agent, adapter, tasks, config, trace_collector, live_tracker)
     else:
-        results = _evaluate_sequential(agent, adapter, tasks, config, trace_collector)
+        results = _evaluate_sequential(agent, adapter, tasks, config, trace_collector, live_tracker)
 
     # Save summary if trace collection is enabled
     if trace_collector is not None:
         trace_collector.save_summary(results)
 
+    # Mark live tracking as complete
+    if live_tracker is not None:
+        live_tracker.finish()
+
     # Log summary
     if config.verbose:
         success_count = sum(1 for r in results if r.success)
@@ -136,6 +155,7 @@ def _evaluate_sequential(
     tasks: list[BenchmarkTask],
     config: EvaluationConfig,
     trace_collector: ExecutionTraceCollector | None = None,
+    live_tracker: LiveEvaluationTracker | None = None,
 ) -> list[BenchmarkResult]:
     """Run evaluation sequentially.
 
@@ -145,6 +165,7 @@
         tasks: Tasks to evaluate.
         config: Evaluation configuration.
         trace_collector: Optional trace collector for saving execution data.
+        live_tracker: Optional live evaluation tracker.
 
     Returns:
         List of results.
@@ -154,7 +175,7 @@
         if config.verbose:
             logger.info(f"Task {i + 1}/{len(tasks)}: {task.task_id}")
 
-        result = _run_single_task(agent, adapter, task, config, trace_collector)
+        result = _run_single_task(agent, adapter, task, config, trace_collector, live_tracker)
         results.append(result)
 
         if config.on_task_complete:
@@ -169,6 +190,7 @@ def _evaluate_parallel(
     tasks: list[BenchmarkTask],
     config: EvaluationConfig,
    trace_collector: ExecutionTraceCollector | None = None,
+    live_tracker: LiveEvaluationTracker | None = None,
 ) -> list[BenchmarkResult]:
     """Run evaluation in parallel.
 
@@ -181,6 +203,7 @@
         tasks: Tasks to evaluate.
         config: Evaluation configuration.
         trace_collector: Optional trace collector for saving execution data.
+        live_tracker: Optional live evaluation tracker.
 
     Returns:
         List of results.
@@ -190,7 +213,7 @@
     with ThreadPoolExecutor(max_workers=config.parallel) as executor:
         # Submit all tasks
         future_to_task = {
-            executor.submit(_run_single_task, agent, adapter, task, config, trace_collector): task
+            executor.submit(_run_single_task, agent, adapter, task, config, trace_collector, live_tracker): task
             for task in tasks
         }
 
@@ -228,6 +251,7 @@ def _run_single_task(
     task: BenchmarkTask,
     config: EvaluationConfig,
     trace_collector: ExecutionTraceCollector | None = None,
+    live_tracker: LiveEvaluationTracker | None = None,
 ) -> BenchmarkResult:
     """Run a single task and return result.
 
@@ -237,6 +261,7 @@
         task: Task to run.
         config: Evaluation configuration.
         trace_collector: Optional trace collector for saving execution data.
+        live_tracker: Optional live evaluation tracker.
 
     Returns:
         BenchmarkResult.
@@ -248,6 +273,10 @@
     if trace_collector is not None:
         trace_collector.start_task(task)
 
+    # Start live tracking if enabled
+    if live_tracker is not None:
+        live_tracker.start_task(task)
+
     try:
         # Reset agent and environment
        agent.reset()
@@ -270,6 +299,10 @@
            if trace_collector is not None:
                trace_collector.record_step(steps, obs, action, reasoning)
 
+            # Record step in live tracker
+            if live_tracker is not None:
+                live_tracker.record_step(steps, obs, action, reasoning)
+
            # Record step in history
            if config.save_trajectories:
                history.append((obs, action))
@@ -298,6 +331,10 @@
        if trace_collector is not None:
            trace_collector.finish_task(result)
 
+        # Finish live tracking if enabled
+        if live_tracker is not None:
+            live_tracker.finish_task(result)
+
        return result
 
    except Exception as e:
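
With these runner changes, any consumer can follow evaluation progress by polling the live-tracking file. A minimal polling sketch (illustrative, not part of the package): the state keys mirror the _write_state() payloads in live_tracker.py, and "benchmark_live.json" is the EvaluationConfig.live_tracking_file default.

# Poll the benchmark_live.json file that LiveEvaluationTracker rewrites after every step.
import json
import time
from pathlib import Path

live_file = Path("benchmark_live.json")

while True:
    try:
        state = json.loads(live_file.read_text())
    except (FileNotFoundError, json.JSONDecodeError):
        # The tracker overwrites the file in place, so a read can briefly race a write.
        time.sleep(0.5)
        continue

    status = state.get("status", "idle")
    if status == "running":
        current = state.get("current_task") or {}
        print(
            f"{state.get('tasks_completed', 0)}/{state.get('total_tasks', 0)} tasks done; "
            f"current: {current.get('task_id')} ({len(current.get('steps', []))} steps)"
        )
    elif status == "complete":
        print("evaluation complete")
        break

    time.sleep(2)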