openadapt-ml 0.2.0-py3-none-any.whl → 0.2.2-py3-none-any.whl

This diff compares the contents of two publicly released package versions as they appear in their respective public registries, and is provided for informational purposes only.
Files changed (95)
  1. openadapt_ml/baselines/__init__.py +121 -0
  2. openadapt_ml/baselines/adapter.py +185 -0
  3. openadapt_ml/baselines/cli.py +314 -0
  4. openadapt_ml/baselines/config.py +448 -0
  5. openadapt_ml/baselines/parser.py +922 -0
  6. openadapt_ml/baselines/prompts.py +787 -0
  7. openadapt_ml/benchmarks/__init__.py +13 -115
  8. openadapt_ml/benchmarks/agent.py +265 -421
  9. openadapt_ml/benchmarks/azure.py +28 -19
  10. openadapt_ml/benchmarks/azure_ops_tracker.py +521 -0
  11. openadapt_ml/benchmarks/cli.py +1722 -4847
  12. openadapt_ml/benchmarks/trace_export.py +631 -0
  13. openadapt_ml/benchmarks/viewer.py +22 -5
  14. openadapt_ml/benchmarks/vm_monitor.py +530 -29
  15. openadapt_ml/benchmarks/waa_deploy/Dockerfile +47 -53
  16. openadapt_ml/benchmarks/waa_deploy/api_agent.py +21 -20
  17. openadapt_ml/cloud/azure_inference.py +3 -5
  18. openadapt_ml/cloud/lambda_labs.py +722 -307
  19. openadapt_ml/cloud/local.py +2038 -487
  20. openadapt_ml/cloud/ssh_tunnel.py +68 -26
  21. openadapt_ml/datasets/next_action.py +40 -30
  22. openadapt_ml/evals/grounding.py +8 -3
  23. openadapt_ml/evals/plot_eval_metrics.py +15 -13
  24. openadapt_ml/evals/trajectory_matching.py +41 -26
  25. openadapt_ml/experiments/demo_prompt/format_demo.py +16 -6
  26. openadapt_ml/experiments/demo_prompt/run_experiment.py +26 -16
  27. openadapt_ml/experiments/representation_shootout/__init__.py +70 -0
  28. openadapt_ml/experiments/representation_shootout/conditions.py +708 -0
  29. openadapt_ml/experiments/representation_shootout/config.py +390 -0
  30. openadapt_ml/experiments/representation_shootout/evaluator.py +659 -0
  31. openadapt_ml/experiments/representation_shootout/runner.py +687 -0
  32. openadapt_ml/experiments/waa_demo/runner.py +29 -14
  33. openadapt_ml/export/parquet.py +36 -24
  34. openadapt_ml/grounding/detector.py +18 -14
  35. openadapt_ml/ingest/__init__.py +8 -6
  36. openadapt_ml/ingest/capture.py +25 -22
  37. openadapt_ml/ingest/loader.py +7 -4
  38. openadapt_ml/ingest/synthetic.py +189 -100
  39. openadapt_ml/models/api_adapter.py +14 -4
  40. openadapt_ml/models/base_adapter.py +10 -2
  41. openadapt_ml/models/providers/__init__.py +288 -0
  42. openadapt_ml/models/providers/anthropic.py +266 -0
  43. openadapt_ml/models/providers/base.py +299 -0
  44. openadapt_ml/models/providers/google.py +376 -0
  45. openadapt_ml/models/providers/openai.py +342 -0
  46. openadapt_ml/models/qwen_vl.py +46 -19
  47. openadapt_ml/perception/__init__.py +35 -0
  48. openadapt_ml/perception/integration.py +399 -0
  49. openadapt_ml/retrieval/demo_retriever.py +50 -24
  50. openadapt_ml/retrieval/embeddings.py +9 -8
  51. openadapt_ml/retrieval/retriever.py +3 -1
  52. openadapt_ml/runtime/__init__.py +50 -0
  53. openadapt_ml/runtime/policy.py +18 -5
  54. openadapt_ml/runtime/safety_gate.py +471 -0
  55. openadapt_ml/schema/__init__.py +9 -0
  56. openadapt_ml/schema/converters.py +74 -27
  57. openadapt_ml/schema/episode.py +31 -18
  58. openadapt_ml/scripts/capture_screenshots.py +530 -0
  59. openadapt_ml/scripts/compare.py +85 -54
  60. openadapt_ml/scripts/demo_policy.py +4 -1
  61. openadapt_ml/scripts/eval_policy.py +15 -9
  62. openadapt_ml/scripts/make_gif.py +1 -1
  63. openadapt_ml/scripts/prepare_synthetic.py +3 -1
  64. openadapt_ml/scripts/train.py +21 -9
  65. openadapt_ml/segmentation/README.md +920 -0
  66. openadapt_ml/segmentation/__init__.py +97 -0
  67. openadapt_ml/segmentation/adapters/__init__.py +5 -0
  68. openadapt_ml/segmentation/adapters/capture_adapter.py +420 -0
  69. openadapt_ml/segmentation/annotator.py +610 -0
  70. openadapt_ml/segmentation/cache.py +290 -0
  71. openadapt_ml/segmentation/cli.py +674 -0
  72. openadapt_ml/segmentation/deduplicator.py +656 -0
  73. openadapt_ml/segmentation/frame_describer.py +788 -0
  74. openadapt_ml/segmentation/pipeline.py +340 -0
  75. openadapt_ml/segmentation/schemas.py +622 -0
  76. openadapt_ml/segmentation/segment_extractor.py +634 -0
  77. openadapt_ml/training/azure_ops_viewer.py +1097 -0
  78. openadapt_ml/training/benchmark_viewer.py +52 -41
  79. openadapt_ml/training/shared_ui.py +7 -7
  80. openadapt_ml/training/stub_provider.py +57 -35
  81. openadapt_ml/training/trainer.py +143 -86
  82. openadapt_ml/training/trl_trainer.py +70 -21
  83. openadapt_ml/training/viewer.py +323 -108
  84. openadapt_ml/training/viewer_components.py +180 -0
  85. {openadapt_ml-0.2.0.dist-info → openadapt_ml-0.2.2.dist-info}/METADATA +215 -14
  86. openadapt_ml-0.2.2.dist-info/RECORD +116 -0
  87. openadapt_ml/benchmarks/base.py +0 -366
  88. openadapt_ml/benchmarks/data_collection.py +0 -432
  89. openadapt_ml/benchmarks/live_tracker.py +0 -180
  90. openadapt_ml/benchmarks/runner.py +0 -418
  91. openadapt_ml/benchmarks/waa.py +0 -761
  92. openadapt_ml/benchmarks/waa_live.py +0 -619
  93. openadapt_ml-0.2.0.dist-info/RECORD +0 -86
  94. {openadapt_ml-0.2.0.dist-info → openadapt_ml-0.2.2.dist-info}/WHEEL +0 -0
  95. {openadapt_ml-0.2.0.dist-info → openadapt_ml-0.2.2.dist-info}/licenses/LICENSE +0 -0
openadapt_ml/benchmarks/data_collection.py (removed)
@@ -1,432 +0,0 @@
-"""Data collection for benchmark viewer integration.
-
-This module handles saving execution traces during benchmark runs for later
-replay in the benchmark viewer. It creates a structured directory layout with
-screenshots, metadata, and execution traces.
-
-Directory structure:
-    benchmark_results/
-    ├── waa_eval_YYYYMMDD_HHMMSS/
-    │   ├── metadata.json
-    │   ├── tasks/
-    │   │   ├── task_001/
-    │   │   │   ├── task.json
-    │   │   │   ├── screenshots/
-    │   │   │   │   ├── step_000.png
-    │   │   │   │   ├── step_001.png
-    │   │   │   │   └── ...
-    │   │   │   └── execution.json
-    │   │   └── task_002/
-    │   │       └── ...
-    │   └── summary.json
-
-Example:
-    from openadapt_ml.benchmarks.data_collection import ExecutionTraceCollector
-
-    collector = ExecutionTraceCollector(
-        benchmark_name="waa",
-        run_name="waa_eval_20241214",
-        model_id="qwen3vl-2b-epoch5"
-    )
-
-    # During evaluation
-    collector.start_task(task)
-    for step_idx, (obs, action) in enumerate(trajectory):
-        collector.record_step(step_idx, obs, action, reasoning="...")
-    collector.finish_task(result)
-"""
-
-from __future__ import annotations
-
-import json
-import logging
-from dataclasses import asdict, dataclass
-from datetime import datetime
-from pathlib import Path
-from typing import Any
-
-from openadapt_ml.benchmarks.base import (
-    BenchmarkAction,
-    BenchmarkObservation,
-    BenchmarkResult,
-    BenchmarkTask,
-)
-
-logger = logging.getLogger(__name__)
-
-
-@dataclass
-class ExecutionStep:
-    """Single step in execution trace.
-
-    Attributes:
-        step_idx: Step index in the trajectory.
-        screenshot_path: Relative path to screenshot image.
-        action: Action taken at this step.
-        reasoning: Optional reasoning/thought from the agent.
-        timestamp: Timestamp when step was recorded.
-    """
-
-    step_idx: int
-    screenshot_path: str | None
-    action: dict[str, Any]  # Serialized BenchmarkAction
-    reasoning: str | None = None
-    timestamp: float | None = None
-
-
-class ExecutionTraceCollector:
-    """Collects execution traces during benchmark runs.
-
-    This class handles:
-    - Creating the directory structure for a benchmark run
-    - Saving screenshots at each step
-    - Recording actions and reasoning
-    - Saving task results and metadata
-
-    Args:
-        benchmark_name: Name of the benchmark (e.g., "waa", "webarena").
-        run_name: Unique name for this evaluation run (e.g., "waa_eval_20241214").
-        model_id: Identifier for the model being evaluated.
-        output_dir: Base directory for benchmark results (default: "./benchmark_results").
-    """
-
-    def __init__(
-        self,
-        benchmark_name: str,
-        run_name: str | None = None,
-        model_id: str = "unknown",
-        output_dir: str | Path = "benchmark_results",
-    ):
-        self.benchmark_name = benchmark_name
-        self.model_id = model_id
-
-        # Auto-generate run_name if not provided
-        if run_name is None:
-            timestamp = datetime.now().strftime("%Y%m%d_%H%M%S")
-            run_name = f"{benchmark_name}_eval_{timestamp}"
-        self.run_name = run_name
-
-        # Set up directory structure
-        self.output_dir = Path(output_dir)
-        self.run_dir = self.output_dir / run_name
-        self.tasks_dir = self.run_dir / "tasks"
-
-        # Current task tracking
-        self._current_task: BenchmarkTask | None = None
-        self._current_task_dir: Path | None = None
-        self._current_screenshots_dir: Path | None = None
-        self._current_steps: list[ExecutionStep] = []
-
-        # Initialize run
-        self._initialize_run()
-
-    def _initialize_run(self) -> None:
-        """Initialize the benchmark run directory and metadata."""
-        self.run_dir.mkdir(parents=True, exist_ok=True)
-        self.tasks_dir.mkdir(exist_ok=True)
-
-        # Save run metadata
-        metadata = {
-            "benchmark_name": self.benchmark_name,
-            "run_name": self.run_name,
-            "model_id": self.model_id,
-            "created_at": datetime.now().isoformat(),
-        }
-
-        metadata_path = self.run_dir / "metadata.json"
-        with open(metadata_path, "w") as f:
-            json.dump(metadata, f, indent=2)
-
-        logger.info(f"Initialized benchmark run at: {self.run_dir}")
-
-    def start_task(self, task: BenchmarkTask) -> None:
-        """Start collecting data for a new task.
-
-        Args:
-            task: The benchmark task being executed.
-        """
-        if self._current_task is not None:
-            logger.warning(
-                f"Starting new task {task.task_id} without finishing {self._current_task.task_id}"
-            )
-
-        self._current_task = task
-        self._current_steps = []
-
-        # Create task directory
-        task_dir_name = self._sanitize_task_id(task.task_id)
-        self._current_task_dir = self.tasks_dir / task_dir_name
-        self._current_task_dir.mkdir(parents=True, exist_ok=True)
-
-        # Create screenshots directory
-        self._current_screenshots_dir = self._current_task_dir / "screenshots"
-        self._current_screenshots_dir.mkdir(exist_ok=True)
-
-        # Save task definition
-        task_data = {
-            "task_id": task.task_id,
-            "instruction": task.instruction,
-            "domain": task.domain,
-            "initial_state_ref": task.initial_state_ref,
-            "time_limit_steps": task.time_limit_steps,
-            "raw_config": task.raw_config,
-            "evaluation_spec": task.evaluation_spec,
-        }
-
-        task_path = self._current_task_dir / "task.json"
-        with open(task_path, "w") as f:
-            json.dump(task_data, f, indent=2)
-
-        logger.info(f"Started collecting data for task: {task.task_id}")
-
-    def record_step(
-        self,
-        step_idx: int,
-        observation: BenchmarkObservation,
-        action: BenchmarkAction,
-        reasoning: str | None = None,
-    ) -> None:
-        """Record a single step in the execution trace.
-
-        Args:
-            step_idx: Index of this step in the trajectory.
-            observation: Observation at this step.
-            action: Action taken at this step.
-            reasoning: Optional reasoning/thought from the agent.
-        """
-        if self._current_task is None:
-            raise RuntimeError("No task started. Call start_task() first.")
-
-        # Save screenshot if available
-        screenshot_path = None
-        if observation.screenshot is not None:
-            screenshot_path = self._save_screenshot(step_idx, observation.screenshot)
-        elif observation.screenshot_path is not None:
-            # Copy existing screenshot
-            screenshot_path = self._copy_screenshot(step_idx, observation.screenshot_path)
-
-        # Create execution step record
-        step = ExecutionStep(
-            step_idx=step_idx,
-            screenshot_path=screenshot_path,
-            action=self._serialize_action(action),
-            reasoning=reasoning,
-            timestamp=datetime.now().timestamp(),
-        )
-
-        self._current_steps.append(step)
-
-    def finish_task(self, result: BenchmarkResult) -> None:
-        """Finish collecting data for the current task and save execution trace.
-
-        Args:
-            result: The evaluation result for the task.
-        """
-        if self._current_task is None:
-            raise RuntimeError("No task started. Call start_task() first.")
-
-        # Save execution trace
-        execution_data = {
-            "task_id": result.task_id,
-            "model_id": self.model_id,
-            "success": result.success,
-            "score": result.score,
-            "num_steps": result.num_steps,
-            "total_time_seconds": result.total_time_seconds,
-            "error": result.error,
-            "reason": result.reason,
-            "steps": [asdict(step) for step in self._current_steps],
-        }
-
-        execution_path = self._current_task_dir / "execution.json"
-        with open(execution_path, "w") as f:
-            json.dump(execution_data, f, indent=2)
-
-        logger.info(
-            f"Saved execution trace for task {result.task_id}: "
-            f"{'SUCCESS' if result.success else 'FAIL'} ({result.num_steps} steps)"
-        )
-
-        # Clear current task
-        self._current_task = None
-        self._current_task_dir = None
-        self._current_screenshots_dir = None
-        self._current_steps = []
-
-    def save_summary(self, all_results: list[BenchmarkResult]) -> None:
-        """Save summary of all task results.
-
-        Args:
-            all_results: List of all BenchmarkResult objects from the run.
-        """
-        summary = {
-            "benchmark_name": self.benchmark_name,
-            "run_name": self.run_name,
-            "model_id": self.model_id,
-            "num_tasks": len(all_results),
-            "num_success": sum(1 for r in all_results if r.success),
-            "success_rate": sum(1 for r in all_results if r.success) / len(all_results) if all_results else 0.0,
-            "avg_score": sum(r.score for r in all_results) / len(all_results) if all_results else 0.0,
-            "avg_steps": sum(r.num_steps for r in all_results) / len(all_results) if all_results else 0.0,
-            "avg_time_seconds": sum(r.total_time_seconds for r in all_results) / len(all_results) if all_results else 0.0,
-            "tasks": [
-                {
-                    "task_id": r.task_id,
-                    "success": r.success,
-                    "score": r.score,
-                    "num_steps": r.num_steps,
-                    "error": r.error,
-                }
-                for r in all_results
-            ],
-        }
-
-        summary_path = self.run_dir / "summary.json"
-        with open(summary_path, "w") as f:
-            json.dump(summary, f, indent=2)
-
-        logger.info(
-            f"Saved summary: {summary['num_success']}/{summary['num_tasks']} tasks succeeded "
-            f"({summary['success_rate']:.1%})"
-        )
-
-    def _save_screenshot(self, step_idx: int, screenshot_bytes: bytes) -> str:
-        """Save screenshot bytes to file.
-
-        Args:
-            step_idx: Step index for naming the file.
-            screenshot_bytes: PNG image bytes.
-
-        Returns:
-            Relative path to the saved screenshot.
-        """
-        if self._current_screenshots_dir is None:
-            raise RuntimeError("No task started")
-
-        filename = f"step_{step_idx:03d}.png"
-        screenshot_path = self._current_screenshots_dir / filename
-
-        with open(screenshot_path, "wb") as f:
-            f.write(screenshot_bytes)
-
-        # Return relative path from task directory
-        return f"screenshots/{filename}"
-
-    def _copy_screenshot(self, step_idx: int, source_path: str) -> str:
-        """Copy screenshot from existing path.
-
-        Args:
-            step_idx: Step index for naming the file.
-            source_path: Path to existing screenshot.
-
-        Returns:
-            Relative path to the copied screenshot.
-        """
-        if self._current_screenshots_dir is None:
-            raise RuntimeError("No task started")
-
-        filename = f"step_{step_idx:03d}.png"
-        dest_path = self._current_screenshots_dir / filename
-
-        # Copy file
-        import shutil
-        shutil.copy2(source_path, dest_path)
-
-        return f"screenshots/{filename}"
-
-    def _serialize_action(self, action: BenchmarkAction) -> dict[str, Any]:
-        """Serialize BenchmarkAction to dict.
-
-        Args:
-            action: Action to serialize.
-
-        Returns:
-            Dictionary representation of the action.
-        """
-        return {
-            "type": action.type,
-            "x": action.x,
-            "y": action.y,
-            "target_node_id": action.target_node_id,
-            "target_bbox": action.target_bbox,
-            "target_role": action.target_role,
-            "target_name": action.target_name,
-            "text": action.text,
-            "key": action.key,
-            "modifiers": action.modifiers,
-            "scroll_direction": action.scroll_direction,
-            "scroll_amount": action.scroll_amount,
-            "end_x": action.end_x,
-            "end_y": action.end_y,
-            "answer": action.answer,
-            "raw_action": action.raw_action,
-        }
-
-    def _sanitize_task_id(self, task_id: str) -> str:
-        """Sanitize task ID for use as directory name.
-
-        Args:
-            task_id: Original task ID.
-
-        Returns:
-            Sanitized task ID safe for filesystem.
-        """
-        # Replace unsafe characters with underscores
-        safe_id = "".join(c if c.isalnum() or c in "-_" else "_" for c in task_id)
-        return safe_id
-
-
-def save_execution_trace(
-    task: BenchmarkTask,
-    result: BenchmarkResult,
-    trajectory: list[tuple[BenchmarkObservation, BenchmarkAction]],
-    benchmark_name: str,
-    model_id: str = "unknown",
-    output_dir: str | Path = "benchmark_results",
-    run_name: str | None = None,
-    reasoning_map: dict[int, str] | None = None,
-) -> Path:
-    """Convenience function to save a complete execution trace.
-
-    This is a simpler alternative to using ExecutionTraceCollector directly
-    when you have the complete trajectory available.
-
-    Args:
-        task: The benchmark task.
-        result: The evaluation result.
-        trajectory: List of (observation, action) pairs.
-        benchmark_name: Name of the benchmark.
-        model_id: Identifier for the model.
-        output_dir: Base directory for results.
-        run_name: Optional run name (auto-generated if None).
-        reasoning_map: Optional map of step_idx -> reasoning text.
-
-    Returns:
-        Path to the task directory.
-
-    Example:
-        save_execution_trace(
-            task=task,
-            result=result,
-            trajectory=trajectory,
-            benchmark_name="waa",
-            model_id="qwen3vl-2b-epoch5",
-            reasoning_map={0: "I should click the button", 1: "Now type the text"}
-        )
-    """
-    collector = ExecutionTraceCollector(
-        benchmark_name=benchmark_name,
-        run_name=run_name,
-        model_id=model_id,
-        output_dir=output_dir,
-    )
-
-    collector.start_task(task)
-
-    for step_idx, (obs, action) in enumerate(trajectory):
-        reasoning = reasoning_map.get(step_idx) if reasoning_map else None
-        collector.record_step(step_idx, obs, action, reasoning)
-
-    collector.finish_task(result)
-
-    return collector._current_task_dir or collector.tasks_dir
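The deleted module above documents the on-disk layout it writes: run-level metadata.json and summary.json, plus per-task task.json, execution.json, and a screenshots/ directory. As a rough illustration of consuming that layout, here is a minimal read-back sketch in Python; it assumes a run directory produced by the 0.2.0 collector, and the directory name shown is only an example of the auto-generated "<benchmark_name>_eval_<YYYYMMDD_HHMMSS>" pattern.

import json
from pathlib import Path

# Example run directory; real names follow the collector's auto-generated pattern.
run_dir = Path("benchmark_results/waa_eval_20241214_120000")

# Run-level summary written by ExecutionTraceCollector.save_summary().
summary = json.loads((run_dir / "summary.json").read_text())
print(f"{summary['num_success']}/{summary['num_tasks']} tasks succeeded ({summary['success_rate']:.1%})")

# Per-task execution traces written by finish_task().
for task_dir in sorted((run_dir / "tasks").iterdir()):
    execution = json.loads((task_dir / "execution.json").read_text())
    status = "SUCCESS" if execution["success"] else "FAIL"
    print(f"{execution['task_id']}: {status} in {execution['num_steps']} steps")
    for step in execution["steps"]:
        # screenshot_path is stored relative to the task directory, e.g. "screenshots/step_000.png".
        screenshot = task_dir / step["screenshot_path"] if step["screenshot_path"] else None
        print(f"  step {step['step_idx']}: {step['action']['type']} -> {screenshot}")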
openadapt_ml/benchmarks/live_tracker.py (removed)
@@ -1,180 +0,0 @@
-"""Live evaluation progress tracker for benchmark viewer.
-
-This module provides a tracker that writes real-time evaluation progress
-to a JSON file that the viewer can poll via /api/benchmark-live.
-"""
-
-from __future__ import annotations
-
-import json
-from dataclasses import asdict, dataclass
-from pathlib import Path
-from typing import Any
-
-from openadapt_ml.benchmarks.base import (
-    BenchmarkAction,
-    BenchmarkObservation,
-    BenchmarkResult,
-    BenchmarkTask,
-)
-
-
-@dataclass
-class LiveStepData:
-    """Data for a single step in live evaluation."""
-
-    step_idx: int
-    action: dict[str, Any]
-    reasoning: str | None = None
-    screenshot_url: str | None = None
-
-
-@dataclass
-class LiveTaskData:
-    """Data for current task being evaluated."""
-
-    task_id: str
-    instruction: str
-    domain: str
-    steps: list[LiveStepData]
-    result: dict[str, Any] | None = None
-
-
-class LiveEvaluationTracker:
-    """Tracks live evaluation progress and writes to benchmark_live.json.
-
-    This class is designed to be used alongside ExecutionTraceCollector
-    to provide real-time progress updates to the viewer.
-
-    Args:
-        output_file: Path to output JSON file (default: benchmark_live.json).
-        total_tasks: Total number of tasks to evaluate.
-    """
-
-    def __init__(
-        self,
-        output_file: str | Path = "benchmark_live.json",
-        total_tasks: int = 0,
-    ):
-        self.output_file = Path(output_file)
-        self.total_tasks = total_tasks
-        self.tasks_completed = 0
-        self.current_task: LiveTaskData | None = None
-
-        # Initialize with idle state
-        self._write_state({"status": "idle"})
-
-    def start_task(self, task: BenchmarkTask) -> None:
-        """Start tracking a new task.
-
-        Args:
-            task: The benchmark task being evaluated.
-        """
-        self.current_task = LiveTaskData(
-            task_id=task.task_id,
-            instruction=task.instruction,
-            domain=task.domain or "unknown",
-            steps=[],
-            result=None,
-        )
-
-        self._write_state({
-            "status": "running",
-            "total_tasks": self.total_tasks,
-            "tasks_completed": self.tasks_completed,
-            "current_task": asdict(self.current_task),
-        })
-
-    def record_step(
-        self,
-        step_idx: int,
-        observation: BenchmarkObservation,
-        action: BenchmarkAction,
-        reasoning: str | None = None,
-    ) -> None:
-        """Record a step in the current task.
-
-        Args:
-            step_idx: Index of this step.
-            observation: Observation at this step.
-            action: Action taken at this step.
-            reasoning: Optional reasoning/thought from agent.
-        """
-        if self.current_task is None:
-            raise RuntimeError("No task started. Call start_task() first.")
-
-        # Serialize action
-        action_data = {
-            "type": action.type,
-            "x": action.x,
-            "y": action.y,
-            "target_node_id": action.target_node_id,
-            "text": action.text,
-            "key": action.key,
-        }
-
-        # Create step data
-        step = LiveStepData(
-            step_idx=step_idx,
-            action=action_data,
-            reasoning=reasoning,
-            screenshot_url=None,  # Could be populated if we serve screenshots
-        )
-
-        self.current_task.steps.append(step)
-
-        # Write updated state
-        self._write_state({
-            "status": "running",
-            "total_tasks": self.total_tasks,
-            "tasks_completed": self.tasks_completed,
-            "current_task": asdict(self.current_task),
-        })
-
-    def finish_task(self, result: BenchmarkResult) -> None:
-        """Finish tracking the current task.
-
-        Args:
-            result: The evaluation result for the task.
-        """
-        if self.current_task is None:
-            raise RuntimeError("No task started. Call start_task() first.")
-
-        # Add result to current task
-        self.current_task.result = {
-            "success": result.success,
-            "score": result.score,
-            "num_steps": result.num_steps,
-            "total_time_seconds": result.total_time_seconds,
-        }
-
-        # Increment completed count
-        self.tasks_completed += 1
-
-        # Write updated state
-        self._write_state({
-            "status": "running",
-            "total_tasks": self.total_tasks,
-            "tasks_completed": self.tasks_completed,
-            "current_task": asdict(self.current_task),
-        })
-
-        # Clear current task
-        self.current_task = None
-
-    def finish(self) -> None:
-        """Mark evaluation as complete."""
-        self._write_state({
-            "status": "complete",
-            "total_tasks": self.total_tasks,
-            "tasks_completed": self.tasks_completed,
-        })
-
-    def _write_state(self, state: dict[str, Any]) -> None:
-        """Write current state to JSON file.
-
-        Args:
-            state: State dictionary to write.
-        """
-        with open(self.output_file, "w") as f:
-            json.dump(state, f, indent=2)
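Neither deleted module ships a combined usage example, but their matching start_task / record_step / finish_task signatures show how they were designed to run side by side, with LiveEvaluationTracker feeding the viewer while ExecutionTraceCollector persists the full trace. The following is a minimal sketch against the 0.2.0 wheel (where both modules still exist); tasks, run_agent, and evaluate stand in for the caller's own benchmark harness and are not part of the package.

from openadapt_ml.benchmarks.data_collection import ExecutionTraceCollector
from openadapt_ml.benchmarks.live_tracker import LiveEvaluationTracker

collector = ExecutionTraceCollector(benchmark_name="waa", model_id="qwen3vl-2b-epoch5")
tracker = LiveEvaluationTracker(output_file="benchmark_live.json", total_tasks=len(tasks))

results = []
for task in tasks:
    collector.start_task(task)
    tracker.start_task(task)
    # run_agent is a placeholder generator yielding (observation, action, reasoning) per step.
    for step_idx, (obs, action, reasoning) in enumerate(run_agent(task)):
        # Both helpers take the same (step_idx, observation, action, reasoning) arguments.
        collector.record_step(step_idx, obs, action, reasoning)
        tracker.record_step(step_idx, obs, action, reasoning)
    result = evaluate(task)  # placeholder returning a BenchmarkResult
    collector.finish_task(result)
    tracker.finish_task(result)
    results.append(result)

collector.save_summary(results)  # writes summary.json for the run
tracker.finish()                 # sets status to "complete" in benchmark_live.json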