openadapt-ml 0.1.0 (openadapt_ml-0.1.0-py3-none-any.whl)
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- openadapt_ml/__init__.py +0 -0
- openadapt_ml/benchmarks/__init__.py +125 -0
- openadapt_ml/benchmarks/agent.py +825 -0
- openadapt_ml/benchmarks/azure.py +761 -0
- openadapt_ml/benchmarks/base.py +366 -0
- openadapt_ml/benchmarks/cli.py +884 -0
- openadapt_ml/benchmarks/data_collection.py +432 -0
- openadapt_ml/benchmarks/runner.py +381 -0
- openadapt_ml/benchmarks/waa.py +704 -0
- openadapt_ml/cloud/__init__.py +5 -0
- openadapt_ml/cloud/azure_inference.py +441 -0
- openadapt_ml/cloud/lambda_labs.py +2445 -0
- openadapt_ml/cloud/local.py +790 -0
- openadapt_ml/config.py +56 -0
- openadapt_ml/datasets/__init__.py +0 -0
- openadapt_ml/datasets/next_action.py +507 -0
- openadapt_ml/evals/__init__.py +23 -0
- openadapt_ml/evals/grounding.py +241 -0
- openadapt_ml/evals/plot_eval_metrics.py +174 -0
- openadapt_ml/evals/trajectory_matching.py +486 -0
- openadapt_ml/grounding/__init__.py +45 -0
- openadapt_ml/grounding/base.py +236 -0
- openadapt_ml/grounding/detector.py +570 -0
- openadapt_ml/ingest/__init__.py +43 -0
- openadapt_ml/ingest/capture.py +312 -0
- openadapt_ml/ingest/loader.py +232 -0
- openadapt_ml/ingest/synthetic.py +1102 -0
- openadapt_ml/models/__init__.py +0 -0
- openadapt_ml/models/api_adapter.py +171 -0
- openadapt_ml/models/base_adapter.py +59 -0
- openadapt_ml/models/dummy_adapter.py +42 -0
- openadapt_ml/models/qwen_vl.py +426 -0
- openadapt_ml/runtime/__init__.py +0 -0
- openadapt_ml/runtime/policy.py +182 -0
- openadapt_ml/schemas/__init__.py +53 -0
- openadapt_ml/schemas/sessions.py +122 -0
- openadapt_ml/schemas/validation.py +252 -0
- openadapt_ml/scripts/__init__.py +0 -0
- openadapt_ml/scripts/compare.py +1490 -0
- openadapt_ml/scripts/demo_policy.py +62 -0
- openadapt_ml/scripts/eval_policy.py +287 -0
- openadapt_ml/scripts/make_gif.py +153 -0
- openadapt_ml/scripts/prepare_synthetic.py +43 -0
- openadapt_ml/scripts/run_qwen_login_benchmark.py +192 -0
- openadapt_ml/scripts/train.py +174 -0
- openadapt_ml/training/__init__.py +0 -0
- openadapt_ml/training/benchmark_viewer.py +1538 -0
- openadapt_ml/training/shared_ui.py +157 -0
- openadapt_ml/training/stub_provider.py +276 -0
- openadapt_ml/training/trainer.py +2446 -0
- openadapt_ml/training/viewer.py +2970 -0
- openadapt_ml-0.1.0.dist-info/METADATA +818 -0
- openadapt_ml-0.1.0.dist-info/RECORD +55 -0
- openadapt_ml-0.1.0.dist-info/WHEEL +4 -0
- openadapt_ml-0.1.0.dist-info/licenses/LICENSE +21 -0
openadapt_ml/benchmarks/data_collection.py
@@ -0,0 +1,432 @@
"""Data collection for benchmark viewer integration.

This module handles saving execution traces during benchmark runs for later
replay in the benchmark viewer. It creates a structured directory layout with
screenshots, metadata, and execution traces.

Directory structure:
    benchmark_results/
    ├── waa_eval_YYYYMMDD_HHMMSS/
    │   ├── metadata.json
    │   ├── tasks/
    │   │   ├── task_001/
    │   │   │   ├── task.json
    │   │   │   ├── screenshots/
    │   │   │   │   ├── step_000.png
    │   │   │   │   ├── step_001.png
    │   │   │   │   └── ...
    │   │   │   └── execution.json
    │   │   └── task_002/
    │   │       └── ...
    │   └── summary.json

Example:
    from openadapt_ml.benchmarks.data_collection import ExecutionTraceCollector

    collector = ExecutionTraceCollector(
        benchmark_name="waa",
        run_name="waa_eval_20241214",
        model_id="qwen3vl-2b-epoch5"
    )

    # During evaluation
    collector.start_task(task)
    for step_idx, (obs, action) in enumerate(trajectory):
        collector.record_step(step_idx, obs, action, reasoning="...")
    collector.finish_task(result)
"""

from __future__ import annotations

import json
import logging
from dataclasses import asdict, dataclass
from datetime import datetime
from pathlib import Path
from typing import Any

from openadapt_ml.benchmarks.base import (
    BenchmarkAction,
    BenchmarkObservation,
    BenchmarkResult,
    BenchmarkTask,
)

logger = logging.getLogger(__name__)


@dataclass
class ExecutionStep:
    """Single step in execution trace.

    Attributes:
        step_idx: Step index in the trajectory.
        screenshot_path: Relative path to screenshot image.
        action: Action taken at this step.
        reasoning: Optional reasoning/thought from the agent.
        timestamp: Timestamp when step was recorded.
    """

    step_idx: int
    screenshot_path: str | None
    action: dict[str, Any]  # Serialized BenchmarkAction
    reasoning: str | None = None
    timestamp: float | None = None


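# Note (illustrative values only): each ExecutionStep is serialized with
# dataclasses.asdict() and stored under the "steps" key of the task's
# execution.json, for example:
#
#   {
#     "step_idx": 0,
#     "screenshot_path": "screenshots/step_000.png",
#     "action": {"type": "click", "x": 412, "y": 198, ...},
#     "reasoning": "Click the Sign in button",
#     "timestamp": 1734192000.123
#   }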
class ExecutionTraceCollector:
    """Collects execution traces during benchmark runs.

    This class handles:
    - Creating the directory structure for a benchmark run
    - Saving screenshots at each step
    - Recording actions and reasoning
    - Saving task results and metadata

    Args:
        benchmark_name: Name of the benchmark (e.g., "waa", "webarena").
        run_name: Unique name for this evaluation run (e.g., "waa_eval_20241214").
        model_id: Identifier for the model being evaluated.
        output_dir: Base directory for benchmark results (default: "./benchmark_results").
    """

    def __init__(
        self,
        benchmark_name: str,
        run_name: str | None = None,
        model_id: str = "unknown",
        output_dir: str | Path = "benchmark_results",
    ):
        self.benchmark_name = benchmark_name
        self.model_id = model_id

        # Auto-generate run_name if not provided
        if run_name is None:
            timestamp = datetime.now().strftime("%Y%m%d_%H%M%S")
            run_name = f"{benchmark_name}_eval_{timestamp}"
        self.run_name = run_name

        # Set up directory structure
        self.output_dir = Path(output_dir)
        self.run_dir = self.output_dir / run_name
        self.tasks_dir = self.run_dir / "tasks"

        # Current task tracking
        self._current_task: BenchmarkTask | None = None
        self._current_task_dir: Path | None = None
        self._current_screenshots_dir: Path | None = None
        self._current_steps: list[ExecutionStep] = []

        # Initialize run
        self._initialize_run()

    def _initialize_run(self) -> None:
        """Initialize the benchmark run directory and metadata."""
        self.run_dir.mkdir(parents=True, exist_ok=True)
        self.tasks_dir.mkdir(exist_ok=True)

        # Save run metadata
        metadata = {
            "benchmark_name": self.benchmark_name,
            "run_name": self.run_name,
            "model_id": self.model_id,
            "created_at": datetime.now().isoformat(),
        }

        metadata_path = self.run_dir / "metadata.json"
        with open(metadata_path, "w") as f:
            json.dump(metadata, f, indent=2)

        logger.info(f"Initialized benchmark run at: {self.run_dir}")

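    # Illustrative result of construction with defaults, e.g. ExecutionTraceCollector("waa")
    # (timestamp is an example): the run is rooted at
    #   benchmark_results/waa_eval_20241215_103000/
    # with metadata.json written immediately and tasks/ populated as tasks are collected.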
    def start_task(self, task: BenchmarkTask) -> None:
        """Start collecting data for a new task.

        Args:
            task: The benchmark task being executed.
        """
        if self._current_task is not None:
            logger.warning(
                f"Starting new task {task.task_id} without finishing {self._current_task.task_id}"
            )

        self._current_task = task
        self._current_steps = []

        # Create task directory
        task_dir_name = self._sanitize_task_id(task.task_id)
        self._current_task_dir = self.tasks_dir / task_dir_name
        self._current_task_dir.mkdir(parents=True, exist_ok=True)

        # Create screenshots directory
        self._current_screenshots_dir = self._current_task_dir / "screenshots"
        self._current_screenshots_dir.mkdir(exist_ok=True)

        # Save task definition
        task_data = {
            "task_id": task.task_id,
            "instruction": task.instruction,
            "domain": task.domain,
            "initial_state_ref": task.initial_state_ref,
            "time_limit_steps": task.time_limit_steps,
            "raw_config": task.raw_config,
            "evaluation_spec": task.evaluation_spec,
        }

        task_path = self._current_task_dir / "task.json"
        with open(task_path, "w") as f:
            json.dump(task_data, f, indent=2)

        logger.info(f"Started collecting data for task: {task.task_id}")

    def record_step(
        self,
        step_idx: int,
        observation: BenchmarkObservation,
        action: BenchmarkAction,
        reasoning: str | None = None,
    ) -> None:
        """Record a single step in the execution trace.

        Args:
            step_idx: Index of this step in the trajectory.
            observation: Observation at this step.
            action: Action taken at this step.
            reasoning: Optional reasoning/thought from the agent.
        """
        if self._current_task is None:
            raise RuntimeError("No task started. Call start_task() first.")

        # Save screenshot if available
        screenshot_path = None
        if observation.screenshot is not None:
            screenshot_path = self._save_screenshot(step_idx, observation.screenshot)
        elif observation.screenshot_path is not None:
            # Copy existing screenshot
            screenshot_path = self._copy_screenshot(step_idx, observation.screenshot_path)

        # Create execution step record
        step = ExecutionStep(
            step_idx=step_idx,
            screenshot_path=screenshot_path,
            action=self._serialize_action(action),
            reasoning=reasoning,
            timestamp=datetime.now().timestamp(),
        )

        self._current_steps.append(step)

    def finish_task(self, result: BenchmarkResult) -> None:
        """Finish collecting data for the current task and save execution trace.

        Args:
            result: The evaluation result for the task.
        """
        if self._current_task is None:
            raise RuntimeError("No task started. Call start_task() first.")

        # Save execution trace
        execution_data = {
            "task_id": result.task_id,
            "model_id": self.model_id,
            "success": result.success,
            "score": result.score,
            "num_steps": result.num_steps,
            "total_time_seconds": result.total_time_seconds,
            "error": result.error,
            "reason": result.reason,
            "steps": [asdict(step) for step in self._current_steps],
        }

        execution_path = self._current_task_dir / "execution.json"
        with open(execution_path, "w") as f:
            json.dump(execution_data, f, indent=2)

        logger.info(
            f"Saved execution trace for task {result.task_id}: "
            f"{'SUCCESS' if result.success else 'FAIL'} ({result.num_steps} steps)"
        )

        # Clear current task
        self._current_task = None
        self._current_task_dir = None
        self._current_screenshots_dir = None
        self._current_steps = []

    def save_summary(self, all_results: list[BenchmarkResult]) -> None:
        """Save summary of all task results.

        Args:
            all_results: List of all BenchmarkResult objects from the run.
        """
        summary = {
            "benchmark_name": self.benchmark_name,
            "run_name": self.run_name,
            "model_id": self.model_id,
            "num_tasks": len(all_results),
            "num_success": sum(1 for r in all_results if r.success),
            "success_rate": sum(1 for r in all_results if r.success) / len(all_results) if all_results else 0.0,
            "avg_score": sum(r.score for r in all_results) / len(all_results) if all_results else 0.0,
            "avg_steps": sum(r.num_steps for r in all_results) / len(all_results) if all_results else 0.0,
            "avg_time_seconds": sum(r.total_time_seconds for r in all_results) / len(all_results) if all_results else 0.0,
            "tasks": [
                {
                    "task_id": r.task_id,
                    "success": r.success,
                    "score": r.score,
                    "num_steps": r.num_steps,
                    "error": r.error,
                }
                for r in all_results
            ],
        }

        summary_path = self.run_dir / "summary.json"
        with open(summary_path, "w") as f:
            json.dump(summary, f, indent=2)

        logger.info(
            f"Saved summary: {summary['num_success']}/{summary['num_tasks']} tasks succeeded "
            f"({summary['success_rate']:.1%})"
        )

    def _save_screenshot(self, step_idx: int, screenshot_bytes: bytes) -> str:
        """Save screenshot bytes to file.

        Args:
            step_idx: Step index for naming the file.
            screenshot_bytes: PNG image bytes.

        Returns:
            Relative path to the saved screenshot.
        """
        if self._current_screenshots_dir is None:
            raise RuntimeError("No task started")

        filename = f"step_{step_idx:03d}.png"
        screenshot_path = self._current_screenshots_dir / filename

        with open(screenshot_path, "wb") as f:
            f.write(screenshot_bytes)

        # Return relative path from task directory
        return f"screenshots/{filename}"

    def _copy_screenshot(self, step_idx: int, source_path: str) -> str:
        """Copy screenshot from existing path.

        Args:
            step_idx: Step index for naming the file.
            source_path: Path to existing screenshot.

        Returns:
            Relative path to the copied screenshot.
        """
        if self._current_screenshots_dir is None:
            raise RuntimeError("No task started")

        filename = f"step_{step_idx:03d}.png"
        dest_path = self._current_screenshots_dir / filename

        # Copy file
        import shutil
        shutil.copy2(source_path, dest_path)

        return f"screenshots/{filename}"

    def _serialize_action(self, action: BenchmarkAction) -> dict[str, Any]:
        """Serialize BenchmarkAction to dict.

        Args:
            action: Action to serialize.

        Returns:
            Dictionary representation of the action.
        """
        return {
            "type": action.type,
            "x": action.x,
            "y": action.y,
            "target_node_id": action.target_node_id,
            "target_bbox": action.target_bbox,
            "target_role": action.target_role,
            "target_name": action.target_name,
            "text": action.text,
            "key": action.key,
            "modifiers": action.modifiers,
            "scroll_direction": action.scroll_direction,
            "scroll_amount": action.scroll_amount,
            "end_x": action.end_x,
            "end_y": action.end_y,
            "answer": action.answer,
            "raw_action": action.raw_action,
        }

    def _sanitize_task_id(self, task_id: str) -> str:
        """Sanitize task ID for use as directory name.

        Args:
            task_id: Original task ID.

        Returns:
            Sanitized task ID safe for filesystem.
        """
        # Replace unsafe characters with underscores
        safe_id = "".join(c if c.isalnum() or c in "-_" else "_" for c in task_id)
        return safe_id


def save_execution_trace(
    task: BenchmarkTask,
    result: BenchmarkResult,
    trajectory: list[tuple[BenchmarkObservation, BenchmarkAction]],
    benchmark_name: str,
    model_id: str = "unknown",
    output_dir: str | Path = "benchmark_results",
    run_name: str | None = None,
    reasoning_map: dict[int, str] | None = None,
) -> Path:
    """Convenience function to save a complete execution trace.

    This is a simpler alternative to using ExecutionTraceCollector directly
    when you have the complete trajectory available.

    Args:
        task: The benchmark task.
        result: The evaluation result.
        trajectory: List of (observation, action) pairs.
        benchmark_name: Name of the benchmark.
        model_id: Identifier for the model.
        output_dir: Base directory for results.
        run_name: Optional run name (auto-generated if None).
        reasoning_map: Optional map of step_idx -> reasoning text.

    Returns:
        Path to the task directory.

    Example:
        save_execution_trace(
            task=task,
            result=result,
            trajectory=trajectory,
            benchmark_name="waa",
            model_id="qwen3vl-2b-epoch5",
            reasoning_map={0: "I should click the button", 1: "Now type the text"}
        )
    """
    collector = ExecutionTraceCollector(
        benchmark_name=benchmark_name,
        run_name=run_name,
        model_id=model_id,
        output_dir=output_dir,
    )

    collector.start_task(task)

    for step_idx, (obs, action) in enumerate(trajectory):
        reasoning = reasoning_map.get(step_idx) if reasoning_map else None
        collector.record_step(step_idx, obs, action, reasoning)

    # Capture the task directory before finish_task() clears the current-task state,
    # so the documented return value (the task directory) is correct.
    task_dir = collector._current_task_dir or collector.tasks_dir

    collector.finish_task(result)

    return task_dir
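For reference, a minimal sketch (not part of the package) of how a downstream tool such as the benchmark viewer might read a finished run back from disk, assuming only the layout written by this module; the run directory name below is a placeholder:

import json
from pathlib import Path

# Placeholder run directory; real names follow "<benchmark>_eval_<YYYYMMDD_HHMMSS>".
run_dir = Path("benchmark_results") / "waa_eval_20241214_153000"

summary = json.loads((run_dir / "summary.json").read_text())
print(
    f"{summary['model_id']}: {summary['num_success']}/{summary['num_tasks']} tasks "
    f"({summary['success_rate']:.1%})"
)

for task_dir in sorted((run_dir / "tasks").iterdir()):
    execution = json.loads((task_dir / "execution.json").read_text())
    status = "SUCCESS" if execution["success"] else "FAIL"
    print(f"  {execution['task_id']}: {status} ({execution['num_steps']} steps)")
    for step in execution["steps"]:
        # Screenshot paths in execution.json are relative to the task directory.
        screenshot = task_dir / step["screenshot_path"] if step["screenshot_path"] else None
        print(f"    step {step['step_idx']}: {step['action']['type']} -> {screenshot}")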