openadapt-ml 0.1.0__py3-none-any.whl → 0.2.0__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- openadapt_ml/benchmarks/__init__.py +8 -0
- openadapt_ml/benchmarks/agent.py +90 -11
- openadapt_ml/benchmarks/azure.py +35 -6
- openadapt_ml/benchmarks/cli.py +4449 -201
- openadapt_ml/benchmarks/live_tracker.py +180 -0
- openadapt_ml/benchmarks/runner.py +41 -4
- openadapt_ml/benchmarks/viewer.py +1219 -0
- openadapt_ml/benchmarks/vm_monitor.py +610 -0
- openadapt_ml/benchmarks/waa.py +61 -4
- openadapt_ml/benchmarks/waa_deploy/Dockerfile +222 -0
- openadapt_ml/benchmarks/waa_deploy/__init__.py +10 -0
- openadapt_ml/benchmarks/waa_deploy/api_agent.py +539 -0
- openadapt_ml/benchmarks/waa_deploy/start_waa_server.bat +53 -0
- openadapt_ml/benchmarks/waa_live.py +619 -0
- openadapt_ml/cloud/local.py +1555 -1
- openadapt_ml/cloud/ssh_tunnel.py +553 -0
- openadapt_ml/datasets/next_action.py +87 -68
- openadapt_ml/evals/grounding.py +26 -8
- openadapt_ml/evals/trajectory_matching.py +84 -36
- openadapt_ml/experiments/demo_prompt/__init__.py +19 -0
- openadapt_ml/experiments/demo_prompt/format_demo.py +226 -0
- openadapt_ml/experiments/demo_prompt/results/experiment_20251231_002125.json +83 -0
- openadapt_ml/experiments/demo_prompt/results/experiment_n30_20251231_165958.json +1100 -0
- openadapt_ml/experiments/demo_prompt/results/multistep_20251231_025051.json +182 -0
- openadapt_ml/experiments/demo_prompt/run_experiment.py +531 -0
- openadapt_ml/experiments/waa_demo/__init__.py +10 -0
- openadapt_ml/experiments/waa_demo/demos.py +357 -0
- openadapt_ml/experiments/waa_demo/runner.py +717 -0
- openadapt_ml/experiments/waa_demo/tasks.py +151 -0
- openadapt_ml/export/__init__.py +9 -0
- openadapt_ml/export/__main__.py +6 -0
- openadapt_ml/export/cli.py +89 -0
- openadapt_ml/export/parquet.py +265 -0
- openadapt_ml/ingest/__init__.py +3 -4
- openadapt_ml/ingest/capture.py +89 -81
- openadapt_ml/ingest/loader.py +116 -68
- openadapt_ml/ingest/synthetic.py +221 -159
- openadapt_ml/retrieval/README.md +226 -0
- openadapt_ml/retrieval/USAGE.md +391 -0
- openadapt_ml/retrieval/__init__.py +91 -0
- openadapt_ml/retrieval/demo_retriever.py +817 -0
- openadapt_ml/retrieval/embeddings.py +629 -0
- openadapt_ml/retrieval/index.py +194 -0
- openadapt_ml/retrieval/retriever.py +160 -0
- openadapt_ml/runtime/policy.py +10 -10
- openadapt_ml/schema/__init__.py +104 -0
- openadapt_ml/schema/converters.py +541 -0
- openadapt_ml/schema/episode.py +457 -0
- openadapt_ml/scripts/compare.py +26 -16
- openadapt_ml/scripts/eval_policy.py +4 -5
- openadapt_ml/scripts/prepare_synthetic.py +14 -17
- openadapt_ml/scripts/train.py +81 -70
- openadapt_ml/training/benchmark_viewer.py +3225 -0
- openadapt_ml/training/trainer.py +120 -363
- openadapt_ml/training/trl_trainer.py +354 -0
- {openadapt_ml-0.1.0.dist-info → openadapt_ml-0.2.0.dist-info}/METADATA +102 -60
- openadapt_ml-0.2.0.dist-info/RECORD +86 -0
- openadapt_ml/schemas/__init__.py +0 -53
- openadapt_ml/schemas/sessions.py +0 -122
- openadapt_ml/schemas/validation.py +0 -252
- openadapt_ml-0.1.0.dist-info/RECORD +0 -55
- {openadapt_ml-0.1.0.dist-info → openadapt_ml-0.2.0.dist-info}/WHEEL +0 -0
- {openadapt_ml-0.1.0.dist-info → openadapt_ml-0.2.0.dist-info}/licenses/LICENSE +0 -0
|
@@ -0,0 +1,1219 @@
|
|
|
1
|
+
"""Benchmark viewer HTML generation.
|
|
2
|
+
|
|
3
|
+
This module generates a standalone HTML viewer for benchmark results,
|
|
4
|
+
showing task list with pass/fail status, step-by-step replay of
|
|
5
|
+
benchmark executions, screenshots, actions, and reasoning at each step.
|
|
6
|
+
|
|
7
|
+
Usage:
|
|
8
|
+
from openadapt_ml.benchmarks.viewer import generate_benchmark_viewer
|
|
9
|
+
|
|
10
|
+
# Generate viewer from benchmark results directory
|
|
11
|
+
generate_benchmark_viewer(
|
|
12
|
+
benchmark_dir=Path("benchmark_results/waa_eval_20241214"),
|
|
13
|
+
output_path=Path("benchmark_results/waa_eval_20241214/benchmark.html"),
|
|
14
|
+
)
|
|
15
|
+
|
|
16
|
+
Directory structure expected:
|
|
17
|
+
benchmark_results/{run_name}/
|
|
18
|
+
|-- metadata.json # Benchmark config, models evaluated
|
|
19
|
+
|-- summary.json # Aggregate results
|
|
20
|
+
|-- tasks/
|
|
21
|
+
| |-- task_001/
|
|
22
|
+
| | |-- task.json # Task definition
|
|
23
|
+
| | |-- execution.json # Execution trace with steps
|
|
24
|
+
| | |-- screenshots/ # Step screenshots
|
|
25
|
+
| | |-- step_000.png
|
|
26
|
+
| | |-- step_001.png
|
|
27
|
+
| | |-- ...
|
|
28
|
+
| |-- task_002/
|
|
29
|
+
| | |-- ...
|
|
30
|
+
"""
|
|
31
|
+
|
|
32
|
+
from __future__ import annotations
|
|
33
|
+
|
|
34
|
+
import base64
|
|
35
|
+
import json
|
|
36
|
+
import logging
|
|
37
|
+
from pathlib import Path
|
|
38
|
+
from typing import Any
|
|
39
|
+
|
|
40
|
+
from openadapt_ml.training.shared_ui import (
|
|
41
|
+
get_shared_header_css as _get_shared_header_css,
|
|
42
|
+
generate_shared_header_html as _generate_shared_header_html,
|
|
43
|
+
)
|
|
44
|
+
|
|
45
|
+
logger = logging.getLogger(__name__)
|
|
46
|
+
|
|
47
|
+
|
|
48
|
+
def load_benchmark_metadata(benchmark_dir: Path) -> dict[str, Any]:
    """Load benchmark metadata from metadata.json.

    Args:
        benchmark_dir: Path to benchmark run directory.

    Returns:
        The parsed contents of ``benchmark_dir / "metadata.json"`` when that
        file exists. Otherwise a placeholder dictionary whose
        ``benchmark_name`` and ``model_id`` are ``"unknown"``, whose
        ``run_name`` is the directory name, and whose ``created_at`` is
        ``None``.

    Raises:
        json.JSONDecodeError: If metadata.json exists but is not valid JSON.
    """
    metadata_path = benchmark_dir / "metadata.json"
    if metadata_path.exists():
        # JSON text is UTF-8; decode it explicitly instead of relying on the
        # platform locale encoding (e.g. cp1252 on Windows), which fails or
        # garbles any non-ASCII value.
        with open(metadata_path, encoding="utf-8") as f:
            return json.load(f)

    # No metadata file: fall back to placeholders derived from the directory.
    return {
        "benchmark_name": "unknown",
        "run_name": benchmark_dir.name,
        "model_id": "unknown",
        "created_at": None,
    }
|
|
67
|
+
|
|
68
|
+
|
|
69
|
+
def load_benchmark_summary(benchmark_dir: Path) -> dict[str, Any]:
    """Load benchmark summary from summary.json.

    Args:
        benchmark_dir: Path to benchmark run directory.

    Returns:
        The parsed contents of ``benchmark_dir / "summary.json"`` when that
        file exists. Otherwise an all-zero summary with the keys
        ``num_tasks``, ``num_success``, ``success_rate``, ``avg_score``,
        ``avg_steps`` and an empty ``tasks`` list.

    Raises:
        json.JSONDecodeError: If summary.json exists but is not valid JSON.
    """
    summary_path = benchmark_dir / "summary.json"
    if summary_path.exists():
        # Decode as UTF-8 explicitly; the locale default is not UTF-8 on
        # every platform and JSON files are UTF-8 encoded.
        with open(summary_path, encoding="utf-8") as f:
            return json.load(f)

    # No summary file: report an empty run.
    return {
        "num_tasks": 0,
        "num_success": 0,
        "success_rate": 0.0,
        "avg_score": 0.0,
        "avg_steps": 0.0,
        "tasks": [],
    }
|
|
90
|
+
|
|
91
|
+
|
|
92
|
+
def load_task_results(benchmark_dir: Path) -> list[dict[str, Any]]:
    """Load all task results from benchmark run.

    Every sub-directory of ``benchmark_dir / "tasks"`` is treated as one
    task; anything in that directory that is not a directory is ignored.
    Tasks are visited in sorted path order.

    Args:
        benchmark_dir: Path to benchmark run directory.

    Returns:
        List of task dictionaries, one per task directory, each with keys:

        * ``task_dir``: the task directory as a string.
        * ``task_id``: the task directory name.
        * ``definition``: parsed ``task.json``, or a stub holding the task id
          and an empty instruction when the file is missing.
        * ``execution``: parsed ``execution.json``, or an empty failed trace
          when the file is missing.
        * ``screenshots``: sorted ``*.png`` paths under ``screenshots/``,
          as strings relative to ``benchmark_dir`` (empty when the
          directory is missing).

        An empty list is returned when there is no ``tasks`` directory.

    Raises:
        json.JSONDecodeError: If a task.json or execution.json file exists
            but is not valid JSON.
    """
    tasks_dir = benchmark_dir / "tasks"
    if not tasks_dir.exists():
        return []

    results = []
    for task_dir in sorted(tasks_dir.iterdir()):
        if not task_dir.is_dir():
            continue

        task_data: dict[str, Any] = {
            "task_dir": str(task_dir),
            "task_id": task_dir.name,
        }

        # Load task definition. JSON is read as UTF-8 explicitly so that
        # non-ASCII instructions decode the same way on every platform
        # instead of depending on the locale encoding.
        task_json = task_dir / "task.json"
        if task_json.exists():
            with open(task_json, encoding="utf-8") as f:
                task_data["definition"] = json.load(f)
        else:
            task_data["definition"] = {"task_id": task_dir.name, "instruction": ""}

        # Load execution trace (same explicit UTF-8 decoding).
        execution_json = task_dir / "execution.json"
        if execution_json.exists():
            with open(execution_json, encoding="utf-8") as f:
                task_data["execution"] = json.load(f)
        else:
            task_data["execution"] = {"steps": [], "success": False, "num_steps": 0}

        # Load screenshot paths, stored relative to the run directory.
        screenshots_dir = task_dir / "screenshots"
        if screenshots_dir.exists():
            screenshot_paths = sorted(screenshots_dir.glob("*.png"))
            task_data["screenshots"] = [
                str(p.relative_to(benchmark_dir)) for p in screenshot_paths
            ]
        else:
            task_data["screenshots"] = []

        results.append(task_data)

    return results
|
|
143
|
+
|
|
144
|
+
|
|
145
|
+
def _encode_image_to_base64(image_path: Path) -> str | None:
    """Turn an image file into a base64 ``data:`` URL for inline HTML use.

    The URL is always labelled ``image/png``; the file contents are not
    inspected.

    Args:
        image_path: Path to PNG image.

    Returns:
        The data URL string, or None when the file does not exist or when
        reading/encoding it raises (the failure is logged as a warning).
    """
    try:
        # A missing file is not an error: report "no image" without logging.
        if not image_path.exists():
            return None
        raw_bytes = image_path.read_bytes()
        payload = base64.b64encode(raw_bytes).decode()
        return "data:image/png;base64," + payload
    except Exception as err:
        # Best effort: one unreadable screenshot must not abort the caller.
        logger.warning(f"Failed to encode image {image_path}: {err}")
        return None
|
|
162
|
+
|
|
163
|
+
|
|
164
|
+
def _get_domain_stats(tasks: list[dict[str, Any]]) -> dict[str, dict[str, int]]:
    """Calculate per-domain statistics.

    The domain is read from ``task["definition"]["domain"]`` and the outcome
    from ``task["execution"]["success"]`` (any truthy value counts as a
    success). A task whose domain is missing, ``None`` or empty is counted
    under ``"unknown"``, so every key of the result is a usable label; a
    missing or ``None`` ``definition``/``execution`` is treated as empty.

    Args:
        tasks: List of task result dictionaries.

    Returns:
        Dictionary mapping domain name to {total, success, fail} counts.
        Empty when ``tasks`` is empty.
    """
    domain_stats: dict[str, dict[str, int]] = {}

    for task in tasks:
        # `or {}` also covers an explicit JSON null, which `.get(key, {})`
        # would pass through and then crash on `.get`.
        definition = task.get("definition") or {}
        execution = task.get("execution") or {}

        # A null/empty domain would otherwise become a None or "" bucket.
        domain = definition.get("domain") or "unknown"
        success = execution.get("success", False)

        stats = domain_stats.setdefault(
            domain, {"total": 0, "success": 0, "fail": 0}
        )
        stats["total"] += 1
        if success:
            stats["success"] += 1
        else:
            stats["fail"] += 1

    return domain_stats
|
|
189
|
+
|
|
190
|
+
|
|
191
|
+
def generate_benchmark_viewer(
    benchmark_dir: Path,
    output_path: Path | None = None,
    embed_screenshots: bool = False,
) -> Path:
    """Generate HTML viewer for benchmark results.

    Loads the run's metadata, summary and per-task results, computes
    per-domain statistics, renders the viewer HTML and writes it to disk,
    creating the output file's parent directories if needed.

    Args:
        benchmark_dir: Path to benchmark run directory containing metadata.json,
            summary.json, and tasks/ subdirectory.
        output_path: Path for output HTML file. Defaults to benchmark_dir/benchmark.html.
        embed_screenshots: If True, embed screenshots as base64 data URLs.
            This creates a larger but fully standalone HTML file.

    Returns:
        Path to generated HTML file.
    """
    benchmark_dir = Path(benchmark_dir)
    if output_path is None:
        output_path = benchmark_dir / "benchmark.html"

    # Load all data
    metadata = load_benchmark_metadata(benchmark_dir)
    summary = load_benchmark_summary(benchmark_dir)
    tasks = load_task_results(benchmark_dir)

    # Calculate domain statistics
    domain_stats = _get_domain_stats(tasks)

    # Generate HTML
    html = _generate_benchmark_viewer_html(
        metadata=metadata,
        summary=summary,
        tasks=tasks,
        domain_stats=domain_stats,
        benchmark_dir=benchmark_dir,
        embed_screenshots=embed_screenshots,
    )

    # Write output. The generated page declares <meta charset="UTF-8">, so
    # the bytes on disk must be UTF-8 regardless of the platform's locale
    # encoding; otherwise non-ASCII text is garbled or fails to encode.
    output_path = Path(output_path)
    output_path.parent.mkdir(parents=True, exist_ok=True)
    output_path.write_text(html, encoding="utf-8")

    logger.info(f"Generated benchmark viewer: {output_path}")
    return output_path
|
|
237
|
+
|
|
238
|
+
|
|
239
|
+
def _generate_benchmark_viewer_html(
|
|
240
|
+
metadata: dict[str, Any],
|
|
241
|
+
summary: dict[str, Any],
|
|
242
|
+
tasks: list[dict[str, Any]],
|
|
243
|
+
domain_stats: dict[str, dict[str, int]],
|
|
244
|
+
benchmark_dir: Path,
|
|
245
|
+
embed_screenshots: bool = False,
|
|
246
|
+
) -> str:
|
|
247
|
+
"""Generate the HTML content for benchmark viewer.
|
|
248
|
+
|
|
249
|
+
Args:
|
|
250
|
+
metadata: Benchmark metadata.
|
|
251
|
+
summary: Summary statistics.
|
|
252
|
+
tasks: List of task result dictionaries.
|
|
253
|
+
domain_stats: Per-domain statistics.
|
|
254
|
+
benchmark_dir: Base directory for resolving relative paths.
|
|
255
|
+
embed_screenshots: If True, embed screenshots as base64.
|
|
256
|
+
|
|
257
|
+
Returns:
|
|
258
|
+
HTML string.
|
|
259
|
+
"""
|
|
260
|
+
# Get shared header components
|
|
261
|
+
shared_header_css = _get_shared_header_css()
|
|
262
|
+
shared_header_html = _generate_shared_header_html("benchmarks")
|
|
263
|
+
|
|
264
|
+
# Serialize data for JavaScript
|
|
265
|
+
metadata_json = json.dumps(metadata)
|
|
266
|
+
summary_json = json.dumps(summary)
|
|
267
|
+
domain_stats_json = json.dumps(domain_stats)
|
|
268
|
+
|
|
269
|
+
# Process tasks for JavaScript - include execution steps and screenshot paths
|
|
270
|
+
tasks_for_js = []
|
|
271
|
+
for task in tasks:
|
|
272
|
+
task_js = {
|
|
273
|
+
"task_id": task.get("task_id"),
|
|
274
|
+
"definition": task.get("definition", {}),
|
|
275
|
+
"execution": task.get("execution", {}),
|
|
276
|
+
"screenshots": task.get("screenshots", []),
|
|
277
|
+
}
|
|
278
|
+
|
|
279
|
+
# Optionally embed screenshots as base64
|
|
280
|
+
if embed_screenshots:
|
|
281
|
+
embedded_screenshots = []
|
|
282
|
+
for screenshot_rel_path in task.get("screenshots", []):
|
|
283
|
+
screenshot_path = benchmark_dir / screenshot_rel_path
|
|
284
|
+
data_url = _encode_image_to_base64(screenshot_path)
|
|
285
|
+
embedded_screenshots.append(data_url or "")
|
|
286
|
+
task_js["embedded_screenshots"] = embedded_screenshots
|
|
287
|
+
|
|
288
|
+
tasks_for_js.append(task_js)
|
|
289
|
+
|
|
290
|
+
tasks_json = json.dumps(tasks_for_js)
|
|
291
|
+
|
|
292
|
+
# Calculate aggregate metrics
|
|
293
|
+
num_tasks = len(tasks)
|
|
294
|
+
num_success = sum(1 for t in tasks if t.get("execution", {}).get("success", False))
|
|
295
|
+
success_rate = (num_success / num_tasks * 100) if num_tasks > 0 else 0
|
|
296
|
+
|
|
297
|
+
html = f'''<!DOCTYPE html>
|
|
298
|
+
<html lang="en">
|
|
299
|
+
<head>
|
|
300
|
+
<meta charset="UTF-8">
|
|
301
|
+
<meta name="viewport" content="width=device-width, initial-scale=1.0">
|
|
302
|
+
<title>Benchmark Viewer - {metadata.get("run_name", "Unknown")}</title>
|
|
303
|
+
<style>
|
|
304
|
+
:root {{
|
|
305
|
+
--bg-primary: #0a0a0f;
|
|
306
|
+
--bg-secondary: #12121a;
|
|
307
|
+
--bg-tertiary: #1a1a24;
|
|
308
|
+
--border-color: rgba(255, 255, 255, 0.06);
|
|
309
|
+
--text-primary: #f0f0f0;
|
|
310
|
+
--text-secondary: #888;
|
|
311
|
+
--text-muted: #555;
|
|
312
|
+
--accent: #00d4aa;
|
|
313
|
+
--accent-dim: rgba(0, 212, 170, 0.15);
|
|
314
|
+
--success: #34d399;
|
|
315
|
+
--error: #ff5f5f;
|
|
316
|
+
--warning: #f59e0b;
|
|
317
|
+
}}
|
|
318
|
+
* {{ box-sizing: border-box; margin: 0; padding: 0; }}
|
|
319
|
+
body {{
|
|
320
|
+
font-family: "SF Pro Display", -apple-system, BlinkMacSystemFont, "Inter", sans-serif;
|
|
321
|
+
background: var(--bg-primary);
|
|
322
|
+
color: var(--text-primary);
|
|
323
|
+
min-height: 100vh;
|
|
324
|
+
line-height: 1.5;
|
|
325
|
+
}}
|
|
326
|
+
.container {{
|
|
327
|
+
max-width: 1600px;
|
|
328
|
+
margin: 0 auto;
|
|
329
|
+
padding: 24px;
|
|
330
|
+
}}
|
|
331
|
+
{shared_header_css}
|
|
332
|
+
|
|
333
|
+
/* Summary Panel */
|
|
334
|
+
.summary-panel {{
|
|
335
|
+
background: var(--bg-secondary);
|
|
336
|
+
border: 1px solid var(--border-color);
|
|
337
|
+
border-radius: 12px;
|
|
338
|
+
padding: 20px;
|
|
339
|
+
margin-bottom: 24px;
|
|
340
|
+
}}
|
|
341
|
+
.summary-header {{
|
|
342
|
+
display: flex;
|
|
343
|
+
justify-content: space-between;
|
|
344
|
+
align-items: center;
|
|
345
|
+
margin-bottom: 16px;
|
|
346
|
+
}}
|
|
347
|
+
.summary-header h2 {{
|
|
348
|
+
font-size: 1rem;
|
|
349
|
+
font-weight: 600;
|
|
350
|
+
}}
|
|
351
|
+
.summary-meta {{
|
|
352
|
+
font-size: 0.75rem;
|
|
353
|
+
color: var(--text-secondary);
|
|
354
|
+
font-family: "SF Mono", Monaco, monospace;
|
|
355
|
+
}}
|
|
356
|
+
.summary-stats {{
|
|
357
|
+
display: grid;
|
|
358
|
+
grid-template-columns: repeat(auto-fit, minmax(140px, 1fr));
|
|
359
|
+
gap: 16px;
|
|
360
|
+
margin-bottom: 16px;
|
|
361
|
+
}}
|
|
362
|
+
.stat-card {{
|
|
363
|
+
background: var(--bg-tertiary);
|
|
364
|
+
border-radius: 8px;
|
|
365
|
+
padding: 16px;
|
|
366
|
+
}}
|
|
367
|
+
.stat-card .stat-value {{
|
|
368
|
+
font-size: 1.8rem;
|
|
369
|
+
font-weight: 600;
|
|
370
|
+
font-family: "SF Mono", Monaco, monospace;
|
|
371
|
+
}}
|
|
372
|
+
.stat-card .stat-value.success {{ color: var(--success); }}
|
|
373
|
+
.stat-card .stat-value.error {{ color: var(--error); }}
|
|
374
|
+
.stat-card .stat-label {{
|
|
375
|
+
font-size: 0.7rem;
|
|
376
|
+
color: var(--text-muted);
|
|
377
|
+
text-transform: uppercase;
|
|
378
|
+
letter-spacing: 0.05em;
|
|
379
|
+
margin-top: 4px;
|
|
380
|
+
}}
|
|
381
|
+
|
|
382
|
+
/* Domain breakdown */
|
|
383
|
+
.domain-breakdown {{
|
|
384
|
+
display: flex;
|
|
385
|
+
flex-wrap: wrap;
|
|
386
|
+
gap: 8px;
|
|
387
|
+
}}
|
|
388
|
+
.domain-tag {{
|
|
389
|
+
display: inline-flex;
|
|
390
|
+
align-items: center;
|
|
391
|
+
gap: 6px;
|
|
392
|
+
padding: 6px 12px;
|
|
393
|
+
background: var(--bg-tertiary);
|
|
394
|
+
border-radius: 6px;
|
|
395
|
+
font-size: 0.75rem;
|
|
396
|
+
}}
|
|
397
|
+
.domain-tag .domain-name {{
|
|
398
|
+
color: var(--text-primary);
|
|
399
|
+
}}
|
|
400
|
+
.domain-tag .domain-stats {{
|
|
401
|
+
font-family: "SF Mono", Monaco, monospace;
|
|
402
|
+
color: var(--text-secondary);
|
|
403
|
+
}}
|
|
404
|
+
|
|
405
|
+
/* Filters */
|
|
406
|
+
.filter-bar {{
|
|
407
|
+
display: flex;
|
|
408
|
+
gap: 16px;
|
|
409
|
+
padding: 12px 16px;
|
|
410
|
+
background: var(--bg-secondary);
|
|
411
|
+
border: 1px solid var(--border-color);
|
|
412
|
+
border-radius: 8px;
|
|
413
|
+
margin-bottom: 16px;
|
|
414
|
+
flex-wrap: wrap;
|
|
415
|
+
align-items: center;
|
|
416
|
+
}}
|
|
417
|
+
.filter-group {{
|
|
418
|
+
display: flex;
|
|
419
|
+
align-items: center;
|
|
420
|
+
gap: 8px;
|
|
421
|
+
}}
|
|
422
|
+
.filter-label {{
|
|
423
|
+
font-size: 0.7rem;
|
|
424
|
+
color: var(--text-muted);
|
|
425
|
+
text-transform: uppercase;
|
|
426
|
+
letter-spacing: 0.05em;
|
|
427
|
+
}}
|
|
428
|
+
.filter-select {{
|
|
429
|
+
padding: 8px 32px 8px 12px;
|
|
430
|
+
border-radius: 8px;
|
|
431
|
+
font-size: 0.85rem;
|
|
432
|
+
background: var(--bg-tertiary);
|
|
433
|
+
color: var(--text-primary);
|
|
434
|
+
border: 1px solid var(--border-color);
|
|
435
|
+
cursor: pointer;
|
|
436
|
+
appearance: none;
|
|
437
|
+
background-image: url("data:image/svg+xml,%3Csvg xmlns='http://www.w3.org/2000/svg' width='12' height='12' viewBox='0 0 12 12'%3E%3Cpath fill='%23888' d='M3 4.5L6 7.5L9 4.5'/%3E%3C/svg%3E");
|
|
438
|
+
background-repeat: no-repeat;
|
|
439
|
+
background-position: right 10px center;
|
|
440
|
+
transition: all 0.2s;
|
|
441
|
+
}}
|
|
442
|
+
.filter-select:hover {{ border-color: var(--accent); }}
|
|
443
|
+
.filter-count {{
|
|
444
|
+
font-size: 0.8rem;
|
|
445
|
+
color: var(--text-secondary);
|
|
446
|
+
margin-left: auto;
|
|
447
|
+
}}
|
|
448
|
+
|
|
449
|
+
/* Main Content Layout */
|
|
450
|
+
.main-content {{
|
|
451
|
+
display: grid;
|
|
452
|
+
grid-template-columns: 350px 1fr;
|
|
453
|
+
gap: 24px;
|
|
454
|
+
}}
|
|
455
|
+
@media (max-width: 1200px) {{
|
|
456
|
+
.main-content {{ grid-template-columns: 1fr; }}
|
|
457
|
+
}}
|
|
458
|
+
|
|
459
|
+
/* Task List */
|
|
460
|
+
.task-list {{
|
|
461
|
+
background: var(--bg-secondary);
|
|
462
|
+
border: 1px solid var(--border-color);
|
|
463
|
+
border-radius: 12px;
|
|
464
|
+
max-height: calc(100vh - 300px);
|
|
465
|
+
overflow-y: auto;
|
|
466
|
+
}}
|
|
467
|
+
.task-list-header {{
|
|
468
|
+
display: flex;
|
|
469
|
+
justify-content: space-between;
|
|
470
|
+
align-items: center;
|
|
471
|
+
padding: 14px 16px;
|
|
472
|
+
border-bottom: 1px solid var(--border-color);
|
|
473
|
+
position: sticky;
|
|
474
|
+
top: 0;
|
|
475
|
+
background: var(--bg-secondary);
|
|
476
|
+
z-index: 10;
|
|
477
|
+
}}
|
|
478
|
+
.task-list-header h3 {{
|
|
479
|
+
font-size: 0.9rem;
|
|
480
|
+
font-weight: 600;
|
|
481
|
+
}}
|
|
482
|
+
.task-item {{
|
|
483
|
+
padding: 12px 16px;
|
|
484
|
+
border-bottom: 1px solid var(--border-color);
|
|
485
|
+
cursor: pointer;
|
|
486
|
+
transition: background 0.2s;
|
|
487
|
+
}}
|
|
488
|
+
.task-item:hover {{ background: var(--bg-tertiary); }}
|
|
489
|
+
.task-item.active {{
|
|
490
|
+
background: var(--accent-dim);
|
|
491
|
+
border-left: 3px solid var(--accent);
|
|
492
|
+
}}
|
|
493
|
+
.task-item.hidden {{ display: none; }}
|
|
494
|
+
.task-item .task-header {{
|
|
495
|
+
display: flex;
|
|
496
|
+
justify-content: space-between;
|
|
497
|
+
align-items: center;
|
|
498
|
+
margin-bottom: 4px;
|
|
499
|
+
}}
|
|
500
|
+
.task-item .task-id {{
|
|
501
|
+
font-family: "SF Mono", Monaco, monospace;
|
|
502
|
+
font-size: 0.8rem;
|
|
503
|
+
font-weight: 600;
|
|
504
|
+
}}
|
|
505
|
+
.task-item .task-status {{
|
|
506
|
+
font-size: 0.7rem;
|
|
507
|
+
font-weight: 600;
|
|
508
|
+
padding: 2px 8px;
|
|
509
|
+
border-radius: 4px;
|
|
510
|
+
}}
|
|
511
|
+
.task-item .task-status.success {{
|
|
512
|
+
background: rgba(52, 211, 153, 0.2);
|
|
513
|
+
color: var(--success);
|
|
514
|
+
}}
|
|
515
|
+
.task-item .task-status.fail {{
|
|
516
|
+
background: rgba(255, 95, 95, 0.2);
|
|
517
|
+
color: var(--error);
|
|
518
|
+
}}
|
|
519
|
+
.task-item .task-info {{
|
|
520
|
+
font-size: 0.75rem;
|
|
521
|
+
color: var(--text-secondary);
|
|
522
|
+
}}
|
|
523
|
+
.task-item .task-domain {{
|
|
524
|
+
color: var(--accent);
|
|
525
|
+
}}
|
|
526
|
+
|
|
527
|
+
/* Task Detail Panel */
|
|
528
|
+
.task-detail {{
|
|
529
|
+
background: var(--bg-secondary);
|
|
530
|
+
border: 1px solid var(--border-color);
|
|
531
|
+
border-radius: 12px;
|
|
532
|
+
overflow: hidden;
|
|
533
|
+
}}
|
|
534
|
+
.task-detail-header {{
|
|
535
|
+
padding: 16px 20px;
|
|
536
|
+
border-bottom: 1px solid var(--border-color);
|
|
537
|
+
}}
|
|
538
|
+
.task-detail-header h2 {{
|
|
539
|
+
font-size: 1rem;
|
|
540
|
+
font-weight: 600;
|
|
541
|
+
margin-bottom: 8px;
|
|
542
|
+
}}
|
|
543
|
+
.task-detail-meta {{
|
|
544
|
+
font-size: 0.8rem;
|
|
545
|
+
color: var(--text-secondary);
|
|
546
|
+
line-height: 1.6;
|
|
547
|
+
}}
|
|
548
|
+
.task-detail-instruction {{
|
|
549
|
+
font-style: italic;
|
|
550
|
+
color: var(--text-primary);
|
|
551
|
+
margin-top: 8px;
|
|
552
|
+
padding: 10px;
|
|
553
|
+
background: var(--bg-tertiary);
|
|
554
|
+
border-radius: 6px;
|
|
555
|
+
font-size: 0.85rem;
|
|
556
|
+
}}
|
|
557
|
+
|
|
558
|
+
/* Step Viewer */
|
|
559
|
+
.step-viewer {{
|
|
560
|
+
display: grid;
|
|
561
|
+
grid-template-columns: 1fr 300px;
|
|
562
|
+
gap: 16px;
|
|
563
|
+
padding: 16px;
|
|
564
|
+
}}
|
|
565
|
+
@media (max-width: 900px) {{
|
|
566
|
+
.step-viewer {{ grid-template-columns: 1fr; }}
|
|
567
|
+
}}
|
|
568
|
+
.screenshot-container {{
|
|
569
|
+
position: relative;
|
|
570
|
+
background: #000;
|
|
571
|
+
border-radius: 8px;
|
|
572
|
+
overflow: hidden;
|
|
573
|
+
min-height: 400px;
|
|
574
|
+
display: flex;
|
|
575
|
+
align-items: center;
|
|
576
|
+
justify-content: center;
|
|
577
|
+
}}
|
|
578
|
+
.screenshot-container img {{
|
|
579
|
+
max-width: 100%;
|
|
580
|
+
max-height: 70vh;
|
|
581
|
+
object-fit: contain;
|
|
582
|
+
}}
|
|
583
|
+
.screenshot-placeholder {{
|
|
584
|
+
color: var(--text-muted);
|
|
585
|
+
font-size: 0.9rem;
|
|
586
|
+
}}
|
|
587
|
+
.click-marker {{
|
|
588
|
+
position: absolute;
|
|
589
|
+
width: 24px;
|
|
590
|
+
height: 24px;
|
|
591
|
+
border-radius: 50%;
|
|
592
|
+
transform: translate(-50%, -50%);
|
|
593
|
+
display: flex;
|
|
594
|
+
align-items: center;
|
|
595
|
+
justify-content: center;
|
|
596
|
+
font-size: 10px;
|
|
597
|
+
font-weight: bold;
|
|
598
|
+
pointer-events: none;
|
|
599
|
+
z-index: 100;
|
|
600
|
+
background: rgba(167, 139, 250, 0.4);
|
|
601
|
+
border: 2px solid #a78bfa;
|
|
602
|
+
color: #a78bfa;
|
|
603
|
+
}}
|
|
604
|
+
|
|
605
|
+
/* Step Controls */
|
|
606
|
+
.step-sidebar {{
|
|
607
|
+
display: flex;
|
|
608
|
+
flex-direction: column;
|
|
609
|
+
gap: 16px;
|
|
610
|
+
}}
|
|
611
|
+
.step-controls {{
|
|
612
|
+
display: flex;
|
|
613
|
+
gap: 8px;
|
|
614
|
+
flex-wrap: wrap;
|
|
615
|
+
align-items: center;
|
|
616
|
+
}}
|
|
617
|
+
.step-btn {{
|
|
618
|
+
padding: 8px 12px;
|
|
619
|
+
border: 1px solid var(--border-color);
|
|
620
|
+
background: var(--bg-tertiary);
|
|
621
|
+
color: var(--text-primary);
|
|
622
|
+
border-radius: 6px;
|
|
623
|
+
cursor: pointer;
|
|
624
|
+
font-size: 0.85rem;
|
|
625
|
+
min-width: 40px;
|
|
626
|
+
text-align: center;
|
|
627
|
+
transition: all 0.2s;
|
|
628
|
+
}}
|
|
629
|
+
.step-btn:hover {{ border-color: var(--accent); }}
|
|
630
|
+
.step-btn.primary {{ flex: 1; min-width: 60px; }}
|
|
631
|
+
.step-btn.active {{
|
|
632
|
+
background: var(--accent);
|
|
633
|
+
color: var(--bg-primary);
|
|
634
|
+
border-color: var(--accent);
|
|
635
|
+
}}
|
|
636
|
+
.step-progress {{
|
|
637
|
+
font-size: 0.8rem;
|
|
638
|
+
color: var(--text-secondary);
|
|
639
|
+
font-family: "SF Mono", Monaco, monospace;
|
|
640
|
+
}}
|
|
641
|
+
|
|
642
|
+
/* Step List */
|
|
643
|
+
.step-list {{
|
|
644
|
+
background: var(--bg-tertiary);
|
|
645
|
+
border-radius: 8px;
|
|
646
|
+
max-height: 300px;
|
|
647
|
+
overflow-y: auto;
|
|
648
|
+
}}
|
|
649
|
+
.step-list-item {{
|
|
650
|
+
padding: 10px 12px;
|
|
651
|
+
border-bottom: 1px solid var(--border-color);
|
|
652
|
+
cursor: pointer;
|
|
653
|
+
transition: background 0.2s;
|
|
654
|
+
font-size: 0.8rem;
|
|
655
|
+
}}
|
|
656
|
+
.step-list-item:hover {{ background: var(--bg-secondary); }}
|
|
657
|
+
.step-list-item.active {{
|
|
658
|
+
background: var(--accent-dim);
|
|
659
|
+
border-left: 2px solid var(--accent);
|
|
660
|
+
}}
|
|
661
|
+
.step-list-item .step-num {{
|
|
662
|
+
font-weight: 600;
|
|
663
|
+
color: var(--accent);
|
|
664
|
+
margin-right: 8px;
|
|
665
|
+
}}
|
|
666
|
+
.step-list-item .step-action {{
|
|
667
|
+
color: var(--text-secondary);
|
|
668
|
+
}}
|
|
669
|
+
|
|
670
|
+
/* Action Detail */
|
|
671
|
+
.action-detail {{
|
|
672
|
+
background: var(--bg-tertiary);
|
|
673
|
+
border-radius: 8px;
|
|
674
|
+
padding: 12px;
|
|
675
|
+
}}
|
|
676
|
+
.action-detail h4 {{
|
|
677
|
+
font-size: 0.8rem;
|
|
678
|
+
color: var(--text-muted);
|
|
679
|
+
text-transform: uppercase;
|
|
680
|
+
letter-spacing: 0.05em;
|
|
681
|
+
margin-bottom: 8px;
|
|
682
|
+
}}
|
|
683
|
+
.action-content {{
|
|
684
|
+
font-family: "SF Mono", Monaco, monospace;
|
|
685
|
+
font-size: 0.8rem;
|
|
686
|
+
color: var(--text-primary);
|
|
687
|
+
word-break: break-word;
|
|
688
|
+
}}
|
|
689
|
+
.reasoning-box {{
|
|
690
|
+
margin-top: 12px;
|
|
691
|
+
padding: 10px;
|
|
692
|
+
background: var(--bg-secondary);
|
|
693
|
+
border-radius: 6px;
|
|
694
|
+
font-size: 0.8rem;
|
|
695
|
+
color: var(--text-secondary);
|
|
696
|
+
line-height: 1.6;
|
|
697
|
+
max-height: 200px;
|
|
698
|
+
overflow-y: auto;
|
|
699
|
+
}}
|
|
700
|
+
.reasoning-box h4 {{
|
|
701
|
+
margin-bottom: 8px;
|
|
702
|
+
}}
|
|
703
|
+
|
|
704
|
+
/* Speed Control */
|
|
705
|
+
.speed-control {{
|
|
706
|
+
display: flex;
|
|
707
|
+
align-items: center;
|
|
708
|
+
gap: 6px;
|
|
709
|
+
margin-left: auto;
|
|
710
|
+
}}
|
|
711
|
+
.speed-control label {{
|
|
712
|
+
font-size: 0.7rem;
|
|
713
|
+
color: var(--text-muted);
|
|
714
|
+
text-transform: uppercase;
|
|
715
|
+
}}
|
|
716
|
+
.speed-control select {{
|
|
717
|
+
padding: 4px 8px;
|
|
718
|
+
border-radius: 4px;
|
|
719
|
+
background: var(--bg-tertiary);
|
|
720
|
+
color: var(--text-primary);
|
|
721
|
+
border: 1px solid var(--border-color);
|
|
722
|
+
font-size: 0.8rem;
|
|
723
|
+
cursor: pointer;
|
|
724
|
+
}}
|
|
725
|
+
|
|
726
|
+
/* Progress Bar */
|
|
727
|
+
.progress-bar {{
|
|
728
|
+
width: 100%;
|
|
729
|
+
height: 4px;
|
|
730
|
+
background: var(--bg-tertiary);
|
|
731
|
+
border-radius: 2px;
|
|
732
|
+
margin-top: 8px;
|
|
733
|
+
overflow: hidden;
|
|
734
|
+
cursor: pointer;
|
|
735
|
+
}}
|
|
736
|
+
.progress-bar .progress {{
|
|
737
|
+
height: 100%;
|
|
738
|
+
background: var(--accent);
|
|
739
|
+
transition: width 0.1s ease;
|
|
740
|
+
}}
|
|
741
|
+
|
|
742
|
+
/* No task selected state */
|
|
743
|
+
.no-task-selected {{
|
|
744
|
+
display: flex;
|
|
745
|
+
flex-direction: column;
|
|
746
|
+
align-items: center;
|
|
747
|
+
justify-content: center;
|
|
748
|
+
min-height: 400px;
|
|
749
|
+
color: var(--text-muted);
|
|
750
|
+
}}
|
|
751
|
+
.no-task-selected .icon {{
|
|
752
|
+
font-size: 3rem;
|
|
753
|
+
margin-bottom: 16px;
|
|
754
|
+
}}
|
|
755
|
+
.no-task-selected p {{
|
|
756
|
+
font-size: 0.9rem;
|
|
757
|
+
}}
|
|
758
|
+
</style>
|
|
759
|
+
</head>
|
|
760
|
+
<body>
|
|
761
|
+
{shared_header_html}
|
|
762
|
+
|
|
763
|
+
<div class="container">
|
|
764
|
+
<!-- Summary Panel -->
|
|
765
|
+
<div class="summary-panel">
|
|
766
|
+
<div class="summary-header">
|
|
767
|
+
<h2>Benchmark Results: {metadata.get("run_name", "Unknown")}</h2>
|
|
768
|
+
<div class="summary-meta">
|
|
769
|
+
<span>Model: {metadata.get("model_id", "unknown")}</span>
|
|
770
|
+
<span> | </span>
|
|
771
|
+
<span>Created: {metadata.get("created_at", "N/A")}</span>
|
|
772
|
+
</div>
|
|
773
|
+
</div>
|
|
774
|
+
<div class="summary-stats">
|
|
775
|
+
<div class="stat-card">
|
|
776
|
+
<div class="stat-value">{num_tasks}</div>
|
|
777
|
+
<div class="stat-label">Total Tasks</div>
|
|
778
|
+
</div>
|
|
779
|
+
<div class="stat-card">
|
|
780
|
+
<div class="stat-value success">{num_success}</div>
|
|
781
|
+
<div class="stat-label">Passed</div>
|
|
782
|
+
</div>
|
|
783
|
+
<div class="stat-card">
|
|
784
|
+
<div class="stat-value error">{num_tasks - num_success}</div>
|
|
785
|
+
<div class="stat-label">Failed</div>
|
|
786
|
+
</div>
|
|
787
|
+
<div class="stat-card">
|
|
788
|
+
<div class="stat-value {'success' if success_rate >= 50 else 'error'}">{success_rate:.1f}%</div>
|
|
789
|
+
<div class="stat-label">Success Rate</div>
|
|
790
|
+
</div>
|
|
791
|
+
</div>
|
|
792
|
+
<div class="domain-breakdown" id="domain-breakdown"></div>
|
|
793
|
+
</div>
|
|
794
|
+
|
|
795
|
+
<!-- Filters -->
|
|
796
|
+
<div class="filter-bar">
|
|
797
|
+
<div class="filter-group">
|
|
798
|
+
<span class="filter-label">Domain:</span>
|
|
799
|
+
<select class="filter-select" id="domain-filter">
|
|
800
|
+
<option value="all">All Domains</option>
|
|
801
|
+
</select>
|
|
802
|
+
</div>
|
|
803
|
+
<div class="filter-group">
|
|
804
|
+
<span class="filter-label">Status:</span>
|
|
805
|
+
<select class="filter-select" id="status-filter">
|
|
806
|
+
<option value="all">All</option>
|
|
807
|
+
<option value="success">Passed</option>
|
|
808
|
+
<option value="fail">Failed</option>
|
|
809
|
+
</select>
|
|
810
|
+
</div>
|
|
811
|
+
<span class="filter-count" id="filter-count">{num_tasks} tasks</span>
|
|
812
|
+
</div>
|
|
813
|
+
|
|
814
|
+
<!-- Main Content -->
|
|
815
|
+
<div class="main-content">
|
|
816
|
+
<!-- Task List -->
|
|
817
|
+
<div class="task-list">
|
|
818
|
+
<div class="task-list-header">
|
|
819
|
+
<h3>Tasks</h3>
|
|
820
|
+
</div>
|
|
821
|
+
<div id="task-list-items"></div>
|
|
822
|
+
</div>
|
|
823
|
+
|
|
824
|
+
<!-- Task Detail Panel -->
|
|
825
|
+
<div class="task-detail" id="task-detail">
|
|
826
|
+
<div class="no-task-selected" id="no-task-selected">
|
|
827
|
+
<div class="icon">+</div>
|
|
828
|
+
<p>Select a task from the list to view details</p>
|
|
829
|
+
</div>
|
|
830
|
+
<div id="task-detail-content" style="display:none;"></div>
|
|
831
|
+
</div>
|
|
832
|
+
</div>
|
|
833
|
+
</div>
|
|
834
|
+
|
|
835
|
+
<script>
|
|
836
|
+
// Data from Python
|
|
837
|
+
const metadata = {metadata_json}; // JSON text substituted by the Python f-string
|
|
838
|
+
const summary = {summary_json}; // JSON text substituted by the Python f-string
|
|
839
|
+
const domainStats = {domain_stats_json}; // keyed by domain name; each value is read as .success and .total
|
|
840
|
+
const tasks = {tasks_json}; // array of task records; NOTE(review): confirm the Python side escapes any script end tag inside this JSON
|
|
841
|
+
const embedScreenshots = {'true' if embed_screenshots else 'false'}; // JS boolean literal chosen by the Python embed_screenshots flag
|
|
842
|
+
|
|
843
|
+
let currentTaskIndex = -1; // index into tasks; negative until selectTask assigns one
|
|
844
|
+
let currentStepIndex = 0; // index into the current task's execution.steps
|
|
845
|
+
let isPlaying = false; // true between startPlay and stopPlay
|
|
846
|
+
let playInterval = null; // handle returned by setInterval while playing, else null
|
|
847
|
+
let playSpeed = 1000; // delay passed to setInterval, in milliseconds
|
|
848
|
+
|
|
849
|
+
// Initialize page
|
|
850
|
+
function init() {{
    // Page bootstrap (registered for DOMContentLoaded at the end of the
    // script): draw the per-domain summary, fill the domain dropdown, draw
    // the task list, then attach the filter listeners, in that order.
    const bootSteps = [
        renderDomainBreakdown,
        populateDomainFilter,
        renderTaskList,
        setupFilters,
    ];
    bootSteps.forEach((runStep) => runStep());
}}
|
|
856
|
+
|
|
857
|
+
function renderDomainBreakdown() {{
|
|
858
|
+
const container = document.getElementById('domain-breakdown');
|
|
859
|
+
let html = '';
|
|
860
|
+
for (const [domain, stats] of Object.entries(domainStats)) {{
|
|
861
|
+
const rate = stats.total > 0 ? (stats.success / stats.total * 100).toFixed(0) : 0;
|
|
862
|
+
html += `
|
|
863
|
+
<div class="domain-tag">
|
|
864
|
+
<span class="domain-name">${{domain}}</span>
|
|
865
|
+
<span class="domain-stats">${{stats.success}}/${{stats.total}} (${{rate}}%)</span>
|
|
866
|
+
</div>
|
|
867
|
+
`;
|
|
868
|
+
}}
|
|
869
|
+
container.innerHTML = html;
|
|
870
|
+
}}
|
|
871
|
+
|
|
872
|
+
function populateDomainFilter() {{
    // Append one option per domain to the #domain-filter dropdown, after
    // the static "All Domains" entry that is already in the markup.
    // Names are ordered with the default Array sort.
    const dropdown = document.getElementById('domain-filter');
    const sortedNames = Object.keys(domainStats).sort();
    sortedNames.forEach((name) => {{
        const entry = document.createElement('option');
        entry.textContent = name;
        entry.value = name;
        dropdown.appendChild(entry);
    }});
}}
|
|
881
|
+
|
|
882
|
+
function renderTaskList() {{
|
|
883
|
+
const container = document.getElementById('task-list-items');
|
|
884
|
+
let html = '';
|
|
885
|
+
tasks.forEach((task, idx) => {{
|
|
886
|
+
const def = task.definition || {{}};
|
|
887
|
+
const exec = task.execution || {{}};
|
|
888
|
+
const success = exec.success || false;
|
|
889
|
+
const domain = def.domain || 'unknown';
|
|
890
|
+
const numSteps = exec.num_steps || 0;
|
|
891
|
+
|
|
892
|
+
html += `
|
|
893
|
+
<div class="task-item" data-idx="${{idx}}" data-domain="${{domain}}" data-status="${{success ? 'success' : 'fail'}}" onclick="selectTask(${{idx}})">
|
|
894
|
+
<div class="task-header">
|
|
895
|
+
<span class="task-id">${{task.task_id}}</span>
|
|
896
|
+
<span class="task-status ${{success ? 'success' : 'fail'}}">${{success ? 'PASS' : 'FAIL'}}</span>
|
|
897
|
+
</div>
|
|
898
|
+
<div class="task-info">
|
|
899
|
+
<span class="task-domain">${{domain}}</span>
|
|
900
|
+
<span> | ${{numSteps}} steps</span>
|
|
901
|
+
</div>
|
|
902
|
+
</div>
|
|
903
|
+
`;
|
|
904
|
+
}});
|
|
905
|
+
container.innerHTML = html;
|
|
906
|
+
}}
|
|
907
|
+
|
|
908
|
+
function setupFilters() {{
    // Re-run filterTasks whenever either filter dropdown changes.
    for (const selectId of ['domain-filter', 'status-filter']) {{
        document.getElementById(selectId).addEventListener('change', filterTasks);
    }}
}}
|
|
912
|
+
|
|
913
|
+
function filterTasks() {{
    // Hide the task rows that do not match the selected domain and status,
    // then refresh the "N tasks" counter with the number still visible.
    // A dropdown value of "all" matches every row.
    const wantedDomain = document.getElementById('domain-filter').value;
    const wantedStatus = document.getElementById('status-filter').value;

    let shown = 0;
    for (const row of document.querySelectorAll('.task-item')) {{
        const domainOk = wantedDomain === 'all' || row.dataset.domain === wantedDomain;
        const statusOk = wantedStatus === 'all' || row.dataset.status === wantedStatus;
        const visible = domainOk && statusOk;
        // The second argument forces the class on (true) or off (false).
        row.classList.toggle('hidden', !visible);
        if (visible) {{
            shown += 1;
        }}
    }}

    document.getElementById('filter-count').textContent = `${{shown}} tasks`;
}}
|
|
935
|
+
|
|
936
|
+
function selectTask(idx) {{
    // Make task `idx` the current task, highlight its row and show its
    // detail panel starting at step 0.
    //
    // idx: position of the task in the tasks array.

    // Cancel any running playback first. The detail panel, including the
    // Play button, is rebuilt below; a timer left over from the previous
    // task would keep advancing the new task while the fresh button still
    // reads "Play". This is done without touching the DOM so it is safe
    // even when no panel has been rendered yet.
    if (playInterval) {{
        clearInterval(playInterval);
        playInterval = null;
    }}
    isPlaying = false;

    currentTaskIndex = idx;
    currentStepIndex = 0;

    // Update active state in list
    document.querySelectorAll('.task-item').forEach((item) => {{
        item.classList.toggle('active', parseInt(item.dataset.idx, 10) === idx);
    }});

    // Swap the placeholder for the detail panel
    document.getElementById('no-task-selected').style.display = 'none';
    document.getElementById('task-detail-content').style.display = 'block';

    renderTaskDetail();

    // The speed dropdown is re-created with "1x" preselected; show the
    // speed chosen earlier so the control agrees with playSpeed.
    const speedSelect = document.getElementById('speed-select');
    if (speedSelect) {{
        speedSelect.value = String(playSpeed);
    }}
}}
|
|
951
|
+
|
|
952
|
+
function renderTaskDetail() {{
|
|
953
|
+
if (currentTaskIndex < 0) return;
|
|
954
|
+
|
|
955
|
+
const task = tasks[currentTaskIndex];
|
|
956
|
+
const def = task.definition || {{}};
|
|
957
|
+
const exec = task.execution || {{}};
|
|
958
|
+
const steps = exec.steps || [];
|
|
959
|
+
const success = exec.success || false;
|
|
960
|
+
|
|
961
|
+
const container = document.getElementById('task-detail-content');
|
|
962
|
+
container.innerHTML = `
|
|
963
|
+
<div class="task-detail-header">
|
|
964
|
+
<h2>${{task.task_id}} - <span style="color: ${{success ? 'var(--success)' : 'var(--error)'}}">${{success ? 'PASSED' : 'FAILED'}}</span></h2>
|
|
965
|
+
<div class="task-detail-meta">
|
|
966
|
+
Domain: <strong>${{def.domain || 'unknown'}}</strong> |
|
|
967
|
+
Steps: <strong>${{exec.num_steps || steps.length}}</strong> |
|
|
968
|
+
Time: <strong>${{(exec.total_time_seconds || 0).toFixed(1)}}s</strong>
|
|
969
|
+
${{exec.error ? `<br>Error: <span style="color:var(--error)">${{exec.error}}</span>` : ''}}
|
|
970
|
+
</div>
|
|
971
|
+
<div class="task-detail-instruction">
|
|
972
|
+
${{def.instruction || 'No instruction available'}}
|
|
973
|
+
</div>
|
|
974
|
+
</div>
|
|
975
|
+
<div class="step-viewer">
|
|
976
|
+
<div class="screenshot-container" id="screenshot-container">
|
|
977
|
+
${{steps.length > 0 ? '<img id="screenshot-img" src="" alt="Step screenshot">' : '<span class="screenshot-placeholder">No screenshots available</span>'}}
|
|
978
|
+
</div>
|
|
979
|
+
<div class="step-sidebar">
|
|
980
|
+
<div class="step-controls">
|
|
981
|
+
<button class="step-btn" onclick="prevStep()">Prev</button>
|
|
982
|
+
<button class="step-btn primary" id="play-btn" onclick="togglePlay()">Play</button>
|
|
983
|
+
<button class="step-btn" onclick="nextStep()">Next</button>
|
|
984
|
+
<span class="step-progress" id="step-progress">0 / ${{steps.length}}</span>
|
|
985
|
+
<div class="speed-control">
|
|
986
|
+
<label>Speed</label>
|
|
987
|
+
<select id="speed-select" onchange="changeSpeed(this.value)">
|
|
988
|
+
<option value="2000">0.5x</option>
|
|
989
|
+
<option value="1000" selected>1x</option>
|
|
990
|
+
<option value="500">2x</option>
|
|
991
|
+
<option value="250">4x</option>
|
|
992
|
+
</select>
|
|
993
|
+
</div>
|
|
994
|
+
</div>
|
|
995
|
+
<div class="progress-bar" onclick="seekStep(event)">
|
|
996
|
+
<div class="progress" id="step-progress-bar" style="width: 0%"></div>
|
|
997
|
+
</div>
|
|
998
|
+
<div class="step-list" id="step-list"></div>
|
|
999
|
+
<div class="action-detail" id="action-detail">
|
|
1000
|
+
<h4>Action</h4>
|
|
1001
|
+
<div class="action-content" id="action-content">-</div>
|
|
1002
|
+
</div>
|
|
1003
|
+
<div class="reasoning-box" id="reasoning-box" style="display:none;">
|
|
1004
|
+
<h4>Reasoning</h4>
|
|
1005
|
+
<div id="reasoning-content"></div>
|
|
1006
|
+
</div>
|
|
1007
|
+
</div>
|
|
1008
|
+
</div>
|
|
1009
|
+
`;
|
|
1010
|
+
|
|
1011
|
+
renderStepList();
|
|
1012
|
+
if (steps.length > 0) {{
|
|
1013
|
+
updateStep();
|
|
1014
|
+
}}
|
|
1015
|
+
}}
|
|
1016
|
+
|
|
1017
|
+
function renderStepList() {{
|
|
1018
|
+
if (currentTaskIndex < 0) return;
|
|
1019
|
+
|
|
1020
|
+
const task = tasks[currentTaskIndex];
|
|
1021
|
+
const steps = task.execution?.steps || [];
|
|
1022
|
+
const container = document.getElementById('step-list');
|
|
1023
|
+
|
|
1024
|
+
let html = '';
|
|
1025
|
+
steps.forEach((step, idx) => {{
|
|
1026
|
+
const action = step.action || {{}};
|
|
1027
|
+
const actionType = action.type || 'unknown';
|
|
1028
|
+
html += `
|
|
1029
|
+
<div class="step-list-item ${{idx === currentStepIndex ? 'active' : ''}}" onclick="goToStep(${{idx}})">
|
|
1030
|
+
<span class="step-num">#${{idx}}</span>
|
|
1031
|
+
<span class="step-action">${{actionType.toUpperCase()}}</span>
|
|
1032
|
+
</div>
|
|
1033
|
+
`;
|
|
1034
|
+
}});
|
|
1035
|
+
container.innerHTML = html || '<div style="padding:12px;color:var(--text-muted);">No steps</div>';
|
|
1036
|
+
}}
|
|
1037
|
+
|
|
1038
|
+
function updateStep() {{
|
|
1039
|
+
if (currentTaskIndex < 0) return;
|
|
1040
|
+
|
|
1041
|
+
const task = tasks[currentTaskIndex];
|
|
1042
|
+
const steps = task.execution?.steps || [];
|
|
1043
|
+
const screenshots = task.screenshots || [];
|
|
1044
|
+
|
|
1045
|
+
if (steps.length === 0) return;
|
|
1046
|
+
|
|
1047
|
+
const step = steps[currentStepIndex] || {{}};
|
|
1048
|
+
const action = step.action || {{}};
|
|
1049
|
+
|
|
1050
|
+
// Update screenshot
|
|
1051
|
+
const img = document.getElementById('screenshot-img');
|
|
1052
|
+
if (img) {{
|
|
1053
|
+
if (embedScreenshots && task.embedded_screenshots && task.embedded_screenshots[currentStepIndex]) {{
|
|
1054
|
+
img.src = task.embedded_screenshots[currentStepIndex];
|
|
1055
|
+
}} else if (screenshots[currentStepIndex]) {{
|
|
1056
|
+
img.src = screenshots[currentStepIndex];
|
|
1057
|
+
}} else if (step.screenshot_path) {{
|
|
1058
|
+
img.src = step.screenshot_path;
|
|
1059
|
+
}} else {{
|
|
1060
|
+
img.src = '';
|
|
1061
|
+
}}
|
|
1062
|
+
}}
|
|
1063
|
+
|
|
1064
|
+
// Update click marker if action has coordinates
|
|
1065
|
+
const container = document.getElementById('screenshot-container');
|
|
1066
|
+
// Remove existing markers
|
|
1067
|
+
container.querySelectorAll('.click-marker').forEach(m => m.remove());
|
|
1068
|
+
|
|
1069
|
+
if (action.x !== null && action.y !== null && action.x !== undefined && action.y !== undefined) {{
|
|
1070
|
+
const marker = document.createElement('div');
|
|
1071
|
+
marker.className = 'click-marker';
|
|
1072
|
+
marker.style.left = `${{action.x * 100}}%`;
|
|
1073
|
+
marker.style.top = `${{action.y * 100}}%`;
|
|
1074
|
+
marker.textContent = 'AI';
|
|
1075
|
+
container.appendChild(marker);
|
|
1076
|
+
}}
|
|
1077
|
+
|
|
1078
|
+
// Update progress
|
|
1079
|
+
document.getElementById('step-progress').textContent = `${{currentStepIndex + 1}} / ${{steps.length}}`;
|
|
1080
|
+
const progressPct = steps.length > 1 ? (currentStepIndex / (steps.length - 1)) * 100 : 0;
|
|
1081
|
+
document.getElementById('step-progress-bar').style.width = `${{progressPct}}%`;
|
|
1082
|
+
|
|
1083
|
+
// Update action detail
|
|
1084
|
+
const actionContent = document.getElementById('action-content');
|
|
1085
|
+
let actionText = action.type ? action.type.toUpperCase() : 'unknown';
|
|
1086
|
+
if (action.x !== null && action.y !== null && action.x !== undefined && action.y !== undefined) {{
|
|
1087
|
+
actionText += ` (${{(action.x * 100).toFixed(1)}}%, ${{(action.y * 100).toFixed(1)}}%)`;
|
|
1088
|
+
}}
|
|
1089
|
+
if (action.text) {{
|
|
1090
|
+
actionText += ` "${{action.text}}"`;
|
|
1091
|
+
}}
|
|
1092
|
+
if (action.key) {{
|
|
1093
|
+
actionText += ` [${{action.key}}]`;
|
|
1094
|
+
}}
|
|
1095
|
+
actionContent.textContent = actionText;
|
|
1096
|
+
|
|
1097
|
+
// Update reasoning
|
|
1098
|
+
const reasoningBox = document.getElementById('reasoning-box');
|
|
1099
|
+
const reasoningContent = document.getElementById('reasoning-content');
|
|
1100
|
+
if (step.reasoning) {{
|
|
1101
|
+
reasoningBox.style.display = 'block';
|
|
1102
|
+
reasoningContent.textContent = step.reasoning;
|
|
1103
|
+
}} else {{
|
|
1104
|
+
reasoningBox.style.display = 'none';
|
|
1105
|
+
}}
|
|
1106
|
+
|
|
1107
|
+
// Update step list active state
|
|
1108
|
+
document.querySelectorAll('.step-list-item').forEach((item, idx) => {{
|
|
1109
|
+
item.classList.toggle('active', idx === currentStepIndex);
|
|
1110
|
+
}});
|
|
1111
|
+
}}
|
|
1112
|
+
|
|
1113
|
+
function prevStep() {{
    // Move back one step and redraw, unless already at the first step.
    if (!(currentStepIndex > 0)) return;
    currentStepIndex -= 1;
    updateStep();
}}
|
|
1119
|
+
|
|
1120
|
+
function nextStep() {{
    // Advance one step and redraw. At the last step there is nothing to
    // advance to, so playback is stopped if it is running. This is also
    // the callback that startPlay hands to setInterval.
    const stepCount = (tasks[currentTaskIndex]?.execution?.steps || []).length;
    if (currentStepIndex < stepCount - 1) {{
        currentStepIndex += 1;
        updateStep();
        return;
    }}
    if (isPlaying) {{
        stopPlay();
    }}
}}
|
|
1130
|
+
|
|
1131
|
+
function goToStep(idx) {{
    // Jump to step `idx` of the current task and redraw the viewer.
    //
    // idx: zero-based step index. Values that are not non-negative integers
    // are ignored, so a computed "last step" of -1 for a task without steps
    // cannot be stored as the current step index.
    if (!Number.isInteger(idx) || idx < 0) return;
    currentStepIndex = idx;
    updateStep();
}}
|
|
1135
|
+
|
|
1136
|
+
function seekStep(event) {{
    // Click handler for the progress bar: map the horizontal click position
    // to a step index, clamp it to the valid range, and redraw.
    //
    // event: the click event; event.currentTarget is the bar element.
    const stepCount = (tasks[currentTaskIndex]?.execution?.steps || []).length;
    if (stepCount === 0) return;

    const track = event.currentTarget.getBoundingClientRect();
    const fraction = (event.clientX - track.left) / track.width;
    const rawIndex = Math.floor(fraction * stepCount);
    currentStepIndex = Math.max(0, Math.min(rawIndex, stepCount - 1));
    updateStep();
}}
|
|
1148
|
+
|
|
1149
|
+
function togglePlay() {{
    // Flip playback: stop when currently playing, start otherwise.
    const flip = isPlaying ? stopPlay : startPlay;
    flip();
}}
|
|
1156
|
+
|
|
1157
|
+
function startPlay() {{
    // Begin automatic playback: mark the Play button as active, relabel it
    // "Pause", and call nextStep every playSpeed milliseconds.
    //
    // The Play button exists only after a task detail panel has been
    // rendered. Without it there is nothing to play, so return before
    // changing any state; previously this path set isPlaying to true and
    // then threw on the missing button.
    const playBtn = document.getElementById('play-btn');
    if (!playBtn) return;

    // Never stack timers if playback is started twice in a row.
    if (playInterval) {{
        clearInterval(playInterval);
    }}

    isPlaying = true;
    playBtn.textContent = 'Pause';
    playBtn.classList.add('active');
    playInterval = setInterval(nextStep, playSpeed);
}}
|
|
1163
|
+
|
|
1164
|
+
function stopPlay() {{
    // End automatic playback: restore the Play button label and style,
    // then cancel the timer if one is running.
    isPlaying = false;

    const playBtn = document.getElementById('play-btn');
    playBtn.textContent = 'Play';
    playBtn.classList.remove('active');

    if (!playInterval) return;
    clearInterval(playInterval);
    playInterval = null;
}}
|
|
1173
|
+
|
|
1174
|
+
function changeSpeed(value) {{
    // Change handler for the speed dropdown.
    //
    // value: the selected option value, a delay in milliseconds as text.
    playSpeed = parseInt(value);
    if (!isPlaying) return;
    // Restart playback so the running timer picks up the new delay.
    stopPlay();
    startPlay();
}}
|
|
1181
|
+
|
|
1182
|
+
// Keyboard shortcuts
|
|
1183
|
+
document.addEventListener('keydown', (e) => {{
|
|
1184
|
+
if (e.target.tagName === 'INPUT' || e.target.tagName === 'TEXTAREA') return;
|
|
1185
|
+
|
|
1186
|
+
switch (e.key) {{
|
|
1187
|
+
case ' ':
|
|
1188
|
+
e.preventDefault();
|
|
1189
|
+
togglePlay();
|
|
1190
|
+
break;
|
|
1191
|
+
case 'ArrowLeft':
|
|
1192
|
+
e.preventDefault();
|
|
1193
|
+
prevStep();
|
|
1194
|
+
break;
|
|
1195
|
+
case 'ArrowRight':
|
|
1196
|
+
e.preventDefault();
|
|
1197
|
+
nextStep();
|
|
1198
|
+
break;
|
|
1199
|
+
case 'Home':
|
|
1200
|
+
e.preventDefault();
|
|
1201
|
+
goToStep(0);
|
|
1202
|
+
break;
|
|
1203
|
+
case 'End':
|
|
1204
|
+
e.preventDefault();
|
|
1205
|
+
const task = tasks[currentTaskIndex];
|
|
1206
|
+
const steps = task?.execution?.steps || [];
|
|
1207
|
+
goToStep(steps.length - 1);
|
|
1208
|
+
break;
|
|
1209
|
+
}}
|
|
1210
|
+
}});
|
|
1211
|
+
|
|
1212
|
+
// Initialize on load
|
|
1213
|
+
document.addEventListener('DOMContentLoaded', init);
|
|
1214
|
+
</script>
|
|
1215
|
+
</body>
|
|
1216
|
+
</html>
|
|
1217
|
+
'''
|
|
1218
|
+
|
|
1219
|
+
return html
|