openadapt-ml 0.1.0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (55) hide show
  1. openadapt_ml/__init__.py +0 -0
  2. openadapt_ml/benchmarks/__init__.py +125 -0
  3. openadapt_ml/benchmarks/agent.py +825 -0
  4. openadapt_ml/benchmarks/azure.py +761 -0
  5. openadapt_ml/benchmarks/base.py +366 -0
  6. openadapt_ml/benchmarks/cli.py +884 -0
  7. openadapt_ml/benchmarks/data_collection.py +432 -0
  8. openadapt_ml/benchmarks/runner.py +381 -0
  9. openadapt_ml/benchmarks/waa.py +704 -0
  10. openadapt_ml/cloud/__init__.py +5 -0
  11. openadapt_ml/cloud/azure_inference.py +441 -0
  12. openadapt_ml/cloud/lambda_labs.py +2445 -0
  13. openadapt_ml/cloud/local.py +790 -0
  14. openadapt_ml/config.py +56 -0
  15. openadapt_ml/datasets/__init__.py +0 -0
  16. openadapt_ml/datasets/next_action.py +507 -0
  17. openadapt_ml/evals/__init__.py +23 -0
  18. openadapt_ml/evals/grounding.py +241 -0
  19. openadapt_ml/evals/plot_eval_metrics.py +174 -0
  20. openadapt_ml/evals/trajectory_matching.py +486 -0
  21. openadapt_ml/grounding/__init__.py +45 -0
  22. openadapt_ml/grounding/base.py +236 -0
  23. openadapt_ml/grounding/detector.py +570 -0
  24. openadapt_ml/ingest/__init__.py +43 -0
  25. openadapt_ml/ingest/capture.py +312 -0
  26. openadapt_ml/ingest/loader.py +232 -0
  27. openadapt_ml/ingest/synthetic.py +1102 -0
  28. openadapt_ml/models/__init__.py +0 -0
  29. openadapt_ml/models/api_adapter.py +171 -0
  30. openadapt_ml/models/base_adapter.py +59 -0
  31. openadapt_ml/models/dummy_adapter.py +42 -0
  32. openadapt_ml/models/qwen_vl.py +426 -0
  33. openadapt_ml/runtime/__init__.py +0 -0
  34. openadapt_ml/runtime/policy.py +182 -0
  35. openadapt_ml/schemas/__init__.py +53 -0
  36. openadapt_ml/schemas/sessions.py +122 -0
  37. openadapt_ml/schemas/validation.py +252 -0
  38. openadapt_ml/scripts/__init__.py +0 -0
  39. openadapt_ml/scripts/compare.py +1490 -0
  40. openadapt_ml/scripts/demo_policy.py +62 -0
  41. openadapt_ml/scripts/eval_policy.py +287 -0
  42. openadapt_ml/scripts/make_gif.py +153 -0
  43. openadapt_ml/scripts/prepare_synthetic.py +43 -0
  44. openadapt_ml/scripts/run_qwen_login_benchmark.py +192 -0
  45. openadapt_ml/scripts/train.py +174 -0
  46. openadapt_ml/training/__init__.py +0 -0
  47. openadapt_ml/training/benchmark_viewer.py +1538 -0
  48. openadapt_ml/training/shared_ui.py +157 -0
  49. openadapt_ml/training/stub_provider.py +276 -0
  50. openadapt_ml/training/trainer.py +2446 -0
  51. openadapt_ml/training/viewer.py +2970 -0
  52. openadapt_ml-0.1.0.dist-info/METADATA +818 -0
  53. openadapt_ml-0.1.0.dist-info/RECORD +55 -0
  54. openadapt_ml-0.1.0.dist-info/WHEEL +4 -0
  55. openadapt_ml-0.1.0.dist-info/licenses/LICENSE +21 -0
@@ -0,0 +1,1538 @@
1
+ """Benchmark viewer generation functions.
2
+
3
+ This module provides functions to generate HTML viewers for benchmark evaluation results.
4
+ It is imported and used by trainer.py to maintain consistency with other viewer components.
5
+ """
6
+
7
+ from __future__ import annotations
8
+
9
+ import json
10
+ from pathlib import Path
11
+
12
+
13
def generate_benchmark_viewer(
    benchmark_dir: Path | str,
    output_path: Path | str | None = None,
) -> Path:
    """Generate benchmark viewer HTML from benchmark results directory.

    Reads ``metadata.json`` (required), ``summary.json`` (optional) and every
    ``tasks/<dir>/task.json`` + ``execution.json`` pair found under
    ``benchmark_dir``, then renders them into a single HTML page.

    Args:
        benchmark_dir: Path to benchmark results directory (e.g., benchmark_results/waa_eval_20241214/)
        output_path: Optional path for output benchmark.html (default: benchmark_dir/benchmark.html)

    Returns:
        Path to generated benchmark.html file

    Raises:
        FileNotFoundError: If ``benchmark_dir`` or ``benchmark_dir/metadata.json``
            does not exist.
        KeyError: If a task.json lacks ``task_id``/``instruction`` or an
            execution.json lacks ``success``/``num_steps``.

    Example:
        from openadapt_ml.training.benchmark_viewer import generate_benchmark_viewer

        viewer_path = generate_benchmark_viewer("benchmark_results/test_run_phase1")
        print(f"Generated: {viewer_path}")
    """
    benchmark_dir = Path(benchmark_dir)
    if not benchmark_dir.exists():
        raise FileNotFoundError(f"Benchmark directory not found: {benchmark_dir}")

    if output_path is None:
        output_path = benchmark_dir / "benchmark.html"
    else:
        output_path = Path(output_path)

    # Load metadata (required)
    metadata_path = benchmark_dir / "metadata.json"
    if not metadata_path.exists():
        raise FileNotFoundError(f"metadata.json not found in {benchmark_dir}")

    # JSON is read as UTF-8 explicitly so the result does not depend on the
    # platform's locale encoding.
    with open(metadata_path, encoding="utf-8") as f:
        metadata = json.load(f)

    # Load summary (optional; the page falls back to an empty dict)
    summary_path = benchmark_dir / "summary.json"
    summary = {}
    if summary_path.exists():
        with open(summary_path, encoding="utf-8") as f:
            summary = json.load(f)

    # Load all task results
    tasks_dir = benchmark_dir / "tasks"
    task_results = []

    if tasks_dir.exists():
        # sorted() keeps the task order stable across runs and platforms.
        for task_dir in sorted(tasks_dir.iterdir()):
            if not task_dir.is_dir():
                continue

            task_json = task_dir / "task.json"
            execution_json = task_dir / "execution.json"

            # A task is only shown when both halves of its record exist.
            if not task_json.exists() or not execution_json.exists():
                continue

            with open(task_json, encoding="utf-8") as f:
                task_data = json.load(f)

            with open(execution_json, encoding="utf-8") as f:
                execution_data = json.load(f)

            # Combine task and execution data
            task_results.append(
                {
                    "task_id": task_data["task_id"],
                    "instruction": task_data["instruction"],
                    "domain": task_data.get("domain", "unknown"),
                    "success": execution_data["success"],
                    "score": execution_data.get("score", 0.0),
                    "num_steps": execution_data["num_steps"],
                    "total_time_seconds": execution_data.get("total_time_seconds", 0.0),
                    "error": execution_data.get("error"),
                    "reason": execution_data.get("reason"),
                    "steps": execution_data.get("steps", []),
                    "screenshots_dir": str(task_dir / "screenshots"),
                }
            )

    # Import shared header components from trainer. Done at call time;
    # presumably this avoids a circular import, since trainer.py imports this
    # module (per the module docstring).
    from openadapt_ml.training.trainer import _get_shared_header_css, _generate_shared_header_html

    # Generate HTML
    html = _generate_benchmark_viewer_html(
        metadata=metadata,
        summary=summary,
        tasks=task_results,
        benchmark_dir=benchmark_dir,
        shared_header_css=_get_shared_header_css(),
        shared_header_html=_generate_shared_header_html("benchmarks"),
    )

    # The page declares <meta charset="UTF-8"> and contains non-ASCII glyphs,
    # so it must be written as UTF-8 regardless of the locale default.
    output_path.write_text(html, encoding="utf-8")
    print(f"Generated benchmark viewer: {output_path}")
    return output_path
109
+
110
+
111
def generate_multi_run_benchmark_viewer(
    benchmark_dirs: list[Path],
    output_path: Path | str,
) -> Path:
    """Generate benchmark viewer HTML supporting multiple benchmark runs.

    Each directory that contains both ``metadata.json`` and ``summary.json``
    becomes one selectable run; directories missing either file are skipped.
    When no directory qualifies, the empty-state viewer is written instead.

    Args:
        benchmark_dirs: List of benchmark result directories (sorted most recent first).
            Entries are coerced with ``Path(...)``, so plain strings also work.
        output_path: Path for output benchmark.html

    Returns:
        Path to generated benchmark.html file

    Raises:
        KeyError: If a task.json lacks ``task_id``/``instruction`` or an
            execution.json lacks ``success``/``num_steps``.
    """
    output_path = Path(output_path)

    # Load metadata and summary for all runs
    all_runs = []
    for benchmark_dir in benchmark_dirs:
        # Coerce so that str entries support the `/` operator and `.name`.
        benchmark_dir = Path(benchmark_dir)
        metadata_path = benchmark_dir / "metadata.json"
        summary_path = benchmark_dir / "summary.json"

        if not metadata_path.exists() or not summary_path.exists():
            continue

        # JSON is read as UTF-8 explicitly so the result does not depend on
        # the platform's locale encoding.
        with open(metadata_path, encoding="utf-8") as f:
            metadata = json.load(f)
        with open(summary_path, encoding="utf-8") as f:
            summary = json.load(f)

        # Load all task results for this run
        tasks_dir = benchmark_dir / "tasks"
        task_results = []

        if tasks_dir.exists():
            # sorted() keeps the task order stable across runs and platforms.
            for task_dir in sorted(tasks_dir.iterdir()):
                if not task_dir.is_dir():
                    continue

                task_json = task_dir / "task.json"
                execution_json = task_dir / "execution.json"

                # A task is only shown when both halves of its record exist.
                if not task_json.exists() or not execution_json.exists():
                    continue

                with open(task_json, encoding="utf-8") as f:
                    task_data = json.load(f)

                with open(execution_json, encoding="utf-8") as f:
                    execution_data = json.load(f)

                # Combine task and execution data
                task_results.append(
                    {
                        "task_id": task_data["task_id"],
                        "instruction": task_data["instruction"],
                        "domain": task_data.get("domain", "unknown"),
                        "success": execution_data["success"],
                        "score": execution_data.get("score", 0.0),
                        "num_steps": execution_data["num_steps"],
                        "total_time_seconds": execution_data.get("total_time_seconds", 0.0),
                        "error": execution_data.get("error"),
                        "reason": execution_data.get("reason"),
                        "steps": execution_data.get("steps", []),
                    }
                )

        all_runs.append(
            {
                "run_name": metadata.get("run_name", benchmark_dir.name),
                "model_id": metadata.get("model_id", "unknown"),
                "created_at": metadata.get("created_at", ""),
                "benchmark_name": metadata.get("benchmark_name", ""),
                "dir_name": benchmark_dir.name,  # For screenshot paths
                "summary": summary,
                "tasks": task_results,
            }
        )

    if not all_runs:
        return generate_empty_benchmark_viewer(output_path)

    # Import shared header components from trainer. Done at call time;
    # presumably this avoids a circular import, since trainer.py imports this
    # module (per the module docstring).
    from openadapt_ml.training.trainer import _get_shared_header_css, _generate_shared_header_html

    # Generate HTML
    html = _generate_multi_run_benchmark_viewer_html(
        runs=all_runs,
        shared_header_css=_get_shared_header_css(),
        shared_header_html=_generate_shared_header_html("benchmarks"),
    )

    # The page declares <meta charset="UTF-8">, so write it as UTF-8
    # regardless of the locale default.
    output_path.write_text(html, encoding="utf-8")
    print(f"Generated multi-run benchmark viewer: {output_path}")
    return output_path
202
+
203
+
204
def generate_empty_benchmark_viewer(output_path: Path | str) -> Path:
    """Generate an empty benchmark viewer with guidance when no real data exists.

    The page is static apart from the shared header CSS/HTML obtained from
    ``openadapt_ml.training.trainer``. It explains how to produce Windows
    Agent Arena (WAA) results locally or on Azure.

    Args:
        output_path: Path to output benchmark.html

    Returns:
        Path to generated file
    """
    output_path = Path(output_path)

    # Import shared header components from trainer. Done at call time;
    # presumably this avoids a circular import, since trainer.py imports this
    # module (per the module docstring).
    from openadapt_ml.training.trainer import _get_shared_header_css, _generate_shared_header_html

    shared_header_css = _get_shared_header_css()
    shared_header_html = _generate_shared_header_html("benchmarks")

    # NOTE: this is an f-string, so literal CSS braces are doubled and "\\"
    # renders as a single shell line-continuation backslash.
    html = f'''<!DOCTYPE html>
<html lang="en">
<head>
    <meta charset="UTF-8">
    <meta name="viewport" content="width=device-width, initial-scale=1.0">
    <title>Benchmark Viewer - No Data</title>
    <style>
        :root {{
            --bg-primary: #0a0a0f;
            --bg-secondary: #12121a;
            --bg-tertiary: #1a1a24;
            --border-color: rgba(255, 255, 255, 0.06);
            --text-primary: #f0f0f0;
            --text-secondary: #888;
            --text-muted: #555;
            --accent: #00d4aa;
            --accent-dim: rgba(0, 212, 170, 0.15);
        }}
        * {{ margin: 0; padding: 0; box-sizing: border-box; }}
        body {{
            font-family: -apple-system, BlinkMacSystemFont, 'Segoe UI', Roboto, sans-serif;
            background-color: var(--bg-primary);
            color: var(--text-primary);
            min-height: 100vh;
        }}
        {shared_header_css}
        .empty-state {{
            display: flex;
            flex-direction: column;
            align-items: center;
            justify-content: center;
            min-height: calc(100vh - 60px);
            padding: 40px;
            text-align: center;
        }}
        .empty-icon {{
            font-size: 64px;
            margin-bottom: 24px;
            opacity: 0.5;
        }}
        .empty-title {{
            font-size: 24px;
            font-weight: 600;
            margin-bottom: 12px;
        }}
        .empty-description {{
            color: var(--text-secondary);
            margin-bottom: 32px;
            max-width: 500px;
            line-height: 1.6;
        }}
        .guide-card {{
            background: var(--bg-secondary);
            border: 1px solid var(--border-color);
            border-radius: 12px;
            padding: 24px;
            margin-bottom: 16px;
            max-width: 600px;
            text-align: left;
        }}
        .guide-card h3 {{
            color: var(--accent);
            margin-bottom: 12px;
            font-size: 16px;
        }}
        .guide-card code {{
            background: var(--bg-tertiary);
            padding: 12px 16px;
            border-radius: 8px;
            display: block;
            font-family: 'SF Mono', Monaco, monospace;
            font-size: 13px;
            color: var(--text-primary);
            white-space: pre-wrap;
            margin-bottom: 12px;
        }}
        .guide-card p {{
            color: var(--text-secondary);
            font-size: 14px;
            line-height: 1.5;
        }}
        a {{
            color: var(--accent);
            text-decoration: none;
        }}
        a:hover {{
            text-decoration: underline;
        }}
    </style>
</head>
<body>
    {shared_header_html}

    <div class="empty-state">
        <div class="empty-icon">🚧</div>
        <h1 class="empty-title">Windows Agent Arena Integration</h1>
        <p class="empty-description">
            This tab will display results from <strong>WAA benchmark</strong> evaluations (154 real Windows tasks).<br>
            <span style="color: var(--text-muted);">Status: Work in Progress - requires Windows VM or Azure setup</span>
        </p>

        <div class="guide-card" style="background: var(--bg-tertiary); border-color: var(--accent);">
            <h3 style="color: var(--text-primary);">Looking for synthetic benchmark results?</h3>
            <code>uv run python -m openadapt_ml.scripts.eval_policy \\
    --config configs/qwen3vl_synthetic_som.yaml \\
    --backend qwen3 --dsl-mode som</code>
            <p>The synthetic login benchmark (with SoM mode achieving 100%) uses eval_policy.py, not this viewer.</p>
        </div>

        <div class="guide-card">
            <h3>WAA Local Setup (Windows Required)</h3>
            <code># Clone WAA repository
git clone https://github.com/microsoft/WindowsAgentArena

# Run evaluation
uv run python -m openadapt_ml.benchmarks.cli run-local \\
    --waa-path /path/to/WindowsAgentArena</code>
            <p>Requires Windows environment. See <a href="https://github.com/microsoft/WindowsAgentArena" style="color: var(--accent);">WAA repo</a> for setup.</p>
        </div>

        <div class="guide-card">
            <h3>WAA on Azure (Parallel VMs)</h3>
            <code># Setup Azure resources
python scripts/setup_azure.py

# Run evaluation on Azure VMs
uv run python -m openadapt_ml.benchmarks.cli run-azure --workers 4</code>
            <p>Runs WAA tasks in parallel on Azure Windows VMs. See docs/azure_waa_setup.md</p>
        </div>
    </div>
</body>
</html>'''

    # The page declares <meta charset="UTF-8"> and contains an emoji, so it
    # must be written as UTF-8; the locale default (e.g. cp1252) cannot
    # encode it.
    output_path.write_text(html, encoding="utf-8")
    return output_path
356
+
357
+
358
def _generate_benchmark_viewer_html(
    metadata: dict,
    summary: dict,
    tasks: list[dict],
    benchmark_dir: Path,
    shared_header_css: str,
    shared_header_html: str,
) -> str:
    """Generate the benchmark viewer HTML content.

    The page embeds ``tasks``, ``summary``, ``metadata`` and the sorted set of
    task domains as JavaScript literals; all rendering and filtering happens
    client-side.

    Args:
        metadata: Benchmark metadata (run name, model ID, etc.)
        summary: Summary statistics (success rate, avg steps, etc.)
        tasks: List of task results with execution data. Every task must
            have a ``"domain"`` key.
        benchmark_dir: Path to benchmark directory. Currently unused by this
            function; kept so the signature stays stable for callers.
        shared_header_css: CSS for shared header (inserted verbatim)
        shared_header_html: HTML for shared header (inserted verbatim)

    Returns:
        Complete HTML string

    Raises:
        KeyError: If a task has no ``"domain"`` key.
    """
    # Function-scope import: the rendered page is bound to a local named
    # ``html`` below, so only the ``escape`` helper is brought into scope.
    from html import escape

    def _script_json(value: object) -> str:
        """Serialize ``value`` as JSON that is safe inside a <script> element.

        ``<``, ``>`` and ``&`` can only occur inside JSON string values, where
        the ``\\uXXXX`` form is an equivalent escape. Replacing them prevents
        data such as ``</script>`` from terminating the script block early.
        """
        return (
            json.dumps(value)
            .replace("&", "\\u0026")
            .replace("<", "\\u003c")
            .replace(">", "\\u003e")
        )

    # Prepare data as JSON
    tasks_json = _script_json(tasks)
    summary_json = _script_json(summary)
    metadata_json = _script_json(metadata)

    # Calculate unique domains for filter
    domains = sorted({task["domain"] for task in tasks})
    domains_json = _script_json(domains)

    # The run name comes from metadata.json; escape it before it is placed
    # in the <title> markup.
    page_title = escape(str(metadata.get("run_name", "Unknown")))

    # Generate HTML. This is an f-string: literal CSS/JS braces are doubled
    # and JS template placeholders are written as ${{...}}.
    html = f'''<!DOCTYPE html>
<html lang="en">
<head>
    <meta charset="UTF-8">
    <meta name="viewport" content="width=device-width, initial-scale=1.0">
    <title>Benchmark Viewer - {page_title}</title>
    <style>
        :root {{
            --bg-primary: #0a0a0f;
            --bg-secondary: #12121a;
            --bg-tertiary: #1a1a24;
            --border-color: rgba(255, 255, 255, 0.06);
            --text-primary: #f0f0f0;
            --text-secondary: #888;
            --text-muted: #555;
            --accent: #00d4aa;
            --accent-dim: rgba(0, 212, 170, 0.15);
            --success: #00d4aa;
            --failure: #ff4444;
        }}

        * {{ box-sizing: border-box; margin: 0; padding: 0; }}

        body {{
            font-family: "SF Pro Display", -apple-system, BlinkMacSystemFont, "Inter", sans-serif;
            background: var(--bg-primary);
            color: var(--text-primary);
            min-height: 100vh;
            line-height: 1.5;
        }}

        .container {{
            max-width: 1440px;
            margin: 0 auto;
            padding: 24px;
        }}

        {shared_header_css}

        .summary-cards {{
            display: grid;
            grid-template-columns: repeat(auto-fit, minmax(220px, 1fr));
            gap: 16px;
            margin-bottom: 32px;
        }}

        .summary-card {{
            background: var(--bg-secondary);
            border: 1px solid var(--border-color);
            border-radius: 12px;
            padding: 20px;
            transition: all 0.2s;
        }}

        .summary-card:hover {{
            border-color: var(--accent);
            transform: translateY(-2px);
        }}

        .summary-card .label {{
            font-size: 0.75rem;
            color: var(--text-muted);
            text-transform: uppercase;
            letter-spacing: 0.5px;
            font-weight: 600;
            margin-bottom: 8px;
        }}

        .summary-card .value {{
            font-size: 2rem;
            font-weight: 700;
            color: var(--text-primary);
        }}

        .summary-card .subtitle {{
            font-size: 0.85rem;
            color: var(--text-secondary);
            margin-top: 4px;
        }}

        .filters {{
            display: flex;
            gap: 12px;
            padding: 16px;
            background: var(--bg-secondary);
            border: 1px solid var(--border-color);
            border-radius: 8px;
            margin-bottom: 24px;
            flex-wrap: wrap;
            align-items: center;
        }}

        .filter-label {{
            font-size: 0.75rem;
            color: var(--text-muted);
            text-transform: uppercase;
            letter-spacing: 0.5px;
            font-weight: 600;
        }}

        .filter-select {{
            padding: 8px 32px 8px 12px;
            border-radius: 8px;
            font-size: 0.85rem;
            background: rgba(0,0,0,0.4);
            color: var(--text-primary);
            border: 1px solid rgba(255,255,255,0.1);
            cursor: pointer;
            appearance: none;
            background-image: url('data:image/svg+xml,%3Csvg xmlns=%27http://www.w3.org/2000/svg%27 width=%2712%27 height=%278%27%3E%3Cpath fill=%27%23888%27 d=%27M0 0l6 8 6-8z%27/%3E%3C/svg%3E');
            background-repeat: no-repeat;
            background-position: right 10px center;
            transition: all 0.2s;
        }}

        .filter-select:hover {{
            border-color: var(--accent);
            background-color: rgba(0,212,170,0.1);
        }}

        .task-list {{
            display: flex;
            flex-direction: column;
            gap: 12px;
        }}

        .task-item {{
            background: var(--bg-secondary);
            border: 1px solid var(--border-color);
            border-radius: 8px;
            overflow: hidden;
            transition: all 0.2s;
        }}

        .task-item:hover {{
            border-color: var(--accent);
        }}

        .task-header {{
            display: flex;
            align-items: center;
            gap: 16px;
            padding: 16px 20px;
            cursor: pointer;
            user-select: none;
        }}

        .task-header:hover {{
            background: var(--bg-tertiary);
        }}

        .task-status {{
            width: 24px;
            height: 24px;
            border-radius: 50%;
            display: flex;
            align-items: center;
            justify-content: center;
            font-weight: bold;
            font-size: 0.9rem;
            flex-shrink: 0;
        }}

        .task-status.success {{
            background: var(--success);
            color: var(--bg-primary);
        }}

        .task-status.failure {{
            background: var(--failure);
            color: var(--bg-primary);
        }}

        .task-info {{
            flex: 1;
            min-width: 0;
        }}

        .task-id {{
            font-weight: 600;
            font-size: 0.95rem;
            margin-bottom: 4px;
        }}

        .task-instruction {{
            font-size: 0.85rem;
            color: var(--text-secondary);
            overflow: hidden;
            text-overflow: ellipsis;
            white-space: nowrap;
        }}

        .task-meta {{
            display: flex;
            gap: 20px;
            font-size: 0.8rem;
            color: var(--text-muted);
            font-family: "SF Mono", Monaco, monospace;
        }}

        .task-domain {{
            padding: 4px 10px;
            background: rgba(0,212,170,0.15);
            border-radius: 4px;
            font-size: 0.75rem;
            color: var(--accent);
            font-weight: 600;
        }}

        .task-expand-icon {{
            color: var(--text-muted);
            transition: transform 0.2s;
        }}

        .task-item.expanded .task-expand-icon {{
            transform: rotate(90deg);
        }}

        .task-details {{
            display: none;
            padding: 0 20px 20px;
            border-top: 1px solid var(--border-color);
        }}

        .task-item.expanded .task-details {{
            display: block;
        }}

        .steps-list {{
            margin-top: 16px;
        }}

        .step-item {{
            display: flex;
            gap: 16px;
            padding: 12px;
            background: var(--bg-tertiary);
            border: 1px solid var(--border-color);
            border-radius: 6px;
            margin-bottom: 8px;
        }}

        .step-number {{
            font-weight: 600;
            color: var(--accent);
            min-width: 60px;
        }}

        .step-screenshot {{
            max-width: 200px;
            border-radius: 4px;
            border: 1px solid var(--border-color);
        }}

        .step-action {{
            flex: 1;
        }}

        .action-type {{
            font-weight: 600;
            text-transform: uppercase;
            font-size: 0.85rem;
            color: var(--accent);
            margin-bottom: 4px;
        }}

        .action-details {{
            font-size: 0.8rem;
            color: var(--text-secondary);
            font-family: "SF Mono", Monaco, monospace;
        }}

        .no-tasks {{
            text-align: center;
            padding: 60px 20px;
            color: var(--text-muted);
        }}

        .no-tasks-icon {{
            font-size: 3rem;
            margin-bottom: 16px;
            opacity: 0.5;
        }}
    </style>
</head>
<body>
    {shared_header_html}

    <div class="container">
        <div class="summary-cards">
            <div class="summary-card">
                <div class="label">Total Tasks</div>
                <div class="value" id="total-tasks">0</div>
            </div>
            <div class="summary-card">
                <div class="label">Success Rate</div>
                <div class="value" id="success-rate">0%</div>
                <div class="subtitle" id="success-count">0 / 0 passed</div>
            </div>
            <div class="summary-card">
                <div class="label">Avg Steps</div>
                <div class="value" id="avg-steps">0</div>
            </div>
            <div class="summary-card">
                <div class="label">Avg Time</div>
                <div class="value" id="avg-time">0s</div>
            </div>
        </div>

        <div class="filters">
            <span class="filter-label">Status:</span>
            <select class="filter-select" id="filter-status">
                <option value="all">All Tasks</option>
                <option value="success">Success Only</option>
                <option value="failure">Failure Only</option>
            </select>

            <span class="filter-label">Domain:</span>
            <select class="filter-select" id="filter-domain">
                <option value="all">All Domains</option>
            </select>
        </div>

        <div class="task-list" id="task-list"></div>

        <div class="no-tasks" id="no-tasks" style="display: none;">
            <div class="no-tasks-icon">📋</div>
            <div>No tasks match the current filters</div>
        </div>
    </div>

    <script>
        // Data from backend
        const tasks = {tasks_json};
        const summary = {summary_json};
        const metadata = {metadata_json};
        const domains = {domains_json};

        // State
        let currentFilters = {{
            status: 'all',
            domain: 'all'
        }};

        // Escape text before it is interpolated into innerHTML markup
        function escapeHtml(value) {{
            return String(value === null || value === undefined ? '' : value)
                .replace(/&/g, '&amp;')
                .replace(/</g, '&lt;')
                .replace(/>/g, '&gt;')
                .replace(/"/g, '&quot;')
                .replace(/'/g, '&#39;');
        }}

        // Initialize
        function init() {{
            updateSummaryCards();
            populateDomainFilter();
            renderTaskList();

            // Event listeners
            document.getElementById('filter-status').addEventListener('change', (e) => {{
                currentFilters.status = e.target.value;
                renderTaskList();
            }});

            document.getElementById('filter-domain').addEventListener('change', (e) => {{
                currentFilters.domain = e.target.value;
                renderTaskList();
            }});
        }}

        function updateSummaryCards() {{
            document.getElementById('total-tasks').textContent = summary.num_tasks || tasks.length;

            const successRate = (summary.success_rate || 0) * 100;
            document.getElementById('success-rate').textContent = successRate.toFixed(1) + '%';
            document.getElementById('success-count').textContent =
                `${{summary.num_success || 0}} / ${{summary.num_tasks || tasks.length}} passed`;

            const avgSteps = summary.avg_steps || 0;
            document.getElementById('avg-steps').textContent = avgSteps.toFixed(1);

            const avgTime = summary.avg_time_seconds || 0;
            document.getElementById('avg-time').textContent = avgTime.toFixed(2) + 's';
        }}

        function populateDomainFilter() {{
            const select = document.getElementById('filter-domain');
            domains.forEach(domain => {{
                const option = document.createElement('option');
                option.value = domain;
                option.textContent = domain.charAt(0).toUpperCase() + domain.slice(1);
                select.appendChild(option);
            }});
        }}

        function filterTasks() {{
            return tasks.filter(task => {{
                if (currentFilters.status !== 'all') {{
                    const isSuccess = task.success;
                    if (currentFilters.status === 'success' && !isSuccess) return false;
                    if (currentFilters.status === 'failure' && isSuccess) return false;
                }}

                if (currentFilters.domain !== 'all' && task.domain !== currentFilters.domain) {{
                    return false;
                }}

                return true;
            }});
        }}

        function renderTaskList() {{
            const filteredTasks = filterTasks();
            const container = document.getElementById('task-list');
            const noTasks = document.getElementById('no-tasks');

            if (filteredTasks.length === 0) {{
                container.innerHTML = '';
                noTasks.style.display = 'block';
                return;
            }}

            noTasks.style.display = 'none';
            container.innerHTML = filteredTasks.map(task => renderTaskItem(task)).join('');

            // Add click handlers
            document.querySelectorAll('.task-header').forEach(header => {{
                header.addEventListener('click', () => {{
                    const item = header.closest('.task-item');
                    item.classList.toggle('expanded');
                }});
            }});
        }}

        function renderTaskItem(task) {{
            const statusClass = task.success ? 'success' : 'failure';
            const statusIcon = task.success ? '✓' : '✗';
            // total_time_seconds may be null in the source data
            const totalTime = Number(task.total_time_seconds || 0);

            const stepsHtml = task.steps && task.steps.length > 0
                ? task.steps.map(step => renderStep(step, task)).join('')
                : '<div style="padding: 12px; color: var(--text-muted);">No step details available</div>';

            return `
                <div class="task-item" data-task-id="${{escapeHtml(task.task_id)}}">
                    <div class="task-header">
                        <div class="task-status ${{statusClass}}">${{statusIcon}}</div>
                        <div class="task-info">
                            <div class="task-id">${{escapeHtml(task.task_id)}}</div>
                            <div class="task-instruction">${{escapeHtml(task.instruction)}}</div>
                        </div>
                        <div class="task-domain">${{escapeHtml(task.domain)}}</div>
                        <div class="task-meta">
                            <span>${{escapeHtml(task.num_steps)}} steps</span>
                            <span>${{totalTime.toFixed(2)}}s</span>
                        </div>
                        <div class="task-expand-icon">▶</div>
                    </div>
                    <div class="task-details">
                        <div class="steps-list">
                            ${{stepsHtml}}
                        </div>
                    </div>
                </div>
            `;
        }}

        function renderStep(step, task) {{
            // A step without an action object is rendered as "unknown"
            const action = step.action || {{}};
            const actionType = action.type || 'unknown';
            const actionDetails = formatActionDetails(action);

            // Build screenshot path relative to benchmark.html
            const screenshotPath = step.screenshot_path
                ? `tasks/${{task.task_id}}/${{step.screenshot_path}}`
                : '';

            const screenshotHtml = screenshotPath
                ? `<img src="${{escapeHtml(screenshotPath)}}" class="step-screenshot" alt="Step ${{escapeHtml(step.step_idx)}}" />`
                : '';

            return `
                <div class="step-item">
                    <div class="step-number">Step ${{escapeHtml(step.step_idx)}}</div>
                    ${{screenshotHtml}}
                    <div class="step-action">
                        <div class="action-type">${{escapeHtml(actionType)}}</div>
                        <div class="action-details">${{escapeHtml(actionDetails)}}</div>
                        ${{step.reasoning ? `<div style="margin-top: 8px; font-style: italic; color: var(--text-secondary);">${{escapeHtml(step.reasoning)}}</div>` : ''}}
                    </div>
                </div>
            `;
        }}

        // Returns plain text; the caller escapes it before inserting into markup
        function formatActionDetails(action) {{
            const parts = [];

            // typeof check covers both null and missing (undefined) coordinates
            if (typeof action.x === 'number' && typeof action.y === 'number') {{
                parts.push(`x: ${{action.x.toFixed(3)}}, y: ${{action.y.toFixed(3)}}`);
            }}

            if (action.text) {{
                parts.push(`text: "${{action.text}}"`);
            }}

            if (action.key) {{
                parts.push(`key: ${{action.key}}`);
            }}

            if (action.target_name) {{
                parts.push(`target: ${{action.target_name}}`);
            }}

            return parts.length > 0 ? parts.join(', ') : 'No details';
        }}

        // Initialize on page load
        init();
    </script>
</body>
</html>'''

    return html
902
+
903
+
904
+ def _generate_multi_run_benchmark_viewer_html(
905
+ runs: list[dict],
906
+ shared_header_css: str,
907
+ shared_header_html: str,
908
+ ) -> str:
909
+ """Generate HTML for multi-run benchmark viewer with run selector.
910
+
911
+ Args:
912
+ runs: List of run dictionaries with metadata, summary, and tasks
913
+ shared_header_css: CSS for shared header
914
+ shared_header_html: HTML for shared header
915
+
916
+ Returns:
917
+ Complete HTML string
918
+ """
919
+ # Prepare runs data as JSON
920
+ runs_json = json.dumps(runs)
921
+
922
+ # Calculate unique domains across all runs
923
+ all_domains = set()
924
+ for run in runs:
925
+ for task in run["tasks"]:
926
+ all_domains.add(task["domain"])
927
+ domains = sorted(all_domains)
928
+ domains_json = json.dumps(domains)
929
+
930
+ # Build run selector options
931
+ run_options = []
932
+ for i, run in enumerate(runs):
933
+ success_rate = run["summary"].get("success_rate", 0) * 100
934
+ label = f"{run['model_id']} - {success_rate:.0f}% ({run['run_name']})"
935
+ run_options.append(f'<option value="{i}">{label}</option>')
936
+ run_options_html = "\n".join(run_options)
937
+
938
+ # Generate HTML
939
+ html = f'''<!DOCTYPE html>
940
+ <html lang="en">
941
+ <head>
942
+ <meta charset="UTF-8">
943
+ <meta name="viewport" content="width=device-width, initial-scale=1.0">
944
+ <title>Benchmark Viewer - Multiple Runs</title>
945
+ <style>
946
+ :root {{
947
+ --bg-primary: #0a0a0f;
948
+ --bg-secondary: #12121a;
949
+ --bg-tertiary: #1a1a24;
950
+ --border-color: rgba(255, 255, 255, 0.06);
951
+ --text-primary: #f0f0f0;
952
+ --text-secondary: #888;
953
+ --text-muted: #555;
954
+ --accent: #00d4aa;
955
+ --accent-dim: rgba(0, 212, 170, 0.15);
956
+ --success: #00d4aa;
957
+ --failure: #ff4444;
958
+ }}
959
+
960
+ * {{ box-sizing: border-box; margin: 0; padding: 0; }}
961
+
962
+ body {{
963
+ font-family: "SF Pro Display", -apple-system, BlinkMacSystemFont, "Inter", sans-serif;
964
+ background: var(--bg-primary);
965
+ color: var(--text-primary);
966
+ min-height: 100vh;
967
+ line-height: 1.5;
968
+ }}
969
+
970
+ .container {{
971
+ max-width: 1440px;
972
+ margin: 0 auto;
973
+ padding: 24px;
974
+ }}
975
+
976
+ {shared_header_css}
977
+
978
+ .run-selector-section {{
979
+ background: var(--bg-secondary);
980
+ border: 1px solid var(--border-color);
981
+ border-radius: 12px;
982
+ padding: 20px 24px;
983
+ margin-bottom: 24px;
984
+ display: flex;
985
+ align-items: center;
986
+ gap: 16px;
987
+ }}
988
+
989
+ .run-selector-label {{
990
+ font-size: 0.85rem;
991
+ font-weight: 600;
992
+ color: var(--text-secondary);
993
+ text-transform: uppercase;
994
+ letter-spacing: 0.5px;
995
+ }}
996
+
997
+ #run-selector {{
998
+ flex: 1;
999
+ max-width: 600px;
1000
+ padding: 10px 36px 10px 14px;
1001
+ border-radius: 8px;
1002
+ font-size: 0.9rem;
1003
+ background: rgba(0,0,0,0.4);
1004
+ color: var(--text-primary);
1005
+ border: 1px solid rgba(255,255,255,0.1);
1006
+ cursor: pointer;
1007
+ appearance: none;
1008
+ background-image: url('data:image/svg+xml,%3Csvg xmlns=%27http://www.w3.org/2000/svg%27 width=%2712%27 height=%278%27%3E%3Cpath fill=%27%23888%27 d=%27M0 0l6 8 6-8z%27/%3E%3C/svg%3E');
1009
+ background-repeat: no-repeat;
1010
+ background-position: right 12px center;
1011
+ transition: all 0.2s;
1012
+ }}
1013
+
1014
+ #run-selector:hover {{
1015
+ border-color: var(--accent);
1016
+ background-color: rgba(0,212,170,0.1);
1017
+ }}
1018
+
1019
+ #run-selector:focus {{
1020
+ outline: none;
1021
+ border-color: var(--accent);
1022
+ box-shadow: 0 0 0 2px rgba(0,212,170,0.2);
1023
+ }}
1024
+
1025
+ .summary-cards {{
1026
+ display: grid;
1027
+ grid-template-columns: repeat(auto-fit, minmax(220px, 1fr));
1028
+ gap: 16px;
1029
+ margin-bottom: 32px;
1030
+ }}
1031
+
1032
+ .summary-card {{
1033
+ background: var(--bg-secondary);
1034
+ border: 1px solid var(--border-color);
1035
+ border-radius: 12px;
1036
+ padding: 20px;
1037
+ transition: all 0.2s;
1038
+ }}
1039
+
1040
+ .summary-card:hover {{
1041
+ border-color: var(--accent);
1042
+ transform: translateY(-2px);
1043
+ }}
1044
+
1045
+ .summary-card .label {{
1046
+ font-size: 0.75rem;
1047
+ color: var(--text-muted);
1048
+ text-transform: uppercase;
1049
+ letter-spacing: 0.5px;
1050
+ font-weight: 600;
1051
+ margin-bottom: 8px;
1052
+ }}
1053
+
1054
+ .summary-card .value {{
1055
+ font-size: 2rem;
1056
+ font-weight: 700;
1057
+ color: var(--text-primary);
1058
+ }}
1059
+
1060
+ .summary-card .subtitle {{
1061
+ font-size: 0.85rem;
1062
+ color: var(--text-secondary);
1063
+ margin-top: 4px;
1064
+ }}
1065
+
1066
+ .filters {{
1067
+ display: flex;
1068
+ gap: 12px;
1069
+ padding: 16px;
1070
+ background: var(--bg-secondary);
1071
+ border: 1px solid var(--border-color);
1072
+ border-radius: 8px;
1073
+ margin-bottom: 24px;
1074
+ flex-wrap: wrap;
1075
+ align-items: center;
1076
+ }}
1077
+
1078
+ .filter-label {{
1079
+ font-size: 0.75rem;
1080
+ color: var(--text-muted);
1081
+ text-transform: uppercase;
1082
+ letter-spacing: 0.5px;
1083
+ font-weight: 600;
1084
+ }}
1085
+
1086
+ .filter-select {{
1087
+ padding: 8px 32px 8px 12px;
1088
+ border-radius: 8px;
1089
+ font-size: 0.85rem;
1090
+ background: rgba(0,0,0,0.4);
1091
+ color: var(--text-primary);
1092
+ border: 1px solid rgba(255,255,255,0.1);
1093
+ cursor: pointer;
1094
+ appearance: none;
1095
+ background-image: url('data:image/svg+xml,%3Csvg xmlns=%27http://www.w3.org/2000/svg%27 width=%2712%27 height=%278%27%3E%3Cpath fill=%27%23888%27 d=%27M0 0l6 8 6-8z%27/%3E%3C/svg%3E');
1096
+ background-repeat: no-repeat;
1097
+ background-position: right 10px center;
1098
+ transition: all 0.2s;
1099
+ }}
1100
+
1101
+ .filter-select:hover {{
1102
+ border-color: var(--accent);
1103
+ background-color: rgba(0,212,170,0.1);
1104
+ }}
1105
+
1106
+ .task-list {{
1107
+ display: flex;
1108
+ flex-direction: column;
1109
+ gap: 12px;
1110
+ }}
1111
+
1112
+ .task-item {{
1113
+ background: var(--bg-secondary);
1114
+ border: 1px solid var(--border-color);
1115
+ border-radius: 8px;
1116
+ overflow: hidden;
1117
+ transition: all 0.2s;
1118
+ }}
1119
+
1120
+ .task-item:hover {{
1121
+ border-color: var(--accent);
1122
+ }}
1123
+
1124
+ .task-header {{
1125
+ display: flex;
1126
+ align-items: center;
1127
+ gap: 16px;
1128
+ padding: 16px 20px;
1129
+ cursor: pointer;
1130
+ user-select: none;
1131
+ }}
1132
+
1133
+ .task-header:hover {{
1134
+ background: var(--bg-tertiary);
1135
+ }}
1136
+
1137
+ .task-status {{
1138
+ width: 24px;
1139
+ height: 24px;
1140
+ border-radius: 50%;
1141
+ display: flex;
1142
+ align-items: center;
1143
+ justify-content: center;
1144
+ font-weight: bold;
1145
+ font-size: 0.9rem;
1146
+ flex-shrink: 0;
1147
+ }}
1148
+
1149
+ .task-status.success {{
1150
+ background: var(--success);
1151
+ color: var(--bg-primary);
1152
+ }}
1153
+
1154
+ .task-status.failure {{
1155
+ background: var(--failure);
1156
+ color: var(--bg-primary);
1157
+ }}
1158
+
1159
+ .task-info {{
1160
+ flex: 1;
1161
+ min-width: 0;
1162
+ }}
1163
+
1164
+ .task-id {{
1165
+ font-weight: 600;
1166
+ font-size: 0.95rem;
1167
+ margin-bottom: 4px;
1168
+ }}
1169
+
1170
+ .task-instruction {{
1171
+ font-size: 0.85rem;
1172
+ color: var(--text-secondary);
1173
+ overflow: hidden;
1174
+ text-overflow: ellipsis;
1175
+ white-space: nowrap;
1176
+ }}
1177
+
1178
+ .task-meta {{
1179
+ display: flex;
1180
+ gap: 20px;
1181
+ font-size: 0.8rem;
1182
+ color: var(--text-muted);
1183
+ font-family: "SF Mono", Monaco, monospace;
1184
+ }}
1185
+
1186
+ .task-domain {{
1187
+ padding: 4px 10px;
1188
+ background: rgba(0,212,170,0.15);
1189
+ border-radius: 4px;
1190
+ font-size: 0.75rem;
1191
+ color: var(--accent);
1192
+ font-weight: 600;
1193
+ }}
1194
+
1195
+ .task-expand-icon {{
1196
+ color: var(--text-muted);
1197
+ transition: transform 0.2s;
1198
+ }}
1199
+
1200
+ .task-item.expanded .task-expand-icon {{
1201
+ transform: rotate(90deg);
1202
+ }}
1203
+
1204
+ .task-details {{
1205
+ display: none;
1206
+ padding: 0 20px 20px;
1207
+ border-top: 1px solid var(--border-color);
1208
+ }}
1209
+
1210
+ .task-item.expanded .task-details {{
1211
+ display: block;
1212
+ }}
1213
+
1214
+ .steps-list {{
1215
+ margin-top: 16px;
1216
+ }}
1217
+
1218
+ .step-item {{
1219
+ display: flex;
1220
+ gap: 16px;
1221
+ padding: 12px;
1222
+ background: var(--bg-tertiary);
1223
+ border: 1px solid var(--border-color);
1224
+ border-radius: 6px;
1225
+ margin-bottom: 8px;
1226
+ }}
1227
+
1228
+ .step-number {{
1229
+ font-weight: 600;
1230
+ color: var(--accent);
1231
+ min-width: 60px;
1232
+ }}
1233
+
1234
+ .step-screenshot {{
1235
+ max-width: 200px;
1236
+ border-radius: 4px;
1237
+ border: 1px solid var(--border-color);
1238
+ }}
1239
+
1240
+ .step-action {{
1241
+ flex: 1;
1242
+ }}
1243
+
1244
+ .action-type {{
1245
+ font-weight: 600;
1246
+ text-transform: uppercase;
1247
+ font-size: 0.85rem;
1248
+ color: var(--accent);
1249
+ margin-bottom: 4px;
1250
+ }}
1251
+
1252
+ .action-details {{
1253
+ font-size: 0.8rem;
1254
+ color: var(--text-secondary);
1255
+ font-family: "SF Mono", Monaco, monospace;
1256
+ }}
1257
+
1258
+ .no-tasks {{
1259
+ text-align: center;
1260
+ padding: 60px 20px;
1261
+ color: var(--text-muted);
1262
+ }}
1263
+
1264
+ .no-tasks-icon {{
1265
+ font-size: 3rem;
1266
+ margin-bottom: 16px;
1267
+ opacity: 0.5;
1268
+ }}
1269
+ </style>
1270
+ </head>
1271
+ <body>
1272
+ {shared_header_html}
1273
+
1274
+ <div class="container">
1275
+ <div class="run-selector-section">
1276
+ <span class="run-selector-label">Benchmark Run:</span>
1277
+ <select id="run-selector">
1278
+ {run_options_html}
1279
+ </select>
1280
+ </div>
1281
+
1282
+ <div class="summary-cards">
1283
+ <div class="summary-card">
1284
+ <div class="label">Total Tasks</div>
1285
+ <div class="value" id="total-tasks">0</div>
1286
+ </div>
1287
+ <div class="summary-card">
1288
+ <div class="label">Success Rate</div>
1289
+ <div class="value" id="success-rate">0%</div>
1290
+ <div class="subtitle" id="success-count">0 / 0 passed</div>
1291
+ </div>
1292
+ <div class="summary-card">
1293
+ <div class="label">Avg Steps</div>
1294
+ <div class="value" id="avg-steps">0</div>
1295
+ </div>
1296
+ <div class="summary-card">
1297
+ <div class="label">Avg Time</div>
1298
+ <div class="value" id="avg-time">0s</div>
1299
+ </div>
1300
+ </div>
1301
+
1302
+ <div class="filters">
1303
+ <span class="filter-label">Status:</span>
1304
+ <select class="filter-select" id="filter-status">
1305
+ <option value="all">All Tasks</option>
1306
+ <option value="success">Success Only</option>
1307
+ <option value="failure">Failure Only</option>
1308
+ </select>
1309
+
1310
+ <span class="filter-label">Domain:</span>
1311
+ <select class="filter-select" id="filter-domain">
1312
+ <option value="all">All Domains</option>
1313
+ </select>
1314
+ </div>
1315
+
1316
+ <div class="task-list" id="task-list"></div>
1317
+
1318
+ <div class="no-tasks" id="no-tasks" style="display: none;">
1319
+ <div class="no-tasks-icon">📋</div>
1320
+ <div>No tasks match the current filters</div>
1321
+ </div>
1322
+ </div>
1323
+
1324
+ <script>
1325
+ // Data from backend
1326
+ const allRuns = {runs_json};
1327
+ const allDomains = {domains_json};
1328
+
1329
+ // State
1330
+ let currentRunIndex = 0;
1331
+ let currentFilters = {{
1332
+ status: 'all',
1333
+ domain: 'all'
1334
+ }};
1335
+
1336
+ // Get current run data
1337
+ function getCurrentRun() {{
1338
+ return allRuns[currentRunIndex];
1339
+ }}
1340
+
1341
+ function getCurrentTasks() {{
1342
+ return getCurrentRun().tasks;
1343
+ }}
1344
+
1345
+ function getCurrentSummary() {{
1346
+ return getCurrentRun().summary;
1347
+ }}
1348
+
1349
// Page bootstrap: fill the domain filter, render the first run, then wire
// the three <select> controls to the state they drive.
function init() {{
    populateDomainFilter();
    updateDisplay();

    // Switching runs refreshes both the summary cards and the task list.
    document.getElementById('run-selector').addEventListener('change', (event) => {{
        // Option values are run indices written as decimal integers.
        const selectedIndex = Number.parseInt(event.target.value, 10);
        // Keep the current run if the value does not name an existing run;
        // storing it would make the next render throw.
        if (Number.isNaN(selectedIndex) || !allRuns[selectedIndex]) {{
            return;
        }}
        currentRunIndex = selectedIndex;
        updateDisplay();
    }});

    // Filters only change which tasks are listed, so the summary is left alone.
    document.getElementById('filter-status').addEventListener('change', (event) => {{
        currentFilters.status = event.target.value;
        renderTaskList();
    }});

    document.getElementById('filter-domain').addEventListener('change', (event) => {{
        currentFilters.domain = event.target.value;
        renderTaskList();
    }});
}}
1370
+
1371
// Refresh every part of the page that depends on the selected run:
// the summary cards first, then the (filtered) task list.
function updateDisplay() {{
    updateSummaryCards();
    renderTaskList();
}}
1375
+
1376
+ function updateSummaryCards() {{
1377
+ const summary = getCurrentSummary();
1378
+ const tasks = getCurrentTasks();
1379
+
1380
+ document.getElementById('total-tasks').textContent = summary.num_tasks || tasks.length;
1381
+
1382
+ const successRate = (summary.success_rate || 0) * 100;
1383
+ document.getElementById('success-rate').textContent = successRate.toFixed(1) + '%';
1384
+ document.getElementById('success-count').textContent =
1385
+ `${{summary.num_success || 0}} / ${{summary.num_tasks || tasks.length}} passed`;
1386
+
1387
+ const avgSteps = summary.avg_steps || 0;
1388
+ document.getElementById('avg-steps').textContent = avgSteps.toFixed(1);
1389
+
1390
+ const avgTime = summary.avg_time_seconds || 0;
1391
+ document.getElementById('avg-time').textContent = avgTime.toFixed(2) + 's';
1392
+ }}
1393
+
1394
// Rebuild the domain <select>: the fixed "All Domains" entry followed by one
// option per name in allDomains, labelled with its first letter upper-cased.
function populateDomainFilter() {{
    const domainSelect = document.getElementById('filter-domain');
    // Reset to just the catch-all entry before appending the real domains.
    domainSelect.innerHTML = '<option value="all">All Domains</option>';

    for (const name of allDomains) {{
        const entry = document.createElement('option');
        entry.value = name;
        entry.textContent = name.charAt(0).toUpperCase() + name.slice(1);
        domainSelect.appendChild(entry);
    }}
}}
1406
+
1407
// Return the selected run's tasks that pass both active filters.
// A filter set to 'all' accepts every task.
function filterTasks() {{
    const wantedStatus = currentFilters.status;
    const wantedDomain = currentFilters.domain;

    const statusMatches = (task) => {{
        if (wantedStatus === 'success') return Boolean(task.success);
        if (wantedStatus === 'failure') return !task.success;
        return true;
    }};
    const domainMatches = (task) =>
        wantedDomain === 'all' || task.domain === wantedDomain;

    return getCurrentTasks().filter(
        (task) => statusMatches(task) && domainMatches(task)
    );
}}
1423
+
1424
// Render the filtered tasks into #task-list, or show the #no-tasks notice
// when nothing matches. Each rendered header toggles its task's details.
function renderTaskList() {{
    const visibleTasks = filterTasks();
    const listEl = document.getElementById('task-list');
    const emptyNotice = document.getElementById('no-tasks');
    const hasTasks = visibleTasks.length > 0;

    emptyNotice.style.display = hasTasks ? 'none' : 'block';
    if (!hasTasks) {{
        listEl.innerHTML = '';
        return;
    }}

    listEl.innerHTML = visibleTasks.map((task) => renderTaskItem(task)).join('');

    // The markup was just replaced, so every header is new and needs a handler.
    for (const header of document.querySelectorAll('.task-header')) {{
        header.addEventListener('click', () => {{
            header.closest('.task-item').classList.toggle('expanded');
        }});
    }}
}}
1446
+
1447
+ function renderTaskItem(task) {{
1448
+ const statusClass = task.success ? 'success' : 'failure';
1449
+ const statusIcon = task.success ? '✓' : '✗';
1450
+
1451
+ const stepsHtml = task.steps && task.steps.length > 0
1452
+ ? task.steps.map(step => renderStep(step, task)).join('')
1453
+ : '<div style="padding: 12px; color: var(--text-muted);">No step details available</div>';
1454
+
1455
+ return `
1456
+ <div class="task-item" data-task-id="${{task.task_id}}">
1457
+ <div class="task-header">
1458
+ <div class="task-status ${{statusClass}}">${{statusIcon}}</div>
1459
+ <div class="task-info">
1460
+ <div class="task-id">${{task.task_id}}</div>
1461
+ <div class="task-instruction">${{task.instruction}}</div>
1462
+ </div>
1463
+ <div class="task-domain">${{task.domain}}</div>
1464
+ <div class="task-meta">
1465
+ <span>${{task.num_steps}} steps</span>
1466
+ <span>${{task.total_time_seconds.toFixed(2)}}s</span>
1467
+ </div>
1468
+ <div class="task-expand-icon">▶</div>
1469
+ </div>
1470
+ <div class="task-details">
1471
+ <div class="steps-list">
1472
+ ${{stepsHtml}}
1473
+ </div>
1474
+ </div>
1475
+ </div>
1476
+ `;
1477
+ }}
1478
+
1479
+ function renderStep(step, task) {{
1480
+ const actionType = step.action.type || 'unknown';
1481
+ const actionDetails = formatActionDetails(step.action);
1482
+ const runDirName = getCurrentRun().dir_name;
1483
+
1484
+ // Build screenshot path relative to benchmark.html
1485
+ const screenshotPath = step.screenshot_path
1486
+ ? `benchmark_tasks/${{runDirName}}/${{task.task_id}}/${{step.screenshot_path}}`
1487
+ : '';
1488
+
1489
+ const screenshotHtml = screenshotPath
1490
+ ? `<img src="${{screenshotPath}}" class="step-screenshot" alt="Step ${{step.step_idx}}" />`
1491
+ : '';
1492
+
1493
+ return `
1494
+ <div class="step-item">
1495
+ <div class="step-number">Step ${{step.step_idx}}</div>
1496
+ ${{screenshotHtml}}
1497
+ <div class="step-action">
1498
+ <div class="action-type">${{actionType}}</div>
1499
+ <div class="action-details">${{actionDetails}}</div>
1500
+ ${{step.reasoning ? `<div style="margin-top: 8px; font-style: italic; color: var(--text-secondary);">${{step.reasoning}}</div>` : ''}}
1501
+ </div>
1502
+ </div>
1503
+ `;
1504
+ }}
1505
+
1506
// Summarise an action as one comma-separated line of plain text, listing
// whichever of coordinates, text, key, element id and target name it has.
// Returns 'No details' when none are present or no action is given.
function formatActionDetails(action) {{
    if (!action) {{
        return 'No details';
    }}

    const parts = [];
    const isNumber = (value) => typeof value === 'number';

    // Coordinates are shown only when both are real numbers. Comparing with
    // null alone let a missing (undefined) coordinate reach toFixed and throw.
    if (isNumber(action.x) && isNumber(action.y)) {{
        parts.push('x: ' + action.x.toFixed(3) + ', y: ' + action.y.toFixed(3));
    }}

    if (action.text) {{
        parts.push('text: "' + action.text + '"');
    }}

    if (action.key) {{
        parts.push('key: ' + action.key);
    }}

    if (action.target_node_id) {{
        parts.push('element: [' + action.target_node_id + ']');
    }}

    if (action.target_name) {{
        parts.push('target: ' + action.target_name);
    }}

    return parts.length > 0 ? parts.join(', ') : 'No details';
}}
1531
+
1532
+ // Initialize on page load
1533
+ init();
1534
+ </script>
1535
+ </body>
1536
+ </html>'''
1537
+
1538
+ return html