openadapt-ml 0.1.0__py3-none-any.whl → 0.2.1__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (112) hide show
  1. openadapt_ml/baselines/__init__.py +121 -0
  2. openadapt_ml/baselines/adapter.py +185 -0
  3. openadapt_ml/baselines/cli.py +314 -0
  4. openadapt_ml/baselines/config.py +448 -0
  5. openadapt_ml/baselines/parser.py +922 -0
  6. openadapt_ml/baselines/prompts.py +787 -0
  7. openadapt_ml/benchmarks/__init__.py +13 -107
  8. openadapt_ml/benchmarks/agent.py +297 -374
  9. openadapt_ml/benchmarks/azure.py +62 -24
  10. openadapt_ml/benchmarks/azure_ops_tracker.py +521 -0
  11. openadapt_ml/benchmarks/cli.py +1874 -751
  12. openadapt_ml/benchmarks/trace_export.py +631 -0
  13. openadapt_ml/benchmarks/viewer.py +1236 -0
  14. openadapt_ml/benchmarks/vm_monitor.py +1111 -0
  15. openadapt_ml/benchmarks/waa_deploy/Dockerfile +216 -0
  16. openadapt_ml/benchmarks/waa_deploy/__init__.py +10 -0
  17. openadapt_ml/benchmarks/waa_deploy/api_agent.py +540 -0
  18. openadapt_ml/benchmarks/waa_deploy/start_waa_server.bat +53 -0
  19. openadapt_ml/cloud/azure_inference.py +3 -5
  20. openadapt_ml/cloud/lambda_labs.py +722 -307
  21. openadapt_ml/cloud/local.py +3194 -89
  22. openadapt_ml/cloud/ssh_tunnel.py +595 -0
  23. openadapt_ml/datasets/next_action.py +125 -96
  24. openadapt_ml/evals/grounding.py +32 -9
  25. openadapt_ml/evals/plot_eval_metrics.py +15 -13
  26. openadapt_ml/evals/trajectory_matching.py +120 -57
  27. openadapt_ml/experiments/demo_prompt/__init__.py +19 -0
  28. openadapt_ml/experiments/demo_prompt/format_demo.py +236 -0
  29. openadapt_ml/experiments/demo_prompt/results/experiment_20251231_002125.json +83 -0
  30. openadapt_ml/experiments/demo_prompt/results/experiment_n30_20251231_165958.json +1100 -0
  31. openadapt_ml/experiments/demo_prompt/results/multistep_20251231_025051.json +182 -0
  32. openadapt_ml/experiments/demo_prompt/run_experiment.py +541 -0
  33. openadapt_ml/experiments/representation_shootout/__init__.py +70 -0
  34. openadapt_ml/experiments/representation_shootout/conditions.py +708 -0
  35. openadapt_ml/experiments/representation_shootout/config.py +390 -0
  36. openadapt_ml/experiments/representation_shootout/evaluator.py +659 -0
  37. openadapt_ml/experiments/representation_shootout/runner.py +687 -0
  38. openadapt_ml/experiments/waa_demo/__init__.py +10 -0
  39. openadapt_ml/experiments/waa_demo/demos.py +357 -0
  40. openadapt_ml/experiments/waa_demo/runner.py +732 -0
  41. openadapt_ml/experiments/waa_demo/tasks.py +151 -0
  42. openadapt_ml/export/__init__.py +9 -0
  43. openadapt_ml/export/__main__.py +6 -0
  44. openadapt_ml/export/cli.py +89 -0
  45. openadapt_ml/export/parquet.py +277 -0
  46. openadapt_ml/grounding/detector.py +18 -14
  47. openadapt_ml/ingest/__init__.py +11 -10
  48. openadapt_ml/ingest/capture.py +97 -86
  49. openadapt_ml/ingest/loader.py +120 -69
  50. openadapt_ml/ingest/synthetic.py +344 -193
  51. openadapt_ml/models/api_adapter.py +14 -4
  52. openadapt_ml/models/base_adapter.py +10 -2
  53. openadapt_ml/models/providers/__init__.py +288 -0
  54. openadapt_ml/models/providers/anthropic.py +266 -0
  55. openadapt_ml/models/providers/base.py +299 -0
  56. openadapt_ml/models/providers/google.py +376 -0
  57. openadapt_ml/models/providers/openai.py +342 -0
  58. openadapt_ml/models/qwen_vl.py +46 -19
  59. openadapt_ml/perception/__init__.py +35 -0
  60. openadapt_ml/perception/integration.py +399 -0
  61. openadapt_ml/retrieval/README.md +226 -0
  62. openadapt_ml/retrieval/USAGE.md +391 -0
  63. openadapt_ml/retrieval/__init__.py +91 -0
  64. openadapt_ml/retrieval/demo_retriever.py +843 -0
  65. openadapt_ml/retrieval/embeddings.py +630 -0
  66. openadapt_ml/retrieval/index.py +194 -0
  67. openadapt_ml/retrieval/retriever.py +162 -0
  68. openadapt_ml/runtime/__init__.py +50 -0
  69. openadapt_ml/runtime/policy.py +27 -14
  70. openadapt_ml/runtime/safety_gate.py +471 -0
  71. openadapt_ml/schema/__init__.py +113 -0
  72. openadapt_ml/schema/converters.py +588 -0
  73. openadapt_ml/schema/episode.py +470 -0
  74. openadapt_ml/scripts/capture_screenshots.py +530 -0
  75. openadapt_ml/scripts/compare.py +102 -61
  76. openadapt_ml/scripts/demo_policy.py +4 -1
  77. openadapt_ml/scripts/eval_policy.py +19 -14
  78. openadapt_ml/scripts/make_gif.py +1 -1
  79. openadapt_ml/scripts/prepare_synthetic.py +16 -17
  80. openadapt_ml/scripts/train.py +98 -75
  81. openadapt_ml/segmentation/README.md +920 -0
  82. openadapt_ml/segmentation/__init__.py +97 -0
  83. openadapt_ml/segmentation/adapters/__init__.py +5 -0
  84. openadapt_ml/segmentation/adapters/capture_adapter.py +420 -0
  85. openadapt_ml/segmentation/annotator.py +610 -0
  86. openadapt_ml/segmentation/cache.py +290 -0
  87. openadapt_ml/segmentation/cli.py +674 -0
  88. openadapt_ml/segmentation/deduplicator.py +656 -0
  89. openadapt_ml/segmentation/frame_describer.py +788 -0
  90. openadapt_ml/segmentation/pipeline.py +340 -0
  91. openadapt_ml/segmentation/schemas.py +622 -0
  92. openadapt_ml/segmentation/segment_extractor.py +634 -0
  93. openadapt_ml/training/azure_ops_viewer.py +1097 -0
  94. openadapt_ml/training/benchmark_viewer.py +3255 -19
  95. openadapt_ml/training/shared_ui.py +7 -7
  96. openadapt_ml/training/stub_provider.py +57 -35
  97. openadapt_ml/training/trainer.py +255 -441
  98. openadapt_ml/training/trl_trainer.py +403 -0
  99. openadapt_ml/training/viewer.py +323 -108
  100. openadapt_ml/training/viewer_components.py +180 -0
  101. {openadapt_ml-0.1.0.dist-info → openadapt_ml-0.2.1.dist-info}/METADATA +312 -69
  102. openadapt_ml-0.2.1.dist-info/RECORD +116 -0
  103. openadapt_ml/benchmarks/base.py +0 -366
  104. openadapt_ml/benchmarks/data_collection.py +0 -432
  105. openadapt_ml/benchmarks/runner.py +0 -381
  106. openadapt_ml/benchmarks/waa.py +0 -704
  107. openadapt_ml/schemas/__init__.py +0 -53
  108. openadapt_ml/schemas/sessions.py +0 -122
  109. openadapt_ml/schemas/validation.py +0 -252
  110. openadapt_ml-0.1.0.dist-info/RECORD +0 -55
  111. {openadapt_ml-0.1.0.dist-info → openadapt_ml-0.2.1.dist-info}/WHEEL +0 -0
  112. {openadapt_ml-0.1.0.dist-info → openadapt_ml-0.2.1.dist-info}/licenses/LICENSE +0 -0
@@ -0,0 +1,1236 @@
1
+ """Benchmark viewer HTML generation.
2
+
3
+ .. deprecated::
4
+ This module is deprecated. Use ``openadapt_viewer`` instead::
5
+
6
+ from openadapt_viewer import generate_benchmark_viewer
7
+
8
+ The openadapt-viewer package is the canonical location for viewer code.
9
+
10
+ This module generates a standalone HTML viewer for benchmark results,
11
+ showing task list with pass/fail status, step-by-step replay of
12
+ benchmark executions, screenshots, actions, and reasoning at each step.
13
+
14
+ Usage:
15
+ from openadapt_ml.benchmarks.viewer import generate_benchmark_viewer
16
+
17
+ # Generate viewer from benchmark results directory
18
+ generate_benchmark_viewer(
19
+ benchmark_dir=Path("benchmark_results/waa_eval_20241214"),
20
+ output_path=Path("benchmark_results/waa_eval_20241214/benchmark.html"),
21
+ )
22
+
23
+ Directory structure expected:
24
+ benchmark_results/{run_name}/
25
+ |-- metadata.json # Benchmark config, models evaluated
26
+ |-- summary.json # Aggregate results
27
+ |-- tasks/
28
+ | |-- task_001/
29
+ | | |-- task.json # Task definition
30
+ | | |-- execution.json # Execution trace with steps
31
+ | | |-- screenshots/ # Step screenshots
32
+ | | |-- step_000.png
33
+ | | |-- step_001.png
34
+ | | |-- ...
35
+ | |-- task_002/
36
+ | | |-- ...
37
+ """
38
+
39
+ from __future__ import annotations
40
+
41
+ import base64
42
+ import json
43
+ import logging
44
+ import warnings
45
+ from pathlib import Path
46
+ from typing import Any
47
+
48
+ from openadapt_ml.training.shared_ui import (
49
+ get_shared_header_css as _get_shared_header_css,
50
+ generate_shared_header_html as _generate_shared_header_html,
51
+ )
52
+
53
# Runs at import time, so the deprecation notice is raised as soon as this
# module is imported. stacklevel=2 asks the warnings machinery to attribute
# the warning to the importing code instead of to this file.
warnings.warn(
    "openadapt_ml.benchmarks.viewer is deprecated. "
    "Use openadapt_viewer instead: from openadapt_viewer import generate_benchmark_viewer",
    DeprecationWarning,
    stacklevel=2,
)

# Module-level logger, named after this module's import path.
logger = logging.getLogger(__name__)
61
+
62
+
63
def load_benchmark_metadata(benchmark_dir: Path) -> dict[str, Any]:
    """Load benchmark metadata from metadata.json.

    Args:
        benchmark_dir: Path to benchmark run directory.

    Returns:
        The parsed contents of ``metadata.json`` when that file exists.
        Otherwise a placeholder dictionary with ``benchmark_name``,
        ``run_name`` (the directory name), ``model_id`` and ``created_at``.

    Raises:
        json.JSONDecodeError: If ``metadata.json`` exists but is not valid JSON.
    """
    metadata_path = benchmark_dir / "metadata.json"
    if metadata_path.exists():
        # Read as UTF-8 explicitly: without it, open() falls back to the
        # platform locale encoding and non-ASCII run or model names can be
        # mis-decoded or rejected.
        with open(metadata_path, encoding="utf-8") as f:
            return json.load(f)
    return {
        "benchmark_name": "unknown",
        "run_name": benchmark_dir.name,
        "model_id": "unknown",
        "created_at": None,
    }
82
+
83
+
84
def load_benchmark_summary(benchmark_dir: Path) -> dict[str, Any]:
    """Load benchmark summary from summary.json.

    Args:
        benchmark_dir: Path to benchmark run directory.

    Returns:
        The parsed contents of ``summary.json`` when that file exists.
        Otherwise an all-zero placeholder summary with an empty ``tasks`` list.

    Raises:
        json.JSONDecodeError: If ``summary.json`` exists but is not valid JSON.
    """
    summary_path = benchmark_dir / "summary.json"
    if summary_path.exists():
        # Read as UTF-8 explicitly instead of relying on the platform's
        # default locale encoding.
        with open(summary_path, encoding="utf-8") as f:
            return json.load(f)
    return {
        "num_tasks": 0,
        "num_success": 0,
        "success_rate": 0.0,
        "avg_score": 0.0,
        "avg_steps": 0.0,
        "tasks": [],
    }
105
+
106
+
107
def _read_json_or_default(path: Path, default: dict[str, Any]) -> Any:
    """Return the parsed JSON stored at *path*, or *default* if the file is absent."""
    if not path.exists():
        return default
    # Read as UTF-8 explicitly instead of relying on the platform's default
    # locale encoding.
    with open(path, encoding="utf-8") as f:
        return json.load(f)


def load_task_results(benchmark_dir: Path) -> list[dict[str, Any]]:
    """Load all task results from benchmark run.

    Every sub-directory of ``benchmark_dir / "tasks"`` is treated as one task;
    plain files in that directory are ignored. Tasks are returned sorted by
    directory path.

    Args:
        benchmark_dir: Path to benchmark run directory.

    Returns:
        List of task dictionaries, one per task directory, with the keys:

        - ``task_dir``: the task directory as a string.
        - ``task_id``: the task directory name.
        - ``definition``: parsed ``task.json``, or a stub with an empty
          instruction when the file is missing.
        - ``execution``: parsed ``execution.json``, or an empty failed
          execution when the file is missing.
        - ``screenshots``: sorted ``*.png`` paths under ``screenshots/``,
          expressed relative to ``benchmark_dir``.

        An empty list is returned when there is no ``tasks`` directory.

    Raises:
        json.JSONDecodeError: If a present ``task.json`` or ``execution.json``
            is not valid JSON.
    """
    tasks_dir = benchmark_dir / "tasks"
    if not tasks_dir.exists():
        return []

    results: list[dict[str, Any]] = []
    for task_dir in sorted(tasks_dir.iterdir()):
        if not task_dir.is_dir():
            continue

        screenshots_dir = task_dir / "screenshots"
        if screenshots_dir.exists():
            # Stored relative to the run directory so the caller can resolve
            # them again against ``benchmark_dir``.
            screenshots = [
                str(p.relative_to(benchmark_dir))
                for p in sorted(screenshots_dir.glob("*.png"))
            ]
        else:
            screenshots = []

        results.append(
            {
                "task_dir": str(task_dir),
                "task_id": task_dir.name,
                "definition": _read_json_or_default(
                    task_dir / "task.json",
                    {"task_id": task_dir.name, "instruction": ""},
                ),
                "execution": _read_json_or_default(
                    task_dir / "execution.json",
                    {"steps": [], "success": False, "num_steps": 0},
                ),
                "screenshots": screenshots,
            }
        )

    return results
160
+
161
+
162
+ def _encode_image_to_base64(image_path: Path) -> str | None:
163
+ """Encode image to base64 data URL for embedding in HTML.
164
+
165
+ Args:
166
+ image_path: Path to PNG image.
167
+
168
+ Returns:
169
+ Data URL string or None if image cannot be loaded.
170
+ """
171
+ try:
172
+ if image_path.exists():
173
+ with open(image_path, "rb") as f:
174
+ data = f.read()
175
+ return f"data:image/png;base64,{base64.b64encode(data).decode()}"
176
+ except Exception as e:
177
+ logger.warning(f"Failed to encode image {image_path}: {e}")
178
+ return None
179
+
180
+
181
def _get_domain_stats(tasks: list[dict[str, Any]]) -> dict[str, dict[str, int]]:
    """Calculate per-domain statistics.

    A task whose definition has no domain, or a falsy one (``None`` or an
    empty string), is counted under ``"unknown"``. This mirrors the viewer's
    JavaScript, which labels such tasks with ``def.domain || 'unknown'``, so
    the domain filter and these counts refer to the same buckets. A task
    counts as a success when its execution's ``success`` value is truthy.

    Args:
        tasks: List of task result dictionaries.

    Returns:
        Dictionary mapping domain name to {total, success, fail} counts.
    """
    domain_stats: dict[str, dict[str, int]] = {}

    for task in tasks:
        # ``or {}`` also covers an explicit null definition/execution.
        definition = task.get("definition") or {}
        execution = task.get("execution") or {}
        domain = definition.get("domain") or "unknown"

        stats = domain_stats.setdefault(
            domain, {"total": 0, "success": 0, "fail": 0}
        )
        stats["total"] += 1
        stats["success" if execution.get("success", False) else "fail"] += 1

    return domain_stats
206
+
207
+
208
def generate_benchmark_viewer(
    benchmark_dir: Path,
    output_path: Path | None = None,
    embed_screenshots: bool = False,
) -> Path:
    """Generate HTML viewer for benchmark results.

    Loads the run's metadata, summary and per-task results, computes
    per-domain statistics, renders the viewer HTML and writes it to disk.
    Missing parent directories of the output file are created.

    Args:
        benchmark_dir: Path to benchmark run directory containing metadata.json,
            summary.json, and tasks/ subdirectory.
        output_path: Path for output HTML file. Defaults to benchmark_dir/benchmark.html.
        embed_screenshots: If True, embed screenshots as base64 data URLs.
            This creates a larger but fully standalone HTML file.

    Returns:
        Path to generated HTML file.
    """
    benchmark_dir = Path(benchmark_dir)
    if output_path is None:
        output_path = benchmark_dir / "benchmark.html"

    # Load all data
    metadata = load_benchmark_metadata(benchmark_dir)
    summary = load_benchmark_summary(benchmark_dir)
    tasks = load_task_results(benchmark_dir)

    # Calculate domain statistics
    domain_stats = _get_domain_stats(tasks)

    # Generate HTML
    html = _generate_benchmark_viewer_html(
        metadata=metadata,
        summary=summary,
        tasks=tasks,
        domain_stats=domain_stats,
        benchmark_dir=benchmark_dir,
        embed_screenshots=embed_screenshots,
    )

    # Write output
    output_path = Path(output_path)
    output_path.parent.mkdir(parents=True, exist_ok=True)
    # The generated document declares <meta charset="UTF-8">, so the bytes on
    # disk must be UTF-8 regardless of the platform's default encoding.
    output_path.write_text(html, encoding="utf-8")

    logger.info(f"Generated benchmark viewer: {output_path}")
    return output_path
254
+
255
+
256
+ def _generate_benchmark_viewer_html(
257
+ metadata: dict[str, Any],
258
+ summary: dict[str, Any],
259
+ tasks: list[dict[str, Any]],
260
+ domain_stats: dict[str, dict[str, int]],
261
+ benchmark_dir: Path,
262
+ embed_screenshots: bool = False,
263
+ ) -> str:
264
+ """Generate the HTML content for benchmark viewer.
265
+
266
+ Args:
267
+ metadata: Benchmark metadata.
268
+ summary: Summary statistics.
269
+ tasks: List of task result dictionaries.
270
+ domain_stats: Per-domain statistics.
271
+ benchmark_dir: Base directory for resolving relative paths.
272
+ embed_screenshots: If True, embed screenshots as base64.
273
+
274
+ Returns:
275
+ HTML string.
276
+ """
277
+ # Get shared header components
278
+ shared_header_css = _get_shared_header_css()
279
+ shared_header_html = _generate_shared_header_html("benchmarks")
280
+
281
+ # Serialize data for JavaScript
282
+ metadata_json = json.dumps(metadata)
283
+ summary_json = json.dumps(summary)
284
+ domain_stats_json = json.dumps(domain_stats)
285
+
286
+ # Process tasks for JavaScript - include execution steps and screenshot paths
287
+ tasks_for_js = []
288
+ for task in tasks:
289
+ task_js = {
290
+ "task_id": task.get("task_id"),
291
+ "definition": task.get("definition", {}),
292
+ "execution": task.get("execution", {}),
293
+ "screenshots": task.get("screenshots", []),
294
+ }
295
+
296
+ # Optionally embed screenshots as base64
297
+ if embed_screenshots:
298
+ embedded_screenshots = []
299
+ for screenshot_rel_path in task.get("screenshots", []):
300
+ screenshot_path = benchmark_dir / screenshot_rel_path
301
+ data_url = _encode_image_to_base64(screenshot_path)
302
+ embedded_screenshots.append(data_url or "")
303
+ task_js["embedded_screenshots"] = embedded_screenshots
304
+
305
+ tasks_for_js.append(task_js)
306
+
307
+ tasks_json = json.dumps(tasks_for_js)
308
+
309
+ # Calculate aggregate metrics
310
+ num_tasks = len(tasks)
311
+ num_success = sum(1 for t in tasks if t.get("execution", {}).get("success", False))
312
+ success_rate = (num_success / num_tasks * 100) if num_tasks > 0 else 0
313
+
314
+ html = f"""<!DOCTYPE html>
315
+ <html lang="en">
316
+ <head>
317
+ <meta charset="UTF-8">
318
+ <meta name="viewport" content="width=device-width, initial-scale=1.0">
319
+ <title>Benchmark Viewer - {metadata.get("run_name", "Unknown")}</title>
320
+ <style>
321
+ :root {{
322
+ --bg-primary: #0a0a0f;
323
+ --bg-secondary: #12121a;
324
+ --bg-tertiary: #1a1a24;
325
+ --border-color: rgba(255, 255, 255, 0.06);
326
+ --text-primary: #f0f0f0;
327
+ --text-secondary: #888;
328
+ --text-muted: #555;
329
+ --accent: #00d4aa;
330
+ --accent-dim: rgba(0, 212, 170, 0.15);
331
+ --success: #34d399;
332
+ --error: #ff5f5f;
333
+ --warning: #f59e0b;
334
+ }}
335
+ * {{ box-sizing: border-box; margin: 0; padding: 0; }}
336
+ body {{
337
+ font-family: "SF Pro Display", -apple-system, BlinkMacSystemFont, "Inter", sans-serif;
338
+ background: var(--bg-primary);
339
+ color: var(--text-primary);
340
+ min-height: 100vh;
341
+ line-height: 1.5;
342
+ }}
343
+ .container {{
344
+ max-width: 1600px;
345
+ margin: 0 auto;
346
+ padding: 24px;
347
+ }}
348
+ {shared_header_css}
349
+
350
+ /* Summary Panel */
351
+ .summary-panel {{
352
+ background: var(--bg-secondary);
353
+ border: 1px solid var(--border-color);
354
+ border-radius: 12px;
355
+ padding: 20px;
356
+ margin-bottom: 24px;
357
+ }}
358
+ .summary-header {{
359
+ display: flex;
360
+ justify-content: space-between;
361
+ align-items: center;
362
+ margin-bottom: 16px;
363
+ }}
364
+ .summary-header h2 {{
365
+ font-size: 1rem;
366
+ font-weight: 600;
367
+ }}
368
+ .summary-meta {{
369
+ font-size: 0.75rem;
370
+ color: var(--text-secondary);
371
+ font-family: "SF Mono", Monaco, monospace;
372
+ }}
373
+ .summary-stats {{
374
+ display: grid;
375
+ grid-template-columns: repeat(auto-fit, minmax(140px, 1fr));
376
+ gap: 16px;
377
+ margin-bottom: 16px;
378
+ }}
379
+ .stat-card {{
380
+ background: var(--bg-tertiary);
381
+ border-radius: 8px;
382
+ padding: 16px;
383
+ }}
384
+ .stat-card .stat-value {{
385
+ font-size: 1.8rem;
386
+ font-weight: 600;
387
+ font-family: "SF Mono", Monaco, monospace;
388
+ }}
389
+ .stat-card .stat-value.success {{ color: var(--success); }}
390
+ .stat-card .stat-value.error {{ color: var(--error); }}
391
+ .stat-card .stat-label {{
392
+ font-size: 0.7rem;
393
+ color: var(--text-muted);
394
+ text-transform: uppercase;
395
+ letter-spacing: 0.05em;
396
+ margin-top: 4px;
397
+ }}
398
+
399
+ /* Domain breakdown */
400
+ .domain-breakdown {{
401
+ display: flex;
402
+ flex-wrap: wrap;
403
+ gap: 8px;
404
+ }}
405
+ .domain-tag {{
406
+ display: inline-flex;
407
+ align-items: center;
408
+ gap: 6px;
409
+ padding: 6px 12px;
410
+ background: var(--bg-tertiary);
411
+ border-radius: 6px;
412
+ font-size: 0.75rem;
413
+ }}
414
+ .domain-tag .domain-name {{
415
+ color: var(--text-primary);
416
+ }}
417
+ .domain-tag .domain-stats {{
418
+ font-family: "SF Mono", Monaco, monospace;
419
+ color: var(--text-secondary);
420
+ }}
421
+
422
+ /* Filters */
423
+ .filter-bar {{
424
+ display: flex;
425
+ gap: 16px;
426
+ padding: 12px 16px;
427
+ background: var(--bg-secondary);
428
+ border: 1px solid var(--border-color);
429
+ border-radius: 8px;
430
+ margin-bottom: 16px;
431
+ flex-wrap: wrap;
432
+ align-items: center;
433
+ }}
434
+ .filter-group {{
435
+ display: flex;
436
+ align-items: center;
437
+ gap: 8px;
438
+ }}
439
+ .filter-label {{
440
+ font-size: 0.7rem;
441
+ color: var(--text-muted);
442
+ text-transform: uppercase;
443
+ letter-spacing: 0.05em;
444
+ }}
445
+ .filter-select {{
446
+ padding: 8px 32px 8px 12px;
447
+ border-radius: 8px;
448
+ font-size: 0.85rem;
449
+ background: var(--bg-tertiary);
450
+ color: var(--text-primary);
451
+ border: 1px solid var(--border-color);
452
+ cursor: pointer;
453
+ appearance: none;
454
+ background-image: url("data:image/svg+xml,%3Csvg xmlns='http://www.w3.org/2000/svg' width='12' height='12' viewBox='0 0 12 12'%3E%3Cpath fill='%23888' d='M3 4.5L6 7.5L9 4.5'/%3E%3C/svg%3E");
455
+ background-repeat: no-repeat;
456
+ background-position: right 10px center;
457
+ transition: all 0.2s;
458
+ }}
459
+ .filter-select:hover {{ border-color: var(--accent); }}
460
+ .filter-count {{
461
+ font-size: 0.8rem;
462
+ color: var(--text-secondary);
463
+ margin-left: auto;
464
+ }}
465
+
466
+ /* Main Content Layout */
467
+ .main-content {{
468
+ display: grid;
469
+ grid-template-columns: 350px 1fr;
470
+ gap: 24px;
471
+ }}
472
+ @media (max-width: 1200px) {{
473
+ .main-content {{ grid-template-columns: 1fr; }}
474
+ }}
475
+
476
+ /* Task List */
477
+ .task-list {{
478
+ background: var(--bg-secondary);
479
+ border: 1px solid var(--border-color);
480
+ border-radius: 12px;
481
+ max-height: calc(100vh - 300px);
482
+ overflow-y: auto;
483
+ }}
484
+ .task-list-header {{
485
+ display: flex;
486
+ justify-content: space-between;
487
+ align-items: center;
488
+ padding: 14px 16px;
489
+ border-bottom: 1px solid var(--border-color);
490
+ position: sticky;
491
+ top: 0;
492
+ background: var(--bg-secondary);
493
+ z-index: 10;
494
+ }}
495
+ .task-list-header h3 {{
496
+ font-size: 0.9rem;
497
+ font-weight: 600;
498
+ }}
499
+ .task-item {{
500
+ padding: 12px 16px;
501
+ border-bottom: 1px solid var(--border-color);
502
+ cursor: pointer;
503
+ transition: background 0.2s;
504
+ }}
505
+ .task-item:hover {{ background: var(--bg-tertiary); }}
506
+ .task-item.active {{
507
+ background: var(--accent-dim);
508
+ border-left: 3px solid var(--accent);
509
+ }}
510
+ .task-item.hidden {{ display: none; }}
511
+ .task-item .task-header {{
512
+ display: flex;
513
+ justify-content: space-between;
514
+ align-items: center;
515
+ margin-bottom: 4px;
516
+ }}
517
+ .task-item .task-id {{
518
+ font-family: "SF Mono", Monaco, monospace;
519
+ font-size: 0.8rem;
520
+ font-weight: 600;
521
+ }}
522
+ .task-item .task-status {{
523
+ font-size: 0.7rem;
524
+ font-weight: 600;
525
+ padding: 2px 8px;
526
+ border-radius: 4px;
527
+ }}
528
+ .task-item .task-status.success {{
529
+ background: rgba(52, 211, 153, 0.2);
530
+ color: var(--success);
531
+ }}
532
+ .task-item .task-status.fail {{
533
+ background: rgba(255, 95, 95, 0.2);
534
+ color: var(--error);
535
+ }}
536
+ .task-item .task-info {{
537
+ font-size: 0.75rem;
538
+ color: var(--text-secondary);
539
+ }}
540
+ .task-item .task-domain {{
541
+ color: var(--accent);
542
+ }}
543
+
544
+ /* Task Detail Panel */
545
+ .task-detail {{
546
+ background: var(--bg-secondary);
547
+ border: 1px solid var(--border-color);
548
+ border-radius: 12px;
549
+ overflow: hidden;
550
+ }}
551
+ .task-detail-header {{
552
+ padding: 16px 20px;
553
+ border-bottom: 1px solid var(--border-color);
554
+ }}
555
+ .task-detail-header h2 {{
556
+ font-size: 1rem;
557
+ font-weight: 600;
558
+ margin-bottom: 8px;
559
+ }}
560
+ .task-detail-meta {{
561
+ font-size: 0.8rem;
562
+ color: var(--text-secondary);
563
+ line-height: 1.6;
564
+ }}
565
+ .task-detail-instruction {{
566
+ font-style: italic;
567
+ color: var(--text-primary);
568
+ margin-top: 8px;
569
+ padding: 10px;
570
+ background: var(--bg-tertiary);
571
+ border-radius: 6px;
572
+ font-size: 0.85rem;
573
+ }}
574
+
575
+ /* Step Viewer */
576
+ .step-viewer {{
577
+ display: grid;
578
+ grid-template-columns: 1fr 300px;
579
+ gap: 16px;
580
+ padding: 16px;
581
+ }}
582
+ @media (max-width: 900px) {{
583
+ .step-viewer {{ grid-template-columns: 1fr; }}
584
+ }}
585
+ .screenshot-container {{
586
+ position: relative;
587
+ background: #000;
588
+ border-radius: 8px;
589
+ overflow: hidden;
590
+ min-height: 400px;
591
+ display: flex;
592
+ align-items: center;
593
+ justify-content: center;
594
+ }}
595
+ .screenshot-container img {{
596
+ max-width: 100%;
597
+ max-height: 70vh;
598
+ object-fit: contain;
599
+ }}
600
+ .screenshot-placeholder {{
601
+ color: var(--text-muted);
602
+ font-size: 0.9rem;
603
+ }}
604
+ .click-marker {{
605
+ position: absolute;
606
+ width: 24px;
607
+ height: 24px;
608
+ border-radius: 50%;
609
+ transform: translate(-50%, -50%);
610
+ display: flex;
611
+ align-items: center;
612
+ justify-content: center;
613
+ font-size: 10px;
614
+ font-weight: bold;
615
+ pointer-events: none;
616
+ z-index: 100;
617
+ background: rgba(167, 139, 250, 0.4);
618
+ border: 2px solid #a78bfa;
619
+ color: #a78bfa;
620
+ }}
621
+
622
+ /* Step Controls */
623
+ .step-sidebar {{
624
+ display: flex;
625
+ flex-direction: column;
626
+ gap: 16px;
627
+ }}
628
+ .step-controls {{
629
+ display: flex;
630
+ gap: 8px;
631
+ flex-wrap: wrap;
632
+ align-items: center;
633
+ }}
634
+ .step-btn {{
635
+ padding: 8px 12px;
636
+ border: 1px solid var(--border-color);
637
+ background: var(--bg-tertiary);
638
+ color: var(--text-primary);
639
+ border-radius: 6px;
640
+ cursor: pointer;
641
+ font-size: 0.85rem;
642
+ min-width: 40px;
643
+ text-align: center;
644
+ transition: all 0.2s;
645
+ }}
646
+ .step-btn:hover {{ border-color: var(--accent); }}
647
+ .step-btn.primary {{ flex: 1; min-width: 60px; }}
648
+ .step-btn.active {{
649
+ background: var(--accent);
650
+ color: var(--bg-primary);
651
+ border-color: var(--accent);
652
+ }}
653
+ .step-progress {{
654
+ font-size: 0.8rem;
655
+ color: var(--text-secondary);
656
+ font-family: "SF Mono", Monaco, monospace;
657
+ }}
658
+
659
+ /* Step List */
660
+ .step-list {{
661
+ background: var(--bg-tertiary);
662
+ border-radius: 8px;
663
+ max-height: 300px;
664
+ overflow-y: auto;
665
+ }}
666
+ .step-list-item {{
667
+ padding: 10px 12px;
668
+ border-bottom: 1px solid var(--border-color);
669
+ cursor: pointer;
670
+ transition: background 0.2s;
671
+ font-size: 0.8rem;
672
+ }}
673
+ .step-list-item:hover {{ background: var(--bg-secondary); }}
674
+ .step-list-item.active {{
675
+ background: var(--accent-dim);
676
+ border-left: 2px solid var(--accent);
677
+ }}
678
+ .step-list-item .step-num {{
679
+ font-weight: 600;
680
+ color: var(--accent);
681
+ margin-right: 8px;
682
+ }}
683
+ .step-list-item .step-action {{
684
+ color: var(--text-secondary);
685
+ }}
686
+
687
+ /* Action Detail */
688
+ .action-detail {{
689
+ background: var(--bg-tertiary);
690
+ border-radius: 8px;
691
+ padding: 12px;
692
+ }}
693
+ .action-detail h4 {{
694
+ font-size: 0.8rem;
695
+ color: var(--text-muted);
696
+ text-transform: uppercase;
697
+ letter-spacing: 0.05em;
698
+ margin-bottom: 8px;
699
+ }}
700
+ .action-content {{
701
+ font-family: "SF Mono", Monaco, monospace;
702
+ font-size: 0.8rem;
703
+ color: var(--text-primary);
704
+ word-break: break-word;
705
+ }}
706
+ .reasoning-box {{
707
+ margin-top: 12px;
708
+ padding: 10px;
709
+ background: var(--bg-secondary);
710
+ border-radius: 6px;
711
+ font-size: 0.8rem;
712
+ color: var(--text-secondary);
713
+ line-height: 1.6;
714
+ max-height: 200px;
715
+ overflow-y: auto;
716
+ }}
717
+ .reasoning-box h4 {{
718
+ margin-bottom: 8px;
719
+ }}
720
+
721
+ /* Speed Control */
722
+ .speed-control {{
723
+ display: flex;
724
+ align-items: center;
725
+ gap: 6px;
726
+ margin-left: auto;
727
+ }}
728
+ .speed-control label {{
729
+ font-size: 0.7rem;
730
+ color: var(--text-muted);
731
+ text-transform: uppercase;
732
+ }}
733
+ .speed-control select {{
734
+ padding: 4px 8px;
735
+ border-radius: 4px;
736
+ background: var(--bg-tertiary);
737
+ color: var(--text-primary);
738
+ border: 1px solid var(--border-color);
739
+ font-size: 0.8rem;
740
+ cursor: pointer;
741
+ }}
742
+
743
+ /* Progress Bar */
744
+ .progress-bar {{
745
+ width: 100%;
746
+ height: 4px;
747
+ background: var(--bg-tertiary);
748
+ border-radius: 2px;
749
+ margin-top: 8px;
750
+ overflow: hidden;
751
+ cursor: pointer;
752
+ }}
753
+ .progress-bar .progress {{
754
+ height: 100%;
755
+ background: var(--accent);
756
+ transition: width 0.1s ease;
757
+ }}
758
+
759
+ /* No task selected state */
760
+ .no-task-selected {{
761
+ display: flex;
762
+ flex-direction: column;
763
+ align-items: center;
764
+ justify-content: center;
765
+ min-height: 400px;
766
+ color: var(--text-muted);
767
+ }}
768
+ .no-task-selected .icon {{
769
+ font-size: 3rem;
770
+ margin-bottom: 16px;
771
+ }}
772
+ .no-task-selected p {{
773
+ font-size: 0.9rem;
774
+ }}
775
+ </style>
776
+ </head>
777
+ <body>
778
+ {shared_header_html}
779
+
780
+ <div class="container">
781
+ <!-- Summary Panel -->
782
+ <div class="summary-panel">
783
+ <div class="summary-header">
784
+ <h2>Benchmark Results: {metadata.get("run_name", "Unknown")}</h2>
785
+ <div class="summary-meta">
786
+ <span>Model: {metadata.get("model_id", "unknown")}</span>
787
+ <span> | </span>
788
+ <span>Created: {metadata.get("created_at", "N/A")}</span>
789
+ </div>
790
+ </div>
791
+ <div class="summary-stats">
792
+ <div class="stat-card">
793
+ <div class="stat-value">{num_tasks}</div>
794
+ <div class="stat-label">Total Tasks</div>
795
+ </div>
796
+ <div class="stat-card">
797
+ <div class="stat-value success">{num_success}</div>
798
+ <div class="stat-label">Passed</div>
799
+ </div>
800
+ <div class="stat-card">
801
+ <div class="stat-value error">{num_tasks - num_success}</div>
802
+ <div class="stat-label">Failed</div>
803
+ </div>
804
+ <div class="stat-card">
805
+ <div class="stat-value {"success" if success_rate >= 50 else "error"}">{success_rate:.1f}%</div>
806
+ <div class="stat-label">Success Rate</div>
807
+ </div>
808
+ </div>
809
+ <div class="domain-breakdown" id="domain-breakdown"></div>
810
+ </div>
811
+
812
+ <!-- Filters -->
813
+ <div class="filter-bar">
814
+ <div class="filter-group">
815
+ <span class="filter-label">Domain:</span>
816
+ <select class="filter-select" id="domain-filter">
817
+ <option value="all">All Domains</option>
818
+ </select>
819
+ </div>
820
+ <div class="filter-group">
821
+ <span class="filter-label">Status:</span>
822
+ <select class="filter-select" id="status-filter">
823
+ <option value="all">All</option>
824
+ <option value="success">Passed</option>
825
+ <option value="fail">Failed</option>
826
+ </select>
827
+ </div>
828
+ <span class="filter-count" id="filter-count">{num_tasks} tasks</span>
829
+ </div>
830
+
831
+ <!-- Main Content -->
832
+ <div class="main-content">
833
+ <!-- Task List -->
834
+ <div class="task-list">
835
+ <div class="task-list-header">
836
+ <h3>Tasks</h3>
837
+ </div>
838
+ <div id="task-list-items"></div>
839
+ </div>
840
+
841
+ <!-- Task Detail Panel -->
842
+ <div class="task-detail" id="task-detail">
843
+ <div class="no-task-selected" id="no-task-selected">
844
+ <div class="icon">+</div>
845
+ <p>Select a task from the list to view details</p>
846
+ </div>
847
+ <div id="task-detail-content" style="display:none;"></div>
848
+ </div>
849
+ </div>
850
+ </div>
851
+
852
+ <script>
853
+ // Data from Python
854
+ const metadata = {metadata_json};
855
+ const summary = {summary_json};
856
+ const domainStats = {domain_stats_json};
857
+ const tasks = {tasks_json};
858
+ const embedScreenshots = {"true" if embed_screenshots else "false"};
859
+
860
+ let currentTaskIndex = -1;
861
+ let currentStepIndex = 0;
862
+ let isPlaying = false;
863
+ let playInterval = null;
864
+ let playSpeed = 1000;
865
+
866
+ // Initialize page
867
+ function init() {{
868
+ renderDomainBreakdown();
869
+ populateDomainFilter();
870
+ renderTaskList();
871
+ setupFilters();
872
+ }}
873
+
874
+ function renderDomainBreakdown() {{
875
+ const container = document.getElementById('domain-breakdown');
876
+ let html = '';
877
+ for (const [domain, stats] of Object.entries(domainStats)) {{
878
+ const rate = stats.total > 0 ? (stats.success / stats.total * 100).toFixed(0) : 0;
879
+ html += `
880
+ <div class="domain-tag">
881
+ <span class="domain-name">${{domain}}</span>
882
+ <span class="domain-stats">${{stats.success}}/${{stats.total}} (${{rate}}%)</span>
883
+ </div>
884
+ `;
885
+ }}
886
+ container.innerHTML = html;
887
+ }}
888
+
889
// Append one <option> per domain to #domain-filter, after its existing
// entries. Domains are added in the default Array.prototype.sort() order.
function populateDomainFilter() {{
    const dropdown = document.getElementById('domain-filter');
    Object.keys(domainStats).sort().forEach((name) => {{
        const entry = document.createElement('option');
        entry.textContent = name;
        entry.value = name;
        dropdown.appendChild(entry);
    }});
}}
898
+
899
+ function renderTaskList() {{
900
+ const container = document.getElementById('task-list-items');
901
+ let html = '';
902
+ tasks.forEach((task, idx) => {{
903
+ const def = task.definition || {{}};
904
+ const exec = task.execution || {{}};
905
+ const success = exec.success || false;
906
+ const domain = def.domain || 'unknown';
907
+ const numSteps = exec.num_steps || 0;
908
+
909
+ html += `
910
+ <div class="task-item" data-idx="${{idx}}" data-domain="${{domain}}" data-status="${{success ? 'success' : 'fail'}}" onclick="selectTask(${{idx}})">
911
+ <div class="task-header">
912
+ <span class="task-id">${{task.task_id}}</span>
913
+ <span class="task-status ${{success ? 'success' : 'fail'}}">${{success ? 'PASS' : 'FAIL'}}</span>
914
+ </div>
915
+ <div class="task-info">
916
+ <span class="task-domain">${{domain}}</span>
917
+ <span> | ${{numSteps}} steps</span>
918
+ </div>
919
+ </div>
920
+ `;
921
+ }});
922
+ container.innerHTML = html;
923
+ }}
924
+
925
// Re-run filterTasks() whenever either filter dropdown changes.
function setupFilters() {{
    for (const filterId of ['domain-filter', 'status-filter']) {{
        document.getElementById(filterId).addEventListener('change', filterTasks);
    }}
}}
929
+
930
// Show only the task rows matching both dropdowns ('all' matches anything)
// and update the "N tasks" counter with the number of visible rows.
function filterTasks() {{
    const wantedDomain = document.getElementById('domain-filter').value;
    const wantedStatus = document.getElementById('status-filter').value;

    let shown = 0;
    document.querySelectorAll('.task-item').forEach((row) => {{
        const domainOk = wantedDomain === 'all' || row.dataset.domain === wantedDomain;
        const statusOk = wantedStatus === 'all' || row.dataset.status === wantedStatus;
        const visible = domainOk && statusOk;

        // Rows are hidden via the 'hidden' class rather than removed.
        row.classList.toggle('hidden', !visible);
        if (visible) {{
            shown += 1;
        }}
    }});

    document.getElementById('filter-count').textContent = `${{shown}} tasks`;
}}
952
+
953
// Make the task at `idx` (an index into `tasks`) the current one: highlight
// its row, reveal the detail panel and render it starting from step 0.
function selectTask(idx) {{
    // Stop any auto-advance still running for the previously selected task.
    // The detail panel is rebuilt below with a fresh "Play" button, so a
    // timer left running would keep stepping while the button claims to be
    // paused. The state is reset inline because the old button may be gone.
    if (playInterval) {{
        clearInterval(playInterval);
        playInterval = null;
    }}
    isPlaying = false;

    currentTaskIndex = idx;
    currentStepIndex = 0;

    // Update active state in list
    document.querySelectorAll('.task-item').forEach((item) => {{
        item.classList.toggle('active', parseInt(item.dataset.idx, 10) === idx);
    }});

    // Swap the placeholder for the detail view
    document.getElementById('no-task-selected').style.display = 'none';
    document.getElementById('task-detail-content').style.display = 'block';

    renderTaskDetail();
}}
968
+
969
+ function renderTaskDetail() {{
970
+ if (currentTaskIndex < 0) return;
971
+
972
+ const task = tasks[currentTaskIndex];
973
+ const def = task.definition || {{}};
974
+ const exec = task.execution || {{}};
975
+ const steps = exec.steps || [];
976
+ const success = exec.success || false;
977
+
978
+ const container = document.getElementById('task-detail-content');
979
+ container.innerHTML = `
980
+ <div class="task-detail-header">
981
+ <h2>${{task.task_id}} - <span style="color: ${{success ? 'var(--success)' : 'var(--error)'}}">${{success ? 'PASSED' : 'FAILED'}}</span></h2>
982
+ <div class="task-detail-meta">
983
+ Domain: <strong>${{def.domain || 'unknown'}}</strong> |
984
+ Steps: <strong>${{exec.num_steps || steps.length}}</strong> |
985
+ Time: <strong>${{(exec.total_time_seconds || 0).toFixed(1)}}s</strong>
986
+ ${{exec.error ? `<br>Error: <span style="color:var(--error)">${{exec.error}}</span>` : ''}}
987
+ </div>
988
+ <div class="task-detail-instruction">
989
+ ${{def.instruction || 'No instruction available'}}
990
+ </div>
991
+ </div>
992
+ <div class="step-viewer">
993
+ <div class="screenshot-container" id="screenshot-container">
994
+ ${{steps.length > 0 ? '<img id="screenshot-img" src="" alt="Step screenshot">' : '<span class="screenshot-placeholder">No screenshots available</span>'}}
995
+ </div>
996
+ <div class="step-sidebar">
997
+ <div class="step-controls">
998
+ <button class="step-btn" onclick="prevStep()">Prev</button>
999
+ <button class="step-btn primary" id="play-btn" onclick="togglePlay()">Play</button>
1000
+ <button class="step-btn" onclick="nextStep()">Next</button>
1001
+ <span class="step-progress" id="step-progress">0 / ${{steps.length}}</span>
1002
+ <div class="speed-control">
1003
+ <label>Speed</label>
1004
+ <select id="speed-select" onchange="changeSpeed(this.value)">
1005
+ <option value="2000">0.5x</option>
1006
+ <option value="1000" selected>1x</option>
1007
+ <option value="500">2x</option>
1008
+ <option value="250">4x</option>
1009
+ </select>
1010
+ </div>
1011
+ </div>
1012
+ <div class="progress-bar" onclick="seekStep(event)">
1013
+ <div class="progress" id="step-progress-bar" style="width: 0%"></div>
1014
+ </div>
1015
+ <div class="step-list" id="step-list"></div>
1016
+ <div class="action-detail" id="action-detail">
1017
+ <h4>Action</h4>
1018
+ <div class="action-content" id="action-content">-</div>
1019
+ </div>
1020
+ <div class="reasoning-box" id="reasoning-box" style="display:none;">
1021
+ <h4>Reasoning</h4>
1022
+ <div id="reasoning-content"></div>
1023
+ </div>
1024
+ </div>
1025
+ </div>
1026
+ `;
1027
+
1028
+ renderStepList();
1029
+ if (steps.length > 0) {{
1030
+ updateStep();
1031
+ }}
1032
+ }}
1033
+
1034
+ function renderStepList() {{
1035
+ if (currentTaskIndex < 0) return;
1036
+
1037
+ const task = tasks[currentTaskIndex];
1038
+ const steps = task.execution?.steps || [];
1039
+ const container = document.getElementById('step-list');
1040
+
1041
+ let html = '';
1042
+ steps.forEach((step, idx) => {{
1043
+ const action = step.action || {{}};
1044
+ const actionType = action.type || 'unknown';
1045
+ html += `
1046
+ <div class="step-list-item ${{idx === currentStepIndex ? 'active' : ''}}" onclick="goToStep(${{idx}})">
1047
+ <span class="step-num">#${{idx}}</span>
1048
+ <span class="step-action">${{actionType.toUpperCase()}}</span>
1049
+ </div>
1050
+ `;
1051
+ }});
1052
+ container.innerHTML = html || '<div style="padding:12px;color:var(--text-muted);">No steps</div>';
1053
+ }}
1054
+
1055
+ function updateStep() {{
1056
+ if (currentTaskIndex < 0) return;
1057
+
1058
+ const task = tasks[currentTaskIndex];
1059
+ const steps = task.execution?.steps || [];
1060
+ const screenshots = task.screenshots || [];
1061
+
1062
+ if (steps.length === 0) return;
1063
+
1064
+ const step = steps[currentStepIndex] || {{}};
1065
+ const action = step.action || {{}};
1066
+
1067
+ // Update screenshot
1068
+ const img = document.getElementById('screenshot-img');
1069
+ if (img) {{
1070
+ if (embedScreenshots && task.embedded_screenshots && task.embedded_screenshots[currentStepIndex]) {{
1071
+ img.src = task.embedded_screenshots[currentStepIndex];
1072
+ }} else if (screenshots[currentStepIndex]) {{
1073
+ img.src = screenshots[currentStepIndex];
1074
+ }} else if (step.screenshot_path) {{
1075
+ img.src = step.screenshot_path;
1076
+ }} else {{
1077
+ img.src = '';
1078
+ }}
1079
+ }}
1080
+
1081
+ // Update click marker if action has coordinates
1082
+ const container = document.getElementById('screenshot-container');
1083
+ // Remove existing markers
1084
+ container.querySelectorAll('.click-marker').forEach(m => m.remove());
1085
+
1086
+ if (action.x !== null && action.y !== null && action.x !== undefined && action.y !== undefined) {{
1087
+ const marker = document.createElement('div');
1088
+ marker.className = 'click-marker';
1089
+ marker.style.left = `${{action.x * 100}}%`;
1090
+ marker.style.top = `${{action.y * 100}}%`;
1091
+ marker.textContent = 'AI';
1092
+ container.appendChild(marker);
1093
+ }}
1094
+
1095
+ // Update progress
1096
+ document.getElementById('step-progress').textContent = `${{currentStepIndex + 1}} / ${{steps.length}}`;
1097
+ const progressPct = steps.length > 1 ? (currentStepIndex / (steps.length - 1)) * 100 : 0;
1098
+ document.getElementById('step-progress-bar').style.width = `${{progressPct}}%`;
1099
+
1100
+ // Update action detail
1101
+ const actionContent = document.getElementById('action-content');
1102
+ let actionText = action.type ? action.type.toUpperCase() : 'unknown';
1103
+ if (action.x !== null && action.y !== null && action.x !== undefined && action.y !== undefined) {{
1104
+ actionText += ` (${{(action.x * 100).toFixed(1)}}%, ${{(action.y * 100).toFixed(1)}}%)`;
1105
+ }}
1106
+ if (action.text) {{
1107
+ actionText += ` "${{action.text}}"`;
1108
+ }}
1109
+ if (action.key) {{
1110
+ actionText += ` [${{action.key}}]`;
1111
+ }}
1112
+ actionContent.textContent = actionText;
1113
+
1114
+ // Update reasoning
1115
+ const reasoningBox = document.getElementById('reasoning-box');
1116
+ const reasoningContent = document.getElementById('reasoning-content');
1117
+ if (step.reasoning) {{
1118
+ reasoningBox.style.display = 'block';
1119
+ reasoningContent.textContent = step.reasoning;
1120
+ }} else {{
1121
+ reasoningBox.style.display = 'none';
1122
+ }}
1123
+
1124
+ // Update step list active state
1125
+ document.querySelectorAll('.step-list-item').forEach((item, idx) => {{
1126
+ item.classList.toggle('active', idx === currentStepIndex);
1127
+ }});
1128
+ }}
1129
+
1130
// Step back by one; does nothing when already at the first step.
function prevStep() {{
    if (!(currentStepIndex > 0)) return;
    currentStepIndex -= 1;
    updateStep();
}}
1136
+
1137
// Advance by one step. At the last step (or with no steps) the index stays
// put, and a running auto-advance is stopped.
function nextStep() {{
    const stepCount = (tasks[currentTaskIndex]?.execution?.steps || []).length;
    if (currentStepIndex < stepCount - 1) {{
        currentStepIndex += 1;
        updateStep();
        return;
    }}
    if (isPlaying) {{
        stopPlay();
    }}
}}
1147
+
1148
// Jump to step `idx` of the current task and refresh the viewer.
// The index is clamped to the valid range; with no steps (or no task
// selected) the call is a no-op, so callers such as the End shortcut cannot
// leave currentStepIndex at -1.
function goToStep(idx) {{
    const steps = tasks[currentTaskIndex]?.execution?.steps || [];
    if (steps.length === 0) return;

    currentStepIndex = Math.max(0, Math.min(idx, steps.length - 1));
    updateStep();
}}
1152
+
1153
+ function seekStep(event) {{
1154
+ const task = tasks[currentTaskIndex];
1155
+ const steps = task?.execution?.steps || [];
1156
+ if (steps.length === 0) return;
1157
+
1158
+ const bar = event.currentTarget;
1159
+ const rect = bar.getBoundingClientRect();
1160
+ const pct = (event.clientX - rect.left) / rect.width;
1161
+ currentStepIndex = Math.floor(pct * steps.length);
1162
+ currentStepIndex = Math.max(0, Math.min(currentStepIndex, steps.length - 1));
1163
+ updateStep();
1164
+ }}
1165
+
1166
// Flip auto-advance: stop it when running, start it otherwise.
function togglePlay() {{
    const toggle = isPlaying ? stopPlay : startPlay;
    toggle();
}}
1173
+
1174
// Start auto-advancing through the steps, calling nextStep() every
// `playSpeed` milliseconds, and switch the play button to its "Pause" state.
function startPlay() {{
    // The play button only exists once a task's detail panel has been
    // rendered. Without it there is nothing to play, and bailing out here
    // keeps isPlaying from being left true after a failed start.
    const button = document.getElementById('play-btn');
    if (!button) return;

    // Never run two timers at once: a second call replaces the first.
    if (playInterval) {{
        clearInterval(playInterval);
    }}

    isPlaying = true;
    button.textContent = 'Pause';
    button.classList.add('active');
    playInterval = setInterval(nextStep, playSpeed);
}}
1180
+
1181
// Stop auto-advance: reset the play button to "Play" and cancel the timer.
function stopPlay() {{
    isPlaying = false;

    const button = document.getElementById('play-btn');
    button.textContent = 'Play';
    button.classList.remove('active');

    if (!playInterval) return;
    clearInterval(playInterval);
    playInterval = null;
}}
1190
+
1191
// Apply a new auto-advance delay (the dropdown's value, in milliseconds).
// A running playback is restarted so the new delay takes effect at once.
function changeSpeed(value) {{
    playSpeed = parseInt(value);
    if (!isPlaying) return;
    stopPlay();
    startPlay();
}}
1198
+
1199
+ // Keyboard shortcuts
1200
+ document.addEventListener('keydown', (e) => {{
1201
+ if (e.target.tagName === 'INPUT' || e.target.tagName === 'TEXTAREA') return;
1202
+
1203
+ switch (e.key) {{
1204
+ case ' ':
1205
+ e.preventDefault();
1206
+ togglePlay();
1207
+ break;
1208
+ case 'ArrowLeft':
1209
+ e.preventDefault();
1210
+ prevStep();
1211
+ break;
1212
+ case 'ArrowRight':
1213
+ e.preventDefault();
1214
+ nextStep();
1215
+ break;
1216
+ case 'Home':
1217
+ e.preventDefault();
1218
+ goToStep(0);
1219
+ break;
1220
+ case 'End':
1221
+ e.preventDefault();
1222
+ const task = tasks[currentTaskIndex];
1223
+ const steps = task?.execution?.steps || [];
1224
+ goToStep(steps.length - 1);
1225
+ break;
1226
+ }}
1227
+ }});
1228
+
1229
+ // Initialize on load
1230
+ document.addEventListener('DOMContentLoaded', init);
1231
+ </script>
1232
+ </body>
1233
+ </html>
1234
+ """
1235
+
1236
+ return html