openadapt-ml 0.1.0__py3-none-any.whl → 0.2.1__py3-none-any.whl

This diff shows the changes between publicly released versions of the package, as published to one of the supported registries. It is provided for informational purposes only and reflects the package contents exactly as they appear in their respective public registries.
Files changed (112)
  1. openadapt_ml/baselines/__init__.py +121 -0
  2. openadapt_ml/baselines/adapter.py +185 -0
  3. openadapt_ml/baselines/cli.py +314 -0
  4. openadapt_ml/baselines/config.py +448 -0
  5. openadapt_ml/baselines/parser.py +922 -0
  6. openadapt_ml/baselines/prompts.py +787 -0
  7. openadapt_ml/benchmarks/__init__.py +13 -107
  8. openadapt_ml/benchmarks/agent.py +297 -374
  9. openadapt_ml/benchmarks/azure.py +62 -24
  10. openadapt_ml/benchmarks/azure_ops_tracker.py +521 -0
  11. openadapt_ml/benchmarks/cli.py +1874 -751
  12. openadapt_ml/benchmarks/trace_export.py +631 -0
  13. openadapt_ml/benchmarks/viewer.py +1236 -0
  14. openadapt_ml/benchmarks/vm_monitor.py +1111 -0
  15. openadapt_ml/benchmarks/waa_deploy/Dockerfile +216 -0
  16. openadapt_ml/benchmarks/waa_deploy/__init__.py +10 -0
  17. openadapt_ml/benchmarks/waa_deploy/api_agent.py +540 -0
  18. openadapt_ml/benchmarks/waa_deploy/start_waa_server.bat +53 -0
  19. openadapt_ml/cloud/azure_inference.py +3 -5
  20. openadapt_ml/cloud/lambda_labs.py +722 -307
  21. openadapt_ml/cloud/local.py +3194 -89
  22. openadapt_ml/cloud/ssh_tunnel.py +595 -0
  23. openadapt_ml/datasets/next_action.py +125 -96
  24. openadapt_ml/evals/grounding.py +32 -9
  25. openadapt_ml/evals/plot_eval_metrics.py +15 -13
  26. openadapt_ml/evals/trajectory_matching.py +120 -57
  27. openadapt_ml/experiments/demo_prompt/__init__.py +19 -0
  28. openadapt_ml/experiments/demo_prompt/format_demo.py +236 -0
  29. openadapt_ml/experiments/demo_prompt/results/experiment_20251231_002125.json +83 -0
  30. openadapt_ml/experiments/demo_prompt/results/experiment_n30_20251231_165958.json +1100 -0
  31. openadapt_ml/experiments/demo_prompt/results/multistep_20251231_025051.json +182 -0
  32. openadapt_ml/experiments/demo_prompt/run_experiment.py +541 -0
  33. openadapt_ml/experiments/representation_shootout/__init__.py +70 -0
  34. openadapt_ml/experiments/representation_shootout/conditions.py +708 -0
  35. openadapt_ml/experiments/representation_shootout/config.py +390 -0
  36. openadapt_ml/experiments/representation_shootout/evaluator.py +659 -0
  37. openadapt_ml/experiments/representation_shootout/runner.py +687 -0
  38. openadapt_ml/experiments/waa_demo/__init__.py +10 -0
  39. openadapt_ml/experiments/waa_demo/demos.py +357 -0
  40. openadapt_ml/experiments/waa_demo/runner.py +732 -0
  41. openadapt_ml/experiments/waa_demo/tasks.py +151 -0
  42. openadapt_ml/export/__init__.py +9 -0
  43. openadapt_ml/export/__main__.py +6 -0
  44. openadapt_ml/export/cli.py +89 -0
  45. openadapt_ml/export/parquet.py +277 -0
  46. openadapt_ml/grounding/detector.py +18 -14
  47. openadapt_ml/ingest/__init__.py +11 -10
  48. openadapt_ml/ingest/capture.py +97 -86
  49. openadapt_ml/ingest/loader.py +120 -69
  50. openadapt_ml/ingest/synthetic.py +344 -193
  51. openadapt_ml/models/api_adapter.py +14 -4
  52. openadapt_ml/models/base_adapter.py +10 -2
  53. openadapt_ml/models/providers/__init__.py +288 -0
  54. openadapt_ml/models/providers/anthropic.py +266 -0
  55. openadapt_ml/models/providers/base.py +299 -0
  56. openadapt_ml/models/providers/google.py +376 -0
  57. openadapt_ml/models/providers/openai.py +342 -0
  58. openadapt_ml/models/qwen_vl.py +46 -19
  59. openadapt_ml/perception/__init__.py +35 -0
  60. openadapt_ml/perception/integration.py +399 -0
  61. openadapt_ml/retrieval/README.md +226 -0
  62. openadapt_ml/retrieval/USAGE.md +391 -0
  63. openadapt_ml/retrieval/__init__.py +91 -0
  64. openadapt_ml/retrieval/demo_retriever.py +843 -0
  65. openadapt_ml/retrieval/embeddings.py +630 -0
  66. openadapt_ml/retrieval/index.py +194 -0
  67. openadapt_ml/retrieval/retriever.py +162 -0
  68. openadapt_ml/runtime/__init__.py +50 -0
  69. openadapt_ml/runtime/policy.py +27 -14
  70. openadapt_ml/runtime/safety_gate.py +471 -0
  71. openadapt_ml/schema/__init__.py +113 -0
  72. openadapt_ml/schema/converters.py +588 -0
  73. openadapt_ml/schema/episode.py +470 -0
  74. openadapt_ml/scripts/capture_screenshots.py +530 -0
  75. openadapt_ml/scripts/compare.py +102 -61
  76. openadapt_ml/scripts/demo_policy.py +4 -1
  77. openadapt_ml/scripts/eval_policy.py +19 -14
  78. openadapt_ml/scripts/make_gif.py +1 -1
  79. openadapt_ml/scripts/prepare_synthetic.py +16 -17
  80. openadapt_ml/scripts/train.py +98 -75
  81. openadapt_ml/segmentation/README.md +920 -0
  82. openadapt_ml/segmentation/__init__.py +97 -0
  83. openadapt_ml/segmentation/adapters/__init__.py +5 -0
  84. openadapt_ml/segmentation/adapters/capture_adapter.py +420 -0
  85. openadapt_ml/segmentation/annotator.py +610 -0
  86. openadapt_ml/segmentation/cache.py +290 -0
  87. openadapt_ml/segmentation/cli.py +674 -0
  88. openadapt_ml/segmentation/deduplicator.py +656 -0
  89. openadapt_ml/segmentation/frame_describer.py +788 -0
  90. openadapt_ml/segmentation/pipeline.py +340 -0
  91. openadapt_ml/segmentation/schemas.py +622 -0
  92. openadapt_ml/segmentation/segment_extractor.py +634 -0
  93. openadapt_ml/training/azure_ops_viewer.py +1097 -0
  94. openadapt_ml/training/benchmark_viewer.py +3255 -19
  95. openadapt_ml/training/shared_ui.py +7 -7
  96. openadapt_ml/training/stub_provider.py +57 -35
  97. openadapt_ml/training/trainer.py +255 -441
  98. openadapt_ml/training/trl_trainer.py +403 -0
  99. openadapt_ml/training/viewer.py +323 -108
  100. openadapt_ml/training/viewer_components.py +180 -0
  101. {openadapt_ml-0.1.0.dist-info → openadapt_ml-0.2.1.dist-info}/METADATA +312 -69
  102. openadapt_ml-0.2.1.dist-info/RECORD +116 -0
  103. openadapt_ml/benchmarks/base.py +0 -366
  104. openadapt_ml/benchmarks/data_collection.py +0 -432
  105. openadapt_ml/benchmarks/runner.py +0 -381
  106. openadapt_ml/benchmarks/waa.py +0 -704
  107. openadapt_ml/schemas/__init__.py +0 -53
  108. openadapt_ml/schemas/sessions.py +0 -122
  109. openadapt_ml/schemas/validation.py +0 -252
  110. openadapt_ml-0.1.0.dist-info/RECORD +0 -55
  111. {openadapt_ml-0.1.0.dist-info → openadapt_ml-0.2.1.dist-info}/WHEEL +0 -0
  112. {openadapt_ml-0.1.0.dist-info → openadapt_ml-0.2.1.dist-info}/licenses/LICENSE +0 -0
@@ -27,7 +27,6 @@ import http.server
  import json
  import os
  import shutil
- import signal
  import socketserver
  import subprocess
  import sys
@@ -36,6 +35,8 @@ import webbrowser
  from pathlib import Path
  from typing import Any

+ from openadapt_ml.cloud.ssh_tunnel import get_tunnel_manager
+
  # Training output directory
  TRAINING_OUTPUT = Path("training_output")

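Note: the new `get_tunnel_manager` import is consumed further down in this diff by the `/api/tunnels` and `/api/azure-ops-status` handlers. A minimal sketch of that usage, assuming only what those handlers themselves rely on (a `get_tunnel_status()` mapping of tunnel name to a status object with `active`, `local_port`, `remote_endpoint`, `pid`, and `error` attributes):

    from openadapt_ml.cloud.ssh_tunnel import get_tunnel_manager

    # One line per tunnel, mirroring the fields the dashboard
    # handlers serialize to JSON.
    for name, s in get_tunnel_manager().get_tunnel_status().items():
        print(f"{name}: active={s.active} port={s.local_port} pid={s.pid} err={s.error}")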
@@ -107,7 +108,10 @@ def _is_mock_benchmark(benchmark_dir: Path) -> bool:

      # Check for test runs (but allow waa-mock evaluations with real API models)
      # Only filter out purely synthetic test data directories
-     if any(term in benchmark_dir.name.lower() for term in ["test_run", "test_cli", "quick_demo"]):
+     if any(
+         term in benchmark_dir.name.lower()
+         for term in ["test_run", "test_cli", "quick_demo"]
+     ):
          return True

      return False
@@ -143,6 +147,16 @@ def _regenerate_benchmark_viewer_if_available(output_dir: Path) -> bool:
      # No real benchmark data - generate empty state viewer
      try:
          generate_empty_benchmark_viewer(benchmark_html_path)
+
+         # Still create symlink for azure_jobs.json access (even without real benchmarks)
+         if benchmark_results_dir.exists():
+             benchmark_results_link = output_dir / "benchmark_results"
+             if benchmark_results_link.is_symlink():
+                 benchmark_results_link.unlink()
+             elif benchmark_results_link.exists():
+                 shutil.rmtree(benchmark_results_link)
+             benchmark_results_link.symlink_to(benchmark_results_dir.absolute())
+
          print(" Generated benchmark viewer: No real evaluation data yet")
          return True
      except Exception as e:
@@ -168,11 +182,20 @@ def _regenerate_benchmark_viewer_if_available(output_dir: Path) -> bool:
              tasks_dst = benchmark_tasks_dir / benchmark_dir.name
              shutil.copytree(tasks_src, tasks_dst)

+         # Create symlink for benchmark_results directory (for azure_jobs.json access)
+         benchmark_results_link = output_dir / "benchmark_results"
+         if benchmark_results_link.is_symlink():
+             benchmark_results_link.unlink()
+         elif benchmark_results_link.exists():
+             shutil.rmtree(benchmark_results_link)
+         benchmark_results_link.symlink_to(benchmark_results_dir.absolute())
+
          print(f" Regenerated benchmark viewer with {len(real_benchmarks)} run(s)")
          return True
      except Exception as e:
          print(f" Could not regenerate benchmark viewer: {e}")
          import traceback
+
          traceback.print_exc()
          return False

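Note: the two hunks above repeat the same replace-then-link idiom. A self-contained sketch of the pattern, with a hypothetical helper name that is not part of the package:

    import shutil
    from pathlib import Path

    def replace_with_symlink(link: Path, target: Path) -> None:
        # A stale symlink is unlinked; a real directory left behind by an
        # older run is removed; either way symlink_to() then succeeds.
        if link.is_symlink():
            link.unlink()
        elif link.exists():
            shutil.rmtree(link)
        link.symlink_to(target.absolute())

    # Illustrative paths matching the diff's usage:
    replace_with_symlink(Path("training_output/benchmark_results"), Path("benchmark_results"))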
@@ -181,6 +204,7 @@ def detect_device() -> str:
      """Detect available compute device."""
      try:
          import torch
+
          if torch.cuda.is_available():
              device_name = torch.cuda.get_device_name(0)
              return f"cuda ({device_name})"
@@ -231,10 +255,13 @@ def get_training_status() -> dict[str, Any]:
      # Find checkpoints
      checkpoints_dir = Path("checkpoints")
      if checkpoints_dir.exists():
-         status["checkpoints"] = sorted([
-             d.name for d in checkpoints_dir.iterdir()
-             if d.is_dir() and (d / "adapter_config.json").exists()
-         ])
+         status["checkpoints"] = sorted(
+             [
+                 d.name
+                 for d in checkpoints_dir.iterdir()
+                 if d.is_dir() and (d / "adapter_config.json").exists()
+             ]
+         )

      return status

@@ -244,9 +271,9 @@ def cmd_status(args: argparse.Namespace) -> int:
      status = get_training_status()
      current_dir = get_current_output_dir()

-     print(f"\n{'='*50}")
+     print(f"\n{'=' * 50}")
      print("LOCAL TRAINING STATUS")
-     print(f"{'='*50}")
+     print(f"{'=' * 50}")
      print(f"Device: {status['device']}")
      print(f"Status: {'RUNNING' if status['running'] else 'IDLE'}")
      if status.get("job_id"):
@@ -254,7 +281,7 @@ def cmd_status(args: argparse.Namespace) -> int:
      print(f"Output: {current_dir}")

      if status.get("epoch"):
-         print(f"\nProgress:")
+         print("\nProgress:")
          print(f" Epoch: {status['epoch']}")
          print(f" Step: {status['step']}")
          if status.get("loss"):
@@ -267,7 +294,9 @@ def cmd_status(args: argparse.Namespace) -> int:
          for cp in status["checkpoints"][-5:]:  # Show last 5
              print(f" - {cp}")

-     print(f"\nDashboard: {'✓' if status['has_dashboard'] else '✗'} {current_dir}/dashboard.html")
+     print(
+         f"\nDashboard: {'✓' if status['has_dashboard'] else '✗'} {current_dir}/dashboard.html"
+     )
      print(f"Viewer: {'✓' if status['has_viewer'] else '✗'} {current_dir}/viewer.html")
      print()

@@ -300,9 +329,9 @@ def cmd_train(args: argparse.Namespace) -> int:
          print(f"Error: Config not found: {config_path}")
          return 1

-     print(f"\n{'='*50}")
+     print(f"\n{'=' * 50}")
      print("STARTING LOCAL TRAINING")
-     print(f"{'='*50}")
+     print(f"{'=' * 50}")
      print(f"Capture: {capture_path}")
      print(f"Goal: {goal}")
      print(f"Config: {config}")
@@ -311,10 +340,15 @@ def cmd_train(args: argparse.Namespace) -> int:

      # Build command
      cmd = [
-         sys.executable, "-m", "openadapt_ml.scripts.train",
-         "--config", str(config_path),
-         "--capture", str(capture_path),
-         "--goal", goal,
+         sys.executable,
+         "-m",
+         "openadapt_ml.scripts.train",
+         "--config",
+         str(config_path),
+         "--capture",
+         str(capture_path),
+         "--goal",
+         goal,
      ]

      if args.open:
@@ -333,14 +367,16 @@ def cmd_check(args: argparse.Namespace) -> int:
      """Check training health and early stopping analysis."""
      status = get_training_status()

-     print(f"\n{'='*50}")
+     print(f"\n{'=' * 50}")
      print("TRAINING HEALTH CHECK")
-     print(f"{'='*50}")
+     print(f"{'=' * 50}")

      raw_losses = status.get("losses", [])
      if not raw_losses:
          print("No training data found.")
-         print("Run training first with: uv run python -m openadapt_ml.cloud.local train --capture <path>")
+         print(
+             "Run training first with: uv run python -m openadapt_ml.cloud.local train --capture <path>"
+         )
          return 1

      # Extract loss values (handle both dict and float formats)
@@ -361,7 +397,7 @@ def cmd_check(args: argparse.Namespace) -> int:
      min_loss = min(losses)
      max_loss = max(losses)

-     print(f"\nLoss progression:")
+     print("\nLoss progression:")
      print(f" First: {first_loss:.4f}")
      print(f" Last: {last_loss:.4f}")
      print(f" Min: {min_loss:.4f}")
@@ -372,9 +408,11 @@ def cmd_check(args: argparse.Namespace) -> int:
      if len(losses) >= 10:
          recent = losses[-10:]
          recent_avg = sum(recent) / len(recent)
-         recent_std = (sum((x - recent_avg) ** 2 for x in recent) / len(recent)) ** 0.5
+         recent_std = (
+             sum((x - recent_avg) ** 2 for x in recent) / len(recent)
+         ) ** 0.5

-         print(f"\nRecent stability (last 10 steps):")
+         print("\nRecent stability (last 10 steps):")
          print(f" Avg loss: {recent_avg:.4f}")
          print(f" Std dev: {recent_std:.4f}")

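Note: the stability check above is the population standard deviation of the last 10 losses. A worked sketch of the same arithmetic with illustrative values (not from a real run):

    losses = [0.52, 0.50, 0.51, 0.49, 0.50, 0.50, 0.51, 0.50, 0.49, 0.50]
    recent_avg = sum(losses) / len(losses)  # 0.502
    recent_std = (sum((x - recent_avg) ** 2 for x in losses) / len(losses)) ** 0.5
    # A std dev that is tiny relative to the mean suggests the loss has
    # plateaued, which is what the health check reports.
    print(f"avg={recent_avg:.4f} std={recent_std:.4f}")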
@@ -393,14 +431,18 @@ def cmd_serve(args: argparse.Namespace) -> int:
      """Start local web server for dashboard.

      Automatically regenerates dashboard and viewer before serving to ensure
-     the latest code and data are reflected.
+     the latest code and data are reflected. Also ensures the 'current' symlink
+     points to the most recent training run.
      """
-     from openadapt_ml.training.trainer import regenerate_local_dashboard
+     from openadapt_ml.training.trainer import (
+         regenerate_local_dashboard,
+         update_current_symlink_to_latest,
+     )

      port = args.port

      # Determine what to serve: benchmark directory or training output
-     if hasattr(args, 'benchmark') and args.benchmark:
+     if hasattr(args, "benchmark") and args.benchmark:
          serve_dir = Path(args.benchmark).expanduser().resolve()
          if not serve_dir.exists():
              print(f"Error: Benchmark directory not found: {serve_dir}")
@@ -410,7 +452,10 @@ def cmd_serve(args: argparse.Namespace) -> int:
          if not args.no_regenerate:
              print("Regenerating benchmark viewer...")
              try:
-                 from openadapt_ml.training.benchmark_viewer import generate_benchmark_viewer
+                 from openadapt_ml.training.benchmark_viewer import (
+                     generate_benchmark_viewer,
+                 )
+
                  generate_benchmark_viewer(serve_dir)
              except Exception as e:
                  print(f"Warning: Could not regenerate benchmark viewer: {e}")
@@ -419,6 +464,17 @@ def cmd_serve(args: argparse.Namespace) -> int:
      else:
          serve_dir = get_current_output_dir()

+         # If current symlink doesn't exist or is broken, update to latest run
+         if not serve_dir.exists() or not serve_dir.is_dir():
+             print("Updating 'current' symlink to latest training run...")
+             latest = update_current_symlink_to_latest()
+             if latest:
+                 serve_dir = get_current_output_dir()
+                 print(f" Updated to: {latest.name}")
+             else:
+                 print(f"Error: {serve_dir} not found. Run training first.")
+                 return 1
+
      if not serve_dir.exists():
          print(f"Error: {serve_dir} not found. Run training first.")
          return 1
@@ -427,7 +483,9 @@ def cmd_serve(args: argparse.Namespace) -> int:
      if not args.no_regenerate:
          print("Regenerating dashboard and viewer...")
          try:
-             regenerate_local_dashboard(str(serve_dir))
+             # Use keep_polling=True so JavaScript fetches live data from training_log.json
+             # This ensures the dashboard shows current data instead of stale embedded data
+             regenerate_local_dashboard(str(serve_dir), keep_polling=True)
              # Also regenerate viewer if comparison data exists
              _regenerate_viewer_if_possible(serve_dir)
          except Exception as e:
@@ -436,8 +494,23 @@ def cmd_serve(args: argparse.Namespace) -> int:
      # Also regenerate benchmark viewer from latest benchmark results
      _regenerate_benchmark_viewer_if_available(serve_dir)

+     # Generate Azure ops dashboard
+     try:
+         from openadapt_ml.training.azure_ops_viewer import (
+             generate_azure_ops_dashboard,
+         )
+
+         generate_azure_ops_dashboard(serve_dir / "azure_ops.html")
+         print(" Generated Azure ops dashboard")
+     except Exception as e:
+         print(f" Warning: Could not generate Azure ops dashboard: {e}")
+
      start_page = "dashboard.html"

+     # Override start page if specified
+     if hasattr(args, "start_page") and args.start_page:
+         start_page = args.start_page
+
      # Serve from the specified directory
      os.chdir(serve_dir)

@@ -452,33 +525,41 @@ def cmd_serve(args: argparse.Namespace) -> int:
                  super().log_message(format, *log_args)

          def do_POST(self):
-             if self.path == '/api/stop':
+             if self.path == "/api/stop":
                  # Create stop signal file
                  stop_file = serve_dir / "STOP_TRAINING"
                  stop_file.touch()
                  self.send_response(200)
-                 self.send_header('Content-Type', 'application/json')
-                 self.send_header('Access-Control-Allow-Origin', '*')
+                 self.send_header("Content-Type", "application/json")
+                 self.send_header("Access-Control-Allow-Origin", "*")
                  self.end_headers()
                  self.wfile.write(b'{"status": "stop_signal_created"}')
                  print(f"\n⏹ Stop signal created: {stop_file}")
-             elif self.path == '/api/run-benchmark':
+             elif self.path == "/api/run-benchmark":
                  # Parse request body for provider
-                 content_length = int(self.headers.get('Content-Length', 0))
-                 body = self.rfile.read(content_length).decode('utf-8') if content_length else '{}'
+                 content_length = int(self.headers.get("Content-Length", 0))
+                 body = (
+                     self.rfile.read(content_length).decode("utf-8")
+                     if content_length
+                     else "{}"
+                 )
                  try:
                      params = json.loads(body)
                  except json.JSONDecodeError:
                      params = {}

-                 provider = params.get('provider', 'anthropic')
-                 tasks = params.get('tasks', 5)
+                 provider = params.get("provider", "anthropic")
+                 tasks = params.get("tasks", 5)

                  self.send_response(200)
-                 self.send_header('Content-Type', 'application/json')
-                 self.send_header('Access-Control-Allow-Origin', '*')
+                 self.send_header("Content-Type", "application/json")
+                 self.send_header("Access-Control-Allow-Origin", "*")
                  self.end_headers()
-                 self.wfile.write(json.dumps({"status": "started", "provider": provider, "tasks": tasks}).encode())
+                 self.wfile.write(
+                     json.dumps(
+                         {"status": "started", "provider": provider, "tasks": tasks}
+                     ).encode()
+                 )

                  # Run benchmark in background thread with progress logging
                  def run_benchmark():
@@ -492,25 +573,45 @@ def cmd_serve(args: argparse.Namespace) -> int:
                      # Create progress log file (in cwd which is serve_dir)
                      progress_file = Path("benchmark_progress.json")

-                     print(f"\n🚀 Starting {provider} benchmark evaluation ({tasks} tasks)...")
+                     print(
+                         f"\n🚀 Starting {provider} benchmark evaluation ({tasks} tasks)..."
+                     )

                      # Write initial progress
-                     progress_file.write_text(json.dumps({
-                         "status": "running",
-                         "provider": provider,
-                         "tasks_total": tasks,
-                         "tasks_complete": 0,
-                         "message": f"Starting {provider} evaluation..."
-                     }))
+                     progress_file.write_text(
+                         json.dumps(
+                             {
+                                 "status": "running",
+                                 "provider": provider,
+                                 "tasks_total": tasks,
+                                 "tasks_complete": 0,
+                                 "message": f"Starting {provider} evaluation...",
+                             }
+                         )
+                     )

                      # Copy environment with loaded vars
                      env = os.environ.copy()

                      result = subprocess.run(
-                         ["uv", "run", "python", "-m", "openadapt_ml.benchmarks.cli", "run-api",
-                          "--provider", provider, "--tasks", str(tasks),
-                          "--model-id", f"{provider}-api"],
-                         capture_output=True, text=True, cwd=str(project_root), env=env
+                         [
+                             "uv",
+                             "run",
+                             "python",
+                             "-m",
+                             "openadapt_ml.benchmarks.cli",
+                             "run-api",
+                             "--provider",
+                             provider,
+                             "--tasks",
+                             str(tasks),
+                             "--model-id",
+                             f"{provider}-api",
+                         ],
+                         capture_output=True,
+                         text=True,
+                         cwd=str(project_root),
+                         env=env,
                      )

                      print(f"\n📋 Benchmark output:\n{result.stdout}")
@@ -518,53 +619,2995 @@ def cmd_serve(args: argparse.Namespace) -> int:
                          print(f"Stderr: {result.stderr}")

                      if result.returncode == 0:
-                         print(f"✅ Benchmark complete. Regenerating viewer...")
-                         progress_file.write_text(json.dumps({
-                             "status": "complete",
-                             "provider": provider,
-                             "message": "Evaluation complete! Refreshing results..."
-                         }))
+                         print("✅ Benchmark complete. Regenerating viewer...")
+                         progress_file.write_text(
+                             json.dumps(
+                                 {
+                                     "status": "complete",
+                                     "provider": provider,
+                                     "message": "Evaluation complete! Refreshing results...",
+                                 }
+                             )
+                         )
                          # Regenerate benchmark viewer
                          _regenerate_benchmark_viewer_if_available(serve_dir)
                      else:
                          print(f"❌ Benchmark failed: {result.stderr}")
-                         progress_file.write_text(json.dumps({
-                             "status": "error",
-                             "provider": provider,
-                             "message": f"Evaluation failed: {result.stderr[:200]}"
-                         }))
+                         progress_file.write_text(
+                             json.dumps(
+                                 {
+                                     "status": "error",
+                                     "provider": provider,
+                                     "message": f"Evaluation failed: {result.stderr[:200]}",
+                                 }
+                             )
+                         )

                  threading.Thread(target=run_benchmark, daemon=True).start()
+             elif self.path == "/api/vms/register":
+                 # Register a new VM
+                 content_length = int(self.headers.get("Content-Length", 0))
+                 body = (
+                     self.rfile.read(content_length).decode("utf-8")
+                     if content_length
+                     else "{}"
+                 )
+                 try:
+                     vm_data = json.loads(body)
+                     result = self._register_vm(vm_data)
+                     self.send_response(200)
+                     self.send_header("Content-Type", "application/json")
+                     self.send_header("Access-Control-Allow-Origin", "*")
+                     self.end_headers()
+                     self.wfile.write(json.dumps(result).encode())
+                 except Exception as e:
+                     self.send_response(500)
+                     self.send_header("Content-Type", "application/json")
+                     self.send_header("Access-Control-Allow-Origin", "*")
+                     self.end_headers()
+                     self.wfile.write(json.dumps({"error": str(e)}).encode())
+             elif self.path == "/api/benchmark/start":
+                 # Start a benchmark run with configurable parameters
+                 content_length = int(self.headers.get("Content-Length", 0))
+                 body = (
+                     self.rfile.read(content_length).decode("utf-8")
+                     if content_length
+                     else "{}"
+                 )
+                 try:
+                     params = json.loads(body)
+                     result = self._start_benchmark_run(params)
+                     self.send_response(200)
+                     self.send_header("Content-Type", "application/json")
+                     self.send_header("Access-Control-Allow-Origin", "*")
+                     self.end_headers()
+                     self.wfile.write(json.dumps(result).encode())
+                 except Exception as e:
+                     self.send_response(500)
+                     self.send_header("Content-Type", "application/json")
+                     self.send_header("Access-Control-Allow-Origin", "*")
+                     self.end_headers()
+                     self.wfile.write(json.dumps({"error": str(e)}).encode())
              else:
                  self.send_error(404, "Not found")

          def do_GET(self):
-             if self.path.startswith('/api/benchmark-progress'):
+             if self.path.startswith("/api/benchmark-progress"):
                  # Return benchmark progress
-                 progress_file = Path("benchmark_progress.json") # Relative to serve_dir (cwd)
+                 progress_file = Path(
+                     "benchmark_progress.json"
+                 )  # Relative to serve_dir (cwd)
                  if progress_file.exists():
                      progress = progress_file.read_text()
                  else:
                      progress = json.dumps({"status": "idle"})

                  self.send_response(200)
-                 self.send_header('Content-Type', 'application/json')
-                 self.send_header('Access-Control-Allow-Origin', '*')
+                 self.send_header("Content-Type", "application/json")
+                 self.send_header("Access-Control-Allow-Origin", "*")
                  self.end_headers()
                  self.wfile.write(progress.encode())
+             elif self.path.startswith("/api/benchmark-live"):
+                 # Return live evaluation state
+                 live_file = Path("benchmark_live.json")  # Relative to serve_dir (cwd)
+                 if live_file.exists():
+                     live_state = live_file.read_text()
+                 else:
+                     live_state = json.dumps({"status": "idle"})
+
+                 self.send_response(200)
+                 self.send_header("Content-Type", "application/json")
+                 self.send_header("Access-Control-Allow-Origin", "*")
+                 self.end_headers()
+                 self.wfile.write(live_state.encode())
+             elif self.path.startswith("/api/tasks"):
+                 # Return background task status (VM, Docker, benchmarks)
+                 try:
+                     tasks = self._fetch_background_tasks()
+                     self.send_response(200)
+                     self.send_header("Content-Type", "application/json")
+                     self.send_header("Access-Control-Allow-Origin", "*")
+                     self.end_headers()
+                     self.wfile.write(json.dumps(tasks).encode())
+                 except Exception as e:
+                     self.send_response(500)
+                     self.send_header("Content-Type", "application/json")
+                     self.send_header("Access-Control-Allow-Origin", "*")
+                     self.end_headers()
+                     self.wfile.write(json.dumps({"error": str(e)}).encode())
+             elif self.path.startswith("/api/azure-jobs"):
+                 # Return LIVE Azure job status from Azure ML
+                 # Supports ?force=true parameter for manual refresh (always fetches live)
+                 try:
+                     from urllib.parse import urlparse, parse_qs
+
+                     query = parse_qs(urlparse(self.path).query)
+                     force_refresh = query.get("force", ["false"])[0].lower() == "true"
+
+                     # Always fetch live data (force just indicates manual refresh for logging)
+                     if force_refresh:
+                         print("Azure Jobs: Manual refresh requested")
+
+                     jobs = self._fetch_live_azure_jobs()
+                     self.send_response(200)
+                     self.send_header("Content-Type", "application/json")
+                     self.send_header("Access-Control-Allow-Origin", "*")
+                     self.end_headers()
+                     self.wfile.write(json.dumps(jobs).encode())
+                 except Exception as e:
+                     self.send_response(500)
+                     self.send_header("Content-Type", "application/json")
+                     self.send_header("Access-Control-Allow-Origin", "*")
+                     self.end_headers()
+                     self.wfile.write(json.dumps({"error": str(e)}).encode())
+             elif self.path.startswith("/api/benchmark-sse"):
+                 # Server-Sent Events endpoint for real-time benchmark updates
+                 try:
+                     from urllib.parse import urlparse, parse_qs
+
+                     query = parse_qs(urlparse(self.path).query)
+                     interval = int(query.get("interval", [5])[0])
+
+                     # Validate interval (min 1s, max 60s)
+                     interval = max(1, min(60, interval))
+
+                     self._stream_benchmark_updates(interval)
+                 except Exception as e:
+                     self.send_error(500, f"SSE error: {e}")
+             elif self.path.startswith("/api/vms"):
+                 # Return VM registry with live status
+                 try:
+                     vms = self._fetch_vm_registry()
+                     self.send_response(200)
+                     self.send_header("Content-Type", "application/json")
+                     self.send_header("Access-Control-Allow-Origin", "*")
+                     self.end_headers()
+                     self.wfile.write(json.dumps(vms).encode())
+                 except Exception as e:
+                     self.send_response(500)
+                     self.send_header("Content-Type", "application/json")
+                     self.send_header("Access-Control-Allow-Origin", "*")
+                     self.end_headers()
+                     self.wfile.write(json.dumps({"error": str(e)}).encode())
+             elif self.path.startswith("/api/azure-job-logs"):
+                 # Return live logs for running Azure job
+                 try:
+                     # Parse job_id from query string
+                     from urllib.parse import urlparse, parse_qs
+
+                     query = parse_qs(urlparse(self.path).query)
+                     job_id = query.get("job_id", [None])[0]
+
+                     logs = self._fetch_azure_job_logs(job_id)
+                     self.send_response(200)
+                     self.send_header("Content-Type", "application/json")
+                     self.send_header("Access-Control-Allow-Origin", "*")
+                     self.end_headers()
+                     self.wfile.write(json.dumps(logs).encode())
+                 except Exception as e:
+                     self.send_response(500)
+                     self.send_header("Content-Type", "application/json")
+                     self.send_header("Access-Control-Allow-Origin", "*")
+                     self.end_headers()
+                     self.wfile.write(json.dumps({"error": str(e)}).encode())
+             elif self.path.startswith("/api/probe-vm"):
+                 # Probe the VM to check if WAA server is responding
+                 try:
+                     result = self._probe_vm()
+                     self.send_response(200)
+                     self.send_header("Content-Type", "application/json")
+                     self.send_header("Access-Control-Allow-Origin", "*")
+                     self.end_headers()
+                     self.wfile.write(json.dumps(result).encode())
+                 except Exception as e:
+                     self.send_response(500)
+                     self.send_header("Content-Type", "application/json")
+                     self.send_header("Access-Control-Allow-Origin", "*")
+                     self.end_headers()
+                     self.wfile.write(
+                         json.dumps({"error": str(e), "responding": False}).encode()
+                     )
+             elif self.path.startswith("/api/tunnels"):
+                 # Return SSH tunnel status
+                 try:
+                     tunnel_mgr = get_tunnel_manager()
+                     status = tunnel_mgr.get_tunnel_status()
+                     result = {
+                         name: {
+                             "active": s.active,
+                             "local_port": s.local_port,
+                             "remote_endpoint": s.remote_endpoint,
+                             "pid": s.pid,
+                             "error": s.error,
+                         }
+                         for name, s in status.items()
+                     }
+                     self.send_response(200)
+                     self.send_header("Content-Type", "application/json")
+                     self.send_header("Access-Control-Allow-Origin", "*")
+                     self.end_headers()
+                     self.wfile.write(json.dumps(result).encode())
+                 except Exception as e:
+                     self.send_response(500)
+                     self.send_header("Content-Type", "application/json")
+                     self.send_header("Access-Control-Allow-Origin", "*")
+                     self.end_headers()
+                     self.wfile.write(json.dumps({"error": str(e)}).encode())
+             elif self.path.startswith("/api/current-run"):
+                 # Return currently running benchmark info
+                 try:
+                     result = self._get_current_run()
+                     self.send_response(200)
+                     self.send_header("Content-Type", "application/json")
+                     self.send_header("Access-Control-Allow-Origin", "*")
+                     self.end_headers()
+                     self.wfile.write(json.dumps(result).encode())
+                 except Exception as e:
+                     self.send_response(500)
+                     self.send_header("Content-Type", "application/json")
+                     self.send_header("Access-Control-Allow-Origin", "*")
+                     self.end_headers()
+                     self.wfile.write(
+                         json.dumps({"error": str(e), "running": False}).encode()
+                     )
+             elif self.path.startswith("/api/background-tasks"):
+                 # Alias for /api/tasks - background task status
+                 try:
+                     tasks = self._fetch_background_tasks()
+                     self.send_response(200)
+                     self.send_header("Content-Type", "application/json")
+                     self.send_header("Access-Control-Allow-Origin", "*")
+                     self.end_headers()
+                     self.wfile.write(json.dumps(tasks).encode())
+                 except Exception as e:
+                     self.send_response(500)
+                     self.send_header("Content-Type", "application/json")
+                     self.send_header("Access-Control-Allow-Origin", "*")
+                     self.end_headers()
+                     self.wfile.write(json.dumps({"error": str(e)}).encode())
+             elif self.path.startswith("/api/benchmark/status"):
+                 # Return current benchmark job status with ETA
+                 try:
+                     status = self._get_benchmark_status()
+                     self.send_response(200)
+                     self.send_header("Content-Type", "application/json")
+                     self.send_header("Access-Control-Allow-Origin", "*")
+                     self.end_headers()
+                     self.wfile.write(json.dumps(status).encode())
+                 except Exception as e:
+                     self.send_response(500)
+                     self.send_header("Content-Type", "application/json")
+                     self.send_header("Access-Control-Allow-Origin", "*")
+                     self.end_headers()
+                     self.wfile.write(
+                         json.dumps({"error": str(e), "status": "error"}).encode()
+                     )
+             elif self.path.startswith("/api/benchmark/costs"):
+                 # Return cost breakdown (Azure VM, API calls, GPU)
+                 try:
+                     costs = self._get_benchmark_costs()
+                     self.send_response(200)
+                     self.send_header("Content-Type", "application/json")
+                     self.send_header("Access-Control-Allow-Origin", "*")
+                     self.end_headers()
+                     self.wfile.write(json.dumps(costs).encode())
+                 except Exception as e:
+                     self.send_response(500)
+                     self.send_header("Content-Type", "application/json")
+                     self.send_header("Access-Control-Allow-Origin", "*")
+                     self.end_headers()
+                     self.wfile.write(json.dumps({"error": str(e)}).encode())
+             elif self.path.startswith("/api/benchmark/metrics"):
+                 # Return performance metrics (success rate, domain breakdown)
+                 try:
+                     metrics = self._get_benchmark_metrics()
+                     self.send_response(200)
+                     self.send_header("Content-Type", "application/json")
+                     self.send_header("Access-Control-Allow-Origin", "*")
+                     self.end_headers()
+                     self.wfile.write(json.dumps(metrics).encode())
+                 except Exception as e:
+                     self.send_response(500)
+                     self.send_header("Content-Type", "application/json")
+                     self.send_header("Access-Control-Allow-Origin", "*")
+                     self.end_headers()
+                     self.wfile.write(json.dumps({"error": str(e)}).encode())
+             elif self.path.startswith("/api/benchmark/workers"):
+                 # Return worker status and utilization
+                 try:
+                     workers = self._get_benchmark_workers()
+                     self.send_response(200)
+                     self.send_header("Content-Type", "application/json")
+                     self.send_header("Access-Control-Allow-Origin", "*")
+                     self.end_headers()
+                     self.wfile.write(json.dumps(workers).encode())
+                 except Exception as e:
+                     self.send_response(500)
+                     self.send_header("Content-Type", "application/json")
+                     self.send_header("Access-Control-Allow-Origin", "*")
+                     self.end_headers()
+                     self.wfile.write(json.dumps({"error": str(e)}).encode())
+             elif self.path.startswith("/api/benchmark/runs"):
+                 # Return list of all benchmark runs
+                 try:
+                     runs = self._get_benchmark_runs()
+                     self.send_response(200)
+                     self.send_header("Content-Type", "application/json")
+                     self.send_header("Access-Control-Allow-Origin", "*")
+                     self.end_headers()
+                     self.wfile.write(json.dumps(runs).encode())
+                 except Exception as e:
+                     self.send_response(500)
+                     self.send_header("Content-Type", "application/json")
+                     self.send_header("Access-Control-Allow-Origin", "*")
+                     self.end_headers()
+                     self.wfile.write(json.dumps({"error": str(e)}).encode())
+             elif self.path.startswith("/api/benchmark/tasks/"):
+                 # Return task execution details
+                 # URL format: /api/benchmark/tasks/{run_name}/{task_id}
+                 try:
+                     parts = self.path.split("/")
+                     if len(parts) >= 6:
+                         run_name = parts[4]
+                         task_id = parts[5]
+                         execution = self._get_task_execution(run_name, task_id)
+                         self.send_response(200)
+                         self.send_header("Content-Type", "application/json")
+                         self.send_header("Access-Control-Allow-Origin", "*")
+                         self.end_headers()
+                         self.wfile.write(json.dumps(execution).encode())
+                     else:
+                         self.send_error(400, "Invalid path format")
+                 except Exception as e:
+                     self.send_response(500)
+                     self.send_header("Content-Type", "application/json")
+                     self.send_header("Access-Control-Allow-Origin", "*")
+                     self.end_headers()
+                     self.wfile.write(json.dumps({"error": str(e)}).encode())
+             elif self.path.startswith("/api/benchmark/screenshots/"):
+                 # Serve screenshot files
+                 # URL format: /api/benchmark/screenshots/{run_name}/{task_id}/screenshots/{filename}
+                 try:
+                     # Remove /api/benchmark/screenshots/ prefix
+                     path_parts = self.path.replace(
+                         "/api/benchmark/screenshots/", ""
+                     ).split("/")
+                     if len(path_parts) >= 4:
+                         run_name = path_parts[0]
+                         task_id = path_parts[1]
+                         # path_parts[2] should be 'screenshots'
+                         filename = path_parts[3]
+
+                         results_dir = Path("benchmark_results")
+                         screenshot_path = (
+                             results_dir
+                             / run_name
+                             / "tasks"
+                             / task_id
+                             / "screenshots"
+                             / filename
+                         )
+
+                         if screenshot_path.exists():
+                             self.send_response(200)
+                             self.send_header("Content-Type", "image/png")
+                             self.send_header("Access-Control-Allow-Origin", "*")
+                             self.end_headers()
+                             with open(screenshot_path, "rb") as f:
+                                 self.wfile.write(f.read())
+                         else:
+                             self.send_error(
+                                 404, f"Screenshot not found: {screenshot_path}"
+                             )
+                     else:
+                         self.send_error(400, "Invalid screenshot path format")
+                 except Exception as e:
+                     self.send_error(500, f"Error serving screenshot: {e}")
+             elif self.path.startswith("/api/azure-ops-sse"):
+                 # Server-Sent Events endpoint for Azure operations status
+                 try:
+                     self._stream_azure_ops_updates()
+                 except Exception as e:
+                     self.send_error(500, f"SSE error: {e}")
+             elif self.path.startswith("/api/azure-ops-status"):
+                 # Return Azure operations status from JSON file
+                 # Session tracker provides elapsed_seconds and cost_usd for
+                 # persistence across page refreshes
+                 try:
+                     from openadapt_ml.benchmarks.azure_ops_tracker import read_status
+                     from openadapt_ml.benchmarks.session_tracker import (
+                         get_session,
+                         update_session_vm_state,
+                     )
+
+                     # Get operation status (current task)
+                     status = read_status()
+
+                     # Get session data (persistent across refreshes)
+                     session = get_session()
+
+                     # Update session based on VM state if we have VM info
+                     # IMPORTANT: Only pass vm_ip if it's truthy to avoid
+                     # overwriting session's stable vm_ip with None
+                     if status.get("vm_state") and status.get("vm_state") != "unknown":
+                         status_vm_ip = status.get("vm_ip")
+                         # Build update kwargs - only include vm_ip if present
+                         update_kwargs = {
+                             "vm_state": status["vm_state"],
+                             "vm_size": status.get("vm_size"),
+                         }
+                         if status_vm_ip:  # Only include if truthy
+                             update_kwargs["vm_ip"] = status_vm_ip
+                         session = update_session_vm_state(**update_kwargs)
+
+                     # Use session's vm_ip as authoritative source
+                     # This prevents IP flickering when status file has stale/None values
+                     if session.get("vm_ip"):
+                         status["vm_ip"] = session["vm_ip"]
+
+                     # Use session's elapsed_seconds and cost_usd for persistence
+                     # These survive page refreshes and track total VM runtime
+                     if (
+                         session.get("is_active")
+                         or session.get("accumulated_seconds", 0) > 0
+                     ):
+                         status["elapsed_seconds"] = session.get("elapsed_seconds", 0.0)
+                         status["cost_usd"] = session.get("cost_usd", 0.0)
+                         status["started_at"] = session.get("started_at")
+                         # Include session metadata for debugging
+                         status["session_id"] = session.get("session_id")
+                         status["session_is_active"] = session.get("is_active", False)
+                         # Include accumulated time from previous sessions for hybrid display
+                         status["accumulated_seconds"] = session.get(
+                             "accumulated_seconds", 0.0
+                         )
+                         # Calculate current session time (total - accumulated)
+                         current_session_seconds = max(
+                             0, status["elapsed_seconds"] - status["accumulated_seconds"]
+                         )
+                         status["current_session_seconds"] = current_session_seconds
+                         status["current_session_cost_usd"] = (
+                             current_session_seconds / 3600
+                         ) * session.get("hourly_rate_usd", 0.422)
+
+                     try:
+                         tunnel_mgr = get_tunnel_manager()
+                         tunnel_status = tunnel_mgr.get_tunnel_status()
+                         status["tunnels"] = {
+                             name: {
+                                 "active": s.active,
+                                 "local_port": s.local_port,
+                                 "remote_endpoint": s.remote_endpoint,
+                                 "pid": s.pid,
+                                 "error": s.error,
+                             }
+                             for name, s in tunnel_status.items()
+                         }
+                     except Exception as e:
+                         status["tunnels"] = {"error": str(e)}
+
+                     self.send_response(200)
+                     self.send_header("Content-Type", "application/json")
+                     self.send_header("Access-Control-Allow-Origin", "*")
+                     self.end_headers()
+                     self.wfile.write(json.dumps(status).encode())
+                 except Exception as e:
+                     self.send_response(500)
+                     self.send_header("Content-Type", "application/json")
+                     self.send_header("Access-Control-Allow-Origin", "*")
+                     self.end_headers()
+                     self.wfile.write(json.dumps({"error": str(e)}).encode())
+             elif self.path.startswith("/api/vm-diagnostics"):
+                 # Return VM diagnostics: disk usage, Docker stats, memory usage
+                 try:
+                     diagnostics = self._get_vm_diagnostics()
+                     self.send_response(200)
+                     self.send_header("Content-Type", "application/json")
+                     self.send_header("Access-Control-Allow-Origin", "*")
+                     self.end_headers()
+                     self.wfile.write(json.dumps(diagnostics).encode())
+                 except Exception as e:
+                     self.send_response(500)
+                     self.send_header("Content-Type", "application/json")
+                     self.send_header("Access-Control-Allow-Origin", "*")
+                     self.end_headers()
+                     self.wfile.write(json.dumps({"error": str(e)}).encode())
              else:
                  # Default file serving
                  super().do_GET()

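Note: a minimal client-side sketch of the JSON endpoints defined above, using only the standard library. The endpoint paths and payload keys come from the handler code; the port is an assumption (whatever `serve` was started with):

    import json
    from urllib.request import Request, urlopen

    base = "http://localhost:8000"  # assumed serve port

    # POST /api/run-benchmark kicks off an API-provider evaluation.
    req = Request(
        f"{base}/api/run-benchmark",
        data=json.dumps({"provider": "anthropic", "tasks": 5}).encode(),
        headers={"Content-Type": "application/json"},
        method="POST",
    )
    print(json.load(urlopen(req)))  # {"status": "started", ...}

    # GET /api/benchmark-progress polls the progress file written by the run.
    print(json.load(urlopen(f"{base}/api/benchmark-progress")))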
+         def _fetch_live_azure_jobs(self):
+             """Fetch live job status from Azure ML."""
+             import subprocess
+
+             result = subprocess.run(
+                 [
+                     "az",
+                     "ml",
+                     "job",
+                     "list",
+                     "--resource-group",
+                     "openadapt-agents",
+                     "--workspace-name",
+                     "openadapt-ml",
+                     "--query",
+                     "[].{name:name,display_name:display_name,status:status,creation_context:creation_context.created_at}",
+                     "-o",
+                     "json",
+                 ],
+                 capture_output=True,
+                 text=True,
+                 timeout=30,
+             )
+             if result.returncode != 0:
+                 raise Exception(f"Azure CLI error: {result.stderr}")
+
+             jobs = json.loads(result.stdout)
+             # Format for frontend
+             experiment_id = "ad29082c-0607-4fda-8cc7-38944eb5a518"
+             wsid = "/subscriptions/78add6c6-c92a-4a53-b751-eb644ac77e59/resourceGroups/openadapt-agents/providers/Microsoft.MachineLearningServices/workspaces/openadapt-ml"
+
+             formatted = []
+             for job in jobs[:10]:  # Limit to 10 most recent
+                 formatted.append(
+                     {
+                         "job_id": job.get("name", "unknown"),
+                         "display_name": job.get("display_name", ""),
+                         "status": job.get("status", "unknown").lower(),
+                         "started_at": job.get("creation_context", ""),
+                         "azure_dashboard_url": f"https://ml.azure.com/experiments/id/{experiment_id}/runs/{job.get('name', '')}?wsid={wsid}",
+                         "is_live": True,  # Flag to indicate this is live data
+                     }
+                 )
+             return formatted
+
+         def _fetch_azure_job_logs(self, job_id: str | None):
+             """Fetch logs for an Azure ML job (streaming for running jobs)."""
+             import subprocess
+
+             if not job_id:
+                 # Get the most recent running job
+                 jobs = self._fetch_live_azure_jobs()
+                 running = [j for j in jobs if j["status"] == "running"]
+                 if running:
+                     job_id = running[0]["job_id"]
+                 else:
+                     return {
+                         "logs": "No running jobs found",
+                         "job_id": None,
+                         "status": "idle",
+                     }
+
+             # Try to stream logs for running job using az ml job stream
+             try:
+                 result = subprocess.run(
+                     [
+                         "az",
+                         "ml",
+                         "job",
+                         "stream",
+                         "--name",
+                         job_id,
+                         "--resource-group",
+                         "openadapt-agents",
+                         "--workspace-name",
+                         "openadapt-ml",
+                     ],
+                     capture_output=True,
+                     text=True,
+                     timeout=3,  # Short timeout
+                 )
+                 if result.returncode == 0 and result.stdout.strip():
+                     return {
+                         "logs": result.stdout[-5000:],
+                         "job_id": job_id,
+                         "status": "streaming",
+                     }
+             except subprocess.TimeoutExpired:
+                 pass  # Fall through to job show
+
+             # Get job details instead
+             result = subprocess.run(
+                 [
+                     "az",
+                     "ml",
+                     "job",
+                     "show",
+                     "--name",
+                     job_id,
+                     "--resource-group",
+                     "openadapt-agents",
+                     "--workspace-name",
+                     "openadapt-ml",
+                     "-o",
+                     "json",
+                 ],
+                 capture_output=True,
+                 text=True,
+                 timeout=10,
+             )
+
+             if result.returncode == 0:
+                 job_info = json.loads(result.stdout)
+                 return {
+                     "logs": f"Job {job_id} is {job_info.get('status', 'unknown')}\\n\\nCommand: {job_info.get('command', 'N/A')}",
+                     "job_id": job_id,
+                     "status": job_info.get("status", "unknown").lower(),
+                     "command": job_info.get("command", ""),
+                 }
+
+             return {
+                 "logs": f"Could not fetch logs: {result.stderr}",
+                 "job_id": job_id,
+                 "status": "error",
+             }
+
+         def _get_vm_detailed_metadata(
+             self, vm_ip: str, container_name: str, logs: str, phase: str
+         ) -> dict:
+             """Get detailed VM metadata for the VM Details panel.
+
+             Returns:
+                 dict with disk_usage_gb, memory_usage_mb, setup_script_phase, probe_response, qmp_connected, dependencies
+             """
+             import subprocess
+
+             metadata = {
+                 "disk_usage_gb": None,
+                 "memory_usage_mb": None,
+                 "setup_script_phase": None,
+                 "probe_response": None,
+                 "qmp_connected": False,
+                 "dependencies": [],
+             }
+
+             # 1. Get disk usage from docker stats
+             try:
+                 disk_result = subprocess.run(
+                     [
+                         "ssh",
+                         "-o",
+                         "StrictHostKeyChecking=no",
+                         "-o",
+                         "ConnectTimeout=5",
+                         "-i",
+                         str(Path.home() / ".ssh" / "id_rsa"),
+                         f"azureuser@{vm_ip}",
+                         f"docker exec {container_name} df -h /storage 2>/dev/null | tail -1",
+                     ],
+                     capture_output=True,
+                     text=True,
+                     timeout=10,
+                 )
+                 if disk_result.returncode == 0 and disk_result.stdout.strip():
+                     # Parse: "Filesystem Size Used Avail Use% Mounted on"
+                     # Example: "/dev/sda1 30G 9.2G 20G 31% /storage"
+                     parts = disk_result.stdout.split()
+                     if len(parts) >= 3:
+                         used_str = parts[2]  # e.g., "9.2G"
+                         total_str = parts[1]  # e.g., "30G"
+
+                         # Convert to GB (handle M/G suffixes)
+                         def to_gb(s):
+                             if s.endswith("G"):
+                                 return float(s[:-1])
+                             elif s.endswith("M"):
+                                 return float(s[:-1]) / 1024
+                             elif s.endswith("K"):
+                                 return float(s[:-1]) / (1024 * 1024)
+                             return 0
+
+                         metadata["disk_usage_gb"] = (
+                             f"{to_gb(used_str):.1f} GB / {to_gb(total_str):.0f} GB used"
+                         )
+             except Exception:
+                 pass
+
+             # 2. Get memory usage from docker stats
+             try:
+                 mem_result = subprocess.run(
+                     [
+                         "ssh",
+                         "-o",
+                         "StrictHostKeyChecking=no",
+                         "-o",
+                         "ConnectTimeout=5",
+                         "-i",
+                         str(Path.home() / ".ssh" / "id_rsa"),
+                         f"azureuser@{vm_ip}",
+                         f"docker stats {container_name} --no-stream --format '{{{{.MemUsage}}}}'",
+                     ],
+                     capture_output=True,
+                     text=True,
+                     timeout=10,
+                 )
+                 if mem_result.returncode == 0 and mem_result.stdout.strip():
+                     # Example: "1.5GiB / 4GiB"
+                     metadata["memory_usage_mb"] = mem_result.stdout.strip()
+             except Exception:
+                 pass
+
+             # 3. Parse setup script phase from logs
+             metadata["setup_script_phase"] = self._parse_setup_phase_from_logs(
+                 logs, phase
+             )
+
+             # 4. Check /probe endpoint
+             try:
+                 probe_result = subprocess.run(
+                     [
+                         "ssh",
+                         "-o",
+                         "StrictHostKeyChecking=no",
+                         "-o",
+                         "ConnectTimeout=5",
+                         "-i",
+                         str(Path.home() / ".ssh" / "id_rsa"),
+                         f"azureuser@{vm_ip}",
+                         "curl -s --connect-timeout 2 http://20.20.20.21:5000/probe 2>/dev/null",
+                     ],
+                     capture_output=True,
+                     text=True,
+                     timeout=10,
+                 )
+                 if probe_result.returncode == 0 and probe_result.stdout.strip():
+                     metadata["probe_response"] = probe_result.stdout.strip()
+                 else:
+                     metadata["probe_response"] = "Not responding"
+             except Exception:
+                 metadata["probe_response"] = "Connection failed"
+
+             # 5. Check QMP connection (port 7200)
+             try:
+                 qmp_result = subprocess.run(
+                     [
+                         "ssh",
+                         "-o",
+                         "StrictHostKeyChecking=no",
+                         "-o",
+                         "ConnectTimeout=5",
+                         "-i",
+                         str(Path.home() / ".ssh" / "id_rsa"),
+                         f"azureuser@{vm_ip}",
+                         "nc -z -w2 localhost 7200 2>&1",
+                     ],
+                     capture_output=True,
+                     text=True,
+                     timeout=10,
+                 )
+                 metadata["qmp_connected"] = qmp_result.returncode == 0
+             except Exception:
+                 pass
+
+             # 6. Parse dependencies from logs
+             metadata["dependencies"] = self._parse_dependencies_from_logs(logs, phase)
+
+             return metadata
+
+         def _parse_setup_phase_from_logs(self, logs: str, current_phase: str) -> str:
+             """Parse the current setup script phase from logs.
+
+             Looks for patterns indicating which script is running:
+             - install.bat
+             - setup.ps1
+             - on-logon.ps1
+             """
+             if current_phase == "ready":
+                 return "Setup complete"
+             elif current_phase == "oobe":
+                 # Check for specific script patterns
+                 if "on-logon.ps1" in logs.lower():
+                     return "Running on-logon.ps1"
+                 elif "setup.ps1" in logs.lower():
+                     return "Running setup.ps1"
+                 elif "install.bat" in logs.lower():
+                     return "Running install.bat"
+                 else:
+                     return "Windows installation in progress"
+             elif current_phase == "booting":
+                 return "Booting Windows"
+             elif current_phase in [
+                 "downloading",
+                 "extracting",
+                 "configuring",
+                 "building",
+             ]:
+                 return "Preparing Windows VM"
+             else:
+                 return "Initializing..."
+
+         def _parse_dependencies_from_logs(self, logs: str, phase: str) -> list[dict]:
+             """Parse dependency installation status from logs.
+
+             Returns list of dependencies with their installation status:
+             - Python
+             - Chrome
+             - LibreOffice
+             - VSCode
+             - etc.
+             """
+             dependencies = [
+                 {"name": "Python", "icon": "🐍", "status": "pending"},
+                 {"name": "Chrome", "icon": "🌐", "status": "pending"},
+                 {"name": "LibreOffice", "icon": "📝", "status": "pending"},
+                 {"name": "VSCode", "icon": "💻", "status": "pending"},
+                 {"name": "WAA Server", "icon": "🔧", "status": "pending"},
+             ]
+
+             if phase not in ["oobe", "ready"]:
+                 # Not yet at Windows setup phase
+                 return dependencies
+
+             logs_lower = logs.lower()
+
+             # Check for installation patterns
+             if "python" in logs_lower and (
+                 "installing python" in logs_lower or "python.exe" in logs_lower
+             ):
+                 dependencies[0]["status"] = "installing"
+             elif "python" in logs_lower and "installed" in logs_lower:
+                 dependencies[0]["status"] = "complete"
+
+             if "chrome" in logs_lower and (
+                 "downloading" in logs_lower or "installing" in logs_lower
+             ):
+                 dependencies[1]["status"] = "installing"
+             elif "chrome" in logs_lower and "installed" in logs_lower:
+                 dependencies[1]["status"] = "complete"
+
+             if "libreoffice" in logs_lower and (
+                 "downloading" in logs_lower or "installing" in logs_lower
+             ):
+                 dependencies[2]["status"] = "installing"
+             elif "libreoffice" in logs_lower and "installed" in logs_lower:
+                 dependencies[2]["status"] = "complete"
+
+             if "vscode" in logs_lower or "visual studio code" in logs_lower:
+                 if "installing" in logs_lower:
+                     dependencies[3]["status"] = "installing"
+                 elif "installed" in logs_lower:
+                     dependencies[3]["status"] = "complete"
+
+             if "waa" in logs_lower or "flask" in logs_lower:
+                 if "starting" in logs_lower or "running" in logs_lower:
+                     dependencies[4]["status"] = "installing"
+                 elif phase == "ready":
+                     dependencies[4]["status"] = "complete"
+
+             return dependencies
+
1499
+    def _get_vm_diagnostics(self) -> dict:
+        """Get VM diagnostics: disk usage, Docker stats, memory usage.
+
+        Returns a dictionary with:
+        - vm_online: bool - whether VM is reachable
+        - disk_usage: list of disk partitions with usage stats
+        - docker_stats: list of container stats (CPU, memory)
+        - memory_usage: VM host memory stats
+        - docker_system: Docker system disk usage
+        - docker_images: list of Docker images on the VM
+        - error: str if any error occurred
+        """
+        import subprocess
+
+        from openadapt_ml.benchmarks.session_tracker import get_session
+
+        diagnostics = {
+            "vm_online": False,
+            "disk_usage": [],
+            "docker_stats": [],
+            "memory_usage": {},
+            "docker_system": {},
+            "docker_images": [],
+            "error": None,
+        }
+
+        # Get VM IP from session
+        session = get_session()
+        vm_ip = session.get("vm_ip")
+
+        if not vm_ip:
+            diagnostics["error"] = "VM IP not found in session. VM may not be running."
+            return diagnostics
+
+        # SSH options for Azure VM
+        ssh_opts = [
+            "-o", "StrictHostKeyChecking=no",
+            "-o", "UserKnownHostsFile=/dev/null",
+            "-o", "ConnectTimeout=10",
+            "-o", "ServerAliveInterval=30",
+        ]
+
+        # Test VM connectivity
+        try:
+            test_result = subprocess.run(
+                ["ssh", *ssh_opts, f"azureuser@{vm_ip}", "echo 'online'"],
+                capture_output=True, text=True, timeout=15,
+            )
+            if test_result.returncode != 0:
+                diagnostics["error"] = f"Cannot connect to VM at {vm_ip}"
+                return diagnostics
+            diagnostics["vm_online"] = True
+        except subprocess.TimeoutExpired:
+            diagnostics["error"] = f"Connection to VM at {vm_ip} timed out"
+            return diagnostics
+        except Exception as e:
+            diagnostics["error"] = f"SSH error: {str(e)}"
+            return diagnostics
+
+        # 1. Disk usage (df -h)
+        try:
+            df_result = subprocess.run(
+                ["ssh", *ssh_opts, f"azureuser@{vm_ip}",
+                 "df -h / /mnt 2>/dev/null | tail -n +2"],
+                capture_output=True, text=True, timeout=15,
+            )
+            if df_result.returncode == 0 and df_result.stdout.strip():
+                for line in df_result.stdout.strip().split("\n"):
+                    parts = line.split()
+                    if len(parts) >= 6:
+                        diagnostics["disk_usage"].append(
+                            {
+                                "filesystem": parts[0],
+                                "size": parts[1],
+                                "used": parts[2],
+                                "available": parts[3],
+                                "use_percent": parts[4],
+                                "mount_point": parts[5],
+                            }
+                        )
+        except Exception as e:
+            diagnostics["disk_usage"] = [{"error": str(e)}]
+
+        # 2. Docker container stats
+        try:
+            stats_result = subprocess.run(
+                ["ssh", *ssh_opts, f"azureuser@{vm_ip}",
+                 "docker stats --no-stream --format '{{.Name}}|{{.CPUPerc}}|{{.MemUsage}}|{{.MemPerc}}|{{.NetIO}}|{{.BlockIO}}' 2>/dev/null || echo ''"],
+                capture_output=True, text=True, timeout=30,
+            )
+            if stats_result.returncode == 0 and stats_result.stdout.strip():
+                for line in stats_result.stdout.strip().split("\n"):
+                    if "|" in line:
+                        parts = line.split("|")
+                        if len(parts) >= 6:
+                            diagnostics["docker_stats"].append(
+                                {
+                                    "container": parts[0],
+                                    "cpu_percent": parts[1],
+                                    "memory_usage": parts[2],
+                                    "memory_percent": parts[3],
+                                    "net_io": parts[4],
+                                    "block_io": parts[5],
+                                }
+                            )
+        except Exception as e:
+            diagnostics["docker_stats"] = [{"error": str(e)}]
+
+        # 3. VM host memory usage (free -h)
+        try:
+            mem_result = subprocess.run(
+                ["ssh", *ssh_opts, f"azureuser@{vm_ip}", "free -h | head -2 | tail -1"],
+                capture_output=True, text=True, timeout=15,
+            )
+            if mem_result.returncode == 0 and mem_result.stdout.strip():
+                parts = mem_result.stdout.strip().split()
+                if len(parts) >= 7:
+                    diagnostics["memory_usage"] = {
+                        "total": parts[1],
+                        "used": parts[2],
+                        "free": parts[3],
+                        "shared": parts[4],
+                        "buff_cache": parts[5],
+                        "available": parts[6],
+                    }
+        except Exception as e:
+            diagnostics["memory_usage"] = {"error": str(e)}
+
+        # 4. Docker system disk usage
+        try:
+            docker_df_result = subprocess.run(
+                ["ssh", *ssh_opts, f"azureuser@{vm_ip}",
+                 "docker system df 2>/dev/null || echo ''"],
+                capture_output=True, text=True, timeout=15,
+            )
+            if docker_df_result.returncode == 0 and docker_df_result.stdout.strip():
+                lines = docker_df_result.stdout.strip().split("\n")
+                # Parse the table: TYPE, TOTAL, ACTIVE, SIZE, RECLAIMABLE
+                for line in lines[1:]:  # Skip header
+                    parts = line.split()
+                    if len(parts) >= 5:
+                        dtype = parts[0]
+                        diagnostics["docker_system"][dtype.lower()] = {
+                            "total": parts[1],
+                            "active": parts[2],
+                            "size": parts[3],
+                            "reclaimable": " ".join(parts[4:]),
+                        }
+        except Exception as e:
+            diagnostics["docker_system"] = {"error": str(e)}
+
+        # 5. Docker images
+        try:
+            images_result = subprocess.run(
+                ["ssh", *ssh_opts, f"azureuser@{vm_ip}",
+                 "docker images --format '{{.Repository}}:{{.Tag}}|{{.Size}}|{{.CreatedSince}}' 2>/dev/null || echo ''"],
+                capture_output=True, text=True, timeout=15,
+            )
+            if images_result.returncode == 0 and images_result.stdout.strip():
+                for line in images_result.stdout.strip().split("\n"):
+                    if "|" in line:
+                        parts = line.split("|")
+                        if len(parts) >= 3:
+                            diagnostics["docker_images"].append(
+                                {
+                                    "image": parts[0],
+                                    "size": parts[1],
+                                    "created": parts[2],
+                                }
+                            )
+        except Exception as e:
+            diagnostics["docker_images"] = [{"error": str(e)}]
+
+        return diagnostics
+
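A quick sketch of how the `df -h` parsing above maps a raw output line onto a diagnostics entry (the sample line is illustrative):

line = "/dev/sda1  124G  38G  86G  31% /"
parts = line.split()
entry = {
    "filesystem": parts[0],   # /dev/sda1
    "size": parts[1],         # 124G
    "used": parts[2],         # 38G
    "available": parts[3],    # 86G
    "use_percent": parts[4],  # 31%
    "mount_point": parts[5],  # /
}
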
+    def _fetch_background_tasks(self):
+        """Fetch status of all background tasks: Azure VM, Docker containers, benchmarks."""
+        import subprocess
+
+        tasks = []
+
+        # Check for VM IP from environment (set by CLI when auto-launching viewer)
+        env_vm_ip = os.environ.get("WAA_VM_IP")
+        env_internal_ip = os.environ.get("WAA_INTERNAL_IP", "172.30.0.2")
+
+        # 1. Check Azure WAA VM status
+        vm_ip = None
+        if env_vm_ip:
+            # Use environment variable - VM IP was provided directly
+            vm_ip = env_vm_ip
+            tasks.append(
+                {
+                    "task_id": "azure-vm-waa",
+                    "task_type": "vm_provision",
+                    "status": "completed",
+                    "phase": "ready",  # Match status to prevent "Starting" + "completed" conflict
+                    "title": "Azure VM Host",
+                    "description": f"Linux host running at {vm_ip}",
+                    "progress_percent": 100.0,
+                    "elapsed_seconds": 0,
+                    "metadata": {
+                        "vm_name": "waa-eval-vm",
+                        "ip_address": vm_ip,
+                        "internal_ip": env_internal_ip,
+                    },
+                }
+            )
+        else:
+            # Query Azure CLI for VM status
+            try:
+                result = subprocess.run(
+                    ["az", "vm", "get-instance-view",
+                     "--name", "waa-eval-vm",
+                     "--resource-group", "openadapt-agents",
+                     "--query", "instanceView.statuses",
+                     "-o", "json"],
+                    capture_output=True, text=True, timeout=10,
+                )
+                if result.returncode == 0:
+                    statuses = json.loads(result.stdout)
+                    power_state = "unknown"
+                    for s in statuses:
+                        if s.get("code", "").startswith("PowerState/"):
+                            power_state = s["code"].replace("PowerState/", "")
+
+                    # Get VM IP
+                    ip_result = subprocess.run(
+                        ["az", "vm", "list-ip-addresses",
+                         "--name", "waa-eval-vm",
+                         "--resource-group", "openadapt-agents",
+                         "--query", "[0].virtualMachine.network.publicIpAddresses[0].ipAddress",
+                         "-o", "tsv"],
+                        capture_output=True, text=True, timeout=10,
+                    )
+                    vm_ip = ip_result.stdout.strip() if ip_result.returncode == 0 else None
+
+                    if power_state == "running":
+                        tasks.append(
+                            {
+                                "task_id": "azure-vm-waa",
+                                "task_type": "vm_provision",
+                                "status": "completed",
+                                "phase": "ready",  # Match status to prevent "Starting" + "completed" conflict
+                                "title": "Azure VM Host",
+                                "description": f"Linux host running at {vm_ip}" if vm_ip else "Linux host running",
+                                "progress_percent": 100.0,
+                                "elapsed_seconds": 0,
+                                "metadata": {
+                                    "vm_name": "waa-eval-vm",
+                                    "ip_address": vm_ip,
+                                    # No VNC link - that's for the Windows container
+                                },
+                            }
+                        )
+            except subprocess.TimeoutExpired:
+                pass
+            except Exception:
+                pass
+
+        # 2. Check Docker container status on VM (if we have an IP)
+        if vm_ip:
+            try:
+                docker_result = subprocess.run(
+                    ["ssh",
+                     "-o", "StrictHostKeyChecking=no",
+                     "-o", "ConnectTimeout=5",
+                     "-i", str(Path.home() / ".ssh" / "id_rsa"),
+                     f"azureuser@{vm_ip}",
+                     "docker ps --format '{{.Names}}|{{.Status}}|{{.Image}}'"],
+                    capture_output=True, text=True, timeout=15,
+                )
+                if docker_result.returncode == 0 and docker_result.stdout.strip():
+                    for line in docker_result.stdout.strip().split("\n"):
+                        parts = line.split("|")
+                        if len(parts) >= 3:
+                            container_name, status, image = parts[0], parts[1], parts[2]
+                            # Parse "Up X minutes" to determine if healthy
+
+                            # Check for Windows VM specifically
+                            if "windows" in image.lower() or container_name == "winarena":
+                                # Get detailed progress from docker logs
+                                log_check = subprocess.run(
+                                    ["ssh",
+                                     "-o", "StrictHostKeyChecking=no",
+                                     "-o", "ConnectTimeout=5",
+                                     "-i", str(Path.home() / ".ssh" / "id_rsa"),
+                                     f"azureuser@{vm_ip}",
+                                     f"docker logs {container_name} 2>&1 | tail -30"],
+                                    capture_output=True, text=True, timeout=10,
+                                )
+                                logs = log_check.stdout if log_check.returncode == 0 else ""
+
+                                # Parse progress from logs
+                                phase = "unknown"
+                                progress = 0.0
+                                description = "Starting..."
+
+                                if "Windows started successfully" in logs:
+                                    # Check if WAA server is ready via Docker port forwarding
+                                    # See docs/waa_network_architecture.md - always use localhost
+                                    server_check = subprocess.run(
+                                        ["ssh",
+                                         "-o", "StrictHostKeyChecking=no",
+                                         "-o", "ConnectTimeout=5",
+                                         "-i", str(Path.home() / ".ssh" / "id_rsa"),
+                                         f"azureuser@{vm_ip}",
+                                         "curl -s --connect-timeout 2 http://localhost:5000/probe 2>/dev/null"],
+                                        capture_output=True, text=True, timeout=10,
+                                    )
+                                    waa_ready = (
+                                        server_check.returncode == 0
+                                        and "Service is operational" in server_check.stdout
+                                    )
+                                    if waa_ready:
+                                        phase = "ready"
+                                        progress = 100.0
+                                        description = "WAA Server ready - benchmarks can run"
+                                    else:
+                                        phase = "oobe"
+                                        progress = 80.0  # Phase 5/6 - VM install in progress
+                                        description = "Phase 5/6: Windows installing (check VNC for %). OEM scripts will run after."
+                                elif "Booting Windows" in logs:
+                                    phase = "booting"
+                                    progress = 70.0  # Phase 4/6
+                                    description = "Phase 4/6: Booting Windows from installer..."
+                                elif "Building Windows" in logs or "Creating a" in logs:
+                                    phase = "building"
+                                    progress = 60.0  # Phase 3/6
+                                    description = "Phase 3/6: Building Windows VM disk..."
+                                elif "Adding" in logs and "image" in logs:
+                                    phase = "configuring"
+                                    progress = 50.0  # Phase 2/6
+                                    description = "Phase 2/6: Configuring Windows image with WAA scripts..."
+                                elif "Extracting" in logs:
+                                    phase = "extracting"
+                                    progress = 35.0  # Phase 1/6 (after download)
+                                    description = "Phase 1/6: Extracting Windows ISO..."
+                                else:
+                                    # Check for download progress (e.g., "1234K ........ 45% 80M 30s")
+                                    import re
+
+                                    download_match = re.search(r"(\d+)%\s+[\d.]+[KMG]\s+(\d+)s", logs)
+                                    if download_match:
+                                        phase = "downloading"
+                                        dl_pct = float(download_match.group(1))
+                                        progress = dl_pct * 0.30  # 0-30% for download phase
+                                        eta = download_match.group(2)
+                                        description = f"Phase 0/6: Downloading Windows 11... {download_match.group(1)}% ({eta}s left)"
+
+                                # Improve phase detection - if Windows is booted but WAA not ready,
+                                # it might be at login screen waiting for OEM scripts or running install.bat
+                                if phase == "oobe" and "Boot0004" in logs:
+                                    # Windows finished installing, at login/desktop
+                                    # install.bat should auto-run from FirstLogonCommands (see Dockerfile)
+                                    description = "Phase 5/6: Windows at desktop, OEM scripts running... (WAA server starting)"
+                                    progress = 90.0
+
+                                # Get detailed metadata for VM Details panel
+                                vm_metadata = self._get_vm_detailed_metadata(
+                                    vm_ip, container_name, logs, phase
+                                )
+
+                                tasks.append(
+                                    {
+                                        "task_id": f"docker-{container_name}",
+                                        "task_type": "docker_container",
+                                        "status": "completed" if phase == "ready" else "running",
+                                        "title": "Windows 11 + WAA Server",
+                                        "description": description,
+                                        "progress_percent": progress,
+                                        "elapsed_seconds": 0,
+                                        "phase": phase,
+                                        "metadata": {
+                                            "container": container_name,
+                                            "image": image,
+                                            "status": status,
+                                            "phase": phase,
+                                            "windows_ready": phase in ["oobe", "ready"],
+                                            "waa_server_ready": phase == "ready",
+                                            # Use localhost - SSH tunnel handles routing to VM
+                                            # See docs/waa_network_architecture.md
+                                            "vnc_url": "http://localhost:8006",
+                                            "windows_username": "Docker",
+                                            "windows_password": "admin",
+                                            "recent_logs": logs[-500:] if logs else "",
+                                            # Enhanced VM details
+                                            "disk_usage_gb": vm_metadata["disk_usage_gb"],
+                                            "memory_usage_mb": vm_metadata["memory_usage_mb"],
+                                            "setup_script_phase": vm_metadata["setup_script_phase"],
+                                            "probe_response": vm_metadata["probe_response"],
+                                            "qmp_connected": vm_metadata["qmp_connected"],
+                                            "dependencies": vm_metadata["dependencies"],
+                                        },
+                                    }
+                                )
+            except Exception:
+                # SSH failed, VM might still be starting
+                pass
+
+        # 3. Check local benchmark progress
+        progress_file = Path("benchmark_progress.json")
+        if progress_file.exists():
+            try:
+                progress = json.loads(progress_file.read_text())
+                if progress.get("status") == "running":
+                    tasks.append(
+                        {
+                            "task_id": "benchmark-local",
+                            "task_type": "benchmark_run",
+                            "status": "running",
+                            "title": f"{progress.get('provider', 'API').upper()} Benchmark",
+                            "description": progress.get("message", "Running benchmark..."),
+                            "progress_percent": 100.0 * progress.get("tasks_complete", 0) / max(progress.get("tasks_total", 1), 1),
+                            "elapsed_seconds": 0,
+                            "metadata": progress,
+                        }
+                    )
+            except Exception:
+                pass
+
+        return tasks
+
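The log-driven phase detection above implies a fixed progress schedule; summarized as data (a sketch, with thresholds copied from the branches above):

# Progress checkpoints inferred from the docker logs, as used above.
PHASE_PROGRESS = {
    "downloading": None,   # 0-30%, scaled from the wget percentage in the log
    "extracting": 35.0,    # Phase 1/6
    "configuring": 50.0,   # Phase 2/6
    "building": 60.0,      # Phase 3/6
    "booting": 70.0,       # Phase 4/6
    "oobe": 80.0,          # Phase 5/6 (bumped to 90.0 once "Boot0004" appears)
    "ready": 100.0,        # WAA /probe answered "Service is operational"
}
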
+    def _fetch_vm_registry(self):
+        """Fetch VM registry with live status checks.
+
+        NOTE: We now fetch the VM IP from Azure CLI at runtime to avoid
+        stale IP issues. The registry file is only used as a fallback.
+        """
+        import subprocess
+        from datetime import datetime
+
+        # Try to get VM IP from Azure CLI (always fresh)
+        vm_ip = None
+        resource_group = "openadapt-agents"
+        vm_name = "azure-waa-vm"
+        try:
+            result = subprocess.run(
+                ["az", "vm", "show", "-d",
+                 "-g", resource_group,
+                 "-n", vm_name,
+                 "--query", "publicIps",
+                 "-o", "tsv"],
+                capture_output=True, text=True, timeout=10,
+            )
+            if result.returncode == 0 and result.stdout.strip():
+                vm_ip = result.stdout.strip()
+        except Exception:
+            pass
+
+        # If we have a fresh IP from Azure, use it
+        if vm_ip:
+            vms = [
+                {
+                    "name": vm_name,
+                    "ssh_host": vm_ip,
+                    "ssh_user": "azureuser",
+                    "vnc_port": 8006,
+                    "waa_port": 5000,
+                    "docker_container": "winarena",
+                    "internal_ip": "localhost",
+                }
+            ]
+        else:
+            # Fallback to registry file
+            project_root = Path(__file__).parent.parent.parent
+            registry_file = project_root / "benchmark_results" / "vm_registry.json"
+
+            if not registry_file.exists():
+                return []
+
+            try:
+                with open(registry_file) as f:
+                    vms = json.load(f)
+            except Exception as e:
+                # Keep the return type consistent: callers iterate over a list of VMs
+                print(f"Warning: Failed to read VM registry: {e}")
+                return []
+
+        # Check status for each VM
+        for vm in vms:
+            vm["status"] = "unknown"
+            vm["last_checked"] = datetime.now().isoformat()
+            vm["vnc_reachable"] = False
+            vm["waa_probe_status"] = "unknown"
+
+            # Check VNC (HTTP HEAD request)
+            try:
+                vnc_url = f"http://{vm['ssh_host']}:{vm['vnc_port']}"
+                result = subprocess.run(
+                    ["curl", "-I", "-s", "--connect-timeout", "3", vnc_url],
+                    capture_output=True, text=True, timeout=5,
+                )
+                if result.returncode == 0 and "200" in result.stdout:
+                    vm["vnc_reachable"] = True
+            except Exception:
+                pass
+
+            # Check WAA probe via SSH, against localhost (Docker port forwarding
+            # handles routing). See docs/waa_network_architecture.md for details.
+            try:
+                waa_port = vm.get("waa_port", 5000)
+                ssh_cmd = f"curl -s --connect-timeout 2 http://localhost:{waa_port}/probe 2>/dev/null"
+                result = subprocess.run(
+                    ["ssh",
+                     "-o", "StrictHostKeyChecking=no",
+                     "-o", "ConnectTimeout=3",
+                     "-i", str(Path.home() / ".ssh" / "id_rsa"),
+                     f"{vm['ssh_user']}@{vm['ssh_host']}",
+                     ssh_cmd],
+                    capture_output=True, text=True, timeout=5,
+                )
+                probe_success = (
+                    result.returncode == 0
+                    and "Service is operational" in result.stdout
+                )
+                if probe_success:
+                    vm["waa_probe_status"] = "ready"
+                    vm["status"] = "online"
+                    # Auto-start SSH tunnels for VNC and WAA
+                    try:
+                        tunnel_mgr = get_tunnel_manager()
+                        tunnel_status = tunnel_mgr.ensure_tunnels_for_vm(
+                            vm_ip=vm["ssh_host"],
+                            ssh_user=vm.get("ssh_user", "azureuser"),
+                        )
+                        vm["tunnels"] = {
+                            name: {
+                                "active": s.active,
+                                "local_port": s.local_port,
+                                "error": s.error,
+                            }
+                            for name, s in tunnel_status.items()
+                        }
+                    except Exception as e:
+                        vm["tunnels"] = {"error": str(e)}
+                else:
+                    vm["waa_probe_status"] = "not responding"
+                    vm["status"] = "offline"
+                    # Stop tunnels when VM goes offline
+                    try:
+                        tunnel_mgr = get_tunnel_manager()
+                        tunnel_mgr.stop_all_tunnels()
+                        vm["tunnels"] = {}
+                    except Exception:
+                        pass
+            except Exception:
+                vm["waa_probe_status"] = "ssh failed"
+                vm["status"] = "offline"
+
+        return vms
+
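For reference, a registry entry in benchmark_results/vm_registry.json would look like this; field names are taken from the fallback path above, values are illustrative:

# Example content of benchmark_results/vm_registry.json (values illustrative):
vm_registry = [
    {
        "name": "azure-waa-vm",
        "ssh_host": "20.115.30.7",     # public IP; refreshed from `az vm show -d` when possible
        "ssh_user": "azureuser",
        "vnc_port": 8006,
        "waa_port": 5000,
        "docker_container": "winarena",
        "internal_ip": "localhost",    # WAA is probed via the localhost port-forward
    },
]
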
+    def _probe_vm(self) -> dict:
+        """Probe the Azure VM to check if WAA server is responding.
+
+        Returns:
+            dict with:
+            - responding: bool - whether the WAA server is responding
+            - vm_ip: str - the VM's IP address
+            - container: str - the container name
+            - probe_result: str - the raw probe response or error message
+            - last_checked: str - ISO timestamp
+        """
+        import subprocess
+        from datetime import datetime
+
+        result = {
+            "responding": False,
+            "vm_ip": None,
+            "container": None,
+            "probe_result": None,
+            "last_checked": datetime.now().isoformat(),
+        }
+
+        # First get VM IP
+        try:
+            ip_result = subprocess.run(
+                ["az", "vm", "list-ip-addresses",
+                 "--name", "waa-eval-vm",
+                 "--resource-group", "openadapt-agents",
+                 "--query", "[0].virtualMachine.network.publicIpAddresses[0].ipAddress",
+                 "-o", "tsv"],
+                capture_output=True, text=True, timeout=10,
+            )
+            if ip_result.returncode == 0 and ip_result.stdout.strip():
+                vm_ip = ip_result.stdout.strip()
+                result["vm_ip"] = vm_ip
+
+                # Try to probe WAA server via SSH
+                # Use the correct internal IP for the Windows VM inside Docker
+                probe_result = subprocess.run(
+                    ["ssh",
+                     "-o", "StrictHostKeyChecking=no",
+                     "-o", "ConnectTimeout=5",
+                     "-i", str(Path.home() / ".ssh" / "id_rsa"),
+                     f"azureuser@{vm_ip}",
+                     "docker exec waa-container curl -s --connect-timeout 3 http://172.30.0.2:5000/probe 2>/dev/null || echo 'probe_failed'"],
+                    capture_output=True, text=True, timeout=15,
+                )
+
+                result["container"] = "waa-container"
+
+                if probe_result.returncode == 0:
+                    probe_output = probe_result.stdout.strip()
+                    if probe_output and "probe_failed" not in probe_output:
+                        result["responding"] = True
+                        result["probe_result"] = probe_output
+                    else:
+                        result["probe_result"] = "WAA server not responding"
+                else:
+                    result["probe_result"] = f"SSH/Docker error: {probe_result.stderr[:200]}"
+            else:
+                result["probe_result"] = "Could not get VM IP"
+        except subprocess.TimeoutExpired:
+            result["probe_result"] = "Connection timeout"
+        except Exception as e:
+            result["probe_result"] = f"Error: {str(e)}"
+
+        return result
+
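Equivalently, the probe chain above can be reproduced by hand; a minimal sketch (the VM IP is illustrative, the container name and internal address mirror the code above):

import subprocess

# Manual equivalent of _probe_vm's SSH + docker-exec probe chain.
cmd = [
    "ssh", "-o", "StrictHostKeyChecking=no", "-o", "ConnectTimeout=5",
    "azureuser@20.115.30.7",
    "docker exec waa-container curl -s --connect-timeout 3 http://172.30.0.2:5000/probe",
]
out = subprocess.run(cmd, capture_output=True, text=True, timeout=15)
print("responding:", out.returncode == 0 and bool(out.stdout.strip()))
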
+    def _get_current_run(self) -> dict:
+        """Get info about any currently running benchmark.
+
+        Checks:
+        1. Local benchmark_progress.json for API benchmarks
+        2. Azure VM for WAA benchmarks running via SSH
+
+        Returns:
+            dict with:
+            - running: bool - whether a benchmark is running
+            - type: str - 'local' or 'azure_vm'
+            - model: str - model being evaluated
+            - progress: dict with tasks_completed, total_tasks
+            - current_task: str - current task ID
+            - started_at: str - ISO timestamp
+            - elapsed_minutes: int
+        """
+        import subprocess
+        import re
+
+        result = {
+            "running": False,
+            "type": None,
+            "model": None,
+            "progress": {"tasks_completed": 0, "total_tasks": 0},
+            "current_task": None,
+            "started_at": None,
+            "elapsed_minutes": 0,
+        }
+
+        # Check local benchmark progress first
+        progress_file = Path("benchmark_progress.json")
+        if progress_file.exists():
+            try:
+                progress = json.loads(progress_file.read_text())
+                if progress.get("status") == "running":
+                    result["running"] = True
+                    result["type"] = "local"
+                    result["model"] = progress.get("provider", "unknown")
+                    result["progress"]["tasks_completed"] = progress.get("tasks_complete", 0)
+                    result["progress"]["total_tasks"] = progress.get("tasks_total", 0)
+                    return result
+            except Exception:
+                pass
+
+        # Check Azure VM for running benchmark
+        try:
+            # Get VM IP
+            ip_result = subprocess.run(
+                ["az", "vm", "list-ip-addresses",
+                 "--name", "waa-eval-vm",
+                 "--resource-group", "openadapt-agents",
+                 "--query", "[0].virtualMachine.network.publicIpAddresses[0].ipAddress",
+                 "-o", "tsv"],
+                capture_output=True, text=True, timeout=10,
+            )
+
+            if ip_result.returncode == 0 and ip_result.stdout.strip():
+                vm_ip = ip_result.stdout.strip()
+
+                # Check if benchmark process is running
+                process_check = subprocess.run(
+                    ["ssh",
+                     "-o", "StrictHostKeyChecking=no",
+                     "-o", "ConnectTimeout=5",
+                     "-i", str(Path.home() / ".ssh" / "id_rsa"),
+                     f"azureuser@{vm_ip}",
+                     "docker exec waa-container pgrep -f 'python.*run.py' 2>/dev/null && echo 'RUNNING' || echo 'NOT_RUNNING'"],
+                    capture_output=True, text=True, timeout=10,
+                )
+
+                # Note: "NOT_RUNNING" contains "RUNNING" as a substring, so test
+                # for the negative marker instead of the positive one.
+                if process_check.returncode == 0 and "NOT_RUNNING" not in process_check.stdout:
+                    result["running"] = True
+                    result["type"] = "azure_vm"
+
+                    # Get log file for more details
+                    log_check = subprocess.run(
+                        ["ssh",
+                         "-o", "StrictHostKeyChecking=no",
+                         "-o", "ConnectTimeout=5",
+                         "-i", str(Path.home() / ".ssh" / "id_rsa"),
+                         f"azureuser@{vm_ip}",
+                         "tail -100 /tmp/waa_benchmark.log 2>/dev/null || echo ''"],
+                        capture_output=True, text=True, timeout=10,
+                    )
+
+                    if log_check.returncode == 0 and log_check.stdout.strip():
+                        logs = log_check.stdout
+
+                        # Parse model from logs
+                        model_match = re.search(r"model[=:\s]+([^\s,]+)", logs, re.IGNORECASE)
+                        if model_match:
+                            result["model"] = model_match.group(1)
+
+                        # Parse progress
+                        task_match = re.search(r"Task\s+(\d+)/(\d+)", logs)
+                        if task_match:
+                            result["progress"]["tasks_completed"] = int(task_match.group(1))
+                            result["progress"]["total_tasks"] = int(task_match.group(2))
+
+                        # Parse current task
+                        task_id_match = re.search(
+                            r"(?:Running|Processing|task)[:\s]+([a-f0-9-]+)",
+                            logs,
+                            re.IGNORECASE,
+                        )
+                        if task_id_match:
+                            result["current_task"] = task_id_match.group(1)
+
+        except Exception:
+            pass
+
+        return result
+
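A sketch of the dict this returns while a WAA run is active on the VM (values illustrative):

current_run = {
    "running": True,
    "type": "azure_vm",
    "model": "gpt-4o",
    "progress": {"tasks_completed": 5, "total_tasks": 30},
    "current_task": "a1b2c3d4-0001",
    "started_at": None,      # not populated by the log parser above
    "elapsed_minutes": 0,
}
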
+    def _get_benchmark_status(self) -> dict:
+        """Get current benchmark job status with ETA calculation.
+
+        Returns:
+            dict with job status, progress, ETA, and current task info
+        """
+        import time
+
+        # Check for live evaluation state
+        live_file = Path("benchmark_live.json")
+        if live_file.exists():
+            try:
+                live_state = json.loads(live_file.read_text())
+                if live_state.get("status") == "running":
+                    total_tasks = live_state.get("total_tasks", 0)
+                    completed_tasks = live_state.get("tasks_completed", 0)
+                    current_task = live_state.get("current_task", {})
+
+                    # Calculate ETA based on completed tasks
+                    eta_seconds = None
+                    avg_task_seconds = None
+                    if completed_tasks > 0 and total_tasks > 0:
+                        # Estimate from live state timestamp or use fallback
+                        elapsed = time.time() - live_state.get("start_time", time.time())
+                        avg_task_seconds = elapsed / completed_tasks
+                        remaining_tasks = total_tasks - completed_tasks
+                        eta_seconds = remaining_tasks * avg_task_seconds
+
+                    return {
+                        "status": "running",
+                        "current_job": {
+                            "run_id": live_state.get("run_id", "unknown"),
+                            "model_id": live_state.get("model_id", "unknown"),
+                            "total_tasks": total_tasks,
+                            "completed_tasks": completed_tasks,
+                            "current_task": current_task,
+                            "eta_seconds": eta_seconds,
+                            "avg_task_seconds": avg_task_seconds,
+                        },
+                        "queue": [],  # TODO: implement queue tracking
+                    }
+            except Exception as e:
+                return {"status": "error", "error": str(e)}
+
+        # Fallback to current_run check
+        current_run = self._get_current_run()
+        if current_run.get("running"):
+            return {
+                "status": "running",
+                "current_job": {
+                    "run_id": "unknown",
+                    "model_id": current_run.get("model", "unknown"),
+                    "total_tasks": current_run["progress"]["total_tasks"],
+                    "completed_tasks": current_run["progress"]["tasks_completed"],
+                    "current_task": {"task_id": current_run.get("current_task")},
+                },
+                "queue": [],
+            }
+
+        return {"status": "idle"}
+
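The ETA arithmetic above as a worked example (values illustrative):

start_time = 0.0        # from benchmark_live.json
elapsed = 360.0         # time.time() - start_time
completed_tasks, total_tasks = 12, 30
avg_task_seconds = elapsed / completed_tasks                       # 30.0
eta_seconds = (total_tasks - completed_tasks) * avg_task_seconds   # 540.0
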
+    def _get_benchmark_costs(self) -> dict:
+        """Get cost breakdown for current benchmark run.
+
+        Returns:
+            dict with Azure VM, API calls, and GPU costs
+        """
+        # Check for cost tracking file
+        cost_file = Path("benchmark_costs.json")
+        if cost_file.exists():
+            try:
+                return json.loads(cost_file.read_text())
+            except Exception:
+                pass
+
+        # Return placeholder structure
+        return {
+            "azure_vm": {
+                "instance_type": "Standard_D4ds_v5",
+                "hourly_rate_usd": 0.192,
+                "hours_elapsed": 0.0,
+                "cost_usd": 0.0,
+            },
+            "api_calls": {
+                "anthropic": {"cost_usd": 0.0},
+                "openai": {"cost_usd": 0.0},
+            },
+            "gpu_time": {
+                "lambda_labs": {"cost_usd": 0.0},
+            },
+            "total_cost_usd": 0.0,
+        }
+
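The placeholder structure implies a simple accrual model; a sketch of how cost_usd would be derived for the VM line item (rate from the placeholder above, hours illustrative):

hourly_rate_usd = 0.192   # Standard_D4ds_v5
hours_elapsed = 3.5
cost_usd = round(hourly_rate_usd * hours_elapsed, 4)  # 0.672
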
+    def _get_benchmark_metrics(self) -> dict:
+        """Get performance metrics for current/completed benchmarks.
+
+        Returns:
+            dict with success rate trends, domain breakdown, episode metrics
+        """
+        # Check for metrics file
+        metrics_file = Path("benchmark_metrics.json")
+        if metrics_file.exists():
+            try:
+                return json.loads(metrics_file.read_text())
+            except Exception:
+                pass
+
+        # Load completed runs from benchmark_results/
+        benchmark_results_dir = Path("benchmark_results")
+        if not benchmark_results_dir.exists():
+            return {"error": "No benchmark results found"}
+
+        # Find most recent run
+        runs = sorted(
+            benchmark_results_dir.iterdir(),
+            key=lambda p: p.stat().st_mtime,
+            reverse=True,
+        )
+        if not runs:
+            return {"error": "No benchmark runs found"}
+
+        recent_run = runs[0]
+        summary_path = recent_run / "summary.json"
+        if not summary_path.exists():
+            return {"error": f"No summary.json in {recent_run.name}"}
+
+        try:
+            summary = json.loads(summary_path.read_text())
+
+            # Build domain breakdown from tasks
+            domain_breakdown = {}
+            tasks_dir = recent_run / "tasks"
+            if tasks_dir.exists():
+                for task_dir in tasks_dir.iterdir():
+                    if not task_dir.is_dir():
+                        continue
+
+                    task_json = task_dir / "task.json"
+                    execution_json = task_dir / "execution.json"
+                    if not (task_json.exists() and execution_json.exists()):
+                        continue
+
+                    try:
+                        task_def = json.loads(task_json.read_text())
+                        execution = json.loads(execution_json.read_text())
+
+                        domain = task_def.get("domain", "unknown")
+                        if domain not in domain_breakdown:
+                            domain_breakdown[domain] = {
+                                "total": 0,
+                                "success": 0,
+                                "rate": 0.0,
+                                "avg_steps": 0.0,
+                                "total_steps": 0,
+                            }
+
+                        domain_breakdown[domain]["total"] += 1
+                        if execution.get("success"):
+                            domain_breakdown[domain]["success"] += 1
+                        domain_breakdown[domain]["total_steps"] += execution.get("num_steps", 0)
+
+                    except Exception:
+                        continue
+
+            # Calculate averages
+            for domain, stats in domain_breakdown.items():
+                if stats["total"] > 0:
+                    stats["rate"] = stats["success"] / stats["total"]
+                    stats["avg_steps"] = stats["total_steps"] / stats["total"]
+
+            return {
+                "success_rate_over_time": [],  # TODO: implement trend tracking
+                "avg_steps_per_task": [],  # TODO: implement trend tracking
+                "domain_breakdown": domain_breakdown,
+                "episode_success_metrics": {
+                    "first_action_accuracy": summary.get("first_action_accuracy", 0.0),
+                    "episode_success_rate": summary.get("success_rate", 0.0),
+                    "avg_steps_to_success": summary.get("avg_steps", 0.0),
+                    "avg_steps_to_failure": 0.0,  # TODO: calculate from failed tasks
+                },
+            }
+        except Exception as e:
+            return {"error": f"Failed to load metrics: {str(e)}"}
+
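For reference, one domain_breakdown entry after the averaging pass above (values illustrative):

domain_breakdown = {
    "browser": {
        "total": 10,
        "success": 6,
        "rate": 0.6,        # success / total
        "avg_steps": 7.2,   # total_steps / total
        "total_steps": 72,
    },
}
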
+    def _get_benchmark_workers(self) -> dict:
+        """Get worker status and utilization.
+
+        Returns:
+            dict with total/active/idle workers and per-worker stats
+        """
+        # Get VM registry
+        vms = self._fetch_vm_registry()
+
+        active_workers = [v for v in vms if v.get("status") == "online"]
+        idle_workers = [v for v in vms if v.get("status") != "online"]
+
+        workers = []
+        for vm in vms:
+            workers.append(
+                {
+                    "worker_id": vm.get("name", "unknown"),
+                    "status": "running" if vm.get("status") == "online" else "idle",
+                    "current_task": vm.get("current_task"),
+                    "tasks_completed": vm.get("tasks_completed", 0),
+                    "uptime_seconds": vm.get("uptime_seconds", 0),
+                    "idle_time_seconds": vm.get("idle_time_seconds", 0),
+                }
+            )
+
+        return {
+            "total_workers": len(vms),
+            "active_workers": len(active_workers),
+            "idle_workers": len(idle_workers),
+            "workers": workers,
+        }
+
+    def _get_benchmark_runs(self) -> list[dict]:
+        """Load all benchmark runs from benchmark_results directory.
+
+        Returns:
+            List of benchmark run summaries sorted by timestamp (newest first)
+        """
+        results_dir = Path("benchmark_results")
+        if not results_dir.exists():
+            return []
+
+        runs = []
+        for run_dir in results_dir.iterdir():
+            if run_dir.is_dir():
+                summary_file = run_dir / "summary.json"
+                if summary_file.exists():
+                    try:
+                        summary = json.loads(summary_file.read_text())
+                        runs.append(summary)
+                    except (json.JSONDecodeError, IOError) as e:
+                        print(f"Warning: Failed to load {summary_file}: {e}")
+
+        # Sort by run_name descending (newest first)
+        runs.sort(key=lambda r: r.get("run_name", ""), reverse=True)
+        return runs
+
+    def _get_task_execution(self, run_name: str, task_id: str) -> dict:
+        """Load task execution details from execution.json.
+
+        Args:
+            run_name: Name of the benchmark run
+            task_id: Task identifier
+
+        Returns:
+            Task execution data with steps and screenshots
+        """
+        results_dir = Path("benchmark_results")
+        execution_file = results_dir / run_name / "tasks" / task_id / "execution.json"
+
+        if not execution_file.exists():
+            raise FileNotFoundError(f"Execution file not found: {execution_file}")
+
+        try:
+            return json.loads(execution_file.read_text())
+        except (json.JSONDecodeError, IOError) as e:
+            raise RuntimeError(f"Failed to load execution data: {e}") from e
+
+    async def _detect_running_benchmark(
+        self, vm_ip: str, container_name: str = "winarena"
+    ) -> dict:
+        """Detect if a benchmark is running on the VM and extract progress.
+
+        SSH into the VM and check:
+        1. Process running: docker exec {container} pgrep -f 'python.*run.py'
+        2. Log progress: tail /tmp/waa_benchmark.log
+
+        Returns:
+            dict with:
+            - running: bool
+            - current_task: str (task ID or description)
+            - progress: dict with tasks_completed, total_tasks, current_step
+            - recent_logs: str (last few log lines)
+        """
+        import subprocess
+        import re
+
+        result = {
+            "running": False,
+            "current_task": None,
+            "progress": {
+                "tasks_completed": 0,
+                "total_tasks": 0,
+                "current_step": 0,
+            },
+            "recent_logs": "",
+        }
+
+        try:
+            # Check if benchmark process is running
+            process_check = subprocess.run(
+                ["ssh",
+                 "-o", "StrictHostKeyChecking=no",
+                 "-o", "ConnectTimeout=5",
+                 "-i", str(Path.home() / ".ssh" / "id_rsa"),
+                 f"azureuser@{vm_ip}",
+                 f"docker exec {container_name} pgrep -f 'python.*run.py' 2>/dev/null || echo ''"],
+                capture_output=True, text=True, timeout=10,
+            )
+
+            if process_check.returncode == 0 and process_check.stdout.strip():
+                result["running"] = True
+
+                # Get benchmark log
+                log_check = subprocess.run(
+                    ["ssh",
+                     "-o", "StrictHostKeyChecking=no",
+                     "-o", "ConnectTimeout=5",
+                     "-i", str(Path.home() / ".ssh" / "id_rsa"),
+                     f"azureuser@{vm_ip}",
+                     "tail -100 /tmp/waa_benchmark.log 2>/dev/null || echo ''"],
+                    capture_output=True, text=True, timeout=10,
+                )
+
+                if log_check.returncode == 0 and log_check.stdout.strip():
+                    logs = log_check.stdout
+                    result["recent_logs"] = logs[-500:]  # Last 500 chars
+
+                    # Parse progress from logs
+                    # Look for patterns like "Task 5/30" or "Completed: 5, Remaining: 25"
+                    task_match = re.search(r"Task\s+(\d+)/(\d+)", logs)
+                    if task_match:
+                        result["progress"]["tasks_completed"] = int(task_match.group(1))
+                        result["progress"]["total_tasks"] = int(task_match.group(2))
+
+                    # Extract current task ID
+                    task_id_match = re.search(r"(?:Running|Processing) task:\s*(\S+)", logs)
+                    if task_id_match:
+                        result["current_task"] = task_id_match.group(1)
+
+                    # Extract step info
+                    step_match = re.search(r"Step\s+(\d+)", logs)
+                    if step_match:
+                        result["progress"]["current_step"] = int(step_match.group(1))
+
+        except Exception:
+            # SSH or parsing failed - leave defaults
+            pass
+
+        return result
+
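The regexes above against a representative log excerpt (the log lines are illustrative, matching the documented patterns):

import re

logs = "Task 5/30\nRunning task: notepad_easy_003\nStep 12"
print(re.search(r"Task\s+(\d+)/(\d+)", logs).groups())                    # ('5', '30')
print(re.search(r"(?:Running|Processing) task:\s*(\S+)", logs).group(1))  # notepad_easy_003
print(re.search(r"Step\s+(\d+)", logs).group(1))                          # 12
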
+    def _parse_task_result(self, log_lines: list[str], task_id: str) -> dict:
+        """Parse task success/failure from log output.
+
+        WAA log patterns:
+        - Success: "Task task_001 completed successfully"
+        - Success: "Result: PASS"
+        - Failure: "Task task_001 failed"
+        - Failure: "Result: FAIL"
+        - Score: "Score: 0.85"
+        """
+        import re
+
+        success = None
+        score = None
+
+        # Search backwards from most recent; the first match wins for each field
+        for line in reversed(log_lines):
+            # Check for explicit result
+            if success is None:
+                if "Result: PASS" in line or "completed successfully" in line:
+                    success = True
+                elif "Result: FAIL" in line or "failed" in line.lower():
+                    success = False
+
+            # Check for score
+            if score is None:
+                score_match = re.search(r"Score:\s*([\d.]+)", line)
+                if score_match:
+                    try:
+                        score = float(score_match.group(1))
+                    except ValueError:
+                        pass
+
+            # Check for task-specific completion
+            if success is None and task_id in line:
+                if "success" in line.lower() or "pass" in line.lower():
+                    success = True
+                elif "fail" in line.lower() or "error" in line.lower():
+                    success = False
+
+        # Default to True if no explicit failure found (backwards compatible)
+        if success is None:
+            success = True
+
+        return {"success": success, "score": score}
+
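A quick check of the parser against sample lines (illustrative; `viewer` stands in for a handler instance):

log_lines = [
    "Running task: task_001",
    "Score: 0.85",
    "Task task_001 completed successfully",
]
# Scanning newest-first, the parser above yields:
#   viewer._parse_task_result(log_lines, "task_001")
#   -> {"success": True, "score": 0.85}
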
+    def _stream_benchmark_updates(self, interval: int):
+        """Stream Server-Sent Events for benchmark status updates.
+
+        Streams events:
+        - connected: Initial connection event
+        - status: VM status and probe results
+        - progress: Benchmark progress (tasks completed, current task)
+        - task_complete: When a task finishes
+        - heartbeat: Keep-alive signal every 30 seconds
+        - error: Error messages
+
+        Uses a generator-based approach to avoid blocking the main thread
+        and properly handles client disconnection.
+        """
+        import time
+        import select
+
+        HEARTBEAT_INTERVAL = 30  # seconds
+
+        # Set SSE headers
+        self.send_response(200)
+        self.send_header("Content-Type", "text/event-stream")
+        self.send_header("Cache-Control", "no-cache")
+        self.send_header("Access-Control-Allow-Origin", "*")
+        self.send_header("Connection", "keep-alive")
+        self.send_header("X-Accel-Buffering", "no")  # Disable nginx buffering
+        self.end_headers()
+
+        # Track connection state
+        client_connected = True
+
+        def send_event(event_type: str, data: dict) -> bool:
+            """Send an SSE event. Returns False if client disconnected."""
+            nonlocal client_connected
+            if not client_connected:
+                return False
+            try:
+                event_str = f"event: {event_type}\ndata: {json.dumps(data)}\n\n"
+                self.wfile.write(event_str.encode("utf-8"))
+                self.wfile.flush()
+                return True
+            except (BrokenPipeError, ConnectionResetError, ConnectionAbortedError):
+                # Client disconnected
+                client_connected = False
+                return False
+            except Exception as e:
+                # Other error - log and assume disconnected
+                print(f"SSE send error: {e}")
+                client_connected = False
+                return False
+
+        def check_client_connected() -> bool:
+            """Check if client is still connected using socket select."""
+            nonlocal client_connected
+            if not client_connected:
+                return False
+            try:
+                # Check if the socket has data (client sent something or closed).
+                # Use a non-blocking check with 0 timeout.
+                rlist, _, xlist = select.select([self.rfile], [], [self.rfile], 0)
+                if xlist:
+                    # Error condition on socket
+                    client_connected = False
+                    return False
+                if rlist:
+                    # Client sent data - for SSE this usually means disconnect
+                    # (SSE is server-push only, the client doesn't send data)
+                    data = self.rfile.read(1)
+                    if not data:
+                        client_connected = False
+                        return False
+                return True
+            except Exception:
+                client_connected = False
+                return False
+
+        last_task = None
+        last_heartbeat = time.time()
+        recent_log_lines = []
+
+        # Send initial connected event
+        if not send_event(
+            "connected",
+            {"timestamp": time.time(), "interval": interval, "version": "1.0"},
+        ):
+            return
+
+        try:
+            iteration_count = 0
+            max_iterations = 3600 // interval  # Max 1 hour of streaming
+
+            while client_connected and iteration_count < max_iterations:
+                iteration_count += 1
+                current_time = time.time()
+
+                # Check client connection before doing work
+                if not check_client_connected():
+                    break
+
+                # Send heartbeat every 30 seconds to prevent proxy/LB timeouts
+                if current_time - last_heartbeat >= HEARTBEAT_INTERVAL:
+                    if not send_event("heartbeat", {"timestamp": current_time}):
+                        break
+                    last_heartbeat = current_time
+
+                # Fetch background tasks (includes VM status)
+                tasks = self._fetch_background_tasks()
+
+                # Send VM status event
+                vm_task = next(
+                    (t for t in tasks if t.get("task_type") == "docker_container"),
+                    None,
+                )
+                if vm_task:
+                    vm_data = {
+                        "type": "vm_status",
+                        "connected": vm_task.get("status") in ["running", "completed"],
+                        "phase": vm_task.get("phase", "unknown"),
+                        "waa_ready": vm_task.get("metadata", {}).get("waa_server_ready", False),
+                        "probe": {
+                            "status": vm_task.get("metadata", {}).get("probe_response", "unknown"),
+                            "vnc_url": vm_task.get("metadata", {}).get("vnc_url"),
+                        },
+                    }
+
+                    if not send_event("status", vm_data):
+                        break
+
+                    # If VM is ready, check for running benchmark
+                    if vm_data["waa_ready"]:
+                        # Get VM IP from tasks
+                        vm_ip = None
+                        azure_vm = next(
+                            (t for t in tasks if t.get("task_type") == "vm_provision"),
+                            None,
+                        )
+                        if azure_vm:
+                            vm_ip = azure_vm.get("metadata", {}).get("ip_address")
+
+                        if vm_ip:
+                            # Detect running benchmark using the sync version
+                            benchmark_status = self._detect_running_benchmark_sync(
+                                vm_ip,
+                                vm_task.get("metadata", {}).get("container", "winarena"),
+                            )
+
+                            if benchmark_status["running"]:
+                                # Store log lines for result parsing
+                                if benchmark_status.get("recent_logs"):
+                                    recent_log_lines = benchmark_status["recent_logs"].split("\n")
+
+                                # Send progress event
+                                progress_data = {
+                                    "tasks_completed": benchmark_status["progress"]["tasks_completed"],
+                                    "total_tasks": benchmark_status["progress"]["total_tasks"],
+                                    "current_task": benchmark_status["current_task"],
+                                    "current_step": benchmark_status["progress"]["current_step"],
+                                }
+
+                                if not send_event("progress", progress_data):
+                                    break
+
+                                # Check if task completed
+                                current_task = benchmark_status["current_task"]
+                                if current_task and current_task != last_task:
+                                    if last_task is not None:
+                                        # Previous task completed - parse result from logs
+                                        result = self._parse_task_result(recent_log_lines, last_task)
+                                        complete_data = {
+                                            "task_id": last_task,
+                                            "success": result["success"],
+                                            "score": result["score"],
+                                        }
+                                        if not send_event("task_complete", complete_data):
+                                            break
+
+                                    last_task = current_task
+
+                # Check local benchmark progress file
+                progress_file = Path("benchmark_progress.json")
+                if progress_file.exists():
+                    try:
+                        progress = json.loads(progress_file.read_text())
+                        if progress.get("status") == "running":
+                            progress_data = {
+                                "tasks_completed": progress.get("tasks_complete", 0),
+                                "total_tasks": progress.get("tasks_total", 0),
+                                "current_task": progress.get("provider", "unknown"),
+                            }
+                            if not send_event("progress", progress_data):
+                                break
+                    except Exception:
+                        pass
+
+                # Non-blocking sleep using select with timeout
+                # This allows checking for client disconnect during sleep
+                try:
+                    select.select([self.rfile], [], [], interval)
+                except Exception:
+                    break
+
+        except (BrokenPipeError, ConnectionResetError, ConnectionAbortedError):
+            # Client disconnected - this is normal, don't log as error
+            pass
+        except Exception as e:
+            # Send error event if still connected
+            send_event("error", {"message": str(e)})
+        finally:
+            # Cleanup - connection is ending
+            client_connected = False
+
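A minimal client for this stream; the port and URL path are assumptions (adjust to the route the viewer actually registers):

import json
import urllib.request

# Minimal SSE reader: prints each event emitted by _stream_benchmark_updates.
with urllib.request.urlopen("http://localhost:8080/api/benchmark/stream") as resp:
    event = None
    for raw in resp:
        line = raw.decode("utf-8").rstrip("\n")
        if line.startswith("event: "):
            event = line[len("event: "):]
        elif line.startswith("data: "):
            payload = json.loads(line[len("data: "):])
            print(event, payload)
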
+    def _stream_azure_ops_updates(self):
+        """Stream Server-Sent Events for Azure operations status updates.
+
+        Monitors azure_ops_status.json for changes and streams updates.
+        Uses file modification time to detect changes efficiently.
+
+        Streams events:
+        - connected: Initial connection event
+        - status: Azure ops status update when file changes
+        - heartbeat: Keep-alive signal every 30 seconds
+        - error: Error messages
+        """
+        import time
+        import select
+        from pathlib import Path
+
+        HEARTBEAT_INTERVAL = 30  # seconds
+        CHECK_INTERVAL = 1  # Check file every second
+
+        # Set SSE headers
+        self.send_response(200)
+        self.send_header("Content-Type", "text/event-stream")
+        self.send_header("Cache-Control", "no-cache")
+        self.send_header("Access-Control-Allow-Origin", "*")
+        self.send_header("Connection", "keep-alive")
+        self.send_header("X-Accel-Buffering", "no")  # Disable nginx buffering
+        self.end_headers()
+
+        # Track connection state
+        client_connected = True
+        last_mtime = 0.0
+        last_session_mtime = 0.0
+        last_heartbeat = time.time()
+
+        def send_event(event_type: str, data: dict) -> bool:
+            """Send an SSE event. Returns False if client disconnected."""
+            nonlocal client_connected
+            if not client_connected:
+                return False
+            try:
+                event_str = f"event: {event_type}\ndata: {json.dumps(data)}\n\n"
+                self.wfile.write(event_str.encode("utf-8"))
+                self.wfile.flush()
+                return True
+            except (BrokenPipeError, ConnectionResetError, ConnectionAbortedError):
+                client_connected = False
+                return False
+            except Exception as e:
+                print(f"Azure ops SSE send error: {e}")
+                client_connected = False
+                return False
+
+        def check_client_connected() -> bool:
+            """Check if client is still connected using socket select."""
+            nonlocal client_connected
+            if not client_connected:
+                return False
+            try:
+                rlist, _, xlist = select.select([self.rfile], [], [self.rfile], 0)
+                if xlist:
+                    client_connected = False
+                    return False
+                if rlist:
+                    data = self.rfile.read(1)
+                    if not data:
+                        client_connected = False
+                        return False
+                return True
+            except Exception:
+                client_connected = False
+                return False
+
+        # Status file path
+        from openadapt_ml.benchmarks.azure_ops_tracker import (
+            DEFAULT_OUTPUT_FILE,
+            read_status,
+        )
+        from openadapt_ml.benchmarks.session_tracker import (
+            get_session,
+            update_session_vm_state,
+            DEFAULT_SESSION_FILE,
+        )
+
+        status_file = Path(DEFAULT_OUTPUT_FILE)
+        session_file = Path(DEFAULT_SESSION_FILE)
+
+        def compute_server_side_values(status: dict) -> dict:
+            """Get elapsed_seconds and cost_usd from session tracker for persistence."""
+            # Get session data (persistent across refreshes)
+            session = get_session()
+
+            # Update session based on VM state if we have VM info
+            # IMPORTANT: Only pass vm_ip if it's truthy to avoid
+            # overwriting session's stable vm_ip with None
+            if status.get("vm_state") and status.get("vm_state") != "unknown":
+                status_vm_ip = status.get("vm_ip")
+                # Build update kwargs - only include vm_ip if present
+                update_kwargs = {
+                    "vm_state": status["vm_state"],
+                    "vm_size": status.get("vm_size"),
+                }
+                if status_vm_ip:  # Only include if truthy
+                    update_kwargs["vm_ip"] = status_vm_ip
+                session = update_session_vm_state(**update_kwargs)
+
+            # Use session's vm_ip as authoritative source
+            # This prevents IP flickering when status file has stale/None values
+            if session.get("vm_ip"):
+                status["vm_ip"] = session["vm_ip"]
+
+            # Use session's elapsed_seconds and cost_usd for persistence
+            if session.get("is_active") or session.get("accumulated_seconds", 0) > 0:
+                status["elapsed_seconds"] = session.get("elapsed_seconds", 0.0)
+                status["cost_usd"] = session.get("cost_usd", 0.0)
+                status["started_at"] = session.get("started_at")
+                status["session_id"] = session.get("session_id")
+                status["session_is_active"] = session.get("is_active", False)
+                # Include accumulated time from previous sessions for hybrid display
+                status["accumulated_seconds"] = session.get("accumulated_seconds", 0.0)
+                # Calculate current session time (total - accumulated)
+                current_session_seconds = max(
+                    0, status["elapsed_seconds"] - status["accumulated_seconds"]
+                )
+                status["current_session_seconds"] = current_session_seconds
+                hourly_rate = session.get("hourly_rate_usd", 0.422)
+                status["current_session_cost_usd"] = (
+                    current_session_seconds / 3600
+                ) * hourly_rate
+
+            try:
+                tunnel_mgr = get_tunnel_manager()
+                tunnel_status = tunnel_mgr.get_tunnel_status()
+                status["tunnels"] = {
+                    name: {
+                        "active": s.active,
+                        "local_port": s.local_port,
+                        "remote_endpoint": s.remote_endpoint,
+                        "pid": s.pid,
+                        "error": s.error,
+                    }
+                    for name, s in tunnel_status.items()
+                }
+            except Exception as e:
+                status["tunnels"] = {"error": str(e)}
+
+            return status
+
+        # Send initial connected event
+        if not send_event(
+            "connected",
+            {"timestamp": time.time(), "version": "1.0"},
+        ):
+            return
+
+        # Send initial status immediately
+        try:
+            status = compute_server_side_values(read_status())
+            if not send_event("status", status):
+                return
+            if status_file.exists():
+                last_mtime = status_file.stat().st_mtime
+        except Exception as e:
+            send_event("error", {"message": str(e)})
+
+        try:
+            iteration_count = 0
+            max_iterations = 3600  # Max 1 hour of streaming
+            last_status_send = 0.0
+            STATUS_SEND_INTERVAL = 2  # Send status every 2 seconds for live updates
+
+            while client_connected and iteration_count < max_iterations:
+                iteration_count += 1
+                current_time = time.time()
+
+                # Check client connection
+                if not check_client_connected():
+                    break
+
+                # Send heartbeat every 30 seconds
+                if current_time - last_heartbeat >= HEARTBEAT_INTERVAL:
+                    if not send_event("heartbeat", {"timestamp": current_time}):
+                        break
+                    last_heartbeat = current_time
+
+                # Check if status or session file changed OR if enough time passed
+                try:
+                    status_changed = False
+                    session_changed = False
+                    time_to_send = current_time - last_status_send >= STATUS_SEND_INTERVAL
+
+                    if status_file.exists():
+                        current_mtime = status_file.stat().st_mtime
+                        if current_mtime > last_mtime:
+                            status_changed = True
+                            last_mtime = current_mtime
+
+                    if session_file.exists():
+                        current_session_mtime = session_file.stat().st_mtime
+                        if current_session_mtime > last_session_mtime:
+                            session_changed = True
+                            last_session_mtime = current_session_mtime
+
+                    # Send status if a file changed OR the periodic timer expired
+                    # This ensures live elapsed time/cost updates even without file changes
+                    if status_changed or session_changed or time_to_send:
+                        status = compute_server_side_values(read_status())
+                        if not send_event("status", status):
+                            break
+                        last_status_send = current_time
+                except Exception as e:
+                    # File access error - log but continue
+                    print(f"Azure ops SSE file check error: {e}")
+
+                # Sleep briefly before next check
+                try:
+                    select.select([self.rfile], [], [], CHECK_INTERVAL)
+                except Exception:
+                    break
+
+        except (BrokenPipeError, ConnectionResetError, ConnectionAbortedError):
+            # Client disconnected - normal
+            pass
+        except Exception as e:
+            send_event("error", {"message": str(e)})
+        finally:
+            client_connected = False
+
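The hybrid session accounting in compute_server_side_values, as arithmetic (the rate is the default from the code above; other values illustrative):

elapsed_seconds = 5400.0        # total across sessions, from the session tracker
accumulated_seconds = 3600.0    # carried over from previous sessions
current_session_seconds = max(0, elapsed_seconds - accumulated_seconds)       # 1800.0
hourly_rate_usd = 0.422
current_session_cost_usd = (current_session_seconds / 3600) * hourly_rate_usd  # 0.211
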
+    def _detect_running_benchmark_sync(
+        self, vm_ip: str, container_name: str = "winarena"
+    ) -> dict:
+        """Synchronous version of _detect_running_benchmark.
+
+        Avoids creating a new event loop on each call, which causes issues
+        when called from a synchronous context.
+        """
+        import subprocess
+        import re
+
+        result = {
+            "running": False,
+            "current_task": None,
+            "progress": {
+                "tasks_completed": 0,
+                "total_tasks": 0,
+                "current_step": 0,
+            },
+            "recent_logs": "",
+        }
+
+        try:
+            # Check if benchmark process is running
+            process_check = subprocess.run(
+                ["ssh",
+                 "-o", "StrictHostKeyChecking=no",
+                 "-o", "ConnectTimeout=5",
+                 "-i", str(Path.home() / ".ssh" / "id_rsa"),
+                 f"azureuser@{vm_ip}",
+                 f"docker exec {container_name} pgrep -f 'python.*run.py' 2>/dev/null || echo ''"],
+                capture_output=True, text=True, timeout=10,
+            )
+
+            if process_check.returncode == 0 and process_check.stdout.strip():
+                result["running"] = True
+
+                # Get benchmark log
+                log_check = subprocess.run(
+                    ["ssh",
+                     "-o", "StrictHostKeyChecking=no",
+                     "-o", "ConnectTimeout=5",
+                     "-i", str(Path.home() / ".ssh" / "id_rsa"),
+                     f"azureuser@{vm_ip}",
+                     "tail -100 /tmp/waa_benchmark.log 2>/dev/null || echo ''"],
+                    capture_output=True, text=True, timeout=10,
+                )
+
+                if log_check.returncode == 0 and log_check.stdout.strip():
+                    logs = log_check.stdout
+                    result["recent_logs"] = logs[-500:]  # Last 500 chars
+
+                    # Parse progress from logs
+                    task_match = re.search(r"Task\s+(\d+)/(\d+)", logs)
+                    if task_match:
+                        result["progress"]["tasks_completed"] = int(task_match.group(1))
+                        result["progress"]["total_tasks"] = int(task_match.group(2))
+
+                    # Extract current task ID
+                    task_id_match = re.search(r"(?:Running|Processing) task:\s*(\S+)", logs)
+                    if task_id_match:
+                        result["current_task"] = task_id_match.group(1)
+
+                    # Extract step info
+                    step_match = re.search(r"Step\s+(\d+)", logs)
+                    if step_match:
+                        result["progress"]["current_step"] = int(step_match.group(1))
+
+        except Exception:
+            # SSH or parsing failed - leave defaults
+            pass
+
+        return result
+
+    def _register_vm(self, vm_data):
+        """Register a new VM in the registry."""
+        # Path to VM registry file (relative to project root)
+        project_root = Path(__file__).parent.parent.parent
+        registry_file = project_root / "benchmark_results" / "vm_registry.json"
+
+        # Load existing registry
+        vms = []
+        if registry_file.exists():
+            try:
+                with open(registry_file) as f:
+                    vms = json.load(f)
+            except Exception:
+                pass
+
+        # Add new VM
+        new_vm = {
+            "name": vm_data.get("name", "unnamed-vm"),
+            "ssh_host": vm_data.get("ssh_host", ""),
+            "ssh_user": vm_data.get("ssh_user", "azureuser"),
+            "vnc_port": vm_data.get("vnc_port", 8006),
+            "waa_port": vm_data.get("waa_port", 5000),
+            "docker_container": vm_data.get("docker_container", "win11-waa"),
+            "internal_ip": vm_data.get("internal_ip", "20.20.20.21"),
+        }
+
+        vms.append(new_vm)
+
+        # Save registry
+        try:
+            registry_file.parent.mkdir(parents=True, exist_ok=True)
+            with open(registry_file, "w") as f:
+                json.dump(vms, f, indent=2)
+            return {"status": "success", "vm": new_vm}
+        except Exception as e:
+            return {"status": "error", "message": str(e)}
+
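A sketch of registering a VM through this helper; payload fields mirror the defaults above, values are illustrative, and `viewer` stands in for a handler instance:

payload = {
    "name": "win11-waa-01",
    "ssh_host": "20.115.30.7",
    "ssh_user": "azureuser",
    "vnc_port": 8006,
    "waa_port": 5000,
    "docker_container": "win11-waa",
    "internal_ip": "20.20.20.21",
}
# viewer._register_vm(payload)
# -> {"status": "success", "vm": {...}} on success,
#    {"status": "error", "message": "..."} if the registry file can't be written.
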
+    def _start_benchmark_run(self, params: dict) -> dict:
+        """Start a benchmark run with the given parameters.
+
+        Runs the benchmark in a background thread and returns immediately.
+        Progress is tracked via benchmark_progress.json.
+
+        Expected params:
+            {
+                "model": "gpt-4o",
+                "num_tasks": 5,
+                "agent": "navi",
+                "task_selection": "all" | "domain" | "task_ids",
+                "domain": "general",  # if task_selection == "domain"
+                "task_ids": ["task_001", "task_015"]  # if task_selection == "task_ids"
+            }
+
+        Returns:
+            dict with status and params
+        """
+        import subprocess  # used by the background run() thread below
+
+        from dotenv import load_dotenv
+
+        # Load .env file for API keys
+        project_root = Path(__file__).parent.parent.parent
+        load_dotenv(project_root / ".env")
+
+        # Build CLI command
+        cmd = [
+            "uv", "run", "python", "-m", "openadapt_ml.benchmarks.cli",
+            "vm", "run-waa",
+            "--num-tasks", str(params.get("num_tasks", 5)),
+            "--model", params.get("model", "gpt-4o"),
+            "--agent", params.get("agent", "navi"),
+            "--no-open",  # Don't open viewer (already open)
+        ]
+
+        # Add task selection args
+        task_selection = params.get("task_selection", "all")
+        if task_selection == "domain":
+            domain = params.get("domain", "general")
+            cmd.extend(["--domain", domain])
+        elif task_selection == "task_ids":
+            task_ids = params.get("task_ids", [])
+            if task_ids:
+                cmd.extend(["--task-ids", ",".join(task_ids)])
+
+        # Create progress log file (in cwd, which is serve_dir)
+        progress_file = Path("benchmark_progress.json")
+
+        # Write initial progress
+        model = params.get("model", "gpt-4o")
+        num_tasks = params.get("num_tasks", 5)
+        agent = params.get("agent", "navi")
+
+        print(f"\n[Benchmark] Starting WAA benchmark: model={model}, tasks={num_tasks}, agent={agent}")
+        print(f"[Benchmark] Task selection: {task_selection}")
+        if task_selection == "domain":
+            print(f"[Benchmark] Domain: {params.get('domain', 'general')}")
+        elif task_selection == "task_ids":
+            print(f"[Benchmark] Task IDs: {params.get('task_ids', [])}")
+        print(f"[Benchmark] Command: {' '.join(cmd)}")
+
+        progress_file.write_text(
+            json.dumps(
+                {
+                    "status": "running",
+                    "model": model,
+                    "num_tasks": num_tasks,
+                    "agent": agent,
+                    "task_selection": task_selection,
+                    "tasks_complete": 0,
+                    "message": f"Starting {model} benchmark with {num_tasks} tasks...",
+                }
+            )
+        )
+
+        # Copy environment with loaded vars
+        env = os.environ.copy()
+
+        # Run in background thread
+        def run():
+            result = subprocess.run(
+                cmd, capture_output=True, text=True, cwd=str(project_root), env=env
+            )
+
+            print(f"\n[Benchmark] Output:\n{result.stdout}")
+            if result.stderr:
+                print(f"[Benchmark] Stderr: {result.stderr}")
+
+            if result.returncode == 0:
+                print("[Benchmark] Complete. Regenerating viewer...")
+                progress_file.write_text(
+                    json.dumps(
+                        {
+                            "status": "complete",
+                            "model": model,
+                            "num_tasks": num_tasks,
+                            "message": "Benchmark complete. Refresh to see results.",
+                        }
+                    )
+                )
+                # Regenerate benchmark viewer
+                _regenerate_benchmark_viewer_if_available(serve_dir)
+            else:
+                error_msg = result.stderr[:200] if result.stderr else "Unknown error"
+                print(f"[Benchmark] Failed: {error_msg}")
+                progress_file.write_text(
+                    json.dumps(
3585
+ {
3586
+ "status": "error",
3587
+ "model": model,
3588
+ "num_tasks": num_tasks,
3589
+ "message": f"Benchmark failed: {error_msg}",
3590
+ }
3591
+ )
3592
+ )
3593
+
3594
+ threading.Thread(target=run, daemon=True).start()
3595
+
3596
+ return {"status": "started", "params": params}
+
         def do_OPTIONS(self):
             # Handle CORS preflight
             self.send_response(200)
-            self.send_header('Access-Control-Allow-Origin', '*')
-            self.send_header('Access-Control-Allow-Methods', 'POST, OPTIONS')
-            self.send_header('Access-Control-Allow-Headers', 'Content-Type')
+            self.send_header("Access-Control-Allow-Origin", "*")
+            self.send_header("Access-Control-Allow-Methods", "POST, OPTIONS")
+            self.send_header("Access-Control-Allow-Headers", "Content-Type")
             self.end_headers()
 
-    with socketserver.TCPServer(("", port), StopHandler) as httpd:
+    class ThreadedTCPServer(socketserver.ThreadingMixIn, socketserver.TCPServer):
+        allow_reuse_address = True
+        daemon_threads = True  # Don't block shutdown
+
+    with ThreadedTCPServer(("", port), StopHandler) as httpd:
         url = f"http://localhost:{port}/{start_page}"
         print(f"\nServing at: {url}")
         print(f"Directory: {serve_dir}")
@@ -611,7 +3654,40 @@ def cmd_viewer(args: argparse.Namespace) -> int:
         state.learning_rate = data.get("learning_rate", 0)
         state.losses = data.get("losses", [])
         state.status = data.get("status", "completed")
-        state.elapsed_time = data.get("elapsed_time", 0.0)  # Load elapsed time for completed training
+        state.elapsed_time = data.get(
+            "elapsed_time", 0.0
+        )  # Load elapsed time for completed training
+        state.goal = data.get("goal", "")
+        state.config_path = data.get("config_path", "")
+        state.capture_path = data.get("capture_path", "")
+
+        # Load model config from training_log.json or fall back to reading config file
+        state.model_name = data.get("model_name", "")
+        state.lora_r = data.get("lora_r", 0)
+        state.lora_alpha = data.get("lora_alpha", 0)
+        state.load_in_4bit = data.get("load_in_4bit", False)
+
+        # If model config not in JSON, try to read from config file
+        if not state.model_name and state.config_path:
+            try:
+                import yaml
+
+                # Try relative to project root first, then as absolute path
+                project_root = Path(__file__).parent.parent.parent
+                config_file = project_root / state.config_path
+                if not config_file.exists():
+                    config_file = Path(state.config_path)
+                if config_file.exists():
+                    with open(config_file) as cf:
+                        cfg = yaml.safe_load(cf)
+                    if cfg and "model" in cfg:
+                        state.model_name = cfg["model"].get("name", "")
+                        state.load_in_4bit = cfg["model"].get("load_in_4bit", False)
+                    if cfg and "lora" in cfg:
+                        state.lora_r = cfg["lora"].get("r", 0)
+                        state.lora_alpha = cfg["lora"].get("lora_alpha", 0)
+            except Exception as e:
+                print(f" Warning: Could not read config file: {e}")
 
         config = TrainingConfig(
             num_train_epochs=data.get("total_epochs", 5),
@@ -620,14 +3696,16 @@ def cmd_viewer(args: argparse.Namespace) -> int:
 
     dashboard_html = generate_training_dashboard(state, config)
     (current_dir / "dashboard.html").write_text(dashboard_html)
-    print(f" Regenerated: dashboard.html")
+    print(" Regenerated: dashboard.html")
 
     # Generate unified viewer using consolidated function
     viewer_path = generate_unified_viewer_from_output_dir(current_dir)
     if viewer_path:
         print(f"\nGenerated: {viewer_path}")
     else:
-        print("\nNo comparison data found. Run comparison first or copy from capture directory.")
+        print(
+            "\nNo comparison data found. Run comparison first or copy from capture directory."
+        )
 
     # Also regenerate benchmark viewer from latest benchmark results
     _regenerate_benchmark_viewer_if_available(current_dir)
@@ -647,9 +3725,9 @@ def cmd_benchmark_viewer(args: argparse.Namespace) -> int:
         print(f"Error: Benchmark directory not found: {benchmark_dir}")
         return 1
 
-    print(f"\n{'='*50}")
+    print(f"\n{'=' * 50}")
     print("GENERATING BENCHMARK VIEWER")
-    print(f"{'='*50}")
+    print(f"{'=' * 50}")
     print(f"Benchmark dir: {benchmark_dir}")
     print()
 
@@ -664,6 +3742,7 @@ def cmd_benchmark_viewer(args: argparse.Namespace) -> int:
     except Exception as e:
         print(f"Error generating benchmark viewer: {e}")
         import traceback
+
         traceback.print_exc()
         return 1
 
@@ -680,16 +3759,19 @@ def cmd_compare(args: argparse.Namespace) -> int:
         print(f"Error: Checkpoint not found: {checkpoint}")
         return 1
 
-    print(f"\n{'='*50}")
+    print(f"\n{'=' * 50}")
     print("RUNNING COMPARISON")
-    print(f"{'='*50}")
+    print(f"{'=' * 50}")
     print(f"Capture: {capture_path}")
     print(f"Checkpoint: {checkpoint or 'None (capture only)'}")
     print()
 
     cmd = [
-        sys.executable, "-m", "openadapt_ml.scripts.compare",
-        "--capture", str(capture_path),
+        sys.executable,
+        "-m",
+        "openadapt_ml.scripts.compare",
+        "--capture",
+        str(capture_path),
     ]
 
     if checkpoint:
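Building cmd as an argv list rather than a shell string means every element reaches the child process verbatim, so capture paths containing spaces need no quoting. A tiny demonstration of that property (the child program here simply echoes back its argv):

    import subprocess
    import sys

    # "/tmp/my capture" stays one argument; no shell is involved.
    cmd = [
        sys.executable,
        "-c",
        "import sys; print(sys.argv[1:])",
        "--capture",
        "/tmp/my capture",
    ]
    subprocess.run(cmd)  # prints: ['--capture', '/tmp/my capture']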
@@ -728,7 +3810,7 @@ Examples:
 
     # Run comparison
     uv run python -m openadapt_ml.cloud.local compare --capture ~/captures/my-workflow --checkpoint checkpoints/model
-    """
+    """,
     )
     subparsers = parser.add_subparsers(dest="command", help="Command")
 
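Each subcommand registered below follows the set_defaults(func=...) dispatch idiom: the parser stores the handler on the parsed namespace, and the caller invokes it. The main() that performs the final dispatch is not part of this hunk, so the args.func(args) call in this sketch is the conventional completion of the pattern, not a quote of the package's code:

    import argparse


    def cmd_hello(args: argparse.Namespace) -> int:
        print(f"hello {args.name}")
        return 0


    parser = argparse.ArgumentParser()
    subparsers = parser.add_subparsers(dest="command", help="Command")
    p_hello = subparsers.add_parser("hello", help="Say hello")
    p_hello.add_argument("--name", default="world")
    p_hello.set_defaults(func=cmd_hello)

    args = parser.parse_args(["hello", "--name", "openadapt"])
    raise SystemExit(args.func(args))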
@@ -740,9 +3822,15 @@ Examples:
     # train
     p_train = subparsers.add_parser("train", help="Run training locally")
     p_train.add_argument("--capture", required=True, help="Path to capture directory")
-    p_train.add_argument("--goal", help="Task goal (default: derived from capture name)")
-    p_train.add_argument("--config", help="Config file (default: auto-select based on device)")
-    p_train.add_argument("--open", action="store_true", help="Open dashboard in browser")
+    p_train.add_argument(
+        "--goal", help="Task goal (default: derived from capture name)"
+    )
+    p_train.add_argument(
+        "--config", help="Config file (default: auto-select based on device)"
+    )
+    p_train.add_argument(
+        "--open", action="store_true", help="Open dashboard in browser"
+    )
     p_train.set_defaults(func=cmd_train)
 
     # check
@@ -753,10 +3841,21 @@ Examples:
     p_serve = subparsers.add_parser("serve", help="Start web server for dashboard")
     p_serve.add_argument("--port", type=int, default=8765, help="Port number")
     p_serve.add_argument("--open", action="store_true", help="Open in browser")
-    p_serve.add_argument("--quiet", "-q", action="store_true", help="Suppress request logging")
-    p_serve.add_argument("--no-regenerate", action="store_true",
-                         help="Skip regenerating dashboard/viewer (serve existing files)")
-    p_serve.add_argument("--benchmark", help="Serve benchmark results directory instead of training output")
+    p_serve.add_argument(
+        "--quiet", "-q", action="store_true", help="Suppress request logging"
+    )
+    p_serve.add_argument(
+        "--no-regenerate",
+        action="store_true",
+        help="Skip regenerating dashboard/viewer (serve existing files)",
+    )
+    p_serve.add_argument(
+        "--benchmark",
+        help="Serve benchmark results directory instead of training output",
+    )
+    p_serve.add_argument(
+        "--start-page", help="Override default start page (e.g., benchmark.html)"
+    )
     p_serve.set_defaults(func=cmd_serve)
 
     # viewer
@@ -765,9 +3864,15 @@ Examples:
     p_viewer.set_defaults(func=cmd_viewer)
 
     # benchmark_viewer
-    p_benchmark = subparsers.add_parser("benchmark-viewer", help="Generate benchmark viewer")
-    p_benchmark.add_argument("benchmark_dir", help="Path to benchmark results directory")
-    p_benchmark.add_argument("--open", action="store_true", help="Open viewer in browser")
+    p_benchmark = subparsers.add_parser(
+        "benchmark-viewer", help="Generate benchmark viewer"
+    )
+    p_benchmark.add_argument(
+        "benchmark_dir", help="Path to benchmark results directory"
+    )
+    p_benchmark.add_argument(
+        "--open", action="store_true", help="Open viewer in browser"
+    )
     p_benchmark.set_defaults(func=cmd_benchmark_viewer)
 
     # compare