openadapt-ml 0.1.0__py3-none-any.whl → 0.2.0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (63) hide show
  1. openadapt_ml/benchmarks/__init__.py +8 -0
  2. openadapt_ml/benchmarks/agent.py +90 -11
  3. openadapt_ml/benchmarks/azure.py +35 -6
  4. openadapt_ml/benchmarks/cli.py +4449 -201
  5. openadapt_ml/benchmarks/live_tracker.py +180 -0
  6. openadapt_ml/benchmarks/runner.py +41 -4
  7. openadapt_ml/benchmarks/viewer.py +1219 -0
  8. openadapt_ml/benchmarks/vm_monitor.py +610 -0
  9. openadapt_ml/benchmarks/waa.py +61 -4
  10. openadapt_ml/benchmarks/waa_deploy/Dockerfile +222 -0
  11. openadapt_ml/benchmarks/waa_deploy/__init__.py +10 -0
  12. openadapt_ml/benchmarks/waa_deploy/api_agent.py +539 -0
  13. openadapt_ml/benchmarks/waa_deploy/start_waa_server.bat +53 -0
  14. openadapt_ml/benchmarks/waa_live.py +619 -0
  15. openadapt_ml/cloud/local.py +1555 -1
  16. openadapt_ml/cloud/ssh_tunnel.py +553 -0
  17. openadapt_ml/datasets/next_action.py +87 -68
  18. openadapt_ml/evals/grounding.py +26 -8
  19. openadapt_ml/evals/trajectory_matching.py +84 -36
  20. openadapt_ml/experiments/demo_prompt/__init__.py +19 -0
  21. openadapt_ml/experiments/demo_prompt/format_demo.py +226 -0
  22. openadapt_ml/experiments/demo_prompt/results/experiment_20251231_002125.json +83 -0
  23. openadapt_ml/experiments/demo_prompt/results/experiment_n30_20251231_165958.json +1100 -0
  24. openadapt_ml/experiments/demo_prompt/results/multistep_20251231_025051.json +182 -0
  25. openadapt_ml/experiments/demo_prompt/run_experiment.py +531 -0
  26. openadapt_ml/experiments/waa_demo/__init__.py +10 -0
  27. openadapt_ml/experiments/waa_demo/demos.py +357 -0
  28. openadapt_ml/experiments/waa_demo/runner.py +717 -0
  29. openadapt_ml/experiments/waa_demo/tasks.py +151 -0
  30. openadapt_ml/export/__init__.py +9 -0
  31. openadapt_ml/export/__main__.py +6 -0
  32. openadapt_ml/export/cli.py +89 -0
  33. openadapt_ml/export/parquet.py +265 -0
  34. openadapt_ml/ingest/__init__.py +3 -4
  35. openadapt_ml/ingest/capture.py +89 -81
  36. openadapt_ml/ingest/loader.py +116 -68
  37. openadapt_ml/ingest/synthetic.py +221 -159
  38. openadapt_ml/retrieval/README.md +226 -0
  39. openadapt_ml/retrieval/USAGE.md +391 -0
  40. openadapt_ml/retrieval/__init__.py +91 -0
  41. openadapt_ml/retrieval/demo_retriever.py +817 -0
  42. openadapt_ml/retrieval/embeddings.py +629 -0
  43. openadapt_ml/retrieval/index.py +194 -0
  44. openadapt_ml/retrieval/retriever.py +160 -0
  45. openadapt_ml/runtime/policy.py +10 -10
  46. openadapt_ml/schema/__init__.py +104 -0
  47. openadapt_ml/schema/converters.py +541 -0
  48. openadapt_ml/schema/episode.py +457 -0
  49. openadapt_ml/scripts/compare.py +26 -16
  50. openadapt_ml/scripts/eval_policy.py +4 -5
  51. openadapt_ml/scripts/prepare_synthetic.py +14 -17
  52. openadapt_ml/scripts/train.py +81 -70
  53. openadapt_ml/training/benchmark_viewer.py +3225 -0
  54. openadapt_ml/training/trainer.py +120 -363
  55. openadapt_ml/training/trl_trainer.py +354 -0
  56. {openadapt_ml-0.1.0.dist-info → openadapt_ml-0.2.0.dist-info}/METADATA +102 -60
  57. openadapt_ml-0.2.0.dist-info/RECORD +86 -0
  58. openadapt_ml/schemas/__init__.py +0 -53
  59. openadapt_ml/schemas/sessions.py +0 -122
  60. openadapt_ml/schemas/validation.py +0 -252
  61. openadapt_ml-0.1.0.dist-info/RECORD +0 -55
  62. {openadapt_ml-0.1.0.dist-info → openadapt_ml-0.2.0.dist-info}/WHEEL +0 -0
  63. {openadapt_ml-0.1.0.dist-info → openadapt_ml-0.2.0.dist-info}/licenses/LICENSE +0 -0
@@ -10,6 +10,2960 @@ import json
10
10
  from pathlib import Path
11
11
 
12
12
 
13
+ def _get_background_tasks_panel_css() -> str:
14
+ """Return CSS for background tasks panel."""
15
+ return '''
16
+ .tasks-panel {
17
+ background: linear-gradient(135deg, rgba(100, 100, 255, 0.1) 0%, rgba(100, 100, 255, 0.05) 100%);
18
+ border: 1px solid rgba(100, 100, 255, 0.3);
19
+ border-radius: 12px;
20
+ padding: 20px 24px;
21
+ margin-bottom: 24px;
22
+ }
23
+ .tasks-header {
24
+ display: flex;
25
+ align-items: center;
26
+ justify-content: space-between;
27
+ margin-bottom: 16px;
28
+ }
29
+ .tasks-title {
30
+ display: flex;
31
+ align-items: center;
32
+ gap: 10px;
33
+ font-size: 1rem;
34
+ font-weight: 600;
35
+ color: #6366f1;
36
+ }
37
+ .tasks-title svg {
38
+ width: 20px;
39
+ height: 20px;
40
+ }
41
+ .tasks-refresh {
42
+ font-size: 0.75rem;
43
+ color: var(--text-muted);
44
+ }
45
+ .task-card {
46
+ background: rgba(0, 0, 0, 0.3);
47
+ border: 1px solid var(--border-color);
48
+ border-radius: 8px;
49
+ padding: 16px;
50
+ margin-bottom: 12px;
51
+ }
52
+ .task-card:last-child {
53
+ margin-bottom: 0;
54
+ }
55
+ .task-card-header {
56
+ display: flex;
57
+ align-items: center;
58
+ gap: 12px;
59
+ margin-bottom: 8px;
60
+ }
61
+ .task-status-indicator {
62
+ width: 12px;
63
+ height: 12px;
64
+ border-radius: 50%;
65
+ flex-shrink: 0;
66
+ }
67
+ .task-status-indicator.running {
68
+ background: #3b82f6;
69
+ animation: pulse-task 2s infinite;
70
+ }
71
+ .task-status-indicator.completed {
72
+ background: #10b981;
73
+ }
74
+ .task-status-indicator.failed {
75
+ background: #ef4444;
76
+ }
77
+ .task-status-indicator.pending {
78
+ background: #f59e0b;
79
+ }
80
+ @keyframes pulse-task {
81
+ 0%, 100% { opacity: 1; box-shadow: 0 0 0 0 rgba(59, 130, 246, 0.5); }
82
+ 50% { opacity: 0.8; box-shadow: 0 0 0 8px rgba(59, 130, 246, 0); }
83
+ }
84
+ .task-title {
85
+ font-weight: 600;
86
+ font-size: 0.95rem;
87
+ color: var(--text-primary);
88
+ }
89
+ .task-description {
90
+ font-size: 0.85rem;
91
+ color: var(--text-secondary);
92
+ margin-bottom: 12px;
93
+ }
94
+ .task-progress-bar {
95
+ height: 8px;
96
+ background: rgba(255, 255, 255, 0.1);
97
+ border-radius: 4px;
98
+ overflow: hidden;
99
+ margin-bottom: 8px;
100
+ }
101
+ .task-progress-fill {
102
+ height: 100%;
103
+ background: linear-gradient(90deg, #3b82f6, #06b6d4);
104
+ border-radius: 4px;
105
+ transition: width 0.5s ease;
106
+ }
107
+ .task-progress-fill.completed {
108
+ background: linear-gradient(90deg, #10b981, #059669);
109
+ }
110
+ .task-meta {
111
+ display: flex;
112
+ justify-content: space-between;
113
+ font-size: 0.75rem;
114
+ color: var(--text-muted);
115
+ }
116
+ .task-link {
117
+ display: inline-flex;
118
+ align-items: center;
119
+ gap: 4px;
120
+ padding: 4px 8px;
121
+ background: rgba(99, 102, 241, 0.2);
122
+ border: 1px solid rgba(99, 102, 241, 0.4);
123
+ border-radius: 4px;
124
+ color: #818cf8;
125
+ text-decoration: none;
126
+ font-size: 0.75rem;
127
+ margin-top: 8px;
128
+ transition: all 0.2s;
129
+ }
130
+ .task-link:hover {
131
+ background: rgba(99, 102, 241, 0.3);
132
+ transform: translateY(-1px);
133
+ }
134
+ .task-credentials {
135
+ display: flex;
136
+ align-items: center;
137
+ gap: 8px;
138
+ padding: 8px 12px;
139
+ background: rgba(245, 158, 11, 0.15);
140
+ border: 1px solid rgba(245, 158, 11, 0.3);
141
+ border-radius: 6px;
142
+ margin: 8px 0;
143
+ font-size: 0.85rem;
144
+ }
145
+ .task-credentials .cred-label {
146
+ color: #fbbf24;
147
+ }
148
+ .task-credentials code {
149
+ background: rgba(0, 0, 0, 0.3);
150
+ padding: 2px 6px;
151
+ border-radius: 4px;
152
+ font-family: 'SF Mono', Monaco, monospace;
153
+ color: #fcd34d;
154
+ }
155
+ .no-tasks {
156
+ text-align: center;
157
+ padding: 20px;
158
+ color: var(--text-muted);
159
+ font-size: 0.9rem;
160
+ }
161
+ .task-phase-badge {
162
+ margin-left: auto;
163
+ padding: 2px 8px;
164
+ background: rgba(99, 102, 241, 0.2);
165
+ border-radius: 12px;
166
+ font-size: 0.75rem;
167
+ color: #a5b4fc;
168
+ }
169
+ .task-logs-details {
170
+ margin-top: 12px;
171
+ border-top: 1px solid var(--border-color);
172
+ padding-top: 8px;
173
+ }
174
+ .task-logs-summary {
175
+ cursor: pointer;
176
+ font-size: 0.75rem;
177
+ color: var(--text-muted);
178
+ user-select: none;
179
+ }
180
+ .task-logs-summary:hover {
181
+ color: var(--text-secondary);
182
+ }
183
+ .task-logs-content {
184
+ margin-top: 8px;
185
+ padding: 8px;
186
+ background: rgba(0, 0, 0, 0.4);
187
+ border-radius: 4px;
188
+ font-size: 0.7rem;
189
+ line-height: 1.4;
190
+ max-height: 150px;
191
+ overflow-y: auto;
192
+ white-space: pre-wrap;
193
+ word-break: break-all;
194
+ color: #10b981;
195
+ font-family: 'SF Mono', Monaco, 'Cascadia Code', monospace;
196
+ }
197
+ /* VM Details section - using native <details> element to preserve state across re-renders */
198
+ .vm-details-section {
199
+ margin-top: 12px;
200
+ border-top: 1px solid var(--border-color);
201
+ padding-top: 12px;
202
+ }
203
+ .vm-details-summary {
204
+ cursor: pointer;
205
+ font-size: 0.75rem;
206
+ color: var(--text-muted);
207
+ user-select: none;
208
+ display: flex;
209
+ align-items: center;
210
+ gap: 6px;
211
+ padding: 6px 0;
212
+ list-style: none;
213
+ }
214
+ .vm-details-summary::-webkit-details-marker {
215
+ display: none;
216
+ }
217
+ .vm-details-summary:hover {
218
+ color: var(--text-secondary);
219
+ }
220
+ .vm-details-icon {
221
+ transition: transform 0.2s;
222
+ }
223
+ details.vm-details[open] .vm-details-icon {
224
+ transform: rotate(90deg);
225
+ }
226
+ .vm-details-content {
227
+ margin-top: 8px;
228
+ padding: 12px;
229
+ background: rgba(0, 0, 0, 0.3);
230
+ border-radius: 6px;
231
+ font-size: 0.75rem;
232
+ }
233
+ .vm-detail-row {
234
+ display: flex;
235
+ justify-content: space-between;
236
+ align-items: center;
237
+ padding: 6px 0;
238
+ border-bottom: 1px solid rgba(255, 255, 255, 0.05);
239
+ }
240
+ .vm-detail-row:last-child {
241
+ border-bottom: none;
242
+ }
243
+ .vm-detail-label {
244
+ color: var(--text-muted);
245
+ font-weight: 500;
246
+ }
247
+ .vm-detail-value {
248
+ color: var(--text-primary);
249
+ font-family: 'SF Mono', Monaco, monospace;
250
+ }
251
+ .vm-detail-value.success {
252
+ color: #10b981;
253
+ }
254
+ .vm-detail-value.warning {
255
+ color: #f59e0b;
256
+ }
257
+ .vm-detail-value.error {
258
+ color: #ef4444;
259
+ }
260
+ .vm-dependencies-list {
261
+ margin-top: 8px;
262
+ padding: 8px;
263
+ background: rgba(0, 0, 0, 0.2);
264
+ border-radius: 4px;
265
+ }
266
+ .vm-dependency-item {
267
+ display: flex;
268
+ align-items: center;
269
+ gap: 8px;
270
+ padding: 4px 0;
271
+ font-size: 0.7rem;
272
+ }
273
+ .vm-dependency-icon {
274
+ font-size: 1rem;
275
+ }
276
+ .vm-progress-bar {
277
+ width: 100%;
278
+ height: 6px;
279
+ background: rgba(255, 255, 255, 0.1);
280
+ border-radius: 3px;
281
+ overflow: hidden;
282
+ margin: 8px 0;
283
+ }
284
+ .vm-progress-fill {
285
+ height: 100%;
286
+ background: linear-gradient(90deg, #10b981, #059669);
287
+ border-radius: 3px;
288
+ transition: width 0.5s ease;
289
+ }
290
+ '''
291
+
292
+
293
+ def _get_background_tasks_panel_html() -> str:
294
+ """Return HTML for background tasks panel with JS polling and improved styling."""
295
+ return '''
296
+ <div class="tasks-panel" id="tasks-panel">
297
+ <div class="tasks-header">
298
+ <div class="tasks-title">
299
+ <svg viewBox="0 0 24 24" fill="currentColor">
300
+ <path d="M19 3H5c-1.1 0-2 .9-2 2v14c0 1.1.9 2 2 2h14c1.1 0 2-.9 2-2V5c0-1.1-.9-2-2-2zm-5 14H7v-2h7v2zm3-4H7v-2h10v2zm0-4H7V7h10v2z"/>
301
+ </svg>
302
+ Background Tasks
303
+ </div>
304
+ <div style="display: flex; align-items: center; gap: 12px;">
305
+ <span class="tasks-refresh" id="tasks-refresh-time">Checking...</span>
306
+ <button class="refresh-btn" onclick="refreshBackgroundTasks()" title="Refresh tasks" id="tasks-refresh-btn" style="background: rgba(99, 102, 241, 0.2); border-color: rgba(99, 102, 241, 0.4);">
307
+ <span class="refresh-icon">&#8635;</span>
308
+ <span class="spinner" style="border-top-color: #6366f1;"></span>
309
+ Refresh
310
+ </button>
311
+ </div>
312
+ </div>
313
+
314
+ <!-- API Error Banner -->
315
+ <div class="api-error-banner" id="tasks-api-error" style="display: none;">
316
+ <span class="error-icon">!</span>
317
+ <span class="error-message" id="tasks-error-msg">Failed to fetch tasks</span>
318
+ <button class="retry-btn" onclick="refreshBackgroundTasks()">Retry</button>
319
+ </div>
320
+
321
+ <!-- Loading state -->
322
+ <div id="tasks-loading" style="display: none; text-align: center; padding: 30px;">
323
+ <div style="display: inline-block; width: 24px; height: 24px; border: 3px solid rgba(99,102,241,0.3); border-top-color: #6366f1; border-radius: 50%; animation: spin 1s linear infinite;"></div>
324
+ <div style="margin-top: 12px; color: var(--text-muted); font-size: 0.85rem;">Loading tasks...</div>
325
+ </div>
326
+
327
+ <div id="tasks-list">
328
+ <div class="no-tasks">
329
+ <div style="font-size: 2rem; margin-bottom: 12px; opacity: 0.5;">&#128203;</div>
330
+ Checking for active tasks...
331
+ </div>
332
+ </div>
333
+ </div>
334
+
335
+ <script>
336
+ let isTasksRefreshing = false;
337
+ let tasksErrorCount = 0;
338
+
339
+ function setTasksLoadingState(loading) {
340
+ const loadingEl = document.getElementById('tasks-loading');
341
+ const listEl = document.getElementById('tasks-list');
342
+ const btn = document.getElementById('tasks-refresh-btn');
343
+
344
+ if (loading) {
345
+ loadingEl.style.display = 'block';
346
+ listEl.style.display = 'none';
347
+ if (btn) btn.classList.add('loading');
348
+ } else {
349
+ loadingEl.style.display = 'none';
350
+ listEl.style.display = 'block';
351
+ if (btn) btn.classList.remove('loading');
352
+ }
353
+ }
354
+
355
+ function showTasksError(msg) {
356
+ const errorEl = document.getElementById('tasks-api-error');
357
+ const errorMsgEl = document.getElementById('tasks-error-msg');
358
+ if (errorEl && errorMsgEl) {
359
+ errorMsgEl.textContent = msg;
360
+ errorEl.style.display = 'flex';
361
+ }
362
+ }
363
+
364
+ function hideTasksError() {
365
+ const errorEl = document.getElementById('tasks-api-error');
366
+ if (errorEl) errorEl.style.display = 'none';
367
+ }
368
+
369
+ async function refreshBackgroundTasks() {
370
+ if (isTasksRefreshing) return;
371
+ isTasksRefreshing = true;
372
+ setTasksLoadingState(true);
373
+ hideTasksError();
374
+
375
+ try {
376
+ const response = await fetch('/api/tasks?' + Date.now());
377
+ if (!response.ok) throw new Error('HTTP ' + response.status);
378
+ const tasks = await response.json();
379
+ if (tasks.error) throw new Error(tasks.error);
380
+
381
+ renderBackgroundTasks(tasks);
382
+ tasksErrorCount = 0;
383
+ document.getElementById('tasks-refresh-time').textContent =
384
+ 'Updated ' + new Date().toLocaleTimeString();
385
+ } catch (e) {
386
+ console.error('Tasks refresh failed:', e);
387
+ tasksErrorCount++;
388
+ showTasksError(e.message || 'Connection failed');
389
+ } finally {
390
+ isTasksRefreshing = false;
391
+ setTasksLoadingState(false);
392
+ }
393
+ }
394
+
395
+ async function fetchBackgroundTasks() {
396
+ if (isTasksRefreshing) return;
397
+ if (tasksErrorCount >= 3) {
398
+ document.getElementById('tasks-refresh-time').textContent = 'Polling paused';
399
+ return;
400
+ }
401
+
402
+ try {
403
+ const response = await fetch('/api/tasks?' + Date.now());
404
+ if (response.ok) {
405
+ const tasks = await response.json();
406
+ if (!tasks.error) {
407
+ renderBackgroundTasks(tasks);
408
+ hideTasksError();
409
+ tasksErrorCount = 0;
410
+ document.getElementById('tasks-refresh-time').textContent =
411
+ 'Updated ' + new Date().toLocaleTimeString();
412
+ }
413
+ }
414
+ } catch (e) {
415
+ console.log('Tasks API unavailable:', e);
416
+ tasksErrorCount++;
417
+ }
418
+ }
419
+
420
+ function renderVMDetails(metadata) {
421
+ if (!metadata) return '';
422
+
423
+ const statusClass = (value, type = 'default') => {
424
+ if (type === 'probe') {
425
+ return value && value !== 'Not responding' && value !== 'Connection failed' ? 'success' : 'error';
426
+ } else if (type === 'qmp') {
427
+ return value ? 'success' : 'warning';
428
+ }
429
+ return '';
430
+ };
431
+
432
+ const renderDependencies = (deps) => {
433
+ if (!deps || deps.length === 0) return '';
434
+
435
+ const statusIcons = {
436
+ 'complete': '✓',
437
+ 'installing': '⏳',
438
+ 'pending': '○'
439
+ };
440
+
441
+ return `
442
+ <div class="vm-detail-row">
443
+ <div class="vm-detail-label">Dependencies</div>
444
+ </div>
445
+ <div class="vm-dependencies-list">
446
+ ${deps.map(dep => `
447
+ <div class="vm-dependency-item">
448
+ <span class="vm-dependency-icon">${dep.icon || '📦'}</span>
449
+ <span>${statusIcons[dep.status] || '○'} ${dep.name}</span>
450
+ </div>
451
+ `).join('')}
452
+ </div>
453
+ `;
454
+ };
455
+
456
+ // Use native <details> element to preserve expanded state across SSE re-renders
457
+ return `
458
+ <div class="vm-details-section">
459
+ <details class="vm-details">
460
+ <summary class="vm-details-summary">
461
+ <span class="vm-details-icon">&#9654;</span>
462
+ <span>VM Details</span>
463
+ </summary>
464
+ <div class="vm-details-content">
465
+ ${metadata.setup_script_phase ? `
466
+ <div class="vm-detail-row">
467
+ <div class="vm-detail-label">Setup Phase</div>
468
+ <div class="vm-detail-value">${metadata.setup_script_phase}</div>
469
+ </div>
470
+ ` : ''}
471
+ ${metadata.disk_usage_gb ? `
472
+ <div class="vm-detail-row">
473
+ <div class="vm-detail-label">Disk Usage</div>
474
+ <div class="vm-detail-value">${metadata.disk_usage_gb}</div>
475
+ </div>
476
+ ` : ''}
477
+ ${metadata.memory_usage_mb ? `
478
+ <div class="vm-detail-row">
479
+ <div class="vm-detail-label">Memory Usage</div>
480
+ <div class="vm-detail-value">${metadata.memory_usage_mb}</div>
481
+ </div>
482
+ ` : ''}
483
+ ${metadata.probe_response !== undefined ? `
484
+ <div class="vm-detail-row">
485
+ <div class="vm-detail-label">WAA Server (/probe)</div>
486
+ <div class="vm-detail-value ${statusClass(metadata.probe_response, 'probe')}">
487
+ ${metadata.probe_response}
488
+ </div>
489
+ </div>
490
+ ` : ''}
491
+ ${metadata.qmp_connected !== undefined ? `
492
+ <div class="vm-detail-row">
493
+ <div class="vm-detail-label">QMP (port 7200)</div>
494
+ <div class="vm-detail-value ${statusClass(metadata.qmp_connected, 'qmp')}">
495
+ ${metadata.qmp_connected ? 'Connected ✓' : 'Not connected'}
496
+ </div>
497
+ </div>
498
+ ` : ''}
499
+ ${renderDependencies(metadata.dependencies)}
500
+ </div>
501
+ </details>
502
+ </div>
503
+ `;
504
+ }
505
+
506
+ // Track expanded states for VM Details and logs panels across page refreshes
507
+ // Uses localStorage to persist states across browser reloads
508
+ // Key: task_id, Value: { vmDetailsExpanded: bool, logsExpanded: bool }
509
+ const STORAGE_KEY = 'openadapt_task_expanded_states';
510
+
511
+ function getTaskExpandedStates() {
512
+ try {
513
+ const stored = localStorage.getItem(STORAGE_KEY);
514
+ return stored ? JSON.parse(stored) : {};
515
+ } catch (e) {
516
+ console.warn('Failed to load expanded states from localStorage:', e);
517
+ return {};
518
+ }
519
+ }
520
+
521
+ function saveTaskExpandedStates() {
522
+ const taskExpandedStates = getTaskExpandedStates();
523
+
524
+ // First, clear all expanded states (we'll re-add the currently expanded ones)
525
+ // This handles the case where a user collapses a panel
526
+ for (const key of Object.keys(taskExpandedStates)) {
527
+ taskExpandedStates[key].vmDetailsExpanded = false;
528
+ taskExpandedStates[key].logsExpanded = false;
529
+ }
530
+
531
+ // Save VM Details expanded states (using native <details> element)
532
+ document.querySelectorAll('details.vm-details[open]').forEach(details => {
533
+ const card = details.closest('.task-card');
534
+ if (card) {
535
+ const taskTitle = card.querySelector('.task-title')?.textContent || '';
536
+ if (taskTitle) {
537
+ if (!taskExpandedStates[taskTitle]) taskExpandedStates[taskTitle] = {};
538
+ taskExpandedStates[taskTitle].vmDetailsExpanded = true;
539
+ }
540
+ }
541
+ });
542
+
543
+ // Save logs details expanded states
544
+ document.querySelectorAll('.task-logs-details[open]').forEach(details => {
545
+ const card = details.closest('.task-card');
546
+ if (card) {
547
+ const taskTitle = card.querySelector('.task-title')?.textContent || '';
548
+ if (taskTitle) {
549
+ if (!taskExpandedStates[taskTitle]) taskExpandedStates[taskTitle] = {};
550
+ taskExpandedStates[taskTitle].logsExpanded = true;
551
+ }
552
+ }
553
+ });
554
+
555
+ // Persist to localStorage
556
+ try {
557
+ localStorage.setItem(STORAGE_KEY, JSON.stringify(taskExpandedStates));
558
+ } catch (e) {
559
+ console.warn('Failed to save expanded states to localStorage:', e);
560
+ }
561
+ }
562
+
563
+ function restoreTaskExpandedStates() {
564
+ const taskExpandedStates = getTaskExpandedStates();
565
+
566
+ // Restore VM Details expanded states (using native <details> element)
567
+ document.querySelectorAll('.task-card').forEach(card => {
568
+ const taskTitle = card.querySelector('.task-title')?.textContent || '';
569
+ const state = taskExpandedStates[taskTitle];
570
+ if (state) {
571
+ if (state.vmDetailsExpanded) {
572
+ const details = card.querySelector('details.vm-details');
573
+ if (details) details.open = true;
574
+ }
575
+ if (state.logsExpanded) {
576
+ const details = card.querySelector('.task-logs-details');
577
+ if (details) details.open = true;
578
+ }
579
+ }
580
+ });
581
+ }
582
+
583
+ function renderBackgroundTasks(tasks) {
584
+ const container = document.getElementById('tasks-list');
585
+
586
+ // Debug: Log incoming tasks data
587
+ console.log('[SSE Debug] renderBackgroundTasks called with:', JSON.stringify(tasks, null, 2));
588
+
589
+ // Save expanded states before replacing DOM
590
+ saveTaskExpandedStates();
591
+
592
+ if (!tasks || tasks.length === 0) {
593
+ container.innerHTML = '<div class="no-tasks">No active background tasks</div>';
594
+ return;
595
+ }
596
+
597
+ const phaseLabels = {
598
+ 'downloading': '⬇️ Downloading',
599
+ 'extracting': '📦 Extracting',
600
+ 'configuring': '⚙️ Configuring',
601
+ 'building': '🔨 Building',
602
+ 'booting': '🚀 Booting',
603
+ 'oobe': '🪟 Windows Setup',
604
+ 'ready': '✅ Ready',
605
+ 'unknown': '⏳ Starting'
606
+ };
607
+
608
+ const html = tasks.map(task => {
609
+ const statusClass = task.status || 'pending';
610
+ const progressPercent = task.progress_percent || 0;
611
+ const progressClass = task.status === 'completed' ? 'completed' : '';
612
+
613
+ // Determine phase: use task.phase, fall back to metadata.phase,
614
+ // then if status is 'completed' use 'ready', otherwise 'unknown'
615
+ let phase = task.phase || task.metadata?.phase;
616
+ if (!phase) {
617
+ // If no phase specified, infer from status to prevent "Starting" + "completed" conflict
618
+ phase = (task.status === 'completed') ? 'ready' : 'unknown';
619
+ }
620
+ const phaseLabel = phaseLabels[phase] || phase;
621
+
622
+ // Debug: Log per-task phase/status mapping
623
+ console.log(`[SSE Debug] Task ${task.task_id}: status=${task.status}, phase=${task.phase}, resolvedPhase=${phase}, phaseLabel=${phaseLabel}`);
624
+
625
+ // Build link if VNC URL available
626
+ let linkHtml = '';
627
+ if (task.metadata && task.metadata.vnc_url) {
628
+ linkHtml = `<a href="${task.metadata.vnc_url}" target="_blank" class="task-link">
629
+ Open VNC →
630
+ </a>`;
631
+ }
632
+
633
+ // Show Windows credentials if available
634
+ let credentialsHtml = '';
635
+ if (task.metadata && task.metadata.windows_username) {
636
+ credentialsHtml = `
637
+ <div class="task-credentials">
638
+ <span class="cred-label">🔑 Login:</span>
639
+ <code>${task.metadata.windows_username}</code> /
640
+ <code>${task.metadata.windows_password || '(empty)'}</code>
641
+ </div>
642
+ `;
643
+ }
644
+
645
+ // Add expandable logs if available
646
+ let logsHtml = '';
647
+ if (task.metadata && task.metadata.recent_logs) {
648
+ const taskId = task.task_id.replace(/[^a-z0-9]/gi, '_');
649
+ logsHtml = `
650
+ <details class="task-logs-details">
651
+ <summary class="task-logs-summary">Show recent logs</summary>
652
+ <pre class="task-logs-content">${task.metadata.recent_logs.replace(/</g, '&lt;').replace(/>/g, '&gt;')}</pre>
653
+ </details>
654
+ `;
655
+ }
656
+
657
+ // Add VM Details expandable section for Windows containers
658
+ let vmDetailsHtml = '';
659
+ if (task.task_type === 'docker_container' && task.metadata) {
660
+ vmDetailsHtml = renderVMDetails(task.metadata);
661
+ }
662
+
663
+ // Progress label clarifies what % means
664
+ // Use a single unified status display to avoid showing conflicting states
665
+ let progressLabel;
666
+ if (task.status === 'completed' || phase === 'ready') {
667
+ progressLabel = 'Complete';
668
+ } else {
669
+ progressLabel = `Setup phase progress: ${progressPercent.toFixed(0)}%`;
670
+ }
671
+
672
+ return `
673
+ <div class="task-card">
674
+ <div class="task-card-header">
675
+ <div class="task-status-indicator ${statusClass}"></div>
676
+ <span class="task-title">${task.title || 'Unknown Task'}</span>
677
+ <span class="task-phase-badge">${phaseLabel}</span>
678
+ </div>
679
+ <div class="task-description">${task.description || ''}</div>
680
+ <div class="task-progress-bar">
681
+ <div class="task-progress-fill ${progressClass}" style="width: ${progressPercent}%"></div>
682
+ </div>
683
+ <div class="task-meta">
684
+ <span>${progressLabel}</span>
685
+ </div>
686
+ ${credentialsHtml}
687
+ ${linkHtml}
688
+ ${vmDetailsHtml}
689
+ ${logsHtml}
690
+ </div>
691
+ `;
692
+ }).join('');
693
+
694
+ container.innerHTML = html;
695
+
696
+ // Restore expanded states after DOM update
697
+ restoreTaskExpandedStates();
698
+ }
699
+
700
+ // Initial fetch and poll every 10 seconds
701
+ fetchBackgroundTasks();
702
+ setInterval(fetchBackgroundTasks, 10000);
703
+ </script>
704
+ '''
705
+
706
+
707
+ def _get_live_evaluation_panel_css() -> str:
708
+ """Return CSS for live evaluation progress panel."""
709
+ return '''
710
+ .live-eval-panel {
711
+ background: linear-gradient(135deg, rgba(139, 92, 246, 0.15) 0%, rgba(139, 92, 246, 0.05) 100%);
712
+ border: 1px solid rgba(139, 92, 246, 0.3);
713
+ border-radius: 12px;
714
+ padding: 20px 24px;
715
+ margin-bottom: 24px;
716
+ }
717
+ .live-eval-header {
718
+ display: flex;
719
+ align-items: center;
720
+ justify-content: space-between;
721
+ margin-bottom: 16px;
722
+ }
723
+ .live-eval-title {
724
+ display: flex;
725
+ align-items: center;
726
+ gap: 10px;
727
+ font-size: 1rem;
728
+ font-weight: 600;
729
+ color: #8b5cf6;
730
+ }
731
+ .live-eval-title svg {
732
+ width: 20px;
733
+ height: 20px;
734
+ }
735
+ .live-eval-refresh {
736
+ font-size: 0.75rem;
737
+ color: var(--text-muted);
738
+ }
739
+ .live-eval-status {
740
+ padding: 12px 16px;
741
+ background: rgba(0, 0, 0, 0.3);
742
+ border-radius: 8px;
743
+ margin-bottom: 12px;
744
+ }
745
+ .live-eval-progress {
746
+ font-size: 0.95rem;
747
+ color: var(--text-primary);
748
+ font-weight: 600;
749
+ margin-bottom: 8px;
750
+ }
751
+ .live-eval-task-name {
752
+ font-size: 0.85rem;
753
+ color: var(--text-secondary);
754
+ margin-bottom: 4px;
755
+ }
756
+ .live-eval-step {
757
+ padding: 12px;
758
+ background: var(--bg-tertiary);
759
+ border: 1px solid var(--border-color);
760
+ border-radius: 6px;
761
+ margin-bottom: 8px;
762
+ }
763
+ .live-eval-step-header {
764
+ display: flex;
765
+ align-items: center;
766
+ gap: 12px;
767
+ margin-bottom: 8px;
768
+ }
769
+ .live-eval-step-number {
770
+ font-weight: 600;
771
+ color: var(--accent);
772
+ min-width: 60px;
773
+ }
774
+ .live-eval-action {
775
+ flex: 1;
776
+ font-family: "SF Mono", Monaco, monospace;
777
+ font-size: 0.85rem;
778
+ color: var(--text-primary);
779
+ }
780
+ .live-eval-screenshot {
781
+ max-width: 300px;
782
+ border-radius: 4px;
783
+ border: 1px solid var(--border-color);
784
+ margin: 8px 0;
785
+ }
786
+ .live-eval-reasoning {
787
+ font-size: 0.8rem;
788
+ color: var(--text-secondary);
789
+ font-style: italic;
790
+ margin-top: 8px;
791
+ padding: 8px;
792
+ background: rgba(0, 0, 0, 0.2);
793
+ border-radius: 4px;
794
+ }
795
+ .live-eval-result {
796
+ display: inline-flex;
797
+ align-items: center;
798
+ gap: 6px;
799
+ padding: 4px 10px;
800
+ border-radius: 4px;
801
+ font-size: 0.75rem;
802
+ font-weight: 600;
803
+ }
804
+ .live-eval-result.success {
805
+ background: rgba(16, 185, 129, 0.2);
806
+ color: #10b981;
807
+ }
808
+ .live-eval-result.failure {
809
+ background: rgba(239, 68, 68, 0.2);
810
+ color: #ef4444;
811
+ }
812
+ .live-eval-idle {
813
+ text-align: center;
814
+ padding: 40px 20px;
815
+ color: var(--text-muted);
816
+ font-size: 0.9rem;
817
+ }
818
+ .live-eval-steps-container {
819
+ max-height: 400px;
820
+ overflow-y: auto;
821
+ }
822
+ /* SSE Connection Status Indicator */
823
+ .sse-connection-status {
824
+ display: inline-flex;
825
+ align-items: center;
826
+ gap: 6px;
827
+ padding: 4px 10px;
828
+ border-radius: 12px;
829
+ font-size: 0.7rem;
830
+ font-weight: 600;
831
+ margin-left: 12px;
832
+ }
833
+ .sse-connection-status.connected {
834
+ background: rgba(16, 185, 129, 0.2);
835
+ color: #10b981;
836
+ }
837
+ .sse-connection-status.connecting {
838
+ background: rgba(245, 158, 11, 0.2);
839
+ color: #f59e0b;
840
+ }
841
+ .sse-connection-status.disconnected {
842
+ background: rgba(239, 68, 68, 0.2);
843
+ color: #ef4444;
844
+ }
845
+ .sse-connection-status.fallback {
846
+ background: rgba(156, 163, 175, 0.2);
847
+ color: #9ca3af;
848
+ }
849
+ .sse-connection-dot {
850
+ width: 6px;
851
+ height: 6px;
852
+ border-radius: 50%;
853
+ background: currentColor;
854
+ }
855
+ .sse-connection-status.connecting .sse-connection-dot {
856
+ animation: pulse 1.5s ease-in-out infinite;
857
+ }
858
+ @keyframes pulse {
859
+ 0%, 100% { opacity: 1; }
860
+ 50% { opacity: 0.3; }
861
+ }
862
+ '''
863
+
864
+
865
+ def _get_live_evaluation_panel_html() -> str:
866
+ """Return HTML for live evaluation panel with SSE and polling fallback."""
867
+ return '''
868
+ <div class="live-eval-panel" id="live-eval-panel">
869
+ <div class="live-eval-header">
870
+ <div class="live-eval-title">
871
+ <svg viewBox="0 0 24 24" fill="currentColor">
872
+ <path d="M9 5H7a2 2 0 00-2 2v12a2 2 0 002 2h10a2 2 0 002-2V7a2 2 0 00-2-2h-2M9 5a2 2 0 002 2h2a2 2 0 002-2M9 5a2 2 0 012-2h2a2 2 0 012 2m-6 9l2 2 4-4"/>
873
+ </svg>
874
+ Live Evaluation
875
+ <span class="sse-connection-status connecting" id="sse-status">
876
+ <span class="sse-connection-dot"></span>
877
+ <span id="sse-status-text">Connecting</span>
878
+ </span>
879
+ </div>
880
+ <div style="display: flex; align-items: center; gap: 12px;">
881
+ <span class="live-eval-refresh" id="live-eval-refresh-time">Checking...</span>
882
+ <button class="refresh-btn" onclick="if(window.sseManager) { window.sseManager.disconnect(); window.sseManager.connect(); }" title="Reconnect to live updates" style="background: rgba(245, 158, 11, 0.2); border-color: rgba(245, 158, 11, 0.4);">
883
+ <span class="refresh-icon">&#8635;</span>
884
+ <span class="spinner" style="border-top-color: #f59e0b;"></span>
885
+ Reconnect
886
+ </button>
887
+ </div>
888
+ </div>
889
+ <div id="live-eval-content">
890
+ <div class="live-eval-idle">
891
+ <div style="font-size: 2rem; margin-bottom: 12px; opacity: 0.5;">&#9889;</div>
892
+ No evaluation running
893
+ <div style="font-size: 0.8rem; color: var(--text-muted); margin-top: 8px;">
894
+ Start an evaluation to see real-time progress
895
+ </div>
896
+ </div>
897
+ </div>
898
+ </div>
899
+
900
+ <script>
901
+ // SSE Manager for real-time benchmark updates
902
/**
 * Feeds the live-evaluation panel from the server.
 *
 * Primary transport is Server-Sent Events on /api/benchmark-sse. When
 * EventSource is unavailable, or the connection keeps failing, the manager
 * falls back to polling via fetchLiveEvaluationPolling() every 2 seconds.
 *
 * All timers (polling, stale check, pending reconnect) are tracked on the
 * instance so that connect()/disconnect() never leave one running.
 */
class BenchmarkSSEManager {
    constructor() {
        this.eventSource = null;
        this.pollingInterval = null;
        this.staleCheckInterval = null; // periodic "is the stream still alive?" check
        this.reconnectTimer = null;     // pending back-off reconnect, if any
        this.usePolling = false;
        this.reconnectAttempts = 0;
        this.maxReconnectAttempts = 5;
        this.reconnectDelay = 2000;     // ms, multiplied by the attempt number
        this.lastHeartbeat = Date.now();
        this.state = {
            status: 'idle',
            tasks_completed: 0,
            total_tasks: 0,
            current_task: null,
            results: []
        };
    }

    /** Stop the polling and stale-check intervals. */
    clearAllIntervals() {
        if (this.pollingInterval) {
            clearInterval(this.pollingInterval);
            this.pollingInterval = null;
        }
        if (this.staleCheckInterval) {
            clearInterval(this.staleCheckInterval);
            this.staleCheckInterval = null;
        }
    }

    /** Cancel a scheduled back-off reconnect so it cannot fire later. */
    clearReconnectTimer() {
        if (this.reconnectTimer) {
            clearTimeout(this.reconnectTimer);
            this.reconnectTimer = null;
        }
    }

    /** Close the current EventSource, if there is one. */
    closeEventSource() {
        if (this.eventSource) {
            this.eventSource.close();
            this.eventSource = null;
        }
    }

    /**
     * Parse the JSON payload of an SSE event.
     * Returns null (and logs) when the event has no string payload or the
     * payload is not valid JSON, so one bad message cannot break the stream.
     */
    parseEventData(e) {
        if (!e || typeof e.data !== 'string') return null;
        try {
            return JSON.parse(e.data);
        } catch (err) {
            console.error('SSE: ignoring malformed event payload:', err);
            return null;
        }
    }

    /** Record that the server was heard from just now. */
    markAlive() {
        this.lastHeartbeat = Date.now();
    }

    /** Open the SSE stream (or start polling when SSE is unsupported). */
    connect() {
        // Check if EventSource is supported
        if (!window.EventSource) {
            console.log('SSE not supported, falling back to polling');
            this.startPolling();
            return;
        }

        // Start from a clean slate: no old timers, no old connection.
        this.clearAllIntervals();
        this.clearReconnectTimer();
        this.closeEventSource();
        this.usePolling = false;
        // Give the new connection a full stale window; otherwise a reconnect
        // triggered by staleness would immediately look stale again.
        this.markAlive();

        this.updateConnectionStatus('connecting');

        try {
            const source = new EventSource('/api/benchmark-sse?interval=2');
            this.eventSource = source;

            // Register a handler for an event whose payload is JSON.
            const onJson = (name, handler) => {
                source.addEventListener(name, (e) => {
                    this.markAlive(); // any traffic proves the stream is alive
                    const data = this.parseEventData(e);
                    if (data === null) return;
                    handler.call(this, data);
                    this.updateTimestamp();
                });
            };

            source.addEventListener('connected', (e) => {
                console.log('SSE connected:', e.data);
                this.markAlive();
                this.reconnectAttempts = 0;
                this.updateConnectionStatus('connected');
            });

            onJson('status', this.handleStatusEvent);
            onJson('progress', this.handleProgressEvent);
            onJson('task_complete', this.handleTaskCompleteEvent);

            source.addEventListener('heartbeat', () => {
                // Heartbeats keep connection alive, no UI update needed
                this.markAlive();
            });

            // 'error' fires both for server-sent "event: error" messages
            // (which carry data) and for native connection errors (which do
            // not). Only the former has a payload worth logging here; the
            // latter is handled by onerror below.
            source.addEventListener('error', (e) => {
                if (e && typeof e.data === 'string') {
                    const data = this.parseEventData(e);
                    console.error('SSE error event:', data === null ? e.data : data);
                }
            });

            source.onerror = (e) => {
                // Ignore late errors from a source that was already replaced.
                if (this.eventSource !== source) return;
                console.error('SSE connection error:', e);
                this.handleConnectionError();
            };

            // Check for stale connection (nothing received in 60 seconds)
            this.staleCheckInterval = setInterval(() => {
                if (this.eventSource && (Date.now() - this.lastHeartbeat > 60000)) {
                    console.log('SSE connection stale, reconnecting...');
                    this.reconnect();
                }
            }, 30000);

        } catch (e) {
            console.error('SSE connection failed:', e);
            this.startPolling();
        }
    }

    /** Store the latest VM status and flip to 'ready' when WAA reports ready. */
    handleStatusEvent(data) {
        console.log('[SSE Debug] handleStatusEvent:', JSON.stringify(data));
        // Replace (not merge) the previous vmStatus so stale fields do not linger
        this.state.vmStatus = data;
        if (data.waa_ready) {
            this.state.status = 'ready';
        }
        console.log('[SSE Debug] Updated state after status event:', JSON.stringify(this.state));
        this.render();
    }

    /** Update progress counters and the current task from a progress event. */
    handleProgressEvent(data) {
        console.log('[SSE Debug] handleProgressEvent:', JSON.stringify(data));
        this.state.status = 'running';
        this.state.tasks_completed = data.tasks_completed;
        this.state.total_tasks = data.total_tasks;
        this.state.current_task = {
            task_id: data.current_task,
            instruction: `Task ${data.current_task}`,
            domain: 'waa'
        };
        console.log('[SSE Debug] Updated state after progress event:', JSON.stringify(this.state));
        this.render();
    }

    /** Append a finished task's outcome to the results list. */
    handleTaskCompleteEvent(data) {
        this.state.results.push({
            task_id: data.task_id,
            success: data.success,
            score: data.score
        });
        this.render();
    }

    /**
     * React to a connection error: retry with linear back-off up to
     * maxReconnectAttempts, then fall back to polling.
     */
    handleConnectionError() {
        this.updateConnectionStatus('disconnected');

        // Stop the browser's built-in retry; this class runs its own back-off,
        // and letting both run produces duplicate errors and stacked timers.
        this.closeEventSource();

        // A reconnect is already scheduled; do not schedule another.
        if (this.reconnectTimer) return;

        if (this.reconnectAttempts < this.maxReconnectAttempts) {
            this.reconnectAttempts++;
            console.log(`SSE reconnect attempt ${this.reconnectAttempts}/${this.maxReconnectAttempts}`);
            this.reconnectTimer = setTimeout(() => {
                this.reconnectTimer = null;
                this.reconnect();
            }, this.reconnectDelay * this.reconnectAttempts);
        } else {
            console.log('Max SSE reconnect attempts reached, falling back to polling');
            this.startPolling();
        }
    }

    /** Drop the current connection and open a new one. */
    reconnect() {
        this.closeEventSource();
        this.connect();
    }

    /** Switch to the polling fallback (2-second interval). */
    startPolling() {
        this.usePolling = true;
        this.updateConnectionStatus('fallback');

        this.closeEventSource();

        // Clear any existing timers before starting new polling
        this.clearAllIntervals();
        this.clearReconnectTimer();

        // Use existing polling function
        fetchLiveEvaluationPolling();
        this.pollingInterval = setInterval(fetchLiveEvaluationPolling, 2000);
    }

    /** Reflect the connection state in the header badge. */
    updateConnectionStatus(status) {
        const el = document.getElementById('sse-status');
        const textEl = document.getElementById('sse-status-text');
        if (!el || !textEl) return;

        el.className = 'sse-connection-status ' + status;
        const statusText = {
            'connected': 'Live',
            'connecting': 'Connecting',
            'disconnected': 'Disconnected',
            'fallback': 'Polling'
        };
        textEl.textContent = statusText[status] || status;
    }

    /** Show the time of the most recent update. */
    updateTimestamp() {
        const el = document.getElementById('live-eval-refresh-time');
        if (el) {
            el.textContent = 'Updated ' + new Date().toLocaleTimeString();
        }
    }

    /** Re-render the panel from the current state. */
    render() {
        renderLiveEvaluation(this.state);
    }

    /**
     * Tear everything down: connection, intervals and pending reconnects.
     * Also resets the retry budget so a later manual connect() gets the
     * full number of attempts again.
     */
    disconnect() {
        this.closeEventSource();
        this.clearAllIntervals();
        this.clearReconnectTimer();
        this.reconnectAttempts = 0;
    }
}
1110
+
1111
+ // Polling fallback function
1112
/**
 * Polling fallback: fetch the live evaluation state once and render it.
 *
 * Network/parse failures show an "API not available" message in the panel.
 * Rendering problems are logged separately so that a bug in the renderer is
 * not misreported as the API being down.
 */
async function fetchLiveEvaluationPolling() {
    let state;
    try {
        // The timestamp query string defeats HTTP caching.
        const response = await fetch('/api/benchmark-live?' + Date.now());
        if (!response.ok) {
            console.log('Live evaluation API returned HTTP ' + response.status);
            return;
        }
        state = await response.json();
    } catch (e) {
        console.log('Live evaluation API unavailable:', e);
        const content = document.getElementById('live-eval-content');
        if (content) {
            content.innerHTML =
                '<div class="live-eval-idle">Live evaluation API not available</div>';
        }
        return;
    }

    console.log('[SSE Debug] Polling received state:', JSON.stringify(state));
    try {
        renderLiveEvaluation(state);
    } catch (e) {
        console.error('Failed to render live evaluation state:', e);
        return;
    }

    const stamp = document.getElementById('live-eval-refresh-time');
    if (stamp) {
        stamp.textContent = 'Updated ' + new Date().toLocaleTimeString();
    }
}
1128
+
1129
/**
 * Render the live evaluation state into #live-eval-content.
 *
 * Shows an idle message when there is no state, the status is 'idle', or no
 * task is in progress. Otherwise shows the current task, its last five
 * steps, the task result (if present) and a pass/total summary.
 *
 * Every value that comes from the server is HTML-escaped before it is
 * placed in innerHTML, because instructions, reasoning and typed text are
 * free-form strings.
 */
function renderLiveEvaluation(state) {
    const container = document.getElementById('live-eval-content');
    if (!container) return;

    // Escape a value for use in HTML text or a double-quoted attribute.
    const esc = (value) => String(value === null || value === undefined ? '' : value)
        .replace(/&/g, '&amp;')
        .replace(/</g, '&lt;')
        .replace(/>/g, '&gt;')
        .replace(/"/g, '&quot;')
        .replace(/'/g, '&#39;');

    if (!state || state.status === 'idle' || !state.current_task) {
        container.innerHTML = '<div class="live-eval-idle">No evaluation running</div>';
        return;
    }

    const task = state.current_task;
    const progress = `${state.tasks_completed || 0}/${state.total_tasks || 0}`;

    // Build status section
    const statusHtml = `
        <div class="live-eval-status">
            <div class="live-eval-progress">Evaluating task ${esc(progress)}: ${esc(task.task_id)}</div>
            <div class="live-eval-task-name">${esc(task.instruction || 'No instruction')}</div>
            <div class="live-eval-task-name">Domain: ${esc(task.domain || 'unknown')}</div>
        </div>
    `;

    // Build steps section
    let stepsHtml = '';
    if (Array.isArray(task.steps) && task.steps.length > 0) {
        stepsHtml = '<div class="live-eval-steps-container">';

        // Show last 5 steps
        task.steps.slice(-5).forEach(step => {
            // formatAction output can contain typed text, so escape it too.
            const actionText = esc(formatAction(step.action));
            const screenshotHtml = step.screenshot_url
                ? `<img src="${esc(step.screenshot_url)}" class="live-eval-screenshot" alt="Step ${esc(step.step_idx)}" />`
                : '';
            const reasoningHtml = step.reasoning
                ? `<div class="live-eval-reasoning">"${esc(step.reasoning)}"</div>`
                : '';

            stepsHtml += `
                <div class="live-eval-step">
                    <div class="live-eval-step-header">
                        <div class="live-eval-step-number">Step ${esc(step.step_idx)}</div>
                        <div class="live-eval-action">${actionText}</div>
                    </div>
                    ${screenshotHtml}
                    ${reasoningHtml}
                </div>
            `;
        });

        stepsHtml += '</div>';
    }

    // Show result if task completed
    let resultHtml = '';
    if (task.result) {
        const succeeded = Boolean(task.result.success);
        const resultClass = succeeded ? 'success' : 'failure';
        const resultIcon = succeeded ? '✓' : '✗';
        // The duration may be absent; only format it when it is a number.
        const seconds = task.result.total_time_seconds;
        const timing = typeof seconds === 'number' ? ` in ${seconds.toFixed(2)}s` : '';
        resultHtml = `
            <div class="live-eval-status">
                <div class="live-eval-result ${resultClass}">
                    ${resultIcon} ${succeeded ? 'Success' : 'Failure'}
                    (${esc(task.result.num_steps)} steps${timing})
                </div>
            </div>
        `;
    }

    // Show recent results summary
    if (Array.isArray(state.results) && state.results.length > 0) {
        const successCount = state.results.filter(r => r && r.success).length;
        resultHtml += `
            <div class="live-eval-status" style="margin-top: 8px;">
                <small>Results: ${successCount}/${state.results.length} passed</small>
            </div>
        `;
    }

    container.innerHTML = statusHtml + stepsHtml + resultHtml;
}
1207
+
1208
/**
 * Build a short one-line description of an agent action.
 *
 * Format: TYPE, then either "(x=…, y=…)" when both coordinates are numbers
 * or "[target_node_id]" when a node id is given, then the quoted text and
 * "key=…" when present. Returns 'No action' for a missing action.
 */
function formatAction(action) {
    if (!action) return 'No action';

    const type = action.type || 'unknown';
    const parts = [String(type).toUpperCase()];

    // Coordinates may be null OR simply absent (undefined); only format them
    // when both are real numbers, otherwise toFixed() would throw.
    const hasCoords = typeof action.x === 'number' && typeof action.y === 'number';
    if (hasCoords) {
        parts.push(`(x=${action.x.toFixed(3)}, y=${action.y.toFixed(3)})`);
    } else if (action.target_node_id) {
        parts.push(`[${action.target_node_id}]`);
    }

    if (action.text) {
        parts.push(`"${action.text}"`);
    }

    if (action.key) {
        parts.push(`key=${action.key}`);
    }

    return parts.join(' ');
}
1230
+
1231
+ // Initialize SSE manager and store on window for reconnect button
1232
// Create the page-wide SSE manager. It lives on window so the inline
// onclick handler of the Reconnect button can reach it.
window.sseManager = new BenchmarkSSEManager();
window.sseManager.connect();

// Release the connection and timers when the page is being unloaded.
window.addEventListener('beforeunload', function () {
    if (!window.sseManager) return;
    window.sseManager.disconnect();
});
1239
+ </script>
1240
+ '''
1241
+
1242
+
1243
def _get_azure_jobs_panel_css() -> str:
    """Return CSS for the Azure jobs status panel.

    The stylesheet covers the collapsible panel shell, the API error banner,
    the per-job rows with their color-coded left border, the status dot and
    status text, the "Open in Azure" link, the empty state, and the refresh
    button with its loading spinner.

    Status colors: active states (provisioning/preparing/queued/starting)
    are blue, running is amber, completed is green, failed/canceled is red,
    and unknown is gray. The gray "unknown" color is applied to the row
    border, the dot and the text alike so a job with an unrecognized or
    missing status is styled consistently.

    Returns:
        The CSS rules as a single string (without a surrounding <style> tag).
    """
    return '''
    .azure-jobs-panel {
        background: linear-gradient(135deg, rgba(0, 120, 212, 0.15) 0%, rgba(0, 120, 212, 0.05) 100%);
        border: 1px solid rgba(0, 120, 212, 0.3);
        border-radius: 12px;
        margin-bottom: 24px;
        overflow: hidden;
    }
    .azure-jobs-panel.collapsed .azure-jobs-body {
        display: none;
    }
    .azure-jobs-panel.collapsed .azure-jobs-header {
        margin-bottom: 0;
    }
    .azure-jobs-header {
        display: flex;
        align-items: center;
        justify-content: space-between;
        padding: 16px 24px;
        cursor: pointer;
        transition: background 0.2s;
    }
    .azure-jobs-header:hover {
        background: rgba(0, 120, 212, 0.1);
    }
    .azure-jobs-body {
        padding: 0 24px 20px 24px;
    }
    .azure-jobs-title {
        display: flex;
        align-items: center;
        gap: 10px;
        font-size: 1rem;
        font-weight: 600;
        color: #0078d4;
    }
    .azure-jobs-title svg {
        width: 20px;
        height: 20px;
    }
    .azure-jobs-expand-icon {
        font-size: 0.75rem;
        transition: transform 0.2s;
        margin-left: 8px;
        color: var(--text-muted);
    }
    .azure-jobs-panel:not(.collapsed) .azure-jobs-expand-icon {
        transform: rotate(90deg);
    }
    .azure-jobs-tooltip {
        font-size: 0.7rem;
        color: var(--text-muted);
        font-weight: 400;
        margin-left: 8px;
    }
    .azure-jobs-controls {
        display: flex;
        align-items: center;
        gap: 12px;
    }
    .azure-jobs-refresh {
        font-size: 0.75rem;
        color: var(--text-muted);
        transition: color 0.2s;
    }
    .azure-jobs-refresh.error {
        color: #ef4444;
    }
    .azure-jobs-refresh.success {
        color: #10b981;
    }
    /* API Error Banner */
    .api-error-banner {
        background: linear-gradient(135deg, rgba(239, 68, 68, 0.2) 0%, rgba(239, 68, 68, 0.1) 100%);
        border: 1px solid rgba(239, 68, 68, 0.4);
        border-radius: 8px;
        padding: 12px 16px;
        margin-bottom: 16px;
        display: none;
        align-items: center;
        gap: 12px;
        font-size: 0.85rem;
        color: #fca5a5;
    }
    .api-error-banner.show {
        display: flex;
    }
    .api-error-banner .error-icon {
        font-size: 1.2rem;
        flex-shrink: 0;
    }
    .api-error-banner .error-message {
        flex: 1;
    }
    .api-error-banner .retry-btn {
        padding: 4px 10px;
        background: rgba(239, 68, 68, 0.3);
        border: 1px solid rgba(239, 68, 68, 0.5);
        border-radius: 4px;
        color: #fca5a5;
        cursor: pointer;
        font-size: 0.75rem;
        transition: background 0.2s;
    }
    .api-error-banner .retry-btn:hover {
        background: rgba(239, 68, 68, 0.4);
    }
    /* Job items with color-coded borders */
    .azure-job-item {
        display: flex;
        align-items: center;
        gap: 16px;
        padding: 14px 18px;
        background: rgba(0, 0, 0, 0.3);
        border-radius: 8px;
        margin-bottom: 10px;
        border-left: 4px solid transparent;
        transition: all 0.2s ease;
    }
    .azure-job-item:last-child {
        margin-bottom: 0;
    }
    .azure-job-item:hover {
        background: rgba(0, 0, 0, 0.4);
    }
    /* Color-coded left border based on status - Running=Yellow, Completed=Green, Failed=Red */
    .azure-job-item.status-running {
        border-left-color: #f59e0b;
        background: linear-gradient(90deg, rgba(245, 158, 11, 0.1) 0%, rgba(0, 0, 0, 0.3) 20%);
    }
    .azure-job-item.status-completed {
        border-left-color: #10b981;
        background: linear-gradient(90deg, rgba(16, 185, 129, 0.1) 0%, rgba(0, 0, 0, 0.3) 20%);
    }
    .azure-job-item.status-failed,
    .azure-job-item.status-canceled {
        border-left-color: #ef4444;
        background: linear-gradient(90deg, rgba(239, 68, 68, 0.1) 0%, rgba(0, 0, 0, 0.3) 20%);
    }
    .azure-job-item.status-provisioning,
    .azure-job-item.status-preparing,
    .azure-job-item.status-queued,
    .azure-job-item.status-starting {
        border-left-color: #3b82f6;
        background: linear-gradient(90deg, rgba(59, 130, 246, 0.1) 0%, rgba(0, 0, 0, 0.3) 20%);
    }
    .azure-job-item.status-unknown {
        border-left-color: #6b7280;
    }
    .azure-job-status {
        display: flex;
        align-items: center;
        gap: 8px;
        min-width: 130px;
    }
    .status-dot {
        width: 10px;
        height: 10px;
        border-radius: 50%;
        flex-shrink: 0;
    }
    .status-dot.provisioning,
    .status-dot.preparing,
    .status-dot.queued,
    .status-dot.starting {
        background: #3b82f6;
        animation: pulse-status 2s infinite;
    }
    .status-dot.running {
        background: #f59e0b;
        animation: pulse-status 1.5s infinite;
    }
    .status-dot.completed {
        background: #10b981;
        animation: none;
    }
    .status-dot.failed,
    .status-dot.canceled {
        background: #ef4444;
        animation: none;
    }
    .status-dot.unknown {
        background: #6b7280;
        animation: none;
    }
    @keyframes pulse-status {
        0%, 100% { opacity: 1; transform: scale(1); box-shadow: 0 0 0 0 currentColor; }
        50% { opacity: 0.6; transform: scale(0.9); }
    }
    .status-text {
        font-weight: 600;
        font-size: 0.8rem;
        text-transform: uppercase;
        letter-spacing: 0.5px;
    }
    .status-text.running { color: #f59e0b; }
    .status-text.completed { color: #10b981; }
    .status-text.failed, .status-text.canceled { color: #ef4444; }
    .status-text.provisioning, .status-text.preparing, .status-text.queued, .status-text.starting { color: #3b82f6; }
    .status-text.unknown { color: #6b7280; }
    .azure-job-info {
        flex: 1;
        min-width: 0;
    }
    .azure-job-id {
        font-family: "SF Mono", Monaco, monospace;
        font-size: 0.85rem;
        color: var(--text-primary);
        font-weight: 500;
    }
    .azure-job-meta {
        font-size: 0.75rem;
        color: var(--text-secondary);
        margin-top: 4px;
        display: flex;
        flex-wrap: wrap;
        gap: 8px;
    }
    .azure-job-meta-item {
        display: inline-flex;
        align-items: center;
        gap: 4px;
    }
    .azure-job-link {
        display: inline-flex;
        align-items: center;
        gap: 6px;
        padding: 8px 14px;
        background: #0078d4;
        color: white;
        border-radius: 6px;
        text-decoration: none;
        font-size: 0.8rem;
        font-weight: 500;
        transition: all 0.2s;
    }
    .azure-job-link:hover {
        background: #106ebe;
        transform: translateY(-1px);
        box-shadow: 0 4px 12px rgba(0, 120, 212, 0.3);
    }
    .no-jobs {
        text-align: center;
        padding: 30px 20px;
        color: var(--text-muted);
        font-size: 0.9rem;
    }
    .no-jobs code {
        display: block;
        margin-top: 12px;
        padding: 10px 14px;
        background: rgba(0, 0, 0, 0.4);
        border-radius: 6px;
        font-family: "SF Mono", Monaco, monospace;
        font-size: 0.8rem;
        color: var(--text-secondary);
    }
    /* Refresh button with loading spinner */
    .refresh-btn {
        background: rgba(0, 120, 212, 0.2);
        border: 1px solid rgba(0, 120, 212, 0.4);
        border-radius: 6px;
        color: var(--text-primary);
        cursor: pointer;
        padding: 6px 12px;
        font-size: 0.8rem;
        display: flex;
        align-items: center;
        gap: 6px;
        transition: all 0.2s;
    }
    .refresh-btn:hover:not(:disabled) {
        background: rgba(0, 120, 212, 0.3);
        transform: translateY(-1px);
    }
    .refresh-btn:disabled {
        opacity: 0.6;
        cursor: not-allowed;
    }
    .refresh-btn .spinner {
        display: none;
        width: 14px;
        height: 14px;
        border: 2px solid rgba(255,255,255,0.3);
        border-top-color: #0078d4;
        border-radius: 50%;
        animation: spin 0.8s linear infinite;
    }
    .refresh-btn.loading .spinner {
        display: inline-block;
    }
    .refresh-btn.loading .refresh-icon {
        display: none;
    }
    @keyframes spin {
        to { transform: rotate(360deg); }
    }
    '''
1539
+
1540
+
1541
+ def _get_azure_jobs_panel_html() -> str:
1542
+ """Return HTML for the Azure jobs status panel with JS polling, error handling, and loading states.
1543
+
1544
+ NOTE: This panel is now used in the Training tab (not Benchmarks) because Azure ML
1545
+ is used for training jobs, not for WAA benchmarks (which require nested virtualization
1546
+ that managed compute doesn't support).
1547
+ """
1548
+ return '''
1549
+ <div class="azure-jobs-panel collapsed" id="azure-jobs-panel">
1550
+ <div class="azure-jobs-header" onclick="toggleAzureJobsPanel()" title="Azure ML training jobs">
1551
+ <div class="azure-jobs-title">
1552
+ <svg viewBox="0 0 24 24" fill="currentColor">
1553
+ <path d="M12 2C6.48 2 2 6.48 2 12s4.48 10 10 10 10-4.48 10-10S17.52 2 12 2zm-1 17.93c-3.95-.49-7-3.85-7-7.93 0-.62.08-1.21.21-1.79L9 15v1c0 1.1.9 2 2 2v1.93zm6.9-2.54c-.26-.81-1-1.39-1.9-1.39h-1v-3c0-.55-.45-1-1-1H8v-2h2c.55 0 1-.45 1-1V7h2c1.1 0 2-.9 2-2v-.41c2.93 1.19 5 4.06 5 7.41 0 2.08-.8 3.97-2.1 5.39z"/>
1554
+ </svg>
1555
+ Azure ML Jobs
1556
+ <span class="azure-jobs-expand-icon">&#9654;</span>
1557
+ </div>
1558
+ <div class="azure-jobs-controls" onclick="event.stopPropagation()">
1559
+ <span class="azure-jobs-refresh" id="jobs-refresh-time">Checking...</span>
1560
+ <button id="azure-jobs-refresh-btn" class="refresh-btn" onclick="refreshAzureJobs()" title="Refresh job status from Azure">
1561
+ <span class="refresh-icon">&#8635;</span>
1562
+ <span class="spinner"></span>
1563
+ Refresh
1564
+ </button>
1565
+ </div>
1566
+ </div>
1567
+
1568
+ <div class="azure-jobs-body">
1569
+ <!-- API Error Banner (hidden by default) -->
1570
+ <div class="api-error-banner" id="azure-jobs-error">
1571
+ <span class="error-icon">!</span>
1572
+ <span class="error-message" id="azure-jobs-error-msg">Failed to fetch Azure jobs</span>
1573
+ <button class="retry-btn" onclick="refreshAzureJobs()">Retry</button>
1574
+ </div>
1575
+
1576
+ <!-- Loading state -->
1577
+ <div id="azure-jobs-loading" style="display: none; text-align: center; padding: 30px;">
1578
+ <div style="display: inline-block; width: 24px; height: 24px; border: 3px solid rgba(0,120,212,0.3); border-top-color: #0078d4; border-radius: 50%; animation: spin 1s linear infinite;"></div>
1579
+ <div style="margin-top: 12px; color: var(--text-muted); font-size: 0.85rem;">Loading Azure jobs...</div>
1580
+ </div>
1581
+
1582
+ <div id="azure-jobs-list">
1583
+ <div class="no-jobs">
1584
+ <div style="font-size: 2rem; margin-bottom: 12px; opacity: 0.5;">&#9729;</div>
1585
+ Checking Azure ML for jobs...
1586
+ </div>
1587
+ </div>
1588
+
1589
+ <button id="toggle-logs-btn" onclick="toggleLogs()" style="
1590
+ margin-top: 12px;
1591
+ padding: 8px 14px;
1592
+ background: rgba(0, 120, 212, 0.2);
1593
+ border: 1px solid rgba(0, 120, 212, 0.4);
1594
+ border-radius: 6px;
1595
+ color: var(--text-primary);
1596
+ cursor: pointer;
1597
+ font-size: 0.8rem;
1598
+ display: flex;
1599
+ align-items: center;
1600
+ gap: 6px;
1601
+ transition: all 0.2s;
1602
+ ">
1603
+ <span id="logs-icon">&#9660;</span>
1604
+ <span id="logs-btn-text">Show Logs</span>
1605
+ </button>
1606
+ <div id="job-logs-panel" style="display: none; margin-top: 12px;">
1607
+ <div id="log-job-status" style="font-size: 0.75rem; color: var(--text-muted); margin-bottom: 6px;"></div>
1608
+ <pre id="job-logs-content" style="
1609
+ background: #1a1a1a;
1610
+ color: #10b981;
1611
+ padding: 14px;
1612
+ border-radius: 6px;
1613
+ font-size: 0.75rem;
1614
+ max-height: 300px;
1615
+ overflow-y: auto;
1616
+ white-space: pre-wrap;
1617
+ word-wrap: break-word;
1618
+ font-family: 'SF Mono', Monaco, monospace;
1619
+ border: 1px solid rgba(255,255,255,0.1);
1620
+ ">Loading logs...</pre>
1621
+ </div>
1622
+ </div>
1623
+ </div>
1624
+
1625
+ <script>
1626
+ // Track refresh state
1627
+ let isAzureJobsRefreshing = false;
1628
+ let azureJobsErrorCount = 0;
1629
+ let azureJobsPanelUserToggled = false; // Track if user manually toggled panel
1630
+
1631
+ // Toggle Azure jobs panel expand/collapse
1632
/**
 * Flip the Azure jobs panel between collapsed and expanded, and remember
 * that the user did so, so automatic expansion no longer overrides it.
 * Does nothing when the panel element is not on the page.
 */
function toggleAzureJobsPanel() {
    const panelEl = document.getElementById('azure-jobs-panel');
    if (!panelEl) return;

    panelEl.classList.toggle('collapsed');
    // From now on the user's choice wins over auto-expand.
    azureJobsPanelUserToggled = true;
}
1639
+
1640
+ // Check if panel should auto-expand based on jobs (only for running jobs)
1641
+ // NOTE: Panel is collapsed by default and only auto-expands if there are running jobs
1642
/**
 * Decide whether the (collapsed-by-default) panel should open on its own.
 *
 * Returns true when at least one job has an active status, compared
 * case-insensitively: running, provisioning, preparing, queued or starting.
 * Returns false for a missing or empty job list.
 */
function shouldAutoExpandAzurePanel(jobs) {
    const hasJobs = Boolean(jobs) && jobs.length !== 0;
    if (!hasJobs) return false;

    const activeStates = new Set(['running', 'provisioning', 'preparing', 'queued', 'starting']);

    let expand = false;
    for (const entry of jobs) {
        const normalized = (entry.status || '').toLowerCase();
        if (activeStates.has(normalized)) {
            expand = true;
            break;
        }
    }
    return expand;
}
1654
+
1655
+ // Auto-expand panel if there are running/active jobs (only if the user hasn't manually toggled it)
1656
/**
 * Expand the Azure jobs panel when the job list contains an active job.
 * Skipped entirely once the user has toggled the panel by hand, and when
 * the panel element is not on the page. Never collapses the panel.
 */
function maybeAutoExpandAzurePanel(jobs) {
    // The user's manual choice takes precedence.
    if (azureJobsPanelUserToggled) return;

    const panelEl = document.getElementById('azure-jobs-panel');
    if (!panelEl) return;

    const wantsExpand = shouldAutoExpandAzurePanel(jobs);
    if (!wantsExpand) return;
    panelEl.classList.remove('collapsed');
}
1666
+
1667
+ // Show/hide loading state and error banner
1668
/**
 * Put the Azure jobs panel into one of its display states.
 *
 * state:
 *   'loading' - show the spinner, hide the list, disable the refresh button
 *   'error'   - show the list and the error banner with errorMsg
 *   'success' - show the list, mark the timestamp green, reset error count
 *   other     - show the list and re-enable the refresh button
 * errorMsg: text for the banner; a default message is used when empty.
 */
function setAzureJobsState(state, errorMsg = '') {
    const byId = (id) => document.getElementById(id);
    const spinnerEl = byId('azure-jobs-loading');
    const jobsEl = byId('azure-jobs-list');
    const bannerEl = byId('azure-jobs-error');
    const bannerTextEl = byId('azure-jobs-error-msg');
    const stampEl = byId('jobs-refresh-time');
    const buttonEl = byId('azure-jobs-refresh-btn');

    // Take the refresh button out of its busy state.
    const releaseButton = () => {
        buttonEl.classList.remove('loading');
        buttonEl.disabled = false;
    };

    // Every state starts from "no spinner, no banner".
    spinnerEl.style.display = 'none';
    bannerEl.classList.remove('show');

    switch (state) {
        case 'loading':
            spinnerEl.style.display = 'block';
            jobsEl.style.display = 'none';
            buttonEl.classList.add('loading');
            buttonEl.disabled = true;
            break;

        case 'error':
            jobsEl.style.display = 'block';
            bannerEl.classList.add('show');
            bannerTextEl.textContent = errorMsg || 'Failed to fetch Azure jobs. Check Azure CLI login.';
            stampEl.textContent = 'Error';
            stampEl.classList.add('error');
            stampEl.classList.remove('success');
            releaseButton();
            break;

        case 'success':
            jobsEl.style.display = 'block';
            stampEl.classList.remove('error');
            stampEl.classList.add('success');
            releaseButton();
            azureJobsErrorCount = 0; // a good response clears the error streak
            break;

        default:
            jobsEl.style.display = 'block';
            releaseButton();
            break;
    }
}
1707
+
1708
+ // Force refresh from Azure (bypasses cache)
1709
/**
 * Manual refresh: request the job list with force=true and render it.
 *
 * Re-entrant calls are ignored while a refresh is in flight. On failure the
 * error counter is incremented and the panel switches to its error state
 * with the failure message.
 */
async function refreshAzureJobs() {
    if (isAzureJobsRefreshing) return;

    const stampEl = () => document.getElementById('jobs-refresh-time');

    isAzureJobsRefreshing = true;
    setAzureJobsState('loading');
    stampEl().textContent = 'Refreshing...';

    try {
        // The t= timestamp keeps the browser from serving a cached response.
        const reply = await fetch('/api/azure-jobs?force=true&t=' + Date.now());
        if (!reply.ok) throw new Error(`HTTP ${reply.status}`);

        const payload = await reply.json();
        if (payload.error) throw new Error(payload.error);

        renderAzureJobs(payload, true);
        setAzureJobsState('success');
        stampEl().textContent = 'Live from Azure - ' + new Date().toLocaleTimeString();
    } catch (err) {
        console.error('Azure jobs refresh failed:', err);
        azureJobsErrorCount++;
        setAzureJobsState('error', err.message || 'Connection failed');
    } finally {
        // Always release the guard, whatever happened above.
        isAzureJobsRefreshing = false;
    }
}
1736
+
1737
+ // Fetch Azure job status from API (normal polling)
1738
/**
 * Background poll of the Azure job list.
 *
 * Skipped while a manual refresh is running. After three consecutive
 * errors polling is paused (a hint is shown) until a manual refresh
 * succeeds. When the live API fails, the cached
 * benchmark_results/azure_jobs.json file is tried; if that fails too, the
 * empty state with a usage hint is shown.
 */
async function fetchAzureJobs() {
    if (isAzureJobsRefreshing) return;

    const stampEl = () => document.getElementById('jobs-refresh-time');

    // Too many failures in a row: stop polling until the user refreshes.
    if (azureJobsErrorCount >= 3) {
        stampEl().textContent = 'Polling paused (too many errors). Click Refresh.';
        return;
    }

    let liveFailed = false;
    let liveError;
    try {
        const reply = await fetch('/api/azure-jobs?t=' + Date.now());
        if (!reply.ok) throw new Error(`HTTP ${reply.status}`);

        const payload = await reply.json();
        if (payload.error) throw new Error(payload.error);

        renderAzureJobs(payload, true);
        setAzureJobsState('success');
        stampEl().textContent = 'Live - ' + new Date().toLocaleTimeString();
    } catch (err) {
        liveFailed = true;
        liveError = err;
    }
    if (!liveFailed) return;

    console.log('Azure API error:', liveError);
    azureJobsErrorCount++;

    // Try cached fallback
    try {
        const cachedReply = await fetch('benchmark_results/azure_jobs.json?t=' + Date.now());
        if (cachedReply.ok) {
            const cachedJobs = await cachedReply.json();
            renderAzureJobs(cachedJobs, false);
            stampEl().textContent = 'Cached - ' + new Date().toLocaleTimeString();
            stampEl().classList.remove('error');
            return;
        }
    } catch (fallbackError) {
        // Fallback also failed; fall through to the empty state.
    }

    // Show empty state with guidance
    document.getElementById('azure-jobs-list').innerHTML =
        '<div class="no-jobs">' +
        '<div style="font-size: 2rem; margin-bottom: 12px; opacity: 0.5;">&#9729;</div>' +
        'No Azure jobs found<code>uv run python -m openadapt_ml.benchmarks.cli run-azure</code>' +
        '</div>';
}
1788
+
1789
/**
 * Render up to five Azure jobs into #azure-jobs-list.
 *
 * jobs:   array of job objects (status, display_name/job_id, started_at,
 *         num_tasks, results.success_rate, azure_dashboard_url are read).
 *         Anything that is not a non-empty array renders the empty state.
 * isLive: true for data from the live API; the "~N tasks" item is only
 *         shown for cached (non-live) data.
 *
 * All job fields are HTML-escaped before insertion, the status is reduced
 * to a safe CSS class name, and only http(s) dashboard links are emitted.
 */
function renderAzureJobs(jobs, isLive) {
    const listEl = document.getElementById('azure-jobs-list');
    if (!listEl) return;

    // Escape a value for use in HTML text or a double-quoted attribute.
    const esc = (value) => String(value === null || value === undefined ? '' : value)
        .replace(/&/g, '&amp;')
        .replace(/</g, '&lt;')
        .replace(/>/g, '&gt;')
        .replace(/"/g, '&quot;')
        .replace(/'/g, '&#39;');

    // Tolerate unexpected payloads (e.g. an object instead of an array).
    const jobList = Array.isArray(jobs) ? jobs.filter(Boolean) : [];

    // Auto-expand panel if there are running/active jobs
    maybeAutoExpandAzurePanel(jobList);

    if (jobList.length === 0) {
        listEl.innerHTML =
            '<div class="no-jobs">' +
            '<div style="font-size: 2rem; margin-bottom: 12px; opacity: 0.5;">&#9729;</div>' +
            'No Azure jobs found<code>uv run python -m openadapt_ml.benchmarks.cli run-azure</code>' +
            '</div>';
        return;
    }

    const html = jobList.slice(0, 5).map(job => {
        const rawStatus = job.status ? String(job.status) : '';
        const status = (rawStatus || 'unknown').toLowerCase();
        // Used inside class="…": keep only characters valid in a class name.
        const statusClass = status.replace(/[^a-z0-9_-]/g, '') || 'unknown';
        const statusText = rawStatus
            ? rawStatus.charAt(0).toUpperCase() + rawStatus.slice(1)
            : 'Unknown';

        // Show display_name if available (live data), otherwise job_id
        const displayName = job.display_name || job.job_id;

        // Calculate elapsed time for running jobs
        let elapsedText = '';
        let isStuck = false;
        if (job.started_at && status === 'running') {
            const elapsedMins = (Date.now() - new Date(job.started_at).getTime()) / 60000;
            // Skip unparseable dates instead of printing "NaN".
            if (Number.isFinite(elapsedMins)) {
                // Round once to whole minutes, then split into h/m so that
                // e.g. 90 minutes reads "1h 30m" (not "2h 30m").
                const totalMins = Math.max(0, Math.round(elapsedMins));
                elapsedText = totalMins < 60
                    ? totalMins + 'm'
                    : Math.floor(totalMins / 60) + 'h ' + (totalMins % 60) + 'm';
                // Warn if running > 30 mins
                isStuck = elapsedMins > 30;
            }
        }

        // Build metadata items
        const metaItems = [];
        if (elapsedText) {
            metaItems.push('<span class="azure-job-meta-item">&#128337; ' + elapsedText + '</span>');
        }
        if (!isLive && job.num_tasks) {
            metaItems.push('<span class="azure-job-meta-item">~' + esc(job.num_tasks) + ' tasks</span>');
        }
        const successRate = job.results ? job.results.success_rate : undefined;
        if (typeof successRate === 'number') {
            metaItems.push('<span class="azure-job-meta-item">' + (successRate * 100).toFixed(1) + '% success</span>');
        }
        if (job.started_at && status !== 'running') {
            const date = new Date(job.started_at);
            metaItems.push('<span class="azure-job-meta-item">' + esc(date.toLocaleString()) + '</span>');
        }
        const metaHtml = metaItems.join('');

        // Add warning for stuck jobs
        const stuckWarning = isStuck
            ? '<div style="color: #ff9800; font-size: 0.7rem; margin-top: 6px; display: flex; align-items: center; gap: 4px;"><span>&#9888;</span> Running > 30min. May be stuck. Consider canceling.</div>'
            : '';

        // Only link out to http(s) URLs; anything else becomes a no-op link.
        const rawUrl = typeof job.azure_dashboard_url === 'string' ? job.azure_dashboard_url : '';
        const dashboardUrl = /^https?:[/][/]/i.test(rawUrl) ? rawUrl : '#';

        return '<div class="azure-job-item status-' + statusClass + '">' +
            '<div class="azure-job-status">' +
            '<span class="status-dot ' + statusClass + '"></span>' +
            '<span class="status-text ' + statusClass + '">' + esc(statusText) + '</span>' +
            '</div>' +
            '<div class="azure-job-info">' +
            '<div class="azure-job-id">' + esc(displayName) + '</div>' +
            '<div class="azure-job-meta">' + metaHtml + '</div>' +
            stuckWarning +
            '</div>' +
            // rel=noopener keeps the opened page from reaching window.opener
            '<a href="' + esc(dashboardUrl) + '" target="_blank" rel="noopener noreferrer" class="azure-job-link">' +
            'Open in Azure &#8594;' +
            '</a>' +
            '</div>';
    }).join('');

    listEl.innerHTML = html;
}
1868
+
1869
+ // Log viewer state
1870
+ let showLogs = false;
1871
+ let currentLogJobId = null;
1872
+
1873
+ async function fetchJobLogs() {
1874
+ if (!showLogs) return;
1875
+
1876
+ const logEl = document.getElementById('job-logs-content');
1877
+ const statusEl = document.getElementById('log-job-status');
1878
+
1879
+ try {
1880
+ const url = currentLogJobId
1881
+ ? '/api/azure-job-logs?job_id=' + currentLogJobId
1882
+ : '/api/azure-job-logs';
1883
+ const response = await fetch(url + '&t=' + Date.now());
1884
+ if (response.ok) {
1885
+ const data = await response.json();
1886
+ if (logEl) {
1887
+ logEl.textContent = data.logs || 'No logs available';
1888
+ if (data.command) {
1889
+ logEl.textContent = 'Command: ' + data.command + '\\n\\n' + (data.logs || '');
1890
+ }
1891
+ // Color code based on status
1892
+ logEl.style.color = data.status === 'running' ? '#f59e0b' :
1893
+ data.status === 'completed' ? '#10b981' :
1894
+ data.status === 'failed' ? '#ef4444' : '#10b981';
1895
+ }
1896
+ if (statusEl && data.job_id) {
1897
+ statusEl.textContent = 'Job: ' + data.job_id + ' (' + data.status + ')';
1898
+ }
1899
+ } else {
1900
+ if (logEl) logEl.textContent = 'Failed to fetch logs (HTTP ' + response.status + ')';
1901
+ }
1902
+ } catch (e) {
1903
+ console.log('Error fetching logs:', e);
1904
+ if (logEl) logEl.textContent = 'Error fetching logs: ' + e.message;
1905
+ }
1906
+ }
1907
+
1908
+ function toggleLogs() {
1909
+ showLogs = !showLogs;
1910
+ const panel = document.getElementById('job-logs-panel');
1911
+ const icon = document.getElementById('logs-icon');
1912
+ const btnText = document.getElementById('logs-btn-text');
1913
+
1914
+ if (panel) {
1915
+ panel.style.display = showLogs ? 'block' : 'none';
1916
+ }
1917
+ if (icon) {
1918
+ icon.innerHTML = showLogs ? '&#9650;' : '&#9660;';
1919
+ }
1920
+ if (btnText) {
1921
+ btnText.textContent = showLogs ? 'Hide Logs' : 'Show Logs';
1922
+ }
1923
+ if (showLogs) fetchJobLogs();
1924
+ }
1925
+
1926
+ // Initial fetch and poll every 30 seconds (use Refresh button for immediate updates)
1927
+ fetchAzureJobs();
1928
+ setInterval(fetchAzureJobs, 30000);
1929
+ setInterval(fetchJobLogs, 5000); // Poll logs every 5 seconds
1930
+ </script>
1931
+ '''
1932
+
1933
+
1934
def _get_vm_discovery_panel_css() -> str:
    """Build the stylesheet for the VM Discovery panel.

    The rules style the panel shell and header, each VM card (status pill,
    IP row, info grid), the large "Open VNC" link together with its tunnel
    badges, the WAA status chip, the empty-state message and the "Add VM"
    registration form.

    The rules reference the CSS custom properties ``--text-muted``,
    ``--text-primary``, ``--text-secondary`` and ``--border-color``; none of
    them is defined here, so the embedding page has to provide them.

    Returns:
        The raw CSS text, without a surrounding ``<style>`` element.
    """
    vm_panel_css = '''
    .vm-discovery-panel {
        background: linear-gradient(135deg, rgba(16, 185, 129, 0.15) 0%, rgba(5, 150, 105, 0.05) 100%);
        border: 1px solid rgba(16, 185, 129, 0.3);
        border-radius: 12px;
        padding: 20px 24px;
        margin-bottom: 24px;
    }
    .vm-discovery-header {
        display: flex;
        align-items: center;
        justify-content: space-between;
        margin-bottom: 16px;
    }
    .vm-discovery-title {
        display: flex;
        align-items: center;
        gap: 10px;
        font-size: 1rem;
        font-weight: 600;
        color: #10b981;
    }
    .vm-discovery-title svg {
        width: 20px;
        height: 20px;
    }
    .vm-discovery-controls {
        display: flex;
        align-items: center;
        gap: 12px;
    }
    .vm-discovery-refresh {
        font-size: 0.75rem;
        color: var(--text-muted);
    }
    .vm-item {
        background: rgba(0, 0, 0, 0.3);
        border: 1px solid var(--border-color);
        border-radius: 10px;
        padding: 18px;
        margin-bottom: 12px;
        transition: all 0.2s;
    }
    .vm-item:last-child {
        margin-bottom: 0;
    }
    .vm-item:hover {
        border-color: rgba(16, 185, 129, 0.5);
    }
    .vm-item-header {
        display: flex;
        align-items: center;
        justify-content: space-between;
        margin-bottom: 12px;
    }
    .vm-name {
        font-weight: 600;
        font-size: 1rem;
        color: var(--text-primary);
    }
    .vm-status-indicator {
        display: flex;
        align-items: center;
        gap: 6px;
        font-size: 0.8rem;
        padding: 4px 10px;
        border-radius: 12px;
        background: rgba(0, 0, 0, 0.2);
    }
    .vm-status-indicator.online {
        background: rgba(16, 185, 129, 0.2);
        color: #10b981;
    }
    .vm-status-indicator.offline {
        background: rgba(239, 68, 68, 0.2);
        color: #ef4444;
    }
    .vm-status-dot {
        width: 8px;
        height: 8px;
        border-radius: 50%;
    }
    .vm-status-dot.online {
        background: #10b981;
        box-shadow: 0 0 6px #10b981;
    }
    .vm-status-dot.offline {
        background: #ef4444;
    }
    .vm-status-dot.unknown {
        background: #6b7280;
    }
    /* IP Address display - prominent */
    .vm-ip-display {
        display: flex;
        align-items: center;
        gap: 8px;
        padding: 10px 14px;
        background: rgba(0, 0, 0, 0.4);
        border: 1px solid rgba(16, 185, 129, 0.3);
        border-radius: 8px;
        margin-bottom: 14px;
    }
    .vm-ip-label {
        font-size: 0.75rem;
        color: var(--text-muted);
        text-transform: uppercase;
        letter-spacing: 0.5px;
    }
    .vm-ip-value {
        font-family: 'SF Mono', Monaco, monospace;
        font-size: 1.1rem;
        font-weight: 600;
        color: #10b981;
        letter-spacing: 0.5px;
    }
    .vm-ip-copy {
        margin-left: auto;
        padding: 4px 8px;
        background: rgba(16, 185, 129, 0.2);
        border: 1px solid rgba(16, 185, 129, 0.3);
        border-radius: 4px;
        color: #10b981;
        cursor: pointer;
        font-size: 0.7rem;
        transition: all 0.2s;
    }
    .vm-ip-copy:hover {
        background: rgba(16, 185, 129, 0.3);
    }
    .vm-info {
        display: grid;
        grid-template-columns: repeat(auto-fit, minmax(180px, 1fr));
        gap: 10px;
        margin-bottom: 14px;
        font-size: 0.85rem;
        color: var(--text-secondary);
    }
    .vm-info-item {
        display: flex;
        gap: 6px;
    }
    .vm-info-label {
        color: var(--text-muted);
    }
    .vm-info-value {
        color: var(--text-primary);
        font-family: 'SF Mono', Monaco, monospace;
    }
    .vm-actions {
        display: flex;
        gap: 10px;
        align-items: center;
        flex-wrap: wrap;
    }
    /* VNC Button - Large and Prominent */
    .vm-vnc-link {
        display: inline-flex;
        align-items: center;
        gap: 8px;
        padding: 12px 20px;
        background: linear-gradient(135deg, #10b981 0%, #059669 100%);
        border: none;
        border-radius: 8px;
        color: white;
        text-decoration: none;
        font-size: 0.9rem;
        font-weight: 600;
        transition: all 0.2s;
        box-shadow: 0 4px 12px rgba(16, 185, 129, 0.3);
    }
    .vm-vnc-link:hover {
        background: linear-gradient(135deg, #059669 0%, #047857 100%);
        transform: translateY(-2px);
        box-shadow: 0 6px 16px rgba(16, 185, 129, 0.4);
    }
    .vm-vnc-link .vnc-icon {
        font-size: 1.1rem;
    }
    .vm-vnc-link .vnc-ip {
        font-family: 'SF Mono', Monaco, monospace;
        font-size: 0.8rem;
        opacity: 0.9;
        margin-left: 4px;
    }
    .vm-vnc-link .tunnel-badge {
        font-size: 0.7rem;
        padding: 2px 6px;
        border-radius: 4px;
        background: rgba(255, 255, 255, 0.2);
        margin-left: 6px;
    }
    .vm-vnc-link .tunnel-badge.tunnel-error {
        background: rgba(239, 68, 68, 0.3);
        color: #fca5a5;
    }
    .vm-vnc-link.tunnel-inactive {
        background: linear-gradient(135deg, #6b7280 0%, #4b5563 100%);
        opacity: 0.8;
    }
    .vm-vnc-link.tunnel-inactive:hover {
        background: linear-gradient(135deg, #4b5563 0%, #374151 100%);
    }
    .tunnel-mini {
        font-size: 0.7rem;
        color: #10b981;
    }
    .vm-waa-status {
        display: inline-flex;
        align-items: center;
        gap: 6px;
        padding: 8px 14px;
        background: rgba(0, 0, 0, 0.2);
        border-radius: 6px;
        font-size: 0.85rem;
        font-weight: 500;
    }
    .vm-waa-status.ready {
        color: #10b981;
        border: 1px solid rgba(16, 185, 129, 0.4);
        background: rgba(16, 185, 129, 0.1);
    }
    .vm-waa-status.not-ready {
        color: #ef4444;
        border: 1px solid rgba(239, 68, 68, 0.4);
        background: rgba(239, 68, 68, 0.1);
    }
    .vm-waa-status.checking {
        color: #f59e0b;
        border: 1px solid rgba(245, 158, 11, 0.4);
        background: rgba(245, 158, 11, 0.1);
    }
    .vm-last-checked {
        font-size: 0.7rem;
        color: var(--text-muted);
        margin-top: 10px;
        display: flex;
        align-items: center;
        gap: 6px;
    }
    .no-vms {
        text-align: center;
        padding: 30px 20px;
        color: var(--text-muted);
        font-size: 0.9rem;
    }
    .no-vms-icon {
        font-size: 2rem;
        margin-bottom: 12px;
        opacity: 0.5;
    }
    .vm-add-button {
        margin-top: 12px;
        padding: 10px 18px;
        background: rgba(16, 185, 129, 0.2);
        border: 1px solid rgba(16, 185, 129, 0.4);
        border-radius: 6px;
        color: #10b981;
        cursor: pointer;
        font-size: 0.85rem;
        font-weight: 500;
        transition: all 0.2s;
        display: flex;
        align-items: center;
        gap: 6px;
    }
    .vm-add-button:hover {
        background: rgba(16, 185, 129, 0.3);
        transform: translateY(-1px);
    }
    .vm-add-form {
        display: none;
        margin-top: 12px;
        padding: 18px;
        background: rgba(0, 0, 0, 0.3);
        border: 1px solid var(--border-color);
        border-radius: 10px;
    }
    .vm-add-form.show {
        display: block;
    }
    .vm-form-row {
        margin-bottom: 14px;
    }
    .vm-form-row label {
        display: block;
        font-size: 0.8rem;
        color: var(--text-secondary);
        margin-bottom: 6px;
        font-weight: 500;
    }
    .vm-form-row input {
        width: 100%;
        padding: 8px 12px;
        background: rgba(0, 0, 0, 0.4);
        border: 1px solid var(--border-color);
        border-radius: 6px;
        color: var(--text-primary);
        font-size: 0.85rem;
        transition: border-color 0.2s;
    }
    .vm-form-row input:focus {
        outline: none;
        border-color: #10b981;
    }
    .vm-form-actions {
        display: flex;
        gap: 10px;
        margin-top: 18px;
    }
    .vm-form-submit {
        padding: 10px 18px;
        background: #10b981;
        border: none;
        border-radius: 6px;
        color: white;
        cursor: pointer;
        font-size: 0.85rem;
        font-weight: 500;
    }
    .vm-form-cancel {
        padding: 8px 16px;
        background: rgba(255, 255, 255, 0.1);
        border: 1px solid var(--border-color);
        border-radius: 6px;
        color: var(--text-primary);
        cursor: pointer;
        font-size: 0.85rem;
    }
    '''
    return vm_panel_css
2266
+
2267
+
2268
def _get_vm_discovery_panel_html() -> str:
    """Return HTML for VM Discovery panel with prominent VNC button and loading states.

    The fragment contains the panel markup (header with refresh button, error
    banner, loading spinner, VM list, "Add VM" form) followed by one inline
    ``<script>`` element that drives it:

    * ``fetchVMs`` runs once on load and then every 10 seconds. It requests
      ``/api/vms`` and stops polling after three consecutive exceptions
      (the refresh label then reads "Polling paused").
    * ``refreshVMs`` is the user-triggered reload; it shows the spinner,
      surfaces errors in the banner and resets the error counter on success.
    * ``renderVMs`` rebuilds the list through ``innerHTML``. Every value that
      comes from the API is passed through ``escapeVMHtml`` first, because VM
      names, hosts and users are free text entered in the registration form
      and would otherwise be interpreted as markup. The copy button carries
      the address in a ``data-ip`` attribute rather than in an inline
      JavaScript string literal, so quotes in the value cannot break out of
      the handler.
    * ``submitVMRegistration`` POSTs the form as JSON to
      ``/api/vms/register`` and, on success, reloads through ``refreshVMs``
      so the new VM appears even when background polling had paused.

    The classes ``refresh-btn``, ``api-error-banner``, ``spinner`` and the
    ``spin`` animation are used here but not styled by this panel's own
    stylesheet; presumably the embedding page supplies them.

    Returns:
        The HTML fragment, including its ``<script>`` element.
    """
    return '''
    <div class="vm-discovery-panel" id="vm-discovery-panel">
        <div class="vm-discovery-header">
            <div class="vm-discovery-title">
                <svg viewBox="0 0 24 24" fill="currentColor">
                    <path d="M3 3h18v4H3V3zm0 6h18v12H3V9zm2 2v8h14v-8H5zm2 2h4v4H7v-4z"/>
                </svg>
                Windows VMs
            </div>
            <div class="vm-discovery-controls">
                <span class="vm-discovery-refresh" id="vm-refresh-time">Checking...</span>
                <button class="refresh-btn" onclick="refreshVMs()" title="Refresh VM status" id="vm-refresh-btn">
                    <span class="refresh-icon">&#8635;</span>
                    <span class="spinner"></span>
                    Refresh
                </button>
            </div>
        </div>

        <!-- API Error Banner -->
        <div class="api-error-banner" id="vm-api-error">
            <span class="error-icon">!</span>
            <span class="error-message" id="vm-error-msg">Failed to fetch VMs</span>
            <button class="retry-btn" onclick="refreshVMs()">Retry</button>
        </div>

        <!-- Loading state -->
        <div id="vm-loading" style="display: none; text-align: center; padding: 30px;">
            <div style="display: inline-block; width: 24px; height: 24px; border: 3px solid rgba(16,185,129,0.3); border-top-color: #10b981; border-radius: 50%; animation: spin 1s linear infinite;"></div>
            <div style="margin-top: 12px; color: var(--text-muted); font-size: 0.85rem;">Checking VM status...</div>
        </div>

        <div id="vm-list">
            <div class="no-vms">
                <div class="no-vms-icon">&#128187;</div>
                Checking for registered VMs...
            </div>
        </div>
        <button id="vm-add-button" class="vm-add-button" onclick="toggleVMAddForm()">
            <span>+</span> Add VM
        </button>
        <div id="vm-add-form" class="vm-add-form">
            <div class="vm-form-row">
                <label>VM Name:</label>
                <input type="text" id="vm-name" placeholder="e.g., azure-waa-vm" />
            </div>
            <div class="vm-form-row">
                <label>SSH Host (IP):</label>
                <input type="text" id="vm-ssh-host" placeholder="e.g., 172.171.112.41" />
            </div>
            <div class="vm-form-row">
                <label>SSH User:</label>
                <input type="text" id="vm-ssh-user" value="azureuser" />
            </div>
            <div class="vm-form-row">
                <label>VNC Port:</label>
                <input type="number" id="vm-vnc-port" value="8006" />
            </div>
            <div class="vm-form-row">
                <label>WAA Port:</label>
                <input type="number" id="vm-waa-port" value="5000" />
            </div>
            <div class="vm-form-row">
                <label>Docker Container:</label>
                <input type="text" id="vm-docker-container" value="win11-waa" />
            </div>
            <div class="vm-form-row">
                <label>Internal IP:</label>
                <input type="text" id="vm-internal-ip" value="20.20.20.21" />
            </div>
            <div class="vm-form-actions">
                <button class="vm-form-submit" onclick="submitVMRegistration()">Register VM</button>
                <button class="vm-form-cancel" onclick="toggleVMAddForm()">Cancel</button>
            </div>
        </div>
    </div>

    <script>
        let isVMRefreshing = false;
        let vmErrorCount = 0;

        // Escape a value so it can be concatenated into HTML text or a quoted attribute.
        function escapeVMHtml(value) {
            const entities = {'&': '&amp;', '<': '&lt;', '>': '&gt;', '"': '&quot;', "'": '&#39;'};
            return String(value).replace(/[&<>"']/g, function(ch) { return entities[ch]; });
        }

        function setVMLoadingState(loading) {
            const loadingEl = document.getElementById('vm-loading');
            const listEl = document.getElementById('vm-list');
            const btn = document.getElementById('vm-refresh-btn');

            if (loading) {
                loadingEl.style.display = 'block';
                listEl.style.display = 'none';
                if (btn) btn.classList.add('loading');
            } else {
                loadingEl.style.display = 'none';
                listEl.style.display = 'block';
                if (btn) btn.classList.remove('loading');
            }
        }

        function showVMError(msg) {
            const errorEl = document.getElementById('vm-api-error');
            const errorMsgEl = document.getElementById('vm-error-msg');
            if (errorEl && errorMsgEl) {
                errorMsgEl.textContent = msg;
                errorEl.style.display = 'flex';  // Override any inline display:none
                errorEl.classList.add('show');
            }
        }

        function hideVMError() {
            const errorEl = document.getElementById('vm-api-error');
            if (errorEl) {
                errorEl.classList.remove('show');
                errorEl.style.display = 'none';  // Explicit hide as backup
            }
        }

        // User-triggered reload: shows the spinner and reports failures in the banner.
        async function refreshVMs() {
            if (isVMRefreshing) return;
            isVMRefreshing = true;
            setVMLoadingState(true);
            hideVMError();

            try {
                const response = await fetch('/api/vms?' + Date.now());
                if (!response.ok) throw new Error('HTTP ' + response.status);
                const vms = await response.json();
                if (vms.error) throw new Error(vms.error);

                renderVMs(vms);
                hideVMError();  // Hide error again after successful render
                vmErrorCount = 0;
                document.getElementById('vm-refresh-time').textContent =
                    'Updated ' + new Date().toLocaleTimeString();
            } catch (e) {
                console.error('VM refresh failed:', e);
                vmErrorCount++;
                showVMError(e.message || 'Connection failed');
            } finally {
                isVMRefreshing = false;
                setVMLoadingState(false);
            }
        }

        // Background poll: silent on failure, pauses after three consecutive exceptions.
        async function fetchVMs() {
            if (isVMRefreshing) return;
            if (vmErrorCount >= 3) {
                document.getElementById('vm-refresh-time').textContent = 'Polling paused';
                return;
            }

            try {
                const response = await fetch('/api/vms?' + Date.now());
                if (response.ok) {
                    const vms = await response.json();
                    if (!vms.error) {
                        renderVMs(vms);
                        hideVMError();
                        vmErrorCount = 0;
                        document.getElementById('vm-refresh-time').textContent =
                            'Updated ' + new Date().toLocaleTimeString();
                    }
                }
            } catch (e) {
                console.log('VM API unavailable:', e);
                vmErrorCount++;
            }
        }

        function copyToClipboard(text, btn) {
            navigator.clipboard.writeText(text).then(() => {
                const originalText = btn.textContent;
                btn.textContent = 'Copied!';
                setTimeout(() => { btn.textContent = originalText; }, 1500);
            }).catch((e) => {
                // Clipboard access can be denied; report it instead of leaving an unhandled rejection.
                console.error('Copy to clipboard failed:', e);
            });
        }

        function renderVMs(vms) {
            const container = document.getElementById('vm-list');

            if (!vms || vms.length === 0) {
                container.innerHTML = '<div class="no-vms"><div class="no-vms-icon">&#128187;</div>No VMs registered. Click "Add VM" to register one.</div>';
                return;
            }

            const html = vms.map(vm => {
                // Everything taken from the API response is escaped before it reaches innerHTML.
                const rawStatus = String(vm.status || 'unknown');
                const statusClass = escapeVMHtml(rawStatus);
                const statusText = escapeVMHtml(rawStatus.charAt(0).toUpperCase() + rawStatus.slice(1));
                const waaStatusClass = vm.waa_probe_status === 'ready' ? 'ready' :
                                       vm.waa_probe_status === 'checking' ? 'checking' : 'not-ready';
                const waaStatusIcon = vm.waa_probe_status === 'ready' ? '&#10003;' :
                                      vm.waa_probe_status === 'checking' ? '&#8987;' : '&#10007;';
                const waaStatusText = vm.waa_probe_status === 'ready' ? 'WAA Server Ready' :
                                      vm.waa_probe_status === 'not responding' ? 'WAA Not Responding' :
                                      vm.waa_probe_status === 'checking' ? 'Checking...' :
                                      vm.waa_probe_status === 'ssh failed' ? 'SSH Failed' : 'Unknown';

                // Use localhost for VNC (requires SSH tunnel: ssh -fN -L 8006:localhost:8006 user@vm-ip)
                const vncPort = escapeVMHtml(vm.vnc_port || 8006);
                const vncUrl = 'http://localhost:' + vncPort;
                const vmIp = escapeVMHtml(vm.ssh_host);
                const vncTunnelActive = !!(vm.tunnels && vm.tunnels.vnc && vm.tunnels.vnc.active);
                const waaTunnelActive = !!(vm.tunnels && vm.tunnels.waa && vm.tunnels.waa.active);

                return '<div class="vm-item">' +
                    '<div class="vm-item-header">' +
                        '<span class="vm-name">' + escapeVMHtml(vm.name || 'Unnamed VM') + '</span>' +
                        '<div class="vm-status-indicator ' + statusClass + '">' +
                            '<div class="vm-status-dot ' + statusClass + '"></div>' +
                            '<span>' + statusText + '</span>' +
                        '</div>' +
                    '</div>' +

                    // Prominent IP display
                    '<div class="vm-ip-display">' +
                        '<span class="vm-ip-label">IP Address:</span>' +
                        '<span class="vm-ip-value">' + vmIp + '</span>' +
                        // The address travels in data-ip, never inside a JavaScript string literal.
                        '<button class="vm-ip-copy" data-ip="' + vmIp + '" onclick="copyToClipboard(this.dataset.ip, this)">Copy</button>' +
                    '</div>' +

                    '<div class="vm-info">' +
                        '<div class="vm-info-item">' +
                            '<span class="vm-info-label">SSH:</span>' +
                            '<span class="vm-info-value">' + escapeVMHtml(vm.ssh_user || 'azureuser') + '@' + vmIp + '</span>' +
                        '</div>' +
                        '<div class="vm-info-item">' +
                            '<span class="vm-info-label">Container:</span>' +
                            '<span class="vm-info-value">' + escapeVMHtml(vm.docker_container || 'win11-waa') + '</span>' +
                        '</div>' +
                    '</div>' +

                    '<div class="vm-actions">' +
                        // Large prominent VNC button - uses localhost (SSH tunnel)
                        '<a href="' + vncUrl + '" target="_blank" class="vm-vnc-link' + (vncTunnelActive ? ' tunnel-active' : ' tunnel-inactive') + '">' +
                            '<span class="vnc-icon">&#128424;</span>' +
                            'Open VNC' +
                            '<span class="vnc-ip">localhost:' + vncPort + '</span>' +
                            (vncTunnelActive ? '<span class="tunnel-badge">&#10003; tunnel</span>' : '<span class="tunnel-badge tunnel-error">&#10007; no tunnel</span>') +
                        '</a>' +
                        '<div class="vm-waa-status ' + waaStatusClass + '">' +
                            waaStatusIcon + ' ' + waaStatusText +
                            (waaTunnelActive ? ' <span class="tunnel-mini">&#10003;</span>' : '') +
                        '</div>' +
                    '</div>' +

                    '<div class="vm-last-checked">' +
                        '<span>&#128337;</span> Last checked: ' + (vm.last_checked ? escapeVMHtml(new Date(vm.last_checked).toLocaleString()) : 'Never') +
                    '</div>' +
                '</div>';
            }).join('');

            container.innerHTML = html;
        }

        function toggleVMAddForm() {
            const form = document.getElementById('vm-add-form');
            form.classList.toggle('show');
        }

        async function submitVMRegistration() {
            const vmData = {
                name: document.getElementById('vm-name').value,
                ssh_host: document.getElementById('vm-ssh-host').value,
                ssh_user: document.getElementById('vm-ssh-user').value,
                vnc_port: parseInt(document.getElementById('vm-vnc-port').value, 10),
                waa_port: parseInt(document.getElementById('vm-waa-port').value, 10),
                docker_container: document.getElementById('vm-docker-container').value,
                internal_ip: document.getElementById('vm-internal-ip').value
            };

            // Basic validation
            if (!vmData.name || !vmData.ssh_host) {
                alert('Please fill in VM Name and SSH Host');
                return;
            }

            try {
                const response = await fetch('/api/vms/register', {
                    method: 'POST',
                    headers: {
                        'Content-Type': 'application/json'
                    },
                    body: JSON.stringify(vmData)
                });

                if (response.ok) {
                    const result = await response.json();
                    if (result.status === 'success') {
                        toggleVMAddForm();
                        // refreshVMs (not fetchVMs) so the list reloads even if polling had paused.
                        refreshVMs();
                        // Clear form
                        document.getElementById('vm-name').value = '';
                        document.getElementById('vm-ssh-host').value = '';
                    } else {
                        alert('Failed to register VM: ' + (result.message || 'Unknown error'));
                    }
                } else {
                    alert('Failed to register VM: Server error (HTTP ' + response.status + ')');
                }
            } catch (e) {
                alert('Failed to register VM: ' + e.message);
            }
        }

        // Initial fetch and poll every 10 seconds
        fetchVMs();
        setInterval(fetchVMs, 10000);
    </script>
    '''
2575
+
2576
+
2577
def _get_run_benchmark_panel_css() -> str:
    """Build the stylesheet for the Run Benchmark configuration panel.

    The rules style the panel shell and title, the responsive form grid, the
    task-selection radio group, the optional custom-model input, the
    "Start Run" button (including its ``loading`` state) and the status line
    with its ``error`` / ``success`` variants.

    The rules reference the CSS custom properties ``--text-primary``,
    ``--text-secondary`` and ``--border-color`` and the ``spin`` animation;
    none of these is defined here, so the embedding page has to provide them.

    Returns:
        The raw CSS text, without a surrounding ``<style>`` element.
    """
    benchmark_panel_css = '''
    .run-benchmark-panel {
        background: linear-gradient(135deg, rgba(16, 185, 129, 0.1) 0%, rgba(16, 185, 129, 0.05) 100%);
        border: 1px solid rgba(16, 185, 129, 0.3);
        border-radius: 12px;
        padding: 20px 24px;
        margin-bottom: 24px;
    }
    .run-benchmark-header {
        display: flex;
        align-items: center;
        justify-content: space-between;
        margin-bottom: 16px;
    }
    .run-benchmark-title {
        display: flex;
        align-items: center;
        gap: 10px;
        font-size: 1rem;
        font-weight: 600;
        color: #10b981;
    }
    .run-benchmark-title svg {
        width: 20px;
        height: 20px;
    }
    .run-benchmark-form {
        display: grid;
        grid-template-columns: repeat(auto-fit, minmax(200px, 1fr));
        gap: 16px;
        margin-bottom: 16px;
    }
    .form-group {
        display: flex;
        flex-direction: column;
        gap: 6px;
    }
    .form-group label {
        font-size: 0.8rem;
        color: var(--text-secondary);
        font-weight: 500;
    }
    .form-group select,
    .form-group input[type="text"],
    .form-group input[type="number"] {
        padding: 8px 12px;
        background: rgba(0, 0, 0, 0.3);
        border: 1px solid var(--border-color);
        border-radius: 6px;
        color: var(--text-primary);
        font-size: 0.9rem;
    }
    .form-group select:focus,
    .form-group input:focus {
        outline: none;
        border-color: #10b981;
    }
    .task-selection-group {
        grid-column: 1 / -1;
        display: flex;
        flex-direction: column;
        gap: 10px;
        padding: 12px 16px;
        background: rgba(0, 0, 0, 0.2);
        border-radius: 8px;
    }
    .task-selection-group-label {
        font-size: 0.8rem;
        color: var(--text-secondary);
        font-weight: 500;
        margin-bottom: 4px;
    }
    .task-selection-option {
        display: flex;
        align-items: center;
        gap: 10px;
    }
    .task-selection-option input[type="radio"] {
        accent-color: #10b981;
    }
    .task-selection-option label {
        font-size: 0.85rem;
        color: var(--text-primary);
        cursor: pointer;
    }
    .task-selection-option select,
    .task-selection-option input[type="text"] {
        padding: 6px 10px;
        background: rgba(0, 0, 0, 0.3);
        border: 1px solid var(--border-color);
        border-radius: 4px;
        color: var(--text-primary);
        font-size: 0.85rem;
        flex: 1;
        max-width: 200px;
    }
    .task-selection-option select:disabled,
    .task-selection-option input:disabled {
        opacity: 0.5;
        cursor: not-allowed;
    }
    .custom-model-input {
        display: none;
        margin-top: 8px;
    }
    .custom-model-input.show {
        display: block;
    }
    .start-btn {
        padding: 10px 20px;
        background: linear-gradient(135deg, #10b981, #059669);
        border: none;
        border-radius: 8px;
        color: white;
        font-weight: 600;
        cursor: pointer;
        transition: all 0.2s;
        display: flex;
        align-items: center;
        gap: 8px;
    }
    .start-btn:hover:not(:disabled) {
        transform: translateY(-2px);
        box-shadow: 0 4px 12px rgba(16, 185, 129, 0.3);
    }
    .start-btn:disabled {
        opacity: 0.5;
        cursor: not-allowed;
    }
    .start-btn .spinner {
        display: none;
        width: 14px;
        height: 14px;
        border: 2px solid rgba(255,255,255,0.3);
        border-top-color: white;
        border-radius: 50%;
        animation: spin 0.8s linear infinite;
    }
    .start-btn.loading .spinner {
        display: inline-block;
    }
    .start-btn.loading .start-icon {
        display: none;
    }
    .run-benchmark-status {
        margin-top: 12px;
        padding: 10px 14px;
        background: rgba(0, 0, 0, 0.2);
        border-radius: 6px;
        font-size: 0.85rem;
        color: var(--text-secondary);
        display: none;
    }
    .run-benchmark-status.show {
        display: block;
    }
    .run-benchmark-status.error {
        background: rgba(239, 68, 68, 0.15);
        color: #fca5a5;
        border: 1px solid rgba(239, 68, 68, 0.3);
    }
    .run-benchmark-status.success {
        background: rgba(16, 185, 129, 0.15);
        color: #6ee7b7;
        border: 1px solid rgba(16, 185, 129, 0.3);
    }
    '''
    return benchmark_panel_css
2746
+
2747
+
2748
def _get_run_benchmark_panel_html() -> str:
    """Build the markup for the Run Benchmark configuration panel.

    The fragment holds a header with the "Start Run" button and a form with a
    model selector (plus a hidden custom-model text input), a task-count
    field, an agent selector and a three-way task-selection radio group
    (all tasks / one domain / explicit task IDs), followed by an empty status
    line.

    The inline handlers call ``startBenchmarkRun()``, ``handleModelChange()``
    and ``updateTaskSelectionState()``. None of them is defined in this
    fragment, so the page must load the matching script as well.

    Returns:
        The HTML fragment; it contains no ``<script>`` element.
    """
    benchmark_panel_html = '''
    <div class="run-benchmark-panel" id="run-benchmark-panel">
        <div class="run-benchmark-header">
            <div class="run-benchmark-title">
                <svg viewBox="0 0 24 24" fill="currentColor">
                    <path d="M8 5v14l11-7z"/>
                </svg>
                Run Benchmark
            </div>
            <button class="start-btn" id="start-benchmark-btn" onclick="startBenchmarkRun()">
                <span class="start-icon">&#9654;</span>
                <span class="spinner"></span>
                Start Run
            </button>
        </div>

        <div class="run-benchmark-form">
            <div class="form-group">
                <label for="benchmark-model">Model</label>
                <select id="benchmark-model" onchange="handleModelChange()">
                    <option value="gpt-4o">GPT-4o</option>
                    <option value="gpt-4o-mini">GPT-4o-mini</option>
                    <option value="claude-sonnet-4-5-20250929">Claude Sonnet 4.5</option>
                    <option value="claude-opus-4-5-20251101">Claude Opus 4.5</option>
                    <option value="custom">Custom...</option>
                </select>
                <div class="custom-model-input" id="custom-model-container">
                    <input type="text" id="custom-model-id" placeholder="Enter model ID (e.g., gpt-4-turbo)">
                </div>
            </div>

            <div class="form-group">
                <label for="benchmark-tasks">Number of Tasks</label>
                <input type="number" id="benchmark-tasks" value="5" min="1" max="154">
            </div>

            <div class="form-group">
                <label for="benchmark-agent">Agent</label>
                <select id="benchmark-agent">
                    <option value="navi">Navi (default)</option>
                    <option value="som">Set-of-Marks</option>
                    <option value="random">Random (baseline)</option>
                </select>
            </div>

            <div class="task-selection-group">
                <div class="task-selection-group-label">Task Selection</div>

                <div class="task-selection-option">
                    <input type="radio" id="task-selection-all" name="task-selection" value="all" checked onchange="updateTaskSelectionState()">
                    <label for="task-selection-all">All tasks (154 total, random selection)</label>
                </div>

                <div class="task-selection-option">
                    <input type="radio" id="task-selection-domain" name="task-selection" value="domain" onchange="updateTaskSelectionState()">
                    <label for="task-selection-domain">Domain:</label>
                    <select id="benchmark-domain" disabled>
                        <option value="general">General</option>
                        <option value="office">Office</option>
                        <option value="web">Web</option>
                        <option value="coding">Coding</option>
                        <option value="system">System</option>
                        <option value="creative">Creative</option>
                        <option value="data">Data</option>
                        <option value="communication">Communication</option>
                        <option value="media">Media</option>
                        <option value="gaming">Gaming</option>
                        <option value="utility">Utility</option>
                    </select>
                </div>

                <div class="task-selection-option">
                    <input type="radio" id="task-selection-ids" name="task-selection" value="task_ids" onchange="updateTaskSelectionState()">
                    <label for="task-selection-ids">Task IDs:</label>
                    <input type="text" id="benchmark-task-ids" placeholder="e.g., task_001, task_015, task_042" disabled>
                </div>
            </div>
        </div>

        <div class="run-benchmark-status" id="run-benchmark-status"></div>
    </div>
    '''
    return benchmark_panel_html
2832
+
2833
+
2834
+ def _get_run_benchmark_panel_js(include_script_tags: bool = True) -> str:
2835
+ """Return JavaScript for the Run Benchmark panel form handling and API calls.
2836
+
2837
+ Args:
2838
+ include_script_tags: If True, wrap JS in <script> tags. Set to False when
2839
+ inserting into an existing script block.
2840
+ """
2841
+ js_code = '''
2842
+ // Handle model dropdown change to show/hide custom input
2843
+ function handleModelChange() {
2844
+ const select = document.getElementById('benchmark-model');
2845
+ const customContainer = document.getElementById('custom-model-container');
2846
+ if (select.value === 'custom') {
2847
+ customContainer.classList.add('show');
2848
+ } else {
2849
+ customContainer.classList.remove('show');
2850
+ }
2851
+ }
2852
+
2853
+ // Enable/disable task selection inputs based on radio selection
2854
+ function updateTaskSelectionState() {
2855
+ const allRadio = document.getElementById('task-selection-all');
2856
+ const domainRadio = document.getElementById('task-selection-domain');
2857
+ const idsRadio = document.getElementById('task-selection-ids');
2858
+ const domainSelect = document.getElementById('benchmark-domain');
2859
+ const taskIdsInput = document.getElementById('benchmark-task-ids');
2860
+
2861
+ domainSelect.disabled = !domainRadio.checked;
2862
+ taskIdsInput.disabled = !idsRadio.checked;
2863
+ }
2864
+
2865
+ // Show status message
2866
+ function showBenchmarkStatus(message, type) {
2867
+ const statusEl = document.getElementById('run-benchmark-status');
2868
+ statusEl.textContent = message;
2869
+ statusEl.className = 'run-benchmark-status show ' + (type || '');
2870
+ }
2871
+
2872
+ // Hide status message
2873
+ function hideBenchmarkStatus() {
2874
+ const statusEl = document.getElementById('run-benchmark-status');
2875
+ statusEl.classList.remove('show');
2876
+ }
2877
+
2878
+ // Start benchmark run
2879
+ async function startBenchmarkRun() {
2880
+ const btn = document.getElementById('start-benchmark-btn');
2881
+
2882
+ // Build params object
2883
+ const modelSelect = document.getElementById('benchmark-model');
2884
+ let model = modelSelect.value;
2885
+ if (model === 'custom') {
2886
+ model = document.getElementById('custom-model-id').value.trim();
2887
+ if (!model) {
2888
+ showBenchmarkStatus('Please enter a custom model ID', 'error');
2889
+ return;
2890
+ }
2891
+ }
2892
+
2893
+ const numTasks = parseInt(document.getElementById('benchmark-tasks').value);
2894
+ if (isNaN(numTasks) || numTasks < 1 || numTasks > 154) {
2895
+ showBenchmarkStatus('Number of tasks must be between 1 and 154', 'error');
2896
+ return;
2897
+ }
2898
+
2899
+ const agent = document.getElementById('benchmark-agent').value;
2900
+
2901
+ // Get task selection
2902
+ const taskSelection = document.querySelector('input[name="task-selection"]:checked').value;
2903
+
2904
+ const params = {
2905
+ model: model,
2906
+ num_tasks: numTasks,
2907
+ agent: agent,
2908
+ task_selection: taskSelection
2909
+ };
2910
+
2911
+ if (taskSelection === 'domain') {
2912
+ params.domain = document.getElementById('benchmark-domain').value;
2913
+ } else if (taskSelection === 'task_ids') {
2914
+ const taskIdsStr = document.getElementById('benchmark-task-ids').value.trim();
2915
+ if (!taskIdsStr) {
2916
+ showBenchmarkStatus('Please enter task IDs', 'error');
2917
+ return;
2918
+ }
2919
+ params.task_ids = taskIdsStr.split(',').map(id => id.trim()).filter(id => id);
2920
+ if (params.task_ids.length === 0) {
2921
+ showBenchmarkStatus('Please enter valid task IDs', 'error');
2922
+ return;
2923
+ }
2924
+ }
2925
+
2926
+ // Disable button and show loading state
2927
+ btn.disabled = true;
2928
+ btn.classList.add('loading');
2929
+ hideBenchmarkStatus();
2930
+
2931
+ try {
2932
+ const response = await fetch('/api/benchmark/start', {
2933
+ method: 'POST',
2934
+ headers: {'Content-Type': 'application/json'},
2935
+ body: JSON.stringify(params)
2936
+ });
2937
+
2938
+ const result = await response.json();
2939
+
2940
+ if (response.ok && result.status === 'started') {
2941
+ showBenchmarkStatus('Benchmark started! Model: ' + params.model + ', Tasks: ' + params.num_tasks + '. Check progress in Background Tasks section below.', 'success');
2942
+ // Refresh background tasks to show new benchmark
2943
+ if (typeof refreshBackgroundTasks === 'function') {
2944
+ setTimeout(refreshBackgroundTasks, 1000);
2945
+ }
2946
+ } else {
2947
+ throw new Error(result.error || result.message || 'Failed to start benchmark');
2948
+ }
2949
+ } catch (e) {
2950
+ console.error('Failed to start benchmark:', e);
2951
+ showBenchmarkStatus('Error: ' + e.message, 'error');
2952
+ btn.disabled = false;
2953
+ btn.classList.remove('loading');
2954
+ }
2955
+ }
2956
+
2957
+ // Initialize on load
2958
+ document.addEventListener('DOMContentLoaded', function() {
2959
+ updateTaskSelectionState();
2960
+ });
2961
+ '''
2962
+ if include_script_tags:
2963
+ return f'<script>{js_code}</script>'
2964
+ return js_code
2965
+
2966
+
13
2967
  def generate_benchmark_viewer(
14
2968
  benchmark_dir: Path | str,
15
2969
  output_path: Path | str | None = None,
@@ -217,6 +3171,16 @@ def generate_empty_benchmark_viewer(output_path: Path | str) -> Path:
217
3171
 
218
3172
  shared_header_css = _get_shared_header_css()
219
3173
  shared_header_html = _generate_shared_header_html("benchmarks")
3174
+ # NOTE: Azure ML Jobs panel moved to Training tab (not used for WAA benchmarks)
3175
+ run_benchmark_css = _get_run_benchmark_panel_css()
3176
+ run_benchmark_html = _get_run_benchmark_panel_html()
3177
+ run_benchmark_js = _get_run_benchmark_panel_js()
3178
+ tasks_css = _get_background_tasks_panel_css()
3179
+ tasks_html = _get_background_tasks_panel_html()
3180
+ live_eval_css = _get_live_evaluation_panel_css()
3181
+ live_eval_html = _get_live_evaluation_panel_html()
3182
+ vm_discovery_css = _get_vm_discovery_panel_css()
3183
+ vm_discovery_html = _get_vm_discovery_panel_html()
220
3184
 
221
3185
  html = f'''<!DOCTYPE html>
222
3186
  <html lang="en">
@@ -244,6 +3208,15 @@ def generate_empty_benchmark_viewer(output_path: Path | str) -> Path:
244
3208
  min-height: 100vh;
245
3209
  }}
246
3210
  {shared_header_css}
3211
+ {run_benchmark_css}
3212
+ {tasks_css}
3213
+ {live_eval_css}
3214
+ {vm_discovery_css}
3215
+ .container {{
3216
+ max-width: 900px;
3217
+ margin: 0 auto;
3218
+ padding: 24px;
3219
+ }}
247
3220
  .empty-state {{
248
3221
  display: flex;
249
3222
  flex-direction: column;
@@ -311,6 +3284,15 @@ def generate_empty_benchmark_viewer(output_path: Path | str) -> Path:
311
3284
  <body>
312
3285
  {shared_header_html}
313
3286
 
3287
+ <div class="container">
3288
+ {run_benchmark_html}
3289
+ {live_eval_html}
3290
+ {tasks_html}
3291
+ {vm_discovery_html}
3292
+ </div>
3293
+
3294
+ {run_benchmark_js}
3295
+
314
3296
  <div class="empty-state">
315
3297
  <div class="empty-icon">🚧</div>
316
3298
  <h1 class="empty-title">Windows Agent Arena Integration</h1>
@@ -669,12 +3651,89 @@ def _generate_benchmark_viewer_html(
669
3651
  margin-bottom: 16px;
670
3652
  opacity: 0.5;
671
3653
  }}
3654
+
3655
+ .mock-banner {{
3656
+ background: linear-gradient(135deg, rgba(255, 152, 0, 0.2) 0%, rgba(255, 87, 34, 0.2) 100%);
3657
+ border: 2px solid #ff9800;
3658
+ border-radius: 12px;
3659
+ padding: 20px 24px;
3660
+ margin-bottom: 24px;
3661
+ display: flex;
3662
+ align-items: center;
3663
+ gap: 16px;
3664
+ }}
3665
+
3666
+ .mock-banner-icon {{
3667
+ font-size: 2rem;
3668
+ flex-shrink: 0;
3669
+ }}
3670
+
3671
+ .mock-banner-content {{
3672
+ flex: 1;
3673
+ }}
3674
+
3675
+ .mock-banner-title {{
3676
+ font-size: 1.1rem;
3677
+ font-weight: 700;
3678
+ color: #ff9800;
3679
+ margin-bottom: 6px;
3680
+ }}
3681
+
3682
+ .mock-banner-text {{
3683
+ font-size: 0.9rem;
3684
+ color: var(--text-secondary);
3685
+ line-height: 1.5;
3686
+ }}
3687
+
3688
+ .run-badge {{
3689
+ display: inline-flex;
3690
+ align-items: center;
3691
+ gap: 8px;
3692
+ padding: 8px 16px;
3693
+ border-radius: 8px;
3694
+ font-size: 0.85rem;
3695
+ font-weight: 600;
3696
+ margin-bottom: 24px;
3697
+ }}
3698
+
3699
+ .run-badge.mock {{
3700
+ background: linear-gradient(135deg, rgba(255, 152, 0, 0.2) 0%, rgba(255, 87, 34, 0.2) 100%);
3701
+ border: 1px solid #ff9800;
3702
+ color: #ffb74d;
3703
+ }}
3704
+
3705
+ .run-badge.real {{
3706
+ background: linear-gradient(135deg, rgba(0, 212, 170, 0.2) 0%, rgba(0, 150, 136, 0.2) 100%);
3707
+ border: 1px solid var(--success);
3708
+ color: var(--success);
3709
+ }}
3710
+
3711
+ .run-badge-icon {{
3712
+ font-size: 1rem;
3713
+ }}
672
3714
  </style>
673
3715
  </head>
674
3716
  <body>
675
3717
  {shared_header_html}
676
3718
 
677
3719
  <div class="container">
3720
+ <div id="mock-banner" class="mock-banner" style="display: none;">
3721
+ <div class="mock-banner-icon">WARNING</div>
3722
+ <div class="mock-banner-content">
3723
+ <div class="mock-banner-title">Mock Data - Simulated Results Only</div>
3724
+ <div class="mock-banner-text">
3725
+ This benchmark run uses simulated mock data for pipeline testing and development.
3726
+ These results do NOT represent actual Windows Agent Arena evaluation performance.
3727
+ To run real WAA evaluation, use: <code>uv run python -m openadapt_ml.benchmarks.cli run-local</code> or <code>run-azure</code>
3728
+ </div>
3729
+ </div>
3730
+ </div>
3731
+
3732
+ <div id="run-badge" class="run-badge" style="display: none;">
3733
+ <span class="run-badge-icon"></span>
3734
+ <span class="run-badge-text"></span>
3735
+ </div>
3736
+
678
3737
  <div class="summary-cards">
679
3738
  <div class="summary-card">
680
3739
  <div class="label">Total Tasks</div>
@@ -730,8 +3789,40 @@ def _generate_benchmark_viewer_html(
730
3789
  domain: 'all'
731
3790
  }};
732
3791
 
3792
+ // Detect mock vs real run and show appropriate badges
3793
+ function detectAndShowRunType() {{
3794
+ const isMock = metadata.benchmark_name && metadata.benchmark_name.includes('mock');
3795
+ const badge = document.getElementById('run-badge');
3796
+ const banner = document.getElementById('mock-banner');
3797
+ const badgeIcon = badge.querySelector('.run-badge-icon');
3798
+ const badgeText = badge.querySelector('.run-badge-text');
3799
+
3800
+ if (isMock) {{
3801
+ // Show mock warning badge
3802
+ badge.classList.add('mock');
3803
+ badge.classList.remove('real');
3804
+ badgeIcon.textContent = '⚠️';
3805
+ badgeText.textContent = 'MOCK DATA - Simulated results for pipeline testing';
3806
+ badge.style.display = 'inline-flex';
3807
+
3808
+ // Show mock banner
3809
+ banner.style.display = 'flex';
3810
+ }} else {{
3811
+ // Show real evaluation badge
3812
+ badge.classList.add('real');
3813
+ badge.classList.remove('mock');
3814
+ badgeIcon.textContent = '✓';
3815
+ badgeText.textContent = 'REAL - Actual Windows Agent Arena evaluation';
3816
+ badge.style.display = 'inline-flex';
3817
+
3818
+ // Hide mock banner
3819
+ banner.style.display = 'none';
3820
+ }}
3821
+ }}
3822
+
733
3823
  // Initialize
734
3824
  function init() {{
3825
+ detectAndShowRunType();
735
3826
  updateSummaryCards();
736
3827
  populateDomainFilter();
737
3828
  renderTaskList();
@@ -916,6 +4007,18 @@ def _generate_multi_run_benchmark_viewer_html(
916
4007
  Returns:
917
4008
  Complete HTML string
918
4009
  """
4010
+ # NOTE: Azure ML Jobs panel moved to Training tab (not used for WAA benchmarks)
4011
+ run_benchmark_css = _get_run_benchmark_panel_css()
4012
+ run_benchmark_html = _get_run_benchmark_panel_html()
4013
+ # Use include_script_tags=False since we insert into existing script block
4014
+ run_benchmark_js = _get_run_benchmark_panel_js(include_script_tags=False)
4015
+ tasks_css = _get_background_tasks_panel_css()
4016
+ tasks_html = _get_background_tasks_panel_html()
4017
+ live_eval_css = _get_live_evaluation_panel_css()
4018
+ live_eval_html = _get_live_evaluation_panel_html()
4019
+ vm_discovery_css = _get_vm_discovery_panel_css()
4020
+ vm_discovery_html = _get_vm_discovery_panel_html()
4021
+
919
4022
  # Prepare runs data as JSON
920
4023
  runs_json = json.dumps(runs)
921
4024
 
@@ -974,6 +4077,10 @@ def _generate_multi_run_benchmark_viewer_html(
974
4077
  }}
975
4078
 
976
4079
  {shared_header_css}
4080
+ {run_benchmark_css}
4081
+ {tasks_css}
4082
+ {live_eval_css}
4083
+ {vm_discovery_css}
977
4084
 
978
4085
  .run-selector-section {{
979
4086
  background: var(--bg-secondary);
@@ -1266,12 +4373,89 @@ def _generate_multi_run_benchmark_viewer_html(
1266
4373
  margin-bottom: 16px;
1267
4374
  opacity: 0.5;
1268
4375
  }}
4376
+
4377
+ .mock-banner {{
4378
+ background: linear-gradient(135deg, rgba(255, 152, 0, 0.2) 0%, rgba(255, 87, 34, 0.2) 100%);
4379
+ border: 2px solid #ff9800;
4380
+ border-radius: 12px;
4381
+ padding: 20px 24px;
4382
+ margin-bottom: 24px;
4383
+ display: flex;
4384
+ align-items: center;
4385
+ gap: 16px;
4386
+ }}
4387
+
4388
+ .mock-banner-icon {{
4389
+ font-size: 2rem;
4390
+ flex-shrink: 0;
4391
+ }}
4392
+
4393
+ .mock-banner-content {{
4394
+ flex: 1;
4395
+ }}
4396
+
4397
+ .mock-banner-title {{
4398
+ font-size: 1.1rem;
4399
+ font-weight: 700;
4400
+ color: #ff9800;
4401
+ margin-bottom: 6px;
4402
+ }}
4403
+
4404
+ .mock-banner-text {{
4405
+ font-size: 0.9rem;
4406
+ color: var(--text-secondary);
4407
+ line-height: 1.5;
4408
+ }}
4409
+
4410
+ .run-badge {{
4411
+ display: inline-flex;
4412
+ align-items: center;
4413
+ gap: 8px;
4414
+ padding: 8px 16px;
4415
+ border-radius: 8px;
4416
+ font-size: 0.85rem;
4417
+ font-weight: 600;
4418
+ margin-bottom: 24px;
4419
+ }}
4420
+
4421
+ .run-badge.mock {{
4422
+ background: linear-gradient(135deg, rgba(255, 152, 0, 0.2) 0%, rgba(255, 87, 34, 0.2) 100%);
4423
+ border: 1px solid #ff9800;
4424
+ color: #ffb74d;
4425
+ }}
4426
+
4427
+ .run-badge.real {{
4428
+ background: linear-gradient(135deg, rgba(0, 212, 170, 0.2) 0%, rgba(0, 150, 136, 0.2) 100%);
4429
+ border: 1px solid var(--success);
4430
+ color: var(--success);
4431
+ }}
4432
+
4433
+ .run-badge-icon {{
4434
+ font-size: 1rem;
4435
+ }}
1269
4436
  </style>
1270
4437
  </head>
1271
4438
  <body>
1272
4439
  {shared_header_html}
1273
4440
 
1274
4441
  <div class="container">
4442
+ {run_benchmark_html}
4443
+ {live_eval_html}
4444
+ {tasks_html}
4445
+ {vm_discovery_html}
4446
+
4447
+ <div id="mock-banner" class="mock-banner" style="display: none;">
4448
+ <div class="mock-banner-icon">WARNING</div>
4449
+ <div class="mock-banner-content">
4450
+ <div class="mock-banner-title">Mock Data - Simulated Results Only</div>
4451
+ <div class="mock-banner-text">
4452
+ This benchmark run uses simulated mock data for pipeline testing and development.
4453
+ These results do NOT represent actual Windows Agent Arena evaluation performance.
4454
+ To run real WAA evaluation, use: <code>uv run python -m openadapt_ml.benchmarks.cli run-local</code> or <code>run-azure</code>
4455
+ </div>
4456
+ </div>
4457
+ </div>
4458
+
1275
4459
  <div class="run-selector-section">
1276
4460
  <span class="run-selector-label">Benchmark Run:</span>
1277
4461
  <select id="run-selector">
@@ -1279,6 +4463,11 @@ def _generate_multi_run_benchmark_viewer_html(
1279
4463
  </select>
1280
4464
  </div>
1281
4465
 
4466
+ <div id="run-badge" class="run-badge" style="display: none;">
4467
+ <span class="run-badge-icon"></span>
4468
+ <span class="run-badge-text"></span>
4469
+ </div>
4470
+
1282
4471
  <div class="summary-cards">
1283
4472
  <div class="summary-card">
1284
4473
  <div class="label">Total Tasks</div>
@@ -1346,6 +4535,38 @@ def _generate_multi_run_benchmark_viewer_html(
1346
4535
  return getCurrentRun().summary;
1347
4536
  }}
1348
4537
 
4538
+ // Detect mock vs real run and show appropriate badges
4539
+ function detectAndShowRunType() {{
4540
+ const currentRun = getCurrentRun();
4541
+ const isMock = currentRun.benchmark_name && currentRun.benchmark_name.includes('mock');
4542
+ const badge = document.getElementById('run-badge');
4543
+ const banner = document.getElementById('mock-banner');
4544
+ const badgeIcon = badge.querySelector('.run-badge-icon');
4545
+ const badgeText = badge.querySelector('.run-badge-text');
4546
+
4547
+ if (isMock) {{
4548
+ // Show mock warning badge
4549
+ badge.classList.add('mock');
4550
+ badge.classList.remove('real');
4551
+ badgeIcon.textContent = '⚠️';
4552
+ badgeText.textContent = 'MOCK DATA - Simulated results for pipeline testing';
4553
+ badge.style.display = 'inline-flex';
4554
+
4555
+ // Show mock banner
4556
+ banner.style.display = 'flex';
4557
+ }} else {{
4558
+ // Show real evaluation badge
4559
+ badge.classList.add('real');
4560
+ badge.classList.remove('mock');
4561
+ badgeIcon.textContent = '✓';
4562
+ badgeText.textContent = 'REAL - Actual Windows Agent Arena evaluation';
4563
+ badge.style.display = 'inline-flex';
4564
+
4565
+ // Hide mock banner
4566
+ banner.style.display = 'none';
4567
+ }}
4568
+ }}
4569
+
1349
4570
  // Initialize
1350
4571
  function init() {{
1351
4572
  populateDomainFilter();
@@ -1369,6 +4590,7 @@ def _generate_multi_run_benchmark_viewer_html(
1369
4590
  }}
1370
4591
 
1371
4592
  function updateDisplay() {{
4593
+ detectAndShowRunType();
1372
4594
  updateSummaryCards();
1373
4595
  renderTaskList();
1374
4596
  }}
@@ -1529,6 +4751,9 @@ def _generate_multi_run_benchmark_viewer_html(
1529
4751
  return parts.length > 0 ? parts.join(', ') : 'No details';
1530
4752
  }}
1531
4753
 
4754
+ // Run Benchmark panel functionality
4755
+ {run_benchmark_js}
4756
+
1532
4757
  // Initialize on page load
1533
4758
  init();
1534
4759
  </script>