quickdistill 0.1.5__py3-none-any.whl → 0.1.7__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -316,6 +316,22 @@
316
316
  Manage Judges
317
317
  </a>
318
318
 
319
+ <button id="open-test-judge-btn" style="padding: 8px 16px; background: #6a4a7e; color: white; border: none; border-radius: 4px; cursor: pointer;">
320
+ Test Judges
321
+ </button>
322
+
323
+ <button id="open-settings-btn" style="padding: 8px 16px; background: #5a5a5a; color: white; border: none; border-radius: 4px; cursor: pointer;">
324
+ Settings
325
+ </button>
326
+
327
+ <div style="margin: 20px 0; padding: 15px; background: #2a1a2a; border-radius: 8px; border: 1px solid #4a2a4a;">
328
+ <div style="color: #aaa; font-size: 13px; margin-bottom: 10px;">Automatic Workflow:</div>
329
+ <button id="open-e2e-btn" style="padding: 10px 20px; background: #7a4a9e; color: white; border: none; border-radius: 4px; cursor: pointer; font-weight: 500;">
330
+ ⚡ Run End-to-End Test
331
+ </button>
332
+ <div style="color: #666; font-size: 11px; margin-top: 8px;">Export → Generate → Evaluate (all in one)</div>
333
+ </div>
334
+
319
335
  <div class="stats">
320
336
  <div>Total: <span id="total-count">0</span></div>
321
337
  <div>Shown: <span id="shown-count">0</span></div>
@@ -375,6 +391,9 @@
375
391
 
376
392
  <div id="inference-progress" style="display: none; margin-top: 20px; padding: 15px; background: #0f0f0f; border-radius: 4px;">
377
393
  <div style="color: #4a9eff; margin-bottom: 10px;">Running inference...</div>
394
+ <div id="inference-progress-bar" style="width: 100%; height: 6px; background: #2a2a2a; border-radius: 3px; margin-bottom: 15px; overflow: hidden;">
395
+ <div id="inference-progress-fill" style="height: 100%; background: #4a9eff; width: 0%; transition: width 0.3s;"></div>
396
+ </div>
378
397
  <div id="progress-text" style="color: #888; font-family: monospace; font-size: 12px; white-space: pre-wrap;"></div>
379
398
  </div>
380
399
  </div>
@@ -424,6 +443,193 @@
424
443
  </div>
425
444
  </div>
426
445
  </div>
446
+
447
+ <!-- Settings Panel -->
448
+ <div id="settings-panel" style="display: none; position: fixed; top: 0; left: 0; width: 100%; height: 100%; background: rgba(0,0,0,0.8); z-index: 1000; padding: 40px;">
449
+ <div style="max-width: 600px; margin: 0 auto; background: #1a1a1a; border-radius: 8px; padding: 30px; border: 1px solid #2a2a2a;">
450
+ <h2 style="color: #fff; margin-bottom: 20px;">Settings</h2>
451
+
452
+ <div style="margin-bottom: 20px;">
453
+ <label style="display: block; color: #aaa; margin-bottom: 8px; font-size: 14px;">W&B Inference Project</label>
454
+ <input type="text" id="settings-inference-project" placeholder="e.g., wandb_fc/quickstart_playground"
455
+ style="width: 100%; padding: 10px; background: #2a2a2a; color: #fff; border: 1px solid #3a3a3a; border-radius: 4px; font-size: 14px;">
456
+ <div style="color: #666; font-size: 12px; margin-top: 5px;">Used for running weak model inference</div>
457
+ </div>
458
+
459
+ <div style="margin-bottom: 30px;">
460
+ <label style="display: block; color: #aaa; margin-bottom: 8px; font-size: 14px;">W&B Evaluation Project</label>
461
+ <input type="text" id="settings-evaluation-project" placeholder="e.g., wandb_inference"
462
+ style="width: 100%; padding: 10px; background: #2a2a2a; color: #fff; border: 1px solid #3a3a3a; border-radius: 4px; font-size: 14px;">
463
+ <div style="color: #666; font-size: 12px; margin-top: 5px;">Used for logging evaluation results with Weave</div>
464
+ </div>
465
+
466
+ <div style="display: flex; gap: 10px; justify-content: flex-end;">
467
+ <button id="close-settings-btn" style="padding: 10px 20px; background: #5a2a2a; color: white; border: none; border-radius: 4px; cursor: pointer;">
468
+ Cancel
469
+ </button>
470
+ <button id="save-settings-btn" style="padding: 10px 20px; background: #2a7c4a; color: white; border: none; border-radius: 4px; cursor: pointer;">
471
+ Save Settings
472
+ </button>
473
+ </div>
474
+ </div>
475
+ </div>
476
+
477
+ <!-- Test Judges Panel -->
478
+ <div id="test-judge-panel" style="display: none; position: fixed; top: 0; left: 0; width: 100%; height: 100%; background: rgba(0,0,0,0.8); z-index: 1000; padding: 40px; overflow-y: auto;">
479
+ <div style="max-width: 1000px; margin: 0 auto; background: #1a1a1a; border-radius: 8px; padding: 30px; border: 1px solid #3a2a4a;">
480
+ <h2 style="color: #fff; margin-bottom: 10px;">Test Judge</h2>
481
+ <p style="color: #888; font-size: 13px; margin-bottom: 25px;">
482
+ Test your judge on sample data to see exactly what inputs/outputs it receives
483
+ </p>
484
+
485
+ <!-- Configuration -->
486
+ <div style="display: grid; grid-template-columns: 1fr 1fr; gap: 20px; margin-bottom: 25px;">
487
+ <div>
488
+ <label style="display: block; color: #aaa; margin-bottom: 8px; font-size: 14px;">Select Judge:</label>
489
+ <select id="test-judge-select" style="width: 100%; padding: 10px; background: #2a2a2a; color: #fff; border: 1px solid #3a3a3a; border-radius: 4px; font-size: 14px;">
490
+ <option value="">Loading judges...</option>
491
+ </select>
492
+ </div>
493
+
494
+ <div>
495
+ <label style="display: block; color: #aaa; margin-bottom: 8px; font-size: 14px;">Weak Model Data:</label>
496
+ <select id="test-weak-model-select" style="width: 100%; padding: 10px; background: #2a2a2a; color: #fff; border: 1px solid #3a3a3a; border-radius: 4px; font-size: 14px;">
497
+ <option value="">Loading weak model files...</option>
498
+ </select>
499
+ </div>
500
+ </div>
501
+
502
+ <div style="margin-bottom: 20px;">
503
+ <label style="display: block; color: #aaa; margin-bottom: 8px; font-size: 14px;">Number of Samples:</label>
504
+ <input type="number" id="test-num-samples" value="5" min="1" max="50"
505
+ style="width: 150px; padding: 10px; background: #2a2a2a; color: #fff; border: 1px solid #3a3a3a; border-radius: 4px; font-size: 14px;">
506
+ <span style="color: #666; font-size: 12px; margin-left: 10px;">Max: 50</span>
507
+ </div>
508
+
509
+ <!-- Judge Model -->
510
+ <div style="margin-bottom: 20px;">
511
+ <label style="display: block; color: #aaa; margin-bottom: 8px; font-size: 14px;">Judge Model:</label>
512
+ <input type="text" id="test-judge-model"
513
+ style="width: 100%; padding: 10px; background: #2a2a2a; color: #fff; border: 1px solid #3a3a3a; border-radius: 4px; font-size: 14px;"
514
+ placeholder="e.g., gpt-4o, claude-3-5-sonnet-20241022">
515
+ <div style="color: #666; font-size: 12px; margin-top: 5px;">
516
+ Override the judge's model for this test
517
+ </div>
518
+ </div>
519
+
520
+ <!-- Judge Prompt -->
521
+ <div style="margin-bottom: 30px;">
522
+ <label style="display: block; color: #aaa; margin-bottom: 8px; font-size: 14px;">Judge Prompt:</label>
523
+ <textarea id="test-judge-prompt"
524
+ style="width: 100%; min-height: 200px; padding: 10px; background: #2a2a2a; color: #fff; border: 1px solid #3a3a3a; border-radius: 4px; font-size: 13px; font-family: 'Courier New', monospace; resize: vertical;"
525
+ placeholder="Select a judge to load its prompt..."></textarea>
526
+ <div style="color: #666; font-size: 12px; margin-top: 5px;">
527
+ Edit the prompt and test changes, or save to update the judge permanently
528
+ </div>
529
+ </div>
530
+
531
+ <!-- Actions -->
532
+ <div style="display: flex; gap: 10px; margin-bottom: 30px;">
533
+ <button id="run-test-judge-btn" style="padding: 10px 20px; background: #6a4a7e; color: white; border: none; border-radius: 4px; cursor: pointer; font-weight: 500;">
534
+ Run Test
535
+ </button>
536
+ <button id="save-test-judge-prompt-btn" style="padding: 10px 20px; background: #2a7c4a; color: white; border: none; border-radius: 4px; cursor: pointer;">
537
+ Save Prompt to Judge
538
+ </button>
539
+ <button id="close-test-judge-btn" style="padding: 10px 20px; background: #5a2a2a; color: white; border: none; border-radius: 4px; cursor: pointer;">
540
+ Close
541
+ </button>
542
+ </div>
543
+
544
+ <!-- Results -->
545
+ <div id="test-judge-results" style="display: none;">
546
+ <h3 style="color: #4a9eff; margin-bottom: 15px;">Test Results</h3>
547
+ <div id="test-judge-results-content" style="max-height: 600px; overflow-y: auto;">
548
+ <!-- Results populated here -->
549
+ </div>
550
+ </div>
551
+ </div>
552
+ </div>
553
+
554
+ <!-- End-to-End Test Panel -->
555
+ <div id="e2e-panel" style="display: none; position: fixed; top: 0; left: 0; width: 100%; height: 100%; background: rgba(0,0,0,0.8); z-index: 1000; padding: 40px; overflow-y: auto;">
556
+ <div style="max-width: 800px; margin: 0 auto; background: #1a1a1a; border-radius: 8px; padding: 30px; border: 1px solid #4a2a4a;">
557
+ <h2 style="color: #fff; margin-bottom: 10px;">⚡ Run End-to-End Test</h2>
558
+ <p style="color: #888; font-size: 13px; margin-bottom: 25px;">
559
+ This will automatically: Export selected traces → Run weak models → Evaluate with judge
560
+ </p>
561
+
562
+ <!-- Weak Model Selection -->
563
+ <div style="margin-bottom: 25px;">
564
+ <h3 style="color: #fff; font-size: 16px; margin-bottom: 15px;">1. Select Weak Models</h3>
565
+
566
+ <div style="margin-bottom: 15px;">
567
+ <label style="display: block; color: #aaa; margin-bottom: 8px; font-size: 13px;">W&B Models:</label>
568
+ <div id="e2e-wandb-models" style="max-height: 150px; overflow-y: auto; background: #0f0f0f; padding: 10px; border-radius: 4px; border: 1px solid #2a2a2a;">
569
+ <!-- Populated dynamically -->
570
+ </div>
571
+ </div>
572
+
573
+ <div style="margin-bottom: 15px;">
574
+ <label style="display: block; color: #aaa; margin-bottom: 8px; font-size: 13px;">OpenRouter Models (optional):</label>
575
+ <textarea id="e2e-openrouter-models" placeholder="Enter OpenRouter models (one per line)&#10;e.g.,&#10;meta-llama/llama-3.3-70b-instruct&#10;anthropic/claude-3.5-sonnet"
576
+ style="width: 100%; padding: 8px; background: #2a2a2a; color: #fff; border: 1px solid #3a3a3a; border-radius: 4px; font-size: 13px; min-height: 80px; font-family: monospace;"></textarea>
577
+ <div style="color: #666; font-size: 11px; margin-top: 5px;">One model per line</div>
578
+ </div>
579
+
580
+ <div>
581
+ <label style="display: block; color: #aaa; margin-bottom: 8px; font-size: 13px;">Max Examples (optional):</label>
582
+ <input type="number" id="e2e-num-examples" placeholder="Leave empty to use all selected traces"
583
+ style="width: 200px; padding: 8px; background: #2a2a2a; color: #fff; border: 1px solid #3a3a3a; border-radius: 4px; font-size: 13px;">
584
+ </div>
585
+ </div>
586
+
587
+ <!-- Judge Selection -->
588
+ <div style="margin-bottom: 30px;">
589
+ <h3 style="color: #fff; font-size: 16px; margin-bottom: 15px;">2. Select Judge</h3>
590
+ <select id="e2e-judge" style="width: 100%; padding: 10px; background: #2a2a2a; color: #fff; border: 1px solid #3a3a3a; border-radius: 4px; font-size: 14px;">
591
+ <option value="">Loading judges...</option>
592
+ </select>
593
+ </div>
594
+
595
+ <!-- Actions -->
596
+ <div style="display: flex; gap: 10px; justify-content: flex-end;">
597
+ <button id="close-e2e-btn" style="padding: 10px 20px; background: #5a2a2a; color: white; border: none; border-radius: 4px; cursor: pointer;">
598
+ Cancel
599
+ </button>
600
+ <button id="run-e2e-btn" style="padding: 10px 20px; background: #7a4a9e; color: white; border: none; border-radius: 4px; cursor: pointer; font-weight: 500;">
601
+ ⚡ Run Test
602
+ </button>
603
+ </div>
604
+ </div>
605
+ </div>
606
+
607
+ <!-- End-to-End Progress Panel -->
608
+ <div id="e2e-progress-panel" style="display: none; position: fixed; top: 0; left: 0; width: 100%; height: 100%; background: rgba(0,0,0,0.9); z-index: 1100; padding: 40px; overflow-y: auto;">
609
+ <div style="max-width: 800px; margin: 0 auto; background: #1a1a1a; border-radius: 8px; padding: 30px; border: 1px solid #4a2a4a;">
610
+ <h2 style="color: #fff; margin-bottom: 20px;">Running End-to-End Test</h2>
611
+
612
+ <!-- Overall Progress -->
613
+ <div style="margin-bottom: 30px;">
614
+ <div style="color: #4a9eff; font-size: 14px; margin-bottom: 10px;" id="e2e-step-label">Step 1/3: Exporting traces...</div>
615
+ <div style="width: 100%; height: 8px; background: #2a2a2a; border-radius: 4px; overflow: hidden;">
616
+ <div id="e2e-overall-progress" style="height: 100%; background: #7a4a9e; width: 0%; transition: width 0.3s;"></div>
617
+ </div>
618
+ </div>
619
+
620
+ <!-- Detailed Progress -->
621
+ <div id="e2e-progress-text" style="color: #888; font-family: monospace; font-size: 12px; white-space: pre-wrap; background: #0f0f0f; padding: 15px; border-radius: 4px; max-height: 400px; overflow-y: auto;"></div>
622
+
623
+ <!-- Results -->
624
+ <div id="e2e-results" style="display: none; margin-top: 20px;">
625
+ <h3 style="color: #4a9eff; margin-bottom: 15px;">✓ Test Complete!</h3>
626
+ <div id="e2e-results-content" style="background: #0f0f0f; padding: 15px; border-radius: 4px;"></div>
627
+ <button id="close-e2e-progress-btn" style="margin-top: 20px; padding: 10px 20px; background: #2a7c4a; color: white; border: none; border-radius: 4px; cursor: pointer;">
628
+ Close
629
+ </button>
630
+ </div>
631
+ </div>
632
+ </div>
427
633
  </div>
428
634
 
429
635
  <script>
@@ -919,20 +1125,46 @@
919
1125
  // Show progress
920
1126
  document.getElementById('inference-progress').style.display = 'block';
921
1127
  const progressText = document.getElementById('progress-text');
1128
+ const progressFill = document.getElementById('inference-progress-fill');
922
1129
  progressText.textContent = `Starting inference...\n`;
923
- progressText.textContent += `Strong Export: ${strongExportFile}\n`;
924
- progressText.textContent += `Models: ${allModels.join(', ')}\n`;
925
- progressText.textContent += `Max Examples: ${numExamples}\n\n`;
1130
+ progressFill.style.width = '0%';
1131
+
1132
+ // Start inference and poll for progress
1133
+ let taskId = null;
1134
+ let pollInterval = null;
1135
+
1136
+ const pollProgress = async () => {
1137
+ if (!taskId) return;
1138
+ try {
1139
+ const resp = await fetch(`/progress/${taskId}`);
1140
+ if (resp.ok) {
1141
+ const progress = await resp.json();
1142
+ const percent = (progress.current / progress.total) * 100;
1143
+ progressFill.style.width = `${percent}%`;
1144
+ progressText.textContent = `${progress.message}\nProgress: ${progress.current}/${progress.total} (${percent.toFixed(1)}%)\n`;
1145
+ }
1146
+ } catch (e) {
1147
+ console.error('Error polling progress:', e);
1148
+ }
1149
+ };
926
1150
 
927
1151
  // Call backend API
928
1152
  try {
1153
+ // Generate a task ID for polling
1154
+ taskId = `inference_${Date.now()}`;
1155
+
1156
+ // Start polling immediately
1157
+ pollInterval = setInterval(pollProgress, 300);
1158
+
1159
+ // Start the inference
929
1160
  const response = await fetch('/run_inference', {
930
1161
  method: 'POST',
931
1162
  headers: { 'Content-Type': 'application/json' },
932
1163
  body: JSON.stringify({
933
1164
  models: allModels,
934
1165
  strong_export_file: strongExportFile,
935
- num_examples: numExamples
1166
+ num_examples: numExamples,
1167
+ task_id: taskId
936
1168
  })
937
1169
  });
938
1170
 
@@ -941,8 +1173,12 @@
941
1173
  }
942
1174
 
943
1175
  const result = await response.json();
944
- progressText.textContent += `\n✓ Complete!\n`;
945
- progressText.textContent += `Results saved to: ${result.files.join(', ')}\n`;
1176
+
1177
+ // Stop polling
1178
+ if (pollInterval) clearInterval(pollInterval);
1179
+
1180
+ progressText.textContent = `\n✓ Complete!\nResults saved to: ${result.files.join(', ')}\n`;
1181
+ progressFill.style.width = '100%';
946
1182
 
947
1183
  setTimeout(() => {
948
1184
  document.getElementById('inference-panel').style.display = 'none';
@@ -951,8 +1187,7 @@
951
1187
 
952
1188
  } catch (error) {
953
1189
  progressText.textContent += `\n✗ Error: ${error.message}\n`;
954
- progressText.textContent += `\nNote: You need to run the backend server for inference.\n`;
955
- progressText.textContent += `Run: python inference_server.py\n`;
1190
+ if (pollInterval) clearInterval(pollInterval);
956
1191
  }
957
1192
  });
958
1193
 
@@ -1091,21 +1326,44 @@
1091
1326
  const modelFiles = Array.from(selectedEvalModels);
1092
1327
  const results = [];
1093
1328
 
1094
- // Run evaluations sequentially
1329
+ // Run evaluations sequentially with granular progress
1095
1330
  for (let i = 0; i < modelFiles.length; i++) {
1096
1331
  const modelFile = modelFiles[i];
1097
- const progress = ((i) / modelFiles.length) * 100;
1098
- progressFill.style.width = `${progress}%`;
1099
1332
 
1100
- progressText.textContent += `[${i+1}/${modelFiles.length}] Evaluating ${modelFile}...\n`;
1333
+ progressText.textContent += `[${i+1}/${modelFiles.length}] Starting ${modelFile}...\n`;
1334
+
1335
+ let pollInterval = null;
1336
+ let taskId = null;
1337
+
1338
+ const pollProgress = async () => {
1339
+ if (!taskId) return;
1340
+ try {
1341
+ const resp = await fetch(`/progress/${taskId}`);
1342
+ if (resp.ok) {
1343
+ const progress = await resp.json();
1344
+ const percent = (progress.current / progress.total) * 100;
1345
+ progressFill.style.width = `${percent}%`;
1346
+ progressText.textContent = `[${i+1}/${modelFiles.length}] ${progress.message}\nProgress: ${progress.current}/${progress.total} (${percent.toFixed(1)}%)\n`;
1347
+ }
1348
+ } catch (e) {
1349
+ console.error('Error polling eval progress:', e);
1350
+ }
1351
+ };
1101
1352
 
1102
1353
  try {
1354
+ // Generate task ID for this evaluation
1355
+ taskId = `eval_${Date.now()}_${i}`;
1356
+
1357
+ // Start polling
1358
+ pollInterval = setInterval(pollProgress, 300);
1359
+
1103
1360
  const response = await fetch('/run_evaluation', {
1104
1361
  method: 'POST',
1105
1362
  headers: { 'Content-Type': 'application/json' },
1106
1363
  body: JSON.stringify({
1107
1364
  model_file: modelFile,
1108
- judge: judge
1365
+ judge: judge,
1366
+ task_id: taskId
1109
1367
  })
1110
1368
  });
1111
1369
 
@@ -1114,6 +1372,10 @@
1114
1372
  }
1115
1373
 
1116
1374
  const result = await response.json();
1375
+
1376
+ // Clear polling when done
1377
+ if (pollInterval) clearInterval(pollInterval);
1378
+
1117
1379
  progressText.textContent += ` ✓ Complete: ${result.evaluation_name}\n`;
1118
1380
  progressText.textContent += ` Examples: ${result.examples_evaluated}\n\n`;
1119
1381
 
@@ -1125,6 +1387,7 @@
1125
1387
  });
1126
1388
 
1127
1389
  } catch (error) {
1390
+ if (pollInterval) clearInterval(pollInterval);
1128
1391
  progressText.textContent += ` ✗ Error: ${error.message}\n\n`;
1129
1392
  }
1130
1393
  }
@@ -1227,6 +1490,517 @@
1227
1490
  console.error('Delete error:', error);
1228
1491
  }
1229
1492
  }
1493
+
1494
+ // === SETTINGS ===
1495
+
1496
+ // Load and display settings
1497
+ async function loadSettings() {
1498
+ try {
1499
+ const response = await fetch('/settings');
1500
+ const settings = await response.json();
1501
+ document.getElementById('settings-inference-project').value = settings.inference_project || '';
1502
+ document.getElementById('settings-evaluation-project').value = settings.evaluation_project || '';
1503
+ } catch (error) {
1504
+ console.error('Error loading settings:', error);
1505
+ }
1506
+ }
1507
+
1508
+ // Open settings panel
1509
+ document.getElementById('open-settings-btn').addEventListener('click', async () => {
1510
+ await loadSettings();
1511
+ document.getElementById('settings-panel').style.display = 'block';
1512
+ });
1513
+
1514
+ // Close settings panel
1515
+ document.getElementById('close-settings-btn').addEventListener('click', () => {
1516
+ document.getElementById('settings-panel').style.display = 'none';
1517
+ });
1518
+
1519
+ // Save settings
1520
+ document.getElementById('save-settings-btn').addEventListener('click', async () => {
1521
+ const settings = {
1522
+ inference_project: document.getElementById('settings-inference-project').value.trim(),
1523
+ evaluation_project: document.getElementById('settings-evaluation-project').value.trim()
1524
+ };
1525
+
1526
+ if (!settings.inference_project || !settings.evaluation_project) {
1527
+ alert('Both project fields are required');
1528
+ return;
1529
+ }
1530
+
1531
+ try {
1532
+ const response = await fetch('/settings', {
1533
+ method: 'POST',
1534
+ headers: { 'Content-Type': 'application/json' },
1535
+ body: JSON.stringify(settings)
1536
+ });
1537
+
1538
+ const result = await response.json();
1539
+ if (result.status === 'success') {
1540
+ alert('Settings saved! Please restart the server for changes to take effect.');
1541
+ document.getElementById('settings-panel').style.display = 'none';
1542
+ } else {
1543
+ alert('Error saving settings');
1544
+ }
1545
+ } catch (error) {
1546
+ alert('Error saving settings: ' + error.message);
1547
+ }
1548
+ });
1549
+
1550
+ // === TEST JUDGES ===
1551
+
1552
+ let testJudgesData = []; // Store judges globally for test panel
1553
+
1554
+ // Open test judge panel
1555
+ document.getElementById('open-test-judge-btn').addEventListener('click', async () => {
1556
+ // Load judges
1557
+ try {
1558
+ const response = await fetch('/list_judges');
1559
+ const data = await response.json();
1560
+ testJudgesData = data.judges || []; // Store globally
1561
+ const judgeSelect = document.getElementById('test-judge-select');
1562
+
1563
+ if (testJudgesData.length > 0) {
1564
+ judgeSelect.innerHTML = testJudgesData.map((judge, idx) =>
1565
+ `<option value="${idx}">${judge.name} (${judge.type})</option>`
1566
+ ).join('');
1567
+
1568
+ // Load first judge's prompt and model
1569
+ if (testJudgesData[0]) {
1570
+ document.getElementById('test-judge-prompt').value = testJudgesData[0].prompt || '';
1571
+ document.getElementById('test-judge-model').value = testJudgesData[0].model || '';
1572
+ }
1573
+ } else {
1574
+ judgeSelect.innerHTML = '<option value="">No judges available</option>';
1575
+ }
1576
+ } catch (error) {
1577
+ console.error('Error loading judges:', error);
1578
+ }
1579
+
1580
+ // Load weak model files
1581
+ try {
1582
+ const response = await fetch('/list_weak_models');
1583
+ const data = await response.json();
1584
+ const weakModelSelect = document.getElementById('test-weak-model-select');
1585
+
1586
+ if (data.files && data.files.length > 0) {
1587
+ weakModelSelect.innerHTML = data.files.map(f =>
1588
+ `<option value="${f.filename}">${f.weak_model || f.filename}</option>`
1589
+ ).join('');
1590
+ } else {
1591
+ weakModelSelect.innerHTML = '<option value="">No weak model files available</option>';
1592
+ }
1593
+ } catch (error) {
1594
+ console.error('Error loading weak models:', error);
1595
+ }
1596
+
1597
+ document.getElementById('test-judge-panel').style.display = 'block';
1598
+ document.getElementById('test-judge-results').style.display = 'none';
1599
+ });
1600
+
1601
+ // When judge selection changes, update the prompt and model
1602
+ document.getElementById('test-judge-select').addEventListener('change', (e) => {
1603
+ const judgeIndex = parseInt(e.target.value);
1604
+ if (!isNaN(judgeIndex) && testJudgesData[judgeIndex]) {
1605
+ const judge = testJudgesData[judgeIndex];
1606
+ document.getElementById('test-judge-prompt').value = judge.prompt || '';
1607
+ document.getElementById('test-judge-model').value = judge.model || '';
1608
+ }
1609
+ });
1610
+
1611
+ // Close test judge panel
1612
+ document.getElementById('close-test-judge-btn').addEventListener('click', () => {
1613
+ document.getElementById('test-judge-panel').style.display = 'none';
1614
+ });
1615
+
1616
+ // Run test judge
1617
+ document.getElementById('run-test-judge-btn').addEventListener('click', async () => {
1618
+ const judgeIndex = document.getElementById('test-judge-select').value;
1619
+ const weakModelFile = document.getElementById('test-weak-model-select').value;
1620
+ const numSamples = parseInt(document.getElementById('test-num-samples').value) || 5;
1621
+ const editedPrompt = document.getElementById('test-judge-prompt').value;
1622
+ const editedModel = document.getElementById('test-judge-model').value;
1623
+
1624
+ if (!judgeIndex) {
1625
+ alert('Please select a judge');
1626
+ return;
1627
+ }
1628
+
1629
+ if (!weakModelFile) {
1630
+ alert('Please select a weak model file');
1631
+ return;
1632
+ }
1633
+
1634
+ if (!editedPrompt.trim()) {
1635
+ alert('Please enter a judge prompt');
1636
+ return;
1637
+ }
1638
+
1639
+ if (!editedModel.trim()) {
1640
+ alert('Please enter a judge model');
1641
+ return;
1642
+ }
1643
+
1644
+ // Get judge data and override with edited prompt and model
1645
+ const judge = { ...testJudgesData[parseInt(judgeIndex)] };
1646
+ judge.prompt = editedPrompt; // Use the edited prompt from textarea
1647
+ judge.model = editedModel; // Use the edited model from input
1648
+
1649
+ // Call test endpoint
1650
+ try {
1651
+ const response = await fetch('/test_judge', {
1652
+ method: 'POST',
1653
+ headers: { 'Content-Type': 'application/json' },
1654
+ body: JSON.stringify({
1655
+ judge: judge,
1656
+ weak_model_file: weakModelFile,
1657
+ num_samples: numSamples
1658
+ })
1659
+ });
1660
+
1661
+ if (!response.ok) {
1662
+ throw new Error('Failed to test judge');
1663
+ }
1664
+
1665
+ const result = await response.json();
1666
+
1667
+ // Display results
1668
+ const resultsDiv = document.getElementById('test-judge-results-content');
1669
+ resultsDiv.innerHTML = result.samples.map((sample, idx) => `
1670
+ <div style="margin-bottom: 20px; padding: 20px; background: #0f0f0f; border-radius: 8px; border: 1px solid #2a2a2a;">
1671
+ <h4 style="color: #4a9eff; margin-bottom: 15px;">Sample ${idx + 1} of ${result.samples.length}</h4>
1672
+
1673
+ <div style="margin-bottom: 15px;">
1674
+ <div style="color: #888; font-size: 12px; margin-bottom: 5px;">Question:</div>
1675
+ <div style="color: #ccc; font-size: 13px; background: #1a1a1a; padding: 10px; border-radius: 4px; max-height: 100px; overflow-y: auto;">${sample.question || 'N/A'}</div>
1676
+ </div>
1677
+
1678
+ <div style="margin-bottom: 15px;">
1679
+ <div style="color: #888; font-size: 12px; margin-bottom: 5px;">Strong Model Output:</div>
1680
+ <div style="color: #ccc; font-size: 13px; background: #1a1a1a; padding: 10px; border-radius: 4px; max-height: 100px; overflow-y: auto;">${sample.strong_output || 'N/A'}</div>
1681
+ </div>
1682
+
1683
+ <div style="margin-bottom: 15px;">
1684
+ <div style="color: #888; font-size: 12px; margin-bottom: 5px;">Weak Model Output:</div>
1685
+ <div style="color: #ccc; font-size: 13px; background: #1a1a1a; padding: 10px; border-radius: 4px; max-height: 100px; overflow-y: auto;">${sample.weak_output || 'N/A'}</div>
1686
+ </div>
1687
+
1688
+ <div style="margin-bottom: 15px;">
1689
+ <div style="color: #888; font-size: 12px; margin-bottom: 5px;">Judge Prompt (filled):</div>
1690
+ <pre style="color: #aaa; font-size: 11px; background: #1a1a1a; padding: 10px; border-radius: 4px; max-height: 150px; overflow-y: auto; white-space: pre-wrap; font-family: monospace;">${sample.judge_prompt}</pre>
1691
+ </div>
1692
+
1693
+ <div style="margin-bottom: 15px;">
1694
+ <div style="color: #888; font-size: 12px; margin-bottom: 5px;">Raw Judge Response:</div>
1695
+ <pre style="color: #f4d03f; font-size: 11px; background: #1a1a1a; padding: 10px; border-radius: 4px; max-height: 150px; overflow-y: auto; white-space: pre-wrap; font-family: monospace;">${sample.raw_response}</pre>
1696
+ </div>
1697
+
1698
+ <div>
1699
+ <div style="color: #888; font-size: 12px; margin-bottom: 5px;">Parsed Scores:</div>
1700
+ <div style="color: #4a9eff; font-size: 13px; background: #1a1a1a; padding: 10px; border-radius: 4px; font-family: monospace;">${JSON.stringify(sample.parsed_scores, null, 2)}</div>
1701
+ </div>
1702
+ </div>
1703
+ `).join('');
1704
+
1705
+ document.getElementById('test-judge-results').style.display = 'block';
1706
+
1707
+ } catch (error) {
1708
+ alert('Error testing judge: ' + error.message);
1709
+ console.error('Test error:', error);
1710
+ }
1711
+ });
1712
+
1713
+ // Save prompt to judge
1714
+ document.getElementById('save-test-judge-prompt-btn').addEventListener('click', async () => {
1715
+ const judgeIndex = document.getElementById('test-judge-select').value;
1716
+ const editedPrompt = document.getElementById('test-judge-prompt').value;
1717
+
1718
+ if (!judgeIndex) {
1719
+ alert('Please select a judge');
1720
+ return;
1721
+ }
1722
+
1723
+ if (!editedPrompt.trim()) {
1724
+ alert('Please enter a judge prompt');
1725
+ return;
1726
+ }
1727
+
1728
+ // Get judge data and update prompt
1729
+ const judge = { ...testJudgesData[parseInt(judgeIndex)] };
1730
+ judge.prompt = editedPrompt;
1731
+
1732
+ // Confirm with user
1733
+ if (!confirm(`Save this prompt to judge "${judge.name}"? This will permanently update the judge.`)) {
1734
+ return;
1735
+ }
1736
+
1737
+ // Call save endpoint
1738
+ try {
1739
+ const response = await fetch('/save_judge', {
1740
+ method: 'POST',
1741
+ headers: { 'Content-Type': 'application/json' },
1742
+ body: JSON.stringify({ judge: judge })
1743
+ });
1744
+
1745
+ if (!response.ok) {
1746
+ throw new Error('Failed to save judge');
1747
+ }
1748
+
1749
+ const result = await response.json();
1750
+
1751
+ // Update local judges data
1752
+ testJudgesData = result.judges || [];
1753
+
1754
+ alert('Judge prompt saved successfully!');
1755
+ } catch (error) {
1756
+ alert('Error saving judge: ' + error.message);
1757
+ console.error('Save error:', error);
1758
+ }
1759
+ });
1760
+
1761
+ // === END-TO-END TEST ===
1762
+
1763
+ // Open E2E panel
1764
+ document.getElementById('open-e2e-btn').addEventListener('click', async () => {
1765
+ if (selectedTraces.size === 0) {
1766
+ alert('Please select at least one trace first!');
1767
+ return;
1768
+ }
1769
+
1770
+ // Populate W&B models
1771
+ const wandbModelsDiv = document.getElementById('e2e-wandb-models');
1772
+ wandbModelsDiv.innerHTML = AVAILABLE_MODELS.map(model => `
1773
+ <label style="display: block; padding: 5px 0; color: #ccc; cursor: pointer;">
1774
+ <input type="checkbox" class="e2e-model-checkbox" value="${model}" style="margin-right: 8px;">
1775
+ ${model}
1776
+ </label>
1777
+ `).join('');
1778
+
1779
+ // Load judges
1780
+ try {
1781
+ const response = await fetch('/list_judges');
1782
+ const data = await response.json();
1783
+ const judgeSelect = document.getElementById('e2e-judge');
1784
+
1785
+ if (data.judges && data.judges.length > 0) {
1786
+ judgeSelect.innerHTML = data.judges.map((judge, idx) =>
1787
+ `<option value="${idx}">${judge.name} (${judge.type})</option>`
1788
+ ).join('');
1789
+ } else {
1790
+ judgeSelect.innerHTML = '<option value="">No judges available - create one first</option>';
1791
+ }
1792
+ } catch (error) {
1793
+ console.error('Error loading judges:', error);
1794
+ }
1795
+
1796
+ document.getElementById('e2e-panel').style.display = 'block';
1797
+ });
1798
+
1799
+ // Close E2E panel
1800
+ document.getElementById('close-e2e-btn').addEventListener('click', () => {
1801
+ document.getElementById('e2e-panel').style.display = 'none';
1802
+ });
1803
+
1804
+ // Close E2E progress
1805
+ document.getElementById('close-e2e-progress-btn').addEventListener('click', () => {
1806
+ document.getElementById('e2e-progress-panel').style.display = 'none';
1807
+ document.getElementById('e2e-results').style.display = 'none';
1808
+ });
1809
+
1810
// Run end-to-end test.
//
// Click handler for the "run" button of the E2E panel. It performs three
// sequential steps against the local server and mirrors their progress into
// the E2E progress panel:
//   1. POST /export_strong_traces  - export the currently selected traces
//   2. POST /run_inference         - generate outputs with the chosen models
//   3. POST /run_evaluation        - once per generated weak-model file
// While steps 2 and 3 are running, /progress/<task_id> is polled every 300 ms
// to move the overall progress bar (20-60% for inference, 60-100% for evals).
document.getElementById('run-e2e-btn').addEventListener('click', async () => {
    // Escape a value before interpolating it into an innerHTML template, so
    // server-provided strings cannot inject markup.
    const escapeHtml = (value) => String(value)
        .replace(/&/g, '&amp;')
        .replace(/</g, '&lt;')
        .replace(/>/g, '&gt;')
        .replace(/"/g, '&quot;')
        .replace(/'/g, '&#39;');

    // Only http(s) targets are allowed as result links (blocks e.g.
    // `javascript:` URLs); anything else is rendered as an inert '#'.
    const toSafeHref = (url) => {
        try {
            const parsed = new URL(String(url), window.location.href);
            const isWebUrl = parsed.protocol === 'http:' || parsed.protocol === 'https:';
            return isWebUrl ? String(url) : '#';
        } catch (e) {
            return '#';
        }
    };

    // Gather selected models
    const selectedWandbModels = Array.from(document.querySelectorAll('.e2e-model-checkbox:checked')).map(cb => cb.value);
    const openRouterModels = document.getElementById('e2e-openrouter-models').value
        .split('\n')
        .map(m => m.trim())
        .filter(m => m.length > 0);
    const allModels = [...selectedWandbModels, ...openRouterModels];

    if (allModels.length === 0) {
        alert('Please select at least one model!');
        return;
    }

    const judgeIndex = document.getElementById('e2e-judge').value;
    if (!judgeIndex) {
        alert('Please select a judge!');
        return;
    }

    // Empty or non-numeric input means "no limit" and is sent as null.
    const parsedNumExamples = Number.parseInt(document.getElementById('e2e-num-examples').value, 10);
    const numExamples = Number.isNaN(parsedNumExamples) ? null : parsedNumExamples;

    // Load judge data. This is resolved before any work starts so that a
    // failed request or a stale index aborts early instead of failing in
    // step 3, after export and inference have already run.
    let judge;
    try {
        const judgesResponse = await fetch('/list_judges');
        if (!judgesResponse.ok) {
            throw new Error(`HTTP ${judgesResponse.status}`);
        }
        const judgesData = await judgesResponse.json();
        judge = (judgesData.judges || [])[Number.parseInt(judgeIndex, 10)];
    } catch (error) {
        console.error('Error loading judges:', error);
    }
    if (!judge) {
        alert('Could not load the selected judge. Please reopen the panel and try again.');
        return;
    }

    // Hide config panel, show progress panel
    document.getElementById('e2e-panel').style.display = 'none';
    document.getElementById('e2e-progress-panel').style.display = 'block';

    const progressText = document.getElementById('e2e-progress-text');
    const stepLabel = document.getElementById('e2e-step-label');
    const overallProgress = document.getElementById('e2e-overall-progress');

    progressText.textContent = '';

    // Poll /progress/<taskId> every 300 ms and map the completed fraction
    // (0..1) onto the overall bar via `toOverallPercent`. Returns a stop
    // function; once it has been called, late poll responses are ignored so
    // they cannot move the bar backwards after the step has finished.
    const startProgressPolling = (taskId, errorLabel, toOverallPercent) => {
        let stopped = false;
        const poll = async () => {
            try {
                const resp = await fetch(`/progress/${taskId}`);
                if (!resp.ok) {
                    return;
                }
                const progress = await resp.json();
                const fraction = Number(progress.current) / Number(progress.total);
                // A zero or missing total yields NaN/Infinity: skip this tick.
                if (stopped || !Number.isFinite(fraction)) {
                    return;
                }
                const clamped = Math.min(Math.max(fraction, 0), 1);
                overallProgress.style.width = `${toOverallPercent(clamped)}%`;
            } catch (e) {
                console.error(errorLabel, e);
            }
        };
        const intervalId = setInterval(poll, 300);
        return () => {
            stopped = true;
            clearInterval(intervalId);
        };
    };

    try {
        // === STEP 1: Export Selected Traces ===
        stepLabel.textContent = 'Step 1/3: Exporting selected traces...';
        overallProgress.style.width = '10%';
        progressText.textContent += '📦 Exporting selected traces...\n';

        // Get full trace objects for selected IDs
        const selectedTraceObjects = allTraces.filter(t => selectedTraces.has(t.id));

        const exportResponse = await fetch('/export_strong_traces', {
            method: 'POST',
            headers: { 'Content-Type': 'application/json' },
            body: JSON.stringify({
                traces: selectedTraceObjects,
                nickname: `e2e_export_${Date.now()}`
            })
        });

        if (!exportResponse.ok) {
            throw new Error('Failed to export traces');
        }

        const exportResult = await exportResponse.json();
        const exportFilename = exportResult.filename;
        progressText.textContent += `✓ Exported ${exportResult.count} traces to ${exportFilename}\n\n`;
        overallProgress.style.width = '20%';

        // === STEP 2: Run Weak Model Inference ===
        stepLabel.textContent = 'Step 2/3: Running weak model inference...';
        progressText.textContent += `⚙️ Running inference with ${allModels.length} model(s)...\n`;

        const taskId = `inference_${Date.now()}`;
        // Map inference progress to 20-60% of overall
        const stopInferencePolling = startProgressPolling(
            taskId,
            'Error polling progress:',
            fraction => 20 + fraction * 40
        );

        let inferenceResponse;
        try {
            inferenceResponse = await fetch('/run_inference', {
                method: 'POST',
                headers: { 'Content-Type': 'application/json' },
                body: JSON.stringify({
                    models: allModels,
                    strong_export_file: exportFilename,
                    num_examples: numExamples,
                    task_id: taskId
                })
            });
        } finally {
            // Stop polling even when the request itself rejects.
            stopInferencePolling();
        }

        if (!inferenceResponse.ok) {
            throw new Error('Failed to run inference');
        }

        // The payload is not used, but parsing it still surfaces a malformed
        // response as an error before moving on to evaluation.
        await inferenceResponse.json();
        progressText.textContent += `✓ Generated outputs for ${allModels.length} model(s)\n\n`;
        overallProgress.style.width = '60%';

        // === STEP 3: Run Evaluations ===
        stepLabel.textContent = 'Step 3/3: Running evaluations...';
        progressText.textContent += `📊 Running evaluations with judge: ${judge.name}...\n`;

        const evaluationResults = [];

        // Get list of weak model files that were just generated
        const weakModelsResponse = await fetch('/list_weak_models');
        if (!weakModelsResponse.ok) {
            throw new Error('Failed to list weak model files');
        }
        const weakModelsData = await weakModelsResponse.json();

        // Filter to only the models we just ran. Every '/' in a model id is
        // replaced, not just the first one.
        // NOTE(review): assumes the server derives filenames by replacing
        // each '/' with '_' - confirm against the backend. This substring
        // match can also pick up files left over from earlier runs.
        const modelFileTokens = allModels.map(m => m.replace(/\//g, '_'));
        const weakModelFiles = (weakModelsData.files || [])
            .filter(f => modelFileTokens.some(token => f.filename.includes(token)))
            .map(f => f.filename);

        for (let i = 0; i < weakModelFiles.length; i++) {
            const modelFile = weakModelFiles[i];
            const evalTaskId = `eval_${Date.now()}_${i}`;

            progressText.textContent += `\n[${i+1}/${weakModelFiles.length}] Evaluating ${modelFile}...\n`;

            // Map eval progress to 60-100% of overall: each file owns an
            // equal slice of the remaining 40%.
            const sliceWidth = 40 / weakModelFiles.length;
            const basePercent = 60 + i * sliceWidth;
            const stopEvalPolling = startProgressPolling(
                evalTaskId,
                'Error polling eval progress:',
                fraction => basePercent + fraction * sliceWidth
            );

            let evalResponse;
            try {
                evalResponse = await fetch('/run_evaluation', {
                    method: 'POST',
                    headers: { 'Content-Type': 'application/json' },
                    body: JSON.stringify({
                        model_file: modelFile,
                        judge: judge,
                        task_id: evalTaskId
                    })
                });
            } finally {
                // Stop polling even when the request itself rejects.
                stopEvalPolling();
            }

            if (evalResponse.ok) {
                const evalResult = await evalResponse.json();
                progressText.textContent += ` ✓ Complete: ${evalResult.examples_evaluated} examples evaluated\n`;
                evaluationResults.push(evalResult);
            } else {
                // A failed evaluation is reported but does not abort the rest.
                progressText.textContent += ` ✗ Error evaluating ${modelFile}\n`;
            }
        }

        overallProgress.style.width = '100%';
        stepLabel.textContent = 'Complete!';
        progressText.textContent += `\n✅ All evaluations complete!\n`;

        // Show results
        document.getElementById('e2e-results').style.display = 'block';
        const resultsContent = document.getElementById('e2e-results-content');
        resultsContent.innerHTML = evaluationResults.map(r => `
            <div style="margin-bottom: 15px; padding: 15px; background: #1a1a1a; border-radius: 4px; border: 1px solid #2a2a2a;">
                <div style="font-weight: bold; color: #fff; margin-bottom: 8px;">${escapeHtml(r.evaluation_name)}</div>
                <div style="font-size: 12px; color: #888; margin-bottom: 8px;">
                    ${escapeHtml(r.examples_evaluated)} examples evaluated
                </div>
                <a href="${escapeHtml(toSafeHref(r.weave_url))}" target="_blank" rel="noopener noreferrer" style="color: #4a9eff; font-size: 13px;">View in Weave →</a>
            </div>
        `).join('');

    } catch (error) {
        progressText.textContent += `\n\n❌ Error: ${error.message}\n`;
        stepLabel.textContent = 'Error occurred';
    }
});
1230
2004
  </script>
1231
2005
  </body>
1232
2006
  </html>