quickdistill 0.1.6__py3-none-any.whl → 0.1.8__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -300,22 +300,46 @@
300
300
  Select All Filtered
301
301
  </button>
302
302
 
303
- <button id="export-btn" style="padding: 8px 16px; background: #4a9eff; color: white; border: none; border-radius: 4px; cursor: pointer;">
304
- Export Selected to Test Set (<span id="selected-count">0</span>)
305
- </button>
303
+ <!-- Manual Workflow Section -->
304
+ <div style="margin: 20px 0; padding: 15px; background: #1a2a1a; border-radius: 8px; border: 3px solid #ffffff;">
305
+ <div style="color: #ffffff; font-size: 14px; font-weight: 500; margin-bottom: 12px;">📋 Manual Workflow (Step-by-Step):</div>
306
+ <div style="display: flex; flex-wrap: wrap; gap: 10px;">
307
+ <button id="export-btn" style="padding: 8px 16px; background: #4a9eff; color: white; border: none; border-radius: 4px; cursor: pointer;">
308
+ Export Selected to Test Set (<span id="selected-count">0</span>)
309
+ </button>
306
310
 
307
- <button id="open-inference-btn" style="padding: 8px 16px; background: #7c4a9e; color: white; border: none; border-radius: 4px; cursor: pointer;">
308
- Run Weak Models
309
- </button>
311
+ <button id="open-inference-btn" style="padding: 8px 16px; background: #7c4a9e; color: white; border: none; border-radius: 4px; cursor: pointer;">
312
+ Run Weak Models
313
+ </button>
310
314
 
311
- <button id="open-eval-btn" style="padding: 8px 16px; background: #9e6a4a; color: white; border: none; border-radius: 4px; cursor: pointer;">
312
- Run Evaluation
313
- </button>
315
+ <button id="open-eval-btn" style="padding: 8px 16px; background: #9e6a4a; color: white; border: none; border-radius: 4px; cursor: pointer;">
316
+ Run Evaluation
317
+ </button>
318
+ </div>
319
+ </div>
314
320
 
321
+ <!-- Utilities -->
315
322
  <a href="/judge" target="_blank" style="padding: 8px 16px; background: #4a5a9e; color: white; border: none; border-radius: 4px; text-decoration: none; display: inline-block;">
316
323
  Manage Judges
317
324
  </a>
318
325
 
326
+ <button id="open-test-judge-btn" style="padding: 8px 16px; background: #6a4a7e; color: white; border: none; border-radius: 4px; cursor: pointer;">
327
+ Test Judges
328
+ </button>
329
+
330
+ <button id="open-settings-btn" style="padding: 8px 16px; background: #5a5a5a; color: white; border: none; border-radius: 4px; cursor: pointer;">
331
+ Settings
332
+ </button>
333
+
334
+ <!-- Automatic Workflow Section -->
335
+ <div style="margin: 20px 0; padding: 15px; background: #2a1a2a; border-radius: 8px; border: 1px solid #4a2a4a;">
336
+ <div style="color: #aaa; font-size: 13px; margin-bottom: 10px;">Automatic Workflow:</div>
337
+ <button id="open-e2e-btn" style="padding: 10px 20px; background: #7a4a9e; color: white; border: none; border-radius: 4px; cursor: pointer; font-weight: 500;">
338
+ ⚡ Run End-to-End Test
339
+ </button>
340
+ <div style="color: #666; font-size: 11px; margin-top: 8px;">Export → Generate → Evaluate (all in one)</div>
341
+ </div>
342
+
319
343
  <div class="stats">
320
344
  <div>Total: <span id="total-count">0</span></div>
321
345
  <div>Shown: <span id="shown-count">0</span></div>
@@ -427,6 +451,193 @@
427
451
  </div>
428
452
  </div>
429
453
  </div>
454
+
455
+ <!-- Settings Panel -->
456
+ <div id="settings-panel" style="display: none; position: fixed; top: 0; left: 0; width: 100%; height: 100%; background: rgba(0,0,0,0.8); z-index: 1000; padding: 40px;">
457
+ <div style="max-width: 600px; margin: 0 auto; background: #1a1a1a; border-radius: 8px; padding: 30px; border: 1px solid #2a2a2a;">
458
+ <h2 style="color: #fff; margin-bottom: 20px;">Settings</h2>
459
+
460
+ <div style="margin-bottom: 20px;">
461
+ <label style="display: block; color: #aaa; margin-bottom: 8px; font-size: 14px;">W&B Inference Project</label>
462
+ <input type="text" id="settings-inference-project" placeholder="e.g., wandb_fc/quickstart_playground"
463
+ style="width: 100%; padding: 10px; background: #2a2a2a; color: #fff; border: 1px solid #3a3a3a; border-radius: 4px; font-size: 14px;">
464
+ <div style="color: #666; font-size: 12px; margin-top: 5px;">Used for running weak model inference</div>
465
+ </div>
466
+
467
+ <div style="margin-bottom: 30px;">
468
+ <label style="display: block; color: #aaa; margin-bottom: 8px; font-size: 14px;">W&B Evaluation Project</label>
469
+ <input type="text" id="settings-evaluation-project" placeholder="e.g., wandb_inference"
470
+ style="width: 100%; padding: 10px; background: #2a2a2a; color: #fff; border: 1px solid #3a3a3a; border-radius: 4px; font-size: 14px;">
471
+ <div style="color: #666; font-size: 12px; margin-top: 5px;">Used for logging evaluation results with Weave</div>
472
+ </div>
473
+
474
+ <div style="display: flex; gap: 10px; justify-content: flex-end;">
475
+ <button id="close-settings-btn" style="padding: 10px 20px; background: #5a2a2a; color: white; border: none; border-radius: 4px; cursor: pointer;">
476
+ Cancel
477
+ </button>
478
+ <button id="save-settings-btn" style="padding: 10px 20px; background: #2a7c4a; color: white; border: none; border-radius: 4px; cursor: pointer;">
479
+ Save Settings
480
+ </button>
481
+ </div>
482
+ </div>
483
+ </div>
484
+
485
+ <!-- Test Judges Panel -->
486
+ <div id="test-judge-panel" style="display: none; position: fixed; top: 0; left: 0; width: 100%; height: 100%; background: rgba(0,0,0,0.8); z-index: 1000; padding: 40px; overflow-y: auto;">
487
+ <div style="max-width: 1000px; margin: 0 auto; background: #1a1a1a; border-radius: 8px; padding: 30px; border: 1px solid #3a2a4a;">
488
+ <h2 style="color: #fff; margin-bottom: 10px;">Test Judge</h2>
489
+ <p style="color: #888; font-size: 13px; margin-bottom: 25px;">
490
+ Test your judge on sample data to see exactly what inputs/outputs it receives
491
+ </p>
492
+
493
+ <!-- Configuration -->
494
+ <div style="display: grid; grid-template-columns: 1fr 1fr; gap: 20px; margin-bottom: 25px;">
495
+ <div>
496
+ <label style="display: block; color: #aaa; margin-bottom: 8px; font-size: 14px;">Select Judge:</label>
497
+ <select id="test-judge-select" style="width: 100%; padding: 10px; background: #2a2a2a; color: #fff; border: 1px solid #3a3a3a; border-radius: 4px; font-size: 14px;">
498
+ <option value="">Loading judges...</option>
499
+ </select>
500
+ </div>
501
+
502
+ <div>
503
+ <label style="display: block; color: #aaa; margin-bottom: 8px; font-size: 14px;">Weak Model Data:</label>
504
+ <select id="test-weak-model-select" style="width: 100%; padding: 10px; background: #2a2a2a; color: #fff; border: 1px solid #3a3a3a; border-radius: 4px; font-size: 14px;">
505
+ <option value="">Loading weak model files...</option>
506
+ </select>
507
+ </div>
508
+ </div>
509
+
510
+ <div style="margin-bottom: 20px;">
511
+ <label style="display: block; color: #aaa; margin-bottom: 8px; font-size: 14px;">Number of Samples:</label>
512
+ <input type="number" id="test-num-samples" value="5" min="1" max="50"
513
+ style="width: 150px; padding: 10px; background: #2a2a2a; color: #fff; border: 1px solid #3a3a3a; border-radius: 4px; font-size: 14px;">
514
+ <span style="color: #666; font-size: 12px; margin-left: 10px;">Max: 50</span>
515
+ </div>
516
+
517
+ <!-- Judge Model -->
518
+ <div style="margin-bottom: 20px;">
519
+ <label style="display: block; color: #aaa; margin-bottom: 8px; font-size: 14px;">Judge Model:</label>
520
+ <input type="text" id="test-judge-model"
521
+ style="width: 100%; padding: 10px; background: #2a2a2a; color: #fff; border: 1px solid #3a3a3a; border-radius: 4px; font-size: 14px;"
522
+ placeholder="e.g., openai/gpt-5, anthropic/claude-3.5-sonnet">
523
+ <div style="color: #666; font-size: 12px; margin-top: 5px;">
524
+ Override the judge's model for this test. Uses LiteLLM format (e.g., <code style="color: #aaa;">openai/gpt-5</code>, <code style="color: #aaa;">anthropic/claude-3.5-sonnet</code>)
525
+ </div>
526
+ </div>
527
+
528
+ <!-- Judge Prompt -->
529
+ <div style="margin-bottom: 30px;">
530
+ <label style="display: block; color: #aaa; margin-bottom: 8px; font-size: 14px;">Judge Prompt:</label>
531
+ <textarea id="test-judge-prompt"
532
+ style="width: 100%; min-height: 200px; padding: 10px; background: #2a2a2a; color: #fff; border: 1px solid #3a3a3a; border-radius: 4px; font-size: 13px; font-family: 'Courier New', monospace; resize: vertical;"
533
+ placeholder="Select a judge to load its prompt..."></textarea>
534
+ <div style="color: #666; font-size: 12px; margin-top: 5px;">
535
+ Edit the prompt and test changes, or save to update the judge permanently
536
+ </div>
537
+ </div>
538
+
539
+ <!-- Actions -->
540
+ <div style="display: flex; gap: 10px; margin-bottom: 30px;">
541
+ <button id="run-test-judge-btn" style="padding: 10px 20px; background: #6a4a7e; color: white; border: none; border-radius: 4px; cursor: pointer; font-weight: 500;">
542
+ Run Test
543
+ </button>
544
+ <button id="save-test-judge-prompt-btn" style="padding: 10px 20px; background: #2a7c4a; color: white; border: none; border-radius: 4px; cursor: pointer;">
545
+ Save Prompt to Judge
546
+ </button>
547
+ <button id="close-test-judge-btn" style="padding: 10px 20px; background: #5a2a2a; color: white; border: none; border-radius: 4px; cursor: pointer;">
548
+ Close
549
+ </button>
550
+ </div>
551
+
552
+ <!-- Results -->
553
+ <div id="test-judge-results" style="display: none;">
554
+ <h3 style="color: #4a9eff; margin-bottom: 15px;">Test Results</h3>
555
+ <div id="test-judge-results-content" style="max-height: 600px; overflow-y: auto;">
556
+ <!-- Results populated here -->
557
+ </div>
558
+ </div>
559
+ </div>
560
+ </div>
561
+
562
+ <!-- End-to-End Test Panel -->
563
+ <div id="e2e-panel" style="display: none; position: fixed; top: 0; left: 0; width: 100%; height: 100%; background: rgba(0,0,0,0.8); z-index: 1000; padding: 40px; overflow-y: auto;">
564
+ <div style="max-width: 800px; margin: 0 auto; background: #1a1a1a; border-radius: 8px; padding: 30px; border: 1px solid #4a2a4a;">
565
+ <h2 style="color: #fff; margin-bottom: 10px;">⚡ Run End-to-End Test</h2>
566
+ <p style="color: #888; font-size: 13px; margin-bottom: 25px;">
567
+ This will automatically: Export selected traces → Run weak models → Evaluate with judge
568
+ </p>
569
+
570
+ <!-- Weak Model Selection -->
571
+ <div style="margin-bottom: 25px;">
572
+ <h3 style="color: #fff; font-size: 16px; margin-bottom: 15px;">1. Select Weak Models</h3>
573
+
574
+ <div style="margin-bottom: 15px;">
575
+ <label style="display: block; color: #aaa; margin-bottom: 8px; font-size: 13px;">W&B Models:</label>
576
+ <div id="e2e-wandb-models" style="max-height: 150px; overflow-y: auto; background: #0f0f0f; padding: 10px; border-radius: 4px; border: 1px solid #2a2a2a;">
577
+ <!-- Populated dynamically -->
578
+ </div>
579
+ </div>
580
+
581
+ <div style="margin-bottom: 15px;">
582
+ <label style="display: block; color: #aaa; margin-bottom: 8px; font-size: 13px;">OpenRouter Models (optional):</label>
583
+ <textarea id="e2e-openrouter-models" placeholder="Enter OpenRouter models (one per line)&#10;e.g.,&#10;meta-llama/llama-3.3-70b-instruct&#10;anthropic/claude-3.5-sonnet"
584
+ style="width: 100%; padding: 8px; background: #2a2a2a; color: #fff; border: 1px solid #3a3a3a; border-radius: 4px; font-size: 13px; min-height: 80px; font-family: monospace;"></textarea>
585
+ <div style="color: #666; font-size: 11px; margin-top: 5px;">One model per line</div>
586
+ </div>
587
+
588
+ <div>
589
+ <label style="display: block; color: #aaa; margin-bottom: 8px; font-size: 13px;">Max Examples (optional):</label>
590
+ <input type="number" id="e2e-num-examples" placeholder="Leave empty to use all selected traces"
591
+ style="width: 200px; padding: 8px; background: #2a2a2a; color: #fff; border: 1px solid #3a3a3a; border-radius: 4px; font-size: 13px;">
592
+ </div>
593
+ </div>
594
+
595
+ <!-- Judge Selection -->
596
+ <div style="margin-bottom: 30px;">
597
+ <h3 style="color: #fff; font-size: 16px; margin-bottom: 15px;">2. Select Judge</h3>
598
+ <select id="e2e-judge" style="width: 100%; padding: 10px; background: #2a2a2a; color: #fff; border: 1px solid #3a3a3a; border-radius: 4px; font-size: 14px;">
599
+ <option value="">Loading judges...</option>
600
+ </select>
601
+ </div>
602
+
603
+ <!-- Actions -->
604
+ <div style="display: flex; gap: 10px; justify-content: flex-end;">
605
+ <button id="close-e2e-btn" style="padding: 10px 20px; background: #5a2a2a; color: white; border: none; border-radius: 4px; cursor: pointer;">
606
+ Cancel
607
+ </button>
608
+ <button id="run-e2e-btn" style="padding: 10px 20px; background: #7a4a9e; color: white; border: none; border-radius: 4px; cursor: pointer; font-weight: 500;">
609
+ ⚡ Run Test
610
+ </button>
611
+ </div>
612
+ </div>
613
+ </div>
614
+
615
+ <!-- End-to-End Progress Panel -->
616
+ <div id="e2e-progress-panel" style="display: none; position: fixed; top: 0; left: 0; width: 100%; height: 100%; background: rgba(0,0,0,0.9); z-index: 1100; padding: 40px; overflow-y: auto;">
617
+ <div style="max-width: 800px; margin: 0 auto; background: #1a1a1a; border-radius: 8px; padding: 30px; border: 1px solid #4a2a4a;">
618
+ <h2 style="color: #fff; margin-bottom: 20px;">Running End-to-End Test</h2>
619
+
620
+ <!-- Overall Progress -->
621
+ <div style="margin-bottom: 30px;">
622
+ <div style="color: #4a9eff; font-size: 14px; margin-bottom: 10px;" id="e2e-step-label">Step 1/3: Exporting traces...</div>
623
+ <div style="width: 100%; height: 8px; background: #2a2a2a; border-radius: 4px; overflow: hidden;">
624
+ <div id="e2e-overall-progress" style="height: 100%; background: #7a4a9e; width: 0%; transition: width 0.3s;"></div>
625
+ </div>
626
+ </div>
627
+
628
+ <!-- Detailed Progress -->
629
+ <div id="e2e-progress-text" style="color: #888; font-family: monospace; font-size: 12px; white-space: pre-wrap; background: #0f0f0f; padding: 15px; border-radius: 4px; max-height: 400px; overflow-y: auto;"></div>
630
+
631
+ <!-- Results -->
632
+ <div id="e2e-results" style="display: none; margin-top: 20px;">
633
+ <h3 style="color: #4a9eff; margin-bottom: 15px;">✓ Test Complete!</h3>
634
+ <div id="e2e-results-content" style="background: #0f0f0f; padding: 15px; border-radius: 4px;"></div>
635
+ <button id="close-e2e-progress-btn" style="margin-top: 20px; padding: 10px 20px; background: #2a7c4a; color: white; border: none; border-radius: 4px; cursor: pointer;">
636
+ Close
637
+ </button>
638
+ </div>
639
+ </div>
640
+ </div>
430
641
  </div>
431
642
 
432
643
  <script>
@@ -1287,6 +1498,517 @@
1287
1498
  console.error('Delete error:', error);
1288
1499
  }
1289
1500
  }
1501
+
1502
+ // === SETTINGS ===
1503
+
1504
+ // Load and display settings
1505
/**
 * Fetch the saved project settings from the server and copy them into the
 * two inputs of the Settings panel.
 *
 * A non-2xx response is treated as a failure so that an error payload cannot
 * silently blank out the inputs. Failures are logged to the console and the
 * inputs are left untouched; nothing is thrown to the caller.
 *
 * @returns {Promise<void>}
 */
async function loadSettings() {
    try {
        const response = await fetch('/settings');
        if (!response.ok) {
            throw new Error(`Failed to load settings (HTTP ${response.status})`);
        }

        const settings = await response.json();
        // Missing or empty values are shown as an empty input.
        document.getElementById('settings-inference-project').value = settings.inference_project || '';
        document.getElementById('settings-evaluation-project').value = settings.evaluation_project || '';
    } catch (error) {
        console.error('Error loading settings:', error);
    }
}
1515
+
1516
+ // Open settings panel
1517
// Refresh the inputs from the server first, then reveal the Settings overlay.
document.getElementById('open-settings-btn').addEventListener('click', async () => {
    await loadSettings();
    const settingsOverlay = document.getElementById('settings-panel');
    settingsOverlay.style.display = 'block';
});
1521
+
1522
+ // Close settings panel
1523
// "Cancel" hides the Settings overlay; nothing is sent to the server.
document.getElementById('close-settings-btn').addEventListener('click', () => {
    const settingsOverlay = document.getElementById('settings-panel');
    settingsOverlay.style.display = 'none';
});
1526
+
1527
+ // Save settings
1528
// Validate the two project names typed into the Settings panel and POST
// them to /settings. The panel is closed only when the server reports success.
document.getElementById('save-settings-btn').addEventListener('click', async () => {
    const settings = {
        inference_project: document.getElementById('settings-inference-project').value.trim(),
        evaluation_project: document.getElementById('settings-evaluation-project').value.trim()
    };

    // Both names are mandatory; refuse to send a partial configuration.
    if (!settings.inference_project || !settings.evaluation_project) {
        alert('Both project fields are required');
        return;
    }

    try {
        const response = await fetch('/settings', {
            method: 'POST',
            headers: { 'Content-Type': 'application/json' },
            body: JSON.stringify(settings)
        });

        // Check the status before parsing: a non-JSON error page would
        // otherwise surface to the user as an unrelated JSON parse error.
        if (!response.ok) {
            throw new Error(`Server responded with HTTP ${response.status}`);
        }

        const result = await response.json();
        if (result.status === 'success') {
            alert('Settings saved! Please restart the server for changes to take effect.');
            document.getElementById('settings-panel').style.display = 'none';
        } else {
            alert('Error saving settings');
        }
    } catch (error) {
        alert('Error saving settings: ' + error.message);
    }
});
1557
+
1558
+ // === TEST JUDGES ===
1559
+
1560
// Judges most recently fetched for the Test Judge panel. The option values of
// #test-judge-select are indexes into this array.
let testJudgesData = [];

// Open the Test Judge panel: reload the judge list and the weak-model output
// files, pre-fill the prompt/model inputs from the first judge, then show the
// panel with the previous results hidden.
document.getElementById('open-test-judge-btn').addEventListener('click', async () => {
    // Rebuild a <select> from [label, value] pairs. Options are created with
    // `new Option(text, value)` rather than an HTML string, so judge names and
    // file names are always treated as plain text and cannot inject markup.
    const fillSelect = (select, entries, emptyLabel) => {
        select.options.length = 0;
        if (entries.length === 0) {
            select.add(new Option(emptyLabel, ''));
            return;
        }
        entries.forEach(([label, value]) => {
            select.add(new Option(label, value));
        });
    };

    // Load judges. On failure the select keeps whatever it showed before.
    try {
        const response = await fetch('/list_judges');
        if (!response.ok) {
            throw new Error(`Failed to list judges (HTTP ${response.status})`);
        }
        const data = await response.json();
        testJudgesData = data.judges || []; // Store globally

        fillSelect(
            document.getElementById('test-judge-select'),
            testJudgesData.map((judge, idx) => [`${judge.name} (${judge.type})`, idx]),
            'No judges available'
        );

        // The first option is selected by default, so mirror its prompt/model.
        if (testJudgesData[0]) {
            document.getElementById('test-judge-prompt').value = testJudgesData[0].prompt || '';
            document.getElementById('test-judge-model').value = testJudgesData[0].model || '';
        }
    } catch (error) {
        console.error('Error loading judges:', error);
    }

    // Load weak model files. Handled separately so one failing request does
    // not prevent the other list from loading.
    try {
        const response = await fetch('/list_weak_models');
        if (!response.ok) {
            throw new Error(`Failed to list weak models (HTTP ${response.status})`);
        }
        const data = await response.json();

        fillSelect(
            document.getElementById('test-weak-model-select'),
            (data.files || []).map((f) => [f.weak_model || f.filename, f.filename]),
            'No weak model files available'
        );
    } catch (error) {
        console.error('Error loading weak models:', error);
    }

    document.getElementById('test-judge-panel').style.display = 'block';
    document.getElementById('test-judge-results').style.display = 'none';
});
1608
+
1609
+ // When judge selection changes, update the prompt and model
1610
// Keep the prompt and model inputs in sync with the judge picked in the
// dropdown. Option values are indexes into testJudgesData; the placeholder
// option (empty value) and unknown indexes are ignored.
document.getElementById('test-judge-select').addEventListener('change', (event) => {
    const selectedIdx = parseInt(event.target.value);
    const chosenJudge = isNaN(selectedIdx) ? undefined : testJudgesData[selectedIdx];
    if (!chosenJudge) {
        return;
    }

    document.getElementById('test-judge-prompt').value = chosenJudge.prompt || '';
    document.getElementById('test-judge-model').value = chosenJudge.model || '';
});
1618
+
1619
+ // Close test judge panel
1620
// "Close" hides the Test Judge overlay; the inputs keep their current values.
document.getElementById('close-test-judge-btn').addEventListener('click', () => {
    const testJudgeOverlay = document.getElementById('test-judge-panel');
    testJudgeOverlay.style.display = 'none';
});
1623
+
1624
+ // Run test judge
1625
// Run the selected judge, using the prompt and model currently typed into the
// panel, against a few samples of a weak-model output file, and render what
// the judge received and returned for each sample.
document.getElementById('run-test-judge-btn').addEventListener('click', async () => {
    // Sample fields contain model-generated text. Everything interpolated into
    // the results markup is HTML-escaped so it is displayed verbatim instead
    // of being parsed as markup.
    const escapeHtml = (value) => String(value)
        .replace(/&/g, '&amp;')
        .replace(/</g, '&lt;')
        .replace(/>/g, '&gt;')
        .replace(/"/g, '&quot;')
        .replace(/'/g, '&#39;');

    const judgeIndex = document.getElementById('test-judge-select').value;
    const weakModelFile = document.getElementById('test-weak-model-select').value;
    // Empty or non-numeric input falls back to 5 samples.
    const numSamples = parseInt(document.getElementById('test-num-samples').value, 10) || 5;
    const editedPrompt = document.getElementById('test-judge-prompt').value;
    const editedModel = document.getElementById('test-judge-model').value;

    // judgeIndex is a string, so index "0" is truthy and passes this check;
    // only the empty placeholder value is rejected.
    if (!judgeIndex) {
        alert('Please select a judge');
        return;
    }

    if (!weakModelFile) {
        alert('Please select a weak model file');
        return;
    }

    if (!editedPrompt.trim()) {
        alert('Please enter a judge prompt');
        return;
    }

    if (!editedModel.trim()) {
        alert('Please enter a judge model');
        return;
    }

    // Copy the stored judge and override prompt/model with the edited values,
    // leaving the entry in testJudgesData unchanged.
    const judge = { ...testJudgesData[parseInt(judgeIndex, 10)] };
    judge.prompt = editedPrompt;
    judge.model = editedModel;

    // Call test endpoint
    try {
        const response = await fetch('/test_judge', {
            method: 'POST',
            headers: { 'Content-Type': 'application/json' },
            body: JSON.stringify({
                judge: judge,
                weak_model_file: weakModelFile,
                num_samples: numSamples
            })
        });

        if (!response.ok) {
            throw new Error('Failed to test judge');
        }

        const result = await response.json();
        if (!Array.isArray(result.samples)) {
            throw new Error('Response did not contain a list of samples');
        }

        // Display results
        const resultsDiv = document.getElementById('test-judge-results-content');
        resultsDiv.innerHTML = result.samples.map((sample, idx) => `
            <div style="margin-bottom: 20px; padding: 20px; background: #0f0f0f; border-radius: 8px; border: 1px solid #2a2a2a;">
                <h4 style="color: #4a9eff; margin-bottom: 15px;">Sample ${idx + 1} of ${result.samples.length}</h4>

                <div style="margin-bottom: 15px;">
                    <div style="color: #888; font-size: 12px; margin-bottom: 5px;">Question:</div>
                    <div style="color: #ccc; font-size: 13px; background: #1a1a1a; padding: 10px; border-radius: 4px; max-height: 100px; overflow-y: auto;">${escapeHtml(sample.question || 'N/A')}</div>
                </div>

                <div style="margin-bottom: 15px;">
                    <div style="color: #888; font-size: 12px; margin-bottom: 5px;">Strong Model Output:</div>
                    <div style="color: #ccc; font-size: 13px; background: #1a1a1a; padding: 10px; border-radius: 4px; max-height: 100px; overflow-y: auto;">${escapeHtml(sample.strong_output || 'N/A')}</div>
                </div>

                <div style="margin-bottom: 15px;">
                    <div style="color: #888; font-size: 12px; margin-bottom: 5px;">Weak Model Output:</div>
                    <div style="color: #ccc; font-size: 13px; background: #1a1a1a; padding: 10px; border-radius: 4px; max-height: 100px; overflow-y: auto;">${escapeHtml(sample.weak_output || 'N/A')}</div>
                </div>

                <div style="margin-bottom: 15px;">
                    <div style="color: #888; font-size: 12px; margin-bottom: 5px;">Judge Prompt (filled):</div>
                    <pre style="color: #aaa; font-size: 11px; background: #1a1a1a; padding: 10px; border-radius: 4px; max-height: 150px; overflow-y: auto; white-space: pre-wrap; font-family: monospace;">${escapeHtml(sample.judge_prompt)}</pre>
                </div>

                <div style="margin-bottom: 15px;">
                    <div style="color: #888; font-size: 12px; margin-bottom: 5px;">Raw Judge Response:</div>
                    <pre style="color: #f4d03f; font-size: 11px; background: #1a1a1a; padding: 10px; border-radius: 4px; max-height: 150px; overflow-y: auto; white-space: pre-wrap; font-family: monospace;">${escapeHtml(sample.raw_response)}</pre>
                </div>

                <div>
                    <div style="color: #888; font-size: 12px; margin-bottom: 5px;">Parsed Scores:</div>
                    <div style="color: #4a9eff; font-size: 13px; background: #1a1a1a; padding: 10px; border-radius: 4px; font-family: monospace;">${escapeHtml(JSON.stringify(sample.parsed_scores, null, 2))}</div>
                </div>
            </div>
        `).join('');

        document.getElementById('test-judge-results').style.display = 'block';

    } catch (error) {
        alert('Error testing judge: ' + error.message);
        console.error('Test error:', error);
    }
});
1720
+
1721
+ // Save prompt to judge
1722
// Persist the prompt currently in the textarea onto the selected judge via
// /save_judge, after an explicit confirmation. Only the prompt is overridden;
// the model typed into the panel is not part of this save.
document.getElementById('save-test-judge-prompt-btn').addEventListener('click', async () => {
    const judgeIndex = document.getElementById('test-judge-select').value;
    const editedPrompt = document.getElementById('test-judge-prompt').value;

    if (!judgeIndex) {
        alert('Please select a judge');
        return;
    }

    if (!editedPrompt.trim()) {
        alert('Please enter a judge prompt');
        return;
    }

    // Resolve the judge before building the payload: a stale index would
    // otherwise spread `undefined` and send a judge with nothing but a prompt.
    const position = parseInt(judgeIndex, 10);
    const storedJudge = testJudgesData[position];
    if (!storedJudge) {
        alert('Please select a judge');
        return;
    }
    const judge = { ...storedJudge, prompt: editedPrompt };

    // Confirm with user
    if (!confirm(`Save this prompt to judge "${judge.name}"? This will permanently update the judge.`)) {
        return;
    }

    // Call save endpoint
    try {
        const response = await fetch('/save_judge', {
            method: 'POST',
            headers: { 'Content-Type': 'application/json' },
            body: JSON.stringify({ judge: judge })
        });

        if (!response.ok) {
            throw new Error('Failed to save judge');
        }

        const result = await response.json();

        // Update local judges data. If the response carries no judge list,
        // keep the current list and patch the saved entry in place rather
        // than emptying it (the dropdown still points at these indexes).
        if (Array.isArray(result.judges)) {
            testJudgesData = result.judges;
        } else {
            testJudgesData[position] = judge;
        }

        alert('Judge prompt saved successfully!');
    } catch (error) {
        alert('Error saving judge: ' + error.message);
        console.error('Save error:', error);
    }
});
1768
+
1769
+ // === END-TO-END TEST ===
1770
+
1771
+ // Open E2E panel
1772
// Open the End-to-End configuration panel. Requires at least one selected
// trace; rebuilds the W&B model checkbox list and reloads the judge dropdown
// every time the panel is opened.
document.getElementById('open-e2e-btn').addEventListener('click', async () => {
    if (selectedTraces.size === 0) {
        alert('Please select at least one trace first!');
        return;
    }

    // Populate W&B models. Elements are created through the DOM API instead of
    // an HTML string so model names are always inserted as plain text.
    const wandbModelsDiv = document.getElementById('e2e-wandb-models');
    wandbModelsDiv.innerHTML = '';
    AVAILABLE_MODELS.forEach((model) => {
        const label = document.createElement('label');
        label.style.cssText = 'display: block; padding: 5px 0; color: #ccc; cursor: pointer;';

        const checkbox = document.createElement('input');
        checkbox.type = 'checkbox';
        checkbox.className = 'e2e-model-checkbox';
        checkbox.value = model;
        checkbox.style.marginRight = '8px';

        label.appendChild(checkbox);
        label.appendChild(document.createTextNode(model));
        wandbModelsDiv.appendChild(label);
    });

    // Load judges. Option values are indexes into the list returned by
    // /list_judges. On failure the dropdown keeps its previous content.
    try {
        const response = await fetch('/list_judges');
        if (!response.ok) {
            throw new Error(`Failed to list judges (HTTP ${response.status})`);
        }
        const data = await response.json();
        const judges = data.judges || [];
        const judgeSelect = document.getElementById('e2e-judge');

        judgeSelect.options.length = 0;
        if (judges.length > 0) {
            judges.forEach((judge, idx) => {
                judgeSelect.add(new Option(`${judge.name} (${judge.type})`, idx));
            });
        } else {
            judgeSelect.add(new Option('No judges available - create one first', ''));
        }
    } catch (error) {
        console.error('Error loading judges:', error);
    }

    document.getElementById('e2e-panel').style.display = 'block';
});
1806
+
1807
+ // Close E2E panel
1808
// "Cancel" hides the End-to-End configuration overlay without starting a run.
document.getElementById('close-e2e-btn').addEventListener('click', () => {
    const e2eOverlay = document.getElementById('e2e-panel');
    e2eOverlay.style.display = 'none';
});
1811
+
1812
+ // Close E2E progress
1813
// "Close" dismisses the progress overlay and hides the results section again
// so the next run starts without the previous results showing.
document.getElementById('close-e2e-progress-btn').addEventListener('click', () => {
    ['e2e-progress-panel', 'e2e-results'].forEach((panelId) => {
        document.getElementById(panelId).style.display = 'none';
    });
});
1817
+
1818
+ // Run end-to-end test
1819
// Run the end-to-end pipeline: export the selected traces, run every chosen
// weak model over the export, then evaluate each generated output file with
// the chosen judge. Progress and errors are written to the progress overlay.
document.getElementById('run-e2e-btn').addEventListener('click', async () => {
    // Server-provided strings are escaped before being placed in result markup.
    const escapeHtml = (value) => String(value)
        .replace(/&/g, '&amp;')
        .replace(/</g, '&lt;')
        .replace(/>/g, '&gt;')
        .replace(/"/g, '&quot;')
        .replace(/'/g, '&#39;');

    // Run `work()` while polling /progress/<taskId> every 300 ms and reporting
    // the completion percentage through `onPercent`. The timer is cleared in a
    // `finally`, so it stops even when `work()` rejects.
    const runWithProgressPolling = async (taskId, pollErrorLabel, onPercent, work) => {
        let stillRunning = true;
        const timer = setInterval(async () => {
            try {
                const resp = await fetch(`/progress/${taskId}`);
                if (!resp.ok) {
                    return;
                }
                const progress = await resp.json();
                // Ignore late replies and a zero/missing total, which would
                // otherwise produce a NaN width.
                if (!stillRunning || !(progress.total > 0)) {
                    return;
                }
                onPercent((progress.current / progress.total) * 100);
            } catch (e) {
                console.error(pollErrorLabel, e);
            }
        }, 300);

        try {
            return await work();
        } finally {
            stillRunning = false;
            clearInterval(timer);
        }
    };

    // Gather selected models
    const selectedWandbModels = Array.from(document.querySelectorAll('.e2e-model-checkbox:checked')).map(cb => cb.value);
    const openRouterModelsText = document.getElementById('e2e-openrouter-models').value.trim();
    const openRouterModels = openRouterModelsText
        .split('\n')
        .map(m => m.trim())
        .filter(m => m.length > 0);
    const allModels = [...selectedWandbModels, ...openRouterModels];

    if (allModels.length === 0) {
        alert('Please select at least one model!');
        return;
    }

    const judgeIndex = document.getElementById('e2e-judge').value;
    if (!judgeIndex) {
        alert('Please select a judge!');
        return;
    }

    // An empty input means "use all selected traces" and is sent as null.
    const numExamplesRaw = document.getElementById('e2e-num-examples').value;
    const numExamples = numExamplesRaw ? parseInt(numExamplesRaw, 10) : null;

    // Hide config panel, show progress panel
    document.getElementById('e2e-panel').style.display = 'none';
    document.getElementById('e2e-progress-panel').style.display = 'block';

    const progressText = document.getElementById('e2e-progress-text');
    const stepLabel = document.getElementById('e2e-step-label');
    const overallProgress = document.getElementById('e2e-overall-progress');
    const resultsPanel = document.getElementById('e2e-results');
    const resultsHeading = resultsPanel.querySelector('h3');
    const resultsContent = document.getElementById('e2e-results-content');

    progressText.textContent = '';
    resultsPanel.style.display = 'none';

    try {
        // === STEP 1: Export Selected Traces ===
        stepLabel.textContent = 'Step 1/3: Exporting selected traces...';
        overallProgress.style.width = '10%';

        // Resolve the judge before any long-running work so a stale selection
        // fails immediately instead of after export and inference.
        const judgesResponse = await fetch('/list_judges');
        if (!judgesResponse.ok) {
            throw new Error('Failed to load judges');
        }
        const judgesData = await judgesResponse.json();
        const judge = (judgesData.judges || [])[parseInt(judgeIndex, 10)];
        if (!judge) {
            throw new Error('Selected judge is no longer available');
        }

        progressText.textContent += '📦 Exporting selected traces...\n';

        // Get full trace objects for selected IDs
        const selectedTraceObjects = allTraces.filter(t => selectedTraces.has(t.id));

        const exportResponse = await fetch('/export_strong_traces', {
            method: 'POST',
            headers: { 'Content-Type': 'application/json' },
            body: JSON.stringify({
                traces: selectedTraceObjects,
                nickname: `e2e_export_${Date.now()}`
            })
        });

        if (!exportResponse.ok) {
            throw new Error('Failed to export traces');
        }

        const exportResult = await exportResponse.json();
        const exportFilename = exportResult.filename;
        progressText.textContent += `✓ Exported ${exportResult.count} traces to ${exportFilename}\n\n`;
        overallProgress.style.width = '20%';

        // === STEP 2: Run Weak Model Inference ===
        stepLabel.textContent = 'Step 2/3: Running weak model inference...';
        progressText.textContent += `⚙️ Running inference with ${allModels.length} model(s)...\n`;

        const taskId = `inference_${Date.now()}`;
        const inferenceResponse = await runWithProgressPolling(
            taskId,
            'Error polling progress:',
            (percent) => {
                // Map inference progress to 20-60% of overall
                overallProgress.style.width = `${20 + (percent * 0.4)}%`;
            },
            () => fetch('/run_inference', {
                method: 'POST',
                headers: { 'Content-Type': 'application/json' },
                body: JSON.stringify({
                    models: allModels,
                    strong_export_file: exportFilename,
                    num_examples: numExamples,
                    task_id: taskId
                })
            })
        );

        if (!inferenceResponse.ok) {
            throw new Error('Failed to run inference');
        }

        progressText.textContent += `✓ Generated outputs for ${allModels.length} model(s)\n\n`;
        overallProgress.style.width = '60%';

        // === STEP 3: Run Evaluations ===
        stepLabel.textContent = 'Step 3/3: Running evaluations...';
        progressText.textContent += `📊 Running evaluations with judge: ${judge.name}...\n`;

        const evaluationResults = [];

        // Get list of weak model files that were just generated
        const weakModelsResponse = await fetch('/list_weak_models');
        const weakModelsData = await weakModelsResponse.json();

        // Filter to only the models we just ran. A file matches when its name
        // contains the model id with '/' turned into '_'. Both the first-slash
        // and the every-slash form are accepted, so ids with several slashes
        // can match too.
        // NOTE(review): the server's file-naming rule is not visible here, and
        // files from earlier runs of the same model will match as well - confirm.
        const matchesSelectedModel = (filename) => allModels.some(
            (m) => filename.includes(m.replace('/', '_')) || filename.includes(m.split('/').join('_'))
        );
        const weakModelFiles = (weakModelsData.files || [])
            .filter(f => matchesSelectedModel(f.filename))
            .map(f => f.filename);

        if (weakModelFiles.length === 0) {
            progressText.textContent += '⚠️ No generated output files matched the selected models\n';
        }

        for (let i = 0; i < weakModelFiles.length; i++) {
            const modelFile = weakModelFiles[i];
            const evalTaskId = `eval_${Date.now()}_${i}`;

            progressText.textContent += `\n[${i+1}/${weakModelFiles.length}] Evaluating ${modelFile}...\n`;

            const evalResponse = await runWithProgressPolling(
                evalTaskId,
                'Error polling eval progress:',
                (percent) => {
                    // Map eval progress to 60-100% of overall; each file owns
                    // an equal slice of that range.
                    const basePercent = 60 + (i / weakModelFiles.length) * 40;
                    const stepPercent = (percent / 100) * (40 / weakModelFiles.length);
                    overallProgress.style.width = `${basePercent + stepPercent}%`;
                },
                () => fetch('/run_evaluation', {
                    method: 'POST',
                    headers: { 'Content-Type': 'application/json' },
                    body: JSON.stringify({
                        model_file: modelFile,
                        judge: judge,
                        task_id: evalTaskId
                    })
                })
            );

            // A failed evaluation is reported and skipped; the remaining
            // files are still evaluated.
            if (evalResponse.ok) {
                const evalResult = await evalResponse.json();
                progressText.textContent += `  ✓ Complete: ${evalResult.examples_evaluated} examples evaluated\n`;
                evaluationResults.push(evalResult);
            } else {
                progressText.textContent += `  ✗ Error evaluating ${modelFile}\n`;
            }
        }

        overallProgress.style.width = '100%';
        stepLabel.textContent = 'Complete!';
        progressText.textContent += `\n✅ All evaluations complete!\n`;

        // Show results
        if (resultsHeading) {
            resultsHeading.textContent = '✓ Test Complete!';
        }
        resultsContent.innerHTML = evaluationResults.map(r => `
            <div style="margin-bottom: 15px; padding: 15px; background: #1a1a1a; border-radius: 4px; border: 1px solid #2a2a2a;">
                <div style="font-weight: bold; color: #fff; margin-bottom: 8px;">${escapeHtml(r.evaluation_name)}</div>
                <div style="font-size: 12px; color: #888; margin-bottom: 8px;">
                    ${escapeHtml(r.examples_evaluated)} examples evaluated
                </div>
                <a href="${escapeHtml(r.weave_url)}" target="_blank" style="color: #4a9eff; font-size: 13px;">View in Weave →</a>
            </div>
        `).join('');
        resultsPanel.style.display = 'block';

    } catch (error) {
        progressText.textContent += `\n\n❌ Error: ${error.message}\n`;
        stepLabel.textContent = 'Error occurred';

        // The overlay's only Close button lives inside #e2e-results. Reveal
        // that section on failure too, otherwise the full-screen overlay can
        // only be dismissed by reloading the page.
        if (resultsHeading) {
            resultsHeading.textContent = '✗ Test Failed';
        }
        resultsContent.textContent = 'The run stopped before completing. See the log above for details.';
        resultsPanel.style.display = 'block';
    }
});
1290
2012
  </script>
1291
2013
  </body>
1292
2014
  </html>