quickdistill 0.1.5__py3-none-any.whl → 0.1.7__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- quickdistill/__init__.py +1 -1
- quickdistill/__pycache__/__init__.cpython-310.pyc +0 -0
- quickdistill/__pycache__/server.cpython-310.pyc +0 -0
- quickdistill/server.py +330 -14
- quickdistill/static/judge_manager.html +183 -16
- quickdistill/static/trace_viewer.html +787 -13
- {quickdistill-0.1.5.dist-info → quickdistill-0.1.7.dist-info}/METADATA +1 -1
- quickdistill-0.1.7.dist-info/RECORD +17 -0
- quickdistill-0.1.5.dist-info/RECORD +0 -17
- {quickdistill-0.1.5.dist-info → quickdistill-0.1.7.dist-info}/WHEEL +0 -0
- {quickdistill-0.1.5.dist-info → quickdistill-0.1.7.dist-info}/entry_points.txt +0 -0
- {quickdistill-0.1.5.dist-info → quickdistill-0.1.7.dist-info}/top_level.txt +0 -0
|
@@ -316,6 +316,22 @@
|
|
|
316
316
|
Manage Judges
|
|
317
317
|
</a>
|
|
318
318
|
|
|
319
|
+
<button id="open-test-judge-btn" style="padding: 8px 16px; background: #6a4a7e; color: white; border: none; border-radius: 4px; cursor: pointer;">
|
|
320
|
+
Test Judges
|
|
321
|
+
</button>
|
|
322
|
+
|
|
323
|
+
<button id="open-settings-btn" style="padding: 8px 16px; background: #5a5a5a; color: white; border: none; border-radius: 4px; cursor: pointer;">
|
|
324
|
+
Settings
|
|
325
|
+
</button>
|
|
326
|
+
|
|
327
|
+
<div style="margin: 20px 0; padding: 15px; background: #2a1a2a; border-radius: 8px; border: 1px solid #4a2a4a;">
|
|
328
|
+
<div style="color: #aaa; font-size: 13px; margin-bottom: 10px;">Automatic Workflow:</div>
|
|
329
|
+
<button id="open-e2e-btn" style="padding: 10px 20px; background: #7a4a9e; color: white; border: none; border-radius: 4px; cursor: pointer; font-weight: 500;">
|
|
330
|
+
⚡ Run End-to-End Test
|
|
331
|
+
</button>
|
|
332
|
+
<div style="color: #666; font-size: 11px; margin-top: 8px;">Export → Generate → Evaluate (all in one)</div>
|
|
333
|
+
</div>
|
|
334
|
+
|
|
319
335
|
<div class="stats">
|
|
320
336
|
<div>Total: <span id="total-count">0</span></div>
|
|
321
337
|
<div>Shown: <span id="shown-count">0</span></div>
|
|
@@ -375,6 +391,9 @@
|
|
|
375
391
|
|
|
376
392
|
<div id="inference-progress" style="display: none; margin-top: 20px; padding: 15px; background: #0f0f0f; border-radius: 4px;">
|
|
377
393
|
<div style="color: #4a9eff; margin-bottom: 10px;">Running inference...</div>
|
|
394
|
+
<div id="inference-progress-bar" style="width: 100%; height: 6px; background: #2a2a2a; border-radius: 3px; margin-bottom: 15px; overflow: hidden;">
|
|
395
|
+
<div id="inference-progress-fill" style="height: 100%; background: #4a9eff; width: 0%; transition: width 0.3s;"></div>
|
|
396
|
+
</div>
|
|
378
397
|
<div id="progress-text" style="color: #888; font-family: monospace; font-size: 12px; white-space: pre-wrap;"></div>
|
|
379
398
|
</div>
|
|
380
399
|
</div>
|
|
@@ -424,6 +443,193 @@
|
|
|
424
443
|
</div>
|
|
425
444
|
</div>
|
|
426
445
|
</div>
|
|
446
|
+
|
|
447
|
+
<!-- Settings Panel -->
|
|
448
|
+
<div id="settings-panel" style="display: none; position: fixed; top: 0; left: 0; width: 100%; height: 100%; background: rgba(0,0,0,0.8); z-index: 1000; padding: 40px;">
|
|
449
|
+
<div style="max-width: 600px; margin: 0 auto; background: #1a1a1a; border-radius: 8px; padding: 30px; border: 1px solid #2a2a2a;">
|
|
450
|
+
<h2 style="color: #fff; margin-bottom: 20px;">Settings</h2>
|
|
451
|
+
|
|
452
|
+
<div style="margin-bottom: 20px;">
|
|
453
|
+
<label style="display: block; color: #aaa; margin-bottom: 8px; font-size: 14px;">W&B Inference Project</label>
|
|
454
|
+
<input type="text" id="settings-inference-project" placeholder="e.g., wandb_fc/quickstart_playground"
|
|
455
|
+
style="width: 100%; padding: 10px; background: #2a2a2a; color: #fff; border: 1px solid #3a3a3a; border-radius: 4px; font-size: 14px;">
|
|
456
|
+
<div style="color: #666; font-size: 12px; margin-top: 5px;">Used for running weak model inference</div>
|
|
457
|
+
</div>
|
|
458
|
+
|
|
459
|
+
<div style="margin-bottom: 30px;">
|
|
460
|
+
<label style="display: block; color: #aaa; margin-bottom: 8px; font-size: 14px;">W&B Evaluation Project</label>
|
|
461
|
+
<input type="text" id="settings-evaluation-project" placeholder="e.g., wandb_inference"
|
|
462
|
+
style="width: 100%; padding: 10px; background: #2a2a2a; color: #fff; border: 1px solid #3a3a3a; border-radius: 4px; font-size: 14px;">
|
|
463
|
+
<div style="color: #666; font-size: 12px; margin-top: 5px;">Used for logging evaluation results with Weave</div>
|
|
464
|
+
</div>
|
|
465
|
+
|
|
466
|
+
<div style="display: flex; gap: 10px; justify-content: flex-end;">
|
|
467
|
+
<button id="close-settings-btn" style="padding: 10px 20px; background: #5a2a2a; color: white; border: none; border-radius: 4px; cursor: pointer;">
|
|
468
|
+
Cancel
|
|
469
|
+
</button>
|
|
470
|
+
<button id="save-settings-btn" style="padding: 10px 20px; background: #2a7c4a; color: white; border: none; border-radius: 4px; cursor: pointer;">
|
|
471
|
+
Save Settings
|
|
472
|
+
</button>
|
|
473
|
+
</div>
|
|
474
|
+
</div>
|
|
475
|
+
</div>
|
|
476
|
+
|
|
477
|
+
<!-- Test Judges Panel -->
|
|
478
|
+
<div id="test-judge-panel" style="display: none; position: fixed; top: 0; left: 0; width: 100%; height: 100%; background: rgba(0,0,0,0.8); z-index: 1000; padding: 40px; overflow-y: auto;">
|
|
479
|
+
<div style="max-width: 1000px; margin: 0 auto; background: #1a1a1a; border-radius: 8px; padding: 30px; border: 1px solid #3a2a4a;">
|
|
480
|
+
<h2 style="color: #fff; margin-bottom: 10px;">Test Judge</h2>
|
|
481
|
+
<p style="color: #888; font-size: 13px; margin-bottom: 25px;">
|
|
482
|
+
Test your judge on sample data to see exactly what inputs/outputs it receives
|
|
483
|
+
</p>
|
|
484
|
+
|
|
485
|
+
<!-- Configuration -->
|
|
486
|
+
<div style="display: grid; grid-template-columns: 1fr 1fr; gap: 20px; margin-bottom: 25px;">
|
|
487
|
+
<div>
|
|
488
|
+
<label style="display: block; color: #aaa; margin-bottom: 8px; font-size: 14px;">Select Judge:</label>
|
|
489
|
+
<select id="test-judge-select" style="width: 100%; padding: 10px; background: #2a2a2a; color: #fff; border: 1px solid #3a3a3a; border-radius: 4px; font-size: 14px;">
|
|
490
|
+
<option value="">Loading judges...</option>
|
|
491
|
+
</select>
|
|
492
|
+
</div>
|
|
493
|
+
|
|
494
|
+
<div>
|
|
495
|
+
<label style="display: block; color: #aaa; margin-bottom: 8px; font-size: 14px;">Weak Model Data:</label>
|
|
496
|
+
<select id="test-weak-model-select" style="width: 100%; padding: 10px; background: #2a2a2a; color: #fff; border: 1px solid #3a3a3a; border-radius: 4px; font-size: 14px;">
|
|
497
|
+
<option value="">Loading weak model files...</option>
|
|
498
|
+
</select>
|
|
499
|
+
</div>
|
|
500
|
+
</div>
|
|
501
|
+
|
|
502
|
+
<div style="margin-bottom: 20px;">
|
|
503
|
+
<label style="display: block; color: #aaa; margin-bottom: 8px; font-size: 14px;">Number of Samples:</label>
|
|
504
|
+
<input type="number" id="test-num-samples" value="5" min="1" max="50"
|
|
505
|
+
style="width: 150px; padding: 10px; background: #2a2a2a; color: #fff; border: 1px solid #3a3a3a; border-radius: 4px; font-size: 14px;">
|
|
506
|
+
<span style="color: #666; font-size: 12px; margin-left: 10px;">Max: 50</span>
|
|
507
|
+
</div>
|
|
508
|
+
|
|
509
|
+
<!-- Judge Model -->
|
|
510
|
+
<div style="margin-bottom: 20px;">
|
|
511
|
+
<label style="display: block; color: #aaa; margin-bottom: 8px; font-size: 14px;">Judge Model:</label>
|
|
512
|
+
<input type="text" id="test-judge-model"
|
|
513
|
+
style="width: 100%; padding: 10px; background: #2a2a2a; color: #fff; border: 1px solid #3a3a3a; border-radius: 4px; font-size: 14px;"
|
|
514
|
+
placeholder="e.g., gpt-4o, claude-3-5-sonnet-20241022">
|
|
515
|
+
<div style="color: #666; font-size: 12px; margin-top: 5px;">
|
|
516
|
+
Override the judge's model for this test
|
|
517
|
+
</div>
|
|
518
|
+
</div>
|
|
519
|
+
|
|
520
|
+
<!-- Judge Prompt -->
|
|
521
|
+
<div style="margin-bottom: 30px;">
|
|
522
|
+
<label style="display: block; color: #aaa; margin-bottom: 8px; font-size: 14px;">Judge Prompt:</label>
|
|
523
|
+
<textarea id="test-judge-prompt"
|
|
524
|
+
style="width: 100%; min-height: 200px; padding: 10px; background: #2a2a2a; color: #fff; border: 1px solid #3a3a3a; border-radius: 4px; font-size: 13px; font-family: 'Courier New', monospace; resize: vertical;"
|
|
525
|
+
placeholder="Select a judge to load its prompt..."></textarea>
|
|
526
|
+
<div style="color: #666; font-size: 12px; margin-top: 5px;">
|
|
527
|
+
Edit the prompt and test changes, or save to update the judge permanently
|
|
528
|
+
</div>
|
|
529
|
+
</div>
|
|
530
|
+
|
|
531
|
+
<!-- Actions -->
|
|
532
|
+
<div style="display: flex; gap: 10px; margin-bottom: 30px;">
|
|
533
|
+
<button id="run-test-judge-btn" style="padding: 10px 20px; background: #6a4a7e; color: white; border: none; border-radius: 4px; cursor: pointer; font-weight: 500;">
|
|
534
|
+
Run Test
|
|
535
|
+
</button>
|
|
536
|
+
<button id="save-test-judge-prompt-btn" style="padding: 10px 20px; background: #2a7c4a; color: white; border: none; border-radius: 4px; cursor: pointer;">
|
|
537
|
+
Save Prompt to Judge
|
|
538
|
+
</button>
|
|
539
|
+
<button id="close-test-judge-btn" style="padding: 10px 20px; background: #5a2a2a; color: white; border: none; border-radius: 4px; cursor: pointer;">
|
|
540
|
+
Close
|
|
541
|
+
</button>
|
|
542
|
+
</div>
|
|
543
|
+
|
|
544
|
+
<!-- Results -->
|
|
545
|
+
<div id="test-judge-results" style="display: none;">
|
|
546
|
+
<h3 style="color: #4a9eff; margin-bottom: 15px;">Test Results</h3>
|
|
547
|
+
<div id="test-judge-results-content" style="max-height: 600px; overflow-y: auto;">
|
|
548
|
+
<!-- Results populated here -->
|
|
549
|
+
</div>
|
|
550
|
+
</div>
|
|
551
|
+
</div>
|
|
552
|
+
</div>
|
|
553
|
+
|
|
554
|
+
<!-- End-to-End Test Panel -->
|
|
555
|
+
<div id="e2e-panel" style="display: none; position: fixed; top: 0; left: 0; width: 100%; height: 100%; background: rgba(0,0,0,0.8); z-index: 1000; padding: 40px; overflow-y: auto;">
|
|
556
|
+
<div style="max-width: 800px; margin: 0 auto; background: #1a1a1a; border-radius: 8px; padding: 30px; border: 1px solid #4a2a4a;">
|
|
557
|
+
<h2 style="color: #fff; margin-bottom: 10px;">⚡ Run End-to-End Test</h2>
|
|
558
|
+
<p style="color: #888; font-size: 13px; margin-bottom: 25px;">
|
|
559
|
+
This will automatically: Export selected traces → Run weak models → Evaluate with judge
|
|
560
|
+
</p>
|
|
561
|
+
|
|
562
|
+
<!-- Weak Model Selection -->
|
|
563
|
+
<div style="margin-bottom: 25px;">
|
|
564
|
+
<h3 style="color: #fff; font-size: 16px; margin-bottom: 15px;">1. Select Weak Models</h3>
|
|
565
|
+
|
|
566
|
+
<div style="margin-bottom: 15px;">
|
|
567
|
+
<label style="display: block; color: #aaa; margin-bottom: 8px; font-size: 13px;">W&B Models:</label>
|
|
568
|
+
<div id="e2e-wandb-models" style="max-height: 150px; overflow-y: auto; background: #0f0f0f; padding: 10px; border-radius: 4px; border: 1px solid #2a2a2a;">
|
|
569
|
+
<!-- Populated dynamically -->
|
|
570
|
+
</div>
|
|
571
|
+
</div>
|
|
572
|
+
|
|
573
|
+
<div style="margin-bottom: 15px;">
|
|
574
|
+
<label style="display: block; color: #aaa; margin-bottom: 8px; font-size: 13px;">OpenRouter Models (optional):</label>
|
|
575
|
+
<textarea id="e2e-openrouter-models" placeholder="Enter OpenRouter models (one per line) e.g., meta-llama/llama-3.3-70b-instruct anthropic/claude-3.5-sonnet"
|
|
576
|
+
style="width: 100%; padding: 8px; background: #2a2a2a; color: #fff; border: 1px solid #3a3a3a; border-radius: 4px; font-size: 13px; min-height: 80px; font-family: monospace;"></textarea>
|
|
577
|
+
<div style="color: #666; font-size: 11px; margin-top: 5px;">One model per line</div>
|
|
578
|
+
</div>
|
|
579
|
+
|
|
580
|
+
<div>
|
|
581
|
+
<label style="display: block; color: #aaa; margin-bottom: 8px; font-size: 13px;">Max Examples (optional):</label>
|
|
582
|
+
<input type="number" id="e2e-num-examples" placeholder="Leave empty to use all selected traces"
|
|
583
|
+
style="width: 200px; padding: 8px; background: #2a2a2a; color: #fff; border: 1px solid #3a3a3a; border-radius: 4px; font-size: 13px;">
|
|
584
|
+
</div>
|
|
585
|
+
</div>
|
|
586
|
+
|
|
587
|
+
<!-- Judge Selection -->
|
|
588
|
+
<div style="margin-bottom: 30px;">
|
|
589
|
+
<h3 style="color: #fff; font-size: 16px; margin-bottom: 15px;">2. Select Judge</h3>
|
|
590
|
+
<select id="e2e-judge" style="width: 100%; padding: 10px; background: #2a2a2a; color: #fff; border: 1px solid #3a3a3a; border-radius: 4px; font-size: 14px;">
|
|
591
|
+
<option value="">Loading judges...</option>
|
|
592
|
+
</select>
|
|
593
|
+
</div>
|
|
594
|
+
|
|
595
|
+
<!-- Actions -->
|
|
596
|
+
<div style="display: flex; gap: 10px; justify-content: flex-end;">
|
|
597
|
+
<button id="close-e2e-btn" style="padding: 10px 20px; background: #5a2a2a; color: white; border: none; border-radius: 4px; cursor: pointer;">
|
|
598
|
+
Cancel
|
|
599
|
+
</button>
|
|
600
|
+
<button id="run-e2e-btn" style="padding: 10px 20px; background: #7a4a9e; color: white; border: none; border-radius: 4px; cursor: pointer; font-weight: 500;">
|
|
601
|
+
⚡ Run Test
|
|
602
|
+
</button>
|
|
603
|
+
</div>
|
|
604
|
+
</div>
|
|
605
|
+
</div>
|
|
606
|
+
|
|
607
|
+
<!-- End-to-End Progress Panel -->
|
|
608
|
+
<div id="e2e-progress-panel" style="display: none; position: fixed; top: 0; left: 0; width: 100%; height: 100%; background: rgba(0,0,0,0.9); z-index: 1100; padding: 40px; overflow-y: auto;">
|
|
609
|
+
<div style="max-width: 800px; margin: 0 auto; background: #1a1a1a; border-radius: 8px; padding: 30px; border: 1px solid #4a2a4a;">
|
|
610
|
+
<h2 style="color: #fff; margin-bottom: 20px;">Running End-to-End Test</h2>
|
|
611
|
+
|
|
612
|
+
<!-- Overall Progress -->
|
|
613
|
+
<div style="margin-bottom: 30px;">
|
|
614
|
+
<div style="color: #4a9eff; font-size: 14px; margin-bottom: 10px;" id="e2e-step-label">Step 1/3: Exporting traces...</div>
|
|
615
|
+
<div style="width: 100%; height: 8px; background: #2a2a2a; border-radius: 4px; overflow: hidden;">
|
|
616
|
+
<div id="e2e-overall-progress" style="height: 100%; background: #7a4a9e; width: 0%; transition: width 0.3s;"></div>
|
|
617
|
+
</div>
|
|
618
|
+
</div>
|
|
619
|
+
|
|
620
|
+
<!-- Detailed Progress -->
|
|
621
|
+
<div id="e2e-progress-text" style="color: #888; font-family: monospace; font-size: 12px; white-space: pre-wrap; background: #0f0f0f; padding: 15px; border-radius: 4px; max-height: 400px; overflow-y: auto;"></div>
|
|
622
|
+
|
|
623
|
+
<!-- Results -->
|
|
624
|
+
<div id="e2e-results" style="display: none; margin-top: 20px;">
|
|
625
|
+
<h3 style="color: #4a9eff; margin-bottom: 15px;">✓ Test Complete!</h3>
|
|
626
|
+
<div id="e2e-results-content" style="background: #0f0f0f; padding: 15px; border-radius: 4px;"></div>
|
|
627
|
+
<button id="close-e2e-progress-btn" style="margin-top: 20px; padding: 10px 20px; background: #2a7c4a; color: white; border: none; border-radius: 4px; cursor: pointer;">
|
|
628
|
+
Close
|
|
629
|
+
</button>
|
|
630
|
+
</div>
|
|
631
|
+
</div>
|
|
632
|
+
</div>
|
|
427
633
|
</div>
|
|
428
634
|
|
|
429
635
|
<script>
|
|
@@ -919,20 +1125,46 @@
|
|
|
919
1125
|
// Show progress
|
|
920
1126
|
document.getElementById('inference-progress').style.display = 'block';
|
|
921
1127
|
const progressText = document.getElementById('progress-text');
|
|
1128
|
+
const progressFill = document.getElementById('inference-progress-fill');
|
|
922
1129
|
progressText.textContent = `Starting inference...\n`;
|
|
923
|
-
|
|
924
|
-
|
|
925
|
-
|
|
1130
|
+
progressFill.style.width = '0%';
|
|
1131
|
+
|
|
1132
|
+
// Start inference and poll for progress
|
|
1133
|
+
let taskId = null;
|
|
1134
|
+
let pollInterval = null;
|
|
1135
|
+
|
|
1136
|
+
/**
 * Poll the backend once for the current inference task and mirror the
 * reported progress into the progress bar and the status text.
 *
 * Reads `taskId`, `progressFill` and `progressText` from the enclosing scope.
 * Returns without doing anything while no task id has been assigned or when
 * the progress endpoint answers with a non-2xx status. Network and parse
 * failures are logged and swallowed.
 */
const pollProgress = async () => {
    if (!taskId) return;
    try {
        const resp = await fetch(`/progress/${taskId}`);
        if (!resp.ok) return;

        const progress = await resp.json();
        const current = Number(progress.current) || 0;
        const total = Number(progress.total) || 0;
        // A total of 0 (or a missing total) would otherwise yield NaN/Infinity,
        // which is not a valid CSS width and renders as "(NaN%)" in the text.
        const percent = total > 0
            ? Math.min(100, Math.max(0, (current / total) * 100))
            : 0;

        progressFill.style.width = `${percent}%`;
        progressText.textContent = `${progress.message}\nProgress: ${current}/${total} (${percent.toFixed(1)}%)\n`;
    } catch (e) {
        console.error('Error polling progress:', e);
    }
};
|
|
926
1150
|
|
|
927
1151
|
// Call backend API
|
|
928
1152
|
try {
|
|
1153
|
+
// Generate a task ID for polling
|
|
1154
|
+
taskId = `inference_${Date.now()}`;
|
|
1155
|
+
|
|
1156
|
+
// Start polling immediately
|
|
1157
|
+
pollInterval = setInterval(pollProgress, 300);
|
|
1158
|
+
|
|
1159
|
+
// Start the inference
|
|
929
1160
|
const response = await fetch('/run_inference', {
|
|
930
1161
|
method: 'POST',
|
|
931
1162
|
headers: { 'Content-Type': 'application/json' },
|
|
932
1163
|
body: JSON.stringify({
|
|
933
1164
|
models: allModels,
|
|
934
1165
|
strong_export_file: strongExportFile,
|
|
935
|
-
num_examples: numExamples
|
|
1166
|
+
num_examples: numExamples,
|
|
1167
|
+
task_id: taskId
|
|
936
1168
|
})
|
|
937
1169
|
});
|
|
938
1170
|
|
|
@@ -941,8 +1173,12 @@
|
|
|
941
1173
|
}
|
|
942
1174
|
|
|
943
1175
|
const result = await response.json();
|
|
944
|
-
|
|
945
|
-
|
|
1176
|
+
|
|
1177
|
+
// Stop polling
|
|
1178
|
+
if (pollInterval) clearInterval(pollInterval);
|
|
1179
|
+
|
|
1180
|
+
progressText.textContent = `\n✓ Complete!\nResults saved to: ${result.files.join(', ')}\n`;
|
|
1181
|
+
progressFill.style.width = '100%';
|
|
946
1182
|
|
|
947
1183
|
setTimeout(() => {
|
|
948
1184
|
document.getElementById('inference-panel').style.display = 'none';
|
|
@@ -951,8 +1187,7 @@
|
|
|
951
1187
|
|
|
952
1188
|
} catch (error) {
|
|
953
1189
|
progressText.textContent += `\n✗ Error: ${error.message}\n`;
|
|
954
|
-
|
|
955
|
-
progressText.textContent += `Run: python inference_server.py\n`;
|
|
1190
|
+
if (pollInterval) clearInterval(pollInterval);
|
|
956
1191
|
}
|
|
957
1192
|
});
|
|
958
1193
|
|
|
@@ -1091,21 +1326,44 @@
|
|
|
1091
1326
|
const modelFiles = Array.from(selectedEvalModels);
|
|
1092
1327
|
const results = [];
|
|
1093
1328
|
|
|
1094
|
-
// Run evaluations sequentially
|
|
1329
|
+
// Run evaluations sequentially with granular progress
|
|
1095
1330
|
for (let i = 0; i < modelFiles.length; i++) {
|
|
1096
1331
|
const modelFile = modelFiles[i];
|
|
1097
|
-
const progress = ((i) / modelFiles.length) * 100;
|
|
1098
|
-
progressFill.style.width = `${progress}%`;
|
|
1099
1332
|
|
|
1100
|
-
progressText.textContent += `[${i+1}/${modelFiles.length}]
|
|
1333
|
+
progressText.textContent += `[${i+1}/${modelFiles.length}] Starting ${modelFile}...\n`;
|
|
1334
|
+
|
|
1335
|
+
let pollInterval = null;
|
|
1336
|
+
let taskId = null;
|
|
1337
|
+
|
|
1338
|
+
/**
 * Poll the backend once for the evaluation task of the current model file
 * and mirror the reported progress into the progress bar and status text.
 *
 * Reads `taskId`, `i`, `modelFiles`, `progressFill` and `progressText` from
 * the enclosing scope. Returns without doing anything while no task id has
 * been assigned or when the progress endpoint answers with a non-2xx status.
 * Network and parse failures are logged and swallowed.
 */
const pollProgress = async () => {
    if (!taskId) return;
    try {
        const resp = await fetch(`/progress/${taskId}`);
        if (!resp.ok) return;

        const progress = await resp.json();
        const current = Number(progress.current) || 0;
        const total = Number(progress.total) || 0;
        // A total of 0 (or a missing total) would otherwise yield NaN/Infinity,
        // which is not a valid CSS width and renders as "(NaN%)" in the text.
        const percent = total > 0
            ? Math.min(100, Math.max(0, (current / total) * 100))
            : 0;

        progressFill.style.width = `${percent}%`;
        progressText.textContent = `[${i+1}/${modelFiles.length}] ${progress.message}\nProgress: ${current}/${total} (${percent.toFixed(1)}%)\n`;
    } catch (e) {
        console.error('Error polling eval progress:', e);
    }
};
|
|
1101
1352
|
|
|
1102
1353
|
try {
|
|
1354
|
+
// Generate task ID for this evaluation
|
|
1355
|
+
taskId = `eval_${Date.now()}_${i}`;
|
|
1356
|
+
|
|
1357
|
+
// Start polling
|
|
1358
|
+
pollInterval = setInterval(pollProgress, 300);
|
|
1359
|
+
|
|
1103
1360
|
const response = await fetch('/run_evaluation', {
|
|
1104
1361
|
method: 'POST',
|
|
1105
1362
|
headers: { 'Content-Type': 'application/json' },
|
|
1106
1363
|
body: JSON.stringify({
|
|
1107
1364
|
model_file: modelFile,
|
|
1108
|
-
judge: judge
|
|
1365
|
+
judge: judge,
|
|
1366
|
+
task_id: taskId
|
|
1109
1367
|
})
|
|
1110
1368
|
});
|
|
1111
1369
|
|
|
@@ -1114,6 +1372,10 @@
|
|
|
1114
1372
|
}
|
|
1115
1373
|
|
|
1116
1374
|
const result = await response.json();
|
|
1375
|
+
|
|
1376
|
+
// Clear polling when done
|
|
1377
|
+
if (pollInterval) clearInterval(pollInterval);
|
|
1378
|
+
|
|
1117
1379
|
progressText.textContent += ` ✓ Complete: ${result.evaluation_name}\n`;
|
|
1118
1380
|
progressText.textContent += ` Examples: ${result.examples_evaluated}\n\n`;
|
|
1119
1381
|
|
|
@@ -1125,6 +1387,7 @@
|
|
|
1125
1387
|
});
|
|
1126
1388
|
|
|
1127
1389
|
} catch (error) {
|
|
1390
|
+
if (pollInterval) clearInterval(pollInterval);
|
|
1128
1391
|
progressText.textContent += ` ✗ Error: ${error.message}\n\n`;
|
|
1129
1392
|
}
|
|
1130
1393
|
}
|
|
@@ -1227,6 +1490,517 @@
|
|
|
1227
1490
|
console.error('Delete error:', error);
|
|
1228
1491
|
}
|
|
1229
1492
|
}
|
|
1493
|
+
|
|
1494
|
+
// === SETTINGS ===
|
|
1495
|
+
|
|
1496
|
+
// Load and display settings
|
|
1497
|
+
/**
 * Fetch the saved settings from `GET /settings` and copy the two project
 * names into the Settings panel inputs.
 *
 * A missing or empty project name is shown as an empty string. When the
 * request fails, the server answers with a non-2xx status, or the body is not
 * valid JSON, the inputs are left untouched and the error is logged.
 *
 * @returns {Promise<void>} Resolves once the inputs are updated or the
 *     failure has been logged; never rejects.
 */
async function loadSettings() {
    try {
        const response = await fetch('/settings');
        // Without this check a JSON error payload would be read as settings
        // and silently blank both inputs.
        if (!response.ok) {
            throw new Error(`GET /settings failed with HTTP ${response.status}`);
        }
        const settings = await response.json();
        document.getElementById('settings-inference-project').value = settings.inference_project || '';
        document.getElementById('settings-evaluation-project').value = settings.evaluation_project || '';
    } catch (error) {
        console.error('Error loading settings:', error);
    }
}
|
|
1507
|
+
|
|
1508
|
+
// Open settings panel
|
|
1509
|
+
// "Settings" button: refresh the form from the server first, then reveal the overlay.
document.getElementById('open-settings-btn').addEventListener('click', async function showSettingsPanel() {
    await loadSettings();
    const settingsPanel = document.getElementById('settings-panel');
    settingsPanel.style.display = 'block';
});
|
|
1513
|
+
|
|
1514
|
+
// Close settings panel
|
|
1515
|
+
// "Cancel" button: hide the settings overlay without saving anything.
document.getElementById('close-settings-btn').addEventListener('click', function hideSettingsPanel() {
    const settingsPanel = document.getElementById('settings-panel');
    settingsPanel.style.display = 'none';
});
|
|
1518
|
+
|
|
1519
|
+
// Save settings
|
|
1520
|
+
// "Save Settings" button: validate both project names, POST them to /settings,
// and close the overlay once the server reports success.
document.getElementById('save-settings-btn').addEventListener('click', async function saveSettings() {
    const readTrimmed = (id) => document.getElementById(id).value.trim();
    const inference_project = readTrimmed('settings-inference-project');
    const evaluation_project = readTrimmed('settings-evaluation-project');

    // Both fields are mandatory; bail out before touching the network.
    if (!(inference_project && evaluation_project)) {
        alert('Both project fields are required');
        return;
    }

    try {
        const response = await fetch('/settings', {
            method: 'POST',
            headers: { 'Content-Type': 'application/json' },
            body: JSON.stringify({ inference_project, evaluation_project })
        });
        const result = await response.json();

        if (result.status !== 'success') {
            alert('Error saving settings');
            return;
        }

        alert('Settings saved! Please restart the server for changes to take effect.');
        document.getElementById('settings-panel').style.display = 'none';
    } catch (error) {
        alert('Error saving settings: ' + error.message);
    }
});
|
|
1549
|
+
|
|
1550
|
+
// === TEST JUDGES ===
|
|
1551
|
+
|
|
1552
|
+
let testJudgesData = []; // Store judges globally for test panel

// Open test judge panel: load the judge list and the weak-model result files,
// fill both dropdowns, preload the first judge's prompt/model, then show the
// overlay with any previous results hidden.
document.getElementById('open-test-judge-btn').addEventListener('click', async () => {
    // Replace every option of `select`. Labels and values go through the
    // Option constructor, so server-provided text (judge names, file names)
    // is never parsed as HTML.
    const setOptions = (select, entries) => {
        select.innerHTML = '';
        entries.forEach(({ value, label }) => {
            select.appendChild(new Option(label, value));
        });
    };

    // Load judges
    try {
        const response = await fetch('/list_judges');
        const data = await response.json();
        testJudgesData = data.judges || []; // Store globally
        const judgeSelect = document.getElementById('test-judge-select');

        if (testJudgesData.length > 0) {
            // The option value is the judge's index into testJudgesData.
            setOptions(judgeSelect, testJudgesData.map((judge, idx) => ({
                value: String(idx),
                label: `${judge.name} (${judge.type})`
            })));

            // Load first judge's prompt and model
            if (testJudgesData[0]) {
                document.getElementById('test-judge-prompt').value = testJudgesData[0].prompt || '';
                document.getElementById('test-judge-model').value = testJudgesData[0].model || '';
            }
        } else {
            setOptions(judgeSelect, [{ value: '', label: 'No judges available' }]);
        }
    } catch (error) {
        console.error('Error loading judges:', error);
    }

    // Load weak model files
    try {
        const response = await fetch('/list_weak_models');
        const data = await response.json();
        const weakModelSelect = document.getElementById('test-weak-model-select');

        if (data.files && data.files.length > 0) {
            setOptions(weakModelSelect, data.files.map((f) => ({
                value: f.filename,
                label: f.weak_model || f.filename
            })));
        } else {
            setOptions(weakModelSelect, [{ value: '', label: 'No weak model files available' }]);
        }
    } catch (error) {
        console.error('Error loading weak models:', error);
    }

    document.getElementById('test-judge-panel').style.display = 'block';
    document.getElementById('test-judge-results').style.display = 'none';
});
|
|
1600
|
+
|
|
1601
|
+
// When judge selection changes, update the prompt and model
|
|
1602
|
+
// Judge dropdown changed: copy the chosen judge's prompt and model into the
// editable fields. The option value is an index into testJudgesData.
document.getElementById('test-judge-select').addEventListener('change', function syncJudgeFields(event) {
    const selectedIndex = parseInt(event.target.value);
    if (isNaN(selectedIndex)) {
        return;
    }

    const selectedJudge = testJudgesData[selectedIndex];
    if (!selectedJudge) {
        return;
    }

    const promptField = document.getElementById('test-judge-prompt');
    promptField.value = selectedJudge.prompt || '';
    const modelField = document.getElementById('test-judge-model');
    modelField.value = selectedJudge.model || '';
});
|
|
1610
|
+
|
|
1611
|
+
// Close test judge panel
|
|
1612
|
+
// "Close" button: hide the Test Judge overlay.
document.getElementById('close-test-judge-btn').addEventListener('click', function hideTestJudgePanel() {
    const testJudgePanel = document.getElementById('test-judge-panel');
    testJudgePanel.style.display = 'none';
});
|
|
1615
|
+
|
|
1616
|
+
// Run test judge
|
|
1617
|
+
// "Run Test" button: validate the form, POST the selected judge (with the
// edited prompt/model applied) to /test_judge, and render one card per
// returned sample.
document.getElementById('run-test-judge-btn').addEventListener('click', async () => {
    // Escape a value for interpolation into an HTML string. The rendered
    // fields are model and judge output, so they must not be parsed as markup.
    const escapeHtml = (value) => String(value)
        .replace(/&/g, '&amp;')
        .replace(/</g, '&lt;')
        .replace(/>/g, '&gt;')
        .replace(/"/g, '&quot;')
        .replace(/'/g, '&#39;');

    const judgeIndex = document.getElementById('test-judge-select').value;
    const weakModelFile = document.getElementById('test-weak-model-select').value;
    // Default to 5 and keep the value inside the 1-50 range the input
    // advertises; a number input does not stop typed out-of-range values.
    const requestedSamples = Number.parseInt(document.getElementById('test-num-samples').value, 10) || 5;
    const numSamples = Math.min(50, Math.max(1, requestedSamples));
    const editedPrompt = document.getElementById('test-judge-prompt').value;
    const editedModel = document.getElementById('test-judge-model').value;
    const selectedJudge = testJudgesData[Number.parseInt(judgeIndex, 10)];

    // Also reject an index that no longer maps to a loaded judge, instead of
    // sending an object that carries only the prompt and model.
    if (!judgeIndex || !selectedJudge) {
        alert('Please select a judge');
        return;
    }

    if (!weakModelFile) {
        alert('Please select a weak model file');
        return;
    }

    if (!editedPrompt.trim()) {
        alert('Please enter a judge prompt');
        return;
    }

    if (!editedModel.trim()) {
        alert('Please enter a judge model');
        return;
    }

    // Copy the judge so the overrides apply to this test only.
    const judge = { ...selectedJudge };
    judge.prompt = editedPrompt; // Use the edited prompt from textarea
    judge.model = editedModel; // Use the edited model from input

    // Call test endpoint
    try {
        const response = await fetch('/test_judge', {
            method: 'POST',
            headers: { 'Content-Type': 'application/json' },
            body: JSON.stringify({
                judge: judge,
                weak_model_file: weakModelFile,
                num_samples: numSamples
            })
        });

        if (!response.ok) {
            throw new Error('Failed to test judge');
        }

        const result = await response.json();

        // Display results
        const resultsDiv = document.getElementById('test-judge-results-content');
        resultsDiv.innerHTML = result.samples.map((sample, idx) => `
            <div style="margin-bottom: 20px; padding: 20px; background: #0f0f0f; border-radius: 8px; border: 1px solid #2a2a2a;">
                <h4 style="color: #4a9eff; margin-bottom: 15px;">Sample ${idx + 1} of ${result.samples.length}</h4>

                <div style="margin-bottom: 15px;">
                    <div style="color: #888; font-size: 12px; margin-bottom: 5px;">Question:</div>
                    <div style="color: #ccc; font-size: 13px; background: #1a1a1a; padding: 10px; border-radius: 4px; max-height: 100px; overflow-y: auto;">${escapeHtml(sample.question || 'N/A')}</div>
                </div>

                <div style="margin-bottom: 15px;">
                    <div style="color: #888; font-size: 12px; margin-bottom: 5px;">Strong Model Output:</div>
                    <div style="color: #ccc; font-size: 13px; background: #1a1a1a; padding: 10px; border-radius: 4px; max-height: 100px; overflow-y: auto;">${escapeHtml(sample.strong_output || 'N/A')}</div>
                </div>

                <div style="margin-bottom: 15px;">
                    <div style="color: #888; font-size: 12px; margin-bottom: 5px;">Weak Model Output:</div>
                    <div style="color: #ccc; font-size: 13px; background: #1a1a1a; padding: 10px; border-radius: 4px; max-height: 100px; overflow-y: auto;">${escapeHtml(sample.weak_output || 'N/A')}</div>
                </div>

                <div style="margin-bottom: 15px;">
                    <div style="color: #888; font-size: 12px; margin-bottom: 5px;">Judge Prompt (filled):</div>
                    <pre style="color: #aaa; font-size: 11px; background: #1a1a1a; padding: 10px; border-radius: 4px; max-height: 150px; overflow-y: auto; white-space: pre-wrap; font-family: monospace;">${escapeHtml(sample.judge_prompt)}</pre>
                </div>

                <div style="margin-bottom: 15px;">
                    <div style="color: #888; font-size: 12px; margin-bottom: 5px;">Raw Judge Response:</div>
                    <pre style="color: #f4d03f; font-size: 11px; background: #1a1a1a; padding: 10px; border-radius: 4px; max-height: 150px; overflow-y: auto; white-space: pre-wrap; font-family: monospace;">${escapeHtml(sample.raw_response)}</pre>
                </div>

                <div>
                    <div style="color: #888; font-size: 12px; margin-bottom: 5px;">Parsed Scores:</div>
                    <div style="color: #4a9eff; font-size: 13px; background: #1a1a1a; padding: 10px; border-radius: 4px; font-family: monospace;">${escapeHtml(JSON.stringify(sample.parsed_scores, null, 2))}</div>
                </div>
            </div>
        `).join('');

        document.getElementById('test-judge-results').style.display = 'block';

    } catch (error) {
        alert('Error testing judge: ' + error.message);
        console.error('Test error:', error);
    }
});
|
|
1712
|
+
|
|
1713
|
+
// Save prompt to judge
//
// Reads the selected judge index and the edited prompt from the Test Judges
// form, asks the user to confirm, POSTs the updated judge to /save_judge and
// replaces the local judge list with the `judges` array from the response.
document.getElementById('save-test-judge-prompt-btn').addEventListener('click', async () => {
    const judgeIndex = document.getElementById('test-judge-select').value;
    const editedPrompt = document.getElementById('test-judge-prompt').value;

    if (!judgeIndex) {
        alert('Please select a judge');
        return;
    }

    if (!editedPrompt.trim()) {
        alert('Please enter a judge prompt');
        return;
    }

    // Resolve the judge BEFORE copying it: spreading `undefined` silently
    // yields `{}`, which would send a judge holding nothing but the prompt
    // when the select's index no longer matches testJudgesData.
    const selectedJudge = testJudgesData[Number.parseInt(judgeIndex, 10)];
    if (!selectedJudge) {
        alert('Please select a judge');
        return;
    }

    // Shallow copy so the in-memory judge is untouched unless the save succeeds
    const judge = { ...selectedJudge, prompt: editedPrompt };

    // Confirm with user
    if (!confirm(`Save this prompt to judge "${judge.name}"? This will permanently update the judge.`)) {
        return;
    }

    // Call save endpoint
    try {
        const response = await fetch('/save_judge', {
            method: 'POST',
            headers: { 'Content-Type': 'application/json' },
            body: JSON.stringify({ judge: judge })
        });

        if (!response.ok) {
            throw new Error('Failed to save judge');
        }

        const result = await response.json();

        // Update local judges data
        testJudgesData = result.judges || [];

        alert('Judge prompt saved successfully!');
    } catch (error) {
        alert('Error saving judge: ' + error.message);
        console.error('Save error:', error);
    }
});
|
|
1760
|
+
|
|
1761
|
+
// === END-TO-END TEST ===
|
|
1762
|
+
|
|
1763
|
+
// Open E2E panel
//
// Requires at least one selected trace. Fills the W&B model checkbox list
// from AVAILABLE_MODELS and the judge dropdown from /list_judges, then shows
// the E2E configuration panel. A failure to load judges is logged and the
// panel is still shown.
document.getElementById('open-e2e-btn').addEventListener('click', async () => {
    if (selectedTraces.size === 0) {
        alert('Please select at least one trace first!');
        return;
    }

    // Values below are interpolated into innerHTML templates, so escape the
    // characters that could otherwise terminate an attribute or open a tag.
    const HTML_ESCAPES = { '&': '&amp;', '<': '&lt;', '>': '&gt;', '"': '&quot;', "'": '&#39;' };
    const escapeHtml = (value) => String(value).replace(/[&<>"']/g, (ch) => HTML_ESCAPES[ch]);

    // Populate W&B models
    const wandbModelsDiv = document.getElementById('e2e-wandb-models');
    wandbModelsDiv.innerHTML = AVAILABLE_MODELS.map((model) => {
        const safeModel = escapeHtml(model);
        return `
            <label style="display: block; padding: 5px 0; color: #ccc; cursor: pointer;">
                <input type="checkbox" class="e2e-model-checkbox" value="${safeModel}" style="margin-right: 8px;">
                ${safeModel}
            </label>
        `;
    }).join('');

    // Load judges
    try {
        const response = await fetch('/list_judges');
        if (!response.ok) {
            // Without this check an error payload would be rendered as
            // "No judges available", hiding the real failure.
            throw new Error(`Failed to load judges (HTTP ${response.status})`);
        }
        const data = await response.json();
        const judgeSelect = document.getElementById('e2e-judge');

        if (data.judges && data.judges.length > 0) {
            // The option value is the judge's index in the /list_judges response
            judgeSelect.innerHTML = data.judges.map((judge, idx) =>
                `<option value="${idx}">${escapeHtml(judge.name)} (${escapeHtml(judge.type)})</option>`
            ).join('');
        } else {
            judgeSelect.innerHTML = '<option value="">No judges available - create one first</option>';
        }
    } catch (error) {
        console.error('Error loading judges:', error);
    }

    document.getElementById('e2e-panel').style.display = 'block';
});
|
|
1798
|
+
|
|
1799
|
+
// Hide the E2E configuration panel when its close button is clicked
document.getElementById('close-e2e-btn').addEventListener('click', () => {
    const configPanel = document.getElementById('e2e-panel');
    configPanel.style.display = 'none';
});
|
|
1803
|
+
|
|
1804
|
+
// Dismiss the E2E progress view: hides the progress panel first, then the
// results section
document.getElementById('close-e2e-progress-btn').addEventListener('click', () => {
    for (const elementId of ['e2e-progress-panel', 'e2e-results']) {
        document.getElementById(elementId).style.display = 'none';
    }
});
|
|
1809
|
+
|
|
1810
|
+
// Run end-to-end test
//
// Drives the three-step workflow configured in the E2E panel:
//   1. POST the selected traces to /export_strong_traces.
//   2. POST the chosen models to /run_inference against that export.
//   3. POST each matching weak-model file to /run_evaluation with the chosen judge.
// Progress lines and any error are written to the E2E progress panel; the
// handler itself never rejects once the progress panel is visible.
document.getElementById('run-e2e-btn').addEventListener('click', async () => {
    // Result fields are interpolated into an innerHTML template further down,
    // so escape the characters that could close an attribute or open a tag.
    const HTML_ESCAPES = { '&': '&amp;', '<': '&lt;', '>': '&gt;', '"': '&quot;', "'": '&#39;' };
    const escapeHtml = (value) => String(value).replace(/[&<>"']/g, (ch) => HTML_ESCAPES[ch]);

    // Gather selected models
    const selectedWanbModels = Array.from(document.querySelectorAll('.e2e-model-checkbox:checked')).map(cb => cb.value);
    const openRouterModelsText = document.getElementById('e2e-openrouter-models').value.trim();
    const openRouterModels = openRouterModelsText
        .split('\n')
        .map(m => m.trim())
        .filter(m => m.length > 0);
    const allModels = [...selectedWanbModels, ...openRouterModels];

    if (allModels.length === 0) {
        alert('Please select at least one model!');
        return;
    }

    const judgeIndex = document.getElementById('e2e-judge').value;
    if (!judgeIndex) {
        alert('Please select a judge!');
        return;
    }

    // An empty field means "no limit" and is sent as null
    const numExamplesText = document.getElementById('e2e-num-examples').value;
    const numExamples = numExamplesText ? Number.parseInt(numExamplesText, 10) : null;

    // Hide config panel, show progress panel
    document.getElementById('e2e-panel').style.display = 'none';
    document.getElementById('e2e-progress-panel').style.display = 'block';

    const progressText = document.getElementById('e2e-progress-text');
    const stepLabel = document.getElementById('e2e-step-label');
    const overallProgress = document.getElementById('e2e-overall-progress');

    progressText.textContent = '';

    // Polls /progress/<taskId> every 300 ms and maps the task's 0-100 percent
    // onto the overall bar via `toOverallPercent`. Returns a stop function;
    // once it has been called, responses still in flight are ignored so they
    // cannot move the bar backwards after the step has finished.
    const startProgressPolling = (taskId, errorLabel, toOverallPercent) => {
        let active = true;
        const timer = setInterval(async () => {
            try {
                const resp = await fetch(`/progress/${taskId}`);
                if (!resp.ok) {
                    return;
                }
                const progress = await resp.json();
                const percent = (progress.current / progress.total) * 100;
                // A zero or missing total gives NaN/Infinity, which is not a valid width
                if (!active || !Number.isFinite(percent)) {
                    return;
                }
                overallProgress.style.width = `${toOverallPercent(percent)}%`;
            } catch (e) {
                console.error(errorLabel, e);
            }
        }, 300);

        return () => {
            active = false;
            clearInterval(timer);
        };
    };

    try {
        // Load judge data inside the try so a failure is shown in the progress
        // panel, and before any work starts so a stale index cannot fail the
        // run only after export and inference have already completed.
        const judgesResponse = await fetch('/list_judges');
        if (!judgesResponse.ok) {
            throw new Error('Failed to load judges');
        }
        const judgesData = await judgesResponse.json();
        const judge = (judgesData.judges || [])[Number.parseInt(judgeIndex, 10)];
        if (!judge) {
            throw new Error('Selected judge not found');
        }

        // === STEP 1: Export Selected Traces ===
        stepLabel.textContent = 'Step 1/3: Exporting selected traces...';
        overallProgress.style.width = '10%';
        progressText.textContent += '📦 Exporting selected traces...\n';

        // Get full trace objects for selected IDs
        const selectedTraceObjects = allTraces.filter(t => selectedTraces.has(t.id));

        const exportResponse = await fetch('/export_strong_traces', {
            method: 'POST',
            headers: { 'Content-Type': 'application/json' },
            body: JSON.stringify({
                traces: selectedTraceObjects,
                nickname: `e2e_export_${Date.now()}`
            })
        });

        if (!exportResponse.ok) {
            throw new Error('Failed to export traces');
        }

        const exportResult = await exportResponse.json();
        const exportFilename = exportResult.filename;
        progressText.textContent += `✓ Exported ${exportResult.count} traces to ${exportFilename}\n\n`;
        overallProgress.style.width = '20%';

        // === STEP 2: Run Weak Model Inference ===
        stepLabel.textContent = 'Step 2/3: Running weak model inference...';
        progressText.textContent += `⚙️ Running inference with ${allModels.length} model(s)...\n`;

        const taskId = `inference_${Date.now()}`;

        // Map inference progress to 20-60% of overall
        const stopInferencePolling = startProgressPolling(
            taskId,
            'Error polling progress:',
            (percent) => 20 + (percent * 0.4)
        );

        let inferenceResponse;
        try {
            inferenceResponse = await fetch('/run_inference', {
                method: 'POST',
                headers: { 'Content-Type': 'application/json' },
                body: JSON.stringify({
                    models: allModels,
                    strong_export_file: exportFilename,
                    num_examples: numExamples,
                    task_id: taskId
                })
            });
        } finally {
            // Stop polling even when the request itself rejects
            stopInferencePolling();
        }

        if (!inferenceResponse.ok) {
            throw new Error('Failed to run inference');
        }

        // The parsed body is not used below; it is still read so that a
        // malformed response fails the run at this step.
        await inferenceResponse.json();
        progressText.textContent += `✓ Generated outputs for ${allModels.length} model(s)\n\n`;
        overallProgress.style.width = '60%';

        // === STEP 3: Run Evaluations ===
        stepLabel.textContent = 'Step 3/3: Running evaluations...';
        progressText.textContent += `📊 Running evaluations with judge: ${judge.name}...\n`;

        const evaluationResults = [];

        // Get list of weak model files that were just generated
        const weakModelsResponse = await fetch('/list_weak_models');
        const weakModelsData = await weakModelsResponse.json();

        // Filter to only the models we just ran. Files are matched by model
        // name with '/' turned into '_'. Both the first-slash-only form (what
        // this code matched historically) and the every-slash form are tried.
        // NOTE(review): presumably the server replaces every '/' when naming
        // files - confirm against the server's naming scheme.
        const toFileTokens = (m) => [m.replace('/', '_'), m.split('/').join('_')];
        const weakModelFiles = weakModelsData.files
            .filter(f => allModels.some(m => toFileTokens(m).some(token => f.filename.includes(token))))
            .map(f => f.filename);

        if (weakModelFiles.length === 0) {
            // Reporting "complete" with nothing evaluated would be misleading
            throw new Error('No weak model output files matched the selected models');
        }

        for (let i = 0; i < weakModelFiles.length; i++) {
            const modelFile = weakModelFiles[i];
            const evalTaskId = `eval_${Date.now()}_${i}`;

            progressText.textContent += `\n[${i+1}/${weakModelFiles.length}] Evaluating ${modelFile}...\n`;

            // Map eval progress to 60-100% of overall: each file owns an equal
            // slice of the remaining 40%.
            const basePercent = 60 + (i / weakModelFiles.length) * 40;
            const stopEvalPolling = startProgressPolling(
                evalTaskId,
                'Error polling eval progress:',
                (percent) => basePercent + (percent / 100) * (40 / weakModelFiles.length)
            );

            let evalResponse;
            try {
                evalResponse = await fetch('/run_evaluation', {
                    method: 'POST',
                    headers: { 'Content-Type': 'application/json' },
                    body: JSON.stringify({
                        model_file: modelFile,
                        judge: judge,
                        task_id: evalTaskId
                    })
                });
            } finally {
                stopEvalPolling();
            }

            // A failed evaluation is reported and skipped; remaining files still run
            if (evalResponse.ok) {
                const evalResult = await evalResponse.json();
                progressText.textContent += `  ✓ Complete: ${evalResult.examples_evaluated} examples evaluated\n`;
                evaluationResults.push(evalResult);
            } else {
                progressText.textContent += `  ✗ Error evaluating ${modelFile}\n`;
            }
        }

        overallProgress.style.width = '100%';
        stepLabel.textContent = 'Complete!';
        progressText.textContent += `\n✅ All evaluations complete!\n`;

        // Show results
        document.getElementById('e2e-results').style.display = 'block';
        const resultsContent = document.getElementById('e2e-results-content');
        resultsContent.innerHTML = evaluationResults.map(r => `
            <div style="margin-bottom: 15px; padding: 15px; background: #1a1a1a; border-radius: 4px; border: 1px solid #2a2a2a;">
                <div style="font-weight: bold; color: #fff; margin-bottom: 8px;">${escapeHtml(r.evaluation_name)}</div>
                <div style="font-size: 12px; color: #888; margin-bottom: 8px;">
                    ${escapeHtml(r.examples_evaluated)} examples evaluated
                </div>
                <a href="${escapeHtml(r.weave_url)}" target="_blank" rel="noopener noreferrer" style="color: #4a9eff; font-size: 13px;">View in Weave →</a>
            </div>
        `).join('');

    } catch (error) {
        progressText.textContent += `\n\n❌ Error: ${error.message}\n`;
        stepLabel.textContent = 'Error occurred';
    }
});
|
|
1230
2004
|
</script>
|
|
1231
2005
|
</body>
|
|
1232
2006
|
</html>
|