quickdistill 0.1.6__py3-none-any.whl → 0.1.7__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
quickdistill/__init__.py CHANGED
@@ -8,7 +8,7 @@ This package provides tools to:
8
8
  - Export datasets for model evaluation
9
9
  """
10
10
 
11
- __version__ = "0.1.6"
11
+ __version__ = "0.1.7"
12
12
  __author__ = "Brett Young"
13
13
  __email__ = "bdytx5@umsystem.edu"
14
14
 
quickdistill/server.py CHANGED
@@ -3,7 +3,6 @@ import json
3
3
  import openai
4
4
  import weave
5
5
  import shutil
6
- import threading
7
6
  from flask import Flask, request, jsonify, send_from_directory
8
7
  from flask_cors import CORS
9
8
  from llmasajudge import LLMAsAJudge
@@ -34,8 +33,25 @@ CORS(app)
34
33
  # Progress tracking for long-running operations
35
34
  progress_state = {}
36
35
 
37
- # Configuration
38
- PROJECT = "wandb_inference"
36
+ # Load settings
37
+ SETTINGS_FILE = DATA_DIR / 'settings.json'
38
+ DEFAULT_SETTINGS = {
39
+ 'inference_project': 'wandb_fc/quickstart_playground',
40
+ 'evaluation_project': 'wandb_inference'
41
+ }
42
+
43
+ def load_settings():
44
+ if SETTINGS_FILE.exists():
45
+ with open(SETTINGS_FILE, 'r') as f:
46
+ return {**DEFAULT_SETTINGS, **json.load(f)}
47
+ return DEFAULT_SETTINGS.copy()
48
+
49
+ def save_settings(settings):
50
+ with open(SETTINGS_FILE, 'w') as f:
51
+ json.dump(settings, f, indent=2)
52
+
53
+ settings = load_settings()
54
+ PROJECT = settings['evaluation_project']
39
55
 
40
56
  weave.init(PROJECT)
41
57
 
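The evaluation and inference projects are no longer hard-coded: they are read from a settings.json kept in the app's data directory and merged over DEFAULT_SETTINGS at startup. A minimal sketch of pre-seeding that file before launching the server (the real DATA_DIR is defined outside this hunk, so the path below is an assumption):

# sketch: seed settings.json so the server starts with custom W&B projects
import json
from pathlib import Path

data_dir = Path.home() / ".quickdistill"           # assumption: stand-in for the real DATA_DIR
data_dir.mkdir(parents=True, exist_ok=True)
(data_dir / "settings.json").write_text(json.dumps({
    "inference_project": "my-team/my-playground",   # used for weak-model inference calls
    "evaluation_project": "my-team/my-evals",       # used when logging evaluations with Weave
}, indent=2))

Because weave.init(PROJECT) runs at import time, a changed evaluation_project only takes effect on the next server start; the settings UI added later in this diff prompts for a restart after saving.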
@@ -46,7 +62,7 @@ def create_client():
46
62
  api_key=os.getenv("WANDB_API_KEY"),
47
63
  project=PROJECT,
48
64
  default_headers={
49
- "OpenAI-Project": "wandb_fc/quickstart_playground" # replace with your team/project
65
+ "OpenAI-Project": settings['inference_project']
50
66
  }
51
67
  )
52
68
 
@@ -175,8 +191,9 @@ def run_inference_endpoint():
175
191
  if not traces:
176
192
  return jsonify({'error': 'No traces in export file'}), 400
177
193
 
178
- # Limit traces to num_examples
194
+ # Limit traces to num_examples (convert to int if needed)
179
195
  if num_examples:
196
+ num_examples = int(num_examples)
180
197
  traces = traces[:num_examples]
181
198
 
182
199
  output_files = []
@@ -284,6 +301,241 @@ def get_progress(task_id):
284
301
  return jsonify({'error': 'Task not found'}), 404
285
302
 
286
303
 
304
+ @app.route('/settings', methods=['GET'])
305
+ def get_settings():
306
+ """Get current settings"""
307
+ return jsonify(settings)
308
+
309
+
310
+ @app.route('/settings', methods=['POST'])
311
+ def update_settings():
312
+ """Update settings"""
313
+ global settings
314
+ data = request.json
315
+ settings.update(data)
316
+ save_settings(settings)
317
+ return jsonify({'status': 'success', 'settings': settings})
318
+
319
+
320
+ @app.route('/test_judge', methods=['POST'])
321
+ def test_judge():
322
+ """Test a judge on sample data to see raw inputs/outputs"""
323
+ data = request.json
324
+ judge = data.get('judge')
325
+ weak_model_file = data.get('weak_model_file')
326
+ num_samples = data.get('num_samples', 5)
327
+
328
+ if not judge or not weak_model_file:
329
+ return jsonify({'error': 'Missing judge or weak_model_file'}), 400
330
+
331
+ # Load weak model results
332
+ model_path = DATA_DIR / weak_model_file
333
+ with open(model_path, 'r') as f:
334
+ file_data = json.load(f)
335
+
336
+ # Handle both formats
337
+ if isinstance(file_data, dict) and 'results' in file_data:
338
+ results = file_data['results']
339
+ else:
340
+ results = file_data
341
+
342
+ # Limit to num_samples
343
+ samples_to_test = results[:min(num_samples, len(results))]
344
+
345
+ test_results = []
346
+
347
+ for example in samples_to_test:
348
+ # Skip examples with errors
349
+ if example.get('error') or not example.get('output'):
350
+ continue
351
+
352
+ strong_output = example.get('strong_model_output', '')
353
+ weak_output = example.get('output', '')
354
+
355
+ # Extract question
356
+ question = ""
357
+ messages = example.get('messages', [])
358
+ if messages and len(messages) > 0:
359
+ question = messages[0].get('content', '')
360
+
361
+ # Build the prompt
362
+ prompt = judge['prompt']
363
+ if '{question}' in prompt:
364
+ prompt = prompt.replace('{question}', question or '')
365
+ if '{strong_output}' in prompt:
366
+ prompt = prompt.replace('{strong_output}', strong_output or '')
367
+ if '{weak_output}' in prompt:
368
+ prompt = prompt.replace('{weak_output}', weak_output or '')
369
+
370
+ # Run the judge and capture raw response
371
+ if judge['type'] == 'llm':
372
+ return_type = judge.get('returnType', 'scalar')
373
+
374
+ # Use a list to capture the raw response (mutable so we can access from closure)
375
+ captured_raw = []
376
+
377
+ def score_parser(response: str):
378
+ """Parse the judge response based on return type"""
379
+ # Capture the raw response before any processing
380
+ captured_raw.append(response)
381
+
382
+ response = response.strip()
383
+
384
+ # Remove markdown code blocks if present
385
+ if response.startswith('```'):
386
+ # Remove ```json or ``` at start
387
+ response = response.split('\n', 1)[1] if '\n' in response else response[3:]
388
+ # Remove ``` at end
389
+ if response.endswith('```'):
390
+ response = response.rsplit('\n', 1)[0] if '\n' in response else response[:-3]
391
+ response = response.strip()
392
+
393
+ try:
394
+ # Parse JSON response
395
+ parsed = json.loads(response)
396
+
397
+ if return_type == 'boolean':
398
+ # Extract boolean value - return just the bool
399
+ val = parsed.get('correct', parsed.get('result', parsed.get('value', False)))
400
+ return bool(val)
401
+ elif return_type == 'scalar':
402
+ # Extract numeric score - return just the number
403
+ val = parsed.get('score', parsed.get('scores', parsed.get('value', 0)))
404
+ return float(val) if isinstance(val, (int, float)) else 0
405
+ else:
406
+ # Unsupported return type
407
+ print(f"Unsupported return type: {return_type}")
408
+ return 0
409
+ except:
410
+ print(f"Failed to parse judge response as JSON: {response}")
411
+ if return_type == 'scalar':
412
+ return 0
413
+ elif return_type == 'boolean':
414
+ return False
415
+ else:
416
+ return 0
417
+
418
+ # Use LLMAsAJudge exactly like the evaluation code
419
+ try:
420
+ # Initialize LLMAsAJudge with custom prompt
421
+ judge_instance = LLMAsAJudge(
422
+ models=[judge['model']],
423
+ use_fully_custom_prompt=True,
424
+ output_parser=score_parser,
425
+ return_type=return_type if return_type else None
426
+ )
427
+
428
+ # Get judgment
429
+ result = judge_instance.judge(prompt=prompt)
430
+
431
+ # Extract the raw response that was captured
432
+ raw_text = captured_raw[0] if captured_raw else "No response captured"
433
+
434
+ # Extract parsed scores from result
435
+ if return_type == 'scalar':
436
+ score_val = result.get('scores', result.get('correct', 0))
437
+ parsed_scores = {'score': score_val}
438
+ elif return_type == 'boolean':
439
+ bool_val = result.get('correct', False)
440
+ parsed_scores = {'correct': bool_val}
441
+ else:
442
+ # Unsupported return type - default to scalar
443
+ score_val = result.get('scores', result.get('correct', 0))
444
+ parsed_scores = {'score': score_val}
445
+
446
+ except Exception as e:
447
+ raw_text = f"Error: {str(e)}"
448
+ parsed_scores = {'error': str(e)}
449
+
450
+ test_results.append({
451
+ 'question': question,
452
+ 'strong_output': strong_output,
453
+ 'weak_output': weak_output,
454
+ 'judge_prompt': prompt,
455
+ 'raw_response': raw_text,
456
+ 'parsed_scores': parsed_scores
457
+ })
458
+
459
+ return jsonify({
460
+ 'status': 'success',
461
+ 'judge_name': judge['name'],
462
+ 'num_samples': len(test_results),
463
+ 'samples': test_results
464
+ })
465
+
466
+
467
+ @app.route('/generate_judge_prompt', methods=['POST'])
468
+ def generate_judge_prompt():
469
+ """Generate a judge prompt using AI based on sample data"""
470
+ data = request.json
471
+ weak_model_file = data.get('weak_model_file')
472
+ num_samples = data.get('num_samples', 3)
473
+ model = data.get('model', 'openai/gpt-5')
474
+ meta_prompt = data.get('meta_prompt')
475
+
476
+ if not weak_model_file or not meta_prompt:
477
+ return jsonify({'error': 'Missing weak_model_file or meta_prompt'}), 400
478
+
479
+ # Load weak model results
480
+ model_path = DATA_DIR / weak_model_file
481
+ with open(model_path, 'r') as f:
482
+ file_data = json.load(f)
483
+
484
+ # Handle both formats
485
+ if isinstance(file_data, dict) and 'results' in file_data:
486
+ results = file_data['results']
487
+ else:
488
+ results = file_data
489
+
490
+ # Limit to num_samples
491
+ samples_to_use = results[:min(num_samples, len(results))]
492
+
493
+ # Format samples for meta-prompt
494
+ samples_text = []
495
+ for i, example in enumerate(samples_to_use):
496
+ # Skip examples with errors
497
+ if example.get('error') or not example.get('output'):
498
+ continue
499
+
500
+ strong_output = example.get('strong_model_output', '')
501
+ weak_output = example.get('output', '')
502
+
503
+ # Extract question
504
+ question = ""
505
+ messages = example.get('messages', [])
506
+ if messages and len(messages) > 0:
507
+ question = messages[0].get('content', '')
508
+
509
+ samples_text.append(f"""Sample {i+1}:
510
+ Question: {question}
511
+ Strong Model Output: {strong_output}
512
+ Weak Model Output: {weak_output}
513
+ ---""")
514
+
515
+ samples_formatted = "\n\n".join(samples_text)
516
+
517
+ # Replace {SAMPLES} placeholder in meta-prompt
518
+ final_prompt = meta_prompt.replace('{SAMPLES}', samples_formatted)
519
+
520
+ # Call OpenRouter to generate the prompt
521
+ try:
522
+ client = create_openrouter_client()
523
+ response = client.chat.completions.create(
524
+ model=model,
525
+ messages=[{"role": "user", "content": final_prompt}]
526
+ )
527
+ generated_prompt = response.choices[0].message.content.strip()
528
+
529
+ return jsonify({
530
+ 'status': 'success',
531
+ 'generated_prompt': generated_prompt,
532
+ 'num_samples_used': len(samples_text)
533
+ })
534
+
535
+ except Exception as e:
536
+ return jsonify({'error': f'Failed to generate prompt: {str(e)}'}), 500
537
+
538
+
287
539
  @app.route('/list_weak_models', methods=['GET'])
288
540
  def list_weak_models():
289
541
  """List available weak model result files with metadata"""
quickdistill/static/judge_manager.html CHANGED
@@ -162,6 +162,13 @@
162
162
  <div class="container">
163
163
  <h1>Judge Manager</h1>
164
164
 
165
+ <!-- Prompt Generator Button -->
166
+ <div style="margin-bottom: 20px; display: none;">
167
+ <button onclick="openPromptGenerator()" style="padding: 10px 20px; background: #6a4a7e; color: white; border: none; border-radius: 4px; cursor: pointer; font-size: 14px;">
168
+ ✨ Generate Judge Prompt with AI
169
+ </button>
170
+ </div>
171
+
165
172
  <!-- Create/Edit Judge Section -->
166
173
  <div class="section">
167
174
  <h2 id="form-title">Create New Judge</h2>
@@ -217,6 +224,73 @@
217
224
  </div>
218
225
  </div>
219
226
 
227
+ <!-- Prompt Generator Panel -->
228
+ <div id="prompt-generator-panel" style="display: none; position: fixed; top: 0; left: 0; width: 100%; height: 100%; background: rgba(0,0,0,0.9); z-index: 1000; padding: 40px; overflow-y: auto;">
229
+ <div style="max-width: 1200px; margin: 0 auto; background: #1a1a1a; border-radius: 8px; padding: 30px; border: 1px solid #3a2a4a;">
230
+ <h2 style="color: #fff; margin-bottom: 10px;">AI-Powered Judge Prompt Generator</h2>
231
+ <p style="color: #888; font-size: 13px; margin-bottom: 25px;">
232
+ Generate specialized judge prompts by showing sample data to an AI model
233
+ </p>
234
+
235
+ <!-- Configuration -->
236
+ <div style="display: grid; grid-template-columns: 1fr 1fr; gap: 20px; margin-bottom: 20px;">
237
+ <div>
238
+ <label style="display: block; color: #aaa; margin-bottom: 8px; font-size: 14px;">Weak Model Dataset:</label>
239
+ <select id="gen-weak-model-select" style="width: 100%; padding: 10px; background: #2a2a2a; color: #fff; border: 1px solid #3a3a3a; border-radius: 4px; font-size: 14px;">
240
+ <option value="">Loading weak model files...</option>
241
+ </select>
242
+ </div>
243
+
244
+ <div>
245
+ <label style="display: block; color: #aaa; margin-bottom: 8px; font-size: 14px;">Number of Samples:</label>
246
+ <input type="number" id="gen-num-samples" value="3" min="1" max="10"
247
+ style="width: 100%; padding: 10px; background: #2a2a2a; color: #fff; border: 1px solid #3a3a3a; border-radius: 4px; font-size: 14px;">
248
+ <div style="color: #666; font-size: 12px; margin-top: 5px;">Max: 10 (for context limits)</div>
249
+ </div>
250
+ </div>
251
+
252
+ <div style="margin-bottom: 20px;">
253
+ <label style="display: block; color: #aaa; margin-bottom: 8px; font-size: 14px;">Generation Model:</label>
254
+ <input type="text" id="gen-model" value="openai/gpt-5"
255
+ style="width: 100%; padding: 10px; background: #2a2a2a; color: #fff; border: 1px solid #3a3a3a; border-radius: 4px; font-size: 14px;"
256
+ placeholder="e.g., openai/gpt-5, anthropic/claude-3.5-sonnet">
257
+ <div style="color: #666; font-size: 12px; margin-top: 5px;">OpenRouter model to use for generating the prompt</div>
258
+ </div>
259
+
260
+ <!-- Meta-Prompt -->
261
+ <div style="margin-bottom: 25px;">
262
+ <label style="display: block; color: #aaa; margin-bottom: 8px; font-size: 14px;">Meta-Prompt (edit as needed):</label>
263
+ <textarea id="gen-meta-prompt"
264
+ style="width: 100%; min-height: 250px; padding: 10px; background: #2a2a2a; color: #fff; border: 1px solid #3a3a3a; border-radius: 4px; font-size: 13px; font-family: 'Courier New', monospace; resize: vertical;"></textarea>
265
+ <div style="color: #666; font-size: 12px; margin-top: 5px;">
266
+ This prompt will be sent to the generation model along with sample data
267
+ </div>
268
+ </div>
269
+
270
+ <!-- Actions -->
271
+ <div style="display: flex; gap: 10px; margin-bottom: 25px;">
272
+ <button onclick="generatePrompt()" style="padding: 10px 20px; background: #6a4a7e; color: white; border: none; border-radius: 4px; cursor: pointer; font-weight: 500;">
273
+ Generate Prompt
274
+ </button>
275
+ <button onclick="closePromptGenerator()" style="padding: 10px 20px; background: #5a2a2a; color: white; border: none; border-radius: 4px; cursor: pointer;">
276
+ Close
277
+ </button>
278
+ </div>
279
+
280
+ <!-- Generated Output -->
281
+ <div id="gen-output-section" style="display: none;">
282
+ <h3 style="color: #4a9eff; margin-bottom: 15px;">Generated Judge Prompt</h3>
283
+ <textarea id="gen-output" readonly
284
+ style="width: 100%; min-height: 300px; padding: 15px; background: #0f0f0f; color: #4a9eff; border: 1px solid #4a9eff; border-radius: 4px; font-size: 13px; font-family: 'Courier New', monospace; resize: vertical;"></textarea>
285
+ <div style="margin-top: 10px;">
286
+ <button onclick="copyGeneratedPrompt()" style="padding: 8px 16px; background: #2a7c4a; color: white; border: none; border-radius: 4px; cursor: pointer;">
287
+ Copy to Clipboard
288
+ </button>
289
+ </div>
290
+ </div>
291
+ </div>
292
+ </div>
293
+
220
294
  <script>
221
295
  let judges = [];
222
296
  let editingIndex = null;
@@ -489,6 +563,113 @@ Respond in JSON format: {'correct': true} or {'correct': false}`
489
563
  console.log('Not changing prompt - user has edited it');
490
564
  }
491
565
  });
566
+
567
+ // === PROMPT GENERATOR ===
568
+
569
+ const DEFAULT_META_PROMPT = `You are an expert at creating evaluation prompts for judging AI model outputs. I'm building a specialized judge prompt to evaluate the quality/similarity of weak model outputs compared to strong reference model outputs.
570
+
571
+ I will show you some sample data below. Each sample contains:
572
+ - A question/input
573
+ - The strong reference model's output (ground truth)
574
+ - The weak model's output (what we're evaluating)
575
+
576
+ Your task: Create a specialized, detailed judge prompt that can be used to systematically evaluate the delta/difference between these outputs. The prompt should:
577
+ 1. Be specific to the patterns you see in this data
578
+ 2. Include clear evaluation criteria
579
+ 3. Be written in second-person ("You are...")
580
+ 4. Include the placeholders {question}, {strong_output}, and {weak_output}
581
+ 5. Specify the exact JSON format to return (either {'score': number} for scalar or {'correct': boolean} for boolean)
582
+
583
+ Sample Data:
584
+ {SAMPLES}
585
+
586
+ Based on these samples, create a specialized judge prompt that would effectively evaluate this type of data. Return ONLY the judge prompt text, nothing else.`;
587
+
588
+ async function openPromptGenerator() {
589
+ // Load weak model files
590
+ try {
591
+ const response = await fetch('/list_weak_models');
592
+ const data = await response.json();
593
+ const select = document.getElementById('gen-weak-model-select');
594
+
595
+ if (data.files && data.files.length > 0) {
596
+ select.innerHTML = data.files.map(f =>
597
+ `<option value="${f.filename}">${f.weak_model || f.filename}</option>`
598
+ ).join('');
599
+ } else {
600
+ select.innerHTML = '<option value="">No weak model files available</option>';
601
+ }
602
+ } catch (error) {
603
+ console.error('Error loading weak models:', error);
604
+ }
605
+
606
+ // Set default meta-prompt
607
+ document.getElementById('gen-meta-prompt').value = DEFAULT_META_PROMPT;
608
+
609
+ // Show panel
610
+ document.getElementById('prompt-generator-panel').style.display = 'block';
611
+ document.getElementById('gen-output-section').style.display = 'none';
612
+ }
613
+
614
+ function closePromptGenerator() {
615
+ document.getElementById('prompt-generator-panel').style.display = 'none';
616
+ }
617
+
618
+ async function generatePrompt() {
619
+ const weakModelFile = document.getElementById('gen-weak-model-select').value;
620
+ const numSamples = parseInt(document.getElementById('gen-num-samples').value) || 3;
621
+ const model = document.getElementById('gen-model').value.trim();
622
+ const metaPrompt = document.getElementById('gen-meta-prompt').value.trim();
623
+
624
+ if (!weakModelFile) {
625
+ alert('Please select a weak model dataset');
626
+ return;
627
+ }
628
+
629
+ if (!model) {
630
+ alert('Please enter a generation model');
631
+ return;
632
+ }
633
+
634
+ if (!metaPrompt) {
635
+ alert('Please enter a meta-prompt');
636
+ return;
637
+ }
638
+
639
+ try {
640
+ const response = await fetch('/generate_judge_prompt', {
641
+ method: 'POST',
642
+ headers: { 'Content-Type': 'application/json' },
643
+ body: JSON.stringify({
644
+ weak_model_file: weakModelFile,
645
+ num_samples: numSamples,
646
+ model: model,
647
+ meta_prompt: metaPrompt
648
+ })
649
+ });
650
+
651
+ if (!response.ok) {
652
+ throw new Error('Failed to generate prompt');
653
+ }
654
+
655
+ const result = await response.json();
656
+
657
+ // Display generated prompt
658
+ document.getElementById('gen-output').value = result.generated_prompt;
659
+ document.getElementById('gen-output-section').style.display = 'block';
660
+
661
+ } catch (error) {
662
+ alert('Error generating prompt: ' + error.message);
663
+ console.error('Generation error:', error);
664
+ }
665
+ }
666
+
667
+ function copyGeneratedPrompt() {
668
+ const output = document.getElementById('gen-output');
669
+ output.select();
670
+ document.execCommand('copy');
671
+ alert('Prompt copied to clipboard!');
672
+ }
492
673
  </script>
493
674
  </body>
494
675
  </html>
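The panel above posts the meta-prompt, a weak-model results file, and a sample count to the new /generate_judge_prompt endpoint, which substitutes the formatted samples for {SAMPLES} and asks an OpenRouter model to write the judge prompt. A hedged sketch of making the same call outside the browser (host, port, and the results filename are assumptions):

# sketch: call /generate_judge_prompt directly, mirroring the panel's fetch above
import requests

meta_prompt = (
    "You write evaluation prompts for LLM judges.\n"
    "Study the samples and produce a judge prompt that uses the placeholders "
    "{question}, {strong_output} and {weak_output} and ends with the required "
    'JSON format, e.g. {"score": number}.\n\n'
    "Samples:\n{SAMPLES}"                          # replaced server-side with formatted samples
)
resp = requests.post("http://localhost:5000/generate_judge_prompt", json={
    "weak_model_file": "weak_results.json",        # hypothetical file under the data directory
    "num_samples": 3,
    "model": "openai/gpt-5",                       # OpenRouter model id, same default as the panel
    "meta_prompt": meta_prompt,
})
print(resp.json().get("generated_prompt"))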
quickdistill/static/trace_viewer.html CHANGED
@@ -316,6 +316,22 @@
316
316
  Manage Judges
317
317
  </a>
318
318
 
319
+ <button id="open-test-judge-btn" style="padding: 8px 16px; background: #6a4a7e; color: white; border: none; border-radius: 4px; cursor: pointer;">
320
+ Test Judges
321
+ </button>
322
+
323
+ <button id="open-settings-btn" style="padding: 8px 16px; background: #5a5a5a; color: white; border: none; border-radius: 4px; cursor: pointer;">
324
+ Settings
325
+ </button>
326
+
327
+ <div style="margin: 20px 0; padding: 15px; background: #2a1a2a; border-radius: 8px; border: 1px solid #4a2a4a;">
328
+ <div style="color: #aaa; font-size: 13px; margin-bottom: 10px;">Automatic Workflow:</div>
329
+ <button id="open-e2e-btn" style="padding: 10px 20px; background: #7a4a9e; color: white; border: none; border-radius: 4px; cursor: pointer; font-weight: 500;">
330
+ ⚡ Run End-to-End Test
331
+ </button>
332
+ <div style="color: #666; font-size: 11px; margin-top: 8px;">Export → Generate → Evaluate (all in one)</div>
333
+ </div>
334
+
319
335
  <div class="stats">
320
336
  <div>Total: <span id="total-count">0</span></div>
321
337
  <div>Shown: <span id="shown-count">0</span></div>
@@ -427,6 +443,193 @@
427
443
  </div>
428
444
  </div>
429
445
  </div>
446
+
447
+ <!-- Settings Panel -->
448
+ <div id="settings-panel" style="display: none; position: fixed; top: 0; left: 0; width: 100%; height: 100%; background: rgba(0,0,0,0.8); z-index: 1000; padding: 40px;">
449
+ <div style="max-width: 600px; margin: 0 auto; background: #1a1a1a; border-radius: 8px; padding: 30px; border: 1px solid #2a2a2a;">
450
+ <h2 style="color: #fff; margin-bottom: 20px;">Settings</h2>
451
+
452
+ <div style="margin-bottom: 20px;">
453
+ <label style="display: block; color: #aaa; margin-bottom: 8px; font-size: 14px;">W&B Inference Project</label>
454
+ <input type="text" id="settings-inference-project" placeholder="e.g., wandb_fc/quickstart_playground"
455
+ style="width: 100%; padding: 10px; background: #2a2a2a; color: #fff; border: 1px solid #3a3a3a; border-radius: 4px; font-size: 14px;">
456
+ <div style="color: #666; font-size: 12px; margin-top: 5px;">Used for running weak model inference</div>
457
+ </div>
458
+
459
+ <div style="margin-bottom: 30px;">
460
+ <label style="display: block; color: #aaa; margin-bottom: 8px; font-size: 14px;">W&B Evaluation Project</label>
461
+ <input type="text" id="settings-evaluation-project" placeholder="e.g., wandb_inference"
462
+ style="width: 100%; padding: 10px; background: #2a2a2a; color: #fff; border: 1px solid #3a3a3a; border-radius: 4px; font-size: 14px;">
463
+ <div style="color: #666; font-size: 12px; margin-top: 5px;">Used for logging evaluation results with Weave</div>
464
+ </div>
465
+
466
+ <div style="display: flex; gap: 10px; justify-content: flex-end;">
467
+ <button id="close-settings-btn" style="padding: 10px 20px; background: #5a2a2a; color: white; border: none; border-radius: 4px; cursor: pointer;">
468
+ Cancel
469
+ </button>
470
+ <button id="save-settings-btn" style="padding: 10px 20px; background: #2a7c4a; color: white; border: none; border-radius: 4px; cursor: pointer;">
471
+ Save Settings
472
+ </button>
473
+ </div>
474
+ </div>
475
+ </div>
476
+
477
+ <!-- Test Judges Panel -->
478
+ <div id="test-judge-panel" style="display: none; position: fixed; top: 0; left: 0; width: 100%; height: 100%; background: rgba(0,0,0,0.8); z-index: 1000; padding: 40px; overflow-y: auto;">
479
+ <div style="max-width: 1000px; margin: 0 auto; background: #1a1a1a; border-radius: 8px; padding: 30px; border: 1px solid #3a2a4a;">
480
+ <h2 style="color: #fff; margin-bottom: 10px;">Test Judge</h2>
481
+ <p style="color: #888; font-size: 13px; margin-bottom: 25px;">
482
+ Test your judge on sample data to see exactly what inputs/outputs it receives
483
+ </p>
484
+
485
+ <!-- Configuration -->
486
+ <div style="display: grid; grid-template-columns: 1fr 1fr; gap: 20px; margin-bottom: 25px;">
487
+ <div>
488
+ <label style="display: block; color: #aaa; margin-bottom: 8px; font-size: 14px;">Select Judge:</label>
489
+ <select id="test-judge-select" style="width: 100%; padding: 10px; background: #2a2a2a; color: #fff; border: 1px solid #3a3a3a; border-radius: 4px; font-size: 14px;">
490
+ <option value="">Loading judges...</option>
491
+ </select>
492
+ </div>
493
+
494
+ <div>
495
+ <label style="display: block; color: #aaa; margin-bottom: 8px; font-size: 14px;">Weak Model Data:</label>
496
+ <select id="test-weak-model-select" style="width: 100%; padding: 10px; background: #2a2a2a; color: #fff; border: 1px solid #3a3a3a; border-radius: 4px; font-size: 14px;">
497
+ <option value="">Loading weak model files...</option>
498
+ </select>
499
+ </div>
500
+ </div>
501
+
502
+ <div style="margin-bottom: 20px;">
503
+ <label style="display: block; color: #aaa; margin-bottom: 8px; font-size: 14px;">Number of Samples:</label>
504
+ <input type="number" id="test-num-samples" value="5" min="1" max="50"
505
+ style="width: 150px; padding: 10px; background: #2a2a2a; color: #fff; border: 1px solid #3a3a3a; border-radius: 4px; font-size: 14px;">
506
+ <span style="color: #666; font-size: 12px; margin-left: 10px;">Max: 50</span>
507
+ </div>
508
+
509
+ <!-- Judge Model -->
510
+ <div style="margin-bottom: 20px;">
511
+ <label style="display: block; color: #aaa; margin-bottom: 8px; font-size: 14px;">Judge Model:</label>
512
+ <input type="text" id="test-judge-model"
513
+ style="width: 100%; padding: 10px; background: #2a2a2a; color: #fff; border: 1px solid #3a3a3a; border-radius: 4px; font-size: 14px;"
514
+ placeholder="e.g., gpt-4o, claude-3-5-sonnet-20241022">
515
+ <div style="color: #666; font-size: 12px; margin-top: 5px;">
516
+ Override the judge's model for this test
517
+ </div>
518
+ </div>
519
+
520
+ <!-- Judge Prompt -->
521
+ <div style="margin-bottom: 30px;">
522
+ <label style="display: block; color: #aaa; margin-bottom: 8px; font-size: 14px;">Judge Prompt:</label>
523
+ <textarea id="test-judge-prompt"
524
+ style="width: 100%; min-height: 200px; padding: 10px; background: #2a2a2a; color: #fff; border: 1px solid #3a3a3a; border-radius: 4px; font-size: 13px; font-family: 'Courier New', monospace; resize: vertical;"
525
+ placeholder="Select a judge to load its prompt..."></textarea>
526
+ <div style="color: #666; font-size: 12px; margin-top: 5px;">
527
+ Edit the prompt and test changes, or save to update the judge permanently
528
+ </div>
529
+ </div>
530
+
531
+ <!-- Actions -->
532
+ <div style="display: flex; gap: 10px; margin-bottom: 30px;">
533
+ <button id="run-test-judge-btn" style="padding: 10px 20px; background: #6a4a7e; color: white; border: none; border-radius: 4px; cursor: pointer; font-weight: 500;">
534
+ Run Test
535
+ </button>
536
+ <button id="save-test-judge-prompt-btn" style="padding: 10px 20px; background: #2a7c4a; color: white; border: none; border-radius: 4px; cursor: pointer;">
537
+ Save Prompt to Judge
538
+ </button>
539
+ <button id="close-test-judge-btn" style="padding: 10px 20px; background: #5a2a2a; color: white; border: none; border-radius: 4px; cursor: pointer;">
540
+ Close
541
+ </button>
542
+ </div>
543
+
544
+ <!-- Results -->
545
+ <div id="test-judge-results" style="display: none;">
546
+ <h3 style="color: #4a9eff; margin-bottom: 15px;">Test Results</h3>
547
+ <div id="test-judge-results-content" style="max-height: 600px; overflow-y: auto;">
548
+ <!-- Results populated here -->
549
+ </div>
550
+ </div>
551
+ </div>
552
+ </div>
553
+
554
+ <!-- End-to-End Test Panel -->
555
+ <div id="e2e-panel" style="display: none; position: fixed; top: 0; left: 0; width: 100%; height: 100%; background: rgba(0,0,0,0.8); z-index: 1000; padding: 40px; overflow-y: auto;">
556
+ <div style="max-width: 800px; margin: 0 auto; background: #1a1a1a; border-radius: 8px; padding: 30px; border: 1px solid #4a2a4a;">
557
+ <h2 style="color: #fff; margin-bottom: 10px;">⚡ Run End-to-End Test</h2>
558
+ <p style="color: #888; font-size: 13px; margin-bottom: 25px;">
559
+ This will automatically: Export selected traces → Run weak models → Evaluate with judge
560
+ </p>
561
+
562
+ <!-- Weak Model Selection -->
563
+ <div style="margin-bottom: 25px;">
564
+ <h3 style="color: #fff; font-size: 16px; margin-bottom: 15px;">1. Select Weak Models</h3>
565
+
566
+ <div style="margin-bottom: 15px;">
567
+ <label style="display: block; color: #aaa; margin-bottom: 8px; font-size: 13px;">W&B Models:</label>
568
+ <div id="e2e-wandb-models" style="max-height: 150px; overflow-y: auto; background: #0f0f0f; padding: 10px; border-radius: 4px; border: 1px solid #2a2a2a;">
569
+ <!-- Populated dynamically -->
570
+ </div>
571
+ </div>
572
+
573
+ <div style="margin-bottom: 15px;">
574
+ <label style="display: block; color: #aaa; margin-bottom: 8px; font-size: 13px;">OpenRouter Models (optional):</label>
575
+ <textarea id="e2e-openrouter-models" placeholder="Enter OpenRouter models (one per line)&#10;e.g.,&#10;meta-llama/llama-3.3-70b-instruct&#10;anthropic/claude-3.5-sonnet"
576
+ style="width: 100%; padding: 8px; background: #2a2a2a; color: #fff; border: 1px solid #3a3a3a; border-radius: 4px; font-size: 13px; min-height: 80px; font-family: monospace;"></textarea>
577
+ <div style="color: #666; font-size: 11px; margin-top: 5px;">One model per line</div>
578
+ </div>
579
+
580
+ <div>
581
+ <label style="display: block; color: #aaa; margin-bottom: 8px; font-size: 13px;">Max Examples (optional):</label>
582
+ <input type="number" id="e2e-num-examples" placeholder="Leave empty to use all selected traces"
583
+ style="width: 200px; padding: 8px; background: #2a2a2a; color: #fff; border: 1px solid #3a3a3a; border-radius: 4px; font-size: 13px;">
584
+ </div>
585
+ </div>
586
+
587
+ <!-- Judge Selection -->
588
+ <div style="margin-bottom: 30px;">
589
+ <h3 style="color: #fff; font-size: 16px; margin-bottom: 15px;">2. Select Judge</h3>
590
+ <select id="e2e-judge" style="width: 100%; padding: 10px; background: #2a2a2a; color: #fff; border: 1px solid #3a3a3a; border-radius: 4px; font-size: 14px;">
591
+ <option value="">Loading judges...</option>
592
+ </select>
593
+ </div>
594
+
595
+ <!-- Actions -->
596
+ <div style="display: flex; gap: 10px; justify-content: flex-end;">
597
+ <button id="close-e2e-btn" style="padding: 10px 20px; background: #5a2a2a; color: white; border: none; border-radius: 4px; cursor: pointer;">
598
+ Cancel
599
+ </button>
600
+ <button id="run-e2e-btn" style="padding: 10px 20px; background: #7a4a9e; color: white; border: none; border-radius: 4px; cursor: pointer; font-weight: 500;">
601
+ ⚡ Run Test
602
+ </button>
603
+ </div>
604
+ </div>
605
+ </div>
606
+
607
+ <!-- End-to-End Progress Panel -->
608
+ <div id="e2e-progress-panel" style="display: none; position: fixed; top: 0; left: 0; width: 100%; height: 100%; background: rgba(0,0,0,0.9); z-index: 1100; padding: 40px; overflow-y: auto;">
609
+ <div style="max-width: 800px; margin: 0 auto; background: #1a1a1a; border-radius: 8px; padding: 30px; border: 1px solid #4a2a4a;">
610
+ <h2 style="color: #fff; margin-bottom: 20px;">Running End-to-End Test</h2>
611
+
612
+ <!-- Overall Progress -->
613
+ <div style="margin-bottom: 30px;">
614
+ <div style="color: #4a9eff; font-size: 14px; margin-bottom: 10px;" id="e2e-step-label">Step 1/3: Exporting traces...</div>
615
+ <div style="width: 100%; height: 8px; background: #2a2a2a; border-radius: 4px; overflow: hidden;">
616
+ <div id="e2e-overall-progress" style="height: 100%; background: #7a4a9e; width: 0%; transition: width 0.3s;"></div>
617
+ </div>
618
+ </div>
619
+
620
+ <!-- Detailed Progress -->
621
+ <div id="e2e-progress-text" style="color: #888; font-family: monospace; font-size: 12px; white-space: pre-wrap; background: #0f0f0f; padding: 15px; border-radius: 4px; max-height: 400px; overflow-y: auto;"></div>
622
+
623
+ <!-- Results -->
624
+ <div id="e2e-results" style="display: none; margin-top: 20px;">
625
+ <h3 style="color: #4a9eff; margin-bottom: 15px;">✓ Test Complete!</h3>
626
+ <div id="e2e-results-content" style="background: #0f0f0f; padding: 15px; border-radius: 4px;"></div>
627
+ <button id="close-e2e-progress-btn" style="margin-top: 20px; padding: 10px 20px; background: #2a7c4a; color: white; border: none; border-radius: 4px; cursor: pointer;">
628
+ Close
629
+ </button>
630
+ </div>
631
+ </div>
632
+ </div>
430
633
  </div>
431
634
 
432
635
  <script>
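Both panels above talk to endpoints added to server.py earlier in this diff: the Settings panel round-trips GET/POST /settings, and the Test Judges panel posts a judge definition plus a weak-model results file to /test_judge to preview raw and parsed judge responses. A hedged sketch of the same calls from Python (host, port, filename, and the judge's model are assumptions; payload shapes follow the handlers and the fetch calls in this file):

# sketch: exercise the /settings and /test_judge endpoints outside the browser
import requests

BASE = "http://localhost:5000"                     # assumption: default Flask host/port

print(requests.get(f"{BASE}/settings").json())     # current persisted settings
requests.post(f"{BASE}/settings", json={"evaluation_project": "my-team/my-evals"})

judge = {
    "name": "similarity-judge",                    # hypothetical judge definition
    "type": "llm",
    "model": "gpt-4o",
    "returnType": "scalar",
    "prompt": (
        "Score how well the weak answer matches the reference.\n"
        "Question: {question}\nReference: {strong_output}\nCandidate: {weak_output}\n"
        'Respond in JSON: {"score": number between 0 and 1}'
    ),
}
resp = requests.post(f"{BASE}/test_judge", json={
    "judge": judge,
    "weak_model_file": "weak_results.json",        # hypothetical results file
    "num_samples": 3,
})
for sample in resp.json()["samples"]:
    print(sample["parsed_scores"], "|", sample["raw_response"][:80])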
@@ -1287,6 +1490,517 @@
1287
1490
  console.error('Delete error:', error);
1288
1491
  }
1289
1492
  }
1493
+
1494
+ // === SETTINGS ===
1495
+
1496
+ // Load and display settings
1497
+ async function loadSettings() {
1498
+ try {
1499
+ const response = await fetch('/settings');
1500
+ const settings = await response.json();
1501
+ document.getElementById('settings-inference-project').value = settings.inference_project || '';
1502
+ document.getElementById('settings-evaluation-project').value = settings.evaluation_project || '';
1503
+ } catch (error) {
1504
+ console.error('Error loading settings:', error);
1505
+ }
1506
+ }
1507
+
1508
+ // Open settings panel
1509
+ document.getElementById('open-settings-btn').addEventListener('click', async () => {
1510
+ await loadSettings();
1511
+ document.getElementById('settings-panel').style.display = 'block';
1512
+ });
1513
+
1514
+ // Close settings panel
1515
+ document.getElementById('close-settings-btn').addEventListener('click', () => {
1516
+ document.getElementById('settings-panel').style.display = 'none';
1517
+ });
1518
+
1519
+ // Save settings
1520
+ document.getElementById('save-settings-btn').addEventListener('click', async () => {
1521
+ const settings = {
1522
+ inference_project: document.getElementById('settings-inference-project').value.trim(),
1523
+ evaluation_project: document.getElementById('settings-evaluation-project').value.trim()
1524
+ };
1525
+
1526
+ if (!settings.inference_project || !settings.evaluation_project) {
1527
+ alert('Both project fields are required');
1528
+ return;
1529
+ }
1530
+
1531
+ try {
1532
+ const response = await fetch('/settings', {
1533
+ method: 'POST',
1534
+ headers: { 'Content-Type': 'application/json' },
1535
+ body: JSON.stringify(settings)
1536
+ });
1537
+
1538
+ const result = await response.json();
1539
+ if (result.status === 'success') {
1540
+ alert('Settings saved! Please restart the server for changes to take effect.');
1541
+ document.getElementById('settings-panel').style.display = 'none';
1542
+ } else {
1543
+ alert('Error saving settings');
1544
+ }
1545
+ } catch (error) {
1546
+ alert('Error saving settings: ' + error.message);
1547
+ }
1548
+ });
1549
+
1550
+ // === TEST JUDGES ===
1551
+
1552
+ let testJudgesData = []; // Store judges globally for test panel
1553
+
1554
+ // Open test judge panel
1555
+ document.getElementById('open-test-judge-btn').addEventListener('click', async () => {
1556
+ // Load judges
1557
+ try {
1558
+ const response = await fetch('/list_judges');
1559
+ const data = await response.json();
1560
+ testJudgesData = data.judges || []; // Store globally
1561
+ const judgeSelect = document.getElementById('test-judge-select');
1562
+
1563
+ if (testJudgesData.length > 0) {
1564
+ judgeSelect.innerHTML = testJudgesData.map((judge, idx) =>
1565
+ `<option value="${idx}">${judge.name} (${judge.type})</option>`
1566
+ ).join('');
1567
+
1568
+ // Load first judge's prompt and model
1569
+ if (testJudgesData[0]) {
1570
+ document.getElementById('test-judge-prompt').value = testJudgesData[0].prompt || '';
1571
+ document.getElementById('test-judge-model').value = testJudgesData[0].model || '';
1572
+ }
1573
+ } else {
1574
+ judgeSelect.innerHTML = '<option value="">No judges available</option>';
1575
+ }
1576
+ } catch (error) {
1577
+ console.error('Error loading judges:', error);
1578
+ }
1579
+
1580
+ // Load weak model files
1581
+ try {
1582
+ const response = await fetch('/list_weak_models');
1583
+ const data = await response.json();
1584
+ const weakModelSelect = document.getElementById('test-weak-model-select');
1585
+
1586
+ if (data.files && data.files.length > 0) {
1587
+ weakModelSelect.innerHTML = data.files.map(f =>
1588
+ `<option value="${f.filename}">${f.weak_model || f.filename}</option>`
1589
+ ).join('');
1590
+ } else {
1591
+ weakModelSelect.innerHTML = '<option value="">No weak model files available</option>';
1592
+ }
1593
+ } catch (error) {
1594
+ console.error('Error loading weak models:', error);
1595
+ }
1596
+
1597
+ document.getElementById('test-judge-panel').style.display = 'block';
1598
+ document.getElementById('test-judge-results').style.display = 'none';
1599
+ });
1600
+
1601
+ // When judge selection changes, update the prompt and model
1602
+ document.getElementById('test-judge-select').addEventListener('change', (e) => {
1603
+ const judgeIndex = parseInt(e.target.value);
1604
+ if (!isNaN(judgeIndex) && testJudgesData[judgeIndex]) {
1605
+ const judge = testJudgesData[judgeIndex];
1606
+ document.getElementById('test-judge-prompt').value = judge.prompt || '';
1607
+ document.getElementById('test-judge-model').value = judge.model || '';
1608
+ }
1609
+ });
1610
+
1611
+ // Close test judge panel
1612
+ document.getElementById('close-test-judge-btn').addEventListener('click', () => {
1613
+ document.getElementById('test-judge-panel').style.display = 'none';
1614
+ });
1615
+
1616
+ // Run test judge
1617
+ document.getElementById('run-test-judge-btn').addEventListener('click', async () => {
1618
+ const judgeIndex = document.getElementById('test-judge-select').value;
1619
+ const weakModelFile = document.getElementById('test-weak-model-select').value;
1620
+ const numSamples = parseInt(document.getElementById('test-num-samples').value) || 5;
1621
+ const editedPrompt = document.getElementById('test-judge-prompt').value;
1622
+ const editedModel = document.getElementById('test-judge-model').value;
1623
+
1624
+ if (!judgeIndex) {
1625
+ alert('Please select a judge');
1626
+ return;
1627
+ }
1628
+
1629
+ if (!weakModelFile) {
1630
+ alert('Please select a weak model file');
1631
+ return;
1632
+ }
1633
+
1634
+ if (!editedPrompt.trim()) {
1635
+ alert('Please enter a judge prompt');
1636
+ return;
1637
+ }
1638
+
1639
+ if (!editedModel.trim()) {
1640
+ alert('Please enter a judge model');
1641
+ return;
1642
+ }
1643
+
1644
+ // Get judge data and override with edited prompt and model
1645
+ const judge = { ...testJudgesData[parseInt(judgeIndex)] };
1646
+ judge.prompt = editedPrompt; // Use the edited prompt from textarea
1647
+ judge.model = editedModel; // Use the edited model from input
1648
+
1649
+ // Call test endpoint
1650
+ try {
1651
+ const response = await fetch('/test_judge', {
1652
+ method: 'POST',
1653
+ headers: { 'Content-Type': 'application/json' },
1654
+ body: JSON.stringify({
1655
+ judge: judge,
1656
+ weak_model_file: weakModelFile,
1657
+ num_samples: numSamples
1658
+ })
1659
+ });
1660
+
1661
+ if (!response.ok) {
1662
+ throw new Error('Failed to test judge');
1663
+ }
1664
+
1665
+ const result = await response.json();
1666
+
1667
+ // Display results
1668
+ const resultsDiv = document.getElementById('test-judge-results-content');
1669
+ resultsDiv.innerHTML = result.samples.map((sample, idx) => `
1670
+ <div style="margin-bottom: 20px; padding: 20px; background: #0f0f0f; border-radius: 8px; border: 1px solid #2a2a2a;">
1671
+ <h4 style="color: #4a9eff; margin-bottom: 15px;">Sample ${idx + 1} of ${result.samples.length}</h4>
1672
+
1673
+ <div style="margin-bottom: 15px;">
1674
+ <div style="color: #888; font-size: 12px; margin-bottom: 5px;">Question:</div>
1675
+ <div style="color: #ccc; font-size: 13px; background: #1a1a1a; padding: 10px; border-radius: 4px; max-height: 100px; overflow-y: auto;">${sample.question || 'N/A'}</div>
1676
+ </div>
1677
+
1678
+ <div style="margin-bottom: 15px;">
1679
+ <div style="color: #888; font-size: 12px; margin-bottom: 5px;">Strong Model Output:</div>
1680
+ <div style="color: #ccc; font-size: 13px; background: #1a1a1a; padding: 10px; border-radius: 4px; max-height: 100px; overflow-y: auto;">${sample.strong_output || 'N/A'}</div>
1681
+ </div>
1682
+
1683
+ <div style="margin-bottom: 15px;">
1684
+ <div style="color: #888; font-size: 12px; margin-bottom: 5px;">Weak Model Output:</div>
1685
+ <div style="color: #ccc; font-size: 13px; background: #1a1a1a; padding: 10px; border-radius: 4px; max-height: 100px; overflow-y: auto;">${sample.weak_output || 'N/A'}</div>
1686
+ </div>
1687
+
1688
+ <div style="margin-bottom: 15px;">
1689
+ <div style="color: #888; font-size: 12px; margin-bottom: 5px;">Judge Prompt (filled):</div>
1690
+ <pre style="color: #aaa; font-size: 11px; background: #1a1a1a; padding: 10px; border-radius: 4px; max-height: 150px; overflow-y: auto; white-space: pre-wrap; font-family: monospace;">${sample.judge_prompt}</pre>
1691
+ </div>
1692
+
1693
+ <div style="margin-bottom: 15px;">
1694
+ <div style="color: #888; font-size: 12px; margin-bottom: 5px;">Raw Judge Response:</div>
1695
+ <pre style="color: #f4d03f; font-size: 11px; background: #1a1a1a; padding: 10px; border-radius: 4px; max-height: 150px; overflow-y: auto; white-space: pre-wrap; font-family: monospace;">${sample.raw_response}</pre>
1696
+ </div>
1697
+
1698
+ <div>
1699
+ <div style="color: #888; font-size: 12px; margin-bottom: 5px;">Parsed Scores:</div>
1700
+ <div style="color: #4a9eff; font-size: 13px; background: #1a1a1a; padding: 10px; border-radius: 4px; font-family: monospace;">${JSON.stringify(sample.parsed_scores, null, 2)}</div>
1701
+ </div>
1702
+ </div>
1703
+ `).join('');
1704
+
1705
+ document.getElementById('test-judge-results').style.display = 'block';
1706
+
1707
+ } catch (error) {
1708
+ alert('Error testing judge: ' + error.message);
1709
+ console.error('Test error:', error);
1710
+ }
1711
+ });
1712
+
1713
+ // Save prompt to judge
1714
+ document.getElementById('save-test-judge-prompt-btn').addEventListener('click', async () => {
1715
+ const judgeIndex = document.getElementById('test-judge-select').value;
1716
+ const editedPrompt = document.getElementById('test-judge-prompt').value;
1717
+
1718
+ if (!judgeIndex) {
1719
+ alert('Please select a judge');
1720
+ return;
1721
+ }
1722
+
1723
+ if (!editedPrompt.trim()) {
1724
+ alert('Please enter a judge prompt');
1725
+ return;
1726
+ }
1727
+
1728
+ // Get judge data and update prompt
1729
+ const judge = { ...testJudgesData[parseInt(judgeIndex)] };
1730
+ judge.prompt = editedPrompt;
1731
+
1732
+ // Confirm with user
1733
+ if (!confirm(`Save this prompt to judge "${judge.name}"? This will permanently update the judge.`)) {
1734
+ return;
1735
+ }
1736
+
1737
+ // Call save endpoint
1738
+ try {
1739
+ const response = await fetch('/save_judge', {
1740
+ method: 'POST',
1741
+ headers: { 'Content-Type': 'application/json' },
1742
+ body: JSON.stringify({ judge: judge })
1743
+ });
1744
+
1745
+ if (!response.ok) {
1746
+ throw new Error('Failed to save judge');
1747
+ }
1748
+
1749
+ const result = await response.json();
1750
+
1751
+ // Update local judges data
1752
+ testJudgesData = result.judges || [];
1753
+
1754
+ alert('Judge prompt saved successfully!');
1755
+ } catch (error) {
1756
+ alert('Error saving judge: ' + error.message);
1757
+ console.error('Save error:', error);
1758
+ }
1759
+ });
1760
+
1761
+ // === END-TO-END TEST ===
1762
+
1763
+ // Open E2E panel
1764
+ document.getElementById('open-e2e-btn').addEventListener('click', async () => {
1765
+ if (selectedTraces.size === 0) {
1766
+ alert('Please select at least one trace first!');
1767
+ return;
1768
+ }
1769
+
1770
+ // Populate W&B models
1771
+ const wandbModelsDiv = document.getElementById('e2e-wandb-models');
1772
+ wandbModelsDiv.innerHTML = AVAILABLE_MODELS.map(model => `
1773
+ <label style="display: block; padding: 5px 0; color: #ccc; cursor: pointer;">
1774
+ <input type="checkbox" class="e2e-model-checkbox" value="${model}" style="margin-right: 8px;">
1775
+ ${model}
1776
+ </label>
1777
+ `).join('');
1778
+
1779
+ // Load judges
1780
+ try {
1781
+ const response = await fetch('/list_judges');
1782
+ const data = await response.json();
1783
+ const judgeSelect = document.getElementById('e2e-judge');
1784
+
1785
+ if (data.judges && data.judges.length > 0) {
1786
+ judgeSelect.innerHTML = data.judges.map((judge, idx) =>
1787
+ `<option value="${idx}">${judge.name} (${judge.type})</option>`
1788
+ ).join('');
1789
+ } else {
1790
+ judgeSelect.innerHTML = '<option value="">No judges available - create one first</option>';
1791
+ }
1792
+ } catch (error) {
1793
+ console.error('Error loading judges:', error);
1794
+ }
1795
+
1796
+ document.getElementById('e2e-panel').style.display = 'block';
1797
+ });
1798
+
1799
+ // Close E2E panel
1800
+ document.getElementById('close-e2e-btn').addEventListener('click', () => {
1801
+ document.getElementById('e2e-panel').style.display = 'none';
1802
+ });
1803
+
1804
+ // Close E2E progress
1805
+ document.getElementById('close-e2e-progress-btn').addEventListener('click', () => {
1806
+ document.getElementById('e2e-progress-panel').style.display = 'none';
1807
+ document.getElementById('e2e-results').style.display = 'none';
1808
+ });
1809
+
1810
+ // Run end-to-end test
1811
+ document.getElementById('run-e2e-btn').addEventListener('click', async () => {
1812
+ // Gather selected models
1813
+ const selectedWanbModels = Array.from(document.querySelectorAll('.e2e-model-checkbox:checked')).map(cb => cb.value);
1814
+ const openRouterModelsText = document.getElementById('e2e-openrouter-models').value.trim();
1815
+ const openRouterModels = openRouterModelsText
1816
+ .split('\n')
1817
+ .map(m => m.trim())
1818
+ .filter(m => m.length > 0);
1819
+ const allModels = [...selectedWanbModels, ...openRouterModels];
1820
+
1821
+ if (allModels.length === 0) {
1822
+ alert('Please select at least one model!');
1823
+ return;
1824
+ }
1825
+
1826
+ const judgeIndex = document.getElementById('e2e-judge').value;
1827
+ if (!judgeIndex) {
1828
+ alert('Please select a judge!');
1829
+ return;
1830
+ }
1831
+
1832
+ const numExamples = document.getElementById('e2e-num-examples').value ? parseInt(document.getElementById('e2e-num-examples').value) : null;
1833
+
1834
+ // Load judge data
1835
+ const judgesResponse = await fetch('/list_judges');
1836
+ const judgesData = await judgesResponse.json();
1837
+ const judge = judgesData.judges[parseInt(judgeIndex)];
1838
+
1839
+ // Hide config panel, show progress panel
1840
+ document.getElementById('e2e-panel').style.display = 'none';
1841
+ document.getElementById('e2e-progress-panel').style.display = 'block';
1842
+
1843
+ const progressText = document.getElementById('e2e-progress-text');
1844
+ const stepLabel = document.getElementById('e2e-step-label');
1845
+ const overallProgress = document.getElementById('e2e-overall-progress');
1846
+
1847
+ progressText.textContent = '';
1848
+
1849
+ try {
1850
+ // === STEP 1: Export Selected Traces ===
1851
+ stepLabel.textContent = 'Step 1/3: Exporting selected traces...';
1852
+ overallProgress.style.width = '10%';
1853
+ progressText.textContent += '📦 Exporting selected traces...\n';
1854
+
1855
+ // Get full trace objects for selected IDs
1856
+ const selectedTraceObjects = allTraces.filter(t => selectedTraces.has(t.id));
1857
+
1858
+ const exportResponse = await fetch('/export_strong_traces', {
1859
+ method: 'POST',
1860
+ headers: { 'Content-Type': 'application/json' },
1861
+ body: JSON.stringify({
1862
+ traces: selectedTraceObjects,
1863
+ nickname: `e2e_export_${Date.now()}`
1864
+ })
1865
+ });
1866
+
1867
+ if (!exportResponse.ok) {
1868
+ throw new Error('Failed to export traces');
1869
+ }
1870
+
1871
+ const exportResult = await exportResponse.json();
1872
+ const exportFilename = exportResult.filename;
1873
+ progressText.textContent += `✓ Exported ${exportResult.count} traces to ${exportFilename}\n\n`;
1874
+ overallProgress.style.width = '20%';
1875
+
1876
+ // === STEP 2: Run Weak Model Inference ===
1877
+ stepLabel.textContent = 'Step 2/3: Running weak model inference...';
1878
+ progressText.textContent += `⚙️ Running inference with ${allModels.length} model(s)...\n`;
1879
+
1880
+ const taskId = `inference_${Date.now()}`;
1881
+ let pollInterval = null;
1882
+
1883
+ const pollProgress = async () => {
1884
+ try {
1885
+ const resp = await fetch(`/progress/${taskId}`);
1886
+ if (resp.ok) {
1887
+ const progress = await resp.json();
1888
+ const percent = (progress.current / progress.total) * 100;
1889
+ // Map inference progress to 20-60% of overall
1890
+ const overallPercent = 20 + (percent * 0.4);
1891
+ overallProgress.style.width = `${overallPercent}%`;
1892
+ }
1893
+ } catch (e) {
1894
+ console.error('Error polling progress:', e);
1895
+ }
1896
+ };
1897
+
1898
+ pollInterval = setInterval(pollProgress, 300);
1899
+
1900
+ const inferenceResponse = await fetch('/run_inference', {
1901
+ method: 'POST',
1902
+ headers: { 'Content-Type': 'application/json' },
1903
+ body: JSON.stringify({
1904
+ models: allModels,
1905
+ strong_export_file: exportFilename,
1906
+ num_examples: numExamples,
1907
+ task_id: taskId
1908
+ })
1909
+ });
1910
+
1911
+ if (pollInterval) clearInterval(pollInterval);
1912
+
1913
+ if (!inferenceResponse.ok) {
1914
+ throw new Error('Failed to run inference');
1915
+ }
1916
+
1917
+ const inferenceResult = await inferenceResponse.json();
1918
+ progressText.textContent += `✓ Generated outputs for ${allModels.length} model(s)\n\n`;
1919
+ overallProgress.style.width = '60%';
1920
+
1921
+ // === STEP 3: Run Evaluations ===
1922
+ stepLabel.textContent = 'Step 3/3: Running evaluations...';
1923
+ progressText.textContent += `📊 Running evaluations with judge: ${judge.name}...\n`;
1924
+
1925
+ const evaluationResults = [];
1926
+
1927
+ // Get list of weak model files that were just generated
1928
+ const weakModelsResponse = await fetch('/list_weak_models');
1929
+ const weakModelsData = await weakModelsResponse.json();
1930
+
1931
+ // Filter to only the models we just ran
1932
+ const weakModelFiles = weakModelsData.files
1933
+ .filter(f => allModels.some(m => f.filename.includes(m.replace('/', '_'))))
1934
+ .map(f => f.filename);
1935
+
1936
+ for (let i = 0; i < weakModelFiles.length; i++) {
1937
+ const modelFile = weakModelFiles[i];
1938
+ const evalTaskId = `eval_${Date.now()}_${i}`;
1939
+
1940
+ progressText.textContent += `\n[${i+1}/${weakModelFiles.length}] Evaluating ${modelFile}...\n`;
1941
+
1942
+ let evalPollInterval = null;
1943
+ const pollEvalProgress = async () => {
1944
+ try {
1945
+ const resp = await fetch(`/progress/${evalTaskId}`);
1946
+ if (resp.ok) {
1947
+ const progress = await resp.json();
1948
+ const percent = (progress.current / progress.total) * 100;
1949
+ // Map eval progress to 60-100% of overall
1950
+ const basePercent = 60 + (i / weakModelFiles.length) * 40;
1951
+ const stepPercent = (percent / 100) * (40 / weakModelFiles.length);
1952
+ overallProgress.style.width = `${basePercent + stepPercent}%`;
1953
+ }
1954
+ } catch (e) {
1955
+ console.error('Error polling eval progress:', e);
1956
+ }
1957
+ };
1958
+
1959
+ evalPollInterval = setInterval(pollEvalProgress, 300);
1960
+
1961
+ const evalResponse = await fetch('/run_evaluation', {
1962
+ method: 'POST',
1963
+ headers: { 'Content-Type': 'application/json' },
1964
+ body: JSON.stringify({
1965
+ model_file: modelFile,
1966
+ judge: judge,
1967
+ task_id: evalTaskId
1968
+ })
1969
+ });
1970
+
1971
+ if (evalPollInterval) clearInterval(evalPollInterval);
1972
+
1973
+ if (evalResponse.ok) {
1974
+ const evalResult = await evalResponse.json();
1975
+ progressText.textContent += ` ✓ Complete: ${evalResult.examples_evaluated} examples evaluated\n`;
1976
+ evaluationResults.push(evalResult);
1977
+ } else {
1978
+ progressText.textContent += ` ✗ Error evaluating ${modelFile}\n`;
1979
+ }
1980
+ }
1981
+
1982
+ overallProgress.style.width = '100%';
1983
+ stepLabel.textContent = 'Complete!';
1984
+ progressText.textContent += `\n✅ All evaluations complete!\n`;
1985
+
1986
+ // Show results
1987
+ document.getElementById('e2e-results').style.display = 'block';
1988
+ const resultsContent = document.getElementById('e2e-results-content');
1989
+ resultsContent.innerHTML = evaluationResults.map(r => `
1990
+ <div style="margin-bottom: 15px; padding: 15px; background: #1a1a1a; border-radius: 4px; border: 1px solid #2a2a2a;">
1991
+ <div style="font-weight: bold; color: #fff; margin-bottom: 8px;">${r.evaluation_name}</div>
1992
+ <div style="font-size: 12px; color: #888; margin-bottom: 8px;">
1993
+ ${r.examples_evaluated} examples evaluated
1994
+ </div>
1995
+ <a href="${r.weave_url}" target="_blank" style="color: #4a9eff; font-size: 13px;">View in Weave →</a>
1996
+ </div>
1997
+ `).join('');
1998
+
1999
+ } catch (error) {
2000
+ progressText.textContent += `\n\n❌ Error: ${error.message}\n`;
2001
+ stepLabel.textContent = 'Error occurred';
2002
+ }
2003
+ });
1290
2004
  </script>
1291
2005
  </body>
1292
2006
  </html>
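The end-to-end flow in the script above chains existing endpoints rather than adding a new one: it exports the selected traces, runs weak-model inference with a task_id that /progress/<task_id> can report on, then evaluates each generated results file with the chosen judge. A hedged sketch of the same pipeline scripted in Python (the /export_strong_traces and /run_evaluation handlers are not part of this diff, so their payloads below are inferred from the fetch calls above; host, port, models, and judge are assumptions):

# sketch: the Export -> Generate -> Evaluate pipeline driven by the new panel
import time
import requests

BASE = "http://localhost:5000"                     # assumption: default Flask host/port
selected_traces = []                               # fill with trace objects from the viewer

export = requests.post(f"{BASE}/export_strong_traces", json={
    "traces": selected_traces,
    "nickname": f"e2e_export_{int(time.time())}",
}).json()

task_id = f"inference_{int(time.time())}"          # progress polled via GET /progress/<task_id>
requests.post(f"{BASE}/run_inference", json={
    "models": ["meta-llama/llama-3.3-70b-instruct"],   # hypothetical weak model
    "strong_export_file": export["filename"],
    "num_examples": 10,
    "task_id": task_id,
})

judge = requests.get(f"{BASE}/list_judges").json()["judges"][0]
for f in requests.get(f"{BASE}/list_weak_models").json()["files"]:
    requests.post(f"{BASE}/run_evaluation", json={
        "model_file": f["filename"],
        "judge": judge,
        "task_id": f"eval_{int(time.time())}",
    })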
quickdistill-0.1.6.dist-info/METADATA → quickdistill-0.1.7.dist-info/METADATA RENAMED
@@ -1,6 +1,6 @@
1
1
  Metadata-Version: 2.4
2
2
  Name: quickdistill
3
- Version: 0.1.6
3
+ Version: 0.1.7
4
4
  Summary: Fast and easy toolkit for distilling AI models
5
5
  Author-email: Brett Young <bdytx5@umsystem.edu>
6
6
  License: MIT
quickdistill-0.1.7.dist-info/RECORD ADDED
@@ -0,0 +1,17 @@
1
+ quickdistill/__init__.py,sha256=U8mvMbfYKLFegcEA4D-P6AFHvSiHQPXoFn0KKd-xh0A,397
2
+ quickdistill/cli.py,sha256=A8d5GN9NdBS299WyAsJ6-p8ynW3DJnDRHZ-UGH7TXLM,2212
3
+ quickdistill/default_judges.json,sha256=w0TkIniELPPG-Mi3hm7zPW06eq46W1BI_ufWXnkDDDM,1432
4
+ quickdistill/get_traces.py,sha256=mfy9fMiK-CZQN1noZ4DfOwdwP45ntthVDLgh4-u2iNk,4896
5
+ quickdistill/server.py,sha256=0Y0XG-8oYoNZgmo10LPZgtwlHuGqrq0urxE-KabyIvI,36789
6
+ quickdistill/__pycache__/__init__.cpython-310.pyc,sha256=Tbov274p3OjaOuOsQwcW-meATEfkz0mHKmpytksuDJI,603
7
+ quickdistill/__pycache__/cli.cpython-310.pyc,sha256=xtVgJTayQLKS4gE_te7U1Wo8LmkDtPkaa2rnzu8h9fY,2443
8
+ quickdistill/__pycache__/get_traces.cpython-310.pyc,sha256=T7Suxp9vpqYDQJ_3uJvXWemqoLf5tnRC2I0BfHrSiNM,2956
9
+ quickdistill/__pycache__/server.cpython-310.pyc,sha256=_taKWofMtdgfMZzfVsd7PoC4jnuKxEOGzW82YBxqPPc,22051
10
+ quickdistill/default_projects/byyoung3_arena-detailed/traces_data.json,sha256=iz-cBmXBYj0bC3Vn754QTnGuDh6sRvlE_RzSyGXaxbY,15496950
11
+ quickdistill/static/judge_manager.html,sha256=fXteyx_ry4gY166WypBkVGGCqieE88MigqLRLVCKnG8,26887
12
+ quickdistill/static/trace_viewer.html,sha256=kPC4GnxeDPq7jxClRhZBOuS6xmA3RaY-loJDZmKDADE,94426
13
+ quickdistill-0.1.7.dist-info/METADATA,sha256=1pE5fDep0l0kAxhHuT1C_H4CYHIiPLP4n9QraAqI9bM,5084
14
+ quickdistill-0.1.7.dist-info/WHEEL,sha256=wUyA8OaulRlbfwMtmQsvNngGrxQHAvkKcvRmdizlJi0,92
15
+ quickdistill-0.1.7.dist-info/entry_points.txt,sha256=AUUTxnwdD9gRnsOEcTXQTAZIZ_F0aRU7JGstIJ3Xk_o,55
16
+ quickdistill-0.1.7.dist-info/top_level.txt,sha256=ysiMvurJYsE1IhkxmObe-0G8A-GIav40kTh2z6axjxg,13
17
+ quickdistill-0.1.7.dist-info/RECORD,,
quickdistill-0.1.6.dist-info/RECORD DELETED
@@ -1,17 +0,0 @@
1
- quickdistill/__init__.py,sha256=DquS7slegbCcNr33DA4WEy4RnHFUPHbl3tGhOkw8Yzo,397
2
- quickdistill/cli.py,sha256=A8d5GN9NdBS299WyAsJ6-p8ynW3DJnDRHZ-UGH7TXLM,2212
3
- quickdistill/default_judges.json,sha256=w0TkIniELPPG-Mi3hm7zPW06eq46W1BI_ufWXnkDDDM,1432
4
- quickdistill/get_traces.py,sha256=mfy9fMiK-CZQN1noZ4DfOwdwP45ntthVDLgh4-u2iNk,4896
5
- quickdistill/server.py,sha256=EXifo8rF8wU_5mhX7ZnpYTi3iRus9XL9nuBdR7FFBRg,27761
6
- quickdistill/__pycache__/__init__.cpython-310.pyc,sha256=jC6GheK56FqSP9ZP_kHookaiqaKcfY82xOlo2Qn8sag,603
7
- quickdistill/__pycache__/cli.cpython-310.pyc,sha256=xtVgJTayQLKS4gE_te7U1Wo8LmkDtPkaa2rnzu8h9fY,2443
8
- quickdistill/__pycache__/get_traces.cpython-310.pyc,sha256=T7Suxp9vpqYDQJ_3uJvXWemqoLf5tnRC2I0BfHrSiNM,2956
9
- quickdistill/__pycache__/server.cpython-310.pyc,sha256=DXP_J3nP4lPDWwB4fiXbRTSTzM-A1ymWqX593-gDMuA,17503
10
- quickdistill/default_projects/byyoung3_arena-detailed/traces_data.json,sha256=iz-cBmXBYj0bC3Vn754QTnGuDh6sRvlE_RzSyGXaxbY,15496950
11
- quickdistill/static/judge_manager.html,sha256=YzzMWpNHyVc7Lyu8Cn55hWzAvYJ1WscXEufQLQ4jR18,17410
12
- quickdistill/static/trace_viewer.html,sha256=MoXxp_FroAbs8PLzFV7qNkxRI-IY3GGkQWDOHnZM_j8,56257
13
- quickdistill-0.1.6.dist-info/METADATA,sha256=-Ku5g1GGf8jo7rCZb_vcwy5AB968Zv8INGymUPZDNAg,5084
14
- quickdistill-0.1.6.dist-info/WHEEL,sha256=wUyA8OaulRlbfwMtmQsvNngGrxQHAvkKcvRmdizlJi0,92
15
- quickdistill-0.1.6.dist-info/entry_points.txt,sha256=AUUTxnwdD9gRnsOEcTXQTAZIZ_F0aRU7JGstIJ3Xk_o,55
16
- quickdistill-0.1.6.dist-info/top_level.txt,sha256=ysiMvurJYsE1IhkxmObe-0G8A-GIav40kTh2z6axjxg,13
17
- quickdistill-0.1.6.dist-info/RECORD,,