quickdistill 0.1.6__py3-none-any.whl → 0.1.7__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- quickdistill/__init__.py +1 -1
- quickdistill/__pycache__/__init__.cpython-310.pyc +0 -0
- quickdistill/__pycache__/server.cpython-310.pyc +0 -0
- quickdistill/server.py +257 -5
- quickdistill/static/judge_manager.html +181 -0
- quickdistill/static/trace_viewer.html +714 -0
- {quickdistill-0.1.6.dist-info → quickdistill-0.1.7.dist-info}/METADATA +1 -1
- quickdistill-0.1.7.dist-info/RECORD +17 -0
- quickdistill-0.1.6.dist-info/RECORD +0 -17
- {quickdistill-0.1.6.dist-info → quickdistill-0.1.7.dist-info}/WHEEL +0 -0
- {quickdistill-0.1.6.dist-info → quickdistill-0.1.7.dist-info}/entry_points.txt +0 -0
- {quickdistill-0.1.6.dist-info → quickdistill-0.1.7.dist-info}/top_level.txt +0 -0
quickdistill/__init__.py
CHANGED
|
Binary file
|
|
Binary file
|
quickdistill/server.py
CHANGED
|
@@ -3,7 +3,6 @@ import json
|
|
|
3
3
|
import openai
|
|
4
4
|
import weave
|
|
5
5
|
import shutil
|
|
6
|
-
import threading
|
|
7
6
|
from flask import Flask, request, jsonify, send_from_directory
|
|
8
7
|
from flask_cors import CORS
|
|
9
8
|
from llmasajudge import LLMAsAJudge
|
|
@@ -34,8 +33,25 @@ CORS(app)
|
|
|
34
33
|
# Progress tracking for long-running operations
|
|
35
34
|
progress_state = {}
|
|
36
35
|
|
|
37
|
-
#
|
|
38
|
-
|
|
36
|
+
# Location of the persisted settings and the fallback values used for any
# key the settings file does not define.
SETTINGS_FILE = DATA_DIR / 'settings.json'
DEFAULT_SETTINGS = dict(
    inference_project='wandb_fc/quickstart_playground',
    evaluation_project='wandb_inference',
)
|
|
42
|
+
|
|
43
|
+
def load_settings():
    """Return the effective settings: defaults overlaid with saved values.

    When the settings file does not exist, a fresh copy of
    ``DEFAULT_SETTINGS`` is returned. Otherwise the file is parsed as JSON
    and its keys take precedence over the defaults.
    """
    if not SETTINGS_FILE.exists():
        return DEFAULT_SETTINGS.copy()

    with open(SETTINGS_FILE, 'r') as handle:
        stored = json.load(handle)
    return {**DEFAULT_SETTINGS, **stored}
|
|
48
|
+
|
|
49
|
+
def save_settings(settings):
    """Write *settings* to the settings file as JSON indented by two spaces."""
    with open(SETTINGS_FILE, 'w') as out_file:
        json.dump(settings, out_file, indent=2)
|
|
52
|
+
|
|
53
|
+
# Settings are read once, when the module is imported; PROJECT is the
# evaluation project name taken from that snapshot.
settings = load_settings()
PROJECT = settings['evaluation_project']
|
|
39
55
|
|
|
40
56
|
weave.init(PROJECT)
|
|
41
57
|
|
|
@@ -46,7 +62,7 @@ def create_client():
|
|
|
46
62
|
api_key=os.getenv("WANDB_API_KEY"),
|
|
47
63
|
project=PROJECT,
|
|
48
64
|
default_headers={
|
|
49
|
-
"OpenAI-Project":
|
|
65
|
+
"OpenAI-Project": settings['inference_project']
|
|
50
66
|
}
|
|
51
67
|
)
|
|
52
68
|
|
|
@@ -175,8 +191,9 @@ def run_inference_endpoint():
|
|
|
175
191
|
if not traces:
|
|
176
192
|
return jsonify({'error': 'No traces in export file'}), 400
|
|
177
193
|
|
|
178
|
-
# Limit traces to num_examples
|
|
194
|
+
# Limit traces to num_examples (convert to int if needed)
|
|
179
195
|
if num_examples:
|
|
196
|
+
num_examples = int(num_examples)
|
|
180
197
|
traces = traces[:num_examples]
|
|
181
198
|
|
|
182
199
|
output_files = []
|
|
@@ -284,6 +301,241 @@ def get_progress(task_id):
|
|
|
284
301
|
return jsonify({'error': 'Task not found'}), 404
|
|
285
302
|
|
|
286
303
|
|
|
304
|
+
@app.route('/settings', methods=['GET'])
def get_settings():
    """Return the in-memory settings dict as a JSON response."""
    current = settings
    return jsonify(current)
|
|
308
|
+
|
|
309
|
+
|
|
310
|
+
@app.route('/settings', methods=['POST'])
def update_settings():
    """Merge the posted JSON object into the settings and persist them.

    Returns:
        A JSON payload with ``status`` and the updated ``settings``, or a
        400 response with an ``error`` key when the request body is not a
        JSON object.
    """
    global settings
    # silent=True yields None for a missing or malformed body instead of
    # raising, so bad requests get a clean 400 rather than failing inside
    # settings.update().
    data = request.get_json(silent=True)
    if not isinstance(data, dict):
        return jsonify({'error': 'Request body must be a JSON object'}), 400

    settings.update(data)
    save_settings(settings)
    # NOTE(review): PROJECT is derived from the settings at import time, so a
    # changed 'evaluation_project' presumably needs a restart to take effect
    # for anything that reads PROJECT - confirm.
    return jsonify({'status': 'success', 'settings': settings})
|
|
318
|
+
|
|
319
|
+
|
|
320
|
+
def _strip_markdown_fence(text):
    """Return *text* without a surrounding Markdown code fence.

    Handles an opening fence with an optional language tag (```json) and a
    closing fence, whether or not the closing fence sits on its own line.
    """
    text = text.strip()
    if text.startswith('```'):
        # Drop the whole opening fence line, including any language tag.
        text = text.split('\n', 1)[1] if '\n' in text else text[3:]
    if text.endswith('```'):
        # Trim only the three backticks. Splitting on the last newline would
        # also throw away JSON that shares its final line with the fence.
        text = text[:-3]
    return text.strip()


def _parse_judge_score(response, return_type):
    """Turn a raw judge reply into a bool ('boolean') or a number ('scalar').

    Falls back to ``False`` for boolean judges and ``0`` otherwise when the
    reply is not parseable JSON or the return type is unsupported.
    """
    cleaned = _strip_markdown_fence(response)
    try:
        parsed = json.loads(cleaned)

        if return_type == 'boolean':
            val = parsed.get('correct', parsed.get('result', parsed.get('value', False)))
            return bool(val)
        if return_type == 'scalar':
            val = parsed.get('score', parsed.get('scores', parsed.get('value', 0)))
            return float(val) if isinstance(val, (int, float)) else 0

        print(f"Unsupported return type: {return_type}")
        return 0
    except (ValueError, AttributeError, TypeError):
        # ValueError: invalid JSON; AttributeError/TypeError: the JSON was
        # valid but not an object (e.g. a bare number or a list).
        print(f"Failed to parse judge response as JSON: {cleaned}")
        return False if return_type == 'boolean' else 0


@app.route('/test_judge', methods=['POST'])
def test_judge():
    """Run a judge over a few stored samples and expose its raw and parsed output.

    Expects a JSON body with ``judge`` (the judge definition), the
    ``weak_model_file`` name relative to ``DATA_DIR`` and an optional
    ``num_samples`` (default 5).

    Returns:
        On success, a JSON payload with ``status``, ``judge_name``,
        ``num_samples`` and ``samples``; each sample carries the question,
        both model outputs, the rendered judge prompt, the raw judge reply
        and the parsed score. Invalid input yields a 400 (or 404 for a
        missing file) with an ``error`` key.
    """
    data = request.get_json(silent=True)
    if not isinstance(data, dict):
        data = {}
    judge = data.get('judge')
    weak_model_file = data.get('weak_model_file')

    if not isinstance(judge, dict) or not judge or not weak_model_file:
        return jsonify({'error': 'Missing judge or weak_model_file'}), 400

    # Clients may send the count as a string; coerce it the same way
    # num_examples is coerced in the inference endpoint.
    try:
        num_samples = int(data.get('num_samples', 5))
    except (TypeError, ValueError):
        return jsonify({'error': 'num_samples must be an integer'}), 400

    # weak_model_file is client-controlled: resolve it and refuse anything
    # that escapes DATA_DIR (absolute paths, "../" segments).
    try:
        data_root = DATA_DIR.resolve()
        model_path = (data_root / weak_model_file).resolve()
    except TypeError:
        return jsonify({'error': 'Invalid weak_model_file'}), 400
    if data_root not in model_path.parents:
        return jsonify({'error': 'Invalid weak_model_file'}), 400

    try:
        with open(model_path, 'r') as f:
            file_data = json.load(f)
    except FileNotFoundError:
        return jsonify({'error': f'File not found: {weak_model_file}'}), 404
    except ValueError:
        return jsonify({'error': f'Invalid JSON in {weak_model_file}'}), 400

    # Result files come either as {"results": [...]} or as a bare list.
    if isinstance(file_data, dict) and 'results' in file_data:
        results = file_data['results']
    else:
        results = file_data
    if not isinstance(results, list):
        return jsonify({'error': 'Unexpected results format'}), 400

    samples_to_test = results[:max(0, num_samples)]

    # Only LLM judges can be exercised here; any other judge type produces
    # an empty sample list instead of failing part-way through the loop.
    if judge.get('type') != 'llm':
        samples_to_test = []

    return_type = judge.get('returnType', 'scalar')
    test_results = []

    for example in samples_to_test:
        # Skip examples whose weak-model run failed or produced nothing.
        if example.get('error') or not example.get('output'):
            continue

        strong_output = example.get('strong_model_output', '')
        weak_output = example.get('output', '')

        # The question is taken from the first message of the conversation.
        question = ""
        messages = example.get('messages', [])
        if messages:
            question = messages[0].get('content', '')

        # Fill in the placeholders the judge prompt actually uses.
        prompt = judge['prompt']
        substitutions = (
            ('{question}', question),
            ('{strong_output}', strong_output),
            ('{weak_output}', weak_output),
        )
        for placeholder, value in substitutions:
            if placeholder in prompt:
                prompt = prompt.replace(placeholder, value or '')

        # A list lets the parser callback hand the untouched reply back to us.
        captured_raw = []

        def score_parser(response: str):
            """Record the raw judge reply, then parse it for LLMAsAJudge."""
            captured_raw.append(response)
            return _parse_judge_score(response, return_type)

        try:
            judge_instance = LLMAsAJudge(
                models=[judge['model']],
                use_fully_custom_prompt=True,
                output_parser=score_parser,
                return_type=return_type if return_type else None
            )
            result = judge_instance.judge(prompt=prompt)

            raw_text = captured_raw[0] if captured_raw else "No response captured"

            if return_type == 'boolean':
                parsed_scores = {'correct': result.get('correct', False)}
            else:
                # 'scalar' and any unsupported return type are both reported
                # as a score.
                parsed_scores = {'score': result.get('scores', result.get('correct', 0))}

        except Exception as e:
            # Report the failure per sample so the remaining samples still run.
            raw_text = f"Error: {str(e)}"
            parsed_scores = {'error': str(e)}

        test_results.append({
            'question': question,
            'strong_output': strong_output,
            'weak_output': weak_output,
            'judge_prompt': prompt,
            'raw_response': raw_text,
            'parsed_scores': parsed_scores
        })

    return jsonify({
        'status': 'success',
        'judge_name': judge.get('name'),
        'num_samples': len(test_results),
        'samples': test_results
    })
|
|
465
|
+
|
|
466
|
+
|
|
467
|
+
@app.route('/generate_judge_prompt', methods=['POST'])
def generate_judge_prompt():
    """Ask a model to write a judge prompt from stored sample data.

    Expects a JSON body with ``weak_model_file`` (name relative to
    ``DATA_DIR``), ``meta_prompt`` (whose ``{SAMPLES}`` placeholder is
    replaced with the formatted samples), and optional ``num_samples``
    (default 3) and ``model`` (default ``'openai/gpt-5'``).

    Returns:
        On success, a JSON payload with ``status``, ``generated_prompt`` and
        ``num_samples_used``. Invalid input yields a 400 (or 404 for a
        missing file) and a failed model call a 500, each with an ``error``
        key.
    """
    data = request.get_json(silent=True)
    if not isinstance(data, dict):
        data = {}
    weak_model_file = data.get('weak_model_file')
    model = data.get('model', 'openai/gpt-5')
    meta_prompt = data.get('meta_prompt')

    if not weak_model_file or not meta_prompt:
        return jsonify({'error': 'Missing weak_model_file or meta_prompt'}), 400

    # Clients may send the count as a string; coerce it the same way
    # num_examples is coerced in the inference endpoint.
    try:
        num_samples = int(data.get('num_samples', 3))
    except (TypeError, ValueError):
        return jsonify({'error': 'num_samples must be an integer'}), 400

    # weak_model_file is client-controlled: resolve it and refuse anything
    # that escapes DATA_DIR (absolute paths, "../" segments).
    try:
        data_root = DATA_DIR.resolve()
        model_path = (data_root / weak_model_file).resolve()
    except TypeError:
        return jsonify({'error': 'Invalid weak_model_file'}), 400
    if data_root not in model_path.parents:
        return jsonify({'error': 'Invalid weak_model_file'}), 400

    try:
        with open(model_path, 'r') as f:
            file_data = json.load(f)
    except FileNotFoundError:
        return jsonify({'error': f'File not found: {weak_model_file}'}), 404
    except ValueError:
        return jsonify({'error': f'Invalid JSON in {weak_model_file}'}), 400

    # Result files come either as {"results": [...]} or as a bare list.
    if isinstance(file_data, dict) and 'results' in file_data:
        results = file_data['results']
    else:
        results = file_data
    if not isinstance(results, list):
        return jsonify({'error': 'Unexpected results format'}), 400

    samples_to_use = results[:max(0, num_samples)]

    # Format the usable samples for the meta-prompt.
    samples_text = []
    for example in samples_to_use:
        # Skip examples whose weak-model run failed or produced nothing.
        if example.get('error') or not example.get('output'):
            continue

        strong_output = example.get('strong_model_output', '')
        weak_output = example.get('output', '')

        # The question is taken from the first message of the conversation.
        question = ""
        messages = example.get('messages', [])
        if messages:
            question = messages[0].get('content', '')

        # Number by position among the samples actually kept, so skipped
        # entries do not leave gaps ("Sample 1", "Sample 3") in the prompt.
        samples_text.append(
            f"Sample {len(samples_text) + 1}:\n"
            f"Question: {question}\n"
            f"Strong Model Output: {strong_output}\n"
            f"Weak Model Output: {weak_output}\n"
            "---"
        )

    samples_formatted = "\n\n".join(samples_text)

    # Replace {SAMPLES} placeholder in meta-prompt
    final_prompt = meta_prompt.replace('{SAMPLES}', samples_formatted)

    # Call OpenRouter to generate the prompt
    try:
        client = create_openrouter_client()
        response = client.chat.completions.create(
            model=model,
            messages=[{"role": "user", "content": final_prompt}]
        )
        # The message content can be None (e.g. an empty completion).
        generated_prompt = (response.choices[0].message.content or '').strip()

        return jsonify({
            'status': 'success',
            'generated_prompt': generated_prompt,
            'num_samples_used': len(samples_text)
        })

    except Exception as e:
        return jsonify({'error': f'Failed to generate prompt: {str(e)}'}), 500
|
|
537
|
+
|
|
538
|
+
|
|
287
539
|
@app.route('/list_weak_models', methods=['GET'])
|
|
288
540
|
def list_weak_models():
|
|
289
541
|
"""List available weak model result files with metadata"""
|
|
@@ -162,6 +162,13 @@
|
|
|
162
162
|
<div class="container">
|
|
163
163
|
<h1>Judge Manager</h1>
|
|
164
164
|
|
|
165
|
+
<!-- Prompt Generator Button -->
|
|
166
|
+
<div style="margin-bottom: 20px; display: none;">
|
|
167
|
+
<button onclick="openPromptGenerator()" style="padding: 10px 20px; background: #6a4a7e; color: white; border: none; border-radius: 4px; cursor: pointer; font-size: 14px;">
|
|
168
|
+
✨ Generate Judge Prompt with AI
|
|
169
|
+
</button>
|
|
170
|
+
</div>
|
|
171
|
+
|
|
165
172
|
<!-- Create/Edit Judge Section -->
|
|
166
173
|
<div class="section">
|
|
167
174
|
<h2 id="form-title">Create New Judge</h2>
|
|
@@ -217,6 +224,73 @@
|
|
|
217
224
|
</div>
|
|
218
225
|
</div>
|
|
219
226
|
|
|
227
|
+
<!-- Prompt Generator Panel -->
|
|
228
|
+
<div id="prompt-generator-panel" style="display: none; position: fixed; top: 0; left: 0; width: 100%; height: 100%; background: rgba(0,0,0,0.9); z-index: 1000; padding: 40px; overflow-y: auto;">
|
|
229
|
+
<div style="max-width: 1200px; margin: 0 auto; background: #1a1a1a; border-radius: 8px; padding: 30px; border: 1px solid #3a2a4a;">
|
|
230
|
+
<h2 style="color: #fff; margin-bottom: 10px;">AI-Powered Judge Prompt Generator</h2>
|
|
231
|
+
<p style="color: #888; font-size: 13px; margin-bottom: 25px;">
|
|
232
|
+
Generate specialized judge prompts by showing sample data to an AI model
|
|
233
|
+
</p>
|
|
234
|
+
|
|
235
|
+
<!-- Configuration -->
|
|
236
|
+
<div style="display: grid; grid-template-columns: 1fr 1fr; gap: 20px; margin-bottom: 20px;">
|
|
237
|
+
<div>
|
|
238
|
+
<label style="display: block; color: #aaa; margin-bottom: 8px; font-size: 14px;">Weak Model Dataset:</label>
|
|
239
|
+
<select id="gen-weak-model-select" style="width: 100%; padding: 10px; background: #2a2a2a; color: #fff; border: 1px solid #3a3a3a; border-radius: 4px; font-size: 14px;">
|
|
240
|
+
<option value="">Loading weak model files...</option>
|
|
241
|
+
</select>
|
|
242
|
+
</div>
|
|
243
|
+
|
|
244
|
+
<div>
|
|
245
|
+
<label style="display: block; color: #aaa; margin-bottom: 8px; font-size: 14px;">Number of Samples:</label>
|
|
246
|
+
<input type="number" id="gen-num-samples" value="3" min="1" max="10"
|
|
247
|
+
style="width: 100%; padding: 10px; background: #2a2a2a; color: #fff; border: 1px solid #3a3a3a; border-radius: 4px; font-size: 14px;">
|
|
248
|
+
<div style="color: #666; font-size: 12px; margin-top: 5px;">Max: 10 (for context limits)</div>
|
|
249
|
+
</div>
|
|
250
|
+
</div>
|
|
251
|
+
|
|
252
|
+
<div style="margin-bottom: 20px;">
|
|
253
|
+
<label style="display: block; color: #aaa; margin-bottom: 8px; font-size: 14px;">Generation Model:</label>
|
|
254
|
+
<input type="text" id="gen-model" value="openai/gpt-5"
|
|
255
|
+
style="width: 100%; padding: 10px; background: #2a2a2a; color: #fff; border: 1px solid #3a3a3a; border-radius: 4px; font-size: 14px;"
|
|
256
|
+
placeholder="e.g., openai/gpt-5, anthropic/claude-3.5-sonnet">
|
|
257
|
+
<div style="color: #666; font-size: 12px; margin-top: 5px;">OpenRouter model to use for generating the prompt</div>
|
|
258
|
+
</div>
|
|
259
|
+
|
|
260
|
+
<!-- Meta-Prompt -->
|
|
261
|
+
<div style="margin-bottom: 25px;">
|
|
262
|
+
<label style="display: block; color: #aaa; margin-bottom: 8px; font-size: 14px;">Meta-Prompt (edit as needed):</label>
|
|
263
|
+
<textarea id="gen-meta-prompt"
|
|
264
|
+
style="width: 100%; min-height: 250px; padding: 10px; background: #2a2a2a; color: #fff; border: 1px solid #3a3a3a; border-radius: 4px; font-size: 13px; font-family: 'Courier New', monospace; resize: vertical;"></textarea>
|
|
265
|
+
<div style="color: #666; font-size: 12px; margin-top: 5px;">
|
|
266
|
+
This prompt will be sent to the generation model along with sample data
|
|
267
|
+
</div>
|
|
268
|
+
</div>
|
|
269
|
+
|
|
270
|
+
<!-- Actions -->
|
|
271
|
+
<div style="display: flex; gap: 10px; margin-bottom: 25px;">
|
|
272
|
+
<button onclick="generatePrompt()" style="padding: 10px 20px; background: #6a4a7e; color: white; border: none; border-radius: 4px; cursor: pointer; font-weight: 500;">
|
|
273
|
+
Generate Prompt
|
|
274
|
+
</button>
|
|
275
|
+
<button onclick="closePromptGenerator()" style="padding: 10px 20px; background: #5a2a2a; color: white; border: none; border-radius: 4px; cursor: pointer;">
|
|
276
|
+
Close
|
|
277
|
+
</button>
|
|
278
|
+
</div>
|
|
279
|
+
|
|
280
|
+
<!-- Generated Output -->
|
|
281
|
+
<div id="gen-output-section" style="display: none;">
|
|
282
|
+
<h3 style="color: #4a9eff; margin-bottom: 15px;">Generated Judge Prompt</h3>
|
|
283
|
+
<textarea id="gen-output" readonly
|
|
284
|
+
style="width: 100%; min-height: 300px; padding: 15px; background: #0f0f0f; color: #4a9eff; border: 1px solid #4a9eff; border-radius: 4px; font-size: 13px; font-family: 'Courier New', monospace; resize: vertical;"></textarea>
|
|
285
|
+
<div style="margin-top: 10px;">
|
|
286
|
+
<button onclick="copyGeneratedPrompt()" style="padding: 8px 16px; background: #2a7c4a; color: white; border: none; border-radius: 4px; cursor: pointer;">
|
|
287
|
+
Copy to Clipboard
|
|
288
|
+
</button>
|
|
289
|
+
</div>
|
|
290
|
+
</div>
|
|
291
|
+
</div>
|
|
292
|
+
</div>
|
|
293
|
+
|
|
220
294
|
<script>
|
|
221
295
|
let judges = [];
|
|
222
296
|
let editingIndex = null;
|
|
@@ -489,6 +563,113 @@ Respond in JSON format: {'correct': true} or {'correct': false}`
|
|
|
489
563
|
console.log('Not changing prompt - user has edited it');
|
|
490
564
|
}
|
|
491
565
|
});
|
|
566
|
+
|
|
567
|
+
// === PROMPT GENERATOR ===
|
|
568
|
+
|
|
569
|
+
// Default meta-prompt shown in the generator panel. {SAMPLES} is replaced
// server-side with the formatted sample data. The JSON examples use double
// quotes because the judge's reply is parsed as strict JSON, which rejects
// single-quoted keys.
const DEFAULT_META_PROMPT = `You are an expert at creating evaluation prompts for judging AI model outputs. I'm building a specialized judge prompt to evaluate the quality/similarity of weak model outputs compared to strong reference model outputs.

I will show you some sample data below. Each sample contains:
- A question/input
- The strong reference model's output (ground truth)
- The weak model's output (what we're evaluating)

Your task: Create a specialized, detailed judge prompt that can be used to systematically evaluate the delta/difference between these outputs. The prompt should:
1. Be specific to the patterns you see in this data
2. Include clear evaluation criteria
3. Be written in second-person ("You are...")
4. Include the placeholders {question}, {strong_output}, and {weak_output}
5. Specify the exact JSON format to return (either {"score": number} for scalar or {"correct": boolean} for boolean)

Sample Data:
{SAMPLES}

Based on these samples, create a specialized judge prompt that would effectively evaluate this type of data. Return ONLY the judge prompt text, nothing else.`;
|
|
587
|
+
|
|
588
|
+
/**
 * Open the prompt generator overlay.
 *
 * Fills the dataset dropdown from /list_weak_models, resets the meta-prompt
 * textarea to DEFAULT_META_PROMPT and hides any previously generated output.
 */
async function openPromptGenerator() {
    const select = document.getElementById('gen-weak-model-select');

    // Replace the dropdown contents with a single placeholder entry.
    const showPlaceholder = (label) => {
        select.innerHTML = '';
        select.add(new Option(label, ''));
    };

    try {
        const response = await fetch('/list_weak_models');
        const data = await response.json();

        if (data.files && data.files.length > 0) {
            // Build options through the DOM rather than an HTML string, so
            // file or model names containing quotes or markup cannot break
            // out of the <option> element.
            select.innerHTML = '';
            for (const file of data.files) {
                select.add(new Option(file.weak_model || file.filename, file.filename));
            }
        } else {
            showPlaceholder('No weak model files available');
        }
    } catch (error) {
        console.error('Error loading weak models:', error);
        // Do not leave the "Loading..." placeholder up after a failure.
        showPlaceholder('Failed to load weak model files');
    }

    // Set default meta-prompt
    document.getElementById('gen-meta-prompt').value = DEFAULT_META_PROMPT;

    // Show panel
    document.getElementById('prompt-generator-panel').style.display = 'block';
    document.getElementById('gen-output-section').style.display = 'none';
}
|
|
613
|
+
|
|
614
|
+
/** Hide the prompt generator overlay. */
function closePromptGenerator() {
    const panel = document.getElementById('prompt-generator-panel');
    panel.style.display = 'none';
}
|
|
617
|
+
|
|
618
|
+
/**
 * Validate the generator form, POST it to /generate_judge_prompt and show
 * the generated prompt in the output section.
 */
async function generatePrompt() {
    const weakModelFile = document.getElementById('gen-weak-model-select').value;
    const requestedSamples = parseInt(document.getElementById('gen-num-samples').value, 10) || 3;
    // Keep the count inside the 1-10 range the input itself advertises;
    // typed values are not constrained by the min/max attributes.
    const numSamples = Math.min(10, Math.max(1, requestedSamples));
    const model = document.getElementById('gen-model').value.trim();
    const metaPrompt = document.getElementById('gen-meta-prompt').value.trim();

    if (!weakModelFile) {
        alert('Please select a weak model dataset');
        return;
    }

    if (!model) {
        alert('Please enter a generation model');
        return;
    }

    if (!metaPrompt) {
        alert('Please enter a meta-prompt');
        return;
    }

    try {
        const response = await fetch('/generate_judge_prompt', {
            method: 'POST',
            headers: { 'Content-Type': 'application/json' },
            body: JSON.stringify({
                weak_model_file: weakModelFile,
                num_samples: numSamples,
                model: model,
                meta_prompt: metaPrompt
            })
        });

        if (!response.ok) {
            // The endpoint reports failures as {"error": "..."}; surface that
            // message when present instead of a generic one.
            let detail = 'Failed to generate prompt';
            try {
                const errorBody = await response.json();
                if (errorBody && errorBody.error) {
                    detail = errorBody.error;
                }
            } catch (parseError) {
                // Non-JSON error body: keep the generic message.
            }
            throw new Error(detail);
        }

        const result = await response.json();

        // Display generated prompt
        document.getElementById('gen-output').value = result.generated_prompt;
        document.getElementById('gen-output-section').style.display = 'block';

    } catch (error) {
        alert('Error generating prompt: ' + error.message);
        console.error('Generation error:', error);
    }
}
|
|
666
|
+
|
|
667
|
+
/**
 * Copy the generated prompt to the clipboard and tell the user whether it
 * worked.
 *
 * Uses the asynchronous Clipboard API when it is available and falls back
 * to the legacy execCommand('copy') path otherwise.
 */
async function copyGeneratedPrompt() {
    const output = document.getElementById('gen-output');
    let copied = false;

    if (navigator.clipboard && navigator.clipboard.writeText) {
        try {
            await navigator.clipboard.writeText(output.value);
            copied = true;
        } catch (error) {
            // Permission denied or insecure context: try the legacy path.
            console.error('Clipboard API failed:', error);
        }
    }

    if (!copied) {
        output.select();
        // execCommand returns false when the copy did not happen.
        copied = document.execCommand('copy');
    }

    if (copied) {
        alert('Prompt copied to clipboard!');
    } else {
        alert('Could not copy the prompt. Please copy it manually.');
    }
}
|
|
492
673
|
</script>
|
|
493
674
|
</body>
|
|
494
675
|
</html>
|
|
@@ -316,6 +316,22 @@
|
|
|
316
316
|
Manage Judges
|
|
317
317
|
</a>
|
|
318
318
|
|
|
319
|
+
<button id="open-test-judge-btn" style="padding: 8px 16px; background: #6a4a7e; color: white; border: none; border-radius: 4px; cursor: pointer;">
|
|
320
|
+
Test Judges
|
|
321
|
+
</button>
|
|
322
|
+
|
|
323
|
+
<button id="open-settings-btn" style="padding: 8px 16px; background: #5a5a5a; color: white; border: none; border-radius: 4px; cursor: pointer;">
|
|
324
|
+
Settings
|
|
325
|
+
</button>
|
|
326
|
+
|
|
327
|
+
<div style="margin: 20px 0; padding: 15px; background: #2a1a2a; border-radius: 8px; border: 1px solid #4a2a4a;">
|
|
328
|
+
<div style="color: #aaa; font-size: 13px; margin-bottom: 10px;">Automatic Workflow:</div>
|
|
329
|
+
<button id="open-e2e-btn" style="padding: 10px 20px; background: #7a4a9e; color: white; border: none; border-radius: 4px; cursor: pointer; font-weight: 500;">
|
|
330
|
+
⚡ Run End-to-End Test
|
|
331
|
+
</button>
|
|
332
|
+
<div style="color: #666; font-size: 11px; margin-top: 8px;">Export → Generate → Evaluate (all in one)</div>
|
|
333
|
+
</div>
|
|
334
|
+
|
|
319
335
|
<div class="stats">
|
|
320
336
|
<div>Total: <span id="total-count">0</span></div>
|
|
321
337
|
<div>Shown: <span id="shown-count">0</span></div>
|
|
@@ -427,6 +443,193 @@
|
|
|
427
443
|
</div>
|
|
428
444
|
</div>
|
|
429
445
|
</div>
|
|
446
|
+
|
|
447
|
+
<!-- Settings Panel -->
|
|
448
|
+
<div id="settings-panel" style="display: none; position: fixed; top: 0; left: 0; width: 100%; height: 100%; background: rgba(0,0,0,0.8); z-index: 1000; padding: 40px;">
|
|
449
|
+
<div style="max-width: 600px; margin: 0 auto; background: #1a1a1a; border-radius: 8px; padding: 30px; border: 1px solid #2a2a2a;">
|
|
450
|
+
<h2 style="color: #fff; margin-bottom: 20px;">Settings</h2>
|
|
451
|
+
|
|
452
|
+
<div style="margin-bottom: 20px;">
|
|
453
|
+
<label style="display: block; color: #aaa; margin-bottom: 8px; font-size: 14px;">W&B Inference Project</label>
|
|
454
|
+
<input type="text" id="settings-inference-project" placeholder="e.g., wandb_fc/quickstart_playground"
|
|
455
|
+
style="width: 100%; padding: 10px; background: #2a2a2a; color: #fff; border: 1px solid #3a3a3a; border-radius: 4px; font-size: 14px;">
|
|
456
|
+
<div style="color: #666; font-size: 12px; margin-top: 5px;">Used for running weak model inference</div>
|
|
457
|
+
</div>
|
|
458
|
+
|
|
459
|
+
<div style="margin-bottom: 30px;">
|
|
460
|
+
<label style="display: block; color: #aaa; margin-bottom: 8px; font-size: 14px;">W&B Evaluation Project</label>
|
|
461
|
+
<input type="text" id="settings-evaluation-project" placeholder="e.g., wandb_inference"
|
|
462
|
+
style="width: 100%; padding: 10px; background: #2a2a2a; color: #fff; border: 1px solid #3a3a3a; border-radius: 4px; font-size: 14px;">
|
|
463
|
+
<div style="color: #666; font-size: 12px; margin-top: 5px;">Used for logging evaluation results with Weave</div>
|
|
464
|
+
</div>
|
|
465
|
+
|
|
466
|
+
<div style="display: flex; gap: 10px; justify-content: flex-end;">
|
|
467
|
+
<button id="close-settings-btn" style="padding: 10px 20px; background: #5a2a2a; color: white; border: none; border-radius: 4px; cursor: pointer;">
|
|
468
|
+
Cancel
|
|
469
|
+
</button>
|
|
470
|
+
<button id="save-settings-btn" style="padding: 10px 20px; background: #2a7c4a; color: white; border: none; border-radius: 4px; cursor: pointer;">
|
|
471
|
+
Save Settings
|
|
472
|
+
</button>
|
|
473
|
+
</div>
|
|
474
|
+
</div>
|
|
475
|
+
</div>
|
|
476
|
+
|
|
477
|
+
<!-- Test Judges Panel -->
|
|
478
|
+
<div id="test-judge-panel" style="display: none; position: fixed; top: 0; left: 0; width: 100%; height: 100%; background: rgba(0,0,0,0.8); z-index: 1000; padding: 40px; overflow-y: auto;">
|
|
479
|
+
<div style="max-width: 1000px; margin: 0 auto; background: #1a1a1a; border-radius: 8px; padding: 30px; border: 1px solid #3a2a4a;">
|
|
480
|
+
<h2 style="color: #fff; margin-bottom: 10px;">Test Judge</h2>
|
|
481
|
+
<p style="color: #888; font-size: 13px; margin-bottom: 25px;">
|
|
482
|
+
Test your judge on sample data to see exactly what inputs/outputs it receives
|
|
483
|
+
</p>
|
|
484
|
+
|
|
485
|
+
<!-- Configuration -->
|
|
486
|
+
<div style="display: grid; grid-template-columns: 1fr 1fr; gap: 20px; margin-bottom: 25px;">
|
|
487
|
+
<div>
|
|
488
|
+
<label style="display: block; color: #aaa; margin-bottom: 8px; font-size: 14px;">Select Judge:</label>
|
|
489
|
+
<select id="test-judge-select" style="width: 100%; padding: 10px; background: #2a2a2a; color: #fff; border: 1px solid #3a3a3a; border-radius: 4px; font-size: 14px;">
|
|
490
|
+
<option value="">Loading judges...</option>
|
|
491
|
+
</select>
|
|
492
|
+
</div>
|
|
493
|
+
|
|
494
|
+
<div>
|
|
495
|
+
<label style="display: block; color: #aaa; margin-bottom: 8px; font-size: 14px;">Weak Model Data:</label>
|
|
496
|
+
<select id="test-weak-model-select" style="width: 100%; padding: 10px; background: #2a2a2a; color: #fff; border: 1px solid #3a3a3a; border-radius: 4px; font-size: 14px;">
|
|
497
|
+
<option value="">Loading weak model files...</option>
|
|
498
|
+
</select>
|
|
499
|
+
</div>
|
|
500
|
+
</div>
|
|
501
|
+
|
|
502
|
+
<div style="margin-bottom: 20px;">
|
|
503
|
+
<label style="display: block; color: #aaa; margin-bottom: 8px; font-size: 14px;">Number of Samples:</label>
|
|
504
|
+
<input type="number" id="test-num-samples" value="5" min="1" max="50"
|
|
505
|
+
style="width: 150px; padding: 10px; background: #2a2a2a; color: #fff; border: 1px solid #3a3a3a; border-radius: 4px; font-size: 14px;">
|
|
506
|
+
<span style="color: #666; font-size: 12px; margin-left: 10px;">Max: 50</span>
|
|
507
|
+
</div>
|
|
508
|
+
|
|
509
|
+
<!-- Judge Model -->
|
|
510
|
+
<div style="margin-bottom: 20px;">
|
|
511
|
+
<label style="display: block; color: #aaa; margin-bottom: 8px; font-size: 14px;">Judge Model:</label>
|
|
512
|
+
<input type="text" id="test-judge-model"
|
|
513
|
+
style="width: 100%; padding: 10px; background: #2a2a2a; color: #fff; border: 1px solid #3a3a3a; border-radius: 4px; font-size: 14px;"
|
|
514
|
+
placeholder="e.g., gpt-4o, claude-3-5-sonnet-20241022">
|
|
515
|
+
<div style="color: #666; font-size: 12px; margin-top: 5px;">
|
|
516
|
+
Override the judge's model for this test
|
|
517
|
+
</div>
|
|
518
|
+
</div>
|
|
519
|
+
|
|
520
|
+
<!-- Judge Prompt -->
|
|
521
|
+
<div style="margin-bottom: 30px;">
|
|
522
|
+
<label style="display: block; color: #aaa; margin-bottom: 8px; font-size: 14px;">Judge Prompt:</label>
|
|
523
|
+
<textarea id="test-judge-prompt"
|
|
524
|
+
style="width: 100%; min-height: 200px; padding: 10px; background: #2a2a2a; color: #fff; border: 1px solid #3a3a3a; border-radius: 4px; font-size: 13px; font-family: 'Courier New', monospace; resize: vertical;"
|
|
525
|
+
placeholder="Select a judge to load its prompt..."></textarea>
|
|
526
|
+
<div style="color: #666; font-size: 12px; margin-top: 5px;">
|
|
527
|
+
Edit the prompt and test changes, or save to update the judge permanently
|
|
528
|
+
</div>
|
|
529
|
+
</div>
|
|
530
|
+
|
|
531
|
+
<!-- Actions -->
|
|
532
|
+
<div style="display: flex; gap: 10px; margin-bottom: 30px;">
|
|
533
|
+
<button id="run-test-judge-btn" style="padding: 10px 20px; background: #6a4a7e; color: white; border: none; border-radius: 4px; cursor: pointer; font-weight: 500;">
|
|
534
|
+
Run Test
|
|
535
|
+
</button>
|
|
536
|
+
<button id="save-test-judge-prompt-btn" style="padding: 10px 20px; background: #2a7c4a; color: white; border: none; border-radius: 4px; cursor: pointer;">
|
|
537
|
+
Save Prompt to Judge
|
|
538
|
+
</button>
|
|
539
|
+
<button id="close-test-judge-btn" style="padding: 10px 20px; background: #5a2a2a; color: white; border: none; border-radius: 4px; cursor: pointer;">
|
|
540
|
+
Close
|
|
541
|
+
</button>
|
|
542
|
+
</div>
|
|
543
|
+
|
|
544
|
+
<!-- Results -->
|
|
545
|
+
<div id="test-judge-results" style="display: none;">
|
|
546
|
+
<h3 style="color: #4a9eff; margin-bottom: 15px;">Test Results</h3>
|
|
547
|
+
<div id="test-judge-results-content" style="max-height: 600px; overflow-y: auto;">
|
|
548
|
+
<!-- Results populated here -->
|
|
549
|
+
</div>
|
|
550
|
+
</div>
|
|
551
|
+
</div>
|
|
552
|
+
</div>
|
|
553
|
+
|
|
554
|
+
<!-- End-to-End Test Panel -->
|
|
555
|
+
<div id="e2e-panel" style="display: none; position: fixed; top: 0; left: 0; width: 100%; height: 100%; background: rgba(0,0,0,0.8); z-index: 1000; padding: 40px; overflow-y: auto;">
|
|
556
|
+
<div style="max-width: 800px; margin: 0 auto; background: #1a1a1a; border-radius: 8px; padding: 30px; border: 1px solid #4a2a4a;">
|
|
557
|
+
<h2 style="color: #fff; margin-bottom: 10px;">⚡ Run End-to-End Test</h2>
|
|
558
|
+
<p style="color: #888; font-size: 13px; margin-bottom: 25px;">
|
|
559
|
+
This will automatically: Export selected traces → Run weak models → Evaluate with judge
|
|
560
|
+
</p>
|
|
561
|
+
|
|
562
|
+
<!-- Weak Model Selection -->
|
|
563
|
+
<div style="margin-bottom: 25px;">
|
|
564
|
+
<h3 style="color: #fff; font-size: 16px; margin-bottom: 15px;">1. Select Weak Models</h3>
|
|
565
|
+
|
|
566
|
+
<div style="margin-bottom: 15px;">
|
|
567
|
+
<label style="display: block; color: #aaa; margin-bottom: 8px; font-size: 13px;">W&B Models:</label>
|
|
568
|
+
<div id="e2e-wandb-models" style="max-height: 150px; overflow-y: auto; background: #0f0f0f; padding: 10px; border-radius: 4px; border: 1px solid #2a2a2a;">
|
|
569
|
+
<!-- Populated dynamically -->
|
|
570
|
+
</div>
|
|
571
|
+
</div>
|
|
572
|
+
|
|
573
|
+
<div style="margin-bottom: 15px;">
|
|
574
|
+
<label style="display: block; color: #aaa; margin-bottom: 8px; font-size: 13px;">OpenRouter Models (optional):</label>
|
|
575
|
+
<textarea id="e2e-openrouter-models" placeholder="Enter OpenRouter models (one per line) e.g., meta-llama/llama-3.3-70b-instruct anthropic/claude-3.5-sonnet"
|
|
576
|
+
style="width: 100%; padding: 8px; background: #2a2a2a; color: #fff; border: 1px solid #3a3a3a; border-radius: 4px; font-size: 13px; min-height: 80px; font-family: monospace;"></textarea>
|
|
577
|
+
<div style="color: #666; font-size: 11px; margin-top: 5px;">One model per line</div>
|
|
578
|
+
</div>
|
|
579
|
+
|
|
580
|
+
<div>
|
|
581
|
+
<label style="display: block; color: #aaa; margin-bottom: 8px; font-size: 13px;">Max Examples (optional):</label>
|
|
582
|
+
<input type="number" id="e2e-num-examples" placeholder="Leave empty to use all selected traces"
|
|
583
|
+
style="width: 200px; padding: 8px; background: #2a2a2a; color: #fff; border: 1px solid #3a3a3a; border-radius: 4px; font-size: 13px;">
|
|
584
|
+
</div>
|
|
585
|
+
</div>
|
|
586
|
+
|
|
587
|
+
<!-- Judge Selection -->
|
|
588
|
+
<div style="margin-bottom: 30px;">
|
|
589
|
+
<h3 style="color: #fff; font-size: 16px; margin-bottom: 15px;">2. Select Judge</h3>
|
|
590
|
+
<select id="e2e-judge" style="width: 100%; padding: 10px; background: #2a2a2a; color: #fff; border: 1px solid #3a3a3a; border-radius: 4px; font-size: 14px;">
|
|
591
|
+
<option value="">Loading judges...</option>
|
|
592
|
+
</select>
|
|
593
|
+
</div>
|
|
594
|
+
|
|
595
|
+
<!-- Actions -->
|
|
596
|
+
<div style="display: flex; gap: 10px; justify-content: flex-end;">
|
|
597
|
+
<button id="close-e2e-btn" style="padding: 10px 20px; background: #5a2a2a; color: white; border: none; border-radius: 4px; cursor: pointer;">
|
|
598
|
+
Cancel
|
|
599
|
+
</button>
|
|
600
|
+
<button id="run-e2e-btn" style="padding: 10px 20px; background: #7a4a9e; color: white; border: none; border-radius: 4px; cursor: pointer; font-weight: 500;">
|
|
601
|
+
⚡ Run Test
|
|
602
|
+
</button>
|
|
603
|
+
</div>
|
|
604
|
+
</div>
|
|
605
|
+
</div>
|
|
606
|
+
|
|
607
|
+
<!-- End-to-End Progress Panel -->
|
|
608
|
+
<div id="e2e-progress-panel" style="display: none; position: fixed; top: 0; left: 0; width: 100%; height: 100%; background: rgba(0,0,0,0.9); z-index: 1100; padding: 40px; overflow-y: auto;">
|
|
609
|
+
<div style="max-width: 800px; margin: 0 auto; background: #1a1a1a; border-radius: 8px; padding: 30px; border: 1px solid #4a2a4a;">
|
|
610
|
+
<h2 style="color: #fff; margin-bottom: 20px;">Running End-to-End Test</h2>
|
|
611
|
+
|
|
612
|
+
<!-- Overall Progress -->
|
|
613
|
+
<div style="margin-bottom: 30px;">
|
|
614
|
+
<div style="color: #4a9eff; font-size: 14px; margin-bottom: 10px;" id="e2e-step-label">Step 1/3: Exporting traces...</div>
|
|
615
|
+
<div style="width: 100%; height: 8px; background: #2a2a2a; border-radius: 4px; overflow: hidden;">
|
|
616
|
+
<div id="e2e-overall-progress" style="height: 100%; background: #7a4a9e; width: 0%; transition: width 0.3s;"></div>
|
|
617
|
+
</div>
|
|
618
|
+
</div>
|
|
619
|
+
|
|
620
|
+
<!-- Detailed Progress -->
|
|
621
|
+
<div id="e2e-progress-text" style="color: #888; font-family: monospace; font-size: 12px; white-space: pre-wrap; background: #0f0f0f; padding: 15px; border-radius: 4px; max-height: 400px; overflow-y: auto;"></div>
|
|
622
|
+
|
|
623
|
+
<!-- Results -->
|
|
624
|
+
<div id="e2e-results" style="display: none; margin-top: 20px;">
|
|
625
|
+
<h3 style="color: #4a9eff; margin-bottom: 15px;">✓ Test Complete!</h3>
|
|
626
|
+
<div id="e2e-results-content" style="background: #0f0f0f; padding: 15px; border-radius: 4px;"></div>
|
|
627
|
+
<button id="close-e2e-progress-btn" style="margin-top: 20px; padding: 10px 20px; background: #2a7c4a; color: white; border: none; border-radius: 4px; cursor: pointer;">
|
|
628
|
+
Close
|
|
629
|
+
</button>
|
|
630
|
+
</div>
|
|
631
|
+
</div>
|
|
632
|
+
</div>
|
|
430
633
|
</div>
|
|
431
634
|
|
|
432
635
|
<script>
|
|
@@ -1287,6 +1490,517 @@
|
|
|
1287
1490
|
console.error('Delete error:', error);
|
|
1288
1491
|
}
|
|
1289
1492
|
}
|
|
1493
|
+
|
|
1494
|
+
// === SETTINGS ===
|
|
1495
|
+
|
|
1496
|
+
// Load and display settings
|
|
1497
|
+
async function loadSettings() {
    // Pairs each settings form input (by element id) with the key of the
    // /settings response that fills it; a missing or empty value shows as ''.
    const fieldBindings = [
        ['settings-inference-project', 'inference_project'],
        ['settings-evaluation-project', 'evaluation_project']
    ];

    try {
        const reply = await fetch('/settings');
        const saved = await reply.json();
        for (const [fieldId, settingKey] of fieldBindings) {
            document.getElementById(fieldId).value = saved[settingKey] || '';
        }
    } catch (err) {
        // Best effort: a failed load is only logged and the form keeps its current values.
        console.error('Error loading settings:', err);
    }
}
|
|
1507
|
+
|
|
1508
|
+
// Open settings panel
|
|
1509
|
+
document.getElementById('open-settings-btn').addEventListener('click', async () => {
    // Refresh the form fields first so the panel never opens with stale values.
    await loadSettings();
    const settingsPanel = document.getElementById('settings-panel');
    settingsPanel.style.display = 'block';
});
|
|
1513
|
+
|
|
1514
|
+
// Close settings panel
|
|
1515
|
+
document.getElementById('close-settings-btn').addEventListener('click', () => {
    // Hide the panel; nothing is saved or reset here.
    const settingsPanel = document.getElementById('settings-panel');
    settingsPanel.style.display = 'none';
});
|
|
1518
|
+
|
|
1519
|
+
// Save settings
|
|
1520
|
+
document.getElementById('save-settings-btn').addEventListener('click', async () => {
    // Read a form input by id with surrounding whitespace removed.
    const readTrimmed = (fieldId) => document.getElementById(fieldId).value.trim();

    const inferenceProject = readTrimmed('settings-inference-project');
    const evaluationProject = readTrimmed('settings-evaluation-project');

    // Both projects are mandatory; refuse to submit a partial configuration.
    if (!(inferenceProject && evaluationProject)) {
        alert('Both project fields are required');
        return;
    }

    const payload = {
        inference_project: inferenceProject,
        evaluation_project: evaluationProject
    };

    try {
        const reply = await fetch('/settings', {
            method: 'POST',
            headers: { 'Content-Type': 'application/json' },
            body: JSON.stringify(payload)
        });
        const outcome = await reply.json();

        if (outcome.status !== 'success') {
            alert('Error saving settings');
            return;
        }

        alert('Settings saved! Please restart the server for changes to take effect.');
        document.getElementById('settings-panel').style.display = 'none';
    } catch (err) {
        // Network failure or a non-JSON response body.
        alert('Error saving settings: ' + err.message);
    }
});
|
|
1549
|
+
|
|
1550
|
+
// === TEST JUDGES ===
|
|
1551
|
+
|
|
1552
|
+
let testJudgesData = []; // Store judges globally for test panel
|
|
1553
|
+
|
|
1554
|
+
// Open test judge panel
|
|
1555
|
+
document.getElementById('open-test-judge-btn').addEventListener('click', async () => {
    // Replace the options of a <select> with the given [label, value] pairs.
    // Option nodes are used instead of an innerHTML string so that judge
    // names and file names coming from the server are inserted as plain text
    // and can never be parsed as markup.
    const fillSelect = (select, entries) => {
        select.textContent = '';
        for (const [label, value] of entries) {
            select.add(new Option(label, value));
        }
    };

    // Load judges
    try {
        const response = await fetch('/list_judges');
        const data = await response.json();
        testJudgesData = data.judges || []; // Shared with the other test-panel handlers
        const judgeSelect = document.getElementById('test-judge-select');

        if (testJudgesData.length > 0) {
            // The option value is the judge's index into testJudgesData.
            fillSelect(
                judgeSelect,
                testJudgesData.map((judge, idx) => [`${judge.name} (${judge.type})`, String(idx)])
            );

            // The first option is selected by default, so show its prompt and model.
            const firstJudge = testJudgesData[0];
            if (firstJudge) {
                document.getElementById('test-judge-prompt').value = firstJudge.prompt || '';
                document.getElementById('test-judge-model').value = firstJudge.model || '';
            }
        } else {
            fillSelect(judgeSelect, [['No judges available', '']]);
        }
    } catch (error) {
        console.error('Error loading judges:', error);
    }

    // Load weak model files
    try {
        const response = await fetch('/list_weak_models');
        const data = await response.json();
        const weakModelSelect = document.getElementById('test-weak-model-select');

        if (data.files && data.files.length > 0) {
            // Label with the model name when the server provides one; the value is always the filename.
            fillSelect(
                weakModelSelect,
                data.files.map(f => [f.weak_model || f.filename, f.filename])
            );
        } else {
            fillSelect(weakModelSelect, [['No weak model files available', '']]);
        }
    } catch (error) {
        console.error('Error loading weak models:', error);
    }

    // Show the panel even if a load failed, and hide results from any previous run.
    document.getElementById('test-judge-panel').style.display = 'block';
    document.getElementById('test-judge-results').style.display = 'none';
});
|
|
1600
|
+
|
|
1601
|
+
// When judge selection changes, update the prompt and model
|
|
1602
|
+
document.getElementById('test-judge-select').addEventListener('change', (event) => {
    // Option values are indexes into testJudgesData; the placeholder option
    // has an empty value, which parses to NaN and is ignored.
    const position = parseInt(event.target.value);
    const chosenJudge = isNaN(position) ? undefined : testJudgesData[position];
    if (!chosenJudge) {
        return;
    }

    // Mirror the selected judge's stored prompt and model into the editable fields.
    document.getElementById('test-judge-prompt').value = chosenJudge.prompt || '';
    document.getElementById('test-judge-model').value = chosenJudge.model || '';
});
|
|
1610
|
+
|
|
1611
|
+
// Close test judge panel
|
|
1612
|
+
document.getElementById('close-test-judge-btn').addEventListener('click', () => {
    // Hide the panel only; edited prompt/model text stays in the inputs.
    const testJudgePanel = document.getElementById('test-judge-panel');
    testJudgePanel.style.display = 'none';
});
|
|
1615
|
+
|
|
1616
|
+
// Run test judge
|
|
1617
|
+
document.getElementById('run-test-judge-btn').addEventListener('click', async () => {
    // Escape text for interpolation into an HTML string. The samples contain
    // model outputs and prompts, which may legitimately include '<', '&' or
    // quotes; without escaping they would be parsed as markup.
    const escapeHtml = (value) => String(value)
        .replace(/&/g, '&amp;')
        .replace(/</g, '&lt;')
        .replace(/>/g, '&gt;')
        .replace(/"/g, '&quot;')
        .replace(/'/g, '&#39;');

    const judgeIndex = document.getElementById('test-judge-select').value;
    const weakModelFile = document.getElementById('test-weak-model-select').value;
    // Empty or non-numeric input falls back to 5 samples.
    const numSamples = parseInt(document.getElementById('test-num-samples').value, 10) || 5;
    const editedPrompt = document.getElementById('test-judge-prompt').value;
    const editedModel = document.getElementById('test-judge-model').value;

    if (!judgeIndex) {
        alert('Please select a judge');
        return;
    }

    if (!weakModelFile) {
        alert('Please select a weak model file');
        return;
    }

    if (!editedPrompt.trim()) {
        alert('Please enter a judge prompt');
        return;
    }

    if (!editedModel.trim()) {
        alert('Please enter a judge model');
        return;
    }

    // Copy the stored judge and override prompt/model with the edited values,
    // so the test uses the edits without modifying testJudgesData.
    const judge = { ...testJudgesData[parseInt(judgeIndex, 10)] };
    judge.prompt = editedPrompt;
    judge.model = editedModel;

    // Call test endpoint
    try {
        const response = await fetch('/test_judge', {
            method: 'POST',
            headers: { 'Content-Type': 'application/json' },
            body: JSON.stringify({
                judge: judge,
                weak_model_file: weakModelFile,
                num_samples: numSamples
            })
        });

        if (!response.ok) {
            throw new Error('Failed to test judge');
        }

        const result = await response.json();

        // Display results: one card per sample, every dynamic value escaped.
        const resultsDiv = document.getElementById('test-judge-results-content');
        resultsDiv.innerHTML = result.samples.map((sample, idx) => `
            <div style="margin-bottom: 20px; padding: 20px; background: #0f0f0f; border-radius: 8px; border: 1px solid #2a2a2a;">
                <h4 style="color: #4a9eff; margin-bottom: 15px;">Sample ${idx + 1} of ${result.samples.length}</h4>

                <div style="margin-bottom: 15px;">
                    <div style="color: #888; font-size: 12px; margin-bottom: 5px;">Question:</div>
                    <div style="color: #ccc; font-size: 13px; background: #1a1a1a; padding: 10px; border-radius: 4px; max-height: 100px; overflow-y: auto;">${escapeHtml(sample.question || 'N/A')}</div>
                </div>

                <div style="margin-bottom: 15px;">
                    <div style="color: #888; font-size: 12px; margin-bottom: 5px;">Strong Model Output:</div>
                    <div style="color: #ccc; font-size: 13px; background: #1a1a1a; padding: 10px; border-radius: 4px; max-height: 100px; overflow-y: auto;">${escapeHtml(sample.strong_output || 'N/A')}</div>
                </div>

                <div style="margin-bottom: 15px;">
                    <div style="color: #888; font-size: 12px; margin-bottom: 5px;">Weak Model Output:</div>
                    <div style="color: #ccc; font-size: 13px; background: #1a1a1a; padding: 10px; border-radius: 4px; max-height: 100px; overflow-y: auto;">${escapeHtml(sample.weak_output || 'N/A')}</div>
                </div>

                <div style="margin-bottom: 15px;">
                    <div style="color: #888; font-size: 12px; margin-bottom: 5px;">Judge Prompt (filled):</div>
                    <pre style="color: #aaa; font-size: 11px; background: #1a1a1a; padding: 10px; border-radius: 4px; max-height: 150px; overflow-y: auto; white-space: pre-wrap; font-family: monospace;">${escapeHtml(sample.judge_prompt)}</pre>
                </div>

                <div style="margin-bottom: 15px;">
                    <div style="color: #888; font-size: 12px; margin-bottom: 5px;">Raw Judge Response:</div>
                    <pre style="color: #f4d03f; font-size: 11px; background: #1a1a1a; padding: 10px; border-radius: 4px; max-height: 150px; overflow-y: auto; white-space: pre-wrap; font-family: monospace;">${escapeHtml(sample.raw_response)}</pre>
                </div>

                <div>
                    <div style="color: #888; font-size: 12px; margin-bottom: 5px;">Parsed Scores:</div>
                    <div style="color: #4a9eff; font-size: 13px; background: #1a1a1a; padding: 10px; border-radius: 4px; font-family: monospace;">${escapeHtml(JSON.stringify(sample.parsed_scores, null, 2))}</div>
                </div>
            </div>
        `).join('');

        document.getElementById('test-judge-results').style.display = 'block';

    } catch (error) {
        alert('Error testing judge: ' + error.message);
        console.error('Test error:', error);
    }
});
|
|
1712
|
+
|
|
1713
|
+
// Save prompt to judge
|
|
1714
|
+
document.getElementById('save-test-judge-prompt-btn').addEventListener('click', async () => {
    const selectedValue = document.getElementById('test-judge-select').value;
    const newPrompt = document.getElementById('test-judge-prompt').value;

    // An empty select value means no judge is chosen (placeholder option).
    if (!selectedValue) {
        alert('Please select a judge');
        return;
    }

    if (!newPrompt.trim()) {
        alert('Please enter a judge prompt');
        return;
    }

    // Copy of the stored judge with only the prompt replaced; the model field
    // edited in the panel is deliberately not part of this save.
    const updatedJudge = {
        ...testJudgesData[parseInt(selectedValue)],
        prompt: newPrompt
    };

    // The save is permanent, so ask before sending it.
    const approved = confirm(`Save this prompt to judge "${updatedJudge.name}"? This will permanently update the judge.`);
    if (!approved) {
        return;
    }

    try {
        const reply = await fetch('/save_judge', {
            method: 'POST',
            headers: { 'Content-Type': 'application/json' },
            body: JSON.stringify({ judge: updatedJudge })
        });

        if (!reply.ok) {
            throw new Error('Failed to save judge');
        }

        const outcome = await reply.json();

        // Adopt the judge list returned by the server as the new local copy.
        testJudgesData = outcome.judges || [];

        alert('Judge prompt saved successfully!');
    } catch (err) {
        alert('Error saving judge: ' + err.message);
        console.error('Save error:', err);
    }
});
|
|
1760
|
+
|
|
1761
|
+
// === END-TO-END TEST ===
|
|
1762
|
+
|
|
1763
|
+
// Open E2E panel
|
|
1764
|
+
document.getElementById('open-e2e-btn').addEventListener('click', async () => {
    // Escape text for interpolation into an HTML string (element content or
    // a double-quoted attribute value).
    const escapeHtml = (value) => String(value)
        .replace(/&/g, '&amp;')
        .replace(/</g, '&lt;')
        .replace(/>/g, '&gt;')
        .replace(/"/g, '&quot;')
        .replace(/'/g, '&#39;');

    // The end-to-end run operates on the current trace selection.
    if (selectedTraces.size === 0) {
        alert('Please select at least one trace first!');
        return;
    }

    // Populate W&B models: one checkbox per entry of AVAILABLE_MODELS.
    const wandbModelsDiv = document.getElementById('e2e-wandb-models');
    wandbModelsDiv.innerHTML = AVAILABLE_MODELS.map(model => `
        <label style="display: block; padding: 5px 0; color: #ccc; cursor: pointer;">
            <input type="checkbox" class="e2e-model-checkbox" value="${escapeHtml(model)}" style="margin-right: 8px;">
            ${escapeHtml(model)}
        </label>
    `).join('');

    // Load judges
    try {
        const response = await fetch('/list_judges');
        const data = await response.json();
        const judgeSelect = document.getElementById('e2e-judge');

        // Rebuild the options as Option nodes so judge names from the server
        // are inserted as text rather than parsed as markup.
        judgeSelect.textContent = '';
        if (data.judges && data.judges.length > 0) {
            data.judges.forEach((judge, idx) => {
                // The option value is the judge's index in the /list_judges response.
                judgeSelect.add(new Option(`${judge.name} (${judge.type})`, String(idx)));
            });
        } else {
            judgeSelect.add(new Option('No judges available - create one first', ''));
        }
    } catch (error) {
        console.error('Error loading judges:', error);
    }

    document.getElementById('e2e-panel').style.display = 'block';
});
|
|
1798
|
+
|
|
1799
|
+
// Close E2E panel
|
|
1800
|
+
document.getElementById('close-e2e-btn').addEventListener('click', () => {
    // Cancel: hide the configuration panel without starting a run.
    const configPanel = document.getElementById('e2e-panel');
    configPanel.style.display = 'none';
});
|
|
1803
|
+
|
|
1804
|
+
// Close E2E progress
|
|
1805
|
+
document.getElementById('close-e2e-progress-btn').addEventListener('click', () => {
    // Hide the progress overlay and its results section together, so the
    // next run does not open with the previous results visible.
    for (const elementId of ['e2e-progress-panel', 'e2e-results']) {
        document.getElementById(elementId).style.display = 'none';
    }
});
|
|
1809
|
+
|
|
1810
|
+
// Run end-to-end test
|
|
1811
|
+
document.getElementById('run-e2e-btn').addEventListener('click', async () => {
    // ---- Helpers local to this handler ----

    // Escape text for interpolation into an HTML string (content or a
    // double-quoted attribute value).
    const escapeHtml = (value) => String(value)
        .replace(/&/g, '&amp;')
        .replace(/</g, '&lt;')
        .replace(/>/g, '&gt;')
        .replace(/"/g, '&quot;')
        .replace(/'/g, '&#39;');

    // Completion percentage (0-100) of a /progress payload. Returns 0 while
    // total is missing or zero, so the bar never receives a NaN/Infinity width.
    const percentDone = (progress) => {
        const total = Number(progress.total);
        const current = Number(progress.current);
        return total > 0 && Number.isFinite(current) ? (current / total) * 100 : 0;
    };

    // Run `work` while calling `poll` every 300 ms. The timer is cleared in
    // `finally`, so it cannot keep firing after `work` rejects.
    const withPolling = async (poll, work) => {
        const timer = setInterval(poll, 300);
        try {
            return await work();
        } finally {
            clearInterval(timer);
        }
    };

    // POST a JSON body and return the fetch Response.
    const postJson = (url, payload) => fetch(url, {
        method: 'POST',
        headers: { 'Content-Type': 'application/json' },
        body: JSON.stringify(payload)
    });

    // ---- Gather and validate the run configuration ----

    const selectedWandbModels = Array.from(document.querySelectorAll('.e2e-model-checkbox:checked')).map(cb => cb.value);
    // One OpenRouter model per line; blank lines are ignored.
    const openRouterModels = document.getElementById('e2e-openrouter-models').value
        .split('\n')
        .map(m => m.trim())
        .filter(m => m.length > 0);
    const allModels = [...selectedWandbModels, ...openRouterModels];

    if (allModels.length === 0) {
        alert('Please select at least one model!');
        return;
    }

    const judgeIndex = document.getElementById('e2e-judge').value;
    if (!judgeIndex) {
        alert('Please select a judge!');
        return;
    }

    // Empty input means "use all selected traces" and is sent as null.
    const numExamplesRaw = document.getElementById('e2e-num-examples').value;
    const numExamples = numExamplesRaw ? parseInt(numExamplesRaw, 10) : null;

    // ---- Switch from the config panel to the progress panel ----

    document.getElementById('e2e-panel').style.display = 'none';
    document.getElementById('e2e-progress-panel').style.display = 'block';

    const progressText = document.getElementById('e2e-progress-text');
    const stepLabel = document.getElementById('e2e-step-label');
    const overallProgress = document.getElementById('e2e-overall-progress');
    const resultsPanel = document.getElementById('e2e-results');
    const resultsHeading = resultsPanel.querySelector('h3');
    const resultsContent = document.getElementById('e2e-results-content');

    progressText.textContent = '';
    resultsPanel.style.display = 'none';

    try {
        // Load the selected judge inside the try block so a failure is
        // reported in the progress panel instead of as an unhandled rejection.
        const judgesResponse = await fetch('/list_judges');
        if (!judgesResponse.ok) {
            throw new Error('Failed to load judges');
        }
        const judgesData = await judgesResponse.json();
        const judge = (judgesData.judges || [])[parseInt(judgeIndex, 10)];
        if (!judge) {
            throw new Error('Selected judge is no longer available');
        }

        // === STEP 1: Export Selected Traces ===
        stepLabel.textContent = 'Step 1/3: Exporting selected traces...';
        overallProgress.style.width = '10%';
        progressText.textContent += '📦 Exporting selected traces...\n';

        // Get full trace objects for selected IDs
        const selectedTraceObjects = allTraces.filter(t => selectedTraces.has(t.id));

        const exportResponse = await postJson('/export_strong_traces', {
            traces: selectedTraceObjects,
            nickname: `e2e_export_${Date.now()}`
        });

        if (!exportResponse.ok) {
            throw new Error('Failed to export traces');
        }

        const exportResult = await exportResponse.json();
        const exportFilename = exportResult.filename;
        progressText.textContent += `✓ Exported ${exportResult.count} traces to ${exportFilename}\n\n`;
        overallProgress.style.width = '20%';

        // === STEP 2: Run Weak Model Inference ===
        stepLabel.textContent = 'Step 2/3: Running weak model inference...';
        progressText.textContent += `⚙️ Running inference with ${allModels.length} model(s)...\n`;

        const taskId = `inference_${Date.now()}`;

        const pollProgress = async () => {
            try {
                const resp = await fetch(`/progress/${taskId}`);
                if (resp.ok) {
                    const progress = await resp.json();
                    // Map inference progress to 20-60% of overall
                    overallProgress.style.width = `${20 + (percentDone(progress) * 0.4)}%`;
                }
            } catch (e) {
                console.error('Error polling progress:', e);
            }
        };

        const inferenceResponse = await withPolling(pollProgress, () => postJson('/run_inference', {
            models: allModels,
            strong_export_file: exportFilename,
            num_examples: numExamples,
            task_id: taskId
        }));

        if (!inferenceResponse.ok) {
            throw new Error('Failed to run inference');
        }

        // The body is not used, but it is still parsed so a malformed
        // response fails the run here rather than being ignored.
        await inferenceResponse.json();
        progressText.textContent += `✓ Generated outputs for ${allModels.length} model(s)\n\n`;
        overallProgress.style.width = '60%';

        // === STEP 3: Run Evaluations ===
        stepLabel.textContent = 'Step 3/3: Running evaluations...';
        progressText.textContent += `📊 Running evaluations with judge: ${judge.name}...\n`;

        const evaluationResults = [];

        // Get list of weak model files that were just generated
        const weakModelsResponse = await fetch('/list_weak_models');
        const weakModelsData = await weakModelsResponse.json();

        // Keep only files whose name contains one of the models just run,
        // with every '/' in the model id turned into '_'.
        // NOTE(review): assumes the server derives filenames this way - confirm against server.py.
        // This substring match also picks up files left by earlier runs of the same model.
        const modelFileTags = allModels.map(m => m.split('/').join('_'));
        const weakModelFiles = (weakModelsData.files || [])
            .filter(f => modelFileTags.some(tag => f.filename.includes(tag)))
            .map(f => f.filename);

        for (let i = 0; i < weakModelFiles.length; i++) {
            const modelFile = weakModelFiles[i];
            const evalTaskId = `eval_${Date.now()}_${i}`;

            progressText.textContent += `\n[${i+1}/${weakModelFiles.length}] Evaluating ${modelFile}...\n`;

            const pollEvalProgress = async () => {
                try {
                    const resp = await fetch(`/progress/${evalTaskId}`);
                    if (resp.ok) {
                        const progress = await resp.json();
                        // Map eval progress to 60-100% of overall: each file
                        // owns an equal slice of the remaining 40%.
                        const basePercent = 60 + (i / weakModelFiles.length) * 40;
                        const stepPercent = (percentDone(progress) / 100) * (40 / weakModelFiles.length);
                        overallProgress.style.width = `${basePercent + stepPercent}%`;
                    }
                } catch (e) {
                    console.error('Error polling eval progress:', e);
                }
            };

            const evalResponse = await withPolling(pollEvalProgress, () => postJson('/run_evaluation', {
                model_file: modelFile,
                judge: judge,
                task_id: evalTaskId
            }));

            // A failed evaluation is reported but does not abort the remaining files.
            if (evalResponse.ok) {
                const evalResult = await evalResponse.json();
                progressText.textContent += ` ✓ Complete: ${evalResult.examples_evaluated} examples evaluated\n`;
                evaluationResults.push(evalResult);
            } else {
                progressText.textContent += ` ✗ Error evaluating ${modelFile}\n`;
            }
        }

        overallProgress.style.width = '100%';
        stepLabel.textContent = 'Complete!';
        progressText.textContent += `\n✅ All evaluations complete!\n`;

        // Show results. The heading is reset because a previous failed run
        // may have changed it.
        if (resultsHeading) {
            resultsHeading.textContent = '✓ Test Complete!';
        }
        resultsContent.innerHTML = evaluationResults.map(r => `
            <div style="margin-bottom: 15px; padding: 15px; background: #1a1a1a; border-radius: 4px; border: 1px solid #2a2a2a;">
                <div style="font-weight: bold; color: #fff; margin-bottom: 8px;">${escapeHtml(r.evaluation_name)}</div>
                <div style="font-size: 12px; color: #888; margin-bottom: 8px;">
                    ${escapeHtml(r.examples_evaluated)} examples evaluated
                </div>
                <a href="${escapeHtml(r.weave_url)}" target="_blank" style="color: #4a9eff; font-size: 13px;">View in Weave →</a>
            </div>
        `).join('');
        resultsPanel.style.display = 'block';

    } catch (error) {
        progressText.textContent += `\n\n❌ Error: ${error.message}\n`;
        stepLabel.textContent = 'Error occurred';
        console.error('End-to-end test error:', error);

        // The only Close button of the progress overlay lives inside the
        // results section, so reveal that section on failure too; otherwise
        // the full-screen overlay could not be dismissed.
        if (resultsHeading) {
            resultsHeading.textContent = '✗ Test Failed';
        }
        resultsContent.textContent = '';
        resultsPanel.style.display = 'block';
    }
});
|
|
1290
2004
|
</script>
|
|
1291
2005
|
</body>
|
|
1292
2006
|
</html>
|
|
@@ -0,0 +1,17 @@
|
|
|
1
|
+
quickdistill/__init__.py,sha256=U8mvMbfYKLFegcEA4D-P6AFHvSiHQPXoFn0KKd-xh0A,397
|
|
2
|
+
quickdistill/cli.py,sha256=A8d5GN9NdBS299WyAsJ6-p8ynW3DJnDRHZ-UGH7TXLM,2212
|
|
3
|
+
quickdistill/default_judges.json,sha256=w0TkIniELPPG-Mi3hm7zPW06eq46W1BI_ufWXnkDDDM,1432
|
|
4
|
+
quickdistill/get_traces.py,sha256=mfy9fMiK-CZQN1noZ4DfOwdwP45ntthVDLgh4-u2iNk,4896
|
|
5
|
+
quickdistill/server.py,sha256=0Y0XG-8oYoNZgmo10LPZgtwlHuGqrq0urxE-KabyIvI,36789
|
|
6
|
+
quickdistill/__pycache__/__init__.cpython-310.pyc,sha256=Tbov274p3OjaOuOsQwcW-meATEfkz0mHKmpytksuDJI,603
|
|
7
|
+
quickdistill/__pycache__/cli.cpython-310.pyc,sha256=xtVgJTayQLKS4gE_te7U1Wo8LmkDtPkaa2rnzu8h9fY,2443
|
|
8
|
+
quickdistill/__pycache__/get_traces.cpython-310.pyc,sha256=T7Suxp9vpqYDQJ_3uJvXWemqoLf5tnRC2I0BfHrSiNM,2956
|
|
9
|
+
quickdistill/__pycache__/server.cpython-310.pyc,sha256=_taKWofMtdgfMZzfVsd7PoC4jnuKxEOGzW82YBxqPPc,22051
|
|
10
|
+
quickdistill/default_projects/byyoung3_arena-detailed/traces_data.json,sha256=iz-cBmXBYj0bC3Vn754QTnGuDh6sRvlE_RzSyGXaxbY,15496950
|
|
11
|
+
quickdistill/static/judge_manager.html,sha256=fXteyx_ry4gY166WypBkVGGCqieE88MigqLRLVCKnG8,26887
|
|
12
|
+
quickdistill/static/trace_viewer.html,sha256=kPC4GnxeDPq7jxClRhZBOuS6xmA3RaY-loJDZmKDADE,94426
|
|
13
|
+
quickdistill-0.1.7.dist-info/METADATA,sha256=1pE5fDep0l0kAxhHuT1C_H4CYHIiPLP4n9QraAqI9bM,5084
|
|
14
|
+
quickdistill-0.1.7.dist-info/WHEEL,sha256=wUyA8OaulRlbfwMtmQsvNngGrxQHAvkKcvRmdizlJi0,92
|
|
15
|
+
quickdistill-0.1.7.dist-info/entry_points.txt,sha256=AUUTxnwdD9gRnsOEcTXQTAZIZ_F0aRU7JGstIJ3Xk_o,55
|
|
16
|
+
quickdistill-0.1.7.dist-info/top_level.txt,sha256=ysiMvurJYsE1IhkxmObe-0G8A-GIav40kTh2z6axjxg,13
|
|
17
|
+
quickdistill-0.1.7.dist-info/RECORD,,
|
|
@@ -1,17 +0,0 @@
|
|
|
1
|
-
quickdistill/__init__.py,sha256=DquS7slegbCcNr33DA4WEy4RnHFUPHbl3tGhOkw8Yzo,397
|
|
2
|
-
quickdistill/cli.py,sha256=A8d5GN9NdBS299WyAsJ6-p8ynW3DJnDRHZ-UGH7TXLM,2212
|
|
3
|
-
quickdistill/default_judges.json,sha256=w0TkIniELPPG-Mi3hm7zPW06eq46W1BI_ufWXnkDDDM,1432
|
|
4
|
-
quickdistill/get_traces.py,sha256=mfy9fMiK-CZQN1noZ4DfOwdwP45ntthVDLgh4-u2iNk,4896
|
|
5
|
-
quickdistill/server.py,sha256=EXifo8rF8wU_5mhX7ZnpYTi3iRus9XL9nuBdR7FFBRg,27761
|
|
6
|
-
quickdistill/__pycache__/__init__.cpython-310.pyc,sha256=jC6GheK56FqSP9ZP_kHookaiqaKcfY82xOlo2Qn8sag,603
|
|
7
|
-
quickdistill/__pycache__/cli.cpython-310.pyc,sha256=xtVgJTayQLKS4gE_te7U1Wo8LmkDtPkaa2rnzu8h9fY,2443
|
|
8
|
-
quickdistill/__pycache__/get_traces.cpython-310.pyc,sha256=T7Suxp9vpqYDQJ_3uJvXWemqoLf5tnRC2I0BfHrSiNM,2956
|
|
9
|
-
quickdistill/__pycache__/server.cpython-310.pyc,sha256=DXP_J3nP4lPDWwB4fiXbRTSTzM-A1ymWqX593-gDMuA,17503
|
|
10
|
-
quickdistill/default_projects/byyoung3_arena-detailed/traces_data.json,sha256=iz-cBmXBYj0bC3Vn754QTnGuDh6sRvlE_RzSyGXaxbY,15496950
|
|
11
|
-
quickdistill/static/judge_manager.html,sha256=YzzMWpNHyVc7Lyu8Cn55hWzAvYJ1WscXEufQLQ4jR18,17410
|
|
12
|
-
quickdistill/static/trace_viewer.html,sha256=MoXxp_FroAbs8PLzFV7qNkxRI-IY3GGkQWDOHnZM_j8,56257
|
|
13
|
-
quickdistill-0.1.6.dist-info/METADATA,sha256=-Ku5g1GGf8jo7rCZb_vcwy5AB968Zv8INGymUPZDNAg,5084
|
|
14
|
-
quickdistill-0.1.6.dist-info/WHEEL,sha256=wUyA8OaulRlbfwMtmQsvNngGrxQHAvkKcvRmdizlJi0,92
|
|
15
|
-
quickdistill-0.1.6.dist-info/entry_points.txt,sha256=AUUTxnwdD9gRnsOEcTXQTAZIZ_F0aRU7JGstIJ3Xk_o,55
|
|
16
|
-
quickdistill-0.1.6.dist-info/top_level.txt,sha256=ysiMvurJYsE1IhkxmObe-0G8A-GIav40kTh2z6axjxg,13
|
|
17
|
-
quickdistill-0.1.6.dist-info/RECORD,,
|
|
File without changes
|
|
File without changes
|
|
File without changes
|