quickdistill-0.1.6-py3-none-any.whl → quickdistill-0.1.8-py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- quickdistill/__init__.py +1 -1
- quickdistill/__pycache__/__init__.cpython-310.pyc +0 -0
- quickdistill/__pycache__/server.cpython-310.pyc +0 -0
- quickdistill/default_judges.json +2 -2
- quickdistill/server.py +257 -5
- quickdistill/static/judge_manager.html +193 -8
- quickdistill/static/trace_viewer.html +731 -9
- {quickdistill-0.1.6.dist-info → quickdistill-0.1.8.dist-info}/METADATA +1 -1
- quickdistill-0.1.8.dist-info/RECORD +17 -0
- quickdistill-0.1.6.dist-info/RECORD +0 -17
- {quickdistill-0.1.6.dist-info → quickdistill-0.1.8.dist-info}/WHEEL +0 -0
- {quickdistill-0.1.6.dist-info → quickdistill-0.1.8.dist-info}/entry_points.txt +0 -0
- {quickdistill-0.1.6.dist-info → quickdistill-0.1.8.dist-info}/top_level.txt +0 -0
quickdistill/__init__.py
CHANGED
(1 line changed)

quickdistill/__pycache__/__init__.cpython-310.pyc
CHANGED
Binary file

quickdistill/__pycache__/server.cpython-310.pyc
CHANGED
Binary file
quickdistill/default_judges.json
CHANGED
@@ -2,14 +2,14 @@
   {
     "name": "boolean_scorer",
     "type": "llm",
-    "model": "gpt-5",
+    "model": "openai/gpt-5",
     "returnType": "boolean",
     "prompt": "You are a strict evaluator comparing two AI responses (one from a strong reference model which is the ground truth, and one from a weaker model which we are testing to see how similar the responses it generates are to the strong model).\n\nStrong Model Response: {strong_output}\nWeak Model Response: {weak_output}\n\nDetermine if the weak model response is CORRECT compared to the strong model response.\nConsider a response CORRECT if it conveys the same key information and meaning, even if worded differently.\n\nRespond in JSON format: {'correct': true} or {'correct': false}"
   },
   {
     "name": "scalar_scorer",
     "type": "llm",
-    "model": "gpt-5",
+    "model": "openai/gpt-5",
     "returnType": "scalar",
     "prompt": "You are a strict evaluator comparing two AI responses (one from a strong reference model which is the ground truth, and one from a weaker model which we are testing to see how similar the responses it generates are to the strong model).\n\nStrong Model Response: {strong_output}\nWeak Model Response: {weak_output}\n\nEvaluate how similar the weak model response is to the strong model response.\nRate on a scale of 1-5 where 1=completely different and 5=nearly identical. RETURN ONLY ONE SCORE REPRESENTY THE AVERAGE SIMILARITY (EG 5-(avg_error))\n\nRespond in JSON format eg {'scores': the_score }"
   }
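The {strong_output} and {weak_output} placeholders in these judge prompts are filled in by the server before the judge model is called. As a minimal sketch of that substitution (mirroring the str.replace logic the new test_judge handler in server.py uses; the file path and sample strings below are assumptions for illustration, not part of the package):

    import json

    # Assumes the JSON is read relative to the package checkout
    with open("quickdistill/default_judges.json") as f:
        judges = json.load(f)

    judge = judges[0]  # the "boolean_scorer" entry

    # Made-up outputs to compare
    strong_output = "The capital of France is Paris."
    weak_output = "Paris is France's capital."

    # Same placeholder substitution the server performs before calling the judge
    prompt = judge["prompt"]
    prompt = prompt.replace("{strong_output}", strong_output)
    prompt = prompt.replace("{weak_output}", weak_output)

    print(prompt)  # ready to send to judge["model"] (now "openai/gpt-5", LiteLLM-style)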
quickdistill/server.py
CHANGED
@@ -3,7 +3,6 @@ import json
 import openai
 import weave
 import shutil
-import threading
 from flask import Flask, request, jsonify, send_from_directory
 from flask_cors import CORS
 from llmasajudge import LLMAsAJudge
@@ -34,8 +33,25 @@ CORS(app)
 # Progress tracking for long-running operations
 progress_state = {}

-#
-
+# Load settings
+SETTINGS_FILE = DATA_DIR / 'settings.json'
+DEFAULT_SETTINGS = {
+    'inference_project': 'wandb_fc/quickstart_playground',
+    'evaluation_project': 'wandb_inference'
+}
+
+def load_settings():
+    if SETTINGS_FILE.exists():
+        with open(SETTINGS_FILE, 'r') as f:
+            return {**DEFAULT_SETTINGS, **json.load(f)}
+    return DEFAULT_SETTINGS.copy()
+
+def save_settings(settings):
+    with open(SETTINGS_FILE, 'w') as f:
+        json.dump(settings, f, indent=2)
+
+settings = load_settings()
+PROJECT = settings['evaluation_project']

 weave.init(PROJECT)

@@ -46,7 +62,7 @@ def create_client():
         api_key=os.getenv("WANDB_API_KEY"),
         project=PROJECT,
         default_headers={
-            "OpenAI-Project":
+            "OpenAI-Project": settings['inference_project']
         }
     )

@@ -175,8 +191,9 @@ def run_inference_endpoint():
     if not traces:
         return jsonify({'error': 'No traces in export file'}), 400

-    # Limit traces to num_examples
+    # Limit traces to num_examples (convert to int if needed)
     if num_examples:
+        num_examples = int(num_examples)
         traces = traces[:num_examples]

     output_files = []
@@ -284,6 +301,241 @@ def get_progress(task_id):
     return jsonify({'error': 'Task not found'}), 404


+@app.route('/settings', methods=['GET'])
+def get_settings():
+    """Get current settings"""
+    return jsonify(settings)
+
+
+@app.route('/settings', methods=['POST'])
+def update_settings():
+    """Update settings"""
+    global settings
+    data = request.json
+    settings.update(data)
+    save_settings(settings)
+    return jsonify({'status': 'success', 'settings': settings})
+
+
+@app.route('/test_judge', methods=['POST'])
+def test_judge():
+    """Test a judge on sample data to see raw inputs/outputs"""
+    data = request.json
+    judge = data.get('judge')
+    weak_model_file = data.get('weak_model_file')
+    num_samples = data.get('num_samples', 5)
+
+    if not judge or not weak_model_file:
+        return jsonify({'error': 'Missing judge or weak_model_file'}), 400
+
+    # Load weak model results
+    model_path = DATA_DIR / weak_model_file
+    with open(model_path, 'r') as f:
+        file_data = json.load(f)
+
+    # Handle both formats
+    if isinstance(file_data, dict) and 'results' in file_data:
+        results = file_data['results']
+    else:
+        results = file_data
+
+    # Limit to num_samples
+    samples_to_test = results[:min(num_samples, len(results))]
+
+    test_results = []
+
+    for example in samples_to_test:
+        # Skip examples with errors
+        if example.get('error') or not example.get('output'):
+            continue
+
+        strong_output = example.get('strong_model_output', '')
+        weak_output = example.get('output', '')
+
+        # Extract question
+        question = ""
+        messages = example.get('messages', [])
+        if messages and len(messages) > 0:
+            question = messages[0].get('content', '')
+
+        # Build the prompt
+        prompt = judge['prompt']
+        if '{question}' in prompt:
+            prompt = prompt.replace('{question}', question or '')
+        if '{strong_output}' in prompt:
+            prompt = prompt.replace('{strong_output}', strong_output or '')
+        if '{weak_output}' in prompt:
+            prompt = prompt.replace('{weak_output}', weak_output or '')
+
+        # Run the judge and capture raw response
+        if judge['type'] == 'llm':
+            return_type = judge.get('returnType', 'scalar')
+
+            # Use a list to capture the raw response (mutable so we can access from closure)
+            captured_raw = []
+
+            def score_parser(response: str):
+                """Parse the judge response based on return type"""
+                # Capture the raw response before any processing
+                captured_raw.append(response)
+
+                response = response.strip()
+
+                # Remove markdown code blocks if present
+                if response.startswith('```'):
+                    # Remove ```json or ``` at start
+                    response = response.split('\n', 1)[1] if '\n' in response else response[3:]
+                    # Remove ``` at end
+                    if response.endswith('```'):
+                        response = response.rsplit('\n', 1)[0] if '\n' in response else response[:-3]
+                    response = response.strip()
+
+                try:
+                    # Parse JSON response
+                    parsed = json.loads(response)
+
+                    if return_type == 'boolean':
+                        # Extract boolean value - return just the bool
+                        val = parsed.get('correct', parsed.get('result', parsed.get('value', False)))
+                        return bool(val)
+                    elif return_type == 'scalar':
+                        # Extract numeric score - return just the number
+                        val = parsed.get('score', parsed.get('scores', parsed.get('value', 0)))
+                        return float(val) if isinstance(val, (int, float)) else 0
+                    else:
+                        # Unsupported return type
+                        print(f"Unsupported return type: {return_type}")
+                        return 0
+                except:
+                    print(f"Failed to parse judge response as JSON: {response}")
+                    if return_type == 'scalar':
+                        return 0
+                    elif return_type == 'boolean':
+                        return False
+                    else:
+                        return 0
+
+            # Use LLMAsAJudge exactly like the evaluation code
+            try:
+                # Initialize LLMAsAJudge with custom prompt
+                judge_instance = LLMAsAJudge(
+                    models=[judge['model']],
+                    use_fully_custom_prompt=True,
+                    output_parser=score_parser,
+                    return_type=return_type if return_type else None
+                )
+
+                # Get judgment
+                result = judge_instance.judge(prompt=prompt)
+
+                # Extract the raw response that was captured
+                raw_text = captured_raw[0] if captured_raw else "No response captured"
+
+                # Extract parsed scores from result
+                if return_type == 'scalar':
+                    score_val = result.get('scores', result.get('correct', 0))
+                    parsed_scores = {'score': score_val}
+                elif return_type == 'boolean':
+                    bool_val = result.get('correct', False)
+                    parsed_scores = {'correct': bool_val}
+                else:
+                    # Unsupported return type - default to scalar
+                    score_val = result.get('scores', result.get('correct', 0))
+                    parsed_scores = {'score': score_val}
+
+            except Exception as e:
+                raw_text = f"Error: {str(e)}"
+                parsed_scores = {'error': str(e)}
+
+            test_results.append({
+                'question': question,
+                'strong_output': strong_output,
+                'weak_output': weak_output,
+                'judge_prompt': prompt,
+                'raw_response': raw_text,
+                'parsed_scores': parsed_scores
+            })
+
+    return jsonify({
+        'status': 'success',
+        'judge_name': judge['name'],
+        'num_samples': len(test_results),
+        'samples': test_results
+    })
+
+
+@app.route('/generate_judge_prompt', methods=['POST'])
+def generate_judge_prompt():
+    """Generate a judge prompt using AI based on sample data"""
+    data = request.json
+    weak_model_file = data.get('weak_model_file')
+    num_samples = data.get('num_samples', 3)
+    model = data.get('model', 'openai/gpt-5')
+    meta_prompt = data.get('meta_prompt')
+
+    if not weak_model_file or not meta_prompt:
+        return jsonify({'error': 'Missing weak_model_file or meta_prompt'}), 400
+
+    # Load weak model results
+    model_path = DATA_DIR / weak_model_file
+    with open(model_path, 'r') as f:
+        file_data = json.load(f)
+
+    # Handle both formats
+    if isinstance(file_data, dict) and 'results' in file_data:
+        results = file_data['results']
+    else:
+        results = file_data
+
+    # Limit to num_samples
+    samples_to_use = results[:min(num_samples, len(results))]
+
+    # Format samples for meta-prompt
+    samples_text = []
+    for i, example in enumerate(samples_to_use):
+        # Skip examples with errors
+        if example.get('error') or not example.get('output'):
+            continue
+
+        strong_output = example.get('strong_model_output', '')
+        weak_output = example.get('output', '')
+
+        # Extract question
+        question = ""
+        messages = example.get('messages', [])
+        if messages and len(messages) > 0:
+            question = messages[0].get('content', '')
+
+        samples_text.append(f"""Sample {i+1}:
+Question: {question}
+Strong Model Output: {strong_output}
+Weak Model Output: {weak_output}
+---""")
+
+    samples_formatted = "\n\n".join(samples_text)
+
+    # Replace {SAMPLES} placeholder in meta-prompt
+    final_prompt = meta_prompt.replace('{SAMPLES}', samples_formatted)
+
+    # Call OpenRouter to generate the prompt
+    try:
+        client = create_openrouter_client()
+        response = client.chat.completions.create(
+            model=model,
+            messages=[{"role": "user", "content": final_prompt}]
+        )
+        generated_prompt = response.choices[0].message.content.strip()
+
+        return jsonify({
+            'status': 'success',
+            'generated_prompt': generated_prompt,
+            'num_samples_used': len(samples_text)
+        })
+
+    except Exception as e:
+        return jsonify({'error': f'Failed to generate prompt: {str(e)}'}), 500
+
+
 @app.route('/list_weak_models', methods=['GET'])
 def list_weak_models():
     """List available weak model result files with metadata"""
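A minimal sketch of exercising the new server endpoints from Python follows. It assumes the quickdistill server is running locally on Flask's default port (5000) and that a weak-model results file named weak_results.json already exists in the data directory; neither of those specifics comes from the diff itself, and the judge dict is a shortened stand-in for the defaults shipped in default_judges.json.

    import requests

    BASE = "http://localhost:5000"  # assumed local Flask dev server

    # Read, then update, the newly persisted settings
    print(requests.get(f"{BASE}/settings").json())
    requests.post(f"{BASE}/settings", json={"evaluation_project": "my_team/my_eval_project"})

    # Dry-run a judge against a few samples from a weak-model results file
    judge = {
        "name": "boolean_scorer",
        "type": "llm",
        "model": "openai/gpt-5",
        "returnType": "boolean",
        "prompt": (
            "Strong: {strong_output}\nWeak: {weak_output}\n"
            "Respond in JSON format: {'correct': true} or {'correct': false}"
        ),
    }
    resp = requests.post(
        f"{BASE}/test_judge",
        json={"judge": judge, "weak_model_file": "weak_results.json", "num_samples": 3},
    )
    for sample in resp.json().get("samples", []):
        # Each sample echoes the built prompt, the raw judge reply, and the parsed score
        print(sample["parsed_scores"], sample["raw_response"][:80])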
quickdistill/static/judge_manager.html
CHANGED
@@ -162,6 +162,13 @@
     <div class="container">
         <h1>Judge Manager</h1>

+        <!-- Prompt Generator Button -->
+        <div style="margin-bottom: 20px; display: none;">
+            <button onclick="openPromptGenerator()" style="padding: 10px 20px; background: #6a4a7e; color: white; border: none; border-radius: 4px; cursor: pointer; font-size: 14px;">
+                ✨ Generate Judge Prompt with AI
+            </button>
+        </div>
+
         <!-- Create/Edit Judge Section -->
         <div class="section">
             <h2 id="form-title">Create New Judge</h2>
@@ -176,12 +183,10 @@

             <div id="llm-options" style="display: block;">
                 <label for="judge-model">Model</label>
-                <
-
-                <
-
-                <option value="claude-3-5-sonnet-20241022">claude-3-5-sonnet</option>
-                </select>
+                <input type="text" id="judge-model" placeholder="e.g., openai/gpt-5, anthropic/claude-3.5-sonnet" value="openai/gpt-5">
+                <p style="color: #888; font-size: 12px; margin-top: 5px; margin-bottom: 15px;">
+                    <strong>Note:</strong> Uses LiteLLM format. Examples: <code>openai/gpt-5</code>, <code>anthropic/claude-3.5-sonnet</code>, <code>openai/gpt-4o</code>
+                </p>

                 <label for="judge-return-type">Return Type</label>
                 <select id="judge-return-type">
@@ -217,6 +222,73 @@
         </div>
     </div>

+    <!-- Prompt Generator Panel -->
+    <div id="prompt-generator-panel" style="display: none; position: fixed; top: 0; left: 0; width: 100%; height: 100%; background: rgba(0,0,0,0.9); z-index: 1000; padding: 40px; overflow-y: auto;">
+        <div style="max-width: 1200px; margin: 0 auto; background: #1a1a1a; border-radius: 8px; padding: 30px; border: 1px solid #3a2a4a;">
+            <h2 style="color: #fff; margin-bottom: 10px;">AI-Powered Judge Prompt Generator</h2>
+            <p style="color: #888; font-size: 13px; margin-bottom: 25px;">
+                Generate specialized judge prompts by showing sample data to an AI model
+            </p>
+
+            <!-- Configuration -->
+            <div style="display: grid; grid-template-columns: 1fr 1fr; gap: 20px; margin-bottom: 20px;">
+                <div>
+                    <label style="display: block; color: #aaa; margin-bottom: 8px; font-size: 14px;">Weak Model Dataset:</label>
+                    <select id="gen-weak-model-select" style="width: 100%; padding: 10px; background: #2a2a2a; color: #fff; border: 1px solid #3a3a3a; border-radius: 4px; font-size: 14px;">
+                        <option value="">Loading weak model files...</option>
+                    </select>
+                </div>
+
+                <div>
+                    <label style="display: block; color: #aaa; margin-bottom: 8px; font-size: 14px;">Number of Samples:</label>
+                    <input type="number" id="gen-num-samples" value="3" min="1" max="10"
+                           style="width: 100%; padding: 10px; background: #2a2a2a; color: #fff; border: 1px solid #3a3a3a; border-radius: 4px; font-size: 14px;">
+                    <div style="color: #666; font-size: 12px; margin-top: 5px;">Max: 10 (for context limits)</div>
+                </div>
+            </div>
+
+            <div style="margin-bottom: 20px;">
+                <label style="display: block; color: #aaa; margin-bottom: 8px; font-size: 14px;">Generation Model:</label>
+                <input type="text" id="gen-model" value="openai/gpt-5"
+                       style="width: 100%; padding: 10px; background: #2a2a2a; color: #fff; border: 1px solid #3a3a3a; border-radius: 4px; font-size: 14px;"
+                       placeholder="e.g., openai/gpt-5, anthropic/claude-3.5-sonnet">
+                <div style="color: #666; font-size: 12px; margin-top: 5px;">OpenRouter model to use for generating the prompt</div>
+            </div>
+
+            <!-- Meta-Prompt -->
+            <div style="margin-bottom: 25px;">
+                <label style="display: block; color: #aaa; margin-bottom: 8px; font-size: 14px;">Meta-Prompt (edit as needed):</label>
+                <textarea id="gen-meta-prompt"
+                          style="width: 100%; min-height: 250px; padding: 10px; background: #2a2a2a; color: #fff; border: 1px solid #3a3a3a; border-radius: 4px; font-size: 13px; font-family: 'Courier New', monospace; resize: vertical;"></textarea>
+                <div style="color: #666; font-size: 12px; margin-top: 5px;">
+                    This prompt will be sent to the generation model along with sample data
+                </div>
+            </div>
+
+            <!-- Actions -->
+            <div style="display: flex; gap: 10px; margin-bottom: 25px;">
+                <button onclick="generatePrompt()" style="padding: 10px 20px; background: #6a4a7e; color: white; border: none; border-radius: 4px; cursor: pointer; font-weight: 500;">
+                    Generate Prompt
+                </button>
+                <button onclick="closePromptGenerator()" style="padding: 10px 20px; background: #5a2a2a; color: white; border: none; border-radius: 4px; cursor: pointer;">
+                    Close
+                </button>
+            </div>
+
+            <!-- Generated Output -->
+            <div id="gen-output-section" style="display: none;">
+                <h3 style="color: #4a9eff; margin-bottom: 15px;">Generated Judge Prompt</h3>
+                <textarea id="gen-output" readonly
+                          style="width: 100%; min-height: 300px; padding: 15px; background: #0f0f0f; color: #4a9eff; border: 1px solid #4a9eff; border-radius: 4px; font-size: 13px; font-family: 'Courier New', monospace; resize: vertical;"></textarea>
+                <div style="margin-top: 10px;">
+                    <button onclick="copyGeneratedPrompt()" style="padding: 8px 16px; background: #2a7c4a; color: white; border: none; border-radius: 4px; cursor: pointer;">
+                        Copy to Clipboard
+                    </button>
+                </div>
+            </div>
+        </div>
+    </div>
+
     <script>
     let judges = [];
     let editingIndex = null;
@@ -319,10 +391,16 @@ Respond in JSON format: {'correct': true} or {'correct': false}`
        };

        if (type === 'llm') {
-            judge.model = document.getElementById('judge-model').value;
+            judge.model = document.getElementById('judge-model').value.trim();
            judge.returnType = document.getElementById('judge-return-type').value;
            judge.prompt = document.getElementById('judge-prompt').value.trim();

+            // Validate model
+            if (!judge.model) {
+                alert('Error: Please enter a model (e.g., openai/gpt-5)');
+                return;
+            }
+
            // Validate required placeholders
            if (!judge.prompt.includes('{strong_output}')) {
                alert('Error: Judge prompt must include {strong_output} placeholder');
@@ -346,7 +424,7 @@ Respond in JSON format: {'correct': true} or {'correct': false}`
    function resetForm() {
        document.getElementById('judge-name').value = '';
        document.getElementById('judge-type').value = 'llm';
-        document.getElementById('judge-model').value = 'gpt-5';
+        document.getElementById('judge-model').value = 'openai/gpt-5';
        document.getElementById('judge-prompt').value = '';
        document.getElementById('form-title').textContent = 'Create New Judge';
        document.getElementById('save-btn').textContent = 'Save Judge';
@@ -489,6 +567,113 @@ Respond in JSON format: {'correct': true} or {'correct': false}`
            console.log('Not changing prompt - user has edited it');
        }
    });
+
+    // === PROMPT GENERATOR ===
+
+    const DEFAULT_META_PROMPT = `You are an expert at creating evaluation prompts for judging AI model outputs. I'm building a specialized judge prompt to evaluate the quality/similarity of weak model outputs compared to strong reference model outputs.
+
+I will show you some sample data below. Each sample contains:
+- A question/input
+- The strong reference model's output (ground truth)
+- The weak model's output (what we're evaluating)
+
+Your task: Create a specialized, detailed judge prompt that can be used to systematically evaluate the delta/difference between these outputs. The prompt should:
+1. Be specific to the patterns you see in this data
+2. Include clear evaluation criteria
+3. Be written in second-person ("You are...")
+4. Include the placeholders {question}, {strong_output}, and {weak_output}
+5. Specify the exact JSON format to return (either {'score': number} for scalar or {'correct': boolean} for boolean)
+
+Sample Data:
+{SAMPLES}
+
+Based on these samples, create a specialized judge prompt that would effectively evaluate this type of data. Return ONLY the judge prompt text, nothing else.`;
+
+    async function openPromptGenerator() {
+        // Load weak model files
+        try {
+            const response = await fetch('/list_weak_models');
+            const data = await response.json();
+            const select = document.getElementById('gen-weak-model-select');
+
+            if (data.files && data.files.length > 0) {
+                select.innerHTML = data.files.map(f =>
+                    `<option value="${f.filename}">${f.weak_model || f.filename}</option>`
+                ).join('');
+            } else {
+                select.innerHTML = '<option value="">No weak model files available</option>';
+            }
+        } catch (error) {
+            console.error('Error loading weak models:', error);
+        }
+
+        // Set default meta-prompt
+        document.getElementById('gen-meta-prompt').value = DEFAULT_META_PROMPT;
+
+        // Show panel
+        document.getElementById('prompt-generator-panel').style.display = 'block';
+        document.getElementById('gen-output-section').style.display = 'none';
+    }
+
+    function closePromptGenerator() {
+        document.getElementById('prompt-generator-panel').style.display = 'none';
+    }
+
+    async function generatePrompt() {
+        const weakModelFile = document.getElementById('gen-weak-model-select').value;
+        const numSamples = parseInt(document.getElementById('gen-num-samples').value) || 3;
+        const model = document.getElementById('gen-model').value.trim();
+        const metaPrompt = document.getElementById('gen-meta-prompt').value.trim();
+
+        if (!weakModelFile) {
+            alert('Please select a weak model dataset');
+            return;
+        }
+
+        if (!model) {
+            alert('Please enter a generation model');
+            return;
+        }
+
+        if (!metaPrompt) {
+            alert('Please enter a meta-prompt');
+            return;
+        }
+
+        try {
+            const response = await fetch('/generate_judge_prompt', {
+                method: 'POST',
+                headers: { 'Content-Type': 'application/json' },
+                body: JSON.stringify({
+                    weak_model_file: weakModelFile,
+                    num_samples: numSamples,
+                    model: model,
+                    meta_prompt: metaPrompt
+                })
+            });
+
+            if (!response.ok) {
+                throw new Error('Failed to generate prompt');
+            }
+
+            const result = await response.json();
+
+            // Display generated prompt
+            document.getElementById('gen-output').value = result.generated_prompt;
+            document.getElementById('gen-output-section').style.display = 'block';
+
+        } catch (error) {
+            alert('Error generating prompt: ' + error.message);
+            console.error('Generation error:', error);
+        }
+    }
+
+    function copyGeneratedPrompt() {
+        const output = document.getElementById('gen-output');
+        output.select();
+        document.execCommand('copy');
+        alert('Prompt copied to clipboard!');
+    }
    </script>
 </body>
 </html>
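The generator flow added above is: POST a meta-prompt containing a {SAMPLES} placeholder plus a weak-model file to /generate_judge_prompt, then reuse the returned text as the prompt of a judge entry shaped like the ones in default_judges.json. A minimal sketch of that round trip, assuming a locally running server on the default Flask port and a hypothetical weak_results.json dataset (both assumptions, not stated in the diff):

    import requests

    BASE = "http://localhost:5000"  # assumed local Flask dev server

    meta_prompt = (
        "You are an expert at creating evaluation prompts.\n"
        "Sample Data:\n{SAMPLES}\n"
        "Return ONLY the judge prompt text, nothing else."
    )

    resp = requests.post(f"{BASE}/generate_judge_prompt", json={
        "weak_model_file": "weak_results.json",  # hypothetical dataset name
        "num_samples": 3,
        "model": "openai/gpt-5",
        "meta_prompt": meta_prompt,
    })
    generated = resp.json()["generated_prompt"]

    # Wrap the generated text in a judge entry shaped like default_judges.json
    new_judge = {
        "name": "generated_scorer",
        "type": "llm",
        "model": "openai/gpt-5",
        "returnType": "scalar",
        "prompt": generated,
    }
    print(new_judge)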