quickdistill 0.1.6.tar.gz → 0.1.7.tar.gz
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- {quickdistill-0.1.6/quickdistill.egg-info → quickdistill-0.1.7}/PKG-INFO +1 -1
- {quickdistill-0.1.6 → quickdistill-0.1.7}/pyproject.toml +1 -1
- {quickdistill-0.1.6 → quickdistill-0.1.7}/quickdistill/__init__.py +1 -1
- {quickdistill-0.1.6 → quickdistill-0.1.7}/quickdistill/__pycache__/__init__.cpython-310.pyc +0 -0
- quickdistill-0.1.7/quickdistill/__pycache__/server.cpython-310.pyc +0 -0
- {quickdistill-0.1.6 → quickdistill-0.1.7}/quickdistill/server.py +257 -5
- {quickdistill-0.1.6 → quickdistill-0.1.7}/quickdistill/static/judge_manager.html +181 -0
- {quickdistill-0.1.6 → quickdistill-0.1.7}/quickdistill/static/trace_viewer.html +714 -0
- {quickdistill-0.1.6 → quickdistill-0.1.7/quickdistill.egg-info}/PKG-INFO +1 -1
- {quickdistill-0.1.6 → quickdistill-0.1.7}/quickdistill.egg-info/SOURCES.txt +1 -0
- quickdistill-0.1.7/update.sh +111 -0
- quickdistill-0.1.6/quickdistill/__pycache__/server.cpython-310.pyc +0 -0
- {quickdistill-0.1.6 → quickdistill-0.1.7}/.pycommands +0 -0
- {quickdistill-0.1.6 → quickdistill-0.1.7}/README.md +0 -0
- {quickdistill-0.1.6 → quickdistill-0.1.7}/dev/generate_test_traces.py +0 -0
- {quickdistill-0.1.6 → quickdistill-0.1.7}/dev/get_call.py +0 -0
- {quickdistill-0.1.6 → quickdistill-0.1.7}/dev/get_traces.py +0 -0
- {quickdistill-0.1.6 → quickdistill-0.1.7}/dev/inference_server.py +0 -0
- {quickdistill-0.1.6 → quickdistill-0.1.7}/dev/judge_manager.html +0 -0
- {quickdistill-0.1.6 → quickdistill-0.1.7}/dev/judges.json +0 -0
- {quickdistill-0.1.6 → quickdistill-0.1.7}/dev/old/TEST_TRACE_GENERATION.md +0 -0
- {quickdistill-0.1.6 → quickdistill-0.1.7}/dev/old/traces_data.json +0 -0
- {quickdistill-0.1.6 → quickdistill-0.1.7}/dev/projects/byyoung3_arena-detailed/traces_data.json +0 -0
- {quickdistill-0.1.6 → quickdistill-0.1.7}/dev/projects/byyoung3_claude-opus-4-1-tutorial/traces_data.json +0 -0
- {quickdistill-0.1.6 → quickdistill-0.1.7}/dev/projects/byyoung3_test-financial-qa/traces_data.json +0 -0
- {quickdistill-0.1.6 → quickdistill-0.1.7}/dev/pystatus +0 -0
- {quickdistill-0.1.6 → quickdistill-0.1.7}/dev/run_evaluation.py +0 -0
- {quickdistill-0.1.6 → quickdistill-0.1.7}/dev/run_weak_models.py +0 -0
- {quickdistill-0.1.6 → quickdistill-0.1.7}/dev/strong_exports/anthropic_claude-3.5-sonnet_10traces_v2.json +0 -0
- {quickdistill-0.1.6 → quickdistill-0.1.7}/dev/strong_exports/anthropic_claude-3.5-sonnet_20traces.json +0 -0
- {quickdistill-0.1.6 → quickdistill-0.1.7}/dev/strong_exports/claude-opus-4-1-20250805_1traces.json +0 -0
- {quickdistill-0.1.6 → quickdistill-0.1.7}/dev/strong_exports/gpt-5-2025-08-07_199traces.json +0 -0
- {quickdistill-0.1.6 → quickdistill-0.1.7}/dev/trace_viewer.html +0 -0
- {quickdistill-0.1.6 → quickdistill-0.1.7}/dev/traces_data.json +0 -0
- {quickdistill-0.1.6 → quickdistill-0.1.7}/dev/weak_model_google_gemini-2.5-flash.json +0 -0
- {quickdistill-0.1.6 → quickdistill-0.1.7}/dev/weak_model_meta-llama_Llama-3.1-8B-Instruct.json +0 -0
- {quickdistill-0.1.6 → quickdistill-0.1.7}/dev/weak_model_meta-llama_Llama-3.3-70B-Instruct.json +0 -0
- {quickdistill-0.1.6 → quickdistill-0.1.7}/dev/weak_model_openai_gpt-oss-20b.json +0 -0
- {quickdistill-0.1.6 → quickdistill-0.1.7}/quickdistill/__pycache__/cli.cpython-310.pyc +0 -0
- {quickdistill-0.1.6 → quickdistill-0.1.7}/quickdistill/__pycache__/get_traces.cpython-310.pyc +0 -0
- {quickdistill-0.1.6 → quickdistill-0.1.7}/quickdistill/cli.py +0 -0
- {quickdistill-0.1.6 → quickdistill-0.1.7}/quickdistill/default_judges.json +0 -0
- {quickdistill-0.1.6 → quickdistill-0.1.7}/quickdistill/default_projects/byyoung3_arena-detailed/traces_data.json +0 -0
- {quickdistill-0.1.6 → quickdistill-0.1.7}/quickdistill/get_traces.py +0 -0
- {quickdistill-0.1.6 → quickdistill-0.1.7}/quickdistill.egg-info/dependency_links.txt +0 -0
- {quickdistill-0.1.6 → quickdistill-0.1.7}/quickdistill.egg-info/entry_points.txt +0 -0
- {quickdistill-0.1.6 → quickdistill-0.1.7}/quickdistill.egg-info/requires.txt +0 -0
- {quickdistill-0.1.6 → quickdistill-0.1.7}/quickdistill.egg-info/top_level.txt +0 -0
- {quickdistill-0.1.6 → quickdistill-0.1.7}/setup.cfg +0 -0
Binary files (.cpython-310.pyc bytecode caches) changed; no textual diff is shown for them.
{quickdistill-0.1.6 → quickdistill-0.1.7}/quickdistill/server.py

@@ -3,7 +3,6 @@ import json
 import openai
 import weave
 import shutil
-import threading
 from flask import Flask, request, jsonify, send_from_directory
 from flask_cors import CORS
 from llmasajudge import LLMAsAJudge
@@ -34,8 +33,25 @@ CORS(app)
 # Progress tracking for long-running operations
 progress_state = {}
 
-#
-
+# Load settings
+SETTINGS_FILE = DATA_DIR / 'settings.json'
+DEFAULT_SETTINGS = {
+    'inference_project': 'wandb_fc/quickstart_playground',
+    'evaluation_project': 'wandb_inference'
+}
+
+def load_settings():
+    if SETTINGS_FILE.exists():
+        with open(SETTINGS_FILE, 'r') as f:
+            return {**DEFAULT_SETTINGS, **json.load(f)}
+    return DEFAULT_SETTINGS.copy()
+
+def save_settings(settings):
+    with open(SETTINGS_FILE, 'w') as f:
+        json.dump(settings, f, indent=2)
+
+settings = load_settings()
+PROJECT = settings['evaluation_project']
 
 weave.init(PROJECT)
 
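The hunk above persists the two W&B project names in `DATA_DIR/settings.json`, layering any saved values over `DEFAULT_SETTINGS`, and a later hunk adds `/settings` routes that read and write this file. A minimal client-side sketch of how those routes could be exercised (the base URL is an assumption; the diff does not show which host or port the Flask app binds):

```python
# Sketch only: assumes the quickdistill server is reachable at this address.
import requests

BASE = "http://127.0.0.1:5000"  # assumed host/port, not shown in this diff

# GET /settings returns DEFAULT_SETTINGS merged with any overrides from settings.json.
current = requests.get(f"{BASE}/settings").json()
print(current)  # e.g. {'inference_project': '...', 'evaluation_project': '...'}

# POST /settings merges the posted keys into the in-memory dict and rewrites settings.json.
updated = requests.post(
    f"{BASE}/settings",
    json={"inference_project": "my-team/my-project"},  # hypothetical project name
).json()
print(updated["settings"]["inference_project"])
```

Note that `inference_project` is read each time `create_client()` builds its headers, whereas `evaluation_project` is captured once at import (`PROJECT` / `weave.init(PROJECT)`), so changing the latter via POST only takes effect after a restart.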
@@ -46,7 +62,7 @@ def create_client():
         api_key=os.getenv("WANDB_API_KEY"),
         project=PROJECT,
         default_headers={
-            "OpenAI-Project":
+            "OpenAI-Project": settings['inference_project']
         }
     )
 
@@ -175,8 +191,9 @@ def run_inference_endpoint():
     if not traces:
         return jsonify({'error': 'No traces in export file'}), 400
 
-    # Limit traces to num_examples
+    # Limit traces to num_examples (convert to int if needed)
     if num_examples:
+        num_examples = int(num_examples)
         traces = traces[:num_examples]
 
     output_files = []
@@ -284,6 +301,241 @@ def get_progress(task_id):
     return jsonify({'error': 'Task not found'}), 404
 
 
+@app.route('/settings', methods=['GET'])
+def get_settings():
+    """Get current settings"""
+    return jsonify(settings)
+
+
+@app.route('/settings', methods=['POST'])
+def update_settings():
+    """Update settings"""
+    global settings
+    data = request.json
+    settings.update(data)
+    save_settings(settings)
+    return jsonify({'status': 'success', 'settings': settings})
+
+
+@app.route('/test_judge', methods=['POST'])
+def test_judge():
+    """Test a judge on sample data to see raw inputs/outputs"""
+    data = request.json
+    judge = data.get('judge')
+    weak_model_file = data.get('weak_model_file')
+    num_samples = data.get('num_samples', 5)
+
+    if not judge or not weak_model_file:
+        return jsonify({'error': 'Missing judge or weak_model_file'}), 400
+
+    # Load weak model results
+    model_path = DATA_DIR / weak_model_file
+    with open(model_path, 'r') as f:
+        file_data = json.load(f)
+
+    # Handle both formats
+    if isinstance(file_data, dict) and 'results' in file_data:
+        results = file_data['results']
+    else:
+        results = file_data
+
+    # Limit to num_samples
+    samples_to_test = results[:min(num_samples, len(results))]
+
+    test_results = []
+
+    for example in samples_to_test:
+        # Skip examples with errors
+        if example.get('error') or not example.get('output'):
+            continue
+
+        strong_output = example.get('strong_model_output', '')
+        weak_output = example.get('output', '')
+
+        # Extract question
+        question = ""
+        messages = example.get('messages', [])
+        if messages and len(messages) > 0:
+            question = messages[0].get('content', '')
+
+        # Build the prompt
+        prompt = judge['prompt']
+        if '{question}' in prompt:
+            prompt = prompt.replace('{question}', question or '')
+        if '{strong_output}' in prompt:
+            prompt = prompt.replace('{strong_output}', strong_output or '')
+        if '{weak_output}' in prompt:
+            prompt = prompt.replace('{weak_output}', weak_output or '')
+
+        # Run the judge and capture raw response
+        if judge['type'] == 'llm':
+            return_type = judge.get('returnType', 'scalar')
+
+            # Use a list to capture the raw response (mutable so we can access from closure)
+            captured_raw = []
+
+            def score_parser(response: str):
+                """Parse the judge response based on return type"""
+                # Capture the raw response before any processing
+                captured_raw.append(response)
+
+                response = response.strip()
+
+                # Remove markdown code blocks if present
+                if response.startswith('```'):
+                    # Remove ```json or ``` at start
+                    response = response.split('\n', 1)[1] if '\n' in response else response[3:]
+                # Remove ``` at end
+                if response.endswith('```'):
+                    response = response.rsplit('\n', 1)[0] if '\n' in response else response[:-3]
+                response = response.strip()
+
+                try:
+                    # Parse JSON response
+                    parsed = json.loads(response)
+
+                    if return_type == 'boolean':
+                        # Extract boolean value - return just the bool
+                        val = parsed.get('correct', parsed.get('result', parsed.get('value', False)))
+                        return bool(val)
+                    elif return_type == 'scalar':
+                        # Extract numeric score - return just the number
+                        val = parsed.get('score', parsed.get('scores', parsed.get('value', 0)))
+                        return float(val) if isinstance(val, (int, float)) else 0
+                    else:
+                        # Unsupported return type
+                        print(f"Unsupported return type: {return_type}")
+                        return 0
+                except:
+                    print(f"Failed to parse judge response as JSON: {response}")
+                    if return_type == 'scalar':
+                        return 0
+                    elif return_type == 'boolean':
+                        return False
+                    else:
+                        return 0
+
+            # Use LLMAsAJudge exactly like the evaluation code
+            try:
+                # Initialize LLMAsAJudge with custom prompt
+                judge_instance = LLMAsAJudge(
+                    models=[judge['model']],
+                    use_fully_custom_prompt=True,
+                    output_parser=score_parser,
+                    return_type=return_type if return_type else None
+                )
+
+                # Get judgment
+                result = judge_instance.judge(prompt=prompt)
+
+                # Extract the raw response that was captured
+                raw_text = captured_raw[0] if captured_raw else "No response captured"
+
+                # Extract parsed scores from result
+                if return_type == 'scalar':
+                    score_val = result.get('scores', result.get('correct', 0))
+                    parsed_scores = {'score': score_val}
+                elif return_type == 'boolean':
+                    bool_val = result.get('correct', False)
+                    parsed_scores = {'correct': bool_val}
+                else:
+                    # Unsupported return type - default to scalar
+                    score_val = result.get('scores', result.get('correct', 0))
+                    parsed_scores = {'score': score_val}
+
+            except Exception as e:
+                raw_text = f"Error: {str(e)}"
+                parsed_scores = {'error': str(e)}
+
+            test_results.append({
+                'question': question,
+                'strong_output': strong_output,
+                'weak_output': weak_output,
+                'judge_prompt': prompt,
+                'raw_response': raw_text,
+                'parsed_scores': parsed_scores
+            })
+
+    return jsonify({
+        'status': 'success',
+        'judge_name': judge['name'],
+        'num_samples': len(test_results),
+        'samples': test_results
+    })
+
+
+@app.route('/generate_judge_prompt', methods=['POST'])
+def generate_judge_prompt():
+    """Generate a judge prompt using AI based on sample data"""
+    data = request.json
+    weak_model_file = data.get('weak_model_file')
+    num_samples = data.get('num_samples', 3)
+    model = data.get('model', 'openai/gpt-5')
+    meta_prompt = data.get('meta_prompt')
+
+    if not weak_model_file or not meta_prompt:
+        return jsonify({'error': 'Missing weak_model_file or meta_prompt'}), 400
+
+    # Load weak model results
+    model_path = DATA_DIR / weak_model_file
+    with open(model_path, 'r') as f:
+        file_data = json.load(f)
+
+    # Handle both formats
+    if isinstance(file_data, dict) and 'results' in file_data:
+        results = file_data['results']
+    else:
+        results = file_data
+
+    # Limit to num_samples
+    samples_to_use = results[:min(num_samples, len(results))]
+
+    # Format samples for meta-prompt
+    samples_text = []
+    for i, example in enumerate(samples_to_use):
+        # Skip examples with errors
+        if example.get('error') or not example.get('output'):
+            continue
+
+        strong_output = example.get('strong_model_output', '')
+        weak_output = example.get('output', '')
+
+        # Extract question
+        question = ""
+        messages = example.get('messages', [])
+        if messages and len(messages) > 0:
+            question = messages[0].get('content', '')
+
+        samples_text.append(f"""Sample {i+1}:
+Question: {question}
+Strong Model Output: {strong_output}
+Weak Model Output: {weak_output}
+---""")
+
+    samples_formatted = "\n\n".join(samples_text)
+
+    # Replace {SAMPLES} placeholder in meta-prompt
+    final_prompt = meta_prompt.replace('{SAMPLES}', samples_formatted)
+
+    # Call OpenRouter to generate the prompt
+    try:
+        client = create_openrouter_client()
+        response = client.chat.completions.create(
+            model=model,
+            messages=[{"role": "user", "content": final_prompt}]
+        )
+        generated_prompt = response.choices[0].message.content.strip()
+
+        return jsonify({
+            'status': 'success',
+            'generated_prompt': generated_prompt,
+            'num_samples_used': len(samples_text)
+        })
+
+    except Exception as e:
+        return jsonify({'error': f'Failed to generate prompt: {str(e)}'}), 500
+
+
 @app.route('/list_weak_models', methods=['GET'])
 def list_weak_models():
     """List available weak model result files with metadata"""
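The two new endpoints above let a client dry-run a judge against saved weak-model results and ask an OpenRouter model to draft a judge prompt. A hedged sketch of the request payloads they accept, derived from the handlers (the base URL and the weak-model filename are illustrative assumptions; the judge dict mirrors the fields the handler reads: `name`, `type`, `model`, `returnType`, `prompt`):

```python
# Sketch only: base URL and filename are assumptions; payload shapes follow the handlers above.
import requests

BASE = "http://127.0.0.1:5000"                          # assumed host/port
WEAK_FILE = "weak_model_openai_gpt-oss-20b.json"        # illustrative; must exist under DATA_DIR

judge = {
    "name": "similarity-judge",      # echoed back in the /test_judge response
    "type": "llm",                   # only 'llm' judges are executed
    "model": "openai/gpt-5",
    "returnType": "scalar",          # or "boolean"
    "prompt": (
        "Question: {question}\n"
        "Reference answer: {strong_output}\n"
        "Candidate answer: {weak_output}\n"
        "Respond in JSON format: {'score': <number between 0 and 1>}"
    ),
}

# Dry-run the judge on a few samples and inspect raw vs. parsed output.
test = requests.post(f"{BASE}/test_judge", json={
    "judge": judge,
    "weak_model_file": WEAK_FILE,
    "num_samples": 3,
}).json()
for sample in test.get("samples", []):
    print(sample["parsed_scores"], sample["raw_response"][:80])

# Ask an OpenRouter model to draft a judge prompt from the same samples;
# {SAMPLES} is replaced server-side with the formatted examples.
gen = requests.post(f"{BASE}/generate_judge_prompt", json={
    "weak_model_file": WEAK_FILE,
    "num_samples": 3,
    "model": "openai/gpt-5",
    "meta_prompt": "Write a judge prompt for data like this:\n{SAMPLES}",
}).json()
print(gen.get("generated_prompt", gen))
```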
{quickdistill-0.1.6 → quickdistill-0.1.7}/quickdistill/static/judge_manager.html

@@ -162,6 +162,13 @@
     <div class="container">
         <h1>Judge Manager</h1>
 
+        <!-- Prompt Generator Button -->
+        <div style="margin-bottom: 20px; display: none;">
+            <button onclick="openPromptGenerator()" style="padding: 10px 20px; background: #6a4a7e; color: white; border: none; border-radius: 4px; cursor: pointer; font-size: 14px;">
+                ✨ Generate Judge Prompt with AI
+            </button>
+        </div>
+
         <!-- Create/Edit Judge Section -->
         <div class="section">
             <h2 id="form-title">Create New Judge</h2>
@@ -217,6 +224,73 @@
         </div>
     </div>
 
+    <!-- Prompt Generator Panel -->
+    <div id="prompt-generator-panel" style="display: none; position: fixed; top: 0; left: 0; width: 100%; height: 100%; background: rgba(0,0,0,0.9); z-index: 1000; padding: 40px; overflow-y: auto;">
+        <div style="max-width: 1200px; margin: 0 auto; background: #1a1a1a; border-radius: 8px; padding: 30px; border: 1px solid #3a2a4a;">
+            <h2 style="color: #fff; margin-bottom: 10px;">AI-Powered Judge Prompt Generator</h2>
+            <p style="color: #888; font-size: 13px; margin-bottom: 25px;">
+                Generate specialized judge prompts by showing sample data to an AI model
+            </p>
+
+            <!-- Configuration -->
+            <div style="display: grid; grid-template-columns: 1fr 1fr; gap: 20px; margin-bottom: 20px;">
+                <div>
+                    <label style="display: block; color: #aaa; margin-bottom: 8px; font-size: 14px;">Weak Model Dataset:</label>
+                    <select id="gen-weak-model-select" style="width: 100%; padding: 10px; background: #2a2a2a; color: #fff; border: 1px solid #3a3a3a; border-radius: 4px; font-size: 14px;">
+                        <option value="">Loading weak model files...</option>
+                    </select>
+                </div>
+
+                <div>
+                    <label style="display: block; color: #aaa; margin-bottom: 8px; font-size: 14px;">Number of Samples:</label>
+                    <input type="number" id="gen-num-samples" value="3" min="1" max="10"
+                        style="width: 100%; padding: 10px; background: #2a2a2a; color: #fff; border: 1px solid #3a3a3a; border-radius: 4px; font-size: 14px;">
+                    <div style="color: #666; font-size: 12px; margin-top: 5px;">Max: 10 (for context limits)</div>
+                </div>
+            </div>
+
+            <div style="margin-bottom: 20px;">
+                <label style="display: block; color: #aaa; margin-bottom: 8px; font-size: 14px;">Generation Model:</label>
+                <input type="text" id="gen-model" value="openai/gpt-5"
+                    style="width: 100%; padding: 10px; background: #2a2a2a; color: #fff; border: 1px solid #3a3a3a; border-radius: 4px; font-size: 14px;"
+                    placeholder="e.g., openai/gpt-5, anthropic/claude-3.5-sonnet">
+                <div style="color: #666; font-size: 12px; margin-top: 5px;">OpenRouter model to use for generating the prompt</div>
+            </div>
+
+            <!-- Meta-Prompt -->
+            <div style="margin-bottom: 25px;">
+                <label style="display: block; color: #aaa; margin-bottom: 8px; font-size: 14px;">Meta-Prompt (edit as needed):</label>
+                <textarea id="gen-meta-prompt"
+                    style="width: 100%; min-height: 250px; padding: 10px; background: #2a2a2a; color: #fff; border: 1px solid #3a3a3a; border-radius: 4px; font-size: 13px; font-family: 'Courier New', monospace; resize: vertical;"></textarea>
+                <div style="color: #666; font-size: 12px; margin-top: 5px;">
+                    This prompt will be sent to the generation model along with sample data
+                </div>
+            </div>
+
+            <!-- Actions -->
+            <div style="display: flex; gap: 10px; margin-bottom: 25px;">
+                <button onclick="generatePrompt()" style="padding: 10px 20px; background: #6a4a7e; color: white; border: none; border-radius: 4px; cursor: pointer; font-weight: 500;">
+                    Generate Prompt
+                </button>
+                <button onclick="closePromptGenerator()" style="padding: 10px 20px; background: #5a2a2a; color: white; border: none; border-radius: 4px; cursor: pointer;">
+                    Close
+                </button>
+            </div>
+
+            <!-- Generated Output -->
+            <div id="gen-output-section" style="display: none;">
+                <h3 style="color: #4a9eff; margin-bottom: 15px;">Generated Judge Prompt</h3>
+                <textarea id="gen-output" readonly
+                    style="width: 100%; min-height: 300px; padding: 15px; background: #0f0f0f; color: #4a9eff; border: 1px solid #4a9eff; border-radius: 4px; font-size: 13px; font-family: 'Courier New', monospace; resize: vertical;"></textarea>
+                <div style="margin-top: 10px;">
+                    <button onclick="copyGeneratedPrompt()" style="padding: 8px 16px; background: #2a7c4a; color: white; border: none; border-radius: 4px; cursor: pointer;">
+                        Copy to Clipboard
+                    </button>
+                </div>
+            </div>
+        </div>
+    </div>
+
     <script>
         let judges = [];
         let editingIndex = null;
@@ -489,6 +563,113 @@ Respond in JSON format: {'correct': true} or {'correct': false}`
                 console.log('Not changing prompt - user has edited it');
             }
         });
+
+        // === PROMPT GENERATOR ===
+
+        const DEFAULT_META_PROMPT = `You are an expert at creating evaluation prompts for judging AI model outputs. I'm building a specialized judge prompt to evaluate the quality/similarity of weak model outputs compared to strong reference model outputs.
+
+I will show you some sample data below. Each sample contains:
+- A question/input
+- The strong reference model's output (ground truth)
+- The weak model's output (what we're evaluating)
+
+Your task: Create a specialized, detailed judge prompt that can be used to systematically evaluate the delta/difference between these outputs. The prompt should:
+1. Be specific to the patterns you see in this data
+2. Include clear evaluation criteria
+3. Be written in second-person ("You are...")
+4. Include the placeholders {question}, {strong_output}, and {weak_output}
+5. Specify the exact JSON format to return (either {'score': number} for scalar or {'correct': boolean} for boolean)
+
+Sample Data:
+{SAMPLES}
+
+Based on these samples, create a specialized judge prompt that would effectively evaluate this type of data. Return ONLY the judge prompt text, nothing else.`;
+
+        async function openPromptGenerator() {
+            // Load weak model files
+            try {
+                const response = await fetch('/list_weak_models');
+                const data = await response.json();
+                const select = document.getElementById('gen-weak-model-select');
+
+                if (data.files && data.files.length > 0) {
+                    select.innerHTML = data.files.map(f =>
+                        `<option value="${f.filename}">${f.weak_model || f.filename}</option>`
+                    ).join('');
+                } else {
+                    select.innerHTML = '<option value="">No weak model files available</option>';
+                }
+            } catch (error) {
+                console.error('Error loading weak models:', error);
+            }
+
+            // Set default meta-prompt
+            document.getElementById('gen-meta-prompt').value = DEFAULT_META_PROMPT;
+
+            // Show panel
+            document.getElementById('prompt-generator-panel').style.display = 'block';
+            document.getElementById('gen-output-section').style.display = 'none';
+        }
+
+        function closePromptGenerator() {
+            document.getElementById('prompt-generator-panel').style.display = 'none';
+        }
+
+        async function generatePrompt() {
+            const weakModelFile = document.getElementById('gen-weak-model-select').value;
+            const numSamples = parseInt(document.getElementById('gen-num-samples').value) || 3;
+            const model = document.getElementById('gen-model').value.trim();
+            const metaPrompt = document.getElementById('gen-meta-prompt').value.trim();
+
+            if (!weakModelFile) {
+                alert('Please select a weak model dataset');
+                return;
+            }
+
+            if (!model) {
+                alert('Please enter a generation model');
+                return;
+            }
+
+            if (!metaPrompt) {
+                alert('Please enter a meta-prompt');
+                return;
+            }
+
+            try {
+                const response = await fetch('/generate_judge_prompt', {
+                    method: 'POST',
+                    headers: { 'Content-Type': 'application/json' },
+                    body: JSON.stringify({
+                        weak_model_file: weakModelFile,
+                        num_samples: numSamples,
+                        model: model,
+                        meta_prompt: metaPrompt
+                    })
+                });
+
+                if (!response.ok) {
+                    throw new Error('Failed to generate prompt');
+                }
+
+                const result = await response.json();
+
+                // Display generated prompt
+                document.getElementById('gen-output').value = result.generated_prompt;
+                document.getElementById('gen-output-section').style.display = 'block';
+
+            } catch (error) {
+                alert('Error generating prompt: ' + error.message);
+                console.error('Generation error:', error);
+            }
+        }
+
+        function copyGeneratedPrompt() {
+            const output = document.getElementById('gen-output');
+            output.select();
+            document.execCommand('copy');
+            alert('Prompt copied to clipboard!');
+        }
     </script>
 </body>
 </html>
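The meta-prompt above constrains what a generated judge prompt should look like: second person, the three placeholders, and an explicit JSON return format. A hypothetical example satisfying that contract (illustrative only; the real output depends on the model and the samples shown to it):

```python
# Hypothetical generated judge prompt; the placeholders are filled in by /test_judge.
EXAMPLE_JUDGE_PROMPT = """You are grading how closely a weak model's answer matches a strong reference answer.

Question:
{question}

Reference answer (strong model):
{strong_output}

Candidate answer (weak model):
{weak_output}

Evaluate factual agreement and completeness of the candidate against the reference.
Respond only with JSON in the form {'score': <number between 0 and 1>}."""
```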