quickdistill 0.1.5__tar.gz → 0.1.7__tar.gz
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- {quickdistill-0.1.5/quickdistill.egg-info → quickdistill-0.1.7}/PKG-INFO +1 -1
- {quickdistill-0.1.5 → quickdistill-0.1.7}/pyproject.toml +1 -1
- {quickdistill-0.1.5 → quickdistill-0.1.7}/quickdistill/__init__.py +1 -1
- {quickdistill-0.1.5 → quickdistill-0.1.7}/quickdistill/__pycache__/__init__.cpython-310.pyc +0 -0
- quickdistill-0.1.7/quickdistill/__pycache__/server.cpython-310.pyc +0 -0
- {quickdistill-0.1.5 → quickdistill-0.1.7}/quickdistill/server.py +330 -14
- {quickdistill-0.1.5 → quickdistill-0.1.7}/quickdistill/static/judge_manager.html +183 -16
- {quickdistill-0.1.5 → quickdistill-0.1.7}/quickdistill/static/trace_viewer.html +787 -13
- {quickdistill-0.1.5 → quickdistill-0.1.7/quickdistill.egg-info}/PKG-INFO +1 -1
- {quickdistill-0.1.5 → quickdistill-0.1.7}/quickdistill.egg-info/SOURCES.txt +1 -0
- quickdistill-0.1.7/update.sh +111 -0
- quickdistill-0.1.5/quickdistill/__pycache__/server.cpython-310.pyc +0 -0
- {quickdistill-0.1.5 → quickdistill-0.1.7}/.pycommands +0 -0
- {quickdistill-0.1.5 → quickdistill-0.1.7}/README.md +0 -0
- {quickdistill-0.1.5 → quickdistill-0.1.7}/dev/generate_test_traces.py +0 -0
- {quickdistill-0.1.5 → quickdistill-0.1.7}/dev/get_call.py +0 -0
- {quickdistill-0.1.5 → quickdistill-0.1.7}/dev/get_traces.py +0 -0
- {quickdistill-0.1.5 → quickdistill-0.1.7}/dev/inference_server.py +0 -0
- {quickdistill-0.1.5 → quickdistill-0.1.7}/dev/judge_manager.html +0 -0
- {quickdistill-0.1.5 → quickdistill-0.1.7}/dev/judges.json +0 -0
- {quickdistill-0.1.5 → quickdistill-0.1.7}/dev/old/TEST_TRACE_GENERATION.md +0 -0
- {quickdistill-0.1.5 → quickdistill-0.1.7}/dev/old/traces_data.json +0 -0
- {quickdistill-0.1.5 → quickdistill-0.1.7}/dev/projects/byyoung3_arena-detailed/traces_data.json +0 -0
- {quickdistill-0.1.5 → quickdistill-0.1.7}/dev/projects/byyoung3_claude-opus-4-1-tutorial/traces_data.json +0 -0
- {quickdistill-0.1.5 → quickdistill-0.1.7}/dev/projects/byyoung3_test-financial-qa/traces_data.json +0 -0
- {quickdistill-0.1.5 → quickdistill-0.1.7}/dev/pystatus +0 -0
- {quickdistill-0.1.5 → quickdistill-0.1.7}/dev/run_evaluation.py +0 -0
- {quickdistill-0.1.5 → quickdistill-0.1.7}/dev/run_weak_models.py +0 -0
- {quickdistill-0.1.5 → quickdistill-0.1.7}/dev/strong_exports/anthropic_claude-3.5-sonnet_10traces_v2.json +0 -0
- {quickdistill-0.1.5 → quickdistill-0.1.7}/dev/strong_exports/anthropic_claude-3.5-sonnet_20traces.json +0 -0
- {quickdistill-0.1.5 → quickdistill-0.1.7}/dev/strong_exports/claude-opus-4-1-20250805_1traces.json +0 -0
- {quickdistill-0.1.5 → quickdistill-0.1.7}/dev/strong_exports/gpt-5-2025-08-07_199traces.json +0 -0
- {quickdistill-0.1.5 → quickdistill-0.1.7}/dev/trace_viewer.html +0 -0
- {quickdistill-0.1.5 → quickdistill-0.1.7}/dev/traces_data.json +0 -0
- {quickdistill-0.1.5 → quickdistill-0.1.7}/dev/weak_model_google_gemini-2.5-flash.json +0 -0
- {quickdistill-0.1.5 → quickdistill-0.1.7}/dev/weak_model_meta-llama_Llama-3.1-8B-Instruct.json +0 -0
- {quickdistill-0.1.5 → quickdistill-0.1.7}/dev/weak_model_meta-llama_Llama-3.3-70B-Instruct.json +0 -0
- {quickdistill-0.1.5 → quickdistill-0.1.7}/dev/weak_model_openai_gpt-oss-20b.json +0 -0
- {quickdistill-0.1.5 → quickdistill-0.1.7}/quickdistill/__pycache__/cli.cpython-310.pyc +0 -0
- {quickdistill-0.1.5 → quickdistill-0.1.7}/quickdistill/__pycache__/get_traces.cpython-310.pyc +0 -0
- {quickdistill-0.1.5 → quickdistill-0.1.7}/quickdistill/cli.py +0 -0
- {quickdistill-0.1.5 → quickdistill-0.1.7}/quickdistill/default_judges.json +0 -0
- {quickdistill-0.1.5 → quickdistill-0.1.7}/quickdistill/default_projects/byyoung3_arena-detailed/traces_data.json +0 -0
- {quickdistill-0.1.5 → quickdistill-0.1.7}/quickdistill/get_traces.py +0 -0
- {quickdistill-0.1.5 → quickdistill-0.1.7}/quickdistill.egg-info/dependency_links.txt +0 -0
- {quickdistill-0.1.5 → quickdistill-0.1.7}/quickdistill.egg-info/entry_points.txt +0 -0
- {quickdistill-0.1.5 → quickdistill-0.1.7}/quickdistill.egg-info/requires.txt +0 -0
- {quickdistill-0.1.5 → quickdistill-0.1.7}/quickdistill.egg-info/top_level.txt +0 -0
- {quickdistill-0.1.5 → quickdistill-0.1.7}/setup.cfg +0 -0
Binary file

Binary file

quickdistill/server.py
@@ -30,8 +30,28 @@ if default_project_src.exists() and not default_project_dst.exists():
 app = Flask(__name__, static_folder=str(STATIC_DIR))
 CORS(app)
 
-#
-
+# Progress tracking for long-running operations
+progress_state = {}
+
+# Load settings
+SETTINGS_FILE = DATA_DIR / 'settings.json'
+DEFAULT_SETTINGS = {
+    'inference_project': 'wandb_fc/quickstart_playground',
+    'evaluation_project': 'wandb_inference'
+}
+
+def load_settings():
+    if SETTINGS_FILE.exists():
+        with open(SETTINGS_FILE, 'r') as f:
+            return {**DEFAULT_SETTINGS, **json.load(f)}
+    return DEFAULT_SETTINGS.copy()
+
+def save_settings(settings):
+    with open(SETTINGS_FILE, 'w') as f:
+        json.dump(settings, f, indent=2)
+
+settings = load_settings()
+PROJECT = settings['evaluation_project']
 
 weave.init(PROJECT)
 
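Note: the new loader merges the user's settings.json over the built-in defaults with dict unpacking, so a file that sets only one key leaves the others at their defaults. A minimal sketch of that merge behavior, with the override value invented for illustration:

DEFAULT_SETTINGS = {
    'inference_project': 'wandb_fc/quickstart_playground',
    'evaluation_project': 'wandb_inference'
}

# A hypothetical settings.json that overrides only the evaluation project
user_settings = {'evaluation_project': 'my-team/my-evals'}

# Later keys win, so unspecified keys fall back to the defaults
merged = {**DEFAULT_SETTINGS, **user_settings}
assert merged['inference_project'] == 'wandb_fc/quickstart_playground'
assert merged['evaluation_project'] == 'my-team/my-evals'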
@@ -42,7 +62,7 @@ def create_client():
         api_key=os.getenv("WANDB_API_KEY"),
         project=PROJECT,
         default_headers={
-            "OpenAI-Project":
+            "OpenAI-Project": settings['inference_project']
         }
     )
 
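For context, create_client builds an OpenAI-compatible client and the change above stops hard-coding the OpenAI-Project header, reading it from the new settings instead. A rough, hedged sketch of the surrounding call; the client class, base URL, and overall shape are assumptions, only the keyword arguments shown in the diff are taken from the source:

import os
from openai import OpenAI  # assumed dependency; the diff shows only the keyword arguments

def create_client(settings):
    """Hypothetical reconstruction of create_client for illustration only."""
    return OpenAI(
        base_url="https://api.inference.wandb.ai/v1",   # assumed endpoint, not shown in the diff
        api_key=os.getenv("WANDB_API_KEY"),
        project=settings['evaluation_project'],         # PROJECT in the diffed code
        default_headers={
            # previously a hard-coded value; now read from settings.json
            "OpenAI-Project": settings['inference_project'],
        },
    )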
@@ -152,6 +172,7 @@ def run_inference_endpoint():
     models = data.get('models', [])
     strong_export_file = data.get('strong_export_file')
     num_examples = data.get('num_examples')
+    task_id = data.get('task_id', f"inference_{id(models)}")
 
     if not models:
         return jsonify({'error': 'No models provided'}), 400
@@ -170,14 +191,24 @@ def run_inference_endpoint():
     if not traces:
         return jsonify({'error': 'No traces in export file'}), 400
 
-    # Limit traces to num_examples
+    # Limit traces to num_examples (convert to int if needed)
     if num_examples:
+        num_examples = int(num_examples)
         traces = traces[:num_examples]
 
     output_files = []
 
+    # Initialize progress tracking
+    total_steps = len(models) * len(traces)
+    progress_state[task_id] = {
+        'current': 0,
+        'total': total_steps,
+        'message': 'Starting inference...',
+        'status': 'running'
+    }
+
     # Run inference for each model
-    for model in models:
+    for model_idx, model in enumerate(models):
         print(f"Running model: {model}")
         results = []
 
@@ -185,6 +216,13 @@ def run_inference_endpoint():
         client = get_client_for_model(model)
 
         for i, trace in enumerate(traces):
+            step = model_idx * len(traces) + i + 1
+            progress_state[task_id] = {
+                'current': step,
+                'total': total_steps,
+                'message': f'[{model_idx+1}/{len(models)}] {model} - Example {i+1}/{len(traces)}',
+                'status': 'running'
+            }
             print(f" Processing example {i+1}/{len(traces)}...", end=' ')
 
             # Extract messages
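Each iteration overwrites progress_state[task_id], and the new GET /progress/<task_id> route added later in this diff returns that dict as-is. For a hypothetical run of two models over 50 traces, a poll partway through the second model would see something like the following; the numbers and model name are invented to match the message format above:

# Illustrative shape of progress_state[task_id] as served by GET /progress/<task_id>
example_progress = {
    'current': 62,        # model_idx * len(traces) + i + 1
    'total': 100,         # len(models) * len(traces)
    'message': '[2/2] meta-llama/Llama-3.1-8B-Instruct - Example 12/50',
    'status': 'running',  # set to 'complete' once all results are saved
}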
@@ -239,13 +277,265 @@ def run_inference_endpoint():
         output_files.append(str(output_file))
         print(f"Saved {len(results)} results to {output_file}")
 
+    # Mark progress as complete
+    progress_state[task_id] = {
+        'current': total_steps,
+        'total': total_steps,
+        'message': 'Complete!',
+        'status': 'complete'
+    }
+
     return jsonify({
         'status': 'success',
         'files': output_files,
         'total_examples': len(traces),
-        'models_run': len(models)
+        'models_run': len(models),
+        'task_id': task_id
+    })
+
+@app.route('/progress/<task_id>', methods=['GET'])
+def get_progress(task_id):
+    """Get progress for a running task"""
+    if task_id in progress_state:
+        return jsonify(progress_state[task_id])
+    return jsonify({'error': 'Task not found'}), 404
+
+
+@app.route('/settings', methods=['GET'])
+def get_settings():
+    """Get current settings"""
+    return jsonify(settings)
+
+
+@app.route('/settings', methods=['POST'])
+def update_settings():
+    """Update settings"""
+    global settings
+    data = request.json
+    settings.update(data)
+    save_settings(settings)
+    return jsonify({'status': 'success', 'settings': settings})
+
+
+@app.route('/test_judge', methods=['POST'])
+def test_judge():
+    """Test a judge on sample data to see raw inputs/outputs"""
+    data = request.json
+    judge = data.get('judge')
+    weak_model_file = data.get('weak_model_file')
+    num_samples = data.get('num_samples', 5)
+
+    if not judge or not weak_model_file:
+        return jsonify({'error': 'Missing judge or weak_model_file'}), 400
+
+    # Load weak model results
+    model_path = DATA_DIR / weak_model_file
+    with open(model_path, 'r') as f:
+        file_data = json.load(f)
+
+    # Handle both formats
+    if isinstance(file_data, dict) and 'results' in file_data:
+        results = file_data['results']
+    else:
+        results = file_data
+
+    # Limit to num_samples
+    samples_to_test = results[:min(num_samples, len(results))]
+
+    test_results = []
+
+    for example in samples_to_test:
+        # Skip examples with errors
+        if example.get('error') or not example.get('output'):
+            continue
+
+        strong_output = example.get('strong_model_output', '')
+        weak_output = example.get('output', '')
+
+        # Extract question
+        question = ""
+        messages = example.get('messages', [])
+        if messages and len(messages) > 0:
+            question = messages[0].get('content', '')
+
+        # Build the prompt
+        prompt = judge['prompt']
+        if '{question}' in prompt:
+            prompt = prompt.replace('{question}', question or '')
+        if '{strong_output}' in prompt:
+            prompt = prompt.replace('{strong_output}', strong_output or '')
+        if '{weak_output}' in prompt:
+            prompt = prompt.replace('{weak_output}', weak_output or '')
+
+        # Run the judge and capture raw response
+        if judge['type'] == 'llm':
+            return_type = judge.get('returnType', 'scalar')
+
+            # Use a list to capture the raw response (mutable so we can access from closure)
+            captured_raw = []
+
+            def score_parser(response: str):
+                """Parse the judge response based on return type"""
+                # Capture the raw response before any processing
+                captured_raw.append(response)
+
+                response = response.strip()
+
+                # Remove markdown code blocks if present
+                if response.startswith('```'):
+                    # Remove ```json or ``` at start
+                    response = response.split('\n', 1)[1] if '\n' in response else response[3:]
+                    # Remove ``` at end
+                    if response.endswith('```'):
+                        response = response.rsplit('\n', 1)[0] if '\n' in response else response[:-3]
+                    response = response.strip()
+
+                try:
+                    # Parse JSON response
+                    parsed = json.loads(response)
+
+                    if return_type == 'boolean':
+                        # Extract boolean value - return just the bool
+                        val = parsed.get('correct', parsed.get('result', parsed.get('value', False)))
+                        return bool(val)
+                    elif return_type == 'scalar':
+                        # Extract numeric score - return just the number
+                        val = parsed.get('score', parsed.get('scores', parsed.get('value', 0)))
+                        return float(val) if isinstance(val, (int, float)) else 0
+                    else:
+                        # Unsupported return type
+                        print(f"Unsupported return type: {return_type}")
+                        return 0
+                except:
+                    print(f"Failed to parse judge response as JSON: {response}")
+                    if return_type == 'scalar':
+                        return 0
+                    elif return_type == 'boolean':
+                        return False
+                    else:
+                        return 0
+
+            # Use LLMAsAJudge exactly like the evaluation code
+            try:
+                # Initialize LLMAsAJudge with custom prompt
+                judge_instance = LLMAsAJudge(
+                    models=[judge['model']],
+                    use_fully_custom_prompt=True,
+                    output_parser=score_parser,
+                    return_type=return_type if return_type else None
+                )
+
+                # Get judgment
+                result = judge_instance.judge(prompt=prompt)
+
+                # Extract the raw response that was captured
+                raw_text = captured_raw[0] if captured_raw else "No response captured"
+
+                # Extract parsed scores from result
+                if return_type == 'scalar':
+                    score_val = result.get('scores', result.get('correct', 0))
+                    parsed_scores = {'score': score_val}
+                elif return_type == 'boolean':
+                    bool_val = result.get('correct', False)
+                    parsed_scores = {'correct': bool_val}
+                else:
+                    # Unsupported return type - default to scalar
+                    score_val = result.get('scores', result.get('correct', 0))
+                    parsed_scores = {'score': score_val}
+
+            except Exception as e:
+                raw_text = f"Error: {str(e)}"
+                parsed_scores = {'error': str(e)}
+
+        test_results.append({
+            'question': question,
+            'strong_output': strong_output,
+            'weak_output': weak_output,
+            'judge_prompt': prompt,
+            'raw_response': raw_text,
+            'parsed_scores': parsed_scores
+        })
+
+    return jsonify({
+        'status': 'success',
+        'judge_name': judge['name'],
+        'num_samples': len(test_results),
+        'samples': test_results
     })
 
+
+@app.route('/generate_judge_prompt', methods=['POST'])
+def generate_judge_prompt():
+    """Generate a judge prompt using AI based on sample data"""
+    data = request.json
+    weak_model_file = data.get('weak_model_file')
+    num_samples = data.get('num_samples', 3)
+    model = data.get('model', 'openai/gpt-5')
+    meta_prompt = data.get('meta_prompt')
+
+    if not weak_model_file or not meta_prompt:
+        return jsonify({'error': 'Missing weak_model_file or meta_prompt'}), 400
+
+    # Load weak model results
+    model_path = DATA_DIR / weak_model_file
+    with open(model_path, 'r') as f:
+        file_data = json.load(f)
+
+    # Handle both formats
+    if isinstance(file_data, dict) and 'results' in file_data:
+        results = file_data['results']
+    else:
+        results = file_data
+
+    # Limit to num_samples
+    samples_to_use = results[:min(num_samples, len(results))]
+
+    # Format samples for meta-prompt
+    samples_text = []
+    for i, example in enumerate(samples_to_use):
+        # Skip examples with errors
+        if example.get('error') or not example.get('output'):
+            continue
+
+        strong_output = example.get('strong_model_output', '')
+        weak_output = example.get('output', '')
+
+        # Extract question
+        question = ""
+        messages = example.get('messages', [])
+        if messages and len(messages) > 0:
+            question = messages[0].get('content', '')
+
+        samples_text.append(f"""Sample {i+1}:
+Question: {question}
+Strong Model Output: {strong_output}
+Weak Model Output: {weak_output}
+---""")
+
+    samples_formatted = "\n\n".join(samples_text)
+
+    # Replace {SAMPLES} placeholder in meta-prompt
+    final_prompt = meta_prompt.replace('{SAMPLES}', samples_formatted)
+
+    # Call OpenRouter to generate the prompt
+    try:
+        client = create_openrouter_client()
+        response = client.chat.completions.create(
+            model=model,
+            messages=[{"role": "user", "content": final_prompt}]
+        )
+        generated_prompt = response.choices[0].message.content.strip()
+
+        return jsonify({
+            'status': 'success',
+            'generated_prompt': generated_prompt,
+            'num_samples_used': len(samples_text)
+        })
+
+    except Exception as e:
+        return jsonify({'error': f'Failed to generate prompt: {str(e)}'}), 500
+
+
 @app.route('/list_weak_models', methods=['GET'])
 def list_weak_models():
     """List available weak model result files with metadata"""
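The routes added above give the judge manager a small JSON API. A hedged sketch of how a client might exercise /settings and /test_judge; the host, port, judge fields, and file name are illustrative, while the payload keys mirror what the handlers read:

import requests  # assumed HTTP client; any client works

BASE = "http://localhost:5000"  # assumed host and port, not shown in this diff

# Point inference at a different Weave project (keys mirror DEFAULT_SETTINGS)
requests.post(f"{BASE}/settings", json={"inference_project": "my-team/my-project"})

# Dry-run a judge against a few weak-model results to inspect raw prompts and responses
judge = {
    "name": "correctness",          # illustrative values; the handler reads these keys
    "type": "llm",
    "model": "openai/gpt-5",
    "returnType": "boolean",
    "prompt": "Question: {question}\nReference: {strong_output}\nAnswer: {weak_output}\n"
              "Reply with JSON: {\"correct\": true}",
}
resp = requests.post(f"{BASE}/test_judge", json={
    "judge": judge,
    "weak_model_file": "weak_model_openai_gpt-oss-20b.json",  # illustrative file name
    "num_samples": 2,
})
for sample in resp.json()["samples"]:
    print(sample["parsed_scores"], sample["raw_response"][:80])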
@@ -469,27 +759,38 @@ def run_evaluation_endpoint():
     data = request.json
     model_file = data.get('model_file')
     judge = data.get('judge')
+    task_id = data.get('task_id', f"eval_{id(data)}")
 
     if not model_file or not judge:
         return jsonify({'error': 'Missing model_file or judge'}), 400
 
     # Load weak model results
-
-
+    model_path = DATA_DIR / model_file
+    with open(model_path, 'r') as f:
+        file_data = json.load(f)
 
     # Handle both old format (list) and new format (dict with metadata)
-    if isinstance(
-        metadata =
-        results =
+    if isinstance(file_data, dict) and 'results' in file_data:
+        metadata = file_data.get('metadata', {})
+        results = file_data['results']
         strong_export = metadata.get('strong_export_file', 'unknown')
     else:
         # Old format - just a list
-        results =
+        results = file_data
         strong_export = 'unknown'
 
     # Extract model name from filename
     model_name = model_file.replace('weak_model_', '').replace('.json', '')
 
+    # Initialize progress tracking
+    total_steps = len(results)
+    progress_state[task_id] = {
+        'current': 0,
+        'total': total_steps,
+        'message': f'Starting evaluation: {model_name} with {judge["name"]}...',
+        'status': 'running'
+    }
+
     # Create evaluation logger
     ev = weave.EvaluationLogger(
         name=f"eval-{model_name}-{judge['name']}",
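Both test_judge and run_evaluation_endpoint now accept two result-file shapes: the old format, a bare list of examples, and the new one, a dict carrying metadata plus results. A sketch of the newer shape with invented values; only the keys shown here are actually read by the diffed handlers:

# Hypothetical weak-model result file in the newer format
weak_model_results = {
    "metadata": {
        "strong_export_file": "gpt-5-2025-08-07_199traces.json",  # illustrative
    },
    "results": [
        {
            "messages": [{"role": "user", "content": "What is 2 + 2?"}],
            "output": "4",                       # weak model's answer
            "strong_model_output": "2 + 2 = 4",  # reference answer from the strong model
            "error": None,                       # examples with errors are skipped
        },
    ],
}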
@@ -497,7 +798,13 @@ def run_evaluation_endpoint():
     )
 
     # Run evaluation
-    for example in results:
+    for idx, example in enumerate(results):
+        progress_state[task_id] = {
+            'current': idx + 1,
+            'total': total_steps,
+            'message': f'{model_name} - Example {idx+1}/{total_steps}',
+            'status': 'running'
+        }
         # Skip examples with errors (null messages/output)
         if example.get('error') or not example.get('output'):
             continue
@@ -533,12 +840,21 @@ def run_evaluation_endpoint():
     # Finish evaluation
     ev.log_summary()
 
+    # Mark progress as complete
+    progress_state[task_id] = {
+        'current': total_steps,
+        'total': total_steps,
+        'message': 'Complete!',
+        'status': 'complete'
+    }
+
     return jsonify({
         'status': 'success',
         'evaluation_name': f"eval-{model_name}-{judge['name']}",
         'examples_evaluated': len(results),
         'weave_url': ev.ui_url,
-        'strong_export': strong_export
+        'strong_export': strong_export,
+        'task_id': task_id
     })
 
 
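Putting the evaluation changes together: a client can pass its own task_id to the evaluation endpoint and poll GET /progress/<task_id> while the run is in flight. The evaluation route's path and the server address are not visible in this diff, so they appear below as labeled placeholders; since the handler is synchronous, the request is issued from a background thread in this sketch:

import threading
import time
import requests  # assumed HTTP client

BASE = "http://localhost:5000"       # assumed host and port
RUN_EVAL_PATH = "/run_evaluation"    # placeholder; the route path is not shown in this diff
task_id = "eval_demo"                # client-chosen id, echoed back in the response

def start_evaluation():
    requests.post(f"{BASE}{RUN_EVAL_PATH}", json={
        "model_file": "weak_model_google_gemini-2.5-flash.json",  # illustrative file name
        "judge": {"name": "correctness", "type": "llm", "model": "openai/gpt-5",
                  "returnType": "scalar", "prompt": "..."},       # illustrative judge
        "task_id": task_id,
    })

# Run the (synchronous) evaluation request in the background, then poll for progress.
threading.Thread(target=start_evaluation, daemon=True).start()

while True:
    resp = requests.get(f"{BASE}/progress/{task_id}")
    if resp.ok:
        state = resp.json()
        print(f"{state['current']}/{state['total']} - {state['message']}")
        if state["status"] == "complete":
            break
    time.sleep(1)  # add a timeout or retry limit in real use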