quickdistill 0.1.5.tar.gz → 0.1.6.tar.gz
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- {quickdistill-0.1.5/quickdistill.egg-info → quickdistill-0.1.6}/PKG-INFO +1 -1
- {quickdistill-0.1.5 → quickdistill-0.1.6}/pyproject.toml +1 -1
- {quickdistill-0.1.5 → quickdistill-0.1.6}/quickdistill/__init__.py +1 -1
- {quickdistill-0.1.5 → quickdistill-0.1.6}/quickdistill/__pycache__/__init__.cpython-310.pyc +0 -0
- quickdistill-0.1.6/quickdistill/__pycache__/server.cpython-310.pyc +0 -0
- {quickdistill-0.1.5 → quickdistill-0.1.6}/quickdistill/server.py +74 -10
- {quickdistill-0.1.5 → quickdistill-0.1.6}/quickdistill/static/judge_manager.html +2 -16
- {quickdistill-0.1.5 → quickdistill-0.1.6}/quickdistill/static/trace_viewer.html +73 -13
- {quickdistill-0.1.5 → quickdistill-0.1.6/quickdistill.egg-info}/PKG-INFO +1 -1
- quickdistill-0.1.5/quickdistill/__pycache__/server.cpython-310.pyc +0 -0
- {quickdistill-0.1.5 → quickdistill-0.1.6}/.pycommands +0 -0
- {quickdistill-0.1.5 → quickdistill-0.1.6}/README.md +0 -0
- {quickdistill-0.1.5 → quickdistill-0.1.6}/dev/generate_test_traces.py +0 -0
- {quickdistill-0.1.5 → quickdistill-0.1.6}/dev/get_call.py +0 -0
- {quickdistill-0.1.5 → quickdistill-0.1.6}/dev/get_traces.py +0 -0
- {quickdistill-0.1.5 → quickdistill-0.1.6}/dev/inference_server.py +0 -0
- {quickdistill-0.1.5 → quickdistill-0.1.6}/dev/judge_manager.html +0 -0
- {quickdistill-0.1.5 → quickdistill-0.1.6}/dev/judges.json +0 -0
- {quickdistill-0.1.5 → quickdistill-0.1.6}/dev/old/TEST_TRACE_GENERATION.md +0 -0
- {quickdistill-0.1.5 → quickdistill-0.1.6}/dev/old/traces_data.json +0 -0
- {quickdistill-0.1.5 → quickdistill-0.1.6}/dev/projects/byyoung3_arena-detailed/traces_data.json +0 -0
- {quickdistill-0.1.5 → quickdistill-0.1.6}/dev/projects/byyoung3_claude-opus-4-1-tutorial/traces_data.json +0 -0
- {quickdistill-0.1.5 → quickdistill-0.1.6}/dev/projects/byyoung3_test-financial-qa/traces_data.json +0 -0
- {quickdistill-0.1.5 → quickdistill-0.1.6}/dev/pystatus +0 -0
- {quickdistill-0.1.5 → quickdistill-0.1.6}/dev/run_evaluation.py +0 -0
- {quickdistill-0.1.5 → quickdistill-0.1.6}/dev/run_weak_models.py +0 -0
- {quickdistill-0.1.5 → quickdistill-0.1.6}/dev/strong_exports/anthropic_claude-3.5-sonnet_10traces_v2.json +0 -0
- {quickdistill-0.1.5 → quickdistill-0.1.6}/dev/strong_exports/anthropic_claude-3.5-sonnet_20traces.json +0 -0
- {quickdistill-0.1.5 → quickdistill-0.1.6}/dev/strong_exports/claude-opus-4-1-20250805_1traces.json +0 -0
- {quickdistill-0.1.5 → quickdistill-0.1.6}/dev/strong_exports/gpt-5-2025-08-07_199traces.json +0 -0
- {quickdistill-0.1.5 → quickdistill-0.1.6}/dev/trace_viewer.html +0 -0
- {quickdistill-0.1.5 → quickdistill-0.1.6}/dev/traces_data.json +0 -0
- {quickdistill-0.1.5 → quickdistill-0.1.6}/dev/weak_model_google_gemini-2.5-flash.json +0 -0
- {quickdistill-0.1.5 → quickdistill-0.1.6}/dev/weak_model_meta-llama_Llama-3.1-8B-Instruct.json +0 -0
- {quickdistill-0.1.5 → quickdistill-0.1.6}/dev/weak_model_meta-llama_Llama-3.3-70B-Instruct.json +0 -0
- {quickdistill-0.1.5 → quickdistill-0.1.6}/dev/weak_model_openai_gpt-oss-20b.json +0 -0
- {quickdistill-0.1.5 → quickdistill-0.1.6}/quickdistill/__pycache__/cli.cpython-310.pyc +0 -0
- {quickdistill-0.1.5 → quickdistill-0.1.6}/quickdistill/__pycache__/get_traces.cpython-310.pyc +0 -0
- {quickdistill-0.1.5 → quickdistill-0.1.6}/quickdistill/cli.py +0 -0
- {quickdistill-0.1.5 → quickdistill-0.1.6}/quickdistill/default_judges.json +0 -0
- {quickdistill-0.1.5 → quickdistill-0.1.6}/quickdistill/default_projects/byyoung3_arena-detailed/traces_data.json +0 -0
- {quickdistill-0.1.5 → quickdistill-0.1.6}/quickdistill/get_traces.py +0 -0
- {quickdistill-0.1.5 → quickdistill-0.1.6}/quickdistill.egg-info/SOURCES.txt +0 -0
- {quickdistill-0.1.5 → quickdistill-0.1.6}/quickdistill.egg-info/dependency_links.txt +0 -0
- {quickdistill-0.1.5 → quickdistill-0.1.6}/quickdistill.egg-info/entry_points.txt +0 -0
- {quickdistill-0.1.5 → quickdistill-0.1.6}/quickdistill.egg-info/requires.txt +0 -0
- {quickdistill-0.1.5 → quickdistill-0.1.6}/quickdistill.egg-info/top_level.txt +0 -0
- {quickdistill-0.1.5 → quickdistill-0.1.6}/setup.cfg +0 -0
Binary file

Binary file
{quickdistill-0.1.5 → quickdistill-0.1.6}/quickdistill/server.py +74 -10

@@ -3,6 +3,7 @@ import json
 import openai
 import weave
 import shutil
+import threading
 from flask import Flask, request, jsonify, send_from_directory
 from flask_cors import CORS
 from llmasajudge import LLMAsAJudge
@@ -30,6 +31,9 @@ if default_project_src.exists() and not default_project_dst.exists():
 app = Flask(__name__, static_folder=str(STATIC_DIR))
 CORS(app)

+# Progress tracking for long-running operations
+progress_state = {}
+
 # Configuration
 PROJECT = "wandb_inference"

@@ -152,6 +156,7 @@ def run_inference_endpoint():
     models = data.get('models', [])
     strong_export_file = data.get('strong_export_file')
     num_examples = data.get('num_examples')
+    task_id = data.get('task_id', f"inference_{id(models)}")

     if not models:
         return jsonify({'error': 'No models provided'}), 400
@@ -176,8 +181,17 @@ def run_inference_endpoint():

     output_files = []

+    # Initialize progress tracking
+    total_steps = len(models) * len(traces)
+    progress_state[task_id] = {
+        'current': 0,
+        'total': total_steps,
+        'message': 'Starting inference...',
+        'status': 'running'
+    }
+
     # Run inference for each model
-    for model in models:
+    for model_idx, model in enumerate(models):
         print(f"Running model: {model}")
         results = []

@@ -185,6 +199,13 @@ def run_inference_endpoint():
         client = get_client_for_model(model)

         for i, trace in enumerate(traces):
+            step = model_idx * len(traces) + i + 1
+            progress_state[task_id] = {
+                'current': step,
+                'total': total_steps,
+                'message': f'[{model_idx+1}/{len(models)}] {model} - Example {i+1}/{len(traces)}',
+                'status': 'running'
+            }
             print(f"  Processing example {i+1}/{len(traces)}...", end=' ')

             # Extract messages
@@ -239,13 +260,30 @@ def run_inference_endpoint():
         output_files.append(str(output_file))
         print(f"Saved {len(results)} results to {output_file}")

+    # Mark progress as complete
+    progress_state[task_id] = {
+        'current': total_steps,
+        'total': total_steps,
+        'message': 'Complete!',
+        'status': 'complete'
+    }
+
     return jsonify({
         'status': 'success',
         'files': output_files,
         'total_examples': len(traces),
-        'models_run': len(models)
+        'models_run': len(models),
+        'task_id': task_id
     })

+@app.route('/progress/<task_id>', methods=['GET'])
+def get_progress(task_id):
+    """Get progress for a running task"""
+    if task_id in progress_state:
+        return jsonify(progress_state[task_id])
+    return jsonify({'error': 'Task not found'}), 404
+
+
 @app.route('/list_weak_models', methods=['GET'])
 def list_weak_models():
     """List available weak model result files with metadata"""
@@ -469,27 +507,38 @@ def run_evaluation_endpoint():
     data = request.json
     model_file = data.get('model_file')
     judge = data.get('judge')
+    task_id = data.get('task_id', f"eval_{id(data)}")

     if not model_file or not judge:
         return jsonify({'error': 'Missing model_file or judge'}), 400

     # Load weak model results
-
-
+    model_path = DATA_DIR / model_file
+    with open(model_path, 'r') as f:
+        file_data = json.load(f)

     # Handle both old format (list) and new format (dict with metadata)
-    if isinstance(
-        metadata =
-        results =
+    if isinstance(file_data, dict) and 'results' in file_data:
+        metadata = file_data.get('metadata', {})
+        results = file_data['results']
         strong_export = metadata.get('strong_export_file', 'unknown')
     else:
         # Old format - just a list
-        results =
+        results = file_data
         strong_export = 'unknown'

     # Extract model name from filename
     model_name = model_file.replace('weak_model_', '').replace('.json', '')

+    # Initialize progress tracking
+    total_steps = len(results)
+    progress_state[task_id] = {
+        'current': 0,
+        'total': total_steps,
+        'message': f'Starting evaluation: {model_name} with {judge["name"]}...',
+        'status': 'running'
+    }
+
     # Create evaluation logger
     ev = weave.EvaluationLogger(
         name=f"eval-{model_name}-{judge['name']}",
@@ -497,7 +546,13 @@ def run_evaluation_endpoint():
     )

     # Run evaluation
-    for example in results:
+    for idx, example in enumerate(results):
+        progress_state[task_id] = {
+            'current': idx + 1,
+            'total': total_steps,
+            'message': f'{model_name} - Example {idx+1}/{total_steps}',
+            'status': 'running'
+        }
         # Skip examples with errors (null messages/output)
         if example.get('error') or not example.get('output'):
             continue
@@ -533,12 +588,21 @@ def run_evaluation_endpoint():
     # Finish evaluation
     ev.log_summary()

+    # Mark progress as complete
+    progress_state[task_id] = {
+        'current': total_steps,
+        'total': total_steps,
+        'message': 'Complete!',
+        'status': 'complete'
+    }
+
     return jsonify({
         'status': 'success',
         'evaluation_name': f"eval-{model_name}-{judge['name']}",
         'examples_evaluated': len(results),
         'weave_url': ev.ui_url,
-        'strong_export': strong_export
+        'strong_export': strong_export,
+        'task_id': task_id
     })
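The new GET /progress/<task_id> route returns the same {current, total, message, status} object the UI consumes (and 404 with {'error': 'Task not found'} for unknown ids), so it can also be exercised outside the bundled frontend. A minimal polling sketch, assuming the server is reachable at http://localhost:5000 and that the requests package is available (neither is specified by this diff):

    # Hypothetical client-side helper; base_url and the requests dependency
    # are assumptions, everything else mirrors the endpoint added above.
    import time
    import requests

    def wait_for_task(task_id, base_url="http://localhost:5000", interval=0.3):
        """Poll /progress/<task_id> until the task reports status 'complete'."""
        while True:
            resp = requests.get(f"{base_url}/progress/{task_id}")
            if resp.status_code == 404:
                # Task not registered yet, or unknown task_id
                time.sleep(interval)
                continue
            state = resp.json()
            print(f"{state['message']} ({state['current']}/{state['total']})")
            if state["status"] == "complete":
                return state
            time.sleep(interval)

Because the client chooses the task_id and sends it in the POST body (the frontend below uses inference_${Date.now()}), polling can begin before /run_inference or /run_evaluation returns.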
{quickdistill-0.1.5 → quickdistill-0.1.6}/quickdistill/static/judge_manager.html +2 -16

@@ -172,7 +172,6 @@
     <label for="judge-type">Judge Type</label>
     <select id="judge-type">
         <option value="llm">LLM-as-a-Judge</option>
-        <option value="custom">Custom Function</option>
     </select>

     <div id="llm-options" style="display: block;">
@@ -204,13 +203,6 @@
         <textarea id="judge-prompt"></textarea>
     </div>

-    <div id="custom-options" style="display: none;">
-        <label for="custom-function">Custom Function (Python)</label>
-        <textarea id="custom-function" placeholder="def custom_judge(strong_output: str, weak_output: str) -> dict:
-    # Your custom logic here
-    return {'similarity': 1.0 if strong_output == weak_output else 0.0}"></textarea>
-    </div>
-
     <button onclick="saveJudge()" id="save-btn">Save Judge</button>
     <button onclick="cancelEdit()" id="cancel-btn" style="display: none; background: #5a2a2a; margin-left: 10px;">Cancel</button>
 </div>
@@ -340,8 +332,6 @@ Respond in JSON format: {'correct': true} or {'correct': false}`
             alert('Error: Judge prompt must include {weak_output} placeholder');
             return;
         }
-    } else {
-        judge.customFunction = document.getElementById('custom-function').value.trim();
     }

     const success = await saveJudgeToServer(judge);
@@ -358,7 +348,6 @@ Respond in JSON format: {'correct': true} or {'correct': false}`
     document.getElementById('judge-type').value = 'llm';
     document.getElementById('judge-model').value = 'gpt-5-2025-08-07';
     document.getElementById('judge-prompt').value = '';
-    document.getElementById('custom-function').value = '';
     document.getElementById('form-title').textContent = 'Create New Judge';
     document.getElementById('save-btn').textContent = 'Save Judge';
     document.getElementById('cancel-btn').style.display = 'none';
@@ -383,8 +372,6 @@ Respond in JSON format: {'correct': true} or {'correct': false}`
     document.getElementById('judge-model').value = judge.model;
     document.getElementById('judge-return-type').value = judge.returnType || 'scalar';
     document.getElementById('judge-prompt').value = judge.prompt || '';
-    } else {
-        document.getElementById('custom-function').value = judge.customFunction || '';
     }

     document.getElementById('form-title').textContent = 'Edit Judge';
@@ -437,9 +424,8 @@ Respond in JSON format: {'correct': true} or {'correct': false}`

     // Toggle judge type options
     function toggleJudgeType() {
-
-        document.getElementById('llm-options').style.display =
-        document.getElementById('custom-options').style.display = type === 'custom' ? 'block' : 'none';
+        // Only LLM type is supported now
+        document.getElementById('llm-options').style.display = 'block';
     }

     document.getElementById('judge-type').addEventListener('change', toggleJudgeType);
{quickdistill-0.1.5 → quickdistill-0.1.6}/quickdistill/static/trace_viewer.html +73 -13

@@ -375,6 +375,9 @@

     <div id="inference-progress" style="display: none; margin-top: 20px; padding: 15px; background: #0f0f0f; border-radius: 4px;">
         <div style="color: #4a9eff; margin-bottom: 10px;">Running inference...</div>
+        <div id="inference-progress-bar" style="width: 100%; height: 6px; background: #2a2a2a; border-radius: 3px; margin-bottom: 15px; overflow: hidden;">
+            <div id="inference-progress-fill" style="height: 100%; background: #4a9eff; width: 0%; transition: width 0.3s;"></div>
+        </div>
         <div id="progress-text" style="color: #888; font-family: monospace; font-size: 12px; white-space: pre-wrap;"></div>
     </div>
 </div>
@@ -919,20 +922,46 @@
     // Show progress
     document.getElementById('inference-progress').style.display = 'block';
     const progressText = document.getElementById('progress-text');
+    const progressFill = document.getElementById('inference-progress-fill');
     progressText.textContent = `Starting inference...\n`;
-
-
-
+    progressFill.style.width = '0%';
+
+    // Start inference and poll for progress
+    let taskId = null;
+    let pollInterval = null;
+
+    const pollProgress = async () => {
+        if (!taskId) return;
+        try {
+            const resp = await fetch(`/progress/${taskId}`);
+            if (resp.ok) {
+                const progress = await resp.json();
+                const percent = (progress.current / progress.total) * 100;
+                progressFill.style.width = `${percent}%`;
+                progressText.textContent = `${progress.message}\nProgress: ${progress.current}/${progress.total} (${percent.toFixed(1)}%)\n`;
+            }
+        } catch (e) {
+            console.error('Error polling progress:', e);
+        }
+    };

     // Call backend API
     try {
+        // Generate a task ID for polling
+        taskId = `inference_${Date.now()}`;
+
+        // Start polling immediately
+        pollInterval = setInterval(pollProgress, 300);
+
+        // Start the inference
         const response = await fetch('/run_inference', {
             method: 'POST',
             headers: { 'Content-Type': 'application/json' },
             body: JSON.stringify({
                 models: allModels,
                 strong_export_file: strongExportFile,
-                num_examples: numExamples
+                num_examples: numExamples,
+                task_id: taskId
             })
         });

@@ -941,8 +970,12 @@
         }

         const result = await response.json();
-
-
+
+        // Stop polling
+        if (pollInterval) clearInterval(pollInterval);
+
+        progressText.textContent = `\n✓ Complete!\nResults saved to: ${result.files.join(', ')}\n`;
+        progressFill.style.width = '100%';

         setTimeout(() => {
             document.getElementById('inference-panel').style.display = 'none';
@@ -951,8 +984,7 @@

     } catch (error) {
         progressText.textContent += `\n✗ Error: ${error.message}\n`;
-
-        progressText.textContent += `Run: python inference_server.py\n`;
+        if (pollInterval) clearInterval(pollInterval);
     }
 });

@@ -1091,21 +1123,44 @@
     const modelFiles = Array.from(selectedEvalModels);
     const results = [];

-    // Run evaluations sequentially
+    // Run evaluations sequentially with granular progress
     for (let i = 0; i < modelFiles.length; i++) {
         const modelFile = modelFiles[i];
-        const progress = ((i) / modelFiles.length) * 100;
-        progressFill.style.width = `${progress}%`;

-        progressText.textContent += `[${i+1}/${modelFiles.length}]
+        progressText.textContent += `[${i+1}/${modelFiles.length}] Starting ${modelFile}...\n`;
+
+        let pollInterval = null;
+        let taskId = null;
+
+        const pollProgress = async () => {
+            if (!taskId) return;
+            try {
+                const resp = await fetch(`/progress/${taskId}`);
+                if (resp.ok) {
+                    const progress = await resp.json();
+                    const percent = (progress.current / progress.total) * 100;
+                    progressFill.style.width = `${percent}%`;
+                    progressText.textContent = `[${i+1}/${modelFiles.length}] ${progress.message}\nProgress: ${progress.current}/${progress.total} (${percent.toFixed(1)}%)\n`;
+                }
+            } catch (e) {
+                console.error('Error polling eval progress:', e);
+            }
+        };

         try {
+            // Generate task ID for this evaluation
+            taskId = `eval_${Date.now()}_${i}`;
+
+            // Start polling
+            pollInterval = setInterval(pollProgress, 300);
+
             const response = await fetch('/run_evaluation', {
                 method: 'POST',
                 headers: { 'Content-Type': 'application/json' },
                 body: JSON.stringify({
                     model_file: modelFile,
-                    judge: judge
+                    judge: judge,
+                    task_id: taskId
                 })
             });

@@ -1114,6 +1169,10 @@
             }

             const result = await response.json();
+
+            // Clear polling when done
+            if (pollInterval) clearInterval(pollInterval);
+
             progressText.textContent += `  ✓ Complete: ${result.evaluation_name}\n`;
             progressText.textContent += `  Examples: ${result.examples_evaluated}\n\n`;
@@ -1125,6 +1184,7 @@
         });

     } catch (error) {
+        if (pollInterval) clearInterval(pollInterval);
         progressText.textContent += `  ✗ Error: ${error.message}\n\n`;
     }
 }
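Note that the 300 ms polling loop above only makes progress if the server answers GET /progress/<task_id> while a POST to /run_inference or /run_evaluation is still executing in another handler; the import threading added to server.py points the same way, although no direct use of it appears in the hunks shown. How the app is launched is not part of this diff, so the following is only a sketch of that assumption:

    # Assumption: concurrent request handling is needed for /progress polling
    # to respond mid-run. Flask's built-in server is threaded by default since
    # Flask 1.0; making it explicit:
    if __name__ == '__main__':
        app.run(host='127.0.0.1', port=5000, threaded=True)

With a strictly single-threaded server, the /progress requests would queue behind the long-running POST and the progress bar would only update once the run finished.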