quickdistill 0.1.5__tar.gz → 0.1.7__tar.gz

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (49)
  1. {quickdistill-0.1.5/quickdistill.egg-info → quickdistill-0.1.7}/PKG-INFO +1 -1
  2. {quickdistill-0.1.5 → quickdistill-0.1.7}/pyproject.toml +1 -1
  3. {quickdistill-0.1.5 → quickdistill-0.1.7}/quickdistill/__init__.py +1 -1
  4. {quickdistill-0.1.5 → quickdistill-0.1.7}/quickdistill/__pycache__/__init__.cpython-310.pyc +0 -0
  5. quickdistill-0.1.7/quickdistill/__pycache__/server.cpython-310.pyc +0 -0
  6. {quickdistill-0.1.5 → quickdistill-0.1.7}/quickdistill/server.py +330 -14
  7. {quickdistill-0.1.5 → quickdistill-0.1.7}/quickdistill/static/judge_manager.html +183 -16
  8. {quickdistill-0.1.5 → quickdistill-0.1.7}/quickdistill/static/trace_viewer.html +787 -13
  9. {quickdistill-0.1.5 → quickdistill-0.1.7/quickdistill.egg-info}/PKG-INFO +1 -1
  10. {quickdistill-0.1.5 → quickdistill-0.1.7}/quickdistill.egg-info/SOURCES.txt +1 -0
  11. quickdistill-0.1.7/update.sh +111 -0
  12. quickdistill-0.1.5/quickdistill/__pycache__/server.cpython-310.pyc +0 -0
  13. {quickdistill-0.1.5 → quickdistill-0.1.7}/.pycommands +0 -0
  14. {quickdistill-0.1.5 → quickdistill-0.1.7}/README.md +0 -0
  15. {quickdistill-0.1.5 → quickdistill-0.1.7}/dev/generate_test_traces.py +0 -0
  16. {quickdistill-0.1.5 → quickdistill-0.1.7}/dev/get_call.py +0 -0
  17. {quickdistill-0.1.5 → quickdistill-0.1.7}/dev/get_traces.py +0 -0
  18. {quickdistill-0.1.5 → quickdistill-0.1.7}/dev/inference_server.py +0 -0
  19. {quickdistill-0.1.5 → quickdistill-0.1.7}/dev/judge_manager.html +0 -0
  20. {quickdistill-0.1.5 → quickdistill-0.1.7}/dev/judges.json +0 -0
  21. {quickdistill-0.1.5 → quickdistill-0.1.7}/dev/old/TEST_TRACE_GENERATION.md +0 -0
  22. {quickdistill-0.1.5 → quickdistill-0.1.7}/dev/old/traces_data.json +0 -0
  23. {quickdistill-0.1.5 → quickdistill-0.1.7}/dev/projects/byyoung3_arena-detailed/traces_data.json +0 -0
  24. {quickdistill-0.1.5 → quickdistill-0.1.7}/dev/projects/byyoung3_claude-opus-4-1-tutorial/traces_data.json +0 -0
  25. {quickdistill-0.1.5 → quickdistill-0.1.7}/dev/projects/byyoung3_test-financial-qa/traces_data.json +0 -0
  26. {quickdistill-0.1.5 → quickdistill-0.1.7}/dev/pystatus +0 -0
  27. {quickdistill-0.1.5 → quickdistill-0.1.7}/dev/run_evaluation.py +0 -0
  28. {quickdistill-0.1.5 → quickdistill-0.1.7}/dev/run_weak_models.py +0 -0
  29. {quickdistill-0.1.5 → quickdistill-0.1.7}/dev/strong_exports/anthropic_claude-3.5-sonnet_10traces_v2.json +0 -0
  30. {quickdistill-0.1.5 → quickdistill-0.1.7}/dev/strong_exports/anthropic_claude-3.5-sonnet_20traces.json +0 -0
  31. {quickdistill-0.1.5 → quickdistill-0.1.7}/dev/strong_exports/claude-opus-4-1-20250805_1traces.json +0 -0
  32. {quickdistill-0.1.5 → quickdistill-0.1.7}/dev/strong_exports/gpt-5-2025-08-07_199traces.json +0 -0
  33. {quickdistill-0.1.5 → quickdistill-0.1.7}/dev/trace_viewer.html +0 -0
  34. {quickdistill-0.1.5 → quickdistill-0.1.7}/dev/traces_data.json +0 -0
  35. {quickdistill-0.1.5 → quickdistill-0.1.7}/dev/weak_model_google_gemini-2.5-flash.json +0 -0
  36. {quickdistill-0.1.5 → quickdistill-0.1.7}/dev/weak_model_meta-llama_Llama-3.1-8B-Instruct.json +0 -0
  37. {quickdistill-0.1.5 → quickdistill-0.1.7}/dev/weak_model_meta-llama_Llama-3.3-70B-Instruct.json +0 -0
  38. {quickdistill-0.1.5 → quickdistill-0.1.7}/dev/weak_model_openai_gpt-oss-20b.json +0 -0
  39. {quickdistill-0.1.5 → quickdistill-0.1.7}/quickdistill/__pycache__/cli.cpython-310.pyc +0 -0
  40. {quickdistill-0.1.5 → quickdistill-0.1.7}/quickdistill/__pycache__/get_traces.cpython-310.pyc +0 -0
  41. {quickdistill-0.1.5 → quickdistill-0.1.7}/quickdistill/cli.py +0 -0
  42. {quickdistill-0.1.5 → quickdistill-0.1.7}/quickdistill/default_judges.json +0 -0
  43. {quickdistill-0.1.5 → quickdistill-0.1.7}/quickdistill/default_projects/byyoung3_arena-detailed/traces_data.json +0 -0
  44. {quickdistill-0.1.5 → quickdistill-0.1.7}/quickdistill/get_traces.py +0 -0
  45. {quickdistill-0.1.5 → quickdistill-0.1.7}/quickdistill.egg-info/dependency_links.txt +0 -0
  46. {quickdistill-0.1.5 → quickdistill-0.1.7}/quickdistill.egg-info/entry_points.txt +0 -0
  47. {quickdistill-0.1.5 → quickdistill-0.1.7}/quickdistill.egg-info/requires.txt +0 -0
  48. {quickdistill-0.1.5 → quickdistill-0.1.7}/quickdistill.egg-info/top_level.txt +0 -0
  49. {quickdistill-0.1.5 → quickdistill-0.1.7}/setup.cfg +0 -0
PKG-INFO
@@ -1,6 +1,6 @@
  Metadata-Version: 2.4
  Name: quickdistill
- Version: 0.1.5
+ Version: 0.1.7
  Summary: Fast and easy toolkit for distilling AI models
  Author-email: Brett Young <bdytx5@umsystem.edu>
  License: MIT
pyproject.toml
@@ -4,7 +4,7 @@ build-backend = "setuptools.build_meta"

  [project]
  name = "quickdistill"
- version = "0.1.5"
+ version = "0.1.7"
  description = "Fast and easy toolkit for distilling AI models"
  readme = "README.md"
  authors = [
quickdistill/__init__.py
@@ -8,7 +8,7 @@ This package provides tools to:
  - Export datasets for model evaluation
  """

- __version__ = "0.1.5"
+ __version__ = "0.1.7"
  __author__ = "Brett Young"
  __email__ = "bdytx5@umsystem.edu"

quickdistill/server.py
@@ -30,8 +30,28 @@ if default_project_src.exists() and not default_project_dst.exists():
  app = Flask(__name__, static_folder=str(STATIC_DIR))
  CORS(app)

- # Configuration
- PROJECT = "wandb_inference"
+ # Progress tracking for long-running operations
+ progress_state = {}
+
+ # Load settings
+ SETTINGS_FILE = DATA_DIR / 'settings.json'
+ DEFAULT_SETTINGS = {
+     'inference_project': 'wandb_fc/quickstart_playground',
+     'evaluation_project': 'wandb_inference'
+ }
+
+ def load_settings():
+     if SETTINGS_FILE.exists():
+         with open(SETTINGS_FILE, 'r') as f:
+             return {**DEFAULT_SETTINGS, **json.load(f)}
+     return DEFAULT_SETTINGS.copy()
+
+ def save_settings(settings):
+     with open(SETTINGS_FILE, 'w') as f:
+         json.dump(settings, f, indent=2)
+
+ settings = load_settings()
+ PROJECT = settings['evaluation_project']

  weave.init(PROJECT)

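Because load_settings() spreads DEFAULT_SETTINGS first and the on-disk file second, a partial settings.json only overrides the keys it names. A minimal sketch of that merge behaviour (the override value and temporary path below are illustrative, not part of the package):

```python
import json
from pathlib import Path
from tempfile import TemporaryDirectory

DEFAULT_SETTINGS = {
    'inference_project': 'wandb_fc/quickstart_playground',
    'evaluation_project': 'wandb_inference',
}

with TemporaryDirectory() as tmp:
    settings_file = Path(tmp) / 'settings.json'
    # A partial file that overrides only one key (illustrative value).
    settings_file.write_text(json.dumps({'evaluation_project': 'my-team/my-evals'}))

    # Same merge as load_settings(): file values win, defaults fill the gaps.
    merged = {**DEFAULT_SETTINGS, **json.loads(settings_file.read_text())}
    print(merged)
    # {'inference_project': 'wandb_fc/quickstart_playground',
    #  'evaluation_project': 'my-team/my-evals'}
```

Note that PROJECT is bound and weave.init(PROJECT) runs at import time, so a changed evaluation_project appears to take effect on the next server start rather than immediately.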
@@ -42,7 +62,7 @@ def create_client():
          api_key=os.getenv("WANDB_API_KEY"),
          project=PROJECT,
          default_headers={
-             "OpenAI-Project": "wandb_fc/quickstart_playground" # replace with your team/project
+             "OpenAI-Project": settings['inference_project']
          }
      )

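The inference header is no longer hard-coded; it is read from settings['inference_project'] whenever a client is created. A hedged client-side sketch of switching it through the new /settings routes added later in this diff (the host, port, and project value are assumptions for illustration):

```python
import requests

BASE = "http://localhost:5000"  # assumed host/port for the local quickdistill server

# Only the keys sent are changed; other settings keep their current values.
resp = requests.post(f"{BASE}/settings", json={"inference_project": "my-team/my-project"})
print(resp.json())  # {'status': 'success', 'settings': {...}}

# Read back the persisted settings.
print(requests.get(f"{BASE}/settings").json())
```

Since create_client() looks up settings['inference_project'] each time it builds a client, later inference requests should pick up the new header value without a restart.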
@@ -152,6 +172,7 @@ def run_inference_endpoint():
      models = data.get('models', [])
      strong_export_file = data.get('strong_export_file')
      num_examples = data.get('num_examples')
+     task_id = data.get('task_id', f"inference_{id(models)}")

      if not models:
          return jsonify({'error': 'No models provided'}), 400
@@ -170,14 +191,24 @@ def run_inference_endpoint():
      if not traces:
          return jsonify({'error': 'No traces in export file'}), 400

-     # Limit traces to num_examples
+     # Limit traces to num_examples (convert to int if needed)
      if num_examples:
+         num_examples = int(num_examples)
          traces = traces[:num_examples]

      output_files = []

+     # Initialize progress tracking
+     total_steps = len(models) * len(traces)
+     progress_state[task_id] = {
+         'current': 0,
+         'total': total_steps,
+         'message': 'Starting inference...',
+         'status': 'running'
+     }
+
      # Run inference for each model
-     for model in models:
+     for model_idx, model in enumerate(models):
          print(f"Running model: {model}")
          results = []

@@ -185,6 +216,13 @@
          client = get_client_for_model(model)

          for i, trace in enumerate(traces):
+             step = model_idx * len(traces) + i + 1
+             progress_state[task_id] = {
+                 'current': step,
+                 'total': total_steps,
+                 'message': f'[{model_idx+1}/{len(models)}] {model} - Example {i+1}/{len(traces)}',
+                 'status': 'running'
+             }
              print(f" Processing example {i+1}/{len(traces)}...", end=' ')

              # Extract messages
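The inference handler blocks until every model finishes, so a caller that wants live progress has to supply its own task_id in the request body and poll GET /progress/<task_id> from a second thread or request. A hedged sketch of that pattern; the /run_inference path, host/port, model id, and export filename are assumptions for illustration (the route decorator for run_inference_endpoint() is not shown in this diff):

```python
import threading
import time
import uuid

import requests

BASE = "http://localhost:5000"                  # assumed local server address
task_id = f"inference_{uuid.uuid4().hex[:8]}"   # client-chosen so polling can start immediately

def start_run():
    # Assumed route path; only the handler body appears in this diff.
    requests.post(f"{BASE}/run_inference", json={
        "models": ["meta-llama/Llama-3.1-8B-Instruct"],       # illustrative model id
        "strong_export_file": "strong_export_example.json",   # hypothetical filename
        "num_examples": 10,
        "task_id": task_id,
    })

threading.Thread(target=start_run, daemon=True).start()

# Poll until the server marks the task complete (404 means it is not registered yet).
while True:
    r = requests.get(f"{BASE}/progress/{task_id}")
    if r.status_code == 200:
        p = r.json()
        print(f"{p['current']}/{p['total']} {p['message']}")
        if p["status"] == "complete":
            break
    time.sleep(1)
```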
@@ -239,13 +277,265 @@ def run_inference_endpoint():
          output_files.append(str(output_file))
          print(f"Saved {len(results)} results to {output_file}")

+     # Mark progress as complete
+     progress_state[task_id] = {
+         'current': total_steps,
+         'total': total_steps,
+         'message': 'Complete!',
+         'status': 'complete'
+     }
+
      return jsonify({
          'status': 'success',
          'files': output_files,
          'total_examples': len(traces),
-         'models_run': len(models)
+         'models_run': len(models),
+         'task_id': task_id
+     })
+
+ @app.route('/progress/<task_id>', methods=['GET'])
+ def get_progress(task_id):
+     """Get progress for a running task"""
+     if task_id in progress_state:
+         return jsonify(progress_state[task_id])
+     return jsonify({'error': 'Task not found'}), 404
+
+
+ @app.route('/settings', methods=['GET'])
+ def get_settings():
+     """Get current settings"""
+     return jsonify(settings)
+
+
+ @app.route('/settings', methods=['POST'])
+ def update_settings():
+     """Update settings"""
+     global settings
+     data = request.json
+     settings.update(data)
+     save_settings(settings)
+     return jsonify({'status': 'success', 'settings': settings})
+
+
+ @app.route('/test_judge', methods=['POST'])
+ def test_judge():
+     """Test a judge on sample data to see raw inputs/outputs"""
+     data = request.json
+     judge = data.get('judge')
+     weak_model_file = data.get('weak_model_file')
+     num_samples = data.get('num_samples', 5)
+
+     if not judge or not weak_model_file:
+         return jsonify({'error': 'Missing judge or weak_model_file'}), 400
+
+     # Load weak model results
+     model_path = DATA_DIR / weak_model_file
+     with open(model_path, 'r') as f:
+         file_data = json.load(f)
+
+     # Handle both formats
+     if isinstance(file_data, dict) and 'results' in file_data:
+         results = file_data['results']
+     else:
+         results = file_data
+
+     # Limit to num_samples
+     samples_to_test = results[:min(num_samples, len(results))]
+
+     test_results = []
+
+     for example in samples_to_test:
+         # Skip examples with errors
+         if example.get('error') or not example.get('output'):
+             continue
+
+         strong_output = example.get('strong_model_output', '')
+         weak_output = example.get('output', '')
+
+         # Extract question
+         question = ""
+         messages = example.get('messages', [])
+         if messages and len(messages) > 0:
+             question = messages[0].get('content', '')
+
+         # Build the prompt
+         prompt = judge['prompt']
+         if '{question}' in prompt:
+             prompt = prompt.replace('{question}', question or '')
+         if '{strong_output}' in prompt:
+             prompt = prompt.replace('{strong_output}', strong_output or '')
+         if '{weak_output}' in prompt:
+             prompt = prompt.replace('{weak_output}', weak_output or '')
+
+         # Run the judge and capture raw response
+         if judge['type'] == 'llm':
+             return_type = judge.get('returnType', 'scalar')
+
+             # Use a list to capture the raw response (mutable so we can access from closure)
+             captured_raw = []
+
+             def score_parser(response: str):
+                 """Parse the judge response based on return type"""
+                 # Capture the raw response before any processing
+                 captured_raw.append(response)
+
+                 response = response.strip()
+
+                 # Remove markdown code blocks if present
+                 if response.startswith('```'):
+                     # Remove ```json or ``` at start
+                     response = response.split('\n', 1)[1] if '\n' in response else response[3:]
+                     # Remove ``` at end
+                     if response.endswith('```'):
+                         response = response.rsplit('\n', 1)[0] if '\n' in response else response[:-3]
+                     response = response.strip()
+
+                 try:
+                     # Parse JSON response
+                     parsed = json.loads(response)
+
+                     if return_type == 'boolean':
+                         # Extract boolean value - return just the bool
+                         val = parsed.get('correct', parsed.get('result', parsed.get('value', False)))
+                         return bool(val)
+                     elif return_type == 'scalar':
+                         # Extract numeric score - return just the number
+                         val = parsed.get('score', parsed.get('scores', parsed.get('value', 0)))
+                         return float(val) if isinstance(val, (int, float)) else 0
+                     else:
+                         # Unsupported return type
+                         print(f"Unsupported return type: {return_type}")
+                         return 0
+                 except:
+                     print(f"Failed to parse judge response as JSON: {response}")
+                     if return_type == 'scalar':
+                         return 0
+                     elif return_type == 'boolean':
+                         return False
+                     else:
+                         return 0
+
+             # Use LLMAsAJudge exactly like the evaluation code
+             try:
+                 # Initialize LLMAsAJudge with custom prompt
+                 judge_instance = LLMAsAJudge(
+                     models=[judge['model']],
+                     use_fully_custom_prompt=True,
+                     output_parser=score_parser,
+                     return_type=return_type if return_type else None
+                 )
+
+                 # Get judgment
+                 result = judge_instance.judge(prompt=prompt)
+
+                 # Extract the raw response that was captured
+                 raw_text = captured_raw[0] if captured_raw else "No response captured"
+
+                 # Extract parsed scores from result
+                 if return_type == 'scalar':
+                     score_val = result.get('scores', result.get('correct', 0))
+                     parsed_scores = {'score': score_val}
+                 elif return_type == 'boolean':
+                     bool_val = result.get('correct', False)
+                     parsed_scores = {'correct': bool_val}
+                 else:
+                     # Unsupported return type - default to scalar
+                     score_val = result.get('scores', result.get('correct', 0))
+                     parsed_scores = {'score': score_val}
+
+             except Exception as e:
+                 raw_text = f"Error: {str(e)}"
+                 parsed_scores = {'error': str(e)}
+
+             test_results.append({
+                 'question': question,
+                 'strong_output': strong_output,
+                 'weak_output': weak_output,
+                 'judge_prompt': prompt,
+                 'raw_response': raw_text,
+                 'parsed_scores': parsed_scores
+             })
+
+     return jsonify({
+         'status': 'success',
+         'judge_name': judge['name'],
+         'num_samples': len(test_results),
+         'samples': test_results
      })

+
+ @app.route('/generate_judge_prompt', methods=['POST'])
+ def generate_judge_prompt():
+     """Generate a judge prompt using AI based on sample data"""
+     data = request.json
+     weak_model_file = data.get('weak_model_file')
+     num_samples = data.get('num_samples', 3)
+     model = data.get('model', 'openai/gpt-5')
+     meta_prompt = data.get('meta_prompt')
+
+     if not weak_model_file or not meta_prompt:
+         return jsonify({'error': 'Missing weak_model_file or meta_prompt'}), 400
+
+     # Load weak model results
+     model_path = DATA_DIR / weak_model_file
+     with open(model_path, 'r') as f:
+         file_data = json.load(f)
+
+     # Handle both formats
+     if isinstance(file_data, dict) and 'results' in file_data:
+         results = file_data['results']
+     else:
+         results = file_data
+
+     # Limit to num_samples
+     samples_to_use = results[:min(num_samples, len(results))]
+
+     # Format samples for meta-prompt
+     samples_text = []
+     for i, example in enumerate(samples_to_use):
+         # Skip examples with errors
+         if example.get('error') or not example.get('output'):
+             continue
+
+         strong_output = example.get('strong_model_output', '')
+         weak_output = example.get('output', '')
+
+         # Extract question
+         question = ""
+         messages = example.get('messages', [])
+         if messages and len(messages) > 0:
+             question = messages[0].get('content', '')
+
+         samples_text.append(f"""Sample {i+1}:
+ Question: {question}
+ Strong Model Output: {strong_output}
+ Weak Model Output: {weak_output}
+ ---""")
+
+     samples_formatted = "\n\n".join(samples_text)
+
+     # Replace {SAMPLES} placeholder in meta-prompt
+     final_prompt = meta_prompt.replace('{SAMPLES}', samples_formatted)
+
+     # Call OpenRouter to generate the prompt
+     try:
+         client = create_openrouter_client()
+         response = client.chat.completions.create(
+             model=model,
+             messages=[{"role": "user", "content": final_prompt}]
+         )
+         generated_prompt = response.choices[0].message.content.strip()
+
+         return jsonify({
+             'status': 'success',
+             'generated_prompt': generated_prompt,
+             'num_samples_used': len(samples_text)
+         })
+
+     except Exception as e:
+         return jsonify({'error': f'Failed to generate prompt: {str(e)}'}), 500
+
+
  @app.route('/list_weak_models', methods=['GET'])
  def list_weak_models():
      """List available weak model result files with metadata"""
@@ -469,27 +759,38 @@ def run_evaluation_endpoint():
      data = request.json
      model_file = data.get('model_file')
      judge = data.get('judge')
+     task_id = data.get('task_id', f"eval_{id(data)}")

      if not model_file or not judge:
          return jsonify({'error': 'Missing model_file or judge'}), 400

      # Load weak model results
-     with open(model_file, 'r') as f:
-         data = json.load(f)
+     model_path = DATA_DIR / model_file
+     with open(model_path, 'r') as f:
+         file_data = json.load(f)

      # Handle both old format (list) and new format (dict with metadata)
-     if isinstance(data, dict) and 'results' in data:
-         metadata = data.get('metadata', {})
-         results = data['results']
+     if isinstance(file_data, dict) and 'results' in file_data:
+         metadata = file_data.get('metadata', {})
+         results = file_data['results']
          strong_export = metadata.get('strong_export_file', 'unknown')
      else:
          # Old format - just a list
-         results = data
+         results = file_data
          strong_export = 'unknown'

      # Extract model name from filename
      model_name = model_file.replace('weak_model_', '').replace('.json', '')

+     # Initialize progress tracking
+     total_steps = len(results)
+     progress_state[task_id] = {
+         'current': 0,
+         'total': total_steps,
+         'message': f'Starting evaluation: {model_name} with {judge["name"]}...',
+         'status': 'running'
+     }
+
      # Create evaluation logger
      ev = weave.EvaluationLogger(
          name=f"eval-{model_name}-{judge['name']}",
@@ -497,7 +798,13 @@
      )

      # Run evaluation
-     for example in results:
+     for idx, example in enumerate(results):
+         progress_state[task_id] = {
+             'current': idx + 1,
+             'total': total_steps,
+             'message': f'{model_name} - Example {idx+1}/{total_steps}',
+             'status': 'running'
+         }
          # Skip examples with errors (null messages/output)
          if example.get('error') or not example.get('output'):
              continue
@@ -533,12 +840,21 @@
      # Finish evaluation
      ev.log_summary()

+     # Mark progress as complete
+     progress_state[task_id] = {
+         'current': total_steps,
+         'total': total_steps,
+         'message': 'Complete!',
+         'status': 'complete'
+     }
+
      return jsonify({
          'status': 'success',
          'evaluation_name': f"eval-{model_name}-{judge['name']}",
          'examples_evaluated': len(results),
          'weave_url': ev.ui_url,
-         'strong_export': strong_export
+         'strong_export': strong_export,
+         'task_id': task_id
      })


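The evaluation endpoint follows the same pattern as inference: the handler reads an optional task_id from the request, updates progress_state per example, and returns the Weave UI URL when it finishes. A hedged sketch of invoking it; the route path, host/port, filename, and judge fields are assumptions based on how the handler and /test_judge read them:

```python
import requests

BASE = "http://localhost:5000"  # assumed local server address

payload = {
    "model_file": "weak_model_openai_gpt-oss-20b.json",  # assumed to exist in the data dir
    "judge": {                                           # hypothetical judge definition
        "name": "accuracy-judge",
        "type": "llm",
        "model": "openai/gpt-5",
        "returnType": "scalar",
        "prompt": "Score the weak answer against the strong answer from 0-10 as JSON.\n"
                  "Question: {question}\nStrong: {strong_output}\nWeak: {weak_output}",
    },
    "task_id": "eval_demo_001",  # supplied up front so /progress/eval_demo_001 can be polled
}

# Assumed route path; the decorator for run_evaluation_endpoint() is not in this diff.
resp = requests.post(f"{BASE}/run_evaluation", json=payload)
print(resp.json()["weave_url"])  # link to the logged Weave evaluation
```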