quickdistill-0.1.6-py3-none-any.whl → quickdistill-0.1.8-py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- quickdistill/__init__.py +1 -1
- quickdistill/__pycache__/__init__.cpython-310.pyc +0 -0
- quickdistill/__pycache__/server.cpython-310.pyc +0 -0
- quickdistill/default_judges.json +2 -2
- quickdistill/server.py +257 -5
- quickdistill/static/judge_manager.html +193 -8
- quickdistill/static/trace_viewer.html +731 -9
- {quickdistill-0.1.6.dist-info → quickdistill-0.1.8.dist-info}/METADATA +1 -1
- quickdistill-0.1.8.dist-info/RECORD +17 -0
- quickdistill-0.1.6.dist-info/RECORD +0 -17
- {quickdistill-0.1.6.dist-info → quickdistill-0.1.8.dist-info}/WHEEL +0 -0
- {quickdistill-0.1.6.dist-info → quickdistill-0.1.8.dist-info}/entry_points.txt +0 -0
- {quickdistill-0.1.6.dist-info → quickdistill-0.1.8.dist-info}/top_level.txt +0 -0
quickdistill/__init__.py
CHANGED
(1 line changed)

quickdistill/__pycache__/__init__.cpython-310.pyc
CHANGED
Binary file

quickdistill/__pycache__/server.cpython-310.pyc
CHANGED
Binary file
quickdistill/default_judges.json
CHANGED
@@ -2,14 +2,14 @@
   {
     "name": "boolean_scorer",
     "type": "llm",
-    "model": "gpt-5",
+    "model": "openai/gpt-5",
     "returnType": "boolean",
     "prompt": "You are a strict evaluator comparing two AI responses (one from a strong reference model which is the ground truth, and one from a weaker model which we are testing to see how similar the responses it generates are to the strong model).\n\nStrong Model Response: {strong_output}\nWeak Model Response: {weak_output}\n\nDetermine if the weak model response is CORRECT compared to the strong model response.\nConsider a response CORRECT if it conveys the same key information and meaning, even if worded differently.\n\nRespond in JSON format: {'correct': true} or {'correct': false}"
   },
   {
     "name": "scalar_scorer",
     "type": "llm",
-    "model": "gpt-5",
+    "model": "openai/gpt-5",
     "returnType": "scalar",
     "prompt": "You are a strict evaluator comparing two AI responses (one from a strong reference model which is the ground truth, and one from a weaker model which we are testing to see how similar the responses it generates are to the strong model).\n\nStrong Model Response: {strong_output}\nWeak Model Response: {weak_output}\n\nEvaluate how similar the weak model response is to the strong model response.\nRate on a scale of 1-5 where 1=completely different and 5=nearly identical. RETURN ONLY ONE SCORE REPRESENTY THE AVERAGE SIMILARITY (EG 5-(avg_error))\n\nRespond in JSON format eg {'scores': the_score }"
   }
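The {strong_output} and {weak_output} placeholders in these judge prompts are filled in by the server before the judge model is called. As a minimal sketch of that substitution (mirroring the str.replace logic the new test_judge handler in server.py uses; the file path and sample strings below are assumptions for illustration, not part of the package):

    import json

    # Assumes the JSON is read relative to the package checkout
    with open("quickdistill/default_judges.json") as f:
        judges = json.load(f)

    judge = judges[0]  # the "boolean_scorer" entry

    # Made-up outputs to compare
    strong_output = "The capital of France is Paris."
    weak_output = "Paris is France's capital."

    # Same placeholder substitution the server performs before calling the judge
    prompt = judge["prompt"]
    prompt = prompt.replace("{strong_output}", strong_output)
    prompt = prompt.replace("{weak_output}", weak_output)

    print(prompt)  # ready to send to judge["model"] (now "openai/gpt-5", LiteLLM-style)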
quickdistill/server.py
CHANGED
@@ -3,7 +3,6 @@ import json
 import openai
 import weave
 import shutil
-import threading
 from flask import Flask, request, jsonify, send_from_directory
 from flask_cors import CORS
 from llmasajudge import LLMAsAJudge
@@ -34,8 +33,25 @@ CORS(app)
 # Progress tracking for long-running operations
 progress_state = {}

-#
-
+# Load settings
+SETTINGS_FILE = DATA_DIR / 'settings.json'
+DEFAULT_SETTINGS = {
+    'inference_project': 'wandb_fc/quickstart_playground',
+    'evaluation_project': 'wandb_inference'
+}
+
+def load_settings():
+    if SETTINGS_FILE.exists():
+        with open(SETTINGS_FILE, 'r') as f:
+            return {**DEFAULT_SETTINGS, **json.load(f)}
+    return DEFAULT_SETTINGS.copy()
+
+def save_settings(settings):
+    with open(SETTINGS_FILE, 'w') as f:
+        json.dump(settings, f, indent=2)
+
+settings = load_settings()
+PROJECT = settings['evaluation_project']

 weave.init(PROJECT)

@@ -46,7 +62,7 @@ def create_client():
         api_key=os.getenv("WANDB_API_KEY"),
         project=PROJECT,
         default_headers={
-            "OpenAI-Project":
+            "OpenAI-Project": settings['inference_project']
         }
     )

@@ -175,8 +191,9 @@ def run_inference_endpoint():
     if not traces:
         return jsonify({'error': 'No traces in export file'}), 400

-    # Limit traces to num_examples
+    # Limit traces to num_examples (convert to int if needed)
     if num_examples:
+        num_examples = int(num_examples)
         traces = traces[:num_examples]

     output_files = []
@@ -284,6 +301,241 @@ def get_progress(task_id):
     return jsonify({'error': 'Task not found'}), 404


+@app.route('/settings', methods=['GET'])
+def get_settings():
+    """Get current settings"""
+    return jsonify(settings)
+
+
+@app.route('/settings', methods=['POST'])
+def update_settings():
+    """Update settings"""
+    global settings
+    data = request.json
+    settings.update(data)
+    save_settings(settings)
+    return jsonify({'status': 'success', 'settings': settings})
+
+
+@app.route('/test_judge', methods=['POST'])
+def test_judge():
+    """Test a judge on sample data to see raw inputs/outputs"""
+    data = request.json
+    judge = data.get('judge')
+    weak_model_file = data.get('weak_model_file')
+    num_samples = data.get('num_samples', 5)
+
+    if not judge or not weak_model_file:
+        return jsonify({'error': 'Missing judge or weak_model_file'}), 400
+
+    # Load weak model results
+    model_path = DATA_DIR / weak_model_file
+    with open(model_path, 'r') as f:
+        file_data = json.load(f)
+
+    # Handle both formats
+    if isinstance(file_data, dict) and 'results' in file_data:
+        results = file_data['results']
+    else:
+        results = file_data
+
+    # Limit to num_samples
+    samples_to_test = results[:min(num_samples, len(results))]
+
+    test_results = []
+
+    for example in samples_to_test:
+        # Skip examples with errors
+        if example.get('error') or not example.get('output'):
+            continue
+
+        strong_output = example.get('strong_model_output', '')
+        weak_output = example.get('output', '')
+
+        # Extract question
+        question = ""
+        messages = example.get('messages', [])
+        if messages and len(messages) > 0:
+            question = messages[0].get('content', '')
+
+        # Build the prompt
+        prompt = judge['prompt']
+        if '{question}' in prompt:
+            prompt = prompt.replace('{question}', question or '')
+        if '{strong_output}' in prompt:
+            prompt = prompt.replace('{strong_output}', strong_output or '')
+        if '{weak_output}' in prompt:
+            prompt = prompt.replace('{weak_output}', weak_output or '')
+
+        # Run the judge and capture raw response
+        if judge['type'] == 'llm':
+            return_type = judge.get('returnType', 'scalar')
+
+            # Use a list to capture the raw response (mutable so we can access from closure)
+            captured_raw = []
+
+            def score_parser(response: str):
+                """Parse the judge response based on return type"""
+                # Capture the raw response before any processing
+                captured_raw.append(response)
+
+                response = response.strip()
+
+                # Remove markdown code blocks if present
+                if response.startswith('```'):
+                    # Remove ```json or ``` at start
+                    response = response.split('\n', 1)[1] if '\n' in response else response[3:]
+                    # Remove ``` at end
+                    if response.endswith('```'):
+                        response = response.rsplit('\n', 1)[0] if '\n' in response else response[:-3]
+                    response = response.strip()
+
+                try:
+                    # Parse JSON response
+                    parsed = json.loads(response)
+
+                    if return_type == 'boolean':
+                        # Extract boolean value - return just the bool
+                        val = parsed.get('correct', parsed.get('result', parsed.get('value', False)))
+                        return bool(val)
+                    elif return_type == 'scalar':
+                        # Extract numeric score - return just the number
+                        val = parsed.get('score', parsed.get('scores', parsed.get('value', 0)))
+                        return float(val) if isinstance(val, (int, float)) else 0
+                    else:
+                        # Unsupported return type
+                        print(f"Unsupported return type: {return_type}")
+                        return 0
+                except:
+                    print(f"Failed to parse judge response as JSON: {response}")
+                    if return_type == 'scalar':
+                        return 0
+                    elif return_type == 'boolean':
+                        return False
+                    else:
+                        return 0
+
+            # Use LLMAsAJudge exactly like the evaluation code
+            try:
+                # Initialize LLMAsAJudge with custom prompt
+                judge_instance = LLMAsAJudge(
+                    models=[judge['model']],
+                    use_fully_custom_prompt=True,
+                    output_parser=score_parser,
+                    return_type=return_type if return_type else None
+                )
+
+                # Get judgment
+                result = judge_instance.judge(prompt=prompt)
+
+                # Extract the raw response that was captured
+                raw_text = captured_raw[0] if captured_raw else "No response captured"
+
+                # Extract parsed scores from result
+                if return_type == 'scalar':
+                    score_val = result.get('scores', result.get('correct', 0))
+                    parsed_scores = {'score': score_val}
+                elif return_type == 'boolean':
+                    bool_val = result.get('correct', False)
+                    parsed_scores = {'correct': bool_val}
+                else:
+                    # Unsupported return type - default to scalar
+                    score_val = result.get('scores', result.get('correct', 0))
+                    parsed_scores = {'score': score_val}
+
+            except Exception as e:
+                raw_text = f"Error: {str(e)}"
+                parsed_scores = {'error': str(e)}
+
+            test_results.append({
+                'question': question,
+                'strong_output': strong_output,
+                'weak_output': weak_output,
+                'judge_prompt': prompt,
+                'raw_response': raw_text,
+                'parsed_scores': parsed_scores
+            })
+
+    return jsonify({
+        'status': 'success',
+        'judge_name': judge['name'],
+        'num_samples': len(test_results),
+        'samples': test_results
+    })
+
+
+@app.route('/generate_judge_prompt', methods=['POST'])
+def generate_judge_prompt():
+    """Generate a judge prompt using AI based on sample data"""
+    data = request.json
+    weak_model_file = data.get('weak_model_file')
+    num_samples = data.get('num_samples', 3)
+    model = data.get('model', 'openai/gpt-5')
+    meta_prompt = data.get('meta_prompt')
+
+    if not weak_model_file or not meta_prompt:
+        return jsonify({'error': 'Missing weak_model_file or meta_prompt'}), 400
+
+    # Load weak model results
+    model_path = DATA_DIR / weak_model_file
+    with open(model_path, 'r') as f:
+        file_data = json.load(f)
+
+    # Handle both formats
+    if isinstance(file_data, dict) and 'results' in file_data:
+        results = file_data['results']
+    else:
+        results = file_data
+
+    # Limit to num_samples
+    samples_to_use = results[:min(num_samples, len(results))]
+
+    # Format samples for meta-prompt
+    samples_text = []
+    for i, example in enumerate(samples_to_use):
+        # Skip examples with errors
+        if example.get('error') or not example.get('output'):
+            continue
+
+        strong_output = example.get('strong_model_output', '')
+        weak_output = example.get('output', '')
+
+        # Extract question
+        question = ""
+        messages = example.get('messages', [])
+        if messages and len(messages) > 0:
+            question = messages[0].get('content', '')
+
+        samples_text.append(f"""Sample {i+1}:
+Question: {question}
+Strong Model Output: {strong_output}
+Weak Model Output: {weak_output}
+---""")
+
+    samples_formatted = "\n\n".join(samples_text)
+
+    # Replace {SAMPLES} placeholder in meta-prompt
+    final_prompt = meta_prompt.replace('{SAMPLES}', samples_formatted)
+
+    # Call OpenRouter to generate the prompt
+    try:
+        client = create_openrouter_client()
+        response = client.chat.completions.create(
+            model=model,
+            messages=[{"role": "user", "content": final_prompt}]
+        )
+        generated_prompt = response.choices[0].message.content.strip()
+
+        return jsonify({
+            'status': 'success',
+            'generated_prompt': generated_prompt,
+            'num_samples_used': len(samples_text)
+        })
+
+    except Exception as e:
+        return jsonify({'error': f'Failed to generate prompt: {str(e)}'}), 500
+
+
 @app.route('/list_weak_models', methods=['GET'])
 def list_weak_models():
     """List available weak model result files with metadata"""
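A minimal sketch of exercising the new server endpoints from Python follows. It assumes the quickdistill server is running locally on Flask's default port (5000) and that a weak-model results file named weak_results.json already exists in the data directory; neither of those specifics comes from the diff itself, and the judge dict is a shortened stand-in for the defaults shipped in default_judges.json.

    import requests

    BASE = "http://localhost:5000"  # assumed local Flask dev server

    # Read, then update, the newly persisted settings
    print(requests.get(f"{BASE}/settings").json())
    requests.post(f"{BASE}/settings", json={"evaluation_project": "my_team/my_eval_project"})

    # Dry-run a judge against a few samples from a weak-model results file
    judge = {
        "name": "boolean_scorer",
        "type": "llm",
        "model": "openai/gpt-5",
        "returnType": "boolean",
        "prompt": (
            "Strong: {strong_output}\nWeak: {weak_output}\n"
            "Respond in JSON format: {'correct': true} or {'correct': false}"
        ),
    }
    resp = requests.post(
        f"{BASE}/test_judge",
        json={"judge": judge, "weak_model_file": "weak_results.json", "num_samples": 3},
    )
    for sample in resp.json().get("samples", []):
        # Each sample echoes the built prompt, the raw judge reply, and the parsed score
        print(sample["parsed_scores"], sample["raw_response"][:80])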
quickdistill/static/judge_manager.html
CHANGED
@@ -162,6 +162,13 @@
     <div class="container">
         <h1>Judge Manager</h1>

+        <!-- Prompt Generator Button -->
+        <div style="margin-bottom: 20px; display: none;">
+            <button onclick="openPromptGenerator()" style="padding: 10px 20px; background: #6a4a7e; color: white; border: none; border-radius: 4px; cursor: pointer; font-size: 14px;">
+                ✨ Generate Judge Prompt with AI
+            </button>
+        </div>
+
         <!-- Create/Edit Judge Section -->
         <div class="section">
             <h2 id="form-title">Create New Judge</h2>
@@ -176,12 +183,10 @@

             <div id="llm-options" style="display: block;">
                 <label for="judge-model">Model</label>
-                <
-
-                <
-
-                <option value="claude-3-5-sonnet-20241022">claude-3-5-sonnet</option>
-                </select>
+                <input type="text" id="judge-model" placeholder="e.g., openai/gpt-5, anthropic/claude-3.5-sonnet" value="openai/gpt-5">
+                <p style="color: #888; font-size: 12px; margin-top: 5px; margin-bottom: 15px;">
+                    <strong>Note:</strong> Uses LiteLLM format. Examples: <code>openai/gpt-5</code>, <code>anthropic/claude-3.5-sonnet</code>, <code>openai/gpt-4o</code>
+                </p>

                 <label for="judge-return-type">Return Type</label>
                 <select id="judge-return-type">
@@ -217,6 +222,73 @@
         </div>
     </div>

+    <!-- Prompt Generator Panel -->
+    <div id="prompt-generator-panel" style="display: none; position: fixed; top: 0; left: 0; width: 100%; height: 100%; background: rgba(0,0,0,0.9); z-index: 1000; padding: 40px; overflow-y: auto;">
+        <div style="max-width: 1200px; margin: 0 auto; background: #1a1a1a; border-radius: 8px; padding: 30px; border: 1px solid #3a2a4a;">
+            <h2 style="color: #fff; margin-bottom: 10px;">AI-Powered Judge Prompt Generator</h2>
+            <p style="color: #888; font-size: 13px; margin-bottom: 25px;">
+                Generate specialized judge prompts by showing sample data to an AI model
+            </p>
+
+            <!-- Configuration -->
+            <div style="display: grid; grid-template-columns: 1fr 1fr; gap: 20px; margin-bottom: 20px;">
+                <div>
+                    <label style="display: block; color: #aaa; margin-bottom: 8px; font-size: 14px;">Weak Model Dataset:</label>
+                    <select id="gen-weak-model-select" style="width: 100%; padding: 10px; background: #2a2a2a; color: #fff; border: 1px solid #3a3a3a; border-radius: 4px; font-size: 14px;">
+                        <option value="">Loading weak model files...</option>
+                    </select>
+                </div>
+
+                <div>
+                    <label style="display: block; color: #aaa; margin-bottom: 8px; font-size: 14px;">Number of Samples:</label>
+                    <input type="number" id="gen-num-samples" value="3" min="1" max="10"
+                           style="width: 100%; padding: 10px; background: #2a2a2a; color: #fff; border: 1px solid #3a3a3a; border-radius: 4px; font-size: 14px;">
+                    <div style="color: #666; font-size: 12px; margin-top: 5px;">Max: 10 (for context limits)</div>
+                </div>
+            </div>
+
+            <div style="margin-bottom: 20px;">
+                <label style="display: block; color: #aaa; margin-bottom: 8px; font-size: 14px;">Generation Model:</label>
+                <input type="text" id="gen-model" value="openai/gpt-5"
+                       style="width: 100%; padding: 10px; background: #2a2a2a; color: #fff; border: 1px solid #3a3a3a; border-radius: 4px; font-size: 14px;"
+                       placeholder="e.g., openai/gpt-5, anthropic/claude-3.5-sonnet">
+                <div style="color: #666; font-size: 12px; margin-top: 5px;">OpenRouter model to use for generating the prompt</div>
+            </div>
+
+            <!-- Meta-Prompt -->
+            <div style="margin-bottom: 25px;">
+                <label style="display: block; color: #aaa; margin-bottom: 8px; font-size: 14px;">Meta-Prompt (edit as needed):</label>
+                <textarea id="gen-meta-prompt"
+                          style="width: 100%; min-height: 250px; padding: 10px; background: #2a2a2a; color: #fff; border: 1px solid #3a3a3a; border-radius: 4px; font-size: 13px; font-family: 'Courier New', monospace; resize: vertical;"></textarea>
+                <div style="color: #666; font-size: 12px; margin-top: 5px;">
+                    This prompt will be sent to the generation model along with sample data
+                </div>
+            </div>
+
+            <!-- Actions -->
+            <div style="display: flex; gap: 10px; margin-bottom: 25px;">
+                <button onclick="generatePrompt()" style="padding: 10px 20px; background: #6a4a7e; color: white; border: none; border-radius: 4px; cursor: pointer; font-weight: 500;">
+                    Generate Prompt
+                </button>
+                <button onclick="closePromptGenerator()" style="padding: 10px 20px; background: #5a2a2a; color: white; border: none; border-radius: 4px; cursor: pointer;">
+                    Close
+                </button>
+            </div>
+
+            <!-- Generated Output -->
+            <div id="gen-output-section" style="display: none;">
+                <h3 style="color: #4a9eff; margin-bottom: 15px;">Generated Judge Prompt</h3>
+                <textarea id="gen-output" readonly
+                          style="width: 100%; min-height: 300px; padding: 15px; background: #0f0f0f; color: #4a9eff; border: 1px solid #4a9eff; border-radius: 4px; font-size: 13px; font-family: 'Courier New', monospace; resize: vertical;"></textarea>
+                <div style="margin-top: 10px;">
+                    <button onclick="copyGeneratedPrompt()" style="padding: 8px 16px; background: #2a7c4a; color: white; border: none; border-radius: 4px; cursor: pointer;">
+                        Copy to Clipboard
+                    </button>
+                </div>
+            </div>
+        </div>
+    </div>
+
     <script>
     let judges = [];
     let editingIndex = null;
@@ -319,10 +391,16 @@ Respond in JSON format: {'correct': true} or {'correct': false}`
        };

        if (type === 'llm') {
-            judge.model = document.getElementById('judge-model').value;
+            judge.model = document.getElementById('judge-model').value.trim();
            judge.returnType = document.getElementById('judge-return-type').value;
            judge.prompt = document.getElementById('judge-prompt').value.trim();

+            // Validate model
+            if (!judge.model) {
+                alert('Error: Please enter a model (e.g., openai/gpt-5)');
+                return;
+            }
+
            // Validate required placeholders
            if (!judge.prompt.includes('{strong_output}')) {
                alert('Error: Judge prompt must include {strong_output} placeholder');
@@ -346,7 +424,7 @@ Respond in JSON format: {'correct': true} or {'correct': false}`
    function resetForm() {
        document.getElementById('judge-name').value = '';
        document.getElementById('judge-type').value = 'llm';
-        document.getElementById('judge-model').value = 'gpt-5';
+        document.getElementById('judge-model').value = 'openai/gpt-5';
        document.getElementById('judge-prompt').value = '';
        document.getElementById('form-title').textContent = 'Create New Judge';
        document.getElementById('save-btn').textContent = 'Save Judge';
@@ -489,6 +567,113 @@ Respond in JSON format: {'correct': true} or {'correct': false}`
            console.log('Not changing prompt - user has edited it');
        }
    });
+
+    // === PROMPT GENERATOR ===
+
+    const DEFAULT_META_PROMPT = `You are an expert at creating evaluation prompts for judging AI model outputs. I'm building a specialized judge prompt to evaluate the quality/similarity of weak model outputs compared to strong reference model outputs.
+
+I will show you some sample data below. Each sample contains:
+- A question/input
+- The strong reference model's output (ground truth)
+- The weak model's output (what we're evaluating)
+
+Your task: Create a specialized, detailed judge prompt that can be used to systematically evaluate the delta/difference between these outputs. The prompt should:
+1. Be specific to the patterns you see in this data
+2. Include clear evaluation criteria
+3. Be written in second-person ("You are...")
+4. Include the placeholders {question}, {strong_output}, and {weak_output}
+5. Specify the exact JSON format to return (either {'score': number} for scalar or {'correct': boolean} for boolean)
+
+Sample Data:
+{SAMPLES}
+
+Based on these samples, create a specialized judge prompt that would effectively evaluate this type of data. Return ONLY the judge prompt text, nothing else.`;
+
+    async function openPromptGenerator() {
+        // Load weak model files
+        try {
+            const response = await fetch('/list_weak_models');
+            const data = await response.json();
+            const select = document.getElementById('gen-weak-model-select');
+
+            if (data.files && data.files.length > 0) {
+                select.innerHTML = data.files.map(f =>
+                    `<option value="${f.filename}">${f.weak_model || f.filename}</option>`
+                ).join('');
+            } else {
+                select.innerHTML = '<option value="">No weak model files available</option>';
+            }
+        } catch (error) {
+            console.error('Error loading weak models:', error);
+        }
+
+        // Set default meta-prompt
+        document.getElementById('gen-meta-prompt').value = DEFAULT_META_PROMPT;
+
+        // Show panel
+        document.getElementById('prompt-generator-panel').style.display = 'block';
+        document.getElementById('gen-output-section').style.display = 'none';
+    }
+
+    function closePromptGenerator() {
+        document.getElementById('prompt-generator-panel').style.display = 'none';
+    }
+
+    async function generatePrompt() {
+        const weakModelFile = document.getElementById('gen-weak-model-select').value;
+        const numSamples = parseInt(document.getElementById('gen-num-samples').value) || 3;
+        const model = document.getElementById('gen-model').value.trim();
+        const metaPrompt = document.getElementById('gen-meta-prompt').value.trim();
+
+        if (!weakModelFile) {
+            alert('Please select a weak model dataset');
+            return;
+        }
+
+        if (!model) {
+            alert('Please enter a generation model');
+            return;
+        }
+
+        if (!metaPrompt) {
+            alert('Please enter a meta-prompt');
+            return;
+        }
+
+        try {
+            const response = await fetch('/generate_judge_prompt', {
+                method: 'POST',
+                headers: { 'Content-Type': 'application/json' },
+                body: JSON.stringify({
+                    weak_model_file: weakModelFile,
+                    num_samples: numSamples,
+                    model: model,
+                    meta_prompt: metaPrompt
+                })
+            });
+
+            if (!response.ok) {
+                throw new Error('Failed to generate prompt');
+            }
+
+            const result = await response.json();
+
+            // Display generated prompt
+            document.getElementById('gen-output').value = result.generated_prompt;
+            document.getElementById('gen-output-section').style.display = 'block';
+
+        } catch (error) {
+            alert('Error generating prompt: ' + error.message);
+            console.error('Generation error:', error);
+        }
+    }
+
+    function copyGeneratedPrompt() {
+        const output = document.getElementById('gen-output');
+        output.select();
+        document.execCommand('copy');
+        alert('Prompt copied to clipboard!');
+    }
    </script>
 </body>
 </html>
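The generator flow added above is: POST a meta-prompt containing a {SAMPLES} placeholder plus a weak-model file to /generate_judge_prompt, then reuse the returned text as the prompt of a judge entry shaped like the ones in default_judges.json. A minimal sketch of that round trip, assuming a locally running server on the default Flask port and a hypothetical weak_results.json dataset (both assumptions, not stated in the diff):

    import requests

    BASE = "http://localhost:5000"  # assumed local Flask dev server

    meta_prompt = (
        "You are an expert at creating evaluation prompts.\n"
        "Sample Data:\n{SAMPLES}\n"
        "Return ONLY the judge prompt text, nothing else."
    )

    resp = requests.post(f"{BASE}/generate_judge_prompt", json={
        "weak_model_file": "weak_results.json",  # hypothetical dataset name
        "num_samples": 3,
        "model": "openai/gpt-5",
        "meta_prompt": meta_prompt,
    })
    generated = resp.json()["generated_prompt"]

    # Wrap the generated text in a judge entry shaped like default_judges.json
    new_judge = {
        "name": "generated_scorer",
        "type": "llm",
        "model": "openai/gpt-5",
        "returnType": "scalar",
        "prompt": generated,
    }
    print(new_judge)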