quickdistill 0.1.6.tar.gz → 0.1.7.tar.gz
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- {quickdistill-0.1.6/quickdistill.egg-info → quickdistill-0.1.7}/PKG-INFO +1 -1
- {quickdistill-0.1.6 → quickdistill-0.1.7}/pyproject.toml +1 -1
- {quickdistill-0.1.6 → quickdistill-0.1.7}/quickdistill/__init__.py +1 -1
- {quickdistill-0.1.6 → quickdistill-0.1.7}/quickdistill/__pycache__/__init__.cpython-310.pyc +0 -0
- quickdistill-0.1.7/quickdistill/__pycache__/server.cpython-310.pyc +0 -0
- {quickdistill-0.1.6 → quickdistill-0.1.7}/quickdistill/server.py +257 -5
- {quickdistill-0.1.6 → quickdistill-0.1.7}/quickdistill/static/judge_manager.html +181 -0
- {quickdistill-0.1.6 → quickdistill-0.1.7}/quickdistill/static/trace_viewer.html +714 -0
- {quickdistill-0.1.6 → quickdistill-0.1.7/quickdistill.egg-info}/PKG-INFO +1 -1
- {quickdistill-0.1.6 → quickdistill-0.1.7}/quickdistill.egg-info/SOURCES.txt +1 -0
- quickdistill-0.1.7/update.sh +111 -0
- quickdistill-0.1.6/quickdistill/__pycache__/server.cpython-310.pyc +0 -0
- {quickdistill-0.1.6 → quickdistill-0.1.7}/.pycommands +0 -0
- {quickdistill-0.1.6 → quickdistill-0.1.7}/README.md +0 -0
- {quickdistill-0.1.6 → quickdistill-0.1.7}/dev/generate_test_traces.py +0 -0
- {quickdistill-0.1.6 → quickdistill-0.1.7}/dev/get_call.py +0 -0
- {quickdistill-0.1.6 → quickdistill-0.1.7}/dev/get_traces.py +0 -0
- {quickdistill-0.1.6 → quickdistill-0.1.7}/dev/inference_server.py +0 -0
- {quickdistill-0.1.6 → quickdistill-0.1.7}/dev/judge_manager.html +0 -0
- {quickdistill-0.1.6 → quickdistill-0.1.7}/dev/judges.json +0 -0
- {quickdistill-0.1.6 → quickdistill-0.1.7}/dev/old/TEST_TRACE_GENERATION.md +0 -0
- {quickdistill-0.1.6 → quickdistill-0.1.7}/dev/old/traces_data.json +0 -0
- {quickdistill-0.1.6 → quickdistill-0.1.7}/dev/projects/byyoung3_arena-detailed/traces_data.json +0 -0
- {quickdistill-0.1.6 → quickdistill-0.1.7}/dev/projects/byyoung3_claude-opus-4-1-tutorial/traces_data.json +0 -0
- {quickdistill-0.1.6 → quickdistill-0.1.7}/dev/projects/byyoung3_test-financial-qa/traces_data.json +0 -0
- {quickdistill-0.1.6 → quickdistill-0.1.7}/dev/pystatus +0 -0
- {quickdistill-0.1.6 → quickdistill-0.1.7}/dev/run_evaluation.py +0 -0
- {quickdistill-0.1.6 → quickdistill-0.1.7}/dev/run_weak_models.py +0 -0
- {quickdistill-0.1.6 → quickdistill-0.1.7}/dev/strong_exports/anthropic_claude-3.5-sonnet_10traces_v2.json +0 -0
- {quickdistill-0.1.6 → quickdistill-0.1.7}/dev/strong_exports/anthropic_claude-3.5-sonnet_20traces.json +0 -0
- {quickdistill-0.1.6 → quickdistill-0.1.7}/dev/strong_exports/claude-opus-4-1-20250805_1traces.json +0 -0
- {quickdistill-0.1.6 → quickdistill-0.1.7}/dev/strong_exports/gpt-5-2025-08-07_199traces.json +0 -0
- {quickdistill-0.1.6 → quickdistill-0.1.7}/dev/trace_viewer.html +0 -0
- {quickdistill-0.1.6 → quickdistill-0.1.7}/dev/traces_data.json +0 -0
- {quickdistill-0.1.6 → quickdistill-0.1.7}/dev/weak_model_google_gemini-2.5-flash.json +0 -0
- {quickdistill-0.1.6 → quickdistill-0.1.7}/dev/weak_model_meta-llama_Llama-3.1-8B-Instruct.json +0 -0
- {quickdistill-0.1.6 → quickdistill-0.1.7}/dev/weak_model_meta-llama_Llama-3.3-70B-Instruct.json +0 -0
- {quickdistill-0.1.6 → quickdistill-0.1.7}/dev/weak_model_openai_gpt-oss-20b.json +0 -0
- {quickdistill-0.1.6 → quickdistill-0.1.7}/quickdistill/__pycache__/cli.cpython-310.pyc +0 -0
- {quickdistill-0.1.6 → quickdistill-0.1.7}/quickdistill/__pycache__/get_traces.cpython-310.pyc +0 -0
- {quickdistill-0.1.6 → quickdistill-0.1.7}/quickdistill/cli.py +0 -0
- {quickdistill-0.1.6 → quickdistill-0.1.7}/quickdistill/default_judges.json +0 -0
- {quickdistill-0.1.6 → quickdistill-0.1.7}/quickdistill/default_projects/byyoung3_arena-detailed/traces_data.json +0 -0
- {quickdistill-0.1.6 → quickdistill-0.1.7}/quickdistill/get_traces.py +0 -0
- {quickdistill-0.1.6 → quickdistill-0.1.7}/quickdistill.egg-info/dependency_links.txt +0 -0
- {quickdistill-0.1.6 → quickdistill-0.1.7}/quickdistill.egg-info/entry_points.txt +0 -0
- {quickdistill-0.1.6 → quickdistill-0.1.7}/quickdistill.egg-info/requires.txt +0 -0
- {quickdistill-0.1.6 → quickdistill-0.1.7}/quickdistill.egg-info/top_level.txt +0 -0
- {quickdistill-0.1.6 → quickdistill-0.1.7}/setup.cfg +0 -0
Binary files (.cpython-310.pyc bytecode caches) changed; no textual diff is shown for them.
{quickdistill-0.1.6 → quickdistill-0.1.7}/quickdistill/server.py

@@ -3,7 +3,6 @@ import json
 import openai
 import weave
 import shutil
-import threading
 from flask import Flask, request, jsonify, send_from_directory
 from flask_cors import CORS
 from llmasajudge import LLMAsAJudge
@@ -34,8 +33,25 @@ CORS(app)
 # Progress tracking for long-running operations
 progress_state = {}
 
-#
-
+# Load settings
+SETTINGS_FILE = DATA_DIR / 'settings.json'
+DEFAULT_SETTINGS = {
+    'inference_project': 'wandb_fc/quickstart_playground',
+    'evaluation_project': 'wandb_inference'
+}
+
+def load_settings():
+    if SETTINGS_FILE.exists():
+        with open(SETTINGS_FILE, 'r') as f:
+            return {**DEFAULT_SETTINGS, **json.load(f)}
+    return DEFAULT_SETTINGS.copy()
+
+def save_settings(settings):
+    with open(SETTINGS_FILE, 'w') as f:
+        json.dump(settings, f, indent=2)
+
+settings = load_settings()
+PROJECT = settings['evaluation_project']
 
 weave.init(PROJECT)
 
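The hunk above persists the two W&B project names in `DATA_DIR/settings.json`, layering any saved values over `DEFAULT_SETTINGS`, and a later hunk adds `/settings` routes that read and write this file. A minimal client-side sketch of how those routes could be exercised (the base URL is an assumption; the diff does not show which host or port the Flask app binds):

```python
# Sketch only: assumes the quickdistill server is reachable at this address.
import requests

BASE = "http://127.0.0.1:5000"  # assumed host/port, not shown in this diff

# GET /settings returns DEFAULT_SETTINGS merged with any overrides from settings.json.
current = requests.get(f"{BASE}/settings").json()
print(current)  # e.g. {'inference_project': '...', 'evaluation_project': '...'}

# POST /settings merges the posted keys into the in-memory dict and rewrites settings.json.
updated = requests.post(
    f"{BASE}/settings",
    json={"inference_project": "my-team/my-project"},  # hypothetical project name
).json()
print(updated["settings"]["inference_project"])
```

Note that `inference_project` is read each time `create_client()` builds its headers, whereas `evaluation_project` is captured once at import (`PROJECT` / `weave.init(PROJECT)`), so changing the latter via POST only takes effect after a restart.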
@@ -46,7 +62,7 @@ def create_client():
         api_key=os.getenv("WANDB_API_KEY"),
         project=PROJECT,
         default_headers={
-            "OpenAI-Project":
+            "OpenAI-Project": settings['inference_project']
         }
     )
 
@@ -175,8 +191,9 @@ def run_inference_endpoint():
     if not traces:
         return jsonify({'error': 'No traces in export file'}), 400
 
-    # Limit traces to num_examples
+    # Limit traces to num_examples (convert to int if needed)
     if num_examples:
+        num_examples = int(num_examples)
         traces = traces[:num_examples]
 
     output_files = []
@@ -284,6 +301,241 @@ def get_progress(task_id):
     return jsonify({'error': 'Task not found'}), 404
 
 
+@app.route('/settings', methods=['GET'])
+def get_settings():
+    """Get current settings"""
+    return jsonify(settings)
+
+
+@app.route('/settings', methods=['POST'])
+def update_settings():
+    """Update settings"""
+    global settings
+    data = request.json
+    settings.update(data)
+    save_settings(settings)
+    return jsonify({'status': 'success', 'settings': settings})
+
+
+@app.route('/test_judge', methods=['POST'])
+def test_judge():
+    """Test a judge on sample data to see raw inputs/outputs"""
+    data = request.json
+    judge = data.get('judge')
+    weak_model_file = data.get('weak_model_file')
+    num_samples = data.get('num_samples', 5)
+
+    if not judge or not weak_model_file:
+        return jsonify({'error': 'Missing judge or weak_model_file'}), 400
+
+    # Load weak model results
+    model_path = DATA_DIR / weak_model_file
+    with open(model_path, 'r') as f:
+        file_data = json.load(f)
+
+    # Handle both formats
+    if isinstance(file_data, dict) and 'results' in file_data:
+        results = file_data['results']
+    else:
+        results = file_data
+
+    # Limit to num_samples
+    samples_to_test = results[:min(num_samples, len(results))]
+
+    test_results = []
+
+    for example in samples_to_test:
+        # Skip examples with errors
+        if example.get('error') or not example.get('output'):
+            continue
+
+        strong_output = example.get('strong_model_output', '')
+        weak_output = example.get('output', '')
+
+        # Extract question
+        question = ""
+        messages = example.get('messages', [])
+        if messages and len(messages) > 0:
+            question = messages[0].get('content', '')
+
+        # Build the prompt
+        prompt = judge['prompt']
+        if '{question}' in prompt:
+            prompt = prompt.replace('{question}', question or '')
+        if '{strong_output}' in prompt:
+            prompt = prompt.replace('{strong_output}', strong_output or '')
+        if '{weak_output}' in prompt:
+            prompt = prompt.replace('{weak_output}', weak_output or '')
+
+        # Run the judge and capture raw response
+        if judge['type'] == 'llm':
+            return_type = judge.get('returnType', 'scalar')
+
+            # Use a list to capture the raw response (mutable so we can access from closure)
+            captured_raw = []
+
+            def score_parser(response: str):
+                """Parse the judge response based on return type"""
+                # Capture the raw response before any processing
+                captured_raw.append(response)
+
+                response = response.strip()
+
+                # Remove markdown code blocks if present
+                if response.startswith('```'):
+                    # Remove ```json or ``` at start
+                    response = response.split('\n', 1)[1] if '\n' in response else response[3:]
+                # Remove ``` at end
+                if response.endswith('```'):
+                    response = response.rsplit('\n', 1)[0] if '\n' in response else response[:-3]
+                response = response.strip()
+
+                try:
+                    # Parse JSON response
+                    parsed = json.loads(response)
+
+                    if return_type == 'boolean':
+                        # Extract boolean value - return just the bool
+                        val = parsed.get('correct', parsed.get('result', parsed.get('value', False)))
+                        return bool(val)
+                    elif return_type == 'scalar':
+                        # Extract numeric score - return just the number
+                        val = parsed.get('score', parsed.get('scores', parsed.get('value', 0)))
+                        return float(val) if isinstance(val, (int, float)) else 0
+                    else:
+                        # Unsupported return type
+                        print(f"Unsupported return type: {return_type}")
+                        return 0
+                except:
+                    print(f"Failed to parse judge response as JSON: {response}")
+                    if return_type == 'scalar':
+                        return 0
+                    elif return_type == 'boolean':
+                        return False
+                    else:
+                        return 0
+
+            # Use LLMAsAJudge exactly like the evaluation code
+            try:
+                # Initialize LLMAsAJudge with custom prompt
+                judge_instance = LLMAsAJudge(
+                    models=[judge['model']],
+                    use_fully_custom_prompt=True,
+                    output_parser=score_parser,
+                    return_type=return_type if return_type else None
+                )
+
+                # Get judgment
+                result = judge_instance.judge(prompt=prompt)
+
+                # Extract the raw response that was captured
+                raw_text = captured_raw[0] if captured_raw else "No response captured"
+
+                # Extract parsed scores from result
+                if return_type == 'scalar':
+                    score_val = result.get('scores', result.get('correct', 0))
+                    parsed_scores = {'score': score_val}
+                elif return_type == 'boolean':
+                    bool_val = result.get('correct', False)
+                    parsed_scores = {'correct': bool_val}
+                else:
+                    # Unsupported return type - default to scalar
+                    score_val = result.get('scores', result.get('correct', 0))
+                    parsed_scores = {'score': score_val}
+
+            except Exception as e:
+                raw_text = f"Error: {str(e)}"
+                parsed_scores = {'error': str(e)}
+
+            test_results.append({
+                'question': question,
+                'strong_output': strong_output,
+                'weak_output': weak_output,
+                'judge_prompt': prompt,
+                'raw_response': raw_text,
+                'parsed_scores': parsed_scores
+            })
+
+    return jsonify({
+        'status': 'success',
+        'judge_name': judge['name'],
+        'num_samples': len(test_results),
+        'samples': test_results
+    })
+
+
+@app.route('/generate_judge_prompt', methods=['POST'])
+def generate_judge_prompt():
+    """Generate a judge prompt using AI based on sample data"""
+    data = request.json
+    weak_model_file = data.get('weak_model_file')
+    num_samples = data.get('num_samples', 3)
+    model = data.get('model', 'openai/gpt-5')
+    meta_prompt = data.get('meta_prompt')
+
+    if not weak_model_file or not meta_prompt:
+        return jsonify({'error': 'Missing weak_model_file or meta_prompt'}), 400
+
+    # Load weak model results
+    model_path = DATA_DIR / weak_model_file
+    with open(model_path, 'r') as f:
+        file_data = json.load(f)
+
+    # Handle both formats
+    if isinstance(file_data, dict) and 'results' in file_data:
+        results = file_data['results']
+    else:
+        results = file_data
+
+    # Limit to num_samples
+    samples_to_use = results[:min(num_samples, len(results))]
+
+    # Format samples for meta-prompt
+    samples_text = []
+    for i, example in enumerate(samples_to_use):
+        # Skip examples with errors
+        if example.get('error') or not example.get('output'):
+            continue
+
+        strong_output = example.get('strong_model_output', '')
+        weak_output = example.get('output', '')
+
+        # Extract question
+        question = ""
+        messages = example.get('messages', [])
+        if messages and len(messages) > 0:
+            question = messages[0].get('content', '')
+
+        samples_text.append(f"""Sample {i+1}:
+Question: {question}
+Strong Model Output: {strong_output}
+Weak Model Output: {weak_output}
+---""")
+
+    samples_formatted = "\n\n".join(samples_text)
+
+    # Replace {SAMPLES} placeholder in meta-prompt
+    final_prompt = meta_prompt.replace('{SAMPLES}', samples_formatted)
+
+    # Call OpenRouter to generate the prompt
+    try:
+        client = create_openrouter_client()
+        response = client.chat.completions.create(
+            model=model,
+            messages=[{"role": "user", "content": final_prompt}]
+        )
+        generated_prompt = response.choices[0].message.content.strip()
+
+        return jsonify({
+            'status': 'success',
+            'generated_prompt': generated_prompt,
+            'num_samples_used': len(samples_text)
+        })
+
+    except Exception as e:
+        return jsonify({'error': f'Failed to generate prompt: {str(e)}'}), 500
+
+
 @app.route('/list_weak_models', methods=['GET'])
 def list_weak_models():
     """List available weak model result files with metadata"""
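The two new endpoints above let a client dry-run a judge against saved weak-model results and ask an OpenRouter model to draft a judge prompt. A hedged sketch of the request payloads they accept, derived from the handlers (the base URL and the weak-model filename are illustrative assumptions; the judge dict mirrors the fields the handler reads: `name`, `type`, `model`, `returnType`, `prompt`):

```python
# Sketch only: base URL and filename are assumptions; payload shapes follow the handlers above.
import requests

BASE = "http://127.0.0.1:5000"                          # assumed host/port
WEAK_FILE = "weak_model_openai_gpt-oss-20b.json"        # illustrative; must exist under DATA_DIR

judge = {
    "name": "similarity-judge",      # echoed back in the /test_judge response
    "type": "llm",                   # only 'llm' judges are executed
    "model": "openai/gpt-5",
    "returnType": "scalar",          # or "boolean"
    "prompt": (
        "Question: {question}\n"
        "Reference answer: {strong_output}\n"
        "Candidate answer: {weak_output}\n"
        "Respond in JSON format: {'score': <number between 0 and 1>}"
    ),
}

# Dry-run the judge on a few samples and inspect raw vs. parsed output.
test = requests.post(f"{BASE}/test_judge", json={
    "judge": judge,
    "weak_model_file": WEAK_FILE,
    "num_samples": 3,
}).json()
for sample in test.get("samples", []):
    print(sample["parsed_scores"], sample["raw_response"][:80])

# Ask an OpenRouter model to draft a judge prompt from the same samples;
# {SAMPLES} is replaced server-side with the formatted examples.
gen = requests.post(f"{BASE}/generate_judge_prompt", json={
    "weak_model_file": WEAK_FILE,
    "num_samples": 3,
    "model": "openai/gpt-5",
    "meta_prompt": "Write a judge prompt for data like this:\n{SAMPLES}",
}).json()
print(gen.get("generated_prompt", gen))
```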
{quickdistill-0.1.6 → quickdistill-0.1.7}/quickdistill/static/judge_manager.html

@@ -162,6 +162,13 @@
     <div class="container">
         <h1>Judge Manager</h1>
 
+        <!-- Prompt Generator Button -->
+        <div style="margin-bottom: 20px; display: none;">
+            <button onclick="openPromptGenerator()" style="padding: 10px 20px; background: #6a4a7e; color: white; border: none; border-radius: 4px; cursor: pointer; font-size: 14px;">
+                ✨ Generate Judge Prompt with AI
+            </button>
+        </div>
+
         <!-- Create/Edit Judge Section -->
         <div class="section">
             <h2 id="form-title">Create New Judge</h2>
@@ -217,6 +224,73 @@
         </div>
     </div>
 
+    <!-- Prompt Generator Panel -->
+    <div id="prompt-generator-panel" style="display: none; position: fixed; top: 0; left: 0; width: 100%; height: 100%; background: rgba(0,0,0,0.9); z-index: 1000; padding: 40px; overflow-y: auto;">
+        <div style="max-width: 1200px; margin: 0 auto; background: #1a1a1a; border-radius: 8px; padding: 30px; border: 1px solid #3a2a4a;">
+            <h2 style="color: #fff; margin-bottom: 10px;">AI-Powered Judge Prompt Generator</h2>
+            <p style="color: #888; font-size: 13px; margin-bottom: 25px;">
+                Generate specialized judge prompts by showing sample data to an AI model
+            </p>
+
+            <!-- Configuration -->
+            <div style="display: grid; grid-template-columns: 1fr 1fr; gap: 20px; margin-bottom: 20px;">
+                <div>
+                    <label style="display: block; color: #aaa; margin-bottom: 8px; font-size: 14px;">Weak Model Dataset:</label>
+                    <select id="gen-weak-model-select" style="width: 100%; padding: 10px; background: #2a2a2a; color: #fff; border: 1px solid #3a3a3a; border-radius: 4px; font-size: 14px;">
+                        <option value="">Loading weak model files...</option>
+                    </select>
+                </div>
+
+                <div>
+                    <label style="display: block; color: #aaa; margin-bottom: 8px; font-size: 14px;">Number of Samples:</label>
+                    <input type="number" id="gen-num-samples" value="3" min="1" max="10"
+                        style="width: 100%; padding: 10px; background: #2a2a2a; color: #fff; border: 1px solid #3a3a3a; border-radius: 4px; font-size: 14px;">
+                    <div style="color: #666; font-size: 12px; margin-top: 5px;">Max: 10 (for context limits)</div>
+                </div>
+            </div>
+
+            <div style="margin-bottom: 20px;">
+                <label style="display: block; color: #aaa; margin-bottom: 8px; font-size: 14px;">Generation Model:</label>
+                <input type="text" id="gen-model" value="openai/gpt-5"
+                    style="width: 100%; padding: 10px; background: #2a2a2a; color: #fff; border: 1px solid #3a3a3a; border-radius: 4px; font-size: 14px;"
+                    placeholder="e.g., openai/gpt-5, anthropic/claude-3.5-sonnet">
+                <div style="color: #666; font-size: 12px; margin-top: 5px;">OpenRouter model to use for generating the prompt</div>
+            </div>
+
+            <!-- Meta-Prompt -->
+            <div style="margin-bottom: 25px;">
+                <label style="display: block; color: #aaa; margin-bottom: 8px; font-size: 14px;">Meta-Prompt (edit as needed):</label>
+                <textarea id="gen-meta-prompt"
+                    style="width: 100%; min-height: 250px; padding: 10px; background: #2a2a2a; color: #fff; border: 1px solid #3a3a3a; border-radius: 4px; font-size: 13px; font-family: 'Courier New', monospace; resize: vertical;"></textarea>
+                <div style="color: #666; font-size: 12px; margin-top: 5px;">
+                    This prompt will be sent to the generation model along with sample data
+                </div>
+            </div>
+
+            <!-- Actions -->
+            <div style="display: flex; gap: 10px; margin-bottom: 25px;">
+                <button onclick="generatePrompt()" style="padding: 10px 20px; background: #6a4a7e; color: white; border: none; border-radius: 4px; cursor: pointer; font-weight: 500;">
+                    Generate Prompt
+                </button>
+                <button onclick="closePromptGenerator()" style="padding: 10px 20px; background: #5a2a2a; color: white; border: none; border-radius: 4px; cursor: pointer;">
+                    Close
+                </button>
+            </div>
+
+            <!-- Generated Output -->
+            <div id="gen-output-section" style="display: none;">
+                <h3 style="color: #4a9eff; margin-bottom: 15px;">Generated Judge Prompt</h3>
+                <textarea id="gen-output" readonly
+                    style="width: 100%; min-height: 300px; padding: 15px; background: #0f0f0f; color: #4a9eff; border: 1px solid #4a9eff; border-radius: 4px; font-size: 13px; font-family: 'Courier New', monospace; resize: vertical;"></textarea>
+                <div style="margin-top: 10px;">
+                    <button onclick="copyGeneratedPrompt()" style="padding: 8px 16px; background: #2a7c4a; color: white; border: none; border-radius: 4px; cursor: pointer;">
+                        Copy to Clipboard
+                    </button>
+                </div>
+            </div>
+        </div>
+    </div>
+
     <script>
         let judges = [];
         let editingIndex = null;
@@ -489,6 +563,113 @@ Respond in JSON format: {'correct': true} or {'correct': false}`
                 console.log('Not changing prompt - user has edited it');
             }
         });
+
+        // === PROMPT GENERATOR ===
+
+        const DEFAULT_META_PROMPT = `You are an expert at creating evaluation prompts for judging AI model outputs. I'm building a specialized judge prompt to evaluate the quality/similarity of weak model outputs compared to strong reference model outputs.
+
+I will show you some sample data below. Each sample contains:
+- A question/input
+- The strong reference model's output (ground truth)
+- The weak model's output (what we're evaluating)
+
+Your task: Create a specialized, detailed judge prompt that can be used to systematically evaluate the delta/difference between these outputs. The prompt should:
+1. Be specific to the patterns you see in this data
+2. Include clear evaluation criteria
+3. Be written in second-person ("You are...")
+4. Include the placeholders {question}, {strong_output}, and {weak_output}
+5. Specify the exact JSON format to return (either {'score': number} for scalar or {'correct': boolean} for boolean)
+
+Sample Data:
+{SAMPLES}
+
+Based on these samples, create a specialized judge prompt that would effectively evaluate this type of data. Return ONLY the judge prompt text, nothing else.`;
+
+        async function openPromptGenerator() {
+            // Load weak model files
+            try {
+                const response = await fetch('/list_weak_models');
+                const data = await response.json();
+                const select = document.getElementById('gen-weak-model-select');
+
+                if (data.files && data.files.length > 0) {
+                    select.innerHTML = data.files.map(f =>
+                        `<option value="${f.filename}">${f.weak_model || f.filename}</option>`
+                    ).join('');
+                } else {
+                    select.innerHTML = '<option value="">No weak model files available</option>';
+                }
+            } catch (error) {
+                console.error('Error loading weak models:', error);
+            }
+
+            // Set default meta-prompt
+            document.getElementById('gen-meta-prompt').value = DEFAULT_META_PROMPT;
+
+            // Show panel
+            document.getElementById('prompt-generator-panel').style.display = 'block';
+            document.getElementById('gen-output-section').style.display = 'none';
+        }
+
+        function closePromptGenerator() {
+            document.getElementById('prompt-generator-panel').style.display = 'none';
+        }
+
+        async function generatePrompt() {
+            const weakModelFile = document.getElementById('gen-weak-model-select').value;
+            const numSamples = parseInt(document.getElementById('gen-num-samples').value) || 3;
+            const model = document.getElementById('gen-model').value.trim();
+            const metaPrompt = document.getElementById('gen-meta-prompt').value.trim();
+
+            if (!weakModelFile) {
+                alert('Please select a weak model dataset');
+                return;
+            }
+
+            if (!model) {
+                alert('Please enter a generation model');
+                return;
+            }
+
+            if (!metaPrompt) {
+                alert('Please enter a meta-prompt');
+                return;
+            }
+
+            try {
+                const response = await fetch('/generate_judge_prompt', {
+                    method: 'POST',
+                    headers: { 'Content-Type': 'application/json' },
+                    body: JSON.stringify({
+                        weak_model_file: weakModelFile,
+                        num_samples: numSamples,
+                        model: model,
+                        meta_prompt: metaPrompt
+                    })
+                });
+
+                if (!response.ok) {
+                    throw new Error('Failed to generate prompt');
+                }
+
+                const result = await response.json();
+
+                // Display generated prompt
+                document.getElementById('gen-output').value = result.generated_prompt;
+                document.getElementById('gen-output-section').style.display = 'block';
+
+            } catch (error) {
+                alert('Error generating prompt: ' + error.message);
+                console.error('Generation error:', error);
+            }
+        }
+
+        function copyGeneratedPrompt() {
+            const output = document.getElementById('gen-output');
+            output.select();
+            document.execCommand('copy');
+            alert('Prompt copied to clipboard!');
+        }
     </script>
 </body>
 </html>
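The meta-prompt above constrains what a generated judge prompt should look like: second person, the three placeholders, and an explicit JSON return format. A hypothetical example satisfying that contract (illustrative only; the real output depends on the model and the samples shown to it):

```python
# Hypothetical generated judge prompt; the placeholders are filled in by /test_judge.
EXAMPLE_JUDGE_PROMPT = """You are grading how closely a weak model's answer matches a strong reference answer.

Question:
{question}

Reference answer (strong model):
{strong_output}

Candidate answer (weak model):
{weak_output}

Evaluate factual agreement and completeness of the candidate against the reference.
Respond only with JSON in the form {'score': <number between 0 and 1>}."""
```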