quickdistill 0.1.7-py3-none-any.whl → 0.1.9-py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
quickdistill/__init__.py CHANGED
@@ -8,7 +8,18 @@ This package provides tools to:
 - Export datasets for model evaluation
 """
 
-__version__ = "0.1.7"
+# Monkey patch for aiohttp/litellm compatibility
+# litellm expects aiohttp.ConnectionTimeoutError but it doesn't exist in some versions
+try:
+    import aiohttp
+    if not hasattr(aiohttp, 'ConnectionTimeoutError'):
+        aiohttp.ConnectionTimeoutError = aiohttp.ServerTimeoutError
+    if not hasattr(aiohttp, 'SocketTimeoutError'):
+        aiohttp.SocketTimeoutError = aiohttp.ServerTimeoutError
+except Exception:
+    pass
+
+__version__ = "0.1.9"
 __author__ = "Brett Young"
 __email__ = "bdytx5@umsystem.edu"
 
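The shim aliases exception names that newer litellm builds reference but some aiohttp releases don't define; nothing else changes at import time. A minimal sketch of the effect (illustrative only, not part of the package):

    # Importing quickdistill runs the shim, so the attribute resolves
    # even on aiohttp versions that lack it natively.
    import quickdistill
    import aiohttp

    print(hasattr(aiohttp, 'ConnectionTimeoutError'))  # True either way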
quickdistill/default_judges.json CHANGED
@@ -2,14 +2,14 @@
   {
     "name": "boolean_scorer",
     "type": "llm",
-    "model": "gpt-5",
+    "model": "openai/gpt-5",
     "returnType": "boolean",
     "prompt": "You are a strict evaluator comparing two AI responses (one from a strong reference model which is the ground truth, and one from a weaker model which we are testing to see how similar the responses it generates are to the strong model).\n\nStrong Model Response: {strong_output}\nWeak Model Response: {weak_output}\n\nDetermine if the weak model response is CORRECT compared to the strong model response.\nConsider a response CORRECT if it conveys the same key information and meaning, even if worded differently.\n\nRespond in JSON format: {'correct': true} or {'correct': false}"
   },
   {
     "name": "scalar_scorer",
     "type": "llm",
-    "model": "gpt-5",
+    "model": "openai/gpt-5",
     "returnType": "scalar",
     "prompt": "You are a strict evaluator comparing two AI responses (one from a strong reference model which is the ground truth, and one from a weaker model which we are testing to see how similar the responses it generates are to the strong model).\n\nStrong Model Response: {strong_output}\nWeak Model Response: {weak_output}\n\nEvaluate how similar the weak model response is to the strong model response.\nRate on a scale of 1-5 where 1=completely different and 5=nearly identical. RETURN ONLY ONE SCORE REPRESENTY THE AVERAGE SIMILARITY (EG 5-(avg_error))\n\nRespond in JSON format eg {'scores': the_score }"
   }
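Both default judges now name their model in LiteLLM's provider-prefixed form, so judge calls can be routed to any provider LiteLLM supports. A hedged sketch of how such a model string is typically consumed (the call shown is illustrative, not quickdistill's internal code):

    # Assumes litellm is installed and OPENAI_API_KEY is set in the environment.
    import litellm

    resp = litellm.completion(
        model="openai/gpt-5",  # provider/model, as stored in default_judges.json
        messages=[{"role": "user", "content": "Say OK."}],
    )
    print(resp.choices[0].message.content)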
quickdistill/server.py CHANGED
@@ -100,40 +100,133 @@ def run_inference(client, model, messages, max_tokens=1000):
         return f"ERROR: {str(e)}"
 
 
 def extract_output_content(output_str):
-    """Extract actual content from WeaveObject string or regular output"""
+    """Extract actual content from WeaveObject string, JSON response, or regular output.
+
+    Handles outputs from:
+    - OpenAI chat.completions.create (plain text)
+    - OpenAI responses.create (JSON with nested structure)
+    - Anthropic Messages (WeaveObject with content[0].text)
+    - Google Gemini (WeaveObject with candidates[0].content.parts[0].text)
+    """
+    import re
+    import json
+
     if not output_str:
         return None
 
-    # If it's a WeaveObject string, try to extract the text content
-    if isinstance(output_str, str) and 'WeaveObject' in output_str:
-        import re
-        # Try to find the 'text' field in the WeaveObject
-        match = re.search(r"'text':\s*'([^']*(?:\\'[^']*)*)'", output_str)
+    if not isinstance(output_str, str):
+        return str(output_str)
+
+    # Handle empty/streaming responses
+    if output_str in ('', 'None', 'null'):
+        return '[Streaming output - not captured]'
+
+    # Handle OpenAI responses.create JSON format
+    if output_str.startswith('{') and '"output"' in output_str:
+        try:
+            resp_obj = json.loads(output_str)
+            if 'output' in resp_obj and isinstance(resp_obj['output'], list):
+                # Extract text from output messages
+                text_parts = []
+                for item in resp_obj['output']:
+                    if item.get('type') == 'message' and 'content' in item:
+                        for content in item['content']:
+                            if content.get('type') == 'output_text' and 'text' in content:
+                                text_parts.append(content['text'])
+                if text_parts:
+                    return '\n\n'.join(text_parts)
+        except (json.JSONDecodeError, KeyError, TypeError):
+            pass  # Fall through to other handlers
+
+    # Handle WeaveObject strings (Anthropic, Gemini)
+    if 'WeaveObject' in output_str:
+        # Improved regex that handles escape sequences properly
+        match = re.search(r"'text':\s*'((?:[^'\\]|\\.)*)'", output_str, re.DOTALL)
         if match:
-            # Unescape the string
+            # Unescape the string properly (order matters!)
             text = match.group(1)
-            text = text.replace('\\n', '\n').replace("\\'", "'").replace('\\\\', '\\')
+            text = text.replace("\\'", "'")    # escaped single quotes
+            text = text.replace('\\"', '"')    # escaped double quotes
+            text = text.replace('\\n', '\n')   # newlines
+            text = text.replace('\\t', '\t')   # tabs
+            text = text.replace('\\r', '\r')   # carriage returns
+            text = text.replace('\\\\', '\\')  # escaped backslashes (do this last!)
            return text
 
-    # Otherwise return as-is
+        # If no text field found, return truncated version
+        return f"[Complex WeaveObject - could not extract text]\n{output_str[:500]}..."
+
+    # Plain text output (standard OpenAI chat format)
     return output_str
 
 
 def extract_messages_from_trace(trace):
-    """Extract messages from a trace in the format needed for inference"""
-    # Check if messages are at top level
+    """Extract messages from a trace in the format needed for inference.
+
+    Handles message extraction from:
+    - OpenAI chat.completions.create (messages at top level or in inputs.messages)
+    - OpenAI responses.create (inputs.input field)
+    - Anthropic Messages (inputs.messages)
+    - Google Gemini generate_content (inputs.contents array)
+    - Google Gemini Chat.send_message (inputs.message string)
+    """
+    import re
+
+    # Get op_display_name for provider detection
+    op_name = trace.get('op_display_name', '')
+
+    # Check if messages are at top level (already extracted/cached)
     if trace.get('messages') and isinstance(trace['messages'], list) and len(trace['messages']) > 0:
         return trace['messages']
 
     # Check if messages are in inputs
     if trace.get('inputs') and isinstance(trace['inputs'], dict):
-        messages = trace['inputs'].get('messages', [])
+        inputs = trace['inputs']
+
+        # Standard OpenAI/Anthropic: inputs.messages
+        messages = inputs.get('messages', [])
         if isinstance(messages, list) and len(messages) > 0:
             return messages
 
+        # OpenAI responses.create: inputs.input (simple string)
+        if 'openai.responses' in op_name and 'input' in inputs:
+            return [{"role": "user", "content": inputs['input']}]
+
+        # Gemini Chat.send_message: inputs.message (simple string)
+        if 'Chat.send_message' in op_name and 'message' in inputs:
+            return [{"role": "user", "content": inputs['message']}]
+
+        # Gemini generate_content: inputs.contents (array of content objects or WeaveObject strings)
+        if 'google.genai' in op_name and 'contents' in inputs:
+            contents = inputs['contents']
+            if isinstance(contents, list) and len(contents) > 0:
+                messages = []
+                for content in contents:
+                    # Handle WeaveObject string format
+                    if isinstance(content, str) and 'WeaveObject' in content:
+                        role_match = re.search(r"'role':\s*'(\w+)'", content)
+                        text_match = re.search(r"'text':\s*'((?:[^'\\]|\\.)*)'", content, re.DOTALL)
+                        text = '[Complex content]'
+                        if text_match:
+                            text = text_match.group(1)
+                            text = text.replace("\\'", "'").replace('\\n', '\n').replace('\\\\', '\\')
+                        messages.append({
+                            "role": role_match.group(1) if role_match else "user",
+                            "content": text
+                        })
+                    # Handle regular dict format
+                    elif isinstance(content, dict):
+                        role = content.get('role', 'user')
+                        parts = content.get('parts', [])
+                        if isinstance(parts, list):
+                            text = '\n'.join([p.get('text', '') for p in parts if isinstance(p, dict)])
+                            messages.append({"role": role, "content": text})
+                if messages:
+                    return messages
+
         # Check if inputs has question/context format (from generate_test_traces.py wrapper traces)
-        question = trace['inputs'].get('question')
-        context = trace['inputs'].get('context')
+        question = inputs.get('question')
+        context = inputs.get('context')
         if question:
             if context:
                 prompt = f"""Based on the following context, answer the question concisely.
@@ -753,16 +846,26 @@ def delete_judge():
 
 @app.route('/run_evaluation', methods=['POST'])
 def run_evaluation_endpoint():
-    """Run evaluation using specified judge"""
-
+    """Run evaluation using specified judge(s) - supports multiple judges"""
+
 
     data = request.json
     model_file = data.get('model_file')
-    judge = data.get('judge')
+    judges = data.get('judges')  # Can be a list or single judge dict
     task_id = data.get('task_id', f"eval_{id(data)}")
 
-    if not model_file or not judge:
-        return jsonify({'error': 'Missing model_file or judge'}), 400
+    # Handle both single judge (backwards compat) and multiple judges
+    if data.get('judge'):
+        judges = [data.get('judge')]
+    elif not judges:
+        return jsonify({'error': 'Missing judge or judges'}), 400
+
+    # Ensure judges is a list
+    if not isinstance(judges, list):
+        judges = [judges]
+
+    if not model_file:
+        return jsonify({'error': 'Missing model_file'}), 400
 
     # Load weak model results
     model_path = DATA_DIR / model_file
@@ -782,18 +885,22 @@ def run_evaluation_endpoint():
     # Extract model name from filename
     model_name = model_file.replace('weak_model_', '').replace('.json', '')
 
+    # Create evaluation name with all judges
+    judges_names = '_'.join([j['name'] for j in judges])
+    eval_name = f"eval-{model_name}-{judges_names}"
+
     # Initialize progress tracking
     total_steps = len(results)
     progress_state[task_id] = {
         'current': 0,
         'total': total_steps,
-        'message': f'Starting evaluation: {model_name} with {judge["name"]}...',
+        'message': f'Starting evaluation: {model_name} with {len(judges)} judge(s)...',
         'status': 'running'
     }
 
     # Create evaluation logger
     ev = weave.EvaluationLogger(
-        name=f"eval-{model_name}-{judge['name']}",
+        name=eval_name,
         model=model_name
     )
 
@@ -818,13 +925,20 @@ def run_evaluation_endpoint():
         if messages and len(messages) > 0:
             question = messages[0].get('content', '')
 
-        # Run judge
-        if judge['type'] == 'llm':
-            scores = run_llm_judge_eval(judge, strong_output, weak_output, question)
-        else:
-            scores = run_custom_judge_eval(judge, strong_output, weak_output)
+        # Run all judges and collect scores
+        all_scores = {}
+        for judge in judges:
+            # Run judge
+            if judge['type'] == 'llm':
+                scores = run_llm_judge_eval(judge, strong_output, weak_output, question)
+            else:
+                scores = run_custom_judge_eval(judge, strong_output, weak_output)
+
+            # Merge scores with judge name prefix to avoid conflicts
+            for score_key, score_value in scores.items():
+                all_scores[f"{judge['name']}_{score_key}"] = score_value
 
-        # Log to weave
+        # Log to weave with all scores from all judges
         ev.log_example(
             inputs={
                 "question": question,
@@ -834,7 +948,7 @@
                 "weak_output": weak_output
 
             },
-            scores=scores
+            scores=all_scores
         )
 
     # Finish evaluation
@@ -850,10 +964,11 @@ def run_evaluation_endpoint():
 
     return jsonify({
         'status': 'success',
-        'evaluation_name': f"eval-{model_name}-{judge['name']}",
+        'evaluation_name': eval_name,
         'examples_evaluated': len(results),
         'weave_url': ev.ui_url,
         'strong_export': strong_export,
+        'judges': [j['name'] for j in judges],
        'task_id': task_id
     })
 
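With this change a single request fans out to every judge in the list, and each example's scores come back keyed as "<judge name>_<score key>" (e.g. boolean_scorer_correct), so judges with identical score keys can't collide. A hypothetical client call against the new endpoint (the base URL and file name are placeholders; the old single-"judge" payload still works):

    # Assumes the server is running locally and two judge dicts were fetched from /list_judges.
    import requests

    resp = requests.post("http://localhost:5000/run_evaluation", json={
        "model_file": "weak_model_gpt-4o-mini.json",  # hypothetical weak-model results file
        "judges": [boolean_scorer, scalar_scorer],    # judge dicts from the judge manager
        "task_id": "eval_demo",
    })
    print(resp.json()["evaluation_name"], resp.json()["judges"])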
@@ -1032,6 +1147,32 @@ def list_projects():
     return jsonify({'projects': projects})
 
 
+@app.route('/get_preferences', methods=['GET'])
+def get_preferences():
+    """Get saved user preferences"""
+    prefs_file = DATA_DIR / 'preferences.json'
+    if prefs_file.exists():
+        try:
+            with open(prefs_file, 'r') as f:
+                return jsonify(json.load(f))
+        except:
+            pass
+    return jsonify({})
+
+
+@app.route('/save_preferences', methods=['POST'])
+def save_preferences():
+    """Save user preferences"""
+    try:
+        data = request.json
+        prefs_file = DATA_DIR / 'preferences.json'
+        with open(prefs_file, 'w') as f:
+            json.dump(data, f, indent=2)
+        return jsonify({'status': 'success'})
+    except Exception as e:
+        return jsonify({'status': 'error', 'message': str(e)}), 500
+
+
 # Routes for serving HTML pages
 @app.route('/')
 def index():
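Preferences persist as a flat JSON file under DATA_DIR, so a round trip is just two requests. A quick sketch (base URL is a placeholder):

    # Save, then read back, the viewer's last-used project.
    import requests

    base = "http://localhost:5000"
    requests.post(f"{base}/save_preferences", json={"lastProject": "byyoung3/arena-detailed"})
    print(requests.get(f"{base}/get_preferences").json())  # {'lastProject': 'byyoung3/arena-detailed'}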
quickdistill/static/judge_manager.html CHANGED
@@ -183,12 +183,10 @@
 
       <div id="llm-options" style="display: block;">
         <label for="judge-model">Model</label>
-        <select id="judge-model">
-          <option value="gpt-5">gpt-5</option>
-          <option value="gpt-4o">gpt-4o</option>
-          <option value="gpt-4o-mini">gpt-4o-mini</option>
-          <option value="claude-3-5-sonnet-20241022">claude-3-5-sonnet</option>
-        </select>
+        <input type="text" id="judge-model" placeholder="e.g., openai/gpt-5, anthropic/claude-3.5-sonnet" value="openai/gpt-5">
+        <p style="color: #888; font-size: 12px; margin-top: 5px; margin-bottom: 15px;">
+          <strong>Note:</strong> Uses LiteLLM format. Examples: <code>openai/gpt-5</code>, <code>anthropic/claude-3.5-sonnet</code>, <code>openai/gpt-4o</code>
+        </p>
 
         <label for="judge-return-type">Return Type</label>
         <select id="judge-return-type">
@@ -393,10 +391,16 @@ Respond in JSON format: {'correct': true} or {'correct': false}`
      };
 
      if (type === 'llm') {
-        judge.model = document.getElementById('judge-model').value;
+        judge.model = document.getElementById('judge-model').value.trim();
        judge.returnType = document.getElementById('judge-return-type').value;
        judge.prompt = document.getElementById('judge-prompt').value.trim();
 
+        // Validate model
+        if (!judge.model) {
+          alert('Error: Please enter a model (e.g., openai/gpt-5)');
+          return;
+        }
+
        // Validate required placeholders
        if (!judge.prompt.includes('{strong_output}')) {
          alert('Error: Judge prompt must include {strong_output} placeholder');
@@ -420,7 +424,7 @@ Respond in JSON format: {'correct': true} or {'correct': false}`
    function resetForm() {
      document.getElementById('judge-name').value = '';
      document.getElementById('judge-type').value = 'llm';
-      document.getElementById('judge-model').value = 'gpt-5-2025-08-07';
+      document.getElementById('judge-model').value = 'openai/gpt-5';
      document.getElementById('judge-prompt').value = '';
      document.getElementById('form-title').textContent = 'Create New Judge';
      document.getElementById('save-btn').textContent = 'Save Judge';
quickdistill/static/trace_viewer.html CHANGED
@@ -43,10 +43,21 @@
      padding: 20px;
      border-radius: 8px;
      margin-bottom: 20px;
-      display: flex;
+    }
+
+    .filter-row {
+      display: grid;
+      grid-template-columns: auto 1fr auto 1fr auto auto;
      gap: 15px;
-      align-items: center;
-      flex-wrap: wrap;
+      align-items: start;
+      margin-bottom: 20px;
+    }
+
+    .filter-group {
+      display: flex;
+      flex-direction: column;
+      gap: 8px;
+      min-width: 250px;
    }
 
    .controls label {
@@ -283,58 +294,75 @@
    </div>
 
    <div class="controls">
-      <div style="display: flex; flex-direction: column; gap: 4px;">
-        <label for="op-filter">Filter by Operation:</label>
-        <span style="color: #666; font-size: 11px;">Primary supported: openai.chat.completions.create</span>
-      </div>
-      <select id="op-filter">
-        <option value="all">All Operations</option>
-      </select>
-
-      <label for="model-filter">Filter by Model:</label>
-      <select id="model-filter">
-        <option value="all">All Models</option>
-      </select>
-
-      <button id="select-all-btn" style="margin-left: 20px; padding: 8px 16px; background: #2a7c4a; color: white; border: none; border-radius: 4px; cursor: pointer;">
-        Select All Filtered
-      </button>
-
-      <button id="export-btn" style="padding: 8px 16px; background: #4a9eff; color: white; border: none; border-radius: 4px; cursor: pointer;">
-        Export Selected to Test Set (<span id="selected-count">0</span>)
-      </button>
+      <!-- Filters Row -->
+      <div class="filter-row">
+        <div class="filter-group">
+          <label for="op-filter">Operation Filter:</label>
+          <select id="op-filter" style="background: #2a2a2a; color: #fff; border: 1px solid #3a3a3a; padding: 8px 12px; border-radius: 4px; font-size: 14px; cursor: pointer;">
+            <option value="all">All Operations</option>
+          </select>
+          <span style="color: #4a9eff; font-size: 11px; font-weight: 500;">✅ Fully supported: OpenAI (chat.completions, responses), Anthropic (Messages), Google Gemini (generate_content, Chat)</span>
+        </div>
 
-      <button id="open-inference-btn" style="padding: 8px 16px; background: #7c4a9e; color: white; border: none; border-radius: 4px; cursor: pointer;">
-        Run Weak Models
-      </button>
+        <div class="filter-group">
+          <label for="model-filter">Model Filter:</label>
+          <select id="model-filter" style="background: #2a2a2a; color: #fff; border: 1px solid #3a3a3a; padding: 8px 12px; border-radius: 4px; font-size: 14px; cursor: pointer;">
+            <option value="all">All Models</option>
+          </select>
+        </div>
 
-      <button id="open-eval-btn" style="padding: 8px 16px; background: #9e6a4a; color: white; border: none; border-radius: 4px; cursor: pointer;">
-        Run Evaluation
-      </button>
+        <button id="select-all-btn" style="padding: 8px 16px; background: #2a7c4a; color: white; border: none; border-radius: 4px; cursor: pointer; align-self: end; white-space: nowrap;">
+          Select All
+        </button>
 
-      <a href="/judge" target="_blank" style="padding: 8px 16px; background: #4a5a9e; color: white; border: none; border-radius: 4px; text-decoration: none; display: inline-block;">
-        Manage Judges
-      </a>
+        <div style="display: flex; flex-direction: column; gap: 4px; align-self: end;">
+          <div style="color: #888; font-size: 13px;">Total: <span id="total-count" style="color: #fff; font-weight: 600;">0</span></div>
+          <div style="color: #888; font-size: 13px;">Shown: <span id="shown-count" style="color: #4a9eff; font-weight: 600;">0</span></div>
+        </div>
+      </div>
 
-      <button id="open-test-judge-btn" style="padding: 8px 16px; background: #6a4a7e; color: white; border: none; border-radius: 4px; cursor: pointer;">
-        Test Judges
-      </button>
+      <!-- Action Buttons Row -->
+      <div style="display: grid; grid-template-columns: 2fr 1fr; gap: 20px;">
+        <!-- Main Workflow -->
+        <div style="padding: 15px; background: #0f1f0f; border-radius: 8px; border: 2px solid #2a4a2a;">
+          <div style="color: #6dd36d; font-size: 13px; font-weight: 600; margin-bottom: 12px;">📋 MANUAL WORKFLOW</div>
+          <div style="display: flex; flex-wrap: wrap; gap: 10px;">
+            <button id="export-btn" style="padding: 10px 16px; background: #4a9eff; color: white; border: none; border-radius: 4px; cursor: pointer; font-weight: 500;">
+              1. Export Test Set (<span id="selected-count">0</span>)
+            </button>
+            <button id="open-inference-btn" style="padding: 10px 16px; background: #7c4a9e; color: white; border: none; border-radius: 4px; cursor: pointer; font-weight: 500;">
+              2. Run Weak Models
+            </button>
+            <button id="open-eval-btn" style="padding: 10px 16px; background: #9e6a4a; color: white; border: none; border-radius: 4px; cursor: pointer; font-weight: 500;">
+              3. Evaluate Results
+            </button>
+          </div>
+        </div>
 
-      <button id="open-settings-btn" style="padding: 8px 16px; background: #5a5a5a; color: white; border: none; border-radius: 4px; cursor: pointer;">
-        Settings
-      </button>
+        <!-- Utilities -->
+        <div style="padding: 15px; background: #1a1a2a; border-radius: 8px; border: 1px solid #2a2a3a;">
+          <div style="color: #aaa; font-size: 13px; font-weight: 600; margin-bottom: 12px;">⚙️ TOOLS</div>
+          <div style="display: flex; flex-wrap: wrap; gap: 8px;">
+            <a href="/judge" target="_blank" style="padding: 8px 14px; background: #4a5a9e; color: white; border: none; border-radius: 4px; text-decoration: none; display: inline-block; font-size: 13px;">
+              Judges
+            </a>
+            <button id="open-test-judge-btn" style="padding: 8px 14px; background: #6a4a7e; color: white; border: none; border-radius: 4px; cursor: pointer; font-size: 13px;">
+              Test Judge
+            </button>
+            <button id="open-settings-btn" style="padding: 8px 14px; background: #5a5a5a; color: white; border: none; border-radius: 4px; cursor: pointer; font-size: 13px;">
+              Settings
+            </button>
+          </div>
+        </div>
+      </div>
 
-      <div style="margin: 20px 0; padding: 15px; background: #2a1a2a; border-radius: 8px; border: 1px solid #4a2a4a;">
-        <div style="color: #aaa; font-size: 13px; margin-bottom: 10px;">Automatic Workflow:</div>
-        <button id="open-e2e-btn" style="padding: 10px 20px; background: #7a4a9e; color: white; border: none; border-radius: 4px; cursor: pointer; font-weight: 500;">
+      <!-- Automatic Workflow -->
+      <div style="margin-top: 20px; padding: 15px; background: #2a1a2a; border-radius: 8px; border: 2px solid #7a4a9e;">
+        <div style="color: #bb88ff; font-size: 13px; font-weight: 600; margin-bottom: 10px;">⚡ AUTOMATIC WORKFLOW</div>
+        <button id="open-e2e-btn" style="padding: 12px 20px; background: #7a4a9e; color: white; border: none; border-radius: 4px; cursor: pointer; font-weight: 600; font-size: 14px;">
          ⚡ Run End-to-End Test
        </button>
-        <div style="color: #666; font-size: 11px; margin-top: 8px;">Export → Generate → Evaluate (all in one)</div>
-      </div>
-
-      <div class="stats">
-        <div>Total: <span id="total-count">0</span></div>
-        <div>Shown: <span id="shown-count">0</span></div>
+        <div style="color: #888; font-size: 11px; margin-top: 8px;">Export → Generate → Evaluate (all in one)</div>
      </div>
    </div>
 
@@ -412,10 +440,10 @@
      </div>
 
      <div style="margin-bottom: 20px;">
-        <label style="color: #aaa; display: block; margin-bottom: 10px;">Select Judge:</label>
-        <select id="eval-judge" style="width: 100%; padding: 8px; background: #2a2a2a; color: #fff; border: 1px solid #3a3a3a; border-radius: 4px;">
-          <!-- Judges populated dynamically -->
-        </select>
+        <label style="color: #aaa; display: block; margin-bottom: 10px;">Select Judge(s) - you can select multiple:</label>
+        <div id="eval-judge-list" style="max-height: 200px; overflow-y: auto; background: #0f0f0f; padding: 15px; border-radius: 4px;">
+          <!-- Judges populated dynamically as checkboxes -->
+        </div>
        <div style="color: #666; font-size: 12px; margin-top: 5px;">
          <a href="/judge" target="_blank" style="color: #4a9eff;">Create/manage judges</a>
        </div>
@@ -511,9 +539,9 @@
        <label style="display: block; color: #aaa; margin-bottom: 8px; font-size: 14px;">Judge Model:</label>
        <input type="text" id="test-judge-model"
               style="width: 100%; padding: 10px; background: #2a2a2a; color: #fff; border: 1px solid #3a3a3a; border-radius: 4px; font-size: 14px;"
-               placeholder="e.g., gpt-4o, claude-3-5-sonnet-20241022">
+               placeholder="e.g., openai/gpt-5, anthropic/claude-3.5-sonnet">
        <div style="color: #666; font-size: 12px; margin-top: 5px;">
-          Override the judge's model for this test
+          Override the judge's model for this test. Uses LiteLLM format (e.g., <code style="color: #aaa;">openai/gpt-5</code>, <code style="color: #aaa;">anthropic/claude-3.5-sonnet</code>)
        </div>
      </div>
 
@@ -586,10 +614,10 @@
 
      <!-- Judge Selection -->
      <div style="margin-bottom: 30px;">
-        <h3 style="color: #fff; font-size: 16px; margin-bottom: 15px;">2. Select Judge</h3>
-        <select id="e2e-judge" style="width: 100%; padding: 10px; background: #2a2a2a; color: #fff; border: 1px solid #3a3a3a; border-radius: 4px; font-size: 14px;">
-          <option value="">Loading judges...</option>
-        </select>
+        <h3 style="color: #fff; font-size: 16px; margin-bottom: 15px;">2. Select Judges</h3>
+        <div id="e2e-judge-list" style="max-height: 200px; overflow-y: auto; background: #2a2a2a; border: 1px solid #3a3a3a; border-radius: 4px; padding: 10px;">
+          <p style="color: #888;">Loading judges...</p>
+        </div>
      </div>
 
      <!-- Actions -->
@@ -648,6 +676,17 @@
      "Qwen/Qwen3-Coder-480B-A35B-Instruct",
    ];
 
+    const SUPPORTED_OPS = [
+      'openai.chat.completions.create',
+      'openai.responses.create',
+      'anthropic.Messages.create',
+      'anthropic.Messages.stream',
+      'google.genai.models.Models.generate_content',
+      'google.genai.models.Models.generate_content_stream',
+      'google.genai.chats.Chat.send_message',
+      'google.genai.chats.Chat.send_message_stream'
+    ];
+
    let allTraces = [];
    let currentOpFilter = 'all';
    let currentModelFilter = 'all';
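The filters below match SUPPORTED_OPS by substring (`opDisplayName.includes(op)`), so display names carrying extra qualifiers still pass. A Python mirror of the check, for illustration only (the shipped logic is the JavaScript in this file):

    # Abbreviated list; see SUPPORTED_OPS above for the full set.
    SUPPORTED_OPS = [
        'openai.chat.completions.create',
        'openai.responses.create',
        'anthropic.Messages.create',
    ]

    def is_supported(op_display_name):
        # Substring match, like the client-side filter.
        return any(op in (op_display_name or '') for op in SUPPORTED_OPS)

    print(is_supported('openai.responses.create'))  # True
    print(is_supported('my_wrapper_fn'))            # False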
@@ -659,6 +698,10 @@
    // Load projects list
    async function loadProjects() {
      try {
+        // Load saved preferences
+        const prefsResponse = await fetch('/get_preferences');
+        const prefs = await prefsResponse.json();
+
        const response = await fetch('/list_projects');
        const data = await response.json();
        const select = document.getElementById('project-select');
@@ -670,11 +713,23 @@
          `<option value="${p.name}">${p.name} (${p.trace_count} traces)</option>`
        ).join('');
 
-        // Auto-select first project and load it
-        if (data.projects.length > 0) {
-          currentProject = data.projects[0].name;
-          select.value = currentProject;
-          await loadTraces(currentProject);
+        // Use saved project or first project
+        let projectToLoad = prefs.lastProject || data.projects[0].name;
+
+        // Check if saved project still exists
+        const projectExists = data.projects.some(p => p.name === projectToLoad);
+        if (!projectExists) {
+          projectToLoad = data.projects[0].name;
+        }
+
+        currentProject = projectToLoad;
+        select.value = currentProject;
+        await loadTraces(currentProject);
+
+        // Set default filter to "All Supported Ops"
+        if (!prefs.lastOpFilter) {
+          currentOpFilter = 'supported';
+          document.getElementById('op-filter').value = 'supported';
        }
      }
      } catch (e) {
@@ -682,6 +737,153 @@
      }
    }
 
+    // Patch traces to handle different provider formats
+    function patchTracesForProviders(traces) {
+      return traces.map(trace => {
+        const patched = { ...trace };
+
+        // Extract provider from op_display_name
+        const opName = trace.op_display_name || '';
+
+        // === PARSE WEAVEOBJECT OUTPUTS ===
+        if (patched.output && typeof patched.output === 'string') {
+          // Check if it's a streaming operation (empty or None)
+          if (patched.output === '' || patched.output === 'None' || patched.output === 'null') {
+            if (opName.includes('stream') || opName.includes('Stream')) {
+              patched.output = '[Streaming output - not captured in trace]';
+            }
+          }
+          // Parse WeaveObject strings
+          else if (patched.output.startsWith('WeaveObject(')) {
+            patched.output = extractFromWeaveObject(patched.output, opName);
+          }
+          // Parse OpenAI responses.create JSON output
+          else if (opName.includes('openai.responses.create')) {
+            try {
+              const respObj = JSON.parse(patched.output);
+              if (respObj.output && Array.isArray(respObj.output)) {
+                // Extract text from output messages
+                const textParts = respObj.output
+                  .filter(item => item.type === 'message')
+                  .flatMap(msg => msg.content || [])
+                  .filter(c => c.type === 'output_text')
+                  .map(c => c.text);
+                patched.output = textParts.join('\n\n') || JSON.stringify(respObj, null, 2);
+              }
+            } catch (e) {
+              // Keep original if parsing fails
+            }
+          }
+        }
+
+        // === EXTRACT MESSAGES FOR NON-OPENAI FORMATS ===
+        if (patched.inputs && (!patched.messages || patched.messages.length === 0)) {
+          // Anthropic format
+          if (opName.includes('anthropic') && patched.inputs.messages) {
+            patched.messages = patched.inputs.messages;
+          }
+          // Gemini contents format
+          else if (opName.includes('google.genai') && patched.inputs.contents) {
+            patched.messages = extractGeminiMessages(patched.inputs.contents);
+          }
+          // Gemini Chat.send_message format
+          else if (opName.includes('Chat.send_message') && patched.inputs.message) {
+            patched.messages = [{ role: 'user', content: patched.inputs.message }];
+          }
+          // OpenAI responses.create input format
+          else if (opName.includes('openai.responses') && patched.inputs.input) {
+            patched.messages = [{ role: 'user', content: patched.inputs.input }];
+          }
+        }
+
+        // === ADD PROVIDER-SPECIFIC USAGE INFO ===
+        if (patched.usage) {
+          // Gemini thoughts tokens
+          if (patched.usage.thoughts_tokens) {
+            patched.usage.thoughts_tokens_label = 'Thinking';
+          }
+          // OpenAI reasoning tokens
+          if (patched.usage.output_tokens_details && patched.usage.output_tokens_details.reasoning_tokens) {
+            patched.usage.reasoning_tokens = patched.usage.output_tokens_details.reasoning_tokens;
+          }
+          // Anthropic cache metrics
+          if (patched.usage.cache_read_input_tokens || patched.usage.cache_creation_input_tokens) {
+            patched.usage.has_cache_info = true;
+          }
+        }
+
+        return patched;
+      });
+    }
+
+    // Extract text from WeaveObject string based on provider
+    function extractFromWeaveObject(weaveStr, opName) {
+      try {
+        // Find the 'text' field and extract everything until the next unescaped quote
+        // This handles multiline strings with escaped quotes and newlines
+        const textMatch = weaveStr.match(/'text':\s*'((?:[^'\\]|\\.)*)'/s);
+
+        if (textMatch && textMatch[1]) {
+          let extracted = textMatch[1];
+
+          // Unescape common escape sequences
+          extracted = extracted
+            .replace(/\\'/g, "'")    // escaped single quotes
+            .replace(/\\"/g, '"')    // escaped double quotes
+            .replace(/\\n/g, '\n')   // newlines
+            .replace(/\\t/g, '\t')   // tabs
+            .replace(/\\r/g, '\r')   // carriage returns
+            .replace(/\\\\/g, '\\'); // escaped backslashes (do this last)
+
+          return extracted;
+        }
+
+        // Fallback: if no text field found, show truncated version
+        return `[Complex WeaveObject - see raw data]\n${weaveStr.substring(0, 500)}...`;
+      } catch (e) {
+        console.error('Failed to parse WeaveObject:', e);
+        return `[Failed to parse WeaveObject]\n${weaveStr.substring(0, 200)}...`;
+      }
+    }
+
+    // Extract messages from Gemini contents format
+    function extractGeminiMessages(contents) {
+      if (!Array.isArray(contents)) return [];
+
+      return contents.map(content => {
+        // Handle WeaveObject string
+        if (typeof content === 'string' && content.startsWith('WeaveObject(')) {
+          // Try to extract basic info
+          const roleMatch = content.match(/'role':\s*'(\w+)'/);
+          const textMatch = content.match(/'text':\s*'((?:[^'\\]|\\.)*)'/s);
+
+          let text = '[Complex content]';
+          if (textMatch && textMatch[1]) {
+            text = textMatch[1]
+              .replace(/\\'/g, "'")
+              .replace(/\\"/g, '"')
+              .replace(/\\n/g, '\n')
+              .replace(/\\t/g, '\t')
+              .replace(/\\r/g, '\r')
+              .replace(/\\\\/g, '\\');
+          }
+
+          return {
+            role: roleMatch ? roleMatch[1] : 'user',
+            content: text
+          };
+        }
+        // Handle regular object
+        else if (content.role && content.parts) {
+          return {
+            role: content.role,
+            content: content.parts.map(p => p.text || '').join('\n')
+          };
+        }
+        return { role: 'user', content: String(content) };
+      });
+    }
+
    // Load traces from selected project
    async function loadTraces(projectName) {
      const projectPath = projectName.replace('/', '_');
@@ -696,7 +898,7 @@
      }
 
      const data = await response.json();
-      allTraces = data;
+      allTraces = patchTracesForProviders(data);
      currentProject = projectName;
      populateFilters();
      renderTraces();
@@ -774,6 +976,12 @@
      const projectName = e.target.value;
      if (projectName) {
        await loadTraces(projectName);
+        // Save preference
+        await fetch('/save_preferences', {
+          method: 'POST',
+          headers: { 'Content-Type': 'application/json' },
+          body: JSON.stringify({ lastProject: projectName })
+        });
      }
    });
 
@@ -782,9 +990,21 @@
 
    // Populate filter dropdowns
    function populateFilters() {
-      // Populate operation filter
-      const ops = new Set(allTraces.map(t => t.op_display_name || 'unknown'));
+      // Clear existing options (except "All") to avoid duplicates when switching projects
      const opSelect = document.getElementById('op-filter');
+      const modelSelect = document.getElementById('model-filter');
+
+      // Save current filter values
+      const savedOpFilter = currentOpFilter;
+      const savedModelFilter = currentModelFilter;
+
+      // Clear dropdowns but keep the "All" option
+      opSelect.innerHTML = '<option value="all">All Operations</option>';
+      opSelect.innerHTML += '<option value="supported">All Supported Ops</option>';
+      modelSelect.innerHTML = '<option value="all">All Models</option>';
+
+      // Populate operation filter with operations from current project only
+      const ops = new Set(allTraces.map(t => t.op_display_name || 'unknown'));
      const sortedOps = [...ops].sort();
      sortedOps.forEach(op => {
        const option = document.createElement('option');
@@ -793,21 +1013,36 @@
        opSelect.appendChild(option);
      });
 
-      // Set default to openai.chat.completions.create if it exists
-      if (sortedOps.includes('openai.chat.completions.create')) {
-        opSelect.value = 'openai.chat.completions.create';
-        currentOpFilter = 'openai.chat.completions.create';
-      }
-
-      // Populate model filter
+      // Populate model filter with models from current project only
      const models = new Set(allTraces.map(t => t.model));
-      const modelSelect = document.getElementById('model-filter');
      [...models].sort().forEach(model => {
        const option = document.createElement('option');
        option.value = model;
        option.textContent = model;
        modelSelect.appendChild(option);
      });
+
+      // Restore previous filter values if they still exist
+      // Special handling for 'all' and 'supported' which always exist
+      if (savedOpFilter === 'all' || savedOpFilter === 'supported') {
+        opSelect.value = savedOpFilter;
+        currentOpFilter = savedOpFilter;
+      } else if (sortedOps.includes(savedOpFilter)) {
+        opSelect.value = savedOpFilter;
+        currentOpFilter = savedOpFilter;
+      } else {
+        // Default to 'supported' when switching projects
+        opSelect.value = 'supported';
+        currentOpFilter = 'supported';
+      }
+
+      if ([...models].includes(savedModelFilter)) {
+        modelSelect.value = savedModelFilter;
+        currentModelFilter = savedModelFilter;
+      } else {
+        modelSelect.value = 'all';
+        currentModelFilter = 'all';
+      }
    }
 
    // Filter change handlers
@@ -826,7 +1061,13 @@
      let filteredTraces = allTraces;
 
      // Apply operation filter
-      if (currentOpFilter !== 'all') {
+      if (currentOpFilter === 'supported') {
+        // Filter to only supported operations
+        filteredTraces = filteredTraces.filter(t => {
+          const opDisplayName = t.op_display_name || '';
+          return SUPPORTED_OPS.some(op => opDisplayName.includes(op));
+        });
+      } else if (currentOpFilter !== 'all') {
        filteredTraces = filteredTraces.filter(t => t.op_display_name === currentOpFilter);
      }
 
@@ -860,9 +1101,12 @@
          ${trace.usage && (trace.usage.total_tokens || trace.usage.requests) ? `
          <div class="usage-info">
            ${trace.usage.requests ? `<div class="usage-item"><span class="usage-label">Requests:</span> ${trace.usage.requests}</div>` : ''}
-            ${trace.usage.prompt_tokens ? `<div class="usage-item"><span class="usage-label">Prompt:</span> ${trace.usage.prompt_tokens}</div>` : ''}
-            ${trace.usage.completion_tokens ? `<div class="usage-item"><span class="usage-label">Completion:</span> ${trace.usage.completion_tokens}</div>` : ''}
+            ${trace.usage.prompt_tokens || trace.usage.input_tokens ? `<div class="usage-item"><span class="usage-label">Input:</span> ${trace.usage.prompt_tokens || trace.usage.input_tokens}</div>` : ''}
+            ${trace.usage.completion_tokens || trace.usage.output_tokens ? `<div class="usage-item"><span class="usage-label">Output:</span> ${trace.usage.completion_tokens || trace.usage.output_tokens}</div>` : ''}
            ${trace.usage.total_tokens ? `<div class="usage-item"><span class="usage-label">Total:</span> ${trace.usage.total_tokens}</div>` : ''}
+            ${trace.usage.reasoning_tokens ? `<div class="usage-item" style="color: #ff9d00;"><span class="usage-label">Reasoning:</span> ${trace.usage.reasoning_tokens}</div>` : ''}
+            ${trace.usage.thoughts_tokens ? `<div class="usage-item" style="color: #9d66ff;"><span class="usage-label">Thinking:</span> ${trace.usage.thoughts_tokens}</div>` : ''}
+            ${trace.usage.cache_read_input_tokens ? `<div class="usage-item" style="color: #4a9eff;"><span class="usage-label">Cache Read:</span> ${trace.usage.cache_read_input_tokens}</div>` : ''}
          </div>
          ` : ''}
 
@@ -972,17 +1216,15 @@
        return;
      }
 
-      // Filter to only OpenAI completion traces (exclude wrapper function traces)
+      // Filter to only supported provider traces (exclude wrapper function traces)
      const completionTraces = selectedData.filter(t => {
-        const opName = t.op_name || '';
        const opDisplayName = t.op_display_name || '';
-        // Only include traces from openai.chat.completions.create
-        return opDisplayName === 'openai.chat.completions.create' ||
-               opName.includes('openai.chat.completions.create');
+        // Check if it's one of our supported operations
+        return SUPPORTED_OPS.some(op => opDisplayName.includes(op));
      });
 
      if (completionTraces.length === 0) {
-        alert('No OpenAI completion traces selected! Please select traces from actual API calls, not wrapper functions.');
+        alert('No supported provider traces selected! Supported: OpenAI, Anthropic, Gemini');
        return;
      }
 
@@ -1036,9 +1278,18 @@
    // Get filtered traces
    function getFilteredTraces() {
      let filtered = allTraces;
-      if (currentOpFilter !== 'all') {
+
+      // Apply operation filter
+      if (currentOpFilter === 'supported') {
+        filtered = filtered.filter(t => {
+          const opDisplayName = t.op_display_name || '';
+          return SUPPORTED_OPS.some(op => opDisplayName.includes(op));
+        });
+      } else if (currentOpFilter !== 'all') {
        filtered = filtered.filter(t => t.op_display_name === currentOpFilter);
      }
+
+      // Apply model filter
      if (currentModelFilter !== 'all') {
        filtered = filtered.filter(t => t.model === currentModelFilter);
      }
@@ -1279,38 +1530,48 @@
        const response = await fetch('/list_judges');
        const data = await response.json();
        const judges = data.judges || [];
-        const judgeSelect = document.getElementById('eval-judge');
+        const judgeList = document.getElementById('eval-judge-list');
 
        if (judges.length === 0) {
-          judgeSelect.innerHTML = '<option value="">No judges defined - create one first</option>';
+          judgeList.innerHTML = '<div style="color: #888;">No judges defined - <a href="/judge" target="_blank" style="color: #4a9eff;">create one first</a></div>';
        } else {
-          judgeSelect.innerHTML = judges.map((j, i) => `<option value="${i}">${j.name} (${j.type})</option>`).join('');
+          judgeList.innerHTML = judges.map((j, i) => `
+            <label style="display: flex; align-items: center; padding: 8px; margin-bottom: 8px; background: #1a1a1a; border-radius: 4px; cursor: pointer; transition: background 0.2s;">
+              <input type="checkbox" class="eval-judge-checkbox" data-judge-index="${i}" style="margin-right: 10px; width: 18px; height: 18px; cursor: pointer;">
+              <div style="flex: 1;">
+                <div style="color: #fff; font-size: 14px; font-weight: 500;">${j.name}</div>
+                <div style="color: #888; font-size: 12px;">${j.type}</div>
+              </div>
+            </label>
+          `).join('');
        }
      } catch (e) {
        console.error('Error loading judges:', e);
-        document.getElementById('eval-judge').innerHTML = '<option value="">Error loading judges</option>';
+        document.getElementById('eval-judge-list').innerHTML = '<div style="color: #f88;">Error loading judges</div>';
      }
    }
 
    // Run evaluation
    document.getElementById('run-eval-btn').addEventListener('click', async () => {
-      const judgeIndex = document.getElementById('eval-judge').value;
+      // Get selected judges
+      const selectedJudgeCheckboxes = document.querySelectorAll('.eval-judge-checkbox:checked');
+      const selectedJudgeIndices = Array.from(selectedJudgeCheckboxes).map(cb => parseInt(cb.dataset.judgeIndex));
 
      if (selectedEvalModels.size === 0) {
        alert('Please select at least one weak model');
        return;
      }
 
-      if (!judgeIndex) {
-        alert('Please select a judge');
+      if (selectedJudgeIndices.length === 0) {
+        alert('Please select at least one judge');
        return;
      }
 
      // Load judges from server
      const judgesResponse = await fetch('/list_judges');
      const judgesData = await judgesResponse.json();
-      const judges = judgesData.judges || [];
-      const judge = judges[parseInt(judgeIndex)];
+      const allJudges = judgesData.judges || [];
+      const selectedJudges = selectedJudgeIndices.map(idx => allJudges[idx]);
 
      // Show progress
      document.getElementById('eval-progress').style.display = 'block';
@@ -1320,17 +1581,17 @@
      const resultsDiv = document.getElementById('eval-results-links');
 
      progressText.textContent = `Starting evaluations...\n`;
-      progressText.textContent += `Judge: ${judge.name}\n`;
+      progressText.textContent += `Judges: ${selectedJudges.map(j => j.name).join(', ')}\n`;
      progressText.textContent += `Models: ${selectedEvalModels.size}\n\n`;
 
      const modelFiles = Array.from(selectedEvalModels);
      const results = [];
 
-      // Run evaluations sequentially with granular progress
-      for (let i = 0; i < modelFiles.length; i++) {
-        const modelFile = modelFiles[i];
+      // Run one evaluation per model with ALL judges combined
+      for (let modelIdx = 0; modelIdx < modelFiles.length; modelIdx++) {
+        const modelFile = modelFiles[modelIdx];
 
-        progressText.textContent += `[${i+1}/${modelFiles.length}] Starting ${modelFile}...\n`;
+        progressText.textContent += `[${modelIdx + 1}/${modelFiles.length}] Evaluating ${modelFile} with ${selectedJudges.length} judge(s)...\n`;
 
        let pollInterval = null;
        let taskId = null;
@@ -1341,9 +1602,8 @@
            const resp = await fetch(`/progress/${taskId}`);
            if (resp.ok) {
              const progress = await resp.json();
-              const percent = (progress.current / progress.total) * 100;
+              const percent = ((modelIdx + 1) / modelFiles.length) * 100;
              progressFill.style.width = `${percent}%`;
-              progressText.textContent = `[${i+1}/${modelFiles.length}] ${progress.message}\nProgress: ${progress.current}/${progress.total} (${percent.toFixed(1)}%)\n`;
            }
          } catch (e) {
            console.error('Error polling eval progress:', e);
@@ -1352,17 +1612,18 @@
 
        try {
          // Generate task ID for this evaluation
-          taskId = `eval_${Date.now()}_${i}`;
+          taskId = `eval_${Date.now()}_${modelIdx}`;
 
          // Start polling
          pollInterval = setInterval(pollProgress, 300);
 
+          // Send all judges in one request
          const response = await fetch('/run_evaluation', {
            method: 'POST',
            headers: { 'Content-Type': 'application/json' },
            body: JSON.stringify({
              model_file: modelFile,
-              judge: judge,
+              judges: selectedJudges,  // Send all judges
              task_id: taskId
            })
          });
@@ -1377,6 +1638,7 @@
          if (pollInterval) clearInterval(pollInterval);
 
          progressText.textContent += `  ✓ Complete: ${result.evaluation_name}\n`;
+          progressText.textContent += `  Judges used: ${result.judges.join(', ')}\n`;
          progressText.textContent += `  Examples: ${result.examples_evaluated}\n\n`;
 
          results.push({
@@ -1780,14 +2042,17 @@
      try {
        const response = await fetch('/list_judges');
        const data = await response.json();
-        const judgeSelect = document.getElementById('e2e-judge');
+        const judgeList = document.getElementById('e2e-judge-list');
 
        if (data.judges && data.judges.length > 0) {
-          judgeSelect.innerHTML = data.judges.map((judge, idx) =>
-            `<option value="${idx}">${judge.name} (${judge.type})</option>`
-          ).join('');
+          judgeList.innerHTML = data.judges.map((judge, idx) => `
+            <label style="display: block; padding: 5px 0; color: #ccc; cursor: pointer;">
+              <input type="checkbox" class="e2e-judge-checkbox" value="${idx}" style="margin-right: 8px;">
+              ${judge.name} (${judge.type})
+            </label>
+          `).join('');
        } else {
-          judgeSelect.innerHTML = '<option value="">No judges available - create one first</option>';
+          judgeList.innerHTML = '<p style="color: #888;">No judges available - create one first</p>';
        }
      } catch (error) {
        console.error('Error loading judges:', error);
@@ -1823,9 +2088,10 @@
        return;
      }
 
-      const judgeIndex = document.getElementById('e2e-judge').value;
-      if (!judgeIndex) {
-        alert('Please select a judge!');
+      // Get selected judges
+      const selectedJudgeIndices = Array.from(document.querySelectorAll('.e2e-judge-checkbox:checked')).map(cb => parseInt(cb.value));
+      if (selectedJudgeIndices.length === 0) {
+        alert('Please select at least one judge!');
        return;
      }
 
@@ -1834,7 +2100,7 @@
      // Load judge data
      const judgesResponse = await fetch('/list_judges');
      const judgesData = await judgesResponse.json();
-      const judge = judgesData.judges[parseInt(judgeIndex)];
+      const judges = selectedJudgeIndices.map(idx => judgesData.judges[idx]);
 
      // Hide config panel, show progress panel
      document.getElementById('e2e-panel').style.display = 'none';
@@ -1920,7 +2186,8 @@
 
      // === STEP 3: Run Evaluations ===
      stepLabel.textContent = 'Step 3/3: Running evaluations...';
-      progressText.textContent += `📊 Running evaluations with judge: ${judge.name}...\n`;
+      const judgeNames = judges.map(j => j.name).join(', ');
+      progressText.textContent += `📊 Running evaluations with ${judges.length} judge(s): ${judgeNames}...\n`;
 
      const evaluationResults = [];
 
@@ -1963,7 +2230,7 @@
            headers: { 'Content-Type': 'application/json' },
            body: JSON.stringify({
              model_file: modelFile,
-              judge: judge,
+              judges: judges,
              task_id: evalTaskId
            })
          });
{quickdistill-0.1.7.dist-info → quickdistill-0.1.9.dist-info}/METADATA RENAMED
@@ -1,6 +1,6 @@
 Metadata-Version: 2.4
 Name: quickdistill
-Version: 0.1.7
+Version: 0.1.9
 Summary: Fast and easy toolkit for distilling AI models
 Author-email: Brett Young <bdytx5@umsystem.edu>
 License: MIT
quickdistill-0.1.9.dist-info/RECORD ADDED
@@ -0,0 +1,17 @@
+quickdistill/__init__.py,sha256=dOl_wXruBGyDGhe1Iu4-SQLu_6-_b6rt1lkxfOp3Jqo,823
+quickdistill/cli.py,sha256=A8d5GN9NdBS299WyAsJ6-p8ynW3DJnDRHZ-UGH7TXLM,2212
+quickdistill/default_judges.json,sha256=9uDqsYc9CsJwZAWwOkWcqgmlGZNJ0zzyXpv4wZ8vtuE,1446
+quickdistill/get_traces.py,sha256=mfy9fMiK-CZQN1noZ4DfOwdwP45ntthVDLgh4-u2iNk,4896
+quickdistill/server.py,sha256=0yBQ5vt1oD7OkhH7ap2cR8j-wuVG3fU7jARijmD1eOs,42849
+quickdistill/__pycache__/__init__.cpython-310.pyc,sha256=RqzjvxzPxHFJZkBjX6DSH9vbVTtskVgJ4pTQ6EX2A6o,794
+quickdistill/__pycache__/cli.cpython-310.pyc,sha256=xtVgJTayQLKS4gE_te7U1Wo8LmkDtPkaa2rnzu8h9fY,2443
+quickdistill/__pycache__/get_traces.cpython-310.pyc,sha256=T7Suxp9vpqYDQJ_3uJvXWemqoLf5tnRC2I0BfHrSiNM,2956
+quickdistill/__pycache__/server.cpython-310.pyc,sha256=8W74-E_S0dJRRwRG7nF9UL64kdbyDoNswAi5y51Xc3I,25593
+quickdistill/default_projects/byyoung3_arena-detailed/traces_data.json,sha256=iz-cBmXBYj0bC3Vn754QTnGuDh6sRvlE_RzSyGXaxbY,15496950
+quickdistill/static/judge_manager.html,sha256=t6dSPwo_d-GIu1FscuK1KDgxKCnmiOekQTMu80lZPPY,27166
+quickdistill/static/trace_viewer.html,sha256=lAMO6Mj-MWQqXGC4bo2v8ybM4ci082h2HaDQ1AOl2jM,109884
+quickdistill-0.1.9.dist-info/METADATA,sha256=-VH48FybeQbxuxUOlSn0zHJfCOkxfklCrxCHbdRYFRQ,5084
+quickdistill-0.1.9.dist-info/WHEEL,sha256=wUyA8OaulRlbfwMtmQsvNngGrxQHAvkKcvRmdizlJi0,92
+quickdistill-0.1.9.dist-info/entry_points.txt,sha256=AUUTxnwdD9gRnsOEcTXQTAZIZ_F0aRU7JGstIJ3Xk_o,55
+quickdistill-0.1.9.dist-info/top_level.txt,sha256=ysiMvurJYsE1IhkxmObe-0G8A-GIav40kTh2z6axjxg,13
+quickdistill-0.1.9.dist-info/RECORD,,
quickdistill-0.1.7.dist-info/RECORD DELETED
@@ -1,17 +0,0 @@
-quickdistill/__init__.py,sha256=U8mvMbfYKLFegcEA4D-P6AFHvSiHQPXoFn0KKd-xh0A,397
-quickdistill/cli.py,sha256=A8d5GN9NdBS299WyAsJ6-p8ynW3DJnDRHZ-UGH7TXLM,2212
-quickdistill/default_judges.json,sha256=w0TkIniELPPG-Mi3hm7zPW06eq46W1BI_ufWXnkDDDM,1432
-quickdistill/get_traces.py,sha256=mfy9fMiK-CZQN1noZ4DfOwdwP45ntthVDLgh4-u2iNk,4896
-quickdistill/server.py,sha256=0Y0XG-8oYoNZgmo10LPZgtwlHuGqrq0urxE-KabyIvI,36789
-quickdistill/__pycache__/__init__.cpython-310.pyc,sha256=Tbov274p3OjaOuOsQwcW-meATEfkz0mHKmpytksuDJI,603
-quickdistill/__pycache__/cli.cpython-310.pyc,sha256=xtVgJTayQLKS4gE_te7U1Wo8LmkDtPkaa2rnzu8h9fY,2443
-quickdistill/__pycache__/get_traces.cpython-310.pyc,sha256=T7Suxp9vpqYDQJ_3uJvXWemqoLf5tnRC2I0BfHrSiNM,2956
-quickdistill/__pycache__/server.cpython-310.pyc,sha256=_taKWofMtdgfMZzfVsd7PoC4jnuKxEOGzW82YBxqPPc,22051
-quickdistill/default_projects/byyoung3_arena-detailed/traces_data.json,sha256=iz-cBmXBYj0bC3Vn754QTnGuDh6sRvlE_RzSyGXaxbY,15496950
-quickdistill/static/judge_manager.html,sha256=fXteyx_ry4gY166WypBkVGGCqieE88MigqLRLVCKnG8,26887
-quickdistill/static/trace_viewer.html,sha256=kPC4GnxeDPq7jxClRhZBOuS6xmA3RaY-loJDZmKDADE,94426
-quickdistill-0.1.7.dist-info/METADATA,sha256=1pE5fDep0l0kAxhHuT1C_H4CYHIiPLP4n9QraAqI9bM,5084
-quickdistill-0.1.7.dist-info/WHEEL,sha256=wUyA8OaulRlbfwMtmQsvNngGrxQHAvkKcvRmdizlJi0,92
-quickdistill-0.1.7.dist-info/entry_points.txt,sha256=AUUTxnwdD9gRnsOEcTXQTAZIZ_F0aRU7JGstIJ3Xk_o,55
-quickdistill-0.1.7.dist-info/top_level.txt,sha256=ysiMvurJYsE1IhkxmObe-0G8A-GIav40kTh2z6axjxg,13
-quickdistill-0.1.7.dist-info/RECORD,,