quickdistill 0.1.7-py3-none-any.whl → 0.1.9-py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
quickdistill/__init__.py CHANGED
@@ -8,7 +8,18 @@ This package provides tools to:
 - Export datasets for model evaluation
 """
 
-__version__ = "0.1.7"
+# Monkey patch for aiohttp/litellm compatibility
+# litellm expects aiohttp.ConnectionTimeoutError but it doesn't exist in some versions
+try:
+    import aiohttp
+    if not hasattr(aiohttp, 'ConnectionTimeoutError'):
+        aiohttp.ConnectionTimeoutError = aiohttp.ServerTimeoutError
+    if not hasattr(aiohttp, 'SocketTimeoutError'):
+        aiohttp.SocketTimeoutError = aiohttp.ServerTimeoutError
+except Exception:
+    pass
+
+__version__ = "0.1.9"
 __author__ = "Brett Young"
 __email__ = "bdytx5@umsystem.edu"
 
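The shim aliases exception names that newer litellm builds reference but some aiohttp releases don't define; nothing else changes at import time. A minimal sketch of the effect (illustrative only, not part of the package):

    # Importing quickdistill runs the shim, so the attribute resolves
    # even on aiohttp versions that lack it natively.
    import quickdistill
    import aiohttp

    print(hasattr(aiohttp, 'ConnectionTimeoutError'))  # True either way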
quickdistill/default_judges.json CHANGED
@@ -2,14 +2,14 @@
   {
     "name": "boolean_scorer",
     "type": "llm",
-    "model": "gpt-5",
+    "model": "openai/gpt-5",
     "returnType": "boolean",
     "prompt": "You are a strict evaluator comparing two AI responses (one from a strong reference model which is the ground truth, and one from a weaker model which we are testing to see how similar the responses it generates are to the strong model).\n\nStrong Model Response: {strong_output}\nWeak Model Response: {weak_output}\n\nDetermine if the weak model response is CORRECT compared to the strong model response.\nConsider a response CORRECT if it conveys the same key information and meaning, even if worded differently.\n\nRespond in JSON format: {'correct': true} or {'correct': false}"
   },
   {
     "name": "scalar_scorer",
     "type": "llm",
-    "model": "gpt-5",
+    "model": "openai/gpt-5",
     "returnType": "scalar",
     "prompt": "You are a strict evaluator comparing two AI responses (one from a strong reference model which is the ground truth, and one from a weaker model which we are testing to see how similar the responses it generates are to the strong model).\n\nStrong Model Response: {strong_output}\nWeak Model Response: {weak_output}\n\nEvaluate how similar the weak model response is to the strong model response.\nRate on a scale of 1-5 where 1=completely different and 5=nearly identical. RETURN ONLY ONE SCORE REPRESENTY THE AVERAGE SIMILARITY (EG 5-(avg_error))\n\nRespond in JSON format eg {'scores': the_score }"
   }
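Both default judges now name their model in LiteLLM's provider-prefixed form, so judge calls can be routed to any provider LiteLLM supports. A hedged sketch of how such a model string is typically consumed (the call shown is illustrative, not quickdistill's internal code):

    # Assumes litellm is installed and OPENAI_API_KEY is set in the environment.
    import litellm

    resp = litellm.completion(
        model="openai/gpt-5",  # provider/model, as stored in default_judges.json
        messages=[{"role": "user", "content": "Say OK."}],
    )
    print(resp.choices[0].message.content)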
quickdistill/server.py CHANGED
@@ -100,40 +100,133 @@ def run_inference(client, model, messages, max_tokens=1000):
         return f"ERROR: {str(e)}"
 
 
 def extract_output_content(output_str):
-    """Extract actual content from WeaveObject string or regular output"""
+    """Extract actual content from WeaveObject string, JSON response, or regular output.
+
+    Handles outputs from:
+    - OpenAI chat.completions.create (plain text)
+    - OpenAI responses.create (JSON with nested structure)
+    - Anthropic Messages (WeaveObject with content[0].text)
+    - Google Gemini (WeaveObject with candidates[0].content.parts[0].text)
+    """
+    import re
+    import json
+
     if not output_str:
         return None
 
-    # If it's a WeaveObject string, try to extract the text content
-    if isinstance(output_str, str) and 'WeaveObject' in output_str:
-        import re
-        # Try to find the 'text' field in the WeaveObject
-        match = re.search(r"'text':\s*'([^']*(?:\\'[^']*)*)'", output_str)
+    if not isinstance(output_str, str):
+        return str(output_str)
+
+    # Handle empty/streaming responses
+    if output_str in ('', 'None', 'null'):
+        return '[Streaming output - not captured]'
+
+    # Handle OpenAI responses.create JSON format
+    if output_str.startswith('{') and '"output"' in output_str:
+        try:
+            resp_obj = json.loads(output_str)
+            if 'output' in resp_obj and isinstance(resp_obj['output'], list):
+                # Extract text from output messages
+                text_parts = []
+                for item in resp_obj['output']:
+                    if item.get('type') == 'message' and 'content' in item:
+                        for content in item['content']:
+                            if content.get('type') == 'output_text' and 'text' in content:
+                                text_parts.append(content['text'])
+                if text_parts:
+                    return '\n\n'.join(text_parts)
+        except (json.JSONDecodeError, KeyError, TypeError):
+            pass  # Fall through to other handlers
+
+    # Handle WeaveObject strings (Anthropic, Gemini)
+    if 'WeaveObject' in output_str:
+        # Improved regex that handles escape sequences properly
+        match = re.search(r"'text':\s*'((?:[^'\\]|\\.)*)'", output_str, re.DOTALL)
         if match:
-            # Unescape the string
+            # Unescape the string properly (order matters!)
             text = match.group(1)
-            text = text.replace('\\n', '\n').replace("\\'", "'").replace('\\\\', '\\')
+            text = text.replace("\\'", "'")    # escaped single quotes
+            text = text.replace('\\"', '"')    # escaped double quotes
+            text = text.replace('\\n', '\n')   # newlines
+            text = text.replace('\\t', '\t')   # tabs
+            text = text.replace('\\r', '\r')   # carriage returns
+            text = text.replace('\\\\', '\\')  # escaped backslashes (do this last!)
            return text
 
-    # Otherwise return as-is
+        # If no text field found, return truncated version
+        return f"[Complex WeaveObject - could not extract text]\n{output_str[:500]}..."
+
+    # Plain text output (standard OpenAI chat format)
     return output_str
 
 
 def extract_messages_from_trace(trace):
-    """Extract messages from a trace in the format needed for inference"""
-    # Check if messages are at top level
+    """Extract messages from a trace in the format needed for inference.
+
+    Handles message extraction from:
+    - OpenAI chat.completions.create (messages at top level or in inputs.messages)
+    - OpenAI responses.create (inputs.input field)
+    - Anthropic Messages (inputs.messages)
+    - Google Gemini generate_content (inputs.contents array)
+    - Google Gemini Chat.send_message (inputs.message string)
+    """
+    import re
+
+    # Get op_display_name for provider detection
+    op_name = trace.get('op_display_name', '')
+
+    # Check if messages are at top level (already extracted/cached)
     if trace.get('messages') and isinstance(trace['messages'], list) and len(trace['messages']) > 0:
         return trace['messages']
 
     # Check if messages are in inputs
     if trace.get('inputs') and isinstance(trace['inputs'], dict):
-        messages = trace['inputs'].get('messages', [])
+        inputs = trace['inputs']
+
+        # Standard OpenAI/Anthropic: inputs.messages
+        messages = inputs.get('messages', [])
         if isinstance(messages, list) and len(messages) > 0:
             return messages
 
+        # OpenAI responses.create: inputs.input (simple string)
+        if 'openai.responses' in op_name and 'input' in inputs:
+            return [{"role": "user", "content": inputs['input']}]
+
+        # Gemini Chat.send_message: inputs.message (simple string)
+        if 'Chat.send_message' in op_name and 'message' in inputs:
+            return [{"role": "user", "content": inputs['message']}]
+
+        # Gemini generate_content: inputs.contents (array of content objects or WeaveObject strings)
+        if 'google.genai' in op_name and 'contents' in inputs:
+            contents = inputs['contents']
+            if isinstance(contents, list) and len(contents) > 0:
+                messages = []
+                for content in contents:
+                    # Handle WeaveObject string format
+                    if isinstance(content, str) and 'WeaveObject' in content:
+                        role_match = re.search(r"'role':\s*'(\w+)'", content)
+                        text_match = re.search(r"'text':\s*'((?:[^'\\]|\\.)*)'", content, re.DOTALL)
+                        text = '[Complex content]'
+                        if text_match:
+                            text = text_match.group(1)
+                            text = text.replace("\\'", "'").replace('\\n', '\n').replace('\\\\', '\\')
+                        messages.append({
+                            "role": role_match.group(1) if role_match else "user",
+                            "content": text
+                        })
+                    # Handle regular dict format
+                    elif isinstance(content, dict):
+                        role = content.get('role', 'user')
+                        parts = content.get('parts', [])
+                        if isinstance(parts, list):
+                            text = '\n'.join([p.get('text', '') for p in parts if isinstance(p, dict)])
+                            messages.append({"role": role, "content": text})
+                if messages:
+                    return messages
+
         # Check if inputs has question/context format (from generate_test_traces.py wrapper traces)
-        question = trace['inputs'].get('question')
-        context = trace['inputs'].get('context')
+        question = inputs.get('question')
+        context = inputs.get('context')
         if question:
             if context:
                 prompt = f"""Based on the following context, answer the question concisely.
@@ -753,16 +846,26 @@ def delete_judge():
 
 @app.route('/run_evaluation', methods=['POST'])
 def run_evaluation_endpoint():
-    """Run evaluation using specified judge"""
-
+    """Run evaluation using specified judge(s) - supports multiple judges"""
+
 
     data = request.json
     model_file = data.get('model_file')
-    judge = data.get('judge')
+    judges = data.get('judges')  # Can be a list or single judge dict
     task_id = data.get('task_id', f"eval_{id(data)}")
 
-    if not model_file or not judge:
-        return jsonify({'error': 'Missing model_file or judge'}), 400
+    # Handle both single judge (backwards compat) and multiple judges
+    if data.get('judge'):
+        judges = [data.get('judge')]
+    elif not judges:
+        return jsonify({'error': 'Missing judge or judges'}), 400
+
+    # Ensure judges is a list
+    if not isinstance(judges, list):
+        judges = [judges]
+
+    if not model_file:
+        return jsonify({'error': 'Missing model_file'}), 400
 
     # Load weak model results
     model_path = DATA_DIR / model_file
@@ -782,18 +885,22 @@ def run_evaluation_endpoint():
     # Extract model name from filename
     model_name = model_file.replace('weak_model_', '').replace('.json', '')
 
+    # Create evaluation name with all judges
+    judges_names = '_'.join([j['name'] for j in judges])
+    eval_name = f"eval-{model_name}-{judges_names}"
+
     # Initialize progress tracking
     total_steps = len(results)
     progress_state[task_id] = {
         'current': 0,
         'total': total_steps,
-        'message': f'Starting evaluation: {model_name} with {judge["name"]}...',
+        'message': f'Starting evaluation: {model_name} with {len(judges)} judge(s)...',
         'status': 'running'
     }
 
     # Create evaluation logger
     ev = weave.EvaluationLogger(
-        name=f"eval-{model_name}-{judge['name']}",
+        name=eval_name,
         model=model_name
     )
 
@@ -818,13 +925,20 @@ def run_evaluation_endpoint():
         if messages and len(messages) > 0:
             question = messages[0].get('content', '')
 
-        # Run judge
-        if judge['type'] == 'llm':
-            scores = run_llm_judge_eval(judge, strong_output, weak_output, question)
-        else:
-            scores = run_custom_judge_eval(judge, strong_output, weak_output)
+        # Run all judges and collect scores
+        all_scores = {}
+        for judge in judges:
+            # Run judge
+            if judge['type'] == 'llm':
+                scores = run_llm_judge_eval(judge, strong_output, weak_output, question)
+            else:
+                scores = run_custom_judge_eval(judge, strong_output, weak_output)
+
+            # Merge scores with judge name prefix to avoid conflicts
+            for score_key, score_value in scores.items():
+                all_scores[f"{judge['name']}_{score_key}"] = score_value
 
-        # Log to weave
+        # Log to weave with all scores from all judges
         ev.log_example(
             inputs={
                 "question": question,
@@ -834,7 +948,7 @@
                 "weak_output": weak_output
 
             },
-            scores=scores
+            scores=all_scores
         )
 
     # Finish evaluation
@@ -850,10 +964,11 @@ def run_evaluation_endpoint():
 
     return jsonify({
         'status': 'success',
-        'evaluation_name': f"eval-{model_name}-{judge['name']}",
+        'evaluation_name': eval_name,
         'examples_evaluated': len(results),
         'weave_url': ev.ui_url,
         'strong_export': strong_export,
+        'judges': [j['name'] for j in judges],
        'task_id': task_id
     })
 
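With this change a single request fans out to every judge in the list, and each example's scores come back keyed as "<judge name>_<score key>" (e.g. boolean_scorer_correct), so judges with identical score keys can't collide. A hypothetical client call against the new endpoint (the base URL and file name are placeholders; the old single-"judge" payload still works):

    # Assumes the server is running locally and two judge dicts were fetched from /list_judges.
    import requests

    resp = requests.post("http://localhost:5000/run_evaluation", json={
        "model_file": "weak_model_gpt-4o-mini.json",  # hypothetical weak-model results file
        "judges": [boolean_scorer, scalar_scorer],    # judge dicts from the judge manager
        "task_id": "eval_demo",
    })
    print(resp.json()["evaluation_name"], resp.json()["judges"])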
@@ -1032,6 +1147,32 @@ def list_projects():
     return jsonify({'projects': projects})
 
 
+@app.route('/get_preferences', methods=['GET'])
+def get_preferences():
+    """Get saved user preferences"""
+    prefs_file = DATA_DIR / 'preferences.json'
+    if prefs_file.exists():
+        try:
+            with open(prefs_file, 'r') as f:
+                return jsonify(json.load(f))
+        except:
+            pass
+    return jsonify({})
+
+
+@app.route('/save_preferences', methods=['POST'])
+def save_preferences():
+    """Save user preferences"""
+    try:
+        data = request.json
+        prefs_file = DATA_DIR / 'preferences.json'
+        with open(prefs_file, 'w') as f:
+            json.dump(data, f, indent=2)
+        return jsonify({'status': 'success'})
+    except Exception as e:
+        return jsonify({'status': 'error', 'message': str(e)}), 500
+
+
 # Routes for serving HTML pages
 @app.route('/')
 def index():
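Preferences persist as a flat JSON file under DATA_DIR, so a round trip is just two requests. A quick sketch (base URL is a placeholder):

    # Save, then read back, the viewer's last-used project.
    import requests

    base = "http://localhost:5000"
    requests.post(f"{base}/save_preferences", json={"lastProject": "byyoung3/arena-detailed"})
    print(requests.get(f"{base}/get_preferences").json())  # {'lastProject': 'byyoung3/arena-detailed'}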
quickdistill/static/judge_manager.html CHANGED
@@ -183,12 +183,10 @@
 
       <div id="llm-options" style="display: block;">
         <label for="judge-model">Model</label>
-        <select id="judge-model">
-          <option value="gpt-5">gpt-5</option>
-          <option value="gpt-4o">gpt-4o</option>
-          <option value="gpt-4o-mini">gpt-4o-mini</option>
-          <option value="claude-3-5-sonnet-20241022">claude-3-5-sonnet</option>
-        </select>
+        <input type="text" id="judge-model" placeholder="e.g., openai/gpt-5, anthropic/claude-3.5-sonnet" value="openai/gpt-5">
+        <p style="color: #888; font-size: 12px; margin-top: 5px; margin-bottom: 15px;">
+          <strong>Note:</strong> Uses LiteLLM format. Examples: <code>openai/gpt-5</code>, <code>anthropic/claude-3.5-sonnet</code>, <code>openai/gpt-4o</code>
+        </p>
 
         <label for="judge-return-type">Return Type</label>
         <select id="judge-return-type">
@@ -393,10 +391,16 @@ Respond in JSON format: {'correct': true} or {'correct': false}`
      };
 
      if (type === 'llm') {
-        judge.model = document.getElementById('judge-model').value;
+        judge.model = document.getElementById('judge-model').value.trim();
        judge.returnType = document.getElementById('judge-return-type').value;
        judge.prompt = document.getElementById('judge-prompt').value.trim();
 
+        // Validate model
+        if (!judge.model) {
+          alert('Error: Please enter a model (e.g., openai/gpt-5)');
+          return;
+        }
+
        // Validate required placeholders
        if (!judge.prompt.includes('{strong_output}')) {
          alert('Error: Judge prompt must include {strong_output} placeholder');
@@ -420,7 +424,7 @@ Respond in JSON format: {'correct': true} or {'correct': false}`
    function resetForm() {
      document.getElementById('judge-name').value = '';
      document.getElementById('judge-type').value = 'llm';
-      document.getElementById('judge-model').value = 'gpt-5-2025-08-07';
+      document.getElementById('judge-model').value = 'openai/gpt-5';
      document.getElementById('judge-prompt').value = '';
      document.getElementById('form-title').textContent = 'Create New Judge';
      document.getElementById('save-btn').textContent = 'Save Judge';
quickdistill/static/trace_viewer.html CHANGED
@@ -43,10 +43,21 @@
      padding: 20px;
      border-radius: 8px;
      margin-bottom: 20px;
-      display: flex;
+    }
+
+    .filter-row {
+      display: grid;
+      grid-template-columns: auto 1fr auto 1fr auto auto;
      gap: 15px;
-      align-items: center;
-      flex-wrap: wrap;
+      align-items: start;
+      margin-bottom: 20px;
+    }
+
+    .filter-group {
+      display: flex;
+      flex-direction: column;
+      gap: 8px;
+      min-width: 250px;
    }
 
    .controls label {
@@ -283,58 +294,75 @@
    </div>
 
    <div class="controls">
-      <div style="display: flex; flex-direction: column; gap: 4px;">
-        <label for="op-filter">Filter by Operation:</label>
-        <span style="color: #666; font-size: 11px;">Primary supported: openai.chat.completions.create</span>
-      </div>
-      <select id="op-filter">
-        <option value="all">All Operations</option>
-      </select>
-
-      <label for="model-filter">Filter by Model:</label>
-      <select id="model-filter">
-        <option value="all">All Models</option>
-      </select>
-
-      <button id="select-all-btn" style="margin-left: 20px; padding: 8px 16px; background: #2a7c4a; color: white; border: none; border-radius: 4px; cursor: pointer;">
-        Select All Filtered
-      </button>
-
-      <button id="export-btn" style="padding: 8px 16px; background: #4a9eff; color: white; border: none; border-radius: 4px; cursor: pointer;">
-        Export Selected to Test Set (<span id="selected-count">0</span>)
-      </button>
+      <!-- Filters Row -->
+      <div class="filter-row">
+        <div class="filter-group">
+          <label for="op-filter">Operation Filter:</label>
+          <select id="op-filter" style="background: #2a2a2a; color: #fff; border: 1px solid #3a3a3a; padding: 8px 12px; border-radius: 4px; font-size: 14px; cursor: pointer;">
+            <option value="all">All Operations</option>
+          </select>
+          <span style="color: #4a9eff; font-size: 11px; font-weight: 500;">✅ Fully supported: OpenAI (chat.completions, responses), Anthropic (Messages), Google Gemini (generate_content, Chat)</span>
+        </div>
 
-      <button id="open-inference-btn" style="padding: 8px 16px; background: #7c4a9e; color: white; border: none; border-radius: 4px; cursor: pointer;">
-        Run Weak Models
-      </button>
+        <div class="filter-group">
+          <label for="model-filter">Model Filter:</label>
+          <select id="model-filter" style="background: #2a2a2a; color: #fff; border: 1px solid #3a3a3a; padding: 8px 12px; border-radius: 4px; font-size: 14px; cursor: pointer;">
+            <option value="all">All Models</option>
+          </select>
+        </div>
 
-      <button id="open-eval-btn" style="padding: 8px 16px; background: #9e6a4a; color: white; border: none; border-radius: 4px; cursor: pointer;">
-        Run Evaluation
-      </button>
+        <button id="select-all-btn" style="padding: 8px 16px; background: #2a7c4a; color: white; border: none; border-radius: 4px; cursor: pointer; align-self: end; white-space: nowrap;">
+          Select All
+        </button>
 
-      <a href="/judge" target="_blank" style="padding: 8px 16px; background: #4a5a9e; color: white; border: none; border-radius: 4px; text-decoration: none; display: inline-block;">
-        Manage Judges
-      </a>
+        <div style="display: flex; flex-direction: column; gap: 4px; align-self: end;">
+          <div style="color: #888; font-size: 13px;">Total: <span id="total-count" style="color: #fff; font-weight: 600;">0</span></div>
+          <div style="color: #888; font-size: 13px;">Shown: <span id="shown-count" style="color: #4a9eff; font-weight: 600;">0</span></div>
+        </div>
+      </div>
 
-      <button id="open-test-judge-btn" style="padding: 8px 16px; background: #6a4a7e; color: white; border: none; border-radius: 4px; cursor: pointer;">
-        Test Judges
-      </button>
+      <!-- Action Buttons Row -->
+      <div style="display: grid; grid-template-columns: 2fr 1fr; gap: 20px;">
+        <!-- Main Workflow -->
+        <div style="padding: 15px; background: #0f1f0f; border-radius: 8px; border: 2px solid #2a4a2a;">
+          <div style="color: #6dd36d; font-size: 13px; font-weight: 600; margin-bottom: 12px;">📋 MANUAL WORKFLOW</div>
+          <div style="display: flex; flex-wrap: wrap; gap: 10px;">
+            <button id="export-btn" style="padding: 10px 16px; background: #4a9eff; color: white; border: none; border-radius: 4px; cursor: pointer; font-weight: 500;">
+              1. Export Test Set (<span id="selected-count">0</span>)
+            </button>
+            <button id="open-inference-btn" style="padding: 10px 16px; background: #7c4a9e; color: white; border: none; border-radius: 4px; cursor: pointer; font-weight: 500;">
+              2. Run Weak Models
+            </button>
+            <button id="open-eval-btn" style="padding: 10px 16px; background: #9e6a4a; color: white; border: none; border-radius: 4px; cursor: pointer; font-weight: 500;">
+              3. Evaluate Results
+            </button>
+          </div>
+        </div>
 
-      <button id="open-settings-btn" style="padding: 8px 16px; background: #5a5a5a; color: white; border: none; border-radius: 4px; cursor: pointer;">
-        Settings
-      </button>
+        <!-- Utilities -->
+        <div style="padding: 15px; background: #1a1a2a; border-radius: 8px; border: 1px solid #2a2a3a;">
+          <div style="color: #aaa; font-size: 13px; font-weight: 600; margin-bottom: 12px;">⚙️ TOOLS</div>
+          <div style="display: flex; flex-wrap: wrap; gap: 8px;">
+            <a href="/judge" target="_blank" style="padding: 8px 14px; background: #4a5a9e; color: white; border: none; border-radius: 4px; text-decoration: none; display: inline-block; font-size: 13px;">
+              Judges
+            </a>
+            <button id="open-test-judge-btn" style="padding: 8px 14px; background: #6a4a7e; color: white; border: none; border-radius: 4px; cursor: pointer; font-size: 13px;">
+              Test Judge
+            </button>
+            <button id="open-settings-btn" style="padding: 8px 14px; background: #5a5a5a; color: white; border: none; border-radius: 4px; cursor: pointer; font-size: 13px;">
+              Settings
+            </button>
+          </div>
+        </div>
+      </div>
 
-      <div style="margin: 20px 0; padding: 15px; background: #2a1a2a; border-radius: 8px; border: 1px solid #4a2a4a;">
-        <div style="color: #aaa; font-size: 13px; margin-bottom: 10px;">Automatic Workflow:</div>
-        <button id="open-e2e-btn" style="padding: 10px 20px; background: #7a4a9e; color: white; border: none; border-radius: 4px; cursor: pointer; font-weight: 500;">
+      <!-- Automatic Workflow -->
+      <div style="margin-top: 20px; padding: 15px; background: #2a1a2a; border-radius: 8px; border: 2px solid #7a4a9e;">
+        <div style="color: #bb88ff; font-size: 13px; font-weight: 600; margin-bottom: 10px;">⚡ AUTOMATIC WORKFLOW</div>
+        <button id="open-e2e-btn" style="padding: 12px 20px; background: #7a4a9e; color: white; border: none; border-radius: 4px; cursor: pointer; font-weight: 600; font-size: 14px;">
          ⚡ Run End-to-End Test
        </button>
-        <div style="color: #666; font-size: 11px; margin-top: 8px;">Export → Generate → Evaluate (all in one)</div>
-      </div>
-
-      <div class="stats">
-        <div>Total: <span id="total-count">0</span></div>
-        <div>Shown: <span id="shown-count">0</span></div>
+        <div style="color: #888; font-size: 11px; margin-top: 8px;">Export → Generate → Evaluate (all in one)</div>
      </div>
    </div>
 
@@ -412,10 +440,10 @@
      </div>
 
      <div style="margin-bottom: 20px;">
-        <label style="color: #aaa; display: block; margin-bottom: 10px;">Select Judge:</label>
-        <select id="eval-judge" style="width: 100%; padding: 8px; background: #2a2a2a; color: #fff; border: 1px solid #3a3a3a; border-radius: 4px;">
-          <!-- Judges populated dynamically -->
-        </select>
+        <label style="color: #aaa; display: block; margin-bottom: 10px;">Select Judge(s) - you can select multiple:</label>
+        <div id="eval-judge-list" style="max-height: 200px; overflow-y: auto; background: #0f0f0f; padding: 15px; border-radius: 4px;">
+          <!-- Judges populated dynamically as checkboxes -->
+        </div>
        <div style="color: #666; font-size: 12px; margin-top: 5px;">
          <a href="/judge" target="_blank" style="color: #4a9eff;">Create/manage judges</a>
        </div>
@@ -511,9 +539,9 @@
        <label style="display: block; color: #aaa; margin-bottom: 8px; font-size: 14px;">Judge Model:</label>
        <input type="text" id="test-judge-model"
               style="width: 100%; padding: 10px; background: #2a2a2a; color: #fff; border: 1px solid #3a3a3a; border-radius: 4px; font-size: 14px;"
-               placeholder="e.g., gpt-4o, claude-3-5-sonnet-20241022">
+               placeholder="e.g., openai/gpt-5, anthropic/claude-3.5-sonnet">
        <div style="color: #666; font-size: 12px; margin-top: 5px;">
-          Override the judge's model for this test
+          Override the judge's model for this test. Uses LiteLLM format (e.g., <code style="color: #aaa;">openai/gpt-5</code>, <code style="color: #aaa;">anthropic/claude-3.5-sonnet</code>)
        </div>
      </div>
 
@@ -586,10 +614,10 @@
 
      <!-- Judge Selection -->
      <div style="margin-bottom: 30px;">
-        <h3 style="color: #fff; font-size: 16px; margin-bottom: 15px;">2. Select Judge</h3>
-        <select id="e2e-judge" style="width: 100%; padding: 10px; background: #2a2a2a; color: #fff; border: 1px solid #3a3a3a; border-radius: 4px; font-size: 14px;">
-          <option value="">Loading judges...</option>
-        </select>
+        <h3 style="color: #fff; font-size: 16px; margin-bottom: 15px;">2. Select Judges</h3>
+        <div id="e2e-judge-list" style="max-height: 200px; overflow-y: auto; background: #2a2a2a; border: 1px solid #3a3a3a; border-radius: 4px; padding: 10px;">
+          <p style="color: #888;">Loading judges...</p>
+        </div>
      </div>
 
      <!-- Actions -->
@@ -648,6 +676,17 @@
      "Qwen/Qwen3-Coder-480B-A35B-Instruct",
    ];
 
+    const SUPPORTED_OPS = [
+      'openai.chat.completions.create',
+      'openai.responses.create',
+      'anthropic.Messages.create',
+      'anthropic.Messages.stream',
+      'google.genai.models.Models.generate_content',
+      'google.genai.models.Models.generate_content_stream',
+      'google.genai.chats.Chat.send_message',
+      'google.genai.chats.Chat.send_message_stream'
+    ];
+
    let allTraces = [];
    let currentOpFilter = 'all';
    let currentModelFilter = 'all';
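The filters below match SUPPORTED_OPS by substring (`opDisplayName.includes(op)`), so display names carrying extra qualifiers still pass. A Python mirror of the check, for illustration only (the shipped logic is the JavaScript in this file):

    # Abbreviated list; see SUPPORTED_OPS above for the full set.
    SUPPORTED_OPS = [
        'openai.chat.completions.create',
        'openai.responses.create',
        'anthropic.Messages.create',
    ]

    def is_supported(op_display_name):
        # Substring match, like the client-side filter.
        return any(op in (op_display_name or '') for op in SUPPORTED_OPS)

    print(is_supported('openai.responses.create'))  # True
    print(is_supported('my_wrapper_fn'))            # False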
@@ -659,6 +698,10 @@
    // Load projects list
    async function loadProjects() {
      try {
+        // Load saved preferences
+        const prefsResponse = await fetch('/get_preferences');
+        const prefs = await prefsResponse.json();
+
        const response = await fetch('/list_projects');
        const data = await response.json();
        const select = document.getElementById('project-select');
@@ -670,11 +713,23 @@
          `<option value="${p.name}">${p.name} (${p.trace_count} traces)</option>`
        ).join('');
 
-        // Auto-select first project and load it
-        if (data.projects.length > 0) {
-          currentProject = data.projects[0].name;
-          select.value = currentProject;
-          await loadTraces(currentProject);
+        // Use saved project or first project
+        let projectToLoad = prefs.lastProject || data.projects[0].name;
+
+        // Check if saved project still exists
+        const projectExists = data.projects.some(p => p.name === projectToLoad);
+        if (!projectExists) {
+          projectToLoad = data.projects[0].name;
+        }
+
+        currentProject = projectToLoad;
+        select.value = currentProject;
+        await loadTraces(currentProject);
+
+        // Set default filter to "All Supported Ops"
+        if (!prefs.lastOpFilter) {
+          currentOpFilter = 'supported';
+          document.getElementById('op-filter').value = 'supported';
        }
      }
      } catch (e) {
@@ -682,6 +737,153 @@
      }
    }
 
+    // Patch traces to handle different provider formats
+    function patchTracesForProviders(traces) {
+      return traces.map(trace => {
+        const patched = { ...trace };
+
+        // Extract provider from op_display_name
+        const opName = trace.op_display_name || '';
+
+        // === PARSE WEAVEOBJECT OUTPUTS ===
+        if (patched.output && typeof patched.output === 'string') {
+          // Check if it's a streaming operation (empty or None)
+          if (patched.output === '' || patched.output === 'None' || patched.output === 'null') {
+            if (opName.includes('stream') || opName.includes('Stream')) {
+              patched.output = '[Streaming output - not captured in trace]';
+            }
+          }
+          // Parse WeaveObject strings
+          else if (patched.output.startsWith('WeaveObject(')) {
+            patched.output = extractFromWeaveObject(patched.output, opName);
+          }
+          // Parse OpenAI responses.create JSON output
+          else if (opName.includes('openai.responses.create')) {
+            try {
+              const respObj = JSON.parse(patched.output);
+              if (respObj.output && Array.isArray(respObj.output)) {
+                // Extract text from output messages
+                const textParts = respObj.output
+                  .filter(item => item.type === 'message')
+                  .flatMap(msg => msg.content || [])
+                  .filter(c => c.type === 'output_text')
+                  .map(c => c.text);
+                patched.output = textParts.join('\n\n') || JSON.stringify(respObj, null, 2);
+              }
+            } catch (e) {
+              // Keep original if parsing fails
+            }
+          }
+        }
+
+        // === EXTRACT MESSAGES FOR NON-OPENAI FORMATS ===
+        if (patched.inputs && (!patched.messages || patched.messages.length === 0)) {
+          // Anthropic format
+          if (opName.includes('anthropic') && patched.inputs.messages) {
+            patched.messages = patched.inputs.messages;
+          }
+          // Gemini contents format
+          else if (opName.includes('google.genai') && patched.inputs.contents) {
+            patched.messages = extractGeminiMessages(patched.inputs.contents);
+          }
+          // Gemini Chat.send_message format
+          else if (opName.includes('Chat.send_message') && patched.inputs.message) {
+            patched.messages = [{ role: 'user', content: patched.inputs.message }];
+          }
+          // OpenAI responses.create input format
+          else if (opName.includes('openai.responses') && patched.inputs.input) {
+            patched.messages = [{ role: 'user', content: patched.inputs.input }];
+          }
+        }
+
+        // === ADD PROVIDER-SPECIFIC USAGE INFO ===
+        if (patched.usage) {
+          // Gemini thoughts tokens
+          if (patched.usage.thoughts_tokens) {
+            patched.usage.thoughts_tokens_label = 'Thinking';
+          }
+          // OpenAI reasoning tokens
+          if (patched.usage.output_tokens_details && patched.usage.output_tokens_details.reasoning_tokens) {
+            patched.usage.reasoning_tokens = patched.usage.output_tokens_details.reasoning_tokens;
+          }
+          // Anthropic cache metrics
+          if (patched.usage.cache_read_input_tokens || patched.usage.cache_creation_input_tokens) {
+            patched.usage.has_cache_info = true;
+          }
+        }
+
+        return patched;
+      });
+    }
+
+    // Extract text from WeaveObject string based on provider
+    function extractFromWeaveObject(weaveStr, opName) {
+      try {
+        // Find the 'text' field and extract everything until the next unescaped quote
+        // This handles multiline strings with escaped quotes and newlines
+        const textMatch = weaveStr.match(/'text':\s*'((?:[^'\\]|\\.)*)'/s);
+
+        if (textMatch && textMatch[1]) {
+          let extracted = textMatch[1];
+
+          // Unescape common escape sequences
+          extracted = extracted
+            .replace(/\\'/g, "'")    // escaped single quotes
+            .replace(/\\"/g, '"')    // escaped double quotes
+            .replace(/\\n/g, '\n')   // newlines
+            .replace(/\\t/g, '\t')   // tabs
+            .replace(/\\r/g, '\r')   // carriage returns
+            .replace(/\\\\/g, '\\'); // escaped backslashes (do this last)
+
+          return extracted;
+        }
+
+        // Fallback: if no text field found, show truncated version
+        return `[Complex WeaveObject - see raw data]\n${weaveStr.substring(0, 500)}...`;
+      } catch (e) {
+        console.error('Failed to parse WeaveObject:', e);
+        return `[Failed to parse WeaveObject]\n${weaveStr.substring(0, 200)}...`;
+      }
+    }
+
+    // Extract messages from Gemini contents format
+    function extractGeminiMessages(contents) {
+      if (!Array.isArray(contents)) return [];
+
+      return contents.map(content => {
+        // Handle WeaveObject string
+        if (typeof content === 'string' && content.startsWith('WeaveObject(')) {
+          // Try to extract basic info
+          const roleMatch = content.match(/'role':\s*'(\w+)'/);
+          const textMatch = content.match(/'text':\s*'((?:[^'\\]|\\.)*)'/s);
+
+          let text = '[Complex content]';
+          if (textMatch && textMatch[1]) {
+            text = textMatch[1]
+              .replace(/\\'/g, "'")
+              .replace(/\\"/g, '"')
+              .replace(/\\n/g, '\n')
+              .replace(/\\t/g, '\t')
+              .replace(/\\r/g, '\r')
+              .replace(/\\\\/g, '\\');
+          }
+
+          return {
+            role: roleMatch ? roleMatch[1] : 'user',
+            content: text
+          };
+        }
+        // Handle regular object
+        else if (content.role && content.parts) {
+          return {
+            role: content.role,
+            content: content.parts.map(p => p.text || '').join('\n')
+          };
+        }
+        return { role: 'user', content: String(content) };
+      });
+    }
+
    // Load traces from selected project
    async function loadTraces(projectName) {
      const projectPath = projectName.replace('/', '_');
@@ -696,7 +898,7 @@
      }
 
      const data = await response.json();
-      allTraces = data;
+      allTraces = patchTracesForProviders(data);
      currentProject = projectName;
      populateFilters();
      renderTraces();
@@ -774,6 +976,12 @@
      const projectName = e.target.value;
      if (projectName) {
        await loadTraces(projectName);
+        // Save preference
+        await fetch('/save_preferences', {
+          method: 'POST',
+          headers: { 'Content-Type': 'application/json' },
+          body: JSON.stringify({ lastProject: projectName })
+        });
      }
    });
 
@@ -782,9 +990,21 @@
 
    // Populate filter dropdowns
    function populateFilters() {
-      // Populate operation filter
-      const ops = new Set(allTraces.map(t => t.op_display_name || 'unknown'));
+      // Clear existing options (except "All") to avoid duplicates when switching projects
      const opSelect = document.getElementById('op-filter');
+      const modelSelect = document.getElementById('model-filter');
+
+      // Save current filter values
+      const savedOpFilter = currentOpFilter;
+      const savedModelFilter = currentModelFilter;
+
+      // Clear dropdowns but keep the "All" option
+      opSelect.innerHTML = '<option value="all">All Operations</option>';
+      opSelect.innerHTML += '<option value="supported">All Supported Ops</option>';
+      modelSelect.innerHTML = '<option value="all">All Models</option>';
+
+      // Populate operation filter with operations from current project only
+      const ops = new Set(allTraces.map(t => t.op_display_name || 'unknown'));
      const sortedOps = [...ops].sort();
      sortedOps.forEach(op => {
        const option = document.createElement('option');
@@ -793,21 +1013,36 @@
        opSelect.appendChild(option);
      });
 
-      // Set default to openai.chat.completions.create if it exists
-      if (sortedOps.includes('openai.chat.completions.create')) {
-        opSelect.value = 'openai.chat.completions.create';
-        currentOpFilter = 'openai.chat.completions.create';
-      }
-
-      // Populate model filter
+      // Populate model filter with models from current project only
      const models = new Set(allTraces.map(t => t.model));
-      const modelSelect = document.getElementById('model-filter');
      [...models].sort().forEach(model => {
        const option = document.createElement('option');
        option.value = model;
        option.textContent = model;
        modelSelect.appendChild(option);
      });
+
+      // Restore previous filter values if they still exist
+      // Special handling for 'all' and 'supported' which always exist
+      if (savedOpFilter === 'all' || savedOpFilter === 'supported') {
+        opSelect.value = savedOpFilter;
+        currentOpFilter = savedOpFilter;
+      } else if (sortedOps.includes(savedOpFilter)) {
+        opSelect.value = savedOpFilter;
+        currentOpFilter = savedOpFilter;
+      } else {
+        // Default to 'supported' when switching projects
+        opSelect.value = 'supported';
+        currentOpFilter = 'supported';
+      }
+
+      if ([...models].includes(savedModelFilter)) {
+        modelSelect.value = savedModelFilter;
+        currentModelFilter = savedModelFilter;
+      } else {
+        modelSelect.value = 'all';
+        currentModelFilter = 'all';
+      }
    }
 
    // Filter change handlers
@@ -826,7 +1061,13 @@
      let filteredTraces = allTraces;
 
      // Apply operation filter
-      if (currentOpFilter !== 'all') {
+      if (currentOpFilter === 'supported') {
+        // Filter to only supported operations
+        filteredTraces = filteredTraces.filter(t => {
+          const opDisplayName = t.op_display_name || '';
+          return SUPPORTED_OPS.some(op => opDisplayName.includes(op));
+        });
+      } else if (currentOpFilter !== 'all') {
        filteredTraces = filteredTraces.filter(t => t.op_display_name === currentOpFilter);
      }
 
@@ -860,9 +1101,12 @@
          ${trace.usage && (trace.usage.total_tokens || trace.usage.requests) ? `
          <div class="usage-info">
            ${trace.usage.requests ? `<div class="usage-item"><span class="usage-label">Requests:</span> ${trace.usage.requests}</div>` : ''}
-            ${trace.usage.prompt_tokens ? `<div class="usage-item"><span class="usage-label">Prompt:</span> ${trace.usage.prompt_tokens}</div>` : ''}
-            ${trace.usage.completion_tokens ? `<div class="usage-item"><span class="usage-label">Completion:</span> ${trace.usage.completion_tokens}</div>` : ''}
+            ${trace.usage.prompt_tokens || trace.usage.input_tokens ? `<div class="usage-item"><span class="usage-label">Input:</span> ${trace.usage.prompt_tokens || trace.usage.input_tokens}</div>` : ''}
+            ${trace.usage.completion_tokens || trace.usage.output_tokens ? `<div class="usage-item"><span class="usage-label">Output:</span> ${trace.usage.completion_tokens || trace.usage.output_tokens}</div>` : ''}
            ${trace.usage.total_tokens ? `<div class="usage-item"><span class="usage-label">Total:</span> ${trace.usage.total_tokens}</div>` : ''}
+            ${trace.usage.reasoning_tokens ? `<div class="usage-item" style="color: #ff9d00;"><span class="usage-label">Reasoning:</span> ${trace.usage.reasoning_tokens}</div>` : ''}
+            ${trace.usage.thoughts_tokens ? `<div class="usage-item" style="color: #9d66ff;"><span class="usage-label">Thinking:</span> ${trace.usage.thoughts_tokens}</div>` : ''}
+            ${trace.usage.cache_read_input_tokens ? `<div class="usage-item" style="color: #4a9eff;"><span class="usage-label">Cache Read:</span> ${trace.usage.cache_read_input_tokens}</div>` : ''}
          </div>
          ` : ''}
 
@@ -972,17 +1216,15 @@
        return;
      }
 
-      // Filter to only OpenAI completion traces (exclude wrapper function traces)
+      // Filter to only supported provider traces (exclude wrapper function traces)
      const completionTraces = selectedData.filter(t => {
-        const opName = t.op_name || '';
        const opDisplayName = t.op_display_name || '';
-        // Only include traces from openai.chat.completions.create
-        return opDisplayName === 'openai.chat.completions.create' ||
-               opName.includes('openai.chat.completions.create');
+        // Check if it's one of our supported operations
+        return SUPPORTED_OPS.some(op => opDisplayName.includes(op));
      });
 
      if (completionTraces.length === 0) {
-        alert('No OpenAI completion traces selected! Please select traces from actual API calls, not wrapper functions.');
+        alert('No supported provider traces selected! Supported: OpenAI, Anthropic, Gemini');
        return;
      }
 
@@ -1036,9 +1278,18 @@
    // Get filtered traces
    function getFilteredTraces() {
      let filtered = allTraces;
-      if (currentOpFilter !== 'all') {
+
+      // Apply operation filter
+      if (currentOpFilter === 'supported') {
+        filtered = filtered.filter(t => {
+          const opDisplayName = t.op_display_name || '';
+          return SUPPORTED_OPS.some(op => opDisplayName.includes(op));
+        });
+      } else if (currentOpFilter !== 'all') {
        filtered = filtered.filter(t => t.op_display_name === currentOpFilter);
      }
+
+      // Apply model filter
      if (currentModelFilter !== 'all') {
        filtered = filtered.filter(t => t.model === currentModelFilter);
      }
@@ -1279,38 +1530,48 @@
        const response = await fetch('/list_judges');
        const data = await response.json();
        const judges = data.judges || [];
-        const judgeSelect = document.getElementById('eval-judge');
+        const judgeList = document.getElementById('eval-judge-list');
 
        if (judges.length === 0) {
-          judgeSelect.innerHTML = '<option value="">No judges defined - create one first</option>';
+          judgeList.innerHTML = '<div style="color: #888;">No judges defined - <a href="/judge" target="_blank" style="color: #4a9eff;">create one first</a></div>';
        } else {
-          judgeSelect.innerHTML = judges.map((j, i) => `<option value="${i}">${j.name} (${j.type})</option>`).join('');
+          judgeList.innerHTML = judges.map((j, i) => `
+            <label style="display: flex; align-items: center; padding: 8px; margin-bottom: 8px; background: #1a1a1a; border-radius: 4px; cursor: pointer; transition: background 0.2s;">
+              <input type="checkbox" class="eval-judge-checkbox" data-judge-index="${i}" style="margin-right: 10px; width: 18px; height: 18px; cursor: pointer;">
+              <div style="flex: 1;">
+                <div style="color: #fff; font-size: 14px; font-weight: 500;">${j.name}</div>
+                <div style="color: #888; font-size: 12px;">${j.type}</div>
+              </div>
+            </label>
+          `).join('');
        }
      } catch (e) {
        console.error('Error loading judges:', e);
-        document.getElementById('eval-judge').innerHTML = '<option value="">Error loading judges</option>';
+        document.getElementById('eval-judge-list').innerHTML = '<div style="color: #f88;">Error loading judges</div>';
      }
    }
 
    // Run evaluation
    document.getElementById('run-eval-btn').addEventListener('click', async () => {
-      const judgeIndex = document.getElementById('eval-judge').value;
+      // Get selected judges
+      const selectedJudgeCheckboxes = document.querySelectorAll('.eval-judge-checkbox:checked');
+      const selectedJudgeIndices = Array.from(selectedJudgeCheckboxes).map(cb => parseInt(cb.dataset.judgeIndex));
 
      if (selectedEvalModels.size === 0) {
        alert('Please select at least one weak model');
        return;
      }
 
-      if (!judgeIndex) {
-        alert('Please select a judge');
+      if (selectedJudgeIndices.length === 0) {
+        alert('Please select at least one judge');
        return;
      }
 
      // Load judges from server
      const judgesResponse = await fetch('/list_judges');
      const judgesData = await judgesResponse.json();
-      const judges = judgesData.judges || [];
-      const judge = judges[parseInt(judgeIndex)];
+      const allJudges = judgesData.judges || [];
+      const selectedJudges = selectedJudgeIndices.map(idx => allJudges[idx]);
 
      // Show progress
      document.getElementById('eval-progress').style.display = 'block';
@@ -1320,17 +1581,17 @@
      const resultsDiv = document.getElementById('eval-results-links');
 
      progressText.textContent = `Starting evaluations...\n`;
-      progressText.textContent += `Judge: ${judge.name}\n`;
+      progressText.textContent += `Judges: ${selectedJudges.map(j => j.name).join(', ')}\n`;
      progressText.textContent += `Models: ${selectedEvalModels.size}\n\n`;
 
      const modelFiles = Array.from(selectedEvalModels);
      const results = [];
 
-      // Run evaluations sequentially with granular progress
-      for (let i = 0; i < modelFiles.length; i++) {
-        const modelFile = modelFiles[i];
+      // Run one evaluation per model with ALL judges combined
+      for (let modelIdx = 0; modelIdx < modelFiles.length; modelIdx++) {
+        const modelFile = modelFiles[modelIdx];
 
-        progressText.textContent += `[${i+1}/${modelFiles.length}] Starting ${modelFile}...\n`;
+        progressText.textContent += `[${modelIdx + 1}/${modelFiles.length}] Evaluating ${modelFile} with ${selectedJudges.length} judge(s)...\n`;
 
        let pollInterval = null;
        let taskId = null;
@@ -1341,9 +1602,8 @@
            const resp = await fetch(`/progress/${taskId}`);
            if (resp.ok) {
              const progress = await resp.json();
-              const percent = (progress.current / progress.total) * 100;
+              const percent = ((modelIdx + 1) / modelFiles.length) * 100;
              progressFill.style.width = `${percent}%`;
-              progressText.textContent = `[${i+1}/${modelFiles.length}] ${progress.message}\nProgress: ${progress.current}/${progress.total} (${percent.toFixed(1)}%)\n`;
            }
          } catch (e) {
            console.error('Error polling eval progress:', e);
@@ -1352,17 +1612,18 @@
 
        try {
          // Generate task ID for this evaluation
-          taskId = `eval_${Date.now()}_${i}`;
+          taskId = `eval_${Date.now()}_${modelIdx}`;
 
          // Start polling
          pollInterval = setInterval(pollProgress, 300);
 
+          // Send all judges in one request
          const response = await fetch('/run_evaluation', {
            method: 'POST',
            headers: { 'Content-Type': 'application/json' },
            body: JSON.stringify({
              model_file: modelFile,
-              judge: judge,
+              judges: selectedJudges,  // Send all judges
              task_id: taskId
            })
          });
@@ -1377,6 +1638,7 @@
          if (pollInterval) clearInterval(pollInterval);
 
          progressText.textContent += `  ✓ Complete: ${result.evaluation_name}\n`;
+          progressText.textContent += `  Judges used: ${result.judges.join(', ')}\n`;
          progressText.textContent += `  Examples: ${result.examples_evaluated}\n\n`;
 
          results.push({
@@ -1780,14 +2042,17 @@
      try {
        const response = await fetch('/list_judges');
        const data = await response.json();
-        const judgeSelect = document.getElementById('e2e-judge');
+        const judgeList = document.getElementById('e2e-judge-list');
 
        if (data.judges && data.judges.length > 0) {
-          judgeSelect.innerHTML = data.judges.map((judge, idx) =>
-            `<option value="${idx}">${judge.name} (${judge.type})</option>`
-          ).join('');
+          judgeList.innerHTML = data.judges.map((judge, idx) => `
+            <label style="display: block; padding: 5px 0; color: #ccc; cursor: pointer;">
+              <input type="checkbox" class="e2e-judge-checkbox" value="${idx}" style="margin-right: 8px;">
+              ${judge.name} (${judge.type})
+            </label>
+          `).join('');
        } else {
-          judgeSelect.innerHTML = '<option value="">No judges available - create one first</option>';
+          judgeList.innerHTML = '<p style="color: #888;">No judges available - create one first</p>';
        }
      } catch (error) {
        console.error('Error loading judges:', error);
@@ -1823,9 +2088,10 @@
        return;
      }
 
-      const judgeIndex = document.getElementById('e2e-judge').value;
-      if (!judgeIndex) {
-        alert('Please select a judge!');
+      // Get selected judges
+      const selectedJudgeIndices = Array.from(document.querySelectorAll('.e2e-judge-checkbox:checked')).map(cb => parseInt(cb.value));
+      if (selectedJudgeIndices.length === 0) {
+        alert('Please select at least one judge!');
        return;
      }
 
@@ -1834,7 +2100,7 @@
      // Load judge data
      const judgesResponse = await fetch('/list_judges');
      const judgesData = await judgesResponse.json();
-      const judge = judgesData.judges[parseInt(judgeIndex)];
+      const judges = selectedJudgeIndices.map(idx => judgesData.judges[idx]);
 
      // Hide config panel, show progress panel
      document.getElementById('e2e-panel').style.display = 'none';
@@ -1920,7 +2186,8 @@
 
      // === STEP 3: Run Evaluations ===
      stepLabel.textContent = 'Step 3/3: Running evaluations...';
-      progressText.textContent += `📊 Running evaluations with judge: ${judge.name}...\n`;
+      const judgeNames = judges.map(j => j.name).join(', ');
+      progressText.textContent += `📊 Running evaluations with ${judges.length} judge(s): ${judgeNames}...\n`;
 
      const evaluationResults = [];
 
@@ -1963,7 +2230,7 @@
            headers: { 'Content-Type': 'application/json' },
            body: JSON.stringify({
              model_file: modelFile,
-              judge: judge,
+              judges: judges,
              task_id: evalTaskId
            })
          });
{quickdistill-0.1.7.dist-info → quickdistill-0.1.9.dist-info}/METADATA RENAMED
@@ -1,6 +1,6 @@
 Metadata-Version: 2.4
 Name: quickdistill
-Version: 0.1.7
+Version: 0.1.9
 Summary: Fast and easy toolkit for distilling AI models
 Author-email: Brett Young <bdytx5@umsystem.edu>
 License: MIT
quickdistill-0.1.9.dist-info/RECORD ADDED
@@ -0,0 +1,17 @@
+quickdistill/__init__.py,sha256=dOl_wXruBGyDGhe1Iu4-SQLu_6-_b6rt1lkxfOp3Jqo,823
+quickdistill/cli.py,sha256=A8d5GN9NdBS299WyAsJ6-p8ynW3DJnDRHZ-UGH7TXLM,2212
+quickdistill/default_judges.json,sha256=9uDqsYc9CsJwZAWwOkWcqgmlGZNJ0zzyXpv4wZ8vtuE,1446
+quickdistill/get_traces.py,sha256=mfy9fMiK-CZQN1noZ4DfOwdwP45ntthVDLgh4-u2iNk,4896
+quickdistill/server.py,sha256=0yBQ5vt1oD7OkhH7ap2cR8j-wuVG3fU7jARijmD1eOs,42849
+quickdistill/__pycache__/__init__.cpython-310.pyc,sha256=RqzjvxzPxHFJZkBjX6DSH9vbVTtskVgJ4pTQ6EX2A6o,794
+quickdistill/__pycache__/cli.cpython-310.pyc,sha256=xtVgJTayQLKS4gE_te7U1Wo8LmkDtPkaa2rnzu8h9fY,2443
+quickdistill/__pycache__/get_traces.cpython-310.pyc,sha256=T7Suxp9vpqYDQJ_3uJvXWemqoLf5tnRC2I0BfHrSiNM,2956
+quickdistill/__pycache__/server.cpython-310.pyc,sha256=8W74-E_S0dJRRwRG7nF9UL64kdbyDoNswAi5y51Xc3I,25593
+quickdistill/default_projects/byyoung3_arena-detailed/traces_data.json,sha256=iz-cBmXBYj0bC3Vn754QTnGuDh6sRvlE_RzSyGXaxbY,15496950
+quickdistill/static/judge_manager.html,sha256=t6dSPwo_d-GIu1FscuK1KDgxKCnmiOekQTMu80lZPPY,27166
+quickdistill/static/trace_viewer.html,sha256=lAMO6Mj-MWQqXGC4bo2v8ybM4ci082h2HaDQ1AOl2jM,109884
+quickdistill-0.1.9.dist-info/METADATA,sha256=-VH48FybeQbxuxUOlSn0zHJfCOkxfklCrxCHbdRYFRQ,5084
+quickdistill-0.1.9.dist-info/WHEEL,sha256=wUyA8OaulRlbfwMtmQsvNngGrxQHAvkKcvRmdizlJi0,92
+quickdistill-0.1.9.dist-info/entry_points.txt,sha256=AUUTxnwdD9gRnsOEcTXQTAZIZ_F0aRU7JGstIJ3Xk_o,55
+quickdistill-0.1.9.dist-info/top_level.txt,sha256=ysiMvurJYsE1IhkxmObe-0G8A-GIav40kTh2z6axjxg,13
+quickdistill-0.1.9.dist-info/RECORD,,
quickdistill-0.1.7.dist-info/RECORD DELETED
@@ -1,17 +0,0 @@
-quickdistill/__init__.py,sha256=U8mvMbfYKLFegcEA4D-P6AFHvSiHQPXoFn0KKd-xh0A,397
-quickdistill/cli.py,sha256=A8d5GN9NdBS299WyAsJ6-p8ynW3DJnDRHZ-UGH7TXLM,2212
-quickdistill/default_judges.json,sha256=w0TkIniELPPG-Mi3hm7zPW06eq46W1BI_ufWXnkDDDM,1432
-quickdistill/get_traces.py,sha256=mfy9fMiK-CZQN1noZ4DfOwdwP45ntthVDLgh4-u2iNk,4896
-quickdistill/server.py,sha256=0Y0XG-8oYoNZgmo10LPZgtwlHuGqrq0urxE-KabyIvI,36789
-quickdistill/__pycache__/__init__.cpython-310.pyc,sha256=Tbov274p3OjaOuOsQwcW-meATEfkz0mHKmpytksuDJI,603
-quickdistill/__pycache__/cli.cpython-310.pyc,sha256=xtVgJTayQLKS4gE_te7U1Wo8LmkDtPkaa2rnzu8h9fY,2443
-quickdistill/__pycache__/get_traces.cpython-310.pyc,sha256=T7Suxp9vpqYDQJ_3uJvXWemqoLf5tnRC2I0BfHrSiNM,2956
-quickdistill/__pycache__/server.cpython-310.pyc,sha256=_taKWofMtdgfMZzfVsd7PoC4jnuKxEOGzW82YBxqPPc,22051
-quickdistill/default_projects/byyoung3_arena-detailed/traces_data.json,sha256=iz-cBmXBYj0bC3Vn754QTnGuDh6sRvlE_RzSyGXaxbY,15496950
-quickdistill/static/judge_manager.html,sha256=fXteyx_ry4gY166WypBkVGGCqieE88MigqLRLVCKnG8,26887
-quickdistill/static/trace_viewer.html,sha256=kPC4GnxeDPq7jxClRhZBOuS6xmA3RaY-loJDZmKDADE,94426
-quickdistill-0.1.7.dist-info/METADATA,sha256=1pE5fDep0l0kAxhHuT1C_H4CYHIiPLP4n9QraAqI9bM,5084
-quickdistill-0.1.7.dist-info/WHEEL,sha256=wUyA8OaulRlbfwMtmQsvNngGrxQHAvkKcvRmdizlJi0,92
-quickdistill-0.1.7.dist-info/entry_points.txt,sha256=AUUTxnwdD9gRnsOEcTXQTAZIZ_F0aRU7JGstIJ3Xk_o,55
-quickdistill-0.1.7.dist-info/top_level.txt,sha256=ysiMvurJYsE1IhkxmObe-0G8A-GIav40kTh2z6axjxg,13
-quickdistill-0.1.7.dist-info/RECORD,,