quickdistill 0.1.8__py3-none-any.whl → 0.1.9__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
quickdistill/__init__.py CHANGED
@@ -8,7 +8,18 @@ This package provides tools to:
8
8
  - Export datasets for model evaluation
9
9
  """
10
10
 
11
- __version__ = "0.1.8"
11
# Compatibility shim for aiohttp/litellm.
# litellm expects aiohttp.ConnectionTimeoutError (and SocketTimeoutError), but
# some aiohttp versions do not define them. When a name is missing, alias it to
# aiohttp.ServerTimeoutError so attribute lookups on the module succeed.
try:
    import aiohttp

    if not hasattr(aiohttp, 'ConnectionTimeoutError'):
        aiohttp.ConnectionTimeoutError = aiohttp.ServerTimeoutError
    if not hasattr(aiohttp, 'SocketTimeoutError'):
        aiohttp.SocketTimeoutError = aiohttp.ServerTimeoutError
except (ImportError, AttributeError):
    # Best-effort only: aiohttp may be absent (ImportError) or may lack
    # ServerTimeoutError to alias from (AttributeError). Either way there is
    # nothing to patch, so the package import must still succeed.
    pass
21
+
22
+ __version__ = "0.1.9"
12
23
  __author__ = "Brett Young"
13
24
  __email__ = "bdytx5@umsystem.edu"
14
25
 
quickdistill/server.py CHANGED
@@ -100,40 +100,133 @@ def run_inference(client, model, messages, max_tokens=1000):
100
100
  return f"ERROR: {str(e)}"
101
101
 
def _unescape_repr_text(text):
    """Decode backslash escapes in text captured from a quoted repr string.

    Escapes are decoded in a single left-to-right pass, so an escaped
    backslash followed by ``n`` stays a backslash plus ``n`` instead of being
    turned into a newline (which chained ``str.replace`` calls get wrong).
    Unrecognized escape sequences are left untouched.
    """
    import re

    decoded = {"'": "'", '"': '"', "n": "\n", "t": "\t", "r": "\r", "\\": "\\"}
    return re.sub(
        r"\\(.)",
        lambda m: decoded.get(m.group(1), m.group(0)),
        text,
        flags=re.DOTALL,
    )


def extract_output_content(output_str):
    """Extract actual content from WeaveObject string, JSON response, or regular output.

    Handles outputs from:
    - OpenAI chat.completions.create (plain text)
    - OpenAI responses.create (JSON with nested structure)
    - Anthropic Messages (WeaveObject with content[0].text)
    - Google Gemini (WeaveObject with candidates[0].content.parts[0].text)

    Returns:
        ``None`` for any falsy input, ``str(output_str)`` for truthy non-string
        input, a placeholder for the stringified nulls ``'None'``/``'null'``,
        the extracted text when a known structure is recognized, a truncated
        diagnostic string for a WeaveObject without a ``'text'`` field, and
        the input unchanged otherwise.
    """
    import json
    import re

    if not output_str:
        return None

    if not isinstance(output_str, str):
        return str(output_str)

    # Stringified null outputs are reported as uncaptured streaming responses.
    # (The empty string never reaches this point; it returned None above.)
    if output_str in ('None', 'null'):
        return '[Streaming output - not captured]'

    # Handle OpenAI responses.create JSON format
    if output_str.startswith('{') and '"output"' in output_str:
        try:
            resp_obj = json.loads(output_str)
            if 'output' in resp_obj and isinstance(resp_obj['output'], list):
                text_parts = []
                for item in resp_obj['output']:
                    # Skip malformed entries instead of raising AttributeError.
                    if not isinstance(item, dict):
                        continue
                    if item.get('type') == 'message' and 'content' in item:
                        for content in item['content']:
                            if not isinstance(content, dict):
                                continue
                            if content.get('type') == 'output_text' and 'text' in content:
                                text_parts.append(content['text'])
                if text_parts:
                    return '\n\n'.join(text_parts)
        except (json.JSONDecodeError, KeyError, TypeError):
            pass  # Fall through to other handlers

    # Handle WeaveObject strings (Anthropic, Gemini)
    if 'WeaveObject' in output_str:
        # First single-quoted 'text' value; the pattern steps over escaped
        # characters so an escaped quote does not end the match early.
        match = re.search(r"'text':\s*'((?:[^'\\]|\\.)*)'", output_str, re.DOTALL)
        if match:
            return _unescape_repr_text(match.group(1))

        # If no text field found, return truncated version
        return f"[Complex WeaveObject - could not extract text]\n{output_str[:500]}..."

    # Plain text output (standard OpenAI chat format)
    return output_str
120
161
 
121
162
 
122
163
  def extract_messages_from_trace(trace):
123
- """Extract messages from a trace in the format needed for inference"""
124
- # Check if messages are at top level
164
+ """Extract messages from a trace in the format needed for inference.
165
+
166
+ Handles message extraction from:
167
+ - OpenAI chat.completions.create (messages at top level or in inputs.messages)
168
+ - OpenAI responses.create (inputs.input field)
169
+ - Anthropic Messages (inputs.messages)
170
+ - Google Gemini generate_content (inputs.contents array)
171
+ - Google Gemini Chat.send_message (inputs.message string)
172
+ """
173
+ import re
174
+
175
+ # Get op_display_name for provider detection
176
+ op_name = trace.get('op_display_name', '')
177
+
178
+ # Check if messages are at top level (already extracted/cached)
125
179
  if trace.get('messages') and isinstance(trace['messages'], list) and len(trace['messages']) > 0:
126
180
  return trace['messages']
127
181
 
128
182
  # Check if messages are in inputs
129
183
  if trace.get('inputs') and isinstance(trace['inputs'], dict):
130
- messages = trace['inputs'].get('messages', [])
184
+ inputs = trace['inputs']
185
+
186
+ # Standard OpenAI/Anthropic: inputs.messages
187
+ messages = inputs.get('messages', [])
131
188
  if isinstance(messages, list) and len(messages) > 0:
132
189
  return messages
133
190
 
191
+ # OpenAI responses.create: inputs.input (simple string)
192
+ if 'openai.responses' in op_name and 'input' in inputs:
193
+ return [{"role": "user", "content": inputs['input']}]
194
+
195
+ # Gemini Chat.send_message: inputs.message (simple string)
196
+ if 'Chat.send_message' in op_name and 'message' in inputs:
197
+ return [{"role": "user", "content": inputs['message']}]
198
+
199
+ # Gemini generate_content: inputs.contents (array of content objects or WeaveObject strings)
200
+ if 'google.genai' in op_name and 'contents' in inputs:
201
+ contents = inputs['contents']
202
+ if isinstance(contents, list) and len(contents) > 0:
203
+ messages = []
204
+ for content in contents:
205
+ # Handle WeaveObject string format
206
+ if isinstance(content, str) and 'WeaveObject' in content:
207
+ role_match = re.search(r"'role':\s*'(\w+)'", content)
208
+ text_match = re.search(r"'text':\s*'((?:[^'\\]|\\.)*)'", content, re.DOTALL)
209
+ text = '[Complex content]'
210
+ if text_match:
211
+ text = text_match.group(1)
212
+ text = text.replace("\\'", "'").replace('\\n', '\n').replace('\\\\', '\\')
213
+ messages.append({
214
+ "role": role_match.group(1) if role_match else "user",
215
+ "content": text
216
+ })
217
+ # Handle regular dict format
218
+ elif isinstance(content, dict):
219
+ role = content.get('role', 'user')
220
+ parts = content.get('parts', [])
221
+ if isinstance(parts, list):
222
+ text = '\n'.join([p.get('text', '') for p in parts if isinstance(p, dict)])
223
+ messages.append({"role": role, "content": text})
224
+ if messages:
225
+ return messages
226
+
134
227
  # Check if inputs has question/context format (from generate_test_traces.py wrapper traces)
135
- question = trace['inputs'].get('question')
136
- context = trace['inputs'].get('context')
228
+ question = inputs.get('question')
229
+ context = inputs.get('context')
137
230
  if question:
138
231
  if context:
139
232
  prompt = f"""Based on the following context, answer the question concisely.
@@ -753,16 +846,26 @@ def delete_judge():
753
846
 
754
847
  @app.route('/run_evaluation', methods=['POST'])
755
848
  def run_evaluation_endpoint():
756
- """Run evaluation using specified judge"""
757
-
849
+ """Run evaluation using specified judge(s) - supports multiple judges"""
850
+
758
851
 
759
852
  data = request.json
760
853
  model_file = data.get('model_file')
761
- judge = data.get('judge')
854
+ judges = data.get('judges') # Can be a list or single judge dict
762
855
  task_id = data.get('task_id', f"eval_{id(data)}")
763
856
 
764
- if not model_file or not judge:
765
- return jsonify({'error': 'Missing model_file or judge'}), 400
857
+ # Handle both single judge (backwards compat) and multiple judges
858
+ if data.get('judge'):
859
+ judges = [data.get('judge')]
860
+ elif not judges:
861
+ return jsonify({'error': 'Missing judge or judges'}), 400
862
+
863
+ # Ensure judges is a list
864
+ if not isinstance(judges, list):
865
+ judges = [judges]
866
+
867
+ if not model_file:
868
+ return jsonify({'error': 'Missing model_file'}), 400
766
869
 
767
870
  # Load weak model results
768
871
  model_path = DATA_DIR / model_file
@@ -782,18 +885,22 @@ def run_evaluation_endpoint():
782
885
  # Extract model name from filename
783
886
  model_name = model_file.replace('weak_model_', '').replace('.json', '')
784
887
 
888
+ # Create evaluation name with all judges
889
+ judges_names = '_'.join([j['name'] for j in judges])
890
+ eval_name = f"eval-{model_name}-{judges_names}"
891
+
785
892
  # Initialize progress tracking
786
893
  total_steps = len(results)
787
894
  progress_state[task_id] = {
788
895
  'current': 0,
789
896
  'total': total_steps,
790
- 'message': f'Starting evaluation: {model_name} with {judge["name"]}...',
897
+ 'message': f'Starting evaluation: {model_name} with {len(judges)} judge(s)...',
791
898
  'status': 'running'
792
899
  }
793
900
 
794
901
  # Create evaluation logger
795
902
  ev = weave.EvaluationLogger(
796
- name=f"eval-{model_name}-{judge['name']}",
903
+ name=eval_name,
797
904
  model=model_name
798
905
  )
799
906
 
@@ -818,13 +925,20 @@ def run_evaluation_endpoint():
818
925
  if messages and len(messages) > 0:
819
926
  question = messages[0].get('content', '')
820
927
 
821
- # Run judge
822
- if judge['type'] == 'llm':
823
- scores = run_llm_judge_eval(judge, strong_output, weak_output, question)
824
- else:
825
- scores = run_custom_judge_eval(judge, strong_output, weak_output)
928
+ # Run all judges and collect scores
929
+ all_scores = {}
930
+ for judge in judges:
931
+ # Run judge
932
+ if judge['type'] == 'llm':
933
+ scores = run_llm_judge_eval(judge, strong_output, weak_output, question)
934
+ else:
935
+ scores = run_custom_judge_eval(judge, strong_output, weak_output)
936
+
937
+ # Merge scores with judge name prefix to avoid conflicts
938
+ for score_key, score_value in scores.items():
939
+ all_scores[f"{judge['name']}_{score_key}"] = score_value
826
940
 
827
- # Log to weave
941
+ # Log to weave with all scores from all judges
828
942
  ev.log_example(
829
943
  inputs={
830
944
  "question": question,
@@ -834,7 +948,7 @@ def run_evaluation_endpoint():
834
948
  "weak_output": weak_output
835
949
 
836
950
  },
837
- scores=scores
951
+ scores=all_scores
838
952
  )
839
953
 
840
954
  # Finish evaluation
@@ -850,10 +964,11 @@ def run_evaluation_endpoint():
850
964
 
851
965
  return jsonify({
852
966
  'status': 'success',
853
- 'evaluation_name': f"eval-{model_name}-{judge['name']}",
967
+ 'evaluation_name': eval_name,
854
968
  'examples_evaluated': len(results),
855
969
  'weave_url': ev.ui_url,
856
970
  'strong_export': strong_export,
971
+ 'judges': [j['name'] for j in judges],
857
972
  'task_id': task_id
858
973
  })
859
974
 
@@ -1032,6 +1147,32 @@ def list_projects():
1032
1147
  return jsonify({'projects': projects})
1033
1148
 
1034
1149
 
1150
@app.route('/get_preferences', methods=['GET'])
def get_preferences():
    """Get saved user preferences.

    Reads ``preferences.json`` from ``DATA_DIR`` and returns its contents as a
    JSON response. An empty JSON object is returned when the file is missing,
    cannot be read, does not contain valid JSON, or holds something other
    than a JSON object.
    """
    prefs_file = DATA_DIR / 'preferences.json'
    try:
        with open(prefs_file, 'r', encoding='utf-8') as f:
            prefs = json.load(f)
    except (OSError, ValueError):
        # Missing/unreadable file (OSError) or malformed content (ValueError,
        # which covers json.JSONDecodeError): fall back to "no preferences".
        prefs = {}

    # Callers read named keys from the result, so never hand back null/list.
    if not isinstance(prefs, dict):
        prefs = {}
    return jsonify(prefs)
1161
+
1162
+
1163
@app.route('/save_preferences', methods=['POST'])
def save_preferences():
    """Save user preferences.

    Expects a JSON object in the request body and writes it to
    ``preferences.json`` in ``DATA_DIR``, replacing the previous contents.

    Returns:
        ``{'status': 'success'}`` on success; a 400 error response when the
        body is missing or is not a JSON object; a 500 error response when the
        file cannot be written.
    """
    # silent=True yields None for a missing or unparseable body instead of
    # raising, so the validation below handles every bad-input case.
    data = request.get_json(silent=True)
    if not isinstance(data, dict):
        return jsonify({
            'status': 'error',
            'message': 'Preferences must be a JSON object'
        }), 400

    prefs_file = DATA_DIR / 'preferences.json'
    try:
        with open(prefs_file, 'w', encoding='utf-8') as f:
            json.dump(data, f, indent=2)
    except OSError as e:
        return jsonify({'status': 'error', 'message': str(e)}), 500
    return jsonify({'status': 'success'})
1174
+
1175
+
1035
1176
  # Routes for serving HTML pages
1036
1177
  @app.route('/')
1037
1178
  def index():
@@ -43,10 +43,21 @@
43
43
  padding: 20px;
44
44
  border-radius: 8px;
45
45
  margin-bottom: 20px;
46
- display: flex;
46
+ }
47
+
48
+ .filter-row {
49
+ display: grid;
50
+ grid-template-columns: auto 1fr auto 1fr auto auto;
47
51
  gap: 15px;
48
- align-items: center;
49
- flex-wrap: wrap;
52
+ align-items: start;
53
+ margin-bottom: 20px;
54
+ }
55
+
56
+ .filter-group {
57
+ display: flex;
58
+ flex-direction: column;
59
+ gap: 8px;
60
+ min-width: 250px;
50
61
  }
51
62
 
52
63
  .controls label {
@@ -283,66 +294,75 @@
283
294
  </div>
284
295
 
285
296
  <div class="controls">
286
- <div style="display: flex; flex-direction: column; gap: 4px;">
287
- <label for="op-filter">Filter by Operation:</label>
288
- <span style="color: #666; font-size: 11px;">Primary supported: openai.chat.completions.create</span>
289
- </div>
290
- <select id="op-filter">
291
- <option value="all">All Operations</option>
292
- </select>
293
-
294
- <label for="model-filter">Filter by Model:</label>
295
- <select id="model-filter">
296
- <option value="all">All Models</option>
297
- </select>
298
-
299
- <button id="select-all-btn" style="margin-left: 20px; padding: 8px 16px; background: #2a7c4a; color: white; border: none; border-radius: 4px; cursor: pointer;">
300
- Select All Filtered
301
- </button>
297
+ <!-- Filters Row -->
298
+ <div class="filter-row">
299
+ <div class="filter-group">
300
+ <label for="op-filter">Operation Filter:</label>
301
+ <select id="op-filter" style="background: #2a2a2a; color: #fff; border: 1px solid #3a3a3a; padding: 8px 12px; border-radius: 4px; font-size: 14px; cursor: pointer;">
302
+ <option value="all">All Operations</option>
303
+ </select>
304
+ <span style="color: #4a9eff; font-size: 11px; font-weight: 500;">✅ Fully supported: OpenAI (chat.completions, responses), Anthropic (Messages), Google Gemini (generate_content, Chat)</span>
305
+ </div>
302
306
 
303
- <!-- Manual Workflow Section -->
304
- <div style="margin: 20px 0; padding: 15px; background: #1a2a1a; border-radius: 8px; border: 3px solid #ffffff;">
305
- <div style="color: #ffffff; font-size: 14px; font-weight: 500; margin-bottom: 12px;">📋 Manual Workflow (Step-by-Step):</div>
306
- <div style="display: flex; flex-wrap: wrap; gap: 10px;">
307
- <button id="export-btn" style="padding: 8px 16px; background: #4a9eff; color: white; border: none; border-radius: 4px; cursor: pointer;">
308
- Export Selected to Test Set (<span id="selected-count">0</span>)
309
- </button>
307
+ <div class="filter-group">
308
+ <label for="model-filter">Model Filter:</label>
309
+ <select id="model-filter" style="background: #2a2a2a; color: #fff; border: 1px solid #3a3a3a; padding: 8px 12px; border-radius: 4px; font-size: 14px; cursor: pointer;">
310
+ <option value="all">All Models</option>
311
+ </select>
312
+ </div>
310
313
 
311
- <button id="open-inference-btn" style="padding: 8px 16px; background: #7c4a9e; color: white; border: none; border-radius: 4px; cursor: pointer;">
312
- Run Weak Models
313
- </button>
314
+ <button id="select-all-btn" style="padding: 8px 16px; background: #2a7c4a; color: white; border: none; border-radius: 4px; cursor: pointer; align-self: end; white-space: nowrap;">
315
+ Select All
316
+ </button>
314
317
 
315
- <button id="open-eval-btn" style="padding: 8px 16px; background: #9e6a4a; color: white; border: none; border-radius: 4px; cursor: pointer;">
316
- Run Evaluation
317
- </button>
318
+ <div style="display: flex; flex-direction: column; gap: 4px; align-self: end;">
319
+ <div style="color: #888; font-size: 13px;">Total: <span id="total-count" style="color: #fff; font-weight: 600;">0</span></div>
320
+ <div style="color: #888; font-size: 13px;">Shown: <span id="shown-count" style="color: #4a9eff; font-weight: 600;">0</span></div>
318
321
  </div>
319
322
  </div>
320
323
 
321
- <!-- Utilities -->
322
- <a href="/judge" target="_blank" style="padding: 8px 16px; background: #4a5a9e; color: white; border: none; border-radius: 4px; text-decoration: none; display: inline-block;">
323
- Manage Judges
324
- </a>
325
-
326
- <button id="open-test-judge-btn" style="padding: 8px 16px; background: #6a4a7e; color: white; border: none; border-radius: 4px; cursor: pointer;">
327
- Test Judges
328
- </button>
324
+ <!-- Action Buttons Row -->
325
+ <div style="display: grid; grid-template-columns: 2fr 1fr; gap: 20px;">
326
+ <!-- Main Workflow -->
327
+ <div style="padding: 15px; background: #0f1f0f; border-radius: 8px; border: 2px solid #2a4a2a;">
328
+ <div style="color: #6dd36d; font-size: 13px; font-weight: 600; margin-bottom: 12px;">📋 MANUAL WORKFLOW</div>
329
+ <div style="display: flex; flex-wrap: wrap; gap: 10px;">
330
+ <button id="export-btn" style="padding: 10px 16px; background: #4a9eff; color: white; border: none; border-radius: 4px; cursor: pointer; font-weight: 500;">
331
+ 1. Export Test Set (<span id="selected-count">0</span>)
332
+ </button>
333
+ <button id="open-inference-btn" style="padding: 10px 16px; background: #7c4a9e; color: white; border: none; border-radius: 4px; cursor: pointer; font-weight: 500;">
334
+ 2. Run Weak Models
335
+ </button>
336
+ <button id="open-eval-btn" style="padding: 10px 16px; background: #9e6a4a; color: white; border: none; border-radius: 4px; cursor: pointer; font-weight: 500;">
337
+ 3. Evaluate Results
338
+ </button>
339
+ </div>
340
+ </div>
329
341
 
330
- <button id="open-settings-btn" style="padding: 8px 16px; background: #5a5a5a; color: white; border: none; border-radius: 4px; cursor: pointer;">
331
- Settings
332
- </button>
342
+ <!-- Utilities -->
343
+ <div style="padding: 15px; background: #1a1a2a; border-radius: 8px; border: 1px solid #2a2a3a;">
344
+ <div style="color: #aaa; font-size: 13px; font-weight: 600; margin-bottom: 12px;">⚙️ TOOLS</div>
345
+ <div style="display: flex; flex-wrap: wrap; gap: 8px;">
346
+ <a href="/judge" target="_blank" style="padding: 8px 14px; background: #4a5a9e; color: white; border: none; border-radius: 4px; text-decoration: none; display: inline-block; font-size: 13px;">
347
+ Judges
348
+ </a>
349
+ <button id="open-test-judge-btn" style="padding: 8px 14px; background: #6a4a7e; color: white; border: none; border-radius: 4px; cursor: pointer; font-size: 13px;">
350
+ Test Judge
351
+ </button>
352
+ <button id="open-settings-btn" style="padding: 8px 14px; background: #5a5a5a; color: white; border: none; border-radius: 4px; cursor: pointer; font-size: 13px;">
353
+ Settings
354
+ </button>
355
+ </div>
356
+ </div>
357
+ </div>
333
358
 
334
- <!-- Automatic Workflow Section -->
335
- <div style="margin: 20px 0; padding: 15px; background: #2a1a2a; border-radius: 8px; border: 1px solid #4a2a4a;">
336
- <div style="color: #aaa; font-size: 13px; margin-bottom: 10px;">Automatic Workflow:</div>
337
- <button id="open-e2e-btn" style="padding: 10px 20px; background: #7a4a9e; color: white; border: none; border-radius: 4px; cursor: pointer; font-weight: 500;">
359
+ <!-- Automatic Workflow -->
360
+ <div style="margin-top: 20px; padding: 15px; background: #2a1a2a; border-radius: 8px; border: 2px solid #7a4a9e;">
361
+ <div style="color: #bb88ff; font-size: 13px; font-weight: 600; margin-bottom: 10px;">⚡ AUTOMATIC WORKFLOW</div>
362
+ <button id="open-e2e-btn" style="padding: 12px 20px; background: #7a4a9e; color: white; border: none; border-radius: 4px; cursor: pointer; font-weight: 600; font-size: 14px;">
338
363
  ⚡ Run End-to-End Test
339
364
  </button>
340
- <div style="color: #666; font-size: 11px; margin-top: 8px;">Export → Generate → Evaluate (all in one)</div>
341
- </div>
342
-
343
- <div class="stats">
344
- <div>Total: <span id="total-count">0</span></div>
345
- <div>Shown: <span id="shown-count">0</span></div>
365
+ <div style="color: #888; font-size: 11px; margin-top: 8px;">Export → Generate → Evaluate (all in one)</div>
346
366
  </div>
347
367
  </div>
348
368
 
@@ -420,10 +440,10 @@
420
440
  </div>
421
441
 
422
442
  <div style="margin-bottom: 20px;">
423
- <label style="color: #aaa; display: block; margin-bottom: 10px;">Select Judge:</label>
424
- <select id="eval-judge" style="width: 100%; padding: 8px; background: #2a2a2a; color: #fff; border: 1px solid #3a3a3a; border-radius: 4px;">
425
- <!-- Judges populated dynamically -->
426
- </select>
443
+ <label style="color: #aaa; display: block; margin-bottom: 10px;">Select Judge(s) - you can select multiple:</label>
444
+ <div id="eval-judge-list" style="max-height: 200px; overflow-y: auto; background: #0f0f0f; padding: 15px; border-radius: 4px;">
445
+ <!-- Judges populated dynamically as checkboxes -->
446
+ </div>
427
447
  <div style="color: #666; font-size: 12px; margin-top: 5px;">
428
448
  <a href="/judge" target="_blank" style="color: #4a9eff;">Create/manage judges</a>
429
449
  </div>
@@ -594,10 +614,10 @@
594
614
 
595
615
  <!-- Judge Selection -->
596
616
  <div style="margin-bottom: 30px;">
597
- <h3 style="color: #fff; font-size: 16px; margin-bottom: 15px;">2. Select Judge</h3>
598
- <select id="e2e-judge" style="width: 100%; padding: 10px; background: #2a2a2a; color: #fff; border: 1px solid #3a3a3a; border-radius: 4px; font-size: 14px;">
599
- <option value="">Loading judges...</option>
600
- </select>
617
+ <h3 style="color: #fff; font-size: 16px; margin-bottom: 15px;">2. Select Judges</h3>
618
+ <div id="e2e-judge-list" style="max-height: 200px; overflow-y: auto; background: #2a2a2a; border: 1px solid #3a3a3a; border-radius: 4px; padding: 10px;">
619
+ <p style="color: #888;">Loading judges...</p>
620
+ </div>
601
621
  </div>
602
622
 
603
623
  <!-- Actions -->
@@ -656,6 +676,17 @@
656
676
  "Qwen/Qwen3-Coder-480B-A35B-Instruct",
657
677
  ];
658
678
 
679
+ const SUPPORTED_OPS = [
680
+ 'openai.chat.completions.create',
681
+ 'openai.responses.create',
682
+ 'anthropic.Messages.create',
683
+ 'anthropic.Messages.stream',
684
+ 'google.genai.models.Models.generate_content',
685
+ 'google.genai.models.Models.generate_content_stream',
686
+ 'google.genai.chats.Chat.send_message',
687
+ 'google.genai.chats.Chat.send_message_stream'
688
+ ];
689
+
659
690
  let allTraces = [];
660
691
  let currentOpFilter = 'all';
661
692
  let currentModelFilter = 'all';
@@ -667,6 +698,10 @@
667
698
  // Load projects list
668
699
  async function loadProjects() {
669
700
  try {
701
+ // Load saved preferences
702
+ const prefsResponse = await fetch('/get_preferences');
703
+ const prefs = await prefsResponse.json();
704
+
670
705
  const response = await fetch('/list_projects');
671
706
  const data = await response.json();
672
707
  const select = document.getElementById('project-select');
@@ -678,11 +713,23 @@
678
713
  `<option value="${p.name}">${p.name} (${p.trace_count} traces)</option>`
679
714
  ).join('');
680
715
 
681
- // Auto-select first project and load it
682
- if (data.projects.length > 0) {
683
- currentProject = data.projects[0].name;
684
- select.value = currentProject;
685
- await loadTraces(currentProject);
716
+ // Use saved project or first project
717
+ let projectToLoad = prefs.lastProject || data.projects[0].name;
718
+
719
+ // Check if saved project still exists
720
+ const projectExists = data.projects.some(p => p.name === projectToLoad);
721
+ if (!projectExists) {
722
+ projectToLoad = data.projects[0].name;
723
+ }
724
+
725
+ currentProject = projectToLoad;
726
+ select.value = currentProject;
727
+ await loadTraces(currentProject);
728
+
729
+ // Set default filter to "All Supported Ops"
730
+ if (!prefs.lastOpFilter) {
731
+ currentOpFilter = 'supported';
732
+ document.getElementById('op-filter').value = 'supported';
686
733
  }
687
734
  }
688
735
  } catch (e) {
@@ -690,6 +737,153 @@
690
737
  }
691
738
  }
692
739
 
740
// Patch traces to handle different provider formats.
// Returns a new array of shallow-copied traces in which:
//   - `output` is reduced to readable text where a known format is recognized,
//   - `messages` is filled from provider-specific input fields when absent,
//   - `usage` gains a few derived, provider-specific fields.
// The input trace objects (including their `usage` objects) are not mutated.
function patchTracesForProviders(traces) {
    return traces.map(trace => {
        const patched = { ...trace };

        // Extract provider from op_display_name
        const opName = trace.op_display_name || '';

        // === PARSE WEAVEOBJECT OUTPUTS ===
        // Test the type rather than truthiness: an empty string is falsy but
        // must still reach the streaming check below.
        if (typeof patched.output === 'string') {
            const rawOutput = patched.output;

            if (rawOutput === '' || rawOutput === 'None' || rawOutput === 'null') {
                // Empty/null output on a streaming operation
                if (opName.includes('stream') || opName.includes('Stream')) {
                    patched.output = '[Streaming output - not captured in trace]';
                }
            } else if (rawOutput.startsWith('WeaveObject(')) {
                // Parse WeaveObject strings
                patched.output = extractFromWeaveObject(rawOutput, opName);
            } else if (opName.includes('openai.responses.create')) {
                // Parse OpenAI responses.create JSON output
                try {
                    const respObj = JSON.parse(rawOutput);
                    if (respObj.output && Array.isArray(respObj.output)) {
                        // Extract text from output messages
                        const textParts = respObj.output
                            .filter(item => item.type === 'message')
                            .flatMap(msg => msg.content || [])
                            .filter(c => c.type === 'output_text')
                            .map(c => c.text);
                        patched.output = textParts.join('\n\n') || JSON.stringify(respObj, null, 2);
                    }
                } catch (e) {
                    // Keep original if parsing fails
                }
            }
        }

        // === EXTRACT MESSAGES FOR NON-OPENAI FORMATS ===
        if (patched.inputs && (!patched.messages || patched.messages.length === 0)) {
            const inputs = patched.inputs;
            if (opName.includes('anthropic') && inputs.messages) {
                // Anthropic format
                patched.messages = inputs.messages;
            } else if (opName.includes('google.genai') && inputs.contents) {
                // Gemini contents format
                patched.messages = extractGeminiMessages(inputs.contents);
            } else if (opName.includes('Chat.send_message') && inputs.message) {
                // Gemini Chat.send_message format
                patched.messages = [{ role: 'user', content: inputs.message }];
            } else if (opName.includes('openai.responses') && inputs.input) {
                // OpenAI responses.create input format
                patched.messages = [{ role: 'user', content: inputs.input }];
            }
        }

        // === ADD PROVIDER-SPECIFIC USAGE INFO ===
        if (patched.usage && typeof patched.usage === 'object') {
            // Copy first: `patched` is only a shallow copy, so writing to
            // patched.usage directly would modify the caller's trace.
            const usage = { ...patched.usage };

            // Gemini thoughts tokens
            if (usage.thoughts_tokens) {
                usage.thoughts_tokens_label = 'Thinking';
            }
            // OpenAI reasoning tokens
            if (usage.output_tokens_details && usage.output_tokens_details.reasoning_tokens) {
                usage.reasoning_tokens = usage.output_tokens_details.reasoning_tokens;
            }
            // Anthropic cache metrics
            if (usage.cache_read_input_tokens || usage.cache_creation_input_tokens) {
                usage.has_cache_info = true;
            }

            patched.usage = usage;
        }

        return patched;
    });
}
818
+
819
// Extract text from a WeaveObject string.
// Returns the unescaped value of the first single-quoted 'text' field, a
// truncated "[Complex WeaveObject ...]" notice when no such field exists, or
// a "[Failed to parse ...]" notice if extraction throws.
// `opName` is accepted for call-site compatibility and is currently unused.
function extractFromWeaveObject(weaveStr, opName) {
    try {
        // Match the 'text' field up to the next unescaped quote. The `s` flag
        // lets an escaped character span a newline in multiline values.
        const textMatch = weaveStr.match(/'text':\s*'((?:[^'\\]|\\.)*)'/s);

        // Check the match itself, not the captured group: an empty text
        // value is a valid result and should come back as ''.
        if (textMatch) {
            const escapes = { "'": "'", '"': '"', n: '\n', t: '\t', r: '\r', '\\': '\\' };

            // Decode every escape in one pass so an escaped backslash followed
            // by "n" stays a backslash + "n" rather than becoming a newline.
            // Unknown escape sequences are left as they are.
            return textMatch[1].replace(/\\([\s\S])/g, (whole, ch) =>
                Object.prototype.hasOwnProperty.call(escapes, ch) ? escapes[ch] : whole
            );
        }

        // Fallback: if no text field found, show truncated version
        return `[Complex WeaveObject - see raw data]\n${weaveStr.substring(0, 500)}...`;
    } catch (e) {
        console.error('Failed to parse WeaveObject:', e);
        // String(...) keeps this handler from throwing when the input was not
        // a string in the first place.
        return `[Failed to parse WeaveObject]\n${String(weaveStr).substring(0, 200)}...`;
    }
}
848
+
849
// Extract messages from Gemini contents format.
// Accepts an array whose entries are WeaveObject strings or plain
// { role, parts: [{ text }] } objects and returns [{ role, content }].
// A non-array argument yields []. Entries without a role default to 'user'.
function extractGeminiMessages(contents) {
    if (!Array.isArray(contents)) return [];

    const escapes = { "'": "'", '"': '"', n: '\n', t: '\t', r: '\r', '\\': '\\' };

    // Single-pass decode: an escaped backslash followed by "n" must remain a
    // backslash + "n". Unknown escape sequences are left as they are.
    const unescapeText = raw =>
        raw.replace(/\\([\s\S])/g, (whole, ch) =>
            Object.prototype.hasOwnProperty.call(escapes, ch) ? escapes[ch] : whole
        );

    return contents.map(content => {
        // Handle WeaveObject string
        if (typeof content === 'string' && content.startsWith('WeaveObject(')) {
            const roleMatch = content.match(/'role':\s*'(\w+)'/);
            const textMatch = content.match(/'text':\s*'((?:[^'\\]|\\.)*)'/s);

            return {
                role: roleMatch ? roleMatch[1] : 'user',
                content: textMatch ? unescapeText(textMatch[1]) : '[Complex content]'
            };
        }

        // Handle regular object; guard against null entries and non-array
        // parts so one malformed item cannot abort the whole conversion.
        if (content && typeof content === 'object' && Array.isArray(content.parts)) {
            return {
                role: content.role || 'user',
                content: content.parts.map(p => (p && p.text) || '').join('\n')
            };
        }

        return { role: 'user', content: String(content) };
    });
}
886
+
693
887
  // Load traces from selected project
694
888
  async function loadTraces(projectName) {
695
889
  const projectPath = projectName.replace('/', '_');
@@ -704,7 +898,7 @@
704
898
  }
705
899
 
706
900
  const data = await response.json();
707
- allTraces = data;
901
+ allTraces = patchTracesForProviders(data);
708
902
  currentProject = projectName;
709
903
  populateFilters();
710
904
  renderTraces();
@@ -782,6 +976,12 @@
782
976
  const projectName = e.target.value;
783
977
  if (projectName) {
784
978
  await loadTraces(projectName);
979
+ // Save preference
980
+ await fetch('/save_preferences', {
981
+ method: 'POST',
982
+ headers: { 'Content-Type': 'application/json' },
983
+ body: JSON.stringify({ lastProject: projectName })
984
+ });
785
985
  }
786
986
  });
787
987
 
@@ -790,9 +990,21 @@
790
990
 
791
991
  // Populate filter dropdowns
792
992
  function populateFilters() {
793
- // Populate operation filter
794
- const ops = new Set(allTraces.map(t => t.op_display_name || 'unknown'));
993
+ // Clear existing options (except "All") to avoid duplicates when switching projects
795
994
  const opSelect = document.getElementById('op-filter');
995
+ const modelSelect = document.getElementById('model-filter');
996
+
997
+ // Save current filter values
998
+ const savedOpFilter = currentOpFilter;
999
+ const savedModelFilter = currentModelFilter;
1000
+
1001
+ // Clear dropdowns but keep the "All" option
1002
+ opSelect.innerHTML = '<option value="all">All Operations</option>';
1003
+ opSelect.innerHTML += '<option value="supported">All Supported Ops</option>';
1004
+ modelSelect.innerHTML = '<option value="all">All Models</option>';
1005
+
1006
+ // Populate operation filter with operations from current project only
1007
+ const ops = new Set(allTraces.map(t => t.op_display_name || 'unknown'));
796
1008
  const sortedOps = [...ops].sort();
797
1009
  sortedOps.forEach(op => {
798
1010
  const option = document.createElement('option');
@@ -801,21 +1013,36 @@
801
1013
  opSelect.appendChild(option);
802
1014
  });
803
1015
 
804
- // Set default to openai.chat.completions.create if it exists
805
- if (sortedOps.includes('openai.chat.completions.create')) {
806
- opSelect.value = 'openai.chat.completions.create';
807
- currentOpFilter = 'openai.chat.completions.create';
808
- }
809
-
810
- // Populate model filter
1016
+ // Populate model filter with models from current project only
811
1017
  const models = new Set(allTraces.map(t => t.model));
812
- const modelSelect = document.getElementById('model-filter');
813
1018
  [...models].sort().forEach(model => {
814
1019
  const option = document.createElement('option');
815
1020
  option.value = model;
816
1021
  option.textContent = model;
817
1022
  modelSelect.appendChild(option);
818
1023
  });
1024
+
1025
+ // Restore previous filter values if they still exist
1026
+ // Special handling for 'all' and 'supported' which always exist
1027
+ if (savedOpFilter === 'all' || savedOpFilter === 'supported') {
1028
+ opSelect.value = savedOpFilter;
1029
+ currentOpFilter = savedOpFilter;
1030
+ } else if (sortedOps.includes(savedOpFilter)) {
1031
+ opSelect.value = savedOpFilter;
1032
+ currentOpFilter = savedOpFilter;
1033
+ } else {
1034
+ // Default to 'supported' when switching projects
1035
+ opSelect.value = 'supported';
1036
+ currentOpFilter = 'supported';
1037
+ }
1038
+
1039
+ if ([...models].includes(savedModelFilter)) {
1040
+ modelSelect.value = savedModelFilter;
1041
+ currentModelFilter = savedModelFilter;
1042
+ } else {
1043
+ modelSelect.value = 'all';
1044
+ currentModelFilter = 'all';
1045
+ }
819
1046
  }
820
1047
 
821
1048
  // Filter change handlers
@@ -834,7 +1061,13 @@
834
1061
  let filteredTraces = allTraces;
835
1062
 
836
1063
  // Apply operation filter
837
- if (currentOpFilter !== 'all') {
1064
+ if (currentOpFilter === 'supported') {
1065
+ // Filter to only supported operations
1066
+ filteredTraces = filteredTraces.filter(t => {
1067
+ const opDisplayName = t.op_display_name || '';
1068
+ return SUPPORTED_OPS.some(op => opDisplayName.includes(op));
1069
+ });
1070
+ } else if (currentOpFilter !== 'all') {
838
1071
  filteredTraces = filteredTraces.filter(t => t.op_display_name === currentOpFilter);
839
1072
  }
840
1073
 
@@ -868,9 +1101,12 @@
868
1101
  ${trace.usage && (trace.usage.total_tokens || trace.usage.requests) ? `
869
1102
  <div class="usage-info">
870
1103
  ${trace.usage.requests ? `<div class="usage-item"><span class="usage-label">Requests:</span> ${trace.usage.requests}</div>` : ''}
871
- ${trace.usage.prompt_tokens ? `<div class="usage-item"><span class="usage-label">Prompt:</span> ${trace.usage.prompt_tokens}</div>` : ''}
872
- ${trace.usage.completion_tokens ? `<div class="usage-item"><span class="usage-label">Completion:</span> ${trace.usage.completion_tokens}</div>` : ''}
1104
+ ${trace.usage.prompt_tokens || trace.usage.input_tokens ? `<div class="usage-item"><span class="usage-label">Input:</span> ${trace.usage.prompt_tokens || trace.usage.input_tokens}</div>` : ''}
1105
+ ${trace.usage.completion_tokens || trace.usage.output_tokens ? `<div class="usage-item"><span class="usage-label">Output:</span> ${trace.usage.completion_tokens || trace.usage.output_tokens}</div>` : ''}
873
1106
  ${trace.usage.total_tokens ? `<div class="usage-item"><span class="usage-label">Total:</span> ${trace.usage.total_tokens}</div>` : ''}
1107
+ ${trace.usage.reasoning_tokens ? `<div class="usage-item" style="color: #ff9d00;"><span class="usage-label">Reasoning:</span> ${trace.usage.reasoning_tokens}</div>` : ''}
1108
+ ${trace.usage.thoughts_tokens ? `<div class="usage-item" style="color: #9d66ff;"><span class="usage-label">Thinking:</span> ${trace.usage.thoughts_tokens}</div>` : ''}
1109
+ ${trace.usage.cache_read_input_tokens ? `<div class="usage-item" style="color: #4a9eff;"><span class="usage-label">Cache Read:</span> ${trace.usage.cache_read_input_tokens}</div>` : ''}
874
1110
  </div>
875
1111
  ` : ''}
876
1112
 
@@ -980,17 +1216,15 @@
980
1216
  return;
981
1217
  }
982
1218
 
983
- // Filter to only OpenAI completion traces (exclude wrapper function traces)
1219
+ // Filter to only supported provider traces (exclude wrapper function traces)
984
1220
  const completionTraces = selectedData.filter(t => {
985
- const opName = t.op_name || '';
986
1221
  const opDisplayName = t.op_display_name || '';
987
- // Only include traces from openai.chat.completions.create
988
- return opDisplayName === 'openai.chat.completions.create' ||
989
- opName.includes('openai.chat.completions.create');
1222
+ // Check if it's one of our supported operations
1223
+ return SUPPORTED_OPS.some(op => opDisplayName.includes(op));
990
1224
  });
991
1225
 
992
1226
  if (completionTraces.length === 0) {
993
- alert('No OpenAI completion traces selected! Please select traces from actual API calls, not wrapper functions.');
1227
+ alert('No supported provider traces selected! Supported: OpenAI, Anthropic, Gemini');
994
1228
  return;
995
1229
  }
996
1230
 
@@ -1044,9 +1278,18 @@
1044
1278
  // Get filtered traces
1045
1279
  function getFilteredTraces() {
1046
1280
  let filtered = allTraces;
1047
- if (currentOpFilter !== 'all') {
1281
+
1282
+ // Apply operation filter
1283
+ if (currentOpFilter === 'supported') {
1284
+ filtered = filtered.filter(t => {
1285
+ const opDisplayName = t.op_display_name || '';
1286
+ return SUPPORTED_OPS.some(op => opDisplayName.includes(op));
1287
+ });
1288
+ } else if (currentOpFilter !== 'all') {
1048
1289
  filtered = filtered.filter(t => t.op_display_name === currentOpFilter);
1049
1290
  }
1291
+
1292
+ // Apply model filter
1050
1293
  if (currentModelFilter !== 'all') {
1051
1294
  filtered = filtered.filter(t => t.model === currentModelFilter);
1052
1295
  }
@@ -1287,38 +1530,48 @@
1287
1530
  const response = await fetch('/list_judges');
1288
1531
  const data = await response.json();
1289
1532
  const judges = data.judges || [];
1290
- const judgeSelect = document.getElementById('eval-judge');
1533
+ const judgeList = document.getElementById('eval-judge-list');
1291
1534
 
1292
1535
  if (judges.length === 0) {
1293
- judgeSelect.innerHTML = '<option value="">No judges defined - create one first</option>';
1536
+ judgeList.innerHTML = '<div style="color: #888;">No judges defined - <a href="/judge" target="_blank" style="color: #4a9eff;">create one first</a></div>';
1294
1537
  } else {
1295
- judgeSelect.innerHTML = judges.map((j, i) => `<option value="${i}">${j.name} (${j.type})</option>`).join('');
1538
+ judgeList.innerHTML = judges.map((j, i) => `
1539
+ <label style="display: flex; align-items: center; padding: 8px; margin-bottom: 8px; background: #1a1a1a; border-radius: 4px; cursor: pointer; transition: background 0.2s;">
1540
+ <input type="checkbox" class="eval-judge-checkbox" data-judge-index="${i}" style="margin-right: 10px; width: 18px; height: 18px; cursor: pointer;">
1541
+ <div style="flex: 1;">
1542
+ <div style="color: #fff; font-size: 14px; font-weight: 500;">${j.name}</div>
1543
+ <div style="color: #888; font-size: 12px;">${j.type}</div>
1544
+ </div>
1545
+ </label>
1546
+ `).join('');
1296
1547
  }
1297
1548
  } catch (e) {
1298
1549
  console.error('Error loading judges:', e);
1299
- document.getElementById('eval-judge').innerHTML = '<option value="">Error loading judges</option>';
1550
+ document.getElementById('eval-judge-list').innerHTML = '<div style="color: #f88;">Error loading judges</div>';
1300
1551
  }
1301
1552
  }
1302
1553
 
1303
1554
  // Run evaluation
1304
1555
  document.getElementById('run-eval-btn').addEventListener('click', async () => {
1305
- const judgeIndex = document.getElementById('eval-judge').value;
1556
+ // Get selected judges
1557
+ const selectedJudgeCheckboxes = document.querySelectorAll('.eval-judge-checkbox:checked');
1558
+ const selectedJudgeIndices = Array.from(selectedJudgeCheckboxes).map(cb => parseInt(cb.dataset.judgeIndex));
1306
1559
 
1307
1560
  if (selectedEvalModels.size === 0) {
1308
1561
  alert('Please select at least one weak model');
1309
1562
  return;
1310
1563
  }
1311
1564
 
1312
- if (!judgeIndex) {
1313
- alert('Please select a judge');
1565
+ if (selectedJudgeIndices.length === 0) {
1566
+ alert('Please select at least one judge');
1314
1567
  return;
1315
1568
  }
1316
1569
 
1317
1570
  // Load judges from server
1318
1571
  const judgesResponse = await fetch('/list_judges');
1319
1572
  const judgesData = await judgesResponse.json();
1320
- const judges = judgesData.judges || [];
1321
- const judge = judges[parseInt(judgeIndex)];
1573
+ const allJudges = judgesData.judges || [];
1574
+ const selectedJudges = selectedJudgeIndices.map(idx => allJudges[idx]);
1322
1575
 
1323
1576
  // Show progress
1324
1577
  document.getElementById('eval-progress').style.display = 'block';
@@ -1328,17 +1581,17 @@
1328
1581
  const resultsDiv = document.getElementById('eval-results-links');
1329
1582
 
1330
1583
  progressText.textContent = `Starting evaluations...\n`;
1331
- progressText.textContent += `Judge: ${judge.name}\n`;
1584
+ progressText.textContent += `Judges: ${selectedJudges.map(j => j.name).join(', ')}\n`;
1332
1585
  progressText.textContent += `Models: ${selectedEvalModels.size}\n\n`;
1333
1586
 
1334
1587
  const modelFiles = Array.from(selectedEvalModels);
1335
1588
  const results = [];
1336
1589
 
1337
- // Run evaluations sequentially with granular progress
1338
- for (let i = 0; i < modelFiles.length; i++) {
1339
- const modelFile = modelFiles[i];
1590
+ // Run one evaluation per model with ALL judges combined
1591
+ for (let modelIdx = 0; modelIdx < modelFiles.length; modelIdx++) {
1592
+ const modelFile = modelFiles[modelIdx];
1340
1593
 
1341
- progressText.textContent += `[${i+1}/${modelFiles.length}] Starting ${modelFile}...\n`;
1594
+ progressText.textContent += `[${modelIdx + 1}/${modelFiles.length}] Evaluating ${modelFile} with ${selectedJudges.length} judge(s)...\n`;
1342
1595
 
1343
1596
  let pollInterval = null;
1344
1597
  let taskId = null;
@@ -1349,9 +1602,8 @@
1349
1602
  const resp = await fetch(`/progress/${taskId}`);
1350
1603
  if (resp.ok) {
1351
1604
  const progress = await resp.json();
1352
- const percent = (progress.current / progress.total) * 100;
1605
+ const percent = ((modelIdx + 1) / modelFiles.length) * 100;
1353
1606
  progressFill.style.width = `${percent}%`;
1354
- progressText.textContent = `[${i+1}/${modelFiles.length}] ${progress.message}\nProgress: ${progress.current}/${progress.total} (${percent.toFixed(1)}%)\n`;
1355
1607
  }
1356
1608
  } catch (e) {
1357
1609
  console.error('Error polling eval progress:', e);
@@ -1360,17 +1612,18 @@
1360
1612
 
1361
1613
  try {
1362
1614
  // Generate task ID for this evaluation
1363
- taskId = `eval_${Date.now()}_${i}`;
1615
+ taskId = `eval_${Date.now()}_${modelIdx}`;
1364
1616
 
1365
1617
  // Start polling
1366
1618
  pollInterval = setInterval(pollProgress, 300);
1367
1619
 
1620
+ // Send all judges in one request
1368
1621
  const response = await fetch('/run_evaluation', {
1369
1622
  method: 'POST',
1370
1623
  headers: { 'Content-Type': 'application/json' },
1371
1624
  body: JSON.stringify({
1372
1625
  model_file: modelFile,
1373
- judge: judge,
1626
+ judges: selectedJudges, // Send all judges
1374
1627
  task_id: taskId
1375
1628
  })
1376
1629
  });
@@ -1385,6 +1638,7 @@
1385
1638
  if (pollInterval) clearInterval(pollInterval);
1386
1639
 
1387
1640
  progressText.textContent += ` ✓ Complete: ${result.evaluation_name}\n`;
1641
+ progressText.textContent += ` Judges used: ${result.judges.join(', ')}\n`;
1388
1642
  progressText.textContent += ` Examples: ${result.examples_evaluated}\n\n`;
1389
1643
 
1390
1644
  results.push({
@@ -1788,14 +2042,17 @@
1788
2042
  try {
1789
2043
  const response = await fetch('/list_judges');
1790
2044
  const data = await response.json();
1791
- const judgeSelect = document.getElementById('e2e-judge');
2045
+ const judgeList = document.getElementById('e2e-judge-list');
1792
2046
 
1793
2047
  if (data.judges && data.judges.length > 0) {
1794
- judgeSelect.innerHTML = data.judges.map((judge, idx) =>
1795
- `<option value="${idx}">${judge.name} (${judge.type})</option>`
1796
- ).join('');
2048
+ judgeList.innerHTML = data.judges.map((judge, idx) => `
2049
+ <label style="display: block; padding: 5px 0; color: #ccc; cursor: pointer;">
2050
+ <input type="checkbox" class="e2e-judge-checkbox" value="${idx}" style="margin-right: 8px;">
2051
+ ${judge.name} (${judge.type})
2052
+ </label>
2053
+ `).join('');
1797
2054
  } else {
1798
- judgeSelect.innerHTML = '<option value="">No judges available - create one first</option>';
2055
+ judgeList.innerHTML = '<p style="color: #888;">No judges available - create one first</p>';
1799
2056
  }
1800
2057
  } catch (error) {
1801
2058
  console.error('Error loading judges:', error);
@@ -1831,9 +2088,10 @@
1831
2088
  return;
1832
2089
  }
1833
2090
 
1834
- const judgeIndex = document.getElementById('e2e-judge').value;
1835
- if (!judgeIndex) {
1836
- alert('Please select a judge!');
2091
+ // Get selected judges
2092
+ const selectedJudgeIndices = Array.from(document.querySelectorAll('.e2e-judge-checkbox:checked')).map(cb => parseInt(cb.value));
2093
+ if (selectedJudgeIndices.length === 0) {
2094
+ alert('Please select at least one judge!');
1837
2095
  return;
1838
2096
  }
1839
2097
 
@@ -1842,7 +2100,7 @@
1842
2100
  // Load judge data
1843
2101
  const judgesResponse = await fetch('/list_judges');
1844
2102
  const judgesData = await judgesResponse.json();
1845
- const judge = judgesData.judges[parseInt(judgeIndex)];
2103
+ const judges = selectedJudgeIndices.map(idx => judgesData.judges[idx]);
1846
2104
 
1847
2105
  // Hide config panel, show progress panel
1848
2106
  document.getElementById('e2e-panel').style.display = 'none';
@@ -1928,7 +2186,8 @@
1928
2186
 
1929
2187
  // === STEP 3: Run Evaluations ===
1930
2188
  stepLabel.textContent = 'Step 3/3: Running evaluations...';
1931
- progressText.textContent += `📊 Running evaluations with judge: ${judge.name}...\n`;
2189
+ const judgeNames = judges.map(j => j.name).join(', ');
2190
+ progressText.textContent += `📊 Running evaluations with ${judges.length} judge(s): ${judgeNames}...\n`;
1932
2191
 
1933
2192
  const evaluationResults = [];
1934
2193
 
@@ -1971,7 +2230,7 @@
1971
2230
  headers: { 'Content-Type': 'application/json' },
1972
2231
  body: JSON.stringify({
1973
2232
  model_file: modelFile,
1974
- judge: judge,
2233
+ judges: judges,
1975
2234
  task_id: evalTaskId
1976
2235
  })
1977
2236
  });
@@ -1,6 +1,6 @@
1
1
  Metadata-Version: 2.4
2
2
  Name: quickdistill
3
- Version: 0.1.8
3
+ Version: 0.1.9
4
4
  Summary: Fast and easy toolkit for distilling AI models
5
5
  Author-email: Brett Young <bdytx5@umsystem.edu>
6
6
  License: MIT
@@ -0,0 +1,17 @@
1
+ quickdistill/__init__.py,sha256=dOl_wXruBGyDGhe1Iu4-SQLu_6-_b6rt1lkxfOp3Jqo,823
2
+ quickdistill/cli.py,sha256=A8d5GN9NdBS299WyAsJ6-p8ynW3DJnDRHZ-UGH7TXLM,2212
3
+ quickdistill/default_judges.json,sha256=9uDqsYc9CsJwZAWwOkWcqgmlGZNJ0zzyXpv4wZ8vtuE,1446
4
+ quickdistill/get_traces.py,sha256=mfy9fMiK-CZQN1noZ4DfOwdwP45ntthVDLgh4-u2iNk,4896
5
+ quickdistill/server.py,sha256=0yBQ5vt1oD7OkhH7ap2cR8j-wuVG3fU7jARijmD1eOs,42849
6
+ quickdistill/__pycache__/__init__.cpython-310.pyc,sha256=RqzjvxzPxHFJZkBjX6DSH9vbVTtskVgJ4pTQ6EX2A6o,794
7
+ quickdistill/__pycache__/cli.cpython-310.pyc,sha256=xtVgJTayQLKS4gE_te7U1Wo8LmkDtPkaa2rnzu8h9fY,2443
8
+ quickdistill/__pycache__/get_traces.cpython-310.pyc,sha256=T7Suxp9vpqYDQJ_3uJvXWemqoLf5tnRC2I0BfHrSiNM,2956
9
+ quickdistill/__pycache__/server.cpython-310.pyc,sha256=8W74-E_S0dJRRwRG7nF9UL64kdbyDoNswAi5y51Xc3I,25593
10
+ quickdistill/default_projects/byyoung3_arena-detailed/traces_data.json,sha256=iz-cBmXBYj0bC3Vn754QTnGuDh6sRvlE_RzSyGXaxbY,15496950
11
+ quickdistill/static/judge_manager.html,sha256=t6dSPwo_d-GIu1FscuK1KDgxKCnmiOekQTMu80lZPPY,27166
12
+ quickdistill/static/trace_viewer.html,sha256=lAMO6Mj-MWQqXGC4bo2v8ybM4ci082h2HaDQ1AOl2jM,109884
13
+ quickdistill-0.1.9.dist-info/METADATA,sha256=-VH48FybeQbxuxUOlSn0zHJfCOkxfklCrxCHbdRYFRQ,5084
14
+ quickdistill-0.1.9.dist-info/WHEEL,sha256=wUyA8OaulRlbfwMtmQsvNngGrxQHAvkKcvRmdizlJi0,92
15
+ quickdistill-0.1.9.dist-info/entry_points.txt,sha256=AUUTxnwdD9gRnsOEcTXQTAZIZ_F0aRU7JGstIJ3Xk_o,55
16
+ quickdistill-0.1.9.dist-info/top_level.txt,sha256=ysiMvurJYsE1IhkxmObe-0G8A-GIav40kTh2z6axjxg,13
17
+ quickdistill-0.1.9.dist-info/RECORD,,
@@ -1,17 +0,0 @@
1
- quickdistill/__init__.py,sha256=4hLOUVOlPTaZaCLc7950TQGMb-EV_3J9t2qT7StwA7k,397
2
- quickdistill/cli.py,sha256=A8d5GN9NdBS299WyAsJ6-p8ynW3DJnDRHZ-UGH7TXLM,2212
3
- quickdistill/default_judges.json,sha256=9uDqsYc9CsJwZAWwOkWcqgmlGZNJ0zzyXpv4wZ8vtuE,1446
4
- quickdistill/get_traces.py,sha256=mfy9fMiK-CZQN1noZ4DfOwdwP45ntthVDLgh4-u2iNk,4896
5
- quickdistill/server.py,sha256=0Y0XG-8oYoNZgmo10LPZgtwlHuGqrq0urxE-KabyIvI,36789
6
- quickdistill/__pycache__/__init__.cpython-310.pyc,sha256=kCGMGP5qGjIpf2QZcBVLVTVlQKd-HHy_l9tHr1LfysU,603
7
- quickdistill/__pycache__/cli.cpython-310.pyc,sha256=xtVgJTayQLKS4gE_te7U1Wo8LmkDtPkaa2rnzu8h9fY,2443
8
- quickdistill/__pycache__/get_traces.cpython-310.pyc,sha256=T7Suxp9vpqYDQJ_3uJvXWemqoLf5tnRC2I0BfHrSiNM,2956
9
- quickdistill/__pycache__/server.cpython-310.pyc,sha256=_taKWofMtdgfMZzfVsd7PoC4jnuKxEOGzW82YBxqPPc,22051
10
- quickdistill/default_projects/byyoung3_arena-detailed/traces_data.json,sha256=iz-cBmXBYj0bC3Vn754QTnGuDh6sRvlE_RzSyGXaxbY,15496950
11
- quickdistill/static/judge_manager.html,sha256=t6dSPwo_d-GIu1FscuK1KDgxKCnmiOekQTMu80lZPPY,27166
12
- quickdistill/static/trace_viewer.html,sha256=yt_zPP88px_51a9ilv8UhrssnVOT-2hjEPHEGoRlPrQ,95152
13
- quickdistill-0.1.8.dist-info/METADATA,sha256=q4uGRUvQ3HSlHff0ZKs1tBzGos-iOiSxHq3HbKJHa-k,5084
14
- quickdistill-0.1.8.dist-info/WHEEL,sha256=wUyA8OaulRlbfwMtmQsvNngGrxQHAvkKcvRmdizlJi0,92
15
- quickdistill-0.1.8.dist-info/entry_points.txt,sha256=AUUTxnwdD9gRnsOEcTXQTAZIZ_F0aRU7JGstIJ3Xk_o,55
16
- quickdistill-0.1.8.dist-info/top_level.txt,sha256=ysiMvurJYsE1IhkxmObe-0G8A-GIav40kTh2z6axjxg,13
17
- quickdistill-0.1.8.dist-info/RECORD,,