quickdistill 0.1.7__py3-none-any.whl → 0.1.9__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- quickdistill/__init__.py +12 -1
- quickdistill/__pycache__/__init__.cpython-310.pyc +0 -0
- quickdistill/__pycache__/server.cpython-310.pyc +0 -0
- quickdistill/default_judges.json +2 -2
- quickdistill/server.py +170 -29
- quickdistill/static/judge_manager.html +12 -8
- quickdistill/static/trace_viewer.html +379 -112
- {quickdistill-0.1.7.dist-info → quickdistill-0.1.9.dist-info}/METADATA +1 -1
- quickdistill-0.1.9.dist-info/RECORD +17 -0
- quickdistill-0.1.7.dist-info/RECORD +0 -17
- {quickdistill-0.1.7.dist-info → quickdistill-0.1.9.dist-info}/WHEEL +0 -0
- {quickdistill-0.1.7.dist-info → quickdistill-0.1.9.dist-info}/entry_points.txt +0 -0
- {quickdistill-0.1.7.dist-info → quickdistill-0.1.9.dist-info}/top_level.txt +0 -0
quickdistill/__init__.py CHANGED

@@ -8,7 +8,18 @@ This package provides tools to:
 - Export datasets for model evaluation
 """
 
-
+# Monkey patch for aiohttp/litellm compatibility
+# litellm expects aiohttp.ConnectionTimeoutError but it doesn't exist in some versions
+try:
+    import aiohttp
+    if not hasattr(aiohttp, 'ConnectionTimeoutError'):
+        aiohttp.ConnectionTimeoutError = aiohttp.ServerTimeoutError
+    if not hasattr(aiohttp, 'SocketTimeoutError'):
+        aiohttp.SocketTimeoutError = aiohttp.ServerTimeoutError
+except Exception:
+    pass
+
+__version__ = "0.1.9"
 __author__ = "Brett Young"
 __email__ = "bdytx5@umsystem.edu"

quickdistill/__pycache__/__init__.cpython-310.pyc CHANGED
Binary file

quickdistill/__pycache__/server.cpython-310.pyc CHANGED
Binary file
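A quick way to see what the new shim buys you: on aiohttp builds that predate ConnectionTimeoutError, the aliases let litellm's except clauses resolve. A minimal standalone sketch (assumes aiohttp is installed; the simulated raise is illustrative only):

import aiohttp

# Same aliasing as the patched __init__.py above.
if not hasattr(aiohttp, 'ConnectionTimeoutError'):
    aiohttp.ConnectionTimeoutError = aiohttp.ServerTimeoutError
if not hasattr(aiohttp, 'SocketTimeoutError'):
    aiohttp.SocketTimeoutError = aiohttp.ServerTimeoutError

# Works on both old and new aiohttp: on new versions ConnectionTimeoutError
# subclasses ServerTimeoutError; on patched old versions it *is* ServerTimeoutError.
try:
    raise aiohttp.ConnectionTimeoutError("simulated timeout")
except aiohttp.ServerTimeoutError as e:
    print(f"caught: {e}")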
quickdistill/default_judges.json CHANGED

@@ -2,14 +2,14 @@
   {
     "name": "boolean_scorer",
     "type": "llm",
-    "model": "gpt-5",
+    "model": "openai/gpt-5",
     "returnType": "boolean",
     "prompt": "You are a strict evaluator comparing two AI responses (one from a strong reference model which is the ground truth, and one from a weaker model which we are testing to see how similar the responses it generates are to the strong model).\n\nStrong Model Response: {strong_output}\nWeak Model Response: {weak_output}\n\nDetermine if the weak model response is CORRECT compared to the strong model response.\nConsider a response CORRECT if it conveys the same key information and meaning, even if worded differently.\n\nRespond in JSON format: {'correct': true} or {'correct': false}"
   },
   {
     "name": "scalar_scorer",
     "type": "llm",
-    "model": "gpt-5",
+    "model": "openai/gpt-5",
     "returnType": "scalar",
     "prompt": "You are a strict evaluator comparing two AI responses (one from a strong reference model which is the ground truth, and one from a weaker model which we are testing to see how similar the responses it generates are to the strong model).\n\nStrong Model Response: {strong_output}\nWeak Model Response: {weak_output}\n\nEvaluate how similar the weak model response is to the strong model response.\nRate on a scale of 1-5 where 1=completely different and 5=nearly identical. RETURN ONLY ONE SCORE REPRESENTY THE AVERAGE SIMILARITY (EG 5-(avg_error))\n\nRespond in JSON format eg {'scores': the_score }"
   }
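The changed "model" values use LiteLLM's provider-prefixed naming. Note the prompts embed literal JSON braces ({'correct': true}), so naive str.format would raise on them; a plain-replacement sketch of filling the placeholders (render_prompt is a hypothetical helper, and the top-level list structure is assumed, neither is shown in this diff):

import json

# Hypothetical helper for illustration; the server's actual substitution
# logic is not part of this diff.
def render_prompt(template, strong_output, weak_output):
    # str.replace rather than str.format: the templates contain literal
    # braces such as {'correct': true} that str.format would choke on.
    return (template
            .replace('{strong_output}', strong_output)
            .replace('{weak_output}', weak_output))

with open('quickdistill/default_judges.json') as f:
    judges = json.load(f)  # assumed: a top-level list of judge dicts

boolean_judge = judges[0]
print(boolean_judge['model'])  # -> openai/gpt-5
print(render_prompt(boolean_judge['prompt'], 'Paris.', 'It is Paris.'))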
quickdistill/server.py CHANGED

@@ -100,40 +100,133 @@ def run_inference(client, model, messages, max_tokens=1000):
         return f"ERROR: {str(e)}"
 
 def extract_output_content(output_str):
-    """Extract actual content from WeaveObject string or regular output
+    """Extract actual content from WeaveObject string, JSON response, or regular output.
+
+    Handles outputs from:
+    - OpenAI chat.completions.create (plain text)
+    - OpenAI responses.create (JSON with nested structure)
+    - Anthropic Messages (WeaveObject with content[0].text)
+    - Google Gemini (WeaveObject with candidates[0].content.parts[0].text)
+    """
+    import re
+    import json
+
     if not output_str:
         return None
 
-
-
-
-
-
+    if not isinstance(output_str, str):
+        return str(output_str)
+
+    # Handle empty/streaming responses
+    if output_str in ('', 'None', 'null'):
+        return '[Streaming output - not captured]'
+
+    # Handle OpenAI responses.create JSON format
+    if output_str.startswith('{') and '"output"' in output_str:
+        try:
+            resp_obj = json.loads(output_str)
+            if 'output' in resp_obj and isinstance(resp_obj['output'], list):
+                # Extract text from output messages
+                text_parts = []
+                for item in resp_obj['output']:
+                    if item.get('type') == 'message' and 'content' in item:
+                        for content in item['content']:
+                            if content.get('type') == 'output_text' and 'text' in content:
+                                text_parts.append(content['text'])
+                if text_parts:
+                    return '\n\n'.join(text_parts)
+        except (json.JSONDecodeError, KeyError, TypeError):
+            pass  # Fall through to other handlers
+
+    # Handle WeaveObject strings (Anthropic, Gemini)
+    if 'WeaveObject' in output_str:
+        # Improved regex that handles escape sequences properly
+        match = re.search(r"'text':\s*'((?:[^'\\]|\\.)*)'", output_str, re.DOTALL)
         if match:
-            # Unescape the string
+            # Unescape the string properly (order matters!)
             text = match.group(1)
-            text = text.replace(
+            text = text.replace("\\'", "'")    # escaped single quotes
+            text = text.replace('\\"', '"')    # escaped double quotes
+            text = text.replace('\\n', '\n')   # newlines
+            text = text.replace('\\t', '\t')   # tabs
+            text = text.replace('\\r', '\r')   # carriage returns
+            text = text.replace('\\\\', '\\')  # escaped backslashes (do this last!)
             return text
 
-
+        # If no text field found, return truncated version
+        return f"[Complex WeaveObject - could not extract text]\n{output_str[:500]}..."
+
+    # Plain text output (standard OpenAI chat format)
     return output_str
 
 
 def extract_messages_from_trace(trace):
-    """Extract messages from a trace in the format needed for inference
-
+    """Extract messages from a trace in the format needed for inference.
+
+    Handles message extraction from:
+    - OpenAI chat.completions.create (messages at top level or in inputs.messages)
+    - OpenAI responses.create (inputs.input field)
+    - Anthropic Messages (inputs.messages)
+    - Google Gemini generate_content (inputs.contents array)
+    - Google Gemini Chat.send_message (inputs.message string)
+    """
+    import re
+
+    # Get op_display_name for provider detection
+    op_name = trace.get('op_display_name', '')
+
+    # Check if messages are at top level (already extracted/cached)
     if trace.get('messages') and isinstance(trace['messages'], list) and len(trace['messages']) > 0:
         return trace['messages']
 
     # Check if messages are in inputs
     if trace.get('inputs') and isinstance(trace['inputs'], dict):
-
+        inputs = trace['inputs']
+
+        # Standard OpenAI/Anthropic: inputs.messages
+        messages = inputs.get('messages', [])
         if isinstance(messages, list) and len(messages) > 0:
             return messages
 
+        # OpenAI responses.create: inputs.input (simple string)
+        if 'openai.responses' in op_name and 'input' in inputs:
+            return [{"role": "user", "content": inputs['input']}]
+
+        # Gemini Chat.send_message: inputs.message (simple string)
+        if 'Chat.send_message' in op_name and 'message' in inputs:
+            return [{"role": "user", "content": inputs['message']}]
+
+        # Gemini generate_content: inputs.contents (array of content objects or WeaveObject strings)
+        if 'google.genai' in op_name and 'contents' in inputs:
+            contents = inputs['contents']
+            if isinstance(contents, list) and len(contents) > 0:
+                messages = []
+                for content in contents:
+                    # Handle WeaveObject string format
+                    if isinstance(content, str) and 'WeaveObject' in content:
+                        role_match = re.search(r"'role':\s*'(\w+)'", content)
+                        text_match = re.search(r"'text':\s*'((?:[^'\\]|\\.)*)'", content, re.DOTALL)
+                        text = '[Complex content]'
+                        if text_match:
+                            text = text_match.group(1)
+                            text = text.replace("\\'", "'").replace('\\n', '\n').replace('\\\\', '\\')
+                        messages.append({
+                            "role": role_match.group(1) if role_match else "user",
+                            "content": text
+                        })
+                    # Handle regular dict format
+                    elif isinstance(content, dict):
+                        role = content.get('role', 'user')
+                        parts = content.get('parts', [])
+                        if isinstance(parts, list):
+                            text = '\n'.join([p.get('text', '') for p in parts if isinstance(p, dict)])
+                            messages.append({"role": role, "content": text})
+                if messages:
+                    return messages
+
         # Check if inputs has question/context format (from generate_test_traces.py wrapper traces)
-        question =
-        context =
+        question = inputs.get('question')
+        context = inputs.get('context')
         if question:
             if context:
                 prompt = f"""Based on the following context, answer the question concisely.
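To make the new responses.create branch concrete, here is the same traversal run standalone on a synthetic payload shaped the way the docstring describes (output -> message -> output_text):

import json

# Synthetic responses.create-style output; real payloads come from weave traces.
raw = json.dumps({
    "output": [
        {"type": "message",
         "content": [{"type": "output_text", "text": "Hello"},
                     {"type": "output_text", "text": "World"}]}
    ]
})

resp_obj = json.loads(raw)
text_parts = [c['text']
              for item in resp_obj.get('output', [])
              if item.get('type') == 'message'
              for c in item.get('content', [])
              if c.get('type') == 'output_text' and 'text' in c]
print('\n\n'.join(text_parts))  # -> "Hello\n\nWorld"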
@@ -753,16 +846,26 @@ def delete_judge():
 
 @app.route('/run_evaluation', methods=['POST'])
 def run_evaluation_endpoint():
-    """Run evaluation using specified judge"""
-
+    """Run evaluation using specified judge(s) - supports multiple judges"""
+
 
     data = request.json
     model_file = data.get('model_file')
-
+    judges = data.get('judges')  # Can be a list or single judge dict
    task_id = data.get('task_id', f"eval_{id(data)}")
 
-
-
+    # Handle both single judge (backwards compat) and multiple judges
+    if data.get('judge'):
+        judges = [data.get('judge')]
+    elif not judges:
+        return jsonify({'error': 'Missing judge or judges'}), 400
+
+    # Ensure judges is a list
+    if not isinstance(judges, list):
+        judges = [judges]
+
+    if not model_file:
+        return jsonify({'error': 'Missing model_file'}), 400
 
     # Load weak model results
     model_path = DATA_DIR / model_file
@@ -782,18 +885,22 @@ def run_evaluation_endpoint():
     # Extract model name from filename
     model_name = model_file.replace('weak_model_', '').replace('.json', '')
 
+    # Create evaluation name with all judges
+    judges_names = '_'.join([j['name'] for j in judges])
+    eval_name = f"eval-{model_name}-{judges_names}"
+
     # Initialize progress tracking
     total_steps = len(results)
     progress_state[task_id] = {
         'current': 0,
         'total': total_steps,
-        'message': f'Starting evaluation: {model_name} with {judge
+        'message': f'Starting evaluation: {model_name} with {len(judges)} judge(s)...',
         'status': 'running'
     }
 
     # Create evaluation logger
     ev = weave.EvaluationLogger(
-        name=
+        name=eval_name,
         model=model_name
     )
 
@@ -818,13 +925,20 @@ def run_evaluation_endpoint():
         if messages and len(messages) > 0:
             question = messages[0].get('content', '')
 
-        # Run
-
-
-
-
+        # Run all judges and collect scores
+        all_scores = {}
+        for judge in judges:
+            # Run judge
+            if judge['type'] == 'llm':
+                scores = run_llm_judge_eval(judge, strong_output, weak_output, question)
+            else:
+                scores = run_custom_judge_eval(judge, strong_output, weak_output)
+
+            # Merge scores with judge name prefix to avoid conflicts
+            for score_key, score_value in scores.items():
+                all_scores[f"{judge['name']}_{score_key}"] = score_value
 
-        # Log to weave
+        # Log to weave with all scores from all judges
         ev.log_example(
             inputs={
                 "question": question,
@@ -834,7 +948,7 @@ def run_evaluation_endpoint():
                 "weak_output": weak_output
 
             },
-            scores=
+            scores=all_scores
         )
 
     # Finish evaluation
@@ -850,10 +964,11 @@ def run_evaluation_endpoint():
 
     return jsonify({
         'status': 'success',
-        'evaluation_name':
+        'evaluation_name': eval_name,
         'examples_evaluated': len(results),
         'weave_url': ev.ui_url,
         'strong_export': strong_export,
+        'judges': [j['name'] for j in judges],
         'task_id': task_id
     })
 
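The reworked endpoint accepts either the legacy single judge key or a judges list, and per-judge scores come back prefixed with the judge name (e.g. boolean_scorer_correct) so they never collide. A hedged client sketch (filename, prompt, and Flask's default port are placeholder assumptions):

import requests

payload = {
    "model_file": "weak_model_my-model.json",  # hypothetical results file
    "judges": [                                # new-style: a list of judges
        {"name": "boolean_scorer", "type": "llm", "model": "openai/gpt-5",
         "returnType": "boolean",
         "prompt": "Compare {strong_output} and {weak_output} ..."},
    ],
    "task_id": "eval_demo",
}
resp = requests.post("http://localhost:5000/run_evaluation", json=payload)
print(resp.json().get("judges"))     # names of judges that ran
print(resp.json().get("weave_url"))  # link to the logged evaluation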
@@ -1032,6 +1147,32 @@ def list_projects():
     return jsonify({'projects': projects})
 
 
+@app.route('/get_preferences', methods=['GET'])
+def get_preferences():
+    """Get saved user preferences"""
+    prefs_file = DATA_DIR / 'preferences.json'
+    if prefs_file.exists():
+        try:
+            with open(prefs_file, 'r') as f:
+                return jsonify(json.load(f))
+        except:
+            pass
+    return jsonify({})
+
+
+@app.route('/save_preferences', methods=['POST'])
+def save_preferences():
+    """Save user preferences"""
+    try:
+        data = request.json
+        prefs_file = DATA_DIR / 'preferences.json'
+        with open(prefs_file, 'w') as f:
+            json.dump(data, f, indent=2)
+        return jsonify({'status': 'success'})
+    except Exception as e:
+        return jsonify({'status': 'error', 'message': str(e)}), 500
+
+
 # Routes for serving HTML pages
 @app.route('/')
 def index():
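The two new preference routes are a thin JSON file store under DATA_DIR, used by the trace viewer to remember the last project and filter. A round-trip sketch (again assuming the default local port):

import requests

base = "http://localhost:5000"  # assumed local dev port
requests.post(f"{base}/save_preferences",
              json={"lastProject": "byyoung3_arena-detailed"})
prefs = requests.get(f"{base}/get_preferences").json()
print(prefs.get("lastProject"))  # -> byyoung3_arena-detailed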
quickdistill/static/judge_manager.html CHANGED

@@ -183,12 +183,10 @@
 
             <div id="llm-options" style="display: block;">
                 <label for="judge-model">Model</label>
-                <
-
-                <
-
-                <option value="claude-3-5-sonnet-20241022">claude-3-5-sonnet</option>
-                </select>
+                <input type="text" id="judge-model" placeholder="e.g., openai/gpt-5, anthropic/claude-3.5-sonnet" value="openai/gpt-5">
+                <p style="color: #888; font-size: 12px; margin-top: 5px; margin-bottom: 15px;">
+                    <strong>Note:</strong> Uses LiteLLM format. Examples: <code>openai/gpt-5</code>, <code>anthropic/claude-3.5-sonnet</code>, <code>openai/gpt-4o</code>
+                </p>
 
             <label for="judge-return-type">Return Type</label>
             <select id="judge-return-type">
@@ -393,10 +391,16 @@ Respond in JSON format: {'correct': true} or {'correct': false}`
         };
 
         if (type === 'llm') {
-            judge.model = document.getElementById('judge-model').value;
+            judge.model = document.getElementById('judge-model').value.trim();
             judge.returnType = document.getElementById('judge-return-type').value;
             judge.prompt = document.getElementById('judge-prompt').value.trim();
 
+            // Validate model
+            if (!judge.model) {
+                alert('Error: Please enter a model (e.g., openai/gpt-5)');
+                return;
+            }
+
             // Validate required placeholders
             if (!judge.prompt.includes('{strong_output}')) {
                 alert('Error: Judge prompt must include {strong_output} placeholder');
@@ -420,7 +424,7 @@ Respond in JSON format: {'correct': true} or {'correct': false}`
         function resetForm() {
             document.getElementById('judge-name').value = '';
             document.getElementById('judge-type').value = 'llm';
-            document.getElementById('judge-model').value = 'gpt-5
+            document.getElementById('judge-model').value = 'openai/gpt-5';
             document.getElementById('judge-prompt').value = '';
             document.getElementById('form-title').textContent = 'Create New Judge';
             document.getElementById('save-btn').textContent = 'Save Judge';
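The judge manager's model field became free text because judge calls are routed through LiteLLM, which dispatches on provider-prefixed model strings. A minimal sketch of what such a string resolves to (assumes litellm is installed and OPENAI_API_KEY is set; gpt-4o is used here since it is one of the examples the UI itself suggests):

import litellm

# "openai/gpt-4o" = provider prefix + model name, the format the UI now asks for.
response = litellm.completion(
    model="openai/gpt-4o",
    messages=[{"role": "user", "content": "Reply with the single word: ok"}],
)
print(response.choices[0].message.content)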
quickdistill/static/trace_viewer.html CHANGED

@@ -43,10 +43,21 @@
             padding: 20px;
             border-radius: 8px;
             margin-bottom: 20px;
-
+        }
+
+        .filter-row {
+            display: grid;
+            grid-template-columns: auto 1fr auto 1fr auto auto;
             gap: 15px;
-            align-items:
-
+            align-items: start;
+            margin-bottom: 20px;
+        }
+
+        .filter-group {
+            display: flex;
+            flex-direction: column;
+            gap: 8px;
+            min-width: 250px;
         }
 
         .controls label {
@@ -283,58 +294,75 @@
         </div>
 
         <div class="controls">
-
-
-            <
-
-
-
-
-
-
-            <select id="model-filter">
-                <option value="all">All Models</option>
-            </select>
-
-            <button id="select-all-btn" style="margin-left: 20px; padding: 8px 16px; background: #2a7c4a; color: white; border: none; border-radius: 4px; cursor: pointer;">
-                Select All Filtered
-            </button>
-
-            <button id="export-btn" style="padding: 8px 16px; background: #4a9eff; color: white; border: none; border-radius: 4px; cursor: pointer;">
-                Export Selected to Test Set (<span id="selected-count">0</span>)
-            </button>
+            <!-- Filters Row -->
+            <div class="filter-row">
+                <div class="filter-group">
+                    <label for="op-filter">Operation Filter:</label>
+                    <select id="op-filter" style="background: #2a2a2a; color: #fff; border: 1px solid #3a3a3a; padding: 8px 12px; border-radius: 4px; font-size: 14px; cursor: pointer;">
+                        <option value="all">All Operations</option>
+                    </select>
+                    <span style="color: #4a9eff; font-size: 11px; font-weight: 500;">✅ Fully supported: OpenAI (chat.completions, responses), Anthropic (Messages), Google Gemini (generate_content, Chat)</span>
+                </div>
 
-
-
-
+                <div class="filter-group">
+                    <label for="model-filter">Model Filter:</label>
+                    <select id="model-filter" style="background: #2a2a2a; color: #fff; border: 1px solid #3a3a3a; padding: 8px 12px; border-radius: 4px; font-size: 14px; cursor: pointer;">
+                        <option value="all">All Models</option>
+                    </select>
+                </div>
 
-
-
-
+                <button id="select-all-btn" style="padding: 8px 16px; background: #2a7c4a; color: white; border: none; border-radius: 4px; cursor: pointer; align-self: end; white-space: nowrap;">
+                    Select All
+                </button>
 
-
-
-
+                <div style="display: flex; flex-direction: column; gap: 4px; align-self: end;">
+                    <div style="color: #888; font-size: 13px;">Total: <span id="total-count" style="color: #fff; font-weight: 600;">0</span></div>
+                    <div style="color: #888; font-size: 13px;">Shown: <span id="shown-count" style="color: #4a9eff; font-weight: 600;">0</span></div>
+                </div>
+            </div>
 
-
-
-
+            <!-- Action Buttons Row -->
+            <div style="display: grid; grid-template-columns: 2fr 1fr; gap: 20px;">
+                <!-- Main Workflow -->
+                <div style="padding: 15px; background: #0f1f0f; border-radius: 8px; border: 2px solid #2a4a2a;">
+                    <div style="color: #6dd36d; font-size: 13px; font-weight: 600; margin-bottom: 12px;">📋 MANUAL WORKFLOW</div>
+                    <div style="display: flex; flex-wrap: wrap; gap: 10px;">
+                        <button id="export-btn" style="padding: 10px 16px; background: #4a9eff; color: white; border: none; border-radius: 4px; cursor: pointer; font-weight: 500;">
+                            1. Export Test Set (<span id="selected-count">0</span>)
+                        </button>
+                        <button id="open-inference-btn" style="padding: 10px 16px; background: #7c4a9e; color: white; border: none; border-radius: 4px; cursor: pointer; font-weight: 500;">
+                            2. Run Weak Models
+                        </button>
+                        <button id="open-eval-btn" style="padding: 10px 16px; background: #9e6a4a; color: white; border: none; border-radius: 4px; cursor: pointer; font-weight: 500;">
+                            3. Evaluate Results
+                        </button>
+                    </div>
+                </div>
 
-
-
-
+                <!-- Utilities -->
+                <div style="padding: 15px; background: #1a1a2a; border-radius: 8px; border: 1px solid #2a2a3a;">
+                    <div style="color: #aaa; font-size: 13px; font-weight: 600; margin-bottom: 12px;">⚙️ TOOLS</div>
+                    <div style="display: flex; flex-wrap: wrap; gap: 8px;">
+                        <a href="/judge" target="_blank" style="padding: 8px 14px; background: #4a5a9e; color: white; border: none; border-radius: 4px; text-decoration: none; display: inline-block; font-size: 13px;">
+                            Judges
+                        </a>
+                        <button id="open-test-judge-btn" style="padding: 8px 14px; background: #6a4a7e; color: white; border: none; border-radius: 4px; cursor: pointer; font-size: 13px;">
+                            Test Judge
+                        </button>
+                        <button id="open-settings-btn" style="padding: 8px 14px; background: #5a5a5a; color: white; border: none; border-radius: 4px; cursor: pointer; font-size: 13px;">
+                            Settings
+                        </button>
+                    </div>
+                </div>
+            </div>
 
-
-
-            <
+            <!-- Automatic Workflow -->
+            <div style="margin-top: 20px; padding: 15px; background: #2a1a2a; border-radius: 8px; border: 2px solid #7a4a9e;">
+                <div style="color: #bb88ff; font-size: 13px; font-weight: 600; margin-bottom: 10px;">⚡ AUTOMATIC WORKFLOW</div>
+                <button id="open-e2e-btn" style="padding: 12px 20px; background: #7a4a9e; color: white; border: none; border-radius: 4px; cursor: pointer; font-weight: 600; font-size: 14px;">
                 ⚡ Run End-to-End Test
                 </button>
-            <div style="color: #
-            </div>
-
-            <div class="stats">
-                <div>Total: <span id="total-count">0</span></div>
-                <div>Shown: <span id="shown-count">0</span></div>
+                <div style="color: #888; font-size: 11px; margin-top: 8px;">Export → Generate → Evaluate (all in one)</div>
             </div>
         </div>
 
@@ -412,10 +440,10 @@
             </div>
 
             <div style="margin-bottom: 20px;">
-                <label style="color: #aaa; display: block; margin-bottom: 10px;">Select Judge:</label>
-                <
-                <!-- Judges populated dynamically -->
-                </
+                <label style="color: #aaa; display: block; margin-bottom: 10px;">Select Judge(s) - you can select multiple:</label>
+                <div id="eval-judge-list" style="max-height: 200px; overflow-y: auto; background: #0f0f0f; padding: 15px; border-radius: 4px;">
+                    <!-- Judges populated dynamically as checkboxes -->
+                </div>
                 <div style="color: #666; font-size: 12px; margin-top: 5px;">
                     <a href="/judge" target="_blank" style="color: #4a9eff;">Create/manage judges</a>
                 </div>
@@ -511,9 +539,9 @@
                 <label style="display: block; color: #aaa; margin-bottom: 8px; font-size: 14px;">Judge Model:</label>
                 <input type="text" id="test-judge-model"
                        style="width: 100%; padding: 10px; background: #2a2a2a; color: #fff; border: 1px solid #3a3a3a; border-radius: 4px; font-size: 14px;"
-                       placeholder="e.g., gpt-
+                       placeholder="e.g., openai/gpt-5, anthropic/claude-3.5-sonnet">
                 <div style="color: #666; font-size: 12px; margin-top: 5px;">
-                    Override the judge's model for this test
+                    Override the judge's model for this test. Uses LiteLLM format (e.g., <code style="color: #aaa;">openai/gpt-5</code>, <code style="color: #aaa;">anthropic/claude-3.5-sonnet</code>)
                 </div>
             </div>
 
@@ -586,10 +614,10 @@
 
             <!-- Judge Selection -->
             <div style="margin-bottom: 30px;">
-                <h3 style="color: #fff; font-size: 16px; margin-bottom: 15px;">2. Select
-                <
-                <
-                </
+                <h3 style="color: #fff; font-size: 16px; margin-bottom: 15px;">2. Select Judges</h3>
+                <div id="e2e-judge-list" style="max-height: 200px; overflow-y: auto; background: #2a2a2a; border: 1px solid #3a3a3a; border-radius: 4px; padding: 10px;">
+                    <p style="color: #888;">Loading judges...</p>
+                </div>
             </div>
 
             <!-- Actions -->
@@ -648,6 +676,17 @@
             "Qwen/Qwen3-Coder-480B-A35B-Instruct",
         ];
 
+        const SUPPORTED_OPS = [
+            'openai.chat.completions.create',
+            'openai.responses.create',
+            'anthropic.Messages.create',
+            'anthropic.Messages.stream',
+            'google.genai.models.Models.generate_content',
+            'google.genai.models.Models.generate_content_stream',
+            'google.genai.chats.Chat.send_message',
+            'google.genai.chats.Chat.send_message_stream'
+        ];
+
         let allTraces = [];
         let currentOpFilter = 'all';
         let currentModelFilter = 'all';
@@ -659,6 +698,10 @@
         // Load projects list
         async function loadProjects() {
             try {
+                // Load saved preferences
+                const prefsResponse = await fetch('/get_preferences');
+                const prefs = await prefsResponse.json();
+
                 const response = await fetch('/list_projects');
                 const data = await response.json();
                 const select = document.getElementById('project-select');
@@ -670,11 +713,23 @@
                     `<option value="${p.name}">${p.name} (${p.trace_count} traces)</option>`
                 ).join('');
 
-                //
-
-
-
-
+                // Use saved project or first project
+                let projectToLoad = prefs.lastProject || data.projects[0].name;
+
+                // Check if saved project still exists
+                const projectExists = data.projects.some(p => p.name === projectToLoad);
+                if (!projectExists) {
+                    projectToLoad = data.projects[0].name;
+                }
+
+                currentProject = projectToLoad;
+                select.value = currentProject;
+                await loadTraces(currentProject);
+
+                // Set default filter to "All Supported Ops"
+                if (!prefs.lastOpFilter) {
+                    currentOpFilter = 'supported';
+                    document.getElementById('op-filter').value = 'supported';
                 }
             }
         } catch (e) {
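SUPPORTED_OPS drives the new "All Supported Ops" filter and the export validation later in this file; matching is by substring, so weave's fully qualified op URIs also pass. The same check, rendered in Python:

SUPPORTED_OPS = [
    'openai.chat.completions.create',
    'openai.responses.create',
    'anthropic.Messages.create',
    'anthropic.Messages.stream',
    'google.genai.models.Models.generate_content',
    'google.genai.models.Models.generate_content_stream',
    'google.genai.chats.Chat.send_message',
    'google.genai.chats.Chat.send_message_stream',
]

def is_supported(op_display_name):
    # Substring match, mirroring opDisplayName.includes(op) in the JS above.
    return any(op in (op_display_name or '') for op in SUPPORTED_OPS)

print(is_supported('openai.chat.completions.create'))  # True
print(is_supported('my_wrapper_function'))             # False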
@@ -682,6 +737,153 @@
             }
         }
 
+        // Patch traces to handle different provider formats
+        function patchTracesForProviders(traces) {
+            return traces.map(trace => {
+                const patched = { ...trace };
+
+                // Extract provider from op_display_name
+                const opName = trace.op_display_name || '';
+
+                // === PARSE WEAVEOBJECT OUTPUTS ===
+                if (patched.output && typeof patched.output === 'string') {
+                    // Check if it's a streaming operation (empty or None)
+                    if (patched.output === '' || patched.output === 'None' || patched.output === 'null') {
+                        if (opName.includes('stream') || opName.includes('Stream')) {
+                            patched.output = '[Streaming output - not captured in trace]';
+                        }
+                    }
+                    // Parse WeaveObject strings
+                    else if (patched.output.startsWith('WeaveObject(')) {
+                        patched.output = extractFromWeaveObject(patched.output, opName);
+                    }
+                    // Parse OpenAI responses.create JSON output
+                    else if (opName.includes('openai.responses.create')) {
+                        try {
+                            const respObj = JSON.parse(patched.output);
+                            if (respObj.output && Array.isArray(respObj.output)) {
+                                // Extract text from output messages
+                                const textParts = respObj.output
+                                    .filter(item => item.type === 'message')
+                                    .flatMap(msg => msg.content || [])
+                                    .filter(c => c.type === 'output_text')
+                                    .map(c => c.text);
+                                patched.output = textParts.join('\n\n') || JSON.stringify(respObj, null, 2);
+                            }
+                        } catch (e) {
+                            // Keep original if parsing fails
+                        }
+                    }
+                }
+
+                // === EXTRACT MESSAGES FOR NON-OPENAI FORMATS ===
+                if (patched.inputs && (!patched.messages || patched.messages.length === 0)) {
+                    // Anthropic format
+                    if (opName.includes('anthropic') && patched.inputs.messages) {
+                        patched.messages = patched.inputs.messages;
+                    }
+                    // Gemini contents format
+                    else if (opName.includes('google.genai') && patched.inputs.contents) {
+                        patched.messages = extractGeminiMessages(patched.inputs.contents);
+                    }
+                    // Gemini Chat.send_message format
+                    else if (opName.includes('Chat.send_message') && patched.inputs.message) {
+                        patched.messages = [{ role: 'user', content: patched.inputs.message }];
+                    }
+                    // OpenAI responses.create input format
+                    else if (opName.includes('openai.responses') && patched.inputs.input) {
+                        patched.messages = [{ role: 'user', content: patched.inputs.input }];
+                    }
+                }
+
+                // === ADD PROVIDER-SPECIFIC USAGE INFO ===
+                if (patched.usage) {
+                    // Gemini thoughts tokens
+                    if (patched.usage.thoughts_tokens) {
+                        patched.usage.thoughts_tokens_label = 'Thinking';
+                    }
+                    // OpenAI reasoning tokens
+                    if (patched.usage.output_tokens_details && patched.usage.output_tokens_details.reasoning_tokens) {
+                        patched.usage.reasoning_tokens = patched.usage.output_tokens_details.reasoning_tokens;
+                    }
+                    // Anthropic cache metrics
+                    if (patched.usage.cache_read_input_tokens || patched.usage.cache_creation_input_tokens) {
+                        patched.usage.has_cache_info = true;
+                    }
+                }
+
+                return patched;
+            });
+        }
+
+        // Extract text from WeaveObject string based on provider
+        function extractFromWeaveObject(weaveStr, opName) {
+            try {
+                // Find the 'text' field and extract everything until the next unescaped quote
+                // This handles multiline strings with escaped quotes and newlines
+                const textMatch = weaveStr.match(/'text':\s*'((?:[^'\\]|\\.)*)'/s);
+
+                if (textMatch && textMatch[1]) {
+                    let extracted = textMatch[1];
+
+                    // Unescape common escape sequences
+                    extracted = extracted
+                        .replace(/\\'/g, "'")    // escaped single quotes
+                        .replace(/\\"/g, '"')    // escaped double quotes
+                        .replace(/\\n/g, '\n')   // newlines
+                        .replace(/\\t/g, '\t')   // tabs
+                        .replace(/\\r/g, '\r')   // carriage returns
+                        .replace(/\\\\/g, '\\'); // escaped backslashes (do this last)
+
+                    return extracted;
+                }
+
+                // Fallback: if no text field found, show truncated version
+                return `[Complex WeaveObject - see raw data]\n${weaveStr.substring(0, 500)}...`;
+            } catch (e) {
+                console.error('Failed to parse WeaveObject:', e);
+                return `[Failed to parse WeaveObject]\n${weaveStr.substring(0, 200)}...`;
+            }
+        }
+
+        // Extract messages from Gemini contents format
+        function extractGeminiMessages(contents) {
+            if (!Array.isArray(contents)) return [];
+
+            return contents.map(content => {
+                // Handle WeaveObject string
+                if (typeof content === 'string' && content.startsWith('WeaveObject(')) {
+                    // Try to extract basic info
+                    const roleMatch = content.match(/'role':\s*'(\w+)'/);
+                    const textMatch = content.match(/'text':\s*'((?:[^'\\]|\\.)*)'/s);
+
+                    let text = '[Complex content]';
+                    if (textMatch && textMatch[1]) {
+                        text = textMatch[1]
+                            .replace(/\\'/g, "'")
+                            .replace(/\\"/g, '"')
+                            .replace(/\\n/g, '\n')
+                            .replace(/\\t/g, '\t')
+                            .replace(/\\r/g, '\r')
+                            .replace(/\\\\/g, '\\');
+                    }
+
+                    return {
+                        role: roleMatch ? roleMatch[1] : 'user',
+                        content: text
+                    };
+                }
+                // Handle regular object
+                else if (content.role && content.parts) {
+                    return {
+                        role: content.role,
+                        content: content.parts.map(p => p.text || '').join('\n')
+                    };
+                }
+                return { role: 'user', content: String(content) };
+            });
+        }
+
         // Load traces from selected project
         async function loadTraces(projectName) {
             const projectPath = projectName.replace('/', '_');
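The extraction regex and the unescape order are the crux of the WeaveObject handling, and they mirror extract_output_content in server.py. A standalone Python check on a synthetic WeaveObject-style string:

import re

# Synthetic WeaveObject-style repr; real ones come from weave trace output.
weave_str = "WeaveObject(content=[{'type': 'text', 'text': 'Line 1\\nIt\\'s fine'}])"
match = re.search(r"'text':\s*'((?:[^'\\]|\\.)*)'", weave_str, re.DOTALL)
if match:
    text = match.group(1)
    # Unescape in this order; backslashes last, or they would re-escape
    # characters produced by the earlier replacements.
    for esc, real in (("\\'", "'"), ('\\"', '"'), ('\\n', '\n'),
                      ('\\t', '\t'), ('\\r', '\r'), ('\\\\', '\\')):
        text = text.replace(esc, real)
    print(text)  # prints "Line 1" and "It's fine" on two lines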
@@ -696,7 +898,7 @@
             }
 
             const data = await response.json();
-            allTraces = data;
+            allTraces = patchTracesForProviders(data);
             currentProject = projectName;
             populateFilters();
             renderTraces();
@@ -774,6 +976,12 @@
             const projectName = e.target.value;
             if (projectName) {
                 await loadTraces(projectName);
+                // Save preference
+                await fetch('/save_preferences', {
+                    method: 'POST',
+                    headers: { 'Content-Type': 'application/json' },
+                    body: JSON.stringify({ lastProject: projectName })
+                });
             }
         });
 
@@ -782,9 +990,21 @@
 
         // Populate filter dropdowns
         function populateFilters() {
-            //
-            const ops = new Set(allTraces.map(t => t.op_display_name || 'unknown'));
+            // Clear existing options (except "All") to avoid duplicates when switching projects
             const opSelect = document.getElementById('op-filter');
+            const modelSelect = document.getElementById('model-filter');
+
+            // Save current filter values
+            const savedOpFilter = currentOpFilter;
+            const savedModelFilter = currentModelFilter;
+
+            // Clear dropdowns but keep the "All" option
+            opSelect.innerHTML = '<option value="all">All Operations</option>';
+            opSelect.innerHTML += '<option value="supported">All Supported Ops</option>';
+            modelSelect.innerHTML = '<option value="all">All Models</option>';
+
+            // Populate operation filter with operations from current project only
+            const ops = new Set(allTraces.map(t => t.op_display_name || 'unknown'));
             const sortedOps = [...ops].sort();
             sortedOps.forEach(op => {
                 const option = document.createElement('option');
@@ -793,21 +1013,36 @@
                 opSelect.appendChild(option);
             });
 
-            //
-            if (sortedOps.includes('openai.chat.completions.create')) {
-                opSelect.value = 'openai.chat.completions.create';
-                currentOpFilter = 'openai.chat.completions.create';
-            }
-
-            // Populate model filter
+            // Populate model filter with models from current project only
             const models = new Set(allTraces.map(t => t.model));
-            const modelSelect = document.getElementById('model-filter');
             [...models].sort().forEach(model => {
                 const option = document.createElement('option');
                 option.value = model;
                 option.textContent = model;
                 modelSelect.appendChild(option);
             });
+
+            // Restore previous filter values if they still exist
+            // Special handling for 'all' and 'supported' which always exist
+            if (savedOpFilter === 'all' || savedOpFilter === 'supported') {
+                opSelect.value = savedOpFilter;
+                currentOpFilter = savedOpFilter;
+            } else if (sortedOps.includes(savedOpFilter)) {
+                opSelect.value = savedOpFilter;
+                currentOpFilter = savedOpFilter;
+            } else {
+                // Default to 'supported' when switching projects
+                opSelect.value = 'supported';
+                currentOpFilter = 'supported';
+            }
+
+            if ([...models].includes(savedModelFilter)) {
+                modelSelect.value = savedModelFilter;
+                currentModelFilter = savedModelFilter;
+            } else {
+                modelSelect.value = 'all';
+                currentModelFilter = 'all';
+            }
         }
 
         // Filter change handlers
@@ -826,7 +1061,13 @@
             let filteredTraces = allTraces;
 
             // Apply operation filter
-            if (currentOpFilter
+            if (currentOpFilter === 'supported') {
+                // Filter to only supported operations
+                filteredTraces = filteredTraces.filter(t => {
+                    const opDisplayName = t.op_display_name || '';
+                    return SUPPORTED_OPS.some(op => opDisplayName.includes(op));
+                });
+            } else if (currentOpFilter !== 'all') {
                 filteredTraces = filteredTraces.filter(t => t.op_display_name === currentOpFilter);
             }
 
@@ -860,9 +1101,12 @@
             ${trace.usage && (trace.usage.total_tokens || trace.usage.requests) ? `
             <div class="usage-info">
                 ${trace.usage.requests ? `<div class="usage-item"><span class="usage-label">Requests:</span> ${trace.usage.requests}</div>` : ''}
-                ${trace.usage.prompt_tokens ? `<div class="usage-item"><span class="usage-label">
-                ${trace.usage.completion_tokens ? `<div class="usage-item"><span class="usage-label">
+                ${trace.usage.prompt_tokens || trace.usage.input_tokens ? `<div class="usage-item"><span class="usage-label">Input:</span> ${trace.usage.prompt_tokens || trace.usage.input_tokens}</div>` : ''}
+                ${trace.usage.completion_tokens || trace.usage.output_tokens ? `<div class="usage-item"><span class="usage-label">Output:</span> ${trace.usage.completion_tokens || trace.usage.output_tokens}</div>` : ''}
                 ${trace.usage.total_tokens ? `<div class="usage-item"><span class="usage-label">Total:</span> ${trace.usage.total_tokens}</div>` : ''}
+                ${trace.usage.reasoning_tokens ? `<div class="usage-item" style="color: #ff9d00;"><span class="usage-label">Reasoning:</span> ${trace.usage.reasoning_tokens}</div>` : ''}
+                ${trace.usage.thoughts_tokens ? `<div class="usage-item" style="color: #9d66ff;"><span class="usage-label">Thinking:</span> ${trace.usage.thoughts_tokens}</div>` : ''}
+                ${trace.usage.cache_read_input_tokens ? `<div class="usage-item" style="color: #4a9eff;"><span class="usage-label">Cache Read:</span> ${trace.usage.cache_read_input_tokens}</div>` : ''}
             </div>
             ` : ''}
 
@@ -972,17 +1216,15 @@
                 return;
             }
 
-            // Filter to only
+            // Filter to only supported provider traces (exclude wrapper function traces)
             const completionTraces = selectedData.filter(t => {
-                const opName = t.op_name || '';
                 const opDisplayName = t.op_display_name || '';
-                //
-                return
-                    opName.includes('openai.chat.completions.create');
+                // Check if it's one of our supported operations
+                return SUPPORTED_OPS.some(op => opDisplayName.includes(op));
             });
 
             if (completionTraces.length === 0) {
-                alert('No
+                alert('No supported provider traces selected! Supported: OpenAI, Anthropic, Gemini');
                 return;
             }
 
@@ -1036,9 +1278,18 @@
         // Get filtered traces
         function getFilteredTraces() {
             let filtered = allTraces;
-
+
+            // Apply operation filter
+            if (currentOpFilter === 'supported') {
+                filtered = filtered.filter(t => {
+                    const opDisplayName = t.op_display_name || '';
+                    return SUPPORTED_OPS.some(op => opDisplayName.includes(op));
+                });
+            } else if (currentOpFilter !== 'all') {
                 filtered = filtered.filter(t => t.op_display_name === currentOpFilter);
             }
+
+            // Apply model filter
             if (currentModelFilter !== 'all') {
                 filtered = filtered.filter(t => t.model === currentModelFilter);
             }
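The usage display now falls back across provider-specific key names (OpenAI's prompt_tokens/completion_tokens versus Anthropic's input_tokens/output_tokens) and surfaces reasoning, thinking, and cache counters. The normalization, sketched in Python:

# Provider-agnostic token accounting, mirroring the fallbacks the viewer applies.
def normalize_usage(usage):
    return {
        'input': usage.get('prompt_tokens') or usage.get('input_tokens'),
        'output': usage.get('completion_tokens') or usage.get('output_tokens'),
        'total': usage.get('total_tokens'),
        'reasoning': (usage.get('output_tokens_details') or {}).get('reasoning_tokens'),
        'thinking': usage.get('thoughts_tokens'),
        'cache_read': usage.get('cache_read_input_tokens'),
    }

print(normalize_usage({'input_tokens': 12, 'output_tokens': 34,
                       'cache_read_input_tokens': 5}))  # Anthropic-style keys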
@@ -1279,38 +1530,48 @@
             const response = await fetch('/list_judges');
             const data = await response.json();
             const judges = data.judges || [];
-            const
+            const judgeList = document.getElementById('eval-judge-list');
 
             if (judges.length === 0) {
-
+                judgeList.innerHTML = '<div style="color: #888;">No judges defined - <a href="/judge" target="_blank" style="color: #4a9eff;">create one first</a></div>';
             } else {
-
+                judgeList.innerHTML = judges.map((j, i) => `
+                    <label style="display: flex; align-items: center; padding: 8px; margin-bottom: 8px; background: #1a1a1a; border-radius: 4px; cursor: pointer; transition: background 0.2s;">
+                        <input type="checkbox" class="eval-judge-checkbox" data-judge-index="${i}" style="margin-right: 10px; width: 18px; height: 18px; cursor: pointer;">
+                        <div style="flex: 1;">
+                            <div style="color: #fff; font-size: 14px; font-weight: 500;">${j.name}</div>
+                            <div style="color: #888; font-size: 12px;">${j.type}</div>
+                        </div>
+                    </label>
+                `).join('');
             }
         } catch (e) {
             console.error('Error loading judges:', e);
-            document.getElementById('eval-judge').innerHTML = '<
+            document.getElementById('eval-judge-list').innerHTML = '<div style="color: #f88;">Error loading judges</div>';
         }
     }
 
         // Run evaluation
         document.getElementById('run-eval-btn').addEventListener('click', async () => {
-
+            // Get selected judges
+            const selectedJudgeCheckboxes = document.querySelectorAll('.eval-judge-checkbox:checked');
+            const selectedJudgeIndices = Array.from(selectedJudgeCheckboxes).map(cb => parseInt(cb.dataset.judgeIndex));
 
             if (selectedEvalModels.size === 0) {
                 alert('Please select at least one weak model');
                 return;
             }
 
-            if (
-                alert('Please select
+            if (selectedJudgeIndices.length === 0) {
+                alert('Please select at least one judge');
                 return;
             }
 
             // Load judges from server
             const judgesResponse = await fetch('/list_judges');
             const judgesData = await judgesResponse.json();
-            const
-            const
+            const allJudges = judgesData.judges || [];
+            const selectedJudges = selectedJudgeIndices.map(idx => allJudges[idx]);
 
             // Show progress
             document.getElementById('eval-progress').style.display = 'block';
@@ -1320,17 +1581,17 @@
             const resultsDiv = document.getElementById('eval-results-links');
 
             progressText.textContent = `Starting evaluations...\n`;
-            progressText.textContent += `
+            progressText.textContent += `Judges: ${selectedJudges.map(j => j.name).join(', ')}\n`;
             progressText.textContent += `Models: ${selectedEvalModels.size}\n\n`;
 
             const modelFiles = Array.from(selectedEvalModels);
             const results = [];
 
-            // Run
-            for (let
-                const modelFile = modelFiles[
+            // Run one evaluation per model with ALL judges combined
+            for (let modelIdx = 0; modelIdx < modelFiles.length; modelIdx++) {
+                const modelFile = modelFiles[modelIdx];
 
-                progressText.textContent += `[${
+                progressText.textContent += `[${modelIdx + 1}/${modelFiles.length}] Evaluating ${modelFile} with ${selectedJudges.length} judge(s)...\n`;
 
                 let pollInterval = null;
                 let taskId = null;
@@ -1341,9 +1602,8 @@
                     const resp = await fetch(`/progress/${taskId}`);
                     if (resp.ok) {
                         const progress = await resp.json();
-                        const percent = (
+                        const percent = ((modelIdx + 1) / modelFiles.length) * 100;
                         progressFill.style.width = `${percent}%`;
-                        progressText.textContent = `[${i+1}/${modelFiles.length}] ${progress.message}\nProgress: ${progress.current}/${progress.total} (${percent.toFixed(1)}%)\n`;
                     }
                 } catch (e) {
                     console.error('Error polling eval progress:', e);
@@ -1352,17 +1612,18 @@
 
                 try {
                     // Generate task ID for this evaluation
-                    taskId = `eval_${Date.now()}_${
+                    taskId = `eval_${Date.now()}_${modelIdx}`;
 
                     // Start polling
                     pollInterval = setInterval(pollProgress, 300);
 
+                    // Send all judges in one request
                     const response = await fetch('/run_evaluation', {
                         method: 'POST',
                         headers: { 'Content-Type': 'application/json' },
                         body: JSON.stringify({
                             model_file: modelFile,
-
+                            judges: selectedJudges, // Send all judges
                             task_id: taskId
                         })
                     });
@@ -1377,6 +1638,7 @@
                     if (pollInterval) clearInterval(pollInterval);
 
                     progressText.textContent += `  ✓ Complete: ${result.evaluation_name}\n`;
+                    progressText.textContent += `  Judges used: ${result.judges.join(', ')}\n`;
                     progressText.textContent += `  Examples: ${result.examples_evaluated}\n\n`;
 
                     results.push({
@@ -1780,14 +2042,17 @@
             try {
                 const response = await fetch('/list_judges');
                 const data = await response.json();
-                const
+                const judgeList = document.getElementById('e2e-judge-list');
 
                 if (data.judges && data.judges.length > 0) {
-
-
-
+                    judgeList.innerHTML = data.judges.map((judge, idx) => `
+                        <label style="display: block; padding: 5px 0; color: #ccc; cursor: pointer;">
+                            <input type="checkbox" class="e2e-judge-checkbox" value="${idx}" style="margin-right: 8px;">
+                            ${judge.name} (${judge.type})
+                        </label>
+                    `).join('');
                 } else {
-
+                    judgeList.innerHTML = '<p style="color: #888;">No judges available - create one first</p>';
                 }
             } catch (error) {
                 console.error('Error loading judges:', error);
@@ -1823,9 +2088,10 @@
                 return;
             }
 
-
-
-
+            // Get selected judges
+            const selectedJudgeIndices = Array.from(document.querySelectorAll('.e2e-judge-checkbox:checked')).map(cb => parseInt(cb.value));
+            if (selectedJudgeIndices.length === 0) {
+                alert('Please select at least one judge!');
                 return;
             }
 
@@ -1834,7 +2100,7 @@
             // Load judge data
             const judgesResponse = await fetch('/list_judges');
             const judgesData = await judgesResponse.json();
-            const
+            const judges = selectedJudgeIndices.map(idx => judgesData.judges[idx]);
 
             // Hide config panel, show progress panel
             document.getElementById('e2e-panel').style.display = 'none';
@@ -1920,7 +2186,8 @@
 
             // === STEP 3: Run Evaluations ===
             stepLabel.textContent = 'Step 3/3: Running evaluations...';
-
+            const judgeNames = judges.map(j => j.name).join(', ');
+            progressText.textContent += `📊 Running evaluations with ${judges.length} judge(s): ${judgeNames}...\n`;
 
             const evaluationResults = [];
 
@@ -1963,7 +2230,7 @@
                     headers: { 'Content-Type': 'application/json' },
                     body: JSON.stringify({
                         model_file: modelFile,
-
+                        judges: judges,
                        task_id: evalTaskId
                    })
                });
quickdistill-0.1.9.dist-info/RECORD ADDED

@@ -0,0 +1,17 @@
+quickdistill/__init__.py,sha256=dOl_wXruBGyDGhe1Iu4-SQLu_6-_b6rt1lkxfOp3Jqo,823
+quickdistill/cli.py,sha256=A8d5GN9NdBS299WyAsJ6-p8ynW3DJnDRHZ-UGH7TXLM,2212
+quickdistill/default_judges.json,sha256=9uDqsYc9CsJwZAWwOkWcqgmlGZNJ0zzyXpv4wZ8vtuE,1446
+quickdistill/get_traces.py,sha256=mfy9fMiK-CZQN1noZ4DfOwdwP45ntthVDLgh4-u2iNk,4896
+quickdistill/server.py,sha256=0yBQ5vt1oD7OkhH7ap2cR8j-wuVG3fU7jARijmD1eOs,42849
+quickdistill/__pycache__/__init__.cpython-310.pyc,sha256=RqzjvxzPxHFJZkBjX6DSH9vbVTtskVgJ4pTQ6EX2A6o,794
+quickdistill/__pycache__/cli.cpython-310.pyc,sha256=xtVgJTayQLKS4gE_te7U1Wo8LmkDtPkaa2rnzu8h9fY,2443
+quickdistill/__pycache__/get_traces.cpython-310.pyc,sha256=T7Suxp9vpqYDQJ_3uJvXWemqoLf5tnRC2I0BfHrSiNM,2956
+quickdistill/__pycache__/server.cpython-310.pyc,sha256=8W74-E_S0dJRRwRG7nF9UL64kdbyDoNswAi5y51Xc3I,25593
+quickdistill/default_projects/byyoung3_arena-detailed/traces_data.json,sha256=iz-cBmXBYj0bC3Vn754QTnGuDh6sRvlE_RzSyGXaxbY,15496950
+quickdistill/static/judge_manager.html,sha256=t6dSPwo_d-GIu1FscuK1KDgxKCnmiOekQTMu80lZPPY,27166
+quickdistill/static/trace_viewer.html,sha256=lAMO6Mj-MWQqXGC4bo2v8ybM4ci082h2HaDQ1AOl2jM,109884
+quickdistill-0.1.9.dist-info/METADATA,sha256=-VH48FybeQbxuxUOlSn0zHJfCOkxfklCrxCHbdRYFRQ,5084
+quickdistill-0.1.9.dist-info/WHEEL,sha256=wUyA8OaulRlbfwMtmQsvNngGrxQHAvkKcvRmdizlJi0,92
+quickdistill-0.1.9.dist-info/entry_points.txt,sha256=AUUTxnwdD9gRnsOEcTXQTAZIZ_F0aRU7JGstIJ3Xk_o,55
+quickdistill-0.1.9.dist-info/top_level.txt,sha256=ysiMvurJYsE1IhkxmObe-0G8A-GIav40kTh2z6axjxg,13
+quickdistill-0.1.9.dist-info/RECORD,,
quickdistill-0.1.7.dist-info/RECORD DELETED

@@ -1,17 +0,0 @@
-quickdistill/__init__.py,sha256=U8mvMbfYKLFegcEA4D-P6AFHvSiHQPXoFn0KKd-xh0A,397
-quickdistill/cli.py,sha256=A8d5GN9NdBS299WyAsJ6-p8ynW3DJnDRHZ-UGH7TXLM,2212
-quickdistill/default_judges.json,sha256=w0TkIniELPPG-Mi3hm7zPW06eq46W1BI_ufWXnkDDDM,1432
-quickdistill/get_traces.py,sha256=mfy9fMiK-CZQN1noZ4DfOwdwP45ntthVDLgh4-u2iNk,4896
-quickdistill/server.py,sha256=0Y0XG-8oYoNZgmo10LPZgtwlHuGqrq0urxE-KabyIvI,36789
-quickdistill/__pycache__/__init__.cpython-310.pyc,sha256=Tbov274p3OjaOuOsQwcW-meATEfkz0mHKmpytksuDJI,603
-quickdistill/__pycache__/cli.cpython-310.pyc,sha256=xtVgJTayQLKS4gE_te7U1Wo8LmkDtPkaa2rnzu8h9fY,2443
-quickdistill/__pycache__/get_traces.cpython-310.pyc,sha256=T7Suxp9vpqYDQJ_3uJvXWemqoLf5tnRC2I0BfHrSiNM,2956
-quickdistill/__pycache__/server.cpython-310.pyc,sha256=_taKWofMtdgfMZzfVsd7PoC4jnuKxEOGzW82YBxqPPc,22051
-quickdistill/default_projects/byyoung3_arena-detailed/traces_data.json,sha256=iz-cBmXBYj0bC3Vn754QTnGuDh6sRvlE_RzSyGXaxbY,15496950
-quickdistill/static/judge_manager.html,sha256=fXteyx_ry4gY166WypBkVGGCqieE88MigqLRLVCKnG8,26887
-quickdistill/static/trace_viewer.html,sha256=kPC4GnxeDPq7jxClRhZBOuS6xmA3RaY-loJDZmKDADE,94426
-quickdistill-0.1.7.dist-info/METADATA,sha256=1pE5fDep0l0kAxhHuT1C_H4CYHIiPLP4n9QraAqI9bM,5084
-quickdistill-0.1.7.dist-info/WHEEL,sha256=wUyA8OaulRlbfwMtmQsvNngGrxQHAvkKcvRmdizlJi0,92
-quickdistill-0.1.7.dist-info/entry_points.txt,sha256=AUUTxnwdD9gRnsOEcTXQTAZIZ_F0aRU7JGstIJ3Xk_o,55
-quickdistill-0.1.7.dist-info/top_level.txt,sha256=ysiMvurJYsE1IhkxmObe-0G8A-GIav40kTh2z6axjxg,13
-quickdistill-0.1.7.dist-info/RECORD,,
{quickdistill-0.1.7.dist-info → quickdistill-0.1.9.dist-info}/WHEEL
File without changes

{quickdistill-0.1.7.dist-info → quickdistill-0.1.9.dist-info}/entry_points.txt
File without changes

{quickdistill-0.1.7.dist-info → quickdistill-0.1.9.dist-info}/top_level.txt
File without changes