quickdistill 0.1.8__py3-none-any.whl → 0.1.9__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- quickdistill/__init__.py +12 -1
- quickdistill/__pycache__/__init__.cpython-310.pyc +0 -0
- quickdistill/__pycache__/server.cpython-310.pyc +0 -0
- quickdistill/server.py +170 -29
- quickdistill/static/trace_viewer.html +375 -116
- {quickdistill-0.1.8.dist-info → quickdistill-0.1.9.dist-info}/METADATA +1 -1
- quickdistill-0.1.9.dist-info/RECORD +17 -0
- quickdistill-0.1.8.dist-info/RECORD +0 -17
- {quickdistill-0.1.8.dist-info → quickdistill-0.1.9.dist-info}/WHEEL +0 -0
- {quickdistill-0.1.8.dist-info → quickdistill-0.1.9.dist-info}/entry_points.txt +0 -0
- {quickdistill-0.1.8.dist-info → quickdistill-0.1.9.dist-info}/top_level.txt +0 -0
quickdistill/__init__.py
CHANGED
|
@@ -8,7 +8,18 @@ This package provides tools to:
|
|
|
8
8
|
- Export datasets for model evaluation
|
|
9
9
|
"""
|
|
10
10
|
|
|
11
|
-
|
|
11
|
+
# Compatibility shim for litellm: it references aiohttp.ConnectionTimeoutError
# and aiohttp.SocketTimeoutError, which some aiohttp versions do not define.
# Any name that is missing is aliased to aiohttp.ServerTimeoutError.
try:
    import aiohttp

    for _alias_name in ('ConnectionTimeoutError', 'SocketTimeoutError'):
        if not hasattr(aiohttp, _alias_name):
            setattr(aiohttp, _alias_name, aiohttp.ServerTimeoutError)
    # Do not leave the loop variable behind in the package namespace.
    del _alias_name
except Exception:
    # Best effort only: a missing or unusual aiohttp must never break import
    # of this package.
    pass
|
|
21
|
+
|
|
22
|
+
__version__ = "0.1.9"
|
|
12
23
|
__author__ = "Brett Young"
|
|
13
24
|
__email__ = "bdytx5@umsystem.edu"
|
|
14
25
|
|
|
Binary file
|
|
Binary file
|
quickdistill/server.py
CHANGED
|
@@ -100,40 +100,133 @@ def run_inference(client, model, messages, max_tokens=1000):
|
|
|
100
100
|
return f"ERROR: {str(e)}"
|
|
101
101
|
|
|
102
102
|
def extract_output_content(output_str):
    """Extract actual content from WeaveObject string, JSON response, or regular output.

    Handles outputs from:
    - OpenAI chat.completions.create (plain text)
    - OpenAI responses.create (JSON with nested structure)
    - Anthropic Messages (WeaveObject with content[0].text)
    - Google Gemini (WeaveObject with candidates[0].content.parts[0].text)

    Args:
        output_str: The raw output value recorded for a trace.

    Returns:
        - ``None`` when ``output_str`` is falsy (``None``, ``''``, ``0``, ...).
        - ``str(output_str)`` when it is not a string.
        - A placeholder when it is the literal string ``'None'`` or ``'null'``.
        - The ``output_text`` parts joined by blank lines for a
          responses.create JSON payload that contains any.
        - The unescaped first ``'text'`` field of a WeaveObject repr, or a
          truncated placeholder when no such field is found.
        - ``output_str`` unchanged otherwise.
    """
    import re
    import json

    if not output_str:
        return None

    if not isinstance(output_str, str):
        return str(output_str)

    # Handle streaming responses whose output was recorded as a null literal.
    # (The empty string never reaches this point: it is falsy and returned
    # None above.)
    if output_str in ('None', 'null'):
        return '[Streaming output - not captured]'

    # Handle OpenAI responses.create JSON format
    if output_str.startswith('{') and '"output"' in output_str:
        try:
            resp_obj = json.loads(output_str)
        except json.JSONDecodeError:
            resp_obj = None  # Not valid JSON: fall through to other handlers

        output_items = resp_obj.get('output') if isinstance(resp_obj, dict) else None
        if isinstance(output_items, list):
            # Collect text from output messages, skipping any entry that does
            # not have the expected shape instead of raising on it.
            text_parts = []
            for item in output_items:
                if not isinstance(item, dict) or item.get('type') != 'message':
                    continue
                contents = item.get('content')
                if not isinstance(contents, list):
                    continue
                for content in contents:
                    if (
                        isinstance(content, dict)
                        and content.get('type') == 'output_text'
                        and isinstance(content.get('text'), str)
                    ):
                        text_parts.append(content['text'])
            if text_parts:
                return '\n\n'.join(text_parts)

    # Handle WeaveObject strings (Anthropic, Gemini)
    if 'WeaveObject' in output_str:
        # Capture a single-quoted literal, treating backslash + any char as
        # one unit so escaped quotes do not end the match early.
        match = re.search(r"'text':\s*'((?:[^'\\]|\\.)*)'", output_str, re.DOTALL)
        if match:
            # Decode every escape in ONE left-to-right pass. Chained
            # str.replace calls cannot do this correctly: an escaped backslash
            # followed by "n" would be misread as a newline escape.
            escapes = {
                "'": "'",
                '"': '"',
                'n': '\n',
                't': '\t',
                'r': '\r',
                '\\': '\\',
            }
            return re.sub(
                r'\\(.)',
                # Unknown escapes are kept verbatim (backslash included).
                lambda m: escapes.get(m.group(1), m.group(0)),
                match.group(1),
                flags=re.DOTALL,
            )

        # If no text field found, return truncated version
        return f"[Complex WeaveObject - could not extract text]\n{output_str[:500]}..."

    # Plain text output (standard OpenAI chat format)
    return output_str
|
|
120
161
|
|
|
121
162
|
|
|
122
163
|
def extract_messages_from_trace(trace):
|
|
123
|
-
"""Extract messages from a trace in the format needed for inference
|
|
124
|
-
|
|
164
|
+
"""Extract messages from a trace in the format needed for inference.
|
|
165
|
+
|
|
166
|
+
Handles message extraction from:
|
|
167
|
+
- OpenAI chat.completions.create (messages at top level or in inputs.messages)
|
|
168
|
+
- OpenAI responses.create (inputs.input field)
|
|
169
|
+
- Anthropic Messages (inputs.messages)
|
|
170
|
+
- Google Gemini generate_content (inputs.contents array)
|
|
171
|
+
- Google Gemini Chat.send_message (inputs.message string)
|
|
172
|
+
"""
|
|
173
|
+
import re
|
|
174
|
+
|
|
175
|
+
# Get op_display_name for provider detection
|
|
176
|
+
op_name = trace.get('op_display_name', '')
|
|
177
|
+
|
|
178
|
+
# Check if messages are at top level (already extracted/cached)
|
|
125
179
|
if trace.get('messages') and isinstance(trace['messages'], list) and len(trace['messages']) > 0:
|
|
126
180
|
return trace['messages']
|
|
127
181
|
|
|
128
182
|
# Check if messages are in inputs
|
|
129
183
|
if trace.get('inputs') and isinstance(trace['inputs'], dict):
|
|
130
|
-
|
|
184
|
+
inputs = trace['inputs']
|
|
185
|
+
|
|
186
|
+
# Standard OpenAI/Anthropic: inputs.messages
|
|
187
|
+
messages = inputs.get('messages', [])
|
|
131
188
|
if isinstance(messages, list) and len(messages) > 0:
|
|
132
189
|
return messages
|
|
133
190
|
|
|
191
|
+
# OpenAI responses.create: inputs.input (simple string)
|
|
192
|
+
if 'openai.responses' in op_name and 'input' in inputs:
|
|
193
|
+
return [{"role": "user", "content": inputs['input']}]
|
|
194
|
+
|
|
195
|
+
# Gemini Chat.send_message: inputs.message (simple string)
|
|
196
|
+
if 'Chat.send_message' in op_name and 'message' in inputs:
|
|
197
|
+
return [{"role": "user", "content": inputs['message']}]
|
|
198
|
+
|
|
199
|
+
# Gemini generate_content: inputs.contents (array of content objects or WeaveObject strings)
|
|
200
|
+
if 'google.genai' in op_name and 'contents' in inputs:
|
|
201
|
+
contents = inputs['contents']
|
|
202
|
+
if isinstance(contents, list) and len(contents) > 0:
|
|
203
|
+
messages = []
|
|
204
|
+
for content in contents:
|
|
205
|
+
# Handle WeaveObject string format
|
|
206
|
+
if isinstance(content, str) and 'WeaveObject' in content:
|
|
207
|
+
role_match = re.search(r"'role':\s*'(\w+)'", content)
|
|
208
|
+
text_match = re.search(r"'text':\s*'((?:[^'\\]|\\.)*)'", content, re.DOTALL)
|
|
209
|
+
text = '[Complex content]'
|
|
210
|
+
if text_match:
|
|
211
|
+
text = text_match.group(1)
|
|
212
|
+
text = text.replace("\\'", "'").replace('\\n', '\n').replace('\\\\', '\\')
|
|
213
|
+
messages.append({
|
|
214
|
+
"role": role_match.group(1) if role_match else "user",
|
|
215
|
+
"content": text
|
|
216
|
+
})
|
|
217
|
+
# Handle regular dict format
|
|
218
|
+
elif isinstance(content, dict):
|
|
219
|
+
role = content.get('role', 'user')
|
|
220
|
+
parts = content.get('parts', [])
|
|
221
|
+
if isinstance(parts, list):
|
|
222
|
+
text = '\n'.join([p.get('text', '') for p in parts if isinstance(p, dict)])
|
|
223
|
+
messages.append({"role": role, "content": text})
|
|
224
|
+
if messages:
|
|
225
|
+
return messages
|
|
226
|
+
|
|
134
227
|
# Check if inputs has question/context format (from generate_test_traces.py wrapper traces)
|
|
135
|
-
question =
|
|
136
|
-
context =
|
|
228
|
+
question = inputs.get('question')
|
|
229
|
+
context = inputs.get('context')
|
|
137
230
|
if question:
|
|
138
231
|
if context:
|
|
139
232
|
prompt = f"""Based on the following context, answer the question concisely.
|
|
@@ -753,16 +846,26 @@ def delete_judge():
|
|
|
753
846
|
|
|
754
847
|
@app.route('/run_evaluation', methods=['POST'])
|
|
755
848
|
def run_evaluation_endpoint():
|
|
756
|
-
"""Run evaluation using specified judge"""
|
|
757
|
-
|
|
849
|
+
"""Run evaluation using specified judge(s) - supports multiple judges"""
|
|
850
|
+
|
|
758
851
|
|
|
759
852
|
data = request.json
|
|
760
853
|
model_file = data.get('model_file')
|
|
761
|
-
|
|
854
|
+
judges = data.get('judges') # Can be a list or single judge dict
|
|
762
855
|
task_id = data.get('task_id', f"eval_{id(data)}")
|
|
763
856
|
|
|
764
|
-
|
|
765
|
-
|
|
857
|
+
# Handle both single judge (backwards compat) and multiple judges
|
|
858
|
+
if data.get('judge'):
|
|
859
|
+
judges = [data.get('judge')]
|
|
860
|
+
elif not judges:
|
|
861
|
+
return jsonify({'error': 'Missing judge or judges'}), 400
|
|
862
|
+
|
|
863
|
+
# Ensure judges is a list
|
|
864
|
+
if not isinstance(judges, list):
|
|
865
|
+
judges = [judges]
|
|
866
|
+
|
|
867
|
+
if not model_file:
|
|
868
|
+
return jsonify({'error': 'Missing model_file'}), 400
|
|
766
869
|
|
|
767
870
|
# Load weak model results
|
|
768
871
|
model_path = DATA_DIR / model_file
|
|
@@ -782,18 +885,22 @@ def run_evaluation_endpoint():
|
|
|
782
885
|
# Extract model name from filename
|
|
783
886
|
model_name = model_file.replace('weak_model_', '').replace('.json', '')
|
|
784
887
|
|
|
888
|
+
# Create evaluation name with all judges
|
|
889
|
+
judges_names = '_'.join([j['name'] for j in judges])
|
|
890
|
+
eval_name = f"eval-{model_name}-{judges_names}"
|
|
891
|
+
|
|
785
892
|
# Initialize progress tracking
|
|
786
893
|
total_steps = len(results)
|
|
787
894
|
progress_state[task_id] = {
|
|
788
895
|
'current': 0,
|
|
789
896
|
'total': total_steps,
|
|
790
|
-
'message': f'Starting evaluation: {model_name} with {judge
|
|
897
|
+
'message': f'Starting evaluation: {model_name} with {len(judges)} judge(s)...',
|
|
791
898
|
'status': 'running'
|
|
792
899
|
}
|
|
793
900
|
|
|
794
901
|
# Create evaluation logger
|
|
795
902
|
ev = weave.EvaluationLogger(
|
|
796
|
-
name=
|
|
903
|
+
name=eval_name,
|
|
797
904
|
model=model_name
|
|
798
905
|
)
|
|
799
906
|
|
|
@@ -818,13 +925,20 @@ def run_evaluation_endpoint():
|
|
|
818
925
|
if messages and len(messages) > 0:
|
|
819
926
|
question = messages[0].get('content', '')
|
|
820
927
|
|
|
821
|
-
# Run
|
|
822
|
-
|
|
823
|
-
|
|
824
|
-
|
|
825
|
-
|
|
928
|
+
# Run all judges and collect scores
|
|
929
|
+
all_scores = {}
|
|
930
|
+
for judge in judges:
|
|
931
|
+
# Run judge
|
|
932
|
+
if judge['type'] == 'llm':
|
|
933
|
+
scores = run_llm_judge_eval(judge, strong_output, weak_output, question)
|
|
934
|
+
else:
|
|
935
|
+
scores = run_custom_judge_eval(judge, strong_output, weak_output)
|
|
936
|
+
|
|
937
|
+
# Merge scores with judge name prefix to avoid conflicts
|
|
938
|
+
for score_key, score_value in scores.items():
|
|
939
|
+
all_scores[f"{judge['name']}_{score_key}"] = score_value
|
|
826
940
|
|
|
827
|
-
# Log to weave
|
|
941
|
+
# Log to weave with all scores from all judges
|
|
828
942
|
ev.log_example(
|
|
829
943
|
inputs={
|
|
830
944
|
"question": question,
|
|
@@ -834,7 +948,7 @@ def run_evaluation_endpoint():
|
|
|
834
948
|
"weak_output": weak_output
|
|
835
949
|
|
|
836
950
|
},
|
|
837
|
-
scores=
|
|
951
|
+
scores=all_scores
|
|
838
952
|
)
|
|
839
953
|
|
|
840
954
|
# Finish evaluation
|
|
@@ -850,10 +964,11 @@ def run_evaluation_endpoint():
|
|
|
850
964
|
|
|
851
965
|
return jsonify({
|
|
852
966
|
'status': 'success',
|
|
853
|
-
'evaluation_name':
|
|
967
|
+
'evaluation_name': eval_name,
|
|
854
968
|
'examples_evaluated': len(results),
|
|
855
969
|
'weave_url': ev.ui_url,
|
|
856
970
|
'strong_export': strong_export,
|
|
971
|
+
'judges': [j['name'] for j in judges],
|
|
857
972
|
'task_id': task_id
|
|
858
973
|
})
|
|
859
974
|
|
|
@@ -1032,6 +1147,32 @@ def list_projects():
|
|
|
1032
1147
|
return jsonify({'projects': projects})
|
|
1033
1148
|
|
|
1034
1149
|
|
|
1150
|
+
@app.route('/get_preferences', methods=['GET'])
def get_preferences():
    """Get saved user preferences.

    Reads ``preferences.json`` from ``DATA_DIR`` and returns its contents as
    a JSON object.

    Returns:
        A JSON response holding the saved preferences, or an empty object
        when the file is missing, unreadable, not valid JSON, or does not
        contain a JSON object.
    """
    prefs_file = DATA_DIR / 'preferences.json'
    try:
        with open(prefs_file, 'r', encoding='utf-8') as f:
            prefs = json.load(f)
    except (OSError, ValueError):
        # OSError covers a missing/unreadable file; ValueError covers invalid
        # JSON and undecodable bytes. Preferences are optional, so fall back
        # to defaults rather than failing the request.
        return jsonify({})

    # Always hand back an object so callers can read keys from the result
    # without first checking its type.
    if not isinstance(prefs, dict):
        return jsonify({})
    return jsonify(prefs)
|
|
1161
|
+
|
|
1162
|
+
|
|
1163
|
+
@app.route('/save_preferences', methods=['POST'])
def save_preferences():
    """Save user preferences.

    Expects the request body to be a JSON object and writes it to
    ``preferences.json`` in ``DATA_DIR``, replacing any previous contents.

    Returns:
        - ``{'status': 'success'}`` when the file was written.
        - HTTP 400 with an error payload when the body is missing, is not
          valid JSON, or is not a JSON object.
        - HTTP 500 with an error payload when the file cannot be written.
    """
    # silent=True yields None instead of raising on a missing or malformed
    # body, so a client mistake is reported as 400 rather than as a 500.
    data = request.get_json(silent=True)
    if not isinstance(data, dict):
        return jsonify({
            'status': 'error',
            'message': 'Preferences must be a JSON object'
        }), 400

    prefs_file = DATA_DIR / 'preferences.json'
    try:
        with open(prefs_file, 'w', encoding='utf-8') as f:
            json.dump(data, f, indent=2)
    except OSError as e:
        return jsonify({'status': 'error', 'message': str(e)}), 500
    return jsonify({'status': 'success'})
|
|
1174
|
+
|
|
1175
|
+
|
|
1035
1176
|
# Routes for serving HTML pages
|
|
1036
1177
|
@app.route('/')
|
|
1037
1178
|
def index():
|
|
@@ -43,10 +43,21 @@
|
|
|
43
43
|
padding: 20px;
|
|
44
44
|
border-radius: 8px;
|
|
45
45
|
margin-bottom: 20px;
|
|
46
|
-
|
|
46
|
+
}
|
|
47
|
+
|
|
48
|
+
.filter-row {
|
|
49
|
+
display: grid;
|
|
50
|
+
grid-template-columns: auto 1fr auto 1fr auto auto;
|
|
47
51
|
gap: 15px;
|
|
48
|
-
align-items:
|
|
49
|
-
|
|
52
|
+
align-items: start;
|
|
53
|
+
margin-bottom: 20px;
|
|
54
|
+
}
|
|
55
|
+
|
|
56
|
+
.filter-group {
|
|
57
|
+
display: flex;
|
|
58
|
+
flex-direction: column;
|
|
59
|
+
gap: 8px;
|
|
60
|
+
min-width: 250px;
|
|
50
61
|
}
|
|
51
62
|
|
|
52
63
|
.controls label {
|
|
@@ -283,66 +294,75 @@
|
|
|
283
294
|
</div>
|
|
284
295
|
|
|
285
296
|
<div class="controls">
|
|
286
|
-
|
|
287
|
-
|
|
288
|
-
<
|
|
289
|
-
|
|
290
|
-
|
|
291
|
-
|
|
292
|
-
|
|
293
|
-
|
|
294
|
-
|
|
295
|
-
<select id="model-filter">
|
|
296
|
-
<option value="all">All Models</option>
|
|
297
|
-
</select>
|
|
298
|
-
|
|
299
|
-
<button id="select-all-btn" style="margin-left: 20px; padding: 8px 16px; background: #2a7c4a; color: white; border: none; border-radius: 4px; cursor: pointer;">
|
|
300
|
-
Select All Filtered
|
|
301
|
-
</button>
|
|
297
|
+
<!-- Filters Row -->
|
|
298
|
+
<div class="filter-row">
|
|
299
|
+
<div class="filter-group">
|
|
300
|
+
<label for="op-filter">Operation Filter:</label>
|
|
301
|
+
<select id="op-filter" style="background: #2a2a2a; color: #fff; border: 1px solid #3a3a3a; padding: 8px 12px; border-radius: 4px; font-size: 14px; cursor: pointer;">
|
|
302
|
+
<option value="all">All Operations</option>
|
|
303
|
+
</select>
|
|
304
|
+
<span style="color: #4a9eff; font-size: 11px; font-weight: 500;">✅ Fully supported: OpenAI (chat.completions, responses), Anthropic (Messages), Google Gemini (generate_content, Chat)</span>
|
|
305
|
+
</div>
|
|
302
306
|
|
|
303
|
-
|
|
304
|
-
|
|
305
|
-
|
|
306
|
-
|
|
307
|
-
|
|
308
|
-
|
|
309
|
-
</button>
|
|
307
|
+
<div class="filter-group">
|
|
308
|
+
<label for="model-filter">Model Filter:</label>
|
|
309
|
+
<select id="model-filter" style="background: #2a2a2a; color: #fff; border: 1px solid #3a3a3a; padding: 8px 12px; border-radius: 4px; font-size: 14px; cursor: pointer;">
|
|
310
|
+
<option value="all">All Models</option>
|
|
311
|
+
</select>
|
|
312
|
+
</div>
|
|
310
313
|
|
|
311
|
-
|
|
312
|
-
|
|
313
|
-
|
|
314
|
+
<button id="select-all-btn" style="padding: 8px 16px; background: #2a7c4a; color: white; border: none; border-radius: 4px; cursor: pointer; align-self: end; white-space: nowrap;">
|
|
315
|
+
Select All
|
|
316
|
+
</button>
|
|
314
317
|
|
|
315
|
-
|
|
316
|
-
|
|
317
|
-
</
|
|
318
|
+
<div style="display: flex; flex-direction: column; gap: 4px; align-self: end;">
|
|
319
|
+
<div style="color: #888; font-size: 13px;">Total: <span id="total-count" style="color: #fff; font-weight: 600;">0</span></div>
|
|
320
|
+
<div style="color: #888; font-size: 13px;">Shown: <span id="shown-count" style="color: #4a9eff; font-weight: 600;">0</span></div>
|
|
318
321
|
</div>
|
|
319
322
|
</div>
|
|
320
323
|
|
|
321
|
-
<!--
|
|
322
|
-
<
|
|
323
|
-
|
|
324
|
-
|
|
325
|
-
|
|
326
|
-
|
|
327
|
-
|
|
328
|
-
|
|
324
|
+
<!-- Action Buttons Row -->
|
|
325
|
+
<div style="display: grid; grid-template-columns: 2fr 1fr; gap: 20px;">
|
|
326
|
+
<!-- Main Workflow -->
|
|
327
|
+
<div style="padding: 15px; background: #0f1f0f; border-radius: 8px; border: 2px solid #2a4a2a;">
|
|
328
|
+
<div style="color: #6dd36d; font-size: 13px; font-weight: 600; margin-bottom: 12px;">📋 MANUAL WORKFLOW</div>
|
|
329
|
+
<div style="display: flex; flex-wrap: wrap; gap: 10px;">
|
|
330
|
+
<button id="export-btn" style="padding: 10px 16px; background: #4a9eff; color: white; border: none; border-radius: 4px; cursor: pointer; font-weight: 500;">
|
|
331
|
+
1. Export Test Set (<span id="selected-count">0</span>)
|
|
332
|
+
</button>
|
|
333
|
+
<button id="open-inference-btn" style="padding: 10px 16px; background: #7c4a9e; color: white; border: none; border-radius: 4px; cursor: pointer; font-weight: 500;">
|
|
334
|
+
2. Run Weak Models
|
|
335
|
+
</button>
|
|
336
|
+
<button id="open-eval-btn" style="padding: 10px 16px; background: #9e6a4a; color: white; border: none; border-radius: 4px; cursor: pointer; font-weight: 500;">
|
|
337
|
+
3. Evaluate Results
|
|
338
|
+
</button>
|
|
339
|
+
</div>
|
|
340
|
+
</div>
|
|
329
341
|
|
|
330
|
-
|
|
331
|
-
|
|
332
|
-
|
|
342
|
+
<!-- Utilities -->
|
|
343
|
+
<div style="padding: 15px; background: #1a1a2a; border-radius: 8px; border: 1px solid #2a2a3a;">
|
|
344
|
+
<div style="color: #aaa; font-size: 13px; font-weight: 600; margin-bottom: 12px;">⚙️ TOOLS</div>
|
|
345
|
+
<div style="display: flex; flex-wrap: wrap; gap: 8px;">
|
|
346
|
+
<a href="/judge" target="_blank" style="padding: 8px 14px; background: #4a5a9e; color: white; border: none; border-radius: 4px; text-decoration: none; display: inline-block; font-size: 13px;">
|
|
347
|
+
Judges
|
|
348
|
+
</a>
|
|
349
|
+
<button id="open-test-judge-btn" style="padding: 8px 14px; background: #6a4a7e; color: white; border: none; border-radius: 4px; cursor: pointer; font-size: 13px;">
|
|
350
|
+
Test Judge
|
|
351
|
+
</button>
|
|
352
|
+
<button id="open-settings-btn" style="padding: 8px 14px; background: #5a5a5a; color: white; border: none; border-radius: 4px; cursor: pointer; font-size: 13px;">
|
|
353
|
+
Settings
|
|
354
|
+
</button>
|
|
355
|
+
</div>
|
|
356
|
+
</div>
|
|
357
|
+
</div>
|
|
333
358
|
|
|
334
|
-
<!-- Automatic Workflow
|
|
335
|
-
<div style="margin: 20px
|
|
336
|
-
<div style="color: #
|
|
337
|
-
<button id="open-e2e-btn" style="padding:
|
|
359
|
+
<!-- Automatic Workflow -->
|
|
360
|
+
<div style="margin-top: 20px; padding: 15px; background: #2a1a2a; border-radius: 8px; border: 2px solid #7a4a9e;">
|
|
361
|
+
<div style="color: #bb88ff; font-size: 13px; font-weight: 600; margin-bottom: 10px;">⚡ AUTOMATIC WORKFLOW</div>
|
|
362
|
+
<button id="open-e2e-btn" style="padding: 12px 20px; background: #7a4a9e; color: white; border: none; border-radius: 4px; cursor: pointer; font-weight: 600; font-size: 14px;">
|
|
338
363
|
⚡ Run End-to-End Test
|
|
339
364
|
</button>
|
|
340
|
-
<div style="color: #
|
|
341
|
-
</div>
|
|
342
|
-
|
|
343
|
-
<div class="stats">
|
|
344
|
-
<div>Total: <span id="total-count">0</span></div>
|
|
345
|
-
<div>Shown: <span id="shown-count">0</span></div>
|
|
365
|
+
<div style="color: #888; font-size: 11px; margin-top: 8px;">Export → Generate → Evaluate (all in one)</div>
|
|
346
366
|
</div>
|
|
347
367
|
</div>
|
|
348
368
|
|
|
@@ -420,10 +440,10 @@
|
|
|
420
440
|
</div>
|
|
421
441
|
|
|
422
442
|
<div style="margin-bottom: 20px;">
|
|
423
|
-
<label style="color: #aaa; display: block; margin-bottom: 10px;">Select Judge:</label>
|
|
424
|
-
<
|
|
425
|
-
<!-- Judges populated dynamically -->
|
|
426
|
-
</
|
|
443
|
+
<label style="color: #aaa; display: block; margin-bottom: 10px;">Select Judge(s) - you can select multiple:</label>
|
|
444
|
+
<div id="eval-judge-list" style="max-height: 200px; overflow-y: auto; background: #0f0f0f; padding: 15px; border-radius: 4px;">
|
|
445
|
+
<!-- Judges populated dynamically as checkboxes -->
|
|
446
|
+
</div>
|
|
427
447
|
<div style="color: #666; font-size: 12px; margin-top: 5px;">
|
|
428
448
|
<a href="/judge" target="_blank" style="color: #4a9eff;">Create/manage judges</a>
|
|
429
449
|
</div>
|
|
@@ -594,10 +614,10 @@
|
|
|
594
614
|
|
|
595
615
|
<!-- Judge Selection -->
|
|
596
616
|
<div style="margin-bottom: 30px;">
|
|
597
|
-
<h3 style="color: #fff; font-size: 16px; margin-bottom: 15px;">2. Select
|
|
598
|
-
<
|
|
599
|
-
<
|
|
600
|
-
</
|
|
617
|
+
<h3 style="color: #fff; font-size: 16px; margin-bottom: 15px;">2. Select Judges</h3>
|
|
618
|
+
<div id="e2e-judge-list" style="max-height: 200px; overflow-y: auto; background: #2a2a2a; border: 1px solid #3a3a3a; border-radius: 4px; padding: 10px;">
|
|
619
|
+
<p style="color: #888;">Loading judges...</p>
|
|
620
|
+
</div>
|
|
601
621
|
</div>
|
|
602
622
|
|
|
603
623
|
<!-- Actions -->
|
|
@@ -656,6 +676,17 @@
|
|
|
656
676
|
"Qwen/Qwen3-Coder-480B-A35B-Instruct",
|
|
657
677
|
];
|
|
658
678
|
|
|
679
|
+
const SUPPORTED_OPS = [
|
|
680
|
+
'openai.chat.completions.create',
|
|
681
|
+
'openai.responses.create',
|
|
682
|
+
'anthropic.Messages.create',
|
|
683
|
+
'anthropic.Messages.stream',
|
|
684
|
+
'google.genai.models.Models.generate_content',
|
|
685
|
+
'google.genai.models.Models.generate_content_stream',
|
|
686
|
+
'google.genai.chats.Chat.send_message',
|
|
687
|
+
'google.genai.chats.Chat.send_message_stream'
|
|
688
|
+
];
|
|
689
|
+
|
|
659
690
|
let allTraces = [];
|
|
660
691
|
let currentOpFilter = 'all';
|
|
661
692
|
let currentModelFilter = 'all';
|
|
@@ -667,6 +698,10 @@
|
|
|
667
698
|
// Load projects list
|
|
668
699
|
async function loadProjects() {
|
|
669
700
|
try {
|
|
701
|
+
// Load saved preferences
|
|
702
|
+
const prefsResponse = await fetch('/get_preferences');
|
|
703
|
+
const prefs = await prefsResponse.json();
|
|
704
|
+
|
|
670
705
|
const response = await fetch('/list_projects');
|
|
671
706
|
const data = await response.json();
|
|
672
707
|
const select = document.getElementById('project-select');
|
|
@@ -678,11 +713,23 @@
|
|
|
678
713
|
`<option value="${p.name}">${p.name} (${p.trace_count} traces)</option>`
|
|
679
714
|
).join('');
|
|
680
715
|
|
|
681
|
-
//
|
|
682
|
-
|
|
683
|
-
|
|
684
|
-
|
|
685
|
-
|
|
716
|
+
// Use saved project or first project
|
|
717
|
+
let projectToLoad = prefs.lastProject || data.projects[0].name;
|
|
718
|
+
|
|
719
|
+
// Check if saved project still exists
|
|
720
|
+
const projectExists = data.projects.some(p => p.name === projectToLoad);
|
|
721
|
+
if (!projectExists) {
|
|
722
|
+
projectToLoad = data.projects[0].name;
|
|
723
|
+
}
|
|
724
|
+
|
|
725
|
+
currentProject = projectToLoad;
|
|
726
|
+
select.value = currentProject;
|
|
727
|
+
await loadTraces(currentProject);
|
|
728
|
+
|
|
729
|
+
// Set default filter to "All Supported Ops"
|
|
730
|
+
if (!prefs.lastOpFilter) {
|
|
731
|
+
currentOpFilter = 'supported';
|
|
732
|
+
document.getElementById('op-filter').value = 'supported';
|
|
686
733
|
}
|
|
687
734
|
}
|
|
688
735
|
} catch (e) {
|
|
@@ -690,6 +737,153 @@
|
|
|
690
737
|
}
|
|
691
738
|
}
|
|
692
739
|
|
|
740
|
+
// Patch traces to handle different provider formats
|
|
741
|
+
/**
 * Normalise raw traces so the viewer can treat every provider alike.
 *
 * For each trace a shallow copy is returned in which:
 *  - `output` is replaced by a placeholder for streaming ops whose output is
 *    empty / 'None' / 'null', by the extracted text for WeaveObject strings,
 *    or by the joined `output_text` parts for openai.responses.create JSON;
 *  - `messages` is filled in from `inputs` when it is missing or empty;
 *  - `usage` (copied, so the input trace is not modified) gains helper fields
 *    for thinking, reasoning, and cache token information.
 *
 * @param {Array<Object>} traces Raw trace objects.
 * @returns {Array<Object>} New array of patched trace copies.
 */
function patchTracesForProviders(traces) {
    return traces.map(trace => {
        const patched = { ...trace };

        // Extract provider from op_display_name
        const opName = trace.op_display_name || '';

        // === PARSE WEAVEOBJECT OUTPUTS ===
        // Test only the type here: also requiring a truthy value would make
        // the empty-string case below unreachable.
        if (typeof patched.output === 'string') {
            const rawOutput = patched.output;

            if (rawOutput === '' || rawOutput === 'None' || rawOutput === 'null') {
                // Only streaming operations get the placeholder
                if (opName.includes('stream') || opName.includes('Stream')) {
                    patched.output = '[Streaming output - not captured in trace]';
                }
            } else if (rawOutput.startsWith('WeaveObject(')) {
                patched.output = extractFromWeaveObject(rawOutput, opName);
            } else if (opName.includes('openai.responses.create')) {
                try {
                    const respObj = JSON.parse(rawOutput);
                    if (respObj.output && Array.isArray(respObj.output)) {
                        // Extract text from output messages
                        const textParts = respObj.output
                            .filter(item => item.type === 'message')
                            .flatMap(msg => msg.content || [])
                            .filter(c => c.type === 'output_text')
                            .map(c => c.text);
                        // Fall back to pretty-printed JSON when no text was found
                        patched.output = textParts.join('\n\n') || JSON.stringify(respObj, null, 2);
                    }
                } catch (e) {
                    // Keep original if parsing fails
                }
            }
        }

        // === EXTRACT MESSAGES FOR NON-OPENAI FORMATS ===
        if (patched.inputs && (!patched.messages || patched.messages.length === 0)) {
            const inputs = patched.inputs;
            if (opName.includes('anthropic') && inputs.messages) {
                // Anthropic format
                patched.messages = inputs.messages;
            } else if (opName.includes('google.genai') && inputs.contents) {
                // Gemini contents format
                patched.messages = extractGeminiMessages(inputs.contents);
            } else if (opName.includes('Chat.send_message') && inputs.message) {
                // Gemini Chat.send_message format
                patched.messages = [{ role: 'user', content: inputs.message }];
            } else if (opName.includes('openai.responses') && inputs.input) {
                // OpenAI responses.create input format
                patched.messages = [{ role: 'user', content: inputs.input }];
            }
        }

        // === ADD PROVIDER-SPECIFIC USAGE INFO ===
        if (patched.usage && typeof patched.usage === 'object') {
            // The spread above is shallow, so copy usage before annotating it;
            // otherwise the caller's trace object would be modified too.
            const usage = { ...patched.usage };

            // Gemini thoughts tokens
            if (usage.thoughts_tokens) {
                usage.thoughts_tokens_label = 'Thinking';
            }
            // OpenAI reasoning tokens
            if (usage.output_tokens_details && usage.output_tokens_details.reasoning_tokens) {
                usage.reasoning_tokens = usage.output_tokens_details.reasoning_tokens;
            }
            // Anthropic cache metrics
            if (usage.cache_read_input_tokens || usage.cache_creation_input_tokens) {
                usage.has_cache_info = true;
            }
            patched.usage = usage;
        }

        return patched;
    });
}
|
|
818
|
+
|
|
819
|
+
// Extract text from WeaveObject string based on provider
|
|
820
|
+
/**
 * Pull the first 'text' field out of a WeaveObject repr string.
 *
 * @param {string} weaveStr String form of a WeaveObject.
 * @param {string} opName Operation name of the trace; accepted for call-site
 *     compatibility and not used for the extraction itself.
 * @returns {string} The unescaped text, or a placeholder followed by a
 *     truncated copy of the input when no non-empty text field is found or
 *     parsing fails.
 */
function extractFromWeaveObject(weaveStr, opName) {
    // Escape sequences decoded inside the captured literal.
    const ESCAPES = { "'": "'", '"': '"', n: '\n', t: '\t', r: '\r', '\\': '\\' };

    try {
        // Find the 'text' field and extract everything until the next unescaped quote.
        // Backslash + any character is consumed as a unit, so multiline strings
        // with escaped quotes and newlines are handled.
        const textMatch = weaveStr.match(/'text':\s*'((?:[^'\\]|\\.)*)'/s);

        if (textMatch && textMatch[1]) {
            // Decode all escapes in ONE left-to-right pass. Chained replace()
            // calls would misread an escaped backslash followed by "n" as a
            // newline escape. Unknown escapes are left untouched.
            return textMatch[1].replace(/\\([\s\S])/g, (whole, ch) =>
                Object.prototype.hasOwnProperty.call(ESCAPES, ch) ? ESCAPES[ch] : whole
            );
        }

        // Fallback: if no text field found, show truncated version
        return `[Complex WeaveObject - see raw data]\n${weaveStr.substring(0, 500)}...`;
    } catch (e) {
        console.error('Failed to parse WeaveObject:', e);
        // String(...) so a non-string input cannot throw again in here.
        return `[Failed to parse WeaveObject]\n${String(weaveStr).substring(0, 200)}...`;
    }
}
|
|
848
|
+
|
|
849
|
+
// Extract messages from Gemini contents format
|
|
850
|
+
/**
 * Convert a Gemini `contents` array into `{ role, content }` messages.
 *
 * Each entry may be a WeaveObject repr string, an object with `role` and a
 * `parts` array, or anything else (which is stringified as a user message).
 *
 * @param {*} contents Value of `inputs.contents` from a trace.
 * @returns {Array<{role: string, content: string}>} One message per entry;
 *     an empty array when `contents` is not an array.
 */
function extractGeminiMessages(contents) {
    if (!Array.isArray(contents)) return [];

    // Escape sequences decoded inside a captured WeaveObject text literal.
    const ESCAPES = { "'": "'", '"': '"', n: '\n', t: '\t', r: '\r', '\\': '\\' };
    // Single left-to-right pass: chained replace() calls would misread an
    // escaped backslash followed by "n" as a newline escape.
    const unescapeLiteral = raw => raw.replace(/\\([\s\S])/g, (whole, ch) =>
        Object.prototype.hasOwnProperty.call(ESCAPES, ch) ? ESCAPES[ch] : whole
    );

    return contents.map(content => {
        // Handle WeaveObject string
        if (typeof content === 'string' && content.startsWith('WeaveObject(')) {
            // Try to extract basic info
            const roleMatch = content.match(/'role':\s*'(\w+)'/);
            const textMatch = content.match(/'text':\s*'((?:[^'\\]|\\.)*)'/s);

            return {
                role: roleMatch ? roleMatch[1] : 'user',
                content: textMatch && textMatch[1]
                    ? unescapeLiteral(textMatch[1])
                    : '[Complex content]'
            };
        }

        // Handle regular object. The null and array checks keep malformed
        // entries from throwing and aborting the whole conversion.
        if (content && content.role && Array.isArray(content.parts)) {
            return {
                role: content.role,
                content: content.parts.map(p => (p && p.text) || '').join('\n')
            };
        }

        return { role: 'user', content: String(content) };
    });
}
|
|
886
|
+
|
|
693
887
|
// Load traces from selected project
|
|
694
888
|
async function loadTraces(projectName) {
|
|
695
889
|
const projectPath = projectName.replace('/', '_');
|
|
@@ -704,7 +898,7 @@
|
|
|
704
898
|
}
|
|
705
899
|
|
|
706
900
|
const data = await response.json();
|
|
707
|
-
allTraces = data;
|
|
901
|
+
allTraces = patchTracesForProviders(data);
|
|
708
902
|
currentProject = projectName;
|
|
709
903
|
populateFilters();
|
|
710
904
|
renderTraces();
|
|
@@ -782,6 +976,12 @@
|
|
|
782
976
|
const projectName = e.target.value;
|
|
783
977
|
if (projectName) {
|
|
784
978
|
await loadTraces(projectName);
|
|
979
|
+
// Save preference
|
|
980
|
+
await fetch('/save_preferences', {
|
|
981
|
+
method: 'POST',
|
|
982
|
+
headers: { 'Content-Type': 'application/json' },
|
|
983
|
+
body: JSON.stringify({ lastProject: projectName })
|
|
984
|
+
});
|
|
785
985
|
}
|
|
786
986
|
});
|
|
787
987
|
|
|
@@ -790,9 +990,21 @@
|
|
|
790
990
|
|
|
791
991
|
// Populate filter dropdowns
|
|
792
992
|
function populateFilters() {
|
|
793
|
-
//
|
|
794
|
-
const ops = new Set(allTraces.map(t => t.op_display_name || 'unknown'));
|
|
993
|
+
// Clear existing options (except "All") to avoid duplicates when switching projects
|
|
795
994
|
const opSelect = document.getElementById('op-filter');
|
|
995
|
+
const modelSelect = document.getElementById('model-filter');
|
|
996
|
+
|
|
997
|
+
// Save current filter values
|
|
998
|
+
const savedOpFilter = currentOpFilter;
|
|
999
|
+
const savedModelFilter = currentModelFilter;
|
|
1000
|
+
|
|
1001
|
+
// Clear dropdowns but keep the "All" option
|
|
1002
|
+
opSelect.innerHTML = '<option value="all">All Operations</option>';
|
|
1003
|
+
opSelect.innerHTML += '<option value="supported">All Supported Ops</option>';
|
|
1004
|
+
modelSelect.innerHTML = '<option value="all">All Models</option>';
|
|
1005
|
+
|
|
1006
|
+
// Populate operation filter with operations from current project only
|
|
1007
|
+
const ops = new Set(allTraces.map(t => t.op_display_name || 'unknown'));
|
|
796
1008
|
const sortedOps = [...ops].sort();
|
|
797
1009
|
sortedOps.forEach(op => {
|
|
798
1010
|
const option = document.createElement('option');
|
|
@@ -801,21 +1013,36 @@
|
|
|
801
1013
|
opSelect.appendChild(option);
|
|
802
1014
|
});
|
|
803
1015
|
|
|
804
|
-
//
|
|
805
|
-
if (sortedOps.includes('openai.chat.completions.create')) {
|
|
806
|
-
opSelect.value = 'openai.chat.completions.create';
|
|
807
|
-
currentOpFilter = 'openai.chat.completions.create';
|
|
808
|
-
}
|
|
809
|
-
|
|
810
|
-
// Populate model filter
|
|
1016
|
+
// Populate model filter with models from current project only
|
|
811
1017
|
const models = new Set(allTraces.map(t => t.model));
|
|
812
|
-
const modelSelect = document.getElementById('model-filter');
|
|
813
1018
|
[...models].sort().forEach(model => {
|
|
814
1019
|
const option = document.createElement('option');
|
|
815
1020
|
option.value = model;
|
|
816
1021
|
option.textContent = model;
|
|
817
1022
|
modelSelect.appendChild(option);
|
|
818
1023
|
});
|
|
1024
|
+
|
|
1025
|
+
// Restore previous filter values if they still exist
|
|
1026
|
+
// Special handling for 'all' and 'supported' which always exist
|
|
1027
|
+
if (savedOpFilter === 'all' || savedOpFilter === 'supported') {
|
|
1028
|
+
opSelect.value = savedOpFilter;
|
|
1029
|
+
currentOpFilter = savedOpFilter;
|
|
1030
|
+
} else if (sortedOps.includes(savedOpFilter)) {
|
|
1031
|
+
opSelect.value = savedOpFilter;
|
|
1032
|
+
currentOpFilter = savedOpFilter;
|
|
1033
|
+
} else {
|
|
1034
|
+
// Default to 'supported' when switching projects
|
|
1035
|
+
opSelect.value = 'supported';
|
|
1036
|
+
currentOpFilter = 'supported';
|
|
1037
|
+
}
|
|
1038
|
+
|
|
1039
|
+
if ([...models].includes(savedModelFilter)) {
|
|
1040
|
+
modelSelect.value = savedModelFilter;
|
|
1041
|
+
currentModelFilter = savedModelFilter;
|
|
1042
|
+
} else {
|
|
1043
|
+
modelSelect.value = 'all';
|
|
1044
|
+
currentModelFilter = 'all';
|
|
1045
|
+
}
|
|
819
1046
|
}
|
|
820
1047
|
|
|
821
1048
|
// Filter change handlers
|
|
@@ -834,7 +1061,13 @@
|
|
|
834
1061
|
let filteredTraces = allTraces;
|
|
835
1062
|
|
|
836
1063
|
// Apply operation filter
|
|
837
|
-
if (currentOpFilter
|
|
1064
|
+
if (currentOpFilter === 'supported') {
|
|
1065
|
+
// Filter to only supported operations
|
|
1066
|
+
filteredTraces = filteredTraces.filter(t => {
|
|
1067
|
+
const opDisplayName = t.op_display_name || '';
|
|
1068
|
+
return SUPPORTED_OPS.some(op => opDisplayName.includes(op));
|
|
1069
|
+
});
|
|
1070
|
+
} else if (currentOpFilter !== 'all') {
|
|
838
1071
|
filteredTraces = filteredTraces.filter(t => t.op_display_name === currentOpFilter);
|
|
839
1072
|
}
|
|
840
1073
|
|
|
@@ -868,9 +1101,12 @@
|
|
|
868
1101
|
${trace.usage && (trace.usage.total_tokens || trace.usage.requests) ? `
|
|
869
1102
|
<div class="usage-info">
|
|
870
1103
|
${trace.usage.requests ? `<div class="usage-item"><span class="usage-label">Requests:</span> ${trace.usage.requests}</div>` : ''}
|
|
871
|
-
${trace.usage.prompt_tokens ? `<div class="usage-item"><span class="usage-label">
|
|
872
|
-
${trace.usage.completion_tokens ? `<div class="usage-item"><span class="usage-label">
|
|
1104
|
+
${trace.usage.prompt_tokens || trace.usage.input_tokens ? `<div class="usage-item"><span class="usage-label">Input:</span> ${trace.usage.prompt_tokens || trace.usage.input_tokens}</div>` : ''}
|
|
1105
|
+
${trace.usage.completion_tokens || trace.usage.output_tokens ? `<div class="usage-item"><span class="usage-label">Output:</span> ${trace.usage.completion_tokens || trace.usage.output_tokens}</div>` : ''}
|
|
873
1106
|
${trace.usage.total_tokens ? `<div class="usage-item"><span class="usage-label">Total:</span> ${trace.usage.total_tokens}</div>` : ''}
|
|
1107
|
+
${trace.usage.reasoning_tokens ? `<div class="usage-item" style="color: #ff9d00;"><span class="usage-label">Reasoning:</span> ${trace.usage.reasoning_tokens}</div>` : ''}
|
|
1108
|
+
${trace.usage.thoughts_tokens ? `<div class="usage-item" style="color: #9d66ff;"><span class="usage-label">Thinking:</span> ${trace.usage.thoughts_tokens}</div>` : ''}
|
|
1109
|
+
${trace.usage.cache_read_input_tokens ? `<div class="usage-item" style="color: #4a9eff;"><span class="usage-label">Cache Read:</span> ${trace.usage.cache_read_input_tokens}</div>` : ''}
|
|
874
1110
|
</div>
|
|
875
1111
|
` : ''}
|
|
876
1112
|
|
|
@@ -980,17 +1216,15 @@
|
|
|
980
1216
|
return;
|
|
981
1217
|
}
|
|
982
1218
|
|
|
983
|
-
// Filter to only
|
|
1219
|
+
// Filter to only supported provider traces (exclude wrapper function traces)
|
|
984
1220
|
const completionTraces = selectedData.filter(t => {
|
|
985
|
-
const opName = t.op_name || '';
|
|
986
1221
|
const opDisplayName = t.op_display_name || '';
|
|
987
|
-
//
|
|
988
|
-
return
|
|
989
|
-
opName.includes('openai.chat.completions.create');
|
|
1222
|
+
// Check if it's one of our supported operations
|
|
1223
|
+
return SUPPORTED_OPS.some(op => opDisplayName.includes(op));
|
|
990
1224
|
});
|
|
991
1225
|
|
|
992
1226
|
if (completionTraces.length === 0) {
|
|
993
|
-
alert('No
|
|
1227
|
+
alert('No supported provider traces selected! Supported: OpenAI, Anthropic, Gemini');
|
|
994
1228
|
return;
|
|
995
1229
|
}
|
|
996
1230
|
|
|
@@ -1044,9 +1278,18 @@
|
|
|
1044
1278
|
// Get filtered traces
|
|
1045
1279
|
function getFilteredTraces() {
|
|
1046
1280
|
let filtered = allTraces;
|
|
1047
|
-
|
|
1281
|
+
|
|
1282
|
+
// Apply operation filter
|
|
1283
|
+
if (currentOpFilter === 'supported') {
|
|
1284
|
+
filtered = filtered.filter(t => {
|
|
1285
|
+
const opDisplayName = t.op_display_name || '';
|
|
1286
|
+
return SUPPORTED_OPS.some(op => opDisplayName.includes(op));
|
|
1287
|
+
});
|
|
1288
|
+
} else if (currentOpFilter !== 'all') {
|
|
1048
1289
|
filtered = filtered.filter(t => t.op_display_name === currentOpFilter);
|
|
1049
1290
|
}
|
|
1291
|
+
|
|
1292
|
+
// Apply model filter
|
|
1050
1293
|
if (currentModelFilter !== 'all') {
|
|
1051
1294
|
filtered = filtered.filter(t => t.model === currentModelFilter);
|
|
1052
1295
|
}
|
|
@@ -1287,38 +1530,48 @@
|
|
|
1287
1530
|
const response = await fetch('/list_judges');
|
|
1288
1531
|
const data = await response.json();
|
|
1289
1532
|
const judges = data.judges || [];
|
|
1290
|
-
const
|
|
1533
|
+
const judgeList = document.getElementById('eval-judge-list');
|
|
1291
1534
|
|
|
1292
1535
|
if (judges.length === 0) {
|
|
1293
|
-
|
|
1536
|
+
judgeList.innerHTML = '<div style="color: #888;">No judges defined - <a href="/judge" target="_blank" style="color: #4a9eff;">create one first</a></div>';
|
|
1294
1537
|
} else {
|
|
1295
|
-
|
|
1538
|
+
judgeList.innerHTML = judges.map((j, i) => `
|
|
1539
|
+
<label style="display: flex; align-items: center; padding: 8px; margin-bottom: 8px; background: #1a1a1a; border-radius: 4px; cursor: pointer; transition: background 0.2s;">
|
|
1540
|
+
<input type="checkbox" class="eval-judge-checkbox" data-judge-index="${i}" style="margin-right: 10px; width: 18px; height: 18px; cursor: pointer;">
|
|
1541
|
+
<div style="flex: 1;">
|
|
1542
|
+
<div style="color: #fff; font-size: 14px; font-weight: 500;">${j.name}</div>
|
|
1543
|
+
<div style="color: #888; font-size: 12px;">${j.type}</div>
|
|
1544
|
+
</div>
|
|
1545
|
+
</label>
|
|
1546
|
+
`).join('');
|
|
1296
1547
|
}
|
|
1297
1548
|
} catch (e) {
|
|
1298
1549
|
console.error('Error loading judges:', e);
|
|
1299
|
-
document.getElementById('eval-judge').innerHTML = '<
|
|
1550
|
+
document.getElementById('eval-judge-list').innerHTML = '<div style="color: #f88;">Error loading judges</div>';
|
|
1300
1551
|
}
|
|
1301
1552
|
}
|
|
1302
1553
|
|
|
1303
1554
|
// Run evaluation
|
|
1304
1555
|
document.getElementById('run-eval-btn').addEventListener('click', async () => {
|
|
1305
|
-
|
|
1556
|
+
// Get selected judges
|
|
1557
|
+
const selectedJudgeCheckboxes = document.querySelectorAll('.eval-judge-checkbox:checked');
|
|
1558
|
+
const selectedJudgeIndices = Array.from(selectedJudgeCheckboxes).map(cb => parseInt(cb.dataset.judgeIndex));
|
|
1306
1559
|
|
|
1307
1560
|
if (selectedEvalModels.size === 0) {
|
|
1308
1561
|
alert('Please select at least one weak model');
|
|
1309
1562
|
return;
|
|
1310
1563
|
}
|
|
1311
1564
|
|
|
1312
|
-
if (
|
|
1313
|
-
alert('Please select
|
|
1565
|
+
if (selectedJudgeIndices.length === 0) {
|
|
1566
|
+
alert('Please select at least one judge');
|
|
1314
1567
|
return;
|
|
1315
1568
|
}
|
|
1316
1569
|
|
|
1317
1570
|
// Load judges from server
|
|
1318
1571
|
const judgesResponse = await fetch('/list_judges');
|
|
1319
1572
|
const judgesData = await judgesResponse.json();
|
|
1320
|
-
const
|
|
1321
|
-
const
|
|
1573
|
+
const allJudges = judgesData.judges || [];
|
|
1574
|
+
const selectedJudges = selectedJudgeIndices.map(idx => allJudges[idx]);
|
|
1322
1575
|
|
|
1323
1576
|
// Show progress
|
|
1324
1577
|
document.getElementById('eval-progress').style.display = 'block';
|
|
@@ -1328,17 +1581,17 @@
|
|
|
1328
1581
|
const resultsDiv = document.getElementById('eval-results-links');
|
|
1329
1582
|
|
|
1330
1583
|
progressText.textContent = `Starting evaluations...\n`;
|
|
1331
|
-
progressText.textContent += `
|
|
1584
|
+
progressText.textContent += `Judges: ${selectedJudges.map(j => j.name).join(', ')}\n`;
|
|
1332
1585
|
progressText.textContent += `Models: ${selectedEvalModels.size}\n\n`;
|
|
1333
1586
|
|
|
1334
1587
|
const modelFiles = Array.from(selectedEvalModels);
|
|
1335
1588
|
const results = [];
|
|
1336
1589
|
|
|
1337
|
-
// Run
|
|
1338
|
-
for (let
|
|
1339
|
-
const modelFile = modelFiles[
|
|
1590
|
+
// Run one evaluation per model with ALL judges combined
|
|
1591
|
+
for (let modelIdx = 0; modelIdx < modelFiles.length; modelIdx++) {
|
|
1592
|
+
const modelFile = modelFiles[modelIdx];
|
|
1340
1593
|
|
|
1341
|
-
progressText.textContent += `[${
|
|
1594
|
+
progressText.textContent += `[${modelIdx + 1}/${modelFiles.length}] Evaluating ${modelFile} with ${selectedJudges.length} judge(s)...\n`;
|
|
1342
1595
|
|
|
1343
1596
|
let pollInterval = null;
|
|
1344
1597
|
let taskId = null;
|
|
@@ -1349,9 +1602,8 @@
|
|
|
1349
1602
|
const resp = await fetch(`/progress/${taskId}`);
|
|
1350
1603
|
if (resp.ok) {
|
|
1351
1604
|
const progress = await resp.json();
|
|
1352
|
-
const percent = (
|
|
1605
|
+
const percent = ((modelIdx + 1) / modelFiles.length) * 100;
|
|
1353
1606
|
progressFill.style.width = `${percent}%`;
|
|
1354
|
-
progressText.textContent = `[${i+1}/${modelFiles.length}] ${progress.message}\nProgress: ${progress.current}/${progress.total} (${percent.toFixed(1)}%)\n`;
|
|
1355
1607
|
}
|
|
1356
1608
|
} catch (e) {
|
|
1357
1609
|
console.error('Error polling eval progress:', e);
|
|
@@ -1360,17 +1612,18 @@
|
|
|
1360
1612
|
|
|
1361
1613
|
try {
|
|
1362
1614
|
// Generate task ID for this evaluation
|
|
1363
|
-
taskId = `eval_${Date.now()}_${
|
|
1615
|
+
taskId = `eval_${Date.now()}_${modelIdx}`;
|
|
1364
1616
|
|
|
1365
1617
|
// Start polling
|
|
1366
1618
|
pollInterval = setInterval(pollProgress, 300);
|
|
1367
1619
|
|
|
1620
|
+
// Send all judges in one request
|
|
1368
1621
|
const response = await fetch('/run_evaluation', {
|
|
1369
1622
|
method: 'POST',
|
|
1370
1623
|
headers: { 'Content-Type': 'application/json' },
|
|
1371
1624
|
body: JSON.stringify({
|
|
1372
1625
|
model_file: modelFile,
|
|
1373
|
-
|
|
1626
|
+
judges: selectedJudges, // Send all judges
|
|
1374
1627
|
task_id: taskId
|
|
1375
1628
|
})
|
|
1376
1629
|
});
|
|
@@ -1385,6 +1638,7 @@
|
|
|
1385
1638
|
if (pollInterval) clearInterval(pollInterval);
|
|
1386
1639
|
|
|
1387
1640
|
progressText.textContent += ` ✓ Complete: ${result.evaluation_name}\n`;
|
|
1641
|
+
progressText.textContent += ` Judges used: ${result.judges.join(', ')}\n`;
|
|
1388
1642
|
progressText.textContent += ` Examples: ${result.examples_evaluated}\n\n`;
|
|
1389
1643
|
|
|
1390
1644
|
results.push({
|
|
@@ -1788,14 +2042,17 @@
|
|
|
1788
2042
|
try {
|
|
1789
2043
|
const response = await fetch('/list_judges');
|
|
1790
2044
|
const data = await response.json();
|
|
1791
|
-
const
|
|
2045
|
+
const judgeList = document.getElementById('e2e-judge-list');
|
|
1792
2046
|
|
|
1793
2047
|
if (data.judges && data.judges.length > 0) {
|
|
1794
|
-
|
|
1795
|
-
|
|
1796
|
-
|
|
2048
|
+
judgeList.innerHTML = data.judges.map((judge, idx) => `
|
|
2049
|
+
<label style="display: block; padding: 5px 0; color: #ccc; cursor: pointer;">
|
|
2050
|
+
<input type="checkbox" class="e2e-judge-checkbox" value="${idx}" style="margin-right: 8px;">
|
|
2051
|
+
${judge.name} (${judge.type})
|
|
2052
|
+
</label>
|
|
2053
|
+
`).join('');
|
|
1797
2054
|
} else {
|
|
1798
|
-
|
|
2055
|
+
judgeList.innerHTML = '<p style="color: #888;">No judges available - create one first</p>';
|
|
1799
2056
|
}
|
|
1800
2057
|
} catch (error) {
|
|
1801
2058
|
console.error('Error loading judges:', error);
|
|
@@ -1831,9 +2088,10 @@
|
|
|
1831
2088
|
return;
|
|
1832
2089
|
}
|
|
1833
2090
|
|
|
1834
|
-
|
|
1835
|
-
|
|
1836
|
-
|
|
2091
|
+
// Get selected judges
|
|
2092
|
+
const selectedJudgeIndices = Array.from(document.querySelectorAll('.e2e-judge-checkbox:checked')).map(cb => parseInt(cb.value));
|
|
2093
|
+
if (selectedJudgeIndices.length === 0) {
|
|
2094
|
+
alert('Please select at least one judge!');
|
|
1837
2095
|
return;
|
|
1838
2096
|
}
|
|
1839
2097
|
|
|
@@ -1842,7 +2100,7 @@
|
|
|
1842
2100
|
// Load judge data
|
|
1843
2101
|
const judgesResponse = await fetch('/list_judges');
|
|
1844
2102
|
const judgesData = await judgesResponse.json();
|
|
1845
|
-
const
|
|
2103
|
+
const judges = selectedJudgeIndices.map(idx => judgesData.judges[idx]);
|
|
1846
2104
|
|
|
1847
2105
|
// Hide config panel, show progress panel
|
|
1848
2106
|
document.getElementById('e2e-panel').style.display = 'none';
|
|
@@ -1928,7 +2186,8 @@
|
|
|
1928
2186
|
|
|
1929
2187
|
// === STEP 3: Run Evaluations ===
|
|
1930
2188
|
stepLabel.textContent = 'Step 3/3: Running evaluations...';
|
|
1931
|
-
|
|
2189
|
+
const judgeNames = judges.map(j => j.name).join(', ');
|
|
2190
|
+
progressText.textContent += `📊 Running evaluations with ${judges.length} judge(s): ${judgeNames}...\n`;
|
|
1932
2191
|
|
|
1933
2192
|
const evaluationResults = [];
|
|
1934
2193
|
|
|
@@ -1971,7 +2230,7 @@
|
|
|
1971
2230
|
headers: { 'Content-Type': 'application/json' },
|
|
1972
2231
|
body: JSON.stringify({
|
|
1973
2232
|
model_file: modelFile,
|
|
1974
|
-
|
|
2233
|
+
judges: judges,
|
|
1975
2234
|
task_id: evalTaskId
|
|
1976
2235
|
})
|
|
1977
2236
|
});
|
|
@@ -0,0 +1,17 @@
|
|
|
1
|
+
quickdistill/__init__.py,sha256=dOl_wXruBGyDGhe1Iu4-SQLu_6-_b6rt1lkxfOp3Jqo,823
|
|
2
|
+
quickdistill/cli.py,sha256=A8d5GN9NdBS299WyAsJ6-p8ynW3DJnDRHZ-UGH7TXLM,2212
|
|
3
|
+
quickdistill/default_judges.json,sha256=9uDqsYc9CsJwZAWwOkWcqgmlGZNJ0zzyXpv4wZ8vtuE,1446
|
|
4
|
+
quickdistill/get_traces.py,sha256=mfy9fMiK-CZQN1noZ4DfOwdwP45ntthVDLgh4-u2iNk,4896
|
|
5
|
+
quickdistill/server.py,sha256=0yBQ5vt1oD7OkhH7ap2cR8j-wuVG3fU7jARijmD1eOs,42849
|
|
6
|
+
quickdistill/__pycache__/__init__.cpython-310.pyc,sha256=RqzjvxzPxHFJZkBjX6DSH9vbVTtskVgJ4pTQ6EX2A6o,794
|
|
7
|
+
quickdistill/__pycache__/cli.cpython-310.pyc,sha256=xtVgJTayQLKS4gE_te7U1Wo8LmkDtPkaa2rnzu8h9fY,2443
|
|
8
|
+
quickdistill/__pycache__/get_traces.cpython-310.pyc,sha256=T7Suxp9vpqYDQJ_3uJvXWemqoLf5tnRC2I0BfHrSiNM,2956
|
|
9
|
+
quickdistill/__pycache__/server.cpython-310.pyc,sha256=8W74-E_S0dJRRwRG7nF9UL64kdbyDoNswAi5y51Xc3I,25593
|
|
10
|
+
quickdistill/default_projects/byyoung3_arena-detailed/traces_data.json,sha256=iz-cBmXBYj0bC3Vn754QTnGuDh6sRvlE_RzSyGXaxbY,15496950
|
|
11
|
+
quickdistill/static/judge_manager.html,sha256=t6dSPwo_d-GIu1FscuK1KDgxKCnmiOekQTMu80lZPPY,27166
|
|
12
|
+
quickdistill/static/trace_viewer.html,sha256=lAMO6Mj-MWQqXGC4bo2v8ybM4ci082h2HaDQ1AOl2jM,109884
|
|
13
|
+
quickdistill-0.1.9.dist-info/METADATA,sha256=-VH48FybeQbxuxUOlSn0zHJfCOkxfklCrxCHbdRYFRQ,5084
|
|
14
|
+
quickdistill-0.1.9.dist-info/WHEEL,sha256=wUyA8OaulRlbfwMtmQsvNngGrxQHAvkKcvRmdizlJi0,92
|
|
15
|
+
quickdistill-0.1.9.dist-info/entry_points.txt,sha256=AUUTxnwdD9gRnsOEcTXQTAZIZ_F0aRU7JGstIJ3Xk_o,55
|
|
16
|
+
quickdistill-0.1.9.dist-info/top_level.txt,sha256=ysiMvurJYsE1IhkxmObe-0G8A-GIav40kTh2z6axjxg,13
|
|
17
|
+
quickdistill-0.1.9.dist-info/RECORD,,
|
|
@@ -1,17 +0,0 @@
|
|
|
1
|
-
quickdistill/__init__.py,sha256=4hLOUVOlPTaZaCLc7950TQGMb-EV_3J9t2qT7StwA7k,397
|
|
2
|
-
quickdistill/cli.py,sha256=A8d5GN9NdBS299WyAsJ6-p8ynW3DJnDRHZ-UGH7TXLM,2212
|
|
3
|
-
quickdistill/default_judges.json,sha256=9uDqsYc9CsJwZAWwOkWcqgmlGZNJ0zzyXpv4wZ8vtuE,1446
|
|
4
|
-
quickdistill/get_traces.py,sha256=mfy9fMiK-CZQN1noZ4DfOwdwP45ntthVDLgh4-u2iNk,4896
|
|
5
|
-
quickdistill/server.py,sha256=0Y0XG-8oYoNZgmo10LPZgtwlHuGqrq0urxE-KabyIvI,36789
|
|
6
|
-
quickdistill/__pycache__/__init__.cpython-310.pyc,sha256=kCGMGP5qGjIpf2QZcBVLVTVlQKd-HHy_l9tHr1LfysU,603
|
|
7
|
-
quickdistill/__pycache__/cli.cpython-310.pyc,sha256=xtVgJTayQLKS4gE_te7U1Wo8LmkDtPkaa2rnzu8h9fY,2443
|
|
8
|
-
quickdistill/__pycache__/get_traces.cpython-310.pyc,sha256=T7Suxp9vpqYDQJ_3uJvXWemqoLf5tnRC2I0BfHrSiNM,2956
|
|
9
|
-
quickdistill/__pycache__/server.cpython-310.pyc,sha256=_taKWofMtdgfMZzfVsd7PoC4jnuKxEOGzW82YBxqPPc,22051
|
|
10
|
-
quickdistill/default_projects/byyoung3_arena-detailed/traces_data.json,sha256=iz-cBmXBYj0bC3Vn754QTnGuDh6sRvlE_RzSyGXaxbY,15496950
|
|
11
|
-
quickdistill/static/judge_manager.html,sha256=t6dSPwo_d-GIu1FscuK1KDgxKCnmiOekQTMu80lZPPY,27166
|
|
12
|
-
quickdistill/static/trace_viewer.html,sha256=yt_zPP88px_51a9ilv8UhrssnVOT-2hjEPHEGoRlPrQ,95152
|
|
13
|
-
quickdistill-0.1.8.dist-info/METADATA,sha256=q4uGRUvQ3HSlHff0ZKs1tBzGos-iOiSxHq3HbKJHa-k,5084
|
|
14
|
-
quickdistill-0.1.8.dist-info/WHEEL,sha256=wUyA8OaulRlbfwMtmQsvNngGrxQHAvkKcvRmdizlJi0,92
|
|
15
|
-
quickdistill-0.1.8.dist-info/entry_points.txt,sha256=AUUTxnwdD9gRnsOEcTXQTAZIZ_F0aRU7JGstIJ3Xk_o,55
|
|
16
|
-
quickdistill-0.1.8.dist-info/top_level.txt,sha256=ysiMvurJYsE1IhkxmObe-0G8A-GIav40kTh2z6axjxg,13
|
|
17
|
-
quickdistill-0.1.8.dist-info/RECORD,,
|
|
File without changes
|
|
File without changes
|
|
File without changes
|