quickdistill 0.1.7__tar.gz → 0.1.9__tar.gz

This diff compares two publicly released versions of the package as they appear in their respective public registries. It is provided for informational purposes only and reflects the published package contents exactly.
Files changed (53)
  1. {quickdistill-0.1.7/quickdistill.egg-info → quickdistill-0.1.9}/PKG-INFO +1 -1
  2. quickdistill-0.1.9/dev/run_inf_with_providers.py +89 -0
  3. quickdistill-0.1.9/dev/v2_run_inf_w_providers.py +174 -0
  4. {quickdistill-0.1.7 → quickdistill-0.1.9}/pyproject.toml +1 -1
  5. quickdistill-0.1.9/quickdistill/__init__.py +28 -0
  6. quickdistill-0.1.9/quickdistill/__pycache__/__init__.cpython-310.pyc +0 -0
  7. quickdistill-0.1.9/quickdistill/__pycache__/server.cpython-310.pyc +0 -0
  8. {quickdistill-0.1.7 → quickdistill-0.1.9}/quickdistill/default_judges.json +2 -2
  9. {quickdistill-0.1.7 → quickdistill-0.1.9}/quickdistill/server.py +170 -29
  10. {quickdistill-0.1.7 → quickdistill-0.1.9}/quickdistill/static/judge_manager.html +12 -8
  11. {quickdistill-0.1.7 → quickdistill-0.1.9}/quickdistill/static/trace_viewer.html +379 -112
  12. {quickdistill-0.1.7 → quickdistill-0.1.9/quickdistill.egg-info}/PKG-INFO +1 -1
  13. {quickdistill-0.1.7 → quickdistill-0.1.9}/quickdistill.egg-info/SOURCES.txt +2 -0
  14. quickdistill-0.1.7/quickdistill/__init__.py +0 -17
  15. quickdistill-0.1.7/quickdistill/__pycache__/__init__.cpython-310.pyc +0 -0
  16. quickdistill-0.1.7/quickdistill/__pycache__/server.cpython-310.pyc +0 -0
  17. {quickdistill-0.1.7 → quickdistill-0.1.9}/.pycommands +0 -0
  18. {quickdistill-0.1.7 → quickdistill-0.1.9}/README.md +0 -0
  19. {quickdistill-0.1.7 → quickdistill-0.1.9}/dev/generate_test_traces.py +0 -0
  20. {quickdistill-0.1.7 → quickdistill-0.1.9}/dev/get_call.py +0 -0
  21. {quickdistill-0.1.7 → quickdistill-0.1.9}/dev/get_traces.py +0 -0
  22. {quickdistill-0.1.7 → quickdistill-0.1.9}/dev/inference_server.py +0 -0
  23. {quickdistill-0.1.7 → quickdistill-0.1.9}/dev/judge_manager.html +0 -0
  24. {quickdistill-0.1.7 → quickdistill-0.1.9}/dev/judges.json +0 -0
  25. {quickdistill-0.1.7 → quickdistill-0.1.9}/dev/old/TEST_TRACE_GENERATION.md +0 -0
  26. {quickdistill-0.1.7 → quickdistill-0.1.9}/dev/old/traces_data.json +0 -0
  27. {quickdistill-0.1.7 → quickdistill-0.1.9}/dev/projects/byyoung3_arena-detailed/traces_data.json +0 -0
  28. {quickdistill-0.1.7 → quickdistill-0.1.9}/dev/projects/byyoung3_claude-opus-4-1-tutorial/traces_data.json +0 -0
  29. {quickdistill-0.1.7 → quickdistill-0.1.9}/dev/projects/byyoung3_test-financial-qa/traces_data.json +0 -0
  30. {quickdistill-0.1.7 → quickdistill-0.1.9}/dev/pystatus +0 -0
  31. {quickdistill-0.1.7 → quickdistill-0.1.9}/dev/run_evaluation.py +0 -0
  32. {quickdistill-0.1.7 → quickdistill-0.1.9}/dev/run_weak_models.py +0 -0
  33. {quickdistill-0.1.7 → quickdistill-0.1.9}/dev/strong_exports/anthropic_claude-3.5-sonnet_10traces_v2.json +0 -0
  34. {quickdistill-0.1.7 → quickdistill-0.1.9}/dev/strong_exports/anthropic_claude-3.5-sonnet_20traces.json +0 -0
  35. {quickdistill-0.1.7 → quickdistill-0.1.9}/dev/strong_exports/claude-opus-4-1-20250805_1traces.json +0 -0
  36. {quickdistill-0.1.7 → quickdistill-0.1.9}/dev/strong_exports/gpt-5-2025-08-07_199traces.json +0 -0
  37. {quickdistill-0.1.7 → quickdistill-0.1.9}/dev/trace_viewer.html +0 -0
  38. {quickdistill-0.1.7 → quickdistill-0.1.9}/dev/traces_data.json +0 -0
  39. {quickdistill-0.1.7 → quickdistill-0.1.9}/dev/weak_model_google_gemini-2.5-flash.json +0 -0
  40. {quickdistill-0.1.7 → quickdistill-0.1.9}/dev/weak_model_meta-llama_Llama-3.1-8B-Instruct.json +0 -0
  41. {quickdistill-0.1.7 → quickdistill-0.1.9}/dev/weak_model_meta-llama_Llama-3.3-70B-Instruct.json +0 -0
  42. {quickdistill-0.1.7 → quickdistill-0.1.9}/dev/weak_model_openai_gpt-oss-20b.json +0 -0
  43. {quickdistill-0.1.7 → quickdistill-0.1.9}/quickdistill/__pycache__/cli.cpython-310.pyc +0 -0
  44. {quickdistill-0.1.7 → quickdistill-0.1.9}/quickdistill/__pycache__/get_traces.cpython-310.pyc +0 -0
  45. {quickdistill-0.1.7 → quickdistill-0.1.9}/quickdistill/cli.py +0 -0
  46. {quickdistill-0.1.7 → quickdistill-0.1.9}/quickdistill/default_projects/byyoung3_arena-detailed/traces_data.json +0 -0
  47. {quickdistill-0.1.7 → quickdistill-0.1.9}/quickdistill/get_traces.py +0 -0
  48. {quickdistill-0.1.7 → quickdistill-0.1.9}/quickdistill.egg-info/dependency_links.txt +0 -0
  49. {quickdistill-0.1.7 → quickdistill-0.1.9}/quickdistill.egg-info/entry_points.txt +0 -0
  50. {quickdistill-0.1.7 → quickdistill-0.1.9}/quickdistill.egg-info/requires.txt +0 -0
  51. {quickdistill-0.1.7 → quickdistill-0.1.9}/quickdistill.egg-info/top_level.txt +0 -0
  52. {quickdistill-0.1.7 → quickdistill-0.1.9}/setup.cfg +0 -0
  53. {quickdistill-0.1.7 → quickdistill-0.1.9}/update.sh +0 -0
{quickdistill-0.1.7/quickdistill.egg-info → quickdistill-0.1.9}/PKG-INFO
@@ -1,6 +1,6 @@
  Metadata-Version: 2.4
  Name: quickdistill
- Version: 0.1.7
+ Version: 0.1.9
  Summary: Fast and easy toolkit for distilling AI models
  Author-email: Brett Young <bdytx5@umsystem.edu>
  License: MIT
quickdistill-0.1.9/dev/run_inf_with_providers.py
@@ -0,0 +1,89 @@
+ import os
+ import anthropic
+ 
+ # ---------------- GEMINI ----------------
+ from google import genai
+ # ---------------- GROK ----------------
+ from xai_sdk import Client as XAIClient
+ from xai_sdk.chat import user, system
+ 
+ import weave; weave.init("providers-testing")
+ 
+ 
+ def run_gemini(prompt: str):
+     client = genai.Client(api_key=os.environ["GEMINI_API_KEY"])
+ 
+     resp = client.models.generate_content(
+         model="gemini-2.5-flash",
+         contents=[{
+             "role": "user",
+             "parts": [{"text": prompt}]
+         }]
+     )
+ 
+     return resp.text
+ 
+ 
+ # ---------------- CLAUDE ----------------
+ 
+ def run_claude(prompt: str):
+     client = anthropic.Anthropic(
+         api_key=os.environ["ANTHROPIC_API_KEY"]
+     )
+ 
+     msg = client.messages.create(
+         model="claude-haiku-4-5",
+         max_tokens=512,
+         messages=[
+             {"role": "user", "content": prompt}
+         ]
+     )
+ 
+     return msg.content[0].text
+ 
+ 
+ 
+ def run_grok(prompt: str):
+     client = XAIClient(
+         api_key=os.environ["XAI_API_KEY"],
+         timeout=3600
+     )
+ 
+     chat = client.chat.create(
+         model="grok-4-1-fast-reasoning"
+     )
+ 
+     chat.append(system("You are Grok, a helpful AI assistant."))
+     chat.append(user(prompt))
+ 
+     resp = chat.sample()
+ 
+     return resp.content
+ 
+ 
+ # ---------------- UNIFIED ROUTER ----------------
+ def run_model(provider: str, prompt: str):
+     provider = provider.lower()
+ 
+     if provider == "gemini":
+         return run_gemini(prompt)
+ 
+     if provider == "claude":
+         return run_claude(prompt)
+ 
+     if provider == "grok":
+         return run_grok(prompt)
+ 
+     raise ValueError(provider)
+ 
+ 
+ # ---------------- TEST ----------------
+ if __name__ == "__main__":
+     prompt = "Explain transformers simply"
+ 
+     for provider in ["gemini", "claude", "grok"]:
+         try:
+             print(f"\n=== {provider.upper()} ===")
+             print(run_model(provider, prompt))
+         except Exception as e:
+             print(provider, "failed:", e)
quickdistill-0.1.9/dev/v2_run_inf_w_providers.py
@@ -0,0 +1,174 @@
+ # pip install openai anthropic google-genai
+ 
+ import os
+ 
+ # ================= OPENAI =================
+ from openai import OpenAI
+ from google import genai
+ import anthropic
+ 
+ import weave; weave.init("providers-testing")
+ 
+ openai_client = OpenAI(api_key=os.environ["OPENAI_API_KEY"])
+ 
+ 
+ 
+ 
+ 
+ 
+ 
+ 
+ def openai_responses(prompt: str):
+     resp = openai_client.responses.create(
+         model="gpt-5-mini",
+         input=prompt
+     )
+     return resp.output_text
+ 
+ 
+ def openai_chat(prompt: str):
+     resp = openai_client.chat.completions.create(
+         model="gpt-4.1-mini",
+         messages=[{"role": "user", "content": prompt}]
+     )
+     return resp.choices[0].message.content
+ 
+ 
+ def openai_stream(prompt: str):
+     print("\n[OpenAI Streaming]")
+     with openai_client.responses.stream(
+         model="gpt-5-mini",
+         input=prompt
+     ) as stream:
+         for event in stream:
+             if event.type == "response.output_text.delta":
+                 print(event.delta, end="", flush=True)
+     print()
+ 
+ 
+ # ================= ANTHROPIC =================
+ 
+ 
+ anthropic_client = anthropic.Anthropic(
+     api_key=os.environ["ANTHROPIC_API_KEY"]
+ )
+ 
+ 
+ def anthropic_messages(prompt: str):
+     resp = anthropic_client.messages.create(
+         model="claude-haiku-4-5-20251001",
+         max_tokens=512,
+         messages=[{"role": "user", "content": prompt}]
+     )
+     return resp.content[0].text
+ 
+ 
+ def anthropic_stream(prompt: str):
+     print("\n[Anthropic Streaming]")
+     with anthropic_client.messages.stream(
+         model="claude-haiku-4-5-20251001",
+         max_tokens=512,
+         messages=[{"role": "user", "content": prompt}]
+     ) as stream:
+         for text in stream.text_stream:
+             print(text, end="", flush=True)
+     print()
+ 
+ 
+ # ================= GEMINI =================
+ 
+ gemini_client = genai.Client(
+     api_key=os.environ["GEMINI_API_KEY"]
+ )
+ 
+ 
+ def gemini_generate(prompt: str):
+     resp = gemini_client.models.generate_content(
+         model="gemini-2.5-flash",
+         contents=[{
+             "role": "user",
+             "parts": [{"text": prompt}]
+         }]
+     )
+     return resp.text
+ 
+ 
+ def gemini_chat(prompt: str):
+     chat = gemini_client.chats.create(
+         model="gemini-2.5-flash"
+     )
+     resp = chat.send_message(prompt)
+     return resp.text
+ 
+ 
+ def gemini_stream(prompt: str):
+     print("\n[Gemini Streaming]")
+     chat = gemini_client.chats.create(
+         model="gemini-2.5-flash"
+     )
+     stream = chat.send_message_stream(prompt)
+ 
+     for chunk in stream:
+         if chunk.text:
+             print(chunk.text, end="", flush=True)
+     print()
+ 
+ 
+ # ================= TOOL CALL EXAMPLE =================
+ # Minimal cross-provider demonstration using OpenAI only
+ # (Anthropic/Gemini support tools but schemas differ heavily)
+ 
+ def openai_tool_example():
+ 
+     tools = [{
+         "type": "function",
+         "function": {
+             "name": "get_weather",
+             "parameters": {
+                 "type": "object",
+                 "properties": {
+                     "city": {"type": "string"}
+                 },
+                 "required": ["city"]
+             }
+         }
+     }]
+ 
+     resp = openai_client.chat.completions.create(
+         model="gpt-4.1-mini",
+         messages=[{"role": "user", "content": "What's weather in Tokyo?"}],
+         tools=tools
+     )
+ 
+     return resp.choices[0].message.tool_calls
+ 
+ 
+ # ================= RUN ALL =================
+ 
+ if __name__ == "__main__":
+ 
+     prompt = "Explain transformers simply."
+ 
+     # print("\n==== OPENAI RESPONSES ====")
+     # print(openai_responses(prompt))
+ 
+     # print("\n==== OPENAI CHAT ====")
+     # print(openai_chat(prompt))
+ 
+     # openai_stream(prompt)
+ 
+     print("\n==== ANTHROPIC ====")
+     print(anthropic_messages(prompt))
+ 
+     anthropic_stream(prompt)
+ 
+     print("\n==== GEMINI GENERATE ====")
+     print(gemini_generate(prompt))
+ 
+     print("\n==== GEMINI CHAT ====")
+     print(gemini_chat(prompt))
+ 
+     gemini_stream(prompt)
+ 
+     print("\n==== OPENAI TOOL CALL ====")
+     print(openai_tool_example())
{quickdistill-0.1.7 → quickdistill-0.1.9}/pyproject.toml
@@ -4,7 +4,7 @@ build-backend = "setuptools.build_meta"
  
  [project]
  name = "quickdistill"
- version = "0.1.7"
+ version = "0.1.9"
  description = "Fast and easy toolkit for distilling AI models"
  readme = "README.md"
  authors = [
quickdistill-0.1.9/quickdistill/__init__.py
@@ -0,0 +1,28 @@
+ """
+ QuickDistill - A fast and easy toolkit for distilling AI models.
+ 
+ This package provides tools to:
+ - Capture and view Weave traces
+ - Run weak models on strong model outputs
+ - Evaluate similarity using LLM judges
+ - Export datasets for model evaluation
+ """
+ 
+ # Monkey patch for aiohttp/litellm compatibility
+ # litellm expects aiohttp.ConnectionTimeoutError but it doesn't exist in some versions
+ try:
+     import aiohttp
+     if not hasattr(aiohttp, 'ConnectionTimeoutError'):
+         aiohttp.ConnectionTimeoutError = aiohttp.ServerTimeoutError
+     if not hasattr(aiohttp, 'SocketTimeoutError'):
+         aiohttp.SocketTimeoutError = aiohttp.ServerTimeoutError
+ except Exception:
+     pass
+ 
+ __version__ = "0.1.9"
+ __author__ = "Brett Young"
+ __email__ = "bdytx5@umsystem.edu"
+ 
+ from quickdistill.cli import main
+ 
+ __all__ = ["main"]
{quickdistill-0.1.7 → quickdistill-0.1.9}/quickdistill/default_judges.json
@@ -2,14 +2,14 @@
    {
      "name": "boolean_scorer",
      "type": "llm",
-     "model": "gpt-5",
+     "model": "openai/gpt-5",
      "returnType": "boolean",
      "prompt": "You are a strict evaluator comparing two AI responses (one from a strong reference model which is the ground truth, and one from a weaker model which we are testing to see how similar the responses it generates are to the strong model).\n\nStrong Model Response: {strong_output}\nWeak Model Response: {weak_output}\n\nDetermine if the weak model response is CORRECT compared to the strong model response.\nConsider a response CORRECT if it conveys the same key information and meaning, even if worded differently.\n\nRespond in JSON format: {'correct': true} or {'correct': false}"
    },
    {
      "name": "scalar_scorer",
      "type": "llm",
-     "model": "gpt-5",
+     "model": "openai/gpt-5",
      "returnType": "scalar",
      "prompt": "You are a strict evaluator comparing two AI responses (one from a strong reference model which is the ground truth, and one from a weaker model which we are testing to see how similar the responses it generates are to the strong model).\n\nStrong Model Response: {strong_output}\nWeak Model Response: {weak_output}\n\nEvaluate how similar the weak model response is to the strong model response.\nRate on a scale of 1-5 where 1=completely different and 5=nearly identical. RETURN ONLY ONE SCORE REPRESENTING THE AVERAGE SIMILARITY (EG 5-(avg_error))\n\nRespond in JSON format eg {'scores': the_score }"
    }
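Both default judges now carry a provider prefix because the judge model field is interpreted in LiteLLM format (see the judge_manager.html change below). A minimal sketch of what that format buys, assuming litellm is installed:

    import litellm

    # "openai/gpt-5" routes explicitly to the OpenAI provider; a bare
    # "gpt-5" would force LiteLLM to infer the provider from the name.
    resp = litellm.completion(
        model="openai/gpt-5",
        messages=[{"role": "user", "content": "Say OK."}],
    )
    print(resp.choices[0].message.content)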
{quickdistill-0.1.7 → quickdistill-0.1.9}/quickdistill/server.py
@@ -100,40 +100,133 @@ def run_inference(client, model, messages, max_tokens=1000):
          return f"ERROR: {str(e)}"
  
  def extract_output_content(output_str):
-     """Extract actual content from WeaveObject string or regular output"""
+     """Extract actual content from WeaveObject string, JSON response, or regular output.
+ 
+     Handles outputs from:
+     - OpenAI chat.completions.create (plain text)
+     - OpenAI responses.create (JSON with nested structure)
+     - Anthropic Messages (WeaveObject with content[0].text)
+     - Google Gemini (WeaveObject with candidates[0].content.parts[0].text)
+     """
+     import re
+     import json
+ 
      if not output_str:
          return None
  
-     # If it's a WeaveObject string, try to extract the text content
-     if isinstance(output_str, str) and 'WeaveObject' in output_str:
-         import re
-         # Try to find the 'text' field in the WeaveObject
-         match = re.search(r"'text':\s*'([^']*(?:\\'[^']*)*)'", output_str)
+     if not isinstance(output_str, str):
+         return str(output_str)
+ 
+     # Handle empty/streaming responses
+     if output_str in ('', 'None', 'null'):
+         return '[Streaming output - not captured]'
+ 
+     # Handle OpenAI responses.create JSON format
+     if output_str.startswith('{') and '"output"' in output_str:
+         try:
+             resp_obj = json.loads(output_str)
+             if 'output' in resp_obj and isinstance(resp_obj['output'], list):
+                 # Extract text from output messages
+                 text_parts = []
+                 for item in resp_obj['output']:
+                     if item.get('type') == 'message' and 'content' in item:
+                         for content in item['content']:
+                             if content.get('type') == 'output_text' and 'text' in content:
+                                 text_parts.append(content['text'])
+                 if text_parts:
+                     return '\n\n'.join(text_parts)
+         except (json.JSONDecodeError, KeyError, TypeError):
+             pass  # Fall through to other handlers
+ 
+     # Handle WeaveObject strings (Anthropic, Gemini)
+     if 'WeaveObject' in output_str:
+         # Improved regex that handles escape sequences properly
+         match = re.search(r"'text':\s*'((?:[^'\\]|\\.)*)'", output_str, re.DOTALL)
          if match:
-             # Unescape the string
+             # Unescape the string properly (order matters!)
              text = match.group(1)
-             text = text.replace('\\n', '\n').replace("\\'", "'").replace('\\\\', '\\')
+             text = text.replace("\\'", "'")    # escaped single quotes
+             text = text.replace('\\"', '"')    # escaped double quotes
+             text = text.replace('\\n', '\n')   # newlines
+             text = text.replace('\\t', '\t')   # tabs
+             text = text.replace('\\r', '\r')   # carriage returns
+             text = text.replace('\\\\', '\\')  # escaped backslashes (do this last!)
              return text
  
-     # Otherwise return as-is
+         # If no text field found, return truncated version
+         return f"[Complex WeaveObject - could not extract text]\n{output_str[:500]}..."
+ 
+     # Plain text output (standard OpenAI chat format)
      return output_str
  
  
  def extract_messages_from_trace(trace):
-     """Extract messages from a trace in the format needed for inference"""
-     # Check if messages are at top level
+     """Extract messages from a trace in the format needed for inference.
+ 
+     Handles message extraction from:
+     - OpenAI chat.completions.create (messages at top level or in inputs.messages)
+     - OpenAI responses.create (inputs.input field)
+     - Anthropic Messages (inputs.messages)
+     - Google Gemini generate_content (inputs.contents array)
+     - Google Gemini Chat.send_message (inputs.message string)
+     """
+     import re
+ 
+     # Get op_display_name for provider detection
+     op_name = trace.get('op_display_name', '')
+ 
+     # Check if messages are at top level (already extracted/cached)
      if trace.get('messages') and isinstance(trace['messages'], list) and len(trace['messages']) > 0:
          return trace['messages']
  
      # Check if messages are in inputs
      if trace.get('inputs') and isinstance(trace['inputs'], dict):
-         messages = trace['inputs'].get('messages', [])
+         inputs = trace['inputs']
+ 
+         # Standard OpenAI/Anthropic: inputs.messages
+         messages = inputs.get('messages', [])
          if isinstance(messages, list) and len(messages) > 0:
              return messages
  
+         # OpenAI responses.create: inputs.input (simple string)
+         if 'openai.responses' in op_name and 'input' in inputs:
+             return [{"role": "user", "content": inputs['input']}]
+ 
+         # Gemini Chat.send_message: inputs.message (simple string)
+         if 'Chat.send_message' in op_name and 'message' in inputs:
+             return [{"role": "user", "content": inputs['message']}]
+ 
+         # Gemini generate_content: inputs.contents (array of content objects or WeaveObject strings)
+         if 'google.genai' in op_name and 'contents' in inputs:
+             contents = inputs['contents']
+             if isinstance(contents, list) and len(contents) > 0:
+                 messages = []
+                 for content in contents:
+                     # Handle WeaveObject string format
+                     if isinstance(content, str) and 'WeaveObject' in content:
+                         role_match = re.search(r"'role':\s*'(\w+)'", content)
+                         text_match = re.search(r"'text':\s*'((?:[^'\\]|\\.)*)'", content, re.DOTALL)
+                         text = '[Complex content]'
+                         if text_match:
+                             text = text_match.group(1)
+                             text = text.replace("\\'", "'").replace('\\n', '\n').replace('\\\\', '\\')
+                         messages.append({
+                             "role": role_match.group(1) if role_match else "user",
+                             "content": text
+                         })
+                     # Handle regular dict format
+                     elif isinstance(content, dict):
+                         role = content.get('role', 'user')
+                         parts = content.get('parts', [])
+                         if isinstance(parts, list):
+                             text = '\n'.join([p.get('text', '') for p in parts if isinstance(p, dict)])
+                             messages.append({"role": role, "content": text})
+                 if messages:
+                     return messages
+ 
          # Check if inputs has question/context format (from generate_test_traces.py wrapper traces)
-         question = trace['inputs'].get('question')
-         context = trace['inputs'].get('context')
+         question = inputs.get('question')
+         context = inputs.get('context')
          if question:
              if context:
                  prompt = f"""Based on the following context, answer the question concisely.
@@ -753,16 +846,26 @@ def delete_judge():
  
  @app.route('/run_evaluation', methods=['POST'])
  def run_evaluation_endpoint():
-     """Run evaluation using specified judge"""
- 
+     """Run evaluation using specified judge(s) - supports multiple judges"""
+ 
  
      data = request.json
      model_file = data.get('model_file')
-     judge = data.get('judge')
+     judges = data.get('judges')  # Can be a list or single judge dict
      task_id = data.get('task_id', f"eval_{id(data)}")
  
-     if not model_file or not judge:
-         return jsonify({'error': 'Missing model_file or judge'}), 400
+     # Handle both single judge (backwards compat) and multiple judges
+     if data.get('judge'):
+         judges = [data.get('judge')]
+     elif not judges:
+         return jsonify({'error': 'Missing judge or judges'}), 400
+ 
+     # Ensure judges is a list
+     if not isinstance(judges, list):
+         judges = [judges]
+ 
+     if not model_file:
+         return jsonify({'error': 'Missing model_file'}), 400
  
      # Load weak model results
      model_path = DATA_DIR / model_file
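For context, a hypothetical client call showing the payload shape the endpoint now accepts (server address and prompts are illustrative; the old single {"judge": {...}} payload still works):

    import requests

    resp = requests.post("http://localhost:5000/run_evaluation", json={
        "model_file": "weak_model_openai_gpt-oss-20b.json",
        "judges": [
            {"name": "boolean_scorer", "type": "llm",
             "model": "openai/gpt-5", "returnType": "boolean", "prompt": "..."},
            {"name": "scalar_scorer", "type": "llm",
             "model": "openai/gpt-5", "returnType": "scalar", "prompt": "..."},
        ],
    })
    print(resp.json())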
@@ -782,18 +885,22 @@ def run_evaluation_endpoint():
      # Extract model name from filename
      model_name = model_file.replace('weak_model_', '').replace('.json', '')
  
+     # Create evaluation name with all judges
+     judges_names = '_'.join([j['name'] for j in judges])
+     eval_name = f"eval-{model_name}-{judges_names}"
+ 
      # Initialize progress tracking
      total_steps = len(results)
      progress_state[task_id] = {
          'current': 0,
          'total': total_steps,
-         'message': f'Starting evaluation: {model_name} with {judge["name"]}...',
+         'message': f'Starting evaluation: {model_name} with {len(judges)} judge(s)...',
          'status': 'running'
      }
  
      # Create evaluation logger
      ev = weave.EvaluationLogger(
-         name=f"eval-{model_name}-{judge['name']}",
+         name=eval_name,
          model=model_name
      )
  
@@ -818,13 +925,20 @@
          if messages and len(messages) > 0:
              question = messages[0].get('content', '')
  
-         # Run judge
-         if judge['type'] == 'llm':
-             scores = run_llm_judge_eval(judge, strong_output, weak_output, question)
-         else:
-             scores = run_custom_judge_eval(judge, strong_output, weak_output)
+         # Run all judges and collect scores
+         all_scores = {}
+         for judge in judges:
+             # Run judge
+             if judge['type'] == 'llm':
+                 scores = run_llm_judge_eval(judge, strong_output, weak_output, question)
+             else:
+                 scores = run_custom_judge_eval(judge, strong_output, weak_output)
+ 
+             # Merge scores with judge name prefix to avoid conflicts
+             for score_key, score_value in scores.items():
+                 all_scores[f"{judge['name']}_{score_key}"] = score_value
  
-         # Log to weave
+         # Log to weave with all scores from all judges
          ev.log_example(
              inputs={
                  "question": question,
@@ -834,7 +948,7 @@
                  "weak_output": weak_output
  
              },
-             scores=scores
+             scores=all_scores
          )
  
      # Finish evaluation
@@ -850,10 +964,11 @@
  
      return jsonify({
          'status': 'success',
-         'evaluation_name': f"eval-{model_name}-{judge['name']}",
+         'evaluation_name': eval_name,
          'examples_evaluated': len(results),
          'weave_url': ev.ui_url,
          'strong_export': strong_export,
+         'judges': [j['name'] for j in judges],
          'task_id': task_id
      })
  
@@ -1032,6 +1147,32 @@ def list_projects():
      return jsonify({'projects': projects})
  
  
+ @app.route('/get_preferences', methods=['GET'])
+ def get_preferences():
+     """Get saved user preferences"""
+     prefs_file = DATA_DIR / 'preferences.json'
+     if prefs_file.exists():
+         try:
+             with open(prefs_file, 'r') as f:
+                 return jsonify(json.load(f))
+         except:
+             pass
+     return jsonify({})
+ 
+ 
+ @app.route('/save_preferences', methods=['POST'])
+ def save_preferences():
+     """Save user preferences"""
+     try:
+         data = request.json
+         prefs_file = DATA_DIR / 'preferences.json'
+         with open(prefs_file, 'w') as f:
+             json.dump(data, f, indent=2)
+         return jsonify({'status': 'success'})
+     except Exception as e:
+         return jsonify({'status': 'error', 'message': str(e)}), 500
+ 
+ 
  # Routes for serving HTML pages
  @app.route('/')
  def index():
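A hypothetical round trip against the two new endpoints (server address assumed; preference keys are arbitrary JSON):

    import requests

    BASE = "http://localhost:5000"  # assumed local server address

    # Persist UI preferences; the server writes them to DATA_DIR/preferences.json.
    requests.post(f"{BASE}/save_preferences",
                  json={"last_judge": "boolean_scorer", "theme": "dark"})

    # Read them back; returns {} if nothing has been saved yet.
    print(requests.get(f"{BASE}/get_preferences").json())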
{quickdistill-0.1.7 → quickdistill-0.1.9}/quickdistill/static/judge_manager.html
@@ -183,12 +183,10 @@
  
      <div id="llm-options" style="display: block;">
          <label for="judge-model">Model</label>
-         <select id="judge-model">
-             <option value="gpt-5">gpt-5</option>
-             <option value="gpt-4o">gpt-4o</option>
-             <option value="gpt-4o-mini">gpt-4o-mini</option>
-             <option value="claude-3-5-sonnet-20241022">claude-3-5-sonnet</option>
-         </select>
+         <input type="text" id="judge-model" placeholder="e.g., openai/gpt-5, anthropic/claude-3.5-sonnet" value="openai/gpt-5">
+         <p style="color: #888; font-size: 12px; margin-top: 5px; margin-bottom: 15px;">
+             <strong>Note:</strong> Uses LiteLLM format. Examples: <code>openai/gpt-5</code>, <code>anthropic/claude-3.5-sonnet</code>, <code>openai/gpt-4o</code>
+         </p>
  
          <label for="judge-return-type">Return Type</label>
          <select id="judge-return-type">
@@ -393,10 +391,16 @@ Respond in JSON format: {'correct': true} or {'correct': false}`
          };
  
          if (type === 'llm') {
-             judge.model = document.getElementById('judge-model').value;
+             judge.model = document.getElementById('judge-model').value.trim();
              judge.returnType = document.getElementById('judge-return-type').value;
              judge.prompt = document.getElementById('judge-prompt').value.trim();
  
+             // Validate model
+             if (!judge.model) {
+                 alert('Error: Please enter a model (e.g., openai/gpt-5)');
+                 return;
+             }
+ 
              // Validate required placeholders
              if (!judge.prompt.includes('{strong_output}')) {
                  alert('Error: Judge prompt must include {strong_output} placeholder');
@@ -420,7 +424,7 @@
      function resetForm() {
          document.getElementById('judge-name').value = '';
          document.getElementById('judge-type').value = 'llm';
-         document.getElementById('judge-model').value = 'gpt-5-2025-08-07';
+         document.getElementById('judge-model').value = 'openai/gpt-5';
          document.getElementById('judge-prompt').value = '';
          document.getElementById('form-title').textContent = 'Create New Judge';
          document.getElementById('save-btn').textContent = 'Save Judge';