quickdistill 0.1.8__tar.gz → 0.1.9__tar.gz
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- {quickdistill-0.1.8/quickdistill.egg-info → quickdistill-0.1.9}/PKG-INFO +1 -1
- quickdistill-0.1.9/dev/run_inf_with_providers.py +89 -0
- quickdistill-0.1.9/dev/v2_run_inf_w_providers.py +174 -0
- {quickdistill-0.1.8 → quickdistill-0.1.9}/pyproject.toml +1 -1
- quickdistill-0.1.9/quickdistill/__init__.py +28 -0
- quickdistill-0.1.9/quickdistill/__pycache__/__init__.cpython-310.pyc +0 -0
- quickdistill-0.1.9/quickdistill/__pycache__/server.cpython-310.pyc +0 -0
- {quickdistill-0.1.8 → quickdistill-0.1.9}/quickdistill/server.py +170 -29
- {quickdistill-0.1.8 → quickdistill-0.1.9}/quickdistill/static/trace_viewer.html +375 -116
- {quickdistill-0.1.8 → quickdistill-0.1.9/quickdistill.egg-info}/PKG-INFO +1 -1
- {quickdistill-0.1.8 → quickdistill-0.1.9}/quickdistill.egg-info/SOURCES.txt +2 -0
- quickdistill-0.1.8/quickdistill/__init__.py +0 -17
- quickdistill-0.1.8/quickdistill/__pycache__/__init__.cpython-310.pyc +0 -0
- quickdistill-0.1.8/quickdistill/__pycache__/server.cpython-310.pyc +0 -0
- {quickdistill-0.1.8 → quickdistill-0.1.9}/.pycommands +0 -0
- {quickdistill-0.1.8 → quickdistill-0.1.9}/README.md +0 -0
- {quickdistill-0.1.8 → quickdistill-0.1.9}/dev/generate_test_traces.py +0 -0
- {quickdistill-0.1.8 → quickdistill-0.1.9}/dev/get_call.py +0 -0
- {quickdistill-0.1.8 → quickdistill-0.1.9}/dev/get_traces.py +0 -0
- {quickdistill-0.1.8 → quickdistill-0.1.9}/dev/inference_server.py +0 -0
- {quickdistill-0.1.8 → quickdistill-0.1.9}/dev/judge_manager.html +0 -0
- {quickdistill-0.1.8 → quickdistill-0.1.9}/dev/judges.json +0 -0
- {quickdistill-0.1.8 → quickdistill-0.1.9}/dev/old/TEST_TRACE_GENERATION.md +0 -0
- {quickdistill-0.1.8 → quickdistill-0.1.9}/dev/old/traces_data.json +0 -0
- {quickdistill-0.1.8 → quickdistill-0.1.9}/dev/projects/byyoung3_arena-detailed/traces_data.json +0 -0
- {quickdistill-0.1.8 → quickdistill-0.1.9}/dev/projects/byyoung3_claude-opus-4-1-tutorial/traces_data.json +0 -0
- {quickdistill-0.1.8 → quickdistill-0.1.9}/dev/projects/byyoung3_test-financial-qa/traces_data.json +0 -0
- {quickdistill-0.1.8 → quickdistill-0.1.9}/dev/pystatus +0 -0
- {quickdistill-0.1.8 → quickdistill-0.1.9}/dev/run_evaluation.py +0 -0
- {quickdistill-0.1.8 → quickdistill-0.1.9}/dev/run_weak_models.py +0 -0
- {quickdistill-0.1.8 → quickdistill-0.1.9}/dev/strong_exports/anthropic_claude-3.5-sonnet_10traces_v2.json +0 -0
- {quickdistill-0.1.8 → quickdistill-0.1.9}/dev/strong_exports/anthropic_claude-3.5-sonnet_20traces.json +0 -0
- {quickdistill-0.1.8 → quickdistill-0.1.9}/dev/strong_exports/claude-opus-4-1-20250805_1traces.json +0 -0
- {quickdistill-0.1.8 → quickdistill-0.1.9}/dev/strong_exports/gpt-5-2025-08-07_199traces.json +0 -0
- {quickdistill-0.1.8 → quickdistill-0.1.9}/dev/trace_viewer.html +0 -0
- {quickdistill-0.1.8 → quickdistill-0.1.9}/dev/traces_data.json +0 -0
- {quickdistill-0.1.8 → quickdistill-0.1.9}/dev/weak_model_google_gemini-2.5-flash.json +0 -0
- {quickdistill-0.1.8 → quickdistill-0.1.9}/dev/weak_model_meta-llama_Llama-3.1-8B-Instruct.json +0 -0
- {quickdistill-0.1.8 → quickdistill-0.1.9}/dev/weak_model_meta-llama_Llama-3.3-70B-Instruct.json +0 -0
- {quickdistill-0.1.8 → quickdistill-0.1.9}/dev/weak_model_openai_gpt-oss-20b.json +0 -0
- {quickdistill-0.1.8 → quickdistill-0.1.9}/quickdistill/__pycache__/cli.cpython-310.pyc +0 -0
- {quickdistill-0.1.8 → quickdistill-0.1.9}/quickdistill/__pycache__/get_traces.cpython-310.pyc +0 -0
- {quickdistill-0.1.8 → quickdistill-0.1.9}/quickdistill/cli.py +0 -0
- {quickdistill-0.1.8 → quickdistill-0.1.9}/quickdistill/default_judges.json +0 -0
- {quickdistill-0.1.8 → quickdistill-0.1.9}/quickdistill/default_projects/byyoung3_arena-detailed/traces_data.json +0 -0
- {quickdistill-0.1.8 → quickdistill-0.1.9}/quickdistill/get_traces.py +0 -0
- {quickdistill-0.1.8 → quickdistill-0.1.9}/quickdistill/static/judge_manager.html +0 -0
- {quickdistill-0.1.8 → quickdistill-0.1.9}/quickdistill.egg-info/dependency_links.txt +0 -0
- {quickdistill-0.1.8 → quickdistill-0.1.9}/quickdistill.egg-info/entry_points.txt +0 -0
- {quickdistill-0.1.8 → quickdistill-0.1.9}/quickdistill.egg-info/requires.txt +0 -0
- {quickdistill-0.1.8 → quickdistill-0.1.9}/quickdistill.egg-info/top_level.txt +0 -0
- {quickdistill-0.1.8 → quickdistill-0.1.9}/setup.cfg +0 -0
- {quickdistill-0.1.8 → quickdistill-0.1.9}/update.sh +0 -0
--- /dev/null
+++ quickdistill-0.1.9/dev/run_inf_with_providers.py
@@ -0,0 +1,89 @@
+import os
+import anthropic
+
+# ---------------- GEMINI ----------------
+from google import genai
+# ---------------- GROK ----------------
+from xai_sdk import Client as XAIClient
+from xai_sdk.chat import user, system
+
+import weave; weave.init("providers-testing")
+
+
+def run_gemini(prompt: str):
+    client = genai.Client(api_key=os.environ["GEMINI_API_KEY"])
+
+    resp = client.models.generate_content(
+        model="gemini-2.5-flash",
+        contents=[{
+            "role": "user",
+            "parts": [{"text": prompt}]
+        }]
+    )
+
+    return resp.text
+
+
+# ---------------- CLAUDE ----------------
+
+def run_claude(prompt: str):
+    client = anthropic.Anthropic(
+        api_key=os.environ["ANTHROPIC_API_KEY"]
+    )
+
+    msg = client.messages.create(
+        model="claude-haiku-4-5",
+        max_tokens=512,
+        messages=[
+            {"role": "user", "content": prompt}
+        ]
+    )
+
+    return msg.content[0].text
+
+
+
+def run_grok(prompt: str):
+    client = XAIClient(
+        api_key=os.environ["XAI_API_KEY"],
+        timeout=3600
+    )
+
+    chat = client.chat.create(
+        model="grok-4-1-fast-reasoning"
+    )
+
+    chat.append(system("You are Grok, a helpful AI assistant."))
+    chat.append(user(prompt))
+
+    resp = chat.sample()
+
+    return resp.content
+
+
+# ---------------- UNIFIED ROUTER ----------------
+def run_model(provider: str, prompt: str):
+    provider = provider.lower()
+
+    if provider == "gemini":
+        return run_gemini(prompt)
+
+    if provider == "claude":
+        return run_claude(prompt)
+
+    if provider == "grok":
+        return run_grok(prompt)
+
+    raise ValueError(provider)
+
+
+# ---------------- TEST ----------------
+if __name__ == "__main__":
+    prompt = "Explain transformers simply"
+
+    for provider in ["gemini", "claude", "grok"]:
+        try:
+            print(f"\n=== {provider.upper()} ===")
+            print(run_model(provider, prompt))
+        except Exception as e:
+            print(provider, "failed:", e)
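The script above fans one prompt out to Gemini, Claude, and Grok through a single `run_model` router. A minimal sketch of driving it from another script, assuming the same environment variables (GEMINI_API_KEY, ANTHROPIC_API_KEY, XAI_API_KEY) are exported; the prompt and the KeyError handling are illustrative, not part of the package:

    # Hypothetical driver for dev/run_inf_with_providers.py (illustrative only)
    from run_inf_with_providers import run_model

    for provider in ("gemini", "claude", "grok"):
        try:
            answer = run_model(provider, "Summarize attention in one sentence.")
            print(f"{provider}: {answer[:120]}")
        except KeyError as missing:
            # os.environ[...] raises KeyError when a provider's key is not set
            print(f"{provider}: missing credential {missing}")

Note that importing the module also runs `weave.init("providers-testing")`, so every call is traced as a side effect.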
--- /dev/null
+++ quickdistill-0.1.9/dev/v2_run_inf_w_providers.py
@@ -0,0 +1,174 @@
+# pip install openai anthropic google-genai
+
+import os
+
+# ================= OPENAI =================
+from openai import OpenAI
+from google import genai
+import anthropic
+
+import weave; weave.init("providers-testing")
+
+openai_client = OpenAI(api_key=os.environ["OPENAI_API_KEY"])
+
+
+
+
+
+
+
+
+def openai_responses(prompt: str):
+    resp = openai_client.responses.create(
+        model="gpt-5-mini",
+        input=prompt
+    )
+    return resp.output_text
+
+
+def openai_chat(prompt: str):
+    resp = openai_client.chat.completions.create(
+        model="gpt-4.1-mini",
+        messages=[{"role": "user", "content": prompt}]
+    )
+    return resp.choices[0].message.content
+
+
+def openai_stream(prompt: str):
+    print("\n[OpenAI Streaming]")
+    with openai_client.responses.stream(
+        model="gpt-5-mini",
+        input=prompt
+    ) as stream:
+        for event in stream:
+            if event.type == "response.output_text.delta":
+                print(event.delta, end="", flush=True)
+    print()
+
+
+# ================= ANTHROPIC =================
+
+
+anthropic_client = anthropic.Anthropic(
+    api_key=os.environ["ANTHROPIC_API_KEY"]
+)
+
+
+def anthropic_messages(prompt: str):
+    resp = anthropic_client.messages.create(
+        model="claude-haiku-4-5-20251001",
+        max_tokens=512,
+        messages=[{"role": "user", "content": prompt}]
+    )
+    return resp.content[0].text
+
+
+def anthropic_stream(prompt: str):
+    print("\n[Anthropic Streaming]")
+    with anthropic_client.messages.stream(
+        model="claude-haiku-4-5-20251001",
+        max_tokens=512,
+        messages=[{"role": "user", "content": prompt}]
+    ) as stream:
+        for text in stream.text_stream:
+            print(text, end="", flush=True)
+    print()
+
+
+# ================= GEMINI =================
+
+gemini_client = genai.Client(
+    api_key=os.environ["GEMINI_API_KEY"]
+)
+
+
+def gemini_generate(prompt: str):
+    resp = gemini_client.models.generate_content(
+        model="gemini-2.5-flash",
+        contents=[{
+            "role": "user",
+            "parts": [{"text": prompt}]
+        }]
+    )
+    return resp.text
+
+
+def gemini_chat(prompt: str):
+    chat = gemini_client.chats.create(
+        model="gemini-2.5-flash"
+    )
+    resp = chat.send_message(prompt)
+    return resp.text
+
+
+def gemini_stream(prompt: str):
+    print("\n[Gemini Streaming]")
+    chat = gemini_client.chats.create(
+        model="gemini-2.5-flash"
+    )
+    stream = chat.send_message_stream(prompt)
+
+    for chunk in stream:
+        if chunk.text:
+            print(chunk.text, end="", flush=True)
+    print()
+
+
+# ================= TOOL CALL EXAMPLE =================
+# Minimal cross-provider demonstration using OpenAI only
+# (Anthropic/Gemini support tools but schemas differ heavily)
+
+def openai_tool_example():
+
+    tools = [{
+        "type": "function",
+        "function": {
+            "name": "get_weather",
+            "parameters": {
+                "type": "object",
+                "properties": {
+                    "city": {"type": "string"}
+                },
+                "required": ["city"]
+            }
+        }
+    }]
+
+    resp = openai_client.chat.completions.create(
+        model="gpt-4.1-mini",
+        messages=[{"role": "user", "content": "What's weather in Tokyo?"}],
+        tools=tools
+    )
+
+    return resp.choices[0].message.tool_calls
+
+
+# ================= RUN ALL =================
+
+if __name__ == "__main__":
+
+    prompt = "Explain transformers simply."
+
+    # print("\n==== OPENAI RESPONSES ====")
+    # print(openai_responses(prompt))
+
+    # print("\n==== OPENAI CHAT ====")
+    # print(openai_chat(prompt))
+
+    # openai_stream(prompt)
+
+    print("\n==== ANTHROPIC ====")
+    print(anthropic_messages(prompt))
+
+    anthropic_stream(prompt)
+
+    print("\n==== GEMINI GENERATE ====")
+    print(gemini_generate(prompt))
+
+    print("\n==== GEMINI CHAT ====")
+    print(gemini_chat(prompt))
+
+    gemini_stream(prompt)
+
+    print("\n==== OPENAI TOOL CALL ====")
+    print(openai_tool_example())
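Both dev scripts rely on the same one-liner, `import weave; weave.init("providers-testing")`: initializing Weave patches the OpenAI, Anthropic, and google-genai clients so each call above is logged as a trace with no further code changes. A minimal sketch of adding a custom traced step next to the patched calls (the function body is a stand-in, not package code):

    import weave

    weave.init("providers-testing")  # same project name as the dev scripts

    @weave.op()
    def pipeline(prompt: str) -> str:
        # Provider calls made inside an op are nested under its trace
        return prompt.upper()  # stand-in for a real provider call

    pipeline("hello")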
--- /dev/null
+++ quickdistill-0.1.9/quickdistill/__init__.py
@@ -0,0 +1,28 @@
+"""
+QuickDistill - A fast and easy toolkit for distilling AI models.
+
+This package provides tools to:
+- Capture and view Weave traces
+- Run weak models on strong model outputs
+- Evaluate similarity using LLM judges
+- Export datasets for model evaluation
+"""
+
+# Monkey patch for aiohttp/litellm compatibility
+# litellm expects aiohttp.ConnectionTimeoutError but it doesn't exist in some versions
+try:
+    import aiohttp
+    if not hasattr(aiohttp, 'ConnectionTimeoutError'):
+        aiohttp.ConnectionTimeoutError = aiohttp.ServerTimeoutError
+    if not hasattr(aiohttp, 'SocketTimeoutError'):
+        aiohttp.SocketTimeoutError = aiohttp.ServerTimeoutError
+except Exception:
+    pass
+
+__version__ = "0.1.9"
+__author__ = "Brett Young"
+__email__ = "bdytx5@umsystem.edu"
+
+from quickdistill.cli import main
+
+__all__ = ["main"]
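The try/except block at the top of the new `__init__.py` works around an aiohttp/litellm version mismatch: litellm references `aiohttp.ConnectionTimeoutError` and `aiohttp.SocketTimeoutError`, which some aiohttp releases do not define, so the package aliases both to `aiohttp.ServerTimeoutError` before litellm can fail on the lookup. A quick sketch of the observable effect, assuming aiohttp is installed:

    import quickdistill  # the compatibility patch runs at import time
    import aiohttp

    # Both names now resolve, whether aiohttp defined them or the patch did
    assert issubclass(aiohttp.ConnectionTimeoutError, Exception)
    assert issubclass(aiohttp.SocketTimeoutError, Exception)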
Binary file
Binary file
--- quickdistill-0.1.8/quickdistill/server.py
+++ quickdistill-0.1.9/quickdistill/server.py
@@ -100,40 +100,133 @@ def run_inference(client, model, messages, max_tokens=1000):
         return f"ERROR: {str(e)}"

 def extract_output_content(output_str):
-    """Extract actual content from WeaveObject string or regular output
+    """Extract actual content from WeaveObject string, JSON response, or regular output.
+
+    Handles outputs from:
+    - OpenAI chat.completions.create (plain text)
+    - OpenAI responses.create (JSON with nested structure)
+    - Anthropic Messages (WeaveObject with content[0].text)
+    - Google Gemini (WeaveObject with candidates[0].content.parts[0].text)
+    """
+    import re
+    import json
+
     if not output_str:
         return None

-
-
-
-
-
+    if not isinstance(output_str, str):
+        return str(output_str)
+
+    # Handle empty/streaming responses
+    if output_str in ('', 'None', 'null'):
+        return '[Streaming output - not captured]'
+
+    # Handle OpenAI responses.create JSON format
+    if output_str.startswith('{') and '"output"' in output_str:
+        try:
+            resp_obj = json.loads(output_str)
+            if 'output' in resp_obj and isinstance(resp_obj['output'], list):
+                # Extract text from output messages
+                text_parts = []
+                for item in resp_obj['output']:
+                    if item.get('type') == 'message' and 'content' in item:
+                        for content in item['content']:
+                            if content.get('type') == 'output_text' and 'text' in content:
+                                text_parts.append(content['text'])
+                if text_parts:
+                    return '\n\n'.join(text_parts)
+        except (json.JSONDecodeError, KeyError, TypeError):
+            pass  # Fall through to other handlers
+
+    # Handle WeaveObject strings (Anthropic, Gemini)
+    if 'WeaveObject' in output_str:
+        # Improved regex that handles escape sequences properly
+        match = re.search(r"'text':\s*'((?:[^'\\]|\\.)*)'", output_str, re.DOTALL)
         if match:
-            # Unescape the string
+            # Unescape the string properly (order matters!)
             text = match.group(1)
-            text = text.replace(
+            text = text.replace("\\'", "'")    # escaped single quotes
+            text = text.replace('\\"', '"')    # escaped double quotes
+            text = text.replace('\\n', '\n')   # newlines
+            text = text.replace('\\t', '\t')   # tabs
+            text = text.replace('\\r', '\r')   # carriage returns
+            text = text.replace('\\\\', '\\')  # escaped backslashes (do this last!)
             return text

-
+        # If no text field found, return truncated version
+        return f"[Complex WeaveObject - could not extract text]\n{output_str[:500]}..."
+
+    # Plain text output (standard OpenAI chat format)
     return output_str


 def extract_messages_from_trace(trace):
-    """Extract messages from a trace in the format needed for inference
-
+    """Extract messages from a trace in the format needed for inference.
+
+    Handles message extraction from:
+    - OpenAI chat.completions.create (messages at top level or in inputs.messages)
+    - OpenAI responses.create (inputs.input field)
+    - Anthropic Messages (inputs.messages)
+    - Google Gemini generate_content (inputs.contents array)
+    - Google Gemini Chat.send_message (inputs.message string)
+    """
+    import re
+
+    # Get op_display_name for provider detection
+    op_name = trace.get('op_display_name', '')
+
+    # Check if messages are at top level (already extracted/cached)
     if trace.get('messages') and isinstance(trace['messages'], list) and len(trace['messages']) > 0:
         return trace['messages']

     # Check if messages are in inputs
     if trace.get('inputs') and isinstance(trace['inputs'], dict):
-
+        inputs = trace['inputs']
+
+        # Standard OpenAI/Anthropic: inputs.messages
+        messages = inputs.get('messages', [])
         if isinstance(messages, list) and len(messages) > 0:
             return messages

+        # OpenAI responses.create: inputs.input (simple string)
+        if 'openai.responses' in op_name and 'input' in inputs:
+            return [{"role": "user", "content": inputs['input']}]
+
+        # Gemini Chat.send_message: inputs.message (simple string)
+        if 'Chat.send_message' in op_name and 'message' in inputs:
+            return [{"role": "user", "content": inputs['message']}]
+
+        # Gemini generate_content: inputs.contents (array of content objects or WeaveObject strings)
+        if 'google.genai' in op_name and 'contents' in inputs:
+            contents = inputs['contents']
+            if isinstance(contents, list) and len(contents) > 0:
+                messages = []
+                for content in contents:
+                    # Handle WeaveObject string format
+                    if isinstance(content, str) and 'WeaveObject' in content:
+                        role_match = re.search(r"'role':\s*'(\w+)'", content)
+                        text_match = re.search(r"'text':\s*'((?:[^'\\]|\\.)*)'", content, re.DOTALL)
+                        text = '[Complex content]'
+                        if text_match:
+                            text = text_match.group(1)
+                            text = text.replace("\\'", "'").replace('\\n', '\n').replace('\\\\', '\\')
+                        messages.append({
+                            "role": role_match.group(1) if role_match else "user",
+                            "content": text
+                        })
+                    # Handle regular dict format
+                    elif isinstance(content, dict):
+                        role = content.get('role', 'user')
+                        parts = content.get('parts', [])
+                        if isinstance(parts, list):
+                            text = '\n'.join([p.get('text', '') for p in parts if isinstance(p, dict)])
+                            messages.append({"role": role, "content": text})
+                if messages:
+                    return messages
+
     # Check if inputs has question/context format (from generate_test_traces.py wrapper traces)
-    question =
-    context =
+        question = inputs.get('question')
+        context = inputs.get('context')
         if question:
             if context:
                 prompt = f"""Based on the following context, answer the question concisely.
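The reworked `extract_output_content` dispatches on the shape of the captured string: JSON from `responses.create`, WeaveObject reprs from Anthropic and Gemini, sentinel values from streaming calls, and plain text otherwise. A sketch with fabricated inputs (these sample strings are illustrative, not real Weave captures, and assume quickdistill's server dependencies are importable):

    from quickdistill.server import extract_output_content

    # WeaveObject-style repr with an escaped newline (fabricated sample)
    weave_str = "WeaveObject(content=[{'type': 'text', 'text': 'Hi\\nthere'}])"
    print(extract_output_content(weave_str))   # 'Hi' + newline + 'there'

    # OpenAI responses.create JSON capture (fabricated sample)
    resp_json = '{"output": [{"type": "message", "content": [{"type": "output_text", "text": "42"}]}]}'
    print(extract_output_content(resp_json))   # '42'

    print(extract_output_content("null"))      # '[Streaming output - not captured]'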
@@ -753,16 +846,26 @@ def delete_judge():

 @app.route('/run_evaluation', methods=['POST'])
 def run_evaluation_endpoint():
-    """Run evaluation using specified judge"""
-
+    """Run evaluation using specified judge(s) - supports multiple judges"""
+

     data = request.json
     model_file = data.get('model_file')
-
+    judges = data.get('judges')  # Can be a list or single judge dict
     task_id = data.get('task_id', f"eval_{id(data)}")

-
-
+    # Handle both single judge (backwards compat) and multiple judges
+    if data.get('judge'):
+        judges = [data.get('judge')]
+    elif not judges:
+        return jsonify({'error': 'Missing judge or judges'}), 400
+
+    # Ensure judges is a list
+    if not isinstance(judges, list):
+        judges = [judges]
+
+    if not model_file:
+        return jsonify({'error': 'Missing model_file'}), 400

     # Load weak model results
     model_path = DATA_DIR / model_file
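With this change, `/run_evaluation` accepts either the legacy single `judge` object or a `judges` list. A sketch of a multi-judge request; the host/port and the judge definitions are assumptions (the handler reads `name` and, later, `type`, while real judge objects carry more fields):

    import requests

    payload = {
        "model_file": "weak_model_google_gemini-2.5-flash.json",  # data file shipped in dev/
        "judges": [
            {"name": "similarity", "type": "llm"},
            {"name": "exact_match", "type": "custom"},
        ],
    }
    resp = requests.post("http://localhost:5000/run_evaluation", json=payload)
    print(resp.json())  # includes evaluation_name, judges, weave_url, task_id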
@@ -782,18 +885,22 @@ def run_evaluation_endpoint():
     # Extract model name from filename
     model_name = model_file.replace('weak_model_', '').replace('.json', '')

+    # Create evaluation name with all judges
+    judges_names = '_'.join([j['name'] for j in judges])
+    eval_name = f"eval-{model_name}-{judges_names}"
+
     # Initialize progress tracking
     total_steps = len(results)
     progress_state[task_id] = {
         'current': 0,
         'total': total_steps,
-        'message': f'Starting evaluation: {model_name} with {judge
+        'message': f'Starting evaluation: {model_name} with {len(judges)} judge(s)...',
         'status': 'running'
     }

     # Create evaluation logger
     ev = weave.EvaluationLogger(
-        name=
+        name=eval_name,
         model=model_name
     )

@@ -818,13 +925,20 @@ def run_evaluation_endpoint():
         if messages and len(messages) > 0:
             question = messages[0].get('content', '')

-            # Run
-
-
-
-
+            # Run all judges and collect scores
+            all_scores = {}
+            for judge in judges:
+                # Run judge
+                if judge['type'] == 'llm':
+                    scores = run_llm_judge_eval(judge, strong_output, weak_output, question)
+                else:
+                    scores = run_custom_judge_eval(judge, strong_output, weak_output)
+
+                # Merge scores with judge name prefix to avoid conflicts
+                for score_key, score_value in scores.items():
+                    all_scores[f"{judge['name']}_{score_key}"] = score_value

-            # Log to weave
+            # Log to weave with all scores from all judges
             ev.log_example(
                 inputs={
                     "question": question,
@@ -834,7 +948,7 @@ def run_evaluation_endpoint():
                     "weak_output": weak_output

                 },
-                scores=
+                scores=all_scores
             )

             # Finish evaluation
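The merge step namespaces every score under its judge's name, so two judges that both emit a `score` key cannot clobber each other. A toy illustration of the prefixing, with invented values:

    all_scores = {}
    for judge, scores in [({"name": "similarity"}, {"score": 0.8}),
                          ({"name": "accuracy"}, {"score": 1.0})]:
        for score_key, score_value in scores.items():
            all_scores[f"{judge['name']}_{score_key}"] = score_value

    print(all_scores)  # {'similarity_score': 0.8, 'accuracy_score': 1.0}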
@@ -850,10 +964,11 @@ def run_evaluation_endpoint():

     return jsonify({
         'status': 'success',
-        'evaluation_name':
+        'evaluation_name': eval_name,
         'examples_evaluated': len(results),
         'weave_url': ev.ui_url,
         'strong_export': strong_export,
+        'judges': [j['name'] for j in judges],
         'task_id': task_id
     })

@@ -1032,6 +1147,32 @@ def list_projects():
     return jsonify({'projects': projects})


+@app.route('/get_preferences', methods=['GET'])
+def get_preferences():
+    """Get saved user preferences"""
+    prefs_file = DATA_DIR / 'preferences.json'
+    if prefs_file.exists():
+        try:
+            with open(prefs_file, 'r') as f:
+                return jsonify(json.load(f))
+        except:
+            pass
+    return jsonify({})
+
+
+@app.route('/save_preferences', methods=['POST'])
+def save_preferences():
+    """Save user preferences"""
+    try:
+        data = request.json
+        prefs_file = DATA_DIR / 'preferences.json'
+        with open(prefs_file, 'w') as f:
+            json.dump(data, f, indent=2)
+        return jsonify({'status': 'success'})
+    except Exception as e:
+        return jsonify({'status': 'error', 'message': str(e)}), 500
+
+
 # Routes for serving HTML pages
 @app.route('/')
 def index():