aiqa-client 0.5.2-py3-none-any.whl → 0.6.1-py3-none-any.whl
This diff shows the changes between two publicly released versions of the package, as they appear in their public registry, and is provided for informational purposes only.
- aiqa/__init__.py +8 -2
- aiqa/client.py +17 -2
- aiqa/constants.py +1 -1
- aiqa/experiment_runner.py +248 -77
- aiqa/llm_as_judge.py +281 -0
- aiqa/span_helpers.py +511 -0
- aiqa/tracing.py +169 -561
- aiqa/tracing_llm_utils.py +20 -9
- aiqa/types.py +61 -0
- {aiqa_client-0.5.2.dist-info → aiqa_client-0.6.1.dist-info}/METADATA +1 -1
- aiqa_client-0.6.1.dist-info/RECORD +17 -0
- {aiqa_client-0.5.2.dist-info → aiqa_client-0.6.1.dist-info}/WHEEL +1 -1
- aiqa_client-0.5.2.dist-info/RECORD +0 -14
- {aiqa_client-0.5.2.dist-info → aiqa_client-0.6.1.dist-info}/licenses/LICENSE.txt +0 -0
- {aiqa_client-0.5.2.dist-info → aiqa_client-0.6.1.dist-info}/top_level.txt +0 -0
aiqa/llm_as_judge.py
ADDED
@@ -0,0 +1,281 @@
+"""
+LLM-as-judge scoring functionality for evaluating outputs.
+"""
+
+import os
+import json
+import re
+import asyncio
+import requests
+from typing import Any, Dict, Optional, Callable, Awaitable
+from .types import MetricResult, Metric, Example, CallLLMType
+
+def parse_llm_response(content: str) -> Optional[MetricResult]:
+    """
+    Parse LLM response content string into score and message.
+
+    Args:
+        content: Raw content string from LLM response
+
+    Returns:
+        MetricResult object with score:[0,1], message (optional), and error (optional)
+    """
+    try:
+        result = json.loads(content)
+        score = result.get("score")
+        if not score:
+            return None
+        message = result.get("message")
+        # Ensure score is in [0, 1] range
+        score = max(0.0, min(1.0, float(score)))
+        return {"score": score, "message": message}
+    except (json.JSONDecodeError, ValueError, TypeError) as e:
+        print(f"Failed to parse JSON response: {e}")
+        # Fallback: try to extract score from text
+        raise Exception(f"Could not parse LLM response: {content}")
+
+
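For orientation, a minimal sketch (not part of the wheel; the sample replies are made up) of how this parser handles a few hand-written judge responses. Note that a literal score of 0 is treated as missing by the falsy check:

# Illustrative only, not shipped in the package.
from aiqa.llm_as_judge import parse_llm_response

print(parse_llm_response('{"score": 1.4, "message": "clamped"}'))  # {'score': 1.0, 'message': 'clamped'}
print(parse_llm_response('{"score": "0.5"}'))                      # {'score': 0.5, 'message': None}
print(parse_llm_response('{"score": 0}'))                          # None: a zero score is falsy, so it is treated as missing
# parse_llm_response("not json")                                   # would log the JSON error, then raise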
+async def get_model_from_server(
+    model_id: str, server_url: str, headers: Dict[str, str]
+) -> Optional[Dict[str, Any]]:
+    """
+    Fetch a model from the server with API key included.
+
+    Args:
+        model_id: ID of the model to fetch
+        server_url: URL of the AIQA server
+        headers: HTTP headers for authentication
+
+    Returns:
+        Model dictionary with api_key if available, None if not found or error
+    """
+    try:
+        def _do_request():
+            return requests.get(
+                f"{server_url}/model/{model_id}?fields=api_key",
+                headers=headers,
+            )
+
+        response = await asyncio.to_thread(_do_request)
+        if response.ok:
+            model = response.json()
+            if model.get("api_key"):
+                return model
+        return None
+    except Exception as e:
+        print(f"Warning: Could not fetch model from server: {e}")
+        return None
+
+
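A rough usage sketch (not part of the diff); the server URL, model id, and auth header below are placeholders rather than values taken from the package:

# Illustrative only. Real values come from the client configuration.
import asyncio
from aiqa.llm_as_judge import get_model_from_server

async def main():
    model = await get_model_from_server(
        model_id="judge-model-id",                     # hypothetical ID
        server_url="https://aiqa.example.com/api",     # placeholder URL
        headers={"Authorization": "Bearer <token>"},   # placeholder auth header
    )
    print(model)  # the model dict when it carries an api_key, otherwise None

asyncio.run(main())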
+async def call_openai(
+    system_prompt: str, user_message: str, api_key: str, output_schema: str = None
+) -> str:
+    """Call OpenAI API for LLM-as-judge scoring. Returns raw content string."""
+    def _do_request():
+        # TODO send output_schema if provided
+        return requests.post(
+            "https://api.openai.com/v1/chat/completions",
+            headers={
+                "Content-Type": "application/json",
+                "Authorization": f"Bearer {api_key}",
+            },
+            json={
+                "model": "gpt-4o-mini",  # Default model, can be made configurable
+                "messages": [
+                    {"role": "system", "content": system_prompt},
+                    {"role": "user", "content": user_message},
+                ],
+                "temperature": 0,
+                "response_format": {"type": "json_object"},
+            },
+        )
+
+    response = await asyncio.to_thread(_do_request)
+
+    if not response.ok:
+        error_text = response.text
+        raise Exception(
+            f"OpenAI API error: {response.status_code} {response.reason} - {error_text}"
+        )
+
+    data = response.json()
+    choices = data.get("choices", [])
+    if not choices or not isinstance(choices, list):
+        raise Exception("OpenAI API did not return choices")
+    content = choices[0].get("message", {}).get("content")
+    if not content:
+        raise Exception("OpenAI API did not return content")
+
+    print(f"OpenAI raw response: {content[:500]}...")  # Show first 500 chars
+    return content
+
+
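A hedged sketch of a one-off judge call through this helper (not part of the wheel; the prompts are made up, and it needs a real key plus network access):

# Illustrative only.
import asyncio
import os
from aiqa.llm_as_judge import call_openai, parse_llm_response

async def main():
    content = await call_openai(
        system_prompt='Return ONLY JSON {"score": number, "message": string}. Score the OUTPUT for brevity.',
        user_message="<OUTPUT>The answer is 42.</OUTPUT>",
        api_key=os.environ["OPENAI_API_KEY"],
    )
    print(parse_llm_response(content))

asyncio.run(main())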
+async def call_anthropic(
+    system_prompt: str, user_message: str, api_key: str
+) -> str:
+    """Call Anthropic (Claude) API for LLM-as-judge scoring. Returns raw content string."""
+    def _do_request():
+        return requests.post(
+            "https://api.anthropic.com/v1/messages",
+            headers={
+                "Content-Type": "application/json",
+                "x-api-key": api_key,
+                "anthropic-version": "2023-06-01",
+            },
+            json={
+                "model": "claude-3-5-sonnet-20241022",  # Default model
+                "max_tokens": 1024,
+                "temperature": 0,
+                "system": system_prompt,
+                "messages": [{"role": "user", "content": user_message}],
+            },
+        )
+
+    response = await asyncio.to_thread(_do_request)
+
+    if not response.ok:
+        error_text = response.text
+        raise Exception(
+            f"Anthropic API error: {response.status_code} {response.reason} - {error_text}"
+        )
+
+    data = response.json()
+    content_list = data.get("content", [])
+    if not content_list or not isinstance(content_list, list):
+        raise Exception("Anthropic API did not return content")
+    content = content_list[0].get("text", "")
+    if not content:
+        raise Exception("Anthropic API did not return content")
+
+    print(f"Anthropic raw response: {content[:500]}...")  # Show first 500 chars
+    return content
+
+
+async def call_llm_fallback(
+    system_prompt: str,
+    user_message: str,
+    api_key: Optional[str] = None,
+    provider: Optional[str] = None,
+) -> str:
+    """
+    Fallback LLM call function that checks for API key parameter or environment variables.
+
+    Args:
+        system_prompt: System prompt for the LLM
+        user_message: User message containing the output to score
+        api_key: Optional API key to use (takes precedence over env vars)
+        provider: Optional provider name ('openai', 'anthropic', etc.) to determine which API to call
+
+    Returns:
+        Dictionary with "score" (float 0-1) and "message" (str)
+    """
+    # If API key provided, use it with the specified provider
+    if api_key:
+        if provider == 'openai' or provider is None:
+            content = await call_openai(system_prompt, user_message, api_key)
+        elif provider == 'anthropic':
+            content = await call_anthropic(system_prompt, user_message, api_key)
+        else:
+            # Try OpenAI first, then Anthropic for unknown providers
+            try:
+                content = await call_openai(system_prompt, user_message, api_key)
+            except Exception:
+                content = await call_anthropic(system_prompt, user_message, api_key)
+    else:
+        # Fallback to environment variables
+        openai_key = os.getenv("OPENAI_API_KEY")
+        anthropic_key = os.getenv("ANTHROPIC_API_KEY")
+
+        if openai_key:
+            print("Using OpenAI API (from OPENAI_API_KEY env var)")
+            content = await call_openai(system_prompt, user_message, openai_key)
+        elif anthropic_key:
+            print("Using Anthropic API (from ANTHROPIC_API_KEY env var)")
+            content = await call_anthropic(system_prompt, user_message, anthropic_key)
+        else:
+            raise Exception(
+                "No LLM API key found. Either:\n"
+                " - Specify a model in metric.parameters.model (to use server-stored API key), or\n"
+                " - Set OPENAI_API_KEY or ANTHROPIC_API_KEY environment variable (for local API key)"
+            )
+    return content
+
+
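A sketch of how the fallback routes between providers (not part of the diff; the key is a placeholder and a real call needs network access). Note that, despite the docstring's mention of a dictionary, the function returns the raw content string for parse_llm_response to handle:

# Illustrative only.
import asyncio
from aiqa.llm_as_judge import call_llm_fallback

async def main():
    # With an explicit key, provider picks the backend ('openai' or None -> call_openai,
    # 'anthropic' -> call_anthropic). Without a key it falls back to OPENAI_API_KEY /
    # ANTHROPIC_API_KEY and raises if neither is set.
    content = await call_llm_fallback(
        system_prompt='Return ONLY JSON {"score": number, "message": string}.',
        user_message="<OUTPUT>Hello there.</OUTPUT>",
        api_key="sk-placeholder",   # placeholder, not a real key
        provider="anthropic",
    )
    print(content)  # raw JSON string from the judge model

asyncio.run(main())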
+async def score_llm_metric_local(
+    input_data: Any,
+    output: Any,
+    example: Example,
+    metric: Metric,
+    llm_call_fn: Optional[CallLLMType] = None,
+) -> MetricResult:
+    """
+    Score an LLM-as-judge metric.
+
+    Args:
+        input_data: The input data to score
+        output: The output to score
+        example: The example object
+        metric: The metric definition
+        llm_call_fn: Optional async function that takes (system_prompt, user_message) and returns
+            raw content string (typically JSON). If not provided, will use fallback.
+
+    Returns:
+        MetricResult object with score:[0,1], message (optional), and error (optional)
+    """
+    # Build system prompt from metric description or prompt field
+    metric_prompt = metric.get("prompt")
+    if metric_prompt:
+        system_prompt = metric_prompt
+    else:
+        metric_text = metric.get("description", "") or metric.get("name", "")
+        system_prompt = f"""You are a judge evaluating AI assistant OUTPUTs for this metric:
+{metric_text}
+
+You MUST respond with ONLY a valid JSON object (no other text before or after) containing:
+- "score": a number between 0 and 1 (where 1 is best)
+- "message": a brief explanation of your scoring
+
+Example response format:
+{{"score": 0.75, "message": "The response was helpful but could be more concise"}}
+
+Be strict but fair in your evaluation."""
+
+    if isinstance(input_data, dict):
+        input_text = json.dumps(input_data)
+    else:
+        input_text = str(input_data)
+
+    if isinstance(output, dict):
+        output_text = json.dumps(output)
+    else:
+        output_text = str(output)
+
+    # Build user message with the input and output to score
+    user_message = f"""<INPUT>:
+{input_text}
+</INPUT>
+<OUTPUT>
+{output_text}
+</OUTPUT>
+Evaluate this OUTPUT according to the metric and return ONLY a valid JSON object {{"score": number, "message": string}}."""
+
+    # Debug: print what we're sending to the LLM
+    print(f"DEBUG: Sending to LLM scorer:")
+    print(f" System prompt: {system_prompt[:200]}...")
+    print(f" User message: {user_message[:500]}...")
+    print(f" Example input extracted: {repr(input_text[:100])}")
+
+    # Call LLM
+    print(f"Calling LLM to score metric '{metric.get('name')}'...")
+    if llm_call_fn:
+        result = await llm_call_fn(system_prompt, user_message)
+    else:
+        # Note: api_key and provider should be handled by the caller if model is specified
+        # This fallback uses environment variables
+        result = await call_llm_fallback(system_prompt, user_message)
+
+    score_result = parse_llm_response(result)
+    if not score_result:
+        raise Exception(f"Failed to parse LLM response for metric {metric.get('id', 'unknown')}: {result}")
+    return score_result
+
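Putting the new module together, a hedged end-to-end sketch (not part of the wheel): it scores one output with a stubbed judge passed as llm_call_fn, so no provider key or network access is needed. The metric fields, payloads, and the empty example dict are made up for illustration.

# Illustrative only.
import asyncio
import json
from aiqa.llm_as_judge import score_llm_metric_local

async def fake_judge(system_prompt: str, user_message: str) -> str:
    # Stand-in for a real LLM call; returns the JSON shape the parser expects.
    return json.dumps({"score": 0.8, "message": "clear and concise"})

async def main():
    result = await score_llm_metric_local(
        input_data={"question": "What is 2 + 2?"},   # made-up payload
        output="4",
        example={},                                   # placeholder; not used by the local scorer
        metric={"name": "conciseness", "description": "Reward short, correct answers"},
        llm_call_fn=fake_judge,
    )
    print(result)  # {'score': 0.8, 'message': 'clear and concise'}

asyncio.run(main())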