aiqa-client 0.4.7__py3-none-any.whl → 0.6.1__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
aiqa/llm_as_judge.py ADDED
@@ -0,0 +1,281 @@
+ """
+ LLM-as-judge scoring functionality for evaluating outputs.
+ """
+
+ import os
+ import json
+ import re
+ import asyncio
+ import requests
+ from typing import Any, Dict, Optional, Callable, Awaitable
+ from .types import MetricResult, Metric, Example, CallLLMType
+
+ def parse_llm_response(content: str) -> Optional[MetricResult]:
+     """
+     Parse LLM response content string into score and message.
+
+     Args:
+         content: Raw content string from LLM response
+
+     Returns:
+         MetricResult object with score:[0,1], message (optional), and error (optional)
+     """
+     try:
+         result = json.loads(content)
+         score = result.get("score")
+         if score is None:
+             return None
+         message = result.get("message")
+         # Clamp score to the [0, 1] range
+         score = max(0.0, min(1.0, float(score)))
+         return {"score": score, "message": message}
+     except (json.JSONDecodeError, ValueError, TypeError) as e:
+         print(f"Failed to parse JSON response: {e}")
+         # No text fallback is attempted; surface the unparseable content to the caller
+         raise Exception(f"Could not parse LLM response: {content}")
+
+
+ async def get_model_from_server(
+     model_id: str, server_url: str, headers: Dict[str, str]
+ ) -> Optional[Dict[str, Any]]:
+     """
+     Fetch a model from the server with API key included.
+
+     Args:
+         model_id: ID of the model to fetch
+         server_url: URL of the AIQA server
+         headers: HTTP headers for authentication
+
+     Returns:
+         Model dictionary with api_key if available, None if not found or error
+     """
+     try:
+         def _do_request():
+             return requests.get(
+                 f"{server_url}/model/{model_id}?fields=api_key",
+                 headers=headers,
+             )
+
+         response = await asyncio.to_thread(_do_request)
+         if response.ok:
+             model = response.json()
+             if model.get("api_key"):
+                 return model
+         return None
+     except Exception as e:
+         print(f"Warning: Could not fetch model from server: {e}")
+         return None
+
+
+ async def call_openai(
+     system_prompt: str, user_message: str, api_key: str, output_schema: Optional[str] = None
+ ) -> str:
+     """Call OpenAI API for LLM-as-judge scoring. Returns raw content string."""
+     def _do_request():
+         # TODO send output_schema if provided
+         return requests.post(
+             "https://api.openai.com/v1/chat/completions",
+             headers={
+                 "Content-Type": "application/json",
+                 "Authorization": f"Bearer {api_key}",
+             },
+             json={
+                 "model": "gpt-4o-mini",  # Default model, can be made configurable
+                 "messages": [
+                     {"role": "system", "content": system_prompt},
+                     {"role": "user", "content": user_message},
+                 ],
+                 "temperature": 0,
+                 "response_format": {"type": "json_object"},
+             },
+         )
+
+     response = await asyncio.to_thread(_do_request)
+
+     if not response.ok:
+         error_text = response.text
+         raise Exception(
+             f"OpenAI API error: {response.status_code} {response.reason} - {error_text}"
+         )
+
+     data = response.json()
+     choices = data.get("choices", [])
+     if not choices or not isinstance(choices, list):
+         raise Exception("OpenAI API did not return choices")
+     content = choices[0].get("message", {}).get("content")
+     if not content:
+         raise Exception("OpenAI API did not return content")
+
+     print(f"OpenAI raw response: {content[:500]}...")  # Show first 500 chars
+     return content
+
+
+ async def call_anthropic(
+     system_prompt: str, user_message: str, api_key: str
+ ) -> str:
+     """Call Anthropic (Claude) API for LLM-as-judge scoring. Returns raw content string."""
+     def _do_request():
+         return requests.post(
+             "https://api.anthropic.com/v1/messages",
+             headers={
+                 "Content-Type": "application/json",
+                 "x-api-key": api_key,
+                 "anthropic-version": "2023-06-01",
+             },
+             json={
+                 "model": "claude-3-5-sonnet-20241022",  # Default model
+                 "max_tokens": 1024,
+                 "temperature": 0,
+                 "system": system_prompt,
+                 "messages": [{"role": "user", "content": user_message}],
+             },
+         )
+
+     response = await asyncio.to_thread(_do_request)
+
+     if not response.ok:
+         error_text = response.text
+         raise Exception(
+             f"Anthropic API error: {response.status_code} {response.reason} - {error_text}"
+         )
+
+     data = response.json()
+     content_list = data.get("content", [])
+     if not content_list or not isinstance(content_list, list):
+         raise Exception("Anthropic API did not return content")
+     content = content_list[0].get("text", "")
+     if not content:
+         raise Exception("Anthropic API did not return content")
+
+     print(f"Anthropic raw response: {content[:500]}...")  # Show first 500 chars
+     return content
+
+
+ async def call_llm_fallback(
+     system_prompt: str,
+     user_message: str,
+     api_key: Optional[str] = None,
+     provider: Optional[str] = None,
+ ) -> str:
+     """
+     Fallback LLM call function that checks for API key parameter or environment variables.
+
+     Args:
+         system_prompt: System prompt for the LLM
+         user_message: User message containing the output to score
+         api_key: Optional API key to use (takes precedence over env vars)
+         provider: Optional provider name ('openai', 'anthropic', etc.) to determine which API to call
+
+     Returns:
+         Raw content string returned by the LLM (typically a JSON object)
+     """
+     # If API key provided, use it with the specified provider
+     if api_key:
+         if provider == 'openai' or provider is None:
+             content = await call_openai(system_prompt, user_message, api_key)
+         elif provider == 'anthropic':
+             content = await call_anthropic(system_prompt, user_message, api_key)
+         else:
+             # Try OpenAI first, then Anthropic for unknown providers
+             try:
+                 content = await call_openai(system_prompt, user_message, api_key)
+             except Exception:
+                 content = await call_anthropic(system_prompt, user_message, api_key)
+     else:
+         # Fall back to environment variables
+         openai_key = os.getenv("OPENAI_API_KEY")
+         anthropic_key = os.getenv("ANTHROPIC_API_KEY")
+
+         if openai_key:
+             print("Using OpenAI API (from OPENAI_API_KEY env var)")
+             content = await call_openai(system_prompt, user_message, openai_key)
+         elif anthropic_key:
+             print("Using Anthropic API (from ANTHROPIC_API_KEY env var)")
+             content = await call_anthropic(system_prompt, user_message, anthropic_key)
+         else:
+             raise Exception(
+                 "No LLM API key found. Either:\n"
+                 " - Specify a model in metric.parameters.model (to use server-stored API key), or\n"
+                 " - Set OPENAI_API_KEY or ANTHROPIC_API_KEY environment variable (for local API key)"
+             )
+     return content
+
+
+ async def score_llm_metric_local(
+     input_data: Any,
+     output: Any,
+     example: Example,
+     metric: Metric,
+     llm_call_fn: Optional[CallLLMType] = None,
+ ) -> MetricResult:
+     """
+     Score an LLM-as-judge metric.
+
+     Args:
+         input_data: The input data to score
+         output: The output to score
+         example: The example object
+         metric: The metric definition
+         llm_call_fn: Optional async function that takes (system_prompt, user_message) and returns
+             raw content string (typically JSON). If not provided, will use fallback.
+
+     Returns:
+         MetricResult object with score:[0,1], message (optional), and error (optional)
+     """
+     # Build system prompt from metric description or prompt field
+     metric_prompt = metric.get("prompt")
+     if metric_prompt:
+         system_prompt = metric_prompt
+     else:
+         metric_text = metric.get("description", "") or metric.get("name", "")
+         system_prompt = f"""You are a judge evaluating AI assistant OUTPUTs for this metric:
+ {metric_text}
+
+ You MUST respond with ONLY a valid JSON object (no other text before or after) containing:
+ - "score": a number between 0 and 1 (where 1 is best)
+ - "message": a brief explanation of your scoring
+
+ Example response format:
+ {{"score": 0.75, "message": "The response was helpful but could be more concise"}}
+
+ Be strict but fair in your evaluation."""
+
+     if isinstance(input_data, dict):
+         input_text = json.dumps(input_data)
+     else:
+         input_text = str(input_data)
+
+     if isinstance(output, dict):
+         output_text = json.dumps(output)
+     else:
+         output_text = str(output)
+
+     # Build user message with the input and output to score
+     user_message = f"""<INPUT>:
+ {input_text}
+ </INPUT>
+ <OUTPUT>
+ {output_text}
+ </OUTPUT>
+ Evaluate this OUTPUT according to the metric and return ONLY a valid JSON object {{"score": number, "message": string}}."""
+
+     # Debug: print what we're sending to the LLM
+     print(f"DEBUG: Sending to LLM scorer:")
+     print(f" System prompt: {system_prompt[:200]}...")
+     print(f" User message: {user_message[:500]}...")
+     print(f" Example input extracted: {repr(input_text[:100])}")
+
+     # Call LLM
+     print(f"Calling LLM to score metric '{metric.get('name')}'...")
+     if llm_call_fn:
+         result = await llm_call_fn(system_prompt, user_message)
+     else:
+         # Note: api_key and provider should be handled by the caller if model is specified
+         # This fallback uses environment variables
+         result = await call_llm_fallback(system_prompt, user_message)
+
+     score_result = parse_llm_response(result)
+     if not score_result:
+         raise Exception(f"Failed to parse LLM response for metric {metric.get('id', 'unknown')}: {result}")
+     return score_result
+
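
A minimal usage sketch of the new score_llm_metric_local entry point, assuming the environment-variable fallback path (OPENAI_API_KEY or ANTHROPIC_API_KEY set); the metric dict below is a placeholder carrying only the fields the function itself reads (id, name, description), and the empty example dict stands in for an Example object:

import asyncio
from aiqa.llm_as_judge import score_llm_metric_local

# Placeholder metric definition; only "prompt"/"description"/"name"/"id" are read by the scorer.
metric = {
    "id": "helpfulness",
    "name": "Helpfulness",
    "description": "Does the OUTPUT answer the INPUT accurately and helpfully?",
}

async def main():
    # With no llm_call_fn, scoring falls back to call_llm_fallback, which needs
    # OPENAI_API_KEY or ANTHROPIC_API_KEY in the environment.
    result = await score_llm_metric_local(
        input_data={"question": "What is the capital of France?"},
        output="The capital of France is Paris.",
        example={},  # placeholder Example; not inspected on this code path
        metric=metric,
    )
    print(result)  # e.g. {"score": 1.0, "message": "Accurate and directly answers the question"}

asyncio.run(main())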