aiqa-client 0.5.2__py3-none-any.whl → 0.7.0__py3-none-any.whl

This diff shows the changes between two publicly released versions of the package, as they appear in the supported public registries. It is provided for informational purposes only.
aiqa/llm_as_judge.py ADDED
@@ -0,0 +1,282 @@
+"""
+LLM-as-judge scoring functionality for evaluating outputs.
+"""
+
+import os
+import json
+import re
+import asyncio
+import requests
+from typing import Any, Dict, Optional, Callable, Awaitable
+from .types import MetricResult, Metric, Example, CallLLMType
+
+def parse_llm_response(content: str) -> Optional[MetricResult]:
+    """
+    Parse LLM response content string into score and message.
+
+    Args:
+        content: Raw content string from LLM response
+
+    Returns:
+        MetricResult dict with score:[0,1] and message (optional), or None if no score was found
+    """
+    try:
+        result = json.loads(content)
+        score = result.get("score")
+        if not score:
+            return None
+        message = result.get("message")
+        # Ensure score is in [0, 1] range
+        score = max(0.0, min(1.0, float(score)))
+        return {"score": score, "message": message}
+    except (json.JSONDecodeError, ValueError, TypeError) as e:
+        print(f"Failed to parse JSON response: {e}")
+        # No text-extraction fallback is implemented; surface the parse failure
+        raise Exception(f"Could not parse LLM response: {content}")
+
+
+async def get_model_from_server(
+    model_id: str, server_url: str, headers: Dict[str, str]
+) -> Optional[Dict[str, Any]]:
+    """
+    Fetch a model from the server with API key included.
+
+    Args:
+        model_id: ID of the model to fetch
+        server_url: URL of the AIQA server
+        headers: HTTP headers for authentication
+
+    Returns:
+        Model dictionary with api_key if available, None if not found or error
+    """
+    try:
+        def _do_request():
+            return requests.get(
+                f"{server_url}/model/{model_id}?fields=apiKey",  # Server uses camelCase 'apiKey' (also accepts 'api_key')
+                headers=headers,
+            )
+
+        response = await asyncio.to_thread(_do_request)
+        if response.ok:
+            model = response.json()
+            # Server returns 'apiKey' (camelCase)
+            if model.get("apiKey"):
+                return model
+        return None
+    except Exception as e:
+        print(f"Warning: Could not fetch model from server: {e}")
+        return None
+
+
+async def call_openai(
+    system_prompt: str, user_message: str, api_key: str, output_schema: str = None
+) -> str:
+    """Call OpenAI API for LLM-as-judge scoring. Returns raw content string."""
+    def _do_request():
+        # TODO send output_schema if provided
+        return requests.post(
+            "https://api.openai.com/v1/chat/completions",
+            headers={
+                "Content-Type": "application/json",
+                "Authorization": f"Bearer {api_key}",
+            },
+            json={
+                "model": "gpt-4o-mini",  # Default model, can be made configurable
+                "messages": [
+                    {"role": "system", "content": system_prompt},
+                    {"role": "user", "content": user_message},
+                ],
+                "temperature": 0,
+                "response_format": {"type": "json_object"},
+            },
+        )
+
+    response = await asyncio.to_thread(_do_request)
+
+    if not response.ok:
+        error_text = response.text
+        raise Exception(
+            f"OpenAI API error: {response.status_code} {response.reason} - {error_text}"
+        )
+
+    data = response.json()
+    choices = data.get("choices", [])
+    if not choices or not isinstance(choices, list):
+        raise Exception("OpenAI API did not return choices")
+    content = choices[0].get("message", {}).get("content")
+    if not content:
+        raise Exception("OpenAI API did not return content")
+
+    print(f"OpenAI raw response: {content[:500]}...")  # Show first 500 chars
+    return content
+
+
+async def call_anthropic(
+    system_prompt: str, user_message: str, api_key: str
+) -> str:
+    """Call Anthropic (Claude) API for LLM-as-judge scoring. Returns raw content string."""
+    def _do_request():
+        return requests.post(
+            "https://api.anthropic.com/v1/messages",
+            headers={
+                "Content-Type": "application/json",
+                "x-api-key": api_key,
+                "anthropic-version": "2023-06-01",
+            },
+            json={
+                "model": "claude-3-5-sonnet-20241022",  # Default model
+                "max_tokens": 1024,
+                "temperature": 0,
+                "system": system_prompt,
+                "messages": [{"role": "user", "content": user_message}],
+            },
+        )
+
+    response = await asyncio.to_thread(_do_request)
+
+    if not response.ok:
+        error_text = response.text
+        raise Exception(
+            f"Anthropic API error: {response.status_code} {response.reason} - {error_text}"
+        )
+
+    data = response.json()
+    content_list = data.get("content", [])
+    if not content_list or not isinstance(content_list, list):
+        raise Exception("Anthropic API did not return content")
+    content = content_list[0].get("text", "")
+    if not content:
+        raise Exception("Anthropic API did not return content")
+
+    print(f"Anthropic raw response: {content[:500]}...")  # Show first 500 chars
+    return content
+
+
+async def call_llm_fallback(
+    system_prompt: str,
+    user_message: str,
+    api_key: Optional[str] = None,
+    provider: Optional[str] = None,
+) -> str:
+    """
+    Fallback LLM call function that checks for an API key parameter or environment variables.
+
+    Args:
+        system_prompt: System prompt for the LLM
+        user_message: User message containing the output to score
+        api_key: Optional API key to use (takes precedence over env vars)
+        provider: Optional provider name ('openai', 'anthropic', etc.) to determine which API to call
+
+    Returns:
+        Raw content string from the LLM response (typically a JSON object with "score" and "message")
+    """
+    # If API key provided, use it with the specified provider
+    if api_key:
+        if provider == 'openai' or provider is None:
+            content = await call_openai(system_prompt, user_message, api_key)
+        elif provider == 'anthropic':
+            content = await call_anthropic(system_prompt, user_message, api_key)
+        else:
+            # Try OpenAI first, then Anthropic for unknown providers
+            try:
+                content = await call_openai(system_prompt, user_message, api_key)
+            except Exception:
+                content = await call_anthropic(system_prompt, user_message, api_key)
+    else:
+        # Fallback to environment variables
+        openai_key = os.getenv("OPENAI_API_KEY")
+        anthropic_key = os.getenv("ANTHROPIC_API_KEY")
+
+        if openai_key:
+            print("Using OpenAI API (from OPENAI_API_KEY env var)")
+            content = await call_openai(system_prompt, user_message, openai_key)
+        elif anthropic_key:
+            print("Using Anthropic API (from ANTHROPIC_API_KEY env var)")
+            content = await call_anthropic(system_prompt, user_message, anthropic_key)
+        else:
+            raise Exception(
+                "No LLM API key found. Either:\n"
+                "  - Specify a model in metric.parameters.model (to use server-stored API key), or\n"
+                "  - Set OPENAI_API_KEY or ANTHROPIC_API_KEY environment variable (for local API key)"
+            )
+    return content
+
+
+async def score_llm_metric_local(
+    input_data: Any,
+    output: Any,
+    example: Example,
+    metric: Metric,
+    llm_call_fn: Optional[CallLLMType] = None,
+) -> MetricResult:
+    """
+    Score an LLM-as-judge metric.
+
+    Args:
+        input_data: The input data to score
+        output: The output to score
+        example: The example object
+        metric: The metric definition
+        llm_call_fn: Optional async function that takes (system_prompt, user_message) and returns
+            raw content string (typically JSON). If not provided, will use fallback.
+
+    Returns:
+        MetricResult object with score:[0,1], message (optional), and error (optional)
+    """
+    # Build system prompt from metric description or prompt field
+    metric_prompt = metric.get("prompt")
+    if metric_prompt:
+        system_prompt = metric_prompt
+    else:
+        metric_text = metric.get("description", "") or metric.get("name", "")
+        system_prompt = f"""You are a judge evaluating AI assistant OUTPUTs for this metric:
+{metric_text}
+
+You MUST respond with ONLY a valid JSON object (no other text before or after) containing:
+- "score": a number between 0 and 1 (where 1 is best)
+- "message": a brief explanation of your scoring
+
+Example response format:
+{{"score": 0.75, "message": "The response was helpful but could be more concise"}}
+
+Be strict but fair in your evaluation."""
+
+    if isinstance(input_data, dict):
+        input_text = json.dumps(input_data)
+    else:
+        input_text = str(input_data)
+
+    if isinstance(output, dict):
+        output_text = json.dumps(output)
+    else:
+        output_text = str(output)
+
+    # Build user message with the input and output to score
+    user_message = f"""<INPUT>:
+{input_text}
+</INPUT>
+<OUTPUT>
+{output_text}
+</OUTPUT>
+Evaluate this OUTPUT according to the metric and return ONLY a valid JSON object {{"score": number, "message": string}}."""
+
+    # Debug: print what we're sending to the LLM
+    print(f"DEBUG: Sending to LLM scorer:")
+    print(f" System prompt: {system_prompt[:200]}...")
+    print(f" User message: {user_message[:500]}...")
+    print(f" Example input extracted: {repr(input_text[:100])}")
+
+    # Call LLM
+    print(f"Calling LLM to score metric '{metric.get('name')}'...")
+    if llm_call_fn:
+        result = await llm_call_fn(system_prompt, user_message)
+    else:
+        # Note: api_key and provider should be handled by the caller if model is specified
+        # This fallback uses environment variables
+        result = await call_llm_fallback(system_prompt, user_message)
+
+    score_result = parse_llm_response(result)
+    if not score_result:
+        raise Exception(f"Failed to parse LLM response for metric {metric.get('id', 'unknown')}: {result}")
+    return score_result
+
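
For orientation (not part of the released diff), here is a minimal sketch of how the new scorer might be driven with a caller-supplied judge function, which bypasses the OpenAI/Anthropic fallback entirely. The metric and example dictionaries below are illustrative assumptions about the expected shapes; the real definitions live in aiqa.types.

    import asyncio
    from aiqa.llm_as_judge import score_llm_metric_local

    # Hypothetical judge function: returns the raw JSON string parse_llm_response expects.
    async def fake_judge(system_prompt: str, user_message: str) -> str:
        return '{"score": 0.9, "message": "Accurate and concise"}'

    metric = {"id": "m1", "name": "helpfulness", "description": "Is the answer helpful?"}  # assumed shape
    example = {"id": "e1"}  # assumed shape

    result = asyncio.run(
        score_llm_metric_local(
            input_data={"question": "What is 2+2?"},
            output="4",
            example=example,
            metric=metric,
            llm_call_fn=fake_judge,  # custom judge; no provider API key needed
        )
    )
    print(result)  # {'score': 0.9, 'message': 'Accurate and concise'}
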
aiqa/object_serialiser.py CHANGED
@@ -25,7 +25,7 @@ def sanitize_string_for_utf8(text: str) -> str:
     Returns:
         A string with surrogate characters replaced by the Unicode replacement character (U+FFFD)
     """
-    if text == None:
+    if text is None:
         return None
     if not isinstance(text, str): # paranoia
         text = str(text)
@@ -43,7 +43,10 @@ def toNumber(value: str|int|None) -> int:
     if value is None:
         return 0
     if isinstance(value, int):
-        return value
+        return value
+    # Convert to string if not already
+    if not isinstance(value, str):
+        value = str(value)
     if value.endswith("b"): # drop the b
         value = value[:-1]
     if value.endswith("g"):