aiqa-client 0.5.2__py3-none-any.whl → 0.7.0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
aiqa/__init__.py CHANGED
@@ -26,8 +26,8 @@ Example:
         result = my_function()
     """
 
-from .tracing import (
-    WithTracing,
+from .tracing import WithTracing
+from .span_helpers import (
     flush_tracing,
     set_span_attribute,
     set_span_name,
@@ -39,7 +39,10 @@ from .tracing import (
     extract_trace_context,
     set_conversation_id,
     set_component_tag,
+    set_token_usage,
+    set_provider_and_model,
     get_span,
+    submit_feedback,
 )
 from .client import get_aiqa_client
 from .experiment_runner import ExperimentRunner
@@ -60,7 +63,10 @@ __all__ = [
     "extract_trace_context",
     "set_conversation_id",
     "set_component_tag",
+    "set_token_usage",
+    "set_provider_and_model",
     "get_span",
+    "submit_feedback",
     "VERSION",
 ]
 
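In 0.7.0 the span helpers move from aiqa.tracing into a new aiqa.span_helpers module, and three helpers are newly exported: set_token_usage, set_provider_and_model, and submit_feedback. A minimal import sketch follows; only set_span_attribute's call shape appears elsewhere in this diff, and the three new helpers' signatures are not shown here, so the commented calls are placeholders, not documented usage.

    from aiqa import (
        WithTracing,
        set_span_attribute,
        set_token_usage,         # new in 0.7.0
        set_provider_and_model,  # new in 0.7.0
        submit_feedback,         # new in 0.7.0
    )

    @WithTracing()  # assumption: WithTracing also works without arguments; the diff only shows keyword use
    def answer(question: str) -> str:
        set_span_attribute("question.length", len(question))  # key/value form shown in this diff
        # set_token_usage(...), set_provider_and_model(...), submit_feedback(...) are exported
        # above; consult the 0.7.0 API reference for their parameters before calling them.
        return "stub answer"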
aiqa/client.py CHANGED
@@ -2,11 +2,13 @@
 import os
 import logging
 from functools import lru_cache
-from typing import Optional, TYPE_CHECKING, Any, Dict
+from typing import Optional, TYPE_CHECKING, Any, Dict, List
 from opentelemetry import trace
 from opentelemetry.sdk.trace import TracerProvider
-from opentelemetry.sdk.trace.export import BatchSpanProcessor
+from opentelemetry.sdk.trace.export import BatchSpanProcessor, SpanExporter, SpanExportResult
 from opentelemetry.exporter.otlp.proto.http.trace_exporter import OTLPSpanExporter
+from opentelemetry.sdk.trace import ReadableSpan
+from opentelemetry.trace import SpanContext
 import requests
 
 from .constants import AIQA_TRACER_NAME, LOG_TAG
@@ -50,6 +52,8 @@ class AIQAClient:
             cls._instance._exporter = None  # reduce circular import issues by not importing for typecheck here
             cls._instance._enabled: bool = True
             cls._instance._initialized: bool = False
+            cls._instance._default_ignore_patterns: List[str] = ["_*"]  # Default: filter properties starting with '_'
+            cls._instance._ignore_recursive: bool = True  # Default: recursive filtering enabled
         return cls._instance
 
     @property
@@ -88,6 +92,76 @@ class AIQAClient:
         logger.info(f"AIQA tracing {'enabled' if value else 'disabled'}")
         self._enabled = value
 
+    @property
+    def default_ignore_patterns(self) -> List[str]:
+        """
+        Get the default ignore patterns applied to all traced inputs and outputs.
+
+        Default: ["_*"] (filters properties starting with '_')
+
+        Returns:
+            List of ignore patterns (supports wildcards like "_*")
+        """
+        return self._default_ignore_patterns.copy()
+
+    @default_ignore_patterns.setter
+    def default_ignore_patterns(self, value: Optional[List[str]]) -> None:
+        """
+        Set the default ignore patterns applied to all traced inputs and outputs.
+
+        Args:
+            value: List of patterns to ignore (e.g., ["_*", "password"]).
+                Set to None or [] to disable default ignore patterns.
+                Supports wildcards (e.g., "_*" matches "_apple", "_fruit").
+
+        Example:
+            from aiqa import get_aiqa_client
+
+            client = get_aiqa_client()
+            # Add password to default ignore patterns
+            client.default_ignore_patterns = ["_*", "password", "api_key"]
+            # Disable default ignore patterns
+            client.default_ignore_patterns = []
+        """
+        if value is None:
+            self._default_ignore_patterns = []
+        else:
+            self._default_ignore_patterns = list(value)
+        logger.info(f"Default ignore patterns set to: {self._default_ignore_patterns}")
+
+    @property
+    def ignore_recursive(self) -> bool:
+        """
+        Get whether ignore patterns are applied recursively to nested objects.
+
+        Default: True (recursive filtering enabled)
+
+        Returns:
+            True if recursive filtering is enabled, False otherwise
+        """
+        return self._ignore_recursive
+
+    @ignore_recursive.setter
+    def ignore_recursive(self, value: bool) -> None:
+        """
+        Set whether ignore patterns are applied recursively to nested objects.
+
+        When True (default), ignore patterns are applied at all nesting levels.
+        When False, ignore patterns are only applied to top-level keys.
+
+        Args:
+            value: True to enable recursive filtering, False to disable
+
+        Example:
+            from aiqa import get_aiqa_client
+
+            client = get_aiqa_client()
+            # Disable recursive filtering (only filter top-level keys)
+            client.ignore_recursive = False
+        """
+        self._ignore_recursive = bool(value)
+        logger.info(f"Ignore recursive filtering {'enabled' if self._ignore_recursive else 'disabled'}")
+
     def shutdown(self) -> None:
         """
         Shutdown the tracer provider and exporter.
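The two properties added above are the client-level knobs for redacting traced inputs and outputs. A short configuration sketch, combining the examples from the docstrings in this hunk:

    from aiqa import get_aiqa_client

    client = get_aiqa_client()
    # Filter private attributes and obvious secrets at every nesting level
    # (recursive filtering is the default)
    client.default_ignore_patterns = ["_*", "password", "api_key"]
    client.ignore_recursive = True
    # Or restrict filtering to top-level keys only
    client.ignore_recursive = False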
@@ -243,8 +317,6 @@ def _attach_aiqa_processor(provider: TracerProvider) -> None:
     auth_headers = {}
     if api_key:
         auth_headers["Authorization"] = f"ApiKey {api_key}"
-    elif os.getenv("AIQA_API_KEY"):
-        auth_headers["Authorization"] = f"ApiKey {os.getenv('AIQA_API_KEY')}"
 
     # OTLP HTTP exporter requires the full endpoint URL including /v1/traces
     # Ensure server_url doesn't have trailing slash or /v1/traces, then append /v1/traces
@@ -254,11 +326,24 @@ def _attach_aiqa_processor(provider: TracerProvider) -> None:
     else:
         endpoint = f"{base_url}/v1/traces"
 
-    # Create OTLP exporter with authentication headers only
+    # Get timeout from environment variable (in seconds)
+    # Supports OTEL_EXPORTER_OTLP_TIMEOUT (standard) or AIQA_EXPORT_TIMEOUT (custom)
+    # Default is 30 seconds (more generous than OTLP default of 10s)
+    timeout = 30.0
+    otlp_timeout = os.getenv("OTEL_EXPORTER_OTLP_TIMEOUT")
+
+    if otlp_timeout:
+        try:
+            timeout = float(otlp_timeout)
+        except ValueError:
+            logger.warning(f"Invalid OTEL_EXPORTER_OTLP_TIMEOUT value '{otlp_timeout}', using default 30.0")
+
+    # Create OTLP exporter with authentication headers and timeout
     # The exporter will set Content-Type and other headers automatically
     exporter = OTLPSpanExporter(
         endpoint=endpoint,
         headers=auth_headers if auth_headers else None,
+        timeout=timeout,
     )
 
     provider.add_span_processor(BatchSpanProcessor(exporter))
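The hunks above drop the AIQA_API_KEY fallback inside _attach_aiqa_processor and make the export timeout configurable. Only OTEL_EXPORTER_OTLP_TIMEOUT is read in the code shown (the comment also mentions AIQA_EXPORT_TIMEOUT, which does not appear in this hunk). A hedged sketch, assuming the variable is set before the AIQA tracer provider and exporter are created:

    import os

    # Give slow networks 60 seconds per export batch; invalid values fall back to 30.0
    os.environ["OTEL_EXPORTER_OTLP_TIMEOUT"] = "60"

    from aiqa import get_aiqa_client
    # Assumption: the exporter has not been created yet at this point,
    # so it picks up the timeout when tracing initialises.
    client = get_aiqa_client()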
aiqa/constants.py CHANGED
@@ -3,6 +3,6 @@ Constants used across the AIQA client package.
 """
 
 AIQA_TRACER_NAME = "aiqa-tracer"
-VERSION = "0.5.2"  # automatically updated by set-version-json.sh
+VERSION = "0.7.0"  # automatically updated by set-version-json.sh
 
 LOG_TAG = "AIQA"  # Used in all logging output to identify AIQA messages
aiqa/experiment_runner.py CHANGED
@@ -4,10 +4,52 @@ ExperimentRunner - runs experiments on datasets and scores results
 
 import os
 import time
+import asyncio
 from .constants import LOG_TAG
 from .http_utils import build_headers, get_server_url, get_api_key, format_http_error
 from typing import Any, Dict, List, Optional, Callable, Awaitable, Union
+from .tracing import WithTracing
+from .span_helpers import set_span_attribute, flush_tracing
+from .llm_as_judge import score_llm_metric_local, get_model_from_server, call_llm_fallback
 import requests
+from .types import MetricResult, ScoreThisInputOutputMetricType, Example, Result, Metric, CallLLMType
+
+# Type aliases for engine/scoring functions to improve code completion and clarity
+from typing import TypedDict
+
+# Function that processes input and parameters to produce an output (sync or async)
+CallMyCodeType = Callable[[Any, Dict[str, Any]], Union[Any, Awaitable[Any]]]
+
+# Function that scores a given output, using input, example, and parameters (usually async)
+# Returns a dictionary with score/message/etc.
+ScoreThisOutputType = Callable[[Any, Any, Dict[str, Any], Dict[str, Any]], Awaitable[Dict[str, Any]]]
+
+
+
+def _filter_input_for_run(input_data: Any) -> Dict[str, Any]:
+    """Tracing:Filter input - drop most, keep just ids"""
+    if not isinstance(input_data, dict):
+        return {}
+    self_obj = input_data.get("self")
+    if not self_obj:
+        return {}
+    return {
+        "dataset": getattr(self_obj, "dataset_id", None),
+        "experiment": getattr(self_obj, "experiment_id", None),
+    }
+
+
+def _filter_input_for_run_example(
+    self: "ExperimentRunner",
+    example: Dict[str, Any],
+    call_my_code: Any = None,
+    score_this_output: Any = None,
+) -> Dict[str, Any]:
+    """Filter input for run_example method to extract dataset, experiment, and example IDs."""
+    result = _filter_input_for_run({"self": self})
+    if isinstance(example, dict):
+        result["example"] = example.get("id")
+    return result
 
 
 class ExperimentRunner:
@@ -24,6 +66,7 @@ class ExperimentRunner:
         server_url: Optional[str] = None,
         api_key: Optional[str] = None,
         organisation_id: Optional[str] = None,
+        llm_call_fn: Optional[CallLLMType] = None,
     ):
         """
         Initialize the ExperimentRunner.
@@ -33,7 +76,11 @@
             experiment_id: Usually unset, and a fresh experiment is created with a random ID
             server_url: URL of the AIQA server (defaults to AIQA_SERVER_URL env var)
             api_key: API key for authentication (defaults to AIQA_API_KEY env var)
-            organisation_id: Organisation ID for the experiment
+            organisation_id: Optional organisation ID for the experiment. If not provided, will be
+                derived from the dataset when needed.
+            llm_call_fn: Optional async function that takes (system_prompt, user_message) and returns
+                raw content string (typically JSON). If not provided, will check for OPENAI_API_KEY
+                or ANTHROPIC_API_KEY environment variables.
         """
         self.dataset_id = dataset_id
         self.experiment_id = experiment_id
@@ -42,6 +89,8 @@
         self.organisation = organisation_id
         self.experiment: Optional[Dict[str, Any]] = None
         self.scores: List[Dict[str, Any]] = []
+        self.llm_call_fn = llm_call_fn
+        self._dataset_cache: Optional[Dict[str, Any]] = None
 
     def _get_headers(self) -> Dict[str, str]:
         """Build HTTP headers for API requests."""
@@ -54,6 +103,9 @@
         Returns:
             The dataset object with metrics and other information
         """
+        if self._dataset_cache is not None:
+            return self._dataset_cache
+
         response = requests.get(
             f"{self.server_url}/dataset/{self.dataset_id}",
             headers=self._get_headers(),
@@ -62,9 +114,26 @@
         if not response.ok:
             raise Exception(format_http_error(response, "fetch dataset"))
 
+        dataset = response.json()
+        self._dataset_cache = dataset
+
+        # If organisation_id wasn't set, derive it from the dataset
+        if not self.organisation and dataset.get("organisation"):
+            self.organisation = dataset.get("organisation")
+
+        return dataset
+
+    def get_example(self, example_id: str) -> Dict[str, Any]:
+        """
+        Fetch an example by ID.
+        """
+        response = requests.get(
+            f"{self.server_url}/example/{example_id}",
+            headers=self._get_headers(),
+        )
         return response.json()
 
-    def get_example_inputs(self, limit: int = 10000) -> List[Dict[str, Any]]:
+    def get_examples_for_dataset(self, limit: int = 10000) -> List[Dict[str, Any]]:
         """
         Fetch example inputs from the dataset.
 
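In these hunks, get_example_inputs is renamed to get_examples_for_dataset, get_dataset caches its response and backfills the organisation ID, and a get_example accessor is added. An illustrative sketch (the dataset ID is a placeholder, not a value from this diff):

    from aiqa import ExperimentRunner

    runner = ExperimentRunner(dataset_id="my-dataset-id")

    dataset = runner.get_dataset()                          # fetched once, then served from the cache
    examples = runner.get_examples_for_dataset(limit=100)
    first = runner.get_example(examples[0]["id"]) if examples else None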
@@ -103,13 +172,17 @@
             experiment_setup: Optional setup for the experiment object. You may wish to set:
                 - name (recommended for labelling the experiment)
                 - parameters
-                - comparison_parameters
 
         Returns:
             The created experiment object
         """
+        # Ensure we have the organisation ID - try to get it from the dataset if not set
+        if not self.organisation:
+            dataset = self.get_dataset()
+            self.organisation = dataset.get("organisation")
+
         if not self.organisation or not self.dataset_id:
-            raise Exception("Organisation and dataset ID are required to create an experiment")
+            raise Exception("Organisation and dataset ID are required to create an experiment. Organisation can be derived from the dataset or set via organisation_id parameter.")
 
         if not experiment_setup:
             experiment_setup = {}
@@ -120,7 +193,7 @@
             "organisation": self.organisation,
             "dataset": self.dataset_id,
             "results": [],
-            "summary_results": {},
+            "summaries": {},
         }
 
         print(f"Creating experiment")
@@ -138,19 +211,19 @@
         self.experiment = experiment
         return experiment
 
-    def score_and_store(
+    async def score_and_store(
         self,
-        example: Dict[str, Any],
-        result: Any,
-        scores: Optional[Dict[str, Any]] = None,
-    ) -> Dict[str, Any]:
+        example: Example,
+        output: Any,
+        result: Result,
+    ) -> Result:
         """
         Ask the server to score an example result. Stores the score for later summary calculation.
 
         Args:
             example: The example object
-            result: The output from running the engine on the example
-            scores: Optional pre-computed scores
+            output: The output from running the engine on the example
+            result: The result object for locally calculated scores
 
         Returns:
             The score result from the server
@@ -158,22 +231,31 @@
         # Do we have an experiment ID? If not, we need to create the experiment first
         if not self.experiment_id:
             self.create_experiment()
-
-        if scores is None:
-            scores = {}
-
-        print(f"Scoring and storing example: {example['id']}")
+        example_id = example.get("id")
+        if not example_id:
+            raise ValueError("Example must have an 'id' field")
+        if result is None:
+            result = Result(example=example_id, scores={}, messages={}, errors={})
+        scores = result.get("scores") or {}
+
+
+
+        print(f"Scoring and storing example: {example_id}")
         print(f"Scores: {scores}")
 
-        response = requests.post(
-            f"{self.server_url}/experiment/{self.experiment_id}/example/{example['id']}/scoreAndStore",
-            json={
-                "output": result,
-                "traceId": example.get("traceId"),
-                "scores": scores,
-            },
-            headers=self._get_headers(),
-        )
+        # Run synchronous requests.post in a thread pool to avoid blocking
+        def _do_request():
+            return requests.post(
+                f"{self.server_url}/experiment/{self.experiment_id}/example/{example_id}/scoreAndStore",
+                json={
+                    "output": result,
+                    "traceId": example.get("trace"),  # Server returns 'trace' (lowercase), but API expects 'traceId' (camelCase)
+                    "scores": scores,
+                },
+                headers=self._get_headers(),
+            )
+
+        response = await asyncio.to_thread(_do_request)
 
         if not response.ok:
             raise Exception(format_http_error(response, "score and store"))
@@ -182,12 +264,11 @@
         print(f"scoreAndStore response: {json_result}")
         return json_result
 
+    @WithTracing(filter_input=_filter_input_for_run)
     async def run(
         self,
-        engine: Callable[[Any], Union[Any, Awaitable[Any]]],
-        scorer: Optional[
-            Callable[[Any, Dict[str, Any]], Awaitable[Dict[str, Any]]]
-        ] = None,
+        call_my_code: CallMyCodeType,
+        scorer_for_metric_id: Optional[Dict[str, ScoreThisInputOutputMetricType]] = None,
     ) -> None:
         """
         Run an engine function on all examples and score the results.
@@ -196,124 +277,179 @@
             engine: Function that takes input, returns output (can be async)
             scorer: Optional function that scores the output given the example
         """
-        examples = self.get_example_inputs()
+        examples = self.get_examples_for_dataset()
 
         # Wrap engine to match run_example signature (input, parameters)
-        def wrapped_engine(input_data, parameters):
-            return engine(input_data)
-
-        # Wrap scorer to match run_example signature (output, example, parameters)
-        async def wrapped_scorer(output, example, parameters):
-            if scorer:
-                return await scorer(output, example)
-            return {}
+        async def wrapped_engine(input_data, parameters):
+            result = call_my_code(input_data, parameters)
+            # Handle async functions
+            if hasattr(result, "__await__"):
+                result = await result
+            return result
 
         for example in examples:
-            scores = await self.run_example(example, wrapped_engine, wrapped_scorer)
-            if scores:
-                self.scores.append(
-                    {
-                        "example": example,
-                        "result": scores,
-                        "scores": scores,
-                    }
-                )
-
+            try:
+                scores = await self.run_example(example, wrapped_engine, scorer_for_metric_id)
+                if scores:
+                    self.scores.append(
+                        {
+                            "example": example,
+                            "result": scores,
+                            "scores": scores,
+                        }
+                    )
+            except Exception as e:
+                print(f"Error processing example {example.get('id', 'unknown')}: {e}")
+                # Continue with next example instead of failing entire run
+
+    @WithTracing(filter_input=_filter_input_for_run_example)
     async def run_example(
         self,
-        example: Dict[str, Any],
-        call_my_code: Callable[[Any, Dict[str, Any]], Union[Any, Awaitable[Any]]],
-        score_this_output: Optional[
-            Callable[[Any, Dict[str, Any], Dict[str, Any]], Awaitable[Dict[str, Any]]]
-        ] = None,
-    ) -> List[Dict[str, Any]]:
+        example: Example,
+        call_my_code: CallMyCodeType,
+        scorer_for_metric_id: Optional[Dict[str, ScoreThisInputOutputMetricType]] = None,
+    ) -> List[Result]:
         """
-        Run the engine on an example with the given parameters (looping over comparison parameters),
-        and score the result. Also calls scoreAndStore to store the result in the server.
+        Run the engine on an example with the experiment's parameters, score the result, and store it.
 
         Args:
-            example: The example to run
+            example: The example to run. See Example.ts type
             call_my_code: Function that takes input and parameters, returns output (can be async)
-            score_this_output: Optional function that scores the output given the example and parameters
+            scorer_for_metric_id: Optional dictionary of metric IDs to functions that score the output given the example and parameters
 
         Returns:
-            One set of scores for each comparison parameter set. If no comparison parameters,
-            returns an array of one.
+            List of one result (for API compatibility).
         """
-        # Ensure experiment exists
         if not self.experiment:
             self.create_experiment()
         if not self.experiment:
             raise Exception("Failed to create experiment")
 
-        # Make the parameters
-        parameters_fixed = self.experiment.get("parameters") or {}
-        # If comparison_parameters is empty/undefined, default to [{}] so we run at least once
-        parameters_loop = self.experiment.get("comparison_parameters") or [{}]
-
-        # Handle both spans array and input field
+        parameters_here = self.experiment.get("parameters") or {}
         input_data = example.get("input")
        if not input_data and example.get("spans") and len(example["spans"]) > 0:
             input_data = example["spans"][0].get("attributes", {}).get("input")
-
         if not input_data:
-            print(f"Warning: Example has no input field or spans with input attribute: {example}"
-            )
-            # Run engine anyway -- this could make sense if it's all about the parameters
-
-        all_scores: List[Dict[str, Any]] = []
-        # This loop should not be parallelized - it should run sequentially, one after the other
-        # to avoid creating interference between the runs.
-        for parameters in parameters_loop:
-            parameters_here = {**parameters_fixed, **parameters}
-            print(f"Running with parameters: {parameters_here}")
-
-            # Set env vars from parameters_here
-            for key, value in parameters_here.items():
-                if value:
-                    os.environ[key] = str(value)
-
-            start = time.time() * 1000  # milliseconds
+            print(f"Warning: Example has no input field or spans with input attribute: {example}")
+
+        example_id = example.get("id")
+        if not example_id:
+            raise ValueError("Example must have an 'id' field")
+        set_span_attribute("example", example_id)
+
+        print(f"Running with parameters: {parameters_here}")
+        original_env_vars: Dict[str, Optional[str]] = {}
+        for key, value in parameters_here.items():
+            if value:
+                original_env_vars[key] = os.environ.get(key)
+                os.environ[key] = str(value)
+        try:
+            start = time.time() * 1000
             output = call_my_code(input_data, parameters_here)
-            # Handle async functions
             if hasattr(output, "__await__"):
-                import asyncio
-
                 output = await output
-            end = time.time() * 1000  # milliseconds
-            duration = int(end - start)
-
+            duration = int((time.time() * 1000) - start)
             print(f"Output: {output}")
 
-            scores: Dict[str, Any] = {}
-            if score_this_output:
-                scores = await score_this_output(output, example, parameters_here)
-
-            scores["duration"] = duration
-
-            # TODO: this call as async and wait for all to complete before returning
-            print(f"Call scoreAndStore ... for example: {example['id']} with scores: {scores}")
-            result = self.score_and_store(example, output, scores)
+            dataset_metrics = self.get_dataset().get("metrics", [])
+            specific_metrics = example.get("metrics", [])
+            metrics = [*dataset_metrics, *specific_metrics]
+            result = Result(example=example_id, scores={}, messages={}, errors={})
+            for metric in metrics:
+                metric_id = metric.get("id")
+                if not metric_id:
+                    continue
+                scorer = scorer_for_metric_id.get(metric_id) if scorer_for_metric_id else None
+                if scorer:
+                    metric_result = await scorer(input_data, output, metric)
+                elif metric.get("type") == "llm":
+                    metric_result = await self._score_llm_metric(input_data, output, example, metric)
+                else:
+                    continue
+                if not metric_result:
+                    result["errors"][metric_id] = "Scoring function returned None"
+                    continue
+                result["scores"][metric_id] = metric_result.get("score")
+                result["messages"][metric_id] = metric_result.get("message")
+                result["errors"][metric_id] = metric_result.get("error")
+            result["scores"]["duration"] = duration
+            await flush_tracing()
+            print(f"Call scoreAndStore ... for example: {example_id} with scores: {result['scores']}")
+            result = await self.score_and_store(example, output, result)
             print(f"scoreAndStore returned: {result}")
-            all_scores.append(result)
-
-        return all_scores
-
-    def get_summary_results(self) -> Dict[str, Any]:
+            return [result]
+        finally:
+            for key, original_value in original_env_vars.items():
+                if original_value is None:
+                    os.environ.pop(key, None)
+                else:
+                    os.environ[key] = original_value
+
+    def get_summaries(self) -> Dict[str, Any]:
         """
-        Get summary results from the experiment.
+        Get summaries from the experiment.
 
         Returns:
             Dictionary of metric names to summary statistics
         """
+        if not self.experiment_id:
+            raise ValueError("No experiment ID available. Create an experiment first.")
+
         response = requests.get(
             f"{self.server_url}/experiment/{self.experiment_id}",
             headers=self._get_headers(),
         )
-    
+
         if not response.ok:
             raise Exception(format_http_error(response, "fetch summary results"))
 
         experiment2 = response.json()
-        return experiment2.get("summary_results", {})
+        return experiment2.get("summaries", {})
+
+    async def _score_llm_metric(
+        self,
+        input_data: Any,
+        output: Any,
+        example: Example,
+        metric: Metric,
+    ) -> MetricResult:
+        """
+        Score an LLM metric by fetching model API key from server if needed.
+
+        Args:
+            input_data: The input data to score
+            output: The output to score
+            example: The example object
+            metric: The metric definition
+
+        Returns:
+            MetricResult object with score:[0,1], message (optional), and error (optional)
+        """
+        # If model is specified, try to fetch API key from server
+        model_id = metric.get("model")
+        api_key = None
+        provider = metric.get("provider")
+
+        if model_id:
+            model_data = await get_model_from_server(
+                model_id, self.server_url, self._get_headers()
+            )
+            if model_data:
+                # Server returns 'apiKey' (camelCase)
+                api_key = model_data.get("apiKey")
+                # If provider not set in metric, try to get it from model
+                if not provider and model_data.get("provider"):
+                    provider = model_data.get("provider")
+
+        # Create a custom llm_call_fn if we have an API key from the model
+        llm_call_fn = self.llm_call_fn
+        if api_key and not llm_call_fn:
+            async def _model_llm_call(system_prompt: str, user_message: str) -> str:
+                return await call_llm_fallback(system_prompt, user_message, api_key, provider)
+            llm_call_fn = _model_llm_call
+
+        return await score_llm_metric_local(
+            input_data, output, example, metric, llm_call_fn
+        )
+
 
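Taken together, the 0.7.0 ExperimentRunner API works as follows: run(call_my_code, scorer_for_metric_id) drives every example through run_example; custom scorers are awaited as scorer(input_data, output, metric) and should return a mapping with score (0 to 1) plus optional message and error; metrics of type "llm" without a custom scorer fall back to _score_llm_metric. A hedged end-to-end sketch; the dataset ID, metric ID, and the metric's parameter shape are placeholders, not values taken from this diff:

    import asyncio
    from aiqa import ExperimentRunner

    async def my_engine(input_data, parameters):
        # The system under test; parameters come from the experiment object.
        return f"echo: {input_data}"

    async def exact_match(input_data, output, metric):
        # Custom scorer for one metric; the returned keys mirror the fields run_example
        # reads (score/message/error). The expected-value lookup assumes a hypothetical
        # metric shape and is for illustration only.
        expected = (metric.get("parameters") or {}).get("expected")
        return {"score": 1.0 if output == expected else 0.0, "message": "exact match check"}

    async def main():
        runner = ExperimentRunner(dataset_id="my-dataset-id")
        await runner.run(my_engine, scorer_for_metric_id={"metric-id-123": exact_match})
        # Assumes run() created the experiment, so server-side summaries can be fetched afterwards.
        print(runner.get_summaries())

    asyncio.run(main())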