aiqa-client 0.6.1__tar.gz → 0.7.2__tar.gz

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (30)
  1. {aiqa_client-0.6.1/aiqa_client.egg-info → aiqa_client-0.7.2}/PKG-INFO +1 -1
  2. {aiqa_client-0.6.1 → aiqa_client-0.7.2}/aiqa/client.py +74 -4
  3. {aiqa_client-0.6.1 → aiqa_client-0.7.2}/aiqa/constants.py +1 -1
  4. {aiqa_client-0.6.1 → aiqa_client-0.7.2}/aiqa/experiment_runner.py +108 -149
  5. {aiqa_client-0.6.1 → aiqa_client-0.7.2}/aiqa/llm_as_judge.py +3 -2
  6. {aiqa_client-0.6.1 → aiqa_client-0.7.2}/aiqa/object_serialiser.py +5 -2
  7. {aiqa_client-0.6.1 → aiqa_client-0.7.2}/aiqa/tracing.py +124 -39
  8. {aiqa_client-0.6.1 → aiqa_client-0.7.2}/aiqa/types.py +1 -1
  9. {aiqa_client-0.6.1 → aiqa_client-0.7.2/aiqa_client.egg-info}/PKG-INFO +1 -1
  10. {aiqa_client-0.6.1 → aiqa_client-0.7.2}/pyproject.toml +1 -1
  11. {aiqa_client-0.6.1 → aiqa_client-0.7.2}/tests/test_tracing.py +365 -1
  12. {aiqa_client-0.6.1 → aiqa_client-0.7.2}/LICENSE.txt +0 -0
  13. {aiqa_client-0.6.1 → aiqa_client-0.7.2}/MANIFEST.in +0 -0
  14. {aiqa_client-0.6.1 → aiqa_client-0.7.2}/README.md +0 -0
  15. {aiqa_client-0.6.1 → aiqa_client-0.7.2}/aiqa/__init__.py +0 -0
  16. {aiqa_client-0.6.1 → aiqa_client-0.7.2}/aiqa/http_utils.py +0 -0
  17. {aiqa_client-0.6.1 → aiqa_client-0.7.2}/aiqa/py.typed +0 -0
  18. {aiqa_client-0.6.1 → aiqa_client-0.7.2}/aiqa/span_helpers.py +0 -0
  19. {aiqa_client-0.6.1 → aiqa_client-0.7.2}/aiqa/tracing_llm_utils.py +0 -0
  20. {aiqa_client-0.6.1 → aiqa_client-0.7.2}/aiqa_client.egg-info/SOURCES.txt +0 -0
  21. {aiqa_client-0.6.1 → aiqa_client-0.7.2}/aiqa_client.egg-info/dependency_links.txt +0 -0
  22. {aiqa_client-0.6.1 → aiqa_client-0.7.2}/aiqa_client.egg-info/requires.txt +0 -0
  23. {aiqa_client-0.6.1 → aiqa_client-0.7.2}/aiqa_client.egg-info/top_level.txt +0 -0
  24. {aiqa_client-0.6.1 → aiqa_client-0.7.2}/setup.cfg +0 -0
  25. {aiqa_client-0.6.1 → aiqa_client-0.7.2}/tests/test_chatbot.py +0 -0
  26. {aiqa_client-0.6.1 → aiqa_client-0.7.2}/tests/test_integration.py +0 -0
  27. {aiqa_client-0.6.1 → aiqa_client-0.7.2}/tests/test_integration_api_key.py +0 -0
  28. {aiqa_client-0.6.1 → aiqa_client-0.7.2}/tests/test_object_serialiser.py +0 -0
  29. {aiqa_client-0.6.1 → aiqa_client-0.7.2}/tests/test_span_helpers.py +0 -0
  30. {aiqa_client-0.6.1 → aiqa_client-0.7.2}/tests/test_startup_reliability.py +0 -0
PKG-INFO
@@ -1,6 +1,6 @@
 Metadata-Version: 2.4
 Name: aiqa-client
-Version: 0.6.1
+Version: 0.7.2
 Summary: OpenTelemetry-based Python client for tracing functions and sending traces to the AIQA server
 Author-email: AIQA <info@aiqa.dev>
 License: MIT
aiqa/client.py
@@ -2,10 +2,10 @@
 import os
 import logging
 from functools import lru_cache
-from typing import Optional, TYPE_CHECKING, Any, Dict
+from typing import Optional, TYPE_CHECKING, Any, Dict, List
 from opentelemetry import trace
 from opentelemetry.sdk.trace import TracerProvider
-from opentelemetry.sdk.trace.export import BatchSpanProcessor, SpanExporter, SpanExportResult, SpanExporter as SpanExporterBase
+from opentelemetry.sdk.trace.export import BatchSpanProcessor, SpanExporter, SpanExportResult
 from opentelemetry.exporter.otlp.proto.http.trace_exporter import OTLPSpanExporter
 from opentelemetry.sdk.trace import ReadableSpan
 from opentelemetry.trace import SpanContext
@@ -52,6 +52,8 @@ class AIQAClient:
             cls._instance._exporter = None  # reduce circular import issues by not importing for typecheck here
             cls._instance._enabled: bool = True
             cls._instance._initialized: bool = False
+            cls._instance._default_ignore_patterns: List[str] = ["_*"]  # Default: filter properties starting with '_'
+            cls._instance._ignore_recursive: bool = True  # Default: recursive filtering enabled
         return cls._instance

     @property
@@ -90,6 +92,76 @@ class AIQAClient:
         logger.info(f"AIQA tracing {'enabled' if value else 'disabled'}")
         self._enabled = value

+    @property
+    def default_ignore_patterns(self) -> List[str]:
+        """
+        Get the default ignore patterns applied to all traced inputs and outputs.
+
+        Default: ["_*"] (filters properties starting with '_')
+
+        Returns:
+            List of ignore patterns (supports wildcards like "_*")
+        """
+        return self._default_ignore_patterns.copy()
+
+    @default_ignore_patterns.setter
+    def default_ignore_patterns(self, value: Optional[List[str]]) -> None:
+        """
+        Set the default ignore patterns applied to all traced inputs and outputs.
+
+        Args:
+            value: List of patterns to ignore (e.g., ["_*", "password"]).
+                Set to None or [] to disable default ignore patterns.
+                Supports wildcards (e.g., "_*" matches "_apple", "_fruit").
+
+        Example:
+            from aiqa import get_aiqa_client
+
+            client = get_aiqa_client()
+            # Add password to default ignore patterns
+            client.default_ignore_patterns = ["_*", "password", "api_key"]
+            # Disable default ignore patterns
+            client.default_ignore_patterns = []
+        """
+        if value is None:
+            self._default_ignore_patterns = []
+        else:
+            self._default_ignore_patterns = list(value)
+        logger.info(f"Default ignore patterns set to: {self._default_ignore_patterns}")
+
+    @property
+    def ignore_recursive(self) -> bool:
+        """
+        Get whether ignore patterns are applied recursively to nested objects.
+
+        Default: True (recursive filtering enabled)
+
+        Returns:
+            True if recursive filtering is enabled, False otherwise
+        """
+        return self._ignore_recursive
+
+    @ignore_recursive.setter
+    def ignore_recursive(self, value: bool) -> None:
+        """
+        Set whether ignore patterns are applied recursively to nested objects.
+
+        When True (default), ignore patterns are applied at all nesting levels.
+        When False, ignore patterns are only applied to top-level keys.
+
+        Args:
+            value: True to enable recursive filtering, False to disable
+
+        Example:
+            from aiqa import get_aiqa_client
+
+            client = get_aiqa_client()
+            # Disable recursive filtering (only filter top-level keys)
+            client.ignore_recursive = False
+        """
+        self._ignore_recursive = bool(value)
+        logger.info(f"Ignore recursive filtering {'enabled' if self._ignore_recursive else 'disabled'}")
+
     def shutdown(self) -> None:
         """
         Shutdown the tracer provider and exporter.
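The new ignore patterns are wildcard strings matched against property names before inputs and outputs are written to spans. The actual filtering lives elsewhere in the package; purely as a sketch of the semantics, with a hypothetical helper name, recursive wildcard filtering looks like this:

    # Illustrative sketch only -- not the aiqa implementation.
    # filter_ignored is a hypothetical helper name.
    from fnmatch import fnmatch
    from typing import Any, List

    def filter_ignored(value: Any, patterns: List[str], recursive: bool = True) -> Any:
        """Drop dict keys matching any wildcard pattern, e.g. "_*" drops "_secret"."""
        if not isinstance(value, dict):
            return value
        kept = {}
        for key, val in value.items():
            if any(fnmatch(str(key), p) for p in patterns):
                continue  # key matches an ignore pattern -> omitted from the trace
            # When recursive, apply the same patterns at every nesting level
            kept[key] = filter_ignored(val, patterns, recursive) if recursive else val
        return kept

    print(filter_ignored({"q": "hi", "_cache": 1, "cfg": {"_tmp": 2, "model": "m"}}, ["_*"]))
    # {'q': 'hi', 'cfg': {'model': 'm'}}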
@@ -245,8 +317,6 @@ def _attach_aiqa_processor(provider: TracerProvider) -> None:
     auth_headers = {}
     if api_key:
         auth_headers["Authorization"] = f"ApiKey {api_key}"
-    elif os.getenv("AIQA_API_KEY"):
-        auth_headers["Authorization"] = f"ApiKey {os.getenv('AIQA_API_KEY')}"

     # OTLP HTTP exporter requires the full endpoint URL including /v1/traces
     # Ensure server_url doesn't have trailing slash or /v1/traces, then append /v1/traces
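A minimal sketch of the URL rule described in the two comments above (assumed shape, not the package's exact code):

    # Sketch of the endpoint normalisation rule (assumed, not aiqa's exact code).
    def otlp_traces_endpoint(server_url: str) -> str:
        url = server_url.rstrip("/")
        # Avoid doubling the path if the caller already included it
        if url.endswith("/v1/traces"):
            url = url[: -len("/v1/traces")]
        return url + "/v1/traces"

    assert otlp_traces_endpoint("https://api.aiqa.dev/") == "https://api.aiqa.dev/v1/traces"
    assert otlp_traces_endpoint("https://api.aiqa.dev/v1/traces") == "https://api.aiqa.dev/v1/traces"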
aiqa/constants.py
@@ -3,6 +3,6 @@ Constants used across the AIQA client package.
 """

 AIQA_TRACER_NAME = "aiqa-tracer"
-VERSION = "0.6.1"  # automatically updated by set-version-json.sh
+VERSION = "0.7.2"  # automatically updated by set-version-json.sh

 LOG_TAG = "AIQA"  # Used in all logging output to identify AIQA messages
aiqa/experiment_runner.py
@@ -5,11 +5,16 @@ ExperimentRunner - runs experiments on datasets and scores results
 import os
 import time
 import asyncio
+from opentelemetry import context as otel_context
+from opentelemetry.trace import Status, StatusCode, set_span_in_context
 from .constants import LOG_TAG
 from .http_utils import build_headers, get_server_url, get_api_key, format_http_error
 from typing import Any, Dict, List, Optional, Callable, Awaitable, Union
 from .tracing import WithTracing
-from .span_helpers import set_span_attribute, flush_tracing
+from .span_helpers import set_span_attribute, flush_tracing, get_active_trace_id
+from .client import get_aiqa_client, get_aiqa_tracer, get_component_tag
+from .object_serialiser import serialize_for_span
+from .tracing_llm_utils import _extract_and_set_token_usage, _extract_and_set_provider_and_model
 from .llm_as_judge import score_llm_metric_local, get_model_from_server, call_llm_fallback
 import requests
 from .types import MetricResult, ScoreThisInputOutputMetricType, Example, Result, Metric, CallLLMType
@@ -25,31 +30,9 @@ CallMyCodeType = Callable[[Any, Dict[str, Any]], Union[Any, Awaitable[Any]]]
 ScoreThisOutputType = Callable[[Any, Any, Dict[str, Any], Dict[str, Any]], Awaitable[Dict[str, Any]]]


-
-def _filter_input_for_run(input_data: Any) -> Dict[str, Any]:
-    """Tracing:Filter input - drop most, keep just ids"""
-    if not isinstance(input_data, dict):
-        return {}
-    self_obj = input_data.get("self")
-    if not self_obj:
-        return {}
-    return {
-        "dataset": getattr(self_obj, "dataset_id", None),
-        "experiment": getattr(self_obj, "experiment_id", None),
-    }
-
-
-def _filter_input_for_run_example(
-    self: "ExperimentRunner",
-    example: Dict[str, Any],
-    call_my_code: Any = None,
-    score_this_output: Any = None,
-) -> Dict[str, Any]:
-    """Filter input for run_example method to extract dataset, experiment, and example IDs."""
-    result = _filter_input_for_run({"self": self})
-    if isinstance(example, dict):
-        result["example"] = example.get("id")
-    return result
+def _metric_score_key(metric: Dict[str, Any]) -> str:
+    """Key for scores in API: server expects metric name (fallback to id)."""
+    return (metric.get("name") or metric.get("id")) or ""


 class ExperimentRunner:
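The fallback chain in _metric_score_key is worth spelling out; this snippet copies the shipped one-liner and shows what it returns for each metric shape:

    # Behaviour of _metric_score_key for the three metric shapes it has to handle.
    def _metric_score_key(metric):
        return (metric.get("name") or metric.get("id")) or ""

    print(_metric_score_key({"id": "m1", "name": "accuracy"}))  # "accuracy" -> name wins
    print(_metric_score_key({"id": "m1"}))                      # "m1"       -> falls back to id
    print(_metric_score_key({}))                                # ""         -> never None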
@@ -123,7 +106,17 @@ class ExperimentRunner:

         return dataset

-    def get_example_inputs(self, limit: int = 10000) -> List[Dict[str, Any]]:
+    def get_example(self, example_id: str) -> Dict[str, Any]:
+        """
+        Fetch an example by ID.
+        """
+        response = requests.get(
+            f"{self.server_url}/example/{example_id}",
+            headers=self._get_headers(),
+        )
+        return response.json()
+
+    def get_examples_for_dataset(self, limit: int = 10000) -> List[Dict[str, Any]]:
         """
         Fetch example inputs from the dataset.

@@ -134,7 +127,7 @@
             List of example objects
         """
         params = {
-            "dataset_id": self.dataset_id,
+            "dataset": self.dataset_id,
             "limit": str(limit),
         }
         if self.organisation:
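For orientation, the renamed fetch methods would be called roughly like this (the constructor arguments are assumed, not taken from the diff):

    # Illustrative usage of the renamed fetch methods (constructor args assumed).
    runner = ExperimentRunner(dataset_id="ds_123")        # hypothetical construction
    examples = runner.get_examples_for_dataset(limit=50)  # was get_example_inputs() in 0.6.x
    one = runner.get_example(examples[0]["id"])           # new in 0.7.x: fetch a single example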
@@ -162,7 +155,6 @@
             experiment_setup: Optional setup for the experiment object. You may wish to set:
                 - name (recommended for labelling the experiment)
                 - parameters
-                - comparison_parameters

         Returns:
             The created experiment object
@@ -184,7 +176,7 @@
             "organisation": self.organisation,
             "dataset": self.dataset_id,
             "results": [],
-            "summary_results": {},
+            "summaries": {},
         }

         print(f"Creating experiment")
@@ -207,6 +199,7 @@
         example: Example,
         output: Any,
         result: Result,
+        trace_id: Optional[str] = None,
     ) -> Result:
         """
         Ask the server to score an example result. Stores the score for later summary calculation.
@@ -226,24 +219,20 @@
         if not example_id:
             raise ValueError("Example must have an 'id' field")
         if result is None:
-            example_id = example.get("id")
-            if not example_id:
-                raise ValueError("Example must have an 'id' field")
-            result = Result(exampleId=example_id, scores={}, messages={}, errors={})
+            result = {"example": example_id, "scores": {}, "messages": {}, "errors": {}}
         scores = result.get("scores") or {}
-
-

         print(f"Scoring and storing example: {example_id}")
         print(f"Scores: {scores}")

         # Run synchronous requests.post in a thread pool to avoid blocking
+        # Server expects output = raw output to score, not the result dict; scores keyed by metric name
         def _do_request():
             return requests.post(
                 f"{self.server_url}/experiment/{self.experiment_id}/example/{example_id}/scoreAndStore",
                 json={
-                    "output": result,
-                    "traceId": example.get("traceId"),
+                    "output": output,
+                    "trace": trace_id,
                     "scores": scores,
                 },
                 headers=self._get_headers(),
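The wire change here is easy to miss: 0.6.x posted the whole result dict as "output" and reused the example's stored trace ID; 0.7.x posts the raw engine output plus the trace ID of the run itself. A sketch of the new request body, with placeholder values:

    # Shape of the scoreAndStore request body in 0.7.x (placeholder values).
    payload = {
        "output": "raw engine output, exactly what call_my_code returned",
        "trace": "0af7651916cd43dd8448eb211c80319c",  # trace ID captured from the root run span
        "scores": {"accuracy": 0.9},                  # keyed by metric *name*, falling back to id
    }
    # 0.6.x by contrast sent {"output": <result dict>, "traceId": example.get("traceId"), ...}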
@@ -258,7 +247,6 @@
         print(f"scoreAndStore response: {json_result}")
         return json_result

-    @WithTracing(filter_input=_filter_input_for_run)
     async def run(
         self,
         call_my_code: CallMyCodeType,
@@ -271,19 +259,11 @@
             engine: Function that takes input, returns output (can be async)
             scorer: Optional function that scores the output given the example
         """
-        examples = self.get_example_inputs()
-
-        # Wrap engine to match run_example signature (input, parameters)
-        async def wrapped_engine(input_data, parameters):
-            result = call_my_code(input_data, parameters)
-            # Handle async functions
-            if hasattr(result, "__await__"):
-                result = await result
-            return result
+        examples = self.get_examples_for_dataset()

         for example in examples:
             try:
-                scores = await self.run_example(example, wrapped_engine, scorer_for_metric_id)
+                scores = await self.run_example(example, call_my_code, scorer_for_metric_id)
                 if scores:
                     self.scores.append(
                         {
@@ -296,7 +276,6 @@
                 print(f"Error processing example {example.get('id', 'unknown')}: {e}")
                 # Continue with next example instead of failing entire run

-    @WithTracing(filter_input=_filter_input_for_run_example)
     async def run_example(
         self,
         example: Example,
@@ -304,8 +283,10 @@
         scorer_for_metric_id: Optional[Dict[str, ScoreThisInputOutputMetricType]] = None,
     ) -> List[Result]:
         """
-        Run the engine on an example with the given parameters (looping over comparison parameters),
-        and score the result. Also calls scoreAndStore to store the result in the server.
+        Run the engine on an example with the experiment's parameters, score the result, and store it.
+
+        Spans: one root "RunExample" span (input, call_my_code, output) and one child "ScoreExample"
+        span for scoring, so the server sees a clear call_my_code vs scoring split (aligned with client-go).

         Args:
             example: The example to run. See Example.ts type
@@ -313,117 +294,94 @@
             scorer_for_metric_id: Optional dictionary of metric IDs to functions that score the output given the example and parameters

         Returns:
-            One set of scores for each comparison parameter set. If no comparison parameters,
-            returns an array of one.
+            List of one result (for API compatibility).
         """
-        # Ensure experiment exists
         if not self.experiment:
             self.create_experiment()
         if not self.experiment:
             raise Exception("Failed to create experiment")

-        # Make the parameters
-        parameters_fixed = self.experiment.get("parameters") or {}
-        # If comparison_parameters is empty/undefined, default to [{}] so we run at least once
-        parameters_loop = self.experiment.get("comparison_parameters") or [{}]
-
-        # Handle both spans array and input field
+        parameters_here = self.experiment.get("parameters") or {}
         input_data = example.get("input")
         if not input_data and example.get("spans") and len(example["spans"]) > 0:
             input_data = example["spans"][0].get("attributes", {}).get("input")
-
         if not input_data:
-            print(f"Warning: Example has no input field or spans with input attribute: {example}"
-            )
-            # Run engine anyway -- this could make sense if it's all about the parameters
+            print(f"Warning: Example has no input field or spans with input attribute: {example}")

-        # Set example.id on the root span (created by @WithTracing decorator)
-        # This ensures the root span from the trace has example=Example.id set
         example_id = example.get("id")
         if not example_id:
             raise ValueError("Example must have an 'id' field")
-        set_span_attribute("example", example_id)
-
-        all_scores: List[Dict[str, Any]] = []
-        dataset_metrics = self.get_dataset().get("metrics", [])
-        specific_metrics = example.get("metrics", [])
-        metrics = [*dataset_metrics, *specific_metrics]
-        # This loop should not be parallelized - it should run sequentially, one after the other
-        # to avoid creating interference between the runs.
-        for parameters in parameters_loop:
-            parameters_here = {**parameters_fixed, **parameters}
-            print(f"Running with parameters: {parameters_here}")
-
-            # Save original env var values for cleanup
-            original_env_vars: Dict[str, Optional[str]] = {}
-            # Set env vars from parameters_here
-            for key, value in parameters_here.items():
-                if value:
-                    original_env_vars[key] = os.environ.get(key)
-                    os.environ[key] = str(value)

-            try:
-                start = time.time() * 1000  # milliseconds
-                output = call_my_code(input_data, parameters_here)
+        print(f"Running with parameters: {parameters_here}")
+        original_env_vars: Dict[str, Optional[str]] = {}
+        for key, value in parameters_here.items():
+            if value:
+                original_env_vars[key] = os.environ.get(key)
+                os.environ[key] = str(value)
+        try:
+            start = time.time() * 1000
+
+            run_trace_id_ref: List[Optional[str]] = [None]
+
+            # Wrap engine to match run_example signature (input, parameters)
+            # Root span so server can find it by parent:unset; trace ID is sent to scoreAndStore
+            def set_trace_id(tid: Optional[str]) -> None:
+                run_trace_id_ref[0] = tid
+
+            @WithTracing(root=True)
+            async def wrapped_engine(input_data, parameters, set_trace_id: Callable[[Optional[str]], None]):
+                trace_id_here = get_active_trace_id()
+                set_trace_id(trace_id_here)
+                result = call_my_code(input_data, parameters)
                 # Handle async functions
-                if hasattr(output, "__await__"):
-                    output = await output
-                end = time.time() * 1000  # milliseconds
-                duration = int(end - start)
-
-                print(f"Output: {output}")
-                # Score it
-                result = Result(exampleId=example_id, scores={}, messages={}, errors={})
-                for metric in metrics:
-                    metric_id = metric.get("id")
-                    if not metric_id:
-                        print(f"Warning: Metric missing 'id' field, skipping: {metric}")
-                        continue
-                    scorer = scorer_for_metric_id.get(metric_id) if scorer_for_metric_id else None
-                    if scorer:
-                        metric_result = await scorer(input_data, output, metric)
-                    elif metric.get("type") == "llm":
-                        metric_result = await self._score_llm_metric(input_data, output, example, metric)
-                    else:
-                        metric_type = metric.get("type", "unknown")
-                        print(f"Skipping metric: {metric_id} {metric_type} - no scorer")
-                        continue
-
-                    # Handle None metric_result (e.g., if scoring failed)
-                    if not metric_result:
-                        print(f"Warning: Metric {metric_id} returned None result, skipping")
-                        result["errors"][metric_id] = "Scoring function returned None"
-                        continue
-
-                    result["scores"][metric_id] = metric_result.get("score")
-                    result["messages"][metric_id] = metric_result.get("message")
-                    result["errors"][metric_id] = metric_result.get("error")
-                # Always add duration to scores as a system metric
-                result["scores"]["duration"] = duration
-
-                # Flush spans before scoreAndStore to ensure they're indexed in ES
-                # This prevents race condition where scoreAndStore looks up spans before they're indexed
-                await flush_tracing()
-
-                print(f"Call scoreAndStore ... for example: {example_id} with scores: {result['scores']}")
-                result = await self.score_and_store(example, output, result)
-                print(f"scoreAndStore returned: {result}")
-                all_scores.append(result)
-            finally:
-                # Restore original env var values
-                for key, original_value in original_env_vars.items():
-                    if original_value is None:
-                        # Variable didn't exist before, remove it
-                        os.environ.pop(key, None)
-                    else:
-                        # Restore original value
-                        os.environ[key] = original_value
-
-        return all_scores
-
-    def get_summary_results(self) -> Dict[str, Any]:
+                if hasattr(result, "__await__"):
+                    result = await result
+                return result
+
+            output = wrapped_engine(input_data, parameters_here, set_trace_id)
+            if hasattr(output, "__await__"):
+                output = await output
+            duration = int((time.time() * 1000) - start)
+            print(f"Output: {output}")
+
+            dataset_metrics = self.get_dataset().get("metrics", [])
+            specific_metrics = example.get("metrics", [])
+            metrics = [*dataset_metrics, *specific_metrics]
+            result: Result = {"example": example_id, "scores": {}, "messages": {}, "errors": {}}
+            for metric in metrics:
+                metric_id = metric.get("id")
+                score_key = _metric_score_key(metric)
+                if not metric_id or not score_key:
+                    continue
+                scorer = scorer_for_metric_id.get(metric_id) if scorer_for_metric_id else None
+                if scorer:
+                    metric_result = await scorer(input_data, output, metric)
+                elif metric.get("type") == "llm":
+                    metric_result = await self._score_llm_metric(input_data, output, example, metric)
+                else:
+                    continue
+                if not metric_result:
+                    result["errors"][score_key] = "Scoring function returned None"
+                    continue
+                result["scores"][score_key] = metric_result.get("score")
+                result["messages"][score_key] = metric_result.get("message")
+                result["errors"][score_key] = metric_result.get("error")
+            result["scores"]["duration"] = duration
+            await flush_tracing()
+            print(f"Call scoreAndStore ... for example: {example_id} with scores: {result['scores']}")
+            result = await self.score_and_store(example, output, result, trace_id=run_trace_id_ref[0])
+            print(f"scoreAndStore returned: {result}")
+            return [result]
+        finally:
+            for key, original_value in original_env_vars.items():
+                if original_value is None:
+                    os.environ.pop(key, None)
+                else:
+                    os.environ[key] = original_value
+
+    def get_summaries(self) -> Dict[str, Any]:
         """
-        Get summary results from the experiment.
+        Get summaries from the experiment.

         Returns:
             Dictionary of metric names to summary statistics
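Taken together, the 0.7.x flow is: fetch examples, run call_my_code once per example under a root span, score each metric, then post the scores along with the run's trace ID. A sketch of end-to-end usage (constructor arguments, metric id, and scorer are invented for illustration):

    # End-to-end usage sketch for 0.7.x (dataset id, metric id and scorer are invented).
    import asyncio
    from aiqa.experiment_runner import ExperimentRunner

    async def my_engine(input_data, parameters):
        # call_my_code: takes (input, parameters); may be sync or async
        return f"echo: {input_data}"

    async def exact_match(input_data, output, metric):
        # Custom scorer for one metric id; returns a MetricResult-shaped dict
        return {"score": 1.0 if "echo" in str(output) else 0.0, "message": None, "error": None}

    async def main():
        runner = ExperimentRunner(dataset_id="ds_demo")      # construction args assumed
        await runner.run(my_engine, {"metric_exact": exact_match})
        print(runner.get_summaries())                        # was get_summary_results() in 0.6.x

    asyncio.run(main())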
@@ -435,12 +393,12 @@
             f"{self.server_url}/experiment/{self.experiment_id}",
             headers=self._get_headers(),
         )
-
+
         if not response.ok:
             raise Exception(format_http_error(response, "fetch summary results"))

         experiment2 = response.json()
-        return experiment2.get("summary_results", {})
+        return experiment2.get("summaries", {})

     async def _score_llm_metric(
         self,
@@ -471,7 +429,8 @@
             model_id, self.server_url, self._get_headers()
         )
         if model_data:
-            api_key = model_data.get("api_key")
+            # Server returns 'apiKey' (camelCase)
+            api_key = model_data.get("apiKey")
             # If provider not set in metric, try to get it from model
             if not provider and model_data.get("provider"):
                 provider = model_data.get("provider")
aiqa/llm_as_judge.py
@@ -52,14 +52,15 @@ async def get_model_from_server(
     try:
         def _do_request():
             return requests.get(
-                f"{server_url}/model/{model_id}?fields=api_key",
+                f"{server_url}/model/{model_id}?fields=apiKey",  # Server uses camelCase 'apiKey' (also accepts 'api_key')
                 headers=headers,
            )

         response = await asyncio.to_thread(_do_request)
         if response.ok:
             model = response.json()
-            if model.get("api_key"):
+            # Server returns 'apiKey' (camelCase)
+            if model.get("apiKey"):
                 return model
         return None
     except Exception as e:
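Given that the server now returns apiKey in camelCase (and, per the comment above, still accepts api_key), client code that must tolerate both spellings could read the field defensively. A sketch, not the package's code:

    # Sketch: tolerate either field spelling when reading a model record.
    def extract_api_key(model: dict):
        # 0.7.x server responses use camelCase; older records might carry snake_case
        return model.get("apiKey") or model.get("api_key")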
aiqa/object_serialiser.py
@@ -25,7 +25,7 @@ def sanitize_string_for_utf8(text: str) -> str:
     Returns:
         A string with surrogate characters replaced by the Unicode replacement character (U+FFFD)
     """
-    if text == None:
+    if text is None:
         return None
     if not isinstance(text, str):  # paranoia
         text = str(text)
@@ -43,7 +43,10 @@ def toNumber(value: str|int|None) -> int:
     if value is None:
         return 0
     if isinstance(value, int):
-        return value
+        return value
+    # Convert to string if not already
+    if not isinstance(value, str):
+        value = str(value)
     if value.endswith("b"):  # drop the b
         value = value[:-1]
     if value.endswith("g"):