aiqa-client 0.5.2__tar.gz → 0.6.1__tar.gz

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (32)
  1. {aiqa_client-0.5.2/aiqa_client.egg-info → aiqa_client-0.6.1}/PKG-INFO +1 -1
  2. {aiqa_client-0.5.2 → aiqa_client-0.6.1}/aiqa/__init__.py +8 -2
  3. {aiqa_client-0.5.2 → aiqa_client-0.6.1}/aiqa/client.py +17 -2
  4. {aiqa_client-0.5.2 → aiqa_client-0.6.1}/aiqa/constants.py +1 -1
  5. aiqa_client-0.6.1/aiqa/experiment_runner.py +490 -0
  6. aiqa_client-0.6.1/aiqa/llm_as_judge.py +281 -0
  7. aiqa_client-0.6.1/aiqa/span_helpers.py +511 -0
  8. {aiqa_client-0.5.2 → aiqa_client-0.6.1}/aiqa/tracing.py +169 -561
  9. {aiqa_client-0.5.2 → aiqa_client-0.6.1}/aiqa/tracing_llm_utils.py +20 -9
  10. aiqa_client-0.6.1/aiqa/types.py +61 -0
  11. {aiqa_client-0.5.2 → aiqa_client-0.6.1/aiqa_client.egg-info}/PKG-INFO +1 -1
  12. {aiqa_client-0.5.2 → aiqa_client-0.6.1}/aiqa_client.egg-info/SOURCES.txt +6 -1
  13. {aiqa_client-0.5.2 → aiqa_client-0.6.1}/pyproject.toml +1 -1
  14. aiqa_client-0.6.1/tests/test_chatbot.py +87 -0
  15. {aiqa_client-0.5.2 → aiqa_client-0.6.1}/tests/test_integration.py +42 -5
  16. aiqa_client-0.5.2/tests/test_api_key.py → aiqa_client-0.6.1/tests/test_integration_api_key.py +5 -5
  17. aiqa_client-0.6.1/tests/test_span_helpers.py +345 -0
  18. aiqa_client-0.6.1/tests/test_tracing.py +798 -0
  19. aiqa_client-0.5.2/aiqa/experiment_runner.py +0 -319
  20. aiqa_client-0.5.2/tests/test_tracing.py +0 -413
  21. {aiqa_client-0.5.2 → aiqa_client-0.6.1}/LICENSE.txt +0 -0
  22. {aiqa_client-0.5.2 → aiqa_client-0.6.1}/MANIFEST.in +0 -0
  23. {aiqa_client-0.5.2 → aiqa_client-0.6.1}/README.md +0 -0
  24. {aiqa_client-0.5.2 → aiqa_client-0.6.1}/aiqa/http_utils.py +0 -0
  25. {aiqa_client-0.5.2 → aiqa_client-0.6.1}/aiqa/object_serialiser.py +0 -0
  26. {aiqa_client-0.5.2 → aiqa_client-0.6.1}/aiqa/py.typed +0 -0
  27. {aiqa_client-0.5.2 → aiqa_client-0.6.1}/aiqa_client.egg-info/dependency_links.txt +0 -0
  28. {aiqa_client-0.5.2 → aiqa_client-0.6.1}/aiqa_client.egg-info/requires.txt +0 -0
  29. {aiqa_client-0.5.2 → aiqa_client-0.6.1}/aiqa_client.egg-info/top_level.txt +0 -0
  30. {aiqa_client-0.5.2 → aiqa_client-0.6.1}/setup.cfg +0 -0
  31. {aiqa_client-0.5.2 → aiqa_client-0.6.1}/tests/test_object_serialiser.py +0 -0
  32. {aiqa_client-0.5.2 → aiqa_client-0.6.1}/tests/test_startup_reliability.py +0 -0
@@ -1,6 +1,6 @@
  Metadata-Version: 2.4
  Name: aiqa-client
- Version: 0.5.2
+ Version: 0.6.1
  Summary: OpenTelemetry-based Python client for tracing functions and sending traces to the AIQA server
  Author-email: AIQA <info@aiqa.dev>
  License: MIT
@@ -26,8 +26,8 @@ Example:
      result = my_function()
  """

- from .tracing import (
-     WithTracing,
+ from .tracing import WithTracing
+ from .span_helpers import (
      flush_tracing,
      set_span_attribute,
      set_span_name,
@@ -39,7 +39,10 @@ from .tracing import (
      extract_trace_context,
      set_conversation_id,
      set_component_tag,
+     set_token_usage,
+     set_provider_and_model,
      get_span,
+     submit_feedback,
  )
  from .client import get_aiqa_client
  from .experiment_runner import ExperimentRunner
@@ -60,7 +63,10 @@ __all__ = [
      "extract_trace_context",
      "set_conversation_id",
      "set_component_tag",
+     "set_token_usage",
+     "set_provider_and_model",
      "get_span",
+     "submit_feedback",
      "VERSION",
  ]

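The split above keeps the public surface on the top-level package: user code still imports from `aiqa`, while the helpers now live in `span_helpers`. A minimal sketch of the 0.6.1 additions, assuming the decorator takes no required arguments and guessing plausible parameter names (none of these signatures appear in this diff):

import aiqa

@aiqa.WithTracing()  # decorator-factory form, as used in experiment_runner.py below; no-arg call is an assumption
def answer(question: str) -> str:
    # Argument names/order for these two helpers are assumptions, not shown in this diff
    aiqa.set_provider_and_model("openai", "gpt-4o")
    aiqa.set_token_usage(prompt_tokens=12, completion_tokens=34)
    return "42"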
@@ -5,8 +5,10 @@ from functools import lru_cache
  from typing import Optional, TYPE_CHECKING, Any, Dict
  from opentelemetry import trace
  from opentelemetry.sdk.trace import TracerProvider
- from opentelemetry.sdk.trace.export import BatchSpanProcessor
+ from opentelemetry.sdk.trace.export import BatchSpanProcessor, SpanExporter, SpanExportResult, SpanExporter as SpanExporterBase
  from opentelemetry.exporter.otlp.proto.http.trace_exporter import OTLPSpanExporter
+ from opentelemetry.sdk.trace import ReadableSpan
+ from opentelemetry.trace import SpanContext
  import requests

  from .constants import AIQA_TRACER_NAME, LOG_TAG
@@ -254,11 +256,24 @@ def _attach_aiqa_processor(provider: TracerProvider) -> None:
      else:
          endpoint = f"{base_url}/v1/traces"

-     # Create OTLP exporter with authentication headers only
+     # Get timeout from environment variable (in seconds)
+     # Supports OTEL_EXPORTER_OTLP_TIMEOUT (standard) or AIQA_EXPORT_TIMEOUT (custom)
+     # Default is 30 seconds (more generous than OTLP default of 10s)
+     timeout = 30.0
+     otlp_timeout = os.getenv("OTEL_EXPORTER_OTLP_TIMEOUT")
+
+     if otlp_timeout:
+         try:
+             timeout = float(otlp_timeout)
+         except ValueError:
+             logger.warning(f"Invalid OTEL_EXPORTER_OTLP_TIMEOUT value '{otlp_timeout}', using default 30.0")
+
+     # Create OTLP exporter with authentication headers and timeout
      # The exporter will set Content-Type and other headers automatically
      exporter = OTLPSpanExporter(
          endpoint=endpoint,
          headers=auth_headers if auth_headers else None,
+         timeout=timeout,
      )

      provider.add_span_processor(BatchSpanProcessor(exporter))
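Note that the hunk above reads only OTEL_EXPORTER_OTLP_TIMEOUT; the AIQA_EXPORT_TIMEOUT named in its comment is not consulted by this code. A minimal configuration sketch, assuming `get_aiqa_client()` is callable with no arguments:

import os

# Value is in seconds; a missing or non-numeric value falls back to 30.0
os.environ["OTEL_EXPORTER_OTLP_TIMEOUT"] = "60"

from aiqa import get_aiqa_client  # import after setting the env var

client = get_aiqa_client()  # assumed zero-argument call; the exporter now uses a 60 s timeout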
@@ -3,6 +3,6 @@ Constants used across the AIQA client package.
  """

  AIQA_TRACER_NAME = "aiqa-tracer"
- VERSION = "0.5.2" # automatically updated by set-version-json.sh
+ VERSION = "0.6.1" # automatically updated by set-version-json.sh

  LOG_TAG = "AIQA" # Used in all logging output to identify AIQA messages
@@ -0,0 +1,490 @@
+ """
+ ExperimentRunner - runs experiments on datasets and scores results
+ """
+
+ import os
+ import time
+ import asyncio
+ from .constants import LOG_TAG
+ from .http_utils import build_headers, get_server_url, get_api_key, format_http_error
+ from typing import Any, Dict, List, Optional, Callable, Awaitable, Union
+ from .tracing import WithTracing
+ from .span_helpers import set_span_attribute, flush_tracing
+ from .llm_as_judge import score_llm_metric_local, get_model_from_server, call_llm_fallback
+ import requests
+ from .types import MetricResult, ScoreThisInputOutputMetricType, Example, Result, Metric, CallLLMType
+
+ # Type aliases for engine/scoring functions to improve code completion and clarity
+ from typing import TypedDict
+
+ # Function that processes input and parameters to produce an output (sync or async)
+ CallMyCodeType = Callable[[Any, Dict[str, Any]], Union[Any, Awaitable[Any]]]
+
+ # Function that scores a given output, using input, example, and parameters (usually async)
+ # Returns a dictionary with score/message/etc.
+ ScoreThisOutputType = Callable[[Any, Any, Dict[str, Any], Dict[str, Any]], Awaitable[Dict[str, Any]]]
+
+
+
29
+ def _filter_input_for_run(input_data: Any) -> Dict[str, Any]:
+     """Tracing: filter input - drop most, keep just the ids."""
+     if not isinstance(input_data, dict):
+         return {}
+     self_obj = input_data.get("self")
+     if not self_obj:
+         return {}
+     return {
+         "dataset": getattr(self_obj, "dataset_id", None),
+         "experiment": getattr(self_obj, "experiment_id", None),
+     }
+
+
+ def _filter_input_for_run_example(
+     self: "ExperimentRunner",
+     example: Dict[str, Any],
+     call_my_code: Any = None,
+     score_this_output: Any = None,
+ ) -> Dict[str, Any]:
+     """Filter input for the run_example method to extract dataset, experiment, and example IDs."""
+     result = _filter_input_for_run({"self": self})
+     if isinstance(example, dict):
+         result["example"] = example.get("id")
+     return result
+
+
55
+ class ExperimentRunner:
+     """
+     The ExperimentRunner is the main class for running experiments on datasets.
+     It can create an experiment, run it, and score the results.
+     Handles setting up environment variables and passing parameters to the engine function.
+     """
+
+     def __init__(
+         self,
+         dataset_id: str,
+         experiment_id: Optional[str] = None,
+         server_url: Optional[str] = None,
+         api_key: Optional[str] = None,
+         organisation_id: Optional[str] = None,
+         llm_call_fn: Optional[CallLLMType] = None,
+     ):
+         """
+         Initialize the ExperimentRunner.
+
+         Args:
+             dataset_id: ID of the dataset to run experiments on
+             experiment_id: Usually unset, and a fresh experiment is created with a random ID
+             server_url: URL of the AIQA server (defaults to AIQA_SERVER_URL env var)
+             api_key: API key for authentication (defaults to AIQA_API_KEY env var)
+             organisation_id: Optional organisation ID for the experiment. If not provided, will be
+                 derived from the dataset when needed.
+             llm_call_fn: Optional async function that takes (system_prompt, user_message) and returns
+                 raw content string (typically JSON). If not provided, will check for OPENAI_API_KEY
+                 or ANTHROPIC_API_KEY environment variables.
+         """
+         self.dataset_id = dataset_id
+         self.experiment_id = experiment_id
+         self.server_url = get_server_url(server_url)
+         self.api_key = get_api_key(api_key)
+         self.organisation = organisation_id
+         self.experiment: Optional[Dict[str, Any]] = None
+         self.scores: List[Dict[str, Any]] = []
+         self.llm_call_fn = llm_call_fn
+         self._dataset_cache: Optional[Dict[str, Any]] = None
+
+     def _get_headers(self) -> Dict[str, str]:
+         """Build HTTP headers for API requests."""
+         return build_headers(self.api_key)
98
+
+     def get_dataset(self) -> Dict[str, Any]:
+         """
+         Fetch the dataset to get its metrics.
+
+         Returns:
+             The dataset object with metrics and other information
+         """
+         if self._dataset_cache is not None:
+             return self._dataset_cache
+
+         response = requests.get(
+             f"{self.server_url}/dataset/{self.dataset_id}",
+             headers=self._get_headers(),
+         )
+
+         if not response.ok:
+             raise Exception(format_http_error(response, "fetch dataset"))
+
+         dataset = response.json()
+         self._dataset_cache = dataset
+
+         # If organisation_id wasn't set, derive it from the dataset
+         if not self.organisation and dataset.get("organisation"):
+             self.organisation = dataset.get("organisation")
+
+         return dataset
+
+     def get_example_inputs(self, limit: int = 10000) -> List[Dict[str, Any]]:
+         """
+         Fetch example inputs from the dataset.
+
+         Args:
+             limit: Maximum number of examples to fetch (default: 10000)
+
+         Returns:
+             List of example objects
+         """
+         params = {
+             "dataset_id": self.dataset_id,
+             "limit": str(limit),
+         }
+         if self.organisation:
+             params["organisation"] = self.organisation
+
+         response = requests.get(
+             f"{self.server_url}/example",
+             params=params,
+             headers=self._get_headers(),
+         )
+
+         if not response.ok:
+             raise Exception(format_http_error(response, "fetch example inputs"))
+
+         data = response.json()
+         return data.get("hits", [])
+
+     def create_experiment(
+         self, experiment_setup: Optional[Dict[str, Any]] = None
+     ) -> Dict[str, Any]:
+         """
+         Create an experiment if one does not exist.
+
+         Args:
+             experiment_setup: Optional setup for the experiment object. You may wish to set:
+                 - name (recommended for labelling the experiment)
+                 - parameters
+                 - comparison_parameters
+
+         Returns:
+             The created experiment object
+         """
+         # Ensure we have the organisation ID - try to get it from the dataset if not set
+         if not self.organisation:
+             dataset = self.get_dataset()
+             self.organisation = dataset.get("organisation")
+
+         if not self.organisation or not self.dataset_id:
+             raise Exception("Organisation and dataset ID are required to create an experiment. Organisation can be derived from the dataset or set via organisation_id parameter.")
+
+         if not experiment_setup:
+             experiment_setup = {}
+
+         # Fill in if not set
+         experiment_setup = {
+             **experiment_setup,
+             "organisation": self.organisation,
+             "dataset": self.dataset_id,
+             "results": [],
+             "summary_results": {},
+         }
+
+         print(f"Creating experiment")
+         response = requests.post(
+             f"{self.server_url}/experiment",
+             json=experiment_setup,
+             headers=self._get_headers(),
+         )
+
+         if not response.ok:
+             raise Exception(format_http_error(response, "create experiment"))
+
+         experiment = response.json()
+         self.experiment_id = experiment["id"]
+         self.experiment = experiment
+         return experiment
204
+
+     async def score_and_store(
+         self,
+         example: Example,
+         output: Any,
+         result: Result,
+     ) -> Result:
+         """
+         Ask the server to score an example result. Stores the score for later summary calculation.
+
+         Args:
+             example: The example object
+             output: The output from running the engine on the example
+             result: The result object for locally calculated scores
+
+         Returns:
+             The score result from the server
+         """
+         # Do we have an experiment ID? If not, we need to create the experiment first
+         if not self.experiment_id:
+             self.create_experiment()
+         example_id = example.get("id")
+         if not example_id:
+             raise ValueError("Example must have an 'id' field")
+         if result is None:
+             example_id = example.get("id")
+             if not example_id:
+                 raise ValueError("Example must have an 'id' field")
+             result = Result(exampleId=example_id, scores={}, messages={}, errors={})
+         scores = result.get("scores") or {}
+
+
+
+         print(f"Scoring and storing example: {example_id}")
+         print(f"Scores: {scores}")
+
+         # Run synchronous requests.post in a thread pool to avoid blocking
+         def _do_request():
+             return requests.post(
+                 f"{self.server_url}/experiment/{self.experiment_id}/example/{example_id}/scoreAndStore",
+                 json={
+                     "output": result,
+                     "traceId": example.get("traceId"),
+                     "scores": scores,
+                 },
+                 headers=self._get_headers(),
+             )
+
+         response = await asyncio.to_thread(_do_request)
+
+         if not response.ok:
+             raise Exception(format_http_error(response, "score and store"))
+
+         json_result = response.json()
+         print(f"scoreAndStore response: {json_result}")
+         return json_result
260
+
+     @WithTracing(filter_input=_filter_input_for_run)
+     async def run(
+         self,
+         call_my_code: CallMyCodeType,
+         scorer_for_metric_id: Optional[Dict[str, ScoreThisInputOutputMetricType]] = None,
+     ) -> None:
+         """
+         Run an engine function on all examples and score the results.
+
+         Args:
+             call_my_code: Function that takes input and parameters, returns output (can be async)
+             scorer_for_metric_id: Optional mapping of metric IDs to functions that score the output
+         """
+         examples = self.get_example_inputs()
+
+         # Wrap engine to match run_example signature (input, parameters)
+         async def wrapped_engine(input_data, parameters):
+             result = call_my_code(input_data, parameters)
+             # Handle async functions
+             if hasattr(result, "__await__"):
+                 result = await result
+             return result
+
+         for example in examples:
+             try:
+                 scores = await self.run_example(example, wrapped_engine, scorer_for_metric_id)
+                 if scores:
+                     self.scores.append(
+                         {
+                             "example": example,
+                             "result": scores,
+                             "scores": scores,
+                         }
+                     )
+             except Exception as e:
+                 print(f"Error processing example {example.get('id', 'unknown')}: {e}")
+                 # Continue with next example instead of failing entire run
+
299
+     @WithTracing(filter_input=_filter_input_for_run_example)
+     async def run_example(
+         self,
+         example: Example,
+         call_my_code: CallMyCodeType,
+         scorer_for_metric_id: Optional[Dict[str, ScoreThisInputOutputMetricType]] = None,
+     ) -> List[Result]:
+         """
+         Run the engine on an example with the given parameters (looping over comparison parameters),
+         and score the result. Also calls scoreAndStore to store the result in the server.
+
+         Args:
+             example: The example to run. See Example.ts type
+             call_my_code: Function that takes input and parameters, returns output (can be async)
+             scorer_for_metric_id: Optional dictionary of metric IDs to functions that score the output given the example and parameters
+
+         Returns:
+             One set of scores for each comparison parameter set. If no comparison parameters,
+             returns an array of one.
+         """
+         # Ensure experiment exists
+         if not self.experiment:
+             self.create_experiment()
+             if not self.experiment:
+                 raise Exception("Failed to create experiment")
+
+         # Make the parameters
+         parameters_fixed = self.experiment.get("parameters") or {}
+         # If comparison_parameters is empty/undefined, default to [{}] so we run at least once
+         parameters_loop = self.experiment.get("comparison_parameters") or [{}]
+
+         # Handle both spans array and input field
+         input_data = example.get("input")
+         if not input_data and example.get("spans") and len(example["spans"]) > 0:
+             input_data = example["spans"][0].get("attributes", {}).get("input")
+
+         if not input_data:
+             print(f"Warning: Example has no input field or spans with input attribute: {example}"
+             )
+             # Run engine anyway -- this could make sense if it's all about the parameters
+
+         # Set example.id on the root span (created by @WithTracing decorator)
+         # This ensures the root span from the trace has example=Example.id set
+         example_id = example.get("id")
+         if not example_id:
+             raise ValueError("Example must have an 'id' field")
+         set_span_attribute("example", example_id)
+
+         all_scores: List[Dict[str, Any]] = []
+         dataset_metrics = self.get_dataset().get("metrics", [])
+         specific_metrics = example.get("metrics", [])
+         metrics = [*dataset_metrics, *specific_metrics]
+         # This loop should not be parallelized - it should run sequentially, one after the other
+         # to avoid creating interference between the runs.
+         for parameters in parameters_loop:
+             parameters_here = {**parameters_fixed, **parameters}
+             print(f"Running with parameters: {parameters_here}")
+
+             # Save original env var values for cleanup
+             original_env_vars: Dict[str, Optional[str]] = {}
+             # Set env vars from parameters_here
+             for key, value in parameters_here.items():
+                 if value:
+                     original_env_vars[key] = os.environ.get(key)
+                     os.environ[key] = str(value)
+
+             try:
+                 start = time.time() * 1000  # milliseconds
+                 output = call_my_code(input_data, parameters_here)
+                 # Handle async functions
+                 if hasattr(output, "__await__"):
+                     output = await output
+                 end = time.time() * 1000  # milliseconds
+                 duration = int(end - start)
+
+                 print(f"Output: {output}")
+                 # Score it
+                 result = Result(exampleId=example_id, scores={}, messages={}, errors={})
+                 for metric in metrics:
+                     metric_id = metric.get("id")
+                     if not metric_id:
+                         print(f"Warning: Metric missing 'id' field, skipping: {metric}")
+                         continue
+                     scorer = scorer_for_metric_id.get(metric_id) if scorer_for_metric_id else None
+                     if scorer:
+                         metric_result = await scorer(input_data, output, metric)
+                     elif metric.get("type") == "llm":
+                         metric_result = await self._score_llm_metric(input_data, output, example, metric)
+                     else:
+                         metric_type = metric.get("type", "unknown")
+                         print(f"Skipping metric: {metric_id} {metric_type} - no scorer")
+                         continue
+
+                     # Handle None metric_result (e.g., if scoring failed)
+                     if not metric_result:
+                         print(f"Warning: Metric {metric_id} returned None result, skipping")
+                         result["errors"][metric_id] = "Scoring function returned None"
+                         continue
+
+                     result["scores"][metric_id] = metric_result.get("score")
+                     result["messages"][metric_id] = metric_result.get("message")
+                     result["errors"][metric_id] = metric_result.get("error")
+                 # Always add duration to scores as a system metric
+                 result["scores"]["duration"] = duration
+
+                 # Flush spans before scoreAndStore to ensure they're indexed in ES
+                 # This prevents race condition where scoreAndStore looks up spans before they're indexed
+                 await flush_tracing()
+
+                 print(f"Call scoreAndStore ... for example: {example_id} with scores: {result['scores']}")
+                 result = await self.score_and_store(example, output, result)
+                 print(f"scoreAndStore returned: {result}")
+                 all_scores.append(result)
+             finally:
+                 # Restore original env var values
+                 for key, original_value in original_env_vars.items():
+                     if original_value is None:
+                         # Variable didn't exist before, remove it
+                         os.environ.pop(key, None)
+                     else:
+                         # Restore original value
+                         os.environ[key] = original_value
+
+         return all_scores
+
424
+     def get_summary_results(self) -> Dict[str, Any]:
+         """
+         Get summary results from the experiment.
+
+         Returns:
+             Dictionary of metric names to summary statistics
+         """
+         if not self.experiment_id:
+             raise ValueError("No experiment ID available. Create an experiment first.")
+
+         response = requests.get(
+             f"{self.server_url}/experiment/{self.experiment_id}",
+             headers=self._get_headers(),
+         )
+
+         if not response.ok:
+             raise Exception(format_http_error(response, "fetch summary results"))
+
+         experiment2 = response.json()
+         return experiment2.get("summary_results", {})
+
+     async def _score_llm_metric(
+         self,
+         input_data: Any,
+         output: Any,
+         example: Example,
+         metric: Metric,
+     ) -> MetricResult:
+         """
+         Score an LLM metric by fetching model API key from server if needed.
+
+         Args:
+             input_data: The input data to score
+             output: The output to score
+             example: The example object
+             metric: The metric definition
+
+         Returns:
+             MetricResult object with score:[0,1], message (optional), and error (optional)
+         """
+         # If model is specified, try to fetch API key from server
+         model_id = metric.get("model")
+         api_key = None
+         provider = metric.get("provider")
+
+         if model_id:
+             model_data = await get_model_from_server(
+                 model_id, self.server_url, self._get_headers()
+             )
+             if model_data:
+                 api_key = model_data.get("api_key")
+                 # If provider not set in metric, try to get it from model
+                 if not provider and model_data.get("provider"):
+                     provider = model_data.get("provider")
+
+         # Create a custom llm_call_fn if we have an API key from the model
+         llm_call_fn = self.llm_call_fn
+         if api_key and not llm_call_fn:
+             async def _model_llm_call(system_prompt: str, user_message: str) -> str:
+                 return await call_llm_fallback(system_prompt, user_message, api_key, provider)
+             llm_call_fn = _model_llm_call
+
+         return await score_llm_metric_local(
+             input_data, output, example, metric, llm_call_fn
+         )
+
+
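Taken together, the new ExperimentRunner can be driven with a few lines. A minimal usage sketch, assuming AIQA_SERVER_URL and AIQA_API_KEY are set in the environment; the dataset and metric IDs below are hypothetical placeholders:

import asyncio
from aiqa import ExperimentRunner

async def call_my_code(input_data, parameters):
    # The system under test: receives an example's input plus the merged
    # experiment parameters, and returns the output to be scored
    return {"answer": str(input_data)}

async def exact_match(input_data, output, metric):
    # Custom scorer for one metric ID; per run_example above it is awaited
    # with (input, output, metric) and should return score/message/error keys
    return {"score": 1.0 if output else 0.0, "message": "non-empty output"}

async def main():
    runner = ExperimentRunner(dataset_id="my-dataset-id")  # hypothetical ID
    runner.create_experiment({"name": "demo run"})
    await runner.run(call_my_code, scorer_for_metric_id={"my-metric-id": exact_match})  # hypothetical metric ID
    print(runner.get_summary_results())

asyncio.run(main())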