aiqa-client 0.6.1__tar.gz → 0.7.2__tar.gz
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- {aiqa_client-0.6.1/aiqa_client.egg-info → aiqa_client-0.7.2}/PKG-INFO +1 -1
- {aiqa_client-0.6.1 → aiqa_client-0.7.2}/aiqa/client.py +74 -4
- {aiqa_client-0.6.1 → aiqa_client-0.7.2}/aiqa/constants.py +1 -1
- {aiqa_client-0.6.1 → aiqa_client-0.7.2}/aiqa/experiment_runner.py +108 -149
- {aiqa_client-0.6.1 → aiqa_client-0.7.2}/aiqa/llm_as_judge.py +3 -2
- {aiqa_client-0.6.1 → aiqa_client-0.7.2}/aiqa/object_serialiser.py +5 -2
- {aiqa_client-0.6.1 → aiqa_client-0.7.2}/aiqa/tracing.py +124 -39
- {aiqa_client-0.6.1 → aiqa_client-0.7.2}/aiqa/types.py +1 -1
- {aiqa_client-0.6.1 → aiqa_client-0.7.2/aiqa_client.egg-info}/PKG-INFO +1 -1
- {aiqa_client-0.6.1 → aiqa_client-0.7.2}/pyproject.toml +1 -1
- {aiqa_client-0.6.1 → aiqa_client-0.7.2}/tests/test_tracing.py +365 -1
- {aiqa_client-0.6.1 → aiqa_client-0.7.2}/LICENSE.txt +0 -0
- {aiqa_client-0.6.1 → aiqa_client-0.7.2}/MANIFEST.in +0 -0
- {aiqa_client-0.6.1 → aiqa_client-0.7.2}/README.md +0 -0
- {aiqa_client-0.6.1 → aiqa_client-0.7.2}/aiqa/__init__.py +0 -0
- {aiqa_client-0.6.1 → aiqa_client-0.7.2}/aiqa/http_utils.py +0 -0
- {aiqa_client-0.6.1 → aiqa_client-0.7.2}/aiqa/py.typed +0 -0
- {aiqa_client-0.6.1 → aiqa_client-0.7.2}/aiqa/span_helpers.py +0 -0
- {aiqa_client-0.6.1 → aiqa_client-0.7.2}/aiqa/tracing_llm_utils.py +0 -0
- {aiqa_client-0.6.1 → aiqa_client-0.7.2}/aiqa_client.egg-info/SOURCES.txt +0 -0
- {aiqa_client-0.6.1 → aiqa_client-0.7.2}/aiqa_client.egg-info/dependency_links.txt +0 -0
- {aiqa_client-0.6.1 → aiqa_client-0.7.2}/aiqa_client.egg-info/requires.txt +0 -0
- {aiqa_client-0.6.1 → aiqa_client-0.7.2}/aiqa_client.egg-info/top_level.txt +0 -0
- {aiqa_client-0.6.1 → aiqa_client-0.7.2}/setup.cfg +0 -0
- {aiqa_client-0.6.1 → aiqa_client-0.7.2}/tests/test_chatbot.py +0 -0
- {aiqa_client-0.6.1 → aiqa_client-0.7.2}/tests/test_integration.py +0 -0
- {aiqa_client-0.6.1 → aiqa_client-0.7.2}/tests/test_integration_api_key.py +0 -0
- {aiqa_client-0.6.1 → aiqa_client-0.7.2}/tests/test_object_serialiser.py +0 -0
- {aiqa_client-0.6.1 → aiqa_client-0.7.2}/tests/test_span_helpers.py +0 -0
- {aiqa_client-0.6.1 → aiqa_client-0.7.2}/tests/test_startup_reliability.py +0 -0
aiqa/client.py:

```diff
@@ -2,10 +2,10 @@
 import os
 import logging
 from functools import lru_cache
-from typing import Optional, TYPE_CHECKING, Any, Dict
+from typing import Optional, TYPE_CHECKING, Any, Dict, List
 from opentelemetry import trace
 from opentelemetry.sdk.trace import TracerProvider
-from opentelemetry.sdk.trace.export import BatchSpanProcessor, SpanExporter, SpanExportResult
+from opentelemetry.sdk.trace.export import BatchSpanProcessor, SpanExporter, SpanExportResult
 from opentelemetry.exporter.otlp.proto.http.trace_exporter import OTLPSpanExporter
 from opentelemetry.sdk.trace import ReadableSpan
 from opentelemetry.trace import SpanContext
```
```diff
@@ -52,6 +52,8 @@ class AIQAClient:
         cls._instance._exporter = None  # reduce circular import issues by not importing for typecheck here
         cls._instance._enabled: bool = True
         cls._instance._initialized: bool = False
+        cls._instance._default_ignore_patterns: List[str] = ["_*"]  # Default: filter properties starting with '_'
+        cls._instance._ignore_recursive: bool = True  # Default: recursive filtering enabled
         return cls._instance
 
     @property
```
```diff
@@ -90,6 +92,76 @@ class AIQAClient:
         logger.info(f"AIQA tracing {'enabled' if value else 'disabled'}")
         self._enabled = value
 
+    @property
+    def default_ignore_patterns(self) -> List[str]:
+        """
+        Get the default ignore patterns applied to all traced inputs and outputs.
+
+        Default: ["_*"] (filters properties starting with '_')
+
+        Returns:
+            List of ignore patterns (supports wildcards like "_*")
+        """
+        return self._default_ignore_patterns.copy()
+
+    @default_ignore_patterns.setter
+    def default_ignore_patterns(self, value: Optional[List[str]]) -> None:
+        """
+        Set the default ignore patterns applied to all traced inputs and outputs.
+
+        Args:
+            value: List of patterns to ignore (e.g., ["_*", "password"]).
+                Set to None or [] to disable default ignore patterns.
+                Supports wildcards (e.g., "_*" matches "_apple", "_fruit").
+
+        Example:
+            from aiqa import get_aiqa_client
+
+            client = get_aiqa_client()
+            # Add password to default ignore patterns
+            client.default_ignore_patterns = ["_*", "password", "api_key"]
+            # Disable default ignore patterns
+            client.default_ignore_patterns = []
+        """
+        if value is None:
+            self._default_ignore_patterns = []
+        else:
+            self._default_ignore_patterns = list(value)
+        logger.info(f"Default ignore patterns set to: {self._default_ignore_patterns}")
+
+    @property
+    def ignore_recursive(self) -> bool:
+        """
+        Get whether ignore patterns are applied recursively to nested objects.
+
+        Default: True (recursive filtering enabled)
+
+        Returns:
+            True if recursive filtering is enabled, False otherwise
+        """
+        return self._ignore_recursive
+
+    @ignore_recursive.setter
+    def ignore_recursive(self, value: bool) -> None:
+        """
+        Set whether ignore patterns are applied recursively to nested objects.
+
+        When True (default), ignore patterns are applied at all nesting levels.
+        When False, ignore patterns are only applied to top-level keys.
+
+        Args:
+            value: True to enable recursive filtering, False to disable
+
+        Example:
+            from aiqa import get_aiqa_client
+
+            client = get_aiqa_client()
+            # Disable recursive filtering (only filter top-level keys)
+            client.ignore_recursive = False
+        """
+        self._ignore_recursive = bool(value)
+        logger.info(f"Ignore recursive filtering {'enabled' if self._ignore_recursive else 'disabled'}")
+
     def shutdown(self) -> None:
         """
         Shutdown the tracer provider and exporter.
```
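The two properties above define the configuration surface, but the matching implementation itself is not shown in this diff. A minimal sketch of how `"_*"`-style wildcards behave, assuming `fnmatch` semantics (consistent with the docstring's `"_apple"`/`"_fruit"` examples):

```python
from fnmatch import fnmatch
from typing import List

def should_ignore(key: str, patterns: List[str]) -> bool:
    """True if a property key matches any ignore pattern (wildcards allowed)."""
    return any(fnmatch(key, pattern) for pattern in patterns)

# Consistent with the docstring examples: "_*" catches "_apple" and "_fruit"
assert should_ignore("_apple", ["_*"])
assert should_ignore("password", ["_*", "password"])
assert not should_ignore("visible_field", ["_*"])
```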
```diff
@@ -245,8 +317,6 @@ def _attach_aiqa_processor(provider: TracerProvider) -> None:
     auth_headers = {}
     if api_key:
         auth_headers["Authorization"] = f"ApiKey {api_key}"
-    elif os.getenv("AIQA_API_KEY"):
-        auth_headers["Authorization"] = f"ApiKey {os.getenv('AIQA_API_KEY')}"
 
     # OTLP HTTP exporter requires the full endpoint URL including /v1/traces
     # Ensure server_url doesn't have trailing slash or /v1/traces, then append /v1/traces
```
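Note that `_attach_aiqa_processor` no longer reads `AIQA_API_KEY` itself; judging by the `get_api_key` import added in `experiment_runner.py` below, env-var resolution presumably happens earlier in `http_utils`. A caller-side sketch, assuming you assemble auth headers manually (the helper name is illustrative):

```python
import os

def resolve_api_key(explicit_key=None):
    """Resolve the AIQA API key once, up front; the exporter no longer falls back to the env var."""
    return explicit_key or os.getenv("AIQA_API_KEY")

auth_headers = {}
api_key = resolve_api_key()
if api_key:
    auth_headers["Authorization"] = f"ApiKey {api_key}"
```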
aiqa/constants.py:

```diff
@@ -3,6 +3,6 @@ Constants used across the AIQA client package.
 """
 
 AIQA_TRACER_NAME = "aiqa-tracer"
-VERSION = "0.6.1"  # automatically updated by set-version-json.sh
+VERSION = "0.7.2"  # automatically updated by set-version-json.sh
 
 LOG_TAG = "AIQA"  # Used in all logging output to identify AIQA messages
```
aiqa/experiment_runner.py:

```diff
@@ -5,11 +5,16 @@ ExperimentRunner - runs experiments on datasets and scores results
 import os
 import time
 import asyncio
+from opentelemetry import context as otel_context
+from opentelemetry.trace import Status, StatusCode, set_span_in_context
 from .constants import LOG_TAG
 from .http_utils import build_headers, get_server_url, get_api_key, format_http_error
 from typing import Any, Dict, List, Optional, Callable, Awaitable, Union
 from .tracing import WithTracing
-from .span_helpers import set_span_attribute, flush_tracing
+from .span_helpers import set_span_attribute, flush_tracing, get_active_trace_id
+from .client import get_aiqa_client, get_aiqa_tracer, get_component_tag
+from .object_serialiser import serialize_for_span
+from .tracing_llm_utils import _extract_and_set_token_usage, _extract_and_set_provider_and_model
 from .llm_as_judge import score_llm_metric_local, get_model_from_server, call_llm_fallback
 import requests
 from .types import MetricResult, ScoreThisInputOutputMetricType, Example, Result, Metric, CallLLMType
```
```diff
@@ -25,31 +30,9 @@ CallMyCodeType = Callable[[Any, Dict[str, Any]], Union[Any, Awaitable[Any]]]
 ScoreThisOutputType = Callable[[Any, Any, Dict[str, Any], Dict[str, Any]], Awaitable[Dict[str, Any]]]
 
 
-
-def _filter_input_for_run(input_data: Dict[str, Any]) -> Dict[str, Any]:
-    """Filter input for run method to extract dataset and experiment IDs."""
-    if not isinstance(input_data, dict):
-        return {}
-    self_obj = input_data.get("self")
-    if not self_obj:
-        return {}
-    return {
-        "dataset": getattr(self_obj, "dataset_id", None),
-        "experiment": getattr(self_obj, "experiment_id", None),
-    }
-
-
-def _filter_input_for_run_example(
-    self: "ExperimentRunner",
-    example: Dict[str, Any],
-    call_my_code: Any = None,
-    score_this_output: Any = None,
-) -> Dict[str, Any]:
-    """Filter input for run_example method to extract dataset, experiment, and example IDs."""
-    result = _filter_input_for_run({"self": self})
-    if isinstance(example, dict):
-        result["example"] = example.get("id")
-    return result
+def _metric_score_key(metric: Dict[str, Any]) -> str:
+    """Key for scores in API: server expects metric name (fallback to id)."""
+    return (metric.get("name") or metric.get("id")) or ""
 
 
 class ExperimentRunner:
```
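The replacement helper collapses the old per-method input filters into a single key-selection rule: metric name first, then id, then empty string. A quick illustration (importing the private helper is for demonstration only):

```python
from aiqa.experiment_runner import _metric_score_key

assert _metric_score_key({"name": "accuracy", "id": "m-1"}) == "accuracy"
assert _metric_score_key({"id": "m-1"}) == "m-1"   # no name: fall back to id
assert _metric_score_key({}) == ""                 # neither: empty string
```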
```diff
@@ -123,7 +106,17 @@ class ExperimentRunner:
 
         return dataset
 
-    def
+    def get_example(self, example_id: str) -> Dict[str, Any]:
+        """
+        Fetch an example by ID.
+        """
+        response = requests.get(
+            f"{self.server_url}/example/{example_id}",
+            headers=self._get_headers(),
+        )
+        return response.json()
+
+    def get_examples_for_dataset(self, limit: int = 10000) -> List[Dict[str, Any]]:
         """
         Fetch example inputs from the dataset.
 
```
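A usage sketch for the new fetch-by-ID call; the constructor arguments and the example ID are hypothetical, not taken from this diff:

```python
from aiqa.experiment_runner import ExperimentRunner

runner = ExperimentRunner(dataset_id="ds-123")   # hypothetical constructor arguments
example = runner.get_example("example-123")      # hypothetical example ID
print(example.get("input"))
```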
```diff
@@ -134,7 +127,7 @@ class ExperimentRunner:
             List of example objects
         """
         params = {
-            "
+            "dataset": self.dataset_id,
             "limit": str(limit),
         }
         if self.organisation:
```
```diff
@@ -162,7 +155,6 @@ class ExperimentRunner:
             experiment_setup: Optional setup for the experiment object. You may wish to set:
                 - name (recommended for labelling the experiment)
                 - parameters
-                - comparison_parameters
 
         Returns:
             The created experiment object
```
```diff
@@ -184,7 +176,7 @@ class ExperimentRunner:
             "organisation": self.organisation,
             "dataset": self.dataset_id,
             "results": [],
-            "
+            "summaries": {},
         }
 
         print(f"Creating experiment")
```
```diff
@@ -207,6 +199,7 @@ class ExperimentRunner:
         example: Example,
         output: Any,
         result: Result,
+        trace_id: Optional[str] = None,
     ) -> Result:
         """
         Ask the server to score an example result. Stores the score for later summary calculation.
```
```diff
@@ -226,24 +219,20 @@ class ExperimentRunner:
         if not example_id:
             raise ValueError("Example must have an 'id' field")
         if result is None:
-
-            if not example_id:
-                raise ValueError("Example must have an 'id' field")
-            result = Result(exampleId=example_id, scores={}, messages={}, errors={})
+            result = {"example": example_id, "scores": {}, "messages": {}, "errors": {}}
         scores = result.get("scores") or {}
-
-
 
         print(f"Scoring and storing example: {example_id}")
         print(f"Scores: {scores}")
 
         # Run synchronous requests.post in a thread pool to avoid blocking
+        # Server expects output = raw output to score, not the result dict; scores keyed by metric name
         def _do_request():
             return requests.post(
                 f"{self.server_url}/experiment/{self.experiment_id}/example/{example_id}/scoreAndStore",
                 json={
-                    "output":
-                    "
+                    "output": output,
+                    "trace": trace_id,
                     "scores": scores,
                 },
                 headers=self._get_headers(),
```
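After this change the request body pairs the raw engine output with the run's trace ID, and scores are keyed by metric name (per `_metric_score_key`). An illustrative payload, all values invented:

```python
# Illustrative scoreAndStore body after this change (values are made up)
payload = {
    "output": "the raw engine output to score",        # no longer the result dict
    "trace": "4bf92f3577b34da6a3ce929d0e0e4736",       # hex trace ID of the run's root span
    "scores": {"accuracy": 0.9, "duration": 1234},     # keyed by metric name, not metric ID
}
```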
```diff
@@ -258,7 +247,6 @@ class ExperimentRunner:
         print(f"scoreAndStore response: {json_result}")
         return json_result
 
-    @WithTracing(filter_input=_filter_input_for_run)
     async def run(
         self,
         call_my_code: CallMyCodeType,
```
```diff
@@ -271,19 +259,11 @@ class ExperimentRunner:
             engine: Function that takes input, returns output (can be async)
             scorer: Optional function that scores the output given the example
         """
-        examples = self.get_examples_for_dataset()
-
-        # Wrap engine to match run_example signature (input, parameters)
-        async def wrapped_engine(input_data, parameters):
-            result = call_my_code(input_data, parameters)
-            # Handle async functions
-            if hasattr(result, "__await__"):
-                result = await result
-            return result
+        examples = self.get_examples_for_dataset()
 
         for example in examples:
             try:
-                scores = await self.run_example(example, wrapped_engine, scorer_for_metric_id)
+                scores = await self.run_example(example, call_my_code, scorer_for_metric_id)
                 if scores:
                     self.scores.append(
                         {
```
```diff
@@ -296,7 +276,6 @@ class ExperimentRunner:
                 print(f"Error processing example {example.get('id', 'unknown')}: {e}")
                 # Continue with next example instead of failing entire run
 
-    @WithTracing(filter_input=_filter_input_for_run_example)
     async def run_example(
         self,
         example: Example,
```
```diff
@@ -304,8 +283,10 @@ class ExperimentRunner:
         scorer_for_metric_id: Optional[Dict[str, ScoreThisInputOutputMetricType]] = None,
     ) -> List[Result]:
         """
-        Run the engine on an example with the
-
+        Run the engine on an example with the experiment's parameters, score the result, and store it.
+
+        Spans: one root "RunExample" span (input, call_my_code, output) and one child "ScoreExample"
+        span for scoring, so the server sees a clear call_my_code vs scoring split (aligned with client-go).
 
         Args:
             example: The example to run. See Example.ts type
```
```diff
@@ -313,117 +294,94 @@ class ExperimentRunner:
             scorer_for_metric_id: Optional dictionary of metric IDs to functions that score the output given the example and parameters
 
         Returns:
-
-            returns an array of one.
+            List of one result (for API compatibility).
         """
-        # Ensure experiment exists
         if not self.experiment:
             self.create_experiment()
         if not self.experiment:
             raise Exception("Failed to create experiment")
 
-
-        parameters_fixed = self.experiment.get("parameters") or {}
-        # If comparison_parameters is empty/undefined, default to [{}] so we run at least once
-        parameters_loop = self.experiment.get("comparison_parameters") or [{}]
-
-        # Handle both spans array and input field
+        parameters_here = self.experiment.get("parameters") or {}
         input_data = example.get("input")
         if not input_data and example.get("spans") and len(example["spans"]) > 0:
             input_data = example["spans"][0].get("attributes", {}).get("input")
-
         if not input_data:
-            print(f"Warning: Example has no input field or spans with input attribute: {example}"
-            )
-            # Run engine anyway -- this could make sense if it's all about the parameters
+            print(f"Warning: Example has no input field or spans with input attribute: {example}")
 
-        # Set example.id on the root span (created by @WithTracing decorator)
-        # This ensures the root span from the trace has example=Example.id set
         example_id = example.get("id")
         if not example_id:
             raise ValueError("Example must have an 'id' field")
-        set_span_attribute("example", example_id)
-
-        all_scores: List[Dict[str, Any]] = []
-        dataset_metrics = self.get_dataset().get("metrics", [])
-        specific_metrics = example.get("metrics", [])
-        metrics = [*dataset_metrics, *specific_metrics]
-        # This loop should not be parallelized - it should run sequentially, one after the other
-        # to avoid creating interference between the runs.
-        for parameters in parameters_loop:
-            parameters_here = {**parameters_fixed, **parameters}
-            print(f"Running with parameters: {parameters_here}")
-
-            # Save original env var values for cleanup
-            original_env_vars: Dict[str, Optional[str]] = {}
-            # Set env vars from parameters_here
-            for key, value in parameters_here.items():
-                if value:
-                    original_env_vars[key] = os.environ.get(key)
-                    os.environ[key] = str(value)
 
-
-
-
+        print(f"Running with parameters: {parameters_here}")
+        original_env_vars: Dict[str, Optional[str]] = {}
+        for key, value in parameters_here.items():
+            if value:
+                original_env_vars[key] = os.environ.get(key)
+                os.environ[key] = str(value)
+        try:
+            start = time.time() * 1000
+
+            run_trace_id_ref: List[Optional[str]] = [None]
+
+            # Wrap engine to match run_example signature (input, parameters)
+            # Root span so server can find it by parent:unset; trace ID is sent to scoreAndStore
+            def set_trace_id(tid: Optional[str]) -> None:
+                run_trace_id_ref[0] = tid
+
+            @WithTracing(root=True)
+            async def wrapped_engine(input_data, parameters, set_trace_id: Callable[[Optional[str]], None]):
+                trace_id_here = get_active_trace_id()
+                set_trace_id(trace_id_here)
+                result = call_my_code(input_data, parameters)
                 # Handle async functions
-            if hasattr(
-            ... (about 45 further removed lines, the old engine invocation and scoring loop, are not recoverable from this rendering) ...
-                if original_value is None:
-                    # Variable didn't exist before, remove it
-                    os.environ.pop(key, None)
-                else:
-                    # Restore original value
-                    os.environ[key] = original_value
-
-        return all_scores
-
-    def get_summary_results(self) -> Dict[str, Any]:
+                if hasattr(result, "__await__"):
+                    result = await result
+                return result
+
+            output = wrapped_engine(input_data, parameters_here, set_trace_id)
+            if hasattr(output, "__await__"):
+                output = await output
+            duration = int((time.time() * 1000) - start)
+            print(f"Output: {output}")
+
+            dataset_metrics = self.get_dataset().get("metrics", [])
+            specific_metrics = example.get("metrics", [])
+            metrics = [*dataset_metrics, *specific_metrics]
+            result: Result = {"example": example_id, "scores": {}, "messages": {}, "errors": {}}
+            for metric in metrics:
+                metric_id = metric.get("id")
+                score_key = _metric_score_key(metric)
+                if not metric_id or not score_key:
+                    continue
+                scorer = scorer_for_metric_id.get(metric_id) if scorer_for_metric_id else None
+                if scorer:
+                    metric_result = await scorer(input_data, output, metric)
+                elif metric.get("type") == "llm":
+                    metric_result = await self._score_llm_metric(input_data, output, example, metric)
+                else:
+                    continue
+                if not metric_result:
+                    result["errors"][score_key] = "Scoring function returned None"
+                    continue
+                result["scores"][score_key] = metric_result.get("score")
+                result["messages"][score_key] = metric_result.get("message")
+                result["errors"][score_key] = metric_result.get("error")
+            result["scores"]["duration"] = duration
+            await flush_tracing()
+            print(f"Call scoreAndStore ... for example: {example_id} with scores: {result['scores']}")
+            result = await self.score_and_store(example, output, result, trace_id=run_trace_id_ref[0])
+            print(f"scoreAndStore returned: {result}")
+            return [result]
+        finally:
+            for key, original_value in original_env_vars.items():
+                if original_value is None:
+                    os.environ.pop(key, None)
+                else:
+                    os.environ[key] = original_value
+
+    def get_summaries(self) -> Dict[str, Any]:
         """
-        Get
+        Get summaries from the experiment.
 
         Returns:
             Dictionary of metric names to summary statistics
```
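Taken together, the rewritten `run_example` sets env vars from the experiment's parameters, executes the engine under a root span (capturing its trace ID), scores each metric, and sends everything to `scoreAndStore`. A hedged end-to-end sketch; the constructor arguments and metric ID are illustrative, and the parameter names follow those referenced in the method bodies above:

```python
import asyncio
from aiqa.experiment_runner import ExperimentRunner

async def call_my_code(input_data, parameters):
    # System under test; may be sync or async (both are handled via __await__)
    return f"echo: {input_data}"

async def exact_match(input_data, output, metric):
    # Custom scorer: return score/message/error keys, as read back by run_example
    return {"score": 1.0 if output == f"echo: {input_data}" else 0.0,
            "message": None, "error": None}

async def main():
    runner = ExperimentRunner(dataset_id="ds-123")  # hypothetical arguments
    await runner.run(call_my_code, scorer_for_metric_id={"metric-1": exact_match})
    print(runner.get_summaries())

asyncio.run(main())
```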
```diff
@@ -435,12 +393,12 @@ class ExperimentRunner:
             f"{self.server_url}/experiment/{self.experiment_id}",
             headers=self._get_headers(),
         )
-
+
         if not response.ok:
             raise Exception(format_http_error(response, "fetch summary results"))
 
         experiment2 = response.json()
-        return experiment2.get("
+        return experiment2.get("summaries", {})
 
     async def _score_llm_metric(
         self,
```
```diff
@@ -471,7 +429,8 @@ class ExperimentRunner:
                 model_id, self.server_url, self._get_headers()
             )
             if model_data:
-                api_key = model_data.get("api_key")
+                # Server returns 'apiKey' (camelCase)
+                api_key = model_data.get("apiKey")
                 # If provider not set in metric, try to get it from model
                 if not provider and model_data.get("provider"):
                     provider = model_data.get("provider")
```
aiqa/llm_as_judge.py:

```diff
@@ -52,14 +52,15 @@ async def get_model_from_server(
     try:
         def _do_request():
             return requests.get(
-                f"{server_url}/model/{model_id}?fields=api_key",
+                f"{server_url}/model/{model_id}?fields=apiKey",  # Server uses camelCase 'apiKey' (also accepts 'api_key')
                 headers=headers,
             )
 
         response = await asyncio.to_thread(_do_request)
         if response.ok:
             model = response.json()
-            if model.get("api_key"):
+            # Server returns 'apiKey' (camelCase)
+            if model.get("apiKey"):
                 return model
         return None
     except Exception as e:
```
aiqa/object_serialiser.py:

```diff
@@ -25,7 +25,7 @@ def sanitize_string_for_utf8(text: str) -> str:
     Returns:
         A string with surrogate characters replaced by the Unicode replacement character (U+FFFD)
     """
-    if text
+    if text is None:
         return None
     if not isinstance(text, str):  # paranoia
         text = str(text)
```
```diff
@@ -43,7 +43,10 @@ def toNumber(value: str|int|None) -> int:
     if value is None:
         return 0
     if isinstance(value, int):
-        return value
+        return value
+    # Convert to string if not already
+    if not isinstance(value, str):
+        value = str(value)
     if value.endswith("b"):  # drop the b
         value = value[:-1]
     if value.endswith("g"):
```