aiqa-client 0.5.2__py3-none-any.whl → 0.7.0__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- aiqa/__init__.py +8 -2
- aiqa/client.py +90 -5
- aiqa/constants.py +1 -1
- aiqa/experiment_runner.py +245 -109
- aiqa/llm_as_judge.py +282 -0
- aiqa/object_serialiser.py +5 -2
- aiqa/span_helpers.py +511 -0
- aiqa/tracing.py +252 -565
- aiqa/tracing_llm_utils.py +20 -9
- aiqa/types.py +61 -0
- {aiqa_client-0.5.2.dist-info → aiqa_client-0.7.0.dist-info}/METADATA +1 -1
- aiqa_client-0.7.0.dist-info/RECORD +17 -0
- {aiqa_client-0.5.2.dist-info → aiqa_client-0.7.0.dist-info}/WHEEL +1 -1
- aiqa_client-0.5.2.dist-info/RECORD +0 -14
- {aiqa_client-0.5.2.dist-info → aiqa_client-0.7.0.dist-info}/licenses/LICENSE.txt +0 -0
- {aiqa_client-0.5.2.dist-info → aiqa_client-0.7.0.dist-info}/top_level.txt +0 -0
aiqa/__init__.py
CHANGED
@@ -26,8 +26,8 @@ Example:
     result = my_function()
 """

-from .tracing import
-
+from .tracing import WithTracing
+from .span_helpers import (
     flush_tracing,
     set_span_attribute,
     set_span_name,
@@ -39,7 +39,10 @@ from .tracing import (
     extract_trace_context,
     set_conversation_id,
     set_component_tag,
+    set_token_usage,
+    set_provider_and_model,
     get_span,
+    submit_feedback,
 )
 from .client import get_aiqa_client
 from .experiment_runner import ExperimentRunner
@@ -60,7 +63,10 @@ __all__ = [
     "extract_trace_context",
     "set_conversation_id",
     "set_component_tag",
+    "set_token_usage",
+    "set_provider_and_model",
     "get_span",
+    "submit_feedback",
     "VERSION",
 ]

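A minimal usage sketch of the re-exported helpers, based only on how this diff itself uses them: `WithTracing` is applied as a decorator taking a `filter_input` callable, and `set_span_attribute` takes a key and a value. The exact calling convention of `filter_input`, and the signatures of the newly exported `set_token_usage`, `set_provider_and_model`, and `submit_feedback`, are not visible in the changed files, so they are treated as assumptions or omitted.

```python
from aiqa import WithTracing, set_span_attribute

def _filter_input(input_data):
    # Assumed shape: a dict of the decorated call's inputs; keep only an id-like field.
    if not isinstance(input_data, dict):
        return {}
    return {"request": input_data.get("request_id")}

@WithTracing(filter_input=_filter_input)
async def handle_request(request_id: str):
    set_span_attribute("stage", "handled")  # attach extra context to the active span
    return {"ok": True, "request_id": request_id}
```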
aiqa/client.py
CHANGED
@@ -2,11 +2,13 @@
 import os
 import logging
 from functools import lru_cache
-from typing import Optional, TYPE_CHECKING, Any, Dict
+from typing import Optional, TYPE_CHECKING, Any, Dict, List
 from opentelemetry import trace
 from opentelemetry.sdk.trace import TracerProvider
-from opentelemetry.sdk.trace.export import BatchSpanProcessor
+from opentelemetry.sdk.trace.export import BatchSpanProcessor, SpanExporter, SpanExportResult
 from opentelemetry.exporter.otlp.proto.http.trace_exporter import OTLPSpanExporter
+from opentelemetry.sdk.trace import ReadableSpan
+from opentelemetry.trace import SpanContext
 import requests

 from .constants import AIQA_TRACER_NAME, LOG_TAG
@@ -50,6 +52,8 @@ class AIQAClient:
             cls._instance._exporter = None  # reduce circular import issues by not importing for typecheck here
             cls._instance._enabled: bool = True
             cls._instance._initialized: bool = False
+            cls._instance._default_ignore_patterns: List[str] = ["_*"]  # Default: filter properties starting with '_'
+            cls._instance._ignore_recursive: bool = True  # Default: recursive filtering enabled
         return cls._instance

     @property
@@ -88,6 +92,76 @@ class AIQAClient:
         logger.info(f"AIQA tracing {'enabled' if value else 'disabled'}")
         self._enabled = value

+    @property
+    def default_ignore_patterns(self) -> List[str]:
+        """
+        Get the default ignore patterns applied to all traced inputs and outputs.
+
+        Default: ["_*"] (filters properties starting with '_')
+
+        Returns:
+            List of ignore patterns (supports wildcards like "_*")
+        """
+        return self._default_ignore_patterns.copy()
+
+    @default_ignore_patterns.setter
+    def default_ignore_patterns(self, value: Optional[List[str]]) -> None:
+        """
+        Set the default ignore patterns applied to all traced inputs and outputs.
+
+        Args:
+            value: List of patterns to ignore (e.g., ["_*", "password"]).
+                Set to None or [] to disable default ignore patterns.
+                Supports wildcards (e.g., "_*" matches "_apple", "_fruit").
+
+        Example:
+            from aiqa import get_aiqa_client
+
+            client = get_aiqa_client()
+            # Add password to default ignore patterns
+            client.default_ignore_patterns = ["_*", "password", "api_key"]
+            # Disable default ignore patterns
+            client.default_ignore_patterns = []
+        """
+        if value is None:
+            self._default_ignore_patterns = []
+        else:
+            self._default_ignore_patterns = list(value)
+        logger.info(f"Default ignore patterns set to: {self._default_ignore_patterns}")
+
+    @property
+    def ignore_recursive(self) -> bool:
+        """
+        Get whether ignore patterns are applied recursively to nested objects.
+
+        Default: True (recursive filtering enabled)
+
+        Returns:
+            True if recursive filtering is enabled, False otherwise
+        """
+        return self._ignore_recursive
+
+    @ignore_recursive.setter
+    def ignore_recursive(self, value: bool) -> None:
+        """
+        Set whether ignore patterns are applied recursively to nested objects.
+
+        When True (default), ignore patterns are applied at all nesting levels.
+        When False, ignore patterns are only applied to top-level keys.
+
+        Args:
+            value: True to enable recursive filtering, False to disable
+
+        Example:
+            from aiqa import get_aiqa_client
+
+            client = get_aiqa_client()
+            # Disable recursive filtering (only filter top-level keys)
+            client.ignore_recursive = False
+        """
+        self._ignore_recursive = bool(value)
+        logger.info(f"Ignore recursive filtering {'enabled' if self._ignore_recursive else 'disabled'}")
+
     def shutdown(self) -> None:
         """
         Shutdown the tracer provider and exporter.
@@ -243,8 +317,6 @@ def _attach_aiqa_processor(provider: TracerProvider) -> None:
     auth_headers = {}
     if api_key:
         auth_headers["Authorization"] = f"ApiKey {api_key}"
-    elif os.getenv("AIQA_API_KEY"):
-        auth_headers["Authorization"] = f"ApiKey {os.getenv('AIQA_API_KEY')}"

     # OTLP HTTP exporter requires the full endpoint URL including /v1/traces
     # Ensure server_url doesn't have trailing slash or /v1/traces, then append /v1/traces
@@ -254,11 +326,24 @@ def _attach_aiqa_processor(provider: TracerProvider) -> None:
     else:
         endpoint = f"{base_url}/v1/traces"

-    #
+    # Get timeout from environment variable (in seconds)
+    # Supports OTEL_EXPORTER_OTLP_TIMEOUT (standard) or AIQA_EXPORT_TIMEOUT (custom)
+    # Default is 30 seconds (more generous than OTLP default of 10s)
+    timeout = 30.0
+    otlp_timeout = os.getenv("OTEL_EXPORTER_OTLP_TIMEOUT")
+
+    if otlp_timeout:
+        try:
+            timeout = float(otlp_timeout)
+        except ValueError:
+            logger.warning(f"Invalid OTEL_EXPORTER_OTLP_TIMEOUT value '{otlp_timeout}', using default 30.0")
+
+    # Create OTLP exporter with authentication headers and timeout
     # The exporter will set Content-Type and other headers automatically
     exporter = OTLPSpanExporter(
         endpoint=endpoint,
         headers=auth_headers if auth_headers else None,
+        timeout=timeout,
     )

     provider.add_span_processor(BatchSpanProcessor(exporter))
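The new client-level filtering knobs and the exporter timeout can be combined as below; a minimal sketch taken from the docstrings and environment-variable handling shown above. Note that the code shown only reads OTEL_EXPORTER_OTLP_TIMEOUT, even though its comment also mentions AIQA_EXPORT_TIMEOUT.

```python
import os

# Raise the OTLP export timeout (seconds) before tracing is initialised;
# the code above reads OTEL_EXPORTER_OTLP_TIMEOUT and defaults to 30s.
os.environ.setdefault("OTEL_EXPORTER_OTLP_TIMEOUT", "60")

from aiqa import get_aiqa_client

client = get_aiqa_client()
# Filter private attributes plus a couple of sensitive keys from traced inputs/outputs
client.default_ignore_patterns = ["_*", "password", "api_key"]
# Apply the patterns only to top-level keys rather than every nesting level
client.ignore_recursive = False
```

Also worth noting from the hunk above: the implicit AIQA_API_KEY fallback inside `_attach_aiqa_processor` was removed, so the Authorization header is now only attached when an API key is passed in explicitly.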
aiqa/constants.py
CHANGED
@@ -3,6 +3,6 @@ Constants used across the AIQA client package.
 """

 AIQA_TRACER_NAME = "aiqa-tracer"
-VERSION = "0.
+VERSION = "0.7.0"  # automatically updated by set-version-json.sh

 LOG_TAG = "AIQA"  # Used in all logging output to identify AIQA messages
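For a quick sanity check after upgrading, the bumped constant is re-exported from the package (VERSION appears in `__all__` in the __init__.py diff above):

```python
from aiqa import VERSION

print(VERSION)  # expected to print "0.7.0" for this release
```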
aiqa/experiment_runner.py
CHANGED
@@ -4,10 +4,52 @@ ExperimentRunner - runs experiments on datasets and scores results

 import os
 import time
+import asyncio
 from .constants import LOG_TAG
 from .http_utils import build_headers, get_server_url, get_api_key, format_http_error
 from typing import Any, Dict, List, Optional, Callable, Awaitable, Union
+from .tracing import WithTracing
+from .span_helpers import set_span_attribute, flush_tracing
+from .llm_as_judge import score_llm_metric_local, get_model_from_server, call_llm_fallback
 import requests
+from .types import MetricResult, ScoreThisInputOutputMetricType, Example, Result, Metric, CallLLMType
+
+# Type aliases for engine/scoring functions to improve code completion and clarity
+from typing import TypedDict
+
+# Function that processes input and parameters to produce an output (sync or async)
+CallMyCodeType = Callable[[Any, Dict[str, Any]], Union[Any, Awaitable[Any]]]
+
+# Function that scores a given output, using input, example, and parameters (usually async)
+# Returns a dictionary with score/message/etc.
+ScoreThisOutputType = Callable[[Any, Any, Dict[str, Any], Dict[str, Any]], Awaitable[Dict[str, Any]]]
+
+
+
+def _filter_input_for_run(input_data: Any) -> Dict[str, Any]:
+    """Tracing:Filter input - drop most, keep just ids"""
+    if not isinstance(input_data, dict):
+        return {}
+    self_obj = input_data.get("self")
+    if not self_obj:
+        return {}
+    return {
+        "dataset": getattr(self_obj, "dataset_id", None),
+        "experiment": getattr(self_obj, "experiment_id", None),
+    }
+
+
+def _filter_input_for_run_example(
+    self: "ExperimentRunner",
+    example: Dict[str, Any],
+    call_my_code: Any = None,
+    score_this_output: Any = None,
+) -> Dict[str, Any]:
+    """Filter input for run_example method to extract dataset, experiment, and example IDs."""
+    result = _filter_input_for_run({"self": self})
+    if isinstance(example, dict):
+        result["example"] = example.get("id")
+    return result


 class ExperimentRunner:
@@ -24,6 +66,7 @@ class ExperimentRunner:
         server_url: Optional[str] = None,
         api_key: Optional[str] = None,
         organisation_id: Optional[str] = None,
+        llm_call_fn: Optional[CallLLMType] = None,
     ):
         """
         Initialize the ExperimentRunner.
@@ -33,7 +76,11 @@ class ExperimentRunner:
             experiment_id: Usually unset, and a fresh experiment is created with a random ID
             server_url: URL of the AIQA server (defaults to AIQA_SERVER_URL env var)
             api_key: API key for authentication (defaults to AIQA_API_KEY env var)
-            organisation_id:
+            organisation_id: Optional organisation ID for the experiment. If not provided, will be
+                derived from the dataset when needed.
+            llm_call_fn: Optional async function that takes (system_prompt, user_message) and returns
+                raw content string (typically JSON). If not provided, will check for OPENAI_API_KEY
+                or ANTHROPIC_API_KEY environment variables.
         """
         self.dataset_id = dataset_id
         self.experiment_id = experiment_id
@@ -42,6 +89,8 @@ class ExperimentRunner:
         self.organisation = organisation_id
         self.experiment: Optional[Dict[str, Any]] = None
         self.scores: List[Dict[str, Any]] = []
+        self.llm_call_fn = llm_call_fn
+        self._dataset_cache: Optional[Dict[str, Any]] = None

     def _get_headers(self) -> Dict[str, str]:
         """Build HTTP headers for API requests."""
@@ -54,6 +103,9 @@ class ExperimentRunner:
         Returns:
             The dataset object with metrics and other information
         """
+        if self._dataset_cache is not None:
+            return self._dataset_cache
+
         response = requests.get(
             f"{self.server_url}/dataset/{self.dataset_id}",
             headers=self._get_headers(),
@@ -62,9 +114,26 @@ class ExperimentRunner:
         if not response.ok:
             raise Exception(format_http_error(response, "fetch dataset"))

+        dataset = response.json()
+        self._dataset_cache = dataset
+
+        # If organisation_id wasn't set, derive it from the dataset
+        if not self.organisation and dataset.get("organisation"):
+            self.organisation = dataset.get("organisation")
+
+        return dataset
+
+    def get_example(self, example_id: str) -> Dict[str, Any]:
+        """
+        Fetch an example by ID.
+        """
+        response = requests.get(
+            f"{self.server_url}/example/{example_id}",
+            headers=self._get_headers(),
+        )
         return response.json()

-    def
+    def get_examples_for_dataset(self, limit: int = 10000) -> List[Dict[str, Any]]:
         """
         Fetch example inputs from the dataset.

@@ -103,13 +172,17 @@ class ExperimentRunner:
             experiment_setup: Optional setup for the experiment object. You may wish to set:
             - name (recommended for labelling the experiment)
             - parameters
-            - comparison_parameters

         Returns:
             The created experiment object
         """
+        # Ensure we have the organisation ID - try to get it from the dataset if not set
+        if not self.organisation:
+            dataset = self.get_dataset()
+            self.organisation = dataset.get("organisation")
+
         if not self.organisation or not self.dataset_id:
-            raise Exception("Organisation and dataset ID are required to create an experiment")
+            raise Exception("Organisation and dataset ID are required to create an experiment. Organisation can be derived from the dataset or set via organisation_id parameter.")

         if not experiment_setup:
             experiment_setup = {}
@@ -120,7 +193,7 @@ class ExperimentRunner:
             "organisation": self.organisation,
             "dataset": self.dataset_id,
             "results": [],
-            "
+            "summaries": {},
         }

         print(f"Creating experiment")
@@ -138,19 +211,19 @@ class ExperimentRunner:
         self.experiment = experiment
         return experiment

-    def score_and_store(
+    async def score_and_store(
         self,
-        example:
-
-
-    ) ->
+        example: Example,
+        output: Any,
+        result: Result,
+    ) -> Result:
         """
         Ask the server to score an example result. Stores the score for later summary calculation.

         Args:
             example: The example object
-
-
+            output: The output from running the engine on the example
+            result: The result object for locally calculated scores

         Returns:
             The score result from the server
@@ -158,22 +231,31 @@ class ExperimentRunner:
         # Do we have an experiment ID? If not, we need to create the experiment first
         if not self.experiment_id:
             self.create_experiment()
-
-        if
-
-
-
+        example_id = example.get("id")
+        if not example_id:
+            raise ValueError("Example must have an 'id' field")
+        if result is None:
+            result = Result(example=example_id, scores={}, messages={}, errors={})
+        scores = result.get("scores") or {}
+
+
+
+        print(f"Scoring and storing example: {example_id}")
         print(f"Scores: {scores}")

-
-
-
-        "
-
-
-
-
-
+        # Run synchronous requests.post in a thread pool to avoid blocking
+        def _do_request():
+            return requests.post(
+                f"{self.server_url}/experiment/{self.experiment_id}/example/{example_id}/scoreAndStore",
+                json={
+                    "output": result,
+                    "traceId": example.get("trace"),  # Server returns 'trace' (lowercase), but API expects 'traceId' (camelCase)
+                    "scores": scores,
+                },
+                headers=self._get_headers(),
+            )
+
+        response = await asyncio.to_thread(_do_request)

         if not response.ok:
             raise Exception(format_http_error(response, "score and store"))
@@ -182,12 +264,11 @@ class ExperimentRunner:
         print(f"scoreAndStore response: {json_result}")
         return json_result

+    @WithTracing(filter_input=_filter_input_for_run)
     async def run(
         self,
-
-
-        Callable[[Any, Dict[str, Any]], Awaitable[Dict[str, Any]]]
-        ] = None,
+        call_my_code: CallMyCodeType,
+        scorer_for_metric_id: Optional[Dict[str, ScoreThisInputOutputMetricType]] = None,
     ) -> None:
         """
         Run an engine function on all examples and score the results.
@@ -196,124 +277,179 @@ class ExperimentRunner:
             engine: Function that takes input, returns output (can be async)
             scorer: Optional function that scores the output given the example
         """
-        examples = self.
+        examples = self.get_examples_for_dataset()

         # Wrap engine to match run_example signature (input, parameters)
-        def wrapped_engine(input_data, parameters):
-
-
-
-
-
-                return await scorer(output, example)
-            return {}
+        async def wrapped_engine(input_data, parameters):
+            result = call_my_code(input_data, parameters)
+            # Handle async functions
+            if hasattr(result, "__await__"):
+                result = await result
+            return result

         for example in examples:
-
-
-
-
-
-
-
-
-
-
+            try:
+                scores = await self.run_example(example, wrapped_engine, scorer_for_metric_id)
+                if scores:
+                    self.scores.append(
+                        {
+                            "example": example,
+                            "result": scores,
+                            "scores": scores,
+                        }
+                    )
+            except Exception as e:
+                print(f"Error processing example {example.get('id', 'unknown')}: {e}")
+                # Continue with next example instead of failing entire run
+
+    @WithTracing(filter_input=_filter_input_for_run_example)
     async def run_example(
         self,
-        example:
-        call_my_code:
-
-
-        ] = None,
-    ) -> List[Dict[str, Any]]:
+        example: Example,
+        call_my_code: CallMyCodeType,
+        scorer_for_metric_id: Optional[Dict[str, ScoreThisInputOutputMetricType]] = None,
+    ) -> List[Result]:
         """
-        Run the engine on an example with the
-        and score the result. Also calls scoreAndStore to store the result in the server.
+        Run the engine on an example with the experiment's parameters, score the result, and store it.

         Args:
-            example: The example to run
+            example: The example to run. See Example.ts type
             call_my_code: Function that takes input and parameters, returns output (can be async)
-
+            scorer_for_metric_id: Optional dictionary of metric IDs to functions that score the output given the example and parameters

         Returns:
-
-            returns an array of one.
+            List of one result (for API compatibility).
         """
-        # Ensure experiment exists
         if not self.experiment:
             self.create_experiment()
         if not self.experiment:
             raise Exception("Failed to create experiment")

-
-        parameters_fixed = self.experiment.get("parameters") or {}
-        # If comparison_parameters is empty/undefined, default to [{}] so we run at least once
-        parameters_loop = self.experiment.get("comparison_parameters") or [{}]
-
-        # Handle both spans array and input field
+        parameters_here = self.experiment.get("parameters") or {}
         input_data = example.get("input")
         if not input_data and example.get("spans") and len(example["spans"]) > 0:
             input_data = example["spans"][0].get("attributes", {}).get("input")
-
         if not input_data:
-            print(f"Warning: Example has no input field or spans with input attribute: {example}"
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-            start = time.time() * 1000 # milliseconds
+            print(f"Warning: Example has no input field or spans with input attribute: {example}")
+
+        example_id = example.get("id")
+        if not example_id:
+            raise ValueError("Example must have an 'id' field")
+        set_span_attribute("example", example_id)
+
+        print(f"Running with parameters: {parameters_here}")
+        original_env_vars: Dict[str, Optional[str]] = {}
+        for key, value in parameters_here.items():
+            if value:
+                original_env_vars[key] = os.environ.get(key)
+                os.environ[key] = str(value)
+        try:
+            start = time.time() * 1000
             output = call_my_code(input_data, parameters_here)
-            # Handle async functions
             if hasattr(output, "__await__"):
-                import asyncio
-
                 output = await output
-
-            duration = int(end - start)
-
+            duration = int((time.time() * 1000) - start)
             print(f"Output: {output}")

-
-
-
-
-
-
-
-
-
+            dataset_metrics = self.get_dataset().get("metrics", [])
+            specific_metrics = example.get("metrics", [])
+            metrics = [*dataset_metrics, *specific_metrics]
+            result = Result(example=example_id, scores={}, messages={}, errors={})
+            for metric in metrics:
+                metric_id = metric.get("id")
+                if not metric_id:
+                    continue
+                scorer = scorer_for_metric_id.get(metric_id) if scorer_for_metric_id else None
+                if scorer:
+                    metric_result = await scorer(input_data, output, metric)
+                elif metric.get("type") == "llm":
+                    metric_result = await self._score_llm_metric(input_data, output, example, metric)
+                else:
+                    continue
+                if not metric_result:
+                    result["errors"][metric_id] = "Scoring function returned None"
+                    continue
+                result["scores"][metric_id] = metric_result.get("score")
+                result["messages"][metric_id] = metric_result.get("message")
+                result["errors"][metric_id] = metric_result.get("error")
+            result["scores"]["duration"] = duration
+            await flush_tracing()
+            print(f"Call scoreAndStore ... for example: {example_id} with scores: {result['scores']}")
+            result = await self.score_and_store(example, output, result)
             print(f"scoreAndStore returned: {result}")
-
-
-
-
-
+            return [result]
+        finally:
+            for key, original_value in original_env_vars.items():
+                if original_value is None:
+                    os.environ.pop(key, None)
+                else:
+                    os.environ[key] = original_value
+
+    def get_summaries(self) -> Dict[str, Any]:
         """
-        Get
+        Get summaries from the experiment.

         Returns:
             Dictionary of metric names to summary statistics
         """
+        if not self.experiment_id:
+            raise ValueError("No experiment ID available. Create an experiment first.")
+
         response = requests.get(
             f"{self.server_url}/experiment/{self.experiment_id}",
             headers=self._get_headers(),
         )
-
+
         if not response.ok:
             raise Exception(format_http_error(response, "fetch summary results"))

         experiment2 = response.json()
-        return experiment2.get("
+        return experiment2.get("summaries", {})
+
+    async def _score_llm_metric(
+        self,
+        input_data: Any,
+        output: Any,
+        example: Example,
+        metric: Metric,
+    ) -> MetricResult:
+        """
+        Score an LLM metric by fetching model API key from server if needed.
+
+        Args:
+            input_data: The input data to score
+            output: The output to score
+            example: The example object
+            metric: The metric definition
+
+        Returns:
+            MetricResult object with score:[0,1], message (optional), and error (optional)
+        """
+        # If model is specified, try to fetch API key from server
+        model_id = metric.get("model")
+        api_key = None
+        provider = metric.get("provider")
+
+        if model_id:
+            model_data = await get_model_from_server(
+                model_id, self.server_url, self._get_headers()
+            )
+            if model_data:
+                # Server returns 'apiKey' (camelCase)
+                api_key = model_data.get("apiKey")
+                # If provider not set in metric, try to get it from model
+                if not provider and model_data.get("provider"):
+                    provider = model_data.get("provider")
+
+        # Create a custom llm_call_fn if we have an API key from the model
+        llm_call_fn = self.llm_call_fn
+        if api_key and not llm_call_fn:
+            async def _model_llm_call(system_prompt: str, user_message: str) -> str:
+                return await call_llm_fallback(system_prompt, user_message, api_key, provider)
+            llm_call_fn = _model_llm_call
+
+        return await score_llm_metric_local(
+            input_data, output, example, metric, llm_call_fn
+        )
+

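Putting the reworked runner together, a sketch of an experiment driver under the signatures shown above. The dataset ID, metric ID, and the engine/scorer bodies are placeholders, and server_url / api_key fall back to the AIQA_SERVER_URL / AIQA_API_KEY environment variables per the constructor docstring; an `llm_call_fn` could also be passed for LLM-judged metrics.

```python
import asyncio
from aiqa import ExperimentRunner

async def my_engine(input_data, parameters):
    # call_my_code: (input, parameters) -> output; may be sync or async
    return {"answer": f"echo: {input_data}"}

async def non_empty_scorer(input_data, output, metric):
    # A scorer_for_metric_id entry: called as scorer(input, output, metric),
    # returning a dict with score / message / error keys
    return {"score": 1.0 if output else 0.0, "message": "checked output is non-empty"}

async def main():
    runner = ExperimentRunner(dataset_id="my-dataset-id")  # placeholder ID
    await runner.run(
        call_my_code=my_engine,
        scorer_for_metric_id={"my-metric-id": non_empty_scorer},  # placeholder metric ID
    )
    print(runner.get_summaries())

asyncio.run(main())
```

Metrics without an entry in `scorer_for_metric_id` are only scored when their type is "llm", in which case the runner falls back to `_score_llm_metric` and the server-side model configuration shown above.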