aiqa-client 0.7.0__py3-none-any.whl → 0.7.2__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- aiqa/constants.py +1 -1
- aiqa/experiment_runner.py +46 -52
- aiqa/tracing.py +11 -5
- {aiqa_client-0.7.0.dist-info → aiqa_client-0.7.2.dist-info}/METADATA +1 -1
- {aiqa_client-0.7.0.dist-info → aiqa_client-0.7.2.dist-info}/RECORD +8 -8
- {aiqa_client-0.7.0.dist-info → aiqa_client-0.7.2.dist-info}/WHEEL +0 -0
- {aiqa_client-0.7.0.dist-info → aiqa_client-0.7.2.dist-info}/licenses/LICENSE.txt +0 -0
- {aiqa_client-0.7.0.dist-info → aiqa_client-0.7.2.dist-info}/top_level.txt +0 -0
aiqa/constants.py
CHANGED
@@ -3,6 +3,6 @@ Constants used across the AIQA client package.
 """
 
 AIQA_TRACER_NAME = "aiqa-tracer"
-VERSION = "0.7.0" # automatically updated by set-version-json.sh
+VERSION = "0.7.2" # automatically updated by set-version-json.sh
 
 LOG_TAG = "AIQA" # Used in all logging output to identify AIQA messages
aiqa/experiment_runner.py
CHANGED
@@ -5,11 +5,16 @@ ExperimentRunner - runs experiments on datasets and scores results
 import os
 import time
 import asyncio
+from opentelemetry import context as otel_context
+from opentelemetry.trace import Status, StatusCode, set_span_in_context
 from .constants import LOG_TAG
 from .http_utils import build_headers, get_server_url, get_api_key, format_http_error
 from typing import Any, Dict, List, Optional, Callable, Awaitable, Union
 from .tracing import WithTracing
-from .span_helpers import set_span_attribute, flush_tracing
+from .span_helpers import set_span_attribute, flush_tracing, get_active_trace_id
+from .client import get_aiqa_client, get_aiqa_tracer, get_component_tag
+from .object_serialiser import serialize_for_span
+from .tracing_llm_utils import _extract_and_set_token_usage, _extract_and_set_provider_and_model
 from .llm_as_judge import score_llm_metric_local, get_model_from_server, call_llm_fallback
 import requests
 from .types import MetricResult, ScoreThisInputOutputMetricType, Example, Result, Metric, CallLLMType
@@ -25,31 +30,9 @@ CallMyCodeType = Callable[[Any, Dict[str, Any]], Union[Any, Awaitable[Any]]]
 ScoreThisOutputType = Callable[[Any, Any, Dict[str, Any], Dict[str, Any]], Awaitable[Dict[str, Any]]]
 
 
-
-def _filter_input_for_run(input_data: Any) -> Dict[str, Any]:
-    """Filter input for run method to extract dataset and experiment IDs."""
-    if not isinstance(input_data, dict):
-        return {}
-    self_obj = input_data.get("self")
-    if not self_obj:
-        return {}
-    return {
-        "dataset": getattr(self_obj, "dataset_id", None),
-        "experiment": getattr(self_obj, "experiment_id", None),
-    }
-
-
-def _filter_input_for_run_example(
-    self: "ExperimentRunner",
-    example: Dict[str, Any],
-    call_my_code: Any = None,
-    score_this_output: Any = None,
-) -> Dict[str, Any]:
-    """Filter input for run_example method to extract dataset, experiment, and example IDs."""
-    result = _filter_input_for_run({"self": self})
-    if isinstance(example, dict):
-        result["example"] = example.get("id")
-    return result
+def _metric_score_key(metric: Dict[str, Any]) -> str:
+    """Key for scores in API: server expects metric name (fallback to id)."""
+    return (metric.get("name") or metric.get("id")) or ""
 
 
 class ExperimentRunner:
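For illustration, how the new helper picks the score key (the metric dicts below are hypothetical):

_metric_score_key({"id": "m1", "name": "accuracy"})  # -> "accuracy" (name preferred)
_metric_score_key({"id": "m1"})                      # -> "m1" (falls back to id)
_metric_score_key({})                                # -> "" (caller skips such metrics)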
@@ -144,7 +127,7 @@ class ExperimentRunner:
         List of example objects
         """
         params = {
-            "…
+            "dataset": self.dataset_id,
             "limit": str(limit),
         }
         if self.organisation:
@@ -216,6 +199,7 @@ class ExperimentRunner:
         example: Example,
         output: Any,
         result: Result,
+        trace_id: Optional[str] = None,
     ) -> Result:
         """
         Ask the server to score an example result. Stores the score for later summary calculation.
@@ -235,21 +219,20 @@ class ExperimentRunner:
         if not example_id:
             raise ValueError("Example must have an 'id' field")
         if result is None:
-            result = …
+            result = {"example": example_id, "scores": {}, "messages": {}, "errors": {}}
         scores = result.get("scores") or {}
-
-
 
         print(f"Scoring and storing example: {example_id}")
         print(f"Scores: {scores}")
 
         # Run synchronous requests.post in a thread pool to avoid blocking
+        # Server expects output = raw output to score, not the result dict; scores keyed by metric name
         def _do_request():
             return requests.post(
                 f"{self.server_url}/experiment/{self.experiment_id}/example/{example_id}/scoreAndStore",
                 json={
-                    "output": result,
-                    "…
+                    "output": output,
+                    "trace": trace_id,
                     "scores": scores,
                 },
                 headers=self._get_headers(),
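A sketch of the JSON body now sent to scoreAndStore (all values hypothetical, standing in for run_example state):

output = {"answer": "42"}                        # raw engine output, no longer the result dict
trace_id = "0af7651916cd43dd8448eb211c80319c"    # root trace id captured in run_example, may be None
scores = {"accuracy": 0.9, "duration": 1234}     # keyed by metric name

payload = {"output": output, "trace": trace_id, "scores": scores}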
@@ -264,7 +247,6 @@ class ExperimentRunner:
         print(f"scoreAndStore response: {json_result}")
         return json_result
 
-    @WithTracing(filter_input=_filter_input_for_run)
     async def run(
         self,
         call_my_code: CallMyCodeType,
@@ -279,17 +261,9 @@ class ExperimentRunner:
         """
         examples = self.get_examples_for_dataset()
 
-        # Wrap engine to match run_example signature (input, parameters)
-        async def wrapped_engine(input_data, parameters):
-            result = call_my_code(input_data, parameters)
-            # Handle async functions
-            if hasattr(result, "__await__"):
-                result = await result
-            return result
-
         for example in examples:
             try:
-                scores = await self.run_example(example, wrapped_engine, scorer_for_metric_id)
+                scores = await self.run_example(example, call_my_code, scorer_for_metric_id)
                 if scores:
                     self.scores.append(
                         {
@@ -302,7 +276,6 @@ class ExperimentRunner:
                 print(f"Error processing example {example.get('id', 'unknown')}: {e}")
                 # Continue with next example instead of failing entire run
 
-    @WithTracing(filter_input=_filter_input_for_run_example)
     async def run_example(
         self,
         example: Example,
@@ -312,6 +285,9 @@ class ExperimentRunner:
         """
         Run the engine on an example with the experiment's parameters, score the result, and store it.
 
+        Spans: one root "RunExample" span (input, call_my_code, output) and one child "ScoreExample"
+        span for scoring, so the server sees a clear call_my_code vs scoring split (aligned with client-go).
+
         Args:
             example: The example to run. See Example.ts type
             call_my_code: Function that takes input and parameters, returns output (can be async)
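The docstring's span layout, sketched as a tree (span names from the docstring; the attribute placement is an assumption based on the description):

RunExample (root span: input, call_my_code, output)
└── ScoreExample (child span: scoring)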
@@ -335,7 +311,6 @@ class ExperimentRunner:
         example_id = example.get("id")
         if not example_id:
             raise ValueError("Example must have an 'id' field")
-        set_span_attribute("example", example_id)
 
         print(f"Running with parameters: {parameters_here}")
         original_env_vars: Dict[str, Optional[str]] = {}
@@ -345,7 +320,25 @@ class ExperimentRunner:
                 os.environ[key] = str(value)
         try:
             start = time.time() * 1000
-            output = call_my_code(input_data, parameters_here)
+
+            run_trace_id_ref: List[Optional[str]] = [None]
+
+            # Wrap engine to match run_example signature (input, parameters)
+            # Root span so server can find it by parent:unset; trace ID is sent to scoreAndStore
+            def set_trace_id(tid: Optional[str]) -> None:
+                run_trace_id_ref[0] = tid
+
+            @WithTracing(root=True)
+            async def wrapped_engine(input_data, parameters, set_trace_id: Callable[[Optional[str]], None]):
+                trace_id_here = get_active_trace_id()
+                set_trace_id(trace_id_here)
+                result = call_my_code(input_data, parameters)
+                # Handle async functions
+                if hasattr(result, "__await__"):
+                    result = await result
+                return result
+
+            output = wrapped_engine(input_data, parameters_here, set_trace_id)
             if hasattr(output, "__await__"):
                 output = await output
             duration = int((time.time() * 1000) - start)
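The one-element list acts as a mutable cell, letting the decorated inner coroutine hand the trace id back out to the enclosing scope without nonlocal; a self-contained sketch of the pattern (names and the id value are hypothetical):

from typing import List, Optional

def capture_example() -> Optional[str]:
    trace_id_ref: List[Optional[str]] = [None]

    def inner() -> None:
        # In the diff this value comes from get_active_trace_id()
        trace_id_ref[0] = "0123abcd"  # hypothetical trace id

    inner()
    return trace_id_ref[0]

print(capture_example())  # -> "0123abcd"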
@@ -354,10 +347,11 @@ class ExperimentRunner:
             dataset_metrics = self.get_dataset().get("metrics", [])
             specific_metrics = example.get("metrics", [])
             metrics = [*dataset_metrics, *specific_metrics]
-            result = …
+            result: Result = {"example": example_id, "scores": {}, "messages": {}, "errors": {}}
             for metric in metrics:
                 metric_id = metric.get("id")
-                if not metric_id:
+                score_key = _metric_score_key(metric)
+                if not metric_id or not score_key:
                     continue
                 scorer = scorer_for_metric_id.get(metric_id) if scorer_for_metric_id else None
                 if scorer:
@@ -367,15 +361,15 @@ class ExperimentRunner:
                 else:
                     continue
                 if not metric_result:
-                    result["errors"][metric_id] = "Scoring function returned None"
+                    result["errors"][score_key] = "Scoring function returned None"
                     continue
-                result["scores"][metric_id] = metric_result.get("score")
-                result["messages"][metric_id] = metric_result.get("message")
-                result["errors"][metric_id] = metric_result.get("error")
+                result["scores"][score_key] = metric_result.get("score")
+                result["messages"][score_key] = metric_result.get("message")
+                result["errors"][score_key] = metric_result.get("error")
             result["scores"]["duration"] = duration
             await flush_tracing()
             print(f"Call scoreAndStore ... for example: {example_id} with scores: {result['scores']}")
-            result = await self.score_and_store(example, output, result)
+            result = await self.score_and_store(example, output, result, trace_id=run_trace_id_ref[0])
             print(f"scoreAndStore returned: {result}")
             return [result]
         finally:
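For context, a hypothetical driver for the updated flow; the constructor arguments are assumptions, since only run() and run_example() appear in this diff:

async def call_my_code(input_data, parameters):
    # Stand-in engine; a real one would call a model with the parameters
    return {"answer": input_data}

async def main():
    runner = ExperimentRunner(...)  # dataset/experiment wiring not shown in this diff
    # run() now hands call_my_code straight to run_example(), which wraps it in a
    # root-span engine and forwards the captured trace id to scoreAndStore.
    await runner.run(call_my_code, scorer_for_metric_id=None)

# asyncio.run(main())  # left commented: the placeholder constructor above is not runnable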
aiqa/tracing.py
CHANGED
@@ -7,7 +7,7 @@ import inspect
 import fnmatch
 from typing import Any, Callable, Optional, List
 from functools import wraps
-from opentelemetry import trace
+from opentelemetry import context as otel_context, trace
 from opentelemetry.trace import Status, StatusCode
 
 from .client import get_aiqa_client, get_component_tag, get_aiqa_tracer
@@ -563,6 +563,7 @@ def WithTracing(
     ignore_output: Optional[List[str]] = None,
     filter_input: Optional[Callable[[Any], Any]] = None,
     filter_output: Optional[Callable[[Any], Any]] = None,
+    root: bool = False,
 ):
     """
     Decorator to automatically create spans for function calls.
@@ -592,6 +593,7 @@ def WithTracing(
             Returns a dict or any value (will be converted to dict). Applied before ignore_input.
         filter_output: Function to filter/transform output before recording.
             Receives the output value and returns a filtered/transformed version.
+        root: Whether this is a root span. If True, the span will not be linked to any parent spans.
 
     Example:
         @WithTracing
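A usage sketch for the new flag (the decorated function is hypothetical):

from aiqa.tracing import WithTracing

@WithTracing(root=True)
async def handle_job(payload):
    # Starts its own trace even when called inside another traced span,
    # so the server can find it with a parent:unset query.
    return {"ok": True}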
@@ -685,7 +687,8 @@ def WithTracing(
                 return executor()
             # Get tracer after initialization (lazy)
             tracer = get_aiqa_tracer()
-            with tracer.start_as_current_span(fn_name) as span:
+            span_kw = {"context": otel_context.Context()} if root else {}
+            with tracer.start_as_current_span(fn_name, **span_kw) as span:
                 if not _setup_span(span, input_data):
                     return executor()  # span is not recording, so just execute the function and return the result
                 try:
@@ -706,7 +709,8 @@ def WithTracing(
 
             # Get tracer after initialization (lazy)
             tracer = get_aiqa_tracer()
-            with tracer.start_as_current_span(fn_name) as span:
+            span_kw = {"context": otel_context.Context()} if root else {}
+            with tracer.start_as_current_span(fn_name, **span_kw) as span:
                 if not _setup_span(span, input_data):
                     return await executor()
 
@@ -732,7 +736,8 @@ def WithTracing(
             # Get tracer after initialization (lazy)
             tracer = get_aiqa_tracer()
             # Create span but don't use 'with' - span will be closed by TracedGenerator
-            span = tracer.start_span(fn_name)
+            span_kw = {"context": otel_context.Context()} if root else {}
+            span = tracer.start_span(fn_name, **span_kw)
             token = trace.context_api.attach(trace.context_api.set_span_in_context(span))
 
             try:
@@ -762,7 +767,8 @@ def WithTracing(
             # Get tracer after initialization (lazy)
             tracer = get_aiqa_tracer()
             # Create span but don't use 'with' - span will be closed by TracedAsyncGenerator
-            span = tracer.start_span(fn_name)
+            span_kw = {"context": otel_context.Context()} if root else {}
+            span = tracer.start_span(fn_name, **span_kw)
             token = trace.context_api.attach(trace.context_api.set_span_in_context(span))
 
             try:
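Why the empty context yields a root span: opentelemetry-python resolves a new span's parent from the supplied context, so a fresh otel_context.Context() carries no active span to inherit from. A minimal standalone sketch (the tracer name is arbitrary, standing in for get_aiqa_tracer()):

from opentelemetry import context as otel_context, trace

tracer = trace.get_tracer("demo")

with tracer.start_as_current_span("parent"):
    # Default behavior: a span opened here becomes a child of "parent".
    with tracer.start_as_current_span("detached", context=otel_context.Context()):
        pass  # empty context -> no active parent, so "detached" starts a new trace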
{aiqa_client-0.7.0.dist-info → aiqa_client-0.7.2.dist-info}/RECORD
CHANGED
@@ -1,17 +1,17 @@
 aiqa/__init__.py,sha256=JLQjgQgsyGQ1mRl4kcYygJq9i_91jN4WDem3dF1eMGA,1888
 aiqa/client.py,sha256=zS9OQQhdvVeIoBz0o8qrz-rjXngEbS9Lrli2ZWNIsrM,15993
-aiqa/constants.py,sha256=…
-aiqa/experiment_runner.py,sha256=…
+aiqa/constants.py,sha256=if54R1OD111iPvB53mw0U9NRrBV-zvvm1gOAVxRj-vE,226
+aiqa/experiment_runner.py,sha256=YpUOoBS_3DvT_ipofWe8MnrSjmWz4Bmfe8yaErdprBA,17730
 aiqa/http_utils.py,sha256=OIB4tRI2TiDl4VKDmtbLWg9Q7TicMBeL7scLYEhVPXI,4944
 aiqa/llm_as_judge.py,sha256=ESmqQfaYpypCNfsODkdn5s85n_nzJ4WKbhUMVTb2djE,10087
 aiqa/object_serialiser.py,sha256=mzd2U_mFcAPalN2m9wxq35-BBeRJOhNK1k0-BmRSfQM,17055
 aiqa/py.typed,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
 aiqa/span_helpers.py,sha256=Ht4T_JJXK4HlqBY_Qwe8QDk9XwWCjagx_DkOUVY-PmY,18189
-aiqa/tracing.py,sha256=…
+aiqa/tracing.py,sha256=XKYUwZUIkybxIkOKMj5xwVV2IwX6QTvRIzsZOv8jUOc,35771
 aiqa/tracing_llm_utils.py,sha256=zQSxzkEhPmgel1P2kFueNWTr846re-qHEFxD-_EHhNQ,10241
 aiqa/types.py,sha256=Rv27oC1R0P1soJz5wsdwkVW-jfHQEVi4vUhwRJid270,2529
-aiqa_client-0.7.0.dist-info/licenses/LICENSE.txt,sha256=…
-aiqa_client-0.7.0.dist-info/METADATA,sha256=…
-aiqa_client-0.7.0.dist-info/WHEEL,sha256=…
-aiqa_client-0.7.0.dist-info/top_level.txt,sha256=…
-aiqa_client-0.7.0.dist-info/RECORD,,
+aiqa_client-0.7.2.dist-info/licenses/LICENSE.txt,sha256=kIzkzLuzG0HHaWYm4F4W5FeJ1Yxut3Ec6bhLWyw798A,1062
+aiqa_client-0.7.2.dist-info/METADATA,sha256=Q4Wwu_FqNSB7IRdydBcRFDcL2bHNLyStT6DYkc_aS8E,7705
+aiqa_client-0.7.2.dist-info/WHEEL,sha256=wUyA8OaulRlbfwMtmQsvNngGrxQHAvkKcvRmdizlJi0,92
+aiqa_client-0.7.2.dist-info/top_level.txt,sha256=nwcsuVVSuWu27iLxZd4n1evVzv1W6FVTrSnCXCc-NQs,5
+aiqa_client-0.7.2.dist-info/RECORD,,
{aiqa_client-0.7.0.dist-info → aiqa_client-0.7.2.dist-info}/WHEEL
File without changes
{aiqa_client-0.7.0.dist-info → aiqa_client-0.7.2.dist-info}/licenses/LICENSE.txt
File without changes
{aiqa_client-0.7.0.dist-info → aiqa_client-0.7.2.dist-info}/top_level.txt
File without changes