aiqtoolkit 1.2.0a20250621__py3-none-any.whl → 1.2.0a20250623__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Potentially problematic release.
This version of aiqtoolkit might be problematic. Click here for more details.
- aiq/eval/evaluate.py +16 -1
- aiq/eval/utils/weave_eval.py +135 -0
- {aiqtoolkit-1.2.0a20250621.dist-info → aiqtoolkit-1.2.0a20250623.dist-info}/METADATA +1 -1
- {aiqtoolkit-1.2.0a20250621.dist-info → aiqtoolkit-1.2.0a20250623.dist-info}/RECORD +9 -8
- {aiqtoolkit-1.2.0a20250621.dist-info → aiqtoolkit-1.2.0a20250623.dist-info}/WHEEL +0 -0
- {aiqtoolkit-1.2.0a20250621.dist-info → aiqtoolkit-1.2.0a20250623.dist-info}/entry_points.txt +0 -0
- {aiqtoolkit-1.2.0a20250621.dist-info → aiqtoolkit-1.2.0a20250623.dist-info}/licenses/LICENSE-3rd-party.txt +0 -0
- {aiqtoolkit-1.2.0a20250621.dist-info → aiqtoolkit-1.2.0a20250623.dist-info}/licenses/LICENSE.md +0 -0
- {aiqtoolkit-1.2.0a20250621.dist-info → aiqtoolkit-1.2.0a20250623.dist-info}/top_level.txt +0 -0
aiq/eval/evaluate.py
CHANGED
|
@@ -32,6 +32,7 @@ from aiq.eval.evaluator.evaluator_model import EvalInput
|
|
|
32
32
|
from aiq.eval.evaluator.evaluator_model import EvalInputItem
|
|
33
33
|
from aiq.eval.evaluator.evaluator_model import EvalOutput
|
|
34
34
|
from aiq.eval.utils.output_uploader import OutputUploader
|
|
35
|
+
from aiq.eval.utils.weave_eval import WeaveEvaluationIntegration
|
|
35
36
|
from aiq.runtime.session import AIQSessionManager
|
|
36
37
|
|
|
37
38
|
logger = logging.getLogger(__name__)
|
|
@@ -54,7 +55,7 @@ class EvaluationRun: # pylint: disable=too-many-public-methods
|
|
|
54
55
|
|
|
55
56
|
# Helpers
|
|
56
57
|
self.intermediate_step_adapter: IntermediateStepAdapter = IntermediateStepAdapter()
|
|
57
|
-
|
|
58
|
+
self.weave_eval: WeaveEvaluationIntegration = WeaveEvaluationIntegration()
|
|
58
59
|
# Metadata
|
|
59
60
|
self.eval_input: EvalInput | None = None
|
|
60
61
|
self.workflow_interrupted: bool = False
|
|
@@ -138,6 +139,8 @@ class EvaluationRun: # pylint: disable=too-many-public-methods
|
|
|
138
139
|
item.output_obj = output
|
|
139
140
|
item.trajectory = self.intermediate_step_adapter.validate_intermediate_steps(intermediate_steps)
|
|
140
141
|
|
|
142
|
+
self.weave_eval.log_prediction(item, output)
|
|
143
|
+
|
|
141
144
|
async def wrapped_run(item: EvalInputItem) -> None:
|
|
142
145
|
await run_one(item)
|
|
143
146
|
pbar.update(1)
|
|
@@ -268,11 +271,15 @@ class EvaluationRun: # pylint: disable=too-many-public-methods
|
|
|
268
271
|
"`eval` with the --skip_completed_entries flag.")
|
|
269
272
|
logger.warning(msg)
|
|
270
273
|
|
|
274
|
+
self.weave_eval.log_summary(self.evaluation_results)
|
|
275
|
+
|
|
271
276
|
async def run_single_evaluator(self, evaluator_name: str, evaluator: Any):
|
|
272
277
|
"""Run a single evaluator and store its results."""
|
|
273
278
|
try:
|
|
274
279
|
eval_output = await evaluator.evaluate_fn(self.eval_input)
|
|
275
280
|
self.evaluation_results.append((evaluator_name, eval_output))
|
|
281
|
+
|
|
282
|
+
await self.weave_eval.alog_score(eval_output, evaluator_name)
|
|
276
283
|
except Exception as e:
|
|
277
284
|
logger.exception("An error occurred while running evaluator %s: %s", evaluator_name, e, exc_info=True)
|
|
278
285
|
|
|
@@ -289,6 +296,9 @@ class EvaluationRun: # pylint: disable=too-many-public-methods
|
|
|
289
296
|
except Exception as e:
|
|
290
297
|
logger.exception("An error occurred while running evaluators: %s", e, exc_info=True)
|
|
291
298
|
raise
|
|
299
|
+
finally:
|
|
300
|
+
# Finish prediction loggers in Weave
|
|
301
|
+
await self.weave_eval.afinish_loggers()
|
|
292
302
|
|
|
293
303
|
def apply_overrides(self):
|
|
294
304
|
from aiq.cli.cli_utils.config_override import load_and_override_config
|
|
@@ -362,6 +372,11 @@ class EvaluationRun: # pylint: disable=too-many-public-methods
|
|
|
362
372
|
|
|
363
373
|
# Run workflow and evaluate
|
|
364
374
|
async with WorkflowEvalBuilder.from_config(config=config) as eval_workflow:
|
|
375
|
+
# Initialize Weave integration
|
|
376
|
+
self.weave_eval.initialize_client()
|
|
377
|
+
if self.weave_eval.client:
|
|
378
|
+
self.weave_eval.initialize_logger(self.eval_input, config)
|
|
379
|
+
|
|
365
380
|
if self.config.endpoint:
|
|
366
381
|
await self.run_workflow_remote()
|
|
367
382
|
else:
|
|
@@ -0,0 +1,135 @@
|
|
|
1
|
+
# SPDX-FileCopyrightText: Copyright (c) 2025, NVIDIA CORPORATION & AFFILIATES. All rights reserved.
|
|
2
|
+
# SPDX-License-Identifier: Apache-2.0
|
|
3
|
+
#
|
|
4
|
+
# Licensed under the Apache License, Version 2.0 (the "License");
|
|
5
|
+
# you may not use this file except in compliance with the License.
|
|
6
|
+
# You may obtain a copy of the License at
|
|
7
|
+
#
|
|
8
|
+
# http://www.apache.org/licenses/LICENSE-2.0
|
|
9
|
+
#
|
|
10
|
+
# Unless required by applicable law or agreed to in writing, software
|
|
11
|
+
# distributed under the License is distributed on an "AS IS" BASIS,
|
|
12
|
+
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
|
13
|
+
# See the License for the specific language governing permissions and
|
|
14
|
+
# limitations under the License.
|
|
15
|
+
|
|
16
|
+
import asyncio
|
|
17
|
+
import logging
|
|
18
|
+
from typing import Any
|
|
19
|
+
from typing import List
|
|
20
|
+
|
|
21
|
+
from aiq.eval.evaluator.evaluator_model import EvalInput
|
|
22
|
+
from aiq.eval.evaluator.evaluator_model import EvalInputItem
|
|
23
|
+
from aiq.eval.evaluator.evaluator_model import EvalOutput
|
|
24
|
+
|
|
25
|
+
logger = logging.getLogger(__name__)
|
|
26
|
+
|
|
27
|
+
|
|
28
|
+
class WeaveEvaluationIntegration:  # pylint: disable=too-many-public-methods
    """
    Class to handle all Weave integration functionality.

    Weave is an optional dependency. When it cannot be imported, or no Weave
    client can be obtained, every logging method on this class returns early
    without doing anything, so callers can invoke them unconditionally.

    Attributes are plain instance fields:
      * ``available``    - True when the Weave modules were imported successfully.
      * ``client``       - the Weave client returned by ``require_weave_client()``, or None.
      * ``eval_logger``  - the ``EvaluationLogger`` for the current run, or None.
      * ``pred_loggers`` - maps an eval item id to the logger object returned by
                           ``eval_logger.log_prediction`` for that item.
    """

    def __init__(self):
        self.available = False
        self.client = None
        self.eval_logger = None
        self.pred_loggers = {}

        try:
            from weave.flow.eval_imperative import EvaluationLogger
            from weave.flow.eval_imperative import ScoreLogger
            from weave.trace.context import weave_client_context
        except ImportError:
            # Weave is not installed: leave `available` False and do nothing else.
            return

        # Keep references on the instance so the rest of the class never has to
        # import Weave at module level.
        self.EvaluationLogger = EvaluationLogger
        self.ScoreLogger = ScoreLogger
        self.weave_client_context = weave_client_context
        self.available = True

    def initialize_client(self):
        """
        Initialize the Weave client if available.

        Returns:
            True when a Weave client was obtained and stored in ``self.client``,
            False when Weave is unavailable or the client lookup raised.
        """
        if not self.available:
            return False

        try:
            self.client = self.weave_client_context.require_weave_client()
            return self.client is not None
        except Exception:
            # Deliberate best-effort: any failure to obtain a client simply
            # disables the integration for this run.
            self.client = None
            return False

    def initialize_logger(self, eval_input: EvalInput, config: Any):
        """
        Initialize the Weave evaluation logger.

        Args:
            eval_input: evaluation input whose ``eval_input_items`` become the Weave dataset
                (each item dumped without its ``output_obj`` and ``trajectory`` fields).
            config: configuration object; it is dumped with ``model_dump(mode="json")``
                and passed to Weave as the model description.

        Returns:
            True when the ``EvaluationLogger`` was created, False when there is no
            client or creation failed (the failure is logged as a warning).
        """
        if not self.client:
            return False

        try:
            weave_dataset = [
                item.model_dump(exclude={"output_obj", "trajectory"}) for item in eval_input.eval_input_items
            ]
            config_dict = config.model_dump(mode="json")
            # TODO: make this configurable
            config_dict["name"] = "aiqtoolkit-eval"
            self.eval_logger = self.EvaluationLogger(model=config_dict, dataset=weave_dataset)
            # A new evaluation logger starts with no per-item prediction loggers.
            self.pred_loggers = {}
            return True
        except Exception as e:
            self.eval_logger = None
            logger.warning("Failed to initialize Weave `EvaluationLogger`: %s", e)

            return False

    def log_prediction(self, item: EvalInputItem, output: Any):
        """
        Log a prediction to Weave.

        The logger object returned by Weave is remembered under ``item.id`` so
        that scores can be attached to the same prediction later.
        """
        if not self.eval_logger:
            return

        pred_logger = self.eval_logger.log_prediction(inputs=item.model_dump(exclude={"output_obj", "trajectory"}),
                                                      output=output)
        self.pred_loggers[item.id] = pred_logger

    async def alog_score(self, eval_output: EvalOutput, evaluator_name: str):
        """
        Log scores for evaluation outputs.

        Output items whose id has no recorded prediction logger are skipped.
        """
        if not self.eval_logger:
            return

        for eval_output_item in eval_output.eval_output_items:
            pred_logger = self.pred_loggers.get(eval_output_item.id)
            if pred_logger is None:
                continue
            await pred_logger.alog_score(
                scorer=evaluator_name,
                score=eval_output_item.score,
            )

    async def afinish_loggers(self):
        """
        Finish all prediction loggers.

        Loggers that already report ``_has_finished`` are left alone; every
        other logger has its ``finish()`` called exactly once.
        """
        if not self.eval_logger:
            return

        async def _finish_one(pred_logger):
            # Skip loggers that are already finished. The previous condition was
            # inverted and skipped the *unfinished* ones instead.
            if getattr(pred_logger, '_has_finished', False):
                return
            # run the *blocking* finish() in a thread so we don't nest loops
            await asyncio.to_thread(pred_logger.finish)

        await asyncio.gather(*[_finish_one(pl) for pl in self.pred_loggers.values()])

    def log_summary(self, evaluation_results: List[tuple[str, EvalOutput]]):
        """
        Log summary statistics to Weave.

        For each ``(evaluator_name, eval_output)`` pair the mean of the non-None
        item scores is reported under the key ``"<evaluator_name>_avg"``;
        evaluators with no usable scores are omitted from the summary.
        """
        if not self.eval_logger:
            return

        summary = {}
        for evaluator_name, eval_output in evaluation_results:
            # Calculate average score for this evaluator
            scores = [item.score for item in eval_output.eval_output_items if item.score is not None]
            if scores:
                summary[f"{evaluator_name}_avg"] = sum(scores) / len(scores)

        # Log the summary to finish the evaluation
        self.eval_logger.log_summary(summary)
|
|
@@ -107,7 +107,7 @@ aiq/embedder/openai_embedder.py,sha256=5FO3xsyNvEmbLBsZb3xsCpbN1Soxio4yf4b5gTPVx
|
|
|
107
107
|
aiq/embedder/register.py,sha256=3MTZrfNQKp6AZTbfaA-PpTnyXiMyu-8HH9JnDCC0v9o,978
|
|
108
108
|
aiq/eval/__init__.py,sha256=Xs1JQ16L9btwreh4pdGKwskffAw1YFO48jKrU4ib_7c,685
|
|
109
109
|
aiq/eval/config.py,sha256=IlOr2o618kbkXP0G1F-AklZfsKYVos9UB4Dvlxf66xk,1431
|
|
110
|
-
aiq/eval/evaluate.py,sha256=
|
|
110
|
+
aiq/eval/evaluate.py,sha256=VdVdB_CV842gIV4diHciJ1qrof5_N3H8I16WwracCsQ,17940
|
|
111
111
|
aiq/eval/intermediate_step_adapter.py,sha256=4cSsGgFBvNjXnclk5FvZnQaFEdeulp7VEdRWKLcREAQ,4498
|
|
112
112
|
aiq/eval/register.py,sha256=QOHJqA2CQixeWMC9InyKbzXo1jByvrntD_m9-2Mvg9k,1076
|
|
113
113
|
aiq/eval/remote_workflow.py,sha256=Fb7Z6gdP2L_gqyWB--AEWfcXe9xPpQ_hPsf9lmqGXjI,5524
|
|
@@ -134,6 +134,7 @@ aiq/eval/tunable_rag_evaluator/register.py,sha256=q4p2rFyMzWmaINJc961ZV4jzIlAN4G
|
|
|
134
134
|
aiq/eval/utils/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
|
|
135
135
|
aiq/eval/utils/output_uploader.py,sha256=SaQbZPkw-Q0H7t5yG60Kh-p1cflR7gPklVkilC4uPbU,5141
|
|
136
136
|
aiq/eval/utils/tqdm_position_registry.py,sha256=9CtpCk1wtYCSyieHPaSp8nlZu6EcNUOaUz2RTqfekrA,1286
|
|
137
|
+
aiq/eval/utils/weave_eval.py,sha256=yIdlp4UdCPgwFYJNJon5eZD1d99E-6dcmfVg6B-4RKE,5076
|
|
137
138
|
aiq/front_ends/__init__.py,sha256=Xs1JQ16L9btwreh4pdGKwskffAw1YFO48jKrU4ib_7c,685
|
|
138
139
|
aiq/front_ends/register.py,sha256=OKv1xi-g8WHtUMuIPhwjG6wOYqaGDD-Q9vDtKtT9d1Y,889
|
|
139
140
|
aiq/front_ends/console/__init__.py,sha256=Xs1JQ16L9btwreh4pdGKwskffAw1YFO48jKrU4ib_7c,685
|
|
@@ -309,10 +310,10 @@ aiq/utils/reactive/base/observer_base.py,sha256=UAlyAY_ky4q2t0P81RVFo2Bs_R7z5Nde
|
|
|
309
310
|
aiq/utils/reactive/base/subject_base.py,sha256=Ed-AC6P7cT3qkW1EXjzbd5M9WpVoeN_9KCe3OM3FLU4,2521
|
|
310
311
|
aiq/utils/settings/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
|
|
311
312
|
aiq/utils/settings/global_settings.py,sha256=U9TCLdoZsKq5qOVGjREipGVv9e-FlStzqy5zv82_VYk,7454
|
|
312
|
-
aiqtoolkit-1.2.
|
|
313
|
-
aiqtoolkit-1.2.
|
|
314
|
-
aiqtoolkit-1.2.
|
|
315
|
-
aiqtoolkit-1.2.
|
|
316
|
-
aiqtoolkit-1.2.
|
|
317
|
-
aiqtoolkit-1.2.
|
|
318
|
-
aiqtoolkit-1.2.
|
|
313
|
+
aiqtoolkit-1.2.0a20250623.dist-info/licenses/LICENSE-3rd-party.txt,sha256=8o7aySJa9CBvFshPcsRdJbczzdNyDGJ8b0J67WRUQ2k,183936
|
|
314
|
+
aiqtoolkit-1.2.0a20250623.dist-info/licenses/LICENSE.md,sha256=QwcOLU5TJoTeUhuIXzhdCEEDDvorGiC6-3YTOl4TecE,11356
|
|
315
|
+
aiqtoolkit-1.2.0a20250623.dist-info/METADATA,sha256=M98GDq-TQ5hxx-6C2mgkvNJD-NPsksihuT96qOukjEE,20274
|
|
316
|
+
aiqtoolkit-1.2.0a20250623.dist-info/WHEEL,sha256=_zCd3N1l69ArxyTb8rzEoP9TpbYXkqRFSNOD5OuxnTs,91
|
|
317
|
+
aiqtoolkit-1.2.0a20250623.dist-info/entry_points.txt,sha256=gRlPfR5g21t328WNEQ4CcEz80S1sJNS8A7rMDYnzl4A,452
|
|
318
|
+
aiqtoolkit-1.2.0a20250623.dist-info/top_level.txt,sha256=fo7AzYcNhZ_tRWrhGumtxwnxMew4xrT1iwouDy_f0Kc,4
|
|
319
|
+
aiqtoolkit-1.2.0a20250623.dist-info/RECORD,,
|
|
File without changes
|
{aiqtoolkit-1.2.0a20250621.dist-info → aiqtoolkit-1.2.0a20250623.dist-info}/entry_points.txt
RENAMED
|
File without changes
|
|
File without changes
|
{aiqtoolkit-1.2.0a20250621.dist-info → aiqtoolkit-1.2.0a20250623.dist-info}/licenses/LICENSE.md
RENAMED
|
File without changes
|
|
File without changes
|