aiqtoolkit 1.2.0a20250622__py3-none-any.whl → 1.2.0a20250623__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release.


This version of aiqtoolkit might be problematic. Click here for more details.

aiq/eval/evaluate.py CHANGED
@@ -32,6 +32,7 @@ from aiq.eval.evaluator.evaluator_model import EvalInput
32
32
  from aiq.eval.evaluator.evaluator_model import EvalInputItem
33
33
  from aiq.eval.evaluator.evaluator_model import EvalOutput
34
34
  from aiq.eval.utils.output_uploader import OutputUploader
35
+ from aiq.eval.utils.weave_eval import WeaveEvaluationIntegration
35
36
  from aiq.runtime.session import AIQSessionManager
36
37
 
37
38
  logger = logging.getLogger(__name__)
@@ -54,7 +55,7 @@ class EvaluationRun: # pylint: disable=too-many-public-methods
54
55
 
55
56
  # Helpers
56
57
  self.intermediate_step_adapter: IntermediateStepAdapter = IntermediateStepAdapter()
57
-
58
+ self.weave_eval: WeaveEvaluationIntegration = WeaveEvaluationIntegration()
58
59
  # Metadata
59
60
  self.eval_input: EvalInput | None = None
60
61
  self.workflow_interrupted: bool = False
@@ -138,6 +139,8 @@ class EvaluationRun: # pylint: disable=too-many-public-methods
138
139
  item.output_obj = output
139
140
  item.trajectory = self.intermediate_step_adapter.validate_intermediate_steps(intermediate_steps)
140
141
 
142
+ self.weave_eval.log_prediction(item, output)
143
+
141
144
  async def wrapped_run(item: EvalInputItem) -> None:
142
145
  await run_one(item)
143
146
  pbar.update(1)
@@ -268,11 +271,15 @@ class EvaluationRun: # pylint: disable=too-many-public-methods
268
271
  "`eval` with the --skip_completed_entries flag.")
269
272
  logger.warning(msg)
270
273
 
274
+ self.weave_eval.log_summary(self.evaluation_results)
275
+
271
276
  async def run_single_evaluator(self, evaluator_name: str, evaluator: Any):
272
277
  """Run a single evaluator and store its results."""
273
278
  try:
274
279
  eval_output = await evaluator.evaluate_fn(self.eval_input)
275
280
  self.evaluation_results.append((evaluator_name, eval_output))
281
+
282
+ await self.weave_eval.alog_score(eval_output, evaluator_name)
276
283
  except Exception as e:
277
284
  logger.exception("An error occurred while running evaluator %s: %s", evaluator_name, e, exc_info=True)
278
285
 
@@ -289,6 +296,9 @@ class EvaluationRun: # pylint: disable=too-many-public-methods
289
296
  except Exception as e:
290
297
  logger.exception("An error occurred while running evaluators: %s", e, exc_info=True)
291
298
  raise
299
+ finally:
300
+ # Finish prediction loggers in Weave
301
+ await self.weave_eval.afinish_loggers()
292
302
 
293
303
  def apply_overrides(self):
294
304
  from aiq.cli.cli_utils.config_override import load_and_override_config
@@ -362,6 +372,11 @@ class EvaluationRun: # pylint: disable=too-many-public-methods
362
372
 
363
373
  # Run workflow and evaluate
364
374
  async with WorkflowEvalBuilder.from_config(config=config) as eval_workflow:
375
+ # Initialize Weave integration
376
+ self.weave_eval.initialize_client()
377
+ if self.weave_eval.client:
378
+ self.weave_eval.initialize_logger(self.eval_input, config)
379
+
365
380
  if self.config.endpoint:
366
381
  await self.run_workflow_remote()
367
382
  else:
@@ -0,0 +1,135 @@
1
+ # SPDX-FileCopyrightText: Copyright (c) 2025, NVIDIA CORPORATION & AFFILIATES. All rights reserved.
2
+ # SPDX-License-Identifier: Apache-2.0
3
+ #
4
+ # Licensed under the Apache License, Version 2.0 (the "License");
5
+ # you may not use this file except in compliance with the License.
6
+ # You may obtain a copy of the License at
7
+ #
8
+ # http://www.apache.org/licenses/LICENSE-2.0
9
+ #
10
+ # Unless required by applicable law or agreed to in writing, software
11
+ # distributed under the License is distributed on an "AS IS" BASIS,
12
+ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13
+ # See the License for the specific language governing permissions and
14
+ # limitations under the License.
15
+
16
+ import asyncio
17
+ import logging
18
+ from typing import Any
19
+ from typing import List
20
+
21
+ from aiq.eval.evaluator.evaluator_model import EvalInput
22
+ from aiq.eval.evaluator.evaluator_model import EvalInputItem
23
+ from aiq.eval.evaluator.evaluator_model import EvalOutput
24
+
25
+ logger = logging.getLogger(__name__)
26
+
27
+
28
class WeaveEvaluationIntegration:  # pylint: disable=too-many-public-methods
    """
    Optional bridge between an evaluation run and Weave's imperative evaluation loggers.

    The ``weave`` package is imported lazily in ``__init__``. If the import fails, ``available``
    stays ``False`` and ``initialize_client`` returns ``False`` without touching Weave. All logging
    methods return immediately while ``eval_logger`` is unset, so callers may invoke them
    unconditionally.
    """

    def __init__(self):
        # True only when the weave modules below were imported successfully.
        self.available = False
        # Weave client handle; populated by initialize_client().
        self.client = None
        # Weave EvaluationLogger instance; populated by initialize_logger().
        self.eval_logger = None
        # Maps an eval item id to the prediction logger returned by Weave for that item.
        self.pred_loggers = {}

        try:
            from weave.flow.eval_imperative import EvaluationLogger
            from weave.flow.eval_imperative import ScoreLogger
            from weave.trace.context import weave_client_context
        except ImportError:
            # weave is an optional dependency: without it the integration simply stays disabled
            return

        self.EvaluationLogger = EvaluationLogger
        self.ScoreLogger = ScoreLogger
        self.weave_client_context = weave_client_context
        self.available = True

    def initialize_client(self):
        """
        Look up the Weave client if the ``weave`` package is available.

        Returns:
            ``True`` when a client was obtained and stored in ``self.client``; ``False`` when weave
            is unavailable or the lookup raised (``self.client`` is reset to ``None`` in that case).
        """
        if not self.available:
            return False

        try:
            self.client = self.weave_client_context.require_weave_client()
            return self.client is not None
        except Exception:
            # Best effort: any failure to obtain a client just disables the integration.
            self.client = None
            return False

    def initialize_logger(self, eval_input: "EvalInput", config: Any):
        """
        Create the Weave ``EvaluationLogger`` for this run.

        Args:
            eval_input: Evaluation input; each item is dumped (without ``output_obj`` and
                ``trajectory``) to form the dataset handed to Weave.
            config: Object exposing ``model_dump(mode="json")``; the dumped dict is passed to Weave
                as the model description.

        Returns:
            ``True`` when the logger was created, ``False`` when there is no client or creation
            failed (a warning is logged and ``self.eval_logger`` is reset to ``None``).
        """
        if not self.client:
            return False

        try:
            weave_dataset = [
                item.model_dump(exclude={"output_obj", "trajectory"}) for item in eval_input.eval_input_items
            ]
            config_dict = config.model_dump(mode="json")
            # TODO: make this configurable
            config_dict["name"] = "aiqtoolkit-eval"
            self.eval_logger = self.EvaluationLogger(model=config_dict, dataset=weave_dataset)
            # Start from a clean slate so loggers from an earlier run are not reused.
            self.pred_loggers = {}
            return True
        except Exception as e:
            self.eval_logger = None
            logger.warning("Failed to initialize Weave `EvaluationLogger`: %s", e)

        return False

    def log_prediction(self, item: "EvalInputItem", output: Any):
        """
        Log one prediction to Weave and remember its prediction logger under ``item.id``.

        Args:
            item: The evaluated input item; dumped without ``output_obj`` and ``trajectory``.
            output: The workflow output recorded for ``item``.
        """
        if not self.eval_logger:
            return

        pred_logger = self.eval_logger.log_prediction(inputs=item.model_dump(exclude={"output_obj", "trajectory"}),
                                                      output=output)
        self.pred_loggers[item.id] = pred_logger

    async def alog_score(self, eval_output: "EvalOutput", evaluator_name: str):
        """
        Attach an evaluator's per-item scores to the matching prediction loggers.

        Items whose id has no prediction logger (no prediction was logged for them) are skipped.

        Args:
            eval_output: Evaluator result whose ``eval_output_items`` carry ``id`` and ``score``.
            evaluator_name: Name used as the scorer label in Weave.
        """
        if not self.eval_logger:
            return

        for eval_output_item in eval_output.eval_output_items:
            if eval_output_item.id in self.pred_loggers:
                await self.pred_loggers[eval_output_item.id].alog_score(
                    scorer=evaluator_name,
                    score=eval_output_item.score,
                )

    async def afinish_loggers(self):
        """
        Finish every prediction logger that has not been finished yet.

        Loggers whose ``_has_finished`` attribute is truthy are skipped; loggers without that
        attribute are always finished. All pending loggers are finished concurrently.
        """
        if not self.eval_logger:
            return

        async def _finish_one(pred_logger):
            # Skip loggers that already report being finished. The previous check was inverted and
            # skipped the *pending* loggers instead, so they were never finalized.
            # NOTE(review): `_has_finished` is a private attribute of the Weave prediction logger,
            # presumably set once finish() has run - confirm against the installed weave version.
            if getattr(pred_logger, '_has_finished', False):
                return
            # run the *blocking* finish() in a thread so we don't nest loops
            await asyncio.to_thread(pred_logger.finish)

        await asyncio.gather(*[_finish_one(pl) for pl in self.pred_loggers.values()])

    def log_summary(self, evaluation_results: List[tuple[str, "EvalOutput"]]):
        """
        Log per-evaluator average scores to Weave as the evaluation summary.

        Args:
            evaluation_results: ``(evaluator_name, eval_output)`` pairs. For each pair the mean of
                the non-``None`` item scores is stored under ``"<evaluator_name>_avg"``; evaluators
                with no usable scores are left out of the summary.
        """
        if not self.eval_logger:
            return

        summary = {}
        for evaluator_name, eval_output in evaluation_results:
            # Calculate average score for this evaluator
            scores = [item.score for item in eval_output.eval_output_items if item.score is not None]
            if scores:
                summary[f"{evaluator_name}_avg"] = sum(scores) / len(scores)

        # Log the summary to finish the evaluation
        self.eval_logger.log_summary(summary)
@@ -1,6 +1,6 @@
1
1
  Metadata-Version: 2.4
2
2
  Name: aiqtoolkit
3
- Version: 1.2.0a20250622
3
+ Version: 1.2.0a20250623
4
4
  Summary: NVIDIA Agent Intelligence toolkit
5
5
  Author: NVIDIA Corporation
6
6
  Maintainer: NVIDIA Corporation
@@ -107,7 +107,7 @@ aiq/embedder/openai_embedder.py,sha256=5FO3xsyNvEmbLBsZb3xsCpbN1Soxio4yf4b5gTPVx
107
107
  aiq/embedder/register.py,sha256=3MTZrfNQKp6AZTbfaA-PpTnyXiMyu-8HH9JnDCC0v9o,978
108
108
  aiq/eval/__init__.py,sha256=Xs1JQ16L9btwreh4pdGKwskffAw1YFO48jKrU4ib_7c,685
109
109
  aiq/eval/config.py,sha256=IlOr2o618kbkXP0G1F-AklZfsKYVos9UB4Dvlxf66xk,1431
110
- aiq/eval/evaluate.py,sha256=AGEvmagd43jLq0aE_yNs_FFPFxVJEx49cu6Fl3WeQqA,17270
110
+ aiq/eval/evaluate.py,sha256=VdVdB_CV842gIV4diHciJ1qrof5_N3H8I16WwracCsQ,17940
111
111
  aiq/eval/intermediate_step_adapter.py,sha256=4cSsGgFBvNjXnclk5FvZnQaFEdeulp7VEdRWKLcREAQ,4498
112
112
  aiq/eval/register.py,sha256=QOHJqA2CQixeWMC9InyKbzXo1jByvrntD_m9-2Mvg9k,1076
113
113
  aiq/eval/remote_workflow.py,sha256=Fb7Z6gdP2L_gqyWB--AEWfcXe9xPpQ_hPsf9lmqGXjI,5524
@@ -134,6 +134,7 @@ aiq/eval/tunable_rag_evaluator/register.py,sha256=q4p2rFyMzWmaINJc961ZV4jzIlAN4G
134
134
  aiq/eval/utils/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
135
135
  aiq/eval/utils/output_uploader.py,sha256=SaQbZPkw-Q0H7t5yG60Kh-p1cflR7gPklVkilC4uPbU,5141
136
136
  aiq/eval/utils/tqdm_position_registry.py,sha256=9CtpCk1wtYCSyieHPaSp8nlZu6EcNUOaUz2RTqfekrA,1286
137
+ aiq/eval/utils/weave_eval.py,sha256=yIdlp4UdCPgwFYJNJon5eZD1d99E-6dcmfVg6B-4RKE,5076
137
138
  aiq/front_ends/__init__.py,sha256=Xs1JQ16L9btwreh4pdGKwskffAw1YFO48jKrU4ib_7c,685
138
139
  aiq/front_ends/register.py,sha256=OKv1xi-g8WHtUMuIPhwjG6wOYqaGDD-Q9vDtKtT9d1Y,889
139
140
  aiq/front_ends/console/__init__.py,sha256=Xs1JQ16L9btwreh4pdGKwskffAw1YFO48jKrU4ib_7c,685
@@ -309,10 +310,10 @@ aiq/utils/reactive/base/observer_base.py,sha256=UAlyAY_ky4q2t0P81RVFo2Bs_R7z5Nde
309
310
  aiq/utils/reactive/base/subject_base.py,sha256=Ed-AC6P7cT3qkW1EXjzbd5M9WpVoeN_9KCe3OM3FLU4,2521
310
311
  aiq/utils/settings/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
311
312
  aiq/utils/settings/global_settings.py,sha256=U9TCLdoZsKq5qOVGjREipGVv9e-FlStzqy5zv82_VYk,7454
312
- aiqtoolkit-1.2.0a20250622.dist-info/licenses/LICENSE-3rd-party.txt,sha256=8o7aySJa9CBvFshPcsRdJbczzdNyDGJ8b0J67WRUQ2k,183936
313
- aiqtoolkit-1.2.0a20250622.dist-info/licenses/LICENSE.md,sha256=QwcOLU5TJoTeUhuIXzhdCEEDDvorGiC6-3YTOl4TecE,11356
314
- aiqtoolkit-1.2.0a20250622.dist-info/METADATA,sha256=KCxZI4ThHpTWV2J0dM8RhFcfl9FNrytkdUbEd6ZgQ08,20274
315
- aiqtoolkit-1.2.0a20250622.dist-info/WHEEL,sha256=_zCd3N1l69ArxyTb8rzEoP9TpbYXkqRFSNOD5OuxnTs,91
316
- aiqtoolkit-1.2.0a20250622.dist-info/entry_points.txt,sha256=gRlPfR5g21t328WNEQ4CcEz80S1sJNS8A7rMDYnzl4A,452
317
- aiqtoolkit-1.2.0a20250622.dist-info/top_level.txt,sha256=fo7AzYcNhZ_tRWrhGumtxwnxMew4xrT1iwouDy_f0Kc,4
318
- aiqtoolkit-1.2.0a20250622.dist-info/RECORD,,
313
+ aiqtoolkit-1.2.0a20250623.dist-info/licenses/LICENSE-3rd-party.txt,sha256=8o7aySJa9CBvFshPcsRdJbczzdNyDGJ8b0J67WRUQ2k,183936
314
+ aiqtoolkit-1.2.0a20250623.dist-info/licenses/LICENSE.md,sha256=QwcOLU5TJoTeUhuIXzhdCEEDDvorGiC6-3YTOl4TecE,11356
315
+ aiqtoolkit-1.2.0a20250623.dist-info/METADATA,sha256=M98GDq-TQ5hxx-6C2mgkvNJD-NPsksihuT96qOukjEE,20274
316
+ aiqtoolkit-1.2.0a20250623.dist-info/WHEEL,sha256=_zCd3N1l69ArxyTb8rzEoP9TpbYXkqRFSNOD5OuxnTs,91
317
+ aiqtoolkit-1.2.0a20250623.dist-info/entry_points.txt,sha256=gRlPfR5g21t328WNEQ4CcEz80S1sJNS8A7rMDYnzl4A,452
318
+ aiqtoolkit-1.2.0a20250623.dist-info/top_level.txt,sha256=fo7AzYcNhZ_tRWrhGumtxwnxMew4xrT1iwouDy_f0Kc,4
319
+ aiqtoolkit-1.2.0a20250623.dist-info/RECORD,,