aiqtoolkit 1.2.0a20250626__py3-none-any.whl → 1.2.0a20250628__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release.


This version of aiqtoolkit might be problematic. Click here for more details.

@@ -30,7 +30,8 @@ from aiq.data_models.common import TypedBaseModel
30
30
 
31
31
  class EvalS3Config(BaseModel):
32
32
 
33
- endpoint_url: str
33
+ endpoint_url: str | None = None
34
+ region_name: str | None = None
34
35
  bucket: str
35
36
  access_key: str
36
37
  secret_key: str
@@ -72,6 +72,10 @@ class EvalOutputConfig(BaseModel):
72
72
  class EvalGeneralConfig(BaseModel):
73
73
  max_concurrency: int = 8
74
74
 
75
+ # Workflow alias for displaying in evaluation UI, if not provided,
76
+ # the workflow type will be used
77
+ workflow_alias: str | None = None
78
+
75
79
  # Output directory for the workflow and evaluation results
76
80
  output_dir: Path = Path("/tmp/aiq/examples/default/")
77
81
 
@@ -42,6 +42,7 @@ class PrefixSpanConfig(BaseModel):
42
42
 
43
43
  class ProfilerConfig(BaseModel):
44
44
 
45
+ base_metrics: bool = False
45
46
  token_usage_forecast: bool = False
46
47
  token_uniqueness_forecast: bool = False
47
48
  workflow_runtime_forecast: bool = False
@@ -152,6 +152,16 @@ class DatasetHandler:
152
152
  allow re-running evaluation using the original config file and '--skip_workflow' option.
153
153
  """
154
154
 
155
+ def parse_if_json_string(value):
156
+ if isinstance(value, str):
157
+ try:
158
+ return json.loads(value)
159
+ except json.JSONDecodeError:
160
+ return value
161
+ if hasattr(value, "model_dump"):
162
+ return value.model_dump()
163
+ return value
164
+
155
165
  indent = 2
156
166
  if self.is_structured_input():
157
167
  # Extract structured data from EvalInputItems
@@ -165,6 +175,6 @@ class DatasetHandler:
165
175
  } for item in eval_input.eval_input_items]
166
176
  else:
167
177
  # Unstructured case: return only raw output objects as a JSON array
168
- data = [json.loads(item.output_obj) for item in eval_input.eval_input_items]
178
+ data = [parse_if_json_string(item.output_obj) for item in eval_input.eval_input_items]
169
179
 
170
180
  return json.dumps(data, indent=indent, ensure_ascii=False, default=str)
aiq/eval/evaluate.py CHANGED
@@ -31,8 +31,12 @@ from aiq.eval.dataset_handler.dataset_handler import DatasetHandler
31
31
  from aiq.eval.evaluator.evaluator_model import EvalInput
32
32
  from aiq.eval.evaluator.evaluator_model import EvalInputItem
33
33
  from aiq.eval.evaluator.evaluator_model import EvalOutput
34
+ from aiq.eval.usage_stats import UsageStats
35
+ from aiq.eval.usage_stats import UsageStatsItem
36
+ from aiq.eval.usage_stats import UsageStatsLLM
34
37
  from aiq.eval.utils.output_uploader import OutputUploader
35
38
  from aiq.eval.utils.weave_eval import WeaveEvaluationIntegration
39
+ from aiq.profiler.data_models import ProfilerResults
36
40
  from aiq.runtime.session import AIQSessionManager
37
41
 
38
42
  logger = logging.getLogger(__name__)
@@ -63,12 +67,46 @@ class EvaluationRun: # pylint: disable=too-many-public-methods
63
67
  # evaluation_results is list of tuples (evaluator_name, EvalOutput)
64
68
  self.evaluation_results: list[tuple[str, EvalOutput]] = []
65
69
 
70
+ # usage stats
71
+ self.usage_stats: UsageStats = UsageStats()
72
+
66
73
  # workflow output file
67
74
  self.workflow_output_file: Path | None = None
68
75
 
69
76
  # evaluation output files
70
77
  self.evaluator_output_files: list[Path] = []
71
78
 
79
+ def _compute_usage_stats(self, item: EvalInputItem):
80
+ """Compute usage stats for a single item using the intermediate steps"""
81
+ # get the prompt and completion tokens from the intermediate steps
82
+ from aiq.profiler.intermediate_property_adapter import IntermediatePropertyAdaptor
83
+ steps = [IntermediatePropertyAdaptor.from_intermediate_step(step) for step in item.trajectory]
84
+ usage_stats_per_llm = {}
85
+ total_tokens = 0
86
+ for step in steps:
87
+ if step.event_type == "LLM_END":
88
+ llm_name = step.llm_name
89
+ if llm_name not in usage_stats_per_llm:
90
+ usage_stats_per_llm[llm_name] = UsageStatsLLM()
91
+ usage_stats_per_llm[llm_name].prompt_tokens += step.token_usage.prompt_tokens
92
+ usage_stats_per_llm[llm_name].completion_tokens += step.token_usage.completion_tokens
93
+ usage_stats_per_llm[llm_name].total_tokens += step.token_usage.total_tokens
94
+ total_tokens += step.token_usage.total_tokens
95
+
96
+ # find min and max event timestamps
97
+ if item.trajectory:
98
+ min_timestamp = min(step.event_timestamp for step in item.trajectory)
99
+ max_timestamp = max(step.event_timestamp for step in item.trajectory)
100
+ runtime = max_timestamp - min_timestamp
101
+ else:
102
+ runtime = 0.0
103
+
104
+ # add the usage stats to the usage stats dict
105
+ self.usage_stats.usage_stats_items[item.id] = UsageStatsItem(usage_stats_per_llm=usage_stats_per_llm,
106
+ runtime=runtime,
107
+ total_tokens=total_tokens)
108
+ return self.usage_stats.usage_stats_items[item.id]
109
+
72
110
  async def run_workflow_local(self, session_manager: AIQSessionManager):
73
111
  '''
74
112
  Launch the workflow with the specified questions and extract the output using the jsonpath
@@ -138,8 +176,10 @@ class EvaluationRun: # pylint: disable=too-many-public-methods
138
176
 
139
177
  item.output_obj = output
140
178
  item.trajectory = self.intermediate_step_adapter.validate_intermediate_steps(intermediate_steps)
179
+ usage_stats_item = self._compute_usage_stats(item)
141
180
 
142
181
  self.weave_eval.log_prediction(item, output)
182
+ await self.weave_eval.log_usage_stats(item, usage_stats_item)
143
183
 
144
184
  async def wrapped_run(item: EvalInputItem) -> None:
145
185
  await run_one(item)
@@ -161,15 +201,19 @@ class EvaluationRun: # pylint: disable=too-many-public-methods
161
201
  from aiq.eval.remote_workflow import EvaluationRemoteWorkflowHandler
162
202
  handler = EvaluationRemoteWorkflowHandler(self.config, self.eval_config.general.max_concurrency)
163
203
  await handler.run_workflow_remote(self.eval_input)
204
+ for item in self.eval_input.eval_input_items:
205
+ usage_stats_item = self._compute_usage_stats(item)
206
+ self.weave_eval.log_prediction(item, item.output_obj)
207
+ await self.weave_eval.log_usage_stats(item, usage_stats_item)
164
208
 
165
- async def profile_workflow(self):
209
+ async def profile_workflow(self) -> ProfilerResults:
166
210
  """
167
211
  Profile a dataset
168
212
  """
169
213
 
170
214
  if not self.eval_config.general.profiler:
171
215
  logger.info("Profiler is not enabled. Skipping profiling.")
172
- return
216
+ return ProfilerResults()
173
217
 
174
218
  from aiq.profiler.profile_runner import ProfilerRunner
175
219
 
@@ -179,7 +223,7 @@ class EvaluationRun: # pylint: disable=too-many-public-methods
179
223
 
180
224
  profiler_runner = ProfilerRunner(self.eval_config.general.profiler, self.eval_config.general.output_dir)
181
225
 
182
- await profiler_runner.run(all_stats)
226
+ return await profiler_runner.run(all_stats)
183
227
 
184
228
  def cleanup_output_directory(self):
185
229
  '''Remove contents of the output directory if it exists'''
@@ -238,7 +282,7 @@ class EvaluationRun: # pylint: disable=too-many-public-methods
238
282
  except Exception as e:
239
283
  logger.exception("Failed to delete old job directory: %s: %s", dir_to_delete, e, exc_info=True)
240
284
 
241
- def write_output(self, dataset_handler: DatasetHandler):
285
+ def write_output(self, dataset_handler: DatasetHandler, profiler_results: ProfilerResults):
242
286
  workflow_output_file = self.eval_config.general.output_dir / "workflow_output.json"
243
287
  workflow_output_file.parent.mkdir(parents=True, exist_ok=True)
244
288
 
@@ -271,7 +315,7 @@ class EvaluationRun: # pylint: disable=too-many-public-methods
271
315
  "`eval` with the --skip_completed_entries flag.")
272
316
  logger.warning(msg)
273
317
 
274
- self.weave_eval.log_summary(self.evaluation_results)
318
+ self.weave_eval.log_summary(self.usage_stats, self.evaluation_results, profiler_results)
275
319
 
276
320
  async def run_single_evaluator(self, evaluator_name: str, evaluator: Any):
277
321
  """Run a single evaluator and store its results."""
@@ -314,6 +358,16 @@ class EvaluationRun: # pylint: disable=too-many-public-methods
314
358
  config = validate_schema(config_dict, AIQConfig)
315
359
  return config
316
360
 
361
+ def _get_workflow_alias(self, workflow_type: str | None = None):
362
+ """Get the workflow alias for displaying in evaluation UI."""
363
+ if self.eval_config.general.workflow_alias:
364
+ return self.eval_config.general.workflow_alias
365
+
366
+ if not workflow_type or workflow_type == "EmptyFunctionConfig":
367
+ return "aiqtoolkit-eval"
368
+
369
+ return workflow_type
370
+
317
371
  async def run_and_evaluate(self,
318
372
  session_manager: AIQSessionManager | None = None,
319
373
  job_id: str | None = None) -> EvaluationRunOutput:
@@ -331,7 +385,8 @@ class EvaluationRun: # pylint: disable=too-many-public-methods
331
385
  else:
332
386
  config = load_config(self.config.config_file)
333
387
  self.eval_config = config.eval
334
- logger.debug("Loaded evaluation configuration: %s", self.eval_config)
388
+ workflow_alias = self._get_workflow_alias(config.workflow.type)
389
+ logger.debug("Loaded %s evaluation configuration: %s", workflow_alias, self.eval_config)
335
390
 
336
391
  # Cleanup the output directory
337
392
  if self.eval_config.general.output:
@@ -373,10 +428,9 @@ class EvaluationRun: # pylint: disable=too-many-public-methods
373
428
  # Run workflow and evaluate
374
429
  async with WorkflowEvalBuilder.from_config(config=config) as eval_workflow:
375
430
  # Initialize Weave integration
376
- self.weave_eval.initialize_client()
377
- if self.weave_eval.client:
378
- self.weave_eval.initialize_logger(self.eval_input, config)
431
+ self.weave_eval.initialize_logger(workflow_alias, self.eval_input, config)
379
432
 
433
+ # Run workflow
380
434
  if self.config.endpoint:
381
435
  await self.run_workflow_remote()
382
436
  else:
@@ -391,10 +445,10 @@ class EvaluationRun: # pylint: disable=too-many-public-methods
391
445
  await self.run_evaluators(evaluators)
392
446
 
393
447
  # Profile the workflow
394
- await self.profile_workflow()
448
+ profiler_results = await self.profile_workflow()
395
449
 
396
450
  # Write the results to the output directory
397
- self.write_output(dataset_handler)
451
+ self.write_output(dataset_handler, profiler_results)
398
452
 
399
453
  # Run custom scripts and upload evaluation outputs to S3
400
454
  if self.eval_config.general.output:
@@ -0,0 +1,35 @@
1
+ # SPDX-FileCopyrightText: Copyright (c) 2025, NVIDIA CORPORATION & AFFILIATES. All rights reserved.
2
+ # SPDX-License-Identifier: Apache-2.0
3
+ #
4
+ # Licensed under the Apache License, Version 2.0 (the "License");
5
+ # you may not use this file except in compliance with the License.
6
+ # You may obtain a copy of the License at
7
+ #
8
+ # http://www.apache.org/licenses/LICENSE-2.0
9
+ #
10
+ # Unless required by applicable law or agreed to in writing, software
11
+ # distributed under the License is distributed on an "AS IS" BASIS,
12
+ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13
+ # See the License for the specific language governing permissions and
14
+ # limitations under the License.
15
+
16
+ import typing
17
+
18
+ from pydantic import BaseModel
19
+
20
+
21
+ class UsageStatsLLM(BaseModel):
22
+ prompt_tokens: int = 0
23
+ completion_tokens: int = 0
24
+ total_tokens: int = 0
25
+
26
+
27
+ class UsageStatsItem(BaseModel):
28
+ usage_stats_per_llm: dict[str, UsageStatsLLM]
29
+ total_tokens: int | None = None
30
+ runtime: float = 0.0
31
+
32
+
33
+ class UsageStats(BaseModel):
34
+ # key is the id or input_obj from EvalInputItem
35
+ usage_stats_items: dict[typing.Any, UsageStatsItem] = {}
@@ -78,9 +78,18 @@ class OutputUploader:
78
78
 
79
79
  session = aioboto3.Session()
80
80
  try:
81
+ if self.s3_config.endpoint_url:
82
+ region_name = None
83
+ endpoint_url = self.s3_config.endpoint_url
84
+ elif self.s3_config.region_name:
85
+ region_name = self.s3_config.region_name
86
+ endpoint_url = None
87
+ else:
88
+ raise ValueError("No endpoint_url or region_name provided in the config: eval.general.output.s3")
81
89
  async with session.client(
82
90
  "s3",
83
- endpoint_url=self.s3_config.endpoint_url,
91
+ endpoint_url=endpoint_url,
92
+ region_name=region_name,
84
93
  aws_access_key_id=self.s3_config.access_key,
85
94
  aws_secret_access_key=self.s3_config.secret_key,
86
95
  ) as s3_client:
@@ -16,11 +16,13 @@
16
16
  import asyncio
17
17
  import logging
18
18
  from typing import Any
19
- from typing import List
20
19
 
21
20
  from aiq.eval.evaluator.evaluator_model import EvalInput
22
21
  from aiq.eval.evaluator.evaluator_model import EvalInputItem
23
22
  from aiq.eval.evaluator.evaluator_model import EvalOutput
23
+ from aiq.eval.usage_stats import UsageStats
24
+ from aiq.eval.usage_stats import UsageStatsItem
25
+ from aiq.profiler.data_models import ProfilerResults
24
26
 
25
27
  logger = logging.getLogger(__name__)
26
28
 
@@ -61,23 +63,35 @@ class WeaveEvaluationIntegration: # pylint: disable=too-many-public-methods
61
63
  self.client = None
62
64
  return False
63
65
 
64
- def initialize_logger(self, eval_input: EvalInput, config: Any):
66
+ def _get_prediction_inputs(self, item: EvalInputItem):
67
+ """Get the inputs for displaying in the UI.
68
+ The following fields are excluded as they are too large to display in the UI:
69
+ - full_dataset_entry
70
+ - expected_trajectory
71
+ - trajectory
72
+
73
+ output_obj is excluded because it is displayed separately.
74
+ """
75
+ include = {"id", "input_obj", "expected_output_obj"}
76
+ return item.model_dump(include=include)
77
+
78
+ def _get_weave_dataset(self, eval_input: EvalInput):
79
+ """Get the full dataset for Weave."""
80
+ return [item.full_dataset_entry for item in eval_input.eval_input_items]
81
+
82
+ def initialize_logger(self, workflow_alias: str, eval_input: EvalInput, config: Any):
65
83
  """Initialize the Weave evaluation logger."""
66
- if not self.client:
84
+ if not self.client and not self.initialize_client():
85
+ # lazy init the client
67
86
  return False
68
87
 
69
88
  try:
70
- weave_dataset = [
71
- item.model_dump(exclude={"output_obj", "trajectory"}) for item in eval_input.eval_input_items
72
- ]
89
+ weave_dataset = self._get_weave_dataset(eval_input)
73
90
  config_dict = config.model_dump(mode="json")
74
- # TODO: make this configurable
75
- config_dict["name"] = "aiqtoolkit-eval"
91
+ config_dict["name"] = workflow_alias
76
92
  self.eval_logger = self.EvaluationLogger(model=config_dict, dataset=weave_dataset)
77
93
  self.pred_loggers = {}
78
94
 
79
- del weave_dataset
80
- del config_dict
81
95
  return True
82
96
  except Exception as e:
83
97
  self.eval_logger = None
@@ -90,21 +104,37 @@ class WeaveEvaluationIntegration: # pylint: disable=too-many-public-methods
90
104
  if not self.eval_logger:
91
105
  return
92
106
 
93
- pred_logger = self.eval_logger.log_prediction(inputs=item.model_dump(exclude={"output_obj", "trajectory"}),
94
- output=output)
107
+ pred_logger = self.eval_logger.log_prediction(inputs=self._get_prediction_inputs(item), output=output)
95
108
  self.pred_loggers[item.id] = pred_logger
96
109
 
110
+ async def log_usage_stats(self, item: EvalInputItem, usage_stats_item: UsageStatsItem):
111
+ """Log usage stats to Weave."""
112
+ if not self.eval_logger:
113
+ return
114
+
115
+ # log each usage stat as a score
116
+ await self.pred_loggers[item.id].alog_score(scorer="wf_runtime", score=usage_stats_item.runtime)
117
+
118
+ # log the total tokens for this item, per-llm tokens can be exported later if needed
119
+ await self.pred_loggers[item.id].alog_score(scorer="wf_tokens", score=usage_stats_item.total_tokens)
120
+
97
121
  async def alog_score(self, eval_output: EvalOutput, evaluator_name: str):
98
122
  """Log scores for evaluation outputs."""
99
123
  if not self.eval_logger:
100
124
  return
101
125
 
126
+ # Create coroutines for all score logging operations
127
+ coros = []
102
128
  for eval_output_item in eval_output.eval_output_items:
103
129
  if eval_output_item.id in self.pred_loggers:
104
- await self.pred_loggers[eval_output_item.id].alog_score(
130
+ coros.append(self.pred_loggers[eval_output_item.id].alog_score(
105
131
  scorer=evaluator_name,
106
132
  score=eval_output_item.score,
107
- )
133
+ ))
134
+
135
+ # Execute all coroutines concurrently
136
+ if coros:
137
+ await asyncio.gather(*coros)
108
138
 
109
139
  async def afinish_loggers(self):
110
140
  """Finish all prediction loggers."""
@@ -114,22 +144,37 @@ class WeaveEvaluationIntegration: # pylint: disable=too-many-public-methods
114
144
  async def _finish_one(pred_logger):
115
145
  if hasattr(pred_logger, '_has_finished') and not pred_logger._has_finished:
116
146
  return
117
- # run the *blocking* finish() in a thread so we dont nest loops
147
+ # run the *blocking* finish() in a thread so we don't nest loops
118
148
  await asyncio.to_thread(pred_logger.finish)
119
149
 
120
150
  await asyncio.gather(*[_finish_one(pl) for pl in self.pred_loggers.values()])
121
151
 
122
- def log_summary(self, evaluation_results: List[tuple[str, EvalOutput]]):
152
+ def _log_profiler_metrics(self, profiler_results: ProfilerResults, usage_stats: UsageStats) -> dict[str, Any]:
153
+ """Log profiler metrics to Weave."""
154
+ profile_metrics = {}
155
+ if profiler_results.workflow_runtime_metrics:
156
+ profile_metrics["wf_p95_runtime"] = profiler_results.workflow_runtime_metrics.p95
157
+
158
+ # TODO: get the LLM tokens from the usage stats and log them
159
+ return profile_metrics
160
+
161
+ def log_summary(self,
162
+ usage_stats: UsageStats,
163
+ evaluation_results: list[tuple[str, EvalOutput]],
164
+ profiler_results: ProfilerResults):
123
165
  """Log summary statistics to Weave."""
124
166
  if not self.eval_logger:
125
167
  return
126
168
 
127
169
  summary = {}
170
+ # add evaluation results to the summary
128
171
  for evaluator_name, eval_output in evaluation_results:
129
- # Calculate average score for this evaluator
130
- scores = [item.score for item in eval_output.eval_output_items if item.score is not None]
131
- if scores:
132
- summary[f"{evaluator_name}_avg"] = sum(scores) / len(scores)
172
+ summary[evaluator_name] = eval_output.average_score
173
+
174
+ # add profiler metrics to the summary
175
+ profile_metrics = self._log_profiler_metrics(profiler_results, usage_stats)
176
+ summary.update(profile_metrics)
133
177
 
134
- # Log the summary to finish the evaluation
135
- self.eval_logger.log_summary(summary)
178
+ # Log the summary to finish the evaluation, disable auto-summarize
179
+ # as we will be adding profiler metrics to the summary
180
+ self.eval_logger.log_summary(summary, auto_summarize=False)
@@ -0,0 +1,22 @@
1
+ # SPDX-FileCopyrightText: Copyright (c) 2025, NVIDIA CORPORATION & AFFILIATES. All rights reserved.
2
+ # SPDX-License-Identifier: Apache-2.0
3
+ #
4
+ # Licensed under the Apache License, Version 2.0 (the "License");
5
+ # you may not use this file except in compliance with the License.
6
+ # You may obtain a copy of the License at
7
+ #
8
+ # http://www.apache.org/licenses/LICENSE-2.0
9
+ #
10
+ # Unless required by applicable law or agreed to in writing, software
11
+ # distributed under the License is distributed on an "AS IS" BASIS,
12
+ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13
+ # See the License for the specific language governing permissions and
14
+ # limitations under the License.
15
+
16
+ from pydantic import BaseModel
17
+
18
+ from aiq.profiler.inference_optimization.data_models import WorkflowRuntimeMetrics
19
+
20
+
21
+ class ProfilerResults(BaseModel):
22
+ workflow_runtime_metrics: WorkflowRuntimeMetrics | None = None
@@ -23,3 +23,6 @@ class InferenceMetricsModel(BaseModel):
23
23
  ninetieth_interval: tuple[float, float] = Field(default=(0, 0), description="90% confidence interval")
24
24
  ninety_fifth_interval: tuple[float, float] = Field(default=(0, 0), description="95% confidence interval")
25
25
  ninety_ninth_interval: tuple[float, float] = Field(default=(0, 0), description="99% confidence interval")
26
+ p90: float = Field(default=0, description="90th percentile of the samples")
27
+ p95: float = Field(default=0, description="95th percentile of the samples")
28
+ p99: float = Field(default=0, description="99th percentile of the samples")
@@ -25,6 +25,7 @@ from pydantic import BaseModel
25
25
 
26
26
  from aiq.data_models.evaluate import ProfilerConfig
27
27
  from aiq.data_models.intermediate_step import IntermediateStep
28
+ from aiq.profiler.data_models import ProfilerResults
28
29
  from aiq.profiler.forecasting.model_trainer import ModelTrainer
29
30
  from aiq.profiler.inference_metrics_model import InferenceMetricsModel
30
31
  from aiq.profiler.utils import create_standardized_dataframe
@@ -80,7 +81,7 @@ class ProfilerRunner:
80
81
  # Ensure output directory
81
82
  os.makedirs(output_dir, exist_ok=True)
82
83
 
83
- async def run(self, all_steps: list[list[IntermediateStep]]):
84
+ async def run(self, all_steps: list[list[IntermediateStep]]) -> ProfilerResults:
84
85
  """
85
86
  Main entrypoint: Works on Input DataFrame generated from eval to fit forecasting model,
86
87
  writes out combined requests JSON, then computes and saves additional metrics,
@@ -171,7 +172,7 @@ class ProfilerRunner:
171
172
  uniqueness = compute_inter_query_token_uniqueness_by_llm(all_steps)
172
173
  token_uniqueness_results = uniqueness
173
174
 
174
- if self.profile_config.workflow_runtime_forecast:
175
+ if self.profile_config.workflow_runtime_forecast or self.profile_config.base_metrics:
175
176
  # ------------------------------------------------------------
176
177
  # Compute and save workflow runtime metrics
177
178
  # ------------------------------------------------------------
@@ -275,7 +276,7 @@ class ProfilerRunner:
275
276
  logger.info("Fitted model for forecasting.")
276
277
  except Exception as e:
277
278
  logger.exception("Fitting model failed. %s", e, exc_info=True)
278
- return
279
+ return ProfilerResults()
279
280
 
280
281
  os.makedirs(self.output_dir, exist_ok=True)
281
282
 
@@ -285,6 +286,8 @@ class ProfilerRunner:
285
286
 
286
287
  logger.info("Saved fitted model to disk.")
287
288
 
289
+ return ProfilerResults(workflow_runtime_metrics=workflow_runtimes_results)
290
+
288
291
  # -------------------------------------------------------------------
289
292
  # Confidence Intervals / Metrics
290
293
  # -------------------------------------------------------------------
@@ -391,7 +394,8 @@ class ProfilerRunner:
391
394
 
392
395
  def _compute_confidence_intervals(self, data: list[float], metric_name: str) -> InferenceMetricsModel:
393
396
  """
394
- Helper to compute 90, 95, 99% confidence intervals for the mean of a dataset.
397
+ Helper to compute 90, 95, 99% confidence intervals for the mean of a dataset
398
+ **and** the empirical 90th/95th/99th percentiles (p90/p95/p99) of its samples.
395
399
  Uses a z-score from the normal approximation for large samples.
396
400
 
397
401
  Returns a dict like::
@@ -409,11 +413,16 @@ class ProfilerRunner:
409
413
  n = len(data)
410
414
  mean_val = statistics.mean(data)
411
415
  if n <= 1:
412
- return InferenceMetricsModel(n=n,
413
- mean=mean_val,
414
- ninetieth_interval=(mean_val, mean_val),
415
- ninety_fifth_interval=(mean_val, mean_val),
416
- ninety_ninth_interval=(mean_val, mean_val))
416
+ return InferenceMetricsModel(
417
+ n=n,
418
+ mean=mean_val,
419
+ ninetieth_interval=(mean_val, mean_val),
420
+ ninety_fifth_interval=(mean_val, mean_val),
421
+ ninety_ninth_interval=(mean_val, mean_val),
422
+ p90=mean_val,
423
+ p95=mean_val,
424
+ p99=mean_val,
425
+ )
417
426
 
418
427
  stdev_val = statistics.pstdev(data) # population stdev or use stdev for sample
419
428
  # standard error
@@ -430,4 +439,32 @@ class ProfilerRunner:
430
439
  # Optionally, store more info
431
440
  intervals["n"] = n
432
441
  intervals["mean"] = mean_val
442
+
443
+ # ------------------------------------------------------------------
444
+ # Percentiles
445
+ # ------------------------------------------------------------------
446
+ sorted_data = sorted(data)
447
+
448
+ def _percentile(arr: list[float], pct: float) -> float:
449
+ """
450
+ Linear interpolation between closest ranks.
451
+ pct is given from 0-100 (e.g. 90 for p90).
452
+ """
453
+ if not arr:
454
+ return 0.0
455
+ k = (len(arr) - 1) * (pct / 100.0)
456
+ f = math.floor(k)
457
+ c = math.ceil(k)
458
+ if f == c:
459
+ return arr[int(k)]
460
+ return arr[f] + (arr[c] - arr[f]) * (k - f)
461
+
462
+ p90_val = _percentile(sorted_data, 90)
463
+ p95_val = _percentile(sorted_data, 95)
464
+ p99_val = _percentile(sorted_data, 99)
465
+
466
+ intervals["p90"] = p90_val
467
+ intervals["p95"] = p95_val
468
+ intervals["p99"] = p99_val
469
+
433
470
  return InferenceMetricsModel(**intervals)
@@ -68,6 +68,16 @@ def model_from_mcp_schema(name: str, mcp_input_schema: dict) -> type[BaseModel]:
68
68
  else:
69
69
  item_type = _type_map.get(item_properties.get("type", "string"), Any)
70
70
  field_type = list[item_type]
71
+ elif isinstance(json_type, list):
72
+ field_type = None
73
+ for t in json_type:
74
+ mapped = _type_map.get(t, Any)
75
+ field_type = mapped if field_type is None else field_type | mapped
76
+
77
+ return field_type, Field(
78
+ default=field_properties.get("default", None if "null" in json_type else ...),
79
+ description=field_properties.get("description", "")
80
+ )
71
81
  else:
72
82
  field_type = _type_map.get(json_type, Any)
73
83
 
aiq/tool/mcp/mcp_tool.py CHANGED
@@ -75,7 +75,8 @@ async def mcp_tool(config: MCPToolConfig, builder: Builder): # pylint: disable=
75
75
  return await tool.acall(args)
76
76
 
77
77
  _ = tool.input_schema.model_validate(kwargs)
78
- return await tool.acall(kwargs)
78
+ filtered_kwargs = {k: v for k, v in kwargs.items() if v is not None}
79
+ return await tool.acall(filtered_kwargs)
79
80
  except Exception as e:
80
81
  if config.return_exception:
81
82
  if tool_input:
@@ -1,6 +1,6 @@
1
1
  Metadata-Version: 2.4
2
2
  Name: aiqtoolkit
3
- Version: 1.2.0a20250626
3
+ Version: 1.2.0a20250628
4
4
  Summary: NVIDIA Agent Intelligence toolkit
5
5
  Author: NVIDIA Corporation
6
6
  Maintainer: NVIDIA Corporation
@@ -79,10 +79,10 @@ aiq/data_models/common.py,sha256=G63rUXvDAtK6p1SrRyH0VlHGqrDgCZVVjbnzgGSl2Ic,421
79
79
  aiq/data_models/component.py,sha256=x6jm1Fhn1k1hGu-5AjM0ywuyvs6ztaZfapD8bLUXSqc,1469
80
80
  aiq/data_models/component_ref.py,sha256=GyyIf4k80aUIn6LV9r84m5imbiVhpdaY7uKMMpYpbzU,3872
81
81
  aiq/data_models/config.py,sha256=ERLjZY0iqexZ-gSXsCSN1UqgNeiwkEjWdYJEdKqeYTY,14116
82
- aiq/data_models/dataset_handler.py,sha256=SifWhFHtxTMEjrXaXOYQgBOSKfWOzkc6OtOoPJ39pD4,3978
82
+ aiq/data_models/dataset_handler.py,sha256=liMB3xRohkr4VTMmNWPvWi9qhbhlJQfQK36g5Rknweo,4027
83
83
  aiq/data_models/discovery_metadata.py,sha256=OcITQc5VeML4bTHurrsMNiK_oB3z7wudMxcyN7LI8pY,12785
84
84
  aiq/data_models/embedder.py,sha256=0v917IiohVA_7zdF7hoO_zQcmNe4hQEFhh4fxRiYBbk,940
85
- aiq/data_models/evaluate.py,sha256=tLL-AidxW6-VnEpIDYqGpvIdcNXnDee7Ooze9_bzXeY,4557
85
+ aiq/data_models/evaluate.py,sha256=WBeABZsIa6W04MPj24SRu4s-ty2PkJ7_4SLojXmj5Pk,4704
86
86
  aiq/data_models/evaluator.py,sha256=bd2njsyQB2t6ClJ66gJiCjYHsQpWZwPD7rsU0J109TI,939
87
87
  aiq/data_models/front_end.py,sha256=z8k6lSWjt1vMOYFbjWQxodpwAqPeuGS0hRBjsriDW2s,932
88
88
  aiq/data_models/function.py,sha256=M_duXVXL5MvYe0WVLvqEgEzXs0UAYNSMfy9ZTpxuKPA,1013
@@ -93,7 +93,7 @@ aiq/data_models/invocation_node.py,sha256=nDRylgzBfJduGA-lme9xN4P6BdOYj0L6ytLHnT
93
93
  aiq/data_models/llm.py,sha256=McbDdUUtWfp9WCdMMJA2xh7mvlmyNdGDCH8P_7l2iKU,920
94
94
  aiq/data_models/logging.py,sha256=1QtVjIQ99PgMYUuzw4h1FAoPRteZY7uf3oFTqV3ONgA,940
95
95
  aiq/data_models/memory.py,sha256=RYwmE8I0PJ-h1GD-689abgt5DDi7JlWANeXpOsvWT9E,932
96
- aiq/data_models/profiler.py,sha256=99KBOnFDJWtmTUIscivk-hHYvbNax-QPe7mQwTCgu88,1750
96
+ aiq/data_models/profiler.py,sha256=z3IlEhj-veB4Yz85271bTkScSUkVwK50tR3dwlDRgcE,1781
97
97
  aiq/data_models/registry_handler.py,sha256=g1rFaz4uSydMJn7qpdX-DNHJd_rNf8tXYN49dLDYHPo,968
98
98
  aiq/data_models/retriever.py,sha256=UOfss4sru5ku5E8YZYN5qz4MVbFi2VwvpNUPVp9hsnQ,1202
99
99
  aiq/data_models/step_adaptor.py,sha256=h7nVAwdgbuHd1e1-SR5jY9nkDMBDGqzTzrl-4lBQX7o,2615
@@ -107,15 +107,16 @@ aiq/embedder/openai_embedder.py,sha256=5FO3xsyNvEmbLBsZb3xsCpbN1Soxio4yf4b5gTPVx
107
107
  aiq/embedder/register.py,sha256=3MTZrfNQKp6AZTbfaA-PpTnyXiMyu-8HH9JnDCC0v9o,978
108
108
  aiq/eval/__init__.py,sha256=Xs1JQ16L9btwreh4pdGKwskffAw1YFO48jKrU4ib_7c,685
109
109
  aiq/eval/config.py,sha256=IlOr2o618kbkXP0G1F-AklZfsKYVos9UB4Dvlxf66xk,1431
110
- aiq/eval/evaluate.py,sha256=VdVdB_CV842gIV4diHciJ1qrof5_N3H8I16WwracCsQ,17940
110
+ aiq/eval/evaluate.py,sha256=zv2AQfcf-aaQO0Tx7VV5Qc7KZ6DMniKPjXG-BUrtlMA,20983
111
111
  aiq/eval/intermediate_step_adapter.py,sha256=4cSsGgFBvNjXnclk5FvZnQaFEdeulp7VEdRWKLcREAQ,4498
112
112
  aiq/eval/register.py,sha256=QOHJqA2CQixeWMC9InyKbzXo1jByvrntD_m9-2Mvg9k,1076
113
113
  aiq/eval/remote_workflow.py,sha256=Fb7Z6gdP2L_gqyWB--AEWfcXe9xPpQ_hPsf9lmqGXjI,5524
114
114
  aiq/eval/runtime_event_subscriber.py,sha256=2VM8MqmPc_EWPxxrDDR9naiioZirkJUfGwzbXQqbdZA,1906
115
+ aiq/eval/usage_stats.py,sha256=izIIoHElo3mvysq_Z3hw9YPcxhR6G_zaIF4CzyPdJR4,1135
115
116
  aiq/eval/dataset_handler/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
116
117
  aiq/eval/dataset_handler/dataset_downloader.py,sha256=Zvfbd-fPOhB9n8ZiCBaBKW0y-5v97mQAy3dkBL0OFZ0,4553
117
118
  aiq/eval/dataset_handler/dataset_filter.py,sha256=mop6wa4P_QtQ5QkfXv-hVBm3EMerfNECSTJGGDB1YWE,2115
118
- aiq/eval/dataset_handler/dataset_handler.py,sha256=z4trKYPnqSrLvsKctU9d5WrQW7ddbZZx0zOrYVLqbAA,7847
119
+ aiq/eval/dataset_handler/dataset_handler.py,sha256=sJhjZrasAZiDI_B2GM3czb6HTY0xSfqHV8386jmjCjI,8194
119
120
  aiq/eval/evaluator/__init__.py,sha256=GUJrgGtpvyMUCjUBvR3faAdv-tZzbU9W-izgx9aMEQg,680
120
121
  aiq/eval/evaluator/base_evaluator.py,sha256=5kqOcTYNecnh9us_XvV58pj5tZI82NGkVN4tg9-R_ZE,3040
121
122
  aiq/eval/evaluator/evaluator_model.py,sha256=5cxe3mqznlNGzv29v_VseYU7OzoT1eTf7hgSPQxytsM,1440
@@ -132,9 +133,9 @@ aiq/eval/tunable_rag_evaluator/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5
132
133
  aiq/eval/tunable_rag_evaluator/evaluate.py,sha256=f4jfn9VVLmkOg631TQr2wy7hPwGMJMsQa4kmXsu0-Uc,13069
133
134
  aiq/eval/tunable_rag_evaluator/register.py,sha256=q4p2rFyMzWmaINJc961ZV4jzIlAN4GfWsoImHo0ovsY,2558
134
135
  aiq/eval/utils/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
135
- aiq/eval/utils/output_uploader.py,sha256=SaQbZPkw-Q0H7t5yG60Kh-p1cflR7gPklVkilC4uPbU,5141
136
+ aiq/eval/utils/output_uploader.py,sha256=lkV63Jr97YuG1vr04uOZDvs9e1pGP4FbJykRxS2d7a4,5579
136
137
  aiq/eval/utils/tqdm_position_registry.py,sha256=9CtpCk1wtYCSyieHPaSp8nlZu6EcNUOaUz2RTqfekrA,1286
137
- aiq/eval/utils/weave_eval.py,sha256=yIdlp4UdCPgwFYJNJon5eZD1d99E-6dcmfVg6B-4RKE,5076
138
+ aiq/eval/utils/weave_eval.py,sha256=l9NTkgLTb30wBnfiHI_yefPFVNyIBrNdbPNq2o58HO4,7088
138
139
  aiq/front_ends/__init__.py,sha256=Xs1JQ16L9btwreh4pdGKwskffAw1YFO48jKrU4ib_7c,685
139
140
  aiq/front_ends/register.py,sha256=OKv1xi-g8WHtUMuIPhwjG6wOYqaGDD-Q9vDtKtT9d1Y,889
140
141
  aiq/front_ends/console/__init__.py,sha256=Xs1JQ16L9btwreh4pdGKwskffAw1YFO48jKrU4ib_7c,685
@@ -181,9 +182,10 @@ aiq/observability/register.py,sha256=mejMBVr3dHHfShIiyn1fIbA0Gb6z9Ayg8WRMgB0wf5E
181
182
  aiq/plugins/.namespace,sha256=Gace0pOC3ETEJf-TBVuNw0TQV6J_KtOPpEiSzMH-odo,215
182
183
  aiq/profiler/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
183
184
  aiq/profiler/data_frame_row.py,sha256=vudqk1ZzZtlZln2Ir43mPl3nwNc0pQlhwbtdY9oSKtI,1755
184
- aiq/profiler/inference_metrics_model.py,sha256=e_M0ApsyDgPMrOIOnm1beHtNeHKwOh5CAxu-OiJaEzQ,1241
185
+ aiq/profiler/data_models.py,sha256=I8k1zjg8KaLjjpc1SxMgaEC2h_jW2bv1cENl1BsTcG8,899
186
+ aiq/profiler/inference_metrics_model.py,sha256=Thz3OHBDzGrpPYaOm8m8_pNeEA_q0yDlUUDHFkQ3U90,1481
185
187
  aiq/profiler/intermediate_property_adapter.py,sha256=XZ_A8f2S5M-EJSkErY6I750Y8HAZPdXsr6Cpb1wXlNM,3537
186
- aiq/profiler/profile_runner.py,sha256=ltADgYhZvcsYtgYahFXW6FtTLm9DSepJUE2U2w0ZU-A,20855
188
+ aiq/profiler/profile_runner.py,sha256=Xyh0wl2aeRJtRBzvvkMYkFvqUptB7XUfYJ7jdbBCPuE,22102
187
189
  aiq/profiler/utils.py,sha256=hNh_JfxXDrACIp4usXtlriTfVuYUkk3Pv-x74K34MQg,8180
188
190
  aiq/profiler/callbacks/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
189
191
  aiq/profiler/callbacks/agno_callback_handler.py,sha256=aDAUY6GDIUtly6KowXXKUqLc7NbE6khg1aXT1AritaA,14930
@@ -279,8 +281,8 @@ aiq/tool/github_tools/get_github_issue.py,sha256=vwLNkNOszLlymkQju0cR8BNvfdH4Enm
279
281
  aiq/tool/github_tools/get_github_pr.py,sha256=b7eCOqrVoejGjRwmUVdU45uF07ihbY8lRacMYOSgMrY,9716
280
282
  aiq/tool/github_tools/update_github_issue.py,sha256=TUElxUuzjZr_QldL_48RcqSx0A9b23NB_lA82QwFjkM,4103
281
283
  aiq/tool/mcp/__init__.py,sha256=GUJrgGtpvyMUCjUBvR3faAdv-tZzbU9W-izgx9aMEQg,680
282
- aiq/tool/mcp/mcp_client.py,sha256=lYbf669ATqGKkL0jjd76r0aAtAFnWeruWw-lOPsmYu8,8103
283
- aiq/tool/mcp/mcp_tool.py,sha256=rQQcaCT-GHQcDmG5weX-2Y-HxBPX-0cC73LjL1u0FUU,4009
284
+ aiq/tool/mcp/mcp_client.py,sha256=bTZGh_Y3mRJA9BGbbmfVNRddTIcK251jKlPL7kAjFK0,8553
285
+ aiq/tool/mcp/mcp_tool.py,sha256=0L2Zj1CBwrvv5P9A8-lj_Ao_oBaC6aYRJXw9q5Et4uo,4099
284
286
  aiq/tool/memory_tools/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
285
287
  aiq/tool/memory_tools/add_memory_tool.py,sha256=9EjB3DpYhxwasz7o3O8Rq__Ys5986fciv44ahC6mVCo,3349
286
288
  aiq/tool/memory_tools/delete_memory_tool.py,sha256=wdB_I8y-1D1OpNtBi6ZOg36vvNkbaxp-yvdqFMc2Suk,2532
@@ -310,10 +312,10 @@ aiq/utils/reactive/base/observer_base.py,sha256=UAlyAY_ky4q2t0P81RVFo2Bs_R7z5Nde
310
312
  aiq/utils/reactive/base/subject_base.py,sha256=Ed-AC6P7cT3qkW1EXjzbd5M9WpVoeN_9KCe3OM3FLU4,2521
311
313
  aiq/utils/settings/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
312
314
  aiq/utils/settings/global_settings.py,sha256=U9TCLdoZsKq5qOVGjREipGVv9e-FlStzqy5zv82_VYk,7454
313
- aiqtoolkit-1.2.0a20250626.dist-info/licenses/LICENSE-3rd-party.txt,sha256=8o7aySJa9CBvFshPcsRdJbczzdNyDGJ8b0J67WRUQ2k,183936
314
- aiqtoolkit-1.2.0a20250626.dist-info/licenses/LICENSE.md,sha256=QwcOLU5TJoTeUhuIXzhdCEEDDvorGiC6-3YTOl4TecE,11356
315
- aiqtoolkit-1.2.0a20250626.dist-info/METADATA,sha256=37IlijO2OTc7Oi5tW3vdnnI9OjBFUM5xzjwKd2RBYtU,20274
316
- aiqtoolkit-1.2.0a20250626.dist-info/WHEEL,sha256=_zCd3N1l69ArxyTb8rzEoP9TpbYXkqRFSNOD5OuxnTs,91
317
- aiqtoolkit-1.2.0a20250626.dist-info/entry_points.txt,sha256=gRlPfR5g21t328WNEQ4CcEz80S1sJNS8A7rMDYnzl4A,452
318
- aiqtoolkit-1.2.0a20250626.dist-info/top_level.txt,sha256=fo7AzYcNhZ_tRWrhGumtxwnxMew4xrT1iwouDy_f0Kc,4
319
- aiqtoolkit-1.2.0a20250626.dist-info/RECORD,,
315
+ aiqtoolkit-1.2.0a20250628.dist-info/licenses/LICENSE-3rd-party.txt,sha256=8o7aySJa9CBvFshPcsRdJbczzdNyDGJ8b0J67WRUQ2k,183936
316
+ aiqtoolkit-1.2.0a20250628.dist-info/licenses/LICENSE.md,sha256=QwcOLU5TJoTeUhuIXzhdCEEDDvorGiC6-3YTOl4TecE,11356
317
+ aiqtoolkit-1.2.0a20250628.dist-info/METADATA,sha256=kGslYo0xYh5ERzp0dvetOiCsiTWFe__dUyWxvxWkIiM,20274
318
+ aiqtoolkit-1.2.0a20250628.dist-info/WHEEL,sha256=_zCd3N1l69ArxyTb8rzEoP9TpbYXkqRFSNOD5OuxnTs,91
319
+ aiqtoolkit-1.2.0a20250628.dist-info/entry_points.txt,sha256=gRlPfR5g21t328WNEQ4CcEz80S1sJNS8A7rMDYnzl4A,452
320
+ aiqtoolkit-1.2.0a20250628.dist-info/top_level.txt,sha256=fo7AzYcNhZ_tRWrhGumtxwnxMew4xrT1iwouDy_f0Kc,4
321
+ aiqtoolkit-1.2.0a20250628.dist-info/RECORD,,