aiqtoolkit 1.2.0a20250616__py3-none-any.whl → 1.2.0a20250617__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release: this version of aiqtoolkit might be problematic.

aiq/builder/function.py CHANGED
@@ -48,7 +48,8 @@ class Function(FunctionBase[InputT, StreamingOutputT, SingleOutputT], ABC):
                  input_schema: type[BaseModel] | None = None,
                  streaming_output_schema: type[BaseModel] | type[None] | None = None,
                  single_output_schema: type[BaseModel] | type[None] | None = None,
-                 converters: list[Callable[[typing.Any], typing.Any]] | None = None):
+                 converters: list[Callable[[typing.Any], typing.Any]] | None = None,
+                 instance_name: str | None = None):
 
         super().__init__(input_schema=input_schema,
                          streaming_output_schema=streaming_output_schema,
@@ -57,6 +58,7 @@ class Function(FunctionBase[InputT, StreamingOutputT, SingleOutputT], ABC):
 
         self.config = config
         self.description = description
+        self.instance_name = instance_name or config.type
         self._context = AIQContext.get()
 
     def convert(self, value: typing.Any, to_type: type[_T]) -> _T:
@@ -110,7 +112,7 @@ class Function(FunctionBase[InputT, StreamingOutputT, SingleOutputT], ABC):
         The output of the function optionally converted to the specified type.
         """
 
-        with self._context.push_active_function(self.config.type,
+        with self._context.push_active_function(self.instance_name,
                                                 input_data=value) as manager:  # Set the current invocation context
             try:
                 converted_input: InputT = self._convert_input(value)  # type: ignore
@@ -254,17 +256,17 @@ class Function(FunctionBase[InputT, StreamingOutputT, SingleOutputT], ABC):
 
 class LambdaFunction(Function[InputT, StreamingOutputT, SingleOutputT]):
 
-    def __init__(self, *, config: FunctionBaseConfig, info: FunctionInfo):
+    def __init__(self, *, config: FunctionBaseConfig, info: FunctionInfo, instance_name: str | None = None):
 
         super().__init__(config=config,
                          description=info.description,
                          input_schema=info.input_schema,
                          streaming_output_schema=info.stream_output_schema,
                          single_output_schema=info.single_output_schema,
-                         converters=info.converters)
+                         converters=info.converters,
+                         instance_name=instance_name)
 
         self._info = info
-
         self._ainvoke_fn: _InvokeFnT = info.single_fn
         self._astream_fn: _StreamFnT = info.stream_fn
 
@@ -284,8 +286,10 @@ class LambdaFunction(Function[InputT, StreamingOutputT, SingleOutputT]):
             yield x
 
     @staticmethod
-    def from_info(*, config: FunctionBaseConfig,
-                  info: FunctionInfo) -> 'LambdaFunction[InputT, StreamingOutputT, SingleOutputT]':
+    def from_info(*,
+                  config: FunctionBaseConfig,
+                  info: FunctionInfo,
+                  instance_name: str | None = None) -> 'LambdaFunction[InputT, StreamingOutputT, SingleOutputT]':
 
         input_type: type = info.input_type
         streaming_output_type = info.stream_output_type
@@ -294,4 +298,4 @@ class LambdaFunction(Function[InputT, StreamingOutputT, SingleOutputT]):
         class FunctionImpl(LambdaFunction[input_type, streaming_output_type, single_output_type]):
             pass
 
-        return FunctionImpl(config=config, info=info)
+        return FunctionImpl(config=config, info=info, instance_name=instance_name)
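
Taken together, the function.py changes let a function be tracked under its workflow-level instance name rather than its config `type`, falling back to `config.type` when no name is supplied. A minimal sketch of the new keyword; `MyFnConfig` and `my_fn_info` are hypothetical stand-ins for a real `FunctionBaseConfig` and `FunctionInfo`, and only the `from_info` signature and the `instance_name or config.type` fallback are taken from the diff above:

# Illustrative only: MyFnConfig / my_fn_info are assumed to be defined elsewhere.
fn = LambdaFunction.from_info(config=MyFnConfig(), info=my_fn_info, instance_name="my_tool_a")
assert fn.instance_name == "my_tool_a"                 # explicit workflow name wins

fn_default = LambdaFunction.from_info(config=MyFnConfig(), info=my_fn_info)
assert fn_default.instance_name == MyFnConfig().type   # falls back to config.type
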
aiq/builder/workflow_builder.py CHANGED
@@ -333,7 +333,7 @@ class WorkflowBuilder(Builder, AbstractAsyncContextManager):
 
         if (isinstance(build_result, FunctionInfo)):
             # Create the function object
-            build_result = LambdaFunction.from_info(config=config, info=build_result)
+            build_result = LambdaFunction.from_info(config=config, info=build_result, instance_name=name)
 
         if (not isinstance(build_result, Function)):
             raise ValueError("Expected a function, FunctionInfo object, or FunctionBase object to be "
aiq/data_models/evaluate.py CHANGED
@@ -14,6 +14,7 @@
 # limitations under the License.
 
 import typing
+from enum import Enum
 from pathlib import Path
 
 from pydantic import BaseModel
@@ -28,6 +29,12 @@ from aiq.data_models.intermediate_step import IntermediateStepType
 from aiq.data_models.profiler import ProfilerConfig
 
 
+class JobEvictionPolicy(str, Enum):
+    """Policy for evicting old jobs when max_jobs is exceeded."""
+    TIME_CREATED = "time_created"
+    TIME_MODIFIED = "time_modified"
+
+
 class EvalCustomScriptConfig(BaseModel):
     # Path to the script to run
     script: Path
@@ -35,6 +42,16 @@ class EvalCustomScriptConfig(BaseModel):
     kwargs: dict[str, str] = {}
 
 
+class JobManagementConfig(BaseModel):
+    # Whether to append a unique job ID to the output directory for each run
+    append_job_id_to_output_dir: bool = False
+    # Maximum number of jobs to keep in the output directory. Oldest jobs will be evicted.
+    # A value of 0 means no limit.
+    max_jobs: int = 0
+    # Policy for evicting old jobs. Defaults to using time_created.
+    eviction_policy: JobEvictionPolicy = JobEvictionPolicy.TIME_CREATED
+
+
 class EvalOutputConfig(BaseModel):
     # Output directory for the workflow and evaluation results
     dir: Path = Path("/tmp/aiq/examples/default/")
@@ -46,6 +63,8 @@ class EvalOutputConfig(BaseModel):
     s3: EvalS3Config | None = None
     # Whether to cleanup the output directory before running the workflow
     cleanup: bool = True
+    # Job management configuration (job id, eviction, etc.)
+    job_management: JobManagementConfig = JobManagementConfig()
     # Filter for the workflow output steps
     workflow_output_step_filter: list[IntermediateStepType] | None = None
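
The new model can be exercised directly; a minimal sketch assuming the module path inferred for this file (`aiq.data_models.evaluate`), with field names taken from the diff above. In practice the object would be supplied via the `job_management` field of `EvalOutputConfig`:

from aiq.data_models.evaluate import JobEvictionPolicy, JobManagementConfig

# Keep at most five job directories per output dir, evicting by last-modified time.
job_mgmt = JobManagementConfig(append_job_id_to_output_dir=True,
                               max_jobs=5,
                               eviction_policy=JobEvictionPolicy.TIME_MODIFIED)
print(job_mgmt.eviction_policy)  # JobEvictionPolicy.TIME_MODIFIED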
 
aiq/eval/evaluate.py CHANGED
@@ -18,11 +18,13 @@ import logging
 import shutil
 from pathlib import Path
 from typing import Any
+from uuid import uuid4
 
 from pydantic import BaseModel
 from tqdm import tqdm
 
 from aiq.data_models.evaluate import EvalConfig
+from aiq.data_models.evaluate import JobEvictionPolicy
 from aiq.eval.config import EvaluationRunConfig
 from aiq.eval.config import EvaluationRunOutput
 from aiq.eval.dataset_handler.dataset_handler import DatasetHandler
@@ -178,10 +180,60 @@ class EvaluationRun:  # pylint: disable=too-many-public-methods
 
     def cleanup_output_directory(self):
         '''Remove contents of the output directory if it exists'''
-        if self.eval_config.general.output and self.eval_config.general.output.dir and \
-                self.eval_config.general.output.dir.exists():
-            logger.info("Cleaning up output directory %s", self.eval_config.general.output.dir)
-            shutil.rmtree(self.eval_config.general.output.dir)
+        output_config = self.eval_config.general.output
+        output_dir = output_config.dir
+
+        if not (output_config and output_dir.exists()):
+            return
+
+        # If cleanup is true, remove the entire directory and we are done
+        if output_config.cleanup:
+            logger.info("Cleaning up entire output directory: %s", output_config.dir)
+            shutil.rmtree(output_config.dir)
+            return
+
+        if output_config.job_management.max_jobs == 0:
+            # No eviction policy
+            return
+
+        base_dir = output_dir / "jobs"
+        if not base_dir.exists():
+            return
+
+        # Get all subdirectories, which represent individual job runs
+        job_dirs = [d for d in base_dir.iterdir() if d.is_dir()]
+        if len(job_dirs) <= output_config.job_management.max_jobs:
+            return
+
+        # Determine sort key based on eviction_policy, defaulting to creation time
+        if output_config.job_management.eviction_policy == JobEvictionPolicy.TIME_MODIFIED:
+
+            def sort_key(x):
+                return x.stat().st_mtime
+
+            logger.info("Using last modified time for job eviction policy.")
+        else:
+
+            def sort_key(x):
+                return x.stat().st_ctime
+
+            logger.info("Using creation time for job eviction policy.")
+
+        # Sort directories (oldest first)
+        job_dirs.sort(key=sort_key)
+        num_to_delete = len(job_dirs) - output_config.job_management.max_jobs
+
+        logger.info("Found %d jobs, exceeding limit of %d. Removing %d oldest jobs.",
+                    len(job_dirs),
+                    output_config.job_management.max_jobs,
+                    num_to_delete)
+
+        for dir_to_delete in job_dirs[:num_to_delete]:
+            try:
+                logger.info("Deleting old job directory: %s", dir_to_delete)
+                shutil.rmtree(dir_to_delete)
+            except Exception as e:
+                logger.exception("Failed to delete old job directory: %s: %s", dir_to_delete, e, exc_info=True)
 
     def write_output(self, dataset_handler: DatasetHandler):
         workflow_output_file = self.eval_config.general.output_dir / "workflow_output.json"
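
The eviction logic above reduces to: collect the subdirectories of `<output_dir>/jobs`, sort them by the chosen timestamp, and delete all but the newest `max_jobs`. A standalone sketch of that selection step (illustrative only, not toolkit code):

from pathlib import Path

def dirs_to_evict(jobs_root: Path, max_jobs: int, use_mtime: bool = False) -> list[Path]:
    """Return the oldest job directories beyond the max_jobs limit (mirrors the policy above)."""
    job_dirs = [d for d in jobs_root.iterdir() if d.is_dir()]
    if max_jobs == 0 or len(job_dirs) <= max_jobs:
        return []
    key = (lambda d: d.stat().st_mtime) if use_mtime else (lambda d: d.stat().st_ctime)
    job_dirs.sort(key=key)                       # oldest first
    return job_dirs[:len(job_dirs) - max_jobs]   # everything past the limit gets evicted
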
@@ -272,9 +324,15 @@ class EvaluationRun:  # pylint: disable=too-many-public-methods
         logger.debug("Loaded evaluation configuration: %s", self.eval_config)
 
         # Cleanup the output directory
-        if self.eval_config.general.output and self.eval_config.general.output.cleanup:
+        if self.eval_config.general.output:
             self.cleanup_output_directory()
 
+        # Generate a job_id if append_job_id_to_output_dir is enabled and no job_id provided
+        if (self.eval_config.general.output
+                and self.eval_config.general.output.job_management.append_job_id_to_output_dir and not job_id):
+            job_id = "job_" + str(uuid4())
+            logger.info("Generated job ID for output directory: %s", job_id)
+
         # If a job id is provided keep the data per-job
         if job_id:
             self.eval_config.general.output_dir = self.eval_config.general.output_dir / f"jobs/{job_id}"
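
With `append_job_id_to_output_dir` enabled, each run therefore lands in its own per-job subdirectory. A quick illustration of the resulting path, using the default output directory from the data-model diff above:

from pathlib import Path
from uuid import uuid4

output_dir = Path("/tmp/aiq/examples/default/")
job_id = "job_" + str(uuid4())        # mirrors the generated ID above
print(output_dir / f"jobs/{job_id}")  # e.g. /tmp/aiq/examples/default/jobs/job_1f0c... (example value)
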
aiq/eval/tunable_rag_evaluator/evaluate.py CHANGED
@@ -25,11 +25,9 @@ from langchain_core.language_models import BaseChatModel
 from langchain_core.runnables import RunnableLambda
 from tqdm import tqdm
 
-from aiq.eval.evaluator.evaluator_model import EvalInput
+from aiq.eval.evaluator.base_evaluator import BaseEvaluator
 from aiq.eval.evaluator.evaluator_model import EvalInputItem
-from aiq.eval.evaluator.evaluator_model import EvalOutput
 from aiq.eval.evaluator.evaluator_model import EvalOutputItem
-from aiq.eval.utils.tqdm_position_registry import TqdmPositionRegistry
 
 logger = logging.getLogger(__name__)
 
@@ -96,7 +94,7 @@ def runnable_with_retries(original_fn: Callable, llm_retry_control_params: dict
     )
 
 
-class TunableRagEvaluator:
+class TunableRagEvaluator(BaseEvaluator):
     '''Tunable RAG evaluator class with customizable LLM prompt for scoring.'''
 
     def __init__(self,
@@ -106,187 +104,142 @@ class TunableRagEvaluator:
                  max_concurrency: int,
                  default_scoring: bool,
                  default_score_weights: dict):
+        super().__init__(max_concurrency=max_concurrency, tqdm_desc="Evaluating RAG")
         self.llm = llm
-        self.max_concurrency = max_concurrency
         self.judge_llm_prompt = judge_llm_prompt
         self.llm_retry_control_params = llm_retry_control_params
-        self.semaphore = asyncio.Semaphore(self.max_concurrency)
         self.default_scoring = default_scoring
         # Use user-provided weights if available; otherwise, set equal weights for each score
         self.default_score_weights = default_score_weights if default_score_weights else {
             "coverage": 1 / 3, "correctness": 1 / 3, "relevance": 1 / 3
         }
 
-    async def evaluate(self, eval_input: EvalInput) -> EvalOutput:
-        '''Evaluate function'''
-
-        async def process_item(item):
-            """Compute RAG evaluation for an individual item"""
-            question = item.input_obj
-            answer_description = item.expected_output_obj
-            generated_answer = item.output_obj
-
-            # Call judge LLM to generate score
-            score = 0.0
-
-            default_evaluation_schema = [
-                ResponseSchema(
-                    name="coverage_score",
-                    description=
-                    "Score for the coverage of all critical aspects mentioned in the expected answer. Ex. 0.5",
-                    type="float"),
-                ResponseSchema(
-                    name="correctness_score",
-                    description=
-                    "Score for the accuracy of the generated answer compared to the expected answer. Ex. 0.5",
-                    type="float"),
-                ResponseSchema(name="relevance_score",
-                               description="Score for the relevance of the generated answer to the question. Ex. 0.5",
-                               type="float"),
-                ResponseSchema(
-                    name="reasoning",
-                    description=
-                    "1-2 summarized sentences of reasoning for the scores. Ex. 'The generated answer covers all critical aspects mentioned in the expected answer, is correct, and is relevant to the question.'",
-                    type="string"),
-            ]
-
-            custom_evaluation_schema = [
-                ResponseSchema(name="score", description="Score for the generated answer. Ex. 0.5", type="float"),
-                ResponseSchema(
-                    name="reasoning",
-                    description=
-                    "1-2 sentence reasoning for the score. Ex. 'The generated answer is exactly the same as the description of the expected answer.'",
-                    type="string"),
-            ]
+    async def evaluate_item(self, item: EvalInputItem) -> EvalOutputItem:
+        """Compute RAG evaluation for an individual item and return EvalOutputItem"""
+        question = item.input_obj
+        answer_description = item.expected_output_obj
+        generated_answer = item.output_obj
+
+        # Call judge LLM to generate score
+        score = 0.0
+
+        default_evaluation_schema = [
+            ResponseSchema(
+                name="coverage_score",
+                description="Score for the coverage of all critical aspects mentioned in the expected answer. Ex. 0.5",
+                type="float"),
+            ResponseSchema(
+                name="correctness_score",
+                description="Score for the accuracy of the generated answer compared to the expected answer. Ex. 0.5",
+                type="float"),
+            ResponseSchema(name="relevance_score",
+                           description="Score for the relevance of the generated answer to the question. Ex. 0.5",
+                           type="float"),
+            ResponseSchema(
+                name="reasoning",
+                description=
+                "1-2 summarized sentences of reasoning for the scores. Ex. 'The generated answer covers all critical aspects mentioned in the expected answer, is correct, and is relevant to the question.'",
+                type="string"),
+        ]
 
-            if self.default_scoring:
-                evaluation_schema = default_evaluation_schema
-            else:
-                evaluation_schema = custom_evaluation_schema
+        custom_evaluation_schema = [
+            ResponseSchema(name="score", description="Score for the generated answer. Ex. 0.5", type="float"),
+            ResponseSchema(
+                name="reasoning",
+                description=
+                "1-2 sentence reasoning for the score. Ex. 'The generated answer is exactly the same as the description of the expected answer.'",
+                type="string"),
+        ]
 
-            llm_input_response_parser = StructuredOutputParser.from_response_schemas(evaluation_schema)
-            format_instructions = llm_input_response_parser.get_format_instructions()
+        if self.default_scoring:
+            evaluation_schema = default_evaluation_schema
+        else:
+            evaluation_schema = custom_evaluation_schema
 
-            eval_prompt = evaluation_prompt(judge_llm_prompt=self.judge_llm_prompt,
-                                            question=question,
-                                            answer_description=answer_description,
-                                            generated_answer=generated_answer,
-                                            format_instructions=format_instructions,
-                                            default_scoring=self.default_scoring)
+        llm_input_response_parser = StructuredOutputParser.from_response_schemas(evaluation_schema)
+        format_instructions = llm_input_response_parser.get_format_instructions()
 
-            messages = [
-                SystemMessage(content="You must respond only in JSON format."), HumanMessage(content=eval_prompt)
-            ]
+        eval_prompt = evaluation_prompt(judge_llm_prompt=self.judge_llm_prompt,
+                                        question=question,
+                                        answer_description=answer_description,
+                                        generated_answer=generated_answer,
+                                        format_instructions=format_instructions,
+                                        default_scoring=self.default_scoring)
 
-            response = await runnable_with_retries(self.llm.ainvoke, self.llm_retry_control_params).ainvoke(messages)
+        messages = [SystemMessage(content="You must respond only in JSON format."), HumanMessage(content=eval_prompt)]
 
-            # Initialize default values to handle service errors
-            coverage_score = 0.0
-            correctness_score = 0.0
-            relevance_score = 0.0
-            reasoning = "Error in evaluator from parsing judge LLM response."
+        response = await runnable_with_retries(self.llm.ainvoke, self.llm_retry_control_params).ainvoke(messages)
 
-            try:
-                parsed_response = llm_input_response_parser.parse(response.content)
-                if self.default_scoring:
-                    try:
-                        coverage_score = parsed_response["coverage_score"]
-                        correctness_score = parsed_response["correctness_score"]
-                        relevance_score = parsed_response["relevance_score"]
-                        reasoning = parsed_response["reasoning"]
-                    except KeyError as e:
-                        logger.error("Missing required keys in default scoring response: %s",
-                                     ", ".join(str(arg) for arg in e.args))
-                        reasoning = f"Error in evaluator from parsing judge LLM response. Missing required key(s): {', '.join(str(arg) for arg in e.args)}"
-
-                    coverage_weight = self.default_score_weights.get("coverage", 1 / 3)
-                    correctness_weight = self.default_score_weights.get("correctness", 1 / 3)
-                    relevance_weight = self.default_score_weights.get("relevance", 1 / 3)
-
-                    # Calculate score
-                    total_weight = coverage_weight + correctness_weight + relevance_weight
-                    coverage_weight = coverage_weight / total_weight
-                    correctness_weight = correctness_weight / total_weight
-                    relevance_weight = relevance_weight / total_weight
-
-                    if round(coverage_weight + correctness_weight + relevance_weight, 2) != 1:
-                        logger.warning("The sum of the default score weights is not 1. The weights will be normalized.")
-                        coverage_weight = coverage_weight / (coverage_weight + correctness_weight + relevance_weight)
-                        correctness_weight = correctness_weight / (coverage_weight + correctness_weight +
-                                                                   relevance_weight)
-                        relevance_weight = relevance_weight / (coverage_weight + correctness_weight + relevance_weight)
-
-                    score = (coverage_weight * coverage_score + correctness_weight * correctness_score +
-                             relevance_weight * relevance_score)
-
-                else:
-                    try:
-                        score = parsed_response["score"]
-                        reasoning = parsed_response["reasoning"]
-                    except KeyError as e:
-                        logger.error("Missing required keys in custom scoring response: %s",
-                                     ", ".join(str(arg) for arg in e.args))
-                        reasoning = f"Error in evaluator from parsing judge LLM response. Missing required key(s): {', '.join(str(arg) for arg in e.args)}"
-                        raise
-            except (KeyError, ValueError) as e:
-                logger.error("Error parsing judge LLM response: %s", e)
-                score = 0.0
-                reasoning = "Error in evaluator from parsing judge LLM response."
+        # Initialize default values to handle service errors
+        coverage_score = 0.0
+        correctness_score = 0.0
+        relevance_score = 0.0
+        reasoning = "Error in evaluator from parsing judge LLM response."
 
+        try:
+            parsed_response = llm_input_response_parser.parse(response.content)
             if self.default_scoring:
-                reasoning = {
-                    "question": question,
-                    "answer_description": answer_description,
-                    "generated_answer": generated_answer,
-                    "score_breakdown": {
-                        "coverage_score": coverage_score,
-                        "correctness_score": correctness_score,
-                        "relevance_score": relevance_score,
-                    },
-                    "reasoning": reasoning,
-                }
-            else:
-                reasoning = {
-                    "question": question,
-                    "answer_description": answer_description,
-                    "generated_answer": generated_answer,
-                    "reasoning": reasoning
-                }
-
-            return score, reasoning
-
-        async def wrapped_process(item: EvalInputItem) -> tuple[float, dict]:
-            """
-            Process an item asynchronously and update the progress bar.
-            Use the semaphore to limit the number of concurrent items.
-            """
-            async with self.semaphore:
-                result = await process_item(item)
-                # Update the progress bar
-                pbar.update(1)
-                return result
+                try:
+                    coverage_score = parsed_response["coverage_score"]
+                    correctness_score = parsed_response["correctness_score"]
+                    relevance_score = parsed_response["relevance_score"]
+                    reasoning = parsed_response["reasoning"]
+                except KeyError as e:
+                    logger.error("Missing required keys in default scoring response: %s",
+                                 ", ".join(str(arg) for arg in e.args))
+                    reasoning = f"Error in evaluator from parsing judge LLM response. Missing required key(s): {', '.join(str(arg) for arg in e.args)}"
+
+                coverage_weight = self.default_score_weights.get("coverage", 1 / 3)
+                correctness_weight = self.default_score_weights.get("correctness", 1 / 3)
+                relevance_weight = self.default_score_weights.get("relevance", 1 / 3)
+
+                # Calculate score
+                total_weight = coverage_weight + correctness_weight + relevance_weight
+                coverage_weight = coverage_weight / total_weight
+                correctness_weight = correctness_weight / total_weight
+                relevance_weight = relevance_weight / total_weight
+
+                if round(coverage_weight + correctness_weight + relevance_weight, 2) != 1:
+                    logger.warning("The sum of the default score weights is not 1. The weights will be normalized.")
+                    coverage_weight = coverage_weight / (coverage_weight + correctness_weight + relevance_weight)
+                    correctness_weight = correctness_weight / (coverage_weight + correctness_weight + relevance_weight)
+                    relevance_weight = relevance_weight / (coverage_weight + correctness_weight + relevance_weight)
+
+                score = (coverage_weight * coverage_score + correctness_weight * correctness_score +
+                         relevance_weight * relevance_score)
 
-        try:
-            # Claim a tqdm position to display the progress bar
-            tqdm_position = TqdmPositionRegistry.claim()
-            # Create a progress bar
-            pbar = tqdm(total=len(eval_input.eval_input_items), desc="Evaluating RAG", position=tqdm_position)
-            # Process items concurrently with a limit on concurrency
-            results = await asyncio.gather(*[wrapped_process(item) for item in eval_input.eval_input_items])
-        finally:
-            pbar.close()
-            TqdmPositionRegistry.release(tqdm_position)
-
-        # Extract scores and reasonings
-        sample_scores, sample_reasonings = zip(*results) if results else ([], [])
-
-        # Compute average score
-        avg_score = round(sum(sample_scores) / len(sample_scores), 2) if sample_scores else 0.0
-
-        # Construct EvalOutputItems
-        eval_output_items = [
-            EvalOutputItem(id=item.id, score=score, reasoning=reasoning)
-            for item, score, reasoning in zip(eval_input.eval_input_items, sample_scores, sample_reasonings)
-        ]
+            else:
+                try:
+                    score = parsed_response["score"]
+                    reasoning = parsed_response["reasoning"]
+                except KeyError as e:
+                    logger.error("Missing required keys in custom scoring response: %s",
+                                 ", ".join(str(arg) for arg in e.args))
+                    reasoning = f"Error in evaluator from parsing judge LLM response. Missing required key(s): {', '.join(str(arg) for arg in e.args)}"
+                    raise
+        except (KeyError, ValueError) as e:
+            logger.error("Error parsing judge LLM response: %s", e)
+            score = 0.0
+            reasoning = "Error in evaluator from parsing judge LLM response."
 
-        return EvalOutput(average_score=avg_score, eval_output_items=eval_output_items)
+        if self.default_scoring:
+            reasoning = {
+                "question": question,
+                "answer_description": answer_description,
+                "generated_answer": generated_answer,
+                "score_breakdown": {
+                    "coverage_score": coverage_score,
+                    "correctness_score": correctness_score,
+                    "relevance_score": relevance_score,
+                },
+                "reasoning": reasoning,
+            }
+        else:
+            reasoning = {
+                "question": question,
+                "answer_description": answer_description,
+                "generated_answer": generated_answer,
+                "reasoning": reasoning
+            }
+
+        return EvalOutputItem(id=item.id, score=score, reasoning=reasoning)
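
The evaluator now appears to delegate concurrency limiting and progress reporting to BaseEvaluator (constructed with max_concurrency and a tqdm description) and only implements per-item scoring in evaluate_item. In the default-scoring branch, the final score is a normalized weighted average of the three judge scores; a standalone sketch of that arithmetic (mirroring, not replacing, the code above):

def weighted_rag_score(scores: dict[str, float], weights: dict[str, float] | None = None) -> float:
    """Illustrative re-implementation of the default-scoring arithmetic shown in the diff."""
    weights = weights or {"coverage": 1 / 3, "correctness": 1 / 3, "relevance": 1 / 3}
    keys = ("coverage", "correctness", "relevance")
    total = sum(weights.get(k, 1 / 3) for k in keys)  # weights are normalized by their sum
    return sum((weights.get(k, 1 / 3) / total) * scores[k] for k in keys)

# Equal weights: (0.8 + 1.0 + 0.6) / 3 == 0.8
print(weighted_rag_score({"coverage": 0.8, "correctness": 1.0, "relevance": 0.6}))
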
aiq/profiler/inference_optimization/bottleneck_analysis/nested_stack_analysis.py CHANGED
@@ -69,13 +69,20 @@ def build_call_tree_for_example(example_df: pd.DataFrame) -> list[CallNode]:
             return "LLM"
         if evt.startswith("TOOL_"):
             return "TOOL"
+        if evt.startswith("FUNCTION_"):
+            return "FUNCTION"
+        if evt.startswith("SPAN_"):
+            return "FUNCTION"
         return None
 
     def get_op_name(row: pd.Series, op_type: str) -> str:
         if op_type == "LLM":
            return row.get("llm_name") or "unknown_llm"
+        if op_type == "FUNCTION":
+            return row.get("function_name") or "unknown_function"
         if op_type == "TOOL":
             return row.get("tool_name") or "unknown_tool"
+
         return "unknown_op"
 
     for _, row in example_df.iterrows():
@@ -309,6 +316,7 @@ def save_gantt_chart(all_nodes: list[CallNode], output_path: str) -> None:
     color_map = {
         "LLM": "tab:blue",
         "TOOL": "tab:green",
+        "FUNCTION": "tab:orange",
     }
     default_color = "tab:gray"
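
The profiler change widens the event classifier so FUNCTION_* and SPAN_* intermediate steps are bucketed as FUNCTION operations, named from the function_name column and drawn in orange on the Gantt chart. A minimal sketch of that mapping (illustrative, mirroring the helper above):

def classify_op_type(event_type: str) -> str | None:
    """Prefix-based classification as in build_call_tree_for_example above."""
    if event_type.startswith("LLM_"):
        return "LLM"
    if event_type.startswith("TOOL_"):
        return "TOOL"
    if event_type.startswith(("FUNCTION_", "SPAN_")):
        return "FUNCTION"
    return None

print(classify_op_type("FUNCTION_START"))  # FUNCTION
print(classify_op_type("SPAN_END"))        # FUNCTION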
 
aiqtoolkit-1.2.0a20250616.dist-info/METADATA → aiqtoolkit-1.2.0a20250617.dist-info/METADATA
@@ -1,6 +1,6 @@
 Metadata-Version: 2.4
 Name: aiqtoolkit
-Version: 1.2.0a20250616
+Version: 1.2.0a20250617
 Summary: NVIDIA Agent Intelligence toolkit
 Author: NVIDIA Corporation
 Maintainer: NVIDIA Corporation
aiqtoolkit-1.2.0a20250616.dist-info/RECORD → aiqtoolkit-1.2.0a20250617.dist-info/RECORD
@@ -25,7 +25,7 @@ aiq/builder/eval_builder.py,sha256=UnNgtQiDAUfT3yuwjZQVerenI09-4q0Cse9uwLjk3Fg,4
 aiq/builder/evaluator.py,sha256=O6Gu0cUwQkrPxPX29Vf_-RopgijxPnhy7mhg_j-9A84,1162
 aiq/builder/framework_enum.py,sha256=eYwHQifZ86dx-OTubVA3qhCLRqhB4ElMBYBGA0gYtic,885
 aiq/builder/front_end.py,sha256=Xhvfi4VcDh5EoCtLr6AlLQfbRm8_TyugUc_IRfirN6Y,2225
-aiq/builder/function.py,sha256=Sh4LKgC-gipsMkNexUY4mw-Br4dWZxq6AHv-als0-e0,11430
+aiq/builder/function.py,sha256=74mZuDemcgpuRAKfn_aSOz1Pqjem3x_9nR0Oh61Tai8,11727
 aiq/builder/function_base.py,sha256=AF5a56y-Nw9OpWsP8IFukUKM2FtP8758qYQW6EfObO0,13109
 aiq/builder/function_info.py,sha256=pGPIAL0tjVqLOJymIRB0boI9pzJGdXiPK3KiZvXQsqM,25266
 aiq/builder/intermediate_step_manager.py,sha256=aKjOK7Gk9XbKhKvRMQTylRGDFZJU7rwqSuiZYaPfwjA,7830
@@ -33,7 +33,7 @@ aiq/builder/llm.py,sha256=DcoYCyschsRjkW_yGsa_Ci7ELSpk5KRbi9778Dm_B9c,951
 aiq/builder/retriever.py,sha256=GM7L1T4NdNZKerFZiCfLcQOwsGoX0NRlF8my7SMq3l4,970
 aiq/builder/user_interaction_manager.py,sha256=OXr-RxWf1sEZjzQH_jt0nmqrLBtYLHGEZEcfDYYFV88,2913
 aiq/builder/workflow.py,sha256=UOjrXmu1sxWTxTjygszqYgK0gK65r_beLKUwOpxtXuc,5894
-aiq/builder/workflow_builder.py,sha256=w5XBzACKp4yQu8aUYTiO1xVX78uyfDQnuquqgRT30Hc,30392
+aiq/builder/workflow_builder.py,sha256=0bRcNdjMCIfY60oaMu4hvH-mldIcxEDkm_lKRCBywnU,30412
 aiq/cli/__init__.py,sha256=Xs1JQ16L9btwreh4pdGKwskffAw1YFO48jKrU4ib_7c,685
 aiq/cli/entrypoint.py,sha256=BJsCZgC2nVyAWj7tBXwW67OIteg833xAI54R-e9O8oc,4757
 aiq/cli/main.py,sha256=yVTX5-5-21OOfG8qAdcK3M1fCQUxdr3G37Mb5OldPQc,1772
@@ -82,7 +82,7 @@ aiq/data_models/config.py,sha256=ERLjZY0iqexZ-gSXsCSN1UqgNeiwkEjWdYJEdKqeYTY,141
 aiq/data_models/dataset_handler.py,sha256=SifWhFHtxTMEjrXaXOYQgBOSKfWOzkc6OtOoPJ39pD4,3978
 aiq/data_models/discovery_metadata.py,sha256=OcITQc5VeML4bTHurrsMNiK_oB3z7wudMxcyN7LI8pY,12785
 aiq/data_models/embedder.py,sha256=0v917IiohVA_7zdF7hoO_zQcmNe4hQEFhh4fxRiYBbk,940
-aiq/data_models/evaluate.py,sha256=Llu9_H840nbV_nAimcFQaTeK6oLRmGac9UKsoaLlL58,3786
+aiq/data_models/evaluate.py,sha256=tLL-AidxW6-VnEpIDYqGpvIdcNXnDee7Ooze9_bzXeY,4557
 aiq/data_models/evaluator.py,sha256=bd2njsyQB2t6ClJ66gJiCjYHsQpWZwPD7rsU0J109TI,939
 aiq/data_models/front_end.py,sha256=z8k6lSWjt1vMOYFbjWQxodpwAqPeuGS0hRBjsriDW2s,932
 aiq/data_models/function.py,sha256=M_duXVXL5MvYe0WVLvqEgEzXs0UAYNSMfy9ZTpxuKPA,1013
@@ -107,7 +107,7 @@ aiq/embedder/openai_embedder.py,sha256=5FO3xsyNvEmbLBsZb3xsCpbN1Soxio4yf4b5gTPVx
 aiq/embedder/register.py,sha256=3MTZrfNQKp6AZTbfaA-PpTnyXiMyu-8HH9JnDCC0v9o,978
 aiq/eval/__init__.py,sha256=Xs1JQ16L9btwreh4pdGKwskffAw1YFO48jKrU4ib_7c,685
 aiq/eval/config.py,sha256=IlOr2o618kbkXP0G1F-AklZfsKYVos9UB4Dvlxf66xk,1431
-aiq/eval/evaluate.py,sha256=FFKIWRse9C3z6A7Fyu8GN0ZHMrxGspw9LnhQ7ulEYSE,15125
+aiq/eval/evaluate.py,sha256=AGEvmagd43jLq0aE_yNs_FFPFxVJEx49cu6Fl3WeQqA,17270
 aiq/eval/intermediate_step_adapter.py,sha256=4cSsGgFBvNjXnclk5FvZnQaFEdeulp7VEdRWKLcREAQ,4498
 aiq/eval/register.py,sha256=QOHJqA2CQixeWMC9InyKbzXo1jByvrntD_m9-2Mvg9k,1076
 aiq/eval/remote_workflow.py,sha256=Fb7Z6gdP2L_gqyWB--AEWfcXe9xPpQ_hPsf9lmqGXjI,5524
@@ -129,7 +129,7 @@ aiq/eval/trajectory_evaluator/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5N
 aiq/eval/trajectory_evaluator/evaluate.py,sha256=Y51KMhJ9t8AoYWrQlrwipc2CtgIXA9IUGZTbKegtsnw,3257
 aiq/eval/trajectory_evaluator/register.py,sha256=kktT4fu5_1Cou-iohD3YhQevsWiR3TA5NpFSweVz0eQ,1709
 aiq/eval/tunable_rag_evaluator/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
-aiq/eval/tunable_rag_evaluator/evaluate.py,sha256=xo7gtBI-cOrmk8s6FNLDoMhn2F0ODOxdAtg37i4Vu24,15387
+aiq/eval/tunable_rag_evaluator/evaluate.py,sha256=f4jfn9VVLmkOg631TQr2wy7hPwGMJMsQa4kmXsu0-Uc,13069
 aiq/eval/tunable_rag_evaluator/register.py,sha256=q4p2rFyMzWmaINJc961ZV4jzIlAN4GfWsoImHo0ovsY,2558
 aiq/eval/utils/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
 aiq/eval/utils/output_uploader.py,sha256=SaQbZPkw-Q0H7t5yG60Kh-p1cflR7gPklVkilC4uPbU,5141
@@ -208,7 +208,7 @@ aiq/profiler/inference_optimization/prompt_caching.py,sha256=LGfxJG4R2y4vMFoiFzt
 aiq/profiler/inference_optimization/token_uniqueness.py,sha256=OCNlVmemMLS2kt0OZIXOGt8MbrTy5mbdhSMPYHs31a4,4571
 aiq/profiler/inference_optimization/workflow_runtimes.py,sha256=lnGa0eTpHiDEbx9rX-tcx100qSd6amePLlgb4Gx7JBc,2664
 aiq/profiler/inference_optimization/bottleneck_analysis/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
-aiq/profiler/inference_optimization/bottleneck_analysis/nested_stack_analysis.py,sha256=8IgeAImmYlRy-JEaGeoYE6_BuNZ_3tyZmzXOGvDKCeg,16461
+aiq/profiler/inference_optimization/bottleneck_analysis/nested_stack_analysis.py,sha256=yr81PsQ4TcrEnuPDlRwhL9Hcox3gO855DsS-BDo00u0,16732
 aiq/profiler/inference_optimization/bottleneck_analysis/simple_stack_analysis.py,sha256=VZLBgsIUGOkY0ZUCLHQM4LpBQpJBM5JKRTUBGyoOFWU,11100
 aiq/profiler/inference_optimization/experimental/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
 aiq/profiler/inference_optimization/experimental/concurrency_spike_analysis.py,sha256=J-oMRCEnd6I1XFXiyLUu8VPR745ptnzgzvn0Opsi208,16953
@@ -309,10 +309,10 @@ aiq/utils/reactive/base/observer_base.py,sha256=UAlyAY_ky4q2t0P81RVFo2Bs_R7z5Nde
 aiq/utils/reactive/base/subject_base.py,sha256=Ed-AC6P7cT3qkW1EXjzbd5M9WpVoeN_9KCe3OM3FLU4,2521
 aiq/utils/settings/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
 aiq/utils/settings/global_settings.py,sha256=U9TCLdoZsKq5qOVGjREipGVv9e-FlStzqy5zv82_VYk,7454
-aiqtoolkit-1.2.0a20250616.dist-info/licenses/LICENSE-3rd-party.txt,sha256=8o7aySJa9CBvFshPcsRdJbczzdNyDGJ8b0J67WRUQ2k,183936
-aiqtoolkit-1.2.0a20250616.dist-info/licenses/LICENSE.md,sha256=QwcOLU5TJoTeUhuIXzhdCEEDDvorGiC6-3YTOl4TecE,11356
-aiqtoolkit-1.2.0a20250616.dist-info/METADATA,sha256=Es-uNK-zqvm2HoDhXHzB1O7gj-hY1K3-ovK9nqOBsow,20274
-aiqtoolkit-1.2.0a20250616.dist-info/WHEEL,sha256=_zCd3N1l69ArxyTb8rzEoP9TpbYXkqRFSNOD5OuxnTs,91
-aiqtoolkit-1.2.0a20250616.dist-info/entry_points.txt,sha256=gRlPfR5g21t328WNEQ4CcEz80S1sJNS8A7rMDYnzl4A,452
-aiqtoolkit-1.2.0a20250616.dist-info/top_level.txt,sha256=fo7AzYcNhZ_tRWrhGumtxwnxMew4xrT1iwouDy_f0Kc,4
-aiqtoolkit-1.2.0a20250616.dist-info/RECORD,,
+aiqtoolkit-1.2.0a20250617.dist-info/licenses/LICENSE-3rd-party.txt,sha256=8o7aySJa9CBvFshPcsRdJbczzdNyDGJ8b0J67WRUQ2k,183936
+aiqtoolkit-1.2.0a20250617.dist-info/licenses/LICENSE.md,sha256=QwcOLU5TJoTeUhuIXzhdCEEDDvorGiC6-3YTOl4TecE,11356
+aiqtoolkit-1.2.0a20250617.dist-info/METADATA,sha256=7U1QTdteJlIPEhjypLhDj_VsNOp3dcGC5GyK4sPecD8,20274
+aiqtoolkit-1.2.0a20250617.dist-info/WHEEL,sha256=_zCd3N1l69ArxyTb8rzEoP9TpbYXkqRFSNOD5OuxnTs,91
+aiqtoolkit-1.2.0a20250617.dist-info/entry_points.txt,sha256=gRlPfR5g21t328WNEQ4CcEz80S1sJNS8A7rMDYnzl4A,452
+aiqtoolkit-1.2.0a20250617.dist-info/top_level.txt,sha256=fo7AzYcNhZ_tRWrhGumtxwnxMew4xrT1iwouDy_f0Kc,4
+aiqtoolkit-1.2.0a20250617.dist-info/RECORD,,