aiqtoolkit 1.2.0a20250612__py3-none-any.whl → 1.2.0a20250613__py3-none-any.whl

This diff compares the contents of two publicly released versions of the package as they appear in their public registry. It is provided for informational purposes only.

Potentially problematic release: this version of aiqtoolkit might be problematic.

@@ -81,6 +81,7 @@ class DatasetHandler:
              output_obj=row.get(self.generated_answer_key, "") if structured else "",
              trajectory=row.get(self.trajectory_key, []) if structured else [],
              expected_trajectory=row.get(self.expected_trajectory_key, []) if structured else [],
+             full_dataset_entry=row.to_dict(),
          )

          # if input dataframe is empty return an empty list
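
The DatasetHandler change above attaches the entire dataset row to each evaluation item. As a rough illustration (the column names below are hypothetical, not from the package), row.to_dict() on a pandas row yields every column, including ones the handler does not map to a dedicated field:

    import pandas as pd

    # Hypothetical dataset row with an extra "category" column beyond the standard keys.
    row = pd.Series({"id": 1, "question": "What is AIQ?", "answer": "A toolkit", "category": "docs"})

    # This is what would now be stored on EvalInputItem.full_dataset_entry.
    print(row.to_dict())
    # {'id': 1, 'question': 'What is AIQ?', 'answer': 'A toolkit', 'category': 'docs'}
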
@@ -0,0 +1,73 @@
+ # SPDX-FileCopyrightText: Copyright (c) 2025, NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+ # SPDX-License-Identifier: Apache-2.0
+ #
+ # Licensed under the Apache License, Version 2.0 (the "License");
+ # you may not use this file except in compliance with the License.
+ # You may obtain a copy of the License at
+ #
+ # http://www.apache.org/licenses/LICENSE-2.0
+ #
+ # Unless required by applicable law or agreed to in writing, software
+ # distributed under the License is distributed on an "AS IS" BASIS,
+ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ # See the License for the specific language governing permissions and
+ # limitations under the License.
+
+ import asyncio
+ from abc import ABC
+ from abc import abstractmethod
+
+ from tqdm import tqdm
+
+ from aiq.eval.evaluator.evaluator_model import EvalInput
+ from aiq.eval.evaluator.evaluator_model import EvalInputItem
+ from aiq.eval.evaluator.evaluator_model import EvalOutput
+ from aiq.eval.evaluator.evaluator_model import EvalOutputItem
+ from aiq.eval.utils.tqdm_position_registry import TqdmPositionRegistry
+
+
+ class BaseEvaluator(ABC):
+     """
+     Base class for custom evaluators.
+
+     Each custom evaluator must implement the `evaluate_item` method which is used to evaluate a
+     single EvalInputItem.
+     """
+
+     def __init__(self, max_concurrency: int = 4, tqdm_desc: str = "Evaluating"):
+         self.max_concurrency = max_concurrency
+         self.semaphore = asyncio.Semaphore(max_concurrency)
+         self.tqdm_desc = tqdm_desc
+
+     @abstractmethod
+     async def evaluate_item(self, item: EvalInputItem) -> EvalOutputItem:
+         """Each evaluator must implement this for item-level evaluation"""
+         pass
+
+     async def evaluate(self, eval_input: EvalInput) -> EvalOutput:
+         pbar = None
+         try:
+             tqdm_position = TqdmPositionRegistry.claim()
+             pbar = tqdm(total=len(eval_input.eval_input_items), desc=self.tqdm_desc, position=tqdm_position)
+
+             async def wrapped(item):
+                 async with self.semaphore:
+                     try:
+                         output_item = await self.evaluate_item(item)
+                         pbar.update(1)
+                         return output_item
+                     except Exception as e:
+                         # If the evaluator fails, return an error item with a score of 0.0
+                         pbar.update(1)
+                         return EvalOutputItem(id=item.id, score=0.0, reasoning={"error": f"Evaluator error: {str(e)}"})
+
+             output_items = await asyncio.gather(*[wrapped(item) for item in eval_input.eval_input_items])
+         finally:
+             pbar.close()
+             TqdmPositionRegistry.release(tqdm_position)
+
+         # Compute average if possible
+         numeric_scores = [item.score for item in output_items if isinstance(item.score, (int, float))]
+         avg_score = round(sum(numeric_scores) / len(numeric_scores), 2) if numeric_scores else None
+
+         return EvalOutput(average_score=avg_score, eval_output_items=output_items)
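
The new base class (aiq/eval/evaluator/base_evaluator.py per the RECORD changes) centralizes the semaphore-bounded asyncio.gather, the tqdm progress bar, and the average-score computation, so a concrete evaluator only implements evaluate_item. A minimal sketch of such a subclass; the ExactMatchEvaluator name and its scoring rule are illustrative, not part of the package:

    from aiq.eval.evaluator.base_evaluator import BaseEvaluator
    from aiq.eval.evaluator.evaluator_model import EvalInputItem
    from aiq.eval.evaluator.evaluator_model import EvalOutputItem


    class ExactMatchEvaluator(BaseEvaluator):
        """Hypothetical evaluator: 1.0 when the generated answer matches the expected answer exactly."""

        def __init__(self, max_concurrency: int = 4):
            super().__init__(max_concurrency=max_concurrency, tqdm_desc="Evaluating Exact Match")

        async def evaluate_item(self, item: EvalInputItem) -> EvalOutputItem:
            # Concurrency, progress reporting, and score averaging are handled by BaseEvaluator.evaluate().
            score = 1.0 if item.output_obj == item.expected_output_obj else 0.0
            reasoning = {"generated": item.output_obj, "expected": item.expected_output_obj}
            return EvalOutputItem(id=item.id, score=score, reasoning=reasoning)
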
@@ -27,6 +27,7 @@ class EvalInputItem(BaseModel):
      output_obj: typing.Any
      expected_trajectory: list[IntermediateStep]
      trajectory: list[IntermediateStep]
+     full_dataset_entry: typing.Any


  class EvalInput(BaseModel):
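
Combined with the DatasetHandler change above, this field lets a custom evaluate_item read dataset columns that have no dedicated EvalInputItem field. A hedged sketch; the "difficulty" column and the weighting rule are hypothetical, not part of the package:

    from aiq.eval.evaluator.base_evaluator import BaseEvaluator
    from aiq.eval.evaluator.evaluator_model import EvalInputItem
    from aiq.eval.evaluator.evaluator_model import EvalOutputItem


    class DifficultyWeightedEvaluator(BaseEvaluator):
        """Hypothetical evaluator that weights an exact-match score by a per-row 'difficulty' column."""

        async def evaluate_item(self, item: EvalInputItem) -> EvalOutputItem:
            entry = item.full_dataset_entry if isinstance(item.full_dataset_entry, dict) else {}
            difficulty = entry.get("difficulty", 1.0)  # "difficulty" is a made-up dataset column
            base_score = 1.0 if item.output_obj == item.expected_output_obj else 0.0
            return EvalOutputItem(id=item.id, score=base_score * difficulty, reasoning={"difficulty": difficulty})
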
@@ -13,24 +13,20 @@
  # See the License for the specific language governing permissions and
  # limitations under the License.

- import asyncio
  import logging

  from langchain.evaluation import TrajectoryEvalChain
  from langchain_core.language_models import BaseChatModel
  from langchain_core.tools import BaseTool
- from tqdm import tqdm

- from aiq.eval.evaluator.evaluator_model import EvalInput
+ from aiq.eval.evaluator.base_evaluator import BaseEvaluator
  from aiq.eval.evaluator.evaluator_model import EvalInputItem
- from aiq.eval.evaluator.evaluator_model import EvalOutput
  from aiq.eval.evaluator.evaluator_model import EvalOutputItem
- from aiq.eval.utils.tqdm_position_registry import TqdmPositionRegistry

  logger = logging.getLogger(__name__)


- class TrajectoryEvaluator:
+ class TrajectoryEvaluator(BaseEvaluator):

      def __init__(
          self,
@@ -38,11 +34,9 @@ class TrajectoryEvaluator:
          tools: list[BaseTool] | None = None,
          max_concurrency: int = 8,
      ):
-
+         super().__init__(max_concurrency=max_concurrency, tqdm_desc="Evaluating Trajectory")
          self.llm = llm
          self.tools = tools
-         self.max_concurrency = max_concurrency
-         self.semaphore = asyncio.Semaphore(self.max_concurrency)
          # Initialize trajectory evaluation chain
          self.traj_eval_chain = TrajectoryEvalChain.from_llm(llm=self.llm,
                                                              tools=self.tools,
@@ -50,69 +44,32 @@ class TrajectoryEvaluator:
                                                              requires_reference=True)
          logger.debug("Trajectory evaluation chain initialized.")

-     async def evaluate(self, eval_input: EvalInput) -> EvalOutput:
+     async def evaluate_item(self, item: EvalInputItem) -> EvalOutputItem:
          """
-         Evaluates the agent trajectories using trajectory evaluation chain.
+         Evaluate a single EvalInputItem and return an EvalOutputItem.
          """
-
-         num_records = len(eval_input.eval_input_items)
-         logger.info("Running trajectory evaluation with %d records", num_records)
          from aiq.data_models.intermediate_step import IntermediateStepType
          from aiq.eval.intermediate_step_adapter import IntermediateStepAdapter

          intermediate_step_adapter = IntermediateStepAdapter()
          event_filter = [IntermediateStepType.LLM_END, IntermediateStepType.TOOL_END]

-         async def process_item(item: EvalInputItem) -> tuple[float, dict]:
-             """
-             Evaluate a single EvalInputItem asynchronously and return a tuple of-
-             1. score
-             2. reasoning for the score
-             """
-             question = item.input_obj
-             generated_answer = item.output_obj
-             agent_trajectory = intermediate_step_adapter.get_agent_actions(item.trajectory, event_filter)
-             try:
-                 eval_result = await self.traj_eval_chain.aevaluate_agent_trajectory(
-                     input=question,
-                     agent_trajectory=agent_trajectory,
-                     prediction=generated_answer,
-                 )
-             except Exception as e:
-                 logger.exception("Error evaluating trajectory for question: %s, Error: %s", question, e, exc_info=True)
-                 return 0.0, f"Error evaluating trajectory: {e}"
-
-             reasoning = {
-                 "reasoning": eval_result["reasoning"],
-                 "trajectory": [(action.model_dump(), output) for (action, output) in agent_trajectory]
-             }
-             return eval_result["score"], reasoning
-
-         async def wrapped_process(item: EvalInputItem) -> tuple[float, dict]:
-             async with self.semaphore:
-                 result = await process_item(item)
-                 pbar.update(1)
-                 return result
+         question = item.input_obj
+         generated_answer = item.output_obj
+         agent_trajectory = intermediate_step_adapter.get_agent_actions(item.trajectory, event_filter)

-         # Execute all evaluations asynchronously
          try:
-             tqdm_position = TqdmPositionRegistry.claim()
-             pbar = tqdm(total=len(eval_input.eval_input_items), desc="Evaluating Trajectory", position=tqdm_position)
-             results = await asyncio.gather(*[wrapped_process(item) for item in eval_input.eval_input_items])
-         finally:
-             pbar.close()
-             TqdmPositionRegistry.release(tqdm_position)
-
-         # Extract scores and reasonings
-         sample_scores, sample_reasonings = zip(*results) if results else ([], [])
-
-         # Compute average score
-         avg_score = round(sum(sample_scores) / len(sample_scores), 2) if sample_scores else 0.0
-
-         # Construct EvalOutputItems
-         eval_output_items = [
-             EvalOutputItem(id=item.id, score=score, reasoning=reasoning)
-             for item, score, reasoning in zip(eval_input.eval_input_items, sample_scores, sample_reasonings)
-         ]
-
-         return EvalOutput(average_score=avg_score, eval_output_items=eval_output_items)
+             eval_result = await self.traj_eval_chain.aevaluate_agent_trajectory(
+                 input=question,
+                 agent_trajectory=agent_trajectory,
+                 prediction=generated_answer,
+             )
+         except Exception as e:
+             logger.exception("Error evaluating trajectory for question: %s, Error: %s", question, e, exc_info=True)
+             return EvalOutputItem(id=item.id, score=0.0, reasoning=f"Error evaluating trajectory: {e}")
+
+         reasoning = {
+             "reasoning": eval_result["reasoning"],
+             "trajectory": [(action.model_dump(), output) for (action, output) in agent_trajectory]
+         }
+         return EvalOutputItem(id=item.id, score=eval_result["score"], reasoning=reasoning)
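
With this refactor the trajectory evaluator no longer owns a semaphore, progress bar, or aggregation logic; those now come from the inherited BaseEvaluator.evaluate(). A rough usage sketch, where judge_llm (any LangChain chat model), tools, and eval_input are placeholders supplied by the surrounding evaluation workflow:

    evaluator = TrajectoryEvaluator(llm=judge_llm, tools=tools, max_concurrency=8)

    # evaluate() is inherited from BaseEvaluator: it fans evaluate_item() out with asyncio.gather,
    # bounds concurrency with the semaphore, updates the progress bar, and averages per-item scores.
    eval_output = await evaluator.evaluate(eval_input)
    print(eval_output.average_score)
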
@@ -13,7 +13,6 @@
  # See the License for the specific language governing permissions and
  # limitations under the License.

- import asyncio
  import logging

  from langchain.output_parsers import ResponseSchema
@@ -21,13 +20,10 @@ from langchain.output_parsers import StructuredOutputParser
  from langchain.schema import HumanMessage
  from langchain.schema import SystemMessage
  from langchain_core.language_models import BaseChatModel
- from tqdm import tqdm

- from aiq.eval.evaluator.evaluator_model import EvalInput
+ from aiq.eval.evaluator.base_evaluator import BaseEvaluator
  from aiq.eval.evaluator.evaluator_model import EvalInputItem
- from aiq.eval.evaluator.evaluator_model import EvalOutput
  from aiq.eval.evaluator.evaluator_model import EvalOutputItem
- from aiq.eval.utils.tqdm_position_registry import TqdmPositionRegistry

  logger = logging.getLogger(__name__)

@@ -69,7 +65,7 @@ def evaluation_prompt(judge_llm_prompt: str,
      return EVAL_PROMPT if not default_scoring else DEFAULT_EVAL_PROMPT


- class TunableRagEvaluator:
+ class TunableRagEvaluator(BaseEvaluator):
      '''Tunable RAG evaluator class with customizable LLM prompt for scoring.'''

      def __init__(self,
@@ -78,186 +74,141 @@ class TunableRagEvaluator:
                   max_concurrency: int,
                   default_scoring: bool,
                   default_score_weights: dict):
+         super().__init__(max_concurrency=max_concurrency, tqdm_desc="Evaluating RAG")
          self.llm = llm
-         self.max_concurrency = max_concurrency
          self.judge_llm_prompt = judge_llm_prompt
-         self.semaphore = asyncio.Semaphore(self.max_concurrency)
          self.default_scoring = default_scoring
          # Use user-provided weights if available; otherwise, set equal weights for each score
          self.default_score_weights = default_score_weights if default_score_weights else {
              "coverage": 1 / 3, "correctness": 1 / 3, "relevance": 1 / 3
          }

-     async def evaluate(self, eval_input: EvalInput) -> EvalOutput:
-         '''Evaluate function'''
-
-         async def process_item(item):
-             """Compute RAG evaluation for an individual item"""
-             question = item.input_obj
-             answer_description = item.expected_output_obj
-             generated_answer = item.output_obj
-
-             # Call judge LLM to generate score
-             score = 0.0
-
-             default_evaluation_schema = [
-                 ResponseSchema(
-                     name="coverage_score",
-                     description=
-                     "Score for the coverage of all critical aspects mentioned in the expected answer. Ex. 0.5",
-                     type="float"),
-                 ResponseSchema(
-                     name="correctness_score",
-                     description=
-                     "Score for the accuracy of the generated answer compared to the expected answer. Ex. 0.5",
-                     type="float"),
-                 ResponseSchema(name="relevance_score",
-                                description="Score for the relevance of the generated answer to the question. Ex. 0.5",
-                                type="float"),
-                 ResponseSchema(
-                     name="reasoning",
-                     description=
-                     "1-2 summarized sentences of reasoning for the scores. Ex. 'The generated answer covers all critical aspects mentioned in the expected answer, is correct, and is relevant to the question.'",
-                     type="string"),
-             ]
-
-             custom_evaluation_schema = [
-                 ResponseSchema(name="score", description="Score for the generated answer. Ex. 0.5", type="float"),
-                 ResponseSchema(
-                     name="reasoning",
-                     description=
-                     "1-2 sentence reasoning for the score. Ex. 'The generated answer is exactly the same as the description of the expected answer.'",
-                     type="string"),
-             ]
+     async def evaluate_item(self, item: EvalInputItem) -> EvalOutputItem:
+         '''Evaluate a single item'''
+         question = item.input_obj
+         answer_description = item.expected_output_obj
+         generated_answer = item.output_obj
+
+         # Call judge LLM to generate score
+         score = 0.0
+
+         default_evaluation_schema = [
+             ResponseSchema(
+                 name="coverage_score",
+                 description="Score for the coverage of all critical aspects mentioned in the expected answer. Ex. 0.5",
+                 type="float"),
+             ResponseSchema(
+                 name="correctness_score",
+                 description="Score for the accuracy of the generated answer compared to the expected answer. Ex. 0.5",
+                 type="float"),
+             ResponseSchema(name="relevance_score",
+                            description="Score for the relevance of the generated answer to the question. Ex. 0.5",
+                            type="float"),
+             ResponseSchema(
+                 name="reasoning",
+                 description=
+                 "1-2 summarized sentences of reasoning for the scores. Ex. 'The generated answer covers all critical aspects mentioned in the expected answer, is correct, and is relevant to the question.'",
+                 type="string"),
+         ]

-             if self.default_scoring:
-                 evaluation_schema = default_evaluation_schema
-             else:
-                 evaluation_schema = custom_evaluation_schema
+         custom_evaluation_schema = [
+             ResponseSchema(name="score", description="Score for the generated answer. Ex. 0.5", type="float"),
+             ResponseSchema(
+                 name="reasoning",
+                 description=
+                 "1-2 sentence reasoning for the score. Ex. 'The generated answer is exactly the same as the description of the expected answer.'",
+                 type="string"),
+         ]

-             llm_input_response_parser = StructuredOutputParser.from_response_schemas(evaluation_schema)
-             format_instructions = llm_input_response_parser.get_format_instructions()
+         if self.default_scoring:
+             evaluation_schema = default_evaluation_schema
+         else:
+             evaluation_schema = custom_evaluation_schema

-             eval_prompt = evaluation_prompt(judge_llm_prompt=self.judge_llm_prompt,
-                                             question=question,
-                                             answer_description=answer_description,
-                                             generated_answer=generated_answer,
-                                             format_instructions=format_instructions,
-                                             default_scoring=self.default_scoring)
+         llm_input_response_parser = StructuredOutputParser.from_response_schemas(evaluation_schema)
+         format_instructions = llm_input_response_parser.get_format_instructions()

-             messages = [
-                 SystemMessage(content="You must respond only in JSON format."), HumanMessage(content=eval_prompt)
-             ]
+         eval_prompt = evaluation_prompt(judge_llm_prompt=self.judge_llm_prompt,
+                                         question=question,
+                                         answer_description=answer_description,
+                                         generated_answer=generated_answer,
+                                         format_instructions=format_instructions,
+                                         default_scoring=self.default_scoring)

-             response = await self.llm.ainvoke(messages)
+         messages = [SystemMessage(content="You must respond only in JSON format."), HumanMessage(content=eval_prompt)]

-             # Initialize default values to handle service errors
-             coverage_score = 0.0
-             correctness_score = 0.0
-             relevance_score = 0.0
-             reasoning = "Error in evaluator from parsing judge LLM response."
+         response = await self.llm.ainvoke(messages)

-             try:
-                 parsed_response = llm_input_response_parser.parse(response.content)
-                 if self.default_scoring:
-                     try:
-                         coverage_score = parsed_response["coverage_score"]
-                         correctness_score = parsed_response["correctness_score"]
-                         relevance_score = parsed_response["relevance_score"]
-                         reasoning = parsed_response["reasoning"]
-                     except KeyError as e:
-                         logger.error("Missing required keys in default scoring response: %s",
-                                      ", ".join(str(arg) for arg in e.args))
-                         reasoning = f"Error in evaluator from parsing judge LLM response. Missing required key(s): {', '.join(str(arg) for arg in e.args)}"
-
-                     coverage_weight = self.default_score_weights.get("coverage", 1 / 3)
-                     correctness_weight = self.default_score_weights.get("correctness", 1 / 3)
-                     relevance_weight = self.default_score_weights.get("relevance", 1 / 3)
-
-                     # Calculate score
-                     total_weight = coverage_weight + correctness_weight + relevance_weight
-                     coverage_weight = coverage_weight / total_weight
-                     correctness_weight = correctness_weight / total_weight
-                     relevance_weight = relevance_weight / total_weight
-
-                     if round(coverage_weight + correctness_weight + relevance_weight, 2) != 1:
-                         logger.warning("The sum of the default score weights is not 1. The weights will be normalized.")
-                         coverage_weight = coverage_weight / (coverage_weight + correctness_weight + relevance_weight)
-                         correctness_weight = correctness_weight / (coverage_weight + correctness_weight +
-                                                                    relevance_weight)
-                         relevance_weight = relevance_weight / (coverage_weight + correctness_weight + relevance_weight)
-
-                     score = (coverage_weight * coverage_score + correctness_weight * correctness_score +
-                              relevance_weight * relevance_score)
-
-                 else:
-                     try:
-                         score = parsed_response["score"]
-                         reasoning = parsed_response["reasoning"]
-                     except KeyError as e:
-                         logger.error("Missing required keys in custom scoring response: %s",
-                                      ", ".join(str(arg) for arg in e.args))
-                         reasoning = f"Error in evaluator from parsing judge LLM response. Missing required key(s): {', '.join(str(arg) for arg in e.args)}"
-                         raise
-             except (KeyError, ValueError) as e:
-                 logger.error("Error parsing judge LLM response: %s", e)
-                 score = 0.0
-                 reasoning = "Error in evaluator from parsing judge LLM response."
+         # Initialize default values to handle service errors
+         coverage_score = 0.0
+         correctness_score = 0.0
+         relevance_score = 0.0
+         reasoning = "Error in evaluator from parsing judge LLM response."

+         try:
+             parsed_response = llm_input_response_parser.parse(response.content)
              if self.default_scoring:
-                 reasoning = {
-                     "question": question,
-                     "answer_description": answer_description,
-                     "generated_answer": generated_answer,
-                     "score_breakdown": {
-                         "coverage_score": coverage_score,
-                         "correctness_score": correctness_score,
-                         "relevance_score": relevance_score,
-                     },
-                     "reasoning": reasoning,
-                 }
-             else:
-                 reasoning = {
-                     "question": question,
-                     "answer_description": answer_description,
-                     "generated_answer": generated_answer,
-                     "reasoning": reasoning
-                 }
-
-             return score, reasoning
-
-         async def wrapped_process(item: EvalInputItem) -> tuple[float, dict]:
-             """
-             Process an item asynchronously and update the progress bar.
-             Use the semaphore to limit the number of concurrent items.
-             """
-             async with self.semaphore:
-                 result = await process_item(item)
-                 # Update the progress bar
-                 pbar.update(1)
-                 return result
+                 try:
+                     coverage_score = parsed_response["coverage_score"]
+                     correctness_score = parsed_response["correctness_score"]
+                     relevance_score = parsed_response["relevance_score"]
+                     reasoning = parsed_response["reasoning"]
+                 except KeyError as e:
+                     logger.error("Missing required keys in default scoring response: %s",
+                                  ", ".join(str(arg) for arg in e.args))
+                     reasoning = f"Error in evaluator from parsing judge LLM response. Missing required key(s): {', '.join(str(arg) for arg in e.args)}"
+
+                 coverage_weight = self.default_score_weights.get("coverage", 1 / 3)
+                 correctness_weight = self.default_score_weights.get("correctness", 1 / 3)
+                 relevance_weight = self.default_score_weights.get("relevance", 1 / 3)
+
+                 # Calculate score
+                 total_weight = coverage_weight + correctness_weight + relevance_weight
+                 coverage_weight = coverage_weight / total_weight
+                 correctness_weight = correctness_weight / total_weight
+                 relevance_weight = relevance_weight / total_weight
+
+                 if round(coverage_weight + correctness_weight + relevance_weight, 2) != 1:
+                     logger.warning("The sum of the default score weights is not 1. The weights will be normalized.")
+                     coverage_weight = coverage_weight / (coverage_weight + correctness_weight + relevance_weight)
+                     correctness_weight = correctness_weight / (coverage_weight + correctness_weight + relevance_weight)
+                     relevance_weight = relevance_weight / (coverage_weight + correctness_weight + relevance_weight)
+
+                 score = (coverage_weight * coverage_score + correctness_weight * correctness_score +
+                          relevance_weight * relevance_score)

-         try:
-             # Claim a tqdm position to display the progress bar
-             tqdm_position = TqdmPositionRegistry.claim()
-             # Create a progress bar
-             pbar = tqdm(total=len(eval_input.eval_input_items), desc="Evaluating RAG", position=tqdm_position)
-             # Process items concurrently with a limit on concurrency
-             results = await asyncio.gather(*[wrapped_process(item) for item in eval_input.eval_input_items])
-         finally:
-             pbar.close()
-             TqdmPositionRegistry.release(tqdm_position)
-
-         # Extract scores and reasonings
-         sample_scores, sample_reasonings = zip(*results) if results else ([], [])
-
-         # Compute average score
-         avg_score = round(sum(sample_scores) / len(sample_scores), 2) if sample_scores else 0.0
-
-         # Construct EvalOutputItems
-         eval_output_items = [
-             EvalOutputItem(id=item.id, score=score, reasoning=reasoning)
-             for item, score, reasoning in zip(eval_input.eval_input_items, sample_scores, sample_reasonings)
-         ]
+             else:
+                 try:
+                     score = parsed_response["score"]
+                     reasoning = parsed_response["reasoning"]
+                 except KeyError as e:
+                     logger.error("Missing required keys in custom scoring response: %s",
+                                  ", ".join(str(arg) for arg in e.args))
+                     reasoning = f"Error in evaluator from parsing judge LLM response. Missing required key(s): {', '.join(str(arg) for arg in e.args)}"
+                     raise
+         except (KeyError, ValueError) as e:
+             logger.error("Error parsing judge LLM response: %s", e)
+             score = 0.0
+             reasoning = "Error in evaluator from parsing judge LLM response."

-         return EvalOutput(average_score=avg_score, eval_output_items=eval_output_items)
+         if self.default_scoring:
+             reasoning = {
+                 "question": question,
+                 "answer_description": answer_description,
+                 "generated_answer": generated_answer,
+                 "score_breakdown": {
+                     "coverage_score": coverage_score,
+                     "correctness_score": correctness_score,
+                     "relevance_score": relevance_score,
+                 },
+                 "reasoning": reasoning,
+             }
+         else:
+             reasoning = {
+                 "question": question,
+                 "answer_description": answer_description,
+                 "generated_answer": generated_answer,
+                 "reasoning": reasoning
+             }
+
+         return EvalOutputItem(id=item.id, score=score, reasoning=reasoning)
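
The default-scoring path above normalizes the three weights and then takes a weighted sum of the judge's sub-scores. A small worked example of that arithmetic with hypothetical weights and sub-scores:

    # Hypothetical inputs: user weights {"coverage": 2, "correctness": 1, "relevance": 1}
    # and judge sub-scores coverage=0.5, correctness=1.0, relevance=0.75.
    weights = {"coverage": 2, "correctness": 1, "relevance": 1}
    scores = {"coverage": 0.5, "correctness": 1.0, "relevance": 0.75}

    total = sum(weights.values())                              # 4
    normalized = {k: w / total for k, w in weights.items()}    # 0.5, 0.25, 0.25

    score = sum(normalized[k] * scores[k] for k in scores)     # 0.5*0.5 + 0.25*1.0 + 0.25*0.75
    print(round(score, 4))                                     # 0.6875
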
@@ -154,3 +154,27 @@ async def file_logging_method(config: FileLoggingMethod, builder: Builder):
      handler = logging.FileHandler(filename=config.path, mode="a", encoding="utf-8")
      handler.setLevel(level)
      yield handler
+
+
+ class PatronusTelemetryExporter(TelemetryExporterBaseConfig, name="patronus"):
+     """A telemetry exporter to transmit traces to Patronus service."""
+
+     endpoint: str = Field(description="The Patronus OTEL endpoint")
+     api_key: str = Field(description="The Patronus API key", default="")
+     project: str = Field(description="The project name to group the telemetry traces.")
+
+
+ @register_telemetry_exporter(config_type=PatronusTelemetryExporter)
+ async def patronus_telemetry_exporter(config: PatronusTelemetryExporter, builder: Builder):
+     """Create a Patronus telemetry exporter."""
+     trace_exporter = telemetry_optional_import("opentelemetry.exporter.otlp.proto.grpc.trace_exporter")
+
+     api_key = config.api_key or os.environ.get("PATRONUS_API_KEY")
+     if not api_key:
+         raise ValueError("API key is required for Patronus")
+
+     headers = {
+         "x-api-key": api_key,
+         "pat-project-name": config.project,
+     }
+     yield trace_exporter.OTLPSpanExporter(endpoint=config.endpoint, headers=headers)
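
The new exporter reads the API key from the config or the PATRONUS_API_KEY environment variable and ships traces over OTLP/gRPC with x-api-key and pat-project-name headers. A hedged sketch of building the config programmatically; the import path is inferred from the RECORD entry for aiq/observability/register.py, and the endpoint and project values are placeholders:

    import os

    from aiq.observability.register import PatronusTelemetryExporter  # path inferred from RECORD; may differ

    config = PatronusTelemetryExporter(
        endpoint="https://otel.patronus.ai:4317",  # placeholder OTLP endpoint
        project="my-eval-project",                 # placeholder project name
    )

    # api_key may be left empty in the config; the exporter then falls back to the environment.
    os.environ.setdefault("PATRONUS_API_KEY", "<your-api-key>")
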
@@ -45,6 +45,7 @@ def model_from_mcp_schema(name: str, mcp_input_schema: dict) -> type[BaseModel]:
      }

      properties = mcp_input_schema.get("properties", {})
+     required_fields = set(mcp_input_schema.get("required", []))
      schema_dict = {}

      def _generate_valid_classname(class_name: str):
@@ -70,7 +71,17 @@ def model_from_mcp_schema(name: str, mcp_input_schema: dict) -> type[BaseModel]:
          else:
              field_type = _type_map.get(json_type, Any)

-         default_value = field_properties.get("default", ...)
+         # Determine the default value based on whether the field is required
+         if field_name in required_fields:
+             # Field is required - use explicit default if provided, otherwise make it required
+             default_value = field_properties.get("default", ...)
+         else:
+             # Field is optional - use explicit default if provided, otherwise None
+             default_value = field_properties.get("default", None)
+             # Make the type optional if no default was provided
+             if "default" not in field_properties:
+                 field_type = field_type | None
+
          nullable = field_properties.get("nullable", False)
          description = field_properties.get("description", "")

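The net effect of this change: schema properties listed under "required" keep their previous behavior, while non-required properties without an explicit default now become optional fields typed as `<type> | None` with a None default, instead of being treated as required. A small illustration; the "search" tool schema is hypothetical, and the helper is assumed importable from aiq/tool/mcp/mcp_client.py per the RECORD changes:

    from aiq.tool.mcp.mcp_client import model_from_mcp_schema  # import path assumed from RECORD

    mcp_input_schema = {
        "properties": {
            "query": {"type": "string", "description": "Search query"},
            "limit": {"type": "integer", "description": "Maximum number of results"},
        },
        "required": ["query"],
    }

    SearchInput = model_from_mcp_schema("search", mcp_input_schema)

    print(SearchInput(query="agent toolkits"))  # ok: limit is now optional and defaults to None
    # SearchInput() would raise a validation error, because "query" is still required.
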
@@ -1,6 +1,6 @@
  Metadata-Version: 2.4
  Name: aiqtoolkit
- Version: 1.2.0a20250612
+ Version: 1.2.0a20250613
  Summary: NVIDIA Agent Intelligence toolkit
  Author: NVIDIA Corporation
  Maintainer: NVIDIA Corporation
@@ -115,9 +115,10 @@ aiq/eval/runtime_event_subscriber.py,sha256=2VM8MqmPc_EWPxxrDDR9naiioZirkJUfGwzb
  aiq/eval/dataset_handler/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
  aiq/eval/dataset_handler/dataset_downloader.py,sha256=Zvfbd-fPOhB9n8ZiCBaBKW0y-5v97mQAy3dkBL0OFZ0,4553
  aiq/eval/dataset_handler/dataset_filter.py,sha256=mop6wa4P_QtQ5QkfXv-hVBm3EMerfNECSTJGGDB1YWE,2115
- aiq/eval/dataset_handler/dataset_handler.py,sha256=cqdGVgHm6tsKk3TwFcFhptxAvcHVlZTOh4bXuBsfCYs,7797
+ aiq/eval/dataset_handler/dataset_handler.py,sha256=z4trKYPnqSrLvsKctU9d5WrQW7ddbZZx0zOrYVLqbAA,7847
  aiq/eval/evaluator/__init__.py,sha256=GUJrgGtpvyMUCjUBvR3faAdv-tZzbU9W-izgx9aMEQg,680
- aiq/eval/evaluator/evaluator_model.py,sha256=alO8bVoGmvej1LpN5wZ5HG29TSrL4IMWdVcMew8IOzM,1405
+ aiq/eval/evaluator/base_evaluator.py,sha256=5kqOcTYNecnh9us_XvV58pj5tZI82NGkVN4tg9-R_ZE,3040
+ aiq/eval/evaluator/evaluator_model.py,sha256=5cxe3mqznlNGzv29v_VseYU7OzoT1eTf7hgSPQxytsM,1440
  aiq/eval/rag_evaluator/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
  aiq/eval/rag_evaluator/evaluate.py,sha256=lEjXKiuELAHyWckz-bM91dZ6AT2J6NC7SfvtedR-Qdk,6548
  aiq/eval/rag_evaluator/register.py,sha256=2NzxkgqyoZ4wC8ARj3tiVoE8ENCmplBCIKrNOFh6_VI,5642
@@ -125,10 +126,10 @@ aiq/eval/swe_bench_evaluator/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NM
  aiq/eval/swe_bench_evaluator/evaluate.py,sha256=kNukRruq1EM1RsGLvpVuC22xcP0gpn9acF3edGak9vY,9858
  aiq/eval/swe_bench_evaluator/register.py,sha256=sTb74F7w4iuI0ROsEJ4bV13Nt1GEWQn7UvO2O0HXwXk,1537
  aiq/eval/trajectory_evaluator/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
- aiq/eval/trajectory_evaluator/evaluate.py,sha256=pfcrFGMmunHS8lG13Rdi0Vf4dw7cTwY0uUN5eOXAA1s,5064
+ aiq/eval/trajectory_evaluator/evaluate.py,sha256=Y51KMhJ9t8AoYWrQlrwipc2CtgIXA9IUGZTbKegtsnw,3257
  aiq/eval/trajectory_evaluator/register.py,sha256=kktT4fu5_1Cou-iohD3YhQevsWiR3TA5NpFSweVz0eQ,1709
  aiq/eval/tunable_rag_evaluator/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
- aiq/eval/tunable_rag_evaluator/evaluate.py,sha256=XsQK8BPTWLkolRUd5wZtQpUITukLCmE_bOlY2tjFvGs,13844
+ aiq/eval/tunable_rag_evaluator/evaluate.py,sha256=lZxQDhvcAu0JR1RApkbs-G3T9pUOSfh822TYGp7vrQw,11440
  aiq/eval/tunable_rag_evaluator/register.py,sha256=uV36xONVxQW8qBO_bsvbvZk4-J4IhowxiRKErnYsbzA,2369
  aiq/eval/utils/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
  aiq/eval/utils/output_uploader.py,sha256=SaQbZPkw-Q0H7t5yG60Kh-p1cflR7gPklVkilC4uPbU,5141
@@ -175,7 +176,7 @@ aiq/meta/module_to_distro.json,sha256=1XV7edobFrdDKvsSoynfodXg_hczUWpDrQzGkW9qqE
  aiq/meta/pypi.md,sha256=N1fvWaio3KhnAw9yigeM-oWaLuT5i_C7U_2UVzyPbks,4386
  aiq/observability/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
  aiq/observability/async_otel_listener.py,sha256=2Ye9bkHfAssuxFS_ECyRyl-bTa73yYvsPyO4BaK5Beg,19662
- aiq/observability/register.py,sha256=CoYr6-rt7Go3fhJZHlQg52SfPqHqySaexBxlv4xtRwA,6619
+ aiq/observability/register.py,sha256=mejMBVr3dHHfShIiyn1fIbA0Gb6z9Ayg8WRMgB0wf5E,7646
  aiq/plugins/.namespace,sha256=Gace0pOC3ETEJf-TBVuNw0TQV6J_KtOPpEiSzMH-odo,215
  aiq/profiler/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
  aiq/profiler/data_frame_row.py,sha256=vudqk1ZzZtlZln2Ir43mPl3nwNc0pQlhwbtdY9oSKtI,1755
@@ -277,7 +278,7 @@ aiq/tool/github_tools/get_github_issue.py,sha256=vwLNkNOszLlymkQju0cR8BNvfdH4Enm
  aiq/tool/github_tools/get_github_pr.py,sha256=b7eCOqrVoejGjRwmUVdU45uF07ihbY8lRacMYOSgMrY,9716
  aiq/tool/github_tools/update_github_issue.py,sha256=TUElxUuzjZr_QldL_48RcqSx0A9b23NB_lA82QwFjkM,4103
  aiq/tool/mcp/__init__.py,sha256=GUJrgGtpvyMUCjUBvR3faAdv-tZzbU9W-izgx9aMEQg,680
- aiq/tool/mcp/mcp_client.py,sha256=HWyYsbpA5IysWWdB3XipWzqCfYUio7cJWymt4TbQKyM,7496
+ aiq/tool/mcp/mcp_client.py,sha256=lYbf669ATqGKkL0jjd76r0aAtAFnWeruWw-lOPsmYu8,8103
  aiq/tool/mcp/mcp_tool.py,sha256=rQQcaCT-GHQcDmG5weX-2Y-HxBPX-0cC73LjL1u0FUU,4009
  aiq/tool/memory_tools/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
  aiq/tool/memory_tools/add_memory_tool.py,sha256=9EjB3DpYhxwasz7o3O8Rq__Ys5986fciv44ahC6mVCo,3349
@@ -308,10 +309,10 @@ aiq/utils/reactive/base/observer_base.py,sha256=UAlyAY_ky4q2t0P81RVFo2Bs_R7z5Nde
  aiq/utils/reactive/base/subject_base.py,sha256=Ed-AC6P7cT3qkW1EXjzbd5M9WpVoeN_9KCe3OM3FLU4,2521
  aiq/utils/settings/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
  aiq/utils/settings/global_settings.py,sha256=U9TCLdoZsKq5qOVGjREipGVv9e-FlStzqy5zv82_VYk,7454
- aiqtoolkit-1.2.0a20250612.dist-info/licenses/LICENSE-3rd-party.txt,sha256=8o7aySJa9CBvFshPcsRdJbczzdNyDGJ8b0J67WRUQ2k,183936
- aiqtoolkit-1.2.0a20250612.dist-info/licenses/LICENSE.md,sha256=QwcOLU5TJoTeUhuIXzhdCEEDDvorGiC6-3YTOl4TecE,11356
- aiqtoolkit-1.2.0a20250612.dist-info/METADATA,sha256=1SStcP6v0If09VOfwJk8W0EHi7waeCGhJ8KVMW2Q5-o,20274
- aiqtoolkit-1.2.0a20250612.dist-info/WHEEL,sha256=_zCd3N1l69ArxyTb8rzEoP9TpbYXkqRFSNOD5OuxnTs,91
- aiqtoolkit-1.2.0a20250612.dist-info/entry_points.txt,sha256=gRlPfR5g21t328WNEQ4CcEz80S1sJNS8A7rMDYnzl4A,452
- aiqtoolkit-1.2.0a20250612.dist-info/top_level.txt,sha256=fo7AzYcNhZ_tRWrhGumtxwnxMew4xrT1iwouDy_f0Kc,4
- aiqtoolkit-1.2.0a20250612.dist-info/RECORD,,
+ aiqtoolkit-1.2.0a20250613.dist-info/licenses/LICENSE-3rd-party.txt,sha256=8o7aySJa9CBvFshPcsRdJbczzdNyDGJ8b0J67WRUQ2k,183936
+ aiqtoolkit-1.2.0a20250613.dist-info/licenses/LICENSE.md,sha256=QwcOLU5TJoTeUhuIXzhdCEEDDvorGiC6-3YTOl4TecE,11356
+ aiqtoolkit-1.2.0a20250613.dist-info/METADATA,sha256=0LV-fg4UXDznF9C1ojoVD1qrvT1Spoc0w7duaBn_QVI,20274
+ aiqtoolkit-1.2.0a20250613.dist-info/WHEEL,sha256=_zCd3N1l69ArxyTb8rzEoP9TpbYXkqRFSNOD5OuxnTs,91
+ aiqtoolkit-1.2.0a20250613.dist-info/entry_points.txt,sha256=gRlPfR5g21t328WNEQ4CcEz80S1sJNS8A7rMDYnzl4A,452
+ aiqtoolkit-1.2.0a20250613.dist-info/top_level.txt,sha256=fo7AzYcNhZ_tRWrhGumtxwnxMew4xrT1iwouDy_f0Kc,4
+ aiqtoolkit-1.2.0a20250613.dist-info/RECORD,,