aiqtoolkit 1.2.0a20250612__py3-none-any.whl → 1.2.0a20250613__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Potentially problematic release: this version of aiqtoolkit has been flagged for review. See the registry's advisory page for details.
- aiq/eval/dataset_handler/dataset_handler.py +1 -0
- aiq/eval/evaluator/base_evaluator.py +73 -0
- aiq/eval/evaluator/evaluator_model.py +1 -0
- aiq/eval/trajectory_evaluator/evaluate.py +22 -65
- aiq/eval/tunable_rag_evaluator/evaluate.py +121 -170
- aiq/observability/register.py +24 -0
- aiq/tool/mcp/mcp_client.py +12 -1
- {aiqtoolkit-1.2.0a20250612.dist-info → aiqtoolkit-1.2.0a20250613.dist-info}/METADATA +1 -1
- {aiqtoolkit-1.2.0a20250612.dist-info → aiqtoolkit-1.2.0a20250613.dist-info}/RECORD +14 -13
- {aiqtoolkit-1.2.0a20250612.dist-info → aiqtoolkit-1.2.0a20250613.dist-info}/WHEEL +0 -0
- {aiqtoolkit-1.2.0a20250612.dist-info → aiqtoolkit-1.2.0a20250613.dist-info}/entry_points.txt +0 -0
- {aiqtoolkit-1.2.0a20250612.dist-info → aiqtoolkit-1.2.0a20250613.dist-info}/licenses/LICENSE-3rd-party.txt +0 -0
- {aiqtoolkit-1.2.0a20250612.dist-info → aiqtoolkit-1.2.0a20250613.dist-info}/licenses/LICENSE.md +0 -0
- {aiqtoolkit-1.2.0a20250612.dist-info → aiqtoolkit-1.2.0a20250613.dist-info}/top_level.txt +0 -0
aiq/eval/dataset_handler/dataset_handler.py
CHANGED
@@ -81,6 +81,7 @@ class DatasetHandler:
                 output_obj=row.get(self.generated_answer_key, "") if structured else "",
                 trajectory=row.get(self.trajectory_key, []) if structured else [],
                 expected_trajectory=row.get(self.expected_trajectory_key, []) if structured else [],
+                full_dataset_entry=row.to_dict(),
             )
 
         # if input dataframe is empty return an empty list
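The effect of this one-line addition is that every EvalInputItem now carries the entire dataset row alongside the mapped fields, so evaluators can reach columns outside the standard mapping. A minimal sketch of the idea; the column names are illustrative, not taken from the package:

import pandas as pd

# Hypothetical dataset row with an extra column beyond the mapped keys.
row = pd.Series({
    "question": "What does the tool do?",
    "generated_answer": "It answers questions.",
    "category": "smoke-test",  # extra metadata, previously dropped
})

full_dataset_entry = row.to_dict()
# -> {'question': ..., 'generated_answer': ..., 'category': 'smoke-test'}
# An evaluator can now read the extra column via item.full_dataset_entry.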
aiq/eval/evaluator/base_evaluator.py
ADDED
@@ -0,0 +1,73 @@
+# SPDX-FileCopyrightText: Copyright (c) 2025, NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+# SPDX-License-Identifier: Apache-2.0
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+import asyncio
+from abc import ABC
+from abc import abstractmethod
+
+from tqdm import tqdm
+
+from aiq.eval.evaluator.evaluator_model import EvalInput
+from aiq.eval.evaluator.evaluator_model import EvalInputItem
+from aiq.eval.evaluator.evaluator_model import EvalOutput
+from aiq.eval.evaluator.evaluator_model import EvalOutputItem
+from aiq.eval.utils.tqdm_position_registry import TqdmPositionRegistry
+
+
+class BaseEvaluator(ABC):
+    """
+    Base class for custom evaluators.
+
+    Each custom evaluator must implement the `evaluate_item` method which is used to evaluate a
+    single EvalInputItem.
+    """
+
+    def __init__(self, max_concurrency: int = 4, tqdm_desc: str = "Evaluating"):
+        self.max_concurrency = max_concurrency
+        self.semaphore = asyncio.Semaphore(max_concurrency)
+        self.tqdm_desc = tqdm_desc
+
+    @abstractmethod
+    async def evaluate_item(self, item: EvalInputItem) -> EvalOutputItem:
+        """Each evaluator must implement this for item-level evaluation"""
+        pass
+
+    async def evaluate(self, eval_input: EvalInput) -> EvalOutput:
+        pbar = None
+        try:
+            tqdm_position = TqdmPositionRegistry.claim()
+            pbar = tqdm(total=len(eval_input.eval_input_items), desc=self.tqdm_desc, position=tqdm_position)
+
+            async def wrapped(item):
+                async with self.semaphore:
+                    try:
+                        output_item = await self.evaluate_item(item)
+                        pbar.update(1)
+                        return output_item
+                    except Exception as e:
+                        # If the evaluator fails, return an error item with a score of 0.0
+                        pbar.update(1)
+                        return EvalOutputItem(id=item.id, score=0.0, reasoning={"error": f"Evaluator error: {str(e)}"})
+
+            output_items = await asyncio.gather(*[wrapped(item) for item in eval_input.eval_input_items])
+        finally:
+            pbar.close()
+            TqdmPositionRegistry.release(tqdm_position)
+
+        # Compute average if possible
+        numeric_scores = [item.score for item in output_items if isinstance(item.score, (int, float))]
+        avg_score = round(sum(numeric_scores) / len(numeric_scores), 2) if numeric_scores else None
+
+        return EvalOutput(average_score=avg_score, eval_output_items=output_items)
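The new BaseEvaluator centralizes the semaphore-bounded fan-out, progress reporting, and error trapping that each evaluator previously re-implemented; a subclass only supplies evaluate_item. A minimal sketch of such a subclass; the exact-match scoring logic is invented for illustration:

from aiq.eval.evaluator.base_evaluator import BaseEvaluator
from aiq.eval.evaluator.evaluator_model import EvalInputItem
from aiq.eval.evaluator.evaluator_model import EvalOutputItem


class ExactMatchEvaluator(BaseEvaluator):
    """Hypothetical evaluator: scores 1.0 when the generated answer matches exactly."""

    def __init__(self, max_concurrency: int = 4):
        super().__init__(max_concurrency=max_concurrency, tqdm_desc="Evaluating Exact Match")

    async def evaluate_item(self, item: EvalInputItem) -> EvalOutputItem:
        score = 1.0 if item.output_obj == item.expected_output_obj else 0.0
        return EvalOutputItem(id=item.id, score=score, reasoning={"matched": bool(score)})

# The inherited evaluate() then handles concurrency, the progress bar,
# per-item error handling, and the average-score computation.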
aiq/eval/trajectory_evaluator/evaluate.py
CHANGED
@@ -13,24 +13,20 @@
 # See the License for the specific language governing permissions and
 # limitations under the License.
 
-import asyncio
 import logging
 
 from langchain.evaluation import TrajectoryEvalChain
 from langchain_core.language_models import BaseChatModel
 from langchain_core.tools import BaseTool
-from tqdm import tqdm
 
-from aiq.eval.evaluator.
+from aiq.eval.evaluator.base_evaluator import BaseEvaluator
 from aiq.eval.evaluator.evaluator_model import EvalInputItem
-from aiq.eval.evaluator.evaluator_model import EvalOutput
 from aiq.eval.evaluator.evaluator_model import EvalOutputItem
-from aiq.eval.utils.tqdm_position_registry import TqdmPositionRegistry
 
 logger = logging.getLogger(__name__)
 
 
-class TrajectoryEvaluator:
+class TrajectoryEvaluator(BaseEvaluator):
 
     def __init__(
         self,
@@ -38,11 +34,9 @@ class TrajectoryEvaluator:
         tools: list[BaseTool] | None = None,
         max_concurrency: int = 8,
     ):
-
+        super().__init__(max_concurrency=max_concurrency, tqdm_desc="Evaluating Trajectory")
         self.llm = llm
         self.tools = tools
-        self.max_concurrency = max_concurrency
-        self.semaphore = asyncio.Semaphore(self.max_concurrency)
         # Initialize trajectory evaluation chain
         self.traj_eval_chain = TrajectoryEvalChain.from_llm(llm=self.llm,
                                                             tools=self.tools,
@@ -50,69 +44,32 @@ class TrajectoryEvaluator:
                                                             requires_reference=True)
         logger.debug("Trajectory evaluation chain initialized.")
 
-    async def
+    async def evaluate_item(self, item: EvalInputItem) -> EvalOutputItem:
         """
-
+        Evaluate a single EvalInputItem and return an EvalOutputItem.
         """
-
-        num_records = len(eval_input.eval_input_items)
-        logger.info("Running trajectory evaluation with %d records", num_records)
         from aiq.data_models.intermediate_step import IntermediateStepType
         from aiq.eval.intermediate_step_adapter import IntermediateStepAdapter
 
         intermediate_step_adapter = IntermediateStepAdapter()
         event_filter = [IntermediateStepType.LLM_END, IntermediateStepType.TOOL_END]
 
-
-
-
-            1. score
-            2. reasoning for the score
-            """
-            question = item.input_obj
-            generated_answer = item.output_obj
-            agent_trajectory = intermediate_step_adapter.get_agent_actions(item.trajectory, event_filter)
-            try:
-                eval_result = await self.traj_eval_chain.aevaluate_agent_trajectory(
-                    input=question,
-                    agent_trajectory=agent_trajectory,
-                    prediction=generated_answer,
-                )
-            except Exception as e:
-                logger.exception("Error evaluating trajectory for question: %s, Error: %s", question, e, exc_info=True)
-                return 0.0, f"Error evaluating trajectory: {e}"
-
-            reasoning = {
-                "reasoning": eval_result["reasoning"],
-                "trajectory": [(action.model_dump(), output) for (action, output) in agent_trajectory]
-            }
-            return eval_result["score"], reasoning
-
-        async def wrapped_process(item: EvalInputItem) -> tuple[float, dict]:
-            async with self.semaphore:
-                result = await process_item(item)
-                pbar.update(1)
-                return result
+        question = item.input_obj
+        generated_answer = item.output_obj
+        agent_trajectory = intermediate_step_adapter.get_agent_actions(item.trajectory, event_filter)
 
-        # Execute all evaluations asynchronously
         try:
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-        eval_output_items = [
-            EvalOutputItem(id=item.id, score=score, reasoning=reasoning)
-            for item, score, reasoning in zip(eval_input.eval_input_items, sample_scores, sample_reasonings)
-        ]
-
-        return EvalOutput(average_score=avg_score, eval_output_items=eval_output_items)
+            eval_result = await self.traj_eval_chain.aevaluate_agent_trajectory(
+                input=question,
+                agent_trajectory=agent_trajectory,
+                prediction=generated_answer,
+            )
+        except Exception as e:
+            logger.exception("Error evaluating trajectory for question: %s, Error: %s", question, e, exc_info=True)
+            return EvalOutputItem(id=item.id, score=0.0, reasoning=f"Error evaluating trajectory: {e}")
+
+        reasoning = {
+            "reasoning": eval_result["reasoning"],
+            "trajectory": [(action.model_dump(), output) for (action, output) in agent_trajectory]
+        }
+        return EvalOutputItem(id=item.id, score=eval_result["score"], reasoning=reasoning)
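With the refactor, running the trajectory evaluator reduces to the inherited evaluate() call. A hypothetical wiring sketch; the llm and eval_input objects stand in for what the eval harness normally constructs:

import asyncio

async def run_trajectory_eval(llm, eval_input):  # llm: BaseChatModel, eval_input: EvalInput
    evaluator = TrajectoryEvaluator(llm=llm, tools=None, max_concurrency=8)
    # Concurrency limits, the progress bar, and per-item error handling
    # all come from the inherited BaseEvaluator.evaluate().
    eval_output = await evaluator.evaluate(eval_input)
    print(eval_output.average_score)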
aiq/eval/tunable_rag_evaluator/evaluate.py
CHANGED
@@ -13,7 +13,6 @@
 # See the License for the specific language governing permissions and
 # limitations under the License.
 
-import asyncio
 import logging
 
 from langchain.output_parsers import ResponseSchema
@@ -21,13 +20,10 @@ from langchain.output_parsers import StructuredOutputParser
 from langchain.schema import HumanMessage
 from langchain.schema import SystemMessage
 from langchain_core.language_models import BaseChatModel
-from tqdm import tqdm
 
-from aiq.eval.evaluator.
+from aiq.eval.evaluator.base_evaluator import BaseEvaluator
 from aiq.eval.evaluator.evaluator_model import EvalInputItem
-from aiq.eval.evaluator.evaluator_model import EvalOutput
 from aiq.eval.evaluator.evaluator_model import EvalOutputItem
-from aiq.eval.utils.tqdm_position_registry import TqdmPositionRegistry
 
 logger = logging.getLogger(__name__)
 
@@ -69,7 +65,7 @@ def evaluation_prompt(judge_llm_prompt: str,
     return EVAL_PROMPT if not default_scoring else DEFAULT_EVAL_PROMPT
 
 
-class TunableRagEvaluator:
+class TunableRagEvaluator(BaseEvaluator):
     '''Tunable RAG evaluator class with customizable LLM prompt for scoring.'''
 
     def __init__(self,
@@ -78,186 +74,141 @@ class TunableRagEvaluator:
                  max_concurrency: int,
                  default_scoring: bool,
                  default_score_weights: dict):
+        super().__init__(max_concurrency=max_concurrency, tqdm_desc="Evaluating RAG")
         self.llm = llm
-        self.max_concurrency = max_concurrency
         self.judge_llm_prompt = judge_llm_prompt
-        self.semaphore = asyncio.Semaphore(self.max_concurrency)
         self.default_scoring = default_scoring
         # Use user-provided weights if available; otherwise, set equal weights for each score
         self.default_score_weights = default_score_weights if default_score_weights else {
             "coverage": 1 / 3, "correctness": 1 / 3, "relevance": 1 / 3
         }
 
-    async def
-        '''Evaluate
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-                name="reasoning",
-                description=
-                "1-2 summarized sentences of reasoning for the scores. Ex. 'The generated answer covers all critical aspects mentioned in the expected answer, is correct, and is relevant to the question.'",
-                type="string"),
-        ]
-
-        custom_evaluation_schema = [
-            ResponseSchema(name="score", description="Score for the generated answer. Ex. 0.5", type="float"),
-            ResponseSchema(
-                name="reasoning",
-                description=
-                "1-2 sentence reasoning for the score. Ex. 'The generated answer is exactly the same as the description of the expected answer.'",
-                type="string"),
-        ]
+    async def evaluate_item(self, item: EvalInputItem) -> EvalOutputItem:
+        '''Evaluate a single item'''
+        question = item.input_obj
+        answer_description = item.expected_output_obj
+        generated_answer = item.output_obj
+
+        # Call judge LLM to generate score
+        score = 0.0
+
+        default_evaluation_schema = [
+            ResponseSchema(
+                name="coverage_score",
+                description="Score for the coverage of all critical aspects mentioned in the expected answer. Ex. 0.5",
+                type="float"),
+            ResponseSchema(
+                name="correctness_score",
+                description="Score for the accuracy of the generated answer compared to the expected answer. Ex. 0.5",
+                type="float"),
+            ResponseSchema(name="relevance_score",
+                           description="Score for the relevance of the generated answer to the question. Ex. 0.5",
+                           type="float"),
+            ResponseSchema(
+                name="reasoning",
+                description=
+                "1-2 summarized sentences of reasoning for the scores. Ex. 'The generated answer covers all critical aspects mentioned in the expected answer, is correct, and is relevant to the question.'",
+                type="string"),
+        ]
 
-
-
-
-
+        custom_evaluation_schema = [
+            ResponseSchema(name="score", description="Score for the generated answer. Ex. 0.5", type="float"),
+            ResponseSchema(
+                name="reasoning",
+                description=
+                "1-2 sentence reasoning for the score. Ex. 'The generated answer is exactly the same as the description of the expected answer.'",
+                type="string"),
+        ]
 
-
-
+        if self.default_scoring:
+            evaluation_schema = default_evaluation_schema
+        else:
+            evaluation_schema = custom_evaluation_schema
 
-
-
-            answer_description=answer_description,
-            generated_answer=generated_answer,
-            format_instructions=format_instructions,
-            default_scoring=self.default_scoring)
+        llm_input_response_parser = StructuredOutputParser.from_response_schemas(evaluation_schema)
+        format_instructions = llm_input_response_parser.get_format_instructions()
 
-
-
-
+        eval_prompt = evaluation_prompt(judge_llm_prompt=self.judge_llm_prompt,
+                                        question=question,
+                                        answer_description=answer_description,
+                                        generated_answer=generated_answer,
+                                        format_instructions=format_instructions,
+                                        default_scoring=self.default_scoring)
 
-
+        messages = [SystemMessage(content="You must respond only in JSON format."), HumanMessage(content=eval_prompt)]
 
-
-        coverage_score = 0.0
-        correctness_score = 0.0
-        relevance_score = 0.0
-        reasoning = "Error in evaluator from parsing judge LLM response."
+        response = await self.llm.ainvoke(messages)
 
-
-
-
-
-
-                correctness_score = parsed_response["correctness_score"]
-                relevance_score = parsed_response["relevance_score"]
-                reasoning = parsed_response["reasoning"]
-            except KeyError as e:
-                logger.error("Missing required keys in default scoring response: %s",
-                             ", ".join(str(arg) for arg in e.args))
-                reasoning = f"Error in evaluator from parsing judge LLM response. Missing required key(s): {', '.join(str(arg) for arg in e.args)}"
-
-            coverage_weight = self.default_score_weights.get("coverage", 1 / 3)
-            correctness_weight = self.default_score_weights.get("correctness", 1 / 3)
-            relevance_weight = self.default_score_weights.get("relevance", 1 / 3)
-
-            # Calculate score
-            total_weight = coverage_weight + correctness_weight + relevance_weight
-            coverage_weight = coverage_weight / total_weight
-            correctness_weight = correctness_weight / total_weight
-            relevance_weight = relevance_weight / total_weight
-
-            if round(coverage_weight + correctness_weight + relevance_weight, 2) != 1:
-                logger.warning("The sum of the default score weights is not 1. The weights will be normalized.")
-                coverage_weight = coverage_weight / (coverage_weight + correctness_weight + relevance_weight)
-                correctness_weight = correctness_weight / (coverage_weight + correctness_weight +
-                                                           relevance_weight)
-                relevance_weight = relevance_weight / (coverage_weight + correctness_weight + relevance_weight)
-
-            score = (coverage_weight * coverage_score + correctness_weight * correctness_score +
-                     relevance_weight * relevance_score)
-
-        else:
-            try:
-                score = parsed_response["score"]
-                reasoning = parsed_response["reasoning"]
-            except KeyError as e:
-                logger.error("Missing required keys in custom scoring response: %s",
-                             ", ".join(str(arg) for arg in e.args))
-                reasoning = f"Error in evaluator from parsing judge LLM response. Missing required key(s): {', '.join(str(arg) for arg in e.args)}"
-                raise
-        except (KeyError, ValueError) as e:
-            logger.error("Error parsing judge LLM response: %s", e)
-            score = 0.0
-            reasoning = "Error in evaluator from parsing judge LLM response."
+        # Initialize default values to handle service errors
+        coverage_score = 0.0
+        correctness_score = 0.0
+        relevance_score = 0.0
+        reasoning = "Error in evaluator from parsing judge LLM response."
 
+        try:
+            parsed_response = llm_input_response_parser.parse(response.content)
         if self.default_scoring:
-
-            "
-            "
-            "
-            "
-
-
-
-        }
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-            # Update the progress bar
-            pbar.update(1)
-            return result
+            try:
+                coverage_score = parsed_response["coverage_score"]
+                correctness_score = parsed_response["correctness_score"]
+                relevance_score = parsed_response["relevance_score"]
+                reasoning = parsed_response["reasoning"]
+            except KeyError as e:
+                logger.error("Missing required keys in default scoring response: %s",
+                             ", ".join(str(arg) for arg in e.args))
+                reasoning = f"Error in evaluator from parsing judge LLM response. Missing required key(s): {', '.join(str(arg) for arg in e.args)}"
+
+            coverage_weight = self.default_score_weights.get("coverage", 1 / 3)
+            correctness_weight = self.default_score_weights.get("correctness", 1 / 3)
+            relevance_weight = self.default_score_weights.get("relevance", 1 / 3)
+
+            # Calculate score
+            total_weight = coverage_weight + correctness_weight + relevance_weight
+            coverage_weight = coverage_weight / total_weight
+            correctness_weight = correctness_weight / total_weight
+            relevance_weight = relevance_weight / total_weight
+
+            if round(coverage_weight + correctness_weight + relevance_weight, 2) != 1:
+                logger.warning("The sum of the default score weights is not 1. The weights will be normalized.")
+                coverage_weight = coverage_weight / (coverage_weight + correctness_weight + relevance_weight)
+                correctness_weight = correctness_weight / (coverage_weight + correctness_weight + relevance_weight)
+                relevance_weight = relevance_weight / (coverage_weight + correctness_weight + relevance_weight)
+
+            score = (coverage_weight * coverage_score + correctness_weight * correctness_score +
+                     relevance_weight * relevance_score)
 
-
-
-
-
-
-
-
-
-
-
-
-
-
-        # Compute average score
-        avg_score = round(sum(sample_scores) / len(sample_scores), 2) if sample_scores else 0.0
-
-        # Construct EvalOutputItems
-        eval_output_items = [
-            EvalOutputItem(id=item.id, score=score, reasoning=reasoning)
-            for item, score, reasoning in zip(eval_input.eval_input_items, sample_scores, sample_reasonings)
-        ]
+        else:
+            try:
+                score = parsed_response["score"]
+                reasoning = parsed_response["reasoning"]
+            except KeyError as e:
+                logger.error("Missing required keys in custom scoring response: %s",
+                             ", ".join(str(arg) for arg in e.args))
+                reasoning = f"Error in evaluator from parsing judge LLM response. Missing required key(s): {', '.join(str(arg) for arg in e.args)}"
+                raise
+        except (KeyError, ValueError) as e:
+            logger.error("Error parsing judge LLM response: %s", e)
+            score = 0.0
+            reasoning = "Error in evaluator from parsing judge LLM response."
 
-
+        if self.default_scoring:
+            reasoning = {
+                "question": question,
+                "answer_description": answer_description,
+                "generated_answer": generated_answer,
+                "score_breakdown": {
+                    "coverage_score": coverage_score,
+                    "correctness_score": correctness_score,
+                    "relevance_score": relevance_score,
+                },
+                "reasoning": reasoning,
+            }
+        else:
+            reasoning = {
+                "question": question,
+                "answer_description": answer_description,
+                "generated_answer": generated_answer,
+                "reasoning": reasoning
+            }
+
+        return EvalOutputItem(id=item.id, score=score, reasoning=reasoning)
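In the default-scoring path the final score is a weighted average of the three sub-scores after the weights are normalized by their sum. A worked example of that arithmetic with assumed weights and sub-scores:

# Assumed config: coverage counts double.
weights = {"coverage": 2.0, "correctness": 1.0, "relevance": 1.0}
scores = {"coverage": 0.5, "correctness": 1.0, "relevance": 0.8}

total = sum(weights.values())                            # 4.0
normalized = {k: w / total for k, w in weights.items()}  # 0.5, 0.25, 0.25
score = sum(normalized[k] * scores[k] for k in scores)   # 0.25 + 0.25 + 0.20 = 0.70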
aiq/observability/register.py
CHANGED
@@ -154,3 +154,27 @@ async def file_logging_method(config: FileLoggingMethod, builder: Builder):
     handler = logging.FileHandler(filename=config.path, mode="a", encoding="utf-8")
     handler.setLevel(level)
     yield handler
+
+
+class PatronusTelemetryExporter(TelemetryExporterBaseConfig, name="patronus"):
+    """A telemetry exporter to transmit traces to Patronus service."""
+
+    endpoint: str = Field(description="The Patronus OTEL endpoint")
+    api_key: str = Field(description="The Patronus API key", default="")
+    project: str = Field(description="The project name to group the telemetry traces.")
+
+
+@register_telemetry_exporter(config_type=PatronusTelemetryExporter)
+async def patronus_telemetry_exporter(config: PatronusTelemetryExporter, builder: Builder):
+    """Create a Patronus telemetry exporter."""
+    trace_exporter = telemetry_optional_import("opentelemetry.exporter.otlp.proto.grpc.trace_exporter")
+
+    api_key = config.api_key or os.environ.get("PATRONUS_API_KEY")
+    if not api_key:
+        raise ValueError("API key is required for Patronus")
+
+    headers = {
+        "x-api-key": api_key,
+        "pat-project-name": config.project,
+    }
+    yield trace_exporter.OTLPSpanExporter(endpoint=config.endpoint, headers=headers)
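Under the hood this is a standard OTLP/gRPC span exporter pointed at Patronus with two extra headers. A standalone sketch of the equivalent construction; the endpoint URL and project name are assumptions, the real values come from the exporter config:

import os

from opentelemetry.exporter.otlp.proto.grpc.trace_exporter import OTLPSpanExporter

exporter = OTLPSpanExporter(
    endpoint="https://otel.patronus.ai:4317",  # assumed; supplied via config.endpoint in the package
    headers={
        "x-api-key": os.environ["PATRONUS_API_KEY"],
        "pat-project-name": "my-project",  # hypothetical project name
    },
)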
aiq/tool/mcp/mcp_client.py
CHANGED
@@ -45,6 +45,7 @@ def model_from_mcp_schema(name: str, mcp_input_schema: dict) -> type[BaseModel]:
     }
 
     properties = mcp_input_schema.get("properties", {})
+    required_fields = set(mcp_input_schema.get("required", []))
     schema_dict = {}
 
     def _generate_valid_classname(class_name: str):
@@ -70,7 +71,17 @@ def model_from_mcp_schema(name: str, mcp_input_schema: dict) -> type[BaseModel]:
         else:
             field_type = _type_map.get(json_type, Any)
 
-
+        # Determine the default value based on whether the field is required
+        if field_name in required_fields:
+            # Field is required - use explicit default if provided, otherwise make it required
+            default_value = field_properties.get("default", ...)
+        else:
+            # Field is optional - use explicit default if provided, otherwise None
+            default_value = field_properties.get("default", None)
+            # Make the type optional if no default was provided
+            if "default" not in field_properties:
+                field_type = field_type | None
+
         nullable = field_properties.get("nullable", False)
         description = field_properties.get("description", "")
 
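The upshot for generated models: fields listed under the schema's "required" key stay required (pydantic's ... sentinel), while the rest become optional with a None default and a widened annotation. A minimal sketch of the same logic using pydantic's create_model; the input schema here is illustrative:

from typing import Any

from pydantic import Field, create_model

schema = {
    "properties": {
        "query": {"type": "string", "description": "Search text"},
        "limit": {"type": "integer", "description": "Max results"},
    },
    "required": ["query"],
}

_type_map = {"string": str, "integer": int}
required_fields = set(schema.get("required", []))
fields: dict[str, tuple] = {}
for name, props in schema["properties"].items():
    field_type = _type_map.get(props.get("type"), Any)
    if name in required_fields:
        default_value = props.get("default", ...)   # required unless the schema provides a default
    else:
        default_value = props.get("default", None)  # optional fields fall back to None
        if "default" not in props:
            field_type = field_type | None          # widen the annotation to allow None
    fields[name] = (field_type, Field(default=default_value, description=props.get("description", "")))

ToolInput = create_model("ToolInput", **fields)
ToolInput(query="hello")  # ok: limit defaults to None
# ToolInput() would raise a ValidationError because "query" is required.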
{aiqtoolkit-1.2.0a20250612.dist-info → aiqtoolkit-1.2.0a20250613.dist-info}/RECORD
RENAMED
@@ -115,9 +115,10 @@ aiq/eval/runtime_event_subscriber.py,sha256=2VM8MqmPc_EWPxxrDDR9naiioZirkJUfGwzb
 aiq/eval/dataset_handler/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
 aiq/eval/dataset_handler/dataset_downloader.py,sha256=Zvfbd-fPOhB9n8ZiCBaBKW0y-5v97mQAy3dkBL0OFZ0,4553
 aiq/eval/dataset_handler/dataset_filter.py,sha256=mop6wa4P_QtQ5QkfXv-hVBm3EMerfNECSTJGGDB1YWE,2115
-aiq/eval/dataset_handler/dataset_handler.py,sha256=
+aiq/eval/dataset_handler/dataset_handler.py,sha256=z4trKYPnqSrLvsKctU9d5WrQW7ddbZZx0zOrYVLqbAA,7847
 aiq/eval/evaluator/__init__.py,sha256=GUJrgGtpvyMUCjUBvR3faAdv-tZzbU9W-izgx9aMEQg,680
-aiq/eval/evaluator/
+aiq/eval/evaluator/base_evaluator.py,sha256=5kqOcTYNecnh9us_XvV58pj5tZI82NGkVN4tg9-R_ZE,3040
+aiq/eval/evaluator/evaluator_model.py,sha256=5cxe3mqznlNGzv29v_VseYU7OzoT1eTf7hgSPQxytsM,1440
 aiq/eval/rag_evaluator/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
 aiq/eval/rag_evaluator/evaluate.py,sha256=lEjXKiuELAHyWckz-bM91dZ6AT2J6NC7SfvtedR-Qdk,6548
 aiq/eval/rag_evaluator/register.py,sha256=2NzxkgqyoZ4wC8ARj3tiVoE8ENCmplBCIKrNOFh6_VI,5642
@@ -125,10 +126,10 @@ aiq/eval/swe_bench_evaluator/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NM
 aiq/eval/swe_bench_evaluator/evaluate.py,sha256=kNukRruq1EM1RsGLvpVuC22xcP0gpn9acF3edGak9vY,9858
 aiq/eval/swe_bench_evaluator/register.py,sha256=sTb74F7w4iuI0ROsEJ4bV13Nt1GEWQn7UvO2O0HXwXk,1537
 aiq/eval/trajectory_evaluator/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
-aiq/eval/trajectory_evaluator/evaluate.py,sha256=
+aiq/eval/trajectory_evaluator/evaluate.py,sha256=Y51KMhJ9t8AoYWrQlrwipc2CtgIXA9IUGZTbKegtsnw,3257
 aiq/eval/trajectory_evaluator/register.py,sha256=kktT4fu5_1Cou-iohD3YhQevsWiR3TA5NpFSweVz0eQ,1709
 aiq/eval/tunable_rag_evaluator/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
-aiq/eval/tunable_rag_evaluator/evaluate.py,sha256=
+aiq/eval/tunable_rag_evaluator/evaluate.py,sha256=lZxQDhvcAu0JR1RApkbs-G3T9pUOSfh822TYGp7vrQw,11440
 aiq/eval/tunable_rag_evaluator/register.py,sha256=uV36xONVxQW8qBO_bsvbvZk4-J4IhowxiRKErnYsbzA,2369
 aiq/eval/utils/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
 aiq/eval/utils/output_uploader.py,sha256=SaQbZPkw-Q0H7t5yG60Kh-p1cflR7gPklVkilC4uPbU,5141
@@ -175,7 +176,7 @@ aiq/meta/module_to_distro.json,sha256=1XV7edobFrdDKvsSoynfodXg_hczUWpDrQzGkW9qqE
 aiq/meta/pypi.md,sha256=N1fvWaio3KhnAw9yigeM-oWaLuT5i_C7U_2UVzyPbks,4386
 aiq/observability/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
 aiq/observability/async_otel_listener.py,sha256=2Ye9bkHfAssuxFS_ECyRyl-bTa73yYvsPyO4BaK5Beg,19662
-aiq/observability/register.py,sha256=
+aiq/observability/register.py,sha256=mejMBVr3dHHfShIiyn1fIbA0Gb6z9Ayg8WRMgB0wf5E,7646
 aiq/plugins/.namespace,sha256=Gace0pOC3ETEJf-TBVuNw0TQV6J_KtOPpEiSzMH-odo,215
 aiq/profiler/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
 aiq/profiler/data_frame_row.py,sha256=vudqk1ZzZtlZln2Ir43mPl3nwNc0pQlhwbtdY9oSKtI,1755
@@ -277,7 +278,7 @@ aiq/tool/github_tools/get_github_issue.py,sha256=vwLNkNOszLlymkQju0cR8BNvfdH4Enm
 aiq/tool/github_tools/get_github_pr.py,sha256=b7eCOqrVoejGjRwmUVdU45uF07ihbY8lRacMYOSgMrY,9716
 aiq/tool/github_tools/update_github_issue.py,sha256=TUElxUuzjZr_QldL_48RcqSx0A9b23NB_lA82QwFjkM,4103
 aiq/tool/mcp/__init__.py,sha256=GUJrgGtpvyMUCjUBvR3faAdv-tZzbU9W-izgx9aMEQg,680
-aiq/tool/mcp/mcp_client.py,sha256=
+aiq/tool/mcp/mcp_client.py,sha256=lYbf669ATqGKkL0jjd76r0aAtAFnWeruWw-lOPsmYu8,8103
 aiq/tool/mcp/mcp_tool.py,sha256=rQQcaCT-GHQcDmG5weX-2Y-HxBPX-0cC73LjL1u0FUU,4009
 aiq/tool/memory_tools/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
 aiq/tool/memory_tools/add_memory_tool.py,sha256=9EjB3DpYhxwasz7o3O8Rq__Ys5986fciv44ahC6mVCo,3349
@@ -308,10 +309,10 @@ aiq/utils/reactive/base/observer_base.py,sha256=UAlyAY_ky4q2t0P81RVFo2Bs_R7z5Nde
 aiq/utils/reactive/base/subject_base.py,sha256=Ed-AC6P7cT3qkW1EXjzbd5M9WpVoeN_9KCe3OM3FLU4,2521
 aiq/utils/settings/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
 aiq/utils/settings/global_settings.py,sha256=U9TCLdoZsKq5qOVGjREipGVv9e-FlStzqy5zv82_VYk,7454
-aiqtoolkit-1.2.
-aiqtoolkit-1.2.
-aiqtoolkit-1.2.
-aiqtoolkit-1.2.
-aiqtoolkit-1.2.
-aiqtoolkit-1.2.
-aiqtoolkit-1.2.
+aiqtoolkit-1.2.0a20250613.dist-info/licenses/LICENSE-3rd-party.txt,sha256=8o7aySJa9CBvFshPcsRdJbczzdNyDGJ8b0J67WRUQ2k,183936
+aiqtoolkit-1.2.0a20250613.dist-info/licenses/LICENSE.md,sha256=QwcOLU5TJoTeUhuIXzhdCEEDDvorGiC6-3YTOl4TecE,11356
+aiqtoolkit-1.2.0a20250613.dist-info/METADATA,sha256=0LV-fg4UXDznF9C1ojoVD1qrvT1Spoc0w7duaBn_QVI,20274
+aiqtoolkit-1.2.0a20250613.dist-info/WHEEL,sha256=_zCd3N1l69ArxyTb8rzEoP9TpbYXkqRFSNOD5OuxnTs,91
+aiqtoolkit-1.2.0a20250613.dist-info/entry_points.txt,sha256=gRlPfR5g21t328WNEQ4CcEz80S1sJNS8A7rMDYnzl4A,452
+aiqtoolkit-1.2.0a20250613.dist-info/top_level.txt,sha256=fo7AzYcNhZ_tRWrhGumtxwnxMew4xrT1iwouDy_f0Kc,4
+aiqtoolkit-1.2.0a20250613.dist-info/RECORD,,
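Each RECORD row has the form path,sha256=<digest>,size, where the digest is the urlsafe-base64-encoded SHA-256 of the file with trailing "=" padding stripped (the wheel RECORD format from PEP 376/427). A quick sketch for verifying one of the hashes above against a local file:

import base64
import hashlib

def record_entry(path: str, data: bytes) -> str:
    # urlsafe base64 of the sha256 digest, '=' padding stripped
    digest = base64.urlsafe_b64encode(hashlib.sha256(data).digest()).rstrip(b"=").decode("ascii")
    return f"{path},sha256={digest},{len(data)}"

# e.g. record_entry("aiq/eval/evaluator/base_evaluator.py", open(local_path, "rb").read())
# should reproduce the corresponding RECORD line.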
{aiqtoolkit-1.2.0a20250612.dist-info → aiqtoolkit-1.2.0a20250613.dist-info}/WHEEL
RENAMED
File without changes
{aiqtoolkit-1.2.0a20250612.dist-info → aiqtoolkit-1.2.0a20250613.dist-info}/entry_points.txt
RENAMED
File without changes
{aiqtoolkit-1.2.0a20250612.dist-info → aiqtoolkit-1.2.0a20250613.dist-info}/licenses/LICENSE-3rd-party.txt
RENAMED
File without changes
{aiqtoolkit-1.2.0a20250612.dist-info → aiqtoolkit-1.2.0a20250613.dist-info}/licenses/LICENSE.md
RENAMED
File without changes
{aiqtoolkit-1.2.0a20250612.dist-info → aiqtoolkit-1.2.0a20250613.dist-info}/top_level.txt
RENAMED
File without changes