aiqtoolkit 1.2.0a20250616__py3-none-any.whl → 1.2.0a20250617__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- aiq/builder/function.py +12 -8
- aiq/builder/workflow_builder.py +1 -1
- aiq/data_models/evaluate.py +19 -0
- aiq/eval/evaluate.py +63 -5
- aiq/eval/tunable_rag_evaluator/evaluate.py +121 -168
- aiq/profiler/inference_optimization/bottleneck_analysis/nested_stack_analysis.py +8 -0
- {aiqtoolkit-1.2.0a20250616.dist-info → aiqtoolkit-1.2.0a20250617.dist-info}/METADATA +1 -1
- {aiqtoolkit-1.2.0a20250616.dist-info → aiqtoolkit-1.2.0a20250617.dist-info}/RECORD +13 -13
- {aiqtoolkit-1.2.0a20250616.dist-info → aiqtoolkit-1.2.0a20250617.dist-info}/WHEEL +0 -0
- {aiqtoolkit-1.2.0a20250616.dist-info → aiqtoolkit-1.2.0a20250617.dist-info}/entry_points.txt +0 -0
- {aiqtoolkit-1.2.0a20250616.dist-info → aiqtoolkit-1.2.0a20250617.dist-info}/licenses/LICENSE-3rd-party.txt +0 -0
- {aiqtoolkit-1.2.0a20250616.dist-info → aiqtoolkit-1.2.0a20250617.dist-info}/licenses/LICENSE.md +0 -0
- {aiqtoolkit-1.2.0a20250616.dist-info → aiqtoolkit-1.2.0a20250617.dist-info}/top_level.txt +0 -0
aiq/builder/function.py
CHANGED
@@ -48,7 +48,8 @@ class Function(FunctionBase[InputT, StreamingOutputT, SingleOutputT], ABC):
                  input_schema: type[BaseModel] | None = None,
                  streaming_output_schema: type[BaseModel] | type[None] | None = None,
                  single_output_schema: type[BaseModel] | type[None] | None = None,
-                 converters: list[Callable[[typing.Any], typing.Any]] | None = None
+                 converters: list[Callable[[typing.Any], typing.Any]] | None = None,
+                 instance_name: str | None = None):
 
         super().__init__(input_schema=input_schema,
                          streaming_output_schema=streaming_output_schema,
@@ -57,6 +58,7 @@ class Function(FunctionBase[InputT, StreamingOutputT, SingleOutputT], ABC):
 
         self.config = config
         self.description = description
+        self.instance_name = instance_name or config.type
         self._context = AIQContext.get()
 
     def convert(self, value: typing.Any, to_type: type[_T]) -> _T:
@@ -110,7 +112,7 @@ class Function(FunctionBase[InputT, StreamingOutputT, SingleOutputT], ABC):
             The output of the function optionally converted to the specified type.
         """
 
-        with self._context.push_active_function(self.
+        with self._context.push_active_function(self.instance_name,
                                                 input_data=value) as manager:  # Set the current invocation context
             try:
                 converted_input: InputT = self._convert_input(value)  # type: ignore
@@ -254,17 +256,17 @@ class Function(FunctionBase[InputT, StreamingOutputT, SingleOutputT], ABC):
 
 class LambdaFunction(Function[InputT, StreamingOutputT, SingleOutputT]):
 
-    def __init__(self, *, config: FunctionBaseConfig, info: FunctionInfo):
+    def __init__(self, *, config: FunctionBaseConfig, info: FunctionInfo, instance_name: str | None = None):
 
         super().__init__(config=config,
                          description=info.description,
                          input_schema=info.input_schema,
                          streaming_output_schema=info.stream_output_schema,
                          single_output_schema=info.single_output_schema,
-                         converters=info.converters
+                         converters=info.converters,
+                         instance_name=instance_name)
 
         self._info = info
-
         self._ainvoke_fn: _InvokeFnT = info.single_fn
         self._astream_fn: _StreamFnT = info.stream_fn
 
@@ -284,8 +286,10 @@ class LambdaFunction(Function[InputT, StreamingOutputT, SingleOutputT]):
             yield x
 
     @staticmethod
-    def from_info(*,
-
+    def from_info(*,
+                  config: FunctionBaseConfig,
+                  info: FunctionInfo,
+                  instance_name: str | None = None) -> 'LambdaFunction[InputT, StreamingOutputT, SingleOutputT]':
 
         input_type: type = info.input_type
         streaming_output_type = info.stream_output_type
@@ -294,4 +298,4 @@ class LambdaFunction(Function[InputT, StreamingOutputT, SingleOutputT]):
     class FunctionImpl(LambdaFunction[input_type, streaming_output_type, single_output_type]):
         pass
 
-    return FunctionImpl(config=config, info=info)
+    return FunctionImpl(config=config, info=info, instance_name=instance_name)
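The change above threads an `instance_name` through `Function`, `LambdaFunction`, and `LambdaFunction.from_info()`, falling back to `config.type` when no name is supplied, so the active-function context is pushed under a per-instance name rather than the shared type name. A minimal sketch of that fallback behavior, using hypothetical stand-in classes rather than the real `FunctionBaseConfig`/`FunctionInfo`:

from dataclasses import dataclass


@dataclass
class StubConfig:
    # Stand-in for FunctionBaseConfig; only the `type` field used by the fallback is modeled here.
    type: str


class StubFunction:
    def __init__(self, config: StubConfig, instance_name: str | None = None):
        # Mirrors the new Function.__init__ behavior: prefer the explicit instance
        # name, otherwise fall back to the registered config type.
        self.instance_name = instance_name or config.type


# Two instances of the same function type can now be told apart by name.
print(StubFunction(StubConfig(type="web_search")).instance_name)                            # web_search
print(StubFunction(StubConfig(type="web_search"), instance_name="search_a").instance_name)  # search_a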
aiq/builder/workflow_builder.py
CHANGED
@@ -333,7 +333,7 @@ class WorkflowBuilder(Builder, AbstractAsyncContextManager):
 
         if (isinstance(build_result, FunctionInfo)):
            # Create the function object
-            build_result = LambdaFunction.from_info(config=config, info=build_result)
+            build_result = LambdaFunction.from_info(config=config, info=build_result, instance_name=name)
 
         if (not isinstance(build_result, Function)):
            raise ValueError("Expected a function, FunctionInfo object, or FunctionBase object to be "
aiq/data_models/evaluate.py
CHANGED
@@ -14,6 +14,7 @@
 # limitations under the License.
 
 import typing
+from enum import Enum
 from pathlib import Path
 
 from pydantic import BaseModel
@@ -28,6 +29,12 @@ from aiq.data_models.intermediate_step import IntermediateStepType
 from aiq.data_models.profiler import ProfilerConfig
 
 
+class JobEvictionPolicy(str, Enum):
+    """Policy for evicting old jobs when max_jobs is exceeded."""
+    TIME_CREATED = "time_created"
+    TIME_MODIFIED = "time_modified"
+
+
 class EvalCustomScriptConfig(BaseModel):
     # Path to the script to run
     script: Path
@@ -35,6 +42,16 @@ class EvalCustomScriptConfig(BaseModel):
     kwargs: dict[str, str] = {}
 
 
+class JobManagementConfig(BaseModel):
+    # Whether to append a unique job ID to the output directory for each run
+    append_job_id_to_output_dir: bool = False
+    # Maximum number of jobs to keep in the output directory. Oldest jobs will be evicted.
+    # A value of 0 means no limit.
+    max_jobs: int = 0
+    # Policy for evicting old jobs. Defaults to using time_created.
+    eviction_policy: JobEvictionPolicy = JobEvictionPolicy.TIME_CREATED
+
+
 class EvalOutputConfig(BaseModel):
     # Output directory for the workflow and evaluation results
     dir: Path = Path("/tmp/aiq/examples/default/")
@@ -46,6 +63,8 @@ class EvalOutputConfig(BaseModel):
     s3: EvalS3Config | None = None
     # Whether to cleanup the output directory before running the workflow
     cleanup: bool = True
+    # Job management configuration (job id, eviction, etc.)
+    job_management: JobManagementConfig = JobManagementConfig()
     # Filter for the workflow output steps
     workflow_output_step_filter: list[IntermediateStepType] | None = None
 
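The new `JobEvictionPolicy` enum and `JobManagementConfig` model control per-run job directories and how old runs are evicted. A small sketch of how the added fields compose, using condensed stand-ins for the real pydantic models in aiq/data_models/evaluate.py (the real `EvalOutputConfig` carries additional fields not repeated here):

from enum import Enum

from pydantic import BaseModel


class JobEvictionPolicy(str, Enum):
    TIME_CREATED = "time_created"
    TIME_MODIFIED = "time_modified"


class JobManagementConfig(BaseModel):
    append_job_id_to_output_dir: bool = False
    max_jobs: int = 0  # 0 disables eviction
    eviction_policy: JobEvictionPolicy = JobEvictionPolicy.TIME_CREATED


# Keep only the five most recently modified job directories and give each run
# its own job-id subdirectory under <output dir>/jobs/.
job_cfg = JobManagementConfig(append_job_id_to_output_dir=True,
                              max_jobs=5,
                              eviction_policy=JobEvictionPolicy.TIME_MODIFIED)
print(job_cfg)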
aiq/eval/evaluate.py
CHANGED
@@ -18,11 +18,13 @@ import logging
 import shutil
 from pathlib import Path
 from typing import Any
+from uuid import uuid4
 
 from pydantic import BaseModel
 from tqdm import tqdm
 
 from aiq.data_models.evaluate import EvalConfig
+from aiq.data_models.evaluate import JobEvictionPolicy
 from aiq.eval.config import EvaluationRunConfig
 from aiq.eval.config import EvaluationRunOutput
 from aiq.eval.dataset_handler.dataset_handler import DatasetHandler
@@ -178,10 +180,60 @@ class EvaluationRun:  # pylint: disable=too-many-public-methods
 
     def cleanup_output_directory(self):
         '''Remove contents of the output directory if it exists'''
-
-
-
-
+        output_config = self.eval_config.general.output
+        output_dir = output_config.dir
+
+        if not (output_config and output_dir.exists()):
+            return
+
+        # If cleanup is true, remove the entire directory and we are done
+        if output_config.cleanup:
+            logger.info("Cleaning up entire output directory: %s", output_config.dir)
+            shutil.rmtree(output_config.dir)
+            return
+
+        if output_config.job_management.max_jobs == 0:
+            # No eviction policy
+            return
+
+        base_dir = output_dir / "jobs"
+        if not base_dir.exists():
+            return
+
+        # Get all subdirectories, which represent individual job runs
+        job_dirs = [d for d in base_dir.iterdir() if d.is_dir()]
+        if len(job_dirs) <= output_config.job_management.max_jobs:
+            return
+
+        # Determine sort key based on eviction_policy, defaulting to creation time
+        if output_config.job_management.eviction_policy == JobEvictionPolicy.TIME_MODIFIED:
+
+            def sort_key(x):
+                return x.stat().st_mtime
+
+            logger.info("Using last modified time for job eviction policy.")
+        else:
+
+            def sort_key(x):
+                return x.stat().st_ctime
+
+            logger.info("Using creation time for job eviction policy.")
+
+        # Sort directories (oldest first)
+        job_dirs.sort(key=sort_key)
+        num_to_delete = len(job_dirs) - output_config.job_management.max_jobs
+
+        logger.info("Found %d jobs, exceeding limit of %d. Removing %d oldest jobs.",
+                    len(job_dirs),
+                    output_config.job_management.max_jobs,
+                    num_to_delete)
+
+        for dir_to_delete in job_dirs[:num_to_delete]:
+            try:
+                logger.info("Deleting old job directory: %s", dir_to_delete)
+                shutil.rmtree(dir_to_delete)
+            except Exception as e:
+                logger.exception("Failed to delete old job directory: %s: %s", dir_to_delete, e, exc_info=True)
 
     def write_output(self, dataset_handler: DatasetHandler):
         workflow_output_file = self.eval_config.general.output_dir / "workflow_output.json"
@@ -272,9 +324,15 @@ class EvaluationRun:  # pylint: disable=too-many-public-methods
         logger.debug("Loaded evaluation configuration: %s", self.eval_config)
 
         # Cleanup the output directory
-        if self.eval_config.general.output
+        if self.eval_config.general.output:
             self.cleanup_output_directory()
 
+        # Generate a job_id if append_job_id_to_output_dir is enabled and no job_id provided
+        if (self.eval_config.general.output
+                and self.eval_config.general.output.job_management.append_job_id_to_output_dir and not job_id):
+            job_id = "job_" + str(uuid4())
+            logger.info("Generated job ID for output directory: %s", job_id)
+
         # If a job id is provided keep the data per-job
         if job_id:
             self.eval_config.general.output_dir = self.eval_config.general.output_dir / f"jobs/{job_id}"
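The reworked `cleanup_output_directory()` either wipes the whole output directory (when `cleanup` is true) or, when `max_jobs` is set, evicts the oldest job subdirectories under `<output dir>/jobs` according to the configured eviction policy; `run()` now also generates a `job_<uuid4>` identifier when `append_job_id_to_output_dir` is enabled and no job id was supplied. A standalone sketch of just the eviction step, assuming the jobs path and limits are passed in directly (the real method reads them from `eval_config.general.output`):

import shutil
from pathlib import Path


def evict_old_jobs(jobs_dir: Path, max_jobs: int, use_mtime: bool = False) -> None:
    """Keep at most `max_jobs` job directories under jobs_dir, deleting the oldest first."""
    if max_jobs == 0 or not jobs_dir.exists():
        return  # max_jobs == 0 disables eviction, matching the new config default

    job_dirs = [d for d in jobs_dir.iterdir() if d.is_dir()]
    if len(job_dirs) <= max_jobs:
        return

    # TIME_MODIFIED sorts on st_mtime, TIME_CREATED on st_ctime; oldest first
    job_dirs.sort(key=lambda d: d.stat().st_mtime if use_mtime else d.stat().st_ctime)

    for old_dir in job_dirs[:len(job_dirs) - max_jobs]:
        shutil.rmtree(old_dir)


# Example: keep the five newest runs under the default output directory
evict_old_jobs(Path("/tmp/aiq/examples/default/jobs"), max_jobs=5)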
aiq/eval/tunable_rag_evaluator/evaluate.py
CHANGED
@@ -25,11 +25,9 @@ from langchain_core.language_models import BaseChatModel
 from langchain_core.runnables import RunnableLambda
 from tqdm import tqdm
 
-from aiq.eval.evaluator.
+from aiq.eval.evaluator.base_evaluator import BaseEvaluator
 from aiq.eval.evaluator.evaluator_model import EvalInputItem
-from aiq.eval.evaluator.evaluator_model import EvalOutput
 from aiq.eval.evaluator.evaluator_model import EvalOutputItem
-from aiq.eval.utils.tqdm_position_registry import TqdmPositionRegistry
 
 logger = logging.getLogger(__name__)
 
@@ -96,7 +94,7 @@ def runnable_with_retries(original_fn: Callable, llm_retry_control_params: dict
     )
 
 
-class TunableRagEvaluator:
+class TunableRagEvaluator(BaseEvaluator):
     '''Tunable RAG evaluator class with customizable LLM prompt for scoring.'''
 
     def __init__(self,
@@ -106,187 +104,142 @@ class TunableRagEvaluator:
                  max_concurrency: int,
                  default_scoring: bool,
                  default_score_weights: dict):
+        super().__init__(max_concurrency=max_concurrency, tqdm_desc="Evaluating RAG")
         self.llm = llm
-        self.max_concurrency = max_concurrency
         self.judge_llm_prompt = judge_llm_prompt
         self.llm_retry_control_params = llm_retry_control_params
-        self.semaphore = asyncio.Semaphore(self.max_concurrency)
         self.default_scoring = default_scoring
         # Use user-provided weights if available; otherwise, set equal weights for each score
         self.default_score_weights = default_score_weights if default_score_weights else {
             "coverage": 1 / 3, "correctness": 1 / 3, "relevance": 1 / 3
         }
 
-    async def
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-                name="reasoning",
-                description=
-                "1-2 summarized sentences of reasoning for the scores. Ex. 'The generated answer covers all critical aspects mentioned in the expected answer, is correct, and is relevant to the question.'",
-                type="string"),
-        ]
-
-        custom_evaluation_schema = [
-            ResponseSchema(name="score", description="Score for the generated answer. Ex. 0.5", type="float"),
-            ResponseSchema(
-                name="reasoning",
-                description=
-                "1-2 sentence reasoning for the score. Ex. 'The generated answer is exactly the same as the description of the expected answer.'",
-                type="string"),
-        ]
+    async def evaluate_item(self, item: EvalInputItem) -> EvalOutputItem:
+        """Compute RAG evaluation for an individual item and return EvalOutputItem"""
+        question = item.input_obj
+        answer_description = item.expected_output_obj
+        generated_answer = item.output_obj
+
+        # Call judge LLM to generate score
+        score = 0.0
+
+        default_evaluation_schema = [
+            ResponseSchema(
+                name="coverage_score",
+                description="Score for the coverage of all critical aspects mentioned in the expected answer. Ex. 0.5",
+                type="float"),
+            ResponseSchema(
+                name="correctness_score",
+                description="Score for the accuracy of the generated answer compared to the expected answer. Ex. 0.5",
+                type="float"),
+            ResponseSchema(name="relevance_score",
+                           description="Score for the relevance of the generated answer to the question. Ex. 0.5",
+                           type="float"),
+            ResponseSchema(
+                name="reasoning",
+                description=
+                "1-2 summarized sentences of reasoning for the scores. Ex. 'The generated answer covers all critical aspects mentioned in the expected answer, is correct, and is relevant to the question.'",
+                type="string"),
+        ]
 
-
-
-
-
+        custom_evaluation_schema = [
+            ResponseSchema(name="score", description="Score for the generated answer. Ex. 0.5", type="float"),
+            ResponseSchema(
+                name="reasoning",
+                description=
+                "1-2 sentence reasoning for the score. Ex. 'The generated answer is exactly the same as the description of the expected answer.'",
+                type="string"),
+        ]
 
-
-
+        if self.default_scoring:
+            evaluation_schema = default_evaluation_schema
+        else:
+            evaluation_schema = custom_evaluation_schema
 
-
-
-            answer_description=answer_description,
-            generated_answer=generated_answer,
-            format_instructions=format_instructions,
-            default_scoring=self.default_scoring)
+        llm_input_response_parser = StructuredOutputParser.from_response_schemas(evaluation_schema)
+        format_instructions = llm_input_response_parser.get_format_instructions()
 
-
-
-
+        eval_prompt = evaluation_prompt(judge_llm_prompt=self.judge_llm_prompt,
+                                        question=question,
+                                        answer_description=answer_description,
+                                        generated_answer=generated_answer,
+                                        format_instructions=format_instructions,
+                                        default_scoring=self.default_scoring)
 
-
+        messages = [SystemMessage(content="You must respond only in JSON format."), HumanMessage(content=eval_prompt)]
 
-
-        coverage_score = 0.0
-        correctness_score = 0.0
-        relevance_score = 0.0
-        reasoning = "Error in evaluator from parsing judge LLM response."
+        response = await runnable_with_retries(self.llm.ainvoke, self.llm_retry_control_params).ainvoke(messages)
 
-
-
-
-
-
-                correctness_score = parsed_response["correctness_score"]
-                relevance_score = parsed_response["relevance_score"]
-                reasoning = parsed_response["reasoning"]
-            except KeyError as e:
-                logger.error("Missing required keys in default scoring response: %s",
-                             ", ".join(str(arg) for arg in e.args))
-                reasoning = f"Error in evaluator from parsing judge LLM response. Missing required key(s): {', '.join(str(arg) for arg in e.args)}"
-
-            coverage_weight = self.default_score_weights.get("coverage", 1 / 3)
-            correctness_weight = self.default_score_weights.get("correctness", 1 / 3)
-            relevance_weight = self.default_score_weights.get("relevance", 1 / 3)
-
-            # Calculate score
-            total_weight = coverage_weight + correctness_weight + relevance_weight
-            coverage_weight = coverage_weight / total_weight
-            correctness_weight = correctness_weight / total_weight
-            relevance_weight = relevance_weight / total_weight
-
-            if round(coverage_weight + correctness_weight + relevance_weight, 2) != 1:
-                logger.warning("The sum of the default score weights is not 1. The weights will be normalized.")
-                coverage_weight = coverage_weight / (coverage_weight + correctness_weight + relevance_weight)
-                correctness_weight = correctness_weight / (coverage_weight + correctness_weight +
-                                                           relevance_weight)
-                relevance_weight = relevance_weight / (coverage_weight + correctness_weight + relevance_weight)
-
-            score = (coverage_weight * coverage_score + correctness_weight * correctness_score +
-                     relevance_weight * relevance_score)
-
-        else:
-            try:
-                score = parsed_response["score"]
-                reasoning = parsed_response["reasoning"]
-            except KeyError as e:
-                logger.error("Missing required keys in custom scoring response: %s",
-                             ", ".join(str(arg) for arg in e.args))
-                reasoning = f"Error in evaluator from parsing judge LLM response. Missing required key(s): {', '.join(str(arg) for arg in e.args)}"
-                raise
-        except (KeyError, ValueError) as e:
-            logger.error("Error parsing judge LLM response: %s", e)
-            score = 0.0
-            reasoning = "Error in evaluator from parsing judge LLM response."
+        # Initialize default values to handle service errors
+        coverage_score = 0.0
+        correctness_score = 0.0
+        relevance_score = 0.0
+        reasoning = "Error in evaluator from parsing judge LLM response."
 
+        try:
+            parsed_response = llm_input_response_parser.parse(response.content)
            if self.default_scoring:
-
-                "
-                "
-                "
-                "
-
-
-
-            }
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-            # Update the progress bar
-            pbar.update(1)
-            return result
+                try:
+                    coverage_score = parsed_response["coverage_score"]
+                    correctness_score = parsed_response["correctness_score"]
+                    relevance_score = parsed_response["relevance_score"]
+                    reasoning = parsed_response["reasoning"]
+                except KeyError as e:
+                    logger.error("Missing required keys in default scoring response: %s",
+                                 ", ".join(str(arg) for arg in e.args))
+                    reasoning = f"Error in evaluator from parsing judge LLM response. Missing required key(s): {', '.join(str(arg) for arg in e.args)}"
+
+                coverage_weight = self.default_score_weights.get("coverage", 1 / 3)
+                correctness_weight = self.default_score_weights.get("correctness", 1 / 3)
+                relevance_weight = self.default_score_weights.get("relevance", 1 / 3)
+
+                # Calculate score
+                total_weight = coverage_weight + correctness_weight + relevance_weight
+                coverage_weight = coverage_weight / total_weight
+                correctness_weight = correctness_weight / total_weight
+                relevance_weight = relevance_weight / total_weight
+
+                if round(coverage_weight + correctness_weight + relevance_weight, 2) != 1:
+                    logger.warning("The sum of the default score weights is not 1. The weights will be normalized.")
+                    coverage_weight = coverage_weight / (coverage_weight + correctness_weight + relevance_weight)
+                    correctness_weight = correctness_weight / (coverage_weight + correctness_weight + relevance_weight)
+                    relevance_weight = relevance_weight / (coverage_weight + correctness_weight + relevance_weight)
+
+                score = (coverage_weight * coverage_score + correctness_weight * correctness_score +
+                         relevance_weight * relevance_score)
 
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-            # Compute average score
-            avg_score = round(sum(sample_scores) / len(sample_scores), 2) if sample_scores else 0.0
-
-            # Construct EvalOutputItems
-            eval_output_items = [
-                EvalOutputItem(id=item.id, score=score, reasoning=reasoning)
-                for item, score, reasoning in zip(eval_input.eval_input_items, sample_scores, sample_reasonings)
-            ]
+            else:
+                try:
+                    score = parsed_response["score"]
+                    reasoning = parsed_response["reasoning"]
+                except KeyError as e:
+                    logger.error("Missing required keys in custom scoring response: %s",
+                                 ", ".join(str(arg) for arg in e.args))
+                    reasoning = f"Error in evaluator from parsing judge LLM response. Missing required key(s): {', '.join(str(arg) for arg in e.args)}"
+                    raise
+        except (KeyError, ValueError) as e:
+            logger.error("Error parsing judge LLM response: %s", e)
+            score = 0.0
+            reasoning = "Error in evaluator from parsing judge LLM response."
 
-
+        if self.default_scoring:
+            reasoning = {
+                "question": question,
+                "answer_description": answer_description,
+                "generated_answer": generated_answer,
+                "score_breakdown": {
+                    "coverage_score": coverage_score,
+                    "correctness_score": correctness_score,
+                    "relevance_score": relevance_score,
+                },
+                "reasoning": reasoning,
+            }
+        else:
+            reasoning = {
+                "question": question,
+                "answer_description": answer_description,
+                "generated_answer": generated_answer,
+                "reasoning": reasoning
+            }
+
+        return EvalOutputItem(id=item.id, score=score, reasoning=reasoning)
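With the move onto `BaseEvaluator`, the evaluator no longer manages its own semaphore, progress bar, or batch loop; it only implements `evaluate_item()` and lets the base class handle concurrency. The default-scoring path parses the judge LLM's JSON and combines the three sub-scores with normalized weights. A standalone sketch of just that arithmetic, assuming the scores have already been parsed from the judge response (the real code also carries the reasoning text and score breakdown into the output item):

def weighted_rag_score(scores: dict[str, float], weights: dict[str, float] | None = None) -> float:
    """Combine coverage/correctness/relevance into a single score, as in default scoring."""
    # Equal weights by default, mirroring the evaluator's fallback of 1/3 each
    weights = weights or {"coverage": 1 / 3, "correctness": 1 / 3, "relevance": 1 / 3}
    cov_w = weights.get("coverage", 1 / 3)
    cor_w = weights.get("correctness", 1 / 3)
    rel_w = weights.get("relevance", 1 / 3)

    # Normalize so the weights always sum to 1
    total = cov_w + cor_w + rel_w
    cov_w, cor_w, rel_w = cov_w / total, cor_w / total, rel_w / total

    return (cov_w * scores["coverage_score"] + cor_w * scores["correctness_score"] +
            rel_w * scores["relevance_score"])


# Example judge output: full coverage, mostly correct, fully relevant
print(weighted_rag_score({"coverage_score": 1.0, "correctness_score": 0.8, "relevance_score": 1.0}))
# ~0.933 with equal weights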
aiq/profiler/inference_optimization/bottleneck_analysis/nested_stack_analysis.py
CHANGED
@@ -69,13 +69,20 @@ def build_call_tree_for_example(example_df: pd.DataFrame) -> list[CallNode]:
             return "LLM"
         if evt.startswith("TOOL_"):
             return "TOOL"
+        if evt.startswith("FUNCTION_"):
+            return "FUNCTION"
+        if evt.startswith("SPAN_"):
+            return "FUNCTION"
         return None
 
     def get_op_name(row: pd.Series, op_type: str) -> str:
         if op_type == "LLM":
             return row.get("llm_name") or "unknown_llm"
+        if op_type == "FUNCTION":
+            return row.get("function_name") or "unknown_function"
         if op_type == "TOOL":
             return row.get("tool_name") or "unknown_tool"
+
         return "unknown_op"
 
     for _, row in example_df.iterrows():
@@ -309,6 +316,7 @@ def save_gantt_chart(all_nodes: list[CallNode], output_path: str) -> None:
     color_map = {
         "LLM": "tab:blue",
         "TOOL": "tab:green",
+        "FUNCTION": "tab:orange",
     }
     default_color = "tab:gray"
 
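The profiler's bottleneck analysis now recognizes function-level events: `FUNCTION_*` and `SPAN_*` intermediate steps are classified as `FUNCTION` operations, named from the `function_name` column and drawn in `tab:orange` on the Gantt chart. A minimal sketch of the updated classification helper, with made-up event strings for illustration:

def classify_event(event_type: str) -> str | None:
    """Map an intermediate-step event type to an operation class, as in build_call_tree_for_example()."""
    if event_type.startswith("LLM_"):
        return "LLM"
    if event_type.startswith("TOOL_"):
        return "TOOL"
    if event_type.startswith("FUNCTION_") or event_type.startswith("SPAN_"):
        return "FUNCTION"
    return None


# Hypothetical event types, just to show the mapping
events = ["LLM_START", "TOOL_END", "FUNCTION_START", "SPAN_END", "CUSTOM_EVENT"]
print([classify_event(e) for e in events])
# ['LLM', 'TOOL', 'FUNCTION', 'FUNCTION', None]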
{aiqtoolkit-1.2.0a20250616.dist-info → aiqtoolkit-1.2.0a20250617.dist-info}/RECORD
RENAMED
@@ -25,7 +25,7 @@ aiq/builder/eval_builder.py,sha256=UnNgtQiDAUfT3yuwjZQVerenI09-4q0Cse9uwLjk3Fg,4
 aiq/builder/evaluator.py,sha256=O6Gu0cUwQkrPxPX29Vf_-RopgijxPnhy7mhg_j-9A84,1162
 aiq/builder/framework_enum.py,sha256=eYwHQifZ86dx-OTubVA3qhCLRqhB4ElMBYBGA0gYtic,885
 aiq/builder/front_end.py,sha256=Xhvfi4VcDh5EoCtLr6AlLQfbRm8_TyugUc_IRfirN6Y,2225
-aiq/builder/function.py,sha256=
+aiq/builder/function.py,sha256=74mZuDemcgpuRAKfn_aSOz1Pqjem3x_9nR0Oh61Tai8,11727
 aiq/builder/function_base.py,sha256=AF5a56y-Nw9OpWsP8IFukUKM2FtP8758qYQW6EfObO0,13109
 aiq/builder/function_info.py,sha256=pGPIAL0tjVqLOJymIRB0boI9pzJGdXiPK3KiZvXQsqM,25266
 aiq/builder/intermediate_step_manager.py,sha256=aKjOK7Gk9XbKhKvRMQTylRGDFZJU7rwqSuiZYaPfwjA,7830
@@ -33,7 +33,7 @@ aiq/builder/llm.py,sha256=DcoYCyschsRjkW_yGsa_Ci7ELSpk5KRbi9778Dm_B9c,951
 aiq/builder/retriever.py,sha256=GM7L1T4NdNZKerFZiCfLcQOwsGoX0NRlF8my7SMq3l4,970
 aiq/builder/user_interaction_manager.py,sha256=OXr-RxWf1sEZjzQH_jt0nmqrLBtYLHGEZEcfDYYFV88,2913
 aiq/builder/workflow.py,sha256=UOjrXmu1sxWTxTjygszqYgK0gK65r_beLKUwOpxtXuc,5894
-aiq/builder/workflow_builder.py,sha256=
+aiq/builder/workflow_builder.py,sha256=0bRcNdjMCIfY60oaMu4hvH-mldIcxEDkm_lKRCBywnU,30412
 aiq/cli/__init__.py,sha256=Xs1JQ16L9btwreh4pdGKwskffAw1YFO48jKrU4ib_7c,685
 aiq/cli/entrypoint.py,sha256=BJsCZgC2nVyAWj7tBXwW67OIteg833xAI54R-e9O8oc,4757
 aiq/cli/main.py,sha256=yVTX5-5-21OOfG8qAdcK3M1fCQUxdr3G37Mb5OldPQc,1772
@@ -82,7 +82,7 @@ aiq/data_models/config.py,sha256=ERLjZY0iqexZ-gSXsCSN1UqgNeiwkEjWdYJEdKqeYTY,141
 aiq/data_models/dataset_handler.py,sha256=SifWhFHtxTMEjrXaXOYQgBOSKfWOzkc6OtOoPJ39pD4,3978
 aiq/data_models/discovery_metadata.py,sha256=OcITQc5VeML4bTHurrsMNiK_oB3z7wudMxcyN7LI8pY,12785
 aiq/data_models/embedder.py,sha256=0v917IiohVA_7zdF7hoO_zQcmNe4hQEFhh4fxRiYBbk,940
-aiq/data_models/evaluate.py,sha256=
+aiq/data_models/evaluate.py,sha256=tLL-AidxW6-VnEpIDYqGpvIdcNXnDee7Ooze9_bzXeY,4557
 aiq/data_models/evaluator.py,sha256=bd2njsyQB2t6ClJ66gJiCjYHsQpWZwPD7rsU0J109TI,939
 aiq/data_models/front_end.py,sha256=z8k6lSWjt1vMOYFbjWQxodpwAqPeuGS0hRBjsriDW2s,932
 aiq/data_models/function.py,sha256=M_duXVXL5MvYe0WVLvqEgEzXs0UAYNSMfy9ZTpxuKPA,1013
@@ -107,7 +107,7 @@ aiq/embedder/openai_embedder.py,sha256=5FO3xsyNvEmbLBsZb3xsCpbN1Soxio4yf4b5gTPVx
 aiq/embedder/register.py,sha256=3MTZrfNQKp6AZTbfaA-PpTnyXiMyu-8HH9JnDCC0v9o,978
 aiq/eval/__init__.py,sha256=Xs1JQ16L9btwreh4pdGKwskffAw1YFO48jKrU4ib_7c,685
 aiq/eval/config.py,sha256=IlOr2o618kbkXP0G1F-AklZfsKYVos9UB4Dvlxf66xk,1431
-aiq/eval/evaluate.py,sha256=
+aiq/eval/evaluate.py,sha256=AGEvmagd43jLq0aE_yNs_FFPFxVJEx49cu6Fl3WeQqA,17270
 aiq/eval/intermediate_step_adapter.py,sha256=4cSsGgFBvNjXnclk5FvZnQaFEdeulp7VEdRWKLcREAQ,4498
 aiq/eval/register.py,sha256=QOHJqA2CQixeWMC9InyKbzXo1jByvrntD_m9-2Mvg9k,1076
 aiq/eval/remote_workflow.py,sha256=Fb7Z6gdP2L_gqyWB--AEWfcXe9xPpQ_hPsf9lmqGXjI,5524
@@ -129,7 +129,7 @@ aiq/eval/trajectory_evaluator/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5N
 aiq/eval/trajectory_evaluator/evaluate.py,sha256=Y51KMhJ9t8AoYWrQlrwipc2CtgIXA9IUGZTbKegtsnw,3257
 aiq/eval/trajectory_evaluator/register.py,sha256=kktT4fu5_1Cou-iohD3YhQevsWiR3TA5NpFSweVz0eQ,1709
 aiq/eval/tunable_rag_evaluator/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
-aiq/eval/tunable_rag_evaluator/evaluate.py,sha256=
+aiq/eval/tunable_rag_evaluator/evaluate.py,sha256=f4jfn9VVLmkOg631TQr2wy7hPwGMJMsQa4kmXsu0-Uc,13069
 aiq/eval/tunable_rag_evaluator/register.py,sha256=q4p2rFyMzWmaINJc961ZV4jzIlAN4GfWsoImHo0ovsY,2558
 aiq/eval/utils/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
 aiq/eval/utils/output_uploader.py,sha256=SaQbZPkw-Q0H7t5yG60Kh-p1cflR7gPklVkilC4uPbU,5141
@@ -208,7 +208,7 @@ aiq/profiler/inference_optimization/prompt_caching.py,sha256=LGfxJG4R2y4vMFoiFzt
 aiq/profiler/inference_optimization/token_uniqueness.py,sha256=OCNlVmemMLS2kt0OZIXOGt8MbrTy5mbdhSMPYHs31a4,4571
 aiq/profiler/inference_optimization/workflow_runtimes.py,sha256=lnGa0eTpHiDEbx9rX-tcx100qSd6amePLlgb4Gx7JBc,2664
 aiq/profiler/inference_optimization/bottleneck_analysis/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
-aiq/profiler/inference_optimization/bottleneck_analysis/nested_stack_analysis.py,sha256=
+aiq/profiler/inference_optimization/bottleneck_analysis/nested_stack_analysis.py,sha256=yr81PsQ4TcrEnuPDlRwhL9Hcox3gO855DsS-BDo00u0,16732
 aiq/profiler/inference_optimization/bottleneck_analysis/simple_stack_analysis.py,sha256=VZLBgsIUGOkY0ZUCLHQM4LpBQpJBM5JKRTUBGyoOFWU,11100
 aiq/profiler/inference_optimization/experimental/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
 aiq/profiler/inference_optimization/experimental/concurrency_spike_analysis.py,sha256=J-oMRCEnd6I1XFXiyLUu8VPR745ptnzgzvn0Opsi208,16953
@@ -309,10 +309,10 @@ aiq/utils/reactive/base/observer_base.py,sha256=UAlyAY_ky4q2t0P81RVFo2Bs_R7z5Nde
 aiq/utils/reactive/base/subject_base.py,sha256=Ed-AC6P7cT3qkW1EXjzbd5M9WpVoeN_9KCe3OM3FLU4,2521
 aiq/utils/settings/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
 aiq/utils/settings/global_settings.py,sha256=U9TCLdoZsKq5qOVGjREipGVv9e-FlStzqy5zv82_VYk,7454
-aiqtoolkit-1.2.
-aiqtoolkit-1.2.
-aiqtoolkit-1.2.
-aiqtoolkit-1.2.
-aiqtoolkit-1.2.
-aiqtoolkit-1.2.
-aiqtoolkit-1.2.
+aiqtoolkit-1.2.0a20250617.dist-info/licenses/LICENSE-3rd-party.txt,sha256=8o7aySJa9CBvFshPcsRdJbczzdNyDGJ8b0J67WRUQ2k,183936
+aiqtoolkit-1.2.0a20250617.dist-info/licenses/LICENSE.md,sha256=QwcOLU5TJoTeUhuIXzhdCEEDDvorGiC6-3YTOl4TecE,11356
+aiqtoolkit-1.2.0a20250617.dist-info/METADATA,sha256=7U1QTdteJlIPEhjypLhDj_VsNOp3dcGC5GyK4sPecD8,20274
+aiqtoolkit-1.2.0a20250617.dist-info/WHEEL,sha256=_zCd3N1l69ArxyTb8rzEoP9TpbYXkqRFSNOD5OuxnTs,91
+aiqtoolkit-1.2.0a20250617.dist-info/entry_points.txt,sha256=gRlPfR5g21t328WNEQ4CcEz80S1sJNS8A7rMDYnzl4A,452
+aiqtoolkit-1.2.0a20250617.dist-info/top_level.txt,sha256=fo7AzYcNhZ_tRWrhGumtxwnxMew4xrT1iwouDy_f0Kc,4
+aiqtoolkit-1.2.0a20250617.dist-info/RECORD,,
{aiqtoolkit-1.2.0a20250616.dist-info → aiqtoolkit-1.2.0a20250617.dist-info}/WHEEL
RENAMED
File without changes
{aiqtoolkit-1.2.0a20250616.dist-info → aiqtoolkit-1.2.0a20250617.dist-info}/entry_points.txt
RENAMED
File without changes
{aiqtoolkit-1.2.0a20250616.dist-info → aiqtoolkit-1.2.0a20250617.dist-info}/licenses/LICENSE-3rd-party.txt
RENAMED
File without changes
{aiqtoolkit-1.2.0a20250616.dist-info → aiqtoolkit-1.2.0a20250617.dist-info}/licenses/LICENSE.md
RENAMED
File without changes
{aiqtoolkit-1.2.0a20250616.dist-info → aiqtoolkit-1.2.0a20250617.dist-info}/top_level.txt
RENAMED
File without changes