aiqtoolkit 1.2.0.dev0__py3-none-any.whl → 1.2.0rc2__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Potentially problematic release.
This version of aiqtoolkit might be problematic.
- aiq/agent/base.py +170 -8
- aiq/agent/dual_node.py +1 -1
- aiq/agent/react_agent/agent.py +146 -112
- aiq/agent/react_agent/prompt.py +1 -6
- aiq/agent/react_agent/register.py +36 -35
- aiq/agent/rewoo_agent/agent.py +36 -35
- aiq/agent/rewoo_agent/register.py +2 -2
- aiq/agent/tool_calling_agent/agent.py +3 -7
- aiq/agent/tool_calling_agent/register.py +1 -1
- aiq/authentication/__init__.py +14 -0
- aiq/authentication/api_key/__init__.py +14 -0
- aiq/authentication/api_key/api_key_auth_provider.py +92 -0
- aiq/authentication/api_key/api_key_auth_provider_config.py +124 -0
- aiq/authentication/api_key/register.py +26 -0
- aiq/authentication/exceptions/__init__.py +14 -0
- aiq/authentication/exceptions/api_key_exceptions.py +38 -0
- aiq/authentication/exceptions/auth_code_grant_exceptions.py +86 -0
- aiq/authentication/exceptions/call_back_exceptions.py +38 -0
- aiq/authentication/exceptions/request_exceptions.py +54 -0
- aiq/authentication/http_basic_auth/__init__.py +0 -0
- aiq/authentication/http_basic_auth/http_basic_auth_provider.py +81 -0
- aiq/authentication/http_basic_auth/register.py +30 -0
- aiq/authentication/interfaces.py +93 -0
- aiq/authentication/oauth2/__init__.py +14 -0
- aiq/authentication/oauth2/oauth2_auth_code_flow_provider.py +107 -0
- aiq/authentication/oauth2/oauth2_auth_code_flow_provider_config.py +39 -0
- aiq/authentication/oauth2/register.py +25 -0
- aiq/authentication/register.py +21 -0
- aiq/builder/builder.py +64 -2
- aiq/builder/component_utils.py +16 -3
- aiq/builder/context.py +37 -0
- aiq/builder/eval_builder.py +43 -2
- aiq/builder/function.py +44 -12
- aiq/builder/function_base.py +1 -1
- aiq/builder/intermediate_step_manager.py +6 -8
- aiq/builder/user_interaction_manager.py +3 -0
- aiq/builder/workflow.py +23 -18
- aiq/builder/workflow_builder.py +421 -61
- aiq/cli/commands/info/list_mcp.py +103 -16
- aiq/cli/commands/sizing/__init__.py +14 -0
- aiq/cli/commands/sizing/calc.py +294 -0
- aiq/cli/commands/sizing/sizing.py +27 -0
- aiq/cli/commands/start.py +2 -1
- aiq/cli/entrypoint.py +2 -0
- aiq/cli/register_workflow.py +80 -0
- aiq/cli/type_registry.py +151 -30
- aiq/data_models/api_server.py +124 -12
- aiq/data_models/authentication.py +231 -0
- aiq/data_models/common.py +35 -7
- aiq/data_models/component.py +17 -9
- aiq/data_models/component_ref.py +33 -0
- aiq/data_models/config.py +60 -3
- aiq/data_models/dataset_handler.py +2 -1
- aiq/data_models/embedder.py +1 -0
- aiq/data_models/evaluate.py +23 -0
- aiq/data_models/function_dependencies.py +8 -0
- aiq/data_models/interactive.py +10 -1
- aiq/data_models/intermediate_step.py +38 -5
- aiq/data_models/its_strategy.py +30 -0
- aiq/data_models/llm.py +1 -0
- aiq/data_models/memory.py +1 -0
- aiq/data_models/object_store.py +44 -0
- aiq/data_models/profiler.py +1 -0
- aiq/data_models/retry_mixin.py +35 -0
- aiq/data_models/span.py +187 -0
- aiq/data_models/telemetry_exporter.py +2 -2
- aiq/embedder/nim_embedder.py +2 -1
- aiq/embedder/openai_embedder.py +2 -1
- aiq/eval/config.py +19 -1
- aiq/eval/dataset_handler/dataset_handler.py +87 -2
- aiq/eval/evaluate.py +208 -27
- aiq/eval/evaluator/base_evaluator.py +73 -0
- aiq/eval/evaluator/evaluator_model.py +1 -0
- aiq/eval/intermediate_step_adapter.py +11 -5
- aiq/eval/rag_evaluator/evaluate.py +55 -15
- aiq/eval/rag_evaluator/register.py +6 -1
- aiq/eval/remote_workflow.py +7 -2
- aiq/eval/runners/__init__.py +14 -0
- aiq/eval/runners/config.py +39 -0
- aiq/eval/runners/multi_eval_runner.py +54 -0
- aiq/eval/trajectory_evaluator/evaluate.py +22 -65
- aiq/eval/tunable_rag_evaluator/evaluate.py +150 -168
- aiq/eval/tunable_rag_evaluator/register.py +2 -0
- aiq/eval/usage_stats.py +41 -0
- aiq/eval/utils/output_uploader.py +10 -1
- aiq/eval/utils/weave_eval.py +184 -0
- aiq/experimental/__init__.py +0 -0
- aiq/experimental/decorators/__init__.py +0 -0
- aiq/experimental/decorators/experimental_warning_decorator.py +130 -0
- aiq/experimental/inference_time_scaling/__init__.py +0 -0
- aiq/experimental/inference_time_scaling/editing/__init__.py +0 -0
- aiq/experimental/inference_time_scaling/editing/iterative_plan_refinement_editor.py +147 -0
- aiq/experimental/inference_time_scaling/editing/llm_as_a_judge_editor.py +204 -0
- aiq/experimental/inference_time_scaling/editing/motivation_aware_summarization.py +107 -0
- aiq/experimental/inference_time_scaling/functions/__init__.py +0 -0
- aiq/experimental/inference_time_scaling/functions/execute_score_select_function.py +105 -0
- aiq/experimental/inference_time_scaling/functions/its_tool_orchestration_function.py +205 -0
- aiq/experimental/inference_time_scaling/functions/its_tool_wrapper_function.py +146 -0
- aiq/experimental/inference_time_scaling/functions/plan_select_execute_function.py +224 -0
- aiq/experimental/inference_time_scaling/models/__init__.py +0 -0
- aiq/experimental/inference_time_scaling/models/editor_config.py +132 -0
- aiq/experimental/inference_time_scaling/models/its_item.py +48 -0
- aiq/experimental/inference_time_scaling/models/scoring_config.py +112 -0
- aiq/experimental/inference_time_scaling/models/search_config.py +120 -0
- aiq/experimental/inference_time_scaling/models/selection_config.py +154 -0
- aiq/experimental/inference_time_scaling/models/stage_enums.py +43 -0
- aiq/experimental/inference_time_scaling/models/strategy_base.py +66 -0
- aiq/experimental/inference_time_scaling/models/tool_use_config.py +41 -0
- aiq/experimental/inference_time_scaling/register.py +36 -0
- aiq/experimental/inference_time_scaling/scoring/__init__.py +0 -0
- aiq/experimental/inference_time_scaling/scoring/llm_based_agent_scorer.py +168 -0
- aiq/experimental/inference_time_scaling/scoring/llm_based_plan_scorer.py +168 -0
- aiq/experimental/inference_time_scaling/scoring/motivation_aware_scorer.py +111 -0
- aiq/experimental/inference_time_scaling/search/__init__.py +0 -0
- aiq/experimental/inference_time_scaling/search/multi_llm_planner.py +128 -0
- aiq/experimental/inference_time_scaling/search/multi_query_retrieval_search.py +122 -0
- aiq/experimental/inference_time_scaling/search/single_shot_multi_plan_planner.py +128 -0
- aiq/experimental/inference_time_scaling/selection/__init__.py +0 -0
- aiq/experimental/inference_time_scaling/selection/best_of_n_selector.py +63 -0
- aiq/experimental/inference_time_scaling/selection/llm_based_agent_output_selector.py +131 -0
- aiq/experimental/inference_time_scaling/selection/llm_based_output_merging_selector.py +159 -0
- aiq/experimental/inference_time_scaling/selection/llm_based_plan_selector.py +128 -0
- aiq/experimental/inference_time_scaling/selection/threshold_selector.py +58 -0
- aiq/front_ends/console/authentication_flow_handler.py +233 -0
- aiq/front_ends/console/console_front_end_plugin.py +11 -2
- aiq/front_ends/fastapi/auth_flow_handlers/__init__.py +0 -0
- aiq/front_ends/fastapi/auth_flow_handlers/http_flow_handler.py +27 -0
- aiq/front_ends/fastapi/auth_flow_handlers/websocket_flow_handler.py +107 -0
- aiq/front_ends/fastapi/fastapi_front_end_config.py +93 -9
- aiq/front_ends/fastapi/fastapi_front_end_controller.py +68 -0
- aiq/front_ends/fastapi/fastapi_front_end_plugin.py +14 -1
- aiq/front_ends/fastapi/fastapi_front_end_plugin_worker.py +537 -52
- aiq/front_ends/fastapi/html_snippets/__init__.py +14 -0
- aiq/front_ends/fastapi/html_snippets/auth_code_grant_success.py +35 -0
- aiq/front_ends/fastapi/job_store.py +47 -25
- aiq/front_ends/fastapi/main.py +2 -0
- aiq/front_ends/fastapi/message_handler.py +108 -89
- aiq/front_ends/fastapi/step_adaptor.py +2 -1
- aiq/llm/aws_bedrock_llm.py +57 -0
- aiq/llm/nim_llm.py +2 -1
- aiq/llm/openai_llm.py +3 -2
- aiq/llm/register.py +1 -0
- aiq/meta/pypi.md +12 -12
- aiq/object_store/__init__.py +20 -0
- aiq/object_store/in_memory_object_store.py +74 -0
- aiq/object_store/interfaces.py +84 -0
- aiq/object_store/models.py +36 -0
- aiq/object_store/register.py +20 -0
- aiq/observability/__init__.py +14 -0
- aiq/observability/exporter/__init__.py +14 -0
- aiq/observability/exporter/base_exporter.py +449 -0
- aiq/observability/exporter/exporter.py +78 -0
- aiq/observability/exporter/file_exporter.py +33 -0
- aiq/observability/exporter/processing_exporter.py +269 -0
- aiq/observability/exporter/raw_exporter.py +52 -0
- aiq/observability/exporter/span_exporter.py +264 -0
- aiq/observability/exporter_manager.py +335 -0
- aiq/observability/mixin/__init__.py +14 -0
- aiq/observability/mixin/batch_config_mixin.py +26 -0
- aiq/observability/mixin/collector_config_mixin.py +23 -0
- aiq/observability/mixin/file_mixin.py +288 -0
- aiq/observability/mixin/file_mode.py +23 -0
- aiq/observability/mixin/resource_conflict_mixin.py +134 -0
- aiq/observability/mixin/serialize_mixin.py +61 -0
- aiq/observability/mixin/type_introspection_mixin.py +183 -0
- aiq/observability/processor/__init__.py +14 -0
- aiq/observability/processor/batching_processor.py +316 -0
- aiq/observability/processor/intermediate_step_serializer.py +28 -0
- aiq/observability/processor/processor.py +68 -0
- aiq/observability/register.py +36 -39
- aiq/observability/utils/__init__.py +14 -0
- aiq/observability/utils/dict_utils.py +236 -0
- aiq/observability/utils/time_utils.py +31 -0
- aiq/profiler/calc/__init__.py +14 -0
- aiq/profiler/calc/calc_runner.py +623 -0
- aiq/profiler/calc/calculations.py +288 -0
- aiq/profiler/calc/data_models.py +176 -0
- aiq/profiler/calc/plot.py +345 -0
- aiq/profiler/callbacks/langchain_callback_handler.py +22 -10
- aiq/profiler/data_models.py +24 -0
- aiq/profiler/inference_metrics_model.py +3 -0
- aiq/profiler/inference_optimization/bottleneck_analysis/nested_stack_analysis.py +8 -0
- aiq/profiler/inference_optimization/data_models.py +2 -2
- aiq/profiler/inference_optimization/llm_metrics.py +2 -2
- aiq/profiler/profile_runner.py +61 -21
- aiq/runtime/loader.py +9 -3
- aiq/runtime/runner.py +23 -9
- aiq/runtime/session.py +25 -7
- aiq/runtime/user_metadata.py +2 -3
- aiq/tool/chat_completion.py +74 -0
- aiq/tool/code_execution/README.md +152 -0
- aiq/tool/code_execution/code_sandbox.py +151 -72
- aiq/tool/code_execution/local_sandbox/.gitignore +1 -0
- aiq/tool/code_execution/local_sandbox/local_sandbox_server.py +139 -24
- aiq/tool/code_execution/local_sandbox/sandbox.requirements.txt +3 -1
- aiq/tool/code_execution/local_sandbox/start_local_sandbox.sh +27 -2
- aiq/tool/code_execution/register.py +7 -3
- aiq/tool/code_execution/test_code_execution_sandbox.py +414 -0
- aiq/tool/mcp/exceptions.py +142 -0
- aiq/tool/mcp/mcp_client.py +41 -6
- aiq/tool/mcp/mcp_tool.py +3 -2
- aiq/tool/register.py +1 -0
- aiq/tool/server_tools.py +6 -3
- aiq/utils/exception_handlers/automatic_retries.py +289 -0
- aiq/utils/exception_handlers/mcp.py +211 -0
- aiq/utils/io/model_processing.py +28 -0
- aiq/utils/log_utils.py +37 -0
- aiq/utils/string_utils.py +38 -0
- aiq/utils/type_converter.py +18 -2
- aiq/utils/type_utils.py +87 -0
- {aiqtoolkit-1.2.0.dev0.dist-info → aiqtoolkit-1.2.0rc2.dist-info}/METADATA +53 -21
- aiqtoolkit-1.2.0rc2.dist-info/RECORD +436 -0
- {aiqtoolkit-1.2.0.dev0.dist-info → aiqtoolkit-1.2.0rc2.dist-info}/WHEEL +1 -1
- {aiqtoolkit-1.2.0.dev0.dist-info → aiqtoolkit-1.2.0rc2.dist-info}/entry_points.txt +3 -0
- aiq/front_ends/fastapi/websocket.py +0 -148
- aiq/observability/async_otel_listener.py +0 -429
- aiqtoolkit-1.2.0.dev0.dist-info/RECORD +0 -316
- {aiqtoolkit-1.2.0.dev0.dist-info → aiqtoolkit-1.2.0rc2.dist-info}/licenses/LICENSE-3rd-party.txt +0 -0
- {aiqtoolkit-1.2.0.dev0.dist-info → aiqtoolkit-1.2.0rc2.dist-info}/licenses/LICENSE.md +0 -0
- {aiqtoolkit-1.2.0.dev0.dist-info → aiqtoolkit-1.2.0rc2.dist-info}/top_level.txt +0 -0
aiq/eval/remote_workflow.py
CHANGED
@@ -24,6 +24,7 @@ from tqdm import tqdm
 from aiq.data_models.api_server import AIQResponseIntermediateStep
 from aiq.data_models.intermediate_step import IntermediateStep
 from aiq.data_models.intermediate_step import IntermediateStepPayload
+from aiq.data_models.invocation_node import InvocationNode
 from aiq.eval.config import EvaluationRunConfig
 from aiq.eval.evaluator.evaluator_model import EvalInput
 from aiq.eval.evaluator.evaluator_model import EvalInputItem
@@ -81,8 +82,12 @@ class EvaluationRemoteWorkflowHandler:
                 step_data = json.loads(line[len(INTERMEDIATE_DATA_PREFIX):])
                 response_intermediate = AIQResponseIntermediateStep.model_validate(step_data)
                 # The payload is expected to be IntermediateStepPayload
-
-
+                payload = IntermediateStepPayload.model_validate_json(response_intermediate.payload)
+                intermediate_step = IntermediateStep(parent_id="remote",
+                                                     function_ancestry=InvocationNode(
+                                                         function_name=payload.name or "remote_function",
+                                                         function_id=payload.UUID or "remote_function_id"),
+                                                     payload=payload)
                 intermediate_steps.append(intermediate_step)
             except (json.JSONDecodeError, ValidationError) as e:
                 logger.error("Failed to parse intermediate step: %s", e)
aiq/eval/runners/__init__.py
ADDED
@@ -0,0 +1,14 @@
+# SPDX-FileCopyrightText: Copyright (c) 2025, NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+# SPDX-License-Identifier: Apache-2.0
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
aiq/eval/runners/config.py
ADDED
@@ -0,0 +1,39 @@
+# SPDX-FileCopyrightText: Copyright (c) 2025, NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+# SPDX-License-Identifier: Apache-2.0
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+import typing
+
+from pydantic import BaseModel
+
+from aiq.eval.config import EvaluationRunConfig
+from aiq.eval.config import EvaluationRunOutput
+
+
+class MultiEvaluationRunConfig(BaseModel):
+    """
+    Parameters used for a multi-evaluation run.
+    This includes a dict of configs. The key is an id of any type.
+    Each pass loads the config, applies the overrides and runs to completion
+    before the next pass starts.
+    """
+    configs: dict[typing.Any, EvaluationRunConfig]
+
+
+class MultiEvaluationRunOutput(BaseModel):
+    """
+    Output of a multi-evaluation run.
+    The results per-pass are accumulated in the evaluation_run_outputs dict.
+    """
+    evaluation_run_outputs: dict[typing.Any, EvaluationRunOutput]
aiq/eval/runners/multi_eval_runner.py
ADDED
@@ -0,0 +1,54 @@
+# SPDX-FileCopyrightText: Copyright (c) 2025, NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+# SPDX-License-Identifier: Apache-2.0
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+import copy
+import typing
+
+from aiq.eval.config import EvaluationRunConfig
+from aiq.eval.config import EvaluationRunOutput
+from aiq.eval.evaluate import EvaluationRun
+from aiq.eval.runners.config import MultiEvaluationRunConfig
+
+
+class MultiEvaluationRunner:
+    """
+    Run a multi-evaluation run.
+    """
+
+    def __init__(self, config: MultiEvaluationRunConfig):
+        """
+        Initialize a multi-evaluation run.
+        """
+        self.config = config
+        self.evaluation_run_outputs: dict[typing.Any, EvaluationRunOutput] = {}
+
+    async def run_all(self):
+        """
+        Run all evaluations defined by the overrides.
+        """
+        for id, config in self.config.configs.items():
+            output = await self.run_single_evaluation(id, config)
+            self.evaluation_run_outputs[id] = output
+
+        return self.evaluation_run_outputs
+
+    async def run_single_evaluation(self, id: typing.Any, config: EvaluationRunConfig) -> EvaluationRunOutput:
+        """
+        Run a single evaluation and return the output.
+        """
+        # copy the config in case the caller is using the same config for multiple evaluations
+        config_copy = copy.deepcopy(config)
+        evaluation_run = EvaluationRun(config_copy)
+        return await evaluation_run.run_and_evaluate()
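The two new runner files above add a thin multi-pass wrapper around the existing single-run evaluation. A minimal usage sketch follows (not part of the diff); the EvaluationRunConfig keyword arguments are illustrative assumptions, so substitute the actual fields defined in aiq.eval.config:

import asyncio

from aiq.eval.config import EvaluationRunConfig
from aiq.eval.runners.config import MultiEvaluationRunConfig
from aiq.eval.runners.multi_eval_runner import MultiEvaluationRunner


async def main():
    # Two hypothetical passes keyed by arbitrary ids; each value is a full EvaluationRunConfig.
    multi_config = MultiEvaluationRunConfig(
        configs={
            "baseline": EvaluationRunConfig(config_file="configs/eval_baseline.yml"),  # assumed field
            "tuned": EvaluationRunConfig(config_file="configs/eval_tuned.yml"),  # assumed field
        })

    runner = MultiEvaluationRunner(multi_config)
    outputs = await runner.run_all()  # dict keyed by the same ids, values are EvaluationRunOutput

    for pass_id, output in outputs.items():
        print(pass_id, output)


if __name__ == "__main__":
    asyncio.run(main())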
aiq/eval/trajectory_evaluator/evaluate.py
CHANGED
@@ -13,24 +13,20 @@
 # See the License for the specific language governing permissions and
 # limitations under the License.
 
-import asyncio
 import logging
 
 from langchain.evaluation import TrajectoryEvalChain
 from langchain_core.language_models import BaseChatModel
 from langchain_core.tools import BaseTool
-from tqdm import tqdm
 
-from aiq.eval.evaluator.
+from aiq.eval.evaluator.base_evaluator import BaseEvaluator
 from aiq.eval.evaluator.evaluator_model import EvalInputItem
-from aiq.eval.evaluator.evaluator_model import EvalOutput
 from aiq.eval.evaluator.evaluator_model import EvalOutputItem
-from aiq.eval.utils.tqdm_position_registry import TqdmPositionRegistry
 
 logger = logging.getLogger(__name__)
 
 
-class TrajectoryEvaluator:
+class TrajectoryEvaluator(BaseEvaluator):
 
     def __init__(
         self,
@@ -38,11 +34,9 @@ class TrajectoryEvaluator:
         tools: list[BaseTool] | None = None,
         max_concurrency: int = 8,
     ):
-
+        super().__init__(max_concurrency=max_concurrency, tqdm_desc="Evaluating Trajectory")
         self.llm = llm
         self.tools = tools
-        self.max_concurrency = max_concurrency
-        self.semaphore = asyncio.Semaphore(self.max_concurrency)
         # Initialize trajectory evaluation chain
         self.traj_eval_chain = TrajectoryEvalChain.from_llm(llm=self.llm,
                                                             tools=self.tools,
@@ -50,69 +44,32 @@ class TrajectoryEvaluator:
                                                             requires_reference=True)
         logger.debug("Trajectory evaluation chain initialized.")
 
-    async def
+    async def evaluate_item(self, item: EvalInputItem) -> EvalOutputItem:
         """
-
+        Evaluate a single EvalInputItem and return an EvalOutputItem.
         """
-
-        num_records = len(eval_input.eval_input_items)
-        logger.info("Running trajectory evaluation with %d records", num_records)
         from aiq.data_models.intermediate_step import IntermediateStepType
         from aiq.eval.intermediate_step_adapter import IntermediateStepAdapter
 
         intermediate_step_adapter = IntermediateStepAdapter()
         event_filter = [IntermediateStepType.LLM_END, IntermediateStepType.TOOL_END]
 
-
-
-
-            1. score
-            2. reasoning for the score
-            """
-            question = item.input_obj
-            generated_answer = item.output_obj
-            agent_trajectory = intermediate_step_adapter.get_agent_actions(item.trajectory, event_filter)
-            try:
-                eval_result = await self.traj_eval_chain.aevaluate_agent_trajectory(
-                    input=question,
-                    agent_trajectory=agent_trajectory,
-                    prediction=generated_answer,
-                )
-            except Exception as e:
-                logger.exception("Error evaluating trajectory for question: %s, Error: %s", question, e, exc_info=True)
-                return 0.0, f"Error evaluating trajectory: {e}"
-
-            reasoning = {
-                "reasoning": eval_result["reasoning"],
-                "trajectory": [(action.model_dump(), output) for (action, output) in agent_trajectory]
-            }
-            return eval_result["score"], reasoning
-
-        async def wrapped_process(item: EvalInputItem) -> tuple[float, dict]:
-            async with self.semaphore:
-                result = await process_item(item)
-                pbar.update(1)
-                return result
+        question = item.input_obj
+        generated_answer = item.output_obj
+        agent_trajectory = intermediate_step_adapter.get_agent_actions(item.trajectory, event_filter)
 
-        # Execute all evaluations asynchronously
         try:
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-            eval_output_items = [
-                EvalOutputItem(id=item.id, score=score, reasoning=reasoning)
-                for item, score, reasoning in zip(eval_input.eval_input_items, sample_scores, sample_reasonings)
-            ]
-
-            return EvalOutput(average_score=avg_score, eval_output_items=eval_output_items)
+            eval_result = await self.traj_eval_chain.aevaluate_agent_trajectory(
+                input=question,
+                agent_trajectory=agent_trajectory,
+                prediction=generated_answer,
+            )
+        except Exception as e:
+            logger.exception("Error evaluating trajectory for question: %s, Error: %s", question, e, exc_info=True)
+            return EvalOutputItem(id=item.id, score=0.0, reasoning=f"Error evaluating trajectory: {e}")
+
+        reasoning = {
+            "reasoning": eval_result["reasoning"],
+            "trajectory": [(action.model_dump(), output) for (action, output) in agent_trajectory]
+        }
+        return EvalOutputItem(id=item.id, score=eval_result["score"], reasoning=reasoning)
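Both this evaluator and the tunable RAG evaluator below now inherit from the new aiq/eval/evaluator/base_evaluator.py added in this release and only implement a per-item hook, leaving batching, concurrency, and progress reporting to the base class. A minimal sketch of that pattern follows, assuming BaseEvaluator invokes evaluate_item once per EvalInputItem as the diffs suggest; ExactMatchEvaluator is hypothetical and not part of the package:

from aiq.eval.evaluator.base_evaluator import BaseEvaluator
from aiq.eval.evaluator.evaluator_model import EvalInputItem
from aiq.eval.evaluator.evaluator_model import EvalOutputItem


class ExactMatchEvaluator(BaseEvaluator):
    """Hypothetical evaluator: scores 1.0 when the generated output matches the expected output."""

    def __init__(self, max_concurrency: int = 8):
        super().__init__(max_concurrency=max_concurrency, tqdm_desc="Evaluating Exact Match")

    async def evaluate_item(self, item: EvalInputItem) -> EvalOutputItem:
        # The base class fans this out across items with the configured concurrency.
        score = 1.0 if item.output_obj == item.expected_output_obj else 0.0
        return EvalOutputItem(id=item.id, score=score, reasoning={"expected": item.expected_output_obj})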
aiq/eval/tunable_rag_evaluator/evaluate.py
CHANGED
@@ -15,19 +15,19 @@
 
 import asyncio
 import logging
+from typing import Callable
 
 from langchain.output_parsers import ResponseSchema
 from langchain.output_parsers import StructuredOutputParser
 from langchain.schema import HumanMessage
 from langchain.schema import SystemMessage
 from langchain_core.language_models import BaseChatModel
+from langchain_core.runnables import RunnableLambda
 from tqdm import tqdm
 
-from aiq.eval.evaluator.
+from aiq.eval.evaluator.base_evaluator import BaseEvaluator
 from aiq.eval.evaluator.evaluator_model import EvalInputItem
-from aiq.eval.evaluator.evaluator_model import EvalOutput
 from aiq.eval.evaluator.evaluator_model import EvalOutputItem
-from aiq.eval.utils.tqdm_position_registry import TqdmPositionRegistry
 
 logger = logging.getLogger(__name__)
 
@@ -69,195 +69,177 @@ def evaluation_prompt(judge_llm_prompt: str,
     return EVAL_PROMPT if not default_scoring else DEFAULT_EVAL_PROMPT
 
 
-
+def runnable_with_retries(original_fn: Callable, llm_retry_control_params: dict | None = None):
+    runnable = RunnableLambda(original_fn)
+
+    if llm_retry_control_params is None:
+        llm_retry_control_params = {
+            "stop_after_attempt": 3, "initial_backoff_delay_seconds": 1, "has_exponential_jitter": True
+        }
+
+    if llm_retry_control_params["has_exponential_jitter"] is None:
+        llm_retry_control_params["has_exponential_jitter"] = True
+    if llm_retry_control_params["stop_after_attempt"] is None:
+        llm_retry_control_params["stop_after_attempt"] = 3
+    if llm_retry_control_params["initial_backoff_delay_seconds"] is None:
+        llm_retry_control_params["initial_backoff_delay_seconds"] = 1
+
+    # Add retry logic with exponential backoff and jitter
+    return runnable.with_retry(
+        retry_if_exception_type=(Exception, ),  # Retry on any error
+        wait_exponential_jitter=llm_retry_control_params["has_exponential_jitter"],  # Add jitter to exponential backoff
+        stop_after_attempt=llm_retry_control_params["stop_after_attempt"],
+        exponential_jitter_params={"initial": llm_retry_control_params["initial_backoff_delay_seconds"]
+                                   }  # Optional: set initial backoff (seconds)
+    )
+
+
+class TunableRagEvaluator(BaseEvaluator):
     '''Tunable RAG evaluator class with customizable LLM prompt for scoring.'''
 
     def __init__(self,
                  llm: BaseChatModel,
                  judge_llm_prompt: str,
+                 llm_retry_control_params: dict | None,
                  max_concurrency: int,
                  default_scoring: bool,
                  default_score_weights: dict):
+        super().__init__(max_concurrency=max_concurrency, tqdm_desc="Evaluating RAG")
         self.llm = llm
-        self.max_concurrency = max_concurrency
         self.judge_llm_prompt = judge_llm_prompt
-        self.
+        self.llm_retry_control_params = llm_retry_control_params
         self.default_scoring = default_scoring
         # Use user-provided weights if available; otherwise, set equal weights for each score
         self.default_score_weights = default_score_weights if default_score_weights else {
             "coverage": 1 / 3, "correctness": 1 / 3, "relevance": 1 / 3
         }
 
-    async def
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-                name="reasoning",
-                description=
-                "1-2 summarized sentences of reasoning for the scores. Ex. 'The generated answer covers all critical aspects mentioned in the expected answer, is correct, and is relevant to the question.'",
-                type="string"),
-        ]
-
-        custom_evaluation_schema = [
-            ResponseSchema(name="score", description="Score for the generated answer. Ex. 0.5", type="float"),
-            ResponseSchema(
-                name="reasoning",
-                description=
-                "1-2 sentence reasoning for the score. Ex. 'The generated answer is exactly the same as the description of the expected answer.'",
-                type="string"),
-        ]
+    async def evaluate_item(self, item: EvalInputItem) -> EvalOutputItem:
+        """Compute RAG evaluation for an individual item and return EvalOutputItem"""
+        question = item.input_obj
+        answer_description = item.expected_output_obj
+        generated_answer = item.output_obj
+
+        # Call judge LLM to generate score
+        score = 0.0
+
+        default_evaluation_schema = [
+            ResponseSchema(
+                name="coverage_score",
+                description="Score for the coverage of all critical aspects mentioned in the expected answer. Ex. 0.5",
+                type="float"),
+            ResponseSchema(
+                name="correctness_score",
+                description="Score for the accuracy of the generated answer compared to the expected answer. Ex. 0.5",
+                type="float"),
+            ResponseSchema(name="relevance_score",
+                           description="Score for the relevance of the generated answer to the question. Ex. 0.5",
+                           type="float"),
+            ResponseSchema(
+                name="reasoning",
+                description=
+                "1-2 summarized sentences of reasoning for the scores. Ex. 'The generated answer covers all critical aspects mentioned in the expected answer, is correct, and is relevant to the question.'",
+                type="string"),
+        ]
 
-
-
-
-
+        custom_evaluation_schema = [
+            ResponseSchema(name="score", description="Score for the generated answer. Ex. 0.5", type="float"),
+            ResponseSchema(
+                name="reasoning",
+                description=
+                "1-2 sentence reasoning for the score. Ex. 'The generated answer is exactly the same as the description of the expected answer.'",
+                type="string"),
+        ]
 
-
-
+        if self.default_scoring:
+            evaluation_schema = default_evaluation_schema
+        else:
+            evaluation_schema = custom_evaluation_schema
 
-
-
-                                        answer_description=answer_description,
-                                        generated_answer=generated_answer,
-                                        format_instructions=format_instructions,
-                                        default_scoring=self.default_scoring)
+        llm_input_response_parser = StructuredOutputParser.from_response_schemas(evaluation_schema)
+        format_instructions = llm_input_response_parser.get_format_instructions()
 
-
-
-
+        eval_prompt = evaluation_prompt(judge_llm_prompt=self.judge_llm_prompt,
+                                        question=question,
+                                        answer_description=answer_description,
+                                        generated_answer=generated_answer,
+                                        format_instructions=format_instructions,
+                                        default_scoring=self.default_scoring)
 
-
+        messages = [SystemMessage(content="You must respond only in JSON format."), HumanMessage(content=eval_prompt)]
 
-
-        coverage_score = 0.0
-        correctness_score = 0.0
-        relevance_score = 0.0
-        reasoning = "Error in evaluator from parsing judge LLM response."
+        response = await runnable_with_retries(self.llm.ainvoke, self.llm_retry_control_params).ainvoke(messages)
 
-
-
-
-
-
-                correctness_score = parsed_response["correctness_score"]
-                relevance_score = parsed_response["relevance_score"]
-                reasoning = parsed_response["reasoning"]
-            except KeyError as e:
-                logger.error("Missing required keys in default scoring response: %s",
-                             ", ".join(str(arg) for arg in e.args))
-                reasoning = f"Error in evaluator from parsing judge LLM response. Missing required key(s): {', '.join(str(arg) for arg in e.args)}"
-
-            coverage_weight = self.default_score_weights.get("coverage", 1 / 3)
-            correctness_weight = self.default_score_weights.get("correctness", 1 / 3)
-            relevance_weight = self.default_score_weights.get("relevance", 1 / 3)
-
-            # Calculate score
-            total_weight = coverage_weight + correctness_weight + relevance_weight
-            coverage_weight = coverage_weight / total_weight
-            correctness_weight = correctness_weight / total_weight
-            relevance_weight = relevance_weight / total_weight
-
-            if round(coverage_weight + correctness_weight + relevance_weight, 2) != 1:
-                logger.warning("The sum of the default score weights is not 1. The weights will be normalized.")
-                coverage_weight = coverage_weight / (coverage_weight + correctness_weight + relevance_weight)
-                correctness_weight = correctness_weight / (coverage_weight + correctness_weight +
-                                                           relevance_weight)
-                relevance_weight = relevance_weight / (coverage_weight + correctness_weight + relevance_weight)
-
-            score = (coverage_weight * coverage_score + correctness_weight * correctness_score +
-                     relevance_weight * relevance_score)
-
-        else:
-            try:
-                score = parsed_response["score"]
-                reasoning = parsed_response["reasoning"]
-            except KeyError as e:
-                logger.error("Missing required keys in custom scoring response: %s",
-                             ", ".join(str(arg) for arg in e.args))
-                reasoning = f"Error in evaluator from parsing judge LLM response. Missing required key(s): {', '.join(str(arg) for arg in e.args)}"
-                raise
-        except (KeyError, ValueError) as e:
-            logger.error("Error parsing judge LLM response: %s", e)
-            score = 0.0
-            reasoning = "Error in evaluator from parsing judge LLM response."
+        # Initialize default values to handle service errors
+        coverage_score = 0.0
+        correctness_score = 0.0
+        relevance_score = 0.0
+        reasoning = "Error in evaluator from parsing judge LLM response."
 
+        try:
+            parsed_response = llm_input_response_parser.parse(response.content)
             if self.default_scoring:
-
-                    "
-                    "
-                    "
-                    "
-
-
-
-                }
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-            # Update the progress bar
-            pbar.update(1)
-            return result
+                try:
+                    coverage_score = parsed_response["coverage_score"]
+                    correctness_score = parsed_response["correctness_score"]
+                    relevance_score = parsed_response["relevance_score"]
+                    reasoning = parsed_response["reasoning"]
+                except KeyError as e:
+                    logger.error("Missing required keys in default scoring response: %s",
+                                 ", ".join(str(arg) for arg in e.args))
+                    reasoning = f"Error in evaluator from parsing judge LLM response. Missing required key(s): {', '.join(str(arg) for arg in e.args)}"
+
+                coverage_weight = self.default_score_weights.get("coverage", 1 / 3)
+                correctness_weight = self.default_score_weights.get("correctness", 1 / 3)
+                relevance_weight = self.default_score_weights.get("relevance", 1 / 3)
+
+                # Calculate score
+                total_weight = coverage_weight + correctness_weight + relevance_weight
+                coverage_weight = coverage_weight / total_weight
+                correctness_weight = correctness_weight / total_weight
+                relevance_weight = relevance_weight / total_weight
+
+                if round(coverage_weight + correctness_weight + relevance_weight, 2) != 1:
+                    logger.warning("The sum of the default score weights is not 1. The weights will be normalized.")
+                    coverage_weight = coverage_weight / (coverage_weight + correctness_weight + relevance_weight)
+                    correctness_weight = correctness_weight / (coverage_weight + correctness_weight + relevance_weight)
+                    relevance_weight = relevance_weight / (coverage_weight + correctness_weight + relevance_weight)
+
+                score = (coverage_weight * coverage_score + correctness_weight * correctness_score +
+                         relevance_weight * relevance_score)
 
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-            # Compute average score
-            avg_score = round(sum(sample_scores) / len(sample_scores), 2) if sample_scores else 0.0
-
-            # Construct EvalOutputItems
-            eval_output_items = [
-                EvalOutputItem(id=item.id, score=score, reasoning=reasoning)
-                for item, score, reasoning in zip(eval_input.eval_input_items, sample_scores, sample_reasonings)
-            ]
+            else:
+                try:
+                    score = parsed_response["score"]
+                    reasoning = parsed_response["reasoning"]
+                except KeyError as e:
+                    logger.error("Missing required keys in custom scoring response: %s",
+                                 ", ".join(str(arg) for arg in e.args))
+                    reasoning = f"Error in evaluator from parsing judge LLM response. Missing required key(s): {', '.join(str(arg) for arg in e.args)}"
+                    raise
+        except (KeyError, ValueError) as e:
+            logger.error("Error parsing judge LLM response: %s", e)
+            score = 0.0
+            reasoning = "Error in evaluator from parsing judge LLM response."
 
-
+        if self.default_scoring:
+            reasoning = {
+                "question": question,
+                "answer_description": answer_description,
+                "generated_answer": generated_answer,
+                "score_breakdown": {
+                    "coverage_score": coverage_score,
+                    "correctness_score": correctness_score,
+                    "relevance_score": relevance_score,
+                },
+                "reasoning": reasoning,
+            }
+        else:
+            reasoning = {
+                "question": question,
+                "answer_description": answer_description,
+                "generated_answer": generated_answer,
+                "reasoning": reasoning
+            }
+
+        return EvalOutputItem(id=item.id, score=score, reasoning=reasoning)
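The new runnable_with_retries helper above wraps the judge LLM call in a LangChain runnable with retry semantics. Below is a self-contained sketch of the same pattern; flaky_call is a hypothetical stand-in for the LLM call, and the with_retry arguments mirror the defaults used in the diff (exponential_jitter_params omitted):

import asyncio

from langchain_core.runnables import RunnableLambda

attempts = 0


async def flaky_call(prompt: str) -> str:
    # Hypothetical stand-in for self.llm.ainvoke: fails twice, then succeeds.
    global attempts
    attempts += 1
    if attempts < 3:
        raise RuntimeError("transient judge LLM error")
    return f"scored: {prompt}"


async def main():
    retrying = RunnableLambda(flaky_call).with_retry(
        retry_if_exception_type=(Exception, ),  # retry on any error
        wait_exponential_jitter=True,  # exponential backoff with jitter
        stop_after_attempt=3,  # give up after three attempts
    )
    print(await retrying.ainvoke("How well does the answer cover the reference?"))


asyncio.run(main())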
aiq/eval/tunable_rag_evaluator/register.py
CHANGED
@@ -26,6 +26,7 @@ from aiq.data_models.evaluator import EvaluatorBaseConfig
 class TunableRagEvaluatorConfig(EvaluatorBaseConfig, name="tunable_rag_evaluator"):
     '''Configuration for tunable RAG evaluator'''
     llm_name: LLMRef = Field(description="Name of the judge LLM")
+    llm_retry_control_params: dict | None = Field(description="Parameters to control LLM retry behavior", default=None)
     judge_llm_prompt: str = Field(description="LLM prompt for the judge LLM")
     default_scoring: bool = Field(description="Whether to use default scoring", default=False)
     default_score_weights: dict = Field(
@@ -43,6 +44,7 @@ async def register_tunable_rag_evaluator(config: TunableRagEvaluatorConfig, buil
     llm = await builder.get_llm(config.llm_name, wrapper_type=LLMFrameworkEnum.LANGCHAIN)
     evaluator = TunableRagEvaluator(llm,
                                     config.judge_llm_prompt,
+                                    config.llm_retry_control_params,
                                     builder.get_max_concurrency(),
                                     config.default_scoring,
                                     config.default_score_weights)