aiqtoolkit 1.2.0.dev0-py3-none-any.whl → 1.2.0rc2-py3-none-any.whl

This diff compares publicly available package versions released to one of the supported registries. It is provided for informational purposes only and reflects the changes between those versions as they appear in their public registries.

Note: this version of aiqtoolkit has been flagged as a potentially problematic release.

Files changed (220)
  1. aiq/agent/base.py +170 -8
  2. aiq/agent/dual_node.py +1 -1
  3. aiq/agent/react_agent/agent.py +146 -112
  4. aiq/agent/react_agent/prompt.py +1 -6
  5. aiq/agent/react_agent/register.py +36 -35
  6. aiq/agent/rewoo_agent/agent.py +36 -35
  7. aiq/agent/rewoo_agent/register.py +2 -2
  8. aiq/agent/tool_calling_agent/agent.py +3 -7
  9. aiq/agent/tool_calling_agent/register.py +1 -1
  10. aiq/authentication/__init__.py +14 -0
  11. aiq/authentication/api_key/__init__.py +14 -0
  12. aiq/authentication/api_key/api_key_auth_provider.py +92 -0
  13. aiq/authentication/api_key/api_key_auth_provider_config.py +124 -0
  14. aiq/authentication/api_key/register.py +26 -0
  15. aiq/authentication/exceptions/__init__.py +14 -0
  16. aiq/authentication/exceptions/api_key_exceptions.py +38 -0
  17. aiq/authentication/exceptions/auth_code_grant_exceptions.py +86 -0
  18. aiq/authentication/exceptions/call_back_exceptions.py +38 -0
  19. aiq/authentication/exceptions/request_exceptions.py +54 -0
  20. aiq/authentication/http_basic_auth/__init__.py +0 -0
  21. aiq/authentication/http_basic_auth/http_basic_auth_provider.py +81 -0
  22. aiq/authentication/http_basic_auth/register.py +30 -0
  23. aiq/authentication/interfaces.py +93 -0
  24. aiq/authentication/oauth2/__init__.py +14 -0
  25. aiq/authentication/oauth2/oauth2_auth_code_flow_provider.py +107 -0
  26. aiq/authentication/oauth2/oauth2_auth_code_flow_provider_config.py +39 -0
  27. aiq/authentication/oauth2/register.py +25 -0
  28. aiq/authentication/register.py +21 -0
  29. aiq/builder/builder.py +64 -2
  30. aiq/builder/component_utils.py +16 -3
  31. aiq/builder/context.py +37 -0
  32. aiq/builder/eval_builder.py +43 -2
  33. aiq/builder/function.py +44 -12
  34. aiq/builder/function_base.py +1 -1
  35. aiq/builder/intermediate_step_manager.py +6 -8
  36. aiq/builder/user_interaction_manager.py +3 -0
  37. aiq/builder/workflow.py +23 -18
  38. aiq/builder/workflow_builder.py +421 -61
  39. aiq/cli/commands/info/list_mcp.py +103 -16
  40. aiq/cli/commands/sizing/__init__.py +14 -0
  41. aiq/cli/commands/sizing/calc.py +294 -0
  42. aiq/cli/commands/sizing/sizing.py +27 -0
  43. aiq/cli/commands/start.py +2 -1
  44. aiq/cli/entrypoint.py +2 -0
  45. aiq/cli/register_workflow.py +80 -0
  46. aiq/cli/type_registry.py +151 -30
  47. aiq/data_models/api_server.py +124 -12
  48. aiq/data_models/authentication.py +231 -0
  49. aiq/data_models/common.py +35 -7
  50. aiq/data_models/component.py +17 -9
  51. aiq/data_models/component_ref.py +33 -0
  52. aiq/data_models/config.py +60 -3
  53. aiq/data_models/dataset_handler.py +2 -1
  54. aiq/data_models/embedder.py +1 -0
  55. aiq/data_models/evaluate.py +23 -0
  56. aiq/data_models/function_dependencies.py +8 -0
  57. aiq/data_models/interactive.py +10 -1
  58. aiq/data_models/intermediate_step.py +38 -5
  59. aiq/data_models/its_strategy.py +30 -0
  60. aiq/data_models/llm.py +1 -0
  61. aiq/data_models/memory.py +1 -0
  62. aiq/data_models/object_store.py +44 -0
  63. aiq/data_models/profiler.py +1 -0
  64. aiq/data_models/retry_mixin.py +35 -0
  65. aiq/data_models/span.py +187 -0
  66. aiq/data_models/telemetry_exporter.py +2 -2
  67. aiq/embedder/nim_embedder.py +2 -1
  68. aiq/embedder/openai_embedder.py +2 -1
  69. aiq/eval/config.py +19 -1
  70. aiq/eval/dataset_handler/dataset_handler.py +87 -2
  71. aiq/eval/evaluate.py +208 -27
  72. aiq/eval/evaluator/base_evaluator.py +73 -0
  73. aiq/eval/evaluator/evaluator_model.py +1 -0
  74. aiq/eval/intermediate_step_adapter.py +11 -5
  75. aiq/eval/rag_evaluator/evaluate.py +55 -15
  76. aiq/eval/rag_evaluator/register.py +6 -1
  77. aiq/eval/remote_workflow.py +7 -2
  78. aiq/eval/runners/__init__.py +14 -0
  79. aiq/eval/runners/config.py +39 -0
  80. aiq/eval/runners/multi_eval_runner.py +54 -0
  81. aiq/eval/trajectory_evaluator/evaluate.py +22 -65
  82. aiq/eval/tunable_rag_evaluator/evaluate.py +150 -168
  83. aiq/eval/tunable_rag_evaluator/register.py +2 -0
  84. aiq/eval/usage_stats.py +41 -0
  85. aiq/eval/utils/output_uploader.py +10 -1
  86. aiq/eval/utils/weave_eval.py +184 -0
  87. aiq/experimental/__init__.py +0 -0
  88. aiq/experimental/decorators/__init__.py +0 -0
  89. aiq/experimental/decorators/experimental_warning_decorator.py +130 -0
  90. aiq/experimental/inference_time_scaling/__init__.py +0 -0
  91. aiq/experimental/inference_time_scaling/editing/__init__.py +0 -0
  92. aiq/experimental/inference_time_scaling/editing/iterative_plan_refinement_editor.py +147 -0
  93. aiq/experimental/inference_time_scaling/editing/llm_as_a_judge_editor.py +204 -0
  94. aiq/experimental/inference_time_scaling/editing/motivation_aware_summarization.py +107 -0
  95. aiq/experimental/inference_time_scaling/functions/__init__.py +0 -0
  96. aiq/experimental/inference_time_scaling/functions/execute_score_select_function.py +105 -0
  97. aiq/experimental/inference_time_scaling/functions/its_tool_orchestration_function.py +205 -0
  98. aiq/experimental/inference_time_scaling/functions/its_tool_wrapper_function.py +146 -0
  99. aiq/experimental/inference_time_scaling/functions/plan_select_execute_function.py +224 -0
  100. aiq/experimental/inference_time_scaling/models/__init__.py +0 -0
  101. aiq/experimental/inference_time_scaling/models/editor_config.py +132 -0
  102. aiq/experimental/inference_time_scaling/models/its_item.py +48 -0
  103. aiq/experimental/inference_time_scaling/models/scoring_config.py +112 -0
  104. aiq/experimental/inference_time_scaling/models/search_config.py +120 -0
  105. aiq/experimental/inference_time_scaling/models/selection_config.py +154 -0
  106. aiq/experimental/inference_time_scaling/models/stage_enums.py +43 -0
  107. aiq/experimental/inference_time_scaling/models/strategy_base.py +66 -0
  108. aiq/experimental/inference_time_scaling/models/tool_use_config.py +41 -0
  109. aiq/experimental/inference_time_scaling/register.py +36 -0
  110. aiq/experimental/inference_time_scaling/scoring/__init__.py +0 -0
  111. aiq/experimental/inference_time_scaling/scoring/llm_based_agent_scorer.py +168 -0
  112. aiq/experimental/inference_time_scaling/scoring/llm_based_plan_scorer.py +168 -0
  113. aiq/experimental/inference_time_scaling/scoring/motivation_aware_scorer.py +111 -0
  114. aiq/experimental/inference_time_scaling/search/__init__.py +0 -0
  115. aiq/experimental/inference_time_scaling/search/multi_llm_planner.py +128 -0
  116. aiq/experimental/inference_time_scaling/search/multi_query_retrieval_search.py +122 -0
  117. aiq/experimental/inference_time_scaling/search/single_shot_multi_plan_planner.py +128 -0
  118. aiq/experimental/inference_time_scaling/selection/__init__.py +0 -0
  119. aiq/experimental/inference_time_scaling/selection/best_of_n_selector.py +63 -0
  120. aiq/experimental/inference_time_scaling/selection/llm_based_agent_output_selector.py +131 -0
  121. aiq/experimental/inference_time_scaling/selection/llm_based_output_merging_selector.py +159 -0
  122. aiq/experimental/inference_time_scaling/selection/llm_based_plan_selector.py +128 -0
  123. aiq/experimental/inference_time_scaling/selection/threshold_selector.py +58 -0
  124. aiq/front_ends/console/authentication_flow_handler.py +233 -0
  125. aiq/front_ends/console/console_front_end_plugin.py +11 -2
  126. aiq/front_ends/fastapi/auth_flow_handlers/__init__.py +0 -0
  127. aiq/front_ends/fastapi/auth_flow_handlers/http_flow_handler.py +27 -0
  128. aiq/front_ends/fastapi/auth_flow_handlers/websocket_flow_handler.py +107 -0
  129. aiq/front_ends/fastapi/fastapi_front_end_config.py +93 -9
  130. aiq/front_ends/fastapi/fastapi_front_end_controller.py +68 -0
  131. aiq/front_ends/fastapi/fastapi_front_end_plugin.py +14 -1
  132. aiq/front_ends/fastapi/fastapi_front_end_plugin_worker.py +537 -52
  133. aiq/front_ends/fastapi/html_snippets/__init__.py +14 -0
  134. aiq/front_ends/fastapi/html_snippets/auth_code_grant_success.py +35 -0
  135. aiq/front_ends/fastapi/job_store.py +47 -25
  136. aiq/front_ends/fastapi/main.py +2 -0
  137. aiq/front_ends/fastapi/message_handler.py +108 -89
  138. aiq/front_ends/fastapi/step_adaptor.py +2 -1
  139. aiq/llm/aws_bedrock_llm.py +57 -0
  140. aiq/llm/nim_llm.py +2 -1
  141. aiq/llm/openai_llm.py +3 -2
  142. aiq/llm/register.py +1 -0
  143. aiq/meta/pypi.md +12 -12
  144. aiq/object_store/__init__.py +20 -0
  145. aiq/object_store/in_memory_object_store.py +74 -0
  146. aiq/object_store/interfaces.py +84 -0
  147. aiq/object_store/models.py +36 -0
  148. aiq/object_store/register.py +20 -0
  149. aiq/observability/__init__.py +14 -0
  150. aiq/observability/exporter/__init__.py +14 -0
  151. aiq/observability/exporter/base_exporter.py +449 -0
  152. aiq/observability/exporter/exporter.py +78 -0
  153. aiq/observability/exporter/file_exporter.py +33 -0
  154. aiq/observability/exporter/processing_exporter.py +269 -0
  155. aiq/observability/exporter/raw_exporter.py +52 -0
  156. aiq/observability/exporter/span_exporter.py +264 -0
  157. aiq/observability/exporter_manager.py +335 -0
  158. aiq/observability/mixin/__init__.py +14 -0
  159. aiq/observability/mixin/batch_config_mixin.py +26 -0
  160. aiq/observability/mixin/collector_config_mixin.py +23 -0
  161. aiq/observability/mixin/file_mixin.py +288 -0
  162. aiq/observability/mixin/file_mode.py +23 -0
  163. aiq/observability/mixin/resource_conflict_mixin.py +134 -0
  164. aiq/observability/mixin/serialize_mixin.py +61 -0
  165. aiq/observability/mixin/type_introspection_mixin.py +183 -0
  166. aiq/observability/processor/__init__.py +14 -0
  167. aiq/observability/processor/batching_processor.py +316 -0
  168. aiq/observability/processor/intermediate_step_serializer.py +28 -0
  169. aiq/observability/processor/processor.py +68 -0
  170. aiq/observability/register.py +36 -39
  171. aiq/observability/utils/__init__.py +14 -0
  172. aiq/observability/utils/dict_utils.py +236 -0
  173. aiq/observability/utils/time_utils.py +31 -0
  174. aiq/profiler/calc/__init__.py +14 -0
  175. aiq/profiler/calc/calc_runner.py +623 -0
  176. aiq/profiler/calc/calculations.py +288 -0
  177. aiq/profiler/calc/data_models.py +176 -0
  178. aiq/profiler/calc/plot.py +345 -0
  179. aiq/profiler/callbacks/langchain_callback_handler.py +22 -10
  180. aiq/profiler/data_models.py +24 -0
  181. aiq/profiler/inference_metrics_model.py +3 -0
  182. aiq/profiler/inference_optimization/bottleneck_analysis/nested_stack_analysis.py +8 -0
  183. aiq/profiler/inference_optimization/data_models.py +2 -2
  184. aiq/profiler/inference_optimization/llm_metrics.py +2 -2
  185. aiq/profiler/profile_runner.py +61 -21
  186. aiq/runtime/loader.py +9 -3
  187. aiq/runtime/runner.py +23 -9
  188. aiq/runtime/session.py +25 -7
  189. aiq/runtime/user_metadata.py +2 -3
  190. aiq/tool/chat_completion.py +74 -0
  191. aiq/tool/code_execution/README.md +152 -0
  192. aiq/tool/code_execution/code_sandbox.py +151 -72
  193. aiq/tool/code_execution/local_sandbox/.gitignore +1 -0
  194. aiq/tool/code_execution/local_sandbox/local_sandbox_server.py +139 -24
  195. aiq/tool/code_execution/local_sandbox/sandbox.requirements.txt +3 -1
  196. aiq/tool/code_execution/local_sandbox/start_local_sandbox.sh +27 -2
  197. aiq/tool/code_execution/register.py +7 -3
  198. aiq/tool/code_execution/test_code_execution_sandbox.py +414 -0
  199. aiq/tool/mcp/exceptions.py +142 -0
  200. aiq/tool/mcp/mcp_client.py +41 -6
  201. aiq/tool/mcp/mcp_tool.py +3 -2
  202. aiq/tool/register.py +1 -0
  203. aiq/tool/server_tools.py +6 -3
  204. aiq/utils/exception_handlers/automatic_retries.py +289 -0
  205. aiq/utils/exception_handlers/mcp.py +211 -0
  206. aiq/utils/io/model_processing.py +28 -0
  207. aiq/utils/log_utils.py +37 -0
  208. aiq/utils/string_utils.py +38 -0
  209. aiq/utils/type_converter.py +18 -2
  210. aiq/utils/type_utils.py +87 -0
  211. {aiqtoolkit-1.2.0.dev0.dist-info → aiqtoolkit-1.2.0rc2.dist-info}/METADATA +53 -21
  212. aiqtoolkit-1.2.0rc2.dist-info/RECORD +436 -0
  213. {aiqtoolkit-1.2.0.dev0.dist-info → aiqtoolkit-1.2.0rc2.dist-info}/WHEEL +1 -1
  214. {aiqtoolkit-1.2.0.dev0.dist-info → aiqtoolkit-1.2.0rc2.dist-info}/entry_points.txt +3 -0
  215. aiq/front_ends/fastapi/websocket.py +0 -148
  216. aiq/observability/async_otel_listener.py +0 -429
  217. aiqtoolkit-1.2.0.dev0.dist-info/RECORD +0 -316
  218. {aiqtoolkit-1.2.0.dev0.dist-info → aiqtoolkit-1.2.0rc2.dist-info}/licenses/LICENSE-3rd-party.txt +0 -0
  219. {aiqtoolkit-1.2.0.dev0.dist-info → aiqtoolkit-1.2.0rc2.dist-info}/licenses/LICENSE.md +0 -0
  220. {aiqtoolkit-1.2.0.dev0.dist-info → aiqtoolkit-1.2.0rc2.dist-info}/top_level.txt +0 -0
aiq/eval/remote_workflow.py

@@ -24,6 +24,7 @@ from tqdm import tqdm
 from aiq.data_models.api_server import AIQResponseIntermediateStep
 from aiq.data_models.intermediate_step import IntermediateStep
 from aiq.data_models.intermediate_step import IntermediateStepPayload
+from aiq.data_models.invocation_node import InvocationNode
 from aiq.eval.config import EvaluationRunConfig
 from aiq.eval.evaluator.evaluator_model import EvalInput
 from aiq.eval.evaluator.evaluator_model import EvalInputItem
@@ -81,8 +82,12 @@ class EvaluationRemoteWorkflowHandler:
                 step_data = json.loads(line[len(INTERMEDIATE_DATA_PREFIX):])
                 response_intermediate = AIQResponseIntermediateStep.model_validate(step_data)
                 # The payload is expected to be IntermediateStepPayload
-                intermediate_step = IntermediateStep(
-                    payload=IntermediateStepPayload.model_validate_json(response_intermediate.payload))
+                payload = IntermediateStepPayload.model_validate_json(response_intermediate.payload)
+                intermediate_step = IntermediateStep(parent_id="remote",
+                                                     function_ancestry=InvocationNode(
+                                                         function_name=payload.name or "remote_function",
+                                                         function_id=payload.UUID or "remote_function_id"),
+                                                     payload=payload)
                 intermediate_steps.append(intermediate_step)
             except (json.JSONDecodeError, ValidationError) as e:
                 logger.error("Failed to parse intermediate step: %s", e)
aiq/eval/runners/__init__.py

@@ -0,0 +1,14 @@
+# SPDX-FileCopyrightText: Copyright (c) 2025, NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+# SPDX-License-Identifier: Apache-2.0
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
aiq/eval/runners/config.py

@@ -0,0 +1,39 @@
+# SPDX-FileCopyrightText: Copyright (c) 2025, NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+# SPDX-License-Identifier: Apache-2.0
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+import typing
+
+from pydantic import BaseModel
+
+from aiq.eval.config import EvaluationRunConfig
+from aiq.eval.config import EvaluationRunOutput
+
+
+class MultiEvaluationRunConfig(BaseModel):
+    """
+    Parameters used for a multi-evaluation run.
+    This includes a dict of configs. The key is an id of any type.
+    Each pass loads the config, applies the overrides and runs to completion
+    before the next pass starts.
+    """
+    configs: dict[typing.Any, EvaluationRunConfig]
+
+
+class MultiEvaluationRunOutput(BaseModel):
+    """
+    Output of a multi-evaluation run.
+    The results per-pass are accumulated in the evaluation_run_outputs dict.
+    """
+    evaluation_run_outputs: dict[typing.Any, EvaluationRunOutput]
aiq/eval/runners/multi_eval_runner.py

@@ -0,0 +1,54 @@
+# SPDX-FileCopyrightText: Copyright (c) 2025, NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+# SPDX-License-Identifier: Apache-2.0
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+import copy
+import typing
+
+from aiq.eval.config import EvaluationRunConfig
+from aiq.eval.config import EvaluationRunOutput
+from aiq.eval.evaluate import EvaluationRun
+from aiq.eval.runners.config import MultiEvaluationRunConfig
+
+
+class MultiEvaluationRunner:
+    """
+    Run a multi-evaluation run.
+    """
+
+    def __init__(self, config: MultiEvaluationRunConfig):
+        """
+        Initialize a multi-evaluation run.
+        """
+        self.config = config
+        self.evaluation_run_outputs: dict[typing.Any, EvaluationRunOutput] = {}
+
+    async def run_all(self):
+        """
+        Run all evaluations defined by the overrides.
+        """
+        for id, config in self.config.configs.items():
+            output = await self.run_single_evaluation(id, config)
+            self.evaluation_run_outputs[id] = output
+
+        return self.evaluation_run_outputs
+
+    async def run_single_evaluation(self, id: typing.Any, config: EvaluationRunConfig) -> EvaluationRunOutput:
+        """
+        Run a single evaluation and return the output.
+        """
+        # copy the config in case the caller is using the same config for multiple evaluations
+        config_copy = copy.deepcopy(config)
+        evaluation_run = EvaluationRun(config_copy)
+        return await evaluation_run.run_and_evaluate()
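
For context, a minimal usage sketch of the new multi-evaluation runner (not part of the diff). Only MultiEvaluationRunConfig, MultiEvaluationRunner, run_all and the id-keyed outputs dict come from the code above; the EvaluationRunConfig keyword argument shown here (config_file) is an assumption.

import asyncio

from aiq.eval.config import EvaluationRunConfig
from aiq.eval.runners.config import MultiEvaluationRunConfig
from aiq.eval.runners.multi_eval_runner import MultiEvaluationRunner


async def main():
    # One EvaluationRunConfig per pass, keyed by an id of any type.
    configs = {
        "baseline": EvaluationRunConfig(config_file="configs/eval_baseline.yml"),    # hypothetical kwargs
        "candidate": EvaluationRunConfig(config_file="configs/eval_candidate.yml"),  # hypothetical kwargs
    }
    runner = MultiEvaluationRunner(MultiEvaluationRunConfig(configs=configs))
    outputs = await runner.run_all()  # dict of EvaluationRunOutput keyed by the same ids
    for run_id, output in outputs.items():
        print(run_id, output)


asyncio.run(main())

Each pass runs to completion before the next starts (the runner awaits run_single_evaluation inside the loop), and each config is deep-copied so the same EvaluationRunConfig object can be reused across passes.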
aiq/eval/trajectory_evaluator/evaluate.py

@@ -13,24 +13,20 @@
 # See the License for the specific language governing permissions and
 # limitations under the License.
 
-import asyncio
 import logging
 
 from langchain.evaluation import TrajectoryEvalChain
 from langchain_core.language_models import BaseChatModel
 from langchain_core.tools import BaseTool
-from tqdm import tqdm
 
-from aiq.eval.evaluator.evaluator_model import EvalInput
+from aiq.eval.evaluator.base_evaluator import BaseEvaluator
 from aiq.eval.evaluator.evaluator_model import EvalInputItem
-from aiq.eval.evaluator.evaluator_model import EvalOutput
 from aiq.eval.evaluator.evaluator_model import EvalOutputItem
-from aiq.eval.utils.tqdm_position_registry import TqdmPositionRegistry
 
 logger = logging.getLogger(__name__)
 
 
-class TrajectoryEvaluator:
+class TrajectoryEvaluator(BaseEvaluator):
 
     def __init__(
         self,
@@ -38,11 +34,9 @@ class TrajectoryEvaluator:
         tools: list[BaseTool] | None = None,
         max_concurrency: int = 8,
     ):
-
+        super().__init__(max_concurrency=max_concurrency, tqdm_desc="Evaluating Trajectory")
         self.llm = llm
         self.tools = tools
-        self.max_concurrency = max_concurrency
-        self.semaphore = asyncio.Semaphore(self.max_concurrency)
        # Initialize trajectory evaluation chain
         self.traj_eval_chain = TrajectoryEvalChain.from_llm(llm=self.llm,
                                                             tools=self.tools,
@@ -50,69 +44,32 @@
                                                             requires_reference=True)
         logger.debug("Trajectory evaluation chain initialized.")
 
-    async def evaluate(self, eval_input: EvalInput) -> EvalOutput:
+    async def evaluate_item(self, item: EvalInputItem) -> EvalOutputItem:
         """
-        Evaluates the agent trajectories using trajectory evaluation chain.
+        Evaluate a single EvalInputItem and return an EvalOutputItem.
         """
-
-        num_records = len(eval_input.eval_input_items)
-        logger.info("Running trajectory evaluation with %d records", num_records)
         from aiq.data_models.intermediate_step import IntermediateStepType
         from aiq.eval.intermediate_step_adapter import IntermediateStepAdapter
 
         intermediate_step_adapter = IntermediateStepAdapter()
         event_filter = [IntermediateStepType.LLM_END, IntermediateStepType.TOOL_END]
 
-        async def process_item(item: EvalInputItem) -> tuple[float, dict]:
-            """
-            Evaluate a single EvalInputItem asynchronously and return a tuple of-
-            1. score
-            2. reasoning for the score
-            """
-            question = item.input_obj
-            generated_answer = item.output_obj
-            agent_trajectory = intermediate_step_adapter.get_agent_actions(item.trajectory, event_filter)
-            try:
-                eval_result = await self.traj_eval_chain.aevaluate_agent_trajectory(
-                    input=question,
-                    agent_trajectory=agent_trajectory,
-                    prediction=generated_answer,
-                )
-            except Exception as e:
-                logger.exception("Error evaluating trajectory for question: %s, Error: %s", question, e, exc_info=True)
-                return 0.0, f"Error evaluating trajectory: {e}"
-
-            reasoning = {
-                "reasoning": eval_result["reasoning"],
-                "trajectory": [(action.model_dump(), output) for (action, output) in agent_trajectory]
-            }
-            return eval_result["score"], reasoning
-
-        async def wrapped_process(item: EvalInputItem) -> tuple[float, dict]:
-            async with self.semaphore:
-                result = await process_item(item)
-                pbar.update(1)
-                return result
+        question = item.input_obj
+        generated_answer = item.output_obj
+        agent_trajectory = intermediate_step_adapter.get_agent_actions(item.trajectory, event_filter)
 
-        # Execute all evaluations asynchronously
         try:
-            tqdm_position = TqdmPositionRegistry.claim()
-            pbar = tqdm(total=len(eval_input.eval_input_items), desc="Evaluating Trajectory", position=tqdm_position)
-            results = await asyncio.gather(*[wrapped_process(item) for item in eval_input.eval_input_items])
-        finally:
-            pbar.close()
-            TqdmPositionRegistry.release(tqdm_position)
-
-        # Extract scores and reasonings
-        sample_scores, sample_reasonings = zip(*results) if results else ([], [])
-
-        # Compute average score
-        avg_score = round(sum(sample_scores) / len(sample_scores), 2) if sample_scores else 0.0
-
-        # Construct EvalOutputItems
-        eval_output_items = [
-            EvalOutputItem(id=item.id, score=score, reasoning=reasoning)
-            for item, score, reasoning in zip(eval_input.eval_input_items, sample_scores, sample_reasonings)
-        ]
-
-        return EvalOutput(average_score=avg_score, eval_output_items=eval_output_items)
+            eval_result = await self.traj_eval_chain.aevaluate_agent_trajectory(
+                input=question,
+                agent_trajectory=agent_trajectory,
+                prediction=generated_answer,
+            )
+        except Exception as e:
+            logger.exception("Error evaluating trajectory for question: %s, Error: %s", question, e, exc_info=True)
+            return EvalOutputItem(id=item.id, score=0.0, reasoning=f"Error evaluating trajectory: {e}")
+
+        reasoning = {
+            "reasoning": eval_result["reasoning"],
+            "trajectory": [(action.model_dump(), output) for (action, output) in agent_trajectory]
+        }
+        return EvalOutputItem(id=item.id, score=eval_result["score"], reasoning=reasoning)
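
Both this evaluator and the tunable RAG evaluator below now delegate concurrency control, progress reporting and score averaging to the new BaseEvaluator (aiq/eval/evaluator/base_evaluator.py, +73 lines, not excerpted here). The following is only a rough sketch of the contract implied by the refactor; every name other than evaluate_item, max_concurrency and tqdm_desc is an assumption.

import asyncio
from abc import ABC, abstractmethod

from tqdm import tqdm


class BaseEvaluatorSketch(ABC):
    """Approximation only; the real BaseEvaluator may differ in names and return types."""

    def __init__(self, max_concurrency: int = 8, tqdm_desc: str = "Evaluating"):
        self.semaphore = asyncio.Semaphore(max_concurrency)
        self.tqdm_desc = tqdm_desc

    @abstractmethod
    async def evaluate_item(self, item):
        """Subclasses score one item and return an EvalOutputItem-like object."""

    async def evaluate(self, eval_input):
        """Fan evaluate_item out over all items with bounded concurrency and a progress bar."""
        pbar = tqdm(total=len(eval_input.eval_input_items), desc=self.tqdm_desc)

        async def bounded(item):
            async with self.semaphore:
                output = await self.evaluate_item(item)
                pbar.update(1)
                return output

        try:
            output_items = await asyncio.gather(*(bounded(i) for i in eval_input.eval_input_items))
        finally:
            pbar.close()

        scores = [o.score for o in output_items]
        average = round(sum(scores) / len(scores), 2) if scores else 0.0
        return average, output_items

This mirrors the semaphore, tqdm and average-score logic deleted from both evaluators in this release; the real base class presumably assembles an EvalOutput rather than returning a tuple.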
aiq/eval/tunable_rag_evaluator/evaluate.py

@@ -15,19 +15,19 @@
 
 import asyncio
 import logging
+from typing import Callable
 
 from langchain.output_parsers import ResponseSchema
 from langchain.output_parsers import StructuredOutputParser
 from langchain.schema import HumanMessage
 from langchain.schema import SystemMessage
 from langchain_core.language_models import BaseChatModel
+from langchain_core.runnables import RunnableLambda
 from tqdm import tqdm
 
-from aiq.eval.evaluator.evaluator_model import EvalInput
+from aiq.eval.evaluator.base_evaluator import BaseEvaluator
 from aiq.eval.evaluator.evaluator_model import EvalInputItem
-from aiq.eval.evaluator.evaluator_model import EvalOutput
 from aiq.eval.evaluator.evaluator_model import EvalOutputItem
-from aiq.eval.utils.tqdm_position_registry import TqdmPositionRegistry
 
 logger = logging.getLogger(__name__)
 
@@ -69,195 +69,177 @@ def evaluation_prompt(judge_llm_prompt: str,
     return EVAL_PROMPT if not default_scoring else DEFAULT_EVAL_PROMPT
 
 
-class TunableRagEvaluator:
+def runnable_with_retries(original_fn: Callable, llm_retry_control_params: dict | None = None):
+    runnable = RunnableLambda(original_fn)
+
+    if llm_retry_control_params is None:
+        llm_retry_control_params = {
+            "stop_after_attempt": 3, "initial_backoff_delay_seconds": 1, "has_exponential_jitter": True
+        }
+
+    if llm_retry_control_params["has_exponential_jitter"] is None:
+        llm_retry_control_params["has_exponential_jitter"] = True
+    if llm_retry_control_params["stop_after_attempt"] is None:
+        llm_retry_control_params["stop_after_attempt"] = 3
+    if llm_retry_control_params["initial_backoff_delay_seconds"] is None:
+        llm_retry_control_params["initial_backoff_delay_seconds"] = 1
+
+    # Add retry logic with exponential backoff and jitter
+    return runnable.with_retry(
+        retry_if_exception_type=(Exception, ),  # Retry on any error
+        wait_exponential_jitter=llm_retry_control_params["has_exponential_jitter"],  # Add jitter to exponential backoff
+        stop_after_attempt=llm_retry_control_params["stop_after_attempt"],
+        exponential_jitter_params={"initial": llm_retry_control_params["initial_backoff_delay_seconds"]
+                                   }  # Optional: set initial backoff (seconds)
+    )
+
+
+class TunableRagEvaluator(BaseEvaluator):
     '''Tunable RAG evaluator class with customizable LLM prompt for scoring.'''
 
     def __init__(self,
                  llm: BaseChatModel,
                  judge_llm_prompt: str,
+                 llm_retry_control_params: dict | None,
                  max_concurrency: int,
                  default_scoring: bool,
                  default_score_weights: dict):
+        super().__init__(max_concurrency=max_concurrency, tqdm_desc="Evaluating RAG")
         self.llm = llm
-        self.max_concurrency = max_concurrency
         self.judge_llm_prompt = judge_llm_prompt
-        self.semaphore = asyncio.Semaphore(self.max_concurrency)
+        self.llm_retry_control_params = llm_retry_control_params
         self.default_scoring = default_scoring
         # Use user-provided weights if available; otherwise, set equal weights for each score
         self.default_score_weights = default_score_weights if default_score_weights else {
             "coverage": 1 / 3, "correctness": 1 / 3, "relevance": 1 / 3
         }
 
-    async def evaluate(self, eval_input: EvalInput) -> EvalOutput:
-        '''Evaluate function'''
-
-        async def process_item(item):
-            """Compute RAG evaluation for an individual item"""
-            question = item.input_obj
-            answer_description = item.expected_output_obj
-            generated_answer = item.output_obj
-
-            # Call judge LLM to generate score
-            score = 0.0
-
-            default_evaluation_schema = [
-                ResponseSchema(
-                    name="coverage_score",
-                    description=
-                    "Score for the coverage of all critical aspects mentioned in the expected answer. Ex. 0.5",
-                    type="float"),
-                ResponseSchema(
-                    name="correctness_score",
-                    description=
-                    "Score for the accuracy of the generated answer compared to the expected answer. Ex. 0.5",
-                    type="float"),
-                ResponseSchema(name="relevance_score",
-                               description="Score for the relevance of the generated answer to the question. Ex. 0.5",
-                               type="float"),
-                ResponseSchema(
-                    name="reasoning",
-                    description=
-                    "1-2 summarized sentences of reasoning for the scores. Ex. 'The generated answer covers all critical aspects mentioned in the expected answer, is correct, and is relevant to the question.'",
-                    type="string"),
-            ]
-
-            custom_evaluation_schema = [
-                ResponseSchema(name="score", description="Score for the generated answer. Ex. 0.5", type="float"),
-                ResponseSchema(
-                    name="reasoning",
-                    description=
-                    "1-2 sentence reasoning for the score. Ex. 'The generated answer is exactly the same as the description of the expected answer.'",
-                    type="string"),
-            ]
+    async def evaluate_item(self, item: EvalInputItem) -> EvalOutputItem:
+        """Compute RAG evaluation for an individual item and return EvalOutputItem"""
+        question = item.input_obj
+        answer_description = item.expected_output_obj
+        generated_answer = item.output_obj
+
+        # Call judge LLM to generate score
+        score = 0.0
+
+        default_evaluation_schema = [
+            ResponseSchema(
+                name="coverage_score",
+                description="Score for the coverage of all critical aspects mentioned in the expected answer. Ex. 0.5",
+                type="float"),
+            ResponseSchema(
+                name="correctness_score",
+                description="Score for the accuracy of the generated answer compared to the expected answer. Ex. 0.5",
+                type="float"),
+            ResponseSchema(name="relevance_score",
                           description="Score for the relevance of the generated answer to the question. Ex. 0.5",
                           type="float"),
+            ResponseSchema(
+                name="reasoning",
+                description=
+                "1-2 summarized sentences of reasoning for the scores. Ex. 'The generated answer covers all critical aspects mentioned in the expected answer, is correct, and is relevant to the question.'",
+                type="string"),
+        ]
 
-            if self.default_scoring:
-                evaluation_schema = default_evaluation_schema
-            else:
-                evaluation_schema = custom_evaluation_schema
+        custom_evaluation_schema = [
+            ResponseSchema(name="score", description="Score for the generated answer. Ex. 0.5", type="float"),
+            ResponseSchema(
+                name="reasoning",
+                description=
+                "1-2 sentence reasoning for the score. Ex. 'The generated answer is exactly the same as the description of the expected answer.'",
+                type="string"),
+        ]
 
-            llm_input_response_parser = StructuredOutputParser.from_response_schemas(evaluation_schema)
-            format_instructions = llm_input_response_parser.get_format_instructions()
+        if self.default_scoring:
+            evaluation_schema = default_evaluation_schema
+        else:
+            evaluation_schema = custom_evaluation_schema
 
-            eval_prompt = evaluation_prompt(judge_llm_prompt=self.judge_llm_prompt,
-                                            question=question,
-                                            answer_description=answer_description,
-                                            generated_answer=generated_answer,
-                                            format_instructions=format_instructions,
-                                            default_scoring=self.default_scoring)
+        llm_input_response_parser = StructuredOutputParser.from_response_schemas(evaluation_schema)
+        format_instructions = llm_input_response_parser.get_format_instructions()
 
-            messages = [
-                SystemMessage(content="You must respond only in JSON format."), HumanMessage(content=eval_prompt)
-            ]
+        eval_prompt = evaluation_prompt(judge_llm_prompt=self.judge_llm_prompt,
+                                        question=question,
+                                        answer_description=answer_description,
+                                        generated_answer=generated_answer,
+                                        format_instructions=format_instructions,
+                                        default_scoring=self.default_scoring)
 
-            response = await self.llm.ainvoke(messages)
+        messages = [SystemMessage(content="You must respond only in JSON format."), HumanMessage(content=eval_prompt)]
 
-            # Initialize default values to handle service errors
-            coverage_score = 0.0
-            correctness_score = 0.0
-            relevance_score = 0.0
-            reasoning = "Error in evaluator from parsing judge LLM response."
+        response = await runnable_with_retries(self.llm.ainvoke, self.llm_retry_control_params).ainvoke(messages)
 
-            try:
-                parsed_response = llm_input_response_parser.parse(response.content)
-                if self.default_scoring:
-                    try:
-                        coverage_score = parsed_response["coverage_score"]
-                        correctness_score = parsed_response["correctness_score"]
-                        relevance_score = parsed_response["relevance_score"]
-                        reasoning = parsed_response["reasoning"]
-                    except KeyError as e:
-                        logger.error("Missing required keys in default scoring response: %s",
                                     ", ".join(str(arg) for arg in e.args))
-                        reasoning = f"Error in evaluator from parsing judge LLM response. Missing required key(s): {', '.join(str(arg) for arg in e.args)}"
-
-                    coverage_weight = self.default_score_weights.get("coverage", 1 / 3)
-                    correctness_weight = self.default_score_weights.get("correctness", 1 / 3)
-                    relevance_weight = self.default_score_weights.get("relevance", 1 / 3)
-
-                    # Calculate score
-                    total_weight = coverage_weight + correctness_weight + relevance_weight
-                    coverage_weight = coverage_weight / total_weight
-                    correctness_weight = correctness_weight / total_weight
-                    relevance_weight = relevance_weight / total_weight
-
-                    if round(coverage_weight + correctness_weight + relevance_weight, 2) != 1:
-                        logger.warning("The sum of the default score weights is not 1. The weights will be normalized.")
-                        coverage_weight = coverage_weight / (coverage_weight + correctness_weight + relevance_weight)
-                        correctness_weight = correctness_weight / (coverage_weight + correctness_weight +
                                                                    relevance_weight)
-                        relevance_weight = relevance_weight / (coverage_weight + correctness_weight + relevance_weight)
-
-                    score = (coverage_weight * coverage_score + correctness_weight * correctness_score +
                              relevance_weight * relevance_score)
-
-                else:
-                    try:
-                        score = parsed_response["score"]
-                        reasoning = parsed_response["reasoning"]
-                    except KeyError as e:
-                        logger.error("Missing required keys in custom scoring response: %s",
                                     ", ".join(str(arg) for arg in e.args))
-                        reasoning = f"Error in evaluator from parsing judge LLM response. Missing required key(s): {', '.join(str(arg) for arg in e.args)}"
-                        raise
-            except (KeyError, ValueError) as e:
-                logger.error("Error parsing judge LLM response: %s", e)
-                score = 0.0
-                reasoning = "Error in evaluator from parsing judge LLM response."
+        # Initialize default values to handle service errors
+        coverage_score = 0.0
+        correctness_score = 0.0
+        relevance_score = 0.0
+        reasoning = "Error in evaluator from parsing judge LLM response."
 
+        try:
+            parsed_response = llm_input_response_parser.parse(response.content)
             if self.default_scoring:
-                reasoning = {
-                    "question": question,
-                    "answer_description": answer_description,
-                    "generated_answer": generated_answer,
-                    "score_breakdown": {
-                        "coverage_score": coverage_score,
-                        "correctness_score": correctness_score,
-                        "relevance_score": relevance_score,
-                    },
-                    "reasoning": reasoning,
-                }
-            else:
-                reasoning = {
-                    "question": question,
-                    "answer_description": answer_description,
-                    "generated_answer": generated_answer,
-                    "reasoning": reasoning
-                }
-
-            return score, reasoning
-
-        async def wrapped_process(item: EvalInputItem) -> tuple[float, dict]:
-            """
-            Process an item asynchronously and update the progress bar.
-            Use the semaphore to limit the number of concurrent items.
-            """
-            async with self.semaphore:
-                result = await process_item(item)
-                # Update the progress bar
-                pbar.update(1)
-                return result
+                try:
+                    coverage_score = parsed_response["coverage_score"]
+                    correctness_score = parsed_response["correctness_score"]
+                    relevance_score = parsed_response["relevance_score"]
+                    reasoning = parsed_response["reasoning"]
+                except KeyError as e:
+                    logger.error("Missing required keys in default scoring response: %s",
                                 ", ".join(str(arg) for arg in e.args))
+                    reasoning = f"Error in evaluator from parsing judge LLM response. Missing required key(s): {', '.join(str(arg) for arg in e.args)}"
+
+                coverage_weight = self.default_score_weights.get("coverage", 1 / 3)
+                correctness_weight = self.default_score_weights.get("correctness", 1 / 3)
+                relevance_weight = self.default_score_weights.get("relevance", 1 / 3)
+
+                # Calculate score
+                total_weight = coverage_weight + correctness_weight + relevance_weight
+                coverage_weight = coverage_weight / total_weight
+                correctness_weight = correctness_weight / total_weight
+                relevance_weight = relevance_weight / total_weight
+
+                if round(coverage_weight + correctness_weight + relevance_weight, 2) != 1:
+                    logger.warning("The sum of the default score weights is not 1. The weights will be normalized.")
+                    coverage_weight = coverage_weight / (coverage_weight + correctness_weight + relevance_weight)
+                    correctness_weight = correctness_weight / (coverage_weight + correctness_weight + relevance_weight)
+                    relevance_weight = relevance_weight / (coverage_weight + correctness_weight + relevance_weight)
+
+                score = (coverage_weight * coverage_score + correctness_weight * correctness_score +
                          relevance_weight * relevance_score)
 
-        try:
-            # Claim a tqdm position to display the progress bar
-            tqdm_position = TqdmPositionRegistry.claim()
-            # Create a progress bar
-            pbar = tqdm(total=len(eval_input.eval_input_items), desc="Evaluating RAG", position=tqdm_position)
-            # Process items concurrently with a limit on concurrency
-            results = await asyncio.gather(*[wrapped_process(item) for item in eval_input.eval_input_items])
-        finally:
-            pbar.close()
-            TqdmPositionRegistry.release(tqdm_position)
-
-        # Extract scores and reasonings
-        sample_scores, sample_reasonings = zip(*results) if results else ([], [])
-
-        # Compute average score
-        avg_score = round(sum(sample_scores) / len(sample_scores), 2) if sample_scores else 0.0
-
-        # Construct EvalOutputItems
-        eval_output_items = [
-            EvalOutputItem(id=item.id, score=score, reasoning=reasoning)
-            for item, score, reasoning in zip(eval_input.eval_input_items, sample_scores, sample_reasonings)
-        ]
+            else:
+                try:
+                    score = parsed_response["score"]
+                    reasoning = parsed_response["reasoning"]
+                except KeyError as e:
+                    logger.error("Missing required keys in custom scoring response: %s",
                                 ", ".join(str(arg) for arg in e.args))
+                    reasoning = f"Error in evaluator from parsing judge LLM response. Missing required key(s): {', '.join(str(arg) for arg in e.args)}"
+                    raise
+        except (KeyError, ValueError) as e:
+            logger.error("Error parsing judge LLM response: %s", e)
+            score = 0.0
+            reasoning = "Error in evaluator from parsing judge LLM response."
 
-        return EvalOutput(average_score=avg_score, eval_output_items=eval_output_items)
+        if self.default_scoring:
+            reasoning = {
+                "question": question,
+                "answer_description": answer_description,
+                "generated_answer": generated_answer,
+                "score_breakdown": {
+                    "coverage_score": coverage_score,
+                    "correctness_score": correctness_score,
+                    "relevance_score": relevance_score,
+                },
+                "reasoning": reasoning,
+            }
+        else:
+            reasoning = {
+                "question": question,
+                "answer_description": answer_description,
+                "generated_answer": generated_answer,
+                "reasoning": reasoning
+            }
+
+        return EvalOutputItem(id=item.id, score=score, reasoning=reasoning)
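
Aside (not part of the diff): a minimal sketch of the retry policy runnable_with_retries builds, assuming the langchain_core Runnable.with_retry keyword arguments used above behave as shown; flaky_judge is a hypothetical stand-in for self.llm.ainvoke.

import asyncio

from langchain_core.runnables import RunnableLambda


async def flaky_judge(prompt: str) -> str:
    # Hypothetical stand-in for llm.ainvoke; imagine transient API errors being raised here.
    return f"scored: {prompt}"


# Same policy shape as runnable_with_retries: up to 3 attempts,
# exponential backoff with jitter, 1 s initial delay.
retrying_judge = RunnableLambda(flaky_judge).with_retry(
    retry_if_exception_type=(Exception, ),
    wait_exponential_jitter=True,              # has_exponential_jitter
    stop_after_attempt=3,                      # stop_after_attempt
    exponential_jitter_params={"initial": 1},  # initial_backoff_delay_seconds
)

print(asyncio.run(retrying_judge.ainvoke("Is the answer grounded?")))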
aiq/eval/tunable_rag_evaluator/register.py

@@ -26,6 +26,7 @@ from aiq.data_models.evaluator import EvaluatorBaseConfig
 class TunableRagEvaluatorConfig(EvaluatorBaseConfig, name="tunable_rag_evaluator"):
     '''Configuration for tunable RAG evaluator'''
     llm_name: LLMRef = Field(description="Name of the judge LLM")
+    llm_retry_control_params: dict | None = Field(description="Parameters to control LLM retry behavior", default=None)
     judge_llm_prompt: str = Field(description="LLM prompt for the judge LLM")
     default_scoring: bool = Field(description="Whether to use default scoring", default=False)
     default_score_weights: dict = Field(
@@ -43,6 +44,7 @@ async def register_tunable_rag_evaluator(config: TunableRagEvaluatorConfig, buil
     llm = await builder.get_llm(config.llm_name, wrapper_type=LLMFrameworkEnum.LANGCHAIN)
     evaluator = TunableRagEvaluator(llm,
                                     config.judge_llm_prompt,
+                                    config.llm_retry_control_params,
                                     builder.get_max_concurrency(),
                                     config.default_scoring,
                                     config.default_score_weights)
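
For reference (not part of the diff), this is the shape the new llm_retry_control_params field accepts. The key names come from runnable_with_retries in the evaluator above; keys explicitly set to None fall back to the defaults shown there (3 attempts, 1 second initial backoff, jitter enabled).

# All three keys are read by runnable_with_retries; omit the field entirely
# (default=None) to get the built-in defaults.
llm_retry_control_params = {
    "stop_after_attempt": 5,             # retry the judge LLM up to 5 times
    "initial_backoff_delay_seconds": 2,  # first backoff delay, in seconds
    "has_exponential_jitter": True,      # add jitter to the exponential backoff
}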