aiqtoolkit 1.1.0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release.


This version of aiqtoolkit might be problematic. Click here for more details.

Files changed (316) hide show
  1. aiq/agent/__init__.py +0 -0
  2. aiq/agent/base.py +76 -0
  3. aiq/agent/dual_node.py +67 -0
  4. aiq/agent/react_agent/__init__.py +0 -0
  5. aiq/agent/react_agent/agent.py +322 -0
  6. aiq/agent/react_agent/output_parser.py +104 -0
  7. aiq/agent/react_agent/prompt.py +46 -0
  8. aiq/agent/react_agent/register.py +148 -0
  9. aiq/agent/reasoning_agent/__init__.py +0 -0
  10. aiq/agent/reasoning_agent/reasoning_agent.py +224 -0
  11. aiq/agent/register.py +23 -0
  12. aiq/agent/rewoo_agent/__init__.py +0 -0
  13. aiq/agent/rewoo_agent/agent.py +410 -0
  14. aiq/agent/rewoo_agent/prompt.py +108 -0
  15. aiq/agent/rewoo_agent/register.py +158 -0
  16. aiq/agent/tool_calling_agent/__init__.py +0 -0
  17. aiq/agent/tool_calling_agent/agent.py +123 -0
  18. aiq/agent/tool_calling_agent/register.py +105 -0
  19. aiq/builder/__init__.py +0 -0
  20. aiq/builder/builder.py +223 -0
  21. aiq/builder/component_utils.py +303 -0
  22. aiq/builder/context.py +227 -0
  23. aiq/builder/embedder.py +24 -0
  24. aiq/builder/eval_builder.py +120 -0
  25. aiq/builder/evaluator.py +29 -0
  26. aiq/builder/framework_enum.py +24 -0
  27. aiq/builder/front_end.py +73 -0
  28. aiq/builder/function.py +297 -0
  29. aiq/builder/function_base.py +376 -0
  30. aiq/builder/function_info.py +627 -0
  31. aiq/builder/intermediate_step_manager.py +176 -0
  32. aiq/builder/llm.py +25 -0
  33. aiq/builder/retriever.py +25 -0
  34. aiq/builder/user_interaction_manager.py +71 -0
  35. aiq/builder/workflow.py +143 -0
  36. aiq/builder/workflow_builder.py +757 -0
  37. aiq/cli/__init__.py +14 -0
  38. aiq/cli/cli_utils/__init__.py +0 -0
  39. aiq/cli/cli_utils/config_override.py +231 -0
  40. aiq/cli/cli_utils/validation.py +37 -0
  41. aiq/cli/commands/__init__.py +0 -0
  42. aiq/cli/commands/configure/__init__.py +0 -0
  43. aiq/cli/commands/configure/channel/__init__.py +0 -0
  44. aiq/cli/commands/configure/channel/add.py +28 -0
  45. aiq/cli/commands/configure/channel/channel.py +36 -0
  46. aiq/cli/commands/configure/channel/remove.py +30 -0
  47. aiq/cli/commands/configure/channel/update.py +30 -0
  48. aiq/cli/commands/configure/configure.py +33 -0
  49. aiq/cli/commands/evaluate.py +139 -0
  50. aiq/cli/commands/info/__init__.py +14 -0
  51. aiq/cli/commands/info/info.py +39 -0
  52. aiq/cli/commands/info/list_channels.py +32 -0
  53. aiq/cli/commands/info/list_components.py +129 -0
  54. aiq/cli/commands/info/list_mcp.py +126 -0
  55. aiq/cli/commands/registry/__init__.py +14 -0
  56. aiq/cli/commands/registry/publish.py +88 -0
  57. aiq/cli/commands/registry/pull.py +118 -0
  58. aiq/cli/commands/registry/registry.py +38 -0
  59. aiq/cli/commands/registry/remove.py +108 -0
  60. aiq/cli/commands/registry/search.py +155 -0
  61. aiq/cli/commands/start.py +250 -0
  62. aiq/cli/commands/uninstall.py +83 -0
  63. aiq/cli/commands/validate.py +47 -0
  64. aiq/cli/commands/workflow/__init__.py +14 -0
  65. aiq/cli/commands/workflow/templates/__init__.py.j2 +0 -0
  66. aiq/cli/commands/workflow/templates/config.yml.j2 +16 -0
  67. aiq/cli/commands/workflow/templates/pyproject.toml.j2 +22 -0
  68. aiq/cli/commands/workflow/templates/register.py.j2 +5 -0
  69. aiq/cli/commands/workflow/templates/workflow.py.j2 +36 -0
  70. aiq/cli/commands/workflow/workflow.py +37 -0
  71. aiq/cli/commands/workflow/workflow_commands.py +313 -0
  72. aiq/cli/entrypoint.py +133 -0
  73. aiq/cli/main.py +44 -0
  74. aiq/cli/register_workflow.py +408 -0
  75. aiq/cli/type_registry.py +879 -0
  76. aiq/data_models/__init__.py +14 -0
  77. aiq/data_models/api_server.py +588 -0
  78. aiq/data_models/common.py +143 -0
  79. aiq/data_models/component.py +46 -0
  80. aiq/data_models/component_ref.py +135 -0
  81. aiq/data_models/config.py +349 -0
  82. aiq/data_models/dataset_handler.py +122 -0
  83. aiq/data_models/discovery_metadata.py +286 -0
  84. aiq/data_models/embedder.py +26 -0
  85. aiq/data_models/evaluate.py +104 -0
  86. aiq/data_models/evaluator.py +26 -0
  87. aiq/data_models/front_end.py +26 -0
  88. aiq/data_models/function.py +30 -0
  89. aiq/data_models/function_dependencies.py +64 -0
  90. aiq/data_models/interactive.py +237 -0
  91. aiq/data_models/intermediate_step.py +269 -0
  92. aiq/data_models/invocation_node.py +38 -0
  93. aiq/data_models/llm.py +26 -0
  94. aiq/data_models/logging.py +26 -0
  95. aiq/data_models/memory.py +26 -0
  96. aiq/data_models/profiler.py +53 -0
  97. aiq/data_models/registry_handler.py +26 -0
  98. aiq/data_models/retriever.py +30 -0
  99. aiq/data_models/step_adaptor.py +64 -0
  100. aiq/data_models/streaming.py +33 -0
  101. aiq/data_models/swe_bench_model.py +54 -0
  102. aiq/data_models/telemetry_exporter.py +26 -0
  103. aiq/embedder/__init__.py +0 -0
  104. aiq/embedder/langchain_client.py +41 -0
  105. aiq/embedder/nim_embedder.py +58 -0
  106. aiq/embedder/openai_embedder.py +42 -0
  107. aiq/embedder/register.py +24 -0
  108. aiq/eval/__init__.py +14 -0
  109. aiq/eval/config.py +42 -0
  110. aiq/eval/dataset_handler/__init__.py +0 -0
  111. aiq/eval/dataset_handler/dataset_downloader.py +106 -0
  112. aiq/eval/dataset_handler/dataset_filter.py +52 -0
  113. aiq/eval/dataset_handler/dataset_handler.py +169 -0
  114. aiq/eval/evaluate.py +325 -0
  115. aiq/eval/evaluator/__init__.py +14 -0
  116. aiq/eval/evaluator/evaluator_model.py +44 -0
  117. aiq/eval/intermediate_step_adapter.py +93 -0
  118. aiq/eval/rag_evaluator/__init__.py +0 -0
  119. aiq/eval/rag_evaluator/evaluate.py +138 -0
  120. aiq/eval/rag_evaluator/register.py +138 -0
  121. aiq/eval/register.py +23 -0
  122. aiq/eval/remote_workflow.py +128 -0
  123. aiq/eval/runtime_event_subscriber.py +52 -0
  124. aiq/eval/swe_bench_evaluator/__init__.py +0 -0
  125. aiq/eval/swe_bench_evaluator/evaluate.py +215 -0
  126. aiq/eval/swe_bench_evaluator/register.py +36 -0
  127. aiq/eval/trajectory_evaluator/__init__.py +0 -0
  128. aiq/eval/trajectory_evaluator/evaluate.py +118 -0
  129. aiq/eval/trajectory_evaluator/register.py +40 -0
  130. aiq/eval/tunable_rag_evaluator/__init__.py +0 -0
  131. aiq/eval/tunable_rag_evaluator/evaluate.py +263 -0
  132. aiq/eval/tunable_rag_evaluator/register.py +50 -0
  133. aiq/eval/utils/__init__.py +0 -0
  134. aiq/eval/utils/output_uploader.py +131 -0
  135. aiq/eval/utils/tqdm_position_registry.py +40 -0
  136. aiq/front_ends/__init__.py +14 -0
  137. aiq/front_ends/console/__init__.py +14 -0
  138. aiq/front_ends/console/console_front_end_config.py +32 -0
  139. aiq/front_ends/console/console_front_end_plugin.py +107 -0
  140. aiq/front_ends/console/register.py +25 -0
  141. aiq/front_ends/cron/__init__.py +14 -0
  142. aiq/front_ends/fastapi/__init__.py +14 -0
  143. aiq/front_ends/fastapi/fastapi_front_end_config.py +150 -0
  144. aiq/front_ends/fastapi/fastapi_front_end_plugin.py +103 -0
  145. aiq/front_ends/fastapi/fastapi_front_end_plugin_worker.py +607 -0
  146. aiq/front_ends/fastapi/intermediate_steps_subscriber.py +80 -0
  147. aiq/front_ends/fastapi/job_store.py +161 -0
  148. aiq/front_ends/fastapi/main.py +70 -0
  149. aiq/front_ends/fastapi/message_handler.py +279 -0
  150. aiq/front_ends/fastapi/message_validator.py +345 -0
  151. aiq/front_ends/fastapi/register.py +25 -0
  152. aiq/front_ends/fastapi/response_helpers.py +195 -0
  153. aiq/front_ends/fastapi/step_adaptor.py +320 -0
  154. aiq/front_ends/fastapi/websocket.py +148 -0
  155. aiq/front_ends/mcp/__init__.py +14 -0
  156. aiq/front_ends/mcp/mcp_front_end_config.py +32 -0
  157. aiq/front_ends/mcp/mcp_front_end_plugin.py +93 -0
  158. aiq/front_ends/mcp/register.py +27 -0
  159. aiq/front_ends/mcp/tool_converter.py +242 -0
  160. aiq/front_ends/register.py +22 -0
  161. aiq/front_ends/simple_base/__init__.py +14 -0
  162. aiq/front_ends/simple_base/simple_front_end_plugin_base.py +52 -0
  163. aiq/llm/__init__.py +0 -0
  164. aiq/llm/nim_llm.py +45 -0
  165. aiq/llm/openai_llm.py +45 -0
  166. aiq/llm/register.py +22 -0
  167. aiq/llm/utils/__init__.py +14 -0
  168. aiq/llm/utils/env_config_value.py +94 -0
  169. aiq/llm/utils/error.py +17 -0
  170. aiq/memory/__init__.py +20 -0
  171. aiq/memory/interfaces.py +183 -0
  172. aiq/memory/models.py +112 -0
  173. aiq/meta/module_to_distro.json +3 -0
  174. aiq/meta/pypi.md +58 -0
  175. aiq/observability/__init__.py +0 -0
  176. aiq/observability/async_otel_listener.py +429 -0
  177. aiq/observability/register.py +99 -0
  178. aiq/plugins/.namespace +1 -0
  179. aiq/profiler/__init__.py +0 -0
  180. aiq/profiler/callbacks/__init__.py +0 -0
  181. aiq/profiler/callbacks/agno_callback_handler.py +295 -0
  182. aiq/profiler/callbacks/base_callback_class.py +20 -0
  183. aiq/profiler/callbacks/langchain_callback_handler.py +278 -0
  184. aiq/profiler/callbacks/llama_index_callback_handler.py +205 -0
  185. aiq/profiler/callbacks/semantic_kernel_callback_handler.py +238 -0
  186. aiq/profiler/callbacks/token_usage_base_model.py +27 -0
  187. aiq/profiler/data_frame_row.py +51 -0
  188. aiq/profiler/decorators/__init__.py +0 -0
  189. aiq/profiler/decorators/framework_wrapper.py +131 -0
  190. aiq/profiler/decorators/function_tracking.py +254 -0
  191. aiq/profiler/forecasting/__init__.py +0 -0
  192. aiq/profiler/forecasting/config.py +18 -0
  193. aiq/profiler/forecasting/model_trainer.py +75 -0
  194. aiq/profiler/forecasting/models/__init__.py +22 -0
  195. aiq/profiler/forecasting/models/forecasting_base_model.py +40 -0
  196. aiq/profiler/forecasting/models/linear_model.py +196 -0
  197. aiq/profiler/forecasting/models/random_forest_regressor.py +268 -0
  198. aiq/profiler/inference_metrics_model.py +25 -0
  199. aiq/profiler/inference_optimization/__init__.py +0 -0
  200. aiq/profiler/inference_optimization/bottleneck_analysis/__init__.py +0 -0
  201. aiq/profiler/inference_optimization/bottleneck_analysis/nested_stack_analysis.py +452 -0
  202. aiq/profiler/inference_optimization/bottleneck_analysis/simple_stack_analysis.py +258 -0
  203. aiq/profiler/inference_optimization/data_models.py +386 -0
  204. aiq/profiler/inference_optimization/experimental/__init__.py +0 -0
  205. aiq/profiler/inference_optimization/experimental/concurrency_spike_analysis.py +468 -0
  206. aiq/profiler/inference_optimization/experimental/prefix_span_analysis.py +405 -0
  207. aiq/profiler/inference_optimization/llm_metrics.py +212 -0
  208. aiq/profiler/inference_optimization/prompt_caching.py +163 -0
  209. aiq/profiler/inference_optimization/token_uniqueness.py +107 -0
  210. aiq/profiler/inference_optimization/workflow_runtimes.py +72 -0
  211. aiq/profiler/intermediate_property_adapter.py +102 -0
  212. aiq/profiler/profile_runner.py +433 -0
  213. aiq/profiler/utils.py +184 -0
  214. aiq/registry_handlers/__init__.py +0 -0
  215. aiq/registry_handlers/local/__init__.py +0 -0
  216. aiq/registry_handlers/local/local_handler.py +176 -0
  217. aiq/registry_handlers/local/register_local.py +37 -0
  218. aiq/registry_handlers/metadata_factory.py +60 -0
  219. aiq/registry_handlers/package_utils.py +198 -0
  220. aiq/registry_handlers/pypi/__init__.py +0 -0
  221. aiq/registry_handlers/pypi/pypi_handler.py +251 -0
  222. aiq/registry_handlers/pypi/register_pypi.py +40 -0
  223. aiq/registry_handlers/register.py +21 -0
  224. aiq/registry_handlers/registry_handler_base.py +157 -0
  225. aiq/registry_handlers/rest/__init__.py +0 -0
  226. aiq/registry_handlers/rest/register_rest.py +56 -0
  227. aiq/registry_handlers/rest/rest_handler.py +237 -0
  228. aiq/registry_handlers/schemas/__init__.py +0 -0
  229. aiq/registry_handlers/schemas/headers.py +42 -0
  230. aiq/registry_handlers/schemas/package.py +68 -0
  231. aiq/registry_handlers/schemas/publish.py +63 -0
  232. aiq/registry_handlers/schemas/pull.py +82 -0
  233. aiq/registry_handlers/schemas/remove.py +36 -0
  234. aiq/registry_handlers/schemas/search.py +91 -0
  235. aiq/registry_handlers/schemas/status.py +47 -0
  236. aiq/retriever/__init__.py +0 -0
  237. aiq/retriever/interface.py +37 -0
  238. aiq/retriever/milvus/__init__.py +14 -0
  239. aiq/retriever/milvus/register.py +81 -0
  240. aiq/retriever/milvus/retriever.py +228 -0
  241. aiq/retriever/models.py +74 -0
  242. aiq/retriever/nemo_retriever/__init__.py +14 -0
  243. aiq/retriever/nemo_retriever/register.py +60 -0
  244. aiq/retriever/nemo_retriever/retriever.py +190 -0
  245. aiq/retriever/register.py +22 -0
  246. aiq/runtime/__init__.py +14 -0
  247. aiq/runtime/loader.py +188 -0
  248. aiq/runtime/runner.py +176 -0
  249. aiq/runtime/session.py +140 -0
  250. aiq/runtime/user_metadata.py +131 -0
  251. aiq/settings/__init__.py +0 -0
  252. aiq/settings/global_settings.py +318 -0
  253. aiq/test/.namespace +1 -0
  254. aiq/tool/__init__.py +0 -0
  255. aiq/tool/code_execution/__init__.py +0 -0
  256. aiq/tool/code_execution/code_sandbox.py +188 -0
  257. aiq/tool/code_execution/local_sandbox/Dockerfile.sandbox +60 -0
  258. aiq/tool/code_execution/local_sandbox/__init__.py +13 -0
  259. aiq/tool/code_execution/local_sandbox/local_sandbox_server.py +83 -0
  260. aiq/tool/code_execution/local_sandbox/sandbox.requirements.txt +4 -0
  261. aiq/tool/code_execution/local_sandbox/start_local_sandbox.sh +25 -0
  262. aiq/tool/code_execution/register.py +70 -0
  263. aiq/tool/code_execution/utils.py +100 -0
  264. aiq/tool/datetime_tools.py +42 -0
  265. aiq/tool/document_search.py +141 -0
  266. aiq/tool/github_tools/__init__.py +0 -0
  267. aiq/tool/github_tools/create_github_commit.py +133 -0
  268. aiq/tool/github_tools/create_github_issue.py +87 -0
  269. aiq/tool/github_tools/create_github_pr.py +106 -0
  270. aiq/tool/github_tools/get_github_file.py +106 -0
  271. aiq/tool/github_tools/get_github_issue.py +166 -0
  272. aiq/tool/github_tools/get_github_pr.py +256 -0
  273. aiq/tool/github_tools/update_github_issue.py +100 -0
  274. aiq/tool/mcp/__init__.py +14 -0
  275. aiq/tool/mcp/mcp_client.py +220 -0
  276. aiq/tool/mcp/mcp_tool.py +95 -0
  277. aiq/tool/memory_tools/__init__.py +0 -0
  278. aiq/tool/memory_tools/add_memory_tool.py +79 -0
  279. aiq/tool/memory_tools/delete_memory_tool.py +67 -0
  280. aiq/tool/memory_tools/get_memory_tool.py +72 -0
  281. aiq/tool/nvidia_rag.py +95 -0
  282. aiq/tool/register.py +37 -0
  283. aiq/tool/retriever.py +89 -0
  284. aiq/tool/server_tools.py +63 -0
  285. aiq/utils/__init__.py +0 -0
  286. aiq/utils/data_models/__init__.py +0 -0
  287. aiq/utils/data_models/schema_validator.py +58 -0
  288. aiq/utils/debugging_utils.py +43 -0
  289. aiq/utils/exception_handlers/__init__.py +0 -0
  290. aiq/utils/exception_handlers/schemas.py +114 -0
  291. aiq/utils/io/__init__.py +0 -0
  292. aiq/utils/io/yaml_tools.py +119 -0
  293. aiq/utils/metadata_utils.py +74 -0
  294. aiq/utils/optional_imports.py +142 -0
  295. aiq/utils/producer_consumer_queue.py +178 -0
  296. aiq/utils/reactive/__init__.py +0 -0
  297. aiq/utils/reactive/base/__init__.py +0 -0
  298. aiq/utils/reactive/base/observable_base.py +65 -0
  299. aiq/utils/reactive/base/observer_base.py +55 -0
  300. aiq/utils/reactive/base/subject_base.py +79 -0
  301. aiq/utils/reactive/observable.py +59 -0
  302. aiq/utils/reactive/observer.py +76 -0
  303. aiq/utils/reactive/subject.py +131 -0
  304. aiq/utils/reactive/subscription.py +49 -0
  305. aiq/utils/settings/__init__.py +0 -0
  306. aiq/utils/settings/global_settings.py +197 -0
  307. aiq/utils/type_converter.py +232 -0
  308. aiq/utils/type_utils.py +397 -0
  309. aiq/utils/url_utils.py +27 -0
  310. aiqtoolkit-1.1.0.dist-info/METADATA +331 -0
  311. aiqtoolkit-1.1.0.dist-info/RECORD +316 -0
  312. aiqtoolkit-1.1.0.dist-info/WHEEL +5 -0
  313. aiqtoolkit-1.1.0.dist-info/entry_points.txt +17 -0
  314. aiqtoolkit-1.1.0.dist-info/licenses/LICENSE-3rd-party.txt +3686 -0
  315. aiqtoolkit-1.1.0.dist-info/licenses/LICENSE.md +201 -0
  316. aiqtoolkit-1.1.0.dist-info/top_level.txt +1 -0
@@ -0,0 +1,169 @@
1
+ # SPDX-FileCopyrightText: Copyright (c) 2025, NVIDIA CORPORATION & AFFILIATES. All rights reserved.
2
+ # SPDX-License-Identifier: Apache-2.0
3
+ #
4
+ # Licensed under the Apache License, Version 2.0 (the "License");
5
+ # you may not use this file except in compliance with the License.
6
+ # You may obtain a copy of the License at
7
+ #
8
+ # http://www.apache.org/licenses/LICENSE-2.0
9
+ #
10
+ # Unless required by applicable law or agreed to in writing, software
11
+ # distributed under the License is distributed on an "AS IS" BASIS,
12
+ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13
+ # See the License for the specific language governing permissions and
14
+ # limitations under the License.
15
+
16
+ import json
17
+
18
+ import pandas as pd
19
+
20
+ from aiq.data_models.dataset_handler import EvalDatasetConfig
21
+ from aiq.data_models.dataset_handler import EvalDatasetJsonConfig
22
+ from aiq.data_models.intermediate_step import IntermediateStep
23
+ from aiq.data_models.intermediate_step import IntermediateStepType
24
+ from aiq.eval.dataset_handler.dataset_downloader import DatasetDownloader
25
+ from aiq.eval.dataset_handler.dataset_filter import DatasetFilter
26
+ from aiq.eval.evaluator.evaluator_model import EvalInput
27
+ from aiq.eval.evaluator.evaluator_model import EvalInputItem
28
+
29
+
30
class DatasetHandler:
    """
    Read the datasets and pre-process (apply filters, deduplicate etc.) before turning them into EvalInput objects.
    One DatasetHandler object is needed for each dataset to be evaluated.
    """

    def __init__(self, dataset_config: EvalDatasetConfig, reps: int):
        """Initialize the handler for a single dataset.

        Args:
            dataset_config: Dataset configuration (file location, column/key structure, filters).
            reps: Number of times each row is replicated for repeated evaluation.
        """
        # Function-level import — presumably to avoid a circular import at
        # module load time; TODO confirm the cycle.
        from aiq.eval.intermediate_step_adapter import IntermediateStepAdapter

        self.dataset_config = dataset_config
        self.dataset_filter = DatasetFilter(dataset_config.filter)
        self.reps = reps
        # Helpers
        self.intermediate_step_adapter = IntermediateStepAdapter()

    def is_structured_input(self) -> bool:
        '''Check if the input is structured or unstructured'''
        # Structured mode is on unless explicitly disabled in the config.
        return not self.dataset_config.structure.disable

    @property
    def id_key(self) -> str:
        # Column holding the unique row id.
        return self.dataset_config.id_key

    @property
    def question_key(self) -> str:
        # Column holding the input question (structured mode only).
        return self.dataset_config.structure.question_key

    @property
    def answer_key(self) -> str:
        # Column holding the expected (reference) answer.
        return self.dataset_config.structure.answer_key

    @property
    def generated_answer_key(self) -> str:
        # Column holding the workflow-generated answer.
        return self.dataset_config.structure.generated_answer_key

    @property
    def trajectory_key(self) -> str:
        # Column holding the recorded intermediate steps.
        return self.dataset_config.structure.trajectory_key

    @property
    def expected_trajectory_key(self) -> str:
        # Column holding the expected intermediate steps.
        return self.dataset_config.structure.expected_trajectory_key

    def get_eval_input_from_df(self, input_df: pd.DataFrame) -> EvalInput:
        """Convert a pre-processed DataFrame into an EvalInput.

        In structured mode each configured column maps to an EvalInputItem
        field and rows without a question are dropped; in unstructured mode
        the whole row is serialized to JSON as the input object.
        """

        def create_eval_item(row: pd.Series, structured: bool) -> EvalInputItem:
            """Helper function to create EvalInputItem."""
            return EvalInputItem(
                id=row.get(self.id_key, ""),
                input_obj=row.to_json() if not structured else row.get(self.question_key, ""),
                expected_output_obj=row.get(self.answer_key, "") if structured else "",
                output_obj=row.get(self.generated_answer_key, "") if structured else "",
                trajectory=row.get(self.trajectory_key, []) if structured else [],
                expected_trajectory=row.get(self.expected_trajectory_key, []) if structured else [],
            )

        # if input dataframe is empty return an empty list
        if input_df.empty:
            return EvalInput(eval_input_items=[])

        structured = self.is_structured_input()
        if structured:
            # For structured input, question is mandatory. Ignore rows with missing or empty questions.
            # NOTE(review): assumes the question column is string-typed; .str.strip()
            # would raise on non-string values — confirm against upstream parsing.
            input_df = input_df[input_df[self.question_key].notnull() & input_df[self.question_key].str.strip().ne("")]
        eval_input_items = [create_eval_item(row, structured) for _, row in input_df.iterrows()]

        return EvalInput(eval_input_items=eval_input_items)

    def setup_reps(self, input_df: pd.DataFrame) -> pd.DataFrame:
        """replicate the rows and update the id to id_key + "_rep" + rep_number"""
        # Replicate the rows
        input_df = pd.concat([input_df] * self.reps, ignore_index=True)
        # Compute repetition index (0-based count of each id's occurrences)
        rep_index = input_df.groupby(self.dataset_config.id_key).cumcount().astype(str)
        # Convert id_key to string (id can be integer) if needed and update IDs
        input_df[self.dataset_config.id_key] = input_df[self.dataset_config.id_key].astype(str) + "_rep" + rep_index
        # Ensure unique ID values after modification
        input_df.drop_duplicates(subset=[self.dataset_config.id_key], inplace=True)

        return input_df

    def get_eval_input_from_dataset(self, dataset: str) -> EvalInput:
        """Read a dataset (path override or configured source), pre-process it,
        and convert it into an EvalInput."""

        # if a dataset file has been provided in the command line, use that
        dataset_config = EvalDatasetJsonConfig(file_path=dataset) if dataset else self.dataset_config

        # Download the dataset if it is remote
        downloader = DatasetDownloader(dataset_config=dataset_config)
        downloader.download_dataset()

        parser, kwargs = dataset_config.parser()
        # Parse the dataset into a DataFrame
        input_df = parser(dataset_config.file_path, **kwargs)

        # Apply filters and deduplicate
        input_df = self.dataset_filter.apply_filters(input_df)
        input_df.drop_duplicates(subset=[self.dataset_config.id_key], inplace=True)

        # If more than one repetition is needed, replicate the rows
        if self.reps > 1:
            input_df = self.setup_reps(input_df)

        # Convert the DataFrame to a list of EvalInput objects
        return self.get_eval_input_from_df(input_df)

    def filter_intermediate_steps(self,
                                  intermediate_steps: list[IntermediateStep],
                                  event_filter: list[IntermediateStepType] | None = None) -> list[dict]:
        """
        Filter out the intermediate steps that are not relevant for evaluation.
        The output is written with the intention of re-running the evaluation using the original config file.
        """
        # Fall back to the adapter's default event filter when none is given.
        if event_filter is None:
            event_filter = self.intermediate_step_adapter.DEFAULT_EVENT_FILTER
        filtered_steps = self.intermediate_step_adapter.filter_intermediate_steps(intermediate_steps, event_filter)
        return self.intermediate_step_adapter.serialize_intermediate_steps(filtered_steps)

    def publish_eval_input(self,
                           eval_input: EvalInput,
                           workflow_output_step_filter: list[IntermediateStepType] | None = None) -> str:
        """
        Convert the EvalInput object to a JSON output for storing in a file. Use the original keys to
        allow re-running evaluation using the original config file and '--skip_workflow' option.
        """

        indent = 2
        if self.is_structured_input():
            # Extract structured data from EvalInputItems, keyed by the original column names
            data = [{
                self.id_key: item.id,
                self.question_key: item.input_obj,
                self.answer_key: item.expected_output_obj,
                self.generated_answer_key: item.output_obj,
                self.trajectory_key: self.filter_intermediate_steps(item.trajectory, workflow_output_step_filter),
                self.expected_trajectory_key: self.filter_intermediate_steps(item.expected_trajectory),
            } for item in eval_input.eval_input_items]
        else:
            # Unstructured case: return only raw output objects as a JSON array
            data = [json.loads(item.output_obj) for item in eval_input.eval_input_items]

        # default=str stringifies anything json can't serialize natively.
        return json.dumps(data, indent=indent, ensure_ascii=False, default=str)
aiq/eval/evaluate.py ADDED
@@ -0,0 +1,325 @@
1
+ # SPDX-FileCopyrightText: Copyright (c) 2024-2025, NVIDIA CORPORATION & AFFILIATES. All rights reserved.
2
+ # SPDX-License-Identifier: Apache-2.0
3
+ #
4
+ # Licensed under the Apache License, Version 2.0 (the "License");
5
+ # you may not use this file except in compliance with the License.
6
+ # You may obtain a copy of the License at
7
+ #
8
+ # http://www.apache.org/licenses/LICENSE-2.0
9
+ #
10
+ # Unless required by applicable law or agreed to in writing, software
11
+ # distributed under the License is distributed on an "AS IS" BASIS,
12
+ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13
+ # See the License for the specific language governing permissions and
14
+ # limitations under the License.
15
+
16
+ import asyncio
17
+ import logging
18
+ import shutil
19
+ from pathlib import Path
20
+ from typing import Any
21
+
22
+ from pydantic import BaseModel
23
+ from tqdm import tqdm
24
+
25
+ from aiq.data_models.evaluate import EvalConfig
26
+ from aiq.eval.config import EvaluationRunConfig
27
+ from aiq.eval.config import EvaluationRunOutput
28
+ from aiq.eval.dataset_handler.dataset_handler import DatasetHandler
29
+ from aiq.eval.evaluator.evaluator_model import EvalInput
30
+ from aiq.eval.evaluator.evaluator_model import EvalInputItem
31
+ from aiq.eval.evaluator.evaluator_model import EvalOutput
32
+ from aiq.eval.utils.output_uploader import OutputUploader
33
+ from aiq.runtime.session import AIQSessionManager
34
+
35
+ logger = logging.getLogger(__name__)
36
+
37
+
38
+ class EvaluationRun: # pylint: disable=too-many-public-methods
39
+ """
40
+ Instantiated for each evaluation run and used to store data for that single run.
41
+ """
42
+
43
+ def __init__(self, config: EvaluationRunConfig):
44
+ """
45
+ Initialize an EvaluationRun with configuration.
46
+ """
47
+ from aiq.eval.intermediate_step_adapter import IntermediateStepAdapter
48
+
49
+ # Run-specific configuration
50
+ self.config: EvaluationRunConfig = config
51
+ self.eval_config: EvalConfig | None = None
52
+
53
+ # Helpers
54
+ self.intermediate_step_adapter: IntermediateStepAdapter = IntermediateStepAdapter()
55
+
56
+ # Metadata
57
+ self.eval_input: EvalInput | None = None
58
+ self.workflow_interrupted: bool = False
59
+
60
+ # evaluation_results is list of tuples (evaluator_name, EvalOutput)
61
+ self.evaluation_results: list[tuple[str, EvalOutput]] = []
62
+
63
+ # workflow output file
64
+ self.workflow_output_file: Path | None = None
65
+
66
+ # evaluation output files
67
+ self.evaluator_output_files: list[Path] = []
68
+
69
    async def run_workflow_local(self, session_manager: AIQSessionManager):
        '''
        Launch the workflow with the specified questions and extract the output using the jsonpath.

        Runs all (remaining) eval input items concurrently via asyncio.gather,
        filling each item's output_obj and trajectory in place. On the first
        workflow error, sets a stop event so items not yet started are skipped,
        and marks the run as interrupted.
        '''
        # import function level dependencies
        from jsonpath_ng import parse

        from aiq.eval.runtime_event_subscriber import pull_intermediate

        # Run the workflow
        jsonpath_expr = parse(self.config.result_json_path)
        stop_event = asyncio.Event()

        async def run_one(item: EvalInputItem):
            # Skip items queued after a previous item failed.
            if stop_event.is_set():
                return "", []

            async with session_manager.run(item.input_obj) as runner:
                try:
                    # Start usage stats and intermediate steps collection in parallel
                    intermediate_future = pull_intermediate()

                    if session_manager.workflow.has_single_output:
                        base_output = await runner.result()
                    else:
                        # raise an error if the workflow has multiple outputs
                        raise NotImplementedError("Multiple outputs are not supported")
                    intermediate_steps = await intermediate_future
                except NotImplementedError as e:
                    # raise original error
                    raise e
                except Exception as e:
                    logger.exception("Failed to run the workflow: %s", e, exc_info=True)
                    # stop processing if a workflow error occurs; item is left
                    # without an output so a re-run can pick it up.
                    self.workflow_interrupted = True
                    stop_event.set()
                    return

                # Best-effort conversion to str; keep the raw output if the
                # runner cannot convert it.
                try:
                    base_output = runner.convert(base_output, to_type=str)
                except ValueError:
                    pass

                # if base_output is a pydantic model dump it to json
                if isinstance(base_output, BaseModel):
                    output = base_output.model_dump_json(indent=2)
                else:
                    m = jsonpath_expr.find(base_output)
                    if (not m):
                        raise RuntimeError(f"Failed to extract output using jsonpath: {self.config.result_json_path}")
                    if (len(m) > 1):
                        logger.warning("Multiple matches found for jsonpath at row '%s'. Matches: %s. Using the first",
                                       base_output,
                                       m)
                    output = m[0].value

                # Record results directly on the eval input item.
                item.output_obj = output
                item.trajectory = self.intermediate_step_adapter.validate_intermediate_steps(intermediate_steps)

        async def wrapped_run(item: EvalInputItem) -> None:
            # pbar is a closure over the progress bar created below, before
            # any wrapped_run coroutine is awaited.
            await run_one(item)
            pbar.update(1)

        # if self.config.skip_complete is set skip eval_input_items with a non-empty output_obj
        if self.config.skip_completed_entries:
            eval_input_items = [item for item in self.eval_input.eval_input_items if not item.output_obj]
            if not eval_input_items:
                logger.warning("All items have a non-empty output. Skipping workflow pass altogether.")
                return
        else:
            eval_input_items = self.eval_input.eval_input_items
        pbar = tqdm(total=len(eval_input_items), desc="Running workflow")
        await asyncio.gather(*[wrapped_run(item) for item in eval_input_items])
        pbar.close()
143
+
144
+ async def run_workflow_remote(self):
145
+ from aiq.eval.remote_workflow import EvaluationRemoteWorkflowHandler
146
+ handler = EvaluationRemoteWorkflowHandler(self.config, self.eval_config.general.max_concurrency)
147
+ await handler.run_workflow_remote(self.eval_input)
148
+
149
+ async def profile_workflow(self):
150
+ """
151
+ Profile a dataset
152
+ """
153
+
154
+ if not self.eval_config.general.profiler:
155
+ logger.info("Profiler is not enabled. Skipping profiling.")
156
+ return
157
+
158
+ from aiq.profiler.profile_runner import ProfilerRunner
159
+
160
+ all_stats = []
161
+ for input_item in self.eval_input.eval_input_items:
162
+ all_stats.append(input_item.trajectory)
163
+
164
+ profiler_runner = ProfilerRunner(self.eval_config.general.profiler, self.eval_config.general.output_dir)
165
+
166
+ await profiler_runner.run(all_stats)
167
+
168
+ def cleanup_output_directory(self):
169
+ '''Remove contents of the output directory if it exists'''
170
+ if self.eval_config.general.output and self.eval_config.general.output.dir and \
171
+ self.eval_config.general.output.dir.exists():
172
+ logger.info("Cleaning up output directory %s", self.eval_config.general.output.dir)
173
+ shutil.rmtree(self.eval_config.general.output.dir)
174
+
175
+ def write_output(self, dataset_handler: DatasetHandler):
176
+ workflow_output_file = self.eval_config.general.output_dir / "workflow_output.json"
177
+ workflow_output_file.parent.mkdir(parents=True, exist_ok=True)
178
+
179
+ # Write the workflow output to a file (this can be used for re-running the evaluation)
180
+
181
+ step_filter = self.eval_config.general.output.workflow_output_step_filter \
182
+ if self.eval_config.general.output else None
183
+ workflow_output = dataset_handler.publish_eval_input(self.eval_input, step_filter)
184
+ with open(workflow_output_file, "w", encoding="utf-8") as f:
185
+ # set indent to 2 for pretty printing
186
+ f.write(workflow_output)
187
+ self.workflow_output_file = workflow_output_file
188
+ logger.info("Workflow output written to %s", workflow_output_file)
189
+
190
+ # Write the output of each evaluator to a separate json file
191
+ for evaluator_name, eval_output in self.evaluation_results:
192
+ output_file = self.eval_config.general.output_dir / f"{evaluator_name}_output.json"
193
+ output_file.parent.mkdir(parents=True, exist_ok=True)
194
+ # create json content using the evaluation results
195
+ output = eval_output.model_dump_json(indent=2)
196
+ with open(output_file, "w", encoding="utf-8") as f:
197
+ f.write(output)
198
+ self.evaluator_output_files.append(output_file)
199
+ logger.info("Evaluation results written to %s", output_file)
200
+
201
+ if self.workflow_interrupted:
202
+ # Issue a warning if the workflow was not completed on all datasets
203
+ msg = ("Workflow execution was interrupted due to an error. The results may be incomplete. "
204
+ "You can re-execute evaluation for incomplete results by running "
205
+ "`eval` with the --skip_completed_entries flag.")
206
+ logger.warning(msg)
207
+
208
+ async def run_single_evaluator(self, evaluator_name: str, evaluator: Any):
209
+ """Run a single evaluator and store its results."""
210
+ try:
211
+ eval_output = await evaluator.evaluate_fn(self.eval_input)
212
+ self.evaluation_results.append((evaluator_name, eval_output))
213
+ except Exception as e:
214
+ logger.exception("An error occurred while running evaluator %s: %s", evaluator_name, e, exc_info=True)
215
+
216
+ async def run_evaluators(self, evaluators: dict[str, Any]):
217
+ """Run all configured evaluators asynchronously."""
218
+ tasks = [self.run_single_evaluator(name, evaluator) for name, evaluator in evaluators.items() if evaluator]
219
+
220
+ if not tasks:
221
+ logger.warning("All evaluators were empty or invalid.")
222
+ return
223
+
224
+ try:
225
+ await asyncio.gather(*tasks)
226
+ except Exception as e:
227
+ logger.exception("An error occurred while running evaluators: %s", e, exc_info=True)
228
+ raise
229
+
230
+ def apply_overrides(self):
231
+ from aiq.cli.cli_utils.config_override import load_and_override_config
232
+ from aiq.data_models.config import AIQConfig
233
+ from aiq.runtime.loader import PluginTypes
234
+ from aiq.runtime.loader import discover_and_register_plugins
235
+ from aiq.utils.data_models.schema_validator import validate_schema
236
+
237
+ # Register plugins before validation
238
+ discover_and_register_plugins(PluginTypes.CONFIG_OBJECT)
239
+
240
+ config_dict = load_and_override_config(self.config.config_file, self.config.override)
241
+ config = validate_schema(config_dict, AIQConfig)
242
+ return config
243
+
244
    async def run_and_evaluate(self,
                               session_manager: AIQSessionManager | None = None,
                               job_id: str | None = None) -> EvaluationRunOutput:
        """
        Run the workflow with the specified config file and evaluate the dataset.

        Args:
            session_manager: Optional pre-built session manager; when None and
                the workflow is run locally, one is created from the built
                workflow with the configured max concurrency.
            job_id: Optional job identifier; when given, all output is written
                under a per-job subdirectory (``jobs/<job_id>``).

        Returns:
            EvaluationRunOutput with the workflow output file path, the list of
            per-evaluator output files, and whether the workflow was interrupted.
            Returns early (with whatever state is currently held) when no
            dataset is configured or the dataset is empty.
        """
        logger.info("Starting evaluation run with config file: %s", self.config.config_file)

        from aiq.builder.eval_builder import WorkflowEvalBuilder
        from aiq.runtime.loader import load_config

        # Load the config, applying CLI overrides only when any were given
        if self.config.override:
            config = self.apply_overrides()
        else:
            config = load_config(self.config.config_file)
        self.eval_config = config.eval
        logger.debug("Loaded evaluation configuration: %s", self.eval_config)

        # Cleanup the output directory (only when the config opts in via `cleanup`)
        if self.eval_config.general.output and self.eval_config.general.output.cleanup:
            self.cleanup_output_directory()

        # If a job id is provided keep the data per-job by redirecting both
        # output_dir and output.dir to a jobs/<job_id> subdirectory
        if job_id:
            self.eval_config.general.output_dir = self.eval_config.general.output_dir / f"jobs/{job_id}"
            if self.eval_config.general.output:
                self.eval_config.general.output.dir = self.eval_config.general.output_dir

        # Load the input dataset
        # For multiple datasets, one handler per dataset can be created
        dataset_config = self.eval_config.general.dataset  # Currently only one dataset is supported
        if not dataset_config:
            logger.info("No dataset found, nothing to evaluate")
            return EvaluationRunOutput(
                workflow_output_file=self.workflow_output_file,
                evaluator_output_files=self.evaluator_output_files,
                workflow_interrupted=self.workflow_interrupted,
            )

        dataset_handler = DatasetHandler(dataset_config=dataset_config, reps=self.config.reps)
        self.eval_input = dataset_handler.get_eval_input_from_dataset(self.config.dataset)
        if not self.eval_input.eval_input_items:
            logger.info("Dataset is empty. Nothing to evaluate.")
            return EvaluationRunOutput(
                workflow_output_file=self.workflow_output_file,
                evaluator_output_files=self.evaluator_output_files,
                workflow_interrupted=self.workflow_interrupted,
            )

        # Run workflow and evaluate inside the builder context so evaluators
        # remain resolvable while they run
        async with WorkflowEvalBuilder.from_config(config=config) as eval_workflow:
            if self.config.endpoint:
                # A remote endpoint was configured: run the workflow there
                await self.run_workflow_remote()
            else:
                if not self.config.skip_workflow:
                    if session_manager is None:
                        session_manager = AIQSessionManager(eval_workflow.build(),
                                                            max_concurrency=self.eval_config.general.max_concurrency)
                    await self.run_workflow_local(session_manager)

            # Evaluate with every evaluator named in the config
            evaluators = {name: eval_workflow.get_evaluator(name) for name in self.eval_config.evaluators}
            await self.run_evaluators(evaluators)

        # Profile the workflow (no-op unless profiling is configured)
        await self.profile_workflow()

        # Write the results to the output directory
        self.write_output(dataset_handler)

        # Run custom scripts and upload evaluation outputs to S3
        if self.eval_config.general.output:
            output_uploader = OutputUploader(self.eval_config.general.output, job_id=job_id)
            output_uploader.run_custom_scripts()
            await output_uploader.upload_directory()

        return EvaluationRunOutput(
            workflow_output_file=self.workflow_output_file,
            evaluator_output_files=self.evaluator_output_files,
            workflow_interrupted=self.workflow_interrupted,
        )
@@ -0,0 +1,14 @@
1
+ # SPDX-FileCopyrightText: Copyright (c) 2025, NVIDIA CORPORATION & AFFILIATES. All rights reserved.
2
+ # SPDX-License-Identifier: Apache-2.0
3
+ #
4
+ # Licensed under the Apache License, Version 2.0 (the "License");
5
+ # you may not use this file except in compliance with the License.
6
+ # You may obtain a copy of the License at
7
+ #
8
+ # http://www.apache.org/licenses/LICENSE-2.0
9
+ #
10
+ # Unless required by applicable law or agreed to in writing, software
11
+ # distributed under the License is distributed on an "AS IS" BASIS,
12
+ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13
+ # See the License for the specific language governing permissions and
14
+ # limitations under the License.
@@ -0,0 +1,44 @@
1
+ # SPDX-FileCopyrightText: Copyright (c) 2025, NVIDIA CORPORATION & AFFILIATES. All rights reserved.
2
+ # SPDX-License-Identifier: Apache-2.0
3
+ #
4
+ # Licensed under the Apache License, Version 2.0 (the "License");
5
+ # you may not use this file except in compliance with the License.
6
+ # You may obtain a copy of the License at
7
+ #
8
+ # http://www.apache.org/licenses/LICENSE-2.0
9
+ #
10
+ # Unless required by applicable law or agreed to in writing, software
11
+ # distributed under the License is distributed on an "AS IS" BASIS,
12
+ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13
+ # See the License for the specific language governing permissions and
14
+ # limitations under the License.
15
+
16
+ import typing
17
+
18
+ from pydantic import BaseModel
19
+
20
+ from aiq.data_models.intermediate_step import IntermediateStep
21
+
22
+
23
class EvalInputItem(BaseModel):
    """One dataset entry to evaluate: the workflow's input, outputs, and trajectories."""
    # Identifier for this entry; type is dataset-defined
    id: typing.Any
    # Input object for the workflow run
    input_obj: typing.Any
    # Expected (ground-truth) output for this input
    expected_output_obj: typing.Any
    # Output actually produced by the workflow
    output_obj: typing.Any
    # Intermediate steps expected for this entry
    expected_trajectory: list[IntermediateStep]
    # Intermediate steps recorded during the actual run
    trajectory: list[IntermediateStep]
30
+
31
+
32
class EvalInput(BaseModel):
    """Container for all dataset entries passed to the evaluators."""
    eval_input_items: list[EvalInputItem]
34
+
35
+
36
class EvalOutputItem(BaseModel):
    """Per-entry result produced by an evaluator."""
    id: typing.Any  # id or input_obj from EvalInputItem
    score: typing.Any  # float or any serializable type
    reasoning: typing.Any  # evaluator's explanation for the score
40
+
41
+
42
class EvalOutput(BaseModel):
    """Aggregate evaluator result: an average score plus the per-entry items."""
    average_score: typing.Any  # float or any serializable type
    eval_output_items: list[EvalOutputItem]
@@ -0,0 +1,93 @@
1
+ # SPDX-FileCopyrightText: Copyright (c) 2025, NVIDIA CORPORATION & AFFILIATES. All rights reserved.
2
+ # SPDX-License-Identifier: Apache-2.0
3
+ #
4
+ # Licensed under the Apache License, Version 2.0 (the "License");
5
+ # you may not use this file except in compliance with the License.
6
+ # You may obtain a copy of the License at
7
+ #
8
+ # http://www.apache.org/licenses/LICENSE-2.0
9
+ #
10
+ # Unless required by applicable law or agreed to in writing, software
11
+ # distributed under the License is distributed on an "AS IS" BASIS,
12
+ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13
+ # See the License for the specific language governing permissions and
14
+ # limitations under the License.
15
+
16
+ import logging
17
+
18
+ from langchain_core.agents import AgentAction
19
+
20
+ from aiq.data_models.intermediate_step import IntermediateStep
21
+ from aiq.data_models.intermediate_step import IntermediateStepType
22
+
23
+ logger = logging.getLogger(__name__)
24
+
25
+
26
class IntermediateStepAdapter:
    """Converts AIQ intermediate steps between dict, model, and LangChain AgentAction forms."""

    # Events kept by default when callers do not supply their own filter
    DEFAULT_EVENT_FILTER = [IntermediateStepType.LLM_END, IntermediateStepType.TOOL_END]

    def filter_intermediate_steps(self,
                                  intermediate_steps: list[IntermediateStep],
                                  event_filter: list[IntermediateStepType]) -> list[IntermediateStep]:
        """Return only steps whose event type is in event_filter; all steps if the filter is empty."""
        if not event_filter:
            return intermediate_steps
        return list(filter(lambda step: step.event_type in event_filter, intermediate_steps))

    def validate_intermediate_steps(self, intermediate_steps: list[dict]) -> list[IntermediateStep]:
        """Validate raw dicts into IntermediateStep models, logging and skipping any that fail."""
        validated: list[IntermediateStep] = []
        for raw in intermediate_steps:
            try:
                validated.append(IntermediateStep.model_validate(raw))
            except Exception as err:
                logger.exception("Validation failed for step: %r, Error: %s", raw, err, exc_info=True)
        return validated

    def serialize_intermediate_steps(self, intermediate_steps: list[IntermediateStep]) -> list[dict]:
        """Converts a list of IntermediateStep objects to a list of dictionaries."""
        serialized = []
        for item in intermediate_steps:
            serialized.append(item.model_dump())
        return serialized

    @staticmethod
    def agent_action_to_dict(action) -> dict:
        """Convert AgentAction to a JSON-serializable dictionary."""
        return {field: getattr(action, field) for field in ("tool", "tool_input", "log", "type")}

    def get_agent_action_single(self, step: IntermediateStep,
                                last_llm_end_step: IntermediateStep | None) -> tuple[AgentAction, str]:
        """Converts a single intermediate step to Tuple[AgentAction, str].

        The action's log is taken from the output of the most recent LLM_END
        step, when one has been seen.
        """
        if last_llm_end_step is not None:
            log = getattr(last_llm_end_step.data, "output", "")
        else:
            log = ""

        if step.data:
            tool_input = getattr(step.data, "input", "")
            tool_output = getattr(step.data, "output", "")
        else:
            tool_input = ""
            tool_output = ""

        action = AgentAction(tool=step.name or "", tool_input=tool_input, log=log)
        return action, tool_output

    def get_agent_actions(self, intermediate_steps: list[IntermediateStep],
                          event_filter: list[IntermediateStepType]) -> list[tuple[AgentAction, str]]:
        """Converts a list of intermediate steps to a list of (AgentAction, output)."""
        pairs: list[tuple[AgentAction, str]] = []
        previous_llm_end: IntermediateStep | None = None

        for current in self.filter_intermediate_steps(intermediate_steps, event_filter):
            if current.event_type == IntermediateStepType.LLM_END:
                # Remember the latest LLM output so the next tool step can use it as its log
                previous_llm_end = current
            else:
                pairs.append(self.get_agent_action_single(current, previous_llm_end))

        return pairs

    def get_context(self, intermediate_steps: list[IntermediateStep]) -> list[str]:
        """Grab the output of all the tools and return them as retrieved context."""
        context: list[str] = []
        for step in intermediate_steps:
            if step.event_type != IntermediateStepType.TOOL_END:
                continue
            if step.data and step.data.output:
                context.append(str(step.data.output))
        return context
File without changes