aiqtoolkit 1.1.0rc5__py3-none-any.whl → 1.2.0__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Potentially problematic release.
This version of aiqtoolkit might be problematic. Click here for more details.
- aiqtoolkit-1.2.0.dist-info/METADATA +29 -0
- aiqtoolkit-1.2.0.dist-info/RECORD +4 -0
- {aiqtoolkit-1.1.0rc5.dist-info → aiqtoolkit-1.2.0.dist-info}/WHEEL +1 -1
- aiqtoolkit-1.2.0.dist-info/top_level.txt +1 -0
- aiq/agent/__init__.py +0 -0
- aiq/agent/base.py +0 -76
- aiq/agent/dual_node.py +0 -67
- aiq/agent/react_agent/__init__.py +0 -0
- aiq/agent/react_agent/agent.py +0 -322
- aiq/agent/react_agent/output_parser.py +0 -104
- aiq/agent/react_agent/prompt.py +0 -46
- aiq/agent/react_agent/register.py +0 -148
- aiq/agent/reasoning_agent/__init__.py +0 -0
- aiq/agent/reasoning_agent/reasoning_agent.py +0 -224
- aiq/agent/register.py +0 -23
- aiq/agent/rewoo_agent/__init__.py +0 -0
- aiq/agent/rewoo_agent/agent.py +0 -410
- aiq/agent/rewoo_agent/prompt.py +0 -108
- aiq/agent/rewoo_agent/register.py +0 -158
- aiq/agent/tool_calling_agent/__init__.py +0 -0
- aiq/agent/tool_calling_agent/agent.py +0 -123
- aiq/agent/tool_calling_agent/register.py +0 -105
- aiq/builder/__init__.py +0 -0
- aiq/builder/builder.py +0 -223
- aiq/builder/component_utils.py +0 -303
- aiq/builder/context.py +0 -227
- aiq/builder/embedder.py +0 -24
- aiq/builder/eval_builder.py +0 -120
- aiq/builder/evaluator.py +0 -29
- aiq/builder/framework_enum.py +0 -24
- aiq/builder/front_end.py +0 -73
- aiq/builder/function.py +0 -297
- aiq/builder/function_base.py +0 -376
- aiq/builder/function_info.py +0 -627
- aiq/builder/intermediate_step_manager.py +0 -135
- aiq/builder/llm.py +0 -25
- aiq/builder/retriever.py +0 -25
- aiq/builder/user_interaction_manager.py +0 -71
- aiq/builder/workflow.py +0 -143
- aiq/builder/workflow_builder.py +0 -757
- aiq/cli/__init__.py +0 -14
- aiq/cli/cli_utils/__init__.py +0 -0
- aiq/cli/cli_utils/config_override.py +0 -231
- aiq/cli/cli_utils/validation.py +0 -37
- aiq/cli/commands/__init__.py +0 -0
- aiq/cli/commands/configure/__init__.py +0 -0
- aiq/cli/commands/configure/channel/__init__.py +0 -0
- aiq/cli/commands/configure/channel/add.py +0 -28
- aiq/cli/commands/configure/channel/channel.py +0 -36
- aiq/cli/commands/configure/channel/remove.py +0 -30
- aiq/cli/commands/configure/channel/update.py +0 -30
- aiq/cli/commands/configure/configure.py +0 -33
- aiq/cli/commands/evaluate.py +0 -139
- aiq/cli/commands/info/__init__.py +0 -14
- aiq/cli/commands/info/info.py +0 -39
- aiq/cli/commands/info/list_channels.py +0 -32
- aiq/cli/commands/info/list_components.py +0 -129
- aiq/cli/commands/info/list_mcp.py +0 -126
- aiq/cli/commands/registry/__init__.py +0 -14
- aiq/cli/commands/registry/publish.py +0 -88
- aiq/cli/commands/registry/pull.py +0 -118
- aiq/cli/commands/registry/registry.py +0 -38
- aiq/cli/commands/registry/remove.py +0 -108
- aiq/cli/commands/registry/search.py +0 -155
- aiq/cli/commands/start.py +0 -250
- aiq/cli/commands/uninstall.py +0 -83
- aiq/cli/commands/validate.py +0 -47
- aiq/cli/commands/workflow/__init__.py +0 -14
- aiq/cli/commands/workflow/templates/__init__.py.j2 +0 -0
- aiq/cli/commands/workflow/templates/config.yml.j2 +0 -16
- aiq/cli/commands/workflow/templates/pyproject.toml.j2 +0 -22
- aiq/cli/commands/workflow/templates/register.py.j2 +0 -5
- aiq/cli/commands/workflow/templates/workflow.py.j2 +0 -36
- aiq/cli/commands/workflow/workflow.py +0 -37
- aiq/cli/commands/workflow/workflow_commands.py +0 -313
- aiq/cli/entrypoint.py +0 -133
- aiq/cli/main.py +0 -44
- aiq/cli/register_workflow.py +0 -408
- aiq/cli/type_registry.py +0 -879
- aiq/data_models/__init__.py +0 -14
- aiq/data_models/api_server.py +0 -588
- aiq/data_models/common.py +0 -143
- aiq/data_models/component.py +0 -46
- aiq/data_models/component_ref.py +0 -135
- aiq/data_models/config.py +0 -349
- aiq/data_models/dataset_handler.py +0 -122
- aiq/data_models/discovery_metadata.py +0 -286
- aiq/data_models/embedder.py +0 -26
- aiq/data_models/evaluate.py +0 -104
- aiq/data_models/evaluator.py +0 -26
- aiq/data_models/front_end.py +0 -26
- aiq/data_models/function.py +0 -30
- aiq/data_models/function_dependencies.py +0 -64
- aiq/data_models/interactive.py +0 -237
- aiq/data_models/intermediate_step.py +0 -269
- aiq/data_models/invocation_node.py +0 -38
- aiq/data_models/llm.py +0 -26
- aiq/data_models/logging.py +0 -26
- aiq/data_models/memory.py +0 -26
- aiq/data_models/profiler.py +0 -53
- aiq/data_models/registry_handler.py +0 -26
- aiq/data_models/retriever.py +0 -30
- aiq/data_models/step_adaptor.py +0 -64
- aiq/data_models/streaming.py +0 -33
- aiq/data_models/swe_bench_model.py +0 -54
- aiq/data_models/telemetry_exporter.py +0 -26
- aiq/embedder/__init__.py +0 -0
- aiq/embedder/langchain_client.py +0 -41
- aiq/embedder/nim_embedder.py +0 -58
- aiq/embedder/openai_embedder.py +0 -42
- aiq/embedder/register.py +0 -24
- aiq/eval/__init__.py +0 -14
- aiq/eval/config.py +0 -42
- aiq/eval/dataset_handler/__init__.py +0 -0
- aiq/eval/dataset_handler/dataset_downloader.py +0 -106
- aiq/eval/dataset_handler/dataset_filter.py +0 -52
- aiq/eval/dataset_handler/dataset_handler.py +0 -169
- aiq/eval/evaluate.py +0 -325
- aiq/eval/evaluator/__init__.py +0 -14
- aiq/eval/evaluator/evaluator_model.py +0 -44
- aiq/eval/intermediate_step_adapter.py +0 -93
- aiq/eval/rag_evaluator/__init__.py +0 -0
- aiq/eval/rag_evaluator/evaluate.py +0 -138
- aiq/eval/rag_evaluator/register.py +0 -138
- aiq/eval/register.py +0 -23
- aiq/eval/remote_workflow.py +0 -128
- aiq/eval/runtime_event_subscriber.py +0 -52
- aiq/eval/swe_bench_evaluator/__init__.py +0 -0
- aiq/eval/swe_bench_evaluator/evaluate.py +0 -215
- aiq/eval/swe_bench_evaluator/register.py +0 -36
- aiq/eval/trajectory_evaluator/__init__.py +0 -0
- aiq/eval/trajectory_evaluator/evaluate.py +0 -118
- aiq/eval/trajectory_evaluator/register.py +0 -40
- aiq/eval/tunable_rag_evaluator/__init__.py +0 -0
- aiq/eval/tunable_rag_evaluator/evaluate.py +0 -263
- aiq/eval/tunable_rag_evaluator/register.py +0 -50
- aiq/eval/utils/__init__.py +0 -0
- aiq/eval/utils/output_uploader.py +0 -131
- aiq/eval/utils/tqdm_position_registry.py +0 -40
- aiq/front_ends/__init__.py +0 -14
- aiq/front_ends/console/__init__.py +0 -14
- aiq/front_ends/console/console_front_end_config.py +0 -32
- aiq/front_ends/console/console_front_end_plugin.py +0 -107
- aiq/front_ends/console/register.py +0 -25
- aiq/front_ends/cron/__init__.py +0 -14
- aiq/front_ends/fastapi/__init__.py +0 -14
- aiq/front_ends/fastapi/fastapi_front_end_config.py +0 -150
- aiq/front_ends/fastapi/fastapi_front_end_plugin.py +0 -103
- aiq/front_ends/fastapi/fastapi_front_end_plugin_worker.py +0 -607
- aiq/front_ends/fastapi/intermediate_steps_subscriber.py +0 -80
- aiq/front_ends/fastapi/job_store.py +0 -161
- aiq/front_ends/fastapi/main.py +0 -70
- aiq/front_ends/fastapi/message_handler.py +0 -279
- aiq/front_ends/fastapi/message_validator.py +0 -345
- aiq/front_ends/fastapi/register.py +0 -25
- aiq/front_ends/fastapi/response_helpers.py +0 -195
- aiq/front_ends/fastapi/step_adaptor.py +0 -320
- aiq/front_ends/fastapi/websocket.py +0 -148
- aiq/front_ends/mcp/__init__.py +0 -14
- aiq/front_ends/mcp/mcp_front_end_config.py +0 -32
- aiq/front_ends/mcp/mcp_front_end_plugin.py +0 -93
- aiq/front_ends/mcp/register.py +0 -27
- aiq/front_ends/mcp/tool_converter.py +0 -242
- aiq/front_ends/register.py +0 -22
- aiq/front_ends/simple_base/__init__.py +0 -14
- aiq/front_ends/simple_base/simple_front_end_plugin_base.py +0 -52
- aiq/llm/__init__.py +0 -0
- aiq/llm/nim_llm.py +0 -45
- aiq/llm/openai_llm.py +0 -45
- aiq/llm/register.py +0 -22
- aiq/llm/utils/__init__.py +0 -14
- aiq/llm/utils/env_config_value.py +0 -94
- aiq/llm/utils/error.py +0 -17
- aiq/memory/__init__.py +0 -20
- aiq/memory/interfaces.py +0 -183
- aiq/memory/models.py +0 -112
- aiq/meta/module_to_distro.json +0 -3
- aiq/meta/pypi.md +0 -58
- aiq/observability/__init__.py +0 -0
- aiq/observability/async_otel_listener.py +0 -429
- aiq/observability/register.py +0 -99
- aiq/plugins/.namespace +0 -1
- aiq/profiler/__init__.py +0 -0
- aiq/profiler/callbacks/__init__.py +0 -0
- aiq/profiler/callbacks/agno_callback_handler.py +0 -295
- aiq/profiler/callbacks/base_callback_class.py +0 -20
- aiq/profiler/callbacks/langchain_callback_handler.py +0 -278
- aiq/profiler/callbacks/llama_index_callback_handler.py +0 -205
- aiq/profiler/callbacks/semantic_kernel_callback_handler.py +0 -238
- aiq/profiler/callbacks/token_usage_base_model.py +0 -27
- aiq/profiler/data_frame_row.py +0 -51
- aiq/profiler/decorators/__init__.py +0 -0
- aiq/profiler/decorators/framework_wrapper.py +0 -131
- aiq/profiler/decorators/function_tracking.py +0 -254
- aiq/profiler/forecasting/__init__.py +0 -0
- aiq/profiler/forecasting/config.py +0 -18
- aiq/profiler/forecasting/model_trainer.py +0 -75
- aiq/profiler/forecasting/models/__init__.py +0 -22
- aiq/profiler/forecasting/models/forecasting_base_model.py +0 -40
- aiq/profiler/forecasting/models/linear_model.py +0 -196
- aiq/profiler/forecasting/models/random_forest_regressor.py +0 -268
- aiq/profiler/inference_metrics_model.py +0 -25
- aiq/profiler/inference_optimization/__init__.py +0 -0
- aiq/profiler/inference_optimization/bottleneck_analysis/__init__.py +0 -0
- aiq/profiler/inference_optimization/bottleneck_analysis/nested_stack_analysis.py +0 -452
- aiq/profiler/inference_optimization/bottleneck_analysis/simple_stack_analysis.py +0 -258
- aiq/profiler/inference_optimization/data_models.py +0 -386
- aiq/profiler/inference_optimization/experimental/__init__.py +0 -0
- aiq/profiler/inference_optimization/experimental/concurrency_spike_analysis.py +0 -468
- aiq/profiler/inference_optimization/experimental/prefix_span_analysis.py +0 -405
- aiq/profiler/inference_optimization/llm_metrics.py +0 -212
- aiq/profiler/inference_optimization/prompt_caching.py +0 -163
- aiq/profiler/inference_optimization/token_uniqueness.py +0 -107
- aiq/profiler/inference_optimization/workflow_runtimes.py +0 -72
- aiq/profiler/intermediate_property_adapter.py +0 -102
- aiq/profiler/profile_runner.py +0 -433
- aiq/profiler/utils.py +0 -184
- aiq/registry_handlers/__init__.py +0 -0
- aiq/registry_handlers/local/__init__.py +0 -0
- aiq/registry_handlers/local/local_handler.py +0 -176
- aiq/registry_handlers/local/register_local.py +0 -37
- aiq/registry_handlers/metadata_factory.py +0 -60
- aiq/registry_handlers/package_utils.py +0 -198
- aiq/registry_handlers/pypi/__init__.py +0 -0
- aiq/registry_handlers/pypi/pypi_handler.py +0 -251
- aiq/registry_handlers/pypi/register_pypi.py +0 -40
- aiq/registry_handlers/register.py +0 -21
- aiq/registry_handlers/registry_handler_base.py +0 -157
- aiq/registry_handlers/rest/__init__.py +0 -0
- aiq/registry_handlers/rest/register_rest.py +0 -56
- aiq/registry_handlers/rest/rest_handler.py +0 -237
- aiq/registry_handlers/schemas/__init__.py +0 -0
- aiq/registry_handlers/schemas/headers.py +0 -42
- aiq/registry_handlers/schemas/package.py +0 -68
- aiq/registry_handlers/schemas/publish.py +0 -63
- aiq/registry_handlers/schemas/pull.py +0 -82
- aiq/registry_handlers/schemas/remove.py +0 -36
- aiq/registry_handlers/schemas/search.py +0 -91
- aiq/registry_handlers/schemas/status.py +0 -47
- aiq/retriever/__init__.py +0 -0
- aiq/retriever/interface.py +0 -37
- aiq/retriever/milvus/__init__.py +0 -14
- aiq/retriever/milvus/register.py +0 -81
- aiq/retriever/milvus/retriever.py +0 -228
- aiq/retriever/models.py +0 -74
- aiq/retriever/nemo_retriever/__init__.py +0 -14
- aiq/retriever/nemo_retriever/register.py +0 -60
- aiq/retriever/nemo_retriever/retriever.py +0 -190
- aiq/retriever/register.py +0 -22
- aiq/runtime/__init__.py +0 -14
- aiq/runtime/loader.py +0 -188
- aiq/runtime/runner.py +0 -176
- aiq/runtime/session.py +0 -140
- aiq/runtime/user_metadata.py +0 -131
- aiq/settings/__init__.py +0 -0
- aiq/settings/global_settings.py +0 -318
- aiq/test/.namespace +0 -1
- aiq/tool/__init__.py +0 -0
- aiq/tool/code_execution/__init__.py +0 -0
- aiq/tool/code_execution/code_sandbox.py +0 -188
- aiq/tool/code_execution/local_sandbox/Dockerfile.sandbox +0 -60
- aiq/tool/code_execution/local_sandbox/__init__.py +0 -13
- aiq/tool/code_execution/local_sandbox/local_sandbox_server.py +0 -83
- aiq/tool/code_execution/local_sandbox/sandbox.requirements.txt +0 -4
- aiq/tool/code_execution/local_sandbox/start_local_sandbox.sh +0 -25
- aiq/tool/code_execution/register.py +0 -70
- aiq/tool/code_execution/utils.py +0 -100
- aiq/tool/datetime_tools.py +0 -42
- aiq/tool/document_search.py +0 -141
- aiq/tool/github_tools/__init__.py +0 -0
- aiq/tool/github_tools/create_github_commit.py +0 -133
- aiq/tool/github_tools/create_github_issue.py +0 -87
- aiq/tool/github_tools/create_github_pr.py +0 -106
- aiq/tool/github_tools/get_github_file.py +0 -106
- aiq/tool/github_tools/get_github_issue.py +0 -166
- aiq/tool/github_tools/get_github_pr.py +0 -256
- aiq/tool/github_tools/update_github_issue.py +0 -100
- aiq/tool/mcp/__init__.py +0 -14
- aiq/tool/mcp/mcp_client.py +0 -220
- aiq/tool/mcp/mcp_tool.py +0 -95
- aiq/tool/memory_tools/__init__.py +0 -0
- aiq/tool/memory_tools/add_memory_tool.py +0 -79
- aiq/tool/memory_tools/delete_memory_tool.py +0 -67
- aiq/tool/memory_tools/get_memory_tool.py +0 -72
- aiq/tool/nvidia_rag.py +0 -95
- aiq/tool/register.py +0 -37
- aiq/tool/retriever.py +0 -89
- aiq/tool/server_tools.py +0 -63
- aiq/utils/__init__.py +0 -0
- aiq/utils/data_models/__init__.py +0 -0
- aiq/utils/data_models/schema_validator.py +0 -58
- aiq/utils/debugging_utils.py +0 -43
- aiq/utils/exception_handlers/__init__.py +0 -0
- aiq/utils/exception_handlers/schemas.py +0 -114
- aiq/utils/io/__init__.py +0 -0
- aiq/utils/io/yaml_tools.py +0 -119
- aiq/utils/metadata_utils.py +0 -74
- aiq/utils/optional_imports.py +0 -142
- aiq/utils/producer_consumer_queue.py +0 -178
- aiq/utils/reactive/__init__.py +0 -0
- aiq/utils/reactive/base/__init__.py +0 -0
- aiq/utils/reactive/base/observable_base.py +0 -65
- aiq/utils/reactive/base/observer_base.py +0 -55
- aiq/utils/reactive/base/subject_base.py +0 -79
- aiq/utils/reactive/observable.py +0 -59
- aiq/utils/reactive/observer.py +0 -76
- aiq/utils/reactive/subject.py +0 -131
- aiq/utils/reactive/subscription.py +0 -49
- aiq/utils/settings/__init__.py +0 -0
- aiq/utils/settings/global_settings.py +0 -197
- aiq/utils/type_converter.py +0 -232
- aiq/utils/type_utils.py +0 -397
- aiq/utils/url_utils.py +0 -27
- aiqtoolkit-1.1.0rc5.dist-info/METADATA +0 -331
- aiqtoolkit-1.1.0rc5.dist-info/RECORD +0 -316
- aiqtoolkit-1.1.0rc5.dist-info/entry_points.txt +0 -17
- aiqtoolkit-1.1.0rc5.dist-info/licenses/LICENSE-3rd-party.txt +0 -3686
- aiqtoolkit-1.1.0rc5.dist-info/licenses/LICENSE.md +0 -201
- aiqtoolkit-1.1.0rc5.dist-info/top_level.txt +0 -1
|
@@ -1,138 +0,0 @@
|
|
|
1
|
-
# SPDX-FileCopyrightText: Copyright (c) 2024-2025, NVIDIA CORPORATION & AFFILIATES. All rights reserved.
|
|
2
|
-
# SPDX-License-Identifier: Apache-2.0
|
|
3
|
-
#
|
|
4
|
-
# Licensed under the Apache License, Version 2.0 (the "License");
|
|
5
|
-
# you may not use this file except in compliance with the License.
|
|
6
|
-
# You may obtain a copy of the License at
|
|
7
|
-
#
|
|
8
|
-
# http://www.apache.org/licenses/LICENSE-2.0
|
|
9
|
-
#
|
|
10
|
-
# Unless required by applicable law or agreed to in writing, software
|
|
11
|
-
# distributed under the License is distributed on an "AS IS" BASIS,
|
|
12
|
-
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
|
13
|
-
# See the License for the specific language governing permissions and
|
|
14
|
-
# limitations under the License.
|
|
15
|
-
|
|
16
|
-
import logging
|
|
17
|
-
from collections.abc import Sequence
|
|
18
|
-
|
|
19
|
-
from ragas import EvaluationDataset
|
|
20
|
-
from ragas import SingleTurnSample
|
|
21
|
-
from ragas.dataset_schema import EvaluationResult
|
|
22
|
-
from ragas.llms import LangchainLLMWrapper
|
|
23
|
-
from ragas.metrics import Metric
|
|
24
|
-
from tqdm import tqdm
|
|
25
|
-
|
|
26
|
-
from aiq.eval.evaluator.evaluator_model import EvalInput
|
|
27
|
-
from aiq.eval.evaluator.evaluator_model import EvalOutput
|
|
28
|
-
from aiq.eval.evaluator.evaluator_model import EvalOutputItem
|
|
29
|
-
from aiq.eval.utils.tqdm_position_registry import TqdmPositionRegistry
|
|
30
|
-
|
|
31
|
-
logger = logging.getLogger(__name__)
|
|
32
|
-
|
|
33
|
-
|
|
34
|
-
class RAGEvaluator:
    """Run Ragas metrics over an ``EvalInput`` and convert the result to an ``EvalOutput``."""

    def __init__(self, evaluator_llm: LangchainLLMWrapper, metrics: Sequence[Metric]):
        """Store the judge LLM and the metrics to evaluate.

        Args:
            evaluator_llm: LLM wrapper handed to ``ragas.evaluate`` as ``llm``.
            metrics: Ragas metrics handed to ``ragas.evaluate``. Only the first
                metric's score is reported in the resulting ``EvalOutput``.
        """
        self.evaluator_llm = evaluator_llm
        self.metrics = metrics

    @staticmethod
    def eval_input_to_ragas(eval_input: EvalInput) -> EvaluationDataset:
        """Convert an ``EvalInput`` into a Ragas ``EvaluationDataset``.

        One ``SingleTurnSample`` is built per entry of ``eval_input.eval_input_items``.

        Args:
            eval_input: Items carrying ``input_obj``, ``expected_output_obj``,
                ``output_obj`` and ``trajectory``.

        Returns:
            An ``EvaluationDataset`` holding the samples in item order.
        """
        from aiq.eval.intermediate_step_adapter import IntermediateStepAdapter

        intermediate_step_adapter = IntermediateStepAdapter()
        samples = [
            SingleTurnSample(
                user_input=item.input_obj,  # assumed to be a string
                reference=item.expected_output_obj,  # reference (correct) answer
                response=item.output_obj,  # the workflow's generated response
                # TODO: extract reference contexts from the expected trajectory;
                # until then a single empty context is used.
                reference_contexts=[""],
                retrieved_contexts=intermediate_step_adapter.get_context(item.trajectory),
            ) for item in eval_input.eval_input_items
        ]

        return EvaluationDataset(samples=samples)

    def ragas_to_eval_output(self, eval_input: EvalInput, results_dataset: EvaluationResult | None) -> EvalOutput:
        """Convert a Ragas ``EvaluationResult`` into an aiq ``EvalOutput``.

        Args:
            eval_input: The input that was evaluated; its item ids label the output rows.
            results_dataset: Result of ``ragas.evaluate``, or ``None`` if evaluation failed.

        Returns:
            An ``EvalOutput`` whose ``average_score`` is the mean of the first metric and
            whose items carry the per-row score plus the row's inputs as ``reasoning``.
            An empty ``EvalOutput`` with ``average_score=0.0`` is returned when there are
            no results or no scores.
        """
        if not results_dataset:
            logger.error("Ragas evaluation failed with no results")
            return EvalOutput(average_score=0.0, eval_output_items=[])

        scores: list[dict[str, float]] = results_dataset.scores
        if not scores:
            logger.error("Ragas returned empty score list")
            return EvalOutput(average_score=0.0, eval_output_items=[])

        # Only one metric is reported: the first key of the first score row.
        first_metric_name = next(iter(scores[0]))
        metric_values = [score[first_metric_name] for score in scores]
        first_avg_score = sum(metric_values) / len(metric_values)

        df = results_dataset.to_pandas()
        # Label rows with the eval_input ids when there are enough of them;
        # otherwise fall back to the "user_input" column.
        if len(eval_input.eval_input_items) >= len(df):
            ids = [item.id for item in eval_input.eval_input_items]
        else:
            ids = df["user_input"].tolist()

        # Rows are read as dicts rather than via itertuples(): itertuples renames
        # columns that are not valid Python identifiers to positional names, which
        # would make a lookup by metric name silently fall back to the default.
        reasoning_keys = ("user_input", "reference", "response", "retrieved_contexts")
        eval_output_items = [
            EvalOutputItem(
                id=ids[i],
                score=row.get(first_metric_name, 0.0),
                reasoning={key: row.get(key) for key in reasoning_keys},
            ) for i, row in enumerate(df.to_dict(orient="records"))
        ]

        return EvalOutput(average_score=first_avg_score, eval_output_items=eval_output_items)

    async def evaluate(self, eval_input: EvalInput) -> EvalOutput:
        """Run the Ragas evaluation on ``eval_input`` and return the converted output.

        Errors raised by ``ragas.evaluate`` are logged and converted into an empty
        ``EvalOutput`` so that other evaluators can still run. The progress bar and
        its claimed tqdm position are always released.
        """
        from ragas import evaluate as ragas_evaluate

        ragas_dataset = self.eval_input_to_ragas(eval_input)
        tqdm_position = TqdmPositionRegistry.claim()
        pbar = None
        results_dataset = None
        try:
            # Setup happens inside the try so the claimed position is released
            # even if reading the metric name or creating the bar fails.
            first_metric_name = self.metrics[0].name
            pbar = tqdm(total=len(ragas_dataset), desc=f"Evaluating Ragas {first_metric_name}", position=tqdm_position)
            try:
                results_dataset = ragas_evaluate(dataset=ragas_dataset,
                                                 metrics=self.metrics,
                                                 show_progress=True,
                                                 llm=self.evaluator_llm,
                                                 _pbar=pbar)
            except Exception as e:
                # On exception we still continue with other evaluators. Log and return an avg_score of 0.0
                logger.exception("Error evaluating ragas metric, Error: %s", e)
        finally:
            if pbar is not None:
                pbar.close()
            TqdmPositionRegistry.release(tqdm_position)

        return self.ragas_to_eval_output(eval_input, results_dataset)
|
|
@@ -1,138 +0,0 @@
|
|
|
1
|
-
# SPDX-FileCopyrightText: Copyright (c) 2025, NVIDIA CORPORATION & AFFILIATES. All rights reserved.
|
|
2
|
-
# SPDX-License-Identifier: Apache-2.0
|
|
3
|
-
#
|
|
4
|
-
# Licensed under the Apache License, Version 2.0 (the "License");
|
|
5
|
-
# you may not use this file except in compliance with the License.
|
|
6
|
-
# You may obtain a copy of the License at
|
|
7
|
-
#
|
|
8
|
-
# http://www.apache.org/licenses/LICENSE-2.0
|
|
9
|
-
#
|
|
10
|
-
# Unless required by applicable law or agreed to in writing, software
|
|
11
|
-
# distributed under the License is distributed on an "AS IS" BASIS,
|
|
12
|
-
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
|
13
|
-
# See the License for the specific language governing permissions and
|
|
14
|
-
# limitations under the License.
|
|
15
|
-
|
|
16
|
-
import logging
|
|
17
|
-
|
|
18
|
-
from pydantic import BaseModel
|
|
19
|
-
from pydantic import Field
|
|
20
|
-
from pydantic import model_validator
|
|
21
|
-
|
|
22
|
-
from aiq.builder.builder import EvalBuilder
|
|
23
|
-
from aiq.builder.evaluator import EvaluatorInfo
|
|
24
|
-
from aiq.builder.framework_enum import LLMFrameworkEnum
|
|
25
|
-
from aiq.cli.register_workflow import register_evaluator
|
|
26
|
-
from aiq.data_models.evaluator import EvaluatorBaseConfig
|
|
27
|
-
from aiq.eval.evaluator.evaluator_model import EvalInput
|
|
28
|
-
from aiq.eval.evaluator.evaluator_model import EvalOutput
|
|
29
|
-
|
|
30
|
-
logger = logging.getLogger(__name__)
|
|
31
|
-
|
|
32
|
-
|
|
33
|
-
class RagasMetricConfig(BaseModel):
    """Configuration of a single RAGAS metric.

    skip: lets the metric entry stay in the config without being used.
    kwargs: extra keyword arguments passed to the metric's callable.
    """
    # When true, the metric is configured but not evaluated
    skip: bool = False
    # Keyword arguments specific to the metric's callable (None means no extras)
    kwargs: dict | None = None
|
|
41
|
-
|
|
42
|
-
|
|
43
|
-
class RagasEvaluatorConfig(EvaluatorBaseConfig, name="ragas"):
    """Evaluation using RAGAS metrics.

    ``metric`` is either the metric name as a string, or a single-item mapping of
    metric name to a ``RagasMetricConfig`` (or the equivalent plain dict).
    """

    llm_name: str = Field(description="LLM as a judge.")
    # Ragas metric
    metric: str | dict[str, RagasMetricConfig] = Field(default="AnswerAccuracy",
                                                       description="RAGAS metric callable with optional 'kwargs:'")

    @model_validator(mode="before")
    @classmethod
    def validate_metric(cls, values):
        """Ensure ``metric``, when given, is a string or a single-item dictionary.

        Raises:
            ValueError: If ``metric`` is a dict with other than one entry, if the
                entry's value is neither a dict nor a ``RagasMetricConfig``, or if
                ``metric`` is neither a string nor a dict.
        """
        # This runs on the raw input. When `metric` is omitted there is nothing to
        # check here and the field default applies; non-mapping input is left to
        # pydantic's own validation.
        if not isinstance(values, dict) or "metric" not in values:
            return values

        metric = values["metric"]

        if isinstance(metric, dict):
            if len(metric) != 1:
                raise ValueError("Only one metric is allowed in the configuration.")
            _, value = next(iter(metric.items()))
            # Accept both the raw mapping form and an already-built config object.
            if not isinstance(value, (dict, RagasMetricConfig)):
                raise ValueError("Metric value must be a RagasMetricConfig object.")
        elif not isinstance(metric, str):
            raise ValueError("Metric must be either a string or a single-item dictionary.")

        return values

    @property
    def metric_name(self) -> str:
        """Return the single metric name, or ``""`` if ``metric`` is an empty dict."""
        if isinstance(self.metric, str):
            return self.metric
        if isinstance(self.metric, dict) and self.metric:
            return next(iter(self.metric.keys()))  # pylint: disable=no-member
        return ""

    @property
    def metric_config(self) -> RagasMetricConfig:
        """Return the metric configuration, or a default one if only a name was given."""
        if isinstance(self.metric, str):
            return RagasMetricConfig()  # Default config when only a metric name is given
        if isinstance(self.metric, dict) and self.metric:
            return next(iter(self.metric.values()))  # pylint: disable=no-member
        return RagasMetricConfig()  # Default config when an invalid type is provided
|
|
85
|
-
|
|
86
|
-
|
|
87
|
-
@register_evaluator(config_type=RagasEvaluatorConfig)
async def register_ragas_evaluator(config: RagasEvaluatorConfig, builder: EvalBuilder):
    """Build the RAGAS evaluator described by ``config`` and yield its ``EvaluatorInfo``.

    The judge LLM is fetched from ``builder`` with the LangChain wrapper. The single
    configured metric is resolved by name from ``ragas.metrics`` unless it is marked
    ``skip``. If no metric ends up being available, the yielded ``evaluate_fn``
    returns an empty ``EvalOutput``.
    """
    from ragas.metrics import Metric

    from .evaluate import RAGEvaluator

    def get_ragas_metric(metric_name: str) -> Metric | None:
        """Look up ``metric_name`` in ``ragas.metrics``; return ``None`` if it does not exist."""
        try:
            import ragas.metrics as ragas_metrics

            return getattr(ragas_metrics, metric_name)
        except ImportError as import_err:
            err_text = f"Ragas metrics not found {import_err}."
            logger.error(err_text)
            raise ValueError(err_text) from import_err
        except AttributeError as attr_err:
            err_text = f"Ragas metric {metric_name} not found {attr_err}."
            logger.error(err_text)
            return None

    async def evaluate_fn(eval_input: EvalInput) -> EvalOutput:
        """Run the RAGAS evaluation, or return empty results when no evaluator was built."""
        if rag_evaluator:
            return await rag_evaluator.evaluate(eval_input)

        logger.warning("No evaluator found for RAGAS metrics.")
        return EvalOutput(average_score=0.0, eval_output_items=[])

    # Judge LLM, wrapped for LangChain
    llm = await builder.get_llm(config.llm_name, wrapper_type=LLMFrameworkEnum.LANGCHAIN)

    # Currently only one metric is supported; the config exposes its name and settings
    selected_name = config.metric_name
    selected_cfg = config.metric_config

    metrics = []
    if not selected_cfg.skip:
        metric_factory = get_ragas_metric(selected_name)
        if metric_factory:
            metrics.append(metric_factory(**(selected_cfg.kwargs or {})))

    # Only build the evaluator when there is a metric to run
    if metrics:
        rag_evaluator = RAGEvaluator(evaluator_llm=llm, metrics=metrics)
    else:
        rag_evaluator = None

    yield EvaluatorInfo(config=config, evaluate_fn=evaluate_fn, description="Evaluator for RAGAS metrics")
|
aiq/eval/register.py
DELETED
|
@@ -1,23 +0,0 @@
|
|
|
1
|
-
# SPDX-FileCopyrightText: Copyright (c) 2025, NVIDIA CORPORATION & AFFILIATES. All rights reserved.
|
|
2
|
-
# SPDX-License-Identifier: Apache-2.0
|
|
3
|
-
#
|
|
4
|
-
# Licensed under the Apache License, Version 2.0 (the "License");
|
|
5
|
-
# you may not use this file except in compliance with the License.
|
|
6
|
-
# You may obtain a copy of the License at
|
|
7
|
-
#
|
|
8
|
-
# http://www.apache.org/licenses/LICENSE-2.0
|
|
9
|
-
#
|
|
10
|
-
# Unless required by applicable law or agreed to in writing, software
|
|
11
|
-
# distributed under the License is distributed on an "AS IS" BASIS,
|
|
12
|
-
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
|
13
|
-
# See the License for the specific language governing permissions and
|
|
14
|
-
# limitations under the License.
|
|
15
|
-
|
|
16
|
-
# flake8: noqa
|
|
17
|
-
# pylint: disable=unused-import
|
|
18
|
-
|
|
19
|
-
# Import evaluators which need to be automatically registered here
|
|
20
|
-
from .rag_evaluator.register import register_ragas_evaluator
|
|
21
|
-
from .swe_bench_evaluator.register import register_swe_bench_evaluator
|
|
22
|
-
from .trajectory_evaluator.register import register_trajectory_evaluator
|
|
23
|
-
from .tunable_rag_evaluator.register import register_tunable_rag_evaluator
|
aiq/eval/remote_workflow.py
DELETED
|
@@ -1,128 +0,0 @@
|
|
|
1
|
-
# SPDX-FileCopyrightText: Copyright (c) 2025, NVIDIA CORPORATION & AFFILIATES. All rights reserved.
|
|
2
|
-
# SPDX-License-Identifier: Apache-2.0
|
|
3
|
-
#
|
|
4
|
-
# Licensed under the Apache License, Version 2.0 (the "License");
|
|
5
|
-
# you may not use this file except in compliance with the License.
|
|
6
|
-
# You may obtain a copy of the License at
|
|
7
|
-
#
|
|
8
|
-
# http://www.apache.org/licenses/LICENSE-2.0
|
|
9
|
-
#
|
|
10
|
-
# Unless required by applicable law or agreed to in writing, software
|
|
11
|
-
# distributed under the License is distributed on an "AS IS" BASIS,
|
|
12
|
-
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
|
13
|
-
# See the License for the specific language governing permissions and
|
|
14
|
-
# limitations under the License.
|
|
15
|
-
|
|
16
|
-
import asyncio
|
|
17
|
-
import json
|
|
18
|
-
import logging
|
|
19
|
-
|
|
20
|
-
import aiohttp
|
|
21
|
-
from pydantic import ValidationError
|
|
22
|
-
from tqdm import tqdm
|
|
23
|
-
|
|
24
|
-
from aiq.data_models.api_server import AIQResponseIntermediateStep
|
|
25
|
-
from aiq.data_models.intermediate_step import IntermediateStep
|
|
26
|
-
from aiq.data_models.intermediate_step import IntermediateStepPayload
|
|
27
|
-
from aiq.eval.config import EvaluationRunConfig
|
|
28
|
-
from aiq.eval.evaluator.evaluator_model import EvalInput
|
|
29
|
-
from aiq.eval.evaluator.evaluator_model import EvalInputItem
|
|
30
|
-
|
|
31
|
-
logger = logging.getLogger(__name__)
|
|
32
|
-
|
|
33
|
-
# Constants for streaming response prefixes
|
|
34
|
-
DATA_PREFIX = "data: "
|
|
35
|
-
INTERMEDIATE_DATA_PREFIX = "intermediate_data: "
|
|
36
|
-
|
|
37
|
-
|
|
38
|
-
class EvaluationRemoteWorkflowHandler:
|
|
39
|
-
|
|
40
|
-
def __init__(self, config: EvaluationRunConfig, max_concurrency: int):
    """Keep the run configuration and create the semaphore sized by ``max_concurrency``."""
    self.semaphore = asyncio.Semaphore(max_concurrency)
    # Run metadata
    self.config = config
|
|
44
|
-
|
|
45
|
-
async def run_workflow_remote_single(self, session: aiohttp.ClientSession, item: EvalInputItem):
|
|
46
|
-
"""
|
|
47
|
-
Sends a single input to the endpoint hosting the workflow and retrieves the response.
|
|
48
|
-
"""
|
|
49
|
-
question = item.input_obj
|
|
50
|
-
# generate request format
|
|
51
|
-
payload = {"input_message": question}
|
|
52
|
-
|
|
53
|
-
try:
|
|
54
|
-
# Use the streaming endpoint
|
|
55
|
-
endpoint = f"{self.config.endpoint}/generate/full"
|
|
56
|
-
async with session.post(endpoint, json=payload) as response:
|
|
57
|
-
response.raise_for_status() # Raise an exception for HTTP errors
|
|
58
|
-
|
|
59
|
-
# Initialize variables to store the response
|
|
60
|
-
final_response = None
|
|
61
|
-
intermediate_steps = []
|
|
62
|
-
|
|
63
|
-
# Process the streaming response
|
|
64
|
-
async for line in response.content:
|
|
65
|
-
line = line.decode('utf-8').strip()
|
|
66
|
-
if not line:
|
|
67
|
-
continue
|
|
68
|
-
|
|
69
|
-
if line.startswith(DATA_PREFIX):
|
|
70
|
-
# This is a generate response chunk
|
|
71
|
-
try:
|
|
72
|
-
chunk_data = json.loads(line[len(DATA_PREFIX):])
|
|
73
|
-
if chunk_data.get("value"):
|
|
74
|
-
final_response = chunk_data.get("value")
|
|
75
|
-
except json.JSONDecodeError as e:
|
|
76
|
-
logger.error("Failed to parse generate response chunk: %s", e)
|
|
77
|
-
continue
|
|
78
|
-
elif line.startswith(INTERMEDIATE_DATA_PREFIX):
|
|
79
|
-
# This is an intermediate step
|
|
80
|
-
try:
|
|
81
|
-
step_data = json.loads(line[len(INTERMEDIATE_DATA_PREFIX):])
|
|
82
|
-
response_intermediate = AIQResponseIntermediateStep.model_validate(step_data)
|
|
83
|
-
# The payload is expected to be IntermediateStepPayload
|
|
84
|
-
intermediate_step = IntermediateStep(
|
|
85
|
-
payload=IntermediateStepPayload.model_validate_json(response_intermediate.payload))
|
|
86
|
-
intermediate_steps.append(intermediate_step)
|
|
87
|
-
except (json.JSONDecodeError, ValidationError) as e:
|
|
88
|
-
logger.error("Failed to parse intermediate step: %s", e)
|
|
89
|
-
continue
|
|
90
|
-
|
|
91
|
-
except aiohttp.ClientError as e:
|
|
92
|
-
# Handle connection or HTTP-related errors
|
|
93
|
-
logger.error("Request failed for question %s: %s", question, e)
|
|
94
|
-
item.output_obj = None
|
|
95
|
-
item.trajectory = []
|
|
96
|
-
return
|
|
97
|
-
|
|
98
|
-
# Extract and fill the item with the response and intermediate steps
|
|
99
|
-
item.output_obj = final_response
|
|
100
|
-
item.trajectory = intermediate_steps
|
|
101
|
-
return
|
|
102
|
-
|
|
103
|
-
async def run_workflow_remote_with_limits(self, session: aiohttp.ClientSession, item: EvalInputItem, pbar: tqdm):
|
|
104
|
-
"""
|
|
105
|
-
Sends limited number of concurrent requests to a remote workflow and retrieves responses.
|
|
106
|
-
"""
|
|
107
|
-
async with self.semaphore:
|
|
108
|
-
await self.run_workflow_remote_single(session=session, item=item)
|
|
109
|
-
pbar.update(1)
|
|
110
|
-
|
|
111
|
-
async def run_workflow_remote(self, eval_input: EvalInput) -> EvalInput:
|
|
112
|
-
"""
|
|
113
|
-
Sends inputs to a workflow hosted on a remote endpoint.
|
|
114
|
-
"""
|
|
115
|
-
timeout = aiohttp.ClientTimeout(total=self.config.endpoint_timeout)
|
|
116
|
-
try:
|
|
117
|
-
pbar = tqdm(total=len(eval_input.eval_input_items), desc="Running workflow", unit="item")
|
|
118
|
-
async with aiohttp.ClientSession(timeout=timeout) as session:
|
|
119
|
-
# get the questions from the eval_input
|
|
120
|
-
tasks = [
|
|
121
|
-
self.run_workflow_remote_with_limits(session, item, pbar) for item in eval_input.eval_input_items
|
|
122
|
-
]
|
|
123
|
-
await asyncio.gather(*tasks)
|
|
124
|
-
|
|
125
|
-
finally:
|
|
126
|
-
pbar.close()
|
|
127
|
-
|
|
128
|
-
return eval_input
|
|
@@ -1,52 +0,0 @@
|
|
|
1
|
-
# SPDX-FileCopyrightText: Copyright (c) 2025, NVIDIA CORPORATION & AFFILIATES. All rights reserved.
|
|
2
|
-
# SPDX-License-Identifier: Apache-2.0
|
|
3
|
-
#
|
|
4
|
-
# Licensed under the Apache License, Version 2.0 (the "License");
|
|
5
|
-
# you may not use this file except in compliance with the License.
|
|
6
|
-
# You may obtain a copy of the License at
|
|
7
|
-
#
|
|
8
|
-
# http://www.apache.org/licenses/LICENSE-2.0
|
|
9
|
-
#
|
|
10
|
-
# Unless required by applicable law or agreed to in writing, software
|
|
11
|
-
# distributed under the License is distributed on an "AS IS" BASIS,
|
|
12
|
-
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
|
13
|
-
# See the License for the specific language governing permissions and
|
|
14
|
-
# limitations under the License.
|
|
15
|
-
|
|
16
|
-
import asyncio
|
|
17
|
-
import logging
|
|
18
|
-
|
|
19
|
-
from aiq.builder.context import AIQContext
|
|
20
|
-
from aiq.data_models.intermediate_step import IntermediateStep
|
|
21
|
-
|
|
22
|
-
logger = logging.getLogger(__name__)
|
|
23
|
-
|
|
24
|
-
|
|
25
|
-
def pull_intermediate() -> asyncio.Future[list[dict]]:
    """
    Gather intermediate steps from the current context into a future.

    Three callbacks are registered on the ``intermediate_step_manager`` of the context returned by
    ``AIQContext.get()``. Each step received is converted with ``model_dump()`` and accumulated.
    On completion the returned future is resolved with the accumulated list; on error the future
    is failed with the reported exception. A future that is already done is left untouched.
    """
    result_future = asyncio.Future()
    dumped_steps = []  # Filled by the on-next callback, handed out on completion.
    aiq_context = AIQContext.get()

    def _collect_step(item: IntermediateStep):
        # Keep the dict form of every step that arrives.
        dumped_steps.append(item.model_dump())

    def _fail(exc: Exception):
        logger.error("Hit on_error: %s", exc)
        if result_future.done():
            return
        result_future.set_exception(exc)

    def _finish():
        logger.debug("Completed reading intermediate steps")
        if result_future.done():
            return
        result_future.set_result(dumped_steps)

    # Register the callbacks with the step manager.
    aiq_context.intermediate_step_manager.subscribe(
        on_next=_collect_step,
        on_error=_fail,
        on_complete=_finish,
    )

    return result_future
|
|
File without changes
|