nvidia-nat 1.2.0rc5 (nvidia_nat-1.2.0rc5-py3-none-any.whl)
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- aiq/agent/__init__.py +0 -0
- aiq/agent/base.py +239 -0
- aiq/agent/dual_node.py +67 -0
- aiq/agent/react_agent/__init__.py +0 -0
- aiq/agent/react_agent/agent.py +355 -0
- aiq/agent/react_agent/output_parser.py +104 -0
- aiq/agent/react_agent/prompt.py +41 -0
- aiq/agent/react_agent/register.py +149 -0
- aiq/agent/reasoning_agent/__init__.py +0 -0
- aiq/agent/reasoning_agent/reasoning_agent.py +225 -0
- aiq/agent/register.py +23 -0
- aiq/agent/rewoo_agent/__init__.py +0 -0
- aiq/agent/rewoo_agent/agent.py +411 -0
- aiq/agent/rewoo_agent/prompt.py +108 -0
- aiq/agent/rewoo_agent/register.py +158 -0
- aiq/agent/tool_calling_agent/__init__.py +0 -0
- aiq/agent/tool_calling_agent/agent.py +119 -0
- aiq/agent/tool_calling_agent/register.py +106 -0
- aiq/authentication/__init__.py +14 -0
- aiq/authentication/api_key/__init__.py +14 -0
- aiq/authentication/api_key/api_key_auth_provider.py +96 -0
- aiq/authentication/api_key/api_key_auth_provider_config.py +124 -0
- aiq/authentication/api_key/register.py +26 -0
- aiq/authentication/exceptions/__init__.py +14 -0
- aiq/authentication/exceptions/api_key_exceptions.py +38 -0
- aiq/authentication/http_basic_auth/__init__.py +0 -0
- aiq/authentication/http_basic_auth/http_basic_auth_provider.py +81 -0
- aiq/authentication/http_basic_auth/register.py +30 -0
- aiq/authentication/interfaces.py +93 -0
- aiq/authentication/oauth2/__init__.py +14 -0
- aiq/authentication/oauth2/oauth2_auth_code_flow_provider.py +107 -0
- aiq/authentication/oauth2/oauth2_auth_code_flow_provider_config.py +39 -0
- aiq/authentication/oauth2/register.py +25 -0
- aiq/authentication/register.py +21 -0
- aiq/builder/__init__.py +0 -0
- aiq/builder/builder.py +285 -0
- aiq/builder/component_utils.py +316 -0
- aiq/builder/context.py +264 -0
- aiq/builder/embedder.py +24 -0
- aiq/builder/eval_builder.py +161 -0
- aiq/builder/evaluator.py +29 -0
- aiq/builder/framework_enum.py +24 -0
- aiq/builder/front_end.py +73 -0
- aiq/builder/function.py +344 -0
- aiq/builder/function_base.py +380 -0
- aiq/builder/function_info.py +627 -0
- aiq/builder/intermediate_step_manager.py +174 -0
- aiq/builder/llm.py +25 -0
- aiq/builder/retriever.py +25 -0
- aiq/builder/user_interaction_manager.py +74 -0
- aiq/builder/workflow.py +148 -0
- aiq/builder/workflow_builder.py +1117 -0
- aiq/cli/__init__.py +14 -0
- aiq/cli/cli_utils/__init__.py +0 -0
- aiq/cli/cli_utils/config_override.py +231 -0
- aiq/cli/cli_utils/validation.py +37 -0
- aiq/cli/commands/__init__.py +0 -0
- aiq/cli/commands/configure/__init__.py +0 -0
- aiq/cli/commands/configure/channel/__init__.py +0 -0
- aiq/cli/commands/configure/channel/add.py +28 -0
- aiq/cli/commands/configure/channel/channel.py +36 -0
- aiq/cli/commands/configure/channel/remove.py +30 -0
- aiq/cli/commands/configure/channel/update.py +30 -0
- aiq/cli/commands/configure/configure.py +33 -0
- aiq/cli/commands/evaluate.py +139 -0
- aiq/cli/commands/info/__init__.py +14 -0
- aiq/cli/commands/info/info.py +39 -0
- aiq/cli/commands/info/list_channels.py +32 -0
- aiq/cli/commands/info/list_components.py +129 -0
- aiq/cli/commands/info/list_mcp.py +213 -0
- aiq/cli/commands/registry/__init__.py +14 -0
- aiq/cli/commands/registry/publish.py +88 -0
- aiq/cli/commands/registry/pull.py +118 -0
- aiq/cli/commands/registry/registry.py +38 -0
- aiq/cli/commands/registry/remove.py +108 -0
- aiq/cli/commands/registry/search.py +155 -0
- aiq/cli/commands/sizing/__init__.py +14 -0
- aiq/cli/commands/sizing/calc.py +297 -0
- aiq/cli/commands/sizing/sizing.py +27 -0
- aiq/cli/commands/start.py +246 -0
- aiq/cli/commands/uninstall.py +81 -0
- aiq/cli/commands/validate.py +47 -0
- aiq/cli/commands/workflow/__init__.py +14 -0
- aiq/cli/commands/workflow/templates/__init__.py.j2 +0 -0
- aiq/cli/commands/workflow/templates/config.yml.j2 +16 -0
- aiq/cli/commands/workflow/templates/pyproject.toml.j2 +22 -0
- aiq/cli/commands/workflow/templates/register.py.j2 +5 -0
- aiq/cli/commands/workflow/templates/workflow.py.j2 +36 -0
- aiq/cli/commands/workflow/workflow.py +37 -0
- aiq/cli/commands/workflow/workflow_commands.py +313 -0
- aiq/cli/entrypoint.py +135 -0
- aiq/cli/main.py +44 -0
- aiq/cli/register_workflow.py +488 -0
- aiq/cli/type_registry.py +1000 -0
- aiq/data_models/__init__.py +14 -0
- aiq/data_models/api_server.py +694 -0
- aiq/data_models/authentication.py +231 -0
- aiq/data_models/common.py +171 -0
- aiq/data_models/component.py +54 -0
- aiq/data_models/component_ref.py +168 -0
- aiq/data_models/config.py +406 -0
- aiq/data_models/dataset_handler.py +123 -0
- aiq/data_models/discovery_metadata.py +335 -0
- aiq/data_models/embedder.py +27 -0
- aiq/data_models/evaluate.py +127 -0
- aiq/data_models/evaluator.py +26 -0
- aiq/data_models/front_end.py +26 -0
- aiq/data_models/function.py +30 -0
- aiq/data_models/function_dependencies.py +72 -0
- aiq/data_models/interactive.py +246 -0
- aiq/data_models/intermediate_step.py +302 -0
- aiq/data_models/invocation_node.py +38 -0
- aiq/data_models/llm.py +27 -0
- aiq/data_models/logging.py +26 -0
- aiq/data_models/memory.py +27 -0
- aiq/data_models/object_store.py +44 -0
- aiq/data_models/profiler.py +54 -0
- aiq/data_models/registry_handler.py +26 -0
- aiq/data_models/retriever.py +30 -0
- aiq/data_models/retry_mixin.py +35 -0
- aiq/data_models/span.py +187 -0
- aiq/data_models/step_adaptor.py +64 -0
- aiq/data_models/streaming.py +33 -0
- aiq/data_models/swe_bench_model.py +54 -0
- aiq/data_models/telemetry_exporter.py +26 -0
- aiq/data_models/ttc_strategy.py +30 -0
- aiq/embedder/__init__.py +0 -0
- aiq/embedder/langchain_client.py +41 -0
- aiq/embedder/nim_embedder.py +59 -0
- aiq/embedder/openai_embedder.py +43 -0
- aiq/embedder/register.py +24 -0
- aiq/eval/__init__.py +14 -0
- aiq/eval/config.py +60 -0
- aiq/eval/dataset_handler/__init__.py +0 -0
- aiq/eval/dataset_handler/dataset_downloader.py +106 -0
- aiq/eval/dataset_handler/dataset_filter.py +52 -0
- aiq/eval/dataset_handler/dataset_handler.py +254 -0
- aiq/eval/evaluate.py +506 -0
- aiq/eval/evaluator/__init__.py +14 -0
- aiq/eval/evaluator/base_evaluator.py +73 -0
- aiq/eval/evaluator/evaluator_model.py +45 -0
- aiq/eval/intermediate_step_adapter.py +99 -0
- aiq/eval/rag_evaluator/__init__.py +0 -0
- aiq/eval/rag_evaluator/evaluate.py +178 -0
- aiq/eval/rag_evaluator/register.py +143 -0
- aiq/eval/register.py +23 -0
- aiq/eval/remote_workflow.py +133 -0
- aiq/eval/runners/__init__.py +14 -0
- aiq/eval/runners/config.py +39 -0
- aiq/eval/runners/multi_eval_runner.py +54 -0
- aiq/eval/runtime_event_subscriber.py +52 -0
- aiq/eval/swe_bench_evaluator/__init__.py +0 -0
- aiq/eval/swe_bench_evaluator/evaluate.py +215 -0
- aiq/eval/swe_bench_evaluator/register.py +36 -0
- aiq/eval/trajectory_evaluator/__init__.py +0 -0
- aiq/eval/trajectory_evaluator/evaluate.py +75 -0
- aiq/eval/trajectory_evaluator/register.py +40 -0
- aiq/eval/tunable_rag_evaluator/__init__.py +0 -0
- aiq/eval/tunable_rag_evaluator/evaluate.py +245 -0
- aiq/eval/tunable_rag_evaluator/register.py +52 -0
- aiq/eval/usage_stats.py +41 -0
- aiq/eval/utils/__init__.py +0 -0
- aiq/eval/utils/output_uploader.py +140 -0
- aiq/eval/utils/tqdm_position_registry.py +40 -0
- aiq/eval/utils/weave_eval.py +184 -0
- aiq/experimental/__init__.py +0 -0
- aiq/experimental/decorators/__init__.py +0 -0
- aiq/experimental/decorators/experimental_warning_decorator.py +130 -0
- aiq/experimental/test_time_compute/__init__.py +0 -0
- aiq/experimental/test_time_compute/editing/__init__.py +0 -0
- aiq/experimental/test_time_compute/editing/iterative_plan_refinement_editor.py +147 -0
- aiq/experimental/test_time_compute/editing/llm_as_a_judge_editor.py +204 -0
- aiq/experimental/test_time_compute/editing/motivation_aware_summarization.py +107 -0
- aiq/experimental/test_time_compute/functions/__init__.py +0 -0
- aiq/experimental/test_time_compute/functions/execute_score_select_function.py +105 -0
- aiq/experimental/test_time_compute/functions/its_tool_orchestration_function.py +205 -0
- aiq/experimental/test_time_compute/functions/its_tool_wrapper_function.py +146 -0
- aiq/experimental/test_time_compute/functions/plan_select_execute_function.py +224 -0
- aiq/experimental/test_time_compute/models/__init__.py +0 -0
- aiq/experimental/test_time_compute/models/editor_config.py +132 -0
- aiq/experimental/test_time_compute/models/scoring_config.py +112 -0
- aiq/experimental/test_time_compute/models/search_config.py +120 -0
- aiq/experimental/test_time_compute/models/selection_config.py +154 -0
- aiq/experimental/test_time_compute/models/stage_enums.py +43 -0
- aiq/experimental/test_time_compute/models/strategy_base.py +66 -0
- aiq/experimental/test_time_compute/models/tool_use_config.py +41 -0
- aiq/experimental/test_time_compute/models/ttc_item.py +48 -0
- aiq/experimental/test_time_compute/register.py +36 -0
- aiq/experimental/test_time_compute/scoring/__init__.py +0 -0
- aiq/experimental/test_time_compute/scoring/llm_based_agent_scorer.py +168 -0
- aiq/experimental/test_time_compute/scoring/llm_based_plan_scorer.py +168 -0
- aiq/experimental/test_time_compute/scoring/motivation_aware_scorer.py +111 -0
- aiq/experimental/test_time_compute/search/__init__.py +0 -0
- aiq/experimental/test_time_compute/search/multi_llm_planner.py +128 -0
- aiq/experimental/test_time_compute/search/multi_query_retrieval_search.py +122 -0
- aiq/experimental/test_time_compute/search/single_shot_multi_plan_planner.py +128 -0
- aiq/experimental/test_time_compute/selection/__init__.py +0 -0
- aiq/experimental/test_time_compute/selection/best_of_n_selector.py +63 -0
- aiq/experimental/test_time_compute/selection/llm_based_agent_output_selector.py +131 -0
- aiq/experimental/test_time_compute/selection/llm_based_output_merging_selector.py +159 -0
- aiq/experimental/test_time_compute/selection/llm_based_plan_selector.py +128 -0
- aiq/experimental/test_time_compute/selection/threshold_selector.py +58 -0
- aiq/front_ends/__init__.py +14 -0
- aiq/front_ends/console/__init__.py +14 -0
- aiq/front_ends/console/authentication_flow_handler.py +233 -0
- aiq/front_ends/console/console_front_end_config.py +32 -0
- aiq/front_ends/console/console_front_end_plugin.py +96 -0
- aiq/front_ends/console/register.py +25 -0
- aiq/front_ends/cron/__init__.py +14 -0
- aiq/front_ends/fastapi/__init__.py +14 -0
- aiq/front_ends/fastapi/auth_flow_handlers/__init__.py +0 -0
- aiq/front_ends/fastapi/auth_flow_handlers/http_flow_handler.py +27 -0
- aiq/front_ends/fastapi/auth_flow_handlers/websocket_flow_handler.py +107 -0
- aiq/front_ends/fastapi/fastapi_front_end_config.py +234 -0
- aiq/front_ends/fastapi/fastapi_front_end_controller.py +68 -0
- aiq/front_ends/fastapi/fastapi_front_end_plugin.py +116 -0
- aiq/front_ends/fastapi/fastapi_front_end_plugin_worker.py +1092 -0
- aiq/front_ends/fastapi/html_snippets/__init__.py +14 -0
- aiq/front_ends/fastapi/html_snippets/auth_code_grant_success.py +35 -0
- aiq/front_ends/fastapi/intermediate_steps_subscriber.py +80 -0
- aiq/front_ends/fastapi/job_store.py +183 -0
- aiq/front_ends/fastapi/main.py +72 -0
- aiq/front_ends/fastapi/message_handler.py +298 -0
- aiq/front_ends/fastapi/message_validator.py +345 -0
- aiq/front_ends/fastapi/register.py +25 -0
- aiq/front_ends/fastapi/response_helpers.py +195 -0
- aiq/front_ends/fastapi/step_adaptor.py +321 -0
- aiq/front_ends/mcp/__init__.py +14 -0
- aiq/front_ends/mcp/mcp_front_end_config.py +32 -0
- aiq/front_ends/mcp/mcp_front_end_plugin.py +93 -0
- aiq/front_ends/mcp/register.py +27 -0
- aiq/front_ends/mcp/tool_converter.py +242 -0
- aiq/front_ends/register.py +22 -0
- aiq/front_ends/simple_base/__init__.py +14 -0
- aiq/front_ends/simple_base/simple_front_end_plugin_base.py +54 -0
- aiq/llm/__init__.py +0 -0
- aiq/llm/aws_bedrock_llm.py +57 -0
- aiq/llm/nim_llm.py +46 -0
- aiq/llm/openai_llm.py +46 -0
- aiq/llm/register.py +23 -0
- aiq/llm/utils/__init__.py +14 -0
- aiq/llm/utils/env_config_value.py +94 -0
- aiq/llm/utils/error.py +17 -0
- aiq/memory/__init__.py +20 -0
- aiq/memory/interfaces.py +183 -0
- aiq/memory/models.py +112 -0
- aiq/meta/module_to_distro.json +3 -0
- aiq/meta/pypi.md +58 -0
- aiq/object_store/__init__.py +20 -0
- aiq/object_store/in_memory_object_store.py +76 -0
- aiq/object_store/interfaces.py +84 -0
- aiq/object_store/models.py +36 -0
- aiq/object_store/register.py +20 -0
- aiq/observability/__init__.py +14 -0
- aiq/observability/exporter/__init__.py +14 -0
- aiq/observability/exporter/base_exporter.py +449 -0
- aiq/observability/exporter/exporter.py +78 -0
- aiq/observability/exporter/file_exporter.py +33 -0
- aiq/observability/exporter/processing_exporter.py +322 -0
- aiq/observability/exporter/raw_exporter.py +52 -0
- aiq/observability/exporter/span_exporter.py +265 -0
- aiq/observability/exporter_manager.py +335 -0
- aiq/observability/mixin/__init__.py +14 -0
- aiq/observability/mixin/batch_config_mixin.py +26 -0
- aiq/observability/mixin/collector_config_mixin.py +23 -0
- aiq/observability/mixin/file_mixin.py +288 -0
- aiq/observability/mixin/file_mode.py +23 -0
- aiq/observability/mixin/resource_conflict_mixin.py +134 -0
- aiq/observability/mixin/serialize_mixin.py +61 -0
- aiq/observability/mixin/type_introspection_mixin.py +183 -0
- aiq/observability/processor/__init__.py +14 -0
- aiq/observability/processor/batching_processor.py +310 -0
- aiq/observability/processor/callback_processor.py +42 -0
- aiq/observability/processor/intermediate_step_serializer.py +28 -0
- aiq/observability/processor/processor.py +71 -0
- aiq/observability/register.py +96 -0
- aiq/observability/utils/__init__.py +14 -0
- aiq/observability/utils/dict_utils.py +236 -0
- aiq/observability/utils/time_utils.py +31 -0
- aiq/plugins/.namespace +1 -0
- aiq/profiler/__init__.py +0 -0
- aiq/profiler/calc/__init__.py +14 -0
- aiq/profiler/calc/calc_runner.py +627 -0
- aiq/profiler/calc/calculations.py +288 -0
- aiq/profiler/calc/data_models.py +188 -0
- aiq/profiler/calc/plot.py +345 -0
- aiq/profiler/callbacks/__init__.py +0 -0
- aiq/profiler/callbacks/agno_callback_handler.py +295 -0
- aiq/profiler/callbacks/base_callback_class.py +20 -0
- aiq/profiler/callbacks/langchain_callback_handler.py +290 -0
- aiq/profiler/callbacks/llama_index_callback_handler.py +205 -0
- aiq/profiler/callbacks/semantic_kernel_callback_handler.py +238 -0
- aiq/profiler/callbacks/token_usage_base_model.py +27 -0
- aiq/profiler/data_frame_row.py +51 -0
- aiq/profiler/data_models.py +24 -0
- aiq/profiler/decorators/__init__.py +0 -0
- aiq/profiler/decorators/framework_wrapper.py +131 -0
- aiq/profiler/decorators/function_tracking.py +254 -0
- aiq/profiler/forecasting/__init__.py +0 -0
- aiq/profiler/forecasting/config.py +18 -0
- aiq/profiler/forecasting/model_trainer.py +75 -0
- aiq/profiler/forecasting/models/__init__.py +22 -0
- aiq/profiler/forecasting/models/forecasting_base_model.py +40 -0
- aiq/profiler/forecasting/models/linear_model.py +196 -0
- aiq/profiler/forecasting/models/random_forest_regressor.py +268 -0
- aiq/profiler/inference_metrics_model.py +28 -0
- aiq/profiler/inference_optimization/__init__.py +0 -0
- aiq/profiler/inference_optimization/bottleneck_analysis/__init__.py +0 -0
- aiq/profiler/inference_optimization/bottleneck_analysis/nested_stack_analysis.py +460 -0
- aiq/profiler/inference_optimization/bottleneck_analysis/simple_stack_analysis.py +258 -0
- aiq/profiler/inference_optimization/data_models.py +386 -0
- aiq/profiler/inference_optimization/experimental/__init__.py +0 -0
- aiq/profiler/inference_optimization/experimental/concurrency_spike_analysis.py +468 -0
- aiq/profiler/inference_optimization/experimental/prefix_span_analysis.py +405 -0
- aiq/profiler/inference_optimization/llm_metrics.py +212 -0
- aiq/profiler/inference_optimization/prompt_caching.py +163 -0
- aiq/profiler/inference_optimization/token_uniqueness.py +107 -0
- aiq/profiler/inference_optimization/workflow_runtimes.py +72 -0
- aiq/profiler/intermediate_property_adapter.py +102 -0
- aiq/profiler/profile_runner.py +473 -0
- aiq/profiler/utils.py +184 -0
- aiq/registry_handlers/__init__.py +0 -0
- aiq/registry_handlers/local/__init__.py +0 -0
- aiq/registry_handlers/local/local_handler.py +176 -0
- aiq/registry_handlers/local/register_local.py +37 -0
- aiq/registry_handlers/metadata_factory.py +60 -0
- aiq/registry_handlers/package_utils.py +567 -0
- aiq/registry_handlers/pypi/__init__.py +0 -0
- aiq/registry_handlers/pypi/pypi_handler.py +251 -0
- aiq/registry_handlers/pypi/register_pypi.py +40 -0
- aiq/registry_handlers/register.py +21 -0
- aiq/registry_handlers/registry_handler_base.py +157 -0
- aiq/registry_handlers/rest/__init__.py +0 -0
- aiq/registry_handlers/rest/register_rest.py +56 -0
- aiq/registry_handlers/rest/rest_handler.py +237 -0
- aiq/registry_handlers/schemas/__init__.py +0 -0
- aiq/registry_handlers/schemas/headers.py +42 -0
- aiq/registry_handlers/schemas/package.py +68 -0
- aiq/registry_handlers/schemas/publish.py +63 -0
- aiq/registry_handlers/schemas/pull.py +82 -0
- aiq/registry_handlers/schemas/remove.py +36 -0
- aiq/registry_handlers/schemas/search.py +91 -0
- aiq/registry_handlers/schemas/status.py +47 -0
- aiq/retriever/__init__.py +0 -0
- aiq/retriever/interface.py +37 -0
- aiq/retriever/milvus/__init__.py +14 -0
- aiq/retriever/milvus/register.py +81 -0
- aiq/retriever/milvus/retriever.py +228 -0
- aiq/retriever/models.py +74 -0
- aiq/retriever/nemo_retriever/__init__.py +14 -0
- aiq/retriever/nemo_retriever/register.py +60 -0
- aiq/retriever/nemo_retriever/retriever.py +190 -0
- aiq/retriever/register.py +22 -0
- aiq/runtime/__init__.py +14 -0
- aiq/runtime/loader.py +215 -0
- aiq/runtime/runner.py +190 -0
- aiq/runtime/session.py +158 -0
- aiq/runtime/user_metadata.py +130 -0
- aiq/settings/__init__.py +0 -0
- aiq/settings/global_settings.py +318 -0
- aiq/test/.namespace +1 -0
- aiq/tool/__init__.py +0 -0
- aiq/tool/chat_completion.py +74 -0
- aiq/tool/code_execution/README.md +151 -0
- aiq/tool/code_execution/__init__.py +0 -0
- aiq/tool/code_execution/code_sandbox.py +267 -0
- aiq/tool/code_execution/local_sandbox/.gitignore +1 -0
- aiq/tool/code_execution/local_sandbox/Dockerfile.sandbox +60 -0
- aiq/tool/code_execution/local_sandbox/__init__.py +13 -0
- aiq/tool/code_execution/local_sandbox/local_sandbox_server.py +198 -0
- aiq/tool/code_execution/local_sandbox/sandbox.requirements.txt +6 -0
- aiq/tool/code_execution/local_sandbox/start_local_sandbox.sh +50 -0
- aiq/tool/code_execution/register.py +74 -0
- aiq/tool/code_execution/test_code_execution_sandbox.py +414 -0
- aiq/tool/code_execution/utils.py +100 -0
- aiq/tool/datetime_tools.py +42 -0
- aiq/tool/document_search.py +141 -0
- aiq/tool/github_tools/__init__.py +0 -0
- aiq/tool/github_tools/create_github_commit.py +133 -0
- aiq/tool/github_tools/create_github_issue.py +87 -0
- aiq/tool/github_tools/create_github_pr.py +106 -0
- aiq/tool/github_tools/get_github_file.py +106 -0
- aiq/tool/github_tools/get_github_issue.py +166 -0
- aiq/tool/github_tools/get_github_pr.py +256 -0
- aiq/tool/github_tools/update_github_issue.py +100 -0
- aiq/tool/mcp/__init__.py +14 -0
- aiq/tool/mcp/exceptions.py +142 -0
- aiq/tool/mcp/mcp_client.py +255 -0
- aiq/tool/mcp/mcp_tool.py +96 -0
- aiq/tool/memory_tools/__init__.py +0 -0
- aiq/tool/memory_tools/add_memory_tool.py +79 -0
- aiq/tool/memory_tools/delete_memory_tool.py +67 -0
- aiq/tool/memory_tools/get_memory_tool.py +72 -0
- aiq/tool/nvidia_rag.py +95 -0
- aiq/tool/register.py +38 -0
- aiq/tool/retriever.py +89 -0
- aiq/tool/server_tools.py +66 -0
- aiq/utils/__init__.py +0 -0
- aiq/utils/data_models/__init__.py +0 -0
- aiq/utils/data_models/schema_validator.py +58 -0
- aiq/utils/debugging_utils.py +43 -0
- aiq/utils/dump_distro_mapping.py +32 -0
- aiq/utils/exception_handlers/__init__.py +0 -0
- aiq/utils/exception_handlers/automatic_retries.py +289 -0
- aiq/utils/exception_handlers/mcp.py +211 -0
- aiq/utils/exception_handlers/schemas.py +114 -0
- aiq/utils/io/__init__.py +0 -0
- aiq/utils/io/model_processing.py +28 -0
- aiq/utils/io/yaml_tools.py +119 -0
- aiq/utils/log_utils.py +37 -0
- aiq/utils/metadata_utils.py +74 -0
- aiq/utils/optional_imports.py +142 -0
- aiq/utils/producer_consumer_queue.py +178 -0
- aiq/utils/reactive/__init__.py +0 -0
- aiq/utils/reactive/base/__init__.py +0 -0
- aiq/utils/reactive/base/observable_base.py +65 -0
- aiq/utils/reactive/base/observer_base.py +55 -0
- aiq/utils/reactive/base/subject_base.py +79 -0
- aiq/utils/reactive/observable.py +59 -0
- aiq/utils/reactive/observer.py +76 -0
- aiq/utils/reactive/subject.py +131 -0
- aiq/utils/reactive/subscription.py +49 -0
- aiq/utils/settings/__init__.py +0 -0
- aiq/utils/settings/global_settings.py +197 -0
- aiq/utils/string_utils.py +38 -0
- aiq/utils/type_converter.py +290 -0
- aiq/utils/type_utils.py +484 -0
- aiq/utils/url_utils.py +27 -0
- nvidia_nat-1.2.0rc5.dist-info/METADATA +363 -0
- nvidia_nat-1.2.0rc5.dist-info/RECORD +435 -0
- nvidia_nat-1.2.0rc5.dist-info/WHEEL +5 -0
- nvidia_nat-1.2.0rc5.dist-info/entry_points.txt +20 -0
- nvidia_nat-1.2.0rc5.dist-info/licenses/LICENSE-3rd-party.txt +3686 -0
- nvidia_nat-1.2.0rc5.dist-info/licenses/LICENSE.md +201 -0
- nvidia_nat-1.2.0rc5.dist-info/top_level.txt +1 -0
aiq/eval/evaluate.py
ADDED @@ -0,0 +1,506 @@

```python
# SPDX-FileCopyrightText: Copyright (c) 2024-2025, NVIDIA CORPORATION & AFFILIATES. All rights reserved.
# SPDX-License-Identifier: Apache-2.0
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

import asyncio
import logging
import shutil
from pathlib import Path
from typing import Any
from uuid import uuid4

from pydantic import BaseModel
from tqdm import tqdm

from aiq.data_models.evaluate import EvalConfig
from aiq.data_models.evaluate import JobEvictionPolicy
from aiq.eval.config import EvaluationRunConfig
from aiq.eval.config import EvaluationRunOutput
from aiq.eval.dataset_handler.dataset_handler import DatasetHandler
from aiq.eval.evaluator.evaluator_model import EvalInput
from aiq.eval.evaluator.evaluator_model import EvalInputItem
from aiq.eval.evaluator.evaluator_model import EvalOutput
from aiq.eval.usage_stats import UsageStats
from aiq.eval.usage_stats import UsageStatsItem
from aiq.eval.usage_stats import UsageStatsLLM
from aiq.eval.utils.output_uploader import OutputUploader
from aiq.eval.utils.weave_eval import WeaveEvaluationIntegration
from aiq.profiler.data_models import ProfilerResults
from aiq.runtime.session import AIQSessionManager

logger = logging.getLogger(__name__)


class EvaluationRun:  # pylint: disable=too-many-public-methods
    """
    Instantiated for each evaluation run and used to store data for that single run.
    """

    def __init__(self, config: EvaluationRunConfig):
        """
        Initialize an EvaluationRun with configuration.
        """
        from aiq.eval.intermediate_step_adapter import IntermediateStepAdapter

        # Run-specific configuration
        self.config: EvaluationRunConfig = config
        self.eval_config: EvalConfig | None = None

        # Helpers
        self.intermediate_step_adapter: IntermediateStepAdapter = IntermediateStepAdapter()
        self.weave_eval: WeaveEvaluationIntegration = WeaveEvaluationIntegration()
        # Metadata
        self.eval_input: EvalInput | None = None
        self.workflow_interrupted: bool = False

        # evaluation_results is a list of tuples (evaluator_name, EvalOutput)
        self.evaluation_results: list[tuple[str, EvalOutput]] = []

        # usage stats
        self.usage_stats: UsageStats = UsageStats()

        # workflow output file
        self.workflow_output_file: Path | None = None

        # evaluation output files
        self.evaluator_output_files: list[Path] = []

    def _compute_usage_stats(self, item: EvalInputItem):
        """Compute usage stats for a single item using the intermediate steps"""
        # get the prompt and completion tokens from the intermediate steps
        from aiq.profiler.intermediate_property_adapter import IntermediatePropertyAdaptor
        steps = [IntermediatePropertyAdaptor.from_intermediate_step(step) for step in item.trajectory]
        usage_stats_per_llm = {}
        total_tokens = 0
        for step in steps:
            if step.event_type == "LLM_END":
                llm_name = step.llm_name
                if llm_name not in usage_stats_per_llm:
                    usage_stats_per_llm[llm_name] = UsageStatsLLM()
                usage_stats_per_llm[llm_name].prompt_tokens += step.token_usage.prompt_tokens
                usage_stats_per_llm[llm_name].completion_tokens += step.token_usage.completion_tokens
                usage_stats_per_llm[llm_name].total_tokens += step.token_usage.total_tokens
                total_tokens += step.token_usage.total_tokens

        # find min and max event timestamps
        if item.trajectory:
            min_timestamp = min(step.event_timestamp for step in item.trajectory)
            max_timestamp = max(step.event_timestamp for step in item.trajectory)
            runtime = max_timestamp - min_timestamp
        else:
            min_timestamp = 0.0
            max_timestamp = 0.0
            runtime = 0.0

        # find llm latency by calculating p95 of all llm calls
        llm_latencies = []
        previous_llm_start_time = None
        for step in steps:
            if step.event_type == "LLM_START":
                previous_llm_start_time = step.event_timestamp
            elif step.event_type == "LLM_END" and previous_llm_start_time is not None:
                llm_latencies.append(step.event_timestamp - previous_llm_start_time)
                previous_llm_start_time = None

        # Calculate p95 LLM latency (or 0 if no LLM calls)
        if llm_latencies:
            import numpy as np
            llm_latency = float(np.percentile(llm_latencies, 95))
        else:
            llm_latency = 0.0

        # add the usage stats to the usage stats dict
        self.usage_stats.usage_stats_items[item.id] = UsageStatsItem(usage_stats_per_llm=usage_stats_per_llm,
                                                                     runtime=runtime,
                                                                     total_tokens=total_tokens,
                                                                     min_timestamp=min_timestamp,
                                                                     max_timestamp=max_timestamp,
                                                                     llm_latency=llm_latency)
        return self.usage_stats.usage_stats_items[item.id]

    async def run_workflow_local(self, session_manager: AIQSessionManager):
        '''
        Launch the workflow with the specified questions and extract the output using the jsonpath
        '''
        # import function level dependencies
        from jsonpath_ng import parse

        from aiq.eval.runtime_event_subscriber import pull_intermediate

        # Run the workflow
        jsonpath_expr = parse(self.config.result_json_path)
        stop_event = asyncio.Event()

        async def run_one(item: EvalInputItem):
            if stop_event.is_set():
                return "", []

            async with session_manager.run(item.input_obj) as runner:
                if not session_manager.workflow.has_single_output:
                    # raise an error if the workflow has multiple outputs
                    raise NotImplementedError("Multiple outputs are not supported")

                runner_result = None
                intermediate_future = None

                try:

                    # Start usage stats and intermediate steps collection in parallel
                    intermediate_future = pull_intermediate()
                    runner_result = runner.result()
                    base_output = await runner_result
                    intermediate_steps = await intermediate_future
                except NotImplementedError as e:
                    # raise original error
                    raise e
                except Exception as e:
                    logger.exception("Failed to run the workflow: %s", e, exc_info=True)
                    # stop processing if a workflow error occurs
                    self.workflow_interrupted = True

                    # Cancel any coroutines that are still running, avoiding a warning about unawaited coroutines
                    # (typically one of these two is what raised the exception and the other is still running)
                    for coro in (runner_result, intermediate_future):
                        if coro is not None:
                            asyncio.ensure_future(coro).cancel()

                    stop_event.set()
                    return

                try:
                    base_output = runner.convert(base_output, to_type=str)
                except ValueError:
                    pass

            # if base_output is a pydantic model dump it to json
            if isinstance(base_output, BaseModel):
                output = base_output.model_dump_json(indent=2)
            else:
                m = jsonpath_expr.find(base_output)
                if (not m):
                    raise RuntimeError(f"Failed to extract output using jsonpath: {self.config.result_json_path}")
                if (len(m) > 1):
                    logger.warning("Multiple matches found for jsonpath at row '%s'. Matches: %s. Using the first",
                                   base_output,
                                   m)
                output = m[0].value

            item.output_obj = output
            item.trajectory = self.intermediate_step_adapter.validate_intermediate_steps(intermediate_steps)
            usage_stats_item = self._compute_usage_stats(item)

            self.weave_eval.log_prediction(item, output)
            await self.weave_eval.log_usage_stats(item, usage_stats_item)

        async def wrapped_run(item: EvalInputItem) -> None:
            await run_one(item)
            pbar.update(1)

        # if self.config.skip_completed_entries is set skip eval_input_items with a non-empty output_obj
        if self.config.skip_completed_entries:
            eval_input_items = [item for item in self.eval_input.eval_input_items if not item.output_obj]
            if not eval_input_items:
                logger.warning("All items have a non-empty output. Skipping workflow pass altogether.")
                return
        else:
            eval_input_items = self.eval_input.eval_input_items
        pbar = tqdm(total=len(eval_input_items), desc="Running workflow")
        await asyncio.gather(*[wrapped_run(item) for item in eval_input_items])
        pbar.close()

    async def run_workflow_remote(self):
        from aiq.eval.remote_workflow import EvaluationRemoteWorkflowHandler
        handler = EvaluationRemoteWorkflowHandler(self.config, self.eval_config.general.max_concurrency)
        await handler.run_workflow_remote(self.eval_input)
        for item in self.eval_input.eval_input_items:
            usage_stats_item = self._compute_usage_stats(item)
            self.weave_eval.log_prediction(item, item.output_obj)
            await self.weave_eval.log_usage_stats(item, usage_stats_item)

    async def profile_workflow(self) -> ProfilerResults:
        """
        Profile a dataset
        """

        if not self.eval_config.general.profiler:
            logger.info("Profiler is not enabled. Skipping profiling.")
            return ProfilerResults()

        from aiq.profiler.profile_runner import ProfilerRunner

        all_stats = []
        for input_item in self.eval_input.eval_input_items:
            all_stats.append(input_item.trajectory)

        profiler_runner = ProfilerRunner(self.eval_config.general.profiler,
                                         self.eval_config.general.output_dir,
                                         write_output=self.config.write_output)

        return await profiler_runner.run(all_stats)

    def cleanup_output_directory(self):
        '''Remove contents of the output directory if it exists'''
        output_config = self.eval_config.general.output
        output_dir = output_config.dir

        if not (output_config and output_dir.exists()):
            return

        # If cleanup is true, remove the entire directory and we are done
        if output_config.cleanup:
            logger.info("Cleaning up entire output directory: %s", output_config.dir)
            shutil.rmtree(output_config.dir)
            return

        if output_config.job_management.max_jobs == 0:
            # No eviction policy
            return

        base_dir = output_dir / "jobs"
        if not base_dir.exists():
            return

        # Get all subdirectories, which represent individual job runs
        job_dirs = [d for d in base_dir.iterdir() if d.is_dir()]
        if len(job_dirs) <= output_config.job_management.max_jobs:
            return

        # Determine sort key based on eviction_policy, defaulting to creation time
        if output_config.job_management.eviction_policy == JobEvictionPolicy.TIME_MODIFIED:

            def sort_key(x):
                return x.stat().st_mtime

            logger.info("Using last modified time for job eviction policy.")
        else:

            def sort_key(x):
                return x.stat().st_ctime

            logger.info("Using creation time for job eviction policy.")

        # Sort directories (oldest first)
        job_dirs.sort(key=sort_key)
        num_to_delete = len(job_dirs) - output_config.job_management.max_jobs

        logger.info("Found %d jobs, exceeding limit of %d. Removing %d oldest jobs.",
                    len(job_dirs),
                    output_config.job_management.max_jobs,
                    num_to_delete)

        for dir_to_delete in job_dirs[:num_to_delete]:
            try:
                logger.info("Deleting old job directory: %s", dir_to_delete)
                shutil.rmtree(dir_to_delete)
            except Exception as e:
                logger.exception("Failed to delete old job directory: %s: %s", dir_to_delete, e, exc_info=True)

    def write_output(self, dataset_handler: DatasetHandler, profiler_results: ProfilerResults):
        workflow_output_file = self.eval_config.general.output_dir / "workflow_output.json"
        workflow_output_file.parent.mkdir(parents=True, exist_ok=True)

        # Write the workflow output to a file (this can be used for re-running the evaluation)

        step_filter = self.eval_config.general.output.workflow_output_step_filter \
            if self.eval_config.general.output else None
        workflow_output = dataset_handler.publish_eval_input(self.eval_input, step_filter)
        with open(workflow_output_file, "w", encoding="utf-8") as f:
            # set indent to 2 for pretty printing
            f.write(workflow_output)
        self.workflow_output_file = workflow_output_file
        logger.info("Workflow output written to %s", workflow_output_file)

        # Write the output of each evaluator to a separate json file
        for evaluator_name, eval_output in self.evaluation_results:
            output_file = self.eval_config.general.output_dir / f"{evaluator_name}_output.json"
            output_file.parent.mkdir(parents=True, exist_ok=True)
            # create json content using the evaluation results
            output = eval_output.model_dump_json(indent=2)
            with open(output_file, "w", encoding="utf-8") as f:
                f.write(output)
            self.evaluator_output_files.append(output_file)
            logger.info("Evaluation results written to %s", output_file)

    def publish_output(self, dataset_handler: DatasetHandler, profiler_results: ProfilerResults):
        """Publish the output"""
        if self.config.write_output:
            self.write_output(dataset_handler, profiler_results)

        if self.workflow_interrupted:
            # Issue a warning if the workflow was not completed on all datasets
            msg = ("Workflow execution was interrupted due to an error. The results may be incomplete. "
                   "You can re-execute evaluation for incomplete results by running "
                   "`eval` with the --skip_completed_entries flag.")
            logger.warning(msg)

        self.weave_eval.log_summary(self.usage_stats, self.evaluation_results, profiler_results)

    async def run_single_evaluator(self, evaluator_name: str, evaluator: Any):
        """Run a single evaluator and store its results."""
        try:
            eval_output = await evaluator.evaluate_fn(self.eval_input)
            self.evaluation_results.append((evaluator_name, eval_output))

            await self.weave_eval.alog_score(eval_output, evaluator_name)
        except Exception as e:
            logger.exception("An error occurred while running evaluator %s: %s", evaluator_name, e, exc_info=True)

    async def run_evaluators(self, evaluators: dict[str, Any]):
        """Run all configured evaluators asynchronously."""
        tasks = [self.run_single_evaluator(name, evaluator) for name, evaluator in evaluators.items() if evaluator]

        if not tasks:
            logger.warning("All evaluators were empty or invalid.")
            return

        try:
            await asyncio.gather(*tasks)
        except Exception as e:
            logger.exception("An error occurred while running evaluators: %s", e, exc_info=True)
            raise
        finally:
            # Finish prediction loggers in Weave
            await self.weave_eval.afinish_loggers()

    def apply_overrides(self):
        from aiq.cli.cli_utils.config_override import load_and_override_config
        from aiq.data_models.config import AIQConfig
        from aiq.runtime.loader import PluginTypes
        from aiq.runtime.loader import discover_and_register_plugins
        from aiq.utils.data_models.schema_validator import validate_schema

        # Register plugins before validation
        discover_and_register_plugins(PluginTypes.CONFIG_OBJECT)

        config_dict = load_and_override_config(self.config.config_file, self.config.override)
        config = validate_schema(config_dict, AIQConfig)
        return config

    def _get_workflow_alias(self, workflow_type: str | None = None):
        """Get the workflow alias for displaying in evaluation UI."""
        if self.eval_config.general.workflow_alias:
            return self.eval_config.general.workflow_alias

        if not workflow_type or workflow_type == "EmptyFunctionConfig":
            return "aiqtoolkit-eval"

        return workflow_type

    async def run_and_evaluate(self,
                               session_manager: AIQSessionManager | None = None,
                               job_id: str | None = None) -> EvaluationRunOutput:
        """
        Run the workflow with the specified config file and evaluate the dataset
        """
        logger.info("Starting evaluation run with config file: %s", self.config.config_file)

        from aiq.builder.eval_builder import WorkflowEvalBuilder
        from aiq.runtime.loader import load_config

        # Load and override the config
        if self.config.override:
            config = self.apply_overrides()
        else:
            config = load_config(self.config.config_file)
        self.eval_config = config.eval
        workflow_alias = self._get_workflow_alias(config.workflow.type)
        logger.debug("Loaded %s evaluation configuration: %s", workflow_alias, self.eval_config)

        # Cleanup the output directory
        if self.eval_config.general.output:
            self.cleanup_output_directory()

        # Generate a job_id if append_job_id_to_output_dir is enabled and no job_id provided
        if (self.eval_config.general.output
                and self.eval_config.general.output.job_management.append_job_id_to_output_dir and not job_id):
            job_id = "job_" + str(uuid4())
            logger.info("Generated job ID for output directory: %s", job_id)

        # If a job id is provided keep the data per-job
        if job_id:
            self.eval_config.general.output_dir = self.eval_config.general.output_dir / f"jobs/{job_id}"
            if self.eval_config.general.output:
                self.eval_config.general.output.dir = self.eval_config.general.output_dir

        # Load the input dataset
        # For multiple datasets, one handler per dataset can be created
        dataset_config = self.eval_config.general.dataset  # Currently only one dataset is supported
        if not dataset_config:
            logger.info("No dataset found, nothing to evaluate")
            return EvaluationRunOutput(
                workflow_output_file=self.workflow_output_file,
                evaluator_output_files=self.evaluator_output_files,
                workflow_interrupted=self.workflow_interrupted,
            )

        dataset_handler = DatasetHandler(dataset_config=dataset_config,
                                         reps=self.config.reps,
                                         concurrency=self.eval_config.general.max_concurrency,
                                         num_passes=self.config.num_passes,
                                         adjust_dataset_size=self.config.adjust_dataset_size)
        self.eval_input = dataset_handler.get_eval_input_from_dataset(self.config.dataset)
        if not self.eval_input.eval_input_items:
            logger.info("Dataset is empty. Nothing to evaluate.")
            return EvaluationRunOutput(
                workflow_output_file=self.workflow_output_file,
                evaluator_output_files=self.evaluator_output_files,
                workflow_interrupted=self.workflow_interrupted,
            )

        # Run workflow and evaluate
        async with WorkflowEvalBuilder.from_config(config=config) as eval_workflow:
            # Initialize Weave integration
            self.weave_eval.initialize_logger(workflow_alias, self.eval_input, config)

            # Run workflow
            if self.config.endpoint:
                await self.run_workflow_remote()
            else:
                if not self.config.skip_workflow:
                    if session_manager is None:
                        session_manager = AIQSessionManager(eval_workflow.build(),
                                                            max_concurrency=self.eval_config.general.max_concurrency)
                    await self.run_workflow_local(session_manager)

            # Evaluate
            evaluators = {name: eval_workflow.get_evaluator(name) for name in self.eval_config.evaluators}
            await self.run_evaluators(evaluators)

        # Profile the workflow
        profiler_results = await self.profile_workflow()

        # compute total runtime
        if self.usage_stats.usage_stats_items:
            self.usage_stats.total_runtime = max(self.usage_stats.usage_stats_items.values(),
                                                 key=lambda x: x.max_timestamp).max_timestamp - \
                min(self.usage_stats.usage_stats_items.values(), key=lambda x: x.min_timestamp).min_timestamp
        else:
            self.usage_stats.total_runtime = 0.0

        # Publish the results
        self.publish_output(dataset_handler, profiler_results)

        # Run custom scripts and upload evaluation outputs to S3
        if self.eval_config.general.output:
            output_uploader = OutputUploader(self.eval_config.general.output, job_id=job_id)
            output_uploader.run_custom_scripts()
            await output_uploader.upload_directory()

        return EvaluationRunOutput(workflow_output_file=self.workflow_output_file,
                                   evaluator_output_files=self.evaluator_output_files,
                                   workflow_interrupted=self.workflow_interrupted,
                                   eval_input=self.eval_input,
                                   evaluation_results=self.evaluation_results,
                                   usage_stats=self.usage_stats,
                                   profiler_results=profiler_results)
```
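For orientation, here is a minimal sketch of driving this module programmatically. It assumes a workflow config file whose `eval` section defines a dataset and at least one evaluator (`configs/eval_config.yml` is a hypothetical path), and that the remaining `EvaluationRunConfig` fields default sensibly; everything else uses only names visible in the module above.

```python
# Minimal sketch, assuming a config file with an `eval` section; the path is
# hypothetical and other EvaluationRunConfig fields are assumed to have defaults.
import asyncio

from aiq.eval.config import EvaluationRunConfig
from aiq.eval.evaluate import EvaluationRun


async def main():
    config = EvaluationRunConfig(config_file="configs/eval_config.yml")  # hypothetical path
    run = EvaluationRun(config)
    # Builds the workflow, runs every dataset item, runs the configured evaluators,
    # then writes workflow_output.json and one <evaluator>_output.json per evaluator.
    result = await run.run_and_evaluate()
    print("interrupted:", result.workflow_interrupted)
    print("outputs:", result.workflow_output_file, result.evaluator_output_files)


asyncio.run(main())
```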
aiq/eval/evaluator/__init__.py
ADDED @@ -0,0 +1,14 @@

```python
# SPDX-FileCopyrightText: Copyright (c) 2025, NVIDIA CORPORATION & AFFILIATES. All rights reserved.
# SPDX-License-Identifier: Apache-2.0
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
```
aiq/eval/evaluator/base_evaluator.py
ADDED @@ -0,0 +1,73 @@

```python
# SPDX-FileCopyrightText: Copyright (c) 2025, NVIDIA CORPORATION & AFFILIATES. All rights reserved.
# SPDX-License-Identifier: Apache-2.0
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

import asyncio
from abc import ABC
from abc import abstractmethod

from tqdm import tqdm

from aiq.eval.evaluator.evaluator_model import EvalInput
from aiq.eval.evaluator.evaluator_model import EvalInputItem
from aiq.eval.evaluator.evaluator_model import EvalOutput
from aiq.eval.evaluator.evaluator_model import EvalOutputItem
from aiq.eval.utils.tqdm_position_registry import TqdmPositionRegistry


class BaseEvaluator(ABC):
    """
    Base class for custom evaluators.

    Each custom evaluator must implement the `evaluate_item` method which is used to evaluate a
    single EvalInputItem.
    """

    def __init__(self, max_concurrency: int = 4, tqdm_desc: str = "Evaluating"):
        self.max_concurrency = max_concurrency
        self.semaphore = asyncio.Semaphore(max_concurrency)
        self.tqdm_desc = tqdm_desc

    @abstractmethod
    async def evaluate_item(self, item: EvalInputItem) -> EvalOutputItem:
        """Each evaluator must implement this for item-level evaluation"""
        pass

    async def evaluate(self, eval_input: EvalInput) -> EvalOutput:
        pbar = None
        try:
            tqdm_position = TqdmPositionRegistry.claim()
            pbar = tqdm(total=len(eval_input.eval_input_items), desc=self.tqdm_desc, position=tqdm_position)

            async def wrapped(item):
                async with self.semaphore:
                    try:
                        output_item = await self.evaluate_item(item)
                        pbar.update(1)
                        return output_item
                    except Exception as e:
                        # If the evaluator fails, return an error item with a score of 0.0
                        pbar.update(1)
                        return EvalOutputItem(id=item.id, score=0.0, reasoning={"error": f"Evaluator error: {str(e)}"})

            output_items = await asyncio.gather(*[wrapped(item) for item in eval_input.eval_input_items])
        finally:
            pbar.close()
            TqdmPositionRegistry.release(tqdm_position)

        # Compute average if possible
        numeric_scores = [item.score for item in output_items if isinstance(item.score, (int, float))]
        avg_score = round(sum(numeric_scores) / len(numeric_scores), 2) if numeric_scores else None

        return EvalOutput(average_score=avg_score, eval_output_items=output_items)
```
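As a sketch of what a concrete subclass might look like, using only the API shown above, here is an exact-match scorer. `ExactMatchEvaluator` is illustrative, not part of the package; the base class supplies the concurrency limiting, progress bar, per-item error handling, and average-score computation.

```python
# Illustrative custom evaluator built on BaseEvaluator (not part of the package).
from aiq.eval.evaluator.base_evaluator import BaseEvaluator
from aiq.eval.evaluator.evaluator_model import EvalInputItem
from aiq.eval.evaluator.evaluator_model import EvalOutputItem


class ExactMatchEvaluator(BaseEvaluator):

    def __init__(self, max_concurrency: int = 4):
        super().__init__(max_concurrency=max_concurrency, tqdm_desc="Exact match")

    async def evaluate_item(self, item: EvalInputItem) -> EvalOutputItem:
        # Score 1.0 on an exact (whitespace-insensitive) string match, else 0.0.
        matched = str(item.output_obj).strip() == str(item.expected_output_obj).strip()
        return EvalOutputItem(id=item.id,
                              score=1.0 if matched else 0.0,
                              reasoning={
                                  "expected": item.expected_output_obj,
                                  "generated": item.output_obj,
                              })
```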
aiq/eval/evaluator/evaluator_model.py
ADDED @@ -0,0 +1,45 @@

```python
# SPDX-FileCopyrightText: Copyright (c) 2025, NVIDIA CORPORATION & AFFILIATES. All rights reserved.
# SPDX-License-Identifier: Apache-2.0
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

import typing

from pydantic import BaseModel

from aiq.data_models.intermediate_step import IntermediateStep


class EvalInputItem(BaseModel):
    id: typing.Any
    input_obj: typing.Any
    expected_output_obj: typing.Any
    output_obj: typing.Any
    expected_trajectory: list[IntermediateStep]
    trajectory: list[IntermediateStep]
    full_dataset_entry: typing.Any


class EvalInput(BaseModel):
    eval_input_items: list[EvalInputItem]


class EvalOutputItem(BaseModel):
    id: typing.Any  # id or input_obj from EvalInputItem
    score: typing.Any  # float or any serializable type
    reasoning: typing.Any


class EvalOutput(BaseModel):
    average_score: typing.Any  # float or any serializable type
    eval_output_items: list[EvalOutputItem]
```
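A short illustration of how these models compose, with made-up field values: each `EvalInputItem` pairs a dataset entry with the generated output and its intermediate-step trajectory, and `EvalInput` is simply the list of items an evaluator receives.

```python
# Illustrative only; the question/answer values are invented for the example.
from aiq.eval.evaluator.evaluator_model import EvalInput
from aiq.eval.evaluator.evaluator_model import EvalInputItem

item = EvalInputItem(id=1,
                     input_obj="What is the capital of France?",
                     expected_output_obj="Paris",
                     output_obj="Paris",
                     expected_trajectory=[],
                     trajectory=[],  # populated with IntermediateStep objects after a workflow run
                     full_dataset_entry={"id": 1, "question": "What is the capital of France?", "answer": "Paris"})
eval_input = EvalInput(eval_input_items=[item])
```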