nvidia-nat 1.2.1rc1__py3-none-any.whl → 1.3.0__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- aiq/__init__.py +2 -2
- nat/agent/base.py +27 -18
- nat/agent/dual_node.py +9 -4
- nat/agent/prompt_optimizer/prompt.py +68 -0
- nat/agent/prompt_optimizer/register.py +149 -0
- nat/agent/react_agent/agent.py +81 -50
- nat/agent/react_agent/register.py +59 -40
- nat/agent/reasoning_agent/reasoning_agent.py +17 -15
- nat/agent/register.py +1 -1
- nat/agent/rewoo_agent/agent.py +327 -149
- nat/agent/rewoo_agent/prompt.py +19 -22
- nat/agent/rewoo_agent/register.py +64 -46
- nat/agent/tool_calling_agent/agent.py +152 -29
- nat/agent/tool_calling_agent/register.py +61 -38
- nat/authentication/api_key/api_key_auth_provider.py +2 -2
- nat/authentication/credential_validator/bearer_token_validator.py +557 -0
- nat/authentication/http_basic_auth/http_basic_auth_provider.py +1 -1
- nat/authentication/interfaces.py +5 -2
- nat/authentication/oauth2/oauth2_auth_code_flow_provider.py +69 -36
- nat/authentication/oauth2/oauth2_resource_server_config.py +124 -0
- nat/authentication/register.py +0 -1
- nat/builder/builder.py +56 -24
- nat/builder/component_utils.py +10 -6
- nat/builder/context.py +70 -18
- nat/builder/eval_builder.py +16 -11
- nat/builder/framework_enum.py +1 -0
- nat/builder/front_end.py +1 -1
- nat/builder/function.py +378 -8
- nat/builder/function_base.py +3 -3
- nat/builder/function_info.py +6 -8
- nat/builder/intermediate_step_manager.py +6 -2
- nat/builder/user_interaction_manager.py +2 -2
- nat/builder/workflow.py +13 -1
- nat/builder/workflow_builder.py +327 -79
- nat/cli/cli_utils/config_override.py +2 -2
- nat/cli/commands/evaluate.py +1 -1
- nat/cli/commands/info/info.py +16 -6
- nat/cli/commands/info/list_channels.py +1 -1
- nat/cli/commands/info/list_components.py +7 -8
- nat/cli/commands/mcp/__init__.py +14 -0
- nat/cli/commands/mcp/mcp.py +986 -0
- nat/cli/commands/object_store/__init__.py +14 -0
- nat/cli/commands/object_store/object_store.py +227 -0
- nat/cli/commands/optimize.py +90 -0
- nat/cli/commands/registry/publish.py +2 -2
- nat/cli/commands/registry/pull.py +2 -2
- nat/cli/commands/registry/remove.py +2 -2
- nat/cli/commands/registry/search.py +15 -17
- nat/cli/commands/start.py +16 -5
- nat/cli/commands/uninstall.py +1 -1
- nat/cli/commands/workflow/templates/config.yml.j2 +14 -13
- nat/cli/commands/workflow/templates/pyproject.toml.j2 +5 -2
- nat/cli/commands/workflow/templates/register.py.j2 +2 -3
- nat/cli/commands/workflow/templates/workflow.py.j2 +35 -21
- nat/cli/commands/workflow/workflow_commands.py +105 -19
- nat/cli/entrypoint.py +17 -11
- nat/cli/main.py +3 -0
- nat/cli/register_workflow.py +38 -4
- nat/cli/type_registry.py +79 -10
- nat/control_flow/__init__.py +0 -0
- nat/control_flow/register.py +20 -0
- nat/control_flow/router_agent/__init__.py +0 -0
- nat/control_flow/router_agent/agent.py +329 -0
- nat/control_flow/router_agent/prompt.py +48 -0
- nat/control_flow/router_agent/register.py +91 -0
- nat/control_flow/sequential_executor.py +166 -0
- nat/data_models/agent.py +34 -0
- nat/data_models/api_server.py +196 -67
- nat/data_models/authentication.py +23 -9
- nat/data_models/common.py +1 -1
- nat/data_models/component.py +2 -0
- nat/data_models/component_ref.py +11 -0
- nat/data_models/config.py +42 -18
- nat/data_models/dataset_handler.py +1 -1
- nat/data_models/discovery_metadata.py +4 -4
- nat/data_models/evaluate.py +4 -1
- nat/data_models/function.py +34 -0
- nat/data_models/function_dependencies.py +14 -6
- nat/data_models/gated_field_mixin.py +242 -0
- nat/data_models/intermediate_step.py +3 -3
- nat/data_models/optimizable.py +119 -0
- nat/data_models/optimizer.py +149 -0
- nat/data_models/span.py +41 -3
- nat/data_models/swe_bench_model.py +1 -1
- nat/data_models/temperature_mixin.py +44 -0
- nat/data_models/thinking_mixin.py +86 -0
- nat/data_models/top_p_mixin.py +44 -0
- nat/embedder/azure_openai_embedder.py +46 -0
- nat/embedder/nim_embedder.py +1 -1
- nat/embedder/openai_embedder.py +2 -3
- nat/embedder/register.py +1 -1
- nat/eval/config.py +3 -1
- nat/eval/dataset_handler/dataset_handler.py +71 -7
- nat/eval/evaluate.py +86 -31
- nat/eval/evaluator/base_evaluator.py +1 -1
- nat/eval/evaluator/evaluator_model.py +13 -0
- nat/eval/intermediate_step_adapter.py +1 -1
- nat/eval/rag_evaluator/evaluate.py +9 -6
- nat/eval/rag_evaluator/register.py +3 -3
- nat/eval/register.py +4 -1
- nat/eval/remote_workflow.py +3 -3
- nat/eval/runtime_evaluator/__init__.py +14 -0
- nat/eval/runtime_evaluator/evaluate.py +123 -0
- nat/eval/runtime_evaluator/register.py +100 -0
- nat/eval/swe_bench_evaluator/evaluate.py +6 -6
- nat/eval/trajectory_evaluator/evaluate.py +1 -1
- nat/eval/trajectory_evaluator/register.py +1 -1
- nat/eval/tunable_rag_evaluator/evaluate.py +4 -7
- nat/eval/utils/eval_trace_ctx.py +89 -0
- nat/eval/utils/weave_eval.py +18 -9
- nat/experimental/decorators/experimental_warning_decorator.py +27 -7
- nat/experimental/test_time_compute/functions/execute_score_select_function.py +1 -1
- nat/experimental/test_time_compute/functions/plan_select_execute_function.py +7 -3
- nat/experimental/test_time_compute/functions/ttc_tool_orchestration_function.py +3 -3
- nat/experimental/test_time_compute/functions/ttc_tool_wrapper_function.py +3 -3
- nat/experimental/test_time_compute/models/strategy_base.py +5 -4
- nat/experimental/test_time_compute/register.py +0 -1
- nat/experimental/test_time_compute/selection/llm_based_output_merging_selector.py +1 -3
- nat/front_ends/console/authentication_flow_handler.py +82 -30
- nat/front_ends/console/console_front_end_plugin.py +19 -7
- nat/front_ends/fastapi/auth_flow_handlers/http_flow_handler.py +1 -1
- nat/front_ends/fastapi/auth_flow_handlers/websocket_flow_handler.py +52 -17
- nat/front_ends/fastapi/dask_client_mixin.py +65 -0
- nat/front_ends/fastapi/fastapi_front_end_config.py +36 -5
- nat/front_ends/fastapi/fastapi_front_end_controller.py +4 -4
- nat/front_ends/fastapi/fastapi_front_end_plugin.py +135 -4
- nat/front_ends/fastapi/fastapi_front_end_plugin_worker.py +455 -282
- nat/front_ends/fastapi/job_store.py +518 -99
- nat/front_ends/fastapi/main.py +11 -19
- nat/front_ends/fastapi/message_handler.py +74 -50
- nat/front_ends/fastapi/message_validator.py +20 -21
- nat/front_ends/fastapi/response_helpers.py +4 -4
- nat/front_ends/fastapi/step_adaptor.py +2 -2
- nat/front_ends/fastapi/utils.py +57 -0
- nat/front_ends/mcp/introspection_token_verifier.py +73 -0
- nat/front_ends/mcp/mcp_front_end_config.py +47 -3
- nat/front_ends/mcp/mcp_front_end_plugin.py +48 -13
- nat/front_ends/mcp/mcp_front_end_plugin_worker.py +120 -8
- nat/front_ends/mcp/tool_converter.py +44 -14
- nat/front_ends/register.py +0 -1
- nat/front_ends/simple_base/simple_front_end_plugin_base.py +3 -1
- nat/llm/aws_bedrock_llm.py +24 -12
- nat/llm/azure_openai_llm.py +57 -0
- nat/llm/litellm_llm.py +69 -0
- nat/llm/nim_llm.py +20 -8
- nat/llm/openai_llm.py +14 -6
- nat/llm/register.py +5 -1
- nat/llm/utils/env_config_value.py +2 -3
- nat/llm/utils/thinking.py +215 -0
- nat/meta/pypi.md +9 -9
- nat/object_store/register.py +0 -1
- nat/observability/exporter/base_exporter.py +3 -3
- nat/observability/exporter/file_exporter.py +1 -1
- nat/observability/exporter/processing_exporter.py +309 -81
- nat/observability/exporter/span_exporter.py +35 -15
- nat/observability/exporter_manager.py +7 -7
- nat/observability/mixin/file_mixin.py +7 -7
- nat/observability/mixin/redaction_config_mixin.py +42 -0
- nat/observability/mixin/tagging_config_mixin.py +62 -0
- nat/observability/mixin/type_introspection_mixin.py +420 -107
- nat/observability/processor/batching_processor.py +5 -7
- nat/observability/processor/falsy_batch_filter_processor.py +55 -0
- nat/observability/processor/processor.py +3 -0
- nat/observability/processor/processor_factory.py +70 -0
- nat/observability/processor/redaction/__init__.py +24 -0
- nat/observability/processor/redaction/contextual_redaction_processor.py +125 -0
- nat/observability/processor/redaction/contextual_span_redaction_processor.py +66 -0
- nat/observability/processor/redaction/redaction_processor.py +177 -0
- nat/observability/processor/redaction/span_header_redaction_processor.py +92 -0
- nat/observability/processor/span_tagging_processor.py +68 -0
- nat/observability/register.py +22 -4
- nat/profiler/calc/calc_runner.py +3 -4
- nat/profiler/callbacks/agno_callback_handler.py +1 -1
- nat/profiler/callbacks/langchain_callback_handler.py +14 -7
- nat/profiler/callbacks/llama_index_callback_handler.py +3 -3
- nat/profiler/callbacks/semantic_kernel_callback_handler.py +3 -3
- nat/profiler/data_frame_row.py +1 -1
- nat/profiler/decorators/framework_wrapper.py +62 -13
- nat/profiler/decorators/function_tracking.py +160 -3
- nat/profiler/forecasting/models/forecasting_base_model.py +3 -1
- nat/profiler/forecasting/models/linear_model.py +1 -1
- nat/profiler/forecasting/models/random_forest_regressor.py +1 -1
- nat/profiler/inference_optimization/bottleneck_analysis/nested_stack_analysis.py +1 -1
- nat/profiler/inference_optimization/bottleneck_analysis/simple_stack_analysis.py +1 -1
- nat/profiler/inference_optimization/data_models.py +3 -3
- nat/profiler/inference_optimization/experimental/prefix_span_analysis.py +8 -9
- nat/profiler/inference_optimization/token_uniqueness.py +1 -1
- nat/profiler/parameter_optimization/__init__.py +0 -0
- nat/profiler/parameter_optimization/optimizable_utils.py +93 -0
- nat/profiler/parameter_optimization/optimizer_runtime.py +67 -0
- nat/profiler/parameter_optimization/parameter_optimizer.py +164 -0
- nat/profiler/parameter_optimization/parameter_selection.py +107 -0
- nat/profiler/parameter_optimization/pareto_visualizer.py +395 -0
- nat/profiler/parameter_optimization/prompt_optimizer.py +384 -0
- nat/profiler/parameter_optimization/update_helpers.py +66 -0
- nat/profiler/profile_runner.py +14 -9
- nat/profiler/utils.py +4 -2
- nat/registry_handlers/local/local_handler.py +2 -2
- nat/registry_handlers/package_utils.py +1 -2
- nat/registry_handlers/pypi/pypi_handler.py +23 -26
- nat/registry_handlers/register.py +3 -4
- nat/registry_handlers/rest/rest_handler.py +12 -13
- nat/retriever/milvus/retriever.py +2 -2
- nat/retriever/nemo_retriever/retriever.py +1 -1
- nat/retriever/register.py +0 -1
- nat/runtime/loader.py +2 -2
- nat/runtime/runner.py +105 -8
- nat/runtime/session.py +69 -8
- nat/settings/global_settings.py +16 -5
- nat/tool/chat_completion.py +5 -2
- nat/tool/code_execution/local_sandbox/local_sandbox_server.py +3 -3
- nat/tool/datetime_tools.py +49 -9
- nat/tool/document_search.py +2 -2
- nat/tool/github_tools.py +450 -0
- nat/tool/memory_tools/add_memory_tool.py +3 -3
- nat/tool/memory_tools/delete_memory_tool.py +3 -4
- nat/tool/memory_tools/get_memory_tool.py +4 -4
- nat/tool/nvidia_rag.py +1 -1
- nat/tool/register.py +2 -9
- nat/tool/retriever.py +3 -2
- nat/utils/callable_utils.py +70 -0
- nat/utils/data_models/schema_validator.py +3 -3
- nat/utils/decorators.py +210 -0
- nat/utils/exception_handlers/automatic_retries.py +104 -51
- nat/utils/exception_handlers/schemas.py +1 -1
- nat/utils/io/yaml_tools.py +2 -2
- nat/utils/log_levels.py +25 -0
- nat/utils/reactive/base/observable_base.py +2 -2
- nat/utils/reactive/base/observer_base.py +1 -1
- nat/utils/reactive/observable.py +2 -2
- nat/utils/reactive/observer.py +4 -4
- nat/utils/reactive/subscription.py +1 -1
- nat/utils/settings/global_settings.py +6 -8
- nat/utils/type_converter.py +12 -3
- nat/utils/type_utils.py +9 -5
- nvidia_nat-1.3.0.dist-info/METADATA +195 -0
- {nvidia_nat-1.2.1rc1.dist-info → nvidia_nat-1.3.0.dist-info}/RECORD +244 -200
- {nvidia_nat-1.2.1rc1.dist-info → nvidia_nat-1.3.0.dist-info}/entry_points.txt +1 -0
- nat/cli/commands/info/list_mcp.py +0 -304
- nat/tool/github_tools/create_github_commit.py +0 -133
- nat/tool/github_tools/create_github_issue.py +0 -87
- nat/tool/github_tools/create_github_pr.py +0 -106
- nat/tool/github_tools/get_github_file.py +0 -106
- nat/tool/github_tools/get_github_issue.py +0 -166
- nat/tool/github_tools/get_github_pr.py +0 -256
- nat/tool/github_tools/update_github_issue.py +0 -100
- nat/tool/mcp/exceptions.py +0 -142
- nat/tool/mcp/mcp_client.py +0 -255
- nat/tool/mcp/mcp_tool.py +0 -96
- nat/utils/exception_handlers/mcp.py +0 -211
- nvidia_nat-1.2.1rc1.dist-info/METADATA +0 -365
- /nat/{tool/github_tools → agent/prompt_optimizer}/__init__.py +0 -0
- /nat/{tool/mcp → authentication/credential_validator}/__init__.py +0 -0
- {nvidia_nat-1.2.1rc1.dist-info → nvidia_nat-1.3.0.dist-info}/WHEEL +0 -0
- {nvidia_nat-1.2.1rc1.dist-info → nvidia_nat-1.3.0.dist-info}/licenses/LICENSE-3rd-party.txt +0 -0
- {nvidia_nat-1.2.1rc1.dist-info → nvidia_nat-1.3.0.dist-info}/licenses/LICENSE.md +0 -0
- {nvidia_nat-1.2.1rc1.dist-info → nvidia_nat-1.3.0.dist-info}/top_level.txt +0 -0

nat/eval/dataset_handler/dataset_handler.py  CHANGED

@@ -13,6 +13,7 @@
 # See the License for the specific language governing permissions and
 # limitations under the License.
 
+import importlib
 import json
 import math
 from pathlib import Path
@@ -41,7 +42,8 @@ class DatasetHandler:
                  reps: int,
                  concurrency: int,
                  num_passes: int = 1,
-                 adjust_dataset_size: bool = False
+                 adjust_dataset_size: bool = False,
+                 custom_pre_eval_process_function: str | None = None):
         from nat.eval.intermediate_step_adapter import IntermediateStepAdapter
 
         self.dataset_config = dataset_config
@@ -53,6 +55,9 @@ class DatasetHandler:
         self.num_passes = num_passes
         self.adjust_dataset_size = adjust_dataset_size
 
+        # Custom pre-evaluation process function
+        self.custom_pre_eval_process_function = custom_pre_eval_process_function
+
         # Helpers
         self.intermediate_step_adapter = IntermediateStepAdapter()
 
@@ -146,13 +151,12 @@ class DatasetHandler:
             # When num_passes is specified, always use concurrency * num_passes
             # This respects the user's intent for exact number of passes
             target_size = self.concurrency * self.num_passes
+        # When num_passes = 0, use the largest multiple of concurrency <= original_size
+        # If original_size < concurrency, we need at least concurrency rows
+        elif original_size >= self.concurrency:
+            target_size = (original_size // self.concurrency) * self.concurrency
         else:
-
-            # If original_size < concurrency, we need at least concurrency rows
-            if original_size >= self.concurrency:
-                target_size = (original_size // self.concurrency) * self.concurrency
-            else:
-                target_size = self.concurrency
+            target_size = self.concurrency
 
         if target_size == 0:
             raise ValueError("Input dataset too small for even one batch at given concurrency.")
@@ -331,6 +335,66 @@ class DatasetHandler:
         filtered_steps = self.intermediate_step_adapter.filter_intermediate_steps(intermediate_steps, event_filter)
         return self.intermediate_step_adapter.serialize_intermediate_steps(filtered_steps)
 
+    def pre_eval_process_eval_input(self, eval_input: EvalInput) -> EvalInput:
+        """
+        Pre-evaluation process the eval input using custom function if provided.
+
+        The custom pre-evaluation process function should have the signature:
+        def custom_pre_eval_process(item: EvalInputItem) -> EvalInputItem
+
+        The framework will iterate through all items and call this function on each one.
+
+        Args:
+            eval_input: The EvalInput object to pre-evaluation process
+
+        Returns:
+            The pre-evaluation processed EvalInput object
+        """
+        if self.custom_pre_eval_process_function:
+            try:
+                custom_function = self._load_custom_pre_eval_process_function()
+                processed_items = []
+
+                for item in eval_input.eval_input_items:
+                    processed_item = custom_function(item)
+                    if not isinstance(processed_item, EvalInputItem):
+                        raise TypeError(f"Custom pre-evaluation '{self.custom_pre_eval_process_function}' must return "
+                                        f"EvalInputItem, got {type(processed_item)}")
+                    processed_items.append(processed_item)
+
+                return EvalInput(eval_input_items=processed_items)
+            except Exception as e:
+                raise RuntimeError(f"Error calling custom pre-evaluation process function "
+                                   f"'{self.custom_pre_eval_process_function}': {e}") from e
+
+        return eval_input
+
+    def _load_custom_pre_eval_process_function(self):
+        """
+        Import and return the custom pre-evaluation process function using standard Python import path.
+
+        The function should process individual EvalInputItem objects.
+        """
+        # Split the function path to get module and function name
+        if "." not in self.custom_pre_eval_process_function:
+            raise ValueError(f"Invalid custom_pre_eval_process_function '{self.custom_pre_eval_process_function}'. "
+                             "Expected format: '<module_path>.<function_name>'")
+        module_path, function_name = self.custom_pre_eval_process_function.rsplit(".", 1)
+
+        # Import the module
+        module = importlib.import_module(module_path)
+
+        # Get the function from the module
+        if not hasattr(module, function_name):
+            raise AttributeError(f"Function '{function_name}' not found in module '{module_path}'")
+
+        custom_function = getattr(module, function_name)
+
+        if not callable(custom_function):
+            raise ValueError(f"'{self.custom_pre_eval_process_function}' is not callable")
+
+        return custom_function
+
     def publish_eval_input(self,
                            eval_input,
                            workflow_output_step_filter: list[IntermediateStepType] | None = None) -> str:
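
Note: the new pre_eval_process_eval_input hook above loads a user-supplied function addressed as '<module_path>.<function_name>' that takes and returns an EvalInputItem. A minimal sketch of such a hook follows; the module name, function name, and field handling are illustrative assumptions, not code shipped in this package:

    # my_project/eval_hooks.py -- hypothetical user module; only the signature is
    # dictated by DatasetHandler.pre_eval_process_eval_input above.
    from nat.eval.evaluator.evaluator_model import EvalInputItem


    def strip_output_whitespace(item: EvalInputItem) -> EvalInputItem:
        """Normalize the generated output before evaluators see it."""
        if isinstance(item.output_obj, str):
            item.output_obj = item.output_obj.strip()
        return item

Such a hook would be referenced as 'my_project.eval_hooks.strip_output_whitespace' through the custom_pre_eval_process_function setting consumed in nat/eval/evaluate.py below.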

nat/eval/evaluate.py  CHANGED

@@ -42,7 +42,7 @@ from nat.runtime.session import SessionManager
 logger = logging.getLogger(__name__)
 
 
-class EvaluationRun:
+class EvaluationRun:
     """
     Instantiated for each evaluation run and used to store data for that single run.
 
@@ -63,7 +63,16 @@ class EvaluationRun: # pylint: disable=too-many-public-methods
 
         # Helpers
         self.intermediate_step_adapter: IntermediateStepAdapter = IntermediateStepAdapter()
-
+
+        # Create evaluation trace context
+        try:
+            from nat.eval.utils.eval_trace_ctx import WeaveEvalTraceContext
+            self.eval_trace_context = WeaveEvalTraceContext()
+        except Exception:
+            from nat.eval.utils.eval_trace_ctx import EvalTraceContext
+            self.eval_trace_context = EvalTraceContext()
+
+        self.weave_eval: WeaveEvaluationIntegration = WeaveEvaluationIntegration(self.eval_trace_context)
         # Metadata
         self.eval_input: EvalInput | None = None
         self.workflow_interrupted: bool = False
@@ -159,17 +168,17 @@ class EvaluationRun: # pylint: disable=too-many-public-methods
         intermediate_future = None
 
         try:
-
             # Start usage stats and intermediate steps collection in parallel
             intermediate_future = pull_intermediate()
             runner_result = runner.result()
             base_output = await runner_result
             intermediate_steps = await intermediate_future
         except NotImplementedError as e:
+            logger.error("Failed to run the workflow: %s", e)
             # raise original error
-            raise
+            raise
         except Exception as e:
-            logger.exception("Failed to run the workflow: %s", e
+            logger.exception("Failed to run the workflow: %s", e)
             # stop processing if a workflow error occurs
             self.workflow_interrupted = True
 
@@ -308,9 +317,9 @@ class EvaluationRun: # pylint: disable=too-many-public-methods
                 logger.info("Deleting old job directory: %s", dir_to_delete)
                 shutil.rmtree(dir_to_delete)
             except Exception as e:
-                logger.exception("Failed to delete old job directory: %s: %s", dir_to_delete, e
+                logger.exception("Failed to delete old job directory: %s: %s", dir_to_delete, e)
 
-    def write_output(self, dataset_handler: DatasetHandler, profiler_results: ProfilerResults):
+    def write_output(self, dataset_handler: DatasetHandler, profiler_results: ProfilerResults):
         workflow_output_file = self.eval_config.general.output_dir / "workflow_output.json"
         workflow_output_file.parent.mkdir(parents=True, exist_ok=True)
 
@@ -358,7 +367,7 @@ class EvaluationRun: # pylint: disable=too-many-public-methods
 
             await self.weave_eval.alog_score(eval_output, evaluator_name)
         except Exception as e:
-            logger.exception("An error occurred while running evaluator %s: %s", evaluator_name, e
+            logger.exception("An error occurred while running evaluator %s: %s", evaluator_name, e)
 
     async def run_evaluators(self, evaluators: dict[str, Any]):
         """Run all configured evaluators asynchronously."""
@@ -371,7 +380,7 @@ class EvaluationRun: # pylint: disable=too-many-public-methods
         try:
             await asyncio.gather(*tasks)
         except Exception as e:
-            logger.
+            logger.error("An error occurred while running evaluators: %s", e)
             raise
         finally:
             # Finish prediction loggers in Weave
@@ -401,6 +410,33 @@ class EvaluationRun: # pylint: disable=too-many-public-methods
 
         return workflow_type
 
+    async def wait_for_all_export_tasks_local(self, session_manager: SessionManager, timeout: float) -> None:
+        """Wait for all trace export tasks to complete for local workflows.
+
+        This only works for local workflows where we have direct access to the
+        SessionManager and its underlying workflow with exporter manager.
+        """
+        try:
+            workflow = session_manager.workflow
+            all_exporters = await workflow.get_all_exporters()
+            if not all_exporters:
+                logger.debug("No exporters to wait for")
+                return
+
+            logger.info("Waiting for export tasks from %d local exporters (timeout: %ds)", len(all_exporters), timeout)
+
+            for name, exporter in all_exporters.items():
+                try:
+                    await exporter.wait_for_tasks(timeout=timeout)
+                    logger.info("Export tasks completed for exporter: %s", name)
+                except Exception as e:
+                    logger.warning("Error waiting for export tasks from %s: %s", name, e)
+
+            logger.info("All local export task waiting completed")
+
+        except Exception as e:
+            logger.warning("Failed to wait for local export tasks: %s", e)
+
     async def run_and_evaluate(self,
                                session_manager: SessionManager | None = None,
                                job_id: str | None = None) -> EvaluationRunOutput:
@@ -413,10 +449,14 @@ class EvaluationRun: # pylint: disable=too-many-public-methods
         from nat.runtime.loader import load_config
 
         # Load and override the config
-
+        config = None
+        if isinstance(self.config.config_file, BaseModel):
+            config = self.config.config_file
+        elif self.config.override:
             config = self.apply_overrides()
         else:
             config = load_config(self.config.config_file)
+
         self.eval_config = config.eval
         workflow_alias = self._get_workflow_alias(config.workflow.type)
         logger.debug("Loaded %s evaluation configuration: %s", workflow_alias, self.eval_config)
@@ -442,44 +482,59 @@ class EvaluationRun: # pylint: disable=too-many-public-methods
         dataset_config = self.eval_config.general.dataset # Currently only one dataset is supported
         if not dataset_config:
             logger.info("No dataset found, nothing to evaluate")
-            return EvaluationRunOutput(
-
-
-
-
-
+            return EvaluationRunOutput(workflow_output_file=self.workflow_output_file,
+                                       evaluator_output_files=self.evaluator_output_files,
+                                       workflow_interrupted=self.workflow_interrupted,
+                                       eval_input=EvalInput(eval_input_items=[]),
+                                       evaluation_results=[],
+                                       usage_stats=UsageStats(),
+                                       profiler_results=ProfilerResults())
+
+        custom_pre_eval_process_function = self.eval_config.general.output.custom_pre_eval_process_function \
+            if self.eval_config.general.output else None
         dataset_handler = DatasetHandler(dataset_config=dataset_config,
                                          reps=self.config.reps,
                                          concurrency=self.eval_config.general.max_concurrency,
                                          num_passes=self.config.num_passes,
-                                         adjust_dataset_size=self.config.adjust_dataset_size
+                                         adjust_dataset_size=self.config.adjust_dataset_size,
+                                         custom_pre_eval_process_function=custom_pre_eval_process_function)
         self.eval_input = dataset_handler.get_eval_input_from_dataset(self.config.dataset)
         if not self.eval_input.eval_input_items:
             logger.info("Dataset is empty. Nothing to evaluate.")
-            return EvaluationRunOutput(
-
-
-
-
+            return EvaluationRunOutput(workflow_output_file=self.workflow_output_file,
+                                       evaluator_output_files=self.evaluator_output_files,
+                                       workflow_interrupted=self.workflow_interrupted,
+                                       eval_input=self.eval_input,
+                                       evaluation_results=self.evaluation_results,
+                                       usage_stats=self.usage_stats,
+                                       profiler_results=ProfilerResults())
 
         # Run workflow and evaluate
         async with WorkflowEvalBuilder.from_config(config=config) as eval_workflow:
             # Initialize Weave integration
             self.weave_eval.initialize_logger(workflow_alias, self.eval_input, config)
 
-
-
-
-
-
+            with self.eval_trace_context.evaluation_context():
+                # Run workflow
+                if self.config.endpoint:
+                    await self.run_workflow_remote()
+                elif not self.config.skip_workflow:
                     if session_manager is None:
-
+                        workflow = await eval_workflow.build()
+                        session_manager = SessionManager(workflow,
                                                          max_concurrency=self.eval_config.general.max_concurrency)
                     await self.run_workflow_local(session_manager)
 
-
-
-
+                # Pre-evaluation process the workflow output
+                self.eval_input = dataset_handler.pre_eval_process_eval_input(self.eval_input)
+
+                # Evaluate
+                evaluators = {name: eval_workflow.get_evaluator(name) for name in self.eval_config.evaluators}
+                await self.run_evaluators(evaluators)
+
+                # Wait for all trace export tasks to complete (local workflows only)
+                if session_manager and not self.config.endpoint:
+                    await self.wait_for_all_export_tasks_local(session_manager, timeout=self.config.export_timeout)
 
         # Profile the workflow
         profiler_results = await self.profile_workflow()

nat/eval/evaluator/base_evaluator.py  CHANGED

@@ -71,7 +71,7 @@ class BaseEvaluator(ABC):
             TqdmPositionRegistry.release(tqdm_position)
 
         # Compute average if possible
-        numeric_scores = [item.score for item in output_items if isinstance(item.score,
+        numeric_scores = [item.score for item in output_items if isinstance(item.score, int | float)]
         avg_score = round(sum(numeric_scores) / len(numeric_scores), 2) if numeric_scores else None
 
         return EvalOutput(average_score=avg_score, eval_output_items=output_items)
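
Note: the new filter keeps only numeric scores via the PEP 604 union form isinstance(x, int | float), which requires Python 3.10+. A standalone illustration of that filter, with sample data invented for the example and not taken from the package:

    scores = [0.5, None, 2, "n/a", 0.75]
    numeric_scores = [s for s in scores if isinstance(s, int | float)]
    print(numeric_scores)  # [0.5, 2, 0.75]
    print(round(sum(numeric_scores) / len(numeric_scores), 2))  # 1.08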

nat/eval/evaluator/evaluator_model.py  CHANGED

@@ -29,6 +29,19 @@ class EvalInputItem(BaseModel):
     trajectory: list[IntermediateStep] = [] # populated by the workflow
     full_dataset_entry: typing.Any
 
+    def copy_with_updates(self, **updates) -> "EvalInputItem":
+        """
+        Copy EvalInputItem with optional field updates.
+        """
+        # Get all current fields
+        item_data = self.model_dump()
+
+        # Apply any updates
+        item_data.update(updates)
+
+        # Create new item with all fields
+        return EvalInputItem(**item_data)
+
 
 class EvalInput(BaseModel):
     eval_input_items: list[EvalInputItem]
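
A custom pre-evaluation hook (see DatasetHandler.pre_eval_process_eval_input earlier in this diff) can use the new helper to return an updated copy instead of mutating the item in place. The field choice and truncation length below are illustrative assumptions, not part of the package:

    # Hypothetical hook: cap the stored workflow output at 2000 characters.
    def truncate_output(item: EvalInputItem) -> EvalInputItem:
        return item.copy_with_updates(output_obj=str(item.output_obj)[:2000])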

nat/eval/intermediate_step_adapter.py  CHANGED

@@ -40,7 +40,7 @@ class IntermediateStepAdapter:
             try:
                 validated_steps.append(IntermediateStep.model_validate(step_data))
             except Exception as e:
-                logger.exception("Validation failed for step: %r, Error: %s", step_data, e
+                logger.exception("Validation failed for step: %r, Error: %s", step_data, e)
         return validated_steps
 
     def serialize_intermediate_steps(self, intermediate_steps: list[IntermediateStep]) -> list[dict]:

nat/eval/rag_evaluator/evaluate.py  CHANGED

@@ -102,7 +102,7 @@ class RAGEvaluator:
         """Converts the ragas EvaluationResult to nat EvalOutput"""
 
         if not results_dataset:
-            logger.error("Ragas evaluation failed with no results")
+            logger.error("Ragas evaluation failed with no results", exc_info=True)
             return EvalOutput(average_score=0.0, eval_output_items=[])
 
         scores: list[dict[str, float]] = results_dataset.scores
@@ -116,11 +116,14 @@ class RAGEvaluator:
             """Convert NaN or None to 0.0 for safe arithmetic/serialization."""
             return 0.0 if v is None or (isinstance(v, float) and math.isnan(v)) else v
 
-        #
+        # Keep original scores (preserving NaN/None) for output
+        original_scores_dict = {metric: [score.get(metric) for score in scores] for metric in scores[0]}
+
+        # Convert from list of dicts to dict of lists, coercing NaN/None to 0.0 for average calculation
         scores_dict = {metric: [_nan_to_zero(score.get(metric)) for score in scores] for metric in scores[0]}
         first_metric_name = list(scores_dict.keys())[0] if scores_dict else None
 
-        # Compute the average of each metric
+        # Compute the average of each metric using cleaned scores (NaN/None -> 0.0)
         average_scores = {
             metric: (sum(values) / len(values) if values else 0.0)
             for metric, values in scores_dict.items()
@@ -137,11 +140,11 @@ class RAGEvaluator:
         else:
             ids = df["user_input"].tolist() # Use "user_input" as ID fallback
 
-        # Construct EvalOutputItem list
+        # Construct EvalOutputItem list using original scores (preserving NaN/None)
         eval_output_items = [
             EvalOutputItem(
                 id=ids[i],
-                score=
+                score=original_scores_dict[first_metric_name][i] if first_metric_name else None,
                 reasoning={
                     key:
                         getattr(row, key, None) # Use getattr to safely access attributes
@@ -169,7 +172,7 @@ class RAGEvaluator:
                                     _pbar=pbar)
         except Exception as e:
             # On exception we still continue with other evaluators. Log and return an avg_score of 0.0
-            logger.exception("Error evaluating ragas metric, Error: %s", e
+            logger.exception("Error evaluating ragas metric, Error: %s", e)
             results_dataset = None
         finally:
             pbar.close()
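
The change above keeps two views of the ragas scores: original_scores_dict preserves NaN/None so a failed metric stays visible in the per-item output, while scores_dict coerces those values to 0.0 so averages stay finite. A plain-Python illustration of that split, with sample data invented for the example:

    import math

    scores = [{"faithfulness": 0.9}, {"faithfulness": float("nan")}]

    def _nan_to_zero(v):
        return 0.0 if v is None or (isinstance(v, float) and math.isnan(v)) else v

    original = {m: [s.get(m) for s in scores] for m in scores[0]}
    cleaned = {m: [_nan_to_zero(s.get(m)) for s in scores] for m in scores[0]}

    print(original["faithfulness"])          # [0.9, nan]  -> reported per item
    print(sum(cleaned["faithfulness"]) / 2)  # 0.45        -> used for the average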

nat/eval/rag_evaluator/register.py  CHANGED

@@ -73,7 +73,7 @@ class RagasEvaluatorConfig(EvaluatorBaseConfig, name="ragas"):
         if isinstance(self.metric, str):
             return self.metric
         if isinstance(self.metric, dict) and self.metric:
-            return next(iter(self.metric.keys()))
+            return next(iter(self.metric.keys()))
         return ""
 
     @property
@@ -82,7 +82,7 @@ class RagasEvaluatorConfig(EvaluatorBaseConfig, name="ragas"):
         if isinstance(self.metric, str):
             return RagasMetricConfig() # Default config when only a metric name is given
         if isinstance(self.metric, dict) and self.metric:
-            return next(iter(self.metric.values()))
+            return next(iter(self.metric.values()))
         return RagasMetricConfig() # Default config when an invalid type is provided
 
 
@@ -104,7 +104,7 @@ async def register_ragas_evaluator(config: RagasEvaluatorConfig, builder: EvalBu
        raise ValueError(message) from e
    except AttributeError as e:
        message = f"Ragas metric {metric_name} not found {e}."
-        logger.
+        logger.exception(message)
        return None
 
    async def evaluate_fn(eval_input: EvalInput) -> EvalOutput:

nat/eval/register.py  CHANGED

@@ -14,10 +14,13 @@
 # limitations under the License.
 
 # flake8: noqa
-# pylint: disable=unused-import
 
 # Import evaluators which need to be automatically registered here
 from .rag_evaluator.register import register_ragas_evaluator
+from .runtime_evaluator.register import register_avg_llm_latency_evaluator
+from .runtime_evaluator.register import register_avg_num_llm_calls_evaluator
+from .runtime_evaluator.register import register_avg_tokens_per_llm_end_evaluator
+from .runtime_evaluator.register import register_avg_workflow_runtime_evaluator
 from .swe_bench_evaluator.register import register_swe_bench_evaluator
 from .trajectory_evaluator.register import register_trajectory_evaluator
 from .tunable_rag_evaluator.register import register_tunable_rag_evaluator

nat/eval/remote_workflow.py  CHANGED

@@ -74,7 +74,7 @@ class EvaluationRemoteWorkflowHandler:
                         if chunk_data.get("value"):
                             final_response = chunk_data.get("value")
                     except json.JSONDecodeError as e:
-                        logger.
+                        logger.exception("Failed to parse generate response chunk: %s", e)
                         continue
                 elif line.startswith(INTERMEDIATE_DATA_PREFIX):
                     # This is an intermediate step
@@ -90,12 +90,12 @@ class EvaluationRemoteWorkflowHandler:
                                                        payload=payload)
                         intermediate_steps.append(intermediate_step)
                     except (json.JSONDecodeError, ValidationError) as e:
-                        logger.
+                        logger.exception("Failed to parse intermediate step: %s", e)
                         continue
 
         except aiohttp.ClientError as e:
             # Handle connection or HTTP-related errors
-            logger.
+            logger.exception("Request failed for question %s: %s", question, e)
             item.output_obj = None
             item.trajectory = []
             return

nat/eval/runtime_evaluator/__init__.py  ADDED

@@ -0,0 +1,14 @@
+# SPDX-FileCopyrightText: Copyright (c) 2025, NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+# SPDX-License-Identifier: Apache-2.0
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.

nat/eval/runtime_evaluator/evaluate.py  ADDED

@@ -0,0 +1,123 @@
+# SPDX-FileCopyrightText: Copyright (c) 2025, NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+# SPDX-License-Identifier: Apache-2.0
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+from __future__ import annotations
+
+from collections import defaultdict
+from dataclasses import dataclass
+
+from nat.data_models.intermediate_step import IntermediateStepType
+from nat.eval.evaluator.base_evaluator import BaseEvaluator
+from nat.eval.evaluator.evaluator_model import EvalInputItem
+from nat.eval.evaluator.evaluator_model import EvalOutputItem
+from nat.profiler.intermediate_property_adapter import IntermediatePropertyAdaptor
+
+
+@dataclass
+class _CallTiming:
+    start_ts: float | None = None
+    end_ts: float | None = None
+
+    @property
+    def latency(self) -> float | None:
+        if self.start_ts is None or self.end_ts is None:
+            return None
+        return max(0.0, self.end_ts - self.start_ts)
+
+
+class AverageLLMLatencyEvaluator(BaseEvaluator):
+    """
+    Mean difference between connected LLM_START and LLM_END events (same UUID).
+    The score is the average latency in seconds for the item. Reasoning contains per-call latencies.
+    """
+
+    def __init__(self, max_concurrency: int = 8):
+        super().__init__(max_concurrency=max_concurrency, tqdm_desc="Evaluating Avg LLM Latency")
+
+    async def evaluate_item(self, item: EvalInputItem) -> EvalOutputItem: # noqa: D401
+        calls: dict[str, _CallTiming] = defaultdict(_CallTiming)
+
+        for step in (IntermediatePropertyAdaptor.from_intermediate_step(s) for s in item.trajectory):
+            if step.event_type == IntermediateStepType.LLM_START:
+                calls[step.UUID].start_ts = step.event_timestamp
+            elif step.event_type == IntermediateStepType.LLM_END:
+                calls[step.UUID].end_ts = step.event_timestamp
+
+        latencies = [ct.latency for ct in calls.values() if ct.latency is not None]
+        avg_latency = sum(latencies) / len(latencies) if latencies else 0.0
+
+        reasoning = {
+            "num_llm_calls": len(latencies),
+            "latencies": latencies,
+        }
+        return EvalOutputItem(id=item.id, score=round(avg_latency, 4), reasoning=reasoning)
+
+
+class AverageWorkflowRuntimeEvaluator(BaseEvaluator):
+    """
+    Average workflow runtime per item: max(event_timestamp) - min(event_timestamp) across the trajectory.
+    The score is the runtime in seconds for the item.
+    """
+
+    def __init__(self, max_concurrency: int = 8):
+        super().__init__(max_concurrency=max_concurrency, tqdm_desc="Evaluating Avg Workflow Runtime")
+
+    async def evaluate_item(self, item: EvalInputItem) -> EvalOutputItem: # noqa: D401
+        if not item.trajectory:
+            return EvalOutputItem(id=item.id, score=0.0, reasoning={"note": "no steps"})
+
+        timestamps = [s.event_timestamp for s in item.trajectory]
+        runtime = max(timestamps) - min(timestamps)
+        return EvalOutputItem(id=item.id, score=round(max(0.0, runtime), 4), reasoning={"steps": len(timestamps)})
+
+
+class AverageNumberOfLLMCallsEvaluator(BaseEvaluator):
+    """
+    Average number of LLM calls per item. The score is the count for the item.
+    """
+
+    def __init__(self, max_concurrency: int = 8):
+        super().__init__(max_concurrency=max_concurrency, tqdm_desc="Evaluating Avg # LLM Calls")
+
+    async def evaluate_item(self, item: EvalInputItem) -> EvalOutputItem: # noqa: D401
+        num_calls = sum(1 for s in item.trajectory if s.event_type == IntermediateStepType.LLM_END)
+        return EvalOutputItem(id=item.id, score=float(num_calls), reasoning={"num_llm_end": num_calls})
+
+
+class AverageTokensPerLLMEndEvaluator(BaseEvaluator):
+    """
+    Average total tokens per LLM_END event: sum of prompt and completion tokens if available.
+    The score is the average tokens per LLM_END for the item (0 if none).
+    """
+
+    def __init__(self, max_concurrency: int = 8):
+        super().__init__(max_concurrency=max_concurrency, tqdm_desc="Evaluating Avg Tokens/LLM_END")
+
+    async def evaluate_item(self, item: EvalInputItem) -> EvalOutputItem: # noqa: D401
+        totals: list[int] = []
+        for step in (IntermediatePropertyAdaptor.from_intermediate_step(s) for s in item.trajectory):
+            if step.event_type == IntermediateStepType.LLM_END:
+                total_tokens = step.token_usage.total_tokens
+                # If framework doesn't set total, compute from prompt+completion
+                if total_tokens == 0:
+                    total_tokens = step.token_usage.prompt_tokens + step.token_usage.completion_tokens
+                totals.append(total_tokens)
+
+        avg_tokens = (sum(totals) / len(totals)) if totals else 0.0
+        reasoning = {
+            "num_llm_end": len(totals),
+            "totals": totals,
+        }
+        return EvalOutputItem(id=item.id, score=round(avg_tokens, 2), reasoning=reasoning)