nvidia-nat 1.3.0a20250910__py3-none-any.whl → 1.4.0a20251112__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (213)
  1. nat/agent/base.py +13 -8
  2. nat/agent/prompt_optimizer/prompt.py +68 -0
  3. nat/agent/prompt_optimizer/register.py +149 -0
  4. nat/agent/react_agent/agent.py +6 -5
  5. nat/agent/react_agent/register.py +49 -39
  6. nat/agent/reasoning_agent/reasoning_agent.py +17 -15
  7. nat/agent/register.py +2 -0
  8. nat/agent/responses_api_agent/__init__.py +14 -0
  9. nat/agent/responses_api_agent/register.py +126 -0
  10. nat/agent/rewoo_agent/agent.py +304 -117
  11. nat/agent/rewoo_agent/prompt.py +19 -22
  12. nat/agent/rewoo_agent/register.py +51 -38
  13. nat/agent/tool_calling_agent/agent.py +75 -17
  14. nat/agent/tool_calling_agent/register.py +46 -23
  15. nat/authentication/api_key/api_key_auth_provider.py +6 -11
  16. nat/authentication/api_key/api_key_auth_provider_config.py +8 -5
  17. nat/authentication/credential_validator/__init__.py +14 -0
  18. nat/authentication/credential_validator/bearer_token_validator.py +557 -0
  19. nat/authentication/http_basic_auth/http_basic_auth_provider.py +1 -1
  20. nat/authentication/interfaces.py +5 -2
  21. nat/authentication/oauth2/oauth2_auth_code_flow_provider.py +69 -36
  22. nat/authentication/oauth2/oauth2_auth_code_flow_provider_config.py +2 -1
  23. nat/authentication/oauth2/oauth2_resource_server_config.py +125 -0
  24. nat/builder/builder.py +55 -23
  25. nat/builder/component_utils.py +9 -5
  26. nat/builder/context.py +54 -15
  27. nat/builder/eval_builder.py +14 -9
  28. nat/builder/framework_enum.py +1 -0
  29. nat/builder/front_end.py +1 -1
  30. nat/builder/function.py +370 -0
  31. nat/builder/function_info.py +1 -1
  32. nat/builder/intermediate_step_manager.py +38 -2
  33. nat/builder/workflow.py +5 -0
  34. nat/builder/workflow_builder.py +306 -54
  35. nat/cli/cli_utils/config_override.py +1 -1
  36. nat/cli/commands/info/info.py +16 -6
  37. nat/cli/commands/mcp/__init__.py +14 -0
  38. nat/cli/commands/mcp/mcp.py +986 -0
  39. nat/cli/commands/optimize.py +90 -0
  40. nat/cli/commands/start.py +1 -1
  41. nat/cli/commands/workflow/templates/config.yml.j2 +14 -13
  42. nat/cli/commands/workflow/templates/register.py.j2 +2 -2
  43. nat/cli/commands/workflow/templates/workflow.py.j2 +35 -21
  44. nat/cli/commands/workflow/workflow_commands.py +60 -18
  45. nat/cli/entrypoint.py +15 -11
  46. nat/cli/main.py +3 -0
  47. nat/cli/register_workflow.py +38 -4
  48. nat/cli/type_registry.py +72 -1
  49. nat/control_flow/__init__.py +0 -0
  50. nat/control_flow/register.py +20 -0
  51. nat/control_flow/router_agent/__init__.py +0 -0
  52. nat/control_flow/router_agent/agent.py +329 -0
  53. nat/control_flow/router_agent/prompt.py +48 -0
  54. nat/control_flow/router_agent/register.py +91 -0
  55. nat/control_flow/sequential_executor.py +166 -0
  56. nat/data_models/agent.py +34 -0
  57. nat/data_models/api_server.py +199 -69
  58. nat/data_models/authentication.py +23 -9
  59. nat/data_models/common.py +47 -0
  60. nat/data_models/component.py +2 -0
  61. nat/data_models/component_ref.py +11 -0
  62. nat/data_models/config.py +41 -17
  63. nat/data_models/dataset_handler.py +4 -3
  64. nat/data_models/function.py +34 -0
  65. nat/data_models/function_dependencies.py +8 -0
  66. nat/data_models/intermediate_step.py +9 -1
  67. nat/data_models/llm.py +15 -1
  68. nat/data_models/openai_mcp.py +46 -0
  69. nat/data_models/optimizable.py +208 -0
  70. nat/data_models/optimizer.py +161 -0
  71. nat/data_models/span.py +41 -3
  72. nat/data_models/thinking_mixin.py +2 -2
  73. nat/embedder/azure_openai_embedder.py +2 -1
  74. nat/embedder/nim_embedder.py +3 -2
  75. nat/embedder/openai_embedder.py +3 -2
  76. nat/eval/config.py +1 -1
  77. nat/eval/dataset_handler/dataset_downloader.py +3 -2
  78. nat/eval/dataset_handler/dataset_filter.py +34 -2
  79. nat/eval/evaluate.py +10 -3
  80. nat/eval/evaluator/base_evaluator.py +1 -1
  81. nat/eval/rag_evaluator/evaluate.py +7 -4
  82. nat/eval/register.py +4 -0
  83. nat/eval/runtime_evaluator/__init__.py +14 -0
  84. nat/eval/runtime_evaluator/evaluate.py +123 -0
  85. nat/eval/runtime_evaluator/register.py +100 -0
  86. nat/eval/swe_bench_evaluator/evaluate.py +1 -1
  87. nat/eval/trajectory_evaluator/register.py +1 -1
  88. nat/eval/tunable_rag_evaluator/evaluate.py +1 -1
  89. nat/eval/usage_stats.py +2 -0
  90. nat/eval/utils/output_uploader.py +3 -2
  91. nat/eval/utils/weave_eval.py +17 -3
  92. nat/experimental/decorators/experimental_warning_decorator.py +27 -7
  93. nat/experimental/test_time_compute/functions/execute_score_select_function.py +1 -1
  94. nat/experimental/test_time_compute/functions/plan_select_execute_function.py +7 -3
  95. nat/experimental/test_time_compute/functions/ttc_tool_orchestration_function.py +1 -1
  96. nat/experimental/test_time_compute/functions/ttc_tool_wrapper_function.py +3 -3
  97. nat/experimental/test_time_compute/models/strategy_base.py +2 -2
  98. nat/experimental/test_time_compute/selection/llm_based_output_merging_selector.py +1 -1
  99. nat/front_ends/console/authentication_flow_handler.py +82 -30
  100. nat/front_ends/console/console_front_end_plugin.py +19 -7
  101. nat/front_ends/fastapi/auth_flow_handlers/http_flow_handler.py +1 -1
  102. nat/front_ends/fastapi/auth_flow_handlers/websocket_flow_handler.py +52 -17
  103. nat/front_ends/fastapi/dask_client_mixin.py +65 -0
  104. nat/front_ends/fastapi/fastapi_front_end_config.py +25 -3
  105. nat/front_ends/fastapi/fastapi_front_end_plugin.py +140 -3
  106. nat/front_ends/fastapi/fastapi_front_end_plugin_worker.py +445 -265
  107. nat/front_ends/fastapi/job_store.py +518 -99
  108. nat/front_ends/fastapi/main.py +11 -19
  109. nat/front_ends/fastapi/message_handler.py +69 -44
  110. nat/front_ends/fastapi/message_validator.py +8 -7
  111. nat/front_ends/fastapi/utils.py +57 -0
  112. nat/front_ends/mcp/introspection_token_verifier.py +73 -0
  113. nat/front_ends/mcp/mcp_front_end_config.py +71 -3
  114. nat/front_ends/mcp/mcp_front_end_plugin.py +85 -21
  115. nat/front_ends/mcp/mcp_front_end_plugin_worker.py +248 -29
  116. nat/front_ends/mcp/memory_profiler.py +320 -0
  117. nat/front_ends/mcp/tool_converter.py +78 -25
  118. nat/front_ends/simple_base/simple_front_end_plugin_base.py +3 -1
  119. nat/llm/aws_bedrock_llm.py +21 -8
  120. nat/llm/azure_openai_llm.py +14 -5
  121. nat/llm/litellm_llm.py +80 -0
  122. nat/llm/nim_llm.py +23 -9
  123. nat/llm/openai_llm.py +19 -7
  124. nat/llm/register.py +4 -0
  125. nat/llm/utils/thinking.py +1 -1
  126. nat/observability/exporter/base_exporter.py +1 -1
  127. nat/observability/exporter/processing_exporter.py +29 -55
  128. nat/observability/exporter/span_exporter.py +43 -15
  129. nat/observability/exporter_manager.py +2 -2
  130. nat/observability/mixin/redaction_config_mixin.py +5 -4
  131. nat/observability/mixin/tagging_config_mixin.py +26 -14
  132. nat/observability/mixin/type_introspection_mixin.py +420 -107
  133. nat/observability/processor/batching_processor.py +1 -1
  134. nat/observability/processor/processor.py +3 -0
  135. nat/observability/processor/redaction/__init__.py +24 -0
  136. nat/observability/processor/redaction/contextual_redaction_processor.py +125 -0
  137. nat/observability/processor/redaction/contextual_span_redaction_processor.py +66 -0
  138. nat/observability/processor/redaction/redaction_processor.py +177 -0
  139. nat/observability/processor/redaction/span_header_redaction_processor.py +92 -0
  140. nat/observability/processor/span_tagging_processor.py +21 -14
  141. nat/observability/register.py +16 -0
  142. nat/profiler/callbacks/langchain_callback_handler.py +32 -7
  143. nat/profiler/callbacks/llama_index_callback_handler.py +36 -2
  144. nat/profiler/callbacks/token_usage_base_model.py +2 -0
  145. nat/profiler/decorators/framework_wrapper.py +61 -9
  146. nat/profiler/decorators/function_tracking.py +35 -3
  147. nat/profiler/forecasting/models/linear_model.py +1 -1
  148. nat/profiler/forecasting/models/random_forest_regressor.py +1 -1
  149. nat/profiler/inference_optimization/bottleneck_analysis/nested_stack_analysis.py +1 -1
  150. nat/profiler/inference_optimization/experimental/prefix_span_analysis.py +1 -1
  151. nat/profiler/parameter_optimization/__init__.py +0 -0
  152. nat/profiler/parameter_optimization/optimizable_utils.py +93 -0
  153. nat/profiler/parameter_optimization/optimizer_runtime.py +67 -0
  154. nat/profiler/parameter_optimization/parameter_optimizer.py +189 -0
  155. nat/profiler/parameter_optimization/parameter_selection.py +107 -0
  156. nat/profiler/parameter_optimization/pareto_visualizer.py +460 -0
  157. nat/profiler/parameter_optimization/prompt_optimizer.py +384 -0
  158. nat/profiler/parameter_optimization/update_helpers.py +66 -0
  159. nat/profiler/utils.py +3 -1
  160. nat/registry_handlers/pypi/register_pypi.py +5 -3
  161. nat/registry_handlers/rest/register_rest.py +5 -3
  162. nat/retriever/milvus/retriever.py +1 -1
  163. nat/retriever/nemo_retriever/register.py +2 -1
  164. nat/runtime/loader.py +1 -1
  165. nat/runtime/runner.py +111 -6
  166. nat/runtime/session.py +49 -3
  167. nat/settings/global_settings.py +2 -2
  168. nat/tool/chat_completion.py +4 -1
  169. nat/tool/code_execution/code_sandbox.py +3 -6
  170. nat/tool/code_execution/local_sandbox/Dockerfile.sandbox +19 -32
  171. nat/tool/code_execution/local_sandbox/local_sandbox_server.py +6 -1
  172. nat/tool/code_execution/local_sandbox/sandbox.requirements.txt +2 -0
  173. nat/tool/code_execution/local_sandbox/start_local_sandbox.sh +10 -4
  174. nat/tool/datetime_tools.py +1 -1
  175. nat/tool/github_tools.py +450 -0
  176. nat/tool/memory_tools/add_memory_tool.py +3 -3
  177. nat/tool/memory_tools/delete_memory_tool.py +3 -4
  178. nat/tool/memory_tools/get_memory_tool.py +4 -4
  179. nat/tool/register.py +2 -7
  180. nat/tool/server_tools.py +15 -2
  181. nat/utils/__init__.py +76 -0
  182. nat/utils/callable_utils.py +70 -0
  183. nat/utils/data_models/schema_validator.py +1 -1
  184. nat/utils/decorators.py +210 -0
  185. nat/utils/exception_handlers/automatic_retries.py +278 -72
  186. nat/utils/io/yaml_tools.py +73 -3
  187. nat/utils/log_levels.py +25 -0
  188. nat/utils/responses_api.py +26 -0
  189. nat/utils/string_utils.py +16 -0
  190. nat/utils/type_converter.py +12 -3
  191. nat/utils/type_utils.py +6 -2
  192. nvidia_nat-1.4.0a20251112.dist-info/METADATA +197 -0
  193. {nvidia_nat-1.3.0a20250910.dist-info → nvidia_nat-1.4.0a20251112.dist-info}/RECORD +199 -165
  194. {nvidia_nat-1.3.0a20250910.dist-info → nvidia_nat-1.4.0a20251112.dist-info}/entry_points.txt +1 -0
  195. nat/cli/commands/info/list_mcp.py +0 -461
  196. nat/data_models/temperature_mixin.py +0 -43
  197. nat/data_models/top_p_mixin.py +0 -43
  198. nat/observability/processor/header_redaction_processor.py +0 -123
  199. nat/observability/processor/redaction_processor.py +0 -77
  200. nat/tool/code_execution/test_code_execution_sandbox.py +0 -414
  201. nat/tool/github_tools/create_github_commit.py +0 -133
  202. nat/tool/github_tools/create_github_issue.py +0 -87
  203. nat/tool/github_tools/create_github_pr.py +0 -106
  204. nat/tool/github_tools/get_github_file.py +0 -106
  205. nat/tool/github_tools/get_github_issue.py +0 -166
  206. nat/tool/github_tools/get_github_pr.py +0 -256
  207. nat/tool/github_tools/update_github_issue.py +0 -100
  208. nvidia_nat-1.3.0a20250910.dist-info/METADATA +0 -373
  209. /nat/{tool/github_tools → agent/prompt_optimizer}/__init__.py +0 -0
  210. {nvidia_nat-1.3.0a20250910.dist-info → nvidia_nat-1.4.0a20251112.dist-info}/WHEEL +0 -0
  211. {nvidia_nat-1.3.0a20250910.dist-info → nvidia_nat-1.4.0a20251112.dist-info}/licenses/LICENSE-3rd-party.txt +0 -0
  212. {nvidia_nat-1.3.0a20250910.dist-info → nvidia_nat-1.4.0a20251112.dist-info}/licenses/LICENSE.md +0 -0
  213. {nvidia_nat-1.3.0a20250910.dist-info → nvidia_nat-1.4.0a20251112.dist-info}/top_level.txt +0 -0
@@ -0,0 +1,161 @@
1
+ # SPDX-FileCopyrightText: Copyright (c) 2021-2025, NVIDIA CORPORATION & AFFILIATES. All rights reserved.
2
+ # SPDX-License-Identifier: Apache-2.0
3
+ #
4
+ # Licensed under the Apache License, Version 2.0 (the "License");
5
+ # you may not use this file except in compliance with the License.
6
+ # You may obtain a copy of the License at
7
+ #
8
+ # http://www.apache.org/licenses/LICENSE-2.0
9
+ #
10
+ # Unless required by applicable law or agreed to in writing, software
11
+ # distributed under the License is distributed on an "AS IS" BASIS,
12
+ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13
+ # See the License for the specific language governing permissions and
14
+ # limitations under the License.
15
+
16
+ from enum import Enum
17
+ from pathlib import Path
18
+
19
+ from pydantic import BaseModel
20
+ from pydantic import Field
21
+
22
+
23
+ class OptimizerMetric(BaseModel):
24
+ """
25
+ Parameters used by the workflow optimizer to define a metric to optimize.
26
+ """
27
+ evaluator_name: str = Field(description="Name of the metric to optimize.")
28
+ direction: str = Field(description="Direction of the optimization. Can be 'maximize' or 'minimize'.")
29
+ weight: float = Field(description="Weight of the metric in the optimization process.", default=1.0)
30
+
31
+
32
+ class SamplerType(str, Enum):
33
+ BAYESIAN = "bayesian"
34
+ GRID = "grid"
35
+
36
+
37
+ class NumericOptimizationConfig(BaseModel):
38
+ """
39
+ Configuration for numeric/enum optimization (Optuna).
40
+ """
41
+ enabled: bool = Field(default=True, description="Enable numeric optimization")
42
+ n_trials: int = Field(description="Number of trials for numeric optimization.", default=20)
43
+ sampler: SamplerType | None = Field(
44
+ default=None,
45
+ description="Sampling strategy for numeric optimization. Options: None or 'bayesian' uses \
46
+ the Optuna default (TPE for single-objective, NSGA-II for multi-objective) or 'grid' performs \
47
+ exhaustive grid search over parameter combinations. Defaults to None.",
48
+ )
49
+
50
+
51
+ class PromptGAOptimizationConfig(BaseModel):
52
+ """
53
+ Configuration for prompt optimization using a Genetic Algorithm.
54
+ """
55
+ enabled: bool = Field(default=False, description="Enable GA-based prompt optimization")
56
+
57
+ # Prompt optimization function hooks
58
+ prompt_population_init_function: str | None = Field(
59
+ default=None,
60
+ description="Optional function name to initialize/mutate candidate prompts.",
61
+ )
62
+ prompt_recombination_function: str | None = Field(
63
+ default=None,
64
+ description="Optional function name to recombine two parent prompts into a child.",
65
+ )
66
+
67
+ # Genetic algorithm configuration
68
+ ga_population_size: int = Field(
69
+ description="Population size for genetic algorithm prompt optimization.",
70
+ default=24,
71
+ )
72
+ ga_generations: int = Field(
73
+ description="Number of generations to evolve in GA prompt optimization.",
74
+ default=15,
75
+ )
76
+ ga_offspring_size: int | None = Field(
77
+ description="Number of offspring to produce per generation. Defaults to population_size - elitism.",
78
+ default=None,
79
+ )
80
+ ga_crossover_rate: float = Field(
81
+ description="Probability of applying crossover during reproduction.",
82
+ default=0.8,
83
+ ge=0.0,
84
+ le=1.0,
85
+ )
86
+ ga_mutation_rate: float = Field(
87
+ description="Probability of mutating a child after crossover.",
88
+ default=0.3,
89
+ ge=0.0,
90
+ le=1.0,
91
+ )
92
+ ga_elitism: int = Field(
93
+ description="Number of top individuals carried over unchanged each generation.",
94
+ default=2,
95
+ )
96
+ ga_selection_method: str = Field(
97
+ description="Parent selection strategy: 'tournament' or 'roulette'.",
98
+ default="tournament",
99
+ )
100
+ ga_tournament_size: int = Field(
101
+ description="Tournament size when using tournament selection.",
102
+ default=3,
103
+ )
104
+ ga_parallel_evaluations: int = Field(
105
+ description="Max number of individuals to evaluate concurrently per generation.",
106
+ default=8,
107
+ )
108
+ ga_diversity_lambda: float = Field(
109
+ description="Strength of diversity penalty (0 disables). Penalizes identical/near-identical prompts.",
110
+ default=0.0,
111
+ ge=0.0,
112
+ )
113
+
114
+
115
+ class OptimizerConfig(BaseModel):
116
+ """
117
+ Parameters used by the workflow optimizer.
118
+ """
119
+ output_path: Path | None = Field(
120
+ default=None,
121
+ description="Path to the output directory where the results will be saved.",
122
+ )
123
+
124
+ eval_metrics: dict[str, OptimizerMetric] | None = Field(
125
+ description="List of evaluation metrics to optimize.",
126
+ default=None,
127
+ )
128
+
129
+ reps_per_param_set: int = Field(
130
+ default=3,
131
+ description="Number of repetitions per parameter set for the optimization.",
132
+ )
133
+
134
+ target: float | None = Field(
135
+ description=(
136
+ "Target value for the optimization. If set, the optimization will stop when this value is reached."),
137
+ default=None,
138
+ )
139
+
140
+ multi_objective_combination_mode: str = Field(
141
+ description="Method to combine multiple objectives into a single score.",
142
+ default="harmonic",
143
+ )
144
+
145
+ # Nested configs
146
+ numeric: NumericOptimizationConfig = NumericOptimizationConfig()
147
+ prompt: PromptGAOptimizationConfig = PromptGAOptimizationConfig()
148
+
149
+
150
+ class OptimizerRunConfig(BaseModel):
151
+ """
152
+ Parameters used for an Optimizer run
153
+ """
154
+ # Eval parameters
155
+
156
+ config_file: Path | BaseModel # allow for instantiated configs to be passed in
157
+ dataset: str | Path | None # dataset file path can be specified in the config file
158
+ result_json_path: str = "$"
159
+ endpoint: str | None = None # only used when running the workflow remotely
160
+ endpoint_timeout: int = 300
161
+ override: tuple[tuple[str, str], ...] = ()
nat/data_models/span.py CHANGED
@@ -128,10 +128,48 @@ class SpanStatus(BaseModel):
128
128
  message: str | None = Field(default=None, description="The status message of the span.")
129
129
 
130
130
 
131
+ def _generate_nonzero_trace_id() -> int:
132
+ """Generate a non-zero 128-bit trace ID."""
133
+ return uuid.uuid4().int
134
+
135
+
136
+ def _generate_nonzero_span_id() -> int:
137
+ """Generate a non-zero 64-bit span ID."""
138
+ return uuid.uuid4().int >> 64
139
+
140
+
131
141
  class SpanContext(BaseModel):
132
- trace_id: int = Field(default_factory=lambda: uuid.uuid4().int, description="The 128-bit trace ID of the span.")
133
- span_id: int = Field(default_factory=lambda: uuid.uuid4().int & ((1 << 64) - 1),
134
- description="The 64-bit span ID of the span.")
142
+ trace_id: int = Field(default_factory=_generate_nonzero_trace_id,
143
+ description="The OTel-syle 128-bit trace ID of the span.")
144
+ span_id: int = Field(default_factory=_generate_nonzero_span_id,
145
+ description="The OTel-syle 64-bit span ID of the span.")
146
+
147
+ @field_validator("trace_id", mode="before")
148
+ @classmethod
149
+ def _validate_trace_id(cls, v: int | str | None) -> int:
150
+ """Regenerate if trace_id is None; raise an exception if trace_id is invalid;"""
151
+ if isinstance(v, str):
152
+ v = uuid.UUID(v).int
153
+ if isinstance(v, type(None)):
154
+ v = _generate_nonzero_trace_id()
155
+ if v <= 0 or v >> 128:
156
+ raise ValueError(f"Invalid trace_id: must be a non-zero 128-bit integer, got {v}")
157
+ return v
158
+
159
+ @field_validator("span_id", mode="before")
160
+ @classmethod
161
+ def _validate_span_id(cls, v: int | str | None) -> int:
162
+ """Regenerate if span_id is None; raise an exception if span_id is invalid;"""
163
+ if isinstance(v, str):
164
+ try:
165
+ v = int(v, 16)
166
+ except ValueError:
167
+ raise ValueError(f"span_id unable to be parsed: {v}")
168
+ if isinstance(v, type(None)):
169
+ v = _generate_nonzero_span_id()
170
+ if v <= 0 or v >> 64:
171
+ raise ValueError(f"Invalid span_id: must be a non-zero 64-bit integer, got {v}")
172
+ return v
135
173
 
136
174
 
137
175
  class Span(BaseModel):
@@ -51,7 +51,7 @@ class ThinkingMixin(
51
51
  Returns the system prompt to use for thinking.
52
52
  For NVIDIA Nemotron, returns "/think" if enabled, else "/no_think".
53
53
  For Llama Nemotron v1.5, returns "/think" if enabled, else "/no_think".
54
- For Llama Nemotron v1.0, returns "detailed thinking on" if enabled, else "detailed thinking off".
54
+ For Llama Nemotron v1.0 or v1.1, returns "detailed thinking on" if enabled, else "detailed thinking off".
55
55
  If thinking is not supported on the model, returns None.
56
56
 
57
57
  Returns:
@@ -72,7 +72,7 @@ class ThinkingMixin(
72
72
  return "/think" if self.thinking else "/no_think"
73
73
 
74
74
  if model.startswith("nvidia/llama"):
75
- if "v1-0" in model or "v1-1" in model:
75
+ if "v1-0" in model or "v1-1" in model or model.endswith("v1"):
76
76
  return f"detailed thinking {'on' if self.thinking else 'off'}"
77
77
 
78
78
  if "v1-5" in model:
@@ -20,6 +20,7 @@ from pydantic import Field
20
20
  from nat.builder.builder import Builder
21
21
  from nat.builder.embedder import EmbedderProviderInfo
22
22
  from nat.cli.register_workflow import register_embedder_provider
23
+ from nat.data_models.common import OptionalSecretStr
23
24
  from nat.data_models.embedder import EmbedderBaseConfig
24
25
  from nat.data_models.retry_mixin import RetryMixin
25
26
 
@@ -29,7 +30,7 @@ class AzureOpenAIEmbedderModelConfig(EmbedderBaseConfig, RetryMixin, name="azure
29
30
 
30
31
  model_config = ConfigDict(protected_namespaces=(), extra="allow")
31
32
 
32
- api_key: str | None = Field(default=None, description="Azure OpenAI API key to interact with hosted model.")
33
+ api_key: OptionalSecretStr = Field(default=None, description="Azure OpenAI API key to interact with hosted model.")
33
34
  api_version: str = Field(default="2025-04-01-preview", description="Azure OpenAI API version.")
34
35
  azure_endpoint: str | None = Field(validation_alias=AliasChoices("azure_endpoint", "base_url"),
35
36
  serialization_alias="azure_endpoint",
@@ -23,6 +23,7 @@ from pydantic import Field
23
23
  from nat.builder.builder import Builder
24
24
  from nat.builder.embedder import EmbedderProviderInfo
25
25
  from nat.cli.register_workflow import register_embedder_provider
26
+ from nat.data_models.common import OptionalSecretStr
26
27
  from nat.data_models.embedder import EmbedderBaseConfig
27
28
  from nat.data_models.retry_mixin import RetryMixin
28
29
 
@@ -41,7 +42,7 @@ TruncationOption = typing.Annotated[str, AfterValidator(option_in_allowed_values
41
42
  class NIMEmbedderModelConfig(EmbedderBaseConfig, RetryMixin, name="nim"):
42
43
  """A NVIDIA Inference Microservice (NIM) embedder provider to be used with an embedder client."""
43
44
 
44
- api_key: str | None = Field(default=None, description="NVIDIA API key to interact with hosted NIM.")
45
+ api_key: OptionalSecretStr = Field(default=None, description="NVIDIA API key to interact with hosted NIM.")
45
46
  base_url: str | None = Field(default=None, description="Base url to the hosted NIM.")
46
47
  model_name: str = Field(validation_alias=AliasChoices("model_name", "model"),
47
48
  serialization_alias="model",
@@ -50,7 +51,7 @@ class NIMEmbedderModelConfig(EmbedderBaseConfig, RetryMixin, name="nim"):
50
51
  description=("The truncation strategy if the input on the "
51
52
  "server side if it's too large."))
52
53
 
53
- model_config = ConfigDict(protected_namespaces=())
54
+ model_config = ConfigDict(protected_namespaces=(), extra="allow")
54
55
 
55
56
 
56
57
  @register_embedder_provider(config_type=NIMEmbedderModelConfig)
@@ -20,6 +20,7 @@ from pydantic import Field
20
20
  from nat.builder.builder import Builder
21
21
  from nat.builder.embedder import EmbedderProviderInfo
22
22
  from nat.cli.register_workflow import register_embedder_provider
23
+ from nat.data_models.common import OptionalSecretStr
23
24
  from nat.data_models.embedder import EmbedderBaseConfig
24
25
  from nat.data_models.retry_mixin import RetryMixin
25
26
 
@@ -27,9 +28,9 @@ from nat.data_models.retry_mixin import RetryMixin
27
28
  class OpenAIEmbedderModelConfig(EmbedderBaseConfig, RetryMixin, name="openai"):
28
29
  """An OpenAI embedder provider to be used with an embedder client."""
29
30
 
30
- model_config = ConfigDict(protected_namespaces=())
31
+ model_config = ConfigDict(protected_namespaces=(), extra="allow")
31
32
 
32
- api_key: str | None = Field(default=None, description="OpenAI API key to interact with hosted model.")
33
+ api_key: OptionalSecretStr = Field(default=None, description="OpenAI API key to interact with hosted model.")
33
34
  base_url: str | None = Field(default=None, description="Base url to the hosted model.")
34
35
  model_name: str = Field(validation_alias=AliasChoices("model_name", "model"),
35
36
  serialization_alias="model",
nat/eval/config.py CHANGED
@@ -27,7 +27,7 @@ class EvaluationRunConfig(BaseModel):
27
27
  """
28
28
  Parameters used for a single evaluation run.
29
29
  """
30
- config_file: Path
30
+ config_file: Path | BaseModel
31
31
  dataset: str | None = None # dataset file path can be specified in the config file
32
32
  result_json_path: str = "$"
33
33
  skip_workflow: bool = False
@@ -19,6 +19,7 @@ import boto3
19
19
  import requests
20
20
  from botocore.exceptions import NoCredentialsError
21
21
 
22
+ from nat.data_models.common import get_secret_value
22
23
  from nat.data_models.dataset_handler import EvalDatasetConfig
23
24
 
24
25
  logger = logging.getLogger(__name__)
@@ -46,8 +47,8 @@ class DatasetDownloader:
46
47
  try:
47
48
  self._s3_client = boto3.client("s3",
48
49
  endpoint_url=self.s3_config.endpoint_url,
49
- aws_access_key_id=self.s3_config.access_key,
50
- aws_secret_access_key=self.s3_config.secret_key)
50
+ aws_access_key_id=get_secret_value(self.s3_config.access_key),
51
+ aws_secret_access_key=get_secret_value(self.s3_config.secret_key))
51
52
  except NoCredentialsError as e:
52
53
  logger.error("AWS credentials not available: %s", e)
53
54
  raise
@@ -13,6 +13,8 @@
13
13
  # See the License for the specific language governing permissions and
14
14
  # limitations under the License.
15
15
 
16
+ import fnmatch
17
+
16
18
  import pandas as pd
17
19
 
18
20
  from nat.data_models.dataset_handler import EvalFilterConfig
@@ -24,6 +26,7 @@ class DatasetFilter:
24
26
  - If an allowlist is provided, only keep rows matching the filter values.
25
27
  - If a denylist is provided, remove rows matching the filter values.
26
28
  - If the filter column does not exist in the DataFrame, the filtering is skipped for that column.
29
+ - Supports Unix shell-style wildcards (``*``, ``?``, ``[seq]``, ``[!seq]``) for string matching.
27
30
 
28
31
  This is a utility class that is dataset agnostic and can be used to filter any DataFrame based on the provided
29
32
  filter configuration.
@@ -33,6 +36,33 @@ class DatasetFilter:
33
36
 
34
37
  self.filter_config = filter_config
35
38
 
39
+ @staticmethod
40
+ def _match_wildcard_patterns(series: pd.Series, patterns: list[str | int | float]) -> pd.Series:
41
+ """
42
+ Match series values against wildcard patterns and exact values.
43
+
44
+ Args:
45
+ series (pd.Series): pandas Series to match against
46
+ patterns (list[str | int | float]): List of patterns/values
47
+
48
+ Returns:
49
+ pd.Series: Boolean Series indicating matches
50
+ """
51
+ # Convert series to string for pattern matching
52
+ str_series = series.astype(str)
53
+
54
+ # Initialize boolean mask
55
+ matches = pd.Series([False] * len(series), index=series.index)
56
+
57
+ # Check each pattern using fnmatch with list comprehension to avoid lambda capture
58
+ for pattern in patterns:
59
+ pattern_str = str(pattern)
60
+ pattern_matches = pd.Series([fnmatch.fnmatch(val, pattern_str) for val in str_series],
61
+ index=str_series.index)
62
+ matches |= pattern_matches
63
+
64
+ return matches
65
+
36
66
  def apply_filters(self, df) -> pd.DataFrame:
37
67
 
38
68
  filtered_df = df.copy()
@@ -41,12 +71,14 @@ class DatasetFilter:
41
71
  if self.filter_config.allowlist:
42
72
  for column, values in self.filter_config.allowlist.field.items():
43
73
  if column in filtered_df.columns:
44
- filtered_df = filtered_df[filtered_df[column].isin(values)]
74
+ matches = self._match_wildcard_patterns(filtered_df[column], values)
75
+ filtered_df = filtered_df[matches]
45
76
 
46
77
  # Apply denylist (remove specified rows)
47
78
  if self.filter_config.denylist:
48
79
  for column, values in self.filter_config.denylist.field.items():
49
80
  if column in filtered_df.columns:
50
- filtered_df = filtered_df[~filtered_df[column].isin(values)]
81
+ matches = self._match_wildcard_patterns(filtered_df[column], values)
82
+ filtered_df = filtered_df[~matches]
51
83
 
52
84
  return filtered_df
nat/eval/evaluate.py CHANGED
@@ -104,6 +104,8 @@ class EvaluationRun:
104
104
  usage_stats_per_llm[llm_name].prompt_tokens += step.token_usage.prompt_tokens
105
105
  usage_stats_per_llm[llm_name].completion_tokens += step.token_usage.completion_tokens
106
106
  usage_stats_per_llm[llm_name].total_tokens += step.token_usage.total_tokens
107
+ usage_stats_per_llm[llm_name].reasoning_tokens += step.token_usage.reasoning_tokens
108
+ usage_stats_per_llm[llm_name].cached_tokens += step.token_usage.cached_tokens
107
109
  total_tokens += step.token_usage.total_tokens
108
110
 
109
111
  # find min and max event timestamps
@@ -449,10 +451,14 @@ class EvaluationRun:
449
451
  from nat.runtime.loader import load_config
450
452
 
451
453
  # Load and override the config
452
- if self.config.override:
454
+ config = None
455
+ if isinstance(self.config.config_file, BaseModel):
456
+ config = self.config.config_file
457
+ elif self.config.override:
453
458
  config = self.apply_overrides()
454
459
  else:
455
460
  config = load_config(self.config.config_file)
461
+
456
462
  self.eval_config = config.eval
457
463
  workflow_alias = self._get_workflow_alias(config.workflow.type)
458
464
  logger.debug("Loaded %s evaluation configuration: %s", workflow_alias, self.eval_config)
@@ -508,7 +514,7 @@ class EvaluationRun:
508
514
  # Run workflow and evaluate
509
515
  async with WorkflowEvalBuilder.from_config(config=config) as eval_workflow:
510
516
  # Initialize Weave integration
511
- self.weave_eval.initialize_logger(workflow_alias, self.eval_input, config)
517
+ self.weave_eval.initialize_logger(workflow_alias, self.eval_input, config, job_id=job_id)
512
518
 
513
519
  with self.eval_trace_context.evaluation_context():
514
520
  # Run workflow
@@ -516,7 +522,8 @@ class EvaluationRun:
516
522
  await self.run_workflow_remote()
517
523
  elif not self.config.skip_workflow:
518
524
  if session_manager is None:
519
- session_manager = SessionManager(eval_workflow.build(),
525
+ workflow = await eval_workflow.build()
526
+ session_manager = SessionManager(workflow,
520
527
  max_concurrency=self.eval_config.general.max_concurrency)
521
528
  await self.run_workflow_local(session_manager)
522
529
 
@@ -71,7 +71,7 @@ class BaseEvaluator(ABC):
71
71
  TqdmPositionRegistry.release(tqdm_position)
72
72
 
73
73
  # Compute average if possible
74
- numeric_scores = [item.score for item in output_items if isinstance(item.score, (int, float))]
74
+ numeric_scores = [item.score for item in output_items if isinstance(item.score, int | float)]
75
75
  avg_score = round(sum(numeric_scores) / len(numeric_scores), 2) if numeric_scores else None
76
76
 
77
77
  return EvalOutput(average_score=avg_score, eval_output_items=output_items)
@@ -116,11 +116,14 @@ class RAGEvaluator:
116
116
  """Convert NaN or None to 0.0 for safe arithmetic/serialization."""
117
117
  return 0.0 if v is None or (isinstance(v, float) and math.isnan(v)) else v
118
118
 
119
- # Convert from list of dicts to dict of lists, coercing NaN/None to 0.0
119
+ # Keep original scores (preserving NaN/None) for output
120
+ original_scores_dict = {metric: [score.get(metric) for score in scores] for metric in scores[0]}
121
+
122
+ # Convert from list of dicts to dict of lists, coercing NaN/None to 0.0 for average calculation
120
123
  scores_dict = {metric: [_nan_to_zero(score.get(metric)) for score in scores] for metric in scores[0]}
121
124
  first_metric_name = list(scores_dict.keys())[0] if scores_dict else None
122
125
 
123
- # Compute the average of each metric, guarding against empty lists
126
+ # Compute the average of each metric using cleaned scores (NaN/None -> 0.0)
124
127
  average_scores = {
125
128
  metric: (sum(values) / len(values) if values else 0.0)
126
129
  for metric, values in scores_dict.items()
@@ -137,11 +140,11 @@ class RAGEvaluator:
137
140
  else:
138
141
  ids = df["user_input"].tolist() # Use "user_input" as ID fallback
139
142
 
140
- # Construct EvalOutputItem list
143
+ # Construct EvalOutputItem list using original scores (preserving NaN/None)
141
144
  eval_output_items = [
142
145
  EvalOutputItem(
143
146
  id=ids[i],
144
- score=_nan_to_zero(getattr(row, first_metric_name, 0.0) if first_metric_name else 0.0),
147
+ score=original_scores_dict[first_metric_name][i] if first_metric_name else None,
145
148
  reasoning={
146
149
  key:
147
150
  getattr(row, key, None) # Use getattr to safely access attributes
nat/eval/register.py CHANGED
@@ -17,6 +17,10 @@
17
17
 
18
18
  # Import evaluators which need to be automatically registered here
19
19
  from .rag_evaluator.register import register_ragas_evaluator
20
+ from .runtime_evaluator.register import register_avg_llm_latency_evaluator
21
+ from .runtime_evaluator.register import register_avg_num_llm_calls_evaluator
22
+ from .runtime_evaluator.register import register_avg_tokens_per_llm_end_evaluator
23
+ from .runtime_evaluator.register import register_avg_workflow_runtime_evaluator
20
24
  from .swe_bench_evaluator.register import register_swe_bench_evaluator
21
25
  from .trajectory_evaluator.register import register_trajectory_evaluator
22
26
  from .tunable_rag_evaluator.register import register_tunable_rag_evaluator
@@ -0,0 +1,14 @@
1
+ # SPDX-FileCopyrightText: Copyright (c) 2025, NVIDIA CORPORATION & AFFILIATES. All rights reserved.
2
+ # SPDX-License-Identifier: Apache-2.0
3
+ #
4
+ # Licensed under the Apache License, Version 2.0 (the "License");
5
+ # you may not use this file except in compliance with the License.
6
+ # You may obtain a copy of the License at
7
+ #
8
+ # http://www.apache.org/licenses/LICENSE-2.0
9
+ #
10
+ # Unless required by applicable law or agreed to in writing, software
11
+ # distributed under the License is distributed on an "AS IS" BASIS,
12
+ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13
+ # See the License for the specific language governing permissions and
14
+ # limitations under the License.
@@ -0,0 +1,123 @@
1
+ # SPDX-FileCopyrightText: Copyright (c) 2025, NVIDIA CORPORATION & AFFILIATES. All rights reserved.
2
+ # SPDX-License-Identifier: Apache-2.0
3
+ #
4
+ # Licensed under the Apache License, Version 2.0 (the "License");
5
+ # you may not use this file except in compliance with the License.
6
+ # You may obtain a copy of the License at
7
+ #
8
+ # http://www.apache.org/licenses/LICENSE-2.0
9
+ #
10
+ # Unless required by applicable law or agreed to in writing, software
11
+ # distributed under the License is distributed on an "AS IS" BASIS,
12
+ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13
+ # See the License for the specific language governing permissions and
14
+ # limitations under the License.
15
+
16
+ from __future__ import annotations
17
+
18
+ from collections import defaultdict
19
+ from dataclasses import dataclass
20
+
21
+ from nat.data_models.intermediate_step import IntermediateStepType
22
+ from nat.eval.evaluator.base_evaluator import BaseEvaluator
23
+ from nat.eval.evaluator.evaluator_model import EvalInputItem
24
+ from nat.eval.evaluator.evaluator_model import EvalOutputItem
25
+ from nat.profiler.intermediate_property_adapter import IntermediatePropertyAdaptor
26
+
27
+
28
+ @dataclass
29
+ class _CallTiming:
30
+ start_ts: float | None = None
31
+ end_ts: float | None = None
32
+
33
+ @property
34
+ def latency(self) -> float | None:
35
+ if self.start_ts is None or self.end_ts is None:
36
+ return None
37
+ return max(0.0, self.end_ts - self.start_ts)
38
+
39
+
40
class AverageLLMLatencyEvaluator(BaseEvaluator):
    """
    Score an item by the mean gap between matching LLM_START and LLM_END events.

    Events are matched by UUID. The score is the average latency in seconds for the item, and the
    reasoning lists the individual per-call latencies.
    """

    def __init__(self, max_concurrency: int = 8):
        super().__init__(max_concurrency=max_concurrency, tqdm_desc="Evaluating Avg LLM Latency")

    async def evaluate_item(self, item: EvalInputItem) -> EvalOutputItem:  # noqa: D401
        """Compute the average LLM call latency over the item's trajectory.

        Returns an ``EvalOutputItem`` whose score is the mean latency rounded to 4 decimals
        (0.0 when no call has both a start and an end timestamp).
        """
        timings: dict[str, _CallTiming] = defaultdict(_CallTiming)

        # Record the start/end timestamp of each call, keyed by the event UUID.
        for raw_step in item.trajectory:
            adapted = IntermediatePropertyAdaptor.from_intermediate_step(raw_step)
            kind = adapted.event_type
            if kind == IntermediateStepType.LLM_START:
                timings[adapted.UUID].start_ts = adapted.event_timestamp
            elif kind == IntermediateStepType.LLM_END:
                timings[adapted.UUID].end_ts = adapted.event_timestamp

        # Calls that are missing either endpoint have no latency and are left out.
        latencies = []
        for timing in timings.values():
            elapsed = timing.latency
            if elapsed is not None:
                latencies.append(elapsed)

        if latencies:
            avg_latency = sum(latencies) / len(latencies)
        else:
            avg_latency = 0.0

        return EvalOutputItem(
            id=item.id,
            score=round(avg_latency, 4),
            reasoning={
                "num_llm_calls": len(latencies),
                "latencies": latencies,
            },
        )
66
+
67
+
68
class AverageWorkflowRuntimeEvaluator(BaseEvaluator):
    """
    Score an item by its workflow runtime.

    The runtime is the span between the smallest and largest ``event_timestamp`` in the trajectory;
    the score is that runtime in seconds for the item.
    """

    def __init__(self, max_concurrency: int = 8):
        super().__init__(max_concurrency=max_concurrency, tqdm_desc="Evaluating Avg Workflow Runtime")

    async def evaluate_item(self, item: EvalInputItem) -> EvalOutputItem:  # noqa: D401
        """Return the trajectory's timestamp span rounded to 4 decimals (0.0 when there are no steps)."""
        steps = item.trajectory
        if not steps:
            return EvalOutputItem(id=item.id, score=0.0, reasoning={"note": "no steps"})

        timestamps = [step.event_timestamp for step in steps]
        earliest = min(timestamps)
        latest = max(timestamps)
        # Clamp at zero so the reported runtime is never negative.
        elapsed = max(0.0, latest - earliest)
        return EvalOutputItem(id=item.id, score=round(elapsed, 4), reasoning={"steps": len(timestamps)})
84
+
85
+
86
class AverageNumberOfLLMCallsEvaluator(BaseEvaluator):
    """
    Score an item by how many LLM calls it made, counted as LLM_END events in its trajectory.
    """

    def __init__(self, max_concurrency: int = 8):
        super().__init__(max_concurrency=max_concurrency, tqdm_desc="Evaluating Avg # LLM Calls")

    async def evaluate_item(self, item: EvalInputItem) -> EvalOutputItem:  # noqa: D401
        """Count the LLM_END events in the trajectory and report the count as a float score."""
        llm_end_count = 0
        for step in item.trajectory:
            if step.event_type == IntermediateStepType.LLM_END:
                llm_end_count += 1

        return EvalOutputItem(
            id=item.id,
            score=float(llm_end_count),
            reasoning={"num_llm_end": llm_end_count},
        )
97
+
98
+
99
class AverageTokensPerLLMEndEvaluator(BaseEvaluator):
    """
    Score an item by the mean total token count across its LLM_END events.

    When an event reports a total of zero, prompt plus completion tokens is used instead.
    The score is 0 when the trajectory has no LLM_END events.
    """

    def __init__(self, max_concurrency: int = 8):
        super().__init__(max_concurrency=max_concurrency, tqdm_desc="Evaluating Avg Tokens/LLM_END")

    async def evaluate_item(self, item: EvalInputItem) -> EvalOutputItem:  # noqa: D401
        """Average the per-event token totals, rounded to 2 decimals; reasoning lists each total."""
        totals: list[int] = []

        for raw_step in item.trajectory:
            adapted = IntermediatePropertyAdaptor.from_intermediate_step(raw_step)
            if adapted.event_type == IntermediateStepType.LLM_END:
                usage = adapted.token_usage
                token_count = usage.total_tokens
                if token_count == 0:
                    # The framework did not populate the total; derive it from its parts.
                    token_count = usage.prompt_tokens + usage.completion_tokens
                totals.append(token_count)

        if totals:
            avg_tokens = sum(totals) / len(totals)
        else:
            avg_tokens = 0.0

        return EvalOutputItem(
            id=item.id,
            score=round(avg_tokens, 2),
            reasoning={
                "num_llm_end": len(totals),
                "totals": totals,
            },
        )