nvidia-nat 1.2.0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (435) hide show
  1. aiq/__init__.py +66 -0
  2. nat/agent/__init__.py +0 -0
  3. nat/agent/base.py +256 -0
  4. nat/agent/dual_node.py +67 -0
  5. nat/agent/react_agent/__init__.py +0 -0
  6. nat/agent/react_agent/agent.py +363 -0
  7. nat/agent/react_agent/output_parser.py +104 -0
  8. nat/agent/react_agent/prompt.py +44 -0
  9. nat/agent/react_agent/register.py +149 -0
  10. nat/agent/reasoning_agent/__init__.py +0 -0
  11. nat/agent/reasoning_agent/reasoning_agent.py +225 -0
  12. nat/agent/register.py +23 -0
  13. nat/agent/rewoo_agent/__init__.py +0 -0
  14. nat/agent/rewoo_agent/agent.py +415 -0
  15. nat/agent/rewoo_agent/prompt.py +110 -0
  16. nat/agent/rewoo_agent/register.py +157 -0
  17. nat/agent/tool_calling_agent/__init__.py +0 -0
  18. nat/agent/tool_calling_agent/agent.py +119 -0
  19. nat/agent/tool_calling_agent/register.py +106 -0
  20. nat/authentication/__init__.py +14 -0
  21. nat/authentication/api_key/__init__.py +14 -0
  22. nat/authentication/api_key/api_key_auth_provider.py +96 -0
  23. nat/authentication/api_key/api_key_auth_provider_config.py +124 -0
  24. nat/authentication/api_key/register.py +26 -0
  25. nat/authentication/exceptions/__init__.py +14 -0
  26. nat/authentication/exceptions/api_key_exceptions.py +38 -0
  27. nat/authentication/http_basic_auth/__init__.py +0 -0
  28. nat/authentication/http_basic_auth/http_basic_auth_provider.py +81 -0
  29. nat/authentication/http_basic_auth/register.py +30 -0
  30. nat/authentication/interfaces.py +93 -0
  31. nat/authentication/oauth2/__init__.py +14 -0
  32. nat/authentication/oauth2/oauth2_auth_code_flow_provider.py +107 -0
  33. nat/authentication/oauth2/oauth2_auth_code_flow_provider_config.py +39 -0
  34. nat/authentication/oauth2/register.py +25 -0
  35. nat/authentication/register.py +21 -0
  36. nat/builder/__init__.py +0 -0
  37. nat/builder/builder.py +285 -0
  38. nat/builder/component_utils.py +316 -0
  39. nat/builder/context.py +270 -0
  40. nat/builder/embedder.py +24 -0
  41. nat/builder/eval_builder.py +161 -0
  42. nat/builder/evaluator.py +29 -0
  43. nat/builder/framework_enum.py +24 -0
  44. nat/builder/front_end.py +73 -0
  45. nat/builder/function.py +344 -0
  46. nat/builder/function_base.py +380 -0
  47. nat/builder/function_info.py +627 -0
  48. nat/builder/intermediate_step_manager.py +174 -0
  49. nat/builder/llm.py +25 -0
  50. nat/builder/retriever.py +25 -0
  51. nat/builder/user_interaction_manager.py +78 -0
  52. nat/builder/workflow.py +148 -0
  53. nat/builder/workflow_builder.py +1117 -0
  54. nat/cli/__init__.py +14 -0
  55. nat/cli/cli_utils/__init__.py +0 -0
  56. nat/cli/cli_utils/config_override.py +231 -0
  57. nat/cli/cli_utils/validation.py +37 -0
  58. nat/cli/commands/__init__.py +0 -0
  59. nat/cli/commands/configure/__init__.py +0 -0
  60. nat/cli/commands/configure/channel/__init__.py +0 -0
  61. nat/cli/commands/configure/channel/add.py +28 -0
  62. nat/cli/commands/configure/channel/channel.py +34 -0
  63. nat/cli/commands/configure/channel/remove.py +30 -0
  64. nat/cli/commands/configure/channel/update.py +30 -0
  65. nat/cli/commands/configure/configure.py +33 -0
  66. nat/cli/commands/evaluate.py +139 -0
  67. nat/cli/commands/info/__init__.py +14 -0
  68. nat/cli/commands/info/info.py +37 -0
  69. nat/cli/commands/info/list_channels.py +32 -0
  70. nat/cli/commands/info/list_components.py +129 -0
  71. nat/cli/commands/info/list_mcp.py +304 -0
  72. nat/cli/commands/registry/__init__.py +14 -0
  73. nat/cli/commands/registry/publish.py +88 -0
  74. nat/cli/commands/registry/pull.py +118 -0
  75. nat/cli/commands/registry/registry.py +36 -0
  76. nat/cli/commands/registry/remove.py +108 -0
  77. nat/cli/commands/registry/search.py +155 -0
  78. nat/cli/commands/sizing/__init__.py +14 -0
  79. nat/cli/commands/sizing/calc.py +297 -0
  80. nat/cli/commands/sizing/sizing.py +27 -0
  81. nat/cli/commands/start.py +246 -0
  82. nat/cli/commands/uninstall.py +81 -0
  83. nat/cli/commands/validate.py +47 -0
  84. nat/cli/commands/workflow/__init__.py +14 -0
  85. nat/cli/commands/workflow/templates/__init__.py.j2 +0 -0
  86. nat/cli/commands/workflow/templates/config.yml.j2 +16 -0
  87. nat/cli/commands/workflow/templates/pyproject.toml.j2 +22 -0
  88. nat/cli/commands/workflow/templates/register.py.j2 +5 -0
  89. nat/cli/commands/workflow/templates/workflow.py.j2 +36 -0
  90. nat/cli/commands/workflow/workflow.py +37 -0
  91. nat/cli/commands/workflow/workflow_commands.py +317 -0
  92. nat/cli/entrypoint.py +135 -0
  93. nat/cli/main.py +57 -0
  94. nat/cli/register_workflow.py +488 -0
  95. nat/cli/type_registry.py +1000 -0
  96. nat/data_models/__init__.py +14 -0
  97. nat/data_models/api_server.py +716 -0
  98. nat/data_models/authentication.py +231 -0
  99. nat/data_models/common.py +171 -0
  100. nat/data_models/component.py +58 -0
  101. nat/data_models/component_ref.py +168 -0
  102. nat/data_models/config.py +410 -0
  103. nat/data_models/dataset_handler.py +169 -0
  104. nat/data_models/discovery_metadata.py +305 -0
  105. nat/data_models/embedder.py +27 -0
  106. nat/data_models/evaluate.py +127 -0
  107. nat/data_models/evaluator.py +26 -0
  108. nat/data_models/front_end.py +26 -0
  109. nat/data_models/function.py +30 -0
  110. nat/data_models/function_dependencies.py +72 -0
  111. nat/data_models/interactive.py +246 -0
  112. nat/data_models/intermediate_step.py +302 -0
  113. nat/data_models/invocation_node.py +38 -0
  114. nat/data_models/llm.py +27 -0
  115. nat/data_models/logging.py +26 -0
  116. nat/data_models/memory.py +27 -0
  117. nat/data_models/object_store.py +44 -0
  118. nat/data_models/profiler.py +54 -0
  119. nat/data_models/registry_handler.py +26 -0
  120. nat/data_models/retriever.py +30 -0
  121. nat/data_models/retry_mixin.py +35 -0
  122. nat/data_models/span.py +190 -0
  123. nat/data_models/step_adaptor.py +64 -0
  124. nat/data_models/streaming.py +33 -0
  125. nat/data_models/swe_bench_model.py +54 -0
  126. nat/data_models/telemetry_exporter.py +26 -0
  127. nat/data_models/ttc_strategy.py +30 -0
  128. nat/embedder/__init__.py +0 -0
  129. nat/embedder/nim_embedder.py +59 -0
  130. nat/embedder/openai_embedder.py +43 -0
  131. nat/embedder/register.py +22 -0
  132. nat/eval/__init__.py +14 -0
  133. nat/eval/config.py +60 -0
  134. nat/eval/dataset_handler/__init__.py +0 -0
  135. nat/eval/dataset_handler/dataset_downloader.py +106 -0
  136. nat/eval/dataset_handler/dataset_filter.py +52 -0
  137. nat/eval/dataset_handler/dataset_handler.py +367 -0
  138. nat/eval/evaluate.py +510 -0
  139. nat/eval/evaluator/__init__.py +14 -0
  140. nat/eval/evaluator/base_evaluator.py +77 -0
  141. nat/eval/evaluator/evaluator_model.py +45 -0
  142. nat/eval/intermediate_step_adapter.py +99 -0
  143. nat/eval/rag_evaluator/__init__.py +0 -0
  144. nat/eval/rag_evaluator/evaluate.py +178 -0
  145. nat/eval/rag_evaluator/register.py +143 -0
  146. nat/eval/register.py +23 -0
  147. nat/eval/remote_workflow.py +133 -0
  148. nat/eval/runners/__init__.py +14 -0
  149. nat/eval/runners/config.py +39 -0
  150. nat/eval/runners/multi_eval_runner.py +54 -0
  151. nat/eval/runtime_event_subscriber.py +52 -0
  152. nat/eval/swe_bench_evaluator/__init__.py +0 -0
  153. nat/eval/swe_bench_evaluator/evaluate.py +215 -0
  154. nat/eval/swe_bench_evaluator/register.py +36 -0
  155. nat/eval/trajectory_evaluator/__init__.py +0 -0
  156. nat/eval/trajectory_evaluator/evaluate.py +75 -0
  157. nat/eval/trajectory_evaluator/register.py +40 -0
  158. nat/eval/tunable_rag_evaluator/__init__.py +0 -0
  159. nat/eval/tunable_rag_evaluator/evaluate.py +245 -0
  160. nat/eval/tunable_rag_evaluator/register.py +52 -0
  161. nat/eval/usage_stats.py +41 -0
  162. nat/eval/utils/__init__.py +0 -0
  163. nat/eval/utils/output_uploader.py +140 -0
  164. nat/eval/utils/tqdm_position_registry.py +40 -0
  165. nat/eval/utils/weave_eval.py +184 -0
  166. nat/experimental/__init__.py +0 -0
  167. nat/experimental/decorators/__init__.py +0 -0
  168. nat/experimental/decorators/experimental_warning_decorator.py +134 -0
  169. nat/experimental/test_time_compute/__init__.py +0 -0
  170. nat/experimental/test_time_compute/editing/__init__.py +0 -0
  171. nat/experimental/test_time_compute/editing/iterative_plan_refinement_editor.py +147 -0
  172. nat/experimental/test_time_compute/editing/llm_as_a_judge_editor.py +204 -0
  173. nat/experimental/test_time_compute/editing/motivation_aware_summarization.py +107 -0
  174. nat/experimental/test_time_compute/functions/__init__.py +0 -0
  175. nat/experimental/test_time_compute/functions/execute_score_select_function.py +105 -0
  176. nat/experimental/test_time_compute/functions/plan_select_execute_function.py +224 -0
  177. nat/experimental/test_time_compute/functions/ttc_tool_orchestration_function.py +205 -0
  178. nat/experimental/test_time_compute/functions/ttc_tool_wrapper_function.py +146 -0
  179. nat/experimental/test_time_compute/models/__init__.py +0 -0
  180. nat/experimental/test_time_compute/models/editor_config.py +132 -0
  181. nat/experimental/test_time_compute/models/scoring_config.py +112 -0
  182. nat/experimental/test_time_compute/models/search_config.py +120 -0
  183. nat/experimental/test_time_compute/models/selection_config.py +154 -0
  184. nat/experimental/test_time_compute/models/stage_enums.py +43 -0
  185. nat/experimental/test_time_compute/models/strategy_base.py +66 -0
  186. nat/experimental/test_time_compute/models/tool_use_config.py +41 -0
  187. nat/experimental/test_time_compute/models/ttc_item.py +48 -0
  188. nat/experimental/test_time_compute/register.py +36 -0
  189. nat/experimental/test_time_compute/scoring/__init__.py +0 -0
  190. nat/experimental/test_time_compute/scoring/llm_based_agent_scorer.py +168 -0
  191. nat/experimental/test_time_compute/scoring/llm_based_plan_scorer.py +168 -0
  192. nat/experimental/test_time_compute/scoring/motivation_aware_scorer.py +111 -0
  193. nat/experimental/test_time_compute/search/__init__.py +0 -0
  194. nat/experimental/test_time_compute/search/multi_llm_planner.py +128 -0
  195. nat/experimental/test_time_compute/search/multi_query_retrieval_search.py +122 -0
  196. nat/experimental/test_time_compute/search/single_shot_multi_plan_planner.py +128 -0
  197. nat/experimental/test_time_compute/selection/__init__.py +0 -0
  198. nat/experimental/test_time_compute/selection/best_of_n_selector.py +63 -0
  199. nat/experimental/test_time_compute/selection/llm_based_agent_output_selector.py +131 -0
  200. nat/experimental/test_time_compute/selection/llm_based_output_merging_selector.py +159 -0
  201. nat/experimental/test_time_compute/selection/llm_based_plan_selector.py +128 -0
  202. nat/experimental/test_time_compute/selection/threshold_selector.py +58 -0
  203. nat/front_ends/__init__.py +14 -0
  204. nat/front_ends/console/__init__.py +14 -0
  205. nat/front_ends/console/authentication_flow_handler.py +233 -0
  206. nat/front_ends/console/console_front_end_config.py +32 -0
  207. nat/front_ends/console/console_front_end_plugin.py +96 -0
  208. nat/front_ends/console/register.py +25 -0
  209. nat/front_ends/cron/__init__.py +14 -0
  210. nat/front_ends/fastapi/__init__.py +14 -0
  211. nat/front_ends/fastapi/auth_flow_handlers/__init__.py +0 -0
  212. nat/front_ends/fastapi/auth_flow_handlers/http_flow_handler.py +27 -0
  213. nat/front_ends/fastapi/auth_flow_handlers/websocket_flow_handler.py +107 -0
  214. nat/front_ends/fastapi/fastapi_front_end_config.py +241 -0
  215. nat/front_ends/fastapi/fastapi_front_end_controller.py +68 -0
  216. nat/front_ends/fastapi/fastapi_front_end_plugin.py +116 -0
  217. nat/front_ends/fastapi/fastapi_front_end_plugin_worker.py +1087 -0
  218. nat/front_ends/fastapi/html_snippets/__init__.py +14 -0
  219. nat/front_ends/fastapi/html_snippets/auth_code_grant_success.py +35 -0
  220. nat/front_ends/fastapi/intermediate_steps_subscriber.py +80 -0
  221. nat/front_ends/fastapi/job_store.py +183 -0
  222. nat/front_ends/fastapi/main.py +72 -0
  223. nat/front_ends/fastapi/message_handler.py +320 -0
  224. nat/front_ends/fastapi/message_validator.py +352 -0
  225. nat/front_ends/fastapi/register.py +25 -0
  226. nat/front_ends/fastapi/response_helpers.py +195 -0
  227. nat/front_ends/fastapi/step_adaptor.py +319 -0
  228. nat/front_ends/mcp/__init__.py +14 -0
  229. nat/front_ends/mcp/mcp_front_end_config.py +36 -0
  230. nat/front_ends/mcp/mcp_front_end_plugin.py +81 -0
  231. nat/front_ends/mcp/mcp_front_end_plugin_worker.py +143 -0
  232. nat/front_ends/mcp/register.py +27 -0
  233. nat/front_ends/mcp/tool_converter.py +241 -0
  234. nat/front_ends/register.py +22 -0
  235. nat/front_ends/simple_base/__init__.py +14 -0
  236. nat/front_ends/simple_base/simple_front_end_plugin_base.py +54 -0
  237. nat/llm/__init__.py +0 -0
  238. nat/llm/aws_bedrock_llm.py +57 -0
  239. nat/llm/nim_llm.py +46 -0
  240. nat/llm/openai_llm.py +46 -0
  241. nat/llm/register.py +23 -0
  242. nat/llm/utils/__init__.py +14 -0
  243. nat/llm/utils/env_config_value.py +94 -0
  244. nat/llm/utils/error.py +17 -0
  245. nat/memory/__init__.py +20 -0
  246. nat/memory/interfaces.py +183 -0
  247. nat/memory/models.py +112 -0
  248. nat/meta/pypi.md +58 -0
  249. nat/object_store/__init__.py +20 -0
  250. nat/object_store/in_memory_object_store.py +76 -0
  251. nat/object_store/interfaces.py +84 -0
  252. nat/object_store/models.py +38 -0
  253. nat/object_store/register.py +20 -0
  254. nat/observability/__init__.py +14 -0
  255. nat/observability/exporter/__init__.py +14 -0
  256. nat/observability/exporter/base_exporter.py +449 -0
  257. nat/observability/exporter/exporter.py +78 -0
  258. nat/observability/exporter/file_exporter.py +33 -0
  259. nat/observability/exporter/processing_exporter.py +322 -0
  260. nat/observability/exporter/raw_exporter.py +52 -0
  261. nat/observability/exporter/span_exporter.py +288 -0
  262. nat/observability/exporter_manager.py +335 -0
  263. nat/observability/mixin/__init__.py +14 -0
  264. nat/observability/mixin/batch_config_mixin.py +26 -0
  265. nat/observability/mixin/collector_config_mixin.py +23 -0
  266. nat/observability/mixin/file_mixin.py +288 -0
  267. nat/observability/mixin/file_mode.py +23 -0
  268. nat/observability/mixin/resource_conflict_mixin.py +134 -0
  269. nat/observability/mixin/serialize_mixin.py +61 -0
  270. nat/observability/mixin/type_introspection_mixin.py +183 -0
  271. nat/observability/processor/__init__.py +14 -0
  272. nat/observability/processor/batching_processor.py +310 -0
  273. nat/observability/processor/callback_processor.py +42 -0
  274. nat/observability/processor/intermediate_step_serializer.py +28 -0
  275. nat/observability/processor/processor.py +71 -0
  276. nat/observability/register.py +96 -0
  277. nat/observability/utils/__init__.py +14 -0
  278. nat/observability/utils/dict_utils.py +236 -0
  279. nat/observability/utils/time_utils.py +31 -0
  280. nat/plugins/.namespace +1 -0
  281. nat/profiler/__init__.py +0 -0
  282. nat/profiler/calc/__init__.py +14 -0
  283. nat/profiler/calc/calc_runner.py +627 -0
  284. nat/profiler/calc/calculations.py +288 -0
  285. nat/profiler/calc/data_models.py +188 -0
  286. nat/profiler/calc/plot.py +345 -0
  287. nat/profiler/callbacks/__init__.py +0 -0
  288. nat/profiler/callbacks/agno_callback_handler.py +295 -0
  289. nat/profiler/callbacks/base_callback_class.py +20 -0
  290. nat/profiler/callbacks/langchain_callback_handler.py +290 -0
  291. nat/profiler/callbacks/llama_index_callback_handler.py +205 -0
  292. nat/profiler/callbacks/semantic_kernel_callback_handler.py +238 -0
  293. nat/profiler/callbacks/token_usage_base_model.py +27 -0
  294. nat/profiler/data_frame_row.py +51 -0
  295. nat/profiler/data_models.py +24 -0
  296. nat/profiler/decorators/__init__.py +0 -0
  297. nat/profiler/decorators/framework_wrapper.py +131 -0
  298. nat/profiler/decorators/function_tracking.py +254 -0
  299. nat/profiler/forecasting/__init__.py +0 -0
  300. nat/profiler/forecasting/config.py +18 -0
  301. nat/profiler/forecasting/model_trainer.py +75 -0
  302. nat/profiler/forecasting/models/__init__.py +22 -0
  303. nat/profiler/forecasting/models/forecasting_base_model.py +40 -0
  304. nat/profiler/forecasting/models/linear_model.py +197 -0
  305. nat/profiler/forecasting/models/random_forest_regressor.py +269 -0
  306. nat/profiler/inference_metrics_model.py +28 -0
  307. nat/profiler/inference_optimization/__init__.py +0 -0
  308. nat/profiler/inference_optimization/bottleneck_analysis/__init__.py +0 -0
  309. nat/profiler/inference_optimization/bottleneck_analysis/nested_stack_analysis.py +460 -0
  310. nat/profiler/inference_optimization/bottleneck_analysis/simple_stack_analysis.py +258 -0
  311. nat/profiler/inference_optimization/data_models.py +386 -0
  312. nat/profiler/inference_optimization/experimental/__init__.py +0 -0
  313. nat/profiler/inference_optimization/experimental/concurrency_spike_analysis.py +468 -0
  314. nat/profiler/inference_optimization/experimental/prefix_span_analysis.py +405 -0
  315. nat/profiler/inference_optimization/llm_metrics.py +212 -0
  316. nat/profiler/inference_optimization/prompt_caching.py +163 -0
  317. nat/profiler/inference_optimization/token_uniqueness.py +107 -0
  318. nat/profiler/inference_optimization/workflow_runtimes.py +72 -0
  319. nat/profiler/intermediate_property_adapter.py +102 -0
  320. nat/profiler/profile_runner.py +473 -0
  321. nat/profiler/utils.py +184 -0
  322. nat/registry_handlers/__init__.py +0 -0
  323. nat/registry_handlers/local/__init__.py +0 -0
  324. nat/registry_handlers/local/local_handler.py +176 -0
  325. nat/registry_handlers/local/register_local.py +37 -0
  326. nat/registry_handlers/metadata_factory.py +60 -0
  327. nat/registry_handlers/package_utils.py +571 -0
  328. nat/registry_handlers/pypi/__init__.py +0 -0
  329. nat/registry_handlers/pypi/pypi_handler.py +251 -0
  330. nat/registry_handlers/pypi/register_pypi.py +40 -0
  331. nat/registry_handlers/register.py +21 -0
  332. nat/registry_handlers/registry_handler_base.py +157 -0
  333. nat/registry_handlers/rest/__init__.py +0 -0
  334. nat/registry_handlers/rest/register_rest.py +56 -0
  335. nat/registry_handlers/rest/rest_handler.py +237 -0
  336. nat/registry_handlers/schemas/__init__.py +0 -0
  337. nat/registry_handlers/schemas/headers.py +42 -0
  338. nat/registry_handlers/schemas/package.py +68 -0
  339. nat/registry_handlers/schemas/publish.py +68 -0
  340. nat/registry_handlers/schemas/pull.py +82 -0
  341. nat/registry_handlers/schemas/remove.py +36 -0
  342. nat/registry_handlers/schemas/search.py +91 -0
  343. nat/registry_handlers/schemas/status.py +47 -0
  344. nat/retriever/__init__.py +0 -0
  345. nat/retriever/interface.py +41 -0
  346. nat/retriever/milvus/__init__.py +14 -0
  347. nat/retriever/milvus/register.py +81 -0
  348. nat/retriever/milvus/retriever.py +228 -0
  349. nat/retriever/models.py +77 -0
  350. nat/retriever/nemo_retriever/__init__.py +14 -0
  351. nat/retriever/nemo_retriever/register.py +60 -0
  352. nat/retriever/nemo_retriever/retriever.py +190 -0
  353. nat/retriever/register.py +22 -0
  354. nat/runtime/__init__.py +14 -0
  355. nat/runtime/loader.py +220 -0
  356. nat/runtime/runner.py +195 -0
  357. nat/runtime/session.py +162 -0
  358. nat/runtime/user_metadata.py +130 -0
  359. nat/settings/__init__.py +0 -0
  360. nat/settings/global_settings.py +318 -0
  361. nat/test/.namespace +1 -0
  362. nat/tool/__init__.py +0 -0
  363. nat/tool/chat_completion.py +74 -0
  364. nat/tool/code_execution/README.md +151 -0
  365. nat/tool/code_execution/__init__.py +0 -0
  366. nat/tool/code_execution/code_sandbox.py +267 -0
  367. nat/tool/code_execution/local_sandbox/.gitignore +1 -0
  368. nat/tool/code_execution/local_sandbox/Dockerfile.sandbox +60 -0
  369. nat/tool/code_execution/local_sandbox/__init__.py +13 -0
  370. nat/tool/code_execution/local_sandbox/local_sandbox_server.py +198 -0
  371. nat/tool/code_execution/local_sandbox/sandbox.requirements.txt +6 -0
  372. nat/tool/code_execution/local_sandbox/start_local_sandbox.sh +50 -0
  373. nat/tool/code_execution/register.py +74 -0
  374. nat/tool/code_execution/test_code_execution_sandbox.py +414 -0
  375. nat/tool/code_execution/utils.py +100 -0
  376. nat/tool/datetime_tools.py +42 -0
  377. nat/tool/document_search.py +141 -0
  378. nat/tool/github_tools/__init__.py +0 -0
  379. nat/tool/github_tools/create_github_commit.py +133 -0
  380. nat/tool/github_tools/create_github_issue.py +87 -0
  381. nat/tool/github_tools/create_github_pr.py +106 -0
  382. nat/tool/github_tools/get_github_file.py +106 -0
  383. nat/tool/github_tools/get_github_issue.py +166 -0
  384. nat/tool/github_tools/get_github_pr.py +256 -0
  385. nat/tool/github_tools/update_github_issue.py +100 -0
  386. nat/tool/mcp/__init__.py +14 -0
  387. nat/tool/mcp/exceptions.py +142 -0
  388. nat/tool/mcp/mcp_client.py +255 -0
  389. nat/tool/mcp/mcp_tool.py +96 -0
  390. nat/tool/memory_tools/__init__.py +0 -0
  391. nat/tool/memory_tools/add_memory_tool.py +79 -0
  392. nat/tool/memory_tools/delete_memory_tool.py +67 -0
  393. nat/tool/memory_tools/get_memory_tool.py +72 -0
  394. nat/tool/nvidia_rag.py +95 -0
  395. nat/tool/register.py +38 -0
  396. nat/tool/retriever.py +94 -0
  397. nat/tool/server_tools.py +66 -0
  398. nat/utils/__init__.py +0 -0
  399. nat/utils/data_models/__init__.py +0 -0
  400. nat/utils/data_models/schema_validator.py +58 -0
  401. nat/utils/debugging_utils.py +43 -0
  402. nat/utils/dump_distro_mapping.py +32 -0
  403. nat/utils/exception_handlers/__init__.py +0 -0
  404. nat/utils/exception_handlers/automatic_retries.py +289 -0
  405. nat/utils/exception_handlers/mcp.py +211 -0
  406. nat/utils/exception_handlers/schemas.py +114 -0
  407. nat/utils/io/__init__.py +0 -0
  408. nat/utils/io/model_processing.py +28 -0
  409. nat/utils/io/yaml_tools.py +119 -0
  410. nat/utils/log_utils.py +37 -0
  411. nat/utils/metadata_utils.py +74 -0
  412. nat/utils/optional_imports.py +142 -0
  413. nat/utils/producer_consumer_queue.py +178 -0
  414. nat/utils/reactive/__init__.py +0 -0
  415. nat/utils/reactive/base/__init__.py +0 -0
  416. nat/utils/reactive/base/observable_base.py +65 -0
  417. nat/utils/reactive/base/observer_base.py +55 -0
  418. nat/utils/reactive/base/subject_base.py +79 -0
  419. nat/utils/reactive/observable.py +59 -0
  420. nat/utils/reactive/observer.py +76 -0
  421. nat/utils/reactive/subject.py +131 -0
  422. nat/utils/reactive/subscription.py +49 -0
  423. nat/utils/settings/__init__.py +0 -0
  424. nat/utils/settings/global_settings.py +197 -0
  425. nat/utils/string_utils.py +38 -0
  426. nat/utils/type_converter.py +290 -0
  427. nat/utils/type_utils.py +484 -0
  428. nat/utils/url_utils.py +27 -0
  429. nvidia_nat-1.2.0.dist-info/METADATA +365 -0
  430. nvidia_nat-1.2.0.dist-info/RECORD +435 -0
  431. nvidia_nat-1.2.0.dist-info/WHEEL +5 -0
  432. nvidia_nat-1.2.0.dist-info/entry_points.txt +21 -0
  433. nvidia_nat-1.2.0.dist-info/licenses/LICENSE-3rd-party.txt +5478 -0
  434. nvidia_nat-1.2.0.dist-info/licenses/LICENSE.md +201 -0
  435. nvidia_nat-1.2.0.dist-info/top_level.txt +2 -0
nat/eval/evaluate.py ADDED
@@ -0,0 +1,510 @@
1
+ # SPDX-FileCopyrightText: Copyright (c) 2024-2025, NVIDIA CORPORATION & AFFILIATES. All rights reserved.
2
+ # SPDX-License-Identifier: Apache-2.0
3
+ #
4
+ # Licensed under the Apache License, Version 2.0 (the "License");
5
+ # you may not use this file except in compliance with the License.
6
+ # You may obtain a copy of the License at
7
+ #
8
+ # http://www.apache.org/licenses/LICENSE-2.0
9
+ #
10
+ # Unless required by applicable law or agreed to in writing, software
11
+ # distributed under the License is distributed on an "AS IS" BASIS,
12
+ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13
+ # See the License for the specific language governing permissions and
14
+ # limitations under the License.
15
+
16
+ import asyncio
17
+ import logging
18
+ import shutil
19
+ from pathlib import Path
20
+ from typing import Any
21
+ from uuid import uuid4
22
+
23
+ from pydantic import BaseModel
24
+ from tqdm import tqdm
25
+
26
+ from nat.data_models.evaluate import EvalConfig
27
+ from nat.data_models.evaluate import JobEvictionPolicy
28
+ from nat.eval.config import EvaluationRunConfig
29
+ from nat.eval.config import EvaluationRunOutput
30
+ from nat.eval.dataset_handler.dataset_handler import DatasetHandler
31
+ from nat.eval.evaluator.evaluator_model import EvalInput
32
+ from nat.eval.evaluator.evaluator_model import EvalInputItem
33
+ from nat.eval.evaluator.evaluator_model import EvalOutput
34
+ from nat.eval.usage_stats import UsageStats
35
+ from nat.eval.usage_stats import UsageStatsItem
36
+ from nat.eval.usage_stats import UsageStatsLLM
37
+ from nat.eval.utils.output_uploader import OutputUploader
38
+ from nat.eval.utils.weave_eval import WeaveEvaluationIntegration
39
+ from nat.profiler.data_models import ProfilerResults
40
+ from nat.runtime.session import SessionManager
41
+
42
+ logger = logging.getLogger(__name__)
43
+
44
+
45
+ class EvaluationRun: # pylint: disable=too-many-public-methods
46
+ """
47
+ Instantiated for each evaluation run and used to store data for that single run.
48
+
49
+ .. warning::
50
+ **Experimental Feature**: The Evaluation API is experimental and may change in future releases.
51
+ Future versions may introduce breaking changes without notice.
52
+ """
53
+
54
+ def __init__(self, config: EvaluationRunConfig):
55
+ """
56
+ Initialize an EvaluationRun with configuration.
57
+ """
58
+ from nat.eval.intermediate_step_adapter import IntermediateStepAdapter
59
+
60
+ # Run-specific configuration
61
+ self.config: EvaluationRunConfig = config
62
+ self.eval_config: EvalConfig | None = None
63
+
64
+ # Helpers
65
+ self.intermediate_step_adapter: IntermediateStepAdapter = IntermediateStepAdapter()
66
+ self.weave_eval: WeaveEvaluationIntegration = WeaveEvaluationIntegration()
67
+ # Metadata
68
+ self.eval_input: EvalInput | None = None
69
+ self.workflow_interrupted: bool = False
70
+
71
+ # evaluation_results is list of tuples (evaluator_name, EvalOutput)
72
+ self.evaluation_results: list[tuple[str, EvalOutput]] = []
73
+
74
+ # usage stats
75
+ self.usage_stats: UsageStats = UsageStats()
76
+
77
+ # workflow output file
78
+ self.workflow_output_file: Path | None = None
79
+
80
+ # evaluation output files
81
+ self.evaluator_output_files: list[Path] = []
82
+
83
+ def _compute_usage_stats(self, item: EvalInputItem):
84
+ """Compute usage stats for a single item using the intermediate steps"""
85
+ # get the prompt and completion tokens from the intermediate steps
86
+ from nat.profiler.intermediate_property_adapter import IntermediatePropertyAdaptor
87
+ steps = [IntermediatePropertyAdaptor.from_intermediate_step(step) for step in item.trajectory]
88
+ usage_stats_per_llm = {}
89
+ total_tokens = 0
90
+ for step in steps:
91
+ if step.event_type == "LLM_END":
92
+ llm_name = step.llm_name
93
+ if llm_name not in usage_stats_per_llm:
94
+ usage_stats_per_llm[llm_name] = UsageStatsLLM()
95
+ usage_stats_per_llm[llm_name].prompt_tokens += step.token_usage.prompt_tokens
96
+ usage_stats_per_llm[llm_name].completion_tokens += step.token_usage.completion_tokens
97
+ usage_stats_per_llm[llm_name].total_tokens += step.token_usage.total_tokens
98
+ total_tokens += step.token_usage.total_tokens
99
+
100
+ # find min and max event timestamps
101
+ if item.trajectory:
102
+ min_timestamp = min(step.event_timestamp for step in item.trajectory)
103
+ max_timestamp = max(step.event_timestamp for step in item.trajectory)
104
+ runtime = max_timestamp - min_timestamp
105
+ else:
106
+ min_timestamp = 0.0
107
+ max_timestamp = 0.0
108
+ runtime = 0.0
109
+
110
+ # find llm latency by calculating p95 of all llm calls
111
+ llm_latencies = []
112
+ previous_llm_start_time = None
113
+ for step in steps:
114
+ if step.event_type == "LLM_START":
115
+ previous_llm_start_time = step.event_timestamp
116
+ elif step.event_type == "LLM_END" and previous_llm_start_time is not None:
117
+ llm_latencies.append(step.event_timestamp - previous_llm_start_time)
118
+ previous_llm_start_time = None
119
+
120
+ # Calculate p95 LLM latency (or 0 if no LLM calls)
121
+ if llm_latencies:
122
+ import numpy as np
123
+ llm_latency = float(np.percentile(llm_latencies, 95))
124
+ else:
125
+ llm_latency = 0.0
126
+
127
+ # add the usage stats to the usage stats dict
128
+ self.usage_stats.usage_stats_items[item.id] = UsageStatsItem(usage_stats_per_llm=usage_stats_per_llm,
129
+ runtime=runtime,
130
+ total_tokens=total_tokens,
131
+ min_timestamp=min_timestamp,
132
+ max_timestamp=max_timestamp,
133
+ llm_latency=llm_latency)
134
+ return self.usage_stats.usage_stats_items[item.id]
135
+
136
    async def run_workflow_local(self, session_manager: SessionManager):
        '''
        Launch the workflow with the specified questions and extract the output using the jsonpath.

        Runs every pending item concurrently via ``asyncio.gather``; each item's
        answer and intermediate-step trajectory are written back onto the item.
        On the first workflow failure, remaining items are skipped (via
        ``stop_event``) and ``self.workflow_interrupted`` is set.
        '''
        # import function level dependencies
        from jsonpath_ng import parse

        from nat.eval.runtime_event_subscriber import pull_intermediate

        # Run the workflow
        jsonpath_expr = parse(self.config.result_json_path)
        # Set once any item fails, so still-queued items become no-ops.
        stop_event = asyncio.Event()

        async def run_one(item: EvalInputItem):
            # A previous item failed: skip this one with an empty result.
            if stop_event.is_set():
                return "", []

            async with session_manager.run(item.input_obj) as runner:
                if not session_manager.workflow.has_single_output:
                    # raise an error if the workflow has multiple outputs
                    raise NotImplementedError("Multiple outputs are not supported")

                runner_result = None
                intermediate_future = None

                try:

                    # Start usage stats and intermediate steps collection in parallel
                    intermediate_future = pull_intermediate()
                    runner_result = runner.result()
                    base_output = await runner_result
                    intermediate_steps = await intermediate_future
                except NotImplementedError as e:
                    # raise original error
                    raise e
                except Exception as e:
                    logger.exception("Failed to run the workflow: %s", e, exc_info=True)
                    # stop processing if a workflow error occurs
                    self.workflow_interrupted = True

                    # Cancel any coroutines that are still running, avoiding a warning about unawaited coroutines
                    # (typically one of these two is what raised the exception and the other is still running)
                    for coro in (runner_result, intermediate_future):
                        if coro is not None:
                            asyncio.ensure_future(coro).cancel()

                    stop_event.set()
                    return

                # Best-effort coercion to str; ValueError means the output is
                # left as-is and handled by the pydantic/jsonpath branches below.
                try:
                    base_output = runner.convert(base_output, to_type=str)
                except ValueError:
                    pass

                # if base_output is a pydantic model dump it to json
                if isinstance(base_output, BaseModel):
                    output = base_output.model_dump_json(indent=2)
                else:
                    # Otherwise extract the answer with the configured jsonpath.
                    m = jsonpath_expr.find(base_output)
                    if (not m):
                        raise RuntimeError(f"Failed to extract output using jsonpath: {self.config.result_json_path}")
                    if (len(m) > 1):
                        logger.warning("Multiple matches found for jsonpath at row '%s'. Matches: %s. Using the first",
                                       base_output,
                                       m)
                    output = m[0].value

                # Record the answer and trajectory on the item, then log usage.
                item.output_obj = output
                item.trajectory = self.intermediate_step_adapter.validate_intermediate_steps(intermediate_steps)
                usage_stats_item = self._compute_usage_stats(item)

                self.weave_eval.log_prediction(item, output)
                await self.weave_eval.log_usage_stats(item, usage_stats_item)

        async def wrapped_run(item: EvalInputItem) -> None:
            # Wraps run_one so the progress bar ticks per completed item.
            # NOTE: closes over `pbar`, which is created below before this runs.
            await run_one(item)
            pbar.update(1)

        # if self.config.skip_complete is set skip eval_input_items with a non-empty output_obj
        if self.config.skip_completed_entries:
            eval_input_items = [item for item in self.eval_input.eval_input_items if not item.output_obj]
            if not eval_input_items:
                logger.warning("All items have a non-empty output. Skipping workflow pass altogether.")
                return
        else:
            eval_input_items = self.eval_input.eval_input_items
        pbar = tqdm(total=len(eval_input_items), desc="Running workflow")
        await asyncio.gather(*[wrapped_run(item) for item in eval_input_items])
        pbar.close()
225
+
226
+ async def run_workflow_remote(self):
227
+ from nat.eval.remote_workflow import EvaluationRemoteWorkflowHandler
228
+ handler = EvaluationRemoteWorkflowHandler(self.config, self.eval_config.general.max_concurrency)
229
+ await handler.run_workflow_remote(self.eval_input)
230
+ for item in self.eval_input.eval_input_items:
231
+ usage_stats_item = self._compute_usage_stats(item)
232
+ self.weave_eval.log_prediction(item, item.output_obj)
233
+ await self.weave_eval.log_usage_stats(item, usage_stats_item)
234
+
235
+ async def profile_workflow(self) -> ProfilerResults:
236
+ """
237
+ Profile a dataset
238
+ """
239
+
240
+ if not self.eval_config.general.profiler:
241
+ logger.info("Profiler is not enabled. Skipping profiling.")
242
+ return ProfilerResults()
243
+
244
+ from nat.profiler.profile_runner import ProfilerRunner
245
+
246
+ all_stats = []
247
+ for input_item in self.eval_input.eval_input_items:
248
+ all_stats.append(input_item.trajectory)
249
+
250
+ profiler_runner = ProfilerRunner(self.eval_config.general.profiler,
251
+ self.eval_config.general.output_dir,
252
+ write_output=self.config.write_output)
253
+
254
+ return await profiler_runner.run(all_stats)
255
+
256
+ def cleanup_output_directory(self):
257
+ '''Remove contents of the output directory if it exists'''
258
+ output_config = self.eval_config.general.output
259
+ output_dir = output_config.dir
260
+
261
+ if not (output_config and output_dir.exists()):
262
+ return
263
+
264
+ # If cleanup is true, remove the entire directory and we are done
265
+ if output_config.cleanup:
266
+ logger.info("Cleaning up entire output directory: %s", output_config.dir)
267
+ shutil.rmtree(output_config.dir)
268
+ return
269
+
270
+ if output_config.job_management.max_jobs == 0:
271
+ # No eviction policy
272
+ return
273
+
274
+ base_dir = output_dir / "jobs"
275
+ if not base_dir.exists():
276
+ return
277
+
278
+ # Get all subdirectories, which represent individual job runs
279
+ job_dirs = [d for d in base_dir.iterdir() if d.is_dir()]
280
+ if len(job_dirs) <= output_config.job_management.max_jobs:
281
+ return
282
+
283
+ # Determine sort key based on eviction_policy, defaulting to creation time
284
+ if output_config.job_management.eviction_policy == JobEvictionPolicy.TIME_MODIFIED:
285
+
286
+ def sort_key(x):
287
+ return x.stat().st_mtime
288
+
289
+ logger.info("Using last modified time for job eviction policy.")
290
+ else:
291
+
292
+ def sort_key(x):
293
+ return x.stat().st_ctime
294
+
295
+ logger.info("Using creation time for job eviction policy.")
296
+
297
+ # Sort directories (oldest first)
298
+ job_dirs.sort(key=sort_key)
299
+ num_to_delete = len(job_dirs) - output_config.job_management.max_jobs
300
+
301
+ logger.info("Found %d jobs, exceeding limit of %d. Removing %d oldest jobs.",
302
+ len(job_dirs),
303
+ output_config.job_management.max_jobs,
304
+ num_to_delete)
305
+
306
+ for dir_to_delete in job_dirs[:num_to_delete]:
307
+ try:
308
+ logger.info("Deleting old job directory: %s", dir_to_delete)
309
+ shutil.rmtree(dir_to_delete)
310
+ except Exception as e:
311
+ logger.exception("Failed to delete old job directory: %s: %s", dir_to_delete, e, exc_info=True)
312
+
313
+ def write_output(self, dataset_handler: DatasetHandler, profiler_results: ProfilerResults): # pylint: disable=unused-argument # noqa: E501
314
+ workflow_output_file = self.eval_config.general.output_dir / "workflow_output.json"
315
+ workflow_output_file.parent.mkdir(parents=True, exist_ok=True)
316
+
317
+ # Write the workflow output to a file (this can be used for re-running the evaluation)
318
+
319
+ step_filter = self.eval_config.general.output.workflow_output_step_filter \
320
+ if self.eval_config.general.output else None
321
+ workflow_output = dataset_handler.publish_eval_input(self.eval_input, step_filter)
322
+ with open(workflow_output_file, "w", encoding="utf-8") as f:
323
+ # set indent to 2 for pretty printing
324
+ f.write(workflow_output)
325
+ self.workflow_output_file = workflow_output_file
326
+ logger.info("Workflow output written to %s", workflow_output_file)
327
+
328
+ # Write the output of each evaluator to a separate json file
329
+ for evaluator_name, eval_output in self.evaluation_results:
330
+ output_file = self.eval_config.general.output_dir / f"{evaluator_name}_output.json"
331
+ output_file.parent.mkdir(parents=True, exist_ok=True)
332
+ # create json content using the evaluation results
333
+ output = eval_output.model_dump_json(indent=2)
334
+ with open(output_file, "w", encoding="utf-8") as f:
335
+ f.write(output)
336
+ self.evaluator_output_files.append(output_file)
337
+ logger.info("Evaluation results written to %s", output_file)
338
+
339
+ def publish_output(self, dataset_handler: DatasetHandler, profiler_results: ProfilerResults):
340
+ """Publish the output"""
341
+ if self.config.write_output:
342
+ self.write_output(dataset_handler, profiler_results)
343
+
344
+ if self.workflow_interrupted:
345
+ # Issue a warning if the workflow was not completed on all datasets
346
+ msg = ("Workflow execution was interrupted due to an error. The results may be incomplete. "
347
+ "You can re-execute evaluation for incomplete results by running "
348
+ "`eval` with the --skip_completed_entries flag.")
349
+ logger.warning(msg)
350
+
351
+ self.weave_eval.log_summary(self.usage_stats, self.evaluation_results, profiler_results)
352
+
353
+ async def run_single_evaluator(self, evaluator_name: str, evaluator: Any):
354
+ """Run a single evaluator and store its results."""
355
+ try:
356
+ eval_output = await evaluator.evaluate_fn(self.eval_input)
357
+ self.evaluation_results.append((evaluator_name, eval_output))
358
+
359
+ await self.weave_eval.alog_score(eval_output, evaluator_name)
360
+ except Exception as e:
361
+ logger.exception("An error occurred while running evaluator %s: %s", evaluator_name, e, exc_info=True)
362
+
363
+ async def run_evaluators(self, evaluators: dict[str, Any]):
364
+ """Run all configured evaluators asynchronously."""
365
+ tasks = [self.run_single_evaluator(name, evaluator) for name, evaluator in evaluators.items() if evaluator]
366
+
367
+ if not tasks:
368
+ logger.warning("All evaluators were empty or invalid.")
369
+ return
370
+
371
+ try:
372
+ await asyncio.gather(*tasks)
373
+ except Exception as e:
374
+ logger.exception("An error occurred while running evaluators: %s", e, exc_info=True)
375
+ raise
376
+ finally:
377
+ # Finish prediction loggers in Weave
378
+ await self.weave_eval.afinish_loggers()
379
+
380
+ def apply_overrides(self):
381
+ from nat.cli.cli_utils.config_override import load_and_override_config
382
+ from nat.data_models.config import Config
383
+ from nat.runtime.loader import PluginTypes
384
+ from nat.runtime.loader import discover_and_register_plugins
385
+ from nat.utils.data_models.schema_validator import validate_schema
386
+
387
+ # Register plugins before validation
388
+ discover_and_register_plugins(PluginTypes.CONFIG_OBJECT)
389
+
390
+ config_dict = load_and_override_config(self.config.config_file, self.config.override)
391
+ config = validate_schema(config_dict, Config)
392
+ return config
393
+
394
+ def _get_workflow_alias(self, workflow_type: str | None = None):
395
+ """Get the workflow alias for displaying in evaluation UI."""
396
+ if self.eval_config.general.workflow_alias:
397
+ return self.eval_config.general.workflow_alias
398
+
399
+ if not workflow_type or workflow_type == "EmptyFunctionConfig":
400
+ return "nat-eval"
401
+
402
+ return workflow_type
403
+
404
    async def run_and_evaluate(self,
                               session_manager: SessionManager | None = None,
                               job_id: str | None = None) -> EvaluationRunOutput:
        """
        Run the workflow with the specified config file and evaluate the dataset.

        Orchestrates the full pass: config load/override, output-directory cleanup,
        per-job output routing, dataset loading, local or remote workflow execution,
        evaluator runs, profiling, result publication, and output upload.

        Args:
            session_manager: Optional pre-built session manager; one is created from
                the built workflow when not supplied (local runs only).
            job_id: Optional job identifier; when set (or auto-generated), outputs are
                written under a per-job subdirectory.

        Returns:
            EvaluationRunOutput with output file paths, evaluation results, usage
            stats and profiler results (early returns omit the latter when there is
            nothing to evaluate).
        """
        logger.info("Starting evaluation run with config file: %s", self.config.config_file)

        from nat.builder.eval_builder import WorkflowEvalBuilder
        from nat.runtime.loader import load_config

        # Load and override the config
        if self.config.override:
            config = self.apply_overrides()
        else:
            config = load_config(self.config.config_file)
        self.eval_config = config.eval
        workflow_alias = self._get_workflow_alias(config.workflow.type)
        logger.debug("Loaded %s evaluation configuration: %s", workflow_alias, self.eval_config)

        # Cleanup the output directory (full wipe or oldest-job eviction, per config)
        if self.eval_config.general.output:
            self.cleanup_output_directory()

        # Generate a job_id if append_job_id_to_output_dir is enabled and no job_id provided
        if (self.eval_config.general.output
                and self.eval_config.general.output.job_management.append_job_id_to_output_dir and not job_id):
            job_id = "job_" + str(uuid4())
            logger.info("Generated job ID for output directory: %s", job_id)

        # If a job id is provided keep the data per-job
        # NOTE: this mutates self.eval_config in place so downstream consumers see the job dir
        if job_id:
            self.eval_config.general.output_dir = self.eval_config.general.output_dir / f"jobs/{job_id}"
            if self.eval_config.general.output:
                self.eval_config.general.output.dir = self.eval_config.general.output_dir

        # Load the input dataset
        # For multiple datasets, one handler per dataset can be created
        dataset_config = self.eval_config.general.dataset  # Currently only one dataset is supported
        if not dataset_config:
            logger.info("No dataset found, nothing to evaluate")
            # Early return: no profiling/usage stats are included in this case
            return EvaluationRunOutput(
                workflow_output_file=self.workflow_output_file,
                evaluator_output_files=self.evaluator_output_files,
                workflow_interrupted=self.workflow_interrupted,
            )

        dataset_handler = DatasetHandler(dataset_config=dataset_config,
                                         reps=self.config.reps,
                                         concurrency=self.eval_config.general.max_concurrency,
                                         num_passes=self.config.num_passes,
                                         adjust_dataset_size=self.config.adjust_dataset_size)
        self.eval_input = dataset_handler.get_eval_input_from_dataset(self.config.dataset)
        if not self.eval_input.eval_input_items:
            logger.info("Dataset is empty. Nothing to evaluate.")
            return EvaluationRunOutput(
                workflow_output_file=self.workflow_output_file,
                evaluator_output_files=self.evaluator_output_files,
                workflow_interrupted=self.workflow_interrupted,
            )

        # Run workflow and evaluate; the builder context must stay open for both steps
        async with WorkflowEvalBuilder.from_config(config=config) as eval_workflow:
            # Initialize Weave integration
            self.weave_eval.initialize_logger(workflow_alias, self.eval_input, config)

            # Run workflow: remote endpoint takes precedence; local run may be skipped
            if self.config.endpoint:
                await self.run_workflow_remote()
            else:
                if not self.config.skip_workflow:
                    if session_manager is None:
                        session_manager = SessionManager(eval_workflow.build(),
                                                         max_concurrency=self.eval_config.general.max_concurrency)
                    await self.run_workflow_local(session_manager)

            # Evaluate
            evaluators = {name: eval_workflow.get_evaluator(name) for name in self.eval_config.evaluators}
            await self.run_evaluators(evaluators)

        # Profile the workflow
        profiler_results = await self.profile_workflow()

        # compute total runtime as the span between the earliest and latest item timestamps
        if self.usage_stats.usage_stats_items:
            self.usage_stats.total_runtime = max(self.usage_stats.usage_stats_items.values(),
                                                 key=lambda x: x.max_timestamp).max_timestamp - \
                min(self.usage_stats.usage_stats_items.values(), key=lambda x: x.min_timestamp).min_timestamp
        else:
            self.usage_stats.total_runtime = 0.0

        # Publish the results
        self.publish_output(dataset_handler, profiler_results)

        # Run custom scripts and upload evaluation outputs to S3
        if self.eval_config.general.output:
            output_uploader = OutputUploader(self.eval_config.general.output, job_id=job_id)
            output_uploader.run_custom_scripts()
            await output_uploader.upload_directory()

        return EvaluationRunOutput(workflow_output_file=self.workflow_output_file,
                                   evaluator_output_files=self.evaluator_output_files,
                                   workflow_interrupted=self.workflow_interrupted,
                                   eval_input=self.eval_input,
                                   evaluation_results=self.evaluation_results,
                                   usage_stats=self.usage_stats,
                                   profiler_results=profiler_results)
@@ -0,0 +1,14 @@
1
+ # SPDX-FileCopyrightText: Copyright (c) 2025, NVIDIA CORPORATION & AFFILIATES. All rights reserved.
2
+ # SPDX-License-Identifier: Apache-2.0
3
+ #
4
+ # Licensed under the Apache License, Version 2.0 (the "License");
5
+ # you may not use this file except in compliance with the License.
6
+ # You may obtain a copy of the License at
7
+ #
8
+ # http://www.apache.org/licenses/LICENSE-2.0
9
+ #
10
+ # Unless required by applicable law or agreed to in writing, software
11
+ # distributed under the License is distributed on an "AS IS" BASIS,
12
+ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13
+ # See the License for the specific language governing permissions and
14
+ # limitations under the License.
@@ -0,0 +1,77 @@
1
+ # SPDX-FileCopyrightText: Copyright (c) 2025, NVIDIA CORPORATION & AFFILIATES. All rights reserved.
2
+ # SPDX-License-Identifier: Apache-2.0
3
+ #
4
+ # Licensed under the Apache License, Version 2.0 (the "License");
5
+ # you may not use this file except in compliance with the License.
6
+ # You may obtain a copy of the License at
7
+ #
8
+ # http://www.apache.org/licenses/LICENSE-2.0
9
+ #
10
+ # Unless required by applicable law or agreed to in writing, software
11
+ # distributed under the License is distributed on an "AS IS" BASIS,
12
+ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13
+ # See the License for the specific language governing permissions and
14
+ # limitations under the License.
15
+
16
+ import asyncio
17
+ from abc import ABC
18
+ from abc import abstractmethod
19
+
20
+ from tqdm import tqdm
21
+
22
+ from nat.eval.evaluator.evaluator_model import EvalInput
23
+ from nat.eval.evaluator.evaluator_model import EvalInputItem
24
+ from nat.eval.evaluator.evaluator_model import EvalOutput
25
+ from nat.eval.evaluator.evaluator_model import EvalOutputItem
26
+ from nat.eval.utils.tqdm_position_registry import TqdmPositionRegistry
27
+
28
+
29
class BaseEvaluator(ABC):
    """
    Base class for custom evaluators.

    .. warning::
        **Experimental Feature**: The Evaluation API is experimental and may change in future releases.
        Future versions may introduce breaking changes without notice.

    Each custom evaluator must implement the `evaluate_item` method which is used to evaluate a
    single EvalInputItem.
    """

    def __init__(self, max_concurrency: int = 4, tqdm_desc: str = "Evaluating"):
        # Semaphore bounds how many items are evaluated concurrently
        self.max_concurrency = max_concurrency
        self.semaphore = asyncio.Semaphore(max_concurrency)
        self.tqdm_desc = tqdm_desc

    @abstractmethod
    async def evaluate_item(self, item: EvalInputItem) -> EvalOutputItem:
        """Each evaluator must implement this for item-level evaluation"""
        pass

    async def evaluate(self, eval_input: EvalInput) -> EvalOutput:
        """Evaluate all items concurrently (bounded by the semaphore) and aggregate scores.

        A failing item yields an EvalOutputItem with score 0.0 and the error in its
        reasoning, rather than aborting the run. The average score is computed over
        numeric scores only; it is None when there are none.
        """
        pbar = None
        tqdm_position = None
        try:
            tqdm_position = TqdmPositionRegistry.claim()
            pbar = tqdm(total=len(eval_input.eval_input_items), desc=self.tqdm_desc, position=tqdm_position)

            async def wrapped(item: EvalInputItem) -> EvalOutputItem:
                async with self.semaphore:
                    try:
                        return await self.evaluate_item(item)
                    except Exception as e:
                        # If the evaluator fails, return an error item with a score of 0.0
                        return EvalOutputItem(id=item.id, score=0.0, reasoning={"error": f"Evaluator error: {str(e)}"})
                    finally:
                        # Progress advances for successes and failures alike
                        pbar.update(1)

            output_items = await asyncio.gather(*[wrapped(item) for item in eval_input.eval_input_items])
        finally:
            # Bug fix: guard cleanup. If claim() or tqdm() raised, the original
            # unconditionally called pbar.close() on None (AttributeError) and
            # released an unbound tqdm_position (NameError), masking the real error.
            if pbar is not None:
                pbar.close()
            if tqdm_position is not None:
                TqdmPositionRegistry.release(tqdm_position)

        # Compute average if possible (non-numeric scores are excluded)
        numeric_scores = [item.score for item in output_items if isinstance(item.score, (int, float))]
        avg_score = round(sum(numeric_scores) / len(numeric_scores), 2) if numeric_scores else None

        return EvalOutput(average_score=avg_score, eval_output_items=output_items)
@@ -0,0 +1,45 @@
1
+ # SPDX-FileCopyrightText: Copyright (c) 2025, NVIDIA CORPORATION & AFFILIATES. All rights reserved.
2
+ # SPDX-License-Identifier: Apache-2.0
3
+ #
4
+ # Licensed under the Apache License, Version 2.0 (the "License");
5
+ # you may not use this file except in compliance with the License.
6
+ # You may obtain a copy of the License at
7
+ #
8
+ # http://www.apache.org/licenses/LICENSE-2.0
9
+ #
10
+ # Unless required by applicable law or agreed to in writing, software
11
+ # distributed under the License is distributed on an "AS IS" BASIS,
12
+ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13
+ # See the License for the specific language governing permissions and
14
+ # limitations under the License.
15
+
16
+ import typing
17
+
18
+ from pydantic import BaseModel
19
+
20
+ from nat.data_models.intermediate_step import IntermediateStep
21
+
22
+
23
class EvalInputItem(BaseModel):
    """A single dataset entry to be run through the workflow and evaluated."""
    # Identifier of the dataset entry (any serializable value)
    id: typing.Any
    # Input passed to the workflow
    input_obj: typing.Any
    # Expected (reference) output used by evaluators
    expected_output_obj: typing.Any
    output_obj: typing.Any = None  # populated by the workflow
    # Expected sequence of intermediate steps, when the dataset provides one
    expected_trajectory: list[IntermediateStep] = []
    trajectory: list[IntermediateStep] = []  # populated by the workflow
    # The original dataset row this item was constructed from
    full_dataset_entry: typing.Any
31
+
32
+
33
class EvalInput(BaseModel):
    """The full set of items passed to the workflow and evaluators."""
    eval_input_items: list[EvalInputItem]
35
+
36
+
37
class EvalOutputItem(BaseModel):
    """Per-item evaluation result produced by an evaluator."""
    id: typing.Any  # id or input_obj from EvalInputItem
    score: typing.Any  # float or any serializable type
    reasoning: typing.Any
41
+
42
+
43
class EvalOutput(BaseModel):
    """Aggregate result of one evaluator over the whole dataset."""
    average_score: typing.Any  # float or any serializable type
    eval_output_items: list[EvalOutputItem]