nvidia-nat 1.3.0a20250910__py3-none-any.whl → 1.4.0a20251112__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- nat/agent/base.py +13 -8
- nat/agent/prompt_optimizer/prompt.py +68 -0
- nat/agent/prompt_optimizer/register.py +149 -0
- nat/agent/react_agent/agent.py +6 -5
- nat/agent/react_agent/register.py +49 -39
- nat/agent/reasoning_agent/reasoning_agent.py +17 -15
- nat/agent/register.py +2 -0
- nat/agent/responses_api_agent/__init__.py +14 -0
- nat/agent/responses_api_agent/register.py +126 -0
- nat/agent/rewoo_agent/agent.py +304 -117
- nat/agent/rewoo_agent/prompt.py +19 -22
- nat/agent/rewoo_agent/register.py +51 -38
- nat/agent/tool_calling_agent/agent.py +75 -17
- nat/agent/tool_calling_agent/register.py +46 -23
- nat/authentication/api_key/api_key_auth_provider.py +6 -11
- nat/authentication/api_key/api_key_auth_provider_config.py +8 -5
- nat/authentication/credential_validator/__init__.py +14 -0
- nat/authentication/credential_validator/bearer_token_validator.py +557 -0
- nat/authentication/http_basic_auth/http_basic_auth_provider.py +1 -1
- nat/authentication/interfaces.py +5 -2
- nat/authentication/oauth2/oauth2_auth_code_flow_provider.py +69 -36
- nat/authentication/oauth2/oauth2_auth_code_flow_provider_config.py +2 -1
- nat/authentication/oauth2/oauth2_resource_server_config.py +125 -0
- nat/builder/builder.py +55 -23
- nat/builder/component_utils.py +9 -5
- nat/builder/context.py +54 -15
- nat/builder/eval_builder.py +14 -9
- nat/builder/framework_enum.py +1 -0
- nat/builder/front_end.py +1 -1
- nat/builder/function.py +370 -0
- nat/builder/function_info.py +1 -1
- nat/builder/intermediate_step_manager.py +38 -2
- nat/builder/workflow.py +5 -0
- nat/builder/workflow_builder.py +306 -54
- nat/cli/cli_utils/config_override.py +1 -1
- nat/cli/commands/info/info.py +16 -6
- nat/cli/commands/mcp/__init__.py +14 -0
- nat/cli/commands/mcp/mcp.py +986 -0
- nat/cli/commands/optimize.py +90 -0
- nat/cli/commands/start.py +1 -1
- nat/cli/commands/workflow/templates/config.yml.j2 +14 -13
- nat/cli/commands/workflow/templates/register.py.j2 +2 -2
- nat/cli/commands/workflow/templates/workflow.py.j2 +35 -21
- nat/cli/commands/workflow/workflow_commands.py +60 -18
- nat/cli/entrypoint.py +15 -11
- nat/cli/main.py +3 -0
- nat/cli/register_workflow.py +38 -4
- nat/cli/type_registry.py +72 -1
- nat/control_flow/__init__.py +0 -0
- nat/control_flow/register.py +20 -0
- nat/control_flow/router_agent/__init__.py +0 -0
- nat/control_flow/router_agent/agent.py +329 -0
- nat/control_flow/router_agent/prompt.py +48 -0
- nat/control_flow/router_agent/register.py +91 -0
- nat/control_flow/sequential_executor.py +166 -0
- nat/data_models/agent.py +34 -0
- nat/data_models/api_server.py +199 -69
- nat/data_models/authentication.py +23 -9
- nat/data_models/common.py +47 -0
- nat/data_models/component.py +2 -0
- nat/data_models/component_ref.py +11 -0
- nat/data_models/config.py +41 -17
- nat/data_models/dataset_handler.py +4 -3
- nat/data_models/function.py +34 -0
- nat/data_models/function_dependencies.py +8 -0
- nat/data_models/intermediate_step.py +9 -1
- nat/data_models/llm.py +15 -1
- nat/data_models/openai_mcp.py +46 -0
- nat/data_models/optimizable.py +208 -0
- nat/data_models/optimizer.py +161 -0
- nat/data_models/span.py +41 -3
- nat/data_models/thinking_mixin.py +2 -2
- nat/embedder/azure_openai_embedder.py +2 -1
- nat/embedder/nim_embedder.py +3 -2
- nat/embedder/openai_embedder.py +3 -2
- nat/eval/config.py +1 -1
- nat/eval/dataset_handler/dataset_downloader.py +3 -2
- nat/eval/dataset_handler/dataset_filter.py +34 -2
- nat/eval/evaluate.py +10 -3
- nat/eval/evaluator/base_evaluator.py +1 -1
- nat/eval/rag_evaluator/evaluate.py +7 -4
- nat/eval/register.py +4 -0
- nat/eval/runtime_evaluator/__init__.py +14 -0
- nat/eval/runtime_evaluator/evaluate.py +123 -0
- nat/eval/runtime_evaluator/register.py +100 -0
- nat/eval/swe_bench_evaluator/evaluate.py +1 -1
- nat/eval/trajectory_evaluator/register.py +1 -1
- nat/eval/tunable_rag_evaluator/evaluate.py +1 -1
- nat/eval/usage_stats.py +2 -0
- nat/eval/utils/output_uploader.py +3 -2
- nat/eval/utils/weave_eval.py +17 -3
- nat/experimental/decorators/experimental_warning_decorator.py +27 -7
- nat/experimental/test_time_compute/functions/execute_score_select_function.py +1 -1
- nat/experimental/test_time_compute/functions/plan_select_execute_function.py +7 -3
- nat/experimental/test_time_compute/functions/ttc_tool_orchestration_function.py +1 -1
- nat/experimental/test_time_compute/functions/ttc_tool_wrapper_function.py +3 -3
- nat/experimental/test_time_compute/models/strategy_base.py +2 -2
- nat/experimental/test_time_compute/selection/llm_based_output_merging_selector.py +1 -1
- nat/front_ends/console/authentication_flow_handler.py +82 -30
- nat/front_ends/console/console_front_end_plugin.py +19 -7
- nat/front_ends/fastapi/auth_flow_handlers/http_flow_handler.py +1 -1
- nat/front_ends/fastapi/auth_flow_handlers/websocket_flow_handler.py +52 -17
- nat/front_ends/fastapi/dask_client_mixin.py +65 -0
- nat/front_ends/fastapi/fastapi_front_end_config.py +25 -3
- nat/front_ends/fastapi/fastapi_front_end_plugin.py +140 -3
- nat/front_ends/fastapi/fastapi_front_end_plugin_worker.py +445 -265
- nat/front_ends/fastapi/job_store.py +518 -99
- nat/front_ends/fastapi/main.py +11 -19
- nat/front_ends/fastapi/message_handler.py +69 -44
- nat/front_ends/fastapi/message_validator.py +8 -7
- nat/front_ends/fastapi/utils.py +57 -0
- nat/front_ends/mcp/introspection_token_verifier.py +73 -0
- nat/front_ends/mcp/mcp_front_end_config.py +71 -3
- nat/front_ends/mcp/mcp_front_end_plugin.py +85 -21
- nat/front_ends/mcp/mcp_front_end_plugin_worker.py +248 -29
- nat/front_ends/mcp/memory_profiler.py +320 -0
- nat/front_ends/mcp/tool_converter.py +78 -25
- nat/front_ends/simple_base/simple_front_end_plugin_base.py +3 -1
- nat/llm/aws_bedrock_llm.py +21 -8
- nat/llm/azure_openai_llm.py +14 -5
- nat/llm/litellm_llm.py +80 -0
- nat/llm/nim_llm.py +23 -9
- nat/llm/openai_llm.py +19 -7
- nat/llm/register.py +4 -0
- nat/llm/utils/thinking.py +1 -1
- nat/observability/exporter/base_exporter.py +1 -1
- nat/observability/exporter/processing_exporter.py +29 -55
- nat/observability/exporter/span_exporter.py +43 -15
- nat/observability/exporter_manager.py +2 -2
- nat/observability/mixin/redaction_config_mixin.py +5 -4
- nat/observability/mixin/tagging_config_mixin.py +26 -14
- nat/observability/mixin/type_introspection_mixin.py +420 -107
- nat/observability/processor/batching_processor.py +1 -1
- nat/observability/processor/processor.py +3 -0
- nat/observability/processor/redaction/__init__.py +24 -0
- nat/observability/processor/redaction/contextual_redaction_processor.py +125 -0
- nat/observability/processor/redaction/contextual_span_redaction_processor.py +66 -0
- nat/observability/processor/redaction/redaction_processor.py +177 -0
- nat/observability/processor/redaction/span_header_redaction_processor.py +92 -0
- nat/observability/processor/span_tagging_processor.py +21 -14
- nat/observability/register.py +16 -0
- nat/profiler/callbacks/langchain_callback_handler.py +32 -7
- nat/profiler/callbacks/llama_index_callback_handler.py +36 -2
- nat/profiler/callbacks/token_usage_base_model.py +2 -0
- nat/profiler/decorators/framework_wrapper.py +61 -9
- nat/profiler/decorators/function_tracking.py +35 -3
- nat/profiler/forecasting/models/linear_model.py +1 -1
- nat/profiler/forecasting/models/random_forest_regressor.py +1 -1
- nat/profiler/inference_optimization/bottleneck_analysis/nested_stack_analysis.py +1 -1
- nat/profiler/inference_optimization/experimental/prefix_span_analysis.py +1 -1
- nat/profiler/parameter_optimization/__init__.py +0 -0
- nat/profiler/parameter_optimization/optimizable_utils.py +93 -0
- nat/profiler/parameter_optimization/optimizer_runtime.py +67 -0
- nat/profiler/parameter_optimization/parameter_optimizer.py +189 -0
- nat/profiler/parameter_optimization/parameter_selection.py +107 -0
- nat/profiler/parameter_optimization/pareto_visualizer.py +460 -0
- nat/profiler/parameter_optimization/prompt_optimizer.py +384 -0
- nat/profiler/parameter_optimization/update_helpers.py +66 -0
- nat/profiler/utils.py +3 -1
- nat/registry_handlers/pypi/register_pypi.py +5 -3
- nat/registry_handlers/rest/register_rest.py +5 -3
- nat/retriever/milvus/retriever.py +1 -1
- nat/retriever/nemo_retriever/register.py +2 -1
- nat/runtime/loader.py +1 -1
- nat/runtime/runner.py +111 -6
- nat/runtime/session.py +49 -3
- nat/settings/global_settings.py +2 -2
- nat/tool/chat_completion.py +4 -1
- nat/tool/code_execution/code_sandbox.py +3 -6
- nat/tool/code_execution/local_sandbox/Dockerfile.sandbox +19 -32
- nat/tool/code_execution/local_sandbox/local_sandbox_server.py +6 -1
- nat/tool/code_execution/local_sandbox/sandbox.requirements.txt +2 -0
- nat/tool/code_execution/local_sandbox/start_local_sandbox.sh +10 -4
- nat/tool/datetime_tools.py +1 -1
- nat/tool/github_tools.py +450 -0
- nat/tool/memory_tools/add_memory_tool.py +3 -3
- nat/tool/memory_tools/delete_memory_tool.py +3 -4
- nat/tool/memory_tools/get_memory_tool.py +4 -4
- nat/tool/register.py +2 -7
- nat/tool/server_tools.py +15 -2
- nat/utils/__init__.py +76 -0
- nat/utils/callable_utils.py +70 -0
- nat/utils/data_models/schema_validator.py +1 -1
- nat/utils/decorators.py +210 -0
- nat/utils/exception_handlers/automatic_retries.py +278 -72
- nat/utils/io/yaml_tools.py +73 -3
- nat/utils/log_levels.py +25 -0
- nat/utils/responses_api.py +26 -0
- nat/utils/string_utils.py +16 -0
- nat/utils/type_converter.py +12 -3
- nat/utils/type_utils.py +6 -2
- nvidia_nat-1.4.0a20251112.dist-info/METADATA +197 -0
- {nvidia_nat-1.3.0a20250910.dist-info → nvidia_nat-1.4.0a20251112.dist-info}/RECORD +199 -165
- {nvidia_nat-1.3.0a20250910.dist-info → nvidia_nat-1.4.0a20251112.dist-info}/entry_points.txt +1 -0
- nat/cli/commands/info/list_mcp.py +0 -461
- nat/data_models/temperature_mixin.py +0 -43
- nat/data_models/top_p_mixin.py +0 -43
- nat/observability/processor/header_redaction_processor.py +0 -123
- nat/observability/processor/redaction_processor.py +0 -77
- nat/tool/code_execution/test_code_execution_sandbox.py +0 -414
- nat/tool/github_tools/create_github_commit.py +0 -133
- nat/tool/github_tools/create_github_issue.py +0 -87
- nat/tool/github_tools/create_github_pr.py +0 -106
- nat/tool/github_tools/get_github_file.py +0 -106
- nat/tool/github_tools/get_github_issue.py +0 -166
- nat/tool/github_tools/get_github_pr.py +0 -256
- nat/tool/github_tools/update_github_issue.py +0 -100
- nvidia_nat-1.3.0a20250910.dist-info/METADATA +0 -373
- /nat/{tool/github_tools → agent/prompt_optimizer}/__init__.py +0 -0
- {nvidia_nat-1.3.0a20250910.dist-info → nvidia_nat-1.4.0a20251112.dist-info}/WHEEL +0 -0
- {nvidia_nat-1.3.0a20250910.dist-info → nvidia_nat-1.4.0a20251112.dist-info}/licenses/LICENSE-3rd-party.txt +0 -0
- {nvidia_nat-1.3.0a20250910.dist-info → nvidia_nat-1.4.0a20251112.dist-info}/licenses/LICENSE.md +0 -0
- {nvidia_nat-1.3.0a20250910.dist-info → nvidia_nat-1.4.0a20251112.dist-info}/top_level.txt +0 -0
nat/data_models/optimizer.py
ADDED

@@ -0,0 +1,161 @@
+# SPDX-FileCopyrightText: Copyright (c) 2021-2025, NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+# SPDX-License-Identifier: Apache-2.0
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+from enum import Enum
+from pathlib import Path
+
+from pydantic import BaseModel
+from pydantic import Field
+
+
+class OptimizerMetric(BaseModel):
+    """
+    Parameters used by the workflow optimizer to define a metric to optimize.
+    """
+    evaluator_name: str = Field(description="Name of the metric to optimize.")
+    direction: str = Field(description="Direction of the optimization. Can be 'maximize' or 'minimize'.")
+    weight: float = Field(description="Weight of the metric in the optimization process.", default=1.0)
+
+
+class SamplerType(str, Enum):
+    BAYESIAN = "bayesian"
+    GRID = "grid"
+
+
+class NumericOptimizationConfig(BaseModel):
+    """
+    Configuration for numeric/enum optimization (Optuna).
+    """
+    enabled: bool = Field(default=True, description="Enable numeric optimization")
+    n_trials: int = Field(description="Number of trials for numeric optimization.", default=20)
+    sampler: SamplerType | None = Field(
+        default=None,
+        description="Sampling strategy for numeric optimization. Options: None or 'bayesian' uses \
+the Optuna default (TPE for single-objective, NSGA-II for multi-objective) or 'grid' performs \
+exhaustive grid search over parameter combinations. Defaults to None.",
+    )
+
+
+class PromptGAOptimizationConfig(BaseModel):
+    """
+    Configuration for prompt optimization using a Genetic Algorithm.
+    """
+    enabled: bool = Field(default=False, description="Enable GA-based prompt optimization")
+
+    # Prompt optimization function hooks
+    prompt_population_init_function: str | None = Field(
+        default=None,
+        description="Optional function name to initialize/mutate candidate prompts.",
+    )
+    prompt_recombination_function: str | None = Field(
+        default=None,
+        description="Optional function name to recombine two parent prompts into a child.",
+    )
+
+    # Genetic algorithm configuration
+    ga_population_size: int = Field(
+        description="Population size for genetic algorithm prompt optimization.",
+        default=24,
+    )
+    ga_generations: int = Field(
+        description="Number of generations to evolve in GA prompt optimization.",
+        default=15,
+    )
+    ga_offspring_size: int | None = Field(
+        description="Number of offspring to produce per generation. Defaults to population_size - elitism.",
+        default=None,
+    )
+    ga_crossover_rate: float = Field(
+        description="Probability of applying crossover during reproduction.",
+        default=0.8,
+        ge=0.0,
+        le=1.0,
+    )
+    ga_mutation_rate: float = Field(
+        description="Probability of mutating a child after crossover.",
+        default=0.3,
+        ge=0.0,
+        le=1.0,
+    )
+    ga_elitism: int = Field(
+        description="Number of top individuals carried over unchanged each generation.",
+        default=2,
+    )
+    ga_selection_method: str = Field(
+        description="Parent selection strategy: 'tournament' or 'roulette'.",
+        default="tournament",
+    )
+    ga_tournament_size: int = Field(
+        description="Tournament size when using tournament selection.",
+        default=3,
+    )
+    ga_parallel_evaluations: int = Field(
+        description="Max number of individuals to evaluate concurrently per generation.",
+        default=8,
+    )
+    ga_diversity_lambda: float = Field(
+        description="Strength of diversity penalty (0 disables). Penalizes identical/near-identical prompts.",
+        default=0.0,
+        ge=0.0,
+    )
+
+
+class OptimizerConfig(BaseModel):
+    """
+    Parameters used by the workflow optimizer.
+    """
+    output_path: Path | None = Field(
+        default=None,
+        description="Path to the output directory where the results will be saved.",
+    )
+
+    eval_metrics: dict[str, OptimizerMetric] | None = Field(
+        description="List of evaluation metrics to optimize.",
+        default=None,
+    )
+
+    reps_per_param_set: int = Field(
+        default=3,
+        description="Number of repetitions per parameter set for the optimization.",
+    )
+
+    target: float | None = Field(
+        description=(
+            "Target value for the optimization. If set, the optimization will stop when this value is reached."),
+        default=None,
+    )
+
+    multi_objective_combination_mode: str = Field(
+        description="Method to combine multiple objectives into a single score.",
+        default="harmonic",
+    )
+
+    # Nested configs
+    numeric: NumericOptimizationConfig = NumericOptimizationConfig()
+    prompt: PromptGAOptimizationConfig = PromptGAOptimizationConfig()
+
+
+class OptimizerRunConfig(BaseModel):
+    """
+    Parameters used for an Optimizer run.
+    """
+    # Eval parameters
+
+    config_file: Path | BaseModel  # allow for instantiated configs to be passed in
+    dataset: str | Path | None  # dataset file path can be specified in the config file
+    result_json_path: str = "$"
+    endpoint: str | None = None  # only used when running the workflow remotely
+    endpoint_timeout: int = 300
+    override: tuple[tuple[str, str], ...] = ()
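To make the new optimizer data models concrete, here is a minimal usage sketch, assuming nvidia-nat 1.4.0a is installed so that nat.data_models.optimizer is importable as added above; the metric names are illustrative and not part of the diff:

```python
# Minimal sketch: compose an optimizer config from the models added above.
# Assumes the 1.4.0a wheel is installed; metric names are made up.
from nat.data_models.optimizer import NumericOptimizationConfig
from nat.data_models.optimizer import OptimizerConfig
from nat.data_models.optimizer import OptimizerMetric
from nat.data_models.optimizer import SamplerType

config = OptimizerConfig(
    eval_metrics={
        "accuracy": OptimizerMetric(evaluator_name="accuracy", direction="maximize", weight=2.0),
        "latency": OptimizerMetric(evaluator_name="avg_llm_latency", direction="minimize"),
    },
    reps_per_param_set=3,  # re-run each parameter set 3 times to reduce noise
    numeric=NumericOptimizationConfig(n_trials=50, sampler=SamplerType.GRID),
)
print(config.model_dump_json(indent=2))  # pydantic v2 serialization
```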
nat/data_models/span.py
CHANGED

@@ -128,10 +128,48 @@ class SpanStatus(BaseModel):
     message: str | None = Field(default=None, description="The status message of the span.")


+def _generate_nonzero_trace_id() -> int:
+    """Generate a non-zero 128-bit trace ID."""
+    return uuid.uuid4().int
+
+
+def _generate_nonzero_span_id() -> int:
+    """Generate a non-zero 64-bit span ID."""
+    return uuid.uuid4().int >> 64
+
+
 class SpanContext(BaseModel):
-    trace_id: int = Field(default_factory=
-
-
+    trace_id: int = Field(default_factory=_generate_nonzero_trace_id,
+                          description="The OTel-style 128-bit trace ID of the span.")
+    span_id: int = Field(default_factory=_generate_nonzero_span_id,
+                         description="The OTel-style 64-bit span ID of the span.")
+
+    @field_validator("trace_id", mode="before")
+    @classmethod
+    def _validate_trace_id(cls, v: int | str | None) -> int:
+        """Regenerate if trace_id is None; raise an exception if trace_id is invalid."""
+        if isinstance(v, str):
+            v = uuid.UUID(v).int
+        if isinstance(v, type(None)):
+            v = _generate_nonzero_trace_id()
+        if v <= 0 or v >> 128:
+            raise ValueError(f"Invalid trace_id: must be a non-zero 128-bit integer, got {v}")
+        return v
+
+    @field_validator("span_id", mode="before")
+    @classmethod
+    def _validate_span_id(cls, v: int | str | None) -> int:
+        """Regenerate if span_id is None; raise an exception if span_id is invalid."""
+        if isinstance(v, str):
+            try:
+                v = int(v, 16)
+            except ValueError:
+                raise ValueError(f"span_id unable to be parsed: {v}")
+        if isinstance(v, type(None)):
+            v = _generate_nonzero_span_id()
+        if v <= 0 or v >> 64:
+            raise ValueError(f"Invalid span_id: must be a non-zero 64-bit integer, got {v}")
+        return v


 class Span(BaseModel):
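A short sketch of how the new SpanContext validators behave, assuming the class is importable from nat.data_models.span as shown; note that pydantic raises ValidationError (a ValueError subclass) around the validator's error:

```python
# Sketch of the new validators, assuming `nat.data_models.span` is importable.
from nat.data_models.span import SpanContext

ctx = SpanContext()  # trace_id / span_id auto-generated, guaranteed non-zero
fixed = SpanContext(trace_id="00000000-0000-4000-8000-000000000001")  # UUID string -> int
hex_id = SpanContext(span_id="1f")  # hex string parsed via int(v, 16)

try:
    SpanContext(span_id=0)  # zero or out-of-range IDs are rejected
except ValueError as err:  # pydantic's ValidationError subclasses ValueError
    print(err)
```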
nat/data_models/thinking_mixin.py
CHANGED

@@ -51,7 +51,7 @@ class ThinkingMixin(
     Returns the system prompt to use for thinking.
     For NVIDIA Nemotron, returns "/think" if enabled, else "/no_think".
     For Llama Nemotron v1.5, returns "/think" if enabled, else "/no_think".
-    For Llama Nemotron v1.0, returns "detailed thinking on" if enabled, else "detailed thinking off".
+    For Llama Nemotron v1.0 or v1.1, returns "detailed thinking on" if enabled, else "detailed thinking off".
     If thinking is not supported on the model, returns None.

     Returns:

@@ -72,7 +72,7 @@ class ThinkingMixin(
         return "/think" if self.thinking else "/no_think"

     if model.startswith("nvidia/llama"):
-        if "v1-0" in model or "v1-1" in model:
+        if "v1-0" in model or "v1-1" in model or model.endswith("v1"):
             return f"detailed thinking {'on' if self.thinking else 'off'}"

         if "v1-5" in model:
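The broadened check means bare "...-v1" model names now also receive the "detailed thinking" system prompt. A standalone re-statement of the predicate (illustrative only, not the library code; the model names are examples):

```python
# Standalone re-statement of the broadened version check (illustrative only).
def uses_detailed_thinking_prompt(model: str) -> bool:
    return model.startswith("nvidia/llama") and (
        "v1-0" in model or "v1-1" in model or model.endswith("v1"))

assert uses_detailed_thinking_prompt("nvidia/llama-3.1-nemotron-ultra-253b-v1")
assert not uses_detailed_thinking_prompt("nvidia/llama-3.3-nemotron-super-49b-v1-5")
```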
nat/embedder/azure_openai_embedder.py
CHANGED

@@ -20,6 +20,7 @@ from pydantic import Field
 from nat.builder.builder import Builder
 from nat.builder.embedder import EmbedderProviderInfo
 from nat.cli.register_workflow import register_embedder_provider
+from nat.data_models.common import OptionalSecretStr
 from nat.data_models.embedder import EmbedderBaseConfig
 from nat.data_models.retry_mixin import RetryMixin

@@ -29,7 +30,7 @@ class AzureOpenAIEmbedderModelConfig(EmbedderBaseConfig, RetryMixin, name="azure

     model_config = ConfigDict(protected_namespaces=(), extra="allow")

-    api_key:
+    api_key: OptionalSecretStr = Field(default=None, description="Azure OpenAI API key to interact with hosted model.")
     api_version: str = Field(default="2025-04-01-preview", description="Azure OpenAI API version.")
     azure_endpoint: str | None = Field(validation_alias=AliasChoices("azure_endpoint", "base_url"),
                                        serialization_alias="azure_endpoint",
nat/embedder/nim_embedder.py
CHANGED

@@ -23,6 +23,7 @@ from pydantic import Field
 from nat.builder.builder import Builder
 from nat.builder.embedder import EmbedderProviderInfo
 from nat.cli.register_workflow import register_embedder_provider
+from nat.data_models.common import OptionalSecretStr
 from nat.data_models.embedder import EmbedderBaseConfig
 from nat.data_models.retry_mixin import RetryMixin

@@ -41,7 +42,7 @@ TruncationOption = typing.Annotated[str, AfterValidator(option_in_allowed_values
 class NIMEmbedderModelConfig(EmbedderBaseConfig, RetryMixin, name="nim"):
     """A NVIDIA Inference Microservice (NIM) embedder provider to be used with an embedder client."""

-    api_key:
+    api_key: OptionalSecretStr = Field(default=None, description="NVIDIA API key to interact with hosted NIM.")
     base_url: str | None = Field(default=None, description="Base url to the hosted NIM.")
     model_name: str = Field(validation_alias=AliasChoices("model_name", "model"),
                             serialization_alias="model",

@@ -50,7 +51,7 @@ class NIMEmbedderModelConfig(EmbedderBaseConfig, RetryMixin, name="nim"):
                             description=("The truncation strategy if the input on the "
                                          "server side if it's too large."))

-    model_config = ConfigDict(protected_namespaces=())
+    model_config = ConfigDict(protected_namespaces=(), extra="allow")


 @register_embedder_provider(config_type=NIMEmbedderModelConfig)
nat/embedder/openai_embedder.py
CHANGED

@@ -20,6 +20,7 @@ from pydantic import Field
 from nat.builder.builder import Builder
 from nat.builder.embedder import EmbedderProviderInfo
 from nat.cli.register_workflow import register_embedder_provider
+from nat.data_models.common import OptionalSecretStr
 from nat.data_models.embedder import EmbedderBaseConfig
 from nat.data_models.retry_mixin import RetryMixin

@@ -27,9 +28,9 @@ from nat.data_models.retry_mixin import RetryMixin
 class OpenAIEmbedderModelConfig(EmbedderBaseConfig, RetryMixin, name="openai"):
     """An OpenAI LLM provider to be used with an LLM client."""

-    model_config = ConfigDict(protected_namespaces=())
+    model_config = ConfigDict(protected_namespaces=(), extra="allow")

-    api_key:
+    api_key: OptionalSecretStr = Field(default=None, description="OpenAI API key to interact with hosted model.")
     base_url: str | None = Field(default=None, description="Base url to the hosted model.")
     model_name: str = Field(validation_alias=AliasChoices("model_name", "model"),
                             serialization_alias="model",
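All three embedder configs now declare api_key as OptionalSecretStr and set extra="allow". The diff imports OptionalSecretStr from nat.data_models.common without showing its definition; the sketch below assumes it is an optional pydantic SecretStr, which masks the key in reprs and logs:

```python
# Standalone sketch of the assumed behavior (OptionalSecretStr's definition
# is not shown in the diff; `SecretStr | None` is an assumption).
from pydantic import BaseModel, Field, SecretStr

OptionalSecretStr = SecretStr | None  # assumed alias

class DemoEmbedderConfig(BaseModel, extra="allow"):
    api_key: OptionalSecretStr = Field(default=None)

cfg = DemoEmbedderConfig(api_key="sk-demo", dimensions=256)  # extra field kept
print(cfg)                             # api_key=SecretStr('**********') dimensions=256
print(cfg.api_key.get_secret_value())  # raw key only on explicit request
```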
nat/eval/config.py
CHANGED

@@ -27,7 +27,7 @@ class EvaluationRunConfig(BaseModel):
     """
     Parameters used for a single evaluation run.
     """
-    config_file: Path
+    config_file: Path | BaseModel
     dataset: str | None = None  # dataset file path can be specified in the config file
     result_json_path: str = "$"
     skip_workflow: bool = False
nat/eval/dataset_handler/dataset_downloader.py
CHANGED

@@ -19,6 +19,7 @@ import boto3
 import requests
 from botocore.exceptions import NoCredentialsError

+from nat.data_models.common import get_secret_value
 from nat.data_models.dataset_handler import EvalDatasetConfig

 logger = logging.getLogger(__name__)

@@ -46,8 +47,8 @@ class DatasetDownloader:
         try:
             self._s3_client = boto3.client("s3",
                                            endpoint_url=self.s3_config.endpoint_url,
-                                           aws_access_key_id=self.s3_config.access_key,
-                                           aws_secret_access_key=self.s3_config.secret_key)
+                                           aws_access_key_id=get_secret_value(self.s3_config.access_key),
+                                           aws_secret_access_key=get_secret_value(self.s3_config.secret_key))
         except NoCredentialsError as e:
             logger.error("AWS credentials not available: %s", e)
             raise
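The downloader now routes S3 credentials through get_secret_value. The helper's definition is not part of this diff; a plausible stand-in, assuming it unwraps a pydantic SecretStr and passes plain strings or None through:

```python
# Hypothetical stand-in for nat.data_models.common.get_secret_value
# (the real definition is not shown in this diff).
from pydantic import SecretStr

def get_secret_value(value: SecretStr | str | None) -> str | None:
    # Unwrap SecretStr; leave plain strings and None untouched.
    return value.get_secret_value() if isinstance(value, SecretStr) else value
```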
nat/eval/dataset_handler/dataset_filter.py
CHANGED

@@ -13,6 +13,8 @@
 # See the License for the specific language governing permissions and
 # limitations under the License.

+import fnmatch
+
 import pandas as pd

 from nat.data_models.dataset_handler import EvalFilterConfig

@@ -24,6 +26,7 @@ class DatasetFilter:
     - If an allowlist is provided, only keep rows matching the filter values.
     - If a denylist is provided, remove rows matching the filter values.
     - If the filter column does not exist in the DataFrame, the filtering is skipped for that column.
+    - Supports Unix shell-style wildcards (``*``, ``?``, ``[seq]``, ``[!seq]``) for string matching.

     This is a utility class that is dataset agnostic and can be used to filter any DataFrame based on the provided
     filter configuration.

@@ -33,6 +36,33 @@ class DatasetFilter:

         self.filter_config = filter_config

+    @staticmethod
+    def _match_wildcard_patterns(series: pd.Series, patterns: list[str | int | float]) -> pd.Series:
+        """
+        Match series values against wildcard patterns and exact values.
+
+        Args:
+            series (pd.Series): pandas Series to match against
+            patterns (list[str | int | float]): List of patterns/values
+
+        Returns:
+            pd.Series: Boolean Series indicating matches
+        """
+        # Convert series to string for pattern matching
+        str_series = series.astype(str)
+
+        # Initialize boolean mask
+        matches = pd.Series([False] * len(series), index=series.index)
+
+        # Check each pattern using fnmatch with list comprehension to avoid lambda capture
+        for pattern in patterns:
+            pattern_str = str(pattern)
+            pattern_matches = pd.Series([fnmatch.fnmatch(val, pattern_str) for val in str_series],
+                                        index=str_series.index)
+            matches |= pattern_matches
+
+        return matches
+
     def apply_filters(self, df) -> pd.DataFrame:

         filtered_df = df.copy()

@@ -41,12 +71,14 @@ class DatasetFilter:
         if self.filter_config.allowlist:
             for column, values in self.filter_config.allowlist.field.items():
                 if column in filtered_df.columns:
-
+                    matches = self._match_wildcard_patterns(filtered_df[column], values)
+                    filtered_df = filtered_df[matches]

         # Apply denylist (remove specified rows)
         if self.filter_config.denylist:
             for column, values in self.filter_config.denylist.field.items():
                 if column in filtered_df.columns:
-
+                    matches = self._match_wildcard_patterns(filtered_df[column], values)
+                    filtered_df = filtered_df[~matches]

         return filtered_df
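A runnable sketch of the new wildcard semantics, mirroring _match_wildcard_patterns on a toy DataFrame (the column and values are made up):

```python
# Toy demo of fnmatch-based allowlist/denylist matching, as added above.
import fnmatch

import pandas as pd

df = pd.DataFrame({"id": ["swe_bench_1", "swe_bench_2", "hotpot_9"]})
patterns = ["swe_bench_*"]  # Unix shell-style wildcard

str_series = df["id"].astype(str)
matches = pd.Series([False] * len(df), index=df.index)
for pattern in patterns:
    matches |= pd.Series([fnmatch.fnmatch(v, str(pattern)) for v in str_series],
                         index=str_series.index)

print(df[matches])   # allowlist keeps swe_bench_1, swe_bench_2
print(df[~matches])  # denylist would instead keep hotpot_9
```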
nat/eval/evaluate.py
CHANGED

@@ -104,6 +104,8 @@ class EvaluationRun:
             usage_stats_per_llm[llm_name].prompt_tokens += step.token_usage.prompt_tokens
             usage_stats_per_llm[llm_name].completion_tokens += step.token_usage.completion_tokens
             usage_stats_per_llm[llm_name].total_tokens += step.token_usage.total_tokens
+            usage_stats_per_llm[llm_name].reasoning_tokens += step.token_usage.reasoning_tokens
+            usage_stats_per_llm[llm_name].cached_tokens += step.token_usage.cached_tokens
             total_tokens += step.token_usage.total_tokens

         # find min and max event timestamps

@@ -449,10 +451,14 @@
         from nat.runtime.loader import load_config

         # Load and override the config
-
+        config = None
+        if isinstance(self.config.config_file, BaseModel):
+            config = self.config.config_file
+        elif self.config.override:
             config = self.apply_overrides()
         else:
             config = load_config(self.config.config_file)
+
         self.eval_config = config.eval
         workflow_alias = self._get_workflow_alias(config.workflow.type)
         logger.debug("Loaded %s evaluation configuration: %s", workflow_alias, self.eval_config)

@@ -508,7 +514,7 @@
         # Run workflow and evaluate
         async with WorkflowEvalBuilder.from_config(config=config) as eval_workflow:
             # Initialize Weave integration
-            self.weave_eval.initialize_logger(workflow_alias, self.eval_input, config)
+            self.weave_eval.initialize_logger(workflow_alias, self.eval_input, config, job_id=job_id)

             with self.eval_trace_context.evaluation_context():
                 # Run workflow

@@ -516,7 +522,8 @@
                 await self.run_workflow_remote()
             elif not self.config.skip_workflow:
                 if session_manager is None:
-
+                    workflow = await eval_workflow.build()
+                    session_manager = SessionManager(workflow,
                                                      max_concurrency=self.eval_config.general.max_concurrency)
                 await self.run_workflow_local(session_manager)
nat/eval/evaluator/base_evaluator.py
CHANGED

@@ -71,7 +71,7 @@ class BaseEvaluator(ABC):
             TqdmPositionRegistry.release(tqdm_position)

         # Compute average if possible
-        numeric_scores = [item.score for item in output_items if isinstance(item.score,
+        numeric_scores = [item.score for item in output_items if isinstance(item.score, int | float)]
         avg_score = round(sum(numeric_scores) / len(numeric_scores), 2) if numeric_scores else None

         return EvalOutput(average_score=avg_score, eval_output_items=output_items)
nat/eval/rag_evaluator/evaluate.py
CHANGED

@@ -116,11 +116,14 @@ class RAGEvaluator:
             """Convert NaN or None to 0.0 for safe arithmetic/serialization."""
             return 0.0 if v is None or (isinstance(v, float) and math.isnan(v)) else v

-        #
+        # Keep original scores (preserving NaN/None) for output
+        original_scores_dict = {metric: [score.get(metric) for score in scores] for metric in scores[0]}
+
+        # Convert from list of dicts to dict of lists, coercing NaN/None to 0.0 for average calculation
         scores_dict = {metric: [_nan_to_zero(score.get(metric)) for score in scores] for metric in scores[0]}
         first_metric_name = list(scores_dict.keys())[0] if scores_dict else None

-        # Compute the average of each metric
+        # Compute the average of each metric using cleaned scores (NaN/None -> 0.0)
         average_scores = {
             metric: (sum(values) / len(values) if values else 0.0)
             for metric, values in scores_dict.items()

@@ -137,11 +140,11 @@
         else:
             ids = df["user_input"].tolist()  # Use "user_input" as ID fallback

-        # Construct EvalOutputItem list
+        # Construct EvalOutputItem list using original scores (preserving NaN/None)
         eval_output_items = [
             EvalOutputItem(
                 id=ids[i],
-                score=
+                score=original_scores_dict[first_metric_name][i] if first_metric_name else None,
                 reasoning={
                     key:
                     getattr(row, key, None)  # Use getattr to safely access attributes
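The effect of the change in miniature: averages are computed over NaN/None coerced to 0.0, while per-item output keeps the original value (the metric name is illustrative):

```python
# Sketch of the dual bookkeeping above: cleaned scores for the average,
# original scores (NaN preserved) for the per-item output.
import math

def _nan_to_zero(v):
    return 0.0 if v is None or (isinstance(v, float) and math.isnan(v)) else v

scores = [{"faithfulness": 0.8}, {"faithfulness": float("nan")}]
cleaned = [_nan_to_zero(s["faithfulness"]) for s in scores]
print(sum(cleaned) / len(cleaned))  # 0.4 -- average over coerced values
print(scores[1]["faithfulness"])    # nan -- preserved for EvalOutputItem
```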
nat/eval/register.py
CHANGED

@@ -17,6 +17,10 @@

 # Import evaluators which need to be automatically registered here
 from .rag_evaluator.register import register_ragas_evaluator
+from .runtime_evaluator.register import register_avg_llm_latency_evaluator
+from .runtime_evaluator.register import register_avg_num_llm_calls_evaluator
+from .runtime_evaluator.register import register_avg_tokens_per_llm_end_evaluator
+from .runtime_evaluator.register import register_avg_workflow_runtime_evaluator
 from .swe_bench_evaluator.register import register_swe_bench_evaluator
 from .trajectory_evaluator.register import register_trajectory_evaluator
 from .tunable_rag_evaluator.register import register_tunable_rag_evaluator
nat/eval/runtime_evaluator/__init__.py
ADDED

@@ -0,0 +1,14 @@
+# SPDX-FileCopyrightText: Copyright (c) 2025, NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+# SPDX-License-Identifier: Apache-2.0
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
nat/eval/runtime_evaluator/evaluate.py
ADDED

@@ -0,0 +1,123 @@
+# SPDX-FileCopyrightText: Copyright (c) 2025, NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+# SPDX-License-Identifier: Apache-2.0
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+from __future__ import annotations
+
+from collections import defaultdict
+from dataclasses import dataclass
+
+from nat.data_models.intermediate_step import IntermediateStepType
+from nat.eval.evaluator.base_evaluator import BaseEvaluator
+from nat.eval.evaluator.evaluator_model import EvalInputItem
+from nat.eval.evaluator.evaluator_model import EvalOutputItem
+from nat.profiler.intermediate_property_adapter import IntermediatePropertyAdaptor
+
+
+@dataclass
+class _CallTiming:
+    start_ts: float | None = None
+    end_ts: float | None = None
+
+    @property
+    def latency(self) -> float | None:
+        if self.start_ts is None or self.end_ts is None:
+            return None
+        return max(0.0, self.end_ts - self.start_ts)
+
+
+class AverageLLMLatencyEvaluator(BaseEvaluator):
+    """
+    Mean difference between connected LLM_START and LLM_END events (same UUID).
+    The score is the average latency in seconds for the item. Reasoning contains per-call latencies.
+    """
+
+    def __init__(self, max_concurrency: int = 8):
+        super().__init__(max_concurrency=max_concurrency, tqdm_desc="Evaluating Avg LLM Latency")
+
+    async def evaluate_item(self, item: EvalInputItem) -> EvalOutputItem:  # noqa: D401
+        calls: dict[str, _CallTiming] = defaultdict(_CallTiming)
+
+        for step in (IntermediatePropertyAdaptor.from_intermediate_step(s) for s in item.trajectory):
+            if step.event_type == IntermediateStepType.LLM_START:
+                calls[step.UUID].start_ts = step.event_timestamp
+            elif step.event_type == IntermediateStepType.LLM_END:
+                calls[step.UUID].end_ts = step.event_timestamp
+
+        latencies = [ct.latency for ct in calls.values() if ct.latency is not None]
+        avg_latency = sum(latencies) / len(latencies) if latencies else 0.0
+
+        reasoning = {
+            "num_llm_calls": len(latencies),
+            "latencies": latencies,
+        }
+        return EvalOutputItem(id=item.id, score=round(avg_latency, 4), reasoning=reasoning)
+
+
+class AverageWorkflowRuntimeEvaluator(BaseEvaluator):
+    """
+    Average workflow runtime per item: max(event_timestamp) - min(event_timestamp) across the trajectory.
+    The score is the runtime in seconds for the item.
+    """
+
+    def __init__(self, max_concurrency: int = 8):
+        super().__init__(max_concurrency=max_concurrency, tqdm_desc="Evaluating Avg Workflow Runtime")
+
+    async def evaluate_item(self, item: EvalInputItem) -> EvalOutputItem:  # noqa: D401
+        if not item.trajectory:
+            return EvalOutputItem(id=item.id, score=0.0, reasoning={"note": "no steps"})
+
+        timestamps = [s.event_timestamp for s in item.trajectory]
+        runtime = max(timestamps) - min(timestamps)
+        return EvalOutputItem(id=item.id, score=round(max(0.0, runtime), 4), reasoning={"steps": len(timestamps)})
+
+
+class AverageNumberOfLLMCallsEvaluator(BaseEvaluator):
+    """
+    Average number of LLM calls per item. The score is the count for the item.
+    """
+
+    def __init__(self, max_concurrency: int = 8):
+        super().__init__(max_concurrency=max_concurrency, tqdm_desc="Evaluating Avg # LLM Calls")
+
+    async def evaluate_item(self, item: EvalInputItem) -> EvalOutputItem:  # noqa: D401
+        num_calls = sum(1 for s in item.trajectory if s.event_type == IntermediateStepType.LLM_END)
+        return EvalOutputItem(id=item.id, score=float(num_calls), reasoning={"num_llm_end": num_calls})
+
+
+class AverageTokensPerLLMEndEvaluator(BaseEvaluator):
+    """
+    Average total tokens per LLM_END event: sum of prompt and completion tokens if available.
+    The score is the average tokens per LLM_END for the item (0 if none).
+    """
+
+    def __init__(self, max_concurrency: int = 8):
+        super().__init__(max_concurrency=max_concurrency, tqdm_desc="Evaluating Avg Tokens/LLM_END")
+
+    async def evaluate_item(self, item: EvalInputItem) -> EvalOutputItem:  # noqa: D401
+        totals: list[int] = []
+        for step in (IntermediatePropertyAdaptor.from_intermediate_step(s) for s in item.trajectory):
+            if step.event_type == IntermediateStepType.LLM_END:
+                total_tokens = step.token_usage.total_tokens
+                # If framework doesn't set total, compute from prompt+completion
+                if total_tokens == 0:
+                    total_tokens = step.token_usage.prompt_tokens + step.token_usage.completion_tokens
+                totals.append(total_tokens)
+
+        avg_tokens = (sum(totals) / len(totals)) if totals else 0.0
+        reasoning = {
+            "num_llm_end": len(totals),
+            "totals": totals,
+        }
+        return EvalOutputItem(id=item.id, score=round(avg_tokens, 2), reasoning=reasoning)