aiqtoolkit 1.2.0a20250707__py3-none-any.whl → 1.2.0a20250730__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release.


This version of aiqtoolkit might be problematic. Click here for more details.

Files changed (197)
  1. aiq/agent/base.py +171 -8
  2. aiq/agent/dual_node.py +1 -1
  3. aiq/agent/react_agent/agent.py +113 -113
  4. aiq/agent/react_agent/register.py +31 -14
  5. aiq/agent/rewoo_agent/agent.py +36 -35
  6. aiq/agent/rewoo_agent/register.py +2 -2
  7. aiq/agent/tool_calling_agent/agent.py +3 -7
  8. aiq/authentication/__init__.py +14 -0
  9. aiq/authentication/api_key/__init__.py +14 -0
  10. aiq/authentication/api_key/api_key_auth_provider.py +92 -0
  11. aiq/authentication/api_key/api_key_auth_provider_config.py +124 -0
  12. aiq/authentication/api_key/register.py +26 -0
  13. aiq/authentication/exceptions/__init__.py +14 -0
  14. aiq/authentication/exceptions/api_key_exceptions.py +38 -0
  15. aiq/authentication/exceptions/auth_code_grant_exceptions.py +86 -0
  16. aiq/authentication/exceptions/call_back_exceptions.py +38 -0
  17. aiq/authentication/exceptions/request_exceptions.py +54 -0
  18. aiq/authentication/http_basic_auth/__init__.py +0 -0
  19. aiq/authentication/http_basic_auth/http_basic_auth_provider.py +81 -0
  20. aiq/authentication/http_basic_auth/register.py +30 -0
  21. aiq/authentication/interfaces.py +93 -0
  22. aiq/authentication/oauth2/__init__.py +14 -0
  23. aiq/authentication/oauth2/oauth2_auth_code_flow_provider.py +107 -0
  24. aiq/authentication/oauth2/oauth2_auth_code_flow_provider_config.py +39 -0
  25. aiq/authentication/oauth2/register.py +25 -0
  26. aiq/authentication/register.py +21 -0
  27. aiq/builder/builder.py +64 -2
  28. aiq/builder/component_utils.py +16 -3
  29. aiq/builder/context.py +26 -0
  30. aiq/builder/eval_builder.py +43 -2
  31. aiq/builder/function.py +32 -4
  32. aiq/builder/function_base.py +1 -1
  33. aiq/builder/intermediate_step_manager.py +6 -8
  34. aiq/builder/user_interaction_manager.py +3 -0
  35. aiq/builder/workflow.py +23 -18
  36. aiq/builder/workflow_builder.py +420 -73
  37. aiq/cli/commands/info/list_mcp.py +103 -16
  38. aiq/cli/commands/sizing/__init__.py +14 -0
  39. aiq/cli/commands/sizing/calc.py +294 -0
  40. aiq/cli/commands/sizing/sizing.py +27 -0
  41. aiq/cli/commands/start.py +1 -0
  42. aiq/cli/entrypoint.py +2 -0
  43. aiq/cli/register_workflow.py +80 -0
  44. aiq/cli/type_registry.py +151 -30
  45. aiq/data_models/api_server.py +117 -11
  46. aiq/data_models/authentication.py +231 -0
  47. aiq/data_models/common.py +35 -7
  48. aiq/data_models/component.py +17 -9
  49. aiq/data_models/component_ref.py +33 -0
  50. aiq/data_models/config.py +60 -3
  51. aiq/data_models/embedder.py +1 -0
  52. aiq/data_models/function_dependencies.py +8 -0
  53. aiq/data_models/interactive.py +10 -1
  54. aiq/data_models/intermediate_step.py +15 -5
  55. aiq/data_models/its_strategy.py +30 -0
  56. aiq/data_models/llm.py +1 -0
  57. aiq/data_models/memory.py +1 -0
  58. aiq/data_models/object_store.py +44 -0
  59. aiq/data_models/retry_mixin.py +35 -0
  60. aiq/data_models/span.py +187 -0
  61. aiq/data_models/telemetry_exporter.py +2 -2
  62. aiq/embedder/nim_embedder.py +2 -1
  63. aiq/embedder/openai_embedder.py +2 -1
  64. aiq/eval/config.py +19 -1
  65. aiq/eval/dataset_handler/dataset_handler.py +75 -1
  66. aiq/eval/evaluate.py +53 -10
  67. aiq/eval/rag_evaluator/evaluate.py +23 -12
  68. aiq/eval/remote_workflow.py +7 -2
  69. aiq/eval/runners/__init__.py +14 -0
  70. aiq/eval/runners/config.py +39 -0
  71. aiq/eval/runners/multi_eval_runner.py +54 -0
  72. aiq/eval/usage_stats.py +6 -0
  73. aiq/eval/utils/weave_eval.py +5 -1
  74. aiq/experimental/__init__.py +0 -0
  75. aiq/experimental/decorators/__init__.py +0 -0
  76. aiq/experimental/decorators/experimental_warning_decorator.py +130 -0
  77. aiq/experimental/inference_time_scaling/__init__.py +0 -0
  78. aiq/experimental/inference_time_scaling/editing/__init__.py +0 -0
  79. aiq/experimental/inference_time_scaling/editing/iterative_plan_refinement_editor.py +147 -0
  80. aiq/experimental/inference_time_scaling/editing/llm_as_a_judge_editor.py +204 -0
  81. aiq/experimental/inference_time_scaling/editing/motivation_aware_summarization.py +107 -0
  82. aiq/experimental/inference_time_scaling/functions/__init__.py +0 -0
  83. aiq/experimental/inference_time_scaling/functions/execute_score_select_function.py +105 -0
  84. aiq/experimental/inference_time_scaling/functions/its_tool_orchestration_function.py +205 -0
  85. aiq/experimental/inference_time_scaling/functions/its_tool_wrapper_function.py +146 -0
  86. aiq/experimental/inference_time_scaling/functions/plan_select_execute_function.py +224 -0
  87. aiq/experimental/inference_time_scaling/models/__init__.py +0 -0
  88. aiq/experimental/inference_time_scaling/models/editor_config.py +132 -0
  89. aiq/experimental/inference_time_scaling/models/its_item.py +48 -0
  90. aiq/experimental/inference_time_scaling/models/scoring_config.py +112 -0
  91. aiq/experimental/inference_time_scaling/models/search_config.py +120 -0
  92. aiq/experimental/inference_time_scaling/models/selection_config.py +154 -0
  93. aiq/experimental/inference_time_scaling/models/stage_enums.py +43 -0
  94. aiq/experimental/inference_time_scaling/models/strategy_base.py +66 -0
  95. aiq/experimental/inference_time_scaling/models/tool_use_config.py +41 -0
  96. aiq/experimental/inference_time_scaling/register.py +36 -0
  97. aiq/experimental/inference_time_scaling/scoring/__init__.py +0 -0
  98. aiq/experimental/inference_time_scaling/scoring/llm_based_agent_scorer.py +168 -0
  99. aiq/experimental/inference_time_scaling/scoring/llm_based_plan_scorer.py +168 -0
  100. aiq/experimental/inference_time_scaling/scoring/motivation_aware_scorer.py +111 -0
  101. aiq/experimental/inference_time_scaling/search/__init__.py +0 -0
  102. aiq/experimental/inference_time_scaling/search/multi_llm_planner.py +128 -0
  103. aiq/experimental/inference_time_scaling/search/multi_query_retrieval_search.py +122 -0
  104. aiq/experimental/inference_time_scaling/search/single_shot_multi_plan_planner.py +128 -0
  105. aiq/experimental/inference_time_scaling/selection/__init__.py +0 -0
  106. aiq/experimental/inference_time_scaling/selection/best_of_n_selector.py +63 -0
  107. aiq/experimental/inference_time_scaling/selection/llm_based_agent_output_selector.py +131 -0
  108. aiq/experimental/inference_time_scaling/selection/llm_based_output_merging_selector.py +159 -0
  109. aiq/experimental/inference_time_scaling/selection/llm_based_plan_selector.py +128 -0
  110. aiq/experimental/inference_time_scaling/selection/threshold_selector.py +58 -0
  111. aiq/front_ends/console/authentication_flow_handler.py +233 -0
  112. aiq/front_ends/console/console_front_end_plugin.py +11 -2
  113. aiq/front_ends/fastapi/auth_flow_handlers/__init__.py +0 -0
  114. aiq/front_ends/fastapi/auth_flow_handlers/http_flow_handler.py +27 -0
  115. aiq/front_ends/fastapi/auth_flow_handlers/websocket_flow_handler.py +107 -0
  116. aiq/front_ends/fastapi/fastapi_front_end_config.py +20 -0
  117. aiq/front_ends/fastapi/fastapi_front_end_controller.py +68 -0
  118. aiq/front_ends/fastapi/fastapi_front_end_plugin.py +14 -1
  119. aiq/front_ends/fastapi/fastapi_front_end_plugin_worker.py +353 -31
  120. aiq/front_ends/fastapi/html_snippets/__init__.py +14 -0
  121. aiq/front_ends/fastapi/html_snippets/auth_code_grant_success.py +35 -0
  122. aiq/front_ends/fastapi/main.py +2 -0
  123. aiq/front_ends/fastapi/message_handler.py +102 -84
  124. aiq/front_ends/fastapi/step_adaptor.py +2 -1
  125. aiq/llm/aws_bedrock_llm.py +2 -1
  126. aiq/llm/nim_llm.py +2 -1
  127. aiq/llm/openai_llm.py +2 -1
  128. aiq/object_store/__init__.py +20 -0
  129. aiq/object_store/in_memory_object_store.py +74 -0
  130. aiq/object_store/interfaces.py +84 -0
  131. aiq/object_store/models.py +36 -0
  132. aiq/object_store/register.py +20 -0
  133. aiq/observability/__init__.py +14 -0
  134. aiq/observability/exporter/__init__.py +14 -0
  135. aiq/observability/exporter/base_exporter.py +449 -0
  136. aiq/observability/exporter/exporter.py +78 -0
  137. aiq/observability/exporter/file_exporter.py +33 -0
  138. aiq/observability/exporter/processing_exporter.py +269 -0
  139. aiq/observability/exporter/raw_exporter.py +52 -0
  140. aiq/observability/exporter/span_exporter.py +264 -0
  141. aiq/observability/exporter_manager.py +335 -0
  142. aiq/observability/mixin/__init__.py +14 -0
  143. aiq/observability/mixin/batch_config_mixin.py +26 -0
  144. aiq/observability/mixin/collector_config_mixin.py +23 -0
  145. aiq/observability/mixin/file_mixin.py +288 -0
  146. aiq/observability/mixin/file_mode.py +23 -0
  147. aiq/observability/mixin/resource_conflict_mixin.py +134 -0
  148. aiq/observability/mixin/serialize_mixin.py +61 -0
  149. aiq/observability/mixin/type_introspection_mixin.py +183 -0
  150. aiq/observability/processor/__init__.py +14 -0
  151. aiq/observability/processor/batching_processor.py +316 -0
  152. aiq/observability/processor/intermediate_step_serializer.py +28 -0
  153. aiq/observability/processor/processor.py +68 -0
  154. aiq/observability/register.py +32 -116
  155. aiq/observability/utils/__init__.py +14 -0
  156. aiq/observability/utils/dict_utils.py +236 -0
  157. aiq/observability/utils/time_utils.py +31 -0
  158. aiq/profiler/calc/__init__.py +14 -0
  159. aiq/profiler/calc/calc_runner.py +623 -0
  160. aiq/profiler/calc/calculations.py +288 -0
  161. aiq/profiler/calc/data_models.py +176 -0
  162. aiq/profiler/calc/plot.py +345 -0
  163. aiq/profiler/data_models.py +2 -0
  164. aiq/profiler/profile_runner.py +16 -13
  165. aiq/runtime/loader.py +8 -2
  166. aiq/runtime/runner.py +23 -9
  167. aiq/runtime/session.py +16 -5
  168. aiq/tool/chat_completion.py +74 -0
  169. aiq/tool/code_execution/README.md +152 -0
  170. aiq/tool/code_execution/code_sandbox.py +151 -72
  171. aiq/tool/code_execution/local_sandbox/.gitignore +1 -0
  172. aiq/tool/code_execution/local_sandbox/local_sandbox_server.py +139 -24
  173. aiq/tool/code_execution/local_sandbox/sandbox.requirements.txt +3 -1
  174. aiq/tool/code_execution/local_sandbox/start_local_sandbox.sh +27 -2
  175. aiq/tool/code_execution/register.py +7 -3
  176. aiq/tool/code_execution/test_code_execution_sandbox.py +414 -0
  177. aiq/tool/mcp/exceptions.py +142 -0
  178. aiq/tool/mcp/mcp_client.py +17 -3
  179. aiq/tool/mcp/mcp_tool.py +1 -1
  180. aiq/tool/register.py +1 -0
  181. aiq/tool/server_tools.py +2 -2
  182. aiq/utils/exception_handlers/automatic_retries.py +289 -0
  183. aiq/utils/exception_handlers/mcp.py +211 -0
  184. aiq/utils/io/model_processing.py +28 -0
  185. aiq/utils/log_utils.py +37 -0
  186. aiq/utils/string_utils.py +38 -0
  187. aiq/utils/type_converter.py +18 -2
  188. aiq/utils/type_utils.py +87 -0
  189. {aiqtoolkit-1.2.0a20250707.dist-info → aiqtoolkit-1.2.0a20250730.dist-info}/METADATA +37 -9
  190. {aiqtoolkit-1.2.0a20250707.dist-info → aiqtoolkit-1.2.0a20250730.dist-info}/RECORD +195 -80
  191. {aiqtoolkit-1.2.0a20250707.dist-info → aiqtoolkit-1.2.0a20250730.dist-info}/entry_points.txt +3 -0
  192. aiq/front_ends/fastapi/websocket.py +0 -153
  193. aiq/observability/async_otel_listener.py +0 -470
  194. {aiqtoolkit-1.2.0a20250707.dist-info → aiqtoolkit-1.2.0a20250730.dist-info}/WHEEL +0 -0
  195. {aiqtoolkit-1.2.0a20250707.dist-info → aiqtoolkit-1.2.0a20250730.dist-info}/licenses/LICENSE-3rd-party.txt +0 -0
  196. {aiqtoolkit-1.2.0a20250707.dist-info → aiqtoolkit-1.2.0a20250730.dist-info}/licenses/LICENSE.md +0 -0
  197. {aiqtoolkit-1.2.0a20250707.dist-info → aiqtoolkit-1.2.0a20250730.dist-info}/top_level.txt +0 -0
@@ -0,0 +1,30 @@
1
+ # SPDX-FileCopyrightText: Copyright (c) 2024-2025, NVIDIA CORPORATION & AFFILIATES. All rights reserved.
2
+ # SPDX-License-Identifier: Apache-2.0
3
+ #
4
+ # Licensed under the Apache License, Version 2.0 (the "License");
5
+ # you may not use this file except in compliance with the License.
6
+ # You may obtain a copy of the License at
7
+ #
8
+ # http://www.apache.org/licenses/LICENSE-2.0
9
+ #
10
+ # Unless required by applicable law or agreed to in writing, software
11
+ # distributed under the License is distributed on an "AS IS" BASIS,
12
+ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13
+ # See the License for the specific language governing permissions and
14
+ # limitations under the License.
15
+
16
+ import typing
17
+
18
+ from .common import BaseModelRegistryTag
19
+ from .common import TypedBaseModel
20
+
21
+
22
class ITSStrategyBaseConfig(TypedBaseModel, BaseModelRegistryTag):
    """
    Base configuration class for Inference Time Scaling (ITS) strategies.

    This class defines the common structure that concrete ITS strategy
    configurations subclass; it carries no fields of its own.
    """
    pass


# Type variable bound to ITSStrategyBaseConfig, for APIs that are generic over
# a concrete ITS strategy configuration type.
ITSStrategyBaseConfigT = typing.TypeVar("ITSStrategyBaseConfigT", bound=ITSStrategyBaseConfig)
aiq/data_models/llm.py CHANGED
@@ -20,6 +20,7 @@ from .common import TypedBaseModel
20
20
 
21
21
 
22
22
class LLMBaseConfig(TypedBaseModel, BaseModelRegistryTag):
    """Base configuration class for LLM providers; carries no fields of its own."""
    pass
24
25
 
25
26
 
aiq/data_models/memory.py CHANGED
@@ -20,6 +20,7 @@ from .common import TypedBaseModel
20
20
 
21
21
 
22
22
class MemoryBaseConfig(TypedBaseModel, BaseModelRegistryTag):
    """The base-level config object for a memory object.

    Memories provide an interface for storing and retrieving data; concrete
    memory configurations subclass this base.
    """
    pass
24
25
 
25
26
 
@@ -0,0 +1,44 @@
1
+ # SPDX-FileCopyrightText: Copyright (c) 2024-2025, NVIDIA CORPORATION & AFFILIATES. All rights reserved.
2
+ # SPDX-License-Identifier: Apache-2.0
3
+ #
4
+ # Licensed under the Apache License, Version 2.0 (the "License");
5
+ # you may not use this file except in compliance with the License.
6
+ # You may obtain a copy of the License at
7
+ #
8
+ # http://www.apache.org/licenses/LICENSE-2.0
9
+ #
10
+ # Unless required by applicable law or agreed to in writing, software
11
+ # distributed under the License is distributed on an "AS IS" BASIS,
12
+ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13
+ # See the License for the specific language governing permissions and
14
+ # limitations under the License.
15
+
16
+ import typing
17
+
18
+ from .common import BaseModelRegistryTag
19
+ from .common import TypedBaseModel
20
+
21
+
22
class ObjectStoreBaseConfig(TypedBaseModel, BaseModelRegistryTag):
    """Base configuration class for object store providers; carries no fields of its own."""
    pass


# Type variable bound to ObjectStoreBaseConfig, for APIs that are generic over
# a concrete object store configuration type.
ObjectStoreBaseConfigT = typing.TypeVar("ObjectStoreBaseConfigT", bound=ObjectStoreBaseConfig)
27
+
28
+
29
+ class KeyAlreadyExistsError(Exception):
30
+
31
+ def __init__(self, key: str, additional_message: str | None = None):
32
+ parts = [f"Key already exists: {key}."]
33
+ if additional_message:
34
+ parts.append(additional_message)
35
+ super().__init__(" ".join(parts))
36
+
37
+
38
+ class NoSuchKeyError(Exception):
39
+
40
+ def __init__(self, key: str, additional_message: str | None = None):
41
+ parts = [f"No object found with key: {key}."]
42
+ if additional_message:
43
+ parts.append(additional_message)
44
+ super().__init__(" ".join(parts))
@@ -0,0 +1,35 @@
1
+ # SPDX-FileCopyrightText: Copyright (c) 2024-2025, NVIDIA CORPORATION & AFFILIATES. All rights reserved.
2
+ # SPDX-License-Identifier: Apache-2.0
3
+ #
4
+ # Licensed under the Apache License, Version 2.0 (the "License");
5
+ # you may not use this file except in compliance with the License.
6
+ # You may obtain a copy of the License at
7
+ #
8
+ # http://www.apache.org/licenses/LICENSE-2.0
9
+ #
10
+ # Unless required by applicable law or agreed to in writing, software
11
+ # distributed under the License is distributed on an "AS IS" BASIS,
12
+ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13
+ # See the License for the specific language governing permissions and
14
+ # limitations under the License.
15
+
16
+ from pydantic import BaseModel
17
+ from pydantic import Field
18
+
19
+
20
class RetryMixin(BaseModel):
    """Mixin class for retry configuration.

    All fields are declared with ``exclude=True``, so they are omitted when the
    owning model is serialized (pydantic documented behavior of ``exclude``).
    """
    # Master switch: whether retries are attempted at all.
    do_auto_retry: bool = Field(default=True,
                                description="Whether to automatically retry method calls"
                                " that fail with a retryable error.",
                                exclude=True)
    # Maximum number of retry attempts for a failing call.
    num_retries: int = Field(default=5,
                             description="Number of times to retry a method call that fails"
                             " with a retryable error.",
                             exclude=True)
    # HTTP status codes treated as retryable; int or string form is accepted.
    retry_on_status_codes: list[int | str] = Field(default_factory=lambda: [429, 500, 502, 503, 504],
                                                   description="List of HTTP status codes that should trigger a retry.",
                                                   exclude=True)
    # Error-message substrings treated as retryable.
    retry_on_errors: list[str] | None = Field(default_factory=lambda: ["Too Many Requests"],
                                              description="List of error substrings that should trigger a retry.",
                                              exclude=True)
@@ -0,0 +1,187 @@
1
+ # SPDX-FileCopyrightText: Copyright (c) 2024-2025, NVIDIA CORPORATION & AFFILIATES. All rights reserved.
2
+ # SPDX-License-Identifier: Apache-2.0
3
+ #
4
+ # Licensed under the Apache License, Version 2.0 (the "License");
5
+ # you may not use this file except in compliance with the License.
6
+ # You may obtain a copy of the License at
7
+ #
8
+ # http://www.apache.org/licenses/LICENSE-2.0
9
+ #
10
+ # Unless required by applicable law or agreed to in writing, software
11
+ # distributed under the License is distributed on an "AS IS" BASIS,
12
+ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13
+ # See the License for the specific language governing permissions and
14
+ # limitations under the License.
15
+
16
+ import logging
17
+ import time
18
+ import uuid
19
+ from enum import Enum
20
+ from typing import Any
21
+
22
+ from pydantic import BaseModel
23
+ from pydantic import Field
24
+ from pydantic import field_validator
25
+
26
+ logger = logging.getLogger(__name__)
27
+
28
+
29
class SpanKind(Enum):
    """Semantic category assigned to a span."""
    LLM = "LLM"
    TOOL = "TOOL"
    WORKFLOW = "WORKFLOW"
    TASK = "TASK"
    FUNCTION = "FUNCTION"
    CUSTOM = "CUSTOM"
    SPAN = "SPAN"
    EMBEDDER = "EMBEDDER"
    RETRIEVER = "RETRIEVER"
    AGENT = "AGENT"
    RERANKER = "RERANKER"
    GUARDRAIL = "GUARDRAIL"
    EVALUATOR = "EVALUATOR"
    UNKNOWN = "UNKNOWN"


# Every kind except UNKNOWN is reachable from its "<KIND>_START" and
# "<KIND>_END" event types; LLM is additionally reachable from the streaming
# "LLM_NEW_TOKEN" event.
EVENT_TYPE_TO_SPAN_KIND_MAP: dict[str, SpanKind] = {
    f"{kind.name}_{suffix}": kind
    for kind in SpanKind
    if kind is not SpanKind.UNKNOWN
    for suffix in ("START", "END")
}
EVENT_TYPE_TO_SPAN_KIND_MAP["LLM_NEW_TOKEN"] = SpanKind.LLM


def event_type_to_span_kind(event_type: str) -> SpanKind:
    """Convert an event type to a span kind.

    Args:
        event_type (str): The event type to convert.

    Returns:
        SpanKind: The matching span kind, or ``SpanKind.UNKNOWN`` when the
        event type is not recognized.
    """
    return EVENT_TYPE_TO_SPAN_KIND_MAP.get(event_type, SpanKind.UNKNOWN)
87
+
88
+
89
class SpanAttributes(Enum):
    """Canonical attribute key strings attached to spans."""
    AIQ_SPAN_KIND = "aiq.span.kind"
    # Generic input payload attributes.
    INPUT_VALUE = "input.value"
    INPUT_MIME_TYPE = "input.mime_type"
    # Per-call LLM token counts.
    LLM_TOKEN_COUNT_PROMPT = "llm.token_count.prompt"
    LLM_TOKEN_COUNT_COMPLETION = "llm.token_count.completion"
    LLM_TOKEN_COUNT_TOTAL = "llm.token_count.total"
    # Generic output payload attributes.
    OUTPUT_VALUE = "output.value"
    OUTPUT_MIME_TYPE = "output.mime_type"
    # Aggregated aiq usage statistics.
    AIQ_USAGE_NUM_LLM_CALLS = "aiq.usage.num_llm_calls"
    AIQ_USAGE_SECONDS_BETWEEN_CALLS = "aiq.usage.seconds_between_calls"
    AIQ_USAGE_TOKEN_COUNT_PROMPT = "aiq.usage.token_count.prompt"
    AIQ_USAGE_TOKEN_COUNT_COMPLETION = "aiq.usage.token_count.completion"
    AIQ_USAGE_TOKEN_COUNT_TOTAL = "aiq.usage.token_count.total"
    AIQ_EVENT_TYPE = "aiq.event_type"
104
+
105
+
106
class MimeTypes(Enum):
    """MIME type strings used for span input/output payloads."""
    TEXT = "text/plain"
    JSON = "application/json"
109
+
110
+
111
class SpanStatusCode(Enum):
    """Outcome status of a span."""
    OK = "OK"
    ERROR = "ERROR"
    UNSET = "UNSET"
115
+
116
+
117
class SpanEvent(BaseModel):
    """A timestamped, named event recorded on a span."""
    # Nanoseconds since the Unix epoch (time.time() * 1e9). NOTE(review): the
    # annotation is float but the default factory produces an int — confirm
    # whether the float annotation is intentional.
    timestamp: float = Field(default_factory=lambda: int(time.time() * 1e9), description="The timestamp of the event.")
    name: str = Field(description="The name of the event.")
    attributes: dict[str, Any] = Field(default_factory=dict, description="The attributes of the event.")
121
+
122
+
123
class SpanStatus(BaseModel):
    """Completion status of a span: a status code with an optional message."""
    code: SpanStatusCode = Field(default=SpanStatusCode.OK, description="The status code of the span.")
    message: str | None = Field(default=None, description="The status message of the span.")
126
+
127
+
128
class SpanContext(BaseModel):
    """Trace and span identifiers for a span.

    Defaults derive from ``uuid.uuid4()``: the full 128-bit UUID integer for
    the trace ID, and its low 64 bits for the span ID.
    """
    trace_id: int = Field(default_factory=lambda: uuid.uuid4().int, description="The 128-bit trace ID of the span.")
    span_id: int = Field(default_factory=lambda: uuid.uuid4().int & ((1 << 64) - 1),
                         description="The 64-bit span ID of the span.")
132
+
133
+
134
class Span(BaseModel):
    """A single traced operation: name, parent linkage, timing, attributes,
    recorded events and completion status."""

    name: str = Field(description="The name of the span.")
    context: SpanContext | None = Field(default=None, description="The context of the span.")
    parent: "Span | None" = Field(default=None, description="The parent span of the span.")
    start_time: int = Field(default_factory=lambda: int(time.time() * 1e9), description="The start time of the span.")
    end_time: int | None = Field(default=None, description="The end time of the span.")
    attributes: dict[str, Any] = Field(default_factory=dict, description="The attributes of the span.")
    events: list[SpanEvent] = Field(default_factory=list, description="The events of the span.")
    status: SpanStatus = Field(default_factory=SpanStatus, description="The status of the span.")

    @field_validator('context', mode='before')
    @classmethod
    def set_default_context(cls, v: SpanContext | None) -> SpanContext:
        """Supply a freshly generated SpanContext when none is provided.

        Args:
            v (SpanContext | None): The context passed in by the caller.

        Returns:
            SpanContext: The caller's context, or a new one when it was None.
        """
        return SpanContext() if v is None else v

    def set_attribute(self, key: str, value: Any) -> None:
        """Store ``value`` under ``key`` in the span's attribute mapping.

        Args:
            key (str): The key of the attribute.
            value (Any): The value of the attribute.
        """
        self.attributes[key] = value

    def add_event(self, name: str, attributes: dict[str, Any] | None = None) -> None:
        """Record a named event on the span.

        Args:
            name (str): The name of the event.
            attributes (dict[str, Any] | None): Optional event attributes.
        """
        event_attributes = attributes if attributes is not None else {}
        # Rebind to a fresh list instead of mutating in place, so any
        # previously captured reference to the old list is left untouched.
        self.events = [*self.events, SpanEvent(name=name, attributes=event_attributes)]

    def end(self, end_time: int | None = None) -> None:
        """Close the span, stamping the current wall-clock time in nanoseconds
        when no explicit end time is given.

        Args:
            end_time (int | None): The end time of the span, in nanoseconds
                since the Unix epoch.
        """
        self.end_time = end_time if end_time is not None else int(time.time() * 1e9)
@@ -15,8 +15,8 @@
15
15
 
16
16
  import typing
17
17
 
18
- from .common import BaseModelRegistryTag
19
- from .common import TypedBaseModel
18
+ from aiq.data_models.common import BaseModelRegistryTag
19
+ from aiq.data_models.common import TypedBaseModel
20
20
 
21
21
 
22
22
  class TelemetryExporterBaseConfig(TypedBaseModel, BaseModelRegistryTag):
@@ -24,6 +24,7 @@ from aiq.builder.builder import Builder
24
24
  from aiq.builder.embedder import EmbedderProviderInfo
25
25
  from aiq.cli.register_workflow import register_embedder_provider
26
26
  from aiq.data_models.embedder import EmbedderBaseConfig
27
+ from aiq.data_models.retry_mixin import RetryMixin
27
28
 
28
29
  allowed_truncate_values = ["NONE", "START", "END"]
29
30
 
@@ -37,7 +38,7 @@ def option_in_allowed_values(v):
37
38
  TruncationOption = typing.Annotated[str, AfterValidator(option_in_allowed_values)]
38
39
 
39
40
 
40
- class NIMEmbedderModelConfig(EmbedderBaseConfig, name="nim"):
41
+ class NIMEmbedderModelConfig(EmbedderBaseConfig, RetryMixin, name="nim"):
41
42
  """A NVIDIA Inference Microservice (NIM) embedder provider to be used with an embedder client."""
42
43
 
43
44
  api_key: str | None = Field(default=None, description="NVIDIA API key to interact with hosted NIM.")
@@ -21,9 +21,10 @@ from aiq.builder.builder import Builder
21
21
  from aiq.builder.embedder import EmbedderProviderInfo
22
22
  from aiq.cli.register_workflow import register_embedder_provider
23
23
  from aiq.data_models.embedder import EmbedderBaseConfig
24
+ from aiq.data_models.retry_mixin import RetryMixin
24
25
 
25
26
 
26
- class OpenAIEmbedderModelConfig(EmbedderBaseConfig, name="openai"):
27
+ class OpenAIEmbedderModelConfig(EmbedderBaseConfig, RetryMixin, name="openai"):
27
28
  """An OpenAI LLM provider to be used with an LLM client."""
28
29
 
29
30
  model_config = ConfigDict(protected_namespaces=())
aiq/eval/config.py CHANGED
@@ -17,13 +17,18 @@ from pathlib import Path
17
17
 
18
18
  from pydantic import BaseModel
19
19
 
20
+ from aiq.eval.evaluator.evaluator_model import EvalInput
21
+ from aiq.eval.evaluator.evaluator_model import EvalOutput
22
+ from aiq.eval.usage_stats import UsageStats
23
+ from aiq.profiler.data_models import ProfilerResults
24
+
20
25
 
21
26
  class EvaluationRunConfig(BaseModel):
22
27
  """
23
28
  Parameters used for a single evaluation run.
24
29
  """
25
30
  config_file: Path
26
- dataset: str | None # dataset file path can be specified in the config file
31
+ dataset: str | None = None # dataset file path can be specified in the config file
27
32
  result_json_path: str = "$"
28
33
  skip_workflow: bool = False
29
34
  skip_completed_entries: bool = False
@@ -31,6 +36,14 @@ class EvaluationRunConfig(BaseModel):
31
36
  endpoint_timeout: int = 300
32
37
  reps: int = 1
33
38
  override: tuple[tuple[str, str], ...] = ()
39
+ # If false, the output will not be written to the output directory. This is
40
+ # useful when running evaluation via another tool.
41
+ write_output: bool = True
42
+ # if true, the dataset is adjusted to a multiple of the concurrency
43
+ adjust_dataset_size: bool = False
44
+ # number of passes at each concurrency, if 0 the dataset is adjusted to a multiple of the
45
+ # concurrency. This is only used if adjust_dataset_size is true.
46
+ num_passes: int = 0
34
47
 
35
48
 
36
49
  class EvaluationRunOutput(BaseModel):
@@ -40,3 +53,8 @@ class EvaluationRunOutput(BaseModel):
40
53
  workflow_output_file: Path | None
41
54
  evaluator_output_files: list[Path]
42
55
  workflow_interrupted: bool
56
+
57
+ eval_input: EvalInput
58
+ evaluation_results: list[tuple[str, EvalOutput]]
59
+ usage_stats: UsageStats | None = None
60
+ profiler_results: ProfilerResults
@@ -14,6 +14,7 @@
14
14
  # limitations under the License.
15
15
 
16
16
  import json
17
+ import math
17
18
 
18
19
  import pandas as pd
19
20
 
@@ -33,12 +34,23 @@ class DatasetHandler:
33
34
  One DatasetHandler object is needed for each dataset to be evaluated.
34
35
  """
35
36
 
36
- def __init__(self, dataset_config: EvalDatasetConfig, reps: int):
37
    def __init__(self,
                 dataset_config: EvalDatasetConfig,
                 reps: int,
                 concurrency: int,
                 num_passes: int | None = None,
                 adjust_dataset_size: bool = False):
        """Build a handler for a single evaluation dataset.

        Args:
            dataset_config: Dataset configuration (source, id key, filters).
            reps: Number of repetitions of the dataset to evaluate.
            concurrency: Evaluation concurrency used when sizing the dataset.
            num_passes: Number of passes at the given concurrency; when 0 (or
                None) the dataset is instead trimmed to a multiple of the
                concurrency.
            adjust_dataset_size: When True, resize the dataset based on
                ``concurrency``/``num_passes``; mutually exclusive with
                ``reps > 1``.
        """
        # NOTE(review): imported locally — presumably to avoid an import cycle
        # at module load time; confirm.
        from aiq.eval.intermediate_step_adapter import IntermediateStepAdapter

        self.dataset_config = dataset_config
        self.dataset_filter = DatasetFilter(dataset_config.filter)
        self.reps = reps

        # number of passes at specific concurrency
        self.concurrency = concurrency
        self.num_passes = num_passes
        self.adjust_dataset_size = adjust_dataset_size

        # Helpers
        self.intermediate_step_adapter = IntermediateStepAdapter()
44
56
 
@@ -109,6 +121,63 @@ class DatasetHandler:
109
121
 
110
122
  return input_df
111
123
 
124
+ def adjust_dataset(self, input_df: pd.DataFrame) -> pd.DataFrame:
125
+ """
126
+ Adjust the dataset so its length is a multiple of concurrency.
127
+
128
+ If num_passes > 0:
129
+ dataset size is adjusted to concurrency * num_passes
130
+ else:
131
+ dataset size is adjusted to the largest multiple of concurrency
132
+ that is less than or equal to the current dataset size
133
+ """
134
+ if self.concurrency <= 0:
135
+ raise ValueError("Concurrency must be > 0")
136
+
137
+ if self.num_passes < 0:
138
+ raise ValueError("num_passes must be >= 0")
139
+
140
+ original_size = input_df.shape[0]
141
+
142
+ # Calculate target size
143
+ if self.num_passes > 0:
144
+ # When num_passes is specified, always use concurrency * num_passes
145
+ # This respects the user's intent for exact number of passes
146
+ target_size = self.concurrency * self.num_passes
147
+ else:
148
+ # When num_passes = 0, use the largest multiple of concurrency <= original_size
149
+ # If original_size < concurrency, we need at least concurrency rows
150
+ if original_size >= self.concurrency:
151
+ target_size = (original_size // self.concurrency) * self.concurrency
152
+ else:
153
+ target_size = self.concurrency
154
+
155
+ if target_size == 0:
156
+ raise ValueError("Input dataset too small for even one batch at given concurrency.")
157
+
158
+ id_col = self.dataset_config.id_key
159
+
160
+ # If we need more rows than we have, replicate the dataset
161
+ if original_size < target_size:
162
+ # Clean existing _rep suffix if present
163
+ input_df[id_col] = input_df[id_col].astype(str).str.replace(r"_rep\d+$", "", regex=True)
164
+
165
+ # Calculate how many complete copies we need
166
+ copies_needed = math.ceil(target_size / original_size)
167
+
168
+ # Create the replicated dataframe
169
+ replicated_dfs = []
170
+ for i in range(copies_needed):
171
+ df_copy = input_df.copy()
172
+ if i > 0: # Add suffix to all but the first copy
173
+ df_copy[id_col] = df_copy[id_col].astype(str) + f"_rep{i}"
174
+ replicated_dfs.append(df_copy)
175
+
176
+ input_df = pd.concat(replicated_dfs, ignore_index=True)
177
+
178
+ # Return exactly the target size
179
+ return input_df.head(target_size)
180
+
112
181
  def get_eval_input_from_dataset(self, dataset: str) -> EvalInput:
113
182
  # read the dataset and convert it to EvalInput
114
183
 
@@ -127,9 +196,14 @@ class DatasetHandler:
127
196
  input_df = self.dataset_filter.apply_filters(input_df)
128
197
  input_df.drop_duplicates(subset=[self.dataset_config.id_key], inplace=True)
129
198
 
199
+ if self.reps > 1 and self.adjust_dataset_size:
200
+ raise ValueError("reps and adjust_dataset_size are mutually exclusive")
201
+
130
202
  # If more than one repetition is needed, replicate the rows
131
203
  if self.reps > 1:
132
204
  input_df = self.setup_reps(input_df)
205
+ elif self.adjust_dataset_size:
206
+ input_df = self.adjust_dataset(input_df)
133
207
 
134
208
  # Convert the DataFrame to a list of EvalInput objects
135
209
  return self.get_eval_input_from_df(input_df)
aiq/eval/evaluate.py CHANGED
@@ -99,12 +99,34 @@ class EvaluationRun: # pylint: disable=too-many-public-methods
99
99
  max_timestamp = max(step.event_timestamp for step in item.trajectory)
100
100
  runtime = max_timestamp - min_timestamp
101
101
  else:
102
+ min_timestamp = 0.0
103
+ max_timestamp = 0.0
102
104
  runtime = 0.0
103
105
 
106
+ # find llm latency by calculating p95 of all llm calls
107
+ llm_latencies = []
108
+ previous_llm_start_time = None
109
+ for step in steps:
110
+ if step.event_type == "LLM_START":
111
+ previous_llm_start_time = step.event_timestamp
112
+ elif step.event_type == "LLM_END" and previous_llm_start_time is not None:
113
+ llm_latencies.append(step.event_timestamp - previous_llm_start_time)
114
+ previous_llm_start_time = None
115
+
116
+ # Calculate p95 LLM latency (or 0 if no LLM calls)
117
+ if llm_latencies:
118
+ import numpy as np
119
+ llm_latency = float(np.percentile(llm_latencies, 95))
120
+ else:
121
+ llm_latency = 0.0
122
+
104
123
  # add the usage stats to the usage stats dict
105
124
  self.usage_stats.usage_stats_items[item.id] = UsageStatsItem(usage_stats_per_llm=usage_stats_per_llm,
106
125
  runtime=runtime,
107
- total_tokens=total_tokens)
126
+ total_tokens=total_tokens,
127
+ min_timestamp=min_timestamp,
128
+ max_timestamp=max_timestamp,
129
+ llm_latency=llm_latency)
108
130
  return self.usage_stats.usage_stats_items[item.id]
109
131
 
110
132
  async def run_workflow_local(self, session_manager: AIQSessionManager):
@@ -221,7 +243,9 @@ class EvaluationRun: # pylint: disable=too-many-public-methods
221
243
  for input_item in self.eval_input.eval_input_items:
222
244
  all_stats.append(input_item.trajectory)
223
245
 
224
- profiler_runner = ProfilerRunner(self.eval_config.general.profiler, self.eval_config.general.output_dir)
246
+ profiler_runner = ProfilerRunner(self.eval_config.general.profiler,
247
+ self.eval_config.general.output_dir,
248
+ write_output=self.config.write_output)
225
249
 
226
250
  return await profiler_runner.run(all_stats)
227
251
 
@@ -308,6 +332,11 @@ class EvaluationRun: # pylint: disable=too-many-public-methods
308
332
  self.evaluator_output_files.append(output_file)
309
333
  logger.info("Evaluation results written to %s", output_file)
310
334
 
335
+ def publish_output(self, dataset_handler: DatasetHandler, profiler_results: ProfilerResults):
336
+ """Publish the output"""
337
+ if self.config.write_output:
338
+ self.write_output(dataset_handler, profiler_results)
339
+
311
340
  if self.workflow_interrupted:
312
341
  # Issue a warning if the workflow was not completed on all datasets
313
342
  msg = ("Workflow execution was interrupted due to an error. The results may be incomplete. "
@@ -415,7 +444,11 @@ class EvaluationRun: # pylint: disable=too-many-public-methods
415
444
  workflow_interrupted=self.workflow_interrupted,
416
445
  )
417
446
 
418
- dataset_handler = DatasetHandler(dataset_config=dataset_config, reps=self.config.reps)
447
+ dataset_handler = DatasetHandler(dataset_config=dataset_config,
448
+ reps=self.config.reps,
449
+ concurrency=self.eval_config.general.max_concurrency,
450
+ num_passes=self.config.num_passes,
451
+ adjust_dataset_size=self.config.adjust_dataset_size)
419
452
  self.eval_input = dataset_handler.get_eval_input_from_dataset(self.config.dataset)
420
453
  if not self.eval_input.eval_input_items:
421
454
  logger.info("Dataset is empty. Nothing to evaluate.")
@@ -447,8 +480,16 @@ class EvaluationRun: # pylint: disable=too-many-public-methods
447
480
  # Profile the workflow
448
481
  profiler_results = await self.profile_workflow()
449
482
 
450
- # Write the results to the output directory
451
- self.write_output(dataset_handler, profiler_results)
483
+ # compute total runtime
484
+ if self.usage_stats.usage_stats_items:
485
+ self.usage_stats.total_runtime = max(self.usage_stats.usage_stats_items.values(),
486
+ key=lambda x: x.max_timestamp).max_timestamp - \
487
+ min(self.usage_stats.usage_stats_items.values(), key=lambda x: x.min_timestamp).min_timestamp
488
+ else:
489
+ self.usage_stats.total_runtime = 0.0
490
+
491
+ # Publish the results
492
+ self.publish_output(dataset_handler, profiler_results)
452
493
 
453
494
  # Run custom scripts and upload evaluation outputs to S3
454
495
  if self.eval_config.general.output:
@@ -456,8 +497,10 @@ class EvaluationRun: # pylint: disable=too-many-public-methods
456
497
  output_uploader.run_custom_scripts()
457
498
  await output_uploader.upload_directory()
458
499
 
459
- return EvaluationRunOutput(
460
- workflow_output_file=self.workflow_output_file,
461
- evaluator_output_files=self.evaluator_output_files,
462
- workflow_interrupted=self.workflow_interrupted,
463
- )
500
+ return EvaluationRunOutput(workflow_output_file=self.workflow_output_file,
501
+ evaluator_output_files=self.evaluator_output_files,
502
+ workflow_interrupted=self.workflow_interrupted,
503
+ eval_input=self.eval_input,
504
+ evaluation_results=self.evaluation_results,
505
+ usage_stats=self.usage_stats,
506
+ profiler_results=profiler_results)