aiqtoolkit 1.2.0.dev0__py3-none-any.whl → 1.2.0rc1__py3-none-any.whl

This diff compares the contents of two publicly available package versions as released to a supported registry. It is provided for informational purposes only and reflects the packages as they appear in their respective public registries.

Potentially problematic release: this version of aiqtoolkit has been flagged as possibly problematic.

Files changed (220):
  1. aiq/agent/base.py +170 -8
  2. aiq/agent/dual_node.py +1 -1
  3. aiq/agent/react_agent/agent.py +146 -112
  4. aiq/agent/react_agent/prompt.py +1 -6
  5. aiq/agent/react_agent/register.py +36 -35
  6. aiq/agent/rewoo_agent/agent.py +36 -35
  7. aiq/agent/rewoo_agent/register.py +2 -2
  8. aiq/agent/tool_calling_agent/agent.py +3 -7
  9. aiq/agent/tool_calling_agent/register.py +1 -1
  10. aiq/authentication/__init__.py +14 -0
  11. aiq/authentication/api_key/__init__.py +14 -0
  12. aiq/authentication/api_key/api_key_auth_provider.py +92 -0
  13. aiq/authentication/api_key/api_key_auth_provider_config.py +124 -0
  14. aiq/authentication/api_key/register.py +26 -0
  15. aiq/authentication/exceptions/__init__.py +14 -0
  16. aiq/authentication/exceptions/api_key_exceptions.py +38 -0
  17. aiq/authentication/exceptions/auth_code_grant_exceptions.py +86 -0
  18. aiq/authentication/exceptions/call_back_exceptions.py +38 -0
  19. aiq/authentication/exceptions/request_exceptions.py +54 -0
  20. aiq/authentication/http_basic_auth/__init__.py +0 -0
  21. aiq/authentication/http_basic_auth/http_basic_auth_provider.py +81 -0
  22. aiq/authentication/http_basic_auth/register.py +30 -0
  23. aiq/authentication/interfaces.py +93 -0
  24. aiq/authentication/oauth2/__init__.py +14 -0
  25. aiq/authentication/oauth2/oauth2_auth_code_flow_provider.py +107 -0
  26. aiq/authentication/oauth2/oauth2_auth_code_flow_provider_config.py +39 -0
  27. aiq/authentication/oauth2/register.py +25 -0
  28. aiq/authentication/register.py +21 -0
  29. aiq/builder/builder.py +64 -2
  30. aiq/builder/component_utils.py +16 -3
  31. aiq/builder/context.py +37 -0
  32. aiq/builder/eval_builder.py +43 -2
  33. aiq/builder/function.py +44 -12
  34. aiq/builder/function_base.py +1 -1
  35. aiq/builder/intermediate_step_manager.py +6 -8
  36. aiq/builder/user_interaction_manager.py +3 -0
  37. aiq/builder/workflow.py +23 -18
  38. aiq/builder/workflow_builder.py +421 -61
  39. aiq/cli/commands/info/list_mcp.py +103 -16
  40. aiq/cli/commands/sizing/__init__.py +14 -0
  41. aiq/cli/commands/sizing/calc.py +294 -0
  42. aiq/cli/commands/sizing/sizing.py +27 -0
  43. aiq/cli/commands/start.py +2 -1
  44. aiq/cli/entrypoint.py +2 -0
  45. aiq/cli/register_workflow.py +80 -0
  46. aiq/cli/type_registry.py +151 -30
  47. aiq/data_models/api_server.py +124 -12
  48. aiq/data_models/authentication.py +231 -0
  49. aiq/data_models/common.py +35 -7
  50. aiq/data_models/component.py +17 -9
  51. aiq/data_models/component_ref.py +33 -0
  52. aiq/data_models/config.py +60 -3
  53. aiq/data_models/dataset_handler.py +2 -1
  54. aiq/data_models/embedder.py +1 -0
  55. aiq/data_models/evaluate.py +23 -0
  56. aiq/data_models/function_dependencies.py +8 -0
  57. aiq/data_models/interactive.py +10 -1
  58. aiq/data_models/intermediate_step.py +38 -5
  59. aiq/data_models/its_strategy.py +30 -0
  60. aiq/data_models/llm.py +1 -0
  61. aiq/data_models/memory.py +1 -0
  62. aiq/data_models/object_store.py +44 -0
  63. aiq/data_models/profiler.py +1 -0
  64. aiq/data_models/retry_mixin.py +35 -0
  65. aiq/data_models/span.py +187 -0
  66. aiq/data_models/telemetry_exporter.py +2 -2
  67. aiq/embedder/nim_embedder.py +2 -1
  68. aiq/embedder/openai_embedder.py +2 -1
  69. aiq/eval/config.py +19 -1
  70. aiq/eval/dataset_handler/dataset_handler.py +87 -2
  71. aiq/eval/evaluate.py +208 -27
  72. aiq/eval/evaluator/base_evaluator.py +73 -0
  73. aiq/eval/evaluator/evaluator_model.py +1 -0
  74. aiq/eval/intermediate_step_adapter.py +11 -5
  75. aiq/eval/rag_evaluator/evaluate.py +55 -15
  76. aiq/eval/rag_evaluator/register.py +6 -1
  77. aiq/eval/remote_workflow.py +7 -2
  78. aiq/eval/runners/__init__.py +14 -0
  79. aiq/eval/runners/config.py +39 -0
  80. aiq/eval/runners/multi_eval_runner.py +54 -0
  81. aiq/eval/trajectory_evaluator/evaluate.py +22 -65
  82. aiq/eval/tunable_rag_evaluator/evaluate.py +150 -168
  83. aiq/eval/tunable_rag_evaluator/register.py +2 -0
  84. aiq/eval/usage_stats.py +41 -0
  85. aiq/eval/utils/output_uploader.py +10 -1
  86. aiq/eval/utils/weave_eval.py +184 -0
  87. aiq/experimental/__init__.py +0 -0
  88. aiq/experimental/decorators/__init__.py +0 -0
  89. aiq/experimental/decorators/experimental_warning_decorator.py +130 -0
  90. aiq/experimental/inference_time_scaling/__init__.py +0 -0
  91. aiq/experimental/inference_time_scaling/editing/__init__.py +0 -0
  92. aiq/experimental/inference_time_scaling/editing/iterative_plan_refinement_editor.py +147 -0
  93. aiq/experimental/inference_time_scaling/editing/llm_as_a_judge_editor.py +204 -0
  94. aiq/experimental/inference_time_scaling/editing/motivation_aware_summarization.py +107 -0
  95. aiq/experimental/inference_time_scaling/functions/__init__.py +0 -0
  96. aiq/experimental/inference_time_scaling/functions/execute_score_select_function.py +105 -0
  97. aiq/experimental/inference_time_scaling/functions/its_tool_orchestration_function.py +205 -0
  98. aiq/experimental/inference_time_scaling/functions/its_tool_wrapper_function.py +146 -0
  99. aiq/experimental/inference_time_scaling/functions/plan_select_execute_function.py +224 -0
  100. aiq/experimental/inference_time_scaling/models/__init__.py +0 -0
  101. aiq/experimental/inference_time_scaling/models/editor_config.py +132 -0
  102. aiq/experimental/inference_time_scaling/models/its_item.py +48 -0
  103. aiq/experimental/inference_time_scaling/models/scoring_config.py +112 -0
  104. aiq/experimental/inference_time_scaling/models/search_config.py +120 -0
  105. aiq/experimental/inference_time_scaling/models/selection_config.py +154 -0
  106. aiq/experimental/inference_time_scaling/models/stage_enums.py +43 -0
  107. aiq/experimental/inference_time_scaling/models/strategy_base.py +66 -0
  108. aiq/experimental/inference_time_scaling/models/tool_use_config.py +41 -0
  109. aiq/experimental/inference_time_scaling/register.py +36 -0
  110. aiq/experimental/inference_time_scaling/scoring/__init__.py +0 -0
  111. aiq/experimental/inference_time_scaling/scoring/llm_based_agent_scorer.py +168 -0
  112. aiq/experimental/inference_time_scaling/scoring/llm_based_plan_scorer.py +168 -0
  113. aiq/experimental/inference_time_scaling/scoring/motivation_aware_scorer.py +111 -0
  114. aiq/experimental/inference_time_scaling/search/__init__.py +0 -0
  115. aiq/experimental/inference_time_scaling/search/multi_llm_planner.py +128 -0
  116. aiq/experimental/inference_time_scaling/search/multi_query_retrieval_search.py +122 -0
  117. aiq/experimental/inference_time_scaling/search/single_shot_multi_plan_planner.py +128 -0
  118. aiq/experimental/inference_time_scaling/selection/__init__.py +0 -0
  119. aiq/experimental/inference_time_scaling/selection/best_of_n_selector.py +63 -0
  120. aiq/experimental/inference_time_scaling/selection/llm_based_agent_output_selector.py +131 -0
  121. aiq/experimental/inference_time_scaling/selection/llm_based_output_merging_selector.py +159 -0
  122. aiq/experimental/inference_time_scaling/selection/llm_based_plan_selector.py +128 -0
  123. aiq/experimental/inference_time_scaling/selection/threshold_selector.py +58 -0
  124. aiq/front_ends/console/authentication_flow_handler.py +233 -0
  125. aiq/front_ends/console/console_front_end_plugin.py +11 -2
  126. aiq/front_ends/fastapi/auth_flow_handlers/__init__.py +0 -0
  127. aiq/front_ends/fastapi/auth_flow_handlers/http_flow_handler.py +27 -0
  128. aiq/front_ends/fastapi/auth_flow_handlers/websocket_flow_handler.py +107 -0
  129. aiq/front_ends/fastapi/fastapi_front_end_config.py +93 -9
  130. aiq/front_ends/fastapi/fastapi_front_end_controller.py +68 -0
  131. aiq/front_ends/fastapi/fastapi_front_end_plugin.py +14 -1
  132. aiq/front_ends/fastapi/fastapi_front_end_plugin_worker.py +537 -52
  133. aiq/front_ends/fastapi/html_snippets/__init__.py +14 -0
  134. aiq/front_ends/fastapi/html_snippets/auth_code_grant_success.py +35 -0
  135. aiq/front_ends/fastapi/job_store.py +47 -25
  136. aiq/front_ends/fastapi/main.py +2 -0
  137. aiq/front_ends/fastapi/message_handler.py +108 -89
  138. aiq/front_ends/fastapi/step_adaptor.py +2 -1
  139. aiq/llm/aws_bedrock_llm.py +57 -0
  140. aiq/llm/nim_llm.py +2 -1
  141. aiq/llm/openai_llm.py +3 -2
  142. aiq/llm/register.py +1 -0
  143. aiq/meta/pypi.md +12 -12
  144. aiq/object_store/__init__.py +20 -0
  145. aiq/object_store/in_memory_object_store.py +74 -0
  146. aiq/object_store/interfaces.py +84 -0
  147. aiq/object_store/models.py +36 -0
  148. aiq/object_store/register.py +20 -0
  149. aiq/observability/__init__.py +14 -0
  150. aiq/observability/exporter/__init__.py +14 -0
  151. aiq/observability/exporter/base_exporter.py +449 -0
  152. aiq/observability/exporter/exporter.py +78 -0
  153. aiq/observability/exporter/file_exporter.py +33 -0
  154. aiq/observability/exporter/processing_exporter.py +269 -0
  155. aiq/observability/exporter/raw_exporter.py +52 -0
  156. aiq/observability/exporter/span_exporter.py +264 -0
  157. aiq/observability/exporter_manager.py +335 -0
  158. aiq/observability/mixin/__init__.py +14 -0
  159. aiq/observability/mixin/batch_config_mixin.py +26 -0
  160. aiq/observability/mixin/collector_config_mixin.py +23 -0
  161. aiq/observability/mixin/file_mixin.py +288 -0
  162. aiq/observability/mixin/file_mode.py +23 -0
  163. aiq/observability/mixin/resource_conflict_mixin.py +134 -0
  164. aiq/observability/mixin/serialize_mixin.py +61 -0
  165. aiq/observability/mixin/type_introspection_mixin.py +183 -0
  166. aiq/observability/processor/__init__.py +14 -0
  167. aiq/observability/processor/batching_processor.py +316 -0
  168. aiq/observability/processor/intermediate_step_serializer.py +28 -0
  169. aiq/observability/processor/processor.py +68 -0
  170. aiq/observability/register.py +36 -39
  171. aiq/observability/utils/__init__.py +14 -0
  172. aiq/observability/utils/dict_utils.py +236 -0
  173. aiq/observability/utils/time_utils.py +31 -0
  174. aiq/profiler/calc/__init__.py +14 -0
  175. aiq/profiler/calc/calc_runner.py +623 -0
  176. aiq/profiler/calc/calculations.py +288 -0
  177. aiq/profiler/calc/data_models.py +176 -0
  178. aiq/profiler/calc/plot.py +345 -0
  179. aiq/profiler/callbacks/langchain_callback_handler.py +22 -10
  180. aiq/profiler/data_models.py +24 -0
  181. aiq/profiler/inference_metrics_model.py +3 -0
  182. aiq/profiler/inference_optimization/bottleneck_analysis/nested_stack_analysis.py +8 -0
  183. aiq/profiler/inference_optimization/data_models.py +2 -2
  184. aiq/profiler/inference_optimization/llm_metrics.py +2 -2
  185. aiq/profiler/profile_runner.py +61 -21
  186. aiq/runtime/loader.py +9 -3
  187. aiq/runtime/runner.py +23 -9
  188. aiq/runtime/session.py +25 -7
  189. aiq/runtime/user_metadata.py +2 -3
  190. aiq/tool/chat_completion.py +74 -0
  191. aiq/tool/code_execution/README.md +152 -0
  192. aiq/tool/code_execution/code_sandbox.py +151 -72
  193. aiq/tool/code_execution/local_sandbox/.gitignore +1 -0
  194. aiq/tool/code_execution/local_sandbox/local_sandbox_server.py +139 -24
  195. aiq/tool/code_execution/local_sandbox/sandbox.requirements.txt +3 -1
  196. aiq/tool/code_execution/local_sandbox/start_local_sandbox.sh +27 -2
  197. aiq/tool/code_execution/register.py +7 -3
  198. aiq/tool/code_execution/test_code_execution_sandbox.py +414 -0
  199. aiq/tool/mcp/exceptions.py +142 -0
  200. aiq/tool/mcp/mcp_client.py +41 -6
  201. aiq/tool/mcp/mcp_tool.py +3 -2
  202. aiq/tool/register.py +1 -0
  203. aiq/tool/server_tools.py +6 -3
  204. aiq/utils/exception_handlers/automatic_retries.py +289 -0
  205. aiq/utils/exception_handlers/mcp.py +211 -0
  206. aiq/utils/io/model_processing.py +28 -0
  207. aiq/utils/log_utils.py +37 -0
  208. aiq/utils/string_utils.py +38 -0
  209. aiq/utils/type_converter.py +18 -2
  210. aiq/utils/type_utils.py +87 -0
  211. {aiqtoolkit-1.2.0.dev0.dist-info → aiqtoolkit-1.2.0rc1.dist-info}/METADATA +53 -21
  212. aiqtoolkit-1.2.0rc1.dist-info/RECORD +436 -0
  213. {aiqtoolkit-1.2.0.dev0.dist-info → aiqtoolkit-1.2.0rc1.dist-info}/WHEEL +1 -1
  214. {aiqtoolkit-1.2.0.dev0.dist-info → aiqtoolkit-1.2.0rc1.dist-info}/entry_points.txt +3 -0
  215. aiq/front_ends/fastapi/websocket.py +0 -148
  216. aiq/observability/async_otel_listener.py +0 -429
  217. aiqtoolkit-1.2.0.dev0.dist-info/RECORD +0 -316
  218. {aiqtoolkit-1.2.0.dev0.dist-info → aiqtoolkit-1.2.0rc1.dist-info}/licenses/LICENSE-3rd-party.txt +0 -0
  219. {aiqtoolkit-1.2.0.dev0.dist-info → aiqtoolkit-1.2.0rc1.dist-info}/licenses/LICENSE.md +0 -0
  220. {aiqtoolkit-1.2.0.dev0.dist-info → aiqtoolkit-1.2.0rc1.dist-info}/top_level.txt +0 -0
The largest new file in this release, aiq/profiler/calc/calc_runner.py (entry 175 above, +623 -0), is shown in full below:

@@ -0,0 +1,623 @@
+ # SPDX-FileCopyrightText: Copyright (c) 2025, NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+ # SPDX-License-Identifier: Apache-2.0
+ #
+ # Licensed under the Apache License, Version 2.0 (the "License");
+ # you may not use this file except in compliance with the License.
+ # You may obtain a copy of the License at
+ #
+ # http://www.apache.org/licenses/LICENSE-2.0
+ #
+ # Unless required by applicable law or agreed to in writing, software
+ # distributed under the License is distributed on an "AS IS" BASIS,
+ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ # See the License for the specific language governing permissions and
+ # limitations under the License.
+
+ import copy
+ import logging
+ import shutil
+ import time
+ import uuid
+ from pathlib import Path
+
+ from pydantic import ValidationError
+
+ from aiq.eval.config import EvaluationRunConfig
+ from aiq.eval.runners.config import MultiEvaluationRunConfig
+ from aiq.eval.runners.multi_eval_runner import MultiEvaluationRunner
+ from aiq.profiler.calc.calculations import LinearFitResult
+ from aiq.profiler.calc.calculations import calc_gpu_estimate_based_on_slope
+ from aiq.profiler.calc.calculations import calc_gpu_estimate_for_single_concurrency
+ from aiq.profiler.calc.calculations import compute_slope
+ from aiq.profiler.calc.data_models import CalcAlerts
+ from aiq.profiler.calc.data_models import CalcData
+ from aiq.profiler.calc.data_models import CalcRunnerConfig
+ from aiq.profiler.calc.data_models import CalcRunnerOutput
+ from aiq.profiler.calc.data_models import FitConfig
+ from aiq.profiler.calc.data_models import GPUEstimates
+ from aiq.profiler.calc.data_models import SizingMetricPerItem
+ from aiq.profiler.calc.data_models import SizingMetrics
+ from aiq.profiler.calc.data_models import SizingMetricsAlerts
+
+ logger = logging.getLogger(__name__)
+
+
+ class LinearFitAnalyzer:
+     """Handles linear regression analysis for concurrency vs. time metrics."""
+
+     def __init__(self, fit_config: FitConfig):
+         self.fit_config = fit_config
+         self.llm_latency_fit: LinearFitResult | None = None
+         self.wf_runtime_fit: LinearFitResult | None = None
+
+     def analyze_metrics(self, sizing_metrics_per_concurrency: dict[int, SizingMetrics]) -> dict[int, CalcAlerts]:
+         """
+         Analyze metrics and return alerts, including outlier information.
+
+         Returns:
+             dict[int, CalcAlerts]: Alerts per concurrency, including outlier flags
+         """
+         alerts_per_concurrency = {}
+
+         # Need at least 2 points for linear regression
+         if len(sizing_metrics_per_concurrency) < 2:
+             logger.warning("Need at least 2 concurrencies for linear analysis")
+             # Return empty alerts for all concurrencies
+             for concurrency in sizing_metrics_per_concurrency.keys():
+                 alerts_per_concurrency[concurrency] = CalcAlerts()
+             return alerts_per_concurrency
+
+         # Calculate linear fits
+         concurrencies = list(sizing_metrics_per_concurrency.keys())
+         latencies = [run.llm_latency_p95 for run in sizing_metrics_per_concurrency.values()]
+         try:
+             self.llm_latency_fit = compute_slope(concurrencies, latencies, self.fit_config)
+             logger.info("Computed latency fit: slope=%.4f, R²=%.3f",
+                         self.llm_latency_fit.slope,
+                         self.llm_latency_fit.r_squared)
+         except ValueError as e:
+             logger.warning("Failed to compute latency fit: %s", e)
+             self.llm_latency_fit = None
+
+         runtimes = [run.workflow_runtime_p95 for run in sizing_metrics_per_concurrency.values()]
+         try:
+             self.wf_runtime_fit = compute_slope(concurrencies, runtimes, self.fit_config)
+             logger.info("Computed runtime fit: slope=%.4f, R²=%.3f",
+                         self.wf_runtime_fit.slope,
+                         self.wf_runtime_fit.r_squared)
+         except ValueError as e:
+             logger.warning("Failed to compute runtime fit: %s", e)
+             self.wf_runtime_fit = None
+
+         # Add outlier information to the alerts
+         for concurrency in sizing_metrics_per_concurrency.keys():
+             alerts = CalcAlerts()
+
+             # Check for latency outliers
+             if self.llm_latency_fit and concurrency in self.llm_latency_fit.outliers_removed:
+                 alerts.outlier_llm_latency = True
+
+             # Check for runtime outliers
+             if self.wf_runtime_fit and concurrency in self.wf_runtime_fit.outliers_removed:
+                 alerts.outlier_workflow_runtime = True
+
+             alerts_per_concurrency[concurrency] = alerts
+
+         return alerts_per_concurrency
+
+
+ class CalcRunner:
+     """
+     Calculator for GPU sizing based on concurrency vs. time metrics.
+     """
+
+     def __init__(self, config: CalcRunnerConfig):
+         """
+         Initialize the CalcRunner with a config file and a list of concurrencies.
+         """
+         self.config = config
+
+         # Sizing metrics per concurrency, collected from the evaluation runs.
+         # This is used as input to calculate the GPU estimates and alerts.
+         self.metrics_per_concurrency: dict[int, SizingMetrics] = {}
+
+         self.valid_concurrencies: list = []
+
+         # GPU estimates and alerts
+         self.gpu_estimates_per_concurrency: dict[int, GPUEstimates] = {}
+         self.alerts_per_concurrency: dict[int, CalcAlerts] = {}
+
+         # Linear fit analyzer for outlier detection and trend analysis
+         self.linear_analyzer = LinearFitAnalyzer(self.config.fit_config)
+
+         # Validate configuration
+         self.validate_config()
+
+     def validate_config(self) -> None:
+         """
+         Validate the configuration parameters.
+         Raises ValueError if the configuration is invalid.
+         """
+         # At least two concurrencies are needed to estimate the GPU count
+         if len(self.config.concurrencies) < 2:
+             raise ValueError("At least two concurrencies are needed to estimate the GPU count.")
+
+         # If the same value is repeated in the concurrencies list, raise an error
+         if len(self.config.concurrencies) != len(set(self.config.concurrencies)):
+             raise ValueError("Concurrencies list contains duplicate values.")
+
+         # Every concurrency value has to be greater than 0
+         if any(concurrency <= 0 for concurrency in self.config.concurrencies):
+             raise ValueError("Concurrencies list contains values less than or equal to 0.")
+
+         if self.config.offline_mode:
+             # In offline mode, target test parameters are needed to estimate the GPU count
+             if self.target_llm_latency <= 0 and self.target_wf_runtime <= 0:
+                 raise ValueError("Both target_llm_latency and target_workflow_runtime are 0. "
+                                  "Cannot estimate the GPU count in offline mode.")
+             if self.test_gpu_count <= 0:
+                 raise ValueError("Test GPU count is 0. Cannot estimate the GPU count in offline mode.")
+             if self.target_users <= 0:
+                 raise ValueError("Target users is 0. Cannot estimate the GPU count in offline mode.")
+             if self.append_job:
+                 raise ValueError("Appending jobs is not supported in offline mode.")
+             if not self.config.output_dir:
+                 raise ValueError("Output directory is required in offline mode.")
+         else:
+             # Online mode validation
+             if not self.config.config_file:
+                 raise ValueError("Config file is required in online mode.")
+             if self.target_llm_latency <= 0 and self.target_wf_runtime <= 0:
+                 logger.warning("Both target_llm_latency and target_workflow_runtime are 0. "
+                                "No SLA will be enforced.")
+             if self.test_gpu_count <= 0:
+                 logger.warning("Test GPU count is 0. Tests will be run but the GPU count will not be estimated.")
+             if self.target_users <= 0:
+                 logger.warning("Target users is 0. Tests will be run but the GPU count will not be estimated.")
+
+     @property
+     def target_llm_latency(self) -> float:
+         return self.config.target_llm_latency_p95
+
+     @property
+     def target_wf_runtime(self) -> float:
+         return self.config.target_workflow_runtime_p95
+
+     @property
+     def target_users(self) -> int:
+         return self.config.target_users
+
+     @property
+     def test_gpu_count(self) -> int:
+         return self.config.test_gpu_count
+
+     @property
+     def append_job(self) -> bool:
+         return self.config.append_job
+
+     @property
+     def output_dir(self) -> Path:
+         return self.config.output_dir
+
+     def _calc_gpu_estimates_based_on_slope(self,
+                                            sizing_metrics_per_concurrency: dict[int, SizingMetrics],
+                                            use_latency: bool,
+                                            use_runtime: bool) -> GPUEstimates:
+         """
+         Calculate GPU estimates based on the linear fit results.
+         """
+         gpu_estimate_by_wf_runtime = None
+         gpu_estimate_by_llm_latency = None
+
+         if use_runtime and self.linear_analyzer.wf_runtime_fit:
+             fit = self.linear_analyzer.wf_runtime_fit
+             gpu_estimate_by_wf_runtime = calc_gpu_estimate_based_on_slope(target_time_metric=self.target_wf_runtime,
+                                                                           target_users=self.target_users,
+                                                                           test_gpu_count=self.test_gpu_count,
+                                                                           observed_slope=fit.slope,
+                                                                           observed_intercept=fit.intercept)
+             logger.info(
+                 "[GPU Estimation %s] Runtime slope=%.4f, intercept=%.4f, R²=%.3f, outliers_removed=%s, estimate=%.2f",
+                 "offline" if self.config.offline_mode else "online",
+                 fit.slope,
+                 fit.intercept,
+                 fit.r_squared,
+                 fit.outliers_removed,
+                 gpu_estimate_by_wf_runtime)
+
+         if use_latency and self.linear_analyzer.llm_latency_fit:
+             fit = self.linear_analyzer.llm_latency_fit
+             gpu_estimate_by_llm_latency = calc_gpu_estimate_based_on_slope(target_time_metric=self.target_llm_latency,
+                                                                            target_users=self.target_users,
+                                                                            test_gpu_count=self.test_gpu_count,
+                                                                            observed_slope=fit.slope,
+                                                                            observed_intercept=fit.intercept)
+             logger.info(
+                 "[GPU Estimation %s] Latency slope=%.4f, intercept=%.4f, R²=%.3f, outliers_removed=%s, estimate=%.2f",
+                 "offline" if self.config.offline_mode else "online",
+                 fit.slope,
+                 fit.intercept,
+                 fit.r_squared,
+                 fit.outliers_removed,
+                 gpu_estimate_by_llm_latency)
+
+         return GPUEstimates(gpu_estimate_by_wf_runtime=gpu_estimate_by_wf_runtime,
+                             gpu_estimate_by_llm_latency=gpu_estimate_by_llm_latency)
+
+     def _calc_gpu_estimates_per_concurrency(self, sizing_metrics_per_concurrency: dict[int, SizingMetrics]):
+         """Calculate per-concurrency GPU estimates and update the existing alerts."""
+         use_latency = self.target_llm_latency > 0
+         use_runtime = self.target_wf_runtime > 0
+
+         logger.info("Calculating per-concurrency metrics for %d concurrencies", len(sizing_metrics_per_concurrency))
+         logger.info("Target users: %d, Test GPU count: %d", self.target_users, self.test_gpu_count)
+         logger.info("Using targets - Latency: %s, Runtime: %s",
+                     "Yes" if use_latency else "No",
+                     "Yes" if use_runtime else "No")
+
+         for concurrency, metrics_per_concurrency in sizing_metrics_per_concurrency.items():
+             observed_latency = metrics_per_concurrency.llm_latency_p95
+             observed_runtime = metrics_per_concurrency.workflow_runtime_p95
+
+             # Get ROUGH GPU estimates per concurrency. These are not used for the final GPU estimation;
+             # they are only provided for informational purposes.
+             gpu_estimates = calc_gpu_estimate_for_single_concurrency(target_llm_latency=self.target_llm_latency,
+                                                                      target_workflow_runtime=self.target_wf_runtime,
+                                                                      target_users=self.target_users,
+                                                                      test_concurrency=concurrency,
+                                                                      test_gpu_count=self.test_gpu_count,
+                                                                      observed_latency=observed_latency,
+                                                                      observed_runtime=observed_runtime)
+
+             # Store the GPU estimates directly (no need to reconstruct the same object)
+             self.gpu_estimates_per_concurrency[concurrency] = gpu_estimates
+
+             # Count out-of-range items based on per-item metrics (only if targets are specified)
+             num_items_greater_than_target_latency = 0
+             num_items_greater_than_target_runtime = 0
+
+             if (use_latency or use_runtime) and metrics_per_concurrency.per_item_metrics:
+                 for item_metrics in metrics_per_concurrency.per_item_metrics.values():
+                     if use_latency and item_metrics.llm_latency > self.target_llm_latency:
+                         num_items_greater_than_target_latency += 1
+                     if use_runtime and item_metrics.workflow_runtime > self.target_wf_runtime:
+                         num_items_greater_than_target_runtime += 1
+             else:
+                 logger.debug("Skipping per-item processing for concurrency %d (no targets or no per-item data)",
+                              concurrency)
+
+             # Update the existing alerts with the out-of-range data
+             existing_alerts = self.alerts_per_concurrency.get(concurrency, CalcAlerts())
+             existing_alerts.num_items_greater_than_target_latency = num_items_greater_than_target_latency
+             existing_alerts.num_items_greater_than_target_runtime = num_items_greater_than_target_runtime
+             self.alerts_per_concurrency[concurrency] = existing_alerts
+
+             # Use %s for the estimate: it may be None when no runtime target is set
+             logger.debug("Concurrency %d: GPU estimate=%s, out-of-range items=%d",
+                          concurrency,
+                          gpu_estimates.gpu_estimate_by_wf_runtime,
+                          num_items_greater_than_target_latency + num_items_greater_than_target_runtime)
+
+         logger.info("Completed per-concurrency calculations:")
+         logger.info(" - GPU estimates calculated for %d concurrencies", len(self.gpu_estimates_per_concurrency))
+
+     def _validate_gpu_estimation_parameters(self, use_latency: bool, use_runtime: bool) -> bool:
+         """Validate the parameters required for GPU estimation."""
+         if self.target_users <= 0:
+             logger.warning("Target users must be greater than 0 for GPU estimation")
+             return False
+
+         if self.test_gpu_count <= 0:
+             logger.warning("Test GPU count must be greater than 0 for GPU estimation")
+             return False
+
+         if not use_latency and not use_runtime:
+             logger.warning("No target time metrics specified")
+             return False
+
+         return True
+
+     def _validate_metrics_data(self, sizing_metrics_per_concurrency: dict) -> dict:
+         """Validate and filter the metrics data."""
+         valid_metrics = {}
+         for concurrency, metrics in sizing_metrics_per_concurrency.items():
+             if not metrics or not metrics.llm_latency_p95 or not metrics.workflow_runtime_p95:
+                 logger.warning("Invalid metrics for concurrency %d: missing required fields", concurrency)
+                 continue
+             valid_metrics[concurrency] = metrics
+         return valid_metrics
+
+     def _calc_fit_and_gpu_estimate(self, sizing_metrics_per_concurrency: dict[int, SizingMetrics]) -> GPUEstimates:
+         """
+         Estimate the GPU count needed to meet the target latency and/or workflow runtime SLA
+         for a given target user load.
+
+         Returns:
+             GPUEstimates based on the slope of time vs. concurrency. As a side effect,
+             the rough per-concurrency GPU estimates and the per-concurrency alerts
+             (outliers, etc.) are stored on the instance.
+         """
+         gpu_estimates = GPUEstimates()
+         # Filter out concurrencies that are missing required metrics
+         valid_metrics = self._validate_metrics_data(sizing_metrics_per_concurrency)
+         if not valid_metrics:
+             logger.warning("No valid metrics found for metrics calculation")
+             return gpu_estimates
+
+         # Filter out concurrencies that were interrupted
+         valid_runs = {
+             concurrency: metrics
+             for concurrency, metrics in valid_metrics.items() if not metrics.alerts.workflow_interrupted
+         }
+         if not valid_runs:
+             logger.warning("No valid runs found for slope-based estimation")
+             return gpu_estimates
+
+         self.valid_concurrencies = list(valid_runs.keys())
+
+         # Perform linear analysis on the valid runs; this is done even if GPU estimation is skipped
+         self.alerts_per_concurrency = self.linear_analyzer.analyze_metrics(valid_runs)
+
+         # Validate the GPU estimation parameters
+         use_latency = self.target_llm_latency > 0
+         use_runtime = self.target_wf_runtime > 0
+         if not self._validate_gpu_estimation_parameters(use_latency, use_runtime):
+             return gpu_estimates
+
+         logger.info("Starting GPU estimation with %d concurrencies", len(valid_metrics))
+         logger.info("Target users: %d, Test GPU count: %d", self.target_users, self.test_gpu_count)
+         logger.info("Target latency: %.3fs, Target runtime: %.3fs",
+                     self.target_llm_latency if self.target_llm_latency > 0 else 0,
+                     self.target_wf_runtime if self.target_wf_runtime > 0 else 0)
+
+         # Calculate GPU estimates per concurrency
+         self._calc_gpu_estimates_per_concurrency(valid_runs)
+
+         # Calculate the overall GPU estimates using the linear fits
+         gpu_estimates = self._calc_gpu_estimates_based_on_slope(valid_runs, use_latency, use_runtime)
+
+         return gpu_estimates
+
+     def generate_calc_runner_output(self) -> CalcRunnerOutput:
+         """
+         Build the CalcRunnerOutput from the sizing metrics per concurrency.
+         """
+         if not self.metrics_per_concurrency:
+             logger.warning("No metrics per concurrency found. Skipping generation of CalcRunnerOutput.")
+             return CalcRunnerOutput()
+
+         logger.info("Building CalcRunnerOutput from %d concurrency metrics", len(self.metrics_per_concurrency))
+
+         # Calculate the GPU estimates and per-concurrency metrics
+         gpu_estimates = self._calc_fit_and_gpu_estimate(self.metrics_per_concurrency)
+
+         # Group per-concurrency data (inputs to the calculator and outputs from the calculator)
+         calc_data = {}
+         for concurrency in self.metrics_per_concurrency.keys():
+             # Inputs to the calculator
+             tmp_sizing_metrics = self.metrics_per_concurrency[concurrency]
+             # Outputs from the calculator
+             tmp_gpu_estimates = self.gpu_estimates_per_concurrency.get(concurrency, GPUEstimates())
+             tmp_alerts = self.alerts_per_concurrency.get(concurrency, CalcAlerts())
+
+             calc_data[concurrency] = CalcData(gpu_estimates=tmp_gpu_estimates,
+                                               alerts=tmp_alerts,
+                                               sizing_metrics=tmp_sizing_metrics)
+
+         if gpu_estimates.gpu_estimate_by_wf_runtime is not None:
+             logger.info("GPU estimate by workflow runtime: %.2f", gpu_estimates.gpu_estimate_by_wf_runtime)
+         if gpu_estimates.gpu_estimate_by_llm_latency is not None:
+             logger.info("GPU estimate by LLM latency: %.2f", gpu_estimates.gpu_estimate_by_llm_latency)
+
+         return CalcRunnerOutput(gpu_estimates=gpu_estimates, calc_data=calc_data)
+
+     def plot_concurrency_vs_time_metrics(self, output_dir: Path):
+         """Plot concurrency vs. time metrics using the pre-computed fits."""
+         from aiq.profiler.calc.plot import plot_concurrency_vs_time_metrics as plot_metrics
+
+         # Only plot if we have valid metrics and at least one fit
+         if not self.metrics_per_concurrency:
+             logger.warning("No metrics available for plotting")
+             return
+
+         # Filter to only the valid runs for plotting
+         valid_runs = {
+             concurrency: metrics
+             for concurrency, metrics in self.metrics_per_concurrency.items() if concurrency in self.valid_concurrencies
+         }
+
+         if not valid_runs:
+             logger.warning("No valid runs available for plotting")
+             return
+
+         try:
+             plot_metrics(
+                 metrics_per_concurrency=valid_runs,  # Only valid runs
+                 output_dir=output_dir,
+                 target_llm_latency=self.target_llm_latency,
+                 target_runtime=self.target_wf_runtime,
+                 llm_latency_fit=self.linear_analyzer.llm_latency_fit,  # May be None
+                 runtime_fit=self.linear_analyzer.wf_runtime_fit  # May be None
+             )
+         except Exception:
+             # logger.exception already records the traceback, so exc_info is not needed
+             logger.exception("Failed to plot concurrency vs. time metrics")
+             logger.warning("Skipping plot of concurrency vs. time metrics")
+
+     def write_output(self, output_dir: Path, calc_runner_output: CalcRunnerOutput):
+         """
+         Write the output to the output directory.
+         """
+         if not output_dir:
+             logger.warning("Output directory is not set. Skipping write.")
+             return
+
+         mode = "offline" if self.config.offline_mode else "online"
+         subdir = output_dir / mode
+
+         if self.append_job:
+             job_dir = subdir / f"job_{uuid.uuid4()}"
+         else:
+             # Clear all previous jobs when not in append mode
+             existing_jobs = list(subdir.glob("job_*"))
+             if existing_jobs:
+                 logger.info("Clearing %d existing jobs", len(existing_jobs))
+                 for job in existing_jobs:
+                     if job.is_dir():
+                         shutil.rmtree(job)
+             # Use timestamp-based naming
+             job_dir = subdir / f"job_{int(time.time())}"
+
+         job_dir.mkdir(parents=True, exist_ok=True)
+
+         if self.config.plot_data:
+             self.plot_concurrency_vs_time_metrics(job_dir)
+
+         output_path = job_dir / "calc_runner_output.json"
+         output_path.write_text(calc_runner_output.model_dump_json(indent=2))
+         logger.info("Wrote output to %s", job_dir)
+
+     def run_offline(self) -> CalcRunnerOutput:
+         """
+         Run in offline mode.
+         1. Read previous online-mode jobs and build sizing metrics per concurrency
+         2. Calculate GPU estimates
+         3. Write the output to the offline subdirectory
+         """
+         # Read all online-mode jobs and only append unique concurrency values to metrics_per_concurrency
+         online_dir = Path(self.config.output_dir) / "online"
+         if not online_dir.exists():
+             logger.warning("Online directory %s does not exist. Skipping offline mode.", online_dir)
+             return CalcRunnerOutput()
+
+         # Get all job directories and sort by modification time (most recent first)
+         job_dirs = [job_dir for job_dir in online_dir.iterdir() if job_dir.is_dir() and job_dir.name.startswith("job_")]
+         job_dirs.sort(key=lambda x: x.stat().st_mtime, reverse=True)
+
+         logger.info("Found %d job directories, processing from most recent to oldest", len(job_dirs))
+
+         for job_dir in job_dirs:
+             calc_runner_output_path = job_dir / "calc_runner_output.json"
+             if not calc_runner_output_path.exists():
+                 logger.warning("Calc runner output file %s does not exist. Skipping job %s.",
+                                calc_runner_output_path,
+                                job_dir.name)
+                 continue
+             try:
+                 calc_output = CalcRunnerOutput.model_validate_json(calc_runner_output_path.read_text())
+             except ValidationError:
+                 # logger.exception records the traceback, so the exception is not passed as an argument
+                 logger.exception("Failed to validate calc runner output file %s. Skipping job %s.",
+                                  calc_runner_output_path,
+                                  job_dir.name)
+                 continue
+
+             # Extract the sizing metrics from calc_data
+             for concurrency, data in calc_output.calc_data.items():
+                 metrics = data.sizing_metrics
+                 if concurrency not in self.metrics_per_concurrency:
+                     logger.info("Adding concurrency %s from job %s (most recent available).", concurrency, job_dir.name)
+                     logger.info("Sizing metrics: %s", metrics)
+                     self.metrics_per_concurrency[concurrency] = metrics
+                 else:
+                     # Skip since we already have this concurrency from a more recent job
+                     logger.debug("Concurrency %s already exists from a more recent job. Skipping job %s.",
+                                  concurrency,
+                                  job_dir.name)
+
+         # Calculate the GPU estimates
+         calc_runner_output = self.generate_calc_runner_output()
+
+         # Write the offline output
+         self.write_output(self.config.output_dir, calc_runner_output)
+
+         return calc_runner_output
+
+     async def run_online(self) -> CalcRunnerOutput:
+         """
+         Run in online mode by creating a MultiEvaluationRunner with concurrency overrides.
+         1. Run the workflow
+         2. Build sizing metrics per concurrency from the profiler results and usage stats
+         3. Calculate GPU estimates
+         4. Write the output to the online subdirectory
+         """
+         # Keys used to override the concurrency and alias in the config
+         concurrency_key = "eval.general.max_concurrency"
+         alias_key = "eval.general.workflow_alias"
+         # Ensure the profiler base metrics are enabled via overrides
+         profiler_base_metrics_key = "eval.general.profiler.base_metrics"
+
+         # Set up the base config
+         eval_run_config = EvaluationRunConfig(config_file=self.config.config_file,
+                                               adjust_dataset_size=True,
+                                               num_passes=self.config.num_passes,
+                                               endpoint=self.config.endpoint,
+                                               endpoint_timeout=self.config.endpoint_timeout)
+
+         # Create a copy of the base config and apply the overrides for each concurrency
+         configs = {}
+         for concurrency in self.config.concurrencies:
+             config = copy.deepcopy(eval_run_config)
+             override = ((concurrency_key, str(concurrency)), (alias_key, "wf_concurrency_" + str(concurrency)),
+                         (profiler_base_metrics_key, "true"))
+             config.override = override
+             configs[concurrency] = config
+
+         # Instantiate the multi-evaluation run config with the overrides for each concurrency
+         config = MultiEvaluationRunConfig(configs=configs)
+
+         # Instantiate and run the multi-evaluation runner
+         runner = MultiEvaluationRunner(config)
+         evaluation_run_outputs = await runner.run_all()
+         if not evaluation_run_outputs:
+             logger.warning("No evaluation run outputs found. Skipping online mode.")
+             return CalcRunnerOutput()
+
+         # Calculate the sizing metrics per concurrency.
+         # If the workflow was interrupted, the metrics are not eligible for slope-based GPU estimation.
+         for concurrency, eval_output in evaluation_run_outputs.items():
+             profiler_results = eval_output.profiler_results
+             usage_stats = eval_output.usage_stats
+             workflow_interrupted = eval_output.workflow_interrupted
+
+             per_item_metrics = {
+                 item_id:
+                     SizingMetricPerItem(llm_latency=item_metrics.llm_latency, workflow_runtime=item_metrics.runtime)
+                 for item_id, item_metrics in usage_stats.usage_stats_items.items()
+             }
+
+             # Extract the p95 time metrics from the profiler results (0 when unavailable)
+             llm_latency_p95 = profiler_results.llm_latency_ci.p95 \
+                 if profiler_results.llm_latency_ci else 0
+             workflow_runtime_p95 = profiler_results.workflow_runtime_metrics.p95 \
+                 if profiler_results.workflow_runtime_metrics else 0
+             self.metrics_per_concurrency[concurrency] = SizingMetrics(
+                 llm_latency_p95=llm_latency_p95,
+                 workflow_runtime_p95=workflow_runtime_p95,
+                 total_runtime=usage_stats.total_runtime,
+                 per_item_metrics=per_item_metrics,
+                 alerts=SizingMetricsAlerts(workflow_interrupted=workflow_interrupted))
+
+         # Calculate the GPU estimates
+         calc_runner_output = self.generate_calc_runner_output()
+
+         # Plot the metrics and write the output
+         self.write_output(self.config.output_dir, calc_runner_output)
+
+         return calc_runner_output
+
+     async def run(self) -> CalcRunnerOutput:
+         """
+         Online mode:
+         1. Run the workflow
+         2. Collect profiler results and usage stats
+         3. Calculate GPU estimates
+         4. Write the output to the online subdirectory
+
+         Offline mode:
+         1. Read previous online-mode jobs and only append unique concurrency values to metrics_per_concurrency
+         2. Calculate GPU estimates
+         3. Write the output to the offline subdirectory
+         """
+         if self.config.offline_mode:
+             return self.run_offline()
+         else:
+             return await self.run_online()
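
A note on the math: the fit and estimation helpers (compute_slope, calc_gpu_estimate_based_on_slope, calc_gpu_estimate_for_single_concurrency) live in aiq/profiler/calc/calculations.py (entry 176 above), which this diff does not show. For intuition only, the runner treats each p95 time metric as roughly linear in concurrency. A self-contained ordinary-least-squares sketch of that kind of fit, which is NOT the library's compute_slope (that function also takes a FitConfig and reports removed outliers via a LinearFitResult), could look like this:

# Hypothetical stand-in for the kind of fit compute_slope performs.
# The real implementation is in aiq.profiler.calc.calculations (not in this diff);
# this is only an ordinary least-squares sketch for intuition.
def linear_fit(xs: list[float], ys: list[float]) -> tuple[float, float, float]:
    """Return (slope, intercept, r_squared) for y ~ slope * x + intercept."""
    n = len(xs)
    if n < 2:
        raise ValueError("Need at least 2 points for linear regression")
    mean_x = sum(xs) / n
    mean_y = sum(ys) / n
    sxx = sum((x - mean_x) ** 2 for x in xs)
    sxy = sum((x - mean_x) * (y - mean_y) for x, y in zip(xs, ys))
    slope = sxy / sxx
    intercept = mean_y - slope * mean_x
    ss_res = sum((y - (slope * x + intercept)) ** 2 for x, y in zip(xs, ys))
    ss_tot = sum((y - mean_y) ** 2 for y in ys)
    r_squared = 1.0 - ss_res / ss_tot if ss_tot > 0 else 1.0
    return slope, intercept, r_squared

# e.g. p95 latency measured at concurrencies 1, 2, 4, 8:
slope, intercept, r2 = linear_fit([1, 2, 4, 8], [0.8, 1.1, 1.9, 3.4])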
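For completeness, here is a minimal, hypothetical way to drive the runner programmatically in online mode. The field names follow what CalcRunner reads off its config in the file above, but CalcRunnerConfig's actual constructor and defaults are defined in aiq/profiler/calc/data_models.py (entry 177), which is not shown in this diff, so treat this strictly as a sketch:

# Hypothetical usage sketch; field names are taken from what CalcRunner reads
# off its config above, but CalcRunnerConfig's real signature is not in this diff.
# Fields such as endpoint, endpoint_timeout, fit_config, plot_data and append_job
# may also be required depending on their defaults in data_models.py.
import asyncio
from pathlib import Path

from aiq.profiler.calc.calc_runner import CalcRunner
from aiq.profiler.calc.data_models import CalcRunnerConfig

config = CalcRunnerConfig(
    config_file=Path("workflow_config.yml"),  # workflow/eval config (required online)
    concurrencies=[1, 2, 4, 8],               # at least two distinct positive values
    num_passes=1,
    offline_mode=False,
    output_dir=Path("./sizing_results"),      # results land in <output_dir>/online/job_*/
    target_llm_latency_p95=2.0,               # seconds; 0 disables this SLA
    target_workflow_runtime_p95=10.0,         # seconds; 0 disables this SLA
    target_users=100,
    test_gpu_count=8,
)

output = asyncio.run(CalcRunner(config).run())
print(output.gpu_estimates)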
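Continuing the same sketch, offline mode re-reads the calc_runner_output.json files written by earlier online jobs (the most recent job wins per concurrency), so the SLA targets can be tightened without re-running the workflow:

# Hypothetical re-analysis of earlier online runs with tighter SLA targets,
# under the same assumptions about CalcRunnerConfig as the sketch above.
offline_config = CalcRunnerConfig(
    concurrencies=[1, 2, 4, 8],
    offline_mode=True,
    output_dir=Path("./sizing_results"),  # must contain an online/ subdirectory
    target_llm_latency_p95=1.5,
    target_workflow_runtime_p95=8.0,
    target_users=100,
    test_gpu_count=8,
    append_job=False,                     # appending jobs is rejected in offline mode
)
offline_output = asyncio.run(CalcRunner(offline_config).run())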