triton-model-analyzer 1.48.0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (204)
  1. model_analyzer/__init__.py +15 -0
  2. model_analyzer/analyzer.py +448 -0
  3. model_analyzer/cli/__init__.py +15 -0
  4. model_analyzer/cli/cli.py +193 -0
  5. model_analyzer/config/__init__.py +15 -0
  6. model_analyzer/config/generate/__init__.py +15 -0
  7. model_analyzer/config/generate/automatic_model_config_generator.py +164 -0
  8. model_analyzer/config/generate/base_model_config_generator.py +352 -0
  9. model_analyzer/config/generate/brute_plus_binary_parameter_search_run_config_generator.py +164 -0
  10. model_analyzer/config/generate/brute_run_config_generator.py +154 -0
  11. model_analyzer/config/generate/concurrency_sweeper.py +75 -0
  12. model_analyzer/config/generate/config_generator_interface.py +52 -0
  13. model_analyzer/config/generate/coordinate.py +143 -0
  14. model_analyzer/config/generate/coordinate_data.py +86 -0
  15. model_analyzer/config/generate/generator_utils.py +116 -0
  16. model_analyzer/config/generate/manual_model_config_generator.py +187 -0
  17. model_analyzer/config/generate/model_config_generator_factory.py +92 -0
  18. model_analyzer/config/generate/model_profile_spec.py +74 -0
  19. model_analyzer/config/generate/model_run_config_generator.py +154 -0
  20. model_analyzer/config/generate/model_variant_name_manager.py +150 -0
  21. model_analyzer/config/generate/neighborhood.py +536 -0
  22. model_analyzer/config/generate/optuna_plus_concurrency_sweep_run_config_generator.py +141 -0
  23. model_analyzer/config/generate/optuna_run_config_generator.py +838 -0
  24. model_analyzer/config/generate/perf_analyzer_config_generator.py +312 -0
  25. model_analyzer/config/generate/quick_plus_concurrency_sweep_run_config_generator.py +130 -0
  26. model_analyzer/config/generate/quick_run_config_generator.py +753 -0
  27. model_analyzer/config/generate/run_config_generator_factory.py +329 -0
  28. model_analyzer/config/generate/search_config.py +112 -0
  29. model_analyzer/config/generate/search_dimension.py +73 -0
  30. model_analyzer/config/generate/search_dimensions.py +85 -0
  31. model_analyzer/config/generate/search_parameter.py +49 -0
  32. model_analyzer/config/generate/search_parameters.py +388 -0
  33. model_analyzer/config/input/__init__.py +15 -0
  34. model_analyzer/config/input/config_command.py +483 -0
  35. model_analyzer/config/input/config_command_profile.py +1747 -0
  36. model_analyzer/config/input/config_command_report.py +267 -0
  37. model_analyzer/config/input/config_defaults.py +236 -0
  38. model_analyzer/config/input/config_enum.py +83 -0
  39. model_analyzer/config/input/config_field.py +216 -0
  40. model_analyzer/config/input/config_list_generic.py +112 -0
  41. model_analyzer/config/input/config_list_numeric.py +151 -0
  42. model_analyzer/config/input/config_list_string.py +111 -0
  43. model_analyzer/config/input/config_none.py +71 -0
  44. model_analyzer/config/input/config_object.py +129 -0
  45. model_analyzer/config/input/config_primitive.py +81 -0
  46. model_analyzer/config/input/config_status.py +75 -0
  47. model_analyzer/config/input/config_sweep.py +83 -0
  48. model_analyzer/config/input/config_union.py +113 -0
  49. model_analyzer/config/input/config_utils.py +128 -0
  50. model_analyzer/config/input/config_value.py +243 -0
  51. model_analyzer/config/input/objects/__init__.py +15 -0
  52. model_analyzer/config/input/objects/config_model_profile_spec.py +325 -0
  53. model_analyzer/config/input/objects/config_model_report_spec.py +173 -0
  54. model_analyzer/config/input/objects/config_plot.py +198 -0
  55. model_analyzer/config/input/objects/config_protobuf_utils.py +101 -0
  56. model_analyzer/config/input/yaml_config_validator.py +82 -0
  57. model_analyzer/config/run/__init__.py +15 -0
  58. model_analyzer/config/run/model_run_config.py +313 -0
  59. model_analyzer/config/run/run_config.py +168 -0
  60. model_analyzer/constants.py +76 -0
  61. model_analyzer/device/__init__.py +15 -0
  62. model_analyzer/device/device.py +24 -0
  63. model_analyzer/device/gpu_device.py +87 -0
  64. model_analyzer/device/gpu_device_factory.py +248 -0
  65. model_analyzer/entrypoint.py +307 -0
  66. model_analyzer/log_formatter.py +65 -0
  67. model_analyzer/model_analyzer_exceptions.py +24 -0
  68. model_analyzer/model_manager.py +255 -0
  69. model_analyzer/monitor/__init__.py +15 -0
  70. model_analyzer/monitor/cpu_monitor.py +69 -0
  71. model_analyzer/monitor/dcgm/DcgmDiag.py +191 -0
  72. model_analyzer/monitor/dcgm/DcgmFieldGroup.py +83 -0
  73. model_analyzer/monitor/dcgm/DcgmGroup.py +815 -0
  74. model_analyzer/monitor/dcgm/DcgmHandle.py +141 -0
  75. model_analyzer/monitor/dcgm/DcgmJsonReader.py +69 -0
  76. model_analyzer/monitor/dcgm/DcgmReader.py +623 -0
  77. model_analyzer/monitor/dcgm/DcgmStatus.py +57 -0
  78. model_analyzer/monitor/dcgm/DcgmSystem.py +412 -0
  79. model_analyzer/monitor/dcgm/__init__.py +15 -0
  80. model_analyzer/monitor/dcgm/common/__init__.py +13 -0
  81. model_analyzer/monitor/dcgm/common/dcgm_client_cli_parser.py +194 -0
  82. model_analyzer/monitor/dcgm/common/dcgm_client_main.py +86 -0
  83. model_analyzer/monitor/dcgm/dcgm_agent.py +887 -0
  84. model_analyzer/monitor/dcgm/dcgm_collectd_plugin.py +369 -0
  85. model_analyzer/monitor/dcgm/dcgm_errors.py +395 -0
  86. model_analyzer/monitor/dcgm/dcgm_field_helpers.py +546 -0
  87. model_analyzer/monitor/dcgm/dcgm_fields.py +815 -0
  88. model_analyzer/monitor/dcgm/dcgm_fields_collectd.py +671 -0
  89. model_analyzer/monitor/dcgm/dcgm_fields_internal.py +29 -0
  90. model_analyzer/monitor/dcgm/dcgm_fluentd.py +45 -0
  91. model_analyzer/monitor/dcgm/dcgm_monitor.py +138 -0
  92. model_analyzer/monitor/dcgm/dcgm_prometheus.py +326 -0
  93. model_analyzer/monitor/dcgm/dcgm_structs.py +2357 -0
  94. model_analyzer/monitor/dcgm/dcgm_telegraf.py +65 -0
  95. model_analyzer/monitor/dcgm/dcgm_value.py +151 -0
  96. model_analyzer/monitor/dcgm/dcgmvalue.py +155 -0
  97. model_analyzer/monitor/dcgm/denylist_recommendations.py +573 -0
  98. model_analyzer/monitor/dcgm/pydcgm.py +47 -0
  99. model_analyzer/monitor/monitor.py +143 -0
  100. model_analyzer/monitor/remote_monitor.py +137 -0
  101. model_analyzer/output/__init__.py +15 -0
  102. model_analyzer/output/file_writer.py +63 -0
  103. model_analyzer/output/output_writer.py +42 -0
  104. model_analyzer/perf_analyzer/__init__.py +15 -0
  105. model_analyzer/perf_analyzer/genai_perf_config.py +206 -0
  106. model_analyzer/perf_analyzer/perf_analyzer.py +882 -0
  107. model_analyzer/perf_analyzer/perf_config.py +479 -0
  108. model_analyzer/plots/__init__.py +15 -0
  109. model_analyzer/plots/detailed_plot.py +266 -0
  110. model_analyzer/plots/plot_manager.py +224 -0
  111. model_analyzer/plots/simple_plot.py +213 -0
  112. model_analyzer/record/__init__.py +15 -0
  113. model_analyzer/record/gpu_record.py +68 -0
  114. model_analyzer/record/metrics_manager.py +887 -0
  115. model_analyzer/record/record.py +280 -0
  116. model_analyzer/record/record_aggregator.py +256 -0
  117. model_analyzer/record/types/__init__.py +15 -0
  118. model_analyzer/record/types/cpu_available_ram.py +93 -0
  119. model_analyzer/record/types/cpu_used_ram.py +93 -0
  120. model_analyzer/record/types/gpu_free_memory.py +96 -0
  121. model_analyzer/record/types/gpu_power_usage.py +107 -0
  122. model_analyzer/record/types/gpu_total_memory.py +96 -0
  123. model_analyzer/record/types/gpu_used_memory.py +96 -0
  124. model_analyzer/record/types/gpu_utilization.py +108 -0
  125. model_analyzer/record/types/inter_token_latency_avg.py +60 -0
  126. model_analyzer/record/types/inter_token_latency_base.py +74 -0
  127. model_analyzer/record/types/inter_token_latency_max.py +60 -0
  128. model_analyzer/record/types/inter_token_latency_min.py +60 -0
  129. model_analyzer/record/types/inter_token_latency_p25.py +60 -0
  130. model_analyzer/record/types/inter_token_latency_p50.py +60 -0
  131. model_analyzer/record/types/inter_token_latency_p75.py +60 -0
  132. model_analyzer/record/types/inter_token_latency_p90.py +60 -0
  133. model_analyzer/record/types/inter_token_latency_p95.py +60 -0
  134. model_analyzer/record/types/inter_token_latency_p99.py +60 -0
  135. model_analyzer/record/types/output_token_throughput.py +105 -0
  136. model_analyzer/record/types/perf_client_response_wait.py +97 -0
  137. model_analyzer/record/types/perf_client_send_recv.py +97 -0
  138. model_analyzer/record/types/perf_latency.py +111 -0
  139. model_analyzer/record/types/perf_latency_avg.py +60 -0
  140. model_analyzer/record/types/perf_latency_base.py +74 -0
  141. model_analyzer/record/types/perf_latency_p90.py +60 -0
  142. model_analyzer/record/types/perf_latency_p95.py +60 -0
  143. model_analyzer/record/types/perf_latency_p99.py +60 -0
  144. model_analyzer/record/types/perf_server_compute_infer.py +97 -0
  145. model_analyzer/record/types/perf_server_compute_input.py +97 -0
  146. model_analyzer/record/types/perf_server_compute_output.py +97 -0
  147. model_analyzer/record/types/perf_server_queue.py +97 -0
  148. model_analyzer/record/types/perf_throughput.py +105 -0
  149. model_analyzer/record/types/time_to_first_token_avg.py +60 -0
  150. model_analyzer/record/types/time_to_first_token_base.py +74 -0
  151. model_analyzer/record/types/time_to_first_token_max.py +60 -0
  152. model_analyzer/record/types/time_to_first_token_min.py +60 -0
  153. model_analyzer/record/types/time_to_first_token_p25.py +60 -0
  154. model_analyzer/record/types/time_to_first_token_p50.py +60 -0
  155. model_analyzer/record/types/time_to_first_token_p75.py +60 -0
  156. model_analyzer/record/types/time_to_first_token_p90.py +60 -0
  157. model_analyzer/record/types/time_to_first_token_p95.py +60 -0
  158. model_analyzer/record/types/time_to_first_token_p99.py +60 -0
  159. model_analyzer/reports/__init__.py +15 -0
  160. model_analyzer/reports/html_report.py +195 -0
  161. model_analyzer/reports/pdf_report.py +50 -0
  162. model_analyzer/reports/report.py +86 -0
  163. model_analyzer/reports/report_factory.py +62 -0
  164. model_analyzer/reports/report_manager.py +1376 -0
  165. model_analyzer/reports/report_utils.py +42 -0
  166. model_analyzer/result/__init__.py +15 -0
  167. model_analyzer/result/constraint_manager.py +150 -0
  168. model_analyzer/result/model_config_measurement.py +354 -0
  169. model_analyzer/result/model_constraints.py +105 -0
  170. model_analyzer/result/parameter_search.py +246 -0
  171. model_analyzer/result/result_manager.py +430 -0
  172. model_analyzer/result/result_statistics.py +159 -0
  173. model_analyzer/result/result_table.py +217 -0
  174. model_analyzer/result/result_table_manager.py +646 -0
  175. model_analyzer/result/result_utils.py +42 -0
  176. model_analyzer/result/results.py +277 -0
  177. model_analyzer/result/run_config_measurement.py +658 -0
  178. model_analyzer/result/run_config_result.py +210 -0
  179. model_analyzer/result/run_config_result_comparator.py +110 -0
  180. model_analyzer/result/sorted_results.py +151 -0
  181. model_analyzer/state/__init__.py +15 -0
  182. model_analyzer/state/analyzer_state.py +76 -0
  183. model_analyzer/state/analyzer_state_manager.py +215 -0
  184. model_analyzer/triton/__init__.py +15 -0
  185. model_analyzer/triton/client/__init__.py +15 -0
  186. model_analyzer/triton/client/client.py +234 -0
  187. model_analyzer/triton/client/client_factory.py +57 -0
  188. model_analyzer/triton/client/grpc_client.py +104 -0
  189. model_analyzer/triton/client/http_client.py +107 -0
  190. model_analyzer/triton/model/__init__.py +15 -0
  191. model_analyzer/triton/model/model_config.py +556 -0
  192. model_analyzer/triton/model/model_config_variant.py +29 -0
  193. model_analyzer/triton/server/__init__.py +15 -0
  194. model_analyzer/triton/server/server.py +76 -0
  195. model_analyzer/triton/server/server_config.py +269 -0
  196. model_analyzer/triton/server/server_docker.py +229 -0
  197. model_analyzer/triton/server/server_factory.py +306 -0
  198. model_analyzer/triton/server/server_local.py +158 -0
  199. triton_model_analyzer-1.48.0.dist-info/METADATA +52 -0
  200. triton_model_analyzer-1.48.0.dist-info/RECORD +204 -0
  201. triton_model_analyzer-1.48.0.dist-info/WHEEL +5 -0
  202. triton_model_analyzer-1.48.0.dist-info/entry_points.txt +2 -0
  203. triton_model_analyzer-1.48.0.dist-info/licenses/LICENSE +67 -0
  204. triton_model_analyzer-1.48.0.dist-info/top_level.txt +1 -0
model_analyzer/record/metrics_manager.py
@@ -0,0 +1,887 @@
#!/usr/bin/env python3

# Copyright 2021-2023, NVIDIA CORPORATION & AFFILIATES. All rights reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

import logging
import os
import time
from collections import defaultdict
from typing import Dict, List, Optional, Tuple

import numba
import requests
from prometheus_client.parser import text_string_to_metric_families

from model_analyzer.config.generate.base_model_config_generator import (
    BaseModelConfigGenerator,
)
from model_analyzer.config.run.run_config import RunConfig
from model_analyzer.constants import LOGGER_NAME, PA_ERROR_LOG_FILENAME
from model_analyzer.model_analyzer_exceptions import TritonModelAnalyzerException
from model_analyzer.monitor.cpu_monitor import CPUMonitor
from model_analyzer.monitor.dcgm.dcgm_monitor import DCGMMonitor
from model_analyzer.monitor.remote_monitor import RemoteMonitor
from model_analyzer.output.file_writer import FileWriter
from model_analyzer.perf_analyzer.perf_analyzer import PerfAnalyzer
from model_analyzer.result.run_config_measurement import RunConfigMeasurement
from model_analyzer.triton.model.model_config_variant import ModelConfigVariant

from .record import Record, RecordType
from .record_aggregator import RecordAggregator

logger = logging.getLogger(LOGGER_NAME)


class MetricsManager:
    """
    This class handles the profiling and categorization of metrics
    """

    metrics = [
        "perf_throughput",
        "perf_latency_avg",
        "perf_latency_p90",
        "perf_latency_p95",
        "perf_latency_p99",
        "perf_latency",
        "perf_client_response_wait",
        "perf_client_send_recv",
        "perf_server_queue",
        "perf_server_compute_input",
        "perf_server_compute_infer",
        "perf_server_compute_output",
        "gpu_used_memory",
        "gpu_free_memory",
        "gpu_utilization",
        "gpu_power_usage",
        "cpu_available_ram",
        "cpu_used_ram",
        "time_to_first_token_avg",
        "time_to_first_token_min",
        "time_to_first_token_max",
        "time_to_first_token_p99",
        "time_to_first_token_p95",
        "time_to_first_token_p90",
        "time_to_first_token_p75",
        "time_to_first_token_p50",
        "time_to_first_token_p25",
        "inter_token_latency_avg",
        "inter_token_latency_min",
        "inter_token_latency_max",
        "inter_token_latency_p99",
        "inter_token_latency_p95",
        "inter_token_latency_p90",
        "inter_token_latency_p75",
        "inter_token_latency_p50",
        "inter_token_latency_p25",
        "output_token_throughput",
    ]

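    # Each tag above names a Record subclass defined under
    # model_analyzer/record/types/ (e.g. "perf_latency_p99" corresponds to
    # record/types/perf_latency_p99.py in the file list above);
    # get_metric_types() below resolves tags to those classes via the
    # RecordType registry.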
    def __init__(self, config, client, server, gpus, result_manager, state_manager):
        """
        Parameters
        ----------
        config : ConfigCommandProfile
            The model analyzer's config
        client : TritonClient
            Handle to the instance of TritonClient used to communicate with
            the server
        server : TritonServer
            Handle to the instance of Triton being used
        gpus : List of GPUDevices
            The GPUs being used to profile
        result_manager : ResultManager
            Instance that manages the result tables and
            adding results
        state_manager : AnalyzerStateManager
            Manages the analyzer state
        """

        # Path of the output model repository folder.
        self._output_model_repo_path = config.output_model_repository_path

        if len(config.profile_models) != len(
            set([model._model_name for model in config.profile_models])
        ):
            raise TritonModelAnalyzerException(
                f"Duplicate model names detected: "
                f"{[model._model_name for model in config.profile_models]}"
            )
        self._first_config_variant = {}
        self._config = config
        self._client = client
        self._server = server
        self._result_manager = result_manager
        self._state_manager = state_manager
        self._loaded_models = None

        self._cpu_warning_printed = False
        self._encountered_perf_analyzer_error = False

        (
            self._gpu_metrics,
            self._perf_metrics,
            self._llm_metrics,
            self._cpu_metrics,
        ) = self._categorize_metrics(self.metrics, self._config.collect_cpu_metrics)
        self._gpus = gpus
        self._init_state()

    def start_new_model(self):
        """Indicate that profiling of a new model is starting"""
        self._first_config_variant = {}

    def encountered_perf_analyzer_error(self) -> bool:
        return self._encountered_perf_analyzer_error

    def _init_state(self):
        """
        Sets MetricsManager object managed
        state variables in AnalyzerState
        """

        gpu_info = self._state_manager.get_state_variable("MetricsManager.gpus")

        if self._state_manager.starting_fresh_run() or gpu_info is None:
            gpu_info = {}

        for i in range(len(self._gpus)):
            if self._gpus[i].device_uuid() not in gpu_info:
                device_info = {}
                device = numba.cuda.list_devices()[i]
                device_info["name"] = str(device.name, encoding="utf-8")
                with device:
                    # total memory is reported in bytes
                    device_info["total_memory"] = (
                        numba.cuda.current_context().get_memory_info().total
                    )
                gpu_info[self._gpus[i].device_uuid()] = device_info

        self._state_manager.set_state_variable("MetricsManager.gpus", gpu_info)

    @staticmethod
    def _categorize_metrics(metric_tags, collect_cpu_metrics=False):
        """
        Splits the metrics into groups based
        on how they are collected

        Returns
        -------
        (list, list, list, list)
            Tuple of four lists of (DCGM, PerfAnalyzer, LLM, CPU) metrics
        """

        gpu_metrics, perf_metrics, llm_metrics, cpu_metrics = [], [], [], []
        # Separates metrics and objectives into related lists
        for metric in MetricsManager.get_metric_types(metric_tags):
            if metric in PerfAnalyzer.get_gpu_metrics():
                gpu_metrics.append(metric)
            elif metric in PerfAnalyzer.get_perf_metrics():
                perf_metrics.append(metric)
            elif metric in PerfAnalyzer.get_llm_metrics():
                llm_metrics.append(metric)
            elif collect_cpu_metrics and (metric in CPUMonitor.cpu_metrics):
                cpu_metrics.append(metric)

        return gpu_metrics, perf_metrics, llm_metrics, cpu_metrics

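    # Illustration: with the default tag list, "gpu_utilization" lands in
    # gpu_metrics, "perf_throughput" in perf_metrics, and
    # "time_to_first_token_p99" in llm_metrics; "cpu_used_ram" reaches
    # cpu_metrics only when collect_cpu_metrics is True, otherwise CPU tags
    # are silently dropped.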
    def profile_server(self):
        """
        Runs the DCGM monitor on the triton server without the perf_analyzer

        Raises
        ------
        TritonModelAnalyzerException
        """

        capture_gpu_metrics = numba.cuda.is_available()
        self._start_monitors(capture_gpu_metrics=capture_gpu_metrics)
        time.sleep(self._config.duration_seconds)
        if capture_gpu_metrics or self._config.always_report_gpu_metrics:
            server_gpu_metrics = self._get_gpu_inference_metrics()
            self._result_manager.add_server_data(data=server_gpu_metrics)
        self._destroy_monitors(capture_gpu_metrics=capture_gpu_metrics)

    def execute_run_config(
        self, run_config: RunConfig
    ) -> Optional[RunConfigMeasurement]:
        """
        Executes the RunConfig. Returns the obtained measurement. Also sends
        the measurement to the result manager
        """

        self._create_model_variants(run_config)

        # If this run config was already run, do not run again; just get the measurement
        measurement = self._get_measurement_if_config_duplicate(run_config)
        if measurement:
            logger.info("Existing measurement found for run config. Skipping profile")
            return measurement

        current_model_variants = run_config.model_variants_name()
        if current_model_variants != self._loaded_models:
            self._server.stop()
            self._server.start(env=run_config.triton_environment())

            if not self._load_model_variants(run_config):
                self._server.stop()
                self._loaded_models = None
                return None

            self._loaded_models = current_model_variants

        measurement = self.profile_models(run_config)

        return measurement

    def profile_models(self, run_config: RunConfig) -> Optional[RunConfigMeasurement]:
        """
        Runs monitors while running perf_analyzer with a specific set of
        arguments. This will profile model inferencing.

        Parameters
        ----------
        run_config : RunConfig
            RunConfig object corresponding to the models being profiled.

        Returns
        -------
        Optional[RunConfigMeasurement]
            The combined GPU and non-GPU metrics for this run, or None on
            failure
        """

        perf_output_writer = (
            None
            if not self._config.perf_output
            else FileWriter(self._config.perf_output_path)
        )
        capture_gpu_metrics = (
            self._config.always_report_gpu_metrics or not run_config.cpu_only()
        )

        self._print_run_config_info(run_config)

        self._start_monitors(capture_gpu_metrics=capture_gpu_metrics)

        perf_analyzer_metrics, model_gpu_metrics = self._run_perf_analyzer(
            run_config, perf_output_writer
        )

        if not perf_analyzer_metrics:
            self._stop_monitors(capture_gpu_metrics=capture_gpu_metrics)
            self._destroy_monitors(capture_gpu_metrics=capture_gpu_metrics)
            return None

        # Get metrics for model inference and combine metrics that do not have GPU UUID
        if capture_gpu_metrics and not model_gpu_metrics:
            model_gpu_metrics = self._get_gpu_inference_metrics()
        model_cpu_metrics = self._get_cpu_inference_metrics()

        self._destroy_monitors(capture_gpu_metrics=capture_gpu_metrics)

        run_config_measurement = None
        if model_gpu_metrics is not None and perf_analyzer_metrics is not None:
            run_config_measurement = RunConfigMeasurement(
                run_config.model_variants_name(), model_gpu_metrics
            )

            # Combine all per-model measurements into the RunConfigMeasurement
            #
            for model_run_config in run_config.model_run_configs():
                perf_config = model_run_config.perf_config()
                model_name = perf_config["model-name"]

                model_non_gpu_metrics = list(
                    perf_analyzer_metrics[model_name].values()
                ) + list(model_cpu_metrics.values())

                model_specific_pa_params = (
                    perf_config.extract_model_specific_parameters()
                )

                run_config_measurement.add_model_config_measurement(
                    perf_config["model-name"],
                    model_specific_pa_params,
                    model_non_gpu_metrics,
                )

        self._result_manager.add_run_config_measurement(
            run_config, run_config_measurement
        )

        return run_config_measurement

    def finalize(self):
        self._server.stop()

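    # Note: execute_run_config() above only restarts Triton when the set of
    # loaded model variants changes, and a RunConfig whose representation()
    # already has a stored measurement is returned from the results cache
    # instead of being re-profiled.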
    def _create_model_variants(self, run_config: RunConfig) -> None:
        """
        Creates and fills all model variant directories
        """
        for mrc in run_config.model_run_configs():
            self._create_model_variant(
                original_name=mrc.model_name(),
                variant_config=mrc.model_config_variant(),
            )

            for composing_config_variant in mrc.composing_config_variants():
                variant_name = composing_config_variant.variant_name
                original_name = (
                    BaseModelConfigGenerator.extract_model_name_from_variant_name(
                        variant_name
                    )
                )

                self._create_model_variant(original_name, composing_config_variant)

                # Create a version with the original (no _config_#/default appended) name
                original_composing_config = (
                    BaseModelConfigGenerator.create_original_config_from_variant(
                        composing_config_variant.model_config
                    )
                )
                self._create_model_variant(
                    original_name,
                    ModelConfigVariant(original_composing_config, original_name),
                    ignore_first_config_variant=True,
                )

    def _create_model_variant(
        self,
        original_name: str,
        variant_config: ModelConfigVariant,
        ignore_first_config_variant: bool = False,
    ) -> None:
        """
        Creates a directory for the model config variant in the output model
        repository and fills directory with config
        """

        if self._config.triton_launch_mode != "remote":
            self._create_non_remote_mode_model_variant(
                original_name, variant_config, ignore_first_config_variant
            )
        else:
            self._create_remote_mode_model_variant(original_name, variant_config)

    def _create_non_remote_mode_model_variant(
        self,
        original_name: str,
        variant_config: ModelConfigVariant,
        ignore_first_config_variant: bool = False,
    ) -> None:
        """
        Creates a directory for the model config variant in the output model
        repository and fills directory with config
        """
        variant_name = variant_config.variant_name
        model_repository = self._config.model_repository

        original_model_dir = os.path.join(model_repository, original_name)
        new_model_dir = os.path.join(self._output_model_repo_path, variant_name)
        try:
            # Create the directory for the new model
            os.makedirs(new_model_dir, exist_ok=True)
            self._first_config_variant.setdefault(original_name, None)

            if ignore_first_config_variant:
                variant_config.model_config.write_config_to_file(
                    new_model_dir, original_model_dir, None
                )
            else:
                variant_config.model_config.write_config_to_file(
                    new_model_dir,
                    original_model_dir,
                    self._first_config_variant[original_name],
                )

            if self._first_config_variant[original_name] is None:
                self._first_config_variant[original_name] = os.path.join(
                    self._output_model_repo_path, variant_name
                )
        except FileExistsError:
            # Ignore if the file already exists
            pass

    def _create_remote_mode_model_variant(
        self,
        original_name: str,
        variant_config: ModelConfigVariant,
    ) -> None:
        """
        Creates a directory for the model config variant in the output model
        repository and fills directory with only the config.pbtxt
        """
        variant_name = variant_config.variant_name
        new_model_dir = os.path.join(self._output_model_repo_path, variant_name)
        try:
            os.makedirs(new_model_dir, exist_ok=False)
            self._first_config_variant.setdefault(original_name, None)
            variant_config.model_config.write_config_to_file(
                model_path=new_model_dir,
                src_model_path=new_model_dir,
                first_variant_model_path=None,
            )
        except FileExistsError:
            # Ignore if the dir already exists
            pass

    def _load_model_variants(self, run_config: RunConfig) -> bool:
        """
        Loads all model variants in the client
        """
        for mrc in run_config.model_run_configs():
            # Load all composing model variants first, and then the parent model
            for composing_config_variant in mrc.composing_config_variants():
                if not self._load_model_variant(
                    variant_config=composing_config_variant
                ):
                    return False
            if not self._load_model_variant(variant_config=mrc.model_config_variant()):
                return False
        return True

    def _load_model_variant(self, variant_config: ModelConfigVariant) -> bool:
        """
        Conditionally loads a model variant in the client
        """
        remote = self._config.triton_launch_mode == "remote"
        c_api = self._config.triton_launch_mode == "c_api"
        disabled = self._config.reload_model_disable
        do_load = (remote and not disabled) or (not remote and not c_api)

        retval = True
        if do_load:
            retval = self._do_load_model_variant(variant_config)
        return retval

    def _do_load_model_variant(self, variant_config: ModelConfigVariant) -> bool:
        """
        Loads a model variant in the client
        """
        self._client.wait_for_server_ready(
            num_retries=self._config.client_max_retries,
            log_file=self._server.log_file(),
        )

        model_name = variant_config.model_config.get_field("name")
        variant_name = variant_config.variant_name
        config_str = variant_config.model_config.get_config_str()
        if (
            self._client.load_model(
                model_name=model_name,
                variant_name=variant_name,
                config_str=config_str,
            )
            == -1
        ):
            return False

        if (
            self._client.wait_for_model_ready(
                model_name=variant_config.model_config.get_field("name"),
                num_retries=self._config.client_max_retries,
            )
            == -1
        ):
            return False
        return True

    def _get_measurement_if_config_duplicate(self, run_config):
        """
        Checks whether this run config has measurements
        in the state manager's results object
        """

        models_name = run_config.models_name()
        model_variants_name = run_config.model_variants_name()
        key = run_config.representation()

        results = self._state_manager.get_state_variable("ResultManager.results")

        if not results.contains_model_variant(models_name, model_variants_name):
            return False

        measurements = results.get_model_variants_measurements_dict(
            models_name, model_variants_name
        )

        return measurements.get(key, None)

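    # The first variant directory created for each original model is cached in
    # self._first_config_variant, and later variants of the same model are
    # written with a reference to it (the first_variant_model_path argument of
    # write_config_to_file).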
    def _start_monitors(self, capture_gpu_metrics=True):
        """
        Start any metrics monitors
        """

        self._gpu_monitor = None
        if capture_gpu_metrics:
            try:
                self._gpu_monitor = RemoteMonitor(
                    self._config.triton_metrics_url,
                    self._config.monitoring_interval,
                    self._gpu_metrics,
                )

                self._gpu_monitor.start_recording_metrics()
            except TritonModelAnalyzerException:
                self._destroy_monitors()
                raise
            finally:
                if (
                    not self._gpu_monitor.is_monitoring_connected()
                    and self._config.triton_launch_mode != "c_api"
                ):
                    raise TritonModelAnalyzerException(
                        f"Failed to connect to Tritonserver's GPU metrics monitor. "
                        f"Please check that the `triton_metrics_url` value is set correctly: {self._config.triton_metrics_url}."
                    )

        self._cpu_monitor = CPUMonitor(
            self._server, self._config.monitoring_interval, self._cpu_metrics
        )
        self._cpu_monitor.start_recording_metrics()

    def _stop_monitors(self, capture_gpu_metrics=True):
        """
        Stop any metrics monitors, when we don't need
        to collect the result
        """

        # Stop the GPU monitor only if there are GPUs available
        if capture_gpu_metrics:
            self._gpu_monitor.stop_recording_metrics()
        self._cpu_monitor.stop_recording_metrics()

    def _destroy_monitors(self, capture_gpu_metrics=True):
        """
        Destroy the monitors created by start
        """

        if capture_gpu_metrics:
            if self._gpu_monitor:
                self._gpu_monitor.destroy()
        if self._cpu_monitor:
            self._cpu_monitor.destroy()
        self._gpu_monitor = None
        self._cpu_monitor = None

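    # In this path GPU metrics are scraped from Triton's Prometheus endpoint
    # via RemoteMonitor rather than collected directly through DCGM;
    # DCGMMonitor is imported only for the is_gpu_metric() lookup at the
    # bottom of this file.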
    def _run_perf_analyzer(
        self, run_config: RunConfig, perf_output_writer: Optional[FileWriter]
    ) -> Tuple[Optional[Dict], Optional[Dict[int, List[Record]]]]:
        """
        Runs perf_analyzer and returns the aggregated metrics

        Parameters
        ----------
        run_config : RunConfig
            The RunConfig to execute on perf_analyzer

        perf_output_writer : FileWriter
            Writer that writes the output from perf_analyzer to the output
            stream/file. If None, the output is not written

        Raises
        ------
        TritonModelAnalyzerException
        """

        perf_analyzer_env = run_config.triton_environment()

        # If running with C_API, need to set CUDA_VISIBLE_DEVICES here
        if self._config.triton_launch_mode == "c_api":
            perf_analyzer_env["CUDA_VISIBLE_DEVICES"] = ",".join(
                [gpu.device_uuid() for gpu in self._gpus]
            )

        perf_analyzer = PerfAnalyzer(
            path=self._config.perf_analyzer_path,
            config=run_config,
            max_retries=self._config.perf_analyzer_max_auto_adjusts,
            timeout=self._config.perf_analyzer_timeout,
            max_cpu_util=self._config.perf_analyzer_cpu_util,
            model_type=self._config.model_type,
        )

        metrics_to_gather = self._perf_metrics + self._llm_metrics + self._gpu_metrics
        status = perf_analyzer.run(metrics_to_gather, env=perf_analyzer_env)

        self._write_perf_analyzer_output(perf_output_writer, perf_analyzer)

        if status == 1:
            self._handle_unsuccessful_perf_analyzer_run(perf_analyzer)
            return (None, None)

        perf_records = perf_analyzer.get_perf_records()

        if self._config.model_type == "LLM":
            perf_records[run_config.models_name()].extend(
                perf_analyzer.get_llm_records()[run_config.models_name()]
            )

        gpu_records = perf_analyzer.get_gpu_records()

        aggregated_perf_records = self._aggregate_perf_records(perf_records)
        aggregated_gpu_records = self._aggregate_gpu_records(gpu_records)

        return aggregated_perf_records, aggregated_gpu_records

    def _write_perf_analyzer_output(
        self, perf_output_writer: Optional[FileWriter], perf_analyzer: PerfAnalyzer
    ) -> None:
        if perf_output_writer:
            perf_output_writer.write(
                "============== Perf Analyzer Launched ==============\n"
                f"Command: {perf_analyzer.get_cmd()}\n\n",
                append=True,
            )
            if perf_analyzer.output():
                perf_output_writer.write(perf_analyzer.output() + "\n", append=True)

    def _handle_unsuccessful_perf_analyzer_run(
        self, perf_analyzer: PerfAnalyzer
    ) -> None:
        output_file = f"{self._config.export_path}/{PA_ERROR_LOG_FILENAME}"

        if not self._encountered_perf_analyzer_error:
            self._encountered_perf_analyzer_error = True
            if os.path.exists(output_file):
                os.remove(output_file)

        perf_error_log = FileWriter(output_file)
        perf_error_log.write(
            "Command: \n" + perf_analyzer.get_cmd() + "\n\n", append=True
        )

        if perf_analyzer.output():
            perf_error_log.write(
                "Error: \n" + perf_analyzer.output() + "\n", append=True
            )
        else:
            perf_error_log.write(
                "Error: "
                + "perf_analyzer did not produce any output. It was likely terminated with a SIGABRT."
                + "\n\n",
                append=True,
            )

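    # On a failed perf_analyzer run, the command and its output are appended to
    # <export_path>/<PA_ERROR_LOG_FILENAME>; the log file is cleared once, on
    # the first failure of the session.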
    def _aggregate_perf_records(self, perf_records):
        per_model_perf_records = {}
        for model, records in perf_records.items():
            perf_record_aggregator = RecordAggregator()
            perf_record_aggregator.insert_all(records)

            per_model_perf_records[model] = perf_record_aggregator.aggregate()
        return per_model_perf_records

    def _get_gpu_inference_metrics(self):
        """
        Stops GPU monitor and aggregates any records
        that are GPU specific

        Returns
        -------
        dict
            Keys are GPU ids and values are metric values
            in the order specified in self._gpu_metrics
        """

        # Stop the GPU monitor and aggregate its records
        gpu_records = self._gpu_monitor.stop_recording_metrics()
        gpu_metrics = self._aggregate_gpu_records(gpu_records)
        return gpu_metrics

    def _aggregate_gpu_records(self, gpu_records):
        # Insert all records into the aggregator and get aggregated GPU records
        gpu_record_aggregator = RecordAggregator()
        gpu_record_aggregator.insert_all(gpu_records)

        records_groupby_gpu = gpu_record_aggregator.groupby(
            self._gpu_metrics, lambda record: record.device_uuid()
        )

        gpu_metrics = defaultdict(list)
        for _, metric in records_groupby_gpu.items():
            for gpu_uuid, metric_value in metric.items():
                gpu_metrics[gpu_uuid].append(metric_value)
        return gpu_metrics

    def _get_cpu_inference_metrics(self):
        """
        Stops any monitors that just need the records to be aggregated,
        like the CPU metrics
        """

        cpu_records = self._cpu_monitor.stop_recording_metrics()

        cpu_record_aggregator = RecordAggregator()
        cpu_record_aggregator.insert_all(cpu_records)
        return cpu_record_aggregator.aggregate()

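    # RecordAggregator (record_aggregator.py in the file list above) reduces
    # the raw per-interval records to one value per record type; groupby()
    # keys GPU records by device UUID so values can be reported per GPU.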
    def _check_triton_and_model_analyzer_gpus(self):
        """
        Check whether Triton Server and Model Analyzer are using the same GPUs

        Raises
        ------
        TritonModelAnalyzerException
            If they are using different GPUs this exception will be raised.
        """

        if (
            self._config.triton_launch_mode != "remote"
            and self._config.triton_launch_mode != "c_api"
        ):
            self._client.wait_for_server_ready(
                num_retries=self._config.client_max_retries,
                log_file=self._server.log_file(),
            )

            model_analyzer_gpus = [gpu.device_uuid() for gpu in self._gpus]
            triton_gpus = self._get_triton_metrics_gpus()
            if set(model_analyzer_gpus) != set(triton_gpus):
                raise TritonModelAnalyzerException(
                    "Triton Server is not using the same GPUs as Model Analyzer: "
                    f"Model Analyzer GPUs {model_analyzer_gpus}, Triton GPUs {triton_gpus}"
                )

    def _get_triton_metrics_gpus(self):
        """
        Uses Prometheus to request a list of GPU UUIDs corresponding to the GPUs
        visible to Triton Inference Server
        """

        triton_prom_str = str(
            requests.get(self._config.triton_metrics_url, timeout=10).content,
            encoding="ascii",
        )
        metrics = text_string_to_metric_families(triton_prom_str)

        triton_gpus = []
        for metric in metrics:
            if metric.name == "nv_gpu_utilization":
                for sample in metric.samples:
                    triton_gpus.append(sample.labels["gpu_uuid"])

        return triton_gpus

    def _print_run_config_info(self, run_config):
        for model_run_config in run_config.model_run_configs():
            perf_config = model_run_config.perf_config()
            if perf_config["request-rate-range"]:
                if perf_config["batch-size"] != 1:
                    logger.info(
                        f"Profiling {model_run_config.model_variant_name()}: client batch size={perf_config['batch-size']}, request-rate-range={perf_config['request-rate-range']}"
                    )
                else:
                    logger.info(
                        f"Profiling {model_run_config.model_variant_name()}: request-rate-range={perf_config['request-rate-range']}"
                    )
            else:
                if perf_config["batch-size"] != 1:
                    logger.info(
                        f"Profiling {model_run_config.model_variant_name()}: client batch size={perf_config['batch-size']}, concurrency={perf_config['concurrency-range']}"
                    )
                else:
                    logger.info(
                        f"Profiling {model_run_config.model_variant_name()}: concurrency={perf_config['concurrency-range']}"
                    )

        # Vertical spacing when running multiple models at a time
        if len(run_config.model_run_configs()) > 1:
            logger.info("")

        cpu_only = run_config.cpu_only()

        # Inform the user that CPU metric(s) are not being collected under CPU mode
        collect_cpu_metrics_expect = cpu_only
        collect_cpu_metrics_actual = len(self._cpu_metrics) > 0
        if collect_cpu_metrics_expect and not collect_cpu_metrics_actual:
            if not self._cpu_warning_printed:
                self._cpu_warning_printed = True
                logger.warning(
                    "One or more models are running on the CPU, but CPU metric(s) are not being collected"
                )
        # Warn the user about the CPU monitor's performance impact
        if collect_cpu_metrics_actual:
            if not self._cpu_warning_printed:
                self._cpu_warning_printed = True
                logger.warning(
                    "CPU metrics are being collected. This can affect the latency or throughput numbers reported by perf analyzer."
                )

    @staticmethod
    def get_metric_types(tags):
        """
        Parameters
        ----------
        tags : list of str
            Human readable names for the
            metrics to monitor. They correspond
            to actual record types.

        Returns
        -------
        list
            The record types being monitored
        """

        return [RecordType.get(tag) for tag in tags]

    @staticmethod
    def is_gpu_metric(tag):
        """
        Returns
        -------
        True if the given tag is a supported GPU metric,
        False otherwise
        """
        metric = MetricsManager.get_metric_types([tag])[0]
        return metric in DCGMMonitor.model_analyzer_to_dcgm_field

    @staticmethod
    def is_perf_analyzer_metric(tag):
        """
        Returns
        -------
        True if the given tag is a supported perf_analyzer metric,
        False otherwise
        """
        metric = MetricsManager.get_metric_types([tag])[0]
        return metric in PerfAnalyzer.get_perf_metrics()

    @staticmethod
    def is_llm_metric(tag):
        """
        Returns
        -------
        True if the given tag is a supported LLM metric,
        False otherwise
        """
        metric = MetricsManager.get_metric_types([tag])[0]
        return metric in PerfAnalyzer.get_llm_metrics()

    @staticmethod
    def is_cpu_metric(tag):
        """
        Returns
        -------
        True if the given tag is a supported CPU metric,
        False otherwise
        """

        metric = MetricsManager.get_metric_types([tag])[0]
        return metric in CPUMonitor.cpu_metrics
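
For orientation, a minimal sketch of how the static helpers above classify metric tags (illustrative only; it assumes the wheel and its dependencies such as numba, requests, and prometheus_client are installed, since importing metrics_manager pulls them in):

    from model_analyzer.record.metrics_manager import MetricsManager

    # Classify every supported tag into the same buckets that
    # _categorize_metrics() uses (CPU tags additionally require
    # collect_cpu_metrics to be enabled at profile time).
    for tag in MetricsManager.metrics:
        kinds = [
            name
            for name, check in [
                ("gpu", MetricsManager.is_gpu_metric),
                ("perf", MetricsManager.is_perf_analyzer_metric),
                ("llm", MetricsManager.is_llm_metric),
                ("cpu", MetricsManager.is_cpu_metric),
            ]
            if check(tag)
        ]
        print(f"{tag}: {', '.join(kinds) or 'uncategorized'}")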