triton_model_analyzer-1.48.0-py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (204)
  1. model_analyzer/__init__.py +15 -0
  2. model_analyzer/analyzer.py +448 -0
  3. model_analyzer/cli/__init__.py +15 -0
  4. model_analyzer/cli/cli.py +193 -0
  5. model_analyzer/config/__init__.py +15 -0
  6. model_analyzer/config/generate/__init__.py +15 -0
  7. model_analyzer/config/generate/automatic_model_config_generator.py +164 -0
  8. model_analyzer/config/generate/base_model_config_generator.py +352 -0
  9. model_analyzer/config/generate/brute_plus_binary_parameter_search_run_config_generator.py +164 -0
  10. model_analyzer/config/generate/brute_run_config_generator.py +154 -0
  11. model_analyzer/config/generate/concurrency_sweeper.py +75 -0
  12. model_analyzer/config/generate/config_generator_interface.py +52 -0
  13. model_analyzer/config/generate/coordinate.py +143 -0
  14. model_analyzer/config/generate/coordinate_data.py +86 -0
  15. model_analyzer/config/generate/generator_utils.py +116 -0
  16. model_analyzer/config/generate/manual_model_config_generator.py +187 -0
  17. model_analyzer/config/generate/model_config_generator_factory.py +92 -0
  18. model_analyzer/config/generate/model_profile_spec.py +74 -0
  19. model_analyzer/config/generate/model_run_config_generator.py +154 -0
  20. model_analyzer/config/generate/model_variant_name_manager.py +150 -0
  21. model_analyzer/config/generate/neighborhood.py +536 -0
  22. model_analyzer/config/generate/optuna_plus_concurrency_sweep_run_config_generator.py +141 -0
  23. model_analyzer/config/generate/optuna_run_config_generator.py +838 -0
  24. model_analyzer/config/generate/perf_analyzer_config_generator.py +312 -0
  25. model_analyzer/config/generate/quick_plus_concurrency_sweep_run_config_generator.py +130 -0
  26. model_analyzer/config/generate/quick_run_config_generator.py +753 -0
  27. model_analyzer/config/generate/run_config_generator_factory.py +329 -0
  28. model_analyzer/config/generate/search_config.py +112 -0
  29. model_analyzer/config/generate/search_dimension.py +73 -0
  30. model_analyzer/config/generate/search_dimensions.py +85 -0
  31. model_analyzer/config/generate/search_parameter.py +49 -0
  32. model_analyzer/config/generate/search_parameters.py +388 -0
  33. model_analyzer/config/input/__init__.py +15 -0
  34. model_analyzer/config/input/config_command.py +483 -0
  35. model_analyzer/config/input/config_command_profile.py +1747 -0
  36. model_analyzer/config/input/config_command_report.py +267 -0
  37. model_analyzer/config/input/config_defaults.py +236 -0
  38. model_analyzer/config/input/config_enum.py +83 -0
  39. model_analyzer/config/input/config_field.py +216 -0
  40. model_analyzer/config/input/config_list_generic.py +112 -0
  41. model_analyzer/config/input/config_list_numeric.py +151 -0
  42. model_analyzer/config/input/config_list_string.py +111 -0
  43. model_analyzer/config/input/config_none.py +71 -0
  44. model_analyzer/config/input/config_object.py +129 -0
  45. model_analyzer/config/input/config_primitive.py +81 -0
  46. model_analyzer/config/input/config_status.py +75 -0
  47. model_analyzer/config/input/config_sweep.py +83 -0
  48. model_analyzer/config/input/config_union.py +113 -0
  49. model_analyzer/config/input/config_utils.py +128 -0
  50. model_analyzer/config/input/config_value.py +243 -0
  51. model_analyzer/config/input/objects/__init__.py +15 -0
  52. model_analyzer/config/input/objects/config_model_profile_spec.py +325 -0
  53. model_analyzer/config/input/objects/config_model_report_spec.py +173 -0
  54. model_analyzer/config/input/objects/config_plot.py +198 -0
  55. model_analyzer/config/input/objects/config_protobuf_utils.py +101 -0
  56. model_analyzer/config/input/yaml_config_validator.py +82 -0
  57. model_analyzer/config/run/__init__.py +15 -0
  58. model_analyzer/config/run/model_run_config.py +313 -0
  59. model_analyzer/config/run/run_config.py +168 -0
  60. model_analyzer/constants.py +76 -0
  61. model_analyzer/device/__init__.py +15 -0
  62. model_analyzer/device/device.py +24 -0
  63. model_analyzer/device/gpu_device.py +87 -0
  64. model_analyzer/device/gpu_device_factory.py +248 -0
  65. model_analyzer/entrypoint.py +307 -0
  66. model_analyzer/log_formatter.py +65 -0
  67. model_analyzer/model_analyzer_exceptions.py +24 -0
  68. model_analyzer/model_manager.py +255 -0
  69. model_analyzer/monitor/__init__.py +15 -0
  70. model_analyzer/monitor/cpu_monitor.py +69 -0
  71. model_analyzer/monitor/dcgm/DcgmDiag.py +191 -0
  72. model_analyzer/monitor/dcgm/DcgmFieldGroup.py +83 -0
  73. model_analyzer/monitor/dcgm/DcgmGroup.py +815 -0
  74. model_analyzer/monitor/dcgm/DcgmHandle.py +141 -0
  75. model_analyzer/monitor/dcgm/DcgmJsonReader.py +69 -0
  76. model_analyzer/monitor/dcgm/DcgmReader.py +623 -0
  77. model_analyzer/monitor/dcgm/DcgmStatus.py +57 -0
  78. model_analyzer/monitor/dcgm/DcgmSystem.py +412 -0
  79. model_analyzer/monitor/dcgm/__init__.py +15 -0
  80. model_analyzer/monitor/dcgm/common/__init__.py +13 -0
  81. model_analyzer/monitor/dcgm/common/dcgm_client_cli_parser.py +194 -0
  82. model_analyzer/monitor/dcgm/common/dcgm_client_main.py +86 -0
  83. model_analyzer/monitor/dcgm/dcgm_agent.py +887 -0
  84. model_analyzer/monitor/dcgm/dcgm_collectd_plugin.py +369 -0
  85. model_analyzer/monitor/dcgm/dcgm_errors.py +395 -0
  86. model_analyzer/monitor/dcgm/dcgm_field_helpers.py +546 -0
  87. model_analyzer/monitor/dcgm/dcgm_fields.py +815 -0
  88. model_analyzer/monitor/dcgm/dcgm_fields_collectd.py +671 -0
  89. model_analyzer/monitor/dcgm/dcgm_fields_internal.py +29 -0
  90. model_analyzer/monitor/dcgm/dcgm_fluentd.py +45 -0
  91. model_analyzer/monitor/dcgm/dcgm_monitor.py +138 -0
  92. model_analyzer/monitor/dcgm/dcgm_prometheus.py +326 -0
  93. model_analyzer/monitor/dcgm/dcgm_structs.py +2357 -0
  94. model_analyzer/monitor/dcgm/dcgm_telegraf.py +65 -0
  95. model_analyzer/monitor/dcgm/dcgm_value.py +151 -0
  96. model_analyzer/monitor/dcgm/dcgmvalue.py +155 -0
  97. model_analyzer/monitor/dcgm/denylist_recommendations.py +573 -0
  98. model_analyzer/monitor/dcgm/pydcgm.py +47 -0
  99. model_analyzer/monitor/monitor.py +143 -0
  100. model_analyzer/monitor/remote_monitor.py +137 -0
  101. model_analyzer/output/__init__.py +15 -0
  102. model_analyzer/output/file_writer.py +63 -0
  103. model_analyzer/output/output_writer.py +42 -0
  104. model_analyzer/perf_analyzer/__init__.py +15 -0
  105. model_analyzer/perf_analyzer/genai_perf_config.py +206 -0
  106. model_analyzer/perf_analyzer/perf_analyzer.py +882 -0
  107. model_analyzer/perf_analyzer/perf_config.py +479 -0
  108. model_analyzer/plots/__init__.py +15 -0
  109. model_analyzer/plots/detailed_plot.py +266 -0
  110. model_analyzer/plots/plot_manager.py +224 -0
  111. model_analyzer/plots/simple_plot.py +213 -0
  112. model_analyzer/record/__init__.py +15 -0
  113. model_analyzer/record/gpu_record.py +68 -0
  114. model_analyzer/record/metrics_manager.py +887 -0
  115. model_analyzer/record/record.py +280 -0
  116. model_analyzer/record/record_aggregator.py +256 -0
  117. model_analyzer/record/types/__init__.py +15 -0
  118. model_analyzer/record/types/cpu_available_ram.py +93 -0
  119. model_analyzer/record/types/cpu_used_ram.py +93 -0
  120. model_analyzer/record/types/gpu_free_memory.py +96 -0
  121. model_analyzer/record/types/gpu_power_usage.py +107 -0
  122. model_analyzer/record/types/gpu_total_memory.py +96 -0
  123. model_analyzer/record/types/gpu_used_memory.py +96 -0
  124. model_analyzer/record/types/gpu_utilization.py +108 -0
  125. model_analyzer/record/types/inter_token_latency_avg.py +60 -0
  126. model_analyzer/record/types/inter_token_latency_base.py +74 -0
  127. model_analyzer/record/types/inter_token_latency_max.py +60 -0
  128. model_analyzer/record/types/inter_token_latency_min.py +60 -0
  129. model_analyzer/record/types/inter_token_latency_p25.py +60 -0
  130. model_analyzer/record/types/inter_token_latency_p50.py +60 -0
  131. model_analyzer/record/types/inter_token_latency_p75.py +60 -0
  132. model_analyzer/record/types/inter_token_latency_p90.py +60 -0
  133. model_analyzer/record/types/inter_token_latency_p95.py +60 -0
  134. model_analyzer/record/types/inter_token_latency_p99.py +60 -0
  135. model_analyzer/record/types/output_token_throughput.py +105 -0
  136. model_analyzer/record/types/perf_client_response_wait.py +97 -0
  137. model_analyzer/record/types/perf_client_send_recv.py +97 -0
  138. model_analyzer/record/types/perf_latency.py +111 -0
  139. model_analyzer/record/types/perf_latency_avg.py +60 -0
  140. model_analyzer/record/types/perf_latency_base.py +74 -0
  141. model_analyzer/record/types/perf_latency_p90.py +60 -0
  142. model_analyzer/record/types/perf_latency_p95.py +60 -0
  143. model_analyzer/record/types/perf_latency_p99.py +60 -0
  144. model_analyzer/record/types/perf_server_compute_infer.py +97 -0
  145. model_analyzer/record/types/perf_server_compute_input.py +97 -0
  146. model_analyzer/record/types/perf_server_compute_output.py +97 -0
  147. model_analyzer/record/types/perf_server_queue.py +97 -0
  148. model_analyzer/record/types/perf_throughput.py +105 -0
  149. model_analyzer/record/types/time_to_first_token_avg.py +60 -0
  150. model_analyzer/record/types/time_to_first_token_base.py +74 -0
  151. model_analyzer/record/types/time_to_first_token_max.py +60 -0
  152. model_analyzer/record/types/time_to_first_token_min.py +60 -0
  153. model_analyzer/record/types/time_to_first_token_p25.py +60 -0
  154. model_analyzer/record/types/time_to_first_token_p50.py +60 -0
  155. model_analyzer/record/types/time_to_first_token_p75.py +60 -0
  156. model_analyzer/record/types/time_to_first_token_p90.py +60 -0
  157. model_analyzer/record/types/time_to_first_token_p95.py +60 -0
  158. model_analyzer/record/types/time_to_first_token_p99.py +60 -0
  159. model_analyzer/reports/__init__.py +15 -0
  160. model_analyzer/reports/html_report.py +195 -0
  161. model_analyzer/reports/pdf_report.py +50 -0
  162. model_analyzer/reports/report.py +86 -0
  163. model_analyzer/reports/report_factory.py +62 -0
  164. model_analyzer/reports/report_manager.py +1376 -0
  165. model_analyzer/reports/report_utils.py +42 -0
  166. model_analyzer/result/__init__.py +15 -0
  167. model_analyzer/result/constraint_manager.py +150 -0
  168. model_analyzer/result/model_config_measurement.py +354 -0
  169. model_analyzer/result/model_constraints.py +105 -0
  170. model_analyzer/result/parameter_search.py +246 -0
  171. model_analyzer/result/result_manager.py +430 -0
  172. model_analyzer/result/result_statistics.py +159 -0
  173. model_analyzer/result/result_table.py +217 -0
  174. model_analyzer/result/result_table_manager.py +646 -0
  175. model_analyzer/result/result_utils.py +42 -0
  176. model_analyzer/result/results.py +277 -0
  177. model_analyzer/result/run_config_measurement.py +658 -0
  178. model_analyzer/result/run_config_result.py +210 -0
  179. model_analyzer/result/run_config_result_comparator.py +110 -0
  180. model_analyzer/result/sorted_results.py +151 -0
  181. model_analyzer/state/__init__.py +15 -0
  182. model_analyzer/state/analyzer_state.py +76 -0
  183. model_analyzer/state/analyzer_state_manager.py +215 -0
  184. model_analyzer/triton/__init__.py +15 -0
  185. model_analyzer/triton/client/__init__.py +15 -0
  186. model_analyzer/triton/client/client.py +234 -0
  187. model_analyzer/triton/client/client_factory.py +57 -0
  188. model_analyzer/triton/client/grpc_client.py +104 -0
  189. model_analyzer/triton/client/http_client.py +107 -0
  190. model_analyzer/triton/model/__init__.py +15 -0
  191. model_analyzer/triton/model/model_config.py +556 -0
  192. model_analyzer/triton/model/model_config_variant.py +29 -0
  193. model_analyzer/triton/server/__init__.py +15 -0
  194. model_analyzer/triton/server/server.py +76 -0
  195. model_analyzer/triton/server/server_config.py +269 -0
  196. model_analyzer/triton/server/server_docker.py +229 -0
  197. model_analyzer/triton/server/server_factory.py +306 -0
  198. model_analyzer/triton/server/server_local.py +158 -0
  199. triton_model_analyzer-1.48.0.dist-info/METADATA +52 -0
  200. triton_model_analyzer-1.48.0.dist-info/RECORD +204 -0
  201. triton_model_analyzer-1.48.0.dist-info/WHEEL +5 -0
  202. triton_model_analyzer-1.48.0.dist-info/entry_points.txt +2 -0
  203. triton_model_analyzer-1.48.0.dist-info/licenses/LICENSE +67 -0
  204. triton_model_analyzer-1.48.0.dist-info/top_level.txt +1 -0
@@ -0,0 +1,882 @@
+ #!/usr/bin/env python3
+
+ # Copyright 2020-2023, NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+ #
+ # Licensed under the Apache License, Version 2.0 (the "License");
+ # you may not use this file except in compliance with the License.
+ # You may obtain a copy of the License at
+ #
+ #     http://www.apache.org/licenses/LICENSE-2.0
+ #
+ # Unless required by applicable law or agreed to in writing, software
+ # distributed under the License is distributed on an "AS IS" BASIS,
+ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ # See the License for the specific language governing permissions and
+ # limitations under the License.
+
+ import csv
+ import glob
+ import logging
+ import os
+ import re
+ import signal
+ import tempfile
+ import time
+ from csv import DictReader
+ from subprocess import STDOUT, Popen
+ from typing import Dict, List, Optional
+
+ import psutil
+
+ from model_analyzer.config.input.config_defaults import DEFAULT_MODEL_TYPE
+ from model_analyzer.constants import (
+     GENAI_PERF_COLLATERAL,
+     GENAI_PERF_CSV,
+     INTERVAL_SLEEP_TIME,
+     LOGGER_NAME,
+     MEASUREMENT_REQUEST_COUNT_STEP,
+     MEASUREMENT_WINDOW_STEP,
+     PERF_ANALYZER_MEASUREMENT_WINDOW,
+     PERF_ANALYZER_MINIMUM_REQUEST_COUNT,
+ )
+ from model_analyzer.model_analyzer_exceptions import TritonModelAnalyzerException
+ from model_analyzer.record.record import Record
+ from model_analyzer.record.types.gpu_free_memory import GPUFreeMemory
+ from model_analyzer.record.types.gpu_power_usage import GPUPowerUsage
+ from model_analyzer.record.types.gpu_used_memory import GPUUsedMemory
+ from model_analyzer.record.types.gpu_utilization import GPUUtilization
+ from model_analyzer.record.types.inter_token_latency_avg import InterTokenLatencyAvg
+ from model_analyzer.record.types.inter_token_latency_max import InterTokenLatencyMax
+ from model_analyzer.record.types.inter_token_latency_min import InterTokenLatencyMin
+ from model_analyzer.record.types.inter_token_latency_p25 import InterTokenLatencyP25
+ from model_analyzer.record.types.inter_token_latency_p50 import InterTokenLatencyP50
+ from model_analyzer.record.types.inter_token_latency_p75 import InterTokenLatencyP75
+ from model_analyzer.record.types.inter_token_latency_p90 import InterTokenLatencyP90
+ from model_analyzer.record.types.inter_token_latency_p95 import InterTokenLatencyP95
+ from model_analyzer.record.types.inter_token_latency_p99 import InterTokenLatencyP99
+ from model_analyzer.record.types.output_token_throughput import OutputTokenThroughput
+ from model_analyzer.record.types.perf_client_response_wait import PerfClientResponseWait
+ from model_analyzer.record.types.perf_client_send_recv import PerfClientSendRecv
+ from model_analyzer.record.types.perf_latency_avg import PerfLatencyAvg
+ from model_analyzer.record.types.perf_latency_p90 import PerfLatencyP90
+ from model_analyzer.record.types.perf_latency_p95 import PerfLatencyP95
+ from model_analyzer.record.types.perf_latency_p99 import PerfLatencyP99
+ from model_analyzer.record.types.perf_server_compute_infer import PerfServerComputeInfer
+ from model_analyzer.record.types.perf_server_compute_input import PerfServerComputeInput
+ from model_analyzer.record.types.perf_server_compute_output import (
+     PerfServerComputeOutput,
+ )
+ from model_analyzer.record.types.perf_server_queue import PerfServerQueue
+ from model_analyzer.record.types.perf_throughput import PerfThroughput
+ from model_analyzer.record.types.time_to_first_token_avg import TimeToFirstTokenAvg
+ from model_analyzer.record.types.time_to_first_token_max import TimeToFirstTokenMax
+ from model_analyzer.record.types.time_to_first_token_min import TimeToFirstTokenMin
+ from model_analyzer.record.types.time_to_first_token_p25 import TimeToFirstTokenP25
+ from model_analyzer.record.types.time_to_first_token_p50 import TimeToFirstTokenP50
+ from model_analyzer.record.types.time_to_first_token_p75 import TimeToFirstTokenP75
+ from model_analyzer.record.types.time_to_first_token_p90 import TimeToFirstTokenP90
+ from model_analyzer.record.types.time_to_first_token_p95 import TimeToFirstTokenP95
+ from model_analyzer.record.types.time_to_first_token_p99 import TimeToFirstTokenP99
+
+ logger = logging.getLogger(LOGGER_NAME)
+
+
+ class PerfAnalyzer:
+     """
+     This class provides an interface for running workloads
+     with perf_analyzer.
+     """
+
+     GPU_METRIC_UUID = 0
+     GPU_METRIC_VALUE = 1
+
+     PA_SUCCESS, PA_FAIL, PA_RETRY = 0, 1, 2
+
+     METRIC_TAG, CSV_STRING, RECORD_CLASS, REDUCTION_FACTOR = 0, 1, 2, 3
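+     # Each row of the metric tables below is indexed by the constants above:
+     # [metric tag, perf_analyzer CSV column header, Record subclass, reduction factor]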
+     perf_metric_table = [
+         ["perf_latency_avg", "Avg latency", PerfLatencyAvg, "1000"],
+         ["perf_latency_p90", "p90 latency", PerfLatencyP90, "1000"],
+         ["perf_latency_p95", "p95 latency", PerfLatencyP95, "1000"],
+         ["perf_latency_p99", "p99 latency", PerfLatencyP99, "1000"],
+         ["perf_throughput", "Inferences/Second", PerfThroughput, "1"],
+         ["perf_client_send_recv", "request/response", PerfClientSendRecv, "1000"],
+         ["perf_client_send_recv", "send/recv", PerfClientSendRecv, "1000"],
+         ["perf_client_response_wait", "response wait", PerfClientResponseWait, "1000"],
+         ["perf_server_queue", "Server Queue", PerfServerQueue, "1000"],
+         [
+             "perf_server_compute_infer",
+             "Server Compute Infer",
+             PerfServerComputeInfer,
+             "1000",
+         ],
+         [
+             "perf_server_compute_input",
+             "Server Compute Input",
+             PerfServerComputeInput,
+             "1000",
+         ],
+         [
+             "perf_server_compute_output",
+             "Server Compute Output",
+             PerfServerComputeOutput,
+             "1000",
+         ],
+     ]
+
+     gpu_metric_table = [
+         ["gpu_utilization", "Avg GPU Utilization", GPUUtilization, "0.01"],
+         ["gpu_power_usage", "Avg GPU Power Usage", GPUPowerUsage, "1"],
+         ["gpu_used_memory", "Max GPU Memory Usage", GPUUsedMemory, "1000000"],
+         ["gpu_free_memory", "Total GPU Memory", GPUFreeMemory, "1000000"],
+     ]
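+     # Note: perf_analyzer reports "Total GPU Memory" in its CSV; that value is
+     # converted into an actual free-memory figure in _cleanup_gpu_records() by
+     # subtracting the matching GPUUsedMemory record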
+
+     llm_metric_table = [
+         [
+             "time_to_first_token_avg",
+             "Time To First Token (ns) avg",
+             TimeToFirstTokenAvg,
+             "1000",
+         ],
+         [
+             "time_to_first_token_min",
+             "Time To First Token (ns) min",
+             TimeToFirstTokenMin,
+             "1000",
+         ],
+         [
+             "time_to_first_token_max",
+             "Time To First Token (ns) max",
+             TimeToFirstTokenMax,
+             "1000",
+         ],
+         [
+             "time_to_first_token_p99",
+             "Time To First Token (ns) p99",
+             TimeToFirstTokenP99,
+             "1000",
+         ],
+         [
+             "time_to_first_token_p95",
+             "Time To First Token (ns) p95",
+             TimeToFirstTokenP95,
+             "1000",
+         ],
+         [
+             "time_to_first_token_p90",
+             "Time To First Token (ns) p90",
+             TimeToFirstTokenP90,
+             "1000",
+         ],
+         [
+             "time_to_first_token_p75",
+             "Time To First Token (ns) p75",
+             TimeToFirstTokenP75,
+             "1000",
+         ],
+         [
+             "time_to_first_token_p50",
+             "Time To First Token (ns) p50",
+             TimeToFirstTokenP50,
+             "1000",
+         ],
+         [
+             "time_to_first_token_p25",
+             "Time To First Token (ns) p25",
+             TimeToFirstTokenP25,
+             "1000",
+         ],
+         [
+             "inter_token_latency_avg",
+             "Inter Token Latency (ns) avg",
+             InterTokenLatencyAvg,
+             "1000",
+         ],
+         [
+             "inter_token_latency_min",
+             "Inter Token Latency (ns) min",
+             InterTokenLatencyMin,
+             "1000",
+         ],
+         [
+             "inter_token_latency_max",
+             "Inter Token Latency (ns) max",
+             InterTokenLatencyMax,
+             "1000",
+         ],
+         [
+             "inter_token_latency_p99",
+             "Inter Token Latency (ns) p99",
+             InterTokenLatencyP99,
+             "1000",
+         ],
+         [
+             "inter_token_latency_p95",
+             "Inter Token Latency (ns) p95",
+             InterTokenLatencyP95,
+             "1000",
+         ],
+         [
+             "inter_token_latency_p90",
+             "Inter Token Latency (ns) p90",
+             InterTokenLatencyP90,
+             "1000",
+         ],
+         [
+             "inter_token_latency_p75",
+             "Inter Token Latency (ns) p75",
+             InterTokenLatencyP75,
+             "1000",
+         ],
+         [
+             "inter_token_latency_p50",
+             "Inter Token Latency (ns) p50",
+             InterTokenLatencyP50,
+             "1000",
+         ],
+         [
+             "inter_token_latency_p25",
+             "Inter Token Latency (ns) p25",
+             InterTokenLatencyP25,
+             "1000",
+         ],
+         [
+             "output_token_throughput",
+             "Output Token Throughput (per sec) avg",
+             OutputTokenThroughput,
+             "1",
+         ],
+     ]
+
+     @staticmethod
+     def get_perf_metrics():
+         perf_metrics = [
+             perf_metric[PerfAnalyzer.RECORD_CLASS]
+             for perf_metric in PerfAnalyzer.perf_metric_table
+         ]
+         return perf_metrics
+
+     @staticmethod
+     def get_gpu_metrics():
+         gpu_metrics = [
+             gpu_metric[PerfAnalyzer.RECORD_CLASS]
+             for gpu_metric in PerfAnalyzer.gpu_metric_table
+         ]
+         return gpu_metrics
+
+     @staticmethod
+     def get_llm_metrics():
+         llm_metrics = [
+             llm_metric[PerfAnalyzer.RECORD_CLASS]
+             for llm_metric in PerfAnalyzer.llm_metric_table
+         ]
+         return llm_metrics
+
+     def __init__(
+         self,
+         path,
+         config,
+         max_retries,
+         timeout,
+         max_cpu_util,
+         model_type=DEFAULT_MODEL_TYPE,
+     ):
+         """
+         Parameters
+         ----------
+         path : str
+             Full path to the perf_analyzer executable
+         config : RunConfig
+             The RunConfig with information on what to execute
+         max_retries : int
+             Maximum number of times perf_analyzer adjusts parameters
+             in an attempt to profile a model.
+         timeout : int
+             Maximum number of seconds that perf_analyzer
+             will wait until the execution is complete.
+         max_cpu_util : float
+             Maximum CPU utilization allowed for perf_analyzer
+         model_type : str
+             The type of model being profiled (e.g. "LLM")
+         """
+
+         self.bin_path = path
+         self._config = config
+         self._max_retries = max_retries
+         self._timeout = timeout
+         self._output = ""
+         self._perf_records = {}
+         self._llm_records = {}
+         self._gpu_records = []
+         self._max_cpu_util = max_cpu_util
+         self._model_type = model_type
+
+     def run(self, metrics, env=None):
+         """
+         Runs perf_analyzer with the initialized configuration.
+
+         Parameters
+         ----------
+         metrics : List of Record types
+             The list of record types to parse from
+             Perf Analyzer
+         env : dict
+             Environment variables to set for the perf_analyzer run
+
+         Returns
+         -------
+         int
+             PerfAnalyzer.PA_SUCCESS if profiling succeeded (or no metrics
+             were requested), else PerfAnalyzer.PA_FAIL
+
+         Raises
+         ------
+         TritonModelAnalyzerException
+             If subprocess throws CalledProcessError
+         """
+
+         if metrics:
+             # Synchronously start and finish run
+             for _ in range(self._max_retries):
+                 status = self._execute_pa(env)
+
+                 if status == self.PA_FAIL:
+                     return status
+                 elif status == self.PA_SUCCESS:
+                     self._parse_outputs(metrics)
+                     break
+                 elif status == self.PA_RETRY:
+                     continue
+                 else:
+                     raise TritonModelAnalyzerException(f"Unexpected PA return {status}")
+
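+             # for/else: the branch below runs only when all retries were
+             # consumed without a break, i.e. no attempt succeeded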
+             else:
+                 logger.info(
+                     f"Ran perf_analyzer {self._max_retries} times, "
+                     "but no valid requests recorded"
+                 )
+                 return self.PA_FAIL
+
+         return self.PA_SUCCESS
+
+     def get_perf_records(self):
+         """
+         Returns
+         -------
+         The perf records from the last perf_analyzer run
+         """
+
+         if self._perf_records:
+             return self._perf_records
+         raise TritonModelAnalyzerException(
+             "Attempted to get perf_analyzer results without calling run first."
+         )
+
+     def get_llm_records(self):
+         """
+         Returns
+         -------
+         The LLM records from the last perf_analyzer run
+         """
+
+         if self._llm_records:
+             return self._llm_records
+         raise TritonModelAnalyzerException(
+             "Attempted to get perf_analyzer results without calling run first."
+         )
+
+     def get_gpu_records(self):
+         """
+         Returns
+         -------
+         The GPU records from the last perf_analyzer run
+         """
+
+         return self._gpu_records
+
+     def output(self):
+         """
+         Returns
+         -------
+         The stdout output of the last perf_analyzer run
+         """
+
+         if not self._output:
+             logger.info("perf_analyzer did not produce any output.")
+         return self._output
+
+     def get_cmd(self):
+         """
+         Returns a string of the command to run
+         """
+         return " ".join(self._get_cmd())
+
+     def _execute_pa(self, env):
+         cmd = self._get_cmd()
+         logger.debug(f"Running {cmd}")
+         perf_analyzer_env = self._create_env(env)
+
+         process = self._create_process(cmd, perf_analyzer_env)
+         status = self._resolve_process(process)
+
+         return status
+
+     def _get_cmd(self):
+         if self._is_multi_model():
+             cmd = ["mpiexec", "--allow-run-as-root", "--tag-output"]
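+             # One "-n 1 <perf_analyzer args>" segment is appended per model,
+             # with ":" separating the segments in the single mpiexec call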
+             for index in range(len(self._config.model_run_configs())):
+                 if index:
+                     cmd += [":"]
+                 cmd += ["-n", "1"]
+                 cmd += self._get_single_model_cmd(index)
+         else:
+             cmd = self._get_single_model_cmd(0)
+         return cmd
+
+     def _get_single_model_cmd(self, index):
+         if self._model_type == "LLM":
+             cmd = ["genai-perf", "-m", self._config.models_name()]
+             cmd += self._get_genai_perf_cli_command(index).replace("=", " ").split()
+             cmd += ["--"]
+             cmd += (
+                 self._get_pa_cli_command(index, exclude_model_name=True)
+                 .replace("=", " ")
+                 .split()
+             )
+         else:
+             cmd = [self.bin_path]
+             if self._is_multi_model():
+                 cmd += ["--enable-mpi"]
+             cmd += self._get_pa_cli_command(index).replace("=", " ").split()
+
+         return cmd
+
+     def _get_pa_cli_command(self, index, exclude_model_name=False):
+         return (
+             self._config.model_run_configs()[index]
+             .perf_config()
+             .to_cli_string(exclude_model_name)
+         )
+
+     def _get_genai_perf_cli_command(self, index):
+         return self._config.genai_perf_config().to_cli_string()
+
+     def _create_env(self, env):
+         perf_analyzer_env = os.environ.copy()
+
+         if env:
+             for variable, value in env.items():
+                 if value.find("$") == -1:
+                     # Plain values can be set directly
+                     perf_analyzer_env[variable] = value
+                 else:
+                     # Expand values that reference other environment variables
+                     perf_analyzer_env[variable] = os.path.expandvars(value)
+
+         return perf_analyzer_env
+
+     def _create_process(self, cmd, perf_analyzer_env):
+         self._cmd_log = tempfile.NamedTemporaryFile()
+         try:
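+             # stdout/stderr are redirected to a temp file and read back after
+             # the process exits (see _get_process_output)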
+             process = Popen(
+                 cmd,
+                 start_new_session=True,
+                 stdout=self._cmd_log,
+                 stderr=STDOUT,
+                 encoding="utf-8",
+                 env=perf_analyzer_env,
+             )
+         except FileNotFoundError as e:
+             raise TritonModelAnalyzerException(f"perf_analyzer binary not found : {e}")
+         return process
+
+     def _verify_output_files_exist(self):
+         """
+         Verify that perf_analyzer created the expected output files.
+         Waits briefly to handle filesystem buffering delays.
+         Returns True if all expected files exist, False otherwise.
+         """
+
+         max_wait_time = 2.0  # seconds
+         wait_interval = 0.1  # seconds
+         max_attempts = int(max_wait_time / wait_interval)
+
+         for perf_config in [
+             mrc.perf_config() for mrc in self._config.model_run_configs()
+         ]:
+             latency_file = perf_config["latency-report-file"]
+
+             file_found = False
+             for attempt in range(max_attempts):
+                 if os.path.isfile(latency_file):
+                     file_found = True
+                     break
+                 if attempt < max_attempts - 1:  # Don't sleep on last attempt
+                     time.sleep(wait_interval)
+
+             if not file_found:
+                 logger.error(f"Expected output file not found: {latency_file}")
+                 return False
+
+         return True
+
+     def _resolve_process(self, process):
+         if self._poll_perf_analyzer(process) == self.PA_FAIL:
+             return self.PA_FAIL
+
+         if process.returncode > 0:
+             if self._auto_adjust_parameters(process) == self.PA_FAIL:
+                 return self.PA_FAIL
+             else:
+                 return self.PA_RETRY
+         elif process.returncode < 0:
+             logger.error(
+                 "perf_analyzer was terminated by signal: "
+                 f"{signal.Signals(abs(process.returncode)).name}"
+             )
+             return self.PA_FAIL
+
+         if not self._verify_output_files_exist():
+             logger.error(
+                 "perf_analyzer returned success but did not create expected output files"
+             )
+             logger.error("perf_analyzer output:")
+             if self._output:
+                 logger.error(self._output)
+             else:
+                 logger.error("(no output captured)")
+             # Check if this is due to the measurement window being too small
+             if self._auto_adjust_parameters(process) == self.PA_FAIL:
+                 return self.PA_FAIL
+             else:
+                 return self.PA_RETRY
+
+         return self.PA_SUCCESS
+
+     def _poll_perf_analyzer(self, process):
+         """
+         Periodically poll perf_analyzer to collect its output, and kill it
+         if it takes too much time or too many CPU resources
+         """
+
+         current_timeout = self._timeout
+         process_util = psutil.Process(process.pid)
+
+         while current_timeout > 0:
+             if process.poll() is not None:
+                 self._output = self._get_process_output()
+                 break
+
+             # Is perf_analyzer using too much CPU?
+             cpu_util = process_util.cpu_percent(INTERVAL_SLEEP_TIME)
+             if cpu_util > self._max_cpu_util:
+                 logger.info(
+                     f"perf_analyzer used a significant amount of CPU resources ({cpu_util}%), killing perf_analyzer"
+                 )
+                 self._output = self._get_process_output()
+                 process.kill()
+
+                 return self.PA_FAIL
+
+             current_timeout -= INTERVAL_SLEEP_TIME
+         else:
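+             # while/else: reached only when the timeout expired before the
+             # process exited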
+             logger.info("perf_analyzer took too long to exit, killing perf_analyzer")
+             process.kill()
+
+             return self.PA_FAIL
+
+         return self.PA_SUCCESS
+
+     def _get_process_output(self):
+         self._cmd_log.seek(0)
+         tmp_output = self._cmd_log.read()
+         self._cmd_log.close()
+
+         # PA has occasionally output non-UTF-8 bytes which would cause MA
+         # to assert. In that case, just ignore the result instead of asserting
+         result = ""
+         try:
+             result = tmp_output.decode("utf-8")
+         except Exception:
+             # Ignore the result if decoding failed
+             pass
+
+         return result
+
+     def _auto_adjust_parameters(self, process):
+         """
+         Attempt to update PA parameters based on the output
+         """
+         logger.debug(
+             "_auto_adjust_parameters called. "
+             f"returncode={process.returncode}, "
+             f"output_length={len(self._output)}, "
+             f"has_failed_msg={'Failed to obtain stable measurement' in self._output}, "
+             f"has_larger_window_msg={'Please use a larger time window' in self._output}"
+         )
+         if (
+             self._output.find("Failed to obtain stable measurement") != -1
+             or self._output.find("Please use a larger time window") != -1
+         ):
+             logger.debug("Found error message, will adjust parameters")
+             per_rank_logs = self._split_output_per_rank()
+
+             for index, log in enumerate(per_rank_logs):
+                 perf_config = self._config.model_run_configs()[index].perf_config()
+                 self._auto_adjust_parameters_for_perf_config(perf_config, log)
+
+             return self.PA_SUCCESS
+         else:
+             clamped_output = self._output[:1000]
+             logger.info(
+                 f"Running perf_analyzer failed with"
+                 f" exit status {process.returncode}:\n{clamped_output}"
+             )
+             return self.PA_FAIL
+
+     def _auto_adjust_parameters_for_perf_config(self, perf_config, log):
+         if (
+             log.find("Failed to obtain stable measurement") != -1
+             or log.find("Please use a larger time window") != -1
+         ):
+             logger.debug(
+                 "Found measurement error in log, will adjust parameters. "
+                 f"measurement-mode={perf_config['measurement-mode']}, "
+                 f"current measurement-interval={perf_config['measurement-interval']}"
+             )
+             if perf_config["measurement-mode"] == "time_windows":
+                 if perf_config["measurement-interval"] is None:
+                     perf_config["measurement-interval"] = (
+                         PERF_ANALYZER_MEASUREMENT_WINDOW + MEASUREMENT_WINDOW_STEP
+                     )
+                 else:
+                     perf_config["measurement-interval"] = (
+                         int(perf_config["measurement-interval"])
+                         + MEASUREMENT_WINDOW_STEP
+                     )
+
+                 logger.info(
+                     "perf_analyzer's measurement window is too small, "
+                     f"increased to {perf_config['measurement-interval']} ms."
+                 )
+             elif (
+                 perf_config["measurement-mode"] is None
+                 or perf_config["measurement-mode"] == "count_windows"
+             ):
+                 if perf_config["measurement-request-count"] is None:
+                     perf_config["measurement-request-count"] = (
+                         PERF_ANALYZER_MINIMUM_REQUEST_COUNT
+                         + MEASUREMENT_REQUEST_COUNT_STEP
+                     )
+                 else:
+                     perf_config["measurement-request-count"] = (
+                         int(perf_config["measurement-request-count"])
+                         + MEASUREMENT_REQUEST_COUNT_STEP
+                     )
+
+                 logger.info(
+                     "perf_analyzer's request count is too small, "
+                     f"increased to {perf_config['measurement-request-count']}."
+                 )
+
+     def _split_output_per_rank(self):
+         if self._is_multi_model():
+             outputs = ["" for mrc in self._config.model_run_configs()]
+             for line in self._output.splitlines():
+                 # Example: for the line "[1,2]<stdout>: fake output ***"
+                 # this regex captures the rank '2'
+                 rank = re.search(r"^\[\d+,(\d+)\]", line)
+
+                 if rank:
+                     index = int(rank.group(1))
+                     outputs[index] += line + "\n"
+             return outputs
+         else:
+             return [self._output]
+
+     def _is_multi_model(self):
+         """
+         Returns True if the RunConfig provided to this class contains
+         multiple perf_configs, else False
+         """
+         return len(self._config.model_run_configs()) > 1
+
+     def _parse_outputs(self, metrics):
+         self._parse_generic_outputs(metrics)
+
+         if self._model_type == "LLM":
+             self._parse_llm_outputs(metrics)
+
+     def _parse_generic_outputs(self, metrics):
+         """
+         Extract records from the Perf Analyzer run for each model
+         """
+
+         for perf_config in [
+             mrc.perf_config() for mrc in self._config.model_run_configs()
+         ]:
+             latency_file = perf_config["latency-report-file"]
+             logger.debug(f"Reading PA results from {latency_file}")
+
+             with open(latency_file, mode="r") as f:
+                 csv_reader = csv.DictReader(f, delimiter=",")
+
+                 for row in csv_reader:
+                     self._perf_records[
+                         perf_config["model-name"]
+                     ] = self._extract_perf_records_from_row(metrics, row)
+                     self._gpu_records = self._extract_gpu_records_from_row(metrics, row)
+
+         for perf_config in [
+             mrc.perf_config() for mrc in self._config.model_run_configs()
+         ]:
+             # Remove the latency file and all associated composing model latency files
+             for f in glob.glob(f"*{perf_config['latency-report-file']}"):
+                 os.remove(f)
+
+     def _parse_llm_outputs(self, metrics):
+         """
+         Extract LLM records from the GenAI-Perf CSV output
+         """
+
+         perf_config = self._config.model_run_configs()[0].perf_config()
+
+         logger.debug(f"Reading GENAI-PERF results from {GENAI_PERF_CSV}")
+         with open(GENAI_PERF_CSV, mode="r") as f:
+             csv_reader = list(csv.DictReader(f, delimiter=","))
+
+             # See test_perf_analyzer::test_pa_llm_csv_output() for CSV output example
+             self._llm_records[perf_config["model-name"]] = self._extract_llm_records(
+                 metrics, csv_reader
+             )
+
+         os.remove(GENAI_PERF_CSV)
+         for filename in GENAI_PERF_COLLATERAL:
+             os.remove(filename)
+
+     def _extract_perf_records_from_row(
+         self, requested_metrics: List[Record], row_metrics: Dict[str, str]
+     ) -> List[Record]:
+         perf_records: List[Record] = []
+         for perf_metric in PerfAnalyzer.perf_metric_table:
+             if self._is_metric_requested_and_in_row(
+                 perf_metric, requested_metrics, row_metrics
+             ):
+                 value = float(row_metrics[str(perf_metric[PerfAnalyzer.CSV_STRING])])
+                 reduction_factor = float(str(perf_metric[PerfAnalyzer.REDUCTION_FACTOR]))
+                 perf_value = value / reduction_factor
+
+                 perf_records.append(
+                     perf_metric[PerfAnalyzer.RECORD_CLASS](perf_value)  # type: ignore
+                 )
+
+         return perf_records
+
+     def _extract_gpu_records_from_row(
+         self, requested_metrics: List[Record], row_metrics: Dict[str, str]
+     ) -> List[Record]:
+         # GPU metrics have the following format: UUID0:value0;UUID1:value1;...
+         gpu_records: List[Record] = []
+         for gpu_metric in PerfAnalyzer.gpu_metric_table:
+             if self._is_metric_requested_and_in_row(
+                 gpu_metric, requested_metrics, row_metrics
+             ):
+                 gpu_metric_string = row_metrics[str(gpu_metric[PerfAnalyzer.CSV_STRING])]
+
+                 # Covers the case where PA didn't provide data
+                 if not gpu_metric_string:
+                     continue
+
+                 # Needed because PA might terminate the substring with a ;
+                 if gpu_metric_string and gpu_metric_string[-1] == ";":
+                     gpu_metric_string = gpu_metric_string[:-1]
+
+                 gpu_metric_string_tuples = gpu_metric_string.split(";")
+
+                 for gpu_metric_string_tuple in gpu_metric_string_tuples:
+                     gpu_metric_tuple = gpu_metric_string_tuple.split(":")
+
+                     uuid = gpu_metric_tuple[PerfAnalyzer.GPU_METRIC_UUID]
+                     tmp_value = float(gpu_metric_tuple[PerfAnalyzer.GPU_METRIC_VALUE])
+                     reduction_factor = float(str(gpu_metric[PerfAnalyzer.REDUCTION_FACTOR]))
+                     value = tmp_value / reduction_factor
+
+                     record = gpu_metric[PerfAnalyzer.RECORD_CLASS](
+                         value=value, device_uuid=uuid
+                     )  # type: ignore
+
+                     gpu_records.append(record)
+
+         self._cleanup_gpu_records(gpu_records)
+         return gpu_records
+
+     def _extract_llm_records(
+         self, requested_metrics: List[Record], csv_reader: DictReader
+     ) -> List[Record]:
+         llm_records: List[Record] = []
+
+         for requested_metric in requested_metrics:
+             new_llm_record = self._get_llm_record_from_csv(requested_metric, csv_reader)
+             if new_llm_record:
+                 llm_records.append(new_llm_record)
+
+         return llm_records
+
+     def _get_llm_record_from_csv(
+         self, requested_metric: Record, csv_reader: DictReader
+     ) -> Optional[Record]:
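+         # The GenAI-Perf CSV contains a "Metric" column plus one column per
+         # stat (avg, min, max, p99, ...); "<Metric> <stat>" is matched against
+         # the CSV_STRING entries in llm_metric_table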
+         for row in csv_reader:
+             for key, value in row.items():
+                 metric_string = f"{row['Metric']} {key}"
+                 llm_metric = self._find_corresponding_llm_metric_row(metric_string)
+
+                 if (
+                     llm_metric
+                     and llm_metric[PerfAnalyzer.METRIC_TAG] == requested_metric.tag
+                 ):
+                     adjusted_value = float(value) / float(
+                         llm_metric[PerfAnalyzer.REDUCTION_FACTOR]
+                     )
+
+                     llm_record = llm_metric[PerfAnalyzer.RECORD_CLASS](adjusted_value)  # type: ignore
+                     return llm_record
+
+         return None
+
+     def _find_corresponding_llm_metric_row(self, metric_string: str) -> Optional[List]:
+         for row in PerfAnalyzer.llm_metric_table:
+             if metric_string == row[PerfAnalyzer.CSV_STRING]:
+                 return row
+
+         return None
+
+     def _cleanup_gpu_records(self, gpu_records):
+         # Recalculate GPUFreeMemory by removing the value of the associated GPUUsedMemory
+         # Remove any GPUFreeMemory records that don't have a matching GPUUsedMemory
+         indexes_to_remove = []
+         for i, record in enumerate(gpu_records):
+             if type(record) == GPUFreeMemory:
+                 # Find the GPUUsedMemory record with the matching device UUID
+                 found = False
+                 for other_record in gpu_records:
+                     if (
+                         type(other_record) == GPUUsedMemory
+                         and record.device_uuid() == other_record.device_uuid()
+                     ):
+                         found = True
+                         record._value = record.value() - other_record.value()
+                         break
+                 if not found:
+                     indexes_to_remove.append(i)
+         for i in reversed(indexes_to_remove):
+             del gpu_records[i]
+
+     def _is_metric_requested_and_in_row(
+         self,
+         metric: List[object],
+         requested_metrics: List[Record],
+         row_metrics: Dict[str, str],
+     ) -> bool:
+         tag_match = any(
+             metric[PerfAnalyzer.METRIC_TAG] in requested_metric.tag
+             for requested_metric in requested_metrics
+         )
+
+         return tag_match and metric[PerfAnalyzer.CSV_STRING] in row_metrics