triton-model-analyzer 1.48.0 (py3-none-any.whl)

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (204)
  1. model_analyzer/__init__.py +15 -0
  2. model_analyzer/analyzer.py +448 -0
  3. model_analyzer/cli/__init__.py +15 -0
  4. model_analyzer/cli/cli.py +193 -0
  5. model_analyzer/config/__init__.py +15 -0
  6. model_analyzer/config/generate/__init__.py +15 -0
  7. model_analyzer/config/generate/automatic_model_config_generator.py +164 -0
  8. model_analyzer/config/generate/base_model_config_generator.py +352 -0
  9. model_analyzer/config/generate/brute_plus_binary_parameter_search_run_config_generator.py +164 -0
  10. model_analyzer/config/generate/brute_run_config_generator.py +154 -0
  11. model_analyzer/config/generate/concurrency_sweeper.py +75 -0
  12. model_analyzer/config/generate/config_generator_interface.py +52 -0
  13. model_analyzer/config/generate/coordinate.py +143 -0
  14. model_analyzer/config/generate/coordinate_data.py +86 -0
  15. model_analyzer/config/generate/generator_utils.py +116 -0
  16. model_analyzer/config/generate/manual_model_config_generator.py +187 -0
  17. model_analyzer/config/generate/model_config_generator_factory.py +92 -0
  18. model_analyzer/config/generate/model_profile_spec.py +74 -0
  19. model_analyzer/config/generate/model_run_config_generator.py +154 -0
  20. model_analyzer/config/generate/model_variant_name_manager.py +150 -0
  21. model_analyzer/config/generate/neighborhood.py +536 -0
  22. model_analyzer/config/generate/optuna_plus_concurrency_sweep_run_config_generator.py +141 -0
  23. model_analyzer/config/generate/optuna_run_config_generator.py +838 -0
  24. model_analyzer/config/generate/perf_analyzer_config_generator.py +312 -0
  25. model_analyzer/config/generate/quick_plus_concurrency_sweep_run_config_generator.py +130 -0
  26. model_analyzer/config/generate/quick_run_config_generator.py +753 -0
  27. model_analyzer/config/generate/run_config_generator_factory.py +329 -0
  28. model_analyzer/config/generate/search_config.py +112 -0
  29. model_analyzer/config/generate/search_dimension.py +73 -0
  30. model_analyzer/config/generate/search_dimensions.py +85 -0
  31. model_analyzer/config/generate/search_parameter.py +49 -0
  32. model_analyzer/config/generate/search_parameters.py +388 -0
  33. model_analyzer/config/input/__init__.py +15 -0
  34. model_analyzer/config/input/config_command.py +483 -0
  35. model_analyzer/config/input/config_command_profile.py +1747 -0
  36. model_analyzer/config/input/config_command_report.py +267 -0
  37. model_analyzer/config/input/config_defaults.py +236 -0
  38. model_analyzer/config/input/config_enum.py +83 -0
  39. model_analyzer/config/input/config_field.py +216 -0
  40. model_analyzer/config/input/config_list_generic.py +112 -0
  41. model_analyzer/config/input/config_list_numeric.py +151 -0
  42. model_analyzer/config/input/config_list_string.py +111 -0
  43. model_analyzer/config/input/config_none.py +71 -0
  44. model_analyzer/config/input/config_object.py +129 -0
  45. model_analyzer/config/input/config_primitive.py +81 -0
  46. model_analyzer/config/input/config_status.py +75 -0
  47. model_analyzer/config/input/config_sweep.py +83 -0
  48. model_analyzer/config/input/config_union.py +113 -0
  49. model_analyzer/config/input/config_utils.py +128 -0
  50. model_analyzer/config/input/config_value.py +243 -0
  51. model_analyzer/config/input/objects/__init__.py +15 -0
  52. model_analyzer/config/input/objects/config_model_profile_spec.py +325 -0
  53. model_analyzer/config/input/objects/config_model_report_spec.py +173 -0
  54. model_analyzer/config/input/objects/config_plot.py +198 -0
  55. model_analyzer/config/input/objects/config_protobuf_utils.py +101 -0
  56. model_analyzer/config/input/yaml_config_validator.py +82 -0
  57. model_analyzer/config/run/__init__.py +15 -0
  58. model_analyzer/config/run/model_run_config.py +313 -0
  59. model_analyzer/config/run/run_config.py +168 -0
  60. model_analyzer/constants.py +76 -0
  61. model_analyzer/device/__init__.py +15 -0
  62. model_analyzer/device/device.py +24 -0
  63. model_analyzer/device/gpu_device.py +87 -0
  64. model_analyzer/device/gpu_device_factory.py +248 -0
  65. model_analyzer/entrypoint.py +307 -0
  66. model_analyzer/log_formatter.py +65 -0
  67. model_analyzer/model_analyzer_exceptions.py +24 -0
  68. model_analyzer/model_manager.py +255 -0
  69. model_analyzer/monitor/__init__.py +15 -0
  70. model_analyzer/monitor/cpu_monitor.py +69 -0
  71. model_analyzer/monitor/dcgm/DcgmDiag.py +191 -0
  72. model_analyzer/monitor/dcgm/DcgmFieldGroup.py +83 -0
  73. model_analyzer/monitor/dcgm/DcgmGroup.py +815 -0
  74. model_analyzer/monitor/dcgm/DcgmHandle.py +141 -0
  75. model_analyzer/monitor/dcgm/DcgmJsonReader.py +69 -0
  76. model_analyzer/monitor/dcgm/DcgmReader.py +623 -0
  77. model_analyzer/monitor/dcgm/DcgmStatus.py +57 -0
  78. model_analyzer/monitor/dcgm/DcgmSystem.py +412 -0
  79. model_analyzer/monitor/dcgm/__init__.py +15 -0
  80. model_analyzer/monitor/dcgm/common/__init__.py +13 -0
  81. model_analyzer/monitor/dcgm/common/dcgm_client_cli_parser.py +194 -0
  82. model_analyzer/monitor/dcgm/common/dcgm_client_main.py +86 -0
  83. model_analyzer/monitor/dcgm/dcgm_agent.py +887 -0
  84. model_analyzer/monitor/dcgm/dcgm_collectd_plugin.py +369 -0
  85. model_analyzer/monitor/dcgm/dcgm_errors.py +395 -0
  86. model_analyzer/monitor/dcgm/dcgm_field_helpers.py +546 -0
  87. model_analyzer/monitor/dcgm/dcgm_fields.py +815 -0
  88. model_analyzer/monitor/dcgm/dcgm_fields_collectd.py +671 -0
  89. model_analyzer/monitor/dcgm/dcgm_fields_internal.py +29 -0
  90. model_analyzer/monitor/dcgm/dcgm_fluentd.py +45 -0
  91. model_analyzer/monitor/dcgm/dcgm_monitor.py +138 -0
  92. model_analyzer/monitor/dcgm/dcgm_prometheus.py +326 -0
  93. model_analyzer/monitor/dcgm/dcgm_structs.py +2357 -0
  94. model_analyzer/monitor/dcgm/dcgm_telegraf.py +65 -0
  95. model_analyzer/monitor/dcgm/dcgm_value.py +151 -0
  96. model_analyzer/monitor/dcgm/dcgmvalue.py +155 -0
  97. model_analyzer/monitor/dcgm/denylist_recommendations.py +573 -0
  98. model_analyzer/monitor/dcgm/pydcgm.py +47 -0
  99. model_analyzer/monitor/monitor.py +143 -0
  100. model_analyzer/monitor/remote_monitor.py +137 -0
  101. model_analyzer/output/__init__.py +15 -0
  102. model_analyzer/output/file_writer.py +63 -0
  103. model_analyzer/output/output_writer.py +42 -0
  104. model_analyzer/perf_analyzer/__init__.py +15 -0
  105. model_analyzer/perf_analyzer/genai_perf_config.py +206 -0
  106. model_analyzer/perf_analyzer/perf_analyzer.py +882 -0
  107. model_analyzer/perf_analyzer/perf_config.py +479 -0
  108. model_analyzer/plots/__init__.py +15 -0
  109. model_analyzer/plots/detailed_plot.py +266 -0
  110. model_analyzer/plots/plot_manager.py +224 -0
  111. model_analyzer/plots/simple_plot.py +213 -0
  112. model_analyzer/record/__init__.py +15 -0
  113. model_analyzer/record/gpu_record.py +68 -0
  114. model_analyzer/record/metrics_manager.py +887 -0
  115. model_analyzer/record/record.py +280 -0
  116. model_analyzer/record/record_aggregator.py +256 -0
  117. model_analyzer/record/types/__init__.py +15 -0
  118. model_analyzer/record/types/cpu_available_ram.py +93 -0
  119. model_analyzer/record/types/cpu_used_ram.py +93 -0
  120. model_analyzer/record/types/gpu_free_memory.py +96 -0
  121. model_analyzer/record/types/gpu_power_usage.py +107 -0
  122. model_analyzer/record/types/gpu_total_memory.py +96 -0
  123. model_analyzer/record/types/gpu_used_memory.py +96 -0
  124. model_analyzer/record/types/gpu_utilization.py +108 -0
  125. model_analyzer/record/types/inter_token_latency_avg.py +60 -0
  126. model_analyzer/record/types/inter_token_latency_base.py +74 -0
  127. model_analyzer/record/types/inter_token_latency_max.py +60 -0
  128. model_analyzer/record/types/inter_token_latency_min.py +60 -0
  129. model_analyzer/record/types/inter_token_latency_p25.py +60 -0
  130. model_analyzer/record/types/inter_token_latency_p50.py +60 -0
  131. model_analyzer/record/types/inter_token_latency_p75.py +60 -0
  132. model_analyzer/record/types/inter_token_latency_p90.py +60 -0
  133. model_analyzer/record/types/inter_token_latency_p95.py +60 -0
  134. model_analyzer/record/types/inter_token_latency_p99.py +60 -0
  135. model_analyzer/record/types/output_token_throughput.py +105 -0
  136. model_analyzer/record/types/perf_client_response_wait.py +97 -0
  137. model_analyzer/record/types/perf_client_send_recv.py +97 -0
  138. model_analyzer/record/types/perf_latency.py +111 -0
  139. model_analyzer/record/types/perf_latency_avg.py +60 -0
  140. model_analyzer/record/types/perf_latency_base.py +74 -0
  141. model_analyzer/record/types/perf_latency_p90.py +60 -0
  142. model_analyzer/record/types/perf_latency_p95.py +60 -0
  143. model_analyzer/record/types/perf_latency_p99.py +60 -0
  144. model_analyzer/record/types/perf_server_compute_infer.py +97 -0
  145. model_analyzer/record/types/perf_server_compute_input.py +97 -0
  146. model_analyzer/record/types/perf_server_compute_output.py +97 -0
  147. model_analyzer/record/types/perf_server_queue.py +97 -0
  148. model_analyzer/record/types/perf_throughput.py +105 -0
  149. model_analyzer/record/types/time_to_first_token_avg.py +60 -0
  150. model_analyzer/record/types/time_to_first_token_base.py +74 -0
  151. model_analyzer/record/types/time_to_first_token_max.py +60 -0
  152. model_analyzer/record/types/time_to_first_token_min.py +60 -0
  153. model_analyzer/record/types/time_to_first_token_p25.py +60 -0
  154. model_analyzer/record/types/time_to_first_token_p50.py +60 -0
  155. model_analyzer/record/types/time_to_first_token_p75.py +60 -0
  156. model_analyzer/record/types/time_to_first_token_p90.py +60 -0
  157. model_analyzer/record/types/time_to_first_token_p95.py +60 -0
  158. model_analyzer/record/types/time_to_first_token_p99.py +60 -0
  159. model_analyzer/reports/__init__.py +15 -0
  160. model_analyzer/reports/html_report.py +195 -0
  161. model_analyzer/reports/pdf_report.py +50 -0
  162. model_analyzer/reports/report.py +86 -0
  163. model_analyzer/reports/report_factory.py +62 -0
  164. model_analyzer/reports/report_manager.py +1376 -0
  165. model_analyzer/reports/report_utils.py +42 -0
  166. model_analyzer/result/__init__.py +15 -0
  167. model_analyzer/result/constraint_manager.py +150 -0
  168. model_analyzer/result/model_config_measurement.py +354 -0
  169. model_analyzer/result/model_constraints.py +105 -0
  170. model_analyzer/result/parameter_search.py +246 -0
  171. model_analyzer/result/result_manager.py +430 -0
  172. model_analyzer/result/result_statistics.py +159 -0
  173. model_analyzer/result/result_table.py +217 -0
  174. model_analyzer/result/result_table_manager.py +646 -0
  175. model_analyzer/result/result_utils.py +42 -0
  176. model_analyzer/result/results.py +277 -0
  177. model_analyzer/result/run_config_measurement.py +658 -0
  178. model_analyzer/result/run_config_result.py +210 -0
  179. model_analyzer/result/run_config_result_comparator.py +110 -0
  180. model_analyzer/result/sorted_results.py +151 -0
  181. model_analyzer/state/__init__.py +15 -0
  182. model_analyzer/state/analyzer_state.py +76 -0
  183. model_analyzer/state/analyzer_state_manager.py +215 -0
  184. model_analyzer/triton/__init__.py +15 -0
  185. model_analyzer/triton/client/__init__.py +15 -0
  186. model_analyzer/triton/client/client.py +234 -0
  187. model_analyzer/triton/client/client_factory.py +57 -0
  188. model_analyzer/triton/client/grpc_client.py +104 -0
  189. model_analyzer/triton/client/http_client.py +107 -0
  190. model_analyzer/triton/model/__init__.py +15 -0
  191. model_analyzer/triton/model/model_config.py +556 -0
  192. model_analyzer/triton/model/model_config_variant.py +29 -0
  193. model_analyzer/triton/server/__init__.py +15 -0
  194. model_analyzer/triton/server/server.py +76 -0
  195. model_analyzer/triton/server/server_config.py +269 -0
  196. model_analyzer/triton/server/server_docker.py +229 -0
  197. model_analyzer/triton/server/server_factory.py +306 -0
  198. model_analyzer/triton/server/server_local.py +158 -0
  199. triton_model_analyzer-1.48.0.dist-info/METADATA +52 -0
  200. triton_model_analyzer-1.48.0.dist-info/RECORD +204 -0
  201. triton_model_analyzer-1.48.0.dist-info/WHEEL +5 -0
  202. triton_model_analyzer-1.48.0.dist-info/entry_points.txt +2 -0
  203. triton_model_analyzer-1.48.0.dist-info/licenses/LICENSE +67 -0
  204. triton_model_analyzer-1.48.0.dist-info/top_level.txt +1 -0
model_analyzer/model_manager.py
@@ -0,0 +1,255 @@
+ #!/usr/bin/env python3
+
+ # Copyright 2021-2023, NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+ #
+ # Licensed under the Apache License, Version 2.0 (the "License");
+ # you may not use this file except in compliance with the License.
+ # You may obtain a copy of the License at
+ #
+ # http://www.apache.org/licenses/LICENSE-2.0
+ #
+ # Unless required by applicable law or agreed to in writing, software
+ # distributed under the License is distributed on an "AS IS" BASIS,
+ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ # See the License for the specific language governing permissions and
+ # limitations under the License.
+
+ import logging
+ from typing import Dict, List, Optional
+
+ from model_analyzer.config.generate.model_variant_name_manager import (
+     ModelVariantNameManager,
+ )
+ from model_analyzer.config.generate.run_config_generator_factory import (
+     RunConfigGeneratorFactory,
+ )
+ from model_analyzer.config.generate.search_parameters import SearchParameters
+ from model_analyzer.config.input.config_command_profile import ConfigCommandProfile
+ from model_analyzer.config.input.objects.config_model_profile_spec import (
+     ConfigModelProfileSpec,
+ )
+ from model_analyzer.constants import INVALID_MEASUREMENT_THRESHOLD, LOGGER_NAME
+ from model_analyzer.device.gpu_device import GPUDevice
+ from model_analyzer.record.metrics_manager import MetricsManager
+ from model_analyzer.result.constraint_manager import ConstraintManager
+ from model_analyzer.result.result_manager import ResultManager
+ from model_analyzer.result.run_config_measurement import RunConfigMeasurement
+ from model_analyzer.state.analyzer_state_manager import AnalyzerStateManager
+ from model_analyzer.triton.client.client import TritonClient
+ from model_analyzer.triton.model.model_config import ModelConfig
+ from model_analyzer.triton.server.server import TritonServer
+
+ from .model_analyzer_exceptions import TritonModelAnalyzerException
+
+ logger = logging.getLogger(LOGGER_NAME)
+
+
+ class ModelManager:
+     """
+     This class handles the search for, creation of, and execution of run configs.
+     It also records the best results for each model.
+     """
+
+     def __init__(
+         self,
+         config: ConfigCommandProfile,
+         gpus: List[GPUDevice],
+         client: TritonClient,
+         server: TritonServer,
+         metrics_manager: MetricsManager,
+         result_manager: ResultManager,
+         state_manager: AnalyzerStateManager,
+         constraint_manager: ConstraintManager,
+         search_parameters: Dict[str, SearchParameters],
+         composing_search_parameters: Dict[str, SearchParameters],
+     ):
+         """
+         Parameters
+         ----------
+         config: ConfigCommandProfile
+             The config for the model analyzer
+         gpus: List of GPUDevice
+         client: TritonClient
+             The client handle used to send requests to Triton
+         server: TritonServer
+             The server handle used to start and stop Triton instances
+         metrics_manager: MetricsManager
+             The object that handles launching perf analyzer instances and profiling.
+         result_manager: ResultManager
+             The object that handles storing and sorting the results from the perf analyzer
+         state_manager: AnalyzerStateManager
+             The object that handles serializing the state of the analyzer and saving.
+         constraint_manager: ConstraintManager
+             The object that handles processing and applying
+             constraints on a given measurement
+         search_parameters: SearchParameters
+             The object that handles the user's configuration search parameters
+         composing_search_parameters: SearchParameters
+             The object that handles the user's configuration search parameters for composing models
+         """
+
+         self._config = config
+         self._gpus = gpus
+         self._client = client
+         self._server = server
+         self._metrics_manager = metrics_manager
+         self._result_manager = result_manager
+         self._state_manager = state_manager
+         self._constraint_manager = constraint_manager
+         self._search_parameters = search_parameters
+         self._composing_search_parameters = composing_search_parameters
+
+         if state_manager.starting_fresh_run():
+             self._init_state()
+
+         self._failed_measurement_attempts = 0
+         self._received_measurement_values_from_pa = False
+
+         self._model_variant_name_manager = ModelVariantNameManager.from_dict(
+             self._state_manager.get_state_variable(
+                 "ModelManager.model_variant_name_manager"
+             )
+         )
+
+     def run_models(self, models: List[ConfigModelProfileSpec]) -> None:
+         """
+         Generates configs, runs inferences, and gets
+         measurements for a list of models
+
+         Parameters
+         ----------
+         models : List of ConfigModelProfileSpec
+             The models to run
+         """
+
+         # Note: this is not done in config_command, because there isn't a ModelConfig yet,
+         # so we cannot determine if the model is an ensemble
+         self._check_for_ensemble_model_incompatibility(models)
+
+         self._metrics_manager.start_new_model()
+
+         # Save the global server config and update the server's config for this model run
+         server_config_copy = self._server.config().copy()
+
+         triton_server_flags = self._get_triton_server_flags(models)
+         self._server.update_config(params=triton_server_flags)
+
+         rcg = RunConfigGeneratorFactory.create_run_config_generator(
+             command_config=self._config,
+             state_manager=self._state_manager,
+             gpus=self._gpus,
+             models=models,
+             client=self._client,
+             result_manager=self._result_manager,
+             search_parameters=self._search_parameters,
+             composing_search_parameters=self._composing_search_parameters,
+             model_variant_name_manager=self._model_variant_name_manager,
+         )
+
+         for run_config in rcg.get_configs():
+             if self._state_manager.exiting():
+                 break
+
+             if run_config.is_legal_combination():
+                 measurement = self._metrics_manager.execute_run_config(run_config)
+
+                 self._check_for_valid_measurement(measurement)
+                 self._stop_ma_if_no_valid_measurement_threshold_reached()
+             else:
+                 logger.info("Skipping illegal run configuration")
+                 measurement = None
+
+             if measurement:
+                 objectives = [model.objectives() for model in models]
+                 weightings = [model.weighting() for model in models]
+
+                 measurement.set_metric_weightings(metric_objectives=objectives)
+                 measurement.set_constraint_manager(
+                     constraint_manager=self._constraint_manager
+                 )
+                 measurement.set_model_config_weighting(model_config_weights=weightings)
+
+             rcg.set_last_results([measurement])
+             self._state_manager.save_checkpoint()
+
+         self._metrics_manager.finalize()
+
+         # Reset the server args to global config
+         self._server.update_config(params=server_config_copy.server_args())
+
+         model_variant_name_manager_dict = self._state_manager.default_encode(
+             self._model_variant_name_manager
+         )
+
+         self._state_manager.set_state_variable(
+             "ModelManager.model_variant_name_manager", model_variant_name_manager_dict
+         )
+
+     def _get_triton_server_flags(self, models):
+         triton_server_flags = models[0].triton_server_flags()
+
+         for model in models:
+             if model.triton_server_flags() != triton_server_flags:
+                 raise TritonModelAnalyzerException(
+                     f"Triton server flags must be the same for all models to run concurrently"
+                 )
+
+     def _check_for_ensemble_model_incompatibility(
+         self, models: List[ConfigModelProfileSpec]
+     ) -> None:
+         for model in models:
+             model_config = ModelConfig.create_from_profile_spec(
+                 model, self._config, self._client, self._gpus
+             )
+
+             if model_config.is_ensemble():
+                 if len(models) > 1:
+                     raise TritonModelAnalyzerException(
+                         f"\nProfiling of multiple models is not supported for ensemble models"
+                     )
+
+                 if self._config.run_config_search_mode == "brute":
+                     if self._config.get_config()[
+                         "run_config_search_mode"
+                     ].is_set_by_user():
+                         raise TritonModelAnalyzerException(
+                             f"\nBrute search mode is not supported for ensemble models"
+                             "\nPlease use quick search mode (--run-config-search-mode quick)"
+                         )
+                     else:
+                         self._config.run_config_search_mode = "quick"
+             elif not self._config.bls_composing_models:
+                 if len(self._config.cpu_only_composing_models) > 0:
+                     raise TritonModelAnalyzerException(
+                         f"\nCan only specify --cpu-only-composing-models for ensemble or BLS models."
+                     )
+
+     def _init_state(self):
+         """
+         Sets ModelManager object managed
+         state variables in AnalyzerState
+         """
+
+         self._state_manager.set_state_variable(
+             "ModelManager.model_variant_name_manager",
+             self._state_manager.default_encode(ModelVariantNameManager()),
+         )
+
+     def _check_for_valid_measurement(
+         self, measurement: Optional[RunConfigMeasurement]
+     ) -> None:
+         if measurement:
+             self._received_measurement_values_from_pa = True
+         else:
+             self._failed_measurement_attempts += 1
+
+     def _stop_ma_if_no_valid_measurement_threshold_reached(self) -> None:
+         if self._received_measurement_values_from_pa:
+             return
+
+         if self._failed_measurement_attempts >= INVALID_MEASUREMENT_THRESHOLD:
+             raise TritonModelAnalyzerException(
+                 f"The first {INVALID_MEASUREMENT_THRESHOLD} attempts to acquire measurements "
+                 "have failed. Please examine the Tritonserver/PA error logs "
+                 "to determine what has gone wrong."
+             )
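
The run loop in `run_models()` leans on two small helpers, `_check_for_valid_measurement` and `_stop_ma_if_no_valid_measurement_threshold_reached`, to abort a profile whose first few measurements all fail while tolerating occasional failures once any measurement has succeeded. A minimal standalone sketch of that guard pattern follows; the names and the threshold value here are illustrative stand-ins, not part of the triton-model-analyzer API.

```python
# Illustrative sketch only: standalone rendition of the early-abort guard used by
# ModelManager above. MeasurementGuard, MeasurementError, and the threshold value
# are hypothetical, not the package's names.
from typing import Optional

INVALID_MEASUREMENT_THRESHOLD = 3  # assumed value; the real constant lives in model_analyzer.constants


class MeasurementError(Exception):
    """Raised when the first N measurement attempts all fail."""


class MeasurementGuard:
    def __init__(self) -> None:
        self._failed_attempts = 0
        self._received_any_measurement = False

    def record(self, measurement: Optional[dict]) -> None:
        # Mirrors _check_for_valid_measurement: any non-empty result clears the guard.
        if measurement:
            self._received_any_measurement = True
        else:
            self._failed_attempts += 1

    def check(self) -> None:
        # Mirrors _stop_ma_if_no_valid_measurement_threshold_reached: abort only if
        # nothing has ever succeeded and the failure budget is exhausted.
        if self._received_any_measurement:
            return
        if self._failed_attempts >= INVALID_MEASUREMENT_THRESHOLD:
            raise MeasurementError(
                f"The first {INVALID_MEASUREMENT_THRESHOLD} measurement attempts failed"
            )


if __name__ == "__main__":
    guard = MeasurementGuard()
    for result in (None, None, {"throughput": 120.0}):  # third attempt succeeds
        guard.record(result)
        guard.check()
    print("profiling continues")
```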
model_analyzer/monitor/__init__.py
@@ -0,0 +1,15 @@
+ #!/usr/bin/env python3
+
+ # Copyright 2020-2023, NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+ #
+ # Licensed under the Apache License, Version 2.0 (the "License");
+ # you may not use this file except in compliance with the License.
+ # You may obtain a copy of the License at
+ #
+ # http://www.apache.org/licenses/LICENSE-2.0
+ #
+ # Unless required by applicable law or agreed to in writing, software
+ # distributed under the License is distributed on an "AS IS" BASIS,
+ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ # See the License for the specific language governing permissions and
+ # limitations under the License.
model_analyzer/monitor/cpu_monitor.py
@@ -0,0 +1,69 @@
+ #!/usr/bin/env python3
+
+ # Copyright 2020-2023, NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+ #
+ # Licensed under the Apache License, Version 2.0 (the "License");
+ # you may not use this file except in compliance with the License.
+ # You may obtain a copy of the License at
+ #
+ # http://www.apache.org/licenses/LICENSE-2.0
+ #
+ # Unless required by applicable law or agreed to in writing, software
+ # distributed under the License is distributed on an "AS IS" BASIS,
+ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ # See the License for the specific language governing permissions and
+ # limitations under the License.
+
+ from model_analyzer.record.types.cpu_available_ram import CPUAvailableRAM
+ from model_analyzer.record.types.cpu_used_ram import CPUUsedRAM
+
+ from .monitor import Monitor
+
+
+ class CPUMonitor(Monitor):
+     """
+     A monitor for measuring the CPU usage of tritonserver during inference
+     """
+
+     cpu_metrics = {CPUAvailableRAM, CPUUsedRAM}
+
+     def __init__(self, server, frequency, metrics):
+         """
+         Parameters
+         ----------
+         server : TritonServer
+             A handle to the TritonServer
+         frequency : float
+             How often the metrics should be monitored.
+         metrics : list
+             A list of Record objects that will be monitored.
+         """
+
+         super().__init__(frequency, metrics)
+         self._cpu_memory_records = []
+         self._server = server
+
+     def is_monitoring_connected(self) -> bool:
+         return True
+
+     def _monitoring_iteration(self):
+         """
+         Get the memory info of the server process and
+         append the corresponding records
+         """
+         if (CPUUsedRAM in self._metrics) or (CPUAvailableRAM in self._metrics):
+             used_mem, free_mem = self._server.cpu_stats()
+             if CPUUsedRAM in self._metrics:
+                 self._cpu_memory_records.append(CPUUsedRAM(value=used_mem))
+             if CPUAvailableRAM in self._metrics:
+                 self._cpu_memory_records.append(CPUAvailableRAM(value=free_mem))
+
+     def _collect_records(self):
+         """
+         Returns
+         -------
+         List of Records
+             The collected metric records
+         """
+
+         return self._cpu_memory_records
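
`_monitoring_iteration` only queries the server when a CPU metric was actually requested, then appends just the record types the caller asked for. A standalone sketch of that selective-append pattern follows; `FakeServer` and the dataclass record types are stand-ins for the package's `TritonServer` and `Record` classes, not its real API.

```python
# Illustrative sketch only: the selective-append pattern from CPUMonitor._monitoring_iteration,
# rewritten without the Monitor base class. All names here are hypothetical.
from dataclasses import dataclass
from typing import List, Set, Tuple, Type


@dataclass
class CPUUsedRAM:
    value: float  # bytes used by the server process (assumed unit)


@dataclass
class CPUAvailableRAM:
    value: float  # bytes still available on the host (assumed unit)


class FakeServer:
    def cpu_stats(self) -> Tuple[float, float]:
        # Stand-in for the real server's (used_memory, free_memory) report.
        return 2.5e9, 13.5e9


def monitoring_iteration(server: FakeServer, wanted: Set[Type], records: List) -> None:
    # Skip the stats call entirely unless a CPU metric was requested, then append
    # only the record types the caller asked for, same shape as the code above.
    if CPUUsedRAM in wanted or CPUAvailableRAM in wanted:
        used_mem, free_mem = server.cpu_stats()
        if CPUUsedRAM in wanted:
            records.append(CPUUsedRAM(value=used_mem))
        if CPUAvailableRAM in wanted:
            records.append(CPUAvailableRAM(value=free_mem))


if __name__ == "__main__":
    collected: List = []
    monitoring_iteration(FakeServer(), {CPUUsedRAM}, collected)
    print(collected)  # [CPUUsedRAM(value=2500000000.0)]
```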
model_analyzer/monitor/dcgm/DcgmDiag.py
@@ -0,0 +1,191 @@
+ # Copyright (c) 2022, NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+ #
+ # Licensed under the Apache License, Version 2.0 (the "License");
+ # you may not use this file except in compliance with the License.
+ # You may obtain a copy of the License at
+ #
+ # http://www.apache.org/licenses/LICENSE-2.0
+ #
+ # Unless required by applicable law or agreed to in writing, software
+ # distributed under the License is distributed on an "AS IS" BASIS,
+ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ # See the License for the specific language governing permissions and
+ # limitations under the License.
+
+ import model_analyzer.monitor.dcgm.dcgm_structs as dcgm_structs
+ import model_analyzer.monitor.dcgm.dcgm_agent as dcgm_agent
+
+
+ class DcgmDiag:
+
+     # Maps version codes to simple version values for range comparisons
+     _versionMap = {dcgm_structs.dcgmRunDiag_version: 5}
+
+     def __init__(self,
+                  gpuIds=None,
+                  testNamesStr='',
+                  paramsStr='',
+                  verbose=True,
+                  version=dcgm_structs.dcgmRunDiag_version):
+         # Make sure version is valid
+         if version not in DcgmDiag._versionMap:
+             raise ValueError("'%s' is not a valid version for dcgmRunDiag." %
+                              version)
+         self.version = version
+
+         if self.version == dcgm_structs.dcgmRunDiag_version7:
+             self.runDiagInfo = dcgm_structs.c_dcgmRunDiag_v7()
+         else:
+             self.runDiagInfo = dcgm_structs.c_dcgmRunDiag_t()
+
+         self.numTests = 0
+         self.numParams = 0
+         self.SetVerbose(verbose)
+         if testNamesStr == '':
+             # default to a level 1 test
+             self.runDiagInfo.validate = 1
+         elif testNamesStr == '1':
+             self.runDiagInfo.validate = 1
+         elif testNamesStr == '2':
+             self.runDiagInfo.validate = 2
+         elif testNamesStr == '3':
+             self.runDiagInfo.validate = 3
+         elif testNamesStr == '4':
+             self.runDiagInfo.validate = 4
+         else:
+             # Make sure no number other than 1-4 was submitted
+             if testNamesStr.isdigit():
+                 raise ValueError("'%s' is not a valid test name." %
+                                  testNamesStr)
+
+             # Copy to the testNames portion of the object
+             names = testNamesStr.split(',')
+             if len(names) > dcgm_structs.DCGM_MAX_TEST_NAMES:
+                 err = 'DcgmDiag cannot initialize: %d test names were specified exceeding the limit of %d.' %\
+                     (len(names), dcgm_structs.DCGM_MAX_TEST_NAMES)
+                 raise ValueError(err)
+
+             for testName in names:
+                 self.AddTest(testName)
+
+         if paramsStr != '':
+             params = paramsStr.split(';')
+             if len(params) >= dcgm_structs.DCGM_MAX_TEST_PARMS:
+                 err = 'DcgmDiag cannot initialize: %d parameters were specified, exceeding the limit of %d.' %\
+                     (len(params), dcgm_structs.DCGM_MAX_TEST_PARMS)
+                 raise ValueError(err)
+
+             for param in params:
+                 self.AddParameter(param)
+
+         if gpuIds:
+             first = True
+             for gpu in gpuIds:
+                 if first:
+                     self.runDiagInfo.gpuList = str(gpu)
+                     first = False
+                 else:
+                     self.runDiagInfo.gpuList = "%s,%s" % (
+                         self.runDiagInfo.gpuList, str(gpu))
+
+     def SetVerbose(self, val):
+         if val == True:
+             self.runDiagInfo.flags |= dcgm_structs.DCGM_RUN_FLAGS_VERBOSE
+         else:
+             self.runDiagInfo.flags &= ~dcgm_structs.DCGM_RUN_FLAGS_VERBOSE
+
+     def UseFakeGpus(self):
+         self.runDiagInfo.fakeGpuList = self.runDiagInfo.gpuList
+
+     def GetStruct(self):
+         return self.runDiagInfo
+
+     def AddParameter(self, parameterStr):
+         if len(parameterStr) >= dcgm_structs.DCGM_MAX_TEST_PARMS_LEN:
+             err = 'DcgmDiag cannot add parameter \'%s\' because it exceeds max length %d.' % \
+                 (parameterStr, dcgm_structs.DCGM_MAX_TEST_PARMS_LEN)
+             raise ValueError(err)
+
+         index = 0
+         for c in parameterStr:
+             self.runDiagInfo.testParms[self.numParams][index] = ord(c)
+             index += 1
+
+         self.numParams += 1
+
+     def AddTest(self, testNameStr):
+         if len(testNameStr) >= dcgm_structs.DCGM_MAX_TEST_NAMES_LEN:
+             err = 'DcgmDiag cannot add test name \'%s\' because it exceeds max length %d.' % \
+                 (testNameStr, dcgm_structs.DCGM_MAX_TEST_NAMES_LEN)
+             raise ValueError(err)
+
+         index = 0
+         for c in testNameStr:
+             self.runDiagInfo.testNames[self.numTests][index] = ord(c)
+             index += 1
+
+         self.numTests += 1
+
+     def SetStatsOnFail(self, val):
+         if val == True:
+             self.runDiagInfo.flags |= dcgm_structs.DCGM_RUN_FLAGS_STATSONFAIL
+
+     def SetThrottleMask(self, value):
+         if DcgmDiag._versionMap[self.version] < 3:
+             raise ValueError(
+                 "Throttle mask requires minimum version 3 for dcgmRunDiag.")
+         if isinstance(
+                 value,
+                 str) and len(value) >= dcgm_structs.DCGM_THROTTLE_MASK_LEN:
+             raise ValueError("Throttle mask value '%s' exceeds max length %d." %
+                              (value, dcgm_structs.DCGM_THROTTLE_MASK_LEN - 1))
+
+         self.runDiagInfo.throttleMask = str(value)
+
+     def SetFailEarly(self, enable=True, checkInterval=5):
+         if DcgmDiag._versionMap[self.version] < 5:
+             raise ValueError(
+                 "Fail early requires minimum version 5 for dcgmRunDiag.")
+         if not isinstance(checkInterval, int):
+             raise ValueError("Invalid checkInterval value: %s" % checkInterval)
+
+         if enable:
+             self.runDiagInfo.flags |= dcgm_structs.DCGM_RUN_FLAGS_FAIL_EARLY
+             self.runDiagInfo.failCheckInterval = checkInterval
+         else:
+             self.runDiagInfo.flags &= ~dcgm_structs.DCGM_RUN_FLAGS_FAIL_EARLY
+
+     def Execute(self, handle):
+         return dcgm_agent.dcgmActionValidate_v2(handle, self.runDiagInfo,
+                                                 self.version)
+
+     def SetStatsPath(self, statsPath):
+         if len(statsPath) >= dcgm_structs.DCGM_PATH_LEN:
+             err = "DcgmDiag cannot set statsPath '%s' because it exceeds max length %d." % \
+                 (statsPath, dcgm_structs.DCGM_PATH_LEN)
+             raise ValueError(err)
+
+         self.runDiagInfo.statsPath = statsPath
+
+     def SetConfigFileContents(self, configFileContents):
+         if len(configFileContents) >= dcgm_structs.DCGM_MAX_CONFIG_FILE_LEN:
+             err = "Dcgm Diag cannot set config file contents to '%s' because it exceeds max length %d." \
+                 % (configFileContents, dcgm_structs.DCGM_MAX_CONFIG_FILE_LEN)
+             raise ValueError(err)
+
+         self.runDiagInfo.configFileContents = configFileContents
+
+     def SetDebugLogFile(self, logFileName):
+         if len(logFileName) >= dcgm_structs.DCGM_FILE_LEN:
+             raise ValueError("Cannot set debug file to '%s' because it exceeds max length %d."\
+                              % (logFileName, dcgm_structs.DCGM_FILE_LEN))
+
+         self.runDiagInfo.debugLogFile = logFileName
+
+     def SetDebugLevel(self, debugLevel):
+         if debugLevel < 0 or debugLevel > 5:
+             raise ValueError(
+                 "Cannot set debug level to %d. Debug Level must be a value from 0-5 inclusive."
+             )
+
+         self.runDiagInfo.debugLevel = debugLevel
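
Methods such as `SetVerbose`, `SetStatsOnFail`, and `SetFailEarly` all toggle bits in a single `flags` field: OR with the flag to set it, AND with its complement to clear it. A standalone sketch of that idiom follows; the flag constants and class name below are made-up placeholders, not DCGM's real values.

```python
# Illustrative sketch only: the set/clear bitmask idiom DcgmDiag uses for its run flags.
# The flag values are hypothetical placeholders, not DCGM constants.
RUN_FLAGS_VERBOSE = 0x1
RUN_FLAGS_FAIL_EARLY = 0x4


class DiagFlags:
    def __init__(self) -> None:
        self.flags = 0

    def set_verbose(self, enabled: bool) -> None:
        if enabled:
            self.flags |= RUN_FLAGS_VERBOSE   # OR sets the bit
        else:
            self.flags &= ~RUN_FLAGS_VERBOSE  # AND with the complement clears it

    def set_fail_early(self, enabled: bool = True) -> None:
        if enabled:
            self.flags |= RUN_FLAGS_FAIL_EARLY
        else:
            self.flags &= ~RUN_FLAGS_FAIL_EARLY


if __name__ == "__main__":
    f = DiagFlags()
    f.set_verbose(True)
    f.set_fail_early(True)
    f.set_verbose(False)
    print(bin(f.flags))  # 0b100 -- only the fail-early bit remains set
```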
model_analyzer/monitor/dcgm/DcgmFieldGroup.py
@@ -0,0 +1,83 @@
+ # Copyright (c) 2022, NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+ #
+ # Licensed under the Apache License, Version 2.0 (the "License");
+ # you may not use this file except in compliance with the License.
+ # You may obtain a copy of the License at
+ #
+ # http://www.apache.org/licenses/LICENSE-2.0
+ #
+ # Unless required by applicable law or agreed to in writing, software
+ # distributed under the License is distributed on an "AS IS" BASIS,
+ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ # See the License for the specific language governing permissions and
+ # limitations under the License.
+
+ import model_analyzer.monitor.dcgm.dcgm_agent as dcgm_agent
+ import model_analyzer.monitor.dcgm.dcgm_structs as dcgm_structs
+ '''
+ Class for managing a group of field IDs in the host engine.
+ '''
+
+
+ class DcgmFieldGroup:
+     '''
+     Constructor
+
+     dcgmHandle - DcgmHandle() instance to use for communicating with the host engine
+     name - Name of the field group to use within DCGM. This must be unique
+     fieldIds - Fields that are part of this group
+     fieldGroupId - If provided, this is used to initialize the object from an existing field group ID
+     '''
+
+     def __init__(self, dcgmHandle, name="", fieldIds=None, fieldGroupId=None):
+         fieldIds = fieldIds or []
+         self.name = name
+         self.fieldIds = fieldIds
+         self._dcgmHandle = dcgmHandle
+         self.wasCreated = False
+
+         # If the user passed in an ID, the field group already exists. Fetch live info
+         if fieldGroupId is not None:
+             self.fieldGroupId = fieldGroupId
+             fieldGroupInfo = dcgm_agent.dcgmFieldGroupGetInfo(
+                 self._dcgmHandle.handle, self.fieldGroupId)
+             self.name = fieldGroupInfo.fieldGroupName
+             self.fieldIds = fieldGroupInfo.fieldIds
+         else:
+             self.fieldGroupId = None  # Assign here so the destructor doesn't fail if the call below fails
+             self.fieldGroupId = dcgm_agent.dcgmFieldGroupCreate(
+                 self._dcgmHandle.handle, fieldIds, name)
+             self.wasCreated = True
+
+     '''
+     Remove this field group from DCGM. This object can no longer be passed to other APIs after this call.
+     '''
+
+     def Delete(self):
+         if self.wasCreated and self.fieldGroupId is not None:
+             try:
+                 try:
+                     dcgm_agent.dcgmFieldGroupDestroy(self._dcgmHandle.handle,
+                                                      self.fieldGroupId)
+                 except dcgm_structs.dcgmExceptionClass(
+                         dcgm_structs.DCGM_ST_NO_DATA):
+                     # someone may have deleted the group under us. That's ok.
+                     pass
+                 except dcgm_structs.dcgmExceptionClass(
+                         dcgm_structs.DCGM_ST_CONNECTION_NOT_VALID):
+                     # We lost our connection, but we're destructing this object anyway.
+                     pass
+             except AttributeError as ae:
+                 # When we're cleaning up at the end, dcgm_agent and dcgm_structs have been unloaded and we'll
+                 # get an AttributeError: "'NoneType' object has no 'dcgmExceptionClass'". Ignore this.
+                 pass
+             except TypeError as te:
+                 # When we're cleaning up at the end, dcgm_agent and dcgm_structs have been unloaded and we might
+                 # get a TypeError: "'NoneType' object is not callable'". Ignore this.
+                 pass
+         self.fieldGroupId = None
+         self._dcgmHandle = None
+
+     # Destructor
+     def __del__(self):
+         self.Delete()
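
`DcgmFieldGroup.Delete` illustrates teardown-tolerant cleanup: it is safe to call more than once, and because `__del__` may run during interpreter shutdown, it swallows the `AttributeError`/`TypeError` that appear when module globals have already been torn down. A standalone sketch of that pattern follows; `backend` and `ManagedHandle` are hypothetical stand-ins, not the DCGM bindings.

```python
# Illustrative sketch only: the teardown-tolerant cleanup pattern from DcgmFieldGroup.Delete.
# "backend" stands in for module-level objects like dcgm_agent that may be gone at shutdown.
class _Backend:
    def destroy(self, handle: int) -> None:
        print(f"destroyed {handle}")


backend = _Backend()  # a module global; may already be torn down during interpreter exit


class ManagedHandle:
    def __init__(self, handle: int) -> None:
        self.handle = handle

    def delete(self) -> None:
        if self.handle is not None:
            try:
                backend.destroy(self.handle)
            except (AttributeError, TypeError):
                # During interpreter shutdown, module globals such as `backend` can
                # already be invalid; cleanup must never raise out of __del__.
                pass
        # Always drop the reference so repeated delete() calls become no-ops.
        self.handle = None

    def __del__(self) -> None:
        self.delete()


if __name__ == "__main__":
    h = ManagedHandle(42)
    h.delete()  # explicit cleanup releases the resource
    del h       # __del__ finds the handle already cleared and does nothing
```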