triton-model-analyzer 1.48.0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (204) hide show
  1. model_analyzer/__init__.py +15 -0
  2. model_analyzer/analyzer.py +448 -0
  3. model_analyzer/cli/__init__.py +15 -0
  4. model_analyzer/cli/cli.py +193 -0
  5. model_analyzer/config/__init__.py +15 -0
  6. model_analyzer/config/generate/__init__.py +15 -0
  7. model_analyzer/config/generate/automatic_model_config_generator.py +164 -0
  8. model_analyzer/config/generate/base_model_config_generator.py +352 -0
  9. model_analyzer/config/generate/brute_plus_binary_parameter_search_run_config_generator.py +164 -0
  10. model_analyzer/config/generate/brute_run_config_generator.py +154 -0
  11. model_analyzer/config/generate/concurrency_sweeper.py +75 -0
  12. model_analyzer/config/generate/config_generator_interface.py +52 -0
  13. model_analyzer/config/generate/coordinate.py +143 -0
  14. model_analyzer/config/generate/coordinate_data.py +86 -0
  15. model_analyzer/config/generate/generator_utils.py +116 -0
  16. model_analyzer/config/generate/manual_model_config_generator.py +187 -0
  17. model_analyzer/config/generate/model_config_generator_factory.py +92 -0
  18. model_analyzer/config/generate/model_profile_spec.py +74 -0
  19. model_analyzer/config/generate/model_run_config_generator.py +154 -0
  20. model_analyzer/config/generate/model_variant_name_manager.py +150 -0
  21. model_analyzer/config/generate/neighborhood.py +536 -0
  22. model_analyzer/config/generate/optuna_plus_concurrency_sweep_run_config_generator.py +141 -0
  23. model_analyzer/config/generate/optuna_run_config_generator.py +838 -0
  24. model_analyzer/config/generate/perf_analyzer_config_generator.py +312 -0
  25. model_analyzer/config/generate/quick_plus_concurrency_sweep_run_config_generator.py +130 -0
  26. model_analyzer/config/generate/quick_run_config_generator.py +753 -0
  27. model_analyzer/config/generate/run_config_generator_factory.py +329 -0
  28. model_analyzer/config/generate/search_config.py +112 -0
  29. model_analyzer/config/generate/search_dimension.py +73 -0
  30. model_analyzer/config/generate/search_dimensions.py +85 -0
  31. model_analyzer/config/generate/search_parameter.py +49 -0
  32. model_analyzer/config/generate/search_parameters.py +388 -0
  33. model_analyzer/config/input/__init__.py +15 -0
  34. model_analyzer/config/input/config_command.py +483 -0
  35. model_analyzer/config/input/config_command_profile.py +1747 -0
  36. model_analyzer/config/input/config_command_report.py +267 -0
  37. model_analyzer/config/input/config_defaults.py +236 -0
  38. model_analyzer/config/input/config_enum.py +83 -0
  39. model_analyzer/config/input/config_field.py +216 -0
  40. model_analyzer/config/input/config_list_generic.py +112 -0
  41. model_analyzer/config/input/config_list_numeric.py +151 -0
  42. model_analyzer/config/input/config_list_string.py +111 -0
  43. model_analyzer/config/input/config_none.py +71 -0
  44. model_analyzer/config/input/config_object.py +129 -0
  45. model_analyzer/config/input/config_primitive.py +81 -0
  46. model_analyzer/config/input/config_status.py +75 -0
  47. model_analyzer/config/input/config_sweep.py +83 -0
  48. model_analyzer/config/input/config_union.py +113 -0
  49. model_analyzer/config/input/config_utils.py +128 -0
  50. model_analyzer/config/input/config_value.py +243 -0
  51. model_analyzer/config/input/objects/__init__.py +15 -0
  52. model_analyzer/config/input/objects/config_model_profile_spec.py +325 -0
  53. model_analyzer/config/input/objects/config_model_report_spec.py +173 -0
  54. model_analyzer/config/input/objects/config_plot.py +198 -0
  55. model_analyzer/config/input/objects/config_protobuf_utils.py +101 -0
  56. model_analyzer/config/input/yaml_config_validator.py +82 -0
  57. model_analyzer/config/run/__init__.py +15 -0
  58. model_analyzer/config/run/model_run_config.py +313 -0
  59. model_analyzer/config/run/run_config.py +168 -0
  60. model_analyzer/constants.py +76 -0
  61. model_analyzer/device/__init__.py +15 -0
  62. model_analyzer/device/device.py +24 -0
  63. model_analyzer/device/gpu_device.py +87 -0
  64. model_analyzer/device/gpu_device_factory.py +248 -0
  65. model_analyzer/entrypoint.py +307 -0
  66. model_analyzer/log_formatter.py +65 -0
  67. model_analyzer/model_analyzer_exceptions.py +24 -0
  68. model_analyzer/model_manager.py +255 -0
  69. model_analyzer/monitor/__init__.py +15 -0
  70. model_analyzer/monitor/cpu_monitor.py +69 -0
  71. model_analyzer/monitor/dcgm/DcgmDiag.py +191 -0
  72. model_analyzer/monitor/dcgm/DcgmFieldGroup.py +83 -0
  73. model_analyzer/monitor/dcgm/DcgmGroup.py +815 -0
  74. model_analyzer/monitor/dcgm/DcgmHandle.py +141 -0
  75. model_analyzer/monitor/dcgm/DcgmJsonReader.py +69 -0
  76. model_analyzer/monitor/dcgm/DcgmReader.py +623 -0
  77. model_analyzer/monitor/dcgm/DcgmStatus.py +57 -0
  78. model_analyzer/monitor/dcgm/DcgmSystem.py +412 -0
  79. model_analyzer/monitor/dcgm/__init__.py +15 -0
  80. model_analyzer/monitor/dcgm/common/__init__.py +13 -0
  81. model_analyzer/monitor/dcgm/common/dcgm_client_cli_parser.py +194 -0
  82. model_analyzer/monitor/dcgm/common/dcgm_client_main.py +86 -0
  83. model_analyzer/monitor/dcgm/dcgm_agent.py +887 -0
  84. model_analyzer/monitor/dcgm/dcgm_collectd_plugin.py +369 -0
  85. model_analyzer/monitor/dcgm/dcgm_errors.py +395 -0
  86. model_analyzer/monitor/dcgm/dcgm_field_helpers.py +546 -0
  87. model_analyzer/monitor/dcgm/dcgm_fields.py +815 -0
  88. model_analyzer/monitor/dcgm/dcgm_fields_collectd.py +671 -0
  89. model_analyzer/monitor/dcgm/dcgm_fields_internal.py +29 -0
  90. model_analyzer/monitor/dcgm/dcgm_fluentd.py +45 -0
  91. model_analyzer/monitor/dcgm/dcgm_monitor.py +138 -0
  92. model_analyzer/monitor/dcgm/dcgm_prometheus.py +326 -0
  93. model_analyzer/monitor/dcgm/dcgm_structs.py +2357 -0
  94. model_analyzer/monitor/dcgm/dcgm_telegraf.py +65 -0
  95. model_analyzer/monitor/dcgm/dcgm_value.py +151 -0
  96. model_analyzer/monitor/dcgm/dcgmvalue.py +155 -0
  97. model_analyzer/monitor/dcgm/denylist_recommendations.py +573 -0
  98. model_analyzer/monitor/dcgm/pydcgm.py +47 -0
  99. model_analyzer/monitor/monitor.py +143 -0
  100. model_analyzer/monitor/remote_monitor.py +137 -0
  101. model_analyzer/output/__init__.py +15 -0
  102. model_analyzer/output/file_writer.py +63 -0
  103. model_analyzer/output/output_writer.py +42 -0
  104. model_analyzer/perf_analyzer/__init__.py +15 -0
  105. model_analyzer/perf_analyzer/genai_perf_config.py +206 -0
  106. model_analyzer/perf_analyzer/perf_analyzer.py +882 -0
  107. model_analyzer/perf_analyzer/perf_config.py +479 -0
  108. model_analyzer/plots/__init__.py +15 -0
  109. model_analyzer/plots/detailed_plot.py +266 -0
  110. model_analyzer/plots/plot_manager.py +224 -0
  111. model_analyzer/plots/simple_plot.py +213 -0
  112. model_analyzer/record/__init__.py +15 -0
  113. model_analyzer/record/gpu_record.py +68 -0
  114. model_analyzer/record/metrics_manager.py +887 -0
  115. model_analyzer/record/record.py +280 -0
  116. model_analyzer/record/record_aggregator.py +256 -0
  117. model_analyzer/record/types/__init__.py +15 -0
  118. model_analyzer/record/types/cpu_available_ram.py +93 -0
  119. model_analyzer/record/types/cpu_used_ram.py +93 -0
  120. model_analyzer/record/types/gpu_free_memory.py +96 -0
  121. model_analyzer/record/types/gpu_power_usage.py +107 -0
  122. model_analyzer/record/types/gpu_total_memory.py +96 -0
  123. model_analyzer/record/types/gpu_used_memory.py +96 -0
  124. model_analyzer/record/types/gpu_utilization.py +108 -0
  125. model_analyzer/record/types/inter_token_latency_avg.py +60 -0
  126. model_analyzer/record/types/inter_token_latency_base.py +74 -0
  127. model_analyzer/record/types/inter_token_latency_max.py +60 -0
  128. model_analyzer/record/types/inter_token_latency_min.py +60 -0
  129. model_analyzer/record/types/inter_token_latency_p25.py +60 -0
  130. model_analyzer/record/types/inter_token_latency_p50.py +60 -0
  131. model_analyzer/record/types/inter_token_latency_p75.py +60 -0
  132. model_analyzer/record/types/inter_token_latency_p90.py +60 -0
  133. model_analyzer/record/types/inter_token_latency_p95.py +60 -0
  134. model_analyzer/record/types/inter_token_latency_p99.py +60 -0
  135. model_analyzer/record/types/output_token_throughput.py +105 -0
  136. model_analyzer/record/types/perf_client_response_wait.py +97 -0
  137. model_analyzer/record/types/perf_client_send_recv.py +97 -0
  138. model_analyzer/record/types/perf_latency.py +111 -0
  139. model_analyzer/record/types/perf_latency_avg.py +60 -0
  140. model_analyzer/record/types/perf_latency_base.py +74 -0
  141. model_analyzer/record/types/perf_latency_p90.py +60 -0
  142. model_analyzer/record/types/perf_latency_p95.py +60 -0
  143. model_analyzer/record/types/perf_latency_p99.py +60 -0
  144. model_analyzer/record/types/perf_server_compute_infer.py +97 -0
  145. model_analyzer/record/types/perf_server_compute_input.py +97 -0
  146. model_analyzer/record/types/perf_server_compute_output.py +97 -0
  147. model_analyzer/record/types/perf_server_queue.py +97 -0
  148. model_analyzer/record/types/perf_throughput.py +105 -0
  149. model_analyzer/record/types/time_to_first_token_avg.py +60 -0
  150. model_analyzer/record/types/time_to_first_token_base.py +74 -0
  151. model_analyzer/record/types/time_to_first_token_max.py +60 -0
  152. model_analyzer/record/types/time_to_first_token_min.py +60 -0
  153. model_analyzer/record/types/time_to_first_token_p25.py +60 -0
  154. model_analyzer/record/types/time_to_first_token_p50.py +60 -0
  155. model_analyzer/record/types/time_to_first_token_p75.py +60 -0
  156. model_analyzer/record/types/time_to_first_token_p90.py +60 -0
  157. model_analyzer/record/types/time_to_first_token_p95.py +60 -0
  158. model_analyzer/record/types/time_to_first_token_p99.py +60 -0
  159. model_analyzer/reports/__init__.py +15 -0
  160. model_analyzer/reports/html_report.py +195 -0
  161. model_analyzer/reports/pdf_report.py +50 -0
  162. model_analyzer/reports/report.py +86 -0
  163. model_analyzer/reports/report_factory.py +62 -0
  164. model_analyzer/reports/report_manager.py +1376 -0
  165. model_analyzer/reports/report_utils.py +42 -0
  166. model_analyzer/result/__init__.py +15 -0
  167. model_analyzer/result/constraint_manager.py +150 -0
  168. model_analyzer/result/model_config_measurement.py +354 -0
  169. model_analyzer/result/model_constraints.py +105 -0
  170. model_analyzer/result/parameter_search.py +246 -0
  171. model_analyzer/result/result_manager.py +430 -0
  172. model_analyzer/result/result_statistics.py +159 -0
  173. model_analyzer/result/result_table.py +217 -0
  174. model_analyzer/result/result_table_manager.py +646 -0
  175. model_analyzer/result/result_utils.py +42 -0
  176. model_analyzer/result/results.py +277 -0
  177. model_analyzer/result/run_config_measurement.py +658 -0
  178. model_analyzer/result/run_config_result.py +210 -0
  179. model_analyzer/result/run_config_result_comparator.py +110 -0
  180. model_analyzer/result/sorted_results.py +151 -0
  181. model_analyzer/state/__init__.py +15 -0
  182. model_analyzer/state/analyzer_state.py +76 -0
  183. model_analyzer/state/analyzer_state_manager.py +215 -0
  184. model_analyzer/triton/__init__.py +15 -0
  185. model_analyzer/triton/client/__init__.py +15 -0
  186. model_analyzer/triton/client/client.py +234 -0
  187. model_analyzer/triton/client/client_factory.py +57 -0
  188. model_analyzer/triton/client/grpc_client.py +104 -0
  189. model_analyzer/triton/client/http_client.py +107 -0
  190. model_analyzer/triton/model/__init__.py +15 -0
  191. model_analyzer/triton/model/model_config.py +556 -0
  192. model_analyzer/triton/model/model_config_variant.py +29 -0
  193. model_analyzer/triton/server/__init__.py +15 -0
  194. model_analyzer/triton/server/server.py +76 -0
  195. model_analyzer/triton/server/server_config.py +269 -0
  196. model_analyzer/triton/server/server_docker.py +229 -0
  197. model_analyzer/triton/server/server_factory.py +306 -0
  198. model_analyzer/triton/server/server_local.py +158 -0
  199. triton_model_analyzer-1.48.0.dist-info/METADATA +52 -0
  200. triton_model_analyzer-1.48.0.dist-info/RECORD +204 -0
  201. triton_model_analyzer-1.48.0.dist-info/WHEEL +5 -0
  202. triton_model_analyzer-1.48.0.dist-info/entry_points.txt +2 -0
  203. triton_model_analyzer-1.48.0.dist-info/licenses/LICENSE +67 -0
  204. triton_model_analyzer-1.48.0.dist-info/top_level.txt +1 -0
@@ -0,0 +1,573 @@
1
+ # Copyright (c) 2022, NVIDIA CORPORATION & AFFILIATES. All rights reserved.
2
+ #
3
+ # Licensed under the Apache License, Version 2.0 (the "License");
4
+ # you may not use this file except in compliance with the License.
5
+ # You may obtain a copy of the License at
6
+ #
7
+ # http://www.apache.org/licenses/LICENSE-2.0
8
+ #
9
+ # Unless required by applicable law or agreed to in writing, software
10
+ # distributed under the License is distributed on an "AS IS" BASIS,
11
+ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12
+ # See the License for the specific language governing permissions and
13
+ # limitations under the License.
14
+ import argparse
15
+ import sys
16
+ import logging
17
+ import json
18
+ import os
19
+
20
try:
    import model_analyzer.monitor.dcgm.pydcgm as pydcgm
    import model_analyzer.monitor.dcgm.dcgm_agent as dcgm_agent
    import model_analyzer.monitor.dcgm.dcgm_structs as dcgm_structs
    import model_analyzer.monitor.dcgm.dcgm_errors as dcgm_errors
    import model_analyzer.monitor.dcgm.dcgm_fields as dcgm_fields
    import model_analyzer.monitor.dcgm.DcgmSystem as DcgmSystem
except ImportError:
    # If we don't find the bindings, add the default path and try again.
    # Only ImportError is handled: a bare ``except`` would also intercept
    # SystemExit / KeyboardInterrupt raised while importing.
    _dcgm_bindings_dir = '/usr/local/dcgm/bindings'

    # Keep exporting PYTHONPATH so that child processes inherit the location.
    if 'PYTHONPATH' in os.environ:
        os.environ['PYTHONPATH'] = os.environ[
            'PYTHONPATH'] + ":/usr/local/dcgm/bindings"
    else:
        os.environ['PYTHONPATH'] = '/usr/local/dcgm/bindings'

    # PYTHONPATH is only read at interpreter start-up; the running process
    # searches sys.path, so the directory has to be added there as well.
    if _dcgm_bindings_dir not in sys.path:
        sys.path.append(_dcgm_bindings_dir)

    import model_analyzer.monitor.dcgm.pydcgm as pydcgm
    import model_analyzer.monitor.dcgm.dcgm_agent as dcgm_agent
    import model_analyzer.monitor.dcgm.dcgm_structs as dcgm_structs
    # dcgm_errors is referenced by check_gpu_diagnostic, so the fallback
    # path has to bind it too.
    import model_analyzer.monitor.dcgm.dcgm_errors as dcgm_errors
    import model_analyzer.monitor.dcgm.dcgm_fields as dcgm_fields
    import model_analyzer.monitor.dcgm.DcgmSystem as DcgmSystem
40
+
41
# Health status bits stored in Entity.health; failure bits are OR-ed together.
BR_ST_HEALTHY = 0
BR_ST_NOT_DETECTED = 1 << 0
BR_ST_FAILED_PASSIVE_HEALTH = 1 << 1
BR_ST_FAILED_ACTIVE_HEALTH = 1 << 2

# Health watches used when the caller does not select any with -w.
BR_HEALTH_WATCH_BITMAP = dcgm_structs.DCGM_HEALTH_WATCH_ALL

# Baseline test durations that setTestDurations() scales by the requested
# time percentage.
DIAG_SM_STRESS_DURATION = 90.0
DIAG_CONSTANT_POWER_DURATION = 120.0
DIAG_CONSTANT_STRESS_DURATION = 120.0
DIAG_DIAGNOSTIC_DURATION = 300.0

# Module-wide registries of Entity objects, filled by initialize_devices().
g_gpus = []
g_switches = []
57
+
58
+
59
class Entity(object):
    """Health bookkeeping for one entity (a GPU by default).

    ``health`` is a bitmask that starts at BR_ST_HEALTHY and accumulates the
    failure codes passed to MarkUnhealthy(); ``reasonsUnhealthy`` collects
    the matching explanations in the order they were reported.
    """

    def __init__(self,
                 entityId,
                 entityType=dcgm_fields.DCGM_FE_GPU,
                 uuid=None,
                 bdf=None):
        self.health = BR_ST_HEALTHY
        self.entityType = entityType
        self.entityId = entityId
        self.reasonsUnhealthy = []
        # Always define both attributes so that GetUUID() / GetBDF() return
        # None instead of raising AttributeError when no identifier was
        # supplied (for example for entities built without uuid/bdf).
        self.uuid = uuid if uuid else None
        self.bdf = bdf if bdf else None

    def IsHealthy(self):
        """Return True while no failure condition has been recorded."""
        return self.health == BR_ST_HEALTHY

    def MarkUnhealthy(self, failCondition, reason):
        """OR ``failCondition`` into the health bitmask and record ``reason``."""
        self.health = self.health | failCondition
        self.reasonsUnhealthy.append(reason)

    def WhyUnhealthy(self):
        """Return the list of reasons recorded by MarkUnhealthy()."""
        return self.reasonsUnhealthy

    def SetEntityId(self, entityId):
        """Replace the entity id."""
        self.entityId = entityId

    def GetEntityId(self):
        """Return the entity id."""
        return self.entityId

    def GetUUID(self):
        """Return the UUID given at construction, or None if there was none."""
        return self.uuid

    def GetBDF(self):
        """Return the bdf value given at construction, or None if there was none."""
        return self.bdf
96
+
97
+
98
def mark_entity_unhealthy(entities, entityId, code, reason):
    """Mark every entity in ``entities`` whose id equals ``entityId`` unhealthy.

    Each match receives ``MarkUnhealthy(code, reason)``. Returns True when at
    least one entity matched, False otherwise.
    """
    matched = False
    for candidate in entities:
        if entityId != candidate.GetEntityId():
            continue
        candidate.MarkUnhealthy(code, reason)
        matched = True
    return matched
106
+
107
+
108
def addParamString(runDiagInfo, paramIndex, paramStr):
    """Copy ``paramStr`` one character at a time into
    ``runDiagInfo.testParms[paramIndex]``, starting at offset 0."""
    for offset, char in enumerate(paramStr):
        runDiagInfo.testParms[paramIndex][offset] = char
113
+
114
+
115
def setTestDurations(runDiagInfo, timePercentage):
    """Write scaled ``test_duration`` parameters into ``runDiagInfo``.

    Does nothing unless ``runDiagInfo.validate`` is 3: the test time is only
    reduced for the default case. Each baseline DIAG_* duration is multiplied
    by ``timePercentage`` and truncated to an int.
    """
    if runDiagInfo.validate != 3:
        return

    # Order matters: the position in this table is the testParms slot.
    slots = (
        ("diagnostic.test_duration=%d", DIAG_DIAGNOSTIC_DURATION),
        ("sm stress.test_duration=%d", DIAG_SM_STRESS_DURATION),
        ("targeted stress.test_duration=%d", DIAG_CONSTANT_STRESS_DURATION),
        ("targeted power.test_duration=%d", DIAG_CONSTANT_POWER_DURATION),
    )

    # Format every parameter before touching runDiagInfo.
    rendered = [
        template % (int(baseline * timePercentage))
        for template, baseline in slots
    ]

    for slot, text in enumerate(rendered):
        addParamString(runDiagInfo, slot, text)
135
+
136
+
137
def initialize_run_diag_info(settings):
    """Build the diagnostic request and the list of GPUs it should cover.

    ``settings['testNames']`` is either '1', '2' or '3' (stored in
    ``validate``) or a comma separated list of test names (copied into
    ``testNames``). When ``settings`` has a 'timePercentage' entry the test
    durations are scaled with setTestDurations().

    Returns a ``(runDiagInfo, activeGpuIds)`` tuple, where ``activeGpuIds``
    holds the ids of the currently healthy entries of ``g_gpus``.

    Raises ValueError for a numeric test name other than 1-3, for too many
    test names, or for a test name that is too long.
    """
    diagRequest = dcgm_structs.c_dcgmRunDiag_v7()
    diagRequest.version = dcgm_structs.dcgmRunDiag_version7
    diagRequest.flags = dcgm_structs.DCGM_RUN_FLAGS_VERBOSE

    requested = settings['testNames']
    if requested in ('1', '2', '3'):
        diagRequest.validate = int(requested)
    else:
        # Make sure no number other than 1-3 was submitted
        if requested.isdigit():
            raise ValueError("'%s' is not a valid test name" % requested)

        # Copy to the testNames portion of the object
        nameList = requested.split(',')
        if len(nameList) > dcgm_structs.DCGM_MAX_TEST_NAMES:
            raise ValueError(
                'Aborting DCGM Diag because %d test names were specified exceeding the limit of %d'
                % (len(nameList), dcgm_structs.DCGM_MAX_TEST_NAMES))

        for nameSlot, oneName in enumerate(nameList):
            if len(oneName) >= dcgm_structs.DCGM_MAX_TEST_NAMES_LEN:
                raise ValueError(
                    'Aborting DCGM Diag because test name %s exceeds max length %d'
                    % (oneName, dcgm_structs.DCGM_MAX_TEST_NAMES_LEN))

            for charSlot, char in enumerate(oneName):
                diagRequest.testNames[nameSlot][charSlot] = char

    if 'timePercentage' in settings:
        setTestDurations(diagRequest, settings['timePercentage'])

    healthyIds = []
    for gpu in g_gpus:
        if not gpu.IsHealthy():
            continue

        healthyIds.append(gpu.GetEntityId())
        if len(healthyIds) == 1:
            # First healthy GPU starts the comma separated list
            diagRequest.gpuList = str(gpu.GetEntityId())
        else:
            diagRequest.gpuList = diagRequest.gpuList + ',%s' % (str(
                gpu.GetEntityId()))

    return diagRequest, healthyIds
191
+
192
+
193
def mark_all_unhealthy(activeGpuIds, reason):
    """Flag every GPU id in ``activeGpuIds`` with BR_ST_FAILED_ACTIVE_HEALTH,
    recording ``reason`` on each matching entry of ``g_gpus``."""
    for activeId in activeGpuIds:
        mark_entity_unhealthy(g_gpus,
                              activeId,
                              BR_ST_FAILED_ACTIVE_HEALTH,
                              reason)
196
+
197
+
198
def result_to_str(result):
    """Translate a diagnostic result code into 'PASS', 'SKIP', 'WARN' or
    'FAIL'; any other value yields 'NOT RUN'."""
    labels = (
        (dcgm_structs.DCGM_DIAG_RESULT_PASS, 'PASS'),
        (dcgm_structs.DCGM_DIAG_RESULT_SKIP, 'SKIP'),
        (dcgm_structs.DCGM_DIAG_RESULT_WARN, 'WARN'),
        (dcgm_structs.DCGM_DIAG_RESULT_FAIL, 'FAIL'),
    )
    for code, label in labels:
        if result == code:
            return label
    return 'NOT RUN'
209
+
210
+
211
def check_passive_health_checks(response, activeGpuIds):
    """Scan ``response.levelOneResults`` for a failed test.

    On the first entry whose result is DCGM_DIAG_RESULT_FAIL, every GPU in
    ``activeGpuIds`` is marked unhealthy with that entry's error message and
    True is returned. Returns False when no entry failed.
    """
    for slot in range(0, dcgm_structs.DCGM_SWTEST_COUNT):
        if response.levelOneResults[
                slot].result != dcgm_structs.DCGM_DIAG_RESULT_FAIL:
            continue

        mark_all_unhealthy(activeGpuIds,
                           response.levelOneResults[slot].error.msg)
        return True

    return False
222
+
223
+
224
def check_gpu_diagnostic(handleObj, settings):
    """Run the active diagnostic on the healthy GPUs and record failures.

    Builds the request with initialize_run_diag_info() and returns at once
    when no GPU is healthy. Otherwise the diagnostic is executed through
    ``dcgm_agent.dcgmActionValidate_v2``; if the level-one checks did not
    already fail every GPU, each GPU's first failing test marks that GPU
    with BR_ST_FAILED_ACTIVE_HEALTH.

    Raises ValueError carrying ``response.systemError`` when its code is not
    DCGM_FR_OK.
    """
    runDiagInfo, activeGpuIds = initialize_run_diag_info(settings)
    if len(activeGpuIds) == 0:
        return

    response = dcgm_agent.dcgmActionValidate_v2(handleObj.handle, runDiagInfo)

    sysError = response.systemError
    if sysError.code != dcgm_errors.DCGM_FR_OK:
        raise ValueError(sysError)

    if check_passive_health_checks(response, activeGpuIds):
        # Every active GPU has already been marked unhealthy.
        return

    for gpuIndex in range(response.gpuCount):
        gpuResponse = response.perGpuResponses[gpuIndex]
        for testIndex in range(dcgm_structs.DCGM_PER_GPU_TEST_COUNT_V8):
            testResult = gpuResponse.results[testIndex]
            if testResult.result != dcgm_structs.DCGM_DIAG_RESULT_FAIL:
                continue

            # ``result`` is the status code compared above; the message
            # lives in the sibling ``error`` field (the same layout that
            # check_passive_health_checks reads from levelOneResults), so
            # ``result.error.msg`` could never resolve.
            mark_entity_unhealthy(g_gpus, gpuResponse.gpuId,
                                  BR_ST_FAILED_ACTIVE_HEALTH,
                                  testResult.error.msg)

            # NVVS marks all subsequent tests as failed so there's no point in continuing
            break
248
+
249
+
250
def query_passive_health(handleObj, desired_watches):
    """Make sure the default group watches ``desired_watches`` and return
    the result of its health check.

    The watches are only set when the currently configured value differs
    from ``desired_watches``.
    """
    defaultGroup = handleObj.GetSystem().GetDefaultGroup()

    # Check for the correct watches to be set and set them if necessary
    currentWatches = defaultGroup.health.Get()
    if currentWatches != desired_watches:
        defaultGroup.health.Set(desired_watches)

    return defaultGroup.health.Check()
259
+
260
+
261
def denylist_from_passive_health_check(response):
    """Mark the entity of every failed incident in ``response`` unhealthy.

    Incidents whose health is not DCGM_HEALTH_RESULT_FAIL are skipped.
    Incidents in the DCGM_FE_GPU entity group are looked up in ``g_gpus``;
    all other groups are looked up in ``g_switches``.
    """
    for position in range(response.incidentCount):
        # Only add to the denylist for failures; ignore warnings
        if response.incidents[
                position].health != dcgm_structs.DCGM_HEALTH_RESULT_FAIL:
            continue

        info = response.incidents[position].entityInfo
        failedId = info.entityId
        failedGroup = info.entityGroupId
        message = response.incidents[position].error.msg

        if failedGroup == dcgm_fields.DCGM_FE_GPU:
            registry = g_gpus
        else:
            registry = g_switches

        mark_entity_unhealthy(registry, failedId, BR_ST_FAILED_PASSIVE_HEALTH,
                              message)
279
+
280
+
281
def check_passive_health(handleObj, watches):
    """Query passive health with ``watches`` and denylist the failed
    entities unless the overall result is DCGM_HEALTH_RESULT_PASS."""
    healthResponse = query_passive_health(handleObj, watches)

    if healthResponse.overallHealth == dcgm_structs.DCGM_HEALTH_RESULT_PASS:
        return

    denylist_from_passive_health_check(healthResponse)
286
+
287
+
288
def initialize_devices(handle, flags):
    """Append an Entity to ``g_gpus`` / ``g_switches`` for every GPU and
    switch id that ``dcgmGetEntityGroupEntities`` reports for ``flags``.

    GPU entities carry the uuid and pciBusId taken from their device
    attributes; switch entities carry only the id and entity type.
    """
    detectedGpus = dcgm_agent.dcgmGetEntityGroupEntities(
        handle, dcgm_fields.DCGM_FE_GPU, flags)
    detectedSwitches = dcgm_agent.dcgmGetEntityGroupEntities(
        handle, dcgm_fields.DCGM_FE_SWITCH, flags)

    for oneGpu in detectedGpus:
        identifiers = dcgm_agent.dcgmGetDeviceAttributes(handle,
                                                         oneGpu).identifiers
        g_gpus.append(
            Entity(oneGpu,
                   entityType=dcgm_fields.DCGM_FE_GPU,
                   uuid=identifiers.uuid,
                   bdf=identifiers.pciBusId))

    for oneSwitch in detectedSwitches:
        g_switches.append(
            Entity(oneSwitch, entityType=dcgm_fields.DCGM_FE_SWITCH))
310
+
311
+
312
# Process command line arguments
def __process_command_line__(settings):
    """Parse the command line and store the outcome in ``settings``.

    Keys written: 'numGpus' and 'numSwitches' (only when both -g and -s are
    given), 'hostname', 'entity_get_flags', 'instant', 'fullReport',
    'testNames', 'timePercentage' (only with -t), 'outfmtCSV' (only with -c)
    and 'watches'. A log file given with -l is passed to
    logging.basicConfig().

    Raises ValueError when neither both counts nor -d were supplied. An
    unrecognized watch character prints a message and exits with -1.
    """
    # (flags, keyword arguments) in the order they appear in the help output
    general_options = (
        (('-g', '--num-gpus'),
         dict(dest='num_gpus', type=int, help='The expected number of GPUs.')),
        (('-s', '--num-switches'),
         dict(dest='num_switches',
              type=int,
              help='The expected number of NvSwitches.')),
        (('-n', '--hostname'),
         dict(dest='hostname',
              type=str,
              help='The hostname of the nv-hostengine we want to query.')),
        (('-d', '--detect'),
         dict(dest='detect',
              action='store_true',
              help='Run on whatever GPUs can be detected. Do not check counts.'
             )),
        (('-l', '--log-file'),
         dict(dest='logfileName',
              type=str,
              help=
              'The name of the log file where details should be stored. Default is stdout'
             )),
        (('-u', '--unsupported-too'),
         dict(dest='unsupported',
              action='store_true',
              help=
              'Get unsupported devices in addition to the ones DCGM supports')),
        (('-f', '--full-report'),
         dict(dest='fullReport',
              action='store_true',
              help='Print a health status for each GPU')),
        (('-c', '--csv'),
         dict(dest='outfmtCSV',
              action='store_true',
              help=
              'Write output in csv format. By default, output is in json format.'
             )),
        (('-w', '--watches'),
         dict(dest='watches',
              type=str,
              help=
              'Specify which health watches to monitor. By default, all are watched. Any list of the following may be specified:\n\ta = All watches\n\tp = PCIE\n\tm = Memory\n\ti = Inforom\n\tt = Thermal and Power\n\tn = NVLINK'
             )),
    )

    # -r, -i and -t cannot be combined with one another
    exclusive_options = (
        (('-r', '--specified-test'),
         dict(dest='testNames',
              type=str,
              help='Option to specify what tests are run in dcgmi diag.')),
        (('-i', '--instantaneous'),
         dict(dest='instant',
              action='store_true',
              help='Specify to skip the longer tests and run instantaneously')),
        (('-t', '--time-limit'),
         dict(dest='timeLimit',
              type=int,
              help=
              'The time limit in seconds that all the tests should not exceed. Diagnostics will be reduced in their time to meet this boundary.'
             )),
    )

    parser = argparse.ArgumentParser()
    for flags, options in general_options:
        parser.add_argument(*flags, **options)

    exclusive = parser.add_mutually_exclusive_group()
    for flags, options in exclusive_options:
        exclusive.add_argument(*flags, **options)

    parser.set_defaults(instant=False, detect=False, fullReport=False)
    parsed = parser.parse_args()

    if parsed.num_gpus is not None and parsed.num_switches is not None:
        settings['numGpus'] = parsed.num_gpus
        settings['numSwitches'] = parsed.num_switches
    elif not parsed.detect:
        raise ValueError(
            'Must specify either a number of gpus and switches with -g and -s or auto-detect with -d'
        )

    settings['hostname'] = parsed.hostname if parsed.hostname else 'localhost'

    settings['entity_get_flags'] = (
        0 if parsed.unsupported else
        dcgm_structs.DCGM_GEGE_FLAG_ONLY_SUPPORTED)

    settings['instant'] = parsed.instant
    settings['fullReport'] = parsed.fullReport
    settings['testNames'] = parsed.testNames if parsed.testNames else '3'

    if parsed.timeLimit:
        settings['timePercentage'] = float(parsed.timeLimit) / 840.0

    if parsed.logfileName:
        logging.basicConfig(filename=parsed.logfileName)

    if parsed.outfmtCSV:
        settings['outfmtCSV'] = 1

    if not parsed.watches:
        settings['watches'] = BR_HEALTH_WATCH_BITMAP
        return

    # One bitmask per accepted watch character; 't' covers thermal and power
    watch_bits = {
        'p': dcgm_structs.DCGM_HEALTH_WATCH_PCIE,
        'm': dcgm_structs.DCGM_HEALTH_WATCH_MEM,
        'i': dcgm_structs.DCGM_HEALTH_WATCH_INFOROM,
        't': (dcgm_structs.DCGM_HEALTH_WATCH_THERMAL |
              dcgm_structs.DCGM_HEALTH_WATCH_POWER),
        'n': dcgm_structs.DCGM_HEALTH_WATCH_NVLINK,
        'a': dcgm_structs.DCGM_HEALTH_WATCH_ALL,
    }

    selected = 0
    for letter in parsed.watches:
        if letter not in watch_bits:
            print("Unrecognized character %s found in watch string '%s'" %
                  (letter, parsed.watches))
            sys.exit(-1)
        selected |= watch_bits[letter]

    settings['watches'] = selected
456
+
457
+
458
def get_entity_id_list(entities):
    """Return the ids of ``entities`` as a comma separated string.

    The first id is rendered with ``str()`` and every later one with the
    ``%d`` format. An empty input yields an empty string.
    """
    pieces = []
    for position, item in enumerate(entities):
        if position == 0:
            pieces.append(str(item.GetEntityId()))
        else:
            pieces.append(",%d" % (item.GetEntityId()))
    return "".join(pieces)
469
+
470
+
471
def check_health(handleObj, settings, error_list):
    """Discover the devices, verify the expected counts and run the checks.

    A mismatch between an expected count in ``settings`` ('numGpus',
    'numSwitches') and the detected devices appends a message to
    ``error_list``. The passive health check always runs; the GPU diagnostic
    runs only when ``settings['instant']`` is False.
    """
    initialize_devices(handleObj.handle, settings['entity_get_flags'])

    # (settings key, detected entities, message template)
    expectations = (
        ('numGpus', g_gpus,
         "%d GPUs were specified but only %d were detected with ids '%s'"),
        ('numSwitches', g_switches,
         "%d switches were specified but only %d were detected with ids '%s'"),
    )
    for key, detected, template in expectations:
        if key not in settings:
            continue
        if len(detected) != settings[key]:
            error_list.append(
                template %
                (settings[key], len(detected), get_entity_id_list(detected)))

    check_passive_health(handleObj, settings['watches'])  # quick check

    # Explicit comparison kept on purpose: only a value equal to False
    # enables the longer diagnostic.
    if settings['instant'] == False:
        check_gpu_diagnostic(handleObj, settings)
492
+
493
+
494
def process_command_line(settings):
    """Fill ``settings`` from the command line.

    Returns the text of the ValueError raised while parsing, or None when
    parsing succeeded.
    """
    try:
        __process_command_line__(settings)
    except ValueError as parse_error:
        return str(parse_error)
    return None
499
+
500
+
501
def main():
    """Entry point: check GPU health and print the denylist report.

    The report is written to stdout as CSV when -c was given and as JSON
    otherwise. The process exits with status 1 when any error was collected
    (command line, DCGM or diagnostic errors) and with 0 otherwise.
    """
    settings = {}
    error_list = []
    jsonTop = {}

    # Parse the command line
    error = process_command_line(settings)
    if error:
        # If we had an error processing the command line, don't attempt to check anything
        error_list.append(error)
    else:
        try:
            handleObj = pydcgm.DcgmHandle(None, settings['hostname'],
                                          dcgm_structs.DCGM_OPERATION_MODE_AUTO)

            check_health(handleObj, settings, error_list)
        except (dcgm_structs.DCGMError, ValueError) as e:
            # Collect the failure so it is reported with the rest of the output
            error_list.append(str(e))

    csvOutput = 'outfmtCSV' in settings
    # 'fullReport' is missing when command line parsing failed before it
    # was stored; indexing it directly would raise KeyError and hide the
    # real error from the user.
    fullReport = settings.get('fullReport', False)

    if csvOutput:
        # show all healthy, then all un-healthy
        for gpuObj in g_gpus:
            if gpuObj.IsHealthy():
                print("healthy,%s,%s" % (gpuObj.GetBDF(), gpuObj.GetUUID()))
        for gpuObj in g_gpus:
            if not gpuObj.IsHealthy():
                print("unhealthy,%s,%s,\"%s\"" %
                      (gpuObj.GetBDF(), gpuObj.GetUUID(),
                       gpuObj.WhyUnhealthy()))
        for errObj in error_list:
            print("errors,\"%s\"" % (errObj))
    else:
        # build obj that can be output in json
        denylistGpus = {}
        healthyGpus = {}
        for gpuObj in g_gpus:
            if not gpuObj.IsHealthy():
                denylistGpus[gpuObj.GetEntityId()] = {
                    'UUID': gpuObj.GetUUID(),
                    'BDF': gpuObj.GetBDF(),
                    'Failure Explanation': gpuObj.WhyUnhealthy(),
                }
            elif fullReport:
                healthyGpus[gpuObj.GetEntityId()] = {
                    'UUID': gpuObj.GetUUID(),
                    'BDF': gpuObj.GetBDF(),
                }

        jsonTop['denylistedGpus'] = denylistGpus
        if fullReport:
            jsonTop['Healthy GPUs'] = healthyGpus
        if error_list:
            jsonTop['errors'] = error_list

        print(json.dumps(jsonTop, indent=4, separators=(',', ': ')))

    sys.exit(1 if error_list else 0)


if __name__ == '__main__':
    main()
@@ -0,0 +1,47 @@
1
+ # Copyright (c) 2022, NVIDIA CORPORATION & AFFILIATES. All rights reserved.
2
+ #
3
+ # Licensed under the Apache License, Version 2.0 (the "License");
4
+ # you may not use this file except in compliance with the License.
5
+ # You may obtain a copy of the License at
6
+ #
7
+ # http://www.apache.org/licenses/LICENSE-2.0
8
+ #
9
+ # Unless required by applicable law or agreed to in writing, software
10
+ # distributed under the License is distributed on an "AS IS" BASIS,
11
+ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12
+ # See the License for the specific language governing permissions and
13
+ # limitations under the License.
14
+
15
+
16
+ def _python_version_check():
17
+ import sys
18
+ python_version = sys.version.split(None, 1)[0]
19
+ if python_version < '3':
20
+ print(
21
+ '[ERROR] Detected Python version {}. These bindings are for Python 3.5+. Please load the Python 2 bindings found at /usr/local/dcgm/bindings'
22
+ .format(python_version))
23
+ sys.exit(1)
24
+
25
+
26
+ _python_version_check()
27
+
28
+ #Bring classes into this namespace
29
+ from model_analyzer.monitor.dcgm.DcgmHandle import *
30
+ from model_analyzer.monitor.dcgm.DcgmGroup import *
31
+ from model_analyzer.monitor.dcgm.DcgmStatus import *
32
+ from model_analyzer.monitor.dcgm.DcgmSystem import *
33
+ from model_analyzer.monitor.dcgm.DcgmFieldGroup import *
34
+
35
import os

# When the DCGM testing framework announces itself through the environment,
# initialise dcgm_structs with the library path that the framework provides.
if os.environ.get('__DCGM_TESTING_FRAMEWORK_ACTIVE') == '1':
    import utils
    import model_analyzer.monitor.dcgm.dcgm_structs as dcgm_structs
    dcgm_structs._dcgmInit(utils.get_testing_framework_library_path())
41
class DcgmException(Exception):
    """Unique exception type we return so that callers can distinguish our
    exceptions from the Python standard ones."""