triton-model-analyzer 1.48.0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (204) hide show
  1. model_analyzer/__init__.py +15 -0
  2. model_analyzer/analyzer.py +448 -0
  3. model_analyzer/cli/__init__.py +15 -0
  4. model_analyzer/cli/cli.py +193 -0
  5. model_analyzer/config/__init__.py +15 -0
  6. model_analyzer/config/generate/__init__.py +15 -0
  7. model_analyzer/config/generate/automatic_model_config_generator.py +164 -0
  8. model_analyzer/config/generate/base_model_config_generator.py +352 -0
  9. model_analyzer/config/generate/brute_plus_binary_parameter_search_run_config_generator.py +164 -0
  10. model_analyzer/config/generate/brute_run_config_generator.py +154 -0
  11. model_analyzer/config/generate/concurrency_sweeper.py +75 -0
  12. model_analyzer/config/generate/config_generator_interface.py +52 -0
  13. model_analyzer/config/generate/coordinate.py +143 -0
  14. model_analyzer/config/generate/coordinate_data.py +86 -0
  15. model_analyzer/config/generate/generator_utils.py +116 -0
  16. model_analyzer/config/generate/manual_model_config_generator.py +187 -0
  17. model_analyzer/config/generate/model_config_generator_factory.py +92 -0
  18. model_analyzer/config/generate/model_profile_spec.py +74 -0
  19. model_analyzer/config/generate/model_run_config_generator.py +154 -0
  20. model_analyzer/config/generate/model_variant_name_manager.py +150 -0
  21. model_analyzer/config/generate/neighborhood.py +536 -0
  22. model_analyzer/config/generate/optuna_plus_concurrency_sweep_run_config_generator.py +141 -0
  23. model_analyzer/config/generate/optuna_run_config_generator.py +838 -0
  24. model_analyzer/config/generate/perf_analyzer_config_generator.py +312 -0
  25. model_analyzer/config/generate/quick_plus_concurrency_sweep_run_config_generator.py +130 -0
  26. model_analyzer/config/generate/quick_run_config_generator.py +753 -0
  27. model_analyzer/config/generate/run_config_generator_factory.py +329 -0
  28. model_analyzer/config/generate/search_config.py +112 -0
  29. model_analyzer/config/generate/search_dimension.py +73 -0
  30. model_analyzer/config/generate/search_dimensions.py +85 -0
  31. model_analyzer/config/generate/search_parameter.py +49 -0
  32. model_analyzer/config/generate/search_parameters.py +388 -0
  33. model_analyzer/config/input/__init__.py +15 -0
  34. model_analyzer/config/input/config_command.py +483 -0
  35. model_analyzer/config/input/config_command_profile.py +1747 -0
  36. model_analyzer/config/input/config_command_report.py +267 -0
  37. model_analyzer/config/input/config_defaults.py +236 -0
  38. model_analyzer/config/input/config_enum.py +83 -0
  39. model_analyzer/config/input/config_field.py +216 -0
  40. model_analyzer/config/input/config_list_generic.py +112 -0
  41. model_analyzer/config/input/config_list_numeric.py +151 -0
  42. model_analyzer/config/input/config_list_string.py +111 -0
  43. model_analyzer/config/input/config_none.py +71 -0
  44. model_analyzer/config/input/config_object.py +129 -0
  45. model_analyzer/config/input/config_primitive.py +81 -0
  46. model_analyzer/config/input/config_status.py +75 -0
  47. model_analyzer/config/input/config_sweep.py +83 -0
  48. model_analyzer/config/input/config_union.py +113 -0
  49. model_analyzer/config/input/config_utils.py +128 -0
  50. model_analyzer/config/input/config_value.py +243 -0
  51. model_analyzer/config/input/objects/__init__.py +15 -0
  52. model_analyzer/config/input/objects/config_model_profile_spec.py +325 -0
  53. model_analyzer/config/input/objects/config_model_report_spec.py +173 -0
  54. model_analyzer/config/input/objects/config_plot.py +198 -0
  55. model_analyzer/config/input/objects/config_protobuf_utils.py +101 -0
  56. model_analyzer/config/input/yaml_config_validator.py +82 -0
  57. model_analyzer/config/run/__init__.py +15 -0
  58. model_analyzer/config/run/model_run_config.py +313 -0
  59. model_analyzer/config/run/run_config.py +168 -0
  60. model_analyzer/constants.py +76 -0
  61. model_analyzer/device/__init__.py +15 -0
  62. model_analyzer/device/device.py +24 -0
  63. model_analyzer/device/gpu_device.py +87 -0
  64. model_analyzer/device/gpu_device_factory.py +248 -0
  65. model_analyzer/entrypoint.py +307 -0
  66. model_analyzer/log_formatter.py +65 -0
  67. model_analyzer/model_analyzer_exceptions.py +24 -0
  68. model_analyzer/model_manager.py +255 -0
  69. model_analyzer/monitor/__init__.py +15 -0
  70. model_analyzer/monitor/cpu_monitor.py +69 -0
  71. model_analyzer/monitor/dcgm/DcgmDiag.py +191 -0
  72. model_analyzer/monitor/dcgm/DcgmFieldGroup.py +83 -0
  73. model_analyzer/monitor/dcgm/DcgmGroup.py +815 -0
  74. model_analyzer/monitor/dcgm/DcgmHandle.py +141 -0
  75. model_analyzer/monitor/dcgm/DcgmJsonReader.py +69 -0
  76. model_analyzer/monitor/dcgm/DcgmReader.py +623 -0
  77. model_analyzer/monitor/dcgm/DcgmStatus.py +57 -0
  78. model_analyzer/monitor/dcgm/DcgmSystem.py +412 -0
  79. model_analyzer/monitor/dcgm/__init__.py +15 -0
  80. model_analyzer/monitor/dcgm/common/__init__.py +13 -0
  81. model_analyzer/monitor/dcgm/common/dcgm_client_cli_parser.py +194 -0
  82. model_analyzer/monitor/dcgm/common/dcgm_client_main.py +86 -0
  83. model_analyzer/monitor/dcgm/dcgm_agent.py +887 -0
  84. model_analyzer/monitor/dcgm/dcgm_collectd_plugin.py +369 -0
  85. model_analyzer/monitor/dcgm/dcgm_errors.py +395 -0
  86. model_analyzer/monitor/dcgm/dcgm_field_helpers.py +546 -0
  87. model_analyzer/monitor/dcgm/dcgm_fields.py +815 -0
  88. model_analyzer/monitor/dcgm/dcgm_fields_collectd.py +671 -0
  89. model_analyzer/monitor/dcgm/dcgm_fields_internal.py +29 -0
  90. model_analyzer/monitor/dcgm/dcgm_fluentd.py +45 -0
  91. model_analyzer/monitor/dcgm/dcgm_monitor.py +138 -0
  92. model_analyzer/monitor/dcgm/dcgm_prometheus.py +326 -0
  93. model_analyzer/monitor/dcgm/dcgm_structs.py +2357 -0
  94. model_analyzer/monitor/dcgm/dcgm_telegraf.py +65 -0
  95. model_analyzer/monitor/dcgm/dcgm_value.py +151 -0
  96. model_analyzer/monitor/dcgm/dcgmvalue.py +155 -0
  97. model_analyzer/monitor/dcgm/denylist_recommendations.py +573 -0
  98. model_analyzer/monitor/dcgm/pydcgm.py +47 -0
  99. model_analyzer/monitor/monitor.py +143 -0
  100. model_analyzer/monitor/remote_monitor.py +137 -0
  101. model_analyzer/output/__init__.py +15 -0
  102. model_analyzer/output/file_writer.py +63 -0
  103. model_analyzer/output/output_writer.py +42 -0
  104. model_analyzer/perf_analyzer/__init__.py +15 -0
  105. model_analyzer/perf_analyzer/genai_perf_config.py +206 -0
  106. model_analyzer/perf_analyzer/perf_analyzer.py +882 -0
  107. model_analyzer/perf_analyzer/perf_config.py +479 -0
  108. model_analyzer/plots/__init__.py +15 -0
  109. model_analyzer/plots/detailed_plot.py +266 -0
  110. model_analyzer/plots/plot_manager.py +224 -0
  111. model_analyzer/plots/simple_plot.py +213 -0
  112. model_analyzer/record/__init__.py +15 -0
  113. model_analyzer/record/gpu_record.py +68 -0
  114. model_analyzer/record/metrics_manager.py +887 -0
  115. model_analyzer/record/record.py +280 -0
  116. model_analyzer/record/record_aggregator.py +256 -0
  117. model_analyzer/record/types/__init__.py +15 -0
  118. model_analyzer/record/types/cpu_available_ram.py +93 -0
  119. model_analyzer/record/types/cpu_used_ram.py +93 -0
  120. model_analyzer/record/types/gpu_free_memory.py +96 -0
  121. model_analyzer/record/types/gpu_power_usage.py +107 -0
  122. model_analyzer/record/types/gpu_total_memory.py +96 -0
  123. model_analyzer/record/types/gpu_used_memory.py +96 -0
  124. model_analyzer/record/types/gpu_utilization.py +108 -0
  125. model_analyzer/record/types/inter_token_latency_avg.py +60 -0
  126. model_analyzer/record/types/inter_token_latency_base.py +74 -0
  127. model_analyzer/record/types/inter_token_latency_max.py +60 -0
  128. model_analyzer/record/types/inter_token_latency_min.py +60 -0
  129. model_analyzer/record/types/inter_token_latency_p25.py +60 -0
  130. model_analyzer/record/types/inter_token_latency_p50.py +60 -0
  131. model_analyzer/record/types/inter_token_latency_p75.py +60 -0
  132. model_analyzer/record/types/inter_token_latency_p90.py +60 -0
  133. model_analyzer/record/types/inter_token_latency_p95.py +60 -0
  134. model_analyzer/record/types/inter_token_latency_p99.py +60 -0
  135. model_analyzer/record/types/output_token_throughput.py +105 -0
  136. model_analyzer/record/types/perf_client_response_wait.py +97 -0
  137. model_analyzer/record/types/perf_client_send_recv.py +97 -0
  138. model_analyzer/record/types/perf_latency.py +111 -0
  139. model_analyzer/record/types/perf_latency_avg.py +60 -0
  140. model_analyzer/record/types/perf_latency_base.py +74 -0
  141. model_analyzer/record/types/perf_latency_p90.py +60 -0
  142. model_analyzer/record/types/perf_latency_p95.py +60 -0
  143. model_analyzer/record/types/perf_latency_p99.py +60 -0
  144. model_analyzer/record/types/perf_server_compute_infer.py +97 -0
  145. model_analyzer/record/types/perf_server_compute_input.py +97 -0
  146. model_analyzer/record/types/perf_server_compute_output.py +97 -0
  147. model_analyzer/record/types/perf_server_queue.py +97 -0
  148. model_analyzer/record/types/perf_throughput.py +105 -0
  149. model_analyzer/record/types/time_to_first_token_avg.py +60 -0
  150. model_analyzer/record/types/time_to_first_token_base.py +74 -0
  151. model_analyzer/record/types/time_to_first_token_max.py +60 -0
  152. model_analyzer/record/types/time_to_first_token_min.py +60 -0
  153. model_analyzer/record/types/time_to_first_token_p25.py +60 -0
  154. model_analyzer/record/types/time_to_first_token_p50.py +60 -0
  155. model_analyzer/record/types/time_to_first_token_p75.py +60 -0
  156. model_analyzer/record/types/time_to_first_token_p90.py +60 -0
  157. model_analyzer/record/types/time_to_first_token_p95.py +60 -0
  158. model_analyzer/record/types/time_to_first_token_p99.py +60 -0
  159. model_analyzer/reports/__init__.py +15 -0
  160. model_analyzer/reports/html_report.py +195 -0
  161. model_analyzer/reports/pdf_report.py +50 -0
  162. model_analyzer/reports/report.py +86 -0
  163. model_analyzer/reports/report_factory.py +62 -0
  164. model_analyzer/reports/report_manager.py +1376 -0
  165. model_analyzer/reports/report_utils.py +42 -0
  166. model_analyzer/result/__init__.py +15 -0
  167. model_analyzer/result/constraint_manager.py +150 -0
  168. model_analyzer/result/model_config_measurement.py +354 -0
  169. model_analyzer/result/model_constraints.py +105 -0
  170. model_analyzer/result/parameter_search.py +246 -0
  171. model_analyzer/result/result_manager.py +430 -0
  172. model_analyzer/result/result_statistics.py +159 -0
  173. model_analyzer/result/result_table.py +217 -0
  174. model_analyzer/result/result_table_manager.py +646 -0
  175. model_analyzer/result/result_utils.py +42 -0
  176. model_analyzer/result/results.py +277 -0
  177. model_analyzer/result/run_config_measurement.py +658 -0
  178. model_analyzer/result/run_config_result.py +210 -0
  179. model_analyzer/result/run_config_result_comparator.py +110 -0
  180. model_analyzer/result/sorted_results.py +151 -0
  181. model_analyzer/state/__init__.py +15 -0
  182. model_analyzer/state/analyzer_state.py +76 -0
  183. model_analyzer/state/analyzer_state_manager.py +215 -0
  184. model_analyzer/triton/__init__.py +15 -0
  185. model_analyzer/triton/client/__init__.py +15 -0
  186. model_analyzer/triton/client/client.py +234 -0
  187. model_analyzer/triton/client/client_factory.py +57 -0
  188. model_analyzer/triton/client/grpc_client.py +104 -0
  189. model_analyzer/triton/client/http_client.py +107 -0
  190. model_analyzer/triton/model/__init__.py +15 -0
  191. model_analyzer/triton/model/model_config.py +556 -0
  192. model_analyzer/triton/model/model_config_variant.py +29 -0
  193. model_analyzer/triton/server/__init__.py +15 -0
  194. model_analyzer/triton/server/server.py +76 -0
  195. model_analyzer/triton/server/server_config.py +269 -0
  196. model_analyzer/triton/server/server_docker.py +229 -0
  197. model_analyzer/triton/server/server_factory.py +306 -0
  198. model_analyzer/triton/server/server_local.py +158 -0
  199. triton_model_analyzer-1.48.0.dist-info/METADATA +52 -0
  200. triton_model_analyzer-1.48.0.dist-info/RECORD +204 -0
  201. triton_model_analyzer-1.48.0.dist-info/WHEEL +5 -0
  202. triton_model_analyzer-1.48.0.dist-info/entry_points.txt +2 -0
  203. triton_model_analyzer-1.48.0.dist-info/licenses/LICENSE +67 -0
  204. triton_model_analyzer-1.48.0.dist-info/top_level.txt +1 -0
@@ -0,0 +1,623 @@
1
+ # Copyright (c) 2022, NVIDIA CORPORATION & AFFILIATES. All rights reserved.
2
+ #
3
+ # Licensed under the Apache License, Version 2.0 (the "License");
4
+ # you may not use this file except in compliance with the License.
5
+ # You may obtain a copy of the License at
6
+ #
7
+ # http://www.apache.org/licenses/LICENSE-2.0
8
+ #
9
+ # Unless required by applicable law or agreed to in writing, software
10
+ # distributed under the License is distributed on an "AS IS" BASIS,
11
+ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12
+ # See the License for the specific language governing permissions and
13
+ # limitations under the License.
14
+ import subprocess
15
+ import signal, os
16
+ import model_analyzer.monitor.dcgm.pydcgm as pydcgm
17
+ import model_analyzer.monitor.dcgm.dcgm_structs as dcgm_structs
18
+ import threading
19
+ import model_analyzer.monitor.dcgm.dcgm_fields as dcgm_fields
20
+ import sys
21
+ import logging
22
+
23
# Field ids DcgmReader falls back to when its constructor is given no
# (or an empty) ``fieldIds`` argument.
defaultFieldIds = [
    dcgm_fields.DCGM_FI_DEV_POWER_USAGE,
    dcgm_fields.DCGM_FI_DEV_GPU_TEMP,
    dcgm_fields.DCGM_FI_DEV_SM_CLOCK,
    dcgm_fields.DCGM_FI_DEV_GPU_UTIL,
    dcgm_fields.DCGM_FI_DEV_RETIRED_PENDING,
    dcgm_fields.DCGM_FI_DEV_RETIRED_SBE,
    dcgm_fields.DCGM_FI_DEV_RETIRED_DBE,
    dcgm_fields.DCGM_FI_DEV_ECC_SBE_VOL_TOTAL,
    dcgm_fields.DCGM_FI_DEV_ECC_DBE_VOL_TOTAL,
    dcgm_fields.DCGM_FI_DEV_ECC_SBE_AGG_TOTAL,
    dcgm_fields.DCGM_FI_DEV_ECC_DBE_AGG_TOTAL,
    dcgm_fields.DCGM_FI_DEV_FB_TOTAL,
    dcgm_fields.DCGM_FI_DEV_FB_FREE,
    dcgm_fields.DCGM_FI_DEV_FB_USED,
    dcgm_fields.DCGM_FI_DEV_PCIE_REPLAY_COUNTER,
    dcgm_fields.DCGM_FI_DEV_POWER_VIOLATION,
    dcgm_fields.DCGM_FI_DEV_THERMAL_VIOLATION,
    dcgm_fields.DCGM_FI_DEV_XID_ERRORS,
    dcgm_fields.DCGM_FI_DEV_NVLINK_CRC_FLIT_ERROR_COUNT_TOTAL,
    dcgm_fields.DCGM_FI_DEV_NVLINK_CRC_DATA_ERROR_COUNT_TOTAL,
    dcgm_fields.DCGM_FI_DEV_NVLINK_REPLAY_ERROR_COUNT_TOTAL,
    dcgm_fields.DCGM_FI_DEV_NVLINK_RECOVERY_ERROR_COUNT_TOTAL,
    dcgm_fields.DCGM_FI_DEV_MEM_CLOCK,
    dcgm_fields.DCGM_FI_DEV_MEMORY_TEMP,
    dcgm_fields.DCGM_FI_DEV_TOTAL_ENERGY_CONSUMPTION,
    dcgm_fields.DCGM_FI_DEV_MEM_COPY_UTIL,
    dcgm_fields.DCGM_FI_DEV_NVLINK_BANDWIDTH_TOTAL,
    dcgm_fields.DCGM_FI_DEV_PCIE_TX_THROUGHPUT,
    dcgm_fields.DCGM_FI_DEV_PCIE_RX_THROUGHPUT,
]
48
+
49
+
50
def entity_group_id_to_string(entityGroupId):
    '''
    Return the display label for a DCGM entity group id.

    @params:
    entityGroupId : one of the dcgm_fields.DCGM_FE_* constants

    Returns the label string, or '' when the id matches none of the
    known entity groups.
    '''
    # Checked in order with ==, so the first matching constant wins.
    knownGroups = (
        (dcgm_fields.DCGM_FE_GPU, 'GPU'),
        (dcgm_fields.DCGM_FE_VGPU, 'VGPU'),
        (dcgm_fields.DCGM_FE_SWITCH, 'NVSWITCH'),
        (dcgm_fields.DCGM_FE_GPU_I, 'GPU INSTANCE'),
        (dcgm_fields.DCGM_FE_GPU_CI, 'COMPUTE INSTANCE'),
        (dcgm_fields.DCGM_FE_LINK, 'LINK'),
    )
    for groupId, label in knownGroups:
        if entityGroupId == groupId:
            return label
    return ''
65
+
66
+
67
+ class DcgmReader(object):
68
+ ###########################################################################
69
+ '''
70
+ This function can be implemented as a callback in the class that inherits from DcgmReader
71
+ to handle each field individually.
72
+ By default, it prints a string with the gpu, field tag, and value to stdout
73
+ @params:
74
+ gpuId : the id of the GPU this field is reporting on
75
+ fieldId : the id of the field (ignored by default, may be useful for children)
76
+ fieldTag : the string representation of the field id
77
+ val : the value class that comes from DCGM (v.value is the value for the field)
78
+ '''
79
+
80
def CustomFieldHandler(self, gpuId, fieldId, fieldTag, val):
    '''
    Default per-field callback: print one line for a single GPU field value.

    @params:
    gpuId    : id of the GPU the value belongs to
    fieldId  : id of the field (not used by this default implementation)
    fieldTag : string name of the field
    val      : value object; only val.value is read
    '''
    line = "GPU %s field %s=%s" % (str(gpuId), fieldTag, str(val.value))
    print(line)
82
+
83
+ ###########################################################################
84
+ '''
85
+ This function can be implemented as a callback in the class that inherits from DcgmReader
86
+ to handle each field individually.
87
+ By default, it prints a string with the entity, field tag, and value to stdout
88
+ @params:
89
+ entityGroupId : the type of entity this field is reporting on
90
+ entityId : the id of the entity this field is reporting on
91
+ fieldId : the id of the field (ignored by default, may be useful for children)
92
+ fieldTag : the string representation of the field id
93
+ val : the value class that comes from DCGM (v.value is the value for the field)
94
+ '''
95
+
96
def CustomFieldHandler_v2(self, entityGroupId, entityId, fieldId, fieldTag,
                          val):
    '''
    Default per-field callback for entity-based data: print one line for
    a single field value, prefixed with the entity group label.

    @params:
    entityGroupId : type of entity the value belongs to
    entityId      : id of the entity the value belongs to
    fieldId       : id of the field (not used by this default implementation)
    fieldTag      : string name of the field
    val           : value object; only val.value is read
    '''
    groupLabel = entity_group_id_to_string(entityGroupId)
    line = "%s %s field %s=%s" % (groupLabel, str(entityId), fieldTag,
                                  str(val.value))
    print(line)
100
+
101
+ ###########################################################################
102
+ '''
103
+ This function can be implemented as a callback in the class that inherits from DcgmReader
104
+ to handle all of the data queried from DCGM.
105
+ By default, it will simply print the field tags and values for each GPU
106
+ @params:
107
+ fvs : Data in the format entityGroupId -> entityId -> values (dictionary of dictionaries)
108
+ '''
109
+
110
def CustomDataHandler_v2(self, fvs):
    '''
    Walk entity-keyed field values and hand the newest non-blank value of
    every published field to CustomFieldHandler_v2.

    @params:
    fvs : nested dictionaries, entityGroupId -> entityId -> fieldId -> list
          of values; only the last element of each list is used
    '''
    # Keys are snapshotted before each loop so a handler that touches the
    # dictionaries does not disturb the iteration.
    for groupId in list(fvs.keys()):
        entitiesInGroup = fvs[groupId]

        for entityId in list(entitiesInGroup.keys()):
            fieldSamples = entitiesInGroup[entityId]

            for fieldId in list(fieldSamples.keys()):
                # Fields on the ignore list are queried but never published.
                if fieldId in self.m_dcgmIgnoreFields:
                    continue

                newest = fieldSamples[fieldId][-1]
                if not newest.isBlank:
                    tag = self.m_fieldIdToInfo[fieldId].tag
                    self.CustomFieldHandler_v2(groupId, entityId, fieldId,
                                               tag, newest)
129
+
130
+ ###########################################################################
131
+ '''
132
+ This function can be implemented as a callback in the class that inherits from DcgmReader
133
+ to handle all of the data queried from DCGM.
134
+ By default, it will simply print the field tags and values for each GPU
135
+ @params:
136
+ fvs : Dictionary with gpuID as key and values as Value
137
+ '''
138
+
139
def CustomDataHandler(self, fvs):
    '''
    Walk GPU-keyed field values and hand the newest non-blank value of
    every published field to CustomFieldHandler.

    @params:
    fvs : nested dictionaries, gpuId -> fieldId -> list of values; only
          the last element of each list is used
    '''
    # Keys are snapshotted before each loop so a handler that touches the
    # dictionaries does not disturb the iteration.
    for gpuId in list(fvs.keys()):
        fieldSamples = fvs[gpuId]

        for fieldId in list(fieldSamples.keys()):
            # Fields on the ignore list are queried but never published.
            if fieldId in self.m_dcgmIgnoreFields:
                continue

            newest = fieldSamples[fieldId][-1]
            if not newest.isBlank:
                tag = self.m_fieldIdToInfo[fieldId].tag
                self.CustomFieldHandler(gpuId, fieldId, tag, newest)
155
+
156
+ ###########################################################################
157
def SetupGpuIdUUIdMappings(self):
    '''
    Rebuild the m_gpuIdToUUId map (gpuId -> UUID) for the GPUs in the
    current DCGM group.

    The map is reset first, mirroring SetupGpuIdBusMappings, so entries
    for GPUs that are no longer part of the group do not linger after a
    reconnect.
    '''
    self.m_gpuIdToUUId = {}

    for gpuId in self.m_dcgmGroup.GetGpuIds():
        gpuInfo = self.m_dcgmSystem.discovery.GetGpuAttributes(gpuId)
        self.m_gpuIdToUUId[gpuId] = gpuInfo.identifiers.uuid
166
+
167
+ ###########################################################################
168
+ '''
169
+ Constructor
170
+ @params:
171
+ hostname : Address:port of the host to connect. Defaults to localhost
172
+ fieldIds : List of the field ids to publish. If it isn't specified, our default list is used.
173
+ updateFrequency : Frequency of update in microseconds. Defaults to 10 seconds or 10000000 microseconds
174
+ maxKeepAge : Max time to keep data from NVML, in seconds. Default is 3600.0 (1 hour)
175
+ ignoreList : List of the field ids we want to query but not publish.
176
+ gpuIds : List of GPU IDs to monitor. If not provided, DcgmReader will monitor all GPUs on the system
177
+ fieldIntervalMap: Map of intervals to list of field numbers to monitor. Takes precedence over fieldIds and updateFrequency if not None.
178
+ '''
179
+
180
+ def __init__(self,
181
+ hostname='localhost',
182
+ fieldIds=None,
183
+ updateFrequency=10000000,
184
+ maxKeepAge=3600.0,
185
+ ignoreList=None,
186
+ fieldGroupName='dcgm_fieldgroupData',
187
+ gpuIds=None,
188
+ entities=None,
189
+ fieldIntervalMap=None):
190
+ fieldIds = fieldIds or defaultFieldIds
191
+ ignoreList = ignoreList or []
192
+ self.m_dcgmHostName = hostname
193
+ self.m_updateFreq = updateFrequency # default / redundant
194
+
195
+ self.m_fieldGroupName = fieldGroupName
196
+ self.m_publishFields = {}
197
+
198
+ if fieldIntervalMap is not None:
199
+ self.m_publishFields = fieldIntervalMap
200
+ else:
201
+ self.m_publishFields[self.m_updateFreq] = fieldIds
202
+
203
+ self.m_requestedGpuIds = gpuIds
204
+ self.m_requestedEntities = entities
205
+
206
+ self.m_dcgmIgnoreFields = ignoreList #Fields not to publish
207
+ self.m_maxKeepAge = maxKeepAge
208
+ self.m_dcgmHandle = None
209
+ self.m_dcgmSystem = None
210
+ self.m_dcgmGroup = None
211
+ self.m_closeHandle = False
212
+
213
+ self.m_gpuIdToBusId = {} #GpuID => PCI-E busId string
214
+ self.m_gpuIdToUUId = {} # FieldId => dcgm_fields.dcgm_field_meta_t
215
+ self.m_fieldIdToInfo = {} #FieldId => dcgm_fields.dcgm_field_meta_t
216
+ self.m_lock = threading.Lock(
217
+ ) #DCGM connection start-up/shutdown is not thread safe. Just lock pessimistically
218
+ self.m_debug = False
219
+
220
+ # For GetAllSinceLastCall* calls. We cache the value for these objects
221
+ # after first retrieval, so initializing them to None lets us know if
222
+ # we've made a first retrieval. The first retrieval is based on a
223
+ # "since" timestamp of 0, so it gets data in which we are not
224
+ # interested in. The second retrieval gets data since the first one, in
225
+ # which we ARE interested. The practical upshot of this is that actual
226
+ # reporting of data is delayed one collectd sampling interval -- as if
227
+ # the sampling was actually started one collectd sampling interval
228
+ # later. We expect this is not an issue.
229
+ self.fvs = None
230
+ self.dfvc = None
231
+ self.dfvec = None
232
+
233
+ ###########################################################################
234
+ '''
235
+ Define what should happen to this object at the beginning of a with
236
+ block. In this case, nothing more is needed since the constructor should've
237
+ been called.
238
+ '''
239
+
240
+ def __enter__(self):
241
+ return self
242
+
243
+ ###########################################################################
244
+ '''
245
+ Define the cleanup
246
+ '''
247
+
248
+ def __exit__(self, type, value, traceback):
249
+ self.Shutdown()
250
+
251
+ ###########################################################################
252
+ '''
253
+ This function initializes DCGM from the specified directory and connects to
254
+ the host engine.
255
+ '''
256
+
257
def InitWrapped(self, path=None):
    '''
    Initialize the DCGM library and then connect via Reconnect().

    @params:
    path : passed to dcgm_structs._dcgmInit as libDcgmPath
    '''
    dcgm_structs._dcgmInit(libDcgmPath=path)
    self.Reconnect()
260
+
261
+ ###########################################################################
262
+ '''
263
+ This function tries to connect to hostengine and calls initwrapped to initialize
264
+ the dcgm.
265
+ '''
266
+
267
def Init(self, libpath=None):
    '''
    Initialize DCGM and connect to the host engine while holding m_lock.

    A DCGM_ST_CONNECTION_NOT_VALID error is logged and leaves the reader
    disconnected instead of propagating; any other exception is not
    caught here.

    @params:
    libpath : forwarded to InitWrapped as path
    '''
    with self.m_lock:
        try:
            self.InitWrapped(path=libpath)
        except dcgm_structs.dcgmExceptionClass(
                dcgm_structs.DCGM_ST_CONNECTION_NOT_VALID):
            self.LogError("Can't connect to nv-hostengine. Is it down?")
            self.SetDisconnected()
275
+
276
+ ###########################################################################
277
+ '''
278
+ Delete the DCGM group, DCGM system and DCGM handle and clear the attributes
279
+ on shutdown.
280
+ '''
281
+
282
def SetDisconnected(self):
    '''
    Drop the DCGM group, system and handle (in that order) and leave the
    three attributes set to None.
    '''
    # Force destructors since DCGM currently doesn't support more than one
    # client connection per process.
    for attrName in ("m_dcgmGroup", "m_dcgmSystem", "m_dcgmHandle"):
        if getattr(self, attrName) is not None:
            delattr(self, attrName)
        setattr(self, attrName, None)
293
+
294
+ ##########################################################################
295
+ '''
296
+ This function calls the SetDisconnected function which disconnects from
297
+ DCGM and clears DCGM handle and DCGM group.
298
+ '''
299
+
300
def Shutdown(self):
    '''
    Disconnect from DCGM if this reader owns the connection.

    m_closeHandle is only set by Reconnect(), which creates the handle
    itself; when it is not set, nothing is torn down here. Runs under
    m_lock.
    '''
    with self.m_lock:
        if self.m_closeHandle:
            self.SetDisconnected()
304
+
305
+ ############################################################################
306
+ '''
307
+ Turns debugging output on
308
+ '''
309
+
310
def AddDebugOutput(self):
    '''
    Set the m_debug flag to True.
    '''
    self.m_debug = True
312
+
313
+ ############################################################################
314
+ '''
315
+ '''
316
+
317
def InitializeFromHandle(self):
    '''
    Build all per-connection state from the already-set m_dcgmHandle.

    Selects the DCGM group (the default group when neither GPU ids nor
    entities were requested, otherwise a group named after this process
    id), then sets up the GPU id mappings, the field metadata and the
    field watches.
    '''
    self.m_dcgmSystem = self.m_dcgmHandle.GetSystem()

    wantedGpuIds = self.m_requestedGpuIds
    wantedEntities = self.m_requestedEntities

    if not wantedGpuIds and not wantedEntities:
        self.m_dcgmGroup = self.m_dcgmSystem.GetDefaultGroup()
    elif wantedGpuIds:
        groupName = "dcgmreader_%d" % os.getpid()
        self.m_dcgmGroup = self.m_dcgmSystem.GetGroupWithGpuIds(
            groupName, wantedGpuIds)
        # Entities requested alongside GPU ids are added to the same group.
        if wantedEntities:
            for entity in wantedEntities:
                self.m_dcgmGroup.AddEntity(entity.entityGroupId,
                                           entity.entityId)
    else:
        groupName = "dcgmreader_%d" % os.getpid()
        self.m_dcgmGroup = self.m_dcgmSystem.GetGroupWithEntities(
            groupName, wantedEntities)

    self.SetupGpuIdBusMappings()
    self.SetupGpuIdUUIdMappings()
    self.GetFieldMetadata()
    self.AddFieldWatches()
340
+
341
+ ############################################################################
342
+ '''
343
+ Has DcgmReader use but not own a handle. Currently for the unit tests.
344
+ '''
345
+
346
def SetHandle(self, handle):
    '''
    Wrap an externally supplied handle in a pydcgm.DcgmHandle and
    initialize the reader from it. m_closeHandle is left untouched.

    @params:
    handle : passed as the first argument to pydcgm.DcgmHandle
    '''
    wrapped = pydcgm.DcgmHandle(handle)
    self.m_dcgmHandle = wrapped
    self.InitializeFromHandle()
349
+
350
+ ############################################################################
351
+ '''
352
+ Reconnect function checks if connection handle is present. If the handle is
353
+ none, it creates the handle and gets the default DCGM group. It then maps
354
+ gpuIds to BusID, set the meta data of the field ids and adds watches to the
355
+ field Ids mentioned in the idToWatch list.
356
+ '''
357
+
358
def Reconnect(self):
    '''
    Connect to the host engine unless a handle already exists.

    When m_dcgmHandle is None, a new pydcgm.DcgmHandle is created for
    m_dcgmHostName, the reader is marked as owning it (m_closeHandle),
    and the remaining state is built by InitializeFromHandle(). When a
    handle is already present this is a no-op.
    '''
    if self.m_dcgmHandle is None:
        self.LogDebug("Connection handle is None. Trying to reconnect")

        self.m_dcgmHandle = pydcgm.DcgmHandle(
            None, self.m_dcgmHostName,
            dcgm_structs.DCGM_OPERATION_MODE_AUTO)
        # This reader created the handle, so Shutdown() may tear it down.
        self.m_closeHandle = True

        self.LogDebug("Connected to nv-hostengine")

        self.InitializeFromHandle()
371
+
372
+ ###########################################################################
373
+ '''
374
+ Populate the g_gpuIdToBusId map. This map contains mapping from
375
+ gpuID to the BusID.
376
+ '''
377
+
378
def SetupGpuIdBusMappings(self):
    '''
    Rebuild the m_gpuIdToBusId map (gpuId -> PCI bus id) from scratch
    for the GPUs in the current DCGM group.
    '''
    busIdByGpu = self.m_gpuIdToBusId = {}

    for gpuId in self.m_dcgmGroup.GetGpuIds():
        attributes = self.m_dcgmSystem.discovery.GetGpuAttributes(gpuId)
        busIdByGpu[gpuId] = attributes.identifiers.pciBusId
385
+
386
+ ###########################################################################
387
+ '''
388
+ Add watches to the fields which are passed in init function in idToWatch
389
+ list. It also updates the field values for the first time.
390
+ '''
391
+
392
def AddFieldWatches(self):
    '''
    Start a watch on every field group in m_fieldGroups at that group's
    interval, then trigger one UpdateAllFields(1) call.
    '''
    noSampleLimit = 0  # No limit. Handled by m_maxKeepAge

    for interval in self.m_fieldGroups:
        group = self.m_fieldGroups[interval]
        self.LogDebug("AddWatchFields: interval = " + str(interval) + "\n")
        self.m_dcgmGroup.samples.WatchFields(group, interval,
                                             self.m_maxKeepAge,
                                             noSampleLimit)

    self.m_dcgmSystem.UpdateAllFields(1)
    self.LogDebug("AddWatchFields exit\n")
401
+
402
+ ###########################################################################
403
+ '''
404
+ If the groupID already exists, we delete that group and create a new fieldgroup with
405
+ the fields mentioned in idToWatch. Then information of each field is acquired from its id.
406
+ '''
407
+
408
def GetFieldMetadata(self):
    '''
    (Re)create the DCGM field groups and cache metadata for every field.

    For each (interval, fieldIds) entry of m_publishFields a field group
    named "<m_fieldGroupName>_<n>" (n counting from 1) is created and
    stored in m_fieldGroups[interval]; a field group that already exists
    under that name is deleted first. Metadata for each field id is
    cached in m_fieldIdToInfo. Finally one field group named
    m_fieldGroupName holding all distinct field ids is created and
    stored in m_fieldGroup.

    Raises dcgm_structs.DCGMError(DCGM_ST_UNKNOWN_FIELD) when no metadata
    can be found for a field id.
    '''
    self.m_fieldIdToInfo = {}
    self.m_fieldGroups = {}
    self.m_fieldGroup = None
    allFieldIds = []

    # Initialize groups for all field intervals.
    self.LogDebug("GetFieldMetaData:\n")

    for intervalIndex, (interval, fieldIds) in enumerate(
            self.m_publishFields.items(), start=1):
        self.LogDebug("sampling interval = " + str(interval) + ":\n")
        for fieldId in fieldIds:
            self.LogDebug(" fieldId: " + str(fieldId) + "\n")

        fieldGroupName = self.m_fieldGroupName + "_" + str(intervalIndex)
        findByNameId = self.m_dcgmSystem.GetFieldGroupIdByName(
            fieldGroupName)
        self.LogDebug("fieldGroupName: " + fieldGroupName + "\n")

        # Remove our field group if it exists already
        if findByNameId is not None:
            # The id is not guaranteed to be a str, so convert it before
            # concatenating; otherwise this debug line raises TypeError.
            self.LogDebug("fieldGroupId: " + str(findByNameId) + "\n")
            delFieldGroup = pydcgm.DcgmFieldGroup(
                dcgmHandle=self.m_dcgmHandle, fieldGroupId=findByNameId)
            delFieldGroup.Delete()
            del delFieldGroup

        self.m_fieldGroups[interval] = pydcgm.DcgmFieldGroup(
            self.m_dcgmHandle, fieldGroupName, fieldIds)

        for fieldId in fieldIds:
            # Keep first-seen order while dropping duplicates across
            # intervals.
            if fieldId not in allFieldIds:
                allFieldIds.append(fieldId)

            fieldInfo = self.m_dcgmSystem.fields.GetFieldById(fieldId)
            self.m_fieldIdToInfo[fieldId] = fieldInfo
            if fieldInfo is None or fieldInfo == 0:
                self.LogError(
                    "Cannot get field tag for field id %d. Please check dcgm_fields to see if it is valid."
                    % (fieldId))
                raise dcgm_structs.DCGMError(
                    dcgm_structs.DCGM_ST_UNKNOWN_FIELD)

    # Initialize a field group of ALL fields.
    fieldGroupName = self.m_fieldGroupName
    findByNameId = self.m_dcgmSystem.GetFieldGroupIdByName(fieldGroupName)

    # Remove our field group if it exists already
    if findByNameId is not None:
        delFieldGroup = pydcgm.DcgmFieldGroup(dcgmHandle=self.m_dcgmHandle,
                                              fieldGroupId=findByNameId)
        delFieldGroup.Delete()
        del delFieldGroup

    self.m_fieldGroup = pydcgm.DcgmFieldGroup(self.m_dcgmHandle,
                                              fieldGroupName, allFieldIds)
466
+
467
+ ###########################################################################
468
+ '''
469
+ This function attempts to connect to DCGM and calls the implemented
470
+ CustomDataHandler in the child class with field values.
471
+ @params:
472
+ self.m_dcgmGroup.samples.GetLatest(self.m_fieldGroup).values : The field
473
+ values for each field. This dictionary contains fieldInfo for each field id
474
+ requested to be watched.
475
+ '''
476
+
477
def Process(self):
    '''
    Fetch everything sampled since the previous call and pass it to the
    data handler, all while holding m_lock.

    When entities were requested, GetAllSinceLastCall_v2 feeds
    CustomDataHandler_v2 (cached in dfvec); otherwise GetAllSinceLastCall
    feeds CustomDataHandler (cached in dfvc). The collection's values are
    emptied after the handler returns. A DCGM_ST_CONNECTION_NOT_VALID
    error is logged and leaves the reader disconnected.
    '''
    with self.m_lock:
        try:
            self.Reconnect()

            # The first call just clears the collection set.

            if self.m_requestedEntities:
                self.dfvec = self.m_dcgmGroup.samples.GetAllSinceLastCall_v2(
                    self.dfvec, self.m_fieldGroup)
                self.CustomDataHandler_v2(self.dfvec.values)
                self.dfvec.EmptyValues()
            else:
                self.dfvc = self.m_dcgmGroup.samples.GetAllSinceLastCall(
                    self.dfvc, self.m_fieldGroup)
                self.CustomDataHandler(self.dfvc.values)
                self.dfvc.EmptyValues()
        except dcgm_structs.dcgmExceptionClass(
                dcgm_structs.DCGM_ST_CONNECTION_NOT_VALID):
            self.LogError("Can't connect to nv-hostengine. Is it down?")
            self.SetDisconnected()
498
+
499
+ ###########################################################################
500
def LogInfo(self, msg):
    """Emit ``msg`` through the root logger at INFO level."""
    logging.log(logging.INFO, msg)
502
+
503
+ ###########################################################################
504
def LogDebug(self, msg):
    """Emit ``msg`` through the root logger at DEBUG level."""
    logging.log(logging.DEBUG, msg)
506
+
507
+ ###########################################################################
508
def LogError(self, msg):
    """Emit ``msg`` through the root logger at ERROR level."""
    logging.log(logging.ERROR, msg)
510
+
511
+ ###########################################################################
512
+ '''
513
+ This function gets each value as a dictionary of dictionaries. The dictionary
514
+ returned is each gpu id mapped to a dictionary of its field values. Each
515
+ field value dictionary is the field name mapped to the value or the field
516
+ id mapped to value depending on the parameter mapById.
517
+ '''
518
+
519
def GetLatestGpuValuesAsDict(self, mapById):
    """Return the most recent non-blank sample of every watched field, per GPU.

    Args:
        mapById: When this compares equal to ``False`` the inner dictionaries
            are keyed by the field tag taken from ``self.m_fieldIdToInfo``;
            for any other value they are keyed by field id.

    Returns:
        dict: ``{gpuId: {fieldTag or fieldId: value}}``. Every GPU reported
        by DCGM gets an entry; fields whose latest sample is blank (or that
        carry no samples at all) are left out. If the connection to the host
        engine is not valid, the error is logged, the reader is marked
        disconnected and whatever was collected so far is returned.
    """
    systemDictionary = {}

    # Deliberately `== False` rather than `not mapById`: None must keep
    # selecting the field-id mapping, exactly as it always has.
    keyByTag = (mapById == False)  # noqa: E712

    with self.m_lock:
        try:
            self.Reconnect()
            fvs = self.m_dcgmGroup.samples.GetLatest(
                self.m_fieldGroup).values
            for gpuId in fvs.keys():
                # Each gpu starts with an empty dictionary, even if every
                # one of its fields turns out to be blank.
                gpuValues = systemDictionary[gpuId] = {}
                gpuFv = fvs[gpuId]

                for fieldId in gpuFv.keys():
                    samples = gpuFv[fieldId]
                    if not samples:
                        # No sample recorded for this field; nothing to report.
                        continue

                    val = samples[-1]
                    if val.isBlank:
                        continue

                    if keyByTag:
                        key = self.m_fieldIdToInfo[fieldId].tag
                    else:
                        key = fieldId
                    gpuValues[key] = val.value
        except dcgm_structs.dcgmExceptionClass(
                dcgm_structs.DCGM_ST_CONNECTION_NOT_VALID):
            self.LogError(
                "Can't connection to nv-hostengine. Please verify that it is running."
            )
            self.SetDisconnected()

    return systemDictionary
555
+
556
+ ###########################################################################
557
+ '''
558
+ This function gets value as a dictionary of dictionaries of lists. The
559
+ dictionary returned is each gpu id mapped to a dictionary of its field
560
+ value lists. Each field value dictionary is the field name mapped to the
561
+ list of values or the field id mapped to list of values depending on the
562
+ parameter mapById. The list of values are the values for each field since
563
+ the last retrieval.
564
+ '''
565
+
566
def GetAllGpuValuesAsDictSinceLastCall(self, mapById):
    """Return every non-blank sample gathered since the previous call, per GPU.

    Args:
        mapById: When this compares equal to ``False`` the inner dictionaries
            are keyed by the field tag taken from ``self.m_fieldIdToInfo``;
            for any other value they are keyed by field id.

    Returns:
        dict: ``{gpuId: {fieldTag or fieldId: [sample, ...]}}`` holding the
        sample objects themselves. Nothing is reported while ``self.fvs`` is
        still ``None`` on entry (the call only fetches the collection), so
        the result is empty in that case. The stored collection is emptied
        before returning.
    """
    systemDictionary = {}

    with self.m_lock:
        try:
            self.Reconnect()
            hadCollection = self.fvs is not None
            self.fvs = self.m_dcgmGroup.samples.GetAllSinceLastCall(
                self.fvs, self.m_fieldGroup)
            if hadCollection:
                for gpuId in list(self.fvs.values.keys()):
                    # initialize the gpu's dictionary
                    perGpu = systemDictionary[gpuId] = {}
                    gpuFv = self.fvs.values[gpuId]

                    for fieldId in list(gpuFv.keys()):
                        for val in gpuFv[fieldId]:
                            if val.isBlank:
                                continue

                            if mapById == False:  # noqa: E712
                                key = self.m_fieldIdToInfo[fieldId].tag
                            else:
                                key = fieldId
                            # The list is created on the first non-blank
                            # sample only, so all-blank fields stay absent.
                            perGpu.setdefault(key, []).append(val)
        except dcgm_structs.dcgmExceptionClass(
                dcgm_structs.DCGM_ST_CONNECTION_NOT_VALID):
            self.LogError(
                "Can't connection to nv-hostengine. Please verify that it is running."
            )
            self.SetDisconnected()

        if self.fvs is not None:
            self.fvs.EmptyValues()

    return systemDictionary
608
+
609
+ ###########################################################################
610
def GetLatestGpuValuesAsFieldIdDict(self):
    """Return ``GetLatestGpuValuesAsDict(True)``: latest values keyed by field id."""
    keyedById = True
    return self.GetLatestGpuValuesAsDict(keyedById)
612
+
613
+ ###########################################################################
614
def GetLatestGpuValuesAsFieldNameDict(self):
    """Return ``GetLatestGpuValuesAsDict(False)``: latest values keyed by field tag."""
    keyedById = False
    return self.GetLatestGpuValuesAsDict(keyedById)
616
+
617
+ ###########################################################################
618
def GetAllGpuValuesAsFieldIdDictSinceLastCall(self):
    """Return ``GetAllGpuValuesAsDictSinceLastCall(True)``: samples keyed by field id."""
    keyedById = True
    return self.GetAllGpuValuesAsDictSinceLastCall(keyedById)
620
+
621
+ ###########################################################################
622
def GetAllGpuValuesAsFieldNameDictSinceLastCall(self):
    """Return ``GetAllGpuValuesAsDictSinceLastCall(False)``: samples keyed by field tag."""
    keyedById = False
    return self.GetAllGpuValuesAsDictSinceLastCall(keyedById)
@@ -0,0 +1,57 @@
1
+ # Copyright (c) 2022, NVIDIA CORPORATION & AFFILIATES. All rights reserved.
2
+ #
3
+ # Licensed under the Apache License, Version 2.0 (the "License");
4
+ # you may not use this file except in compliance with the License.
5
+ # You may obtain a copy of the License at
6
+ #
7
+ # http://www.apache.org/licenses/LICENSE-2.0
8
+ #
9
+ # Unless required by applicable law or agreed to in writing, software
10
+ # distributed under the License is distributed on an "AS IS" BASIS,
11
+ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12
+ # See the License for the specific language governing permissions and
13
+ # limitations under the License.
14
+
15
+ import model_analyzer.monitor.dcgm.pydcgm as pydcgm
16
+ import model_analyzer.monitor.dcgm.dcgm_agent as dcgm_agent
17
+ import model_analyzer.monitor.dcgm.dcgm_structs as dcgm_structs
18
+
19
+
20
class DcgmStatus:
    """Owns a DCGM status handle and the errors popped from it.

    The handle comes from ``dcgm_agent.dcgmStatusCreate()`` and is handed back
    to ``dcgm_agent.dcgmStatusDestroy()`` when the object is finalized.
    Errors popped from the handle accumulate in ``self.errors``.
    """

    def __init__(self):
        self.handle = dcgm_agent.dcgmStatusCreate()
        self.errors = []

    def __del__(self):
        # __del__ also runs when __init__ raised before `handle` was assigned;
        # in that case there is nothing to destroy.
        try:
            handle = self.handle
        except AttributeError:
            return
        dcgm_agent.dcgmStatusDestroy(handle)

    def UpdateErrors(self):
        """Take any errors stored in our handle and update self.errors with them."""
        errorCount = dcgm_agent.dcgmStatusGetCount(self.handle)

        # range() is empty for counts below 1, so no separate guard is needed.
        for _ in range(errorCount):
            self.errors.append(dcgm_agent.dcgmStatusPopError(self.handle))

    def ThrowExceptionOnErrors(self):
        """Throw an exception if any errors are stored in our status handle.

        Raises:
            dcgm_structs.DCGMError: built from the ``status`` of the first
                stored error. Only that status code is passed to the
                exception; the remaining errors stay in ``self.errors``.
        """
        # Make sure we've captured all errors before looking at them
        self.UpdateErrors()

        if not self.errors:
            return

        raise dcgm_structs.DCGMError(self.errors[0].status)