triton-model-analyzer 1.48.0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (204) hide show
  1. model_analyzer/__init__.py +15 -0
  2. model_analyzer/analyzer.py +448 -0
  3. model_analyzer/cli/__init__.py +15 -0
  4. model_analyzer/cli/cli.py +193 -0
  5. model_analyzer/config/__init__.py +15 -0
  6. model_analyzer/config/generate/__init__.py +15 -0
  7. model_analyzer/config/generate/automatic_model_config_generator.py +164 -0
  8. model_analyzer/config/generate/base_model_config_generator.py +352 -0
  9. model_analyzer/config/generate/brute_plus_binary_parameter_search_run_config_generator.py +164 -0
  10. model_analyzer/config/generate/brute_run_config_generator.py +154 -0
  11. model_analyzer/config/generate/concurrency_sweeper.py +75 -0
  12. model_analyzer/config/generate/config_generator_interface.py +52 -0
  13. model_analyzer/config/generate/coordinate.py +143 -0
  14. model_analyzer/config/generate/coordinate_data.py +86 -0
  15. model_analyzer/config/generate/generator_utils.py +116 -0
  16. model_analyzer/config/generate/manual_model_config_generator.py +187 -0
  17. model_analyzer/config/generate/model_config_generator_factory.py +92 -0
  18. model_analyzer/config/generate/model_profile_spec.py +74 -0
  19. model_analyzer/config/generate/model_run_config_generator.py +154 -0
  20. model_analyzer/config/generate/model_variant_name_manager.py +150 -0
  21. model_analyzer/config/generate/neighborhood.py +536 -0
  22. model_analyzer/config/generate/optuna_plus_concurrency_sweep_run_config_generator.py +141 -0
  23. model_analyzer/config/generate/optuna_run_config_generator.py +838 -0
  24. model_analyzer/config/generate/perf_analyzer_config_generator.py +312 -0
  25. model_analyzer/config/generate/quick_plus_concurrency_sweep_run_config_generator.py +130 -0
  26. model_analyzer/config/generate/quick_run_config_generator.py +753 -0
  27. model_analyzer/config/generate/run_config_generator_factory.py +329 -0
  28. model_analyzer/config/generate/search_config.py +112 -0
  29. model_analyzer/config/generate/search_dimension.py +73 -0
  30. model_analyzer/config/generate/search_dimensions.py +85 -0
  31. model_analyzer/config/generate/search_parameter.py +49 -0
  32. model_analyzer/config/generate/search_parameters.py +388 -0
  33. model_analyzer/config/input/__init__.py +15 -0
  34. model_analyzer/config/input/config_command.py +483 -0
  35. model_analyzer/config/input/config_command_profile.py +1747 -0
  36. model_analyzer/config/input/config_command_report.py +267 -0
  37. model_analyzer/config/input/config_defaults.py +236 -0
  38. model_analyzer/config/input/config_enum.py +83 -0
  39. model_analyzer/config/input/config_field.py +216 -0
  40. model_analyzer/config/input/config_list_generic.py +112 -0
  41. model_analyzer/config/input/config_list_numeric.py +151 -0
  42. model_analyzer/config/input/config_list_string.py +111 -0
  43. model_analyzer/config/input/config_none.py +71 -0
  44. model_analyzer/config/input/config_object.py +129 -0
  45. model_analyzer/config/input/config_primitive.py +81 -0
  46. model_analyzer/config/input/config_status.py +75 -0
  47. model_analyzer/config/input/config_sweep.py +83 -0
  48. model_analyzer/config/input/config_union.py +113 -0
  49. model_analyzer/config/input/config_utils.py +128 -0
  50. model_analyzer/config/input/config_value.py +243 -0
  51. model_analyzer/config/input/objects/__init__.py +15 -0
  52. model_analyzer/config/input/objects/config_model_profile_spec.py +325 -0
  53. model_analyzer/config/input/objects/config_model_report_spec.py +173 -0
  54. model_analyzer/config/input/objects/config_plot.py +198 -0
  55. model_analyzer/config/input/objects/config_protobuf_utils.py +101 -0
  56. model_analyzer/config/input/yaml_config_validator.py +82 -0
  57. model_analyzer/config/run/__init__.py +15 -0
  58. model_analyzer/config/run/model_run_config.py +313 -0
  59. model_analyzer/config/run/run_config.py +168 -0
  60. model_analyzer/constants.py +76 -0
  61. model_analyzer/device/__init__.py +15 -0
  62. model_analyzer/device/device.py +24 -0
  63. model_analyzer/device/gpu_device.py +87 -0
  64. model_analyzer/device/gpu_device_factory.py +248 -0
  65. model_analyzer/entrypoint.py +307 -0
  66. model_analyzer/log_formatter.py +65 -0
  67. model_analyzer/model_analyzer_exceptions.py +24 -0
  68. model_analyzer/model_manager.py +255 -0
  69. model_analyzer/monitor/__init__.py +15 -0
  70. model_analyzer/monitor/cpu_monitor.py +69 -0
  71. model_analyzer/monitor/dcgm/DcgmDiag.py +191 -0
  72. model_analyzer/monitor/dcgm/DcgmFieldGroup.py +83 -0
  73. model_analyzer/monitor/dcgm/DcgmGroup.py +815 -0
  74. model_analyzer/monitor/dcgm/DcgmHandle.py +141 -0
  75. model_analyzer/monitor/dcgm/DcgmJsonReader.py +69 -0
  76. model_analyzer/monitor/dcgm/DcgmReader.py +623 -0
  77. model_analyzer/monitor/dcgm/DcgmStatus.py +57 -0
  78. model_analyzer/monitor/dcgm/DcgmSystem.py +412 -0
  79. model_analyzer/monitor/dcgm/__init__.py +15 -0
  80. model_analyzer/monitor/dcgm/common/__init__.py +13 -0
  81. model_analyzer/monitor/dcgm/common/dcgm_client_cli_parser.py +194 -0
  82. model_analyzer/monitor/dcgm/common/dcgm_client_main.py +86 -0
  83. model_analyzer/monitor/dcgm/dcgm_agent.py +887 -0
  84. model_analyzer/monitor/dcgm/dcgm_collectd_plugin.py +369 -0
  85. model_analyzer/monitor/dcgm/dcgm_errors.py +395 -0
  86. model_analyzer/monitor/dcgm/dcgm_field_helpers.py +546 -0
  87. model_analyzer/monitor/dcgm/dcgm_fields.py +815 -0
  88. model_analyzer/monitor/dcgm/dcgm_fields_collectd.py +671 -0
  89. model_analyzer/monitor/dcgm/dcgm_fields_internal.py +29 -0
  90. model_analyzer/monitor/dcgm/dcgm_fluentd.py +45 -0
  91. model_analyzer/monitor/dcgm/dcgm_monitor.py +138 -0
  92. model_analyzer/monitor/dcgm/dcgm_prometheus.py +326 -0
  93. model_analyzer/monitor/dcgm/dcgm_structs.py +2357 -0
  94. model_analyzer/monitor/dcgm/dcgm_telegraf.py +65 -0
  95. model_analyzer/monitor/dcgm/dcgm_value.py +151 -0
  96. model_analyzer/monitor/dcgm/dcgmvalue.py +155 -0
  97. model_analyzer/monitor/dcgm/denylist_recommendations.py +573 -0
  98. model_analyzer/monitor/dcgm/pydcgm.py +47 -0
  99. model_analyzer/monitor/monitor.py +143 -0
  100. model_analyzer/monitor/remote_monitor.py +137 -0
  101. model_analyzer/output/__init__.py +15 -0
  102. model_analyzer/output/file_writer.py +63 -0
  103. model_analyzer/output/output_writer.py +42 -0
  104. model_analyzer/perf_analyzer/__init__.py +15 -0
  105. model_analyzer/perf_analyzer/genai_perf_config.py +206 -0
  106. model_analyzer/perf_analyzer/perf_analyzer.py +882 -0
  107. model_analyzer/perf_analyzer/perf_config.py +479 -0
  108. model_analyzer/plots/__init__.py +15 -0
  109. model_analyzer/plots/detailed_plot.py +266 -0
  110. model_analyzer/plots/plot_manager.py +224 -0
  111. model_analyzer/plots/simple_plot.py +213 -0
  112. model_analyzer/record/__init__.py +15 -0
  113. model_analyzer/record/gpu_record.py +68 -0
  114. model_analyzer/record/metrics_manager.py +887 -0
  115. model_analyzer/record/record.py +280 -0
  116. model_analyzer/record/record_aggregator.py +256 -0
  117. model_analyzer/record/types/__init__.py +15 -0
  118. model_analyzer/record/types/cpu_available_ram.py +93 -0
  119. model_analyzer/record/types/cpu_used_ram.py +93 -0
  120. model_analyzer/record/types/gpu_free_memory.py +96 -0
  121. model_analyzer/record/types/gpu_power_usage.py +107 -0
  122. model_analyzer/record/types/gpu_total_memory.py +96 -0
  123. model_analyzer/record/types/gpu_used_memory.py +96 -0
  124. model_analyzer/record/types/gpu_utilization.py +108 -0
  125. model_analyzer/record/types/inter_token_latency_avg.py +60 -0
  126. model_analyzer/record/types/inter_token_latency_base.py +74 -0
  127. model_analyzer/record/types/inter_token_latency_max.py +60 -0
  128. model_analyzer/record/types/inter_token_latency_min.py +60 -0
  129. model_analyzer/record/types/inter_token_latency_p25.py +60 -0
  130. model_analyzer/record/types/inter_token_latency_p50.py +60 -0
  131. model_analyzer/record/types/inter_token_latency_p75.py +60 -0
  132. model_analyzer/record/types/inter_token_latency_p90.py +60 -0
  133. model_analyzer/record/types/inter_token_latency_p95.py +60 -0
  134. model_analyzer/record/types/inter_token_latency_p99.py +60 -0
  135. model_analyzer/record/types/output_token_throughput.py +105 -0
  136. model_analyzer/record/types/perf_client_response_wait.py +97 -0
  137. model_analyzer/record/types/perf_client_send_recv.py +97 -0
  138. model_analyzer/record/types/perf_latency.py +111 -0
  139. model_analyzer/record/types/perf_latency_avg.py +60 -0
  140. model_analyzer/record/types/perf_latency_base.py +74 -0
  141. model_analyzer/record/types/perf_latency_p90.py +60 -0
  142. model_analyzer/record/types/perf_latency_p95.py +60 -0
  143. model_analyzer/record/types/perf_latency_p99.py +60 -0
  144. model_analyzer/record/types/perf_server_compute_infer.py +97 -0
  145. model_analyzer/record/types/perf_server_compute_input.py +97 -0
  146. model_analyzer/record/types/perf_server_compute_output.py +97 -0
  147. model_analyzer/record/types/perf_server_queue.py +97 -0
  148. model_analyzer/record/types/perf_throughput.py +105 -0
  149. model_analyzer/record/types/time_to_first_token_avg.py +60 -0
  150. model_analyzer/record/types/time_to_first_token_base.py +74 -0
  151. model_analyzer/record/types/time_to_first_token_max.py +60 -0
  152. model_analyzer/record/types/time_to_first_token_min.py +60 -0
  153. model_analyzer/record/types/time_to_first_token_p25.py +60 -0
  154. model_analyzer/record/types/time_to_first_token_p50.py +60 -0
  155. model_analyzer/record/types/time_to_first_token_p75.py +60 -0
  156. model_analyzer/record/types/time_to_first_token_p90.py +60 -0
  157. model_analyzer/record/types/time_to_first_token_p95.py +60 -0
  158. model_analyzer/record/types/time_to_first_token_p99.py +60 -0
  159. model_analyzer/reports/__init__.py +15 -0
  160. model_analyzer/reports/html_report.py +195 -0
  161. model_analyzer/reports/pdf_report.py +50 -0
  162. model_analyzer/reports/report.py +86 -0
  163. model_analyzer/reports/report_factory.py +62 -0
  164. model_analyzer/reports/report_manager.py +1376 -0
  165. model_analyzer/reports/report_utils.py +42 -0
  166. model_analyzer/result/__init__.py +15 -0
  167. model_analyzer/result/constraint_manager.py +150 -0
  168. model_analyzer/result/model_config_measurement.py +354 -0
  169. model_analyzer/result/model_constraints.py +105 -0
  170. model_analyzer/result/parameter_search.py +246 -0
  171. model_analyzer/result/result_manager.py +430 -0
  172. model_analyzer/result/result_statistics.py +159 -0
  173. model_analyzer/result/result_table.py +217 -0
  174. model_analyzer/result/result_table_manager.py +646 -0
  175. model_analyzer/result/result_utils.py +42 -0
  176. model_analyzer/result/results.py +277 -0
  177. model_analyzer/result/run_config_measurement.py +658 -0
  178. model_analyzer/result/run_config_result.py +210 -0
  179. model_analyzer/result/run_config_result_comparator.py +110 -0
  180. model_analyzer/result/sorted_results.py +151 -0
  181. model_analyzer/state/__init__.py +15 -0
  182. model_analyzer/state/analyzer_state.py +76 -0
  183. model_analyzer/state/analyzer_state_manager.py +215 -0
  184. model_analyzer/triton/__init__.py +15 -0
  185. model_analyzer/triton/client/__init__.py +15 -0
  186. model_analyzer/triton/client/client.py +234 -0
  187. model_analyzer/triton/client/client_factory.py +57 -0
  188. model_analyzer/triton/client/grpc_client.py +104 -0
  189. model_analyzer/triton/client/http_client.py +107 -0
  190. model_analyzer/triton/model/__init__.py +15 -0
  191. model_analyzer/triton/model/model_config.py +556 -0
  192. model_analyzer/triton/model/model_config_variant.py +29 -0
  193. model_analyzer/triton/server/__init__.py +15 -0
  194. model_analyzer/triton/server/server.py +76 -0
  195. model_analyzer/triton/server/server_config.py +269 -0
  196. model_analyzer/triton/server/server_docker.py +229 -0
  197. model_analyzer/triton/server/server_factory.py +306 -0
  198. model_analyzer/triton/server/server_local.py +158 -0
  199. triton_model_analyzer-1.48.0.dist-info/METADATA +52 -0
  200. triton_model_analyzer-1.48.0.dist-info/RECORD +204 -0
  201. triton_model_analyzer-1.48.0.dist-info/WHEEL +5 -0
  202. triton_model_analyzer-1.48.0.dist-info/entry_points.txt +2 -0
  203. triton_model_analyzer-1.48.0.dist-info/licenses/LICENSE +67 -0
  204. triton_model_analyzer-1.48.0.dist-info/top_level.txt +1 -0
@@ -0,0 +1,815 @@
1
+ # Copyright (c) 2022, NVIDIA CORPORATION & AFFILIATES. All rights reserved.
2
+ #
3
+ # Licensed under the Apache License, Version 2.0 (the "License");
4
+ # you may not use this file except in compliance with the License.
5
+ # You may obtain a copy of the License at
6
+ #
7
+ # http://www.apache.org/licenses/LICENSE-2.0
8
+ #
9
+ # Unless required by applicable law or agreed to in writing, software
10
+ # distributed under the License is distributed on an "AS IS" BASIS,
11
+ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12
+ # See the License for the specific language governing permissions and
13
+ # limitations under the License.
14
+
15
+ import model_analyzer.monitor.dcgm.pydcgm as pydcgm
16
+ import model_analyzer.monitor.dcgm.dcgm_agent as dcgm_agent
17
+ import model_analyzer.monitor.dcgm.dcgm_structs as dcgm_structs
18
+ import model_analyzer.monitor.dcgm.dcgm_fields as dcgm_fields
19
+ import model_analyzer.monitor.dcgm.dcgm_field_helpers as dcgm_field_helpers
20
+ from model_analyzer.monitor.dcgm.DcgmHandle import DcgmHandle
21
+
22
+
23
class DcgmGroupConfig:
    '''
    Sets, reads back and enforces device configuration for one DCGM group.
    '''

    def __init__(self, dcgmHandle, groupId, dcgmGroup):
        self._dcgmHandle = dcgmHandle
        self._groupId = groupId
        self._dcgmGroup = dcgmGroup

    def Set(self, config):
        '''
        Set configuration for this group

        config should be an instance of dcgm_structs.c_dcgmDeviceConfig_v1

        Will throw an exception on error. Errors reported through the status
        handle are raised first; if there are none, a DCGMError raised by the
        agent call is re-raised instead of being discarded.
        '''
        status = pydcgm.DcgmStatus()
        ret = dcgm_structs.DCGM_ST_OK
        agentError = None

        try:
            ret = dcgm_agent.dcgmConfigSet(self._dcgmHandle.handle,
                                           self._groupId, config,
                                           status.handle)
        except dcgm_structs.DCGMError as e:
            # Defer the generic error so that the status check below gets the
            # chance to raise a more specific one first.
            agentError = e

        #Throw specific errors before return error
        status.ThrowExceptionOnErrors()
        # Nothing specific was raised: do not swallow the agent's own error
        if agentError is not None:
            raise agentError
        #Throw an appropriate exception on error
        dcgm_structs._dcgmCheckReturn(ret)

    def Get(self, configType):
        '''
        Get configuration for this group

        configType is a DCGM_CONFIG_? constant

        Returns an array of dcgm_structs.c_dcgmDeviceConfig_v1 objects
        Throws an exception on error
        '''
        status = pydcgm.DcgmStatus()

        # The agent call is given the number of GPUs currently in the group
        gpuIds = self._dcgmGroup.GetGpuIds()
        configList = dcgm_agent.dcgmConfigGet(self._dcgmHandle.handle,
                                              self._groupId, configType,
                                              len(gpuIds), status.handle)
        #Throw specific errors before return error
        status.ThrowExceptionOnErrors()
        return configList

    def Enforce(self):
        '''
        Enforce the configuration that has been set with Set()

        Throws an exception on error. Errors reported through the status
        handle are raised first; if there are none, a DCGMError raised by the
        agent call is re-raised instead of being discarded.
        '''
        status = pydcgm.DcgmStatus()
        ret = dcgm_structs.DCGM_ST_OK
        agentError = None

        try:
            ret = dcgm_agent.dcgmConfigEnforce(self._dcgmHandle.handle,
                                               self._groupId, status.handle)
        except dcgm_structs.DCGMError as e:
            # Defer the generic error so that the status check below gets the
            # chance to raise a more specific one first.
            agentError = e

        #Throw specific errors before return error
        status.ThrowExceptionOnErrors()
        # Nothing specific was raised: do not swallow the agent's own error
        if agentError is not None:
            raise agentError
        #Throw an appropriate exception on error
        dcgm_structs._dcgmCheckReturn(ret)
92
+
93
+
94
class DcgmGroupSamples:
    '''
    Field watching and sample retrieval for one DCGM group.
    '''

    def __init__(self, dcgmHandle, groupId, dcgmGroup):
        self._dcgmHandle = dcgmHandle
        self._groupId = groupId
        self._dcgmGroup = dcgmGroup

    def WatchFields(self, fieldGroup, updateFreq, maxKeepAge, maxKeepSamples):
        '''
        Tell DCGM to start recording samples for the given field group

        fieldGroup: DcgmFieldGroup() instance tracking the fields we want to watch.
        updateFreq: How often to update these fields in usec
        maxKeepAge: How long to keep data for these fields in seconds
        maxKeepSamples: Maximum number of samples to keep per field. 0=no limit

        Once the field collection is watched, it will update whenever the next update
        loop occurs. If you want to query these values immediately, use
        handle.UpdateAllFields(True) to make sure that the fields have updated at least once.
        '''
        ret = dcgm_agent.dcgmWatchFields(self._dcgmHandle.handle, self._groupId,
                                         fieldGroup.fieldGroupId, updateFreq,
                                         maxKeepAge, maxKeepSamples)
        dcgm_structs._dcgmCheckReturn(ret)

    def UnwatchFields(self, fieldGroup):
        '''
        Tell DCGM to stop recording samples for a given field group

        fieldGroup: DcgmFieldGroup() instance tracking the fields we want to unwatch.
        '''
        ret = dcgm_agent.dcgmUnwatchFields(self._dcgmHandle.handle,
                                           self._groupId,
                                           fieldGroup.fieldGroupId)
        dcgm_structs._dcgmCheckReturn(ret)

    def GetLatest(self, fieldGroup):
        '''
        Get the most recent values for each field in a field collection

        fieldGroup: DcgmFieldGroup() instance tracking the fields we want to watch.

        Returns DcgmFieldValueCollection object. Use its .values[gpuId][fieldId][0].value to access values
        '''
        dfvc = dcgm_field_helpers.DcgmFieldValueCollection(
            self._dcgmHandle.handle, self._groupId)
        dfvc.GetLatestValues(fieldGroup)
        return dfvc

    def GetLatest_v2(self, fieldGroup):
        '''
        Get the most recent values for each field in a field collection

        fieldGroup: DcgmFieldGroup() instance tracking the fields we want to watch.

        Returns DcgmFieldValueEntityCollection object. Use its .values[entityGroupId][entityId][fieldId][0].value to access values
        '''
        dfvec = dcgm_field_helpers.DcgmFieldValueEntityCollection(
            self._dcgmHandle.handle, self._groupId)
        dfvec.GetLatestValues(fieldGroup)
        return dfvec

    def GetAllSinceLastCall(self, dfvc, fieldGroup):
        '''
        Get the new values for each field in a field collection since the last
        collection.

        dfvc: DcgmFieldValueCollection() instance. Will return a
              DcgmFieldValueCollection with values since the one passed in.
              Pass None for the first call to get one for subsequent calls.
              On subsequent calls, pass what was returned.
        fieldGroup: DcgmFieldGroup() instance tracking the fields we want to watch.

        Returns DcgmFieldValueCollection object. Use its .values[gpuId][fieldId][*].value to access values
        '''
        if dfvc is None:
            dfvc = dcgm_field_helpers.DcgmFieldValueCollection(
                self._dcgmHandle.handle, self._groupId)
            dfvc.GetLatestValues(fieldGroup)
        else:
            # We used to expect at least one value (GetLatestValues), so this
            # ensures we provide one at the risk of repetition. This should not
            # happen if we call this function infrequently enough (slower than
            # the sampling rate).
            dfvc.GetAllSinceLastCall(fieldGroup)
            if len(dfvc.values) == 0:
                dfvc.GetLatestValues(fieldGroup)
        return dfvc

    def GetAllSinceLastCall_v2(self, dvfec, fieldGroup):
        '''
        Gets more values for each field in a field entity collection

        dvfec: DcgmFieldValueEntityCollection() instance. Will return a
               DcgmFieldValueEntityCollection with values since the one passed
               in. Pass None for the first call to get one for subsequent
               calls. On subsequent calls, pass what was returned.
               (The parameter keeps its historical spelling so the signature
               is unchanged.)

        fieldGroup: DcgmFieldGroup() instance tracking the fields we want to watch.

        Returns DcgmFieldValueEntityCollection object. Use its .values[entityGroupId][entityId][fieldId][*].value to access values
        '''
        # The body used to read an unbound local named "dfvec" while the
        # parameter is spelled "dvfec", so every call raised
        # UnboundLocalError. Bind the working name from the parameter.
        dfvec = dvfec
        if dfvec is None:
            dfvec = dcgm_field_helpers.DcgmFieldValueEntityCollection(
                self._dcgmHandle.handle, self._groupId)
            # NOTE(review): this call was spelled "GetLastestValues_v2"; it now
            # matches the fallback below. Confirm the collection class exposes
            # GetLatestValues_v2 (GetLatest_v2 above calls GetLatestValues).
            dfvec.GetLatestValues_v2(fieldGroup)
        else:
            dfvec.GetAllSinceLastCall_v2(fieldGroup)
            # We used to expect at least one value (GetLatestValues), so this
            # ensures we provide one at the risk of repetition. This should not
            # happen if we call this function infrequently enough (slower than
            # the sampling rate).
            if len(dfvec.values) == 0:
                dfvec.GetLatestValues_v2(fieldGroup)

        return dfvec

    def UpdateAllFields(self, waitForUpdate):
        '''
        Convenience alias for DcgmHandle.UpdateAllFields(). All fields on the system will be updated, not
        just this group's.
        '''
        self._dcgmHandle.UpdateAllFields(waitForUpdate)
224
+
225
+
226
class DcgmGroupHealth:

    def __init__(self, dcgmHandle, groupId, dcgmGroup):
        self._dcgmGroup = dcgmGroup
        self._groupId = groupId
        self._dcgmHandle = dcgmHandle

    def Set(self, systems, updateInterval=None, maxKeepAge=None):
        '''
        Turn on health checks for this group.

        systems: A bitmask of dcgm_structs.DCGM_HEALTH_WATCH_? definitions of which health checks to enable
        updateInterval: How often DCGM should request new health data from the driver in usec
        maxKeepAge: How long DCGM should keep health data around once it has been retrieved from the driver in seconds

        dcgmHealthSet_v2 is used only when both updateInterval and maxKeepAge
        are given; otherwise dcgmHealthSet is called and neither value is
        passed on.
        '''
        rawHandle = self._dcgmHandle.handle
        haveTiming = updateInterval is not None and maxKeepAge is not None
        if haveTiming:
            status = dcgm_agent.dcgmHealthSet_v2(rawHandle, self._groupId,
                                                 systems, updateInterval,
                                                 maxKeepAge)
        else:
            status = dcgm_agent.dcgmHealthSet(rawHandle, self._groupId,
                                              systems)
        dcgm_structs._dcgmCheckReturn(status)

    def Get(self):
        '''
        Retrieve the current state of the DCGM health check system

        Returns a bitmask of dcgm_structs.DCGM_HEALTH_WATCH_? definitions of which health checks are currently enabled
        '''
        return dcgm_agent.dcgmHealthGet(self._dcgmHandle.handle,
                                        self._groupId)

    def Check(self, version=dcgm_structs.dcgmHealthResponse_version4):
        '''
        Check the configured watches for any errors/failures/warnings that have occurred
        since the last time this check was invoked. On the first call, stateful information
        about all of the enabled watches within a group is created but no error results are
        provided. On subsequent calls, any error information will be returned.

        @param version IN: Allows the caller to use an older version of this request. Should be
                           dcgm_structs.dcgmHealthResponse_version4

        Returns a dcgm_structs.c_dcgmHealthResponse_* object that contains results for each GPU/entity
        '''
        return dcgm_agent.dcgmHealthCheck(self._dcgmHandle.handle,
                                          self._groupId, version)
278
+
279
+
280
class DcgmGroupPolicy:

    def __init__(self, dcgmHandle, groupId, dcgmGroup):
        self._dcgmGroup = dcgmGroup
        self._groupId = groupId
        self._dcgmHandle = dcgmHandle

    def Get(self, statusHandle=None):
        '''
        Get the current violation policy inside the policy manager. Given a groupId, a number of
        policy structures are retrieved.

        @param statusHandle IN/OUT: pydcgm.DcgmStatus for the resulting status of the operation. Pass it as None
                                    if the detailed error information for the operation is not needed (default).

        Returns a list of dcgm_structs.c_dcgmPolicy_v1 with the same length as the number of GPUs in the group.
        The index of an entry corresponds to a given GPU ID in the group. Throws an exception on error.
        '''
        # Unwrap the raw handle when a status object was supplied; a falsy
        # value is forwarded untouched.
        rawStatus = statusHandle.handle if statusHandle else statusHandle
        gpuCount = len(self._dcgmGroup.GetGpuIds())
        if gpuCount <= 0:
            raise pydcgm.DcgmException(
                "This group has no GPUs, cannot retrieve policies")
        return dcgm_agent.dcgmPolicyGet(self._dcgmHandle.handle, self._groupId,
                                        gpuCount, rawStatus)

    def Set(self, policy, statusHandle=None):
        '''
        Set the current violation policy inside the policy manager. Given the conditions within "policy",
        if a violation has occurred, subsequent action(s) may be performed to either
        report or contain the failure.

        This API is only supported on Tesla GPUs and will throw DCGMError_NotSupported if called on non-Tesla GPUs.

        @param policy IN: dcgm_structs.c_dcgmPolicy_v1 that will be applied to all GPUs in the group

        @param statusHandle IN/OUT: pydcgm.DcgmStatus for the resulting status for the operation. Pass it as
                                    None if the detailed error information for the operation is not needed (default).

        Returns Nothing. Throws an exception on error
        '''
        # Same unwrapping rule as Get(): only a truthy status object is
        # replaced by its raw handle.
        rawStatus = statusHandle.handle if statusHandle else statusHandle
        dcgm_agent.dcgmPolicySet(self._dcgmHandle.handle, self._groupId, policy,
                                 rawStatus)

    def Register(self, condition, beginCallback=None, finishCallback=None):
        '''
        Register a function to be called when a specific policy condition (see dcgm_structs.c_dcgmPolicy_v1.condition)
        has been violated. This callback(s) will be called automatically when in DCGM_OPERATION_MODE_AUTO mode and only after
        DcgmPolicy.Trigger when in DCGM_OPERATION_MODE_MANUAL mode.
        All callbacks are made within a separate thread.

        This API is only supported on Tesla GPUs and will throw DCGMError_NotSupported if called on non-Tesla GPUs.

        @param condition IN: The set of conditions specified as an OR'd list
                             (see dcgm_structs.DCGM_POLICY_COND_*)
                             for which to register a callback function

        @param beginCallback IN: A function that should be called should a violation occur. This
                                 function will be called prior to any actions specified by the policy are taken.

        @param finishCallback IN: A reference to a function that should be called should a violation occur.
                                  This function will be called after any action specified by the policy are completed.

        At least one callback must be provided that is not None.

        Returns Nothing. Throws an exception on error.
        '''
        suppliedCallbacks = (beginCallback, finishCallback)
        if all(callback is None for callback in suppliedCallbacks):
            raise pydcgm.DcgmException(
                "At least 1 callback must be provided to register that is not None"
            )
        dcgm_agent.dcgmPolicyRegister(self._dcgmHandle.handle, self._groupId,
                                      condition, beginCallback, finishCallback)

    def Unregister(self, condition):
        '''
        Unregister a function to be called for a specific policy condition (see dcgm_structs.c_dcgmPolicy_v1.condition) .
        This function will unregister all callbacks for a given condition.

        @param condition IN: The set of conditions specified as an OR'd list
                             (see dcgm_structs.DCGM_POLICY_COND_*)
                             for which to unregister a callback function

        Returns Nothing. Throws an exception on error.
        '''
        rawHandle = self._dcgmHandle.handle
        dcgm_agent.dcgmPolicyUnregister(rawHandle, self._groupId, condition)

    def Trigger(self):
        '''
        Inform the policy manager loop to perform an iteration and trigger the callbacks of any
        registered functions. Callback functions will be called from a separate thread as the calling function.

        Note: The GPU monitoring and management agent must call this method periodically if the operation
        mode is set to manual mode (DCGM_OPERATION_MODE_MANUAL) during initialization
        (see DcgmHandle.__init__).

        Returns Nothing. Throws an exception if there is a generic error that the
        policy manager was unable to perform another iteration.
        '''
        rawHandle = self._dcgmHandle.handle
        dcgm_agent.dcgmPolicyTrigger(rawHandle)
389
+
390
+
391
class DcgmGroupDiscovery:

    def __init__(self, dcgmHandle, groupId, dcgmGroup):
        self._dcgmGroup = dcgmGroup
        self._groupId = groupId
        self._dcgmHandle = dcgmHandle

    def GetTopology(self):
        '''
        Get the topology for this group

        Returns a c_dcgmGroupTopology_v1 object representing the topology for this group
        '''
        rawHandle = self._dcgmHandle.handle
        topology = dcgm_agent.dcgmGetGroupTopology(rawHandle, self._groupId)
        return topology
407
+
408
+
409
+ class DcgmGroupStats:
410
+
411
+ def __init__(self, dcgmHandle, groupId, dcgmGroup):
412
+ self._dcgmHandle = dcgmHandle
413
+ self._groupId = groupId
414
+ self._dcgmGroup = dcgmGroup
415
+
416
+ '''
417
+ Tell DCGM to start recording samples for fields returned from GetPidInfo()
418
+
419
+ updateFreq: How often to update these fields in usec
420
+ maxKeepAge: How long to keep data for these fields in seconds
421
+ maxKeepSamples: Maximum number of samples to keep per field. 0=no limit
422
+
423
+ Once the field collection is watched, it will update whenever the next update
424
+ loop occurs. If you want to query these values immediately, use
425
+ handle.UpdateAllFields(True) to make sure that the fields have updated at least once.
426
+ '''
427
+
428
+ def WatchPidFields(self, updateFreq, maxKeepAge, maxKeepSamples):
429
+ ret = dcgm_agent.dcgmWatchPidFields(self._dcgmHandle.handle,
430
+ self._groupId, updateFreq,
431
+ maxKeepAge, maxKeepSamples)
432
+ dcgm_structs._dcgmCheckReturn(ret)
433
+
434
+ '''
435
+ Get process stats for a given PID on this GPU group
436
+
437
+ You must call WatchPidFields() before this query for this method to return any results
438
+
439
+ Returns a dcgm_structs.c_dcgmPidInfo_v2 structure
440
+ '''
441
+
442
+ def GetPidInfo(self, pid):
443
+ return dcgm_agent.dcgmGetPidInfo(self._dcgmHandle.handle, self._groupId,
444
+ pid)
445
+
446
+ '''
447
+ Tell DCGM to start recording samples for fields returned from GetJobStats()
448
+
449
+ updateFreq: How often to update these fields in usec
450
+ maxKeepAge: How long to keep data for these fields in seconds
451
+ maxKeepSamples: Maximum number of samples to keep per field. 0=no limit
452
+
453
+ Once the fields are watched, they will update whenever the next update
454
+ loop occurs. If you want to query these values immediately, use
455
+ handle.UpdateAllFields(True) to make sure that the fields have updated at least once.
456
+ '''
457
+
458
+ def WatchJobFields(self, updateFreq, maxKeepAge, maxKeepSamples):
459
+ ret = dcgm_agent.dcgmWatchJobFields(self._dcgmHandle.handle,
460
+ self._groupId, updateFreq,
461
+ maxKeepAge, maxKeepSamples)
462
+ dcgm_structs._dcgmCheckReturn(ret)
463
+
464
+ '''
465
+ Start collecting stats for a named job for this GPU group
466
+
467
+ Calling this will tell DCGM to start tracking stats for the given jobId. Stats tracking
468
+ will end when StopJobStats() is called
469
+
470
+ You must call WatchJobFields() before this call to tell DCGM to start sampling the fields
471
+ that are returned from GetJobStats().
472
+
473
+ jobId is a unique string identifier for this job. An exception will be thrown if this is not unique
474
+
475
+ Returns Nothing (Will throw exception on error)
476
+ '''
477
+
478
+ def StartJobStats(self, jobId):
479
+ ret = dcgm_agent.dcgmJobStartStats(self._dcgmHandle.handle,
480
+ self._groupId, jobId)
481
+ dcgm_structs._dcgmCheckReturn(ret)
482
+
483
+ '''
484
+ Stop collecting stats for a named job
485
+
486
+ Calling this will tell DCGM to stop collecting stats for a job that was previously started
487
+ with StartJobStats().
488
+
489
+ jobId is the unique string that was passed as jobId to StartJobStats.
490
+
491
+ Returns Nothing (Will throw exception on error)
492
+ '''
493
+
494
def StopJobStats(self, jobId):
    """
    End stats collection for a job previously passed to StartJobStats().

    jobId is the string identifier that was given to StartJobStats().

    Returns Nothing. The status from dcgm_agent.dcgmJobStopStats() is
    handed to dcgm_structs._dcgmCheckReturn() for validation.
    """
    rawHandle = self._dcgmHandle.handle
    status = dcgm_agent.dcgmJobStopStats(rawHandle, jobId)
    dcgm_structs._dcgmCheckReturn(status)
497
+
498
+ '''
499
+ Get stats for a job that was started with StartJobStats. If StopJobStats has not been called yet,
500
+ this will get stats from when the job started until now. If StopJobStats() was called prior to
501
+ this, the returned Stats will go from when StartJobStats was called to when StopJobStats was called.
502
+
503
+ jobId is the unique string that was passed as jobId to StartJobStats and StopJobStats
504
+
505
+ Returns a dcgm_structs.c_dcgmJobInfo_v3 structure. Throws an exception on error
506
+ '''
507
+
508
def GetJobStats(self, jobId):
    """
    Fetch the stats recorded for a job.

    jobId is the string identifier that was given to StartJobStats().

    Returns the result of dcgm_agent.dcgmJobGetStats() unchanged.
    """
    rawHandle = self._dcgmHandle.handle
    return dcgm_agent.dcgmJobGetStats(rawHandle, jobId)
511
+
512
+ '''
513
+ This API tells DCGM to stop tracking the job given by jobId. After this call, you will no longer
514
+ be able to call GetJobStats() on this jobId. However, you will be able to reuse jobId after
515
+ this call.
516
+
517
+ jobId is the unique string that was passed as jobId to StartJobStats and StopJobStats
518
+
519
+ Returns Nothing (Will throw exception on error)
520
+ '''
521
+
522
def RemoveJob(self, jobId):
    """
    Tell DCGM to stop tracking the job identified by jobId.

    jobId is the string identifier that was given to StartJobStats().

    The result of dcgm_agent.dcgmJobRemove() is passed back to the
    caller unchanged.
    """
    rawHandle = self._dcgmHandle.handle
    return dcgm_agent.dcgmJobRemove(rawHandle, jobId)
525
+
526
+ '''
527
+ This API tells DCGM to stop tracking all jobs. After this call, you will no longer
528
+ be able to call dcgmJobGetStats() on any jobs until you call StartJobStats() again.
529
+ You will be able to reuse any previously-used jobIds after this call.
530
+
531
+ Returns Nothing (Will throw exception on error)
532
+ '''
533
+
534
def RemoveAllJobs(self):
    """
    Tell DCGM to stop tracking every job.

    The result of dcgm_agent.dcgmJobRemoveAll() is passed back to the
    caller unchanged.
    """
    rawHandle = self._dcgmHandle.handle
    return dcgm_agent.dcgmJobRemoveAll(rawHandle)
537
+
538
+
539
class DcgmGroupAction:
    """
    Namespace object exposing validation and diagnostic actions for one
    DCGM group.
    """

    def __init__(self, dcgmHandle, groupId, dcgmGroup):
        """
        dcgmHandle is the handle object whose .handle is passed to dcgm_agent
        groupId is the ID of the group the actions are run against
        dcgmGroup is the owning group object (stored; not used by the
        methods of this class)
        """
        self._dcgmHandle = dcgmHandle
        self._groupId = groupId
        self._dcgmGroup = dcgmGroup

    def Validate(self, validate):
        """
        Inform the action manager to perform a manual validation of a group
        of GPUs on the system

        validate is what sort of validation to do. See
        dcgm_structs.DCGM_POLICY_VALID_* defines.

        Returns a dcgm_structs.c_dcgmDiagResponse_v5 instance
        """
        runDiagInfo = dcgm_structs.c_dcgmRunDiag_v7()
        runDiagInfo.version = dcgm_structs.dcgmRunDiag_version7
        runDiagInfo.validate = validate
        runDiagInfo.groupId = self._groupId

        return dcgm_agent.dcgmActionValidate_v2(self._dcgmHandle.handle,
                                                runDiagInfo)

    def RunDiagnostic(self, diagLevel):
        """
        Run a diagnostic on this group of GPUs.

        diagLevel is the level of diagnostic desired. See
        dcgm_structs.DCGM_DIAG_LVL_* constants.

        Returns a dcgm_structs.c_dcgmDiagResponse_v5 instance
        """
        return dcgm_agent.dcgmRunDiagnostic(self._dcgmHandle.handle,
                                            self._groupId, diagLevel)

    def RunSpecificTest(self, testName):
        """
        Run a specific diagnostic test on this group of GPUs.

        testName is the name of the specific test that should be invoked.

        Returns a dcgm_structs.c_dcgmDiagResponse_v5 instance
        """
        runDiagInfo = dcgm_structs.c_dcgmRunDiag_v7()
        runDiagInfo.version = dcgm_structs.dcgmRunDiag_version7
        # Copy the name element by element into the first testNames slot.
        for index, element in enumerate(testName):
            runDiagInfo.testNames[0][index] = element
        runDiagInfo.groupId = self._groupId
        runDiagInfo.validate = dcgm_structs.DCGM_POLICY_VALID_NONE
        return dcgm_agent.dcgmActionValidate_v2(self._dcgmHandle.handle,
                                                runDiagInfo)
593
+
594
+
595
class DcgmGroupProfiling:
    """
    Namespace object exposing profiling queries for one DCGM group.
    """

    def __init__(self, dcgmHandle, groupId, dcgmGroup):
        """
        Remember the handle, group ID and owning group for later queries.

        Parameters
        ----------
        dcgmHandle : DcgmHandle
        groupId : int
        dcgmGroup : DcgmGroup
        """
        self._dcgmGroup = dcgmGroup
        self._groupId = groupId
        self._dcgmHandle = dcgmHandle

    def GetSupportedMetricGroups(self):
        """
        Get a list of the profiling metric groups available for this group
        of entities. Only the first GPU ID of the group is passed to DCGM.

        :return: dcgm_structs.c_dcgmProfGetMetricGroups_v3
        :throws: dcgm_structs.DCGMError on error
        """
        memberGpuIds = self._dcgmGroup.GetGpuIds()
        if not memberGpuIds:
            # A group without GPUs has nothing to query.
            raise dcgm_structs.DCGMError_ProfilingNotSupported

        firstGpuId = memberGpuIds[0]
        return dcgm_agent.dcgmProfGetSupportedMetricGroups(
            self._dcgmHandle.handle, firstGpuId)
624
+
625
+
626
+ class DcgmGroup:
627
+ '''
628
+ Constructor.
629
+
630
+ Either groupId OR groupName must be provided as a parameter.
631
+ This will set which GPU group this object is bound to
632
+
633
+ groupId=DCGM_GROUP_ALL_GPUS creates a group with all GPUs. Passing an existing groupId will
634
+ not create an additional group.
635
+ If groupName is provided, an empty group (No GPUs) of name groupName will be created. This group
636
+ will be destroyed when this object goes out of scope or is deleted with del().
637
+ groupType is the type of group to create. See dcgm_structs.DCGM_GROUP_? constants.
638
+ '''
639
+
640
def __init__(self,
             dcgmHandle,
             groupId=None,
             groupName=None,
             groupType=dcgm_structs.DCGM_GROUP_EMPTY):
    """
    Bind this object to a DCGM group and build its namespace helpers.

    dcgmHandle is the handle object whose .handle is passed to dcgm_agent
    groupId is an existing group ID to bind to; when given, no group is
    created
    groupName is the name of a new group to create when groupId is None
    groupType is the type passed to dcgm_agent.dcgmGroupCreate()

    Raises pydcgm.DcgmException if neither groupId nor groupName is given.
    """
    self._dcgmHandle = dcgmHandle

    if groupId is None:
        if groupName is None:
            raise pydcgm.DcgmException(
                "Either groupId or groupName is required")
        # No existing ID was supplied, so create a group with this name.
        groupId = dcgm_agent.dcgmGroupCreate(self._dcgmHandle.handle,
                                             groupType, groupName)
    self._groupId = groupId

    # Create namespace classes, each bound to the same handle and group.
    namespaceClasses = (
        ("config", DcgmGroupConfig),
        ("samples", DcgmGroupSamples),
        ("health", DcgmGroupHealth),
        ("policy", DcgmGroupPolicy),
        ("discovery", DcgmGroupDiscovery),
        ("stats", DcgmGroupStats),
        ("action", DcgmGroupAction),
        ("profiling", DcgmGroupProfiling),
    )
    for attrName, namespaceClass in namespaceClasses:
        setattr(self, attrName,
                namespaceClass(self._dcgmHandle, self._groupId, self))
668
+
669
+ '''
670
+ Remove this group from DCGM. This object will no longer be valid after this call.
671
+ '''
672
+
673
def Delete(self):
    """
    Remove this group from DCGM. This object will no longer be valid
    after this call.

    All namespace helpers are released, and the DCGM group is destroyed
    unless the group ID is None or one of the predefined (static) groups.

    Returns Nothing. The status from dcgm_agent.dcgmGroupDestroy() is
    handed to dcgm_structs._dcgmCheckReturn() for validation.
    """
    # Rebinding to None drops the helper reference; a separate `del`
    # first adds nothing and would fail if the attribute were missing.
    for namespaceAttr in ("config", "samples", "health", "policy",
                          "discovery", "stats", "action", "profiling"):
        setattr(self, namespaceAttr, None)

    # Delete the group we created if we're not using a predefined group
    if self._groupId is not None and not self._IsGroupIdStatic():
        ret = dcgm_agent.dcgmGroupDestroy(self._dcgmHandle.handle,
                                          self._groupId)
        dcgm_structs._dcgmCheckReturn(ret)

    self._groupId = None
698
+
699
+ '''
700
+ Private method to determine if our groupId is a predefined one
701
+ '''
702
+
703
def _IsGroupIdStatic(self):
    """
    Private method to determine if our groupId is a predefined one.

    Returns True when the group ID equals DCGM_GROUP_ALL_GPUS or
    DCGM_GROUP_ALL_NVSWITCHES, False otherwise.
    """
    currentId = self._groupId
    return bool(currentId == dcgm_structs.DCGM_GROUP_ALL_GPUS or
                currentId == dcgm_structs.DCGM_GROUP_ALL_NVSWITCHES)
708
+
709
+ '''
710
+ Add a GPU to this group
711
+
712
+ gpuId is the GPU ID to add to our group
713
+
714
+ Returns Nothing. Throws an exception on error
715
+ '''
716
+
717
def AddGpu(self, gpuId):
    """
    Add a GPU to this group.

    gpuId is the GPU ID to add to our group.

    Returns Nothing. Raises pydcgm.DcgmException for a static group; the
    status from dcgm_agent is checked by dcgm_structs._dcgmCheckReturn().
    """
    if self._IsGroupIdStatic():
        raise pydcgm.DcgmException("Can't add a GPU to a static group")

    rawHandle = self._dcgmHandle.handle
    status = dcgm_agent.dcgmGroupAddDevice(rawHandle, self._groupId, gpuId)
    dcgm_structs._dcgmCheckReturn(status)
724
+
725
+ '''
726
+ Add an entity to this group
727
+
728
+ entityGroupId is DCGM_FE_? constant of the entity group this entity belongs to
729
+ entityId is the entity to add to this group
730
+
731
+ Returns Nothing. Throws an exception on error
732
+ '''
733
+
734
def AddEntity(self, entityGroupId, entityId):
    """
    Add an entity to this group.

    entityGroupId is the DCGM_FE_? constant of the entity group this
    entity belongs to
    entityId is the entity to add to this group

    Returns Nothing. Raises pydcgm.DcgmException for a static group; the
    status from dcgm_agent is checked by dcgm_structs._dcgmCheckReturn().
    """
    if self._IsGroupIdStatic():
        raise pydcgm.DcgmException("Can't add an entity to a static group")

    rawHandle = self._dcgmHandle.handle
    status = dcgm_agent.dcgmGroupAddEntity(rawHandle, self._groupId,
                                           entityGroupId, entityId)
    dcgm_structs._dcgmCheckReturn(status)
742
+
743
+ '''
744
+ Remove a GPU from this group
745
+
746
+ gpuId is the GPU ID to remove from our group
747
+
748
+ Returns Nothing. Throws an exception on error
749
+ '''
750
+
751
def RemoveGpu(self, gpuId):
    """
    Remove a GPU from this group.

    gpuId is the GPU ID to remove from our group.

    Returns Nothing. Raises pydcgm.DcgmException for a static group; the
    status from dcgm_agent is checked by dcgm_structs._dcgmCheckReturn().
    """
    if self._IsGroupIdStatic():
        raise pydcgm.DcgmException("Can't remove a GPU from a static group")

    rawHandle = self._dcgmHandle.handle
    status = dcgm_agent.dcgmGroupRemoveDevice(rawHandle, self._groupId,
                                              gpuId)
    dcgm_structs._dcgmCheckReturn(status)
758
+
759
+ '''
760
+ Remove an entity from this group
761
+
762
+ entityGroupId is DCGM_FE_? constant of the entity group this entity belongs to
763
+ entityId is the entity to remove from this group
764
+
765
+ Returns Nothing. Throws an exception on error
766
+ '''
767
+
768
def RemoveEntity(self, entityGroupId, entityId):
    """
    Remove an entity from this group.

    entityGroupId is the DCGM_FE_? constant of the entity group this
    entity belongs to
    entityId is the entity to remove from this group

    Returns Nothing. Raises pydcgm.DcgmException for a static group; the
    status from dcgm_agent is checked by dcgm_structs._dcgmCheckReturn().
    """
    if self._IsGroupIdStatic():
        raise pydcgm.DcgmException(
            "Can't remove an entity from a static group")

    rawHandle = self._dcgmHandle.handle
    status = dcgm_agent.dcgmGroupRemoveEntity(rawHandle, self._groupId,
                                              entityGroupId, entityId)
    dcgm_structs._dcgmCheckReturn(status)
777
+
778
+ '''
779
+ Get an array of GPU ids that are part of this group
780
+
781
+ Note: this ignores non-GPU members of the group
782
+
783
+ Returns a list of GPU ids. Throws an exception on error
784
+ '''
785
+
786
def GetGpuIds(self):
    """
    Get the GPU ids that are part of this group.

    Note: members whose entityGroupId is not dcgm_fields.DCGM_FE_GPU are
    skipped.

    Returns a list of GPU ids, in the order they appear in the group info.
    """
    info = dcgm_agent.dcgmGroupGetInfo(self._dcgmHandle.handle,
                                       self._groupId)
    # Only the first info.count slots of entityList are consulted.
    members = (info.entityList[slot] for slot in range(info.count))
    return [
        member.entityId
        for member in members
        if member.entityGroupId == dcgm_fields.DCGM_FE_GPU
    ]
795
+
796
+ '''
797
+ Get an array of entities that are part of this group
798
+
799
+ Returns a list of c_dcgmGroupEntityPair_t structs. Throws an exception on error
800
+ '''
801
+
802
def GetEntities(self):
    """
    Get the entities that are part of this group.

    Returns the first `count` elements of the group info's entityList
    (c_dcgmGroupEntityPair_t structs).
    """
    info = dcgm_agent.dcgmGroupGetInfo(self._dcgmHandle.handle,
                                       self._groupId)
    memberCount = info.count
    return info.entityList[:memberCount]
807
+
808
+ '''
809
+ Get the groupId of this object
810
+
811
+ Returns our groupId
812
+ '''
813
+
814
def GetId(self):
    """
    Get the groupId of this object.

    Returns the value currently stored in self._groupId.
    """
    currentGroupId = self._groupId
    return currentGroupId