triton-model-analyzer 1.48.0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (204) hide show
  1. model_analyzer/__init__.py +15 -0
  2. model_analyzer/analyzer.py +448 -0
  3. model_analyzer/cli/__init__.py +15 -0
  4. model_analyzer/cli/cli.py +193 -0
  5. model_analyzer/config/__init__.py +15 -0
  6. model_analyzer/config/generate/__init__.py +15 -0
  7. model_analyzer/config/generate/automatic_model_config_generator.py +164 -0
  8. model_analyzer/config/generate/base_model_config_generator.py +352 -0
  9. model_analyzer/config/generate/brute_plus_binary_parameter_search_run_config_generator.py +164 -0
  10. model_analyzer/config/generate/brute_run_config_generator.py +154 -0
  11. model_analyzer/config/generate/concurrency_sweeper.py +75 -0
  12. model_analyzer/config/generate/config_generator_interface.py +52 -0
  13. model_analyzer/config/generate/coordinate.py +143 -0
  14. model_analyzer/config/generate/coordinate_data.py +86 -0
  15. model_analyzer/config/generate/generator_utils.py +116 -0
  16. model_analyzer/config/generate/manual_model_config_generator.py +187 -0
  17. model_analyzer/config/generate/model_config_generator_factory.py +92 -0
  18. model_analyzer/config/generate/model_profile_spec.py +74 -0
  19. model_analyzer/config/generate/model_run_config_generator.py +154 -0
  20. model_analyzer/config/generate/model_variant_name_manager.py +150 -0
  21. model_analyzer/config/generate/neighborhood.py +536 -0
  22. model_analyzer/config/generate/optuna_plus_concurrency_sweep_run_config_generator.py +141 -0
  23. model_analyzer/config/generate/optuna_run_config_generator.py +838 -0
  24. model_analyzer/config/generate/perf_analyzer_config_generator.py +312 -0
  25. model_analyzer/config/generate/quick_plus_concurrency_sweep_run_config_generator.py +130 -0
  26. model_analyzer/config/generate/quick_run_config_generator.py +753 -0
  27. model_analyzer/config/generate/run_config_generator_factory.py +329 -0
  28. model_analyzer/config/generate/search_config.py +112 -0
  29. model_analyzer/config/generate/search_dimension.py +73 -0
  30. model_analyzer/config/generate/search_dimensions.py +85 -0
  31. model_analyzer/config/generate/search_parameter.py +49 -0
  32. model_analyzer/config/generate/search_parameters.py +388 -0
  33. model_analyzer/config/input/__init__.py +15 -0
  34. model_analyzer/config/input/config_command.py +483 -0
  35. model_analyzer/config/input/config_command_profile.py +1747 -0
  36. model_analyzer/config/input/config_command_report.py +267 -0
  37. model_analyzer/config/input/config_defaults.py +236 -0
  38. model_analyzer/config/input/config_enum.py +83 -0
  39. model_analyzer/config/input/config_field.py +216 -0
  40. model_analyzer/config/input/config_list_generic.py +112 -0
  41. model_analyzer/config/input/config_list_numeric.py +151 -0
  42. model_analyzer/config/input/config_list_string.py +111 -0
  43. model_analyzer/config/input/config_none.py +71 -0
  44. model_analyzer/config/input/config_object.py +129 -0
  45. model_analyzer/config/input/config_primitive.py +81 -0
  46. model_analyzer/config/input/config_status.py +75 -0
  47. model_analyzer/config/input/config_sweep.py +83 -0
  48. model_analyzer/config/input/config_union.py +113 -0
  49. model_analyzer/config/input/config_utils.py +128 -0
  50. model_analyzer/config/input/config_value.py +243 -0
  51. model_analyzer/config/input/objects/__init__.py +15 -0
  52. model_analyzer/config/input/objects/config_model_profile_spec.py +325 -0
  53. model_analyzer/config/input/objects/config_model_report_spec.py +173 -0
  54. model_analyzer/config/input/objects/config_plot.py +198 -0
  55. model_analyzer/config/input/objects/config_protobuf_utils.py +101 -0
  56. model_analyzer/config/input/yaml_config_validator.py +82 -0
  57. model_analyzer/config/run/__init__.py +15 -0
  58. model_analyzer/config/run/model_run_config.py +313 -0
  59. model_analyzer/config/run/run_config.py +168 -0
  60. model_analyzer/constants.py +76 -0
  61. model_analyzer/device/__init__.py +15 -0
  62. model_analyzer/device/device.py +24 -0
  63. model_analyzer/device/gpu_device.py +87 -0
  64. model_analyzer/device/gpu_device_factory.py +248 -0
  65. model_analyzer/entrypoint.py +307 -0
  66. model_analyzer/log_formatter.py +65 -0
  67. model_analyzer/model_analyzer_exceptions.py +24 -0
  68. model_analyzer/model_manager.py +255 -0
  69. model_analyzer/monitor/__init__.py +15 -0
  70. model_analyzer/monitor/cpu_monitor.py +69 -0
  71. model_analyzer/monitor/dcgm/DcgmDiag.py +191 -0
  72. model_analyzer/monitor/dcgm/DcgmFieldGroup.py +83 -0
  73. model_analyzer/monitor/dcgm/DcgmGroup.py +815 -0
  74. model_analyzer/monitor/dcgm/DcgmHandle.py +141 -0
  75. model_analyzer/monitor/dcgm/DcgmJsonReader.py +69 -0
  76. model_analyzer/monitor/dcgm/DcgmReader.py +623 -0
  77. model_analyzer/monitor/dcgm/DcgmStatus.py +57 -0
  78. model_analyzer/monitor/dcgm/DcgmSystem.py +412 -0
  79. model_analyzer/monitor/dcgm/__init__.py +15 -0
  80. model_analyzer/monitor/dcgm/common/__init__.py +13 -0
  81. model_analyzer/monitor/dcgm/common/dcgm_client_cli_parser.py +194 -0
  82. model_analyzer/monitor/dcgm/common/dcgm_client_main.py +86 -0
  83. model_analyzer/monitor/dcgm/dcgm_agent.py +887 -0
  84. model_analyzer/monitor/dcgm/dcgm_collectd_plugin.py +369 -0
  85. model_analyzer/monitor/dcgm/dcgm_errors.py +395 -0
  86. model_analyzer/monitor/dcgm/dcgm_field_helpers.py +546 -0
  87. model_analyzer/monitor/dcgm/dcgm_fields.py +815 -0
  88. model_analyzer/monitor/dcgm/dcgm_fields_collectd.py +671 -0
  89. model_analyzer/monitor/dcgm/dcgm_fields_internal.py +29 -0
  90. model_analyzer/monitor/dcgm/dcgm_fluentd.py +45 -0
  91. model_analyzer/monitor/dcgm/dcgm_monitor.py +138 -0
  92. model_analyzer/monitor/dcgm/dcgm_prometheus.py +326 -0
  93. model_analyzer/monitor/dcgm/dcgm_structs.py +2357 -0
  94. model_analyzer/monitor/dcgm/dcgm_telegraf.py +65 -0
  95. model_analyzer/monitor/dcgm/dcgm_value.py +151 -0
  96. model_analyzer/monitor/dcgm/dcgmvalue.py +155 -0
  97. model_analyzer/monitor/dcgm/denylist_recommendations.py +573 -0
  98. model_analyzer/monitor/dcgm/pydcgm.py +47 -0
  99. model_analyzer/monitor/monitor.py +143 -0
  100. model_analyzer/monitor/remote_monitor.py +137 -0
  101. model_analyzer/output/__init__.py +15 -0
  102. model_analyzer/output/file_writer.py +63 -0
  103. model_analyzer/output/output_writer.py +42 -0
  104. model_analyzer/perf_analyzer/__init__.py +15 -0
  105. model_analyzer/perf_analyzer/genai_perf_config.py +206 -0
  106. model_analyzer/perf_analyzer/perf_analyzer.py +882 -0
  107. model_analyzer/perf_analyzer/perf_config.py +479 -0
  108. model_analyzer/plots/__init__.py +15 -0
  109. model_analyzer/plots/detailed_plot.py +266 -0
  110. model_analyzer/plots/plot_manager.py +224 -0
  111. model_analyzer/plots/simple_plot.py +213 -0
  112. model_analyzer/record/__init__.py +15 -0
  113. model_analyzer/record/gpu_record.py +68 -0
  114. model_analyzer/record/metrics_manager.py +887 -0
  115. model_analyzer/record/record.py +280 -0
  116. model_analyzer/record/record_aggregator.py +256 -0
  117. model_analyzer/record/types/__init__.py +15 -0
  118. model_analyzer/record/types/cpu_available_ram.py +93 -0
  119. model_analyzer/record/types/cpu_used_ram.py +93 -0
  120. model_analyzer/record/types/gpu_free_memory.py +96 -0
  121. model_analyzer/record/types/gpu_power_usage.py +107 -0
  122. model_analyzer/record/types/gpu_total_memory.py +96 -0
  123. model_analyzer/record/types/gpu_used_memory.py +96 -0
  124. model_analyzer/record/types/gpu_utilization.py +108 -0
  125. model_analyzer/record/types/inter_token_latency_avg.py +60 -0
  126. model_analyzer/record/types/inter_token_latency_base.py +74 -0
  127. model_analyzer/record/types/inter_token_latency_max.py +60 -0
  128. model_analyzer/record/types/inter_token_latency_min.py +60 -0
  129. model_analyzer/record/types/inter_token_latency_p25.py +60 -0
  130. model_analyzer/record/types/inter_token_latency_p50.py +60 -0
  131. model_analyzer/record/types/inter_token_latency_p75.py +60 -0
  132. model_analyzer/record/types/inter_token_latency_p90.py +60 -0
  133. model_analyzer/record/types/inter_token_latency_p95.py +60 -0
  134. model_analyzer/record/types/inter_token_latency_p99.py +60 -0
  135. model_analyzer/record/types/output_token_throughput.py +105 -0
  136. model_analyzer/record/types/perf_client_response_wait.py +97 -0
  137. model_analyzer/record/types/perf_client_send_recv.py +97 -0
  138. model_analyzer/record/types/perf_latency.py +111 -0
  139. model_analyzer/record/types/perf_latency_avg.py +60 -0
  140. model_analyzer/record/types/perf_latency_base.py +74 -0
  141. model_analyzer/record/types/perf_latency_p90.py +60 -0
  142. model_analyzer/record/types/perf_latency_p95.py +60 -0
  143. model_analyzer/record/types/perf_latency_p99.py +60 -0
  144. model_analyzer/record/types/perf_server_compute_infer.py +97 -0
  145. model_analyzer/record/types/perf_server_compute_input.py +97 -0
  146. model_analyzer/record/types/perf_server_compute_output.py +97 -0
  147. model_analyzer/record/types/perf_server_queue.py +97 -0
  148. model_analyzer/record/types/perf_throughput.py +105 -0
  149. model_analyzer/record/types/time_to_first_token_avg.py +60 -0
  150. model_analyzer/record/types/time_to_first_token_base.py +74 -0
  151. model_analyzer/record/types/time_to_first_token_max.py +60 -0
  152. model_analyzer/record/types/time_to_first_token_min.py +60 -0
  153. model_analyzer/record/types/time_to_first_token_p25.py +60 -0
  154. model_analyzer/record/types/time_to_first_token_p50.py +60 -0
  155. model_analyzer/record/types/time_to_first_token_p75.py +60 -0
  156. model_analyzer/record/types/time_to_first_token_p90.py +60 -0
  157. model_analyzer/record/types/time_to_first_token_p95.py +60 -0
  158. model_analyzer/record/types/time_to_first_token_p99.py +60 -0
  159. model_analyzer/reports/__init__.py +15 -0
  160. model_analyzer/reports/html_report.py +195 -0
  161. model_analyzer/reports/pdf_report.py +50 -0
  162. model_analyzer/reports/report.py +86 -0
  163. model_analyzer/reports/report_factory.py +62 -0
  164. model_analyzer/reports/report_manager.py +1376 -0
  165. model_analyzer/reports/report_utils.py +42 -0
  166. model_analyzer/result/__init__.py +15 -0
  167. model_analyzer/result/constraint_manager.py +150 -0
  168. model_analyzer/result/model_config_measurement.py +354 -0
  169. model_analyzer/result/model_constraints.py +105 -0
  170. model_analyzer/result/parameter_search.py +246 -0
  171. model_analyzer/result/result_manager.py +430 -0
  172. model_analyzer/result/result_statistics.py +159 -0
  173. model_analyzer/result/result_table.py +217 -0
  174. model_analyzer/result/result_table_manager.py +646 -0
  175. model_analyzer/result/result_utils.py +42 -0
  176. model_analyzer/result/results.py +277 -0
  177. model_analyzer/result/run_config_measurement.py +658 -0
  178. model_analyzer/result/run_config_result.py +210 -0
  179. model_analyzer/result/run_config_result_comparator.py +110 -0
  180. model_analyzer/result/sorted_results.py +151 -0
  181. model_analyzer/state/__init__.py +15 -0
  182. model_analyzer/state/analyzer_state.py +76 -0
  183. model_analyzer/state/analyzer_state_manager.py +215 -0
  184. model_analyzer/triton/__init__.py +15 -0
  185. model_analyzer/triton/client/__init__.py +15 -0
  186. model_analyzer/triton/client/client.py +234 -0
  187. model_analyzer/triton/client/client_factory.py +57 -0
  188. model_analyzer/triton/client/grpc_client.py +104 -0
  189. model_analyzer/triton/client/http_client.py +107 -0
  190. model_analyzer/triton/model/__init__.py +15 -0
  191. model_analyzer/triton/model/model_config.py +556 -0
  192. model_analyzer/triton/model/model_config_variant.py +29 -0
  193. model_analyzer/triton/server/__init__.py +15 -0
  194. model_analyzer/triton/server/server.py +76 -0
  195. model_analyzer/triton/server/server_config.py +269 -0
  196. model_analyzer/triton/server/server_docker.py +229 -0
  197. model_analyzer/triton/server/server_factory.py +306 -0
  198. model_analyzer/triton/server/server_local.py +158 -0
  199. triton_model_analyzer-1.48.0.dist-info/METADATA +52 -0
  200. triton_model_analyzer-1.48.0.dist-info/RECORD +204 -0
  201. triton_model_analyzer-1.48.0.dist-info/WHEEL +5 -0
  202. triton_model_analyzer-1.48.0.dist-info/entry_points.txt +2 -0
  203. triton_model_analyzer-1.48.0.dist-info/licenses/LICENSE +67 -0
  204. triton_model_analyzer-1.48.0.dist-info/top_level.txt +1 -0
@@ -0,0 +1,546 @@
1
+ # Copyright (c) 2022, NVIDIA CORPORATION & AFFILIATES. All rights reserved.
2
+ #
3
+ # Licensed under the Apache License, Version 2.0 (the "License");
4
+ # you may not use this file except in compliance with the License.
5
+ # You may obtain a copy of the License at
6
+ #
7
+ # http://www.apache.org/licenses/LICENSE-2.0
8
+ #
9
+ # Unless required by applicable law or agreed to in writing, software
10
+ # distributed under the License is distributed on an "AS IS" BASIS,
11
+ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12
+ # See the License for the specific language governing permissions and
13
+ # limitations under the License.
14
+
15
+ import time
16
+ import model_analyzer.monitor.dcgm.dcgm_fields as dcgm_fields
17
+ import model_analyzer.monitor.dcgm.dcgm_fields_internal as dcgm_fields_internal
18
+ import model_analyzer.monitor.dcgm.dcgm_structs as dcgm_structs
19
+ import model_analyzer.monitor.dcgm.dcgm_agent as dcgm_agent
20
+ import ctypes
21
+ import model_analyzer.monitor.dcgm.dcgmvalue as dcgmvalue
22
+ import model_analyzer.monitor.dcgm.pydcgm as pydcgm
23
+ import json
24
+ '''
25
+ Helper class that makes a python-friendly field value from one returned from the python bindings
26
+ '''
27
+
28
+
29
class DcgmFieldValue():
    '''
    Python-friendly view of a single raw DCGM field value.

    Attributes set by the constructor:
    ts        - copied from rawValue.ts
    fieldId   - copied from rawValue.fieldId
    fieldType - rawValue.fieldType converted to a one-character string
    isBlank   - True if rawValue.status is not DCGM_ST_OK or the converted value
                matches the DCGM blank check for its type
    value     - the converted value, or None if there is nothing to expose
    '''

    def __init__(self, rawValue):
        '''
        Constructor

        rawValue is the latest dcgm_structs.c_dcgmFieldValue_v? structure of a field value returned from the raw APIs

        Raises Exception if rawValue has an unexpected type, if its field type is
        not handled, or if it is a binary field whose fieldId is not handled.
        '''
        #Make sure the class passed in is an expected type
        if not isinstance(rawValue, dcgm_structs.c_dcgmFieldValue_v1):
            raise Exception("Unexpected rawValue type %s" % str(type(rawValue)))

        self.ts = rawValue.ts
        self.fieldId = rawValue.fieldId
        self.fieldType = chr(rawValue.fieldType)
        self.isBlank = False
        self.value = None

        #A failed read carries no usable payload, so report it as blank
        if rawValue.status != dcgm_structs.DCGM_ST_OK:
            self.isBlank = True
            return

        if self.fieldType == dcgm_fields.DCGM_FT_DOUBLE:
            self.value = float(rawValue.value.dbl)
            self.isBlank = dcgmvalue.DCGM_FP64_IS_BLANK(self.value)
        elif self.fieldType in (dcgm_fields.DCGM_FT_INT64,
                                dcgm_fields.DCGM_FT_TIMESTAMP):
            self.value = int(rawValue.value.i64)
            self.isBlank = dcgmvalue.DCGM_INT64_IS_BLANK(self.value)
        elif self.fieldType == dcgm_fields.DCGM_FT_STRING:
            self.value = str(rawValue.value.str)
            self.isBlank = dcgmvalue.DCGM_STR_IS_BLANK(self.value)
        elif self.fieldType == dcgm_fields.DCGM_FT_BINARY:
            self._SetBinaryValue(rawValue)
        else:
            raise Exception("Unhandled fieldType: %s" % self.fieldType)

    def _SetBinaryValue(self, rawValue):
        '''
        Decode the blob payload of a binary field into self.value based on self.fieldId
        '''
        if self.fieldId == dcgm_fields.DCGM_FI_DEV_ACCOUNTING_DATA:
            accStats = dcgm_structs.c_dcgmDevicePidAccountingStats_v1()
            ctypes.memmove(ctypes.addressof(accStats), rawValue.value.blob,
                           accStats.FieldsSizeof())
            #Previously the decoded structure was discarded and this fieldId fell
            #through to the "not handled" error below; keep the decoded value instead
            self.value = accStats
        elif self.fieldId in (dcgm_fields_internal.DCGM_FI_DEV_COMPUTE_PIDS,
                              dcgm_fields_internal.DCGM_FI_DEV_GRAPHICS_PIDS):
            processStats = dcgm_structs.c_dcgmRunningProcess_t()
            ctypes.memmove(ctypes.addressof(processStats),
                           rawValue.value.blob, processStats.FieldsSizeof())
            self.value = processStats
            # This should always be false
            self.isBlank = dcgmvalue.DCGM_INT64_IS_BLANK(processStats.pid)
        elif self.fieldId == dcgm_fields.DCGM_FI_SYNC_BOOST:
            #Not exposed publicly for now
            self.value = None
        else:
            raise Exception("Blobs not handled yet for fieldId %d" %
                            self.fieldId)
84
+
85
+
86
class DcgmFieldValueTimeSeries:
    '''
    List-like container of field values kept in ascending .ts order
    '''

    def __init__(self):
        #Values in timestamp order
        self.values = []

    def __len__(self):
        return len(self.values)

    def __getitem__(self, key):
        return self.values[key]

    def InsertValue(self, value):
        '''
        Add value to the series, keeping the series ordered by .ts.
        A value whose .ts equals an existing one is placed after it.
        '''
        series = self.values

        #Common case: the new value is the newest one, so it goes on the end
        if not series or series[-1].ts <= value.ts:
            series.append(value)
            return

        #Otherwise place it in front of the first entry that is newer than it
        for position, current in enumerate(series):
            if current.ts > value.ts:
                series.insert(position, value)
                return

        raise Exception("Unexpected no place to insert ts %d" % value.ts)
109
+
110
+
111
class FieldValueEncoder(json.JSONEncoder):
    '''
    JSON encoder that serializes an iterable of DcgmFieldValue objects as a list of
    {'Timestamp', 'FieldId', 'Value'} dictionaries, skipping blank values
    '''

    # Pylint does not like overriding the default method, so the comment below is WAR for the linting problem
    def default(self, obj):  # pylint: disable=E0202
        '''
        Called by the JSON machinery for objects it cannot serialize natively.

        Returns a list with one dictionary per non-blank DcgmFieldValue in obj.
        Raises TypeError (from the base encoder) if obj is not iterable or
        contains anything other than DcgmFieldValue instances.
        '''
        try:
            entries = iter(obj)
        except TypeError:
            # Not iterable: let the default encoder throw its standard exception
            return super().default(obj)

        nested_json = []
        for entry in entries:
            if not isinstance(entry, DcgmFieldValue):
                # Let default encoder throw exception
                return super().default(obj)
            if entry.isBlank:
                continue
            nested_json.append({
                'Timestamp': entry.ts,
                'FieldId': entry.fieldId,
                'Value': entry.value
            })
        return nested_json
129
+
130
+
131
def py_helper_dcgm_field_values_since_callback(gpuId, values, numValues,
                                               userData):
    '''
    Per-GPU field value callback.

    userData is cast back to the Python object it refers to, and the first
    numValues entries of values are handed to that object's _ProcessValues().
    Always returns 0.
    '''
    collection = ctypes.cast(userData, ctypes.py_object).value
    batch = values[0:numValues]
    collection._ProcessValues(gpuId, batch)
    return 0
137
+
138
+
139
#Wrapped form of the per-GPU callback, as passed to the dcgm_agent value-retrieval calls
helper_dcgm_field_values_since_callback = (
    dcgm_agent.dcgmFieldValueEnumeration_f(
        py_helper_dcgm_field_values_since_callback))
141
+
142
+
143
def py_helper_dcgm_field_values_since_callback_v2(entityGroupId, entityId,
                                                  values, numValues, userData):
    '''
    Per-entity field value callback.

    userData is cast back to the Python object it refers to, and the first
    numValues entries of values are handed to that object's _ProcessValuesV2().
    Always returns 0.
    '''
    collection = ctypes.cast(userData, ctypes.py_object).value
    batch = values[0:numValues]
    collection._ProcessValuesV2(entityGroupId, entityId, batch)
    return 0
148
+
149
+
150
#Wrapped form of the per-entity (_ProcessValuesV2) callback, as passed to the dcgm_agent *_v2 calls
helper_dcgm_field_values_since_callback_v2 = (
    dcgm_agent.dcgmFieldValueEntityEnumeration_f(
        py_helper_dcgm_field_values_since_callback_v2))
152
+ '''
153
+ Helper class for handling field value update callbacks and storing them in a .values member variable
154
+ '''
155
+
156
+
157
class DcgmFieldValueCollection:
    '''
    Helper class for handling field value update callbacks and storing them in the
    .values (per GPU) and .entityValues (per entity) member variables
    '''

    def __init__(self, handle, groupId):
        #2D dictionary of [gpuId][fieldId](DcgmFieldValueTimeSeries)
        self.values = {}
        #3D dictionary of [entityGroupId][entityId][fieldId](DcgmFieldValueTimeSeries)
        self.entityValues = {}
        self._handle = handle
        self._groupId = groupId
        self._numValuesSeen = 0
        self._nextSinceTimestamp = 0

    def _ProcessValues(self, gpuId, values):
        '''
        Helper function called by the callback of dcgm_agent.dcgmGetValuesSince to process individual field values
        '''
        self._numValuesSeen += len(values)

        gpuFields = self.values.setdefault(gpuId, {})

        for rawValue in values:
            #Convert to python-friendly value
            value = DcgmFieldValue(rawValue)

            if value.fieldId not in gpuFields:
                gpuFields[value.fieldId] = DcgmFieldValueTimeSeries()

            gpuFields[value.fieldId].InsertValue(value)

    def _ProcessValuesV2(self, entityGroupId, entityId, values):
        '''
        Helper function called by the callback py_helper_dcgm_field_values_since_callback_v2 to process individual field values
        '''
        self._numValuesSeen += len(values)

        entityFields = self.entityValues.setdefault(entityGroupId,
                                                    {}).setdefault(entityId, {})

        for rawValue in values:
            #Convert to python-friendly value
            value = DcgmFieldValue(rawValue)

            if value.fieldId not in entityFields:
                entityFields[value.fieldId] = DcgmFieldValueTimeSeries()

            entityFields[value.fieldId].InsertValue(value)

    def GetLatestValues(self, fieldGroup):
        '''
        Get the latest values for a fieldGroup and store them to the .values member variable

        Note: This class does not automatically watch fieldGroup. You must do that ahead of time with dcgmGroup.samples.WatchFields()
        '''
        ret = dcgm_agent.dcgmGetLatestValues(
            self._handle, self._groupId, fieldGroup.fieldGroupId,
            helper_dcgm_field_values_since_callback, self)
        #Will throw exception on error
        dcgm_structs._dcgmCheckReturn(ret)

    def GetAllSinceLastCall(self, fieldGroup):
        '''
        Method to cause more field values to be retrieved from DCGM. Returns the
        number of field values that were retrieved.
        '''
        beforeCount = self._numValuesSeen
        self._nextSinceTimestamp = dcgm_agent.dcgmGetValuesSince(
            self._handle, self._groupId, fieldGroup.fieldGroupId,
            self._nextSinceTimestamp, helper_dcgm_field_values_since_callback,
            self)
        afterCount = self._numValuesSeen
        return afterCount - beforeCount

    def GetLatestValues_v2(self, fieldGroup):
        '''
        Get the latest values for a fieldGroup and store them to the .entityValues member variable
        '''
        ret = dcgm_agent.dcgmGetLatestValues_v2(
            self._handle, self._groupId, fieldGroup.fieldGroupId,
            helper_dcgm_field_values_since_callback_v2, self)
        #Will throw exception on error
        dcgm_structs._dcgmCheckReturn(ret)

    def GetAllSinceLastCall_v2(self, fieldGroup):
        '''
        Method to cause more field values to be retrieved from DCGM. Returns the number of field values that were retrieved
        '''
        beforeCount = self._numValuesSeen
        #Use the _v2 callback: it dispatches to _ProcessValuesV2(). The entity
        #callback calls _ProcessValues() with three arguments, which does not
        #match this class's two-argument _ProcessValues(gpuId, values).
        self._nextSinceTimestamp = dcgm_agent.dcgmGetValuesSince_v2(
            self._handle, self._groupId, fieldGroup.fieldGroupId,
            self._nextSinceTimestamp,
            helper_dcgm_field_values_since_callback_v2, self)
        afterCount = self._numValuesSeen
        return afterCount - beforeCount

    def EmptyValues(self):
        '''
        Empty .values{} and .entityValues{} so that old data is no longer present in this structure.
        This can be used to prevent them from growing over time
        '''
        self.values = {}
        self.entityValues = {}
        self._numValuesSeen = 0
267
+
268
+
269
+ '''
270
+ Helper class for watching a field group and storing fields values returned from it
271
+ '''
272
+
273
+
274
class DcgmFieldGroupWatcher(DcgmFieldValueCollection):
    '''
    Watches a field group and stores the field values returned from it

    Constructor arguments:

    handle is a DCGM handle from dcgm_agent.dcgmInit()
    groupId is a valid DCGM group ID returned from dcgm_agent.dcgmGroupCreate
    fieldGroup is the DcgmFieldGroup() instance to watch fields for
    operationMode is a dcgm_structs.DCGM_OPERATION_MODE_? constant for if the host engine is running in lock step or auto mode
    updateFreq is how often to update each field in usec
    maxKeepAge is how long DCGM should keep values for in seconds
    maxKeepSamples is the maximum number of samples DCGM should ever cache for each field
    startTimestamp is a base timestamp we should start from when first reading values. This can be used to resume a
                   previous instance of a DcgmFieldGroupWatcher by using its _nextSinceTimestamp.
                   0=start with all cached data
    '''

    def __init__(self, handle, groupId, fieldGroup, operationMode, updateFreq,
                 maxKeepAge, maxKeepSamples, startTimestamp):
        DcgmFieldValueCollection.__init__(self, handle, groupId)

        self._fieldGroup = fieldGroup
        self._operationMode = operationMode
        self._updateFreq = updateFreq
        self._maxKeepAge = maxKeepAge
        self._maxKeepSamples = maxKeepSamples

        #Only a positive startTimestamp is honored; anything else means start from beginning of time
        self._nextSinceTimestamp = startTimestamp if startTimestamp > 0 else 0

        #Start watches
        self._WatchFieldGroup()

    def _WatchFieldGroup(self):
        '''
        Initiate the host engine watch on the fields
        '''
        watchStatus = dcgm_agent.dcgmWatchFields(self._handle, self._groupId,
                                                 self._fieldGroup.fieldGroupId,
                                                 self._updateFreq,
                                                 self._maxKeepAge,
                                                 self._maxKeepSamples)
        #Will throw exception on error
        dcgm_structs._dcgmCheckReturn(watchStatus)

        # Force an update of the fields so that we can fetch initial values.
        updateStatus = dcgm_agent.dcgmUpdateAllFields(self._handle, 1)
        #Will throw exception on error
        dcgm_structs._dcgmCheckReturn(updateStatus)

        # Initial update will fetch from startTimestamp.
        self.GetAllSinceLastCall()

    def GetAllSinceLastCall(self):
        '''
        Method to cause more field values to be retrieved from DCGM. Returns the
        number of field values that were retrieved
        '''
        inManualMode = (
            self._operationMode == dcgm_structs.DCGM_OPERATION_MODE_MANUAL)
        if inManualMode:
            #Manual mode does not refresh on its own, so force an update first
            #(_dcgmCheckReturn will throw exception on error)
            dcgm_structs._dcgmCheckReturn(
                dcgm_agent.dcgmUpdateAllFields(self._handle, 1))

        return super().GetAllSinceLastCall(self._fieldGroup)
336
+
337
+
338
def py_helper_dcgm_field_values_since_entity_callback(entityGroupId, entityId,
                                                      values, numValues,
                                                      userData):
    '''
    Per-entity field value callback for entity collections.

    userData is cast back to the Python object it refers to, and the first
    numValues entries of values are handed to that object's three-argument
    _ProcessValues(). Always returns 0.
    '''
    collection = ctypes.cast(userData, ctypes.py_object).value
    batch = values[0:numValues]
    collection._ProcessValues(entityGroupId, entityId, batch)
    return 0
345
+
346
+
347
#Wrapped form of the entity callback, as passed to the dcgm_agent *_v2 calls by the entity collection
helper_dcgm_field_values_since_entity_callback = (
    dcgm_agent.dcgmFieldValueEntityEnumeration_f(
        py_helper_dcgm_field_values_since_entity_callback))
349
+ '''
350
+ Helper class for handling field value update callbacks and storing them in a .values member variable
351
+ '''
352
+
353
+
354
class DcgmFieldValueEntityCollection:
    '''
    Helper class for handling per-entity field value update callbacks and storing them in a .values member variable
    '''

    def __init__(self, handle, groupId):
        #3D dictionary of [entityGroupId][entityId][fieldId](DcgmFieldValueTimeSeries)
        self.values = {}
        self._handle = handle
        self._groupId = groupId
        self._numValuesSeen = 0
        self._nextSinceTimestamp = 0

    def _ProcessValues(self, entityGroupId, entityId, values):
        '''
        Helper function called by the entity callback to process individual field values
        '''
        self._numValuesSeen += len(values)

        #Create the [entityGroupId][entityId] levels on first sight
        perEntity = self.values.setdefault(entityGroupId, {})
        perField = perEntity.setdefault(entityId, {})

        for rawValue in values:
            #Convert to python-friendly value
            converted = DcgmFieldValue(rawValue)
            fieldId = converted.fieldId

            if fieldId not in perField:
                perField[fieldId] = DcgmFieldValueTimeSeries()

            perField[fieldId].InsertValue(converted)

    def GetLatestValues(self, fieldGroup):
        '''
        Get the latest values for a fieldGroup and store them to the .values member variable

        Note: This class does not automatically watch fieldGroup. You must do that ahead of time with dcgmGroup.samples.WatchFields()
        '''
        status = dcgm_agent.dcgmGetLatestValues_v2(
            self._handle, self._groupId, fieldGroup.fieldGroupId,
            helper_dcgm_field_values_since_entity_callback, self)
        #Will throw exception on error
        dcgm_structs._dcgmCheckReturn(status)

    def GetAllSinceLastCall(self, fieldGroup):
        '''
        Method to cause more field values to be retrieved from DCGM. Returns the
        number of field values that were retrieved.
        '''
        seenBefore = self._numValuesSeen
        self._nextSinceTimestamp = dcgm_agent.dcgmGetValuesSince_v2(
            self._handle, self._groupId, fieldGroup.fieldGroupId,
            self._nextSinceTimestamp,
            helper_dcgm_field_values_since_entity_callback, self)
        #The callback bumps _numValuesSeen, so the difference is this call's yield
        return self._numValuesSeen - seenBefore

    def EmptyValues(self):
        '''
        Empty .values{} so that old data is no longer present in this structure.
        This can be used to prevent .values from growing over time
        '''
        self.values = {}
        self._numValuesSeen = 0
423
+
424
+
425
+ '''
426
+ Helper class for watching a field group and storing fields values returned from it
427
+ '''
428
+
429
+
430
class DcgmFieldGroupEntityWatcher(DcgmFieldValueEntityCollection):
    '''
    Watches a field group and stores the per-entity field values returned from it

    Constructor arguments:

    handle is a DCGM handle from dcgm_agent.dcgmInit()
    groupId is a valid DCGM group ID returned from dcgm_agent.dcgmGroupCreate
    fieldGroup is the DcgmFieldGroup() instance to watch fields for
    operationMode is a dcgm_structs.DCGM_OPERATION_MODE_? constant for if the host engine is running in lock step or auto mode
    updateFreq is how often to update each field in usec
    maxKeepAge is how long DCGM should keep values for in seconds
    maxKeepSamples is the maximum number of samples DCGM should ever cache for each field
    startTimestamp is a base timestamp we should start from when first reading values. This can be used to resume a
                   previous instance of a DcgmFieldGroupWatcher by using its _nextSinceTimestamp.
                   0=start with all cached data
    '''

    def __init__(self, handle, groupId, fieldGroup, operationMode, updateFreq,
                 maxKeepAge, maxKeepSamples, startTimestamp):
        DcgmFieldValueEntityCollection.__init__(self, handle, groupId)

        self._fieldGroup = fieldGroup
        self._operationMode = operationMode
        self._updateFreq = updateFreq
        self._maxKeepAge = maxKeepAge
        self._maxKeepSamples = maxKeepSamples

        #Only a positive startTimestamp is honored; anything else means start from beginning of time
        self._nextSinceTimestamp = startTimestamp if startTimestamp > 0 else 0

        #Start watches
        self._WatchFieldGroup()

    def _WatchFieldGroup(self):
        '''
        Initiate the host engine watch on the fields
        '''
        watchStatus = dcgm_agent.dcgmWatchFields(self._handle, self._groupId,
                                                 self._fieldGroup.fieldGroupId,
                                                 self._updateFreq,
                                                 self._maxKeepAge,
                                                 self._maxKeepSamples)
        #Will throw exception on error
        dcgm_structs._dcgmCheckReturn(watchStatus)

        # Force an update of the fields so that we can fetch initial values.
        updateStatus = dcgm_agent.dcgmUpdateAllFields(self._handle, 1)
        #Will throw exception on error
        dcgm_structs._dcgmCheckReturn(updateStatus)

        # Initial update will fetch from startTimestamp.
        self.GetAllSinceLastCall()

    def GetAllSinceLastCall(self):
        '''
        Method to cause more field values to be retrieved from DCGM. Returns the
        number of field values that were retrieved
        '''
        inManualMode = (
            self._operationMode == dcgm_structs.DCGM_OPERATION_MODE_MANUAL)
        if inManualMode:
            #Manual mode does not refresh on its own, so force an update first
            #(_dcgmCheckReturn will throw exception on error)
            dcgm_structs._dcgmCheckReturn(
                dcgm_agent.dcgmUpdateAllFields(self._handle, 1))

        return super().GetAllSinceLastCall(self._fieldGroup)
492
+
493
+
494
+ #Test program for demonstrating how this module works
495
def main():
    '''
    Demo: start an embedded host engine, watch two clock fields on all GPUs with
    both watcher classes, and print a summary of the collected values every
    timeStep seconds. Loops forever.
    '''
    operationMode = dcgm_structs.DCGM_OPERATION_MODE_AUTO
    timeStep = 1.0

    dcgm_structs._dcgmInit()
    dcgm_agent.dcgmInit()  #Will throw an exception on error
    handle = dcgm_agent.dcgmStartEmbedded(operationMode)
    handleObj = pydcgm.DcgmHandle(handle=handle)
    groupId = dcgm_structs.DCGM_GROUP_ALL_GPUS
    fieldIds = [
        dcgm_fields.DCGM_FI_DEV_SM_CLOCK, dcgm_fields.DCGM_FI_DEV_MEM_CLOCK
    ]

    fieldGroup = pydcgm.DcgmFieldGroup(handleObj, "my_field_group", fieldIds)

    updateFreq = int(timeStep * 1000000.0)
    maxKeepAge = 3600.0  #1 hour
    maxKeepSamples = 0  #unlimited. maxKeepAge will enforce quota
    startTimestamp = 0  #beginning of time

    #Both watchers take the same constructor arguments
    watcherArgs = (handle, groupId, fieldGroup, operationMode, updateFreq,
                   maxKeepAge, maxKeepSamples, startTimestamp)
    dfcw = DcgmFieldGroupWatcher(*watcherArgs)
    dfcw2 = DcgmFieldGroupEntityWatcher(*watcherArgs)

    while True:
        newUpdateCount = dfcw.GetAllSinceLastCall()
        newUpdateCount2 = dfcw2.GetAllSinceLastCall()
        print("Got %d and %d new field value updates" %
              (newUpdateCount, newUpdateCount2))

        for gpuId, gpuFields in list(dfcw.values.items()):
            print("gpuId %d" % gpuId)
            for fieldId, series in list(gpuFields.items()):
                print(" fieldId %d: %d values. latest timestamp %d" %
                      (fieldId, len(series), series[-1].ts))

        for entityGroupId, entities in list(dfcw2.values.items()):
            print("entityGroupId %d" % entityGroupId)
            for entityId, entityFields in list(entities.items()):
                print(" entityId %d" % entityId)
                for fieldId, series in list(entityFields.items()):
                    print(" fieldId %d: %d values. latest timestamp %d" %
                          (fieldId, len(series), series[-1].ts))

        time.sleep(timeStep)


if __name__ == "__main__":
    main()