triton-model-analyzer 1.48.0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (204) hide show
  1. model_analyzer/__init__.py +15 -0
  2. model_analyzer/analyzer.py +448 -0
  3. model_analyzer/cli/__init__.py +15 -0
  4. model_analyzer/cli/cli.py +193 -0
  5. model_analyzer/config/__init__.py +15 -0
  6. model_analyzer/config/generate/__init__.py +15 -0
  7. model_analyzer/config/generate/automatic_model_config_generator.py +164 -0
  8. model_analyzer/config/generate/base_model_config_generator.py +352 -0
  9. model_analyzer/config/generate/brute_plus_binary_parameter_search_run_config_generator.py +164 -0
  10. model_analyzer/config/generate/brute_run_config_generator.py +154 -0
  11. model_analyzer/config/generate/concurrency_sweeper.py +75 -0
  12. model_analyzer/config/generate/config_generator_interface.py +52 -0
  13. model_analyzer/config/generate/coordinate.py +143 -0
  14. model_analyzer/config/generate/coordinate_data.py +86 -0
  15. model_analyzer/config/generate/generator_utils.py +116 -0
  16. model_analyzer/config/generate/manual_model_config_generator.py +187 -0
  17. model_analyzer/config/generate/model_config_generator_factory.py +92 -0
  18. model_analyzer/config/generate/model_profile_spec.py +74 -0
  19. model_analyzer/config/generate/model_run_config_generator.py +154 -0
  20. model_analyzer/config/generate/model_variant_name_manager.py +150 -0
  21. model_analyzer/config/generate/neighborhood.py +536 -0
  22. model_analyzer/config/generate/optuna_plus_concurrency_sweep_run_config_generator.py +141 -0
  23. model_analyzer/config/generate/optuna_run_config_generator.py +838 -0
  24. model_analyzer/config/generate/perf_analyzer_config_generator.py +312 -0
  25. model_analyzer/config/generate/quick_plus_concurrency_sweep_run_config_generator.py +130 -0
  26. model_analyzer/config/generate/quick_run_config_generator.py +753 -0
  27. model_analyzer/config/generate/run_config_generator_factory.py +329 -0
  28. model_analyzer/config/generate/search_config.py +112 -0
  29. model_analyzer/config/generate/search_dimension.py +73 -0
  30. model_analyzer/config/generate/search_dimensions.py +85 -0
  31. model_analyzer/config/generate/search_parameter.py +49 -0
  32. model_analyzer/config/generate/search_parameters.py +388 -0
  33. model_analyzer/config/input/__init__.py +15 -0
  34. model_analyzer/config/input/config_command.py +483 -0
  35. model_analyzer/config/input/config_command_profile.py +1747 -0
  36. model_analyzer/config/input/config_command_report.py +267 -0
  37. model_analyzer/config/input/config_defaults.py +236 -0
  38. model_analyzer/config/input/config_enum.py +83 -0
  39. model_analyzer/config/input/config_field.py +216 -0
  40. model_analyzer/config/input/config_list_generic.py +112 -0
  41. model_analyzer/config/input/config_list_numeric.py +151 -0
  42. model_analyzer/config/input/config_list_string.py +111 -0
  43. model_analyzer/config/input/config_none.py +71 -0
  44. model_analyzer/config/input/config_object.py +129 -0
  45. model_analyzer/config/input/config_primitive.py +81 -0
  46. model_analyzer/config/input/config_status.py +75 -0
  47. model_analyzer/config/input/config_sweep.py +83 -0
  48. model_analyzer/config/input/config_union.py +113 -0
  49. model_analyzer/config/input/config_utils.py +128 -0
  50. model_analyzer/config/input/config_value.py +243 -0
  51. model_analyzer/config/input/objects/__init__.py +15 -0
  52. model_analyzer/config/input/objects/config_model_profile_spec.py +325 -0
  53. model_analyzer/config/input/objects/config_model_report_spec.py +173 -0
  54. model_analyzer/config/input/objects/config_plot.py +198 -0
  55. model_analyzer/config/input/objects/config_protobuf_utils.py +101 -0
  56. model_analyzer/config/input/yaml_config_validator.py +82 -0
  57. model_analyzer/config/run/__init__.py +15 -0
  58. model_analyzer/config/run/model_run_config.py +313 -0
  59. model_analyzer/config/run/run_config.py +168 -0
  60. model_analyzer/constants.py +76 -0
  61. model_analyzer/device/__init__.py +15 -0
  62. model_analyzer/device/device.py +24 -0
  63. model_analyzer/device/gpu_device.py +87 -0
  64. model_analyzer/device/gpu_device_factory.py +248 -0
  65. model_analyzer/entrypoint.py +307 -0
  66. model_analyzer/log_formatter.py +65 -0
  67. model_analyzer/model_analyzer_exceptions.py +24 -0
  68. model_analyzer/model_manager.py +255 -0
  69. model_analyzer/monitor/__init__.py +15 -0
  70. model_analyzer/monitor/cpu_monitor.py +69 -0
  71. model_analyzer/monitor/dcgm/DcgmDiag.py +191 -0
  72. model_analyzer/monitor/dcgm/DcgmFieldGroup.py +83 -0
  73. model_analyzer/monitor/dcgm/DcgmGroup.py +815 -0
  74. model_analyzer/monitor/dcgm/DcgmHandle.py +141 -0
  75. model_analyzer/monitor/dcgm/DcgmJsonReader.py +69 -0
  76. model_analyzer/monitor/dcgm/DcgmReader.py +623 -0
  77. model_analyzer/monitor/dcgm/DcgmStatus.py +57 -0
  78. model_analyzer/monitor/dcgm/DcgmSystem.py +412 -0
  79. model_analyzer/monitor/dcgm/__init__.py +15 -0
  80. model_analyzer/monitor/dcgm/common/__init__.py +13 -0
  81. model_analyzer/monitor/dcgm/common/dcgm_client_cli_parser.py +194 -0
  82. model_analyzer/monitor/dcgm/common/dcgm_client_main.py +86 -0
  83. model_analyzer/monitor/dcgm/dcgm_agent.py +887 -0
  84. model_analyzer/monitor/dcgm/dcgm_collectd_plugin.py +369 -0
  85. model_analyzer/monitor/dcgm/dcgm_errors.py +395 -0
  86. model_analyzer/monitor/dcgm/dcgm_field_helpers.py +546 -0
  87. model_analyzer/monitor/dcgm/dcgm_fields.py +815 -0
  88. model_analyzer/monitor/dcgm/dcgm_fields_collectd.py +671 -0
  89. model_analyzer/monitor/dcgm/dcgm_fields_internal.py +29 -0
  90. model_analyzer/monitor/dcgm/dcgm_fluentd.py +45 -0
  91. model_analyzer/monitor/dcgm/dcgm_monitor.py +138 -0
  92. model_analyzer/monitor/dcgm/dcgm_prometheus.py +326 -0
  93. model_analyzer/monitor/dcgm/dcgm_structs.py +2357 -0
  94. model_analyzer/monitor/dcgm/dcgm_telegraf.py +65 -0
  95. model_analyzer/monitor/dcgm/dcgm_value.py +151 -0
  96. model_analyzer/monitor/dcgm/dcgmvalue.py +155 -0
  97. model_analyzer/monitor/dcgm/denylist_recommendations.py +573 -0
  98. model_analyzer/monitor/dcgm/pydcgm.py +47 -0
  99. model_analyzer/monitor/monitor.py +143 -0
  100. model_analyzer/monitor/remote_monitor.py +137 -0
  101. model_analyzer/output/__init__.py +15 -0
  102. model_analyzer/output/file_writer.py +63 -0
  103. model_analyzer/output/output_writer.py +42 -0
  104. model_analyzer/perf_analyzer/__init__.py +15 -0
  105. model_analyzer/perf_analyzer/genai_perf_config.py +206 -0
  106. model_analyzer/perf_analyzer/perf_analyzer.py +882 -0
  107. model_analyzer/perf_analyzer/perf_config.py +479 -0
  108. model_analyzer/plots/__init__.py +15 -0
  109. model_analyzer/plots/detailed_plot.py +266 -0
  110. model_analyzer/plots/plot_manager.py +224 -0
  111. model_analyzer/plots/simple_plot.py +213 -0
  112. model_analyzer/record/__init__.py +15 -0
  113. model_analyzer/record/gpu_record.py +68 -0
  114. model_analyzer/record/metrics_manager.py +887 -0
  115. model_analyzer/record/record.py +280 -0
  116. model_analyzer/record/record_aggregator.py +256 -0
  117. model_analyzer/record/types/__init__.py +15 -0
  118. model_analyzer/record/types/cpu_available_ram.py +93 -0
  119. model_analyzer/record/types/cpu_used_ram.py +93 -0
  120. model_analyzer/record/types/gpu_free_memory.py +96 -0
  121. model_analyzer/record/types/gpu_power_usage.py +107 -0
  122. model_analyzer/record/types/gpu_total_memory.py +96 -0
  123. model_analyzer/record/types/gpu_used_memory.py +96 -0
  124. model_analyzer/record/types/gpu_utilization.py +108 -0
  125. model_analyzer/record/types/inter_token_latency_avg.py +60 -0
  126. model_analyzer/record/types/inter_token_latency_base.py +74 -0
  127. model_analyzer/record/types/inter_token_latency_max.py +60 -0
  128. model_analyzer/record/types/inter_token_latency_min.py +60 -0
  129. model_analyzer/record/types/inter_token_latency_p25.py +60 -0
  130. model_analyzer/record/types/inter_token_latency_p50.py +60 -0
  131. model_analyzer/record/types/inter_token_latency_p75.py +60 -0
  132. model_analyzer/record/types/inter_token_latency_p90.py +60 -0
  133. model_analyzer/record/types/inter_token_latency_p95.py +60 -0
  134. model_analyzer/record/types/inter_token_latency_p99.py +60 -0
  135. model_analyzer/record/types/output_token_throughput.py +105 -0
  136. model_analyzer/record/types/perf_client_response_wait.py +97 -0
  137. model_analyzer/record/types/perf_client_send_recv.py +97 -0
  138. model_analyzer/record/types/perf_latency.py +111 -0
  139. model_analyzer/record/types/perf_latency_avg.py +60 -0
  140. model_analyzer/record/types/perf_latency_base.py +74 -0
  141. model_analyzer/record/types/perf_latency_p90.py +60 -0
  142. model_analyzer/record/types/perf_latency_p95.py +60 -0
  143. model_analyzer/record/types/perf_latency_p99.py +60 -0
  144. model_analyzer/record/types/perf_server_compute_infer.py +97 -0
  145. model_analyzer/record/types/perf_server_compute_input.py +97 -0
  146. model_analyzer/record/types/perf_server_compute_output.py +97 -0
  147. model_analyzer/record/types/perf_server_queue.py +97 -0
  148. model_analyzer/record/types/perf_throughput.py +105 -0
  149. model_analyzer/record/types/time_to_first_token_avg.py +60 -0
  150. model_analyzer/record/types/time_to_first_token_base.py +74 -0
  151. model_analyzer/record/types/time_to_first_token_max.py +60 -0
  152. model_analyzer/record/types/time_to_first_token_min.py +60 -0
  153. model_analyzer/record/types/time_to_first_token_p25.py +60 -0
  154. model_analyzer/record/types/time_to_first_token_p50.py +60 -0
  155. model_analyzer/record/types/time_to_first_token_p75.py +60 -0
  156. model_analyzer/record/types/time_to_first_token_p90.py +60 -0
  157. model_analyzer/record/types/time_to_first_token_p95.py +60 -0
  158. model_analyzer/record/types/time_to_first_token_p99.py +60 -0
  159. model_analyzer/reports/__init__.py +15 -0
  160. model_analyzer/reports/html_report.py +195 -0
  161. model_analyzer/reports/pdf_report.py +50 -0
  162. model_analyzer/reports/report.py +86 -0
  163. model_analyzer/reports/report_factory.py +62 -0
  164. model_analyzer/reports/report_manager.py +1376 -0
  165. model_analyzer/reports/report_utils.py +42 -0
  166. model_analyzer/result/__init__.py +15 -0
  167. model_analyzer/result/constraint_manager.py +150 -0
  168. model_analyzer/result/model_config_measurement.py +354 -0
  169. model_analyzer/result/model_constraints.py +105 -0
  170. model_analyzer/result/parameter_search.py +246 -0
  171. model_analyzer/result/result_manager.py +430 -0
  172. model_analyzer/result/result_statistics.py +159 -0
  173. model_analyzer/result/result_table.py +217 -0
  174. model_analyzer/result/result_table_manager.py +646 -0
  175. model_analyzer/result/result_utils.py +42 -0
  176. model_analyzer/result/results.py +277 -0
  177. model_analyzer/result/run_config_measurement.py +658 -0
  178. model_analyzer/result/run_config_result.py +210 -0
  179. model_analyzer/result/run_config_result_comparator.py +110 -0
  180. model_analyzer/result/sorted_results.py +151 -0
  181. model_analyzer/state/__init__.py +15 -0
  182. model_analyzer/state/analyzer_state.py +76 -0
  183. model_analyzer/state/analyzer_state_manager.py +215 -0
  184. model_analyzer/triton/__init__.py +15 -0
  185. model_analyzer/triton/client/__init__.py +15 -0
  186. model_analyzer/triton/client/client.py +234 -0
  187. model_analyzer/triton/client/client_factory.py +57 -0
  188. model_analyzer/triton/client/grpc_client.py +104 -0
  189. model_analyzer/triton/client/http_client.py +107 -0
  190. model_analyzer/triton/model/__init__.py +15 -0
  191. model_analyzer/triton/model/model_config.py +556 -0
  192. model_analyzer/triton/model/model_config_variant.py +29 -0
  193. model_analyzer/triton/server/__init__.py +15 -0
  194. model_analyzer/triton/server/server.py +76 -0
  195. model_analyzer/triton/server/server_config.py +269 -0
  196. model_analyzer/triton/server/server_docker.py +229 -0
  197. model_analyzer/triton/server/server_factory.py +306 -0
  198. model_analyzer/triton/server/server_local.py +158 -0
  199. triton_model_analyzer-1.48.0.dist-info/METADATA +52 -0
  200. triton_model_analyzer-1.48.0.dist-info/RECORD +204 -0
  201. triton_model_analyzer-1.48.0.dist-info/WHEEL +5 -0
  202. triton_model_analyzer-1.48.0.dist-info/entry_points.txt +2 -0
  203. triton_model_analyzer-1.48.0.dist-info/licenses/LICENSE +67 -0
  204. triton_model_analyzer-1.48.0.dist-info/top_level.txt +1 -0
@@ -0,0 +1,412 @@
1
+ # Copyright (c) 2022, NVIDIA CORPORATION & AFFILIATES. All rights reserved.
2
+ #
3
+ # Licensed under the Apache License, Version 2.0 (the "License");
4
+ # you may not use this file except in compliance with the License.
5
+ # You may obtain a copy of the License at
6
+ #
7
+ # http://www.apache.org/licenses/LICENSE-2.0
8
+ #
9
+ # Unless required by applicable law or agreed to in writing, software
10
+ # distributed under the License is distributed on an "AS IS" BASIS,
11
+ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12
+ # See the License for the specific language governing permissions and
13
+ # limitations under the License.
14
+
15
+ import model_analyzer.monitor.dcgm.pydcgm as pydcgm
16
+ import model_analyzer.monitor.dcgm.dcgm_agent as dcgm_agent
17
+ import model_analyzer.monitor.dcgm.dcgm_structs as dcgm_structs
18
+ import model_analyzer.monitor.dcgm.dcgm_fields as dcgm_fields
19
+ import ctypes
20
+
21
+
22
class DcgmSystemDiscovery:
    '''
    Class to discover the GPUs and other entities that DCGM knows about.

    dcgmHandle is the connection object whose .handle attribute is passed to
    every dcgm_agent call made by this class
    '''

    def __init__(self, dcgmHandle):
        self._dcgmHandle = dcgmHandle

    def GetAllGpuIds(self):
        '''
        Get all IDs of the GPUs that DCGM knows about. To get only GPUs that DCGM supports,
        use GetAllSupportedGpuIds().

        Returns an array of GPU IDs. Each of these can be passed to DcgmGroup::AddGpu()
        '''
        return dcgm_agent.dcgmGetAllDevices(self._dcgmHandle.handle)

    def GetAllSupportedGpuIds(self):
        '''
        Get all of the IDs of the GPUs that DCGM supports. This will exclude unsupported
        GPUs.

        Returns an array of GPU IDs. Each of these can be passed to DcgmGroup::AddGpu()
        '''
        return dcgm_agent.dcgmGetAllSupportedDevices(self._dcgmHandle.handle)

    def GetGpuAttributes(self, gpuId):
        '''
        Get some basic GPU attributes for a given GPU ID.

        Returns a dcgm_structs.c_dcgmDeviceAttributes_v3() object for the given GPU
        '''
        handle = self._dcgmHandle.handle
        return dcgm_agent.dcgmGetDeviceAttributes(handle, gpuId)

    def GetGpuTopology(self, gpuId):
        '''
        Get topology information for a given GPU ID

        Returns a dcgm_structs.c_dcgmDeviceTopology_v1 structure representing the topology for the given GPU
        '''
        handle = self._dcgmHandle.handle
        return dcgm_agent.dcgmGetDeviceTopology(handle, gpuId)

    def GetEntityGroupEntities(self, entityGroupId, onlySupported):
        '''
        Get all entityIds of the entities that DCGM knows about.

        entityGroupId IN: DCGM_FE_? constant of the entity group to fetch the entities of
        onlySupported IN: Boolean as to whether to fetch entities that are supported by DCGM (True)
                          or all entity IDs (False)

        Returns an array of entity IDs. Each of these can be passed to DcgmGroup::AddEntity()
        '''
        # The only flag this method ever sets is the "only supported" filter
        flags = dcgm_structs.DCGM_GEGE_FLAG_ONLY_SUPPORTED if onlySupported else 0
        return dcgm_agent.dcgmGetEntityGroupEntities(self._dcgmHandle.handle,
                                                     entityGroupId, flags)

    def GetNvLinkLinkStatus(self):
        '''
        Get the status of all of the NvLink links in the system.

        Returns a dcgm_structs.c_dcgmNvLinkStatus_v3 object.
        '''
        return dcgm_agent.dcgmGetNvLinkLinkStatus(self._dcgmHandle.handle)

    def SelectGpusByTopology(self, inputGpuIds, numGpus, hintFlags):
        '''
        From a bitmask of input gpu ids, return a bitmask of numGpus GPUs which identifies the topologically
        closest GPUs to use for a single job. DCGM will consider CPU affinities and NVLink connection speeds
        to determine the closest.
        hintFlags can instruct DCGM to consider GPU health or not. By default, unhealthy GPUs are excluded from
        consideration.
        '''
        handle = self._dcgmHandle.handle
        return dcgm_agent.dcgmSelectGpusByTopology(handle, inputGpuIds,
                                                   numGpus, hintFlags)
110
+
111
+
112
class DcgmSystemIntrospect:
    '''
    Class to access the system-wide introspection modules of DCGM

    Exposes the per-topic helpers as the attributes .memory and .cpuUtil
    '''

    def __init__(self, dcgmHandle):
        self._handle = dcgmHandle
        # Both helpers share the same connection as this object
        self.memory = DcgmSystemIntrospectMemory(dcgmHandle)
        self.cpuUtil = DcgmSystemIntrospectCpuUtil(dcgmHandle)

    def UpdateAll(self, waitForUpdate=True):
        '''
        Ask DCGM to update all of its introspection data.

        waitForUpdate is passed through unchanged to dcgm_agent.dcgmIntrospectUpdateAll()
        '''
        connection = self._handle.handle
        dcgm_agent.dcgmIntrospectUpdateAll(connection, waitForUpdate)
124
+
125
+
126
class DcgmSystemIntrospectMemory:
    '''
    Class to access information about the memory usage of DCGM itself
    '''

    def __init__(self, dcgmHandle):
        self._dcgmHandle = dcgmHandle

    def GetForHostengine(self, waitIfNoData=True):
        '''
        Retrieve the total amount of virtual memory that the hostengine process is currently using.
        This measurement represents both the resident set size (what is currently in RAM) and
        the swapped memory that belongs to the process.

        waitIfNoData: wait for metadata to be updated if it's not available

        Returns a dcgm_structs.c_dcgmIntrospectMemory_v1 object
        Raises an exception for DCGM_ST_NO_DATA if no data is available yet and waitIfNoData is False
        '''
        connection = self._dcgmHandle.handle
        return dcgm_agent.dcgmIntrospectGetHostengineMemoryUsage(
            connection, waitIfNoData)
147
+
148
+
149
class DcgmSystemIntrospectCpuUtil:
    '''
    Class to access information about the CPU Utilization of DCGM
    '''

    def __init__(self, dcgmHandle):
        self._dcgmHandle = dcgmHandle

    def GetForHostengine(self, waitIfNoData=True):
        '''
        Get the current CPU Utilization of the hostengine process.

        waitIfNoData: wait for metadata to be updated if it's not available

        Returns a dcgm_structs.c_dcgmIntrospectCpuUtil_v1 object
        Raises an exception for DCGM_ST_NO_DATA if no data is available yet and waitIfNoData is False
        '''
        connection = self._dcgmHandle.handle
        return dcgm_agent.dcgmIntrospectGetHostengineCpuUtilization(
            connection, waitIfNoData)
168
+
169
+
170
+ '''
171
+ Class to encapsulate DCGM field-metadata requests
172
+ '''
173
+
174
+
175
class DcgmSystemFields:
    '''
    Class to encapsulate DCGM field-metadata requests

    Holds no state of its own; both lookups delegate straight to dcgm_fields
    '''

    def GetFieldById(self, fieldId):
        '''
        Get a field's metadata by its dcgm_fields.DCGM_FI_* field ID

        fieldId: dcgm_fields.DCGM_FI_* field ID of the field

        Returns a dcgm_fields.c_dcgm_field_meta_t struct on success or None on error.
        '''
        fieldMeta = dcgm_fields.DcgmFieldGetById(fieldId)
        return fieldMeta

    def GetFieldByTag(self, tag):
        '''
        Get a field's metadata by its tag name. Ex: 'brand'

        tag: Tag name of the field

        Returns a dcgm_fields.c_dcgm_field_meta_t struct on success or None on error.
        '''
        fieldMeta = dcgm_fields.DcgmFieldGetByTag(tag)
        return fieldMeta
196
+
197
+
198
+ '''
199
+ Class to encapsulate DCGM module management and introspection
200
+ '''
201
+
202
+
203
class DcgmSystemModules:
    '''
    Class to encapsulate DCGM module management and introspection

    dcgmHandle is the connection object whose .handle attribute is passed to
    the dcgm_agent module calls
    '''

    def __init__(self, dcgmHandle):
        self._dcgmHandle = dcgmHandle

    def Denylist(self, moduleId):
        '''
        Denylist a module from being loaded by DCGM.

        moduleId a dcgm_structs.dcgmModuleId* ID of the module to denylist

        Returns: Nothing.
        Raises a DCGM_ST_IN_USE exception if the module was already loaded
        '''
        connection = self._dcgmHandle.handle
        dcgm_agent.dcgmModuleDenylist(connection, moduleId)

    def GetStatuses(self):
        '''
        Get the statuses of all of the modules in DCGM

        Returns: a dcgm_structs.c_dcgmModuleGetStatuses_v1 structure.
        '''
        connection = self._dcgmHandle.handle
        return dcgm_agent.dcgmModuleGetStatuses(connection)
231
+
232
+
233
+ '''
234
+ Class to encapsulate DCGM profiling
235
+ '''
236
+
237
+
238
class DcgmSystemProfiling:
    '''
    Class to encapsulate DCGM profiling

    dcgmHandle is the connection object whose .handle attribute is passed to
    the dcgm_agent profiling calls
    '''

    def __init__(self, dcgmHandle):
        self._dcgmHandle = dcgmHandle

    def Pause(self):
        '''
        Pause profiling activities in DCGM. This should be used when you are monitoring profiling fields
        from DCGM but want to be able to still run developer tools like nvprof, nsight systems, and nsight compute.
        Profiling fields start with DCGM_PROF_ and are in the field ID range 1001-1012.

        Call this API before you launch one of those tools and Resume() after the tool has completed.

        DCGM will save BLANK values while profiling is paused.
        Calling this while profiling activities are already paused is fine and will be treated as a no-op.
        '''
        connection = self._dcgmHandle.handle
        return dcgm_agent.dcgmProfPause(connection)

    def Resume(self):
        '''
        Resume profiling activities in DCGM that were previously paused with Pause().

        Call this API after you have completed running other NVIDIA developer tools to reenable DCGM
        profiling metrics.

        DCGM will save BLANK values while profiling is paused.

        Calling this while profiling activities have already been resumed is fine and will be treated as a no-op.
        '''
        connection = self._dcgmHandle.handle
        return dcgm_agent.dcgmProfResume(connection)
273
+
274
+
275
+ '''
276
+ Class to encapsulate global DCGM methods. These apply to a single DcgmHandle, provided to the constructor
277
+ '''
278
+
279
+
280
class DcgmSystem:
    '''
    Class to encapsulate global DCGM methods. These apply to a single DcgmHandle, provided to the constructor

    dcgmHandle is a pydcgm.DcgmHandle instance of the connection that will be used by all methods of this class
    '''

    def __init__(self, dcgmHandle):
        self._dcgmHandle = dcgmHandle

        # Child helper objects; all but .fields share this connection
        self.discovery = DcgmSystemDiscovery(dcgmHandle)
        self.introspect = DcgmSystemIntrospect(dcgmHandle)
        self.fields = DcgmSystemFields()
        self.modules = DcgmSystemModules(dcgmHandle)
        self.profiling = DcgmSystemProfiling(dcgmHandle)

    def UpdateAllFields(self, waitForUpdate):
        '''
        Request that the host engine perform a field value update cycle. If the host
        engine was started in DCGM_OPERATION_MODE_MANUAL, calling this method is
        the only way that field values will be updated.

        Note that performing a field value update cycle does not update every field.
        It only updates fields that are newly watched or fields that haven't updated
        in enough time to warrant updating again, based on their update frequency.

        waitForUpdate specifies whether this function call should block until the
        field value update loop is complete or not. Use True if you intend to query
        values immediately after calling this.
        '''
        status = dcgm_agent.dcgmUpdateAllFields(self._dcgmHandle.handle,
                                                waitForUpdate)
        # Throw an exception on error
        dcgm_structs._dcgmCheckReturn(status)

    def GetDefaultGroup(self):
        '''
        Get a DcgmGroup instance for the default all-GPUs group. This object is used to
        perform operations on a group of GPUs. See DcgmGroup.py for details.

        AddGpu() and RemoveGpu() operations are not allowed on the default group
        '''
        allGpusGroupId = dcgm_structs.DCGM_GROUP_ALL_GPUS
        return pydcgm.DcgmGroup(self._dcgmHandle, groupId=allGpusGroupId)

    def GetEmptyGroup(self, groupName):
        '''
        Get an instance of DcgmGroup with no GPUs. Call AddGpu() on the returned
        object with GPU IDs from GetAllGpuIds() before performing actions on
        the returned DcgmGroup instance.

        groupName is the name of the group to create in the host engine. This name must be
        unique.

        Note: The group will be deleted from the host engine when the returned object goes out of scope
        '''
        emptyGroup = pydcgm.DcgmGroup(self._dcgmHandle, groupName=groupName)
        return emptyGroup

    def GetGroupWithGpuIds(self, groupName, gpuIds):
        '''
        Get an instance of DcgmGroup populated with the gpuIds provided

        groupName is the name of the group to create in the host engine. This name must be
        unique.
        gpuIds is the list of GPU IDs to add to the group

        Note: The group will be deleted from the host engine when the returned object goes out of scope
        '''
        populated = pydcgm.DcgmGroup(self._dcgmHandle, groupName=groupName)
        for oneGpuId in gpuIds:
            populated.AddGpu(oneGpuId)
        return populated

    def GetGroupWithEntities(self, groupName, entities):
        '''
        Get an instance of DcgmGroup populated with the provided entities

        groupName is the name of the group to create in the host engine. This name must be
        unique.
        entities is the list of entity pairs (type and id) to add to the group

        Note: The group will be deleted from the host engine when the returned object goes out of scope
        '''
        populated = pydcgm.DcgmGroup(self._dcgmHandle, groupName=groupName)
        for pair in entities:
            populated.AddEntity(pair.entityGroupId, pair.entityId)
        return populated

    def GetAllGroupIds(self):
        '''
        Get ids of all DcgmGroups of GPUs. This returns a list containing the ids of the DcgmGroups.
        '''
        connection = self._dcgmHandle.handle
        return dcgm_agent.dcgmGroupGetAllIds(connection)

    def GetAllFieldGroups(self):
        '''
        Get all of the field groups in the system
        '''
        connection = self._dcgmHandle.handle
        return dcgm_agent.dcgmFieldGroupGetAll(connection)

    def GetFieldGroupIdByName(self, name):
        '''
        Get a field group's id by its name.

        Returns: Field group ID (wrapped in a ctypes.c_void_p) if found
                 None if not found
        '''
        allGroups = self.GetAllFieldGroups()
        # Only the first numFieldGroups entries of fieldGroups are examined
        for index in range(allGroups.numFieldGroups):
            candidate = allGroups.fieldGroups[index]
            if candidate.fieldGroupName != name:
                continue
            return ctypes.c_void_p(candidate.fieldGroupId)
        return None

    def PauseTelemetryForDiag(self):
        """Pause DCGM modules from updating field values."""
        # NOTE(review): imported by bare module name, unlike the package-qualified
        # imports at the top of this file - presumably needs DCGM's own bindings
        # directory on sys.path; confirm before relying on this method.
        import dcgm_agent_internal
        connection = self._dcgmHandle.handle
        dcgm_agent_internal.dcgmPauseTelemetryForDiag(connection)

    def ResumeTelemetryForDiag(self):
        """Resume previously paused DCGM modules so that they can update field values."""
        # NOTE(review): imported by bare module name, unlike the package-qualified
        # imports at the top of this file - presumably needs DCGM's own bindings
        # directory on sys.path; confirm before relying on this method.
        import dcgm_agent_internal
        connection = self._dcgmHandle.handle
        dcgm_agent_internal.dcgmResumeTelemetryForDiag(connection)
@@ -0,0 +1,15 @@
1
+ #!/usr/bin/env python3
2
+
3
+ # Copyright 2020-2023, NVIDIA CORPORATION & AFFILIATES. All rights reserved.
4
+ #
5
+ # Licensed under the Apache License, Version 2.0 (the "License");
6
+ # you may not use this file except in compliance with the License.
7
+ # You may obtain a copy of the License at
8
+ #
9
+ # http://www.apache.org/licenses/LICENSE-2.0
10
+ #
11
+ # Unless required by applicable law or agreed to in writing, software
12
+ # distributed under the License is distributed on an "AS IS" BASIS,
13
+ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
14
+ # See the License for the specific language governing permissions and
15
+ # limitations under the License.
@@ -0,0 +1,13 @@
1
+ # Copyright (c) 2022, NVIDIA CORPORATION & AFFILIATES. All rights reserved.
2
+ #
3
+ # Licensed under the Apache License, Version 2.0 (the "License");
4
+ # you may not use this file except in compliance with the License.
5
+ # You may obtain a copy of the License at
6
+ #
7
+ # http://www.apache.org/licenses/LICENSE-2.0
8
+ #
9
+ # Unless required by applicable law or agreed to in writing, software
10
+ # distributed under the License is distributed on an "AS IS" BASIS,
11
+ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12
+ # See the License for the specific language governing permissions and
13
+ # limitations under the License.
@@ -0,0 +1,194 @@
1
+ # Copyright (c) 2022, NVIDIA CORPORATION & AFFILIATES. All rights reserved.
2
+ #
3
+ # Licensed under the Apache License, Version 2.0 (the "License");
4
+ # you may not use this file except in compliance with the License.
5
+ # You may obtain a copy of the License at
6
+ #
7
+ # http://www.apache.org/licenses/LICENSE-2.0
8
+ #
9
+ # Unless required by applicable law or agreed to in writing, software
10
+ # distributed under the License is distributed on an "AS IS" BASIS,
11
+ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12
+ # See the License for the specific language governing permissions and
13
+ # limitations under the License.
14
+ from os import environ
15
+ import argparse
16
+ import logging
17
+ import sys
18
+
19
+
20
+ ###############################################################################
21
def create_parser(
        publish_port=8000,
        interval=10,
        name='the monitoring tool',  # Replace with 'prometheus', 'telegraf', etc.
        field_ids=None,
        log_file=None,
        log_level='INFO',
        dcgm_hostname=environ.get('DCGM_HOSTNAME') or 'localhost',
):
    '''
    Build an argparse parser whose options default to sane values.

    Each keyword argument overrides the default of the matching option;
    name is only used in the --log-file help text.

    Note: if DCGM_HOSTNAME is set as an environment variable, it is used as
    the default instead of localhost

    Returns the configured argparse.ArgumentParser.
    '''
    # Help texts are assembled first so the option table below stays compact
    port_help = 'TCP port that the client should publish to. Default={}.'.format(
        publish_port)
    interval_help = (
        'How often the client should retrieve new values from DCGM in seconds. Default={}.'
        .format(interval))
    field_ids_help = ''.join([
        'Comma-separated list of field IDs that should be retrieved from DCGM. ',
        'The full list of available field IDs can be obtained from dcgm_fields.h, dcgm_fields.py, ',
        "or running 'dcgmi dmon -l'.",
    ])
    log_file_help = (
        'A path to a log file for recording what information is being sent to {}'
        .format(name))
    log_level_help = '\n'.join([
        'Specify a log level to use for logging.',
        '\tCRITICAL (0) - log only critical errors that drastically affect execution',
        '\tERROR (1) - Log any error in execution',
        '\tWARNING (2) - Log all warnings and errors that occur',
        '\tINFO (3) - Log informational messages about program execution in addition to warnings and errors',
        '\tDEBUG (4) - Log debugging information in addition to all information about execution',
        'Default: {}'.format(log_level),
    ])
    hostname_help = (
        'IP/hostname where the client should query DCGM for values. Default={} (all interfaces).'
        .format(dcgm_hostname))
    embedded_help = 'Launch DCGM from within this process instead of connecting to nv-hostengine.'

    parser = argparse.ArgumentParser()
    parser.add_argument('-p', '--publish-port', dest='publish_port',
                        type=int, default=publish_port, help=port_help)
    parser.add_argument('-i', '--interval', dest='interval',
                        type=int, default=interval, help=interval_help)
    parser.add_argument('-f', '--field-ids', dest='field_ids',
                        type=str, default=field_ids, help=field_ids_help)
    parser.add_argument('--log-file', dest='logfile',
                        type=str, default=log_file, help=log_file_help)
    parser.add_argument('--log-level', dest='loglevel',
                        type=str, default=log_level, help=log_level_help)

    # A remote hostname and embedded mode cannot be requested together
    connection_mode = parser.add_mutually_exclusive_group()
    connection_mode.add_argument('-n', '--hostname', dest='hostname',
                                 type=str, default=dcgm_hostname,
                                 help=hostname_help)
    connection_mode.add_argument('-e', '--embedded', dest='embedded',
                                 action='store_true', help=embedded_help)

    return parser
111
+
112
+
113
def add_custom_argument(parser, *args, **kwargs):
    '''
    Register one extra option on parser.

    All positional and keyword arguments are forwarded unchanged to
    parser.add_argument(). Returns nothing.
    '''
    register = parser.add_argument
    register(*args, **kwargs)
115
+
116
+
117
+ ###############################################################################
118
def add_target_host_argument(name, parser, default_target='localhost'):
    '''
    Add the -t/--publish-hostname option to parser.

    name is only used in the help text; default_target is the value stored
    in publish_hostname when the option is not given. Returns nothing.
    '''
    help_text = 'The hostname at which the client will publish the readings to {}'.format(
        name)
    parser.add_argument('-t', '--publish-hostname',
                        dest='publish_hostname',
                        type=str,
                        default=default_target,
                        help=help_text)
127
+
128
+
129
+ ###############################################################################
130
def run_parser(parser):
    '''
    Run a parser created using create_parser

    Returns whatever parser.parse_args() returns when called with no arguments.
    '''
    parsed = parser.parse_args()
    return parsed
135
+
136
+
137
+ ###############################################################################
138
def get_field_ids(args):
    '''
    Return the field IDs held in args.field_ids.

    A string value is treated as a comma-separated list supplied by the user
    and converted to a list of ints. Any other value (the default object,
    which should already be an array of ints) is returned as is.
    '''
    requested = args.field_ids
    if not isinstance(requested, str):
        # Not user supplied, so there is nothing to parse
        return requested
    return list(map(int, requested.split(",")))
148
+
149
+
150
+ ###############################################################################
151
def get_log_level(args):
    '''
    Translate args.loglevel into a numeric level of the logging module.

    args.loglevel may be a level name (case-insensitive) or its numeric
    alias: CRITICAL (0), ERROR (1), WARNING (2), INFO (3), DEBUG (4).

    Returns the matching logging.* constant.
    Exits the process with status 2 if the value is not recognized.
    '''
    levels = (
        ('0', 'CRITICAL', logging.CRITICAL),
        ('1', 'ERROR', logging.ERROR),
        ('2', 'WARNING', logging.WARNING),
        ('3', 'INFO', logging.INFO),
        ('4', 'DEBUG', logging.DEBUG),
    )
    levelStr = args.loglevel.upper()
    for alias, levelName, numeric_log_level in levels:
        if levelStr in (alias, levelName):
            return numeric_log_level

    # args is the parsed namespace, not the parser, so it has no print_help();
    # report the accepted values directly instead of raising AttributeError.
    sys.stderr.write("Could not understand the specified --log-level '%s'\n" %
                     (args.loglevel))
    sys.stderr.write("Valid values: %s\n" % ", ".join(
        "%s (%s)" % (levelName, alias) for alias, levelName, _ in levels))
    sys.exit(2)
169
+
170
+
171
+ ###############################################################################
172
def parse_command_line(name, default_port, add_target_host=False):
    '''
    Parse the process command line into a settings dictionary.

    name: passed to create_parser() and add_target_host_argument() for help text
    default_port: default for the --publish-port option
    add_target_host: when True, the --publish-hostname option is also accepted

    Returns a dict with the keys publish_port, interval, logfile,
    dcgm_hostname (None when --embedded was given), field_ids and log_level.
    publish_hostname is present only when add_target_host is True.
    '''
    # Fields we accept raw from the CLI
    FIELDS_AS_IS = ['publish_port', 'interval', 'logfile', 'publish_hostname']

    parser = create_parser(
        name=name,
        publish_port=default_port,
    )

    if add_target_host:
        add_target_host_argument(name, parser)

    args = run_parser(parser)
    field_ids = get_field_ids(args)
    log_level = get_log_level(args)

    args_as_dict = vars(args)
    # publish_hostname is only registered when add_target_host is True, so
    # copy just the raw fields the parser actually produced rather than
    # raising KeyError on the missing one.
    settings = {
        field: args_as_dict[field]
        for field in FIELDS_AS_IS
        if field in args_as_dict
    }
    settings['dcgm_hostname'] = None if args.embedded else args.hostname
    settings['field_ids'] = field_ids
    settings['log_level'] = log_level

    return settings