triton-model-analyzer 1.48.0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (204) hide show
  1. model_analyzer/__init__.py +15 -0
  2. model_analyzer/analyzer.py +448 -0
  3. model_analyzer/cli/__init__.py +15 -0
  4. model_analyzer/cli/cli.py +193 -0
  5. model_analyzer/config/__init__.py +15 -0
  6. model_analyzer/config/generate/__init__.py +15 -0
  7. model_analyzer/config/generate/automatic_model_config_generator.py +164 -0
  8. model_analyzer/config/generate/base_model_config_generator.py +352 -0
  9. model_analyzer/config/generate/brute_plus_binary_parameter_search_run_config_generator.py +164 -0
  10. model_analyzer/config/generate/brute_run_config_generator.py +154 -0
  11. model_analyzer/config/generate/concurrency_sweeper.py +75 -0
  12. model_analyzer/config/generate/config_generator_interface.py +52 -0
  13. model_analyzer/config/generate/coordinate.py +143 -0
  14. model_analyzer/config/generate/coordinate_data.py +86 -0
  15. model_analyzer/config/generate/generator_utils.py +116 -0
  16. model_analyzer/config/generate/manual_model_config_generator.py +187 -0
  17. model_analyzer/config/generate/model_config_generator_factory.py +92 -0
  18. model_analyzer/config/generate/model_profile_spec.py +74 -0
  19. model_analyzer/config/generate/model_run_config_generator.py +154 -0
  20. model_analyzer/config/generate/model_variant_name_manager.py +150 -0
  21. model_analyzer/config/generate/neighborhood.py +536 -0
  22. model_analyzer/config/generate/optuna_plus_concurrency_sweep_run_config_generator.py +141 -0
  23. model_analyzer/config/generate/optuna_run_config_generator.py +838 -0
  24. model_analyzer/config/generate/perf_analyzer_config_generator.py +312 -0
  25. model_analyzer/config/generate/quick_plus_concurrency_sweep_run_config_generator.py +130 -0
  26. model_analyzer/config/generate/quick_run_config_generator.py +753 -0
  27. model_analyzer/config/generate/run_config_generator_factory.py +329 -0
  28. model_analyzer/config/generate/search_config.py +112 -0
  29. model_analyzer/config/generate/search_dimension.py +73 -0
  30. model_analyzer/config/generate/search_dimensions.py +85 -0
  31. model_analyzer/config/generate/search_parameter.py +49 -0
  32. model_analyzer/config/generate/search_parameters.py +388 -0
  33. model_analyzer/config/input/__init__.py +15 -0
  34. model_analyzer/config/input/config_command.py +483 -0
  35. model_analyzer/config/input/config_command_profile.py +1747 -0
  36. model_analyzer/config/input/config_command_report.py +267 -0
  37. model_analyzer/config/input/config_defaults.py +236 -0
  38. model_analyzer/config/input/config_enum.py +83 -0
  39. model_analyzer/config/input/config_field.py +216 -0
  40. model_analyzer/config/input/config_list_generic.py +112 -0
  41. model_analyzer/config/input/config_list_numeric.py +151 -0
  42. model_analyzer/config/input/config_list_string.py +111 -0
  43. model_analyzer/config/input/config_none.py +71 -0
  44. model_analyzer/config/input/config_object.py +129 -0
  45. model_analyzer/config/input/config_primitive.py +81 -0
  46. model_analyzer/config/input/config_status.py +75 -0
  47. model_analyzer/config/input/config_sweep.py +83 -0
  48. model_analyzer/config/input/config_union.py +113 -0
  49. model_analyzer/config/input/config_utils.py +128 -0
  50. model_analyzer/config/input/config_value.py +243 -0
  51. model_analyzer/config/input/objects/__init__.py +15 -0
  52. model_analyzer/config/input/objects/config_model_profile_spec.py +325 -0
  53. model_analyzer/config/input/objects/config_model_report_spec.py +173 -0
  54. model_analyzer/config/input/objects/config_plot.py +198 -0
  55. model_analyzer/config/input/objects/config_protobuf_utils.py +101 -0
  56. model_analyzer/config/input/yaml_config_validator.py +82 -0
  57. model_analyzer/config/run/__init__.py +15 -0
  58. model_analyzer/config/run/model_run_config.py +313 -0
  59. model_analyzer/config/run/run_config.py +168 -0
  60. model_analyzer/constants.py +76 -0
  61. model_analyzer/device/__init__.py +15 -0
  62. model_analyzer/device/device.py +24 -0
  63. model_analyzer/device/gpu_device.py +87 -0
  64. model_analyzer/device/gpu_device_factory.py +248 -0
  65. model_analyzer/entrypoint.py +307 -0
  66. model_analyzer/log_formatter.py +65 -0
  67. model_analyzer/model_analyzer_exceptions.py +24 -0
  68. model_analyzer/model_manager.py +255 -0
  69. model_analyzer/monitor/__init__.py +15 -0
  70. model_analyzer/monitor/cpu_monitor.py +69 -0
  71. model_analyzer/monitor/dcgm/DcgmDiag.py +191 -0
  72. model_analyzer/monitor/dcgm/DcgmFieldGroup.py +83 -0
  73. model_analyzer/monitor/dcgm/DcgmGroup.py +815 -0
  74. model_analyzer/monitor/dcgm/DcgmHandle.py +141 -0
  75. model_analyzer/monitor/dcgm/DcgmJsonReader.py +69 -0
  76. model_analyzer/monitor/dcgm/DcgmReader.py +623 -0
  77. model_analyzer/monitor/dcgm/DcgmStatus.py +57 -0
  78. model_analyzer/monitor/dcgm/DcgmSystem.py +412 -0
  79. model_analyzer/monitor/dcgm/__init__.py +15 -0
  80. model_analyzer/monitor/dcgm/common/__init__.py +13 -0
  81. model_analyzer/monitor/dcgm/common/dcgm_client_cli_parser.py +194 -0
  82. model_analyzer/monitor/dcgm/common/dcgm_client_main.py +86 -0
  83. model_analyzer/monitor/dcgm/dcgm_agent.py +887 -0
  84. model_analyzer/monitor/dcgm/dcgm_collectd_plugin.py +369 -0
  85. model_analyzer/monitor/dcgm/dcgm_errors.py +395 -0
  86. model_analyzer/monitor/dcgm/dcgm_field_helpers.py +546 -0
  87. model_analyzer/monitor/dcgm/dcgm_fields.py +815 -0
  88. model_analyzer/monitor/dcgm/dcgm_fields_collectd.py +671 -0
  89. model_analyzer/monitor/dcgm/dcgm_fields_internal.py +29 -0
  90. model_analyzer/monitor/dcgm/dcgm_fluentd.py +45 -0
  91. model_analyzer/monitor/dcgm/dcgm_monitor.py +138 -0
  92. model_analyzer/monitor/dcgm/dcgm_prometheus.py +326 -0
  93. model_analyzer/monitor/dcgm/dcgm_structs.py +2357 -0
  94. model_analyzer/monitor/dcgm/dcgm_telegraf.py +65 -0
  95. model_analyzer/monitor/dcgm/dcgm_value.py +151 -0
  96. model_analyzer/monitor/dcgm/dcgmvalue.py +155 -0
  97. model_analyzer/monitor/dcgm/denylist_recommendations.py +573 -0
  98. model_analyzer/monitor/dcgm/pydcgm.py +47 -0
  99. model_analyzer/monitor/monitor.py +143 -0
  100. model_analyzer/monitor/remote_monitor.py +137 -0
  101. model_analyzer/output/__init__.py +15 -0
  102. model_analyzer/output/file_writer.py +63 -0
  103. model_analyzer/output/output_writer.py +42 -0
  104. model_analyzer/perf_analyzer/__init__.py +15 -0
  105. model_analyzer/perf_analyzer/genai_perf_config.py +206 -0
  106. model_analyzer/perf_analyzer/perf_analyzer.py +882 -0
  107. model_analyzer/perf_analyzer/perf_config.py +479 -0
  108. model_analyzer/plots/__init__.py +15 -0
  109. model_analyzer/plots/detailed_plot.py +266 -0
  110. model_analyzer/plots/plot_manager.py +224 -0
  111. model_analyzer/plots/simple_plot.py +213 -0
  112. model_analyzer/record/__init__.py +15 -0
  113. model_analyzer/record/gpu_record.py +68 -0
  114. model_analyzer/record/metrics_manager.py +887 -0
  115. model_analyzer/record/record.py +280 -0
  116. model_analyzer/record/record_aggregator.py +256 -0
  117. model_analyzer/record/types/__init__.py +15 -0
  118. model_analyzer/record/types/cpu_available_ram.py +93 -0
  119. model_analyzer/record/types/cpu_used_ram.py +93 -0
  120. model_analyzer/record/types/gpu_free_memory.py +96 -0
  121. model_analyzer/record/types/gpu_power_usage.py +107 -0
  122. model_analyzer/record/types/gpu_total_memory.py +96 -0
  123. model_analyzer/record/types/gpu_used_memory.py +96 -0
  124. model_analyzer/record/types/gpu_utilization.py +108 -0
  125. model_analyzer/record/types/inter_token_latency_avg.py +60 -0
  126. model_analyzer/record/types/inter_token_latency_base.py +74 -0
  127. model_analyzer/record/types/inter_token_latency_max.py +60 -0
  128. model_analyzer/record/types/inter_token_latency_min.py +60 -0
  129. model_analyzer/record/types/inter_token_latency_p25.py +60 -0
  130. model_analyzer/record/types/inter_token_latency_p50.py +60 -0
  131. model_analyzer/record/types/inter_token_latency_p75.py +60 -0
  132. model_analyzer/record/types/inter_token_latency_p90.py +60 -0
  133. model_analyzer/record/types/inter_token_latency_p95.py +60 -0
  134. model_analyzer/record/types/inter_token_latency_p99.py +60 -0
  135. model_analyzer/record/types/output_token_throughput.py +105 -0
  136. model_analyzer/record/types/perf_client_response_wait.py +97 -0
  137. model_analyzer/record/types/perf_client_send_recv.py +97 -0
  138. model_analyzer/record/types/perf_latency.py +111 -0
  139. model_analyzer/record/types/perf_latency_avg.py +60 -0
  140. model_analyzer/record/types/perf_latency_base.py +74 -0
  141. model_analyzer/record/types/perf_latency_p90.py +60 -0
  142. model_analyzer/record/types/perf_latency_p95.py +60 -0
  143. model_analyzer/record/types/perf_latency_p99.py +60 -0
  144. model_analyzer/record/types/perf_server_compute_infer.py +97 -0
  145. model_analyzer/record/types/perf_server_compute_input.py +97 -0
  146. model_analyzer/record/types/perf_server_compute_output.py +97 -0
  147. model_analyzer/record/types/perf_server_queue.py +97 -0
  148. model_analyzer/record/types/perf_throughput.py +105 -0
  149. model_analyzer/record/types/time_to_first_token_avg.py +60 -0
  150. model_analyzer/record/types/time_to_first_token_base.py +74 -0
  151. model_analyzer/record/types/time_to_first_token_max.py +60 -0
  152. model_analyzer/record/types/time_to_first_token_min.py +60 -0
  153. model_analyzer/record/types/time_to_first_token_p25.py +60 -0
  154. model_analyzer/record/types/time_to_first_token_p50.py +60 -0
  155. model_analyzer/record/types/time_to_first_token_p75.py +60 -0
  156. model_analyzer/record/types/time_to_first_token_p90.py +60 -0
  157. model_analyzer/record/types/time_to_first_token_p95.py +60 -0
  158. model_analyzer/record/types/time_to_first_token_p99.py +60 -0
  159. model_analyzer/reports/__init__.py +15 -0
  160. model_analyzer/reports/html_report.py +195 -0
  161. model_analyzer/reports/pdf_report.py +50 -0
  162. model_analyzer/reports/report.py +86 -0
  163. model_analyzer/reports/report_factory.py +62 -0
  164. model_analyzer/reports/report_manager.py +1376 -0
  165. model_analyzer/reports/report_utils.py +42 -0
  166. model_analyzer/result/__init__.py +15 -0
  167. model_analyzer/result/constraint_manager.py +150 -0
  168. model_analyzer/result/model_config_measurement.py +354 -0
  169. model_analyzer/result/model_constraints.py +105 -0
  170. model_analyzer/result/parameter_search.py +246 -0
  171. model_analyzer/result/result_manager.py +430 -0
  172. model_analyzer/result/result_statistics.py +159 -0
  173. model_analyzer/result/result_table.py +217 -0
  174. model_analyzer/result/result_table_manager.py +646 -0
  175. model_analyzer/result/result_utils.py +42 -0
  176. model_analyzer/result/results.py +277 -0
  177. model_analyzer/result/run_config_measurement.py +658 -0
  178. model_analyzer/result/run_config_result.py +210 -0
  179. model_analyzer/result/run_config_result_comparator.py +110 -0
  180. model_analyzer/result/sorted_results.py +151 -0
  181. model_analyzer/state/__init__.py +15 -0
  182. model_analyzer/state/analyzer_state.py +76 -0
  183. model_analyzer/state/analyzer_state_manager.py +215 -0
  184. model_analyzer/triton/__init__.py +15 -0
  185. model_analyzer/triton/client/__init__.py +15 -0
  186. model_analyzer/triton/client/client.py +234 -0
  187. model_analyzer/triton/client/client_factory.py +57 -0
  188. model_analyzer/triton/client/grpc_client.py +104 -0
  189. model_analyzer/triton/client/http_client.py +107 -0
  190. model_analyzer/triton/model/__init__.py +15 -0
  191. model_analyzer/triton/model/model_config.py +556 -0
  192. model_analyzer/triton/model/model_config_variant.py +29 -0
  193. model_analyzer/triton/server/__init__.py +15 -0
  194. model_analyzer/triton/server/server.py +76 -0
  195. model_analyzer/triton/server/server_config.py +269 -0
  196. model_analyzer/triton/server/server_docker.py +229 -0
  197. model_analyzer/triton/server/server_factory.py +306 -0
  198. model_analyzer/triton/server/server_local.py +158 -0
  199. triton_model_analyzer-1.48.0.dist-info/METADATA +52 -0
  200. triton_model_analyzer-1.48.0.dist-info/RECORD +204 -0
  201. triton_model_analyzer-1.48.0.dist-info/WHEEL +5 -0
  202. triton_model_analyzer-1.48.0.dist-info/entry_points.txt +2 -0
  203. triton_model_analyzer-1.48.0.dist-info/licenses/LICENSE +67 -0
  204. triton_model_analyzer-1.48.0.dist-info/top_level.txt +1 -0
@@ -0,0 +1,2357 @@
1
+ # Copyright (c) 2022, NVIDIA CORPORATION & AFFILIATES. All rights reserved.
2
+ #
3
+ # Licensed under the Apache License, Version 2.0 (the "License");
4
+ # you may not use this file except in compliance with the License.
5
+ # You may obtain a copy of the License at
6
+ #
7
+ # http://www.apache.org/licenses/LICENSE-2.0
8
+ #
9
+ # Unless required by applicable law or agreed to in writing, software
10
+ # distributed under the License is distributed on an "AS IS" BASIS,
11
+ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12
+ # See the License for the specific language governing permissions and
13
+ # limitations under the License.
14
+ ##
15
+ # Python bindings for "dcgm_structs.h"
16
+ ##
17
+
18
+ from ctypes import *
19
+ from ctypes.util import find_library
20
+ import sys
21
+ import os
22
+ import threading
23
+ import string
24
+ import json
25
+ import model_analyzer.monitor.dcgm.dcgmvalue as dcgmvalue
26
+ import platform
27
+ from inspect import isclass
28
+ from typing import Dict, List
29
+
30
# Size limits and buffer lengths.
DCGM_MAX_STR_LENGTH = 256
# DCGM 2.0 and newer = 32. DCGM 1.8 and older = 16
DCGM_MAX_NUM_DEVICES = 32
DCGM_MAX_NUM_SWITCHES = 12
DCGM_NVLINK_MAX_LINKS_PER_GPU = 18
DCGM_NVLINK_MAX_LINKS_PER_GPU_LEGACY1 = 6
DCGM_NVLINK_MAX_LINKS_PER_GPU_LEGACY2 = 12
# Max NvLinks per NvSwitch pre-Hopper
DCGM_NVLINK_MAX_LINKS_PER_NVSWITCH_V1 = 36
DCGM_NVLINK_MAX_LINKS_PER_NVSWITCH = 64
DCGM_LANE_MAX_LANES_PER_NVSWICH_LINK = 4
DCGM_MAX_CLOCKS = 256
DCGM_MAX_NUM_GROUPS = 64
DCGM_MAX_BLOB_LENGTH = 4096
DCGM_MAX_VGPU_INSTANCES_PER_PGPU = 32
DCGM_VGPU_NAME_BUFFER_SIZE = 64
DCGM_GRID_LICENSE_BUFFER_SIZE = 128
DCGM_MAX_VGPU_TYPES_PER_PGPU = 32
DCGM_DEVICE_UUID_BUFFER_SIZE = 80
DCGM_MAX_FBC_SESSIONS = 256

# When more than one value is returned from a query, which order should it be returned in?
DCGM_ORDER_ASCENDING, DCGM_ORDER_DESCENDING = 1, 2

# Operation modes.
DCGM_OPERATION_MODE_AUTO, DCGM_OPERATION_MODE_MANUAL = 1, 2

# Encoder query types.
DCGM_ENCODER_QUERY_H264, DCGM_ENCODER_QUERY_HEVC = 0, 1

# Frame buffer capture (FBC) session types, numbered consecutively from 0.
(
    DCGM_FBC_SESSION_TYPE_UNKNOWN,  # Unknown
    DCGM_FBC_SESSION_TYPE_TOSYS,  # FB capture for a system buffer
    DCGM_FBC_SESSION_TYPE_CUDA,  # FB capture for a cuda buffer
    DCGM_FBC_SESSION_TYPE_VID,  # FB capture for a Vid buffer
    DCGM_FBC_SESSION_TYPE_HWENC,  # FB capture for a NVENC HW buffer
) = range(5)
64
+
65
## C Type mappings ##
## Enums

# Return types
_dcgmReturn_t = c_uint

# Success
DCGM_ST_OK = 0
# A bad parameter was passed to a function
DCGM_ST_BADPARAM = -1
# A generic, unspecified error
DCGM_ST_GENERIC_ERROR = -3
# An out of memory error occurred
DCGM_ST_MEMORY = -4
# Setting not configured
DCGM_ST_NOT_CONFIGURED = -5
# Feature not supported
DCGM_ST_NOT_SUPPORTED = -6
# DCGM Init error
DCGM_ST_INIT_ERROR = -7
# When NVML returns error.
DCGM_ST_NVML_ERROR = -8
# Object is in pending state of something else
DCGM_ST_PENDING = -9
# Object is in undefined state
DCGM_ST_UNINITIALIZED = -10
# Requested operation timed out
DCGM_ST_TIMEOUT = -11
# Version mismatch between received and understood API
DCGM_ST_VER_MISMATCH = -12
# Unknown field id
DCGM_ST_UNKNOWN_FIELD = -13
# No data is available
DCGM_ST_NO_DATA = -14
# Data is considered stale
DCGM_ST_STALE_DATA = -15
# The given field is not being updated by the cache manager
DCGM_ST_NOT_WATCHED = -16
# We are not permissioned to perform the desired action
DCGM_ST_NO_PERMISSION = -17
# GPU is no longer reachable
DCGM_ST_GPU_IS_LOST = -18
# GPU requires a reset
DCGM_ST_RESET_REQUIRED = -19
# Unable to find function
DCGM_ST_FUNCTION_NOT_FOUND = -20
# Connection to the host engine is not valid any longer
DCGM_ST_CONNECTION_NOT_VALID = -21
# This GPU is not supported by DCGM
DCGM_ST_GPU_NOT_SUPPORTED = -22
# The GPUs of the provided group are not compatible with each other for the requested operation
DCGM_ST_GROUP_INCOMPATIBLE = -23
# Max limit reached for the object
DCGM_ST_MAX_LIMIT = -24
# DCGM library could not be found
DCGM_ST_LIBRARY_NOT_FOUND = -25
# Duplicate key passed to the function
DCGM_ST_DUPLICATE_KEY = -26
# GPU is already a part of a sync boost group
DCGM_ST_GPU_IN_SYNC_BOOST_GROUP = -27
# GPU is not a part of a sync boost group
DCGM_ST_GPU_NOT_IN_SYNC_BOOST_GROUP = -28
# This operation cannot be performed when the host engine is running as non-root
DCGM_ST_REQUIRES_ROOT = -29
# DCGM GPU Diagnostic was successfully executed, but reported an error.
DCGM_ST_NVVS_ERROR = -30
# An input argument is not large enough
DCGM_ST_INSUFFICIENT_SIZE = -31
# The given field ID is not supported by the API being called
DCGM_ST_FIELD_UNSUPPORTED_BY_API = -32
# This request is serviced by a module of DCGM that is not currently loaded
DCGM_ST_MODULE_NOT_LOADED = -33
# The requested operation could not be completed because the affected resource is in use
DCGM_ST_IN_USE = -34
# The specified group is empty and this operation is not valid with an empty group
DCGM_ST_GROUP_IS_EMPTY = -35
# Profiling is not supported for this group of GPUs or GPU
DCGM_ST_PROFILING_NOT_SUPPORTED = -36
# The third-party Profiling module returned an unrecoverable error
DCGM_ST_PROFILING_LIBRARY_ERROR = -37
# The requested profiling metrics cannot be collected in a single pass
DCGM_ST_PROFILING_MULTI_PASS = -38
# A diag instance is already running, cannot run a new diag until the current one finishes.
DCGM_ST_DIAG_ALREADY_RUNNING = -39
# The DCGM GPU Diagnostic returned JSON that cannot be parsed
DCGM_ST_DIAG_BAD_JSON = -40
# Error while launching the DCGM GPU Diagnostic
DCGM_ST_DIAG_BAD_LAUNCH = -41
# Unused
DCGM_ST_DIAG_UNUSED = -42
# A field value met or exceeded the error threshold.
DCGM_ST_DIAG_THRESHOLD_EXCEEDED = -43
# The installed driver version is insufficient for this API
DCGM_ST_INSUFFICIENT_DRIVER_VERSION = -44
# The specified GPU instance does not exist
DCGM_ST_INSTANCE_NOT_FOUND = -45
# The specified GPU compute instance does not exist
DCGM_ST_COMPUTE_INSTANCE_NOT_FOUND = -46
# Couldn't kill a child process within the retries
DCGM_ST_CHILD_NOT_KILLED = -47
# Detected an error in a 3rd-party library
DCGM_ST_3RD_PARTY_LIBRARY_ERROR = -48
# Not enough resources available
DCGM_ST_INSUFFICIENT_RESOURCES = -49
# Exception thrown from a diagnostic plugin
DCGM_ST_PLUGIN_EXCEPTION = -50
# The diagnostic returned an error that indicates the need for isolation
DCGM_ST_NVVS_ISOLATE_ERROR = -51
# The NVVS binary was not found in the specified location
DCGM_ST_NVVS_BINARY_NOT_FOUND = -52
# The NVVS process was killed by a signal
DCGM_ST_NVVS_KILLED = -53
# The hostengine and all modules are paused
DCGM_ST_PAUSED = -54
160
+
161
# Group types, numbered consecutively from 0.
(
    DCGM_GROUP_DEFAULT,  # All the GPUs on the node are added to the group
    DCGM_GROUP_EMPTY,  # Creates an empty group
    DCGM_GROUP_DEFAULT_NVSWITCHES,  # All NvSwitches of the node are added to the group
    DCGM_GROUP_DEFAULT_INSTANCES,  # All GPU instances of the node are added to the group
    DCGM_GROUP_DEFAULT_COMPUTE_INSTANCES,  # All compute instances of the node are added to the group
    DCGM_GROUP_DEFAULT_ENTITIES,  # All entities are added to this default group
) = range(6)

# Reserved "all ..." group ids, counting down from 0x7FFFFFFF.
DCGM_GROUP_ALL_GPUS = 0x7FFFFFFF
DCGM_GROUP_ALL_NVSWITCHES = 0x7FFFFFFE
DCGM_GROUP_ALL_INSTANCES = 0x7FFFFFFD
DCGM_GROUP_ALL_COMPUTE_INSTANCES = 0x7FFFFFFC
DCGM_GROUP_ALL_ENTITIES = 0x7FFFFFFB

# Maximum number of entities per entity group
DCGM_GROUP_MAX_ENTITIES = 64

# The target configuration values to be applied
DCGM_CONFIG_TARGET_STATE = 0
# The current configuration state
DCGM_CONFIG_CURRENT_STATE = 1

# Represents the power cap to be applied for each member of the group
DCGM_CONFIG_POWER_CAP_INDIVIDUAL = 0
# Represents the power budget for the entire group
DCGM_CONFIG_POWER_BUDGET_GROUP = 1

# Default compute mode -- multiple contexts per device
DCGM_CONFIG_COMPUTEMODE_DEFAULT = 0
# Compute-prohibited mode -- no contexts per device
DCGM_CONFIG_COMPUTEMODE_PROHIBITED = 1
# Compute-exclusive-process mode -- only one context per device, usable from multiple threads at a time
DCGM_CONFIG_COMPUTEMODE_EXCLUSIVE_PROCESS = 2

# Topology values: one bit each.
DCGM_TOPOLOGY_BOARD = 1 << 0
DCGM_TOPOLOGY_SINGLE = 1 << 1
DCGM_TOPOLOGY_MULTIPLE = 1 << 2
DCGM_TOPOLOGY_HOSTBRIDGE = 1 << 3
DCGM_TOPOLOGY_CPU = 1 << 4
DCGM_TOPOLOGY_SYSTEM = 1 << 5
# NVLINK<n> occupies bit 7 + n (bits 6 and 7 are not assigned here).
DCGM_TOPOLOGY_NVLINK1 = 1 << 8
DCGM_TOPOLOGY_NVLINK2 = 1 << 9
DCGM_TOPOLOGY_NVLINK3 = 1 << 10
DCGM_TOPOLOGY_NVLINK4 = 1 << 11
DCGM_TOPOLOGY_NVLINK5 = 1 << 12
DCGM_TOPOLOGY_NVLINK6 = 1 << 13
DCGM_TOPOLOGY_NVLINK7 = 1 << 14
DCGM_TOPOLOGY_NVLINK8 = 1 << 15
DCGM_TOPOLOGY_NVLINK9 = 1 << 16
DCGM_TOPOLOGY_NVLINK10 = 1 << 17
DCGM_TOPOLOGY_NVLINK11 = 1 << 18
DCGM_TOPOLOGY_NVLINK12 = 1 << 19

# Diagnostic per gpu tests - fixed indices for dcgmDiagResponsePerGpu_t.results[]
(
    DCGM_MEMORY_INDEX,
    DCGM_DIAGNOSTIC_INDEX,
    DCGM_PCI_INDEX,
    DCGM_SM_STRESS_INDEX,
    DCGM_TARGETED_STRESS_INDEX,
    DCGM_TARGETED_POWER_INDEX,
    DCGM_MEMORY_BANDWIDTH_INDEX,
    DCGM_MEMTEST_INDEX,
    DCGM_PULSE_TEST_INDEX,
    DCGM_EUD_TEST_INDEX,
    DCGM_UNUSED2_TEST_INDEX,
    DCGM_UNUSED3_TEST_INDEX,
    DCGM_UNUSED4_TEST_INDEX,
    DCGM_UNUSED5_TEST_INDEX,
) = range(14)
DCGM_PER_GPU_TEST_COUNT_V7 = 9
DCGM_PER_GPU_TEST_COUNT_V8 = 13

# DCGM Diag Level One test indices
(
    DCGM_SWTEST_DENYLIST,
    DCGM_SWTEST_NVML_LIBRARY,
    DCGM_SWTEST_CUDA_MAIN_LIBRARY,
    DCGM_SWTEST_CUDA_RUNTIME_LIBRARY,
    DCGM_SWTEST_PERMISSIONS,
    DCGM_SWTEST_PERSISTENCE_MODE,
    DCGM_SWTEST_ENVIRONMENT,
    DCGM_SWTEST_PAGE_RETIREMENT,
    DCGM_SWTEST_GRAPHICS_PROCESSES,
    DCGM_SWTEST_INFOROM,
) = range(10)

# This test is only run by itself, so it can use the 0 slot
DCGM_CONTEXT_CREATE_INDEX = 0
246
+
247
class DCGM_INTROSPECT_STATE:
    """Named values for the introspection state: DISABLED (0) and ENABLED (1)."""

    DISABLED, ENABLED = 0, 1
250
+
251
+
252
# Lib loading state shared by the whole module.
# Incremented on each dcgmInit and decremented on dcgmShutdown
_dcgmLib_refcount = 0
# NOTE(review): presumably serializes loading of the library -- confirm against the loader code.
libLoadLock = threading.Lock()
# Starts out as None; NOTE(review): presumably replaced by the loaded library handle -- confirm.
dcgmLib = None
256
+
257
+
258
class DCGMError(Exception):
    """Exception that carries a DCGM return code (a ``DCGM_ST_*`` value).

    Constructing ``DCGMError(value)`` directly returns an instance of the
    subclass registered for ``value`` in ``_valClassMapping`` when there is
    one, so callers can catch a specific error class.

    Attributes set on every instance:
        value: the return code passed to the constructor.
        info: optional extra text appended by ``__str__``; None by default.
    """

    # Maps a return code to the DCGMError subclass registered for it.
    _valClassMapping: Dict = dict()
    # List of currently known error codes
    _error_code_to_string = {
        DCGM_ST_OK: "Success",
        DCGM_ST_BADPARAM: "Bad parameter passed to function",
        DCGM_ST_GENERIC_ERROR: "Generic unspecified error",
        DCGM_ST_MEMORY: "Out of memory error",
        DCGM_ST_NOT_CONFIGURED: "Setting not configured",
        DCGM_ST_NOT_SUPPORTED: "Feature not supported",
        DCGM_ST_INIT_ERROR: "DCGM initialization error",
        DCGM_ST_NVML_ERROR: "NVML error",
        DCGM_ST_PENDING: "Object is in a pending state",
        DCGM_ST_UNINITIALIZED: "Object is in an undefined state",
        DCGM_ST_TIMEOUT: "Timeout",
        DCGM_ST_VER_MISMATCH: "API version mismatch",
        DCGM_ST_UNKNOWN_FIELD: "Unknown field",
        DCGM_ST_NO_DATA: "No data is available",
        DCGM_ST_STALE_DATA: "Data is considered stale",
        DCGM_ST_NOT_WATCHED: "Field is not being updated",
        DCGM_ST_NO_PERMISSION: "Not permissioned",
        DCGM_ST_GPU_IS_LOST: "GPU is unreachable",
        DCGM_ST_RESET_REQUIRED: "GPU requires a reset",
        DCGM_ST_FUNCTION_NOT_FOUND: "Unable to find function",
        DCGM_ST_CONNECTION_NOT_VALID: "The connection to the host engine is not valid any longer",
        DCGM_ST_GPU_NOT_SUPPORTED: "This GPU is not supported by DCGM",
        DCGM_ST_GROUP_INCOMPATIBLE: "GPUs are incompatible with each other for the requested operation",
        DCGM_ST_MAX_LIMIT: "Max limit reached for the object",
        DCGM_ST_LIBRARY_NOT_FOUND: "DCGM library could not be found",
        DCGM_ST_DUPLICATE_KEY: "Duplicate key passed to function",
        DCGM_ST_GPU_IN_SYNC_BOOST_GROUP: "GPU is already a part of a sync boost group",
        DCGM_ST_GPU_NOT_IN_SYNC_BOOST_GROUP: "GPU is not a part of the sync boost group",
        DCGM_ST_REQUIRES_ROOT: "This operation is not supported when the host engine is running as non root",
        DCGM_ST_NVVS_ERROR: "DCGM GPU Diagnostic returned an error.",
        DCGM_ST_INSUFFICIENT_SIZE: "An input argument is not large enough",
        DCGM_ST_FIELD_UNSUPPORTED_BY_API: "The given field ID is not supported by the API being called",
        DCGM_ST_MODULE_NOT_LOADED: "This request is serviced by a module of DCGM that is not currently loaded",
        DCGM_ST_IN_USE: "The requested operation could not be completed because the affected resource is in use",
        DCGM_ST_GROUP_IS_EMPTY: "The specified group is empty, and this operation is incompatible with an empty group",
        DCGM_ST_PROFILING_NOT_SUPPORTED: "Profiling is not supported for this group of GPUs or GPU",
        DCGM_ST_PROFILING_LIBRARY_ERROR: "The third-party Profiling module returned an unrecoverable error",
        DCGM_ST_PROFILING_MULTI_PASS: "The requested profiling metrics cannot be collected in a single pass",
        DCGM_ST_DIAG_ALREADY_RUNNING: "A diag instance is already running, cannot run a new diag until the current one finishes",
        DCGM_ST_DIAG_BAD_JSON: "The GPU Diagnostic returned Json that cannot be parsed.",
        DCGM_ST_DIAG_BAD_LAUNCH: "Error while launching the GPU Diagnostic.",
        DCGM_ST_DIAG_UNUSED: "Unused error code",
        DCGM_ST_DIAG_THRESHOLD_EXCEEDED: "A field value met or exceeded the error threshold.",
        DCGM_ST_INSUFFICIENT_DRIVER_VERSION: "The installed driver version is insufficient for this API",
        DCGM_ST_INSTANCE_NOT_FOUND: "The specified GPU instance does not exist",
        DCGM_ST_COMPUTE_INSTANCE_NOT_FOUND: "The specified GPU compute instance does not exist",
        DCGM_ST_CHILD_NOT_KILLED: "Couldn't kill a child process within the retries",
        DCGM_ST_3RD_PARTY_LIBRARY_ERROR: "Detected an error in a 3rd-party library",
        DCGM_ST_INSUFFICIENT_RESOURCES: "Not enough resources available",
        DCGM_ST_PLUGIN_EXCEPTION: "Exception thrown from a diagnostic plugin",
        DCGM_ST_NVVS_ISOLATE_ERROR: "The diagnostic returned an error that indicates the need for isolation",
    }

    def __new__(typ, value):
        """
        Maps value to a proper subclass of DCGMError.

        The remapping only happens when DCGMError itself is requested; an
        explicitly requested subclass is kept as is.
        """
        if typ == DCGMError:
            typ = DCGMError._valClassMapping.get(value, typ)
        obj = Exception.__new__(typ)
        obj.info = None
        obj.value = value
        return obj

    def __str__(self):
        """Return the message for ``self.value``, followed by ``self.info`` when it is set.

        Codes missing from ``_error_code_to_string`` are resolved through
        ``_dcgmErrorString`` and cached in the table. If that lookup fails, a
        generic message containing the numeric code is used instead.
        """
        try:
            if self.value not in DCGMError._error_code_to_string:
                DCGMError._error_code_to_string[self.value] = str(
                    _dcgmErrorString(self.value)
                )
            msg = DCGMError._error_code_to_string[self.value]
        # Catch every ordinary failure so the error code is not hidden in a
        # traceback, but let KeyboardInterrupt/SystemExit propagate.
        except Exception:
            msg = "DCGM Error with code %d" % self.value

        if self.info is not None:
            # endswith() is safe on an empty message, unlike msg[-1].
            if msg.endswith("."):
                msg = msg[:-1]
            msg += ": '%s'" % self.info
        return msg

    def __eq__(self, other):
        """Two errors are equal when they carry the same return code."""
        try:
            other_value = other.value
        except AttributeError:
            # Not something that carries a return code (None, an int, ...):
            # let Python fall back to its default comparison instead of raising.
            return NotImplemented
        return self.value == other_value

    def __hash__(self):
        """Hash on the return code, consistent with ``__eq__``."""
        return hash(self.value)

    def SetAdditionalInfo(self, msg):
        """
        Sets msg as additional information returned by the string representation of DCGMError and subclasses.
        Example output for DCGMError_Uninitialized subclass, with msg set to 'more info msg here' is
        "DCGMError_Uninitialized: Object is in an undefined state: 'more info msg here'".

        Ensure that msg is a string or an object for which the __str__() method does not throw an error
        """
        self.info = msg
361
+
362
+
363
def dcgmExceptionClass(error_code):
    """Return the DCGMError subclass registered for error_code, or None when none is registered."""
    registry = DCGMError._valClassMapping
    return registry.get(error_code)
365
+
366
+
367
def _extractDCGMErrorsAsClasses():
    """
    Generates a hierarchy of classes on top of the DCGMError class.

    Every module attribute whose name starts with DCGM_ST_ gets its own
    DCGMError subclass, so try/except blocks can catch one specific error.
    e.g. DCGM_ST_UNINITIALIZED will be turned into DCGMError_Uninitialized

    Each subclass is published as an attribute of this module and registered
    in DCGMError._valClassMapping under its return code.
    """
    module = sys.modules[__name__]

    def make_new(code):
        # Factory so each generated __new__ keeps its own return code.
        def new(typ):
            # pylint: disable=E1121
            return DCGMError.__new__(typ, code)

        return new

    for const_name in dir(module):
        if not const_name.startswith("DCGM_ST_"):
            continue
        # e.g. Turn DCGM_ST_UNINITIALIZED into DCGMError_Uninitialized
        suffix = const_name.replace("DCGM_ST_", "")
        subclass_name = "DCGMError_" + string.capwords(suffix, "_").replace("_", "")
        code = getattr(module, const_name)

        subclass = type(subclass_name, (DCGMError,), {"__new__": make_new(code)})
        subclass.__module__ = __name__
        setattr(module, subclass_name, subclass)
        DCGMError._valClassMapping[code] = subclass


_extractDCGMErrorsAsClasses()
402
+
403
+
404
class struct_c_dcgmUnit_t(Structure):
    """Unit structure used as an opaque handle; it declares no fields."""


# Pointer type to the opaque unit structure.
_dcgmUnit_t = POINTER(struct_c_dcgmUnit_t)
410
+
411
+
412
class _WrappedStructure:
    """Proxy around another object that converts between ``bytes`` and ``str``.

    Reads (attribute or item) are forwarded to the wrapped object; ``bytes``
    results are decoded as UTF-8 and class objects are wrapped in another
    ``_WrappedStructure``. Attribute writes are forwarded as well, after the
    value is coerced to ``int`` for ``c_uint``/``c_int`` fields or encoded to
    UTF-8 ``bytes`` when it is a ``str``.
    """

    def __init__(self, obj):
        # Store through __dict__: __setattr__ forwards to the wrapped object
        # and refuses to touch "_obj".
        self.__dict__["_obj"] = obj

    def __getattr__(self, key):
        """Forward a missing attribute lookup to the wrapped object."""
        if key == "_obj":
            # Only reached when __init__ never ran (e.g. an instance created
            # with __new__ alone); without this guard the self._obj lookup
            # below would recurse without bound.
            raise AttributeError(key)
        value = getattr(self._obj, key)
        if isinstance(value, bytes):
            return value.decode("utf-8")
        if isclass(value):
            return _WrappedStructure(value)
        return value

    def __getitem__(self, key):
        """Forward item access to the wrapped object, with the same conversions as attribute reads."""
        value = self._obj[key]
        if isinstance(value, bytes):
            return value.decode("utf-8")
        if isclass(value):
            return _WrappedStructure(value)
        return value

    def __setattr__(self, key, raw_value):
        """Coerce raw_value for the target field and set it on the wrapped object.

        Raises:
            RuntimeError: when key is "_obj"; the wrapped object cannot be replaced.
        """
        if key == "_obj":
            raise RuntimeError("Cannot set _obj")

        # Declared ctypes type of the field, or None when the wrapped object
        # has no _fields_ or no field with this name.
        fields = getattr(self._obj, "_fields_", ())
        fieldtype = next((f[1] for f in fields if f[0] == key), None)

        value = raw_value
        if fieldtype == c_uint and not isinstance(value, c_uint32):
            value = int(value)
        elif fieldtype == c_int and not isinstance(value, c_int32):
            value = int(value)
        elif isinstance(raw_value, str):
            value = raw_value.encode("utf-8")

        # Fields are attributes of the wrapped object; item assignment by
        # field name is not supported by ctypes structures.
        setattr(self._obj, key, value)
        return value
457
+
458
+
459
class _DcgmStructure(Structure):
    """ctypes ``Structure`` base that converts values on attribute access.

    Reads decode ``bytes`` to ``str`` and wrap class-valued attributes in
    ``_WrappedStructure``; writes coerce values for ``c_uint``/``c_int`` fields
    to ``int`` and encode ``str`` values as UTF-8 ``bytes``.
    """

    def __getattribute__(self, key):
        """Return attribute *key* with the read-side conversion applied."""
        attr = super().__getattribute__(key)
        if isinstance(attr, bytes):
            return attr.decode("utf-8")
        return _WrappedStructure(attr) if isclass(attr) else attr

    def __setattr__(self, key, raw_value):
        """Store *raw_value* on field *key* after the write-side conversion."""
        # Declared ctypes type of the field, or None when *key* is not a field.
        declared = next(
            (entry[1] for entry in self._fields_ if entry[0] == key), None
        )

        wants_int = (
            declared == c_uint and not isinstance(raw_value, c_uint32)
        ) or (declared == c_int and not isinstance(raw_value, c_int32))

        if wants_int:
            converted = int(raw_value)
        elif isinstance(raw_value, str):
            converted = raw_value.encode("utf-8")
        else:
            converted = raw_value

        return super().__setattr__(key, converted)
489
+
490
+
491
class DcgmUnion(Union):
    """ctypes ``Union`` base with the same value conversions as the structures.

    Reads decode ``bytes`` to ``str`` and wrap class-valued attributes in
    ``_WrappedStructure``; writes coerce values for ``c_uint``/``c_int`` fields
    to ``int`` and encode ``str`` values as UTF-8 ``bytes``.
    """

    def __getattribute__(self, key):
        """Return attribute *key* with the read-side conversion applied."""
        fetched = super().__getattribute__(key)
        if isinstance(fetched, bytes):
            fetched = fetched.decode("utf-8")
        elif isclass(fetched):
            fetched = _WrappedStructure(fetched)
        return fetched

    def __setattr__(self, key, raw_value):
        """Store *raw_value* on member *key* after the write-side conversion."""
        # First declared type whose member name matches, else None.
        member_type = None
        for member in self._fields_:
            if member[0] == key:
                member_type = member[1]
                break

        to_store = raw_value
        if member_type == c_uint and not isinstance(raw_value, c_uint32):
            to_store = int(raw_value)
        elif member_type == c_int and not isinstance(raw_value, c_int32):
            to_store = int(raw_value)
        elif isinstance(raw_value, str):
            to_store = raw_value.encode("utf-8")

        return super().__setattr__(key, to_store)
521
+
522
+
523
class _PrintableStructure(_DcgmStructure):
    """
    Base class that gives structures a readable ``str()`` representation.

    ``str(obj)`` produces
        class_name(field_name: formatted_value, field_name: formatted_value)
    instead of the default ``<class_name object at 0x...>``.

    ``_fmt_`` is a dictionary of <str field name> -> <str %-format>. For
    example, a class with a c_uint field 'hex_value' can set
        _fmt_ = {"hex_value": "%08X"}
    The key "<default>" sets the format for every field that has no entry of
    its own, e.g.
        _fmt_ = {"<default>": "%d MHz"}  # all values are numbers in MHz
    When neither applies, "%s" is used.

    The exact format of the returned string is subject to change in the future.
    """

    _fmt_: Dict = {}

    def __str__(self):
        """Return ``ClassName(field: value, ...)`` using the ``_fmt_`` formats."""
        formats = self._fmt_
        fallback = formats.get("<default>", "%s")
        rendered = [
            ("%s: " + formats.get(field[0], fallback))
            % (field[0], getattr(self, field[0]))
            for field in self._fields_
        ]
        return self.__class__.__name__ + "(" + ", ".join(rendered) + ")"

    def FieldsSizeof(self):
        """Return the sum of ``sizeof`` over the declared field types."""
        return sum(sizeof(ctype) for _name, ctype in self._fields_)
563
+
564
+
565
+ # JSON serializer for DCGM structures
566
class DcgmJSONEncoder(json.JSONEncoder):
    """JSON encoder for DCGM structures and ctypes arrays.

    ``_PrintableStructure`` instances become dicts keyed by field name.
    ctypes arrays become lists: elements that declare ``_fields_`` are turned
    into dicts, any other element (for example the ints of a ``c_uint`` array)
    is emitted as-is instead of raising AttributeError.
    """

    def default(self, o):  # pylint: disable=method-hidden
        """Return a JSON-serializable form of *o*, deferring to the base class
        (which raises TypeError) for unsupported types."""
        if isinstance(o, _PrintableStructure):
            return self._fields_to_dict(o)

        if isinstance(o, Array):
            return [
                self._fields_to_dict(item) if hasattr(item, "_fields_") else item
                for item in o
            ]

        # Let the parent class handle this/fail
        return json.JSONEncoder.default(self, o)

    def _fields_to_dict(self, struct):
        """Map each declared field name of *struct* to its (converted) value."""
        converted = {}
        for field in struct._fields_:
            field_name = field[0]
            sub_obj = getattr(struct, field_name)
            if isinstance(sub_obj, _PrintableStructure):
                sub_obj = self.default(sub_obj)
            converted[field_name] = sub_obj
        return converted
595
+
596
+
597
+ # Creates a unique version number for each struct
598
def make_dcgm_version(struct, ver):
    """Combine ``sizeof(struct)`` with *ver* shifted left by 24 bits.

    :param struct: ctypes type (or instance) accepted by ``sizeof``
    :param ver: integer version number
    :return: ``sizeof(struct) | (ver << 24)``
    """
    struct_size = sizeof(struct)
    version_bits = ver << 24
    return struct_size | version_bits
600
+
601
+
602
+ # Function access ##
603
# function pointers are cached to prevent unnecessary libLoadLock locking
_dcgmGetFunctionPointer_cache: Dict = {}


def _dcgmGetFunctionPointer(name):
    """Return the attribute *name* of the loaded library, caching the result.

    :raises DCGMError: DCGM_ST_UNINITIALIZED when ``dcgmLib`` is None, or
        DCGM_ST_FUNCTION_NOT_FOUND when the library has no attribute *name*.
    """
    global dcgmLib

    # Cache hits return without touching the lock.
    try:
        return _dcgmGetFunctionPointer_cache[name]
    except KeyError:
        pass

    # The lock is released on every exit path of the ``with`` block.
    with libLoadLock:
        # ensure library was loaded
        if dcgmLib is None:
            raise DCGMError(DCGM_ST_UNINITIALIZED)
        try:
            function_pointer = getattr(dcgmLib, name)
        except AttributeError:
            raise DCGMError(DCGM_ST_FUNCTION_NOT_FOUND)
        _dcgmGetFunctionPointer_cache[name] = function_pointer
        return function_pointer
627
+
628
+
629
+ # C function wrappers ##
630
def _LoadDcgmLibrary(libDcgmPath=None):
    """
    Load the library if it isn't loaded already

    :param libDcgmPath: Optional path to the libdcgm*.so libraries. Will use system defaults if not specified.
    :type libDcgmPath: str
    :return: None
    :raises DCGMError: DCGM_ST_LIBRARY_NOT_FOUND when the library cannot be loaded
    """
    global dcgmLib

    if dcgmLib is not None:
        return

    # lock to ensure only one caller loads the library; released on every exit
    with libLoadLock:
        # ensure the library still isn't loaded
        if dcgmLib is not None:
            return

        try:
            if sys.platform[:3] == "win":
                # cdecl calling convention
                # load dcgm.dll from %ProgramFiles%/NVIDIA Corporation/NVSMI/dcgm.dll
                dcgmLib = CDLL(
                    os.path.join(
                        os.getenv("ProgramFiles", "C:/Program Files"),
                        "NVIDIA Corporation/NVSMI/dcgm.dll",
                    )
                )
            else:
                if libDcgmPath:
                    lib_file = os.path.join(libDcgmPath, "libdcgm.so.4")
                else:
                    # Try Debian-based distros
                    lib_file = "/usr/lib/{}-linux-gnu/libdcgm.so.4".format(
                        platform.machine()
                    )
                    if not os.path.isfile(lib_file):
                        # Presume Redhat-based distros
                        lib_file = "/usr/lib64/libdcgm.so.4"

                dcgmLib = CDLL(lib_file)

        except OSError as ose:
            # Keep the loader's OSError as the explicit cause of the DCGM error.
            raise DCGMError(DCGM_ST_LIBRARY_NOT_FOUND) from ose

        if dcgmLib is None:
            _dcgmCheckReturn(DCGM_ST_LIBRARY_NOT_FOUND)
677
+
678
+
679
def _dcgmInit(libDcgmPath=None):
    """Load the DCGM library (if needed) and increment the library refcount.

    :param libDcgmPath: optional directory passed through to ``_LoadDcgmLibrary``
    :return: None
    """
    global _dcgmLib_refcount

    _LoadDcgmLibrary(libDcgmPath)
    # Atomically update refcount; ``with`` releases the lock even on error.
    with libLoadLock:
        _dcgmLib_refcount += 1
    return None
687
+
688
+
689
def _dcgmCheckReturn(ret):
    """Return *ret* when it equals DCGM_ST_OK; otherwise raise ``DCGMError(ret)``."""
    if ret == DCGM_ST_OK:
        return ret
    raise DCGMError(ret)
693
+
694
+
695
def _dcgmShutdown():
    """Call ``dcgmShutdown`` in the library and decrement the library refcount.

    The library itself stays loaded; only the interface is shut down.

    :return: None
    :raises DCGMError: when ``dcgmShutdown`` returns a non-OK status, or when
        the function pointer cannot be obtained.
    """
    global _dcgmLib_refcount

    # Leave the library loaded, but shutdown the interface
    fn = _dcgmGetFunctionPointer("dcgmShutdown")
    ret = fn()
    _dcgmCheckReturn(ret)

    # Atomically update refcount; ``with`` releases the lock even on error.
    with libLoadLock:
        if 0 < _dcgmLib_refcount:
            _dcgmLib_refcount -= 1
    return None
708
+
709
+
710
def _dcgmErrorString(result):
    """Call the library's ``dcgmErrorString`` with *result* and return its value."""
    error_string_fn = _dcgmGetFunctionPointer("dcgmErrorString")
    error_string_fn.restype = c_char_p  # otherwise return is an int
    return error_string_fn(result)
715
+
716
+
717
# Represents a link object. type should be one of DCGM_FE_GPU or
# DCGM_FE_SWITCH; id is the gpuId or switchId of the associated GPU or switch.
class c_dcgm_link_t(_PrintableStructure):
    # ctypes only reads the attribute named ``_fields_``; the previous
    # ``_fields`` spelling left the structure with no fields at all.
    _fields_ = [("type", c_uint8), ("index", c_uint8), ("id", c_uint16)]
722
+
723
+
724
class c_dcgmConnectV2Params_v1(_PrintableStructure):
    """Connect-V2 parameters, version 1 layout."""

    _fields_ = [
        ("version", c_uint),
        ("persistAfterDisconnect", c_uint),
    ]


c_dcgmConnectV2Params_version1 = make_dcgm_version(c_dcgmConnectV2Params_v1, 1)
729
+
730
+
731
class c_dcgmConnectV2Params_v2(_PrintableStructure):
    """Connect-V2 parameters, version 2 layout (adds timeout and socket flag)."""

    _fields_ = [
        ("version", c_uint),
        ("persistAfterDisconnect", c_uint),
        ("timeoutMs", c_uint),
        ("addressIsUnixSocket", c_uint),
    ]


c_dcgmConnectV2Params_version2 = make_dcgm_version(c_dcgmConnectV2Params_v2, 2)
# Unversioned name refers to the version 2 value.
c_dcgmConnectV2Params_version = c_dcgmConnectV2Params_version2
742
+
743
+
744
class c_dcgmHostengineHealth_v1(_PrintableStructure):
    """Host engine health, version 1 layout."""

    _fields_ = [("version", c_uint), ("overallHealth", c_uint)]


dcgmHostengineHealth_version1 = make_dcgm_version(c_dcgmHostengineHealth_v1, 1)
# Unversioned name refers to the version 1 value.
dcgmHostengineHealth_version = dcgmHostengineHealth_version1
753
+
754
+
755
class c_dcgmClockSet_v1(_PrintableStructure):
    """Represents memory and proc clocks for a device."""

    _fields_ = [
        ("version", c_uint),
        # Memory Clock
        ("memClock", c_uint),
        # SM Clock
        ("smClock", c_uint),
    ]
762
+
763
+
764
class c_dcgmGroupEntityPair_t(_PrintableStructure):
    """An entityGroupId + entityId pair that uniquely identifies a given
    entityId inside a group of entities.

    Added in DCGM 1.5.0.
    """

    _fields_ = [
        # Entity Group ID entity belongs to
        ("entityGroupId", c_uint32),
        # Entity ID of the entity
        ("entityId", c_uint32),
    ]
772
+
773
+
774
class c_dcgmGroupInfo_v2(_PrintableStructure):
    """Structure to store information for a DCGM group (v2).

    Added in DCGM 1.5.0.
    """

    _fields_ = [
        ("version", c_uint),
        ("count", c_uint),
        ("groupName", c_char * DCGM_MAX_STR_LENGTH),
        ("entityList", c_dcgmGroupEntityPair_t * DCGM_GROUP_MAX_ENTITIES),
    ]


c_dcgmGroupInfo_version2 = make_dcgm_version(c_dcgmGroupInfo_v2, 2)
788
+
789
# MIG slice profile identifiers.
DcgmiMigProfileNone = 0  # No profile (for GPUs)
# Alias spelled like the sibling DcgmMigProfile* names; the original
# "Dcgmi" spelling above is kept for backward compatibility.
DcgmMigProfileNone = DcgmiMigProfileNone
DcgmMigProfileGpuInstanceSlice1 = 1  # GPU instance slice 1
DcgmMigProfileGpuInstanceSlice2 = 2  # GPU instance slice 2
DcgmMigProfileGpuInstanceSlice3 = 3  # GPU instance slice 3
DcgmMigProfileGpuInstanceSlice4 = 4  # GPU instance slice 4
DcgmMigProfileGpuInstanceSlice7 = 5  # GPU instance slice 7
DcgmMigProfileGpuInstanceSlice8 = 6  # GPU instance slice 8
DcgmMigProfileGpuInstanceSlice6 = 7  # GPU instance slice 6
DcgmMigProfileGpuInstanceSlice1Rev1 = 8  # GPU instance slice 1 revision 1
DcgmMigProfileGpuInstanceSlice2Rev1 = 9  # GPU instance slice 2 revision 1
DcgmMigProfileGpuInstanceSlice1Rev2 = 10  # GPU instance slice 1 revision 2
DcgmMigProfileComputeInstanceSlice1 = 30  # compute instance slice 1
DcgmMigProfileComputeInstanceSlice2 = 31  # compute instance slice 2
DcgmMigProfileComputeInstanceSlice3 = 32  # compute instance slice 3
DcgmMigProfileComputeInstanceSlice4 = 33  # compute instance slice 4
DcgmMigProfileComputeInstanceSlice7 = 34  # compute instance slice 7
DcgmMigProfileComputeInstanceSlice8 = 35  # compute instance slice 8
DcgmMigProfileComputeInstanceSlice6 = 36  # compute instance slice 6
DcgmMigProfileComputeInstanceSlice1Rev1 = 37  # compute instance slice 1 revision 1
808
+
809
+
810
class c_dcgmMigHierarchyInfo_t(_PrintableStructure):
    """Represents a pair of entity pairings to uniquely identify an entity and
    its place in the hierarchy."""

    _fields_ = [
        ("entity", c_dcgmGroupEntityPair_t),
        ("parent", c_dcgmGroupEntityPair_t),
        ("sliceProfile", c_uint),
    ]
819
+
820
+
821
class c_dcgmMigEntityInfo_t(_PrintableStructure):
    """NVML-side identification of a MIG entity."""

    _fields_ = [
        # GPU UUID
        ("gpuUuid", c_char * 128),
        # GPU index from NVML
        ("nvmlGpuIndex", c_uint),
        # GPU instance index within GPU
        ("nvmlInstanceId", c_uint),
        # GPU Compute instance index within GPU instance
        ("nvmlComputeInstanceId", c_uint),
        # Unique profile ID for GPU or Compute instances
        ("nvmlMigProfileId", c_uint),
        # Number of slices in the MIG profile
        ("nvmlProfileSlices", c_uint),
    ]
833
+
834
+
835
class c_dcgmMigHierarchyInfo_v2(_PrintableStructure):
    """Entity/parent pairing plus the MIG entity info block (v2 layout)."""

    _fields_ = [
        ("entity", c_dcgmGroupEntityPair_t),
        ("parent", c_dcgmGroupEntityPair_t),
        ("info", c_dcgmMigEntityInfo_t),
    ]
841
+
842
+
843
DCGM_MAX_INSTANCES_PER_GPU = 8

# There can never be more compute instances per GPU than instances per GPU,
# because a compute instance is part of an instance.
DCGM_MAX_COMPUTE_INSTANCES_PER_GPU = DCGM_MAX_INSTANCES_PER_GPU

# Currently, there cannot be more than 14 instances + compute instances.
# There are always 7 compute instances and never more than 7 instances.
DCGM_MAX_TOTAL_INSTANCES = 14

# Per-host limits, scaled by the maximum number of devices.
DCGM_MAX_HIERARCHY_INFO = DCGM_MAX_NUM_DEVICES * DCGM_MAX_TOTAL_INSTANCES
DCGM_MAX_INSTANCES = DCGM_MAX_NUM_DEVICES * DCGM_MAX_INSTANCES_PER_GPU

# The maximum compute instances are always the same as the maximum instances,
# because each compute instance is part of an instance.
DCGM_MAX_COMPUTE_INSTANCES = DCGM_MAX_INSTANCES

# Ask the hostengine to wait to process reconfiguring the GPUs
DCGM_MIG_RECONFIG_DELAY_PROCESSING = 0x1
859
+
860
+
861
class c_dcgmMigHierarchy_v2(_PrintableStructure):
    """MIG hierarchy: a count plus a fixed-size list of hierarchy entries."""

    _fields_ = [
        ("version", c_uint),
        ("count", c_uint),
        ("entityList", c_dcgmMigHierarchyInfo_v2 * DCGM_MAX_HIERARCHY_INFO),
    ]


c_dcgmMigHierarchy_version2 = make_dcgm_version(c_dcgmMigHierarchy_v2, 2)
870
+
871
+
872
class c_dcgmDeleteMigEntity_v1(_PrintableStructure):
    """Parameters for deleting a MIG entity (v1 layout)."""

    _fields_ = [
        ("version", c_uint),
        ("entityGroupId", c_uint32),
        ("entityId", c_uint32),
        ("flags", c_uint),
    ]


c_dcgmDeleteMigEntity_version1 = make_dcgm_version(c_dcgmDeleteMigEntity_v1, 1)
882
+
883
# Enum values for the kinds of MIG creations.
DcgmMigCreateGpuInstance = 0  # Create a GPU instance
DcgmMigCreateComputeInstance = 1  # Create a compute instance
888
+
889
+
890
class c_dcgmCreateMigEntity_v1(_PrintableStructure):
    """Parameters for creating a MIG entity (v1 layout)."""

    _fields_ = [
        ("version", c_uint),
        ("parentId", c_uint32),
        ("profile", c_uint32),
        ("createOption", c_uint32),
        ("flags", c_uint),
    ]


c_dcgmCreateMigEntity_version1 = make_dcgm_version(c_dcgmCreateMigEntity_v1, 1)
901
+
902
+
903
class c_dcgmErrorInfo_v1(_PrintableStructure):
    """Structure to represent error attributes."""

    _fields_ = [
        ("gpuId", c_uint),
        ("fieldId", c_ushort),
        ("status", c_int),
    ]
908
+
909
+
910
class c_dcgmDeviceSupportedClockSets_v1(_PrintableStructure):
    """Represents list of supported clocks for a device."""

    _fields_ = [
        ("version", c_uint),
        ("count", c_uint),
        ("clockSet", c_dcgmClockSet_v1 * DCGM_MAX_CLOCKS),
    ]
919
+
920
+
921
class c_dcgmDevicePidAccountingStats_v1(_PrintableStructure):
    """Represents accounting information for a device and pid."""

    _fields_ = [
        ("version", c_uint32),
        ("pid", c_uint32),
        ("gpuUtilization", c_uint32),
        ("memoryUtilization", c_uint32),
        ("maxMemoryUsage", c_uint64),
        ("startTimestamp", c_uint64),
        ("activeTimeUsec", c_uint64),
    ]
934
+
935
+
936
class c_dcgmDeviceThermals_v1(_PrintableStructure):
    """Represents thermal information."""

    _fields_ = [
        ("version", c_uint),
        ("slowdownTemp", c_uint),
        ("shutdownTemp", c_uint),
    ]
941
+
942
+
943
class c_dcgmDevicePowerLimits_v1(_PrintableStructure):
    """Represents various power limits."""

    _fields_ = [
        ("version", c_uint),
        ("curPowerLimit", c_uint),
        ("defaultPowerLimit", c_uint),
        ("enforcedPowerLimit", c_uint),
        ("minPowerLimit", c_uint),
        ("maxPowerLimit", c_uint),
    ]
955
+
956
+
957
class c_dcgmDeviceIdentifiers_v1(_PrintableStructure):
    """Represents device identifiers."""

    _fields_ = [
        ("version", c_uint),
        # Fixed-size character buffers of DCGM_MAX_STR_LENGTH bytes each.
        ("brandName", c_char * DCGM_MAX_STR_LENGTH),
        ("deviceName", c_char * DCGM_MAX_STR_LENGTH),
        ("pciBusId", c_char * DCGM_MAX_STR_LENGTH),
        ("serial", c_char * DCGM_MAX_STR_LENGTH),
        ("uuid", c_char * DCGM_MAX_STR_LENGTH),
        ("vbios", c_char * DCGM_MAX_STR_LENGTH),
        ("inforomImageVersion", c_char * DCGM_MAX_STR_LENGTH),
        ("pciDeviceId", c_uint32),
        ("pciSubSystemId", c_uint32),
        ("driverVersion", c_char * DCGM_MAX_STR_LENGTH),
        ("virtualizationMode", c_uint32),
    ]
975
+
976
+
977
class c_dcgmDeviceMemoryUsage_v1(_PrintableStructure):
    """Represents memory utilization."""

    _fields_ = [
        ("version", c_uint),
        ("bar1Total", c_uint),
        ("fbTotal", c_uint),
        ("fbUsed", c_uint),
        ("fbFree", c_uint),
    ]
988
+
989
+
990
class c_dcgmDeviceVgpuUtilInfo_v1(_PrintableStructure):
    """Represents utilization values of vGPUs running on the device."""

    _fields_ = [
        ("version", c_uint),
        ("vgpuId", c_uint),
        ("smUtil", c_uint),
        ("memUtil", c_uint),
        ("encUtil", c_uint),
        ("decUtil", c_uint),
    ]
1002
+
1003
+
1004
class c_dcgmDeviceVgpuProcessUtilInfo_v1(_PrintableStructure):
    """Utilization values for processes running within vGPU VMs using the device."""

    _fields_ = [
        ("version", c_uint),
        ("vgpuId", c_uint),
        ("pid", c_uint),
        ("processName", c_char * DCGM_VGPU_NAME_BUFFER_SIZE),
        ("smUtil", c_uint),
        ("memUtil", c_uint),
        ("encUtil", c_uint),
        ("decUtil", c_uint),
    ]
1018
+
1019
+
1020
class c_dcgmDeviceEncStats_v1(_PrintableStructure):
    """Represents current encoder statistics for the given device/vGPU instance."""

    _fields_ = [
        ("version", c_uint),
        ("sessionCount", c_uint),
        ("averageFps", c_uint),
        ("averageLatency", c_uint),
    ]
1030
+
1031
+
1032
class c_dcgmDeviceVgpuEncSessions_v1(_PrintableStructure):
    """Represents information about active encoder sessions on the given vGPU
    instance."""

    _fields_ = [
        ("version", c_uint),
        ("vgpuId", c_uint),
        ("sessionId", c_uint),
        ("pid", c_uint),
        ("codecType", c_uint),
        ("hResolution", c_uint),
        ("vResolution", c_uint),
        ("averageFps", c_uint),
        ("averageLatency", c_uint),
    ]
1047
+
1048
+
1049
class c_dcgmDeviceFbcStats_v1(_PrintableStructure):
    """Represents current frame buffer capture sessions statistics for the given
    device/vGPU instance."""

    _fields_ = [
        ("version", c_uint),
        ("sessionCount", c_uint),
        ("averageFps", c_uint),
        ("averageLatency", c_uint),
    ]
1059
+
1060
+
1061
class c_dcgmDeviceFbcSessionInfo_t(_PrintableStructure):
    """Represents information about active FBC session on the given device/vGPU
    instance."""

    _fields_ = [
        ("version", c_uint),
        ("sessionId", c_uint),
        ("pid", c_uint),
        ("vgpuId", c_uint),
        ("displayOrdinal", c_uint),
        ("sessionType", c_uint),
        ("sessionFlags", c_uint),
        ("hMaxResolution", c_uint),
        ("vMaxResolution", c_uint),
        ("hResolution", c_uint),
        ("vResolution", c_uint),
        ("averageFps", c_uint),
        ("averageLatency", c_uint),
    ]
1080
+
1081
+
1082
class c_dcgmDeviceFbcSessions_v1(_PrintableStructure):
    """Represents all the active FBC sessions on the given device/vGPU instance."""

    _fields_ = [
        ("version", c_uint),
        ("sessionCount", c_uint),
        ("sessionInfo", c_dcgmDeviceFbcSessionInfo_t * DCGM_MAX_FBC_SESSIONS),
    ]
1091
+
1092
+
1093
class c_dcgmDeviceVgpuTypeInfo_v1(_PrintableStructure):
    """Represents static info related to vGPU types supported on the device."""

    _fields_ = [
        ("version", c_uint),
        ("vgpuTypeId", c_uint),
        ("vgpuTypeName", c_char * DCGM_VGPU_NAME_BUFFER_SIZE),
        ("vgpuTypeClass", c_char * DCGM_VGPU_NAME_BUFFER_SIZE),
        ("vgpuTypeLicense", c_char * DCGM_GRID_LICENSE_BUFFER_SIZE),
        ("deviceId", c_uint),
        ("subsystemId", c_uint),
        ("numDisplayHeads", c_uint),
        ("maxInstances", c_uint),
        ("frameRateLimit", c_uint),
        ("maxResolutionX", c_uint),
        ("maxResolutionY", c_uint),
        ("fbTotal", c_uint),
    ]
1112
+
1113
+
1114
class c_dcgmDeviceVgpuTypeInfo_v2(_PrintableStructure):
    """vGPU type info, v2 layout: the v1 fields followed by
    ``gpuInstanceProfileId``."""

    _fields_ = [
        ("version", c_uint),
        ("vgpuTypeId", c_uint),
        ("vgpuTypeName", c_char * DCGM_VGPU_NAME_BUFFER_SIZE),
        ("vgpuTypeClass", c_char * DCGM_VGPU_NAME_BUFFER_SIZE),
        ("vgpuTypeLicense", c_char * DCGM_GRID_LICENSE_BUFFER_SIZE),
        ("deviceId", c_uint),
        ("subsystemId", c_uint),
        ("numDisplayHeads", c_uint),
        ("maxInstances", c_uint),
        ("frameRateLimit", c_uint),
        ("maxResolutionX", c_uint),
        ("maxResolutionY", c_uint),
        ("fbTotal", c_uint),
        ("gpuInstanceProfileId", c_uint),
    ]


dcgmDeviceVgpuTypeInfo_version2 = make_dcgm_version(c_dcgmDeviceVgpuTypeInfo_v2, 2)
1134
+
1135
+
1136
class c_dcgmDeviceSettings_v2(_PrintableStructure):
    """Device settings flags (v2 layout)."""

    _fields_ = [
        ("version", c_uint),
        ("persistenceModeEnabled", c_uint),
        ("migModeEnabled", c_uint),
        ("confidentialComputeMode", c_uint),
    ]
1143
+
1144
+
1145
class c_dcgmDeviceAttributes_deprecated_v1(_PrintableStructure):
    """Represents attributes corresponding to a device (deprecated v1 layout)."""

    _fields_ = [
        ("version", c_uint),
        ("clockSets", c_dcgmDeviceSupportedClockSets_v1),
        ("thermalSettings", c_dcgmDeviceThermals_v1),
        ("powerLimits", c_dcgmDevicePowerLimits_v1),
        ("identifiers", c_dcgmDeviceIdentifiers_v1),
        ("memoryUsage", c_dcgmDeviceMemoryUsage_v1),
        # 208 bytes declared but not given a meaning here.
        ("unused", c_char * 208),
    ]


dcgmDeviceAttributes_deprecated_version1 = make_dcgm_version(
    c_dcgmDeviceAttributes_deprecated_v1, 1
)
1163
+
1164
+
1165
class c_dcgmDeviceAttributes_v3(_PrintableStructure):
    """Represents attributes corresponding to a device (v3 layout)."""

    _fields_ = [
        ("version", c_uint),
        ("clockSets", c_dcgmDeviceSupportedClockSets_v1),
        ("thermalSettings", c_dcgmDeviceThermals_v1),
        ("powerLimits", c_dcgmDevicePowerLimits_v1),
        ("identifiers", c_dcgmDeviceIdentifiers_v1),
        ("memoryUsage", c_dcgmDeviceMemoryUsage_v1),
        ("settings", c_dcgmDeviceSettings_v2),
    ]


dcgmDeviceAttributes_version3 = make_dcgm_version(c_dcgmDeviceAttributes_v3, 3)
1181
+
1182
+
1183
class c_dcgmDeviceMigAttributesInfo_v1(_PrintableStructure):
    """Represents attributes info for a MIG device."""

    _fields_ = [
        ("version", c_uint),
        ("gpuInstanceId", c_uint),
        ("computeInstanceId", c_uint),
        ("multiprocessorCount", c_uint),
        ("sharedCopyEngineCount", c_uint),
        ("sharedDecoderCount", c_uint),
        ("sharedEncoderCount", c_uint),
        ("sharedJpegCount", c_uint),
        ("sharedOfaCount", c_uint),
        ("gpuInstanceSliceCount", c_uint),
        ("computeInstanceSliceCount", c_uint),
        ("memorySizeMB", c_uint64),
    ]


dcgmDeviceMigAttributesInfo_version1 = make_dcgm_version(
    c_dcgmDeviceMigAttributesInfo_v1, 1
)
1206
+
1207
+
1208
class c_dcgmDeviceMigAttributes_v1(_PrintableStructure):
    """Represents attributes for a MIG device."""

    _fields_ = [
        ("version", c_uint),
        ("migDevicesCount", c_uint),
        ("migAttributesInfo", c_dcgmDeviceMigAttributesInfo_v1),
    ]


dcgmDeviceMigAttributes_version1 = make_dcgm_version(c_dcgmDeviceMigAttributes_v1, 1)
1220
+
1221
+
1222
class c_dcgmGpuInstanceProfileInfo_v1(_PrintableStructure):
    """Represents GPU instance profile information."""

    _fields_ = [
        ("version", c_uint),
        ("id", c_uint),
        ("isP2pSupported", c_uint),
        ("sliceCount", c_uint),
        ("instanceCount", c_uint),
        ("multiprocessorCount", c_uint),
        ("copyEngineCount", c_uint),
        ("decoderCount", c_uint),
        ("encoderCount", c_uint),
        ("jpegCount", c_uint),
        ("ofaCount", c_uint),
        ("memorySizeMB", c_uint64),
    ]


dcgmGpuInstanceProfileInfo_version1 = make_dcgm_version(
    c_dcgmGpuInstanceProfileInfo_v1, 1
)
1245
+
1246
+
1247
class c_dcgmGpuInstanceProfiles_v1(_PrintableStructure):
    """Represents GPU instance profiles."""

    _fields_ = [
        ("version", c_uint),
        ("profileCount", c_uint),
        ("profileInfo", c_dcgmGpuInstanceProfileInfo_v1),
    ]


dcgmGpuInstanceProfiles_version1 = make_dcgm_version(c_dcgmGpuInstanceProfiles_v1, 1)
1259
+
1260
+
1261
class c_dcgmComputeInstanceProfileInfo_v1(_PrintableStructure):
    """Represents Compute instance profile information."""

    _fields_ = [
        ("version", c_uint),
        ("gpuInstanceId", c_uint),
        ("id", c_uint),
        ("sliceCount", c_uint),
        ("instanceCount", c_uint),
        ("multiprocessorCount", c_uint),
        ("sharedCopyEngineCount", c_uint),
        ("sharedDecoderCount", c_uint),
        ("sharedEncoderCount", c_uint),
        ("sharedJpegCount", c_uint),
        ("sharedOfaCount", c_uint),
    ]


dcgmComputeInstanceProfileInfo_version1 = make_dcgm_version(
    c_dcgmComputeInstanceProfileInfo_v1, 1
)
1283
+
1284
+
1285
class c_dcgmComputeInstanceProfiles_v1(_PrintableStructure):
    """Represents Compute instance profiles."""

    _fields_ = [
        ("version", c_uint),
        ("profileCount", c_uint),
        ("profileInfo", c_dcgmComputeInstanceProfileInfo_v1),
    ]


dcgmComputeInstanceProfiles_version1 = make_dcgm_version(
    c_dcgmComputeInstanceProfiles_v1, 1
)
1299
+
1300
+
1301
class c_dcgmVgpuDeviceAttributes_v6(_PrintableStructure):
    """Represents vGPU attributes corresponding to a device (v6 layout)."""

    _fields_ = [
        ("version", c_uint),
        ("activeVgpuInstanceCount", c_uint),
        ("activeVgpuInstanceIds", c_uint * DCGM_MAX_VGPU_INSTANCES_PER_PGPU),
        ("creatableVgpuTypeCount", c_uint),
        ("creatableVgpuTypeIds", c_uint * DCGM_MAX_VGPU_TYPES_PER_PGPU),
        ("supportedVgpuTypeCount", c_uint),
        (
            "supportedVgpuTypeInfo",
            c_dcgmDeviceVgpuTypeInfo_v1 * DCGM_MAX_VGPU_TYPES_PER_PGPU,
        ),
        ("vgpuUtilInfo", c_dcgmDeviceVgpuUtilInfo_v1 * DCGM_MAX_VGPU_TYPES_PER_PGPU),
        ("gpuUtil", c_uint),
        ("memCopyUtil", c_uint),
        ("encUtil", c_uint),
        ("decUtil", c_uint),
    ]


# NOTE(review): the version tag is built with ver=1 although the structure is
# named v6 -- confirm against the DCGM C header before changing it.
dcgmVgpuDeviceAttributes_version6 = make_dcgm_version(c_dcgmVgpuDeviceAttributes_v6, 1)
1325
+
1326
+
1327
class c_dcgmVgpuDeviceAttributes_v7(_PrintableStructure):
    """vGPU device attributes, v7 layout: same fields as v6 but the supported
    type list uses ``c_dcgmDeviceVgpuTypeInfo_v2`` entries."""

    _fields_ = [
        ("version", c_uint),
        ("activeVgpuInstanceCount", c_uint),
        ("activeVgpuInstanceIds", c_uint * DCGM_MAX_VGPU_INSTANCES_PER_PGPU),
        ("creatableVgpuTypeCount", c_uint),
        ("creatableVgpuTypeIds", c_uint * DCGM_MAX_VGPU_TYPES_PER_PGPU),
        ("supportedVgpuTypeCount", c_uint),
        (
            "supportedVgpuTypeInfo",
            c_dcgmDeviceVgpuTypeInfo_v2 * DCGM_MAX_VGPU_TYPES_PER_PGPU,
        ),
        ("vgpuUtilInfo", c_dcgmDeviceVgpuUtilInfo_v1 * DCGM_MAX_VGPU_TYPES_PER_PGPU),
        ("gpuUtil", c_uint),
        ("memCopyUtil", c_uint),
        ("encUtil", c_uint),
        ("decUtil", c_uint),
    ]


dcgmVgpuDeviceAttributes_version7 = make_dcgm_version(c_dcgmVgpuDeviceAttributes_v7, 7)
1348
+
1349
+
1350
class c_dcgmVgpuInstanceAttributes_v1(_PrintableStructure):
    """Represents attributes specific to vGPU instance."""

    _fields_ = [
        ("version", c_uint),
        ("vmId", c_char * DCGM_DEVICE_UUID_BUFFER_SIZE),
        ("vmName", c_char * DCGM_DEVICE_UUID_BUFFER_SIZE),
        ("vgpuTypeId", c_uint),
        ("vgpuUuid", c_char * DCGM_DEVICE_UUID_BUFFER_SIZE),
        ("vgpuDriverVersion", c_char * DCGM_DEVICE_UUID_BUFFER_SIZE),
        ("fbUsage", c_uint),
        ("licenseStatus", c_uint),
        ("frameRateLimit", c_uint),
    ]


dcgmVgpuInstanceAttributes_version1 = make_dcgm_version(
    c_dcgmVgpuInstanceAttributes_v1, 1
)
1370
+
1371
+
1372
class c_dcgmConfigPowerLimit(_PrintableStructure):
    """Power limit setting: a type selector and a value."""

    _fields_ = [
        ("type", c_uint),
        ("val", c_uint),
    ]
1374
+
1375
+
1376
class c_dcgmConfigPerfStateSettings_t(_PrintableStructure):
    """Performance state settings: sync boost flag plus target clocks."""

    _fields_ = [("syncBoost", c_uint), ("targetClocks", c_dcgmClockSet_v1)]
1381
+
1382
+
1383
class c_dcgmDeviceConfig_v1(_PrintableStructure):
    """Structure to represent default configuration for a device."""

    # version must always be first
    _fields_ = [
        ("version", c_uint),
        ("gpuId", c_uint),
        ("mEccMode", c_uint),
        ("mComputeMode", c_uint),
        ("mPerfState", c_dcgmConfigPerfStateSettings_t),
        ("mPowerLimit", c_dcgmConfigPowerLimit),
    ]


dcgmDeviceConfig_version1 = make_dcgm_version(c_dcgmDeviceConfig_v1, 1)
1397
+
1398
+
1399
class c_dcgmDeviceVgpuConfig_v1(_PrintableStructure):
    """Structure to represent default vGPU configuration for a device."""

    # version must always be first
    _fields_ = [
        ("version", c_uint),
        ("gpuId", c_uint),
        ("mEccMode", c_uint),
        ("mComputeMode", c_uint),
        ("mPerfState", c_dcgmConfigPerfStateSettings_t),
        ("mPowerLimit", c_dcgmConfigPowerLimit),
    ]

    def SetBlank(self):
        """Set every setting to the blank value; version and gpuId are untouched.

        The power limit type is set to DCGM_CONFIG_POWER_CAP_INDIVIDUAL.
        """
        blank = dcgmvalue.DCGM_INT32_BLANK
        # Does not set version or gpuId
        self.mEccMode = blank
        self.mPerfState.syncBoost = blank
        self.mPerfState.targetClocks.memClock = blank
        self.mPerfState.targetClocks.smClock = blank
        self.mComputeMode = blank
        self.mPowerLimit.type = DCGM_CONFIG_POWER_CAP_INDIVIDUAL
        self.mPowerLimit.val = blank


dcgmDeviceVgpuConfig_version1 = make_dcgm_version(c_dcgmDeviceVgpuConfig_v1, 1)
1423
+
1424
+
1425
class c_dcgmPolicyUpdate_v1(_PrintableStructure):
    """Structure to receive update on the list of metrics."""

    # version must always be first
    _fields_ = [("version", c_uint), ("power", c_uint)]


dcgmPolicyUpdate_version1 = make_dcgm_version(c_dcgmPolicyUpdate_v1, 1)

# Represents a Callback to receive power updates from the host engine
_dcgmRecvUpdates_t = c_void_p
1438
+
1439
+
1440
class c_dcgmPolicyViolation_v1(_PrintableStructure):
    """Structure that contains specific policy information."""

    # version must always be first
    _fields_ = [
        ("version", c_uint),
        ("notifyOnEccDbe", c_uint),
        ("notifyOnPciEvent", c_uint),
        ("notifyOnMaxRetiredPages", c_uint),
    ]


dcgmPolicyViolation_version1 = make_dcgm_version(c_dcgmPolicyViolation_v1, 1)
1452
+
1453
+
1454
class c_dcgmWatchFieldValue_v1(_PrintableStructure):
    """Watch-field-value structure; it declares no members in this binding."""

    _fields_: List = []


dcgmWatchFieldValue_version1 = make_dcgm_version(c_dcgmWatchFieldValue_v1, 1)
1459
+
1460
+
1461
class c_dcgmUnwatchFieldValue_v1(_PrintableStructure):
    """Unwatch-field-value structure; it declares no members in this binding."""

    _fields_: List = []


dcgmUnwatchFieldValue_version1 = make_dcgm_version(c_dcgmUnwatchFieldValue_v1, 1)
1466
+
1467
+
1468
class c_dcgmUpdateAllFields_v1(_PrintableStructure):
    """Update-all-fields structure; it declares no members in this binding."""

    _fields_: List = []


dcgmUpdateAllFields_version1 = make_dcgm_version(c_dcgmUpdateAllFields_v1, 1)
1473
+
1474
# Plain integer version number (not produced by make_dcgm_version).
dcgmGetMultipleValuesForFieldResponse_version1 = 1

# Policy condition enum; the values also serve as table indices.
(
    DCGM_POLICY_COND_IDX_DBE,
    DCGM_POLICY_COND_IDX_PCI,
    DCGM_POLICY_COND_IDX_MAX_PAGES_RETIRED,
    DCGM_POLICY_COND_IDX_THERMAL,
    DCGM_POLICY_COND_IDX_POWER,
    DCGM_POLICY_COND_IDX_NVLINK,
    DCGM_POLICY_COND_IDX_XID,
    DCGM_POLICY_COND_IDX_MAX,
) = range(8)

# Policy condition bitmasks: one bit per condition index above.
DCGM_POLICY_COND_DBE = 1 << 0
DCGM_POLICY_COND_PCI = 1 << 1
DCGM_POLICY_COND_MAX_PAGES_RETIRED = 1 << 2
DCGM_POLICY_COND_THERMAL = 1 << 3
DCGM_POLICY_COND_POWER = 1 << 4
DCGM_POLICY_COND_NVLINK = 1 << 5
DCGM_POLICY_COND_XID = 1 << 6
# Not a bit: this is the number of conditions (array length of c_dcgmPolicy_v1.parms).
DCGM_POLICY_COND_MAX = 7

# Policy modes
DCGM_POLICY_MODE_AUTOMATED = 0
DCGM_POLICY_MODE_MANUAL = 1

# Policy isolation
DCGM_POLICY_ISOLATION_NONE = 0

# Policy actions
DCGM_POLICY_ACTION_NONE = 0
DCGM_POLICY_ACTION_GPURESET = 1  # Deprecated

# Policy validation
(
    DCGM_POLICY_VALID_NONE,
    DCGM_POLICY_VALID_SV_SHORT,
    DCGM_POLICY_VALID_SV_MED,
    DCGM_POLICY_VALID_SV_LONG,
    DCGM_POLICY_VALID_SV_XLONG,
) = range(5)

# Policy failure
DCGM_POLICY_FAILURE_NONE = 0

# Diagnostic levels
DCGM_DIAG_LVL_INVALID = 0
DCGM_DIAG_LVL_SHORT = 10
DCGM_DIAG_LVL_MED = 20
DCGM_DIAG_LVL_LONG = 30
DCGM_DIAG_LVL_XLONG = 40

# Diagnostic results
(
    DCGM_DIAG_RESULT_PASS,
    DCGM_DIAG_RESULT_SKIP,
    DCGM_DIAG_RESULT_WARN,
    DCGM_DIAG_RESULT_FAIL,
    DCGM_DIAG_RESULT_NOT_RUN,
) = range(5)
1523
+
1524
+
1525
class c_dcgmPolicyConditionParmTypes_t(DcgmUnion):
    """Union holding a policy condition parameter as a bool or a long long."""

    _fields_ = [("boolean", c_bool), ("llval", c_longlong)]
1530
+
1531
+
1532
class c_dcgmPolicyConditionParms_t(_PrintableStructure):
    """A tag paired with a c_dcgmPolicyConditionParmTypes_t value."""

    _fields_ = [
        ("tag", c_uint),
        ("val", c_dcgmPolicyConditionParmTypes_t),
    ]
1534
+
1535
+
1536
class c_dcgmPolicy_v1(_PrintableStructure):
    """Policy structure (version 1)."""

    _fields_ = [
        # "version" must always be the first member.
        ("version", c_uint),
        # OR'd combination of the DCGM_POLICY_COND_* bitmasks.
        ("condition", c_uint),
        ("mode", c_uint),
        ("isolation", c_uint),
        ("action", c_uint),
        ("validation", c_uint),
        ("response", c_uint),
        # One parameter slot per policy condition.
        ("parms", c_dcgmPolicyConditionParms_t * DCGM_POLICY_COND_MAX),
    ]


dcgmPolicy_version1 = make_dcgm_version(c_dcgmPolicy_v1, 1)
1551
+
1552
+
1553
class c_dcgmPolicyConditionPci_t(_PrintableStructure):
    """PCI replay error payload of a policy callback."""

    _fields_ = [
        # Timestamp of the error.
        ("timestamp", c_longlong),
        # Value of the PCIe replay counter.
        ("counter", c_uint),
    ]
1558
+
1559
+
1560
class c_dcgmPolicyConditionDbe_t(_PrintableStructure):
    """ECC double-bit-error payload of a policy callback."""

    # Symbolic names for the values stored in the "location" member.
    LOCATIONS = {
        "L1": 0,
        "L2": 1,
        "DEVICE": 2,
        "REGISTER": 3,
        "TEXTURE": 4,
    }

    _fields_ = [
        # Timestamp of the error.
        ("timestamp", c_longlong),
        # Where the error occurred (one of the LOCATIONS values).
        ("location", c_int),
        # Number of errors.
        ("numerrors", c_uint),
    ]
1568
+
1569
+
1570
class c_dcgmPolicyConditionMpr_t(_PrintableStructure):
    """Max-retired-pages payload of a policy callback."""

    _fields_ = [
        # Timestamp of the error.
        ("timestamp", c_longlong),
        # Number of pending pages due to SBE.
        ("sbepages", c_uint),
        # Number of pending pages due to DBE.
        ("dbepages", c_uint),
    ]
1576
+
1577
+
1578
class c_dcgmPolicyConditionThermal_t(_PrintableStructure):
    """Thermal violation payload of a policy callback."""

    _fields_ = [
        # Timestamp of the error.
        ("timestamp", c_longlong),
        # Temperature reached that violated the policy.
        ("thermalViolation", c_uint),
    ]
1583
+
1584
+
1585
class c_dcgmPolicyConditionPower_t(_PrintableStructure):
    """Power violation payload of a policy callback."""

    _fields_ = [
        # Timestamp of the error.
        ("timestamp", c_longlong),
        # Power value reached that violated the policy.
        ("powerViolation", c_uint),
    ]
1590
+
1591
+
1592
class c_dcgmPolicyConditionNvlink_t(_PrintableStructure):
    """NvLink violation payload of a policy callback."""

    _fields_ = [
        # Timestamp of the error.
        ("timestamp", c_longlong),
        # Field ID of the NvLink error counter.
        ("fieldId", c_ushort),
        # Error value reached that violated the policy.
        ("counter", c_uint),
    ]
1598
+
1599
+
1600
class c_dcgmPolicyConditionXID_t(_PrintableStructure):
    """XID violation payload of a policy callback."""

    _fields_ = [
        # Timestamp of the error.
        ("timestamp", c_longlong),
        # XID error number.
        ("errnum", c_uint),
    ]
1605
+
1606
+
1607
class c_dcgmPolicyCallbackResponse_v1(_PrintableStructure):
    """Policy callback response: a condition plus the matching payload."""

    class Value(DcgmUnion):
        """Union of the per-condition return structures."""

        # Implement more of the members once a test requires them.
        _fields_ = [
            # ECC DBE return structure
            ("dbe", c_dcgmPolicyConditionDbe_t),
            # PCI replay error return structure
            ("pci", c_dcgmPolicyConditionPci_t),
            # Max retired pages limit return structure
            ("mpr", c_dcgmPolicyConditionMpr_t),
            # Thermal policy violations return structure
            ("thermal", c_dcgmPolicyConditionThermal_t),
            # Power policy violations return structure
            ("power", c_dcgmPolicyConditionPower_t),
            # NvLink policy violations return structure
            ("nvlink", c_dcgmPolicyConditionNvlink_t),
            # XID policy violations return structure
            ("xid", c_dcgmPolicyConditionXID_t),
        ]

    _fields_ = [
        ("version", c_uint),
        # OR'ed combination of the DCGM_POLICY_COND_* bitmasks.
        ("condition", c_int),
        ("val", Value),
    ]
1641
+
1642
+
1643
class c_dcgmFieldValue_v1_value(DcgmUnion):
    """Union of the storage forms a field value can take."""

    _fields_ = [
        # 64-bit integer form
        ("i64", c_int64),
        # double-precision form
        ("dbl", c_double),
        # character buffer form
        ("str", c_char * DCGM_MAX_STR_LENGTH),
        # raw byte buffer form
        ("blob", c_byte * DCGM_MAX_BLOB_LENGTH),
    ]
1650
+
1651
+
1652
class c_dcgmFieldValue_v1(_PrintableStructure):
    """Value of a queried field (version 1)."""

    # "version" must always be the first member.
    _fields_ = [
        ("version", c_uint),
        ("fieldId", c_ushort),
        ("fieldType", c_short),
        ("status", c_int),
        ("ts", c_int64),
        ("value", c_dcgmFieldValue_v1_value),
    ]


dcgmFieldValue_version1 = make_dcgm_version(c_dcgmFieldValue_v1, 1)
1666
+
1667
+
1668
class c_dcgmFieldValue_v2(_PrintableStructure):
    """Value of a queried field (version 2); adds the owning entity to v1."""

    # "version" must always be the first member.
    _fields_ = [
        ("version", c_uint),
        ("entityGroupId", c_uint),
        ("entityId", c_uint),
        ("fieldId", c_ushort),
        ("fieldType", c_short),
        ("status", c_int),
        ("unused", c_uint),
        ("ts", c_int64),
        # The value union is shared with version 1.
        ("value", c_dcgmFieldValue_v1_value),
    ]


dcgmFieldValue_version2 = make_dcgm_version(c_dcgmFieldValue_v2, 2)
1685
+
1686
# Flags accepted by dcgm_agent.dcgmEntitiesGetLatestValues()
DCGM_FV_FLAG_LIVE_DATA = 0x1

# Health watch systems: one bit per system.
DCGM_HEALTH_WATCH_PCIE = 1 << 0
DCGM_HEALTH_WATCH_NVLINK = 1 << 1
DCGM_HEALTH_WATCH_PMU = 1 << 2
DCGM_HEALTH_WATCH_MCU = 1 << 3
DCGM_HEALTH_WATCH_MEM = 1 << 4
DCGM_HEALTH_WATCH_SM = 1 << 5
DCGM_HEALTH_WATCH_INFOROM = 1 << 6
DCGM_HEALTH_WATCH_THERMAL = 1 << 7
DCGM_HEALTH_WATCH_POWER = 1 << 8
DCGM_HEALTH_WATCH_DRIVER = 1 << 9
DCGM_HEALTH_WATCH_NVSWITCH_NONFATAL = 1 << 10
DCGM_HEALTH_WATCH_NVSWITCH_FATAL = 1 << 11
# All 32 bits set.
DCGM_HEALTH_WATCH_ALL = 0xFFFFFFFF
# Number of watch systems per structure generation.
DCGM_HEALTH_WATCH_COUNT_V1 = 10
DCGM_HEALTH_WATCH_COUNT_V2 = 12

# Health results
DCGM_HEALTH_RESULT_PASS = 0
DCGM_HEALTH_RESULT_WARN = 10
DCGM_HEALTH_RESULT_FAIL = 20
1708
+
1709
+
1710
class c_dcgmDiagErrorDetail_t(_PrintableStructure):
    """Error detail: a 1024-byte message buffer plus an error code."""

    _fields_ = [
        ("msg", c_char * 1024),
        ("code", c_uint),
    ]
1712
+
1713
+
1714
# Incident capacity is tied to the maximum number of entities in a group.
DCGM_HEALTH_WATCH_MAX_INCIDENTS = DCGM_GROUP_MAX_ENTITIES


class c_dcgmIncidentInfo_t(_PrintableStructure):
    """One health incident: system, health value, error detail and entity."""

    _fields_ = [
        ("system", c_uint),
        ("health", c_uint32),
        ("error", c_dcgmDiagErrorDetail_t),
        ("entityInfo", c_dcgmGroupEntityPair_t),
    ]
1724
+
1725
+
1726
class c_dcgmHealthResponse_v4(_PrintableStructure):
    """Health response (version 4): overall health plus an incident array."""

    _fields_ = [
        ("version", c_uint32),
        ("overallHealth", c_uint32),
        ("incidentCount", c_uint32),
        # Fixed-capacity incident array.
        ("incidents", c_dcgmIncidentInfo_t * DCGM_HEALTH_WATCH_MAX_INCIDENTS),
    ]


dcgmHealthResponse_version4 = make_dcgm_version(c_dcgmHealthResponse_v4, 4)
1736
+
1737
+
1738
class c_dcgmHealthSetParams_v2(_PrintableStructure):
    """Health set-parameters structure (version 2)."""

    _fields_ = [
        ("version", c_uint32),
        ("groupId", c_void_p),
        ("systems", c_uint32),
        ("updateInterval", c_int64),
        ("maxKeepAge", c_double),
    ]


dcgmHealthSetParams_version2 = make_dcgm_version(c_dcgmHealthSetParams_v2, 2)
1749
+
1750
+
1751
# Pid info structs
class c_dcgmStatSummaryInt64_t(_PrintableStructure):
    """Min / max / average summary stored as 64-bit integers."""

    _fields_ = [
        ("minValue", c_int64),
        ("maxValue", c_int64),
        ("average", c_int64),
    ]
1754
+
1755
+
1756
class c_dcgmStatSummaryInt32_t(_PrintableStructure):
    """Min / max / average summary stored as 32-bit integers."""

    _fields_ = [
        ("minValue", c_int32),
        ("maxValue", c_int32),
        ("average", c_int32),
    ]
1758
+
1759
+
1760
class c_dcgmStatSummaryFp64_t(_PrintableStructure):
    """Min / max / average summary stored as doubles."""

    _fields_ = [
        ("minValue", c_double),
        ("maxValue", c_double),
        ("average", c_double),
    ]
1762
+
1763
+
1764
class c_dcgmProcessUtilInfo_t(_PrintableStructure):
    """Utilization record for one process: pid, SM and memory utilization."""

    _fields_ = [
        ("pid", c_uint),
        ("smUtil", c_double),
        ("memUtil", c_double),
    ]
1766
+
1767
+
1768
class c_dcgmHealthResponseInfo_t(_PrintableStructure):
    """A (system, health) pair."""

    _fields_ = [
        ("system", c_uint),
        ("health", c_uint),
    ]
1770
+
1771
+
1772
# Capacity of the fixed-size PID arrays in the process / GPU usage structures.
DCGM_MAX_PID_INFO_NUM = 16


class c_dcgmPidSingleInfo_t(_PrintableStructure):
    """Per-GPU (or summary) record embedded in c_dcgmPidInfo_v2."""

    _fields_ = [
        ("gpuId", c_uint32),
        ("energyConsumed", c_int64),
        # PCIe traffic
        ("pcieRxBandwidth", c_dcgmStatSummaryInt64_t),
        ("pcieTxBandwidth", c_dcgmStatSummaryInt64_t),
        ("pcieReplays", c_int64),
        # Time window
        ("startTime", c_int64),
        ("endTime", c_int64),
        # Utilization
        ("processUtilization", c_dcgmProcessUtilInfo_t),
        ("smUtilization", c_dcgmStatSummaryInt32_t),
        ("memoryUtilization", c_dcgmStatSummaryInt32_t),
        # ECC counters ("eccSingleBit" is deprecated)
        ("eccSingleBit", c_uint32),
        ("eccDoubleBit", c_uint32),
        # Clocks
        ("memoryClock", c_dcgmStatSummaryInt32_t),
        ("smClock", c_dcgmStatSummaryInt32_t),
        # XID critical errors (count plus up to 10 timestamps)
        ("numXidCriticalErrors", c_int32),
        ("xidCriticalErrorsTs", c_int64 * 10),
        # Other PIDs (count plus fixed-size PID array for each kind)
        ("numOtherComputePids", c_int32),
        ("otherComputePids", c_uint32 * DCGM_MAX_PID_INFO_NUM),
        ("numOtherGraphicsPids", c_int32),
        ("otherGraphicsPids", c_uint32 * DCGM_MAX_PID_INFO_NUM),
        ("maxGpuMemoryUsed", c_int64),
        # Violation / throttling times
        ("powerViolationTime", c_int64),
        ("thermalViolationTime", c_int64),
        ("reliabilityViolationTime", c_int64),
        ("boardLimitViolationTime", c_int64),
        ("lowUtilizationTime", c_int64),
        ("syncBoostTime", c_int64),
        # Health
        ("overallHealth", c_uint),
        ("incidentCount", c_uint),
        ("systems", c_dcgmHealthResponseInfo_t * DCGM_HEALTH_WATCH_COUNT_V1),
    ]
1808
+
1809
+
1810
class c_dcgmPidInfo_v2(_PrintableStructure):
    """PID information (version 2): a summary record plus one record per GPU."""

    _fields_ = [
        ("version", c_uint32),
        ("pid", c_uint32),
        ("unused", c_uint32),
        ("numGpus", c_int32),
        ("summary", c_dcgmPidSingleInfo_t),
        # Fixed-capacity per-GPU array.
        ("gpus", c_dcgmPidSingleInfo_t * DCGM_MAX_NUM_DEVICES),
    ]


dcgmPidInfo_version2 = make_dcgm_version(c_dcgmPidInfo_v2, 2)
1822
+
1823
+
1824
class c_dcgmRunningProcess_v1(_PrintableStructure):
    """Running-process record (version 1): pid and memory used."""

    _fields_ = [
        ("version", c_uint32),
        ("pid", c_uint32),
        ("memoryUsed", c_uint64),
    ]


dcgmRunningProcess_version1 = make_dcgm_version(c_dcgmRunningProcess_v1, 1)

# Unversioned alias for the version-1 structure.
c_dcgmRunningProcess_t = c_dcgmRunningProcess_v1
1831
+
1832
+
1833
class c_dcgmGpuUsageInfo_t(_PrintableStructure):
    """Per-GPU (or summary) usage record embedded in c_dcgmJobInfo_v3."""

    _fields_ = [
        ("gpuId", c_uint32),
        ("energyConsumed", c_int64),
        ("powerUsage", c_dcgmStatSummaryFp64_t),
        # PCIe traffic
        ("pcieRxBandwidth", c_dcgmStatSummaryInt64_t),
        ("pcieTxBandwidth", c_dcgmStatSummaryInt64_t),
        ("pcieReplays", c_int64),
        # Time window
        ("startTime", c_int64),
        ("endTime", c_int64),
        # Utilization
        ("smUtilization", c_dcgmStatSummaryInt32_t),
        ("memoryUtilization", c_dcgmStatSummaryInt32_t),
        # ECC counters ("eccSingleBit" is deprecated)
        ("eccSingleBit", c_uint32),
        ("eccDoubleBit", c_uint32),
        # Clocks
        ("memoryClock", c_dcgmStatSummaryInt32_t),
        ("smClock", c_dcgmStatSummaryInt32_t),
        # XID critical errors (count plus up to 10 timestamps)
        ("numXidCriticalErrors", c_int32),
        ("xidCriticalErrorsTs", c_int64 * 10),
        # Processes (count plus fixed-size utilization array for each kind)
        ("numComputePids", c_int32),
        ("computePids", c_dcgmProcessUtilInfo_t * DCGM_MAX_PID_INFO_NUM),
        ("numGraphicsPids", c_int32),
        ("graphicsPids", c_dcgmProcessUtilInfo_t * DCGM_MAX_PID_INFO_NUM),
        ("maxGpuMemoryUsed", c_int64),
        # Violation / throttling times
        ("powerViolationTime", c_int64),
        ("thermalViolationTime", c_int64),
        ("reliabilityViolationTime", c_int64),
        ("boardLimitViolationTime", c_int64),
        ("lowUtilizationTime", c_int64),
        ("syncBoostTime", c_int64),
        # Health
        ("overallHealth", c_uint),
        ("incidentCount", c_uint),
        ("systems", c_dcgmHealthResponseInfo_t * DCGM_HEALTH_WATCH_COUNT_V1),
    ]
1866
+
1867
+
1868
class c_dcgmJobInfo_v3(_PrintableStructure):
    """Job information (version 3): a summary record plus one record per GPU."""

    _fields_ = [
        ("version", c_uint32),
        ("numGpus", c_int32),
        ("summary", c_dcgmGpuUsageInfo_t),
        # Fixed-capacity per-GPU array.
        ("gpus", c_dcgmGpuUsageInfo_t * DCGM_MAX_NUM_DEVICES),
    ]


dcgmJobInfo_version3 = make_dcgm_version(c_dcgmJobInfo_v3, 3)
1878
+
1879
+
1880
class c_dcgmDiagTestResult_v2(_PrintableStructure):
    """Result of one diagnostic test: result value, error detail and info text."""

    _fields_ = [
        ("result", c_uint),
        ("error", c_dcgmDiagErrorDetail_t),
        # 1024-byte info buffer
        ("info", c_char * 1024),
    ]
1886
+
1887
+
1888
class c_dcgmDiagResponsePerGpu_v4(_PrintableStructure):
    """Diagnostic results for a single GPU."""

    _fields_ = [
        ("gpuId", c_uint),
        ("hwDiagnosticReturn", c_uint),
        # One result slot per per-GPU test.
        ("results", c_dcgmDiagTestResult_v2 * DCGM_PER_GPU_TEST_COUNT_V8),
    ]
1894
+
1895
+
1896
DCGM_SWTEST_COUNT = 10
# Capacity of c_dcgmDiagResponse_v8.levelOneResults.
LEVEL_ONE_MAX_RESULTS = 16


class c_dcgmDiagResponse_v8(_PrintableStructure):
    """Diagnostic response (version 8)."""

    _fields_ = [
        ("version", c_uint),
        ("gpuCount", c_uint),
        # Level-one results (count plus fixed-capacity array).
        ("levelOneTestCount", c_uint),
        ("levelOneResults", c_dcgmDiagTestResult_v2 * LEVEL_ONE_MAX_RESULTS),
        # Fixed-capacity per-GPU array.
        ("perGpuResponses", c_dcgmDiagResponsePerGpu_v4 * DCGM_MAX_NUM_DEVICES),
        ("systemError", c_dcgmDiagErrorDetail_t),
        ("_unused", c_char * 1024),
    ]


dcgmDiagResponse_version8 = make_dcgm_version(c_dcgmDiagResponse_v8, 8)
1913
+
1914
# Number of elements in the CPU affinity mask arrays of the topology structures.
DCGM_AFFINITY_BITMASK_ARRAY_SIZE = 8


class c_dcgmDeviceTopologyPath_t(_PrintableStructure):
    """Topology path entry: gpuId, path and local NvLink IDs."""

    _fields_ = [
        ("gpuId", c_uint32),
        ("path", c_uint32),
        ("localNvLinkIds", c_uint32),
    ]
1919
+
1920
+
1921
class c_dcgmDeviceTopology_v1(_PrintableStructure):
    """Device topology (version 1): CPU affinity mask plus GPU paths."""

    _fields_ = [
        ("version", c_uint32),
        ("cpuAffinityMask", c_ulong * DCGM_AFFINITY_BITMASK_ARRAY_SIZE),
        ("numGpus", c_uint32),
        # Array capacity is one less than the maximum number of devices.
        ("gpuPaths", c_dcgmDeviceTopologyPath_t * (DCGM_MAX_NUM_DEVICES - 1)),
    ]


dcgmDeviceTopology_version1 = make_dcgm_version(c_dcgmDeviceTopology_v1, 1)
1931
+
1932
+
1933
class c_dcgmGroupTopology_v1(_PrintableStructure):
    """Group topology (version 1)."""

    _fields_ = [
        ("version", c_uint32),
        ("groupCpuAffinityMask", c_ulong * DCGM_AFFINITY_BITMASK_ARRAY_SIZE),
        ("numaOptimalFlag", c_uint32),
        ("slowestPath", c_uint32),
    ]


dcgmGroupTopology_version1 = make_dcgm_version(c_dcgmGroupTopology_v1, 1)
1943
+
1944
# Upper bound on how many field groups can exist.
DCGM_MAX_NUM_FIELD_GROUPS = 64

# Upper bound on how many field IDs a single field group can hold.
DCGM_MAX_FIELD_IDS_PER_FIELD_GROUP = 128


class c_dcgmFieldGroupInfo_v1(_PrintableStructure):
    """Field group description (version 1): id, name and member field IDs."""

    _fields_ = [
        ("version", c_uint32),
        ("numFieldIds", c_uint32),
        ("fieldGroupId", c_void_p),
        ("fieldGroupName", c_char * DCGM_MAX_STR_LENGTH),
        # Fixed-capacity field ID array.
        ("fieldIds", c_uint16 * DCGM_MAX_FIELD_IDS_PER_FIELD_GROUP),
    ]


dcgmFieldGroupInfo_version1 = make_dcgm_version(c_dcgmFieldGroupInfo_v1, 1)
1962
+
1963
+
1964
class c_dcgmAllFieldGroup_v1(_PrintableStructure):
    """Collection of field group descriptions (version 1)."""

    _fields_ = [
        ("version", c_uint32),
        ("numFieldGroups", c_uint32),
        # Fixed-capacity field group array.
        ("fieldGroups", c_dcgmFieldGroupInfo_v1 * DCGM_MAX_NUM_FIELD_GROUPS),
    ]


dcgmAllFieldGroup_version1 = make_dcgm_version(c_dcgmAllFieldGroup_v1, 1)
1973
+
1974
+
1975
class c_dcgmIntrospectMemory_v1(_PrintableStructure):
    """Introspection memory usage (version 1)."""

    _fields_ = [
        ("version", c_uint32),
        # Total number of bytes used to store all of the fields being watched.
        ("bytesUsed", c_longlong),
    ]


dcgmIntrospectMemory_version1 = make_dcgm_version(c_dcgmIntrospectMemory_v1, 1)
1986
+
1987
+
1988
class c_dcgmIntrospectCpuUtil_v1(_PrintableStructure):
    """Introspection CPU utilization (version 1)."""

    _fields_ = [
        # Version number (dcgmIntrospectCpuUtil_version)
        ("version", c_uint32),
        # Fraction of the device's CPU resources that were used
        ("total", c_double),
        # Fraction of the device's CPU resources that were used in kernel mode
        ("kernel", c_double),
        # Fraction of the device's CPU resources that were used in user mode
        ("user", c_double),
    ]


dcgmIntrospectCpuUtil_version1 = make_dcgm_version(c_dcgmIntrospectCpuUtil_v1, 1)
2004
+
2005
# Size limits used by the run-diagnostic structure
DCGM_MAX_CONFIG_FILE_LEN = 10000
DCGM_MAX_TEST_NAMES = 20
DCGM_MAX_TEST_NAMES_LEN = 50
DCGM_MAX_TEST_PARMS = 100
DCGM_MAX_TEST_PARMS_LEN = 100
DCGM_GPU_LIST_LEN = 50
DCGM_FILE_LEN = 30
DCGM_PATH_LEN = 128
DCGM_THROTTLE_MASK_LEN = 50

# Flag bits for running the GPU diagnostic
DCGM_RUN_FLAGS_VERBOSE = 1 << 0
DCGM_RUN_FLAGS_STATSONFAIL = 1 << 1
DCGM_RUN_FLAGS_TRAIN = 1 << 2  # UNUSED
DCGM_RUN_FLAGS_FORCE_TRAIN = 1 << 3  # UNUSED
# Enable fail early checks for the Targeted Stress, Targeted Power, SM Stress,
# and Diagnostic tests
DCGM_RUN_FLAGS_FAIL_EARLY = 1 << 4
2023
+
2024
+
2025
class c_dcgmRunDiag_v7(_PrintableStructure):
    """Request structure for running the GPU diagnostic (version 7)."""

    _fields_ = [
        # Version of this message
        ("version", c_uint),
        # Flags specifying binary options for running it. Currently verbose
        # and stats on fail
        ("flags", c_uint),
        # 0-5 for the debug level the GPU diagnostic will use for logging
        ("debugLevel", c_uint),
        # Group of GPUs to verify. Cannot be specified together with gpuList.
        ("groupId", c_void_p),
        # 0-3 for which tests to run. Optional.
        ("validate", c_uint),
        # Specified list of test names. Optional.
        # NOTE(review): in ctypes `c_char * A * B` is an array of B elements,
        # each a `c_char * A`; confirm this row/column order matches the C
        # declaration of testNames.
        ("testNames", c_char * DCGM_MAX_TEST_NAMES * DCGM_MAX_TEST_NAMES_LEN),
        # Parameters to set for specified tests in the format:
        # testName.parameterName=parameterValue. Optional.
        ("testParms", c_char * DCGM_MAX_TEST_PARMS * DCGM_MAX_TEST_PARMS_LEN),
        # Comma-separated list of fake gpus. Cannot be specified with the
        # groupId or gpuList.
        ("fakeGpuList", c_char * DCGM_GPU_LIST_LEN),
        # Comma-separated list of gpus. Cannot be specified with the groupId.
        ("gpuList", c_char * DCGM_GPU_LIST_LEN),
        # Alternate name for the debug log file that should be used
        ("debugLogFile", c_char * DCGM_PATH_LEN),
        # Path that the plugin's statistics files should be written to
        ("statsPath", c_char * DCGM_PATH_LEN),
        # Contents of nvvs config file (likely yaml)
        ("configFileContents", c_char * DCGM_MAX_CONFIG_FILE_LEN),
        # Throttle reasons to ignore as either integer mask or csv list of
        # reasons
        ("throttleMask", c_char * DCGM_THROTTLE_MASK_LEN),
        # Custom path to the diagnostic plugins
        ("pluginPath", c_char * DCGM_PATH_LEN),
        # Unused members
        ("_unusedInt1", c_uint),
        ("_unusedInt2", c_uint),
        ("_unusedInt3", c_uint),
        ("_unusedBuf", c_char * DCGM_PATH_LEN),
        # How often the fail early checks should occur when
        # DCGM_RUN_FLAGS_FAIL_EARLY is set.
        ("failCheckInterval", c_uint),
    ]


dcgmRunDiag_version7 = make_dcgm_version(c_dcgmRunDiag_v7, 7)

# Latest c_dcgmRunDiag class
c_dcgmRunDiag_t = c_dcgmRunDiag_v7

# Latest version for dcgmRunDiag_t
dcgmRunDiag_version = dcgmRunDiag_version7
2092
+
2093
# Flags for the flags parameter of dcgmGetEntityGroupEntities
# Only return entities that are supported by DCGM.
DCGM_GEGE_FLAG_ONLY_SUPPORTED = 0x1

# GPU NVLink error types returned by DCGM_FI_DEV_GPU_NVLINK_ERRORS
# NVLink link recovery error occurred
DCGM_GPU_NVLINK_ERROR_RECOVERY_REQUIRED = 1
# NVLink link fatal error occurred
DCGM_GPU_NVLINK_ERROR_FATAL = 2

# Topology hints for dcgmSelectGpusByTopology()
# No hints specified
DCGM_TOPO_HINT_F_NONE = 0x0
# Ignore the health of the GPUs when picking GPUs for job execution.
# By default, only healthy GPUs are considered.
DCGM_TOPO_HINT_F_IGNOREHEALTH = 0x1
2108
+
2109
+
2110
class c_dcgmTopoSchedHint_v1(_PrintableStructure):
    """Topology scheduling hint request (version 1)."""

    _fields_ = [
        # Version of this message
        ("version", c_uint),
        # Bitmask of the GPU ids to choose from
        ("inputGpuIds", c_uint64),
        # The number of GPUs that DCGM should choose
        ("numGpus", c_uint32),
        # Hints to ignore certain factors for the scheduling hint
        ("hintFlags", c_uint64),
    ]


dcgmTopoSchedHint_version1 = make_dcgm_version(c_dcgmTopoSchedHint_v1, 1)
2123
+
2124
# DCGM NvLink link states stored in the linkState member of
# c_dcgmNvLinkGpuLinkStatus_v1 & 2 and c_dcgmNvLinkNvSwitchLinkStatus_t

# NvLink is unsupported by this GPU (Default for GPUs)
DcgmNvLinkLinkStateNotSupported = 0
# NvLink is supported for this link but this link is disabled
# (Default for NvSwitches)
DcgmNvLinkLinkStateDisabled = 1
# This NvLink link is down (inactive)
DcgmNvLinkLinkStateDown = 2
# This NvLink link is up (active)
DcgmNvLinkLinkStateUp = 3
2131
+
2132
+
2133
class c_dcgmNvLinkGpuLinkStatus_v1(_PrintableStructure):
    """State of the NvLink links of one GPU (first legacy link count)."""

    _fields_ = [
        # Entity ID of the GPU (gpuId)
        ("entityId", c_uint32),
        # Link state of each link of this GPU
        ("linkState", c_uint32 * DCGM_NVLINK_MAX_LINKS_PER_GPU_LEGACY1),
    ]
2142
+
2143
+
2144
class c_dcgmNvLinkGpuLinkStatus_v2(_PrintableStructure):
    """State of the NvLink links of one GPU (second legacy link count)."""

    _fields_ = [
        # Entity ID of the GPU (gpuId)
        ("entityId", c_uint32),
        # Link state of each link of this GPU
        ("linkState", c_uint32 * DCGM_NVLINK_MAX_LINKS_PER_GPU_LEGACY2),
    ]
2153
+
2154
+
2155
class c_dcgmNvLinkGpuLinkStatus_v3(_PrintableStructure):
    """State of the NvLink links of one GPU (current link count)."""

    _fields_ = [
        # Entity ID of the GPU (gpuId)
        ("entityId", c_uint32),
        # Link state of each link of this GPU
        ("linkState", c_uint32 * DCGM_NVLINK_MAX_LINKS_PER_GPU),
    ]
2163
+
2164
+
2165
class c_dcgmNvLinkNvSwitchLinkStatus_v1(_PrintableStructure):
    """State of the NvLink links of one NvSwitch (version 1 link count)."""

    _fields_ = [
        # Entity ID of the NvSwitch (physicalId)
        ("entityId", c_uint32),
        # Link state of each link of this NvSwitch
        ("linkState", c_uint32 * DCGM_NVLINK_MAX_LINKS_PER_NVSWITCH_V1),
    ]
2174
+
2175
+
2176
class c_dcgmNvLinkStatus_v2(_PrintableStructure):
    """
    NvLink link status for all GPUs and NvSwitches in the system
    """

    _fields_ = [
        # Version of this message (dcgmNvLinkStatus_version2)
        ("version", c_uint32),
        # Number of GPUs populated in gpus[]
        ("numGpus", c_uint32),
        # Per-GPU NvLink link statuses
        ("gpus", c_dcgmNvLinkGpuLinkStatus_v2 * DCGM_MAX_NUM_DEVICES),
        # Number of NvSwitches populated in nvSwitches[]
        ("numNvSwitches", c_uint32),
        # Per-NvSwitch NvLink link statuses
        ("nvSwitches", c_dcgmNvLinkNvSwitchLinkStatus_v1 * DCGM_MAX_NUM_SWITCHES),
    ]


dcgmNvLinkStatus_version2 = make_dcgm_version(c_dcgmNvLinkStatus_v2, 2)
2200
+
2201
+
2202
class c_dcgmNvLinkNvSwitchLinkStatus_v2(_PrintableStructure):
    """State of the NvLink links of one NvSwitch (current link count)."""

    _fields_ = [
        # Entity ID of the NvSwitch (physicalId)
        ("entityId", c_uint32),
        # Link state of each link of this NvSwitch
        ("linkState", c_uint32 * DCGM_NVLINK_MAX_LINKS_PER_NVSWITCH),
    ]
2211
+
2212
+
2213
class c_dcgmNvLinkStatus_v3(_PrintableStructure):
    """
    NvLink link status for all GPUs and NvSwitches in the system
    """

    _fields_ = [
        # Version of this message (dcgmNvLinkStatus_version3)
        ("version", c_uint32),
        # Number of GPUs populated in gpus[]
        ("numGpus", c_uint32),
        # Per-GPU NvLink link statuses
        ("gpus", c_dcgmNvLinkGpuLinkStatus_v3 * DCGM_MAX_NUM_DEVICES),
        # Number of NvSwitches populated in nvSwitches[]
        ("numNvSwitches", c_uint32),
        # Per-NvSwitch NvLink link statuses
        ("nvSwitches", c_dcgmNvLinkNvSwitchLinkStatus_v2 * DCGM_MAX_NUM_SWITCHES),
    ]


dcgmNvLinkStatus_version3 = make_dcgm_version(c_dcgmNvLinkStatus_v3, 3)
2237
+
2238
# Bitmask values for dcgmGetFieldIdSummary (one bit per summary type)
DCGM_SUMMARY_MIN = 1 << 0
DCGM_SUMMARY_MAX = 1 << 1
DCGM_SUMMARY_AVG = 1 << 2
DCGM_SUMMARY_SUM = 1 << 3
DCGM_SUMMARY_COUNT = 1 << 4
DCGM_SUMMARY_INTEGRAL = 1 << 5
DCGM_SUMMARY_DIFF = 1 << 6
# Number of summary types listed above (not a bit).
DCGM_SUMMARY_SIZE = 7
2247
+
2248
+
2249
class c_dcgmSummaryResponse_t(_PrintableStructure):
    """Summary response: field type, count and one value slot per summary type."""

    class ResponseValue(DcgmUnion):
        """A summary value, as either a 64-bit integer or a double."""

        _fields_ = [("i64", c_int64), ("dbl", c_double)]

    _fields_ = [
        ("fieldType", c_uint),
        ("summaryCount", c_uint),
        ("values", ResponseValue * DCGM_SUMMARY_SIZE),
    ]
2262
+
2263
+
2264
class c_dcgmFieldSummaryRequest_v1(_PrintableStructure):
    """Field summary request (version 1); the response is embedded as the last member."""

    _fields_ = [
        ("version", c_uint),
        # What to summarize
        ("fieldId", c_ushort),
        ("entityGroupType", c_uint32),
        ("entityId", c_uint),
        ("summaryTypeMask", c_uint32),
        # Time window
        ("startTime", c_uint64),
        ("endTime", c_uint64),
        ("response", c_dcgmSummaryResponse_t),
    ]


dcgmFieldSummaryRequest_version1 = make_dcgm_version(c_dcgmFieldSummaryRequest_v1, 1)
2278
+
2279
# Module IDs
(
    DcgmModuleIdCore,  # Core DCGM
    DcgmModuleIdNvSwitch,  # NvSwitch Module
    DcgmModuleIdVGPU,  # VGPU Module
    DcgmModuleIdIntrospect,  # Introspection Module
    DcgmModuleIdHealth,  # Health Module
    DcgmModuleIdPolicy,  # Policy Module
    DcgmModuleIdConfig,  # Config Module
    DcgmModuleIdDiag,  # GPU Diagnostic Module
    DcgmModuleIdProfiling,  # Profiling Module
    DcgmModuleIdCount,  # 1 greater than largest ID above
) = range(10)

# Module Status
(
    DcgmModuleStatusNotLoaded,  # Module has not been loaded yet
    DcgmModuleStatusDenylisted,  # Module is on the denylist so it can't be loaded
    DcgmModuleStatusFailed,  # Loading the module failed
    DcgmModuleStatusLoaded,  # Module has been loaded
    DcgmModuleStatusUnloaded,  # Module has been unloaded
    DcgmModuleStatusPaused,  # Module has been paused. Implies it's been loaded
) = range(6)

# Capacity of the statuses array in c_dcgmModuleGetStatuses_v1.
DCGM_MODULE_STATUSES_CAPACITY = 16
2302
+
2303
+
2304
class c_dcgmModuleGetStatusesModule_t(_PrintableStructure):
    """Status of a single module."""

    _fields_ = [
        # One of DcgmModuleId*
        ("id", c_uint32),
        # One of DcgmModuleStatus*
        ("status", c_uint32),
    ]
2309
+
2310
+
2311
class c_dcgmModuleGetStatuses_v1(_PrintableStructure):
    """Module statuses (version 1): a count plus a fixed-capacity status array."""

    _fields_ = [
        ("version", c_uint),
        ("numStatuses", c_uint32),
        ("statuses", c_dcgmModuleGetStatusesModule_t * DCGM_MODULE_STATUSES_CAPACITY),
    ]


dcgmModuleGetStatuses_version1 = make_dcgm_version(c_dcgmModuleGetStatuses_v1, 1)
2320
+
2321
# Maximum number of metric ID groups that can exist in DCGM
DCGM_PROF_MAX_NUM_GROUPS_V2 = 10
# Maximum number of field IDs that can be in a single DCGM profiling metric group
DCGM_PROF_MAX_FIELD_IDS_PER_GROUP_V2 = 64
2325
+
2326
+
2327
class c_dcgmProfMetricGroupInfo_v2(_PrintableStructure):
    """One profiling metric group: major/minor id plus its field IDs."""

    _fields_ = [
        ("majorId", c_ushort),
        ("minorId", c_ushort),
        ("numFieldIds", c_uint32),
        # Fixed-capacity field ID array.
        ("fieldIds", c_ushort * DCGM_PROF_MAX_FIELD_IDS_PER_GROUP_V2),
    ]
2334
+
2335
+
2336
class c_dcgmProfGetMetricGroups_v3(_PrintableStructure):
    """Profiling metric groups of a GPU (version 3)."""

    _fields_ = [
        ("version", c_uint32),
        ("unused", c_uint32),
        ("gpuId", c_uint32),
        ("numMetricGroups", c_uint32),
        # Fixed-capacity metric group array.
        ("metricGroups", c_dcgmProfMetricGroupInfo_v2 * DCGM_PROF_MAX_NUM_GROUPS_V2),
    ]


dcgmProfGetMetricGroups_version3 = make_dcgm_version(c_dcgmProfGetMetricGroups_v3, 3)
2347
+
2348
+
2349
class c_dcgmVersionInfo_v2(_PrintableStructure):
    """Version information (version 2): the raw build info string."""

    _fields_ = [
        ("version", c_uint32),
        # Buffer twice the standard DCGM string length.
        ("rawBuildInfoString", c_char * (DCGM_MAX_STR_LENGTH * 2)),
    ]


dcgmVersionInfo_version2 = make_dcgm_version(c_dcgmVersionInfo_v2, 2)
# Unversioned alias for the version-2 constant.
dcgmVersionInfo_version = dcgmVersionInfo_version2