triton-model-analyzer 1.48.0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (204) hide show
  1. model_analyzer/__init__.py +15 -0
  2. model_analyzer/analyzer.py +448 -0
  3. model_analyzer/cli/__init__.py +15 -0
  4. model_analyzer/cli/cli.py +193 -0
  5. model_analyzer/config/__init__.py +15 -0
  6. model_analyzer/config/generate/__init__.py +15 -0
  7. model_analyzer/config/generate/automatic_model_config_generator.py +164 -0
  8. model_analyzer/config/generate/base_model_config_generator.py +352 -0
  9. model_analyzer/config/generate/brute_plus_binary_parameter_search_run_config_generator.py +164 -0
  10. model_analyzer/config/generate/brute_run_config_generator.py +154 -0
  11. model_analyzer/config/generate/concurrency_sweeper.py +75 -0
  12. model_analyzer/config/generate/config_generator_interface.py +52 -0
  13. model_analyzer/config/generate/coordinate.py +143 -0
  14. model_analyzer/config/generate/coordinate_data.py +86 -0
  15. model_analyzer/config/generate/generator_utils.py +116 -0
  16. model_analyzer/config/generate/manual_model_config_generator.py +187 -0
  17. model_analyzer/config/generate/model_config_generator_factory.py +92 -0
  18. model_analyzer/config/generate/model_profile_spec.py +74 -0
  19. model_analyzer/config/generate/model_run_config_generator.py +154 -0
  20. model_analyzer/config/generate/model_variant_name_manager.py +150 -0
  21. model_analyzer/config/generate/neighborhood.py +536 -0
  22. model_analyzer/config/generate/optuna_plus_concurrency_sweep_run_config_generator.py +141 -0
  23. model_analyzer/config/generate/optuna_run_config_generator.py +838 -0
  24. model_analyzer/config/generate/perf_analyzer_config_generator.py +312 -0
  25. model_analyzer/config/generate/quick_plus_concurrency_sweep_run_config_generator.py +130 -0
  26. model_analyzer/config/generate/quick_run_config_generator.py +753 -0
  27. model_analyzer/config/generate/run_config_generator_factory.py +329 -0
  28. model_analyzer/config/generate/search_config.py +112 -0
  29. model_analyzer/config/generate/search_dimension.py +73 -0
  30. model_analyzer/config/generate/search_dimensions.py +85 -0
  31. model_analyzer/config/generate/search_parameter.py +49 -0
  32. model_analyzer/config/generate/search_parameters.py +388 -0
  33. model_analyzer/config/input/__init__.py +15 -0
  34. model_analyzer/config/input/config_command.py +483 -0
  35. model_analyzer/config/input/config_command_profile.py +1747 -0
  36. model_analyzer/config/input/config_command_report.py +267 -0
  37. model_analyzer/config/input/config_defaults.py +236 -0
  38. model_analyzer/config/input/config_enum.py +83 -0
  39. model_analyzer/config/input/config_field.py +216 -0
  40. model_analyzer/config/input/config_list_generic.py +112 -0
  41. model_analyzer/config/input/config_list_numeric.py +151 -0
  42. model_analyzer/config/input/config_list_string.py +111 -0
  43. model_analyzer/config/input/config_none.py +71 -0
  44. model_analyzer/config/input/config_object.py +129 -0
  45. model_analyzer/config/input/config_primitive.py +81 -0
  46. model_analyzer/config/input/config_status.py +75 -0
  47. model_analyzer/config/input/config_sweep.py +83 -0
  48. model_analyzer/config/input/config_union.py +113 -0
  49. model_analyzer/config/input/config_utils.py +128 -0
  50. model_analyzer/config/input/config_value.py +243 -0
  51. model_analyzer/config/input/objects/__init__.py +15 -0
  52. model_analyzer/config/input/objects/config_model_profile_spec.py +325 -0
  53. model_analyzer/config/input/objects/config_model_report_spec.py +173 -0
  54. model_analyzer/config/input/objects/config_plot.py +198 -0
  55. model_analyzer/config/input/objects/config_protobuf_utils.py +101 -0
  56. model_analyzer/config/input/yaml_config_validator.py +82 -0
  57. model_analyzer/config/run/__init__.py +15 -0
  58. model_analyzer/config/run/model_run_config.py +313 -0
  59. model_analyzer/config/run/run_config.py +168 -0
  60. model_analyzer/constants.py +76 -0
  61. model_analyzer/device/__init__.py +15 -0
  62. model_analyzer/device/device.py +24 -0
  63. model_analyzer/device/gpu_device.py +87 -0
  64. model_analyzer/device/gpu_device_factory.py +248 -0
  65. model_analyzer/entrypoint.py +307 -0
  66. model_analyzer/log_formatter.py +65 -0
  67. model_analyzer/model_analyzer_exceptions.py +24 -0
  68. model_analyzer/model_manager.py +255 -0
  69. model_analyzer/monitor/__init__.py +15 -0
  70. model_analyzer/monitor/cpu_monitor.py +69 -0
  71. model_analyzer/monitor/dcgm/DcgmDiag.py +191 -0
  72. model_analyzer/monitor/dcgm/DcgmFieldGroup.py +83 -0
  73. model_analyzer/monitor/dcgm/DcgmGroup.py +815 -0
  74. model_analyzer/monitor/dcgm/DcgmHandle.py +141 -0
  75. model_analyzer/monitor/dcgm/DcgmJsonReader.py +69 -0
  76. model_analyzer/monitor/dcgm/DcgmReader.py +623 -0
  77. model_analyzer/monitor/dcgm/DcgmStatus.py +57 -0
  78. model_analyzer/monitor/dcgm/DcgmSystem.py +412 -0
  79. model_analyzer/monitor/dcgm/__init__.py +15 -0
  80. model_analyzer/monitor/dcgm/common/__init__.py +13 -0
  81. model_analyzer/monitor/dcgm/common/dcgm_client_cli_parser.py +194 -0
  82. model_analyzer/monitor/dcgm/common/dcgm_client_main.py +86 -0
  83. model_analyzer/monitor/dcgm/dcgm_agent.py +887 -0
  84. model_analyzer/monitor/dcgm/dcgm_collectd_plugin.py +369 -0
  85. model_analyzer/monitor/dcgm/dcgm_errors.py +395 -0
  86. model_analyzer/monitor/dcgm/dcgm_field_helpers.py +546 -0
  87. model_analyzer/monitor/dcgm/dcgm_fields.py +815 -0
  88. model_analyzer/monitor/dcgm/dcgm_fields_collectd.py +671 -0
  89. model_analyzer/monitor/dcgm/dcgm_fields_internal.py +29 -0
  90. model_analyzer/monitor/dcgm/dcgm_fluentd.py +45 -0
  91. model_analyzer/monitor/dcgm/dcgm_monitor.py +138 -0
  92. model_analyzer/monitor/dcgm/dcgm_prometheus.py +326 -0
  93. model_analyzer/monitor/dcgm/dcgm_structs.py +2357 -0
  94. model_analyzer/monitor/dcgm/dcgm_telegraf.py +65 -0
  95. model_analyzer/monitor/dcgm/dcgm_value.py +151 -0
  96. model_analyzer/monitor/dcgm/dcgmvalue.py +155 -0
  97. model_analyzer/monitor/dcgm/denylist_recommendations.py +573 -0
  98. model_analyzer/monitor/dcgm/pydcgm.py +47 -0
  99. model_analyzer/monitor/monitor.py +143 -0
  100. model_analyzer/monitor/remote_monitor.py +137 -0
  101. model_analyzer/output/__init__.py +15 -0
  102. model_analyzer/output/file_writer.py +63 -0
  103. model_analyzer/output/output_writer.py +42 -0
  104. model_analyzer/perf_analyzer/__init__.py +15 -0
  105. model_analyzer/perf_analyzer/genai_perf_config.py +206 -0
  106. model_analyzer/perf_analyzer/perf_analyzer.py +882 -0
  107. model_analyzer/perf_analyzer/perf_config.py +479 -0
  108. model_analyzer/plots/__init__.py +15 -0
  109. model_analyzer/plots/detailed_plot.py +266 -0
  110. model_analyzer/plots/plot_manager.py +224 -0
  111. model_analyzer/plots/simple_plot.py +213 -0
  112. model_analyzer/record/__init__.py +15 -0
  113. model_analyzer/record/gpu_record.py +68 -0
  114. model_analyzer/record/metrics_manager.py +887 -0
  115. model_analyzer/record/record.py +280 -0
  116. model_analyzer/record/record_aggregator.py +256 -0
  117. model_analyzer/record/types/__init__.py +15 -0
  118. model_analyzer/record/types/cpu_available_ram.py +93 -0
  119. model_analyzer/record/types/cpu_used_ram.py +93 -0
  120. model_analyzer/record/types/gpu_free_memory.py +96 -0
  121. model_analyzer/record/types/gpu_power_usage.py +107 -0
  122. model_analyzer/record/types/gpu_total_memory.py +96 -0
  123. model_analyzer/record/types/gpu_used_memory.py +96 -0
  124. model_analyzer/record/types/gpu_utilization.py +108 -0
  125. model_analyzer/record/types/inter_token_latency_avg.py +60 -0
  126. model_analyzer/record/types/inter_token_latency_base.py +74 -0
  127. model_analyzer/record/types/inter_token_latency_max.py +60 -0
  128. model_analyzer/record/types/inter_token_latency_min.py +60 -0
  129. model_analyzer/record/types/inter_token_latency_p25.py +60 -0
  130. model_analyzer/record/types/inter_token_latency_p50.py +60 -0
  131. model_analyzer/record/types/inter_token_latency_p75.py +60 -0
  132. model_analyzer/record/types/inter_token_latency_p90.py +60 -0
  133. model_analyzer/record/types/inter_token_latency_p95.py +60 -0
  134. model_analyzer/record/types/inter_token_latency_p99.py +60 -0
  135. model_analyzer/record/types/output_token_throughput.py +105 -0
  136. model_analyzer/record/types/perf_client_response_wait.py +97 -0
  137. model_analyzer/record/types/perf_client_send_recv.py +97 -0
  138. model_analyzer/record/types/perf_latency.py +111 -0
  139. model_analyzer/record/types/perf_latency_avg.py +60 -0
  140. model_analyzer/record/types/perf_latency_base.py +74 -0
  141. model_analyzer/record/types/perf_latency_p90.py +60 -0
  142. model_analyzer/record/types/perf_latency_p95.py +60 -0
  143. model_analyzer/record/types/perf_latency_p99.py +60 -0
  144. model_analyzer/record/types/perf_server_compute_infer.py +97 -0
  145. model_analyzer/record/types/perf_server_compute_input.py +97 -0
  146. model_analyzer/record/types/perf_server_compute_output.py +97 -0
  147. model_analyzer/record/types/perf_server_queue.py +97 -0
  148. model_analyzer/record/types/perf_throughput.py +105 -0
  149. model_analyzer/record/types/time_to_first_token_avg.py +60 -0
  150. model_analyzer/record/types/time_to_first_token_base.py +74 -0
  151. model_analyzer/record/types/time_to_first_token_max.py +60 -0
  152. model_analyzer/record/types/time_to_first_token_min.py +60 -0
  153. model_analyzer/record/types/time_to_first_token_p25.py +60 -0
  154. model_analyzer/record/types/time_to_first_token_p50.py +60 -0
  155. model_analyzer/record/types/time_to_first_token_p75.py +60 -0
  156. model_analyzer/record/types/time_to_first_token_p90.py +60 -0
  157. model_analyzer/record/types/time_to_first_token_p95.py +60 -0
  158. model_analyzer/record/types/time_to_first_token_p99.py +60 -0
  159. model_analyzer/reports/__init__.py +15 -0
  160. model_analyzer/reports/html_report.py +195 -0
  161. model_analyzer/reports/pdf_report.py +50 -0
  162. model_analyzer/reports/report.py +86 -0
  163. model_analyzer/reports/report_factory.py +62 -0
  164. model_analyzer/reports/report_manager.py +1376 -0
  165. model_analyzer/reports/report_utils.py +42 -0
  166. model_analyzer/result/__init__.py +15 -0
  167. model_analyzer/result/constraint_manager.py +150 -0
  168. model_analyzer/result/model_config_measurement.py +354 -0
  169. model_analyzer/result/model_constraints.py +105 -0
  170. model_analyzer/result/parameter_search.py +246 -0
  171. model_analyzer/result/result_manager.py +430 -0
  172. model_analyzer/result/result_statistics.py +159 -0
  173. model_analyzer/result/result_table.py +217 -0
  174. model_analyzer/result/result_table_manager.py +646 -0
  175. model_analyzer/result/result_utils.py +42 -0
  176. model_analyzer/result/results.py +277 -0
  177. model_analyzer/result/run_config_measurement.py +658 -0
  178. model_analyzer/result/run_config_result.py +210 -0
  179. model_analyzer/result/run_config_result_comparator.py +110 -0
  180. model_analyzer/result/sorted_results.py +151 -0
  181. model_analyzer/state/__init__.py +15 -0
  182. model_analyzer/state/analyzer_state.py +76 -0
  183. model_analyzer/state/analyzer_state_manager.py +215 -0
  184. model_analyzer/triton/__init__.py +15 -0
  185. model_analyzer/triton/client/__init__.py +15 -0
  186. model_analyzer/triton/client/client.py +234 -0
  187. model_analyzer/triton/client/client_factory.py +57 -0
  188. model_analyzer/triton/client/grpc_client.py +104 -0
  189. model_analyzer/triton/client/http_client.py +107 -0
  190. model_analyzer/triton/model/__init__.py +15 -0
  191. model_analyzer/triton/model/model_config.py +556 -0
  192. model_analyzer/triton/model/model_config_variant.py +29 -0
  193. model_analyzer/triton/server/__init__.py +15 -0
  194. model_analyzer/triton/server/server.py +76 -0
  195. model_analyzer/triton/server/server_config.py +269 -0
  196. model_analyzer/triton/server/server_docker.py +229 -0
  197. model_analyzer/triton/server/server_factory.py +306 -0
  198. model_analyzer/triton/server/server_local.py +158 -0
  199. triton_model_analyzer-1.48.0.dist-info/METADATA +52 -0
  200. triton_model_analyzer-1.48.0.dist-info/RECORD +204 -0
  201. triton_model_analyzer-1.48.0.dist-info/WHEEL +5 -0
  202. triton_model_analyzer-1.48.0.dist-info/entry_points.txt +2 -0
  203. triton_model_analyzer-1.48.0.dist-info/licenses/LICENSE +67 -0
  204. triton_model_analyzer-1.48.0.dist-info/top_level.txt +1 -0
@@ -0,0 +1,815 @@
1
+ # Copyright (c) 2022, NVIDIA CORPORATION & AFFILIATES. All rights reserved.
2
+ #
3
+ # Licensed under the Apache License, Version 2.0 (the "License");
4
+ # you may not use this file except in compliance with the License.
5
+ # You may obtain a copy of the License at
6
+ #
7
+ # http://www.apache.org/licenses/LICENSE-2.0
8
+ #
9
+ # Unless required by applicable law or agreed to in writing, software
10
+ # distributed under the License is distributed on an "AS IS" BASIS,
11
+ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12
+ # See the License for the specific language governing permissions and
13
+ # limitations under the License.
14
+ ##
15
+ # Python bindings for the internal API of DCGM library (dcgm_fields.h)
16
+ ##
17
+
18
+ from ctypes import *
19
+ from ctypes.util import find_library
20
+ import model_analyzer.monitor.dcgm.dcgm_structs as dcgm_structs
21
+ from typing import Dict
22
+
23
+ # Provides access to functions
24
+ dcgmFP = dcgm_structs._dcgmGetFunctionPointer
25
+
26
+ # Field Types are a single byte. List these in ASCII order
27
+ DCGM_FT_BINARY = "b" # Blob of binary data representing a structure
28
+ DCGM_FT_DOUBLE = "d" # 8-byte double precision
29
+ DCGM_FT_INT64 = "i" # 8-byte signed integer
30
+ DCGM_FT_STRING = "s" # Null-terminated ASCII Character string
31
+ DCGM_FT_TIMESTAMP = "t" # 8-byte signed integer usec since 1970
32
+
33
+ # Field scope. What are these fields associated with
34
+ DCGM_FS_GLOBAL = 0 # Field is global (ex: driver version)
35
+ DCGM_FS_ENTITY = 1 # Field is associated with an entity (GPU, VGPU, ..etc)
36
+ DCGM_FS_DEVICE = (
37
+ DCGM_FS_ENTITY # Field is associated with a device. Deprecated. Use DCGM_FS_ENTITY
38
+ )
39
+
40
+ # DCGM_FI_DEV_CLOCK_THROTTLE_REASONS is a bitmap of why the clock is throttled.
41
+ # These macros are masks for relevant throttling, and are a 1:1 map to the NVML
42
+ # reasons documented in nvml.h. The notes for the header are copied below:
43
+
44
+ # Nothing is running on the GPU and the clocks are dropping to Idle state
45
+ DCGM_CLOCKS_THROTTLE_REASON_GPU_IDLE = 0x0000000000000001
46
+
47
+ # GPU clocks are limited by current setting of applications clocks
48
+ DCGM_CLOCKS_THROTTLE_REASON_CLOCKS_SETTING = 0x0000000000000002
49
+
50
+ # SW Power Scaling algorithm is reducing the clocks below requested clocks
51
+ DCGM_CLOCKS_THROTTLE_REASON_SW_POWER_CAP = 0x0000000000000004
52
+
53
+ # HW Slowdown (reducing the core clocks by a factor of 2 or more) is engaged
54
+ #
55
+ # This is an indicator of:
56
+ # - temperature being too high
57
+ # - External Power Brake Assertion is triggered (e.g. by the system power supply)
58
+ # - Power draw is too high and Fast Trigger protection is reducing the clocks
59
+ # - May be also reported during PState or clock change
60
+ # - This behavior may be removed in a later release.
61
+
62
+ DCGM_CLOCKS_THROTTLE_REASON_HW_SLOWDOWN = 0x0000000000000008
63
+
64
+ # Sync Boost
65
+ #
66
+ # This GPU has been added to a Sync boost group with nvidia-smi or DCGM in
67
+ # order to maximize performance per watt. All GPUs in the sync boost group
68
+ # will boost to the minimum possible clocks across the entire group. Look at
69
+ # the throttle reasons for other GPUs in the system to see why those GPUs are
70
+ # holding this one at lower clocks.
71
+ DCGM_CLOCKS_THROTTLE_REASON_SYNC_BOOST = 0x0000000000000010
72
+
73
+ # SW Thermal Slowdown
74
+ #
75
+ # This is an indicator of one or more of the following:
76
+ # - Current GPU temperature above the GPU Max Operating Temperature
77
+ # - Current memory temperature above the Memory Max Operating Temperature
78
+ DCGM_CLOCKS_THROTTLE_REASON_SW_THERMAL = 0x0000000000000020
79
+
80
+ # HW Thermal Slowdown (reducing the core clocks by a factor of 2 or more) is engaged
81
+ #
82
+ # This is an indicator of:
83
+ # - temperature being too high
84
+ DCGM_CLOCKS_THROTTLE_REASON_HW_THERMAL = 0x0000000000000040
85
+
86
+ # HW Power Brake Slowdown (reducing the core clocks by a factor of 2 or more) is engaged
87
+ #
88
+ # This is an indicator of:
89
+ # - External Power Brake Assertion being triggered (e.g. by the system power supply)
90
+ DCGM_CLOCKS_THROTTLE_REASON_HW_POWER_BRAKE = 0x0000000000000080
91
+
92
+ # GPU clocks are limited by current setting of Display clocks
93
+ DCGM_CLOCKS_THROTTLE_REASON_DISPLAY_CLOCKS = 0x0000000000000100
94
+
95
+ # Field entity groups. Which type of entity is this field or field value associated with
96
+ DCGM_FE_NONE = (
97
+ 0 # Field is not associated with an entity. Field scope should be DCGM_FS_GLOBAL
98
+ )
99
+ DCGM_FE_GPU = 1 # Field is associated with a GPU entity
100
+ DCGM_FE_VGPU = 2 # Field is associated with a VGPU entity
101
+ DCGM_FE_SWITCH = 3 # Field is associated with a Switch entity
102
+ DCGM_FE_GPU_I = 4 # Field is associated with a GPU Instance entity
103
+ DCGM_FE_GPU_CI = 5 # Field is associated with a GPU Compute Instance entity
104
+ DCGM_FE_LINK = 6 # Field is associated with an NVLINK
105
+
106
+ c_dcgm_field_eid_t = c_uint32 # Represents an identifier for an entity within a field entity. For instance, this is the gpuId for DCGM_FE_GPU.
107
+
108
+ # System attributes
109
+ DCGM_FI_UNKNOWN = 0
110
+ DCGM_FI_DRIVER_VERSION = 1 # Driver Version
111
+ DCGM_FI_NVML_VERSION = 2 # Underlying NVML version
112
+ DCGM_FI_PROCESS_NAME = (
113
+ 3 # Process Name. Will be nv-hostengine or your process's name in embedded mode
114
+ )
115
+ DCGM_FI_DEV_COUNT = 4 # Number of Devices on the node
116
+ DCGM_FI_CUDA_DRIVER_VERSION = 5 # Cuda Driver Version as an integer. CUDA 11.1 = 11100
117
+ # Device attributes
118
+ DCGM_FI_DEV_NAME = 50 # Name of the GPU device
119
+ DCGM_FI_DEV_BRAND = 51 # Device Brand
120
+ DCGM_FI_DEV_NVML_INDEX = 52 # NVML index of this GPU
121
+ DCGM_FI_DEV_SERIAL = 53 # Device Serial Number
122
+ DCGM_FI_DEV_UUID = 54 # UUID corresponding to the device
123
+ DCGM_FI_DEV_MINOR_NUMBER = 55 # Device node minor number /dev/nvidia#
124
+ DCGM_FI_DEV_OEM_INFOROM_VER = 56 # OEM inforom version
125
+ DCGM_FI_DEV_PCI_BUSID = 57 # PCI attributes for the device
126
+ DCGM_FI_DEV_PCI_COMBINED_ID = 58 # The combined 16-bit device id and 16-bit vendor id
127
+ DCGM_FI_DEV_PCI_SUBSYS_ID = 59 # The 32-bit Sub System Device ID
128
+ DCGM_FI_GPU_TOPOLOGY_PCI = 60 # Topology of all GPUs on the system via PCI (static)
129
+ DCGM_FI_GPU_TOPOLOGY_NVLINK = (
130
+ 61 # Topology of all GPUs on the system via NVLINK (static)
131
+ )
132
+ DCGM_FI_GPU_TOPOLOGY_AFFINITY = 62 # Affinity of all GPUs on the system (static)
133
+ DCGM_FI_DEV_CUDA_COMPUTE_CAPABILITY = 63 # Cuda compute capability for the device
134
+ DCGM_FI_DEV_COMPUTE_MODE = 65 # Compute mode for the device
135
+ DCGM_FI_DEV_PERSISTENCE_MODE = 66 # Persistence mode for the device
136
+ DCGM_FI_DEV_MIG_MODE = 67 # MIG mode for the device
137
+ DCGM_FI_DEV_CUDA_VISIBLE_DEVICES_STR = (
138
+ 68 # String value for CUDA_VISIBLE_DEVICES for the device
139
+ )
140
+ DCGM_FI_DEV_MIG_MAX_SLICES = 69 # The maximum number of slices this GPU supports
141
+ DCGM_FI_DEV_CPU_AFFINITY_0 = 70 # Device CPU affinity. part 1/8 = cpus 0 - 63
142
+ DCGM_FI_DEV_CPU_AFFINITY_1 = 71 # Device CPU affinity. part 2/8 = cpus 64 - 127
143
+ DCGM_FI_DEV_CPU_AFFINITY_2 = 72 # Device CPU affinity. part 3/8 = cpus 128 - 191
144
+ DCGM_FI_DEV_CPU_AFFINITY_3 = 73 # Device CPU affinity. part 4/8 = cpus 192 - 255
145
+ DCGM_FI_DEV_CC_MODE = 74 # Device CC/APM mode
146
+ DCGM_FI_DEV_MIG_ATTRIBUTES = 75 # MIG device attributes
147
+ DCGM_FI_DEV_MIG_GI_INFO = 76 # GPU instance profile information
148
+ DCGM_FI_DEV_MIG_CI_INFO = 77 # Compute instance profile information
149
+ DCGM_FI_DEV_ECC_INFOROM_VER = 80 # ECC inforom version
150
+ DCGM_FI_DEV_POWER_INFOROM_VER = 81 # Power management object inforom version
151
+ DCGM_FI_DEV_INFOROM_IMAGE_VER = 82 # Inforom image version
152
+ DCGM_FI_DEV_INFOROM_CONFIG_CHECK = 83 # Inforom configuration checksum
153
+ DCGM_FI_DEV_INFOROM_CONFIG_VALID = (
154
+ 84 # Reads the infoROM from the flash and verifies the checksums
155
+ )
156
+ DCGM_FI_DEV_VBIOS_VERSION = 85 # VBIOS version of the device
157
+ DCGM_FI_DEV_BAR1_TOTAL = 90 # Total BAR1 of the GPU
158
+ DCGM_FI_SYNC_BOOST = 91 # Deprecated - Sync boost settings on the node
159
+ DCGM_FI_DEV_BAR1_USED = 92 # Used BAR1 of the GPU in MB
160
+ DCGM_FI_DEV_BAR1_FREE = 93 # Free BAR1 of the GPU in MB
161
+ # Clocks and power
162
+ DCGM_FI_DEV_SM_CLOCK = 100 # SM clock for the device
163
+ DCGM_FI_DEV_MEM_CLOCK = 101 # Memory clock for the device
164
+ DCGM_FI_DEV_VIDEO_CLOCK = 102 # Video encoder/decoder clock for the device
165
+ DCGM_FI_DEV_APP_SM_CLOCK = 110 # SM Application clocks
166
+ DCGM_FI_DEV_APP_MEM_CLOCK = 111 # Memory Application clocks
167
+ DCGM_FI_DEV_CLOCK_THROTTLE_REASONS = (
168
+ 112 # Current clock throttle reasons (bitmask of DCGM_CLOCKS_THROTTLE_REASON_*)
169
+ )
170
+ DCGM_FI_DEV_MAX_SM_CLOCK = 113 # Maximum supported SM clock for the device
171
+ DCGM_FI_DEV_MAX_MEM_CLOCK = 114 # Maximum supported Memory clock for the device
172
+ DCGM_FI_DEV_MAX_VIDEO_CLOCK = (
173
+ 115 # Maximum supported Video encoder/decoder clock for the device
174
+ )
175
+ DCGM_FI_DEV_AUTOBOOST = 120 # Auto-boost for the device (1 = enabled. 0 = disabled)
176
+ DCGM_FI_DEV_SUPPORTED_CLOCKS = 130 # Supported clocks for the device
177
+ DCGM_FI_DEV_MEMORY_TEMP = 140 # Memory temperature for the device
178
+ DCGM_FI_DEV_GPU_TEMP = 150 # Current temperature readings for the device, in degrees C
179
+ DCGM_FI_DEV_MEM_MAX_OP_TEMP = (
180
+ 151 # Maximum operating temperature for the memory of this GPU
181
+ )
182
+ DCGM_FI_DEV_GPU_MAX_OP_TEMP = 152 # Maximum operating temperature for this GPU
183
+ DCGM_FI_DEV_POWER_USAGE = 155 # Power usage for the device in Watts
184
+ DCGM_FI_DEV_TOTAL_ENERGY_CONSUMPTION = (
185
+ 156 # Total energy consumption for the GPU in mJ since the driver was last reloaded
186
+ )
187
+ DCGM_FI_DEV_SLOWDOWN_TEMP = 158 # Slowdown temperature for the device
188
+ DCGM_FI_DEV_SHUTDOWN_TEMP = 159 # Shutdown temperature for the device
189
+ DCGM_FI_DEV_POWER_MGMT_LIMIT = 160 # Current Power limit for the device
190
+ DCGM_FI_DEV_POWER_MGMT_LIMIT_MIN = 161 # Minimum power management limit for the device
191
+ DCGM_FI_DEV_POWER_MGMT_LIMIT_MAX = 162 # Maximum power management limit for the device
192
+ DCGM_FI_DEV_POWER_MGMT_LIMIT_DEF = 163 # Default power management limit for the device
193
+ DCGM_FI_DEV_ENFORCED_POWER_LIMIT = 164 # Effective power limit that the driver enforces after taking into account all limiters
194
+ DCGM_FI_DEV_PSTATE = 190 # Performance state (P-State) 0-15. 0=highest
195
+ DCGM_FI_DEV_FAN_SPEED = 191 # Fan speed for the device in percent 0-100
196
+ # Device utilization and telemetry
197
+ DCGM_FI_DEV_PCIE_TX_THROUGHPUT = 200 # Deprecated - PCIe Tx utilization information
198
+ DCGM_FI_DEV_PCIE_RX_THROUGHPUT = 201 # Deprecated - PCIe Rx utilization information
199
+ DCGM_FI_DEV_PCIE_REPLAY_COUNTER = 202 # PCIe replay counter
200
+ DCGM_FI_DEV_GPU_UTIL = 203 # GPU Utilization
201
+ DCGM_FI_DEV_MEM_COPY_UTIL = 204 # Memory Utilization
202
+ DCGM_FI_DEV_ACCOUNTING_DATA = 205 # Process accounting stats
203
+ DCGM_FI_DEV_ENC_UTIL = 206 # Encoder utilization
204
+ DCGM_FI_DEV_DEC_UTIL = 207 # Decoder utilization
205
+ # Fields 210, 211, 220, and 221 are internal-only. See dcgm_fields_internal.py
206
+ DCGM_FI_DEV_XID_ERRORS = 230 # XID errors. The value is the specific XID error
207
+ DCGM_FI_DEV_PCIE_MAX_LINK_GEN = 235 # PCIe Max Link Generation
208
+ DCGM_FI_DEV_PCIE_MAX_LINK_WIDTH = 236 # PCIe Max Link Width
209
+ DCGM_FI_DEV_PCIE_LINK_GEN = 237 # PCIe Current Link Generation
210
+ DCGM_FI_DEV_PCIE_LINK_WIDTH = 238 # PCIe Current Link Width
211
+ # Violation counters
212
+ DCGM_FI_DEV_POWER_VIOLATION = 240 # Power Violation time in usec
213
+ DCGM_FI_DEV_THERMAL_VIOLATION = 241 # Thermal Violation time in usec
214
+ DCGM_FI_DEV_SYNC_BOOST_VIOLATION = 242 # Sync Boost Violation time in usec
215
+ DCGM_FI_DEV_BOARD_LIMIT_VIOLATION = 243 # Board Limit Violation time in usec.
216
+ DCGM_FI_DEV_LOW_UTIL_VIOLATION = 244 # Low Utilization Violation time in usec.
217
+ DCGM_FI_DEV_RELIABILITY_VIOLATION = 245 # Reliability Violation time in usec.
218
+ DCGM_FI_DEV_TOTAL_APP_CLOCKS_VIOLATION = 246 # App Clocks Violation time in usec.
219
+ DCGM_FI_DEV_TOTAL_BASE_CLOCKS_VIOLATION = 247 # Base Clocks Violation time in usec.
220
+ # Framebuffer usage
221
+ DCGM_FI_DEV_FB_TOTAL = 250 # Total framebuffer memory in MB
222
+ DCGM_FI_DEV_FB_FREE = 251 # Total framebuffer free in MB
223
+ DCGM_FI_DEV_FB_USED = 252 # Total framebuffer used in MB
224
+ DCGM_FI_DEV_FB_RESERVED = 253 # Total framebuffer reserved in MB
225
+ # Device ECC Counters
226
+ DCGM_FI_DEV_ECC_CURRENT = 300 # Current ECC mode for the device
227
+ DCGM_FI_DEV_ECC_PENDING = 301 # Pending ECC mode for the device
228
+ DCGM_FI_DEV_ECC_SBE_VOL_TOTAL = 310 # Total single bit volatile ecc errors
229
+ DCGM_FI_DEV_ECC_DBE_VOL_TOTAL = 311 # Total double bit volatile ecc errors
230
+ DCGM_FI_DEV_ECC_SBE_AGG_TOTAL = (
231
+ 312 # Total single bit aggregate (persistent) ecc errors
232
+ )
233
+ DCGM_FI_DEV_ECC_DBE_AGG_TOTAL = (
234
+ 313 # Total double bit aggregate (persistent) ecc errors
235
+ )
236
+ DCGM_FI_DEV_ECC_SBE_VOL_L1 = 314 # L1 cache single bit volatile ecc errors
237
+ DCGM_FI_DEV_ECC_DBE_VOL_L1 = 315 # L1 cache double bit volatile ecc errors
238
+ DCGM_FI_DEV_ECC_SBE_VOL_L2 = 316 # L2 cache single bit volatile ecc errors
239
+ DCGM_FI_DEV_ECC_DBE_VOL_L2 = 317 # L2 cache double bit volatile ecc errors
240
+ DCGM_FI_DEV_ECC_SBE_VOL_DEV = 318 # Device memory single bit volatile ecc errors
241
+ DCGM_FI_DEV_ECC_DBE_VOL_DEV = 319 # Device memory double bit volatile ecc errors
242
+ DCGM_FI_DEV_ECC_SBE_VOL_REG = 320 # Register file single bit volatile ecc errors
243
+ DCGM_FI_DEV_ECC_DBE_VOL_REG = 321 # Register file double bit volatile ecc errors
244
+ DCGM_FI_DEV_ECC_SBE_VOL_TEX = 322 # Texture memory single bit volatile ecc errors
245
+ DCGM_FI_DEV_ECC_DBE_VOL_TEX = 323 # Texture memory double bit volatile ecc errors
246
+ DCGM_FI_DEV_ECC_SBE_AGG_L1 = (
247
+ 324 # L1 cache single bit aggregate (persistent) ecc errors
248
+ )
249
+ DCGM_FI_DEV_ECC_DBE_AGG_L1 = (
250
+ 325 # L1 cache double bit aggregate (persistent) ecc errors
251
+ )
252
+ DCGM_FI_DEV_ECC_SBE_AGG_L2 = (
253
+ 326 # L2 cache single bit aggregate (persistent) ecc errors
254
+ )
255
+ DCGM_FI_DEV_ECC_DBE_AGG_L2 = (
256
+ 327 # L2 cache double bit aggregate (persistent) ecc errors
257
+ )
258
+ DCGM_FI_DEV_ECC_SBE_AGG_DEV = (
259
+ 328 # Device memory single bit aggregate (persistent) ecc errors
260
+ )
261
+ DCGM_FI_DEV_ECC_DBE_AGG_DEV = (
262
+ 329 # Device memory double bit aggregate (persistent) ecc errors
263
+ )
264
+ DCGM_FI_DEV_ECC_SBE_AGG_REG = (
265
+ 330 # Register File single bit aggregate (persistent) ecc errors
266
+ )
267
+ DCGM_FI_DEV_ECC_DBE_AGG_REG = (
268
+ 331 # Register File double bit aggregate (persistent) ecc errors
269
+ )
270
+ DCGM_FI_DEV_ECC_SBE_AGG_TEX = (
271
+ 332 # Texture memory single bit aggregate (persistent) ecc errors
272
+ )
273
+ DCGM_FI_DEV_ECC_DBE_AGG_TEX = (
274
+ 333 # Texture memory double bit aggregate (persistent) ecc errors
275
+ )
276
+ DCGM_FI_DEV_RETIRED_SBE = 390 # Number of retired pages because of single bit errors
277
+ DCGM_FI_DEV_RETIRED_DBE = 391 # Number of retired pages because of double bit errors
278
+ DCGM_FI_DEV_RETIRED_PENDING = 392 # Number of pages pending retirement
279
+ # Row remapper fields (Ampere and newer)
280
+ DCGM_FI_DEV_UNCORRECTABLE_REMAPPED_ROWS = (
281
+ 393 # Number of remapped rows for uncorrectable errors
282
+ )
283
+ DCGM_FI_DEV_CORRECTABLE_REMAPPED_ROWS = (
284
+ 394 # Number of remapped rows for correctable errors
285
+ )
286
+ DCGM_FI_DEV_ROW_REMAP_FAILURE = 395 # Whether remapping of rows has failed
287
+ DCGM_FI_DEV_ROW_REMAP_PENDING = 396 # Whether remapping of rows is pending
288
+
289
+ # Device NvLink Bandwidth and Error Counters
290
+ DCGM_FI_DEV_NVLINK_CRC_FLIT_ERROR_COUNT_L0 = (
291
+ 400 # NV Link flow control CRC Error Counter for Lane 0
292
+ )
293
+ DCGM_FI_DEV_NVLINK_CRC_FLIT_ERROR_COUNT_L1 = (
294
+ 401 # NV Link flow control CRC Error Counter for Lane 1
295
+ )
296
+ DCGM_FI_DEV_NVLINK_CRC_FLIT_ERROR_COUNT_L2 = (
297
+ 402 # NV Link flow control CRC Error Counter for Lane 2
298
+ )
299
+ DCGM_FI_DEV_NVLINK_CRC_FLIT_ERROR_COUNT_L3 = (
300
+ 403 # NV Link flow control CRC Error Counter for Lane 3
301
+ )
302
+ DCGM_FI_DEV_NVLINK_CRC_FLIT_ERROR_COUNT_L4 = (
303
+ 404 # NV Link flow control CRC Error Counter for Lane 4
304
+ )
305
+ DCGM_FI_DEV_NVLINK_CRC_FLIT_ERROR_COUNT_L5 = (
306
+ 405 # NV Link flow control CRC Error Counter for Lane 5
307
+ )
308
+ DCGM_FI_DEV_NVLINK_CRC_FLIT_ERROR_COUNT_TOTAL = (
309
+ 409 # NV Link flow control CRC Error Counter total for all Lanes
310
+ )
311
+ DCGM_FI_DEV_NVLINK_CRC_DATA_ERROR_COUNT_L0 = (
312
+ 410 # NV Link data CRC Error Counter for Lane 0
313
+ )
314
+ DCGM_FI_DEV_NVLINK_CRC_DATA_ERROR_COUNT_L1 = (
315
+ 411 # NV Link data CRC Error Counter for Lane 1
316
+ )
317
+ DCGM_FI_DEV_NVLINK_CRC_DATA_ERROR_COUNT_L2 = (
318
+ 412 # NV Link data CRC Error Counter for Lane 2
319
+ )
320
+ DCGM_FI_DEV_NVLINK_CRC_DATA_ERROR_COUNT_L3 = (
321
+ 413 # NV Link data CRC Error Counter for Lane 3
322
+ )
323
+ DCGM_FI_DEV_NVLINK_CRC_DATA_ERROR_COUNT_L4 = (
324
+ 414 # NV Link data CRC Error Counter for Lane 4
325
+ )
326
+ DCGM_FI_DEV_NVLINK_CRC_DATA_ERROR_COUNT_L5 = (
327
+ 415 # NV Link data CRC Error Counter for Lane 5
328
+ )
329
+ DCGM_FI_DEV_NVLINK_CRC_DATA_ERROR_COUNT_TOTAL = (
330
+ 419 # NV Link data CRC Error Counter total for all Lanes
331
+ )
332
+ DCGM_FI_DEV_NVLINK_REPLAY_ERROR_COUNT_L0 = (
333
+ 420 # NV Link Replay Error Counter for Lane 0
334
+ )
335
+ DCGM_FI_DEV_NVLINK_REPLAY_ERROR_COUNT_L1 = (
336
+ 421 # NV Link Replay Error Counter for Lane 1
337
+ )
338
+ DCGM_FI_DEV_NVLINK_REPLAY_ERROR_COUNT_L2 = (
339
+ 422 # NV Link Replay Error Counter for Lane 2
340
+ )
341
+ DCGM_FI_DEV_NVLINK_REPLAY_ERROR_COUNT_L3 = (
342
+ 423 # NV Link Replay Error Counter for Lane 3
343
+ )
344
+ DCGM_FI_DEV_NVLINK_REPLAY_ERROR_COUNT_L4 = (
345
+ 424 # NV Link Replay Error Counter for Lane 4
346
+ )
347
+ DCGM_FI_DEV_NVLINK_REPLAY_ERROR_COUNT_L5 = (
348
+ 425 # NV Link Replay Error Counter for Lane 5
349
+ )
350
+ DCGM_FI_DEV_NVLINK_REPLAY_ERROR_COUNT_TOTAL = (
351
+ 429 # NV Link Replay Error Counter total for all Lanes
352
+ )
353
+
354
# NV Link Recovery Error Counter for Lane 0
DCGM_FI_DEV_NVLINK_RECOVERY_ERROR_COUNT_L0 = 430
# NV Link Recovery Error Counter for Lane 1
DCGM_FI_DEV_NVLINK_RECOVERY_ERROR_COUNT_L1 = 431
# NV Link Recovery Error Counter for Lane 2
DCGM_FI_DEV_NVLINK_RECOVERY_ERROR_COUNT_L2 = 432
# NV Link Recovery Error Counter for Lane 3
DCGM_FI_DEV_NVLINK_RECOVERY_ERROR_COUNT_L3 = 433
# NV Link Recovery Error Counter for Lane 4
DCGM_FI_DEV_NVLINK_RECOVERY_ERROR_COUNT_L4 = 434
# NV Link Recovery Error Counter for Lane 5
DCGM_FI_DEV_NVLINK_RECOVERY_ERROR_COUNT_L5 = 435
# NV Link Recovery Error Counter total for all Lanes
DCGM_FI_DEV_NVLINK_RECOVERY_ERROR_COUNT_TOTAL = 439
375
# NV Link Bandwidth Counter for Lane 0
DCGM_FI_DEV_NVLINK_BANDWIDTH_L0 = 440
# NV Link Bandwidth Counter for Lane 1
DCGM_FI_DEV_NVLINK_BANDWIDTH_L1 = 441
# NV Link Bandwidth Counter for Lane 2
DCGM_FI_DEV_NVLINK_BANDWIDTH_L2 = 442
# NV Link Bandwidth Counter for Lane 3
DCGM_FI_DEV_NVLINK_BANDWIDTH_L3 = 443
# NV Link Bandwidth Counter for Lane 4
DCGM_FI_DEV_NVLINK_BANDWIDTH_L4 = 444
# NV Link Bandwidth Counter for Lane 5
DCGM_FI_DEV_NVLINK_BANDWIDTH_L5 = 445
# NV Link Bandwidth Counter total for all Lanes
DCGM_FI_DEV_NVLINK_BANDWIDTH_TOTAL = 449
# GPU NVLink error information
DCGM_FI_DEV_GPU_NVLINK_ERRORS = 450
385
# Per-lane NVLink counters for lanes 6 through 17.
# The IDs are not contiguous within a family: in each one, lanes 12-14 use
# three IDs just below that family's *_TOTAL field, and lanes 15-17 use
# IDs in the 481..496 range.

# NV Link flow control CRC error counters
DCGM_FI_DEV_NVLINK_CRC_FLIT_ERROR_COUNT_L6 = 451
DCGM_FI_DEV_NVLINK_CRC_FLIT_ERROR_COUNT_L7 = 452
DCGM_FI_DEV_NVLINK_CRC_FLIT_ERROR_COUNT_L8 = 453
DCGM_FI_DEV_NVLINK_CRC_FLIT_ERROR_COUNT_L9 = 454
DCGM_FI_DEV_NVLINK_CRC_FLIT_ERROR_COUNT_L10 = 455
DCGM_FI_DEV_NVLINK_CRC_FLIT_ERROR_COUNT_L11 = 456
DCGM_FI_DEV_NVLINK_CRC_FLIT_ERROR_COUNT_L12 = 406
DCGM_FI_DEV_NVLINK_CRC_FLIT_ERROR_COUNT_L13 = 407
DCGM_FI_DEV_NVLINK_CRC_FLIT_ERROR_COUNT_L14 = 408
DCGM_FI_DEV_NVLINK_CRC_FLIT_ERROR_COUNT_L15 = 481
DCGM_FI_DEV_NVLINK_CRC_FLIT_ERROR_COUNT_L16 = 482
DCGM_FI_DEV_NVLINK_CRC_FLIT_ERROR_COUNT_L17 = 483

# NV Link data CRC error counters
DCGM_FI_DEV_NVLINK_CRC_DATA_ERROR_COUNT_L6 = 457
DCGM_FI_DEV_NVLINK_CRC_DATA_ERROR_COUNT_L7 = 458
DCGM_FI_DEV_NVLINK_CRC_DATA_ERROR_COUNT_L8 = 459
DCGM_FI_DEV_NVLINK_CRC_DATA_ERROR_COUNT_L9 = 460
DCGM_FI_DEV_NVLINK_CRC_DATA_ERROR_COUNT_L10 = 461
DCGM_FI_DEV_NVLINK_CRC_DATA_ERROR_COUNT_L11 = 462
DCGM_FI_DEV_NVLINK_CRC_DATA_ERROR_COUNT_L12 = 416
DCGM_FI_DEV_NVLINK_CRC_DATA_ERROR_COUNT_L13 = 417
DCGM_FI_DEV_NVLINK_CRC_DATA_ERROR_COUNT_L14 = 418
DCGM_FI_DEV_NVLINK_CRC_DATA_ERROR_COUNT_L15 = 484
DCGM_FI_DEV_NVLINK_CRC_DATA_ERROR_COUNT_L16 = 485
DCGM_FI_DEV_NVLINK_CRC_DATA_ERROR_COUNT_L17 = 486

# NV Link replay error counters
DCGM_FI_DEV_NVLINK_REPLAY_ERROR_COUNT_L6 = 463
DCGM_FI_DEV_NVLINK_REPLAY_ERROR_COUNT_L7 = 464
DCGM_FI_DEV_NVLINK_REPLAY_ERROR_COUNT_L8 = 465
DCGM_FI_DEV_NVLINK_REPLAY_ERROR_COUNT_L9 = 466
DCGM_FI_DEV_NVLINK_REPLAY_ERROR_COUNT_L10 = 467
DCGM_FI_DEV_NVLINK_REPLAY_ERROR_COUNT_L11 = 468
DCGM_FI_DEV_NVLINK_REPLAY_ERROR_COUNT_L12 = 426
DCGM_FI_DEV_NVLINK_REPLAY_ERROR_COUNT_L13 = 427
DCGM_FI_DEV_NVLINK_REPLAY_ERROR_COUNT_L14 = 428
DCGM_FI_DEV_NVLINK_REPLAY_ERROR_COUNT_L15 = 487
DCGM_FI_DEV_NVLINK_REPLAY_ERROR_COUNT_L16 = 488
DCGM_FI_DEV_NVLINK_REPLAY_ERROR_COUNT_L17 = 489

# NV Link recovery error counters
DCGM_FI_DEV_NVLINK_RECOVERY_ERROR_COUNT_L6 = 469
DCGM_FI_DEV_NVLINK_RECOVERY_ERROR_COUNT_L7 = 470
DCGM_FI_DEV_NVLINK_RECOVERY_ERROR_COUNT_L8 = 471
DCGM_FI_DEV_NVLINK_RECOVERY_ERROR_COUNT_L9 = 472
DCGM_FI_DEV_NVLINK_RECOVERY_ERROR_COUNT_L10 = 473
DCGM_FI_DEV_NVLINK_RECOVERY_ERROR_COUNT_L11 = 474
DCGM_FI_DEV_NVLINK_RECOVERY_ERROR_COUNT_L12 = 436
DCGM_FI_DEV_NVLINK_RECOVERY_ERROR_COUNT_L13 = 437
DCGM_FI_DEV_NVLINK_RECOVERY_ERROR_COUNT_L14 = 438
DCGM_FI_DEV_NVLINK_RECOVERY_ERROR_COUNT_L15 = 491
DCGM_FI_DEV_NVLINK_RECOVERY_ERROR_COUNT_L16 = 492
DCGM_FI_DEV_NVLINK_RECOVERY_ERROR_COUNT_L17 = 493

# NV Link bandwidth counters
DCGM_FI_DEV_NVLINK_BANDWIDTH_L6 = 475
DCGM_FI_DEV_NVLINK_BANDWIDTH_L7 = 476
DCGM_FI_DEV_NVLINK_BANDWIDTH_L8 = 477
DCGM_FI_DEV_NVLINK_BANDWIDTH_L9 = 478
DCGM_FI_DEV_NVLINK_BANDWIDTH_L10 = 479
DCGM_FI_DEV_NVLINK_BANDWIDTH_L11 = 480
DCGM_FI_DEV_NVLINK_BANDWIDTH_L12 = 446
DCGM_FI_DEV_NVLINK_BANDWIDTH_L13 = 447
DCGM_FI_DEV_NVLINK_BANDWIDTH_L14 = 448
DCGM_FI_DEV_NVLINK_BANDWIDTH_L15 = 494
DCGM_FI_DEV_NVLINK_BANDWIDTH_L16 = 495
DCGM_FI_DEV_NVLINK_BANDWIDTH_L17 = 496
445
+
446
# Device Attributes associated with virtualization

# Operating mode of the GPU
DCGM_FI_DEV_VIRTUAL_MODE = 500
# Includes Count and Supported vGPU type information
DCGM_FI_DEV_SUPPORTED_TYPE_INFO = 501
# Includes Count and List of Creatable vGPU type IDs
DCGM_FI_DEV_CREATABLE_VGPU_TYPE_IDS = 502
# Includes Count and List of vGPU instance IDs
DCGM_FI_DEV_VGPU_INSTANCE_IDS = 503
# Utilization values for vGPUs running on the device
DCGM_FI_DEV_VGPU_UTILIZATIONS = 504
# Utilization values for processes running within vGPU VMs using the device
DCGM_FI_DEV_VGPU_PER_PROCESS_UTILIZATION = 505
# Current encoder statistics for a given device
DCGM_FI_DEV_ENC_STATS = 506
# Statistics of current active frame buffer capture sessions on a given device
DCGM_FI_DEV_FBC_STATS = 507
# Information about active frame buffer capture sessions on a target device
DCGM_FI_DEV_FBC_SESSIONS_INFO = 508
# Includes Count and currently Supported vGPU types on a device
DCGM_FI_DEV_SUPPORTED_VGPU_TYPE_IDS = 509
# Includes Static info of vGPU types supported on a device
DCGM_FI_DEV_VGPU_TYPE_INFO = 510
# Includes the name of a vGPU type supported on a device
DCGM_FI_DEV_VGPU_TYPE_NAME = 511
# Includes the class of a vGPU type supported on a device
DCGM_FI_DEV_VGPU_TYPE_CLASS = 512
# Includes the license info for a vGPU type supported on a device
DCGM_FI_DEV_VGPU_TYPE_LICENSE = 513
483
# Related to vGPU Instance IDs

# vGPU VM ID
DCGM_FI_DEV_VGPU_VM_ID = 520
# vGPU VM name
DCGM_FI_DEV_VGPU_VM_NAME = 521
# vGPU type of the vGPU instance
DCGM_FI_DEV_VGPU_TYPE = 522
# UUID of the vGPU instance
DCGM_FI_DEV_VGPU_UUID = 523
# Driver version of the vGPU instance
DCGM_FI_DEV_VGPU_DRIVER_VERSION = 524
# Memory usage of the vGPU instance
DCGM_FI_DEV_VGPU_MEMORY_USAGE = 525
# License status of the vGPU
DCGM_FI_DEV_VGPU_LICENSE_STATUS = 526
# Frame rate limit of the vGPU instance
DCGM_FI_DEV_VGPU_FRAME_RATE_LIMIT = 527
# Current encoder statistics of the vGPU instance
DCGM_FI_DEV_VGPU_ENC_STATS = 528
# Information about all active encoder sessions on the vGPU instance
DCGM_FI_DEV_VGPU_ENC_SESSIONS_INFO = 529
# Statistics of current active frame buffer capture sessions on the vGPU instance
DCGM_FI_DEV_VGPU_FBC_STATS = 530
# Information about active frame buffer capture sessions on the vGPU instance
DCGM_FI_DEV_VGPU_FBC_SESSIONS_INFO = 531
# License state information of the vGPU instance
DCGM_FI_DEV_VGPU_INSTANCE_LICENSE_STATE = 532
# PCI Id of the vGPU instance
DCGM_FI_DEV_VGPU_PCI_ID = 533
# GPU Instance Id of the vGPU instance
DCGM_FI_DEV_VGPU_VM_GPU_INSTANCE_ID = 534
505
# Internal fields reserve the range 600..699

# The fields below are related to NVSwitch.

# Starting field ID of the NVSwitch instance
DCGM_FI_FIRST_NVSWITCH_FIELD_ID = 700

# Per-link throughput and error counters
DCGM_FI_DEV_NVSWITCH_LINK_THROUGHPUT_TX = 780
DCGM_FI_DEV_NVSWITCH_LINK_THROUGHPUT_RX = 781
DCGM_FI_DEV_NVSWITCH_LINK_FATAL_ERRORS = 782
DCGM_FI_DEV_NVSWITCH_LINK_NON_FATAL_ERRORS = 783
DCGM_FI_DEV_NVSWITCH_LINK_REPLAY_ERRORS = 784
DCGM_FI_DEV_NVSWITCH_LINK_RECOVERY_ERRORS = 785
DCGM_FI_DEV_NVSWITCH_LINK_FLIT_ERRORS = 786
DCGM_FI_DEV_NVSWITCH_LINK_CRC_ERRORS = 787
DCGM_FI_DEV_NVSWITCH_LINK_ECC_ERRORS = 788

# Per-link latency fields, one per VC0..VC3
DCGM_FI_DEV_NVSWITCH_LINK_LATENCY_LOW_VC0 = 789
DCGM_FI_DEV_NVSWITCH_LINK_LATENCY_LOW_VC1 = 790
DCGM_FI_DEV_NVSWITCH_LINK_LATENCY_LOW_VC2 = 791
DCGM_FI_DEV_NVSWITCH_LINK_LATENCY_LOW_VC3 = 792
DCGM_FI_DEV_NVSWITCH_LINK_LATENCY_MEDIUM_VC0 = 793
DCGM_FI_DEV_NVSWITCH_LINK_LATENCY_MEDIUM_VC1 = 794
DCGM_FI_DEV_NVSWITCH_LINK_LATENCY_MEDIUM_VC2 = 795
DCGM_FI_DEV_NVSWITCH_LINK_LATENCY_MEDIUM_VC3 = 796
DCGM_FI_DEV_NVSWITCH_LINK_LATENCY_HIGH_VC0 = 797
DCGM_FI_DEV_NVSWITCH_LINK_LATENCY_HIGH_VC1 = 798
DCGM_FI_DEV_NVSWITCH_LINK_LATENCY_HIGH_VC2 = 799
DCGM_FI_DEV_NVSWITCH_LINK_LATENCY_HIGH_VC3 = 800
DCGM_FI_DEV_NVSWITCH_LINK_LATENCY_PANIC_VC0 = 801
DCGM_FI_DEV_NVSWITCH_LINK_LATENCY_PANIC_VC1 = 802
DCGM_FI_DEV_NVSWITCH_LINK_LATENCY_PANIC_VC2 = 803
DCGM_FI_DEV_NVSWITCH_LINK_LATENCY_PANIC_VC3 = 804
DCGM_FI_DEV_NVSWITCH_LINK_LATENCY_COUNT_VC0 = 805
DCGM_FI_DEV_NVSWITCH_LINK_LATENCY_COUNT_VC1 = 806
DCGM_FI_DEV_NVSWITCH_LINK_LATENCY_COUNT_VC2 = 807
DCGM_FI_DEV_NVSWITCH_LINK_LATENCY_COUNT_VC3 = 808

# Per-link, per-lane CRC and ECC error counters
DCGM_FI_DEV_NVSWITCH_LINK_CRC_ERRORS_LANE0 = 809
DCGM_FI_DEV_NVSWITCH_LINK_CRC_ERRORS_LANE1 = 810
DCGM_FI_DEV_NVSWITCH_LINK_CRC_ERRORS_LANE2 = 811
DCGM_FI_DEV_NVSWITCH_LINK_CRC_ERRORS_LANE3 = 812
DCGM_FI_DEV_NVSWITCH_LINK_ECC_ERRORS_LANE0 = 813
DCGM_FI_DEV_NVSWITCH_LINK_ECC_ERRORS_LANE1 = 814
DCGM_FI_DEV_NVSWITCH_LINK_ECC_ERRORS_LANE2 = 815
DCGM_FI_DEV_NVSWITCH_LINK_ECC_ERRORS_LANE3 = 816

# Switch-level errors, temperatures and throughput
DCGM_FI_DEV_NVSWITCH_FATAL_ERRORS = 856
DCGM_FI_DEV_NVSWITCH_NON_FATAL_ERRORS = 857
DCGM_FI_DEV_NVSWITCH_TEMPERATURE_CURRENT = 858
DCGM_FI_DEV_NVSWITCH_TEMPERATURE_LIMIT_SLOWDOWN = 859
DCGM_FI_DEV_NVSWITCH_TEMPERATURE_LIMIT_SHUTDOWN = 860
DCGM_FI_DEV_NVSWITCH_THROUGHPUT_TX = 861
DCGM_FI_DEV_NVSWITCH_THROUGHPUT_RX = 862

# Last field ID of the NVSwitch instance
DCGM_FI_LAST_NVSWITCH_FIELD_ID = 899
554
+ """
555
+ Profiling Fields
556
+ """
557
# Ratio of time the graphics engine is active. The graphics engine is
# active if a graphics/compute context is bound and the graphics pipe or
# compute pipe is busy.
DCGM_FI_PROF_GR_ENGINE_ACTIVE = 1001

# The ratio of cycles an SM has at least 1 warp assigned
# (computed from the number of cycles and elapsed cycles)
DCGM_FI_PROF_SM_ACTIVE = 1002

# The ratio of number of warps resident on an SM.
# (number of resident as a ratio of the theoretical
# maximum number of warps per elapsed cycle)
DCGM_FI_PROF_SM_OCCUPANCY = 1003

# The ratio of cycles any tensor pipe is active
# (off the peak sustained elapsed cycles)
DCGM_FI_PROF_PIPE_TENSOR_ACTIVE = 1004

# The ratio of cycles the device memory interface is active sending or
# receiving data.
DCGM_FI_PROF_DRAM_ACTIVE = 1005
# Ratio of cycles the fp64 pipe is active.
DCGM_FI_PROF_PIPE_FP64_ACTIVE = 1006
# Ratio of cycles the fp32 pipe is active.
DCGM_FI_PROF_PIPE_FP32_ACTIVE = 1007
# Ratio of cycles the fp16 pipe is active. This does not include HMMA.
DCGM_FI_PROF_PIPE_FP16_ACTIVE = 1008
# The number of bytes of active PCIe tx (transmit) data including both
# header and payload.
DCGM_FI_PROF_PCIE_TX_BYTES = 1009
# The number of bytes of active PCIe rx (read) data including both header
# and payload.
DCGM_FI_PROF_PCIE_RX_BYTES = 1010
# The number of bytes of active NvLink tx (transmit) data including both
# header and payload.
DCGM_FI_PROF_NVLINK_TX_BYTES = 1011
# The number of bytes of active NvLink rx (receive) data including both
# header and payload.
DCGM_FI_PROF_NVLINK_RX_BYTES = 1012
# The ratio of cycles the IMMA tensor pipe is active
# (off the peak sustained elapsed cycles)
DCGM_FI_PROF_PIPE_TENSOR_IMMA_ACTIVE = 1013
# The ratio of cycles the HMMA tensor pipe is active
# (off the peak sustained elapsed cycles)
DCGM_FI_PROF_PIPE_TENSOR_HMMA_ACTIVE = 1014
# The ratio of cycles the tensor (DFMA) pipe is active
# (off the peak sustained elapsed cycles)
DCGM_FI_PROF_PIPE_TENSOR_DFMA_ACTIVE = 1015
# Ratio of cycles the integer pipe is active.
DCGM_FI_PROF_PIPE_INT_ACTIVE = 1016

# Ratio of cycles each of the NVDEC engines are active.
DCGM_FI_PROF_NVDEC0_ACTIVE = 1017
DCGM_FI_PROF_NVDEC1_ACTIVE = 1018
DCGM_FI_PROF_NVDEC2_ACTIVE = 1019
DCGM_FI_PROF_NVDEC3_ACTIVE = 1020
DCGM_FI_PROF_NVDEC4_ACTIVE = 1021
DCGM_FI_PROF_NVDEC5_ACTIVE = 1022
DCGM_FI_PROF_NVDEC6_ACTIVE = 1023
DCGM_FI_PROF_NVDEC7_ACTIVE = 1024

# Ratio of cycles each of the NVJPG engines are active.
DCGM_FI_PROF_NVJPG0_ACTIVE = 1025
DCGM_FI_PROF_NVJPG1_ACTIVE = 1026
DCGM_FI_PROF_NVJPG2_ACTIVE = 1027
DCGM_FI_PROF_NVJPG3_ACTIVE = 1028
DCGM_FI_PROF_NVJPG4_ACTIVE = 1029
DCGM_FI_PROF_NVJPG5_ACTIVE = 1030
DCGM_FI_PROF_NVJPG6_ACTIVE = 1031
DCGM_FI_PROF_NVJPG7_ACTIVE = 1032

# Ratio of cycles each of the NVOFA engines are active.
DCGM_FI_PROF_NVOFA0_ACTIVE = 1033
612
+ """
613
+ The per-link number of bytes of active NvLink TX (transmit) or RX (receive) data including both header and payload.
614
+ For example: DCGM_FI_PROF_NVLINK_L0_TX_BYTES -> L0 TX
615
+ To get the bandwidth for a link, add the RX and TX value together like
616
+ total = DCGM_FI_PROF_NVLINK_L0_TX_BYTES + DCGM_FI_PROF_NVLINK_L0_RX_BYTES
617
+ """
618
# One TX/RX pair of field IDs per NvLink link, L0 through L17.
DCGM_FI_PROF_NVLINK_L0_TX_BYTES, DCGM_FI_PROF_NVLINK_L0_RX_BYTES = 1040, 1041
DCGM_FI_PROF_NVLINK_L1_TX_BYTES, DCGM_FI_PROF_NVLINK_L1_RX_BYTES = 1042, 1043
DCGM_FI_PROF_NVLINK_L2_TX_BYTES, DCGM_FI_PROF_NVLINK_L2_RX_BYTES = 1044, 1045
DCGM_FI_PROF_NVLINK_L3_TX_BYTES, DCGM_FI_PROF_NVLINK_L3_RX_BYTES = 1046, 1047
DCGM_FI_PROF_NVLINK_L4_TX_BYTES, DCGM_FI_PROF_NVLINK_L4_RX_BYTES = 1048, 1049
DCGM_FI_PROF_NVLINK_L5_TX_BYTES, DCGM_FI_PROF_NVLINK_L5_RX_BYTES = 1050, 1051
DCGM_FI_PROF_NVLINK_L6_TX_BYTES, DCGM_FI_PROF_NVLINK_L6_RX_BYTES = 1052, 1053
DCGM_FI_PROF_NVLINK_L7_TX_BYTES, DCGM_FI_PROF_NVLINK_L7_RX_BYTES = 1054, 1055
DCGM_FI_PROF_NVLINK_L8_TX_BYTES, DCGM_FI_PROF_NVLINK_L8_RX_BYTES = 1056, 1057
DCGM_FI_PROF_NVLINK_L9_TX_BYTES, DCGM_FI_PROF_NVLINK_L9_RX_BYTES = 1058, 1059
DCGM_FI_PROF_NVLINK_L10_TX_BYTES, DCGM_FI_PROF_NVLINK_L10_RX_BYTES = 1060, 1061
DCGM_FI_PROF_NVLINK_L11_TX_BYTES, DCGM_FI_PROF_NVLINK_L11_RX_BYTES = 1062, 1063
DCGM_FI_PROF_NVLINK_L12_TX_BYTES, DCGM_FI_PROF_NVLINK_L12_RX_BYTES = 1064, 1065
DCGM_FI_PROF_NVLINK_L13_TX_BYTES, DCGM_FI_PROF_NVLINK_L13_RX_BYTES = 1066, 1067
DCGM_FI_PROF_NVLINK_L14_TX_BYTES, DCGM_FI_PROF_NVLINK_L14_RX_BYTES = 1068, 1069
DCGM_FI_PROF_NVLINK_L15_TX_BYTES, DCGM_FI_PROF_NVLINK_L15_RX_BYTES = 1070, 1071
DCGM_FI_PROF_NVLINK_L16_TX_BYTES, DCGM_FI_PROF_NVLINK_L16_RX_BYTES = 1072, 1073
DCGM_FI_PROF_NVLINK_L17_TX_BYTES, DCGM_FI_PROF_NVLINK_L17_RX_BYTES = 1074, 1075

# First and last IDs of the per-link throughput range defined above.
DCGM_FI_PROF_NVLINK_THROUGHPUT_FIRST = DCGM_FI_PROF_NVLINK_L0_TX_BYTES
DCGM_FI_PROF_NVLINK_THROUGHPUT_LAST = DCGM_FI_PROF_NVLINK_L17_RX_BYTES

# greater than maximum fields above. This value can increase in the future
DCGM_FI_MAX_FIELDS = 1076
660
+
661
+
662
class struct_c_dcgm_field_meta_t(dcgm_structs._DcgmStructure):
    """Opaque handle for a struct_c_dcgm_field_meta_t; declares no fields."""


# Pointer type to the opaque field-meta structure above.
dcgm_field_meta_t = POINTER(struct_c_dcgm_field_meta_t)
668
+
669
+
670
class _PrintableStructure(dcgm_structs._DcgmStructure):
    """
    Base structure whose __str__ output is nicer than ctypes.Structure's.

    Instead of the default
      >>> print(str(obj))
      <class_name object at 0x7fdf82fef9e0>
    instances render as
      class_name(field_name: formatted_value, field_name: formatted_value)

    Subclasses may set ``_fmt_``, a dict mapping a ``_fields_`` name to a
    %-style format string. For example, a c_uint field named 'hex_value'
    can be shown in hex with
      _fmt_ = {"hex_value" : "%08X"}
    The key "<default>" supplies the format for every field that has no
    entry of its own, e.g.
      _fmt_ = {"<default>" : "%d MHz"} # all values are numbers in MHz.
    A field covered by neither is formatted with "%s".

    The exact format of the returned string is subject to change.
    """

    _fmt_: Dict = {}

    def __str__(self):
        formats = self._fmt_
        # Per-field format wins, then "<default>", then plain "%s".
        fallback = formats.get("<default>", "%s")
        rendered = [
            ("%s: " + formats.get(field[0], fallback))
            % (field[0], getattr(self, field[0]))
            for field in self._fields_
        ]
        return self.__class__.__name__ + "(" + ", ".join(rendered) + ")"
704
+
705
+
706
# Short alias for the function-pointer lookup helper in dcgm_structs
# (provides access to functions from dcgm_agent_internal).
dcgmFP = dcgm_structs._dcgmGetFunctionPointer

# Array lengths of the "shortName" and "unit" character buffers in
# c_dcgm_field_output_format_t.
SHORTNAME_LENGTH = 10
UNIT_LENGTH = 4
711
+
712
+
713
class c_dcgm_field_output_format_t(_PrintableStructure):
    """Structure to hold formatting information for values."""

    _fields_ = [
        ("shortName", c_char * SHORTNAME_LENGTH),
        ("unit", c_char * UNIT_LENGTH),
        ("width", c_short),
    ]
720
+
721
+
722
# Array length of the "tag" character buffer in c_dcgm_field_meta_t.
TAG_LENGTH = 48


class c_dcgm_field_meta_t(_PrintableStructure):
    """Field metadata record, as returned by the DcgmFieldGetBy* lookups."""

    # NOTE(review): the original comment here read "version must always be
    # first", yet no version member is declared and fieldId leads the
    # layout - confirm against the C definition of this struct.
    _fields_ = [
        ("fieldId", c_short),
        ("fieldType", c_char),
        ("size", c_ubyte),
        ("tag", c_char * TAG_LENGTH),
        ("scope", c_int),
        ("valueFormat", c_dcgm_field_output_format_t),
    ]
736
+
737
+
738
class pySamplingProperties:
    """
    Plain holder for the properties of one sampling event type
    (for example Power, Utilization or Clock).

    Every constructor argument is stored unchanged on the instance under
    an attribute of the same name.
    """

    def __init__(
        self, name, sampling_type, sample_val_type,
        timeIntervalIdle, timeIntervalBoost,
        min_value, max_value,
    ):
        # Identification of the sampling type
        self.name = name
        self.sampling_type = sampling_type
        # Sampling intervals
        self.timeIntervalIdle = timeIntervalIdle
        self.timeIntervalBoost = timeIntervalBoost
        # Value bounds and value type
        self.min_value = min_value
        self.max_value = max_value
        self.sample_val_type = sample_val_type
761
+
762
+
763
def DcgmFieldsInit():
    """
    Call the library's DcgmFieldsInit entry point.

    :raises AssertionError: if the call returns anything other than 0.
    """
    status = dcgmFP("DcgmFieldsInit")()
    assert status == 0, "Got return %d from DcgmFieldsInit" % status
767
+
768
+
769
def DcgmFieldGetById(fieldId):
    """
    Look up the metadata record of a field by its numeric ID.

    :param fieldId: Field ID to get metadata for
    :return: c_dcgm_field_meta_t struct on success. None on error.
    """
    DcgmFieldsInit()

    lookup = dcgmFP("DcgmFieldGetById")
    lookup.restype = POINTER(c_dcgm_field_meta_t)
    meta_ptr = lookup(fieldId)
    if meta_ptr:
        # Copy the pointed-to record into a struct of our own.
        meta = c_dcgm_field_meta_t()
        memmove(addressof(meta), meta_ptr, sizeof(meta))
        return meta

    # A NULL pointer means the lookup failed.
    return None
787
+
788
+
789
def DcgmFieldGetByTag(tag):
    """
    Get metadata for a field, given its string tag

    :param tag: Field tag to get metadata for. Example 'brand'. A str is
                encoded as UTF-8 before the call; a bytes object (such as
                the tag value read back from a c_dcgm_field_meta_t) is
                passed through unchanged.
    :return: c_dcgm_field_meta_t struct on success. None on error.
    """
    DcgmFieldsInit()

    # c_char array fields come back from ctypes as bytes, so accept both
    # forms instead of failing on bytes with an AttributeError.
    encoded_tag = tag if isinstance(tag, bytes) else tag.encode("utf-8")

    fn = dcgmFP("DcgmFieldGetByTag")
    fn.restype = POINTER(c_dcgm_field_meta_t)
    c_field_meta_ptr = fn(c_char_p(encoded_tag))
    if not c_field_meta_ptr:
        # NULL pointer: the lookup failed.
        return None

    # Copy the pointed-to record into a struct of our own.
    retVal = c_dcgm_field_meta_t()
    memmove(addressof(retVal), c_field_meta_ptr, sizeof(retVal))
    return retVal
808
+
809
+
810
def DcgmFieldGetTagById(fieldId):
    """
    Return the tag of the field with the given ID.

    :param fieldId: Field ID to look up
    :return: The ``tag`` member of the field's metadata, or None when
             DcgmFieldGetById yields no (truthy) metadata for the ID.
    """
    meta = DcgmFieldGetById(fieldId)
    return meta.tag if meta else None