triton-model-analyzer 1.48.0__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- model_analyzer/__init__.py +15 -0
- model_analyzer/analyzer.py +448 -0
- model_analyzer/cli/__init__.py +15 -0
- model_analyzer/cli/cli.py +193 -0
- model_analyzer/config/__init__.py +15 -0
- model_analyzer/config/generate/__init__.py +15 -0
- model_analyzer/config/generate/automatic_model_config_generator.py +164 -0
- model_analyzer/config/generate/base_model_config_generator.py +352 -0
- model_analyzer/config/generate/brute_plus_binary_parameter_search_run_config_generator.py +164 -0
- model_analyzer/config/generate/brute_run_config_generator.py +154 -0
- model_analyzer/config/generate/concurrency_sweeper.py +75 -0
- model_analyzer/config/generate/config_generator_interface.py +52 -0
- model_analyzer/config/generate/coordinate.py +143 -0
- model_analyzer/config/generate/coordinate_data.py +86 -0
- model_analyzer/config/generate/generator_utils.py +116 -0
- model_analyzer/config/generate/manual_model_config_generator.py +187 -0
- model_analyzer/config/generate/model_config_generator_factory.py +92 -0
- model_analyzer/config/generate/model_profile_spec.py +74 -0
- model_analyzer/config/generate/model_run_config_generator.py +154 -0
- model_analyzer/config/generate/model_variant_name_manager.py +150 -0
- model_analyzer/config/generate/neighborhood.py +536 -0
- model_analyzer/config/generate/optuna_plus_concurrency_sweep_run_config_generator.py +141 -0
- model_analyzer/config/generate/optuna_run_config_generator.py +838 -0
- model_analyzer/config/generate/perf_analyzer_config_generator.py +312 -0
- model_analyzer/config/generate/quick_plus_concurrency_sweep_run_config_generator.py +130 -0
- model_analyzer/config/generate/quick_run_config_generator.py +753 -0
- model_analyzer/config/generate/run_config_generator_factory.py +329 -0
- model_analyzer/config/generate/search_config.py +112 -0
- model_analyzer/config/generate/search_dimension.py +73 -0
- model_analyzer/config/generate/search_dimensions.py +85 -0
- model_analyzer/config/generate/search_parameter.py +49 -0
- model_analyzer/config/generate/search_parameters.py +388 -0
- model_analyzer/config/input/__init__.py +15 -0
- model_analyzer/config/input/config_command.py +483 -0
- model_analyzer/config/input/config_command_profile.py +1747 -0
- model_analyzer/config/input/config_command_report.py +267 -0
- model_analyzer/config/input/config_defaults.py +236 -0
- model_analyzer/config/input/config_enum.py +83 -0
- model_analyzer/config/input/config_field.py +216 -0
- model_analyzer/config/input/config_list_generic.py +112 -0
- model_analyzer/config/input/config_list_numeric.py +151 -0
- model_analyzer/config/input/config_list_string.py +111 -0
- model_analyzer/config/input/config_none.py +71 -0
- model_analyzer/config/input/config_object.py +129 -0
- model_analyzer/config/input/config_primitive.py +81 -0
- model_analyzer/config/input/config_status.py +75 -0
- model_analyzer/config/input/config_sweep.py +83 -0
- model_analyzer/config/input/config_union.py +113 -0
- model_analyzer/config/input/config_utils.py +128 -0
- model_analyzer/config/input/config_value.py +243 -0
- model_analyzer/config/input/objects/__init__.py +15 -0
- model_analyzer/config/input/objects/config_model_profile_spec.py +325 -0
- model_analyzer/config/input/objects/config_model_report_spec.py +173 -0
- model_analyzer/config/input/objects/config_plot.py +198 -0
- model_analyzer/config/input/objects/config_protobuf_utils.py +101 -0
- model_analyzer/config/input/yaml_config_validator.py +82 -0
- model_analyzer/config/run/__init__.py +15 -0
- model_analyzer/config/run/model_run_config.py +313 -0
- model_analyzer/config/run/run_config.py +168 -0
- model_analyzer/constants.py +76 -0
- model_analyzer/device/__init__.py +15 -0
- model_analyzer/device/device.py +24 -0
- model_analyzer/device/gpu_device.py +87 -0
- model_analyzer/device/gpu_device_factory.py +248 -0
- model_analyzer/entrypoint.py +307 -0
- model_analyzer/log_formatter.py +65 -0
- model_analyzer/model_analyzer_exceptions.py +24 -0
- model_analyzer/model_manager.py +255 -0
- model_analyzer/monitor/__init__.py +15 -0
- model_analyzer/monitor/cpu_monitor.py +69 -0
- model_analyzer/monitor/dcgm/DcgmDiag.py +191 -0
- model_analyzer/monitor/dcgm/DcgmFieldGroup.py +83 -0
- model_analyzer/monitor/dcgm/DcgmGroup.py +815 -0
- model_analyzer/monitor/dcgm/DcgmHandle.py +141 -0
- model_analyzer/monitor/dcgm/DcgmJsonReader.py +69 -0
- model_analyzer/monitor/dcgm/DcgmReader.py +623 -0
- model_analyzer/monitor/dcgm/DcgmStatus.py +57 -0
- model_analyzer/monitor/dcgm/DcgmSystem.py +412 -0
- model_analyzer/monitor/dcgm/__init__.py +15 -0
- model_analyzer/monitor/dcgm/common/__init__.py +13 -0
- model_analyzer/monitor/dcgm/common/dcgm_client_cli_parser.py +194 -0
- model_analyzer/monitor/dcgm/common/dcgm_client_main.py +86 -0
- model_analyzer/monitor/dcgm/dcgm_agent.py +887 -0
- model_analyzer/monitor/dcgm/dcgm_collectd_plugin.py +369 -0
- model_analyzer/monitor/dcgm/dcgm_errors.py +395 -0
- model_analyzer/monitor/dcgm/dcgm_field_helpers.py +546 -0
- model_analyzer/monitor/dcgm/dcgm_fields.py +815 -0
- model_analyzer/monitor/dcgm/dcgm_fields_collectd.py +671 -0
- model_analyzer/monitor/dcgm/dcgm_fields_internal.py +29 -0
- model_analyzer/monitor/dcgm/dcgm_fluentd.py +45 -0
- model_analyzer/monitor/dcgm/dcgm_monitor.py +138 -0
- model_analyzer/monitor/dcgm/dcgm_prometheus.py +326 -0
- model_analyzer/monitor/dcgm/dcgm_structs.py +2357 -0
- model_analyzer/monitor/dcgm/dcgm_telegraf.py +65 -0
- model_analyzer/monitor/dcgm/dcgm_value.py +151 -0
- model_analyzer/monitor/dcgm/dcgmvalue.py +155 -0
- model_analyzer/monitor/dcgm/denylist_recommendations.py +573 -0
- model_analyzer/monitor/dcgm/pydcgm.py +47 -0
- model_analyzer/monitor/monitor.py +143 -0
- model_analyzer/monitor/remote_monitor.py +137 -0
- model_analyzer/output/__init__.py +15 -0
- model_analyzer/output/file_writer.py +63 -0
- model_analyzer/output/output_writer.py +42 -0
- model_analyzer/perf_analyzer/__init__.py +15 -0
- model_analyzer/perf_analyzer/genai_perf_config.py +206 -0
- model_analyzer/perf_analyzer/perf_analyzer.py +882 -0
- model_analyzer/perf_analyzer/perf_config.py +479 -0
- model_analyzer/plots/__init__.py +15 -0
- model_analyzer/plots/detailed_plot.py +266 -0
- model_analyzer/plots/plot_manager.py +224 -0
- model_analyzer/plots/simple_plot.py +213 -0
- model_analyzer/record/__init__.py +15 -0
- model_analyzer/record/gpu_record.py +68 -0
- model_analyzer/record/metrics_manager.py +887 -0
- model_analyzer/record/record.py +280 -0
- model_analyzer/record/record_aggregator.py +256 -0
- model_analyzer/record/types/__init__.py +15 -0
- model_analyzer/record/types/cpu_available_ram.py +93 -0
- model_analyzer/record/types/cpu_used_ram.py +93 -0
- model_analyzer/record/types/gpu_free_memory.py +96 -0
- model_analyzer/record/types/gpu_power_usage.py +107 -0
- model_analyzer/record/types/gpu_total_memory.py +96 -0
- model_analyzer/record/types/gpu_used_memory.py +96 -0
- model_analyzer/record/types/gpu_utilization.py +108 -0
- model_analyzer/record/types/inter_token_latency_avg.py +60 -0
- model_analyzer/record/types/inter_token_latency_base.py +74 -0
- model_analyzer/record/types/inter_token_latency_max.py +60 -0
- model_analyzer/record/types/inter_token_latency_min.py +60 -0
- model_analyzer/record/types/inter_token_latency_p25.py +60 -0
- model_analyzer/record/types/inter_token_latency_p50.py +60 -0
- model_analyzer/record/types/inter_token_latency_p75.py +60 -0
- model_analyzer/record/types/inter_token_latency_p90.py +60 -0
- model_analyzer/record/types/inter_token_latency_p95.py +60 -0
- model_analyzer/record/types/inter_token_latency_p99.py +60 -0
- model_analyzer/record/types/output_token_throughput.py +105 -0
- model_analyzer/record/types/perf_client_response_wait.py +97 -0
- model_analyzer/record/types/perf_client_send_recv.py +97 -0
- model_analyzer/record/types/perf_latency.py +111 -0
- model_analyzer/record/types/perf_latency_avg.py +60 -0
- model_analyzer/record/types/perf_latency_base.py +74 -0
- model_analyzer/record/types/perf_latency_p90.py +60 -0
- model_analyzer/record/types/perf_latency_p95.py +60 -0
- model_analyzer/record/types/perf_latency_p99.py +60 -0
- model_analyzer/record/types/perf_server_compute_infer.py +97 -0
- model_analyzer/record/types/perf_server_compute_input.py +97 -0
- model_analyzer/record/types/perf_server_compute_output.py +97 -0
- model_analyzer/record/types/perf_server_queue.py +97 -0
- model_analyzer/record/types/perf_throughput.py +105 -0
- model_analyzer/record/types/time_to_first_token_avg.py +60 -0
- model_analyzer/record/types/time_to_first_token_base.py +74 -0
- model_analyzer/record/types/time_to_first_token_max.py +60 -0
- model_analyzer/record/types/time_to_first_token_min.py +60 -0
- model_analyzer/record/types/time_to_first_token_p25.py +60 -0
- model_analyzer/record/types/time_to_first_token_p50.py +60 -0
- model_analyzer/record/types/time_to_first_token_p75.py +60 -0
- model_analyzer/record/types/time_to_first_token_p90.py +60 -0
- model_analyzer/record/types/time_to_first_token_p95.py +60 -0
- model_analyzer/record/types/time_to_first_token_p99.py +60 -0
- model_analyzer/reports/__init__.py +15 -0
- model_analyzer/reports/html_report.py +195 -0
- model_analyzer/reports/pdf_report.py +50 -0
- model_analyzer/reports/report.py +86 -0
- model_analyzer/reports/report_factory.py +62 -0
- model_analyzer/reports/report_manager.py +1376 -0
- model_analyzer/reports/report_utils.py +42 -0
- model_analyzer/result/__init__.py +15 -0
- model_analyzer/result/constraint_manager.py +150 -0
- model_analyzer/result/model_config_measurement.py +354 -0
- model_analyzer/result/model_constraints.py +105 -0
- model_analyzer/result/parameter_search.py +246 -0
- model_analyzer/result/result_manager.py +430 -0
- model_analyzer/result/result_statistics.py +159 -0
- model_analyzer/result/result_table.py +217 -0
- model_analyzer/result/result_table_manager.py +646 -0
- model_analyzer/result/result_utils.py +42 -0
- model_analyzer/result/results.py +277 -0
- model_analyzer/result/run_config_measurement.py +658 -0
- model_analyzer/result/run_config_result.py +210 -0
- model_analyzer/result/run_config_result_comparator.py +110 -0
- model_analyzer/result/sorted_results.py +151 -0
- model_analyzer/state/__init__.py +15 -0
- model_analyzer/state/analyzer_state.py +76 -0
- model_analyzer/state/analyzer_state_manager.py +215 -0
- model_analyzer/triton/__init__.py +15 -0
- model_analyzer/triton/client/__init__.py +15 -0
- model_analyzer/triton/client/client.py +234 -0
- model_analyzer/triton/client/client_factory.py +57 -0
- model_analyzer/triton/client/grpc_client.py +104 -0
- model_analyzer/triton/client/http_client.py +107 -0
- model_analyzer/triton/model/__init__.py +15 -0
- model_analyzer/triton/model/model_config.py +556 -0
- model_analyzer/triton/model/model_config_variant.py +29 -0
- model_analyzer/triton/server/__init__.py +15 -0
- model_analyzer/triton/server/server.py +76 -0
- model_analyzer/triton/server/server_config.py +269 -0
- model_analyzer/triton/server/server_docker.py +229 -0
- model_analyzer/triton/server/server_factory.py +306 -0
- model_analyzer/triton/server/server_local.py +158 -0
- triton_model_analyzer-1.48.0.dist-info/METADATA +52 -0
- triton_model_analyzer-1.48.0.dist-info/RECORD +204 -0
- triton_model_analyzer-1.48.0.dist-info/WHEEL +5 -0
- triton_model_analyzer-1.48.0.dist-info/entry_points.txt +2 -0
- triton_model_analyzer-1.48.0.dist-info/licenses/LICENSE +67 -0
- triton_model_analyzer-1.48.0.dist-info/top_level.txt +1 -0
|
@@ -0,0 +1,815 @@
|
|
|
1
|
+
# Copyright (c) 2022, NVIDIA CORPORATION & AFFILIATES. All rights reserved.
|
|
2
|
+
#
|
|
3
|
+
# Licensed under the Apache License, Version 2.0 (the "License");
|
|
4
|
+
# you may not use this file except in compliance with the License.
|
|
5
|
+
# You may obtain a copy of the License at
|
|
6
|
+
#
|
|
7
|
+
# http://www.apache.org/licenses/LICENSE-2.0
|
|
8
|
+
#
|
|
9
|
+
# Unless required by applicable law or agreed to in writing, software
|
|
10
|
+
# distributed under the License is distributed on an "AS IS" BASIS,
|
|
11
|
+
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
|
12
|
+
# See the License for the specific language governing permissions and
|
|
13
|
+
# limitations under the License.
|
|
14
|
+
##
|
|
15
|
+
# Python bindings for the internal API of DCGM library (dcgm_fields.h)
|
|
16
|
+
##
|
|
17
|
+
|
|
18
|
+
from ctypes import *
|
|
19
|
+
from ctypes.util import find_library
|
|
20
|
+
import model_analyzer.monitor.dcgm.dcgm_structs as dcgm_structs
|
|
21
|
+
from typing import Dict
|
|
22
|
+
|
|
23
|
+
# Provides access to functions
|
|
24
|
+
dcgmFP = dcgm_structs._dcgmGetFunctionPointer
|
|
25
|
+
|
|
26
|
+
# Field Types are a single byte. List these in ASCII order
|
|
27
|
+
DCGM_FT_BINARY = "b" # Blob of binary data representing a structure
|
|
28
|
+
DCGM_FT_DOUBLE = "d" # 8-byte double precision
|
|
29
|
+
DCGM_FT_INT64 = "i" # 8-byte signed integer
|
|
30
|
+
DCGM_FT_STRING = "s" # Null-terminated ASCII Character string
|
|
31
|
+
DCGM_FT_TIMESTAMP = "t" # 8-byte signed integer usec since 1970
|
|
32
|
+
|
|
33
|
+
# Field scope. What are these fields associated with
|
|
34
|
+
DCGM_FS_GLOBAL = 0 # Field is global (ex: driver version)
|
|
35
|
+
DCGM_FS_ENTITY = 1 # Field is associated with an entity (GPU, VGPU, ..etc)
|
|
36
|
+
DCGM_FS_DEVICE = (
|
|
37
|
+
DCGM_FS_ENTITY # Field is associated with a device. Deprecated. Use DCGM_FS_ENTITY
|
|
38
|
+
)
|
|
39
|
+
|
|
40
|
+
# DCGM_FI_DEV_CLOCK_THROTTLE_REASONS is a bitmap of why the clock is throttled.
|
|
41
|
+
# These macros are masks for relevant throttling, and are a 1:1 map to the NVML
|
|
42
|
+
# reasons documented in nvml.h. The notes for the header are copied below:
|
|
43
|
+
|
|
44
|
+
# Nothing is running on the GPU and the clocks are dropping to Idle state
|
|
45
|
+
DCGM_CLOCKS_THROTTLE_REASON_GPU_IDLE = 0x0000000000000001
|
|
46
|
+
|
|
47
|
+
# GPU clocks are limited by current setting of applications clocks
|
|
48
|
+
DCGM_CLOCKS_THROTTLE_REASON_CLOCKS_SETTING = 0x0000000000000002
|
|
49
|
+
|
|
50
|
+
# SW Power Scaling algorithm is reducing the clocks below requested clocks
|
|
51
|
+
DCGM_CLOCKS_THROTTLE_REASON_SW_POWER_CAP = 0x0000000000000004
|
|
52
|
+
|
|
53
|
+
# HW Slowdown (reducing the core clocks by a factor of 2 or more) is engaged
|
|
54
|
+
#
|
|
55
|
+
# This is an indicator of:
|
|
56
|
+
# - temperature being too high
|
|
57
|
+
# - External Power Brake Assertion is triggered (e.g. by the system power supply)
|
|
58
|
+
# - Power draw is too high and Fast Trigger protection is reducing the clocks
|
|
59
|
+
# - May be also reported during PState or clock change
|
|
60
|
+
# - This behavior may be removed in a later release.
|
|
61
|
+
|
|
62
|
+
DCGM_CLOCKS_THROTTLE_REASON_HW_SLOWDOWN = 0x0000000000000008
|
|
63
|
+
|
|
64
|
+
# Sync Boost
|
|
65
|
+
#
|
|
66
|
+
# This GPU has been added to a Sync boost group with nvidia-smi or DCGM in
|
|
67
|
+
# order to maximize performance per watt. All GPUs in the sync boost group
|
|
68
|
+
# will boost to the minimum possible clocks across the entire group. Look at
|
|
69
|
+
# the throttle reasons for other GPUs in the system to see why those GPUs are
|
|
70
|
+
# holding this one at lower clocks.
|
|
71
|
+
DCGM_CLOCKS_THROTTLE_REASON_SYNC_BOOST = 0x0000000000000010
|
|
72
|
+
|
|
73
|
+
# SW Thermal Slowdown
|
|
74
|
+
#
|
|
75
|
+
# This is an indicator of one or more of the following:
|
|
76
|
+
# - Current GPU temperature above the GPU Max Operating Temperature
|
|
77
|
+
# - Current memory temperature above the Memory Max Operating Temperature
|
|
78
|
+
DCGM_CLOCKS_THROTTLE_REASON_SW_THERMAL = 0x0000000000000020
|
|
79
|
+
|
|
80
|
+
# HW Thermal Slowdown (reducing the core clocks by a factor of 2 or more) is engaged
|
|
81
|
+
#
|
|
82
|
+
# This is an indicator of:
|
|
83
|
+
# - temperature being too high
|
|
84
|
+
DCGM_CLOCKS_THROTTLE_REASON_HW_THERMAL = 0x0000000000000040
|
|
85
|
+
|
|
86
|
+
# HW Power Brake Slowdown (reducing the core clocks by a factor of 2 or more) is engaged
|
|
87
|
+
#
|
|
88
|
+
# This is an indicator of:
|
|
89
|
+
# - External Power Brake Assertion being triggered (e.g. by the system power supply)
|
|
90
|
+
DCGM_CLOCKS_THROTTLE_REASON_HW_POWER_BRAKE = 0x0000000000000080
|
|
91
|
+
|
|
92
|
+
# GPU clocks are limited by current setting of Display clocks
|
|
93
|
+
DCGM_CLOCKS_THROTTLE_REASON_DISPLAY_CLOCKS = 0x0000000000000100
|
|
94
|
+
|
|
95
|
+
# Field entity groups. Which type of entity is this field or field value associated with
|
|
96
|
+
DCGM_FE_NONE = (
|
|
97
|
+
0 # Field is not associated with an entity. Field scope should be DCGM_FS_GLOBAL
|
|
98
|
+
)
|
|
99
|
+
DCGM_FE_GPU = 1 # Field is associated with a GPU entity
|
|
100
|
+
DCGM_FE_VGPU = 2 # Field is associated with a VGPU entity
|
|
101
|
+
DCGM_FE_SWITCH = 3 # Field is associated with a Switch entity
|
|
102
|
+
DCGM_FE_GPU_I = 4 # Field is associated with a GPU Instance entity
|
|
103
|
+
DCGM_FE_GPU_CI = 5 # Field is associated with a GPU Compute Instance entity
|
|
104
|
+
DCGM_FE_LINK = 6 # Field is associated with an NVLINK
|
|
105
|
+
|
|
106
|
+
c_dcgm_field_eid_t = c_uint32 # Represents an identifier for an entity within a field entity. For instance, this is the gpuId for DCGM_FE_GPU.
|
|
107
|
+
|
|
108
|
+
# System attributes
|
|
109
|
+
DCGM_FI_UNKNOWN = 0
|
|
110
|
+
DCGM_FI_DRIVER_VERSION = 1 # Driver Version
|
|
111
|
+
DCGM_FI_NVML_VERSION = 2 # Underlying NVML version
|
|
112
|
+
DCGM_FI_PROCESS_NAME = (
|
|
113
|
+
3 # Process Name. Will be nv-hostengine or your process's name in embedded mode
|
|
114
|
+
)
|
|
115
|
+
DCGM_FI_DEV_COUNT = 4 # Number of Devices on the node
|
|
116
|
+
DCGM_FI_CUDA_DRIVER_VERSION = 5 # Cuda Driver Version as an integer. CUDA 11.1 = 11100
|
|
117
|
+
# Device attributes
|
|
118
|
+
DCGM_FI_DEV_NAME = 50 # Name of the GPU device
|
|
119
|
+
DCGM_FI_DEV_BRAND = 51 # Device Brand
|
|
120
|
+
DCGM_FI_DEV_NVML_INDEX = 52 # NVML index of this GPU
|
|
121
|
+
DCGM_FI_DEV_SERIAL = 53 # Device Serial Number
|
|
122
|
+
DCGM_FI_DEV_UUID = 54 # UUID corresponding to the device
|
|
123
|
+
DCGM_FI_DEV_MINOR_NUMBER = 55 # Device node minor number /dev/nvidia#
|
|
124
|
+
DCGM_FI_DEV_OEM_INFOROM_VER = 56 # OEM inforom version
|
|
125
|
+
DCGM_FI_DEV_PCI_BUSID = 57 # PCI attributes for the device
|
|
126
|
+
DCGM_FI_DEV_PCI_COMBINED_ID = 58 # The combined 16-bit device id and 16-bit vendor id
|
|
127
|
+
DCGM_FI_DEV_PCI_SUBSYS_ID = 59 # The 32-bit Sub System Device ID
|
|
128
|
+
DCGM_FI_GPU_TOPOLOGY_PCI = 60 # Topology of all GPUs on the system via PCI (static)
|
|
129
|
+
DCGM_FI_GPU_TOPOLOGY_NVLINK = (
|
|
130
|
+
61 # Topology of all GPUs on the system via NVLINK (static)
|
|
131
|
+
)
|
|
132
|
+
DCGM_FI_GPU_TOPOLOGY_AFFINITY = 62 # Affinity of all GPUs on the system (static)
|
|
133
|
+
DCGM_FI_DEV_CUDA_COMPUTE_CAPABILITY = 63 # Cuda compute capability for the device
|
|
134
|
+
DCGM_FI_DEV_COMPUTE_MODE = 65 # Compute mode for the device
|
|
135
|
+
DCGM_FI_DEV_PERSISTENCE_MODE = 66 # Persistence mode for the device
|
|
136
|
+
DCGM_FI_DEV_MIG_MODE = 67 # MIG mode for the device
|
|
137
|
+
DCGM_FI_DEV_CUDA_VISIBLE_DEVICES_STR = (
|
|
138
|
+
68 # String value for CUDA_VISIBLE_DEVICES for the device
|
|
139
|
+
)
|
|
140
|
+
DCGM_FI_DEV_MIG_MAX_SLICES = 69 # The maximum number of slices this GPU supports
|
|
141
|
+
DCGM_FI_DEV_CPU_AFFINITY_0 = 70 # Device CPU affinity. part 1/8 = cpus 0 - 63
|
|
142
|
+
DCGM_FI_DEV_CPU_AFFINITY_1 = 71 # Device CPU affinity. part 2/8 = cpus 64 - 127
|
|
143
|
+
DCGM_FI_DEV_CPU_AFFINITY_2 = 72 # Device CPU affinity. part 3/8 = cpus 128 - 191
|
|
144
|
+
DCGM_FI_DEV_CPU_AFFINITY_3 = 73 # Device CPU affinity. part 4/8 = cpus 192 - 255
|
|
145
|
+
DCGM_FI_DEV_CC_MODE = 74 # Device CC/APM mode
|
|
146
|
+
DCGM_FI_DEV_MIG_ATTRIBUTES = 75 # MIG device attributes
|
|
147
|
+
DCGM_FI_DEV_MIG_GI_INFO = 76 # GPU instance profile information
|
|
148
|
+
DCGM_FI_DEV_MIG_CI_INFO = 77 # Compute instance profile information
|
|
149
|
+
DCGM_FI_DEV_ECC_INFOROM_VER = 80 # ECC inforom version
|
|
150
|
+
DCGM_FI_DEV_POWER_INFOROM_VER = 81 # Power management object inforom version
|
|
151
|
+
DCGM_FI_DEV_INFOROM_IMAGE_VER = 82 # Inforom image version
|
|
152
|
+
DCGM_FI_DEV_INFOROM_CONFIG_CHECK = 83 # Inforom configuration checksum
|
|
153
|
+
DCGM_FI_DEV_INFOROM_CONFIG_VALID = (
|
|
154
|
+
84 # Reads the infoROM from the flash and verifies the checksums
|
|
155
|
+
)
|
|
156
|
+
DCGM_FI_DEV_VBIOS_VERSION = 85 # VBIOS version of the device
|
|
157
|
+
DCGM_FI_DEV_BAR1_TOTAL = 90 # Total BAR1 of the GPU
|
|
158
|
+
DCGM_FI_SYNC_BOOST = 91 # Deprecated - Sync boost settings on the node
|
|
159
|
+
DCGM_FI_DEV_BAR1_USED = 92 # Used BAR1 of the GPU in MB
|
|
160
|
+
DCGM_FI_DEV_BAR1_FREE = 93 # Free BAR1 of the GPU in MB
|
|
161
|
+
# Clocks and power
|
|
162
|
+
DCGM_FI_DEV_SM_CLOCK = 100 # SM clock for the device
|
|
163
|
+
DCGM_FI_DEV_MEM_CLOCK = 101 # Memory clock for the device
|
|
164
|
+
DCGM_FI_DEV_VIDEO_CLOCK = 102 # Video encoder/decoder clock for the device
|
|
165
|
+
DCGM_FI_DEV_APP_SM_CLOCK = 110 # SM Application clocks
|
|
166
|
+
DCGM_FI_DEV_APP_MEM_CLOCK = 111 # Memory Application clocks
|
|
167
|
+
DCGM_FI_DEV_CLOCK_THROTTLE_REASONS = (
|
|
168
|
+
112 # Current clock throttle reasons (bitmask of DCGM_CLOCKS_THROTTLE_REASON_*)
|
|
169
|
+
)
|
|
170
|
+
DCGM_FI_DEV_MAX_SM_CLOCK = 113 # Maximum supported SM clock for the device
|
|
171
|
+
DCGM_FI_DEV_MAX_MEM_CLOCK = 114 # Maximum supported Memory clock for the device
|
|
172
|
+
DCGM_FI_DEV_MAX_VIDEO_CLOCK = (
|
|
173
|
+
115 # Maximum supported Video encoder/decoder clock for the device
|
|
174
|
+
)
|
|
175
|
+
DCGM_FI_DEV_AUTOBOOST = 120 # Auto-boost for the device (1 = enabled. 0 = disabled)
|
|
176
|
+
DCGM_FI_DEV_SUPPORTED_CLOCKS = 130 # Supported clocks for the device
|
|
177
|
+
DCGM_FI_DEV_MEMORY_TEMP = 140 # Memory temperature for the device
|
|
178
|
+
DCGM_FI_DEV_GPU_TEMP = 150 # Current temperature readings for the device, in degrees C
|
|
179
|
+
DCGM_FI_DEV_MEM_MAX_OP_TEMP = (
|
|
180
|
+
151 # Maximum operating temperature for the memory of this GPU
|
|
181
|
+
)
|
|
182
|
+
DCGM_FI_DEV_GPU_MAX_OP_TEMP = 152 # Maximum operating temperature for this GPU
|
|
183
|
+
DCGM_FI_DEV_POWER_USAGE = 155 # Power usage for the device in Watts
|
|
184
|
+
DCGM_FI_DEV_TOTAL_ENERGY_CONSUMPTION = (
|
|
185
|
+
156 # Total energy consumption for the GPU in mJ since the driver was last reloaded
|
|
186
|
+
)
|
|
187
|
+
DCGM_FI_DEV_SLOWDOWN_TEMP = 158 # Slowdown temperature for the device
|
|
188
|
+
DCGM_FI_DEV_SHUTDOWN_TEMP = 159 # Shutdown temperature for the device
|
|
189
|
+
DCGM_FI_DEV_POWER_MGMT_LIMIT = 160 # Current Power limit for the device
|
|
190
|
+
DCGM_FI_DEV_POWER_MGMT_LIMIT_MIN = 161 # Minimum power management limit for the device
|
|
191
|
+
DCGM_FI_DEV_POWER_MGMT_LIMIT_MAX = 162 # Maximum power management limit for the device
|
|
192
|
+
DCGM_FI_DEV_POWER_MGMT_LIMIT_DEF = 163 # Default power management limit for the device
|
|
193
|
+
DCGM_FI_DEV_ENFORCED_POWER_LIMIT = 164 # Effective power limit that the driver enforces after taking into account all limiters
|
|
194
|
+
DCGM_FI_DEV_PSTATE = 190 # Performance state (P-State) 0-15. 0=highest
|
|
195
|
+
DCGM_FI_DEV_FAN_SPEED = 191 # Fan speed for the device in percent 0-100
|
|
196
|
+
# Device utilization and telemetry
|
|
197
|
+
DCGM_FI_DEV_PCIE_TX_THROUGHPUT = 200 # Deprecated - PCIe Tx utilization information
|
|
198
|
+
DCGM_FI_DEV_PCIE_RX_THROUGHPUT = 201 # Deprecated - PCIe Rx utilization information
|
|
199
|
+
DCGM_FI_DEV_PCIE_REPLAY_COUNTER = 202 # PCIe replay counter
|
|
200
|
+
DCGM_FI_DEV_GPU_UTIL = 203 # GPU Utilization
|
|
201
|
+
DCGM_FI_DEV_MEM_COPY_UTIL = 204 # Memory Utilization
|
|
202
|
+
DCGM_FI_DEV_ACCOUNTING_DATA = 205 # Process accounting stats
|
|
203
|
+
DCGM_FI_DEV_ENC_UTIL = 206 # Encoder utilization
|
|
204
|
+
DCGM_FI_DEV_DEC_UTIL = 207 # Decoder utilization
|
|
205
|
+
# Fields 210, 211, 220, and 221 are internal-only. see dcgm_fields_internal.py
|
|
206
|
+
DCGM_FI_DEV_XID_ERRORS = 230 # XID errors. The value is the specific XID error
|
|
207
|
+
DCGM_FI_DEV_PCIE_MAX_LINK_GEN = 235 # PCIe Max Link Generation
|
|
208
|
+
DCGM_FI_DEV_PCIE_MAX_LINK_WIDTH = 236 # PCIe Max Link Width
|
|
209
|
+
DCGM_FI_DEV_PCIE_LINK_GEN = 237 # PCIe Current Link Generation
|
|
210
|
+
DCGM_FI_DEV_PCIE_LINK_WIDTH = 238 # PCIe Current Link Width
|
|
211
|
+
# Violation counters
|
|
212
|
+
DCGM_FI_DEV_POWER_VIOLATION = 240 # Power Violation time in usec
|
|
213
|
+
DCGM_FI_DEV_THERMAL_VIOLATION = 241 # Thermal Violation time in usec
|
|
214
|
+
DCGM_FI_DEV_SYNC_BOOST_VIOLATION = 242 # Sync Boost Violation time in usec
|
|
215
|
+
DCGM_FI_DEV_BOARD_LIMIT_VIOLATION = 243 # Board Limit Violation time in usec.
|
|
216
|
+
DCGM_FI_DEV_LOW_UTIL_VIOLATION = 244 # Low Utilization Violation time in usec.
|
|
217
|
+
DCGM_FI_DEV_RELIABILITY_VIOLATION = 245 # Reliability Violation time in usec.
|
|
218
|
+
DCGM_FI_DEV_TOTAL_APP_CLOCKS_VIOLATION = 246 # App Clocks Violation time in usec.
|
|
219
|
+
DCGM_FI_DEV_TOTAL_BASE_CLOCKS_VIOLATION = 247 # Base Clocks Violation time in usec.
|
|
220
|
+
# Framebuffer usage
|
|
221
|
+
DCGM_FI_DEV_FB_TOTAL = 250 # Total framebuffer memory in MB
|
|
222
|
+
DCGM_FI_DEV_FB_FREE = 251 # Free framebuffer memory in MB
|
|
223
|
+
DCGM_FI_DEV_FB_USED = 252 # Used framebuffer memory in MB
|
|
224
|
+
DCGM_FI_DEV_FB_RESERVED = 253 # Total framebuffer reserved in MB
|
|
225
|
+
# Device ECC Counters
|
|
226
|
+
DCGM_FI_DEV_ECC_CURRENT = 300 # Current ECC mode for the device
|
|
227
|
+
DCGM_FI_DEV_ECC_PENDING = 301 # Pending ECC mode for the device
|
|
228
|
+
DCGM_FI_DEV_ECC_SBE_VOL_TOTAL = 310 # Total single bit volatile ecc errors
|
|
229
|
+
DCGM_FI_DEV_ECC_DBE_VOL_TOTAL = 311 # Total double bit volatile ecc errors
|
|
230
|
+
DCGM_FI_DEV_ECC_SBE_AGG_TOTAL = (
|
|
231
|
+
312 # Total single bit aggregate (persistent) ecc errors
|
|
232
|
+
)
|
|
233
|
+
DCGM_FI_DEV_ECC_DBE_AGG_TOTAL = (
|
|
234
|
+
313 # Total double bit aggregate (persistent) ecc errors
|
|
235
|
+
)
|
|
236
|
+
DCGM_FI_DEV_ECC_SBE_VOL_L1 = 314 # L1 cache single bit volatile ecc errors
|
|
237
|
+
DCGM_FI_DEV_ECC_DBE_VOL_L1 = 315 # L1 cache double bit volatile ecc errors
|
|
238
|
+
DCGM_FI_DEV_ECC_SBE_VOL_L2 = 316 # L2 cache single bit volatile ecc errors
|
|
239
|
+
DCGM_FI_DEV_ECC_DBE_VOL_L2 = 317 # L2 cache double bit volatile ecc errors
|
|
240
|
+
DCGM_FI_DEV_ECC_SBE_VOL_DEV = 318 # Device memory single bit volatile ecc errors
|
|
241
|
+
DCGM_FI_DEV_ECC_DBE_VOL_DEV = 319 # Device memory double bit volatile ecc errors
|
|
242
|
+
DCGM_FI_DEV_ECC_SBE_VOL_REG = 320 # Register file single bit volatile ecc errors
|
|
243
|
+
DCGM_FI_DEV_ECC_DBE_VOL_REG = 321 # Register file double bit volatile ecc errors
|
|
244
|
+
DCGM_FI_DEV_ECC_SBE_VOL_TEX = 322 # Texture memory single bit volatile ecc errors
|
|
245
|
+
DCGM_FI_DEV_ECC_DBE_VOL_TEX = 323 # Texture memory double bit volatile ecc errors
|
|
246
|
+
DCGM_FI_DEV_ECC_SBE_AGG_L1 = (
|
|
247
|
+
324 # L1 cache single bit aggregate (persistent) ecc errors
|
|
248
|
+
)
|
|
249
|
+
DCGM_FI_DEV_ECC_DBE_AGG_L1 = (
|
|
250
|
+
325 # L1 cache double bit aggregate (persistent) ecc errors
|
|
251
|
+
)
|
|
252
|
+
DCGM_FI_DEV_ECC_SBE_AGG_L2 = (
|
|
253
|
+
326 # L2 cache single bit aggregate (persistent) ecc errors
|
|
254
|
+
)
|
|
255
|
+
DCGM_FI_DEV_ECC_DBE_AGG_L2 = (
|
|
256
|
+
327 # L2 cache double bit aggregate (persistent) ecc errors
|
|
257
|
+
)
|
|
258
|
+
DCGM_FI_DEV_ECC_SBE_AGG_DEV = (
|
|
259
|
+
328 # Device memory single bit aggregate (persistent) ecc errors
|
|
260
|
+
)
|
|
261
|
+
DCGM_FI_DEV_ECC_DBE_AGG_DEV = (
|
|
262
|
+
329 # Device memory double bit aggregate (persistent) ecc errors
|
|
263
|
+
)
|
|
264
|
+
DCGM_FI_DEV_ECC_SBE_AGG_REG = (
|
|
265
|
+
330 # Register File single bit aggregate (persistent) ecc errors
|
|
266
|
+
)
|
|
267
|
+
DCGM_FI_DEV_ECC_DBE_AGG_REG = (
|
|
268
|
+
331 # Register File double bit aggregate (persistent) ecc errors
|
|
269
|
+
)
|
|
270
|
+
DCGM_FI_DEV_ECC_SBE_AGG_TEX = (
|
|
271
|
+
332 # Texture memory single bit aggregate (persistent) ecc errors
|
|
272
|
+
)
|
|
273
|
+
DCGM_FI_DEV_ECC_DBE_AGG_TEX = (
|
|
274
|
+
333 # Texture memory double bit aggregate (persistent) ecc errors
|
|
275
|
+
)
|
|
276
|
+
DCGM_FI_DEV_RETIRED_SBE = 390 # Number of retired pages because of single bit errors
|
|
277
|
+
DCGM_FI_DEV_RETIRED_DBE = 391 # Number of retired pages because of double bit errors
|
|
278
|
+
DCGM_FI_DEV_RETIRED_PENDING = 392 # Number of pages pending retirement
|
|
279
|
+
# Row remapper fields (Ampere and newer)
|
|
280
|
+
DCGM_FI_DEV_UNCORRECTABLE_REMAPPED_ROWS = (
|
|
281
|
+
393 # Number of remapped rows for uncorrectable errors
|
|
282
|
+
)
|
|
283
|
+
DCGM_FI_DEV_CORRECTABLE_REMAPPED_ROWS = (
|
|
284
|
+
394 # Number of remapped rows for correctable errors
|
|
285
|
+
)
|
|
286
|
+
DCGM_FI_DEV_ROW_REMAP_FAILURE = 395 # Whether remapping of rows has failed
|
|
287
|
+
DCGM_FI_DEV_ROW_REMAP_PENDING = 396 # Whether remapping of rows is pending
|
|
288
|
+
|
|
289
|
+
# Device NvLink Bandwidth and Error Counters
|
|
290
|
+
DCGM_FI_DEV_NVLINK_CRC_FLIT_ERROR_COUNT_L0 = (
|
|
291
|
+
400 # NV Link flow control CRC Error Counter for Lane 0
|
|
292
|
+
)
|
|
293
|
+
DCGM_FI_DEV_NVLINK_CRC_FLIT_ERROR_COUNT_L1 = (
|
|
294
|
+
401 # NV Link flow control CRC Error Counter for Lane 1
|
|
295
|
+
)
|
|
296
|
+
DCGM_FI_DEV_NVLINK_CRC_FLIT_ERROR_COUNT_L2 = (
|
|
297
|
+
402 # NV Link flow control CRC Error Counter for Lane 2
|
|
298
|
+
)
|
|
299
|
+
# NV Link flow control CRC error counters (per lane, then the total for all lanes)
DCGM_FI_DEV_NVLINK_CRC_FLIT_ERROR_COUNT_L3 = 403  # Lane 3
DCGM_FI_DEV_NVLINK_CRC_FLIT_ERROR_COUNT_L4 = 404  # Lane 4
DCGM_FI_DEV_NVLINK_CRC_FLIT_ERROR_COUNT_L5 = 405  # Lane 5
DCGM_FI_DEV_NVLINK_CRC_FLIT_ERROR_COUNT_TOTAL = 409  # Total for all lanes

# NV Link data CRC error counters (per lane, then the total for all lanes)
DCGM_FI_DEV_NVLINK_CRC_DATA_ERROR_COUNT_L0 = 410  # Lane 0
DCGM_FI_DEV_NVLINK_CRC_DATA_ERROR_COUNT_L1 = 411  # Lane 1
DCGM_FI_DEV_NVLINK_CRC_DATA_ERROR_COUNT_L2 = 412  # Lane 2
DCGM_FI_DEV_NVLINK_CRC_DATA_ERROR_COUNT_L3 = 413  # Lane 3
DCGM_FI_DEV_NVLINK_CRC_DATA_ERROR_COUNT_L4 = 414  # Lane 4
DCGM_FI_DEV_NVLINK_CRC_DATA_ERROR_COUNT_L5 = 415  # Lane 5
DCGM_FI_DEV_NVLINK_CRC_DATA_ERROR_COUNT_TOTAL = 419  # Total for all lanes

# NV Link replay error counters (per lane, then the total for all lanes)
DCGM_FI_DEV_NVLINK_REPLAY_ERROR_COUNT_L0 = 420  # Lane 0
DCGM_FI_DEV_NVLINK_REPLAY_ERROR_COUNT_L1 = 421  # Lane 1
DCGM_FI_DEV_NVLINK_REPLAY_ERROR_COUNT_L2 = 422  # Lane 2
DCGM_FI_DEV_NVLINK_REPLAY_ERROR_COUNT_L3 = 423  # Lane 3
DCGM_FI_DEV_NVLINK_REPLAY_ERROR_COUNT_L4 = 424  # Lane 4
DCGM_FI_DEV_NVLINK_REPLAY_ERROR_COUNT_L5 = 425  # Lane 5
DCGM_FI_DEV_NVLINK_REPLAY_ERROR_COUNT_TOTAL = 429  # Total for all lanes

# NV Link recovery error counters (per lane, then the total for all lanes)
DCGM_FI_DEV_NVLINK_RECOVERY_ERROR_COUNT_L0 = 430  # Lane 0
DCGM_FI_DEV_NVLINK_RECOVERY_ERROR_COUNT_L1 = 431  # Lane 1
DCGM_FI_DEV_NVLINK_RECOVERY_ERROR_COUNT_L2 = 432  # Lane 2
DCGM_FI_DEV_NVLINK_RECOVERY_ERROR_COUNT_L3 = 433  # Lane 3
DCGM_FI_DEV_NVLINK_RECOVERY_ERROR_COUNT_L4 = 434  # Lane 4
DCGM_FI_DEV_NVLINK_RECOVERY_ERROR_COUNT_L5 = 435  # Lane 5
DCGM_FI_DEV_NVLINK_RECOVERY_ERROR_COUNT_TOTAL = 439  # Total for all lanes

# NV Link bandwidth counters (per lane, then the total for all lanes)
DCGM_FI_DEV_NVLINK_BANDWIDTH_L0 = 440  # Lane 0
DCGM_FI_DEV_NVLINK_BANDWIDTH_L1 = 441  # Lane 1
DCGM_FI_DEV_NVLINK_BANDWIDTH_L2 = 442  # Lane 2
DCGM_FI_DEV_NVLINK_BANDWIDTH_L3 = 443  # Lane 3
DCGM_FI_DEV_NVLINK_BANDWIDTH_L4 = 444  # Lane 4
DCGM_FI_DEV_NVLINK_BANDWIDTH_L5 = 445  # Lane 5
DCGM_FI_DEV_NVLINK_BANDWIDTH_TOTAL = 449  # Total for all lanes

# GPU NVLink error information
DCGM_FI_DEV_GPU_NVLINK_ERRORS = 450

# Lanes 6-17 of each counter family. Their IDs are NOT contiguous with lanes
# 0-5: lanes 12-14 use the three IDs left free just below each family's
# *_TOTAL value, and lanes 6-11 / 15-17 use IDs from 451 upwards.
DCGM_FI_DEV_NVLINK_CRC_FLIT_ERROR_COUNT_L6 = 451
DCGM_FI_DEV_NVLINK_CRC_FLIT_ERROR_COUNT_L7 = 452
DCGM_FI_DEV_NVLINK_CRC_FLIT_ERROR_COUNT_L8 = 453
DCGM_FI_DEV_NVLINK_CRC_FLIT_ERROR_COUNT_L9 = 454
DCGM_FI_DEV_NVLINK_CRC_FLIT_ERROR_COUNT_L10 = 455
DCGM_FI_DEV_NVLINK_CRC_FLIT_ERROR_COUNT_L11 = 456
DCGM_FI_DEV_NVLINK_CRC_FLIT_ERROR_COUNT_L12 = 406
DCGM_FI_DEV_NVLINK_CRC_FLIT_ERROR_COUNT_L13 = 407
DCGM_FI_DEV_NVLINK_CRC_FLIT_ERROR_COUNT_L14 = 408
DCGM_FI_DEV_NVLINK_CRC_FLIT_ERROR_COUNT_L15 = 481
DCGM_FI_DEV_NVLINK_CRC_FLIT_ERROR_COUNT_L16 = 482
DCGM_FI_DEV_NVLINK_CRC_FLIT_ERROR_COUNT_L17 = 483

DCGM_FI_DEV_NVLINK_CRC_DATA_ERROR_COUNT_L6 = 457
DCGM_FI_DEV_NVLINK_CRC_DATA_ERROR_COUNT_L7 = 458
DCGM_FI_DEV_NVLINK_CRC_DATA_ERROR_COUNT_L8 = 459
DCGM_FI_DEV_NVLINK_CRC_DATA_ERROR_COUNT_L9 = 460
DCGM_FI_DEV_NVLINK_CRC_DATA_ERROR_COUNT_L10 = 461
DCGM_FI_DEV_NVLINK_CRC_DATA_ERROR_COUNT_L11 = 462
DCGM_FI_DEV_NVLINK_CRC_DATA_ERROR_COUNT_L12 = 416
DCGM_FI_DEV_NVLINK_CRC_DATA_ERROR_COUNT_L13 = 417
DCGM_FI_DEV_NVLINK_CRC_DATA_ERROR_COUNT_L14 = 418
DCGM_FI_DEV_NVLINK_CRC_DATA_ERROR_COUNT_L15 = 484
DCGM_FI_DEV_NVLINK_CRC_DATA_ERROR_COUNT_L16 = 485
DCGM_FI_DEV_NVLINK_CRC_DATA_ERROR_COUNT_L17 = 486

DCGM_FI_DEV_NVLINK_REPLAY_ERROR_COUNT_L6 = 463
DCGM_FI_DEV_NVLINK_REPLAY_ERROR_COUNT_L7 = 464
DCGM_FI_DEV_NVLINK_REPLAY_ERROR_COUNT_L8 = 465
DCGM_FI_DEV_NVLINK_REPLAY_ERROR_COUNT_L9 = 466
DCGM_FI_DEV_NVLINK_REPLAY_ERROR_COUNT_L10 = 467
DCGM_FI_DEV_NVLINK_REPLAY_ERROR_COUNT_L11 = 468
DCGM_FI_DEV_NVLINK_REPLAY_ERROR_COUNT_L12 = 426
DCGM_FI_DEV_NVLINK_REPLAY_ERROR_COUNT_L13 = 427
DCGM_FI_DEV_NVLINK_REPLAY_ERROR_COUNT_L14 = 428
DCGM_FI_DEV_NVLINK_REPLAY_ERROR_COUNT_L15 = 487
DCGM_FI_DEV_NVLINK_REPLAY_ERROR_COUNT_L16 = 488
DCGM_FI_DEV_NVLINK_REPLAY_ERROR_COUNT_L17 = 489

# Note: the recovery family skips 490; lanes 15-17 are 491-493.
DCGM_FI_DEV_NVLINK_RECOVERY_ERROR_COUNT_L6 = 469
DCGM_FI_DEV_NVLINK_RECOVERY_ERROR_COUNT_L7 = 470
DCGM_FI_DEV_NVLINK_RECOVERY_ERROR_COUNT_L8 = 471
DCGM_FI_DEV_NVLINK_RECOVERY_ERROR_COUNT_L9 = 472
DCGM_FI_DEV_NVLINK_RECOVERY_ERROR_COUNT_L10 = 473
DCGM_FI_DEV_NVLINK_RECOVERY_ERROR_COUNT_L11 = 474
DCGM_FI_DEV_NVLINK_RECOVERY_ERROR_COUNT_L12 = 436
DCGM_FI_DEV_NVLINK_RECOVERY_ERROR_COUNT_L13 = 437
DCGM_FI_DEV_NVLINK_RECOVERY_ERROR_COUNT_L14 = 438
DCGM_FI_DEV_NVLINK_RECOVERY_ERROR_COUNT_L15 = 491
DCGM_FI_DEV_NVLINK_RECOVERY_ERROR_COUNT_L16 = 492
DCGM_FI_DEV_NVLINK_RECOVERY_ERROR_COUNT_L17 = 493

DCGM_FI_DEV_NVLINK_BANDWIDTH_L6 = 475
DCGM_FI_DEV_NVLINK_BANDWIDTH_L7 = 476
DCGM_FI_DEV_NVLINK_BANDWIDTH_L8 = 477
DCGM_FI_DEV_NVLINK_BANDWIDTH_L9 = 478
DCGM_FI_DEV_NVLINK_BANDWIDTH_L10 = 479
DCGM_FI_DEV_NVLINK_BANDWIDTH_L11 = 480
DCGM_FI_DEV_NVLINK_BANDWIDTH_L12 = 446
DCGM_FI_DEV_NVLINK_BANDWIDTH_L13 = 447
DCGM_FI_DEV_NVLINK_BANDWIDTH_L14 = 448
DCGM_FI_DEV_NVLINK_BANDWIDTH_L15 = 494
DCGM_FI_DEV_NVLINK_BANDWIDTH_L16 = 495
DCGM_FI_DEV_NVLINK_BANDWIDTH_L17 = 496
|
|
445
|
+
|
|
446
|
+
# Device Attributes associated with virtualization

# Operating mode of the GPU
DCGM_FI_DEV_VIRTUAL_MODE = 500
# Includes Count and Supported vGPU type information
DCGM_FI_DEV_SUPPORTED_TYPE_INFO = 501
# Includes Count and List of Creatable vGPU type IDs
DCGM_FI_DEV_CREATABLE_VGPU_TYPE_IDS = 502
# Includes Count and List of vGPU instance IDs
DCGM_FI_DEV_VGPU_INSTANCE_IDS = 503
# Utilization values for vGPUs running on the device
DCGM_FI_DEV_VGPU_UTILIZATIONS = 504
# Utilization values for processes running within vGPU VMs using the device
DCGM_FI_DEV_VGPU_PER_PROCESS_UTILIZATION = 505
# Current encoder statistics for a given device
DCGM_FI_DEV_ENC_STATS = 506
# Statistics of current active frame buffer capture sessions on a given device
DCGM_FI_DEV_FBC_STATS = 507
# Information about active frame buffer capture sessions on a target device
DCGM_FI_DEV_FBC_SESSIONS_INFO = 508
# Includes Count and currently Supported vGPU types on a device
DCGM_FI_DEV_SUPPORTED_VGPU_TYPE_IDS = 509
# Includes Static info of vGPU types supported on a device
DCGM_FI_DEV_VGPU_TYPE_INFO = 510
# Includes the name of a vGPU type supported on a device
DCGM_FI_DEV_VGPU_TYPE_NAME = 511
# Includes the class of a vGPU type supported on a device
DCGM_FI_DEV_VGPU_TYPE_CLASS = 512
# Includes the license info for a vGPU type supported on a device
DCGM_FI_DEV_VGPU_TYPE_LICENSE = 513

# Related to vGPU Instance IDs

# vGPU VM ID
DCGM_FI_DEV_VGPU_VM_ID = 520
# vGPU VM name
DCGM_FI_DEV_VGPU_VM_NAME = 521
# vGPU type of the vGPU instance
DCGM_FI_DEV_VGPU_TYPE = 522
# UUID of the vGPU instance
DCGM_FI_DEV_VGPU_UUID = 523
# Driver version of the vGPU instance
DCGM_FI_DEV_VGPU_DRIVER_VERSION = 524
# Memory usage of the vGPU instance
DCGM_FI_DEV_VGPU_MEMORY_USAGE = 525
# License status of the vGPU
DCGM_FI_DEV_VGPU_LICENSE_STATUS = 526
# Frame rate limit of the vGPU instance
DCGM_FI_DEV_VGPU_FRAME_RATE_LIMIT = 527
# Current encoder statistics of the vGPU instance
DCGM_FI_DEV_VGPU_ENC_STATS = 528
# Information about all active encoder sessions on the vGPU instance
DCGM_FI_DEV_VGPU_ENC_SESSIONS_INFO = 529
# Statistics of current active frame buffer capture sessions on the vGPU instance
DCGM_FI_DEV_VGPU_FBC_STATS = 530
# Information about active frame buffer capture sessions on the vGPU instance
DCGM_FI_DEV_VGPU_FBC_SESSIONS_INFO = 531
# License state information of the vGPU instance
DCGM_FI_DEV_VGPU_INSTANCE_LICENSE_STATE = 532
# PCI Id of the vGPU instance
DCGM_FI_DEV_VGPU_PCI_ID = 533
# GPU Instance Id of the vGPU instance
DCGM_FI_DEV_VGPU_VM_GPU_INSTANCE_ID = 534
|
|
505
|
+
# Internal fields reserve the range 600..699
# The fields below relate to NVSwitch. Every NVSwitch ID in this table lies
# between DCGM_FI_FIRST_NVSWITCH_FIELD_ID and DCGM_FI_LAST_NVSWITCH_FIELD_ID.
DCGM_FI_FIRST_NVSWITCH_FIELD_ID = 700  # Starting field ID of the NVSwitch instance

# NVSwitch fields carrying the LINK_ prefix (IDs 780-816)
DCGM_FI_DEV_NVSWITCH_LINK_THROUGHPUT_TX = 780
DCGM_FI_DEV_NVSWITCH_LINK_THROUGHPUT_RX = 781
DCGM_FI_DEV_NVSWITCH_LINK_FATAL_ERRORS = 782
DCGM_FI_DEV_NVSWITCH_LINK_NON_FATAL_ERRORS = 783
DCGM_FI_DEV_NVSWITCH_LINK_REPLAY_ERRORS = 784
DCGM_FI_DEV_NVSWITCH_LINK_RECOVERY_ERRORS = 785
DCGM_FI_DEV_NVSWITCH_LINK_FLIT_ERRORS = 786
DCGM_FI_DEV_NVSWITCH_LINK_CRC_ERRORS = 787
DCGM_FI_DEV_NVSWITCH_LINK_ECC_ERRORS = 788
# Latency fields: one ID per bucket name (LOW/MEDIUM/HIGH/PANIC/COUNT) and
# per VC0..VC3 suffix.
DCGM_FI_DEV_NVSWITCH_LINK_LATENCY_LOW_VC0 = 789
DCGM_FI_DEV_NVSWITCH_LINK_LATENCY_LOW_VC1 = 790
DCGM_FI_DEV_NVSWITCH_LINK_LATENCY_LOW_VC2 = 791
DCGM_FI_DEV_NVSWITCH_LINK_LATENCY_LOW_VC3 = 792
DCGM_FI_DEV_NVSWITCH_LINK_LATENCY_MEDIUM_VC0 = 793
DCGM_FI_DEV_NVSWITCH_LINK_LATENCY_MEDIUM_VC1 = 794
DCGM_FI_DEV_NVSWITCH_LINK_LATENCY_MEDIUM_VC2 = 795
DCGM_FI_DEV_NVSWITCH_LINK_LATENCY_MEDIUM_VC3 = 796
DCGM_FI_DEV_NVSWITCH_LINK_LATENCY_HIGH_VC0 = 797
DCGM_FI_DEV_NVSWITCH_LINK_LATENCY_HIGH_VC1 = 798
DCGM_FI_DEV_NVSWITCH_LINK_LATENCY_HIGH_VC2 = 799
DCGM_FI_DEV_NVSWITCH_LINK_LATENCY_HIGH_VC3 = 800
DCGM_FI_DEV_NVSWITCH_LINK_LATENCY_PANIC_VC0 = 801
DCGM_FI_DEV_NVSWITCH_LINK_LATENCY_PANIC_VC1 = 802
DCGM_FI_DEV_NVSWITCH_LINK_LATENCY_PANIC_VC2 = 803
DCGM_FI_DEV_NVSWITCH_LINK_LATENCY_PANIC_VC3 = 804
DCGM_FI_DEV_NVSWITCH_LINK_LATENCY_COUNT_VC0 = 805
DCGM_FI_DEV_NVSWITCH_LINK_LATENCY_COUNT_VC1 = 806
DCGM_FI_DEV_NVSWITCH_LINK_LATENCY_COUNT_VC2 = 807
DCGM_FI_DEV_NVSWITCH_LINK_LATENCY_COUNT_VC3 = 808
# Per-lane CRC and ECC error fields (LANE0..LANE3)
DCGM_FI_DEV_NVSWITCH_LINK_CRC_ERRORS_LANE0 = 809
DCGM_FI_DEV_NVSWITCH_LINK_CRC_ERRORS_LANE1 = 810
DCGM_FI_DEV_NVSWITCH_LINK_CRC_ERRORS_LANE2 = 811
DCGM_FI_DEV_NVSWITCH_LINK_CRC_ERRORS_LANE3 = 812
DCGM_FI_DEV_NVSWITCH_LINK_ECC_ERRORS_LANE0 = 813
DCGM_FI_DEV_NVSWITCH_LINK_ECC_ERRORS_LANE1 = 814
DCGM_FI_DEV_NVSWITCH_LINK_ECC_ERRORS_LANE2 = 815
DCGM_FI_DEV_NVSWITCH_LINK_ECC_ERRORS_LANE3 = 816
# NVSwitch fields without the LINK_ prefix (IDs 856-862); IDs 817-855 are
# not assigned in this table.
DCGM_FI_DEV_NVSWITCH_FATAL_ERRORS = 856
DCGM_FI_DEV_NVSWITCH_NON_FATAL_ERRORS = 857
DCGM_FI_DEV_NVSWITCH_TEMPERATURE_CURRENT = 858
DCGM_FI_DEV_NVSWITCH_TEMPERATURE_LIMIT_SLOWDOWN = 859
DCGM_FI_DEV_NVSWITCH_TEMPERATURE_LIMIT_SHUTDOWN = 860
DCGM_FI_DEV_NVSWITCH_THROUGHPUT_TX = 861
DCGM_FI_DEV_NVSWITCH_THROUGHPUT_RX = 862

DCGM_FI_LAST_NVSWITCH_FIELD_ID = 899  # Last field ID of the NVSwitch instance
|
|
554
|
+
"""
|
|
555
|
+
Profiling Fields
|
|
556
|
+
"""
|
|
557
|
+
# Ratio of time the graphics engine is active. The graphics engine is
# active if a graphics/compute context is bound and the graphics pipe or
# compute pipe is busy.
DCGM_FI_PROF_GR_ENGINE_ACTIVE = 1001

# The ratio of cycles an SM has at least 1 warp assigned
# (computed from the number of cycles and elapsed cycles)
DCGM_FI_PROF_SM_ACTIVE = 1002

# The ratio of number of warps resident on an SM.
# (number of resident as a ratio of the theoretical
# maximum number of warps per elapsed cycle)
DCGM_FI_PROF_SM_OCCUPANCY = 1003

# The ratio of cycles any tensor pipe is active
# (off the peak sustained elapsed cycles)
DCGM_FI_PROF_PIPE_TENSOR_ACTIVE = 1004

# The ratio of cycles the device memory interface is active sending or receiving data.
DCGM_FI_PROF_DRAM_ACTIVE = 1005
# Ratio of cycles the fp64 pipe is active.
DCGM_FI_PROF_PIPE_FP64_ACTIVE = 1006
# Ratio of cycles the fp32 pipe is active.
DCGM_FI_PROF_PIPE_FP32_ACTIVE = 1007
# Ratio of cycles the fp16 pipe is active. This does not include HMMA.
DCGM_FI_PROF_PIPE_FP16_ACTIVE = 1008
# The number of bytes of active PCIe tx (transmit) data including both header and payload.
DCGM_FI_PROF_PCIE_TX_BYTES = 1009
# The number of bytes of active PCIe rx (read) data including both header and payload.
DCGM_FI_PROF_PCIE_RX_BYTES = 1010
# The number of bytes of active NvLink tx (transmit) data including both header and payload.
DCGM_FI_PROF_NVLINK_TX_BYTES = 1011
# The number of bytes of active NvLink rx (receive) data including both header and payload.
DCGM_FI_PROF_NVLINK_RX_BYTES = 1012
# The ratio of cycles the IMMA tensor pipe is active (off the peak sustained elapsed cycles)
DCGM_FI_PROF_PIPE_TENSOR_IMMA_ACTIVE = 1013
# The ratio of cycles the HMMA tensor pipe is active (off the peak sustained elapsed cycles)
DCGM_FI_PROF_PIPE_TENSOR_HMMA_ACTIVE = 1014
# The ratio of cycles the tensor (DFMA) pipe is active (off the peak sustained elapsed cycles)
DCGM_FI_PROF_PIPE_TENSOR_DFMA_ACTIVE = 1015
# Ratio of cycles the integer pipe is active.
DCGM_FI_PROF_PIPE_INT_ACTIVE = 1016

# Ratio of cycles each of the NVDEC engines are active.
DCGM_FI_PROF_NVDEC0_ACTIVE = 1017
DCGM_FI_PROF_NVDEC1_ACTIVE = 1018
DCGM_FI_PROF_NVDEC2_ACTIVE = 1019
DCGM_FI_PROF_NVDEC3_ACTIVE = 1020
DCGM_FI_PROF_NVDEC4_ACTIVE = 1021
DCGM_FI_PROF_NVDEC5_ACTIVE = 1022
DCGM_FI_PROF_NVDEC6_ACTIVE = 1023
DCGM_FI_PROF_NVDEC7_ACTIVE = 1024

# Ratio of cycles each of the NVJPG engines are active.
DCGM_FI_PROF_NVJPG0_ACTIVE = 1025
DCGM_FI_PROF_NVJPG1_ACTIVE = 1026
DCGM_FI_PROF_NVJPG2_ACTIVE = 1027
DCGM_FI_PROF_NVJPG3_ACTIVE = 1028
DCGM_FI_PROF_NVJPG4_ACTIVE = 1029
DCGM_FI_PROF_NVJPG5_ACTIVE = 1030
DCGM_FI_PROF_NVJPG6_ACTIVE = 1031
DCGM_FI_PROF_NVJPG7_ACTIVE = 1032

# Ratio of cycles each of the NVOFA engines are active.
DCGM_FI_PROF_NVOFA0_ACTIVE = 1033

# The per-link number of bytes of active NvLink TX (transmit) or RX (receive)
# data including both header and payload.
# For example: DCGM_FI_PROF_NVLINK_L0_TX_BYTES -> L0 TX
# To get the bandwidth for a link, add the RX and TX value together like
#   total = DCGM_FI_PROF_NVLINK_L0_TX_BYTES + DCGM_FI_PROF_NVLINK_L0_RX_BYTES
# IDs are laid out as consecutive TX/RX pairs: link k uses 1040 + 2*k (TX)
# and 1041 + 2*k (RX).
DCGM_FI_PROF_NVLINK_L0_TX_BYTES = 1040
DCGM_FI_PROF_NVLINK_L0_RX_BYTES = 1041
DCGM_FI_PROF_NVLINK_L1_TX_BYTES = 1042
DCGM_FI_PROF_NVLINK_L1_RX_BYTES = 1043
DCGM_FI_PROF_NVLINK_L2_TX_BYTES = 1044
DCGM_FI_PROF_NVLINK_L2_RX_BYTES = 1045
DCGM_FI_PROF_NVLINK_L3_TX_BYTES = 1046
DCGM_FI_PROF_NVLINK_L3_RX_BYTES = 1047
DCGM_FI_PROF_NVLINK_L4_TX_BYTES = 1048
DCGM_FI_PROF_NVLINK_L4_RX_BYTES = 1049
DCGM_FI_PROF_NVLINK_L5_TX_BYTES = 1050
DCGM_FI_PROF_NVLINK_L5_RX_BYTES = 1051
DCGM_FI_PROF_NVLINK_L6_TX_BYTES = 1052
DCGM_FI_PROF_NVLINK_L6_RX_BYTES = 1053
DCGM_FI_PROF_NVLINK_L7_TX_BYTES = 1054
DCGM_FI_PROF_NVLINK_L7_RX_BYTES = 1055
DCGM_FI_PROF_NVLINK_L8_TX_BYTES = 1056
DCGM_FI_PROF_NVLINK_L8_RX_BYTES = 1057
DCGM_FI_PROF_NVLINK_L9_TX_BYTES = 1058
DCGM_FI_PROF_NVLINK_L9_RX_BYTES = 1059
DCGM_FI_PROF_NVLINK_L10_TX_BYTES = 1060
DCGM_FI_PROF_NVLINK_L10_RX_BYTES = 1061
DCGM_FI_PROF_NVLINK_L11_TX_BYTES = 1062
DCGM_FI_PROF_NVLINK_L11_RX_BYTES = 1063
DCGM_FI_PROF_NVLINK_L12_TX_BYTES = 1064
DCGM_FI_PROF_NVLINK_L12_RX_BYTES = 1065
DCGM_FI_PROF_NVLINK_L13_TX_BYTES = 1066
DCGM_FI_PROF_NVLINK_L13_RX_BYTES = 1067
DCGM_FI_PROF_NVLINK_L14_TX_BYTES = 1068
DCGM_FI_PROF_NVLINK_L14_RX_BYTES = 1069
DCGM_FI_PROF_NVLINK_L15_TX_BYTES = 1070
DCGM_FI_PROF_NVLINK_L15_RX_BYTES = 1071
DCGM_FI_PROF_NVLINK_L16_TX_BYTES = 1072
DCGM_FI_PROF_NVLINK_L16_RX_BYTES = 1073
DCGM_FI_PROF_NVLINK_L17_TX_BYTES = 1074
DCGM_FI_PROF_NVLINK_L17_RX_BYTES = 1075

# Inclusive bounds of the per-link NvLink throughput field IDs above
DCGM_FI_PROF_NVLINK_THROUGHPUT_FIRST = DCGM_FI_PROF_NVLINK_L0_TX_BYTES
DCGM_FI_PROF_NVLINK_THROUGHPUT_LAST = DCGM_FI_PROF_NVLINK_L17_RX_BYTES

# greater than maximum fields above. This value can increase in the future
DCGM_FI_MAX_FIELDS = 1076
|
|
660
|
+
|
|
661
|
+
|
|
662
|
+
class struct_c_dcgm_field_meta_t(dcgm_structs._DcgmStructure):
    """Opaque handle for the struct_c_dcgm_field_meta_t structure; declares no fields."""


# Pointer type to the opaque field-metadata structure declared above.
dcgm_field_meta_t = POINTER(struct_c_dcgm_field_meta_t)
|
|
668
|
+
|
|
669
|
+
|
|
670
|
+
class _PrintableStructure(dcgm_structs._DcgmStructure):
    """
    Base class for structures whose ``str()`` lists every declared field.

    Instead of the default representation, e.g.
      >>> print(str(obj))
      <class_name object at 0x7fdf82fef9e0>
    subclasses print as
      class_name(field_name: formatted_value, field_name: formatted_value)

    Formatting is driven by the class attribute ``_fmt_``, a dictionary of
    <str _field_ name> -> <str format>. For example, a class with a c_uint
    field 'hex_value' can set
      _fmt_ = {"hex_value" : "%08X"}
    A default formatting string for all fields without their own entry can
    be given under the key "<default>":
      _fmt_ = {"<default>" : "%d MHz"} # e.g all values are numbers in MHz.
    When neither applies, "%s" is used.

    Exact format of returned str from this class is subject to change in the future.
    """

    _fmt_: Dict = {}

    def __str__(self):
        """Return ``ClassName(field: value, ...)`` built from ``_fields_`` and ``_fmt_``."""
        formats = self._fmt_
        # Per-field format wins, then the "<default>" entry, then plain "%s".
        fallback = formats.get("<default>", "%s")
        rendered = [
            ("%s: " + formats.get(entry[0], fallback))
            % (entry[0], getattr(self, entry[0]))
            for entry in self._fields_
        ]
        return self.__class__.__name__ + "(" + ", ".join(rendered) + ")"
|
|
704
|
+
|
|
705
|
+
|
|
706
|
+
# Provides access to functions from dcgm_agent_internal
# (alias of dcgm_structs._dcgmGetFunctionPointer, called below with a function name)
dcgmFP = dcgm_structs._dcgmGetFunctionPointer

# Lengths of the fixed-size char arrays in c_dcgm_field_output_format_t
SHORTNAME_LENGTH = 10
UNIT_LENGTH = 4


# Structure to hold formatting information for values
class c_dcgm_field_output_format_t(_PrintableStructure):
    """
    Formatting information for a field's values.

    Embedded as the ``valueFormat`` member of c_dcgm_field_meta_t, so the
    order and types of ``_fields_`` define part of that structure's layout.
    """

    _fields_ = [
        ("shortName", c_char * SHORTNAME_LENGTH),  # char array of SHORTNAME_LENGTH
        ("unit", c_char * UNIT_LENGTH),  # char array of UNIT_LENGTH
        ("width", c_short),
    ]
|
|
720
|
+
|
|
721
|
+
|
|
722
|
+
# Length of the fixed-size ``tag`` char array in c_dcgm_field_meta_t
TAG_LENGTH = 48


# Structure to represent the metadata of a single field
class c_dcgm_field_meta_t(_PrintableStructure):
    """
    Metadata record for one field, as returned by DcgmFieldGetById / DcgmFieldGetByTag.

    Instances are filled by a raw ``memmove`` from the pointer those lookups
    receive, so the order and types in ``_fields_`` must not be changed.
    """

    _fields_ = [
        # NOTE(review): the original comment here read "version must always be
        # first", but no version member is declared; fieldId is the first member.
        ("fieldId", c_short),
        ("fieldType", c_char),
        ("size", c_ubyte),
        ("tag", c_char * TAG_LENGTH),  # char array of TAG_LENGTH
        ("scope", c_int),
        ("valueFormat", c_dcgm_field_output_format_t),  # nested formatting struct
    ]
|
|
736
|
+
|
|
737
|
+
|
|
738
|
+
# Per-sampling-type property holder (e.g. Power, Utilization and Clock).
class pySamplingProperties:
    """
    Plain container for the information related to one sampling event type.

    Every constructor argument is stored unchanged on the instance under an
    attribute of the same name.
    """

    def __init__(
        self,
        name,
        sampling_type,
        sample_val_type,
        timeIntervalIdle,
        timeIntervalBoost,
        min_value,
        max_value,
    ):
        # Targets are assigned left to right, which keeps the attribute
        # creation order of the instance unchanged.
        (
            self.name,
            self.sampling_type,
            self.timeIntervalIdle,
            self.timeIntervalBoost,
            self.min_value,
            self.max_value,
            self.sample_val_type,
        ) = (
            name,
            sampling_type,
            timeIntervalIdle,
            timeIntervalBoost,
            min_value,
            max_value,
            sample_val_type,
        )
|
|
761
|
+
|
|
762
|
+
|
|
763
|
+
def DcgmFieldsInit():
    """
    Call the library function ``DcgmFieldsInit`` and verify that it succeeded.

    :raises AssertionError: if the call returns a non-zero value.
    """
    fn = dcgmFP("DcgmFieldsInit")
    ret = fn()
    # Raised explicitly rather than via ``assert`` so that the check is not
    # stripped when Python runs with -O; the exception type and message are
    # the same as before.
    if ret != 0:
        raise AssertionError("Got return %d from DcgmFieldsInit" % ret)
|
|
767
|
+
|
|
768
|
+
|
|
769
|
+
def DcgmFieldGetById(fieldId):
    """
    Look up the metadata of a field by its numeric ID.

    :param fieldId: Field ID to get metadata for
    :return: c_dcgm_field_meta_t struct on success. None on error.
    """
    DcgmFieldsInit()

    lookup = dcgmFP("DcgmFieldGetById")
    lookup.restype = POINTER(c_dcgm_field_meta_t)
    meta_ptr = lookup(fieldId)

    if meta_ptr:
        # Copy the pointed-to struct into a Python-allocated instance so the
        # caller receives its own object rather than the returned pointer.
        meta_copy = c_dcgm_field_meta_t()
        memmove(addressof(meta_copy), meta_ptr, sizeof(meta_copy))
        return meta_copy

    # A NULL pointer means the lookup failed.
    return None
|
|
787
|
+
|
|
788
|
+
|
|
789
|
+
def DcgmFieldGetByTag(tag):
    """
    Get metadata for a field, given its string tag

    :param tag: Field tag to get metadata for. Example 'brand'. A ``str`` is
                encoded as UTF-8; ``bytes`` are passed through unchanged.
    :return: c_dcgm_field_meta_t struct on success. None on error.
    """
    DcgmFieldsInit()

    # Accept already-encoded tags as well as text ones.
    encoded_tag = tag if isinstance(tag, bytes) else tag.encode("utf-8")

    fn = dcgmFP("DcgmFieldGetByTag")
    fn.restype = POINTER(c_dcgm_field_meta_t)
    c_field_meta_ptr = fn(c_char_p(encoded_tag))
    if not c_field_meta_ptr:
        # NULL pointer: the lookup failed.
        return None

    # Copy the pointed-to struct into a Python-allocated instance so the
    # caller receives its own object rather than the returned pointer.
    retVal = c_dcgm_field_meta_t()
    memmove(addressof(retVal), c_field_meta_ptr, sizeof(retVal))
    return retVal
|
|
808
|
+
|
|
809
|
+
|
|
810
|
+
def DcgmFieldGetTagById(fieldId):
    """
    Return the ``tag`` member of the metadata for the given field ID.

    :param fieldId: Field ID to look up via DcgmFieldGetById
    :return: The ``tag`` value of the field's metadata, or None when the
             lookup returns nothing.
    """
    meta = DcgmFieldGetById(fieldId)
    return meta.tag if meta else None
|