triton-model-analyzer 1.48.0__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- model_analyzer/__init__.py +15 -0
- model_analyzer/analyzer.py +448 -0
- model_analyzer/cli/__init__.py +15 -0
- model_analyzer/cli/cli.py +193 -0
- model_analyzer/config/__init__.py +15 -0
- model_analyzer/config/generate/__init__.py +15 -0
- model_analyzer/config/generate/automatic_model_config_generator.py +164 -0
- model_analyzer/config/generate/base_model_config_generator.py +352 -0
- model_analyzer/config/generate/brute_plus_binary_parameter_search_run_config_generator.py +164 -0
- model_analyzer/config/generate/brute_run_config_generator.py +154 -0
- model_analyzer/config/generate/concurrency_sweeper.py +75 -0
- model_analyzer/config/generate/config_generator_interface.py +52 -0
- model_analyzer/config/generate/coordinate.py +143 -0
- model_analyzer/config/generate/coordinate_data.py +86 -0
- model_analyzer/config/generate/generator_utils.py +116 -0
- model_analyzer/config/generate/manual_model_config_generator.py +187 -0
- model_analyzer/config/generate/model_config_generator_factory.py +92 -0
- model_analyzer/config/generate/model_profile_spec.py +74 -0
- model_analyzer/config/generate/model_run_config_generator.py +154 -0
- model_analyzer/config/generate/model_variant_name_manager.py +150 -0
- model_analyzer/config/generate/neighborhood.py +536 -0
- model_analyzer/config/generate/optuna_plus_concurrency_sweep_run_config_generator.py +141 -0
- model_analyzer/config/generate/optuna_run_config_generator.py +838 -0
- model_analyzer/config/generate/perf_analyzer_config_generator.py +312 -0
- model_analyzer/config/generate/quick_plus_concurrency_sweep_run_config_generator.py +130 -0
- model_analyzer/config/generate/quick_run_config_generator.py +753 -0
- model_analyzer/config/generate/run_config_generator_factory.py +329 -0
- model_analyzer/config/generate/search_config.py +112 -0
- model_analyzer/config/generate/search_dimension.py +73 -0
- model_analyzer/config/generate/search_dimensions.py +85 -0
- model_analyzer/config/generate/search_parameter.py +49 -0
- model_analyzer/config/generate/search_parameters.py +388 -0
- model_analyzer/config/input/__init__.py +15 -0
- model_analyzer/config/input/config_command.py +483 -0
- model_analyzer/config/input/config_command_profile.py +1747 -0
- model_analyzer/config/input/config_command_report.py +267 -0
- model_analyzer/config/input/config_defaults.py +236 -0
- model_analyzer/config/input/config_enum.py +83 -0
- model_analyzer/config/input/config_field.py +216 -0
- model_analyzer/config/input/config_list_generic.py +112 -0
- model_analyzer/config/input/config_list_numeric.py +151 -0
- model_analyzer/config/input/config_list_string.py +111 -0
- model_analyzer/config/input/config_none.py +71 -0
- model_analyzer/config/input/config_object.py +129 -0
- model_analyzer/config/input/config_primitive.py +81 -0
- model_analyzer/config/input/config_status.py +75 -0
- model_analyzer/config/input/config_sweep.py +83 -0
- model_analyzer/config/input/config_union.py +113 -0
- model_analyzer/config/input/config_utils.py +128 -0
- model_analyzer/config/input/config_value.py +243 -0
- model_analyzer/config/input/objects/__init__.py +15 -0
- model_analyzer/config/input/objects/config_model_profile_spec.py +325 -0
- model_analyzer/config/input/objects/config_model_report_spec.py +173 -0
- model_analyzer/config/input/objects/config_plot.py +198 -0
- model_analyzer/config/input/objects/config_protobuf_utils.py +101 -0
- model_analyzer/config/input/yaml_config_validator.py +82 -0
- model_analyzer/config/run/__init__.py +15 -0
- model_analyzer/config/run/model_run_config.py +313 -0
- model_analyzer/config/run/run_config.py +168 -0
- model_analyzer/constants.py +76 -0
- model_analyzer/device/__init__.py +15 -0
- model_analyzer/device/device.py +24 -0
- model_analyzer/device/gpu_device.py +87 -0
- model_analyzer/device/gpu_device_factory.py +248 -0
- model_analyzer/entrypoint.py +307 -0
- model_analyzer/log_formatter.py +65 -0
- model_analyzer/model_analyzer_exceptions.py +24 -0
- model_analyzer/model_manager.py +255 -0
- model_analyzer/monitor/__init__.py +15 -0
- model_analyzer/monitor/cpu_monitor.py +69 -0
- model_analyzer/monitor/dcgm/DcgmDiag.py +191 -0
- model_analyzer/monitor/dcgm/DcgmFieldGroup.py +83 -0
- model_analyzer/monitor/dcgm/DcgmGroup.py +815 -0
- model_analyzer/monitor/dcgm/DcgmHandle.py +141 -0
- model_analyzer/monitor/dcgm/DcgmJsonReader.py +69 -0
- model_analyzer/monitor/dcgm/DcgmReader.py +623 -0
- model_analyzer/monitor/dcgm/DcgmStatus.py +57 -0
- model_analyzer/monitor/dcgm/DcgmSystem.py +412 -0
- model_analyzer/monitor/dcgm/__init__.py +15 -0
- model_analyzer/monitor/dcgm/common/__init__.py +13 -0
- model_analyzer/monitor/dcgm/common/dcgm_client_cli_parser.py +194 -0
- model_analyzer/monitor/dcgm/common/dcgm_client_main.py +86 -0
- model_analyzer/monitor/dcgm/dcgm_agent.py +887 -0
- model_analyzer/monitor/dcgm/dcgm_collectd_plugin.py +369 -0
- model_analyzer/monitor/dcgm/dcgm_errors.py +395 -0
- model_analyzer/monitor/dcgm/dcgm_field_helpers.py +546 -0
- model_analyzer/monitor/dcgm/dcgm_fields.py +815 -0
- model_analyzer/monitor/dcgm/dcgm_fields_collectd.py +671 -0
- model_analyzer/monitor/dcgm/dcgm_fields_internal.py +29 -0
- model_analyzer/monitor/dcgm/dcgm_fluentd.py +45 -0
- model_analyzer/monitor/dcgm/dcgm_monitor.py +138 -0
- model_analyzer/monitor/dcgm/dcgm_prometheus.py +326 -0
- model_analyzer/monitor/dcgm/dcgm_structs.py +2357 -0
- model_analyzer/monitor/dcgm/dcgm_telegraf.py +65 -0
- model_analyzer/monitor/dcgm/dcgm_value.py +151 -0
- model_analyzer/monitor/dcgm/dcgmvalue.py +155 -0
- model_analyzer/monitor/dcgm/denylist_recommendations.py +573 -0
- model_analyzer/monitor/dcgm/pydcgm.py +47 -0
- model_analyzer/monitor/monitor.py +143 -0
- model_analyzer/monitor/remote_monitor.py +137 -0
- model_analyzer/output/__init__.py +15 -0
- model_analyzer/output/file_writer.py +63 -0
- model_analyzer/output/output_writer.py +42 -0
- model_analyzer/perf_analyzer/__init__.py +15 -0
- model_analyzer/perf_analyzer/genai_perf_config.py +206 -0
- model_analyzer/perf_analyzer/perf_analyzer.py +882 -0
- model_analyzer/perf_analyzer/perf_config.py +479 -0
- model_analyzer/plots/__init__.py +15 -0
- model_analyzer/plots/detailed_plot.py +266 -0
- model_analyzer/plots/plot_manager.py +224 -0
- model_analyzer/plots/simple_plot.py +213 -0
- model_analyzer/record/__init__.py +15 -0
- model_analyzer/record/gpu_record.py +68 -0
- model_analyzer/record/metrics_manager.py +887 -0
- model_analyzer/record/record.py +280 -0
- model_analyzer/record/record_aggregator.py +256 -0
- model_analyzer/record/types/__init__.py +15 -0
- model_analyzer/record/types/cpu_available_ram.py +93 -0
- model_analyzer/record/types/cpu_used_ram.py +93 -0
- model_analyzer/record/types/gpu_free_memory.py +96 -0
- model_analyzer/record/types/gpu_power_usage.py +107 -0
- model_analyzer/record/types/gpu_total_memory.py +96 -0
- model_analyzer/record/types/gpu_used_memory.py +96 -0
- model_analyzer/record/types/gpu_utilization.py +108 -0
- model_analyzer/record/types/inter_token_latency_avg.py +60 -0
- model_analyzer/record/types/inter_token_latency_base.py +74 -0
- model_analyzer/record/types/inter_token_latency_max.py +60 -0
- model_analyzer/record/types/inter_token_latency_min.py +60 -0
- model_analyzer/record/types/inter_token_latency_p25.py +60 -0
- model_analyzer/record/types/inter_token_latency_p50.py +60 -0
- model_analyzer/record/types/inter_token_latency_p75.py +60 -0
- model_analyzer/record/types/inter_token_latency_p90.py +60 -0
- model_analyzer/record/types/inter_token_latency_p95.py +60 -0
- model_analyzer/record/types/inter_token_latency_p99.py +60 -0
- model_analyzer/record/types/output_token_throughput.py +105 -0
- model_analyzer/record/types/perf_client_response_wait.py +97 -0
- model_analyzer/record/types/perf_client_send_recv.py +97 -0
- model_analyzer/record/types/perf_latency.py +111 -0
- model_analyzer/record/types/perf_latency_avg.py +60 -0
- model_analyzer/record/types/perf_latency_base.py +74 -0
- model_analyzer/record/types/perf_latency_p90.py +60 -0
- model_analyzer/record/types/perf_latency_p95.py +60 -0
- model_analyzer/record/types/perf_latency_p99.py +60 -0
- model_analyzer/record/types/perf_server_compute_infer.py +97 -0
- model_analyzer/record/types/perf_server_compute_input.py +97 -0
- model_analyzer/record/types/perf_server_compute_output.py +97 -0
- model_analyzer/record/types/perf_server_queue.py +97 -0
- model_analyzer/record/types/perf_throughput.py +105 -0
- model_analyzer/record/types/time_to_first_token_avg.py +60 -0
- model_analyzer/record/types/time_to_first_token_base.py +74 -0
- model_analyzer/record/types/time_to_first_token_max.py +60 -0
- model_analyzer/record/types/time_to_first_token_min.py +60 -0
- model_analyzer/record/types/time_to_first_token_p25.py +60 -0
- model_analyzer/record/types/time_to_first_token_p50.py +60 -0
- model_analyzer/record/types/time_to_first_token_p75.py +60 -0
- model_analyzer/record/types/time_to_first_token_p90.py +60 -0
- model_analyzer/record/types/time_to_first_token_p95.py +60 -0
- model_analyzer/record/types/time_to_first_token_p99.py +60 -0
- model_analyzer/reports/__init__.py +15 -0
- model_analyzer/reports/html_report.py +195 -0
- model_analyzer/reports/pdf_report.py +50 -0
- model_analyzer/reports/report.py +86 -0
- model_analyzer/reports/report_factory.py +62 -0
- model_analyzer/reports/report_manager.py +1376 -0
- model_analyzer/reports/report_utils.py +42 -0
- model_analyzer/result/__init__.py +15 -0
- model_analyzer/result/constraint_manager.py +150 -0
- model_analyzer/result/model_config_measurement.py +354 -0
- model_analyzer/result/model_constraints.py +105 -0
- model_analyzer/result/parameter_search.py +246 -0
- model_analyzer/result/result_manager.py +430 -0
- model_analyzer/result/result_statistics.py +159 -0
- model_analyzer/result/result_table.py +217 -0
- model_analyzer/result/result_table_manager.py +646 -0
- model_analyzer/result/result_utils.py +42 -0
- model_analyzer/result/results.py +277 -0
- model_analyzer/result/run_config_measurement.py +658 -0
- model_analyzer/result/run_config_result.py +210 -0
- model_analyzer/result/run_config_result_comparator.py +110 -0
- model_analyzer/result/sorted_results.py +151 -0
- model_analyzer/state/__init__.py +15 -0
- model_analyzer/state/analyzer_state.py +76 -0
- model_analyzer/state/analyzer_state_manager.py +215 -0
- model_analyzer/triton/__init__.py +15 -0
- model_analyzer/triton/client/__init__.py +15 -0
- model_analyzer/triton/client/client.py +234 -0
- model_analyzer/triton/client/client_factory.py +57 -0
- model_analyzer/triton/client/grpc_client.py +104 -0
- model_analyzer/triton/client/http_client.py +107 -0
- model_analyzer/triton/model/__init__.py +15 -0
- model_analyzer/triton/model/model_config.py +556 -0
- model_analyzer/triton/model/model_config_variant.py +29 -0
- model_analyzer/triton/server/__init__.py +15 -0
- model_analyzer/triton/server/server.py +76 -0
- model_analyzer/triton/server/server_config.py +269 -0
- model_analyzer/triton/server/server_docker.py +229 -0
- model_analyzer/triton/server/server_factory.py +306 -0
- model_analyzer/triton/server/server_local.py +158 -0
- triton_model_analyzer-1.48.0.dist-info/METADATA +52 -0
- triton_model_analyzer-1.48.0.dist-info/RECORD +204 -0
- triton_model_analyzer-1.48.0.dist-info/WHEEL +5 -0
- triton_model_analyzer-1.48.0.dist-info/entry_points.txt +2 -0
- triton_model_analyzer-1.48.0.dist-info/licenses/LICENSE +67 -0
- triton_model_analyzer-1.48.0.dist-info/top_level.txt +1 -0
|
@@ -0,0 +1,2357 @@
|
|
|
1
|
+
# Copyright (c) 2022, NVIDIA CORPORATION & AFFILIATES. All rights reserved.
|
|
2
|
+
#
|
|
3
|
+
# Licensed under the Apache License, Version 2.0 (the "License");
|
|
4
|
+
# you may not use this file except in compliance with the License.
|
|
5
|
+
# You may obtain a copy of the License at
|
|
6
|
+
#
|
|
7
|
+
# http://www.apache.org/licenses/LICENSE-2.0
|
|
8
|
+
#
|
|
9
|
+
# Unless required by applicable law or agreed to in writing, software
|
|
10
|
+
# distributed under the License is distributed on an "AS IS" BASIS,
|
|
11
|
+
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
|
12
|
+
# See the License for the specific language governing permissions and
|
|
13
|
+
# limitations under the License.
|
|
14
|
+
##
|
|
15
|
+
# Python bindings for "dcgm_structs.h"
|
|
16
|
+
##
|
|
17
|
+
|
|
18
|
+
from ctypes import *
|
|
19
|
+
from ctypes.util import find_library
|
|
20
|
+
import sys
|
|
21
|
+
import os
|
|
22
|
+
import threading
|
|
23
|
+
import string
|
|
24
|
+
import json
|
|
25
|
+
import model_analyzer.monitor.dcgm.dcgmvalue as dcgmvalue
|
|
26
|
+
import platform
|
|
27
|
+
from inspect import isclass
|
|
28
|
+
from typing import Dict, List
|
|
29
|
+
|
|
30
|
+
# Fixed sizes and upper bounds.
DCGM_MAX_STR_LENGTH = 256
# DCGM 2.0 and newer = 32. DCGM 1.8 and older = 16
DCGM_MAX_NUM_DEVICES = 32
DCGM_MAX_NUM_SWITCHES = 12

# NvLink limits.
DCGM_NVLINK_MAX_LINKS_PER_GPU = 18
DCGM_NVLINK_MAX_LINKS_PER_GPU_LEGACY1 = 6
DCGM_NVLINK_MAX_LINKS_PER_GPU_LEGACY2 = 12
# Max NvLinks per NvSwitch pre-Hopper
DCGM_NVLINK_MAX_LINKS_PER_NVSWITCH_V1 = 36
DCGM_NVLINK_MAX_LINKS_PER_NVSWITCH = 64
DCGM_LANE_MAX_LANES_PER_NVSWICH_LINK = 4

DCGM_MAX_CLOCKS = 256
DCGM_MAX_NUM_GROUPS = 64
DCGM_MAX_BLOB_LENGTH = 4096

# vGPU limits and buffer sizes.
DCGM_MAX_VGPU_INSTANCES_PER_PGPU = 32
DCGM_VGPU_NAME_BUFFER_SIZE = 64
DCGM_GRID_LICENSE_BUFFER_SIZE = 128
DCGM_MAX_VGPU_TYPES_PER_PGPU = 32

DCGM_DEVICE_UUID_BUFFER_SIZE = 80
DCGM_MAX_FBC_SESSIONS = 256
|
|
48
|
+
|
|
49
|
+
# Ordering of results when a query returns more than one value.
DCGM_ORDER_ASCENDING = 1
DCGM_ORDER_DESCENDING = 2

# Operation modes.
DCGM_OPERATION_MODE_AUTO = 1
DCGM_OPERATION_MODE_MANUAL = 2

# Encoder query selectors.
DCGM_ENCODER_QUERY_H264 = 0
DCGM_ENCODER_QUERY_HEVC = 1

# Frame buffer capture (FBC) session types.
# Unknown
DCGM_FBC_SESSION_TYPE_UNKNOWN = 0
# FB capture for a system buffer
DCGM_FBC_SESSION_TYPE_TOSYS = 1
# FB capture for a cuda buffer
DCGM_FBC_SESSION_TYPE_CUDA = 2
# FB capture for a Vid buffer
DCGM_FBC_SESSION_TYPE_VID = 3
# FB capture for a NVENC HW buffer
DCGM_FBC_SESSION_TYPE_HWENC = 4
|
|
64
|
+
|
|
65
|
+
## C Type mappings ##
## Enums

# Return types
# NOTE(review): the status codes below are negative while this alias is an
# unsigned ctypes type -- confirm how raw return values are converted before
# they are compared against the DCGM_ST_* constants.
_dcgmReturn_t = c_uint

# Success
DCGM_ST_OK = 0
# A bad parameter was passed to a function
DCGM_ST_BADPARAM = -1
# A generic, unspecified error (note: -2 is not assigned in this file)
DCGM_ST_GENERIC_ERROR = -3
# An out of memory error occurred
DCGM_ST_MEMORY = -4
# Setting not configured
DCGM_ST_NOT_CONFIGURED = -5
# Feature not supported
DCGM_ST_NOT_SUPPORTED = -6
# DCGM Init error
DCGM_ST_INIT_ERROR = -7
# When NVML returns error.
DCGM_ST_NVML_ERROR = -8
# Object is in pending state of something else
DCGM_ST_PENDING = -9
# Object is in undefined state
DCGM_ST_UNINITIALIZED = -10
# Requested operation timed out
DCGM_ST_TIMEOUT = -11
# Version mismatch between received and understood API
DCGM_ST_VER_MISMATCH = -12
# Unknown field id
DCGM_ST_UNKNOWN_FIELD = -13
# No data is available
DCGM_ST_NO_DATA = -14
DCGM_ST_STALE_DATA = -15
# The given field is not being updated by the cache manager
DCGM_ST_NOT_WATCHED = -16
# We are not permissioned to perform the desired action
DCGM_ST_NO_PERMISSION = -17
# GPU is no longer reachable
DCGM_ST_GPU_IS_LOST = -18
# GPU requires a reset
DCGM_ST_RESET_REQUIRED = -19
# Unable to find function
DCGM_ST_FUNCTION_NOT_FOUND = -20
# Connection to the host engine is not valid any longer
DCGM_ST_CONNECTION_NOT_VALID = -21
# This GPU is not supported by DCGM
DCGM_ST_GPU_NOT_SUPPORTED = -22
# The GPUs of the provided group are not compatible with each other for the
# requested operation
DCGM_ST_GROUP_INCOMPATIBLE = -23
DCGM_ST_MAX_LIMIT = -24
# DCGM library could not be found
DCGM_ST_LIBRARY_NOT_FOUND = -25
# Duplicate key passed to the function
DCGM_ST_DUPLICATE_KEY = -26
# GPU is already a part of a sync boost group
DCGM_ST_GPU_IN_SYNC_BOOST_GROUP = -27
# GPU is not a part of sync boost group
DCGM_ST_GPU_NOT_IN_SYNC_BOOST_GROUP = -28
# This operation cannot be performed when the host engine is running as non-root
DCGM_ST_REQUIRES_ROOT = -29
# DCGM GPU Diagnostic was successfully executed, but reported an error.
DCGM_ST_NVVS_ERROR = -30
# An input argument is not large enough
DCGM_ST_INSUFFICIENT_SIZE = -31
# The given field ID is not supported by the API being called
DCGM_ST_FIELD_UNSUPPORTED_BY_API = -32
# This request is serviced by a module of DCGM that is not currently loaded
DCGM_ST_MODULE_NOT_LOADED = -33
# The requested operation could not be completed because the affected
# resource is in use
DCGM_ST_IN_USE = -34
# The specified group is empty and this operation is not valid with an empty group
DCGM_ST_GROUP_IS_EMPTY = -35
# Profiling is not supported for this group of GPUs or GPU
DCGM_ST_PROFILING_NOT_SUPPORTED = -36
# The third-party Profiling module returned an unrecoverable error
DCGM_ST_PROFILING_LIBRARY_ERROR = -37
# The requested profiling metrics cannot be collected in a single pass
DCGM_ST_PROFILING_MULTI_PASS = -38
# A diag instance is already running, cannot run a new diag until the current
# one finishes.
DCGM_ST_DIAG_ALREADY_RUNNING = -39
# The DCGM GPU Diagnostic returned JSON that cannot be parsed
DCGM_ST_DIAG_BAD_JSON = -40
# Error while launching the DCGM GPU Diagnostic
DCGM_ST_DIAG_BAD_LAUNCH = -41
# Unused
DCGM_ST_DIAG_UNUSED = -42
# A field value met or exceeded the error threshold.
DCGM_ST_DIAG_THRESHOLD_EXCEEDED = -43
# The installed driver version is insufficient for this API
DCGM_ST_INSUFFICIENT_DRIVER_VERSION = -44
# The specified GPU instance does not exist
DCGM_ST_INSTANCE_NOT_FOUND = -45
# The specified GPU compute instance does not exist
DCGM_ST_COMPUTE_INSTANCE_NOT_FOUND = -46
# Couldn't kill a child process within the retries
DCGM_ST_CHILD_NOT_KILLED = -47
# Detected an error in a 3rd-party library
DCGM_ST_3RD_PARTY_LIBRARY_ERROR = -48
# Not enough resources available
DCGM_ST_INSUFFICIENT_RESOURCES = -49
# Exception thrown from a diagnostic plugin
DCGM_ST_PLUGIN_EXCEPTION = -50
# The diagnostic returned an error that indicates the need for isolation
DCGM_ST_NVVS_ISOLATE_ERROR = -51
# The NVVS binary was not found in the specified location
DCGM_ST_NVVS_BINARY_NOT_FOUND = -52
# The NVVS process was killed by a signal
DCGM_ST_NVVS_KILLED = -53
# The hostengine and all modules are paused
DCGM_ST_PAUSED = -54
|
|
160
|
+
|
|
161
|
+
# Group types.
# All the GPUs on the node are added to the group
DCGM_GROUP_DEFAULT = 0
# Creates an empty group
DCGM_GROUP_EMPTY = 1
# All NvSwitches of the node are added to the group
DCGM_GROUP_DEFAULT_NVSWITCHES = 2
# All GPU instances of the node are added to the group
DCGM_GROUP_DEFAULT_INSTANCES = 3
# All compute instances of the node are added to the group
DCGM_GROUP_DEFAULT_COMPUTE_INSTANCES = 4
# All entities are added to this default group
DCGM_GROUP_DEFAULT_ENTITIES = 5

# Reserved "all ..." group identifiers, counting down from 0x7FFFFFFF.
DCGM_GROUP_ALL_GPUS = 0x7FFFFFFF
DCGM_GROUP_ALL_NVSWITCHES = 0x7FFFFFFE
DCGM_GROUP_ALL_INSTANCES = 0x7FFFFFFD
DCGM_GROUP_ALL_COMPUTE_INSTANCES = 0x7FFFFFFC
DCGM_GROUP_ALL_ENTITIES = 0x7FFFFFFB

# Maximum number of entities per entity group
DCGM_GROUP_MAX_ENTITIES = 64

# The target configuration values to be applied
DCGM_CONFIG_TARGET_STATE = 0
# The current configuration state
DCGM_CONFIG_CURRENT_STATE = 1

# Represents the power cap to be applied for each member of the group
DCGM_CONFIG_POWER_CAP_INDIVIDUAL = 0
# Represents the power budget for the entire group
DCGM_CONFIG_POWER_BUDGET_GROUP = 1

# Default compute mode -- multiple contexts per device
DCGM_CONFIG_COMPUTEMODE_DEFAULT = 0
# Compute-prohibited mode -- no contexts per device
DCGM_CONFIG_COMPUTEMODE_PROHIBITED = 1
# Compute-exclusive-process mode -- only one context per device, usable from
# multiple threads at a time
DCGM_CONFIG_COMPUTEMODE_EXCLUSIVE_PROCESS = 2
|
|
193
|
+
|
|
194
|
+
# Topology values; every constant is a distinct single bit.
DCGM_TOPOLOGY_BOARD = 1 << 0
DCGM_TOPOLOGY_SINGLE = 1 << 1
DCGM_TOPOLOGY_MULTIPLE = 1 << 2
DCGM_TOPOLOGY_HOSTBRIDGE = 1 << 3
DCGM_TOPOLOGY_CPU = 1 << 4
DCGM_TOPOLOGY_SYSTEM = 1 << 5
# The NVLINK<n> values start at bit 8 (bits 6 and 7 are not assigned here).
DCGM_TOPOLOGY_NVLINK1 = 1 << 8
DCGM_TOPOLOGY_NVLINK2 = 1 << 9
DCGM_TOPOLOGY_NVLINK3 = 1 << 10
DCGM_TOPOLOGY_NVLINK4 = 1 << 11
DCGM_TOPOLOGY_NVLINK5 = 1 << 12
DCGM_TOPOLOGY_NVLINK6 = 1 << 13
DCGM_TOPOLOGY_NVLINK7 = 1 << 14
DCGM_TOPOLOGY_NVLINK8 = 1 << 15
DCGM_TOPOLOGY_NVLINK9 = 1 << 16
DCGM_TOPOLOGY_NVLINK10 = 1 << 17
DCGM_TOPOLOGY_NVLINK11 = 1 << 18
DCGM_TOPOLOGY_NVLINK12 = 1 << 19
|
|
212
|
+
|
|
213
|
+
# Diagnostic per gpu tests - fixed indices for dcgmDiagResponsePerGpu_t.results[]
# The indices are consecutive, starting at 0.
(
    DCGM_MEMORY_INDEX,
    DCGM_DIAGNOSTIC_INDEX,
    DCGM_PCI_INDEX,
    DCGM_SM_STRESS_INDEX,
    DCGM_TARGETED_STRESS_INDEX,
    DCGM_TARGETED_POWER_INDEX,
    DCGM_MEMORY_BANDWIDTH_INDEX,
    DCGM_MEMTEST_INDEX,
    DCGM_PULSE_TEST_INDEX,
    DCGM_EUD_TEST_INDEX,
    DCGM_UNUSED2_TEST_INDEX,
    DCGM_UNUSED3_TEST_INDEX,
    DCGM_UNUSED4_TEST_INDEX,
    DCGM_UNUSED5_TEST_INDEX,
) = range(14)
DCGM_PER_GPU_TEST_COUNT_V7 = 9
DCGM_PER_GPU_TEST_COUNT_V8 = 13

# DCGM Diag Level One test indices (consecutive, starting at 0)
(
    DCGM_SWTEST_DENYLIST,
    DCGM_SWTEST_NVML_LIBRARY,
    DCGM_SWTEST_CUDA_MAIN_LIBRARY,
    DCGM_SWTEST_CUDA_RUNTIME_LIBRARY,
    DCGM_SWTEST_PERMISSIONS,
    DCGM_SWTEST_PERSISTENCE_MODE,
    DCGM_SWTEST_ENVIRONMENT,
    DCGM_SWTEST_PAGE_RETIREMENT,
    DCGM_SWTEST_GRAPHICS_PROCESSES,
    DCGM_SWTEST_INFOROM,
) = range(10)

# This test is only run by itself, so it can use the 0 slot
DCGM_CONTEXT_CREATE_INDEX = 0
|
|
245
|
+
|
|
246
|
+
|
|
247
|
+
class DCGM_INTROSPECT_STATE:
    """Namespace holding the two introspection state values."""

    DISABLED, ENABLED = 0, 1
|
|
250
|
+
|
|
251
|
+
|
|
252
|
+
# Lib loading
# Module-level state; nothing is loaded at import time.
dcgmLib = None
libLoadLock = threading.Lock()
# Incremented on each dcgmInit and decremented on dcgmShutdown
_dcgmLib_refcount = 0
|
|
256
|
+
|
|
257
|
+
|
|
258
|
+
class DCGMError(Exception):
    """Exception carrying a DCGM return code.

    ``DCGMError(value)`` returns an instance of the subclass registered for
    ``value`` in ``_valClassMapping`` when one exists, otherwise a plain
    ``DCGMError``. Instances expose ``value`` (the code passed in) and ``info``
    (optional extra text, see ``SetAdditionalInfo``). Two errors compare equal
    when their ``value`` attributes are equal.
    """

    # Maps a DCGM_ST_* value to the DCGMError subclass that represents it.
    _valClassMapping: Dict = dict()
    # Text for the currently known error codes. __str__ lazily adds entries
    # for codes that are not listed by asking _dcgmErrorString.
    _error_code_to_string = {
        DCGM_ST_OK: "Success",
        DCGM_ST_BADPARAM: "Bad parameter passed to function",
        DCGM_ST_GENERIC_ERROR: "Generic unspecified error",
        DCGM_ST_MEMORY: "Out of memory error",
        DCGM_ST_NOT_CONFIGURED: "Setting not configured",
        DCGM_ST_NOT_SUPPORTED: "Feature not supported",
        DCGM_ST_INIT_ERROR: "DCGM initialization error",
        DCGM_ST_NVML_ERROR: "NVML error",
        DCGM_ST_PENDING: "Object is in a pending state",
        DCGM_ST_UNINITIALIZED: "Object is in an undefined state",
        DCGM_ST_TIMEOUT: "Timeout",
        DCGM_ST_VER_MISMATCH: "API version mismatch",
        DCGM_ST_UNKNOWN_FIELD: "Unknown field",
        DCGM_ST_NO_DATA: "No data is available",
        DCGM_ST_STALE_DATA: "Data is considered stale",
        DCGM_ST_NOT_WATCHED: "Field is not being updated",
        DCGM_ST_NO_PERMISSION: "Not permissioned",
        DCGM_ST_GPU_IS_LOST: "GPU is unreachable",
        DCGM_ST_RESET_REQUIRED: "GPU requires a reset",
        DCGM_ST_FUNCTION_NOT_FOUND: "Unable to find function",
        DCGM_ST_CONNECTION_NOT_VALID: "The connection to the host engine is not valid any longer",
        DCGM_ST_GPU_NOT_SUPPORTED: "This GPU is not supported by DCGM",
        DCGM_ST_GROUP_INCOMPATIBLE: "GPUs are incompatible with each other for the requested operation",
        DCGM_ST_MAX_LIMIT: "Max limit reached for the object",
        DCGM_ST_LIBRARY_NOT_FOUND: "DCGM library could not be found",
        DCGM_ST_DUPLICATE_KEY: "Duplicate key passed to function",
        DCGM_ST_GPU_IN_SYNC_BOOST_GROUP: "GPU is already a part of a sync boost group",
        DCGM_ST_GPU_NOT_IN_SYNC_BOOST_GROUP: "GPU is not a part of the sync boost group",
        DCGM_ST_REQUIRES_ROOT: "This operation is not supported when the host engine is running as non root",
        DCGM_ST_NVVS_ERROR: "DCGM GPU Diagnostic returned an error.",
        DCGM_ST_INSUFFICIENT_SIZE: "An input argument is not large enough",
        DCGM_ST_FIELD_UNSUPPORTED_BY_API: "The given field ID is not supported by the API being called",
        DCGM_ST_MODULE_NOT_LOADED: "This request is serviced by a module of DCGM that is not currently loaded",
        DCGM_ST_IN_USE: "The requested operation could not be completed because the affected resource is in use",
        DCGM_ST_GROUP_IS_EMPTY: "The specified group is empty, and this operation is incompatible with an empty group",
        DCGM_ST_PROFILING_NOT_SUPPORTED: "Profiling is not supported for this group of GPUs or GPU",
        DCGM_ST_PROFILING_LIBRARY_ERROR: "The third-party Profiling module returned an unrecoverable error",
        DCGM_ST_PROFILING_MULTI_PASS: "The requested profiling metrics cannot be collected in a single pass",
        DCGM_ST_DIAG_ALREADY_RUNNING: "A diag instance is already running, cannot run a new diag until the current one finishes",
        DCGM_ST_DIAG_BAD_JSON: "The GPU Diagnostic returned Json that cannot be parsed.",
        DCGM_ST_DIAG_BAD_LAUNCH: "Error while launching the GPU Diagnostic.",
        DCGM_ST_DIAG_UNUSED: "Unused error code",
        DCGM_ST_DIAG_THRESHOLD_EXCEEDED: "A field value met or exceeded the error threshold.",
        DCGM_ST_INSUFFICIENT_DRIVER_VERSION: "The installed driver version is insufficient for this API",
        DCGM_ST_INSTANCE_NOT_FOUND: "The specified GPU instance does not exist",
        DCGM_ST_COMPUTE_INSTANCE_NOT_FOUND: "The specified GPU compute instance does not exist",
        DCGM_ST_CHILD_NOT_KILLED: "Couldn't kill a child process within the retries",
        DCGM_ST_3RD_PARTY_LIBRARY_ERROR: "Detected an error in a 3rd-party library",
        DCGM_ST_INSUFFICIENT_RESOURCES: "Not enough resources available",
        DCGM_ST_PLUGIN_EXCEPTION: "Exception thrown from a diagnostic plugin",
        DCGM_ST_NVVS_ISOLATE_ERROR: "The diagnostic returned an error that indicates the need for isolation",
        # The three codes below are defined in this module but were missing
        # from the table; the wording follows the comments on the constants.
        DCGM_ST_NVVS_BINARY_NOT_FOUND: "The NVVS binary was not found in the specified location",
        DCGM_ST_NVVS_KILLED: "The NVVS process was killed by a signal",
        DCGM_ST_PAUSED: "The hostengine and all modules are paused",
    }

    def __new__(typ, value):
        """
        Maps value to a proper subclass of DCGMError.

        Only a direct ``DCGMError(value)`` call is redirected; when ``typ`` is
        already a subclass it is used as is.
        """
        if typ == DCGMError:
            typ = DCGMError._valClassMapping.get(value, typ)
        obj = Exception.__new__(typ)
        obj.info = None
        obj.value = value
        return obj

    def __str__(self):
        """Return the message for ``self.value``, followed by ``info`` if set.

        Unknown codes are resolved through ``_dcgmErrorString`` and cached in
        ``_error_code_to_string``. If that lookup fails for any reason a
        generic message containing the code is used instead, so the code is
        never hidden behind a secondary traceback.
        """
        try:
            if self.value not in DCGMError._error_code_to_string:
                DCGMError._error_code_to_string[self.value] = str(
                    _dcgmErrorString(self.value)
                )
            msg = DCGMError._error_code_to_string[self.value]
        # Catch every ordinary error, but let KeyboardInterrupt / SystemExit
        # propagate instead of being swallowed while formatting a message.
        except Exception:
            # %s rather than %d: the fallback itself must not raise when the
            # stored value is not a number.
            msg = "DCGM Error with code %s" % (self.value,)

        if self.info is not None:
            # endswith() is safe for an empty message, unlike msg[-1].
            if msg.endswith("."):
                msg = msg[:-1]
            msg += ": '%s'" % self.info
        return msg

    def __eq__(self, other):
        """Compare by ``value``; operands without a ``value`` are not equal."""
        try:
            return self.value == other.value
        except AttributeError:
            # Let Python try the reflected comparison and fall back to
            # identity instead of raising from a plain ``==``.
            return NotImplemented

    def __hash__(self):
        """Hash by ``value`` so that equal errors hash alike."""
        return hash(self.value)

    def SetAdditionalInfo(self, msg):
        """
        Sets msg as additional information returned by the string representation of DCGMError and subclasses.
        Example output for DCGMError_Uninitialized subclass, with msg set to 'more info msg here' is
        "DCGMError_Uninitialized: Object is in an undefined state: 'more info msg here'".

        Ensure that msg is a string or an object for which the __str__() method does not throw an error
        """
        self.info = msg
|
|
361
|
+
|
|
362
|
+
|
|
363
|
+
def dcgmExceptionClass(error_code):
    """Return the DCGMError subclass registered for error_code, or None if there is none."""
    registry = DCGMError._valClassMapping
    return registry.get(error_code)
|
|
365
|
+
|
|
366
|
+
|
|
367
|
+
def _extractDCGMErrorsAsClasses():
    """
    Generates a hierarchy of classes on top of the DCGMError class.

    Every module attribute whose name starts with DCGM_ST_ gets its own
    DCGMError subclass, so try/except blocks can target one specific error.
    The class name is derived from the constant name, e.g.
    DCGM_ST_UNINITIALIZED is turned into DCGMError_Uninitialized.

    Each generated class is published as an attribute of this module and is
    registered in DCGMError._valClassMapping under its status value.
    """
    module = sys.modules[__name__]

    def make_new(code):
        # A factory is needed so every class captures its own code; the
        # resulting __new__ takes no value argument.
        def new(typ):
            # pylint: disable=E1121
            return DCGMError.__new__(typ, code)

        return new

    for const_name in dir(module):
        if not const_name.startswith("DCGM_ST_"):
            continue
        code = getattr(module, const_name)
        suffix = const_name.replace("DCGM_ST_", "")
        # capwords capitalises each "_"-separated word; the separators are
        # then dropped, e.g. GPU_IS_LOST -> GpuIsLost.
        camel = string.capwords(suffix, "_").replace("_", "")
        cls_name = "DCGMError_" + camel

        error_cls = type(cls_name, (DCGMError,), {"__new__": make_new(code)})
        error_cls.__module__ = __name__
        setattr(module, cls_name, error_cls)
        DCGMError._valClassMapping[code] = error_cls


_extractDCGMErrorsAsClasses()
|
|
402
|
+
|
|
403
|
+
|
|
404
|
+
class struct_c_dcgmUnit_t(Structure):
    """Unit structure: an opaque handle, so no fields are declared."""


_dcgmUnit_t = POINTER(struct_c_dcgmUnit_t)
|
|
410
|
+
|
|
411
|
+
|
|
412
|
+
class _WrappedStructure:
    """
    Proxy around a wrapped object that converts between bytes and str.

    Reads (attribute or item access) decode bytes values as UTF-8 and wrap
    class values in another _WrappedStructure. Attribute writes coerce the
    value to int for c_uint / c_int fields, encode str values as UTF-8, and
    then store the result on the wrapped object.
    """

    def __init__(self, obj):
        # Go through __dict__ directly: __setattr__ below refuses "_obj".
        self.__dict__["_obj"] = obj

    def __getattr__(self, key):
        value = getattr(self._obj, key)
        if isinstance(value, bytes):
            return value.decode("utf-8")
        if isclass(value):
            return _WrappedStructure(value)
        return value

    def __getitem__(self, key):
        value = self._obj[key]
        if isinstance(value, bytes):
            return value.decode("utf-8")
        if isclass(value):
            return _WrappedStructure(value)
        return value

    def __setattr__(self, key, raw_value):
        """
        Store raw_value in the field named key of the wrapped object.

        :param key: field name; "_obj" is reserved
        :param raw_value: value to store; coerced as described on the class
        :return: the coerced value that was stored
        :raises RuntimeError: when key is "_obj"
        """
        if key == "_obj":
            raise RuntimeError("Cannot set _obj")

        # Declared ctypes type of the field, or None when key is not a field.
        fieldtype = next(
            (field[1] for field in self._obj._fields_ if field[0] == key), None
        )

        value = raw_value
        if fieldtype == c_uint and not isinstance(value, c_uint32):
            value = int(value)
        elif fieldtype == c_int and not isinstance(value, c_int32):
            value = int(value)
        elif isinstance(raw_value, str):
            value = raw_value.encode("utf-8")

        # key is a field *name*, so it has to be stored with setattr; item
        # assignment (self._obj[key] = ...) does not address a named field.
        setattr(self._obj, key, value)
        return value
|
|
457
|
+
|
|
458
|
+
|
|
459
|
+
class _DcgmStructure(Structure):
    """
    ctypes.Structure base that hands text back as str and coerces writes.

    Attribute reads decode bytes values as UTF-8 and wrap class values in
    _WrappedStructure. Attribute writes convert the value to int for c_uint /
    c_int fields and encode str values as UTF-8 before storing them.
    """

    def __getattribute__(self, key):
        attr = super().__getattribute__(key)
        if isinstance(attr, bytes):
            return attr.decode("utf-8")
        return _WrappedStructure(attr) if isclass(attr) else attr

    def __setattr__(self, key, raw_value):
        # Declared ctypes type of the field named key; None if key is not a field.
        fieldtype = next(
            (entry[1] for entry in self._fields_ if entry[0] == key), None
        )

        wants_int = (
            fieldtype == c_uint and not isinstance(raw_value, c_uint32)
        ) or (fieldtype == c_int and not isinstance(raw_value, c_int32))

        if wants_int:
            value = int(raw_value)
        elif isinstance(raw_value, str):
            value = raw_value.encode("utf-8")
        else:
            value = raw_value

        return super().__setattr__(key, value)
|
|
489
|
+
|
|
490
|
+
|
|
491
|
+
class DcgmUnion(Union):
    """
    ctypes.Union base with the same str/int conveniences as _DcgmStructure.

    Reads decode bytes to str (UTF-8) and wrap class values in
    _WrappedStructure; writes turn values into int for c_uint / c_int members
    and encode str values as UTF-8.
    """

    def __getattribute__(self, key):
        fetched = super().__getattribute__(key)
        if isinstance(fetched, bytes):
            fetched = fetched.decode("utf-8")
        elif isclass(fetched):
            fetched = _WrappedStructure(fetched)
        return fetched

    def __setattr__(self, key, raw_value):
        # Look up the declared type of the member being written (None if absent).
        matching_types = (member[1] for member in self._fields_ if member[0] == key)
        fieldtype = next(matching_types, None)

        if fieldtype == c_uint and not isinstance(raw_value, c_uint32):
            return super().__setattr__(key, int(raw_value))
        if fieldtype == c_int and not isinstance(raw_value, c_int32):
            return super().__setattr__(key, int(raw_value))
        if isinstance(raw_value, str):
            return super().__setattr__(key, raw_value.encode("utf-8"))
        return super().__setattr__(key, raw_value)
|
|
521
|
+
|
|
522
|
+
|
|
523
|
+
class _PrintableStructure(_DcgmStructure):
    """
    Abstract class that produces nicer __str__ output than ctypes.Structure.
    e.g. instead of:
      >>> print(str(obj))
      <class_name object at 0x7fdf82fef9e0>
    this class will print
      class_name(field_name: formatted_value, field_name: formatted_value)

    _fmt_ is a dictionary of <str _field_ name> -> <str format>,
    e.g. a class that has _field_ 'hex_value', c_uint could be formatted with
      _fmt_ = {"hex_value" : "%08X"}
    to produce nicer output.
    The default formatting string for all fields can be set with key "<default>" like:
      _fmt_ = {"<default>" : "%d MHz"} # e.g all values are numbers in MHz.
    If not set it's assumed to be just "%s"

    Exact format of returned str from this class is subject to change in the future.
    """

    _fmt_: Dict = {}

    def __str__(self):
        rendered = []
        for field in self._fields_:
            name = field[0]
            value = getattr(self, name)
            # Per-field format first, then the "<default>" entry, then plain "%s".
            if name in self._fmt_:
                fmt = self._fmt_[name]
            elif "<default>" in self._fmt_:
                fmt = self._fmt_["<default>"]
            else:
                fmt = "%s"
            rendered.append(("%s: " + fmt) % (name, value))
        return "%s(%s)" % (self.__class__.__name__, ", ".join(rendered))

    def FieldsSizeof(self):
        """Return the sum of sizeof() over the declared type of every field."""
        return sum(sizeof(field_type) for _field_name, field_type in self._fields_)
|
|
563
|
+
|
|
564
|
+
|
|
565
|
+
# JSON serializer for DCGM structures
class DcgmJSONEncoder(json.JSONEncoder):
    """
    json.JSONEncoder that can serialize DCGM ctypes values.

    _PrintableStructure instances become a dict of field name -> value, and
    ctypes arrays become a list. Array elements that declare _fields_ are
    converted to dicts; all other elements (e.g. the plain Python numbers
    produced by an array of c_uint) are passed through unchanged.
    """

    def _fields_to_dict(self, struct):
        """Map every field name of struct to its (recursively converted) value."""
        converted = {}
        for field in struct._fields_:
            field_name = field[0]
            value = getattr(struct, field_name)
            if isinstance(value, _PrintableStructure):
                value = self.default(value)
            converted[field_name] = value
        return converted

    def default(self, o):  # pylint: disable=method-hidden
        """
        Convert o into something the json module can serialize.

        :param o: object the stock encoder could not serialize
        :return: list for ctypes arrays, dict for _PrintableStructure instances
        :raises TypeError: from the parent class for any other object
        """
        if isinstance(o, Array):
            # Elements of primitive arrays have no _fields_; indexing them
            # already yields a plain value, so hand it back untouched.
            return [
                self._fields_to_dict(element) if hasattr(element, "_fields_") else element
                for element in o
            ]

        if isinstance(o, _PrintableStructure):
            return self._fields_to_dict(o)

        # Let the parent class handle this/fail
        return json.JSONEncoder.default(self, o)
|
|
595
|
+
|
|
596
|
+
|
|
597
|
+
# Creates a unique version number for each struct
def make_dcgm_version(struct, ver):
    """
    Combine a struct's size and a version number into one integer.

    :param struct: anything accepted by ctypes.sizeof
    :param ver: integer version; shifted left by 24 bits
    :return: sizeof(struct) OR-ed with (ver << 24)
    """
    struct_size = sizeof(struct)
    shifted_version = ver << 24
    return struct_size | shifted_version
|
|
600
|
+
|
|
601
|
+
|
|
602
|
+
# Function access ##
|
|
603
|
+
# function pointers are cached to prevent unnecessary libLoadLock locking
_dcgmGetFunctionPointer_cache: Dict = {}


def _dcgmGetFunctionPointer(name):
    """
    Return the attribute called name from the loaded DCGM library.

    Results are memoized in _dcgmGetFunctionPointer_cache; a cache hit returns
    without taking libLoadLock.

    :param name: name of the library symbol to look up
    :return: the object obtained with getattr(dcgmLib, name)
    :raises DCGMError: DCGM_ST_UNINITIALIZED when dcgmLib is None,
                       DCGM_ST_FUNCTION_NOT_FOUND when the lookup fails
    """
    global dcgmLib

    if name in _dcgmGetFunctionPointer_cache:
        return _dcgmGetFunctionPointer_cache[name]

    libLoadLock.acquire()
    try:
        # ensure library was loaded
        if dcgmLib is None:
            raise DCGMError(DCGM_ST_UNINITIALIZED)
        try:
            function_pointer = getattr(dcgmLib, name)
        except AttributeError:
            raise DCGMError(DCGM_ST_FUNCTION_NOT_FOUND)
        _dcgmGetFunctionPointer_cache[name] = function_pointer
        return function_pointer
    finally:
        # lock is always freed
        libLoadLock.release()
|
|
627
|
+
|
|
628
|
+
|
|
629
|
+
# C function wrappers ##
|
|
630
|
+
def _LoadDcgmLibrary(libDcgmPath=None):
    """
    Load the library if it isn't loaded already
    :param libDcgmPath: Optional path to the libdcgm*.so libraries. Will use system defaults if not specified.
    :type libDcgmPath: str
    :return: None

    On failure, _dcgmCheckReturn(DCGM_ST_LIBRARY_NOT_FOUND) is invoked.
    NOTE(review): the Windows branch does not consult libDcgmPath.
    """
    global dcgmLib

    # Fast path: the handle already exists, no locking required.
    if dcgmLib is not None:
        return

    # lock to ensure only one caller loads the library
    libLoadLock.acquire()
    try:
        # ensure the library still isn't loaded
        if dcgmLib is not None:
            return

        try:
            if sys.platform[:3] == "win":
                # cdecl calling convention
                # load dcgm.dll from %ProgramFiles%/NVIDIA Corporation/NVSMI/dcgm.dll
                program_files = os.getenv("ProgramFiles", "C:/Program Files")
                dll_path = os.path.join(
                    program_files, "NVIDIA Corporation/NVSMI/dcgm.dll"
                )
                dcgmLib = CDLL(dll_path)
            else:
                if libDcgmPath:
                    lib_file = os.path.join(libDcgmPath, "libdcgm.so.4")
                else:
                    # Try Debian-based distros
                    lib_file = "/usr/lib/{}-linux-gnu/libdcgm.so.4".format(
                        platform.machine()
                    )
                    if not os.path.isfile(lib_file):
                        # Presume Redhat-based distros
                        lib_file = "/usr/lib64/libdcgm.so.4"

                dcgmLib = CDLL(lib_file)
        except OSError:
            _dcgmCheckReturn(DCGM_ST_LIBRARY_NOT_FOUND)

        if dcgmLib is None:
            _dcgmCheckReturn(DCGM_ST_LIBRARY_NOT_FOUND)
    finally:
        # lock is always freed
        libLoadLock.release()
|
|
677
|
+
|
|
678
|
+
|
|
679
|
+
def _dcgmInit(libDcgmPath=None):
    """
    Load the DCGM library (if needed) and bump the library reference count.

    :param libDcgmPath: forwarded unchanged to _LoadDcgmLibrary
    :return: None
    """
    global _dcgmLib_refcount

    _LoadDcgmLibrary(libDcgmPath)

    # Atomically update refcount
    libLoadLock.acquire()
    _dcgmLib_refcount = _dcgmLib_refcount + 1
    libLoadLock.release()
|
|
687
|
+
|
|
688
|
+
|
|
689
|
+
def _dcgmCheckReturn(ret):
    """
    Raise DCGMError(ret) unless ret equals DCGM_ST_OK.

    :param ret: status value returned by a DCGM call
    :return: ret, unchanged, when it equals DCGM_ST_OK
    :raises DCGMError: constructed from ret for any other value
    """
    call_failed = ret != DCGM_ST_OK
    if call_failed:
        raise DCGMError(ret)
    return ret
|
|
693
|
+
|
|
694
|
+
|
|
695
|
+
def _dcgmShutdown():
    """
    Call the library's dcgmShutdown and decrement the library reference count.

    The count is only decremented while it is above zero.

    :return: None
    """
    global _dcgmLib_refcount

    # Leave the library loaded, but shutdown the interface
    shutdown_fn = _dcgmGetFunctionPointer("dcgmShutdown")
    _dcgmCheckReturn(shutdown_fn())

    # Atomically update refcount
    libLoadLock.acquire()
    if _dcgmLib_refcount > 0:
        _dcgmLib_refcount -= 1
    libLoadLock.release()
|
|
708
|
+
|
|
709
|
+
|
|
710
|
+
def _dcgmErrorString(result):
    """
    Call the library's dcgmErrorString for a status value.

    :param result: status value passed to dcgmErrorString
    :return: the call's return value, with restype set to c_char_p
    """
    error_string_fn = _dcgmGetFunctionPointer("dcgmErrorString")
    error_string_fn.restype = c_char_p  # otherwise return is an int
    return error_string_fn(result)
|
|
715
|
+
|
|
716
|
+
|
|
717
|
+
# Represents a link object. type should be one of DCGM_FE_GPU or
# DCGM_FE_SWITCH. id is the gpuId or switchID of the associated gpu or switch.
#
class c_dcgm_link_t(_PrintableStructure):
    """Link object laid out as type (c_uint8), index (c_uint8), id (c_uint16)."""

    # ctypes only honours the attribute spelled `_fields_`; the previous
    # `_fields` spelling left the structure without any declared fields.
    _fields_ = [
        ("type", c_uint8),
        ("index", c_uint8),
        ("id", c_uint16),
    ]
|
|
722
|
+
|
|
723
|
+
|
|
724
|
+
class c_dcgmConnectV2Params_v1(_PrintableStructure):
    """Version 1 layout of the ConnectV2 parameters."""

    _fields_ = [
        ("version", c_uint),
        ("persistAfterDisconnect", c_uint),
    ]


c_dcgmConnectV2Params_version1 = make_dcgm_version(
    c_dcgmConnectV2Params_v1,
    1,
)
|
|
729
|
+
|
|
730
|
+
|
|
731
|
+
class c_dcgmConnectV2Params_v2(_PrintableStructure):
    """Version 2 layout of the ConnectV2 parameters; all four fields are c_uint."""

    _fields_ = [
        (field_name, c_uint)
        for field_name in (
            "version",
            "persistAfterDisconnect",
            "timeoutMs",
            "addressIsUnixSocket",
        )
    ]


c_dcgmConnectV2Params_version2 = make_dcgm_version(
    c_dcgmConnectV2Params_v2,
    2,
)
# Unversioned alias: points at version 2.
c_dcgmConnectV2Params_version = c_dcgmConnectV2Params_version2
|
|
742
|
+
|
|
743
|
+
|
|
744
|
+
class c_dcgmHostengineHealth_v1(_PrintableStructure):
    """Version 1 layout of the hostengine health: version and overallHealth."""

    _fields_ = [("version", c_uint), ("overallHealth", c_uint)]


dcgmHostengineHealth_version1 = make_dcgm_version(
    c_dcgmHostengineHealth_v1,
    1,
)
# Unversioned alias: points at version 1.
dcgmHostengineHealth_version = dcgmHostengineHealth_version1
|
|
753
|
+
|
|
754
|
+
|
|
755
|
+
class c_dcgmClockSet_v1(_PrintableStructure):
    """Represents memory and proc clocks for a device."""

    _fields_ = [
        ("version", c_uint),
        # Memory Clock
        ("memClock", c_uint),
        # SM Clock
        ("smClock", c_uint),
    ]
|
|
762
|
+
|
|
763
|
+
|
|
764
|
+
class c_dcgmGroupEntityPair_t(_PrintableStructure):
    """
    Represents an entityGroupId + entityId pair to uniquely identify a given
    entityId inside a group of entities.

    Added in DCGM 1.5.0
    """

    _fields_ = [
        # Entity Group ID entity belongs to
        ("entityGroupId", c_uint32),
        # Entity ID of the entity
        ("entityId", c_uint32),
    ]
|
|
772
|
+
|
|
773
|
+
|
|
774
|
+
class c_dcgmGroupInfo_v2(_PrintableStructure):
    """
    Structure to store information for DCGM group (v2).

    Added in DCGM 1.5.0
    """

    _fields_ = [
        ("version", c_uint),
        ("count", c_uint),
        ("groupName", c_char * DCGM_MAX_STR_LENGTH),
        ("entityList", c_dcgmGroupEntityPair_t * DCGM_GROUP_MAX_ENTITIES),
    ]


c_dcgmGroupInfo_version2 = make_dcgm_version(
    c_dcgmGroupInfo_v2,
    2,
)
|
|
788
|
+
|
|
789
|
+
# MIG profile identifiers. Values 0-10 are GPU-instance profiles and 30-37 are
# compute-instance profiles; note that the numeric order does not follow the
# slice number (e.g. slice 7 is 5 while slice 6 is 7).

# No profile (for GPUs)
DcgmiMigProfileNone = 0
# Alias that follows the DcgmMigProfile* prefix of every other profile name;
# the original "Dcgmi" spelling above is kept for existing callers.
DcgmMigProfileNone = DcgmiMigProfileNone
DcgmMigProfileGpuInstanceSlice1 = 1  # GPU instance slice 1
DcgmMigProfileGpuInstanceSlice2 = 2  # GPU instance slice 2
DcgmMigProfileGpuInstanceSlice3 = 3  # GPU instance slice 3
DcgmMigProfileGpuInstanceSlice4 = 4  # GPU instance slice 4
DcgmMigProfileGpuInstanceSlice7 = 5  # GPU instance slice 7
DcgmMigProfileGpuInstanceSlice8 = 6  # GPU instance slice 8
DcgmMigProfileGpuInstanceSlice6 = 7  # GPU instance slice 6
DcgmMigProfileGpuInstanceSlice1Rev1 = 8  # GPU instance slice 1 revision 1
DcgmMigProfileGpuInstanceSlice2Rev1 = 9  # GPU instance slice 2 revision 1
DcgmMigProfileGpuInstanceSlice1Rev2 = 10  # GPU instance slice 1 revision 2
DcgmMigProfileComputeInstanceSlice1 = 30  # compute instance slice 1
DcgmMigProfileComputeInstanceSlice2 = 31  # compute instance slice 2
DcgmMigProfileComputeInstanceSlice3 = 32  # compute instance slice 3
DcgmMigProfileComputeInstanceSlice4 = 33  # compute instance slice 4
DcgmMigProfileComputeInstanceSlice7 = 34  # compute instance slice 7
DcgmMigProfileComputeInstanceSlice8 = 35  # compute instance slice 8
DcgmMigProfileComputeInstanceSlice6 = 36  # compute instance slice 6
DcgmMigProfileComputeInstanceSlice1Rev1 = 37  # compute instance slice 1 revision 1
|
|
808
|
+
|
|
809
|
+
|
|
810
|
+
class c_dcgmMigHierarchyInfo_t(_PrintableStructure):
    """
    Represents a pair of entity pairings to uniquely identify an entity and
    its place in the hierarchy.
    """

    _fields_ = [
        ("entity", c_dcgmGroupEntityPair_t),
        ("parent", c_dcgmGroupEntityPair_t),
        ("sliceProfile", c_uint),
    ]
|
|
819
|
+
|
|
820
|
+
|
|
821
|
+
class c_dcgmMigEntityInfo_t(_PrintableStructure):
    """Per-entity MIG information: GPU UUID plus NVML indices and profile data."""

    _fields_ = [
        # GPU UUID
        ("gpuUuid", c_char * 128),
        # GPU index from NVML
        ("nvmlGpuIndex", c_uint),
        # GPU instance index within GPU
        ("nvmlInstanceId", c_uint),
        # GPU Compute instance index within GPU instance
        ("nvmlComputeInstanceId", c_uint),
        # Unique profile ID for GPU or Compute instances
        ("nvmlMigProfileId", c_uint),
        # Number of slices in the MIG profile
        ("nvmlProfileSlices", c_uint),
    ]
|
|
833
|
+
|
|
834
|
+
|
|
835
|
+
class c_dcgmMigHierarchyInfo_v2(_PrintableStructure):
    """Hierarchy entry (v2): an entity pair, its parent pair and the entity info."""

    _fields_ = [
        ("entity", c_dcgmGroupEntityPair_t),
        ("parent", c_dcgmGroupEntityPair_t),
        ("info", c_dcgmMigEntityInfo_t),
    ]
|
|
841
|
+
|
|
842
|
+
|
|
843
|
+
DCGM_MAX_INSTANCES_PER_GPU = 8

# A compute instance is part of an instance, so there can never be more
# compute instances per GPU than instances per GPU.
DCGM_MAX_COMPUTE_INSTANCES_PER_GPU = DCGM_MAX_INSTANCES_PER_GPU

# Currently, there cannot be more than 14 instances + compute instances. There
# are always 7 compute instances and never more than 7 instances.
DCGM_MAX_TOTAL_INSTANCES = 14

DCGM_MAX_HIERARCHY_INFO = DCGM_MAX_NUM_DEVICES * DCGM_MAX_TOTAL_INSTANCES
DCGM_MAX_INSTANCES = DCGM_MAX_NUM_DEVICES * DCGM_MAX_INSTANCES_PER_GPU

# Each compute instance is part of an instance, so the maximum number of
# compute instances is always the same as the maximum number of instances.
DCGM_MAX_COMPUTE_INSTANCES = DCGM_MAX_INSTANCES

# Ask the hostengine to wait to process reconfiguring the GPUs
DCGM_MIG_RECONFIG_DELAY_PROCESSING = 0x1
|
|
859
|
+
|
|
860
|
+
|
|
861
|
+
class c_dcgmMigHierarchy_v2(_PrintableStructure):
    """MIG hierarchy (v2): version, count and an array of hierarchy entries."""

    _fields_ = [
        ("version", c_uint),
        ("count", c_uint),
        # Array sized by DCGM_MAX_HIERARCHY_INFO.
        ("entityList", c_dcgmMigHierarchyInfo_v2 * DCGM_MAX_HIERARCHY_INFO),
    ]


c_dcgmMigHierarchy_version2 = make_dcgm_version(
    c_dcgmMigHierarchy_v2,
    2,
)
|
|
870
|
+
|
|
871
|
+
|
|
872
|
+
class c_dcgmDeleteMigEntity_v1(_PrintableStructure):
    """Version 1 layout of the delete-MIG-entity request."""

    _fields_ = [
        ("version", c_uint),
        ("entityGroupId", c_uint32),
        ("entityId", c_uint32),
        ("flags", c_uint),
    ]


c_dcgmDeleteMigEntity_version1 = make_dcgm_version(
    c_dcgmDeleteMigEntity_v1,
    1,
)
|
|
882
|
+
|
|
883
|
+
# Enum values for the kinds of MIG creations.

# Create a GPU instance
DcgmMigCreateGpuInstance = 0
# Create a compute instance
DcgmMigCreateComputeInstance = 1
|
|
888
|
+
|
|
889
|
+
|
|
890
|
+
class c_dcgmCreateMigEntity_v1(_PrintableStructure):
    """Version 1 layout of the create-MIG-entity request."""

    _fields_ = [
        ("version", c_uint),
        ("parentId", c_uint32),
        ("profile", c_uint32),
        ("createOption", c_uint32),
        ("flags", c_uint),
    ]


c_dcgmCreateMigEntity_version1 = make_dcgm_version(
    c_dcgmCreateMigEntity_v1,
    1,
)
|
|
901
|
+
|
|
902
|
+
|
|
903
|
+
class c_dcgmErrorInfo_v1(_PrintableStructure):
    """Structure to represent error attributes."""

    _fields_ = [
        ("gpuId", c_uint),
        ("fieldId", c_ushort),
        ("status", c_int),
    ]
|
|
908
|
+
|
|
909
|
+
|
|
910
|
+
class c_dcgmDeviceSupportedClockSets_v1(_PrintableStructure):
    """Represents list of supported clocks for a device."""

    _fields_ = [
        ("version", c_uint),
        ("count", c_uint),
        # Array sized by DCGM_MAX_CLOCKS.
        ("clockSet", c_dcgmClockSet_v1 * DCGM_MAX_CLOCKS),
    ]
|
|
919
|
+
|
|
920
|
+
|
|
921
|
+
class c_dcgmDevicePidAccountingStats_v1(_PrintableStructure):
    """Represents accounting information for a device and pid."""

    # Four 32-bit fields followed by three 64-bit fields, in this order.
    _fields_ = [
        (field_name, c_uint32)
        for field_name in (
            "version",
            "pid",
            "gpuUtilization",
            "memoryUtilization",
        )
    ] + [
        (field_name, c_uint64)
        for field_name in (
            "maxMemoryUsage",
            "startTimestamp",
            "activeTimeUsec",
        )
    ]
|
|
934
|
+
|
|
935
|
+
|
|
936
|
+
class c_dcgmDeviceThermals_v1(_PrintableStructure):
    """Represents thermal information."""

    _fields_ = [
        ("version", c_uint),
        ("slowdownTemp", c_uint),
        ("shutdownTemp", c_uint),
    ]
|
|
941
|
+
|
|
942
|
+
|
|
943
|
+
class c_dcgmDevicePowerLimits_v1(_PrintableStructure):
    """Represents various power limits; every field is a c_uint."""

    _fields_ = [
        (field_name, c_uint)
        for field_name in (
            "version",
            "curPowerLimit",
            "defaultPowerLimit",
            "enforcedPowerLimit",
            "minPowerLimit",
            "maxPowerLimit",
        )
    ]
|
|
955
|
+
|
|
956
|
+
|
|
957
|
+
class c_dcgmDeviceIdentifiers_v1(_PrintableStructure):
    """
    Represents device identifiers.

    String members are fixed-size c_char arrays of DCGM_MAX_STR_LENGTH bytes.
    """

    _fields_ = [
        ("version", c_uint),
        # Fixed-size strings
        ("brandName", c_char * DCGM_MAX_STR_LENGTH),
        ("deviceName", c_char * DCGM_MAX_STR_LENGTH),
        ("pciBusId", c_char * DCGM_MAX_STR_LENGTH),
        ("serial", c_char * DCGM_MAX_STR_LENGTH),
        ("uuid", c_char * DCGM_MAX_STR_LENGTH),
        ("vbios", c_char * DCGM_MAX_STR_LENGTH),
        ("inforomImageVersion", c_char * DCGM_MAX_STR_LENGTH),
        # 32-bit PCI ids
        ("pciDeviceId", c_uint32),
        ("pciSubSystemId", c_uint32),
        ("driverVersion", c_char * DCGM_MAX_STR_LENGTH),
        ("virtualizationMode", c_uint32),
    ]
|
|
975
|
+
|
|
976
|
+
|
|
977
|
+
class c_dcgmDeviceMemoryUsage_v1(_PrintableStructure):
    """Represents memory utilization; every field is a c_uint."""

    _fields_ = [
        (field_name, c_uint)
        for field_name in (
            "version",
            "bar1Total",
            "fbTotal",
            "fbUsed",
            "fbFree",
        )
    ]
|
|
988
|
+
|
|
989
|
+
|
|
990
|
+
class c_dcgmDeviceVgpuUtilInfo_v1(_PrintableStructure):
    """Represents utilization values of vGPUs running on the device."""

    _fields_ = [
        (field_name, c_uint)
        for field_name in (
            "version",
            "vgpuId",
            "smUtil",
            "memUtil",
            "encUtil",
            "decUtil",
        )
    ]
|
|
1002
|
+
|
|
1003
|
+
|
|
1004
|
+
class c_dcgmDeviceVgpuProcessUtilInfo_v1(_PrintableStructure):
    """Utilization values for processes running within vGPU VMs using the device."""

    _fields_ = [
        ("version", c_uint),
        ("vgpuId", c_uint),
        ("pid", c_uint),
        # Fixed-size string of DCGM_VGPU_NAME_BUFFER_SIZE bytes.
        ("processName", c_char * DCGM_VGPU_NAME_BUFFER_SIZE),
        ("smUtil", c_uint),
        ("memUtil", c_uint),
        ("encUtil", c_uint),
        ("decUtil", c_uint),
    ]
|
|
1018
|
+
|
|
1019
|
+
|
|
1020
|
+
class c_dcgmDeviceEncStats_v1(_PrintableStructure):
    """Represents current encoder statistics for the given device/vGPU instance."""

    _fields_ = [
        (field_name, c_uint)
        for field_name in (
            "version",
            "sessionCount",
            "averageFps",
            "averageLatency",
        )
    ]
|
|
1030
|
+
|
|
1031
|
+
|
|
1032
|
+
class c_dcgmDeviceVgpuEncSessions_v1(_PrintableStructure):
    """
    Represents information about active encoder sessions on the given vGPU
    instance.
    """

    _fields_ = [
        (field_name, c_uint)
        for field_name in (
            "version",
            "vgpuId",
            "sessionId",
            "pid",
            "codecType",
            "hResolution",
            "vResolution",
            "averageFps",
            "averageLatency",
        )
    ]
|
|
1047
|
+
|
|
1048
|
+
|
|
1049
|
+
class c_dcgmDeviceFbcStats_v1(_PrintableStructure):
    """
    Represents current frame buffer capture sessions statistics for the given
    device/vGPU instance.
    """

    _fields_ = [
        (field_name, c_uint)
        for field_name in (
            "version",
            "sessionCount",
            "averageFps",
            "averageLatency",
        )
    ]
|
|
1059
|
+
|
|
1060
|
+
|
|
1061
|
+
class c_dcgmDeviceFbcSessionInfo_t(_PrintableStructure):
    """
    Represents information about active FBC session on the given device/vGPU
    instance.
    """

    _fields_ = [
        (field_name, c_uint)
        for field_name in (
            "version",
            "sessionId",
            "pid",
            "vgpuId",
            "displayOrdinal",
            "sessionType",
            "sessionFlags",
            "hMaxResolution",
            "vMaxResolution",
            "hResolution",
            "vResolution",
            "averageFps",
            "averageLatency",
        )
    ]
|
|
1080
|
+
|
|
1081
|
+
|
|
1082
|
+
class c_dcgmDeviceFbcSessions_v1(_PrintableStructure):
    """Represents all the active FBC sessions on the given device/vGPU instance."""

    _fields_ = [
        ("version", c_uint),
        ("sessionCount", c_uint),
        # Array sized by DCGM_MAX_FBC_SESSIONS.
        ("sessionInfo", c_dcgmDeviceFbcSessionInfo_t * DCGM_MAX_FBC_SESSIONS),
    ]
|
|
1091
|
+
|
|
1092
|
+
|
|
1093
|
+
class c_dcgmDeviceVgpuTypeInfo_v1(_PrintableStructure):
    """Represents static info related to vGPU types supported on the device."""

    _fields_ = [
        ("version", c_uint),
        ("vgpuTypeId", c_uint),
        # Fixed-size strings
        ("vgpuTypeName", c_char * DCGM_VGPU_NAME_BUFFER_SIZE),
        ("vgpuTypeClass", c_char * DCGM_VGPU_NAME_BUFFER_SIZE),
        ("vgpuTypeLicense", c_char * DCGM_GRID_LICENSE_BUFFER_SIZE),
        # Remaining members are all c_uint
        ("deviceId", c_uint),
        ("subsystemId", c_uint),
        ("numDisplayHeads", c_uint),
        ("maxInstances", c_uint),
        ("frameRateLimit", c_uint),
        ("maxResolutionX", c_uint),
        ("maxResolutionY", c_uint),
        ("fbTotal", c_uint),
    ]
|
|
1112
|
+
|
|
1113
|
+
|
|
1114
|
+
class c_dcgmDeviceVgpuTypeInfo_v2(_PrintableStructure):
    """
    Version 2 of the vGPU type info: the same members as
    c_dcgmDeviceVgpuTypeInfo_v1 followed by gpuInstanceProfileId.
    """

    _fields_ = [
        ("version", c_uint),
        ("vgpuTypeId", c_uint),
        # Fixed-size strings
        ("vgpuTypeName", c_char * DCGM_VGPU_NAME_BUFFER_SIZE),
        ("vgpuTypeClass", c_char * DCGM_VGPU_NAME_BUFFER_SIZE),
        ("vgpuTypeLicense", c_char * DCGM_GRID_LICENSE_BUFFER_SIZE),
        # Remaining members are all c_uint
        ("deviceId", c_uint),
        ("subsystemId", c_uint),
        ("numDisplayHeads", c_uint),
        ("maxInstances", c_uint),
        ("frameRateLimit", c_uint),
        ("maxResolutionX", c_uint),
        ("maxResolutionY", c_uint),
        ("fbTotal", c_uint),
        ("gpuInstanceProfileId", c_uint),
    ]


dcgmDeviceVgpuTypeInfo_version2 = make_dcgm_version(
    c_dcgmDeviceVgpuTypeInfo_v2,
    2,
)
|
|
1134
|
+
|
|
1135
|
+
|
|
1136
|
+
class c_dcgmDeviceSettings_v2(_PrintableStructure):
    """Version 2 layout of the device settings; every field is a c_uint."""

    _fields_ = [
        (field_name, c_uint)
        for field_name in (
            "version",
            "persistenceModeEnabled",
            "migModeEnabled",
            "confidentialComputeMode",
        )
    ]
|
|
1143
|
+
|
|
1144
|
+
|
|
1145
|
+
class c_dcgmDeviceAttributes_deprecated_v1(_PrintableStructure):
    """Represents attributes corresponding to a device (deprecated v1 layout)."""

    _fields_ = [
        ("version", c_uint),
        # Nested structures
        ("clockSets", c_dcgmDeviceSupportedClockSets_v1),
        ("thermalSettings", c_dcgmDeviceThermals_v1),
        ("powerLimits", c_dcgmDevicePowerLimits_v1),
        ("identifiers", c_dcgmDeviceIdentifiers_v1),
        ("memoryUsage", c_dcgmDeviceMemoryUsage_v1),
        # 208 bytes declared under the name "unused"
        ("unused", c_char * 208),
    ]


dcgmDeviceAttributes_deprecated_version1 = make_dcgm_version(c_dcgmDeviceAttributes_deprecated_v1, 1)
|
|
1163
|
+
|
|
1164
|
+
|
|
1165
|
+
class c_dcgmDeviceAttributes_v3(_PrintableStructure):
    """Represents attributes corresponding to a device (v3 layout)."""

    _fields_ = [
        ("version", c_uint),
        # Nested structures
        ("clockSets", c_dcgmDeviceSupportedClockSets_v1),
        ("thermalSettings", c_dcgmDeviceThermals_v1),
        ("powerLimits", c_dcgmDevicePowerLimits_v1),
        ("identifiers", c_dcgmDeviceIdentifiers_v1),
        ("memoryUsage", c_dcgmDeviceMemoryUsage_v1),
        ("settings", c_dcgmDeviceSettings_v2),
    ]


dcgmDeviceAttributes_version3 = make_dcgm_version(
    c_dcgmDeviceAttributes_v3,
    3,
)
|
|
1181
|
+
|
|
1182
|
+
|
|
1183
|
+
class c_dcgmDeviceMigAttributesInfo_v1(_PrintableStructure):
    """Represents attributes info for a MIG device."""

    # Eleven c_uint members followed by one 64-bit member.
    _fields_ = [
        (field_name, c_uint)
        for field_name in (
            "version",
            "gpuInstanceId",
            "computeInstanceId",
            "multiprocessorCount",
            "sharedCopyEngineCount",
            "sharedDecoderCount",
            "sharedEncoderCount",
            "sharedJpegCount",
            "sharedOfaCount",
            "gpuInstanceSliceCount",
            "computeInstanceSliceCount",
        )
    ] + [("memorySizeMB", c_uint64)]


dcgmDeviceMigAttributesInfo_version1 = make_dcgm_version(c_dcgmDeviceMigAttributesInfo_v1, 1)
|
|
1206
|
+
|
|
1207
|
+
|
|
1208
|
+
class c_dcgmDeviceMigAttributes_v1(_PrintableStructure):
    """Represents attributes for a MIG device."""

    _fields_ = [
        ("version", c_uint),
        ("migDevicesCount", c_uint),
        ("migAttributesInfo", c_dcgmDeviceMigAttributesInfo_v1),
    ]


dcgmDeviceMigAttributes_version1 = make_dcgm_version(
    c_dcgmDeviceMigAttributes_v1,
    1,
)
|
|
1220
|
+
|
|
1221
|
+
|
|
1222
|
+
class c_dcgmGpuInstanceProfileInfo_v1(_PrintableStructure):
    """Represents GPU instance profile information."""

    # Eleven c_uint members followed by one 64-bit member.
    _fields_ = [
        (field_name, c_uint)
        for field_name in (
            "version",
            "id",
            "isP2pSupported",
            "sliceCount",
            "instanceCount",
            "multiprocessorCount",
            "copyEngineCount",
            "decoderCount",
            "encoderCount",
            "jpegCount",
            "ofaCount",
        )
    ] + [("memorySizeMB", c_uint64)]


dcgmGpuInstanceProfileInfo_version1 = make_dcgm_version(c_dcgmGpuInstanceProfileInfo_v1, 1)
|
|
1245
|
+
|
|
1246
|
+
|
|
1247
|
+
class c_dcgmGpuInstanceProfiles_v1(_PrintableStructure):
    """Represents GPU instance profiles."""

    _fields_ = [
        ("version", c_uint),
        ("profileCount", c_uint),
        ("profileInfo", c_dcgmGpuInstanceProfileInfo_v1),
    ]


dcgmGpuInstanceProfiles_version1 = make_dcgm_version(
    c_dcgmGpuInstanceProfiles_v1,
    1,
)
|
|
1259
|
+
|
|
1260
|
+
|
|
1261
|
+
class c_dcgmComputeInstanceProfileInfo_v1(_PrintableStructure):
    """Represents Compute instance profile information; every field is a c_uint."""

    _fields_ = [
        (field_name, c_uint)
        for field_name in (
            "version",
            "gpuInstanceId",
            "id",
            "sliceCount",
            "instanceCount",
            "multiprocessorCount",
            "sharedCopyEngineCount",
            "sharedDecoderCount",
            "sharedEncoderCount",
            "sharedJpegCount",
            "sharedOfaCount",
        )
    ]


dcgmComputeInstanceProfileInfo_version1 = make_dcgm_version(c_dcgmComputeInstanceProfileInfo_v1, 1)
|
|
1283
|
+
|
|
1284
|
+
|
|
1285
|
+
# /**
|
|
1286
|
+
# * Represents Compute instance profiles
|
|
1287
|
+
# */
|
|
1288
|
+
class c_dcgmComputeInstanceProfiles_v1(_PrintableStructure):
|
|
1289
|
+
_fields_ = [
|
|
1290
|
+
("version", c_uint),
|
|
1291
|
+
("profileCount", c_uint),
|
|
1292
|
+
("profileInfo", c_dcgmComputeInstanceProfileInfo_v1),
|
|
1293
|
+
]
|
|
1294
|
+
|
|
1295
|
+
|
|
1296
|
+
dcgmComputeInstanceProfiles_version1 = make_dcgm_version(
|
|
1297
|
+
c_dcgmComputeInstanceProfiles_v1, 1
|
|
1298
|
+
)
|
|
1299
|
+
|
|
1300
|
+
|
|
1301
|
+
# /**
|
|
1302
|
+
# * Represents vGPU attributes corresponding to a device
|
|
1303
|
+
# */
|
|
1304
|
+
class c_dcgmVgpuDeviceAttributes_v6(_PrintableStructure):
|
|
1305
|
+
_fields_ = [
|
|
1306
|
+
("version", c_uint),
|
|
1307
|
+
("activeVgpuInstanceCount", c_uint),
|
|
1308
|
+
("activeVgpuInstanceIds", c_uint * DCGM_MAX_VGPU_INSTANCES_PER_PGPU),
|
|
1309
|
+
("creatableVgpuTypeCount", c_uint),
|
|
1310
|
+
("creatableVgpuTypeIds", c_uint * DCGM_MAX_VGPU_TYPES_PER_PGPU),
|
|
1311
|
+
("supportedVgpuTypeCount", c_uint),
|
|
1312
|
+
(
|
|
1313
|
+
"supportedVgpuTypeInfo",
|
|
1314
|
+
c_dcgmDeviceVgpuTypeInfo_v1 * DCGM_MAX_VGPU_TYPES_PER_PGPU,
|
|
1315
|
+
),
|
|
1316
|
+
("vgpuUtilInfo", c_dcgmDeviceVgpuUtilInfo_v1 * DCGM_MAX_VGPU_TYPES_PER_PGPU),
|
|
1317
|
+
("gpuUtil", c_uint),
|
|
1318
|
+
("memCopyUtil", c_uint),
|
|
1319
|
+
("encUtil", c_uint),
|
|
1320
|
+
("decUtil", c_uint),
|
|
1321
|
+
]
|
|
1322
|
+
|
|
1323
|
+
|
|
1324
|
+
dcgmVgpuDeviceAttributes_version6 = make_dcgm_version(c_dcgmVgpuDeviceAttributes_v6, 1)
|
|
1325
|
+
|
|
1326
|
+
|
|
1327
|
+
class c_dcgmVgpuDeviceAttributes_v7(_PrintableStructure):
|
|
1328
|
+
_fields_ = [
|
|
1329
|
+
("version", c_uint),
|
|
1330
|
+
("activeVgpuInstanceCount", c_uint),
|
|
1331
|
+
("activeVgpuInstanceIds", c_uint * DCGM_MAX_VGPU_INSTANCES_PER_PGPU),
|
|
1332
|
+
("creatableVgpuTypeCount", c_uint),
|
|
1333
|
+
("creatableVgpuTypeIds", c_uint * DCGM_MAX_VGPU_TYPES_PER_PGPU),
|
|
1334
|
+
("supportedVgpuTypeCount", c_uint),
|
|
1335
|
+
(
|
|
1336
|
+
"supportedVgpuTypeInfo",
|
|
1337
|
+
c_dcgmDeviceVgpuTypeInfo_v2 * DCGM_MAX_VGPU_TYPES_PER_PGPU,
|
|
1338
|
+
),
|
|
1339
|
+
("vgpuUtilInfo", c_dcgmDeviceVgpuUtilInfo_v1 * DCGM_MAX_VGPU_TYPES_PER_PGPU),
|
|
1340
|
+
("gpuUtil", c_uint),
|
|
1341
|
+
("memCopyUtil", c_uint),
|
|
1342
|
+
("encUtil", c_uint),
|
|
1343
|
+
("decUtil", c_uint),
|
|
1344
|
+
]
|
|
1345
|
+
|
|
1346
|
+
|
|
1347
|
+
dcgmVgpuDeviceAttributes_version7 = make_dcgm_version(c_dcgmVgpuDeviceAttributes_v7, 7)
|
|
1348
|
+
|
|
1349
|
+
|
|
1350
|
+
# /**
|
|
1351
|
+
# * Represents attributes specific to vGPU instance
|
|
1352
|
+
# */
|
|
1353
|
+
class c_dcgmVgpuInstanceAttributes_v1(_PrintableStructure):
|
|
1354
|
+
_fields_ = [
|
|
1355
|
+
("version", c_uint),
|
|
1356
|
+
("vmId", c_char * DCGM_DEVICE_UUID_BUFFER_SIZE),
|
|
1357
|
+
("vmName", c_char * DCGM_DEVICE_UUID_BUFFER_SIZE),
|
|
1358
|
+
("vgpuTypeId", c_uint),
|
|
1359
|
+
("vgpuUuid", c_char * DCGM_DEVICE_UUID_BUFFER_SIZE),
|
|
1360
|
+
("vgpuDriverVersion", c_char * DCGM_DEVICE_UUID_BUFFER_SIZE),
|
|
1361
|
+
("fbUsage", c_uint),
|
|
1362
|
+
("licenseStatus", c_uint),
|
|
1363
|
+
("frameRateLimit", c_uint),
|
|
1364
|
+
]
|
|
1365
|
+
|
|
1366
|
+
|
|
1367
|
+
dcgmVgpuInstanceAttributes_version1 = make_dcgm_version(
|
|
1368
|
+
c_dcgmVgpuInstanceAttributes_v1, 1
|
|
1369
|
+
)
|
|
1370
|
+
|
|
1371
|
+
|
|
1372
|
+
class c_dcgmConfigPowerLimit(_PrintableStructure):
|
|
1373
|
+
_fields_ = [("type", c_uint), ("val", c_uint)]
|
|
1374
|
+
|
|
1375
|
+
|
|
1376
|
+
class c_dcgmConfigPerfStateSettings_t(_PrintableStructure):
|
|
1377
|
+
_fields_ = [
|
|
1378
|
+
("syncBoost", c_uint),
|
|
1379
|
+
("targetClocks", c_dcgmClockSet_v1),
|
|
1380
|
+
]
|
|
1381
|
+
|
|
1382
|
+
|
|
1383
|
+
# Structure to represent default configuration for a device
|
|
1384
|
+
class c_dcgmDeviceConfig_v1(_PrintableStructure):
|
|
1385
|
+
_fields_ = [
|
|
1386
|
+
# version must always be first
|
|
1387
|
+
("version", c_uint),
|
|
1388
|
+
("gpuId", c_uint),
|
|
1389
|
+
("mEccMode", c_uint),
|
|
1390
|
+
("mComputeMode", c_uint),
|
|
1391
|
+
("mPerfState", c_dcgmConfigPerfStateSettings_t),
|
|
1392
|
+
("mPowerLimit", c_dcgmConfigPowerLimit),
|
|
1393
|
+
]
|
|
1394
|
+
|
|
1395
|
+
|
|
1396
|
+
dcgmDeviceConfig_version1 = make_dcgm_version(c_dcgmDeviceConfig_v1, 1)
|
|
1397
|
+
|
|
1398
|
+
|
|
1399
|
+
# Structure to represent default vGPU configuration for a device
|
|
1400
|
+
class c_dcgmDeviceVgpuConfig_v1(_PrintableStructure):
    """ctypes layout describing the default vGPU configuration for a device."""

    _fields_ = [
        # version must always be first
        ("version", c_uint),
        ("gpuId", c_uint),
        ("mEccMode", c_uint),
        ("mComputeMode", c_uint),
        ("mPerfState", c_dcgmConfigPerfStateSettings_t),
        ("mPowerLimit", c_dcgmConfigPowerLimit),
    ]

    def SetBlank(self):
        """Blank out the configurable members of this structure.

        version and gpuId are deliberately left untouched. Every other value
        is set to dcgmvalue.DCGM_INT32_BLANK, except the power-limit type,
        which is set to DCGM_CONFIG_POWER_CAP_INDIVIDUAL.
        """
        blank = dcgmvalue.DCGM_INT32_BLANK

        # Scalar members of the top-level structure.
        self.mEccMode = blank
        self.mComputeMode = blank

        # Nested performance-state settings and their target clocks.
        perf_state = self.mPerfState
        perf_state.syncBoost = blank
        target_clocks = perf_state.targetClocks
        target_clocks.memClock = blank
        target_clocks.smClock = blank

        # Nested power limit: fixed type, blank value.
        power_limit = self.mPowerLimit
        power_limit.type = DCGM_CONFIG_POWER_CAP_INDIVIDUAL
        power_limit.val = blank
|
|
1420
|
+
|
|
1421
|
+
|
|
1422
|
+
dcgmDeviceVgpuConfig_version1 = make_dcgm_version(c_dcgmDeviceVgpuConfig_v1, 1)
|
|
1423
|
+
|
|
1424
|
+
|
|
1425
|
+
# Structure to receive update on the list of metrics.
|
|
1426
|
+
class c_dcgmPolicyUpdate_v1(_PrintableStructure):
|
|
1427
|
+
_fields_ = [
|
|
1428
|
+
# version must always be first
|
|
1429
|
+
("version", c_uint),
|
|
1430
|
+
("power", c_uint),
|
|
1431
|
+
]
|
|
1432
|
+
|
|
1433
|
+
|
|
1434
|
+
dcgmPolicyUpdate_version1 = make_dcgm_version(c_dcgmPolicyUpdate_v1, 1)
|
|
1435
|
+
|
|
1436
|
+
# Represents a Callback to receive power updates from the host engine
|
|
1437
|
+
_dcgmRecvUpdates_t = c_void_p
|
|
1438
|
+
|
|
1439
|
+
|
|
1440
|
+
# Define the structure that contains specific policy information
|
|
1441
|
+
class c_dcgmPolicyViolation_v1(_PrintableStructure):
|
|
1442
|
+
_fields_ = [
|
|
1443
|
+
# version must always be first
|
|
1444
|
+
("version", c_uint),
|
|
1445
|
+
("notifyOnEccDbe", c_uint),
|
|
1446
|
+
("notifyOnPciEvent", c_uint),
|
|
1447
|
+
("notifyOnMaxRetiredPages", c_uint),
|
|
1448
|
+
]
|
|
1449
|
+
|
|
1450
|
+
|
|
1451
|
+
dcgmPolicyViolation_version1 = make_dcgm_version(c_dcgmPolicyViolation_v1, 1)
|
|
1452
|
+
|
|
1453
|
+
|
|
1454
|
+
class c_dcgmWatchFieldValue_v1(_PrintableStructure):
|
|
1455
|
+
_fields_: List = []
|
|
1456
|
+
|
|
1457
|
+
|
|
1458
|
+
dcgmWatchFieldValue_version1 = make_dcgm_version(c_dcgmWatchFieldValue_v1, 1)
|
|
1459
|
+
|
|
1460
|
+
|
|
1461
|
+
class c_dcgmUnwatchFieldValue_v1(_PrintableStructure):
|
|
1462
|
+
_fields_: List = []
|
|
1463
|
+
|
|
1464
|
+
|
|
1465
|
+
dcgmUnwatchFieldValue_version1 = make_dcgm_version(c_dcgmUnwatchFieldValue_v1, 1)
|
|
1466
|
+
|
|
1467
|
+
|
|
1468
|
+
class c_dcgmUpdateAllFields_v1(_PrintableStructure):
|
|
1469
|
+
_fields_: List = []
|
|
1470
|
+
|
|
1471
|
+
|
|
1472
|
+
dcgmUpdateAllFields_version1 = make_dcgm_version(c_dcgmUpdateAllFields_v1, 1)
|
|
1473
|
+
|
|
1474
|
+
dcgmGetMultipleValuesForFieldResponse_version1 = 1
|
|
1475
|
+
|
|
1476
|
+
# policy enums (and table indices)
|
|
1477
|
+
DCGM_POLICY_COND_IDX_DBE = 0
|
|
1478
|
+
DCGM_POLICY_COND_IDX_PCI = 1
|
|
1479
|
+
DCGM_POLICY_COND_IDX_MAX_PAGES_RETIRED = 2
|
|
1480
|
+
DCGM_POLICY_COND_IDX_THERMAL = 3
|
|
1481
|
+
DCGM_POLICY_COND_IDX_POWER = 4
|
|
1482
|
+
DCGM_POLICY_COND_IDX_NVLINK = 5
|
|
1483
|
+
DCGM_POLICY_COND_IDX_XID = 6
|
|
1484
|
+
DCGM_POLICY_COND_IDX_MAX = 7
|
|
1485
|
+
|
|
1486
|
+
# policy enum bitmasks
|
|
1487
|
+
DCGM_POLICY_COND_DBE = 0x1
|
|
1488
|
+
DCGM_POLICY_COND_PCI = 0x2
|
|
1489
|
+
DCGM_POLICY_COND_MAX_PAGES_RETIRED = 0x4
|
|
1490
|
+
DCGM_POLICY_COND_THERMAL = 0x8
|
|
1491
|
+
DCGM_POLICY_COND_POWER = 0x10
|
|
1492
|
+
DCGM_POLICY_COND_NVLINK = 0x20
|
|
1493
|
+
DCGM_POLICY_COND_XID = 0x40
|
|
1494
|
+
DCGM_POLICY_COND_MAX = 7
|
|
1495
|
+
|
|
1496
|
+
DCGM_POLICY_MODE_AUTOMATED = 0
|
|
1497
|
+
DCGM_POLICY_MODE_MANUAL = 1
|
|
1498
|
+
|
|
1499
|
+
DCGM_POLICY_ISOLATION_NONE = 0
|
|
1500
|
+
|
|
1501
|
+
DCGM_POLICY_ACTION_NONE = 0
|
|
1502
|
+
DCGM_POLICY_ACTION_GPURESET = 1 # Deprecated
|
|
1503
|
+
|
|
1504
|
+
DCGM_POLICY_VALID_NONE = 0
|
|
1505
|
+
DCGM_POLICY_VALID_SV_SHORT = 1
|
|
1506
|
+
DCGM_POLICY_VALID_SV_MED = 2
|
|
1507
|
+
DCGM_POLICY_VALID_SV_LONG = 3
|
|
1508
|
+
DCGM_POLICY_VALID_SV_XLONG = 4
|
|
1509
|
+
|
|
1510
|
+
DCGM_POLICY_FAILURE_NONE = 0
|
|
1511
|
+
|
|
1512
|
+
DCGM_DIAG_LVL_INVALID = 0
|
|
1513
|
+
DCGM_DIAG_LVL_SHORT = 10
|
|
1514
|
+
DCGM_DIAG_LVL_MED = 20
|
|
1515
|
+
DCGM_DIAG_LVL_LONG = 30
|
|
1516
|
+
DCGM_DIAG_LVL_XLONG = 40
|
|
1517
|
+
|
|
1518
|
+
DCGM_DIAG_RESULT_PASS = 0
|
|
1519
|
+
DCGM_DIAG_RESULT_SKIP = 1
|
|
1520
|
+
DCGM_DIAG_RESULT_WARN = 2
|
|
1521
|
+
DCGM_DIAG_RESULT_FAIL = 3
|
|
1522
|
+
DCGM_DIAG_RESULT_NOT_RUN = 4
|
|
1523
|
+
|
|
1524
|
+
|
|
1525
|
+
class c_dcgmPolicyConditionParmTypes_t(DcgmUnion):
|
|
1526
|
+
_fields_ = [
|
|
1527
|
+
("boolean", c_bool),
|
|
1528
|
+
("llval", c_longlong),
|
|
1529
|
+
]
|
|
1530
|
+
|
|
1531
|
+
|
|
1532
|
+
class c_dcgmPolicyConditionParms_t(_PrintableStructure):
|
|
1533
|
+
_fields_ = [("tag", c_uint), ("val", c_dcgmPolicyConditionParmTypes_t)]
|
|
1534
|
+
|
|
1535
|
+
|
|
1536
|
+
class c_dcgmPolicy_v1(_PrintableStructure):
|
|
1537
|
+
_fields_ = [
|
|
1538
|
+
# version must always be first
|
|
1539
|
+
("version", c_uint),
|
|
1540
|
+
("condition", c_uint), # an OR'd list of DCGM_POLICY_COND_*
|
|
1541
|
+
("mode", c_uint),
|
|
1542
|
+
("isolation", c_uint),
|
|
1543
|
+
("action", c_uint),
|
|
1544
|
+
("validation", c_uint),
|
|
1545
|
+
("response", c_uint),
|
|
1546
|
+
("parms", c_dcgmPolicyConditionParms_t * DCGM_POLICY_COND_MAX),
|
|
1547
|
+
]
|
|
1548
|
+
|
|
1549
|
+
|
|
1550
|
+
dcgmPolicy_version1 = make_dcgm_version(c_dcgmPolicy_v1, 1)
|
|
1551
|
+
|
|
1552
|
+
|
|
1553
|
+
class c_dcgmPolicyConditionPci_t(_PrintableStructure):
|
|
1554
|
+
_fields_ = [
|
|
1555
|
+
("timestamp", c_longlong), # timestamp of the error
|
|
1556
|
+
("counter", c_uint), # value of the PCIe replay counter
|
|
1557
|
+
]
|
|
1558
|
+
|
|
1559
|
+
|
|
1560
|
+
class c_dcgmPolicyConditionDbe_t(_PrintableStructure):
    """Timestamp, location and error count carried by the "dbe" policy condition."""

    # Symbolic names for the values stored in the "location" field.
    LOCATIONS = {
        "L1": 0,
        "L2": 1,
        "DEVICE": 2,
        "REGISTER": 3,
        "TEXTURE": 4,
    }

    _fields_ = [
        # timestamp of the error
        ("timestamp", c_longlong),
        # location of the error (one of self.LOCATIONS)
        ("location", c_int),
        # number of errors
        ("numerrors", c_uint),
    ]
|
|
1568
|
+
|
|
1569
|
+
|
|
1570
|
+
class c_dcgmPolicyConditionMpr_t(_PrintableStructure):
|
|
1571
|
+
_fields_ = [
|
|
1572
|
+
("timestamp", c_longlong), # timestamp of the error
|
|
1573
|
+
("sbepages", c_uint), # number of pending pages due to SBE
|
|
1574
|
+
("dbepages", c_uint), # number of pending pages due to DBE
|
|
1575
|
+
]
|
|
1576
|
+
|
|
1577
|
+
|
|
1578
|
+
class c_dcgmPolicyConditionThermal_t(_PrintableStructure):
|
|
1579
|
+
_fields_ = [
|
|
1580
|
+
("timestamp", c_longlong), # timestamp of the error
|
|
1581
|
+
("thermalViolation", c_uint), # Temperature reached that violated policy
|
|
1582
|
+
]
|
|
1583
|
+
|
|
1584
|
+
|
|
1585
|
+
class c_dcgmPolicyConditionPower_t(_PrintableStructure):
    """Timestamp and power value carried by the "power" policy condition."""

    _fields_ = [
        # timestamp of the error
        ("timestamp", c_longlong),
        # Power value reached that violated policy
        ("powerViolation", c_uint),
    ]
|
|
1590
|
+
|
|
1591
|
+
|
|
1592
|
+
class c_dcgmPolicyConditionNvlink_t(_PrintableStructure):
    """Timestamp, field id and counter carried by the "nvlink" policy condition."""

    _fields_ = [
        # timestamp of the error
        ("timestamp", c_longlong),
        # FieldId of the nvlink error counter
        ("fieldId", c_ushort),
        # Error value reached that violated policy
        ("counter", c_uint),
    ]
|
|
1598
|
+
|
|
1599
|
+
|
|
1600
|
+
class c_dcgmPolicyConditionXID_t(_PrintableStructure):
|
|
1601
|
+
_fields_ = [
|
|
1602
|
+
("timestamp", c_longlong), # timestamp of the error
|
|
1603
|
+
("errnum", c_uint), # XID error number
|
|
1604
|
+
]
|
|
1605
|
+
|
|
1606
|
+
|
|
1607
|
+
class c_dcgmPolicyCallbackResponse_v1(_PrintableStructure):
    """Policy callback response: version, condition, and a per-condition value."""

    class Value(DcgmUnion):
        """Union with one member per policy condition."""

        # implement more of the fields when a test requires them
        _fields_ = [
            # ECC DBE return structure
            ("dbe", c_dcgmPolicyConditionDbe_t),
            # PCI replay error return structure
            ("pci", c_dcgmPolicyConditionPci_t),
            # Max retired pages limit return structure
            ("mpr", c_dcgmPolicyConditionMpr_t),
            # Thermal policy violations return structure
            ("thermal", c_dcgmPolicyConditionThermal_t),
            # Power policy violations return structure
            ("power", c_dcgmPolicyConditionPower_t),
            # Nvlink policy violations return structure
            ("nvlink", c_dcgmPolicyConditionNvlink_t),
            # XID policy violations return structure
            ("xid", c_dcgmPolicyConditionXID_t),
        ]

    _fields_ = [
        ("version", c_uint),
        # an OR'ed list of DCGM_POLICY_COND_*
        ("condition", c_int),
        ("val", Value),
    ]
|
|
1641
|
+
|
|
1642
|
+
|
|
1643
|
+
class c_dcgmFieldValue_v1_value(DcgmUnion):
|
|
1644
|
+
_fields_ = [
|
|
1645
|
+
("i64", c_int64),
|
|
1646
|
+
("dbl", c_double),
|
|
1647
|
+
("str", c_char * DCGM_MAX_STR_LENGTH),
|
|
1648
|
+
("blob", c_byte * DCGM_MAX_BLOB_LENGTH),
|
|
1649
|
+
]
|
|
1650
|
+
|
|
1651
|
+
|
|
1652
|
+
# This structure is used to represent value for the field to be queried.
|
|
1653
|
+
class c_dcgmFieldValue_v1(_PrintableStructure):
|
|
1654
|
+
_fields_ = [
|
|
1655
|
+
# version must always be first
|
|
1656
|
+
("version", c_uint),
|
|
1657
|
+
("fieldId", c_ushort),
|
|
1658
|
+
("fieldType", c_short),
|
|
1659
|
+
("status", c_int),
|
|
1660
|
+
("ts", c_int64),
|
|
1661
|
+
("value", c_dcgmFieldValue_v1_value),
|
|
1662
|
+
]
|
|
1663
|
+
|
|
1664
|
+
|
|
1665
|
+
dcgmFieldValue_version1 = make_dcgm_version(c_dcgmFieldValue_v1, 1)
|
|
1666
|
+
|
|
1667
|
+
|
|
1668
|
+
# This structure is used to represent value for the field to be queried (version 2)
|
|
1669
|
+
class c_dcgmFieldValue_v2(_PrintableStructure):
|
|
1670
|
+
_fields_ = [
|
|
1671
|
+
# version must always be first
|
|
1672
|
+
("version", c_uint),
|
|
1673
|
+
("entityGroupId", c_uint),
|
|
1674
|
+
("entityId", c_uint),
|
|
1675
|
+
("fieldId", c_ushort),
|
|
1676
|
+
("fieldType", c_short),
|
|
1677
|
+
("status", c_int),
|
|
1678
|
+
("unused", c_uint),
|
|
1679
|
+
("ts", c_int64),
|
|
1680
|
+
("value", c_dcgmFieldValue_v1_value),
|
|
1681
|
+
]
|
|
1682
|
+
|
|
1683
|
+
|
|
1684
|
+
dcgmFieldValue_version2 = make_dcgm_version(c_dcgmFieldValue_v2, 2)
|
|
1685
|
+
|
|
1686
|
+
# Field value flags used by dcgm_agent.dcgmEntitiesGetLatestValues()
|
|
1687
|
+
DCGM_FV_FLAG_LIVE_DATA = 0x00000001
|
|
1688
|
+
|
|
1689
|
+
DCGM_HEALTH_WATCH_PCIE = 0x1
|
|
1690
|
+
DCGM_HEALTH_WATCH_NVLINK = 0x2
|
|
1691
|
+
DCGM_HEALTH_WATCH_PMU = 0x4
|
|
1692
|
+
DCGM_HEALTH_WATCH_MCU = 0x8
|
|
1693
|
+
DCGM_HEALTH_WATCH_MEM = 0x10
|
|
1694
|
+
DCGM_HEALTH_WATCH_SM = 0x20
|
|
1695
|
+
DCGM_HEALTH_WATCH_INFOROM = 0x40
|
|
1696
|
+
DCGM_HEALTH_WATCH_THERMAL = 0x80
|
|
1697
|
+
DCGM_HEALTH_WATCH_POWER = 0x100
|
|
1698
|
+
DCGM_HEALTH_WATCH_DRIVER = 0x200
|
|
1699
|
+
DCGM_HEALTH_WATCH_NVSWITCH_NONFATAL = 0x400
|
|
1700
|
+
DCGM_HEALTH_WATCH_NVSWITCH_FATAL = 0x800
|
|
1701
|
+
DCGM_HEALTH_WATCH_ALL = 0xFFFFFFFF
|
|
1702
|
+
DCGM_HEALTH_WATCH_COUNT_V1 = 10
|
|
1703
|
+
DCGM_HEALTH_WATCH_COUNT_V2 = 12
|
|
1704
|
+
|
|
1705
|
+
DCGM_HEALTH_RESULT_PASS = 0
|
|
1706
|
+
DCGM_HEALTH_RESULT_WARN = 10
|
|
1707
|
+
DCGM_HEALTH_RESULT_FAIL = 20
|
|
1708
|
+
|
|
1709
|
+
|
|
1710
|
+
class c_dcgmDiagErrorDetail_t(_PrintableStructure):
|
|
1711
|
+
_fields_ = [("msg", c_char * 1024), ("code", c_uint)]
|
|
1712
|
+
|
|
1713
|
+
|
|
1714
|
+
DCGM_HEALTH_WATCH_MAX_INCIDENTS = DCGM_GROUP_MAX_ENTITIES
|
|
1715
|
+
|
|
1716
|
+
|
|
1717
|
+
class c_dcgmIncidentInfo_t(_PrintableStructure):
|
|
1718
|
+
_fields_ = [
|
|
1719
|
+
("system", c_uint),
|
|
1720
|
+
("health", c_uint32),
|
|
1721
|
+
("error", c_dcgmDiagErrorDetail_t),
|
|
1722
|
+
("entityInfo", c_dcgmGroupEntityPair_t),
|
|
1723
|
+
]
|
|
1724
|
+
|
|
1725
|
+
|
|
1726
|
+
class c_dcgmHealthResponse_v4(_PrintableStructure):
|
|
1727
|
+
_fields_ = [
|
|
1728
|
+
("version", c_uint32),
|
|
1729
|
+
("overallHealth", c_uint32),
|
|
1730
|
+
("incidentCount", c_uint32),
|
|
1731
|
+
("incidents", c_dcgmIncidentInfo_t * DCGM_HEALTH_WATCH_MAX_INCIDENTS),
|
|
1732
|
+
]
|
|
1733
|
+
|
|
1734
|
+
|
|
1735
|
+
dcgmHealthResponse_version4 = make_dcgm_version(c_dcgmHealthResponse_v4, 4)
|
|
1736
|
+
|
|
1737
|
+
|
|
1738
|
+
class c_dcgmHealthSetParams_v2(_PrintableStructure):
|
|
1739
|
+
_fields_ = [
|
|
1740
|
+
("version", c_uint32),
|
|
1741
|
+
("groupId", c_void_p),
|
|
1742
|
+
("systems", c_uint32),
|
|
1743
|
+
("updateInterval", c_int64),
|
|
1744
|
+
("maxKeepAge", c_double),
|
|
1745
|
+
]
|
|
1746
|
+
|
|
1747
|
+
|
|
1748
|
+
dcgmHealthSetParams_version2 = make_dcgm_version(c_dcgmHealthSetParams_v2, 2)
|
|
1749
|
+
|
|
1750
|
+
|
|
1751
|
+
# Pid info structs
|
|
1752
|
+
class c_dcgmStatSummaryInt64_t(_PrintableStructure):
|
|
1753
|
+
_fields_ = [("minValue", c_int64), ("maxValue", c_int64), ("average", c_int64)]
|
|
1754
|
+
|
|
1755
|
+
|
|
1756
|
+
class c_dcgmStatSummaryInt32_t(_PrintableStructure):
|
|
1757
|
+
_fields_ = [("minValue", c_int32), ("maxValue", c_int32), ("average", c_int32)]
|
|
1758
|
+
|
|
1759
|
+
|
|
1760
|
+
class c_dcgmStatSummaryFp64_t(_PrintableStructure):
|
|
1761
|
+
_fields_ = [("minValue", c_double), ("maxValue", c_double), ("average", c_double)]
|
|
1762
|
+
|
|
1763
|
+
|
|
1764
|
+
class c_dcgmProcessUtilInfo_t(_PrintableStructure):
|
|
1765
|
+
_fields_ = [("pid", c_uint), ("smUtil", c_double), ("memUtil", c_double)]
|
|
1766
|
+
|
|
1767
|
+
|
|
1768
|
+
class c_dcgmHealthResponseInfo_t(_PrintableStructure):
|
|
1769
|
+
_fields_ = [("system", c_uint), ("health", c_uint)]
|
|
1770
|
+
|
|
1771
|
+
|
|
1772
|
+
DCGM_MAX_PID_INFO_NUM = 16
|
|
1773
|
+
|
|
1774
|
+
|
|
1775
|
+
class c_dcgmPidSingleInfo_t(_PrintableStructure):
|
|
1776
|
+
_fields_ = [
|
|
1777
|
+
("gpuId", c_uint32),
|
|
1778
|
+
("energyConsumed", c_int64),
|
|
1779
|
+
("pcieRxBandwidth", c_dcgmStatSummaryInt64_t),
|
|
1780
|
+
("pcieTxBandwidth", c_dcgmStatSummaryInt64_t),
|
|
1781
|
+
("pcieReplays", c_int64),
|
|
1782
|
+
("startTime", c_int64),
|
|
1783
|
+
("endTime", c_int64),
|
|
1784
|
+
("processUtilization", c_dcgmProcessUtilInfo_t),
|
|
1785
|
+
("smUtilization", c_dcgmStatSummaryInt32_t),
|
|
1786
|
+
("memoryUtilization", c_dcgmStatSummaryInt32_t),
|
|
1787
|
+
("eccSingleBit", c_uint32), # Deprecated
|
|
1788
|
+
("eccDoubleBit", c_uint32),
|
|
1789
|
+
("memoryClock", c_dcgmStatSummaryInt32_t),
|
|
1790
|
+
("smClock", c_dcgmStatSummaryInt32_t),
|
|
1791
|
+
("numXidCriticalErrors", c_int32),
|
|
1792
|
+
("xidCriticalErrorsTs", c_int64 * 10),
|
|
1793
|
+
("numOtherComputePids", c_int32),
|
|
1794
|
+
("otherComputePids", c_uint32 * DCGM_MAX_PID_INFO_NUM),
|
|
1795
|
+
("numOtherGraphicsPids", c_int32),
|
|
1796
|
+
("otherGraphicsPids", c_uint32 * DCGM_MAX_PID_INFO_NUM),
|
|
1797
|
+
("maxGpuMemoryUsed", c_int64),
|
|
1798
|
+
("powerViolationTime", c_int64),
|
|
1799
|
+
("thermalViolationTime", c_int64),
|
|
1800
|
+
("reliabilityViolationTime", c_int64),
|
|
1801
|
+
("boardLimitViolationTime", c_int64),
|
|
1802
|
+
("lowUtilizationTime", c_int64),
|
|
1803
|
+
("syncBoostTime", c_int64),
|
|
1804
|
+
("overallHealth", c_uint),
|
|
1805
|
+
("incidentCount", c_uint),
|
|
1806
|
+
("systems", c_dcgmHealthResponseInfo_t * DCGM_HEALTH_WATCH_COUNT_V1),
|
|
1807
|
+
]
|
|
1808
|
+
|
|
1809
|
+
|
|
1810
|
+
class c_dcgmPidInfo_v2(_PrintableStructure):
|
|
1811
|
+
_fields_ = [
|
|
1812
|
+
("version", c_uint32),
|
|
1813
|
+
("pid", c_uint32),
|
|
1814
|
+
("unused", c_uint32),
|
|
1815
|
+
("numGpus", c_int32),
|
|
1816
|
+
("summary", c_dcgmPidSingleInfo_t),
|
|
1817
|
+
("gpus", c_dcgmPidSingleInfo_t * DCGM_MAX_NUM_DEVICES),
|
|
1818
|
+
]
|
|
1819
|
+
|
|
1820
|
+
|
|
1821
|
+
dcgmPidInfo_version2 = make_dcgm_version(c_dcgmPidInfo_v2, 2)
|
|
1822
|
+
|
|
1823
|
+
|
|
1824
|
+
class c_dcgmRunningProcess_v1(_PrintableStructure):
|
|
1825
|
+
_fields_ = [("version", c_uint32), ("pid", c_uint32), ("memoryUsed", c_uint64)]
|
|
1826
|
+
|
|
1827
|
+
|
|
1828
|
+
dcgmRunningProcess_version1 = make_dcgm_version(c_dcgmRunningProcess_v1, 1)
|
|
1829
|
+
|
|
1830
|
+
c_dcgmRunningProcess_t = c_dcgmRunningProcess_v1
|
|
1831
|
+
|
|
1832
|
+
|
|
1833
|
+
class c_dcgmGpuUsageInfo_t(_PrintableStructure):
|
|
1834
|
+
_fields_ = [
|
|
1835
|
+
("gpuId", c_uint32),
|
|
1836
|
+
("energyConsumed", c_int64),
|
|
1837
|
+
("powerUsage", c_dcgmStatSummaryFp64_t),
|
|
1838
|
+
("pcieRxBandwidth", c_dcgmStatSummaryInt64_t),
|
|
1839
|
+
("pcieTxBandwidth", c_dcgmStatSummaryInt64_t),
|
|
1840
|
+
("pcieReplays", c_int64),
|
|
1841
|
+
("startTime", c_int64),
|
|
1842
|
+
("endTime", c_int64),
|
|
1843
|
+
("smUtilization", c_dcgmStatSummaryInt32_t),
|
|
1844
|
+
("memoryUtilization", c_dcgmStatSummaryInt32_t),
|
|
1845
|
+
("eccSingleBit", c_uint32), # Deprecated
|
|
1846
|
+
("eccDoubleBit", c_uint32),
|
|
1847
|
+
("memoryClock", c_dcgmStatSummaryInt32_t),
|
|
1848
|
+
("smClock", c_dcgmStatSummaryInt32_t),
|
|
1849
|
+
("numXidCriticalErrors", c_int32),
|
|
1850
|
+
("xidCriticalErrorsTs", c_int64 * 10),
|
|
1851
|
+
("numComputePids", c_int32),
|
|
1852
|
+
("computePids", c_dcgmProcessUtilInfo_t * DCGM_MAX_PID_INFO_NUM),
|
|
1853
|
+
("numGraphicsPids", c_int32),
|
|
1854
|
+
("graphicsPids", c_dcgmProcessUtilInfo_t * DCGM_MAX_PID_INFO_NUM),
|
|
1855
|
+
("maxGpuMemoryUsed", c_int64),
|
|
1856
|
+
("powerViolationTime", c_int64),
|
|
1857
|
+
("thermalViolationTime", c_int64),
|
|
1858
|
+
("reliabilityViolationTime", c_int64),
|
|
1859
|
+
("boardLimitViolationTime", c_int64),
|
|
1860
|
+
("lowUtilizationTime", c_int64),
|
|
1861
|
+
("syncBoostTime", c_int64),
|
|
1862
|
+
("overallHealth", c_uint),
|
|
1863
|
+
("incidentCount", c_uint),
|
|
1864
|
+
("systems", c_dcgmHealthResponseInfo_t * DCGM_HEALTH_WATCH_COUNT_V1),
|
|
1865
|
+
]
|
|
1866
|
+
|
|
1867
|
+
|
|
1868
|
+
class c_dcgmJobInfo_v3(_PrintableStructure):
|
|
1869
|
+
_fields_ = [
|
|
1870
|
+
("version", c_uint32),
|
|
1871
|
+
("numGpus", c_int32),
|
|
1872
|
+
("summary", c_dcgmGpuUsageInfo_t),
|
|
1873
|
+
("gpus", c_dcgmGpuUsageInfo_t * DCGM_MAX_NUM_DEVICES),
|
|
1874
|
+
]
|
|
1875
|
+
|
|
1876
|
+
|
|
1877
|
+
dcgmJobInfo_version3 = make_dcgm_version(c_dcgmJobInfo_v3, 3)
|
|
1878
|
+
|
|
1879
|
+
|
|
1880
|
+
class c_dcgmDiagTestResult_v2(_PrintableStructure):
|
|
1881
|
+
_fields_ = [
|
|
1882
|
+
("result", c_uint),
|
|
1883
|
+
("error", c_dcgmDiagErrorDetail_t),
|
|
1884
|
+
("info", c_char * 1024),
|
|
1885
|
+
]
|
|
1886
|
+
|
|
1887
|
+
|
|
1888
|
+
class c_dcgmDiagResponsePerGpu_v4(_PrintableStructure):
|
|
1889
|
+
_fields_ = [
|
|
1890
|
+
("gpuId", c_uint),
|
|
1891
|
+
("hwDiagnosticReturn", c_uint),
|
|
1892
|
+
("results", c_dcgmDiagTestResult_v2 * DCGM_PER_GPU_TEST_COUNT_V8),
|
|
1893
|
+
]
|
|
1894
|
+
|
|
1895
|
+
|
|
1896
|
+
DCGM_SWTEST_COUNT = 10
|
|
1897
|
+
LEVEL_ONE_MAX_RESULTS = 16
|
|
1898
|
+
|
|
1899
|
+
|
|
1900
|
+
class c_dcgmDiagResponse_v8(_PrintableStructure):
|
|
1901
|
+
_fields_ = [
|
|
1902
|
+
("version", c_uint),
|
|
1903
|
+
("gpuCount", c_uint),
|
|
1904
|
+
("levelOneTestCount", c_uint),
|
|
1905
|
+
("levelOneResults", c_dcgmDiagTestResult_v2 * LEVEL_ONE_MAX_RESULTS),
|
|
1906
|
+
("perGpuResponses", c_dcgmDiagResponsePerGpu_v4 * DCGM_MAX_NUM_DEVICES),
|
|
1907
|
+
("systemError", c_dcgmDiagErrorDetail_t),
|
|
1908
|
+
("_unused", c_char * 1024),
|
|
1909
|
+
]
|
|
1910
|
+
|
|
1911
|
+
|
|
1912
|
+
dcgmDiagResponse_version8 = make_dcgm_version(c_dcgmDiagResponse_v8, 8)
|
|
1913
|
+
|
|
1914
|
+
DCGM_AFFINITY_BITMASK_ARRAY_SIZE = 8
|
|
1915
|
+
|
|
1916
|
+
|
|
1917
|
+
class c_dcgmDeviceTopologyPath_t(_PrintableStructure):
|
|
1918
|
+
_fields_ = [("gpuId", c_uint32), ("path", c_uint32), ("localNvLinkIds", c_uint32)]
|
|
1919
|
+
|
|
1920
|
+
|
|
1921
|
+
class c_dcgmDeviceTopology_v1(_PrintableStructure):
|
|
1922
|
+
_fields_ = [
|
|
1923
|
+
("version", c_uint32),
|
|
1924
|
+
("cpuAffinityMask", c_ulong * DCGM_AFFINITY_BITMASK_ARRAY_SIZE),
|
|
1925
|
+
("numGpus", c_uint32),
|
|
1926
|
+
("gpuPaths", c_dcgmDeviceTopologyPath_t * (DCGM_MAX_NUM_DEVICES - 1)),
|
|
1927
|
+
]
|
|
1928
|
+
|
|
1929
|
+
|
|
1930
|
+
dcgmDeviceTopology_version1 = make_dcgm_version(c_dcgmDeviceTopology_v1, 1)
|
|
1931
|
+
|
|
1932
|
+
|
|
1933
|
+
class c_dcgmGroupTopology_v1(_PrintableStructure):
|
|
1934
|
+
_fields_ = [
|
|
1935
|
+
("version", c_uint32),
|
|
1936
|
+
("groupCpuAffinityMask", c_ulong * DCGM_AFFINITY_BITMASK_ARRAY_SIZE),
|
|
1937
|
+
("numaOptimalFlag", c_uint32),
|
|
1938
|
+
("slowestPath", c_uint32),
|
|
1939
|
+
]
|
|
1940
|
+
|
|
1941
|
+
|
|
1942
|
+
dcgmGroupTopology_version1 = make_dcgm_version(c_dcgmGroupTopology_v1, 1)
|
|
1943
|
+
|
|
1944
|
+
# Maximum number of field groups that can exist
|
|
1945
|
+
DCGM_MAX_NUM_FIELD_GROUPS = 64
|
|
1946
|
+
|
|
1947
|
+
# Maximum number of field IDs that can be in a single field group
|
|
1948
|
+
DCGM_MAX_FIELD_IDS_PER_FIELD_GROUP = 128
|
|
1949
|
+
|
|
1950
|
+
|
|
1951
|
+
class c_dcgmFieldGroupInfo_v1(_PrintableStructure):
|
|
1952
|
+
_fields_ = [
|
|
1953
|
+
("version", c_uint32),
|
|
1954
|
+
("numFieldIds", c_uint32),
|
|
1955
|
+
("fieldGroupId", c_void_p),
|
|
1956
|
+
("fieldGroupName", c_char * DCGM_MAX_STR_LENGTH),
|
|
1957
|
+
("fieldIds", c_uint16 * DCGM_MAX_FIELD_IDS_PER_FIELD_GROUP),
|
|
1958
|
+
]
|
|
1959
|
+
|
|
1960
|
+
|
|
1961
|
+
dcgmFieldGroupInfo_version1 = make_dcgm_version(c_dcgmFieldGroupInfo_v1, 1)
|
|
1962
|
+
|
|
1963
|
+
|
|
1964
|
+
class c_dcgmAllFieldGroup_v1(_PrintableStructure):
|
|
1965
|
+
_fields_ = [
|
|
1966
|
+
("version", c_uint32),
|
|
1967
|
+
("numFieldGroups", c_uint32),
|
|
1968
|
+
("fieldGroups", c_dcgmFieldGroupInfo_v1 * DCGM_MAX_NUM_FIELD_GROUPS),
|
|
1969
|
+
]
|
|
1970
|
+
|
|
1971
|
+
|
|
1972
|
+
dcgmAllFieldGroup_version1 = make_dcgm_version(c_dcgmAllFieldGroup_v1, 1)
|
|
1973
|
+
|
|
1974
|
+
|
|
1975
|
+
class c_dcgmIntrospectMemory_v1(_PrintableStructure):
    """Versioned record reporting the bytes used to store watched fields."""

    _fields_ = [
        ("version", c_uint32),
        # The total number of bytes being used to store all of the fields
        # being watched
        ("bytesUsed", c_longlong),
    ]
|
|
1983
|
+
|
|
1984
|
+
|
|
1985
|
+
dcgmIntrospectMemory_version1 = make_dcgm_version(c_dcgmIntrospectMemory_v1, 1)
|
|
1986
|
+
|
|
1987
|
+
|
|
1988
|
+
class c_dcgmIntrospectCpuUtil_v1(_PrintableStructure):
    """Versioned record of CPU usage fractions: total, kernel mode, user mode."""

    _fields_ = [
        # version number (dcgmIntrospectCpuUtil_version)
        ("version", c_uint32),
        # fraction of device's CPU resources that were used
        ("total", c_double),
        # fraction of device's CPU resources that were used in kernel mode
        ("kernel", c_double),
        # fraction of device's CPU resources that were used in user mode
        ("user", c_double),
    ]
|
|
2001
|
+
|
|
2002
|
+
|
|
2003
|
+
dcgmIntrospectCpuUtil_version1 = make_dcgm_version(c_dcgmIntrospectCpuUtil_v1, 1)
|
|
2004
|
+
|
|
2005
|
+
DCGM_MAX_CONFIG_FILE_LEN = 10000
|
|
2006
|
+
DCGM_MAX_TEST_NAMES = 20
|
|
2007
|
+
DCGM_MAX_TEST_NAMES_LEN = 50
|
|
2008
|
+
DCGM_MAX_TEST_PARMS = 100
|
|
2009
|
+
DCGM_MAX_TEST_PARMS_LEN = 100
|
|
2010
|
+
DCGM_GPU_LIST_LEN = 50
|
|
2011
|
+
DCGM_FILE_LEN = 30
|
|
2012
|
+
DCGM_PATH_LEN = 128
|
|
2013
|
+
DCGM_THROTTLE_MASK_LEN = 50
|
|
2014
|
+
|
|
2015
|
+
# Flag options for running the GPU diagnostic (one bit per flag)
DCGM_RUN_FLAGS_VERBOSE = 1 << 0  # 0x0001
DCGM_RUN_FLAGS_STATSONFAIL = 1 << 1  # 0x0002
# UNUSED
DCGM_RUN_FLAGS_TRAIN = 1 << 2  # 0x0004
# UNUSED
DCGM_RUN_FLAGS_FORCE_TRAIN = 1 << 3  # 0x0008
# Enable fail early checks for the Targeted Stress, Targeted Power, SM Stress,
# and Diagnostic tests
DCGM_RUN_FLAGS_FAIL_EARLY = 1 << 4  # 0x0010
|
|
2023
|
+
|
|
2024
|
+
|
|
2025
|
+
class c_dcgmRunDiag_v7(_PrintableStructure):
    """Version 7 of the request structure used to run the GPU diagnostic."""

    _fields_ = [
        # version of this message
        ("version", c_uint),
        # flags specifying binary options for running it. Currently verbose
        # and stats on fail
        ("flags", c_uint),
        # 0-5 for the debug level the GPU diagnostic will use for logging
        ("debugLevel", c_uint),
        # group of GPUs to verify. Cannot be specified together with gpuList.
        ("groupId", c_void_p),
        # 0-3 for which tests to run. Optional.
        ("validate", c_uint),
        # Specified list of test names. Optional.
        # NOTE(review): `c_char * A * B` builds an array of B rows of A chars
        # each; confirm that this row/column order matches the C struct (the
        # total size is the same either way).
        ("testNames", c_char * DCGM_MAX_TEST_NAMES * DCGM_MAX_TEST_NAMES_LEN),
        # Parameters to set for specified tests in the format:
        # testName.parameterName=parameterValue. Optional.
        ("testParms", c_char * DCGM_MAX_TEST_PARMS * DCGM_MAX_TEST_PARMS_LEN),
        # Comma-separated list of fake gpus. Cannot be specified with the
        # groupId or gpuList.
        ("fakeGpuList", c_char * DCGM_GPU_LIST_LEN),
        # Comma-separated list of gpus. Cannot be specified with the groupId.
        ("gpuList", c_char * DCGM_GPU_LIST_LEN),
        # Alternate name for the debug log file that should be used
        ("debugLogFile", c_char * DCGM_PATH_LEN),
        # Path that the plugin's statistics files should be written to
        ("statsPath", c_char * DCGM_PATH_LEN),
        # Contents of nvvs config file (likely yaml)
        ("configFileContents", c_char * DCGM_MAX_CONFIG_FILE_LEN),
        # Throttle reasons to ignore as either integer mask or csv list of
        # reasons
        ("throttleMask", c_char * DCGM_THROTTLE_MASK_LEN),
        # Custom path to the diagnostic plugins
        ("pluginPath", c_char * DCGM_PATH_LEN),
        # Unused
        ("_unusedInt1", c_uint),
        # Unused
        ("_unusedInt2", c_uint),
        # Unused
        ("_unusedInt3", c_uint),
        # Unused
        ("_unusedBuf", c_char * DCGM_PATH_LEN),
        # How often the fail early checks should occur when
        # DCGM_RUN_FLAGS_FAIL_EARLY is set.
        ("failCheckInterval", c_uint),
    ]
|
|
2083
|
+
|
|
2084
|
+
|
|
2085
|
+
dcgmRunDiag_version7 = make_dcgm_version(c_dcgmRunDiag_v7, 7)
|
|
2086
|
+
|
|
2087
|
+
# Latest c_dcgmRunDiag class
|
|
2088
|
+
c_dcgmRunDiag_t = c_dcgmRunDiag_v7
|
|
2089
|
+
|
|
2090
|
+
# Latest version for dcgmRunDiag_t
|
|
2091
|
+
dcgmRunDiag_version = dcgmRunDiag_version7
|
|
2092
|
+
|
|
2093
|
+
# Flags for dcgmGetEntityGroupEntities's flags parameter
# Only return entities that are supported by DCGM.
DCGM_GEGE_FLAG_ONLY_SUPPORTED = 0x00000001
|
|
2097
|
+
|
|
2098
|
+
# Identifies a GPU NVLink error type returned by DCGM_FI_DEV_GPU_NVLINK_ERRORS
|
|
2099
|
+
DCGM_GPU_NVLINK_ERROR_RECOVERY_REQUIRED = 1 # NVLink link recovery error occurred
|
|
2100
|
+
DCGM_GPU_NVLINK_ERROR_FATAL = 2 # NVLink link fatal error occurred
|
|
2101
|
+
|
|
2102
|
+
# Topology hints for dcgmSelectGpusByTopology()
# No hints specified
DCGM_TOPO_HINT_F_NONE = 0x00000000
# Ignore the health of the GPUs when picking GPUs for job execution.
# By default, only healthy GPUs are considered.
DCGM_TOPO_HINT_F_IGNOREHEALTH = 0x00000001
|
|
2108
|
+
|
|
2109
|
+
|
|
2110
|
+
# Topology scheduling-hint message: a candidate GPU bitmask, the number of
# GPUs wanted, and hint flags.
class c_dcgmTopoSchedHint_v1(_PrintableStructure):
    _fields_ = [
        # Version of this message.
        ("version", c_uint),
        # Bitmask of the GPU ids to choose from.
        ("inputGpuIds", c_uint64),
        # The number of GPUs that DCGM should choose.
        ("numGpus", c_uint32),
        # Hints to ignore certain factors for the scheduling hint.
        ("hintFlags", c_uint64),
    ]
|
|
2120
|
+
|
|
2121
|
+
|
|
2122
|
+
# Value produced by make_dcgm_version for c_dcgmTopoSchedHint_v1 with version number 1.
dcgmTopoSchedHint_version1 = make_dcgm_version(c_dcgmTopoSchedHint_v1, 1)
|
|
2123
|
+
|
|
2124
|
+
# DCGM NvLink link states stored in the linkState field of the
# c_dcgmNvLinkGpuLinkStatus_* and c_dcgmNvLinkNvSwitchLinkStatus_* structures.
# NvLink is unsupported by this GPU (default for GPUs).
DcgmNvLinkLinkStateNotSupported = 0
# NvLink is supported for this link but this link is disabled
# (default for NvSwitches).
DcgmNvLinkLinkStateDisabled = 1
# This NvLink link is down (inactive).
DcgmNvLinkLinkStateDown = 2
# This NvLink link is up (active).
DcgmNvLinkLinkStateUp = 3
|
|
2131
|
+
|
|
2132
|
+
|
|
2133
|
+
# State of NvLink links for a GPU; the link array is sized by
# DCGM_NVLINK_MAX_LINKS_PER_GPU_LEGACY1.
class c_dcgmNvLinkGpuLinkStatus_v1(_PrintableStructure):
    _fields_ = [
        # Entity ID of the GPU (gpuId).
        ("entityId", c_uint32),
        # Link state of each link of this GPU.
        ("linkState", c_uint32 * DCGM_NVLINK_MAX_LINKS_PER_GPU_LEGACY1),
    ]
|
|
2142
|
+
|
|
2143
|
+
|
|
2144
|
+
# State of NvLink links for a GPU; the link array is sized by
# DCGM_NVLINK_MAX_LINKS_PER_GPU_LEGACY2.
class c_dcgmNvLinkGpuLinkStatus_v2(_PrintableStructure):
    _fields_ = [
        # Entity ID of the GPU (gpuId).
        ("entityId", c_uint32),
        # Link state of each link of this GPU.
        ("linkState", c_uint32 * DCGM_NVLINK_MAX_LINKS_PER_GPU_LEGACY2),
    ]
|
|
2153
|
+
|
|
2154
|
+
|
|
2155
|
+
# State of NvLink links for a GPU; the link array is sized by
# DCGM_NVLINK_MAX_LINKS_PER_GPU.
class c_dcgmNvLinkGpuLinkStatus_v3(_PrintableStructure):
    _fields_ = [
        # Entity ID of the GPU (gpuId).
        ("entityId", c_uint32),
        # Link state of each link of this GPU.
        ("linkState", c_uint32 * DCGM_NVLINK_MAX_LINKS_PER_GPU),
    ]
|
|
2163
|
+
|
|
2164
|
+
|
|
2165
|
+
# State of NvLink links for a NvSwitch; the link array is sized by
# DCGM_NVLINK_MAX_LINKS_PER_NVSWITCH_V1.
class c_dcgmNvLinkNvSwitchLinkStatus_v1(_PrintableStructure):
    _fields_ = [
        # Entity ID of the NvSwitch (physicalId).
        ("entityId", c_uint32),
        # Link state of each link of this NvSwitch.
        ("linkState", c_uint32 * DCGM_NVLINK_MAX_LINKS_PER_NVSWITCH_V1),
    ]
|
|
2174
|
+
|
|
2175
|
+
|
|
2176
|
+
class c_dcgmNvLinkStatus_v2(_PrintableStructure):
    """
    NvLink link status for all GPUs and NvSwitches in the system
    """

    _fields_ = [
        # Version of this message. Should be dcgmNvLinkStatus_version2.
        ("version", c_uint32),
        # Number of GPUs populated in gpus[].
        ("numGpus", c_uint32),
        # Per-GPU NvLink link statuses.
        ("gpus", c_dcgmNvLinkGpuLinkStatus_v2 * DCGM_MAX_NUM_DEVICES),
        # Number of NvSwitches populated in nvSwitches[].
        ("numNvSwitches", c_uint32),
        # Per-NvSwitch NvLink link statuses.
        ("nvSwitches", c_dcgmNvLinkNvSwitchLinkStatus_v1 * DCGM_MAX_NUM_SWITCHES),
    ]
|
|
2197
|
+
|
|
2198
|
+
|
|
2199
|
+
# Value produced by make_dcgm_version for c_dcgmNvLinkStatus_v2 with version number 2.
dcgmNvLinkStatus_version2 = make_dcgm_version(c_dcgmNvLinkStatus_v2, 2)
|
|
2200
|
+
|
|
2201
|
+
|
|
2202
|
+
# State of NvLink links for a NvSwitch; the link array is sized by
# DCGM_NVLINK_MAX_LINKS_PER_NVSWITCH.
class c_dcgmNvLinkNvSwitchLinkStatus_v2(_PrintableStructure):
    _fields_ = [
        # Entity ID of the NvSwitch (physicalId).
        ("entityId", c_uint32),
        # Link state of each link of this NvSwitch.
        ("linkState", c_uint32 * DCGM_NVLINK_MAX_LINKS_PER_NVSWITCH),
    ]
|
|
2211
|
+
|
|
2212
|
+
|
|
2213
|
+
class c_dcgmNvLinkStatus_v3(_PrintableStructure):
    """
    NvLink link status for all GPUs and NvSwitches in the system
    """

    _fields_ = [
        # Version of this message. Should be dcgmNvLinkStatus_version3.
        ("version", c_uint32),
        # Number of GPUs populated in gpus[].
        ("numGpus", c_uint32),
        # Per-GPU NvLink link statuses.
        ("gpus", c_dcgmNvLinkGpuLinkStatus_v3 * DCGM_MAX_NUM_DEVICES),
        # Number of NvSwitches populated in nvSwitches[].
        ("numNvSwitches", c_uint32),
        # Per-NvSwitch NvLink link statuses.
        ("nvSwitches", c_dcgmNvLinkNvSwitchLinkStatus_v2 * DCGM_MAX_NUM_SWITCHES),
    ]
|
|
2234
|
+
|
|
2235
|
+
|
|
2236
|
+
# Value produced by make_dcgm_version for c_dcgmNvLinkStatus_v3 with version number 3.
dcgmNvLinkStatus_version3 = make_dcgm_version(c_dcgmNvLinkStatus_v3, 3)
|
|
2237
|
+
|
|
2238
|
+
# Bitmask values for dcgmGetFieldIdSummary; each flag occupies one bit.
DCGM_SUMMARY_MIN = 1 << 0
DCGM_SUMMARY_MAX = 1 << 1
DCGM_SUMMARY_AVG = 1 << 2
DCGM_SUMMARY_SUM = 1 << 3
DCGM_SUMMARY_COUNT = 1 << 4
DCGM_SUMMARY_INTEGRAL = 1 << 5
DCGM_SUMMARY_DIFF = 1 << 6
# Equals the number of DCGM_SUMMARY_* flags defined above.
DCGM_SUMMARY_SIZE = 7
|
|
2247
|
+
|
|
2248
|
+
|
|
2249
|
+
# Summary response: a field type, a count, and DCGM_SUMMARY_SIZE value slots.
class c_dcgmSummaryResponse_t(_PrintableStructure):

    # A single summary value, readable as a 64-bit integer or as a double.
    # NOTE(review): presumably fieldType selects which member is valid - confirm.
    class ResponseValue(DcgmUnion):
        _fields_ = [
            ("i64", c_int64),
            ("dbl", c_double),
        ]

    _fields_ = [
        ("fieldType", c_uint),
        ("summaryCount", c_uint),
        # Array of DCGM_SUMMARY_SIZE union values.
        ("values", ResponseValue * DCGM_SUMMARY_SIZE),
    ]
|
|
2262
|
+
|
|
2263
|
+
|
|
2264
|
+
# Field summary request; the c_dcgmSummaryResponse_t result is embedded as the
# last member.
class c_dcgmFieldSummaryRequest_v1(_PrintableStructure):
    _fields_ = [
        ("version", c_uint),
        ("fieldId", c_ushort),
        ("entityGroupType", c_uint32),
        ("entityId", c_uint),
        # NOTE(review): presumably a combination of DCGM_SUMMARY_* bits - confirm.
        ("summaryTypeMask", c_uint32),
        ("startTime", c_uint64),
        ("endTime", c_uint64),
        ("response", c_dcgmSummaryResponse_t),
    ]
|
|
2275
|
+
|
|
2276
|
+
|
|
2277
|
+
# Value produced by make_dcgm_version for c_dcgmFieldSummaryRequest_v1 with version number 1.
dcgmFieldSummaryRequest_version1 = make_dcgm_version(c_dcgmFieldSummaryRequest_v1, 1)
|
|
2278
|
+
|
|
2279
|
+
# Module IDs
# Core DCGM
DcgmModuleIdCore = 0
# NvSwitch Module
DcgmModuleIdNvSwitch = 1
# VGPU Module
DcgmModuleIdVGPU = 2
# Introspection Module
DcgmModuleIdIntrospect = 3
# Health Module
DcgmModuleIdHealth = 4
# Policy Module
DcgmModuleIdPolicy = 5
# Config Module
DcgmModuleIdConfig = 6
# GPU Diagnostic Module
DcgmModuleIdDiag = 7
# Profiling Module
DcgmModuleIdProfiling = 8
# 1 greater than largest ID above
DcgmModuleIdCount = 9

# Module Status
# Module has not been loaded yet
DcgmModuleStatusNotLoaded = 0
# Module has been added to the denylist so it can't be loaded
DcgmModuleStatusDenylisted = 1
# Loading the module failed
DcgmModuleStatusFailed = 2
# Module has been loaded
DcgmModuleStatusLoaded = 3
# Module has been unloaded
DcgmModuleStatusUnloaded = 4
# Module has been paused. Implies it's been loaded
DcgmModuleStatusPaused = 5

# Length of the statuses array in c_dcgmModuleGetStatuses_v1.
DCGM_MODULE_STATUSES_CAPACITY = 16
|
|
2302
|
+
|
|
2303
|
+
|
|
2304
|
+
# One module entry: a module ID paired with that module's status.
class c_dcgmModuleGetStatusesModule_t(_PrintableStructure):
    _fields_ = [
        # One of DcgmModuleId*
        ("id", c_uint32),
        # One of DcgmModuleStatus*
        ("status", c_uint32),
    ]
|
|
2309
|
+
|
|
2310
|
+
|
|
2311
|
+
# Module status list holding up to DCGM_MODULE_STATUSES_CAPACITY entries.
class c_dcgmModuleGetStatuses_v1(_PrintableStructure):
    _fields_ = [
        ("version", c_uint),
        # NOTE(review): presumably the number of populated statuses entries - confirm.
        ("numStatuses", c_uint32),
        (
            "statuses",
            c_dcgmModuleGetStatusesModule_t * DCGM_MODULE_STATUSES_CAPACITY,
        ),
    ]
|
|
2317
|
+
|
|
2318
|
+
|
|
2319
|
+
# Value produced by make_dcgm_version for c_dcgmModuleGetStatuses_v1 with version number 1.
dcgmModuleGetStatuses_version1 = make_dcgm_version(c_dcgmModuleGetStatuses_v1, 1)
|
|
2320
|
+
|
|
2321
|
+
# Maximum number of metric ID groups that can exist in DCGM.
DCGM_PROF_MAX_NUM_GROUPS_V2 = 10
# Maximum number of field IDs that can be in a single DCGM profiling metric
# group.
DCGM_PROF_MAX_FIELD_IDS_PER_GROUP_V2 = 64
|
|
2325
|
+
|
|
2326
|
+
|
|
2327
|
+
# One profiling metric group: major/minor IDs plus a fixed-capacity field ID
# array.
class c_dcgmProfMetricGroupInfo_v2(_PrintableStructure):
    _fields_ = [
        ("majorId", c_ushort),
        ("minorId", c_ushort),
        # NOTE(review): presumably the number of populated fieldIds entries - confirm.
        ("numFieldIds", c_uint32),
        # Capacity is DCGM_PROF_MAX_FIELD_IDS_PER_GROUP_V2 entries.
        ("fieldIds", c_ushort * DCGM_PROF_MAX_FIELD_IDS_PER_GROUP_V2),
    ]
|
|
2334
|
+
|
|
2335
|
+
|
|
2336
|
+
# Metric group listing for one gpuId, holding up to
# DCGM_PROF_MAX_NUM_GROUPS_V2 groups.
class c_dcgmProfGetMetricGroups_v3(_PrintableStructure):
    _fields_ = [
        ("version", c_uint32),
        ("unused", c_uint32),
        ("gpuId", c_uint32),
        # NOTE(review): presumably the number of populated metricGroups entries - confirm.
        ("numMetricGroups", c_uint32),
        (
            "metricGroups",
            c_dcgmProfMetricGroupInfo_v2 * DCGM_PROF_MAX_NUM_GROUPS_V2,
        ),
    ]
|
|
2344
|
+
|
|
2345
|
+
|
|
2346
|
+
# Value produced by make_dcgm_version for c_dcgmProfGetMetricGroups_v3 with version number 3.
dcgmProfGetMetricGroups_version3 = make_dcgm_version(c_dcgmProfGetMetricGroups_v3, 3)
|
|
2347
|
+
|
|
2348
|
+
|
|
2349
|
+
# Version info carrying a raw build-info string buffer.
class c_dcgmVersionInfo_v2(_PrintableStructure):
    _fields_ = [
        ("version", c_uint32),
        # Character buffer of twice DCGM_MAX_STR_LENGTH bytes.
        ("rawBuildInfoString", c_char * (DCGM_MAX_STR_LENGTH * 2)),
    ]
|
|
2354
|
+
|
|
2355
|
+
|
|
2356
|
+
# Value produced by make_dcgm_version for c_dcgmVersionInfo_v2 with version number 2.
dcgmVersionInfo_version2 = make_dcgm_version(c_dcgmVersionInfo_v2, 2)
# Unversioned alias for the version-info version value defined above.
dcgmVersionInfo_version = dcgmVersionInfo_version2
|