triton-model-analyzer 1.48.0__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- model_analyzer/__init__.py +15 -0
- model_analyzer/analyzer.py +448 -0
- model_analyzer/cli/__init__.py +15 -0
- model_analyzer/cli/cli.py +193 -0
- model_analyzer/config/__init__.py +15 -0
- model_analyzer/config/generate/__init__.py +15 -0
- model_analyzer/config/generate/automatic_model_config_generator.py +164 -0
- model_analyzer/config/generate/base_model_config_generator.py +352 -0
- model_analyzer/config/generate/brute_plus_binary_parameter_search_run_config_generator.py +164 -0
- model_analyzer/config/generate/brute_run_config_generator.py +154 -0
- model_analyzer/config/generate/concurrency_sweeper.py +75 -0
- model_analyzer/config/generate/config_generator_interface.py +52 -0
- model_analyzer/config/generate/coordinate.py +143 -0
- model_analyzer/config/generate/coordinate_data.py +86 -0
- model_analyzer/config/generate/generator_utils.py +116 -0
- model_analyzer/config/generate/manual_model_config_generator.py +187 -0
- model_analyzer/config/generate/model_config_generator_factory.py +92 -0
- model_analyzer/config/generate/model_profile_spec.py +74 -0
- model_analyzer/config/generate/model_run_config_generator.py +154 -0
- model_analyzer/config/generate/model_variant_name_manager.py +150 -0
- model_analyzer/config/generate/neighborhood.py +536 -0
- model_analyzer/config/generate/optuna_plus_concurrency_sweep_run_config_generator.py +141 -0
- model_analyzer/config/generate/optuna_run_config_generator.py +838 -0
- model_analyzer/config/generate/perf_analyzer_config_generator.py +312 -0
- model_analyzer/config/generate/quick_plus_concurrency_sweep_run_config_generator.py +130 -0
- model_analyzer/config/generate/quick_run_config_generator.py +753 -0
- model_analyzer/config/generate/run_config_generator_factory.py +329 -0
- model_analyzer/config/generate/search_config.py +112 -0
- model_analyzer/config/generate/search_dimension.py +73 -0
- model_analyzer/config/generate/search_dimensions.py +85 -0
- model_analyzer/config/generate/search_parameter.py +49 -0
- model_analyzer/config/generate/search_parameters.py +388 -0
- model_analyzer/config/input/__init__.py +15 -0
- model_analyzer/config/input/config_command.py +483 -0
- model_analyzer/config/input/config_command_profile.py +1747 -0
- model_analyzer/config/input/config_command_report.py +267 -0
- model_analyzer/config/input/config_defaults.py +236 -0
- model_analyzer/config/input/config_enum.py +83 -0
- model_analyzer/config/input/config_field.py +216 -0
- model_analyzer/config/input/config_list_generic.py +112 -0
- model_analyzer/config/input/config_list_numeric.py +151 -0
- model_analyzer/config/input/config_list_string.py +111 -0
- model_analyzer/config/input/config_none.py +71 -0
- model_analyzer/config/input/config_object.py +129 -0
- model_analyzer/config/input/config_primitive.py +81 -0
- model_analyzer/config/input/config_status.py +75 -0
- model_analyzer/config/input/config_sweep.py +83 -0
- model_analyzer/config/input/config_union.py +113 -0
- model_analyzer/config/input/config_utils.py +128 -0
- model_analyzer/config/input/config_value.py +243 -0
- model_analyzer/config/input/objects/__init__.py +15 -0
- model_analyzer/config/input/objects/config_model_profile_spec.py +325 -0
- model_analyzer/config/input/objects/config_model_report_spec.py +173 -0
- model_analyzer/config/input/objects/config_plot.py +198 -0
- model_analyzer/config/input/objects/config_protobuf_utils.py +101 -0
- model_analyzer/config/input/yaml_config_validator.py +82 -0
- model_analyzer/config/run/__init__.py +15 -0
- model_analyzer/config/run/model_run_config.py +313 -0
- model_analyzer/config/run/run_config.py +168 -0
- model_analyzer/constants.py +76 -0
- model_analyzer/device/__init__.py +15 -0
- model_analyzer/device/device.py +24 -0
- model_analyzer/device/gpu_device.py +87 -0
- model_analyzer/device/gpu_device_factory.py +248 -0
- model_analyzer/entrypoint.py +307 -0
- model_analyzer/log_formatter.py +65 -0
- model_analyzer/model_analyzer_exceptions.py +24 -0
- model_analyzer/model_manager.py +255 -0
- model_analyzer/monitor/__init__.py +15 -0
- model_analyzer/monitor/cpu_monitor.py +69 -0
- model_analyzer/monitor/dcgm/DcgmDiag.py +191 -0
- model_analyzer/monitor/dcgm/DcgmFieldGroup.py +83 -0
- model_analyzer/monitor/dcgm/DcgmGroup.py +815 -0
- model_analyzer/monitor/dcgm/DcgmHandle.py +141 -0
- model_analyzer/monitor/dcgm/DcgmJsonReader.py +69 -0
- model_analyzer/monitor/dcgm/DcgmReader.py +623 -0
- model_analyzer/monitor/dcgm/DcgmStatus.py +57 -0
- model_analyzer/monitor/dcgm/DcgmSystem.py +412 -0
- model_analyzer/monitor/dcgm/__init__.py +15 -0
- model_analyzer/monitor/dcgm/common/__init__.py +13 -0
- model_analyzer/monitor/dcgm/common/dcgm_client_cli_parser.py +194 -0
- model_analyzer/monitor/dcgm/common/dcgm_client_main.py +86 -0
- model_analyzer/monitor/dcgm/dcgm_agent.py +887 -0
- model_analyzer/monitor/dcgm/dcgm_collectd_plugin.py +369 -0
- model_analyzer/monitor/dcgm/dcgm_errors.py +395 -0
- model_analyzer/monitor/dcgm/dcgm_field_helpers.py +546 -0
- model_analyzer/monitor/dcgm/dcgm_fields.py +815 -0
- model_analyzer/monitor/dcgm/dcgm_fields_collectd.py +671 -0
- model_analyzer/monitor/dcgm/dcgm_fields_internal.py +29 -0
- model_analyzer/monitor/dcgm/dcgm_fluentd.py +45 -0
- model_analyzer/monitor/dcgm/dcgm_monitor.py +138 -0
- model_analyzer/monitor/dcgm/dcgm_prometheus.py +326 -0
- model_analyzer/monitor/dcgm/dcgm_structs.py +2357 -0
- model_analyzer/monitor/dcgm/dcgm_telegraf.py +65 -0
- model_analyzer/monitor/dcgm/dcgm_value.py +151 -0
- model_analyzer/monitor/dcgm/dcgmvalue.py +155 -0
- model_analyzer/monitor/dcgm/denylist_recommendations.py +573 -0
- model_analyzer/monitor/dcgm/pydcgm.py +47 -0
- model_analyzer/monitor/monitor.py +143 -0
- model_analyzer/monitor/remote_monitor.py +137 -0
- model_analyzer/output/__init__.py +15 -0
- model_analyzer/output/file_writer.py +63 -0
- model_analyzer/output/output_writer.py +42 -0
- model_analyzer/perf_analyzer/__init__.py +15 -0
- model_analyzer/perf_analyzer/genai_perf_config.py +206 -0
- model_analyzer/perf_analyzer/perf_analyzer.py +882 -0
- model_analyzer/perf_analyzer/perf_config.py +479 -0
- model_analyzer/plots/__init__.py +15 -0
- model_analyzer/plots/detailed_plot.py +266 -0
- model_analyzer/plots/plot_manager.py +224 -0
- model_analyzer/plots/simple_plot.py +213 -0
- model_analyzer/record/__init__.py +15 -0
- model_analyzer/record/gpu_record.py +68 -0
- model_analyzer/record/metrics_manager.py +887 -0
- model_analyzer/record/record.py +280 -0
- model_analyzer/record/record_aggregator.py +256 -0
- model_analyzer/record/types/__init__.py +15 -0
- model_analyzer/record/types/cpu_available_ram.py +93 -0
- model_analyzer/record/types/cpu_used_ram.py +93 -0
- model_analyzer/record/types/gpu_free_memory.py +96 -0
- model_analyzer/record/types/gpu_power_usage.py +107 -0
- model_analyzer/record/types/gpu_total_memory.py +96 -0
- model_analyzer/record/types/gpu_used_memory.py +96 -0
- model_analyzer/record/types/gpu_utilization.py +108 -0
- model_analyzer/record/types/inter_token_latency_avg.py +60 -0
- model_analyzer/record/types/inter_token_latency_base.py +74 -0
- model_analyzer/record/types/inter_token_latency_max.py +60 -0
- model_analyzer/record/types/inter_token_latency_min.py +60 -0
- model_analyzer/record/types/inter_token_latency_p25.py +60 -0
- model_analyzer/record/types/inter_token_latency_p50.py +60 -0
- model_analyzer/record/types/inter_token_latency_p75.py +60 -0
- model_analyzer/record/types/inter_token_latency_p90.py +60 -0
- model_analyzer/record/types/inter_token_latency_p95.py +60 -0
- model_analyzer/record/types/inter_token_latency_p99.py +60 -0
- model_analyzer/record/types/output_token_throughput.py +105 -0
- model_analyzer/record/types/perf_client_response_wait.py +97 -0
- model_analyzer/record/types/perf_client_send_recv.py +97 -0
- model_analyzer/record/types/perf_latency.py +111 -0
- model_analyzer/record/types/perf_latency_avg.py +60 -0
- model_analyzer/record/types/perf_latency_base.py +74 -0
- model_analyzer/record/types/perf_latency_p90.py +60 -0
- model_analyzer/record/types/perf_latency_p95.py +60 -0
- model_analyzer/record/types/perf_latency_p99.py +60 -0
- model_analyzer/record/types/perf_server_compute_infer.py +97 -0
- model_analyzer/record/types/perf_server_compute_input.py +97 -0
- model_analyzer/record/types/perf_server_compute_output.py +97 -0
- model_analyzer/record/types/perf_server_queue.py +97 -0
- model_analyzer/record/types/perf_throughput.py +105 -0
- model_analyzer/record/types/time_to_first_token_avg.py +60 -0
- model_analyzer/record/types/time_to_first_token_base.py +74 -0
- model_analyzer/record/types/time_to_first_token_max.py +60 -0
- model_analyzer/record/types/time_to_first_token_min.py +60 -0
- model_analyzer/record/types/time_to_first_token_p25.py +60 -0
- model_analyzer/record/types/time_to_first_token_p50.py +60 -0
- model_analyzer/record/types/time_to_first_token_p75.py +60 -0
- model_analyzer/record/types/time_to_first_token_p90.py +60 -0
- model_analyzer/record/types/time_to_first_token_p95.py +60 -0
- model_analyzer/record/types/time_to_first_token_p99.py +60 -0
- model_analyzer/reports/__init__.py +15 -0
- model_analyzer/reports/html_report.py +195 -0
- model_analyzer/reports/pdf_report.py +50 -0
- model_analyzer/reports/report.py +86 -0
- model_analyzer/reports/report_factory.py +62 -0
- model_analyzer/reports/report_manager.py +1376 -0
- model_analyzer/reports/report_utils.py +42 -0
- model_analyzer/result/__init__.py +15 -0
- model_analyzer/result/constraint_manager.py +150 -0
- model_analyzer/result/model_config_measurement.py +354 -0
- model_analyzer/result/model_constraints.py +105 -0
- model_analyzer/result/parameter_search.py +246 -0
- model_analyzer/result/result_manager.py +430 -0
- model_analyzer/result/result_statistics.py +159 -0
- model_analyzer/result/result_table.py +217 -0
- model_analyzer/result/result_table_manager.py +646 -0
- model_analyzer/result/result_utils.py +42 -0
- model_analyzer/result/results.py +277 -0
- model_analyzer/result/run_config_measurement.py +658 -0
- model_analyzer/result/run_config_result.py +210 -0
- model_analyzer/result/run_config_result_comparator.py +110 -0
- model_analyzer/result/sorted_results.py +151 -0
- model_analyzer/state/__init__.py +15 -0
- model_analyzer/state/analyzer_state.py +76 -0
- model_analyzer/state/analyzer_state_manager.py +215 -0
- model_analyzer/triton/__init__.py +15 -0
- model_analyzer/triton/client/__init__.py +15 -0
- model_analyzer/triton/client/client.py +234 -0
- model_analyzer/triton/client/client_factory.py +57 -0
- model_analyzer/triton/client/grpc_client.py +104 -0
- model_analyzer/triton/client/http_client.py +107 -0
- model_analyzer/triton/model/__init__.py +15 -0
- model_analyzer/triton/model/model_config.py +556 -0
- model_analyzer/triton/model/model_config_variant.py +29 -0
- model_analyzer/triton/server/__init__.py +15 -0
- model_analyzer/triton/server/server.py +76 -0
- model_analyzer/triton/server/server_config.py +269 -0
- model_analyzer/triton/server/server_docker.py +229 -0
- model_analyzer/triton/server/server_factory.py +306 -0
- model_analyzer/triton/server/server_local.py +158 -0
- triton_model_analyzer-1.48.0.dist-info/METADATA +52 -0
- triton_model_analyzer-1.48.0.dist-info/RECORD +204 -0
- triton_model_analyzer-1.48.0.dist-info/WHEEL +5 -0
- triton_model_analyzer-1.48.0.dist-info/entry_points.txt +2 -0
- triton_model_analyzer-1.48.0.dist-info/licenses/LICENSE +67 -0
- triton_model_analyzer-1.48.0.dist-info/top_level.txt +1 -0
|
@@ -0,0 +1,395 @@
|
|
|
1
|
+
# Copyright (c) 2022, NVIDIA CORPORATION & AFFILIATES. All rights reserved.
|
|
2
|
+
#
|
|
3
|
+
# Licensed under the Apache License, Version 2.0 (the "License");
|
|
4
|
+
# you may not use this file except in compliance with the License.
|
|
5
|
+
# You may obtain a copy of the License at
|
|
6
|
+
#
|
|
7
|
+
# http://www.apache.org/licenses/LICENSE-2.0
|
|
8
|
+
#
|
|
9
|
+
# Unless required by applicable law or agreed to in writing, software
|
|
10
|
+
# distributed under the License is distributed on an "AS IS" BASIS,
|
|
11
|
+
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
|
12
|
+
# See the License for the specific language governing permissions and
|
|
13
|
+
# limitations under the License.
|
|
14
|
+
import ctypes
|
|
15
|
+
import model_analyzer.monitor.dcgm.dcgm_structs as dcgm_structs
|
|
16
|
+
|
|
17
|
+
DCGM_FR_OK = 0 # No error
|
|
18
|
+
DCGM_FR_UNKNOWN = 1 # Unknown error code
|
|
19
|
+
DCGM_FR_UNRECOGNIZED = 2 # Unrecognized error code
|
|
20
|
+
DCGM_FR_PCI_REPLAY_RATE = 3 # Unacceptable rate of PCI errors
|
|
21
|
+
DCGM_FR_VOLATILE_DBE_DETECTED = 4 # Uncorrectable volatile double bit error
|
|
22
|
+
DCGM_FR_VOLATILE_SBE_DETECTED = 5 # Unacceptable rate of volatile single bit errors
|
|
23
|
+
DCGM_FR_PENDING_PAGE_RETIREMENTS = 6 # Pending page retirements detected
|
|
24
|
+
DCGM_FR_RETIRED_PAGES_LIMIT = 7 # Unacceptable total page retirements detected
|
|
25
|
+
DCGM_FR_RETIRED_PAGES_DBE_LIMIT = 8 # Unacceptable total page retirements due to uncorrectable errors
|
|
26
|
+
DCGM_FR_CORRUPT_INFOROM = 9 # Corrupt inforom found
|
|
27
|
+
DCGM_FR_CLOCK_THROTTLE_THERMAL = 10 # Clocks being throttled due to overheating
|
|
28
|
+
DCGM_FR_POWER_UNREADABLE = 11 # Cannot get a reading for power from NVML
|
|
29
|
+
DCGM_FR_CLOCK_THROTTLE_POWER = 12 # Clock being throttled due to power restrictions
|
|
30
|
+
DCGM_FR_NVLINK_ERROR_THRESHOLD = 13 # Unacceptable rate of NVLink errors
|
|
31
|
+
DCGM_FR_NVLINK_DOWN = 14 # NVLink is down
|
|
32
|
+
DCGM_FR_NVSWITCH_FATAL_ERROR = 15 # Fatal errors on the NVSwitch
|
|
33
|
+
DCGM_FR_NVSWITCH_NON_FATAL_ERROR = 16 # Non-fatal errors on the NVSwitch
|
|
34
|
+
DCGM_FR_NVSWITCH_DOWN = 17 # NVSwitch is down
|
|
35
|
+
DCGM_FR_NO_ACCESS_TO_FILE = 18 # Cannot access a file
|
|
36
|
+
DCGM_FR_NVML_API = 19 # Error occurred on an NVML API
|
|
37
|
+
DCGM_FR_DEVICE_COUNT_MISMATCH = 20 # Disagreement in GPU count between /dev and NVML
|
|
38
|
+
DCGM_FR_BAD_PARAMETER = 21 # Bad parameter passed to API
|
|
39
|
+
DCGM_FR_CANNOT_OPEN_LIB = 22 # Cannot open a library that must be accessed
|
|
40
|
+
DCGM_FR_DENYLISTED_DRIVER = 23 # A driver on the denylist (nouveau) is active
|
|
41
|
+
DCGM_FR_NVML_LIB_BAD = 24 # The NVML library is missing expected functions
|
|
42
|
+
DCGM_FR_GRAPHICS_PROCESSES = 25 # Graphics processes are active on this GPU
|
|
43
|
+
DCGM_FR_HOSTENGINE_CONN = 26 # Unstable connection to nv-hostengine (daemonized DCGM)
|
|
44
|
+
DCGM_FR_FIELD_QUERY = 27 # Error querying a field from DCGM
|
|
45
|
+
DCGM_FR_BAD_CUDA_ENV = 28 # The environment has variables that hurt CUDA
|
|
46
|
+
DCGM_FR_PERSISTENCE_MODE = 29 # Persistence mode is disabled
|
|
47
|
+
DCGM_FR_LOW_BANDWIDTH = 30 # The bandwidth is unacceptably low
|
|
48
|
+
DCGM_FR_HIGH_LATENCY = 31 # Latency is too high
|
|
49
|
+
DCGM_FR_CANNOT_GET_FIELD_TAG = 32 # Cannot find a tag for a field
|
|
50
|
+
DCGM_FR_FIELD_VIOLATION = 33 # The value for the specified error field is above 0
|
|
51
|
+
DCGM_FR_FIELD_THRESHOLD = 34 # The value for the specified field is above the threshold
|
|
52
|
+
DCGM_FR_FIELD_VIOLATION_DBL = 35 # The value for the specified error field is above 0
|
|
53
|
+
DCGM_FR_FIELD_THRESHOLD_DBL = 36 # The value for the specified field is above the threshold
|
|
54
|
+
DCGM_FR_UNSUPPORTED_FIELD_TYPE = 37 # Field type cannot be supported
|
|
55
|
+
DCGM_FR_FIELD_THRESHOLD_TS = 38 # The value for the specified field is above the threshold
|
|
56
|
+
DCGM_FR_FIELD_THRESHOLD_TS_DBL = 39 # The value for the specified field is above the threshold
|
|
57
|
+
DCGM_FR_THERMAL_VIOLATIONS = 40 # Thermal violations detected
|
|
58
|
+
DCGM_FR_THERMAL_VIOLATIONS_TS = 41 # Thermal violations detected with a timestamp
|
|
59
|
+
DCGM_FR_TEMP_VIOLATION = 42 # Temperature is too high
|
|
60
|
+
DCGM_FR_THROTTLING_VIOLATION = 43 # Non-benign clock throttling is occurring
|
|
61
|
+
DCGM_FR_INTERNAL = 44 # An internal error was detected
|
|
62
|
+
DCGM_FR_PCIE_GENERATION = 45 # PCIe generation is too low
|
|
63
|
+
DCGM_FR_PCIE_WIDTH = 46 # PCIe width is too low
|
|
64
|
+
DCGM_FR_ABORTED = 47 # Test was aborted by a user signal
|
|
65
|
+
DCGM_FR_TEST_DISABLED = 48 # This test is disabled for this GPU
|
|
66
|
+
DCGM_FR_CANNOT_GET_STAT = 49 # Cannot get telemetry for a needed value
|
|
67
|
+
DCGM_FR_STRESS_LEVEL = 50 # Stress level is too low (bad performance)
|
|
68
|
+
DCGM_FR_CUDA_API = 51 # Error calling the specified CUDA API
|
|
69
|
+
DCGM_FR_FAULTY_MEMORY = 52 # Faulty memory detected on this GPU
|
|
70
|
+
DCGM_FR_CANNOT_SET_WATCHES = 53 # Unable to set field watches in DCGM
|
|
71
|
+
DCGM_FR_CUDA_UNBOUND = 54 # CUDA context is no longer bound
|
|
72
|
+
DCGM_FR_ECC_DISABLED = 55 # ECC memory is disabled right now
|
|
73
|
+
DCGM_FR_MEMORY_ALLOC = 56 # Cannot allocate memory
|
|
74
|
+
DCGM_FR_CUDA_DBE = 57 # CUDA detected unrecoverable double-bit error
|
|
75
|
+
DCGM_FR_MEMORY_MISMATCH = 58 # Memory error detected
|
|
76
|
+
DCGM_FR_CUDA_DEVICE = 59 # No CUDA device discoverable for existing GPU
|
|
77
|
+
DCGM_FR_ECC_UNSUPPORTED = 60 # ECC memory is unsupported by this SKU
|
|
78
|
+
DCGM_FR_ECC_PENDING = 61 # ECC memory is in a pending state
|
|
79
|
+
DCGM_FR_MEMORY_BANDWIDTH = 62 # Memory bandwidth is too low
|
|
80
|
+
DCGM_FR_TARGET_POWER = 63 # Cannot hit the target power draw
|
|
81
|
+
DCGM_FR_API_FAIL = 64 # The specified API call failed
|
|
82
|
+
DCGM_FR_API_FAIL_GPU = 65 # The specified API call failed for the specified GPU
|
|
83
|
+
DCGM_FR_CUDA_CONTEXT = 66 # Cannot create a CUDA context on this GPU
|
|
84
|
+
DCGM_FR_DCGM_API = 67 # DCGM API failure
|
|
85
|
+
DCGM_FR_CONCURRENT_GPUS = 68 # Need multiple GPUs to run this test
|
|
86
|
+
DCGM_FR_TOO_MANY_ERRORS = 69 # More errors than fit in the return struct
|
|
87
|
+
DCGM_FR_NVLINK_CRC_ERROR_THRESHOLD = 70 # More than 100 CRC errors are happening per second
|
|
88
|
+
DCGM_FR_NVLINK_ERROR_CRITICAL = 71 # NVLink error for a field that should always be 0
|
|
89
|
+
DCGM_FR_ENFORCED_POWER_LIMIT = 72 # The enforced power limit is too low to hit the target
|
|
90
|
+
DCGM_FR_MEMORY_ALLOC_HOST = 73 # Cannot allocate memory on the host
|
|
91
|
+
DCGM_FR_GPU_OP_MODE = 74 # Bad GPU operating mode for running plugin
|
|
92
|
+
DCGM_FR_NO_MEMORY_CLOCKS = 75 # No memory clocks with the needed MHz were found
|
|
93
|
+
DCGM_FR_NO_GRAPHICS_CLOCKS = 76 # No graphics clocks with the needed MHz were found
|
|
94
|
+
DCGM_FR_HAD_TO_RESTORE_STATE = 77 # Note that we had to restore a GPU's state
|
|
95
|
+
DCGM_FR_L1TAG_UNSUPPORTED = 78 # L1TAG test is unsupported by this SKU
|
|
96
|
+
DCGM_FR_L1TAG_MISCOMPARE = 79 # L1TAG test failed on a miscompare
|
|
97
|
+
DCGM_FR_ROW_REMAP_FAILURE = 80 # Row remapping failed (Ampere or newer GPUs)
|
|
98
|
+
DCGM_FR_UNCONTAINED_ERROR = 81 # Uncontained error - XID 95
|
|
99
|
+
DCGM_FR_EMPTY_GPU_LIST = 82 # No GPU information given to plugin
|
|
100
|
+
DCGM_FR_DBE_PENDING_PAGE_RETIREMENTS = 83 # Pending page retirements due to a DBE
|
|
101
|
+
DCGM_FR_UNCORRECTABLE_ROW_REMAP = 84 # Uncorrectable row remapping
|
|
102
|
+
DCGM_FR_PENDING_ROW_REMAP = 85 # Row remapping is pending
|
|
103
|
+
DCGM_FR_BROKEN_P2P_MEMORY_DEVICE = 86 # P2P copy test detected an error writing to this GPU
|
|
104
|
+
DCGM_FR_BROKEN_P2P_WRITER_DEVICE = 87 # P2P copy test detected an error writing from this GPU
|
|
105
|
+
DCGM_FR_NVSWITCH_NVLINK_DOWN = 88 # An NVLink on an NVSwitch is down
|
|
106
|
+
DCGM_FR_EUD_BINARY_PERMISSIONS = 89 # EUD binary permissions are incorrect
|
|
107
|
+
DCGM_FR_EUD_NON_ROOT_USER = 90 # EUD plugin is not running as root
|
|
108
|
+
DCGM_FR_EUD_SPAWN_FAILURE = 91 # EUD plugin failed to spawn the EUD binary
|
|
109
|
+
DCGM_FR_EUD_TIMEOUT = 92 # EUD plugin timed out
|
|
110
|
+
DCGM_FR_EUD_ZOMBIE = 93 # EUD process remains running after the plugin considers it finished
|
|
111
|
+
DCGM_FR_EUD_NON_ZERO_EXIT_CODE = 94 # EUD process exited with a non-zero exit code
|
|
112
|
+
DCGM_FR_EUD_TEST_FAILED = 95 # EUD test failed
|
|
113
|
+
DCGM_FR_FILE_CREATE_PERMISSIONS = 96 # We cannot write a file in this directory.
|
|
114
|
+
DCGM_FR_PAUSE_RESUME_FAILED = 97 # Pause/Resume failed
|
|
115
|
+
DCGM_FR_ERROR_SENTINEL = 98 # MUST BE THE LAST ERROR CODE
|
|
116
|
+
|
|
117
|
+
# Standard message for running a field diagnostic
|
|
118
|
+
TRIAGE_RUN_FIELD_DIAG_MSG = "Run a field diagnostic on the GPU."
|
|
119
|
+
DEBUG_COOLING_MSG = "Verify that the cooling on this machine is functional, including external, thermal "\
|
|
120
|
+
"material interface, fans, and any other components."
|
|
121
|
+
BUG_REPORT_MSG = "Please capture an nvidia-bug-report and send it to NVIDIA."
|
|
122
|
+
|
|
123
|
+
# Define DCGM error priorities
|
|
124
|
+
DCGM_ERROR_MONITOR = 0 # Can perform workload, but needs to be monitored.
|
|
125
|
+
DCGM_ERROR_ISOLATE = 1 # Cannot perform workload. GPU should be isolated.
|
|
126
|
+
DCGM_ERROR_UNKNOWN = 2 # This error code is not recognized
|
|
127
|
+
|
|
128
|
+
# Messages for the error codes. All messages must be defined in the ERROR_CODE_MSG <msg> format
|
|
129
|
+
# where <msg> is the actual message.
|
|
130
|
+
|
|
131
|
+
DCGM_FR_OK_MSG = "The operation completed successfully."
|
|
132
|
+
DCGM_FR_UNKNOWN_MSG = "Unknown error."
|
|
133
|
+
DCGM_FR_UNRECOGNIZED_MSG = "Unrecognized error code."
|
|
134
|
+
# replay limit, gpu id, replay errors detected
|
|
135
|
+
DCGM_FR_PCI_REPLAY_RATE_MSG = "Detected more than %u PCIe replays per minute for GPU %u : %d"
|
|
136
|
+
# dbes detected, gpu id
|
|
137
|
+
DCGM_FR_VOLATILE_DBE_DETECTED_MSG = "Detected %d volatile double-bit ECC error(s) in GPU %u."
|
|
138
|
+
# sbe limit, gpu id, sbes detected
|
|
139
|
+
DCGM_FR_VOLATILE_SBE_DETECTED_MSG = "More than %u single-bit ECC error(s) detected in GPU %u Volatile SBEs: %lld"
|
|
140
|
+
# gpu id
|
|
141
|
+
DCGM_FR_PENDING_PAGE_RETIREMENTS_MSG = "A pending retired page has been detected in GPU %u."
|
|
142
|
+
# retired pages detected, gpu id
|
|
143
|
+
DCGM_FR_RETIRED_PAGES_LIMIT_MSG = "%u or more retired pages have been detected in GPU %u. "
|
|
144
|
+
# retired pages due to dbes detected, gpu id
|
|
145
|
+
DCGM_FR_RETIRED_PAGES_DBE_LIMIT_MSG = "An excess of %u retired pages due to DBEs have been detected and" \
|
|
146
|
+
" more than one page has been retired due to DBEs in the past" \
|
|
147
|
+
" week in GPU %u."
|
|
148
|
+
# gpu id
|
|
149
|
+
DCGM_FR_CORRUPT_INFOROM_MSG = "A corrupt InfoROM has been detected in GPU %u."
|
|
150
|
+
# gpu id
|
|
151
|
+
DCGM_FR_CLOCK_THROTTLE_THERMAL_MSG = "Detected clock throttling due to thermal violation in GPU %u."
|
|
152
|
+
# gpu id
|
|
153
|
+
DCGM_FR_POWER_UNREADABLE_MSG = "Cannot reliably read the power usage for GPU %u."
|
|
154
|
+
# gpu id
|
|
155
|
+
DCGM_FR_CLOCK_THROTTLE_POWER_MSG = "Detected clock throttling due to power violation in GPU %u."
|
|
156
|
+
# nvlink errors detected, nvlink id, error threshold
|
|
157
|
+
DCGM_FR_NVLINK_ERROR_THRESHOLD_MSG = "Detected %ld NvLink errors on NvLink %u which exceeds threshold of %u"
|
|
158
|
+
# gpu id, nvlink id
|
|
159
|
+
DCGM_FR_NVLINK_DOWN_MSG = "GPU %u's NvLink link %d is currently down"
|
|
160
|
+
# nvswitch id, nvlink id
|
|
161
|
+
DCGM_FR_NVSWITCH_FATAL_ERROR_MSG = "Detected fatal errors on NvSwitch %u link %u"
|
|
162
|
+
# nvswitch id, nvlink id
|
|
163
|
+
DCGM_FR_NVSWITCH_NON_FATAL_ERROR_MSG = "Detected nonfatal errors on NvSwitch %u link %u"
|
|
164
|
+
# nvswitch id, nvlink port
|
|
165
|
+
DCGM_FR_NVSWITCH_DOWN_MSG = "NvSwitch physical ID %u's NvLink port %d is currently down."
|
|
166
|
+
# file path, error detail
|
|
167
|
+
DCGM_FR_NO_ACCESS_TO_FILE_MSG = "File %s could not be accessed directly: %s"
|
|
168
|
+
# purpose for communicating with NVML, NVML error as string
|
|
169
|
+
DCGM_FR_NVML_API_MSG = "Error calling NVML API %s: %s"
|
|
170
|
+
DCGM_FR_DEVICE_COUNT_MISMATCH_MSG = "The number of devices NVML returns is different than the number "\
|
|
171
|
+
"of devices in /dev."
|
|
172
|
+
# function name
|
|
173
|
+
DCGM_FR_BAD_PARAMETER_MSG = "Bad parameter to function %s cannot be processed"
|
|
174
|
+
# library name, error returned from dlopen
|
|
175
|
+
DCGM_FR_CANNOT_OPEN_LIB_MSG = "Cannot open library %s: '%s'"
|
|
176
|
+
# the name of the driver on the denylist
|
|
177
|
+
DCGM_FR_DENYLISTED_DRIVER_MSG = "Found driver on the denylist: %s"
|
|
178
|
+
# the name of the function that wasn't found
|
|
179
|
+
DCGM_FR_NVML_LIB_BAD_MSG = "Cannot get pointer to %s from libnvidia-ml.so"
|
|
180
|
+
DCGM_FR_GRAPHICS_PROCESSES_MSG = "NVVS has detected graphics processes running on at least one "\
|
|
181
|
+
"GPU. This may cause some tests to fail."
|
|
182
|
+
# error message from the API call
|
|
183
|
+
DCGM_FR_HOSTENGINE_CONN_MSG = "Could not connect to the host engine: '%s'"
|
|
184
|
+
# field name, gpu id
|
|
185
|
+
DCGM_FR_FIELD_QUERY_MSG = "Could not query field %s for GPU %u"
|
|
186
|
+
# environment variable name
|
|
187
|
+
DCGM_FR_BAD_CUDA_ENV_MSG = "Found CUDA performance-limiting environment variable '%s'."
|
|
188
|
+
# gpu id
|
|
189
|
+
DCGM_FR_PERSISTENCE_MODE_MSG = "Persistence mode for GPU %u is currently disabled. The DCGM "\
|
|
190
|
+
"diagnostic requires peristence mode to be enabled."
|
|
191
|
+
DCGM_FR_LOW_BANDWIDTH_MSG = "Bandwidth of GPU %u in direction %s of %.2f did not exceed "\
|
|
192
|
+
"minimum required bandwidth of %.2f."
|
|
193
|
+
DCGM_FR_HIGH_LATENCY_MSG = "Latency type %s of GPU %u value %.2f exceeded maximum allowed "\
|
|
194
|
+
"latency of %.2f."
|
|
195
|
+
DCGM_FR_CANNOT_GET_FIELD_TAG_MSG = "Unable to get field information for field id %hu"
|
|
196
|
+
DCGM_FR_FIELD_VIOLATION_MSG = "Detected %ld %s for GPU %u"
|
|
197
|
+
DCGM_FR_FIELD_THRESHOLD_MSG = "Detected %ld %s for GPU %u which is above the threshold %ld"
|
|
198
|
+
DCGM_FR_FIELD_VIOLATION_DBL_MSG = "Detected %.1f %s for GPU %u"
|
|
199
|
+
DCGM_FR_FIELD_THRESHOLD_DBL_MSG = "Detected %.1f %s for GPU %u which is above the threshold %.1f"
|
|
200
|
+
DCGM_FR_UNSUPPORTED_FIELD_TYPE_MSG = "Field %s is not supported by this API because it is neither an "\
|
|
201
|
+
"int64 nor a double type."
|
|
202
|
+
DCGM_FR_FIELD_THRESHOLD_TS_MSG = "%s met or exceeded the threshold of %lu per second: %lu at "\
|
|
203
|
+
"%.1f seconds into the test."
|
|
204
|
+
DCGM_FR_FIELD_THRESHOLD_TS_DBL_MSG = "%s met or exceeded the threshold of %.1f per second: %.1f at "\
|
|
205
|
+
"%.1f seconds into the test."
|
|
206
|
+
DCGM_FR_THERMAL_VIOLATIONS_MSG = "There were thermal violations totaling %lu seconds for GPU %u"
|
|
207
|
+
DCGM_FR_THERMAL_VIOLATIONS_TS_MSG = "Thermal violations totaling %lu samples started at %.1f seconds "\
|
|
208
|
+
"into the test for GPU %u"
|
|
209
|
+
DCGM_FR_TEMP_VIOLATION_MSG = "Temperature %lld of GPU %u exceeded user-specified maximum "\
|
|
210
|
+
"allowed temperature %lld"
|
|
211
|
+
DCGM_FR_THROTTLING_VIOLATION_MSG = "Clocks are being throttling for GPU %u because of clock "\
|
|
212
|
+
"throttling starting %.1f seconds into the test. %s"
|
|
213
|
+
DCGM_FR_INTERNAL_MSG = "There was an internal error during the test: '%s'"
|
|
214
|
+
DCGM_FR_PCIE_GENERATION_MSG = "GPU %u is running at PCI link generation %d, which is below "\
|
|
215
|
+
"the minimum allowed link generation of %d (parameter '%s')"
|
|
216
|
+
# gpu id, current PCIe link width, minimum allowed link width, name of the controlling parameter
# NOTE: the threshold is a link *width*; the text previously said "link generation",
# copied from DCGM_FR_PCIE_GENERATION_MSG.
DCGM_FR_PCIE_WIDTH_MSG = "GPU %u is running at PCI link width %dX, which is below the "\
                         "minimum allowed link width of %d (parameter '%s')"
|
|
218
|
+
DCGM_FR_ABORTED_MSG = "Test was aborted early due to user signal"
|
|
219
|
+
DCGM_FR_TEST_DISABLED_MSG = "The %s test is skipped for this GPU."
|
|
220
|
+
DCGM_FR_CANNOT_GET_STAT_MSG = "Unable to generate / collect stat %s for GPU %u"
|
|
221
|
+
DCGM_FR_STRESS_LEVEL_MSG = "Max stress level of %.1f did not reach desired stress level of "\
|
|
222
|
+
"%.1f for GPU %u"
|
|
223
|
+
DCGM_FR_CUDA_API_MSG = "Error using CUDA API %s"
|
|
224
|
+
DCGM_FR_FAULTY_MEMORY_MSG = "Found %d faulty memory elements on GPU %u"
|
|
225
|
+
DCGM_FR_CANNOT_SET_WATCHES_MSG = "Unable to add field watches to DCGM: %s"
|
|
226
|
+
DCGM_FR_CUDA_UNBOUND_MSG = "Cuda GPU %d is no longer bound to a CUDA context...Aborting"
|
|
227
|
+
DCGM_FR_ECC_DISABLED_MSG = "Skipping test %s because ECC is not enabled on GPU %u"
|
|
228
|
+
DCGM_FR_MEMORY_ALLOC_MSG = "Couldn't allocate at least %.1f%% of GPU memory on GPU %u"
|
|
229
|
+
DCGM_FR_CUDA_DBE_MSG = "CUDA APIs have indicated that a double-bit ECC error has "\
|
|
230
|
+
"occured on GPU %u."
|
|
231
|
+
DCGM_FR_MEMORY_MISMATCH_MSG = "A memory mismatch was detected on GPU %u, but no error was "\
|
|
232
|
+
"reported by CUDA or NVML."
|
|
233
|
+
DCGM_FR_CUDA_DEVICE_MSG = "Unable to find a corresponding CUDA device for GPU %u: '%s'"
|
|
234
|
+
DCGM_FR_ECC_UNSUPPORTED_MSG = "This card does not support ECC Memory. Skipping test."
|
|
235
|
+
DCGM_FR_ECC_PENDING_MSG = "ECC memory for GPU %u is in a pending state."
|
|
236
|
+
DCGM_FR_MEMORY_BANDWIDTH_MSG = "GPU %u only achieved a memory bandwidth of %.2f GB/s, failing "\
|
|
237
|
+
"to meet %.2f GB/s for test %d"
|
|
238
|
+
DCGM_FR_TARGET_POWER_MSG = "Max power of %.1f did not reach desired power minimum %s of "\
|
|
239
|
+
"%.1f for GPU %u"
|
|
240
|
+
DCGM_FR_API_FAIL_MSG = "API call %s failed: '%s'"
|
|
241
|
+
DCGM_FR_API_FAIL_GPU_MSG = "API call %s failed for GPU %u: '%s'"
|
|
242
|
+
DCGM_FR_CUDA_CONTEXT_MSG = "GPU %u failed to create a CUDA context: %s"
|
|
243
|
+
DCGM_FR_DCGM_API_MSG = "Error using DCGM API %s"
|
|
244
|
+
# Message templates for DCGM failure codes. The placeholders are C
# printf-style conversions (%u, %s, %.1f, %ld, %zu) embedded in the literals.
DCGM_FR_CONCURRENT_GPUS_MSG = (
    "Unable to run concurrent pair bandwidth test without 2 or more "
    "gpus. Skipping"
)
DCGM_FR_TOO_MANY_ERRORS_MSG = (
    "This API can only return up to four errors per system. "
    "Additional errors were found for this system that couldn't be "
    "communicated."
)
# NOTE(review): "occuring" is misspelled; left untouched because this text
# presumably mirrors the native DCGM message - confirm before correcting.
DCGM_FR_NVLINK_CRC_ERROR_THRESHOLD_MSG = (
    "%.1f %s NvLink errors found occuring per second on GPU %u, "
    "exceeding the limit of 100 per second."
)
DCGM_FR_NVLINK_ERROR_CRITICAL_MSG = (
    "Detected %ld %s NvLink errors on GPU %u's NVLink (should be 0)"
)
DCGM_FR_ENFORCED_POWER_LIMIT_MSG = (
    "Enforced power limit on GPU %u set to %.1f, which is too low to "
    "attempt to achieve target power %.1f"
)
DCGM_FR_MEMORY_ALLOC_HOST_MSG = "Cannot allocate %zu bytes on the host"
DCGM_FR_GPU_OP_MODE_MSG = (
    "Skipping plugin due to a GPU being in GPU Operating Mode: LOW_DP."
)
DCGM_FR_NO_MEMORY_CLOCKS_MSG = (
    "No memory clocks <= %u MHZ were found in %u supported memory clocks."
)
DCGM_FR_NO_GRAPHICS_CLOCKS_MSG = (
    "No graphics clocks <= %u MHZ were found in %u supported graphics "
    "clocks for memory clock %u MHZ."
)
DCGM_FR_HAD_TO_RESTORE_STATE_MSG = (
    "Had to restore GPU state on NVML GPU(s): %s"
)
DCGM_FR_L1TAG_UNSUPPORTED_MSG = (
    "This card does not support the L1 cache test. Skipping test."
)
DCGM_FR_L1TAG_MISCOMPARE_MSG = "The L1 cache test failed with a miscompare."
DCGM_FR_ROW_REMAP_FAILURE_MSG = "Row remapping failed."
DCGM_FR_UNCONTAINED_ERROR_MSG = "GPU had an uncontained error (XID 95)"
DCGM_FR_EMPTY_GPU_LIST_MSG = "No valid GPUs passed to plugin"
DCGM_FR_DBE_PENDING_PAGE_RETIREMENTS_MSG = (
    "Pending page retirements together with a DBE were detected on "
    "GPU %u."
)
DCGM_FR_UNCORRECTABLE_ROW_REMAP_MSG = "GPU %u has uncorrectable row remappings"
DCGM_FR_PENDING_ROW_REMAP_MSG = "GPU %u has pending row remappings"
DCGM_FR_BROKEN_P2P_MEMORY_DEVICE_MSG = (
    "GPU %u was unsuccessfully written to in a peer-to-peer test: %s"
)
DCGM_FR_BROKEN_P2P_WRITER_DEVICE_MSG = (
    "GPU %u unsuccessfully wrote data in a peer-to-peer test: %s"
)
DCGM_FR_NVSWITCH_NVLINK_DOWN_MSG = "NVSwitch %u's NvLink %u is down."
DCGM_FR_FILE_CREATE_PERMISSIONS_MSG = (
    "The DCGM Diagnostic does not have permissions to create a file in "
    "directory '%s'"
)
|
|
271
|
+
|
|
272
|
+
# Recommended follow-up action for each error message; an empty string
# means no suggestion is provided for that error.
DCGM_FR_OK_NEXT = "N/A"
DCGM_FR_UNKNOWN_NEXT = ""
DCGM_FR_UNRECOGNIZED_NEXT = ""
DCGM_FR_PCI_REPLAY_RATE_NEXT = (
    "Reconnect PCIe card. Run system side PCIE diagnostic utilities "
    "to verify hops off the GPU board. If issue is on the board, run "
    "the field diagnostic."
)
DCGM_FR_VOLATILE_DBE_DETECTED_NEXT = (
    "Drain the GPU and reset it or reboot the node."
)
DCGM_FR_VOLATILE_SBE_DETECTED_NEXT = (
    "Monitor - this GPU can still perform workload."
)
DCGM_FR_PENDING_PAGE_RETIREMENTS_NEXT = (
    "If volatile double bit errors exist, drain the GPU and reset it "
    "or reboot the node. Otherwise, monitor - GPU can still perform "
    "workload."
)
DCGM_FR_RETIRED_PAGES_LIMIT_NEXT = TRIAGE_RUN_FIELD_DIAG_MSG
DCGM_FR_RETIRED_PAGES_DBE_LIMIT_NEXT = TRIAGE_RUN_FIELD_DIAG_MSG
DCGM_FR_CORRUPT_INFOROM_NEXT = "Flash the InfoROM to clear this corruption."
DCGM_FR_CLOCK_THROTTLE_THERMAL_NEXT = DEBUG_COOLING_MSG
DCGM_FR_POWER_UNREADABLE_NEXT = ""
DCGM_FR_CLOCK_THROTTLE_POWER_NEXT = (
    "Monitor the power conditions. This GPU can still perform workload."
)
DCGM_FR_NVLINK_ERROR_THRESHOLD_NEXT = TRIAGE_RUN_FIELD_DIAG_MSG
DCGM_FR_NVLINK_DOWN_NEXT = TRIAGE_RUN_FIELD_DIAG_MSG
DCGM_FR_NVSWITCH_FATAL_ERROR_NEXT = TRIAGE_RUN_FIELD_DIAG_MSG
DCGM_FR_NVSWITCH_NON_FATAL_ERROR_NEXT = (
    "Monitor the NVSwitch. It can still perform workload."
)
DCGM_FR_NVSWITCH_DOWN_NEXT = ""
DCGM_FR_NO_ACCESS_TO_FILE_NEXT = (
    "Check relevant permissions, access, and existence of the file."
)
DCGM_FR_NVML_API_NEXT = (
    "Check the error condition and ensure that appropriate libraries "
    "are present and accessible."
)
# NOTE(review): the wording "and or" reads oddly; the runtime text is left
# exactly as found.
DCGM_FR_DEVICE_COUNT_MISMATCH_NEXT = (
    "Check for the presence of cgroups, operating system blocks, and "
    "or unsupported / older cards"
)
DCGM_FR_BAD_PARAMETER_NEXT = ""
DCGM_FR_CANNOT_OPEN_LIB_NEXT = (
    "Check for the existence of the library and set LD_LIBRARY_PATH "
    "if needed."
)
DCGM_FR_DENYLISTED_DRIVER_NEXT = "Please load the appropriate driver."
DCGM_FR_NVML_LIB_BAD_NEXT = (
    "Make sure that the required version of libnvidia-ml.so "
    "is present and accessible on the system."
)
DCGM_FR_GRAPHICS_PROCESSES_NEXT = (
    "Stop the graphics processes or run this diagnostic on a server "
    "that is not being used for display purposes."
)
DCGM_FR_HOSTENGINE_CONN_NEXT = (
    "If hostengine is run separately, please ensure that it is up "
    "and responsive."
)
DCGM_FR_FIELD_QUERY_NEXT = ""
DCGM_FR_BAD_CUDA_ENV_NEXT = (
    "Please unset this environment variable to address test failures."
)
DCGM_FR_PERSISTENCE_MODE_NEXT = (
    'Enable persistence mode by running "nvidia-smi -i <gpuId> -pm '
    '1 " as root.'
)
DCGM_FR_LOW_BANDWIDTH_NEXT = (
    "Verify that your minimum bandwidth setting is appropriate for "
    "all topological consequences."
)
DCGM_FR_HIGH_LATENCY_NEXT = ""
DCGM_FR_CANNOT_GET_FIELD_TAG_NEXT = ""
DCGM_FR_FIELD_VIOLATION_NEXT = ""
DCGM_FR_FIELD_THRESHOLD_NEXT = ""
DCGM_FR_FIELD_VIOLATION_DBL_NEXT = ""
DCGM_FR_FIELD_THRESHOLD_DBL_NEXT = ""
DCGM_FR_UNSUPPORTED_FIELD_TYPE_NEXT = ""
DCGM_FR_FIELD_THRESHOLD_TS_NEXT = ""
DCGM_FR_FIELD_THRESHOLD_TS_DBL_NEXT = ""
DCGM_FR_THERMAL_VIOLATIONS_NEXT = DEBUG_COOLING_MSG
DCGM_FR_THERMAL_VIOLATIONS_TS_NEXT = DEBUG_COOLING_MSG
# The cooling advice is substituted into the trailing %s when this module
# is imported.
DCGM_FR_TEMP_VIOLATION_NEXT = (
    "Verify that the user-specified temperature maximum is set "
    "correctly. If it is, %s"
) % DEBUG_COOLING_MSG
DCGM_FR_THROTTLING_VIOLATION_NEXT = ""
DCGM_FR_INTERNAL_NEXT = ""
DCGM_FR_PCIE_GENERATION_NEXT = ""
DCGM_FR_PCIE_WIDTH_NEXT = ""
DCGM_FR_ABORTED_NEXT = ""
DCGM_FR_TEST_DISABLED_NEXT = ""
DCGM_FR_CANNOT_GET_STAT_NEXT = (
    "If running a standalone nv-hostengine, verify that it is up "
    "and responsive."
)
DCGM_FR_STRESS_LEVEL_NEXT = ""
DCGM_FR_CUDA_API_NEXT = ""
DCGM_FR_FAULTY_MEMORY_NEXT = TRIAGE_RUN_FIELD_DIAG_MSG
DCGM_FR_CANNOT_SET_WATCHES_NEXT = ""
DCGM_FR_CUDA_UNBOUND_NEXT = ""
DCGM_FR_ECC_DISABLED_NEXT = (
    'Enable ECC memory by running "nvidia-smi -i <gpuId> -e 1" '
    "to enable. This may require a GPU reset or reboot to take effect."
)
DCGM_FR_MEMORY_ALLOC_NEXT = ""
DCGM_FR_CUDA_DBE_NEXT = TRIAGE_RUN_FIELD_DIAG_MSG
DCGM_FR_MEMORY_MISMATCH_NEXT = TRIAGE_RUN_FIELD_DIAG_MSG
DCGM_FR_CUDA_DEVICE_NEXT = ""
DCGM_FR_ECC_UNSUPPORTED_NEXT = ""
DCGM_FR_ECC_PENDING_NEXT = "Please reboot to activate it."
DCGM_FR_MEMORY_BANDWIDTH_NEXT = ""
DCGM_FR_TARGET_POWER_NEXT = ""
DCGM_FR_API_FAIL_NEXT = ""
DCGM_FR_API_FAIL_GPU_NEXT = ""
DCGM_FR_CUDA_CONTEXT_NEXT = (
    "Please make sure the correct driver version is installed and "
    "verify that no conflicting libraries are present."
)
DCGM_FR_DCGM_API_NEXT = ""
DCGM_FR_CONCURRENT_GPUS_NEXT = ""
DCGM_FR_TOO_MANY_ERRORS_NEXT = ""
DCGM_FR_NVLINK_CRC_ERROR_THRESHOLD_NEXT = TRIAGE_RUN_FIELD_DIAG_MSG
DCGM_FR_NVLINK_ERROR_CRITICAL_NEXT = TRIAGE_RUN_FIELD_DIAG_MSG
DCGM_FR_ENFORCED_POWER_LIMIT_NEXT = (
    "If this enforced power limit is necessary, then this test "
    "cannot be run. If it is unnecessary, then raise the enforced "
    "power limit setting to be able to run this test."
)
DCGM_FR_MEMORY_ALLOC_HOST_NEXT = (
    "Manually kill processes or restart your machine."
)
DCGM_FR_GPU_OP_MODE_NEXT = (
    "Fix by running nvidia-smi as root with: nvidia-smi --gom=0 -i "
    "<gpu index>"
)
DCGM_FR_NO_MEMORY_CLOCKS_NEXT = ""
DCGM_FR_NO_GRAPHICS_CLOCKS_NEXT = ""
DCGM_FR_HAD_TO_RESTORE_STATE_NEXT = ""
DCGM_FR_L1TAG_UNSUPPORTED_NEXT = ""
DCGM_FR_L1TAG_MISCOMPARE_NEXT = TRIAGE_RUN_FIELD_DIAG_MSG
# These two reuse the volatile-DBE suggestion defined earlier in this table.
DCGM_FR_ROW_REMAP_FAILURE_NEXT = DCGM_FR_VOLATILE_DBE_DETECTED_NEXT
DCGM_FR_UNCONTAINED_ERROR_NEXT = DCGM_FR_VOLATILE_DBE_DETECTED_NEXT
DCGM_FR_EMPTY_GPU_LIST_NEXT = ""
DCGM_FR_DBE_PENDING_PAGE_RETIREMENTS_NEXT = (
    "Drain the GPU and reset it or reboot the node to resolve this issue."
)
DCGM_FR_UNCORRECTABLE_ROW_REMAP_NEXT = ""
DCGM_FR_PENDING_ROW_REMAP_NEXT = ""
DCGM_FR_BROKEN_P2P_MEMORY_DEVICE_NEXT = BUG_REPORT_MSG
DCGM_FR_BROKEN_P2P_WRITER_DEVICE_NEXT = BUG_REPORT_MSG
DCGM_FR_NVSWITCH_NVLINK_DOWN_NEXT = (
    "Please check fabric manager and initialization logs to figure out "
    "why the link is down. You may also need to run a field diagnostic."
)
DCGM_FR_FILE_CREATE_PERMISSIONS_NEXT = (
    "Please restart the hostengine with parameter --home-dir to specify "
    "a different home directory for the diagnostic or change permissions "
    "in the current directory to allow the user to write files there."
)
|
|
383
|
+
|
|
384
|
+
|
|
385
|
+
def dcgmErrorGetPriorityByCode(code):
    """Call the native ``dcgmErrorGetPriorityByCode`` entry point.

    The function pointer is resolved through
    ``dcgm_structs._dcgmGetFunctionPointer`` and invoked with ``code``;
    whatever the native call yields is returned unchanged.
    """
    native_call = dcgm_structs._dcgmGetFunctionPointer(
        "dcgmErrorGetPriorityByCode")
    return native_call(code)
|
|
389
|
+
|
|
390
|
+
|
|
391
|
+
def dcgmErrorGetFormatMsgByCode(code):
    """Call the native ``dcgmErrorGetFormatMsgByCode`` entry point.

    The function pointer is resolved through
    ``dcgm_structs._dcgmGetFunctionPointer`` and its return type is declared
    as ``ctypes.c_char_p`` before it is invoked with ``code``.

    Returns the result decoded as UTF-8 when the call produces ``bytes``;
    any other result is handed back untouched.
    """
    native_call = dcgm_structs._dcgmGetFunctionPointer(
        "dcgmErrorGetFormatMsgByCode")
    native_call.restype = ctypes.c_char_p
    raw_msg = native_call(code)
    # Only bytes can be decoded; pass anything else straight through.
    if not isinstance(raw_msg, bytes):
        return raw_msg
    return raw_msg.decode('utf-8')
|