triton-model-analyzer 1.48.0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (204) hide show
  1. model_analyzer/__init__.py +15 -0
  2. model_analyzer/analyzer.py +448 -0
  3. model_analyzer/cli/__init__.py +15 -0
  4. model_analyzer/cli/cli.py +193 -0
  5. model_analyzer/config/__init__.py +15 -0
  6. model_analyzer/config/generate/__init__.py +15 -0
  7. model_analyzer/config/generate/automatic_model_config_generator.py +164 -0
  8. model_analyzer/config/generate/base_model_config_generator.py +352 -0
  9. model_analyzer/config/generate/brute_plus_binary_parameter_search_run_config_generator.py +164 -0
  10. model_analyzer/config/generate/brute_run_config_generator.py +154 -0
  11. model_analyzer/config/generate/concurrency_sweeper.py +75 -0
  12. model_analyzer/config/generate/config_generator_interface.py +52 -0
  13. model_analyzer/config/generate/coordinate.py +143 -0
  14. model_analyzer/config/generate/coordinate_data.py +86 -0
  15. model_analyzer/config/generate/generator_utils.py +116 -0
  16. model_analyzer/config/generate/manual_model_config_generator.py +187 -0
  17. model_analyzer/config/generate/model_config_generator_factory.py +92 -0
  18. model_analyzer/config/generate/model_profile_spec.py +74 -0
  19. model_analyzer/config/generate/model_run_config_generator.py +154 -0
  20. model_analyzer/config/generate/model_variant_name_manager.py +150 -0
  21. model_analyzer/config/generate/neighborhood.py +536 -0
  22. model_analyzer/config/generate/optuna_plus_concurrency_sweep_run_config_generator.py +141 -0
  23. model_analyzer/config/generate/optuna_run_config_generator.py +838 -0
  24. model_analyzer/config/generate/perf_analyzer_config_generator.py +312 -0
  25. model_analyzer/config/generate/quick_plus_concurrency_sweep_run_config_generator.py +130 -0
  26. model_analyzer/config/generate/quick_run_config_generator.py +753 -0
  27. model_analyzer/config/generate/run_config_generator_factory.py +329 -0
  28. model_analyzer/config/generate/search_config.py +112 -0
  29. model_analyzer/config/generate/search_dimension.py +73 -0
  30. model_analyzer/config/generate/search_dimensions.py +85 -0
  31. model_analyzer/config/generate/search_parameter.py +49 -0
  32. model_analyzer/config/generate/search_parameters.py +388 -0
  33. model_analyzer/config/input/__init__.py +15 -0
  34. model_analyzer/config/input/config_command.py +483 -0
  35. model_analyzer/config/input/config_command_profile.py +1747 -0
  36. model_analyzer/config/input/config_command_report.py +267 -0
  37. model_analyzer/config/input/config_defaults.py +236 -0
  38. model_analyzer/config/input/config_enum.py +83 -0
  39. model_analyzer/config/input/config_field.py +216 -0
  40. model_analyzer/config/input/config_list_generic.py +112 -0
  41. model_analyzer/config/input/config_list_numeric.py +151 -0
  42. model_analyzer/config/input/config_list_string.py +111 -0
  43. model_analyzer/config/input/config_none.py +71 -0
  44. model_analyzer/config/input/config_object.py +129 -0
  45. model_analyzer/config/input/config_primitive.py +81 -0
  46. model_analyzer/config/input/config_status.py +75 -0
  47. model_analyzer/config/input/config_sweep.py +83 -0
  48. model_analyzer/config/input/config_union.py +113 -0
  49. model_analyzer/config/input/config_utils.py +128 -0
  50. model_analyzer/config/input/config_value.py +243 -0
  51. model_analyzer/config/input/objects/__init__.py +15 -0
  52. model_analyzer/config/input/objects/config_model_profile_spec.py +325 -0
  53. model_analyzer/config/input/objects/config_model_report_spec.py +173 -0
  54. model_analyzer/config/input/objects/config_plot.py +198 -0
  55. model_analyzer/config/input/objects/config_protobuf_utils.py +101 -0
  56. model_analyzer/config/input/yaml_config_validator.py +82 -0
  57. model_analyzer/config/run/__init__.py +15 -0
  58. model_analyzer/config/run/model_run_config.py +313 -0
  59. model_analyzer/config/run/run_config.py +168 -0
  60. model_analyzer/constants.py +76 -0
  61. model_analyzer/device/__init__.py +15 -0
  62. model_analyzer/device/device.py +24 -0
  63. model_analyzer/device/gpu_device.py +87 -0
  64. model_analyzer/device/gpu_device_factory.py +248 -0
  65. model_analyzer/entrypoint.py +307 -0
  66. model_analyzer/log_formatter.py +65 -0
  67. model_analyzer/model_analyzer_exceptions.py +24 -0
  68. model_analyzer/model_manager.py +255 -0
  69. model_analyzer/monitor/__init__.py +15 -0
  70. model_analyzer/monitor/cpu_monitor.py +69 -0
  71. model_analyzer/monitor/dcgm/DcgmDiag.py +191 -0
  72. model_analyzer/monitor/dcgm/DcgmFieldGroup.py +83 -0
  73. model_analyzer/monitor/dcgm/DcgmGroup.py +815 -0
  74. model_analyzer/monitor/dcgm/DcgmHandle.py +141 -0
  75. model_analyzer/monitor/dcgm/DcgmJsonReader.py +69 -0
  76. model_analyzer/monitor/dcgm/DcgmReader.py +623 -0
  77. model_analyzer/monitor/dcgm/DcgmStatus.py +57 -0
  78. model_analyzer/monitor/dcgm/DcgmSystem.py +412 -0
  79. model_analyzer/monitor/dcgm/__init__.py +15 -0
  80. model_analyzer/monitor/dcgm/common/__init__.py +13 -0
  81. model_analyzer/monitor/dcgm/common/dcgm_client_cli_parser.py +194 -0
  82. model_analyzer/monitor/dcgm/common/dcgm_client_main.py +86 -0
  83. model_analyzer/monitor/dcgm/dcgm_agent.py +887 -0
  84. model_analyzer/monitor/dcgm/dcgm_collectd_plugin.py +369 -0
  85. model_analyzer/monitor/dcgm/dcgm_errors.py +395 -0
  86. model_analyzer/monitor/dcgm/dcgm_field_helpers.py +546 -0
  87. model_analyzer/monitor/dcgm/dcgm_fields.py +815 -0
  88. model_analyzer/monitor/dcgm/dcgm_fields_collectd.py +671 -0
  89. model_analyzer/monitor/dcgm/dcgm_fields_internal.py +29 -0
  90. model_analyzer/monitor/dcgm/dcgm_fluentd.py +45 -0
  91. model_analyzer/monitor/dcgm/dcgm_monitor.py +138 -0
  92. model_analyzer/monitor/dcgm/dcgm_prometheus.py +326 -0
  93. model_analyzer/monitor/dcgm/dcgm_structs.py +2357 -0
  94. model_analyzer/monitor/dcgm/dcgm_telegraf.py +65 -0
  95. model_analyzer/monitor/dcgm/dcgm_value.py +151 -0
  96. model_analyzer/monitor/dcgm/dcgmvalue.py +155 -0
  97. model_analyzer/monitor/dcgm/denylist_recommendations.py +573 -0
  98. model_analyzer/monitor/dcgm/pydcgm.py +47 -0
  99. model_analyzer/monitor/monitor.py +143 -0
  100. model_analyzer/monitor/remote_monitor.py +137 -0
  101. model_analyzer/output/__init__.py +15 -0
  102. model_analyzer/output/file_writer.py +63 -0
  103. model_analyzer/output/output_writer.py +42 -0
  104. model_analyzer/perf_analyzer/__init__.py +15 -0
  105. model_analyzer/perf_analyzer/genai_perf_config.py +206 -0
  106. model_analyzer/perf_analyzer/perf_analyzer.py +882 -0
  107. model_analyzer/perf_analyzer/perf_config.py +479 -0
  108. model_analyzer/plots/__init__.py +15 -0
  109. model_analyzer/plots/detailed_plot.py +266 -0
  110. model_analyzer/plots/plot_manager.py +224 -0
  111. model_analyzer/plots/simple_plot.py +213 -0
  112. model_analyzer/record/__init__.py +15 -0
  113. model_analyzer/record/gpu_record.py +68 -0
  114. model_analyzer/record/metrics_manager.py +887 -0
  115. model_analyzer/record/record.py +280 -0
  116. model_analyzer/record/record_aggregator.py +256 -0
  117. model_analyzer/record/types/__init__.py +15 -0
  118. model_analyzer/record/types/cpu_available_ram.py +93 -0
  119. model_analyzer/record/types/cpu_used_ram.py +93 -0
  120. model_analyzer/record/types/gpu_free_memory.py +96 -0
  121. model_analyzer/record/types/gpu_power_usage.py +107 -0
  122. model_analyzer/record/types/gpu_total_memory.py +96 -0
  123. model_analyzer/record/types/gpu_used_memory.py +96 -0
  124. model_analyzer/record/types/gpu_utilization.py +108 -0
  125. model_analyzer/record/types/inter_token_latency_avg.py +60 -0
  126. model_analyzer/record/types/inter_token_latency_base.py +74 -0
  127. model_analyzer/record/types/inter_token_latency_max.py +60 -0
  128. model_analyzer/record/types/inter_token_latency_min.py +60 -0
  129. model_analyzer/record/types/inter_token_latency_p25.py +60 -0
  130. model_analyzer/record/types/inter_token_latency_p50.py +60 -0
  131. model_analyzer/record/types/inter_token_latency_p75.py +60 -0
  132. model_analyzer/record/types/inter_token_latency_p90.py +60 -0
  133. model_analyzer/record/types/inter_token_latency_p95.py +60 -0
  134. model_analyzer/record/types/inter_token_latency_p99.py +60 -0
  135. model_analyzer/record/types/output_token_throughput.py +105 -0
  136. model_analyzer/record/types/perf_client_response_wait.py +97 -0
  137. model_analyzer/record/types/perf_client_send_recv.py +97 -0
  138. model_analyzer/record/types/perf_latency.py +111 -0
  139. model_analyzer/record/types/perf_latency_avg.py +60 -0
  140. model_analyzer/record/types/perf_latency_base.py +74 -0
  141. model_analyzer/record/types/perf_latency_p90.py +60 -0
  142. model_analyzer/record/types/perf_latency_p95.py +60 -0
  143. model_analyzer/record/types/perf_latency_p99.py +60 -0
  144. model_analyzer/record/types/perf_server_compute_infer.py +97 -0
  145. model_analyzer/record/types/perf_server_compute_input.py +97 -0
  146. model_analyzer/record/types/perf_server_compute_output.py +97 -0
  147. model_analyzer/record/types/perf_server_queue.py +97 -0
  148. model_analyzer/record/types/perf_throughput.py +105 -0
  149. model_analyzer/record/types/time_to_first_token_avg.py +60 -0
  150. model_analyzer/record/types/time_to_first_token_base.py +74 -0
  151. model_analyzer/record/types/time_to_first_token_max.py +60 -0
  152. model_analyzer/record/types/time_to_first_token_min.py +60 -0
  153. model_analyzer/record/types/time_to_first_token_p25.py +60 -0
  154. model_analyzer/record/types/time_to_first_token_p50.py +60 -0
  155. model_analyzer/record/types/time_to_first_token_p75.py +60 -0
  156. model_analyzer/record/types/time_to_first_token_p90.py +60 -0
  157. model_analyzer/record/types/time_to_first_token_p95.py +60 -0
  158. model_analyzer/record/types/time_to_first_token_p99.py +60 -0
  159. model_analyzer/reports/__init__.py +15 -0
  160. model_analyzer/reports/html_report.py +195 -0
  161. model_analyzer/reports/pdf_report.py +50 -0
  162. model_analyzer/reports/report.py +86 -0
  163. model_analyzer/reports/report_factory.py +62 -0
  164. model_analyzer/reports/report_manager.py +1376 -0
  165. model_analyzer/reports/report_utils.py +42 -0
  166. model_analyzer/result/__init__.py +15 -0
  167. model_analyzer/result/constraint_manager.py +150 -0
  168. model_analyzer/result/model_config_measurement.py +354 -0
  169. model_analyzer/result/model_constraints.py +105 -0
  170. model_analyzer/result/parameter_search.py +246 -0
  171. model_analyzer/result/result_manager.py +430 -0
  172. model_analyzer/result/result_statistics.py +159 -0
  173. model_analyzer/result/result_table.py +217 -0
  174. model_analyzer/result/result_table_manager.py +646 -0
  175. model_analyzer/result/result_utils.py +42 -0
  176. model_analyzer/result/results.py +277 -0
  177. model_analyzer/result/run_config_measurement.py +658 -0
  178. model_analyzer/result/run_config_result.py +210 -0
  179. model_analyzer/result/run_config_result_comparator.py +110 -0
  180. model_analyzer/result/sorted_results.py +151 -0
  181. model_analyzer/state/__init__.py +15 -0
  182. model_analyzer/state/analyzer_state.py +76 -0
  183. model_analyzer/state/analyzer_state_manager.py +215 -0
  184. model_analyzer/triton/__init__.py +15 -0
  185. model_analyzer/triton/client/__init__.py +15 -0
  186. model_analyzer/triton/client/client.py +234 -0
  187. model_analyzer/triton/client/client_factory.py +57 -0
  188. model_analyzer/triton/client/grpc_client.py +104 -0
  189. model_analyzer/triton/client/http_client.py +107 -0
  190. model_analyzer/triton/model/__init__.py +15 -0
  191. model_analyzer/triton/model/model_config.py +556 -0
  192. model_analyzer/triton/model/model_config_variant.py +29 -0
  193. model_analyzer/triton/server/__init__.py +15 -0
  194. model_analyzer/triton/server/server.py +76 -0
  195. model_analyzer/triton/server/server_config.py +269 -0
  196. model_analyzer/triton/server/server_docker.py +229 -0
  197. model_analyzer/triton/server/server_factory.py +306 -0
  198. model_analyzer/triton/server/server_local.py +158 -0
  199. triton_model_analyzer-1.48.0.dist-info/METADATA +52 -0
  200. triton_model_analyzer-1.48.0.dist-info/RECORD +204 -0
  201. triton_model_analyzer-1.48.0.dist-info/WHEEL +5 -0
  202. triton_model_analyzer-1.48.0.dist-info/entry_points.txt +2 -0
  203. triton_model_analyzer-1.48.0.dist-info/licenses/LICENSE +67 -0
  204. triton_model_analyzer-1.48.0.dist-info/top_level.txt +1 -0
@@ -0,0 +1,395 @@
1
+ # Copyright (c) 2022, NVIDIA CORPORATION & AFFILIATES. All rights reserved.
2
+ #
3
+ # Licensed under the Apache License, Version 2.0 (the "License");
4
+ # you may not use this file except in compliance with the License.
5
+ # You may obtain a copy of the License at
6
+ #
7
+ # http://www.apache.org/licenses/LICENSE-2.0
8
+ #
9
+ # Unless required by applicable law or agreed to in writing, software
10
+ # distributed under the License is distributed on an "AS IS" BASIS,
11
+ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12
+ # See the License for the specific language governing permissions and
13
+ # limitations under the License.
14
+ import ctypes
15
+ import model_analyzer.monitor.dcgm.dcgm_structs as dcgm_structs
16
+
17
+ DCGM_FR_OK = 0 # No error
18
+ DCGM_FR_UNKNOWN = 1 # Unknown error code
19
+ DCGM_FR_UNRECOGNIZED = 2 # Unrecognized error code
20
+ DCGM_FR_PCI_REPLAY_RATE = 3 # Unacceptable rate of PCI errors
21
+ DCGM_FR_VOLATILE_DBE_DETECTED = 4 # Uncorrectable volatile double bit error
22
+ DCGM_FR_VOLATILE_SBE_DETECTED = 5 # Unacceptable rate of volatile single bit errors
23
+ DCGM_FR_PENDING_PAGE_RETIREMENTS = 6 # Pending page retirements detected
24
+ DCGM_FR_RETIRED_PAGES_LIMIT = 7 # Unacceptable total page retirements detected
25
+ DCGM_FR_RETIRED_PAGES_DBE_LIMIT = 8 # Unacceptable total page retirements due to uncorrectable errors
26
+ DCGM_FR_CORRUPT_INFOROM = 9 # Corrupt inforom found
27
+ DCGM_FR_CLOCK_THROTTLE_THERMAL = 10 # Clocks being throttled due to overheating
28
+ DCGM_FR_POWER_UNREADABLE = 11 # Cannot get a reading for power from NVML
29
+ DCGM_FR_CLOCK_THROTTLE_POWER = 12 # Clock being throttled due to power restrictions
30
+ DCGM_FR_NVLINK_ERROR_THRESHOLD = 13 # Unacceptable rate of NVLink errors
31
+ DCGM_FR_NVLINK_DOWN = 14 # NVLink is down
32
+ DCGM_FR_NVSWITCH_FATAL_ERROR = 15 # Fatal errors on the NVSwitch
33
+ DCGM_FR_NVSWITCH_NON_FATAL_ERROR = 16 # Non-fatal errors on the NVSwitch
34
+ DCGM_FR_NVSWITCH_DOWN = 17 # NVSwitch is down
35
+ DCGM_FR_NO_ACCESS_TO_FILE = 18 # Cannot access a file
36
+ DCGM_FR_NVML_API = 19 # Error occurred on an NVML API
37
+ DCGM_FR_DEVICE_COUNT_MISMATCH = 20 # Disagreement in GPU count between /dev and NVML
38
+ DCGM_FR_BAD_PARAMETER = 21 # Bad parameter passed to API
39
+ DCGM_FR_CANNOT_OPEN_LIB = 22 # Cannot open a library that must be accessed
40
+ DCGM_FR_DENYLISTED_DRIVER = 23 # A driver on the denylist (nouveau) is active
41
+ DCGM_FR_NVML_LIB_BAD = 24 # The NVML library is missing expected functions
42
+ DCGM_FR_GRAPHICS_PROCESSES = 25 # Graphics processes are active on this GPU
43
+ DCGM_FR_HOSTENGINE_CONN = 26 # Unstable connection to nv-hostengine (daemonized DCGM)
44
+ DCGM_FR_FIELD_QUERY = 27 # Error querying a field from DCGM
45
+ DCGM_FR_BAD_CUDA_ENV = 28 # The environment has variables that hurt CUDA
46
+ DCGM_FR_PERSISTENCE_MODE = 29 # Persistence mode is disabled
47
+ DCGM_FR_LOW_BANDWIDTH = 30 # The bandwidth is unacceptably low
48
+ DCGM_FR_HIGH_LATENCY = 31 # Latency is too high
49
+ DCGM_FR_CANNOT_GET_FIELD_TAG = 32 # Cannot find a tag for a field
50
+ DCGM_FR_FIELD_VIOLATION = 33 # The value for the specified error field is above 0
51
+ DCGM_FR_FIELD_THRESHOLD = 34 # The value for the specified field is above the threshold
52
+ DCGM_FR_FIELD_VIOLATION_DBL = 35 # The value for the specified error field is above 0
53
+ DCGM_FR_FIELD_THRESHOLD_DBL = 36 # The value for the specified field is above the threshold
54
+ DCGM_FR_UNSUPPORTED_FIELD_TYPE = 37 # Field type cannot be supported
55
+ DCGM_FR_FIELD_THRESHOLD_TS = 38 # The value for the specified field is above the threshold
56
+ DCGM_FR_FIELD_THRESHOLD_TS_DBL = 39 # The value for the specified field is above the threshold
57
+ DCGM_FR_THERMAL_VIOLATIONS = 40 # Thermal violations detected
58
+ DCGM_FR_THERMAL_VIOLATIONS_TS = 41 # Thermal violations detected with a timestamp
59
+ DCGM_FR_TEMP_VIOLATION = 42 # Temperature is too high
60
+ DCGM_FR_THROTTLING_VIOLATION = 43 # Non-benign clock throttling is occurring
61
+ DCGM_FR_INTERNAL = 44 # An internal error was detected
62
+ DCGM_FR_PCIE_GENERATION = 45 # PCIe generation is too low
63
+ DCGM_FR_PCIE_WIDTH = 46 # PCIe width is too low
64
+ DCGM_FR_ABORTED = 47 # Test was aborted by a user signal
65
+ DCGM_FR_TEST_DISABLED = 48 # This test is disabled for this GPU
66
+ DCGM_FR_CANNOT_GET_STAT = 49 # Cannot get telemetry for a needed value
67
+ DCGM_FR_STRESS_LEVEL = 50 # Stress level is too low (bad performance)
68
+ DCGM_FR_CUDA_API = 51 # Error calling the specified CUDA API
69
+ DCGM_FR_FAULTY_MEMORY = 52 # Faulty memory detected on this GPU
70
+ DCGM_FR_CANNOT_SET_WATCHES = 53 # Unable to set field watches in DCGM
71
+ DCGM_FR_CUDA_UNBOUND = 54 # CUDA context is no longer bound
72
+ DCGM_FR_ECC_DISABLED = 55 # ECC memory is disabled right now
73
+ DCGM_FR_MEMORY_ALLOC = 56 # Cannot allocate memory
74
+ DCGM_FR_CUDA_DBE = 57 # CUDA detected unrecoverable double-bit error
75
+ DCGM_FR_MEMORY_MISMATCH = 58 # Memory error detected
76
+ DCGM_FR_CUDA_DEVICE = 59 # No CUDA device discoverable for existing GPU
77
+ DCGM_FR_ECC_UNSUPPORTED = 60 # ECC memory is unsupported by this SKU
78
+ DCGM_FR_ECC_PENDING = 61 # ECC memory is in a pending state
79
+ DCGM_FR_MEMORY_BANDWIDTH = 62 # Memory bandwidth is too low
80
+ DCGM_FR_TARGET_POWER = 63 # Cannot hit the target power draw
81
+ DCGM_FR_API_FAIL = 64 # The specified API call failed
82
+ DCGM_FR_API_FAIL_GPU = 65 # The specified API call failed for the specified GPU
83
+ DCGM_FR_CUDA_CONTEXT = 66 # Cannot create a CUDA context on this GPU
84
+ DCGM_FR_DCGM_API = 67 # DCGM API failure
85
+ DCGM_FR_CONCURRENT_GPUS = 68 # Need multiple GPUs to run this test
86
+ DCGM_FR_TOO_MANY_ERRORS = 69 # More errors than fit in the return struct
87
+ DCGM_FR_NVLINK_CRC_ERROR_THRESHOLD = 70 # More than 100 CRC errors are happening per second
88
+ DCGM_FR_NVLINK_ERROR_CRITICAL = 71 # NVLink error for a field that should always be 0
89
+ DCGM_FR_ENFORCED_POWER_LIMIT = 72 # The enforced power limit is too low to hit the target
90
+ DCGM_FR_MEMORY_ALLOC_HOST = 73 # Cannot allocate memory on the host
91
+ DCGM_FR_GPU_OP_MODE = 74 # Bad GPU operating mode for running plugin
92
+ DCGM_FR_NO_MEMORY_CLOCKS = 75 # No memory clocks with the needed MHz were found
93
+ DCGM_FR_NO_GRAPHICS_CLOCKS = 76 # No graphics clocks with the needed MHz were found
94
+ DCGM_FR_HAD_TO_RESTORE_STATE = 77 # Note that we had to restore a GPU's state
95
+ DCGM_FR_L1TAG_UNSUPPORTED = 78 # L1TAG test is unsupported by this SKU
96
+ DCGM_FR_L1TAG_MISCOMPARE = 79 # L1TAG test failed on a miscompare
97
+ DCGM_FR_ROW_REMAP_FAILURE = 80 # Row remapping failed (Ampere or newer GPUs)
98
+ DCGM_FR_UNCONTAINED_ERROR = 81 # Uncontained error - XID 95
99
+ DCGM_FR_EMPTY_GPU_LIST = 82 # No GPU information given to plugin
100
+ DCGM_FR_DBE_PENDING_PAGE_RETIREMENTS = 83 # Pending page retirements due to a DBE
101
+ DCGM_FR_UNCORRECTABLE_ROW_REMAP = 84 # Uncorrectable row remapping
102
+ DCGM_FR_PENDING_ROW_REMAP = 85 # Row remapping is pending
103
+ DCGM_FR_BROKEN_P2P_MEMORY_DEVICE = 86 # P2P copy test detected an error writing to this GPU
104
+ DCGM_FR_BROKEN_P2P_WRITER_DEVICE = 87 # P2P copy test detected an error writing from this GPU
105
+ DCGM_FR_NVSWITCH_NVLINK_DOWN = 88 # An NVLink is down
106
+ DCGM_FR_EUD_BINARY_PERMISSIONS = 89 # EUD binary permissions are incorrect
107
+ DCGM_FR_EUD_NON_ROOT_USER = 90 # EUD plugin is not running as root
108
+ DCGM_FR_EUD_SPAWN_FAILURE = 91 # EUD plugin failed to spawn the EUD binary
109
+ DCGM_FR_EUD_TIMEOUT = 92 # EUD plugin timed out
110
+ DCGM_FR_EUD_ZOMBIE = 93 # EUD process remains running after the plugin considers it finished
111
+ DCGM_FR_EUD_NON_ZERO_EXIT_CODE = 94 # EUD process exited with a non-zero exit code
112
+ DCGM_FR_EUD_TEST_FAILED = 95 # EUD test failed
113
+ DCGM_FR_FILE_CREATE_PERMISSIONS = 96 # We cannot write a file in this directory.
114
+ DCGM_FR_PAUSE_RESUME_FAILED = 97 # Pause/Resume failed
115
+ DCGM_FR_ERROR_SENTINEL = 98 # MUST BE THE LAST ERROR CODE
116
+
117
+ # Standard message for running a field diagnostic
118
+ TRIAGE_RUN_FIELD_DIAG_MSG = "Run a field diagnostic on the GPU."
119
+ DEBUG_COOLING_MSG = "Verify that the cooling on this machine is functional, including external, thermal "\
120
+ "material interface, fans, and any other components."
121
+ BUG_REPORT_MSG = "Please capture an nvidia-bug-report and send it to NVIDIA."
122
+
123
+ # Define DCGM error priorities
124
+ DCGM_ERROR_MONITOR = 0 # Can perform workload, but needs to be monitored.
125
+ DCGM_ERROR_ISOLATE = 1 # Cannot perform workload. GPU should be isolated.
126
+ DCGM_ERROR_UNKNOWN = 2 # This error code is not recognized
127
+
128
+ # Messages for the error codes. All messages must be defined in the ERROR_CODE_MSG <msg> format
129
+ # where <msg> is the actual message.
130
+
131
+ DCGM_FR_OK_MSG = "The operation completed successfully."
132
+ DCGM_FR_UNKNOWN_MSG = "Unknown error."
133
+ DCGM_FR_UNRECOGNIZED_MSG = "Unrecognized error code."
134
+ # replay limit, gpu id, replay errors detected
135
+ DCGM_FR_PCI_REPLAY_RATE_MSG = "Detected more than %u PCIe replays per minute for GPU %u : %d"
136
+ # dbes detected, gpu id
137
+ DCGM_FR_VOLATILE_DBE_DETECTED_MSG = "Detected %d volatile double-bit ECC error(s) in GPU %u."
138
+ # sbe limit, gpu id, sbes detected
139
+ DCGM_FR_VOLATILE_SBE_DETECTED_MSG = "More than %u single-bit ECC error(s) detected in GPU %u Volatile SBEs: %lld"
140
+ # gpu id
141
+ DCGM_FR_PENDING_PAGE_RETIREMENTS_MSG = "A pending retired page has been detected in GPU %u."
142
+ # retired pages detected, gpu id
143
+ DCGM_FR_RETIRED_PAGES_LIMIT_MSG = "%u or more retired pages have been detected in GPU %u. "
144
+ # retired pages due to dbes detected, gpu id
145
+ DCGM_FR_RETIRED_PAGES_DBE_LIMIT_MSG = "An excess of %u retired pages due to DBEs have been detected and" \
146
+ " more than one page has been retired due to DBEs in the past" \
147
+ " week in GPU %u."
148
+ # gpu id
149
+ DCGM_FR_CORRUPT_INFOROM_MSG = "A corrupt InfoROM has been detected in GPU %u."
150
+ # gpu id
151
+ DCGM_FR_CLOCK_THROTTLE_THERMAL_MSG = "Detected clock throttling due to thermal violation in GPU %u."
152
+ # gpu id
153
+ DCGM_FR_POWER_UNREADABLE_MSG = "Cannot reliably read the power usage for GPU %u."
154
+ # gpu id
155
+ DCGM_FR_CLOCK_THROTTLE_POWER_MSG = "Detected clock throttling due to power violation in GPU %u."
156
+ # nvlink errors detected, nvlink id, error threshold
157
+ DCGM_FR_NVLINK_ERROR_THRESHOLD_MSG = "Detected %ld NvLink errors on NvLink %u which exceeds threshold of %u"
158
+ # gpu id, nvlink id
159
+ DCGM_FR_NVLINK_DOWN_MSG = "GPU %u's NvLink link %d is currently down"
160
+ # nvswitch id, nvlink id
161
+ DCGM_FR_NVSWITCH_FATAL_ERROR_MSG = "Detected fatal errors on NvSwitch %u link %u"
162
+ # nvswitch id, nvlink id
163
+ DCGM_FR_NVSWITCH_NON_FATAL_ERROR_MSG = "Detected nonfatal errors on NvSwitch %u link %u"
164
+ # nvswitch id, nvlink port
165
+ DCGM_FR_NVSWITCH_DOWN_MSG = "NvSwitch physical ID %u's NvLink port %d is currently down."
166
+ # file path, error detail
167
+ DCGM_FR_NO_ACCESS_TO_FILE_MSG = "File %s could not be accessed directly: %s"
168
+ # purpose for communicating with NVML, NVML error as string (the format string has only these two placeholders, so a third "NVML error" argument is not consumed)
169
+ DCGM_FR_NVML_API_MSG = "Error calling NVML API %s: %s"
170
+ DCGM_FR_DEVICE_COUNT_MISMATCH_MSG = "The number of devices NVML returns is different than the number "\
171
+ "of devices in /dev."
172
+ # function name
173
+ DCGM_FR_BAD_PARAMETER_MSG = "Bad parameter to function %s cannot be processed"
174
+ # library name, error returned from dlopen
175
+ DCGM_FR_CANNOT_OPEN_LIB_MSG = "Cannot open library %s: '%s'"
176
+ # the name of the driver on the denylist
177
+ DCGM_FR_DENYLISTED_DRIVER_MSG = "Found driver on the denylist: %s"
178
+ # the name of the function that wasn't found
179
+ DCGM_FR_NVML_LIB_BAD_MSG = "Cannot get pointer to %s from libnvidia-ml.so"
180
+ DCGM_FR_GRAPHICS_PROCESSES_MSG = "NVVS has detected graphics processes running on at least one "\
181
+ "GPU. This may cause some tests to fail."
182
+ # error message from the API call
183
+ DCGM_FR_HOSTENGINE_CONN_MSG = "Could not connect to the host engine: '%s'"
184
+ # field name, gpu id
185
+ DCGM_FR_FIELD_QUERY_MSG = "Could not query field %s for GPU %u"
186
+ # environment variable name
187
+ DCGM_FR_BAD_CUDA_ENV_MSG = "Found CUDA performance-limiting environment variable '%s'."
188
+ # gpu id
189
+ DCGM_FR_PERSISTENCE_MODE_MSG = "Persistence mode for GPU %u is currently disabled. The DCGM "\
190
+ "diagnostic requires peristence mode to be enabled."
191
+ DCGM_FR_LOW_BANDWIDTH_MSG = "Bandwidth of GPU %u in direction %s of %.2f did not exceed "\
192
+ "minimum required bandwidth of %.2f."
193
+ DCGM_FR_HIGH_LATENCY_MSG = "Latency type %s of GPU %u value %.2f exceeded maximum allowed "\
194
+ "latency of %.2f."
195
+ DCGM_FR_CANNOT_GET_FIELD_TAG_MSG = "Unable to get field information for field id %hu"
196
+ DCGM_FR_FIELD_VIOLATION_MSG = "Detected %ld %s for GPU %u"
197
+ DCGM_FR_FIELD_THRESHOLD_MSG = "Detected %ld %s for GPU %u which is above the threshold %ld"
198
+ DCGM_FR_FIELD_VIOLATION_DBL_MSG = "Detected %.1f %s for GPU %u"
199
+ DCGM_FR_FIELD_THRESHOLD_DBL_MSG = "Detected %.1f %s for GPU %u which is above the threshold %.1f"
200
+ DCGM_FR_UNSUPPORTED_FIELD_TYPE_MSG = "Field %s is not supported by this API because it is neither an "\
201
+ "int64 nor a double type."
202
+ DCGM_FR_FIELD_THRESHOLD_TS_MSG = "%s met or exceeded the threshold of %lu per second: %lu at "\
203
+ "%.1f seconds into the test."
204
+ DCGM_FR_FIELD_THRESHOLD_TS_DBL_MSG = "%s met or exceeded the threshold of %.1f per second: %.1f at "\
205
+ "%.1f seconds into the test."
206
+ DCGM_FR_THERMAL_VIOLATIONS_MSG = "There were thermal violations totaling %lu seconds for GPU %u"
207
+ DCGM_FR_THERMAL_VIOLATIONS_TS_MSG = "Thermal violations totaling %lu samples started at %.1f seconds "\
208
+ "into the test for GPU %u"
209
+ DCGM_FR_TEMP_VIOLATION_MSG = "Temperature %lld of GPU %u exceeded user-specified maximum "\
210
+ "allowed temperature %lld"
211
+ DCGM_FR_THROTTLING_VIOLATION_MSG = "Clocks are being throttling for GPU %u because of clock "\
212
+ "throttling starting %.1f seconds into the test. %s"
213
+ DCGM_FR_INTERNAL_MSG = "There was an internal error during the test: '%s'"
214
+ DCGM_FR_PCIE_GENERATION_MSG = "GPU %u is running at PCI link generation %d, which is below "\
215
+ "the minimum allowed link generation of %d (parameter '%s')"
216
+ DCGM_FR_PCIE_WIDTH_MSG = "GPU %u is running at PCI link width %dX, which is below the "\
217
+ "minimum allowed link generation of %d (parameter '%s')"
218
+ DCGM_FR_ABORTED_MSG = "Test was aborted early due to user signal"
219
+ DCGM_FR_TEST_DISABLED_MSG = "The %s test is skipped for this GPU."
220
+ DCGM_FR_CANNOT_GET_STAT_MSG = "Unable to generate / collect stat %s for GPU %u"
221
+ DCGM_FR_STRESS_LEVEL_MSG = "Max stress level of %.1f did not reach desired stress level of "\
222
+ "%.1f for GPU %u"
223
+ DCGM_FR_CUDA_API_MSG = "Error using CUDA API %s"
224
+ DCGM_FR_FAULTY_MEMORY_MSG = "Found %d faulty memory elements on GPU %u"
225
+ DCGM_FR_CANNOT_SET_WATCHES_MSG = "Unable to add field watches to DCGM: %s"
226
+ DCGM_FR_CUDA_UNBOUND_MSG = "Cuda GPU %d is no longer bound to a CUDA context...Aborting"
227
+ DCGM_FR_ECC_DISABLED_MSG = "Skipping test %s because ECC is not enabled on GPU %u"
228
+ DCGM_FR_MEMORY_ALLOC_MSG = "Couldn't allocate at least %.1f%% of GPU memory on GPU %u"
229
+ DCGM_FR_CUDA_DBE_MSG = "CUDA APIs have indicated that a double-bit ECC error has "\
230
+ "occured on GPU %u."
231
+ DCGM_FR_MEMORY_MISMATCH_MSG = "A memory mismatch was detected on GPU %u, but no error was "\
232
+ "reported by CUDA or NVML."
233
+ DCGM_FR_CUDA_DEVICE_MSG = "Unable to find a corresponding CUDA device for GPU %u: '%s'"
234
+ DCGM_FR_ECC_UNSUPPORTED_MSG = "This card does not support ECC Memory. Skipping test."
235
+ DCGM_FR_ECC_PENDING_MSG = "ECC memory for GPU %u is in a pending state."
236
+ DCGM_FR_MEMORY_BANDWIDTH_MSG = "GPU %u only achieved a memory bandwidth of %.2f GB/s, failing "\
237
+ "to meet %.2f GB/s for test %d"
238
+ DCGM_FR_TARGET_POWER_MSG = "Max power of %.1f did not reach desired power minimum %s of "\
239
+ "%.1f for GPU %u"
240
+ DCGM_FR_API_FAIL_MSG = "API call %s failed: '%s'"
241
+ DCGM_FR_API_FAIL_GPU_MSG = "API call %s failed for GPU %u: '%s'"
242
+ DCGM_FR_CUDA_CONTEXT_MSG = "GPU %u failed to create a CUDA context: %s"
243
+ DCGM_FR_DCGM_API_MSG = "Error using DCGM API %s"
244
+ DCGM_FR_CONCURRENT_GPUS_MSG = "Unable to run concurrent pair bandwidth test without 2 or more "\
245
+ "gpus. Skipping"
246
+ DCGM_FR_TOO_MANY_ERRORS_MSG = "This API can only return up to four errors per system. "\
247
+ "Additional errors were found for this system that couldn't be "\
248
+ "communicated."
249
+ DCGM_FR_NVLINK_CRC_ERROR_THRESHOLD_MSG = "%.1f %s NvLink errors found occuring per second on GPU %u, "\
250
+ "exceeding the limit of 100 per second."
251
+ DCGM_FR_NVLINK_ERROR_CRITICAL_MSG = "Detected %ld %s NvLink errors on GPU %u's NVLink (should be 0)"
252
+ DCGM_FR_ENFORCED_POWER_LIMIT_MSG = "Enforced power limit on GPU %u set to %.1f, which is too low to "\
253
+ "attempt to achieve target power %.1f"
254
+ DCGM_FR_MEMORY_ALLOC_HOST_MSG = "Cannot allocate %zu bytes on the host"
255
+ DCGM_FR_GPU_OP_MODE_MSG = "Skipping plugin due to a GPU being in GPU Operating Mode: LOW_DP."
256
+ DCGM_FR_NO_MEMORY_CLOCKS_MSG = "No memory clocks <= %u MHZ were found in %u supported memory clocks."
257
+ DCGM_FR_NO_GRAPHICS_CLOCKS_MSG = "No graphics clocks <= %u MHZ were found in %u supported graphics clocks for memory clock %u MHZ."
258
+ DCGM_FR_HAD_TO_RESTORE_STATE_MSG = "Had to restore GPU state on NVML GPU(s): %s"
259
+ DCGM_FR_L1TAG_UNSUPPORTED_MSG = "This card does not support the L1 cache test. Skipping test."
260
+ DCGM_FR_L1TAG_MISCOMPARE_MSG = "The L1 cache test failed with a miscompare."
261
+ DCGM_FR_ROW_REMAP_FAILURE_MSG = "Row remapping failed."
262
+ DCGM_FR_UNCONTAINED_ERROR_MSG = "GPU had an uncontained error (XID 95)"
263
+ DCGM_FR_EMPTY_GPU_LIST_MSG = "No valid GPUs passed to plugin"
264
+ DCGM_FR_DBE_PENDING_PAGE_RETIREMENTS_MSG = "Pending page retirements together with a DBE were detected on GPU %u."
265
+ DCGM_FR_UNCORRECTABLE_ROW_REMAP_MSG = "GPU %u has uncorrectable row remappings"
266
+ DCGM_FR_PENDING_ROW_REMAP_MSG = "GPU %u has pending row remappings"
267
+ DCGM_FR_BROKEN_P2P_MEMORY_DEVICE_MSG = "GPU %u was unsuccessfully written to in a peer-to-peer test: %s"
268
+ DCGM_FR_BROKEN_P2P_WRITER_DEVICE_MSG = "GPU %u unsuccessfully wrote data in a peer-to-peer test: %s"
269
+ DCGM_FR_NVSWITCH_NVLINK_DOWN_MSG = "NVSwitch %u's NvLink %u is down."
270
+ DCGM_FR_FILE_CREATE_PERMISSIONS_MSG = "The DCGM Diagnostic does not have permissions to create a file in directory '%s'"
271
+
272
+ # Suggestions for next steps for the corresponding error message
# Each DCGM_FR_<NAME>_NEXT below is paired by name with DCGM_FR_<NAME>_MSG.
# An empty string presumably means that no suggestion text is provided for
# that error - confirm against the consumers of these constants.
# TRIAGE_RUN_FIELD_DIAG_MSG, DEBUG_COOLING_MSG and BUG_REPORT_MSG are shared
# texts assigned outside this section; several entries simply alias them.
273
+ DCGM_FR_OK_NEXT = "N/A"
274
+ DCGM_FR_UNKNOWN_NEXT = ""
275
+ DCGM_FR_UNRECOGNIZED_NEXT = ""
276
+ DCGM_FR_PCI_REPLAY_RATE_NEXT = "Reconnect PCIe card. Run system side PCIE diagnostic utilities "\
277
+ "to verify hops off the GPU board. If issue is on the board, run "\
278
+ "the field diagnostic."
279
+ DCGM_FR_VOLATILE_DBE_DETECTED_NEXT = "Drain the GPU and reset it or reboot the node."
280
+ DCGM_FR_VOLATILE_SBE_DETECTED_NEXT = "Monitor - this GPU can still perform workload."
281
+ DCGM_FR_PENDING_PAGE_RETIREMENTS_NEXT = "If volatile double bit errors exist, drain the GPU and reset it "\
282
+ "or reboot the node. Otherwise, monitor - GPU can still perform "\
283
+ "workload."
284
+ DCGM_FR_RETIRED_PAGES_LIMIT_NEXT = TRIAGE_RUN_FIELD_DIAG_MSG
285
+ DCGM_FR_RETIRED_PAGES_DBE_LIMIT_NEXT = TRIAGE_RUN_FIELD_DIAG_MSG
286
+ DCGM_FR_CORRUPT_INFOROM_NEXT = "Flash the InfoROM to clear this corruption."
287
+ DCGM_FR_CLOCK_THROTTLE_THERMAL_NEXT = DEBUG_COOLING_MSG
288
+ DCGM_FR_POWER_UNREADABLE_NEXT = ""
289
+ DCGM_FR_CLOCK_THROTTLE_POWER_NEXT = "Monitor the power conditions. This GPU can still perform workload."
290
+ DCGM_FR_NVLINK_ERROR_THRESHOLD_NEXT = TRIAGE_RUN_FIELD_DIAG_MSG
291
+ DCGM_FR_NVLINK_DOWN_NEXT = TRIAGE_RUN_FIELD_DIAG_MSG
292
+ DCGM_FR_NVSWITCH_FATAL_ERROR_NEXT = TRIAGE_RUN_FIELD_DIAG_MSG
293
+ DCGM_FR_NVSWITCH_NON_FATAL_ERROR_NEXT = "Monitor the NVSwitch. It can still perform workload."
294
+ DCGM_FR_NVSWITCH_DOWN_NEXT = ""
295
+ DCGM_FR_NO_ACCESS_TO_FILE_NEXT = "Check relevant permissions, access, and existence of the file."
296
+ DCGM_FR_NVML_API_NEXT = "Check the error condition and ensure that appropriate libraries "\
297
+ "are present and accessible."
298
# NOTE(review): once the two literals are joined the text reads
# "... operating system blocks, and or unsupported / older cards"; it is
# runtime text, so it is left untouched here - confirm the intended wording.
+ DCGM_FR_DEVICE_COUNT_MISMATCH_NEXT = "Check for the presence of cgroups, operating system blocks, and "\
299
+ "or unsupported / older cards"
300
+ DCGM_FR_BAD_PARAMETER_NEXT = ""
301
+ DCGM_FR_CANNOT_OPEN_LIB_NEXT = "Check for the existence of the library and set LD_LIBRARY_PATH "\
302
+ "if needed."
303
+ DCGM_FR_DENYLISTED_DRIVER_NEXT = "Please load the appropriate driver."
304
+ DCGM_FR_NVML_LIB_BAD_NEXT = "Make sure that the required version of libnvidia-ml.so "\
305
+ "is present and accessible on the system."
306
+ DCGM_FR_GRAPHICS_PROCESSES_NEXT = "Stop the graphics processes or run this diagnostic on a server "\
307
+ "that is not being used for display purposes."
308
+ DCGM_FR_HOSTENGINE_CONN_NEXT = "If hostengine is run separately, please ensure that it is up "\
309
+ "and responsive."
310
+ DCGM_FR_FIELD_QUERY_NEXT = ""
311
+ DCGM_FR_BAD_CUDA_ENV_NEXT = "Please unset this environment variable to address test failures."
312
+ DCGM_FR_PERSISTENCE_MODE_NEXT = "Enable persistence mode by running \"nvidia-smi -i <gpuId> -pm "\
313
+ "1 \" as root."
314
+ DCGM_FR_LOW_BANDWIDTH_NEXT = "Verify that your minimum bandwidth setting is appropriate for "\
315
+ "all topological consequences."
316
+ DCGM_FR_HIGH_LATENCY_NEXT = ""
317
+ DCGM_FR_CANNOT_GET_FIELD_TAG_NEXT = ""
318
+ DCGM_FR_FIELD_VIOLATION_NEXT = ""
319
+ DCGM_FR_FIELD_THRESHOLD_NEXT = ""
320
+ DCGM_FR_FIELD_VIOLATION_DBL_NEXT = ""
321
+ DCGM_FR_FIELD_THRESHOLD_DBL_NEXT = ""
322
+ DCGM_FR_UNSUPPORTED_FIELD_TYPE_NEXT = ""
323
+ DCGM_FR_FIELD_THRESHOLD_TS_NEXT = ""
324
+ DCGM_FR_FIELD_THRESHOLD_TS_DBL_NEXT = ""
325
+ DCGM_FR_THERMAL_VIOLATIONS_NEXT = DEBUG_COOLING_MSG
326
+ DCGM_FR_THERMAL_VIOLATIONS_TS_NEXT = DEBUG_COOLING_MSG
327
# Adjacent string literals are joined before the % operator is applied, so the
# single %s in the combined text is filled with DEBUG_COOLING_MSG when this
# assignment executes (i.e. at import time), not when the message is used.
+ DCGM_FR_TEMP_VIOLATION_NEXT = "Verify that the user-specified temperature maximum is set "\
328
+ "correctly. If it is, %s" % DEBUG_COOLING_MSG
329
+ DCGM_FR_THROTTLING_VIOLATION_NEXT = ""
330
+ DCGM_FR_INTERNAL_NEXT = ""
331
+ DCGM_FR_PCIE_GENERATION_NEXT = ""
332
+ DCGM_FR_PCIE_WIDTH_NEXT = ""
333
+ DCGM_FR_ABORTED_NEXT = ""
334
+ DCGM_FR_TEST_DISABLED_NEXT = ""
335
+ DCGM_FR_CANNOT_GET_STAT_NEXT = "If running a standalone nv-hostengine, verify that it is up "\
336
+ "and responsive."
337
+ DCGM_FR_STRESS_LEVEL_NEXT = ""
338
+ DCGM_FR_CUDA_API_NEXT = ""
339
+ DCGM_FR_FAULTY_MEMORY_NEXT = TRIAGE_RUN_FIELD_DIAG_MSG
340
+ DCGM_FR_CANNOT_SET_WATCHES_NEXT = ""
341
+ DCGM_FR_CUDA_UNBOUND_NEXT = ""
342
+ DCGM_FR_ECC_DISABLED_NEXT = "Enable ECC memory by running \"nvidia-smi -i <gpuId> -e 1\" "\
343
+ "to enable. This may require a GPU reset or reboot to take effect."
344
+ DCGM_FR_MEMORY_ALLOC_NEXT = ""
345
+ DCGM_FR_CUDA_DBE_NEXT = TRIAGE_RUN_FIELD_DIAG_MSG
346
+ DCGM_FR_MEMORY_MISMATCH_NEXT = TRIAGE_RUN_FIELD_DIAG_MSG
347
+ DCGM_FR_CUDA_DEVICE_NEXT = ""
348
+ DCGM_FR_ECC_UNSUPPORTED_NEXT = ""
349
+ DCGM_FR_ECC_PENDING_NEXT = "Please reboot to activate it."
350
+ DCGM_FR_MEMORY_BANDWIDTH_NEXT = ""
351
+ DCGM_FR_TARGET_POWER_NEXT = ""
352
+ DCGM_FR_API_FAIL_NEXT = ""
353
+ DCGM_FR_API_FAIL_GPU_NEXT = ""
354
+ DCGM_FR_CUDA_CONTEXT_NEXT = "Please make sure the correct driver version is installed and "\
355
+ "verify that no conflicting libraries are present."
356
+ DCGM_FR_DCGM_API_NEXT = ""
357
+ DCGM_FR_CONCURRENT_GPUS_NEXT = ""
358
+ DCGM_FR_TOO_MANY_ERRORS_NEXT = ""
359
+ DCGM_FR_NVLINK_CRC_ERROR_THRESHOLD_NEXT = TRIAGE_RUN_FIELD_DIAG_MSG
360
+ DCGM_FR_NVLINK_ERROR_CRITICAL_NEXT = TRIAGE_RUN_FIELD_DIAG_MSG
361
+ DCGM_FR_ENFORCED_POWER_LIMIT_NEXT = "If this enforced power limit is necessary, then this test "\
362
+ "cannot be run. If it is unnecessary, then raise the enforced "\
363
+ "power limit setting to be able to run this test."
364
+ DCGM_FR_MEMORY_ALLOC_HOST_NEXT = "Manually kill processes or restart your machine."
365
+ DCGM_FR_GPU_OP_MODE_NEXT = "Fix by running nvidia-smi as root with: nvidia-smi --gom=0 -i "\
366
+ "<gpu index>"
367
+ DCGM_FR_NO_MEMORY_CLOCKS_NEXT = ""
368
+ DCGM_FR_NO_GRAPHICS_CLOCKS_NEXT = ""
369
+ DCGM_FR_HAD_TO_RESTORE_STATE_NEXT = ""
370
+ DCGM_FR_L1TAG_UNSUPPORTED_NEXT = ""
371
+ DCGM_FR_L1TAG_MISCOMPARE_NEXT = TRIAGE_RUN_FIELD_DIAG_MSG
372
# The next two entries reuse the volatile-DBE suggestion assigned earlier in
# this table ("Drain the GPU and reset it or reboot the node.").
+ DCGM_FR_ROW_REMAP_FAILURE_NEXT = DCGM_FR_VOLATILE_DBE_DETECTED_NEXT
373
+ DCGM_FR_UNCONTAINED_ERROR_NEXT = DCGM_FR_VOLATILE_DBE_DETECTED_NEXT
374
+ DCGM_FR_EMPTY_GPU_LIST_NEXT = ""
375
+ DCGM_FR_DBE_PENDING_PAGE_RETIREMENTS_NEXT = "Drain the GPU and reset it or reboot the node to resolve this issue."
376
+ DCGM_FR_UNCORRECTABLE_ROW_REMAP_NEXT = ""
377
+ DCGM_FR_PENDING_ROW_REMAP_NEXT = ""
378
+ DCGM_FR_BROKEN_P2P_MEMORY_DEVICE_NEXT = BUG_REPORT_MSG
379
+ DCGM_FR_BROKEN_P2P_WRITER_DEVICE_NEXT = BUG_REPORT_MSG
380
+ DCGM_FR_NVSWITCH_NVLINK_DOWN_NEXT = "Please check fabric manager and initialization logs to figure out why the link is down. You may also need to run a field diagnostic."
381
+ DCGM_FR_FILE_CREATE_PERMISSIONS_NEXT = "Please restart the hostengine with parameter --home-dir to specify a different home directory for the " \
382
+ "diagnostic or change permissions in the current directory to allow the user to write files there."
383
+
384
+
385
def dcgmErrorGetPriorityByCode(code):
    """Call the DCGM library's ``dcgmErrorGetPriorityByCode`` for *code*.

    The callable is obtained by name through
    ``dcgm_structs._dcgmGetFunctionPointer`` (presumably a ctypes function
    pointer) and invoked with *code* as its only argument.

    Args:
        code: Error code forwarded unchanged to the library function.

    Returns:
        Whatever the library call returns, passed through without conversion.
    """
    priority_lookup = dcgm_structs._dcgmGetFunctionPointer("dcgmErrorGetPriorityByCode")
    return priority_lookup(code)
389
+
390
+
391
def dcgmErrorGetFormatMsgByCode(code):
    """Call the DCGM library's ``dcgmErrorGetFormatMsgByCode`` for *code*.

    The callable is obtained by name through
    ``dcgm_structs._dcgmGetFunctionPointer`` and its return type is declared
    as a C string (``ctypes.c_char_p``) before it is invoked.

    Args:
        code: Error code forwarded unchanged to the library function.

    Returns:
        The result decoded as UTF-8 text when the call yields ``bytes``;
        any other result (ctypes gives ``None`` for a NULL ``char *``) is
        returned unchanged.
    """
    msg_lookup = dcgm_structs._dcgmGetFunctionPointer("dcgmErrorGetFormatMsgByCode")
    # Without an explicit restype ctypes would treat the result as a C int.
    msg_lookup.restype = ctypes.c_char_p
    raw_msg = msg_lookup(code)
    if isinstance(raw_msg, bytes):
        return raw_msg.decode('utf-8')
    return raw_msg