triton-model-analyzer 1.48.0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (204) hide show
  1. model_analyzer/__init__.py +15 -0
  2. model_analyzer/analyzer.py +448 -0
  3. model_analyzer/cli/__init__.py +15 -0
  4. model_analyzer/cli/cli.py +193 -0
  5. model_analyzer/config/__init__.py +15 -0
  6. model_analyzer/config/generate/__init__.py +15 -0
  7. model_analyzer/config/generate/automatic_model_config_generator.py +164 -0
  8. model_analyzer/config/generate/base_model_config_generator.py +352 -0
  9. model_analyzer/config/generate/brute_plus_binary_parameter_search_run_config_generator.py +164 -0
  10. model_analyzer/config/generate/brute_run_config_generator.py +154 -0
  11. model_analyzer/config/generate/concurrency_sweeper.py +75 -0
  12. model_analyzer/config/generate/config_generator_interface.py +52 -0
  13. model_analyzer/config/generate/coordinate.py +143 -0
  14. model_analyzer/config/generate/coordinate_data.py +86 -0
  15. model_analyzer/config/generate/generator_utils.py +116 -0
  16. model_analyzer/config/generate/manual_model_config_generator.py +187 -0
  17. model_analyzer/config/generate/model_config_generator_factory.py +92 -0
  18. model_analyzer/config/generate/model_profile_spec.py +74 -0
  19. model_analyzer/config/generate/model_run_config_generator.py +154 -0
  20. model_analyzer/config/generate/model_variant_name_manager.py +150 -0
  21. model_analyzer/config/generate/neighborhood.py +536 -0
  22. model_analyzer/config/generate/optuna_plus_concurrency_sweep_run_config_generator.py +141 -0
  23. model_analyzer/config/generate/optuna_run_config_generator.py +838 -0
  24. model_analyzer/config/generate/perf_analyzer_config_generator.py +312 -0
  25. model_analyzer/config/generate/quick_plus_concurrency_sweep_run_config_generator.py +130 -0
  26. model_analyzer/config/generate/quick_run_config_generator.py +753 -0
  27. model_analyzer/config/generate/run_config_generator_factory.py +329 -0
  28. model_analyzer/config/generate/search_config.py +112 -0
  29. model_analyzer/config/generate/search_dimension.py +73 -0
  30. model_analyzer/config/generate/search_dimensions.py +85 -0
  31. model_analyzer/config/generate/search_parameter.py +49 -0
  32. model_analyzer/config/generate/search_parameters.py +388 -0
  33. model_analyzer/config/input/__init__.py +15 -0
  34. model_analyzer/config/input/config_command.py +483 -0
  35. model_analyzer/config/input/config_command_profile.py +1747 -0
  36. model_analyzer/config/input/config_command_report.py +267 -0
  37. model_analyzer/config/input/config_defaults.py +236 -0
  38. model_analyzer/config/input/config_enum.py +83 -0
  39. model_analyzer/config/input/config_field.py +216 -0
  40. model_analyzer/config/input/config_list_generic.py +112 -0
  41. model_analyzer/config/input/config_list_numeric.py +151 -0
  42. model_analyzer/config/input/config_list_string.py +111 -0
  43. model_analyzer/config/input/config_none.py +71 -0
  44. model_analyzer/config/input/config_object.py +129 -0
  45. model_analyzer/config/input/config_primitive.py +81 -0
  46. model_analyzer/config/input/config_status.py +75 -0
  47. model_analyzer/config/input/config_sweep.py +83 -0
  48. model_analyzer/config/input/config_union.py +113 -0
  49. model_analyzer/config/input/config_utils.py +128 -0
  50. model_analyzer/config/input/config_value.py +243 -0
  51. model_analyzer/config/input/objects/__init__.py +15 -0
  52. model_analyzer/config/input/objects/config_model_profile_spec.py +325 -0
  53. model_analyzer/config/input/objects/config_model_report_spec.py +173 -0
  54. model_analyzer/config/input/objects/config_plot.py +198 -0
  55. model_analyzer/config/input/objects/config_protobuf_utils.py +101 -0
  56. model_analyzer/config/input/yaml_config_validator.py +82 -0
  57. model_analyzer/config/run/__init__.py +15 -0
  58. model_analyzer/config/run/model_run_config.py +313 -0
  59. model_analyzer/config/run/run_config.py +168 -0
  60. model_analyzer/constants.py +76 -0
  61. model_analyzer/device/__init__.py +15 -0
  62. model_analyzer/device/device.py +24 -0
  63. model_analyzer/device/gpu_device.py +87 -0
  64. model_analyzer/device/gpu_device_factory.py +248 -0
  65. model_analyzer/entrypoint.py +307 -0
  66. model_analyzer/log_formatter.py +65 -0
  67. model_analyzer/model_analyzer_exceptions.py +24 -0
  68. model_analyzer/model_manager.py +255 -0
  69. model_analyzer/monitor/__init__.py +15 -0
  70. model_analyzer/monitor/cpu_monitor.py +69 -0
  71. model_analyzer/monitor/dcgm/DcgmDiag.py +191 -0
  72. model_analyzer/monitor/dcgm/DcgmFieldGroup.py +83 -0
  73. model_analyzer/monitor/dcgm/DcgmGroup.py +815 -0
  74. model_analyzer/monitor/dcgm/DcgmHandle.py +141 -0
  75. model_analyzer/monitor/dcgm/DcgmJsonReader.py +69 -0
  76. model_analyzer/monitor/dcgm/DcgmReader.py +623 -0
  77. model_analyzer/monitor/dcgm/DcgmStatus.py +57 -0
  78. model_analyzer/monitor/dcgm/DcgmSystem.py +412 -0
  79. model_analyzer/monitor/dcgm/__init__.py +15 -0
  80. model_analyzer/monitor/dcgm/common/__init__.py +13 -0
  81. model_analyzer/monitor/dcgm/common/dcgm_client_cli_parser.py +194 -0
  82. model_analyzer/monitor/dcgm/common/dcgm_client_main.py +86 -0
  83. model_analyzer/monitor/dcgm/dcgm_agent.py +887 -0
  84. model_analyzer/monitor/dcgm/dcgm_collectd_plugin.py +369 -0
  85. model_analyzer/monitor/dcgm/dcgm_errors.py +395 -0
  86. model_analyzer/monitor/dcgm/dcgm_field_helpers.py +546 -0
  87. model_analyzer/monitor/dcgm/dcgm_fields.py +815 -0
  88. model_analyzer/monitor/dcgm/dcgm_fields_collectd.py +671 -0
  89. model_analyzer/monitor/dcgm/dcgm_fields_internal.py +29 -0
  90. model_analyzer/monitor/dcgm/dcgm_fluentd.py +45 -0
  91. model_analyzer/monitor/dcgm/dcgm_monitor.py +138 -0
  92. model_analyzer/monitor/dcgm/dcgm_prometheus.py +326 -0
  93. model_analyzer/monitor/dcgm/dcgm_structs.py +2357 -0
  94. model_analyzer/monitor/dcgm/dcgm_telegraf.py +65 -0
  95. model_analyzer/monitor/dcgm/dcgm_value.py +151 -0
  96. model_analyzer/monitor/dcgm/dcgmvalue.py +155 -0
  97. model_analyzer/monitor/dcgm/denylist_recommendations.py +573 -0
  98. model_analyzer/monitor/dcgm/pydcgm.py +47 -0
  99. model_analyzer/monitor/monitor.py +143 -0
  100. model_analyzer/monitor/remote_monitor.py +137 -0
  101. model_analyzer/output/__init__.py +15 -0
  102. model_analyzer/output/file_writer.py +63 -0
  103. model_analyzer/output/output_writer.py +42 -0
  104. model_analyzer/perf_analyzer/__init__.py +15 -0
  105. model_analyzer/perf_analyzer/genai_perf_config.py +206 -0
  106. model_analyzer/perf_analyzer/perf_analyzer.py +882 -0
  107. model_analyzer/perf_analyzer/perf_config.py +479 -0
  108. model_analyzer/plots/__init__.py +15 -0
  109. model_analyzer/plots/detailed_plot.py +266 -0
  110. model_analyzer/plots/plot_manager.py +224 -0
  111. model_analyzer/plots/simple_plot.py +213 -0
  112. model_analyzer/record/__init__.py +15 -0
  113. model_analyzer/record/gpu_record.py +68 -0
  114. model_analyzer/record/metrics_manager.py +887 -0
  115. model_analyzer/record/record.py +280 -0
  116. model_analyzer/record/record_aggregator.py +256 -0
  117. model_analyzer/record/types/__init__.py +15 -0
  118. model_analyzer/record/types/cpu_available_ram.py +93 -0
  119. model_analyzer/record/types/cpu_used_ram.py +93 -0
  120. model_analyzer/record/types/gpu_free_memory.py +96 -0
  121. model_analyzer/record/types/gpu_power_usage.py +107 -0
  122. model_analyzer/record/types/gpu_total_memory.py +96 -0
  123. model_analyzer/record/types/gpu_used_memory.py +96 -0
  124. model_analyzer/record/types/gpu_utilization.py +108 -0
  125. model_analyzer/record/types/inter_token_latency_avg.py +60 -0
  126. model_analyzer/record/types/inter_token_latency_base.py +74 -0
  127. model_analyzer/record/types/inter_token_latency_max.py +60 -0
  128. model_analyzer/record/types/inter_token_latency_min.py +60 -0
  129. model_analyzer/record/types/inter_token_latency_p25.py +60 -0
  130. model_analyzer/record/types/inter_token_latency_p50.py +60 -0
  131. model_analyzer/record/types/inter_token_latency_p75.py +60 -0
  132. model_analyzer/record/types/inter_token_latency_p90.py +60 -0
  133. model_analyzer/record/types/inter_token_latency_p95.py +60 -0
  134. model_analyzer/record/types/inter_token_latency_p99.py +60 -0
  135. model_analyzer/record/types/output_token_throughput.py +105 -0
  136. model_analyzer/record/types/perf_client_response_wait.py +97 -0
  137. model_analyzer/record/types/perf_client_send_recv.py +97 -0
  138. model_analyzer/record/types/perf_latency.py +111 -0
  139. model_analyzer/record/types/perf_latency_avg.py +60 -0
  140. model_analyzer/record/types/perf_latency_base.py +74 -0
  141. model_analyzer/record/types/perf_latency_p90.py +60 -0
  142. model_analyzer/record/types/perf_latency_p95.py +60 -0
  143. model_analyzer/record/types/perf_latency_p99.py +60 -0
  144. model_analyzer/record/types/perf_server_compute_infer.py +97 -0
  145. model_analyzer/record/types/perf_server_compute_input.py +97 -0
  146. model_analyzer/record/types/perf_server_compute_output.py +97 -0
  147. model_analyzer/record/types/perf_server_queue.py +97 -0
  148. model_analyzer/record/types/perf_throughput.py +105 -0
  149. model_analyzer/record/types/time_to_first_token_avg.py +60 -0
  150. model_analyzer/record/types/time_to_first_token_base.py +74 -0
  151. model_analyzer/record/types/time_to_first_token_max.py +60 -0
  152. model_analyzer/record/types/time_to_first_token_min.py +60 -0
  153. model_analyzer/record/types/time_to_first_token_p25.py +60 -0
  154. model_analyzer/record/types/time_to_first_token_p50.py +60 -0
  155. model_analyzer/record/types/time_to_first_token_p75.py +60 -0
  156. model_analyzer/record/types/time_to_first_token_p90.py +60 -0
  157. model_analyzer/record/types/time_to_first_token_p95.py +60 -0
  158. model_analyzer/record/types/time_to_first_token_p99.py +60 -0
  159. model_analyzer/reports/__init__.py +15 -0
  160. model_analyzer/reports/html_report.py +195 -0
  161. model_analyzer/reports/pdf_report.py +50 -0
  162. model_analyzer/reports/report.py +86 -0
  163. model_analyzer/reports/report_factory.py +62 -0
  164. model_analyzer/reports/report_manager.py +1376 -0
  165. model_analyzer/reports/report_utils.py +42 -0
  166. model_analyzer/result/__init__.py +15 -0
  167. model_analyzer/result/constraint_manager.py +150 -0
  168. model_analyzer/result/model_config_measurement.py +354 -0
  169. model_analyzer/result/model_constraints.py +105 -0
  170. model_analyzer/result/parameter_search.py +246 -0
  171. model_analyzer/result/result_manager.py +430 -0
  172. model_analyzer/result/result_statistics.py +159 -0
  173. model_analyzer/result/result_table.py +217 -0
  174. model_analyzer/result/result_table_manager.py +646 -0
  175. model_analyzer/result/result_utils.py +42 -0
  176. model_analyzer/result/results.py +277 -0
  177. model_analyzer/result/run_config_measurement.py +658 -0
  178. model_analyzer/result/run_config_result.py +210 -0
  179. model_analyzer/result/run_config_result_comparator.py +110 -0
  180. model_analyzer/result/sorted_results.py +151 -0
  181. model_analyzer/state/__init__.py +15 -0
  182. model_analyzer/state/analyzer_state.py +76 -0
  183. model_analyzer/state/analyzer_state_manager.py +215 -0
  184. model_analyzer/triton/__init__.py +15 -0
  185. model_analyzer/triton/client/__init__.py +15 -0
  186. model_analyzer/triton/client/client.py +234 -0
  187. model_analyzer/triton/client/client_factory.py +57 -0
  188. model_analyzer/triton/client/grpc_client.py +104 -0
  189. model_analyzer/triton/client/http_client.py +107 -0
  190. model_analyzer/triton/model/__init__.py +15 -0
  191. model_analyzer/triton/model/model_config.py +556 -0
  192. model_analyzer/triton/model/model_config_variant.py +29 -0
  193. model_analyzer/triton/server/__init__.py +15 -0
  194. model_analyzer/triton/server/server.py +76 -0
  195. model_analyzer/triton/server/server_config.py +269 -0
  196. model_analyzer/triton/server/server_docker.py +229 -0
  197. model_analyzer/triton/server/server_factory.py +306 -0
  198. model_analyzer/triton/server/server_local.py +158 -0
  199. triton_model_analyzer-1.48.0.dist-info/METADATA +52 -0
  200. triton_model_analyzer-1.48.0.dist-info/RECORD +204 -0
  201. triton_model_analyzer-1.48.0.dist-info/WHEEL +5 -0
  202. triton_model_analyzer-1.48.0.dist-info/entry_points.txt +2 -0
  203. triton_model_analyzer-1.48.0.dist-info/licenses/LICENSE +67 -0
  204. triton_model_analyzer-1.48.0.dist-info/top_level.txt +1 -0
@@ -0,0 +1,671 @@
1
+ # Copyright (c) 2022, NVIDIA CORPORATION & AFFILIATES. All rights reserved.
2
+ #
3
+ # Licensed under the Apache License, Version 2.0 (the "License");
4
+ # you may not use this file except in compliance with the License.
5
+ # You may obtain a copy of the License at
6
+ #
7
+ # http://www.apache.org/licenses/LICENSE-2.0
8
+ #
9
+ # Unless required by applicable law or agreed to in writing, software
10
+ # distributed under the License is distributed on an "AS IS" BASIS,
11
+ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12
+ # See the License for the specific language governing permissions and
13
+ # limitations under the License.
14
+
15
+ from model_analyzer.monitor.dcgm.dcgm_fields import *
16
+ from model_analyzer.monitor.dcgm.dcgm_fields_internal import *
17
+ import sys
18
+
19
+
20
class CollectdMetadata:
    '''
    Describes how a single dcgm field is exposed to collectd.

    Instances are the values of the collectd metadata definition table,
    which is keyed by dcgm field id.

    @params:
    name: string identifying the dcgm field. This is the field_name as
          opposed to the field_id.
    kind: collectd type string (for example "value:GAUGE:0:U").
    used: a bool indicating whether or not the field is to be defined in
          a collectd types.db file when GenerateCollectdTypesDB() is called
          (generally if this file is run as a python3 mainline). We enumerate
          all the dcgm fields, but only generate types.db records for those
          supported at the current time. Others may or may not have correct
          collectd type definitions (generally one might be a gauge where it
          is more correctly a counter). The idea is that an intrepid user may
          enable generation of additional dcgm fields that they wish to
          collect but are not officially supported yet. Defaults to False.
    '''

    def __init__(self, name, kind, used=False):
        self.name = name
        self.kind = kind
        self.used = used

    def __repr__(self):
        # Readable form so entries of the metadata table can be inspected
        # in logs and debuggers instead of showing a bare object address.
        return '{}(name={!r}, kind={!r}, used={!r})'.format(
            type(self).__name__, self.name, self.kind, self.used)
42
+
43
+
44
+ # collectd metadata definition table.
45
+
46
+ CollectdMetadataDict = {
47
+ DCGM_FI_DRIVER_VERSION:
48
+ None,
49
+ DCGM_FI_NVML_VERSION:
50
+ None,
51
+ DCGM_FI_PROCESS_NAME:
52
+ None,
53
+ DCGM_FI_CUDA_DRIVER_VERSION:
54
+ CollectdMetadata("cuda_driver_version", "value:GAUGE:U:U"),
55
+ DCGM_FI_DEV_COUNT:
56
+ CollectdMetadata("device_count", "value:GAUGE:U:U"),
57
+ DCGM_FI_DEV_NAME:
58
+ None,
59
+ DCGM_FI_DEV_BRAND:
60
+ None,
61
+ DCGM_FI_DEV_NVML_INDEX:
62
+ CollectdMetadata("nvml_index", "value:GAUGE:U:U"),
63
+ DCGM_FI_DEV_SERIAL:
64
+ None,
65
+ DCGM_FI_DEV_CPU_AFFINITY_0:
66
+ CollectdMetadata("cpu_affinity_0", "value:GAUGE:U:U"),
67
+ DCGM_FI_DEV_CPU_AFFINITY_1:
68
+ CollectdMetadata("cpu_affinity_1", "value:GAUGE:U:U"),
69
+ DCGM_FI_DEV_CPU_AFFINITY_2:
70
+ CollectdMetadata("cpu_affinity_2", "value:GAUGE:U:U"),
71
+ DCGM_FI_DEV_CPU_AFFINITY_3:
72
+ CollectdMetadata("cpu_affinity_3", "value:GAUGE:U:U"),
73
+ DCGM_FI_DEV_UUID:
74
+ None,
75
+ DCGM_FI_DEV_MINOR_NUMBER:
76
+ CollectdMetadata("minor_number", "value:GAUGE:U:U"),
77
+ DCGM_FI_DEV_OEM_INFOROM_VER:
78
+ None,
79
+ DCGM_FI_DEV_ECC_INFOROM_VER:
80
+ None,
81
+ DCGM_FI_DEV_POWER_INFOROM_VER:
82
+ None,
83
+ DCGM_FI_DEV_INFOROM_IMAGE_VER:
84
+ None,
85
+ DCGM_FI_DEV_INFOROM_CONFIG_CHECK:
86
+ CollectdMetadata("inforom_config_checksum", "value:GAUGE:U:U"),
87
+ DCGM_FI_DEV_PCI_BUSID:
88
+ None,
89
+ DCGM_FI_DEV_PCI_COMBINED_ID:
90
+ CollectdMetadata("pci_combined_id", "value:GAUGE:U:U"),
91
+ DCGM_FI_DEV_PCI_SUBSYS_ID:
92
+ CollectdMetadata("pci_subsys_id", "value:GAUGE:U:U"),
93
+ DCGM_FI_DEV_PCIE_TX_THROUGHPUT:
94
+ CollectdMetadata("pcie_tx_throughput", "value:GAUGE:0:U", True),
95
+ DCGM_FI_DEV_PCIE_RX_THROUGHPUT:
96
+ CollectdMetadata("pcie_rx_throughput", "value:GAUGE:0:U", True),
97
+ DCGM_FI_DEV_PCIE_REPLAY_COUNTER:
98
+ CollectdMetadata("pcie_replay_counter", "value:COUNTER:0:U", True),
99
+ DCGM_FI_DEV_SM_CLOCK:
100
+ CollectdMetadata("sm_clock", "value:GAUGE:0:U", True),
101
+ DCGM_FI_DEV_MEM_CLOCK:
102
+ CollectdMetadata("memory_clock", "value:GAUGE:0:U", True),
103
+ DCGM_FI_DEV_VIDEO_CLOCK:
104
+ CollectdMetadata("video_clock", "value:GAUGE:0:U", True),
105
+ DCGM_FI_DEV_APP_SM_CLOCK:
106
+ CollectdMetadata("sm_app_clock", "value:GAUGE:0:U", True),
107
+ DCGM_FI_DEV_APP_MEM_CLOCK:
108
+ CollectdMetadata("mem_app_clock", "value:GAUGE:0:U", True),
109
+ DCGM_FI_DEV_CLOCK_THROTTLE_REASONS:
110
+ CollectdMetadata("current_clock_throttle_reasons", "value:GAUGE:U:U"),
111
+ DCGM_FI_DEV_MAX_SM_CLOCK:
112
+ CollectdMetadata("sm_max_clock", "value:GAUGE:0:U", True),
113
+ DCGM_FI_DEV_MAX_MEM_CLOCK:
114
+ CollectdMetadata("memory_max_clock", "value:GAUGE:0:U", True),
115
+ DCGM_FI_DEV_MAX_VIDEO_CLOCK:
116
+ CollectdMetadata("video_max_clock", "value:GAUGE:0:U", True),
117
+ DCGM_FI_DEV_AUTOBOOST:
118
+ CollectdMetadata("autoboost", "value:GAUGE:U:U"),
119
+ DCGM_FI_DEV_GPU_TEMP:
120
+ CollectdMetadata("gpu_temp", "value:GAUGE:U:U", True),
121
+ DCGM_FI_DEV_MEM_MAX_OP_TEMP:
122
+ CollectdMetadata("gpu_mem_max_op_temp", "value:GAUGE:U:U"),
123
+ DCGM_FI_DEV_GPU_MAX_OP_TEMP:
124
+ CollectdMetadata("gpu_max_op_temp", "value:GAUGE:U:U"),
125
+ DCGM_FI_DEV_SLOWDOWN_TEMP:
126
+ CollectdMetadata("slowdown_temp", "value:GAUGE:U:U"),
127
+ DCGM_FI_DEV_SHUTDOWN_TEMP:
128
+ CollectdMetadata("shutdown_temp", "value:GAUGE:U:U"),
129
+ DCGM_FI_DEV_POWER_MGMT_LIMIT:
130
+ CollectdMetadata("power_management_limit", "value:GAUGE:U:U"),
131
+ DCGM_FI_DEV_POWER_MGMT_LIMIT_MIN:
132
+ CollectdMetadata("power_management_limit_min", "value:GAUGE:U:U"),
133
+ DCGM_FI_DEV_POWER_MGMT_LIMIT_MAX:
134
+ CollectdMetadata("power_management_limit_max", "value:GAUGE:U:U"),
135
+ DCGM_FI_DEV_POWER_MGMT_LIMIT_DEF:
136
+ CollectdMetadata("power_management_limit_default", "value:GAUGE:U:U"),
137
+ DCGM_FI_DEV_POWER_USAGE:
138
+ CollectdMetadata("power_usage", "value:GAUGE:0:U", True),
139
+ DCGM_FI_DEV_TOTAL_ENERGY_CONSUMPTION:
140
+ CollectdMetadata("total_energy_consumption", "value:GAUGE:0:U",
141
+ True), # left as gauge since zeroed at driver reload
142
+ DCGM_FI_DEV_ENFORCED_POWER_LIMIT:
143
+ CollectdMetadata("enforced_power_limit", "value:GAUGE:U:U"),
144
+ DCGM_FI_DEV_PSTATE:
145
+ CollectdMetadata("pstate", "value:GAUGE:U:U"),
146
+ DCGM_FI_DEV_FAN_SPEED:
147
+ CollectdMetadata("fan_speed", "value:GAUGE:U:U"),
148
+ DCGM_FI_DEV_COMPUTE_MODE:
149
+ CollectdMetadata("compute_mode", "value:GAUGE:U:U"),
150
+ DCGM_FI_DEV_PERSISTENCE_MODE:
151
+ CollectdMetadata("persistance_mode", "value:GAUGE:U:U"),
152
+ DCGM_FI_DEV_MIG_MODE:
153
+ CollectdMetadata("mig_mode", "value:GAUGE:U:U"),
154
+ DCGM_FI_DEV_CUDA_VISIBLE_DEVICES_STR:
155
+ None,
156
+ DCGM_FI_DEV_MIG_MAX_SLICES:
157
+ CollectdMetadata("mig_max_slices", "value:GAUGE:U:U"),
158
+ DCGM_FI_DEV_ECC_CURRENT:
159
+ CollectdMetadata("ecc", "value:GAUGE:U:U"),
160
+ DCGM_FI_DEV_ECC_PENDING:
161
+ CollectdMetadata("ecc_pending", "value:GAUGE:U:U"),
162
+ DCGM_FI_DEV_ECC_SBE_VOL_TOTAL:
163
+ CollectdMetadata("ecc_sbe_volatile_total", "value:COUNTER:0:U", True),
164
+ DCGM_FI_DEV_ECC_DBE_VOL_TOTAL:
165
+ CollectdMetadata("ecc_dbe_volatile_total", "value:COUNTER:0:U", True),
166
+ DCGM_FI_DEV_ECC_SBE_AGG_TOTAL:
167
+ CollectdMetadata("ecc_sbe_aggregate_total", "value:COUNTER:0:U", True),
168
+ DCGM_FI_DEV_ECC_DBE_AGG_TOTAL:
169
+ CollectdMetadata("ecc_dbe_aggregate_total", "value:COUNTER:0:U", True),
170
+ DCGM_FI_DEV_ECC_SBE_VOL_L1:
171
+ CollectdMetadata("ecc_sbe_volatile_l1", "value:GAUGE:U:U"),
172
+ DCGM_FI_DEV_ECC_DBE_VOL_L1:
173
+ CollectdMetadata("ecc_dbe_volatile_l1", "value:GAUGE:U:U"),
174
+ DCGM_FI_DEV_ECC_SBE_VOL_L2:
175
+ CollectdMetadata("ecc_sbe_volatile_l2", "value:GAUGE:U:U"),
176
+ DCGM_FI_DEV_ECC_DBE_VOL_L2:
177
+ CollectdMetadata("ecc_dbe_volatile_l2", "value:GAUGE:U:U"),
178
+ DCGM_FI_DEV_ECC_SBE_VOL_DEV:
179
+ CollectdMetadata("ecc_sbe_volatile_device", "value:GAUGE:U:U"),
180
+ DCGM_FI_DEV_ECC_DBE_VOL_DEV:
181
+ CollectdMetadata("ecc_dbe_volatile_device", "value:GAUGE:U:U"),
182
+ DCGM_FI_DEV_ECC_SBE_VOL_REG:
183
+ CollectdMetadata("ecc_sbe_volatile_register", "value:GAUGE:U:U"),
184
+ DCGM_FI_DEV_ECC_DBE_VOL_REG:
185
+ CollectdMetadata("ecc_dbe_volatile_register", "value:GAUGE:U:U"),
186
+ DCGM_FI_DEV_ECC_SBE_VOL_TEX:
187
+ CollectdMetadata("ecc_sbe_volatile_texture", "value:GAUGE:U:U"),
188
+ DCGM_FI_DEV_ECC_DBE_VOL_TEX:
189
+ CollectdMetadata("ecc_dbe_volatile_texture", "value:GAUGE:U:U"),
190
+ DCGM_FI_DEV_ECC_SBE_AGG_L1:
191
+ CollectdMetadata("ecc_sbe_aggregate_l1", "value:GAUGE:U:U"),
192
+ DCGM_FI_DEV_ECC_DBE_AGG_L1:
193
+ CollectdMetadata("ecc_dbe_aggregate_l1", "value:GAUGE:U:U"),
194
+ DCGM_FI_DEV_ECC_SBE_AGG_L2:
195
+ CollectdMetadata("ecc_sbe_aggregate_l2", "value:GAUGE:U:U"),
196
+ DCGM_FI_DEV_ECC_DBE_AGG_L2:
197
+ CollectdMetadata("ecc_dbe_aggregate_l2", "value:GAUGE:U:U"),
198
+ DCGM_FI_DEV_ECC_SBE_AGG_DEV:
199
+ CollectdMetadata("ecc_sbe_aggregate_device", "value:GAUGE:U:U"),
200
+ DCGM_FI_DEV_ECC_DBE_AGG_DEV:
201
+ CollectdMetadata("ecc_dbe_aggregate_device", "value:GAUGE:U:U"),
202
+ DCGM_FI_DEV_ECC_SBE_AGG_REG:
203
+ CollectdMetadata("ecc_sbe_aggregate_register", "value:GAUGE:U:U"),
204
+ DCGM_FI_DEV_ECC_DBE_AGG_REG:
205
+ CollectdMetadata("ecc_dbe_aggregate_register", "value:GAUGE:U:U"),
206
+ DCGM_FI_DEV_ECC_SBE_AGG_TEX:
207
+ CollectdMetadata("ecc_sbe_aggregate_texture", "value:GAUGE:U:U"),
208
+ DCGM_FI_DEV_ECC_DBE_AGG_TEX:
209
+ CollectdMetadata("ecc_dbe_aggregate_texture", "value:GAUGE:U:U"),
210
+ DCGM_FI_DEV_GPU_UTIL:
211
+ CollectdMetadata("gpu_utilization", "value:GAUGE:0.0:1.0", True),
212
+ DCGM_FI_DEV_MEM_COPY_UTIL:
213
+ CollectdMetadata("mem_copy_utilization", "value:GAUGE:0:100", True),
214
+ DCGM_FI_DEV_ENC_UTIL:
215
+ CollectdMetadata("enc_utilization", "value:GAUGE:0:100"),
216
+ DCGM_FI_DEV_DEC_UTIL:
217
+ CollectdMetadata("dec_utilization", "value:GAUGE:0:100"),
218
+ DCGM_FI_DEV_VBIOS_VERSION:
219
+ None,
220
+ DCGM_FI_DEV_BAR1_TOTAL:
221
+ CollectdMetadata("bar1_total", "value:GAUGE:U:U"),
222
+ DCGM_FI_DEV_BAR1_USED:
223
+ CollectdMetadata("bar1_used", "value:GAUGE:U:U"),
224
+ DCGM_FI_DEV_BAR1_FREE:
225
+ CollectdMetadata("bar1_free", "value:GAUGE:U:U"),
226
+ DCGM_FI_DEV_FB_TOTAL:
227
+ CollectdMetadata("fb_total", "value:GAUGE:0.0:U", True),
228
+ DCGM_FI_DEV_FB_FREE:
229
+ CollectdMetadata("fb_free", "value:GAUGE:0.0:U", True),
230
+ DCGM_FI_DEV_FB_USED:
231
+ CollectdMetadata("fb_used", "value:GAUGE:0.0:U", True),
232
+ DCGM_FI_DEV_FB_RESERVED:
233
+ CollectdMetadata("fb_resv", "value:GAUGE:0.0:U", True),
234
+ DCGM_FI_DEV_VIRTUAL_MODE:
235
+ CollectdMetadata("virtualization_mode", "value:GAUGE:U:U"),
236
+ DCGM_FI_DEV_VGPU_INSTANCE_IDS:
237
+ None,
238
+ DCGM_FI_DEV_VGPU_UTILIZATIONS:
239
+ None,
240
+ DCGM_FI_DEV_VGPU_PER_PROCESS_UTILIZATION:
241
+ None,
242
+ DCGM_FI_DEV_VGPU_VM_ID:
243
+ None,
244
+ DCGM_FI_DEV_VGPU_VM_NAME:
245
+ None,
246
+ DCGM_FI_DEV_VGPU_TYPE:
247
+ CollectdMetadata("vgpu_instance_type", "value:GAUGE:U:U"),
248
+ DCGM_FI_DEV_VGPU_UUID:
249
+ None,
250
+ DCGM_FI_DEV_VGPU_DRIVER_VERSION:
251
+ None,
252
+ DCGM_FI_DEV_VGPU_MEMORY_USAGE:
253
+ CollectdMetadata("vgpu_instance_memory_usage", "value:GAUGE:U:U"),
254
+ DCGM_FI_DEV_VGPU_INSTANCE_LICENSE_STATE:
255
+ CollectdMetadata("vgpu_instance_license_state", "value:GAUGE:U:U"),
256
+ DCGM_FI_DEV_VGPU_LICENSE_STATUS:
257
+ CollectdMetadata("vgpu_instance_license_status", "value:GAUGE:U:U"),
258
+ DCGM_FI_DEV_VGPU_FRAME_RATE_LIMIT:
259
+ CollectdMetadata("vgpu_instance_frame_rate_limit", "value:GAUGE:U:U"),
260
+ DCGM_FI_DEV_VGPU_PCI_ID:
261
+ CollectdMetadata("vgpu_instance_pci_id", "value:GAUGE:U:U"),
262
+ DCGM_FI_DEV_VGPU_ENC_STATS:
263
+ None,
264
+ DCGM_FI_DEV_VGPU_ENC_SESSIONS_INFO:
265
+ None,
266
+ DCGM_FI_DEV_VGPU_FBC_STATS:
267
+ None,
268
+ DCGM_FI_DEV_VGPU_FBC_SESSIONS_INFO:
269
+ None,
270
+ DCGM_FI_DEV_VGPU_VM_GPU_INSTANCE_ID:
271
+ None,
272
+ DCGM_FI_DEV_SUPPORTED_TYPE_INFO:
273
+ None,
274
+ DCGM_FI_DEV_SUPPORTED_VGPU_TYPE_IDS:
275
+ None,
276
+ DCGM_FI_DEV_VGPU_TYPE_INFO:
277
+ None,
278
+ DCGM_FI_DEV_VGPU_TYPE_NAME:
279
+ None,
280
+ DCGM_FI_DEV_VGPU_TYPE_CLASS:
281
+ None,
282
+ DCGM_FI_DEV_VGPU_TYPE_LICENSE:
283
+ None,
284
+ DCGM_FI_DEV_CREATABLE_VGPU_TYPE_IDS:
285
+ None,
286
+ DCGM_FI_DEV_ENC_STATS:
287
+ None,
288
+ DCGM_FI_DEV_FBC_STATS:
289
+ None,
290
+ DCGM_FI_DEV_FBC_SESSIONS_INFO:
291
+ None,
292
+ DCGM_FI_DEV_ACCOUNTING_DATA:
293
+ None,
294
+ DCGM_FI_DEV_RETIRED_SBE:
295
+ CollectdMetadata("retired_pages_sbe", "value:COUNTER:0:U", True),
296
+ DCGM_FI_DEV_RETIRED_DBE:
297
+ CollectdMetadata("retired_pages_dbe", "value:COUNTER:0:U", True),
298
+ DCGM_FI_DEV_GRAPHICS_PIDS:
299
+ None,
300
+ DCGM_FI_DEV_COMPUTE_PIDS:
301
+ None,
302
+ DCGM_FI_DEV_SUPPORTED_CLOCKS:
303
+ None,
304
+ DCGM_FI_SYNC_BOOST:
305
+ None,
306
+ DCGM_FI_DEV_RETIRED_PENDING:
307
+ CollectdMetadata("retired_pages_pending", "value:GAUGE:0:1",
308
+ True), # boolean 1 = yes, 0 = no
309
+ DCGM_FI_DEV_UNCORRECTABLE_REMAPPED_ROWS:
310
+ CollectdMetadata("uncorrectable_remapped_rows", "value:GAUGE:U:U"),
311
+ DCGM_FI_DEV_CORRECTABLE_REMAPPED_ROWS:
312
+ CollectdMetadata("correctable_remapped_rows", "value:GAUGE:U:U"),
313
+ DCGM_FI_DEV_ROW_REMAP_FAILURE:
314
+ CollectdMetadata("row_remap_failure", "value:GAUGE:U:U"),
315
+ DCGM_FI_DEV_ROW_REMAP_PENDING:
316
+ CollectdMetadata("row_remap_pending", "value:GAUGE:U:U"),
317
+ DCGM_FI_DEV_INFOROM_CONFIG_VALID:
318
+ CollectdMetadata("inforom_config_valid", "value:GAUGE:U:U"),
319
+ DCGM_FI_DEV_XID_ERRORS:
320
+ CollectdMetadata("xid_errors", "value:GAUGE:0:U", True),
321
+ DCGM_FI_DEV_PCIE_MAX_LINK_GEN:
322
+ CollectdMetadata("pcie_max_link_gen", "value:GAUGE:U:U"),
323
+ DCGM_FI_DEV_PCIE_MAX_LINK_WIDTH:
324
+ CollectdMetadata("pcie_max_link_width", "value:GAUGE:U:U"),
325
+ DCGM_FI_DEV_PCIE_LINK_GEN:
326
+ CollectdMetadata("pcie_link_gen", "value:GAUGE:U:U"),
327
+ DCGM_FI_DEV_PCIE_LINK_WIDTH:
328
+ CollectdMetadata("pcie_link_width", "value:GAUGE:U:U"),
329
+ DCGM_FI_DEV_POWER_VIOLATION:
330
+ CollectdMetadata("power_violation", "value:COUNTER:0:U", True),
331
+ DCGM_FI_DEV_THERMAL_VIOLATION:
332
+ CollectdMetadata("thermal_violation", "value:COUNTER:0:U", True),
333
+ DCGM_FI_GPU_TOPOLOGY_PCI:
334
+ None,
335
+ DCGM_FI_GPU_TOPOLOGY_NVLINK:
336
+ None,
337
+ DCGM_FI_GPU_TOPOLOGY_AFFINITY:
338
+ None,
339
+ DCGM_FI_DEV_SYNC_BOOST_VIOLATION:
340
+ CollectdMetadata("sync_boost_violation", "value:GAUGE:U:U"),
341
+ DCGM_FI_DEV_BOARD_LIMIT_VIOLATION:
342
+ CollectdMetadata("board_limit_violation", "value:GAUGE:U:U"),
343
+ DCGM_FI_DEV_LOW_UTIL_VIOLATION:
344
+ CollectdMetadata("low_util_violation", "value:GAUGE:U:U"),
345
+ DCGM_FI_DEV_RELIABILITY_VIOLATION:
346
+ CollectdMetadata("reliability_violation", "value:GAUGE:U:U"),
347
+ DCGM_FI_DEV_TOTAL_APP_CLOCKS_VIOLATION:
348
+ CollectdMetadata("app_clock_violation", "value:GAUGE:U:U"),
349
+ DCGM_FI_DEV_TOTAL_BASE_CLOCKS_VIOLATION:
350
+ CollectdMetadata("base_clock_violation", "value:GAUGE:U:U"),
351
+ DCGM_FI_DEV_MEM_COPY_UTIL_SAMPLES:
352
+ CollectdMetadata("mem_util_samples", "value:GAUGE:U:U"),
353
+ DCGM_FI_DEV_GPU_UTIL_SAMPLES:
354
+ CollectdMetadata("gpu_util_samples", "value:GAUGE:U:U"),
355
+ DCGM_FI_DEV_NVLINK_CRC_FLIT_ERROR_COUNT_L0:
356
+ CollectdMetadata("nvlink_flit_crc_error_count_l0", "value:GAUGE:U:U"),
357
+ DCGM_FI_DEV_NVLINK_CRC_FLIT_ERROR_COUNT_L1:
358
+ CollectdMetadata("nvlink_flit_crc_error_count_l1", "value:GAUGE:U:U"),
359
+ DCGM_FI_DEV_NVLINK_CRC_FLIT_ERROR_COUNT_L2:
360
+ CollectdMetadata("nvlink_flit_crc_error_count_l2", "value:GAUGE:U:U"),
361
+ DCGM_FI_DEV_NVLINK_CRC_FLIT_ERROR_COUNT_L3:
362
+ CollectdMetadata("nvlink_flit_crc_error_count_l3", "value:GAUGE:U:U"),
363
+ DCGM_FI_DEV_NVLINK_CRC_FLIT_ERROR_COUNT_L4:
364
+ CollectdMetadata("nvlink_flit_crc_error_count_l4", "value:GAUGE:U:U"),
365
+ DCGM_FI_DEV_NVLINK_CRC_FLIT_ERROR_COUNT_L5:
366
+ CollectdMetadata("nvlink_flit_crc_error_count_l5", "value:GAUGE:U:U"),
367
+ DCGM_FI_DEV_NVLINK_CRC_FLIT_ERROR_COUNT_TOTAL:
368
+ CollectdMetadata("nvlink_flit_crc_error_count_total",
369
+ "value:COUNTER:0:U", True),
370
+ DCGM_FI_DEV_NVLINK_CRC_DATA_ERROR_COUNT_L0:
371
+ CollectdMetadata("nvlink_data_crc_error_count_l0", "value:GAUGE:U:U"),
372
+ DCGM_FI_DEV_NVLINK_CRC_DATA_ERROR_COUNT_L1:
373
+ CollectdMetadata("nvlink_data_crc_error_count_l1", "value:GAUGE:U:U"),
374
+ DCGM_FI_DEV_NVLINK_CRC_DATA_ERROR_COUNT_L2:
375
+ CollectdMetadata("nvlink_data_crc_error_count_l2", "value:GAUGE:U:U"),
376
+ DCGM_FI_DEV_NVLINK_CRC_DATA_ERROR_COUNT_L3:
377
+ CollectdMetadata("nvlink_data_crc_error_count_l3", "value:GAUGE:U:U"),
378
+ DCGM_FI_DEV_NVLINK_CRC_DATA_ERROR_COUNT_L4:
379
+ CollectdMetadata("nvlink_data_crc_error_count_l4", "value:GAUGE:U:U"),
380
+ DCGM_FI_DEV_NVLINK_CRC_DATA_ERROR_COUNT_L5:
381
+ CollectdMetadata("nvlink_data_crc_error_count_l5", "value:GAUGE:U:U"),
382
+ DCGM_FI_DEV_NVLINK_CRC_DATA_ERROR_COUNT_TOTAL:
383
+ CollectdMetadata("nvlink_data_crc_error_count_total",
384
+ "value:COUNTER:0:U", True),
385
+ DCGM_FI_DEV_NVLINK_REPLAY_ERROR_COUNT_L0:
386
+ CollectdMetadata("nvlink_replay_error_count_l0", "value:GAUGE:U:U"),
387
+ DCGM_FI_DEV_NVLINK_REPLAY_ERROR_COUNT_L1:
388
+ CollectdMetadata("nvlink_replay_error_count_l1", "value:GAUGE:U:U"),
389
+ DCGM_FI_DEV_NVLINK_REPLAY_ERROR_COUNT_L2:
390
+ CollectdMetadata("nvlink_replay_error_count_l2", "value:GAUGE:U:U"),
391
+ DCGM_FI_DEV_NVLINK_REPLAY_ERROR_COUNT_L3:
392
+ CollectdMetadata("nvlink_replay_error_count_l3", "value:GAUGE:U:U"),
393
+ DCGM_FI_DEV_NVLINK_REPLAY_ERROR_COUNT_L4:
394
+ CollectdMetadata("nvlink_replay_error_count_l4", "value:GAUGE:U:U"),
395
+ DCGM_FI_DEV_NVLINK_REPLAY_ERROR_COUNT_L5:
396
+ CollectdMetadata("nvlink_replay_error_count_l5", "value:GAUGE:U:U"),
397
+ DCGM_FI_DEV_NVLINK_REPLAY_ERROR_COUNT_TOTAL:
398
+ CollectdMetadata("nvlink_replay_error_count_total", "value:GAUGE:U:U"),
399
+ DCGM_FI_DEV_NVLINK_RECOVERY_ERROR_COUNT_L0:
400
+ CollectdMetadata("nvlink_recovery_error_count_l0", "value:GAUGE:U:U"),
401
+ DCGM_FI_DEV_NVLINK_RECOVERY_ERROR_COUNT_L1:
402
+ CollectdMetadata("nvlink_recovery_error_count_l1", "value:GAUGE:U:U"),
403
+ DCGM_FI_DEV_NVLINK_RECOVERY_ERROR_COUNT_L2:
404
+ CollectdMetadata("nvlink_recovery_error_count_l2", "value:GAUGE:U:U"),
405
+ DCGM_FI_DEV_NVLINK_RECOVERY_ERROR_COUNT_L3:
406
+ CollectdMetadata("nvlink_recovery_error_count_l3", "value:GAUGE:U:U"),
407
+ DCGM_FI_DEV_NVLINK_RECOVERY_ERROR_COUNT_L4:
408
+ CollectdMetadata("nvlink_recovery_error_count_l4", "value:GAUGE:U:U"),
409
+ DCGM_FI_DEV_NVLINK_RECOVERY_ERROR_COUNT_L5:
410
+ CollectdMetadata("nvlink_recovery_error_count_l5", "value:GAUGE:U:U"),
411
+ DCGM_FI_DEV_NVLINK_RECOVERY_ERROR_COUNT_TOTAL:
412
+ CollectdMetadata("nvlink_recovery_error_count_total",
413
+ "value:COUNTER:0:U", True),
414
+ DCGM_FI_DEV_NVLINK_BANDWIDTH_L0:
415
+ CollectdMetadata("nvlink_bandwidth_l0", "value:GAUGE:U:U"),
416
+ DCGM_FI_DEV_NVLINK_BANDWIDTH_L1:
417
+ CollectdMetadata("nvlink_bandwidth_l1", "value:GAUGE:U:U"),
418
+ DCGM_FI_DEV_NVLINK_BANDWIDTH_L2:
419
+ CollectdMetadata("nvlink_bandwidth_l2", "value:GAUGE:U:U"),
420
+ DCGM_FI_DEV_NVLINK_BANDWIDTH_L3:
421
+ CollectdMetadata("nvlink_bandwidth_l3", "value:GAUGE:U:U"),
422
+ DCGM_FI_DEV_NVLINK_BANDWIDTH_L4:
423
+ CollectdMetadata("nvlink_bandwidth_l4", "value:GAUGE:U:U"),
424
+ DCGM_FI_DEV_NVLINK_BANDWIDTH_L5:
425
+ CollectdMetadata("nvlink_bandwidth_l5", "value:GAUGE:U:U"),
426
+ DCGM_FI_DEV_NVLINK_BANDWIDTH_TOTAL:
427
+ CollectdMetadata("nvlink_bandwidth_total", "value:GAUGE:0:U", True),
428
+ DCGM_FI_DEV_NVLINK_CRC_FLIT_ERROR_COUNT_L6:
429
+ CollectdMetadata("nvlink_flit_crc_error_count_l6", "value:GAUGE:U:U"),
430
+ DCGM_FI_DEV_NVLINK_CRC_FLIT_ERROR_COUNT_L7:
431
+ CollectdMetadata("nvlink_flit_crc_error_count_l7", "value:GAUGE:U:U"),
432
+ DCGM_FI_DEV_NVLINK_CRC_FLIT_ERROR_COUNT_L8:
433
+ CollectdMetadata("nvlink_flit_crc_error_count_l8", "value:GAUGE:U:U"),
434
+ DCGM_FI_DEV_NVLINK_CRC_FLIT_ERROR_COUNT_L9:
435
+ CollectdMetadata("nvlink_flit_crc_error_count_l9", "value:GAUGE:U:U"),
436
+ DCGM_FI_DEV_NVLINK_CRC_FLIT_ERROR_COUNT_L10:
437
+ CollectdMetadata("nvlink_flit_crc_error_count_l10", "value:GAUGE:U:U"),
438
+ DCGM_FI_DEV_NVLINK_CRC_FLIT_ERROR_COUNT_L11:
439
+ CollectdMetadata("nvlink_flit_crc_error_count_l11", "value:GAUGE:U:U"),
440
+ DCGM_FI_DEV_NVLINK_CRC_DATA_ERROR_COUNT_L6:
441
+ CollectdMetadata("nvlink_data_crc_error_count_l6", "value:GAUGE:U:U"),
442
+ DCGM_FI_DEV_NVLINK_CRC_DATA_ERROR_COUNT_L7:
443
+ CollectdMetadata("nvlink_data_crc_error_count_l7", "value:GAUGE:U:U"),
444
+ DCGM_FI_DEV_NVLINK_CRC_DATA_ERROR_COUNT_L8:
445
+ CollectdMetadata("nvlink_data_crc_error_count_l8", "value:GAUGE:U:U"),
446
+ DCGM_FI_DEV_NVLINK_CRC_DATA_ERROR_COUNT_L9:
447
+ CollectdMetadata("nvlink_data_crc_error_count_l9", "value:GAUGE:U:U"),
448
+ DCGM_FI_DEV_NVLINK_CRC_DATA_ERROR_COUNT_L10:
449
+ CollectdMetadata("nvlink_data_crc_error_count_l10", "value:GAUGE:U:U"),
450
+ DCGM_FI_DEV_NVLINK_CRC_DATA_ERROR_COUNT_L11:
451
+ CollectdMetadata("nvlink_data_crc_error_count_l11", "value:GAUGE:U:U"),
452
+ DCGM_FI_DEV_NVLINK_REPLAY_ERROR_COUNT_L6:
453
+ CollectdMetadata("nvlink_replay_error_count_l6", "value:GAUGE:U:U"),
454
+ DCGM_FI_DEV_NVLINK_REPLAY_ERROR_COUNT_L7:
455
+ CollectdMetadata("nvlink_replay_error_count_l7", "value:GAUGE:U:U"),
456
+ DCGM_FI_DEV_NVLINK_REPLAY_ERROR_COUNT_L8:
457
+ CollectdMetadata("nvlink_replay_error_count_l8", "value:GAUGE:U:U"),
458
+ DCGM_FI_DEV_NVLINK_REPLAY_ERROR_COUNT_L9:
459
+ CollectdMetadata("nvlink_replay_error_count_l9", "value:GAUGE:U:U"),
460
+ DCGM_FI_DEV_NVLINK_REPLAY_ERROR_COUNT_L10:
461
+ CollectdMetadata("nvlink_replay_error_count_l10", "value:GAUGE:U:U"),
462
+ DCGM_FI_DEV_NVLINK_REPLAY_ERROR_COUNT_L11:
463
+ CollectdMetadata("nvlink_replay_error_count_l11", "value:GAUGE:U:U"),
464
+ DCGM_FI_DEV_NVLINK_RECOVERY_ERROR_COUNT_L6:
465
+ CollectdMetadata("nvlink_recovery_error_count_l6", "value:GAUGE:U:U"),
466
+ DCGM_FI_DEV_NVLINK_RECOVERY_ERROR_COUNT_L7:
467
+ CollectdMetadata("nvlink_recovery_error_count_l7", "value:GAUGE:U:U"),
468
+ DCGM_FI_DEV_NVLINK_RECOVERY_ERROR_COUNT_L8:
469
+ CollectdMetadata("nvlink_recovery_error_count_l8", "value:GAUGE:U:U"),
470
+ DCGM_FI_DEV_NVLINK_RECOVERY_ERROR_COUNT_L9:
471
+ CollectdMetadata("nvlink_recovery_error_count_l9", "value:GAUGE:U:U"),
472
+ DCGM_FI_DEV_NVLINK_RECOVERY_ERROR_COUNT_L10:
473
+ CollectdMetadata("nvlink_recovery_error_count_l10", "value:GAUGE:U:U"),
474
+ DCGM_FI_DEV_NVLINK_RECOVERY_ERROR_COUNT_L11:
475
+ CollectdMetadata("nvlink_recovery_error_count_l11", "value:GAUGE:U:U"),
476
+ DCGM_FI_DEV_NVLINK_BANDWIDTH_L6:
477
+ CollectdMetadata("nvlink_bandwidth_l6", "value:GAUGE:U:U"),
478
+ DCGM_FI_DEV_NVLINK_BANDWIDTH_L7:
479
+ CollectdMetadata("nvlink_bandwidth_l7", "value:GAUGE:U:U"),
480
+ DCGM_FI_DEV_NVLINK_BANDWIDTH_L8:
481
+ CollectdMetadata("nvlink_bandwidth_l8", "value:GAUGE:U:U"),
482
+ DCGM_FI_DEV_NVLINK_BANDWIDTH_L9:
483
+ CollectdMetadata("nvlink_bandwidth_l9", "value:GAUGE:U:U"),
484
+ DCGM_FI_DEV_NVLINK_BANDWIDTH_L10:
485
+ CollectdMetadata("nvlink_bandwidth_l10", "value:GAUGE:U:U"),
486
+ DCGM_FI_DEV_NVLINK_BANDWIDTH_L11:
487
+ CollectdMetadata("nvlink_bandwidth_l11", "value:GAUGE:U:U"),
488
+ DCGM_FI_DEV_MEMORY_TEMP:
489
+ CollectdMetadata("memory_temp", "value:GAUGE:U:U", True),
490
+ DCGM_FI_DEV_GPU_NVLINK_ERRORS:
491
+ CollectdMetadata("gpu_nvlink_errors", "value:GAUGE:U:U"),
492
+ DCGM_FI_DEV_NVSWITCH_LINK_THROUGHPUT_TX:
493
+ CollectdMetadata("nvswitch_link_bandwidth_tx", "value:GAUGE:U:U"),
494
+ DCGM_FI_DEV_NVSWITCH_LINK_THROUGHPUT_RX:
495
+ CollectdMetadata("nvswitch_link_bandwidth_rx", "value:GAUGE:U:U"),
496
+ DCGM_FI_DEV_NVSWITCH_LINK_NON_FATAL_ERRORS:
497
+ CollectdMetadata("nvswitch_link_fatal_errors", "value:GAUGE:U:U"),
498
+ DCGM_FI_DEV_NVSWITCH_LINK_REPLAY_ERRORS:
499
+ CollectdMetadata("nvswitch_link_non_fatal_errors", "value:GAUGE:U:U"),
500
+ DCGM_FI_DEV_NVSWITCH_LINK_RECOVERY_ERRORS:
501
+ CollectdMetadata("nvswitch_link_recovery_errors", "value:GAUGE:U:U"),
502
+ DCGM_FI_DEV_NVSWITCH_LINK_FLIT_ERRORS:
503
+ CollectdMetadata("nvswitch_link_flit_errors", "value:GAUGE:U:U"),
504
+ DCGM_FI_DEV_NVSWITCH_LINK_CRC_ERRORS:
505
+ CollectdMetadata("nvswitch_link_crc_errors", "value:GAUGE:U:U"),
506
+ DCGM_FI_DEV_NVSWITCH_LINK_ECC_ERRORS:
507
+ CollectdMetadata("nvswitch_link_ecc_errors", "value:GAUGE:U:U"),
508
+ DCGM_FI_DEV_NVSWITCH_LINK_LATENCY_LOW_VC0:
509
+ CollectdMetadata("nvswitch_link_latency_low_vc0", "value:GAUGE:U:U"),
510
+ DCGM_FI_DEV_NVSWITCH_LINK_LATENCY_LOW_VC1:
511
+ CollectdMetadata("nvswitch_link_latency_low_vc1", "value:GAUGE:U:U"),
512
+ DCGM_FI_DEV_NVSWITCH_LINK_LATENCY_LOW_VC2:
513
+ CollectdMetadata("nvswitch_link_latency_low_vc2", "value:GAUGE:U:U"),
514
+ DCGM_FI_DEV_NVSWITCH_LINK_LATENCY_LOW_VC3:
515
+ CollectdMetadata("nvswitch_link_latency_low_vc3", "value:GAUGE:U:U"),
516
+ DCGM_FI_DEV_NVSWITCH_LINK_LATENCY_MEDIUM_VC0:
517
+ CollectdMetadata("nvswitch_link_latency_medium_vc0", "value:GAUGE:U:U"),
518
+ DCGM_FI_DEV_NVSWITCH_LINK_LATENCY_MEDIUM_VC1:
519
+ CollectdMetadata("nvswitch_link_latency_medium_vc1", "value:GAUGE:U:U"),
520
+ DCGM_FI_DEV_NVSWITCH_LINK_LATENCY_MEDIUM_VC2:
521
+ CollectdMetadata("nvswitch_link_latency_medium_vc2", "value:GAUGE:U:U"),
522
+ DCGM_FI_DEV_NVSWITCH_LINK_LATENCY_MEDIUM_VC3:
523
+ CollectdMetadata("nvswitch_link_latency_medium_vc3", "value:GAUGE:U:U"),
524
+ DCGM_FI_DEV_NVSWITCH_LINK_LATENCY_HIGH_VC0:
525
+ CollectdMetadata("nvswitch_link_latency_high_vc0", "value:GAUGE:U:U"),
526
+ DCGM_FI_DEV_NVSWITCH_LINK_LATENCY_HIGH_VC1:
527
+ CollectdMetadata("nvswitch_link_latency_high_vc1", "value:GAUGE:U:U"),
528
+ DCGM_FI_DEV_NVSWITCH_LINK_LATENCY_HIGH_VC2:
529
+ CollectdMetadata("nvswitch_link_latency_high_vc2", "value:GAUGE:U:U"),
530
+ DCGM_FI_DEV_NVSWITCH_LINK_LATENCY_HIGH_VC3:
531
+ CollectdMetadata("nvswitch_link_latency_high_vc3", "value:GAUGE:U:U"),
532
+ DCGM_FI_DEV_NVSWITCH_LINK_LATENCY_PANIC_VC0:
533
+ CollectdMetadata("nvswitch_link_latency_panic_vc0", "value:GAUGE:U:U"),
534
+ DCGM_FI_DEV_NVSWITCH_LINK_LATENCY_PANIC_VC1:
535
+ CollectdMetadata("nvswitch_link_latency_panic_vc1", "value:GAUGE:U:U"),
536
+ DCGM_FI_DEV_NVSWITCH_LINK_LATENCY_PANIC_VC2:
537
+ CollectdMetadata("nvswitch_link_latency_panic_vc2", "value:GAUGE:U:U"),
538
+ DCGM_FI_DEV_NVSWITCH_LINK_LATENCY_PANIC_VC3:
539
+ CollectdMetadata("nvswitch_link_latency_panic_vc3", "value:GAUGE:U:U"),
540
+ DCGM_FI_DEV_NVSWITCH_LINK_LATENCY_COUNT_VC0:
541
+ CollectdMetadata("nvswitch_link_latency_count_vc0", "value:GAUGE:U:U"),
542
+ DCGM_FI_DEV_NVSWITCH_LINK_LATENCY_COUNT_VC1:
543
+ CollectdMetadata("nvswitch_link_latency_count_vc1", "value:GAUGE:U:U"),
544
+ DCGM_FI_DEV_NVSWITCH_LINK_LATENCY_COUNT_VC2:
545
+ CollectdMetadata("nvswitch_link_latency_count_vc2", "value:GAUGE:U:U"),
546
+ DCGM_FI_DEV_NVSWITCH_LINK_LATENCY_COUNT_VC3:
547
+ CollectdMetadata("nvswitch_link_latency_count_vc3", "value:GAUGE:U:U"),
548
+ DCGM_FI_DEV_NVSWITCH_LINK_CRC_ERRORS_LANE0:
549
+ CollectdMetadata("nvswitch_link_crc_errors_lane0", "value:GAUGE:U:U"),
550
+ DCGM_FI_DEV_NVSWITCH_LINK_CRC_ERRORS_LANE1:
551
+ CollectdMetadata("nvswitch_link_crc_errors_lane1", "value:GAUGE:U:U"),
552
+ DCGM_FI_DEV_NVSWITCH_LINK_CRC_ERRORS_LANE2:
553
+ CollectdMetadata("nvswitch_link_crc_errors_lane2", "value:GAUGE:U:U"),
554
+ DCGM_FI_DEV_NVSWITCH_LINK_CRC_ERRORS_LANE3:
555
+ CollectdMetadata("nvswitch_link_crc_errors_lane3", "value:GAUGE:U:U"),
556
+ DCGM_FI_DEV_NVSWITCH_LINK_ECC_ERRORS_LANE0:
557
+ CollectdMetadata("nvswitch_link_ecc_errors_lane0", "value:GAUGE:U:U"),
558
+ DCGM_FI_DEV_NVSWITCH_LINK_ECC_ERRORS_LANE1:
559
+ CollectdMetadata("nvswitch_link_ecc_errors_lane1", "value:GAUGE:U:U"),
560
+ DCGM_FI_DEV_NVSWITCH_LINK_ECC_ERRORS_LANE2:
561
+ CollectdMetadata("nvswitch_link_ecc_errors_lane2", "value:GAUGE:U:U"),
562
+ DCGM_FI_DEV_NVSWITCH_LINK_ECC_ERRORS_LANE3:
563
+ CollectdMetadata("nvswitch_link_ecc_errors_lane3", "value:GAUGE:U:U"),
564
+ DCGM_FI_DEV_NVSWITCH_FATAL_ERRORS:
565
+ CollectdMetadata("nvswitch_fatal_error", "value:GAUGE:U:U"),
566
+ DCGM_FI_DEV_NVSWITCH_NON_FATAL_ERRORS:
567
+ CollectdMetadata("nvswitch_non_fatal_error", "value:GAUGE:U:U"),
568
+ DCGM_FI_DEV_NVSWITCH_TEMPERATURE_CURRENT:
569
+ CollectdMetadata("nvswitch_temperature_current", "value:GAUGE:U:U"),
570
+ DCGM_FI_DEV_NVSWITCH_TEMPERATURE_LIMIT_SLOWDOWN:
571
+ CollectdMetadata("nvswitch_temperature_limit_slowdown",
572
+ "value:GAUGE:U:U"),
573
+ DCGM_FI_DEV_NVSWITCH_TEMPERATURE_LIMIT_SHUTDOWN:
574
+ CollectdMetadata("nvswitch_temperature_limit_shutdown",
575
+ "value:GAUGE:U:U"),
576
+ DCGM_FI_DEV_NVSWITCH_THROUGHPUT_TX:
577
+ CollectdMetadata("nvswitch_throughput_tx", "value:GAUGE:U:U"),
578
+ DCGM_FI_DEV_NVSWITCH_THROUGHPUT_RX:
579
+ CollectdMetadata("nvswitch_throughput_rx", "value:GAUGE:U:U"),
580
+ DCGM_FI_DEV_CUDA_COMPUTE_CAPABILITY:
581
+ CollectdMetadata("cuda_compute_capability", "value:GAUGE:U:U"),
582
+ DCGM_FI_PROF_GR_ENGINE_ACTIVE:
583
+ CollectdMetadata("gr_engine_active", "value:GAUGE:0.0:1.0", True),
584
+ DCGM_FI_PROF_SM_ACTIVE:
585
+ CollectdMetadata("sm_active", "value:GAUGE:0.0:1.0", True),
586
+ DCGM_FI_PROF_SM_OCCUPANCY:
587
+ CollectdMetadata("sm_occupancy", "value:GAUGE:0:U", True),
588
+ DCGM_FI_PROF_PIPE_TENSOR_ACTIVE:
589
+ CollectdMetadata("tensor_active", "value:GAUGE:0.0:1.0", True),
590
+ DCGM_FI_PROF_DRAM_ACTIVE:
591
+ CollectdMetadata("dram_active", "value:GAUGE:0.0:1.0", True),
592
+ DCGM_FI_PROF_PIPE_FP64_ACTIVE:
593
+ CollectdMetadata("fp64_active", "value:GAUGE:U:U"),
594
+ DCGM_FI_PROF_PIPE_FP32_ACTIVE:
595
+ CollectdMetadata("fp32_active", "value:GAUGE:U:U"),
596
+ DCGM_FI_PROF_PIPE_FP16_ACTIVE:
597
+ CollectdMetadata("fp16_active", "value:GAUGE:U:U"),
598
+ DCGM_FI_PROF_PCIE_TX_BYTES:
599
+ CollectdMetadata("pcie_tx_bytes", "value:GAUGE:U:U"),
600
+ DCGM_FI_PROF_PCIE_RX_BYTES:
601
+ CollectdMetadata("pcie_rx_bytes", "value:GAUGE:U:U"),
602
+ DCGM_FI_PROF_NVLINK_TX_BYTES:
603
+ CollectdMetadata("nvlink_tx_bytes", "value:GAUGE:U:U"),
604
+ DCGM_FI_PROF_NVLINK_RX_BYTES:
605
+ CollectdMetadata("nvlink_rx_bytes", "value:GAUGE:U:U"),
606
+ DCGM_FI_PROF_PIPE_TENSOR_IMMA_ACTIVE:
607
+ CollectdMetadata("tensor_imma_active", "value:GAUGE:0.0:1.0", True),
608
+ DCGM_FI_PROF_PIPE_TENSOR_HMMA_ACTIVE:
609
+ CollectdMetadata("tensor_hmma_active", "value:GAUGE:0.0:1.0", True),
610
+ }
611
+
612
# Module-level cache mapping a metadata entry's name to its key in
# CollectdMetadataDict. It stays None until GetFieldByName() performs its
# first by-name lookup, at which point the mapping is built once and reused.
+ __fieldDict = None
613
+
614
+
615
def GenerateCollectdTypesDB():
    """Print the name/kind table for every used entry of CollectdMetadataDict.

    Each non-empty entry's ``kind`` string is syntax-checked first: it must
    consist of four colon-separated fields and the second field must be one
    of GAUGE, COUNTER, DERIVE or ABSOLUTE. Problems are reported on stderr.
    Entries whose ``used`` flag is truthy are printed to stdout as the name
    (left-justified to the longest name in the table) followed by the kind.

    Raises:
        SystemExit: after all entries were processed, if any entry failed
            the syntax checks.
    """
    valid_types = ('GAUGE', 'COUNTER', 'DERIVE', 'ABSOLUTE')

    # default=0 keeps an empty table from raising ValueError in max().
    length = max(
        (len(item.name) if item else 0
         for item in CollectdMetadataDict.values()),
        default=0)

    fmt = "{0:<" + str(length) + "}"
    fail = False

    for item in CollectdMetadataDict.values():
        # Placeholder (falsy) entries carry no metadata to check or print.
        if not item:
            continue

        item_list = item.kind.split(':')

        # Some rudimentary syntax checking.

        if len(item_list) != 4:
            sys.stderr.write(
                'Item ' + item.name +
                ' has wrong number of collectd type fields - four required.\n')
            fail = True

        # Guard the index: a kind with fewer than two fields has no type
        # field at all and must be reported, not crash with IndexError.
        if len(item_list) < 2 or item_list[1] not in valid_types:
            sys.stderr.write(
                'Item ' + item.name +
                ' should be one of GAUGE, COUNTER, DERIVE, ABSOLUTE.\n')
            fail = True

        # We check this so we can enumerate all dcgm fields for possible
        # inclusion, even if some are not (yet) formally supported.

        if item.used:
            print(fmt.format(item.name), item.kind)

    if fail:
        # sys.exit rather than the site-provided exit(), which is not
        # guaranteed to exist (e.g. under "python -S").
        sys.exit("Failed on db.types table syntax errors.\n")
647
+
648
+
649
def GetFieldByName(name):
    """Resolve a field given either as a decimal string or as an entry name.

    Args:
        name: String holding a decimal number or the ``name`` of an entry
            in CollectdMetadataDict.

    Returns:
        ``int(name)`` when the string consists only of decimal digits;
        otherwise the CollectdMetadataDict key whose entry has that name,
        or -1 when no entry matches.
    """
    global __fieldDict

    # isdecimal() accepts exactly the digit strings int() can parse;
    # isnumeric() also accepts characters such as superscripts and
    # fractions, for which int() raises ValueError.
    if name.isdecimal():
        return int(name)

    # Build the name -> key reverse index once, on the first name lookup.
    if __fieldDict is None:
        __fieldDict = {
            item.name: key
            for key, item in CollectdMetadataDict.items()
            if item is not None
        }

    return __fieldDict.get(name, -1)
668
+
669
+
670
# Script entry point: when this module is executed directly (not imported),
# run GenerateCollectdTypesDB(), which prints the name/kind table to stdout
# and exits with an error message if any entry fails its syntax checks.
+ if __name__ == '__main__':
671
+ GenerateCollectdTypesDB()