amd-node-scraper 0.0.1__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (197) hide show
  1. amd_node_scraper-0.0.1.dist-info/LICENSE +21 -0
  2. amd_node_scraper-0.0.1.dist-info/METADATA +424 -0
  3. amd_node_scraper-0.0.1.dist-info/RECORD +197 -0
  4. amd_node_scraper-0.0.1.dist-info/WHEEL +5 -0
  5. amd_node_scraper-0.0.1.dist-info/entry_points.txt +2 -0
  6. amd_node_scraper-0.0.1.dist-info/top_level.txt +1 -0
  7. nodescraper/__init__.py +32 -0
  8. nodescraper/base/__init__.py +34 -0
  9. nodescraper/base/inbandcollectortask.py +118 -0
  10. nodescraper/base/inbanddataplugin.py +39 -0
  11. nodescraper/base/regexanalyzer.py +120 -0
  12. nodescraper/cli/__init__.py +29 -0
  13. nodescraper/cli/cli.py +511 -0
  14. nodescraper/cli/constants.py +27 -0
  15. nodescraper/cli/dynamicparserbuilder.py +171 -0
  16. nodescraper/cli/helper.py +517 -0
  17. nodescraper/cli/inputargtypes.py +129 -0
  18. nodescraper/configbuilder.py +123 -0
  19. nodescraper/configregistry.py +66 -0
  20. nodescraper/configs/node_status.json +19 -0
  21. nodescraper/connection/__init__.py +25 -0
  22. nodescraper/connection/inband/__init__.py +46 -0
  23. nodescraper/connection/inband/inband.py +171 -0
  24. nodescraper/connection/inband/inbandlocal.py +93 -0
  25. nodescraper/connection/inband/inbandmanager.py +151 -0
  26. nodescraper/connection/inband/inbandremote.py +173 -0
  27. nodescraper/connection/inband/sshparams.py +43 -0
  28. nodescraper/constants.py +26 -0
  29. nodescraper/enums/__init__.py +40 -0
  30. nodescraper/enums/eventcategory.py +89 -0
  31. nodescraper/enums/eventpriority.py +42 -0
  32. nodescraper/enums/executionstatus.py +44 -0
  33. nodescraper/enums/osfamily.py +34 -0
  34. nodescraper/enums/systeminteraction.py +41 -0
  35. nodescraper/enums/systemlocation.py +33 -0
  36. nodescraper/generictypes.py +36 -0
  37. nodescraper/interfaces/__init__.py +44 -0
  38. nodescraper/interfaces/connectionmanager.py +143 -0
  39. nodescraper/interfaces/dataanalyzertask.py +138 -0
  40. nodescraper/interfaces/datacollectortask.py +185 -0
  41. nodescraper/interfaces/dataplugin.py +356 -0
  42. nodescraper/interfaces/plugin.py +127 -0
  43. nodescraper/interfaces/resultcollator.py +56 -0
  44. nodescraper/interfaces/task.py +164 -0
  45. nodescraper/interfaces/taskresulthook.py +39 -0
  46. nodescraper/models/__init__.py +48 -0
  47. nodescraper/models/analyzerargs.py +93 -0
  48. nodescraper/models/collectorargs.py +30 -0
  49. nodescraper/models/connectionconfig.py +34 -0
  50. nodescraper/models/datamodel.py +171 -0
  51. nodescraper/models/datapluginresult.py +39 -0
  52. nodescraper/models/event.py +158 -0
  53. nodescraper/models/pluginconfig.py +38 -0
  54. nodescraper/models/pluginresult.py +39 -0
  55. nodescraper/models/systeminfo.py +44 -0
  56. nodescraper/models/taskresult.py +185 -0
  57. nodescraper/models/timerangeargs.py +38 -0
  58. nodescraper/pluginexecutor.py +274 -0
  59. nodescraper/pluginregistry.py +152 -0
  60. nodescraper/plugins/__init__.py +25 -0
  61. nodescraper/plugins/inband/__init__.py +25 -0
  62. nodescraper/plugins/inband/amdsmi/__init__.py +28 -0
  63. nodescraper/plugins/inband/amdsmi/amdsmi_analyzer.py +821 -0
  64. nodescraper/plugins/inband/amdsmi/amdsmi_collector.py +1313 -0
  65. nodescraper/plugins/inband/amdsmi/amdsmi_plugin.py +43 -0
  66. nodescraper/plugins/inband/amdsmi/amdsmidata.py +1002 -0
  67. nodescraper/plugins/inband/amdsmi/analyzer_args.py +50 -0
  68. nodescraper/plugins/inband/amdsmi/cper.py +65 -0
  69. nodescraper/plugins/inband/bios/__init__.py +29 -0
  70. nodescraper/plugins/inband/bios/analyzer_args.py +64 -0
  71. nodescraper/plugins/inband/bios/bios_analyzer.py +93 -0
  72. nodescraper/plugins/inband/bios/bios_collector.py +93 -0
  73. nodescraper/plugins/inband/bios/bios_plugin.py +43 -0
  74. nodescraper/plugins/inband/bios/biosdata.py +30 -0
  75. nodescraper/plugins/inband/cmdline/__init__.py +25 -0
  76. nodescraper/plugins/inband/cmdline/analyzer_args.py +80 -0
  77. nodescraper/plugins/inband/cmdline/cmdline_analyzer.py +113 -0
  78. nodescraper/plugins/inband/cmdline/cmdline_collector.py +77 -0
  79. nodescraper/plugins/inband/cmdline/cmdline_plugin.py +43 -0
  80. nodescraper/plugins/inband/cmdline/cmdlinedata.py +30 -0
  81. nodescraper/plugins/inband/device_enumeration/__init__.py +29 -0
  82. nodescraper/plugins/inband/device_enumeration/analyzer_args.py +73 -0
  83. nodescraper/plugins/inband/device_enumeration/device_enumeration_analyzer.py +81 -0
  84. nodescraper/plugins/inband/device_enumeration/device_enumeration_collector.py +176 -0
  85. nodescraper/plugins/inband/device_enumeration/device_enumeration_plugin.py +45 -0
  86. nodescraper/plugins/inband/device_enumeration/deviceenumdata.py +36 -0
  87. nodescraper/plugins/inband/dimm/__init__.py +25 -0
  88. nodescraper/plugins/inband/dimm/collector_args.py +31 -0
  89. nodescraper/plugins/inband/dimm/dimm_collector.py +151 -0
  90. nodescraper/plugins/inband/dimm/dimm_plugin.py +40 -0
  91. nodescraper/plugins/inband/dimm/dimmdata.py +30 -0
  92. nodescraper/plugins/inband/dkms/__init__.py +25 -0
  93. nodescraper/plugins/inband/dkms/analyzer_args.py +85 -0
  94. nodescraper/plugins/inband/dkms/dkms_analyzer.py +106 -0
  95. nodescraper/plugins/inband/dkms/dkms_collector.py +76 -0
  96. nodescraper/plugins/inband/dkms/dkms_plugin.py +43 -0
  97. nodescraper/plugins/inband/dkms/dkmsdata.py +33 -0
  98. nodescraper/plugins/inband/dmesg/__init__.py +28 -0
  99. nodescraper/plugins/inband/dmesg/analyzer_args.py +33 -0
  100. nodescraper/plugins/inband/dmesg/collector_args.py +39 -0
  101. nodescraper/plugins/inband/dmesg/dmesg_analyzer.py +503 -0
  102. nodescraper/plugins/inband/dmesg/dmesg_collector.py +164 -0
  103. nodescraper/plugins/inband/dmesg/dmesg_plugin.py +44 -0
  104. nodescraper/plugins/inband/dmesg/dmesgdata.py +116 -0
  105. nodescraper/plugins/inband/fabrics/__init__.py +28 -0
  106. nodescraper/plugins/inband/fabrics/fabrics_collector.py +726 -0
  107. nodescraper/plugins/inband/fabrics/fabrics_plugin.py +37 -0
  108. nodescraper/plugins/inband/fabrics/fabricsdata.py +140 -0
  109. nodescraper/plugins/inband/journal/__init__.py +28 -0
  110. nodescraper/plugins/inband/journal/collector_args.py +33 -0
  111. nodescraper/plugins/inband/journal/journal_collector.py +107 -0
  112. nodescraper/plugins/inband/journal/journal_plugin.py +40 -0
  113. nodescraper/plugins/inband/journal/journaldata.py +44 -0
  114. nodescraper/plugins/inband/kernel/__init__.py +25 -0
  115. nodescraper/plugins/inband/kernel/analyzer_args.py +64 -0
  116. nodescraper/plugins/inband/kernel/kernel_analyzer.py +91 -0
  117. nodescraper/plugins/inband/kernel/kernel_collector.py +129 -0
  118. nodescraper/plugins/inband/kernel/kernel_plugin.py +43 -0
  119. nodescraper/plugins/inband/kernel/kerneldata.py +32 -0
  120. nodescraper/plugins/inband/kernel_module/__init__.py +25 -0
  121. nodescraper/plugins/inband/kernel_module/analyzer_args.py +59 -0
  122. nodescraper/plugins/inband/kernel_module/kernel_module_analyzer.py +211 -0
  123. nodescraper/plugins/inband/kernel_module/kernel_module_collector.py +264 -0
  124. nodescraper/plugins/inband/kernel_module/kernel_module_data.py +60 -0
  125. nodescraper/plugins/inband/kernel_module/kernel_module_plugin.py +43 -0
  126. nodescraper/plugins/inband/memory/__init__.py +25 -0
  127. nodescraper/plugins/inband/memory/analyzer_args.py +45 -0
  128. nodescraper/plugins/inband/memory/memory_analyzer.py +98 -0
  129. nodescraper/plugins/inband/memory/memory_collector.py +330 -0
  130. nodescraper/plugins/inband/memory/memory_plugin.py +43 -0
  131. nodescraper/plugins/inband/memory/memorydata.py +90 -0
  132. nodescraper/plugins/inband/network/__init__.py +28 -0
  133. nodescraper/plugins/inband/network/network_collector.py +1828 -0
  134. nodescraper/plugins/inband/network/network_plugin.py +37 -0
  135. nodescraper/plugins/inband/network/networkdata.py +319 -0
  136. nodescraper/plugins/inband/nvme/__init__.py +28 -0
  137. nodescraper/plugins/inband/nvme/nvme_collector.py +167 -0
  138. nodescraper/plugins/inband/nvme/nvme_plugin.py +37 -0
  139. nodescraper/plugins/inband/nvme/nvmedata.py +45 -0
  140. nodescraper/plugins/inband/os/__init__.py +25 -0
  141. nodescraper/plugins/inband/os/analyzer_args.py +64 -0
  142. nodescraper/plugins/inband/os/os_analyzer.py +73 -0
  143. nodescraper/plugins/inband/os/os_collector.py +131 -0
  144. nodescraper/plugins/inband/os/os_plugin.py +43 -0
  145. nodescraper/plugins/inband/os/osdata.py +31 -0
  146. nodescraper/plugins/inband/package/__init__.py +25 -0
  147. nodescraper/plugins/inband/package/analyzer_args.py +48 -0
  148. nodescraper/plugins/inband/package/package_analyzer.py +253 -0
  149. nodescraper/plugins/inband/package/package_collector.py +273 -0
  150. nodescraper/plugins/inband/package/package_plugin.py +43 -0
  151. nodescraper/plugins/inband/package/packagedata.py +41 -0
  152. nodescraper/plugins/inband/pcie/__init__.py +29 -0
  153. nodescraper/plugins/inband/pcie/analyzer_args.py +63 -0
  154. nodescraper/plugins/inband/pcie/pcie_analyzer.py +1081 -0
  155. nodescraper/plugins/inband/pcie/pcie_collector.py +690 -0
  156. nodescraper/plugins/inband/pcie/pcie_data.py +2017 -0
  157. nodescraper/plugins/inband/pcie/pcie_plugin.py +43 -0
  158. nodescraper/plugins/inband/process/__init__.py +25 -0
  159. nodescraper/plugins/inband/process/analyzer_args.py +45 -0
  160. nodescraper/plugins/inband/process/collector_args.py +31 -0
  161. nodescraper/plugins/inband/process/process_analyzer.py +91 -0
  162. nodescraper/plugins/inband/process/process_collector.py +115 -0
  163. nodescraper/plugins/inband/process/process_plugin.py +46 -0
  164. nodescraper/plugins/inband/process/processdata.py +34 -0
  165. nodescraper/plugins/inband/rocm/__init__.py +25 -0
  166. nodescraper/plugins/inband/rocm/analyzer_args.py +66 -0
  167. nodescraper/plugins/inband/rocm/rocm_analyzer.py +100 -0
  168. nodescraper/plugins/inband/rocm/rocm_collector.py +205 -0
  169. nodescraper/plugins/inband/rocm/rocm_plugin.py +43 -0
  170. nodescraper/plugins/inband/rocm/rocmdata.py +62 -0
  171. nodescraper/plugins/inband/storage/__init__.py +25 -0
  172. nodescraper/plugins/inband/storage/analyzer_args.py +38 -0
  173. nodescraper/plugins/inband/storage/collector_args.py +31 -0
  174. nodescraper/plugins/inband/storage/storage_analyzer.py +152 -0
  175. nodescraper/plugins/inband/storage/storage_collector.py +110 -0
  176. nodescraper/plugins/inband/storage/storage_plugin.py +44 -0
  177. nodescraper/plugins/inband/storage/storagedata.py +70 -0
  178. nodescraper/plugins/inband/sysctl/__init__.py +29 -0
  179. nodescraper/plugins/inband/sysctl/analyzer_args.py +67 -0
  180. nodescraper/plugins/inband/sysctl/sysctl_analyzer.py +81 -0
  181. nodescraper/plugins/inband/sysctl/sysctl_collector.py +101 -0
  182. nodescraper/plugins/inband/sysctl/sysctl_plugin.py +43 -0
  183. nodescraper/plugins/inband/sysctl/sysctldata.py +42 -0
  184. nodescraper/plugins/inband/syslog/__init__.py +28 -0
  185. nodescraper/plugins/inband/syslog/syslog_collector.py +121 -0
  186. nodescraper/plugins/inband/syslog/syslog_plugin.py +37 -0
  187. nodescraper/plugins/inband/syslog/syslogdata.py +46 -0
  188. nodescraper/plugins/inband/uptime/__init__.py +25 -0
  189. nodescraper/plugins/inband/uptime/uptime_collector.py +88 -0
  190. nodescraper/plugins/inband/uptime/uptime_plugin.py +37 -0
  191. nodescraper/plugins/inband/uptime/uptimedata.py +31 -0
  192. nodescraper/resultcollators/__init__.py +25 -0
  193. nodescraper/resultcollators/tablesummary.py +159 -0
  194. nodescraper/taskresulthooks/__init__.py +28 -0
  195. nodescraper/taskresulthooks/filesystemloghook.py +88 -0
  196. nodescraper/typeutils.py +171 -0
  197. nodescraper/utils.py +412 -0
@@ -0,0 +1,821 @@
1
+ ###############################################################################
2
+ #
3
+ # MIT License
4
+ #
5
+ # Copyright (c) 2025 Advanced Micro Devices, Inc.
6
+ #
7
+ # Permission is hereby granted, free of charge, to any person obtaining a copy
8
+ # of this software and associated documentation files (the "Software"), to deal
9
+ # in the Software without restriction, including without limitation the rights
10
+ # to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
11
+ # copies of the Software, and to permit persons to whom the Software is
12
+ # furnished to do so, subject to the following conditions:
13
+ #
14
+ # The above copyright notice and this permission notice shall be included in all
15
+ # copies or substantial portions of the Software.
16
+ #
17
+ # THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
18
+ # IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
19
+ # FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
20
+ # AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
21
+ # LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
22
+ # OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
23
+ # SOFTWARE.
24
+ #
25
+ ###############################################################################
26
+ import io
27
+ from collections import defaultdict
28
+ from typing import Any, Optional, Union
29
+
30
+ from nodescraper.enums import EventCategory, EventPriority
31
+ from nodescraper.interfaces import DataAnalyzer
32
+ from nodescraper.models import TaskResult
33
+
34
+ from .amdsmidata import (
35
+ AmdSmiDataModel,
36
+ AmdSmiMetric,
37
+ AmdSmiStatic,
38
+ AmdSmiTstData,
39
+ EccData,
40
+ Fw,
41
+ Partition,
42
+ Processes,
43
+ XgmiMetrics,
44
+ )
45
+ from .analyzer_args import AmdSmiAnalyzerArgs
46
+ from .cper import CperAnalysisTaskMixin
47
+
48
+
49
+ class AmdSmiAnalyzer(CperAnalysisTaskMixin, DataAnalyzer[AmdSmiDataModel, None]):
50
+ """Check AMD SMI Application data for PCIe, ECC errors, CPER data, and analyze amdsmitst metrics"""
51
+
52
+ DATA_MODEL = AmdSmiDataModel
53
+
54
def check_expected_max_power(
    self,
    amdsmi_static_data: list[AmdSmiStatic],
    expected_max_power: int,
) -> None:
    """Check every GPU's max power limit against the expected value.

    Logs a WARNING event for each GPU that has no max power limit, an ERROR
    event for each GPU whose limit value cannot be converted to a number, and
    one combined ERROR event listing all GPUs whose numeric limit differs from
    ``expected_max_power``.

    Args:
        amdsmi_static_data (list[AmdSmiStatic]): AmdSmiStatic data model
        expected_max_power (int): expected max power
    """
    incorrect_max_power_gpus: dict[int, Union[int, str, float]] = {}
    for gpu in amdsmi_static_data:
        if gpu.limit is None or gpu.limit.max_power is None:
            self._log_event(
                category=EventCategory.PLATFORM,
                description=f"GPU: {gpu.gpu} has no max power limit set",
                priority=EventPriority.WARNING,
                data={"gpu": gpu.gpu},
            )
            continue

        max_power_value = gpu.limit.max_power.value
        try:
            max_power_float = float(max_power_value)
        except (ValueError, TypeError):
            # float() raises ValueError for unparsable strings but TypeError for
            # None and other non-numeric types; both mean the limit is invalid
            # and must be reported rather than aborting the whole check.
            self._log_event(
                category=EventCategory.PLATFORM,
                description=f"GPU: {gpu.gpu} has an invalid max power limit",
                priority=EventPriority.ERROR,
                data={
                    "gpu": gpu.gpu,
                    "max_power_value": max_power_value,
                },
            )
            continue

        if max_power_float != expected_max_power:
            incorrect_max_power_gpus[gpu.gpu] = max_power_float

    # Report all mismatching GPUs in a single event.
    if incorrect_max_power_gpus:
        self._log_event(
            category=EventCategory.PLATFORM,
            description="Max power mismatch",
            priority=EventPriority.ERROR,
            data={
                "gpus": list(incorrect_max_power_gpus.keys()),
                "max_power_values": incorrect_max_power_gpus,
                "expected_max_power": expected_max_power,
            },
        )
102
+
103
def check_expected_driver_version(
    self,
    amdsmi_static_data: list[AmdSmiStatic],
    expected_driver_version: str,
) -> None:
    """Check expected driver version

    A GPU without driver information is treated as having version ``None``,
    which never equals the expected version. All mismatching GPUs are
    reported together in one ERROR event.

    Args:
        amdsmi_static_data (list[AmdSmiStatic]): AmdSmiStatic data model
        expected_driver_version (str): expected driver version
    """
    observed: dict[int, Optional[str]] = {}
    mismatched: list[int] = []

    for entry in amdsmi_static_data:
        found: Optional[str] = entry.driver.version if entry.driver is not None else None
        observed[entry.gpu] = found
        if found != expected_driver_version:
            mismatched.append(entry.gpu)

    if not mismatched:
        return

    self._log_event(
        category=EventCategory.PLATFORM,
        description="Driver Version Mismatch",
        priority=EventPriority.ERROR,
        data={
            "gpus": mismatched,
            "driver_version": {gpu_id: observed[gpu_id] for gpu_id in mismatched},
            "expected_driver_version": expected_driver_version,
        },
    )
136
+
137
def check_amdsmi_metric_pcie(
    self,
    amdsmi_metric_data: list[AmdSmiMetric],
    l0_to_recovery_count_error_threshold: int,
    l0_to_recovery_count_warning_threshold: int,
) -> None:
    """Check PCIe metrics for link errors

    Checks for PCIe link width, speed, replays, recoveries, and NAKs.
    Expected width/speeds should come from SKU info; they are currently
    fixed at x16 and 32 GT/s inside this method.

    Per GPU, the following events are logged:
      * ERROR when the link width is set and is not 16.
      * ERROR when the link speed is numeric and is not 32.0.
      * WARNING when the replay or replay rollover count is above zero.
      * ERROR when the L0-to-recovery count exceeds the error threshold,
        otherwise WARNING when it exceeds the warning threshold.
      * WARNING when the sent or received NAK count is above zero.

    Metrics that are ``None`` are skipped.

    Args:
        amdsmi_metric_data (list[AmdSmiMetric]): AmdSmiMetric data model
        l0_to_recovery_count_error_threshold (int): Threshold for error events
        l0_to_recovery_count_warning_threshold (int): Threshold for warning events
    """
    expected_width = 16
    expected_speed = 32.0

    for metric in amdsmi_metric_data:
        pcie_data = metric.pcie
        gpu = metric.gpu

        if pcie_data.width is not None and pcie_data.width != expected_width:
            self._log_event(
                category=EventCategory.IO,
                description=f"GPU: {gpu} PCIe width is not x16",
                priority=EventPriority.ERROR,
                data={"gpu": gpu, "pcie_width": pcie_data.width, "expected": expected_width},
                console_log=True,
            )

        if pcie_data.speed is not None and pcie_data.speed.value is not None:
            # Only the conversion is guarded: a speed value that cannot be
            # parsed is deliberately skipped, but an error raised while
            # logging the event must not be swallowed with it.
            try:
                speed_val: Optional[float] = float(pcie_data.speed.value)
            except (ValueError, TypeError):
                speed_val = None
            if speed_val is not None and speed_val != expected_speed:
                self._log_event(
                    category=EventCategory.IO,
                    description=f"GPU: {gpu} PCIe link speed is not Gen5 (32 GT/s)",
                    priority=EventPriority.ERROR,
                    data={"gpu": gpu, "pcie_speed": speed_val, "expected": expected_speed},
                    console_log=True,
                )

        if pcie_data.replay_count is not None and pcie_data.replay_count > 0:
            self._log_event(
                category=EventCategory.IO,
                description=f"GPU: {gpu} has PCIe replay count: {pcie_data.replay_count}",
                priority=EventPriority.WARNING,
                data={"gpu": gpu, "replay_count": pcie_data.replay_count},
                console_log=True,
            )

        if (
            pcie_data.replay_roll_over_count is not None
            and pcie_data.replay_roll_over_count > 0
        ):
            self._log_event(
                category=EventCategory.IO,
                description=f"GPU: {gpu} has PCIe replay rollover count: {pcie_data.replay_roll_over_count}",
                priority=EventPriority.WARNING,
                data={"gpu": gpu, "replay_roll_over_count": pcie_data.replay_roll_over_count},
                console_log=True,
            )

        if pcie_data.l0_to_recovery_count is not None:
            # The error threshold is tested first so a count above both
            # thresholds produces exactly one (ERROR) event.
            if pcie_data.l0_to_recovery_count > l0_to_recovery_count_error_threshold:
                self._log_event(
                    category=EventCategory.IO,
                    description=f"GPU: {gpu} has {pcie_data.l0_to_recovery_count} L0 recoveries",
                    priority=EventPriority.ERROR,
                    data={
                        "gpu": gpu,
                        "l0_to_recovery_count": pcie_data.l0_to_recovery_count,
                        "error_threshold": l0_to_recovery_count_error_threshold,
                    },
                    console_log=True,
                )
            elif pcie_data.l0_to_recovery_count > l0_to_recovery_count_warning_threshold:
                self._log_event(
                    category=EventCategory.IO,
                    description=f"GPU: {gpu} has {pcie_data.l0_to_recovery_count} L0 recoveries",
                    priority=EventPriority.WARNING,
                    data={
                        "gpu": gpu,
                        "l0_to_recovery_count": pcie_data.l0_to_recovery_count,
                        "warning_threshold": l0_to_recovery_count_warning_threshold,
                    },
                    console_log=True,
                )

        if pcie_data.nak_sent_count is not None and pcie_data.nak_sent_count > 0:
            self._log_event(
                category=EventCategory.IO,
                description=f"GPU: {gpu} has sent {pcie_data.nak_sent_count} PCIe NAKs",
                priority=EventPriority.WARNING,
                data={"gpu": gpu, "nak_sent_count": pcie_data.nak_sent_count},
                console_log=True,
            )

        if pcie_data.nak_received_count is not None and pcie_data.nak_received_count > 0:
            self._log_event(
                category=EventCategory.IO,
                description=f"GPU: {gpu} has received {pcie_data.nak_received_count} PCIe NAKs",
                priority=EventPriority.WARNING,
                data={"gpu": gpu, "nak_received_count": pcie_data.nak_received_count},
                console_log=True,
            )
244
+
245
def check_amdsmi_metric_ecc_totals(self, amdsmi_metric_data: list[AmdSmiMetric]):
    """Check the aggregate ECC counters of every GPU

    Uncorrectable counters (total and cache) are reported as errors, while
    correctable and deferred counters are reported as warnings. A counter
    produces an event only when it is set and greater than zero.

    Args:
        amdsmi_metric_data (list[AmdSmiMetric]): AmdSmiMetric data model
    """
    for metric in amdsmi_metric_data:
        totals = metric.ecc
        gpu_id = metric.gpu

        # (severity, counter value, label used in the event text)
        counters: tuple[tuple[EventPriority, Optional[int], str], ...] = (
            (EventPriority.WARNING, totals.total_correctable_count, "Total correctable ECC errors"),
            (EventPriority.ERROR, totals.total_uncorrectable_count, "Total uncorrectable ECC errors"),
            (EventPriority.WARNING, totals.total_deferred_count, "Total deferred ECC errors"),
            (EventPriority.WARNING, totals.cache_correctable_count, "Cache correctable ECC errors"),
            (EventPriority.ERROR, totals.cache_uncorrectable_count, "Cache uncorrectable ECC errors"),
        )

        for severity, value, label in counters:
            if value is None or not value > 0:
                continue
            self._log_event(
                category=EventCategory.RAS,
                description=f"GPU: {gpu_id} has {label}: {value}",
                priority=severity,
                data={"gpu": gpu_id, "error_count": value, "error_type": label},
                console_log=True,
            )
294
+
295
def check_amdsmi_metric_ecc(self, amdsmi_metric_data: list[AmdSmiMetric]):
    """Check the per-block ECC counters of every GPU

    For each block, an uncorrectable count above zero is reported as an
    error; correctable and deferred counts above zero are reported as
    warnings. GPUs whose block data is a string (e.g., "N/A") or empty, and
    block entries that are not ``EccData`` instances, are skipped.

    Args:
        amdsmi_metric_data (list[AmdSmiMetric]): AmdSmiMetric data model
    """
    # (attribute / data key, word used in the event text); order fixes the
    # order in which events are emitted for a block.
    counter_kinds = (
        ("correctable_count", "correctable"),
        ("uncorrectable_count", "uncorrectable"),
        ("deferred_count", "deferred"),
    )

    for metric in amdsmi_metric_data:
        gpu_id = metric.gpu
        blocks = metric.ecc_blocks

        if isinstance(blocks, str) or not blocks:
            continue

        for block_name, block_counts in blocks.items():
            if not isinstance(block_counts, EccData):
                continue

            for field_name, kind in counter_kinds:
                count = getattr(block_counts, field_name)
                if count is None or not count > 0:
                    continue
                if kind == "uncorrectable":
                    severity = EventPriority.ERROR
                else:
                    severity = EventPriority.WARNING
                self._log_event(
                    category=EventCategory.RAS,
                    description=f"GPU: {gpu_id} has {kind} ECC errors in block {block_name}",
                    priority=severity,
                    data={
                        "gpu": gpu_id,
                        "block": block_name,
                        field_name: count,
                    },
                    console_log=True,
                )
353
+
354
def expected_gpu_processes(
    self, processes_data: Optional[list[Processes]], max_num_processes: int
):
    """Verify that no GPU runs more processes than allowed

    Logs a WARNING event and returns when no process data is supplied.
    Otherwise, every GPU whose process count is above ``max_num_processes``
    is collected and reported in one ERROR event. A GPU is ignored when its
    process list is empty or when the first entry's ``process_info`` is a
    string.

    Args:
        processes_data (Optional[list[Processes]]): list of processes per GPU
        max_num_processes (int): max number of expected processes
    """
    if not processes_data:
        self._log_event(
            category=EventCategory.PLATFORM,
            description="No GPU processes data available",
            priority=EventPriority.WARNING,
            data={"processes_data": processes_data},
            console_log=True,
        )
        return

    over_limit: dict[int, int] = {}
    for entry in processes_data:
        running = entry.process_list
        running_count = len(running)
        # Nothing to count for this GPU.
        if running_count == 0 or isinstance(running[0].process_info, str):
            continue
        if running_count > max_num_processes:
            over_limit[entry.gpu] = running_count

    if not over_limit:
        return

    self._log_event(
        category=EventCategory.PLATFORM,
        description="Number of processes exceeds max processes",
        priority=EventPriority.ERROR,
        data={
            "gpu_exceeds_num_processes": over_limit,
        },
        console_log=True,
    )
394
+
395
def static_consistancy_check(self, amdsmi_static_data: list[AmdSmiStatic]):
    """Check that ASIC properties are identical on every GPU

    The distinct values of each ASIC field are gathered across all GPUs; a
    WARNING event is logged for every field that has more than one distinct
    value.

    Args:
        amdsmi_static_data (list[AmdSmiStatic]): AmdSmiStatic data model
    """
    asics = [gpu.asic for gpu in amdsmi_static_data]

    # Fields compared using their raw values.
    raw_fields = (
        "market_name",
        "vendor_id",
        "vendor_name",
        "subvendor_id",
        "subsystem_id",
        "device_id",
        "rev_id",
    )
    distinct_values: dict[str, Union[set[str], set[int]]] = {
        name: {getattr(asic, name) for asic in asics} for name in raw_fields
    }
    # The compute unit count is compared by its string form.
    distinct_values["num_compute_units"] = {str(asic.num_compute_units) for asic in asics}
    distinct_values["target_graphics_version"] = {
        asic.target_graphics_version for asic in asics
    }

    for field_name, values in distinct_values.items():
        if len(values) <= 1:
            continue
        self._log_event(
            category=EventCategory.PLATFORM,
            description=f"{field_name} is not consistent across all GPUs",
            priority=EventPriority.WARNING,
            data={
                "field": field_name,
                "non_consistent_values": values,
            },
        )
425
+
426
def check_static_data(
    self,
    amdsmi_static_data: list[AmdSmiStatic],
    vendor_id: Optional[str],
    subvendor_id: Optional[str],
    device_id: tuple[Optional[str], Optional[str]],
    subsystem_id: tuple[Optional[str], Optional[str]],
    sku_name: Optional[str],
) -> None:
    """Check expected static data

    For every GPU the ASIC fields are compared with the expected values using
    substring matching. Expected values that are ``None`` are not checked.
    The vendor name is always expected to contain
    "Advanced Micro Devices Inc". ``device_id`` and ``subsystem_id`` each
    carry up to two acceptable values; the comparison is case-insensitive and
    passes when any supplied value is contained in the collected ID. All
    mismatches are reported in a single ERROR event.

    Args:
        amdsmi_static_data (list[AmdSmiStatic]): AmdSmiStatic data
        vendor_id (Optional[str]): expected vendor_id
        subvendor_id (Optional[str]): expected subvendor_id
        device_id (tuple[Optional[str], Optional[str]]): expected device_id
        subsystem_id (tuple[Optional[str], Optional[str]]): expected subsystem_id
        sku_name (Optional[str]): expected sku_name
    """

    def unmatched_ids(
        candidates: tuple[Optional[str], Optional[str]], actual: str
    ) -> Optional[str]:
        """Return the "a|b" expected string if no candidate matches, else None."""
        # Check whichever candidates were supplied, so that a tuple with only
        # one value set is still enforced instead of being skipped silently.
        accepted = [candidate for candidate in candidates if candidate is not None]
        if not accepted:
            return None
        actual_upper = actual.upper()
        if any(candidate.upper() in actual_upper for candidate in accepted):
            return None
        return "|".join(accepted)

    mismatches: list[tuple[int, str, str, str]] = []

    expected_data: dict[str, Optional[str]] = {
        "vendor_id": vendor_id,
        "subvendor_id": subvendor_id,
        "vendor_name": "Advanced Micro Devices Inc",
        "market_name": sku_name,
    }

    for gpu_data in amdsmi_static_data:
        asic = gpu_data.asic
        collected_data: dict[str, str] = {
            "vendor_id": asic.vendor_id,
            "subvendor_id": asic.subvendor_id,
            "vendor_name": asic.vendor_name,
            "market_name": asic.market_name,
        }

        for key, expected in expected_data.items():
            if expected is None:
                continue
            actual = collected_data[key]
            if expected not in actual:
                mismatches.append((gpu_data.gpu, key, expected, actual))
                # NOTE(review): only the first mismatching field of this group
                # is recorded per GPU -- confirm this is intended.
                break

        id_checks = (
            ("device_id", device_id, asic.device_id),
            ("subsystem_id", subsystem_id, asic.subsystem_id),
        )
        for field_name, candidates, actual_id in id_checks:
            expected_ids = unmatched_ids(candidates, actual_id)
            if expected_ids is not None:
                mismatches.append((gpu_data.gpu, field_name, expected_ids, actual_id))

    if mismatches:
        payload = self._format_static_mismatch_payload(mismatches)
        self._log_event(
            category=EventCategory.PLATFORM,
            description="amd-smi static data mismatch",
            priority=EventPriority.ERROR,
            data=payload,
        )
504
+
505
+ def _format_static_mismatch_payload(
506
+ self,
507
+ mismatches: list[tuple[int, str, str, str]],
508
+ ) -> dict[str, Any]:
509
+ """Helper function for pretty printing mismatch in expected data
510
+
511
+ Args:
512
+ mismatches (list[tuple[int, str, str, str]]): mismatched data per GPU
513
+
514
+ Returns:
515
+ dict[str, Any]: dict of mismatched data per GPU
516
+ """
517
+ per_gpu: dict[int, list[dict[str, str]]] = defaultdict(list)
518
+ field_set: set[str] = set()
519
+
520
+ for gpu, field, expected, actual in mismatches:
521
+ field_set.add(field)
522
+ per_gpu[gpu].append({"field": field, "expected": expected, "actual": actual})
523
+
524
+ per_gpu_list: list[dict[str, Any]] = [
525
+ {"gpu": gpu, "mismatches": entries}
526
+ for gpu, entries in sorted(per_gpu.items(), key=lambda kv: kv[0])
527
+ ]
528
+
529
+ return {
530
+ "summary": {
531
+ "gpus_affected": len(per_gpu),
532
+ "fields": sorted(field_set),
533
+ "total_mismatches": sum(len(v) for v in per_gpu.values()),
534
+ },
535
+ "per_gpu": per_gpu_list,
536
+ }
537
+
538
def check_pldm_version(
    self,
    amdsmi_fw_data: Optional[list[Fw]],
    expected_pldm_version: Optional[str],
):
    """Check the PLDM bundle firmware version of every GPU

    Logs a WARNING event and returns when no firmware data is supplied.
    Otherwise, the first "PLDM_BUNDLE" entry in each GPU's firmware list is
    compared with the expected version. GPUs with a different version, and
    GPUs with no such entry (or whose firmware list is a string), are
    reported together in one ERROR event.

    Args:
        amdsmi_fw_data (Optional[list[Fw]]): data model
        expected_pldm_version (Optional[str]): expected pldm version
    """
    bundle_id = "PLDM_BUNDLE"

    if not amdsmi_fw_data:
        self._log_event(
            category=EventCategory.PLATFORM,
            description="No AMD SMI firmware data available",
            priority=EventPriority.WARNING,
            data={"amdsmi_fw_data": amdsmi_fw_data},
        )
        return

    wrong_version: list[int] = []
    bundle_absent: list[int] = []

    for entry in amdsmi_fw_data:
        gpu_id = entry.gpu
        firmware = entry.fw_list

        # A string in place of the list means there is nothing to inspect.
        if isinstance(firmware, str):
            bundle_absent.append(gpu_id)
            continue

        # Only the first matching entry is considered.
        bundle = next((item for item in firmware if bundle_id == item.fw_id), None)
        if bundle is None:
            bundle_absent.append(gpu_id)
        elif expected_pldm_version != bundle.fw_version:
            wrong_version.append(gpu_id)

    if not (wrong_version or bundle_absent):
        return

    self._log_event(
        category=EventCategory.FW,
        description="PLDM Version Mismatch",
        priority=EventPriority.ERROR,
        data={
            "mismatched_gpus": wrong_version,
            "pldm_missing_gpus": bundle_absent,
            "expected_pldm_version": expected_pldm_version,
        },
    )
584
+
585
def check_expected_memory_partition_mode(
    self,
    partition_data: Optional[Partition],
    expected_memory_partition_mode: Optional[str],
    expected_compute_partition_mode: Optional[str],
):
    """Check the memory and compute partition modes against the expected modes.

    Despite the method name, both partition kinds are verified. Every entry
    whose ``partition_type`` differs from the corresponding expected mode is
    collected, and a single ERROR event is logged if any were found. An
    expected mode of ``None`` disables that comparison. A WARNING event is
    logged, and nothing is compared, when no partition data is supplied.

    Args:
        partition_data (Optional[Partition]): data model
        expected_memory_partition_mode (Optional[str]): expected mem partition mode
        expected_compute_partition_mode (Optional[str]): expected compute partition mode
    """
    if partition_data is None:
        self._log_event(
            category=EventCategory.PLATFORM,
            description="AMD SMI Partition data not available",
            priority=EventPriority.WARNING,
        )
        return

    # Memory mismatches are listed first, then compute mismatches.
    partition_mismatches: list[dict] = []

    # The "is an expectation configured" test does not depend on the entry,
    # so it is evaluated once instead of once per partition entry.
    if expected_memory_partition_mode is not None:
        partition_mismatches.extend(
            {
                "gpu_id": memory_current.gpu_id,
                "memory_partition_mode": memory_current.partition_type,
            }
            for memory_current in partition_data.memory_partition
            if memory_current.partition_type != expected_memory_partition_mode
        )

    if expected_compute_partition_mode is not None:
        partition_mismatches.extend(
            {
                "gpu_id": compute_current.gpu_id,
                "compute_partition_mode": compute_current.partition_type,
            }
            for compute_current in partition_data.compute_partition
            if compute_current.partition_type != expected_compute_partition_mode
        )

    if partition_mismatches:
        self._log_event(
            category=EventCategory.PLATFORM,
            description="Partition Mode Mismatch",
            priority=EventPriority.ERROR,
            data={
                "actual_partition_data": partition_mismatches,
                "expected_memory_partition_mode": expected_memory_partition_mode,
                "expected_compute_partition_mode": expected_compute_partition_mode,
            },
        )
641
+
642
def check_expected_xgmi_link_speed(
    self,
    xgmi_metric: Optional[list[XgmiMetrics]],
    expected_xgmi_speed: Optional[list[float]] = None,
):
    """Validate the XGMI link bit rate reported for each GPU.

    Logs a WARNING and returns when either the metrics or the expected speeds
    are missing or empty. Otherwise each GPU's bit rate is converted to a float
    and an ERROR event is logged when it is absent, is not a valid number, or
    is not one of the expected speeds.

    Args:
        xgmi_metric (Optional[list[XgmiMetrics]]): XGMI metrics data
        expected_xgmi_speed (Optional[list[float]]): List of expected XGMI speeds (GT/s)
    """
    if not xgmi_metric:
        self._log_event(
            category=EventCategory.IO,
            description="XGMI link speed data is not available and cannot be checked",
            priority=EventPriority.WARNING,
            data={"xgmi_metric": xgmi_metric},
        )
        return

    if not expected_xgmi_speed:
        self._log_event(
            category=EventCategory.IO,
            description="Expected XGMI speed not configured, skipping XGMI link speed check",
            priority=EventPriority.WARNING,
        )
        return

    def report_error(description, payload, **extra):
        # Every per-GPU finding below is an IO-category ERROR event.
        self._log_event(
            category=EventCategory.IO,
            description=description,
            priority=EventPriority.ERROR,
            data=payload,
            **extra,
        )

    for gpu_xgmi in xgmi_metric:
        links = gpu_xgmi.link_metrics
        try:
            if links.bit_rate is None or links.bit_rate.value is None:
                # Note: the unit (not the value) is what gets recorded here.
                report_error(
                    "XGMI link speed is not available",
                    {
                        "gpu": gpu_xgmi.gpu,
                        "xgmi_bit_rate": (links.bit_rate.unit if links.bit_rate else "N/A"),
                    },
                )
                continue

            observed_speed = float(links.bit_rate.value)
        except ValueError:
            report_error(
                "XGMI link speed is not a valid number",
                {
                    "gpu": gpu_xgmi.gpu,
                    "xgmi_bit_rate": (links.bit_rate.value if links.bit_rate else "N/A"),
                },
            )
            continue
        else:
            # Exact membership test against the configured speeds.
            if observed_speed not in expected_xgmi_speed:
                report_error(
                    "XGMI link speed is not as expected",
                    {
                        "gpu": gpu_xgmi.gpu,
                        "xgmi_bit_rate": observed_speed,
                        "expected_xgmi_speed": expected_xgmi_speed,
                    },
                    console_log=True,
                )
714
+
715
def check_amdsmitst(self, amdsmitst_data: AmdSmiTstData):
    """Report failed amdsmitst tests, if any.

    Logs one ERROR event (also sent to the console) carrying the failure count
    and the failed tests when the failure count is greater than zero.

    Args:
        amdsmitst_data (AmdSmiTstData): AMD SMI test data
    """
    failure_count = amdsmitst_data.failed_test_count
    if not failure_count > 0:
        return

    event_payload = {
        "failed_test_count": failure_count,
        "failed_tests": amdsmitst_data.failed_tests,
    }
    self._log_event(
        category=EventCategory.APPLICATION,
        description=f"{failure_count} failed tests running amdsmitst",
        priority=EventPriority.ERROR,
        data=event_payload,
        console_log=True,
    )
732
+
733
def analyze_data(
    self, data: AmdSmiDataModel, args: Optional[AmdSmiAnalyzerArgs] = None
) -> TaskResult:
    """Analyze the amdsmi data against expected data

    Runs each individual check whose input data is present and whose
    expectation is configured in ``args``. Findings are recorded through the
    individual checks; this method only dispatches to them.

    Args:
        data (AmdSmiDataModel): the AmdSmi data model
        args (Optional[AmdSmiAnalyzerArgs], optional): optional AmdSmi analyzer args.
            Defaults to None, in which case a default AmdSmiAnalyzerArgs is used.

    Returns:
        TaskResult: the result of the analysis indicating whether the AmdSmi data model
        matched the expected data
    """
    if args is None:
        args = AmdSmiAnalyzerArgs()

    if data.metric is not None and len(data.metric) > 0:
        if args.l0_to_recovery_count_error_threshold is not None:
            self.check_amdsmi_metric_pcie(
                data.metric,
                args.l0_to_recovery_count_error_threshold,
                # A falsy warning threshold (None or 0) falls back to 1.
                args.l0_to_recovery_count_warning_threshold or 1,
            )
        self.check_amdsmi_metric_ecc_totals(data.metric)
        self.check_amdsmi_metric_ecc(data.metric)

    if args.expected_gpu_processes:
        self.expected_gpu_processes(data.process, args.expected_gpu_processes)

    if data.static is None or len(data.static) == 0:
        self._log_event(
            category=EventCategory.PLATFORM,
            description="No AMD SMI static data available",
            priority=EventPriority.WARNING,
            data={"amdsmi_static_data": data.static},
        )
    else:
        if args.expected_max_power:
            self.check_expected_max_power(data.static, args.expected_max_power)
        if args.expected_driver_version:
            self.check_expected_driver_version(data.static, args.expected_driver_version)

        self.static_consistancy_check(data.static)

        # The previous condition was
        #   (sku and devid_ep and devid_ep_vf and vendorid_ep and check_static_data)
        #   or check_static_data
        # which is logically identical to check_static_data alone, so only
        # that flag is tested here.
        # NOTE(review): if the SKU / device-id requirements were meant to gate
        # this check, the original condition never enforced them - confirm intent.
        if args.check_static_data:
            # NOTE(review): vendorid_ep is passed twice and devid_ep four
            # times, while devid_ep_vf is never passed. This looks like a
            # copy-paste slip; verify against check_static_data's signature.
            self.check_static_data(
                data.static,
                args.vendorid_ep,
                args.vendorid_ep,
                (args.devid_ep, args.devid_ep),
                (args.devid_ep, args.devid_ep),
                sku_name=args.sku_name,
            )

    if args.expected_memory_partition_mode or args.expected_compute_partition_mode:
        self.check_expected_memory_partition_mode(
            data.partition,
            args.expected_memory_partition_mode,
            args.expected_compute_partition_mode,
        )

    if args.expected_pldm_version:
        self.check_pldm_version(data.firmware, args.expected_pldm_version)

    if data.cper_data:
        # Each CPER file is handed over as an in-memory binary stream keyed
        # by its file name.
        self.analyzer_cpers(
            {
                file_model_obj.file_name: io.BytesIO(file_model_obj.file_contents)
                for file_model_obj in data.cper_data
            },
            analysis_range_start=args.analysis_range_start,
            analysis_range_end=args.analysis_range_end,
        )

    if data.xgmi_metric and len(data.xgmi_metric) > 0:
        self.check_expected_xgmi_link_speed(
            data.xgmi_metric, expected_xgmi_speed=args.expected_xgmi_speed
        )

    if data.amdsmitst_data and data.amdsmitst_data.failed_test_count > 0:
        self.check_amdsmitst(data.amdsmitst_data)

    return self.result