amd-node-scraper 0.0.1__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- amd_node_scraper-0.0.1.dist-info/LICENSE +21 -0
- amd_node_scraper-0.0.1.dist-info/METADATA +424 -0
- amd_node_scraper-0.0.1.dist-info/RECORD +197 -0
- amd_node_scraper-0.0.1.dist-info/WHEEL +5 -0
- amd_node_scraper-0.0.1.dist-info/entry_points.txt +2 -0
- amd_node_scraper-0.0.1.dist-info/top_level.txt +1 -0
- nodescraper/__init__.py +32 -0
- nodescraper/base/__init__.py +34 -0
- nodescraper/base/inbandcollectortask.py +118 -0
- nodescraper/base/inbanddataplugin.py +39 -0
- nodescraper/base/regexanalyzer.py +120 -0
- nodescraper/cli/__init__.py +29 -0
- nodescraper/cli/cli.py +511 -0
- nodescraper/cli/constants.py +27 -0
- nodescraper/cli/dynamicparserbuilder.py +171 -0
- nodescraper/cli/helper.py +517 -0
- nodescraper/cli/inputargtypes.py +129 -0
- nodescraper/configbuilder.py +123 -0
- nodescraper/configregistry.py +66 -0
- nodescraper/configs/node_status.json +19 -0
- nodescraper/connection/__init__.py +25 -0
- nodescraper/connection/inband/__init__.py +46 -0
- nodescraper/connection/inband/inband.py +171 -0
- nodescraper/connection/inband/inbandlocal.py +93 -0
- nodescraper/connection/inband/inbandmanager.py +151 -0
- nodescraper/connection/inband/inbandremote.py +173 -0
- nodescraper/connection/inband/sshparams.py +43 -0
- nodescraper/constants.py +26 -0
- nodescraper/enums/__init__.py +40 -0
- nodescraper/enums/eventcategory.py +89 -0
- nodescraper/enums/eventpriority.py +42 -0
- nodescraper/enums/executionstatus.py +44 -0
- nodescraper/enums/osfamily.py +34 -0
- nodescraper/enums/systeminteraction.py +41 -0
- nodescraper/enums/systemlocation.py +33 -0
- nodescraper/generictypes.py +36 -0
- nodescraper/interfaces/__init__.py +44 -0
- nodescraper/interfaces/connectionmanager.py +143 -0
- nodescraper/interfaces/dataanalyzertask.py +138 -0
- nodescraper/interfaces/datacollectortask.py +185 -0
- nodescraper/interfaces/dataplugin.py +356 -0
- nodescraper/interfaces/plugin.py +127 -0
- nodescraper/interfaces/resultcollator.py +56 -0
- nodescraper/interfaces/task.py +164 -0
- nodescraper/interfaces/taskresulthook.py +39 -0
- nodescraper/models/__init__.py +48 -0
- nodescraper/models/analyzerargs.py +93 -0
- nodescraper/models/collectorargs.py +30 -0
- nodescraper/models/connectionconfig.py +34 -0
- nodescraper/models/datamodel.py +171 -0
- nodescraper/models/datapluginresult.py +39 -0
- nodescraper/models/event.py +158 -0
- nodescraper/models/pluginconfig.py +38 -0
- nodescraper/models/pluginresult.py +39 -0
- nodescraper/models/systeminfo.py +44 -0
- nodescraper/models/taskresult.py +185 -0
- nodescraper/models/timerangeargs.py +38 -0
- nodescraper/pluginexecutor.py +274 -0
- nodescraper/pluginregistry.py +152 -0
- nodescraper/plugins/__init__.py +25 -0
- nodescraper/plugins/inband/__init__.py +25 -0
- nodescraper/plugins/inband/amdsmi/__init__.py +28 -0
- nodescraper/plugins/inband/amdsmi/amdsmi_analyzer.py +821 -0
- nodescraper/plugins/inband/amdsmi/amdsmi_collector.py +1313 -0
- nodescraper/plugins/inband/amdsmi/amdsmi_plugin.py +43 -0
- nodescraper/plugins/inband/amdsmi/amdsmidata.py +1002 -0
- nodescraper/plugins/inband/amdsmi/analyzer_args.py +50 -0
- nodescraper/plugins/inband/amdsmi/cper.py +65 -0
- nodescraper/plugins/inband/bios/__init__.py +29 -0
- nodescraper/plugins/inband/bios/analyzer_args.py +64 -0
- nodescraper/plugins/inband/bios/bios_analyzer.py +93 -0
- nodescraper/plugins/inband/bios/bios_collector.py +93 -0
- nodescraper/plugins/inband/bios/bios_plugin.py +43 -0
- nodescraper/plugins/inband/bios/biosdata.py +30 -0
- nodescraper/plugins/inband/cmdline/__init__.py +25 -0
- nodescraper/plugins/inband/cmdline/analyzer_args.py +80 -0
- nodescraper/plugins/inband/cmdline/cmdline_analyzer.py +113 -0
- nodescraper/plugins/inband/cmdline/cmdline_collector.py +77 -0
- nodescraper/plugins/inband/cmdline/cmdline_plugin.py +43 -0
- nodescraper/plugins/inband/cmdline/cmdlinedata.py +30 -0
- nodescraper/plugins/inband/device_enumeration/__init__.py +29 -0
- nodescraper/plugins/inband/device_enumeration/analyzer_args.py +73 -0
- nodescraper/plugins/inband/device_enumeration/device_enumeration_analyzer.py +81 -0
- nodescraper/plugins/inband/device_enumeration/device_enumeration_collector.py +176 -0
- nodescraper/plugins/inband/device_enumeration/device_enumeration_plugin.py +45 -0
- nodescraper/plugins/inband/device_enumeration/deviceenumdata.py +36 -0
- nodescraper/plugins/inband/dimm/__init__.py +25 -0
- nodescraper/plugins/inband/dimm/collector_args.py +31 -0
- nodescraper/plugins/inband/dimm/dimm_collector.py +151 -0
- nodescraper/plugins/inband/dimm/dimm_plugin.py +40 -0
- nodescraper/plugins/inband/dimm/dimmdata.py +30 -0
- nodescraper/plugins/inband/dkms/__init__.py +25 -0
- nodescraper/plugins/inband/dkms/analyzer_args.py +85 -0
- nodescraper/plugins/inband/dkms/dkms_analyzer.py +106 -0
- nodescraper/plugins/inband/dkms/dkms_collector.py +76 -0
- nodescraper/plugins/inband/dkms/dkms_plugin.py +43 -0
- nodescraper/plugins/inband/dkms/dkmsdata.py +33 -0
- nodescraper/plugins/inband/dmesg/__init__.py +28 -0
- nodescraper/plugins/inband/dmesg/analyzer_args.py +33 -0
- nodescraper/plugins/inband/dmesg/collector_args.py +39 -0
- nodescraper/plugins/inband/dmesg/dmesg_analyzer.py +503 -0
- nodescraper/plugins/inband/dmesg/dmesg_collector.py +164 -0
- nodescraper/plugins/inband/dmesg/dmesg_plugin.py +44 -0
- nodescraper/plugins/inband/dmesg/dmesgdata.py +116 -0
- nodescraper/plugins/inband/fabrics/__init__.py +28 -0
- nodescraper/plugins/inband/fabrics/fabrics_collector.py +726 -0
- nodescraper/plugins/inband/fabrics/fabrics_plugin.py +37 -0
- nodescraper/plugins/inband/fabrics/fabricsdata.py +140 -0
- nodescraper/plugins/inband/journal/__init__.py +28 -0
- nodescraper/plugins/inband/journal/collector_args.py +33 -0
- nodescraper/plugins/inband/journal/journal_collector.py +107 -0
- nodescraper/plugins/inband/journal/journal_plugin.py +40 -0
- nodescraper/plugins/inband/journal/journaldata.py +44 -0
- nodescraper/plugins/inband/kernel/__init__.py +25 -0
- nodescraper/plugins/inband/kernel/analyzer_args.py +64 -0
- nodescraper/plugins/inband/kernel/kernel_analyzer.py +91 -0
- nodescraper/plugins/inband/kernel/kernel_collector.py +129 -0
- nodescraper/plugins/inband/kernel/kernel_plugin.py +43 -0
- nodescraper/plugins/inband/kernel/kerneldata.py +32 -0
- nodescraper/plugins/inband/kernel_module/__init__.py +25 -0
- nodescraper/plugins/inband/kernel_module/analyzer_args.py +59 -0
- nodescraper/plugins/inband/kernel_module/kernel_module_analyzer.py +211 -0
- nodescraper/plugins/inband/kernel_module/kernel_module_collector.py +264 -0
- nodescraper/plugins/inband/kernel_module/kernel_module_data.py +60 -0
- nodescraper/plugins/inband/kernel_module/kernel_module_plugin.py +43 -0
- nodescraper/plugins/inband/memory/__init__.py +25 -0
- nodescraper/plugins/inband/memory/analyzer_args.py +45 -0
- nodescraper/plugins/inband/memory/memory_analyzer.py +98 -0
- nodescraper/plugins/inband/memory/memory_collector.py +330 -0
- nodescraper/plugins/inband/memory/memory_plugin.py +43 -0
- nodescraper/plugins/inband/memory/memorydata.py +90 -0
- nodescraper/plugins/inband/network/__init__.py +28 -0
- nodescraper/plugins/inband/network/network_collector.py +1828 -0
- nodescraper/plugins/inband/network/network_plugin.py +37 -0
- nodescraper/plugins/inband/network/networkdata.py +319 -0
- nodescraper/plugins/inband/nvme/__init__.py +28 -0
- nodescraper/plugins/inband/nvme/nvme_collector.py +167 -0
- nodescraper/plugins/inband/nvme/nvme_plugin.py +37 -0
- nodescraper/plugins/inband/nvme/nvmedata.py +45 -0
- nodescraper/plugins/inband/os/__init__.py +25 -0
- nodescraper/plugins/inband/os/analyzer_args.py +64 -0
- nodescraper/plugins/inband/os/os_analyzer.py +73 -0
- nodescraper/plugins/inband/os/os_collector.py +131 -0
- nodescraper/plugins/inband/os/os_plugin.py +43 -0
- nodescraper/plugins/inband/os/osdata.py +31 -0
- nodescraper/plugins/inband/package/__init__.py +25 -0
- nodescraper/plugins/inband/package/analyzer_args.py +48 -0
- nodescraper/plugins/inband/package/package_analyzer.py +253 -0
- nodescraper/plugins/inband/package/package_collector.py +273 -0
- nodescraper/plugins/inband/package/package_plugin.py +43 -0
- nodescraper/plugins/inband/package/packagedata.py +41 -0
- nodescraper/plugins/inband/pcie/__init__.py +29 -0
- nodescraper/plugins/inband/pcie/analyzer_args.py +63 -0
- nodescraper/plugins/inband/pcie/pcie_analyzer.py +1081 -0
- nodescraper/plugins/inband/pcie/pcie_collector.py +690 -0
- nodescraper/plugins/inband/pcie/pcie_data.py +2017 -0
- nodescraper/plugins/inband/pcie/pcie_plugin.py +43 -0
- nodescraper/plugins/inband/process/__init__.py +25 -0
- nodescraper/plugins/inband/process/analyzer_args.py +45 -0
- nodescraper/plugins/inband/process/collector_args.py +31 -0
- nodescraper/plugins/inband/process/process_analyzer.py +91 -0
- nodescraper/plugins/inband/process/process_collector.py +115 -0
- nodescraper/plugins/inband/process/process_plugin.py +46 -0
- nodescraper/plugins/inband/process/processdata.py +34 -0
- nodescraper/plugins/inband/rocm/__init__.py +25 -0
- nodescraper/plugins/inband/rocm/analyzer_args.py +66 -0
- nodescraper/plugins/inband/rocm/rocm_analyzer.py +100 -0
- nodescraper/plugins/inband/rocm/rocm_collector.py +205 -0
- nodescraper/plugins/inband/rocm/rocm_plugin.py +43 -0
- nodescraper/plugins/inband/rocm/rocmdata.py +62 -0
- nodescraper/plugins/inband/storage/__init__.py +25 -0
- nodescraper/plugins/inband/storage/analyzer_args.py +38 -0
- nodescraper/plugins/inband/storage/collector_args.py +31 -0
- nodescraper/plugins/inband/storage/storage_analyzer.py +152 -0
- nodescraper/plugins/inband/storage/storage_collector.py +110 -0
- nodescraper/plugins/inband/storage/storage_plugin.py +44 -0
- nodescraper/plugins/inband/storage/storagedata.py +70 -0
- nodescraper/plugins/inband/sysctl/__init__.py +29 -0
- nodescraper/plugins/inband/sysctl/analyzer_args.py +67 -0
- nodescraper/plugins/inband/sysctl/sysctl_analyzer.py +81 -0
- nodescraper/plugins/inband/sysctl/sysctl_collector.py +101 -0
- nodescraper/plugins/inband/sysctl/sysctl_plugin.py +43 -0
- nodescraper/plugins/inband/sysctl/sysctldata.py +42 -0
- nodescraper/plugins/inband/syslog/__init__.py +28 -0
- nodescraper/plugins/inband/syslog/syslog_collector.py +121 -0
- nodescraper/plugins/inband/syslog/syslog_plugin.py +37 -0
- nodescraper/plugins/inband/syslog/syslogdata.py +46 -0
- nodescraper/plugins/inband/uptime/__init__.py +25 -0
- nodescraper/plugins/inband/uptime/uptime_collector.py +88 -0
- nodescraper/plugins/inband/uptime/uptime_plugin.py +37 -0
- nodescraper/plugins/inband/uptime/uptimedata.py +31 -0
- nodescraper/resultcollators/__init__.py +25 -0
- nodescraper/resultcollators/tablesummary.py +159 -0
- nodescraper/taskresulthooks/__init__.py +28 -0
- nodescraper/taskresulthooks/filesystemloghook.py +88 -0
- nodescraper/typeutils.py +171 -0
- nodescraper/utils.py +412 -0
|
@@ -0,0 +1,821 @@
|
|
|
1
|
+
###############################################################################
|
|
2
|
+
#
|
|
3
|
+
# MIT License
|
|
4
|
+
#
|
|
5
|
+
# Copyright (c) 2025 Advanced Micro Devices, Inc.
|
|
6
|
+
#
|
|
7
|
+
# Permission is hereby granted, free of charge, to any person obtaining a copy
|
|
8
|
+
# of this software and associated documentation files (the "Software"), to deal
|
|
9
|
+
# in the Software without restriction, including without limitation the rights
|
|
10
|
+
# to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
|
|
11
|
+
# copies of the Software, and to permit persons to whom the Software is
|
|
12
|
+
# furnished to do so, subject to the following conditions:
|
|
13
|
+
#
|
|
14
|
+
# The above copyright notice and this permission notice shall be included in all
|
|
15
|
+
# copies or substantial portions of the Software.
|
|
16
|
+
#
|
|
17
|
+
# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
|
|
18
|
+
# IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
|
|
19
|
+
# FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
|
|
20
|
+
# AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
|
|
21
|
+
# LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
|
|
22
|
+
# OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
|
|
23
|
+
# SOFTWARE.
|
|
24
|
+
#
|
|
25
|
+
###############################################################################
|
|
26
|
+
import io
|
|
27
|
+
from collections import defaultdict
|
|
28
|
+
from typing import Any, Optional, Union
|
|
29
|
+
|
|
30
|
+
from nodescraper.enums import EventCategory, EventPriority
|
|
31
|
+
from nodescraper.interfaces import DataAnalyzer
|
|
32
|
+
from nodescraper.models import TaskResult
|
|
33
|
+
|
|
34
|
+
from .amdsmidata import (
|
|
35
|
+
AmdSmiDataModel,
|
|
36
|
+
AmdSmiMetric,
|
|
37
|
+
AmdSmiStatic,
|
|
38
|
+
AmdSmiTstData,
|
|
39
|
+
EccData,
|
|
40
|
+
Fw,
|
|
41
|
+
Partition,
|
|
42
|
+
Processes,
|
|
43
|
+
XgmiMetrics,
|
|
44
|
+
)
|
|
45
|
+
from .analyzer_args import AmdSmiAnalyzerArgs
|
|
46
|
+
from .cper import CperAnalysisTaskMixin
|
|
47
|
+
|
|
48
|
+
|
|
49
|
+
class AmdSmiAnalyzer(CperAnalysisTaskMixin, DataAnalyzer[AmdSmiDataModel, None]):
|
|
50
|
+
"""Check AMD SMI Application data for PCIe, ECC errors, CPER data, and analyze amdsmitst metrics"""
|
|
51
|
+
|
|
52
|
+
DATA_MODEL = AmdSmiDataModel
|
|
53
|
+
|
|
54
|
+
def check_expected_max_power(
    self,
    amdsmi_static_data: list[AmdSmiStatic],
    expected_max_power: int,
) -> None:
    """Check each GPU's max power limit against the expected value

    A GPU that reports no limit (or no max power entry) gets a WARNING event.
    A GPU whose max power value cannot be converted to a number gets an ERROR
    event. All GPUs whose numeric max power differs from the expected value
    are reported together in a single ERROR event.

    Args:
        amdsmi_static_data (list[AmdSmiStatic]): AmdSmiStatic data model
        expected_max_power (int): expected max power
    """
    incorrect_max_power_gpus: dict[int, Union[int, str, float]] = {}
    for gpu in amdsmi_static_data:
        if gpu.limit is None or gpu.limit.max_power is None:
            self._log_event(
                category=EventCategory.PLATFORM,
                description=f"GPU: {gpu.gpu} has no max power limit set",
                priority=EventPriority.WARNING,
                data={"gpu": gpu.gpu},
            )
            continue
        max_power_value = gpu.limit.max_power.value
        try:
            max_power_float = float(max_power_value)
        except (ValueError, TypeError):
            # float() raises ValueError for unparsable strings but TypeError
            # for None and other non-numeric objects. Both mean the limit is
            # unusable, so report it and keep checking the remaining GPUs
            # instead of letting one bad entry abort the whole check.
            self._log_event(
                category=EventCategory.PLATFORM,
                description=f"GPU: {gpu.gpu} has an invalid max power limit",
                priority=EventPriority.ERROR,
                data={
                    "gpu": gpu.gpu,
                    "max_power_value": max_power_value,
                },
            )
            continue
        if max_power_float != expected_max_power:
            incorrect_max_power_gpus[gpu.gpu] = max_power_float
    if incorrect_max_power_gpus:
        # One aggregated event for every GPU with an unexpected limit.
        self._log_event(
            category=EventCategory.PLATFORM,
            description="Max power mismatch",
            priority=EventPriority.ERROR,
            data={
                "gpus": list(incorrect_max_power_gpus.keys()),
                "max_power_values": incorrect_max_power_gpus,
                "expected_max_power": expected_max_power,
            },
        )
|
|
102
|
+
|
|
103
|
+
def check_expected_driver_version(
    self,
    amdsmi_static_data: list[AmdSmiStatic],
    expected_driver_version: str,
) -> None:
    """Check expected driver version

    Every GPU whose reported driver version differs from the expected one is
    collected, and a single ERROR event listing those GPUs is logged. A GPU
    with no driver entry is recorded with a version of None and therefore
    counts as a mismatch.

    Args:
        amdsmi_static_data (list[AmdSmiStatic]): AmdSmiStatic data model
        expected_driver_version (str): expected driver version
    """
    mismatched: list[int] = []
    reported: dict[int, Optional[str]] = {}

    for entry in amdsmi_static_data:
        driver_info = entry.driver
        found: Optional[str] = driver_info.version if driver_info is not None else None
        reported[entry.gpu] = found
        if found == expected_driver_version:
            continue
        mismatched.append(entry.gpu)

    # Nothing to report when every GPU matched.
    if not mismatched:
        return

    self._log_event(
        category=EventCategory.PLATFORM,
        description="Driver Version Mismatch",
        priority=EventPriority.ERROR,
        data={
            "gpus": mismatched,
            "driver_version": {gpu_id: reported[gpu_id] for gpu_id in mismatched},
            "expected_driver_version": expected_driver_version,
        },
    )
|
|
136
|
+
|
|
137
|
+
def check_amdsmi_metric_pcie(
    self,
    amdsmi_metric_data: list[AmdSmiMetric],
    l0_to_recovery_count_error_threshold: int,
    l0_to_recovery_count_warning_threshold: int,
):
    """Check PCIe metrics for link errors

    For each GPU this looks at link width, link speed, replay counters,
    L0-to-recovery transitions and NAK counters, logging one event per
    anomaly. The expected width (16) and speed (32.0) are hard-coded here.
    Any metric whose value is None is skipped.

    Args:
        amdsmi_metric_data (list[AmdSmiMetric]): AmdSmiMetric data model
        l0_to_recovery_count_error_threshold (int): Threshold for error events
        l0_to_recovery_count_warning_threshold (int): Threshold for warning events
    """
    for metric in amdsmi_metric_data:
        pcie_data = metric.pcie
        gpu = metric.gpu

        # --- link width ---------------------------------------------------
        lanes = pcie_data.width
        if lanes is not None and lanes != 16:
            self._log_event(
                category=EventCategory.IO,
                description=f"GPU: {gpu} PCIe width is not x16",
                priority=EventPriority.ERROR,
                data={"gpu": gpu, "pcie_width": lanes, "expected": 16},
                console_log=True,
            )

        # --- link speed ---------------------------------------------------
        raw_speed = None if pcie_data.speed is None else pcie_data.speed.value
        if raw_speed is not None:
            try:
                speed_val = float(raw_speed)
                if speed_val != 32.0:
                    self._log_event(
                        category=EventCategory.IO,
                        description=f"GPU: {gpu} PCIe link speed is not Gen5 (32 GT/s)",
                        priority=EventPriority.ERROR,
                        data={"gpu": gpu, "pcie_speed": speed_val, "expected": 32.0},
                        console_log=True,
                    )
            except (TypeError, ValueError):
                # A speed value that cannot be converted is silently ignored.
                pass

        # --- replay counters ----------------------------------------------
        replays = pcie_data.replay_count
        if replays is not None and replays > 0:
            self._log_event(
                category=EventCategory.IO,
                description=f"GPU: {gpu} has PCIe replay count: {pcie_data.replay_count}",
                priority=EventPriority.WARNING,
                data={"gpu": gpu, "replay_count": replays},
                console_log=True,
            )

        rollovers = pcie_data.replay_roll_over_count
        if rollovers is not None and rollovers > 0:
            self._log_event(
                category=EventCategory.IO,
                description=f"GPU: {gpu} has PCIe replay rollover count: {pcie_data.replay_roll_over_count}",
                priority=EventPriority.WARNING,
                data={"gpu": gpu, "replay_roll_over_count": rollovers},
                console_log=True,
            )

        # --- L0 -> recovery transitions -----------------------------------
        # The error threshold is tested first; the warning threshold is only
        # consulted when the error threshold was not exceeded.
        recoveries = pcie_data.l0_to_recovery_count
        if recoveries is not None:
            if recoveries > l0_to_recovery_count_error_threshold:
                level = (
                    EventPriority.ERROR,
                    "error_threshold",
                    l0_to_recovery_count_error_threshold,
                )
            elif recoveries > l0_to_recovery_count_warning_threshold:
                level = (
                    EventPriority.WARNING,
                    "warning_threshold",
                    l0_to_recovery_count_warning_threshold,
                )
            else:
                level = None

            if level is not None:
                severity, threshold_key, threshold = level
                self._log_event(
                    category=EventCategory.IO,
                    description=f"GPU: {gpu} has {pcie_data.l0_to_recovery_count} L0 recoveries",
                    priority=severity,
                    data={
                        "gpu": gpu,
                        "l0_to_recovery_count": recoveries,
                        threshold_key: threshold,
                    },
                    console_log=True,
                )

        # --- NAK counters -------------------------------------------------
        naks_sent = pcie_data.nak_sent_count
        if naks_sent is not None and naks_sent > 0:
            self._log_event(
                category=EventCategory.IO,
                description=f"GPU: {gpu} has sent {pcie_data.nak_sent_count} PCIe NAKs",
                priority=EventPriority.WARNING,
                data={"gpu": gpu, "nak_sent_count": naks_sent},
                console_log=True,
            )

        naks_received = pcie_data.nak_received_count
        if naks_received is not None and naks_received > 0:
            self._log_event(
                category=EventCategory.IO,
                description=f"GPU: {gpu} has received {pcie_data.nak_received_count} PCIe NAKs",
                priority=EventPriority.WARNING,
                data={"gpu": gpu, "nak_received_count": naks_received},
                console_log=True,
            )
|
|
244
|
+
|
|
245
|
+
def check_amdsmi_metric_ecc_totals(self, amdsmi_metric_data: list[AmdSmiMetric]):
    """Check ECC totals for all GPUs

    Five counters are read from each GPU's ECC totals. A counter that is set
    and greater than zero produces one RAS event: ERROR priority for the
    uncorrectable counters, WARNING priority for the correctable and deferred
    ones.

    Args:
        amdsmi_metric_data (list[AmdSmiMetric]): AmdSmiMetric data model
    """
    # (priority, label) pairs, in the same order as the counters read below.
    severities_and_labels: tuple[tuple[EventPriority, str], ...] = (
        (EventPriority.WARNING, "Total correctable ECC errors"),
        (EventPriority.ERROR, "Total uncorrectable ECC errors"),
        (EventPriority.WARNING, "Total deferred ECC errors"),
        (EventPriority.WARNING, "Cache correctable ECC errors"),
        (EventPriority.ERROR, "Cache uncorrectable ECC errors"),
    )

    for metric in amdsmi_metric_data:
        totals = metric.ecc
        gpu = metric.gpu

        observed: tuple[Optional[int], ...] = (
            totals.total_correctable_count,
            totals.total_uncorrectable_count,
            totals.total_deferred_count,
            totals.cache_correctable_count,
            totals.cache_uncorrectable_count,
        )

        for (priority, desc), count in zip(severities_and_labels, observed):
            # Unset (None) and zero counters are not reported.
            if count is None or count <= 0:
                continue
            self._log_event(
                category=EventCategory.RAS,
                description=f"GPU: {gpu} has {desc}: {count}",
                priority=priority,
                data={"gpu": gpu, "error_count": count, "error_type": desc},
                console_log=True,
            )
|
|
294
|
+
|
|
295
|
+
def check_amdsmi_metric_ecc(self, amdsmi_metric_data: list[AmdSmiMetric]):
    """Check ECC counts in all blocks for all GPUs

    Walks every ECC block of every GPU and logs one RAS event per non-zero
    counter: WARNING for correctable and deferred counts, ERROR for
    uncorrectable counts.

    Args:
        amdsmi_metric_data (list[AmdSmiMetric]): AmdSmiMetric data model
    """
    for metric in amdsmi_metric_data:
        gpu = metric.gpu
        blocks = metric.ecc_blocks

        # Only a non-empty, non-string container of blocks is inspected
        # (a string such as "N/A" carries no per-block data).
        if not blocks or isinstance(blocks, str):
            continue

        for block_name, ecc_data in blocks.items():
            # Entries that are not EccData instances are ignored.
            if not isinstance(ecc_data, EccData):
                continue

            correctable = ecc_data.correctable_count
            uncorrectable = ecc_data.uncorrectable_count
            deferred = ecc_data.deferred_count

            if correctable is not None and correctable > 0:
                self._log_event(
                    category=EventCategory.RAS,
                    description=f"GPU: {gpu} has correctable ECC errors in block {block_name}",
                    priority=EventPriority.WARNING,
                    data={
                        "gpu": gpu,
                        "block": block_name,
                        "correctable_count": correctable,
                    },
                    console_log=True,
                )

            if uncorrectable is not None and uncorrectable > 0:
                self._log_event(
                    category=EventCategory.RAS,
                    description=f"GPU: {gpu} has uncorrectable ECC errors in block {block_name}",
                    priority=EventPriority.ERROR,
                    data={
                        "gpu": gpu,
                        "block": block_name,
                        "uncorrectable_count": uncorrectable,
                    },
                    console_log=True,
                )

            if deferred is not None and deferred > 0:
                self._log_event(
                    category=EventCategory.RAS,
                    description=f"GPU: {gpu} has deferred ECC errors in block {block_name}",
                    priority=EventPriority.WARNING,
                    data={
                        "gpu": gpu,
                        "block": block_name,
                        "deferred_count": deferred,
                    },
                    console_log=True,
                )
|
|
353
|
+
|
|
354
|
+
def expected_gpu_processes(
    self, processes_data: Optional[list[Processes]], max_num_processes: int
):
    """Check the number of GPU processes running

    Logs a WARNING when no process data is supplied at all. Otherwise logs a
    single ERROR event mapping each GPU that runs more than
    ``max_num_processes`` processes to its process count.

    Args:
        processes_data (Optional[list[Processes]]): list of processes per GPU
        max_num_processes (int): max number of expected processes
    """
    if not processes_data:
        self._log_event(
            category=EventCategory.PLATFORM,
            description="No GPU processes data available",
            priority=EventPriority.WARNING,
            data={"processes_data": processes_data},
            console_log=True,
        )
        return

    over_limit: dict[int, int] = {}
    for entry in processes_data:
        running = entry.process_list
        if len(running) == 0:
            # No processes on this GPU.
            continue
        if isinstance(running[0].process_info, str):
            # NOTE(review): a string process_info presumably is a placeholder
            # rather than real process data - confirm against the collector.
            continue

        total = len(running)
        if total > max_num_processes:
            over_limit[entry.gpu] = total

    if not over_limit:
        return

    self._log_event(
        category=EventCategory.PLATFORM,
        description="Number of processes exceeds max processes",
        priority=EventPriority.ERROR,
        data={
            "gpu_exceeds_num_processes": over_limit,
        },
        console_log=True,
    )
|
|
394
|
+
|
|
395
|
+
def static_consistancy_check(self, amdsmi_static_data: list[AmdSmiStatic]):
    """Check that ASIC identification fields agree across all GPUs

    For each inspected ASIC field the distinct values reported by the GPUs
    are gathered into a set. A field with more than one distinct value
    produces a WARNING event that carries the conflicting values.

    Args:
        amdsmi_static_data (list[AmdSmiStatic]): AmdSmiStatic data model
    """
    asic_fields = (
        "market_name",
        "vendor_id",
        "vendor_name",
        "subvendor_id",
        "subsystem_id",
        "device_id",
        "rev_id",
        "num_compute_units",
        "target_graphics_version",
    )

    consistancy_data: dict[str, Union[set[str], set[int]]] = {}
    for field_name in asic_fields:
        reported = (getattr(gpu.asic, field_name) for gpu in amdsmi_static_data)
        if field_name == "num_compute_units":
            # The compute-unit count is compared by its string form.
            consistancy_data[field_name] = set(map(str, reported))
        else:
            consistancy_data[field_name] = set(reported)

    for key, value in consistancy_data.items():
        if len(value) <= 1:
            continue
        self._log_event(
            category=EventCategory.PLATFORM,
            description=f"{key} is not consistent across all GPUs",
            priority=EventPriority.WARNING,
            data={
                "field": key,
                "non_consistent_values": value,
            },
        )
|
|
425
|
+
|
|
426
|
+
def check_static_data(
    self,
    amdsmi_static_data: list[AmdSmiStatic],
    vendor_id: Optional[str],
    subvendor_id: Optional[str],
    device_id: tuple[Optional[str], Optional[str]],
    subsystem_id: tuple[Optional[str], Optional[str]],
    sku_name: Optional[str],
) -> None:
    """Check expected static data

    Each GPU's ASIC data is compared with the expected values, and all
    mismatches are reported together in one ERROR event.

    * vendor_id, subvendor_id, vendor_name and market_name are matched by
      case-sensitive substring. Expected values of None are not checked, and
      only the first mismatching field of this group is recorded per GPU.
    * device_id and subsystem_id each accept two alternatives, matched by
      case-insensitive substring. The check runs only when both alternatives
      are provided, and passes if either one matches.

    Args:
        amdsmi_static_data (list[AmdSmiStatic]): AmdSmiStatic data
        vendor_id (Optional[str]): expected vendor_id
        subvendor_id (Optional[str]): expected subvendor_id
        device_id (tuple[Optional[str], Optional[str]]): expected device_id
        subsystem_id (tuple[Optional[str], Optional[str]]): expected subsystem_id
        sku_name (Optional[str]): expected sku_name
    """
    mismatches: list[tuple[int, str, str, str]] = []

    expected_data: dict[str, Optional[str]] = {
        "vendor_id": vendor_id,
        "subvendor_id": subvendor_id,
        "vendor_name": "Advanced Micro Devices Inc",
        "market_name": sku_name,
    }
    # Only fields with a concrete expectation take part in the comparison.
    to_verify = [(name, want) for name, want in expected_data.items() if want is not None]

    for gpu_data in amdsmi_static_data:
        asic = gpu_data.asic
        collected_data: dict[str, str] = {name: getattr(asic, name) for name in expected_data}

        # Record the first field (in declaration order) that fails to match.
        first_bad = next(
            ((name, want) for name, want in to_verify if want not in collected_data[name]),
            None,
        )
        if first_bad is not None:
            bad_name, bad_want = first_bad
            mismatches.append((gpu_data.gpu, bad_name, bad_want, collected_data[bad_name]))

        if device_id[0] is not None and device_id[1] is not None:
            dev_actual = gpu_data.asic.device_id
            dev_upper = dev_actual.upper()
            if not (device_id[0].upper() in dev_upper or device_id[1].upper() in dev_upper):
                mismatches.append(
                    (gpu_data.gpu, "device_id", f"{device_id[0]}|{device_id[1]}", dev_actual)
                )

        if subsystem_id[0] is not None and subsystem_id[1] is not None:
            subsys_actual = gpu_data.asic.subsystem_id
            subsys_upper = subsys_actual.upper()
            if not (
                subsystem_id[0].upper() in subsys_upper
                or subsystem_id[1].upper() in subsys_upper
            ):
                mismatches.append(
                    (
                        gpu_data.gpu,
                        "subsystem_id",
                        f"{subsystem_id[0]}|{subsystem_id[1]}",
                        subsys_actual,
                    )
                )

    if not mismatches:
        return

    self._log_event(
        category=EventCategory.PLATFORM,
        description="amd-smi static data mismatch",
        priority=EventPriority.ERROR,
        data=self._format_static_mismatch_payload(mismatches),
    )
|
|
504
|
+
|
|
505
|
+
def _format_static_mismatch_payload(
|
|
506
|
+
self,
|
|
507
|
+
mismatches: list[tuple[int, str, str, str]],
|
|
508
|
+
) -> dict[str, Any]:
|
|
509
|
+
"""Helper function for pretty printing mismatch in expected data
|
|
510
|
+
|
|
511
|
+
Args:
|
|
512
|
+
mismatches (list[tuple[int, str, str, str]]): mismatched data per GPU
|
|
513
|
+
|
|
514
|
+
Returns:
|
|
515
|
+
dict[str, Any]: dict of mismatched data per GPU
|
|
516
|
+
"""
|
|
517
|
+
per_gpu: dict[int, list[dict[str, str]]] = defaultdict(list)
|
|
518
|
+
field_set: set[str] = set()
|
|
519
|
+
|
|
520
|
+
for gpu, field, expected, actual in mismatches:
|
|
521
|
+
field_set.add(field)
|
|
522
|
+
per_gpu[gpu].append({"field": field, "expected": expected, "actual": actual})
|
|
523
|
+
|
|
524
|
+
per_gpu_list: list[dict[str, Any]] = [
|
|
525
|
+
{"gpu": gpu, "mismatches": entries}
|
|
526
|
+
for gpu, entries in sorted(per_gpu.items(), key=lambda kv: kv[0])
|
|
527
|
+
]
|
|
528
|
+
|
|
529
|
+
return {
|
|
530
|
+
"summary": {
|
|
531
|
+
"gpus_affected": len(per_gpu),
|
|
532
|
+
"fields": sorted(field_set),
|
|
533
|
+
"total_mismatches": sum(len(v) for v in per_gpu.values()),
|
|
534
|
+
},
|
|
535
|
+
"per_gpu": per_gpu_list,
|
|
536
|
+
}
|
|
537
|
+
|
|
538
|
+
def check_pldm_version(
    self,
    amdsmi_fw_data: Optional[list[Fw]],
    expected_pldm_version: Optional[str],
):
    """Verify that every GPU reports the expected PLDM bundle version.

    A WARNING event is logged when no firmware data is supplied. Otherwise a
    single ERROR event is logged if any GPU reports a different PLDM bundle
    version or has no PLDM bundle entry at all (a firmware list that is a
    plain string is treated as having no entry).

    Args:
        amdsmi_fw_data (Optional[list[Fw]]): per-GPU firmware data
        expected_pldm_version (Optional[str]): expected pldm version
    """
    PLDM_STRING = "PLDM_BUNDLE"

    if not amdsmi_fw_data:
        self._log_event(
            category=EventCategory.PLATFORM,
            description="No AMD SMI firmware data available",
            priority=EventPriority.WARNING,
            data={"amdsmi_fw_data": amdsmi_fw_data},
        )
        return

    wrong_version: list[int] = []
    no_bundle: list[int] = []
    for entry in amdsmi_fw_data:
        gpu_index = entry.gpu
        firmware = entry.fw_list
        if isinstance(firmware, str):
            no_bundle.append(gpu_index)
            continue

        # Only the first PLDM bundle entry of a GPU is considered.
        bundle = next((fw for fw in firmware if PLDM_STRING == fw.fw_id), None)
        if bundle is None:
            no_bundle.append(gpu_index)
        elif expected_pldm_version != bundle.fw_version:
            wrong_version.append(gpu_index)

    if not (wrong_version or no_bundle):
        return

    self._log_event(
        category=EventCategory.FW,
        description="PLDM Version Mismatch",
        priority=EventPriority.ERROR,
        data={
            "mismatched_gpus": wrong_version,
            "pldm_missing_gpus": no_bundle,
            "expected_pldm_version": expected_pldm_version,
        },
    )
|
|
584
|
+
|
|
585
|
+
def check_expected_memory_partition_mode(
    self,
    partition_data: Optional[Partition],
    expected_memory_partition_mode: Optional[str],
    expected_compute_partition_mode: Optional[str],
):
    """Check the memory and compute partition modes against the expected modes.

    A WARNING event is logged when no partition data is supplied. Otherwise
    every memory/compute partition entry whose partition type differs from
    the corresponding expected mode is collected, and a single ERROR event is
    logged if at least one such entry exists. Passing None as an expected
    mode disables that comparison; the matching partition list is then not
    read at all.

    Args:
        partition_data (Optional[Partition]): data model
        expected_memory_partition_mode (Optional[str]): expected mem partition mode
        expected_compute_partition_mode (Optional[str]): expected compute partition mode
    """
    if partition_data is None:
        self._log_event(
            category=EventCategory.PLATFORM,
            description="No AMD SMI partition data available",
            priority=EventPriority.WARNING,
        )
        return

    # Memory entries come first, then compute entries, matching the order in
    # which the two lists are inspected.
    bad_partition_entries: list[dict] = []

    if expected_memory_partition_mode is not None:
        bad_partition_entries.extend(
            {
                "gpu_id": memory_current.gpu_id,
                "memory_partition_mode": memory_current.partition_type,
            }
            for memory_current in partition_data.memory_partition
            if memory_current.partition_type != expected_memory_partition_mode
        )

    if expected_compute_partition_mode is not None:
        bad_partition_entries.extend(
            {
                "gpu_id": compute_current.gpu_id,
                "compute_partition_mode": compute_current.partition_type,
            }
            for compute_current in partition_data.compute_partition
            if compute_current.partition_type != expected_compute_partition_mode
        )

    if bad_partition_entries:
        self._log_event(
            category=EventCategory.PLATFORM,
            description="Partition Mode Mismatch",
            priority=EventPriority.ERROR,
            data={
                "actual_partition_data": bad_partition_entries,
                "expected_memory_partition_mode": expected_memory_partition_mode,
                "expected_compute_partition_mode": expected_compute_partition_mode,
            },
        )
|
|
641
|
+
|
|
642
|
+
def check_expected_xgmi_link_speed(
    self,
    xgmi_metric: Optional[list[XgmiMetrics]],
    expected_xgmi_speed: Optional[list[float]] = None,
):
    """Compare each GPU's XGMI bit rate with the list of accepted speeds.

    WARNING events are logged (and the check is skipped) when either the
    metrics or the expected speeds are missing or empty. For each GPU an
    ERROR event is logged when the bit rate is absent, cannot be converted to
    a float, or is not one of the expected speeds.

    Args:
        xgmi_metric (Optional[list[XgmiMetrics]]): XGMI metrics data
        expected_xgmi_speed (Optional[list[float]]): List of expected XGMI speeds (GT/s)
    """
    if not xgmi_metric:
        self._log_event(
            category=EventCategory.IO,
            description="XGMI link speed data is not available and cannot be checked",
            priority=EventPriority.WARNING,
            data={"xgmi_metric": xgmi_metric},
        )
        return

    if not expected_xgmi_speed:
        self._log_event(
            category=EventCategory.IO,
            description="Expected XGMI speed not configured, skipping XGMI link speed check",
            priority=EventPriority.WARNING,
        )
        return

    for sample in xgmi_metric:
        metrics = sample.link_metrics
        try:
            rate = metrics.bit_rate
            if rate is None or rate.value is None:
                # Note: the unit (not the value) is what gets reported here.
                reported = rate.unit if rate else "N/A"
                self._log_event(
                    category=EventCategory.IO,
                    description="XGMI link speed is not available",
                    priority=EventPriority.ERROR,
                    data={"gpu": sample.gpu, "xgmi_bit_rate": reported},
                )
                continue

            measured = float(rate.value)
        except ValueError:
            rate = metrics.bit_rate
            reported = rate.value if rate else "N/A"
            self._log_event(
                category=EventCategory.IO,
                description="XGMI link speed is not a valid number",
                priority=EventPriority.ERROR,
                data={"gpu": sample.gpu, "xgmi_bit_rate": reported},
            )
            continue

        if measured in expected_xgmi_speed:
            continue

        self._log_event(
            category=EventCategory.IO,
            description="XGMI link speed is not as expected",
            priority=EventPriority.ERROR,
            data={
                "gpu": sample.gpu,
                "xgmi_bit_rate": measured,
                "expected_xgmi_speed": expected_xgmi_speed,
            },
            console_log=True,
        )
|
|
714
|
+
|
|
715
|
+
def check_amdsmitst(self, amdsmitst_data: AmdSmiTstData):
    """Log an ERROR event when amdsmitst reported failed tests.

    Nothing is logged when the failed test count is zero.

    Args:
        amdsmitst_data (AmdSmiTstData): AMD SMI test data
    """
    failed_count = amdsmitst_data.failed_test_count
    if not failed_count > 0:
        return

    event_data = {
        "failed_test_count": failed_count,
        "failed_tests": amdsmitst_data.failed_tests,
    }
    self._log_event(
        category=EventCategory.APPLICATION,
        description=f"{failed_count} failed tests running amdsmitst",
        priority=EventPriority.ERROR,
        data=event_data,
        console_log=True,
    )
|
|
732
|
+
|
|
733
|
+
def analyze_data(
    self, data: AmdSmiDataModel, args: Optional[AmdSmiAnalyzerArgs] = None
) -> TaskResult:
    """Analyze the amdsmi data against expected data

    Runs each individual check whose input data and/or expected value is
    present; the checks report findings through logged events.

    Args:
        data (AmdSmiDataModel): the AmdSmi data model
        args (Optional[AmdSmiAnalyzerArgs], optional): optional AmdSmi analyzer args.
            Defaults to None, in which case AmdSmiAnalyzerArgs() is used.

    Returns:
        TaskResult: the result of the analysis indicating whether the AmdSmi data model
        matched the expected data
    """
    if args is None:
        args = AmdSmiAnalyzerArgs()

    if data.metric:
        if args.l0_to_recovery_count_error_threshold is not None:
            # Fall back to 1 only when the warning threshold is unset; an
            # explicit 0 must be passed through rather than replaced.
            warning_threshold = args.l0_to_recovery_count_warning_threshold
            if warning_threshold is None:
                warning_threshold = 1
            self.check_amdsmi_metric_pcie(
                data.metric,
                args.l0_to_recovery_count_error_threshold,
                warning_threshold,
            )
        self.check_amdsmi_metric_ecc_totals(data.metric)
        self.check_amdsmi_metric_ecc(data.metric)

    if args.expected_gpu_processes:
        self.expected_gpu_processes(data.process, args.expected_gpu_processes)

    if not data.static:
        self._log_event(
            category=EventCategory.PLATFORM,
            description="No AMD SMI static data available",
            priority=EventPriority.WARNING,
            data={"amdsmi_static_data": data.static},
        )
    else:
        if args.expected_max_power:
            self.check_expected_max_power(data.static, args.expected_max_power)
        if args.expected_driver_version:
            self.check_expected_driver_version(data.static, args.expected_driver_version)

        self.static_consistancy_check(data.static)

        # The previous condition was "(sku and ids and check_static_data) or
        # check_static_data", which is equivalent to check_static_data alone.
        if args.check_static_data:
            # NOTE(review): vendorid_ep is passed for both ID arguments and
            # (devid_ep, devid_ep) for both ID tuples, while devid_ep_vf is
            # never used - confirm this is intended against
            # check_static_data's signature.
            self.check_static_data(
                data.static,
                args.vendorid_ep,
                args.vendorid_ep,
                (args.devid_ep, args.devid_ep),
                (args.devid_ep, args.devid_ep),
                sku_name=args.sku_name,
            )

    if args.expected_memory_partition_mode or args.expected_compute_partition_mode:
        self.check_expected_memory_partition_mode(
            data.partition,
            args.expected_memory_partition_mode,
            args.expected_compute_partition_mode,
        )

    if args.expected_pldm_version:
        self.check_pldm_version(data.firmware, args.expected_pldm_version)

    if data.cper_data:
        # Wrap each CPER file's raw contents in an in-memory binary stream,
        # keyed by file name.
        cper_streams = {
            file_model_obj.file_name: io.BytesIO(file_model_obj.file_contents)
            for file_model_obj in data.cper_data
        }
        self.analyzer_cpers(
            cper_streams,
            analysis_range_start=args.analysis_range_start,
            analysis_range_end=args.analysis_range_end,
        )

    if data.xgmi_metric:
        self.check_expected_xgmi_link_speed(
            data.xgmi_metric, expected_xgmi_speed=args.expected_xgmi_speed
        )

    if data.amdsmitst_data and data.amdsmitst_data.failed_test_count > 0:
        self.check_amdsmitst(data.amdsmitst_data)

    return self.result
|