amd-node-scraper 0.0.1__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- amd_node_scraper-0.0.1.dist-info/LICENSE +21 -0
- amd_node_scraper-0.0.1.dist-info/METADATA +424 -0
- amd_node_scraper-0.0.1.dist-info/RECORD +197 -0
- amd_node_scraper-0.0.1.dist-info/WHEEL +5 -0
- amd_node_scraper-0.0.1.dist-info/entry_points.txt +2 -0
- amd_node_scraper-0.0.1.dist-info/top_level.txt +1 -0
- nodescraper/__init__.py +32 -0
- nodescraper/base/__init__.py +34 -0
- nodescraper/base/inbandcollectortask.py +118 -0
- nodescraper/base/inbanddataplugin.py +39 -0
- nodescraper/base/regexanalyzer.py +120 -0
- nodescraper/cli/__init__.py +29 -0
- nodescraper/cli/cli.py +511 -0
- nodescraper/cli/constants.py +27 -0
- nodescraper/cli/dynamicparserbuilder.py +171 -0
- nodescraper/cli/helper.py +517 -0
- nodescraper/cli/inputargtypes.py +129 -0
- nodescraper/configbuilder.py +123 -0
- nodescraper/configregistry.py +66 -0
- nodescraper/configs/node_status.json +19 -0
- nodescraper/connection/__init__.py +25 -0
- nodescraper/connection/inband/__init__.py +46 -0
- nodescraper/connection/inband/inband.py +171 -0
- nodescraper/connection/inband/inbandlocal.py +93 -0
- nodescraper/connection/inband/inbandmanager.py +151 -0
- nodescraper/connection/inband/inbandremote.py +173 -0
- nodescraper/connection/inband/sshparams.py +43 -0
- nodescraper/constants.py +26 -0
- nodescraper/enums/__init__.py +40 -0
- nodescraper/enums/eventcategory.py +89 -0
- nodescraper/enums/eventpriority.py +42 -0
- nodescraper/enums/executionstatus.py +44 -0
- nodescraper/enums/osfamily.py +34 -0
- nodescraper/enums/systeminteraction.py +41 -0
- nodescraper/enums/systemlocation.py +33 -0
- nodescraper/generictypes.py +36 -0
- nodescraper/interfaces/__init__.py +44 -0
- nodescraper/interfaces/connectionmanager.py +143 -0
- nodescraper/interfaces/dataanalyzertask.py +138 -0
- nodescraper/interfaces/datacollectortask.py +185 -0
- nodescraper/interfaces/dataplugin.py +356 -0
- nodescraper/interfaces/plugin.py +127 -0
- nodescraper/interfaces/resultcollator.py +56 -0
- nodescraper/interfaces/task.py +164 -0
- nodescraper/interfaces/taskresulthook.py +39 -0
- nodescraper/models/__init__.py +48 -0
- nodescraper/models/analyzerargs.py +93 -0
- nodescraper/models/collectorargs.py +30 -0
- nodescraper/models/connectionconfig.py +34 -0
- nodescraper/models/datamodel.py +171 -0
- nodescraper/models/datapluginresult.py +39 -0
- nodescraper/models/event.py +158 -0
- nodescraper/models/pluginconfig.py +38 -0
- nodescraper/models/pluginresult.py +39 -0
- nodescraper/models/systeminfo.py +44 -0
- nodescraper/models/taskresult.py +185 -0
- nodescraper/models/timerangeargs.py +38 -0
- nodescraper/pluginexecutor.py +274 -0
- nodescraper/pluginregistry.py +152 -0
- nodescraper/plugins/__init__.py +25 -0
- nodescraper/plugins/inband/__init__.py +25 -0
- nodescraper/plugins/inband/amdsmi/__init__.py +28 -0
- nodescraper/plugins/inband/amdsmi/amdsmi_analyzer.py +821 -0
- nodescraper/plugins/inband/amdsmi/amdsmi_collector.py +1313 -0
- nodescraper/plugins/inband/amdsmi/amdsmi_plugin.py +43 -0
- nodescraper/plugins/inband/amdsmi/amdsmidata.py +1002 -0
- nodescraper/plugins/inband/amdsmi/analyzer_args.py +50 -0
- nodescraper/plugins/inband/amdsmi/cper.py +65 -0
- nodescraper/plugins/inband/bios/__init__.py +29 -0
- nodescraper/plugins/inband/bios/analyzer_args.py +64 -0
- nodescraper/plugins/inband/bios/bios_analyzer.py +93 -0
- nodescraper/plugins/inband/bios/bios_collector.py +93 -0
- nodescraper/plugins/inband/bios/bios_plugin.py +43 -0
- nodescraper/plugins/inband/bios/biosdata.py +30 -0
- nodescraper/plugins/inband/cmdline/__init__.py +25 -0
- nodescraper/plugins/inband/cmdline/analyzer_args.py +80 -0
- nodescraper/plugins/inband/cmdline/cmdline_analyzer.py +113 -0
- nodescraper/plugins/inband/cmdline/cmdline_collector.py +77 -0
- nodescraper/plugins/inband/cmdline/cmdline_plugin.py +43 -0
- nodescraper/plugins/inband/cmdline/cmdlinedata.py +30 -0
- nodescraper/plugins/inband/device_enumeration/__init__.py +29 -0
- nodescraper/plugins/inband/device_enumeration/analyzer_args.py +73 -0
- nodescraper/plugins/inband/device_enumeration/device_enumeration_analyzer.py +81 -0
- nodescraper/plugins/inband/device_enumeration/device_enumeration_collector.py +176 -0
- nodescraper/plugins/inband/device_enumeration/device_enumeration_plugin.py +45 -0
- nodescraper/plugins/inband/device_enumeration/deviceenumdata.py +36 -0
- nodescraper/plugins/inband/dimm/__init__.py +25 -0
- nodescraper/plugins/inband/dimm/collector_args.py +31 -0
- nodescraper/plugins/inband/dimm/dimm_collector.py +151 -0
- nodescraper/plugins/inband/dimm/dimm_plugin.py +40 -0
- nodescraper/plugins/inband/dimm/dimmdata.py +30 -0
- nodescraper/plugins/inband/dkms/__init__.py +25 -0
- nodescraper/plugins/inband/dkms/analyzer_args.py +85 -0
- nodescraper/plugins/inband/dkms/dkms_analyzer.py +106 -0
- nodescraper/plugins/inband/dkms/dkms_collector.py +76 -0
- nodescraper/plugins/inband/dkms/dkms_plugin.py +43 -0
- nodescraper/plugins/inband/dkms/dkmsdata.py +33 -0
- nodescraper/plugins/inband/dmesg/__init__.py +28 -0
- nodescraper/plugins/inband/dmesg/analyzer_args.py +33 -0
- nodescraper/plugins/inband/dmesg/collector_args.py +39 -0
- nodescraper/plugins/inband/dmesg/dmesg_analyzer.py +503 -0
- nodescraper/plugins/inband/dmesg/dmesg_collector.py +164 -0
- nodescraper/plugins/inband/dmesg/dmesg_plugin.py +44 -0
- nodescraper/plugins/inband/dmesg/dmesgdata.py +116 -0
- nodescraper/plugins/inband/fabrics/__init__.py +28 -0
- nodescraper/plugins/inband/fabrics/fabrics_collector.py +726 -0
- nodescraper/plugins/inband/fabrics/fabrics_plugin.py +37 -0
- nodescraper/plugins/inband/fabrics/fabricsdata.py +140 -0
- nodescraper/plugins/inband/journal/__init__.py +28 -0
- nodescraper/plugins/inband/journal/collector_args.py +33 -0
- nodescraper/plugins/inband/journal/journal_collector.py +107 -0
- nodescraper/plugins/inband/journal/journal_plugin.py +40 -0
- nodescraper/plugins/inband/journal/journaldata.py +44 -0
- nodescraper/plugins/inband/kernel/__init__.py +25 -0
- nodescraper/plugins/inband/kernel/analyzer_args.py +64 -0
- nodescraper/plugins/inband/kernel/kernel_analyzer.py +91 -0
- nodescraper/plugins/inband/kernel/kernel_collector.py +129 -0
- nodescraper/plugins/inband/kernel/kernel_plugin.py +43 -0
- nodescraper/plugins/inband/kernel/kerneldata.py +32 -0
- nodescraper/plugins/inband/kernel_module/__init__.py +25 -0
- nodescraper/plugins/inband/kernel_module/analyzer_args.py +59 -0
- nodescraper/plugins/inband/kernel_module/kernel_module_analyzer.py +211 -0
- nodescraper/plugins/inband/kernel_module/kernel_module_collector.py +264 -0
- nodescraper/plugins/inband/kernel_module/kernel_module_data.py +60 -0
- nodescraper/plugins/inband/kernel_module/kernel_module_plugin.py +43 -0
- nodescraper/plugins/inband/memory/__init__.py +25 -0
- nodescraper/plugins/inband/memory/analyzer_args.py +45 -0
- nodescraper/plugins/inband/memory/memory_analyzer.py +98 -0
- nodescraper/plugins/inband/memory/memory_collector.py +330 -0
- nodescraper/plugins/inband/memory/memory_plugin.py +43 -0
- nodescraper/plugins/inband/memory/memorydata.py +90 -0
- nodescraper/plugins/inband/network/__init__.py +28 -0
- nodescraper/plugins/inband/network/network_collector.py +1828 -0
- nodescraper/plugins/inband/network/network_plugin.py +37 -0
- nodescraper/plugins/inband/network/networkdata.py +319 -0
- nodescraper/plugins/inband/nvme/__init__.py +28 -0
- nodescraper/plugins/inband/nvme/nvme_collector.py +167 -0
- nodescraper/plugins/inband/nvme/nvme_plugin.py +37 -0
- nodescraper/plugins/inband/nvme/nvmedata.py +45 -0
- nodescraper/plugins/inband/os/__init__.py +25 -0
- nodescraper/plugins/inband/os/analyzer_args.py +64 -0
- nodescraper/plugins/inband/os/os_analyzer.py +73 -0
- nodescraper/plugins/inband/os/os_collector.py +131 -0
- nodescraper/plugins/inband/os/os_plugin.py +43 -0
- nodescraper/plugins/inband/os/osdata.py +31 -0
- nodescraper/plugins/inband/package/__init__.py +25 -0
- nodescraper/plugins/inband/package/analyzer_args.py +48 -0
- nodescraper/plugins/inband/package/package_analyzer.py +253 -0
- nodescraper/plugins/inband/package/package_collector.py +273 -0
- nodescraper/plugins/inband/package/package_plugin.py +43 -0
- nodescraper/plugins/inband/package/packagedata.py +41 -0
- nodescraper/plugins/inband/pcie/__init__.py +29 -0
- nodescraper/plugins/inband/pcie/analyzer_args.py +63 -0
- nodescraper/plugins/inband/pcie/pcie_analyzer.py +1081 -0
- nodescraper/plugins/inband/pcie/pcie_collector.py +690 -0
- nodescraper/plugins/inband/pcie/pcie_data.py +2017 -0
- nodescraper/plugins/inband/pcie/pcie_plugin.py +43 -0
- nodescraper/plugins/inband/process/__init__.py +25 -0
- nodescraper/plugins/inband/process/analyzer_args.py +45 -0
- nodescraper/plugins/inband/process/collector_args.py +31 -0
- nodescraper/plugins/inband/process/process_analyzer.py +91 -0
- nodescraper/plugins/inband/process/process_collector.py +115 -0
- nodescraper/plugins/inband/process/process_plugin.py +46 -0
- nodescraper/plugins/inband/process/processdata.py +34 -0
- nodescraper/plugins/inband/rocm/__init__.py +25 -0
- nodescraper/plugins/inband/rocm/analyzer_args.py +66 -0
- nodescraper/plugins/inband/rocm/rocm_analyzer.py +100 -0
- nodescraper/plugins/inband/rocm/rocm_collector.py +205 -0
- nodescraper/plugins/inband/rocm/rocm_plugin.py +43 -0
- nodescraper/plugins/inband/rocm/rocmdata.py +62 -0
- nodescraper/plugins/inband/storage/__init__.py +25 -0
- nodescraper/plugins/inband/storage/analyzer_args.py +38 -0
- nodescraper/plugins/inband/storage/collector_args.py +31 -0
- nodescraper/plugins/inband/storage/storage_analyzer.py +152 -0
- nodescraper/plugins/inband/storage/storage_collector.py +110 -0
- nodescraper/plugins/inband/storage/storage_plugin.py +44 -0
- nodescraper/plugins/inband/storage/storagedata.py +70 -0
- nodescraper/plugins/inband/sysctl/__init__.py +29 -0
- nodescraper/plugins/inband/sysctl/analyzer_args.py +67 -0
- nodescraper/plugins/inband/sysctl/sysctl_analyzer.py +81 -0
- nodescraper/plugins/inband/sysctl/sysctl_collector.py +101 -0
- nodescraper/plugins/inband/sysctl/sysctl_plugin.py +43 -0
- nodescraper/plugins/inband/sysctl/sysctldata.py +42 -0
- nodescraper/plugins/inband/syslog/__init__.py +28 -0
- nodescraper/plugins/inband/syslog/syslog_collector.py +121 -0
- nodescraper/plugins/inband/syslog/syslog_plugin.py +37 -0
- nodescraper/plugins/inband/syslog/syslogdata.py +46 -0
- nodescraper/plugins/inband/uptime/__init__.py +25 -0
- nodescraper/plugins/inband/uptime/uptime_collector.py +88 -0
- nodescraper/plugins/inband/uptime/uptime_plugin.py +37 -0
- nodescraper/plugins/inband/uptime/uptimedata.py +31 -0
- nodescraper/resultcollators/__init__.py +25 -0
- nodescraper/resultcollators/tablesummary.py +159 -0
- nodescraper/taskresulthooks/__init__.py +28 -0
- nodescraper/taskresulthooks/filesystemloghook.py +88 -0
- nodescraper/typeutils.py +171 -0
- nodescraper/utils.py +412 -0
|
@@ -0,0 +1,1081 @@
|
|
|
1
|
+
###############################################################################
|
|
2
|
+
#
|
|
3
|
+
# MIT License
|
|
4
|
+
#
|
|
5
|
+
# Copyright (c) 2025 Advanced Micro Devices, Inc.
|
|
6
|
+
#
|
|
7
|
+
# Permission is hereby granted, free of charge, to any person obtaining a copy
|
|
8
|
+
# of this software and associated documentation files (the "Software"), to deal
|
|
9
|
+
# in the Software without restriction, including without limitation the rights
|
|
10
|
+
# to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
|
|
11
|
+
# copies of the Software, and to permit persons to whom the Software is
|
|
12
|
+
# furnished to do so, subject to the following conditions:
|
|
13
|
+
#
|
|
14
|
+
# The above copyright notice and this permission notice shall be included in all
|
|
15
|
+
# copies or substantial portions of the Software.
|
|
16
|
+
#
|
|
17
|
+
# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
|
|
18
|
+
# IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
|
|
19
|
+
# FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
|
|
20
|
+
# AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
|
|
21
|
+
# LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
|
|
22
|
+
# OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
|
|
23
|
+
# SOFTWARE.
|
|
24
|
+
#
|
|
25
|
+
###############################################################################
|
|
26
|
+
from typing import Dict, List, Optional, Set, Type, TypeVar
|
|
27
|
+
|
|
28
|
+
from pydantic import BaseModel, Field, ValidationError, field_validator
|
|
29
|
+
|
|
30
|
+
from nodescraper.enums import EventCategory, EventPriority
|
|
31
|
+
from nodescraper.interfaces import DataAnalyzer
|
|
32
|
+
from nodescraper.models import TaskResult
|
|
33
|
+
from nodescraper.utils import get_exception_traceback
|
|
34
|
+
|
|
35
|
+
from .analyzer_args import PcieAnalyzerArgs, normalize_to_dict
|
|
36
|
+
from .pcie_data import (
|
|
37
|
+
BdfStr,
|
|
38
|
+
CorrErrMaskReg,
|
|
39
|
+
CorrErrStatReg,
|
|
40
|
+
ECap16Gt,
|
|
41
|
+
ECapAer,
|
|
42
|
+
ECapSecpci,
|
|
43
|
+
ParityMisMatchStat16GT,
|
|
44
|
+
PcieCapStructure,
|
|
45
|
+
PcieCfgSpace,
|
|
46
|
+
PcieDataModel,
|
|
47
|
+
PcieExp,
|
|
48
|
+
PcieRegister,
|
|
49
|
+
UncorrErrMaskReg,
|
|
50
|
+
UncorrErrSevReg,
|
|
51
|
+
UncorrErrStatReg,
|
|
52
|
+
)
|
|
53
|
+
|
|
54
|
+
# Type variable bound to PcieCapStructure; get_valid_cap_dict uses it so the
# returned dict is typed with the same capability class that was requested.
T_CAP = TypeVar("T_CAP", bound=PcieCapStructure)
|
|
55
|
+
|
|
56
|
+
|
|
57
|
+
class PcieAnalyzerInputModel(BaseModel):
    """
    PCIeAnalyzerInputModel is a data model for validating and storing input parameters
    related to PCIe (Peripheral Component Interconnect Express) analysis.

    Attributes:
        exp_speed (int): Expected PCIe speed, i.e. the PCIe generation, constrained to
            values between 1 and 5 (inclusive).
        exp_width (int): Expected PCIe width, constrained to values between 1 and 16 (inclusive).
        exp_sriov_count (Optional[int]): Optional expected count of SR-IOV (Single Root I/O
            Virtualization) instances.
        exp_gpu_count_override (Optional[int]): Optional override for the expected GPU count.
        exp_max_payload_size (Dict[int, int]): Expected max payload size in bytes, keyed by an
            integer in the range 0-0xFFFF. Values may be supplied encoded (0-5) or in bytes.
        exp_max_rd_req_size (Dict[int, int]): Expected max read request size in bytes, keyed by
            an integer in the range 0-0xFFFF. Values may be supplied encoded (0-5) or in bytes.
        exp_ten_bit_tag_req_en (Dict[int, int]): Expected 10-bit tag requester enable value
            (0 or 1), keyed by an integer in the range 0-0xFFFF.
    """

    exp_speed: int = Field(ge=1, le=5)
    exp_width: int = Field(ge=1, le=16)
    exp_sriov_count: Optional[int] = None
    exp_gpu_count_override: Optional[int] = None
    exp_max_payload_size: Dict[int, int] = Field(default_factory=dict)
    exp_max_rd_req_size: Dict[int, int] = Field(default_factory=dict)
    exp_ten_bit_tag_req_en: Dict[int, int] = Field(default_factory=dict)

    @field_validator("exp_max_rd_req_size", "exp_max_payload_size", mode="before")
    @classmethod
    def validate_exp_max_rd_req_size(cls, v: Optional[Dict[int, int]]) -> Dict[int, int]:
        """Validates and normalizes the expected max read request / max payload sizes.

        Each value is accepted either as an encoded size 0-5, which is converted to
        ``128 << value`` bytes, or directly as one of 128, 256, 512, 1024, 2048, 4096.

        Args:
            v: Raw mapping supplied for the field, or None.

        Returns:
            A copy of the mapping with every value expressed in bytes; an empty dict
            when ``v`` is None.

        Raises:
            ValueError: If a value is not an accepted size or a key is outside 0-0xFFFF.
        """
        if v is None:
            return {}
        valid_sizes = {128, 256, 512, 1024, 2048, 4096}
        ret_dict = v.copy()
        for key, value in v.items():
            # Encoded values are converted first so that the membership check below
            # is applied to the byte size; checking the raw encoded value would
            # reject every encoded input.
            size = 128 << value if 0 <= value <= 5 else value
            if size not in valid_sizes:
                raise ValueError(
                    "Expected max read request size must be one of: "
                    "0, 1, 2, 3, 4, 5, 128, 256, 512, 1024, 2048, or 4096."
                )
            if key < 0 or key > 0xFFFF:
                raise ValueError(" key must be a valid BDF (0-65535).")
            ret_dict[key] = size
        return ret_dict

    @field_validator("exp_ten_bit_tag_req_en", mode="before")
    @classmethod
    def validate_exp_ten_bit_tag_req_en(cls, v: Optional[Dict[int, int]]) -> Dict[int, int]:
        """Validates the expected 10-bit tag request enable value.

        Args:
            v: Raw mapping supplied for the field, or None.

        Returns:
            The mapping unchanged, or an empty dict when ``v`` is None.

        Raises:
            ValueError: If a key is outside 0-0xFFFF or a value is not 0 or 1.
        """
        if v is None:
            return {}
        for key, value in v.items():
            if key < 0 or key > 0xFFFF:
                raise ValueError("Key must be a valid BDF (0-65535).")
            if value not in {0, 1}:
                raise ValueError("Expected 10-bit tag request enable must be 0 or 1.")
        return v
|
|
107
|
+
|
|
108
|
+
|
|
109
|
+
class PcieAnalyzer(DataAnalyzer):
|
|
110
|
+
"""Check PCIe Data for errors
|
|
111
|
+
|
|
112
|
+
This class checks the following:
|
|
113
|
+
- PCIe link status for each BDF
|
|
114
|
+
- This checks if the link speed and width are as expected
|
|
115
|
+
- AER uncorrectable errors
|
|
116
|
+
- Checks PCIe AER uncorrectable error registers UNCORR_ERR_STAT_REG and reports any errors
|
|
117
|
+
- AER correctable errors
|
|
118
|
+
- Checks the AERs correctable error registers CORR_ERR_STAT_REG and reports any errors
|
|
119
|
+
- PCIe device status errors
|
|
120
|
+
- Checks PCIe device status errors reported in fields `CORR_ERR_DET` `NON_FATAL_ERR_DET` `FATAL_ERR_DET` `UR_DET`
|
|
121
|
+
- PCIe status errors
|
|
122
|
+
- Checks PCIe status errors reported in fields `MSTR_DATA_PAR_ERR` `SIGNALED_TARGET_ABORT` `RCVD_TARGET_ABORT`
|
|
123
|
+
`RCVD_MSTR_ABORT` `SIGNALED_SYS_ERR` `DET_PARITY_ERR`
|
|
124
|
+
|
|
125
|
+
"""
|
|
126
|
+
|
|
127
|
+
DATA_MODEL = PcieDataModel
|
|
128
|
+
|
|
129
|
+
GPU_BRIDGE_USP_ID = "0x1501"
|
|
130
|
+
GPU_BRIDGE_DSP_ID = "0x1500"
|
|
131
|
+
|
|
132
|
+
def validate_reg(self, bdf: str, reg: PcieRegister, log_event: bool) -> bool:
    """Ensures that the register has no error and has a value

    Parameters
    ----------
    bdf : str
        bus:device:function string, only used for logging
    reg : PcieRegister
        Register to validate
    log_event : bool
        Whether to log an event if the register is invalid

    Returns
    -------
    bool
        True when the register holds a value and carries no error, False otherwise
    """
    register_ok = reg.val is not None and reg.err is None
    if register_ok:
        return True

    # Invalid register: report it only when the caller asked for an event.
    if log_event:
        event_data = {"value": reg.val, "error": reg.err, "bdf": bdf}
        self._log_event(
            category=EventCategory.IO,
            description="No value assgined to register or register collection resulted in error",
            priority=EventPriority.WARNING,
            data=event_data,
        )
    return False
|
|
159
|
+
|
|
160
|
+
def validate_cap(
    self,
    bdf: str,
    name: str,
    capability_structure: Optional[PcieCapStructure],
    log_event: bool = True,
) -> bool:
    """Ensures that the capability structure exists and has no unset/error registers

    Parameters
    ----------
    bdf : str
        bus:device:function string, only used for logging
    name : str
        Name of the capability structure, only used for logging
    capability_structure : Optional[PcieCapStructure]
        Capability structure to validate
    log_event : bool, optional
        Whether to log an event when validation fails, by default True

    Returns
    -------
    bool
        True when validated successfully, False otherwise
    """
    if capability_structure is None:
        failure_description = "No value assgined to capability a structure "
        failure_data = {
            "name": name,
            "bdf": bdf,
        }
    else:
        null_regs = capability_structure.null_err_regs()
        if not null_regs:
            return True
        failure_description = "Capability structure has unset registers"
        failure_data = {
            "name": name,
            "bdf": bdf,
            "capability_structure": capability_structure,
            "null_regs": null_regs,
        }

    # Both failure modes are reported the same way and differ only in payload.
    if log_event:
        self._log_event(
            category=EventCategory.IO,
            description=failure_description,
            data=failure_data,
            priority=EventPriority.WARNING,
        )
    return False
|
|
209
|
+
|
|
210
|
+
def validate_cap_dict(
    self,
    pcie_cfg_space: Dict[BdfStr, PcieCfgSpace],
    cap_struct: Type[PcieCapStructure],
    log_event: bool = True,
) -> set[str]:
    """Validates one capability structure type across every BDF's configuration space

    Parameters
    ----------
    pcie_cfg_space : Dict[BdfStr, PcieCfgSpace]
        Configuration space for each BDF
    cap_struct : Type[PcieCapStructure]
        The capability structure type to validate against each BDF's configuration space
    log_event : bool, optional
        Whether to log a single summary event listing the BDFs that failed validation,
        by default True

    Returns
    -------
    set[str]
        The BDFs whose capability structure passed validation
    """
    # Per-BDF events are suppressed here; failures are summarised in one event below.
    failed_bdfs = {
        bdf
        for bdf, cfg_space in pcie_cfg_space.items()
        if not self.validate_cap(
            bdf, cap_struct.__name__, cfg_space.get_struct(cap_struct), False
        )
    }

    if log_event and failed_bdfs:
        summary = {
            "bdf_without_pcie_exp": list(failed_bdfs),
            "num_bdfs_with_invalid_capability_structure": len(failed_bdfs),
            "total_bdfs": len(pcie_cfg_space),
        }
        self._log_event(
            category=EventCategory.IO,
            description=f"Capability Structure {cap_struct.__name__} not found in a Cfg Space",
            priority=EventPriority.WARNING,
            data=summary,
        )

    all_bdfs = set(pcie_cfg_space.keys())
    return all_bdfs - failed_bdfs
|
|
249
|
+
|
|
250
|
+
def get_valid_cap_dict(
    self,
    pcie_cfg_space: Dict[BdfStr, PcieCfgSpace],
    cap_struct: Type[T_CAP],
    log_event: bool = True,
) -> dict[BdfStr, T_CAP]:
    """Collects the requested capability structure for every BDF where it is valid

    Parameters
    ----------
    pcie_cfg_space : Dict[BdfStr, PcieCfgSpace]
        Configuration space for each BDF
    cap_struct : Type[T_CAP]
        The capability structure type to look up in each BDF's configuration space
    log_event : bool, optional
        Forwarded to validate_cap_dict to control whether invalid BDFs are logged,
        by default True

    Returns
    -------
    dict[BdfStr, T_CAP]
        Mapping of BDF to its capability structure, limited to BDFs that passed validation
    """
    valid_bdfs = self.validate_cap_dict(pcie_cfg_space, cap_struct, log_event=log_event)

    result: Dict[BdfStr, T_CAP] = {}
    for bdf, cfg_space in pcie_cfg_space.items():
        if bdf in valid_bdfs:
            found = cfg_space.get_struct(cap_struct)
            if found is not None:
                result[bdf] = found
    return result
|
|
283
|
+
|
|
284
|
+
def check_link_status(
    self,
    bdf_pcie_express_dict: Dict[str, PcieExp],
    exp_speed: int = 5,
    exp_width: int = 16,
):
    """Checks PCIe link status for each bdf and compares it with the expected speed/width

    An event is logged for every BDF whose current link speed field is 0, whose
    resolved link speed differs from ``exp_speed``, or whose negotiated width
    differs from ``exp_width``. Any exception raised while processing a BDF is
    logged as an event and processing moves on to the next BDF.

    Args:
        bdf_pcie_express_dict (Dict[str, PcieExp]):
            dict of key bdf and value PcieExp capability structure; its
            ``lnk_stat_reg`` and ``lnk_cap_2_reg`` registers are read
        exp_speed (int): expected link speed (PCIe generation)
        exp_width (int): expected link width

    Returns:
        None
    """
    # Key: single set bit of the supported link speed vector, value: Gen <N>.
    # Key 0 covers the case where the current speed's bit is not set in the vector.
    sv_gen_speed = {
        0b000000: 0,
        0b000001: 1,
        0b000010: 2,
        0b000100: 3,
        0b001000: 4,
        0b010000: 5,
    }
    for bdf, pcie_exp in bdf_pcie_express_dict.items():
        lnk_stat_reg = pcie_exp.lnk_stat_reg
        lnk_cap_2_reg = pcie_exp.lnk_cap_2_reg
        try:
            if lnk_stat_reg.curr_lnk_speed.val == 0:
                self._log_event(
                    category=EventCategory.IO,
                    description="Link speed vector is 0",
                    data={
                        "bdf": bdf,
                        "curr_lnk_speed": lnk_stat_reg.curr_lnk_speed.val,
                        "supported_lnk_speed_vec": lnk_cap_2_reg.supported_lnk_speed_vec.val,
                    },
                    priority=EventPriority.ERROR,
                )
                continue

            curr_speed = lnk_stat_reg.curr_lnk_speed.get_val()
            supported_vec = lnk_cap_2_reg.supported_lnk_speed_vec.get_val()
            if curr_speed is None or supported_vec is None:
                continue
            # The current speed is treated as a 1-based bit position in the
            # supported link speed vector.
            sv_mask = 0b1 << (curr_speed - 1)
            link_speed = sv_gen_speed[sv_mask & supported_vec]

            if link_speed != exp_speed:
                self._log_event(
                    category=EventCategory.IO,
                    description="Unexpected link speed detected",
                    priority=EventPriority.ERROR,
                    data={
                        "bdf": bdf,
                        "current_speed": link_speed,
                        "expected_speed": exp_speed,
                    },
                )
            if lnk_stat_reg.neg_lnk_width.get_val() != exp_width:
                self._log_event(
                    category=EventCategory.IO,
                    description="Unexpected link width detected",
                    priority=EventPriority.ERROR,
                    data={
                        "bdf": bdf,
                        "current_width": lnk_stat_reg.neg_lnk_width.get_val(),
                        "expected_width": exp_width,
                    },
                )
        except Exception as e:
            # Deliberately broad: one malformed device must not stop the remaining
            # BDFs from being checked. The bdf is included so the failure can be
            # attributed to a device.
            self._log_event(
                category=EventCategory.IO,
                description="Exception occurred while checking link status",
                priority=EventPriority.ERROR,
                data={"bdf": bdf, "exception": get_exception_traceback(e)},
            )
|
|
363
|
+
|
|
364
|
+
def check_uncorr_aer_errors(
    self,
    bdf_ecap_aer: Dict[BdfStr, ECapAer],
):
    """
    Checks the following AER uncorrectable error registers
    - Uncorrectable Error Status Register
    - Uncorrectable Error Mask Register
    - Uncorrectable Error Severity Register

    For every status bit field that is set, one event is logged. The matching
    severity field decides Fatal (ERROR priority) versus Non-Fatal (WARNING
    priority), and the matching mask field decides Masked versus Unmasked.

    Args:
        bdf_ecap_aer (Dict[BdfStr, ECapAer]):
            dict of key bdf and value ECapAer capability structure which contains
            the uncorrectable status, mask and severity registers
    Returns:
        None
    """
    for bdf, ecap_aer in bdf_ecap_aer.items():
        stat_reg: UncorrErrStatReg = ecap_aer.uncorr_err_stat
        mask_reg: UncorrErrMaskReg = ecap_aer.uncorr_err_mask
        sev_reg: UncorrErrSevReg = ecap_aer.uncorr_err_sev
        # Sort each register's fields by bit mask so the three sequences can be
        # walked in lockstep; this relies on the registers sharing a field layout.
        sorted_stat_fields = sorted(stat_reg.bit_fields.values(), key=lambda x: x.bit_mask)
        sorted_mask_fields = sorted(mask_reg.bit_fields.values(), key=lambda x: x.bit_mask)
        sorted_sev_fields = sorted(sev_reg.bit_fields.values(), key=lambda x: x.bit_mask)
        for stat_field, mask_field, sev_field in zip(
            sorted_stat_fields,
            sorted_mask_fields,
            sorted_sev_fields,
        ):
            pcie_field_stat_value = stat_field.get_val()
            # A field without a value is not evidence of an error; only a set
            # status bit is reported (same rule as check_corr_aer_errors).
            if pcie_field_stat_value is None or pcie_field_stat_value == 0:
                continue

            pcie_field_mask_value = mask_field.get_val()
            pcie_field_sev_value = sev_field.get_val()
            err_descriptor: Dict[str, str] = {
                "bdf": bdf,
                "reg_name": stat_reg.__class__.__name__,
                "field_desc": stat_field.desc,
                "stat": hex(pcie_field_stat_value),
                "mask": (
                    hex(pcie_field_mask_value) if pcie_field_mask_value is not None else "None"
                ),
                "sev": (
                    hex(pcie_field_sev_value) if pcie_field_sev_value is not None else "None"
                ),
            }

            # In the AER Uncorrectable Error Severity register a Set bit means the
            # error is Fatal and a Clear bit means Non-Fatal. An unknown severity
            # (None) is conservatively reported as Fatal.
            is_fatal = pcie_field_sev_value != 0
            is_masked = pcie_field_mask_value == 1

            mask_label = "Masked" if is_masked else "Unmasked"
            severity_label = "Fatal" if is_fatal else "Non-Fatal"
            self._log_event(
                category=EventCategory.IO,
                description=f"{mask_label} {severity_label} errors were detected",
                priority=EventPriority.ERROR if is_fatal else EventPriority.WARNING,
                data=err_descriptor,
            )
|
|
446
|
+
|
|
447
|
+
def check_corr_aer_errors(
    self,
    bdf_ecap_aer: Dict[BdfStr, ECapAer],
):
    """
    Checks the following AER correctable error registers
    - Correctable Error Status Register
    - Correctable Error Mask Register

    For every status bit field that is set, one event is logged: WARNING priority
    when the matching mask field is 1 (masked), ERROR priority otherwise (unmasked).

    Args:
        bdf_ecap_aer (Dict[BdfStr, ECapAer]):
            dict of key bdf and value ECapAer capability structure which contains
            the correctable status and mask registers
    Returns:
        None
    """
    for bdf, ecap_aer in bdf_ecap_aer.items():
        stat_reg: CorrErrStatReg = ecap_aer.corr_err_stat
        mask_reg: CorrErrMaskReg = ecap_aer.corr_err_mask
        # Sort both registers' fields by bit mask so they can be walked in
        # lockstep; this relies on the registers sharing a field layout.
        sorted_stat_fields = sorted(stat_reg.bit_fields.values(), key=lambda x: x.bit_mask)
        sorted_mask_fields = sorted(mask_reg.bit_fields.values(), key=lambda x: x.bit_mask)

        for stat_field, mask_field in zip(
            sorted_stat_fields,
            sorted_mask_fields,
        ):
            stat_val = stat_field.get_val()
            if stat_val is None or stat_val == 0:
                continue

            err_dict = {
                "bdf": bdf,
                "reg_description": stat_reg.desc,
                "field_description": stat_field.desc,
                "bit_field_val": hex(stat_val),
            }
            if mask_field.get_val() == 1:
                self._log_event(
                    category=EventCategory.IO,
                    description="Masked Correctable errors were detected",
                    priority=EventPriority.WARNING,
                    data=err_dict,
                )
            else:
                # The mask bit is not set here, so the event must say "Unmasked";
                # the previous text duplicated the masked description.
                self._log_event(
                    category=EventCategory.IO,
                    description="Unmasked Correctable errors were detected",
                    priority=EventPriority.ERROR,
                    data=err_dict,
                )
|
|
496
|
+
|
|
497
|
+
def check_pcie_device_status_errors(self, bdf_pcie_express_dict: Dict[str, PcieExp]):
    """
    Checks PCIe baseline error reported in Device Status Register
    Reference: 9.4.1 Baseline Error Reporting

    One WARNING event is logged per BDF that has at least one of the checked
    detection bit fields set.

    Args:
        bdf_pcie_express_dict (Dict[str, PcieExp]):
            dict of key bdf and value PcieExp capability structure which
            provides the ``dev_stat_reg`` register
    Returns:
        None
    """
    for bdf, pcie_exp_cap in bdf_pcie_express_dict.items():
        dev_stat_reg = pcie_exp_cap.dev_stat_reg
        bit_field_list = [
            dev_stat_reg.corr_err_det,
            dev_stat_reg.non_fatal_err_det,
            dev_stat_reg.fatal_err_det,
            dev_stat_reg.ur_det,
        ]
        # Other checks in this class guard get_val() against None. A bare
        # "!= 0" is True for None and would report missing data as an error,
        # so None is treated as "not set" here.
        err_list = [
            bit_field for bit_field in bit_field_list if bit_field.get_val() not in (None, 0)
        ]

        if err_list:
            self._log_event(
                category=EventCategory.IO,
                description="Device Status errors were detected",
                priority=EventPriority.WARNING,
                data={
                    "bdf": bdf,
                    "reg_description": dev_stat_reg.desc,
                    "field_desc_list": [err.desc for err in err_list],
                    "err_bitmask_list": [err.bit_mask for err in err_list],
                    "register_value": dev_stat_reg.val,
                },
            )
|
|
532
|
+
|
|
533
|
+
def check_pcie_status_errors(self, bdf_cfg_space_dict: Dict[BdfStr, PcieCfgSpace]):
    """
    Checks PCIe baseline error reported in Status Register
    Reference: 9.4.1 Baseline Error Reporting

    One WARNING event is logged per BDF that has at least one of the checked
    Status Register bit fields set.

    Args:
        bdf_cfg_space_dict (dict[BdfStr, PcieCfgSpace]):
            dict of key bdf and value PcieCfgSpace object which contains register data
    Returns:
        None
    """
    for bdf, cfg_space in bdf_cfg_space_dict.items():
        stat_reg = cfg_space.type_0_configuration.status
        bit_field_list = [
            stat_reg.mstr_data_par_err,
            stat_reg.signaled_target_abort,
            stat_reg.rcvd_target_abort,
            stat_reg.rcvd_mstr_abort,
            stat_reg.signaled_sys_err,
            stat_reg.det_parity_err,
        ]
        # Other checks in this class guard get_val() against None. A bare
        # "!= 0" is True for None and would report missing data as an error,
        # so None is treated as "not set" here.
        err_list = [
            bit_field for bit_field in bit_field_list if bit_field.get_val() not in (None, 0)
        ]

        if err_list:
            self._log_event(
                category=EventCategory.IO,
                description="PCI Express Status register errors were detected",
                priority=EventPriority.WARNING,
                data={
                    "bdf": bdf,
                    "reg_description": stat_reg.desc,
                    "field_desc_list": [err.desc for err in err_list],
                    "err_bitmask_list": [err.bit_mask for err in err_list],
                    "register_value": stat_reg.val,
                },
            )
|
|
570
|
+
|
|
571
|
+
def check_pcie_dev_ctrl_reg(
    self,
    bdf_pcie_express_dict: Dict[str, PcieExp],
    exp_max_payload_size: Optional[int],
    exp_max_rd_req_size: Optional[int],
):
    """Checks 7.5.3.4 Device Control Register (Offset 08h) fields for expected value:
        - Max Payload Size
        - Max Read Request Size

    Args:
        bdf_pcie_express_dict (Dict[str, PcieExp]):
            dict of key bdf and value PcieExp capability structure which
            provides the ``dev_ctrl_reg`` register
        exp_max_payload_size (Optional[int]): expected max payload size, when None it is not checked
        exp_max_rd_req_size (Optional[int]): expected max read request size, when None it is not checked
    Returns:
        None
    """
    # Encoded field value -> decoded size. Values missing from this table
    # (0b110, 0b111) decode to None below instead of raising KeyError, so they
    # are reported as a mismatch whenever an expected size is given.
    encoding = {
        0b000: 128,
        0b001: 256,
        0b010: 512,
        0b011: 1024,
        0b100: 2048,
        0b101: 4096,
    }
    for bdf, pcie_exp in bdf_pcie_express_dict.items():
        dev_ctrl_reg = pcie_exp.dev_ctrl_reg
        mps_val = dev_ctrl_reg.mps.get_val()
        if mps_val is None:
            # No payload size data: the whole device is skipped, including
            # the read request size check below.
            continue
        max_payload_size = encoding.get(mps_val)
        if exp_max_payload_size is not None and max_payload_size != exp_max_payload_size:
            self._log_event(
                category=EventCategory.IO,
                description="Unexpected Max Payload Size detected",
                priority=EventPriority.ERROR,
                data={
                    "bdf": bdf,
                    "current_max_payload_size": max_payload_size,
                    "expected_max_payload_size": exp_max_payload_size,
                },
            )

        max_rd_req_val = dev_ctrl_reg.max_rd_req_size.get_val()
        if max_rd_req_val is None:
            continue
        max_rd_req_size = encoding.get(max_rd_req_val)
        # Guard on the *expected* value: the previous guard tested the decoded
        # value, so every device was flagged when no expectation was supplied.
        if exp_max_rd_req_size is not None and max_rd_req_size != exp_max_rd_req_size:
            self._log_event(
                category=EventCategory.IO,
                description="Unexpected Max Read Request Size detected",
                priority=EventPriority.ERROR,
                data={
                    "bdf": bdf,
                    "current_max_rd_req_size": max_rd_req_size,
                    "expected_max_rd_req_size": exp_max_rd_req_size,
                },
            )
|
|
630
|
+
|
|
631
|
+
def check_pcie_dev_ctrl_2_reg(
    self,
    bdf_pcie_express_dict: Dict[str, PcieExp],
    exp_ten_bit_tag_req_en: Optional[int],
):
    """Checks 7.5.3.16 Device Control 2 Register (Offset 28h) fields for expected value:
        - 10-bit Tag Request Enable

    An ERROR event is logged for every BDF whose field value differs from
    the expected value.

    Args:
        bdf_pcie_express_dict (Dict[str, PcieExp]):
            dict of key bdf and value PcieExp capability structure which
            provides the ``dev_ctrl_2_reg`` register
        exp_ten_bit_tag_req_en (Optional[int]): expected 10-bit tag request enable, when None it is not checked
    Returns:
        None
    """
    for bdf, pcie_exp in bdf_pcie_express_dict.items():
        current_value = pcie_exp.dev_ctrl_2_reg.ten_bit_tag_req_en.get_val()
        if exp_ten_bit_tag_req_en is None or current_value == exp_ten_bit_tag_req_en:
            # Either nothing to compare against or the value matches.
            continue
        mismatch = {
            "bdf": bdf,
            "current_ten_bit_tag_req_en": current_value,
            "expected_ten_bit_tag_req_en": exp_ten_bit_tag_req_en,
        }
        self._log_event(
            category=EventCategory.IO,
            description="Unexpected 10-bit Tag Request Enable detected",
            priority=EventPriority.ERROR,
            data=mismatch,
        )
|
|
660
|
+
|
|
661
|
+
def instantaneous_par_err_chk(self, bdf_cfg_space_dict: Dict[str, ECap16Gt]):
    """Instantaneous parity error check for ECap16Gt registers.

    For each BDF the parity mismatch status register and the first retimer
    parity mismatch status register are inspected; an ERROR event is logged
    for every register that has any of its low 32 bits set. Registers whose
    value is None are skipped.

    Parameters
    ----------
    bdf_cfg_space_dict : Dict[str, ECap16Gt]
        Dictionary of BDFs and their corresponding ECap16Gt capability structure
    """
    for bdf, ecap_pl_16gt in bdf_cfg_space_dict.items():
        par_mismatch_stat: ParityMisMatchStat16GT = ecap_pl_16gt.parity_mismatch_stat
        registers = (par_mismatch_stat, ecap_pl_16gt.retimer_fst_parity_mismatch_stat)
        for parity_register in registers:
            reg_val = parity_register.val
            if reg_val is None:
                continue
            # Each set bit in positions 0..31 counts as one bad lane.
            number_of_bad_lanes = sum((reg_val >> bit) & 1 for bit in range(32))
            if number_of_bad_lanes <= 0:
                continue
            self._log_event(
                category=EventCategory.IO,
                description="Lanes have parity errors",
                priority=EventPriority.ERROR,
                data={
                    "bdf": bdf,
                    "reg_name": parity_register.__class__.__name__,
                    "reg_desc": parity_register.desc,
                    "register_value": parity_register.val,
                    "number_of_bad_lanes": number_of_bad_lanes,
                },
            )
|
|
696
|
+
|
|
697
|
+
def lane_error_status_chk(self, ecap_sec_pci_dict: Dict[str, ECapSecpci]):
    """Lane error status check for ECapSecpci registers, will log an event if any lanes have errors.

    An ERROR event is logged for every BDF whose lane error status register
    holds a non-zero value. Registers whose value is None are skipped.

    Parameters
    ----------
    ecap_sec_pci_dict : Dict[str, ECapSecpci]
        Dictionary of BDFs and their corresponding ECapSecpci capability structure
    """
    for bdf, ecap_sec_pci in ecap_sec_pci_dict.items():
        lane_error_stat = ecap_sec_pci.lane_err_stat
        lane_error_stat_val = lane_error_stat.val
        # The parity check in this class skips registers whose value is None;
        # do the same here, since "None != 0" would otherwise be reported as
        # a lane error.
        if lane_error_stat_val is None or lane_error_stat_val == 0:
            continue
        self._log_event(
            category=EventCategory.IO,
            description="Lane error detected",
            priority=EventPriority.ERROR,
            data={
                "bdf": bdf,
                "reg_name": lane_error_stat.__class__.__name__,
                "register_value": lane_error_stat_val,
            },
        )
|
|
719
|
+
|
|
720
|
+
def device_consistancy_chk(self, bdf_cfg_space_dict: Dict[BdfStr, PcieCfgSpace]):
    """Checks that the configurable fields in the PCIe devices are all consistent

    BDFs are grouped by device ID. For each group the Device Control Register
    values (max payload size, max read request size, extended tag field
    enable) are collected from the PcieExp capability structure, and a WARNING
    event is logged when any of them differs within the group or when a BDF of
    the group has no PcieExp structure. BDFs without a device ID value are
    reported with a WARNING and excluded from the grouping.
    """
    # Group BDFs by the device IDs actually present in the supplied data.
    dev_id_bdf_map: Dict[int, List[BdfStr]] = {}
    for bdf, cfg_space in bdf_cfg_space_dict.items():
        device_id = cfg_space.type_0_configuration.device_id.val
        if device_id is not None:
            dev_id_bdf_map.setdefault(device_id, []).append(bdf)
            continue
        self._log_event(
            category=EventCategory.IO,
            description="No value assigned to device id, unable to check consistency due to missing data",
            data={
                "bdf": bdf,
            },
            priority=EventPriority.WARNING,
        )

    cap_struct_dict = self.get_valid_cap_dict(bdf_cfg_space_dict, PcieExp, log_event=False)
    for collected_device_id, list_of_bdfs in dev_id_bdf_map.items():
        payload_sizes = []
        rd_req_sizes = []
        ext_tag_enables = []
        missing_capability = False
        for bdf in list_of_bdfs:
            if bdf in cap_struct_dict:
                dev_ctrl_reg = cap_struct_dict[bdf].dev_ctrl_reg
                payload_sizes.append(dev_ctrl_reg.mps.val)
                rd_req_sizes.append(dev_ctrl_reg.max_rd_req_size.val)
                ext_tag_enables.append(dev_ctrl_reg.ext_tag_field_en.val)
            else:
                # No capability structure for this BDF; report the group below.
                missing_capability = True

        values_differ = (
            len(set(payload_sizes)) > 1
            or len(set(rd_req_sizes)) > 1
            or len(set(ext_tag_enables)) > 1
        )
        if not (values_differ or missing_capability):
            continue

        collected_device_id_str = hex(collected_device_id)
        self._log_event(
            category=EventCategory.IO,
            description=f"PCIe device {collected_device_id_str} has inconsistent values",
            priority=EventPriority.WARNING,
            data={
                "dev_id": collected_device_id_str,
                "bdf_list": list_of_bdfs,
                "max_payload_size_list": payload_sizes,
                "max_rd_req_size_list": rd_req_sizes,
                "ext_tag_field_en_list": ext_tag_enables,
            },
        )
|
|
777
|
+
|
|
778
|
+
def check_ecap_16gt_regs(
    self,
    bdf_cfg_space_dict: dict[BdfStr, PcieCfgSpace],
):
    """Acquires ECap16Gt capability structure and checks for instantaneous parity errors

    The structures are obtained through ``get_valid_cap_dict`` (called with
    ``log_event=True``) and handed to ``instantaneous_par_err_chk``.
    """
    ecap_16gt_by_bdf = self.get_valid_cap_dict(bdf_cfg_space_dict, ECap16Gt, log_event=True)
    self.instantaneous_par_err_chk(bdf_cfg_space_dict=ecap_16gt_by_bdf)
|
|
788
|
+
|
|
789
|
+
def check_ecap_sec_pci_regs(
    self,
    bdf_cfg_space_dict: dict[BdfStr, PcieCfgSpace],
):
    """Acquires ECapSecpci capability structure and checks for lane errors

    The structures are obtained through ``get_valid_cap_dict`` (called with
    ``log_event=True``) and handed to ``lane_error_status_chk``.
    """
    sec_pci_by_bdf = self.get_valid_cap_dict(bdf_cfg_space_dict, ECapSecpci, log_event=True)
    self.lane_error_status_chk(ecap_sec_pci_dict=sec_pci_by_bdf)
|
|
799
|
+
|
|
800
|
+
def check_ecap_aer_errors(
    self,
    bdf_cfg_space_dict: dict[BdfStr, PcieCfgSpace],
):
    """Acquires ECapAer capability structure and checks for AER errors

    The structures are obtained through ``get_valid_cap_dict`` (called with
    ``log_event=True``); the uncorrectable check runs first, then the
    correctable check, both on the same mapping.
    """
    aer_by_bdf = self.get_valid_cap_dict(bdf_cfg_space_dict, ECapAer, log_event=True)
    for aer_check in (self.check_uncorr_aer_errors, self.check_corr_aer_errors):
        aer_check(bdf_ecap_aer=aer_by_bdf)
|
|
811
|
+
|
|
812
|
+
def check_pcie_exp_capability_structure_errors(
    self, bdf_cfg_space_dict: Dict[BdfStr, PcieCfgSpace]
):
    """Checks the PCIe Express capability structure for errors

    The PcieExp structures are obtained through ``get_valid_cap_dict``
    (called with ``log_event=False``) and handed to
    ``check_pcie_device_status_errors``.
    """
    pcie_exp_by_bdf = self.get_valid_cap_dict(bdf_cfg_space_dict, PcieExp, log_event=False)
    self.check_pcie_device_status_errors(bdf_pcie_express_dict=pcie_exp_by_bdf)
|
|
821
|
+
|
|
822
|
+
def check_pcie_exp_capability_structure_config(
    self,
    bdf_cfg_space_dict: dict[BdfStr, PcieCfgSpace],
    exp_max_payload_size: Optional[int] = None,
    exp_max_rd_req_size: Optional[int] = None,
    exp_ten_bit_tag_req_en: Optional[int] = None,
):
    """Checks the PCIe Express capability structure configuration against expected values

    The PcieExp structures are obtained through ``get_valid_cap_dict``
    (called with ``log_event=True``). The Device Control Register check runs
    when an expected payload size or read request size is given; the Device
    Control 2 Register check runs when an expected 10-bit tag request enable
    value is given.
    """
    pcie_exp_by_bdf = self.get_valid_cap_dict(bdf_cfg_space_dict, PcieExp, log_event=True)

    dev_ctrl_requested = not (exp_max_payload_size is None and exp_max_rd_req_size is None)
    if dev_ctrl_requested:
        self.check_pcie_dev_ctrl_reg(
            bdf_pcie_express_dict=pcie_exp_by_bdf,
            exp_max_payload_size=exp_max_payload_size,
            exp_max_rd_req_size=exp_max_rd_req_size,
        )

    if exp_ten_bit_tag_req_en is None:
        return
    self.check_pcie_dev_ctrl_2_reg(
        bdf_pcie_express_dict=pcie_exp_by_bdf,
        exp_ten_bit_tag_req_en=exp_ten_bit_tag_req_en,
    )
|
|
848
|
+
|
|
849
|
+
@staticmethod
def filter_pcie_data_by_device_id(
    bdf_cfg_space_dict: Dict[BdfStr, PcieCfgSpace],
    device_ids: Set[int],
) -> Dict[BdfStr, PcieCfgSpace]:
    """Filters the PCIe data by device ID

    Parameters
    ----------
    bdf_cfg_space_dict : Dict[BdfStr, PcieCfgSpace]
        Dictionary of BDFs and their corresponding PCIe configuration space
    device_ids : set[int]
        Set of device IDs to keep

    Returns
    -------
    Dict[BdfStr, PcieCfgSpace]
        New dictionary holding only the entries whose Type 0 device ID value
        is in ``device_ids``; the input dictionary is not modified
    """
    return {
        bdf: cfg_space
        for bdf, cfg_space in bdf_cfg_space_dict.items()
        if cfg_space.type_0_configuration.device_id.val in device_ids
    }
|
|
872
|
+
|
|
873
|
+
def check_gpu_count(
    self,
    pcie_data: PcieDataModel,
    expected_gpu_count: Optional[int] = None,
):
    """Check if GPU count from PCIe data matches expected count

    The count is the number of entries in ``pcie_data.pcie_cfg_space`` whose
    Type 0 vendor ID value equals ``self.system_info.vendorid_ep``. A mismatch
    is logged as an ERROR event, a match as an INFO event.

    Parameters
    ----------
    pcie_data : PcieDataModel
        PCIe data model containing collected PCIe configuration space data
    expected_gpu_count : Optional[int], optional
        Expected GPU count, by default None (no check performed)
    """
    if expected_gpu_count is None:
        return

    gpu_count_from_pcie = sum(
        1
        for cfg_space in pcie_data.pcie_cfg_space.values()
        if cfg_space.type_0_configuration.vendor_id.val == self.system_info.vendorid_ep
    )

    if gpu_count_from_pcie == expected_gpu_count:
        self._log_event(
            category=EventCategory.IO,
            description="GPU count matches expected",
            priority=EventPriority.INFO,
            data={
                "gpu_count": gpu_count_from_pcie,
            },
        )
        return

    self._log_event(
        category=EventCategory.IO,
        description="GPU count mismatch",
        priority=EventPriority.ERROR,
        data={
            "gpu_count_from_pcie": gpu_count_from_pcie,
            "expected_gpu_count": expected_gpu_count,
        },
    )
|
|
915
|
+
|
|
916
|
+
def analyze_data(
    self, data: PcieDataModel, args: Optional[PcieAnalyzerArgs] = None
) -> TaskResult:
    """Check PCIe data for errors by analyzing the PCIe register space and
    checking the enumeration of the GPUs and optional SR-IOV VFs

    The analysis stops early, returning ``self.result``, when the analyzer
    arguments fail validation or when no PCIe configuration space was
    collected for either physical or virtual functions.

    Parameters
    ----------
    data : PcieDataModel
        PCIe data model containing collected PCIe configuration space data
    args : Optional[PcieAnalyzerArgs], optional
        Analyzer arguments containing expected values for validation, by default None

    Returns
    -------
    TaskResult
        Result of the analysis
    """
    if args is None:
        args = PcieAnalyzerArgs()

    exp_speed = args.exp_speed
    exp_width = args.exp_width
    exp_sriov_count = args.exp_sriov_count
    exp_gpu_count_override = args.exp_gpu_count_override
    # NOTE(review): normalize_to_dict is given vendorid_ep here, while the keys
    # of the resulting dicts are used as device IDs further down - confirm that
    # this is the intended key for the scalar form of these arguments.
    exp_max_payload_size = normalize_to_dict(
        args.exp_max_payload_size, self.system_info.vendorid_ep
    )
    exp_max_rd_req_size = normalize_to_dict(
        args.exp_max_rd_req_size, self.system_info.vendorid_ep
    )
    exp_ten_bit_tag_req_en = normalize_to_dict(
        args.exp_ten_bit_tag_req_en, self.system_info.vendorid_ep
    )
    try:
        pcie_input_data = PcieAnalyzerInputModel(
            exp_speed=exp_speed,
            exp_width=exp_width,
            exp_sriov_count=exp_sriov_count,
            exp_gpu_count_override=exp_gpu_count_override,
            exp_ten_bit_tag_req_en=exp_ten_bit_tag_req_en,
            exp_max_payload_size=exp_max_payload_size,
            exp_max_rd_req_size=exp_max_rd_req_size,
        )
    except ValidationError as val_error:
        self._log_event(
            category=EventCategory.RUNTIME,
            description="User input for PcieAnalyzerModel is invalid",
            priority=EventPriority.ERROR,
            data={
                "validation_error": get_exception_traceback(val_error),
                "valid_input": {
                    "exp_speed": "int, 1-5",
                    "exp_width": "int, 1-16",
                    "exp_sriov_count": "Optional[int]",
                    "exp_gpu_count_override": "Optional[int]",
                },
                "actual_input": {
                    "exp_speed": exp_speed,
                    "exp_width": exp_width,
                    "exp_sriov_count": exp_sriov_count,
                    "exp_gpu_count_override": exp_gpu_count_override,
                },
            },
        )
        return self.result

    pcie_data: PcieDataModel = data

    # If both of the PCIe configuration spaces are empty there is nothing to
    # analyze. Truthiness is used instead of "== {}" because the VF config
    # space is treated as possibly None further down, and None must count as
    # empty here as well.
    if not pcie_data.pcie_cfg_space and not pcie_data.vf_pcie_cfg_space:
        self._log_event(
            category=EventCategory.IO,
            description="No PCIe config space found",
            priority=EventPriority.WARNING,
        )
        return self.result

    # Check every link in the PCIe configuration space for the expected capability structure,
    # but don't check VF since those will be 0
    bdf_pcie_express_dict = self.get_valid_cap_dict(
        pcie_data.pcie_cfg_space,
        PcieExp,
        log_event=True,
    )
    self.check_link_status(
        bdf_pcie_express_dict=bdf_pcie_express_dict,
        exp_speed=exp_speed,
        exp_width=exp_width,
    )

    # Device IDs of every function whose vendor ID matches vendorid_ep.
    amd_device_ids = set()
    for cfg_space in pcie_data.pcie_cfg_space.values():
        vendor_id = cfg_space.type_0_configuration.vendor_id.val
        device_id = cfg_space.type_0_configuration.device_id.val
        if vendor_id == self.system_info.vendorid_ep and device_id is not None:
            amd_device_ids.add(device_id)

    # Filter PCIe data for AMD GPUs
    oam_pcie_data = self.filter_pcie_data_by_device_id(
        bdf_cfg_space_dict=pcie_data.pcie_cfg_space,
        device_ids=amd_device_ids,
    )

    # Same selection for the virtual functions, when VF data is present.
    amd_vf_device_ids = set()
    if pcie_data.vf_pcie_cfg_space is not None:
        for cfg_space in pcie_data.vf_pcie_cfg_space.values():
            vendor_id = cfg_space.type_0_configuration.vendor_id.val
            device_id = cfg_space.type_0_configuration.device_id.val
            if vendor_id == self.system_info.vendorid_ep and device_id is not None:
                amd_vf_device_ids.add(device_id)

        oam_vf_pcie_data = self.filter_pcie_data_by_device_id(
            bdf_cfg_space_dict=pcie_data.vf_pcie_cfg_space,
            device_ids=amd_vf_device_ids,
        )
    else:
        oam_vf_pcie_data = {}

    # Include bridge/retimer devices (0x1500, 0x1501)
    us_ds_retimer = self.filter_pcie_data_by_device_id(
        bdf_cfg_space_dict=pcie_data.pcie_cfg_space,
        device_ids={0x1500, 0x1501},
    )
    ubb_data = {**oam_pcie_data, **us_ds_retimer}
    ubb_data_with_vf = {**ubb_data, **oam_vf_pcie_data}
    # Type 0 Configuration Space Checks
    self.check_pcie_status_errors(bdf_cfg_space_dict=ubb_data_with_vf)
    # Check other capability structures: one configuration check per device
    # ID that has at least one expected value supplied.
    dev_ids = set(
        list(pcie_input_data.exp_max_payload_size.keys())
        + list(pcie_input_data.exp_max_rd_req_size.keys())
        + list(pcie_input_data.exp_ten_bit_tag_req_en.keys())
    )
    for device_id_to_check in dev_ids:
        cfg_space_filtered = self.filter_pcie_data_by_device_id(
            bdf_cfg_space_dict=pcie_data.pcie_cfg_space,
            device_ids={device_id_to_check},
        )
        self.check_pcie_exp_capability_structure_config(
            cfg_space_filtered,
            pcie_input_data.exp_max_payload_size.get(device_id_to_check),
            pcie_input_data.exp_max_rd_req_size.get(device_id_to_check),
            pcie_input_data.exp_ten_bit_tag_req_en.get(device_id_to_check),
        )

    # run with vfs for AERs and PCIe EXP errors
    self.check_pcie_exp_capability_structure_errors(bdf_cfg_space_dict=ubb_data_with_vf)
    self.check_ecap_aer_errors(bdf_cfg_space_dict=ubb_data_with_vf)
    self.check_ecap_16gt_regs(bdf_cfg_space_dict=ubb_data)
    self.check_ecap_sec_pci_regs(bdf_cfg_space_dict=ubb_data)

    if amd_device_ids:
        self.device_consistancy_chk(
            bdf_cfg_space_dict=ubb_data,
        )
    else:
        self._log_event(
            category=EventCategory.RUNTIME,
            description="No AMD GPU devices found, skipping device consistency check",
            priority=EventPriority.INFO,
        )

    self.check_gpu_count(pcie_data, exp_gpu_count_override)

    return self.result
|