amd-node-scraper 0.0.1__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (197) hide show
  1. amd_node_scraper-0.0.1.dist-info/LICENSE +21 -0
  2. amd_node_scraper-0.0.1.dist-info/METADATA +424 -0
  3. amd_node_scraper-0.0.1.dist-info/RECORD +197 -0
  4. amd_node_scraper-0.0.1.dist-info/WHEEL +5 -0
  5. amd_node_scraper-0.0.1.dist-info/entry_points.txt +2 -0
  6. amd_node_scraper-0.0.1.dist-info/top_level.txt +1 -0
  7. nodescraper/__init__.py +32 -0
  8. nodescraper/base/__init__.py +34 -0
  9. nodescraper/base/inbandcollectortask.py +118 -0
  10. nodescraper/base/inbanddataplugin.py +39 -0
  11. nodescraper/base/regexanalyzer.py +120 -0
  12. nodescraper/cli/__init__.py +29 -0
  13. nodescraper/cli/cli.py +511 -0
  14. nodescraper/cli/constants.py +27 -0
  15. nodescraper/cli/dynamicparserbuilder.py +171 -0
  16. nodescraper/cli/helper.py +517 -0
  17. nodescraper/cli/inputargtypes.py +129 -0
  18. nodescraper/configbuilder.py +123 -0
  19. nodescraper/configregistry.py +66 -0
  20. nodescraper/configs/node_status.json +19 -0
  21. nodescraper/connection/__init__.py +25 -0
  22. nodescraper/connection/inband/__init__.py +46 -0
  23. nodescraper/connection/inband/inband.py +171 -0
  24. nodescraper/connection/inband/inbandlocal.py +93 -0
  25. nodescraper/connection/inband/inbandmanager.py +151 -0
  26. nodescraper/connection/inband/inbandremote.py +173 -0
  27. nodescraper/connection/inband/sshparams.py +43 -0
  28. nodescraper/constants.py +26 -0
  29. nodescraper/enums/__init__.py +40 -0
  30. nodescraper/enums/eventcategory.py +89 -0
  31. nodescraper/enums/eventpriority.py +42 -0
  32. nodescraper/enums/executionstatus.py +44 -0
  33. nodescraper/enums/osfamily.py +34 -0
  34. nodescraper/enums/systeminteraction.py +41 -0
  35. nodescraper/enums/systemlocation.py +33 -0
  36. nodescraper/generictypes.py +36 -0
  37. nodescraper/interfaces/__init__.py +44 -0
  38. nodescraper/interfaces/connectionmanager.py +143 -0
  39. nodescraper/interfaces/dataanalyzertask.py +138 -0
  40. nodescraper/interfaces/datacollectortask.py +185 -0
  41. nodescraper/interfaces/dataplugin.py +356 -0
  42. nodescraper/interfaces/plugin.py +127 -0
  43. nodescraper/interfaces/resultcollator.py +56 -0
  44. nodescraper/interfaces/task.py +164 -0
  45. nodescraper/interfaces/taskresulthook.py +39 -0
  46. nodescraper/models/__init__.py +48 -0
  47. nodescraper/models/analyzerargs.py +93 -0
  48. nodescraper/models/collectorargs.py +30 -0
  49. nodescraper/models/connectionconfig.py +34 -0
  50. nodescraper/models/datamodel.py +171 -0
  51. nodescraper/models/datapluginresult.py +39 -0
  52. nodescraper/models/event.py +158 -0
  53. nodescraper/models/pluginconfig.py +38 -0
  54. nodescraper/models/pluginresult.py +39 -0
  55. nodescraper/models/systeminfo.py +44 -0
  56. nodescraper/models/taskresult.py +185 -0
  57. nodescraper/models/timerangeargs.py +38 -0
  58. nodescraper/pluginexecutor.py +274 -0
  59. nodescraper/pluginregistry.py +152 -0
  60. nodescraper/plugins/__init__.py +25 -0
  61. nodescraper/plugins/inband/__init__.py +25 -0
  62. nodescraper/plugins/inband/amdsmi/__init__.py +28 -0
  63. nodescraper/plugins/inband/amdsmi/amdsmi_analyzer.py +821 -0
  64. nodescraper/plugins/inband/amdsmi/amdsmi_collector.py +1313 -0
  65. nodescraper/plugins/inband/amdsmi/amdsmi_plugin.py +43 -0
  66. nodescraper/plugins/inband/amdsmi/amdsmidata.py +1002 -0
  67. nodescraper/plugins/inband/amdsmi/analyzer_args.py +50 -0
  68. nodescraper/plugins/inband/amdsmi/cper.py +65 -0
  69. nodescraper/plugins/inband/bios/__init__.py +29 -0
  70. nodescraper/plugins/inband/bios/analyzer_args.py +64 -0
  71. nodescraper/plugins/inband/bios/bios_analyzer.py +93 -0
  72. nodescraper/plugins/inband/bios/bios_collector.py +93 -0
  73. nodescraper/plugins/inband/bios/bios_plugin.py +43 -0
  74. nodescraper/plugins/inband/bios/biosdata.py +30 -0
  75. nodescraper/plugins/inband/cmdline/__init__.py +25 -0
  76. nodescraper/plugins/inband/cmdline/analyzer_args.py +80 -0
  77. nodescraper/plugins/inband/cmdline/cmdline_analyzer.py +113 -0
  78. nodescraper/plugins/inband/cmdline/cmdline_collector.py +77 -0
  79. nodescraper/plugins/inband/cmdline/cmdline_plugin.py +43 -0
  80. nodescraper/plugins/inband/cmdline/cmdlinedata.py +30 -0
  81. nodescraper/plugins/inband/device_enumeration/__init__.py +29 -0
  82. nodescraper/plugins/inband/device_enumeration/analyzer_args.py +73 -0
  83. nodescraper/plugins/inband/device_enumeration/device_enumeration_analyzer.py +81 -0
  84. nodescraper/plugins/inband/device_enumeration/device_enumeration_collector.py +176 -0
  85. nodescraper/plugins/inband/device_enumeration/device_enumeration_plugin.py +45 -0
  86. nodescraper/plugins/inband/device_enumeration/deviceenumdata.py +36 -0
  87. nodescraper/plugins/inband/dimm/__init__.py +25 -0
  88. nodescraper/plugins/inband/dimm/collector_args.py +31 -0
  89. nodescraper/plugins/inband/dimm/dimm_collector.py +151 -0
  90. nodescraper/plugins/inband/dimm/dimm_plugin.py +40 -0
  91. nodescraper/plugins/inband/dimm/dimmdata.py +30 -0
  92. nodescraper/plugins/inband/dkms/__init__.py +25 -0
  93. nodescraper/plugins/inband/dkms/analyzer_args.py +85 -0
  94. nodescraper/plugins/inband/dkms/dkms_analyzer.py +106 -0
  95. nodescraper/plugins/inband/dkms/dkms_collector.py +76 -0
  96. nodescraper/plugins/inband/dkms/dkms_plugin.py +43 -0
  97. nodescraper/plugins/inband/dkms/dkmsdata.py +33 -0
  98. nodescraper/plugins/inband/dmesg/__init__.py +28 -0
  99. nodescraper/plugins/inband/dmesg/analyzer_args.py +33 -0
  100. nodescraper/plugins/inband/dmesg/collector_args.py +39 -0
  101. nodescraper/plugins/inband/dmesg/dmesg_analyzer.py +503 -0
  102. nodescraper/plugins/inband/dmesg/dmesg_collector.py +164 -0
  103. nodescraper/plugins/inband/dmesg/dmesg_plugin.py +44 -0
  104. nodescraper/plugins/inband/dmesg/dmesgdata.py +116 -0
  105. nodescraper/plugins/inband/fabrics/__init__.py +28 -0
  106. nodescraper/plugins/inband/fabrics/fabrics_collector.py +726 -0
  107. nodescraper/plugins/inband/fabrics/fabrics_plugin.py +37 -0
  108. nodescraper/plugins/inband/fabrics/fabricsdata.py +140 -0
  109. nodescraper/plugins/inband/journal/__init__.py +28 -0
  110. nodescraper/plugins/inband/journal/collector_args.py +33 -0
  111. nodescraper/plugins/inband/journal/journal_collector.py +107 -0
  112. nodescraper/plugins/inband/journal/journal_plugin.py +40 -0
  113. nodescraper/plugins/inband/journal/journaldata.py +44 -0
  114. nodescraper/plugins/inband/kernel/__init__.py +25 -0
  115. nodescraper/plugins/inband/kernel/analyzer_args.py +64 -0
  116. nodescraper/plugins/inband/kernel/kernel_analyzer.py +91 -0
  117. nodescraper/plugins/inband/kernel/kernel_collector.py +129 -0
  118. nodescraper/plugins/inband/kernel/kernel_plugin.py +43 -0
  119. nodescraper/plugins/inband/kernel/kerneldata.py +32 -0
  120. nodescraper/plugins/inband/kernel_module/__init__.py +25 -0
  121. nodescraper/plugins/inband/kernel_module/analyzer_args.py +59 -0
  122. nodescraper/plugins/inband/kernel_module/kernel_module_analyzer.py +211 -0
  123. nodescraper/plugins/inband/kernel_module/kernel_module_collector.py +264 -0
  124. nodescraper/plugins/inband/kernel_module/kernel_module_data.py +60 -0
  125. nodescraper/plugins/inband/kernel_module/kernel_module_plugin.py +43 -0
  126. nodescraper/plugins/inband/memory/__init__.py +25 -0
  127. nodescraper/plugins/inband/memory/analyzer_args.py +45 -0
  128. nodescraper/plugins/inband/memory/memory_analyzer.py +98 -0
  129. nodescraper/plugins/inband/memory/memory_collector.py +330 -0
  130. nodescraper/plugins/inband/memory/memory_plugin.py +43 -0
  131. nodescraper/plugins/inband/memory/memorydata.py +90 -0
  132. nodescraper/plugins/inband/network/__init__.py +28 -0
  133. nodescraper/plugins/inband/network/network_collector.py +1828 -0
  134. nodescraper/plugins/inband/network/network_plugin.py +37 -0
  135. nodescraper/plugins/inband/network/networkdata.py +319 -0
  136. nodescraper/plugins/inband/nvme/__init__.py +28 -0
  137. nodescraper/plugins/inband/nvme/nvme_collector.py +167 -0
  138. nodescraper/plugins/inband/nvme/nvme_plugin.py +37 -0
  139. nodescraper/plugins/inband/nvme/nvmedata.py +45 -0
  140. nodescraper/plugins/inband/os/__init__.py +25 -0
  141. nodescraper/plugins/inband/os/analyzer_args.py +64 -0
  142. nodescraper/plugins/inband/os/os_analyzer.py +73 -0
  143. nodescraper/plugins/inband/os/os_collector.py +131 -0
  144. nodescraper/plugins/inband/os/os_plugin.py +43 -0
  145. nodescraper/plugins/inband/os/osdata.py +31 -0
  146. nodescraper/plugins/inband/package/__init__.py +25 -0
  147. nodescraper/plugins/inband/package/analyzer_args.py +48 -0
  148. nodescraper/plugins/inband/package/package_analyzer.py +253 -0
  149. nodescraper/plugins/inband/package/package_collector.py +273 -0
  150. nodescraper/plugins/inband/package/package_plugin.py +43 -0
  151. nodescraper/plugins/inband/package/packagedata.py +41 -0
  152. nodescraper/plugins/inband/pcie/__init__.py +29 -0
  153. nodescraper/plugins/inband/pcie/analyzer_args.py +63 -0
  154. nodescraper/plugins/inband/pcie/pcie_analyzer.py +1081 -0
  155. nodescraper/plugins/inband/pcie/pcie_collector.py +690 -0
  156. nodescraper/plugins/inband/pcie/pcie_data.py +2017 -0
  157. nodescraper/plugins/inband/pcie/pcie_plugin.py +43 -0
  158. nodescraper/plugins/inband/process/__init__.py +25 -0
  159. nodescraper/plugins/inband/process/analyzer_args.py +45 -0
  160. nodescraper/plugins/inband/process/collector_args.py +31 -0
  161. nodescraper/plugins/inband/process/process_analyzer.py +91 -0
  162. nodescraper/plugins/inband/process/process_collector.py +115 -0
  163. nodescraper/plugins/inband/process/process_plugin.py +46 -0
  164. nodescraper/plugins/inband/process/processdata.py +34 -0
  165. nodescraper/plugins/inband/rocm/__init__.py +25 -0
  166. nodescraper/plugins/inband/rocm/analyzer_args.py +66 -0
  167. nodescraper/plugins/inband/rocm/rocm_analyzer.py +100 -0
  168. nodescraper/plugins/inband/rocm/rocm_collector.py +205 -0
  169. nodescraper/plugins/inband/rocm/rocm_plugin.py +43 -0
  170. nodescraper/plugins/inband/rocm/rocmdata.py +62 -0
  171. nodescraper/plugins/inband/storage/__init__.py +25 -0
  172. nodescraper/plugins/inband/storage/analyzer_args.py +38 -0
  173. nodescraper/plugins/inband/storage/collector_args.py +31 -0
  174. nodescraper/plugins/inband/storage/storage_analyzer.py +152 -0
  175. nodescraper/plugins/inband/storage/storage_collector.py +110 -0
  176. nodescraper/plugins/inband/storage/storage_plugin.py +44 -0
  177. nodescraper/plugins/inband/storage/storagedata.py +70 -0
  178. nodescraper/plugins/inband/sysctl/__init__.py +29 -0
  179. nodescraper/plugins/inband/sysctl/analyzer_args.py +67 -0
  180. nodescraper/plugins/inband/sysctl/sysctl_analyzer.py +81 -0
  181. nodescraper/plugins/inband/sysctl/sysctl_collector.py +101 -0
  182. nodescraper/plugins/inband/sysctl/sysctl_plugin.py +43 -0
  183. nodescraper/plugins/inband/sysctl/sysctldata.py +42 -0
  184. nodescraper/plugins/inband/syslog/__init__.py +28 -0
  185. nodescraper/plugins/inband/syslog/syslog_collector.py +121 -0
  186. nodescraper/plugins/inband/syslog/syslog_plugin.py +37 -0
  187. nodescraper/plugins/inband/syslog/syslogdata.py +46 -0
  188. nodescraper/plugins/inband/uptime/__init__.py +25 -0
  189. nodescraper/plugins/inband/uptime/uptime_collector.py +88 -0
  190. nodescraper/plugins/inband/uptime/uptime_plugin.py +37 -0
  191. nodescraper/plugins/inband/uptime/uptimedata.py +31 -0
  192. nodescraper/resultcollators/__init__.py +25 -0
  193. nodescraper/resultcollators/tablesummary.py +159 -0
  194. nodescraper/taskresulthooks/__init__.py +28 -0
  195. nodescraper/taskresulthooks/filesystemloghook.py +88 -0
  196. nodescraper/typeutils.py +171 -0
  197. nodescraper/utils.py +412 -0
@@ -0,0 +1,1081 @@
1
+ ###############################################################################
2
+ #
3
+ # MIT License
4
+ #
5
+ # Copyright (c) 2025 Advanced Micro Devices, Inc.
6
+ #
7
+ # Permission is hereby granted, free of charge, to any person obtaining a copy
8
+ # of this software and associated documentation files (the "Software"), to deal
9
+ # in the Software without restriction, including without limitation the rights
10
+ # to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
11
+ # copies of the Software, and to permit persons to whom the Software is
12
+ # furnished to do so, subject to the following conditions:
13
+ #
14
+ # The above copyright notice and this permission notice shall be included in all
15
+ # copies or substantial portions of the Software.
16
+ #
17
+ # THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
18
+ # IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
19
+ # FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
20
+ # AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
21
+ # LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
22
+ # OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
23
+ # SOFTWARE.
24
+ #
25
+ ###############################################################################
26
+ from typing import Dict, List, Optional, Set, Type, TypeVar
27
+
28
+ from pydantic import BaseModel, Field, ValidationError, field_validator
29
+
30
+ from nodescraper.enums import EventCategory, EventPriority
31
+ from nodescraper.interfaces import DataAnalyzer
32
+ from nodescraper.models import TaskResult
33
+ from nodescraper.utils import get_exception_traceback
34
+
35
+ from .analyzer_args import PcieAnalyzerArgs, normalize_to_dict
36
+ from .pcie_data import (
37
+ BdfStr,
38
+ CorrErrMaskReg,
39
+ CorrErrStatReg,
40
+ ECap16Gt,
41
+ ECapAer,
42
+ ECapSecpci,
43
+ ParityMisMatchStat16GT,
44
+ PcieCapStructure,
45
+ PcieCfgSpace,
46
+ PcieDataModel,
47
+ PcieExp,
48
+ PcieRegister,
49
+ UncorrErrMaskReg,
50
+ UncorrErrSevReg,
51
+ UncorrErrStatReg,
52
+ )
53
+
54
+ T_CAP = TypeVar("T_CAP", bound=PcieCapStructure)
55
+
56
+
57
class PcieAnalyzerInputModel(BaseModel):
    """Validated input parameters for the PCIe analyzer.

    Attributes:
        exp_speed (int): Expected PCIe speed, given as the PCIe generation; constrained to
            values between 1 and 5 (inclusive).
        exp_width (int): Expected PCIe width, constrained to values between 1 and 16 (inclusive).
        exp_sriov_count (Optional[int]): Optional expected count of SR-IOV (Single Root I/O
            Virtualization) instances.
        exp_gpu_count_override (Optional[int]): Optional override for the expected GPU count.
        exp_max_payload_size (Dict[int, int]): Expected max payload size per key. Keys must be
            in the range 0-65535. Values may be given in bytes (128, 256, 512, 1024, 2048,
            4096) or in encoded form (0-5), which is stored as ``128 << value`` bytes.
        exp_max_rd_req_size (Dict[int, int]): Expected max read request size per key; same key
            and value rules as ``exp_max_payload_size``.
        exp_ten_bit_tag_req_en (Dict[int, int]): Expected 10-bit tag request enable value
            (0 or 1) per key; keys must be in the range 0-65535.
    """

    exp_speed: int = Field(ge=1, le=5)
    exp_width: int = Field(ge=1, le=16)
    exp_sriov_count: Optional[int] = None
    exp_gpu_count_override: Optional[int] = None
    exp_max_payload_size: Dict[int, int] = Field(default_factory=dict)
    exp_max_rd_req_size: Dict[int, int] = Field(default_factory=dict)
    exp_ten_bit_tag_req_en: Dict[int, int] = Field(default_factory=dict)

    @field_validator("exp_max_rd_req_size", "exp_max_payload_size", mode="before")
    @classmethod
    def validate_exp_max_rd_req_size(cls, v: Optional[Dict[int, int]]) -> Dict[int, int]:
        """Validate and normalize the expected max read request / max payload sizes.

        Args:
            v: Mapping of key to size, or None. A size is either a byte count
                (128, 256, 512, 1024, 2048, 4096) or its encoded form (0-5).

        Returns:
            A copy of the mapping in which every encoded size has been replaced by its byte
            count (``128 << value``); an empty dict when ``v`` is None.

        Raises:
            ValueError: If a size is neither a valid byte count nor a valid encoding, or if a
                key is outside the range 0-65535.
        """
        if v is None:
            return {}
        valid_sizes = {128, 256, 512, 1024, 2048, 4096}
        ret_dict = v.copy()
        for key, value in v.items():
            if 0 <= value <= 5:
                # Encoded form: convert to the actual size in bytes.
                ret_dict[key] = 128 << value
            elif value not in valid_sizes:
                # Only reject values that are neither an encoding nor a byte count; testing
                # the byte-count set unconditionally would reject every encoded value.
                raise ValueError(
                    "Expected max read request size must be one of: "
                    "0, 1, 2, 3, 4, 5, 128, 256, 512, 1024, 2048, or 4096."
                )
            if key < 0 or key > 0xFFFF:
                raise ValueError(" key must be a valid BDF (0-65535).")
        return ret_dict

    @field_validator("exp_ten_bit_tag_req_en", mode="before")
    @classmethod
    def validate_exp_ten_bit_tag_req_en(cls, v: Optional[Dict[int, int]]) -> Dict[int, int]:
        """Validate the expected 10-bit tag request enable values.

        Args:
            v: Mapping of key to expected enable value, or None.

        Returns:
            The mapping unchanged, or an empty dict when ``v`` is None.

        Raises:
            ValueError: If a key is outside the range 0-65535 or a value is not 0 or 1.
        """
        if v is None:
            return {}
        for key, value in v.items():
            if key < 0 or key > 0xFFFF:
                raise ValueError("Key must be a valid BDF (0-65535).")
            if value not in {0, 1}:
                raise ValueError("Expected 10-bit tag request enable must be 0 or 1.")
        return v
107
+
108
+
109
+ class PcieAnalyzer(DataAnalyzer):
110
+ """Check PCIe Data for errors
111
+
112
+ This class checks the following:
113
+ - PCIe link status for each BDF
114
+ - This checks if the link speed and width are as expected
115
+ - AER uncorrectable errors
116
+ - Checks PCIe AER uncorrectable error registers UNCORR_ERR_STAT_REG and reports any errors
117
+ - AER correctable errors
118
+ - Checks the AERs correctable error registers CORR_ERR_STAT_REG and reports any errors
119
+ - PCIe device status errors
120
+ - Checks PCIe device status errors reported in fields `CORR_ERR_DET` `NON_FATAL_ERR_DET` `FATAL_ERR_DET` `UR_DET`
121
+ - PCIe status errors
122
+ - Checks PCIe status errors reported in fields `MSTR_DATA_PAR_ERR` `SIGNALED_TARGET_ABORT` `RCVD_TARGET_ABORT`
123
+ `RCVD_MSTR_ABORT` `SIGNALED_SYS_ERR` `DET_PARITY_ERR`
124
+
125
+ """
126
+
127
+ DATA_MODEL = PcieDataModel
128
+
129
+ GPU_BRIDGE_USP_ID = "0x1501"
130
+ GPU_BRIDGE_DSP_ID = "0x1500"
131
+
132
def validate_reg(self, bdf: str, reg: PcieRegister, log_event: bool) -> bool:
    """Check that a register holds a value and was collected without error

    Parameters
    ----------
    bdf : str
        BDF string of the device, only used in the logged event data
    reg : PcieRegister
        Register to validate
    log_event : bool
        Whether to log a warning event when the register is invalid

    Returns
    -------
    bool
        True when the register has a value and no error, False otherwise
    """
    register_ok = reg.val is not None and reg.err is None
    if register_ok:
        return True

    if log_event:
        self._log_event(
            category=EventCategory.IO,
            description="No value assgined to register or register collection resulted in error",
            priority=EventPriority.WARNING,
            data={"value": reg.val, "error": reg.err, "bdf": bdf},
        )
    return False
159
+
160
def validate_cap(
    self,
    bdf: str,
    name: str,
    capability_structure: Optional[PcieCapStructure],
    log_event: bool = True,
) -> bool:
    """Check that a capability structure exists and reports no null/error registers

    Parameters
    ----------
    bdf : str
        BDF string of the device, only used in the logged event data
    name : str
        Name of the capability structure, only used in the logged event data
    capability_structure : Optional[PcieCapStructure]
        Capability structure to validate, None when the device does not provide it
    log_event : bool, optional
        Whether to log a warning event when validation fails, by default True

    Returns
    -------
    bool
        True when the structure is present and ``null_err_regs()`` returns nothing,
        False otherwise
    """
    if capability_structure is not None:
        unset_regs = capability_structure.null_err_regs()
        if not unset_regs:
            return True
        description = "Capability structure has unset registers"
        event_data = {
            "name": name,
            "bdf": bdf,
            "capability_structure": capability_structure,
            "null_regs": unset_regs,
        }
    else:
        description = "No value assgined to capability a structure "
        event_data = {
            "name": name,
            "bdf": bdf,
        }

    # Both failure modes are reported the same way and differ only in text and payload.
    if log_event:
        self._log_event(
            category=EventCategory.IO,
            description=description,
            data=event_data,
            priority=EventPriority.WARNING,
        )
    return False
209
+
210
def validate_cap_dict(
    self,
    pcie_cfg_space: Dict[BdfStr, PcieCfgSpace],
    cap_struct: Type[PcieCapStructure],
    log_event: bool = True,
) -> set[str]:
    """Find the BDFs whose configuration space holds a valid capability structure

    Parameters
    ----------
    pcie_cfg_space : Dict[BdfStr, PcieCfgSpace]
        Configuration space for each BDF
    cap_struct : Type[PcieCapStructure]
        The capability structure type to look up in each BDF's configuration space
    log_event : bool, optional
        Whether to log one summary warning event listing the BDFs that failed
        validation, by default True

    Returns
    -------
    set[str]
        The BDFs whose capability structure passed ``validate_cap``
    """
    # Per-BDF logging is always suppressed here; failures are reported once, below.
    invalid_bdfs = {
        bdf
        for bdf, cfg_space in pcie_cfg_space.items()
        if not self.validate_cap(bdf, cap_struct.__name__, cfg_space.get_struct(cap_struct), False)
    }

    if log_event and invalid_bdfs:
        self._log_event(
            category=EventCategory.IO,
            description=f"Capability Structure {cap_struct.__name__} not found in a Cfg Space",
            priority=EventPriority.WARNING,
            data={
                "bdf_without_pcie_exp": list(invalid_bdfs),
                "num_bdfs_with_invalid_capability_structure": len(invalid_bdfs),
                "total_bdfs": len(pcie_cfg_space),
            },
        )

    return {bdf for bdf in pcie_cfg_space if bdf not in invalid_bdfs}
249
+
250
def get_valid_cap_dict(
    self,
    pcie_cfg_space: Dict[BdfStr, PcieCfgSpace],
    cap_struct: Type[T_CAP],
    log_event: bool = True,
) -> dict[BdfStr, T_CAP]:
    """Collect the valid capability structure of the given type for each BDF

    Parameters
    ----------
    pcie_cfg_space : Dict[BdfStr, PcieCfgSpace]
        Configuration space for each BDF
    cap_struct : Type[T_CAP]
        The capability structure type to extract from each BDF's configuration space
    log_event : bool, optional
        Forwarded to ``validate_cap_dict``; whether to log an event for BDFs that
        fail validation, by default True

    Returns
    -------
    dict[BdfStr, T_CAP]
        Mapping of BDF to its capability structure, restricted to the BDFs accepted
        by ``validate_cap_dict``
    """
    valid_bdfs = self.validate_cap_dict(pcie_cfg_space, cap_struct, log_event=log_event)

    # Lazily pair each accepted BDF with its structure; rejected BDFs are never queried.
    candidates = (
        (bdf, cfg_space.get_struct(cap_struct))
        for bdf, cfg_space in pcie_cfg_space.items()
        if bdf in valid_bdfs
    )
    return {bdf: struct for bdf, struct in candidates if struct is not None}
283
+
284
def check_link_status(
    self,
    bdf_pcie_express_dict: Dict[str, PcieExp],
    exp_speed: int = 5,
    exp_width: int = 16,
):
    """Checks the PCIe link status of each bdf and compares it with the expected speed/width

    Args:
        bdf_pcie_express_dict (Dict[str, PcieExp]):
            dict of key bdf and value PcieExp capability structure which contains the
            link status and link capabilities 2 registers
        exp_speed (int): expected link speed (PCIe generation)
        exp_width (int): expected link width

    Returns:
        None
    """
    # Key: single bit of the supported link speed vector, value: Gen <N>
    sv_gen_speed = {
        0b000000: 0,
        0b000001: 1,
        0b000010: 2,
        0b000100: 3,
        0b001000: 4,
        0b010000: 5,
    }
    for bdf, pcie_exp in bdf_pcie_express_dict.items():
        lnk_stat_reg = pcie_exp.lnk_stat_reg
        lnk_cap_2_reg = pcie_exp.lnk_cap_2_reg
        try:
            if lnk_stat_reg.curr_lnk_speed.val == 0:
                self._log_event(
                    category=EventCategory.IO,
                    description="Link speed vector is 0",
                    data={
                        "bdf": bdf,
                        "curr_lnk_speed": lnk_stat_reg.curr_lnk_speed.val,
                        "supported_lnk_speed_vec": lnk_cap_2_reg.supported_lnk_speed_vec.val,
                    },
                    priority=EventPriority.ERROR,
                )
                continue

            curr_speed = lnk_stat_reg.curr_lnk_speed.get_val()
            supported_vec = lnk_cap_2_reg.supported_lnk_speed_vec.get_val()
            if curr_speed is None or supported_vec is None:
                continue
            # The current link speed is an index (1-based) into the supported speed vector.
            sv_mask = 0b1 << (curr_speed - 1)
            link_speed = sv_gen_speed[sv_mask & supported_vec]

            if link_speed != exp_speed:
                self._log_event(
                    category=EventCategory.IO,
                    description="Unexpected link speed detected",
                    priority=EventPriority.ERROR,
                    data={
                        "bdf": bdf,
                        "current_speed": link_speed,
                        "expected_speed": exp_speed,
                    },
                )

            link_width = lnk_stat_reg.neg_lnk_width.get_val()
            if link_width != exp_width:
                self._log_event(
                    category=EventCategory.IO,
                    description="Unexpected link width detected",
                    priority=EventPriority.ERROR,
                    data={
                        "bdf": bdf,
                        "current_width": link_width,
                        "expected_width": exp_width,
                    },
                )
        except Exception as e:
            # Best-effort per device: report the failure and move on to the next bdf.
            # The bdf is included so the failure can be attributed to a device.
            self._log_event(
                category=EventCategory.IO,
                description="Exception occurred while checking link status",
                priority=EventPriority.ERROR,
                data={"bdf": bdf, "exception": get_exception_traceback(e)},
            )
363
+
364
def check_uncorr_aer_errors(
    self,
    bdf_ecap_aer: Dict[BdfStr, ECapAer],
):
    """
    Checks the following AER uncorrectable error registers
    - Uncorrectable Error Status Register
    - Uncorrectable Error Mask Register
    - Uncorrectable Error Severity Register

    For every status field that is set, one event is logged. The matching severity
    field selects Fatal (logged as ERROR) or Non-Fatal (logged as WARNING) and the
    matching mask field selects the "Masked"/"Unmasked" wording.

    Args:
        bdf_ecap_aer (Dict[BdfStr, ECapAer]):
            dict of key bdf and value ECapAer capability structure which contains
            the AER register data
    Returns:
        None
    """
    for bdf, ecap_aer in bdf_ecap_aer.items():
        stat_reg: UncorrErrStatReg = ecap_aer.uncorr_err_stat
        mask_reg: UncorrErrMaskReg = ecap_aer.uncorr_err_mask
        sev_reg: UncorrErrSevReg = ecap_aer.uncorr_err_sev
        # Sort fields by bit mask so the same error lines up across the three registers
        sorted_stat_fields = sorted(stat_reg.bit_fields.values(), key=lambda x: x.bit_mask)
        sorted_mask_fields = sorted(mask_reg.bit_fields.values(), key=lambda x: x.bit_mask)
        sorted_sev_fields = sorted(sev_reg.bit_fields.values(), key=lambda x: x.bit_mask)
        for stat_field, mask_field, sev_field in zip(
            sorted_stat_fields,
            sorted_mask_fields,
            sorted_sev_fields,
        ):
            pcie_field_stat_value = stat_field.get_val()
            # An unread (None) status field is not evidence of an error; skip it the
            # same way the correctable error check does.
            if pcie_field_stat_value is None or pcie_field_stat_value == 0:
                continue

            pcie_field_mask_value = mask_field.get_val()
            pcie_field_sev_value = sev_field.get_val()
            err_descriptor: Dict[str, str] = {
                "bdf": bdf,
                "reg_name": stat_reg.__class__.__name__,
                "field_desc": stat_field.desc,
                "stat": hex(pcie_field_stat_value),
                "mask": (
                    hex(pcie_field_mask_value) if pcie_field_mask_value is not None else "None"
                ),
                "sev": (
                    hex(pcie_field_sev_value) if pcie_field_sev_value is not None else "None"
                ),
            }
            # A set severity bit marks the error as Fatal, a clear bit as Non-Fatal.
            # An unknown (None) severity is treated as Fatal to stay conservative.
            is_fatal = pcie_field_sev_value != 0
            masked = "Masked" if pcie_field_mask_value == 1 else "Unmasked"
            severity = "Fatal" if is_fatal else "Non-Fatal"
            self._log_event(
                category=EventCategory.IO,
                description=f"{masked} {severity} errors were detected",
                priority=EventPriority.ERROR if is_fatal else EventPriority.WARNING,
                data=err_descriptor,
            )
446
+
447
def check_corr_aer_errors(
    self,
    bdf_ecap_aer: Dict[BdfStr, ECapAer],
):
    """
    Checks the following AER correctable error registers
    - Correctable Error Status Register
    - Correctable Error Mask Register

    For every status field that is set, one event is logged: a WARNING when the
    matching mask field is 1 (masked), an ERROR otherwise (unmasked).

    Args:
        bdf_ecap_aer (Dict[BdfStr, ECapAer]):
            dict of key bdf and value ECapAer capability structure which contains
            the AER register data
    Returns:
        None
    """
    for bdf, ecap_aer in bdf_ecap_aer.items():
        stat_reg: CorrErrStatReg = ecap_aer.corr_err_stat
        mask_reg: CorrErrMaskReg = ecap_aer.corr_err_mask
        # Sort fields by bit mask so the same error lines up across both registers
        sorted_stat_fields = sorted(stat_reg.bit_fields.values(), key=lambda x: x.bit_mask)
        sorted_mask_fields = sorted(mask_reg.bit_fields.values(), key=lambda x: x.bit_mask)

        for stat_field, mask_field in zip(sorted_stat_fields, sorted_mask_fields):
            stat_val = stat_field.get_val()
            if stat_val is None or stat_val == 0:
                continue

            err_dict = {
                "bdf": bdf,
                "reg_description": stat_reg.desc,
                "field_description": stat_field.desc,
                "bit_field_val": hex(stat_val),
            }
            if mask_field.get_val() == 1:
                self._log_event(
                    category=EventCategory.IO,
                    description="Masked Correctable errors were detected",
                    priority=EventPriority.WARNING,
                    data=err_dict,
                )
            else:
                # The unmasked case must be distinguishable from the masked one.
                self._log_event(
                    category=EventCategory.IO,
                    description="Unmasked Correctable errors were detected",
                    priority=EventPriority.ERROR,
                    data=err_dict,
                )
496
+
497
def check_pcie_device_status_errors(self, bdf_pcie_express_dict: Dict[str, PcieExp]):
    """
    Checks PCIe baseline errors reported in the Device Status Register
    Reference: 9.4.1 Baseline Error Reporting

    A single warning event is logged per bdf when any of the `corr_err_det`,
    `non_fatal_err_det`, `fatal_err_det` or `ur_det` fields is not 0.

    Args:
        bdf_pcie_express_dict (Dict[str, PcieExp]):
            dict of key bdf and value PcieExp capability structure which contains
            the device status register
    Returns:
        None
    """
    for bdf, pcie_exp_cap in bdf_pcie_express_dict.items():
        dev_stat_reg = pcie_exp_cap.dev_stat_reg
        monitored_fields = (
            dev_stat_reg.corr_err_det,
            dev_stat_reg.non_fatal_err_det,
            dev_stat_reg.fatal_err_det,
            dev_stat_reg.ur_det,
        )
        flagged = [field for field in monitored_fields if field.get_val() != 0]
        if not flagged:
            continue

        self._log_event(
            category=EventCategory.IO,
            description="Device Status errors were detected",
            priority=EventPriority.WARNING,
            data={
                "bdf": bdf,
                "reg_description": dev_stat_reg.desc,
                "field_desc_list": [field.desc for field in flagged],
                "err_bitmask_list": [field.bit_mask for field in flagged],
                "register_value": dev_stat_reg.val,
            },
        )
532
+
533
def check_pcie_status_errors(self, bdf_cfg_space_dict: Dict[BdfStr, PcieCfgSpace]):
    """
    Checks PCIe baseline errors reported in the Status Register
    Reference: 9.4.1 Baseline Error Reporting

    A single warning event is logged per bdf when any of the `mstr_data_par_err`,
    `signaled_target_abort`, `rcvd_target_abort`, `rcvd_mstr_abort`,
    `signaled_sys_err` or `det_parity_err` fields is not 0.

    Args:
        bdf_cfg_space_dict (dict[BdfStr, PcieCfgSpace]):
            dict of key bdf and value PcieCfgSpace object which contains register data
    Returns:
        None
    """
    monitored_field_names = (
        "mstr_data_par_err",
        "signaled_target_abort",
        "rcvd_target_abort",
        "rcvd_mstr_abort",
        "signaled_sys_err",
        "det_parity_err",
    )
    for bdf, cfg_space in bdf_cfg_space_dict.items():
        stat_reg = cfg_space.type_0_configuration.status
        monitored_fields = [getattr(stat_reg, field_name) for field_name in monitored_field_names]
        flagged = [field for field in monitored_fields if field.get_val() != 0]
        if not flagged:
            continue

        self._log_event(
            category=EventCategory.IO,
            description="PCI Express Status register errors were detected",
            priority=EventPriority.WARNING,
            data={
                "bdf": bdf,
                "reg_description": stat_reg.desc,
                "field_desc_list": [field.desc for field in flagged],
                "err_bitmask_list": [field.bit_mask for field in flagged],
                "register_value": stat_reg.val,
            },
        )
570
+
571
def check_pcie_dev_ctrl_reg(
    self,
    bdf_pcie_express_dict: Dict[str, PcieExp],
    exp_max_payload_size: Optional[int],
    exp_max_rd_req_size: Optional[int],
):
    """Checks 7.5.3.4 Device Control Register (Offset 08h) fields for expected value:
    - Max Payload Size
    - Max Read Request Size

    An ERROR event is logged for every BDF whose decoded field differs from
    the expected value. The two fields are checked independently of each other.

    Args:
        bdf_pcie_express_dict (dict[str, PcieExp]):
            dict of key bdf and value PcieExp capability structure which contains register data
        exp_max_payload_size (Optional[int]): expected max payload size, when None it is not checked
        exp_max_rd_req_size (Optional[int]): expected max read request size, when None it is not checked
    Returns:
        None
    """
    # Field encoding -> size in bytes. The 3-bit fields can also hold 110b and
    # 111b, which are not in this table; those decode to None below instead of
    # raising KeyError, and are therefore reported as a mismatch.
    encoding = {
        0b000: 128,
        0b001: 256,
        0b010: 512,
        0b011: 1024,
        0b100: 2048,
        0b101: 4096,
    }
    for bdf, pcie_exp in bdf_pcie_express_dict.items():
        dev_ctrl_reg = pcie_exp.dev_ctrl_reg

        mps_val = dev_ctrl_reg.mps.get_val()
        # Skip when nothing is expected or when the field has no value.
        if exp_max_payload_size is not None and mps_val is not None:
            max_payload_size = encoding.get(mps_val)
            if max_payload_size != exp_max_payload_size:
                self._log_event(
                    category=EventCategory.IO,
                    description="Unexpected Max Payload Size detected",
                    priority=EventPriority.ERROR,
                    data={
                        "bdf": bdf,
                        "current_max_payload_size": max_payload_size,
                        "expected_max_payload_size": exp_max_payload_size,
                    },
                )

        max_rd_req_val = dev_ctrl_reg.max_rd_req_size.get_val()
        # The guard must test the *expected* value: with no expectation given,
        # any decoded size would otherwise compare unequal to None and be flagged.
        if exp_max_rd_req_size is not None and max_rd_req_val is not None:
            max_rd_req_size = encoding.get(max_rd_req_val)
            if max_rd_req_size != exp_max_rd_req_size:
                self._log_event(
                    category=EventCategory.IO,
                    description="Unexpected Max Read Request Size detected",
                    priority=EventPriority.ERROR,
                    data={
                        "bdf": bdf,
                        "current_max_rd_req_size": max_rd_req_size,
                        "expected_max_rd_req_size": exp_max_rd_req_size,
                    },
                )
630
+
631
def check_pcie_dev_ctrl_2_reg(
    self,
    bdf_pcie_express_dict: Dict[str, PcieExp],
    exp_ten_bit_tag_req_en: Optional[int],
):
    """Compare the 10-bit Tag Request Enable field of the
    7.5.3.16 Device Control 2 Register (Offset 28h) against an expected value.

    Args:
        bdf_pcie_express_dict (dict[str, PcieExp]):
            dict of key bdf and value PcieExp capability structure which contains register data
        exp_ten_bit_tag_req_en (Optional[int]): expected 10-bit tag request enable, when None it is not checked
    Returns:
        None
    """
    for bdf, pcie_exp in bdf_pcie_express_dict.items():
        current_value = pcie_exp.dev_ctrl_2_reg.ten_bit_tag_req_en.get_val()
        # Nothing to report when no expectation was given or the field matches it.
        if exp_ten_bit_tag_req_en is None or current_value == exp_ten_bit_tag_req_en:
            continue
        self._log_event(
            category=EventCategory.IO,
            description="Unexpected 10-bit Tag Request Enable detected",
            priority=EventPriority.ERROR,
            data={
                "bdf": bdf,
                "current_ten_bit_tag_req_en": current_value,
                "expected_ten_bit_tag_req_en": exp_ten_bit_tag_req_en,
            },
        )
660
+
661
def instantaneous_par_err_chk(self, bdf_cfg_space_dict: Dict[str, ECap16Gt]):
    """Look for parity mismatches reported by the ECap16Gt capability.

    For each BDF the parity mismatch status register and the retimer first
    parity mismatch status register are inspected; every set bit among the
    low 32 bits is counted as one bad lane, and an ERROR event is logged for
    each register that has at least one.

    Parameters
    ----------
    bdf_cfg_space_dict : Dict[str, ECap16Gt]
        Dictionary of BDFs and their corresponding ECap16Gt capability structure
    """
    for bdf, cap_16gt in bdf_cfg_space_dict.items():
        registers_to_check = (
            cap_16gt.parity_mismatch_stat,
            cap_16gt.retimer_fst_parity_mismatch_stat,
        )
        for parity_register in registers_to_check:
            raw_value = parity_register.val
            if raw_value is None:
                continue
            # Population count restricted to bits 0..31.
            number_of_bad_lanes = bin(raw_value & 0xFFFFFFFF).count("1")
            if number_of_bad_lanes == 0:
                continue
            self._log_event(
                category=EventCategory.IO,
                description="Lanes have parity errors",
                priority=EventPriority.ERROR,
                data={
                    "bdf": bdf,
                    "reg_name": parity_register.__class__.__name__,
                    "reg_desc": parity_register.desc,
                    "register_value": raw_value,
                    "number_of_bad_lanes": number_of_bad_lanes,
                },
            )
696
+
697
def lane_error_status_chk(self, ecap_sec_pci_dict: Dict[str, ECapSecpci]):
    """Lane error status check for ECapSecpci registers, will log an event if any lanes have errors.

    A register whose value is None is skipped rather than reported: without
    a value there is nothing that indicates a lane error.

    Parameters
    ----------
    ecap_sec_pci_dict : Dict[str, ECapSecpci]
        Dictionary of BDFs and their corresponding ECapSecpci capability structure
    """
    for bdf, ecap_sec_pci in ecap_sec_pci_dict.items():
        lane_error_stat = ecap_sec_pci.lane_err_stat
        lane_error_stat_val = lane_error_stat.val
        # `None != 0` is True, so an unset register must be filtered out
        # explicitly to avoid a false "Lane error detected" event.
        if lane_error_stat_val is None or lane_error_stat_val == 0:
            continue
        self._log_event(
            category=EventCategory.IO,
            description="Lane error detected",
            priority=EventPriority.ERROR,
            data={
                "bdf": bdf,
                "reg_name": lane_error_stat.__class__.__name__,
                "register_value": lane_error_stat_val,
            },
        )
719
+
720
def device_consistancy_chk(self, bdf_cfg_space_dict: Dict[BdfStr, PcieCfgSpace]):
    """Verify that devices sharing a device ID agree on selected Device Control fields.

    BDFs are grouped by the device ID found in their Type 0 configuration
    (a WARNING is logged for any BDF without one). Within each group the
    max payload size, max read request size and extended tag field enable
    values are collected from the PcieExp capability; a WARNING is logged when
    any of them differs inside the group or when a BDF of the group has no
    PcieExp capability entry.
    """
    # device ID -> BDFs carrying that ID, built from the devices actually present
    bdfs_by_device_id: Dict[int, List[BdfStr]] = {}

    for bdf, cfg_space in bdf_cfg_space_dict.items():
        device_id = cfg_space.type_0_configuration.device_id.val
        if device_id is not None:
            bdfs_by_device_id.setdefault(device_id, []).append(bdf)
            continue
        self._log_event(
            category=EventCategory.IO,
            description="No value assigned to device id, unable to check consistency due to missing data",
            data={
                "bdf": bdf,
            },
            priority=EventPriority.WARNING,
        )

    pcie_exp_by_bdf = self.get_valid_cap_dict(bdf_cfg_space_dict, PcieExp, log_event=False)

    for collected_device_id, list_of_bdfs in bdfs_by_device_id.items():
        bdfs_with_cap = [bdf for bdf in list_of_bdfs if bdf in pcie_exp_by_bdf]
        # A BDF without the capability cannot be compared; it forces an event.
        capability_missing = len(bdfs_with_cap) != len(list_of_bdfs)

        ctrl_regs = [pcie_exp_by_bdf[bdf].dev_ctrl_reg for bdf in bdfs_with_cap]
        payload_sizes = [reg.mps.val for reg in ctrl_regs]
        read_req_sizes = [reg.max_rd_req_size.val for reg in ctrl_regs]
        ext_tag_enables = [reg.ext_tag_field_en.val for reg in ctrl_regs]

        all_uniform = all(
            len(set(values)) <= 1
            for values in (payload_sizes, read_req_sizes, ext_tag_enables)
        )
        if all_uniform and not capability_missing:
            continue

        collected_device_id_str = hex(collected_device_id)
        self._log_event(
            category=EventCategory.IO,
            description=f"PCIe device {collected_device_id_str} has inconsistent values",
            priority=EventPriority.WARNING,
            data={
                "dev_id": collected_device_id_str,
                "bdf_list": list_of_bdfs,
                "max_payload_size_list": payload_sizes,
                "max_rd_req_size_list": read_req_sizes,
                "ext_tag_field_en_list": ext_tag_enables,
            },
        )
777
+
778
def check_ecap_16gt_regs(
    self,
    bdf_cfg_space_dict: dict[BdfStr, PcieCfgSpace],
):
    """Run the instantaneous parity error check on every BDF that exposes
    an ECap16Gt capability structure (as returned by get_valid_cap_dict)."""
    ecap_16gt_by_bdf = self.get_valid_cap_dict(bdf_cfg_space_dict, ECap16Gt, log_event=True)
    self.instantaneous_par_err_chk(bdf_cfg_space_dict=ecap_16gt_by_bdf)
788
+
789
def check_ecap_sec_pci_regs(
    self,
    bdf_cfg_space_dict: dict[BdfStr, PcieCfgSpace],
):
    """Run the lane error status check on every BDF that exposes an
    ECapSecpci capability structure (as returned by get_valid_cap_dict)."""
    ecap_sec_pci_by_bdf = self.get_valid_cap_dict(bdf_cfg_space_dict, ECapSecpci, log_event=True)
    self.lane_error_status_chk(ecap_sec_pci_dict=ecap_sec_pci_by_bdf)
799
+
800
def check_ecap_aer_errors(
    self,
    bdf_cfg_space_dict: dict[BdfStr, PcieCfgSpace],
):
    """Run the uncorrectable and then the correctable AER error checks on
    every BDF that exposes an ECapAer capability structure."""
    ecap_aer_by_bdf = self.get_valid_cap_dict(bdf_cfg_space_dict, ECapAer, log_event=True)
    # Same capability dict feeds both checks; uncorrectable errors go first.
    for aer_check in (self.check_uncorr_aer_errors, self.check_corr_aer_errors):
        aer_check(bdf_ecap_aer=ecap_aer_by_bdf)
811
+
812
def check_pcie_exp_capability_structure_errors(
    self, bdf_cfg_space_dict: Dict[BdfStr, PcieCfgSpace]
):
    """Run the Device Status error check on every BDF that exposes a PcieExp
    capability structure. The capability lookup is done with log_event=False."""
    pcie_exp_by_bdf = self.get_valid_cap_dict(bdf_cfg_space_dict, PcieExp, log_event=False)
    self.check_pcie_device_status_errors(bdf_pcie_express_dict=pcie_exp_by_bdf)
821
+
822
def check_pcie_exp_capability_structure_config(
    self,
    bdf_cfg_space_dict: dict[BdfStr, PcieCfgSpace],
    exp_max_payload_size: Optional[int] = None,
    exp_max_rd_req_size: Optional[int] = None,
    exp_ten_bit_tag_req_en: Optional[int] = None,
):
    """Validate configured PcieExp capability fields against expected values.

    The Device Control register check runs when a max payload size or a max
    read request size is expected; the Device Control 2 register check runs
    when a 10-bit tag request enable value is expected. An expectation left
    as None disables the corresponding comparison.
    """
    pcie_exp_by_bdf = self.get_valid_cap_dict(bdf_cfg_space_dict, PcieExp, log_event=True)

    dev_ctrl_expected = not (exp_max_payload_size is None and exp_max_rd_req_size is None)
    if dev_ctrl_expected:
        self.check_pcie_dev_ctrl_reg(
            bdf_pcie_express_dict=pcie_exp_by_bdf,
            exp_max_payload_size=exp_max_payload_size,
            exp_max_rd_req_size=exp_max_rd_req_size,
        )

    if exp_ten_bit_tag_req_en is None:
        return
    self.check_pcie_dev_ctrl_2_reg(
        bdf_pcie_express_dict=pcie_exp_by_bdf,
        exp_ten_bit_tag_req_en=exp_ten_bit_tag_req_en,
    )
848
+
849
+ @staticmethod
850
+ def filter_pcie_data_by_device_id(
851
+ bdf_cfg_space_dict: Dict[BdfStr, PcieCfgSpace],
852
+ device_ids: Set[int],
853
+ ) -> Dict[BdfStr, PcieCfgSpace]:
854
+ """Filters the PCIe data by device ID
855
+
856
+ Parameters
857
+ ----------
858
+ device_ids : set[int]
859
+ Set of device IDs to filter by
860
+
861
+ Returns
862
+ -------
863
+ Dict[BdfStr, PcieCfgSpace]
864
+ Dictionary of BDFs and their corresponding PCIe configuration space
865
+ """
866
+ new_cfg_space_dict: Dict[BdfStr, PcieCfgSpace] = {}
867
+ for bdf, pcie_data in bdf_cfg_space_dict.items():
868
+ dev_id = pcie_data.type_0_configuration.device_id.val
869
+ if dev_id in device_ids:
870
+ new_cfg_space_dict[bdf] = pcie_data
871
+ return new_cfg_space_dict
872
+
873
def check_gpu_count(
    self,
    pcie_data: PcieDataModel,
    expected_gpu_count: Optional[int] = None,
):
    """Compare the number of GPUs seen in the PCIe data with an expected count

    A device is counted when the vendor ID in its Type 0 configuration equals
    self.system_info.vendorid_ep. A mismatch is logged as an ERROR event, a
    match as an INFO event.

    Parameters
    ----------
    pcie_data : PcieDataModel
        PCIe data model containing collected PCIe configuration space data
    expected_gpu_count : Optional[int], optional
        Expected GPU count, by default None (no check performed)
    """
    if expected_gpu_count is None:
        return

    gpu_count_from_pcie = sum(
        1
        for cfg_space in pcie_data.pcie_cfg_space.values()
        if cfg_space.type_0_configuration.vendor_id.val == self.system_info.vendorid_ep
    )

    if gpu_count_from_pcie == expected_gpu_count:
        self._log_event(
            category=EventCategory.IO,
            description="GPU count matches expected",
            priority=EventPriority.INFO,
            data={
                "gpu_count": gpu_count_from_pcie,
            },
        )
        return

    self._log_event(
        category=EventCategory.IO,
        description="GPU count mismatch",
        priority=EventPriority.ERROR,
        data={
            "gpu_count_from_pcie": gpu_count_from_pcie,
            "expected_gpu_count": expected_gpu_count,
        },
    )
915
+
916
def analyze_data(
    self, data: PcieDataModel, args: Optional[PcieAnalyzerArgs] = None
) -> TaskResult:
    """Check PCIe data for errors by analyzing the PCIe register space and
    checking the enumeration of the GPUs and optional SR-IOV VFs

    The analysis stops early, returning the current result, when the analyzer
    arguments fail validation or when no PCIe configuration space data was
    collected at all.

    Parameters
    ----------
    data : PcieDataModel
        PCIe data model containing collected PCIe configuration space data
    args : Optional[PcieAnalyzerArgs], optional
        Analyzer arguments containing expected values for validation, by default None

    Returns
    -------
    TaskResult
        Result of the analysis
    """
    if args is None:
        args = PcieAnalyzerArgs()

    exp_speed = args.exp_speed
    exp_width = args.exp_width
    exp_sriov_count = args.exp_sriov_count
    exp_gpu_count_override = args.exp_gpu_count_override
    exp_max_payload_size = normalize_to_dict(
        args.exp_max_payload_size, self.system_info.vendorid_ep
    )
    exp_max_rd_req_size = normalize_to_dict(
        args.exp_max_rd_req_size, self.system_info.vendorid_ep
    )
    exp_ten_bit_tag_req_en = normalize_to_dict(
        args.exp_ten_bit_tag_req_en, self.system_info.vendorid_ep
    )
    try:
        pcie_input_data = PcieAnalyzerInputModel(
            exp_speed=exp_speed,
            exp_width=exp_width,
            exp_sriov_count=exp_sriov_count,
            exp_gpu_count_override=exp_gpu_count_override,
            exp_ten_bit_tag_req_en=exp_ten_bit_tag_req_en,
            exp_max_payload_size=exp_max_payload_size,
            exp_max_rd_req_size=exp_max_rd_req_size,
        )
    except ValidationError as val_error:
        self._log_event(
            category=EventCategory.RUNTIME,
            description="User input for PcieAnalyzerModel is invalid",
            priority=EventPriority.ERROR,
            data={
                "validation_error": get_exception_traceback(val_error),
                "valid_input": {
                    "exp_speed": "int, 1-5",
                    "exp_width": "int, 1-16",
                    "exp_sriov_count": "Optional[int]",
                    "exp_gpu_count_override": "Optional[int]",
                },
                "actual_input": {
                    "exp_speed": exp_speed,
                    "exp_width": exp_width,
                    "exp_sriov_count": exp_sriov_count,
                    "exp_gpu_count_override": exp_gpu_count_override,
                },
            },
        )
        return self.result

    pcie_data: PcieDataModel = data

    # If both of the PCIe configuration spaces are empty there is nothing to
    # analyze. vf_pcie_cfg_space may be None (it is None-checked further down),
    # so emptiness is tested by truthiness rather than by equality with {}.
    if not pcie_data.pcie_cfg_space and not pcie_data.vf_pcie_cfg_space:
        self._log_event(
            category=EventCategory.IO,
            description="No PCIe config space found",
            priority=EventPriority.WARNING,
        )
        return self.result

    # Check every link in the PCIe configuration space for the expected capability structure,
    # but don't check VF since those will be 0
    bdf_pcie_express_dict = self.get_valid_cap_dict(
        pcie_data.pcie_cfg_space,
        PcieExp,
        log_event=True,
    )
    self.check_link_status(
        bdf_pcie_express_dict=bdf_pcie_express_dict,
        exp_speed=exp_speed,
        exp_width=exp_width,
    )

    # Device IDs of every physical function whose vendor ID matches vendorid_ep
    amd_device_ids = set()
    for cfg_space in pcie_data.pcie_cfg_space.values():
        vendor_id = cfg_space.type_0_configuration.vendor_id.val
        device_id = cfg_space.type_0_configuration.device_id.val
        if vendor_id == self.system_info.vendorid_ep and device_id is not None:
            amd_device_ids.add(device_id)

    # Filter PCIe data for AMD GPUs
    oam_pcie_data = self.filter_pcie_data_by_device_id(
        bdf_cfg_space_dict=pcie_data.pcie_cfg_space,
        device_ids=amd_device_ids,
    )

    # Same selection for the virtual functions, when VF data was collected
    amd_vf_device_ids = set()
    if pcie_data.vf_pcie_cfg_space is not None:
        for cfg_space in pcie_data.vf_pcie_cfg_space.values():
            vendor_id = cfg_space.type_0_configuration.vendor_id.val
            device_id = cfg_space.type_0_configuration.device_id.val
            if vendor_id == self.system_info.vendorid_ep and device_id is not None:
                amd_vf_device_ids.add(device_id)

        oam_vf_pcie_data = self.filter_pcie_data_by_device_id(
            bdf_cfg_space_dict=pcie_data.vf_pcie_cfg_space,
            device_ids=amd_vf_device_ids,
        )
    else:
        oam_vf_pcie_data = {}

    # Include bridge/retimer devices (0x1500, 0x1501)
    us_ds_retimer = self.filter_pcie_data_by_device_id(
        bdf_cfg_space_dict=pcie_data.pcie_cfg_space,
        device_ids={0x1500, 0x1501},
    )
    ubb_data = {**oam_pcie_data, **us_ds_retimer}
    ubb_data_with_vf = {**ubb_data, **oam_vf_pcie_data}
    # Type 0 Configuration Space Checks
    self.check_pcie_status_errors(bdf_cfg_space_dict=ubb_data_with_vf)
    # Check other capability structures
    dev_ids = set(
        list(pcie_input_data.exp_max_payload_size.keys())
        + list(pcie_input_data.exp_max_rd_req_size.keys())
        + list(pcie_input_data.exp_ten_bit_tag_req_en.keys())
    )
    for device_id_to_check in dev_ids:
        cfg_space_filtered = self.filter_pcie_data_by_device_id(
            bdf_cfg_space_dict=pcie_data.pcie_cfg_space,
            device_ids={device_id_to_check},
        )
        self.check_pcie_exp_capability_structure_config(
            cfg_space_filtered,
            pcie_input_data.exp_max_payload_size.get(device_id_to_check),
            pcie_input_data.exp_max_rd_req_size.get(device_id_to_check),
            pcie_input_data.exp_ten_bit_tag_req_en.get(device_id_to_check),
        )

    # run with vfs for AERs and PCIe EXP errors
    self.check_pcie_exp_capability_structure_errors(bdf_cfg_space_dict=ubb_data_with_vf)
    self.check_ecap_aer_errors(bdf_cfg_space_dict=ubb_data_with_vf)
    self.check_ecap_16gt_regs(bdf_cfg_space_dict=ubb_data)
    self.check_ecap_sec_pci_regs(bdf_cfg_space_dict=ubb_data)

    if amd_device_ids:
        self.device_consistancy_chk(
            bdf_cfg_space_dict=ubb_data,
        )
    else:
        self._log_event(
            category=EventCategory.RUNTIME,
            description="No AMD GPU devices found, skipping device consistency check",
            priority=EventPriority.INFO,
        )

    self.check_gpu_count(pcie_data, exp_gpu_count_override)

    return self.result