amd-node-scraper 0.0.1__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (197) hide show
  1. amd_node_scraper-0.0.1.dist-info/LICENSE +21 -0
  2. amd_node_scraper-0.0.1.dist-info/METADATA +424 -0
  3. amd_node_scraper-0.0.1.dist-info/RECORD +197 -0
  4. amd_node_scraper-0.0.1.dist-info/WHEEL +5 -0
  5. amd_node_scraper-0.0.1.dist-info/entry_points.txt +2 -0
  6. amd_node_scraper-0.0.1.dist-info/top_level.txt +1 -0
  7. nodescraper/__init__.py +32 -0
  8. nodescraper/base/__init__.py +34 -0
  9. nodescraper/base/inbandcollectortask.py +118 -0
  10. nodescraper/base/inbanddataplugin.py +39 -0
  11. nodescraper/base/regexanalyzer.py +120 -0
  12. nodescraper/cli/__init__.py +29 -0
  13. nodescraper/cli/cli.py +511 -0
  14. nodescraper/cli/constants.py +27 -0
  15. nodescraper/cli/dynamicparserbuilder.py +171 -0
  16. nodescraper/cli/helper.py +517 -0
  17. nodescraper/cli/inputargtypes.py +129 -0
  18. nodescraper/configbuilder.py +123 -0
  19. nodescraper/configregistry.py +66 -0
  20. nodescraper/configs/node_status.json +19 -0
  21. nodescraper/connection/__init__.py +25 -0
  22. nodescraper/connection/inband/__init__.py +46 -0
  23. nodescraper/connection/inband/inband.py +171 -0
  24. nodescraper/connection/inband/inbandlocal.py +93 -0
  25. nodescraper/connection/inband/inbandmanager.py +151 -0
  26. nodescraper/connection/inband/inbandremote.py +173 -0
  27. nodescraper/connection/inband/sshparams.py +43 -0
  28. nodescraper/constants.py +26 -0
  29. nodescraper/enums/__init__.py +40 -0
  30. nodescraper/enums/eventcategory.py +89 -0
  31. nodescraper/enums/eventpriority.py +42 -0
  32. nodescraper/enums/executionstatus.py +44 -0
  33. nodescraper/enums/osfamily.py +34 -0
  34. nodescraper/enums/systeminteraction.py +41 -0
  35. nodescraper/enums/systemlocation.py +33 -0
  36. nodescraper/generictypes.py +36 -0
  37. nodescraper/interfaces/__init__.py +44 -0
  38. nodescraper/interfaces/connectionmanager.py +143 -0
  39. nodescraper/interfaces/dataanalyzertask.py +138 -0
  40. nodescraper/interfaces/datacollectortask.py +185 -0
  41. nodescraper/interfaces/dataplugin.py +356 -0
  42. nodescraper/interfaces/plugin.py +127 -0
  43. nodescraper/interfaces/resultcollator.py +56 -0
  44. nodescraper/interfaces/task.py +164 -0
  45. nodescraper/interfaces/taskresulthook.py +39 -0
  46. nodescraper/models/__init__.py +48 -0
  47. nodescraper/models/analyzerargs.py +93 -0
  48. nodescraper/models/collectorargs.py +30 -0
  49. nodescraper/models/connectionconfig.py +34 -0
  50. nodescraper/models/datamodel.py +171 -0
  51. nodescraper/models/datapluginresult.py +39 -0
  52. nodescraper/models/event.py +158 -0
  53. nodescraper/models/pluginconfig.py +38 -0
  54. nodescraper/models/pluginresult.py +39 -0
  55. nodescraper/models/systeminfo.py +44 -0
  56. nodescraper/models/taskresult.py +185 -0
  57. nodescraper/models/timerangeargs.py +38 -0
  58. nodescraper/pluginexecutor.py +274 -0
  59. nodescraper/pluginregistry.py +152 -0
  60. nodescraper/plugins/__init__.py +25 -0
  61. nodescraper/plugins/inband/__init__.py +25 -0
  62. nodescraper/plugins/inband/amdsmi/__init__.py +28 -0
  63. nodescraper/plugins/inband/amdsmi/amdsmi_analyzer.py +821 -0
  64. nodescraper/plugins/inband/amdsmi/amdsmi_collector.py +1313 -0
  65. nodescraper/plugins/inband/amdsmi/amdsmi_plugin.py +43 -0
  66. nodescraper/plugins/inband/amdsmi/amdsmidata.py +1002 -0
  67. nodescraper/plugins/inband/amdsmi/analyzer_args.py +50 -0
  68. nodescraper/plugins/inband/amdsmi/cper.py +65 -0
  69. nodescraper/plugins/inband/bios/__init__.py +29 -0
  70. nodescraper/plugins/inband/bios/analyzer_args.py +64 -0
  71. nodescraper/plugins/inband/bios/bios_analyzer.py +93 -0
  72. nodescraper/plugins/inband/bios/bios_collector.py +93 -0
  73. nodescraper/plugins/inband/bios/bios_plugin.py +43 -0
  74. nodescraper/plugins/inband/bios/biosdata.py +30 -0
  75. nodescraper/plugins/inband/cmdline/__init__.py +25 -0
  76. nodescraper/plugins/inband/cmdline/analyzer_args.py +80 -0
  77. nodescraper/plugins/inband/cmdline/cmdline_analyzer.py +113 -0
  78. nodescraper/plugins/inband/cmdline/cmdline_collector.py +77 -0
  79. nodescraper/plugins/inband/cmdline/cmdline_plugin.py +43 -0
  80. nodescraper/plugins/inband/cmdline/cmdlinedata.py +30 -0
  81. nodescraper/plugins/inband/device_enumeration/__init__.py +29 -0
  82. nodescraper/plugins/inband/device_enumeration/analyzer_args.py +73 -0
  83. nodescraper/plugins/inband/device_enumeration/device_enumeration_analyzer.py +81 -0
  84. nodescraper/plugins/inband/device_enumeration/device_enumeration_collector.py +176 -0
  85. nodescraper/plugins/inband/device_enumeration/device_enumeration_plugin.py +45 -0
  86. nodescraper/plugins/inband/device_enumeration/deviceenumdata.py +36 -0
  87. nodescraper/plugins/inband/dimm/__init__.py +25 -0
  88. nodescraper/plugins/inband/dimm/collector_args.py +31 -0
  89. nodescraper/plugins/inband/dimm/dimm_collector.py +151 -0
  90. nodescraper/plugins/inband/dimm/dimm_plugin.py +40 -0
  91. nodescraper/plugins/inband/dimm/dimmdata.py +30 -0
  92. nodescraper/plugins/inband/dkms/__init__.py +25 -0
  93. nodescraper/plugins/inband/dkms/analyzer_args.py +85 -0
  94. nodescraper/plugins/inband/dkms/dkms_analyzer.py +106 -0
  95. nodescraper/plugins/inband/dkms/dkms_collector.py +76 -0
  96. nodescraper/plugins/inband/dkms/dkms_plugin.py +43 -0
  97. nodescraper/plugins/inband/dkms/dkmsdata.py +33 -0
  98. nodescraper/plugins/inband/dmesg/__init__.py +28 -0
  99. nodescraper/plugins/inband/dmesg/analyzer_args.py +33 -0
  100. nodescraper/plugins/inband/dmesg/collector_args.py +39 -0
  101. nodescraper/plugins/inband/dmesg/dmesg_analyzer.py +503 -0
  102. nodescraper/plugins/inband/dmesg/dmesg_collector.py +164 -0
  103. nodescraper/plugins/inband/dmesg/dmesg_plugin.py +44 -0
  104. nodescraper/plugins/inband/dmesg/dmesgdata.py +116 -0
  105. nodescraper/plugins/inband/fabrics/__init__.py +28 -0
  106. nodescraper/plugins/inband/fabrics/fabrics_collector.py +726 -0
  107. nodescraper/plugins/inband/fabrics/fabrics_plugin.py +37 -0
  108. nodescraper/plugins/inband/fabrics/fabricsdata.py +140 -0
  109. nodescraper/plugins/inband/journal/__init__.py +28 -0
  110. nodescraper/plugins/inband/journal/collector_args.py +33 -0
  111. nodescraper/plugins/inband/journal/journal_collector.py +107 -0
  112. nodescraper/plugins/inband/journal/journal_plugin.py +40 -0
  113. nodescraper/plugins/inband/journal/journaldata.py +44 -0
  114. nodescraper/plugins/inband/kernel/__init__.py +25 -0
  115. nodescraper/plugins/inband/kernel/analyzer_args.py +64 -0
  116. nodescraper/plugins/inband/kernel/kernel_analyzer.py +91 -0
  117. nodescraper/plugins/inband/kernel/kernel_collector.py +129 -0
  118. nodescraper/plugins/inband/kernel/kernel_plugin.py +43 -0
  119. nodescraper/plugins/inband/kernel/kerneldata.py +32 -0
  120. nodescraper/plugins/inband/kernel_module/__init__.py +25 -0
  121. nodescraper/plugins/inband/kernel_module/analyzer_args.py +59 -0
  122. nodescraper/plugins/inband/kernel_module/kernel_module_analyzer.py +211 -0
  123. nodescraper/plugins/inband/kernel_module/kernel_module_collector.py +264 -0
  124. nodescraper/plugins/inband/kernel_module/kernel_module_data.py +60 -0
  125. nodescraper/plugins/inband/kernel_module/kernel_module_plugin.py +43 -0
  126. nodescraper/plugins/inband/memory/__init__.py +25 -0
  127. nodescraper/plugins/inband/memory/analyzer_args.py +45 -0
  128. nodescraper/plugins/inband/memory/memory_analyzer.py +98 -0
  129. nodescraper/plugins/inband/memory/memory_collector.py +330 -0
  130. nodescraper/plugins/inband/memory/memory_plugin.py +43 -0
  131. nodescraper/plugins/inband/memory/memorydata.py +90 -0
  132. nodescraper/plugins/inband/network/__init__.py +28 -0
  133. nodescraper/plugins/inband/network/network_collector.py +1828 -0
  134. nodescraper/plugins/inband/network/network_plugin.py +37 -0
  135. nodescraper/plugins/inband/network/networkdata.py +319 -0
  136. nodescraper/plugins/inband/nvme/__init__.py +28 -0
  137. nodescraper/plugins/inband/nvme/nvme_collector.py +167 -0
  138. nodescraper/plugins/inband/nvme/nvme_plugin.py +37 -0
  139. nodescraper/plugins/inband/nvme/nvmedata.py +45 -0
  140. nodescraper/plugins/inband/os/__init__.py +25 -0
  141. nodescraper/plugins/inband/os/analyzer_args.py +64 -0
  142. nodescraper/plugins/inband/os/os_analyzer.py +73 -0
  143. nodescraper/plugins/inband/os/os_collector.py +131 -0
  144. nodescraper/plugins/inband/os/os_plugin.py +43 -0
  145. nodescraper/plugins/inband/os/osdata.py +31 -0
  146. nodescraper/plugins/inband/package/__init__.py +25 -0
  147. nodescraper/plugins/inband/package/analyzer_args.py +48 -0
  148. nodescraper/plugins/inband/package/package_analyzer.py +253 -0
  149. nodescraper/plugins/inband/package/package_collector.py +273 -0
  150. nodescraper/plugins/inband/package/package_plugin.py +43 -0
  151. nodescraper/plugins/inband/package/packagedata.py +41 -0
  152. nodescraper/plugins/inband/pcie/__init__.py +29 -0
  153. nodescraper/plugins/inband/pcie/analyzer_args.py +63 -0
  154. nodescraper/plugins/inband/pcie/pcie_analyzer.py +1081 -0
  155. nodescraper/plugins/inband/pcie/pcie_collector.py +690 -0
  156. nodescraper/plugins/inband/pcie/pcie_data.py +2017 -0
  157. nodescraper/plugins/inband/pcie/pcie_plugin.py +43 -0
  158. nodescraper/plugins/inband/process/__init__.py +25 -0
  159. nodescraper/plugins/inband/process/analyzer_args.py +45 -0
  160. nodescraper/plugins/inband/process/collector_args.py +31 -0
  161. nodescraper/plugins/inband/process/process_analyzer.py +91 -0
  162. nodescraper/plugins/inband/process/process_collector.py +115 -0
  163. nodescraper/plugins/inband/process/process_plugin.py +46 -0
  164. nodescraper/plugins/inband/process/processdata.py +34 -0
  165. nodescraper/plugins/inband/rocm/__init__.py +25 -0
  166. nodescraper/plugins/inband/rocm/analyzer_args.py +66 -0
  167. nodescraper/plugins/inband/rocm/rocm_analyzer.py +100 -0
  168. nodescraper/plugins/inband/rocm/rocm_collector.py +205 -0
  169. nodescraper/plugins/inband/rocm/rocm_plugin.py +43 -0
  170. nodescraper/plugins/inband/rocm/rocmdata.py +62 -0
  171. nodescraper/plugins/inband/storage/__init__.py +25 -0
  172. nodescraper/plugins/inband/storage/analyzer_args.py +38 -0
  173. nodescraper/plugins/inband/storage/collector_args.py +31 -0
  174. nodescraper/plugins/inband/storage/storage_analyzer.py +152 -0
  175. nodescraper/plugins/inband/storage/storage_collector.py +110 -0
  176. nodescraper/plugins/inband/storage/storage_plugin.py +44 -0
  177. nodescraper/plugins/inband/storage/storagedata.py +70 -0
  178. nodescraper/plugins/inband/sysctl/__init__.py +29 -0
  179. nodescraper/plugins/inband/sysctl/analyzer_args.py +67 -0
  180. nodescraper/plugins/inband/sysctl/sysctl_analyzer.py +81 -0
  181. nodescraper/plugins/inband/sysctl/sysctl_collector.py +101 -0
  182. nodescraper/plugins/inband/sysctl/sysctl_plugin.py +43 -0
  183. nodescraper/plugins/inband/sysctl/sysctldata.py +42 -0
  184. nodescraper/plugins/inband/syslog/__init__.py +28 -0
  185. nodescraper/plugins/inband/syslog/syslog_collector.py +121 -0
  186. nodescraper/plugins/inband/syslog/syslog_plugin.py +37 -0
  187. nodescraper/plugins/inband/syslog/syslogdata.py +46 -0
  188. nodescraper/plugins/inband/uptime/__init__.py +25 -0
  189. nodescraper/plugins/inband/uptime/uptime_collector.py +88 -0
  190. nodescraper/plugins/inband/uptime/uptime_plugin.py +37 -0
  191. nodescraper/plugins/inband/uptime/uptimedata.py +31 -0
  192. nodescraper/resultcollators/__init__.py +25 -0
  193. nodescraper/resultcollators/tablesummary.py +159 -0
  194. nodescraper/taskresulthooks/__init__.py +28 -0
  195. nodescraper/taskresulthooks/filesystemloghook.py +88 -0
  196. nodescraper/typeutils.py +171 -0
  197. nodescraper/utils.py +412 -0
@@ -0,0 +1,690 @@
1
+ ###############################################################################
2
+ #
3
+ # MIT License
4
+ #
5
+ # Copyright (c) 2025 Advanced Micro Devices, Inc.
6
+ #
7
+ # Permission is hereby granted, free of charge, to any person obtaining a copy
8
+ # of this software and associated documentation files (the "Software"), to deal
9
+ # in the Software without restriction, including without limitation the rights
10
+ # to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
11
+ # copies of the Software, and to permit persons to whom the Software is
12
+ # furnished to do so, subject to the following conditions:
13
+ #
14
+ # The above copyright notice and this permission notice shall be included in all
15
+ # copies or substantial portions of the Software.
16
+ #
17
+ # THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
18
+ # IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
19
+ # FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
20
+ # AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
21
+ # LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
22
+ # OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
23
+ # SOFTWARE.
24
+ #
25
+ ###############################################################################
26
+ import re
27
+ from enum import Enum
28
+ from typing import Dict, List, Optional, Set, Tuple, Union
29
+
30
+ from pydantic import ValidationError
31
+
32
+ from nodescraper.base import InBandDataCollector
33
+ from nodescraper.connection.inband import TextFileArtifact
34
+ from nodescraper.enums import (
35
+ EventCategory,
36
+ EventPriority,
37
+ ExecutionStatus,
38
+ OSFamily,
39
+ SystemInteractionLevel,
40
+ )
41
+ from nodescraper.models import TaskResult
42
+ from nodescraper.utils import get_all_subclasses, get_exception_details
43
+
44
+ from .pcie_data import (
45
+ MAX_CAP_ID,
46
+ MAX_ECAP_ID,
47
+ CapabilityEnum,
48
+ ExtendedCapabilityEnum,
49
+ PcieCapStructure,
50
+ PcieCfgSpace,
51
+ PcieDataModel,
52
+ Type0Configuration,
53
+ Type1Configuration,
54
+ )
55
+
56
+
57
+ class PcieCollector(InBandDataCollector[PcieDataModel, None]):
58
+ """class for collection of PCIe data only supports Linux OS type.
59
+
60
+ This class collects the PCIE config space using the lspci hex dump and then parses the hex dump to get the
61
+ PCIe configuration space for the GPUs in the system. If the system interaction level is set to STANDARD or higher,
62
+ then the entire pcie configuration space is collected for the GPUs in the system. If the system interaction level
63
+ is set to SURFACE then, only the first 64 bytes of the pcie configuration space is collected for the GPUs in the system.
64
+
65
+ This class will collect important PCIe data from the system running the commands
66
+ - `lspci -vvv` : Verbose collection of PCIe data
67
+ - `lspci -vvvt`: Verbose tree view of PCIe data
68
+ - `lspci -PP`: Path view of PCIe data for the GPUs
69
+ - If system interaction level is set to STANDARD or higher, the following commands will be run with sudo:
70
+ - `lspci -xxxx`: Hex view of PCIe data for the GPUs
71
+ - otherwise the following commands will be run without sudo:
72
+ - `lspci -x`: Hex view of PCIe data for the GPUs
73
+ - `lspci -d <vendor_id>:<dev_id>` : Count the number of GPUs in the system with this command
74
+ - If system interaction level is set to STANDARD or higher, the following commands will be run with sudo:
75
+ - The sudo lspci -xxxx command is used to collect the PCIe configuration space for the GPUs in the system
76
+ - otherwise the following commands will be run without sudo:
77
+ - The lspci -x command is used to collect the PCIe configuration space for the GPUs in the system
78
+
79
+ """
80
+
81
+ SUPPORTED_OS_FAMILY: Set[OSFamily] = {OSFamily.LINUX}
82
+
83
+ DATA_MODEL = PcieDataModel
84
+
85
+ CMD_LSPCI_VERBOSE = "lspci -vvv"
86
+ CMD_LSPCI_VERBOSE_TREE = "lspci -vvvt"
87
+ CMD_LSPCI_PATH = "lspci -PP"
88
+ CMD_LSPCI_HEX_SUDO = "lspci -xxxx"
89
+ CMD_LSPCI_HEX = "lspci -x"
90
+ CMD_LSPCI_AMD_DEVICES = "lspci -d {vendor_id}: -nn"
91
+ CMD_LSPCI_PATH_DEVICE = "lspci -PP -d {vendor_id}:{dev_id}"
92
+
93
+ def _detect_amd_device_ids(self) -> dict[str, list[str]]:
94
+ """Detect AMD GPU device IDs from the system using lspci.
95
+
96
+ Returns:
97
+ dict[str, list[str]]: Dictionary with 'vendor_id', 'device_ids', and 'vf_device_ids'
98
+ """
99
+ vendor_id_hex = format(self.system_info.vendorid_ep, "x")
100
+ result: dict[str, list[str]] = {
101
+ "vendor_id": [vendor_id_hex],
102
+ "device_ids": [],
103
+ "vf_device_ids": [],
104
+ }
105
+
106
+ res = self._run_sut_cmd(
107
+ self.CMD_LSPCI_AMD_DEVICES.format(vendor_id=vendor_id_hex),
108
+ sudo=False,
109
+ log_artifact=False,
110
+ )
111
+ if res.exit_code == 0 and res.stdout:
112
+ # Pattern: [vendor:device]
113
+ device_id_pattern = rf"\[{vendor_id_hex}:([0-9a-fA-F]{{4}})\]"
114
+ # Pattern to detect VF in description
115
+ vf_pattern = r"Virtual Function"
116
+
117
+ for line in res.stdout.splitlines():
118
+ matches = re.findall(device_id_pattern, line)
119
+ if matches:
120
+ device_id = matches[0].lower()
121
+ # Check if it's a VF
122
+ if re.search(vf_pattern, line, re.IGNORECASE):
123
+ if device_id not in result["vf_device_ids"]:
124
+ result["vf_device_ids"].append(device_id)
125
+ self.logger.info(f"Detected AMD VF device ID: {device_id}")
126
+ else:
127
+ if device_id not in result["device_ids"]:
128
+ result["device_ids"].append(device_id)
129
+ self.logger.info(f"Detected AMD device ID: {device_id}")
130
+
131
+ self._log_event(
132
+ category=EventCategory.IO,
133
+ description="Detected AMD GPU device IDs from system",
134
+ data=result,
135
+ priority=EventPriority.INFO,
136
+ )
137
+
138
+ return result
139
+
140
+ def show_lspci_verbose(self, sudo=True) -> Optional[str]:
141
+ """Show lspci with -vvv."""
142
+ return self._run_os_cmd(self.CMD_LSPCI_VERBOSE, sudo=sudo)
143
+
144
+ def show_lspci_verbose_tree(self, sudo=True) -> Optional[str]:
145
+ """Show lspci with -vvvt (verbose tree view)."""
146
+ return self._run_os_cmd(self.CMD_LSPCI_VERBOSE_TREE, sudo=sudo)
147
+
148
+ def show_lspci_path(self, sudo=True) -> Optional[str]:
149
+ """Show lspci with -PP."""
150
+ return self._run_os_cmd(self.CMD_LSPCI_PATH, sudo=sudo)
151
+
152
+ def show_lspci_hex(self, bdf: Optional[str] = None, sudo=True) -> Optional[str]:
153
+ """Show lspci with -xxxx."""
154
+ if sudo:
155
+ hex_arg = "-xxxx"
156
+ else:
157
+ # Sudo required for whole pcie configuration space
158
+ hex_arg = "-x"
159
+
160
+ if bdf:
161
+ return self._run_os_cmd(f"lspci {hex_arg} -s {bdf}", sudo=sudo)
162
+ return self._run_os_cmd(f"lspci {hex_arg}", sudo=sudo)
163
+
164
+ def _run_os_cmd(
165
+ self, command: str, sudo: bool = True, ignore_error: bool = False
166
+ ) -> Optional[str]:
167
+ """Run os command. Run as sudo by default.
168
+
169
+ Args:
170
+ command (str): command to run on the OS
171
+ sudo (bool): run as sudo or not
172
+ ignore_error (bool): ignore error or not
173
+ Returns:
174
+ stdout: str
175
+ """
176
+ cmd_ret = self._run_sut_cmd(command, sudo=sudo)
177
+ if ignore_error:
178
+ return cmd_ret.stdout
179
+ elif cmd_ret.stderr != "" or cmd_ret.exit_code != 0:
180
+ return None
181
+ else:
182
+ return cmd_ret.stdout
183
+
184
+ def _get_upstream_bdf_from_buspath(
185
+ self,
186
+ vendor_id: str,
187
+ dev_id: str,
188
+ upstream_steps_limit: Optional[int] = 0,
189
+ sudo=True,
190
+ ) -> Optional[Dict[str, List[str]]]:
191
+ """Get all the upstream BDFs for a vendor/device id.
192
+
193
+ Parameters
194
+ ----------
195
+ vendor_id : str
196
+ A pcie vendor id
197
+ dev_id : str
198
+ A pcie device id
199
+ upstream_steps_limit : Optional[int]
200
+ The limit on the number of upstream devices to collect, by default 0
201
+ sudo : bool
202
+ Run the command as sudo or not, by default True
203
+
204
+ Returns
205
+ -------
206
+ Optional[List[str]]
207
+ A list of upstream BDFs or None on failure
208
+ """
209
+ split_bdf_pos = 0
210
+
211
+ bus_path_all_gpus = self._run_os_cmd(f"lspci -PP -d {vendor_id}:{dev_id}", sudo=sudo)
212
+ if bus_path_all_gpus is None or bus_path_all_gpus == "":
213
+ self._log_event(
214
+ category=EventCategory.IO,
215
+ description="Failed to get bus path info for vendor/device ID.",
216
+ data={"vendor_id": vendor_id, "dev_id": dev_id},
217
+ priority=EventPriority.INFO,
218
+ )
219
+ return None
220
+ upstream_bdfs: Dict[str, List[str]] = {}
221
+ for bus_path in bus_path_all_gpus.splitlines():
222
+ bus_path_list = (bus_path.split(" ")[split_bdf_pos]).split("/")
223
+ if upstream_steps_limit is not None and len(bus_path_list) < upstream_steps_limit + 1:
224
+ # We don't have enough upstream devices to collect
225
+ self._log_event(
226
+ category=EventCategory.RUNTIME,
227
+ description="Not enough upstream devices found.",
228
+ data={
229
+ "bus_path": bus_path,
230
+ "upstream_steps_limit": upstream_steps_limit,
231
+ "bus_path_list": bus_path_list,
232
+ },
233
+ priority=EventPriority.WARNING,
234
+ )
235
+ bdf_str = bus_path_list[-1]
236
+ upstream_bdfs[bdf_str] = []
237
+ # Flip the bus_path_list to get GPU first and then upstream devices
238
+ bus_path_list.reverse()
239
+ # Upstream + 1 to always include GPU and # of upstream devices
240
+ if upstream_steps_limit is None:
241
+ upstream_bdfs[bdf_str] = bus_path_list
242
+ else:
243
+ for bdf in range(min(len(bus_path_list), upstream_steps_limit + 1)):
244
+ upstream_bdfs[bdf_str].append(bus_path_list[bdf])
245
+
246
+ return upstream_bdfs
247
+
248
    def _get_gpu_cfg_space(
        self,
        vendor_id: str,
        device_id: str,
        upstream_steps_from_gpu: Optional[int] = 0,
        sudo=True,
    ) -> dict[str, PcieCfgSpace]:
        """
        - Generates a nested dictionary with the PCIe configuration space for the bdfs corresponding to the vendor/device ID
        - Populates the dict by parsing the lspci hex dump for each BDF (via get_cfg_by_bdf)

        Args:
            vendor_id (str): vendor ID (hex format)
            device_id (str): device ID (hex format)
            upstream_steps_from_gpu (Optional[int]): The number of upstream devices to collect the PCIe cfg space for, by default 0. None collects the full upstream path.
            sudo (bool): run the lspci commands with sudo or not, by default True
        Returns:
            all_bdf_cfg_space_dict: nested dictionary containing PCIe cfg space for all bdfs corresponding to the vendor/device ID
        """
        # Defensive check: callers may pass None despite the str annotation
        if (vendor_id is None) or (device_id is None):
            self._log_event(
                category=EventCategory.IO,
                description="System info is invalid Vendor ID or Device ID is None.",
                data={"vendor_id": vendor_id, "dev_id": device_id},
                priority=EventPriority.ERROR,
            )
            return {}

        # Maps each endpoint BDF to its bus path (endpoint first, then upstream devices)
        bdf_list = self._get_upstream_bdf_from_buspath(
            vendor_id,
            device_id,
            upstream_steps_limit=upstream_steps_from_gpu,
            sudo=sudo,
        )
        if bdf_list is None:
            return {}

        all_bdf_cfg_space_dict = {}
        for gpu_bdf_list in bdf_list.values():
            for bdf in gpu_bdf_list:
                new_base_dict = self.get_cfg_by_bdf(bdf, sudo=sudo)
                all_bdf_cfg_space_dict[bdf] = new_base_dict
        return all_bdf_cfg_space_dict
290
+
291
+ def parse_hex_dump(self, hex_dump: str) -> list[int]:
292
+ """Parse the hex dump."""
293
+
294
+ hex_dump = hex_dump.strip()
295
+ byte_list = []
296
+ for line in hex_dump.splitlines():
297
+ parts = line.split(":")
298
+ if len(parts) != 2:
299
+ continue # Skip malformed lines
300
+ if len(parts[1]) != 48:
301
+ continue # Unexpected number of bytes
302
+ byte_str = parts[1]
303
+ tokens = byte_str.strip().split()
304
+ for token in tokens:
305
+ byte = int(token, 16)
306
+ byte_list.append(byte)
307
+
308
+ return byte_list
309
+
310
+ def read_register(self, width: int, offset: int, config_data: List[int]):
311
+ """Read a register from the hex dump, width should be 1, 2, 4, or 8 bytes"""
312
+ register_value = 0
313
+ for i in range(0, width >> 3):
314
+ register_value += config_data[offset + i] << (i * 8)
315
+ return register_value
316
+
317
+ def extended_cap_finder(
318
+ self,
319
+ config_data: List[int],
320
+ cap_pointer: int,
321
+ cap_data: Optional[Dict[int, int]] = None,
322
+ ):
323
+ """Obtain capability structure by parsing the hex dump for capability pointers
324
+
325
+ config_data : List[int]
326
+ A list of int's representing the hex dump from lspci -x or sudo lspci -xxxx
327
+ cap_pointer : int
328
+ The hex value of a Capability pointer or 0x34 for the first cap pointer
329
+ cap_data : Optional[dict[int, int]], optional
330
+ A dictionary of capability pointers, by default None
331
+
332
+ returns
333
+ -------
334
+ cap_data : Dict[int, int]
335
+ A list of capability pointers, key is the cap_id and value is the cap_pointer use CapabilityEnum(cap_id) to get the Name
336
+ """
337
+ if cap_data is None:
338
+ cap_data = {}
339
+ if cap_pointer >= len(config_data) or cap_pointer + 1 >= len(config_data):
340
+ # prevent an illegal access to the list
341
+ return cap_data
342
+ cap_id = config_data[cap_pointer] + (config_data[cap_pointer + 1] << 8)
343
+ if cap_id > MAX_ECAP_ID:
344
+ # Break if the cap_id is greater than the max extended cap id
345
+ self._log_event(
346
+ category=EventCategory.IO,
347
+ description=f"Invalid Capability ID detected {cap_id}",
348
+ priority=EventPriority.ERROR,
349
+ data={"cap_id": cap_id},
350
+ )
351
+ return {}
352
+ cap_data[cap_id] = cap_pointer
353
+ if cap_pointer + 3 >= len(config_data):
354
+ return cap_data
355
+ next_cap_pointer = (config_data[cap_pointer + 2] & 0xF0) >> 4
356
+ next_cap_pointer += config_data[cap_pointer + 3] << 4
357
+ if next_cap_pointer == 0:
358
+ return cap_data
359
+ else:
360
+ return self.extended_cap_finder(config_data, next_cap_pointer, cap_data)
361
+
362
+ def cap_finder(
363
+ self,
364
+ config_data: List[int],
365
+ cap_pointer: int,
366
+ cap_data: Optional[Dict[int, int]] = None,
367
+ ):
368
+ """Obtain capability structure by parsing the hex dump for capability pointers
369
+
370
+ Parameters
371
+ ----------
372
+ config_data : List[int]
373
+ A list of int's representing the hex dump from lspci -xxxx
374
+ cap_pointer : int
375
+ The hex value of a Capability pointer or 0x34 for the first cap pointer
376
+ cap_data : Optional[Dict[int, int]], optional
377
+ A dictionary of capability pointers, by default None
378
+
379
+ Returns
380
+ -------
381
+ cap_data : Dict[int, int]
382
+ A list of extended apability pointers, key is the cap_id and value is the cap_pointer use ExtendedCapabilityEnum(cap_id) to get the Name
383
+ """
384
+ if cap_data is None:
385
+ cap_data = {}
386
+
387
+ if cap_pointer == 0x34:
388
+ # Special case for ths first cap pointer, this one doesn't have an associated cap_id so just move on
389
+ return self.cap_finder(config_data, config_data[0x34], cap_data)
390
+ if cap_pointer >= len(config_data) or cap_pointer + 1 >= len(config_data):
391
+ # prevent an illegal access to the list
392
+ return cap_data
393
+ cap_id = config_data[cap_pointer]
394
+ if cap_id > MAX_CAP_ID:
395
+ # Break if the cap_id is greater than the max cap id
396
+ self._log_event(
397
+ category=EventCategory.IO,
398
+ description=f"Invalid Capability ID detected {cap_id}",
399
+ priority=EventPriority.ERROR,
400
+ data={"cap_id": cap_id},
401
+ )
402
+ return {}
403
+ next_cap_pointer = config_data[cap_pointer + 1]
404
+ cap_data[cap_id] = cap_pointer
405
+ if next_cap_pointer == 0:
406
+ return cap_data
407
+ else:
408
+ return self.cap_finder(config_data, next_cap_pointer, cap_data)
409
+
410
+ def get_cap_struct(self, id: Enum) -> Optional[type[PcieCapStructure]]:
411
+ for cap_struct in get_all_subclasses(PcieCapStructure):
412
+ if cap_struct.cap_id == id:
413
+ return cap_struct
414
+ return None
415
+
416
+ def get_pcie_common_cfg(
417
+ self,
418
+ type_x_configuration: Union[type[Type0Configuration], type[Type1Configuration]],
419
+ config_data: List[int],
420
+ ) -> Union[Type0Configuration, Type1Configuration]:
421
+ """Get the Base PCIe configuration space from the hex dump items
422
+
423
+ Parameters
424
+ ----------
425
+ type_x_configuration : Union[type[Type0Configuration], type[Type1Configuration]]
426
+ Either Type0Configuration or Type1Configuration
427
+ config_data : List[int]
428
+ Config data from lspci -xxxx
429
+
430
+ Returns
431
+ -------
432
+ Union[Type0Configuration, Type1Configuration]
433
+ The complete model that was input
434
+ """
435
+ register_data: Dict[str, int] = {}
436
+ type_x_obj = type_x_configuration()
437
+ for register_name, register_in in type_x_obj.iter_regs():
438
+ register = register_in.model_copy()
439
+ register_data[register_name] = self.read_register(
440
+ register.width, register.offset, config_data
441
+ )
442
+ type_x_obj.set_regs(register_data)
443
+ return type_x_obj
444
+
445
+ def get_cap_cfg(
446
+ self,
447
+ cap_data: Dict[int, int],
448
+ config_data: List[int],
449
+ ) -> Union[
450
+ Dict[CapabilityEnum, PcieCapStructure], Dict[ExtendedCapabilityEnum, PcieCapStructure]
451
+ ]:
452
+ """Get the data from the capability structures
453
+
454
+ Parameters
455
+ ----------
456
+ cap_data : Dict[int,int]
457
+ A list of capability pointers, key is the cap_id and value is the cap_pointer
458
+ config_data : List[int]
459
+ A list of ints representing the hex dump from lspci -xxxx
460
+
461
+ Returns
462
+ -------
463
+ Union[Dict[CapabilityEnum, PcieCapStructure], Dict[ExtendedCapabilityEnum, PcieCapStructure]]
464
+ Either a dict of CapabilityEnum to PcieCapStructure or ExtendedCapabilityEnum to PcieCapStructure
465
+
466
+ """
467
+ cap_structure: Dict[Enum, PcieCapStructure] = {}
468
+ for cap_id, cap_addr in cap_data.items():
469
+ if cap_id == 0:
470
+ continue
471
+ if cap_addr >= 0x100:
472
+ cap_enum: Enum = ExtendedCapabilityEnum(cap_id)
473
+ else:
474
+ cap_enum = CapabilityEnum(cap_id)
475
+ cap_cls = self.get_cap_struct(cap_enum)
476
+ if cap_cls is None:
477
+ continue
478
+ cap_obj = cap_cls() # type: ignore[call-arg]
479
+ reg_data = {}
480
+ for register_name, register in cap_obj.iter_regs():
481
+ reg_data[register_name] = self.read_register(
482
+ register.width, register.offset + cap_addr, config_data
483
+ )
484
+ cap_obj.set_regs(reg_data)
485
+ cap_obj.offset = cap_addr
486
+ cap_structure[cap_enum] = cap_obj
487
+
488
+ return cap_structure # type: ignore[return-value]
489
+
490
    def get_cfg_by_bdf(self, bdf: str, sudo=True) -> PcieCfgSpace:
        """Will fill out a PcieCfgSpace object with the PCIe configuration space for a given BDF

        Args:
            bdf (str): the bus:device.function identifier to dump
            sudo (bool): run lspci with sudo (full dump) or not (first 64 bytes only)

        Returns:
            PcieCfgSpace: the decoded config space; an empty PcieCfgSpace on failure
        """
        hex_data_raw = self.show_lspci_hex(bdf, sudo=sudo)
        if hex_data_raw is None:
            self._log_event(
                category=EventCategory.IO,
                description="Failed to get hex data for BDF.",
                data={"bdf": bdf},
                priority=EventPriority.ERROR,
            )
            return PcieCfgSpace()
        hex_data: List[int] = self.parse_hex_dump(hex_data_raw)
        if len(hex_data) < 64:
            # Expect at least 64 bytes: the standard PCI configuration header,
            # which is all that unprivileged lspci -x can dump
            self._log_event(
                category=EventCategory.IO,
                description="Hex data is not the expected length",
                data={"bdf": bdf, "length": len(hex_data)},
                priority=EventPriority.ERROR,
            )
            return PcieCfgSpace()
        cap_data, ecap_data = self.discover_capability_structure(hex_data)
        return self.get_pcie_cfg(hex_data, cap_data, ecap_data)
513
+
514
    def get_pcie_cfg(
        self,
        config_data: List[int],
        cap_data: Dict[int, int],
        ecap_data: Dict[int, int],
    ) -> PcieCfgSpace:
        """Gets the pcie config space from a list of ints

        Parameters
        ----------
        config_data : List[int]
            A list of ints representing the hex dump from lspci -xxxx
        cap_data : Dict[int, int]
            Legacy capability pointers, key is the cap_id and value is the cap_pointer
        ecap_data : Dict[int, int]
            Extended capability pointers, key is the cap_id and value is the cap_pointer

        Returns
        -------
        PcieCfgSpace
            A PcieCfgSpace object with the PCIe configuration
        """
        # Decode both header layouts; the analyzer picks the one matching the header type
        type0 = self.get_pcie_common_cfg(Type0Configuration, config_data)
        type1 = self.get_pcie_common_cfg(Type1Configuration, config_data)
        cap = self.get_cap_cfg(cap_data, config_data)
        ecap = self.get_cap_cfg(ecap_data, config_data)
        return PcieCfgSpace(
            type_0_configuration=type0,  # type: ignore[arg-type]
            type_1_configuration=type1,  # type: ignore[arg-type]
            capability_pointers=cap_data,  # type: ignore[arg-type]
            extended_capability_pointers=ecap_data,  # type: ignore[arg-type]
            cap_structure=cap,  # type: ignore[arg-type]
            ecap_structure=ecap,  # type: ignore[arg-type]
        )
546
+
547
+ def _log_pcie_artifacts(
548
+ self,
549
+ lspci_pp: Optional[str],
550
+ lspci_hex: Optional[str],
551
+ lspci_verbose_tree: Optional[str],
552
+ lspci_verbose: Optional[str],
553
+ ):
554
+ """Log the file artifacts for the PCIe data collector."""
555
+ name_log_map = {
556
+ "lspci_hex.txt": lspci_hex,
557
+ "lspci_verbose_tree.txt": lspci_verbose_tree,
558
+ "lspci_verbose.txt": lspci_verbose,
559
+ "lspci_pp.txt": lspci_pp,
560
+ }
561
+ for name, data in name_log_map.items():
562
+ if data is not None:
563
+ self.result.artifacts.append(TextFileArtifact(filename=name, contents=data))
564
+
565
+ def _get_pcie_data(
566
+ self, upstream_steps_to_collect: Optional[int] = None
567
+ ) -> Optional[PcieDataModel]:
568
+ """Will return all PCIe data in a PcieDataModel object.
569
+
570
+ Returns
571
+ -------
572
+ Optional[PcieDataModel]
573
+ The data in a PcieDataModel object or None on failure
574
+ """
575
+ minimum_system_interaction_level_required_for_sudo = SystemInteractionLevel.INTERACTIVE
576
+
577
+ try:
578
+ if (
579
+ isinstance(self.system_interaction_level, SystemInteractionLevel)
580
+ and self.system_interaction_level
581
+ >= minimum_system_interaction_level_required_for_sudo
582
+ ):
583
+ use_sudo = True
584
+ else:
585
+ use_sudo = False
586
+
587
+ if upstream_steps_to_collect is None:
588
+ upstream_steps_to_collect = None
589
+
590
+ # Detect AMD device IDs dynamically from the system
591
+ detected_devices = self._detect_amd_device_ids()
592
+ vendor_id = (
593
+ detected_devices["vendor_id"][0]
594
+ if detected_devices["vendor_id"]
595
+ else format(self.system_info.vendorid_ep, "x")
596
+ )
597
+ device_ids = detected_devices["device_ids"]
598
+ vf_device_ids = detected_devices["vf_device_ids"]
599
+
600
+ pcie_cfg_dict: Dict[str, PcieCfgSpace] = {}
601
+ vf_pcie_cfg_data: Dict[str, PcieCfgSpace] = {}
602
+
603
+ # Collect PCIe config space for each detected device ID
604
+ for dev_id in device_ids:
605
+ cfg_space = self._get_gpu_cfg_space(
606
+ vendor_id=vendor_id,
607
+ device_id=dev_id,
608
+ upstream_steps_from_gpu=upstream_steps_to_collect,
609
+ sudo=use_sudo,
610
+ )
611
+ if cfg_space:
612
+ pcie_cfg_dict.update(cfg_space)
613
+
614
+ # Collect VF PCIe config space for each detected VF device ID
615
+ for dev_id_vf in vf_device_ids:
616
+ vf_cfg_space = self._get_gpu_cfg_space(
617
+ vendor_id=vendor_id,
618
+ device_id=dev_id_vf,
619
+ upstream_steps_from_gpu=0,
620
+ sudo=use_sudo,
621
+ )
622
+ if vf_cfg_space:
623
+ vf_pcie_cfg_data.update(vf_cfg_space)
624
+
625
+ lspci_hex = self.show_lspci_hex(sudo=use_sudo)
626
+ lspci_verbose = self.show_lspci_verbose(sudo=use_sudo)
627
+ lspci_verbose_tree = self.show_lspci_verbose_tree(sudo=use_sudo)
628
+ lspci_path = self.show_lspci_path(sudo=use_sudo)
629
+ self._log_pcie_artifacts(
630
+ lspci_pp=lspci_path,
631
+ lspci_hex=lspci_hex,
632
+ lspci_verbose_tree=lspci_verbose_tree,
633
+ lspci_verbose=lspci_verbose,
634
+ )
635
+ pcie_data = PcieDataModel(
636
+ pcie_cfg_space=pcie_cfg_dict,
637
+ vf_pcie_cfg_space=vf_pcie_cfg_data,
638
+ )
639
+ except ValidationError as e:
640
+ self._log_event(
641
+ category=EventCategory.OS,
642
+ description="Failed to build model for PCIe data",
643
+ data=get_exception_details(e),
644
+ priority=EventPriority.ERROR,
645
+ )
646
+ self.result.status = ExecutionStatus.ERROR
647
+ return None
648
+ return pcie_data
649
+
650
+ def discover_capability_structure(
651
+ self, hex_dump: List[int]
652
+ ) -> Tuple[Dict[int, int], Dict[int, int]]:
653
+ """Obtain the capability structure by parsing the hex dump for capability pointers
654
+
655
+ Parameters
656
+ ----------
657
+ hex_dump : List[int]
658
+ A list of ints from lspci -xxxx
659
+
660
+ Returns
661
+ -------
662
+ dict[int, int]
663
+ A list of capability pointers, key is the cap_id and value is the cap_pointer
664
+ """
665
+ cap = self.cap_finder(hex_dump, 0x34)
666
+ ecap = self.extended_cap_finder(hex_dump, 0x100)
667
+ return cap, ecap
668
+
669
+ def collect_data(
670
+ self, args=None, upstream_steps_to_collect: Optional[int] = None, **kwargs
671
+ ) -> Tuple[TaskResult, Optional[PcieDataModel]]:
672
+ """Read PCIe data.
673
+
674
+ Args:
675
+ args: Optional collector arguments (not used)
676
+ upstream_steps_to_collect: Number of upstream devices to collect
677
+ **kwargs: Additional keyword arguments
678
+
679
+ Returns:
680
+ Tuple[TaskResult, Optional[PcieDataModel]]: tuple containing the result of the task and the PCIe data if available
681
+ """
682
+ pcie_data = self._get_pcie_data(upstream_steps_to_collect)
683
+ if pcie_data:
684
+ self._log_event(
685
+ category=EventCategory.IO,
686
+ description="PCIe Data read from GPUs",
687
+ data={"bdf_count": len(pcie_data.pcie_cfg_space.keys())},
688
+ priority=EventPriority.INFO,
689
+ )
690
+ return self.result, pcie_data