amd-node-scraper 0.0.1__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- amd_node_scraper-0.0.1.dist-info/LICENSE +21 -0
- amd_node_scraper-0.0.1.dist-info/METADATA +424 -0
- amd_node_scraper-0.0.1.dist-info/RECORD +197 -0
- amd_node_scraper-0.0.1.dist-info/WHEEL +5 -0
- amd_node_scraper-0.0.1.dist-info/entry_points.txt +2 -0
- amd_node_scraper-0.0.1.dist-info/top_level.txt +1 -0
- nodescraper/__init__.py +32 -0
- nodescraper/base/__init__.py +34 -0
- nodescraper/base/inbandcollectortask.py +118 -0
- nodescraper/base/inbanddataplugin.py +39 -0
- nodescraper/base/regexanalyzer.py +120 -0
- nodescraper/cli/__init__.py +29 -0
- nodescraper/cli/cli.py +511 -0
- nodescraper/cli/constants.py +27 -0
- nodescraper/cli/dynamicparserbuilder.py +171 -0
- nodescraper/cli/helper.py +517 -0
- nodescraper/cli/inputargtypes.py +129 -0
- nodescraper/configbuilder.py +123 -0
- nodescraper/configregistry.py +66 -0
- nodescraper/configs/node_status.json +19 -0
- nodescraper/connection/__init__.py +25 -0
- nodescraper/connection/inband/__init__.py +46 -0
- nodescraper/connection/inband/inband.py +171 -0
- nodescraper/connection/inband/inbandlocal.py +93 -0
- nodescraper/connection/inband/inbandmanager.py +151 -0
- nodescraper/connection/inband/inbandremote.py +173 -0
- nodescraper/connection/inband/sshparams.py +43 -0
- nodescraper/constants.py +26 -0
- nodescraper/enums/__init__.py +40 -0
- nodescraper/enums/eventcategory.py +89 -0
- nodescraper/enums/eventpriority.py +42 -0
- nodescraper/enums/executionstatus.py +44 -0
- nodescraper/enums/osfamily.py +34 -0
- nodescraper/enums/systeminteraction.py +41 -0
- nodescraper/enums/systemlocation.py +33 -0
- nodescraper/generictypes.py +36 -0
- nodescraper/interfaces/__init__.py +44 -0
- nodescraper/interfaces/connectionmanager.py +143 -0
- nodescraper/interfaces/dataanalyzertask.py +138 -0
- nodescraper/interfaces/datacollectortask.py +185 -0
- nodescraper/interfaces/dataplugin.py +356 -0
- nodescraper/interfaces/plugin.py +127 -0
- nodescraper/interfaces/resultcollator.py +56 -0
- nodescraper/interfaces/task.py +164 -0
- nodescraper/interfaces/taskresulthook.py +39 -0
- nodescraper/models/__init__.py +48 -0
- nodescraper/models/analyzerargs.py +93 -0
- nodescraper/models/collectorargs.py +30 -0
- nodescraper/models/connectionconfig.py +34 -0
- nodescraper/models/datamodel.py +171 -0
- nodescraper/models/datapluginresult.py +39 -0
- nodescraper/models/event.py +158 -0
- nodescraper/models/pluginconfig.py +38 -0
- nodescraper/models/pluginresult.py +39 -0
- nodescraper/models/systeminfo.py +44 -0
- nodescraper/models/taskresult.py +185 -0
- nodescraper/models/timerangeargs.py +38 -0
- nodescraper/pluginexecutor.py +274 -0
- nodescraper/pluginregistry.py +152 -0
- nodescraper/plugins/__init__.py +25 -0
- nodescraper/plugins/inband/__init__.py +25 -0
- nodescraper/plugins/inband/amdsmi/__init__.py +28 -0
- nodescraper/plugins/inband/amdsmi/amdsmi_analyzer.py +821 -0
- nodescraper/plugins/inband/amdsmi/amdsmi_collector.py +1313 -0
- nodescraper/plugins/inband/amdsmi/amdsmi_plugin.py +43 -0
- nodescraper/plugins/inband/amdsmi/amdsmidata.py +1002 -0
- nodescraper/plugins/inband/amdsmi/analyzer_args.py +50 -0
- nodescraper/plugins/inband/amdsmi/cper.py +65 -0
- nodescraper/plugins/inband/bios/__init__.py +29 -0
- nodescraper/plugins/inband/bios/analyzer_args.py +64 -0
- nodescraper/plugins/inband/bios/bios_analyzer.py +93 -0
- nodescraper/plugins/inband/bios/bios_collector.py +93 -0
- nodescraper/plugins/inband/bios/bios_plugin.py +43 -0
- nodescraper/plugins/inband/bios/biosdata.py +30 -0
- nodescraper/plugins/inband/cmdline/__init__.py +25 -0
- nodescraper/plugins/inband/cmdline/analyzer_args.py +80 -0
- nodescraper/plugins/inband/cmdline/cmdline_analyzer.py +113 -0
- nodescraper/plugins/inband/cmdline/cmdline_collector.py +77 -0
- nodescraper/plugins/inband/cmdline/cmdline_plugin.py +43 -0
- nodescraper/plugins/inband/cmdline/cmdlinedata.py +30 -0
- nodescraper/plugins/inband/device_enumeration/__init__.py +29 -0
- nodescraper/plugins/inband/device_enumeration/analyzer_args.py +73 -0
- nodescraper/plugins/inband/device_enumeration/device_enumeration_analyzer.py +81 -0
- nodescraper/plugins/inband/device_enumeration/device_enumeration_collector.py +176 -0
- nodescraper/plugins/inband/device_enumeration/device_enumeration_plugin.py +45 -0
- nodescraper/plugins/inband/device_enumeration/deviceenumdata.py +36 -0
- nodescraper/plugins/inband/dimm/__init__.py +25 -0
- nodescraper/plugins/inband/dimm/collector_args.py +31 -0
- nodescraper/plugins/inband/dimm/dimm_collector.py +151 -0
- nodescraper/plugins/inband/dimm/dimm_plugin.py +40 -0
- nodescraper/plugins/inband/dimm/dimmdata.py +30 -0
- nodescraper/plugins/inband/dkms/__init__.py +25 -0
- nodescraper/plugins/inband/dkms/analyzer_args.py +85 -0
- nodescraper/plugins/inband/dkms/dkms_analyzer.py +106 -0
- nodescraper/plugins/inband/dkms/dkms_collector.py +76 -0
- nodescraper/plugins/inband/dkms/dkms_plugin.py +43 -0
- nodescraper/plugins/inband/dkms/dkmsdata.py +33 -0
- nodescraper/plugins/inband/dmesg/__init__.py +28 -0
- nodescraper/plugins/inband/dmesg/analyzer_args.py +33 -0
- nodescraper/plugins/inband/dmesg/collector_args.py +39 -0
- nodescraper/plugins/inband/dmesg/dmesg_analyzer.py +503 -0
- nodescraper/plugins/inband/dmesg/dmesg_collector.py +164 -0
- nodescraper/plugins/inband/dmesg/dmesg_plugin.py +44 -0
- nodescraper/plugins/inband/dmesg/dmesgdata.py +116 -0
- nodescraper/plugins/inband/fabrics/__init__.py +28 -0
- nodescraper/plugins/inband/fabrics/fabrics_collector.py +726 -0
- nodescraper/plugins/inband/fabrics/fabrics_plugin.py +37 -0
- nodescraper/plugins/inband/fabrics/fabricsdata.py +140 -0
- nodescraper/plugins/inband/journal/__init__.py +28 -0
- nodescraper/plugins/inband/journal/collector_args.py +33 -0
- nodescraper/plugins/inband/journal/journal_collector.py +107 -0
- nodescraper/plugins/inband/journal/journal_plugin.py +40 -0
- nodescraper/plugins/inband/journal/journaldata.py +44 -0
- nodescraper/plugins/inband/kernel/__init__.py +25 -0
- nodescraper/plugins/inband/kernel/analyzer_args.py +64 -0
- nodescraper/plugins/inband/kernel/kernel_analyzer.py +91 -0
- nodescraper/plugins/inband/kernel/kernel_collector.py +129 -0
- nodescraper/plugins/inband/kernel/kernel_plugin.py +43 -0
- nodescraper/plugins/inband/kernel/kerneldata.py +32 -0
- nodescraper/plugins/inband/kernel_module/__init__.py +25 -0
- nodescraper/plugins/inband/kernel_module/analyzer_args.py +59 -0
- nodescraper/plugins/inband/kernel_module/kernel_module_analyzer.py +211 -0
- nodescraper/plugins/inband/kernel_module/kernel_module_collector.py +264 -0
- nodescraper/plugins/inband/kernel_module/kernel_module_data.py +60 -0
- nodescraper/plugins/inband/kernel_module/kernel_module_plugin.py +43 -0
- nodescraper/plugins/inband/memory/__init__.py +25 -0
- nodescraper/plugins/inband/memory/analyzer_args.py +45 -0
- nodescraper/plugins/inband/memory/memory_analyzer.py +98 -0
- nodescraper/plugins/inband/memory/memory_collector.py +330 -0
- nodescraper/plugins/inband/memory/memory_plugin.py +43 -0
- nodescraper/plugins/inband/memory/memorydata.py +90 -0
- nodescraper/plugins/inband/network/__init__.py +28 -0
- nodescraper/plugins/inband/network/network_collector.py +1828 -0
- nodescraper/plugins/inband/network/network_plugin.py +37 -0
- nodescraper/plugins/inband/network/networkdata.py +319 -0
- nodescraper/plugins/inband/nvme/__init__.py +28 -0
- nodescraper/plugins/inband/nvme/nvme_collector.py +167 -0
- nodescraper/plugins/inband/nvme/nvme_plugin.py +37 -0
- nodescraper/plugins/inband/nvme/nvmedata.py +45 -0
- nodescraper/plugins/inband/os/__init__.py +25 -0
- nodescraper/plugins/inband/os/analyzer_args.py +64 -0
- nodescraper/plugins/inband/os/os_analyzer.py +73 -0
- nodescraper/plugins/inband/os/os_collector.py +131 -0
- nodescraper/plugins/inband/os/os_plugin.py +43 -0
- nodescraper/plugins/inband/os/osdata.py +31 -0
- nodescraper/plugins/inband/package/__init__.py +25 -0
- nodescraper/plugins/inband/package/analyzer_args.py +48 -0
- nodescraper/plugins/inband/package/package_analyzer.py +253 -0
- nodescraper/plugins/inband/package/package_collector.py +273 -0
- nodescraper/plugins/inband/package/package_plugin.py +43 -0
- nodescraper/plugins/inband/package/packagedata.py +41 -0
- nodescraper/plugins/inband/pcie/__init__.py +29 -0
- nodescraper/plugins/inband/pcie/analyzer_args.py +63 -0
- nodescraper/plugins/inband/pcie/pcie_analyzer.py +1081 -0
- nodescraper/plugins/inband/pcie/pcie_collector.py +690 -0
- nodescraper/plugins/inband/pcie/pcie_data.py +2017 -0
- nodescraper/plugins/inband/pcie/pcie_plugin.py +43 -0
- nodescraper/plugins/inband/process/__init__.py +25 -0
- nodescraper/plugins/inband/process/analyzer_args.py +45 -0
- nodescraper/plugins/inband/process/collector_args.py +31 -0
- nodescraper/plugins/inband/process/process_analyzer.py +91 -0
- nodescraper/plugins/inband/process/process_collector.py +115 -0
- nodescraper/plugins/inband/process/process_plugin.py +46 -0
- nodescraper/plugins/inband/process/processdata.py +34 -0
- nodescraper/plugins/inband/rocm/__init__.py +25 -0
- nodescraper/plugins/inband/rocm/analyzer_args.py +66 -0
- nodescraper/plugins/inband/rocm/rocm_analyzer.py +100 -0
- nodescraper/plugins/inband/rocm/rocm_collector.py +205 -0
- nodescraper/plugins/inband/rocm/rocm_plugin.py +43 -0
- nodescraper/plugins/inband/rocm/rocmdata.py +62 -0
- nodescraper/plugins/inband/storage/__init__.py +25 -0
- nodescraper/plugins/inband/storage/analyzer_args.py +38 -0
- nodescraper/plugins/inband/storage/collector_args.py +31 -0
- nodescraper/plugins/inband/storage/storage_analyzer.py +152 -0
- nodescraper/plugins/inband/storage/storage_collector.py +110 -0
- nodescraper/plugins/inband/storage/storage_plugin.py +44 -0
- nodescraper/plugins/inband/storage/storagedata.py +70 -0
- nodescraper/plugins/inband/sysctl/__init__.py +29 -0
- nodescraper/plugins/inband/sysctl/analyzer_args.py +67 -0
- nodescraper/plugins/inband/sysctl/sysctl_analyzer.py +81 -0
- nodescraper/plugins/inband/sysctl/sysctl_collector.py +101 -0
- nodescraper/plugins/inband/sysctl/sysctl_plugin.py +43 -0
- nodescraper/plugins/inband/sysctl/sysctldata.py +42 -0
- nodescraper/plugins/inband/syslog/__init__.py +28 -0
- nodescraper/plugins/inband/syslog/syslog_collector.py +121 -0
- nodescraper/plugins/inband/syslog/syslog_plugin.py +37 -0
- nodescraper/plugins/inband/syslog/syslogdata.py +46 -0
- nodescraper/plugins/inband/uptime/__init__.py +25 -0
- nodescraper/plugins/inband/uptime/uptime_collector.py +88 -0
- nodescraper/plugins/inband/uptime/uptime_plugin.py +37 -0
- nodescraper/plugins/inband/uptime/uptimedata.py +31 -0
- nodescraper/resultcollators/__init__.py +25 -0
- nodescraper/resultcollators/tablesummary.py +159 -0
- nodescraper/taskresulthooks/__init__.py +28 -0
- nodescraper/taskresulthooks/filesystemloghook.py +88 -0
- nodescraper/typeutils.py +171 -0
- nodescraper/utils.py +412 -0
|
@@ -0,0 +1,690 @@
|
|
|
1
|
+
###############################################################################
|
|
2
|
+
#
|
|
3
|
+
# MIT License
|
|
4
|
+
#
|
|
5
|
+
# Copyright (c) 2025 Advanced Micro Devices, Inc.
|
|
6
|
+
#
|
|
7
|
+
# Permission is hereby granted, free of charge, to any person obtaining a copy
|
|
8
|
+
# of this software and associated documentation files (the "Software"), to deal
|
|
9
|
+
# in the Software without restriction, including without limitation the rights
|
|
10
|
+
# to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
|
|
11
|
+
# copies of the Software, and to permit persons to whom the Software is
|
|
12
|
+
# furnished to do so, subject to the following conditions:
|
|
13
|
+
#
|
|
14
|
+
# The above copyright notice and this permission notice shall be included in all
|
|
15
|
+
# copies or substantial portions of the Software.
|
|
16
|
+
#
|
|
17
|
+
# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
|
|
18
|
+
# IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
|
|
19
|
+
# FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
|
|
20
|
+
# AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
|
|
21
|
+
# LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
|
|
22
|
+
# OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
|
|
23
|
+
# SOFTWARE.
|
|
24
|
+
#
|
|
25
|
+
###############################################################################
|
|
26
|
+
import re
|
|
27
|
+
from enum import Enum
|
|
28
|
+
from typing import Dict, List, Optional, Set, Tuple, Union
|
|
29
|
+
|
|
30
|
+
from pydantic import ValidationError
|
|
31
|
+
|
|
32
|
+
from nodescraper.base import InBandDataCollector
|
|
33
|
+
from nodescraper.connection.inband import TextFileArtifact
|
|
34
|
+
from nodescraper.enums import (
|
|
35
|
+
EventCategory,
|
|
36
|
+
EventPriority,
|
|
37
|
+
ExecutionStatus,
|
|
38
|
+
OSFamily,
|
|
39
|
+
SystemInteractionLevel,
|
|
40
|
+
)
|
|
41
|
+
from nodescraper.models import TaskResult
|
|
42
|
+
from nodescraper.utils import get_all_subclasses, get_exception_details
|
|
43
|
+
|
|
44
|
+
from .pcie_data import (
|
|
45
|
+
MAX_CAP_ID,
|
|
46
|
+
MAX_ECAP_ID,
|
|
47
|
+
CapabilityEnum,
|
|
48
|
+
ExtendedCapabilityEnum,
|
|
49
|
+
PcieCapStructure,
|
|
50
|
+
PcieCfgSpace,
|
|
51
|
+
PcieDataModel,
|
|
52
|
+
Type0Configuration,
|
|
53
|
+
Type1Configuration,
|
|
54
|
+
)
|
|
55
|
+
|
|
56
|
+
|
|
57
|
+
class PcieCollector(InBandDataCollector[PcieDataModel, None]):
|
|
58
|
+
"""class for collection of PCIe data only supports Linux OS type.
|
|
59
|
+
|
|
60
|
+
This class collects the PCIE config space using the lspci hex dump and then parses the hex dump to get the
|
|
61
|
+
PCIe configuration space for the GPUs in the system. If the system interaction level is set to STANDARD or higher,
|
|
62
|
+
then the entire pcie configuration space is collected for the GPUs in the system. If the system interaction level
|
|
63
|
+
is set to SURFACE then, only the first 64 bytes of the pcie configuration space is collected for the GPUs in the system.
|
|
64
|
+
|
|
65
|
+
This class will collect important PCIe data from the system running the commands
|
|
66
|
+
- `lspci -vvv` : Verbose collection of PCIe data
|
|
67
|
+
- `lspci -vvvt`: Verbose tree view of PCIe data
|
|
68
|
+
- `lspci -PP`: Path view of PCIe data for the GPUs
|
|
69
|
+
- If system interaction level is set to STANDARD or higher, the following commands will be run with sudo:
|
|
70
|
+
- `lspci -xxxx`: Hex view of PCIe data for the GPUs
|
|
71
|
+
- otherwise the following commands will be run without sudo:
|
|
72
|
+
- `lspci -x`: Hex view of PCIe data for the GPUs
|
|
73
|
+
- `lspci -d <vendor_id>:<dev_id>` : Count the number of GPUs in the system with this command
|
|
74
|
+
- If system interaction level is set to STANDARD or higher, the following commands will be run with sudo:
|
|
75
|
+
- The sudo lspci -xxxx command is used to collect the PCIe configuration space for the GPUs in the system
|
|
76
|
+
- otherwise the following commands will be run without sudo:
|
|
77
|
+
- The lspci -x command is used to collect the PCIe configuration space for the GPUs in the system
|
|
78
|
+
|
|
79
|
+
"""
|
|
80
|
+
|
|
81
|
+
SUPPORTED_OS_FAMILY: Set[OSFamily] = {OSFamily.LINUX}
|
|
82
|
+
|
|
83
|
+
DATA_MODEL = PcieDataModel
|
|
84
|
+
|
|
85
|
+
CMD_LSPCI_VERBOSE = "lspci -vvv"
|
|
86
|
+
CMD_LSPCI_VERBOSE_TREE = "lspci -vvvt"
|
|
87
|
+
CMD_LSPCI_PATH = "lspci -PP"
|
|
88
|
+
CMD_LSPCI_HEX_SUDO = "lspci -xxxx"
|
|
89
|
+
CMD_LSPCI_HEX = "lspci -x"
|
|
90
|
+
CMD_LSPCI_AMD_DEVICES = "lspci -d {vendor_id}: -nn"
|
|
91
|
+
CMD_LSPCI_PATH_DEVICE = "lspci -PP -d {vendor_id}:{dev_id}"
|
|
92
|
+
|
|
93
|
+
def _detect_amd_device_ids(self) -> dict[str, list[str]]:
    """Detect AMD GPU device IDs present on the system via lspci.

    Runs ``lspci -d <vendor>: -nn`` and extracts every ``[vendor:device]``
    ID pair, separating SR-IOV virtual functions (lines whose description
    contains "Virtual Function") from physical devices.

    Returns:
        dict[str, list[str]]: Dictionary with 'vendor_id', 'device_ids', and 'vf_device_ids'
    """
    vendor_hex = f"{self.system_info.vendorid_ep:x}"
    detected: dict[str, list[str]] = {
        "vendor_id": [vendor_hex],
        "device_ids": [],
        "vf_device_ids": [],
    }

    cmd_res = self._run_sut_cmd(
        self.CMD_LSPCI_AMD_DEVICES.format(vendor_id=vendor_hex),
        sudo=False,
        log_artifact=False,
    )
    if cmd_res.exit_code == 0 and cmd_res.stdout:
        # lspci -nn prints IDs in the form "[vendor:device]"
        id_regex = re.compile(rf"\[{vendor_hex}:([0-9a-fA-F]{{4}})\]")
        for line in cmd_res.stdout.splitlines():
            found = id_regex.findall(line)
            if not found:
                continue
            device_id = found[0].lower()
            # "Virtual Function" in the description marks an SR-IOV VF
            if re.search(r"Virtual Function", line, re.IGNORECASE):
                if device_id not in detected["vf_device_ids"]:
                    detected["vf_device_ids"].append(device_id)
                    self.logger.info(f"Detected AMD VF device ID: {device_id}")
            elif device_id not in detected["device_ids"]:
                detected["device_ids"].append(device_id)
                self.logger.info(f"Detected AMD device ID: {device_id}")

    self._log_event(
        category=EventCategory.IO,
        description="Detected AMD GPU device IDs from system",
        data=detected,
        priority=EventPriority.INFO,
    )

    return detected
|
|
139
|
+
|
|
140
|
+
def show_lspci_verbose(self, sudo=True) -> Optional[str]:
    """Return the output of ``lspci -vvv`` (full verbose listing), or None on failure."""
    verbose_listing = self._run_os_cmd(self.CMD_LSPCI_VERBOSE, sudo=sudo)
    return verbose_listing
|
|
143
|
+
|
|
144
|
+
def show_lspci_verbose_tree(self, sudo=True) -> Optional[str]:
    """Return the output of ``lspci -vvvt`` (verbose tree view), or None on failure."""
    tree_listing = self._run_os_cmd(self.CMD_LSPCI_VERBOSE_TREE, sudo=sudo)
    return tree_listing
|
|
147
|
+
|
|
148
|
+
def show_lspci_path(self, sudo=True) -> Optional[str]:
    """Return the output of ``lspci -PP`` (bus-path view), or None on failure."""
    path_listing = self._run_os_cmd(self.CMD_LSPCI_PATH, sudo=sudo)
    return path_listing
|
|
151
|
+
|
|
152
|
+
def show_lspci_hex(self, bdf: Optional[str] = None, sudo=True) -> Optional[str]:
    """Dump PCIe config space as a hex listing via lspci.

    With sudo, the full config space is requested (``-xxxx``); without it
    only the standard header is readable, so ``-x`` is used instead. When
    *bdf* is given the dump is restricted to that one device.
    """
    # Root is required to read beyond the standard header.
    hex_flag = "-xxxx" if sudo else "-x"
    command = f"lspci {hex_flag} -s {bdf}" if bdf else f"lspci {hex_flag}"
    return self._run_os_cmd(command, sudo=sudo)
|
|
163
|
+
|
|
164
|
+
def _run_os_cmd(
    self, command: str, sudo: bool = True, ignore_error: bool = False
) -> Optional[str]:
    """Run an OS command on the system under test and return its stdout.

    Args:
        command (str): command to run on the OS
        sudo (bool): run as sudo or not, by default True
        ignore_error (bool): when True, return stdout regardless of failure
    Returns:
        Optional[str]: stdout, or None when the command failed (non-zero
        exit code or any stderr output) and ignore_error is False
    """
    cmd_result = self._run_sut_cmd(command, sudo=sudo)
    if ignore_error:
        return cmd_result.stdout
    # Success requires a clean exit AND an empty stderr.
    if cmd_result.stderr == "" and cmd_result.exit_code == 0:
        return cmd_result.stdout
    return None
|
|
183
|
+
|
|
184
|
+
def _get_upstream_bdf_from_buspath(
    self,
    vendor_id: str,
    dev_id: str,
    upstream_steps_limit: Optional[int] = 0,
    sudo=True,
) -> Optional[Dict[str, List[str]]]:
    """Get the upstream BDF chain for every device matching a vendor/device id.

    Parses ``lspci -PP`` output, whose first column is a '/'-separated bus
    path ending in the device's own BDF.

    Parameters
    ----------
    vendor_id : str
        A pcie vendor id (hex string)
    dev_id : str
        A pcie device id (hex string)
    upstream_steps_limit : Optional[int]
        The limit on the number of upstream devices to collect per device;
        None means the whole bus path, by default 0 (device only)
    sudo : bool
        Run the command as sudo or not, by default True

    Returns
    -------
    Optional[Dict[str, List[str]]]
        Mapping of device BDF -> [device BDF, upstream BDFs...]
        (device first), or None when the lspci query fails
    """
    bus_path_all_gpus = self._run_os_cmd(
        self.CMD_LSPCI_PATH_DEVICE.format(vendor_id=vendor_id, dev_id=dev_id), sudo=sudo
    )
    if bus_path_all_gpus is None or bus_path_all_gpus == "":
        self._log_event(
            category=EventCategory.IO,
            description="Failed to get bus path info for vendor/device ID.",
            data={"vendor_id": vendor_id, "dev_id": dev_id},
            priority=EventPriority.INFO,
        )
        return None
    upstream_bdfs: Dict[str, List[str]] = {}
    for bus_path in bus_path_all_gpus.splitlines():
        # The first space-separated column is the '/'-separated bus path.
        bus_path_list = bus_path.split(" ")[0].split("/")
        if upstream_steps_limit is not None and len(bus_path_list) < upstream_steps_limit + 1:
            # Not fatal: warn and continue with as many upstream devices as exist.
            self._log_event(
                category=EventCategory.RUNTIME,
                description="Not enough upstream devices found.",
                data={
                    "bus_path": bus_path,
                    "upstream_steps_limit": upstream_steps_limit,
                    "bus_path_list": bus_path_list,
                },
                priority=EventPriority.WARNING,
            )
        bdf_str = bus_path_list[-1]
        # Flip the bus_path_list so the GPU comes first, then its upstream devices.
        bus_path_list.reverse()
        if upstream_steps_limit is None:
            upstream_bdfs[bdf_str] = bus_path_list
        else:
            # limit + 1 so the GPU itself is always included.
            upstream_bdfs[bdf_str] = bus_path_list[: upstream_steps_limit + 1]

    return upstream_bdfs
|
|
247
|
+
|
|
248
|
+
def _get_gpu_cfg_space(
    self,
    vendor_id: str,
    device_id: str,
    upstream_steps_from_gpu: Optional[int] = 0,
    sudo=True,
) -> dict[str, PcieCfgSpace]:
    """Collect the PCIe config space for every BDF matching a vendor/device ID.

    Resolves the bus path for each matching device (optionally including a
    number of upstream devices), then reads and parses each BDF's config
    space.

    Args:
        vendor_id (str): vendor ID (hex format)
        device_id (str): device ID (hex format)
        upstream_steps_from_gpu (Optional[int]): number of upstream devices
            to also collect per GPU, by default 0
        sudo (bool): run commands as sudo, by default True
    Returns:
        dict[str, PcieCfgSpace]: config space keyed by BDF; empty on failure
    """
    if (vendor_id is None) or (device_id is None):
        self._log_event(
            category=EventCategory.IO,
            description="System info is invalid Vendor ID or Device ID is None.",
            data={"vendor_id": vendor_id, "dev_id": device_id},
            priority=EventPriority.ERROR,
        )
        return {}

    upstream_map = self._get_upstream_bdf_from_buspath(
        vendor_id,
        device_id,
        upstream_steps_limit=upstream_steps_from_gpu,
        sudo=sudo,
    )
    if upstream_map is None:
        return {}

    # Flatten every per-GPU BDF chain and read each device's config space.
    return {
        bdf: self.get_cfg_by_bdf(bdf, sudo=sudo)
        for bdf_chain in upstream_map.values()
        for bdf in bdf_chain
    }
|
|
290
|
+
|
|
291
|
+
def parse_hex_dump(self, hex_dump: str) -> list[int]:
    """Convert an ``lspci -x`` style hex dump into a flat list of byte values.

    Only lines of the form ``offset: <16 space-separated hex bytes>`` are
    consumed; device-description lines and anything else malformed are
    skipped silently.
    """
    parsed_bytes: list[int] = []
    for raw_line in hex_dump.strip().splitlines():
        pieces = raw_line.split(":")
        # A valid dump row splits into exactly two parts on ':' and carries a
        # 48-character payload (" " + 16 two-digit bytes with single spaces).
        if len(pieces) != 2 or len(pieces[1]) != 48:
            continue
        parsed_bytes.extend(int(token, 16) for token in pieces[1].split())

    return parsed_bytes
|
|
309
|
+
|
|
310
|
+
def read_register(self, width: int, offset: int, config_data: List[int]) -> int:
    """Assemble a register value from consecutive config-space bytes.

    Bytes are combined LSB-first: byte at ``offset`` becomes the lowest
    8 bits of the result.

    NOTE(review): ``width >> 3`` implies *width* is expressed in BITS
    (e.g. 32 -> 4 bytes read). The previous comment said "1, 2, 4, or 8
    bytes", under which any width below 8 would read zero bytes and this
    always returns 0 — confirm the unit against the register definitions
    in pcie_data.
    """
    register_value = 0
    # i-th byte contributes bits [i*8 : i*8+8] of the result.
    for i in range(0, width >> 3):
        register_value += config_data[offset + i] << (i * 8)
    return register_value
|
|
316
|
+
|
|
317
|
+
def extended_cap_finder(
    self,
    config_data: List[int],
    cap_pointer: int,
    cap_data: Optional[Dict[int, int]] = None,
):
    """Walk the extended-capability linked list starting at *cap_pointer*.

    Parameters
    ----------
    config_data : List[int]
        A list of ints representing the hex dump from sudo lspci -xxxx
    cap_pointer : int
        Offset of an extended capability header to read
    cap_data : Optional[Dict[int, int]], optional
        Accumulated {cap_id: offset} mapping, by default None

    Returns
    -------
    cap_data : Dict[int, int]
        Mapping of extended capability ID to its offset; use
        ExtendedCapabilityEnum(cap_id) to get the name. Empty dict when an
        invalid capability ID is encountered.
    """
    if cap_data is None:
        cap_data = {}
    if cap_pointer >= len(config_data) or cap_pointer + 1 >= len(config_data):
        # prevent an illegal access to the list
        return cap_data
    # Extended capability header bytes [0:1] hold the 16-bit capability ID.
    cap_id = config_data[cap_pointer] + (config_data[cap_pointer + 1] << 8)
    if cap_id > MAX_ECAP_ID:
        # Break if the cap_id is greater than the max extended cap id
        self._log_event(
            category=EventCategory.IO,
            description=f"Invalid Capability ID detected {cap_id}",
            priority=EventPriority.ERROR,
            data={"cap_id": cap_id},
        )
        return {}
    if cap_data.get(cap_id) == cap_pointer:
        # Exact (id, offset) pair already visited: the capability list is
        # cyclic (malformed dump); stop instead of recursing until
        # RecursionError. Legitimate duplicate IDs have distinct offsets.
        return cap_data
    cap_data[cap_id] = cap_pointer
    if cap_pointer + 3 >= len(config_data):
        return cap_data
    # Next pointer is the upper nibble of byte 2 plus byte 3 shifted up.
    next_cap_pointer = (config_data[cap_pointer + 2] & 0xF0) >> 4
    next_cap_pointer += config_data[cap_pointer + 3] << 4
    if next_cap_pointer == 0:
        # End of list.
        return cap_data
    return self.extended_cap_finder(config_data, next_cap_pointer, cap_data)
|
|
361
|
+
|
|
362
|
+
def cap_finder(
    self,
    config_data: List[int],
    cap_pointer: int,
    cap_data: Optional[Dict[int, int]] = None,
):
    """Walk the standard capability linked list starting at *cap_pointer*.

    Parameters
    ----------
    config_data : List[int]
        A list of ints representing the hex dump from lspci -xxxx
    cap_pointer : int
        Offset of a capability header, or 0x34 to start from the
        Capabilities Pointer register
    cap_data : Optional[Dict[int, int]], optional
        Accumulated {cap_id: offset} mapping, by default None

    Returns
    -------
    cap_data : Dict[int, int]
        Mapping of capability ID to its offset; use CapabilityEnum(cap_id)
        to get the name. Empty dict when an invalid capability ID is
        encountered.
    """
    if cap_data is None:
        cap_data = {}

    if cap_pointer == 0x34:
        # 0x34 is the Capabilities Pointer register itself: it has no
        # cap_id, so dereference it and start at the first real capability.
        return self.cap_finder(config_data, config_data[0x34], cap_data)
    if cap_pointer >= len(config_data) or cap_pointer + 1 >= len(config_data):
        # prevent an illegal access to the list
        return cap_data
    cap_id = config_data[cap_pointer]
    if cap_id > MAX_CAP_ID:
        # Break if the cap_id is greater than the max cap id
        self._log_event(
            category=EventCategory.IO,
            description=f"Invalid Capability ID detected {cap_id}",
            priority=EventPriority.ERROR,
            data={"cap_id": cap_id},
        )
        return {}
    if cap_data.get(cap_id) == cap_pointer:
        # Exact (id, offset) pair already visited: the capability list is
        # cyclic (malformed dump); stop instead of recursing until
        # RecursionError. Legitimate duplicate IDs have distinct offsets.
        return cap_data
    next_cap_pointer = config_data[cap_pointer + 1]
    cap_data[cap_id] = cap_pointer
    if next_cap_pointer == 0:
        # End of list.
        return cap_data
    return self.cap_finder(config_data, next_cap_pointer, cap_data)
|
|
409
|
+
|
|
410
|
+
def get_cap_struct(self, id: Enum) -> Optional[type[PcieCapStructure]]:
    """Return the PcieCapStructure subclass whose cap_id matches *id*, or None."""
    return next(
        (struct for struct in get_all_subclasses(PcieCapStructure) if struct.cap_id == id),
        None,
    )
|
|
415
|
+
|
|
416
|
+
def get_pcie_common_cfg(
    self,
    type_x_configuration: Union[type[Type0Configuration], type[Type1Configuration]],
    config_data: List[int],
) -> Union[Type0Configuration, Type1Configuration]:
    """Populate a Type 0 / Type 1 common-header configuration model.

    Parameters
    ----------
    type_x_configuration : Union[type[Type0Configuration], type[Type1Configuration]]
        Either Type0Configuration or Type1Configuration (the class, not an
        instance)
    config_data : List[int]
        Config data from lspci -xxxx

    Returns
    -------
    Union[Type0Configuration, Type1Configuration]
        A populated instance of the given configuration class
    """
    cfg_obj = type_x_configuration()
    reg_values: Dict[str, int] = {}
    for reg_name, reg_template in cfg_obj.iter_regs():
        # Work on a copy of the template register before reading its layout.
        reg = reg_template.model_copy()
        reg_values[reg_name] = self.read_register(reg.width, reg.offset, config_data)
    cfg_obj.set_regs(reg_values)
    return cfg_obj
|
|
444
|
+
|
|
445
|
+
def get_cap_cfg(
    self,
    cap_data: Dict[int, int],
    config_data: List[int],
) -> Union[
    Dict[CapabilityEnum, PcieCapStructure], Dict[ExtendedCapabilityEnum, PcieCapStructure]
]:
    """Materialize capability structures from the discovered capability map.

    Parameters
    ----------
    cap_data : Dict[int,int]
        A mapping of capability pointers, key is the cap_id and value is the
        capability's offset
    config_data : List[int]
        A list of ints representing the hex dump from lspci -xxxx

    Returns
    -------
    Union[Dict[CapabilityEnum, PcieCapStructure], Dict[ExtendedCapabilityEnum, PcieCapStructure]]
        Mapping of capability enum to its populated structure. Capability
        IDs with no enum member or no matching structure class are skipped.
    """
    cap_structure: Dict[Enum, PcieCapStructure] = {}
    for cap_id, cap_addr in cap_data.items():
        if cap_id == 0:
            continue
        try:
            # Extended capabilities live at offsets >= 0x100.
            if cap_addr >= 0x100:
                cap_enum: Enum = ExtendedCapabilityEnum(cap_id)
            else:
                cap_enum = CapabilityEnum(cap_id)
        except ValueError:
            # An ID the enums don't know about would otherwise raise and
            # abort the whole config-space parse; log it and keep going.
            self._log_event(
                category=EventCategory.IO,
                description=f"Unknown capability ID {cap_id}",
                data={"cap_id": cap_id, "cap_addr": cap_addr},
                priority=EventPriority.WARNING,
            )
            continue
        cap_cls = self.get_cap_struct(cap_enum)
        if cap_cls is None:
            # No structure class defined for this capability.
            continue
        cap_obj = cap_cls()  # type: ignore[call-arg]
        reg_data = {}
        for register_name, register in cap_obj.iter_regs():
            # Register offsets are relative to the capability's base address.
            reg_data[register_name] = self.read_register(
                register.width, register.offset + cap_addr, config_data
            )
        cap_obj.set_regs(reg_data)
        cap_obj.offset = cap_addr
        cap_structure[cap_enum] = cap_obj

    return cap_structure  # type: ignore[return-value]
|
|
489
|
+
|
|
490
|
+
def get_cfg_by_bdf(self, bdf: str, sudo=True) -> PcieCfgSpace:
    """Will fill out a PcieCfgSpace object with the PCIe configuration space for a given BDF

    Parameters
    ----------
    bdf : str
        The bus:device.function identifier to dump config space for.
    sudo : bool, optional
        Whether to run lspci with sudo, by default True.

    Returns
    -------
    PcieCfgSpace
        The parsed config space; an empty PcieCfgSpace on any failure.
    """
    hex_data_raw = self.show_lspci_hex(bdf, sudo=sudo)
    if hex_data_raw is None:
        self._log_event(
            category=EventCategory.IO,
            description="Failed to get hex data for BDF.",
            data={"bdf": bdf},
            priority=EventPriority.ERROR,
        )
        return PcieCfgSpace()
    hex_data: List[int] = self.parse_hex_dump(hex_data_raw)
    # Expect at least 256 bytes of data, for the first 256 bytes of the PCIe config space.
    # BUGFIX: the guard previously checked `< 64`, which only covers the standard
    # config header; extended-capability discovery reads from byte offset 0x100,
    # so anything short of 256 bytes is truncated data.
    if len(hex_data) < 256:
        self._log_event(
            category=EventCategory.IO,
            description="Hex data is not the expected length",
            data={"bdf": bdf, "length": len(hex_data)},
            priority=EventPriority.ERROR,
        )
        return PcieCfgSpace()
    cap_data, ecap_data = self.discover_capability_structure(hex_data)
    return self.get_pcie_cfg(hex_data, cap_data, ecap_data)
|
|
513
|
+
|
|
514
|
+
def get_pcie_cfg(
    self,
    config_data: List[int],
    cap_data: Dict[int, int],
    ecap_data: Dict[int, int],
) -> PcieCfgSpace:
    """Assemble a PcieCfgSpace from a raw config-space dump and capability pointers.

    Parameters
    ----------
    config_data : List[int]
        A list of ints representing the hex dump from lspci -xxxx
    cap_data : Dict[int, int]
        A list of capability pointers, key is the cap_id and value is the cap_pointer
    ecap_data : Dict[int, int]
        Extended capability pointers, key is the cap_id and value is the cap_pointer

    Returns
    -------
    PcieCfgSpace
        A PcieCfgSpace object with the PCIe configuration
    """
    # Parse the common (type 0 / type 1) headers, then the standard and
    # extended capability structures, in that order.
    return PcieCfgSpace(
        type_0_configuration=self.get_pcie_common_cfg(Type0Configuration, config_data),  # type: ignore[arg-type]
        type_1_configuration=self.get_pcie_common_cfg(Type1Configuration, config_data),  # type: ignore[arg-type]
        capability_pointers=cap_data,  # type: ignore[arg-type]
        extended_capability_pointers=ecap_data,  # type: ignore[arg-type]
        cap_structure=self.get_cap_cfg(cap_data, config_data),  # type: ignore[arg-type]
        ecap_structure=self.get_cap_cfg(ecap_data, config_data),  # type: ignore[arg-type]
    )
|
|
546
|
+
|
|
547
|
+
def _log_pcie_artifacts(
    self,
    lspci_pp: Optional[str],
    lspci_hex: Optional[str],
    lspci_verbose_tree: Optional[str],
    lspci_verbose: Optional[str],
):
    """Attach each non-None lspci output to the task result as a text file artifact."""
    artifact_sources = (
        ("lspci_hex.txt", lspci_hex),
        ("lspci_verbose_tree.txt", lspci_verbose_tree),
        ("lspci_verbose.txt", lspci_verbose),
        ("lspci_pp.txt", lspci_pp),
    )
    # Skip outputs that were not collected (None)
    self.result.artifacts.extend(
        TextFileArtifact(filename=filename, contents=contents)
        for filename, contents in artifact_sources
        if contents is not None
    )
|
|
564
|
+
|
|
565
|
+
def _get_pcie_data(
    self, upstream_steps_to_collect: Optional[int] = None
) -> Optional[PcieDataModel]:
    """Will return all PCIe data in a PcieDataModel object.

    Parameters
    ----------
    upstream_steps_to_collect : Optional[int]
        Number of upstream devices to collect config space for, passed through
        to self._get_gpu_cfg_space as upstream_steps_from_gpu. May be None.

    Returns
    -------
    Optional[PcieDataModel]
        The data in a PcieDataModel object or None on failure
    """
    minimum_system_interaction_level_required_for_sudo = SystemInteractionLevel.INTERACTIVE

    try:
        # Use sudo for lspci only when the configured interaction level allows it
        use_sudo = (
            isinstance(self.system_interaction_level, SystemInteractionLevel)
            and self.system_interaction_level
            >= minimum_system_interaction_level_required_for_sudo
        )

        # NOTE: a dead no-op ("if upstream_steps_to_collect is None:
        # upstream_steps_to_collect = None") was removed here.

        # Detect AMD device IDs dynamically from the system; fall back to the
        # endpoint vendor ID from system_info (hex-formatted) when detection
        # returns nothing.
        detected_devices = self._detect_amd_device_ids()
        vendor_id = (
            detected_devices["vendor_id"][0]
            if detected_devices["vendor_id"]
            else format(self.system_info.vendorid_ep, "x")
        )
        device_ids = detected_devices["device_ids"]
        vf_device_ids = detected_devices["vf_device_ids"]

        pcie_cfg_dict: Dict[str, PcieCfgSpace] = {}
        vf_pcie_cfg_data: Dict[str, PcieCfgSpace] = {}

        # Collect PCIe config space for each detected device ID
        for dev_id in device_ids:
            cfg_space = self._get_gpu_cfg_space(
                vendor_id=vendor_id,
                device_id=dev_id,
                upstream_steps_from_gpu=upstream_steps_to_collect,
                sudo=use_sudo,
            )
            if cfg_space:
                pcie_cfg_dict.update(cfg_space)

        # Collect VF PCIe config space for each detected VF device ID
        # (no upstream devices are collected for VFs)
        for dev_id_vf in vf_device_ids:
            vf_cfg_space = self._get_gpu_cfg_space(
                vendor_id=vendor_id,
                device_id=dev_id_vf,
                upstream_steps_from_gpu=0,
                sudo=use_sudo,
            )
            if vf_cfg_space:
                vf_pcie_cfg_data.update(vf_cfg_space)

        # Capture the various lspci outputs as result artifacts
        lspci_hex = self.show_lspci_hex(sudo=use_sudo)
        lspci_verbose = self.show_lspci_verbose(sudo=use_sudo)
        lspci_verbose_tree = self.show_lspci_verbose_tree(sudo=use_sudo)
        lspci_path = self.show_lspci_path(sudo=use_sudo)
        self._log_pcie_artifacts(
            lspci_pp=lspci_path,
            lspci_hex=lspci_hex,
            lspci_verbose_tree=lspci_verbose_tree,
            lspci_verbose=lspci_verbose,
        )
        pcie_data = PcieDataModel(
            pcie_cfg_space=pcie_cfg_dict,
            vf_pcie_cfg_space=vf_pcie_cfg_data,
        )
    except ValidationError as e:
        # Model construction failed validation; record the error and abort
        self._log_event(
            category=EventCategory.OS,
            description="Failed to build model for PCIe data",
            data=get_exception_details(e),
            priority=EventPriority.ERROR,
        )
        self.result.status = ExecutionStatus.ERROR
        return None
    return pcie_data
|
|
649
|
+
|
|
650
|
+
def discover_capability_structure(
    self, hex_dump: List[int]
) -> Tuple[Dict[int, int], Dict[int, int]]:
    """Obtain the capability structure by parsing the hex dump for capability pointers

    Parameters
    ----------
    hex_dump : List[int]
        A list of ints from lspci -xxxx

    Returns
    -------
    Tuple[Dict[int, int], Dict[int, int]]
        A pair of (standard, extended) capability pointer maps; in each map
        the key is the cap_id and the value is the cap_pointer.
    """
    # Standard capabilities are walked starting from the pointer stored at
    # offset 0x34 (the PCI header's Capabilities Pointer register); extended
    # capabilities are walked from offset 0x100 where that list begins.
    cap = self.cap_finder(hex_dump, 0x34)
    ecap = self.extended_cap_finder(hex_dump, 0x100)
    return cap, ecap
|
|
668
|
+
|
|
669
|
+
def collect_data(
    self, args=None, upstream_steps_to_collect: Optional[int] = None, **kwargs
) -> Tuple[TaskResult, Optional[PcieDataModel]]:
    """Read PCIe data.

    Args:
        args: Optional collector arguments (not used)
        upstream_steps_to_collect: Number of upstream devices to collect
        **kwargs: Additional keyword arguments

    Returns:
        Tuple[TaskResult, Optional[PcieDataModel]]: tuple containing the result of the task and the PCIe data if available
    """
    collected = self._get_pcie_data(upstream_steps_to_collect)
    if collected:
        # Record how many BDFs were successfully read
        self._log_event(
            category=EventCategory.IO,
            description="PCIe Data read from GPUs",
            data={"bdf_count": len(collected.pcie_cfg_space)},
            priority=EventPriority.INFO,
        )
    return self.result, collected
|