amd-node-scraper 0.0.1__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (197) hide show
  1. amd_node_scraper-0.0.1.dist-info/LICENSE +21 -0
  2. amd_node_scraper-0.0.1.dist-info/METADATA +424 -0
  3. amd_node_scraper-0.0.1.dist-info/RECORD +197 -0
  4. amd_node_scraper-0.0.1.dist-info/WHEEL +5 -0
  5. amd_node_scraper-0.0.1.dist-info/entry_points.txt +2 -0
  6. amd_node_scraper-0.0.1.dist-info/top_level.txt +1 -0
  7. nodescraper/__init__.py +32 -0
  8. nodescraper/base/__init__.py +34 -0
  9. nodescraper/base/inbandcollectortask.py +118 -0
  10. nodescraper/base/inbanddataplugin.py +39 -0
  11. nodescraper/base/regexanalyzer.py +120 -0
  12. nodescraper/cli/__init__.py +29 -0
  13. nodescraper/cli/cli.py +511 -0
  14. nodescraper/cli/constants.py +27 -0
  15. nodescraper/cli/dynamicparserbuilder.py +171 -0
  16. nodescraper/cli/helper.py +517 -0
  17. nodescraper/cli/inputargtypes.py +129 -0
  18. nodescraper/configbuilder.py +123 -0
  19. nodescraper/configregistry.py +66 -0
  20. nodescraper/configs/node_status.json +19 -0
  21. nodescraper/connection/__init__.py +25 -0
  22. nodescraper/connection/inband/__init__.py +46 -0
  23. nodescraper/connection/inband/inband.py +171 -0
  24. nodescraper/connection/inband/inbandlocal.py +93 -0
  25. nodescraper/connection/inband/inbandmanager.py +151 -0
  26. nodescraper/connection/inband/inbandremote.py +173 -0
  27. nodescraper/connection/inband/sshparams.py +43 -0
  28. nodescraper/constants.py +26 -0
  29. nodescraper/enums/__init__.py +40 -0
  30. nodescraper/enums/eventcategory.py +89 -0
  31. nodescraper/enums/eventpriority.py +42 -0
  32. nodescraper/enums/executionstatus.py +44 -0
  33. nodescraper/enums/osfamily.py +34 -0
  34. nodescraper/enums/systeminteraction.py +41 -0
  35. nodescraper/enums/systemlocation.py +33 -0
  36. nodescraper/generictypes.py +36 -0
  37. nodescraper/interfaces/__init__.py +44 -0
  38. nodescraper/interfaces/connectionmanager.py +143 -0
  39. nodescraper/interfaces/dataanalyzertask.py +138 -0
  40. nodescraper/interfaces/datacollectortask.py +185 -0
  41. nodescraper/interfaces/dataplugin.py +356 -0
  42. nodescraper/interfaces/plugin.py +127 -0
  43. nodescraper/interfaces/resultcollator.py +56 -0
  44. nodescraper/interfaces/task.py +164 -0
  45. nodescraper/interfaces/taskresulthook.py +39 -0
  46. nodescraper/models/__init__.py +48 -0
  47. nodescraper/models/analyzerargs.py +93 -0
  48. nodescraper/models/collectorargs.py +30 -0
  49. nodescraper/models/connectionconfig.py +34 -0
  50. nodescraper/models/datamodel.py +171 -0
  51. nodescraper/models/datapluginresult.py +39 -0
  52. nodescraper/models/event.py +158 -0
  53. nodescraper/models/pluginconfig.py +38 -0
  54. nodescraper/models/pluginresult.py +39 -0
  55. nodescraper/models/systeminfo.py +44 -0
  56. nodescraper/models/taskresult.py +185 -0
  57. nodescraper/models/timerangeargs.py +38 -0
  58. nodescraper/pluginexecutor.py +274 -0
  59. nodescraper/pluginregistry.py +152 -0
  60. nodescraper/plugins/__init__.py +25 -0
  61. nodescraper/plugins/inband/__init__.py +25 -0
  62. nodescraper/plugins/inband/amdsmi/__init__.py +28 -0
  63. nodescraper/plugins/inband/amdsmi/amdsmi_analyzer.py +821 -0
  64. nodescraper/plugins/inband/amdsmi/amdsmi_collector.py +1313 -0
  65. nodescraper/plugins/inband/amdsmi/amdsmi_plugin.py +43 -0
  66. nodescraper/plugins/inband/amdsmi/amdsmidata.py +1002 -0
  67. nodescraper/plugins/inband/amdsmi/analyzer_args.py +50 -0
  68. nodescraper/plugins/inband/amdsmi/cper.py +65 -0
  69. nodescraper/plugins/inband/bios/__init__.py +29 -0
  70. nodescraper/plugins/inband/bios/analyzer_args.py +64 -0
  71. nodescraper/plugins/inband/bios/bios_analyzer.py +93 -0
  72. nodescraper/plugins/inband/bios/bios_collector.py +93 -0
  73. nodescraper/plugins/inband/bios/bios_plugin.py +43 -0
  74. nodescraper/plugins/inband/bios/biosdata.py +30 -0
  75. nodescraper/plugins/inband/cmdline/__init__.py +25 -0
  76. nodescraper/plugins/inband/cmdline/analyzer_args.py +80 -0
  77. nodescraper/plugins/inband/cmdline/cmdline_analyzer.py +113 -0
  78. nodescraper/plugins/inband/cmdline/cmdline_collector.py +77 -0
  79. nodescraper/plugins/inband/cmdline/cmdline_plugin.py +43 -0
  80. nodescraper/plugins/inband/cmdline/cmdlinedata.py +30 -0
  81. nodescraper/plugins/inband/device_enumeration/__init__.py +29 -0
  82. nodescraper/plugins/inband/device_enumeration/analyzer_args.py +73 -0
  83. nodescraper/plugins/inband/device_enumeration/device_enumeration_analyzer.py +81 -0
  84. nodescraper/plugins/inband/device_enumeration/device_enumeration_collector.py +176 -0
  85. nodescraper/plugins/inband/device_enumeration/device_enumeration_plugin.py +45 -0
  86. nodescraper/plugins/inband/device_enumeration/deviceenumdata.py +36 -0
  87. nodescraper/plugins/inband/dimm/__init__.py +25 -0
  88. nodescraper/plugins/inband/dimm/collector_args.py +31 -0
  89. nodescraper/plugins/inband/dimm/dimm_collector.py +151 -0
  90. nodescraper/plugins/inband/dimm/dimm_plugin.py +40 -0
  91. nodescraper/plugins/inband/dimm/dimmdata.py +30 -0
  92. nodescraper/plugins/inband/dkms/__init__.py +25 -0
  93. nodescraper/plugins/inband/dkms/analyzer_args.py +85 -0
  94. nodescraper/plugins/inband/dkms/dkms_analyzer.py +106 -0
  95. nodescraper/plugins/inband/dkms/dkms_collector.py +76 -0
  96. nodescraper/plugins/inband/dkms/dkms_plugin.py +43 -0
  97. nodescraper/plugins/inband/dkms/dkmsdata.py +33 -0
  98. nodescraper/plugins/inband/dmesg/__init__.py +28 -0
  99. nodescraper/plugins/inband/dmesg/analyzer_args.py +33 -0
  100. nodescraper/plugins/inband/dmesg/collector_args.py +39 -0
  101. nodescraper/plugins/inband/dmesg/dmesg_analyzer.py +503 -0
  102. nodescraper/plugins/inband/dmesg/dmesg_collector.py +164 -0
  103. nodescraper/plugins/inband/dmesg/dmesg_plugin.py +44 -0
  104. nodescraper/plugins/inband/dmesg/dmesgdata.py +116 -0
  105. nodescraper/plugins/inband/fabrics/__init__.py +28 -0
  106. nodescraper/plugins/inband/fabrics/fabrics_collector.py +726 -0
  107. nodescraper/plugins/inband/fabrics/fabrics_plugin.py +37 -0
  108. nodescraper/plugins/inband/fabrics/fabricsdata.py +140 -0
  109. nodescraper/plugins/inband/journal/__init__.py +28 -0
  110. nodescraper/plugins/inband/journal/collector_args.py +33 -0
  111. nodescraper/plugins/inband/journal/journal_collector.py +107 -0
  112. nodescraper/plugins/inband/journal/journal_plugin.py +40 -0
  113. nodescraper/plugins/inband/journal/journaldata.py +44 -0
  114. nodescraper/plugins/inband/kernel/__init__.py +25 -0
  115. nodescraper/plugins/inband/kernel/analyzer_args.py +64 -0
  116. nodescraper/plugins/inband/kernel/kernel_analyzer.py +91 -0
  117. nodescraper/plugins/inband/kernel/kernel_collector.py +129 -0
  118. nodescraper/plugins/inband/kernel/kernel_plugin.py +43 -0
  119. nodescraper/plugins/inband/kernel/kerneldata.py +32 -0
  120. nodescraper/plugins/inband/kernel_module/__init__.py +25 -0
  121. nodescraper/plugins/inband/kernel_module/analyzer_args.py +59 -0
  122. nodescraper/plugins/inband/kernel_module/kernel_module_analyzer.py +211 -0
  123. nodescraper/plugins/inband/kernel_module/kernel_module_collector.py +264 -0
  124. nodescraper/plugins/inband/kernel_module/kernel_module_data.py +60 -0
  125. nodescraper/plugins/inband/kernel_module/kernel_module_plugin.py +43 -0
  126. nodescraper/plugins/inband/memory/__init__.py +25 -0
  127. nodescraper/plugins/inband/memory/analyzer_args.py +45 -0
  128. nodescraper/plugins/inband/memory/memory_analyzer.py +98 -0
  129. nodescraper/plugins/inband/memory/memory_collector.py +330 -0
  130. nodescraper/plugins/inband/memory/memory_plugin.py +43 -0
  131. nodescraper/plugins/inband/memory/memorydata.py +90 -0
  132. nodescraper/plugins/inband/network/__init__.py +28 -0
  133. nodescraper/plugins/inband/network/network_collector.py +1828 -0
  134. nodescraper/plugins/inband/network/network_plugin.py +37 -0
  135. nodescraper/plugins/inband/network/networkdata.py +319 -0
  136. nodescraper/plugins/inband/nvme/__init__.py +28 -0
  137. nodescraper/plugins/inband/nvme/nvme_collector.py +167 -0
  138. nodescraper/plugins/inband/nvme/nvme_plugin.py +37 -0
  139. nodescraper/plugins/inband/nvme/nvmedata.py +45 -0
  140. nodescraper/plugins/inband/os/__init__.py +25 -0
  141. nodescraper/plugins/inband/os/analyzer_args.py +64 -0
  142. nodescraper/plugins/inband/os/os_analyzer.py +73 -0
  143. nodescraper/plugins/inband/os/os_collector.py +131 -0
  144. nodescraper/plugins/inband/os/os_plugin.py +43 -0
  145. nodescraper/plugins/inband/os/osdata.py +31 -0
  146. nodescraper/plugins/inband/package/__init__.py +25 -0
  147. nodescraper/plugins/inband/package/analyzer_args.py +48 -0
  148. nodescraper/plugins/inband/package/package_analyzer.py +253 -0
  149. nodescraper/plugins/inband/package/package_collector.py +273 -0
  150. nodescraper/plugins/inband/package/package_plugin.py +43 -0
  151. nodescraper/plugins/inband/package/packagedata.py +41 -0
  152. nodescraper/plugins/inband/pcie/__init__.py +29 -0
  153. nodescraper/plugins/inband/pcie/analyzer_args.py +63 -0
  154. nodescraper/plugins/inband/pcie/pcie_analyzer.py +1081 -0
  155. nodescraper/plugins/inband/pcie/pcie_collector.py +690 -0
  156. nodescraper/plugins/inband/pcie/pcie_data.py +2017 -0
  157. nodescraper/plugins/inband/pcie/pcie_plugin.py +43 -0
  158. nodescraper/plugins/inband/process/__init__.py +25 -0
  159. nodescraper/plugins/inband/process/analyzer_args.py +45 -0
  160. nodescraper/plugins/inband/process/collector_args.py +31 -0
  161. nodescraper/plugins/inband/process/process_analyzer.py +91 -0
  162. nodescraper/plugins/inband/process/process_collector.py +115 -0
  163. nodescraper/plugins/inband/process/process_plugin.py +46 -0
  164. nodescraper/plugins/inband/process/processdata.py +34 -0
  165. nodescraper/plugins/inband/rocm/__init__.py +25 -0
  166. nodescraper/plugins/inband/rocm/analyzer_args.py +66 -0
  167. nodescraper/plugins/inband/rocm/rocm_analyzer.py +100 -0
  168. nodescraper/plugins/inband/rocm/rocm_collector.py +205 -0
  169. nodescraper/plugins/inband/rocm/rocm_plugin.py +43 -0
  170. nodescraper/plugins/inband/rocm/rocmdata.py +62 -0
  171. nodescraper/plugins/inband/storage/__init__.py +25 -0
  172. nodescraper/plugins/inband/storage/analyzer_args.py +38 -0
  173. nodescraper/plugins/inband/storage/collector_args.py +31 -0
  174. nodescraper/plugins/inband/storage/storage_analyzer.py +152 -0
  175. nodescraper/plugins/inband/storage/storage_collector.py +110 -0
  176. nodescraper/plugins/inband/storage/storage_plugin.py +44 -0
  177. nodescraper/plugins/inband/storage/storagedata.py +70 -0
  178. nodescraper/plugins/inband/sysctl/__init__.py +29 -0
  179. nodescraper/plugins/inband/sysctl/analyzer_args.py +67 -0
  180. nodescraper/plugins/inband/sysctl/sysctl_analyzer.py +81 -0
  181. nodescraper/plugins/inband/sysctl/sysctl_collector.py +101 -0
  182. nodescraper/plugins/inband/sysctl/sysctl_plugin.py +43 -0
  183. nodescraper/plugins/inband/sysctl/sysctldata.py +42 -0
  184. nodescraper/plugins/inband/syslog/__init__.py +28 -0
  185. nodescraper/plugins/inband/syslog/syslog_collector.py +121 -0
  186. nodescraper/plugins/inband/syslog/syslog_plugin.py +37 -0
  187. nodescraper/plugins/inband/syslog/syslogdata.py +46 -0
  188. nodescraper/plugins/inband/uptime/__init__.py +25 -0
  189. nodescraper/plugins/inband/uptime/uptime_collector.py +88 -0
  190. nodescraper/plugins/inband/uptime/uptime_plugin.py +37 -0
  191. nodescraper/plugins/inband/uptime/uptimedata.py +31 -0
  192. nodescraper/resultcollators/__init__.py +25 -0
  193. nodescraper/resultcollators/tablesummary.py +159 -0
  194. nodescraper/taskresulthooks/__init__.py +28 -0
  195. nodescraper/taskresulthooks/filesystemloghook.py +88 -0
  196. nodescraper/typeutils.py +171 -0
  197. nodescraper/utils.py +412 -0
@@ -0,0 +1,726 @@
1
+ ###############################################################################
2
+ #
3
+ # MIT License
4
+ #
5
+ # Copyright (c) 2025 Advanced Micro Devices, Inc.
6
+ #
7
+ # Permission is hereby granted, free of charge, to any person obtaining a copy
8
+ # of this software and associated documentation files (the "Software"), to deal
9
+ # in the Software without restriction, including without limitation the rights
10
+ # to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
11
+ # copies of the Software, and to permit persons to whom the Software is
12
+ # furnished to do so, subject to the following conditions:
13
+ #
14
+ # The above copyright notice and this permission notice shall be included in all
15
+ # copies or substantial portions of the Software.
16
+ #
17
+ # THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
18
+ # IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
19
+ # FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
20
+ # AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
21
+ # LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
22
+ # OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
23
+ # SOFTWARE.
24
+ #
25
+ ###############################################################################
26
+ import re
27
+ from typing import Dict, List, Optional, Tuple
28
+
29
+ from nodescraper.base import InBandDataCollector
30
+ from nodescraper.enums import EventCategory, EventPriority, ExecutionStatus
31
+ from nodescraper.models import TaskResult
32
+
33
+ from .fabricsdata import (
34
+ FabricsDataModel,
35
+ IbdevNetdevMapping,
36
+ IbstatDevice,
37
+ IbvDeviceInfo,
38
+ MstDevice,
39
+ MstStatus,
40
+ OfedInfo,
41
+ RdmaDevice,
42
+ RdmaInfo,
43
+ RdmaLink,
44
+ )
45
+
46
+
47
+ class FabricsCollector(InBandDataCollector[FabricsDataModel, None]):
48
+ """Collect InfiniBand/RDMA fabrics configuration details"""
49
+
50
+ DATA_MODEL = FabricsDataModel
51
+ CMD_IBSTAT = "ibstat"
52
+ CMD_IBV_DEVINFO = "ibv_devinfo"
53
+ CMD_IB_DEV_NETDEVS = "ls -l /sys/class/infiniband/*/device/net"
54
+ CMD_OFED_INFO = "ofed_info -s"
55
+ CMD_MST_START = "mst start"
56
+ CMD_MST_STATUS = "mst status -v"
57
+ CMD_RDMA_DEV = "rdma dev"
58
+ CMD_RDMA_LINK = "rdma link"
59
+
60
def _parse_ibstat(self, output: str) -> List[IbstatDevice]:
    """Parse 'ibstat' output into IbstatDevice objects.

    The parser recognises three kinds of lines:

    * an unindented ``CA '<name>'`` header, which starts a new device;
    * a ``Port <n>:`` header (at any indentation), which starts a new port
      section on the current device;
    * ``key: value`` attribute lines, which are stored on the current port
      if one is open, otherwise mapped onto the device-level fields.

    Args:
        output: Raw output from 'ibstat' command

    Returns:
        List of IbstatDevice objects, one per CA header found. Each device
        carries the full raw output and a ``ports`` mapping of port number
        to that port's attribute dictionary.
    """
    devices: List[IbstatDevice] = []
    current_device: Optional[IbstatDevice] = None
    current_port: Optional[int] = None
    current_port_attrs: Dict[str, str] = {}

    # Device-level ibstat keys and the model attribute each one populates.
    device_fields = {
        "CA type": "ca_type",
        "Firmware version": "firmware_version",
        "Hardware version": "hardware_version",
        "Node GUID": "node_guid",
        "System image GUID": "system_image_guid",
    }

    for line in output.splitlines():
        line_stripped = line.strip()

        # CA name line (e.g., "CA 'mlx5_0'"). The unstripped line is tested on
        # purpose so the indented "CA type: ..." attribute is not mistaken for
        # a device header.
        if line.startswith("CA "):
            if current_device is not None:
                # Flush the port that was still open, otherwise the last port
                # of every device except the final one would be lost.
                if current_port is not None:
                    current_device.ports[current_port] = current_port_attrs
                devices.append(current_device)

            # Reset all state first so a malformed header cannot cause the
            # previous device to be appended a second time.
            current_device = None
            current_port = None
            current_port_attrs = {}

            match = re.search(r"CA\s+'([^']+)'", line)
            if match:
                current_device = IbstatDevice(ca_name=match.group(1), raw_output=output)
            continue

        # Port header (e.g., "Port 1:"). ibstat indents this line under the CA,
        # so it must be matched on the stripped text. Requiring a number keeps
        # attributes such as "Port GUID: ..." out of this branch.
        port_match = re.match(r"Port\s+(\d+):", line_stripped)
        if port_match:
            if current_device is not None and current_port is not None:
                current_device.ports[current_port] = current_port_attrs
            current_port = int(port_match.group(1))
            current_port_attrs = {}
            continue

        # Attribute lines ("key: value"); ignored until a device is open.
        if ":" not in line_stripped or current_device is None:
            continue

        key, _, value = line_stripped.partition(":")
        key = key.strip()
        value = value.strip()

        if current_port is not None:
            # Inside a port section every attribute belongs to that port.
            current_port_attrs[key] = value
        elif key == "Number of ports":
            try:
                current_device.number_of_ports = int(value)
            except ValueError:
                # Best effort: leave the field at its default when the count
                # is not an integer.
                pass
        elif key in device_fields:
            setattr(current_device, device_fields[key], value)

    # Save last device and its last open port
    if current_device is not None:
        if current_port is not None:
            current_device.ports[current_port] = current_port_attrs
        devices.append(current_device)

    return devices
138
+
139
def _parse_ibv_devinfo(self, output: str) -> List[IbvDeviceInfo]:
    """Parse 'ibv_devinfo' output into IbvDeviceInfo objects.

    The parser recognises three kinds of lines:

    * an unindented ``hca_id: <name>`` header, which starts a new device;
    * a ``port: <n>`` line, which starts a new port section on the current
      device;
    * ``key: value`` attribute lines, which are stored on the current port
      if one is open, otherwise mapped onto the device-level fields.

    Args:
        output: Raw output from 'ibv_devinfo' command

    Returns:
        List of IbvDeviceInfo objects, one per ``hca_id`` header found. Each
        device carries the full raw output and a ``ports`` mapping of port
        number to that port's attribute dictionary.
    """
    devices: List[IbvDeviceInfo] = []
    current_device: Optional[IbvDeviceInfo] = None
    current_port: Optional[int] = None
    current_port_attrs: Dict[str, str] = {}

    # Device-level ibv_devinfo keys and the model attribute each one populates.
    # Both "transport" and "transport_type" feed the same field.
    device_fields = {
        "node_guid": "node_guid",
        "sys_image_guid": "sys_image_guid",
        "vendor_id": "vendor_id",
        "vendor_part_id": "vendor_part_id",
        "hw_ver": "hw_ver",
        "fw_ver": "fw_ver",
        "node_type": "node_type",
        "transport_type": "transport_type",
        "transport": "transport_type",
    }

    for line in output.splitlines():
        line_stripped = line.strip()

        # Device header (e.g., "hca_id: mlx5_0")
        if line.startswith("hca_id:"):
            if current_device is not None:
                # Flush the port that was still open, otherwise the last port
                # of every device except the final one would be lost.
                if current_port is not None:
                    current_device.ports[current_port] = current_port_attrs
                devices.append(current_device)

            device_name = line.split(":", 1)[1].strip()
            current_device = IbvDeviceInfo(device=device_name, raw_output=output)
            current_port = None
            current_port_attrs = {}
            continue

        # Everything below needs an open device.
        if current_device is None:
            continue

        # Port line (e.g., "port: 1")
        if line_stripped.startswith("port:"):
            if current_port is not None:
                current_device.ports[current_port] = current_port_attrs
            try:
                current_port = int(line_stripped.split(":", 1)[1].strip())
                current_port_attrs = {}
            except ValueError:
                # Non-numeric port id: keep collecting into the previous port
                # section rather than aborting the parse.
                pass
            continue

        # Attribute lines ("key: value")
        if ":" not in line_stripped:
            continue

        key, _, value = line_stripped.partition(":")
        key = key.strip()
        value = value.strip()

        if current_port is not None:
            # Inside a port section every attribute belongs to that port.
            current_port_attrs[key] = value
        elif key in device_fields:
            setattr(current_device, device_fields[key], value)

    # Save last device and its last open port
    if current_device is not None:
        if current_port is not None:
            current_device.ports[current_port] = current_port_attrs
        devices.append(current_device)

    return devices
219
+
220
def _parse_ib_dev_netdevs(self, output: str) -> List[IbdevNetdevMapping]:
    """Turn 'ls -l /sys/class/infiniband/*/device/net' output into mappings.

    The listing is read as a sequence of sections: a header line of the form
    ``/sys/class/infiniband/<ib_device>/device/net:`` names the IB device, and
    each following directory entry (a line starting with ``d``) names one of
    its network devices.

    Args:
        output: Raw output from 'ls -l /sys/class/infiniband/*/device/net' command

    Returns:
        List of IbdevNetdevMapping objects, one per directory entry that
        appears under a recognised section header.
    """
    header_pattern = r"/sys/class/infiniband/([^/]+)/device/net:"
    results = []
    ib_name = None

    for raw_line in output.splitlines():
        entry = raw_line.strip()
        if not entry:
            continue

        # Section header, e.g. "/sys/class/infiniband/rocep105s0/device/net:"
        if entry.startswith("/sys/class/infiniband/") and entry.endswith(":"):
            header = re.search(header_pattern, entry)
            if header:
                ib_name = header.group(1)
            continue

        # "total N" summary lines carry no device information
        if entry.startswith("total"):
            continue

        # Only directory entries seen after a section header are of interest,
        # e.g. "drwxr-xr-x 5 root root 0 Jan 8 18:01 benic5p1"
        if not (ib_name and entry.startswith("d")):
            continue

        fields = entry.split()
        if len(fields) < 9:
            continue

        # The trailing field is the netdev name. The listing exposes neither a
        # port number nor a link state, so port is fixed at 1 and state left unset.
        results.append(
            IbdevNetdevMapping(ib_device=ib_name, port=1, netdev=fields[-1], state=None)
        )

    return results
266
+
267
def _parse_ofed_info(self, output: str) -> OfedInfo:
    """Build an OfedInfo from the output of 'ofed_info -s'.

    The whole trimmed output is taken as the version string, with any
    trailing colons removed (e.g. ``OFED-internal-25.10-1.7.1:``).

    Args:
        output: Raw output from 'ofed_info -s' command

    Returns:
        OfedInfo object whose version is None when the output is blank
    """
    trimmed = output.strip()
    version = trimmed.rstrip(":") if trimmed else None
    return OfedInfo(version=version, raw_output=output)
286
+
287
def _parse_mst_status(self, output: str) -> MstStatus:
    """Parse 'mst status -v' output into MstStatus object.

    Two line layouts are understood for lines that mention ``/dev/mst/``:

    * old format - the line starts with the ``/dev/mst/...`` path and is
      followed by ``key=value`` tokens and/or a bare PCI address;
    * tabular format - ``DEVICE_TYPE MST PCI RDMA NET NUMA [VFIO]``, e.g.
      ``ConnectX7(rev:0) /dev/mst/mt4129_pciconf9 ec:00.0 mlx5_4 net-enp235s0np0 1``.

    Args:
        output: Raw output from 'mst status -v' command

    Returns:
        MstStatus object with ``mst_started`` set when any of the known
        section headers is present, and ``devices`` holding one MstDevice
        per parsed device line.
    """
    # PCI address in short ("ec:00.0") or long ("0000:ec:00.0") form. The
    # leading domain group is optional; without that, the short form used by
    # the tabular output would never be recognised.
    pci_pattern = re.compile(r"(?:[0-9a-f]{2,4}:)?[0-9a-f]{2}:[0-9a-f]{2}\.[0-9]")
    header_prefixes = (
        "MST modules:",
        "PCI devices:",
        "---",
        "DEVICE_TYPE",
        "MST PCI module",
    )

    mst_status = MstStatus(raw_output=output)
    devices = []

    # Check if MST is started
    if "MST modules:" in output or "MST devices:" in output or "PCI devices:" in output:
        mst_status.mst_started = True

    for line in output.splitlines():
        line = line.strip()
        if not line or line.startswith(header_prefixes):
            continue

        # Only lines that reference an MST device node describe a device
        if "/dev/mst/" not in line:
            continue

        parts = line.split()

        # Old format: "/dev/mst/device_path" at the beginning
        if line.startswith("/dev/mst/"):
            device = MstDevice(device=parts[0])

            for part in parts[1:]:
                if "=" in part:
                    key, value = part.split("=", 1)
                    if key == "rdma":
                        device.rdma_device = value
                    elif key == "net":
                        device.net_device = value
                    elif ":" in value and "." in value:
                        # Any other key whose value looks like bus:dev.fn is
                        # taken to be the PCI address
                        device.pci_address = value
                    else:
                        device.attributes[key] = value
                elif pci_pattern.match(part):
                    device.pci_address = part

            devices.append(device)
            continue

        # Tabular format: locate the MST path column, the rest is positional
        mst_idx = next(
            (idx for idx, part in enumerate(parts) if part.startswith("/dev/mst/")),
            None,
        )
        # Require at least the PCI and RDMA columns after the MST path
        if mst_idx is None or len(parts) < mst_idx + 3:
            continue

        device = MstDevice(device=parts[mst_idx])

        # Device type is everything before the MST path
        if mst_idx > 0:
            device.attributes["device_type"] = " ".join(parts[:mst_idx])

        # PCI address (column after MST path)
        pci_addr = parts[mst_idx + 1]
        if pci_pattern.match(pci_addr):
            device.pci_address = pci_addr

        # RDMA device (column after PCI); "-" marks an empty column
        rdma_dev = parts[mst_idx + 2]
        if rdma_dev != "-":
            device.rdma_device = rdma_dev

        # NET device (column after RDMA), with the "net-" prefix removed
        if mst_idx + 3 < len(parts):
            net_dev = parts[mst_idx + 3]
            if net_dev.startswith("net-"):
                net_dev = net_dev[4:]
            if net_dev != "-":
                device.net_device = net_dev

        # NUMA node (column after NET)
        if mst_idx + 4 < len(parts):
            numa = parts[mst_idx + 4]
            if numa.isdigit():
                device.attributes["numa_node"] = numa

        # VFIO or other attributes (remaining columns)
        if mst_idx + 5 < len(parts):
            device.attributes["vfio"] = " ".join(parts[mst_idx + 5 :])

        devices.append(device)

    mst_status.devices = devices
    return mst_status
398
+
399
def _parse_rdma_dev(self, output: str) -> List[RdmaDevice]:
    """Parse 'rdma dev' output into RdmaDevice objects.

    Each non-empty line is expected to look like::

        0: mlx5_0: node_type ca fw 16.28.2006 node_guid 0c42:a103:00b3:bfa0 sys_image_guid 0c42:a103:00b3:bfa0
        0: rocep9s0: node_type ca fw 1.117.1-a-63 node_guid 0690:81ff:fe4a:6c40 sys_image_guid 0690:81ff:fe4a:6c40

    i.e. an optional numeric index, the device name, then whitespace
    separated ``key value`` pairs.

    Args:
        output: Raw output from 'rdma dev' command

    Returns:
        List of RdmaDevice objects, one per line with at least two tokens
    """
    # Keys that map one-to-one onto RdmaDevice attributes of the same name.
    direct_fields = ("node_type", "node_guid", "sys_image_guid", "state")
    devices = []

    for line in output.splitlines():
        parts = line.split()
        if len(parts) < 2:
            continue

        # Skip the leading index (e.g. "0:") only when it really is a number.
        # Testing for a trailing colon alone would also swallow the device
        # name on lines that carry no index ("mlx5_0: node_type ca ...").
        start_idx = 0
        if parts[0].endswith(":") and parts[0][:-1].isdigit():
            start_idx = 1

        device_name = parts[start_idx].rstrip(":")
        start_idx += 1
        if not device_name:
            continue

        device = RdmaDevice(device=device_name)

        # Walk the remaining tokens as "key value" pairs
        i = start_idx
        while i < len(parts):
            key = parts[i]
            has_value = i + 1 < len(parts)

            if has_value and key in direct_fields:
                setattr(device, key, parts[i + 1])
                i += 2
            elif has_value and key == "fw":
                device.attributes["fw_version"] = parts[i + 1]
                i += 2
            elif has_value and not parts[i + 1].startswith("-"):
                # Unknown key: keep it as a generic attribute
                device.attributes[key] = parts[i + 1]
                i += 2
            else:
                # Lone token, or one followed by a "-"-prefixed token: skip it
                i += 1

        devices.append(device)

    return devices
467
+
468
def _parse_rdma_link(self, output: str) -> List[RdmaLink]:
    """Parse 'rdma link' output into RdmaLink objects.

    Supported line layouts::

        link mlx5_0/1 state ACTIVE physical_state LINK_UP netdev ib0
        link rocep9s0/1 state DOWN physical_state POLLING netdev benic8p1
        0/1: mlx5_0/1: state ACTIVE physical_state LINK_UP

    Args:
        output: Raw output from 'rdma link' command

    Returns:
        List of RdmaLink objects, one per line containing a
        ``<device>/<port>`` token
    """
    links = []

    for line in output.splitlines():
        line = line.strip()
        if not line:
            continue

        candidates = list(re.finditer(r"(\S+)/(\d+)", line))
        if not candidates:
            continue

        # In the alternate layout the line starts with a numeric
        # "<index>/<port>:" token; taking the first match would report the
        # index as the device name. Prefer the first candidate whose name part
        # is not purely numeric and fall back to the first match otherwise.
        match = next(
            (cand for cand in candidates if not cand.group(1).isdigit()),
            candidates[0],
        )

        link = RdmaLink(device=match.group(1), port=int(match.group(2)))

        # Walk all tokens as "key value" pairs. NOTE: the scan deliberately
        # starts at the first token, so the leading tokens (e.g. "link" and
        # the device token) are also recorded as a generic attribute pair.
        parts = line.split()
        i = 0
        while i < len(parts):
            key = parts[i]
            has_value = i + 1 < len(parts)

            if has_value and key == "state":
                link.state = parts[i + 1]
                i += 2
            elif has_value and key == "physical_state":
                link.physical_state = parts[i + 1]
                i += 2
            elif has_value and key == "netdev":
                link.netdev = parts[i + 1]
                i += 2
            elif has_value and not parts[i + 1].startswith("-"):
                # Unknown key: keep it as a generic attribute
                link.attributes[key] = parts[i + 1]
                i += 2
            else:
                # Lone token, or one followed by a "-"-prefixed token: skip it
                i += 1

        links.append(link)

    return links
520
+
521
+ def collect_data(
522
+ self,
523
+ args=None,
524
+ ) -> Tuple[TaskResult, Optional[FabricsDataModel]]:
525
+ """Collect InfiniBand/RDMA fabrics configuration from the system.
526
+
527
+ Returns:
528
+ Tuple[TaskResult, Optional[FabricsDataModel]]: tuple containing the task result
529
+ and an instance of FabricsDataModel or None if collection failed.
530
+ """
531
+ ibstat_devices = []
532
+ ibv_devices = []
533
+ ibdev_netdev_mappings = []
534
+ ofed_info = None
535
+ mst_status = None
536
+ rdma_info = None
537
+
538
+ # Collect ibstat information
539
+ res_ibstat = self._run_sut_cmd(self.CMD_IBSTAT)
540
+ if res_ibstat.exit_code == 0:
541
+ ibstat_devices = self._parse_ibstat(res_ibstat.stdout)
542
+ self._log_event(
543
+ category=EventCategory.NETWORK,
544
+ description=f"Collected {len(ibstat_devices)} IB devices from ibstat",
545
+ priority=EventPriority.INFO,
546
+ )
547
+ else:
548
+ self._log_event(
549
+ category=EventCategory.NETWORK,
550
+ description="Error collecting ibstat information",
551
+ data={"command": res_ibstat.command, "exit_code": res_ibstat.exit_code},
552
+ priority=EventPriority.WARNING,
553
+ )
554
+
555
+ # Collect ibv_devinfo information
556
+ res_ibv = self._run_sut_cmd(self.CMD_IBV_DEVINFO)
557
+ if res_ibv.exit_code == 0:
558
+ ibv_devices = self._parse_ibv_devinfo(res_ibv.stdout)
559
+ self._log_event(
560
+ category=EventCategory.NETWORK,
561
+ description=f"Collected {len(ibv_devices)} IB devices from ibv_devinfo",
562
+ priority=EventPriority.INFO,
563
+ )
564
+ else:
565
+ self._log_event(
566
+ category=EventCategory.NETWORK,
567
+ description="ibv_devinfo command not available or failed",
568
+ data={"command": res_ibv.command, "exit_code": res_ibv.exit_code},
569
+ priority=EventPriority.INFO,
570
+ )
571
+
572
+ # Collect IB device to netdev mappings
573
+ res_ib_dev_netdevs = self._run_sut_cmd(self.CMD_IB_DEV_NETDEVS)
574
+ if res_ib_dev_netdevs.exit_code == 0:
575
+ ibdev_netdev_mappings = self._parse_ib_dev_netdevs(res_ib_dev_netdevs.stdout)
576
+ self._log_event(
577
+ category=EventCategory.NETWORK,
578
+ description=f"Collected {len(ibdev_netdev_mappings)} IB to netdev mappings",
579
+ priority=EventPriority.INFO,
580
+ )
581
+ else:
582
+ self._log_event(
583
+ category=EventCategory.NETWORK,
584
+ description="No InfiniBand devices found in sysfs",
585
+ data={
586
+ "command": res_ib_dev_netdevs.command,
587
+ "exit_code": res_ib_dev_netdevs.exit_code,
588
+ },
589
+ priority=EventPriority.INFO,
590
+ )
591
+
592
+ # Collect OFED version info
593
+ res_ofed = self._run_sut_cmd(self.CMD_OFED_INFO)
594
+ if res_ofed.exit_code == 0:
595
+ ofed_info = self._parse_ofed_info(res_ofed.stdout)
596
+ self._log_event(
597
+ category=EventCategory.NETWORK,
598
+ description=f"Collected OFED version: {ofed_info.version}",
599
+ priority=EventPriority.INFO,
600
+ )
601
+ else:
602
+ self._log_event(
603
+ category=EventCategory.NETWORK,
604
+ description="OFED not installed or ofed_info command not available",
605
+ data={"command": res_ofed.command, "exit_code": res_ofed.exit_code},
606
+ priority=EventPriority.INFO,
607
+ )
608
+
609
+ # Start MST and collect status
610
+ # First start MST
611
+ res_mst_start = self._run_sut_cmd(self.CMD_MST_START, sudo=True)
612
+ if res_mst_start.exit_code == 0:
613
+ # Check output for success indicators
614
+ output_lower = res_mst_start.stdout.lower()
615
+ if "success" in output_lower or "loading mst" in output_lower:
616
+ self._log_event(
617
+ category=EventCategory.NETWORK,
618
+ description="MST service started successfully",
619
+ priority=EventPriority.INFO,
620
+ )
621
+ else:
622
+ self._log_event(
623
+ category=EventCategory.NETWORK,
624
+ description="MST service command completed but status unclear",
625
+ data={"output": res_mst_start.stdout},
626
+ priority=EventPriority.INFO,
627
+ )
628
+ else:
629
+ self._log_event(
630
+ category=EventCategory.NETWORK,
631
+ description="MST tools not available (Mellanox-specific)",
632
+ data={"command": res_mst_start.command, "exit_code": res_mst_start.exit_code},
633
+ priority=EventPriority.INFO,
634
+ )
635
+
636
+ # Get MST status
637
+ res_mst_status = self._run_sut_cmd(self.CMD_MST_STATUS, sudo=True)
638
+ if res_mst_status.exit_code == 0:
639
+ mst_status = self._parse_mst_status(res_mst_status.stdout)
640
+ self._log_event(
641
+ category=EventCategory.NETWORK,
642
+ description=f"Collected MST status: {len(mst_status.devices)} devices",
643
+ priority=EventPriority.INFO,
644
+ )
645
+ else:
646
+ self._log_event(
647
+ category=EventCategory.NETWORK,
648
+ description="MST status not available (Mellanox-specific)",
649
+ data={"command": res_mst_status.command, "exit_code": res_mst_status.exit_code},
650
+ priority=EventPriority.INFO,
651
+ )
652
+
653
+ # Collect RDMA device information
654
+ rdma_devices = []
655
+ res_rdma_dev = self._run_sut_cmd(self.CMD_RDMA_DEV)
656
+ if res_rdma_dev.exit_code == 0:
657
+ rdma_devices = self._parse_rdma_dev(res_rdma_dev.stdout)
658
+ self._log_event(
659
+ category=EventCategory.NETWORK,
660
+ description=f"Collected {len(rdma_devices)} RDMA devices",
661
+ priority=EventPriority.INFO,
662
+ )
663
+ else:
664
+ self._log_event(
665
+ category=EventCategory.NETWORK,
666
+ description="Error collecting RDMA device information",
667
+ data={"command": res_rdma_dev.command, "exit_code": res_rdma_dev.exit_code},
668
+ priority=EventPriority.WARNING,
669
+ )
670
+
671
+ # Collect RDMA link information
672
+ rdma_links = []
673
+ res_rdma_link = self._run_sut_cmd(self.CMD_RDMA_LINK)
674
+ if res_rdma_link.exit_code == 0:
675
+ rdma_links = self._parse_rdma_link(res_rdma_link.stdout)
676
+ self._log_event(
677
+ category=EventCategory.NETWORK,
678
+ description=f"Collected {len(rdma_links)} RDMA links",
679
+ priority=EventPriority.INFO,
680
+ )
681
+ else:
682
+ self._log_event(
683
+ category=EventCategory.NETWORK,
684
+ description="Error collecting RDMA link information",
685
+ data={"command": res_rdma_link.command, "exit_code": res_rdma_link.exit_code},
686
+ priority=EventPriority.WARNING,
687
+ )
688
+
689
+ # Combine RDMA information
690
+ if rdma_devices or rdma_links:
691
+ rdma_info = RdmaInfo(
692
+ devices=rdma_devices,
693
+ links=rdma_links,
694
+ raw_output=res_rdma_dev.stdout + "\n" + res_rdma_link.stdout,
695
+ )
696
+
697
+ # Build the data model only if we collected any data
698
+ if (
699
+ ibstat_devices
700
+ or ibv_devices
701
+ or ibdev_netdev_mappings
702
+ or ofed_info
703
+ or mst_status
704
+ or rdma_info
705
+ ):
706
+ fabrics_data = FabricsDataModel(
707
+ ibstat_devices=ibstat_devices,
708
+ ibv_devices=ibv_devices,
709
+ ibdev_netdev_mappings=ibdev_netdev_mappings,
710
+ ofed_info=ofed_info,
711
+ mst_status=mst_status,
712
+ rdma_info=rdma_info,
713
+ )
714
+ self.result.message = (
715
+ f"Collected fabrics data: {len(ibstat_devices)} ibstat devices, "
716
+ f"{len(ibv_devices)} ibv devices, {len(ibdev_netdev_mappings)} mappings, "
717
+ f"OFED: {ofed_info.version if ofed_info else 'N/A'}, "
718
+ f"MST devices: {len(mst_status.devices) if mst_status else 0}, "
719
+ f"RDMA devices: {len(rdma_info.devices) if rdma_info else 0}"
720
+ )
721
+ self.result.status = ExecutionStatus.OK
722
+ return self.result, fabrics_data
723
+ else:
724
+ self.result.message = "No InfiniBand/RDMA fabrics hardware detected on this system"
725
+ self.result.status = ExecutionStatus.ERROR
726
+ return self.result, None