amd-node-scraper 0.0.1__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (197):
  1. amd_node_scraper-0.0.1.dist-info/LICENSE +21 -0
  2. amd_node_scraper-0.0.1.dist-info/METADATA +424 -0
  3. amd_node_scraper-0.0.1.dist-info/RECORD +197 -0
  4. amd_node_scraper-0.0.1.dist-info/WHEEL +5 -0
  5. amd_node_scraper-0.0.1.dist-info/entry_points.txt +2 -0
  6. amd_node_scraper-0.0.1.dist-info/top_level.txt +1 -0
  7. nodescraper/__init__.py +32 -0
  8. nodescraper/base/__init__.py +34 -0
  9. nodescraper/base/inbandcollectortask.py +118 -0
  10. nodescraper/base/inbanddataplugin.py +39 -0
  11. nodescraper/base/regexanalyzer.py +120 -0
  12. nodescraper/cli/__init__.py +29 -0
  13. nodescraper/cli/cli.py +511 -0
  14. nodescraper/cli/constants.py +27 -0
  15. nodescraper/cli/dynamicparserbuilder.py +171 -0
  16. nodescraper/cli/helper.py +517 -0
  17. nodescraper/cli/inputargtypes.py +129 -0
  18. nodescraper/configbuilder.py +123 -0
  19. nodescraper/configregistry.py +66 -0
  20. nodescraper/configs/node_status.json +19 -0
  21. nodescraper/connection/__init__.py +25 -0
  22. nodescraper/connection/inband/__init__.py +46 -0
  23. nodescraper/connection/inband/inband.py +171 -0
  24. nodescraper/connection/inband/inbandlocal.py +93 -0
  25. nodescraper/connection/inband/inbandmanager.py +151 -0
  26. nodescraper/connection/inband/inbandremote.py +173 -0
  27. nodescraper/connection/inband/sshparams.py +43 -0
  28. nodescraper/constants.py +26 -0
  29. nodescraper/enums/__init__.py +40 -0
  30. nodescraper/enums/eventcategory.py +89 -0
  31. nodescraper/enums/eventpriority.py +42 -0
  32. nodescraper/enums/executionstatus.py +44 -0
  33. nodescraper/enums/osfamily.py +34 -0
  34. nodescraper/enums/systeminteraction.py +41 -0
  35. nodescraper/enums/systemlocation.py +33 -0
  36. nodescraper/generictypes.py +36 -0
  37. nodescraper/interfaces/__init__.py +44 -0
  38. nodescraper/interfaces/connectionmanager.py +143 -0
  39. nodescraper/interfaces/dataanalyzertask.py +138 -0
  40. nodescraper/interfaces/datacollectortask.py +185 -0
  41. nodescraper/interfaces/dataplugin.py +356 -0
  42. nodescraper/interfaces/plugin.py +127 -0
  43. nodescraper/interfaces/resultcollator.py +56 -0
  44. nodescraper/interfaces/task.py +164 -0
  45. nodescraper/interfaces/taskresulthook.py +39 -0
  46. nodescraper/models/__init__.py +48 -0
  47. nodescraper/models/analyzerargs.py +93 -0
  48. nodescraper/models/collectorargs.py +30 -0
  49. nodescraper/models/connectionconfig.py +34 -0
  50. nodescraper/models/datamodel.py +171 -0
  51. nodescraper/models/datapluginresult.py +39 -0
  52. nodescraper/models/event.py +158 -0
  53. nodescraper/models/pluginconfig.py +38 -0
  54. nodescraper/models/pluginresult.py +39 -0
  55. nodescraper/models/systeminfo.py +44 -0
  56. nodescraper/models/taskresult.py +185 -0
  57. nodescraper/models/timerangeargs.py +38 -0
  58. nodescraper/pluginexecutor.py +274 -0
  59. nodescraper/pluginregistry.py +152 -0
  60. nodescraper/plugins/__init__.py +25 -0
  61. nodescraper/plugins/inband/__init__.py +25 -0
  62. nodescraper/plugins/inband/amdsmi/__init__.py +28 -0
  63. nodescraper/plugins/inband/amdsmi/amdsmi_analyzer.py +821 -0
  64. nodescraper/plugins/inband/amdsmi/amdsmi_collector.py +1313 -0
  65. nodescraper/plugins/inband/amdsmi/amdsmi_plugin.py +43 -0
  66. nodescraper/plugins/inband/amdsmi/amdsmidata.py +1002 -0
  67. nodescraper/plugins/inband/amdsmi/analyzer_args.py +50 -0
  68. nodescraper/plugins/inband/amdsmi/cper.py +65 -0
  69. nodescraper/plugins/inband/bios/__init__.py +29 -0
  70. nodescraper/plugins/inband/bios/analyzer_args.py +64 -0
  71. nodescraper/plugins/inband/bios/bios_analyzer.py +93 -0
  72. nodescraper/plugins/inband/bios/bios_collector.py +93 -0
  73. nodescraper/plugins/inband/bios/bios_plugin.py +43 -0
  74. nodescraper/plugins/inband/bios/biosdata.py +30 -0
  75. nodescraper/plugins/inband/cmdline/__init__.py +25 -0
  76. nodescraper/plugins/inband/cmdline/analyzer_args.py +80 -0
  77. nodescraper/plugins/inband/cmdline/cmdline_analyzer.py +113 -0
  78. nodescraper/plugins/inband/cmdline/cmdline_collector.py +77 -0
  79. nodescraper/plugins/inband/cmdline/cmdline_plugin.py +43 -0
  80. nodescraper/plugins/inband/cmdline/cmdlinedata.py +30 -0
  81. nodescraper/plugins/inband/device_enumeration/__init__.py +29 -0
  82. nodescraper/plugins/inband/device_enumeration/analyzer_args.py +73 -0
  83. nodescraper/plugins/inband/device_enumeration/device_enumeration_analyzer.py +81 -0
  84. nodescraper/plugins/inband/device_enumeration/device_enumeration_collector.py +176 -0
  85. nodescraper/plugins/inband/device_enumeration/device_enumeration_plugin.py +45 -0
  86. nodescraper/plugins/inband/device_enumeration/deviceenumdata.py +36 -0
  87. nodescraper/plugins/inband/dimm/__init__.py +25 -0
  88. nodescraper/plugins/inband/dimm/collector_args.py +31 -0
  89. nodescraper/plugins/inband/dimm/dimm_collector.py +151 -0
  90. nodescraper/plugins/inband/dimm/dimm_plugin.py +40 -0
  91. nodescraper/plugins/inband/dimm/dimmdata.py +30 -0
  92. nodescraper/plugins/inband/dkms/__init__.py +25 -0
  93. nodescraper/plugins/inband/dkms/analyzer_args.py +85 -0
  94. nodescraper/plugins/inband/dkms/dkms_analyzer.py +106 -0
  95. nodescraper/plugins/inband/dkms/dkms_collector.py +76 -0
  96. nodescraper/plugins/inband/dkms/dkms_plugin.py +43 -0
  97. nodescraper/plugins/inband/dkms/dkmsdata.py +33 -0
  98. nodescraper/plugins/inband/dmesg/__init__.py +28 -0
  99. nodescraper/plugins/inband/dmesg/analyzer_args.py +33 -0
  100. nodescraper/plugins/inband/dmesg/collector_args.py +39 -0
  101. nodescraper/plugins/inband/dmesg/dmesg_analyzer.py +503 -0
  102. nodescraper/plugins/inband/dmesg/dmesg_collector.py +164 -0
  103. nodescraper/plugins/inband/dmesg/dmesg_plugin.py +44 -0
  104. nodescraper/plugins/inband/dmesg/dmesgdata.py +116 -0
  105. nodescraper/plugins/inband/fabrics/__init__.py +28 -0
  106. nodescraper/plugins/inband/fabrics/fabrics_collector.py +726 -0
  107. nodescraper/plugins/inband/fabrics/fabrics_plugin.py +37 -0
  108. nodescraper/plugins/inband/fabrics/fabricsdata.py +140 -0
  109. nodescraper/plugins/inband/journal/__init__.py +28 -0
  110. nodescraper/plugins/inband/journal/collector_args.py +33 -0
  111. nodescraper/plugins/inband/journal/journal_collector.py +107 -0
  112. nodescraper/plugins/inband/journal/journal_plugin.py +40 -0
  113. nodescraper/plugins/inband/journal/journaldata.py +44 -0
  114. nodescraper/plugins/inband/kernel/__init__.py +25 -0
  115. nodescraper/plugins/inband/kernel/analyzer_args.py +64 -0
  116. nodescraper/plugins/inband/kernel/kernel_analyzer.py +91 -0
  117. nodescraper/plugins/inband/kernel/kernel_collector.py +129 -0
  118. nodescraper/plugins/inband/kernel/kernel_plugin.py +43 -0
  119. nodescraper/plugins/inband/kernel/kerneldata.py +32 -0
  120. nodescraper/plugins/inband/kernel_module/__init__.py +25 -0
  121. nodescraper/plugins/inband/kernel_module/analyzer_args.py +59 -0
  122. nodescraper/plugins/inband/kernel_module/kernel_module_analyzer.py +211 -0
  123. nodescraper/plugins/inband/kernel_module/kernel_module_collector.py +264 -0
  124. nodescraper/plugins/inband/kernel_module/kernel_module_data.py +60 -0
  125. nodescraper/plugins/inband/kernel_module/kernel_module_plugin.py +43 -0
  126. nodescraper/plugins/inband/memory/__init__.py +25 -0
  127. nodescraper/plugins/inband/memory/analyzer_args.py +45 -0
  128. nodescraper/plugins/inband/memory/memory_analyzer.py +98 -0
  129. nodescraper/plugins/inband/memory/memory_collector.py +330 -0
  130. nodescraper/plugins/inband/memory/memory_plugin.py +43 -0
  131. nodescraper/plugins/inband/memory/memorydata.py +90 -0
  132. nodescraper/plugins/inband/network/__init__.py +28 -0
  133. nodescraper/plugins/inband/network/network_collector.py +1828 -0
  134. nodescraper/plugins/inband/network/network_plugin.py +37 -0
  135. nodescraper/plugins/inband/network/networkdata.py +319 -0
  136. nodescraper/plugins/inband/nvme/__init__.py +28 -0
  137. nodescraper/plugins/inband/nvme/nvme_collector.py +167 -0
  138. nodescraper/plugins/inband/nvme/nvme_plugin.py +37 -0
  139. nodescraper/plugins/inband/nvme/nvmedata.py +45 -0
  140. nodescraper/plugins/inband/os/__init__.py +25 -0
  141. nodescraper/plugins/inband/os/analyzer_args.py +64 -0
  142. nodescraper/plugins/inband/os/os_analyzer.py +73 -0
  143. nodescraper/plugins/inband/os/os_collector.py +131 -0
  144. nodescraper/plugins/inband/os/os_plugin.py +43 -0
  145. nodescraper/plugins/inband/os/osdata.py +31 -0
  146. nodescraper/plugins/inband/package/__init__.py +25 -0
  147. nodescraper/plugins/inband/package/analyzer_args.py +48 -0
  148. nodescraper/plugins/inband/package/package_analyzer.py +253 -0
  149. nodescraper/plugins/inband/package/package_collector.py +273 -0
  150. nodescraper/plugins/inband/package/package_plugin.py +43 -0
  151. nodescraper/plugins/inband/package/packagedata.py +41 -0
  152. nodescraper/plugins/inband/pcie/__init__.py +29 -0
  153. nodescraper/plugins/inband/pcie/analyzer_args.py +63 -0
  154. nodescraper/plugins/inband/pcie/pcie_analyzer.py +1081 -0
  155. nodescraper/plugins/inband/pcie/pcie_collector.py +690 -0
  156. nodescraper/plugins/inband/pcie/pcie_data.py +2017 -0
  157. nodescraper/plugins/inband/pcie/pcie_plugin.py +43 -0
  158. nodescraper/plugins/inband/process/__init__.py +25 -0
  159. nodescraper/plugins/inband/process/analyzer_args.py +45 -0
  160. nodescraper/plugins/inband/process/collector_args.py +31 -0
  161. nodescraper/plugins/inband/process/process_analyzer.py +91 -0
  162. nodescraper/plugins/inband/process/process_collector.py +115 -0
  163. nodescraper/plugins/inband/process/process_plugin.py +46 -0
  164. nodescraper/plugins/inband/process/processdata.py +34 -0
  165. nodescraper/plugins/inband/rocm/__init__.py +25 -0
  166. nodescraper/plugins/inband/rocm/analyzer_args.py +66 -0
  167. nodescraper/plugins/inband/rocm/rocm_analyzer.py +100 -0
  168. nodescraper/plugins/inband/rocm/rocm_collector.py +205 -0
  169. nodescraper/plugins/inband/rocm/rocm_plugin.py +43 -0
  170. nodescraper/plugins/inband/rocm/rocmdata.py +62 -0
  171. nodescraper/plugins/inband/storage/__init__.py +25 -0
  172. nodescraper/plugins/inband/storage/analyzer_args.py +38 -0
  173. nodescraper/plugins/inband/storage/collector_args.py +31 -0
  174. nodescraper/plugins/inband/storage/storage_analyzer.py +152 -0
  175. nodescraper/plugins/inband/storage/storage_collector.py +110 -0
  176. nodescraper/plugins/inband/storage/storage_plugin.py +44 -0
  177. nodescraper/plugins/inband/storage/storagedata.py +70 -0
  178. nodescraper/plugins/inband/sysctl/__init__.py +29 -0
  179. nodescraper/plugins/inband/sysctl/analyzer_args.py +67 -0
  180. nodescraper/plugins/inband/sysctl/sysctl_analyzer.py +81 -0
  181. nodescraper/plugins/inband/sysctl/sysctl_collector.py +101 -0
  182. nodescraper/plugins/inband/sysctl/sysctl_plugin.py +43 -0
  183. nodescraper/plugins/inband/sysctl/sysctldata.py +42 -0
  184. nodescraper/plugins/inband/syslog/__init__.py +28 -0
  185. nodescraper/plugins/inband/syslog/syslog_collector.py +121 -0
  186. nodescraper/plugins/inband/syslog/syslog_plugin.py +37 -0
  187. nodescraper/plugins/inband/syslog/syslogdata.py +46 -0
  188. nodescraper/plugins/inband/uptime/__init__.py +25 -0
  189. nodescraper/plugins/inband/uptime/uptime_collector.py +88 -0
  190. nodescraper/plugins/inband/uptime/uptime_plugin.py +37 -0
  191. nodescraper/plugins/inband/uptime/uptimedata.py +31 -0
  192. nodescraper/resultcollators/__init__.py +25 -0
  193. nodescraper/resultcollators/tablesummary.py +159 -0
  194. nodescraper/taskresulthooks/__init__.py +28 -0
  195. nodescraper/taskresulthooks/filesystemloghook.py +88 -0
  196. nodescraper/typeutils.py +171 -0
  197. nodescraper/utils.py +412 -0
@@ -0,0 +1,205 @@
1
+ ###############################################################################
2
+ #
3
+ # MIT License
4
+ #
5
+ # Copyright (c) 2025 Advanced Micro Devices, Inc.
6
+ #
7
+ # Permission is hereby granted, free of charge, to any person obtaining a copy
8
+ # of this software and associated documentation files (the "Software"), to deal
9
+ # in the Software without restriction, including without limitation the rights
10
+ # to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
11
+ # copies of the Software, and to permit persons to whom the Software is
12
+ # furnished to do so, subject to the following conditions:
13
+ #
14
+ # The above copyright notice and this permission notice shall be included in all
15
+ # copies or substantial portions of the Software.
16
+ #
17
+ # THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
18
+ # IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
19
+ # FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
20
+ # AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
21
+ # LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
22
+ # OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
23
+ # SOFTWARE.
24
+ #
25
+ ###############################################################################
26
+ from typing import Optional
27
+
28
+ from nodescraper.base import InBandDataCollector
29
+ from nodescraper.connection.inband import TextFileArtifact
30
+ from nodescraper.enums import EventCategory, EventPriority, ExecutionStatus, OSFamily
31
+ from nodescraper.models import TaskResult
32
+ from nodescraper.utils import strip_ansi_codes
33
+
34
+ from .rocmdata import RocmDataModel
35
+
36
+
37
class RocmCollector(InBandDataCollector[RocmDataModel, None]):
    """Collect ROCm version data"""

    SUPPORTED_OS_FAMILY: set[OSFamily] = {OSFamily.LINUX}

    DATA_MODEL = RocmDataModel
    # Version files probed in order; the first one that can be read is used.
    CMD_VERSION_PATHS = [
        "/opt/rocm/.info/version-rocm",
        "/opt/rocm/.info/version",
    ]
    CMD_ROCMINFO = "{rocm_path}/bin/rocminfo"
    CMD_ROCM_LATEST = "ls -v -d /opt/rocm-[3-7]* | tail -1"
    CMD_ROCM_DIRS = "ls -v -d /opt/rocm*"
    CMD_LD_CONF = "grep -i -E 'rocm' /etc/ld.so.conf.d/*"
    CMD_ROCM_LIBS = "ldconfig -p | grep -i -E 'rocm'"
    CMD_ENV_VARS = "env | grep -Ei 'rocm|hsa|hip|mpi|openmp|ucx|miopen'"
    CMD_CLINFO = "{rocm_path}/opencl/bin/*/clinfo"
    CMD_KFD_PROC = "ls /sys/class/kfd/kfd/proc/"

    @staticmethod
    def _non_empty_lines(text: str) -> list[str]:
        """Split command output into stripped lines, dropping blank ones."""
        return [line.strip() for line in text.strip().split("\n") if line.strip()]

    @staticmethod
    def _section_header(title: str) -> str:
        """Return the banner that introduces one section of the log artifact."""
        rule = "=" * 80
        return f"{rule}\n{title}\n{rule}\n\n"

    def collect_data(self, args=None) -> tuple[TaskResult, Optional[RocmDataModel]]:
        """Collect ROCm version data from the system.

        Each file in ``CMD_VERSION_PATHS`` is read in turn; the first one that
        can be read supplies the version string. When a version is found, the
        remaining ROCm details (install paths, rocminfo, linker configuration,
        environment, clinfo, KFD processes) are gathered on a best-effort
        basis: a failing command simply leaves its field at the default.

        Args:
            args: Not used by this collector.

        Returns:
            tuple[TaskResult, Optional[RocmDataModel]]: tuple containing the task result and ROCm data model if available.
        """
        rocm_data = None
        for path in self.CMD_VERSION_PATHS:
            res = self._run_sut_cmd(f"grep . {path}")
            if res.exit_code == 0:
                try:
                    rocm_data = RocmDataModel(rocm_version=res.stdout)
                    self._log_event(
                        category="ROCM_VERSION_READ",
                        description="ROCm version data collected",
                        data=rocm_data.model_dump(include={"rocm_version"}),
                        priority=EventPriority.INFO,
                    )
                    self.result.message = f"ROCm version: {rocm_data.rocm_version}"
                    self.result.status = ExecutionStatus.OK
                    break
                except ValueError as e:
                    # A readable file with an unparsable version is treated as
                    # a hard error; the remaining paths are not tried.
                    self._log_event(
                        category=EventCategory.OS,
                        description=f"Invalid ROCm version format: {res.stdout}",
                        data={"version": res.stdout, "error": str(e)},
                        priority=EventPriority.ERROR,
                        console_log=True,
                    )
                    self.result.message = f"Invalid ROCm version format: {res.stdout}"
                    self.result.status = ExecutionStatus.ERROR
                    return self.result, None
        else:
            # Runs only when no path produced a version (loop ended without break).
            # NOTE(review): for/else placement assumed from the message, which
            # names the whole path list - confirm against the upstream file.
            self._log_event(
                category=EventCategory.OS,
                description=f"Unable to read ROCm version from {self.CMD_VERSION_PATHS}",
                data={"raw_output": res.stdout},
                priority=EventPriority.ERROR,
            )

        if rocm_data:
            # Collect additional ROCm data only when a version was found.
            self._collect_installation_details(rocm_data)
        else:
            # `res` is the result of the last version file that was tried.
            self._log_event(
                category=EventCategory.OS,
                description="Error checking ROCm version",
                data={
                    "command": res.command,
                    "exit_code": res.exit_code,
                    "stderr": res.stderr,
                },
                priority=EventPriority.ERROR,
                console_log=True,
            )
            self.result.message = "ROCm version not found"
            self.result.status = ExecutionStatus.ERROR

        return self.result, rocm_data

    def _collect_installation_details(self, rocm_data: RocmDataModel) -> None:
        """Fill the optional fields of ``rocm_data`` and attach the log artifact.

        Commands run in a fixed order and each one is independent: a non-zero
        exit code leaves the corresponding field untouched.
        """
        # Latest versioned ROCm path (rocm-[3-7]*)
        latest_res = self._run_sut_cmd(self.CMD_ROCM_LATEST)
        if latest_res.exit_code == 0:
            rocm_data.rocm_latest_versioned_path = latest_res.stdout.strip()

        # All ROCm paths
        dirs_res = self._run_sut_cmd(self.CMD_ROCM_DIRS)
        if dirs_res.exit_code == 0:
            rocm_data.rocm_all_paths = self._non_empty_lines(dirs_res.stdout)

        # Prefer the versioned install; fall back to the default location.
        rocm_path = rocm_data.rocm_latest_versioned_path or "/opt/rocm"

        # Pieces of the combined rocminfo/clinfo artifact, joined at the end.
        artifact_parts: list[str] = []

        # rocminfo: the model gets ANSI-stripped lines, the artifact gets raw output.
        rocminfo_res = self._run_sut_cmd(self.CMD_ROCMINFO.format(rocm_path=rocm_path))
        if rocminfo_res.exit_code == 0:
            rocm_data.rocminfo = [
                strip_ansi_codes(line) for line in rocminfo_res.stdout.strip().split("\n")
            ]
            artifact_parts.append(self._section_header("ROCMINFO OUTPUT"))
            artifact_parts.append(rocminfo_res.stdout)

        # ld.so.conf ROCm entries
        ld_conf_res = self._run_sut_cmd(self.CMD_LD_CONF)
        if ld_conf_res.exit_code == 0:
            rocm_data.ld_conf_rocm = self._non_empty_lines(ld_conf_res.stdout)

        # ROCm libraries known to ldconfig
        rocm_libs_res = self._run_sut_cmd(self.CMD_ROCM_LIBS)
        if rocm_libs_res.exit_code == 0:
            rocm_data.rocm_libs = self._non_empty_lines(rocm_libs_res.stdout)

        # ROCm-related environment variables
        env_vars_res = self._run_sut_cmd(self.CMD_ENV_VARS)
        if env_vars_res.exit_code == 0:
            rocm_data.env_vars = self._non_empty_lines(env_vars_res.stdout)

        # clinfo: its section is always written to the artifact, even on failure.
        clinfo_res = self._run_sut_cmd(self.CMD_CLINFO.format(rocm_path=rocm_path))
        if artifact_parts:
            # Separate the clinfo section from the rocminfo section above it.
            artifact_parts.append("\n\n")
        artifact_parts.append(self._section_header("CLINFO OUTPUT"))

        if clinfo_res.exit_code == 0:
            rocm_data.clinfo = [
                strip_ansi_codes(line) for line in clinfo_res.stdout.strip().split("\n")
            ]
            artifact_parts.append(clinfo_res.stdout)
        else:
            # Record why clinfo failed so the artifact is still useful.
            artifact_parts.append(f"Command: {clinfo_res.command}\n")
            artifact_parts.append(f"Exit Code: {clinfo_res.exit_code}\n")
            if clinfo_res.stderr:
                artifact_parts.append(f"Error: {clinfo_res.stderr}\n")
            if clinfo_res.stdout:
                artifact_parts.append(f"Output: {clinfo_res.stdout}\n")

        # The clinfo header is unconditional, so the artifact is never empty.
        self.result.artifacts.append(
            TextFileArtifact(filename="rocminfo.log", contents="".join(artifact_parts))
        )

        # KFD process list
        kfd_proc_res = self._run_sut_cmd(self.CMD_KFD_PROC)
        if kfd_proc_res.exit_code == 0:
            rocm_data.kfd_proc = self._non_empty_lines(kfd_proc_res.stdout)
@@ -0,0 +1,43 @@
1
+ ###############################################################################
2
+ #
3
+ # MIT License
4
+ #
5
+ # Copyright (c) 2025 Advanced Micro Devices, Inc.
6
+ #
7
+ # Permission is hereby granted, free of charge, to any person obtaining a copy
8
+ # of this software and associated documentation files (the "Software"), to deal
9
+ # in the Software without restriction, including without limitation the rights
10
+ # to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
11
+ # copies of the Software, and to permit persons to whom the Software is
12
+ # furnished to do so, subject to the following conditions:
13
+ #
14
+ # The above copyright notice and this permission notice shall be included in all
15
+ # copies or substantial portions of the Software.
16
+ #
17
+ # THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
18
+ # IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
19
+ # FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
20
+ # AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
21
+ # LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
22
+ # OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
23
+ # SOFTWARE.
24
+ #
25
+ ###############################################################################
26
+ from nodescraper.base import InBandDataPlugin
27
+
28
+ from .analyzer_args import RocmAnalyzerArgs
29
+ from .rocm_analyzer import RocmAnalyzer
30
+ from .rocm_collector import RocmCollector
31
+ from .rocmdata import RocmDataModel
32
+
33
+
34
class RocmPlugin(InBandDataPlugin[RocmDataModel, None, RocmAnalyzerArgs]):
    """Plugin for collection and analysis of rocm version data

    Binds the ROCm data model, collector, analyzer and analyzer-argument
    classes together through the class attributes below.
    """

    # Data model shared by the collector and the analyzer.
    DATA_MODEL = RocmDataModel

    # Task class that gathers the data.
    COLLECTOR = RocmCollector

    # Task class that evaluates the gathered data.
    ANALYZER = RocmAnalyzer

    # Argument model accepted by the analyzer.
    ANALYZER_ARGS = RocmAnalyzerArgs
@@ -0,0 +1,62 @@
1
+ ###############################################################################
2
+ #
3
+ # MIT License
4
+ #
5
+ # Copyright (c) 2025 Advanced Micro Devices, Inc.
6
+ #
7
+ # Permission is hereby granted, free of charge, to any person obtaining a copy
8
+ # of this software and associated documentation files (the "Software"), to deal
9
+ # in the Software without restriction, including without limitation the rights
10
+ # to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
11
+ # copies of the Software, and to permit persons to whom the Software is
12
+ # furnished to do so, subject to the following conditions:
13
+ #
14
+ # The above copyright notice and this permission notice shall be included in all
15
+ # copies or substantial portions of the Software.
16
+ #
17
+ # THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
18
+ # IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
19
+ # FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
20
+ # AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
21
+ # LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
22
+ # OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
23
+ # SOFTWARE.
24
+ #
25
+ ###############################################################################
26
+ import re
27
+ from typing import List
28
+
29
+ from pydantic import field_validator
30
+
31
+ from nodescraper.models import DataModel
32
+
33
+
34
class RocmDataModel(DataModel):
    """ROCm information for one system.

    Only ``rocm_version`` is required; every other field defaults to empty.
    """

    # Validated version string (see validate_rocm_version).
    rocm_version: str
    # Lines of rocminfo output.
    rocminfo: List[str] = []
    # Latest versioned install directory, or "" when none is recorded.
    rocm_latest_versioned_path: str = ""
    # ROCm install directories.
    rocm_all_paths: List[str] = []
    # ROCm-related entries from the dynamic linker configuration.
    ld_conf_rocm: List[str] = []
    # ROCm-related libraries.
    rocm_libs: List[str] = []
    # ROCm-related environment variables.
    env_vars: List[str] = []
    # Lines of clinfo output.
    clinfo: List[str] = []
    # KFD process entries.
    kfd_proc: List[str] = []

    @field_validator("rocm_version")
    @classmethod
    def validate_rocm_version(cls, rocm_version: str) -> str:
        """
        Check that the ROCm version string is well formed.

        Accepted form: one to four dot-separated numeric components, optionally
        followed by ``-<digits>``. Because the pattern is anchored with ``$``
        under ``re.match``, a single trailing newline is also tolerated.

        Args:
            rocm_version (str): The ROCm version string to validate.

        Raises:
            ValueError: If the ROCm version does not match the expected format.

        Returns:
            str: The ROCm version string, unchanged.
        """
        version_pattern = r"^\d+(?:\.\d+){0,3}(-\d+)?$"
        if re.match(version_pattern, rocm_version):
            return rocm_version
        raise ValueError(f"ROCm version has invalid format: {rocm_version}")
@@ -0,0 +1,25 @@
1
+ ###############################################################################
2
+ #
3
+ # MIT License
4
+ #
5
+ # Copyright (c) 2025 Advanced Micro Devices, Inc.
6
+ #
7
+ # Permission is hereby granted, free of charge, to any person obtaining a copy
8
+ # of this software and associated documentation files (the "Software"), to deal
9
+ # in the Software without restriction, including without limitation the rights
10
+ # to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
11
+ # copies of the Software, and to permit persons to whom the Software is
12
+ # furnished to do so, subject to the following conditions:
13
+ #
14
+ # The above copyright notice and this permission notice shall be included in all
15
+ # copies or substantial portions of the Software.
16
+ #
17
+ # THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
18
+ # IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
19
+ # FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
20
+ # AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
21
+ # LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
22
+ # OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
23
+ # SOFTWARE.
24
+ #
25
+ ###############################################################################
@@ -0,0 +1,38 @@
1
+ ###############################################################################
2
+ #
3
+ # MIT License
4
+ #
5
+ # Copyright (c) 2025 Advanced Micro Devices, Inc.
6
+ #
7
+ # Permission is hereby granted, free of charge, to any person obtaining a copy
8
+ # of this software and associated documentation files (the "Software"), to deal
9
+ # in the Software without restriction, including without limitation the rights
10
+ # to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
11
+ # copies of the Software, and to permit persons to whom the Software is
12
+ # furnished to do so, subject to the following conditions:
13
+ #
14
+ # The above copyright notice and this permission notice shall be included in all
15
+ # copies or substantial portions of the Software.
16
+ #
17
+ # THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
18
+ # IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
19
+ # FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
20
+ # AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
21
+ # LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
22
+ # OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
23
+ # SOFTWARE.
24
+ #
25
+ ###############################################################################
26
+ from typing import Optional
27
+
28
+ from pydantic import Field
29
+
30
+ from nodescraper.models.analyzerargs import AnalyzerArgs
31
+
32
+
33
class StorageAnalyzerArgs(AnalyzerArgs):
    """Arguments accepted by the storage analyzer.

    All fields are optional; the defaults are ``None``, an empty list, or
    ``False`` as declared below.
    """

    # Minimum free space as an absolute amount.
    # NOTE(review): presumably a size string such as "10GB" - confirm format.
    min_required_free_space_abs: Optional[str] = None
    # Minimum free space as a percentage (presumably 0-100 - confirm).
    min_required_free_space_prct: Optional[int] = None
    # Devices to leave out of the check; a fresh empty list per instance.
    ignore_devices: Optional[list[str]] = Field(default_factory=list)
    # Devices to check; a fresh empty list per instance.
    check_devices: Optional[list[str]] = Field(default_factory=list)
    # NOTE(review): presumably treats device names as regex patterns - confirm.
    regex_match: bool = False
@@ -0,0 +1,31 @@
1
+ ###############################################################################
2
+ #
3
+ # MIT License
4
+ #
5
+ # Copyright (c) 2025 Advanced Micro Devices, Inc.
6
+ #
7
+ # Permission is hereby granted, free of charge, to any person obtaining a copy
8
+ # of this software and associated documentation files (the "Software"), to deal
9
+ # in the Software without restriction, including without limitation the rights
10
+ # to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
11
+ # copies of the Software, and to permit persons to whom the Software is
12
+ # furnished to do so, subject to the following conditions:
13
+ #
14
+ # The above copyright notice and this permission notice shall be included in all
15
+ # copies or substantial portions of the Software.
16
+ #
17
+ # THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
18
+ # IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
19
+ # FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
20
+ # AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
21
+ # LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
22
+ # OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
23
+ # SOFTWARE.
24
+ #
25
+ ###############################################################################
26
+
27
+ from nodescraper.models import CollectorArgs
28
+
29
+
30
class StorageCollectorArgs(CollectorArgs):
    """Arguments accepted by the storage collector."""

    # Off by default.
    # NOTE(review): presumably skips commands that need sudo - confirm in collector.
    skip_sudo: bool = False
@@ -0,0 +1,152 @@
1
+ ###############################################################################
2
+ #
3
+ # MIT License
4
+ #
5
+ # Copyright (c) 2025 Advanced Micro Devices, Inc.
6
+ #
7
+ # Permission is hereby granted, free of charge, to any person obtaining a copy
8
+ # of this software and associated documentation files (the "Software"), to deal
9
+ # in the Software without restriction, including without limitation the rights
10
+ # to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
11
+ # copies of the Software, and to permit persons to whom the Software is
12
+ # furnished to do so, subject to the following conditions:
13
+ #
14
+ # The above copyright notice and this permission notice shall be included in all
15
+ # copies or substantial portions of the Software.
16
+ #
17
+ # THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
18
+ # IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
19
+ # FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
20
+ # AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
21
+ # LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
22
+ # OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
23
+ # SOFTWARE.
24
+ #
25
+ ###############################################################################
26
+ import re
27
+ from typing import Optional
28
+
29
+ from nodescraper.enums import EventCategory, EventPriority, ExecutionStatus
30
+ from nodescraper.interfaces import DataAnalyzer
31
+ from nodescraper.models import TaskResult
32
+ from nodescraper.utils import bytes_to_human_readable, convert_to_bytes
33
+
34
+ from .analyzer_args import StorageAnalyzerArgs
35
+ from .storagedata import StorageDataModel
36
+
37
+
38
class StorageAnalyzer(DataAnalyzer[StorageDataModel, StorageAnalyzerArgs]):
    """Check storage usage

    Every collected device is compared against an absolute and/or percentage
    free-space threshold; devices below a threshold are reported as failing.
    """

    DATA_MODEL = StorageDataModel

    def _matches_device_filter(
        self, device_name: str, exp_devices: list[str], regex_match: bool
    ) -> bool:
        """Check if the device name matches any of the expected devices

        Args:
            device_name (str): device name to check
            exp_devices (list[str]): list of expected devices to match against
            regex_match (bool): if True, use regex matching; otherwise, use exact match

        Returns:
            bool: True if the device name matches any of the expected devices, False otherwise
        """
        if not regex_match:
            # Exact comparison against every configured entry.
            return device_name in exp_devices

        for pattern in exp_devices:
            try:
                compiled = re.compile(pattern)
            except re.error:
                # An unusable pattern is reported and skipped; remaining patterns still apply.
                self._log_event(
                    category=EventCategory.STORAGE,
                    description=f"Invalid regex pattern: {pattern}",
                    priority=EventPriority.ERROR,
                )
                continue
            # re.match anchors the pattern at the start of the device name only.
            if compiled.match(device_name):
                return True
        return False

    def analyze_data(
        self, data: StorageDataModel, args: Optional[StorageAnalyzerArgs] = None
    ) -> TaskResult:
        """Analyze the storage data to check if there is enough free space

        When no arguments, or arguments without any threshold, are supplied, a
        minimum of 10% free space is required.

        Args:
            data (StorageDataModel): storage data to analyze
            args (Optional[StorageAnalyzerArgs], optional): storage analysis arguments.
                Defaults to None. If both thresholds are unset, the percentage
                threshold on the passed object is set to 10.

        Returns:
            TaskResult: NOT_RAN when there is no storage data, ERROR when at least one
                analyzed device is below a threshold, OK otherwise.
        """
        if args is None:
            args = StorageAnalyzerArgs(min_required_free_space_prct=10)
        else:
            thresholds = (args.min_required_free_space_abs, args.min_required_free_space_prct)
            if all(threshold is None for threshold in thresholds):
                args.min_required_free_space_prct = 10
                self.logger.warning(
                    "No thresholds provided for storage analyzer arguments; defaulting to 10% free"
                )

        if not data.storage_data:
            self.result.message = "No storage data available"
            self.result.status = ExecutionStatus.NOT_RAN
            return self.result

        self.result.status = ExecutionStatus.OK
        passing_devices = []
        failing_devices = []
        for device_name, device_data in data.storage_data.items():
            # check_devices takes precedence: ignore_devices is only consulted
            # when no check list was configured.
            if args.check_devices:
                skip_device = not self._matches_device_filter(
                    device_name, args.check_devices, args.regex_match
                )
            elif args.ignore_devices:
                skip_device = self._matches_device_filter(
                    device_name, args.ignore_devices, args.regex_match
                )
            else:
                skip_device = False
            if skip_device:
                continue

            # Absolute threshold: passes trivially when not configured.
            if args.min_required_free_space_abs:
                required_bytes = convert_to_bytes(args.min_required_free_space_abs)
                available_bytes = convert_to_bytes(str(device_data.free))
                has_enough_space = bool(available_bytes and available_bytes > required_bytes)
            else:
                has_enough_space = True

            # Percentage threshold: device_data.percent is treated as the used
            # percentage, so the free share is its complement.
            if args.min_required_free_space_prct:
                free_prct = 100 - device_data.percent
                if has_enough_space and not free_prct > args.min_required_free_space_prct:
                    has_enough_space = False

            if has_enough_space:
                passing_devices.append(device_name)
                continue

            total_bytes = convert_to_bytes(str(device_data.total))
            used_prct = device_data.percent
            failing_devices.append(device_name)
            event_data = {
                "offending_device": {
                    "device": device_name,
                    "total": device_data.total,
                    "free": device_data.free,
                    "percent": device_data.percent,
                },
            }
            self._log_event(
                category=EventCategory.STORAGE,
                description=f"Insufficient disk space: {bytes_to_human_readable(total_bytes)} and {used_prct}%, used on {device_name}",
                data=event_data,
                priority=EventPriority.CRITICAL,
                console_log=True,
            )

        if failing_devices:
            failing_list = ", ".join(failing_devices)
            self.result.message = f"Insufficient disk space on [{failing_list}]"
            self.result.status = ExecutionStatus.ERROR
        else:
            passing_list = ", ".join(passing_devices)
            self.result.message = f"Sufficient disk space available on [{passing_list}]"
        return self.result
@@ -0,0 +1,110 @@
1
+ ###############################################################################
2
+ #
3
+ # MIT License
4
+ #
5
+ # Copyright (c) 2025 Advanced Micro Devices, Inc.
6
+ #
7
+ # Permission is hereby granted, free of charge, to any person obtaining a copy
8
+ # of this software and associated documentation files (the "Software"), to deal
9
+ # in the Software without restriction, including without limitation the rights
10
+ # to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
11
+ # copies of the Software, and to permit persons to whom the Software is
12
+ # furnished to do so, subject to the following conditions:
13
+ #
14
+ # The above copyright notice and this permission notice shall be included in all
15
+ # copies or substantial portions of the Software.
16
+ #
17
+ # THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
18
+ # IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
19
+ # FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
20
+ # AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
21
+ # LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
22
+ # OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
23
+ # SOFTWARE.
24
+ #
25
+ ###############################################################################
26
+ import re
27
+ from typing import Optional
28
+
29
+ from nodescraper.base import InBandDataCollector
30
+ from nodescraper.enums import EventCategory, EventPriority, ExecutionStatus, OSFamily
31
+ from nodescraper.models import TaskResult
32
+
33
+ from .collector_args import StorageCollectorArgs
34
+ from .storagedata import DeviceStorageData, StorageDataModel
35
+
36
+
37
class StorageCollector(InBandDataCollector[StorageDataModel, None]):
    """Collect disk usage details

    Windows systems are queried with wmic; every other OS family is queried
    with df (run with sudo).
    """

    DATA_MODEL = StorageDataModel
    CMD_WINDOWS = """wmic LogicalDisk Where DriveType="3" Get DeviceId,Size,FreeSpace"""
    # Local filesystems only, sizes in bytes, rows containing "boot" removed.
    CMD = """sh -c 'df -lH -B1 | grep -v 'boot''"""

    def _parse_wmic_output(self, stdout: str) -> dict[str, DeviceStorageData]:
        """Convert the output of CMD_WINDOWS into per-device storage records."""
        devices = {}
        # The first line is the column header.
        for row in stdout.splitlines()[1:]:
            if not row:
                continue
            # Values are unpacked as device id, free space, size; presumably wmic
            # prints the requested columns alphabetically - verify against real output.
            device_id, free_space, size = row.split()
            total_bytes = int(size)
            free_bytes = int(free_space)
            used_bytes = total_bytes - free_bytes
            devices[device_id] = DeviceStorageData(
                total=total_bytes,
                free=free_bytes,
                used=used_bytes,
                percent=round(used_bytes / total_bytes * 100, 2),
            )
        return devices

    def _parse_df_output(self, stdout: str) -> dict[str, DeviceStorageData]:
        """Convert the output of CMD into per-device storage records."""
        devices = {}
        # The first line is the column header.
        for row in stdout.splitlines()[1:]:
            if not row:
                continue
            # Only the first five columns are used; any further columns are ignored.
            device_id, size, used, available, percent = row.strip().split()[:5]
            if device_id in ["tmpfs", "overlay"]:
                continue
            devices[device_id] = DeviceStorageData(
                total=int(size),
                free=int(available),
                used=int(used),
                # The use column carries a trailing "%" that must be removed.
                percent=float(re.sub(r"%", "", percent)),
            )
        return devices

    def collect_data(
        self, args: Optional[StorageCollectorArgs] = None
    ) -> tuple[TaskResult, Optional[StorageDataModel]]:
        """Read storage usage data from the system under test

        Args:
            args (Optional[StorageCollectorArgs], optional): collector arguments.
                Defaults to None, in which case default arguments are used.

        Returns:
            tuple[TaskResult, Optional[StorageDataModel]]: the task result and the
                collected data, or None when collection was skipped or no device
                was found.
        """
        if args is None:
            args = StorageCollectorArgs()

        if self.system_info.os_family == OSFamily.WINDOWS:
            res = self._run_sut_cmd(self.CMD_WINDOWS)
            parse_output = self._parse_wmic_output
        else:
            # The df command is run with sudo, so it is skipped entirely on request.
            if args.skip_sudo:
                self.result.message = "Skipping sudo plugin"
                self.result.status = ExecutionStatus.NOT_RAN
                return self.result, None
            res = self._run_sut_cmd(self.CMD, sudo=True)
            parse_output = self._parse_df_output

        if res.exit_code == 0:
            storage_data = parse_output(res.stdout)
        else:
            storage_data = {}
            self._log_event(
                category=EventCategory.OS,
                description="Error checking available storage",
                data={
                    "command": res.command,
                    "exit_code": res.exit_code,
                    "stderr": res.stderr,
                },
                priority=EventPriority.ERROR,
                console_log=True,
            )

        if not storage_data:
            self.result.message = "Storage info not found"
            self.result.status = ExecutionStatus.ERROR
            return self.result, None

        # Devices are stored in order of increasing total size.
        storage_data = dict(sorted(storage_data.items(), key=lambda entry: entry[1].total))
        storage_model = StorageDataModel(storage_data=storage_data)
        self._log_event(
            category="STORAGE_READ",
            description="Available storage read",
            data=storage_model.model_dump(),
            priority=EventPriority.INFO,
        )
        self.result.message = f"{len(storage_model.storage_data)} storage devices collected"
        self.result.status = ExecutionStatus.OK
        return self.result, storage_model