amd-node-scraper 0.0.1__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (197) hide show
  1. amd_node_scraper-0.0.1.dist-info/LICENSE +21 -0
  2. amd_node_scraper-0.0.1.dist-info/METADATA +424 -0
  3. amd_node_scraper-0.0.1.dist-info/RECORD +197 -0
  4. amd_node_scraper-0.0.1.dist-info/WHEEL +5 -0
  5. amd_node_scraper-0.0.1.dist-info/entry_points.txt +2 -0
  6. amd_node_scraper-0.0.1.dist-info/top_level.txt +1 -0
  7. nodescraper/__init__.py +32 -0
  8. nodescraper/base/__init__.py +34 -0
  9. nodescraper/base/inbandcollectortask.py +118 -0
  10. nodescraper/base/inbanddataplugin.py +39 -0
  11. nodescraper/base/regexanalyzer.py +120 -0
  12. nodescraper/cli/__init__.py +29 -0
  13. nodescraper/cli/cli.py +511 -0
  14. nodescraper/cli/constants.py +27 -0
  15. nodescraper/cli/dynamicparserbuilder.py +171 -0
  16. nodescraper/cli/helper.py +517 -0
  17. nodescraper/cli/inputargtypes.py +129 -0
  18. nodescraper/configbuilder.py +123 -0
  19. nodescraper/configregistry.py +66 -0
  20. nodescraper/configs/node_status.json +19 -0
  21. nodescraper/connection/__init__.py +25 -0
  22. nodescraper/connection/inband/__init__.py +46 -0
  23. nodescraper/connection/inband/inband.py +171 -0
  24. nodescraper/connection/inband/inbandlocal.py +93 -0
  25. nodescraper/connection/inband/inbandmanager.py +151 -0
  26. nodescraper/connection/inband/inbandremote.py +173 -0
  27. nodescraper/connection/inband/sshparams.py +43 -0
  28. nodescraper/constants.py +26 -0
  29. nodescraper/enums/__init__.py +40 -0
  30. nodescraper/enums/eventcategory.py +89 -0
  31. nodescraper/enums/eventpriority.py +42 -0
  32. nodescraper/enums/executionstatus.py +44 -0
  33. nodescraper/enums/osfamily.py +34 -0
  34. nodescraper/enums/systeminteraction.py +41 -0
  35. nodescraper/enums/systemlocation.py +33 -0
  36. nodescraper/generictypes.py +36 -0
  37. nodescraper/interfaces/__init__.py +44 -0
  38. nodescraper/interfaces/connectionmanager.py +143 -0
  39. nodescraper/interfaces/dataanalyzertask.py +138 -0
  40. nodescraper/interfaces/datacollectortask.py +185 -0
  41. nodescraper/interfaces/dataplugin.py +356 -0
  42. nodescraper/interfaces/plugin.py +127 -0
  43. nodescraper/interfaces/resultcollator.py +56 -0
  44. nodescraper/interfaces/task.py +164 -0
  45. nodescraper/interfaces/taskresulthook.py +39 -0
  46. nodescraper/models/__init__.py +48 -0
  47. nodescraper/models/analyzerargs.py +93 -0
  48. nodescraper/models/collectorargs.py +30 -0
  49. nodescraper/models/connectionconfig.py +34 -0
  50. nodescraper/models/datamodel.py +171 -0
  51. nodescraper/models/datapluginresult.py +39 -0
  52. nodescraper/models/event.py +158 -0
  53. nodescraper/models/pluginconfig.py +38 -0
  54. nodescraper/models/pluginresult.py +39 -0
  55. nodescraper/models/systeminfo.py +44 -0
  56. nodescraper/models/taskresult.py +185 -0
  57. nodescraper/models/timerangeargs.py +38 -0
  58. nodescraper/pluginexecutor.py +274 -0
  59. nodescraper/pluginregistry.py +152 -0
  60. nodescraper/plugins/__init__.py +25 -0
  61. nodescraper/plugins/inband/__init__.py +25 -0
  62. nodescraper/plugins/inband/amdsmi/__init__.py +28 -0
  63. nodescraper/plugins/inband/amdsmi/amdsmi_analyzer.py +821 -0
  64. nodescraper/plugins/inband/amdsmi/amdsmi_collector.py +1313 -0
  65. nodescraper/plugins/inband/amdsmi/amdsmi_plugin.py +43 -0
  66. nodescraper/plugins/inband/amdsmi/amdsmidata.py +1002 -0
  67. nodescraper/plugins/inband/amdsmi/analyzer_args.py +50 -0
  68. nodescraper/plugins/inband/amdsmi/cper.py +65 -0
  69. nodescraper/plugins/inband/bios/__init__.py +29 -0
  70. nodescraper/plugins/inband/bios/analyzer_args.py +64 -0
  71. nodescraper/plugins/inband/bios/bios_analyzer.py +93 -0
  72. nodescraper/plugins/inband/bios/bios_collector.py +93 -0
  73. nodescraper/plugins/inband/bios/bios_plugin.py +43 -0
  74. nodescraper/plugins/inband/bios/biosdata.py +30 -0
  75. nodescraper/plugins/inband/cmdline/__init__.py +25 -0
  76. nodescraper/plugins/inband/cmdline/analyzer_args.py +80 -0
  77. nodescraper/plugins/inband/cmdline/cmdline_analyzer.py +113 -0
  78. nodescraper/plugins/inband/cmdline/cmdline_collector.py +77 -0
  79. nodescraper/plugins/inband/cmdline/cmdline_plugin.py +43 -0
  80. nodescraper/plugins/inband/cmdline/cmdlinedata.py +30 -0
  81. nodescraper/plugins/inband/device_enumeration/__init__.py +29 -0
  82. nodescraper/plugins/inband/device_enumeration/analyzer_args.py +73 -0
  83. nodescraper/plugins/inband/device_enumeration/device_enumeration_analyzer.py +81 -0
  84. nodescraper/plugins/inband/device_enumeration/device_enumeration_collector.py +176 -0
  85. nodescraper/plugins/inband/device_enumeration/device_enumeration_plugin.py +45 -0
  86. nodescraper/plugins/inband/device_enumeration/deviceenumdata.py +36 -0
  87. nodescraper/plugins/inband/dimm/__init__.py +25 -0
  88. nodescraper/plugins/inband/dimm/collector_args.py +31 -0
  89. nodescraper/plugins/inband/dimm/dimm_collector.py +151 -0
  90. nodescraper/plugins/inband/dimm/dimm_plugin.py +40 -0
  91. nodescraper/plugins/inband/dimm/dimmdata.py +30 -0
  92. nodescraper/plugins/inband/dkms/__init__.py +25 -0
  93. nodescraper/plugins/inband/dkms/analyzer_args.py +85 -0
  94. nodescraper/plugins/inband/dkms/dkms_analyzer.py +106 -0
  95. nodescraper/plugins/inband/dkms/dkms_collector.py +76 -0
  96. nodescraper/plugins/inband/dkms/dkms_plugin.py +43 -0
  97. nodescraper/plugins/inband/dkms/dkmsdata.py +33 -0
  98. nodescraper/plugins/inband/dmesg/__init__.py +28 -0
  99. nodescraper/plugins/inband/dmesg/analyzer_args.py +33 -0
  100. nodescraper/plugins/inband/dmesg/collector_args.py +39 -0
  101. nodescraper/plugins/inband/dmesg/dmesg_analyzer.py +503 -0
  102. nodescraper/plugins/inband/dmesg/dmesg_collector.py +164 -0
  103. nodescraper/plugins/inband/dmesg/dmesg_plugin.py +44 -0
  104. nodescraper/plugins/inband/dmesg/dmesgdata.py +116 -0
  105. nodescraper/plugins/inband/fabrics/__init__.py +28 -0
  106. nodescraper/plugins/inband/fabrics/fabrics_collector.py +726 -0
  107. nodescraper/plugins/inband/fabrics/fabrics_plugin.py +37 -0
  108. nodescraper/plugins/inband/fabrics/fabricsdata.py +140 -0
  109. nodescraper/plugins/inband/journal/__init__.py +28 -0
  110. nodescraper/plugins/inband/journal/collector_args.py +33 -0
  111. nodescraper/plugins/inband/journal/journal_collector.py +107 -0
  112. nodescraper/plugins/inband/journal/journal_plugin.py +40 -0
  113. nodescraper/plugins/inband/journal/journaldata.py +44 -0
  114. nodescraper/plugins/inband/kernel/__init__.py +25 -0
  115. nodescraper/plugins/inband/kernel/analyzer_args.py +64 -0
  116. nodescraper/plugins/inband/kernel/kernel_analyzer.py +91 -0
  117. nodescraper/plugins/inband/kernel/kernel_collector.py +129 -0
  118. nodescraper/plugins/inband/kernel/kernel_plugin.py +43 -0
  119. nodescraper/plugins/inband/kernel/kerneldata.py +32 -0
  120. nodescraper/plugins/inband/kernel_module/__init__.py +25 -0
  121. nodescraper/plugins/inband/kernel_module/analyzer_args.py +59 -0
  122. nodescraper/plugins/inband/kernel_module/kernel_module_analyzer.py +211 -0
  123. nodescraper/plugins/inband/kernel_module/kernel_module_collector.py +264 -0
  124. nodescraper/plugins/inband/kernel_module/kernel_module_data.py +60 -0
  125. nodescraper/plugins/inband/kernel_module/kernel_module_plugin.py +43 -0
  126. nodescraper/plugins/inband/memory/__init__.py +25 -0
  127. nodescraper/plugins/inband/memory/analyzer_args.py +45 -0
  128. nodescraper/plugins/inband/memory/memory_analyzer.py +98 -0
  129. nodescraper/plugins/inband/memory/memory_collector.py +330 -0
  130. nodescraper/plugins/inband/memory/memory_plugin.py +43 -0
  131. nodescraper/plugins/inband/memory/memorydata.py +90 -0
  132. nodescraper/plugins/inband/network/__init__.py +28 -0
  133. nodescraper/plugins/inband/network/network_collector.py +1828 -0
  134. nodescraper/plugins/inband/network/network_plugin.py +37 -0
  135. nodescraper/plugins/inband/network/networkdata.py +319 -0
  136. nodescraper/plugins/inband/nvme/__init__.py +28 -0
  137. nodescraper/plugins/inband/nvme/nvme_collector.py +167 -0
  138. nodescraper/plugins/inband/nvme/nvme_plugin.py +37 -0
  139. nodescraper/plugins/inband/nvme/nvmedata.py +45 -0
  140. nodescraper/plugins/inband/os/__init__.py +25 -0
  141. nodescraper/plugins/inband/os/analyzer_args.py +64 -0
  142. nodescraper/plugins/inband/os/os_analyzer.py +73 -0
  143. nodescraper/plugins/inband/os/os_collector.py +131 -0
  144. nodescraper/plugins/inband/os/os_plugin.py +43 -0
  145. nodescraper/plugins/inband/os/osdata.py +31 -0
  146. nodescraper/plugins/inband/package/__init__.py +25 -0
  147. nodescraper/plugins/inband/package/analyzer_args.py +48 -0
  148. nodescraper/plugins/inband/package/package_analyzer.py +253 -0
  149. nodescraper/plugins/inband/package/package_collector.py +273 -0
  150. nodescraper/plugins/inband/package/package_plugin.py +43 -0
  151. nodescraper/plugins/inband/package/packagedata.py +41 -0
  152. nodescraper/plugins/inband/pcie/__init__.py +29 -0
  153. nodescraper/plugins/inband/pcie/analyzer_args.py +63 -0
  154. nodescraper/plugins/inband/pcie/pcie_analyzer.py +1081 -0
  155. nodescraper/plugins/inband/pcie/pcie_collector.py +690 -0
  156. nodescraper/plugins/inband/pcie/pcie_data.py +2017 -0
  157. nodescraper/plugins/inband/pcie/pcie_plugin.py +43 -0
  158. nodescraper/plugins/inband/process/__init__.py +25 -0
  159. nodescraper/plugins/inband/process/analyzer_args.py +45 -0
  160. nodescraper/plugins/inband/process/collector_args.py +31 -0
  161. nodescraper/plugins/inband/process/process_analyzer.py +91 -0
  162. nodescraper/plugins/inband/process/process_collector.py +115 -0
  163. nodescraper/plugins/inband/process/process_plugin.py +46 -0
  164. nodescraper/plugins/inband/process/processdata.py +34 -0
  165. nodescraper/plugins/inband/rocm/__init__.py +25 -0
  166. nodescraper/plugins/inband/rocm/analyzer_args.py +66 -0
  167. nodescraper/plugins/inband/rocm/rocm_analyzer.py +100 -0
  168. nodescraper/plugins/inband/rocm/rocm_collector.py +205 -0
  169. nodescraper/plugins/inband/rocm/rocm_plugin.py +43 -0
  170. nodescraper/plugins/inband/rocm/rocmdata.py +62 -0
  171. nodescraper/plugins/inband/storage/__init__.py +25 -0
  172. nodescraper/plugins/inband/storage/analyzer_args.py +38 -0
  173. nodescraper/plugins/inband/storage/collector_args.py +31 -0
  174. nodescraper/plugins/inband/storage/storage_analyzer.py +152 -0
  175. nodescraper/plugins/inband/storage/storage_collector.py +110 -0
  176. nodescraper/plugins/inband/storage/storage_plugin.py +44 -0
  177. nodescraper/plugins/inband/storage/storagedata.py +70 -0
  178. nodescraper/plugins/inband/sysctl/__init__.py +29 -0
  179. nodescraper/plugins/inband/sysctl/analyzer_args.py +67 -0
  180. nodescraper/plugins/inband/sysctl/sysctl_analyzer.py +81 -0
  181. nodescraper/plugins/inband/sysctl/sysctl_collector.py +101 -0
  182. nodescraper/plugins/inband/sysctl/sysctl_plugin.py +43 -0
  183. nodescraper/plugins/inband/sysctl/sysctldata.py +42 -0
  184. nodescraper/plugins/inband/syslog/__init__.py +28 -0
  185. nodescraper/plugins/inband/syslog/syslog_collector.py +121 -0
  186. nodescraper/plugins/inband/syslog/syslog_plugin.py +37 -0
  187. nodescraper/plugins/inband/syslog/syslogdata.py +46 -0
  188. nodescraper/plugins/inband/uptime/__init__.py +25 -0
  189. nodescraper/plugins/inband/uptime/uptime_collector.py +88 -0
  190. nodescraper/plugins/inband/uptime/uptime_plugin.py +37 -0
  191. nodescraper/plugins/inband/uptime/uptimedata.py +31 -0
  192. nodescraper/resultcollators/__init__.py +25 -0
  193. nodescraper/resultcollators/tablesummary.py +159 -0
  194. nodescraper/taskresulthooks/__init__.py +28 -0
  195. nodescraper/taskresulthooks/filesystemloghook.py +88 -0
  196. nodescraper/typeutils.py +171 -0
  197. nodescraper/utils.py +412 -0
@@ -0,0 +1,1313 @@
1
+ ###############################################################################
2
+ #
3
+ # MIT License
4
+ #
5
+ # Copyright (c) 2025 Advanced Micro Devices, Inc.
6
+ #
7
+ # Permission is hereby granted, free of charge, to any person obtaining a copy
8
+ # of this software and associated documentation files (the "Software"), to deal
9
+ # in the Software without restriction, including without limitation the rights
10
+ # to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
11
+ # copies of the Software, and to permit persons to whom the Software is
12
+ # furnished to do so, subject to the following conditions:
13
+ #
14
+ # The above copyright notice and this permission notice shall be included in all
15
+ # copies or substantial portions of the Software.
16
+ #
17
+ # THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
18
+ # IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
19
+ # FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
20
+ # AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
21
+ # LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
22
+ # OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
23
+ # SOFTWARE.
24
+ #
25
+ ###############################################################################
26
+ import io
27
+ import json
28
+ import re
29
+ from tarfile import TarFile
30
+ from typing import Any, Dict, List, Optional, Union
31
+
32
+ from pydantic import ValidationError
33
+
34
+ from nodescraper.base.inbandcollectortask import InBandDataCollector
35
+ from nodescraper.enums import EventCategory, EventPriority, ExecutionStatus, OSFamily
36
+ from nodescraper.models import TaskResult
37
+ from nodescraper.models.datamodel import FileModel
38
+ from nodescraper.plugins.inband.amdsmi.amdsmidata import (
39
+ AmdSmiDataModel,
40
+ AmdSmiListItem,
41
+ AmdSmiStatic,
42
+ AmdSmiVersion,
43
+ EccState,
44
+ Fw,
45
+ FwListItem,
46
+ Partition,
47
+ PartitionCompute,
48
+ PartitionMemory,
49
+ Processes,
50
+ ProcessInfo,
51
+ ProcessListItem,
52
+ ProcessMemoryUsage,
53
+ ProcessUsage,
54
+ StaticAsic,
55
+ StaticBoard,
56
+ StaticBus,
57
+ StaticCacheInfoItem,
58
+ StaticClockData,
59
+ StaticDriver,
60
+ StaticFrequencyLevels,
61
+ StaticNuma,
62
+ StaticPolicy,
63
+ StaticRas,
64
+ StaticSocPstate,
65
+ StaticVbios,
66
+ StaticVram,
67
+ StaticXgmiPlpd,
68
+ ValueUnit,
69
+ )
70
+ from nodescraper.utils import get_exception_traceback
71
+
72
+
73
+ class AmdSmiCollector(InBandDataCollector[AmdSmiDataModel, None]):
74
+ """Class for collection of inband tool amd-smi data."""
75
+
76
+ AMD_SMI_EXE = "amd-smi"
77
+
78
+ SUPPORTED_OS_FAMILY: set[OSFamily] = {OSFamily.LINUX}
79
+
80
+ DATA_MODEL = AmdSmiDataModel
81
+
82
+ CMD_VERSION = "version --json"
83
+ CMD_LIST = "list --json"
84
+ CMD_PROCESS = "process --json"
85
+ CMD_PARTITION = "partition --json"
86
+ CMD_FIRMWARE = "firmware --json"
87
+ CMD_STATIC = "static -g all --json"
88
+ CMD_STATIC_GPU = "static -g {gpu_id} --json"
89
+ CMD_RAS = "ras --cper --folder={folder}"
90
+
91
+ def _check_amdsmi_installed(self) -> bool:
92
+ """Check if amd-smi is installed
93
+
94
+ Returns:
95
+ bool: True if amd-smi is installed, False otherwise
96
+ """
97
+ cmd_ret = self._run_sut_cmd("which amd-smi")
98
+ return bool(cmd_ret.exit_code == 0 and "no amd-smi in" not in cmd_ret.stdout)
99
+
100
+ def _run_amd_smi(self, cmd: str) -> Optional[str]:
101
+ """Run amd-smi command
102
+
103
+ Args:
104
+ cmd (str): command arguments to pass to amd-smi
105
+
106
+ Returns:
107
+ Optional[str]: stdout from command or None on error
108
+ """
109
+ cmd_ret = self._run_sut_cmd(f"{self.AMD_SMI_EXE} {cmd}")
110
+
111
+ # Check for known warnings and errors that can be handled
112
+ is_group_warning = (
113
+ "User is missing the following required groups" in cmd_ret.stderr
114
+ or "User is missing the following required groups" in cmd_ret.stdout
115
+ )
116
+
117
+ # Check for known amd-smi internal errors
118
+ is_amdsmi_internal_error = any(
119
+ pattern in cmd_ret.stderr for pattern in ["KeyError:", "AttributeError:", "IndexError:"]
120
+ )
121
+
122
+ # Log warning if user is missing group
123
+ if cmd_ret.stderr != "" or cmd_ret.exit_code != 0:
124
+ if is_amdsmi_internal_error:
125
+ self._log_event(
126
+ category=EventCategory.SW_DRIVER,
127
+ description="amd-smi internal error detected",
128
+ data={
129
+ "command": cmd,
130
+ "exit_code": cmd_ret.exit_code,
131
+ "stderr": cmd_ret.stderr,
132
+ },
133
+ priority=EventPriority.WARNING,
134
+ console_log=True,
135
+ )
136
+ return None
137
+ elif not is_group_warning:
138
+ self._log_event(
139
+ category=EventCategory.APPLICATION,
140
+ description="Error running amd-smi command",
141
+ data={
142
+ "command": cmd,
143
+ "exit_code": cmd_ret.exit_code,
144
+ "stderr": cmd_ret.stderr,
145
+ },
146
+ priority=EventPriority.ERROR,
147
+ console_log=True,
148
+ )
149
+ return None
150
+ else:
151
+ self._log_event(
152
+ category=EventCategory.APPLICATION,
153
+ description="amd-smi warning (continuing): User missing required groups",
154
+ data={
155
+ "command": cmd,
156
+ "warning": cmd_ret.stderr or cmd_ret.stdout,
157
+ },
158
+ priority=EventPriority.WARNING,
159
+ console_log=False,
160
+ )
161
+
162
+ stdout = cmd_ret.stdout
163
+ if is_group_warning and stdout:
164
+ lines = stdout.split("\n")
165
+ cleaned_lines = [
166
+ line
167
+ for line in lines
168
+ if not any(
169
+ warn in line
170
+ for warn in [
171
+ "RuntimeError:",
172
+ "WARNING: User is missing",
173
+ "Please add user to these groups",
174
+ ]
175
+ )
176
+ ]
177
+ stdout = "\n".join(cleaned_lines).strip()
178
+
179
+ return stdout
180
+
181
+ def _run_amd_smi_dict(self, cmd: str) -> Optional[Union[dict, list[dict]]]:
182
+ """Run amd-smi command with json output
183
+
184
+ Args:
185
+ cmd (str): command arguments to pass to amd-smi
186
+
187
+ Returns:
188
+ Optional[Union[dict, list[dict]]]: parsed JSON output or None on error
189
+ """
190
+ cmd += " --json"
191
+ cmd_ret = self._run_amd_smi(cmd)
192
+ if cmd_ret:
193
+ try:
194
+ # Try to parse as single JSON first
195
+ return json.loads(cmd_ret)
196
+ except json.JSONDecodeError as e:
197
+ # try to extract and parse multiple JSON objects
198
+ try:
199
+ json_objects = []
200
+ decoder = json.JSONDecoder()
201
+ idx = 0
202
+ cmd_ret_stripped = cmd_ret.strip()
203
+
204
+ while idx < len(cmd_ret_stripped):
205
+ while idx < len(cmd_ret_stripped) and cmd_ret_stripped[idx].isspace():
206
+ idx += 1
207
+
208
+ if idx >= len(cmd_ret_stripped):
209
+ break
210
+
211
+ if cmd_ret_stripped[idx] not in ["{", "["]:
212
+ break
213
+
214
+ try:
215
+ obj, end_idx = decoder.raw_decode(cmd_ret_stripped, idx)
216
+ json_objects.append(obj)
217
+ idx = end_idx
218
+ except json.JSONDecodeError:
219
+ break
220
+
221
+ if json_objects:
222
+ return json_objects if len(json_objects) > 1 else json_objects[0]
223
+ else:
224
+ raise
225
+
226
+ except Exception:
227
+ self._log_event(
228
+ category=EventCategory.APPLICATION,
229
+ description=f"Error parsing command: `{cmd}` json data",
230
+ data={
231
+ "cmd": cmd,
232
+ "exception": get_exception_traceback(e),
233
+ },
234
+ priority=EventPriority.ERROR,
235
+ console_log=True,
236
+ )
237
+ return None
238
+ return None
239
+
240
+ def _to_number(self, v: object) -> Optional[Union[int, float]]:
241
+ """Helper function to return number from str, float or "N/A"
242
+
243
+ Args:
244
+ v (object): non number object
245
+
246
+ Returns:
247
+ Optional[Union[int, float]]: number version of input
248
+ """
249
+ if v in (None, "", "N/A"):
250
+ return None
251
+ try:
252
+ if isinstance(v, (int, float)):
253
+ return v
254
+ if isinstance(v, str):
255
+ s = v.strip()
256
+ try:
257
+ return int(s)
258
+ except Exception:
259
+ return float(s)
260
+ return float(str(v))
261
+ except Exception:
262
+ return None
263
+
264
def _valueunit(self, v: object, unit: str, *, required: bool = False) -> Optional[ValueUnit]:
    """Wrap a raw value and its unit in a ValueUnit.

    Args:
        v (object): raw value, converted with ``_to_number``
        unit (str): unit of measurement
        required (bool, optional): when True, a value that cannot be converted
            yields ``ValueUnit(value=0, unit=unit)`` instead of None.
            Defaults to False.

    Returns:
        Optional[ValueUnit]: ValueUnit instance, or None when the value cannot
        be converted and ``required`` is False
    """
    number = self._to_number(v)
    if number is not None:
        return ValueUnit(value=number, unit=unit)
    if required:
        # Caller needs an instance no matter what; substitute zero
        return ValueUnit(value=0, unit=unit)
    return None
279
+
280
def _valueunit_req(self, v: object, unit: str) -> ValueUnit:
    """Build a ValueUnit that is always present.

    Delegates to ``_valueunit`` with ``required=True``.

    Args:
        v (object): raw value
        unit (str): unit of measurement

    Returns:
        ValueUnit: instance of ValueUnit
    """
    value_unit = self._valueunit(v, unit, required=True)
    # Narrow the Optional return of _valueunit to a definite ValueUnit
    assert value_unit is not None
    return value_unit
293
+
294
+ def _normalize(self, val: object, default: str = "unknown", slot_type: bool = False) -> str:
295
+ """Normalize strings
296
+
297
+ Args:
298
+ val (object): object
299
+ default (str, optional): default option. Defaults to "unknown".
300
+ slot_type (bool, optional): map to one of {'OAM','PCIE','CEM','Unknown'}. Defaults to False.
301
+
302
+ Returns:
303
+ str: normalized string
304
+ """
305
+ s = str(val).strip() if val is not None else ""
306
+ if not s or s.upper() == "N/A":
307
+ return "Unknown" if slot_type else default
308
+
309
+ if slot_type:
310
+ u = s.upper().replace(" ", "").replace("-", "")
311
+ if u == "OAM":
312
+ return "OAM"
313
+ if u in {"PCIE", "PCIEXPRESS", "PCIEXP"} or u.startswith("PCIE"):
314
+ return "PCIE"
315
+ if u == "CEM":
316
+ return "CEM"
317
+ return "Unknown"
318
+
319
+ return s
320
+
321
def _get_amdsmi_data(self) -> Optional[AmdSmiDataModel]:
    """Collect every amd-smi section and assemble the AmdSmi data model.

    If any sub-command raises, an ERROR event is logged, the task result is
    marked EXECUTION_FAILURE and None is returned. If the collected sections
    fail model validation, an ERROR event is logged and None is returned.

    Returns:
        Optional[AmdSmiDataModel]: instance of the AmdSmi data model
    """
    try:
        # Dict literals evaluate in order, so the commands run in this sequence
        sections = {
            "version": self._get_amdsmi_version(),
            "process": self.get_process(),
            "partition": self.get_partition(),
            "firmware": self.get_firmware(),
            "gpu_list": self.get_gpu_list(),
            "static": self.get_static(),
            "cper_data": self.get_cper_data(),
        }
    except Exception as exc:
        self._log_event(
            category=EventCategory.APPLICATION,
            description="Error running amd-smi sub commands",
            data={"exception": get_exception_traceback(exc)},
            priority=EventPriority.ERROR,
            console_log=True,
        )
        self.result.status = ExecutionStatus.EXECUTION_FAILURE
        return None

    try:
        return AmdSmiDataModel(**sections)
    except ValidationError as validation_error:
        self.logger.warning("Validation err: %s", validation_error)
        self._log_event(
            category=EventCategory.APPLICATION,
            description="Failed to build AmdSmiDataModel",
            data={"errors": validation_error.errors(include_url=False)},
            priority=EventPriority.ERROR,
        )
        return None
365
+
366
def _get_amdsmi_version(self) -> Optional[AmdSmiVersion]:
    """Get amd-smi version information.

    Accepts either a JSON list (first element is used) or a single JSON
    object from the version command.

    Returns:
        Optional[AmdSmiVersion]: version information or None on error
    """
    ret = self._run_amd_smi_dict(self.CMD_VERSION)
    if not ret:
        # Covers None as well as an empty list/dict
        return None

    version_data = ret[0] if isinstance(ret, list) else ret
    if not isinstance(version_data, dict):
        return None

    try:
        return AmdSmiVersion(
            tool="amdsmi",
            # NOTE(review): both fields are filled from "amdsmi_library_version";
            # confirm whether `version` should read the tool's own "version" key.
            version=version_data.get("amdsmi_library_version", ""),
            amdsmi_library_version=version_data.get("amdsmi_library_version", ""),
            rocm_version=version_data.get("rocm_version", ""),
        )
    except ValidationError as err:
        self._log_event(
            category=EventCategory.APPLICATION,
            description="Failed to build AmdSmiVersion",
            data={"errors": err.errors(include_url=False)},
            priority=EventPriority.WARNING,
        )
        return None
395
+
396
def get_gpu_list(self) -> Optional[list[AmdSmiListItem]]:
    """Collect the GPU inventory reported by the amd-smi list command.

    Entries that are not dicts are ignored. Entries that fail model
    validation are logged as WARNING events and left out.

    Returns:
        Optional[list[AmdSmiListItem]]: list of GPU info items; empty when the
        command produced no data
    """
    raw = self._run_amd_smi_dict(self.CMD_LIST)
    if not raw:
        return []

    def _as_int(value: Any, fallback: int = 0) -> int:
        # Best-effort integer conversion for the numeric id fields
        try:
            return int(value)
        except Exception:
            return fallback

    entries = raw if isinstance(raw, list) else [raw]
    gpus: list[AmdSmiListItem] = []

    for entry in entries:
        if not isinstance(entry, dict):
            continue

        try:
            gpu_item = AmdSmiListItem(
                gpu=_as_int(entry.get("gpu", 0)),
                bdf=str(entry.get("bdf", "")),
                uuid=str(entry.get("uuid", "")),
                kfd_id=_as_int(entry.get("kfd_id", 0)),
                node_id=_as_int(entry.get("node_id", 0)),
                partition_id=_as_int(entry.get("partition_id", 0)),
            )
        except ValidationError as err:
            self._log_event(
                category=EventCategory.APPLICATION,
                description="Failed to build AmdSmiListItem",
                data={"errors": err.errors(include_url=False), "item": entry},
                priority=EventPriority.WARNING,
            )
        else:
            gpus.append(gpu_item)

    return gpus
439
+
440
def get_process(self) -> Optional[list[Processes]]:
    """Get per-GPU process information from the amd-smi process command.

    Malformed fields do not abort the collection: a GPU index or pid that is
    not an integer falls back to 0, a ``memory_usage``/``engine_usage`` value
    that is not a dict is treated as empty, and a process entry that fails
    model validation is logged as a WARNING and skipped.

    Returns:
        Optional[list[Processes]]: list of GPU processes; empty when the
        command produced no data
    """
    ret = self._run_amd_smi_dict(self.CMD_PROCESS)
    if not ret:
        return []

    def _to_int(x: Any, default: int = 0) -> int:
        # None, "" and non-numeric text such as "N/A" all map to the default
        try:
            return int(x)
        except (TypeError, ValueError, OverflowError):
            return default

    def _as_dict(x: Any) -> dict:
        # Guards against placeholders like "N/A" where a mapping is expected
        return x if isinstance(x, dict) else {}

    process_data = ret if isinstance(ret, list) else [ret]
    out: list[Processes] = []

    for item in process_data:
        if not isinstance(item, dict):
            continue

        gpu_idx = _to_int(item.get("gpu"))
        process_list_raw = item.get("process_list", [])
        if not isinstance(process_list_raw, list):
            continue

        plist: list[ProcessListItem] = []

        for entry in process_list_raw:
            try:
                if isinstance(entry, dict):
                    mu = _as_dict(entry.get("memory_usage"))
                    eu = _as_dict(entry.get("engine_usage"))
                    process_info: Any = ProcessInfo(
                        name=str(entry.get("name", "N/A")),
                        pid=_to_int(entry.get("pid")),
                        memory_usage=ProcessMemoryUsage(
                            gtt_mem=self._valueunit(mu.get("gtt_mem"), "B"),
                            cpu_mem=self._valueunit(mu.get("cpu_mem"), "B"),
                            vram_mem=self._valueunit(mu.get("vram_mem"), "B"),
                        ),
                        mem_usage=self._valueunit(entry.get("mem"), "B"),
                        usage=ProcessUsage(
                            gfx=self._valueunit(eu.get("gfx"), "ns"),
                            enc=self._valueunit(eu.get("enc"), "ns"),
                        ),
                    )
                else:
                    # Non-dict entries are kept as their string form
                    process_info = str(entry)

                plist.append(ProcessListItem(process_info=process_info))
            except ValidationError as err:
                self._log_event(
                    category=EventCategory.APPLICATION,
                    description="Failed to build ProcessListItem; skipping entry",
                    data={
                        "errors": err.errors(include_url=False),
                        "gpu_index": gpu_idx,
                        "entry": repr(entry),
                    },
                    priority=EventPriority.WARNING,
                )
                continue

        try:
            out.append(Processes(gpu=gpu_idx, process_list=plist))
        except ValidationError as err:
            self._log_event(
                category=EventCategory.APPLICATION,
                description="Failed to build Processes",
                data={"errors": err.errors(include_url=False), "gpu_index": gpu_idx},
                priority=EventPriority.WARNING,
            )

    return out
527
+
528
def get_partition(self) -> Optional[Partition]:
    """Get memory and compute partition information for each GPU.

    The command output may be a single JSON document or several concatenated
    JSON arrays; nested lists are flattened into one list of per-GPU dicts.
    A GPU index that is not an integer falls back to 0 instead of aborting
    the collection.

    Returns:
        Optional[Partition]: Partition data if available, None when the
        command produced no data or the model could not be built
    """
    ret = self._run_amd_smi_dict(self.CMD_PARTITION)
    if not ret:
        return None

    def _gpu_index(raw: Any) -> int:
        # None, "" and non-numeric text such as "N/A" all map to GPU 0
        try:
            return int(raw)
        except (TypeError, ValueError, OverflowError):
            return 0

    partition_data = ret if isinstance(ret, list) else [ret]
    memparts: list[PartitionMemory] = []
    computeparts: list[PartitionCompute] = []

    # Flatten multi-JSON results (partition command returns multiple JSON arrays)
    flattened_data: list[dict] = []
    for item in partition_data:
        if isinstance(item, list):
            flattened_data.extend(entry for entry in item if isinstance(entry, dict))
        elif isinstance(item, dict):
            flattened_data.append(item)

    for item in flattened_data:
        gpu_idx = _gpu_index(item.get("gpu"))
        mem_pt = item.get("memory_partition")
        comp_pt = item.get("compute_partition")

        try:
            memparts.append(
                PartitionMemory(gpu_id=gpu_idx, partition_type=str(mem_pt) if mem_pt else None)
            )
        except ValidationError as err:
            self._log_event(
                category=EventCategory.APPLICATION,
                description="Failed to build PartitionMemory",
                data={
                    "errors": err.errors(include_url=False),
                    "gpu_index": gpu_idx,
                    "data": mem_pt,
                },
                priority=EventPriority.WARNING,
            )

        try:
            computeparts.append(
                PartitionCompute(
                    gpu_id=gpu_idx, partition_type=str(comp_pt) if comp_pt else None
                )
            )
        except ValidationError as err:
            self._log_event(
                category=EventCategory.APPLICATION,
                description="Failed to build PartitionCompute",
                data={
                    "errors": err.errors(include_url=False),
                    "gpu_index": gpu_idx,
                    "data": comp_pt,
                },
                priority=EventPriority.WARNING,
            )

    try:
        return Partition(memory_partition=memparts, compute_partition=computeparts)
    except ValidationError as err:
        self._log_event(
            category=EventCategory.APPLICATION,
            description="Failed to build Partition",
            data={"errors": err.errors(include_url=False)},
            priority=EventPriority.WARNING,
        )
        return None
602
+
603
def get_firmware(self) -> Optional[list[Fw]]:
    """Get firmware information for each GPU from the amd-smi firmware command.

    A GPU index that is not an integer falls back to 0. Firmware entries that
    are not dicts are logged as INFO events; entries that fail model
    validation are logged as WARNING events. Both are skipped without
    aborting the collection.

    Returns:
        Optional[list[Fw]]: List of firmware info per GPU; empty when the
        command produced no data
    """
    ret = self._run_amd_smi_dict(self.CMD_FIRMWARE)
    if not ret:
        return []

    def _gpu_index(raw: Any) -> int:
        # None, "" and non-numeric text such as "N/A" all map to GPU 0
        try:
            return int(raw)
        except (TypeError, ValueError, OverflowError):
            return 0

    firmware_data = ret if isinstance(ret, list) else [ret]
    out: list[Fw] = []

    for item in firmware_data:
        if not isinstance(item, dict):
            continue

        gpu_idx = _gpu_index(item.get("gpu"))
        fw_list_raw = item.get("fw_list", [])

        if not isinstance(fw_list_raw, list):
            continue

        normalized: list[FwListItem] = []
        for e in fw_list_raw:
            if not isinstance(e, dict):
                self._log_event(
                    category=EventCategory.APPLICATION,
                    description="Unrecognized firmware entry shape",
                    data={"entry_shape": repr(e)},
                    priority=EventPriority.INFO,
                )
                continue

            fid = e.get("fw_name")
            ver = e.get("fw_version")
            try:
                normalized.append(
                    FwListItem(
                        fw_id="" if fid is None else str(fid),
                        fw_version="" if ver is None else str(ver),
                    )
                )
            except ValidationError as err:
                self._log_event(
                    category=EventCategory.APPLICATION,
                    description="Failed to build FwListItem",
                    data={
                        "errors": err.errors(include_url=False),
                        "gpu_index": gpu_idx,
                        "entry": repr(e),
                    },
                    priority=EventPriority.WARNING,
                )

        try:
            out.append(Fw(gpu=gpu_idx, fw_list=normalized))
        except ValidationError as err:
            self._log_event(
                category=EventCategory.APPLICATION,
                description="Failed to build Fw",
                data={"errors": err.errors(include_url=False), "gpu_index": gpu_idx},
                priority=EventPriority.WARNING,
            )

    return out
656
+
657
def get_static(self) -> Optional[list[AmdSmiStatic]]:
    """Get static info from the amd-smi static command.

    The bulk query (``self.CMD_STATIC``) is tried first.  When it yields
    nothing, every GPU returned by ``self.get_gpu_list()`` is queried
    individually with ``self.CMD_STATIC_GPU`` and the results are merged.

    Each GPU record is converted into an ``AmdSmiStatic`` model.  A record
    whose model (or any of its sub-models) fails validation is logged and
    skipped so that the remaining GPUs are still reported.

    Returns:
        Optional[list[AmdSmiStatic]]: list of AmdSmiStatic instances or empty list
    """

    def _section(record: dict, key: str) -> dict:
        """Return ``record[key]`` when it is a dict, otherwise an empty dict."""
        value = record.get(key)
        return value if isinstance(value, dict) else {}

    def _int_or_na(raw: Any) -> Union[int, str]:
        """Map blank values to "N/A", keep strings verbatim, coerce the rest to int."""
        if raw in (None, "", "N/A"):
            return "N/A"
        if isinstance(raw, str):
            return raw
        return int(raw)

    ret = self._run_amd_smi_dict(self.CMD_STATIC)
    if not ret:
        self.logger.info("Bulk static query failed, attempting per-GPU fallback")
        gpu_list = self.get_gpu_list()
        if not gpu_list:
            return []

        fallback_data: list[dict] = []
        for gpu in gpu_list:
            gpu_data = self._run_amd_smi_dict(self.CMD_STATIC_GPU.format(gpu_id=gpu.gpu))
            if not gpu_data:
                continue
            if isinstance(gpu_data, dict):
                fallback_data.append(gpu_data)
            elif isinstance(gpu_data, list):
                fallback_data.extend(gpu_data)

        if not fallback_data:
            return []
        ret = fallback_data

    # Some outputs wrap the per-GPU records in a top-level "gpu_data" key.
    if isinstance(ret, dict) and "gpu_data" in ret:
        ret = ret["gpu_data"]

    static_data = ret if isinstance(ret, list) else [ret]
    out: list[AmdSmiStatic] = []

    for item in static_data:
        if not isinstance(item, dict) or "gpu" not in item:
            continue

        gpu_idx = int(item.get("gpu", 0)) if item.get("gpu") not in (None, "") else 0

        # Sections read with .get() below are coerced to dicts so a non-dict
        # placeholder (for example the string "N/A") cannot raise AttributeError.
        asic = _section(item, "asic")
        board = _section(item, "board")
        bus = _section(item, "bus")
        vbios = _section(item, "vbios")
        driver = _section(item, "driver")
        numa = _section(item, "numa")
        vram = _section(item, "vram")

        # Sections handed to the _parse_* helpers are passed through as-is.
        ras = item.get("ras", {}) or {}
        cache = item.get("cache", {}) or {}
        clock = item.get("clock", {}) or {}
        soc_pstate = item.get("soc_pstate", {}) or {}
        xgmi_plpd = item.get("xgmi_plpd", {}) or {}

        # ASIC / NUMA scalar normalization
        oam_id_val = _int_or_na(asic.get("oam_id"))
        num_cu_val = _int_or_na(asic.get("num_compute_units"))
        numa_node = int(numa.get("node", 0) or 0)
        affinity_val = _int_or_na(numa.get("affinity"))

        # VRAM: the size is reported in MB and stored in bytes.
        vram_type = str(vram.get("vram_type", "") or "unknown")
        vram_vendor = vram.get("vram_vendor")
        vram_bits = vram.get("vram_bit_width")
        vram_size_b: Optional[int] = None
        if vram.get("vram_size_mb") is not None:
            try:
                vram_size_b = int(vram["vram_size_mb"]) * 1024 * 1024
            except (TypeError, ValueError, OverflowError):
                vram_size_b = None

        try:
            # Bus / PCIe
            bus_model = StaticBus(
                bdf=str(bus.get("bdf", "")),
                max_pcie_width=self._valueunit(bus.get("max_pcie_width"), "x"),
                max_pcie_speed=self._valueunit(bus.get("max_pcie_speed"), "GT/s"),
                pcie_interface_version=self._normalize(bus.get("pcie_interface_version")),
                slot_type=self._normalize(bus.get("slot_type"), slot_type=True),
            )

            # ASIC
            asic_model = StaticAsic(
                market_name=self._normalize(
                    asic.get("market_name") or asic.get("asic_name"), default=""
                ),
                vendor_id=str(asic.get("vendor_id", "")),
                vendor_name=str(asic.get("vendor_name", "")),
                subvendor_id=str(asic.get("subvendor_id", "")),
                device_id=str(asic.get("device_id", "")),
                subsystem_id=str(asic.get("subsystem_id", "")),
                rev_id=str(asic.get("rev_id", "")),
                asic_serial=str(asic.get("asic_serial", "")),
                oam_id=oam_id_val,
                num_compute_units=num_cu_val,
                target_graphics_version=str(asic.get("target_graphics_version", "")),
            )

            # Board
            board_model = StaticBoard(
                model_number=str(
                    board.get("model_number", "") or board.get("amdsmi_model_number", "")
                ),
                product_serial=str(board.get("product_serial", "")),
                fru_id=str(board.get("fru_id", "")),
                product_name=str(board.get("product_name", "")),
                manufacturer_name=str(board.get("manufacturer_name", "")),
            )

            # Driver
            driver_model = StaticDriver(
                name=self._normalize(driver.get("driver_name"), default="unknown"),
                version=self._normalize(driver.get("driver_version"), default="unknown"),
            )

            # VBIOS (left as None when the section is absent or empty)
            vbios_model: Optional[StaticVbios] = None
            if vbios:
                vbios_model = StaticVbios(
                    name=str(vbios.get("vbios_name", "")),
                    build_date=str(vbios.get("vbios_build_date", "")),
                    part_number=str(vbios.get("vbios_part_number", "")),
                    version=str(vbios.get("vbios_version", "")),
                )

            # NUMA
            numa_model = StaticNuma(node=numa_node, affinity=affinity_val)

            # VRAM
            vram_model = StaticVram(
                type=vram_type,
                vendor=None if vram_vendor in (None, "", "N/A") else str(vram_vendor),
                size=self._valueunit(vram_size_b, "B"),
                bit_width=self._valueunit(vram_bits, "bit"),
                max_bandwidth=None,
            )

            # SOC P-state, XGMI PLPD, RAS, cache info and clock sections
            soc_pstate_model = self._parse_soc_pstate(soc_pstate)
            xgmi_plpd_model = self._parse_xgmi_plpd(xgmi_plpd)
            ras_model = self._parse_ras(ras)
            cache_info_model = self._parse_cache_info(cache)
            clock_dict_model = self._parse_clock_dict(clock)

            static_model = AmdSmiStatic(
                gpu=gpu_idx,
                asic=asic_model,
                bus=bus_model,
                vbios=vbios_model,
                limit=None,
                driver=driver_model,
                board=board_model,
                ras=ras_model,
                soc_pstate=soc_pstate_model,
                xgmi_plpd=xgmi_plpd_model,
                process_isolation="",
                numa=numa_model,
                vram=vram_model,
                cache_info=cache_info_model,
                partition=None,
                clock=clock_dict_model,
            )
        except ValidationError as err:
            # Covers the sub-models as well, so one malformed GPU record is
            # reported and skipped instead of aborting the whole collection.
            self.logger.error(err)
            self._log_event(
                category=EventCategory.APPLICATION,
                description="Failed to build AmdSmiStatic",
                data={"errors": err.errors(include_url=False), "gpu_index": gpu_idx},
                priority=EventPriority.WARNING,
            )
            continue

        out.append(static_model)

    return out
858
+
859
def _parse_soc_pstate(self, data: dict) -> Optional[StaticSocPstate]:
    """Parse SOC P-state data

    Integer fields that cannot be converted fall back to 0.  A policy entry
    is skipped when it is not a dict, when its ``policy_id`` cannot be
    converted to an integer, or when ``StaticPolicy`` rejects it.

    Args:
        data (dict): SOC P-state data from amd-smi

    Returns:
        Optional[StaticSocPstate]: StaticSocPstate instance, or None when the
        input is not a dict, carries no usable information, or fails
        validation
    """
    if not isinstance(data, dict):
        return None

    def _to_int(raw: Any) -> int:
        """Best-effort integer conversion; blank or malformed values become 0."""
        try:
            return int(raw or 0)
        except (TypeError, ValueError, OverflowError):
            return 0

    num_supported = _to_int(data.get("num_supported"))
    current_id = _to_int(data.get("current_id"))

    policies_raw = data.get("policies") or []
    policies: list[StaticPolicy] = []
    if isinstance(policies_raw, list):
        for p in policies_raw:
            if not isinstance(p, dict):
                continue
            pid = p.get("policy_id", 0)
            desc = p.get("policy_description", "")
            try:
                policies.append(
                    StaticPolicy(
                        policy_id=int(pid) if pid not in (None, "") else 0,
                        policy_description=str(desc),
                    )
                )
            except (ValidationError, TypeError, ValueError, OverflowError):
                # int(pid) raises ValueError/TypeError for malformed ids; skip
                # the entry the same way a model validation failure is skipped.
                continue

    # Nothing meaningful was reported for this section.
    if not num_supported and not current_id and not policies:
        return None

    try:
        return StaticSocPstate(
            num_supported=num_supported,
            current_id=current_id,
            policies=policies,
        )
    except ValidationError:
        return None
909
+
910
def _parse_xgmi_plpd(self, data: dict) -> Optional[StaticXgmiPlpd]:
    """Parse XGMI PLPD data

    Integer fields that cannot be converted fall back to 0.  A PLPD entry is
    skipped when it is not a dict, when its ``policy_id`` cannot be converted
    to an integer, or when ``StaticPolicy`` rejects it.

    Args:
        data (dict): XGMI PLPD data from amd-smi

    Returns:
        Optional[StaticXgmiPlpd]: StaticXgmiPlpd instance, or None when the
        input is not a dict, carries no usable information, or fails
        validation
    """
    if not isinstance(data, dict):
        return None

    def _to_int(raw: Any) -> int:
        """Best-effort integer conversion; blank or malformed values become 0."""
        try:
            return int(raw or 0)
        except (TypeError, ValueError, OverflowError):
            return 0

    num_supported = _to_int(data.get("num_supported"))
    current_id = _to_int(data.get("current_id"))

    plpds_raw = data.get("plpds") or []
    plpds: list[StaticPolicy] = []
    if isinstance(plpds_raw, list):
        for p in plpds_raw:
            if not isinstance(p, dict):
                continue
            pid = p.get("policy_id", 0)
            desc = p.get("policy_description", "")
            try:
                plpds.append(
                    StaticPolicy(
                        policy_id=int(pid) if pid not in (None, "") else 0,
                        policy_description=str(desc),
                    )
                )
            except (ValidationError, TypeError, ValueError, OverflowError):
                # int(pid) raises ValueError/TypeError for malformed ids; skip
                # the entry the same way a model validation failure is skipped.
                continue

    # Nothing meaningful was reported for this section.
    if not num_supported and not current_id and not plpds:
        return None

    try:
        return StaticXgmiPlpd(
            num_supported=num_supported,
            current_id=current_id,
            plpds=plpds,
        )
    except ValidationError:
        return None
960
+
961
def _parse_ras(self, data: dict) -> StaticRas:
    """Build the RAS/ECC model for one GPU.

    Args:
        data (dict): RAS data from amd-smi

    Returns:
        StaticRas: parsed RAS information.  When ``data`` is not a dict, or
        the parsed values fail validation, a placeholder is returned with
        ``eeprom_version="N/A"``, every schema set to ``EccState.NA`` and an
        empty ``ecc_block_state``.
    """

    def _placeholder() -> StaticRas:
        """Create the all-"N/A" RAS model used when no usable data exists."""
        return StaticRas(
            eeprom_version="N/A",
            parity_schema=EccState.NA,
            single_bit_schema=EccState.NA,
            double_bit_schema=EccState.NA,
            poison_schema=EccState.NA,
            ecc_block_state={},
        )

    def _ecc(raw: Any) -> EccState:
        """Map a non-empty string onto EccState; anything else becomes NA."""
        if raw and isinstance(raw, str):
            try:
                return EccState(raw.upper())
            except (ValueError, AttributeError):
                pass
        return EccState.NA

    if not isinstance(data, dict):
        return _placeholder()

    blocks_raw = data.get("ecc_block_state", {})
    blocks: Union[Dict[str, EccState], str]
    if isinstance(blocks_raw, dict):
        blocks = {name: _ecc(state) for name, state in blocks_raw.items()}
    elif isinstance(blocks_raw, str):
        # A plain string is passed through to the model unchanged.
        blocks = blocks_raw
    else:
        blocks = {}

    try:
        return StaticRas(
            eeprom_version=str(data.get("eeprom_version", "N/A") or "N/A"),
            parity_schema=_ecc(data.get("parity_schema")),
            single_bit_schema=_ecc(data.get("single_bit_schema")),
            double_bit_schema=_ecc(data.get("double_bit_schema")),
            poison_schema=_ecc(data.get("poison_schema")),
            ecc_block_state=blocks,
        )
    except ValidationError:
        # Fall back to the placeholder when the parsed values are rejected.
        return _placeholder()
1027
+
1028
def _parse_cache_info(self, data: dict) -> list[StaticCacheInfoItem]:
    """Convert the cache section of a static record into model items.

    Args:
        data (dict): Cache data from amd-smi; the entries are read from its
            ``"cache"`` key, which must hold a list

    Returns:
        list[StaticCacheInfoItem]: one item per entry that validated; empty
        when ``data`` is not a dict or has no ``"cache"`` list.  Entries that
        fail validation are logged as WARNING events and skipped.
    """
    if not isinstance(data, dict):
        return []
    entries = data.get("cache")
    if not isinstance(entries, list):
        return []

    def _properties(raw: Any) -> list[str]:
        """Normalize cache properties given as a list or a ","/";" separated string."""
        if isinstance(raw, list):
            return [str(prop) for prop in raw]
        if isinstance(raw, str):
            pieces = (piece.strip() for piece in raw.replace(";", ",").split(","))
            return [piece for piece in pieces if piece]
        return []

    parsed: list[StaticCacheInfoItem] = []
    for entry in entries:
        if not isinstance(entry, dict):
            continue

        level = self._valueunit_req(entry.get("cache_level"), "")
        shared_cus = self._valueunit_req(entry.get("max_num_cu_shared"), "")
        instances = self._valueunit_req(entry.get("num_cache_instance"), "")
        size = self._valueunit(entry.get("cache_size"), "", required=False)
        properties = _properties(entry.get("cache_properties"))

        # Numeric levels are rendered as integers in the label ("Label_1").
        level_value = level.value
        if isinstance(level_value, (int, float)):
            level_value = int(level_value)
        label = ValueUnit(value=f"Label_{level_value}", unit="")

        try:
            cache_item = StaticCacheInfoItem(
                cache=label,
                cache_properties=properties,
                cache_size=size,
                cache_level=level,
                max_num_cu_shared=shared_cus,
                num_cache_instance=instances,
            )
        except ValidationError as err:
            self._log_event(
                category=EventCategory.APPLICATION,
                description="Bad cache info entry from amd-smi; skipping",
                data={"entry": repr(entry), "errors": err.errors(include_url=False)},
                priority=EventPriority.WARNING,
            )
            continue

        parsed.append(cache_item)

    return parsed
1088
+
1089
def _parse_clock(self, data: dict) -> Optional[StaticClockData]:
    """Build clock frequency levels from the clock section of a static record.

    Args:
        data (dict): Clock data from amd-smi; frequencies are read from its
            ``"frequency"`` list and the active level from ``"current"``

    Returns:
        Optional[StaticClockData]: StaticClockData instance, or None when the
        input is not a dict, no frequency could be parsed, or validation fails
    """
    if not isinstance(data, dict):
        return None

    raw_freqs = data.get("frequency")
    if not isinstance(raw_freqs, list) or not raw_freqs:
        return None

    def _as_mhz(raw: object) -> Optional[int]:
        """Scale a raw frequency down by magnitude and round it to an integer."""
        number = self._to_number(raw)
        if number is None:
            return None
        magnitude = float(number)
        if magnitude >= 1e7:
            # Values this large are divided by 1,000,000.
            magnitude = magnitude / 1_000_000.0
        elif magnitude >= 1e4:
            # Values this large are divided by 1,000.
            magnitude = magnitude / 1_000.0
        return int(round(magnitude))

    freqs_mhz = [mhz for mhz in map(_as_mhz, raw_freqs) if mhz is not None]
    if not freqs_mhz:
        return None

    # Only the first three levels are used; missing ones stay None.
    labels: list[Optional[str]] = [f"{mhz} MHz" for mhz in freqs_mhz[:3]]
    labels += [None] * (3 - len(labels))

    cur_raw = data.get("current")
    current: Optional[int] = None
    if isinstance(cur_raw, (int, float)):
        current = int(cur_raw)
    elif isinstance(cur_raw, str) and cur_raw.strip() and cur_raw.upper() != "N/A":
        try:
            current = int(cur_raw.strip())
        except Exception:
            current = None

    try:
        levels = StaticFrequencyLevels.model_validate(
            {"Level 0": labels[0], "Level 1": labels[1], "Level 2": labels[2]}
        )
        # "current level" is the key the original code passes for this field.
        return StaticClockData.model_validate(
            {"frequency_levels": levels, "current level": current}
        )
    except ValidationError:
        return None
1155
+
1156
def _parse_clock_dict(self, data: dict) -> Optional[dict[str, Union[StaticClockData, None]]]:
    """Wrap the parsed clock data in a dictionary keyed by clock name.

    Args:
        data (dict): Clock data from amd-smi

    Returns:
        Optional[dict[str, Union[StaticClockData, None]]]: ``{"clk": <clock data>}``
        when ``_parse_clock`` produced a result; None when ``data`` is not a
        dict or nothing could be parsed
    """
    if not isinstance(data, dict):
        return None

    parsed = self._parse_clock(data)
    if not parsed:
        return None

    result: dict[str, Union[StaticClockData, None]] = {"clk": parsed}
    return result
1175
+
1176
def get_cper_data(self) -> List[FileModel]:
    """Collect CPER files produced by the amd-smi ras command.

    The command writes into a scratch folder that is then archived with tar,
    read back as a binary artifact, and unpacked in memory.  Any failure
    along the way is logged as an event and yields an empty list.

    Returns:
        list[FileModel]: one entry per ``.cper`` file found in the archive,
        or an empty list if nothing was produced or collection failed
    """
    try:
        AMD_SMI_CPER_FOLDER = "/tmp/amd_smi_cper"

        # Make sure the folder exists and holds no leftover .cper/.json files.
        self._run_sut_cmd(
            f"mkdir -p {AMD_SMI_CPER_FOLDER} && rm -f {AMD_SMI_CPER_FOLDER}/*.cper && rm -f {AMD_SMI_CPER_FOLDER}/*.json",
            sudo=False,
        )

        # The ras command itself is run with sudo.
        ras_result = self._run_sut_cmd(
            f"{self.AMD_SMI_EXE} {self.CMD_RAS.format(folder=AMD_SMI_CPER_FOLDER)}",
            sudo=True,
        )
        if ras_result.exit_code != 0:
            return []

        # Stop early unless the command output mentions at least one .cper file.
        if not re.findall(r"(\w+\.cper)", ras_result.stdout):
            return []

        # Archive the folder so it can be read back as a single file.
        self._run_sut_cmd(
            f"tar -czf {AMD_SMI_CPER_FOLDER}.tar.gz -C {AMD_SMI_CPER_FOLDER} .",
            sudo=True,
        )
        archive_artifact = self._read_sut_file(
            f"{AMD_SMI_CPER_FOLDER}.tar.gz", encoding=None, strip=False, log_artifact=True
        )

        # Only artifacts exposing a ``contents`` attribute can be unpacked.
        if not hasattr(archive_artifact, "contents"):
            return []
        archive_stream = io.BytesIO(archive_artifact.contents)  # type: ignore[attr-defined]
        del archive_artifact  # Drop the artifact reference once its contents are wrapped

        try:
            with TarFile.open(fileobj=archive_stream, mode="r:gz") as archive:
                cper_files = []
                for member in archive.getmembers():
                    if not (member.isfile() and member.name.endswith(".cper")):
                        continue
                    handle = archive.extractfile(member)
                    # CPER payloads are kept as raw bytes; nothing is decoded.
                    payload = handle.read() if handle is not None else b""
                    cper_files.append(FileModel(file_contents=payload, file_name=member.name))

                # The payloads are not logged, so emit an event stating how many were found.
                if cper_files:
                    self._log_event(
                        category=EventCategory.APPLICATION,
                        description="CPER data has been extracted from amd-smi",
                        data={
                            "cper_count": len(cper_files),
                        },
                        priority=EventPriority.INFO,
                    )
        except Exception as e:
            self._log_event(
                category=EventCategory.APPLICATION,
                description="Error extracting cper data",
                data={
                    "exception": get_exception_traceback(e),
                },
                priority=EventPriority.ERROR,
                console_log=True,
            )
            return []

        return cper_files
    except Exception as e:
        # Any unexpected failure is logged and turned into an empty result
        # instead of propagating to the caller.
        self._log_event(
            category=EventCategory.APPLICATION,
            description="Error collecting CPER data",
            data={
                "exception": get_exception_traceback(e),
            },
            priority=EventPriority.WARNING,
            console_log=False,
        )
        return []
1268
+
1269
def collect_data(self, args: Any = None) -> tuple[TaskResult, Optional[AmdSmiDataModel]]:
    """Collect AmdSmi data from the system.

    Args:
        args (Any, optional): optional arguments for data collection; not
            used by this method. Defaults to None.

    Returns:
        tuple[TaskResult, Optional[AmdSmiDataModel]]: the task result and the
        collected data model.  The model is None when amd-smi is not
        installed, when no data was produced, or when collection raised.
    """
    if not self._check_amdsmi_installed():
        self._log_event(
            category=EventCategory.APPLICATION,
            description="amd-smi is not installed",
            priority=EventPriority.WARNING,
            console_log=True,
        )
        self.result.status = ExecutionStatus.NOT_RAN
        return self.result, None

    try:
        version_info = self._get_amdsmi_version()
        if version_info is not None:
            self.logger.info("amd-smi version: %s", version_info.version)
            self.logger.info("ROCm version: %s", version_info.rocm_version)

        # Whatever the data query yields (a model or None) is returned unchanged.
        data_model = self._get_amdsmi_data()
        return self.result, data_model
    except Exception as e:
        self._log_event(
            category=EventCategory.APPLICATION,
            description="Error running amd-smi collector",
            data={"exception": get_exception_traceback(e)},
            priority=EventPriority.ERROR,
            console_log=True,
        )
        self.result.status = ExecutionStatus.EXECUTION_FAILURE
        return self.result, None