amd-node-scraper 0.0.1__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- amd_node_scraper-0.0.1.dist-info/LICENSE +21 -0
- amd_node_scraper-0.0.1.dist-info/METADATA +424 -0
- amd_node_scraper-0.0.1.dist-info/RECORD +197 -0
- amd_node_scraper-0.0.1.dist-info/WHEEL +5 -0
- amd_node_scraper-0.0.1.dist-info/entry_points.txt +2 -0
- amd_node_scraper-0.0.1.dist-info/top_level.txt +1 -0
- nodescraper/__init__.py +32 -0
- nodescraper/base/__init__.py +34 -0
- nodescraper/base/inbandcollectortask.py +118 -0
- nodescraper/base/inbanddataplugin.py +39 -0
- nodescraper/base/regexanalyzer.py +120 -0
- nodescraper/cli/__init__.py +29 -0
- nodescraper/cli/cli.py +511 -0
- nodescraper/cli/constants.py +27 -0
- nodescraper/cli/dynamicparserbuilder.py +171 -0
- nodescraper/cli/helper.py +517 -0
- nodescraper/cli/inputargtypes.py +129 -0
- nodescraper/configbuilder.py +123 -0
- nodescraper/configregistry.py +66 -0
- nodescraper/configs/node_status.json +19 -0
- nodescraper/connection/__init__.py +25 -0
- nodescraper/connection/inband/__init__.py +46 -0
- nodescraper/connection/inband/inband.py +171 -0
- nodescraper/connection/inband/inbandlocal.py +93 -0
- nodescraper/connection/inband/inbandmanager.py +151 -0
- nodescraper/connection/inband/inbandremote.py +173 -0
- nodescraper/connection/inband/sshparams.py +43 -0
- nodescraper/constants.py +26 -0
- nodescraper/enums/__init__.py +40 -0
- nodescraper/enums/eventcategory.py +89 -0
- nodescraper/enums/eventpriority.py +42 -0
- nodescraper/enums/executionstatus.py +44 -0
- nodescraper/enums/osfamily.py +34 -0
- nodescraper/enums/systeminteraction.py +41 -0
- nodescraper/enums/systemlocation.py +33 -0
- nodescraper/generictypes.py +36 -0
- nodescraper/interfaces/__init__.py +44 -0
- nodescraper/interfaces/connectionmanager.py +143 -0
- nodescraper/interfaces/dataanalyzertask.py +138 -0
- nodescraper/interfaces/datacollectortask.py +185 -0
- nodescraper/interfaces/dataplugin.py +356 -0
- nodescraper/interfaces/plugin.py +127 -0
- nodescraper/interfaces/resultcollator.py +56 -0
- nodescraper/interfaces/task.py +164 -0
- nodescraper/interfaces/taskresulthook.py +39 -0
- nodescraper/models/__init__.py +48 -0
- nodescraper/models/analyzerargs.py +93 -0
- nodescraper/models/collectorargs.py +30 -0
- nodescraper/models/connectionconfig.py +34 -0
- nodescraper/models/datamodel.py +171 -0
- nodescraper/models/datapluginresult.py +39 -0
- nodescraper/models/event.py +158 -0
- nodescraper/models/pluginconfig.py +38 -0
- nodescraper/models/pluginresult.py +39 -0
- nodescraper/models/systeminfo.py +44 -0
- nodescraper/models/taskresult.py +185 -0
- nodescraper/models/timerangeargs.py +38 -0
- nodescraper/pluginexecutor.py +274 -0
- nodescraper/pluginregistry.py +152 -0
- nodescraper/plugins/__init__.py +25 -0
- nodescraper/plugins/inband/__init__.py +25 -0
- nodescraper/plugins/inband/amdsmi/__init__.py +28 -0
- nodescraper/plugins/inband/amdsmi/amdsmi_analyzer.py +821 -0
- nodescraper/plugins/inband/amdsmi/amdsmi_collector.py +1313 -0
- nodescraper/plugins/inband/amdsmi/amdsmi_plugin.py +43 -0
- nodescraper/plugins/inband/amdsmi/amdsmidata.py +1002 -0
- nodescraper/plugins/inband/amdsmi/analyzer_args.py +50 -0
- nodescraper/plugins/inband/amdsmi/cper.py +65 -0
- nodescraper/plugins/inband/bios/__init__.py +29 -0
- nodescraper/plugins/inband/bios/analyzer_args.py +64 -0
- nodescraper/plugins/inband/bios/bios_analyzer.py +93 -0
- nodescraper/plugins/inband/bios/bios_collector.py +93 -0
- nodescraper/plugins/inband/bios/bios_plugin.py +43 -0
- nodescraper/plugins/inband/bios/biosdata.py +30 -0
- nodescraper/plugins/inband/cmdline/__init__.py +25 -0
- nodescraper/plugins/inband/cmdline/analyzer_args.py +80 -0
- nodescraper/plugins/inband/cmdline/cmdline_analyzer.py +113 -0
- nodescraper/plugins/inband/cmdline/cmdline_collector.py +77 -0
- nodescraper/plugins/inband/cmdline/cmdline_plugin.py +43 -0
- nodescraper/plugins/inband/cmdline/cmdlinedata.py +30 -0
- nodescraper/plugins/inband/device_enumeration/__init__.py +29 -0
- nodescraper/plugins/inband/device_enumeration/analyzer_args.py +73 -0
- nodescraper/plugins/inband/device_enumeration/device_enumeration_analyzer.py +81 -0
- nodescraper/plugins/inband/device_enumeration/device_enumeration_collector.py +176 -0
- nodescraper/plugins/inband/device_enumeration/device_enumeration_plugin.py +45 -0
- nodescraper/plugins/inband/device_enumeration/deviceenumdata.py +36 -0
- nodescraper/plugins/inband/dimm/__init__.py +25 -0
- nodescraper/plugins/inband/dimm/collector_args.py +31 -0
- nodescraper/plugins/inband/dimm/dimm_collector.py +151 -0
- nodescraper/plugins/inband/dimm/dimm_plugin.py +40 -0
- nodescraper/plugins/inband/dimm/dimmdata.py +30 -0
- nodescraper/plugins/inband/dkms/__init__.py +25 -0
- nodescraper/plugins/inband/dkms/analyzer_args.py +85 -0
- nodescraper/plugins/inband/dkms/dkms_analyzer.py +106 -0
- nodescraper/plugins/inband/dkms/dkms_collector.py +76 -0
- nodescraper/plugins/inband/dkms/dkms_plugin.py +43 -0
- nodescraper/plugins/inband/dkms/dkmsdata.py +33 -0
- nodescraper/plugins/inband/dmesg/__init__.py +28 -0
- nodescraper/plugins/inband/dmesg/analyzer_args.py +33 -0
- nodescraper/plugins/inband/dmesg/collector_args.py +39 -0
- nodescraper/plugins/inband/dmesg/dmesg_analyzer.py +503 -0
- nodescraper/plugins/inband/dmesg/dmesg_collector.py +164 -0
- nodescraper/plugins/inband/dmesg/dmesg_plugin.py +44 -0
- nodescraper/plugins/inband/dmesg/dmesgdata.py +116 -0
- nodescraper/plugins/inband/fabrics/__init__.py +28 -0
- nodescraper/plugins/inband/fabrics/fabrics_collector.py +726 -0
- nodescraper/plugins/inband/fabrics/fabrics_plugin.py +37 -0
- nodescraper/plugins/inband/fabrics/fabricsdata.py +140 -0
- nodescraper/plugins/inband/journal/__init__.py +28 -0
- nodescraper/plugins/inband/journal/collector_args.py +33 -0
- nodescraper/plugins/inband/journal/journal_collector.py +107 -0
- nodescraper/plugins/inband/journal/journal_plugin.py +40 -0
- nodescraper/plugins/inband/journal/journaldata.py +44 -0
- nodescraper/plugins/inband/kernel/__init__.py +25 -0
- nodescraper/plugins/inband/kernel/analyzer_args.py +64 -0
- nodescraper/plugins/inband/kernel/kernel_analyzer.py +91 -0
- nodescraper/plugins/inband/kernel/kernel_collector.py +129 -0
- nodescraper/plugins/inband/kernel/kernel_plugin.py +43 -0
- nodescraper/plugins/inband/kernel/kerneldata.py +32 -0
- nodescraper/plugins/inband/kernel_module/__init__.py +25 -0
- nodescraper/plugins/inband/kernel_module/analyzer_args.py +59 -0
- nodescraper/plugins/inband/kernel_module/kernel_module_analyzer.py +211 -0
- nodescraper/plugins/inband/kernel_module/kernel_module_collector.py +264 -0
- nodescraper/plugins/inband/kernel_module/kernel_module_data.py +60 -0
- nodescraper/plugins/inband/kernel_module/kernel_module_plugin.py +43 -0
- nodescraper/plugins/inband/memory/__init__.py +25 -0
- nodescraper/plugins/inband/memory/analyzer_args.py +45 -0
- nodescraper/plugins/inband/memory/memory_analyzer.py +98 -0
- nodescraper/plugins/inband/memory/memory_collector.py +330 -0
- nodescraper/plugins/inband/memory/memory_plugin.py +43 -0
- nodescraper/plugins/inband/memory/memorydata.py +90 -0
- nodescraper/plugins/inband/network/__init__.py +28 -0
- nodescraper/plugins/inband/network/network_collector.py +1828 -0
- nodescraper/plugins/inband/network/network_plugin.py +37 -0
- nodescraper/plugins/inband/network/networkdata.py +319 -0
- nodescraper/plugins/inband/nvme/__init__.py +28 -0
- nodescraper/plugins/inband/nvme/nvme_collector.py +167 -0
- nodescraper/plugins/inband/nvme/nvme_plugin.py +37 -0
- nodescraper/plugins/inband/nvme/nvmedata.py +45 -0
- nodescraper/plugins/inband/os/__init__.py +25 -0
- nodescraper/plugins/inband/os/analyzer_args.py +64 -0
- nodescraper/plugins/inband/os/os_analyzer.py +73 -0
- nodescraper/plugins/inband/os/os_collector.py +131 -0
- nodescraper/plugins/inband/os/os_plugin.py +43 -0
- nodescraper/plugins/inband/os/osdata.py +31 -0
- nodescraper/plugins/inband/package/__init__.py +25 -0
- nodescraper/plugins/inband/package/analyzer_args.py +48 -0
- nodescraper/plugins/inband/package/package_analyzer.py +253 -0
- nodescraper/plugins/inband/package/package_collector.py +273 -0
- nodescraper/plugins/inband/package/package_plugin.py +43 -0
- nodescraper/plugins/inband/package/packagedata.py +41 -0
- nodescraper/plugins/inband/pcie/__init__.py +29 -0
- nodescraper/plugins/inband/pcie/analyzer_args.py +63 -0
- nodescraper/plugins/inband/pcie/pcie_analyzer.py +1081 -0
- nodescraper/plugins/inband/pcie/pcie_collector.py +690 -0
- nodescraper/plugins/inband/pcie/pcie_data.py +2017 -0
- nodescraper/plugins/inband/pcie/pcie_plugin.py +43 -0
- nodescraper/plugins/inband/process/__init__.py +25 -0
- nodescraper/plugins/inband/process/analyzer_args.py +45 -0
- nodescraper/plugins/inband/process/collector_args.py +31 -0
- nodescraper/plugins/inband/process/process_analyzer.py +91 -0
- nodescraper/plugins/inband/process/process_collector.py +115 -0
- nodescraper/plugins/inband/process/process_plugin.py +46 -0
- nodescraper/plugins/inband/process/processdata.py +34 -0
- nodescraper/plugins/inband/rocm/__init__.py +25 -0
- nodescraper/plugins/inband/rocm/analyzer_args.py +66 -0
- nodescraper/plugins/inband/rocm/rocm_analyzer.py +100 -0
- nodescraper/plugins/inband/rocm/rocm_collector.py +205 -0
- nodescraper/plugins/inband/rocm/rocm_plugin.py +43 -0
- nodescraper/plugins/inband/rocm/rocmdata.py +62 -0
- nodescraper/plugins/inband/storage/__init__.py +25 -0
- nodescraper/plugins/inband/storage/analyzer_args.py +38 -0
- nodescraper/plugins/inband/storage/collector_args.py +31 -0
- nodescraper/plugins/inband/storage/storage_analyzer.py +152 -0
- nodescraper/plugins/inband/storage/storage_collector.py +110 -0
- nodescraper/plugins/inband/storage/storage_plugin.py +44 -0
- nodescraper/plugins/inband/storage/storagedata.py +70 -0
- nodescraper/plugins/inband/sysctl/__init__.py +29 -0
- nodescraper/plugins/inband/sysctl/analyzer_args.py +67 -0
- nodescraper/plugins/inband/sysctl/sysctl_analyzer.py +81 -0
- nodescraper/plugins/inband/sysctl/sysctl_collector.py +101 -0
- nodescraper/plugins/inband/sysctl/sysctl_plugin.py +43 -0
- nodescraper/plugins/inband/sysctl/sysctldata.py +42 -0
- nodescraper/plugins/inband/syslog/__init__.py +28 -0
- nodescraper/plugins/inband/syslog/syslog_collector.py +121 -0
- nodescraper/plugins/inband/syslog/syslog_plugin.py +37 -0
- nodescraper/plugins/inband/syslog/syslogdata.py +46 -0
- nodescraper/plugins/inband/uptime/__init__.py +25 -0
- nodescraper/plugins/inband/uptime/uptime_collector.py +88 -0
- nodescraper/plugins/inband/uptime/uptime_plugin.py +37 -0
- nodescraper/plugins/inband/uptime/uptimedata.py +31 -0
- nodescraper/resultcollators/__init__.py +25 -0
- nodescraper/resultcollators/tablesummary.py +159 -0
- nodescraper/taskresulthooks/__init__.py +28 -0
- nodescraper/taskresulthooks/filesystemloghook.py +88 -0
- nodescraper/typeutils.py +171 -0
- nodescraper/utils.py +412 -0
|
@@ -0,0 +1,1313 @@
|
|
|
1
|
+
###############################################################################
|
|
2
|
+
#
|
|
3
|
+
# MIT License
|
|
4
|
+
#
|
|
5
|
+
# Copyright (c) 2025 Advanced Micro Devices, Inc.
|
|
6
|
+
#
|
|
7
|
+
# Permission is hereby granted, free of charge, to any person obtaining a copy
|
|
8
|
+
# of this software and associated documentation files (the "Software"), to deal
|
|
9
|
+
# in the Software without restriction, including without limitation the rights
|
|
10
|
+
# to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
|
|
11
|
+
# copies of the Software, and to permit persons to whom the Software is
|
|
12
|
+
# furnished to do so, subject to the following conditions:
|
|
13
|
+
#
|
|
14
|
+
# The above copyright notice and this permission notice shall be included in all
|
|
15
|
+
# copies or substantial portions of the Software.
|
|
16
|
+
#
|
|
17
|
+
# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
|
|
18
|
+
# IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
|
|
19
|
+
# FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
|
|
20
|
+
# AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
|
|
21
|
+
# LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
|
|
22
|
+
# OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
|
|
23
|
+
# SOFTWARE.
|
|
24
|
+
#
|
|
25
|
+
###############################################################################
|
|
26
|
+
import io
|
|
27
|
+
import json
|
|
28
|
+
import re
|
|
29
|
+
from tarfile import TarFile
|
|
30
|
+
from typing import Any, Dict, List, Optional, Union
|
|
31
|
+
|
|
32
|
+
from pydantic import ValidationError
|
|
33
|
+
|
|
34
|
+
from nodescraper.base.inbandcollectortask import InBandDataCollector
|
|
35
|
+
from nodescraper.enums import EventCategory, EventPriority, ExecutionStatus, OSFamily
|
|
36
|
+
from nodescraper.models import TaskResult
|
|
37
|
+
from nodescraper.models.datamodel import FileModel
|
|
38
|
+
from nodescraper.plugins.inband.amdsmi.amdsmidata import (
|
|
39
|
+
AmdSmiDataModel,
|
|
40
|
+
AmdSmiListItem,
|
|
41
|
+
AmdSmiStatic,
|
|
42
|
+
AmdSmiVersion,
|
|
43
|
+
EccState,
|
|
44
|
+
Fw,
|
|
45
|
+
FwListItem,
|
|
46
|
+
Partition,
|
|
47
|
+
PartitionCompute,
|
|
48
|
+
PartitionMemory,
|
|
49
|
+
Processes,
|
|
50
|
+
ProcessInfo,
|
|
51
|
+
ProcessListItem,
|
|
52
|
+
ProcessMemoryUsage,
|
|
53
|
+
ProcessUsage,
|
|
54
|
+
StaticAsic,
|
|
55
|
+
StaticBoard,
|
|
56
|
+
StaticBus,
|
|
57
|
+
StaticCacheInfoItem,
|
|
58
|
+
StaticClockData,
|
|
59
|
+
StaticDriver,
|
|
60
|
+
StaticFrequencyLevels,
|
|
61
|
+
StaticNuma,
|
|
62
|
+
StaticPolicy,
|
|
63
|
+
StaticRas,
|
|
64
|
+
StaticSocPstate,
|
|
65
|
+
StaticVbios,
|
|
66
|
+
StaticVram,
|
|
67
|
+
StaticXgmiPlpd,
|
|
68
|
+
ValueUnit,
|
|
69
|
+
)
|
|
70
|
+
from nodescraper.utils import get_exception_traceback
|
|
71
|
+
|
|
72
|
+
|
|
73
|
+
class AmdSmiCollector(InBandDataCollector[AmdSmiDataModel, None]):
|
|
74
|
+
"""Class for collection of inband tool amd-smi data."""
|
|
75
|
+
|
|
76
|
+
AMD_SMI_EXE = "amd-smi"
|
|
77
|
+
|
|
78
|
+
SUPPORTED_OS_FAMILY: set[OSFamily] = {OSFamily.LINUX}
|
|
79
|
+
|
|
80
|
+
DATA_MODEL = AmdSmiDataModel
|
|
81
|
+
|
|
82
|
+
CMD_VERSION = "version --json"
|
|
83
|
+
CMD_LIST = "list --json"
|
|
84
|
+
CMD_PROCESS = "process --json"
|
|
85
|
+
CMD_PARTITION = "partition --json"
|
|
86
|
+
CMD_FIRMWARE = "firmware --json"
|
|
87
|
+
CMD_STATIC = "static -g all --json"
|
|
88
|
+
CMD_STATIC_GPU = "static -g {gpu_id} --json"
|
|
89
|
+
CMD_RAS = "ras --cper --folder={folder}"
|
|
90
|
+
|
|
91
|
+
def _check_amdsmi_installed(self) -> bool:
|
|
92
|
+
"""Check if amd-smi is installed
|
|
93
|
+
|
|
94
|
+
Returns:
|
|
95
|
+
bool: True if amd-smi is installed, False otherwise
|
|
96
|
+
"""
|
|
97
|
+
cmd_ret = self._run_sut_cmd("which amd-smi")
|
|
98
|
+
return bool(cmd_ret.exit_code == 0 and "no amd-smi in" not in cmd_ret.stdout)
|
|
99
|
+
|
|
100
|
+
def _run_amd_smi(self, cmd: str) -> Optional[str]:
|
|
101
|
+
"""Run amd-smi command
|
|
102
|
+
|
|
103
|
+
Args:
|
|
104
|
+
cmd (str): command arguments to pass to amd-smi
|
|
105
|
+
|
|
106
|
+
Returns:
|
|
107
|
+
Optional[str]: stdout from command or None on error
|
|
108
|
+
"""
|
|
109
|
+
cmd_ret = self._run_sut_cmd(f"{self.AMD_SMI_EXE} {cmd}")
|
|
110
|
+
|
|
111
|
+
# Check for known warnings and errors that can be handled
|
|
112
|
+
is_group_warning = (
|
|
113
|
+
"User is missing the following required groups" in cmd_ret.stderr
|
|
114
|
+
or "User is missing the following required groups" in cmd_ret.stdout
|
|
115
|
+
)
|
|
116
|
+
|
|
117
|
+
# Check for known amd-smi internal errors
|
|
118
|
+
is_amdsmi_internal_error = any(
|
|
119
|
+
pattern in cmd_ret.stderr for pattern in ["KeyError:", "AttributeError:", "IndexError:"]
|
|
120
|
+
)
|
|
121
|
+
|
|
122
|
+
# Log warning if user is missing group
|
|
123
|
+
if cmd_ret.stderr != "" or cmd_ret.exit_code != 0:
|
|
124
|
+
if is_amdsmi_internal_error:
|
|
125
|
+
self._log_event(
|
|
126
|
+
category=EventCategory.SW_DRIVER,
|
|
127
|
+
description="amd-smi internal error detected",
|
|
128
|
+
data={
|
|
129
|
+
"command": cmd,
|
|
130
|
+
"exit_code": cmd_ret.exit_code,
|
|
131
|
+
"stderr": cmd_ret.stderr,
|
|
132
|
+
},
|
|
133
|
+
priority=EventPriority.WARNING,
|
|
134
|
+
console_log=True,
|
|
135
|
+
)
|
|
136
|
+
return None
|
|
137
|
+
elif not is_group_warning:
|
|
138
|
+
self._log_event(
|
|
139
|
+
category=EventCategory.APPLICATION,
|
|
140
|
+
description="Error running amd-smi command",
|
|
141
|
+
data={
|
|
142
|
+
"command": cmd,
|
|
143
|
+
"exit_code": cmd_ret.exit_code,
|
|
144
|
+
"stderr": cmd_ret.stderr,
|
|
145
|
+
},
|
|
146
|
+
priority=EventPriority.ERROR,
|
|
147
|
+
console_log=True,
|
|
148
|
+
)
|
|
149
|
+
return None
|
|
150
|
+
else:
|
|
151
|
+
self._log_event(
|
|
152
|
+
category=EventCategory.APPLICATION,
|
|
153
|
+
description="amd-smi warning (continuing): User missing required groups",
|
|
154
|
+
data={
|
|
155
|
+
"command": cmd,
|
|
156
|
+
"warning": cmd_ret.stderr or cmd_ret.stdout,
|
|
157
|
+
},
|
|
158
|
+
priority=EventPriority.WARNING,
|
|
159
|
+
console_log=False,
|
|
160
|
+
)
|
|
161
|
+
|
|
162
|
+
stdout = cmd_ret.stdout
|
|
163
|
+
if is_group_warning and stdout:
|
|
164
|
+
lines = stdout.split("\n")
|
|
165
|
+
cleaned_lines = [
|
|
166
|
+
line
|
|
167
|
+
for line in lines
|
|
168
|
+
if not any(
|
|
169
|
+
warn in line
|
|
170
|
+
for warn in [
|
|
171
|
+
"RuntimeError:",
|
|
172
|
+
"WARNING: User is missing",
|
|
173
|
+
"Please add user to these groups",
|
|
174
|
+
]
|
|
175
|
+
)
|
|
176
|
+
]
|
|
177
|
+
stdout = "\n".join(cleaned_lines).strip()
|
|
178
|
+
|
|
179
|
+
return stdout
|
|
180
|
+
|
|
181
|
+
def _run_amd_smi_dict(self, cmd: str) -> Optional[Union[dict, list[dict]]]:
|
|
182
|
+
"""Run amd-smi command with json output
|
|
183
|
+
|
|
184
|
+
Args:
|
|
185
|
+
cmd (str): command arguments to pass to amd-smi
|
|
186
|
+
|
|
187
|
+
Returns:
|
|
188
|
+
Optional[Union[dict, list[dict]]]: parsed JSON output or None on error
|
|
189
|
+
"""
|
|
190
|
+
cmd += " --json"
|
|
191
|
+
cmd_ret = self._run_amd_smi(cmd)
|
|
192
|
+
if cmd_ret:
|
|
193
|
+
try:
|
|
194
|
+
# Try to parse as single JSON first
|
|
195
|
+
return json.loads(cmd_ret)
|
|
196
|
+
except json.JSONDecodeError as e:
|
|
197
|
+
# try to extract and parse multiple JSON objects
|
|
198
|
+
try:
|
|
199
|
+
json_objects = []
|
|
200
|
+
decoder = json.JSONDecoder()
|
|
201
|
+
idx = 0
|
|
202
|
+
cmd_ret_stripped = cmd_ret.strip()
|
|
203
|
+
|
|
204
|
+
while idx < len(cmd_ret_stripped):
|
|
205
|
+
while idx < len(cmd_ret_stripped) and cmd_ret_stripped[idx].isspace():
|
|
206
|
+
idx += 1
|
|
207
|
+
|
|
208
|
+
if idx >= len(cmd_ret_stripped):
|
|
209
|
+
break
|
|
210
|
+
|
|
211
|
+
if cmd_ret_stripped[idx] not in ["{", "["]:
|
|
212
|
+
break
|
|
213
|
+
|
|
214
|
+
try:
|
|
215
|
+
obj, end_idx = decoder.raw_decode(cmd_ret_stripped, idx)
|
|
216
|
+
json_objects.append(obj)
|
|
217
|
+
idx = end_idx
|
|
218
|
+
except json.JSONDecodeError:
|
|
219
|
+
break
|
|
220
|
+
|
|
221
|
+
if json_objects:
|
|
222
|
+
return json_objects if len(json_objects) > 1 else json_objects[0]
|
|
223
|
+
else:
|
|
224
|
+
raise
|
|
225
|
+
|
|
226
|
+
except Exception:
|
|
227
|
+
self._log_event(
|
|
228
|
+
category=EventCategory.APPLICATION,
|
|
229
|
+
description=f"Error parsing command: `{cmd}` json data",
|
|
230
|
+
data={
|
|
231
|
+
"cmd": cmd,
|
|
232
|
+
"exception": get_exception_traceback(e),
|
|
233
|
+
},
|
|
234
|
+
priority=EventPriority.ERROR,
|
|
235
|
+
console_log=True,
|
|
236
|
+
)
|
|
237
|
+
return None
|
|
238
|
+
return None
|
|
239
|
+
|
|
240
|
+
def _to_number(self, v: object) -> Optional[Union[int, float]]:
|
|
241
|
+
"""Helper function to return number from str, float or "N/A"
|
|
242
|
+
|
|
243
|
+
Args:
|
|
244
|
+
v (object): non number object
|
|
245
|
+
|
|
246
|
+
Returns:
|
|
247
|
+
Optional[Union[int, float]]: number version of input
|
|
248
|
+
"""
|
|
249
|
+
if v in (None, "", "N/A"):
|
|
250
|
+
return None
|
|
251
|
+
try:
|
|
252
|
+
if isinstance(v, (int, float)):
|
|
253
|
+
return v
|
|
254
|
+
if isinstance(v, str):
|
|
255
|
+
s = v.strip()
|
|
256
|
+
try:
|
|
257
|
+
return int(s)
|
|
258
|
+
except Exception:
|
|
259
|
+
return float(s)
|
|
260
|
+
return float(str(v))
|
|
261
|
+
except Exception:
|
|
262
|
+
return None
|
|
263
|
+
|
|
264
|
+
def _valueunit(self, v: object, unit: str, *, required: bool = False) -> Optional[ValueUnit]:
    """Wrap a raw value and its unit into a ValueUnit

    Args:
        v (object): raw value, converted with _to_number
        unit (str): unit of measurement
        required (bool, optional): when True, a value that cannot be converted
            yields a ValueUnit with value 0 instead of None. Defaults to False.

    Returns:
        Optional[ValueUnit]: the built instance, or None when the value cannot
            be converted and required is False
    """
    number = self._to_number(v)
    if number is not None:
        return ValueUnit(value=number, unit=unit)
    if required:
        return ValueUnit(value=0, unit=unit)
    return None
|
|
279
|
+
|
|
280
|
+
def _valueunit_req(self, v: object, unit: str) -> ValueUnit:
    """Build a ValueUnit that is always present

    Args:
        v (object): raw value
        unit (str): unit of measurement

    Returns:
        ValueUnit: result of _valueunit called with required=True
    """
    forced = self._valueunit(v, unit, required=True)
    # Narrows the Optional return type of _valueunit for type checkers
    assert forced is not None
    return forced
|
|
293
|
+
|
|
294
|
+
def _normalize(self, val: object, default: str = "unknown", slot_type: bool = False) -> str:
|
|
295
|
+
"""Normalize strings
|
|
296
|
+
|
|
297
|
+
Args:
|
|
298
|
+
val (object): object
|
|
299
|
+
default (str, optional): default option. Defaults to "unknown".
|
|
300
|
+
slot_type (bool, optional): map to one of {'OAM','PCIE','CEM','Unknown'}. Defaults to False.
|
|
301
|
+
|
|
302
|
+
Returns:
|
|
303
|
+
str: normalized string
|
|
304
|
+
"""
|
|
305
|
+
s = str(val).strip() if val is not None else ""
|
|
306
|
+
if not s or s.upper() == "N/A":
|
|
307
|
+
return "Unknown" if slot_type else default
|
|
308
|
+
|
|
309
|
+
if slot_type:
|
|
310
|
+
u = s.upper().replace(" ", "").replace("-", "")
|
|
311
|
+
if u == "OAM":
|
|
312
|
+
return "OAM"
|
|
313
|
+
if u in {"PCIE", "PCIEXPRESS", "PCIEXP"} or u.startswith("PCIE"):
|
|
314
|
+
return "PCIE"
|
|
315
|
+
if u == "CEM":
|
|
316
|
+
return "CEM"
|
|
317
|
+
return "Unknown"
|
|
318
|
+
|
|
319
|
+
return s
|
|
320
|
+
|
|
321
|
+
def _get_amdsmi_data(self) -> Optional[AmdSmiDataModel]:
    """Run every amd-smi sub-collector and assemble the data model

    Returns:
        Optional[AmdSmiDataModel]: populated data model, or None when a
            sub-collector raised or the model failed validation
    """
    try:
        # Dict entries are evaluated top to bottom, which fixes the order in
        # which the sub-commands run
        collected = {
            "version": self._get_amdsmi_version(),
            "process": self.get_process(),
            "partition": self.get_partition(),
            "firmware": self.get_firmware(),
            "gpu_list": self.get_gpu_list(),
            "static": self.get_static(),
            "cper_data": self.get_cper_data(),
        }
    except Exception as e:
        self._log_event(
            category=EventCategory.APPLICATION,
            description="Error running amd-smi sub commands",
            data={"exception": get_exception_traceback(e)},
            priority=EventPriority.ERROR,
            console_log=True,
        )
        self.result.status = ExecutionStatus.EXECUTION_FAILURE
        return None

    try:
        return AmdSmiDataModel(**collected)
    except ValidationError as err:
        self.logger.warning("Validation err: %s", err)
        self._log_event(
            category=EventCategory.APPLICATION,
            description="Failed to build AmdSmiDataModel",
            data={"errors": err.errors(include_url=False)},
            priority=EventPriority.ERROR,
        )
        return None
|
|
365
|
+
|
|
366
|
+
def _get_amdsmi_version(self) -> Optional[AmdSmiVersion]:
    """Collect the amd-smi version information

    Returns:
        Optional[AmdSmiVersion]: version information, or None when the command
            output is not a non-empty list whose first element is a dict, or
            when the model fails validation
    """
    ret = self._run_amd_smi_dict(self.CMD_VERSION)

    # Only a non-empty list is accepted; the first element holds the version record
    if not (isinstance(ret, list) and ret):
        return None

    version_data = ret[0]
    if not isinstance(version_data, dict):
        return None

    library_version = version_data.get("amdsmi_library_version", "")
    try:
        return AmdSmiVersion(
            tool="amdsmi",
            version=library_version,
            amdsmi_library_version=library_version,
            rocm_version=version_data.get("rocm_version", ""),
        )
    except ValidationError as err:
        self._log_event(
            category=EventCategory.APPLICATION,
            description="Failed to build AmdSmiVersion",
            data={"errors": err.errors(include_url=False)},
            priority=EventPriority.WARNING,
        )
        return None
|
|
395
|
+
|
|
396
|
+
def get_gpu_list(self) -> Optional[list[AmdSmiListItem]]:
    """Collect the GPU inventory from the amd-smi list command

    Returns:
        Optional[list[AmdSmiListItem]]: one item per GPU entry that validated;
            an empty list when the command returned nothing
    """
    ret = self._run_amd_smi_dict(self.CMD_LIST)
    if not ret:
        return []

    def _as_int(raw: Any) -> int:
        # Anything that int() rejects is reported as 0
        try:
            return int(raw)
        except Exception:
            return 0

    entries = ret if isinstance(ret, list) else [ret]
    gpus: list[AmdSmiListItem] = []

    for entry in entries:
        if not isinstance(entry, dict):
            continue

        try:
            numeric = {
                key: _as_int(entry.get(key, 0))
                for key in ("gpu", "kfd_id", "node_id", "partition_id")
            }
            gpu_item = AmdSmiListItem(
                gpu=numeric["gpu"],
                bdf=str(entry.get("bdf", "")),
                uuid=str(entry.get("uuid", "")),
                kfd_id=numeric["kfd_id"],
                node_id=numeric["node_id"],
                partition_id=numeric["partition_id"],
            )
        except ValidationError as err:
            self._log_event(
                category=EventCategory.APPLICATION,
                description="Failed to build AmdSmiListItem",
                data={"errors": err.errors(include_url=False), "item": entry},
                priority=EventPriority.WARNING,
            )
            continue

        gpus.append(gpu_item)

    return gpus
|
|
439
|
+
|
|
440
|
+
def get_process(self) -> Optional[list[Processes]]:
    """Get process information

    Malformed values are tolerated instead of aborting the collection: a GPU
    index or pid that cannot be parsed becomes 0, and a ``memory_usage`` or
    ``engine_usage`` field that is not a dict is treated as empty. Entries
    that fail model validation are logged and skipped.

    Returns:
        Optional[list[Processes]]: list of GPU processes; an empty list when
            the command returned nothing
    """
    ret = self._run_amd_smi_dict(self.CMD_PROCESS)
    if not ret:
        return []

    def _to_int(x: Any, default: int = 0) -> int:
        # None, "" and non-numeric text such as "N/A" all fall back to default
        try:
            return int(x)
        except (TypeError, ValueError, OverflowError):
            return default

    process_data = ret if isinstance(ret, list) else [ret]
    out: list[Processes] = []

    for item in process_data:
        if not isinstance(item, dict):
            continue

        gpu_idx = _to_int(item.get("gpu"))
        process_list_raw = item.get("process_list", [])
        if not isinstance(process_list_raw, list):
            continue

        plist: list[ProcessListItem] = []

        for entry in process_list_raw:
            try:
                if not isinstance(entry, dict):
                    # Non-dict entries are kept verbatim as their string form
                    plist.append(ProcessListItem(process_info=str(entry)))
                    continue

                mu = entry.get("memory_usage")
                if not isinstance(mu, dict):
                    mu = {}
                eu = entry.get("engine_usage")
                if not isinstance(eu, dict):
                    eu = {}

                plist.append(
                    ProcessListItem(
                        process_info=ProcessInfo(
                            name=str(entry.get("name", "N/A")),
                            pid=_to_int(entry.get("pid", 0)),
                            memory_usage=ProcessMemoryUsage(
                                gtt_mem=self._valueunit(mu.get("gtt_mem"), "B"),
                                cpu_mem=self._valueunit(mu.get("cpu_mem"), "B"),
                                vram_mem=self._valueunit(mu.get("vram_mem"), "B"),
                            ),
                            mem_usage=self._valueunit(entry.get("mem"), "B"),
                            usage=ProcessUsage(
                                gfx=self._valueunit(eu.get("gfx"), "ns"),
                                enc=self._valueunit(eu.get("enc"), "ns"),
                            ),
                        )
                    )
                )
            except ValidationError as err:
                self._log_event(
                    category=EventCategory.APPLICATION,
                    description="Failed to build ProcessListItem; skipping entry",
                    data={
                        "errors": err.errors(include_url=False),
                        "gpu_index": gpu_idx,
                        "entry": repr(entry),
                    },
                    priority=EventPriority.WARNING,
                )

        try:
            out.append(Processes(gpu=gpu_idx, process_list=plist))
        except ValidationError as err:
            self._log_event(
                category=EventCategory.APPLICATION,
                description="Failed to build Processes",
                data={"errors": err.errors(include_url=False), "gpu_index": gpu_idx},
                priority=EventPriority.WARNING,
            )

    return out
|
|
527
|
+
|
|
528
|
+
def get_partition(self) -> Optional[Partition]:
    """Check partition information

    A GPU index that cannot be parsed as an integer is recorded as 0 instead
    of raising and aborting the collection.

    Returns:
        Optional[Partition]: Partition data if available, None when the
            command returned nothing or the model failed validation
    """
    ret = self._run_amd_smi_dict(self.CMD_PARTITION)
    if not ret:
        return None

    partition_data = ret if isinstance(ret, list) else [ret]

    # Flatten multi-JSON results (partition command returns multiple JSON arrays)
    flattened_data: list = []
    for item in partition_data:
        if isinstance(item, list):
            flattened_data.extend(item)
        elif isinstance(item, dict):
            flattened_data.append(item)

    memparts: list[PartitionMemory] = []
    computeparts: list[PartitionCompute] = []

    for item in flattened_data:
        # Nested lists may still carry non-dict elements after flattening
        if not isinstance(item, dict):
            continue

        try:
            gpu_idx = int(item.get("gpu") or 0)
        except (TypeError, ValueError, OverflowError):
            # e.g. "N/A": fall back to index 0, as for a missing value
            gpu_idx = 0

        mem_pt = item.get("memory_partition")
        comp_pt = item.get("compute_partition")

        try:
            memparts.append(
                PartitionMemory(gpu_id=gpu_idx, partition_type=str(mem_pt) if mem_pt else None)
            )
        except ValidationError as err:
            self._log_event(
                category=EventCategory.APPLICATION,
                description="Failed to build PartitionMemory",
                data={
                    "errors": err.errors(include_url=False),
                    "gpu_index": gpu_idx,
                    "data": mem_pt,
                },
                priority=EventPriority.WARNING,
            )

        try:
            computeparts.append(
                PartitionCompute(
                    gpu_id=gpu_idx, partition_type=str(comp_pt) if comp_pt else None
                )
            )
        except ValidationError as err:
            self._log_event(
                category=EventCategory.APPLICATION,
                description="Failed to build PartitionCompute",
                data={
                    "errors": err.errors(include_url=False),
                    "gpu_index": gpu_idx,
                    "data": comp_pt,
                },
                priority=EventPriority.WARNING,
            )

    try:
        return Partition(memory_partition=memparts, compute_partition=computeparts)
    except ValidationError as err:
        self._log_event(
            category=EventCategory.APPLICATION,
            description="Failed to build Partition",
            data={"errors": err.errors(include_url=False)},
            priority=EventPriority.WARNING,
        )
        return None
|
|
602
|
+
|
|
603
|
+
def get_firmware(self) -> Optional[list[Fw]]:
    """Get firmware information

    Runs ``self.CMD_FIRMWARE`` through ``self._run_amd_smi_dict`` and turns
    every per-GPU entry into an ``Fw`` model. Entries that cannot be parsed
    or validated are reported through ``self._log_event`` and skipped, so a
    single malformed GPU record does not abort the whole collection.

    Returns:
        Optional[list[Fw]]: List of firmware info per GPU; empty when the
        command returned nothing usable
    """
    ret = self._run_amd_smi_dict(self.CMD_FIRMWARE)
    if not ret:
        return []

    firmware_data = ret if isinstance(ret, list) else [ret]
    out: list[Fw] = []

    for item in firmware_data:
        if not isinstance(item, dict):
            continue

        # A missing/empty index defaults to GPU 0; anything else must be numeric.
        gpu_raw = item.get("gpu")
        try:
            gpu_idx = int(gpu_raw) if gpu_raw not in (None, "") else 0
        except (TypeError, ValueError):
            self._log_event(
                category=EventCategory.APPLICATION,
                description="Invalid GPU index in firmware data",
                data={"gpu": repr(gpu_raw)},
                priority=EventPriority.WARNING,
            )
            continue

        fw_list_raw = item.get("fw_list", [])
        if not isinstance(fw_list_raw, list):
            continue

        try:
            # Built inside the try so a bad FwListItem is handled like a bad Fw.
            normalized: list[FwListItem] = []
            for e in fw_list_raw:
                if not isinstance(e, dict):
                    self._log_event(
                        category=EventCategory.APPLICATION,
                        description="Unrecognized firmware entry shape",
                        data={"entry_shape": repr(e)},
                        priority=EventPriority.INFO,
                    )
                    continue
                fid = e.get("fw_name")
                ver = e.get("fw_version")
                normalized.append(
                    FwListItem(
                        fw_id="" if fid is None else str(fid),
                        fw_version="" if ver is None else str(ver),
                    )
                )
            out.append(Fw(gpu=gpu_idx, fw_list=normalized))
        except ValidationError as err:
            self._log_event(
                category=EventCategory.APPLICATION,
                description="Failed to build Fw",
                data={"errors": err.errors(include_url=False), "gpu_index": gpu_idx},
                priority=EventPriority.WARNING,
            )

    return out
|
|
656
|
+
|
|
657
|
+
def get_static(self) -> Optional[list[AmdSmiStatic]]:
    """Get Static info from amd-smi static command

    The bulk query (``self.CMD_STATIC``) is tried first. When it returns
    nothing, every GPU reported by ``self.get_gpu_list()`` is queried
    individually with ``self.CMD_STATIC_GPU``. Entries that cannot be parsed
    or validated are logged and skipped so the remaining GPUs are still
    reported.

    Returns:
        Optional[list[AmdSmiStatic]]: list of AmdSmiStatic instances or empty list
    """

    def _section(entry: dict, key: str) -> dict:
        """Return ``entry[key]`` when it is a dict, otherwise an empty dict."""
        value = entry.get(key)
        return value if isinstance(value, dict) else {}

    def _int_or_na(raw: Any) -> Union[int, str]:
        """Map missing values to "N/A", keep strings, coerce the rest to int."""
        if raw in (None, "", "N/A"):
            return "N/A"
        if isinstance(raw, str):
            return raw
        try:
            return int(raw)
        except (TypeError, ValueError, OverflowError):
            return "N/A"

    ret = self._run_amd_smi_dict(self.CMD_STATIC)
    if not ret:
        self.logger.info("Bulk static query failed, attempting per-GPU fallback")
        gpu_list = self.get_gpu_list()
        if not gpu_list:
            return []

        fallback_data: list[dict] = []
        for gpu in gpu_list:
            gpu_data = self._run_amd_smi_dict(self.CMD_STATIC_GPU.format(gpu_id=gpu.gpu))
            if not gpu_data:
                continue
            if isinstance(gpu_data, dict):
                fallback_data.append(gpu_data)
            elif isinstance(gpu_data, list):
                fallback_data.extend(gpu_data)

        if not fallback_data:
            return []
        ret = fallback_data

    if isinstance(ret, dict) and "gpu_data" in ret:
        ret = ret["gpu_data"]

    static_data = ret if isinstance(ret, list) else [ret]
    out: list[AmdSmiStatic] = []

    for item in static_data:
        if not isinstance(item, dict) or "gpu" not in item:
            continue

        # An empty index defaults to GPU 0; a non-numeric one is rejected.
        gpu_raw = item.get("gpu")
        try:
            gpu_idx = int(gpu_raw) if gpu_raw not in (None, "") else 0
        except (TypeError, ValueError):
            self._log_event(
                category=EventCategory.APPLICATION,
                description="Invalid GPU index in static data",
                data={"gpu": repr(gpu_raw)},
                priority=EventPriority.WARNING,
            )
            continue

        # Sections read with .get() below must be dicts; anything else is
        # treated as an absent section.
        asic = _section(item, "asic")
        board = _section(item, "board")
        bus = _section(item, "bus")
        vbios = _section(item, "vbios")
        driver = _section(item, "driver")
        numa = _section(item, "numa")
        vram = _section(item, "vram")
        # These are handed to parsers that do their own type checks.
        ras = item.get("ras", {}) or {}
        cache = item.get("cache", {}) or {}
        clock = item.get("clock", {}) or {}
        soc_pstate = item.get("soc_pstate", {}) or {}
        xgmi_plpd = item.get("xgmi_plpd", {}) or {}

        # NUMA node falls back to 0 when missing or not numeric.
        try:
            numa_node = int(numa.get("node", 0) or 0)
        except (TypeError, ValueError, OverflowError):
            numa_node = 0

        # VRAM size is reported in MB and stored in bytes.
        vram_type = str(vram.get("vram_type", "") or "unknown")
        vram_vendor = vram.get("vram_vendor")
        vram_bits = vram.get("vram_bit_width")
        vram_size_b: Optional[int] = None
        if vram.get("vram_size_mb") is not None:
            try:
                vram_size_b = int(vram["vram_size_mb"]) * 1024 * 1024
            except (TypeError, ValueError, OverflowError):
                vram_size_b = None

        try:
            # Bus / PCIe
            bus_model = StaticBus(
                bdf=str(bus.get("bdf", "")),
                max_pcie_width=self._valueunit(bus.get("max_pcie_width"), "x"),
                max_pcie_speed=self._valueunit(bus.get("max_pcie_speed"), "GT/s"),
                pcie_interface_version=self._normalize(bus.get("pcie_interface_version")),
                slot_type=self._normalize(bus.get("slot_type"), slot_type=True),
            )

            # ASIC
            asic_model = StaticAsic(
                market_name=self._normalize(
                    asic.get("market_name") or asic.get("asic_name"), default=""
                ),
                vendor_id=str(asic.get("vendor_id", "")),
                vendor_name=str(asic.get("vendor_name", "")),
                subvendor_id=str(asic.get("subvendor_id", "")),
                device_id=str(asic.get("device_id", "")),
                subsystem_id=str(asic.get("subsystem_id", "")),
                rev_id=str(asic.get("rev_id", "")),
                asic_serial=str(asic.get("asic_serial", "")),
                oam_id=_int_or_na(asic.get("oam_id")),
                num_compute_units=_int_or_na(asic.get("num_compute_units")),
                target_graphics_version=str(asic.get("target_graphics_version", "")),
            )

            # Board
            board_model = StaticBoard(
                model_number=str(
                    board.get("model_number", "") or board.get("amdsmi_model_number", "")
                ),
                product_serial=str(board.get("product_serial", "")),
                fru_id=str(board.get("fru_id", "")),
                product_name=str(board.get("product_name", "")),
                manufacturer_name=str(board.get("manufacturer_name", "")),
            )

            # Driver
            driver_model = StaticDriver(
                name=self._normalize(
                    driver.get("driver_name") if driver else None, default="unknown"
                ),
                version=self._normalize(
                    driver.get("driver_version") if driver else None, default="unknown"
                ),
            )

            # VBIOS
            vbios_model: Optional[StaticVbios] = None
            if vbios:
                vbios_model = StaticVbios(
                    name=str(vbios.get("vbios_name", "")),
                    build_date=str(vbios.get("vbios_build_date", "")),
                    part_number=str(vbios.get("vbios_part_number", "")),
                    version=str(vbios.get("vbios_version", "")),
                )

            # NUMA
            numa_model = StaticNuma(
                node=numa_node, affinity=_int_or_na(numa.get("affinity"))
            )

            # VRAM
            vram_model = StaticVram(
                type=vram_type,
                vendor=None if vram_vendor in (None, "", "N/A") else str(vram_vendor),
                size=self._valueunit(vram_size_b, "B"),
                bit_width=self._valueunit(vram_bits, "bit"),
                max_bandwidth=None,
            )

            # SOC P-state
            soc_pstate_model = self._parse_soc_pstate(soc_pstate)

            # XGMI PLPD
            xgmi_plpd_model = self._parse_xgmi_plpd(xgmi_plpd)

            # RAS
            ras_model = self._parse_ras(ras)

            # Cache info
            cache_info_model = self._parse_cache_info(cache)

            # Clock
            clock_dict_model = self._parse_clock_dict(clock)

            out.append(
                AmdSmiStatic(
                    gpu=gpu_idx,
                    asic=asic_model,
                    bus=bus_model,
                    vbios=vbios_model,
                    limit=None,
                    driver=driver_model,
                    board=board_model,
                    ras=ras_model,
                    soc_pstate=soc_pstate_model,
                    xgmi_plpd=xgmi_plpd_model,
                    process_isolation="",
                    numa=numa_model,
                    vram=vram_model,
                    cache_info=cache_info_model,
                    partition=None,
                    clock=clock_dict_model,
                )
            )
        except ValidationError as err:
            # Covers both the sub-models and the top-level AmdSmiStatic model.
            self.logger.error(err)
            self._log_event(
                category=EventCategory.APPLICATION,
                description="Failed to build AmdSmiStatic",
                data={"errors": err.errors(include_url=False), "gpu_index": gpu_idx},
                priority=EventPriority.WARNING,
            )

    return out
|
|
858
|
+
|
|
859
|
+
def _parse_soc_pstate(self, data: dict) -> Optional[StaticSocPstate]:
|
|
860
|
+
"""Parse SOC P-state data
|
|
861
|
+
|
|
862
|
+
Args:
|
|
863
|
+
data (dict): SOC P-state data from amd-smi
|
|
864
|
+
|
|
865
|
+
Returns:
|
|
866
|
+
Optional[StaticSocPstate]: StaticSocPstate instance or None
|
|
867
|
+
"""
|
|
868
|
+
if not isinstance(data, dict):
|
|
869
|
+
return None
|
|
870
|
+
|
|
871
|
+
try:
|
|
872
|
+
num_supported = int(data.get("num_supported", 0) or 0)
|
|
873
|
+
except Exception:
|
|
874
|
+
num_supported = 0
|
|
875
|
+
try:
|
|
876
|
+
current_id = int(data.get("current_id", 0) or 0)
|
|
877
|
+
except Exception:
|
|
878
|
+
current_id = 0
|
|
879
|
+
|
|
880
|
+
policies_raw = data.get("policies") or []
|
|
881
|
+
policies: list[StaticPolicy] = []
|
|
882
|
+
if isinstance(policies_raw, list):
|
|
883
|
+
for p in policies_raw:
|
|
884
|
+
if not isinstance(p, dict):
|
|
885
|
+
continue
|
|
886
|
+
pid = p.get("policy_id", 0)
|
|
887
|
+
desc = p.get("policy_description", "")
|
|
888
|
+
try:
|
|
889
|
+
policies.append(
|
|
890
|
+
StaticPolicy(
|
|
891
|
+
policy_id=int(pid) if pid not in (None, "") else 0,
|
|
892
|
+
policy_description=str(desc),
|
|
893
|
+
)
|
|
894
|
+
)
|
|
895
|
+
except ValidationError:
|
|
896
|
+
continue
|
|
897
|
+
|
|
898
|
+
if not num_supported and not current_id and not policies:
|
|
899
|
+
return None
|
|
900
|
+
|
|
901
|
+
try:
|
|
902
|
+
return StaticSocPstate(
|
|
903
|
+
num_supported=num_supported,
|
|
904
|
+
current_id=current_id,
|
|
905
|
+
policies=policies,
|
|
906
|
+
)
|
|
907
|
+
except ValidationError:
|
|
908
|
+
return None
|
|
909
|
+
|
|
910
|
+
def _parse_xgmi_plpd(self, data: dict) -> Optional[StaticXgmiPlpd]:
|
|
911
|
+
"""Parse XGMI PLPD data
|
|
912
|
+
|
|
913
|
+
Args:
|
|
914
|
+
data (dict): XGMI PLPD data from amd-smi
|
|
915
|
+
|
|
916
|
+
Returns:
|
|
917
|
+
Optional[StaticXgmiPlpd]: StaticXgmiPlpd instance or None
|
|
918
|
+
"""
|
|
919
|
+
if not isinstance(data, dict):
|
|
920
|
+
return None
|
|
921
|
+
|
|
922
|
+
try:
|
|
923
|
+
num_supported = int(data.get("num_supported", 0) or 0)
|
|
924
|
+
except Exception:
|
|
925
|
+
num_supported = 0
|
|
926
|
+
try:
|
|
927
|
+
current_id = int(data.get("current_id", 0) or 0)
|
|
928
|
+
except Exception:
|
|
929
|
+
current_id = 0
|
|
930
|
+
|
|
931
|
+
plpds_raw = data.get("plpds") or []
|
|
932
|
+
plpds: list[StaticPolicy] = []
|
|
933
|
+
if isinstance(plpds_raw, list):
|
|
934
|
+
for p in plpds_raw:
|
|
935
|
+
if not isinstance(p, dict):
|
|
936
|
+
continue
|
|
937
|
+
pid = p.get("policy_id", 0)
|
|
938
|
+
desc = p.get("policy_description", "")
|
|
939
|
+
try:
|
|
940
|
+
plpds.append(
|
|
941
|
+
StaticPolicy(
|
|
942
|
+
policy_id=int(pid) if pid not in (None, "") else 0,
|
|
943
|
+
policy_description=str(desc),
|
|
944
|
+
)
|
|
945
|
+
)
|
|
946
|
+
except ValidationError:
|
|
947
|
+
continue
|
|
948
|
+
|
|
949
|
+
if not num_supported and not current_id and not plpds:
|
|
950
|
+
return None
|
|
951
|
+
|
|
952
|
+
try:
|
|
953
|
+
return StaticXgmiPlpd(
|
|
954
|
+
num_supported=num_supported,
|
|
955
|
+
current_id=current_id,
|
|
956
|
+
plpds=plpds,
|
|
957
|
+
)
|
|
958
|
+
except ValidationError:
|
|
959
|
+
return None
|
|
960
|
+
|
|
961
|
+
def _parse_ras(self, data: dict) -> StaticRas:
    """Parse RAS/ECC data

    Args:
        data (dict): RAS data from amd-smi

    Returns:
        StaticRas: parsed RAS model; an all-"N/A" default is returned when
        ``data`` is not a dict or the parsed values fail validation
    """

    def _fallback() -> StaticRas:
        """Build the default RAS model used when nothing usable is available."""
        return StaticRas(
            eeprom_version="N/A",
            parity_schema=EccState.NA,
            single_bit_schema=EccState.NA,
            double_bit_schema=EccState.NA,
            poison_schema=EccState.NA,
            ecc_block_state={},
        )

    def _ecc(raw: Any) -> EccState:
        """Map a non-empty string onto EccState; everything else becomes NA."""
        if raw and isinstance(raw, str):
            try:
                return EccState(raw.upper())
            except (ValueError, AttributeError):
                pass
        return EccState.NA

    if not isinstance(data, dict):
        return _fallback()

    version = str(data.get("eeprom_version", "N/A") or "N/A")
    parity = _ecc(data.get("parity_schema"))
    single_bit = _ecc(data.get("single_bit_schema"))
    double_bit = _ecc(data.get("double_bit_schema"))
    poison = _ecc(data.get("poison_schema"))

    # Block state may be a per-block mapping or a plain string; any other
    # shape is replaced by an empty mapping.
    raw_blocks = data.get("ecc_block_state", {})
    blocks: Union[Dict[str, EccState], str]
    if isinstance(raw_blocks, dict):
        blocks = {name: _ecc(state) for name, state in raw_blocks.items()}
    elif isinstance(raw_blocks, str):
        blocks = raw_blocks
    else:
        blocks = {}

    try:
        return StaticRas(
            eeprom_version=version,
            parity_schema=parity,
            single_bit_schema=single_bit,
            double_bit_schema=double_bit,
            poison_schema=poison,
            ecc_block_state=blocks,
        )
    except ValidationError:
        return _fallback()
|
|
1027
|
+
|
|
1028
|
+
def _parse_cache_info(self, data: dict) -> list[StaticCacheInfoItem]:
    """Parse cache info data

    Args:
        data (dict): Cache data from amd-smi

    Returns:
        list[StaticCacheInfoItem]: one item per valid entry under ``data["cache"]``;
        empty when ``data`` is not a dict or holds no list under that key
    """
    if not isinstance(data, dict):
        return []
    entries = data.get("cache")
    if not isinstance(entries, list):
        return []

    def _string_list(raw: Any) -> list[str]:
        """Return list members as strings, or split a ','/';' separated string."""
        if isinstance(raw, list):
            return [str(member) for member in raw]
        if not isinstance(raw, str):
            return []
        pieces = (piece.strip() for piece in raw.replace(";", ",").split(","))
        return [piece for piece in pieces if piece]

    parsed: list[StaticCacheInfoItem] = []
    for entry in entries:
        if not isinstance(entry, dict):
            continue

        level = self._valueunit_req(entry.get("cache_level"), "")
        shared_cus = self._valueunit_req(entry.get("max_num_cu_shared"), "")
        instances = self._valueunit_req(entry.get("num_cache_instance"), "")
        size = self._valueunit(entry.get("cache_size"), "", required=False)
        properties = _string_list(entry.get("cache_properties"))

        # Numeric levels are rendered as integers in the label.
        level_value = level.value
        if isinstance(level_value, (int, float)):
            level_value = int(level_value)
        label = ValueUnit(value=f"Label_{level_value}", unit="")

        try:
            cache_item = StaticCacheInfoItem(
                cache=label,
                cache_properties=properties,
                cache_size=size,
                cache_level=level,
                max_num_cu_shared=shared_cus,
                num_cache_instance=instances,
            )
        except ValidationError as err:
            self._log_event(
                category=EventCategory.APPLICATION,
                description="Bad cache info entry from amd-smi; skipping",
                data={"entry": repr(entry), "errors": err.errors(include_url=False)},
                priority=EventPriority.WARNING,
            )
            continue
        parsed.append(cache_item)

    return parsed
|
|
1088
|
+
|
|
1089
|
+
def _parse_clock(self, data: dict) -> Optional[StaticClockData]:
    """Parse clock data

    Args:
        data (dict): Clock data from amd-smi

    Returns:
        Optional[StaticClockData]: StaticClockData instance, or None when
        ``data`` is not a dict, has no usable "frequency" list, or fails
        validation
    """
    if not isinstance(data, dict):
        return None

    raw_freqs = data.get("frequency")
    if not (isinstance(raw_freqs, list) and raw_freqs):
        return None

    def _scale_to_mhz(raw: object) -> Optional[int]:
        """Convert one raw frequency into a rounded integer value."""
        number = self._to_number(raw)
        if number is None:
            return None
        as_float = float(number)
        # Magnitude-based scaling: presumably Hz (>= 1e7) and kHz (>= 1e4)
        # inputs being brought down to MHz; confirm against amd-smi output.
        if as_float >= 1e7:
            return int(round(as_float / 1_000_000.0))
        if as_float >= 1e4:
            return int(round(as_float / 1_000.0))
        return int(round(as_float))

    mhz_values: list[int] = []
    for raw in raw_freqs:
        scaled = _scale_to_mhz(raw)
        if scaled is not None:
            mhz_values.append(scaled)
    if not mhz_values:
        return None

    # Only the first three frequencies are used; missing levels stay None.
    labels: list[Optional[str]] = [f"{mhz} MHz" for mhz in mhz_values[:3]]
    labels.extend([None] * (3 - len(labels)))
    level0, level1, level2 = labels

    raw_current = data.get("current")
    current: Optional[int] = None
    if isinstance(raw_current, (int, float)):
        current = int(raw_current)
    elif isinstance(raw_current, str) and raw_current.strip() and raw_current.upper() != "N/A":
        try:
            current = int(raw_current.strip())
        except Exception:
            current = None

    try:
        levels = StaticFrequencyLevels.model_validate(
            {"Level 0": level0, "Level 1": level1, "Level 2": level2}
        )
        # "current level" is the key the model is validated with for the current value
        return StaticClockData.model_validate(
            {"frequency_levels": levels, "current level": current}
        )
    except ValidationError:
        return None
|
|
1155
|
+
|
|
1156
|
+
def _parse_clock_dict(self, data: dict) -> Optional[dict[str, Union[StaticClockData, None]]]:
    """Parse clock data into dictionary structure

    Args:
        data (dict): Clock data from amd-smi

    Returns:
        Optional[dict[str, Union[StaticClockData, None]]]: ``{"clk": <clock data>}``
        when ``self._parse_clock`` yields a result, otherwise None
    """
    if not isinstance(data, dict):
        return None

    parsed = self._parse_clock(data)
    if not parsed:
        return None

    result: dict[str, Union[StaticClockData, None]] = {"clk": parsed}
    return result
|
|
1175
|
+
|
|
1176
|
+
def get_cper_data(self) -> List[FileModel]:
    """Collect CPER data from amd-smi ras command

    Steps: prepare a scratch folder on the target, run the amd-smi RAS
    command into it, tar the folder, read the archive back and extract
    every ``.cper`` member into a ``FileModel``. Any failure is logged via
    ``self._log_event`` and results in an empty list.

    Returns:
        list[FileModel]: List of CPER files or empty list if not supported/available
    """
    try:
        AMD_SMI_CPER_FOLDER = "/tmp/amd_smi_cper"
        # Ensure the cper folder exists and holds no stale .cper/.json files
        self._run_sut_cmd(
            f"mkdir -p {AMD_SMI_CPER_FOLDER} && rm -f {AMD_SMI_CPER_FOLDER}/*.cper && rm -f {AMD_SMI_CPER_FOLDER}/*.json",
            sudo=False,
        )
        # Run amd-smi ras command with sudo to collect CPER data
        cper_cmd_ret = self._run_sut_cmd(
            f"{self.AMD_SMI_EXE} {self.CMD_RAS.format(folder=AMD_SMI_CPER_FOLDER)}",
            sudo=True,
        )
        if cper_cmd_ret.exit_code != 0:
            # Command failed, return empty list
            return []
        cper_cmd = cper_cmd_ret.stdout
        # Look for .cper file names in the command output to confirm that
        # at least one CPER file was actually created
        regex_cper_search = re.findall(r"(\w+\.cper)", cper_cmd)
        if not regex_cper_search:
            # Early exit if no CPER files were created
            return []
        # tar the cper folder
        self._run_sut_cmd(
            f"tar -czf {AMD_SMI_CPER_FOLDER}.tar.gz -C {AMD_SMI_CPER_FOLDER} .",
            sudo=True,
        )
        # Load the tar file
        cper_zip = self._read_sut_file(
            f"{AMD_SMI_CPER_FOLDER}.tar.gz", encoding=None, strip=False, log_artifact=True
        )
        # Since encoding=None, this returns BinaryFileArtifact which has contents: bytes
        if hasattr(cper_zip, "contents"):
            io_bytes = io.BytesIO(cper_zip.contents)  # type: ignore[attr-defined]
        else:
            return []
        del cper_zip  # Free memory after reading the file
        try:
            with TarFile.open(fileobj=io_bytes, mode="r:gz") as tar_file:
                cper_data = []
                for member in tar_file.getmembers():
                    if member.isfile() and member.name.endswith(".cper"):
                        file_content = tar_file.extractfile(member)
                        if file_content is not None:
                            # Keep the raw bytes; the member content is
                            # stored as-is and not decoded here
                            file_content_bytes = file_content.read()
                        else:
                            file_content_bytes = b""
                        cper_data.append(
                            FileModel(file_contents=file_content_bytes, file_name=member.name)
                        )
                # Since we do not log the cper data in the data model create an event informing the user if CPER created
                if cper_data:
                    self._log_event(
                        category=EventCategory.APPLICATION,
                        description="CPER data has been extracted from amd-smi",
                        data={
                            "cper_count": len(cper_data),
                        },
                        priority=EventPriority.INFO,
                    )
        except Exception as e:
            # Archive could not be opened or read: report it and give up
            self._log_event(
                category=EventCategory.APPLICATION,
                description="Error extracting cper data",
                data={
                    "exception": get_exception_traceback(e),
                },
                priority=EventPriority.ERROR,
                console_log=True,
            )
            return []
        return cper_data
    except Exception as e:
        # If any unexpected error occurs during CPER collection, log it and return empty list
        # This ensures CPER collection failures don't break the entire data collection
        self._log_event(
            category=EventCategory.APPLICATION,
            description="Error collecting CPER data",
            data={
                "exception": get_exception_traceback(e),
            },
            priority=EventPriority.WARNING,
            console_log=False,
        )
        return []
|
|
1268
|
+
|
|
1269
|
+
def collect_data(
    self,
    args: Any = None,
) -> tuple[TaskResult, Optional[AmdSmiDataModel]]:
    """Collect AmdSmi data from system

    Args:
        args (Any, optional): optional arguments for data collection; not read
            by this method. Defaults to None.

    Returns:
        tuple[TaskResult, Optional[AmdSmiDataModel]]: task result and collected data model
        (None when amd-smi is missing, no data was gathered, or an error occurred)
    """
    installed = self._check_amdsmi_installed()
    if not installed:
        self._log_event(
            category=EventCategory.APPLICATION,
            description="amd-smi is not installed",
            priority=EventPriority.WARNING,
            console_log=True,
        )
        self.result.status = ExecutionStatus.NOT_RAN
        return self.result, None

    try:
        tool_version = self._get_amdsmi_version()
        if tool_version is not None:
            self.logger.info("amd-smi version: %s", tool_version.version)
            self.logger.info("ROCm version: %s", tool_version.rocm_version)

        # A None result is passed through unchanged as the "no data" marker.
        collected = self._get_amdsmi_data()
    except Exception as exc:
        self._log_event(
            category=EventCategory.APPLICATION,
            description="Error running amd-smi collector",
            data={"exception": get_exception_traceback(exc)},
            priority=EventPriority.ERROR,
            console_log=True,
        )
        self.result.status = ExecutionStatus.EXECUTION_FAILURE
        return self.result, None
    else:
        return self.result, collected
|