amd-node-scraper 0.0.1__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- amd_node_scraper-0.0.1.dist-info/LICENSE +21 -0
- amd_node_scraper-0.0.1.dist-info/METADATA +424 -0
- amd_node_scraper-0.0.1.dist-info/RECORD +197 -0
- amd_node_scraper-0.0.1.dist-info/WHEEL +5 -0
- amd_node_scraper-0.0.1.dist-info/entry_points.txt +2 -0
- amd_node_scraper-0.0.1.dist-info/top_level.txt +1 -0
- nodescraper/__init__.py +32 -0
- nodescraper/base/__init__.py +34 -0
- nodescraper/base/inbandcollectortask.py +118 -0
- nodescraper/base/inbanddataplugin.py +39 -0
- nodescraper/base/regexanalyzer.py +120 -0
- nodescraper/cli/__init__.py +29 -0
- nodescraper/cli/cli.py +511 -0
- nodescraper/cli/constants.py +27 -0
- nodescraper/cli/dynamicparserbuilder.py +171 -0
- nodescraper/cli/helper.py +517 -0
- nodescraper/cli/inputargtypes.py +129 -0
- nodescraper/configbuilder.py +123 -0
- nodescraper/configregistry.py +66 -0
- nodescraper/configs/node_status.json +19 -0
- nodescraper/connection/__init__.py +25 -0
- nodescraper/connection/inband/__init__.py +46 -0
- nodescraper/connection/inband/inband.py +171 -0
- nodescraper/connection/inband/inbandlocal.py +93 -0
- nodescraper/connection/inband/inbandmanager.py +151 -0
- nodescraper/connection/inband/inbandremote.py +173 -0
- nodescraper/connection/inband/sshparams.py +43 -0
- nodescraper/constants.py +26 -0
- nodescraper/enums/__init__.py +40 -0
- nodescraper/enums/eventcategory.py +89 -0
- nodescraper/enums/eventpriority.py +42 -0
- nodescraper/enums/executionstatus.py +44 -0
- nodescraper/enums/osfamily.py +34 -0
- nodescraper/enums/systeminteraction.py +41 -0
- nodescraper/enums/systemlocation.py +33 -0
- nodescraper/generictypes.py +36 -0
- nodescraper/interfaces/__init__.py +44 -0
- nodescraper/interfaces/connectionmanager.py +143 -0
- nodescraper/interfaces/dataanalyzertask.py +138 -0
- nodescraper/interfaces/datacollectortask.py +185 -0
- nodescraper/interfaces/dataplugin.py +356 -0
- nodescraper/interfaces/plugin.py +127 -0
- nodescraper/interfaces/resultcollator.py +56 -0
- nodescraper/interfaces/task.py +164 -0
- nodescraper/interfaces/taskresulthook.py +39 -0
- nodescraper/models/__init__.py +48 -0
- nodescraper/models/analyzerargs.py +93 -0
- nodescraper/models/collectorargs.py +30 -0
- nodescraper/models/connectionconfig.py +34 -0
- nodescraper/models/datamodel.py +171 -0
- nodescraper/models/datapluginresult.py +39 -0
- nodescraper/models/event.py +158 -0
- nodescraper/models/pluginconfig.py +38 -0
- nodescraper/models/pluginresult.py +39 -0
- nodescraper/models/systeminfo.py +44 -0
- nodescraper/models/taskresult.py +185 -0
- nodescraper/models/timerangeargs.py +38 -0
- nodescraper/pluginexecutor.py +274 -0
- nodescraper/pluginregistry.py +152 -0
- nodescraper/plugins/__init__.py +25 -0
- nodescraper/plugins/inband/__init__.py +25 -0
- nodescraper/plugins/inband/amdsmi/__init__.py +28 -0
- nodescraper/plugins/inband/amdsmi/amdsmi_analyzer.py +821 -0
- nodescraper/plugins/inband/amdsmi/amdsmi_collector.py +1313 -0
- nodescraper/plugins/inband/amdsmi/amdsmi_plugin.py +43 -0
- nodescraper/plugins/inband/amdsmi/amdsmidata.py +1002 -0
- nodescraper/plugins/inband/amdsmi/analyzer_args.py +50 -0
- nodescraper/plugins/inband/amdsmi/cper.py +65 -0
- nodescraper/plugins/inband/bios/__init__.py +29 -0
- nodescraper/plugins/inband/bios/analyzer_args.py +64 -0
- nodescraper/plugins/inband/bios/bios_analyzer.py +93 -0
- nodescraper/plugins/inband/bios/bios_collector.py +93 -0
- nodescraper/plugins/inband/bios/bios_plugin.py +43 -0
- nodescraper/plugins/inband/bios/biosdata.py +30 -0
- nodescraper/plugins/inband/cmdline/__init__.py +25 -0
- nodescraper/plugins/inband/cmdline/analyzer_args.py +80 -0
- nodescraper/plugins/inband/cmdline/cmdline_analyzer.py +113 -0
- nodescraper/plugins/inband/cmdline/cmdline_collector.py +77 -0
- nodescraper/plugins/inband/cmdline/cmdline_plugin.py +43 -0
- nodescraper/plugins/inband/cmdline/cmdlinedata.py +30 -0
- nodescraper/plugins/inband/device_enumeration/__init__.py +29 -0
- nodescraper/plugins/inband/device_enumeration/analyzer_args.py +73 -0
- nodescraper/plugins/inband/device_enumeration/device_enumeration_analyzer.py +81 -0
- nodescraper/plugins/inband/device_enumeration/device_enumeration_collector.py +176 -0
- nodescraper/plugins/inband/device_enumeration/device_enumeration_plugin.py +45 -0
- nodescraper/plugins/inband/device_enumeration/deviceenumdata.py +36 -0
- nodescraper/plugins/inband/dimm/__init__.py +25 -0
- nodescraper/plugins/inband/dimm/collector_args.py +31 -0
- nodescraper/plugins/inband/dimm/dimm_collector.py +151 -0
- nodescraper/plugins/inband/dimm/dimm_plugin.py +40 -0
- nodescraper/plugins/inband/dimm/dimmdata.py +30 -0
- nodescraper/plugins/inband/dkms/__init__.py +25 -0
- nodescraper/plugins/inband/dkms/analyzer_args.py +85 -0
- nodescraper/plugins/inband/dkms/dkms_analyzer.py +106 -0
- nodescraper/plugins/inband/dkms/dkms_collector.py +76 -0
- nodescraper/plugins/inband/dkms/dkms_plugin.py +43 -0
- nodescraper/plugins/inband/dkms/dkmsdata.py +33 -0
- nodescraper/plugins/inband/dmesg/__init__.py +28 -0
- nodescraper/plugins/inband/dmesg/analyzer_args.py +33 -0
- nodescraper/plugins/inband/dmesg/collector_args.py +39 -0
- nodescraper/plugins/inband/dmesg/dmesg_analyzer.py +503 -0
- nodescraper/plugins/inband/dmesg/dmesg_collector.py +164 -0
- nodescraper/plugins/inband/dmesg/dmesg_plugin.py +44 -0
- nodescraper/plugins/inband/dmesg/dmesgdata.py +116 -0
- nodescraper/plugins/inband/fabrics/__init__.py +28 -0
- nodescraper/plugins/inband/fabrics/fabrics_collector.py +726 -0
- nodescraper/plugins/inband/fabrics/fabrics_plugin.py +37 -0
- nodescraper/plugins/inband/fabrics/fabricsdata.py +140 -0
- nodescraper/plugins/inband/journal/__init__.py +28 -0
- nodescraper/plugins/inband/journal/collector_args.py +33 -0
- nodescraper/plugins/inband/journal/journal_collector.py +107 -0
- nodescraper/plugins/inband/journal/journal_plugin.py +40 -0
- nodescraper/plugins/inband/journal/journaldata.py +44 -0
- nodescraper/plugins/inband/kernel/__init__.py +25 -0
- nodescraper/plugins/inband/kernel/analyzer_args.py +64 -0
- nodescraper/plugins/inband/kernel/kernel_analyzer.py +91 -0
- nodescraper/plugins/inband/kernel/kernel_collector.py +129 -0
- nodescraper/plugins/inband/kernel/kernel_plugin.py +43 -0
- nodescraper/plugins/inband/kernel/kerneldata.py +32 -0
- nodescraper/plugins/inband/kernel_module/__init__.py +25 -0
- nodescraper/plugins/inband/kernel_module/analyzer_args.py +59 -0
- nodescraper/plugins/inband/kernel_module/kernel_module_analyzer.py +211 -0
- nodescraper/plugins/inband/kernel_module/kernel_module_collector.py +264 -0
- nodescraper/plugins/inband/kernel_module/kernel_module_data.py +60 -0
- nodescraper/plugins/inband/kernel_module/kernel_module_plugin.py +43 -0
- nodescraper/plugins/inband/memory/__init__.py +25 -0
- nodescraper/plugins/inband/memory/analyzer_args.py +45 -0
- nodescraper/plugins/inband/memory/memory_analyzer.py +98 -0
- nodescraper/plugins/inband/memory/memory_collector.py +330 -0
- nodescraper/plugins/inband/memory/memory_plugin.py +43 -0
- nodescraper/plugins/inband/memory/memorydata.py +90 -0
- nodescraper/plugins/inband/network/__init__.py +28 -0
- nodescraper/plugins/inband/network/network_collector.py +1828 -0
- nodescraper/plugins/inband/network/network_plugin.py +37 -0
- nodescraper/plugins/inband/network/networkdata.py +319 -0
- nodescraper/plugins/inband/nvme/__init__.py +28 -0
- nodescraper/plugins/inband/nvme/nvme_collector.py +167 -0
- nodescraper/plugins/inband/nvme/nvme_plugin.py +37 -0
- nodescraper/plugins/inband/nvme/nvmedata.py +45 -0
- nodescraper/plugins/inband/os/__init__.py +25 -0
- nodescraper/plugins/inband/os/analyzer_args.py +64 -0
- nodescraper/plugins/inband/os/os_analyzer.py +73 -0
- nodescraper/plugins/inband/os/os_collector.py +131 -0
- nodescraper/plugins/inband/os/os_plugin.py +43 -0
- nodescraper/plugins/inband/os/osdata.py +31 -0
- nodescraper/plugins/inband/package/__init__.py +25 -0
- nodescraper/plugins/inband/package/analyzer_args.py +48 -0
- nodescraper/plugins/inband/package/package_analyzer.py +253 -0
- nodescraper/plugins/inband/package/package_collector.py +273 -0
- nodescraper/plugins/inband/package/package_plugin.py +43 -0
- nodescraper/plugins/inband/package/packagedata.py +41 -0
- nodescraper/plugins/inband/pcie/__init__.py +29 -0
- nodescraper/plugins/inband/pcie/analyzer_args.py +63 -0
- nodescraper/plugins/inband/pcie/pcie_analyzer.py +1081 -0
- nodescraper/plugins/inband/pcie/pcie_collector.py +690 -0
- nodescraper/plugins/inband/pcie/pcie_data.py +2017 -0
- nodescraper/plugins/inband/pcie/pcie_plugin.py +43 -0
- nodescraper/plugins/inband/process/__init__.py +25 -0
- nodescraper/plugins/inband/process/analyzer_args.py +45 -0
- nodescraper/plugins/inband/process/collector_args.py +31 -0
- nodescraper/plugins/inband/process/process_analyzer.py +91 -0
- nodescraper/plugins/inband/process/process_collector.py +115 -0
- nodescraper/plugins/inband/process/process_plugin.py +46 -0
- nodescraper/plugins/inband/process/processdata.py +34 -0
- nodescraper/plugins/inband/rocm/__init__.py +25 -0
- nodescraper/plugins/inband/rocm/analyzer_args.py +66 -0
- nodescraper/plugins/inband/rocm/rocm_analyzer.py +100 -0
- nodescraper/plugins/inband/rocm/rocm_collector.py +205 -0
- nodescraper/plugins/inband/rocm/rocm_plugin.py +43 -0
- nodescraper/plugins/inband/rocm/rocmdata.py +62 -0
- nodescraper/plugins/inband/storage/__init__.py +25 -0
- nodescraper/plugins/inband/storage/analyzer_args.py +38 -0
- nodescraper/plugins/inband/storage/collector_args.py +31 -0
- nodescraper/plugins/inband/storage/storage_analyzer.py +152 -0
- nodescraper/plugins/inband/storage/storage_collector.py +110 -0
- nodescraper/plugins/inband/storage/storage_plugin.py +44 -0
- nodescraper/plugins/inband/storage/storagedata.py +70 -0
- nodescraper/plugins/inband/sysctl/__init__.py +29 -0
- nodescraper/plugins/inband/sysctl/analyzer_args.py +67 -0
- nodescraper/plugins/inband/sysctl/sysctl_analyzer.py +81 -0
- nodescraper/plugins/inband/sysctl/sysctl_collector.py +101 -0
- nodescraper/plugins/inband/sysctl/sysctl_plugin.py +43 -0
- nodescraper/plugins/inband/sysctl/sysctldata.py +42 -0
- nodescraper/plugins/inband/syslog/__init__.py +28 -0
- nodescraper/plugins/inband/syslog/syslog_collector.py +121 -0
- nodescraper/plugins/inband/syslog/syslog_plugin.py +37 -0
- nodescraper/plugins/inband/syslog/syslogdata.py +46 -0
- nodescraper/plugins/inband/uptime/__init__.py +25 -0
- nodescraper/plugins/inband/uptime/uptime_collector.py +88 -0
- nodescraper/plugins/inband/uptime/uptime_plugin.py +37 -0
- nodescraper/plugins/inband/uptime/uptimedata.py +31 -0
- nodescraper/resultcollators/__init__.py +25 -0
- nodescraper/resultcollators/tablesummary.py +159 -0
- nodescraper/taskresulthooks/__init__.py +28 -0
- nodescraper/taskresulthooks/filesystemloghook.py +88 -0
- nodescraper/typeutils.py +171 -0
- nodescraper/utils.py +412 -0
|
@@ -0,0 +1,1002 @@
|
|
|
1
|
+
###############################################################################
|
|
2
|
+
#
|
|
3
|
+
# MIT License
|
|
4
|
+
#
|
|
5
|
+
# Copyright (c) 2025 Advanced Micro Devices, Inc.
|
|
6
|
+
#
|
|
7
|
+
# Permission is hereby granted, free of charge, to any person obtaining a copy
|
|
8
|
+
# of this software and associated documentation files (the "Software"), to deal
|
|
9
|
+
# in the Software without restriction, including without limitation the rights
|
|
10
|
+
# to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
|
|
11
|
+
# copies of the Software, and to permit persons to whom the Software is
|
|
12
|
+
# furnished to do so, subject to the following conditions:
|
|
13
|
+
#
|
|
14
|
+
# The above copyright notice and this permission notice shall be included in all
|
|
15
|
+
# copies or substantial portions of the Software.
|
|
16
|
+
#
|
|
17
|
+
# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
|
|
18
|
+
# IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
|
|
19
|
+
# FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
|
|
20
|
+
# AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
|
|
21
|
+
# LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
|
|
22
|
+
# OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
|
|
23
|
+
# SOFTWARE.
|
|
24
|
+
#
|
|
25
|
+
###############################################################################
|
|
26
|
+
import re
|
|
27
|
+
from enum import Enum
|
|
28
|
+
from typing import Any, Mapping, Optional, Union
|
|
29
|
+
|
|
30
|
+
from pydantic import (
|
|
31
|
+
AliasChoices,
|
|
32
|
+
BaseModel,
|
|
33
|
+
ConfigDict,
|
|
34
|
+
Field,
|
|
35
|
+
computed_field,
|
|
36
|
+
field_validator,
|
|
37
|
+
model_validator,
|
|
38
|
+
)
|
|
39
|
+
|
|
40
|
+
from nodescraper.models.datamodel import DataModel, FileModel
|
|
41
|
+
from nodescraper.utils import find_annotation_in_container
|
|
42
|
+
|
|
43
|
+
_NUM_UNIT_RE = re.compile(r"^\s*([-+]?\d+(?:\.\d+)?)(?:\s*([A-Za-z%/][A-Za-z0-9%/._-]*))?\s*$")
|
|
44
|
+
|
|
45
|
+
|
|
46
|
+
def na_to_none(values: Union[int, str]):
    """Map the placeholder string ``"N/A"`` to ``None``.

    Only the exact string ``"N/A"`` is recognised (no stripping and no case
    folding); every other input is handed back unchanged.
    """
    return None if values == "N/A" else values
|
|
50
|
+
|
|
51
|
+
|
|
52
|
+
def na_to_none_list(values: list[Union[int, str, None]]) -> list[Union[int, str, None]]:
    """Return a copy of ``values`` with every ``"N/A"`` entry replaced by ``None``.

    The input sequence is never mutated. Only the exact string ``"N/A"`` is
    treated as a placeholder.

    Args:
        values: List (or tuple) of raw entries.

    Returns:
        A new list with placeholders replaced. Anything that is not a list or
        tuple is returned unchanged, so that a "before" validator using this
        helper lets the regular field validation report the type problem
        instead of failing with ``AttributeError`` on ``.copy()``.
    """
    if not isinstance(values, (list, tuple)):
        return values
    return [None if item == "N/A" else item for item in values]
|
|
58
|
+
|
|
59
|
+
|
|
60
|
+
def na_to_none_dict(values: object) -> Optional[dict[str, Any]]:
    """Normalise a mapping-like field whose placeholders should become ``None``.

    Returns ``None`` when the input is ``None``, a placeholder string
    (``"N/A"``, ``"NA"`` or empty, ignoring case and surrounding whitespace)
    or anything that is not a mapping. Otherwise returns a new dict in which
    placeholder string values are replaced by ``None`` and all other values
    are kept as they are.
    """

    def _is_placeholder(item: object) -> bool:
        return isinstance(item, str) and item.strip().upper() in {"N/A", "NA", ""}

    if values is None or _is_placeholder(values) or not isinstance(values, Mapping):
        return None

    return {key: (None if _is_placeholder(val) else val) for key, val in values.items()}
|
|
77
|
+
|
|
78
|
+
|
|
79
|
+
class AmdSmiBaseModel(BaseModel):
    """Base model for AMD SMI data models.

    Gives derived models a shared configuration (non-empty, whitespace-stripped
    strings; population by field name; unknown fields rejected) and pre-wraps
    primitive inputs destined for ``ValueUnit`` fields.
    """

    model_config = ConfigDict(
        str_min_length=1,
        str_strip_whitespace=True,
        populate_by_name=True,
        extra="forbid",  # Forbid extra fields not defined in the model
    )

    def __init__(self, **data):
        """Wrap primitive values of ``ValueUnit`` fields, then validate.

        For every field whose annotation contains ``ValueUnit`` (as reported
        by ``find_annotation_in_container``), an ``int``/``str``/``float``
        supplied under the field name is replaced by
        ``{"value": <primitive>, "unit": ""}``.

        Placeholder strings (``"N/A"``, ``"NA"``, empty; case and whitespace
        ignored) are deliberately left unwrapped: wrapping them would hide
        the raw ``"N/A"`` from field-level "before" validators that compare
        against that exact string.
        """
        for field_name, field_info in self.__class__.model_fields.items():
            target_type, _container = find_annotation_in_container(
                field_info.annotation, ValueUnit
            )
            if target_type is None:
                continue

            # A missing key yields None, which is not a wrappable primitive.
            raw = data.get(field_name)
            if not isinstance(raw, (int, str, float)):
                continue
            if isinstance(raw, str) and raw.strip().upper() in {"N/A", "NA", ""}:
                continue

            data[field_name] = {"value": raw, "unit": ""}

        super().__init__(**data)
|
|
109
|
+
|
|
110
|
+
|
|
111
|
+
class ValueUnit(BaseModel):
    """A value paired with a unit string.

    Raw input handled by the pre-validator:
    - dict: {"value": 123, "unit": "W"}
    - number: 123 -> unit=""
    - string with number+unit: "123 W" -> {"value": 123, "unit": "W"}
    - any other string: kept (stripped) as the value with unit=""
    - "N/A" / "NA" / "" / None: the pre-validator returns None.
      NOTE(review): None is not itself a ValueUnit payload, so callers
      presumably map such placeholders to None before this model is
      reached - confirm.
    """

    value: Union[int, float, str]
    unit: str = ""

    @model_validator(mode="before")
    @classmethod
    def _coerce(cls, v):
        """Turn the supported raw shapes into a ``{"value", "unit"}`` dict."""

        def is_na(x) -> bool:
            return x is None or (isinstance(x, str) and x.strip().upper() in {"N/A", "NA", ""})

        def to_number(num: str) -> Union[int, float]:
            # The regex only yields plain decimal literals; a dot means float.
            return float(num) if "." in num else int(num)

        if is_na(v):
            return None

        if isinstance(v, dict):
            val = v.get("value")
            if is_na(val):
                return None
            unit = v.get("unit", "")
            # An explicit ``"unit": None`` is treated like a missing unit; it
            # would otherwise fail ``str`` validation before ``_clean_unit``
            # (an "after" validator) gets a chance to normalise it.
            if unit is None:
                unit = ""
            # Only split "123 W"-style strings when no unit was supplied.
            if isinstance(val, str) and not unit:
                m = _NUM_UNIT_RE.match(val.strip())
                if m:
                    num, parsed_unit = m.groups()
                    val = to_number(num)
                    unit = parsed_unit or ""
            return {"value": val, "unit": unit}

        # numbers
        if isinstance(v, (int, float)):
            return {"value": v, "unit": ""}

        if isinstance(v, str):
            s = v.strip()
            m = _NUM_UNIT_RE.match(s)
            if m is None:
                return {"value": s, "unit": ""}
            num, parsed_unit = m.groups()
            return {"value": to_number(num), "unit": parsed_unit or ""}

        # Anything else is left for the regular model validation to judge.
        return v

    @field_validator("unit")
    @classmethod
    def _clean_unit(cls, u):
        """Strip surrounding whitespace from the unit."""
        return "" if u is None else str(u).strip()
|
|
166
|
+
|
|
167
|
+
|
|
168
|
+
# Process
|
|
169
|
+
class ProcessMemoryUsage(BaseModel):
    """Memory usage values attached to a process entry.

    All fields are required but nullable; a raw ``"N/A"`` is converted to
    ``None`` before validation.
    """

    gtt_mem: Optional[ValueUnit]
    cpu_mem: Optional[ValueUnit]
    vram_mem: Optional[ValueUnit]

    na_validator = field_validator(
        "gtt_mem",
        "cpu_mem",
        "vram_mem",
        mode="before",
    )(na_to_none)
|
|
175
|
+
|
|
176
|
+
|
|
177
|
+
class ProcessUsage(BaseModel):
    """Engine usage values attached to a process entry.

    Both fields are required but nullable; a raw ``"N/A"`` becomes ``None``.
    """

    # AMDSMI reports engine usage in nanoseconds
    gfx: Optional[ValueUnit]
    enc: Optional[ValueUnit]

    na_validator = field_validator(
        "gfx",
        "enc",
        mode="before",
    )(na_to_none)
|
|
182
|
+
|
|
183
|
+
|
|
184
|
+
class ProcessInfo(BaseModel):
    """Structured description of one process: name, pid and usage figures.

    ``mem_usage`` is required but nullable; a raw ``"N/A"`` becomes ``None``.
    """

    name: str
    pid: int
    memory_usage: ProcessMemoryUsage
    mem_usage: Optional[ValueUnit]
    usage: ProcessUsage

    na_validator = field_validator(
        "mem_usage",
        mode="before",
    )(na_to_none)
|
|
191
|
+
|
|
192
|
+
|
|
193
|
+
class EccState(Enum):
    """String states accepted by the RAS schema and ECC block-state fields.

    Each member's value equals its name, except ``NA`` whose value is the
    placeholder string ``"N/A"``.
    """

    ENABLED = "ENABLED"
    DISABLED = "DISABLED"
    NONE = "NONE"
    PARITY = "PARITY"
    SING_C = "SING_C"
    MULT_UC = "MULT_UC"
    POISON = "POISON"
    NA = "N/A"
|
|
202
|
+
|
|
203
|
+
|
|
204
|
+
class ProcessListItem(BaseModel):
    """One entry of a process list.

    ``process_info`` is either a structured ``ProcessInfo`` or a plain string.
    """

    process_info: Union[ProcessInfo, str]
|
|
206
|
+
|
|
207
|
+
|
|
208
|
+
class Processes(BaseModel):
    """Process list associated with one GPU index."""

    gpu: int
    process_list: list[ProcessListItem]
|
|
211
|
+
|
|
212
|
+
|
|
213
|
+
# FW
|
|
214
|
+
class FwListItem(BaseModel):
    """A firmware identifier together with its version string."""

    fw_id: str
    fw_version: str
|
|
217
|
+
|
|
218
|
+
|
|
219
|
+
class Fw(BaseModel):
    """Firmware information associated with one GPU index.

    ``fw_list`` is either a list of ``FwListItem`` entries or a plain string.
    """

    gpu: int
    fw_list: Union[list[FwListItem], str]
|
|
222
|
+
|
|
223
|
+
|
|
224
|
+
class AmdSmiListItem(BaseModel):
    """Identification record for one GPU: index, BDF, UUID and numeric ids."""

    gpu: int
    bdf: str
    uuid: str
    kfd_id: int
    node_id: int
    partition_id: int
|
|
231
|
+
|
|
232
|
+
|
|
233
|
+
class AmdSmiVersion(BaseModel):
    """Version information for amd-smi; every field is an optional string."""

    tool: Optional[str] = None
    version: Optional[str] = None
    amdsmi_library_version: Optional[str] = None
    rocm_version: Optional[str] = None
    amdgpu_version: Optional[str] = None
    amd_hsmp_driver_version: Optional[str] = None

    @field_validator("*", mode="before")
    @classmethod
    def _stringify(cls, v):
        """Coerce any raw field value to ``str`` (``None`` stays ``None``).

        Bytes are decoded as UTF-8 with undecodable bytes dropped; tuples and
        lists are joined with dots; everything else goes through ``str()``.
        """
        if isinstance(v, (bytes, bytearray)):
            return v.decode("utf-8", "ignore")
        if isinstance(v, (tuple, list)):
            return ".".join(map(str, v))
        return v if v is None or isinstance(v, str) else str(v)
|
|
253
|
+
|
|
254
|
+
|
|
255
|
+
class PartitionAccelerator(BaseModel):
    """Accelerator partition record; only ``gpu_id`` is mandatory."""

    gpu_id: int
    memory: Optional[str] = None
    accelerator_type: Optional[str] = None
    accelerator_profile_index: Optional[Union[str, int]] = None
    partition_id: Optional[int] = None
|
|
263
|
+
|
|
264
|
+
|
|
265
|
+
class PartitionMemory(BaseModel):
    """Memory partition record: a GPU id plus an optional partition type."""

    gpu_id: int
    partition_type: Optional[str] = None
|
|
270
|
+
|
|
271
|
+
|
|
272
|
+
class PartitionCompute(BaseModel):
    """Compute partition record: a GPU id plus an optional partition type."""

    gpu_id: int
    partition_type: Optional[str] = None
|
|
277
|
+
|
|
278
|
+
|
|
279
|
+
class Partition(BaseModel):
    """Partition information for amd-smi.

    Holds the memory and compute partition records; both lists default to
    empty.
    """

    memory_partition: list[PartitionMemory] = Field(default_factory=list)
    compute_partition: list[PartitionCompute] = Field(default_factory=list)
|
|
284
|
+
|
|
285
|
+
|
|
286
|
+
### STATIC DATA ###
|
|
287
|
+
class StaticAsic(BaseModel):
    """Static ASIC identification fields; all are required."""

    market_name: str
    vendor_id: str
    vendor_name: str
    subvendor_id: str
    device_id: str
    subsystem_id: str
    rev_id: str
    asic_serial: str
    oam_id: Union[int, str]  # can be N/A
    num_compute_units: Union[int, str]  # can be N/A
    target_graphics_version: str
|
|
299
|
+
|
|
300
|
+
|
|
301
|
+
class StaticBus(AmdSmiBaseModel):
    """Static bus information.

    Only ``bdf`` is required. The PCIe width/speed default to ``None`` and
    the two descriptive strings default to ``"unknown"``.
    """

    bdf: str
    max_pcie_width: Optional[ValueUnit] = None
    max_pcie_speed: Optional[ValueUnit] = None
    pcie_interface_version: str = "unknown"
    slot_type: str = "unknown"
|
|
307
|
+
|
|
308
|
+
|
|
309
|
+
class StaticVbios(BaseModel):
    """Static VBIOS fields: name, build date, part number and version."""

    name: str
    build_date: str
    part_number: str
    version: str
|
|
314
|
+
|
|
315
|
+
|
|
316
|
+
class StaticLimit(AmdSmiBaseModel):
    """Static power and temperature limits.

    Every field is optional and defaults to ``None``; a raw ``"N/A"`` is
    converted to ``None`` before validation.
    """

    max_power: Optional[ValueUnit] = None
    min_power: Optional[ValueUnit] = None
    socket_power: Optional[ValueUnit] = None
    slowdown_edge_temperature: Optional[ValueUnit] = None
    slowdown_hotspot_temperature: Optional[ValueUnit] = None
    slowdown_vram_temperature: Optional[ValueUnit] = None
    shutdown_edge_temperature: Optional[ValueUnit] = None
    shutdown_hotspot_temperature: Optional[ValueUnit] = None
    shutdown_vram_temperature: Optional[ValueUnit] = None

    na_validator = field_validator(
        "max_power",
        "min_power",
        "socket_power",
        "slowdown_edge_temperature",
        "slowdown_hotspot_temperature",
        "slowdown_vram_temperature",
        "shutdown_edge_temperature",
        "shutdown_hotspot_temperature",
        "shutdown_vram_temperature",
        mode="before",
    )(na_to_none)
|
|
338
|
+
|
|
339
|
+
|
|
340
|
+
class StaticDriver(BaseModel):
    """Static driver fields: name and version."""

    name: str
    version: str
|
|
343
|
+
|
|
344
|
+
|
|
345
|
+
class StaticBoard(BaseModel):
    """Static board identification fields.

    The model number is read from the ``model_number`` key (alias) and stored
    as ``amdsmi_model_number``; population by field name is allowed too.
    """

    model_config = ConfigDict(populate_by_name=True)

    # pydantic reserves the ``model_`` prefix for its own namespace, so the
    # attribute carries a different name and is exposed through an alias.
    amdsmi_model_number: str = Field(alias="model_number")
    product_serial: str
    fru_id: str
    product_name: str
    manufacturer_name: str
|
|
357
|
+
|
|
358
|
+
|
|
359
|
+
class StaticRas(BaseModel):
    """Static RAS fields.

    The schema fields take ``EccState`` values; ``ecc_block_state`` is either
    a mapping from a string key to ``EccState`` or a plain string.
    """

    eeprom_version: str
    parity_schema: EccState
    single_bit_schema: EccState
    double_bit_schema: EccState
    poison_schema: EccState
    ecc_block_state: Union[dict[str, EccState], str]
|
|
366
|
+
|
|
367
|
+
|
|
368
|
+
class StaticPartition(BaseModel):
    """Static partition fields.

    ``compute_partition`` can be supplied under either of two input keys.
    """

    # The name for compute_partition has changed; both spellings are accepted
    # for now.
    compute_partition: str = Field(
        validation_alias=AliasChoices(
            "compute_partition",
            "accelerator_partition",
        )
    )
    memory_partition: str
    partition_id: int
|
|
376
|
+
|
|
377
|
+
|
|
378
|
+
class StaticPolicy(BaseModel):
    """A policy entry: numeric id plus description."""

    policy_id: int
    policy_description: str
|
|
381
|
+
|
|
382
|
+
|
|
383
|
+
class StaticSocPstate(BaseModel):
    """SoC p-state section: supported count, current id and policy list."""

    num_supported: int
    current_id: int
    policies: list[StaticPolicy]
|
|
387
|
+
|
|
388
|
+
|
|
389
|
+
class StaticXgmiPlpd(BaseModel):
    """XGMI PLPD section: supported count, current id and ``plpds`` list."""

    num_supported: int
    current_id: int
    plpds: list[StaticPolicy]
|
|
393
|
+
|
|
394
|
+
|
|
395
|
+
class StaticNuma(BaseModel):
    """NUMA section: node number and affinity."""

    node: int
    affinity: Union[int, str]  # can be N/A
|
|
398
|
+
|
|
399
|
+
|
|
400
|
+
class StaticVram(AmdSmiBaseModel):
    """Static VRAM fields.

    ``vendor``, ``size`` and ``bit_width`` are required but nullable;
    ``max_bandwidth`` defaults to ``None``. A raw ``"N/A"`` in any of those
    four becomes ``None`` before validation.
    """

    type: str
    vendor: Optional[str]
    size: Optional[ValueUnit]
    bit_width: Optional[ValueUnit]
    max_bandwidth: Optional[ValueUnit] = None

    na_validator = field_validator(
        "vendor",
        "size",
        "bit_width",
        "max_bandwidth",
        mode="before",
    )(na_to_none)
|
|
409
|
+
|
|
410
|
+
|
|
411
|
+
class StaticCacheInfoItem(AmdSmiBaseModel):
    """One cache description entry.

    Only ``cache_size`` is nullable; a raw ``"N/A"`` there becomes ``None``.
    """

    cache: ValueUnit
    cache_properties: list[str]
    cache_size: Optional[ValueUnit]
    cache_level: ValueUnit
    max_num_cu_shared: ValueUnit
    num_cache_instance: ValueUnit

    na_validator = field_validator(
        "cache_size",
        mode="before",
    )(na_to_none)
|
|
419
|
+
|
|
420
|
+
|
|
421
|
+
class StaticFrequencyLevels(BaseModel):
    """Frequency level strings read from the ``"Level N"`` input keys.

    Level 0 is required; levels 1 and 2 default to ``None``. Fields may also
    be populated by attribute name.
    """

    model_config = ConfigDict(populate_by_name=True)

    Level_0: str = Field(..., alias="Level 0")
    Level_1: Optional[str] = Field(default=None, alias="Level 1")
    Level_2: Optional[str] = Field(default=None, alias="Level 2")
|
|
429
|
+
|
|
430
|
+
|
|
431
|
+
class StaticClockData(BaseModel):
    """Static clock entry: its frequency levels and the current level.

    ``current_level`` is read from the ``"current level"`` key (or by field
    name); it is required but nullable, and a raw ``"N/A"`` becomes ``None``.
    """

    model_config = ConfigDict(populate_by_name=True)

    frequency_levels: StaticFrequencyLevels

    current_level: Optional[int] = Field(..., alias="current level")

    na_validator = field_validator(
        "current_level",
        mode="before",
    )(na_to_none)
|
|
439
|
+
|
|
440
|
+
|
|
441
|
+
class AmdSmiStatic(BaseModel):
    """Aggregate of all static data sections for one GPU index.

    ``soc_pstate``, ``xgmi_plpd``, ``vbios`` and ``limit`` turn a raw
    ``"N/A"`` into ``None``; ``clock`` is normalised by ``na_to_none_dict``.
    """

    gpu: int
    asic: StaticAsic
    bus: StaticBus
    vbios: Optional[StaticVbios]
    limit: Optional[StaticLimit]
    driver: StaticDriver
    board: StaticBoard
    ras: StaticRas
    soc_pstate: Optional[StaticSocPstate]
    xgmi_plpd: Optional[StaticXgmiPlpd]
    process_isolation: str
    numa: StaticNuma
    vram: StaticVram
    cache_info: list[StaticCacheInfoItem]
    partition: Optional[StaticPartition] = None  # This has been removed in Amd-smi 26.0.0+d30a0afe+
    clock: Optional[dict[str, Optional[StaticClockData]]] = None

    na_validator_dict = field_validator(
        "clock",
        mode="before",
    )(na_to_none_dict)
    na_validator = field_validator(
        "soc_pstate",
        "xgmi_plpd",
        "vbios",
        "limit",
        mode="before",
    )(na_to_none)
|
|
464
|
+
|
|
465
|
+
|
|
466
|
+
# PAGES
|
|
467
|
+
class PageData(BaseModel):
    """One page record: address, size, status and an optional integer value."""

    page_address: Union[int, str]
    page_size: Union[int, str]
    status: str
    value: Optional[int]
|
|
472
|
+
|
|
473
|
+
|
|
474
|
+
class BadPages(BaseModel):
    """Retired page records associated with one GPU index."""

    gpu: int
    retired: list[PageData]
|
|
477
|
+
|
|
478
|
+
|
|
479
|
+
# Metric Data
|
|
480
|
+
class MetricUsage(BaseModel):
    """Usage metrics.

    Scalar and mapping fields turn a raw ``"N/A"`` into ``None``; for the two
    list fields the replacement is applied per element.
    """

    gfx_activity: Optional[ValueUnit]
    umc_activity: Optional[ValueUnit]
    mm_activity: Optional[ValueUnit]
    vcn_activity: list[Optional[Union[ValueUnit, str]]]
    jpeg_activity: list[Optional[Union[ValueUnit, str]]]
    gfx_busy_inst: Optional[dict[str, list[Optional[Union[ValueUnit, str]]]]]
    jpeg_busy: Optional[dict[str, list[Optional[Union[ValueUnit, str]]]]]
    vcn_busy: Optional[dict[str, list[Optional[Union[ValueUnit, str]]]]]

    na_validator_list = field_validator(
        "vcn_activity",
        "jpeg_activity",
        mode="before",
    )(na_to_none_list)
    na_validator = field_validator(
        "gfx_activity",
        "umc_activity",
        "mm_activity",
        "gfx_busy_inst",
        "jpeg_busy",
        "vcn_busy",
        mode="before",
    )(na_to_none)
|
|
501
|
+
|
|
502
|
+
|
|
503
|
+
class MetricPower(BaseModel):
    """Power metrics; every field is required but nullable.

    A raw ``"N/A"`` in any field becomes ``None`` before validation.
    """

    socket_power: Optional[ValueUnit]
    gfx_voltage: Optional[ValueUnit]
    soc_voltage: Optional[ValueUnit]
    mem_voltage: Optional[ValueUnit]
    throttle_status: Optional[str]
    power_management: Optional[str]

    na_validator = field_validator(
        "socket_power",
        "gfx_voltage",
        "soc_voltage",
        "mem_voltage",
        "throttle_status",
        "power_management",
        mode="before",
    )(na_to_none)
|
|
519
|
+
|
|
520
|
+
|
|
521
|
+
class MetricClockData(BaseModel):
    """Clock metrics for one clock; every field is required but nullable.

    A raw ``"N/A"`` in any field becomes ``None`` before validation.
    """

    clk: Optional[ValueUnit]
    min_clk: Optional[ValueUnit]
    max_clk: Optional[ValueUnit]
    clk_locked: Optional[Union[int, str, dict]]
    deep_sleep: Optional[Union[int, str, dict]]

    na_validator = field_validator(
        "clk",
        "min_clk",
        "max_clk",
        "clk_locked",
        "deep_sleep",
        mode="before",
    )(na_to_none)
|
|
530
|
+
|
|
531
|
+
|
|
532
|
+
class MetricTemperature(BaseModel):
    """Temperature metrics (edge, hotspot, mem); required but nullable.

    A raw ``"N/A"`` in any field becomes ``None`` before validation.
    """

    edge: Optional[ValueUnit]
    hotspot: Optional[ValueUnit]
    mem: Optional[ValueUnit]

    na_validator = field_validator(
        "edge",
        "hotspot",
        "mem",
        mode="before",
    )(na_to_none)
|
|
537
|
+
|
|
538
|
+
|
|
539
|
+
class MetricPcie(BaseModel):
    """PCIe metrics; every field is required but nullable.

    ``speed`` and ``bandwidth`` carry a unit, the remaining fields are plain
    integers. A raw ``"N/A"`` in any field becomes ``None``.
    """

    width: Optional[int]
    speed: Optional[ValueUnit]
    bandwidth: Optional[ValueUnit]
    replay_count: Optional[int]
    l0_to_recovery_count: Optional[int]
    replay_roll_over_count: Optional[int]
    nak_sent_count: Optional[int]
    nak_received_count: Optional[int]
    current_bandwidth_sent: Optional[int]
    current_bandwidth_received: Optional[int]
    max_packet_size: Optional[int]
    lc_perf_other_end_recovery: Optional[int]

    na_validator = field_validator(
        "width",
        "speed",
        "bandwidth",
        "replay_count",
        "l0_to_recovery_count",
        "replay_roll_over_count",
        "nak_sent_count",
        "nak_received_count",
        "current_bandwidth_sent",
        "current_bandwidth_received",
        "max_packet_size",
        "lc_perf_other_end_recovery",
        mode="before",
    )(na_to_none)
|
|
567
|
+
|
|
568
|
+
|
|
569
|
+
class MetricEccTotals(BaseModel):
    # Aggregate ECC counters: overall totals followed by cache-specific counts.
    total_correctable_count: Optional[int]
    total_uncorrectable_count: Optional[int]
    total_deferred_count: Optional[int]
    cache_correctable_count: Optional[int]
    cache_uncorrectable_count: Optional[int]

    # Every counter passes through na_to_none before type validation.
    na_validator = field_validator(
        "total_correctable_count", "total_uncorrectable_count", "total_deferred_count",
        "cache_correctable_count", "cache_uncorrectable_count",
        mode="before",
    )(na_to_none)
|
|
583
|
+
|
|
584
|
+
|
|
585
|
+
class MetricErrorCounts(BaseModel):
    # Error counters kept as strings (unlike EccData, which types them as int).
    correctable_count: Optional[str]
    uncorrectable_count: Optional[str]
    deferred_count: Optional[str]

    # Pre-validation hook shared by all three counters.
    na_validator = field_validator(
        "correctable_count",
        "uncorrectable_count",
        "deferred_count",
        mode="before",
    )(na_to_none)
|
|
592
|
+
|
|
593
|
+
|
|
594
|
+
class MetricFan(BaseModel):
    # Fan readings; each one is an optional value/unit pair.
    speed: Optional[ValueUnit]
    max: Optional[ValueUnit]
    rpm: Optional[ValueUnit]
    usage: Optional[ValueUnit]

    # na_to_none runs on each raw field value ahead of type validation.
    na_validator = field_validator(
        "speed",
        "max",
        "rpm",
        "usage",
        mode="before",
    )(na_to_none)
|
|
600
|
+
|
|
601
|
+
|
|
602
|
+
class MetricVoltageCurve(BaseModel):
    # Three voltage-curve points, each with a frequency and a voltage reading.
    point_0_frequency: Optional[ValueUnit]
    point_0_voltage: Optional[ValueUnit]
    point_1_frequency: Optional[ValueUnit]
    point_1_voltage: Optional[ValueUnit]
    point_2_frequency: Optional[ValueUnit]
    point_2_voltage: Optional[ValueUnit]

    # One frequency/voltage pair per line; na_to_none pre-processes all of them.
    na_validator = field_validator(
        "point_0_frequency", "point_0_voltage",
        "point_1_frequency", "point_1_voltage",
        "point_2_frequency", "point_2_voltage",
        mode="before",
    )(na_to_none)
|
|
619
|
+
|
|
620
|
+
|
|
621
|
+
class MetricEnergy(BaseModel):
    # Single optional reading: total energy consumption as a value/unit pair.
    total_energy_consumption: Optional[ValueUnit]

    # na_to_none pre-processes the raw value before type validation.
    na_validator = field_validator(
        "total_energy_consumption",
        mode="before",
    )(na_to_none)
|
|
624
|
+
|
|
625
|
+
|
|
626
|
+
class MetricMemUsage(BaseModel):
    # Memory usage, reported as total/used/free triples for three pools.

    # VRAM
    total_vram: Optional[ValueUnit]
    used_vram: Optional[ValueUnit]
    free_vram: Optional[ValueUnit]

    # Visible VRAM
    total_visible_vram: Optional[ValueUnit]
    used_visible_vram: Optional[ValueUnit]
    free_visible_vram: Optional[ValueUnit]

    # GTT
    total_gtt: Optional[ValueUnit]
    used_gtt: Optional[ValueUnit]
    free_gtt: Optional[ValueUnit]

    # Each pool's triple shares the same pre-validation hook.
    na_validator = field_validator(
        "total_vram", "used_vram", "free_vram",
        "total_visible_vram", "used_visible_vram", "free_visible_vram",
        "total_gtt", "used_gtt", "free_gtt",
        mode="before",
    )(na_to_none)
|
|
648
|
+
|
|
649
|
+
|
|
650
|
+
class MetricThrottleVu(BaseModel):
    # Current layout: an optional list whose entries are a ValueUnit, a string,
    # or None.
    xcp_0: Optional[list[Optional[Union[ValueUnit, str]]]] = None

    # Deprecated layout, retained with defaults so older payloads still load.
    value: Optional[dict[str, list[Union[int, str]]]] = Field(
        default=None,
        deprecated=True,
    )
    unit: str = Field(default="", deprecated=True)
|
|
655
|
+
|
|
656
|
+
|
|
657
|
+
# Annotation shared by every MetricThrottle field: a MetricThrottleVu, a ValueUnit,
# or None.
_ThrottleReading = Optional[Union[MetricThrottleVu, ValueUnit]]


class MetricThrottle(AmdSmiBaseModel):
    # Throttle / violation counters. Every field is optional and defaults to None.
    accumulation_counter: _ThrottleReading = None

    # gfx clock below host limit
    gfx_clk_below_host_limit_accumulated: _ThrottleReading = None
    gfx_clk_below_host_limit_power_accumulated: _ThrottleReading = None
    gfx_clk_below_host_limit_power_violation_activity: _ThrottleReading = None
    gfx_clk_below_host_limit_power_violation_status: _ThrottleReading = None
    gfx_clk_below_host_limit_violation_activity: _ThrottleReading = None
    gfx_clk_below_host_limit_violation_accumulated: _ThrottleReading = None
    gfx_clk_below_host_limit_violation_status: _ThrottleReading = None
    gfx_clk_below_host_limit_thermal_violation_accumulated: _ThrottleReading = None
    gfx_clk_below_host_limit_thermal_violation_activity: _ThrottleReading = None
    gfx_clk_below_host_limit_thermal_violation_status: _ThrottleReading = None
    gfx_clk_below_host_limit_thermal_accumulated: _ThrottleReading = None

    # hbm thermal, low utilization, ppt, prochot, socket thermal, vr thermal
    hbm_thermal_accumulated: _ThrottleReading = None
    hbm_thermal_violation_activity: _ThrottleReading = None
    hbm_thermal_violation_status: _ThrottleReading = None
    low_utilization_violation_accumulated: _ThrottleReading = None
    low_utilization_violation_activity: _ThrottleReading = None
    low_utilization_violation_status: _ThrottleReading = None
    ppt_accumulated: _ThrottleReading = None
    ppt_violation_activity: _ThrottleReading = None
    ppt_violation_status: _ThrottleReading = None
    prochot_accumulated: _ThrottleReading = None
    prochot_violation_activity: _ThrottleReading = None
    prochot_violation_status: _ThrottleReading = None
    socket_thermal_accumulated: _ThrottleReading = None
    socket_thermal_violation_activity: _ThrottleReading = None
    socket_thermal_violation_status: _ThrottleReading = None
    vr_thermal_accumulated: _ThrottleReading = None
    vr_thermal_violation_activity: _ThrottleReading = None
    vr_thermal_violation_status: _ThrottleReading = None

    # totals
    total_gfx_clk_below_host_limit_accumulated: _ThrottleReading = None
    low_utilization_accumulated: _ThrottleReading = None
    total_gfx_clk_below_host_limit_violation_status: _ThrottleReading = None
    total_gfx_clk_below_host_limit_violation_activity: _ThrottleReading = None

    # na_to_none is applied to the raw input of every field named below, in the
    # same order as the declarations above, before type validation.
    na_validator = field_validator(
        "accumulation_counter",
        "gfx_clk_below_host_limit_accumulated",
        "gfx_clk_below_host_limit_power_accumulated",
        "gfx_clk_below_host_limit_power_violation_activity",
        "gfx_clk_below_host_limit_power_violation_status",
        "gfx_clk_below_host_limit_violation_activity",
        "gfx_clk_below_host_limit_violation_accumulated",
        "gfx_clk_below_host_limit_violation_status",
        "gfx_clk_below_host_limit_thermal_violation_accumulated",
        "gfx_clk_below_host_limit_thermal_violation_activity",
        "gfx_clk_below_host_limit_thermal_violation_status",
        "gfx_clk_below_host_limit_thermal_accumulated",
        "hbm_thermal_accumulated",
        "hbm_thermal_violation_activity",
        "hbm_thermal_violation_status",
        "low_utilization_violation_accumulated",
        "low_utilization_violation_activity",
        "low_utilization_violation_status",
        "ppt_accumulated",
        "ppt_violation_activity",
        "ppt_violation_status",
        "prochot_accumulated",
        "prochot_violation_activity",
        "prochot_violation_status",
        "socket_thermal_accumulated",
        "socket_thermal_violation_activity",
        "socket_thermal_violation_status",
        "vr_thermal_accumulated",
        "vr_thermal_violation_activity",
        "vr_thermal_violation_status",
        "total_gfx_clk_below_host_limit_accumulated",
        "low_utilization_accumulated",
        "total_gfx_clk_below_host_limit_violation_status",
        "total_gfx_clk_below_host_limit_violation_activity",
        mode="before",
    )(na_to_none)
|
|
751
|
+
|
|
752
|
+
|
|
753
|
+
class EccData(BaseModel):
    """ECC counts collected per ecc block"""

    # Counters default to 0 when the key is absent from the input.
    correctable_count: Optional[int] = 0
    uncorrectable_count: Optional[int] = 0
    deferred_count: Optional[int] = 0

    # na_to_none pre-processes each supplied counter before type validation.
    na_validator = field_validator(
        "correctable_count",
        "uncorrectable_count",
        "deferred_count",
        mode="before",
    )(na_to_none)
|
|
763
|
+
|
|
764
|
+
|
|
765
|
+
class AmdSmiMetric(BaseModel):
    # Metric record for a single GPU, identified by its integer ``gpu`` index.
    gpu: int
    usage: MetricUsage
    power: MetricPower
    clock: dict[str, MetricClockData]
    temperature: MetricTemperature
    pcie: MetricPcie
    ecc: MetricEccTotals
    # A string input is replaced by {} in validate_ecc_blocks below.
    ecc_blocks: Union[dict[str, EccData], str]
    fan: MetricFan
    voltage_curve: Optional[MetricVoltageCurve]
    perf_level: Optional[Union[str, dict]]
    xgmi_err: Optional[Union[str, dict]]
    # "N/A" is turned into None in validate_energy below.
    energy: Optional[MetricEnergy]
    mem_usage: MetricMemUsage
    throttle: MetricThrottle

    # Only the two free-form fields go through na_to_none here.
    na_validator = field_validator(
        "xgmi_err",
        "perf_level",
        mode="before",
    )(na_to_none)

    @field_validator("ecc_blocks", mode="before")
    @classmethod
    def validate_ecc_blocks(cls, value: Union[dict[str, EccData], str]) -> dict[str, EccData]:
        """Swap a string ``ecc_blocks`` payload for an empty mapping.

        Any string is assumed to be the "N/A" placeholder. Every other value is
        handed back untouched for regular validation.
        """
        return {} if isinstance(value, str) else value

    @field_validator("energy", mode="before")
    @classmethod
    def validate_energy(cls, value: Optional[Any]) -> Optional[MetricEnergy]:
        """Map an "N/A" or missing ``energy`` payload to None; pass anything else through."""
        return None if (value == "N/A" or value is None) else value
|
|
800
|
+
|
|
801
|
+
|
|
802
|
+
### LINK DATA ###
|
|
803
|
+
|
|
804
|
+
|
|
805
|
+
class LinkStatusTable(Enum):
    # Single-letter link status codes.
    UP = "U"
    DOWN = "D"
    DISABLED = "X"
|
|
809
|
+
|
|
810
|
+
|
|
811
|
+
class BiDirectionalTable(Enum):
    # Values accepted for the bi-directional column of a topology link.
    SELF = "SELF"
    TRUE = "T"
|
|
814
|
+
|
|
815
|
+
|
|
816
|
+
class DmaTable(Enum):
    # Values accepted for the DMA column of a topology link.
    SELF = "SELF"
    TRUE = "T"
|
|
819
|
+
|
|
820
|
+
|
|
821
|
+
class AtomicsTable(Enum):
    # Values accepted for the atomics column of a topology link.
    SELF = "SELF"
    TRUE = "64,32"  # both widths reported together
    THIRTY_TWO = "32"
    SIXTY_FOUR = "64"
|
|
826
|
+
|
|
827
|
+
|
|
828
|
+
class LinkTypes(Enum):
    # Link type names used by TopoLink.link_type.
    XGMI = "XGMI"
    PCIE = "PCIE"
    SELF = "SELF"
|
|
832
|
+
|
|
833
|
+
|
|
834
|
+
class AccessTable(Enum):
    # Access state names used by TopoLink.link_status.
    ENABLED = "ENABLED"
    DISABLED = "DISABLED"
|
|
837
|
+
|
|
838
|
+
|
|
839
|
+
# XGMI
|
|
840
|
+
class XgmiLink(BaseModel):
    # One XGMI link entry: the GPU index and BDF it refers to, plus optional
    # read/write value/unit readings.
    gpu: int
    bdf: str
    read: Optional[ValueUnit]
    write: Optional[ValueUnit]

    # na_to_none pre-processes the two readings before type validation.
    na_validator = field_validator(
        "read",
        "write",
        mode="before",
    )(na_to_none)
|
|
846
|
+
|
|
847
|
+
|
|
848
|
+
class XgmiLinkMetrics(BaseModel):
    # Link-level XGMI metrics together with the list of individual links.
    bit_rate: Optional[ValueUnit]
    max_bandwidth: Optional[ValueUnit]
    link_type: str
    links: list[XgmiLink]

    # Only the two value/unit readings go through na_to_none.
    na_validator = field_validator(
        "max_bandwidth",
        "bit_rate",
        mode="before",
    )(na_to_none)
|
|
854
|
+
|
|
855
|
+
|
|
856
|
+
class XgmiMetrics(BaseModel):
    # XGMI metrics for one GPU, identified by index and BDF string.
    gpu: int
    bdf: str
    link_metrics: XgmiLinkMetrics
|
|
860
|
+
|
|
861
|
+
|
|
862
|
+
class XgmiLinks(BaseModel):
    # XGMI link status list for one GPU, identified by index and BDF string.
    gpu: int
    bdf: str
    link_status: list[LinkStatusTable]
|
|
866
|
+
|
|
867
|
+
|
|
868
|
+
class CoherentTable(Enum):
    # NOTE: the member names are spelled "COHERANT" (sic). They are left as-is
    # because renaming them would change the names other code refers to.
    COHERANT = "C"
    NON_COHERANT = "NC"
    SELF = "SELF"
|
|
872
|
+
|
|
873
|
+
|
|
874
|
+
# TOPO
|
|
875
|
+
|
|
876
|
+
|
|
877
|
+
class TopoLink(BaseModel):
    # One topology link from a GPU to the peer identified by ``gpu`` / ``bdf``.
    gpu: int
    bdf: str
    weight: int
    link_status: AccessTable
    link_type: LinkTypes
    num_hops: int
    # Raw bandwidth string. The computed fields below expect "<from>-<to>".
    bandwidth: str
    # The below fields are sometimes missing, so we use Optional
    coherent: Optional[CoherentTable] = None
    atomics: Optional[AtomicsTable] = None
    dma: Optional[DmaTable] = None
    bi_dir: Optional[BiDirectionalTable] = None

    def _bandwidth_part(self, index: int) -> Optional[int]:
        """Return one integer half of ``bandwidth``, or None if it cannot be parsed.

        Args:
            index: 0 for the part before the dash, 1 for the part after it.

        Returns:
            The selected part as an int, or None when ``bandwidth`` does not
            split into exactly two parts or the selected part is not an integer.
        """
        parts = self.bandwidth.split("-")
        if len(parts) != 2:
            # Not in the expected "<from>-<to>" format.
            return None
        try:
            return int(parts[index])
        except ValueError:
            # Two parts but non-numeric (e.g. "N/A-N/A"): report "unknown"
            # rather than raising while the model is being read or serialized.
            return None

    # The docstrings of the two computed fields are kept verbatim: pydantic uses
    # a computed field's docstring as its default schema description.
    @computed_field
    def bandwidth_from(self) -> Optional[int]:
        """Get the bandwidth from the link."""
        return self._bandwidth_part(0)

    @computed_field
    def bandwidth_to(self) -> Optional[int]:
        """Get the bandwidth to the link."""
        return self._bandwidth_part(1)
|
|
910
|
+
|
|
911
|
+
|
|
912
|
+
class Topo(BaseModel):
    # Topology entry for one GPU: its index, BDF string and outgoing links.
    gpu: int
    bdf: str
    links: list[TopoLink]
|
|
916
|
+
|
|
917
|
+
|
|
918
|
+
class AmdSmiTstData(BaseModel):
    """Summary of amdsmitst results, with list and count of passing/skipped/failed tests"""

    # Test names per outcome; each instance gets its own fresh list.
    passed_tests: list[str] = Field(default_factory=list)
    skipped_tests: list[str] = Field(default_factory=list)
    failed_tests: list[str] = Field(default_factory=list)

    # Counts per outcome, all defaulting to zero.
    passed_test_count: int = 0
    skipped_test_count: int = 0
    failed_test_count: int = 0
|
|
927
|
+
|
|
928
|
+
|
|
929
|
+
class AmdSmiDataModel(DataModel):
|
|
930
|
+
"""Data model for amd-smi data.
|
|
931
|
+
|
|
932
|
+
Optionals are used to allow for the data to be missing,
|
|
933
|
+
This makes the data class more flexible for the analyzer
|
|
934
|
+
which consumes only the required data. If any more data is
|
|
935
|
+
required for the analyzer then they should not be set to
|
|
936
|
+
default.
|
|
937
|
+
"""
|
|
938
|
+
|
|
939
|
+
model_config = ConfigDict(
|
|
940
|
+
str_min_length=1,
|
|
941
|
+
str_strip_whitespace=True,
|
|
942
|
+
populate_by_name=True,
|
|
943
|
+
)
|
|
944
|
+
|
|
945
|
+
version: Optional[AmdSmiVersion] = None
|
|
946
|
+
gpu_list: Optional[list[AmdSmiListItem]] = Field(default_factory=list)
|
|
947
|
+
partition: Optional[Partition] = None
|
|
948
|
+
process: Optional[list[Processes]] = Field(default_factory=list)
|
|
949
|
+
topology: Optional[list[Topo]] = Field(default_factory=list)
|
|
950
|
+
firmware: Optional[list[Fw]] = Field(default_factory=list)
|
|
951
|
+
bad_pages: Optional[list[BadPages]] = Field(default_factory=list)
|
|
952
|
+
static: Optional[list[AmdSmiStatic]] = Field(default_factory=list)
|
|
953
|
+
metric: Optional[list[AmdSmiMetric]] = Field(default_factory=list)
|
|
954
|
+
xgmi_metric: Optional[list[XgmiMetrics]] = Field(default_factory=list)
|
|
955
|
+
xgmi_link: Optional[list[XgmiLinks]] = Field(default_factory=list)
|
|
956
|
+
cper_data: Optional[list[FileModel]] = Field(default_factory=list)
|
|
957
|
+
amdsmitst_data: AmdSmiTstData = Field(default_factory=AmdSmiTstData)
|
|
958
|
+
|
|
959
|
+
def get_list(self, gpu: int) -> Optional[AmdSmiListItem]:
    """Return the first ``gpu_list`` entry whose ``gpu`` equals *gpu*.

    Returns None when ``gpu_list`` is None or holds no matching entry.
    """
    entries = self.gpu_list
    if entries is None:
        return None
    return next((entry for entry in entries if entry.gpu == gpu), None)
|
|
967
|
+
|
|
968
|
+
def get_process(self, gpu: int) -> Optional[Processes]:
    """Return the first ``process`` entry whose ``gpu`` equals *gpu*.

    Returns None when ``process`` is None or holds no matching entry.
    """
    entries = self.process
    if entries is None:
        return None
    return next((entry for entry in entries if entry.gpu == gpu), None)
|
|
976
|
+
|
|
977
|
+
def get_firmware(self, gpu: int) -> Optional[Fw]:
    """Return the first ``firmware`` entry whose ``gpu`` equals *gpu*.

    Returns None when ``firmware`` is None or holds no matching entry.
    """
    entries = self.firmware
    if entries is None:
        return None
    return next((entry for entry in entries if entry.gpu == gpu), None)
|
|
985
|
+
|
|
986
|
+
def get_static(self, gpu: int) -> Optional[AmdSmiStatic]:
    """Return the first ``static`` entry whose ``gpu`` equals *gpu*.

    Returns None when ``static`` is None or holds no matching entry.
    """
    entries = self.static
    if entries is None:
        return None
    return next((entry for entry in entries if entry.gpu == gpu), None)
|
|
994
|
+
|
|
995
|
+
def get_bad_pages(self, gpu: int) -> Optional[BadPages]:
    """Return the first ``bad_pages`` entry whose ``gpu`` equals *gpu*.

    Returns None when ``bad_pages`` is None or holds no matching entry.
    """
    entries = self.bad_pages
    if entries is None:
        return None
    return next((entry for entry in entries if entry.gpu == gpu), None)
|