amd-node-scraper 0.0.1__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (197)
  1. amd_node_scraper-0.0.1.dist-info/LICENSE +21 -0
  2. amd_node_scraper-0.0.1.dist-info/METADATA +424 -0
  3. amd_node_scraper-0.0.1.dist-info/RECORD +197 -0
  4. amd_node_scraper-0.0.1.dist-info/WHEEL +5 -0
  5. amd_node_scraper-0.0.1.dist-info/entry_points.txt +2 -0
  6. amd_node_scraper-0.0.1.dist-info/top_level.txt +1 -0
  7. nodescraper/__init__.py +32 -0
  8. nodescraper/base/__init__.py +34 -0
  9. nodescraper/base/inbandcollectortask.py +118 -0
  10. nodescraper/base/inbanddataplugin.py +39 -0
  11. nodescraper/base/regexanalyzer.py +120 -0
  12. nodescraper/cli/__init__.py +29 -0
  13. nodescraper/cli/cli.py +511 -0
  14. nodescraper/cli/constants.py +27 -0
  15. nodescraper/cli/dynamicparserbuilder.py +171 -0
  16. nodescraper/cli/helper.py +517 -0
  17. nodescraper/cli/inputargtypes.py +129 -0
  18. nodescraper/configbuilder.py +123 -0
  19. nodescraper/configregistry.py +66 -0
  20. nodescraper/configs/node_status.json +19 -0
  21. nodescraper/connection/__init__.py +25 -0
  22. nodescraper/connection/inband/__init__.py +46 -0
  23. nodescraper/connection/inband/inband.py +171 -0
  24. nodescraper/connection/inband/inbandlocal.py +93 -0
  25. nodescraper/connection/inband/inbandmanager.py +151 -0
  26. nodescraper/connection/inband/inbandremote.py +173 -0
  27. nodescraper/connection/inband/sshparams.py +43 -0
  28. nodescraper/constants.py +26 -0
  29. nodescraper/enums/__init__.py +40 -0
  30. nodescraper/enums/eventcategory.py +89 -0
  31. nodescraper/enums/eventpriority.py +42 -0
  32. nodescraper/enums/executionstatus.py +44 -0
  33. nodescraper/enums/osfamily.py +34 -0
  34. nodescraper/enums/systeminteraction.py +41 -0
  35. nodescraper/enums/systemlocation.py +33 -0
  36. nodescraper/generictypes.py +36 -0
  37. nodescraper/interfaces/__init__.py +44 -0
  38. nodescraper/interfaces/connectionmanager.py +143 -0
  39. nodescraper/interfaces/dataanalyzertask.py +138 -0
  40. nodescraper/interfaces/datacollectortask.py +185 -0
  41. nodescraper/interfaces/dataplugin.py +356 -0
  42. nodescraper/interfaces/plugin.py +127 -0
  43. nodescraper/interfaces/resultcollator.py +56 -0
  44. nodescraper/interfaces/task.py +164 -0
  45. nodescraper/interfaces/taskresulthook.py +39 -0
  46. nodescraper/models/__init__.py +48 -0
  47. nodescraper/models/analyzerargs.py +93 -0
  48. nodescraper/models/collectorargs.py +30 -0
  49. nodescraper/models/connectionconfig.py +34 -0
  50. nodescraper/models/datamodel.py +171 -0
  51. nodescraper/models/datapluginresult.py +39 -0
  52. nodescraper/models/event.py +158 -0
  53. nodescraper/models/pluginconfig.py +38 -0
  54. nodescraper/models/pluginresult.py +39 -0
  55. nodescraper/models/systeminfo.py +44 -0
  56. nodescraper/models/taskresult.py +185 -0
  57. nodescraper/models/timerangeargs.py +38 -0
  58. nodescraper/pluginexecutor.py +274 -0
  59. nodescraper/pluginregistry.py +152 -0
  60. nodescraper/plugins/__init__.py +25 -0
  61. nodescraper/plugins/inband/__init__.py +25 -0
  62. nodescraper/plugins/inband/amdsmi/__init__.py +28 -0
  63. nodescraper/plugins/inband/amdsmi/amdsmi_analyzer.py +821 -0
  64. nodescraper/plugins/inband/amdsmi/amdsmi_collector.py +1313 -0
  65. nodescraper/plugins/inband/amdsmi/amdsmi_plugin.py +43 -0
  66. nodescraper/plugins/inband/amdsmi/amdsmidata.py +1002 -0
  67. nodescraper/plugins/inband/amdsmi/analyzer_args.py +50 -0
  68. nodescraper/plugins/inband/amdsmi/cper.py +65 -0
  69. nodescraper/plugins/inband/bios/__init__.py +29 -0
  70. nodescraper/plugins/inband/bios/analyzer_args.py +64 -0
  71. nodescraper/plugins/inband/bios/bios_analyzer.py +93 -0
  72. nodescraper/plugins/inband/bios/bios_collector.py +93 -0
  73. nodescraper/plugins/inband/bios/bios_plugin.py +43 -0
  74. nodescraper/plugins/inband/bios/biosdata.py +30 -0
  75. nodescraper/plugins/inband/cmdline/__init__.py +25 -0
  76. nodescraper/plugins/inband/cmdline/analyzer_args.py +80 -0
  77. nodescraper/plugins/inband/cmdline/cmdline_analyzer.py +113 -0
  78. nodescraper/plugins/inband/cmdline/cmdline_collector.py +77 -0
  79. nodescraper/plugins/inband/cmdline/cmdline_plugin.py +43 -0
  80. nodescraper/plugins/inband/cmdline/cmdlinedata.py +30 -0
  81. nodescraper/plugins/inband/device_enumeration/__init__.py +29 -0
  82. nodescraper/plugins/inband/device_enumeration/analyzer_args.py +73 -0
  83. nodescraper/plugins/inband/device_enumeration/device_enumeration_analyzer.py +81 -0
  84. nodescraper/plugins/inband/device_enumeration/device_enumeration_collector.py +176 -0
  85. nodescraper/plugins/inband/device_enumeration/device_enumeration_plugin.py +45 -0
  86. nodescraper/plugins/inband/device_enumeration/deviceenumdata.py +36 -0
  87. nodescraper/plugins/inband/dimm/__init__.py +25 -0
  88. nodescraper/plugins/inband/dimm/collector_args.py +31 -0
  89. nodescraper/plugins/inband/dimm/dimm_collector.py +151 -0
  90. nodescraper/plugins/inband/dimm/dimm_plugin.py +40 -0
  91. nodescraper/plugins/inband/dimm/dimmdata.py +30 -0
  92. nodescraper/plugins/inband/dkms/__init__.py +25 -0
  93. nodescraper/plugins/inband/dkms/analyzer_args.py +85 -0
  94. nodescraper/plugins/inband/dkms/dkms_analyzer.py +106 -0
  95. nodescraper/plugins/inband/dkms/dkms_collector.py +76 -0
  96. nodescraper/plugins/inband/dkms/dkms_plugin.py +43 -0
  97. nodescraper/plugins/inband/dkms/dkmsdata.py +33 -0
  98. nodescraper/plugins/inband/dmesg/__init__.py +28 -0
  99. nodescraper/plugins/inband/dmesg/analyzer_args.py +33 -0
  100. nodescraper/plugins/inband/dmesg/collector_args.py +39 -0
  101. nodescraper/plugins/inband/dmesg/dmesg_analyzer.py +503 -0
  102. nodescraper/plugins/inband/dmesg/dmesg_collector.py +164 -0
  103. nodescraper/plugins/inband/dmesg/dmesg_plugin.py +44 -0
  104. nodescraper/plugins/inband/dmesg/dmesgdata.py +116 -0
  105. nodescraper/plugins/inband/fabrics/__init__.py +28 -0
  106. nodescraper/plugins/inband/fabrics/fabrics_collector.py +726 -0
  107. nodescraper/plugins/inband/fabrics/fabrics_plugin.py +37 -0
  108. nodescraper/plugins/inband/fabrics/fabricsdata.py +140 -0
  109. nodescraper/plugins/inband/journal/__init__.py +28 -0
  110. nodescraper/plugins/inband/journal/collector_args.py +33 -0
  111. nodescraper/plugins/inband/journal/journal_collector.py +107 -0
  112. nodescraper/plugins/inband/journal/journal_plugin.py +40 -0
  113. nodescraper/plugins/inband/journal/journaldata.py +44 -0
  114. nodescraper/plugins/inband/kernel/__init__.py +25 -0
  115. nodescraper/plugins/inband/kernel/analyzer_args.py +64 -0
  116. nodescraper/plugins/inband/kernel/kernel_analyzer.py +91 -0
  117. nodescraper/plugins/inband/kernel/kernel_collector.py +129 -0
  118. nodescraper/plugins/inband/kernel/kernel_plugin.py +43 -0
  119. nodescraper/plugins/inband/kernel/kerneldata.py +32 -0
  120. nodescraper/plugins/inband/kernel_module/__init__.py +25 -0
  121. nodescraper/plugins/inband/kernel_module/analyzer_args.py +59 -0
  122. nodescraper/plugins/inband/kernel_module/kernel_module_analyzer.py +211 -0
  123. nodescraper/plugins/inband/kernel_module/kernel_module_collector.py +264 -0
  124. nodescraper/plugins/inband/kernel_module/kernel_module_data.py +60 -0
  125. nodescraper/plugins/inband/kernel_module/kernel_module_plugin.py +43 -0
  126. nodescraper/plugins/inband/memory/__init__.py +25 -0
  127. nodescraper/plugins/inband/memory/analyzer_args.py +45 -0
  128. nodescraper/plugins/inband/memory/memory_analyzer.py +98 -0
  129. nodescraper/plugins/inband/memory/memory_collector.py +330 -0
  130. nodescraper/plugins/inband/memory/memory_plugin.py +43 -0
  131. nodescraper/plugins/inband/memory/memorydata.py +90 -0
  132. nodescraper/plugins/inband/network/__init__.py +28 -0
  133. nodescraper/plugins/inband/network/network_collector.py +1828 -0
  134. nodescraper/plugins/inband/network/network_plugin.py +37 -0
  135. nodescraper/plugins/inband/network/networkdata.py +319 -0
  136. nodescraper/plugins/inband/nvme/__init__.py +28 -0
  137. nodescraper/plugins/inband/nvme/nvme_collector.py +167 -0
  138. nodescraper/plugins/inband/nvme/nvme_plugin.py +37 -0
  139. nodescraper/plugins/inband/nvme/nvmedata.py +45 -0
  140. nodescraper/plugins/inband/os/__init__.py +25 -0
  141. nodescraper/plugins/inband/os/analyzer_args.py +64 -0
  142. nodescraper/plugins/inband/os/os_analyzer.py +73 -0
  143. nodescraper/plugins/inband/os/os_collector.py +131 -0
  144. nodescraper/plugins/inband/os/os_plugin.py +43 -0
  145. nodescraper/plugins/inband/os/osdata.py +31 -0
  146. nodescraper/plugins/inband/package/__init__.py +25 -0
  147. nodescraper/plugins/inband/package/analyzer_args.py +48 -0
  148. nodescraper/plugins/inband/package/package_analyzer.py +253 -0
  149. nodescraper/plugins/inband/package/package_collector.py +273 -0
  150. nodescraper/plugins/inband/package/package_plugin.py +43 -0
  151. nodescraper/plugins/inband/package/packagedata.py +41 -0
  152. nodescraper/plugins/inband/pcie/__init__.py +29 -0
  153. nodescraper/plugins/inband/pcie/analyzer_args.py +63 -0
  154. nodescraper/plugins/inband/pcie/pcie_analyzer.py +1081 -0
  155. nodescraper/plugins/inband/pcie/pcie_collector.py +690 -0
  156. nodescraper/plugins/inband/pcie/pcie_data.py +2017 -0
  157. nodescraper/plugins/inband/pcie/pcie_plugin.py +43 -0
  158. nodescraper/plugins/inband/process/__init__.py +25 -0
  159. nodescraper/plugins/inband/process/analyzer_args.py +45 -0
  160. nodescraper/plugins/inband/process/collector_args.py +31 -0
  161. nodescraper/plugins/inband/process/process_analyzer.py +91 -0
  162. nodescraper/plugins/inband/process/process_collector.py +115 -0
  163. nodescraper/plugins/inband/process/process_plugin.py +46 -0
  164. nodescraper/plugins/inband/process/processdata.py +34 -0
  165. nodescraper/plugins/inband/rocm/__init__.py +25 -0
  166. nodescraper/plugins/inband/rocm/analyzer_args.py +66 -0
  167. nodescraper/plugins/inband/rocm/rocm_analyzer.py +100 -0
  168. nodescraper/plugins/inband/rocm/rocm_collector.py +205 -0
  169. nodescraper/plugins/inband/rocm/rocm_plugin.py +43 -0
  170. nodescraper/plugins/inband/rocm/rocmdata.py +62 -0
  171. nodescraper/plugins/inband/storage/__init__.py +25 -0
  172. nodescraper/plugins/inband/storage/analyzer_args.py +38 -0
  173. nodescraper/plugins/inband/storage/collector_args.py +31 -0
  174. nodescraper/plugins/inband/storage/storage_analyzer.py +152 -0
  175. nodescraper/plugins/inband/storage/storage_collector.py +110 -0
  176. nodescraper/plugins/inband/storage/storage_plugin.py +44 -0
  177. nodescraper/plugins/inband/storage/storagedata.py +70 -0
  178. nodescraper/plugins/inband/sysctl/__init__.py +29 -0
  179. nodescraper/plugins/inband/sysctl/analyzer_args.py +67 -0
  180. nodescraper/plugins/inband/sysctl/sysctl_analyzer.py +81 -0
  181. nodescraper/plugins/inband/sysctl/sysctl_collector.py +101 -0
  182. nodescraper/plugins/inband/sysctl/sysctl_plugin.py +43 -0
  183. nodescraper/plugins/inband/sysctl/sysctldata.py +42 -0
  184. nodescraper/plugins/inband/syslog/__init__.py +28 -0
  185. nodescraper/plugins/inband/syslog/syslog_collector.py +121 -0
  186. nodescraper/plugins/inband/syslog/syslog_plugin.py +37 -0
  187. nodescraper/plugins/inband/syslog/syslogdata.py +46 -0
  188. nodescraper/plugins/inband/uptime/__init__.py +25 -0
  189. nodescraper/plugins/inband/uptime/uptime_collector.py +88 -0
  190. nodescraper/plugins/inband/uptime/uptime_plugin.py +37 -0
  191. nodescraper/plugins/inband/uptime/uptimedata.py +31 -0
  192. nodescraper/resultcollators/__init__.py +25 -0
  193. nodescraper/resultcollators/tablesummary.py +159 -0
  194. nodescraper/taskresulthooks/__init__.py +28 -0
  195. nodescraper/taskresulthooks/filesystemloghook.py +88 -0
  196. nodescraper/typeutils.py +171 -0
  197. nodescraper/utils.py +412 -0
@@ -0,0 +1,1002 @@
1
+ ###############################################################################
2
+ #
3
+ # MIT License
4
+ #
5
+ # Copyright (c) 2025 Advanced Micro Devices, Inc.
6
+ #
7
+ # Permission is hereby granted, free of charge, to any person obtaining a copy
8
+ # of this software and associated documentation files (the "Software"), to deal
9
+ # in the Software without restriction, including without limitation the rights
10
+ # to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
11
+ # copies of the Software, and to permit persons to whom the Software is
12
+ # furnished to do so, subject to the following conditions:
13
+ #
14
+ # The above copyright notice and this permission notice shall be included in all
15
+ # copies or substantial portions of the Software.
16
+ #
17
+ # THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
18
+ # IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
19
+ # FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
20
+ # AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
21
+ # LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
22
+ # OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
23
+ # SOFTWARE.
24
+ #
25
+ ###############################################################################
26
+ import re
27
+ from enum import Enum
28
+ from typing import Any, Mapping, Optional, Union
29
+
30
+ from pydantic import (
31
+ AliasChoices,
32
+ BaseModel,
33
+ ConfigDict,
34
+ Field,
35
+ computed_field,
36
+ field_validator,
37
+ model_validator,
38
+ )
39
+
40
+ from nodescraper.models.datamodel import DataModel, FileModel
41
+ from nodescraper.utils import find_annotation_in_container
42
+
43
# Matches an optionally signed integer or decimal number, optionally followed by
# a unit token that starts with a letter, "%" or "/" (e.g. "123 W", "-4.5", "50%").
# Group 1 is the number text; group 2 is the unit text (None when no unit is present).
_NUM_UNIT_RE = re.compile(r"^\s*([-+]?\d+(?:\.\d+)?)(?:\s*([A-Za-z%/][A-Za-z0-9%/._-]*))?\s*$")
44
+
45
+
46
def na_to_none(values: Any) -> Optional[Any]:
    """Map an "N/A"-style placeholder string to ``None``.

    Used as a ``mode="before"`` field validator throughout this module.
    Matching ignores surrounding whitespace and letter case and covers the
    same spellings ("N/A", "NA" and the empty string) that
    ``na_to_none_dict`` and ``ValueUnit`` already treat as "no value", so a
    placeholder is recognised the same way wherever it appears.

    Args:
        values: Raw field value of any type.

    Returns:
        ``None`` when *values* is a placeholder string, otherwise *values*
        unchanged.
    """
    # Only strings can be placeholders; the isinstance guard also keeps
    # unhashable inputs (dicts, lists) away from the set membership test.
    if isinstance(values, str) and values.strip().upper() in {"N/A", "NA", ""}:
        return None
    return values
50
+
51
+
52
def na_to_none_list(values: Any) -> Any:
    """Replace "N/A"-style placeholder entries of a sequence with ``None``.

    Used as a ``mode="before"`` field validator for list-typed fields.

    Args:
        values: Raw field value. Lists and tuples are processed; anything
            else is handed back untouched so that the subsequent type
            validation reports the problem instead of this helper raising
            ``AttributeError``.

    Returns:
        A new list in which every placeholder string ("N/A", "NA" or empty,
        ignoring case and surrounding whitespace) is replaced by ``None``.
        The input sequence is never modified.
    """
    if not isinstance(values, (list, tuple)):
        return values
    return [
        None if isinstance(item, str) and item.strip().upper() in {"N/A", "NA", ""} else item
        for item in values
    ]
58
+
59
+
60
def na_to_none_dict(values: object) -> Optional[dict[str, Any]]:
    """Normalize a mapping-like field, turning placeholder values into None.

    Returns None when the input is None, a placeholder string
    ('N/A'/'NA'/'', ignoring case and surrounding whitespace) or anything
    that is not a mapping. Otherwise returns a new dict with the same keys
    in which every placeholder string value is replaced by None."""

    def _is_placeholder(candidate: object) -> bool:
        return isinstance(candidate, str) and candidate.strip().upper() in {"N/A", "NA", ""}

    if values is None or _is_placeholder(values) or not isinstance(values, Mapping):
        return None

    return {key: (None if _is_placeholder(item) else item) for key, item in values.items()}
77
+
78
+
79
class AmdSmiBaseModel(BaseModel):
    """Base model for AMD SMI data models.

    Gives every subclass the same configuration: strings are stripped and
    must be non-empty, fields may be populated by name as well as by alias,
    and unknown keys are rejected.
    """

    model_config = ConfigDict(
        str_min_length=1,
        str_strip_whitespace=True,
        populate_by_name=True,
        extra="forbid",  # Forbid extra fields not defined in the model
    )

    def __init__(self, **data):
        """Pre-process primitive inputs for ``ValueUnit``-typed fields.

        For every field whose annotation contains ``ValueUnit`` (as reported
        by ``find_annotation_in_container``):

        * a placeholder string ("N/A", "NA" or empty, ignoring case and
          surrounding whitespace) becomes ``None``. Wrapping it in a dict
          would hide it from the ``na_to_none`` field validators and make
          ``ValueUnit`` validation fail;
        * any other ``int``/``float``/``str`` is wrapped as
          ``{"value": <raw>, "unit": ""}`` for the ``ValueUnit`` validator.

        Only keys that match the field name are inspected; values supplied
        under an alias are passed through unchanged.
        """
        for field_name, field_info in type(self).model_fields.items():
            if field_name not in data:
                continue

            target_type, _ = find_annotation_in_container(field_info.annotation, ValueUnit)
            if target_type is None:
                continue

            raw = data[field_name]
            if isinstance(raw, str) and raw.strip().upper() in {"N/A", "NA", ""}:
                data[field_name] = None
            elif isinstance(raw, (int, str, float)):
                # Primitive value: convert it to a ValueUnit dict for the validator
                data[field_name] = {"value": raw, "unit": ""}

        super().__init__(**data)
109
+
110
+
111
class ValueUnit(BaseModel):
    """A model for a value with a unit.

    The ``mode="before"`` validator normalises these raw inputs:
    - dict: {"value": 123, "unit": "W"}; when no unit is given and "value"
      is a string such as "123 W", the number and unit are split out
    - number: 123 -> unit=""
    - string with number+unit: "123 W" -> {"value": 123, "unit": "W"}
    - any other string -> kept as the value with unit=""
    - "N/A" / "NA" / "" / None -> None

    NOTE(review): the ``None`` produced for placeholders is handed on to the
    regular model validation, which is expected to reject it; fields that
    may hold placeholders should map them to ``None`` before this model is
    reached (e.g. with ``na_to_none``) - confirm.
    """

    value: Union[int, float, str]
    unit: str = ""

    @model_validator(mode="before")
    @classmethod
    def _coerce(cls, v):
        """Convert the supported raw input shapes into a value/unit dict."""

        # treat N/A as None
        def na(x) -> bool:
            return x is None or (isinstance(x, str) and x.strip().upper() in {"N/A", "NA", ""})

        def split_number(text: str):
            """Return (number, unit) for "<number>[ <unit>]" text, else None."""
            m = _NUM_UNIT_RE.match(text.strip())
            if m is None:
                return None
            num, unit_text = m.groups()
            return (float(num) if "." in num else int(num)), (unit_text or "")

        if na(v):
            return None

        if isinstance(v, dict):
            val = v.get("value")
            # A missing or None unit is normalised to "" so it validates as str.
            unit = v.get("unit") or ""
            if na(val):
                return None
            if isinstance(val, str) and not unit:
                parsed = split_number(val)
                if parsed is not None:
                    val, unit = parsed
            return {"value": val, "unit": unit}

        # numbers
        if isinstance(v, (int, float)):
            return {"value": v, "unit": ""}

        if isinstance(v, str):
            parsed = split_number(v)
            if parsed is not None:
                val, unit = parsed
                return {"value": val, "unit": unit}
            return {"value": v.strip(), "unit": ""}

        return v

    # mode="before" so that a None unit reaches this function instead of being
    # rejected by the str type check first.
    @field_validator("unit", mode="before")
    @classmethod
    def _clean_unit(cls, u):
        """Turn a None unit into "" and strip whitespace from anything else."""
        return "" if u is None else str(u).strip()
166
+
167
+
168
+ # Process
169
class ProcessMemoryUsage(BaseModel):
    """Memory figures of one process; every field is required but may be None."""

    gtt_mem: Optional[ValueUnit]
    cpu_mem: Optional[ValueUnit]
    vram_mem: Optional[ValueUnit]

    @field_validator("gtt_mem", "cpu_mem", "vram_mem", mode="before")
    @classmethod
    def na_validator(cls, raw):
        """Pass the raw value through ``na_to_none`` before type validation."""
        return na_to_none(raw)
175
+
176
+
177
class ProcessUsage(BaseModel):
    """Engine usage of one process."""

    # AMDSMI reports engine usage in nanoseconds
    gfx: Optional[ValueUnit]
    enc: Optional[ValueUnit]

    @field_validator("gfx", "enc", mode="before")
    @classmethod
    def na_validator(cls, raw):
        """Pass the raw value through ``na_to_none`` before type validation."""
        return na_to_none(raw)
182
+
183
+
184
class ProcessInfo(BaseModel):
    """One process entry: name, pid, memory breakdown and engine usage."""

    name: str
    pid: int
    memory_usage: ProcessMemoryUsage
    mem_usage: Optional[ValueUnit]
    usage: ProcessUsage

    @field_validator("mem_usage", mode="before")
    @classmethod
    def na_validator(cls, raw):
        """Pass the raw value through ``na_to_none`` before type validation."""
        return na_to_none(raw)
191
+
192
+
193
class EccState(Enum):
    """String states accepted by the ECC/RAS fields of ``StaticRas``.

    Each member's value equals its name, except ``NA`` whose value is the
    literal "N/A".
    """

    ENABLED = "ENABLED"
    DISABLED = "DISABLED"
    NONE = "NONE"
    PARITY = "PARITY"
    SING_C = "SING_C"
    MULT_UC = "MULT_UC"
    POISON = "POISON"
    NA = "N/A"  # the only member whose value differs from its name
202
+
203
+
204
class ProcessListItem(BaseModel):
    """Wrapper for one element of ``Processes.process_list``."""

    # Either a structured ProcessInfo or a plain string.
    process_info: Union[ProcessInfo, str]
206
+
207
+
208
class Processes(BaseModel):
    """Process list associated with one ``gpu`` value."""

    gpu: int
    process_list: list[ProcessListItem]
211
+
212
+
213
+ # FW
214
class FwListItem(BaseModel):
    """One element of ``Fw.fw_list``: an id string and a version string."""

    fw_id: str
    fw_version: str
217
+
218
+
219
class Fw(BaseModel):
    """Firmware list associated with one ``gpu`` value."""

    gpu: int
    # Either a list of structured entries or a plain string.
    fw_list: Union[list[FwListItem], str]
222
+
223
+
224
class AmdSmiListItem(BaseModel):
    """Identifiers of one listed device; all fields are required."""

    gpu: int
    bdf: str
    uuid: str
    kfd_id: int
    node_id: int
    partition_id: int
231
+
232
+
233
class AmdSmiVersion(BaseModel):
    """Contains the versioning info for amd-smi"""

    tool: Optional[str] = None
    version: Optional[str] = None
    amdsmi_library_version: Optional[str] = None
    rocm_version: Optional[str] = None
    amdgpu_version: Optional[str] = None
    amd_hsmp_driver_version: Optional[str] = None

    @field_validator("*", mode="before")
    @classmethod
    def _stringify(cls, v):
        """Coerce any raw field value to ``str`` (``None`` is kept as ``None``).

        Bytes are decoded as UTF-8 with undecodable bytes dropped, tuples and
        lists are joined with ".", and every other non-string goes through
        ``str()``.
        """
        if isinstance(v, (bytes, bytearray)):
            return v.decode("utf-8", "ignore")
        if isinstance(v, (tuple, list)):
            return ".".join(map(str, v))
        if v is None or isinstance(v, str):
            return v
        return str(v)
253
+
254
+
255
class PartitionAccelerator(BaseModel):
    """Accelerator partition data"""

    gpu_id: int  # the only required field
    memory: Optional[str] = None
    accelerator_type: Optional[str] = None
    accelerator_profile_index: Optional[Union[str, int]] = None
    partition_id: Optional[int] = None
263
+
264
+
265
class PartitionMemory(BaseModel):
    """Memory Partition data"""

    gpu_id: int
    partition_type: Optional[str] = None  # defaults to None when absent
270
+
271
+
272
class PartitionCompute(BaseModel):
    """Compute Partition data"""

    gpu_id: int
    partition_type: Optional[str] = None  # defaults to None when absent
277
+
278
+
279
class Partition(BaseModel):
    """Contains the partition info for amd-smi"""

    # Both lists default to a fresh empty list per instance.
    memory_partition: list[PartitionMemory] = Field(default_factory=list)
    compute_partition: list[PartitionCompute] = Field(default_factory=list)
284
+
285
+
286
+ ### STATIC DATA ###
287
class StaticAsic(BaseModel):
    """ASIC section of the static data; all fields are required."""

    market_name: str
    vendor_id: str
    vendor_name: str
    subvendor_id: str
    device_id: str
    subsystem_id: str
    rev_id: str
    asic_serial: str
    oam_id: Union[int, str]  # can be N/A
    num_compute_units: Union[int, str]  # can be N/A
    target_graphics_version: str
299
+
300
+
301
class StaticBus(AmdSmiBaseModel):
    """Bus section of the static data; only ``bdf`` is required."""

    bdf: str
    max_pcie_width: Optional[ValueUnit] = None
    max_pcie_speed: Optional[ValueUnit] = None
    # The two string fields fall back to "unknown" when not supplied.
    pcie_interface_version: str = "unknown"
    slot_type: str = "unknown"
307
+
308
+
309
class StaticVbios(BaseModel):
    """VBIOS section of the static data; all fields are required strings."""

    name: str
    build_date: str
    part_number: str
    version: str
314
+
315
+
316
class StaticLimit(AmdSmiBaseModel):
    """Limit section of the static data; every field is optional and defaults to None."""

    max_power: Optional[ValueUnit] = None
    min_power: Optional[ValueUnit] = None
    socket_power: Optional[ValueUnit] = None
    slowdown_edge_temperature: Optional[ValueUnit] = None
    slowdown_hotspot_temperature: Optional[ValueUnit] = None
    slowdown_vram_temperature: Optional[ValueUnit] = None
    shutdown_edge_temperature: Optional[ValueUnit] = None
    shutdown_hotspot_temperature: Optional[ValueUnit] = None
    shutdown_vram_temperature: Optional[ValueUnit] = None

    @field_validator(
        "max_power",
        "min_power",
        "socket_power",
        "slowdown_edge_temperature",
        "slowdown_hotspot_temperature",
        "slowdown_vram_temperature",
        "shutdown_edge_temperature",
        "shutdown_hotspot_temperature",
        "shutdown_vram_temperature",
        mode="before",
    )
    @classmethod
    def na_validator(cls, raw):
        """Pass the raw value through ``na_to_none`` before type validation."""
        return na_to_none(raw)
338
+
339
+
340
class StaticDriver(BaseModel):
    """Driver section of the static data: name and version strings."""

    name: str
    version: str
343
+
344
+
345
class StaticBoard(BaseModel):
    """Board section of the static data; all fields are required strings."""

    model_config = ConfigDict(populate_by_name=True)

    # Model number is a reserved keyword for pydantic, so the value is stored
    # under a different attribute name and read from the "model_number" alias.
    amdsmi_model_number: str = Field(alias="model_number")
    product_serial: str
    fru_id: str
    product_name: str
    manufacturer_name: str
357
+
358
+
359
class StaticRas(BaseModel):
    """RAS section of the static data; all fields are required."""

    eeprom_version: str
    parity_schema: EccState
    single_bit_schema: EccState
    double_bit_schema: EccState
    poison_schema: EccState
    # Either a name -> EccState mapping or a plain string.
    ecc_block_state: Union[dict[str, EccState], str]
366
+
367
+
368
class StaticPartition(BaseModel):
    """Partition section of the static data."""

    # The name for compute_partition has changed we will support both for now:
    # the value is accepted under either "compute_partition" or
    # "accelerator_partition".
    compute_partition: str = Field(
        validation_alias=AliasChoices("compute_partition", "accelerator_partition")
    )
    memory_partition: str
    partition_id: int
376
+
377
+
378
class StaticPolicy(BaseModel):
    """One policy entry: an integer id and its description string."""

    policy_id: int
    policy_description: str
381
+
382
+
383
class StaticSocPstate(BaseModel):
    """SoC p-state section of the static data; entries live under ``policies``."""

    num_supported: int
    current_id: int
    policies: list[StaticPolicy]
387
+
388
+
389
class StaticXgmiPlpd(BaseModel):
    """XGMI PLPD section of the static data; entries live under ``plpds``."""

    num_supported: int
    current_id: int
    plpds: list[StaticPolicy]
393
+
394
+
395
class StaticNuma(BaseModel):
    """NUMA section of the static data."""

    node: int
    affinity: Union[int, str]  # can be N/A
398
+
399
+
400
class StaticVram(AmdSmiBaseModel):
    """VRAM section of the static data; only ``max_bandwidth`` has a default."""

    type: str
    vendor: Optional[str]
    size: Optional[ValueUnit]
    bit_width: Optional[ValueUnit]
    max_bandwidth: Optional[ValueUnit] = None

    @field_validator("vendor", "size", "bit_width", "max_bandwidth", mode="before")
    @classmethod
    def na_validator(cls, raw):
        """Pass the raw value through ``na_to_none`` before type validation."""
        return na_to_none(raw)
409
+
410
+
411
class StaticCacheInfoItem(AmdSmiBaseModel):
    """One element of ``AmdSmiStatic.cache_info``; only ``cache_size`` may be None."""

    cache: ValueUnit
    cache_properties: list[str]
    cache_size: Optional[ValueUnit]
    cache_level: ValueUnit
    max_num_cu_shared: ValueUnit
    num_cache_instance: ValueUnit

    @field_validator("cache_size", mode="before")
    @classmethod
    def na_validator(cls, raw):
        """Pass the raw value through ``na_to_none`` before type validation."""
        return na_to_none(raw)
419
+
420
+
421
class StaticFrequencyLevels(BaseModel):
    """Frequency level strings, read from the "Level N" keys through aliases."""

    model_config = ConfigDict(populate_by_name=True)

    # Only level 0 is required; levels 1 and 2 default to None.
    Level_0: str = Field(..., alias="Level 0")
    Level_1: Optional[str] = Field(default=None, alias="Level 1")
    Level_2: Optional[str] = Field(default=None, alias="Level 2")
429
+
430
+
431
class StaticClockData(BaseModel):
    """Clock entry of the static data: frequency levels plus the current level."""

    model_config = ConfigDict(populate_by_name=True)

    frequency_levels: StaticFrequencyLevels
    # Required, may be None; read from the "current level" key through its alias.
    current_level: Optional[int] = Field(..., alias="current level")

    @field_validator("current_level", mode="before")
    @classmethod
    def na_validator(cls, raw):
        """Pass the raw value through ``na_to_none`` before type validation."""
        return na_to_none(raw)
439
+
440
+
441
class AmdSmiStatic(BaseModel):
    """Contains all static data"""

    gpu: int
    asic: StaticAsic
    bus: StaticBus
    vbios: Optional[StaticVbios]
    limit: Optional[StaticLimit]
    driver: StaticDriver
    board: StaticBoard
    ras: StaticRas
    soc_pstate: Optional[StaticSocPstate]
    xgmi_plpd: Optional[StaticXgmiPlpd]
    process_isolation: str
    numa: StaticNuma
    vram: StaticVram
    cache_info: list[StaticCacheInfoItem]
    partition: Optional[StaticPartition] = None  # This has been removed in Amd-smi 26.0.0+d30a0afe+
    clock: Optional[dict[str, Union[StaticClockData, None]]] = None

    @field_validator("clock", mode="before")
    @classmethod
    def na_validator_dict(cls, raw):
        """Pass the raw mapping through ``na_to_none_dict`` before type validation."""
        return na_to_none_dict(raw)

    @field_validator("soc_pstate", "xgmi_plpd", "vbios", "limit", mode="before")
    @classmethod
    def na_validator(cls, raw):
        """Pass the raw value through ``na_to_none`` before type validation."""
        return na_to_none(raw)
464
+
465
+
466
+ # PAGES
467
class PageData(BaseModel):
    """One element of ``BadPages.retired``; all fields are required."""

    page_address: Union[int, str]
    page_size: Union[int, str]
    status: str
    value: Optional[int]  # must be supplied, but may be None
472
+
473
+
474
class BadPages(BaseModel):
    """Retired page list associated with one ``gpu`` value."""

    gpu: int
    retired: list[PageData]
477
+
478
+
479
+ # Metric Data
480
class MetricUsage(BaseModel):
    """Usage section of the metric data; all fields are required."""

    gfx_activity: Optional[ValueUnit]
    umc_activity: Optional[ValueUnit]
    mm_activity: Optional[ValueUnit]
    vcn_activity: list[Optional[Union[ValueUnit, str]]]
    jpeg_activity: list[Optional[Union[ValueUnit, str]]]
    gfx_busy_inst: Optional[dict[str, list[Optional[Union[ValueUnit, str]]]]]
    jpeg_busy: Optional[dict[str, list[Optional[Union[ValueUnit, str]]]]]
    vcn_busy: Optional[dict[str, list[Optional[Union[ValueUnit, str]]]]]

    @field_validator("vcn_activity", "jpeg_activity", mode="before")
    @classmethod
    def na_validator_list(cls, raw):
        """Pass the raw list through ``na_to_none_list`` before type validation."""
        return na_to_none_list(raw)

    @field_validator(
        "gfx_activity",
        "umc_activity",
        "mm_activity",
        "gfx_busy_inst",
        "jpeg_busy",
        "vcn_busy",
        mode="before",
    )
    @classmethod
    def na_validator(cls, raw):
        """Pass the raw value through ``na_to_none`` before type validation."""
        return na_to_none(raw)
501
+
502
+
503
class MetricPower(BaseModel):
    """Power section of the metric data; every field is required but may be None."""

    socket_power: Optional[ValueUnit]
    gfx_voltage: Optional[ValueUnit]
    soc_voltage: Optional[ValueUnit]
    mem_voltage: Optional[ValueUnit]
    throttle_status: Optional[str]
    power_management: Optional[str]

    @field_validator(
        "socket_power",
        "gfx_voltage",
        "soc_voltage",
        "mem_voltage",
        "throttle_status",
        "power_management",
        mode="before",
    )
    @classmethod
    def na_validator(cls, raw):
        """Pass the raw value through ``na_to_none`` before type validation."""
        return na_to_none(raw)
519
+
520
+
521
class MetricClockData(BaseModel):
    """One clock entry of the metric data; every field is required but may be None."""

    clk: Optional[ValueUnit]
    min_clk: Optional[ValueUnit]
    max_clk: Optional[ValueUnit]
    clk_locked: Optional[Union[int, str, dict]]
    deep_sleep: Optional[Union[int, str, dict]]

    @field_validator("clk", "min_clk", "max_clk", "clk_locked", "deep_sleep", mode="before")
    @classmethod
    def na_validator(cls, raw):
        """Pass the raw value through ``na_to_none`` before type validation."""
        return na_to_none(raw)
530
+
531
+
532
class MetricTemperature(BaseModel):
    """Temperature section of the metric data; every field is required but may be None."""

    edge: Optional[ValueUnit]
    hotspot: Optional[ValueUnit]
    mem: Optional[ValueUnit]

    @field_validator("edge", "hotspot", "mem", mode="before")
    @classmethod
    def na_validator(cls, raw):
        """Pass the raw value through ``na_to_none`` before type validation."""
        return na_to_none(raw)
537
+
538
+
539
class MetricPcie(BaseModel):
    """PCIe section of the metric data; every field is required but may be None."""

    width: Optional[int]
    speed: Optional[ValueUnit]
    bandwidth: Optional[ValueUnit]
    replay_count: Optional[int]
    l0_to_recovery_count: Optional[int]
    replay_roll_over_count: Optional[int]
    nak_sent_count: Optional[int]
    nak_received_count: Optional[int]
    current_bandwidth_sent: Optional[int]
    current_bandwidth_received: Optional[int]
    max_packet_size: Optional[int]
    lc_perf_other_end_recovery: Optional[int]

    @field_validator(
        "width",
        "speed",
        "bandwidth",
        "replay_count",
        "l0_to_recovery_count",
        "replay_roll_over_count",
        "nak_sent_count",
        "nak_received_count",
        "current_bandwidth_sent",
        "current_bandwidth_received",
        "max_packet_size",
        "lc_perf_other_end_recovery",
        mode="before",
    )
    @classmethod
    def na_validator(cls, raw):
        """Pass the raw value through ``na_to_none`` before type validation."""
        return na_to_none(raw)
567
+
568
+
569
class MetricEccTotals(BaseModel):
    """ECC total counters; every field is required but may be None."""

    total_correctable_count: Optional[int]
    total_uncorrectable_count: Optional[int]
    total_deferred_count: Optional[int]
    cache_correctable_count: Optional[int]
    cache_uncorrectable_count: Optional[int]

    @field_validator(
        "total_correctable_count",
        "total_uncorrectable_count",
        "total_deferred_count",
        "cache_correctable_count",
        "cache_uncorrectable_count",
        mode="before",
    )
    @classmethod
    def na_validator(cls, raw):
        """Pass the raw value through ``na_to_none`` before type validation."""
        return na_to_none(raw)
583
+
584
+
585
class MetricErrorCounts(BaseModel):
    """Error counters kept as strings; every field is required but may be None."""

    correctable_count: Optional[str]
    uncorrectable_count: Optional[str]
    deferred_count: Optional[str]

    @field_validator("correctable_count", "uncorrectable_count", "deferred_count", mode="before")
    @classmethod
    def na_validator(cls, raw):
        """Pass the raw value through ``na_to_none`` before type validation."""
        return na_to_none(raw)
592
+
593
+
594
class MetricFan(BaseModel):
    """Fan section of the metric data; every field is required but may be None."""

    speed: Optional[ValueUnit]
    max: Optional[ValueUnit]
    rpm: Optional[ValueUnit]
    usage: Optional[ValueUnit]

    @field_validator("speed", "max", "rpm", "usage", mode="before")
    @classmethod
    def na_validator(cls, raw):
        """Pass the raw value through ``na_to_none`` before type validation."""
        return na_to_none(raw)
600
+
601
+
602
class MetricVoltageCurve(BaseModel):
    """Frequency/voltage pairs for points 0-2; every field is required but may be None."""

    point_0_frequency: Optional[ValueUnit]
    point_0_voltage: Optional[ValueUnit]
    point_1_frequency: Optional[ValueUnit]
    point_1_voltage: Optional[ValueUnit]
    point_2_frequency: Optional[ValueUnit]
    point_2_voltage: Optional[ValueUnit]

    @field_validator(
        "point_0_frequency",
        "point_0_voltage",
        "point_1_frequency",
        "point_1_voltage",
        "point_2_frequency",
        "point_2_voltage",
        mode="before",
    )
    @classmethod
    def na_validator(cls, raw):
        """Pass the raw value through ``na_to_none`` before type validation."""
        return na_to_none(raw)
619
+
620
+
621
class MetricEnergy(BaseModel):
    """Energy section of the metric data."""

    total_energy_consumption: Optional[ValueUnit]  # required, may be None

    @field_validator("total_energy_consumption", mode="before")
    @classmethod
    def na_validator(cls, raw):
        """Pass the raw value through ``na_to_none`` before type validation."""
        return na_to_none(raw)
624
+
625
+
626
class MetricMemUsage(BaseModel):
    """Memory usage section of the metric data; every field is required but may be None."""

    total_vram: Optional[ValueUnit]
    used_vram: Optional[ValueUnit]
    free_vram: Optional[ValueUnit]
    total_visible_vram: Optional[ValueUnit]
    used_visible_vram: Optional[ValueUnit]
    free_visible_vram: Optional[ValueUnit]
    total_gtt: Optional[ValueUnit]
    used_gtt: Optional[ValueUnit]
    free_gtt: Optional[ValueUnit]

    @field_validator(
        "total_vram",
        "used_vram",
        "free_vram",
        "total_visible_vram",
        "used_visible_vram",
        "free_visible_vram",
        "total_gtt",
        "used_gtt",
        "free_gtt",
        mode="before",
    )
    @classmethod
    def na_validator(cls, raw):
        """Pass the raw value through ``na_to_none`` before type validation."""
        return na_to_none(raw)
648
+
649
+
650
class MetricThrottleVu(BaseModel):
    """Throttle metric payload; every field has a default."""

    xcp_0: Optional[list[Optional[Union[ValueUnit, str]]]] = None
    # Deprecated below
    value: Optional[dict[str, list[Union[int, str]]]] = Field(deprecated=True, default=None)
    unit: str = Field(deprecated=True, default="")
655
+
656
+
657
# Annotation shared by every MetricThrottle field: either of the two reading
# models, or None when the counter is absent.
_ThrottleReading = Optional[Union[MetricThrottleVu, ValueUnit]]

# Names of every MetricThrottle field that is routed through na_to_none.
_THROTTLE_FIELDS = (
    "accumulation_counter",
    "gfx_clk_below_host_limit_accumulated",
    "gfx_clk_below_host_limit_power_accumulated",
    "gfx_clk_below_host_limit_power_violation_activity",
    "gfx_clk_below_host_limit_power_violation_status",
    "gfx_clk_below_host_limit_violation_activity",
    "gfx_clk_below_host_limit_violation_accumulated",
    "gfx_clk_below_host_limit_violation_status",
    "gfx_clk_below_host_limit_thermal_violation_accumulated",
    "gfx_clk_below_host_limit_thermal_violation_activity",
    "gfx_clk_below_host_limit_thermal_violation_status",
    "gfx_clk_below_host_limit_thermal_accumulated",
    "hbm_thermal_accumulated",
    "hbm_thermal_violation_activity",
    "hbm_thermal_violation_status",
    "low_utilization_violation_accumulated",
    "low_utilization_violation_activity",
    "low_utilization_violation_status",
    "ppt_accumulated",
    "ppt_violation_activity",
    "ppt_violation_status",
    "prochot_accumulated",
    "prochot_violation_activity",
    "prochot_violation_status",
    "socket_thermal_accumulated",
    "socket_thermal_violation_activity",
    "socket_thermal_violation_status",
    "vr_thermal_accumulated",
    "vr_thermal_violation_activity",
    "vr_thermal_violation_status",
    "total_gfx_clk_below_host_limit_accumulated",
    "low_utilization_accumulated",
    "total_gfx_clk_below_host_limit_violation_status",
    "total_gfx_clk_below_host_limit_violation_activity",
)


# Throttle / violation counters for one GPU. Every field is optional and
# defaults to None, so any subset of counters may be supplied.
class MetricThrottle(AmdSmiBaseModel):
    accumulation_counter: _ThrottleReading = None

    # gfx clock below host limit
    gfx_clk_below_host_limit_accumulated: _ThrottleReading = None
    gfx_clk_below_host_limit_power_accumulated: _ThrottleReading = None
    gfx_clk_below_host_limit_power_violation_activity: _ThrottleReading = None
    gfx_clk_below_host_limit_power_violation_status: _ThrottleReading = None
    gfx_clk_below_host_limit_violation_activity: _ThrottleReading = None
    gfx_clk_below_host_limit_violation_accumulated: _ThrottleReading = None
    gfx_clk_below_host_limit_violation_status: _ThrottleReading = None
    gfx_clk_below_host_limit_thermal_violation_accumulated: _ThrottleReading = None
    gfx_clk_below_host_limit_thermal_violation_activity: _ThrottleReading = None
    gfx_clk_below_host_limit_thermal_violation_status: _ThrottleReading = None
    gfx_clk_below_host_limit_thermal_accumulated: _ThrottleReading = None

    # hbm / low utilization / ppt / prochot / socket / vr counters
    hbm_thermal_accumulated: _ThrottleReading = None
    hbm_thermal_violation_activity: _ThrottleReading = None
    hbm_thermal_violation_status: _ThrottleReading = None
    low_utilization_violation_accumulated: _ThrottleReading = None
    low_utilization_violation_activity: _ThrottleReading = None
    low_utilization_violation_status: _ThrottleReading = None
    ppt_accumulated: _ThrottleReading = None
    ppt_violation_activity: _ThrottleReading = None
    ppt_violation_status: _ThrottleReading = None
    prochot_accumulated: _ThrottleReading = None
    prochot_violation_activity: _ThrottleReading = None
    prochot_violation_status: _ThrottleReading = None
    socket_thermal_accumulated: _ThrottleReading = None
    socket_thermal_violation_activity: _ThrottleReading = None
    socket_thermal_violation_status: _ThrottleReading = None
    vr_thermal_accumulated: _ThrottleReading = None
    vr_thermal_violation_activity: _ThrottleReading = None
    vr_thermal_violation_status: _ThrottleReading = None

    # totals
    total_gfx_clk_below_host_limit_accumulated: _ThrottleReading = None
    low_utilization_accumulated: _ThrottleReading = None
    total_gfx_clk_below_host_limit_violation_status: _ThrottleReading = None
    total_gfx_clk_below_host_limit_violation_activity: _ThrottleReading = None

    # "before" mode: na_to_none (defined elsewhere in this file) receives the
    # raw input of each field ahead of type validation.
    na_validator = field_validator(*_THROTTLE_FIELDS, mode="before")(na_to_none)
751
+
752
+
753
class EccData(BaseModel):
    """ECC counts collected per ecc block"""

    # Each count defaults to 0 when the key is missing from the input.
    correctable_count: Optional[int] = 0
    uncorrectable_count: Optional[int] = 0
    deferred_count: Optional[int] = 0

    # "before" mode: na_to_none (defined elsewhere in this file) receives the
    # raw input of each count ahead of integer validation.
    na_validator = field_validator(
        "correctable_count",
        "uncorrectable_count",
        "deferred_count",
        mode="before",
    )(na_to_none)
763
+
764
+
765
# Metric record for a single GPU: groups the usage, power, clock, temperature,
# pcie, ecc, fan, voltage-curve, energy, memory and throttle sub-models.
class AmdSmiMetric(BaseModel):
    gpu: int
    usage: MetricUsage
    power: MetricPower
    clock: dict[str, MetricClockData]
    temperature: MetricTemperature
    pcie: MetricPcie
    ecc: MetricEccTotals
    ecc_blocks: Union[dict[str, EccData], str]
    fan: MetricFan
    voltage_curve: Optional[MetricVoltageCurve]
    perf_level: Optional[Union[str, dict]]
    xgmi_err: Optional[Union[str, dict]]
    energy: Optional[MetricEnergy]
    mem_usage: MetricMemUsage
    throttle: MetricThrottle

    # "before" mode: na_to_none (defined elsewhere in this file) receives the
    # raw input of these two fields ahead of type validation.
    na_validator = field_validator(
        "xgmi_err",
        "perf_level",
        mode="before",
    )(na_to_none)

    @field_validator("ecc_blocks", mode="before")
    @classmethod
    def validate_ecc_blocks(cls, value: Union[dict[str, EccData], str]) -> dict[str, EccData]:
        """Normalise the raw ``ecc_blocks`` input.

        Any string input is assumed to be the "N/A" marker and is replaced by
        an empty dict; every other input is handed on unchanged.
        """
        return {} if isinstance(value, str) else value

    @field_validator("energy", mode="before")
    @classmethod
    def validate_energy(cls, value: Optional[Any]) -> Optional[MetricEnergy]:
        """Normalise the raw ``energy`` input.

        ``None`` and the literal string "N/A" both become ``None``; every
        other input is handed on unchanged.
        """
        return None if (value is None or value == "N/A") else value
800
+
801
+
802
+ ### LINK DATA ###
803
+
804
+
805
class LinkStatusTable(Enum):
    """Link states, keyed by the single-letter codes used as member values."""

    UP = "U"
    DOWN = "D"
    DISABLED = "X"
809
+
810
+
811
class BiDirectionalTable(Enum):
    """Accepted values for a link's bi-directional entry."""

    SELF = "SELF"  # entry marked "SELF"
    TRUE = "T"
814
+
815
+
816
class DmaTable(Enum):
    """Accepted values for a link's DMA entry."""

    SELF = "SELF"  # entry marked "SELF"
    TRUE = "T"
819
+
820
+
821
class AtomicsTable(Enum):
    """Accepted values for a link's atomics entry."""

    SELF = "SELF"  # entry marked "SELF"
    TRUE = "64,32"  # both widths listed in one value
    THIRTY_TWO = "32"
    SIXTY_FOUR = "64"
826
+
827
+
828
class LinkTypes(Enum):
    """Accepted link type names; each member's value equals its name."""

    XGMI = "XGMI"
    PCIE = "PCIE"
    SELF = "SELF"
832
+
833
+
834
class AccessTable(Enum):
    """Accepted access states; each member's value equals its name."""

    ENABLED = "ENABLED"
    DISABLED = "DISABLED"
837
+
838
+
839
+ # XGMI
840
# One XGMI link entry: the GPU index and BDF string it refers to, plus
# optional read/write readings.
class XgmiLink(BaseModel):
    gpu: int
    bdf: str
    read: Optional[ValueUnit]
    write: Optional[ValueUnit]

    # "before" mode: na_to_none (defined elsewhere in this file) receives the
    # raw input of both readings ahead of type validation.
    na_validator = field_validator(
        "read",
        "write",
        mode="before",
    )(na_to_none)
846
+
847
+
848
# Link-level XGMI metrics: optional bit rate and maximum bandwidth, the link
# type string, and the list of individual XgmiLink entries.
class XgmiLinkMetrics(BaseModel):
    bit_rate: Optional[ValueUnit]
    max_bandwidth: Optional[ValueUnit]
    link_type: str
    links: list[XgmiLink]

    # "before" mode: na_to_none (defined elsewhere in this file) receives the
    # raw input of both readings ahead of type validation.
    na_validator = field_validator(
        "max_bandwidth",
        "bit_rate",
        mode="before",
    )(na_to_none)
854
+
855
+
856
# XGMI metrics for one GPU, identified by index and BDF string.
class XgmiMetrics(BaseModel):
    gpu: int
    bdf: str
    link_metrics: XgmiLinkMetrics  # all three fields are required
860
+
861
+
862
# XGMI link-status row for one GPU, identified by index and BDF string.
class XgmiLinks(BaseModel):
    gpu: int
    bdf: str
    link_status: list[LinkStatusTable]  # one LinkStatusTable member per entry
866
+
867
+
868
class CoherentTable(Enum):
    """Accepted values for a link's coherency entry."""

    # NOTE(review): "COHERANT" looks like a misspelling of "COHERENT". The
    # member names are public, so they are kept as-is here.
    COHERANT = "C"
    NON_COHERANT = "NC"
    SELF = "SELF"
872
+
873
+
874
+ # TOPO
875
+
876
+
877
def _parse_bandwidth_bound(bandwidth: str, index: int) -> Optional[int]:
    """Return one bound of a ``"<from>-<to>"`` bandwidth string.

    Args:
        bandwidth: Raw bandwidth string.
        index: ``0`` for the part before the dash, ``1`` for the part after.

    Returns:
        The selected part as an int, or ``None`` when the string does not
        split into exactly two ``-``-separated parts or the selected part is
        not an integer.
    """
    parts = bandwidth.split("-")
    if len(parts) != 2:
        return None
    try:
        return int(parts[index])
    except ValueError:
        # Two parts but not numeric: treat it like any other unexpected
        # format instead of raising out of a computed field.
        return None


# One topology link entry between GPUs.
class TopoLink(BaseModel):
    gpu: int
    bdf: str
    weight: int
    link_status: AccessTable
    link_type: LinkTypes
    num_hops: int
    bandwidth: str
    # The below fields are sometimes missing, so we use Optional
    coherent: Optional[CoherentTable] = None
    atomics: Optional[AtomicsTable] = None
    dma: Optional[DmaTable] = None
    bi_dir: Optional[BiDirectionalTable] = None

    @computed_field
    def bandwidth_from(self) -> Optional[int]:
        """Get the bandwidth from the link.

        Returns the integer before the dash in ``bandwidth``, or ``None`` if
        the bandwidth is not in the expected ``"<from>-<to>"`` format.
        """
        return _parse_bandwidth_bound(self.bandwidth, 0)

    @computed_field
    def bandwidth_to(self) -> Optional[int]:
        """Get the bandwidth to the link.

        Returns the integer after the dash in ``bandwidth``, or ``None`` if
        the bandwidth is not in the expected ``"<from>-<to>"`` format.
        """
        return _parse_bandwidth_bound(self.bandwidth, 1)
910
+
911
+
912
# Topology row for one GPU, identified by index and BDF string.
class Topo(BaseModel):
    gpu: int
    bdf: str
    links: list[TopoLink]  # all three fields are required
916
+
917
+
918
class AmdSmiTstData(BaseModel):
    """Summary of amdsmitst results, with list and count of passing/skipped/failed tests"""

    # Test names per outcome; each instance gets its own fresh empty list.
    passed_tests: list[str] = Field(default_factory=list)
    skipped_tests: list[str] = Field(default_factory=list)
    failed_tests: list[str] = Field(default_factory=list)

    # Counts per outcome. They are stored as independent fields; nothing in
    # this class derives them from the lists above.
    passed_test_count: int = 0
    skipped_test_count: int = 0
    failed_test_count: int = 0
927
+
928
+
929
+ class AmdSmiDataModel(DataModel):
930
+ """Data model for amd-smi data.
931
+
932
+ Optionals are used to allow for the data to be missing,
933
+ This makes the data class more flexible for the analyzer
934
+ which consumes only the required data. If any more data is
935
+ required for the analyzer then they should not be set to
936
+ default.
937
+ """
938
+
939
+ model_config = ConfigDict(
940
+ str_min_length=1,
941
+ str_strip_whitespace=True,
942
+ populate_by_name=True,
943
+ )
944
+
945
+ version: Optional[AmdSmiVersion] = None
946
+ gpu_list: Optional[list[AmdSmiListItem]] = Field(default_factory=list)
947
+ partition: Optional[Partition] = None
948
+ process: Optional[list[Processes]] = Field(default_factory=list)
949
+ topology: Optional[list[Topo]] = Field(default_factory=list)
950
+ firmware: Optional[list[Fw]] = Field(default_factory=list)
951
+ bad_pages: Optional[list[BadPages]] = Field(default_factory=list)
952
+ static: Optional[list[AmdSmiStatic]] = Field(default_factory=list)
953
+ metric: Optional[list[AmdSmiMetric]] = Field(default_factory=list)
954
+ xgmi_metric: Optional[list[XgmiMetrics]] = Field(default_factory=list)
955
+ xgmi_link: Optional[list[XgmiLinks]] = Field(default_factory=list)
956
+ cper_data: Optional[list[FileModel]] = Field(default_factory=list)
957
+ amdsmitst_data: AmdSmiTstData = Field(default_factory=AmdSmiTstData)
958
+
959
def get_list(self, gpu: int) -> Optional[AmdSmiListItem]:
    """Return the first ``gpu_list`` entry whose ``gpu`` equals *gpu*.

    Returns ``None`` when ``gpu_list`` is ``None`` or holds no matching entry.
    """
    entries = self.gpu_list
    if entries is None:
        return None
    return next((entry for entry in entries if entry.gpu == gpu), None)
967
+
968
def get_process(self, gpu: int) -> Optional[Processes]:
    """Return the first ``process`` entry whose ``gpu`` equals *gpu*.

    Returns ``None`` when ``process`` is ``None`` or holds no matching entry.
    """
    entries = self.process
    if entries is None:
        return None
    return next((entry for entry in entries if entry.gpu == gpu), None)
976
+
977
def get_firmware(self, gpu: int) -> Optional[Fw]:
    """Return the first ``firmware`` entry whose ``gpu`` equals *gpu*.

    Returns ``None`` when ``firmware`` is ``None`` or holds no matching entry.
    """
    entries = self.firmware
    if entries is None:
        return None
    return next((entry for entry in entries if entry.gpu == gpu), None)
985
+
986
def get_static(self, gpu: int) -> Optional[AmdSmiStatic]:
    """Return the first ``static`` entry whose ``gpu`` equals *gpu*.

    Returns ``None`` when ``static`` is ``None`` or holds no matching entry.
    """
    entries = self.static
    if entries is None:
        return None
    return next((entry for entry in entries if entry.gpu == gpu), None)
994
+
995
def get_bad_pages(self, gpu: int) -> Optional[BadPages]:
    """Return the first ``bad_pages`` entry whose ``gpu`` equals *gpu*.

    Returns ``None`` when ``bad_pages`` is ``None`` or holds no matching entry.
    """
    entries = self.bad_pages
    if entries is None:
        return None
    return next((entry for entry in entries if entry.gpu == gpu), None)