amd-node-scraper 0.0.1__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (197)
  1. amd_node_scraper-0.0.1.dist-info/LICENSE +21 -0
  2. amd_node_scraper-0.0.1.dist-info/METADATA +424 -0
  3. amd_node_scraper-0.0.1.dist-info/RECORD +197 -0
  4. amd_node_scraper-0.0.1.dist-info/WHEEL +5 -0
  5. amd_node_scraper-0.0.1.dist-info/entry_points.txt +2 -0
  6. amd_node_scraper-0.0.1.dist-info/top_level.txt +1 -0
  7. nodescraper/__init__.py +32 -0
  8. nodescraper/base/__init__.py +34 -0
  9. nodescraper/base/inbandcollectortask.py +118 -0
  10. nodescraper/base/inbanddataplugin.py +39 -0
  11. nodescraper/base/regexanalyzer.py +120 -0
  12. nodescraper/cli/__init__.py +29 -0
  13. nodescraper/cli/cli.py +511 -0
  14. nodescraper/cli/constants.py +27 -0
  15. nodescraper/cli/dynamicparserbuilder.py +171 -0
  16. nodescraper/cli/helper.py +517 -0
  17. nodescraper/cli/inputargtypes.py +129 -0
  18. nodescraper/configbuilder.py +123 -0
  19. nodescraper/configregistry.py +66 -0
  20. nodescraper/configs/node_status.json +19 -0
  21. nodescraper/connection/__init__.py +25 -0
  22. nodescraper/connection/inband/__init__.py +46 -0
  23. nodescraper/connection/inband/inband.py +171 -0
  24. nodescraper/connection/inband/inbandlocal.py +93 -0
  25. nodescraper/connection/inband/inbandmanager.py +151 -0
  26. nodescraper/connection/inband/inbandremote.py +173 -0
  27. nodescraper/connection/inband/sshparams.py +43 -0
  28. nodescraper/constants.py +26 -0
  29. nodescraper/enums/__init__.py +40 -0
  30. nodescraper/enums/eventcategory.py +89 -0
  31. nodescraper/enums/eventpriority.py +42 -0
  32. nodescraper/enums/executionstatus.py +44 -0
  33. nodescraper/enums/osfamily.py +34 -0
  34. nodescraper/enums/systeminteraction.py +41 -0
  35. nodescraper/enums/systemlocation.py +33 -0
  36. nodescraper/generictypes.py +36 -0
  37. nodescraper/interfaces/__init__.py +44 -0
  38. nodescraper/interfaces/connectionmanager.py +143 -0
  39. nodescraper/interfaces/dataanalyzertask.py +138 -0
  40. nodescraper/interfaces/datacollectortask.py +185 -0
  41. nodescraper/interfaces/dataplugin.py +356 -0
  42. nodescraper/interfaces/plugin.py +127 -0
  43. nodescraper/interfaces/resultcollator.py +56 -0
  44. nodescraper/interfaces/task.py +164 -0
  45. nodescraper/interfaces/taskresulthook.py +39 -0
  46. nodescraper/models/__init__.py +48 -0
  47. nodescraper/models/analyzerargs.py +93 -0
  48. nodescraper/models/collectorargs.py +30 -0
  49. nodescraper/models/connectionconfig.py +34 -0
  50. nodescraper/models/datamodel.py +171 -0
  51. nodescraper/models/datapluginresult.py +39 -0
  52. nodescraper/models/event.py +158 -0
  53. nodescraper/models/pluginconfig.py +38 -0
  54. nodescraper/models/pluginresult.py +39 -0
  55. nodescraper/models/systeminfo.py +44 -0
  56. nodescraper/models/taskresult.py +185 -0
  57. nodescraper/models/timerangeargs.py +38 -0
  58. nodescraper/pluginexecutor.py +274 -0
  59. nodescraper/pluginregistry.py +152 -0
  60. nodescraper/plugins/__init__.py +25 -0
  61. nodescraper/plugins/inband/__init__.py +25 -0
  62. nodescraper/plugins/inband/amdsmi/__init__.py +28 -0
  63. nodescraper/plugins/inband/amdsmi/amdsmi_analyzer.py +821 -0
  64. nodescraper/plugins/inband/amdsmi/amdsmi_collector.py +1313 -0
  65. nodescraper/plugins/inband/amdsmi/amdsmi_plugin.py +43 -0
  66. nodescraper/plugins/inband/amdsmi/amdsmidata.py +1002 -0
  67. nodescraper/plugins/inband/amdsmi/analyzer_args.py +50 -0
  68. nodescraper/plugins/inband/amdsmi/cper.py +65 -0
  69. nodescraper/plugins/inband/bios/__init__.py +29 -0
  70. nodescraper/plugins/inband/bios/analyzer_args.py +64 -0
  71. nodescraper/plugins/inband/bios/bios_analyzer.py +93 -0
  72. nodescraper/plugins/inband/bios/bios_collector.py +93 -0
  73. nodescraper/plugins/inband/bios/bios_plugin.py +43 -0
  74. nodescraper/plugins/inband/bios/biosdata.py +30 -0
  75. nodescraper/plugins/inband/cmdline/__init__.py +25 -0
  76. nodescraper/plugins/inband/cmdline/analyzer_args.py +80 -0
  77. nodescraper/plugins/inband/cmdline/cmdline_analyzer.py +113 -0
  78. nodescraper/plugins/inband/cmdline/cmdline_collector.py +77 -0
  79. nodescraper/plugins/inband/cmdline/cmdline_plugin.py +43 -0
  80. nodescraper/plugins/inband/cmdline/cmdlinedata.py +30 -0
  81. nodescraper/plugins/inband/device_enumeration/__init__.py +29 -0
  82. nodescraper/plugins/inband/device_enumeration/analyzer_args.py +73 -0
  83. nodescraper/plugins/inband/device_enumeration/device_enumeration_analyzer.py +81 -0
  84. nodescraper/plugins/inband/device_enumeration/device_enumeration_collector.py +176 -0
  85. nodescraper/plugins/inband/device_enumeration/device_enumeration_plugin.py +45 -0
  86. nodescraper/plugins/inband/device_enumeration/deviceenumdata.py +36 -0
  87. nodescraper/plugins/inband/dimm/__init__.py +25 -0
  88. nodescraper/plugins/inband/dimm/collector_args.py +31 -0
  89. nodescraper/plugins/inband/dimm/dimm_collector.py +151 -0
  90. nodescraper/plugins/inband/dimm/dimm_plugin.py +40 -0
  91. nodescraper/plugins/inband/dimm/dimmdata.py +30 -0
  92. nodescraper/plugins/inband/dkms/__init__.py +25 -0
  93. nodescraper/plugins/inband/dkms/analyzer_args.py +85 -0
  94. nodescraper/plugins/inband/dkms/dkms_analyzer.py +106 -0
  95. nodescraper/plugins/inband/dkms/dkms_collector.py +76 -0
  96. nodescraper/plugins/inband/dkms/dkms_plugin.py +43 -0
  97. nodescraper/plugins/inband/dkms/dkmsdata.py +33 -0
  98. nodescraper/plugins/inband/dmesg/__init__.py +28 -0
  99. nodescraper/plugins/inband/dmesg/analyzer_args.py +33 -0
  100. nodescraper/plugins/inband/dmesg/collector_args.py +39 -0
  101. nodescraper/plugins/inband/dmesg/dmesg_analyzer.py +503 -0
  102. nodescraper/plugins/inband/dmesg/dmesg_collector.py +164 -0
  103. nodescraper/plugins/inband/dmesg/dmesg_plugin.py +44 -0
  104. nodescraper/plugins/inband/dmesg/dmesgdata.py +116 -0
  105. nodescraper/plugins/inband/fabrics/__init__.py +28 -0
  106. nodescraper/plugins/inband/fabrics/fabrics_collector.py +726 -0
  107. nodescraper/plugins/inband/fabrics/fabrics_plugin.py +37 -0
  108. nodescraper/plugins/inband/fabrics/fabricsdata.py +140 -0
  109. nodescraper/plugins/inband/journal/__init__.py +28 -0
  110. nodescraper/plugins/inband/journal/collector_args.py +33 -0
  111. nodescraper/plugins/inband/journal/journal_collector.py +107 -0
  112. nodescraper/plugins/inband/journal/journal_plugin.py +40 -0
  113. nodescraper/plugins/inband/journal/journaldata.py +44 -0
  114. nodescraper/plugins/inband/kernel/__init__.py +25 -0
  115. nodescraper/plugins/inband/kernel/analyzer_args.py +64 -0
  116. nodescraper/plugins/inband/kernel/kernel_analyzer.py +91 -0
  117. nodescraper/plugins/inband/kernel/kernel_collector.py +129 -0
  118. nodescraper/plugins/inband/kernel/kernel_plugin.py +43 -0
  119. nodescraper/plugins/inband/kernel/kerneldata.py +32 -0
  120. nodescraper/plugins/inband/kernel_module/__init__.py +25 -0
  121. nodescraper/plugins/inband/kernel_module/analyzer_args.py +59 -0
  122. nodescraper/plugins/inband/kernel_module/kernel_module_analyzer.py +211 -0
  123. nodescraper/plugins/inband/kernel_module/kernel_module_collector.py +264 -0
  124. nodescraper/plugins/inband/kernel_module/kernel_module_data.py +60 -0
  125. nodescraper/plugins/inband/kernel_module/kernel_module_plugin.py +43 -0
  126. nodescraper/plugins/inband/memory/__init__.py +25 -0
  127. nodescraper/plugins/inband/memory/analyzer_args.py +45 -0
  128. nodescraper/plugins/inband/memory/memory_analyzer.py +98 -0
  129. nodescraper/plugins/inband/memory/memory_collector.py +330 -0
  130. nodescraper/plugins/inband/memory/memory_plugin.py +43 -0
  131. nodescraper/plugins/inband/memory/memorydata.py +90 -0
  132. nodescraper/plugins/inband/network/__init__.py +28 -0
  133. nodescraper/plugins/inband/network/network_collector.py +1828 -0
  134. nodescraper/plugins/inband/network/network_plugin.py +37 -0
  135. nodescraper/plugins/inband/network/networkdata.py +319 -0
  136. nodescraper/plugins/inband/nvme/__init__.py +28 -0
  137. nodescraper/plugins/inband/nvme/nvme_collector.py +167 -0
  138. nodescraper/plugins/inband/nvme/nvme_plugin.py +37 -0
  139. nodescraper/plugins/inband/nvme/nvmedata.py +45 -0
  140. nodescraper/plugins/inband/os/__init__.py +25 -0
  141. nodescraper/plugins/inband/os/analyzer_args.py +64 -0
  142. nodescraper/plugins/inband/os/os_analyzer.py +73 -0
  143. nodescraper/plugins/inband/os/os_collector.py +131 -0
  144. nodescraper/plugins/inband/os/os_plugin.py +43 -0
  145. nodescraper/plugins/inband/os/osdata.py +31 -0
  146. nodescraper/plugins/inband/package/__init__.py +25 -0
  147. nodescraper/plugins/inband/package/analyzer_args.py +48 -0
  148. nodescraper/plugins/inband/package/package_analyzer.py +253 -0
  149. nodescraper/plugins/inband/package/package_collector.py +273 -0
  150. nodescraper/plugins/inband/package/package_plugin.py +43 -0
  151. nodescraper/plugins/inband/package/packagedata.py +41 -0
  152. nodescraper/plugins/inband/pcie/__init__.py +29 -0
  153. nodescraper/plugins/inband/pcie/analyzer_args.py +63 -0
  154. nodescraper/plugins/inband/pcie/pcie_analyzer.py +1081 -0
  155. nodescraper/plugins/inband/pcie/pcie_collector.py +690 -0
  156. nodescraper/plugins/inband/pcie/pcie_data.py +2017 -0
  157. nodescraper/plugins/inband/pcie/pcie_plugin.py +43 -0
  158. nodescraper/plugins/inband/process/__init__.py +25 -0
  159. nodescraper/plugins/inband/process/analyzer_args.py +45 -0
  160. nodescraper/plugins/inband/process/collector_args.py +31 -0
  161. nodescraper/plugins/inband/process/process_analyzer.py +91 -0
  162. nodescraper/plugins/inband/process/process_collector.py +115 -0
  163. nodescraper/plugins/inband/process/process_plugin.py +46 -0
  164. nodescraper/plugins/inband/process/processdata.py +34 -0
  165. nodescraper/plugins/inband/rocm/__init__.py +25 -0
  166. nodescraper/plugins/inband/rocm/analyzer_args.py +66 -0
  167. nodescraper/plugins/inband/rocm/rocm_analyzer.py +100 -0
  168. nodescraper/plugins/inband/rocm/rocm_collector.py +205 -0
  169. nodescraper/plugins/inband/rocm/rocm_plugin.py +43 -0
  170. nodescraper/plugins/inband/rocm/rocmdata.py +62 -0
  171. nodescraper/plugins/inband/storage/__init__.py +25 -0
  172. nodescraper/plugins/inband/storage/analyzer_args.py +38 -0
  173. nodescraper/plugins/inband/storage/collector_args.py +31 -0
  174. nodescraper/plugins/inband/storage/storage_analyzer.py +152 -0
  175. nodescraper/plugins/inband/storage/storage_collector.py +110 -0
  176. nodescraper/plugins/inband/storage/storage_plugin.py +44 -0
  177. nodescraper/plugins/inband/storage/storagedata.py +70 -0
  178. nodescraper/plugins/inband/sysctl/__init__.py +29 -0
  179. nodescraper/plugins/inband/sysctl/analyzer_args.py +67 -0
  180. nodescraper/plugins/inband/sysctl/sysctl_analyzer.py +81 -0
  181. nodescraper/plugins/inband/sysctl/sysctl_collector.py +101 -0
  182. nodescraper/plugins/inband/sysctl/sysctl_plugin.py +43 -0
  183. nodescraper/plugins/inband/sysctl/sysctldata.py +42 -0
  184. nodescraper/plugins/inband/syslog/__init__.py +28 -0
  185. nodescraper/plugins/inband/syslog/syslog_collector.py +121 -0
  186. nodescraper/plugins/inband/syslog/syslog_plugin.py +37 -0
  187. nodescraper/plugins/inband/syslog/syslogdata.py +46 -0
  188. nodescraper/plugins/inband/uptime/__init__.py +25 -0
  189. nodescraper/plugins/inband/uptime/uptime_collector.py +88 -0
  190. nodescraper/plugins/inband/uptime/uptime_plugin.py +37 -0
  191. nodescraper/plugins/inband/uptime/uptimedata.py +31 -0
  192. nodescraper/resultcollators/__init__.py +25 -0
  193. nodescraper/resultcollators/tablesummary.py +159 -0
  194. nodescraper/taskresulthooks/__init__.py +28 -0
  195. nodescraper/taskresulthooks/filesystemloghook.py +88 -0
  196. nodescraper/typeutils.py +171 -0
  197. nodescraper/utils.py +412 -0
@@ -0,0 +1,33 @@
1
+ ###############################################################################
2
+ #
3
+ # MIT License
4
+ #
5
+ # Copyright (c) 2025 Advanced Micro Devices, Inc.
6
+ #
7
+ # Permission is hereby granted, free of charge, to any person obtaining a copy
8
+ # of this software and associated documentation files (the "Software"), to deal
9
+ # in the Software without restriction, including without limitation the rights
10
+ # to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
11
+ # copies of the Software, and to permit persons to whom the Software is
12
+ # furnished to do so, subject to the following conditions:
13
+ #
14
+ # The above copyright notice and this permission notice shall be included in all
15
+ # copies or substantial portions of the Software.
16
+ #
17
+ # THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
18
+ # IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
19
+ # FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
20
+ # AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
21
+ # LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
22
+ # OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
23
+ # SOFTWARE.
24
+ #
25
+ ###############################################################################
26
+ from typing import Optional
27
+
28
+ from nodescraper.models import DataModel
29
+
30
+
31
class DkmsDataModel(DataModel):
    """Data model carrying DKMS information as two optional string fields.

    Both fields default to ``None``, so an instance can be created before
    either value is known.
    """

    # presumably the raw DKMS status text gathered by the collector -- confirm
    status: Optional[str] = None
    # presumably the DKMS version string gathered by the collector -- confirm
    version: Optional[str] = None
@@ -0,0 +1,28 @@
1
+ ###############################################################################
2
+ #
3
+ # MIT License
4
+ #
5
+ # Copyright (c) 2025 Advanced Micro Devices, Inc.
6
+ #
7
+ # Permission is hereby granted, free of charge, to any person obtaining a copy
8
+ # of this software and associated documentation files (the "Software"), to deal
9
+ # in the Software without restriction, including without limitation the rights
10
+ # to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
11
+ # copies of the Software, and to permit persons to whom the Software is
12
+ # furnished to do so, subject to the following conditions:
13
+ #
14
+ # The above copyright notice and this permission notice shall be included in all
15
+ # copies or substantial portions of the Software.
16
+ #
17
+ # THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
18
+ # IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
19
+ # FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
20
+ # AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
21
+ # LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
22
+ # OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
23
+ # SOFTWARE.
24
+ #
25
+ ###############################################################################
26
+ from .dmesg_plugin import DmesgPlugin
27
+
28
+ __all__ = ["DmesgPlugin"]
@@ -0,0 +1,33 @@
1
+ ###############################################################################
2
+ #
3
+ # MIT License
4
+ #
5
+ # Copyright (c) 2025 Advanced Micro Devices, Inc.
6
+ #
7
+ # Permission is hereby granted, free of charge, to any person obtaining a copy
8
+ # of this software and associated documentation files (the "Software"), to deal
9
+ # in the Software without restriction, including without limitation the rights
10
+ # to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
11
+ # copies of the Software, and to permit persons to whom the Software is
12
+ # furnished to do so, subject to the following conditions:
13
+ #
14
+ # The above copyright notice and this permission notice shall be included in all
15
+ # copies or substantial portions of the Software.
16
+ #
17
+ # THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
18
+ # IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
19
+ # FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
20
+ # AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
21
+ # LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
22
+ # OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
23
+ # SOFTWARE.
24
+ #
25
+ ###############################################################################
26
+ from typing import Optional
27
+
28
+ from nodescraper.models import TimeRangeAnalysisArgs
29
+
30
+
31
class DmesgAnalyzerArgs(TimeRangeAnalysisArgs):
    """Arguments accepted by the dmesg analyzer.

    Extends ``TimeRangeAnalysisArgs`` with two optional dmesg-specific
    settings; both may be omitted by the caller.
    """

    # Enabled by default.
    # NOTE(review): presumably toggles reporting of error lines that match
    # no known regex -- confirm against DmesgAnalyzer.
    check_unknown_dmesg_errors: Optional[bool] = True
    # No exclusions by default.
    # NOTE(review): presumably names of event categories to drop from the
    # analysis -- confirm against DmesgAnalyzer.
    exclude_category: Optional[set[str]] = None
@@ -0,0 +1,39 @@
1
+ ###############################################################################
2
+ #
3
+ # MIT License
4
+ #
5
+ # Copyright (c) 2025 Advanced Micro Devices, Inc.
6
+ #
7
+ # Permission is hereby granted, free of charge, to any person obtaining a copy
8
+ # of this software and associated documentation files (the "Software"), to deal
9
+ # in the Software without restriction, including without limitation the rights
10
+ # to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
11
+ # copies of the Software, and to permit persons to whom the Software is
12
+ # furnished to do so, subject to the following conditions:
13
+ #
14
+ # The above copyright notice and this permission notice shall be included in all
15
+ # copies or substantial portions of the Software.
16
+ #
17
+ # THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
18
+ # IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
19
+ # FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
20
+ # AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
21
+ # LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
22
+ # OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
23
+ # SOFTWARE.
24
+ #
25
+ ###############################################################################
26
+
27
+ from nodescraper.models import CollectorArgs
28
+
29
+
30
+ class DmesgCollectorArgs(CollectorArgs):
31
+ """Collector args
32
+
33
+ Args:
34
+ CollectorArgs (CollectorArgs): specific dmesg collector args
35
+ """
36
+
37
+ collect_rotated_logs: bool = False
38
+ skip_sudo: bool = False
39
+ log_dmesg_data: bool = True
@@ -0,0 +1,503 @@
1
+ ###############################################################################
2
+ #
3
+ # MIT License
4
+ #
5
+ # Copyright (c) 2025 Advanced Micro Devices, Inc.
6
+ #
7
+ # Permission is hereby granted, free of charge, to any person obtaining a copy
8
+ # of this software and associated documentation files (the "Software"), to deal
9
+ # in the Software without restriction, including without limitation the rights
10
+ # to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
11
+ # copies of the Software, and to permit persons to whom the Software is
12
+ # furnished to do so, subject to the following conditions:
13
+ #
14
+ # The above copyright notice and this permission notice shall be included in all
15
+ # copies or substantial portions of the Software.
16
+ #
17
+ # THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
18
+ # IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
19
+ # FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
20
+ # AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
21
+ # LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
22
+ # OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
23
+ # SOFTWARE.
24
+ #
25
+ ###############################################################################
26
+ import datetime
27
+ import re
28
+ from typing import Optional
29
+
30
+ from nodescraper.base.regexanalyzer import ErrorRegex, RegexAnalyzer
31
+ from nodescraper.connection.inband import TextFileArtifact
32
+ from nodescraper.enums import EventCategory, EventPriority
33
+ from nodescraper.models import Event, TaskResult
34
+
35
+ from .analyzer_args import DmesgAnalyzerArgs
36
+ from .dmesgdata import DmesgData
37
+
38
+
39
+ class DmesgAnalyzer(RegexAnalyzer[DmesgData, DmesgAnalyzerArgs]):
40
+ """Check dmesg for errors"""
41
+
42
+ DATA_MODEL = DmesgData
43
+
44
+ ERROR_REGEX: list[ErrorRegex] = [
45
+ ErrorRegex(
46
+ regex=re.compile(r"(?:oom_kill_process.*)|(?:Out of memory.*)"),
47
+ message="Out of memory error",
48
+ event_category=EventCategory.SW_DRIVER,
49
+ ),
50
+ ErrorRegex(
51
+ regex=re.compile(r"IO_PAGE_FAULT"),
52
+ message="I/O Page Fault",
53
+ event_category=EventCategory.SW_DRIVER,
54
+ ),
55
+ ErrorRegex(
56
+ regex=re.compile(r"\bkernel panic\b.*", re.IGNORECASE),
57
+ message="Kernel Panic",
58
+ event_category=EventCategory.SW_DRIVER,
59
+ ),
60
+ ErrorRegex(
61
+ regex=re.compile(r"sq_intr"),
62
+ message="SQ Interrupt",
63
+ event_category=EventCategory.SW_DRIVER,
64
+ ),
65
+ ErrorRegex(
66
+ regex=re.compile(r"sram_ecc.*"),
67
+ message="SRAM ECC",
68
+ event_category=EventCategory.SW_DRIVER,
69
+ ),
70
+ ErrorRegex(
71
+ regex=re.compile(r"\[amdgpu\]\] \*ERROR\* hw_init of IP block.*"),
72
+ message="Failed to load driver. IP hardware init error.",
73
+ event_category=EventCategory.SW_DRIVER,
74
+ ),
75
+ ErrorRegex(
76
+ regex=re.compile(r"\[amdgpu\]\] \*ERROR\* sw_init of IP block.*"),
77
+ message="Failed to load driver. IP software init error.",
78
+ event_category=EventCategory.SW_DRIVER,
79
+ ),
80
+ ErrorRegex(
81
+ regex=re.compile(r"sched: RT throttling activated.*"),
82
+ message="Real Time throttling activated",
83
+ event_category=EventCategory.SW_DRIVER,
84
+ ),
85
+ ErrorRegex(
86
+ regex=re.compile(r"rcu_preempt detected stalls.*"),
87
+ message="RCU preempt detected stalls",
88
+ event_category=EventCategory.SW_DRIVER,
89
+ ),
90
+ ErrorRegex(
91
+ regex=re.compile(r"rcu_preempt self-detected stall.*"),
92
+ message="RCU preempt self-detected stall",
93
+ event_category=EventCategory.SW_DRIVER,
94
+ ),
95
+ ErrorRegex(
96
+ regex=re.compile(r"qcm fence wait loop timeout.*"),
97
+ message="QCM fence timeout",
98
+ event_category=EventCategory.SW_DRIVER,
99
+ ),
100
+ ErrorRegex(
101
+ regex=re.compile(r"(?:[\w-]+(?:\[[0-9.]+\])?\s+)?general protection fault[^\n]*"),
102
+ message="General protection fault",
103
+ event_category=EventCategory.SW_DRIVER,
104
+ ),
105
+ ErrorRegex(
106
+ regex=re.compile(
107
+ r"(?:segfault.*in .*\[)|(?:[Ss]egmentation [Ff]ault.*)|(?:[Ss]egfault.*)"
108
+ ),
109
+ message="Segmentation fault",
110
+ event_category=EventCategory.SW_DRIVER,
111
+ ),
112
+ ErrorRegex(
113
+ regex=re.compile(r"amdgpu: Failed to disallow cf state.*"),
114
+ message="Failed to disallow cf state",
115
+ event_category=EventCategory.SW_DRIVER,
116
+ ),
117
+ ErrorRegex(
118
+ regex=re.compile(r"\*ERROR\* Failed to terminate tmr.*"),
119
+ message="Failed to terminate tmr",
120
+ event_category=EventCategory.SW_DRIVER,
121
+ ),
122
+ ErrorRegex(
123
+ regex=re.compile(r"\*ERROR\* suspend of IP block <\w+> failed.*"),
124
+ message="Suspend of IP block failed",
125
+ event_category=EventCategory.SW_DRIVER,
126
+ ),
127
+ ErrorRegex(
128
+ regex=re.compile(
129
+ (
130
+ r"(amdgpu \w{4}:\w{2}:\w{2}\.\w:\s+amdgpu:\s+\[\S+\]\s*(?:retry|no-retry)? page fault[^\n]*)"
131
+ r"(?:\n[^\n]*(amdgpu \w{4}:\w{2}:\w{2}\.\w:\s+amdgpu:[^\n]*))?"
132
+ r"(?:\n[^\n]*(amdgpu \w{4}:\w{2}:\w{2}\.\w:\s+amdgpu:[^\n]*))?"
133
+ r"(?:\n[^\n]*(amdgpu \w{4}:\w{2}:\w{2}\.\w:\s+amdgpu:[^\n]*))?"
134
+ r"(?:\n[^\n]*(amdgpu \w{4}:\w{2}:\w{2}\.\w:\s+amdgpu:[^\n]*))?"
135
+ r"(?:\n[^\n]*(amdgpu \w{4}:\w{2}:\w{2}\.\w:\s+amdgpu:[^\n]*))?"
136
+ r"(?:\n[^\n]*(amdgpu \w{4}:\w{2}:\w{2}\.\w:\s+amdgpu:[^\n]*))?"
137
+ r"(?:\n[^\n]*(amdgpu \w{4}:\w{2}:\w{2}\.\w:\s+amdgpu:[^\n]*))?"
138
+ r"(?:\n[^\n]*(amdgpu \w{4}:\w{2}:\w{2}\.\w:\s+amdgpu:[^\n]*))?"
139
+ r"(?:\n[^\n]*(amdgpu \w{4}:\w{2}:\w{2}\.\w:\s+amdgpu:[^\n]*))?"
140
+ r"(?:\n[^\n]*(amdgpu \w{4}:\w{2}:\w{2}\.\w:\s+amdgpu:[^\n]*))?"
141
+ ),
142
+ re.MULTILINE,
143
+ ),
144
+ message="amdgpu Page Fault",
145
+ event_category=EventCategory.SW_DRIVER,
146
+ ),
147
+ ErrorRegex(
148
+ regex=re.compile((r"page fault for address.*")),
149
+ message="Page Fault",
150
+ event_category=EventCategory.OS,
151
+ ),
152
+ ErrorRegex(
153
+ regex=re.compile(
154
+ r"(?:amdgpu)(.*Fatal error during GPU init)|(Fatal error during GPU init)"
155
+ ),
156
+ message="Fatal error during GPU init",
157
+ event_category=EventCategory.SW_DRIVER,
158
+ ),
159
+ ErrorRegex(
160
+ regex=re.compile(r"(?:pcieport )(.*AER: aer_status.*)|(aer_status.*)"),
161
+ message="PCIe AER Error",
162
+ event_category=EventCategory.SW_DRIVER,
163
+ ),
164
+ ErrorRegex(
165
+ regex=re.compile(r"Failed to read journal file.*"),
166
+ message="Failed to read journal file",
167
+ event_category=EventCategory.OS,
168
+ event_priority=EventPriority.WARNING,
169
+ ),
170
+ ErrorRegex(
171
+ regex=re.compile(r"journal corrupted or uncleanly shut down.*"),
172
+ message="Journal file corrupted or uncleanly shut down",
173
+ event_category=EventCategory.OS,
174
+ event_priority=EventPriority.WARNING,
175
+ ),
176
+ ErrorRegex(
177
+ regex=re.compile(r"ACPI BIOS Error"),
178
+ message="ACPI BIOS Error",
179
+ event_category=EventCategory.BIOS,
180
+ ),
181
+ ErrorRegex(
182
+ regex=re.compile(r"ACPI Error"),
183
+ message="ACPI Error",
184
+ event_category=EventCategory.BIOS,
185
+ event_priority=EventPriority.WARNING,
186
+ ),
187
+ ErrorRegex(
188
+ regex=re.compile(r"EXT4-fs error \(device .*\):"),
189
+ message="Filesystem corrupted!",
190
+ event_category=EventCategory.OS,
191
+ ),
192
+ ErrorRegex(
193
+ regex=re.compile(r"(Buffer I\/O error on dev)(?:ice)? (\w+)"),
194
+ message="Error in buffered IO, check filesystem integrity",
195
+ event_category=EventCategory.IO,
196
+ ),
197
+ ErrorRegex(
198
+ regex=re.compile(
199
+ r"pcieport (\w+:\w+:\w+\.\w+):\s+(\w+):\s+(Slot\(\d+\)):\s+(Card not present)"
200
+ ),
201
+ message="PCIe card no longer present",
202
+ event_category=EventCategory.IO,
203
+ ),
204
+ ErrorRegex(
205
+ regex=re.compile(
206
+ r"pcieport (\w+:\w+:\w+\.\w+):\s+(\w+):\s+(Slot\(\d+\)):\s+(Link Down)"
207
+ ),
208
+ message="PCIe Link Down",
209
+ event_category=EventCategory.IO,
210
+ ),
211
+ ErrorRegex(
212
+ regex=re.compile(
213
+ r"pcieport (\w+:\w+:\w+\.\w+):\s+(\w+):\s+(current common clock configuration is inconsistent, reconfiguring)"
214
+ ),
215
+ message="Mismatched clock configuration between PCIe device and host",
216
+ event_category=EventCategory.IO,
217
+ ),
218
+ ErrorRegex(
219
+ regex=re.compile(
220
+ r"(?:\d{4}-\d+-\d+T\d+:\d+:\d+,\d+[+-]\d+:\d+)?(.* correctable hardware errors detected in total in \w+ block.*)"
221
+ ),
222
+ message="RAS Correctable Error",
223
+ event_category=EventCategory.RAS,
224
+ ),
225
+ ErrorRegex(
226
+ regex=re.compile(
227
+ r"(?:\d{4}-\d+-\d+T\d+:\d+:\d+,\d+[+-]\d+:\d+)?(.* uncorrectable hardware errors detected in \w+ block.*)"
228
+ ),
229
+ message="RAS Uncorrectable Error",
230
+ event_category=EventCategory.RAS,
231
+ ),
232
+ ErrorRegex(
233
+ regex=re.compile(
234
+ r"(?:\d{4}-\d+-\d+T\d+:\d+:\d+,\d+[+-]\d+:\d+)?(.* deferred hardware errors detected in \w+ block.*)"
235
+ ),
236
+ message="RAS Deferred Error",
237
+ event_category=EventCategory.RAS,
238
+ ),
239
+ ErrorRegex(
240
+ regex=re.compile(
241
+ r"((?:\[Hardware Error\]:\s+)?event severity: corrected.*)"
242
+ r"\n.*(\[Hardware Error\]:\s+Error \d+, type: corrected.*)"
243
+ r"\n.*(\[Hardware Error\]:\s+section_type: PCIe error.*)"
244
+ ),
245
+ message="RAS Corrected PCIe Error",
246
+ event_category=EventCategory.RAS,
247
+ ),
248
+ ErrorRegex(
249
+ regex=re.compile(r"(?:\d{4}-\d+-\d+T\d+:\d+:\d+,\d+[+-]\d+:\d+)?(.*GPU reset begin.*)"),
250
+ message="GPU Reset",
251
+ event_category=EventCategory.RAS,
252
+ ),
253
+ ErrorRegex(
254
+ regex=re.compile(
255
+ r"(?:\d{4}-\d+-\d+T\d+:\d+:\d+,\d+[+-]\d+:\d+)?(.*GPU reset(?:\(\d+\))? failed.*)"
256
+ ),
257
+ message="GPU reset failed",
258
+ event_category=EventCategory.RAS,
259
+ ),
260
+ ErrorRegex(
261
+ regex=re.compile(
262
+ (
263
+ r"(Accelerator Check Architecture[^\n]*)"
264
+ r"(?:\n[^\n]*){0,10}?"
265
+ r"(amdgpu[ 0-9a-fA-F:.]+:? [^\n]*entry\[\d+\]\.STATUS=0x[0-9a-fA-F]+)"
266
+ r"(?:\n[^\n]*){0,5}?"
267
+ r"(amdgpu[ 0-9a-fA-F:.]+:? [^\n]*entry\[\d+\]\.ADDR=0x[0-9a-fA-F]+)"
268
+ r"(?:\n[^\n]*){0,5}?"
269
+ r"(amdgpu[ 0-9a-fA-F:.]+:? [^\n]*entry\[\d+\]\.MISC0=0x[0-9a-fA-F]+)"
270
+ r"(?:\n[^\n]*){0,5}?"
271
+ r"(amdgpu[ 0-9a-fA-F:.]+:? [^\n]*entry\[\d+\]\.IPID=0x[0-9a-fA-F]+)"
272
+ r"(?:\n[^\n]*){0,5}?"
273
+ r"(amdgpu[ 0-9a-fA-F:.]+:? [^\n]*entry\[\d+\]\.SYND=0x[0-9a-fA-F]+-?)"
274
+ ),
275
+ re.MULTILINE,
276
+ ),
277
+ message="ACA Error",
278
+ event_category=EventCategory.RAS,
279
+ ),
280
+ ErrorRegex(
281
+ regex=re.compile(
282
+ (
283
+ r"(Accelerator Check Architecture[^\n]*)"
284
+ r"(?:\n[^\n]*){0,10}?"
285
+ r"(amdgpu[ 0-9a-fA-F:.]+:? [^\n]*CONTROL=0x[0-9a-fA-F]+)"
286
+ r"(?:\n[^\n]*){0,5}?"
287
+ r"(amdgpu[ 0-9a-fA-F:.]+:? [^\n]*STATUS=0x[0-9a-fA-F]+)"
288
+ r"(?:\n[^\n]*){0,5}?"
289
+ r"(amdgpu[ 0-9a-fA-F:.]+:? [^\n]*ADDR=0x[0-9a-fA-F]+)"
290
+ r"(?:\n[^\n]*){0,5}?"
291
+ r"(amdgpu[ 0-9a-fA-F:.]+:? [^\n]*MISC=0x[0-9a-fA-F]+)"
292
+ r"(?:\n[^\n]*){0,5}?"
293
+ r"(amdgpu[ 0-9a-fA-F:.]+:? [^\n]*CONFIG=0x[0-9a-fA-F]+)"
294
+ r"(?:\n[^\n]*){0,5}?"
295
+ r"(amdgpu[ 0-9a-fA-F:.]+:? [^\n]*IPID=0x[0-9a-fA-F]+)"
296
+ r"(?:\n[^\n]*){0,5}?"
297
+ r"(amdgpu[ 0-9a-fA-F:.]+:? [^\n]*SYND=0x[0-9a-fA-F]+)"
298
+ r"(?:\n[^\n]*){0,5}?"
299
+ r"(amdgpu[ 0-9a-fA-F:.]+:? [^\n]*DESTAT=0x[0-9a-fA-F]+)"
300
+ r"(?:\n[^\n]*){0,5}?"
301
+ r"(amdgpu[ 0-9a-fA-F:.]+:? [^\n]*DEADDR=0x[0-9a-fA-F]+)"
302
+ r"(?:\n[^\n]*){0,5}?"
303
+ r"(amdgpu[ 0-9a-fA-F:.]+:? [^\n]*CONTROL_MASK=0x[0-9a-fA-F]+)"
304
+ ),
305
+ re.MULTILINE,
306
+ ),
307
+ message="ACA Error",
308
+ event_category=EventCategory.RAS,
309
+ ),
310
+ ErrorRegex(
311
+ regex=re.compile(r"\[Hardware Error\]:.+MC\d+_STATUS.*(?:\n.*){0,5}"),
312
+ message="MCE Error",
313
+ event_category=EventCategory.RAS,
314
+ ),
315
+ ErrorRegex(
316
+ regex=re.compile(
317
+ r"(?:\d{4}-\d+-\d+T\d+:\d+:\d+,\d+[+-]\d+:\d+)? (.*Mode2 reset failed.*)"
318
+ ),
319
+ message="Mode 2 Reset Failed",
320
+ event_category=EventCategory.RAS,
321
+ ),
322
+ ErrorRegex(
323
+ regex=re.compile(
324
+ r"(?:\d{4}-\d+-\d+T\d+:\d+:\d+,\d+[+-]\d+:\d+)?(.*\[Hardware Error\]: Corrected error.*)"
325
+ ),
326
+ message="RAS Corrected Error",
327
+ event_category=EventCategory.RAS,
328
+ ),
329
+ ErrorRegex(
330
+ regex=re.compile(r"x86/cpu: SGX disabled by BIOS"),
331
+ message="SGX Error",
332
+ event_category=EventCategory.BIOS,
333
+ event_priority=EventPriority.WARNING,
334
+ ),
335
+ ErrorRegex(
336
+ regex=re.compile(r"amdgpu \w{4}:\w{2}:\w{2}.\w: amdgpu: WARN: GPU is throttled.*"),
337
+ message="GPU Throttled",
338
+ event_category=EventCategory.SW_DRIVER,
339
+ event_priority=EventPriority.WARNING,
340
+ ),
341
+ ErrorRegex(
342
+ regex=re.compile(
343
+ r"(?:\[[^\]]+\]\s*)?LNetError:.*ko2iblnd:\s*No matching interfaces",
344
+ re.IGNORECASE,
345
+ ),
346
+ message="LNet: ko2iblnd has no matching interfaces",
347
+ event_category=EventCategory.IO,
348
+ event_priority=EventPriority.WARNING,
349
+ ),
350
+ ErrorRegex(
351
+ regex=re.compile(
352
+ r"(?:\[[^\]]+\]\s*)?LNetError:\s*.*Error\s*-?\d+\s+starting up LNI\s+\w+",
353
+ re.IGNORECASE,
354
+ ),
355
+ message="LNet: Error starting up LNI",
356
+ event_category=EventCategory.IO,
357
+ event_priority=EventPriority.WARNING,
358
+ ),
359
+ ErrorRegex(
360
+ regex=re.compile(
361
+ r"LustreError:.*ptlrpc_init_portals\(\).*network initiali[sz]ation failed",
362
+ re.IGNORECASE,
363
+ ),
364
+ message="Lustre: network initialisation failed",
365
+ event_category=EventCategory.IO,
366
+ event_priority=EventPriority.WARNING,
367
+ ),
368
+ ]
369
+
370
@classmethod
def filter_dmesg(
    cls,
    dmesg_content: str,
    analysis_range_start: Optional[datetime.datetime] = None,
    analysis_range_end: Optional[datetime.datetime] = None,
) -> str:
    """Filter a dmesg log down to the lines inside a time window.

    Each line is searched for a timestamp of the form
    ``YYYY-MM-DDTHH:MM:SS,<fraction><+|->HH:MM``. Parsed timestamps are
    converted to UTC before being compared against the range bounds.

    Lines are emitted once a timestamp at or after ``analysis_range_start``
    has been seen (or from the first line when no start is given). Scanning
    stops at the first timestamp at or after ``analysis_range_end``. Lines
    that carry no parseable timestamp are kept whenever they fall after the
    start of the window, so multi-line messages stay attached to the dated
    line that precedes them.

    Args:
        dmesg_content (str): unfiltered dmesg log
        analysis_range_start (Optional[datetime.datetime], optional): inclusive
            lower bound of the window. Defaults to None (no lower bound).
        analysis_range_end (Optional[datetime.datetime], optional): exclusive
            upper bound of the window. Defaults to None (no upper bound).

    Returns:
        str: filtered dmesg log, one newline-terminated entry per kept line
    """
    # Compile once instead of looking the pattern up for every line.
    timestamp_re = re.compile(r"(\d{4}-\d+-\d+T\d+:\d+:\d+),(\d+[+-]\d+:\d+)")

    kept_lines = []
    found_start = analysis_range_start is None
    for line in dmesg_content.splitlines():
        stamp = timestamp_re.search(line)
        when = None
        if stamp is not None:
            try:
                # dmesg separates the fraction with ',', ISO format wants '.'
                when = datetime.datetime.fromisoformat(f"{stamp.group(1)}.{stamp.group(2)}")
            except ValueError:
                # The pattern is looser than ISO 8601 (e.g. it accepts month 13);
                # treat an unparseable timestamp like a line without a date.
                when = None

        if when is not None:
            # compare in UTC
            when = when.astimezone(datetime.timezone.utc)
            # Check the upper bound first and independently of the lower bound,
            # so a line that is past the end is never emitted even when it is
            # also the first line past the start.
            if analysis_range_end is not None and when >= analysis_range_end:
                break
            if not found_start and when >= analysis_range_start:
                found_start = True

        # only keep lines once the starting timestamp has been reached
        if found_start:
            kept_lines.append(f"{line}\n")

    return "".join(kept_lines)
404
+
405
def _is_known_error(self, known_err_events: list[Event], unknown_match: str) -> bool:
    """Decide whether a candidate "unknown" error is already covered by a known one

    The candidate counts as known when any pattern in ``self.ERROR_REGEX``
    matches it, or when it equals, contains, or is contained in the
    ``match_content`` recorded on one of the known events.

    Args:
        known_err_events (list[Event]): events produced by the known regexes
        unknown_match (str): text of the candidate unknown error

    Returns:
        bool: True if the candidate is covered by a known error, False otherwise
    """
    # Pass 1: does any known pattern match the candidate text directly?
    for pattern_entry in self.ERROR_REGEX:
        try:
            hit = pattern_entry.regex.search(unknown_match)
        except re.error:
            continue
        if hit:
            return True

    def _overlaps(recorded_text) -> bool:
        # equal, or either string is a substring of the other
        return (
            unknown_match == recorded_text
            or unknown_match in recorded_text
            or recorded_text in unknown_match
        )

    # Pass 2: compare against the content already captured by known events.
    for known_event in known_err_events:
        recorded = known_event.data["match_content"]
        if isinstance(recorded, str):
            if _overlaps(recorded):
                return True
        elif isinstance(recorded, list):
            if any(_overlaps(entry) for entry in recorded):
                return True

    return False
436
+
437
def analyze_data(
    self,
    data: DmesgData,
    args: Optional[DmesgAnalyzerArgs] = None,
) -> TaskResult:
    """Scan dmesg content for known and, optionally, unknown errors

    The log is first narrowed to the requested time range (when one is given)
    and the narrowed text is attached to the result as ``filtered_dmesg.log``.
    Known error patterns are then evaluated, events in excluded categories are
    dropped, and the rest are added to the result. When unknown-error checking
    is enabled, err/crit/alert/emerg kernel lines that are not already covered
    by a known error are added as well.

    Args:
        data (DmesgData): dmesg data to analyze
        args (Optional[DmesgAnalyzerArgs], optional): dmesg analysis arguments. Defaults to None.

    Returns:
        TaskResult: The result of the analysis containing status and message.
    """
    if not args:
        args = DmesgAnalyzerArgs()

    # Start from the raw log; replace it with the filtered text if a range is set.
    dmesg_content = data.dmesg_content
    if args.analysis_range_start or args.analysis_range_end:
        self.logger.info(
            "Filtering dmesg using range %s - %s",
            args.analysis_range_start,
            args.analysis_range_end,
        )
        dmesg_content = self.filter_dmesg(
            data.dmesg_content,
            args.analysis_range_start,
            args.analysis_range_end,
        )
        filtered_artifact = TextFileArtifact(
            filename="filtered_dmesg.log", contents=dmesg_content
        )
        self.result.artifacts.append(filtered_artifact)

    known_err_events = self.check_all_regexes(
        content=dmesg_content, source="dmesg", error_regex=self.ERROR_REGEX
    )
    if args.exclude_category:
        known_err_events = [
            known for known in known_err_events if known.category not in args.exclude_category
        ]
    self.result.events += known_err_events

    # Nothing more to do unless unknown-error detection was requested.
    if not args.check_unknown_dmesg_errors:
        return self.result

    unknown_error_pattern = ErrorRegex(
        regex=re.compile(
            r"kern :(?:err|crit|alert|emerg)\s+: \d{4}-\d+-\d+T\d+:\d+:\d+,\d+[+-]\d+:\d+ (.*)"
        ),
        message="Unknown dmesg error",
        event_category=EventCategory.UNKNOWN,
        event_priority=EventPriority.WARNING,
    )
    candidate_events = self.check_all_regexes(
        content=dmesg_content,
        source="dmesg",
        error_regex=[unknown_error_pattern],
    )

    # Report only candidates that no known error already accounts for.
    for candidate in candidate_events:
        if self._is_known_error(known_err_events, candidate.data["match_content"]):
            continue
        self.result.events.append(candidate)

    return self.result