wtftools 0.0.0__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- wtftools/__init__.py +55 -0
- wtftools/__main__.py +10 -0
- wtftools/audit.py +809 -0
- wtftools/colors.py +111 -0
- wtftools/config.py +249 -0
- wtftools/cron.py +388 -0
- wtftools/events.py +220 -0
- wtftools/explain.py +290 -0
- wtftools/info.py +90 -0
- wtftools/llm.py +129 -0
- wtftools/main.py +1328 -0
- wtftools/snapshot.py +203 -0
- wtftools/sysinfo.py +1608 -0
- wtftools-0.0.0.data/data/share/bash-completion/completions/wtf.bash-completion +134 -0
- wtftools-0.0.0.dist-info/METADATA +246 -0
- wtftools-0.0.0.dist-info/RECORD +20 -0
- wtftools-0.0.0.dist-info/WHEEL +5 -0
- wtftools-0.0.0.dist-info/entry_points.txt +3 -0
- wtftools-0.0.0.dist-info/licenses/LICENSE +21 -0
- wtftools-0.0.0.dist-info/top_level.txt +1 -0
wtftools/sysinfo.py
ADDED
|
@@ -0,0 +1,1608 @@
|
|
|
1
|
+
#!/usr/bin/env python3
|
|
2
|
+
# -*- coding: utf-8 -*-
|
|
3
|
+
"""System information gathering for wtftools.
|
|
4
|
+
|
|
5
|
+
Pure stdlib first, optional psutil for richer data.
|
|
6
|
+
"""
|
|
7
|
+
|
|
8
|
+
import logging
|
|
9
|
+
import os
|
|
10
|
+
import platform
|
|
11
|
+
import re
|
|
12
|
+
import shutil
|
|
13
|
+
import socket
|
|
14
|
+
import subprocess
|
|
15
|
+
import time
|
|
16
|
+
import traceback
|
|
17
|
+
from typing import Any, Dict, List, Optional, Tuple
|
|
18
|
+
|
|
19
|
+
logger = logging.getLogger(__name__)
|
|
20
|
+
|
|
21
|
+
try:
|
|
22
|
+
import psutil # type: ignore
|
|
23
|
+
|
|
24
|
+
HAS_PSUTIL = True
|
|
25
|
+
except Exception:
|
|
26
|
+
HAS_PSUTIL = False
|
|
27
|
+
|
|
28
|
+
|
|
29
|
+
PROC_MEMINFO = "/proc/meminfo"
|
|
30
|
+
PROC_UPTIME = "/proc/uptime"
|
|
31
|
+
PROC_LOADAVG = "/proc/loadavg"
|
|
32
|
+
PROC_STAT = "/proc/stat"
|
|
33
|
+
PROC_CPUINFO = "/proc/cpuinfo"
|
|
34
|
+
PROC_MOUNTS = "/proc/mounts"
|
|
35
|
+
ETC_OS_RELEASE = "/etc/os-release"
|
|
36
|
+
|
|
37
|
+
|
|
38
|
+
def run(cmd: List[str], timeout: int = 5) -> Tuple[int, str, str]:
    """Execute ``cmd`` (argument list, no shell) and never raise.

    Returns a ``(returncode, stdout, stderr)`` triple. Failures are mapped to
    shell-like codes: 124 when ``timeout`` seconds elapse, 127 when the
    executable cannot be found, and 1 for any other error (in which case the
    third element carries the exception text).
    """
    try:
        completed = subprocess.run(
            cmd,
            capture_output=True,
            text=True,
            timeout=timeout,
            check=False,
        )
    except subprocess.TimeoutExpired:
        return 124, "", "timeout"
    except FileNotFoundError:
        return 127, "", "not found"
    except Exception as exc:
        # Anything else (permissions, bad arguments, ...) is logged at debug
        # level and reported as a generic failure.
        logger.debug(f"{type(exc).__name__}: {exc}\n{traceback.format_exc()}")
        return 1, "", str(exc)
    return completed.returncode, completed.stdout, completed.stderr
|
|
50
|
+
|
|
51
|
+
|
|
52
|
+
def read_file(path: str) -> str:
    """Return the text of ``path`` decoded as UTF-8, or ``""`` on any error.

    Undecodable bytes are replaced instead of raising, so callers always get a
    string back.
    """
    text = ""
    try:
        with open(path, encoding="utf-8", errors="replace") as handle:
            text = handle.read()
    except Exception:
        # Best effort by design: missing or unreadable files read as empty.
        text = ""
    return text
|
|
59
|
+
|
|
60
|
+
|
|
61
|
+
def get_hostname() -> str:
    """Return the host name reported by ``socket.gethostname()``."""
    hostname = socket.gethostname()
    return hostname
|
|
63
|
+
|
|
64
|
+
|
|
65
|
+
def get_os_release() -> Dict[str, str]:
    """Parse /etc/os-release into a ``{KEY: value}`` dict.

    Blank lines, ``#`` comment lines and lines without ``=`` are ignored.
    Surrounding whitespace and surrounding quote characters are removed from
    values. Returns an empty dict when the file is missing or unreadable
    (``read_file`` yields ``""`` in that case).
    """
    data: Dict[str, str] = {}
    for raw_line in read_file(ETC_OS_RELEASE).splitlines():
        line = raw_line.strip()
        # Comment lines may themselves contain "=" (e.g. "# FOO=bar"); without
        # this guard they would be stored under a bogus "# FOO" key.
        if not line or line.startswith("#"):
            continue
        key, sep, value = line.partition("=")
        if not sep:
            continue
        data[key.strip()] = value.strip().strip('"').strip("'")
    return data
|
|
75
|
+
|
|
76
|
+
|
|
77
|
+
def get_kernel() -> str:
    """Return the kernel release string as given by ``platform.release()``."""
    release = platform.release()
    return release
|
|
79
|
+
|
|
80
|
+
|
|
81
|
+
def get_uptime_seconds() -> float:
    """Return the first field of /proc/uptime as a float.

    Yields 0.0 when the file is unreadable, empty, or its first field is not
    numeric.
    """
    fields = read_file(PROC_UPTIME).split()
    if not fields:
        return 0.0
    try:
        return float(fields[0])
    except ValueError:
        return 0.0
|
|
89
|
+
|
|
90
|
+
|
|
91
|
+
def format_duration(seconds: float) -> str:
    """Render a duration in seconds as a compact string such as ``"3d 4h 12m"``.

    Fractional seconds are truncated. Only non-zero day/hour/minute components
    are shown; the seconds component appears only when the duration is shorter
    than one minute (so 0 renders as ``"0s"``). Negative input is clamped to
    zero.
    """
    # Clamp first: divmod() floors toward negative infinity, so a small
    # negative value such as -5 would otherwise render as "-1d 23h 59m".
    total = max(int(seconds), 0)
    days, rem = divmod(total, 86400)
    hours, rem = divmod(rem, 3600)
    minutes, secs = divmod(rem, 60)
    parts = [
        f"{amount}{suffix}"
        for amount, suffix in ((days, "d"), (hours, "h"), (minutes, "m"))
        if amount
    ]
    if not parts:
        parts.append(f"{secs}s")
    return " ".join(parts)
|
|
107
|
+
|
|
108
|
+
|
|
109
|
+
def format_bytes(num_bytes: float) -> str:
    """Render a byte count with a binary-scaled unit suffix (B through EB).

    The value is divided by 1024 until its magnitude drops below 1024 and is
    printed with one decimal place; anything beyond PB is reported in EB.
    """
    units = ("B", "KB", "MB", "GB", "TB", "PB")
    value = num_bytes
    index = 0
    while index < len(units):
        if abs(value) < 1024.0:
            return f"{value:3.1f}{units[index]}"
        value = value / 1024.0
        index += 1
    return f"{value:.1f}EB"
|
|
116
|
+
|
|
117
|
+
|
|
118
|
+
def get_loadavg() -> Tuple[float, float, float]:
    """Return the first three fields of /proc/loadavg as floats.

    Falls back to ``(0.0, 0.0, 0.0)`` when the file is unreadable, has fewer
    than three fields, or a field is not numeric.
    """
    zeros = (0.0, 0.0, 0.0)
    fields = read_file(PROC_LOADAVG).split()
    try:
        # Unpacking fewer than three values also raises ValueError, which
        # covers the short/empty-file case.
        first, second, third = (float(token) for token in fields[:3])
    except ValueError:
        return zeros
    return (first, second, third)
|
|
127
|
+
|
|
128
|
+
|
|
129
|
+
def get_cpu_count() -> int:
    """Return ``os.cpu_count()``, or 1 when it is unknown or fails."""
    try:
        detected = os.cpu_count()
    except Exception:
        return 1
    return detected if detected else 1
|
|
134
|
+
|
|
135
|
+
|
|
136
|
+
def get_cpu_model() -> str:
    """Return the CPU model string.

    Uses the value of the first ``model name`` line in /proc/cpuinfo; when no
    such line exists, falls back to ``platform.processor()`` and finally to
    ``"unknown"``.
    """
    for entry in read_file(PROC_CPUINFO).splitlines():
        if not entry.lower().startswith("model name"):
            continue
        return entry.partition(":")[2].strip()
    return platform.processor() or "unknown"
|
|
143
|
+
|
|
144
|
+
|
|
145
|
+
def get_meminfo() -> Dict[str, int]:
    """Parse /proc/meminfo into ``{field_name: bytes}``.

    Only lines of the form ``Name:  <digits> kB`` are kept; each value is
    multiplied by 1024, so the returned numbers are bytes, not kB.
    """
    pattern = re.compile(r"^(\S+):\s+(\d+)\s*kB")
    parsed: Dict[str, int] = {}
    for entry in read_file(PROC_MEMINFO).splitlines():
        hit = pattern.match(entry)
        if hit is None:
            continue
        name, kilobytes = hit.groups()
        parsed[name] = int(kilobytes) * 1024
    return parsed
|
|
154
|
+
|
|
155
|
+
|
|
156
|
+
def get_memory_summary() -> Dict[str, int]:
    """Return RAM and swap usage figures.

    Keys: ``total``, ``available``, ``used``, ``free``, ``percent``,
    ``swap_total``, ``swap_used``, ``swap_percent``. Sizes are in bytes and
    percentages are integers. psutil is used when available; otherwise the
    numbers are derived from ``get_meminfo()``, where ``used`` is
    ``total - available`` and ``available`` defaults to ``MemFree``.
    """
    if HAS_PSUTIL:
        virtual = psutil.virtual_memory()
        swap = psutil.swap_memory()
        return dict(
            total=virtual.total,
            available=virtual.available,
            used=virtual.used,
            free=virtual.free,
            percent=int(virtual.percent),
            swap_total=swap.total,
            swap_used=swap.used,
            swap_percent=int(swap.percent),
        )

    def _percent(part: int, whole: int) -> int:
        # Guard against division by zero when the total is missing.
        return int(round(100 * part / whole)) if whole else 0

    meminfo = get_meminfo()
    total = meminfo.get("MemTotal", 0)
    free = meminfo.get("MemFree", 0)
    available = meminfo.get("MemAvailable", free)
    used = total - available
    swap_total = meminfo.get("SwapTotal", 0)
    swap_used = swap_total - meminfo.get("SwapFree", 0)
    return {
        "total": total,
        "available": available,
        "used": used,
        "free": free,
        "percent": _percent(used, total),
        "swap_total": swap_total,
        "swap_used": swap_used,
        "swap_percent": _percent(swap_used, swap_total),
    }
|
|
191
|
+
|
|
192
|
+
|
|
193
|
+
def get_mounts() -> List[Dict[str, str]]:
    """Read /proc/mounts and return the non-virtual mounts.

    Each entry is ``{"source", "target", "fstype"}``. Pseudo filesystems
    (proc, sysfs, tmpfs, overlay, squashfs, ...) and anything mounted at
    ``/snap`` or below ``/snap/``, ``/var/lib/docker/`` or ``/var/lib/snapd/``
    are skipped. Only the first entry for a given target is kept. Octal
    escapes in the source and target fields are decoded, so paths containing
    spaces are returned in a form usable with the filesystem APIs.
    """
    skip_fs = {
        "proc",
        "sysfs",
        "devtmpfs",
        "devpts",
        "tmpfs",
        "cgroup",
        "cgroup2",
        "pstore",
        "bpf",
        "tracefs",
        "debugfs",
        "fusectl",
        "configfs",
        "hugetlbfs",
        "mqueue",
        "rpc_pipefs",
        "binfmt_misc",
        "autofs",
        "securityfs",
        "selinuxfs",
        "fuse.gvfsd-fuse",
        "fuse.portal",
        "fuse.snapfuse",
        "nsfs",
        "ramfs",
        "fuse.lxcfs",
        "overlay",
        "squashfs",
    }
    skip_below = ("/snap/", "/var/lib/docker/", "/var/lib/snapd/")

    def _decode(field: str) -> str:
        # The kernel writes space, tab, newline and backslash as \040, \011,
        # \012 and \134. The backslash escape is decoded last so that a
        # literal backslash is never re-interpreted as the start of an escape.
        if "\\" not in field:
            return field
        return (
            field.replace("\\040", " ")
            .replace("\\011", "\t")
            .replace("\\012", "\n")
            .replace("\\134", "\\")
        )

    mounts: List[Dict[str, str]] = []
    seen_targets: set = set()
    for line in read_file(PROC_MOUNTS).splitlines():
        parts = line.split()
        if len(parts) < 3:
            continue
        source, target, fs_type = _decode(parts[0]), _decode(parts[1]), parts[2]
        if fs_type in skip_fs:
            continue
        # Match "/snap" on a path boundary so an unrelated mount such as
        # "/snapshots" is not hidden.
        if target == "/snap" or target.startswith(skip_below):
            continue
        if target in seen_targets:
            continue
        seen_targets.add(target)
        mounts.append({"source": source, "target": target, "fstype": fs_type})
    return mounts
|
|
242
|
+
|
|
243
|
+
|
|
244
|
+
def get_disk_usage(target: str) -> Optional[Dict[str, int]]:
    """Return ``total``/``used``/``free`` bytes and integer ``percent`` used.

    Wraps ``shutil.disk_usage(target)``; returns None when that raises
    ``OSError`` (for example when the path does not exist).
    """
    try:
        total, used, free = shutil.disk_usage(target)
    except OSError:
        return None
    if total:
        percent = int(round(100 * used / total))
    else:
        percent = 0
    return {"total": total, "used": used, "free": free, "percent": percent}
|
|
257
|
+
|
|
258
|
+
|
|
259
|
+
def get_disks() -> List[Dict[str, Any]]:
    """Return one dict per mount from ``get_mounts()`` with usage merged in.

    Each entry has ``target``, ``source``, ``fstype`` plus the keys returned
    by ``get_disk_usage``. Mounts whose usage cannot be read are left out.
    """
    disks: List[Dict[str, Any]] = []
    for mount in get_mounts():
        usage = get_disk_usage(mount["target"])
        if usage is not None:
            entry: Dict[str, Any] = {key: mount[key] for key in ("target", "source", "fstype")}
            entry.update(usage)
            disks.append(entry)
    return disks
|
|
275
|
+
|
|
276
|
+
|
|
277
|
+
def get_top_processes(by: str = "cpu", limit: int = 5) -> List[Dict[str, Any]]:
    """Return the top processes ranked by CPU usage or resident memory.

    Args:
        by: ``"cpu"`` ranks by CPU percent; any other value ranks by RSS.
        limit: Maximum number of entries returned.

    Returns:
        A list of dicts with ``pid``, ``name``, ``user``, ``cpu_percent`` and
        ``rss`` (bytes). With psutil both metrics are filled in. In the ``ps``
        fallback only the ranked metric is known and the other one is 0.
        Returns an empty list when neither psutil nor ``ps`` yields data.
    """
    if HAS_PSUTIL:
        try:
            # Priming pass: the first cpu_percent reading psutil reports for a
            # process is a meaningless 0.0, so touch every process once, wait
            # briefly, then take the real sample. Nothing needs to be kept
            # from this pass.
            for _ in psutil.process_iter(["cpu_percent"]):
                pass
            time.sleep(0.1)
            procs = []
            for proc in psutil.process_iter(["pid", "name", "username", "cpu_percent", "memory_info"]):
                info = proc.info
                rss = info["memory_info"].rss if info.get("memory_info") else 0
                procs.append(
                    {
                        "pid": info.get("pid"),
                        "name": (info.get("name") or "")[:32],
                        "user": (info.get("username") or "")[:16],
                        "cpu_percent": info.get("cpu_percent") or 0.0,
                        "rss": rss,
                    }
                )
            key = "cpu_percent" if by == "cpu" else "rss"
            procs.sort(key=lambda p: p[key], reverse=True)
            return procs[:limit]
        except Exception as exc:
            # Fall through to the ps-based implementation below.
            logger.debug(f"psutil top failed: {exc}")
    # Fallback: ps
    sort = "%cpu" if by == "cpu" else "rss"
    rc, out, _ = run(["ps", "-eo", f"pid,user,{sort},comm", "--sort=-" + sort, "--no-headers"], timeout=5)
    if rc != 0 or not out:
        return []
    result: List[Dict[str, Any]] = []
    for line in out.splitlines():
        parts = line.split(None, 3)
        if len(parts) < 4:
            continue
        pid_s, user, metric, comm = parts
        try:
            pid = int(pid_s)
        except ValueError:
            # Skip rows that do not start with a numeric pid instead of
            # aborting the whole listing.
            continue
        try:
            value = float(metric)
        except ValueError:
            value = 0.0
        if by == "cpu":
            cpu_percent, rss = value, 0
        else:
            cpu_percent, rss = 0.0, int(value) * 1024  # ps rss is in kB
        result.append({"pid": pid, "user": user, "name": comm, "cpu_percent": cpu_percent, "rss": rss})
    # Slice after filtering so malformed rows do not eat into the limit.
    return result[:limit]
|
|
327
|
+
|
|
328
|
+
|
|
329
|
+
def count_zombie_processes() -> int:
    """Count processes whose state is zombie.

    Uses psutil when available; if that is missing or raises, counts the
    ``ps -eo stat`` rows that start with ``Z``. Returns 0 when ``ps`` fails.
    """
    if HAS_PSUTIL:
        try:
            zombies = 0
            for proc in psutil.process_iter(["status"]):
                if proc.info.get("status") == psutil.STATUS_ZOMBIE:
                    zombies += 1
            return zombies
        except Exception:
            pass
    rc, out, _ = run(["ps", "-eo", "stat", "--no-headers"], timeout=5)
    if rc != 0:
        return 0
    return len([row for row in out.splitlines() if row.strip().startswith("Z")])
|
|
340
|
+
|
|
341
|
+
|
|
342
|
+
def get_failed_units() -> List[str]:
    """Return the unit names printed by ``systemctl --failed``.

    The first whitespace-separated token of every non-blank output line is
    taken as the unit name; a token equal to ``"0"`` is ignored. Returns an
    empty list when systemctl exits non-zero.
    """
    rc, out, _ = run(["systemctl", "--failed", "--no-legend", "--plain", "--no-pager"], timeout=5)
    if rc != 0:
        return []
    failed: List[str] = []
    for row in out.splitlines():
        tokens = row.split()
        if not tokens:
            continue
        unit = tokens[0]
        if unit != "0":
            failed.append(unit)
    return failed
|
|
356
|
+
|
|
357
|
+
|
|
358
|
+
def get_system_running_state() -> Optional[str]:
    """Return the output of ``systemctl is-system-running`` (e.g. running, degraded).

    Returns None when the command is missing (127), timed out (124), exited
    with a negative code, or printed nothing. Other non-zero exit codes are
    accepted and the printed state is still returned.
    """
    rc, out, _ = run(["systemctl", "is-system-running"], timeout=5)
    unavailable = rc < 0 or rc in (124, 127)
    if unavailable:
        return None
    state = out.strip()
    return state if state else None
|
|
365
|
+
|
|
366
|
+
|
|
367
|
+
def get_enabled_inactive_units(limit: int = 200) -> List[Dict[str, str]]:
    """List enabled ``.service`` units that are not running.

    The first ``limit`` enabled unit files are queried with ``systemctl show``
    and every non-oneshot unit whose ActiveState is not
    active/activating/reloading is reported as ``{name, state, sub, result}``.
    Returns an empty list when systemctl fails, prints nothing, or every
    enabled service is running.
    """
    rc, out, _ = run(
        ["systemctl", "list-unit-files", "--type=service", "--state=enabled", "--no-legend", "--no-pager"],
        timeout=8,
    )
    if rc != 0 or not out:
        return []
    tokenised = (row.split() for row in out.splitlines())
    enabled: List[str] = [tokens[0] for tokens in tokenised if tokens][:limit]
    if not enabled:
        return []

    rc, out, _ = run(
        ["systemctl", "show", "--property=Id,ActiveState,SubState,Type,Result"] + enabled,
        timeout=10,
    )
    if rc != 0 or not out:
        return []

    inactive: List[Dict[str, str]] = []
    record: Dict[str, str] = {}
    # `systemctl show` separates units with blank lines; the trailing ""
    # sentinel flushes the last record.
    for row in out.splitlines() + [""]:
        if row.strip():
            key, _, value = row.partition("=")
            record[key.strip()] = value.strip()
            continue
        if record:
            _maybe_collect_inactive(record, inactive)
            record = {}
    return inactive
|
|
408
|
+
|
|
409
|
+
|
|
410
|
+
def _maybe_collect_inactive(unit: Dict[str, str], out: List[Dict[str, str]]) -> None:
|
|
411
|
+
"""Append unit to `out` if it is enabled-but-not-running."""
|
|
412
|
+
if unit.get("Type") in ("oneshot",):
|
|
413
|
+
return
|
|
414
|
+
state = unit.get("ActiveState")
|
|
415
|
+
if state in ("active", "activating", "reloading"):
|
|
416
|
+
return
|
|
417
|
+
name = unit.get("Id")
|
|
418
|
+
if not name:
|
|
419
|
+
return
|
|
420
|
+
out.append(
|
|
421
|
+
{
|
|
422
|
+
"name": name,
|
|
423
|
+
"state": state or "unknown",
|
|
424
|
+
"sub": unit.get("SubState", ""),
|
|
425
|
+
"result": unit.get("Result", ""),
|
|
426
|
+
}
|
|
427
|
+
)
|
|
428
|
+
|
|
429
|
+
|
|
430
|
+
def get_reboot_required() -> Optional[str]:
    """Return a reboot-required message, or None when no marker file exists.

    Checks ``/var/run/reboot-required``. When the companion ``.pkgs`` file has
    content, the message includes the number of lines in it and the first
    three of them.
    """
    marker = "/var/run/reboot-required"
    if not os.path.exists(marker):
        return None
    pkg_lines = read_file("/var/run/reboot-required.pkgs").strip().splitlines()
    if not pkg_lines:
        return "reboot required"
    return "reboot required ({} pkg(s)): {}".format(len(pkg_lines), ", ".join(pkg_lines[:3]))
|
|
440
|
+
|
|
441
|
+
|
|
442
|
+
def get_time_sync_status() -> Dict[str, Any]:
    """Report clock synchronisation state from ``timedatectl show``.

    Returns ``{synchronized, ntp_active, source}``. The two flags are booleans
    (property value equal to ``"yes"``) on success and None when timedatectl
    is missing or fails; ``source`` says which of those cases applied.
    """

    def _unknown(source: str) -> Dict[str, Any]:
        return {"synchronized": None, "ntp_active": None, "source": source}

    rc, out, _ = run(
        ["timedatectl", "show", "-p", "NTPSynchronized", "-p", "NTP", "-p", "CanNTP"],
        timeout=5,
    )
    if rc == 127:
        return _unknown("timedatectl unavailable")
    if rc != 0 or not out:
        return _unknown("timedatectl error")
    pairs = (row.partition("=") for row in out.splitlines())
    fields: Dict[str, str] = {key.strip(): value.strip() for key, _, value in pairs}
    return {
        "synchronized": fields.get("NTPSynchronized") == "yes",
        "ntp_active": fields.get("NTP") == "yes",
        "source": "timedatectl",
    }
|
|
461
|
+
|
|
462
|
+
|
|
463
|
+
def get_readonly_mounts() -> List[str]:
    """Return mounts that are read-only where that is unexpected.

    Reads /proc/mounts and reports ``"<target> (<fstype>)"`` for every entry
    whose option list contains ``ro``. Filesystem types that are read-only by
    nature (squashfs, iso9660, udf) are skipped, as is anything mounted at or
    below /snap, /proc, /sys, /dev, /run, /var/lib/docker or /var/lib/snapd.
    """
    expected_ro = {"squashfs", "iso9660", "udf"}
    skip_targets = ("/snap", "/proc", "/sys", "/dev", "/run", "/var/lib/docker", "/var/lib/snapd")

    def _is_under(path: str, root: str) -> bool:
        # Compare on a path boundary: a bare startswith("/sys") would also
        # hide unrelated mounts such as /sysroot or /system.
        return path == root or path.startswith(root + "/")

    result: List[str] = []
    for line in read_file(PROC_MOUNTS).splitlines():
        parts = line.split()
        if len(parts) < 4:
            continue
        target, fs_type, opts = parts[1], parts[2], parts[3]
        if fs_type in expected_ro:
            continue
        if any(_is_under(target, root) for root in skip_targets):
            continue
        if "ro" in opts.split(","):
            result.append(f"{target} ({fs_type})")
    return result
|
|
482
|
+
|
|
483
|
+
|
|
484
|
+
def get_stuck_processes() -> List[Dict[str, Any]]:
    """Return ``{pid, name}`` for processes in uninterruptible (D) sleep.

    Uses psutil's ``STATUS_DISK_SLEEP`` when available; if psutil is missing
    or raises, parses ``ps -eo pid,stat,comm`` and keeps the rows whose state
    starts with ``D``. Returns an empty list when ``ps`` fails.
    """
    if HAS_PSUTIL:
        try:
            return [
                {"pid": proc.info["pid"], "name": proc.info.get("name") or ""}
                for proc in psutil.process_iter(["pid", "name", "status"])
                if proc.info.get("status") == psutil.STATUS_DISK_SLEEP
            ]
        except Exception:
            pass
    rc, out, _ = run(["ps", "-eo", "pid,stat,comm", "--no-headers"], timeout=5)
    if rc != 0:
        return []
    found: List[Dict[str, Any]] = []
    for row in out.splitlines():
        fields = row.split(None, 2)
        if len(fields) != 3:
            continue
        pid_text, state, command = fields
        if not state.startswith("D"):
            continue
        try:
            found.append({"pid": int(pid_text), "name": command})
        except ValueError:
            continue
    return found
|
|
510
|
+
|
|
511
|
+
|
|
512
|
+
def get_iowait_percent(sample_seconds: float = 0.3) -> Optional[float]:
    """Sample the aggregate ``cpu`` line of /proc/stat twice and return iowait %.

    Args:
        sample_seconds: Time to sleep between the two samples.

    Returns:
        The share of CPU time spent in iowait during the interval, rounded to
        two decimals; 0.0 when no time elapsed between samples; None when
        /proc/stat cannot be read or parsed.
    """

    def _snapshot() -> Optional[List[int]]:
        lines = read_file(PROC_STAT).splitlines()
        if not lines:
            return None
        parts = lines[0].split()
        # Check the length before indexing so a blank first line cannot raise.
        if len(parts) < 6 or parts[0] != "cpu":
            return None
        try:
            # Fields: user nice system idle iowait irq softirq steal.
            # guest and guest_nice (the next two columns) are deliberately
            # left out: the kernel already accounts them inside user/nice, so
            # adding them would inflate the total and understate iowait.
            return [int(p) for p in parts[1:9]]
        except ValueError:
            return None

    first = _snapshot()
    if first is None:
        return None
    time.sleep(sample_seconds)
    second = _snapshot()
    if second is None:
        return None
    deltas = [after - before for before, after in zip(first, second)]
    total = sum(deltas)
    if total <= 0:
        return 0.0
    iowait = deltas[4] if len(deltas) > 4 else 0
    return round(100.0 * iowait / total, 2)
|
|
540
|
+
|
|
541
|
+
|
|
542
|
+
def get_service_restart_counts(threshold: int = 3, limit: int = 200) -> List[Dict[str, Any]]:
    """Return active ``.service`` units whose NRestarts is at least ``threshold``.

    The first ``limit`` active services are queried with ``systemctl show``.
    The result is a list of ``{name, restarts}`` sorted by restart count,
    highest first. Returns an empty list when systemctl fails or prints
    nothing.
    """
    rc, out, _ = run(
        ["systemctl", "list-units", "--type=service", "--state=active", "--no-legend", "--no-pager", "--plain"],
        timeout=8,
    )
    if rc != 0 or not out:
        return []
    tokenised = (row.split() for row in out.splitlines())
    units: List[str] = [tokens[0] for tokens in tokenised if tokens and tokens[0].endswith(".service")][:limit]
    if not units:
        return []
    rc, out, _ = run(["systemctl", "show", "-p", "Id", "-p", "NRestarts"] + units, timeout=10)
    if rc != 0 or not out:
        return []
    collected: List[Dict[str, Any]] = []
    record: Dict[str, str] = {}
    # Units are separated by blank lines; the trailing "" sentinel flushes
    # the final record.
    for row in out.splitlines() + [""]:
        if row.strip():
            key, _, value = row.partition("=")
            record[key.strip()] = value.strip()
            continue
        _maybe_collect_restarts(record, threshold, collected)
        record = {}
    return sorted(collected, key=lambda entry: entry["restarts"], reverse=True)
|
|
578
|
+
|
|
579
|
+
|
|
580
|
+
def _maybe_collect_restarts(unit: Dict[str, str], threshold: int, out: List[Dict[str, Any]]) -> None:
|
|
581
|
+
try:
|
|
582
|
+
n = int(unit.get("NRestarts", "0"))
|
|
583
|
+
except ValueError:
|
|
584
|
+
return
|
|
585
|
+
name = unit.get("Id")
|
|
586
|
+
if not name or n < threshold:
|
|
587
|
+
return
|
|
588
|
+
out.append({"name": name, "restarts": n})
|
|
589
|
+
|
|
590
|
+
|
|
591
|
+
def get_network_errors() -> List[Dict[str, Any]]:
    """Collect error/drop counters from ``/sys/class/net/<iface>/statistics``.

    ``lo`` and interfaces rejected by ``_is_noisy_iface`` are skipped. Only
    interfaces whose rx/tx error and drop counters sum to more than zero are
    returned, each as ``{iface, rx_errors, tx_errors, rx_dropped, tx_dropped,
    total}``. Unreadable or non-numeric counters count as 0.
    """
    base = "/sys/class/net"
    if not os.path.isdir(base):
        return []
    try:
        ifaces = sorted(os.listdir(base))
    except OSError:
        return []

    counter_names = ("rx_errors", "tx_errors", "rx_dropped", "tx_dropped")

    def _counter(directory: str, name: str) -> int:
        # Takes the directory explicitly, so no loop variable is captured.
        try:
            return int(read_file(os.path.join(directory, name)).strip() or "0")
        except (ValueError, OSError):
            return 0

    results: List[Dict[str, Any]] = []
    for iface in ifaces:
        if iface == "lo" or _is_noisy_iface(iface):
            continue
        stats_dir = os.path.join(base, iface, "statistics")
        if not os.path.isdir(stats_dir):
            continue
        counters = {name: _counter(stats_dir, name) for name in counter_names}
        total = sum(counters.values())
        if total <= 0:
            continue
        results.append({"iface": iface, **counters, "total": total})
    return results
|
|
637
|
+
|
|
638
|
+
|
|
639
|
+
def probe_http(url: str, timeout: float = 3.0) -> Dict[str, Any]:
    """HEAD-probe a URL. Returns ``{url, status_code, latency_ms, error}``.

    Uses stdlib ``http.client`` to avoid a dependency on requests. Redirects
    are not followed: ``status_code`` is the first response code received. On
    any failure, including a malformed URL, ``status_code`` and ``latency_ms``
    are None and ``error`` describes the problem; this function does not
    raise for bad input.

    Args:
        url: An ``http://`` or ``https://`` URL. A URL without a host probes
            ``localhost``.
        timeout: Connection timeout in seconds.
    """
    from urllib.parse import urlparse

    def _failure(message: str) -> Dict[str, Any]:
        return {"url": url, "status_code": None, "latency_ms": None, "error": message}

    try:
        parsed = urlparse(url)
        if parsed.scheme not in ("http", "https"):
            return _failure(f"unsupported scheme: {parsed.scheme}")
        host = parsed.hostname or "localhost"
        # `.port` raises ValueError for a non-numeric or out-of-range port,
        # and urlparse itself raises it for malformed IPv6 literals; both
        # must become an error result rather than escape to the caller.
        port = parsed.port or (443 if parsed.scheme == "https" else 80)
    except ValueError as exc:
        return _failure(f"{type(exc).__name__}: {exc}")
    path = parsed.path or "/"
    if parsed.query:
        path += f"?{parsed.query}"

    started = time.monotonic()
    try:
        import http.client

        if parsed.scheme == "https":
            conn = http.client.HTTPSConnection(host, port, timeout=timeout)
        else:
            conn = http.client.HTTPConnection(host, port, timeout=timeout)
        try:
            conn.request("HEAD", path)
            resp = conn.getresponse()
            latency = (time.monotonic() - started) * 1000.0
            return {"url": url, "status_code": resp.status, "latency_ms": round(latency, 1), "error": None}
        finally:
            conn.close()
    except Exception as exc:
        return _failure(f"{type(exc).__name__}: {exc}")
|
|
673
|
+
|
|
674
|
+
|
|
675
|
+
def probe_tcp(target: str, timeout: float = 3.0) -> Dict[str, Any]:
    """Open and immediately close a TCP connection to ``host:port``.

    Returns ``{target, latency_ms, error}``. ``latency_ms`` is the rounded
    time taken to connect and close; on failure it is None and ``error``
    explains why. The port is whatever follows the last ``:`` and square
    brackets around the host are stripped.
    """
    import socket as _socket

    def _result(latency_ms: Optional[float], error: Optional[str]) -> Dict[str, Any]:
        return {"target": target, "latency_ms": latency_ms, "error": error}

    host, separator, port_s = target.rpartition(":")
    if not separator:
        return _result(None, "expected host:port")
    try:
        port = int(port_s)
    except ValueError:
        return _result(None, f"invalid port: {port_s}")
    started = time.monotonic()
    try:
        connection = _socket.create_connection((host.strip("[]"), port), timeout=timeout)
        connection.close()
        elapsed_ms = (time.monotonic() - started) * 1000.0
    except Exception as exc:
        return _result(None, f"{type(exc).__name__}: {exc}")
    return _result(round(elapsed_ms, 1), None)
|
|
694
|
+
|
|
695
|
+
|
|
696
|
+
def get_block_devices() -> List[str]:
    """Return ``/dev/<name>`` for every whole disk that ``lsblk -dn`` lists.

    Rows whose type is not ``disk`` and devices whose name starts with
    ``loop`` are excluded. Returns an empty list when lsblk is missing or
    fails.
    """
    import shutil

    if shutil.which("lsblk") is None:
        return []
    rc, out, _ = run(["lsblk", "-dn", "-o", "NAME,TYPE"], timeout=5)
    if rc != 0:
        return []
    rows = (row.split() for row in out.splitlines())
    return [
        f"/dev/{fields[0]}"
        for fields in rows
        if len(fields) >= 2 and fields[1] == "disk" and not fields[0].startswith("loop")
    ]
|
|
711
|
+
|
|
712
|
+
|
|
713
|
+
def get_smart_status() -> Optional[List[Dict[str, Any]]]:
    """Per-device SMART health, from ``smartctl -H -j``.

    Returns None when smartctl is unavailable. An empty list means smartctl is
    present but no block devices were found or accessible. Otherwise each
    entry is ``{device, passed, exit_code, message}``: ``passed`` is a bool,
    or None when smartctl succeeded without reporting a verdict, and
    ``message`` joins any diagnostic messages smartctl emitted.
    """
    import json as _json
    import shutil

    if not shutil.which("smartctl"):
        return None
    devices = get_block_devices()
    if not devices:
        return []

    results: List[Dict[str, Any]] = []
    for dev in devices:
        rc, out, _ = run(["smartctl", "-H", "-j", dev], timeout=8)
        # smartctl exit codes are a bitfield: 0=ok, bit 0 (1) = cmd parse fail,
        # bit 1 (2) = device open failed, bit 2 (4) = some SMART command failed.
        # We try to parse JSON even on non-zero rc, since smartctl emits JSON anyway.
        try:
            data = _json.loads(out)
        except ValueError:  # json.JSONDecodeError is a ValueError subclass
            continue
        if not isinstance(data, dict):
            continue
        smart = data.get("smart_status")
        passed = smart.get("passed") if isinstance(smart, dict) else None
        # smartctl nests its diagnostics under "smartctl" -> "messages"; a
        # top-level "messages" list is also honoured in case it is present.
        msgs: List[Any] = []
        header = data.get("smartctl")
        if isinstance(header, dict):
            msgs.extend(header.get("messages") or [])
        msgs.extend(data.get("messages") or [])
        msg_text = "; ".join(m["string"] for m in msgs if isinstance(m, dict) and m.get("string"))
        if passed is None and rc != 0:
            # device probably doesn't support SMART; skip silently
            continue
        results.append(
            {
                "device": dev,
                "passed": bool(passed) if passed is not None else None,
                "exit_code": rc,
                "message": msg_text,
            }
        )
    return results
|
|
754
|
+
|
|
755
|
+
|
|
756
|
+
def get_temperatures() -> List[Dict[str, Any]]:
    """Read temperatures from ``/sys/class/hwmon/*/temp*_input``.

    Raw values are divided by 1000 to obtain degrees Celsius. Returns a list
    of ``{sensor, label, celsius}`` where ``sensor`` is the content of the
    hwmon ``name`` file (or the directory name) and ``label`` comes from the
    matching ``temp<N>_label`` file when it exists, else ``temp<N>``. Readings
    below -50 or above 200 degrees are treated as broken and dropped.
    """
    base = "/sys/class/hwmon"
    if not os.path.isdir(base):
        return []
    try:
        hwmons = sorted(os.listdir(base))
    except OSError:
        return []
    input_pattern = re.compile(r"^temp(\d+)_input$")
    readings: List[Dict[str, Any]] = []
    for hwmon in hwmons:
        hwmon_dir = os.path.join(base, hwmon)
        sensor_name = read_file(os.path.join(hwmon_dir, "name")).strip() or hwmon
        try:
            entries = sorted(os.listdir(hwmon_dir))
        except OSError:
            continue
        for entry in entries:
            hit = input_pattern.match(entry)
            if hit is None:
                continue
            idx = hit.group(1)
            raw_value = read_file(os.path.join(hwmon_dir, entry)).strip()
            try:
                # An empty string also raises ValueError and is skipped.
                celsius = int(raw_value) / 1000.0
            except ValueError:
                continue
            if not (-50 <= celsius <= 200):
                continue
            label_path = os.path.join(hwmon_dir, f"temp{idx}_label")
            if os.path.exists(label_path):
                label = read_file(label_path).strip()
            else:
                label = f"temp{idx}"
            readings.append({"sensor": sensor_name, "label": label, "celsius": round(celsius, 1)})
    return readings
|
|
804
|
+
|
|
805
|
+
|
|
806
|
+
def resolve_hostname(host: str, timeout: float = 2.0) -> Optional[float]:
    """Resolve ``host`` via the system resolver. Returns ms elapsed, or None.

    The lookup runs in a daemon worker thread that is waited on for at most
    ``timeout`` seconds, so a broken resolver cannot hang the caller. No
    third-party resolver is used on purpose: the point is to test whatever
    DNS the host itself is configured to use.

    Returns None when the name does not resolve, is not a valid host name, or
    the lookup does not finish within ``timeout``.
    """
    import socket
    import threading

    outcome: Dict[str, float] = {}

    def _lookup() -> None:
        began = time.monotonic()
        try:
            socket.gethostbyname(host)
        except (OSError, UnicodeError):
            # OSError covers gaierror/timeout; UnicodeError is raised for
            # names that cannot be IDNA-encoded (e.g. an empty label).
            return
        outcome["ms"] = (time.monotonic() - began) * 1000.0

    # socket.setdefaulttimeout() only affects newly created socket objects;
    # it does not bound gethostbyname(), and changing it alters process-wide
    # state. A joined worker thread gives a real deadline. If the deadline
    # passes, the daemon thread is abandoned and ends when the resolver
    # returns.
    worker = threading.Thread(target=_lookup, name="wtftools-resolve", daemon=True)
    worker.start()
    worker.join(timeout)
    return outcome.get("ms")
|
|
825
|
+
|
|
826
|
+
|
|
827
|
+
def get_fail2ban_jails() -> Optional[List[Dict[str, Any]]]:
    """Report ban counters for every active fail2ban jail.

    Returns a list of {"name", "banned", "total"} dicts. None means
    fail2ban-client is not installed or `fail2ban-client status` failed or
    printed nothing. An empty list means the status call worked but listed
    no jails. Jails whose individual status query fails are left out.
    """
    import shutil

    if shutil.which("fail2ban-client") is None:
        return None

    def _number_after_colon(text: str) -> int:
        # Counter lines look like "Currently banned: 3"; anything that is not
        # an integer is reported as 0.
        try:
            return int(text.partition(":")[2].strip())
        except (ValueError, IndexError):
            return 0

    status_rc, status_out, _ = run(["fail2ban-client", "status"], timeout=5)
    if status_rc != 0 or not status_out:
        return None

    # Example line: `|- Jail list: sshd, recidive` (only the first one counts).
    summary = next((row for row in status_out.splitlines() if "Jail list" in row), None)
    if summary is None:
        return []
    jail_names = [piece.strip() for piece in summary.partition(":")[2].split(",") if piece.strip()]
    if not jail_names:
        return []

    report: List[Dict[str, Any]] = []
    for jail in jail_names:
        jail_rc, jail_out, _ = run(["fail2ban-client", "status", jail], timeout=5)
        if jail_rc != 0:
            continue
        counters = {"Currently banned:": 0, "Total banned:": 0}
        for row in jail_out.splitlines():
            # Drop the tree decoration (`|-`, `` `- ``) before matching keys.
            text = row.lstrip(" |`-\t")
            for prefix in counters:
                if text.startswith(prefix):
                    counters[prefix] = _number_after_colon(text)
                    break
        report.append(
            {
                "name": jail,
                "banned": counters["Currently banned:"],
                "total": counters["Total banned:"],
            }
        )
    return report
|
|
872
|
+
|
|
873
|
+
|
|
874
|
+
def get_chrony_offset() -> Optional[float]:
    """Absolute system-clock offset in seconds, from `chronyc tracking`.

    Returns None when chronyc is not installed, the command fails or prints
    nothing, or no "System time" line carries a parsable number.
    """
    import shutil

    if shutil.which("chronyc") is None:
        return None
    status, text, _ = run(["chronyc", "tracking"], timeout=5)
    if status != 0 or not text:
        return None

    # Line example:
    #   "System time     : 0.000123456 seconds slow of NTP time"
    seconds_re = re.compile(r"([+-]?\d+(?:\.\d+)?)\s+seconds")
    for row in text.splitlines():
        found = seconds_re.search(row) if "System time" in row else None
        if found is None:
            continue
        try:
            return abs(float(found.group(1)))
        except ValueError:
            return None
    return None
|
|
898
|
+
|
|
899
|
+
|
|
900
|
+
def get_docker_problem_containers() -> Optional[List[Dict[str, Any]]]:
    """Find containers that report unhealthy or are stuck restarting.

    None means the docker binary is missing or `docker ps` failed. An empty
    list means the listing worked and nothing was flagged. Each entry has the
    keys "name", "state", "status" and "problem".
    """
    import shutil

    if shutil.which("docker") is None:
        return None
    listing_cmd = ["docker", "ps", "-a", "--format", "{{.Names}}\t{{.Status}}\t{{.State}}"]
    code, listing, _ = run(listing_cmd, timeout=8)
    if code != 0:
        return None

    flagged: List[Dict[str, Any]] = []
    for row in listing.splitlines():
        fields = row.split("\t")
        if len(fields) < 3:
            continue
        name, status, state = fields[:3]
        if "unhealthy" in status.lower():
            verdict = "unhealthy"
        elif state.lower() == "restarting":
            verdict = "restarting"
        else:
            # Exited containers are often intentional (init/one-shot jobs);
            # telling the bad ones apart needs `docker inspect`, so they are
            # not reported for now.
            continue
        flagged.append({"name": name, "state": state, "status": status, "problem": verdict})
    return flagged
|
|
943
|
+
|
|
944
|
+
|
|
945
|
+
def get_conntrack_usage() -> Optional[Tuple[int, int]]:
    """Return (current_entries, max_entries) of the netfilter conntrack table.

    Two known /proc locations are tried in order; the first pair where both
    files yield integers wins. Returns None when neither pair is usable.
    """
    locations = (
        ("/proc/sys/net/netfilter/nf_conntrack_count", "/proc/sys/net/netfilter/nf_conntrack_max"),
        ("/proc/sys/net/nf_conntrack_count", "/proc/sys/net/nf_conntrack_max"),
    )
    for current_file, ceiling_file in locations:
        current_text = read_file(current_file).strip()
        ceiling_text = read_file(ceiling_file).strip()
        if current_text and ceiling_text:
            try:
                return int(current_text), int(ceiling_text)
            except ValueError:
                pass  # malformed content: fall through to the next location
    return None
|
|
964
|
+
|
|
965
|
+
|
|
966
|
+
def get_top_paths_in(directory: str, limit: int = 10) -> List[Dict[str, Any]]:
    """Return top-N largest immediate subdirectories of `directory` (du -d1).

    Used by `wtf explain --deep` to surface "who's eating the disk" when a
    disk-fill warning fires. Bounded by 15s — on huge trees `du` can run long.

    Args:
        directory: Directory whose direct children are measured.
        limit: Maximum number of entries to return.

    Returns:
        A list of {"path", "bytes"} dicts sorted by size, largest first.
        Empty when `du` is missing, `directory` is not a directory, or `du`
        produced no output.
    """
    import shutil

    if not shutil.which("du") or not os.path.isdir(directory):
        return []
    # --block-size=1 → bytes, -d1 → only direct children.
    _, out, _ = run(["du", "-d1", "--block-size=1", directory], timeout=15)
    # The exit status is deliberately ignored: du exits non-zero as soon as a
    # single subdirectory is unreadable, yet still prints sizes for everything
    # it could read. Lines are validated one by one below.
    if not out:
        return []
    results: List[Dict[str, Any]] = []
    for line in out.splitlines():
        # Split on the first tab only so a path containing a tab survives.
        size_text, sep, path = line.partition("\t")
        if not sep:
            continue
        try:
            size = int(size_text)
        except ValueError:
            continue
        if path == directory:
            continue  # skip the directory itself (du -d1 emits it)
        results.append({"path": path, "bytes": size})
    results.sort(key=lambda r: r["bytes"], reverse=True)
    return results[:limit]
|
|
995
|
+
|
|
996
|
+
|
|
997
|
+
def get_largest_files(directory: str, limit: int = 5, min_size_mb: int = 100) -> List[Dict[str, Any]]:
    """Find regular files under `directory` larger than min_size_mb.

    The search stays on one filesystem (`-xdev`) and is bounded by 20s.

    Args:
        directory: Root of the search.
        limit: Maximum number of entries to return.
        min_size_mb: Size threshold passed to find as `-size +<N>M`.

    Returns:
        A list of {"path", "bytes"} dicts sorted by size, largest first.
        Empty when `find` is missing, `directory` is not a directory, or
        nothing was printed.
    """
    import shutil

    if not shutil.which("find") or not os.path.isdir(directory):
        return []
    _, out, _ = run(
        ["find", directory, "-xdev", "-type", "f", "-size", f"+{min_size_mb}M", "-printf", "%s\t%p\n"],
        timeout=20,
    )
    # The exit status is deliberately ignored: find exits non-zero when it
    # meets any unreadable directory, but the matches it already printed are
    # still valid. Lines are validated one by one below.
    if not out:
        return []
    results: List[Dict[str, Any]] = []
    for line in out.splitlines():
        size_text, sep, path = line.partition("\t")
        if not sep:
            continue
        try:
            size = int(size_text)
        except ValueError:
            continue
        results.append({"path": path, "bytes": size})
    results.sort(key=lambda r: r["bytes"], reverse=True)
    return results[:limit]
|
|
1021
|
+
|
|
1022
|
+
|
|
1023
|
+
def get_docker_disk_usage() -> Optional[List[Dict[str, str]]]:
    """Parse `docker system df` into rows.

    Each row has the keys "type", "count", "size" and "reclaimable", all kept
    as the strings docker printed. Returns None when the docker binary is
    missing, the command fails, or it prints nothing.
    """
    import shutil

    if shutil.which("docker") is None:
        return None
    template = "{{.Type}}\t{{.TotalCount}}\t{{.Size}}\t{{.Reclaimable}}"
    code, text, _ = run(["docker", "system", "df", "--format", template], timeout=8)
    if code != 0 or not text:
        return None

    columns = ("type", "count", "size", "reclaimable")
    table: List[Dict[str, str]] = []
    for row in text.splitlines():
        cells = row.split("\t")
        if len(cells) < len(columns):
            continue
        # zip stops at the four named columns; extra cells are ignored.
        table.append(dict(zip(columns, cells)))
    return table
|
|
1048
|
+
|
|
1049
|
+
|
|
1050
|
+
def get_docker_container_sizes(limit: int = 10) -> Optional[List[Dict[str, str]]]:
    """Per-container size breakdown (`docker ps -as`). None if docker missing.

    Args:
        limit: Maximum number of rows to return (in docker's listing order).

    Returns:
        A list of {"name", "size", "image", "status"} dicts with the strings
        docker printed. None when the docker binary is missing or the command
        fails. An empty list when the command succeeded but listed no
        containers.
    """
    import shutil

    if not shutil.which("docker"):
        return None
    rc, out, _ = run(
        ["docker", "ps", "-as", "--format", "{{.Names}}\t{{.Size}}\t{{.Image}}\t{{.Status}}"],
        timeout=8,
    )
    # Only a failing command means "docker unavailable". Empty output with
    # rc == 0 simply means there are no containers and must yield [] so that
    # callers can tell the two situations apart.
    if rc != 0:
        return None
    rows: List[Dict[str, str]] = []
    for line in (out or "").splitlines():
        parts = line.split("\t")
        if len(parts) >= 4:
            rows.append(
                {
                    "name": parts[0],
                    "size": parts[1],
                    "image": parts[2],
                    "status": parts[3],
                }
            )
    return rows[:limit]
|
|
1078
|
+
|
|
1079
|
+
|
|
1080
|
+
def get_docker_log_sizes(limit: int = 5) -> Optional[List[Dict[str, Any]]]:
    """Per-container log-file size (json-file driver). None if docker missing.

    Reads each container's LogPath via `docker inspect`, stats it on the host
    filesystem. Requires the wtf process to have read access to the path —
    typically only root or the docker group. Containers whose log file cannot
    be stat'ed are left out.

    Args:
        limit: Maximum number of entries to return.

    Returns:
        A list of {"name", "log_path", "bytes"} dicts, largest first. None
        when the docker binary is missing or a docker command fails. An empty
        list when there are no containers.
    """
    import shutil

    if not shutil.which("docker"):
        return None
    rc, out, _ = run(["docker", "ps", "-aq"], timeout=5)
    # Empty output with rc == 0 means "no containers", not "docker missing".
    if rc != 0:
        return None
    ids = [i for i in (out or "").splitlines() if i.strip()]
    if not ids:
        return []
    rc, out, _ = run(
        ["docker", "inspect", "--format", "{{.Name}}\t{{.LogPath}}"] + ids,
        timeout=10,
    )
    if rc != 0 or not out:
        return None
    results: List[Dict[str, Any]] = []
    for line in out.splitlines():
        parts = line.split("\t")
        if len(parts) != 2:
            continue
        # docker inspect prints names with a leading slash.
        name = parts[0].lstrip("/")
        log_path = parts[1]
        if not log_path or log_path == "<no value>":
            continue
        try:
            size = os.path.getsize(log_path)
        except OSError:
            continue
        results.append({"name": name, "log_path": log_path, "bytes": size})
    results.sort(key=lambda r: r["bytes"], reverse=True)
    return results[:limit]
|
|
1119
|
+
|
|
1120
|
+
|
|
1121
|
+
def get_journal_disk_usage() -> Optional[int]:
    """Bytes used by journald files according to `journalctl --disk-usage`.

    The first number in the output is taken, scaled by an optional K/M/G/T
    suffix (powers of 1024). Returns None when the command fails, prints
    nothing, or contains no number.
    """
    code, text, _ = run(["journalctl", "--disk-usage"], timeout=5)
    if code != 0 or not text:
        return None
    # Output examples:
    #   "Archived and active journals take up 1.2G in the file system."
    #   "Archived and active journals take up 824.0M in the file system."
    #   "Archived and active journals take up 12.0G on disk."
    found = re.search(r"([0-9]+(?:\.[0-9]+)?)\s*([KMGT])?B?", text)
    if found is None:
        return None
    number, suffix = found.groups()
    try:
        amount = float(number)
    except ValueError:
        return None
    scale = {"K": 1024, "M": 1024**2, "G": 1024**3, "T": 1024**4}.get(suffix, 1)
    return int(amount * scale)
|
|
1140
|
+
|
|
1141
|
+
|
|
1142
|
+
def get_pressure(resource: str) -> Optional[Dict[str, Dict[str, float]]]:
    """Parse /proc/pressure/<resource> for resource in cpu, memory, io.

    Returns a mapping keyed by the scopes found ("some" and/or "full"), each
    holding the `key=value` fields of that line as floats, for example
    {"some": {"avg10": ..., "avg60": ..., "avg300": ..., "total": ...}}.
    Fields that are not numeric are dropped. Returns None for an unknown
    resource, an empty/unavailable file, or when no scope line is present.
    """
    if resource not in ("cpu", "memory", "io"):
        return None
    text = read_file(f"/proc/pressure/{resource}")
    if not text:
        return None

    def _fields(tokens: List[str]) -> Dict[str, float]:
        parsed: Dict[str, float] = {}
        for token in tokens:
            name, _, raw = token.partition("=")
            try:
                parsed[name] = float(raw)
            except ValueError:
                pass
        return parsed

    scopes: Dict[str, Dict[str, float]] = {}
    for row in text.splitlines():
        tokens = row.split()
        if tokens and tokens[0] in ("some", "full"):
            scopes[tokens[0]] = _fields(tokens[1:])
    return scopes or None
|
|
1170
|
+
|
|
1171
|
+
|
|
1172
|
+
def get_kernel_taint() -> Optional[int]:
    """Return the integer stored in /proc/sys/kernel/tainted.

    0 means a clean kernel; any other value is a taint bitmask. Returns None
    when the file yields no text or the text is not an integer.
    """
    text = read_file("/proc/sys/kernel/tainted").strip()
    try:
        return int(text) if text else None
    except ValueError:
        return None
|
|
1181
|
+
|
|
1182
|
+
|
|
1183
|
+
# Bit position in /proc/sys/kernel/tainted -> symbolic flag name.
# Bits 16-18 were added by newer kernels; without them a taint value that only
# carries those bits would decode to an empty list and look clean.
KERNEL_TAINT_BITS: Dict[int, str] = {
    0: "PROPRIETARY_MODULE",
    1: "FORCED_MODULE",
    2: "UNSAFE_SMP",
    3: "FORCED_RMMOD",
    4: "MACHINE_CHECK",
    5: "BAD_PAGE",
    6: "USER",
    7: "DIE",
    8: "OVERRIDDEN_ACPI_TABLE",
    9: "WARN",
    10: "CRAP",
    11: "FIRMWARE_WORKAROUND",
    12: "OOT_MODULE",
    13: "UNSIGNED_MODULE",
    14: "SOFTLOCKUP",
    15: "LIVEPATCH",
    16: "AUX",
    17: "RANDSTRUCT",
    18: "TEST",
}


def decode_kernel_taint(value: int) -> List[str]:
    """Decode a kernel taint bitmask into a list of flag names.

    Args:
        value: Bitmask as read from /proc/sys/kernel/tainted.

    Returns:
        Names of all known bits set in `value`, ordered by bit position.
        Bits without an entry in KERNEL_TAINT_BITS are ignored.
    """
    return [name for bit, name in KERNEL_TAINT_BITS.items() if value & (1 << bit)]
|
|
1206
|
+
|
|
1207
|
+
|
|
1208
|
+
def _iter_cert_candidates(roots: List[str]):
    """Lazily yield paths under `roots` whose file name looks like a certificate."""
    cert_suffixes = (".pem", ".crt", ".cert")
    for root in roots:
        if not os.path.isdir(root):
            continue
        for dirpath, _, filenames in os.walk(root, followlinks=False):
            for filename in filenames:
                low = filename.lower()
                if not low.endswith(cert_suffixes):
                    continue
                # Skip private keys masquerading as .pem files, but keep
                # anything explicitly named as a public key.
                if ("key" in low or "private" in low) and "pubkey" not in low:
                    continue
                yield os.path.join(dirpath, filename)


def get_certificate_expirations(
    roots: Optional[List[str]] = None,
    max_files: int = 50,
) -> List[Dict[str, Any]]:
    """Walk well-known TLS-cert roots and return list of {path, days_left}.

    Returns empty list when openssl is unavailable or no cert dirs exist.
    Bounded by max_files to avoid runaway IO on misconfigured hosts.

    Args:
        roots: Directories to scan; defaults to common server-cert locations.
        max_files: Maximum number of candidate files to examine. Zero or a
            negative value examines nothing.

    Returns:
        One {"path", "days_left"} dict per distinct certificate file that
        could be parsed, sorted with the soonest expiry first.
    """
    import shutil

    if not shutil.which("openssl"):
        return []
    if roots is None:
        # NB: /etc/ssl/certs is the system CA bundle (root CAs ship with
        # far-future or already-past notAfter dates intentionally) — scanning
        # it produces spam, so we focus on server-cert locations only.
        roots = [
            "/etc/letsencrypt/live",
            "/etc/letsencrypt/archive",
            "/etc/nginx/ssl",
            "/etc/nginx/certs",
            "/etc/apache2/ssl",
            "/etc/haproxy/certs",
            "/etc/pki/tls/private",
            "/etc/dovecot/certs",
            "/etc/postfix/certs",
            "/etc/ssl/private",
        ]

    results: List[Dict[str, Any]] = []
    seen_files: set = set()
    examined = 0
    for path in _iter_cert_candidates(roots):
        # Checked before counting so that max_files=0 really scans nothing.
        # The generator is lazy, so the directory walk stops here as well.
        if examined >= max_files:
            break
        examined += 1
        try:
            st = os.stat(path)
        except OSError:
            continue
        # Inode numbers are only unique per device, so the device id is part
        # of the key; otherwise files on different filesystems could be
        # mistaken for duplicates.
        identity = (st.st_dev, st.st_ino)
        if identity in seen_files:
            continue
        seen_files.add(identity)
        days = _parse_cert_expiry_days(path)
        if days is None:
            continue
        results.append({"path": path, "days_left": days})
    results.sort(key=lambda r: r["days_left"])
    return results
|
|
1274
|
+
|
|
1275
|
+
|
|
1276
|
+
def _parse_cert_expiry_days(path: str) -> Optional[int]:
    """Whole days until the certificate at `path` expires, or None.

    Runs `openssl x509 -enddate -noout` and parses the text after the first
    "=" with the format "%b %d %H:%M:%S %Y %Z". The parsed time is treated as
    UTC. None is returned when openssl fails, prints no "=", or the date does
    not parse.
    """
    status, text, _ = run(["openssl", "x509", "-enddate", "-noout", "-in", path], timeout=3)
    if status != 0 or "=" not in text:
        return None
    stamp = text.strip().partition("=")[2].strip()
    try:
        from datetime import datetime, timezone

        expires = datetime.strptime(stamp, "%b %d %H:%M:%S %Y %Z")
        remaining = expires.replace(tzinfo=timezone.utc) - datetime.now(timezone.utc)
    except (ValueError, ImportError):
        return None
    return remaining.days
|
|
1291
|
+
|
|
1292
|
+
|
|
1293
|
+
def _snap_tcp() -> Optional[Dict[str, int]]:
    """Read the Tcp counters from /proc/net/snmp as a name -> value mapping.

    The first "Tcp:" line supplies the column names and the second the
    values. Returns None when the file yields no text, fewer than two "Tcp:"
    lines exist, or the two lines differ in length. Values that are not
    integers are left out of the mapping.
    """
    text = read_file("/proc/net/snmp")
    if not text:
        return None
    tcp_rows = [row.split()[1:] for row in text.splitlines() if row.startswith("Tcp:")]
    if len(tcp_rows) < 2:
        return None
    names, numbers = tcp_rows[0], tcp_rows[1]
    if not names or not numbers or len(names) != len(numbers):
        return None
    counters: Dict[str, int] = {}
    for name, number in zip(names, numbers):
        try:
            counters[name] = int(number)
        except ValueError:
            pass
    return counters
|
|
1317
|
+
|
|
1318
|
+
|
|
1319
|
+
def get_tcp_retransmit_rate(sample_seconds: float = 1.0) -> Optional[float]:
    """Percentage of TCP segments retransmitted during a short sample window.

    Two counter snapshots are taken `sample_seconds` apart and the growth of
    RetransSegs is divided by the growth of OutSegs. Returns None when either
    snapshot is unavailable. Returns 0.0 when OutSegs did not grow (the rate
    is undefined, which is treated as "no problem").
    """
    before = _snap_tcp()
    if before is None:
        return None
    time.sleep(sample_seconds)
    after = _snap_tcp()
    if after is None:
        return None

    def _growth(counter: str) -> int:
        return after.get(counter, 0) - before.get(counter, 0)

    sent = _growth("OutSegs")
    if sent <= 0:
        return 0.0
    return round(100.0 * _growth("RetransSegs") / sent, 2)
|
|
1338
|
+
|
|
1339
|
+
|
|
1340
|
+
def get_service_details(unit: str) -> Optional[Dict[str, Any]]:
    """Collect selected `systemctl show` properties for one systemd unit.

    A unit name without a dot gets ".service" appended. The result maps each
    printed property name to its value as a string. Returns None when the
    command fails, prints nothing, reports no Id, or reports
    LoadState=not-found.
    """
    if "." not in unit:
        unit = f"{unit}.service"

    wanted = (
        "Id",
        "Description",
        "LoadState",
        "ActiveState",
        "SubState",
        "Result",
        "UnitFileState",
        "MainPID",
        "NRestarts",
        "MemoryCurrent",
        "TasksCurrent",
        "ActiveEnterTimestamp",
        "ExecMainStartTimestamp",
        "FragmentPath",
    )
    command = ["systemctl", "show"]
    for prop in wanted:
        command += ["-p", prop]
    command.append(unit)

    code, text, _ = run(command, timeout=8)
    if code != 0 or not text:
        return None

    # Output is one "Key=Value" pair per line; split on the first "=" only.
    pairs = (row.partition("=") for row in text.splitlines())
    details: Dict[str, str] = {name.strip(): value.strip() for name, _, value in pairs}
    if not details.get("Id") or details.get("LoadState") == "not-found":
        return None
    return details
|
|
1389
|
+
|
|
1390
|
+
|
|
1391
|
+
def get_service_journal(unit: str, lines: int = 20) -> List[str]:
    """Return the latest non-blank journal lines of one unit.

    A unit name without a dot gets ".service" appended. At most `lines`
    entries are requested from journalctl. Returns an empty list when the
    command fails or prints nothing.
    """
    target = unit if "." in unit else f"{unit}.service"
    command = ["journalctl", "-u", target, "-n", str(lines), "--no-pager", "-q"]
    code, text, _ = run(command, timeout=8)
    if code != 0 or not text:
        return []
    return [row for row in text.splitlines() if row.strip()]
|
|
1399
|
+
|
|
1400
|
+
|
|
1401
|
+
def get_pid_count() -> Tuple[int, int]:
    """Return (current process count, kernel pid_max).

    The count is the number of all-digit entries in /proc; pid_max comes from
    /proc/sys/kernel/pid_max. Either value is 0 when it cannot be determined.
    """
    try:
        ceiling = int(read_file("/proc/sys/kernel/pid_max").strip())
    except ValueError:
        ceiling = 0
    try:
        running = sum(1 for entry in os.listdir("/proc") if entry.isdigit())
    except OSError:
        running = 0
    return running, ceiling
|
|
1416
|
+
|
|
1417
|
+
|
|
1418
|
+
def get_oom_events(hours: int = 24) -> List[str]:
    """Collect OOM-kill log lines from the kernel journal, else from dmesg.

    The kernel journal of the last `hours` hours is searched first. When that
    call succeeds with output, its matches are returned, even if there are
    none. Otherwise `dmesg -T` is searched, without any time window. A line
    matches when it contains "out of memory", "killed process" or
    "oom-killer", compared case-insensitively.
    """
    markers = ("out of memory", "killed process", "oom-killer")

    def _matching(text: str) -> List[str]:
        return [row.strip() for row in text.splitlines() if any(marker in row.lower() for marker in markers)]

    code, text, _ = run(
        ["journalctl", "-k", "--since", f"{hours} hours ago", "--no-pager", "-q"],
        timeout=8,
    )
    if code == 0 and text:
        return _matching(text)

    # Fallback: dmesg (may need root)
    code, text, _ = run(["dmesg", "-T"], timeout=5)
    return _matching(text) if code == 0 else []
|
|
1439
|
+
|
|
1440
|
+
|
|
1441
|
+
def get_recent_kernel_errors(hours: int = 24, limit: int = 5) -> List[str]:
    """Recent kernel error/critical lines from journal.

    Args:
        hours: Size of the look-back window passed to `journalctl --since`.
        limit: Maximum number of lines to return; the newest ones are kept.
            Zero or a negative value returns an empty list.

    Returns:
        Up to `limit` stripped, non-blank lines in journal order. Empty when
        journalctl fails or prints nothing.
    """
    # Guard first: `lines[-0:]` is the whole list, so limit=0 would otherwise
    # return everything instead of nothing.
    if limit <= 0:
        return []
    rc, out, _ = run(
        ["journalctl", "-k", "-p", "err", "--since", f"{hours} hours ago", "--no-pager", "-q"],
        timeout=8,
    )
    if rc != 0 or not out:
        return []
    lines = [line.strip() for line in out.splitlines() if line.strip()]
    return lines[-limit:]
|
|
1451
|
+
|
|
1452
|
+
|
|
1453
|
+
def get_listening_ports() -> List[Dict[str, Any]]:
    """List listening TCP sockets as {"addr", "port", "pid"} dicts.

    psutil is used when available; if it raises for any reason, or is not
    installed, the output of `ss -tlnH` is parsed instead. In that fallback
    "pid" is always None. Returns an empty list when `ss` fails.
    """
    if HAS_PSUTIL:
        try:
            return [
                {"addr": conn.laddr.ip, "port": conn.laddr.port, "pid": conn.pid}
                for conn in psutil.net_connections(kind="inet")
                if conn.status == psutil.CONN_LISTEN and conn.type == socket.SOCK_STREAM and conn.laddr
            ]
        except Exception:
            # Best effort: any psutil failure falls through to `ss`.
            pass

    code, text, _ = run(["ss", "-tlnH"], timeout=5)
    if code != 0:
        return []
    listeners: List[Dict[str, Any]] = []
    for row in text.splitlines():
        columns = row.split()
        # The fourth column is the local "address:port" pair.
        if len(columns) < 4 or ":" not in columns[3]:
            continue
        # rpartition keeps IPv6 addresses (which contain colons) intact.
        host, _, port_text = columns[3].rpartition(":")
        try:
            listeners.append({"addr": host, "port": int(port_text), "pid": None})
        except ValueError:
            pass
    return listeners
|
|
1492
|
+
|
|
1493
|
+
|
|
1494
|
+
def get_pending_updates() -> int:
    """Number of packages `apt list --upgradable` reports.

    Every non-blank output line that does not start with "Listing" counts as
    one package. Returns -1 when apt is not installed or the command fails.
    """
    if not shutil.which("apt"):
        return -1
    code, text, _ = run(["apt", "list", "--upgradable"], timeout=10)
    if code != 0:
        return -1
    rows = (row.strip() for row in text.splitlines())
    return sum(1 for row in rows if row and not row.startswith("Listing"))
|
|
1508
|
+
|
|
1509
|
+
|
|
1510
|
+
def get_last_logins(limit: int = 5) -> List[str]:
    """Return non-blank lines from the first `limit` lines of `last -n <limit> -F`.

    Returns an empty list when the command fails.
    """
    code, text, _ = run(["last", "-n", str(limit), "-F"], timeout=5)
    if code != 0:
        return []
    head = text.splitlines()[:limit]
    return [row for row in head if row.strip()]
|
|
1516
|
+
|
|
1517
|
+
|
|
1518
|
+
def get_failed_auth_count(hours: int = 24) -> int:
    """Count log lines that indicate a failed authentication attempt.

    The ssh/sshd unit journal of the last `hours` hours is searched. When
    journalctl fails or prints nothing, the whole of /var/log/auth.log is
    searched instead, so `hours` is not applied to that fallback. A line
    counts when it contains "failed password", "authentication failure" or
    "invalid user", compared case-insensitively. Returns 0 when neither
    source yields text.
    """
    code, text, _ = run(
        ["journalctl", "_SYSTEMD_UNIT=ssh.service", "_SYSTEMD_UNIT=sshd.service", "--since", f"{hours} hours ago", "--no-pager", "-q"],
        timeout=8,
    )
    if code != 0 or not text:
        # Fallback to /var/log/auth.log
        text = read_file("/var/log/auth.log")
        if not text:
            return 0
    needles = ("failed password", "authentication failure", "invalid user")
    return sum(1 for row in text.splitlines() if any(needle in row.lower() for needle in needles))
|
|
1535
|
+
|
|
1536
|
+
|
|
1537
|
+
def get_disk_io_busy() -> Optional[float]:
    """Disk busy percentage over a 0.5s sample. Requires psutil.

    Computed from the growth of `busy_time` in psutil's aggregate
    (perdisk=False) I/O counters between two readings. Returns 0.0 when the
    counters have no `busy_time` field, and None when psutil is missing, a
    reading is unavailable, or anything raises.
    """
    if not HAS_PSUTIL:
        return None
    try:
        before = psutil.disk_io_counters(perdisk=False)
        if before is None:
            return None
        time.sleep(0.5)
        after = psutil.disk_io_counters(perdisk=False)
        if after is None:
            return None
        if not hasattr(after, "busy_time"):
            return 0.0
        # NOTE(review): busy_time is presumably in milliseconds, making
        # delta / 5 a percentage of the 500ms window; confirm against psutil.
        return float(after.busy_time - before.busy_time) / 5.0
    except Exception:
        return None
|
|
1553
|
+
|
|
1554
|
+
|
|
1555
|
+
def get_open_fds() -> Optional[Tuple[int, int]]:
    """Return (used, max) system-wide file handle counts.

    `used` is the first field of /proc/sys/fs/file-nr and `max` is the value
    of /proc/sys/fs/file-max. Returns None when either cannot be parsed.
    """
    nr_text = read_file("/proc/sys/fs/file-nr")
    max_text = read_file("/proc/sys/fs/file-max")
    try:
        in_use = int(nr_text.split()[0])
        ceiling = int(max_text.strip())
    except (ValueError, IndexError):
        return None
    return in_use, ceiling
|
|
1568
|
+
|
|
1569
|
+
|
|
1570
|
+
def _is_noisy_iface(name: str) -> bool:
    """Tell whether `name` starts with a known container/virtual interface prefix."""
    # str.startswith accepts a tuple and returns True if any prefix matches.
    return name.startswith(("veth", "docker", "br-", "virbr", "cni", "flannel", "cali", "lxcbr", "tun", "tap"))
|
|
1574
|
+
|
|
1575
|
+
|
|
1576
|
+
def get_network_interfaces(include_virtual: bool = False) -> List[Dict[str, Any]]:
    """List of non-loopback network interfaces with IPs.

    psutil is used when available. If it is missing or raises, the output of
    `ip -o -4 addr show` is parsed instead; that fallback knows IPv4 only and
    reports every listed interface as up.

    Args:
        include_virtual: When False, interfaces matched by _is_noisy_iface
            (container/virtual devices) are left out.

    Returns:
        One {"name", "ipv4", "ipv6", "up"} dict per interface. Link-local
        IPv6 addresses (fe80...) are omitted. Empty when `ip` fails.
    """
    if HAS_PSUTIL:
        try:
            collected: List[Dict[str, Any]] = []
            addrs = psutil.net_if_addrs()
            stats = psutil.net_if_stats()
            for name, addr_list in addrs.items():
                if name == "lo":
                    continue
                if not include_virtual and _is_noisy_iface(name):
                    continue
                ipv4 = [a.address for a in addr_list if a.family == socket.AF_INET]
                ipv6 = [a.address for a in addr_list if a.family == socket.AF_INET6 and not a.address.startswith("fe80")]
                is_up = stats[name].isup if name in stats else False
                collected.append({"name": name, "ipv4": ipv4, "ipv6": ipv6, "up": is_up})
            return collected
        except Exception:
            # Best effort: fall through to `ip`. Entries gathered before the
            # failure live in `collected` and are dropped, so the fallback
            # starts from an empty list and cannot produce duplicates.
            pass

    rc, out, _ = run(["ip", "-o", "-4", "addr", "show"], timeout=5)
    if rc != 0:
        return []
    result: List[Dict[str, Any]] = []
    by_name: Dict[str, Dict[str, Any]] = {}
    for line in out.splitlines():
        parts = line.split()
        if len(parts) < 4:
            continue
        name = parts[1]
        if name == "lo":
            continue
        if not include_virtual and _is_noisy_iface(name):
            continue
        ip = parts[3].split("/")[0]
        # `ip -o` prints one line per address; merge them so an interface with
        # several addresses appears once, like in the psutil branch.
        entry = by_name.get(name)
        if entry is None:
            entry = {"name": name, "ipv4": [], "ipv6": [], "up": True}
            by_name[name] = entry
            result.append(entry)
        entry["ipv4"].append(ip)
    return result
|