wtftools 0.0.0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
wtftools/sysinfo.py ADDED
@@ -0,0 +1,1608 @@
1
+ #!/usr/bin/env python3
2
+ # -*- coding: utf-8 -*-
3
+ """System information gathering for wtftools.
4
+
5
+ Pure stdlib first, optional psutil for richer data.
6
+ """
7
+
8
+ import logging
9
+ import os
10
+ import platform
11
+ import re
12
+ import shutil
13
+ import socket
14
+ import subprocess
15
+ import time
16
+ import traceback
17
+ from typing import Any, Dict, List, Optional, Tuple
18
+
19
# Module-level logger; handlers and levels are left to the application.
logger = logging.getLogger(__name__)

# psutil is optional. When the import fails for any reason the helpers below
# use their /proc- and `ps`-based fallbacks instead.
try:
    import psutil  # type: ignore
except Exception:
    HAS_PSUTIL = False
else:
    HAS_PSUTIL = True


# Kernel pseudo-files and config files read by the helpers in this module.
PROC_MEMINFO = "/proc/meminfo"
PROC_UPTIME = "/proc/uptime"
PROC_LOADAVG = "/proc/loadavg"
PROC_STAT = "/proc/stat"
PROC_CPUINFO = "/proc/cpuinfo"
PROC_MOUNTS = "/proc/mounts"
ETC_OS_RELEASE = "/etc/os-release"
36
+
37
+
38
def run(cmd: List[str], timeout: int = 5) -> Tuple[int, str, str]:
    """Run *cmd* without a shell and capture its output as text.

    This function does not raise; failures are mapped to shell-style codes.

    Args:
        cmd: Argument vector, executable first.
        timeout: Seconds to wait before giving up.

    Returns:
        ``(returncode, stdout, stderr)``. A timeout yields
        ``(124, "", "timeout")``, a missing executable
        ``(127, "", "not found")`` and any other error ``(1, "", str(exc))``.
    """
    try:
        completed = subprocess.run(
            cmd,
            check=False,
            text=True,
            capture_output=True,
            timeout=timeout,
        )
    except subprocess.TimeoutExpired:
        return 124, "", "timeout"
    except FileNotFoundError:
        return 127, "", "not found"
    except Exception as exc:
        logger.debug(f"{type(exc).__name__}: {exc}\n{traceback.format_exc()}")
        return 1, "", str(exc)
    return completed.returncode, completed.stdout, completed.stderr
50
+
51
+
52
def read_file(path: str) -> str:
    """Return the text content of *path*, or ``""`` if it cannot be read.

    The file is decoded as UTF-8; undecodable bytes are replaced rather than
    raising. Any exception while opening or reading yields the empty string.
    """
    try:
        with open(path, encoding="utf-8", errors="replace") as handle:
            contents = handle.read()
    except Exception:
        contents = ""
    return contents
59
+
60
+
61
def get_hostname() -> str:
    """Return the host name reported by ``socket.gethostname()``."""
    hostname = socket.gethostname()
    return hostname
63
+
64
+
65
def get_os_release() -> Dict[str, str]:
    """Parse /etc/os-release into a dict of ``KEY -> value``.

    Blank lines, ``#`` comment lines and lines without ``=`` are ignored.
    Surrounding double or single quotes are stripped from values.

    Returns:
        The parsed assignments; empty when the file is missing or unreadable.
    """
    data: Dict[str, str] = {}
    for raw_line in read_file(ETC_OS_RELEASE).splitlines():
        line = raw_line.strip()
        # A comment such as "# FOO=bar" must not be treated as an assignment.
        if not line or line.startswith("#"):
            continue
        key, sep, value = line.partition("=")
        if not sep:
            continue
        data[key.strip()] = value.strip().strip('"').strip("'")
    return data
75
+
76
+
77
def get_kernel() -> str:
    """Return the kernel release string from ``platform.release()``."""
    release = platform.release()
    return release
79
+
80
+
81
def get_uptime_seconds() -> float:
    """Return the first field of /proc/uptime as a float.

    Returns 0.0 when the file is missing, empty or not parseable.
    """
    fields = read_file(PROC_UPTIME).split()
    if not fields:
        return 0.0
    try:
        return float(fields[0])
    except ValueError:
        return 0.0
89
+
90
+
91
def format_duration(seconds: float) -> str:
    """Render a number of seconds as a compact human-readable duration.

    Only non-zero day/hour/minute components are shown (e.g. ``"1d 2h 5m"``).
    Seconds are shown only when the duration is below one minute. Fractions
    are truncated and negative input is treated as zero.

    Args:
        seconds: Duration in seconds.

    Returns:
        A string such as ``"3h 12m"`` or ``"42s"``.
    """
    # Clamp: divmod on a negative value would produce "-1d 23h 59m".
    total = max(int(seconds), 0)
    days, rem = divmod(total, 86400)
    hours, rem = divmod(rem, 3600)
    minutes, secs = divmod(rem, 60)
    parts = [
        f"{amount}{suffix}"
        for amount, suffix in ((days, "d"), (hours, "h"), (minutes, "m"))
        if amount
    ]
    return " ".join(parts) if parts else f"{secs}s"
107
+
108
+
109
def format_bytes(num_bytes: float) -> str:
    """Render a byte count with a binary-scaled unit suffix (B up to EB).

    The value is divided by 1024 until its magnitude drops below 1024 and is
    printed with one decimal place, e.g. ``"1.5KB"``.
    """
    units = ("B", "KB", "MB", "GB", "TB", "PB")
    value = num_bytes
    index = 0
    while index < len(units):
        if abs(value) < 1024.0:
            return f"{value:3.1f}{units[index]}"
        value /= 1024.0
        index += 1
    # Anything still >= 1024 PB is reported in exabytes.
    return f"{value:.1f}EB"
116
+
117
+
118
def get_loadavg() -> Tuple[float, float, float]:
    """Return the first three fields of /proc/loadavg as floats.

    Falls back to ``(0.0, 0.0, 0.0)`` when the file is unreadable, has fewer
    than three fields, or contains a non-numeric value.
    """
    fields = read_file(PROC_LOADAVG).split()
    try:
        # Unpacking fewer than three values raises ValueError as well.
        one, five, fifteen = (float(field) for field in fields[:3])
    except ValueError:
        return (0.0, 0.0, 0.0)
    return (one, five, fifteen)
127
+
128
+
129
def get_cpu_count() -> int:
    """Return ``os.cpu_count()``, or 1 when it is unavailable or fails."""
    try:
        detected = os.cpu_count()
    except Exception:
        return 1
    return detected if detected else 1
134
+
135
+
136
def get_cpu_model() -> str:
    """Return the CPU model name.

    Uses the first ``model name`` line of /proc/cpuinfo; otherwise falls back
    to ``platform.processor()`` and finally to ``"unknown"``.
    """
    for line in read_file(PROC_CPUINFO).splitlines():
        if not line.lower().startswith("model name"):
            continue
        return line.partition(":")[2].strip()
    fallback = platform.processor()
    return fallback if fallback else "unknown"
143
+
144
+
145
def get_meminfo() -> Dict[str, int]:
    """Parse /proc/meminfo into a dict of ``field -> size in bytes``.

    Only lines carrying a ``kB`` unit are included; the kB figure is
    multiplied by 1024, so every returned value is in bytes. Unit-less lines
    are skipped.

    Returns:
        Field name (without the trailing colon) mapped to bytes; empty when
        the file cannot be read.
    """
    pattern = re.compile(r"^(\S+):\s+(\d+)\s*kB")
    data: Dict[str, int] = {}
    for line in read_file(PROC_MEMINFO).splitlines():
        match = pattern.match(line)
        if match:
            data[match.group(1)] = int(match.group(2)) * 1024
    return data
154
+
155
+
156
def get_memory_summary() -> Dict[str, int]:
    """Return RAM and swap usage.

    psutil is used when available. If it is missing or its query raises, the
    figures are derived from /proc/meminfo, where ``used`` is
    ``MemTotal - MemAvailable`` (``MemFree`` stands in when ``MemAvailable``
    is absent).

    Returns:
        Dict with ``total``, ``available``, ``used``, ``free``, ``swap_total``
        and ``swap_used`` in bytes, plus integer ``percent`` and
        ``swap_percent``.
    """
    if HAS_PSUTIL:
        try:
            vm = psutil.virtual_memory()
            sw = psutil.swap_memory()
            return {
                "total": vm.total,
                "available": vm.available,
                "used": vm.used,
                "free": vm.free,
                "percent": int(vm.percent),
                "swap_total": sw.total,
                "swap_used": sw.used,
                "swap_percent": int(sw.percent),
            }
        except Exception as exc:
            # Best effort: a psutil failure must not abort the report, so
            # fall through to the /proc parser below.
            logger.debug("psutil memory query failed: %s", exc)
    info = get_meminfo()
    total = info.get("MemTotal", 0)
    free = info.get("MemFree", 0)
    available = info.get("MemAvailable", free)
    used = total - available
    percent = int(round(100 * used / total)) if total else 0
    swap_total = info.get("SwapTotal", 0)
    swap_free = info.get("SwapFree", 0)
    swap_used = swap_total - swap_free
    swap_percent = int(round(100 * swap_used / swap_total)) if swap_total else 0
    return {
        "total": total,
        "available": available,
        "used": used,
        "free": free,
        "percent": percent,
        "swap_total": swap_total,
        "swap_used": swap_used,
        "swap_percent": swap_percent,
    }
191
+
192
+
193
def _unescape_mount_field(field: str) -> str:
    r"""Decode the octal escapes (e.g. ``\040`` for a space) in a mount-table field."""
    return re.sub(r"\\([0-7]{3})", lambda m: chr(int(m.group(1), 8)), field)


def get_mounts() -> List[Dict[str, str]]:
    """Read /proc/mounts and return the real (non-pseudo) filesystems.

    Pseudo filesystems, snap/docker internals and repeated mount points are
    filtered out. Source and target are unescaped, so a mount point that
    contains a space is returned as a usable path.

    Returns:
        A list of ``{"source", "target", "fstype"}`` dicts in file order.
    """
    skip_fs = {
        "proc",
        "sysfs",
        "devtmpfs",
        "devpts",
        "tmpfs",
        "cgroup",
        "cgroup2",
        "pstore",
        "bpf",
        "tracefs",
        "debugfs",
        "fusectl",
        "configfs",
        "hugetlbfs",
        "mqueue",
        "rpc_pipefs",
        "binfmt_misc",
        "autofs",
        "securityfs",
        "selinuxfs",
        "fuse.gvfsd-fuse",
        "fuse.portal",
        "fuse.snapfuse",
        "nsfs",
        "ramfs",
        "fuse.lxcfs",
        "overlay",
        "squashfs",
    }
    skip_prefixes = ("/snap/", "/var/lib/docker/", "/var/lib/snapd/")
    content = read_file(PROC_MOUNTS)
    mounts: List[Dict[str, str]] = []
    seen_targets: set = set()
    for line in content.splitlines():
        parts = line.split()
        if len(parts) < 3:
            continue
        fs_type = parts[2]
        if fs_type in skip_fs:
            continue
        source = _unescape_mount_field(parts[0])
        target = _unescape_mount_field(parts[1])
        # Match "/snap" only as a whole path component so that e.g.
        # "/snapshots" is still reported.
        if target == "/snap" or target.startswith(skip_prefixes):
            continue
        if target in seen_targets:
            continue
        seen_targets.add(target)
        mounts.append({"source": source, "target": target, "fstype": fs_type})
    return mounts
242
+
243
+
244
def get_disk_usage(target: str) -> Optional[Dict[str, int]]:
    """Return usage figures for the filesystem containing *target*.

    Returns:
        ``{"total", "used", "free", "percent"}`` (bytes, plus an integer
        percentage), or ``None`` when ``shutil.disk_usage`` raises ``OSError``.
    """
    try:
        total, used, free = shutil.disk_usage(target)
    except OSError:
        return None
    if total:
        percent = int(round(100 * used / total))
    else:
        percent = 0
    return dict(total=total, used=used, free=free, percent=percent)
257
+
258
+
259
def get_disks() -> List[Dict[str, Any]]:
    """Combine each mount from ``get_mounts()`` with its usage figures.

    Mounts whose usage cannot be determined are left out.

    Returns:
        A list of dicts with ``target``, ``source``, ``fstype`` and the keys
        produced by ``get_disk_usage``.
    """
    disks: List[Dict[str, Any]] = []
    for mount in get_mounts():
        usage = get_disk_usage(mount["target"])
        if usage is not None:
            entry: Dict[str, Any] = {key: mount[key] for key in ("target", "source", "fstype")}
            entry.update(usage)
            disks.append(entry)
    return disks
275
+
276
+
277
def get_top_processes(by: str = "cpu", limit: int = 5) -> List[Dict[str, Any]]:
    """Return the top processes by CPU usage or resident memory.

    psutil is used when available; if it is missing or fails, the result is
    parsed from ``ps`` output.

    Args:
        by: ``"cpu"`` sorts by CPU percent; any other value sorts by RSS.
        limit: Maximum number of processes to return.

    Returns:
        A list of ``{"pid", "name", "user", "cpu_percent", "rss"}`` dicts,
        highest first. In the ``ps`` fallback only the sorted-by metric is
        populated; the other one is reported as 0.
    """
    if HAS_PSUTIL:
        try:
            attrs = ["pid", "name", "username", "cpu_percent", "memory_info"]
            # Priming pass: the first cpu_percent reading per process is a
            # meaningless 0.0, so sample once, wait, then sample for real.
            for _ in psutil.process_iter(attrs):
                pass
            time.sleep(0.1)
            procs = []
            for proc in psutil.process_iter(attrs):
                info = proc.info
                rss = info["memory_info"].rss if info.get("memory_info") else 0
                procs.append(
                    {
                        "pid": info.get("pid"),
                        "name": (info.get("name") or "")[:32],
                        "user": (info.get("username") or "")[:16],
                        "cpu_percent": info.get("cpu_percent") or 0.0,
                        "rss": rss,
                    }
                )
            key = "cpu_percent" if by == "cpu" else "rss"
            procs.sort(key=lambda p: p[key], reverse=True)
            return procs[:limit]
        except Exception as exc:
            logger.debug("psutil top failed: %s", exc)
    # Fallback: ps
    sort = "%cpu" if by == "cpu" else "rss"
    rc, out, _ = run(["ps", "-eo", f"pid,user,{sort},comm", "--sort=-" + sort, "--no-headers"], timeout=5)
    if rc != 0 or not out:
        return []
    result: List[Dict[str, Any]] = []
    for line in out.splitlines()[:limit]:
        parts = line.split(None, 3)
        if len(parts) < 4:
            continue
        pid_text, user, metric, comm = parts
        try:
            pid = int(pid_text)
        except ValueError:
            # Skip a malformed row instead of raising out of a best-effort probe.
            continue
        try:
            value = float(metric)
        except ValueError:
            value = 0.0
        item: Dict[str, Any] = {"pid": pid, "user": user, "name": comm}
        if by == "cpu":
            item["cpu_percent"] = value
            item["rss"] = 0
        else:
            item["rss"] = int(value) * 1024  # ps rss is in kB
            item["cpu_percent"] = 0.0
        result.append(item)
    return result
327
+
328
+
329
def count_zombie_processes() -> int:
    """Count processes currently in the zombie state.

    Uses psutil when available; on any psutil failure (or without psutil)
    counts ``ps`` rows whose state starts with ``Z``. Returns 0 when ``ps``
    fails.
    """
    if HAS_PSUTIL:
        try:
            zombies = 0
            for proc in psutil.process_iter(["status"]):
                if proc.info.get("status") == psutil.STATUS_ZOMBIE:
                    zombies += 1
            return zombies
        except Exception:
            pass
    rc, out, _ = run(["ps", "-eo", "stat", "--no-headers"], timeout=5)
    if rc != 0:
        return 0
    states = [row.strip() for row in out.splitlines()]
    return len([state for state in states if state.startswith("Z")])
340
+
341
+
342
def get_failed_units() -> List[str]:
    """Return the unit names listed by ``systemctl --failed``.

    Returns an empty list when the command exits non-zero.
    """
    rc, out, _ = run(["systemctl", "--failed", "--no-legend", "--plain", "--no-pager"], timeout=5)
    if rc != 0:
        return []
    # First whitespace-separated column of every non-blank row.
    first_columns = (row.split()[0] for row in out.splitlines() if row.strip())
    return [name for name in first_columns if name != "0"]
356
+
357
+
358
def get_system_running_state() -> Optional[str]:
    """Return the output of `systemctl is-system-running`.

    Typical values are running / degraded / maintenance. ``None`` is returned
    when the command is missing (127), timed out (124), reported a negative
    return code, or printed nothing.
    """
    rc, out, _ = run(["systemctl", "is-system-running"], timeout=5)
    if rc < 0 or rc in (124, 127):
        return None
    state = out.strip()
    return state if state else None
365
+
366
+
367
def get_enabled_inactive_units(limit: int = 200) -> List[Dict[str, str]]:
    """Enabled .service units whose ActiveState is not active and Type is not oneshot.

    Template units (``name@.service``) are skipped: they are not concrete
    units, and passing one to ``systemctl show`` makes the whole command
    exit non-zero.

    Args:
        limit: Maximum number of enabled units to inspect.

    Returns:
        A list of dicts {name, state, sub, result}. Empty list when systemctl
        is unavailable or all enabled services are running fine.
    """
    rc, out, _ = run(
        ["systemctl", "list-unit-files", "--type=service", "--state=enabled", "--no-legend", "--no-pager"],
        timeout=8,
    )
    if rc != 0 or not out:
        return []
    enabled: List[str] = []
    for line in out.splitlines():
        parts = line.split()
        if not parts:
            continue
        name = parts[0]
        if "@." in name:
            # Uninstantiated template; there is nothing to query.
            continue
        enabled.append(name)
    enabled = enabled[:limit]
    if not enabled:
        return []

    rc, out, _ = run(
        ["systemctl", "show", "--property=Id,ActiveState,SubState,Type,Result"] + enabled,
        timeout=10,
    )
    # One unqueryable unit turns the exit status non-zero even though the
    # others were printed, so use whatever output there is.
    if not out:
        return []

    inactive: List[Dict[str, str]] = []
    current: Dict[str, str] = {}
    # Units are emitted as blank-line-separated blocks of KEY=VALUE lines.
    for line in out.splitlines():
        if not line.strip():
            if current:
                _maybe_collect_inactive(current, inactive)
            current = {}
            continue
        key, _, value = line.partition("=")
        current[key.strip()] = value.strip()
    if current:
        _maybe_collect_inactive(current, inactive)
    return inactive
408
+
409
+
410
+ def _maybe_collect_inactive(unit: Dict[str, str], out: List[Dict[str, str]]) -> None:
411
+ """Append unit to `out` if it is enabled-but-not-running."""
412
+ if unit.get("Type") in ("oneshot",):
413
+ return
414
+ state = unit.get("ActiveState")
415
+ if state in ("active", "activating", "reloading"):
416
+ return
417
+ name = unit.get("Id")
418
+ if not name:
419
+ return
420
+ out.append(
421
+ {
422
+ "name": name,
423
+ "state": state or "unknown",
424
+ "sub": unit.get("SubState", ""),
425
+ "result": unit.get("Result", ""),
426
+ }
427
+ )
428
+
429
+
430
def get_reboot_required() -> Optional[str]:
    """Describe a pending reboot flagged via /var/run/reboot-required.

    Returns ``None`` when the marker file is absent. Otherwise returns a
    message which, when the companion ``.pkgs`` file has content, includes
    the package count and up to three package names.
    """
    if not os.path.exists("/var/run/reboot-required"):
        return None
    pkg_lines = read_file("/var/run/reboot-required.pkgs").strip().splitlines()
    if not pkg_lines:
        return "reboot required"
    template = "reboot required ({} pkg(s)): {}"
    return template.format(len(pkg_lines), ", ".join(pkg_lines[:3]))
440
+
441
+
442
def get_time_sync_status() -> Dict[str, Any]:
    """Report clock synchronisation state as seen by ``timedatectl``.

    Returns:
        ``{"synchronized", "ntp_active", "source"}``. Both flags are ``None``
        when timedatectl is missing or fails; ``source`` then says which.
    """
    rc, out, _ = run(
        ["timedatectl", "show", "-p", "NTPSynchronized", "-p", "NTP", "-p", "CanNTP"],
        timeout=5,
    )
    if rc != 0 or not out:
        # 127 is the "executable not found" code; anything else is a plain failure.
        source = "timedatectl unavailable" if rc == 127 else "timedatectl error"
        return {"synchronized": None, "ntp_active": None, "source": source}
    pairs = (row.partition("=") for row in out.splitlines())
    fields: Dict[str, str] = {key.strip(): value.strip() for key, _sep, value in pairs}
    return {
        "synchronized": fields.get("NTPSynchronized") == "yes",
        "ntp_active": fields.get("NTP") == "yes",
        "source": "timedatectl",
    }
461
+
462
+
463
def get_readonly_mounts() -> List[str]:
    """Return mounts that are read-only where ``ro`` is unexpected.

    Filesystems that are read-only by design (squashfs, iso9660, udf) and
    mounts at or below system locations such as /proc, /sys, /dev, /run or
    /snap are ignored. Prefixes are matched on whole path components, so a
    mount such as ``/system`` is not hidden by the ``/sys`` rule.

    Returns:
        Strings of the form ``"<target> (<fstype>)"``, in /proc/mounts order.
    """
    expected_ro = {"squashfs", "iso9660", "udf"}
    skip_targets = ("/snap", "/proc", "/sys", "/dev", "/run", "/var/lib/docker", "/var/lib/snapd")
    result: List[str] = []
    for line in read_file(PROC_MOUNTS).splitlines():
        parts = line.split()
        if len(parts) < 4:
            continue
        target, fs_type, opts = parts[1], parts[2], parts[3]
        if fs_type in expected_ro:
            continue
        if any(target == prefix or target.startswith(prefix + "/") for prefix in skip_targets):
            continue
        if "ro" in opts.split(","):
            result.append(f"{target} ({fs_type})")
    return result
482
+
483
+
484
def get_stuck_processes() -> List[Dict[str, Any]]:
    """Return processes in D state (uninterruptible sleep, often IO-stuck).

    Uses psutil when available; on any psutil failure (or without psutil)
    parses ``ps`` output. Returns an empty list when ``ps`` fails.

    Returns:
        A list of ``{"pid", "name"}`` dicts.
    """
    if HAS_PSUTIL:
        try:
            return [
                {"pid": proc.info["pid"], "name": proc.info.get("name") or ""}
                for proc in psutil.process_iter(["pid", "name", "status"])
                if proc.info.get("status") == psutil.STATUS_DISK_SLEEP
            ]
        except Exception:
            pass
    rc, out, _ = run(["ps", "-eo", "pid,stat,comm", "--no-headers"], timeout=5)
    if rc != 0:
        return []
    stuck: List[Dict[str, Any]] = []
    for row in out.splitlines():
        fields = row.split(None, 2)
        if len(fields) < 3 or not fields[1].startswith("D"):
            continue
        try:
            pid = int(fields[0])
        except ValueError:
            continue
        stuck.append({"pid": pid, "name": fields[2]})
    return stuck
510
+
511
+
512
def get_iowait_percent(sample_seconds: float = 0.3) -> Optional[float]:
    """Sample the aggregate ``cpu`` line of /proc/stat twice and return iowait percent.

    Args:
        sample_seconds: Time to sleep between the two samples.

    Returns:
        iowait as a percentage of all CPU time elapsed between the samples,
        rounded to two decimals; 0.0 if no time elapsed; ``None`` if
        /proc/stat is unavailable or malformed.
    """

    def _snapshot() -> Optional[List[int]]:
        lines = read_file(PROC_STAT).splitlines()
        if not lines:
            return None
        parts = lines[0].split()
        # Need the "cpu" tag plus at least user..iowait (five counters).
        if len(parts) < 6 or parts[0] != "cpu":
            return None
        try:
            # user, nice, system, idle, iowait, irq, softirq, steal.
            # guest/guest_nice (fields 9-10) are already included in
            # user/nice, so summing them too would inflate the total.
            return [int(p) for p in parts[1:9]]
        except ValueError:
            return None

    first = _snapshot()
    if first is None:
        return None
    time.sleep(sample_seconds)
    second = _snapshot()
    if second is None:
        return None
    deltas = [b - a for a, b in zip(first, second)]
    total = sum(deltas)
    if total <= 0:
        return 0.0
    iowait = deltas[4] if len(deltas) > 4 else 0
    return round(100.0 * iowait / total, 2)
540
+
541
+
542
def get_service_restart_counts(threshold: int = 3, limit: int = 200) -> List[Dict[str, Any]]:
    """Active .service units with NRestarts >= threshold.

    At most `limit` active services are inspected.

    Returns:
        ``{"name", "restarts"}`` dicts sorted by restart count, highest
        first. Empty when systemctl fails or nothing reaches the threshold.
    """
    rc, out, _ = run(
        ["systemctl", "list-units", "--type=service", "--state=active", "--no-legend", "--no-pager", "--plain"],
        timeout=8,
    )
    if rc != 0 or not out:
        return []
    rows = (row.split() for row in out.splitlines())
    units = [cols[0] for cols in rows if cols and cols[0].endswith(".service")][:limit]
    if not units:
        return []
    rc, out, _ = run(["systemctl", "show", "-p", "Id", "-p", "NRestarts"] + units, timeout=10)
    if rc != 0 or not out:
        return []
    found: List[Dict[str, Any]] = []
    record: Dict[str, str] = {}
    # Blank lines separate units; the trailing "" sentinel flushes the last one.
    for row in out.splitlines() + [""]:
        if row.strip():
            key, _, value = row.partition("=")
            record[key.strip()] = value.strip()
        else:
            _maybe_collect_restarts(record, threshold, found)
            record = {}
    return sorted(found, key=lambda entry: entry["restarts"], reverse=True)
578
+
579
+
580
+ def _maybe_collect_restarts(unit: Dict[str, str], threshold: int, out: List[Dict[str, Any]]) -> None:
581
+ try:
582
+ n = int(unit.get("NRestarts", "0"))
583
+ except ValueError:
584
+ return
585
+ name = unit.get("Id")
586
+ if not name or n < threshold:
587
+ return
588
+ out.append({"name": name, "restarts": n})
589
+
590
+
591
def get_network_errors() -> List[Dict[str, Any]]:
    """Read kernel error/drop counters for each interface under /sys/class/net.

    The loopback interface and anything rejected by ``_is_noisy_iface`` are
    skipped, as are interfaces without a ``statistics`` directory.

    Returns:
        One dict per interface with at least one non-zero counter, holding
        ``iface``, the four counters and their ``total``.
    """
    base = "/sys/class/net"
    if not os.path.isdir(base):
        return []
    try:
        names = sorted(os.listdir(base))
    except OSError:
        return []
    counters = ("rx_errors", "tx_errors", "rx_dropped", "tx_dropped")
    results: List[Dict[str, Any]] = []
    for iface in names:
        if iface == "lo" or _is_noisy_iface(iface):
            continue
        stats_dir = os.path.join(base, iface, "statistics")
        if not os.path.isdir(stats_dir):
            continue
        values: Dict[str, int] = {}
        for counter in counters:
            # An unreadable or non-numeric counter file counts as 0.
            try:
                values[counter] = int(read_file(os.path.join(stats_dir, counter)).strip() or "0")
            except (ValueError, OSError):
                values[counter] = 0
        total = sum(values.values())
        if total <= 0:
            continue
        results.append({"iface": iface, **values, "total": total})
    return results
637
+
638
+
639
def probe_http(url: str, timeout: float = 3.0) -> Dict[str, Any]:
    """HEAD-probe a URL. Returns {url, status_code, latency_ms, error}.

    Uses stdlib http.client to avoid requests-as-a-dep. Treats HTTP redirects
    as success (status_code is the first response code we see).

    This function does not raise: a malformed URL (for example a
    non-numeric port) and any connection failure are reported through the
    ``error`` field with ``status_code`` and ``latency_ms`` set to ``None``.

    Args:
        url: An ``http://`` or ``https://`` URL.
        timeout: Socket timeout in seconds.
    """
    import http.client
    from urllib.parse import urlparse

    def _failure(message: str) -> Dict[str, Any]:
        return {"url": url, "status_code": None, "latency_ms": None, "error": message}

    try:
        # urlparse() and the .port property can both raise ValueError on
        # malformed input, so they must run inside the try block.
        parsed = urlparse(url)
        if parsed.scheme not in ("http", "https"):
            return _failure(f"unsupported scheme: {parsed.scheme}")
        host = parsed.hostname or "localhost"
        port = parsed.port or (443 if parsed.scheme == "https" else 80)
        path = parsed.path or "/"
        if parsed.query:
            path += f"?{parsed.query}"

        started = time.monotonic()
        if parsed.scheme == "https":
            conn = http.client.HTTPSConnection(host, port, timeout=timeout)
        else:
            conn = http.client.HTTPConnection(host, port, timeout=timeout)
        try:
            conn.request("HEAD", path)
            resp = conn.getresponse()
            latency = (time.monotonic() - started) * 1000.0
            return {"url": url, "status_code": resp.status, "latency_ms": round(latency, 1), "error": None}
        finally:
            conn.close()
    except Exception as exc:
        return _failure(f"{type(exc).__name__}: {exc}")
673
+
674
+
675
def probe_tcp(target: str, timeout: float = 3.0) -> Dict[str, Any]:
    """Time a TCP connect to ``host:port``. Returns {target, latency_ms, error}.

    The port is whatever follows the last colon; square brackets around the
    host are removed. Malformed targets and connection failures are reported
    via ``error`` with ``latency_ms`` set to ``None``.
    """
    import socket as _socket

    def _report(latency_ms: Optional[float], error: Optional[str]) -> Dict[str, Any]:
        return {"target": target, "latency_ms": latency_ms, "error": error}

    host, colon, port_s = target.rpartition(":")
    if not colon:
        return _report(None, "expected host:port")
    try:
        port = int(port_s)
    except ValueError:
        return _report(None, f"invalid port: {port_s}")
    started = time.monotonic()
    try:
        with _socket.create_connection((host.strip("[]"), port), timeout=timeout):
            pass
        elapsed_ms = (time.monotonic() - started) * 1000.0
    except Exception as exc:
        return _report(None, f"{type(exc).__name__}: {exc}")
    return _report(round(elapsed_ms, 1), None)
694
+
695
+
696
def get_block_devices() -> List[str]:
    """Return /dev paths of the whole-disk devices reported by ``lsblk``.

    Rows whose TYPE is not ``disk`` and names starting with ``loop`` are
    excluded. Returns an empty list when lsblk is missing or fails.
    """
    if not shutil.which("lsblk"):
        return []
    rc, out, _ = run(["lsblk", "-dn", "-o", "NAME,TYPE"], timeout=5)
    if rc != 0:
        return []
    rows = (row.split() for row in out.splitlines())
    return [
        f"/dev/{cols[0]}"
        for cols in rows
        if len(cols) >= 2 and cols[1] == "disk" and not cols[0].startswith("loop")
    ]
711
+
712
+
713
def get_smart_status() -> Optional[List[Dict[str, Any]]]:
    """Per-device SMART health (`smartctl -H -j`).

    Returns:
        ``None`` when smartctl is not installed. Otherwise a list of
        ``{"device", "passed", "exit_code", "message"}`` dicts; the list is
        empty when no block devices were found or none produced usable JSON.
    """
    import json as _json

    if not shutil.which("smartctl"):
        return None
    devices = get_block_devices()
    if not devices:
        return []

    reports: List[Dict[str, Any]] = []
    for dev in devices:
        rc, out, _ = run(["smartctl", "-H", "-j", dev], timeout=8)
        # The smartctl exit code is a bitfield (bit 0 = command-line parse
        # failure, bit 1 = device open failed, bit 2 = a SMART command
        # failed). JSON is parsed even on a non-zero code.
        try:
            data = _json.loads(out)
        except ValueError:  # JSONDecodeError is a ValueError subclass
            continue
        passed = (data.get("smart_status") or {}).get("passed")
        texts = (entry.get("string") for entry in (data.get("messages") or []))
        message = "; ".join(text for text in texts if text)
        if passed is None and rc != 0:
            # device probably doesn't support SMART; skip silently
            continue
        reports.append(
            {
                "device": dev,
                "passed": None if passed is None else bool(passed),
                "exit_code": rc,
                "message": message,
            }
        )
    return reports
754
+
755
+
756
def get_temperatures() -> List[Dict[str, Any]]:
    """Read temperatures from `/sys/class/hwmon/*/temp*_input`.

    Each raw value is in millidegrees Celsius. Readings below -50 °C or above
    200 °C are treated as broken sensors and dropped. Within a hwmon device,
    inputs are reported in numeric order (temp2 before temp10).

    Returns:
        A list of {sensor, label, celsius}. ``label`` falls back to
        ``temp<N>`` when the label file is missing, unreadable or empty.
    """
    base = "/sys/class/hwmon"
    if not os.path.isdir(base):
        return []
    try:
        hwmons = sorted(os.listdir(base))
    except OSError:
        return []
    input_pattern = re.compile(r"^temp(\d+)_input$")
    result: List[Dict[str, Any]] = []
    for hwmon in hwmons:
        hwmon_dir = os.path.join(base, hwmon)
        sensor_name = read_file(os.path.join(hwmon_dir, "name")).strip() or hwmon
        try:
            files = os.listdir(hwmon_dir)
        except OSError:
            continue
        inputs = []
        for filename in files:
            match = input_pattern.match(filename)
            if match:
                inputs.append((int(match.group(1)), match.group(1), filename))
        # Sorting the tuples orders inputs by their numeric index.
        for _, idx, filename in sorted(inputs):
            value_raw = read_file(os.path.join(hwmon_dir, filename)).strip()
            if not value_raw:
                continue
            try:
                value_mc = int(value_raw)
            except ValueError:
                continue
            celsius = value_mc / 1000.0
            # Filter obviously-broken sensors: below -50°C or above 200°C.
            if celsius < -50 or celsius > 200:
                continue
            # read_file returns "" for a missing or unreadable label file, so
            # a single `or` covers missing and empty labels alike.
            label = read_file(os.path.join(hwmon_dir, f"temp{idx}_label")).strip() or f"temp{idx}"
            result.append(
                {
                    "sensor": sensor_name,
                    "label": label,
                    "celsius": round(celsius, 1),
                }
            )
    return result
804
+
805
+
806
def resolve_hostname(host: str, timeout: float = 2.0) -> Optional[float]:
    """Resolve a hostname via the system resolver. Returns ms elapsed, or None.

    The lookup runs in a daemon worker thread that is abandoned once
    `timeout` seconds have passed, so a broken resolver cannot hang the
    audit. (``socket.setdefaulttimeout`` is not used: it is a process-wide
    setting for new socket objects and does not bound ``gethostbyname``.)
    We deliberately do NOT use any third-party resolver — we want to test
    whatever DNS the host itself is configured to use.

    Args:
        host: Host name (or IP literal) to resolve.
        timeout: Maximum number of seconds to wait for the resolver.

    Returns:
        Elapsed milliseconds on success; ``None`` if resolution failed, the
        name was not encodable, or the deadline passed.
    """
    import socket
    import threading

    elapsed_ms: List[float] = []

    def _lookup() -> None:
        started = time.monotonic()
        try:
            socket.gethostbyname(host)
        except (OSError, UnicodeError):
            # gaierror/herror/timeout are OSError subclasses; UnicodeError
            # covers names that cannot be IDNA-encoded.
            return
        elapsed_ms.append((time.monotonic() - started) * 1000.0)

    # Daemon thread: a lookup stuck in the resolver must not block exit.
    worker = threading.Thread(target=_lookup, name="wtftools-dns-probe", daemon=True)
    worker.start()
    worker.join(timeout)
    if worker.is_alive() or not elapsed_ms:
        return None
    return elapsed_ms[0]
825
+
826
+
827
def get_fail2ban_jails() -> Optional[List[Dict[str, Any]]]:
    """List of {name, banned, total} per fail2ban jail.

    Returns:
        ``None`` if fail2ban-client is missing or ``fail2ban-client status``
        fails or prints nothing. An empty list if no jail names are listed.
        Jails whose individual status query fails are left out.
    """
    if not shutil.which("fail2ban-client"):
        return None
    rc, out, _ = run(["fail2ban-client", "status"], timeout=5)
    if rc != 0 or not out:
        return None

    # Example line: `|- Jail list: sshd, recidive`
    jail_line = next((row for row in out.splitlines() if "Jail list" in row), None)
    if jail_line is None:
        return []
    jail_names = [part.strip() for part in jail_line.partition(":")[2].split(",") if part.strip()]
    if not jail_names:
        return []

    def _count_after_colon(text: str) -> int:
        try:
            return int(text.partition(":")[2].strip())
        except (ValueError, IndexError):
            return 0

    jails: List[Dict[str, Any]] = []
    for name in jail_names:
        rc, jail_out, _ = run(["fail2ban-client", "status", name], timeout=5)
        if rc != 0:
            continue
        banned = 0
        total = 0
        for row in jail_out.splitlines():
            # Strip the tree-drawing prefixes (`|-`, `\`-`) before matching the key.
            stripped = row.lstrip(" |`-\t")
            if stripped.startswith("Currently banned:"):
                banned = _count_after_colon(stripped)
            elif stripped.startswith("Total banned:"):
                total = _count_after_colon(stripped)
        jails.append({"name": name, "banned": banned, "total": total})
    return jails
872
+
873
+
874
def get_chrony_offset() -> Optional[float]:
    """Return the absolute "System time" offset in seconds from `chronyc tracking`.

    Returns ``None`` if chronyc is unavailable, the command fails, or no
    ``System time`` line with a ``<number> seconds`` value is found.
    """
    if not shutil.which("chronyc"):
        return None
    rc, out, _ = run(["chronyc", "tracking"], timeout=5)
    if rc != 0 or not out:
        return None
    # Line example:
    #   "System time     : 0.000123456 seconds slow of NTP time"
    offset_pattern = r"([+-]?\d+(?:\.\d+)?)\s+seconds"
    for row in out.splitlines():
        if "System time" in row:
            found = re.search(offset_pattern, row)
            if found is None:
                continue
            try:
                return abs(float(found.group(1)))
            except ValueError:
                return None
    return None
898
+
899
+
900
def get_docker_problem_containers() -> Optional[List[Dict[str, Any]]]:
    """List containers that are unhealthy or restart-looping.

    Returns:
        ``None`` when the docker CLI is missing or ``docker ps`` fails.
        Otherwise a list of ``{"name", "state", "status", "problem"}`` dicts,
        empty when nothing is misbehaving.
    """
    if not shutil.which("docker"):
        return None
    rc, out, _ = run(
        ["docker", "ps", "-a", "--format", "{{.Names}}\t{{.Status}}\t{{.State}}"],
        timeout=8,
    )
    if rc != 0:
        return None
    problems: List[Dict[str, Any]] = []
    for row in out.splitlines():
        fields = row.split("\t")
        if len(fields) < 3:
            continue
        name, status, state = fields[:3]
        if "unhealthy" in status.lower():
            problem = "unhealthy"
        elif state.lower() == "restarting":
            problem = "restarting"
        else:
            # Exited containers are often intentional (init/one-shot jobs).
            # Telling a crash from a finished job needs `docker inspect`,
            # so they are not reported.
            continue
        problems.append(
            {
                "name": name,
                "state": state,
                "status": status,
                "problem": problem,
            }
        )
    return problems
943
+
944
+
945
def get_conntrack_usage() -> Optional[Tuple[int, int]]:
    """Return (current_entries, max_entries) for the netfilter conntrack table.

    Two candidate locations under /proc/sys/net are tried in order; the first
    one where both files hold integers wins. Returns ``None`` if neither does.
    """
    candidates = (
        ("/proc/sys/net/netfilter/nf_conntrack_count", "/proc/sys/net/netfilter/nf_conntrack_max"),
        ("/proc/sys/net/nf_conntrack_count", "/proc/sys/net/nf_conntrack_max"),
    )
    for count_path, max_path in candidates:
        raw_values = [read_file(path).strip() for path in (count_path, max_path)]
        if not all(raw_values):
            continue
        try:
            current, maximum = (int(raw) for raw in raw_values)
        except ValueError:
            continue
        return current, maximum
    return None
964
+
965
+
966
def get_top_paths_in(directory: str, limit: int = 10) -> List[Dict[str, Any]]:
    """Return top-N largest immediate subdirectories of `directory` (du -d1).

    Used by `wtf explain --deep` to surface "who's eating the disk" when a
    disk-fill warning fires. Bounded by 15s — on huge trees `du` can run long.

    `du` exits non-zero as soon as one subdirectory is unreadable, while
    still printing sizes for everything else, so partial output is used
    rather than discarded.

    Args:
        directory: Directory whose direct children are measured.
        limit: Maximum number of entries to return.

    Returns:
        ``{"path", "bytes"}`` dicts sorted by size, largest first. Empty when
        `du` is missing, `directory` is not a directory, or nothing was printed.
    """
    import shutil

    if not shutil.which("du") or not os.path.isdir(directory):
        return []
    # --block-size=1 → bytes, -d1 → only direct children, "--" → a directory
    # name starting with "-" is not parsed as an option.
    _rc, out, _ = run(["du", "-d1", "--block-size=1", "--", directory], timeout=15)
    if not out:
        return []
    results: List[Dict[str, Any]] = []
    for line in out.splitlines():
        # Split once so that paths containing a tab are kept intact.
        parts = line.split("\t", 1)
        if len(parts) != 2:
            continue
        try:
            size = int(parts[0])
        except ValueError:
            continue
        path = parts[1]
        if path == directory:
            continue  # skip the directory itself (du -d1 emits it)
        results.append({"path": path, "bytes": size})
    results.sort(key=lambda r: r["bytes"], reverse=True)
    return results[:limit]
995
+
996
+
997
def get_largest_files(directory: str, limit: int = 5, min_size_mb: int = 100) -> List[Dict[str, Any]]:
    """Find regular files under `directory` larger than min_size_mb.

    Runs `find` with -xdev and a `-size +<min_size_mb>M` filter, bounded by a
    20s timeout.

    Args:
        directory: Root of the search.
        limit: Maximum number of entries to return.
        min_size_mb: Size threshold passed to find's `-size +NM` test.

    Returns:
        A list of {"path": str, "bytes": int} dicts, largest first. Empty when
        `find` is not installed, `directory` is not a directory, or nothing
        was printed (including the timeout case).
    """
    import shutil

    if not shutil.which("find") or not os.path.isdir(directory):
        return []
    _rc, out, _ = run(
        ["find", directory, "-xdev", "-type", "f", "-size", f"+{min_size_mb}M", "-printf", "%s\t%p\n"],
        timeout=20,
    )
    # The exit status is deliberately ignored: find exits non-zero when it
    # hits any unreadable directory but still prints every match it reached.
    # A timeout or a missing binary yields empty output from run().
    if not out:
        return []
    results: List[Dict[str, Any]] = []
    for line in out.splitlines():
        size_text, sep, path = line.partition("\t")
        if not sep:
            continue
        try:
            size = int(size_text)
        except ValueError:
            continue
        results.append({"path": path, "bytes": size})
    results.sort(key=lambda r: r["bytes"], reverse=True)
    return results[:limit]
1021
+
1022
+
1023
def get_docker_disk_usage() -> Optional[List[Dict[str, str]]]:
    """Parse `docker system df` into one dict per output row.

    Returns:
        A list of dicts with keys "type", "count", "size" and "reclaimable"
        (all strings, exactly as docker printed them). None when the docker
        binary is missing, the command fails, or it prints nothing.
    """
    import shutil

    if not shutil.which("docker"):
        return None
    exit_code, stdout, _ = run(
        ["docker", "system", "df", "--format", "{{.Type}}\t{{.TotalCount}}\t{{.Size}}\t{{.Reclaimable}}"],
        timeout=8,
    )
    if exit_code != 0 or not stdout:
        return None
    column_names = ("type", "count", "size", "reclaimable")
    split_rows = (row.split("\t") for row in stdout.splitlines())
    # zip() stops at the four named columns, so extra fields are ignored.
    return [dict(zip(column_names, fields)) for fields in split_rows if len(fields) >= 4]
1048
+
1049
+
1050
def get_docker_container_sizes(limit: int = 10) -> Optional[List[Dict[str, str]]]:
    """Per-container size breakdown (`docker ps -as`).

    The `.Size` field reports `Rw+Vsize` — read-write layer plus base image.

    Args:
        limit: Maximum number of rows to return (in docker's output order).

    Returns:
        A list of dicts with keys "name", "size", "image" and "status".
        An empty list when docker answered but there are no containers.
        None when the docker binary is missing or the command failed.
    """
    import shutil

    if not shutil.which("docker"):
        return None
    rc, out, _ = run(
        ["docker", "ps", "-as", "--format", "{{.Names}}\t{{.Size}}\t{{.Image}}\t{{.Status}}"],
        timeout=8,
    )
    # Only a failing command means "unreachable". Empty output with rc == 0
    # simply means there are no containers, which is reported as [].
    if rc != 0:
        return None
    rows: List[Dict[str, str]] = []
    for line in out.splitlines():
        parts = line.split("\t")
        if len(parts) >= 4:
            rows.append(
                {
                    "name": parts[0],
                    "size": parts[1],
                    "image": parts[2],
                    "status": parts[3],
                }
            )
    return rows[:limit]
1078
+
1079
+
1080
def get_docker_log_sizes(limit: int = 5) -> Optional[List[Dict[str, Any]]]:
    """Per-container log-file size (json-file driver).

    Reads each container's LogPath via `docker inspect`, stats it on the host
    filesystem. Requires the wtf process to have read access to the path —
    typically only root or the docker group. Containers whose log path is
    empty, "<no value>", or cannot be stat'ed are skipped.

    Args:
        limit: Maximum number of entries to return.

    Returns:
        A list of {"name", "log_path", "bytes"} dicts, largest first.
        An empty list when docker answered but there are no containers.
        None when the docker binary is missing or a docker command failed.
    """
    import shutil

    if not shutil.which("docker"):
        return None
    rc, out, _ = run(["docker", "ps", "-aq"], timeout=5)
    # Only a failing command means "unreachable"; empty output with rc == 0
    # means zero containers and falls through to the `return []` below.
    if rc != 0:
        return None
    ids = [i for i in out.splitlines() if i.strip()]
    if not ids:
        return []
    rc, out, _ = run(
        ["docker", "inspect", "--format", "{{.Name}}\t{{.LogPath}}"] + ids,
        timeout=10,
    )
    if rc != 0 or not out:
        return None
    results: List[Dict[str, Any]] = []
    for line in out.splitlines():
        parts = line.split("\t")
        if len(parts) != 2:
            continue
        # `docker inspect` prints names with a leading slash.
        name = parts[0].lstrip("/")
        log_path = parts[1]
        if not log_path or log_path == "<no value>":
            continue
        try:
            size = os.path.getsize(log_path)
        except OSError:
            continue
        results.append({"name": name, "log_path": log_path, "bytes": size})
    results.sort(key=lambda r: r["bytes"], reverse=True)
    return results[:limit]
1119
+
1120
+
1121
def get_journal_disk_usage() -> Optional[int]:
    """Return the byte count reported by `journalctl --disk-usage`.

    The first number in the output is taken as the size; an optional K/M/G/T
    suffix is interpreted as a power of 1024. Returns None when the command
    fails, prints nothing, or contains no number.
    """
    exit_code, stdout, _ = run(["journalctl", "--disk-usage"], timeout=5)
    if exit_code != 0 or not stdout:
        return None
    # Output examples:
    #   "Archived and active journals take up 1.2G in the file system."
    #   "Archived and active journals take up 824.0M in the file system."
    #   "Archived and active journals take up 12.0G on disk."
    found = re.search(r"([0-9]+(?:\.[0-9]+)?)\s*([KMGT])?B?", stdout)
    if found is None:
        return None
    number_text, suffix = found.groups()
    try:
        amount = float(number_text)
    except ValueError:
        return None
    # K, M, G, T → 1024**1 .. 1024**4; no suffix means plain bytes.
    scale = 1024 ** ("KMGT".index(suffix) + 1) if suffix else 1
    return int(amount * scale)
1140
+
1141
+
1142
def get_pressure(resource: str) -> Optional[Dict[str, Dict[str, float]]]:
    """Parse /proc/pressure/<resource> (PSI) for cpu, memory or io.

    Each line starting with "some" or "full" becomes one entry whose value
    maps the line's key=value fields (e.g. avg10, avg60, avg300, total) to
    floats; fields that do not parse as a float are dropped.

    Returns None for an unsupported resource name, an unreadable/empty file,
    or when no "some"/"full" line was found.
    """
    if resource not in ("cpu", "memory", "io"):
        return None
    text = read_file(f"/proc/pressure/{resource}")
    if not text:
        return None
    parsed: Dict[str, Dict[str, float]] = {}
    for raw_line in text.splitlines():
        tokens = raw_line.split()
        if not tokens or tokens[0] not in ("some", "full"):
            continue
        metrics: Dict[str, float] = {}
        for token in tokens[1:]:
            label, _, number = token.partition("=")
            try:
                metrics[label] = float(number)
            except ValueError:
                pass
        parsed[tokens[0]] = metrics
    return parsed or None
1170
+
1171
+
1172
def get_kernel_taint() -> Optional[int]:
    """Return the integer stored in /proc/sys/kernel/tainted (0 = clean).

    None when the file is unreadable, empty, or not an integer.
    """
    text = read_file("/proc/sys/kernel/tainted").strip()
    try:
        return int(text) if text else None
    except ValueError:
        return None
1181
+
1182
+
1183
# Bit position → flag name for the bitmask in /proc/sys/kernel/tainted.
# Names follow the kernel's TAINT_* constants with the prefix dropped.
# Bits 16-18 (AUX, RANDSTRUCT, TEST) are included so that taints set by
# distro-auxiliary code, struct randomization, or in-kernel test runs are
# not silently dropped when the mask is decoded.
KERNEL_TAINT_BITS = {
    0: "PROPRIETARY_MODULE",
    1: "FORCED_MODULE",
    2: "UNSAFE_SMP",
    3: "FORCED_RMMOD",
    4: "MACHINE_CHECK",
    5: "BAD_PAGE",
    6: "USER",
    7: "DIE",
    8: "OVERRIDDEN_ACPI_TABLE",
    9: "WARN",
    10: "CRAP",
    11: "FIRMWARE_WORKAROUND",
    12: "OOT_MODULE",
    13: "UNSIGNED_MODULE",
    14: "SOFTLOCKUP",
    15: "LIVEPATCH",
    16: "AUX",
    17: "RANDSTRUCT",
    18: "TEST",
}
1201
+
1202
+
1203
def decode_kernel_taint(value: int) -> List[str]:
    """Translate a kernel taint bitmask into the names of its set flags.

    Names come from KERNEL_TAINT_BITS, in that table's order; set bits that
    the table does not list are ignored.
    """
    flags: List[str] = []
    for bit, label in KERNEL_TAINT_BITS.items():
        if (value >> bit) & 1:
            flags.append(label)
    return flags
1206
+
1207
+
1208
def _collect_cert_candidates(roots: List[str], max_files: int) -> List[str]:
    """Return up to `max_files` certificate-looking file paths under `roots`."""
    found: List[str] = []
    if max_files <= 0:
        return found
    for root in roots:
        if not os.path.isdir(root):
            continue
        for dirpath, _, filenames in os.walk(root, followlinks=False):
            for filename in filenames:
                low = filename.lower()
                if not low.endswith((".pem", ".crt", ".cert")):
                    continue
                # Skip private keys masquerading as .pem files.
                looks_like_key = "privkey" in low or "private" in low or "key" in low
                if looks_like_key and "pubkey" not in low:
                    continue
                found.append(os.path.join(dirpath, filename))
                if len(found) >= max_files:
                    return found
    return found


def get_certificate_expirations(
    roots: Optional[List[str]] = None,
    max_files: int = 50,
) -> List[Dict[str, Any]]:
    """Walk well-known TLS-cert roots and return list of {path, days_left}.

    Returns empty list when openssl is unavailable or no cert dirs exist.
    Bounded by max_files to avoid runaway IO on misconfigured hosts.

    Args:
        roots: Directories to scan recursively. Defaults to a built-in list
            of common server-certificate locations.
        max_files: Upper bound on candidate files inspected; values <= 0
            scan nothing.

    Returns:
        A list of {"path": str, "days_left": int} dicts sorted by ascending
        days_left. Files that cannot be stat'ed, duplicates of an already
        seen file, and files openssl cannot parse are omitted.
    """
    import shutil

    if not shutil.which("openssl"):
        return []
    if roots is None:
        # NB: /etc/ssl/certs is the system CA bundle (root CAs ship with
        # far-future or already-past notAfter dates intentionally) — scanning
        # it produces spam, so we focus on server-cert locations only.
        roots = [
            "/etc/letsencrypt/live",
            "/etc/letsencrypt/archive",
            "/etc/nginx/ssl",
            "/etc/nginx/certs",
            "/etc/apache2/ssl",
            "/etc/haproxy/certs",
            "/etc/pki/tls/private",
            "/etc/dovecot/certs",
            "/etc/postfix/certs",
            "/etc/ssl/private",
        ]
    candidates = _collect_cert_candidates(roots, max_files)

    results: List[Dict[str, Any]] = []
    seen_files: set = set()
    for path in candidates:
        try:
            st = os.stat(path)
        except OSError:
            continue
        # os.stat follows symlinks, so a link and its target count as one
        # file. Inode numbers are only unique per device, hence the
        # (st_dev, st_ino) pair rather than st_ino alone.
        identity = (st.st_dev, st.st_ino)
        if identity in seen_files:
            continue
        seen_files.add(identity)
        days = _parse_cert_expiry_days(path)
        if days is None:
            continue
        results.append({"path": path, "days_left": days})
    results.sort(key=lambda r: r["days_left"])
    return results
1274
+
1275
+
1276
def _parse_cert_expiry_days(path: str) -> Optional[int]:
    """Ask openssl for the certificate's end date and return whole days left.

    The value is the `.days` of (notAfter - now), with the parsed timestamp
    treated as UTC, so it is negative once the certificate has expired.
    Returns None when openssl fails, prints no "key=value" line, or the date
    cannot be parsed.
    """
    exit_code, stdout, _ = run(["openssl", "x509", "-enddate", "-noout", "-in", path], timeout=3)
    if exit_code != 0 or "=" not in stdout:
        return None
    # Everything after the first "=" is the timestamp text.
    expiry_text = stdout.strip().partition("=")[2].strip()
    try:
        from datetime import datetime, timezone

        expires_at = datetime.strptime(expiry_text, "%b %d %H:%M:%S %Y %Z")
        remaining = expires_at.replace(tzinfo=timezone.utc) - datetime.now(timezone.utc)
        return remaining.days
    except (ValueError, ImportError):
        return None
1291
+
1292
+
1293
def _snap_tcp() -> Optional[Dict[str, int]]:
    """Read the Tcp counters from /proc/net/snmp as a name → value dict.

    The first "Tcp:" line supplies the column names and the second the
    values. Returns None when the file is unreadable, either line is missing
    or empty, or the two lines differ in length. Values that are not
    integers are left out of the result.
    """
    text = read_file("/proc/net/snmp")
    if not text:
        return None
    tcp_rows = [row.split()[1:] for row in text.splitlines() if row.startswith("Tcp:")]
    if len(tcp_rows) < 2:
        return None
    names, numbers = tcp_rows[0], tcp_rows[1]
    if not names or not numbers or len(names) != len(numbers):
        return None
    counters: Dict[str, int] = {}
    for name, number in zip(names, numbers):
        try:
            counters[name] = int(number)
        except ValueError:
            pass
    return counters
1317
+
1318
+
1319
def get_tcp_retransmit_rate(sample_seconds: float = 1.0) -> Optional[float]:
    """Measure TCP retransmissions as a percentage of segments sent.

    Takes two `_snap_tcp()` snapshots `sample_seconds` apart and compares the
    growth of RetransSegs with the growth of OutSegs, rounded to two decimals.

    Returns None when either snapshot is unavailable, and 0.0 when OutSegs
    did not grow during the window (the rate is undefined, which is treated
    as "no problem").
    """
    before = _snap_tcp()
    if before is None:
        return None
    time.sleep(sample_seconds)
    after = _snap_tcp()
    if after is None:
        return None

    def _growth(counter: str) -> int:
        return after.get(counter, 0) - before.get(counter, 0)

    sent = _growth("OutSegs")
    retransmitted = _growth("RetransSegs")
    if sent <= 0:
        return 0.0
    return round(100.0 * retransmitted / sent, 2)
1338
+
1339
+
1340
def get_service_details(unit: str) -> Optional[Dict[str, Any]]:
    """Fetch a fixed set of `systemctl show` properties for one unit.

    A unit name without a dot gets ".service" appended. The output is parsed
    as "Key=Value" lines into a dict of strings.

    Returns None when systemctl fails, prints nothing, reports no Id, or
    reports LoadState "not-found".
    """
    if "." not in unit:
        unit = f"{unit}.service"
    wanted_properties = (
        "Id",
        "Description",
        "LoadState",
        "ActiveState",
        "SubState",
        "Result",
        "UnitFileState",
        "MainPID",
        "NRestarts",
        "MemoryCurrent",
        "TasksCurrent",
        "ActiveEnterTimestamp",
        "ExecMainStartTimestamp",
        "FragmentPath",
    )
    command = ["systemctl", "show"]
    for prop in wanted_properties:
        command += ["-p", prop]
    command.append(unit)
    exit_code, stdout, _ = run(command, timeout=8)
    if exit_code != 0 or not stdout:
        return None
    fields: Dict[str, str] = {}
    for row in stdout.splitlines():
        name, _, text = row.partition("=")
        fields[name.strip()] = text.strip()
    if fields.get("LoadState") == "not-found" or not fields.get("Id"):
        return None
    return fields
1389
+
1390
+
1391
def get_service_journal(unit: str, lines: int = 20) -> List[str]:
    """Return the last `lines` journal entries for one unit, blanks removed.

    A unit name without a dot gets ".service" appended. Returns an empty list
    when journalctl fails or prints nothing.
    """
    unit_name = unit if "." in unit else f"{unit}.service"
    exit_code, stdout, _ = run(
        ["journalctl", "-u", unit_name, "-n", str(lines), "--no-pager", "-q"],
        timeout=8,
    )
    if exit_code != 0 or not stdout:
        return []
    entries: List[str] = []
    for entry in stdout.splitlines():
        if entry.strip():
            entries.append(entry)
    return entries
1399
+
1400
+
1401
def get_pid_count() -> Tuple[int, int]:
    """Return (process count, pid_max).

    The count is the number of all-digit entries in /proc; pid_max comes from
    /proc/sys/kernel/pid_max. Either value is 0 when it cannot be read.
    """
    limit_text = read_file("/proc/sys/kernel/pid_max").strip()
    try:
        pid_max = int(limit_text)
    except ValueError:
        pid_max = 0
    try:
        running = sum(1 for entry in os.listdir("/proc") if entry.isdigit())
    except OSError:
        running = 0
    return running, pid_max
1416
+
1417
+
1418
def get_oom_events(hours: int = 24) -> List[str]:
    """Collect kernel log lines that mention an out-of-memory kill.

    The kernel journal for the last `hours` hours is searched first. Only
    when journalctl fails or prints nothing is `dmesg -T` consulted instead
    (dmesg may need root, and is not limited to the time window).
    """
    markers = ("out of memory", "killed process", "oom-killer")

    def _matching(text: str) -> List[str]:
        # Case-insensitive substring match; matched lines are stripped.
        return [
            entry.strip()
            for entry in text.splitlines()
            if any(marker in entry.lower() for marker in markers)
        ]

    exit_code, stdout, _ = run(
        ["journalctl", "-k", "--since", f"{hours} hours ago", "--no-pager", "-q"],
        timeout=8,
    )
    if exit_code == 0 and stdout:
        return _matching(stdout)
    # Fallback: dmesg (may need root)
    exit_code, stdout, _ = run(["dmesg", "-T"], timeout=5)
    if exit_code != 0:
        return []
    return _matching(stdout)
1439
+
1440
+
1441
def get_recent_kernel_errors(hours: int = 24, limit: int = 5) -> List[str]:
    """Recent kernel error/critical lines from journal.

    Args:
        hours: Size of the look-back window passed to `journalctl --since`.
        limit: Maximum number of lines to return; the newest ones are kept.
            Values <= 0 return an empty list.

    Returns:
        Up to `limit` stripped, non-blank journal lines, oldest first. Empty
        when journalctl fails or prints nothing.
    """
    # Guard first: `lines[-0:]` would return *every* line and a negative
    # limit would slice from the front, neither of which is "at most limit".
    if limit <= 0:
        return []
    rc, out, _ = run(
        ["journalctl", "-k", "-p", "err", "--since", f"{hours} hours ago", "--no-pager", "-q"],
        timeout=8,
    )
    if rc != 0 or not out:
        return []
    lines = [line.strip() for line in out.splitlines() if line.strip()]
    return lines[-limit:]
1451
+
1452
+
1453
def get_listening_ports() -> List[Dict[str, Any]]:
    """List of listening TCP ports.

    Uses psutil when available; if psutil is missing or raises, falls back to
    parsing `ss -tlnH`.

    Returns:
        A list of {"addr": str, "port": int, "pid": Optional[int]} dicts.
        "pid" is always None on the `ss` path. Empty when `ss` fails.
    """
    if HAS_PSUTIL:
        try:
            return [
                {"addr": conn.laddr.ip, "port": conn.laddr.port, "pid": conn.pid}
                for conn in psutil.net_connections(kind="inet")
                if conn.status == psutil.CONN_LISTEN
                and conn.type == socket.SOCK_STREAM
                and conn.laddr
            ]
        except Exception as exc:
            # Best effort: any psutil failure (e.g. insufficient privileges)
            # falls through to `ss`, but leave a trace for debugging.
            logger.debug("psutil.net_connections failed, falling back to ss: %s", exc)
    rc, out, _ = run(["ss", "-tlnH"], timeout=5)
    if rc != 0:
        return []
    ports: List[Dict[str, Any]] = []
    for line in out.splitlines():
        parts = line.split()
        if len(parts) < 4:
            continue
        # Column 4 is "local-address:port"; split on the LAST colon so IPv6
        # addresses stay intact.
        addr, sep, port_text = parts[3].rpartition(":")
        if not sep:
            continue
        try:
            port = int(port_text)
        except ValueError:
            continue
        # ss wraps IPv6 literals in brackets ("[::]:22"); strip them so the
        # address has the same shape as on the psutil path ("::").
        if addr.startswith("[") and addr.endswith("]"):
            addr = addr[1:-1]
        ports.append({"addr": addr, "port": port, "pid": None})
    return ports
1492
+
1493
+
1494
def get_pending_updates() -> int:
    """Count of pending apt updates. -1 if cannot determine.

    Counts the package lines printed by `apt list --upgradable`. A package
    line starts with "<name>/<suite>", so only lines whose first field
    contains a "/" are counted. This keeps the "Listing..." header (which is
    translated in non-English locales) and any other notice out of the total.
    """
    if not shutil.which("apt"):
        return -1
    rc, out, _ = run(["apt", "list", "--upgradable"], timeout=10)
    if rc != 0:
        return -1
    count = 0
    for line in out.splitlines():
        fields = line.split(None, 1)
        if fields and "/" in fields[0]:
            count += 1
    return count
1508
+
1509
+
1510
def get_last_logins(limit: int = 5) -> List[str]:
    """Recent login records via `last -F`.

    Args:
        limit: Maximum number of records to return; values <= 0 return [].

    Returns:
        Up to `limit` raw output lines from `last`, newest first as printed.
        Blank lines and the "<file> begins <date>" trailer are excluded, so
        the trailer cannot be mistaken for a login when fewer than `limit`
        records exist. Empty when `last` fails.
    """
    if limit <= 0:
        return []
    rc, out, _ = run(["last", "-n", str(limit), "-F"], timeout=5)
    if rc != 0:
        return []
    records: List[str] = []
    for line in out.splitlines():
        fields = line.split()
        if not fields:
            continue
        # Trailer looks like "wtmp begins Mon Jan  1 00:00:00 2024".
        if fields[1:2] == ["begins"]:
            continue
        records.append(line)
    return records[:limit]
1516
+
1517
+
1518
def get_failed_auth_count(hours: int = 24) -> int:
    """Count log lines that look like failed authentication attempts.

    The journal of ssh.service / sshd.service for the last `hours` hours is
    used when available. If journalctl fails or prints nothing, the whole of
    /var/log/auth.log is scanned instead (not limited to the time window).
    Returns 0 when neither source yields any text.
    """
    exit_code, text, _ = run(
        ["journalctl", "_SYSTEMD_UNIT=ssh.service", "_SYSTEMD_UNIT=sshd.service", "--since", f"{hours} hours ago", "--no-pager", "-q"],
        timeout=8,
    )
    if exit_code != 0 or not text:
        # Fallback to /var/log/auth.log
        text = read_file("/var/log/auth.log")
        if not text:
            return 0
    needles = ("failed password", "authentication failure", "invalid user")
    # Case-insensitive substring match; each matching line counts once.
    return sum(
        1
        for entry in text.splitlines()
        if any(needle in entry.lower() for needle in needles)
    )
1535
+
1536
+
1537
def get_disk_io_busy() -> Optional[float]:
    """Disk busy percent over a ~0.5s sample. Requires psutil.

    Compares `busy_time` from two `psutil.disk_io_counters(perdisk=False)`
    readings and divides by the *measured* time between them, so a sleep that
    overruns on a loaded host does not inflate the result.

    NOTE(review): with perdisk=False the counter is an aggregate over disks,
    so this presumably behaves like a sum rather than a per-disk average and
    can exceed 100 on multi-disk hosts — confirm against psutil's docs.

    Returns:
        Busy percentage as a float, or None when psutil is missing, counters
        are unavailable, the platform exposes no `busy_time`, or psutil
        raises.
    """
    if not HAS_PSUTIL:
        return None
    try:
        first = psutil.disk_io_counters(perdisk=False)
        started = time.monotonic()
        # No busy_time means "unknown", not "0% busy".
        if first is None or not hasattr(first, "busy_time"):
            return None
        time.sleep(0.5)
        second = psutil.disk_io_counters(perdisk=False)
        elapsed_ms = (time.monotonic() - started) * 1000.0
        if second is None or not hasattr(second, "busy_time") or elapsed_ms <= 0:
            return None
        # busy_time is treated as milliseconds, matching the original
        # "delta / 5 over a 500ms window" arithmetic.
        return 100.0 * float(second.busy_time - first.busy_time) / elapsed_ms
    except Exception as exc:
        logger.debug("disk busy sampling failed: %s", exc)
        return None
1553
+
1554
+
1555
def get_open_fds() -> Optional[Tuple[int, int]]:
    """Return (used, max) open file descriptors.

    `used` is the first field of /proc/sys/fs/file-nr and `max` the content
    of /proc/sys/fs/file-max. None when either cannot be read as an integer.
    """
    nr_text = read_file("/proc/sys/fs/file-nr")
    max_text = read_file("/proc/sys/fs/file-max")
    try:
        in_use = int(nr_text.split()[0])
        ceiling = int(max_text.strip())
    except (ValueError, IndexError):
        return None
    return in_use, ceiling
1568
+
1569
+
1570
def _is_noisy_iface(name: str) -> bool:
    """Tell whether `name` starts with a known container/virtual prefix.

    Such interfaces are hidden from the default network listing.
    """
    # str.startswith accepts a tuple and returns True if any prefix matches.
    return name.startswith(
        ("veth", "docker", "br-", "virbr", "cni", "flannel", "cali", "lxcbr", "tun", "tap")
    )
1574
+
1575
+
1576
def get_network_interfaces(include_virtual: bool = False) -> List[Dict[str, Any]]:
    """List of non-loopback network interfaces with IPs.

    Uses psutil when available; if psutil is missing or raises, falls back to
    parsing `ip -o -4 addr show` (IPv4 only, every listed interface reported
    as up).

    Args:
        include_virtual: When False, interfaces matched by `_is_noisy_iface`
            (container/virtual devices) are left out.

    Returns:
        One {"name", "ipv4", "ipv6", "up"} dict per interface. On the psutil
        path link-local IPv6 addresses (prefix "fe80") are omitted. Empty
        when the fallback command fails.
    """

    def _wanted(name: str) -> bool:
        return name != "lo" and (include_virtual or not _is_noisy_iface(name))

    if HAS_PSUTIL:
        try:
            addrs = psutil.net_if_addrs()
            stats = psutil.net_if_stats()
            # Built in a local list so a failure part-way through cannot leak
            # half of the psutil data into the fallback result.
            found: List[Dict[str, Any]] = []
            for name, addr_list in addrs.items():
                if not _wanted(name):
                    continue
                ipv4 = [a.address for a in addr_list if a.family == socket.AF_INET]
                ipv6 = [
                    a.address
                    for a in addr_list
                    if a.family == socket.AF_INET6 and not a.address.startswith("fe80")
                ]
                is_up = stats[name].isup if name in stats else False
                found.append({"name": name, "ipv4": ipv4, "ipv6": ipv6, "up": is_up})
            return found
        except Exception as exc:
            # Best effort: fall through to `ip`, but leave a trace.
            logger.debug("psutil interface query failed, falling back to ip: %s", exc)
    rc, out, _ = run(["ip", "-o", "-4", "addr", "show"], timeout=5)
    if rc != 0:
        return []
    # `ip -o` prints one line per ADDRESS; group by interface name so an
    # interface with several addresses yields a single entry, matching the
    # psutil path. Dict insertion order keeps first-seen ordering.
    by_name: Dict[str, Dict[str, Any]] = {}
    for line in out.splitlines():
        parts = line.split()
        if len(parts) < 4:
            continue
        name = parts[1]
        if not _wanted(name):
            continue
        ip = parts[3].split("/")[0]
        entry = by_name.setdefault(name, {"name": name, "ipv4": [], "ipv6": [], "up": True})
        entry["ipv4"].append(ip)
    return list(by_name.values())