@glassmkr/crucible 0.1.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (89) hide show
  1. package/LICENSE +21 -0
  2. package/README.md +92 -0
  3. package/config/collector.example.yaml +43 -0
  4. package/dist/alerts/evaluator.d.ts +3 -0
  5. package/dist/alerts/evaluator.js +14 -0
  6. package/dist/alerts/evaluator.js.map +1 -0
  7. package/dist/alerts/rules.d.ts +7 -0
  8. package/dist/alerts/rules.js +203 -0
  9. package/dist/alerts/rules.js.map +1 -0
  10. package/dist/alerts/state.d.ts +6 -0
  11. package/dist/alerts/state.js +77 -0
  12. package/dist/alerts/state.js.map +1 -0
  13. package/dist/collect/cpu.d.ts +2 -0
  14. package/dist/collect/cpu.js +35 -0
  15. package/dist/collect/cpu.js.map +1 -0
  16. package/dist/collect/disks.d.ts +2 -0
  17. package/dist/collect/disks.js +33 -0
  18. package/dist/collect/disks.js.map +1 -0
  19. package/dist/collect/ipmi.d.ts +2 -0
  20. package/dist/collect/ipmi.js +55 -0
  21. package/dist/collect/ipmi.js.map +1 -0
  22. package/dist/collect/memory.d.ts +2 -0
  23. package/dist/collect/memory.js +27 -0
  24. package/dist/collect/memory.js.map +1 -0
  25. package/dist/collect/network.d.ts +2 -0
  26. package/dist/collect/network.js +54 -0
  27. package/dist/collect/network.js.map +1 -0
  28. package/dist/collect/os-alerts.d.ts +2 -0
  29. package/dist/collect/os-alerts.js +41 -0
  30. package/dist/collect/os-alerts.js.map +1 -0
  31. package/dist/collect/raid.d.ts +2 -0
  32. package/dist/collect/raid.js +34 -0
  33. package/dist/collect/raid.js.map +1 -0
  34. package/dist/collect/smart.d.ts +2 -0
  35. package/dist/collect/smart.js +56 -0
  36. package/dist/collect/smart.js.map +1 -0
  37. package/dist/collect/system.d.ts +2 -0
  38. package/dist/collect/system.js +19 -0
  39. package/dist/collect/system.js.map +1 -0
  40. package/dist/config.d.ts +208 -0
  41. package/dist/config.js +58 -0
  42. package/dist/config.js.map +1 -0
  43. package/dist/index.d.ts +2 -0
  44. package/dist/index.js +96 -0
  45. package/dist/index.js.map +1 -0
  46. package/dist/lib/exec.d.ts +1 -0
  47. package/dist/lib/exec.js +19 -0
  48. package/dist/lib/exec.js.map +1 -0
  49. package/dist/lib/parse.d.ts +4 -0
  50. package/dist/lib/parse.js +29 -0
  51. package/dist/lib/parse.js.map +1 -0
  52. package/dist/lib/types.d.ts +103 -0
  53. package/dist/lib/types.js +2 -0
  54. package/dist/lib/types.js.map +1 -0
  55. package/dist/notify/email.d.ts +4 -0
  56. package/dist/notify/email.js +55 -0
  57. package/dist/notify/email.js.map +1 -0
  58. package/dist/notify/slack.d.ts +2 -0
  59. package/dist/notify/slack.js +38 -0
  60. package/dist/notify/slack.js.map +1 -0
  61. package/dist/notify/telegram.d.ts +2 -0
  62. package/dist/notify/telegram.js +38 -0
  63. package/dist/notify/telegram.js.map +1 -0
  64. package/dist/push/forge.d.ts +2 -0
  65. package/dist/push/forge.js +26 -0
  66. package/dist/push/forge.js.map +1 -0
  67. package/package.json +29 -0
  68. package/src/alerts/evaluator.ts +15 -0
  69. package/src/alerts/rules.ts +184 -0
  70. package/src/alerts/state.ts +92 -0
  71. package/src/collect/cpu.ts +44 -0
  72. package/src/collect/disks.ts +36 -0
  73. package/src/collect/ipmi.ts +60 -0
  74. package/src/collect/memory.ts +30 -0
  75. package/src/collect/network.ts +61 -0
  76. package/src/collect/os-alerts.ts +43 -0
  77. package/src/collect/raid.ts +40 -0
  78. package/src/collect/smart.ts +60 -0
  79. package/src/collect/system.ts +21 -0
  80. package/src/config.ts +60 -0
  81. package/src/index.ts +112 -0
  82. package/src/lib/exec.ts +16 -0
  83. package/src/lib/parse.ts +29 -0
  84. package/src/lib/types.ts +110 -0
  85. package/src/notify/email.ts +68 -0
  86. package/src/notify/slack.ts +46 -0
  87. package/src/notify/telegram.ts +45 -0
  88. package/src/push/forge.ts +25 -0
  89. package/tsconfig.json +15 -0
@@ -0,0 +1,184 @@
1
+ // Alert rules for the collector are identical to the Forge evaluator.
2
+ // NOTE: the rules are currently defined inline in this file; re-exporting them from a shared definition would avoid the duplication.
3
+ // For the collector, we use the same 15 rules but with local thresholds from config.
4
+
5
+ import type { Snapshot, AlertResult } from "../lib/types.js";
6
+ import type { Config } from "../config.js";
7
+
8
/**
 * Contract implemented by every alert rule in this module.
 */
export interface AlertRule {
  /** Identifier of the rule; each rule stamps the same value on the alerts it returns. */
  type: string;

  /**
   * Inspects a snapshot and returns the alerts raised by this rule.
   *
   * @param snap - Metrics snapshot to evaluate.
   * @param thresholds - The `thresholds` section of the collector config.
   * @returns Zero or more alert results; an empty array means nothing fired.
   */
  evaluate(snap: Snapshot, thresholds: Config["thresholds"]): AlertResult[];
}
12
+
13
/**
 * Returns true when an IPMI sensor unit denotes degrees Celsius.
 * `ipmitool sensor` reports temperature sensors with the unit "degrees C".
 */
function isCelsiusUnit(unit: unknown): boolean {
  return typeof unit === "string" && /^(degrees c|°\s*c|c|celsius)$/i.test(unit.trim());
}

/** Coerces a sensor reading (number or string) to a number; NaN when it is not numeric. */
function sensorReading(value: unknown): number {
  return typeof value === "number" ? value : parseFloat(String(value));
}

/**
 * The collector's built-in alert rules, evaluated against each snapshot.
 *
 * Every rule returns an empty array when the snapshot section it needs is
 * missing or when nothing exceeds the configured (or default) threshold.
 */
export const allRules: AlertRule[] = [
  // 1. RAM high
  {
    type: "ram_high",
    evaluate(snap, t) {
      if (!snap.memory?.total_mb) return [];
      const pct = (snap.memory.used_mb / snap.memory.total_mb) * 100;
      if (pct < (t.ram_percent ?? 90)) return [];
      return [{
        type: "ram_high",
        severity: pct >= 95 ? "critical" : "warning",
        title: `RAM usage at ${pct.toFixed(1)}%`,
        message: `Using ${snap.memory.used_mb}MB of ${snap.memory.total_mb}MB. ${snap.memory.available_mb}MB available.`,
        evidence: { used_mb: snap.memory.used_mb, total_mb: snap.memory.total_mb, percent: Math.round(pct * 10) / 10 },
        recommendation: "Check: ps aux --sort=-rss | head -20",
      }];
    },
  },
  // 2. Swap active
  {
    type: "swap_active",
    evaluate(snap, t) {
      if (t.swap_alert === false || !snap.memory || snap.memory.swap_used_mb <= 0) return [];
      return [{
        type: "swap_active",
        severity: "warning",
        title: `Swap in use: ${snap.memory.swap_used_mb}MB`,
        message: "Server is using swap space, indicating memory pressure.",
        evidence: { swap_used_mb: snap.memory.swap_used_mb },
        recommendation: "Check: free -h && ps aux --sort=-rss | head -20",
      }];
    },
  },
  // 3. Disk space high
  {
    type: "disk_space_high",
    evaluate(snap, t) {
      if (!snap.disks) return [];
      const threshold = t.disk_percent ?? 85;
      return snap.disks.filter(d => d.percent_used >= threshold).map(d => ({
        type: "disk_space_high",
        severity: d.percent_used >= 95 ? "critical" as const : "warning" as const,
        title: `Disk ${d.mount} at ${d.percent_used}%`,
        message: `${d.device}: ${d.used_gb}GB of ${d.total_gb}GB used. ${d.available_gb}GB available.`,
        evidence: { device: d.device, mount: d.mount, percent_used: d.percent_used },
        recommendation: "Check: du -sh /* | sort -rh | head -20",
      }));
    },
  },
  // 4. CPU iowait
  {
    type: "cpu_iowait_high",
    evaluate(snap, t) {
      if (!snap.cpu || snap.cpu.iowait_percent < (t.iowait_percent ?? 20)) return [];
      return [{
        type: "cpu_iowait_high",
        severity: "warning",
        title: `CPU iowait at ${snap.cpu.iowait_percent.toFixed(1)}%`,
        message: `High I/O wait: CPU spending ${snap.cpu.iowait_percent.toFixed(1)}% waiting for disk.`,
        evidence: { iowait_percent: snap.cpu.iowait_percent },
        recommendation: "Check: iotop -oP or iostat -x 1 5",
      }];
    },
  },
  // 5. OOM kills
  {
    type: "oom_kills",
    evaluate(snap) {
      if (!snap.os_alerts || snap.os_alerts.oom_kills_recent <= 0) return [];
      return [{
        type: "oom_kills",
        severity: "critical",
        title: `${snap.os_alerts.oom_kills_recent} OOM kill(s)`,
        message: `Kernel OOM killer terminated ${snap.os_alerts.oom_kills_recent} process(es).`,
        evidence: { oom_kills_recent: snap.os_alerts.oom_kills_recent },
        recommendation: "Check: dmesg | grep -i 'out of memory'",
      }];
    },
  },
  // 6. SMART failing
  {
    type: "smart_failing",
    evaluate(snap) {
      if (!snap.smart) return [];
      return snap.smart
        .filter(d => d.health !== "PASSED" || (d.reallocated_sectors && d.reallocated_sectors > 0) || (d.pending_sectors && d.pending_sectors > 0))
        .map(d => ({
          type: "smart_failing",
          severity: "critical" as const,
          title: `SMART failure: ${d.device}`,
          message: `${d.model}: drive showing signs of failure.`,
          evidence: { device: d.device, health: d.health, reallocated_sectors: d.reallocated_sectors, pending_sectors: d.pending_sectors },
          recommendation: `Back up data. Schedule replacement for ${d.device}.`,
        }));
    },
  },
  // 7. NVMe wear
  {
    type: "nvme_wear_high",
    evaluate(snap, t) {
      if (!snap.smart) return [];
      const threshold = t.nvme_wear_percent ?? 85;
      return snap.smart
        .filter(d => d.percentage_used != null && d.percentage_used >= threshold)
        .map(d => ({
          type: "nvme_wear_high",
          severity: d.percentage_used! >= 95 ? "critical" as const : "warning" as const,
          title: `NVMe ${d.device} wear at ${d.percentage_used}%`,
          message: `${d.model} at ${d.percentage_used}% lifetime wear.`,
          evidence: { device: d.device, percentage_used: d.percentage_used },
          recommendation: "Plan drive replacement.",
        }));
    },
  },
  // 8. RAID degraded
  {
    type: "raid_degraded",
    evaluate(snap) {
      if (!snap.raid) return [];
      return snap.raid
        .filter(r => r.degraded || r.failed_disks.length > 0)
        .map(r => ({
          type: "raid_degraded",
          severity: "critical" as const,
          title: `RAID ${r.device} degraded`,
          message: `${r.device} (${r.level}) degraded. Failed: ${r.failed_disks.join(", ") || "unknown"}.`,
          evidence: { device: r.device, failed_disks: r.failed_disks },
          recommendation: "Replace failed drive immediately.",
        }));
    },
  },
  // 9. Disk latency
  {
    type: "disk_latency_high",
    evaluate(snap, t) {
      if (!snap.disks) return [];
      return snap.disks.filter(d => {
        if (d.latency_p99_ms == null) return false;
        // NVMe devices get a tighter default latency budget than other disks.
        const thresh = d.device.includes("nvme") ? (t.disk_latency_nvme_ms ?? 50) : (t.disk_latency_hdd_ms ?? 200);
        return d.latency_p99_ms >= thresh;
      }).map(d => ({
        type: "disk_latency_high",
        severity: "warning" as const,
        title: `Disk ${d.device} latency ${d.latency_p99_ms!.toFixed(1)}ms`,
        message: `p99 I/O latency on ${d.device} is high.`,
        evidence: { device: d.device, latency_p99_ms: d.latency_p99_ms },
        recommendation: "Check: iotop -oP",
      }));
    },
  },
  // 10. Interface errors
  {
    type: "interface_errors",
    evaluate(snap) {
      if (!snap.network) return [];
      return snap.network
        .filter(i => (i.rx_errors + i.tx_errors + i.rx_drops + i.tx_drops) > 0)
        .map(i => ({
          type: "interface_errors",
          severity: "warning" as const,
          title: `${i.interface}: errors/drops detected`,
          message: `RX errors=${i.rx_errors}, TX errors=${i.tx_errors}, RX drops=${i.rx_drops}, TX drops=${i.tx_drops}.`,
          evidence: { interface: i.interface, rx_errors: i.rx_errors, tx_errors: i.tx_errors, rx_drops: i.rx_drops, tx_drops: i.tx_drops },
          recommendation: "Check cables and SFP/transceiver.",
        }));
    },
  },
  // 11. Link speed mismatch
  {
    type: "link_speed_mismatch",
    evaluate(snap) {
      if (!snap.network) return [];
      return snap.network
        .filter(i => i.speed_mbps > 0 && i.speed_mbps < 1000)
        .map(i => ({
          type: "link_speed_mismatch",
          severity: "warning" as const,
          title: `${i.interface} at ${i.speed_mbps} Mbps`,
          message: `Interface negotiated below 1 Gbps.`,
          evidence: { interface: i.interface, speed_mbps: i.speed_mbps },
          recommendation: "Check cable, SFP, switch port config.",
        }));
    },
  },
  // 12. Interface saturation
  {
    type: "interface_saturation",
    evaluate(snap, t) {
      if (!snap.network) return [];
      const threshold = (t.interface_utilization_percent ?? 90) / 100;
      return snap.network.filter(i => {
        if (!i.speed_mbps) return false;
        // Link speed is in megabits/s; convert to bytes/s to compare with the byte rates.
        const maxBps = (i.speed_mbps * 1_000_000) / 8;
        return Math.max(i.rx_bytes_sec, i.tx_bytes_sec) / maxBps >= threshold;
      }).map(i => {
        const maxBps = (i.speed_mbps * 1_000_000) / 8;
        const util = Math.max(i.rx_bytes_sec, i.tx_bytes_sec) / maxBps * 100;
        return {
          type: "interface_saturation",
          severity: "warning" as const,
          title: `${i.interface} at ${util.toFixed(0)}% utilization`,
          message: `Interface ${i.interface} (${i.speed_mbps} Mbps) near saturation.`,
          evidence: { interface: i.interface, utilization_percent: Math.round(util * 10) / 10 },
          recommendation: "Check: iftop or nload",
        };
      });
    },
  },
  // 13. CPU temperature
  {
    type: "cpu_temperature_high",
    evaluate(snap, t) {
      if (!snap.ipmi?.available || !snap.ipmi.sensors) return [];
      const warn = t.cpu_temp_warning_c ?? 80;
      return snap.ipmi.sensors.filter(s => {
        const n = s.name.toLowerCase();
        if (!n.includes("cpu") && !n.includes("temp")) return false;
        // Only temperature readings may be compared against a temperature threshold;
        // without this guard a sensor such as "CPU_FAN1" (RPM) or a CPU voltage would
        // match by name alone and raise a bogus temperature alert.
        if (!isCelsiusUnit(s.unit)) return false;
        const v = sensorReading(s.value);
        return !isNaN(v) && v >= warn;
      }).map(s => {
        const v = sensorReading(s.value);
        // Prefer the sensor's own upper-critical threshold over the configured/default one.
        const crit = s.upper_critical ?? (t.cpu_temp_critical_c ?? 90);
        return {
          type: "cpu_temperature_high",
          severity: v >= crit ? "critical" as const : "warning" as const,
          title: `${s.name}: ${v}${s.unit}`,
          message: `Temperature above warning threshold.`,
          evidence: { sensor: s.name, value: v },
          recommendation: "Check cooling, fans, airflow.",
        };
      });
    },
  },
  // 14. ECC errors
  {
    type: "ecc_errors",
    evaluate(snap) {
      if (!snap.ipmi?.ecc_errors) return [];
      const { correctable, uncorrectable } = snap.ipmi.ecc_errors;
      if (correctable <= 0 && uncorrectable <= 0) return [];
      // Uncorrectable errors take precedence and are always critical.
      if (uncorrectable > 0) return [{
        type: "ecc_errors",
        severity: "critical",
        title: `${uncorrectable} uncorrectable ECC error(s)`,
        message: "Data corruption possible. DIMM failing.",
        evidence: { correctable, uncorrectable },
        recommendation: "Replace DIMM immediately. Run: ipmitool sdr type Memory",
      }];
      return [{
        type: "ecc_errors",
        severity: "warning",
        title: `${correctable} correctable ECC error(s)`,
        message: "Early warning of DIMM failure.",
        evidence: { correctable, uncorrectable },
        recommendation: "Schedule DIMM replacement. Run: ipmitool sdr type Memory",
      }];
    },
  },
  // 15. PSU redundancy
  {
    type: "psu_redundancy_loss",
    evaluate(snap) {
      if (!snap.ipmi?.available || !snap.ipmi.sensors) return [];
      const psus = snap.ipmi.sensors.filter(s => {
        const n = s.name.toLowerCase();
        return n.includes("psu") || n.includes("power supply");
      });
      // Redundancy can only be lost when at least two PSU sensors are present.
      if (psus.length < 2) return [];
      const failed = psus.filter(s => {
        const st = String(s.status).toLowerCase();
        const v = String(s.value).toLowerCase();
        return st.includes("fail") || st.includes("absent") || v.includes("fail") || v.includes("absent");
      });
      if (failed.length === 0) return [];
      return [{
        type: "psu_redundancy_loss",
        severity: "critical",
        title: "PSU redundancy lost",
        message: `${failed.length} PSU(s) failed/absent: ${failed.map(p => p.name).join(", ")}.`,
        evidence: { failed: failed.map(p => ({ name: p.name, status: p.status })) },
        recommendation: "Replace failed PSU. Check power connections.",
      }];
    },
  },
];
@@ -0,0 +1,92 @@
1
+ import { readFileSync, writeFileSync, mkdirSync } from "fs";
2
+ import type { AlertResult } from "../lib/types.js";
3
+
4
/** Path of the JSON file in which the alert state is persisted. */
const STATE_FILE = "/var/lib/glassmkr/alert-state.json";

/** Bookkeeping record kept for each alert type that is currently tracked. */
interface AlertState {
  /** Alert type; also used as the key in the state map. */
  type: string;
  /** ISO timestamp taken when the alert type was first tracked. */
  first_seen: string;
  /** ISO timestamp of the most recent update in which the alert type was present. */
  last_seen: string;
  /** Notification flag; stored as `false` when the entry is created. */
  notified: boolean;
}

/** In-memory alert state, keyed by alert type. */
let state = new Map<string, AlertState>();
14
+
15
/**
 * Replaces the in-memory state with the contents of STATE_FILE.
 * Any read or parse failure results in an empty state.
 */
function load() {
  let entries: [string, AlertState][] = [];
  try {
    const parsed: Record<string, AlertState> = JSON.parse(readFileSync(STATE_FILE, "utf-8"));
    entries = Object.entries(parsed);
  } catch {
    // Missing, unreadable or malformed file: start with no tracked alerts.
    entries = [];
  }
  state = new Map(entries);
}
24
+
25
/**
 * Writes the in-memory state to STATE_FILE as pretty-printed JSON,
 * creating the containing directory first. Failures are logged, not thrown.
 */
function save() {
  try {
    mkdirSync("/var/lib/glassmkr", { recursive: true });
    const snapshot: Record<string, AlertState> = {};
    state.forEach((entry, key) => {
      snapshot[key] = entry;
    });
    const serialized = JSON.stringify(snapshot, null, 2);
    writeFileSync(STATE_FILE, serialized);
  } catch (err) {
    console.error("[state] Failed to save alert state:", err);
  }
}
35
+
36
+ // Initialize on import
37
+ load();
38
+
39
/**
 * Reconciles the tracked alert state with the alerts from the latest evaluation
 * and persists the result.
 *
 * State is keyed by alert type only, so several alerts sharing one type are
 * tracked as a single entry: only the first of them is reported as new.
 *
 * @param currentAlerts - Alerts produced by the current evaluation.
 * @returns `newAlerts`: alerts whose type was not tracked before;
 *          `resolvedAlerts`: synthetic "Resolved" entries for tracked types
 *          that are absent from `currentAlerts`.
 */
export function updateAlertState(currentAlerts: AlertResult[]): {
  newAlerts: AlertResult[];
  resolvedAlerts: AlertResult[];
} {
  const timestamp = new Date().toISOString();
  const newAlerts: AlertResult[] = [];
  const resolvedAlerts: AlertResult[] = [];
  const activeTypes = new Set<string>();

  // Pass 1: register unseen alert types and refresh the ones already tracked.
  currentAlerts.forEach((alert) => {
    activeTypes.add(alert.type);
    const tracked = state.get(alert.type);
    if (tracked) {
      tracked.last_seen = timestamp;
      return;
    }
    state.set(alert.type, { type: alert.type, first_seen: timestamp, last_seen: timestamp, notified: false });
    newAlerts.push(alert);
  });

  // Pass 2: every tracked type that did not fire this time is resolved and dropped.
  for (const [type, tracked] of Array.from(state.entries())) {
    if (activeTypes.has(type)) continue;
    resolvedAlerts.push({
      type,
      severity: "warning",
      title: `Resolved: ${type}`,
      message: `Condition cleared. Active for ${timeSince(tracked.first_seen)}.`,
      evidence: {},
      recommendation: "",
    });
    state.delete(type);
  }

  save();
  return { newAlerts, resolvedAlerts };
}
79
+
80
/**
 * Formats the time elapsed since `isoDate` as a coarse human-readable duration.
 *
 * @param isoDate - Timestamp string parseable by `Date`.
 * @returns "N minute(s)" below one hour, "H hour(s) M minute(s)" below one day,
 *          otherwise "D day(s)". Timestamps in the future (e.g. after the clock
 *          was stepped backwards) are reported as "0 minute(s)", and an
 *          unparsable timestamp yields "unknown duration".
 */
function timeSince(isoDate: string): string {
  const startedAt = new Date(isoDate).getTime();
  // A corrupted state entry must not surface as "NaN day(s)".
  if (Number.isNaN(startedAt)) return "unknown duration";

  // Clamp so that clock skew cannot produce a negative duration.
  const ms = Math.max(0, Date.now() - startedAt);
  const minutes = Math.floor(ms / 60000);
  if (minutes < 60) return `${minutes} minute(s)`;
  const hours = Math.floor(minutes / 60);
  if (hours < 24) return `${hours} hour(s) ${minutes % 60} minute(s)`;
  const days = Math.floor(hours / 24);
  return `${days} day(s)`;
}
89
+
90
/**
 * Lists the alert types currently held in the in-memory state.
 *
 * @returns The keys of the state map, in the map's iteration order.
 */
export function getActiveAlerts(): string[] {
  return [...state.keys()];
}
@@ -0,0 +1,44 @@
1
+ import { readProcFile, sleep } from "../lib/parse.js";
2
+ import type { CpuInfo } from "../lib/types.js";
3
+
4
/** Counters taken from the aggregate "cpu" line of /proc/stat, in column order. */
interface CpuStat {
  user: number;
  nice: number;
  system: number;
  idle: number;
  iowait: number;
  irq: number;
  softirq: number;
  steal: number;
}
8
+
9
/**
 * Reads /proc/stat and returns the counters of its aggregate "cpu " line.
 * Every counter is 0 when the file or the line is unavailable, or when a
 * column is missing or not numeric.
 */
function parseProcStat(): CpuStat {
  const contents = readProcFile("/proc/stat") || "";
  const aggregate = contents.split("\n").find((l) => l.startsWith("cpu "));
  // Drop the leading "cpu" label so that index 0 is the "user" column.
  const fields = aggregate ? aggregate.split(/\s+/).slice(1).map(Number) : [];
  const at = (index: number): number => fields[index] || 0;

  return {
    user: at(0),
    nice: at(1),
    system: at(2),
    idle: at(3),
    iowait: at(4),
    irq: at(5),
    softirq: at(6),
    steal: at(7),
  };
}
19
+
20
/**
 * Samples /proc/stat twice, one second apart, and reports how the elapsed
 * CPU time was split, together with the load averages from /proc/loadavg.
 *
 * @returns Percentages rounded to two decimals and the 1/5/15 minute load averages.
 */
export async function collectCpu(): Promise<CpuInfo> {
  const before = parseProcStat();
  await sleep(1000);
  const after = parseProcStat();

  const userDelta = after.user - before.user;
  const niceDelta = after.nice - before.nice;
  const systemDelta = after.system - before.system;
  const idleDelta = after.idle - before.idle;
  const iowaitDelta = after.iowait - before.iowait;
  const irqDelta = after.irq - before.irq;
  const softirqDelta = after.softirq - before.softirq;
  const stealDelta = after.steal - before.steal;

  // Fall back to 1 so that an unchanged sample does not divide by zero.
  const total =
    (userDelta + niceDelta + systemDelta + idleDelta + iowaitDelta + irqDelta + softirqDelta + stealDelta) || 1;
  const share = (delta: number): number => Math.round((delta / total) * 10000) / 100;

  const loadFields = (readProcFile("/proc/loadavg") || "0 0 0").trim().split(" ");
  const load = (index: number): number => parseFloat(loadFields[index]) || 0;

  return {
    user_percent: share(userDelta),
    system_percent: share(systemDelta),
    iowait_percent: share(iowaitDelta),
    idle_percent: share(idleDelta),
    load_1m: load(0),
    load_5m: load(1),
    load_15m: load(2),
  };
}
@@ -0,0 +1,36 @@
1
+ import { run } from "../lib/exec.js";
2
+ import type { DiskInfo } from "../lib/types.js";
3
+
4
/**
 * Collects usage of mounted block-device filesystems by parsing `df` output.
 *
 * tmpfs, devtmpfs and squashfs are excluded through `df -x`, and any source
 * that does not start with "/dev/" is skipped.
 *
 * @returns One entry per filesystem with sizes in GiB rounded to two decimals;
 *          an empty array when `df` produced no output.
 */
export async function collectDisks(): Promise<DiskInfo[]> {
  const dfOutput = await run("df", ["-B1", "--output=source,target,size,used,avail,pcent", "-x", "tmpfs", "-x", "devtmpfs", "-x", "squashfs"]);
  if (!dfOutput) return [];

  const BYTES_PER_GIB = 1073741824;
  const toGb = (bytes: number): number => Math.round((bytes / BYTES_PER_GIB) * 100) / 100;

  const lines = dfOutput.trim().split("\n").slice(1); // skip header
  const disks: DiskInfo[] = [];

  for (const line of lines) {
    const parts = line.trim().split(/\s+/);
    if (parts.length < 6) continue;

    const device = parts[0];
    // Skip pseudo-filesystems
    if (!device.startsWith("/dev/")) continue;

    // The four numeric columns are always the last ones. Taking them from the
    // right keeps them aligned when the mount point itself contains spaces,
    // which would otherwise shift every following column.
    const [size, used, avail, pcent] = parts.slice(-4);
    const mount = parts.slice(1, -4).join(" ");

    disks.push({
      device,
      mount,
      total_gb: toGb(parseInt(size, 10) || 0),
      used_gb: toGb(parseInt(used, 10) || 0),
      available_gb: toGb(parseInt(avail, 10) || 0),
      percent_used: parseInt(pcent.replace("%", ""), 10) || 0,
    });
  }

  return disks;
}
@@ -0,0 +1,60 @@
1
+ import { run } from "../lib/exec.js";
2
+ import type { IpmiInfo } from "../lib/types.js";
3
+
4
/**
 * Collects IPMI data by parsing `ipmitool sensor` and `ipmitool sel info`.
 *
 * @returns `available: false` with empty data when `ipmitool sensor` gives no
 *          output; otherwise the parsed sensors, ECC error totals summed from
 *          sensors whose name mentions (un)correctable errors, and the SEL
 *          entry count.
 */
export async function collectIpmi(): Promise<IpmiInfo> {
  const sensorRaw = await run("ipmitool", ["sensor"]);
  if (!sensorRaw) {
    return { available: false, sensors: [], ecc_errors: { correctable: 0, uncorrectable: 0 }, sel_entries_count: 0 };
  }

  // Parse sensor readings: pipe-separated columns, name | value | unit | status | thresholds...
  const sensors: IpmiInfo["sensors"] = [];
  for (const line of sensorRaw.split("\n")) {
    const parts = line.split("|").map((s) => s.trim());
    if (parts.length < 4) continue;
    const name = parts[0];
    const rawValue = parts[1];
    const unit = parts[2];
    const status = parts[3];

    // Keep non-numeric readings as their original text.
    const numValue = parseFloat(rawValue);
    const value: number | string = isNaN(numValue) ? rawValue : numValue;

    // Parse upper critical threshold (ninth column), when present and numeric
    let upperCritical: number | undefined;
    if (parts[8]) {
      const uc = parseFloat(parts[8]);
      if (!isNaN(uc)) upperCritical = uc;
    }

    sensors.push({ name, value, unit, status, upper_critical: upperCritical });
  }

  // ECC errors from memory-type sensors
  let correctable = 0;
  let uncorrectable = 0;
  for (const sensor of sensors) {
    if (typeof sensor.value !== "number") continue;
    const name = sensor.name.toLowerCase();
    // "uncorrectable" contains the substring "correctable", so it has to be
    // tested first; otherwise uncorrectable errors are also counted as correctable.
    if (name.includes("uncorrectable")) {
      uncorrectable += sensor.value;
    } else if (name.includes("correctable")) {
      correctable += sensor.value;
    }
  }

  // SEL entry count
  let selCount = 0;
  const selInfo = await run("ipmitool", ["sel", "info"]);
  if (selInfo) {
    const match = selInfo.match(/Entries\s*:\s*(\d+)/i);
    if (match) selCount = parseInt(match[1], 10);
  }

  return {
    available: true,
    sensors,
    ecc_errors: { correctable, uncorrectable },
    sel_entries_count: selCount,
  };
}
@@ -0,0 +1,30 @@
1
+ import { readProcFile, parseKb } from "../lib/parse.js";
2
+ import type { MemoryInfo } from "../lib/types.js";
3
+
4
/**
 * Derives memory and swap usage, in megabytes, from /proc/meminfo.
 *
 * Used memory is computed as MemTotal minus MemAvailable, and used swap as
 * SwapTotal minus SwapFree.
 */
export async function collectMemory(): Promise<MemoryInfo> {
  const fields: Record<string, string> = {};
  (readProcFile("/proc/meminfo") || "").split("\n").forEach((line) => {
    const parsed = /^(\w+):\s+(.+)/.exec(line);
    if (parsed) fields[parsed[1]] = parsed[2];
  });

  const toMb = (kb: number): number => Math.round(kb / 1024);

  const totalKb = parseKb(fields["MemTotal"]);
  const availableKb = parseKb(fields["MemAvailable"]);
  const swapTotalKb = parseKb(fields["SwapTotal"]);
  const swapFreeKb = parseKb(fields["SwapFree"]);

  const totalMb = toMb(totalKb);
  const availableMb = toMb(availableKb);

  return {
    total_mb: totalMb,
    used_mb: totalMb - availableMb,
    available_mb: availableMb,
    swap_total_mb: toMb(swapTotalKb),
    swap_used_mb: toMb(swapTotalKb - swapFreeKb),
  };
}
@@ -0,0 +1,61 @@
1
+ import { readProcFile, sleep } from "../lib/parse.js";
2
+ import { readFileSync } from "fs";
3
+ import type { NetworkInfo } from "../lib/types.js";
4
+
5
/** Per-interface counters read from /proc/net/dev. */
interface IfaceStats {
  // Receive side
  rx_bytes: number;
  rx_packets: number;
  rx_errors: number;
  rx_drops: number;
  // Transmit side
  tx_bytes: number;
  tx_packets: number;
  tx_errors: number;
  tx_drops: number;
}
9
+
10
/**
 * Parses /proc/net/dev into counters keyed by interface name.
 * The loopback interface and interfaces whose name starts with "veth",
 * "docker", "br-" or "virbr" are left out.
 */
function parseNetDev(): Record<string, IfaceStats> {
  const virtualPrefixes = ["veth", "docker", "br-", "virbr"];
  const isIgnored = (name: string): boolean =>
    name === "lo" || virtualPrefixes.some((prefix) => name.startsWith(prefix));

  const result: Record<string, IfaceStats> = {};
  // The first two lines of the file are column headers.
  const rows = (readProcFile("/proc/net/dev") || "").split("\n").slice(2);

  for (const row of rows) {
    const parsed = /^\s*(\S+):\s+(.*)/.exec(row);
    if (!parsed) continue;

    const name = parsed[1];
    if (isIgnored(name)) continue;

    const columns = parsed[2].trim().split(/\s+/).map(Number);
    const col = (index: number): number => columns[index] || 0;

    // Columns 0-3 hold the receive counters; the transmit counters start at column 8.
    result[name] = {
      rx_bytes: col(0),
      rx_packets: col(1),
      rx_errors: col(2),
      rx_drops: col(3),
      tx_bytes: col(8),
      tx_packets: col(9),
      tx_errors: col(10),
      tx_drops: col(11),
    };
  }

  return result;
}
27
+
28
/**
 * Reads the link speed of an interface from /sys/class/net/<iface>/speed.
 *
 * @param iface - Interface name.
 * @returns The speed value from the file, or 0 when the file cannot be read
 *          or does not hold a positive integer.
 */
function getSpeed(iface: string): number {
  let text: string;
  try {
    text = readFileSync(`/sys/class/net/${iface}/speed`, "utf-8");
  } catch {
    return 0;
  }
  const parsed = parseInt(text.trim(), 10);
  // NaN fails the comparison, so unparsable content also maps to 0.
  return parsed > 0 ? parsed : 0;
}
37
+
38
/**
 * Samples /proc/net/dev twice, about one second apart, and reports per-interface
 * throughput together with the error and drop counters of the second sample.
 *
 * Throughput is normalised by the time that actually elapsed between the two
 * samples and never goes below zero. Interfaces missing from the first sample
 * are skipped.
 */
export async function collectNetwork(): Promise<NetworkInfo[]> {
  const startedAt = Date.now();
  const stats1 = parseNetDev();
  await sleep(1000);
  const stats2 = parseNetDev();

  // The sleep can overrun on a busy host; dividing by the real interval keeps
  // the figures in bytes per second. Fall back to 1s if the clock went backwards.
  const elapsedMs = Date.now() - startedAt;
  const elapsedSec = elapsedMs > 0 ? elapsedMs / 1000 : 1;

  // A counter that went down (interface or driver reset) would yield a negative rate.
  const rate = (before: number, after: number): number =>
    Math.max(0, Math.round((after - before) / elapsedSec));

  const results: NetworkInfo[] = [];
  for (const [name, s2] of Object.entries(stats2)) {
    const s1 = stats1[name];
    if (!s1) continue;

    results.push({
      interface: name,
      speed_mbps: getSpeed(name),
      rx_bytes_sec: rate(s1.rx_bytes, s2.rx_bytes),
      tx_bytes_sec: rate(s1.tx_bytes, s2.tx_bytes),
      rx_errors: s2.rx_errors,
      tx_errors: s2.tx_errors,
      rx_drops: s2.rx_drops,
      tx_drops: s2.tx_drops,
    });
  }

  return results;
}
@@ -0,0 +1,43 @@
1
+ import { run } from "../lib/exec.js";
2
+ import { readProcFile } from "../lib/parse.js";
3
+ import { readdirSync, readFileSync } from "fs";
4
+ import type { OsAlerts } from "../lib/types.js";
5
+
6
/**
 * Collects OS-level warning signs: recent OOM kills, zombie processes and
 * the clock offset reported by chrony.
 *
 * @returns Number of "Out of memory" lines in error/critical kernel messages
 *          requested for the last five minutes, number of processes in state
 *          "Z", and the magnitude of chrony's "System time" offset in
 *          milliseconds (0 when `chronyc` gives no usable output).
 */
export async function collectOsAlerts(): Promise<OsAlerts> {
  // OOM kills
  let oomKills = 0;
  const dmesg = await run("dmesg", ["--level=err,crit", "--since", "5 min ago"]);
  if (dmesg) {
    oomKills = (dmesg.match(/Out of memory/gi) || []).length;
  }

  // Zombie processes
  let zombies = 0;
  try {
    const pids = readdirSync("/proc").filter((f) => /^\d+$/.test(f));
    for (const pid of pids) {
      try {
        const stat = readFileSync(`/proc/${pid}/stat`, "utf-8");
        // The second field is the command name in parentheses and may itself
        // contain spaces or parentheses (e.g. "(tmux: server)"), so splitting
        // on spaces can pick the wrong field. The state character is the first
        // field after the LAST closing parenthesis.
        const afterComm = stat.slice(stat.lastIndexOf(")") + 1).trimStart();
        if (afterComm.charAt(0) === "Z") zombies++;
      } catch { /* process disappeared */ }
    }
  } catch { /* /proc not readable */ }

  // Time drift (simple: check if chrony/ntp reports drift)
  let timeDriftMs = 0;
  const chrony = await run("chronyc", ["tracking"]);
  if (chrony) {
    const match = chrony.match(/System time\s*:\s*([\d.]+)\s*seconds\s*(slow|fast)/);
    if (match) {
      // Only the magnitude is kept; the slow/fast direction is not reported.
      timeDriftMs = parseFloat(match[1]) * 1000;
    }
  }

  return {
    oom_kills_recent: oomKills,
    zombie_processes: zombies,
    time_drift_ms: Math.round(timeDriftMs * 100) / 100,
  };
}
@@ -0,0 +1,40 @@
1
+ import { readProcFile } from "../lib/parse.js";
2
+ import type { RaidInfo } from "../lib/types.js";
3
+
4
/**
 * Collects Linux software-RAID (md) array status from /proc/mdstat.
 *
 * @returns One entry per `mdN` line that matches "<device> : <status> <level> <members>".
 *          `degraded` is true when the status pattern on the following line
 *          (e.g. "[U_]") contains "_". `failed_disks` lists the members marked
 *          "(F)" on a degraded array; it stays empty when the missing member
 *          is no longer listed. Returns an empty array when the file is unavailable.
 */
export async function collectRaid(): Promise<RaidInfo[]> {
  const raw = readProcFile("/proc/mdstat");
  if (!raw) return [];

  const results: RaidInfo[] = [];
  const lines = raw.split("\n");

  for (let i = 0; i < lines.length; i++) {
    const match = lines[i].match(/^(md\d+)\s*:\s*(\w+)\s+(\w+)\s+(.*)/);
    if (!match) continue;

    const device = match[1];
    const status = match[2]; // "active" or "inactive"
    const level = match[3]; // "raid1", "raid5", etc.
    const disksPart = match[4];

    // Parse component disks, e.g. "sda1[0] sdb1[1](F)": name, bracketed number,
    // then optional single-letter flags such as (F) for faulty or (S) for spare.
    const members = Array.from(disksPart.matchAll(/([^\s\[\]()]+)\[\d+\]((?:\([A-Z]\))*)/g)).map((m) => ({
      name: m[1],
      faulty: m[2].includes("(F)"),
    }));
    const disks = members.map((m) => m.name);

    // Check next line for degraded status (e.g., "[UU_]" means one drive missing)
    const statusLine = lines[i + 1] || "";
    const bracketMatch = statusLine.match(/\[([U_]+)\]/);
    const degraded = bracketMatch ? bracketMatch[1].includes("_") : false;

    // Members are not listed in slot order, so a position in the "[U_]" pattern
    // cannot be mapped onto the member list by index; doing so can blame a
    // healthy disk. The "(F)" marker is what identifies a failed member.
    const failedDisks = degraded ? members.filter((m) => m.faulty).map((m) => m.name) : [];

    results.push({ device, level, status, degraded, disks, failed_disks: failedDisks });
  }

  return results;
}
@@ -0,0 +1,60 @@
1
+ import { run } from "../lib/exec.js";
2
+ import { readdirSync } from "fs";
3
+ import type { SmartInfo } from "../lib/types.js";
4
+
5
/**
 * Collects SMART data for every sd*, nvme* and hd* entry of /sys/block by
 * running `smartctl --json --all` on it.
 *
 * Devices are skipped when smartctl gives no output, when the output is not
 * valid JSON, or when it carries no pass/fail verdict.
 *
 * @returns One entry per device with a known health verdict; an empty array
 *          when /sys/block cannot be listed.
 */
export async function collectSmart(): Promise<SmartInfo[]> {
  // Find block devices
  const devices: string[] = [];
  try {
    const entries = readdirSync("/sys/block");
    for (const entry of entries) {
      if (entry.startsWith("sd") || entry.startsWith("nvme") || entry.startsWith("hd")) {
        devices.push(`/dev/${entry}`);
      }
    }
  } catch {
    return [];
  }

  const results: SmartInfo[] = [];
  for (const device of devices) {
    const output = await run("smartctl", ["--json", "--all", device]);
    if (!output) continue;

    try {
      const data = JSON.parse(output);

      // Without an explicit verdict the health is unknown. Reporting such a
      // device as "FAILED" would raise a false critical alert, so it is skipped.
      const passed: unknown = data.smart_status?.passed;
      if (typeof passed !== "boolean") continue;

      const info: SmartInfo = {
        device,
        model: data.model_name || data.model_family || "unknown",
        health: passed ? "PASSED" : "FAILED",
        temperature_c: data.temperature?.current,
        power_on_hours: data.power_on_time?.hours,
      };

      // NVMe specific
      if (data.nvme_smart_health_information_log) {
        const nvme = data.nvme_smart_health_information_log;
        info.percentage_used = nvme.percentage_used;
        // Keep the generic temperature when the NVMe log does not carry one.
        info.temperature_c = nvme.temperature ?? info.temperature_c;
      }

      // SATA specific
      if (data.ata_smart_attributes?.table) {
        for (const attr of data.ata_smart_attributes.table) {
          if (attr.id === 5 || attr.name === "Reallocated_Sector_Ct") {
            info.reallocated_sectors = attr.raw?.value || 0;
          }
          if (attr.id === 197 || attr.name === "Current_Pending_Sector") {
            info.pending_sectors = attr.raw?.value || 0;
          }
        }
      }

      results.push(info);
    } catch {
      // Failed to parse, skip this device
    }
  }

  return results;
}
@@ -0,0 +1,21 @@
1
+ import { hostname } from "os";
2
+ import { readProcFile } from "../lib/parse.js";
3
+ import { run } from "../lib/exec.js";
4
+ import type { SystemInfo } from "../lib/types.js";
5
+
6
/**
 * Gathers basic host identity: hostname, first address printed by
 * `hostname -I`, OS pretty name, kernel release and uptime in whole seconds.
 *
 * Textual fields fall back to "unknown" ("Unknown" for the OS name) when
 * their source yields nothing.
 */
export async function collectSystem(): Promise<SystemInfo> {
  const prettyName = /PRETTY_NAME="(.+?)"/.exec(readProcFile("/etc/os-release") || "");
  const kernelOut = await run("uname", ["-r"]);
  // The first field of /proc/uptime is taken as the uptime in seconds.
  const [uptimeField] = (readProcFile("/proc/uptime") || "0").split(" ");
  const addresses = await run("hostname", ["-I"]);

  return {
    hostname: hostname(),
    ip: addresses?.trim().split(" ")[0] || "unknown",
    os: prettyName?.[1] || "Unknown",
    kernel: kernelOut?.trim() || "unknown",
    uptime_seconds: Math.floor(parseFloat(uptimeField)),
  };
}