@glassmkr/crucible 0.1.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/LICENSE +21 -0
- package/README.md +92 -0
- package/config/collector.example.yaml +43 -0
- package/dist/alerts/evaluator.d.ts +3 -0
- package/dist/alerts/evaluator.js +14 -0
- package/dist/alerts/evaluator.js.map +1 -0
- package/dist/alerts/rules.d.ts +7 -0
- package/dist/alerts/rules.js +203 -0
- package/dist/alerts/rules.js.map +1 -0
- package/dist/alerts/state.d.ts +6 -0
- package/dist/alerts/state.js +77 -0
- package/dist/alerts/state.js.map +1 -0
- package/dist/collect/cpu.d.ts +2 -0
- package/dist/collect/cpu.js +35 -0
- package/dist/collect/cpu.js.map +1 -0
- package/dist/collect/disks.d.ts +2 -0
- package/dist/collect/disks.js +33 -0
- package/dist/collect/disks.js.map +1 -0
- package/dist/collect/ipmi.d.ts +2 -0
- package/dist/collect/ipmi.js +55 -0
- package/dist/collect/ipmi.js.map +1 -0
- package/dist/collect/memory.d.ts +2 -0
- package/dist/collect/memory.js +27 -0
- package/dist/collect/memory.js.map +1 -0
- package/dist/collect/network.d.ts +2 -0
- package/dist/collect/network.js +54 -0
- package/dist/collect/network.js.map +1 -0
- package/dist/collect/os-alerts.d.ts +2 -0
- package/dist/collect/os-alerts.js +41 -0
- package/dist/collect/os-alerts.js.map +1 -0
- package/dist/collect/raid.d.ts +2 -0
- package/dist/collect/raid.js +34 -0
- package/dist/collect/raid.js.map +1 -0
- package/dist/collect/smart.d.ts +2 -0
- package/dist/collect/smart.js +56 -0
- package/dist/collect/smart.js.map +1 -0
- package/dist/collect/system.d.ts +2 -0
- package/dist/collect/system.js +19 -0
- package/dist/collect/system.js.map +1 -0
- package/dist/config.d.ts +208 -0
- package/dist/config.js +58 -0
- package/dist/config.js.map +1 -0
- package/dist/index.d.ts +2 -0
- package/dist/index.js +96 -0
- package/dist/index.js.map +1 -0
- package/dist/lib/exec.d.ts +1 -0
- package/dist/lib/exec.js +19 -0
- package/dist/lib/exec.js.map +1 -0
- package/dist/lib/parse.d.ts +4 -0
- package/dist/lib/parse.js +29 -0
- package/dist/lib/parse.js.map +1 -0
- package/dist/lib/types.d.ts +103 -0
- package/dist/lib/types.js +2 -0
- package/dist/lib/types.js.map +1 -0
- package/dist/notify/email.d.ts +4 -0
- package/dist/notify/email.js +55 -0
- package/dist/notify/email.js.map +1 -0
- package/dist/notify/slack.d.ts +2 -0
- package/dist/notify/slack.js +38 -0
- package/dist/notify/slack.js.map +1 -0
- package/dist/notify/telegram.d.ts +2 -0
- package/dist/notify/telegram.js +38 -0
- package/dist/notify/telegram.js.map +1 -0
- package/dist/push/forge.d.ts +2 -0
- package/dist/push/forge.js +26 -0
- package/dist/push/forge.js.map +1 -0
- package/package.json +29 -0
- package/src/alerts/evaluator.ts +15 -0
- package/src/alerts/rules.ts +184 -0
- package/src/alerts/state.ts +92 -0
- package/src/collect/cpu.ts +44 -0
- package/src/collect/disks.ts +36 -0
- package/src/collect/ipmi.ts +60 -0
- package/src/collect/memory.ts +30 -0
- package/src/collect/network.ts +61 -0
- package/src/collect/os-alerts.ts +43 -0
- package/src/collect/raid.ts +40 -0
- package/src/collect/smart.ts +60 -0
- package/src/collect/system.ts +21 -0
- package/src/config.ts +60 -0
- package/src/index.ts +112 -0
- package/src/lib/exec.ts +16 -0
- package/src/lib/parse.ts +29 -0
- package/src/lib/types.ts +110 -0
- package/src/notify/email.ts +68 -0
- package/src/notify/slack.ts +46 -0
- package/src/notify/telegram.ts +45 -0
- package/src/push/forge.ts +25 -0
- package/tsconfig.json +15 -0
|
@@ -0,0 +1,184 @@
|
|
|
1
|
+
// Alert rules for the collector are identical to the Forge evaluator.
|
|
2
|
+
// TODO: re-export from a shared definition to avoid duplication; for now the rules are defined inline below.
|
|
3
|
+
// For the collector, we use the same 15 rules but with local thresholds from config.
|
|
4
|
+
|
|
5
|
+
import type { Snapshot, AlertResult } from "../lib/types.js";
|
|
6
|
+
import type { Config } from "../config.js";
|
|
7
|
+
|
|
8
|
+
/**
 * Contract implemented by every entry of the rule table.
 */
export interface AlertRule {
  /** Identifier of the rule; the rules in this file use it as the `type` of the alerts they emit. */
  type: string;

  /**
   * Inspects one snapshot and returns the alerts that currently apply.
   *
   * @param snap - Collected metrics to inspect.
   * @param thresholds - The `thresholds` section of the collector configuration.
   * @returns The alerts that fire for this snapshot; an empty array when none do.
   */
  evaluate(snap: Snapshot, thresholds: Config["thresholds"]): AlertResult[];
}
|
|
12
|
+
|
|
13
|
+
/**
 * Rule table evaluated against each snapshot.
 *
 * Every rule returns an empty array when the section of the snapshot it needs
 * is missing or when its condition does not hold. Thresholds are read from the
 * `thresholds` argument and fall back to the defaults written inline below.
 */
export const allRules: AlertRule[] = [
  // 1. RAM high
  { type: "ram_high", evaluate(snap, t) {
    if (!snap.memory?.total_mb) return [];
    const pct = (snap.memory.used_mb / snap.memory.total_mb) * 100;
    if (pct < (t.ram_percent ?? 90)) return [];
    return [{ type: "ram_high", severity: pct >= 95 ? "critical" : "warning",
      title: `RAM usage at ${pct.toFixed(1)}%`,
      message: `Using ${snap.memory.used_mb}MB of ${snap.memory.total_mb}MB. ${snap.memory.available_mb}MB available.`,
      evidence: { used_mb: snap.memory.used_mb, total_mb: snap.memory.total_mb, percent: Math.round(pct * 10) / 10 },
      recommendation: "Check: ps aux --sort=-rss | head -20" }];
  }},
  // 2. Swap active
  { type: "swap_active", evaluate(snap, t) {
    if (t.swap_alert === false || !snap.memory || snap.memory.swap_used_mb <= 0) return [];
    return [{ type: "swap_active", severity: "warning", title: `Swap in use: ${snap.memory.swap_used_mb}MB`,
      message: "Server is using swap space, indicating memory pressure.",
      evidence: { swap_used_mb: snap.memory.swap_used_mb },
      recommendation: "Check: free -h && ps aux --sort=-rss | head -20" }];
  }},
  // 3. Disk space high
  { type: "disk_space_high", evaluate(snap, t) {
    if (!snap.disks) return [];
    const threshold = t.disk_percent ?? 85;
    return snap.disks.filter(d => d.percent_used >= threshold).map(d => ({
      type: "disk_space_high", severity: d.percent_used >= 95 ? "critical" as const : "warning" as const,
      title: `Disk ${d.mount} at ${d.percent_used}%`,
      message: `${d.device}: ${d.used_gb}GB of ${d.total_gb}GB used. ${d.available_gb}GB available.`,
      evidence: { device: d.device, mount: d.mount, percent_used: d.percent_used },
      recommendation: "Check: du -sh /* | sort -rh | head -20" }));
  }},
  // 4. CPU iowait
  { type: "cpu_iowait_high", evaluate(snap, t) {
    if (!snap.cpu || snap.cpu.iowait_percent < (t.iowait_percent ?? 20)) return [];
    return [{ type: "cpu_iowait_high", severity: "warning", title: `CPU iowait at ${snap.cpu.iowait_percent.toFixed(1)}%`,
      message: `High I/O wait: CPU spending ${snap.cpu.iowait_percent.toFixed(1)}% waiting for disk.`,
      evidence: { iowait_percent: snap.cpu.iowait_percent },
      recommendation: "Check: iotop -oP or iostat -x 1 5" }];
  }},
  // 5. OOM kills
  { type: "oom_kills", evaluate(snap) {
    if (!snap.os_alerts || snap.os_alerts.oom_kills_recent <= 0) return [];
    return [{ type: "oom_kills", severity: "critical", title: `${snap.os_alerts.oom_kills_recent} OOM kill(s)`,
      message: `Kernel OOM killer terminated ${snap.os_alerts.oom_kills_recent} process(es).`,
      evidence: { oom_kills_recent: snap.os_alerts.oom_kills_recent },
      recommendation: "Check: dmesg | grep -i 'out of memory'" }];
  }},
  // 6. SMART failing
  { type: "smart_failing", evaluate(snap) {
    if (!snap.smart) return [];
    return snap.smart.filter(d => d.health !== "PASSED" || (d.reallocated_sectors && d.reallocated_sectors > 0) || (d.pending_sectors && d.pending_sectors > 0))
      .map(d => ({ type: "smart_failing", severity: "critical" as const,
        title: `SMART failure: ${d.device}`, message: `${d.model}: drive showing signs of failure.`,
        evidence: { device: d.device, health: d.health, reallocated_sectors: d.reallocated_sectors, pending_sectors: d.pending_sectors },
        recommendation: `Back up data. Schedule replacement for ${d.device}.` }));
  }},
  // 7. NVMe wear
  { type: "nvme_wear_high", evaluate(snap, t) {
    if (!snap.smart) return [];
    const threshold = t.nvme_wear_percent ?? 85;
    return snap.smart.filter(d => d.percentage_used != null && d.percentage_used >= threshold)
      .map(d => ({ type: "nvme_wear_high", severity: d.percentage_used! >= 95 ? "critical" as const : "warning" as const,
        title: `NVMe ${d.device} wear at ${d.percentage_used}%`, message: `${d.model} at ${d.percentage_used}% lifetime wear.`,
        evidence: { device: d.device, percentage_used: d.percentage_used },
        recommendation: "Plan drive replacement." }));
  }},
  // 8. RAID degraded
  { type: "raid_degraded", evaluate(snap) {
    if (!snap.raid) return [];
    return snap.raid.filter(r => r.degraded || r.failed_disks.length > 0)
      .map(r => ({ type: "raid_degraded", severity: "critical" as const,
        title: `RAID ${r.device} degraded`, message: `${r.device} (${r.level}) degraded. Failed: ${r.failed_disks.join(", ") || "unknown"}.`,
        evidence: { device: r.device, failed_disks: r.failed_disks },
        recommendation: "Replace failed drive immediately." }));
  }},
  // 9. Disk latency
  { type: "disk_latency_high", evaluate(snap, t) {
    if (!snap.disks) return [];
    return snap.disks.filter(d => {
      if (d.latency_p99_ms == null) return false;
      // NVMe devices get a tighter default limit than other drives.
      const thresh = d.device.includes("nvme") ? (t.disk_latency_nvme_ms ?? 50) : (t.disk_latency_hdd_ms ?? 200);
      return d.latency_p99_ms >= thresh;
    }).map(d => ({ type: "disk_latency_high", severity: "warning" as const,
      title: `Disk ${d.device} latency ${d.latency_p99_ms!.toFixed(1)}ms`,
      message: `p99 I/O latency on ${d.device} is high.`,
      evidence: { device: d.device, latency_p99_ms: d.latency_p99_ms },
      recommendation: "Check: iotop -oP" }));
  }},
  // 10. Interface errors
  { type: "interface_errors", evaluate(snap) {
    if (!snap.network) return [];
    return snap.network.filter(i => (i.rx_errors + i.tx_errors + i.rx_drops + i.tx_drops) > 0)
      .map(i => ({ type: "interface_errors", severity: "warning" as const,
        title: `${i.interface}: errors/drops detected`,
        message: `RX errors=${i.rx_errors}, TX errors=${i.tx_errors}, RX drops=${i.rx_drops}, TX drops=${i.tx_drops}.`,
        evidence: { interface: i.interface, rx_errors: i.rx_errors, tx_errors: i.tx_errors, rx_drops: i.rx_drops, tx_drops: i.tx_drops },
        recommendation: "Check cables and SFP/transceiver." }));
  }},
  // 11. Link speed mismatch
  { type: "link_speed_mismatch", evaluate(snap) {
    if (!snap.network) return [];
    // A speed of 0 means "unknown" and is deliberately not reported.
    return snap.network.filter(i => i.speed_mbps > 0 && i.speed_mbps < 1000)
      .map(i => ({ type: "link_speed_mismatch", severity: "warning" as const,
        title: `${i.interface} at ${i.speed_mbps} Mbps`,
        message: `Interface negotiated below 1 Gbps.`,
        evidence: { interface: i.interface, speed_mbps: i.speed_mbps },
        recommendation: "Check cable, SFP, switch port config." }));
  }},
  // 12. Interface saturation
  { type: "interface_saturation", evaluate(snap, t) {
    if (!snap.network) return [];
    const threshold = (t.interface_utilization_percent ?? 90) / 100;
    return snap.network.filter(i => {
      if (!i.speed_mbps) return false;
      // Link speed is in megabits/s; the measured rates are in bytes/s.
      const maxBps = (i.speed_mbps * 1_000_000) / 8;
      return Math.max(i.rx_bytes_sec, i.tx_bytes_sec) / maxBps >= threshold;
    }).map(i => {
      const maxBps = (i.speed_mbps * 1_000_000) / 8;
      const util = Math.max(i.rx_bytes_sec, i.tx_bytes_sec) / maxBps * 100;
      return { type: "interface_saturation", severity: "warning" as const,
        title: `${i.interface} at ${util.toFixed(0)}% utilization`,
        message: `Interface ${i.interface} (${i.speed_mbps} Mbps) near saturation.`,
        evidence: { interface: i.interface, utilization_percent: Math.round(util * 10) / 10 },
        recommendation: "Check: iftop or nload" };
    });
  }},
  // 13. CPU temperature
  { type: "cpu_temperature_high", evaluate(snap, t) {
    if (!snap.ipmi?.available || !snap.ipmi.sensors) return [];
    const warn = t.cpu_temp_warning_c ?? 80;
    return snap.ipmi.sensors.filter(s => {
      const n = s.name.toLowerCase();
      if (!n.includes("cpu") && !n.includes("temp")) return false;
      // The name match alone also selects fan (RPM), voltage and power sensors
      // such as "CPU Fan"; only readings expressed in a temperature unit may be
      // compared against a temperature threshold.
      const unit = String(s.unit ?? "").toLowerCase();
      if (!/degree|celsius|°/.test(unit) && unit !== "c") return false;
      const v = typeof s.value === "number" ? s.value : parseFloat(String(s.value));
      return !isNaN(v) && v >= warn;
    }).map(s => {
      const v = typeof s.value === "number" ? s.value : parseFloat(String(s.value));
      // Prefer the sensor's own upper-critical limit over the configured one.
      const crit = s.upper_critical ?? (t.cpu_temp_critical_c ?? 90);
      return { type: "cpu_temperature_high", severity: v >= crit ? "critical" as const : "warning" as const,
        title: `${s.name}: ${v}${s.unit}`, message: `Temperature above warning threshold.`,
        evidence: { sensor: s.name, value: v },
        recommendation: "Check cooling, fans, airflow." };
    });
  }},
  // 14. ECC errors
  { type: "ecc_errors", evaluate(snap) {
    if (!snap.ipmi?.ecc_errors) return [];
    const { correctable, uncorrectable } = snap.ipmi.ecc_errors;
    if (correctable <= 0 && uncorrectable <= 0) return [];
    if (uncorrectable > 0) return [{ type: "ecc_errors", severity: "critical",
      title: `${uncorrectable} uncorrectable ECC error(s)`, message: "Data corruption possible. DIMM failing.",
      evidence: { correctable, uncorrectable },
      recommendation: "Replace DIMM immediately. Run: ipmitool sdr type Memory" }];
    return [{ type: "ecc_errors", severity: "warning",
      title: `${correctable} correctable ECC error(s)`, message: "Early warning of DIMM failure.",
      evidence: { correctable, uncorrectable },
      recommendation: "Schedule DIMM replacement. Run: ipmitool sdr type Memory" }];
  }},
  // 15. PSU redundancy
  { type: "psu_redundancy_loss", evaluate(snap) {
    if (!snap.ipmi?.available || !snap.ipmi.sensors) return [];
    const psus = snap.ipmi.sensors.filter(s => { const n = s.name.toLowerCase(); return n.includes("psu") || n.includes("power supply"); });
    // Redundancy can only be lost when at least two PSU sensors are present.
    if (psus.length < 2) return [];
    const failed = psus.filter(s => { const st = String(s.status).toLowerCase(); const v = String(s.value).toLowerCase();
      return st.includes("fail") || st.includes("absent") || v.includes("fail") || v.includes("absent"); });
    if (failed.length === 0) return [];
    return [{ type: "psu_redundancy_loss", severity: "critical",
      title: "PSU redundancy lost", message: `${failed.length} PSU(s) failed/absent: ${failed.map(p => p.name).join(", ")}.`,
      evidence: { failed: failed.map(p => ({ name: p.name, status: p.status })) },
      recommendation: "Replace failed PSU. Check power connections." }];
  }},
];
|
|
@@ -0,0 +1,92 @@
|
|
|
1
|
+
import { readFileSync, writeFileSync, mkdirSync } from "fs";
|
|
2
|
+
import type { AlertResult } from "../lib/types.js";
|
|
3
|
+
|
|
4
|
+
const STATE_FILE = "/var/lib/glassmkr/alert-state.json";
|
|
5
|
+
|
|
6
|
+
/** Bookkeeping record kept for one currently active alert. */
interface AlertState {
  /** Alert type; the same string is used as the key in the state map. */
  type: string;
  /** ISO-8601 timestamp of the evaluation in which the alert first appeared. */
  first_seen: string;
  /** ISO-8601 timestamp of the most recent evaluation in which it was present. */
  last_seen: string;
  /** Initialised to `false` when the record is created. */
  notified: boolean;
}
|
|
12
|
+
|
|
13
|
+
let state: Map<string, AlertState> = new Map();
|
|
14
|
+
|
|
15
|
+
/**
 * Replaces the in-memory alert state with the contents of STATE_FILE.
 *
 * Any failure leaves the state empty. A missing file is treated as a normal
 * first run; every other failure (unreadable file, invalid JSON, JSON that is
 * not an object) is logged, mirroring how `save` reports its errors.
 */
function load() {
  try {
    const parsed: unknown = JSON.parse(readFileSync(STATE_FILE, "utf-8"));
    // Strings and arrays are valid JSON but would yield nonsense map entries.
    if (parsed === null || typeof parsed !== "object" || Array.isArray(parsed)) {
      throw new Error("alert state file does not contain a JSON object");
    }
    state = new Map(Object.entries(parsed as Record<string, AlertState>));
  } catch (err) {
    const code = (err as { code?: unknown } | null)?.code;
    if (code !== "ENOENT") {
      console.error("[state] Failed to load alert state:", err);
    }
    state = new Map();
  }
}
|
|
24
|
+
|
|
25
|
+
/**
 * Writes the in-memory alert state to STATE_FILE as pretty-printed JSON,
 * creating the containing directory first. Failures are logged, not thrown.
 */
function save() {
  try {
    mkdirSync("/var/lib/glassmkr", { recursive: true });
    // Flatten the Map into a plain object so it can be serialised.
    const snapshot: Record<string, AlertState> = {};
    state.forEach((entry, key) => {
      snapshot[key] = entry;
    });
    const serialised = JSON.stringify(snapshot, null, 2);
    writeFileSync(STATE_FILE, serialised);
  } catch (err) {
    console.error("[state] Failed to save alert state:", err);
  }
}
|
|
35
|
+
|
|
36
|
+
// Initialize on import
|
|
37
|
+
load();
|
|
38
|
+
|
|
39
|
+
/**
 * Reconciles the alerts produced by the current evaluation with the stored
 * state and persists the result.
 *
 * State is tracked per alert type. An alert is "new" when its type was not
 * active before this call; a type is "resolved" when it was active before and
 * no current alert carries it.
 *
 * @param currentAlerts - Alerts raised by the current evaluation.
 * @returns `newAlerts`: every current alert whose type was not active before
 *   this call. `resolvedAlerts`: one synthetic "Resolved" entry per type that
 *   is no longer present.
 */
export function updateAlertState(currentAlerts: AlertResult[]): {
  newAlerts: AlertResult[];
  resolvedAlerts: AlertResult[];
} {
  const now = new Date().toISOString();
  const currentTypes = new Set(currentAlerts.map((a) => a.type));
  const newAlerts: AlertResult[] = [];
  const resolvedAlerts: AlertResult[] = [];

  // Snapshot the active types before mutating the state. Without it, the first
  // alert of a new type registers the type and any further alerts of that same
  // type in this batch (e.g. a second full disk) would be treated as known and
  // never reported.
  const activeBefore = new Set(state.keys());

  for (const alert of currentAlerts) {
    const existing = state.get(alert.type);
    if (existing) {
      existing.last_seen = now;
    } else {
      state.set(alert.type, { type: alert.type, first_seen: now, last_seen: now, notified: false });
    }
    if (!activeBefore.has(alert.type)) {
      newAlerts.push(alert);
    }
  }

  // Check for resolved alerts
  for (const [type, alertState] of state) {
    if (!currentTypes.has(type)) {
      resolvedAlerts.push({
        type,
        severity: "warning",
        title: `Resolved: ${type}`,
        message: `Condition cleared. Active for ${timeSince(alertState.first_seen)}.`,
        evidence: {},
        recommendation: "",
      });
      state.delete(type);
    }
  }

  save();
  return { newAlerts, resolvedAlerts };
}
|
|
79
|
+
|
|
80
|
+
/**
 * Formats the time elapsed since `isoDate` as a coarse human-readable string.
 *
 * @param isoDate - Timestamp in a format accepted by the `Date` constructor.
 * @returns "N minute(s)" below one hour, "H hour(s) M minute(s)" below one
 *   day, "D day(s)" otherwise. Timestamps in the future are reported as
 *   "0 minute(s)" and unparseable input as "an unknown duration".
 */
function timeSince(isoDate: string): string {
  const startedAt = new Date(isoDate).getTime();
  // An unparseable timestamp used to surface as "NaN day(s)".
  if (!Number.isFinite(startedAt)) return "an unknown duration";

  // Clamp so a clock that stepped backwards cannot produce negative durations.
  const ms = Math.max(0, Date.now() - startedAt);
  const minutes = Math.floor(ms / 60000);
  if (minutes < 60) return `${minutes} minute(s)`;
  const hours = Math.floor(minutes / 60);
  if (hours < 24) return `${hours} hour(s) ${minutes % 60} minute(s)`;
  const days = Math.floor(hours / 24);
  return `${days} day(s)`;
}
|
|
89
|
+
|
|
90
|
+
/** Returns the keys of the in-memory alert state, i.e. the currently active alert types. */
export function getActiveAlerts(): string[] {
  const activeTypes: string[] = [];
  state.forEach((_entry, key) => {
    activeTypes.push(key);
  });
  return activeTypes;
}
|
|
@@ -0,0 +1,44 @@
|
|
|
1
|
+
import { readProcFile, sleep } from "../lib/parse.js";
|
|
2
|
+
import type { CpuInfo } from "../lib/types.js";
|
|
3
|
+
|
|
4
|
+
/** Counters taken from the aggregate "cpu" line of /proc/stat, in the order they appear there. */
interface CpuStat {
  user: number;
  nice: number;
  system: number;
  idle: number;
  iowait: number;
  irq: number;
  softirq: number;
  steal: number;
}
|
|
8
|
+
|
|
9
|
+
/**
 * Reads the aggregate "cpu" line of /proc/stat.
 *
 * @returns The first eight counters of that line; every counter is 0 when the
 *   file or the line is unavailable, or when a field is missing or not numeric.
 */
function parseProcStat(): CpuStat {
  const contents = readProcFile("/proc/stat") || "";
  const cpuLine = contents.split("\n").find((l) => l.startsWith("cpu "));
  // Drop the leading "cpu" label; an absent line yields no fields at all.
  const fields = cpuLine ? cpuLine.split(/\s+/).slice(1).map(Number) : [];
  const at = (index: number): number => fields[index] || 0;

  return {
    user: at(0),
    nice: at(1),
    system: at(2),
    idle: at(3),
    iowait: at(4),
    irq: at(5),
    softirq: at(6),
    steal: at(7),
  };
}
|
|
19
|
+
|
|
20
|
+
/**
 * Samples /proc/stat twice, one second apart, and converts the counter deltas
 * into percentages; also reports the load averages from /proc/loadavg.
 *
 * @returns Percentages rounded to two decimals and the 1/5/15 minute load
 *   averages (0 when they cannot be parsed).
 */
export async function collectCpu(): Promise<CpuInfo> {
  const stat1 = parseProcStat();
  await sleep(1000);
  const stat2 = parseProcStat();

  // Counters are not guaranteed to be monotonic (proc(5) notes that iowait may
  // decrease), and a failed second read yields zeros. Clamp each delta so a
  // percentage can never become negative.
  const delta = (key: keyof CpuStat): number => Math.max(0, stat2[key] - stat1[key]);
  const d = {
    user: delta("user"), nice: delta("nice"),
    system: delta("system"), idle: delta("idle"),
    iowait: delta("iowait"), irq: delta("irq"),
    softirq: delta("softirq"), steal: delta("steal"),
  };
  // Fall back to 1 so an all-zero sample divides cleanly to 0%.
  const total = Object.values(d).reduce((a, b) => a + b, 0) || 1;
  const percent = (part: number): number => Math.round((part / total) * 10000) / 100;

  const loadavg = (readProcFile("/proc/loadavg") || "0 0 0").trim().split(" ");

  return {
    user_percent: percent(d.user),
    system_percent: percent(d.system),
    iowait_percent: percent(d.iowait),
    idle_percent: percent(d.idle),
    load_1m: parseFloat(loadavg[0]) || 0,
    load_5m: parseFloat(loadavg[1]) || 0,
    load_15m: parseFloat(loadavg[2]) || 0,
  };
}
|
|
@@ -0,0 +1,36 @@
|
|
|
1
|
+
import { run } from "../lib/exec.js";
|
|
2
|
+
import type { DiskInfo } from "../lib/types.js";
|
|
3
|
+
|
|
4
|
+
/**
 * Collects usage figures for mounted filesystems backed by a /dev/ device.
 *
 * Runs `df` with byte-sized output and an explicit column list, excluding
 * tmpfs, devtmpfs and squashfs.
 *
 * @returns One entry per matching filesystem, sizes in GiB rounded to two
 *   decimals; an empty array when `df` produced no output.
 */
export async function collectDisks(): Promise<DiskInfo[]> {
  const dfOutput = await run("df", ["-B1", "--output=source,target,size,used,avail,pcent", "-x", "tmpfs", "-x", "devtmpfs", "-x", "squashfs"]);
  if (!dfOutput) return [];

  const BYTES_PER_GIB = 1073741824;
  const toGb = (bytes: number): number => Math.round((bytes / BYTES_PER_GIB) * 100) / 100;

  const lines = dfOutput.trim().split("\n").slice(1); // skip header
  const disks: DiskInfo[] = [];

  for (const line of lines) {
    const parts = line.trim().split(/\s+/);
    if (parts.length < 6) continue;

    const device = parts[0];
    // Skip pseudo-filesystems
    if (!device.startsWith("/dev/")) continue;

    // The four numeric columns are always last. Reading them from the end keeps
    // them aligned when the mount point itself contains whitespace, which would
    // otherwise shift every positional field.
    const [sizeStr, usedStr, availStr, pctStr] = parts.slice(-4);
    const mount = parts.slice(1, -4).join(" ");

    disks.push({
      device,
      mount,
      total_gb: toGb(parseInt(sizeStr, 10) || 0),
      used_gb: toGb(parseInt(usedStr, 10) || 0),
      available_gb: toGb(parseInt(availStr, 10) || 0),
      percent_used: parseInt(pctStr.replace("%", ""), 10) || 0,
    });
  }

  return disks;
}
|
|
@@ -0,0 +1,60 @@
|
|
|
1
|
+
import { run } from "../lib/exec.js";
|
|
2
|
+
import type { IpmiInfo } from "../lib/types.js";
|
|
3
|
+
|
|
4
|
+
/**
 * Collects sensor readings, ECC error counts and the SEL entry count through
 * `ipmitool`.
 *
 * @returns `available: false` with empty data when `ipmitool sensor` produced
 *   no output; otherwise the parsed sensors, the summed ECC counters and the
 *   number of SEL entries (0 when it cannot be determined).
 */
export async function collectIpmi(): Promise<IpmiInfo> {
  const sensorRaw = await run("ipmitool", ["sensor"]);
  if (!sensorRaw) {
    return { available: false, sensors: [], ecc_errors: { correctable: 0, uncorrectable: 0 }, sel_entries_count: 0 };
  }

  // Parse sensor readings: pipe-separated columns, the first four being
  // name | value | unit | status.
  const sensors: IpmiInfo["sensors"] = [];
  for (const line of sensorRaw.split("\n")) {
    const parts = line.split("|").map((s) => s.trim());
    if (parts.length < 4) continue;
    const name = parts[0];
    const rawValue = parts[1];
    const unit = parts[2];
    const status = parts[3];

    // Keep non-numeric readings as the original text.
    const numValue = parseFloat(rawValue);
    const value: number | string = isNaN(numValue) ? rawValue : numValue;

    // Parse upper critical threshold (ninth column)
    let upperCritical: number | undefined;
    if (parts[8]) {
      const uc = parseFloat(parts[8]);
      if (!isNaN(uc)) upperCritical = uc;
    }

    sensors.push({ name, value, unit, status, upper_critical: upperCritical });
  }

  // ECC errors from memory-type sensors
  let correctable = 0;
  let uncorrectable = 0;
  for (const sensor of sensors) {
    if (typeof sensor.value !== "number") continue;
    const name = sensor.name.toLowerCase();
    // "uncorrectable" contains "correctable", so it must be tested first and
    // exclusively; otherwise uncorrectable counts also inflate the correctable
    // total.
    if (name.includes("uncorrectable")) {
      uncorrectable += sensor.value;
    } else if (name.includes("correctable")) {
      correctable += sensor.value;
    }
  }

  // SEL entry count
  let selCount = 0;
  const selInfo = await run("ipmitool", ["sel", "info"]);
  if (selInfo) {
    const match = selInfo.match(/Entries\s*:\s*(\d+)/i);
    if (match) selCount = parseInt(match[1], 10);
  }

  return {
    available: true,
    sensors,
    ecc_errors: { correctable, uncorrectable },
    sel_entries_count: selCount,
  };
}
|
|
@@ -0,0 +1,30 @@
|
|
|
1
|
+
import { readProcFile, parseKb } from "../lib/parse.js";
|
|
2
|
+
import type { MemoryInfo } from "../lib/types.js";
|
|
3
|
+
|
|
4
|
+
/**
 * Derives memory and swap usage, in whole megabytes, from /proc/meminfo.
 *
 * Used memory is reported as total minus available; used swap as total minus
 * free.
 */
export async function collectMemory(): Promise<MemoryInfo> {
  // Index every "Key:   value" line of the file by its key.
  const fields: Record<string, string> = {};
  (readProcFile("/proc/meminfo") || "").split("\n").forEach((entry) => {
    const parsed = entry.match(/^(\w+):\s+(.+)/);
    if (parsed) fields[parsed[1]] = parsed[2];
  });

  const kbToMb = (kb: number): number => Math.round(kb / 1024);

  const memTotalKb = parseKb(fields["MemTotal"]);
  const memAvailableKb = parseKb(fields["MemAvailable"]);
  const swapTotalKb = parseKb(fields["SwapTotal"]);
  const swapFreeKb = parseKb(fields["SwapFree"]);

  const total_mb = kbToMb(memTotalKb);
  const available_mb = kbToMb(memAvailableKb);

  return {
    total_mb,
    used_mb: total_mb - available_mb,
    available_mb,
    swap_total_mb: kbToMb(swapTotalKb),
    swap_used_mb: kbToMb(swapTotalKb - swapFreeKb),
  };
}
|
|
@@ -0,0 +1,61 @@
|
|
|
1
|
+
import { readProcFile, sleep } from "../lib/parse.js";
|
|
2
|
+
import { readFileSync } from "fs";
|
|
3
|
+
import type { NetworkInfo } from "../lib/types.js";
|
|
4
|
+
|
|
5
|
+
/** Per-interface counters read from one line of /proc/net/dev. */
interface IfaceStats {
  rx_bytes: number;
  rx_packets: number;
  rx_errors: number;
  rx_drops: number;
  tx_bytes: number;
  tx_packets: number;
  tx_errors: number;
  tx_drops: number;
}
|
|
9
|
+
|
|
10
|
+
/**
 * Parses /proc/net/dev into per-interface counters.
 *
 * The loopback interface and interfaces named veth*, docker*, br-* or virbr*
 * are left out.
 *
 * @returns Counters keyed by interface name; empty when the file is
 *   unavailable. Missing or non-numeric fields are reported as 0.
 */
function parseNetDev(): Record<string, IfaceStats> {
  const raw = readProcFile("/proc/net/dev") || "";
  const result: Record<string, IfaceStats> = {};
  // The first two lines are column headers.
  for (const line of raw.split("\n").slice(2)) {
    // Whitespace after the colon is optional: some kernels print the first
    // counter directly against it ("eth0:123456 ..."), and requiring a space
    // silently dropped such interfaces.
    const match = line.match(/^\s*([^\s:]+):\s*(.*)/);
    if (!match) continue;
    const name = match[1];
    // Skip virtual interfaces
    if (name === "lo" || name.startsWith("veth") || name.startsWith("docker") || name.startsWith("br-") || name.startsWith("virbr")) continue;
    const parts = match[2].trim().split(/\s+/).map(Number);
    // Receive counters occupy columns 0-7, transmit counters start at column 8.
    result[name] = {
      rx_bytes: parts[0] || 0, rx_packets: parts[1] || 0, rx_errors: parts[2] || 0, rx_drops: parts[3] || 0,
      tx_bytes: parts[8] || 0, tx_packets: parts[9] || 0, tx_errors: parts[10] || 0, tx_drops: parts[11] || 0,
    };
  }
  return result;
}
|
|
27
|
+
|
|
28
|
+
/**
 * Reads the link speed of an interface from /sys/class/net/<iface>/speed.
 *
 * @param iface - Interface name.
 * @returns The speed in Mbps, or 0 when the file cannot be read or does not
 *   hold a positive integer.
 */
function getSpeed(iface: string): number {
  let contents: string;
  try {
    contents = readFileSync(`/sys/class/net/${iface}/speed`, "utf-8");
  } catch {
    return 0;
  }

  const mbps = parseInt(contents.trim(), 10);
  if (isNaN(mbps) || mbps <= 0) return 0;
  return mbps;
}
|
|
37
|
+
|
|
38
|
+
/**
 * Samples /proc/net/dev twice, about one second apart, and reports per
 * interface: link speed, receive/transmit byte rates and the error/drop
 * counters of the second sample.
 *
 * Interfaces that are not present in both samples are left out.
 */
export async function collectNetwork(): Promise<NetworkInfo[]> {
  const startedAt = Date.now();
  const stats1 = parseNetDev();
  await sleep(1000);
  const stats2 = parseNetDev();

  // Timers can fire late on a busy host; scale by the measured interval so the
  // figures really are per second. Fall back to the nominal second if the
  // clock did not advance.
  const measuredMs = Date.now() - startedAt;
  const elapsedMs = measuredMs > 0 ? measuredMs : 1000;
  // A counter that went backwards (interface reset) is reported as 0 rather
  // than as a negative rate.
  const perSecond = (later: number, earlier: number): number =>
    Math.max(0, Math.round(((later - earlier) * 1000) / elapsedMs));

  const results: NetworkInfo[] = [];
  for (const [name, s2] of Object.entries(stats2)) {
    const s1 = stats1[name];
    if (!s1) continue;

    results.push({
      interface: name,
      speed_mbps: getSpeed(name),
      rx_bytes_sec: perSecond(s2.rx_bytes, s1.rx_bytes),
      tx_bytes_sec: perSecond(s2.tx_bytes, s1.tx_bytes),
      rx_errors: s2.rx_errors,
      tx_errors: s2.tx_errors,
      rx_drops: s2.rx_drops,
      tx_drops: s2.tx_drops,
    });
  }

  return results;
}
|
|
@@ -0,0 +1,43 @@
|
|
|
1
|
+
import { run } from "../lib/exec.js";
|
|
2
|
+
import { readProcFile } from "../lib/parse.js";
|
|
3
|
+
import { readdirSync, readFileSync } from "fs";
|
|
4
|
+
import type { OsAlerts } from "../lib/types.js";
|
|
5
|
+
|
|
6
|
+
/**
 * Collects OS-level warning signals: recent OOM kills, the number of zombie
 * processes and the clock offset reported by chrony.
 *
 * Each signal falls back to 0 when its source is unavailable.
 */
export async function collectOsAlerts(): Promise<OsAlerts> {
  // OOM kills: count "Out of memory" messages in the last five minutes of
  // error/critical kernel log output.
  let oomKills = 0;
  const dmesg = await run("dmesg", ["--level=err,crit", "--since", "5 min ago"]);
  if (dmesg) {
    oomKills = (dmesg.match(/Out of memory/gi) || []).length;
  }

  // Zombie processes
  let zombies = 0;
  try {
    const pids = readdirSync("/proc").filter((f) => /^\d+$/.test(f));
    for (const pid of pids) {
      try {
        const stat = readFileSync(`/proc/${pid}/stat`, "utf-8");
        // The line is "pid (comm) state ...". comm may itself contain spaces
        // and parentheses, so splitting on spaces misplaces the state field;
        // locate it relative to the last ")" instead.
        const commEnd = stat.lastIndexOf(")");
        if (commEnd === -1) continue;
        const state = stat.slice(commEnd + 1).trim().charAt(0);
        if (state === "Z") zombies++;
      } catch { /* process disappeared */ }
    }
  } catch { /* /proc not readable */ }

  // Time drift: magnitude of the "System time" offset from `chronyc tracking`;
  // the slow/fast direction is not reported.
  let timeDriftMs = 0;
  const chrony = await run("chronyc", ["tracking"]);
  if (chrony) {
    const match = chrony.match(/System time\s*:\s*([\d.]+)\s*seconds\s*(slow|fast)/);
    if (match) {
      timeDriftMs = parseFloat(match[1]) * 1000;
    }
  }

  return {
    oom_kills_recent: oomKills,
    zombie_processes: zombies,
    time_drift_ms: Math.round(timeDriftMs * 100) / 100,
  };
}
|
|
@@ -0,0 +1,40 @@
|
|
|
1
|
+
import { readProcFile } from "../lib/parse.js";
|
|
2
|
+
import type { RaidInfo } from "../lib/types.js";
|
|
3
|
+
|
|
4
|
+
/**
 * Parses /proc/mdstat into one entry per md array.
 *
 * An array counts as degraded when the `[UU_]`-style map on the line after its
 * header contains an underscore. A member counts as failed when its entry
 * carries the `(F)` marker.
 *
 * @returns The arrays found; an empty array when /proc/mdstat is unavailable.
 */
export async function collectRaid(): Promise<RaidInfo[]> {
  const raw = readProcFile("/proc/mdstat");
  if (!raw) return [];

  const results: RaidInfo[] = [];
  const lines = raw.split("\n");

  for (let i = 0; i < lines.length; i++) {
    // "md0 : active raid1 sdb1[1] sda1[0]". The state may be followed by a
    // parenthesised flag such as "(auto-read-only)", which is skipped.
    const header = lines[i].match(/^(md\d+)\s*:\s*(\w+)(?:\s+\([^)]*\))?\s+(.*)/);
    if (!header) continue;

    const device = header[1];
    const status = header[2]; // "active" or "inactive"
    const tokens = header[3].trim().split(/\s+/).filter(Boolean);

    // Inactive arrays list their members without a level; a first token that
    // already looks like "name[N]" is a member, not a level.
    let level = "unknown";
    if (tokens.length > 0 && !tokens[0].includes("[")) {
      level = tokens.shift() as string; // "raid1", "raid5", etc.
    }

    // Members look like "sda1[0]", optionally followed by markers such as
    // "(F)" or "(S)". The member list is not ordered by slot, so the position
    // of "_" in the status map cannot be used to pick the failed member; rely
    // on the explicit (F) marker instead.
    const disks: string[] = [];
    const failedDisks: string[] = [];
    for (const token of tokens) {
      const member = token.match(/^([^\s\[]+)\[\d+\]((?:\([A-Z]\))*)$/);
      if (!member) continue;
      disks.push(member[1]);
      if (member[2].includes("(F)")) failedDisks.push(member[1]);
    }

    // Check next line for degraded status (e.g., "[UU_]" means one drive missing)
    const statusLine = lines[i + 1] || "";
    const bracketMatch = statusLine.match(/\[([U_]+)\]/);
    const degraded = bracketMatch ? bracketMatch[1].includes("_") : false;

    results.push({ device, level, status, degraded, disks, failed_disks: failedDisks });
  }

  return results;
}
|
|
@@ -0,0 +1,60 @@
|
|
|
1
|
+
import { run } from "../lib/exec.js";
|
|
2
|
+
import { readdirSync } from "fs";
|
|
3
|
+
import type { SmartInfo } from "../lib/types.js";
|
|
4
|
+
|
|
5
|
+
/**
 * Collects SMART health data for every sd*, nvme* and hd* device listed in
 * /sys/block by running `smartctl --json --all` on each.
 *
 * Devices are left out when smartctl produces no output, when the output is
 * not valid JSON, or when it carries no overall health verdict.
 *
 * @returns One entry per device with a verdict; an empty array when /sys/block
 *   cannot be read.
 */
export async function collectSmart(): Promise<SmartInfo[]> {
  // Find block devices
  const devices: string[] = [];
  try {
    const entries = readdirSync("/sys/block");
    for (const entry of entries) {
      if (entry.startsWith("sd") || entry.startsWith("nvme") || entry.startsWith("hd")) {
        devices.push(`/dev/${entry}`);
      }
    }
  } catch {
    return [];
  }

  const results: SmartInfo[] = [];
  for (const device of devices) {
    const output = await run("smartctl", ["--json", "--all", device]);
    if (!output) continue;

    try {
      const data = JSON.parse(output);

      // No verdict means SMART is unsupported or the device could not be
      // queried. Reporting that as "FAILED" would raise a critical alert for a
      // drive whose health is simply unknown, so such devices are skipped.
      const passed: unknown = data.smart_status?.passed;
      if (typeof passed !== "boolean") continue;

      const info: SmartInfo = {
        device,
        model: data.model_name || data.model_family || "unknown",
        health: passed ? "PASSED" : "FAILED",
        temperature_c: data.temperature?.current,
        power_on_hours: data.power_on_time?.hours,
      };

      // NVMe specific: the health log supplies wear and overrides the temperature.
      if (data.nvme_smart_health_information_log) {
        const nvme = data.nvme_smart_health_information_log;
        info.percentage_used = nvme.percentage_used;
        info.temperature_c = nvme.temperature;
      }

      // SATA specific: attributes 5 and 197 carry the sector counters.
      if (data.ata_smart_attributes?.table) {
        for (const attr of data.ata_smart_attributes.table) {
          if (attr.id === 5 || attr.name === "Reallocated_Sector_Ct") {
            info.reallocated_sectors = attr.raw?.value || 0;
          }
          if (attr.id === 197 || attr.name === "Current_Pending_Sector") {
            info.pending_sectors = attr.raw?.value || 0;
          }
        }
      }

      results.push(info);
    } catch {
      // Failed to parse, skip this device
    }
  }

  return results;
}
|
|
@@ -0,0 +1,21 @@
|
|
|
1
|
+
import { hostname } from "os";
|
|
2
|
+
import { readProcFile } from "../lib/parse.js";
|
|
3
|
+
import { run } from "../lib/exec.js";
|
|
4
|
+
import type { SystemInfo } from "../lib/types.js";
|
|
5
|
+
|
|
6
|
+
/**
 * Collects basic host identity: hostname, first IP address reported by
 * `hostname -I`, OS name, kernel release and uptime.
 *
 * Values that cannot be determined fall back to "Unknown" (OS), "unknown"
 * (kernel, IP) or 0 (uptime).
 */
export async function collectSystem(): Promise<SystemInfo> {
  const osRelease = readProcFile("/etc/os-release") || "";
  // os-release values may be double-quoted, single-quoted or unquoted; the
  // previous pattern only recognised the double-quoted form.
  const pretty = osRelease.match(/^PRETTY_NAME=(?:"([^"]*)"|'([^']*)'|(.*))$/m);
  const osName = (pretty?.[1] ?? pretty?.[2] ?? pretty?.[3] ?? "").trim() || "Unknown";

  const kernel = (await run("uname", ["-r"]))?.trim() || "unknown";

  // The first field of /proc/uptime is the uptime in seconds.
  const uptimeRaw = readProcFile("/proc/uptime") || "0";
  const uptimeValue = Math.floor(parseFloat(uptimeRaw.split(" ")[0]));
  const uptimeSeconds = Number.isFinite(uptimeValue) ? uptimeValue : 0;

  const ip = (await run("hostname", ["-I"]))?.trim().split(" ")[0] || "unknown";

  return {
    hostname: hostname(),
    ip,
    os: osName,
    kernel,
    uptime_seconds: uptimeSeconds,
  };
}
|