@glassmkr/crucible 0.7.1 → 0.8.1

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (103) hide show
  1. package/dist/alerts/__tests__/rules.test.d.ts +1 -0
  2. package/dist/alerts/__tests__/rules.test.js +437 -0
  3. package/dist/alerts/__tests__/rules.test.js.map +1 -0
  4. package/dist/alerts/rules.d.ts +8 -0
  5. package/dist/alerts/rules.js +175 -34
  6. package/dist/alerts/rules.js.map +1 -1
  7. package/dist/api.d.ts +2 -0
  8. package/dist/api.js +7 -0
  9. package/dist/api.js.map +1 -0
  10. package/dist/collect/__tests__/dmi.test.d.ts +1 -0
  11. package/dist/collect/__tests__/dmi.test.js +133 -0
  12. package/dist/collect/__tests__/dmi.test.js.map +1 -0
  13. package/dist/collect/__tests__/ipmi.test.js +47 -1
  14. package/dist/collect/__tests__/ipmi.test.js.map +1 -1
  15. package/dist/collect/__tests__/thermal.test.d.ts +1 -0
  16. package/dist/collect/__tests__/thermal.test.js +224 -0
  17. package/dist/collect/__tests__/thermal.test.js.map +1 -0
  18. package/dist/collect/dmi.d.ts +19 -0
  19. package/dist/collect/dmi.js +118 -0
  20. package/dist/collect/dmi.js.map +1 -0
  21. package/dist/collect/ipmi.d.ts +27 -2
  22. package/dist/collect/ipmi.js +90 -2
  23. package/dist/collect/ipmi.js.map +1 -1
  24. package/dist/collect/thermal.d.ts +10 -0
  25. package/dist/collect/thermal.js +232 -0
  26. package/dist/collect/thermal.js.map +1 -0
  27. package/dist/config.d.ts +10 -0
  28. package/dist/config.js +2 -0
  29. package/dist/config.js.map +1 -1
  30. package/dist/index.js +51 -1
  31. package/dist/index.js.map +1 -1
  32. package/dist/lib/__tests__/capability.test.d.ts +1 -0
  33. package/dist/lib/__tests__/capability.test.js +87 -0
  34. package/dist/lib/__tests__/capability.test.js.map +1 -0
  35. package/dist/lib/__tests__/vendor-sensors.test.d.ts +1 -0
  36. package/dist/lib/__tests__/vendor-sensors.test.js +49 -0
  37. package/dist/lib/__tests__/vendor-sensors.test.js.map +1 -0
  38. package/dist/lib/capability.d.ts +21 -0
  39. package/dist/lib/capability.js +110 -0
  40. package/dist/lib/capability.js.map +1 -0
  41. package/dist/lib/cpu-thermal-chips.d.ts +2 -0
  42. package/dist/lib/cpu-thermal-chips.js +28 -0
  43. package/dist/lib/cpu-thermal-chips.js.map +1 -0
  44. package/dist/lib/types.d.ts +58 -0
  45. package/dist/lib/vendor-sensors.d.ts +27 -0
  46. package/dist/lib/vendor-sensors.js +63 -0
  47. package/dist/lib/vendor-sensors.js.map +1 -0
  48. package/dist/notify/telegram.js +1 -1
  49. package/dist/notify/telegram.js.map +1 -1
  50. package/package.json +16 -1
  51. package/rule-ids.json +29 -0
  52. package/.dockerignore +0 -13
  53. package/.github/ISSUE_TEMPLATE/bug_report.md +0 -24
  54. package/.github/ISSUE_TEMPLATE/no_data.md +0 -26
  55. package/.github/workflows/docker.yml +0 -53
  56. package/.github/workflows/publish.yml +0 -25
  57. package/Dockerfile +0 -59
  58. package/config/collector.example.yaml +0 -43
  59. package/docker-compose.yml +0 -26
  60. package/scripts/sign-release.sh +0 -29
  61. package/src/__tests__/cli.test.ts +0 -74
  62. package/src/__tests__/reboot-marker.test.ts +0 -122
  63. package/src/alerts/evaluator.ts +0 -15
  64. package/src/alerts/rules.ts +0 -283
  65. package/src/alerts/state.ts +0 -92
  66. package/src/cli.ts +0 -112
  67. package/src/collect/__tests__/ipmi.test.ts +0 -96
  68. package/src/collect/__tests__/smart.test.ts +0 -68
  69. package/src/collect/__tests__/system.test.ts +0 -29
  70. package/src/collect/__tests__/zfs.test.ts +0 -72
  71. package/src/collect/conntrack.ts +0 -27
  72. package/src/collect/cpu.ts +0 -92
  73. package/src/collect/disks.ts +0 -91
  74. package/src/collect/fd.ts +0 -31
  75. package/src/collect/io-errors.ts +0 -23
  76. package/src/collect/io-latency.ts +0 -103
  77. package/src/collect/ipmi.ts +0 -207
  78. package/src/collect/memory.ts +0 -30
  79. package/src/collect/network.ts +0 -193
  80. package/src/collect/ntp.ts +0 -114
  81. package/src/collect/os-alerts.ts +0 -43
  82. package/src/collect/raid.ts +0 -40
  83. package/src/collect/security.ts +0 -268
  84. package/src/collect/smart.ts +0 -72
  85. package/src/collect/system.ts +0 -32
  86. package/src/collect/systemd.ts +0 -33
  87. package/src/collect/zfs.ts +0 -66
  88. package/src/config.ts +0 -65
  89. package/src/index.ts +0 -221
  90. package/src/lib/__tests__/parse.test.ts +0 -28
  91. package/src/lib/exec.ts +0 -16
  92. package/src/lib/parse.ts +0 -29
  93. package/src/lib/reboot-marker.ts +0 -88
  94. package/src/lib/types.ts +0 -226
  95. package/src/lib/version-check.ts +0 -39
  96. package/src/lib/version.ts +0 -33
  97. package/src/metrics-server.ts +0 -123
  98. package/src/notify/email.ts +0 -69
  99. package/src/notify/slack.ts +0 -47
  100. package/src/notify/telegram.ts +0 -65
  101. package/src/push/forge.ts +0 -109
  102. package/tsconfig.json +0 -15
  103. package/vitest.config.ts +0 -12
@@ -1,43 +0,0 @@
1
- # Glassmkr Crucible Configuration
2
- # Copy to /etc/glassmkr/crucible.yaml
3
-
4
- # Server identity
5
- server_name: "my-server"
6
-
7
- # Collection settings
8
- collection:
9
- interval_seconds: 300 # How often to collect (default 5 minutes)
10
- ipmi: true # Collect IPMI data (requires ipmitool)
11
- smart: true # Collect SMART data (requires smartmontools)
12
-
13
- # Forge integration (optional dashboard)
14
- forge:
15
- enabled: false
16
- url: "https://forge.glassmkr.com"
17
- api_key: "" # Get this from forge.glassmkr.com after registering a server
18
-
19
- # Alert thresholds (all optional, sensible defaults used if omitted)
20
- thresholds:
21
- ram_percent: 90 # Alert when RAM usage exceeds this
22
- swap_alert: true # Alert on any swap usage
23
- disk_percent: 85 # Alert when any disk exceeds this
24
- iowait_percent: 20 # Alert when CPU iowait exceeds this
25
- nvme_wear_percent: 85 # Alert when NVMe lifetime wear exceeds this
26
- disk_latency_nvme_ms: 50 # p99 latency threshold for NVMe
27
- disk_latency_hdd_ms: 200 # p99 latency threshold for HDD
28
- cpu_temp_warning_c: 80 # CPU temperature warning
29
- cpu_temp_critical_c: 90 # CPU temperature critical
30
- interface_utilization_percent: 90 # Network saturation threshold
31
-
32
- # Notification channels (all optional)
33
- channels:
34
- telegram:
35
- enabled: false
36
- bot_token: ""
37
- chat_id: ""
38
- email:
39
- enabled: false
40
- to: ""
41
- slack:
42
- enabled: false
43
- webhook_url: ""
@@ -1,26 +0,0 @@
1
- # Glassmkr Crucible - docker compose deployment
2
- #
3
- # Before starting, create /etc/glassmkr/collector.yaml on the host with your
4
- # Forge collector key. See https://forge.glassmkr.com/docs/getting-started.
5
- #
6
- # Why privileged + host network:
7
- # - privileged: true gives access to /dev/ipmi0 (IPMI sensors) and raw disk devices (SMART)
8
- # - network_mode: host lets the agent read the real host network interfaces and bond state
9
- # - /proc and /sys are mounted so the agent monitors the host, not the container
10
-
11
- services:
12
- crucible:
13
- image: ghcr.io/glassmkr/crucible:latest
14
- container_name: glassmkr-crucible
15
- restart: unless-stopped
16
- privileged: true
17
- network_mode: host
18
- volumes:
19
- - /etc/glassmkr:/etc/glassmkr:ro
20
- - /proc:/host/proc:ro
21
- - /sys:/host/sys:ro
22
- - /dev:/dev:ro
23
- - /run/dbus:/run/dbus:ro
24
- environment:
25
- - HOST_PROC=/host/proc
26
- - HOST_SYS=/host/sys
@@ -1,29 +0,0 @@
1
- #!/bin/bash
2
- # Sign a Crucible release
3
- # Usage: ./scripts/sign-release.sh <version>
4
-
5
- VERSION=$1
6
- DIST_DIR="dist"
7
-
8
- if [ -z "$VERSION" ]; then
9
- echo "Usage: ./scripts/sign-release.sh <version>"
10
- echo "Example: ./scripts/sign-release.sh v0.2.0"
11
- exit 1
12
- fi
13
-
14
- echo "Signing Crucible $VERSION"
15
-
16
- # Generate checksums
17
- cd "$DIST_DIR" || exit 1
18
- sha256sum *.tar.gz *.deb 2>/dev/null > SHA256SUMS || sha256sum *.js > SHA256SUMS
19
-
20
- # Sign the checksums file
21
- gpg --armor --detach-sign --local-user security@glassmkr.com SHA256SUMS
22
-
23
- echo ""
24
- echo "Release artifacts:"
25
- ls -la SHA256SUMS SHA256SUMS.asc
26
- echo ""
27
- echo "Verify with:"
28
- echo " gpg --verify SHA256SUMS.asc SHA256SUMS"
29
- echo " sha256sum -c SHA256SUMS"
@@ -1,74 +0,0 @@
1
- import { describe, it, expect } from "vitest";
2
- import { parseCliArgs, helpText, DEFAULT_CONFIG_PATH } from "../cli.js";
3
-
4
- describe("parseCliArgs", () => {
5
- it("--version returns version string and mode=version", () => {
6
- const { result, output } = parseCliArgs(["--version"], "1.2.3");
7
- expect(result.mode).toBe("version");
8
- expect(output).toBe("glassmkr-crucible v1.2.3");
9
- });
10
-
11
- it("-v aliases --version", () => {
12
- const { result, output } = parseCliArgs(["-v"], "1.2.3");
13
- expect(result.mode).toBe("version");
14
- expect(output).toBe("glassmkr-crucible v1.2.3");
15
- });
16
-
17
- it("--help returns help text and mode=help", () => {
18
- const { result, output } = parseCliArgs(["--help"], "1.2.3");
19
- expect(result.mode).toBe("help");
20
- expect(output).toContain("glassmkr-crucible v1.2.3");
21
- expect(output).toContain("Usage:");
22
- expect(output).toContain("--version");
23
- expect(output).toContain("--help");
24
- expect(output).toContain("--config");
25
- });
26
-
27
- it("-h aliases --help", () => {
28
- const { result } = parseCliArgs(["-h"], "1.2.3");
29
- expect(result.mode).toBe("help");
30
- });
31
-
32
- it("no args returns mode=run with the default config path", () => {
33
- const { result, output } = parseCliArgs([], "1.2.3");
34
- expect(result.mode).toBe("run");
35
- expect(result.configPath).toBe(DEFAULT_CONFIG_PATH);
36
- expect(output).toBeNull();
37
- });
38
-
39
- it("-c accepts a path in the next argument", () => {
40
- const { result } = parseCliArgs(["-c", "/tmp/a.yaml"], "1.2.3");
41
- expect(result.configPath).toBe("/tmp/a.yaml");
42
- });
43
-
44
- it("--config accepts a path in the next argument", () => {
45
- const { result } = parseCliArgs(["--config", "/tmp/b.yaml"], "1.2.3");
46
- expect(result.configPath).toBe("/tmp/b.yaml");
47
- });
48
-
49
- it("--config=PATH form works", () => {
50
- const { result } = parseCliArgs(["--config=/tmp/c.yaml"], "1.2.3");
51
- expect(result.configPath).toBe("/tmp/c.yaml");
52
- });
53
-
54
- it("legacy positional argument still sets config path", () => {
55
- const { result } = parseCliArgs(["/tmp/legacy.yaml"], "1.2.3");
56
- expect(result.configPath).toBe("/tmp/legacy.yaml");
57
- });
58
-
59
- it("--version wins over a provided config path (no collector start)", () => {
60
- const { result } = parseCliArgs(["--config", "/tmp/x.yaml", "--version"], "1.2.3");
61
- expect(result.mode).toBe("version");
62
- });
63
- });
64
-
65
- describe("helpText", () => {
66
- it("mentions the binary name, default config path, and both flags", () => {
67
- const txt = helpText("0.6.1");
68
- expect(txt).toContain("glassmkr-crucible v0.6.1");
69
- expect(txt).toContain(DEFAULT_CONFIG_PATH);
70
- expect(txt).toContain("-v, --version");
71
- expect(txt).toContain("-h, --help");
72
- expect(txt).toContain("-c, --config");
73
- });
74
- });
@@ -1,122 +0,0 @@
1
- import { describe, it, expect, beforeEach, afterEach } from "vitest";
2
- import { mkdtempSync, existsSync, writeFileSync, statSync, rmSync, chmodSync } from "node:fs";
3
- import { tmpdir } from "node:os";
4
- import { join } from "node:path";
5
- import {
6
- consumeRebootMarker,
7
- writeRebootMarker,
8
- parseDuration,
9
- } from "../lib/reboot-marker.js";
10
- import { parseCliArgs } from "../cli.js";
11
-
12
- let tmpDir: string;
13
- let path: string;
14
-
15
- beforeEach(() => {
16
- tmpDir = mkdtempSync(join(tmpdir(), "crucible-test-"));
17
- path = join(tmpDir, "reboot-expected");
18
- });
19
- afterEach(() => {
20
- try { rmSync(tmpDir, { recursive: true, force: true }); } catch {}
21
- });
22
-
23
- describe("consumeRebootMarker", () => {
24
- it("7. marker present, not expired: returns flag, deletes file", () => {
25
- const future = new Date(Date.now() + 5 * 60_000).toISOString();
26
- writeFileSync(path, JSON.stringify({ expires_at: future, reason: "kernel update" }));
27
- const out = consumeRebootMarker(path);
28
- expect(out).toEqual({ expected: true, reason: "kernel update" });
29
- expect(existsSync(path)).toBe(false);
30
- });
31
-
32
- it("8. marker present, expired: returns null, deletes file", () => {
33
- const past = new Date(Date.now() - 60_000).toISOString();
34
- writeFileSync(path, JSON.stringify({ expires_at: past, reason: "stale" }));
35
- expect(consumeRebootMarker(path)).toBeNull();
36
- expect(existsSync(path)).toBe(false);
37
- });
38
-
39
- it("9. marker absent: returns null, no throw", () => {
40
- expect(consumeRebootMarker(path)).toBeNull();
41
- });
42
-
43
- it("15. malformed JSON: returns null, file deleted, no crash", () => {
44
- writeFileSync(path, "{not json at all");
45
- expect(consumeRebootMarker(path)).toBeNull();
46
- expect(existsSync(path)).toBe(false);
47
- });
48
-
49
- it("invalid expires_at (missing): returns null, file deleted", () => {
50
- writeFileSync(path, JSON.stringify({ reason: "oops" }));
51
- expect(consumeRebootMarker(path)).toBeNull();
52
- expect(existsSync(path)).toBe(false);
53
- });
54
-
55
- it("consumed marker cannot be re-read (single-use)", () => {
56
- const future = new Date(Date.now() + 60_000).toISOString();
57
- writeFileSync(path, JSON.stringify({ expires_at: future }));
58
- expect(consumeRebootMarker(path)).not.toBeNull();
59
- expect(consumeRebootMarker(path)).toBeNull();
60
- });
61
- });
62
-
63
- describe("writeRebootMarker", () => {
64
- it("13. writes file at given path with correct TTL and reason, 0600 mode", () => {
65
- const now = new Date("2026-04-21T22:00:00Z");
66
- const res = writeRebootMarker({ path, reason: "kernel update", ttlMs: 10 * 60_000, now });
67
- expect(res.path).toBe(path);
68
- expect(res.expires_at).toBe("2026-04-21T22:10:00.000Z");
69
- expect(existsSync(path)).toBe(true);
70
- const mode = statSync(path).mode & 0o777;
71
- expect(mode).toBe(0o600);
72
- const round = consumeRebootMarker(path, new Date("2026-04-21T22:05:00Z"));
73
- expect(round).toEqual({ expected: true, reason: "kernel update" });
74
- });
75
-
76
- it("default TTL is 10 minutes", () => {
77
- const now = new Date("2026-04-21T22:00:00Z");
78
- const res = writeRebootMarker({ path, now });
79
- expect(res.expires_at).toBe("2026-04-21T22:10:00.000Z");
80
- });
81
- });
82
-
83
- describe("parseDuration", () => {
84
- it.each([
85
- ["10m", 600_000],
86
- ["2h", 7_200_000],
87
- ["600s", 600_000],
88
- ["500ms", 500],
89
- ["30", 30_000], // bare number -> seconds
90
- ])("%s -> %d ms", (input, ms) => {
91
- expect(parseDuration(input)).toBe(ms);
92
- });
93
- it("rejects garbage", () => {
94
- expect(parseDuration("forever")).toBeNull();
95
- expect(parseDuration("-5m")).toBeNull();
96
- expect(parseDuration("")).toBeNull();
97
- });
98
- });
99
-
100
- describe("CLI parseCliArgs subcommands", () => {
101
- it("14. `reboot` subcommand captured with flags", () => {
102
- const { result } = parseCliArgs(["reboot", "--reason", "kernel update"], "1.0.0");
103
- expect(result.mode).toBe("reboot");
104
- expect(result.reason).toBe("kernel update");
105
- });
106
- it("`mark-reboot` with --ttl parsed through", () => {
107
- const { result } = parseCliArgs(["mark-reboot", "--ttl=5m", "--reason=test"], "1.0.0");
108
- expect(result.mode).toBe("mark-reboot");
109
- expect(result.ttl).toBe("5m");
110
- expect(result.reason).toBe("test");
111
- });
112
- it("`mark-reboot --help` returns help output without running", () => {
113
- const { result, output } = parseCliArgs(["mark-reboot", "--help"], "1.0.0");
114
- expect(result.mode).toBe("help");
115
- expect(output).toContain("mark-reboot");
116
- });
117
- it("top-level help lists the new subcommands", () => {
118
- const { output } = parseCliArgs(["--help"], "1.0.0");
119
- expect(output).toMatch(/mark-reboot/);
120
- expect(output).toMatch(/reboot/);
121
- });
122
- });
@@ -1,15 +0,0 @@
1
- import { allRules } from "./rules.js";
2
- import type { Snapshot, AlertResult } from "../lib/types.js";
3
- import type { Config } from "../config.js";
4
-
5
- export function evaluateAlerts(snapshot: Snapshot, thresholds: Config["thresholds"]): AlertResult[] {
6
- const results: AlertResult[] = [];
7
- for (const rule of allRules) {
8
- try {
9
- results.push(...rule.evaluate(snapshot, thresholds));
10
- } catch (err) {
11
- console.error(`[alerts] Rule ${rule.type} error:`, err);
12
- }
13
- }
14
- return results;
15
- }
@@ -1,283 +0,0 @@
1
- // Alert rules for the collector are identical to the Forge evaluator.
2
- // Re-export from a shared definition to avoid duplication.
3
- // For the collector, we use the same 15 rules but with local thresholds from config.
4
-
5
- import type { Snapshot, AlertResult } from "../lib/types.js";
6
- import type { Config } from "../config.js";
7
-
8
- export interface AlertRule {
9
- type: string;
10
- evaluate(snap: Snapshot, thresholds: Config["thresholds"]): AlertResult[];
11
- }
12
-
13
- export const allRules: AlertRule[] = [
14
- // 1. RAM high
15
- { type: "ram_high", evaluate(snap, t) {
16
- if (!snap.memory?.total_mb) return [];
17
- const pct = (snap.memory.used_mb / snap.memory.total_mb) * 100;
18
- if (pct < (t.ram_percent ?? 90)) return [];
19
- return [{ type: "ram_high", severity: pct >= 95 ? "critical" : "warning",
20
- title: `RAM usage at ${pct.toFixed(1)}%`,
21
- message: `Using ${snap.memory.used_mb}MB of ${snap.memory.total_mb}MB. ${snap.memory.available_mb}MB available.`,
22
- evidence: { used_mb: snap.memory.used_mb, total_mb: snap.memory.total_mb, percent: Math.round(pct * 10) / 10 },
23
- recommendation: "Check: ps aux --sort=-rss | head -20" }];
24
- }},
25
- // 2. Swap active
26
- { type: "swap_active", evaluate(snap, t) {
27
- if (t.swap_alert === false || !snap.memory || snap.memory.swap_used_mb <= 0) return [];
28
- return [{ type: "swap_active", severity: "warning", title: `Swap in use: ${snap.memory.swap_used_mb}MB`,
29
- message: "Server is using swap space, indicating memory pressure.",
30
- evidence: { swap_used_mb: snap.memory.swap_used_mb },
31
- recommendation: "Check: free -h && ps aux --sort=-rss | head -20" }];
32
- }},
33
- // 3. Disk space high
34
- { type: "disk_space_high", evaluate(snap, t) {
35
- if (!snap.disks) return [];
36
- const threshold = t.disk_percent ?? 85;
37
- return snap.disks.filter(d => d.percent_used >= threshold).map(d => ({
38
- type: "disk_space_high", severity: d.percent_used >= 95 ? "critical" as const : "warning" as const,
39
- title: `Disk ${d.mount} at ${d.percent_used}%`,
40
- message: `${d.device}: ${d.used_gb}GB of ${d.total_gb}GB used. ${d.available_gb}GB available.`,
41
- evidence: { device: d.device, mount: d.mount, percent_used: d.percent_used },
42
- recommendation: "Check: du -sh /* | sort -rh | head -20" }));
43
- }},
44
- // 4. CPU iowait
45
- { type: "cpu_iowait_high", evaluate(snap, t) {
46
- if (!snap.cpu || snap.cpu.iowait_percent < (t.iowait_percent ?? 20)) return [];
47
- return [{ type: "cpu_iowait_high", severity: "warning", title: `CPU iowait at ${snap.cpu.iowait_percent.toFixed(1)}%`,
48
- message: `High I/O wait: CPU spending ${snap.cpu.iowait_percent.toFixed(1)}% waiting for disk.`,
49
- evidence: { iowait_percent: snap.cpu.iowait_percent },
50
- recommendation: "Check: iotop -oP or iostat -x 1 5" }];
51
- }},
52
- // 5. OOM kills
53
- { type: "oom_kills", evaluate(snap) {
54
- if (!snap.os_alerts || snap.os_alerts.oom_kills_recent <= 0) return [];
55
- return [{ type: "oom_kills", severity: "critical", title: `${snap.os_alerts.oom_kills_recent} OOM kill(s)`,
56
- message: `Kernel OOM killer terminated ${snap.os_alerts.oom_kills_recent} process(es).`,
57
- evidence: { oom_kills_recent: snap.os_alerts.oom_kills_recent },
58
- recommendation: "Check: dmesg | grep -i 'out of memory'" }];
59
- }},
60
- // 6. SMART failing
61
- { type: "smart_failing", evaluate(snap) {
62
- if (!snap.smart) return [];
63
- return snap.smart.filter(d => d.health !== "PASSED" || (d.reallocated_sectors && d.reallocated_sectors > 0) || (d.pending_sectors && d.pending_sectors > 0))
64
- .map(d => ({ type: "smart_failing", severity: "critical" as const,
65
- title: `SMART failure: ${d.device}`, message: `${d.model}: drive showing signs of failure.`,
66
- evidence: { device: d.device, health: d.health, reallocated_sectors: d.reallocated_sectors, pending_sectors: d.pending_sectors },
67
- recommendation: `Back up data. Schedule replacement for ${d.device}.` }));
68
- }},
69
- // 7. NVMe wear
70
- { type: "nvme_wear_high", evaluate(snap, t) {
71
- if (!snap.smart) return [];
72
- const threshold = t.nvme_wear_percent ?? 85;
73
- return snap.smart.filter(d => d.percentage_used != null && d.percentage_used >= threshold)
74
- .map(d => ({ type: "nvme_wear_high", severity: d.percentage_used! >= 95 ? "critical" as const : "warning" as const,
75
- title: `NVMe ${d.device} wear at ${d.percentage_used}%`, message: `${d.model} at ${d.percentage_used}% lifetime wear.`,
76
- evidence: { device: d.device, percentage_used: d.percentage_used },
77
- recommendation: "Plan drive replacement." }));
78
- }},
79
- // 8. RAID degraded
80
- { type: "raid_degraded", evaluate(snap) {
81
- if (!snap.raid) return [];
82
- return snap.raid.filter(r => r.degraded || r.failed_disks.length > 0)
83
- .map(r => ({ type: "raid_degraded", severity: "critical" as const,
84
- title: `RAID ${r.device} degraded`, message: `${r.device} (${r.level}) degraded. Failed: ${r.failed_disks.join(", ") || "unknown"}.`,
85
- evidence: { device: r.device, failed_disks: r.failed_disks },
86
- recommendation: "Replace failed drive immediately." }));
87
- }},
88
- // 9. Disk latency
89
- { type: "disk_latency_high", evaluate(snap, t) {
90
- if (!snap.disks) return [];
91
- return snap.disks.filter(d => {
92
- if (d.latency_p99_ms == null) return false;
93
- const thresh = d.device.includes("nvme") ? (t.disk_latency_nvme_ms ?? 50) : (t.disk_latency_hdd_ms ?? 200);
94
- return d.latency_p99_ms >= thresh;
95
- }).map(d => ({ type: "disk_latency_high", severity: "warning" as const,
96
- title: `Disk ${d.device} latency ${d.latency_p99_ms!.toFixed(1)}ms`,
97
- message: `p99 I/O latency on ${d.device} is high.`,
98
- evidence: { device: d.device, latency_p99_ms: d.latency_p99_ms },
99
- recommendation: "Check: iotop -oP" }));
100
- }},
101
- // 10. Interface errors
102
- { type: "interface_errors", evaluate(snap) {
103
- if (!snap.network) return [];
104
- return snap.network.filter(i => (i.rx_errors + i.tx_errors + i.rx_drops + i.tx_drops) > 0)
105
- .map(i => ({ type: "interface_errors", severity: "warning" as const,
106
- title: `${i.interface}: errors/drops detected`,
107
- message: `RX errors=${i.rx_errors}, TX errors=${i.tx_errors}, RX drops=${i.rx_drops}, TX drops=${i.tx_drops}.`,
108
- evidence: { interface: i.interface, rx_errors: i.rx_errors, tx_errors: i.tx_errors, rx_drops: i.rx_drops, tx_drops: i.tx_drops },
109
- recommendation: "Check cables and SFP/transceiver." }));
110
- }},
111
- // 11. Link speed mismatch
112
- { type: "link_speed_mismatch", evaluate(snap) {
113
- if (!snap.network) return [];
114
- return snap.network.filter(i => i.speed_mbps > 0 && i.speed_mbps < 1000)
115
- .map(i => ({ type: "link_speed_mismatch", severity: "warning" as const,
116
- title: `${i.interface} at ${i.speed_mbps} Mbps`,
117
- message: `Interface negotiated below 1 Gbps.`,
118
- evidence: { interface: i.interface, speed_mbps: i.speed_mbps },
119
- recommendation: "Check cable, SFP, switch port config." }));
120
- }},
121
- // 12. Interface saturation
122
- { type: "interface_saturation", evaluate(snap, t) {
123
- if (!snap.network) return [];
124
- const threshold = (t.interface_utilization_percent ?? 90) / 100;
125
- return snap.network.filter(i => {
126
- if (!i.speed_mbps) return false;
127
- const maxBps = (i.speed_mbps * 1_000_000) / 8;
128
- return Math.max(i.rx_bytes_sec, i.tx_bytes_sec) / maxBps >= threshold;
129
- }).map(i => {
130
- const maxBps = (i.speed_mbps * 1_000_000) / 8;
131
- const util = Math.max(i.rx_bytes_sec, i.tx_bytes_sec) / maxBps * 100;
132
- return { type: "interface_saturation", severity: "warning" as const,
133
- title: `${i.interface} at ${util.toFixed(0)}% utilization`,
134
- message: `Interface ${i.interface} (${i.speed_mbps} Mbps) near saturation.`,
135
- evidence: { interface: i.interface, utilization_percent: Math.round(util * 10) / 10 },
136
- recommendation: "Check: iftop or nload" };
137
- });
138
- }},
139
- // 13. CPU temperature
140
- { type: "cpu_temperature_high", evaluate(snap, t) {
141
- if (!snap.ipmi?.available || !snap.ipmi.sensors) return [];
142
- const warn = t.cpu_temp_warning_c ?? 80;
143
- return snap.ipmi.sensors.filter(s => {
144
- const n = s.name.toLowerCase();
145
- if (!n.includes("cpu") && !n.includes("temp")) return false;
146
- const v = typeof s.value === "number" ? s.value : parseFloat(String(s.value));
147
- return !isNaN(v) && v >= warn;
148
- }).map(s => {
149
- const v = typeof s.value === "number" ? s.value : parseFloat(String(s.value));
150
- const crit = s.upper_critical ?? (t.cpu_temp_critical_c ?? 90);
151
- return { type: "cpu_temperature_high", severity: v >= crit ? "critical" as const : "warning" as const,
152
- title: `${s.name}: ${v}${s.unit}`, message: `Temperature above warning threshold.`,
153
- evidence: { sensor: s.name, value: v },
154
- recommendation: "Check cooling, fans, airflow." };
155
- });
156
- }},
157
- // 14. ECC errors
158
- { type: "ecc_errors", evaluate(snap) {
159
- if (!snap.ipmi?.ecc_errors) return [];
160
- const { correctable, uncorrectable } = snap.ipmi.ecc_errors;
161
- if (correctable <= 0 && uncorrectable <= 0) return [];
162
- if (uncorrectable > 0) return [{ type: "ecc_errors", severity: "critical",
163
- title: `${uncorrectable} uncorrectable ECC error(s)`, message: "Data corruption possible. DIMM failing.",
164
- evidence: { correctable, uncorrectable },
165
- recommendation: "Replace DIMM immediately. Run: ipmitool sdr type Memory" }];
166
- return [{ type: "ecc_errors", severity: "warning",
167
- title: `${correctable} correctable ECC error(s)`, message: "Early warning of DIMM failure.",
168
- evidence: { correctable, uncorrectable },
169
- recommendation: "Schedule DIMM replacement. Run: ipmitool sdr type Memory" }];
170
- }},
171
- // 15. PSU redundancy
172
- { type: "psu_redundancy_loss", evaluate(snap) {
173
- if (!snap.ipmi?.available || !snap.ipmi.sensors) return [];
174
- const psus = snap.ipmi.sensors.filter(s => { const n = s.name.toLowerCase(); return n.includes("psu") || n.includes("power supply"); });
175
- if (psus.length < 2) return [];
176
- const failed = psus.filter(s => { const st = String(s.status).toLowerCase(); const v = String(s.value).toLowerCase();
177
- return st.includes("fail") || st.includes("absent") || v.includes("fail") || v.includes("absent"); });
178
- if (failed.length === 0) return [];
179
- return [{ type: "psu_redundancy_loss", severity: "critical",
180
- title: "PSU redundancy lost", message: `${failed.length} PSU(s) failed/absent: ${failed.map(p => p.name).join(", ")}.`,
181
- evidence: { failed: failed.map(p => ({ name: p.name, status: p.status })) },
182
- recommendation: "Replace failed PSU. Check power connections." }];
183
- }},
184
- // 19. IPMI SEL critical events
185
- { type: "ipmi_sel_critical", evaluate(snap) {
186
- if (!snap.ipmi?.available || !snap.ipmi.sel_events_recent?.length) return [];
187
- const critical = snap.ipmi.sel_events_recent.filter(e => e.severity === "critical" && e.direction === "Asserted");
188
- if (critical.length === 0) return [];
189
- const byType: Record<string, typeof critical> = {};
190
- for (const e of critical) { if (!byType[e.sensor_type]) byType[e.sensor_type] = []; byType[e.sensor_type].push(e); }
191
- const details = Object.entries(byType).map(([t, evts]) => `${t}: ${evts.map(e => `${e.sensor}: ${e.event}`).join(", ")}`).join("; ");
192
- const recs: string[] = [];
193
- if (byType.memory) recs.push("Memory errors: identify slot with `ipmitool sel elist | grep -i memory`. Schedule DIMM replacement.");
194
- if (byType.power) recs.push("PSU event: check physical PSU and connections. Verify redundancy: `ipmitool chassis status`.");
195
- if (byType.watchdog) recs.push("Watchdog reset: OS or BMC became unresponsive. Check dmesg for root cause.");
196
- if (byType.processor) recs.push("CPU event: check for thermal throttling or MCE. Run `dmesg | grep -i mce`.");
197
- if (recs.length === 0) recs.push("Review full SEL: `ipmitool sel elist`.");
198
- return [{ type: "ipmi_sel_critical", severity: "critical",
199
- title: `IPMI: ${critical.length} critical hardware event(s)`,
200
- message: `BMC System Event Log: ${critical.length} critical event(s). ${details}`,
201
- evidence: { critical_events: critical, sensor_types: Object.keys(byType) },
202
- recommendation: recs.join(" ") }];
203
- }},
204
- // 20. Fan failure
205
- { type: "ipmi_fan_failure", evaluate(snap) {
206
- if (!snap.ipmi?.available || !snap.ipmi.fans?.length) return [];
207
- const failed = snap.ipmi.fans.filter(f => f.status === "critical" || (f.rpm === 0 && f.status !== "absent"));
208
- if (failed.length === 0) return [];
209
- const total = snap.ipmi.fans.filter(f => f.status !== "absent").length;
210
- const names = failed.map(f => `${f.name} (${f.rpm} RPM)`).join(", ");
211
- return [{ type: "ipmi_fan_failure", severity: "critical",
212
- title: `Fan failure: ${failed.length} of ${total} fans`,
213
- message: `${failed.length} fan(s) stopped or critically slow: ${names}. Reduced cooling capacity.`,
214
- evidence: { failed_fans: failed, total_fans: total, all_fans: snap.ipmi.fans.filter(f => f.status !== "absent") },
215
- recommendation: "Check physical fans. Monitor temps: `ipmitool sdr type Temperature`. Replace failed fan module." }];
216
- }},
217
- // === Security (6) ===
218
- // 21. SSH root password login
219
- { type: "ssh_root_password", evaluate(snap) {
220
- if (!snap.security?.ssh?.rootPasswordExposed) return [];
221
- return [{ type: "ssh_root_password", severity: "warning",
222
- title: "SSH root login with password enabled",
223
- message: `PermitRootLogin is "${snap.security.ssh.permitRootLogin}" and PasswordAuthentication is "${snap.security.ssh.passwordAuthentication}". Root can be brute-forced over SSH.`,
224
- evidence: { permitRootLogin: snap.security.ssh.permitRootLogin, passwordAuthentication: snap.security.ssh.passwordAuthentication },
225
- recommendation: 'Set "PermitRootLogin prohibit-password" in /etc/ssh/sshd_config and restart sshd. Key-based root login still works.' }];
226
- }},
227
- // 22. No firewall
228
- { type: "no_firewall", evaluate(snap) {
229
- if (!snap.security || snap.security.firewall.active) return [];
230
- return [{ type: "no_firewall", severity: "warning" as const,
231
- title: "No firewall active",
232
- message: "No active firewall rules detected (checked UFW, firewalld, nftables, iptables). All ports are exposed unless protected by network-level ACLs.",
233
- evidence: { source: snap.security.firewall.source },
234
- recommendation: 'Enable a firewall: "sudo ufw enable" (Debian/Ubuntu) or "sudo systemctl start firewalld" (RHEL/Rocky).' }];
235
- }},
236
- // 23. Pending security updates
237
- { type: "pending_security_updates", evaluate(snap, t) {
238
- if (!snap.security?.pending_updates?.available) return [];
239
- const maxPending = 10;
240
- if (snap.security.pending_updates.pendingCount <= maxPending) return [];
241
- const d = snap.security.pending_updates;
242
- return [{ type: "pending_security_updates", severity: "warning",
243
- title: `${d.pendingCount} security updates pending`,
244
- message: `${d.pendingCount} security updates pending on this ${d.distro} server.`,
245
- evidence: { pendingCount: d.pendingCount, distro: d.distro },
246
- recommendation: d.distro === "ubuntu" || d.distro === "debian" ? 'Apply with: "sudo apt-get upgrade"' : 'Apply with: "sudo dnf update --security"' }];
247
- }},
248
- // 24. Kernel vulnerabilities
249
- { type: "kernel_vulnerabilities", evaluate(snap) {
250
- if (!snap.security?.kernel_vulns?.length) return [];
251
- const unmitigated = snap.security.kernel_vulns.filter(v => !v.mitigated);
252
- if (unmitigated.length === 0) return [];
253
- const details = unmitigated.map(v => `${v.name}: ${v.status}`).join("; ");
254
- return [{ type: "kernel_vulnerabilities", severity: "warning",
255
- title: `${unmitigated.length} CPU vulnerability mitigations missing`,
256
- message: `Unmitigated: ${details}. Update the kernel and CPU microcode to apply mitigations.`,
257
- evidence: { unmitigated, total: snap.security.kernel_vulns.length },
258
- recommendation: 'Check: "grep . /sys/devices/system/cpu/vulnerabilities/*". Update kernel and microcode packages.' }];
259
- }},
260
- // 25. Kernel needs reboot
261
- { type: "kernel_needs_reboot", evaluate(snap) {
262
- if (!snap.security?.kernel_reboot?.needsReboot) return [];
263
- const k = snap.security.kernel_reboot;
264
- return [{ type: "kernel_needs_reboot", severity: "warning" as const,
265
- title: "Reboot required for kernel update",
266
- message: `Running kernel: ${k.running}. Installed kernel: ${k.installed}. A reboot is needed to apply the newer kernel.`,
267
- evidence: { running: k.running, installed: k.installed },
268
- recommendation: "Schedule a reboot to apply the newer kernel. Security patches may not be active until then." }];
269
- }},
270
- // 26. Unattended upgrades disabled
271
- { type: "unattended_upgrades_disabled", evaluate(snap) {
272
- if (!snap.security || snap.security.auto_updates.configured) return [];
273
- const a = snap.security.auto_updates;
274
- const hint = a.mechanism === "unattended-upgrades" ? 'Enable: "sudo dpkg-reconfigure -plow unattended-upgrades"'
275
- : a.mechanism === "dnf-automatic" ? 'Enable: "sudo systemctl enable --now dnf-automatic-install.timer"'
276
- : 'Install: "sudo apt install unattended-upgrades" (Debian/Ubuntu) or "sudo dnf install dnf-automatic" (RHEL/Rocky)';
277
- return [{ type: "unattended_upgrades_disabled", severity: "warning" as const,
278
- title: "Automatic security updates not configured",
279
- message: `${a.details}. Without automatic updates, security patches must be applied manually.`,
280
- evidence: { mechanism: a.mechanism, details: a.details },
281
- recommendation: hint }];
282
- }},
283
- ];