@glassmkr/crucible 0.7.1 → 0.8.1

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (103) hide show
  1. package/dist/alerts/__tests__/rules.test.d.ts +1 -0
  2. package/dist/alerts/__tests__/rules.test.js +437 -0
  3. package/dist/alerts/__tests__/rules.test.js.map +1 -0
  4. package/dist/alerts/rules.d.ts +8 -0
  5. package/dist/alerts/rules.js +175 -34
  6. package/dist/alerts/rules.js.map +1 -1
  7. package/dist/api.d.ts +2 -0
  8. package/dist/api.js +7 -0
  9. package/dist/api.js.map +1 -0
  10. package/dist/collect/__tests__/dmi.test.d.ts +1 -0
  11. package/dist/collect/__tests__/dmi.test.js +133 -0
  12. package/dist/collect/__tests__/dmi.test.js.map +1 -0
  13. package/dist/collect/__tests__/ipmi.test.js +47 -1
  14. package/dist/collect/__tests__/ipmi.test.js.map +1 -1
  15. package/dist/collect/__tests__/thermal.test.d.ts +1 -0
  16. package/dist/collect/__tests__/thermal.test.js +224 -0
  17. package/dist/collect/__tests__/thermal.test.js.map +1 -0
  18. package/dist/collect/dmi.d.ts +19 -0
  19. package/dist/collect/dmi.js +118 -0
  20. package/dist/collect/dmi.js.map +1 -0
  21. package/dist/collect/ipmi.d.ts +27 -2
  22. package/dist/collect/ipmi.js +90 -2
  23. package/dist/collect/ipmi.js.map +1 -1
  24. package/dist/collect/thermal.d.ts +10 -0
  25. package/dist/collect/thermal.js +232 -0
  26. package/dist/collect/thermal.js.map +1 -0
  27. package/dist/config.d.ts +10 -0
  28. package/dist/config.js +2 -0
  29. package/dist/config.js.map +1 -1
  30. package/dist/index.js +51 -1
  31. package/dist/index.js.map +1 -1
  32. package/dist/lib/__tests__/capability.test.d.ts +1 -0
  33. package/dist/lib/__tests__/capability.test.js +87 -0
  34. package/dist/lib/__tests__/capability.test.js.map +1 -0
  35. package/dist/lib/__tests__/vendor-sensors.test.d.ts +1 -0
  36. package/dist/lib/__tests__/vendor-sensors.test.js +49 -0
  37. package/dist/lib/__tests__/vendor-sensors.test.js.map +1 -0
  38. package/dist/lib/capability.d.ts +21 -0
  39. package/dist/lib/capability.js +110 -0
  40. package/dist/lib/capability.js.map +1 -0
  41. package/dist/lib/cpu-thermal-chips.d.ts +2 -0
  42. package/dist/lib/cpu-thermal-chips.js +28 -0
  43. package/dist/lib/cpu-thermal-chips.js.map +1 -0
  44. package/dist/lib/types.d.ts +58 -0
  45. package/dist/lib/vendor-sensors.d.ts +27 -0
  46. package/dist/lib/vendor-sensors.js +63 -0
  47. package/dist/lib/vendor-sensors.js.map +1 -0
  48. package/dist/notify/telegram.js +1 -1
  49. package/dist/notify/telegram.js.map +1 -1
  50. package/package.json +16 -1
  51. package/rule-ids.json +29 -0
  52. package/.dockerignore +0 -13
  53. package/.github/ISSUE_TEMPLATE/bug_report.md +0 -24
  54. package/.github/ISSUE_TEMPLATE/no_data.md +0 -26
  55. package/.github/workflows/docker.yml +0 -53
  56. package/.github/workflows/publish.yml +0 -25
  57. package/Dockerfile +0 -59
  58. package/config/collector.example.yaml +0 -43
  59. package/docker-compose.yml +0 -26
  60. package/scripts/sign-release.sh +0 -29
  61. package/src/__tests__/cli.test.ts +0 -74
  62. package/src/__tests__/reboot-marker.test.ts +0 -122
  63. package/src/alerts/evaluator.ts +0 -15
  64. package/src/alerts/rules.ts +0 -283
  65. package/src/alerts/state.ts +0 -92
  66. package/src/cli.ts +0 -112
  67. package/src/collect/__tests__/ipmi.test.ts +0 -96
  68. package/src/collect/__tests__/smart.test.ts +0 -68
  69. package/src/collect/__tests__/system.test.ts +0 -29
  70. package/src/collect/__tests__/zfs.test.ts +0 -72
  71. package/src/collect/conntrack.ts +0 -27
  72. package/src/collect/cpu.ts +0 -92
  73. package/src/collect/disks.ts +0 -91
  74. package/src/collect/fd.ts +0 -31
  75. package/src/collect/io-errors.ts +0 -23
  76. package/src/collect/io-latency.ts +0 -103
  77. package/src/collect/ipmi.ts +0 -207
  78. package/src/collect/memory.ts +0 -30
  79. package/src/collect/network.ts +0 -193
  80. package/src/collect/ntp.ts +0 -114
  81. package/src/collect/os-alerts.ts +0 -43
  82. package/src/collect/raid.ts +0 -40
  83. package/src/collect/security.ts +0 -268
  84. package/src/collect/smart.ts +0 -72
  85. package/src/collect/system.ts +0 -32
  86. package/src/collect/systemd.ts +0 -33
  87. package/src/collect/zfs.ts +0 -66
  88. package/src/config.ts +0 -65
  89. package/src/index.ts +0 -221
  90. package/src/lib/__tests__/parse.test.ts +0 -28
  91. package/src/lib/exec.ts +0 -16
  92. package/src/lib/parse.ts +0 -29
  93. package/src/lib/reboot-marker.ts +0 -88
  94. package/src/lib/types.ts +0 -226
  95. package/src/lib/version-check.ts +0 -39
  96. package/src/lib/version.ts +0 -33
  97. package/src/metrics-server.ts +0 -123
  98. package/src/notify/email.ts +0 -69
  99. package/src/notify/slack.ts +0 -47
  100. package/src/notify/telegram.ts +0 -65
  101. package/src/push/forge.ts +0 -109
  102. package/tsconfig.json +0 -15
  103. package/vitest.config.ts +0 -12
package/src/config.ts DELETED
@@ -1,65 +0,0 @@
1
- import { readFileSync } from "fs";
2
- import { parse } from "yaml";
3
- import { z } from "zod";
4
-
5
// Each config section is declared as its own schema and then assembled.
// Every section defaults to `{}` so that a missing section still picks up
// the per-field defaults declared inside it.

/** What to collect and how often (interval bounded to 60..3600 seconds). */
const CollectionSchema = z
  .object({
    interval_seconds: z.number().min(60).max(3600).default(300),
    ipmi: z.boolean().default(true),
    smart: z.boolean().default(true),
  })
  .default({});

/** Forge push settings. */
const ForgeSchema = z
  .object({
    enabled: z.boolean().default(false),
    url: z.string().default("https://forge.glassmkr.com"),
    api_key: z.string().default(""),
    tls_pin: z.string().default(""),
  })
  .default({});

/** Alert thresholds passed to the alert evaluator. */
const ThresholdsSchema = z
  .object({
    ram_percent: z.number().default(90),
    swap_alert: z.boolean().default(true),
    disk_percent: z.number().default(85),
    iowait_percent: z.number().default(20),
    nvme_wear_percent: z.number().default(85),
    disk_latency_nvme_ms: z.number().default(50),
    disk_latency_hdd_ms: z.number().default(200),
    cpu_temp_warning_c: z.number().default(80),
    cpu_temp_critical_c: z.number().default(90),
    interface_utilization_percent: z.number().default(90),
  })
  .default({});

/** Notification channels; each one is disabled unless explicitly enabled. */
const ChannelsSchema = z
  .object({
    telegram: z
      .object({
        enabled: z.boolean().default(false),
        bot_token: z.string().default(""),
        chat_id: z.string().default(""),
      })
      .default({}),
    email: z
      .object({
        enabled: z.boolean().default(false),
        to: z.string().default(""),
      })
      .default({}),
    slack: z
      .object({
        enabled: z.boolean().default(false),
        webhook_url: z.string().default(""),
      })
      .default({}),
  })
  .default({});

/** Prometheus metrics endpoint settings. */
const PrometheusSchema = z
  .object({
    enabled: z.boolean().default(false),
    port: z.number().default(9101),
  })
  .default({});

/** Root schema for the collector configuration file. */
const ConfigSchema = z.object({
  server_name: z.string().default("unnamed-server"),
  collection: CollectionSchema,
  forge: ForgeSchema,
  thresholds: ThresholdsSchema,
  channels: ChannelsSchema,
  prometheus: PrometheusSchema,
});
50
-
51
- export type Config = z.infer<typeof ConfigSchema>;
52
-
53
/**
 * Load and validate the collector configuration from a YAML file.
 *
 * @param path - Location of the YAML config file.
 * @returns The validated config with schema defaults applied. When the file
 *   does not exist, or is empty / comment-only, the result is the all-defaults
 *   config.
 * @throws Re-throws any read error other than ENOENT, YAML syntax errors, and
 *   schema validation errors.
 */
export function loadConfig(path: string): Config {
  let raw: string;
  try {
    raw = readFileSync(path, "utf-8");
  } catch (err: unknown) {
    const code = typeof err === "object" && err !== null ? (err as { code?: unknown }).code : undefined;
    if (code === "ENOENT") {
      console.log(`[config] No config file at ${path}, using defaults`);
      return ConfigSchema.parse({});
    }
    throw err;
  }

  // An empty or comment-only YAML document parses to null. Treat it like an
  // empty mapping so the schema defaults apply instead of failing validation
  // with "expected object, received null".
  const parsed: unknown = parse(raw);
  return ConfigSchema.parse(parsed ?? {});
}
package/src/index.ts DELETED
@@ -1,221 +0,0 @@
1
- #!/usr/bin/env node
2
-
3
- import { parseCliArgs } from "./cli.js";
4
- import { CRUCIBLE_VERSION as PKG_VERSION } from "./lib/version.js";
5
-
6
// Handle --version, --help, and planned-reboot subcommands before loading
// config or starting the Prometheus server. Keeps the CLI responsive even on
// hosts missing the config file or external tools.
//
// Note: the static `import` declarations further down this file are hoisted
// by ES module semantics, so the collector modules are still loaded before
// this code runs; only their use (config load, server start, collection) is
// deferred until after the early exits below.
const { result: cliArgs, output: cliOutput } = parseCliArgs(process.argv.slice(2), PKG_VERSION);

/** Render a caught value for a log line: the message of an Error, else its string form. */
function describeCliError(err: unknown): string {
  return err instanceof Error && err.message ? err.message : String(err);
}

if (cliArgs.mode === "version" || cliArgs.mode === "help") {
  console.log(cliOutput);
  process.exit(0);
}

if (cliArgs.mode === "mark-reboot" || cliArgs.mode === "reboot") {
  const { writeRebootMarker, parseDuration, DEFAULT_TTL_MS } = await import("./lib/reboot-marker.js");

  // parseDuration returns null for anything it cannot parse; exit code 2
  // distinguishes a usage error from a failed write (exit code 1).
  const ttlMs = cliArgs.ttl ? parseDuration(cliArgs.ttl) : DEFAULT_TTL_MS;
  if (ttlMs === null) {
    console.error(`[mark-reboot] invalid --ttl value: ${cliArgs.ttl}. Use e.g. 10m, 2h, 600s.`);
    process.exit(2);
  }

  try {
    const { path, expires_at } = writeRebootMarker({
      reason: cliArgs.reason, ttlMs,
    });
    console.log(`[${cliArgs.mode}] marker written: ${path} (expires ${expires_at}${cliArgs.reason ? `, reason: ${cliArgs.reason}` : ""})`);
  } catch (err: unknown) {
    console.error(`[${cliArgs.mode}] failed to write marker: ${describeCliError(err)}`);
    console.error(`  Most likely cause: need root privileges to write under /var/lib/crucible/.`);
    process.exit(1);
  }

  // `reboot` writes the marker first (above) and only then asks systemd to
  // reboot, so the marker is on disk before the host goes down.
  if (cliArgs.mode === "reboot") {
    const { execFileSync } = await import("node:child_process");
    console.log("[reboot] invoking systemctl reboot");
    try {
      execFileSync("systemctl", ["reboot"], { stdio: "inherit" });
    } catch (err: unknown) {
      console.error(`[reboot] systemctl reboot failed: ${describeCliError(err)}`);
      process.exit(1);
    }
  }
  process.exit(0);
}
44
-
45
- import { loadConfig } from "./config.js";
46
- import { checkForUpdates } from "./lib/version-check.js";
47
- import { startMetricsServer, updateMetrics } from "./metrics-server.js";
48
- import { collectSystem } from "./collect/system.js";
49
- import { collectCpu } from "./collect/cpu.js";
50
- import { collectMemory } from "./collect/memory.js";
51
- import { collectDisks } from "./collect/disks.js";
52
- import { collectSmart } from "./collect/smart.js";
53
- import { collectNetwork } from "./collect/network.js";
54
- import { collectRaid } from "./collect/raid.js";
55
- import { collectIpmi } from "./collect/ipmi.js";
56
- import { collectOsAlerts } from "./collect/os-alerts.js";
57
- import { evaluateAlerts } from "./alerts/evaluator.js";
58
- import { updateAlertState } from "./alerts/state.js";
59
- import { sendTelegram } from "./notify/telegram.js";
60
- import { sendSlack } from "./notify/slack.js";
61
- import { sendEmail } from "./notify/email.js";
62
- import { pushToForge, initForgeAgent } from "./push/forge.js";
63
- import { collectSecurity, type SecurityData } from "./collect/security.js";
64
- import { collectZfs } from "./collect/zfs.js";
65
- import { collectIoErrors } from "./collect/io-errors.js";
66
- import { collectIoLatency } from "./collect/io-latency.js";
67
- import { collectConntrack } from "./collect/conntrack.js";
68
- import { collectSystemd } from "./collect/systemd.js";
69
- import { collectNtp } from "./collect/ntp.js";
70
- import { collectFileDescriptors } from "./collect/fd.js";
71
- import type { Snapshot, IpmiInfo } from "./lib/types.js";
72
- import { consumeRebootMarker, type PlannedReboot } from "./lib/reboot-marker.js";
73
-
74
// Planned-reboot marker: consumed exactly once, at startup. If the operator
// ran `crucible-agent mark-reboot` / `reboot` before this boot, the marker
// exists; it is flagged on the first snapshot and the file is deleted so later
// snapshots do not keep claiming the reboot was planned.
const plannedRebootFlag: PlannedReboot | null = consumeRebootMarker();
if (plannedRebootFlag) {
  const reasonSuffix = plannedRebootFlag.reason ? `: ${plannedRebootFlag.reason}` : "";
  console.log(`[collector] Planned reboot acknowledged${reasonSuffix}`);
}
let plannedRebootConsumed = false;

const config = loadConfig(cliArgs.configPath);

// Startup banner describing the effective configuration.
{
  const onOff = (flag: boolean): string => (flag ? "enabled" : "disabled");
  const forgeTarget = config.forge.enabled ? config.forge.url : "disabled";
  const prometheusTarget = config.prometheus.enabled ? `:${config.prometheus.port}/metrics` : "disabled";

  console.log(`[collector] Starting. Server: ${config.server_name}. Interval: ${config.collection.interval_seconds}s`);
  console.log(`[collector] IPMI: ${onOff(config.collection.ipmi)}, SMART: ${onOff(config.collection.smart)}`);
  console.log(`[collector] Forge: ${forgeTarget}`);
  console.log(`[collector] Prometheus: ${prometheusTarget}`);
}

// Prometheus metrics server is opt-in.
if (config.prometheus.enabled) {
  startMetricsServer(config.prometheus.port);
}

// TLS pinning for Forge is only initialised when a pin is configured.
if (config.forge.tls_pin) {
  initForgeAgent(config.forge.tls_pin);
  console.log("[collector] TLS pinning enabled for Forge");
}

// Placeholder IPMI result used when IPMI collection is disabled in config.
const emptyIpmi: IpmiInfo = {
  available: false,
  sensors: [],
  ecc_errors: { correctable: 0, uncorrectable: 0 },
  sel_entries_count: 0,
  sel_events_recent: [],
  fans: [],
};

// Security checks run once per hour (every 12th cycle at 5-min intervals);
// the most recent result is cached and reused in between.
let securityCycleCount = 0;
let cachedSecurity: SecurityData | undefined;
107
-
108
/**
 * Run one collection cycle: gather data from every collector, build the
 * snapshot, update Prometheus metrics, evaluate alerts, notify the enabled
 * channels about new/resolved alerts, and push the snapshot to Forge.
 *
 * Rejects if one of the core collectors in the initial `Promise.all` rejects;
 * the optional collectors further down are individually guarded.
 */
async function collect(): Promise<void> {
  const startTime = Date.now();
  console.log(`[collector] Collecting...`);

  const [system, cpu, memory, disks, smart, network, raid, ipmi, osAlerts] = await Promise.all([
    collectSystem(),
    collectCpu(),
    collectMemory(),
    collectDisks(),
    config.collection.smart ? collectSmart() : Promise.resolve([]),
    collectNetwork(),
    collectRaid(),
    config.collection.ipmi ? collectIpmi() : Promise.resolve(emptyIpmi),
    collectOsAlerts(),
  ]);

  // Security checks: refresh about once per hour, reuse cached data between
  // runs. The cycle count is derived from the configured interval so the
  // cadence stays hourly for any interval_seconds (the 300s default yields 12
  // cycles, the value that used to be hard-coded).
  const securityRefreshCycles = Math.max(1, Math.round(3600 / config.collection.interval_seconds));
  securityCycleCount++;
  if (securityCycleCount >= securityRefreshCycles || !cachedSecurity) {
    securityCycleCount = 0;
    try { cachedSecurity = await collectSecurity(); } catch (err) { console.error("[security] Collection error:", err); }
  }

  const snapshot: Snapshot = {
    collector_version: PKG_VERSION,
    timestamp: new Date().toISOString(),
    system, cpu, memory, disks, smart, network, raid, ipmi, os_alerts: osAlerts,
    security: cachedSecurity,
  };

  // Single-shot: the very first snapshot after a marked reboot carries
  // the flag, subsequent snapshots do not.
  if (plannedRebootFlag && !plannedRebootConsumed) {
    snapshot.expected_reboot = true;
    if (plannedRebootFlag.reason) snapshot.expected_reboot_reason = plannedRebootFlag.reason;
    plannedRebootConsumed = true;
  }

  // Optional collectors: run every cycle; a failure leaves the field unset
  // rather than failing the whole cycle.
  try { snapshot.zfs = await collectZfs() ?? undefined; } catch { /* skip if ZFS not available */ }
  try { snapshot.io_errors = await collectIoErrors() ?? undefined; } catch { /* skip on error */ }
  try { snapshot.io_latency = collectIoLatency(); } catch { /* skip on error */ }
  try { snapshot.conntrack = collectConntrack(); } catch { /* skip on error */ }
  try { snapshot.systemd = await collectSystemd(); } catch { /* skip on error */ }
  try { snapshot.ntp = await collectNtp(); } catch { /* skip on error */ }
  try { snapshot.file_descriptors = collectFileDescriptors(); } catch { /* skip on error */ }

  // Update Prometheus metrics
  updateMetrics(snapshot);

  // Evaluate alerts
  const alertResults = evaluateAlerts(snapshot, config.thresholds);
  const { newAlerts, resolvedAlerts } = updateAlertState(alertResults);

  const elapsed = Date.now() - startTime;
  console.log(`[collector] Collected in ${elapsed}ms. Alerts: ${alertResults.length} active, ${newAlerts.length} new, ${resolvedAlerts.length} resolved`);

  // Send notifications for new/resolved alerts. Each channel is isolated so a
  // failure in one does not skip the remaining channels or the Forge push.
  const deliver = async (channel: string, send: () => unknown): Promise<void> => {
    try {
      await send();
    } catch (err) {
      console.error(`[notify] ${channel} delivery failed:`, err);
    }
  };
  if (newAlerts.length > 0 || resolvedAlerts.length > 0) {
    const { telegram, slack, email } = config.channels;
    if (telegram.enabled && telegram.bot_token && telegram.chat_id) {
      await deliver("telegram", () => sendTelegram(telegram.bot_token, telegram.chat_id, newAlerts, resolvedAlerts, config.server_name));
    }
    if (slack.enabled && slack.webhook_url) {
      await deliver("slack", () => sendSlack(slack.webhook_url, newAlerts, resolvedAlerts, config.server_name));
    }
    if (email.enabled && email.to) {
      await deliver("email", () => sendEmail(email, newAlerts, resolvedAlerts, config.server_name));
    }
  }

  // Push to Forge (non-blocking: intentionally not awaited)
  if (config.forge.enabled && config.forge.api_key) {
    pushToForge(config.forge.url, config.forge.api_key, snapshot);
  }

  // Check for updates (every 6 hours, non-blocking)
  checkForUpdates(config.forge.enabled ? config.forge.url : undefined);

  // Print summary on first run
  if (firstRun) {
    firstRun = false;
    console.log("");
    console.log("=== First collection complete ===");
    console.log(`Server: ${system.hostname} (${system.os})`);
    console.log(`CPU: ${cpu.user_percent.toFixed(1)}% (load: ${cpu.load_1m})`);
    const ramPct = memory.total_mb > 0 ? ((memory.used_mb / memory.total_mb) * 100).toFixed(1) : "0";
    console.log(`RAM: ${ramPct}% (${memory.used_mb} / ${memory.total_mb} MB)`);
    if (disks.length > 0) console.log(`Disk: ${disks[0].percent_used}% (${disks[0].mount})`);
    console.log(`SMART: ${smart.length > 0 ? `${smart.length} drive(s) checked` : "not available"}`);
    console.log(`Network: ${network.map((n) => n.interface).join(", ") || "none detected"}`);
    console.log(`IPMI: ${ipmi.available ? "available" : "not available"}`);
    console.log(`Active alerts: ${alertResults.length}`);
    console.log(`Forge: ${config.forge.enabled ? "enabled" : "disabled"}`);
    console.log("");
  }
}
204
-
205
let firstRun = true;

/**
 * Run one collection cycle and contain any failure. `collect()` rejects when
 * a core collector fails; left unhandled, that rejection would terminate the
 * process on current Node.js versions, so it is logged here and the next
 * scheduled cycle is allowed to try again.
 */
function runCollectCycle(): void {
  collect().catch((err: unknown) => {
    console.error("[collector] Collection cycle failed:", err);
  });
}

// Run immediately
runCollectCycle();

// Then on interval
setInterval(runCollectCycle, config.collection.interval_seconds * 1000);

process.on("SIGTERM", () => {
  console.log("[collector] Received SIGTERM, shutting down");
  process.exit(0);
});

process.on("SIGINT", () => {
  console.log("[collector] Received SIGINT, shutting down");
  process.exit(0);
});
package/src/lib/__tests__/parse.test.ts DELETED
@@ -1,28 +0,0 @@
1
- import { describe, it, expect } from "vitest";
2
- import { parseKeyValue, parseKb } from "../parse.js";
3
-
4
describe("parseKeyValue", () => {
  it("parses colon-delimited key/value lines", () => {
    const out = parseKeyValue("Name: foo\nVersion: 1.2.3\n");
    expect(out).toEqual({ Name: "foo", Version: "1.2.3" });
  });
  it("ignores lines with no colon", () => {
    expect(parseKeyValue("no colon here\nA: 1\n")).toEqual({ A: "1" });
  });
  it("trims whitespace around keys and values", () => {
    expect(parseKeyValue("  A :  1  \n")).toEqual({ A: "1" });
  });
  // Only the first colon separates key from value, so values such as
  // timestamps keep their own colons.
  it("splits on the first colon only", () => {
    expect(parseKeyValue("Time: 12:30:00\n")).toEqual({ Time: "12:30:00" });
  });
  it("returns an empty object for empty input", () => {
    expect(parseKeyValue("")).toEqual({});
  });
  it("keeps the last value when a key repeats", () => {
    expect(parseKeyValue("A: 1\nA: 2\n")).toEqual({ A: "2" });
  });
});
16
-
17
describe("parseKb", () => {
  it("parses a numeric kB value", () => {
    expect(parseKb("16384 kB")).toBe(16384);
  });
  it("parses without unit", () => {
    expect(parseKb("4096")).toBe(4096);
  });
  it("returns 0 for undefined/bad input", () => {
    expect(parseKb(undefined)).toBe(0);
    expect(parseKb("not a number")).toBe(0);
  });
  // The unit suffix is matched case-insensitively.
  it("accepts the unit in any letter case", () => {
    expect(parseKb("2048 KB")).toBe(2048);
    expect(parseKb("2048kb")).toBe(2048);
  });
  it("returns 0 for an empty string", () => {
    expect(parseKb("")).toBe(0);
  });
  it("tolerates leading whitespace before the number", () => {
    expect(parseKb("   512 kB")).toBe(512);
  });
});
package/src/lib/exec.ts DELETED
@@ -1,16 +0,0 @@
1
- import { execFile } from "child_process";
2
- import { promisify } from "util";
3
-
4
const execFileAsync = promisify(execFile);

/** Fields read from a rejected `execFile` promise. */
interface ExecFailure {
  code?: unknown;
  killed?: unknown;
  stdout?: unknown;
}

/**
 * Run an external command without a shell and return its stdout.
 *
 * @param cmd - Executable name or path.
 * @param args - Arguments passed verbatim to the executable.
 * @param timeoutMs - Time limit for the child process, in milliseconds.
 * @returns The command's stdout. On a non-zero exit, whatever non-empty stdout
 *   was produced is still returned. `null` when the command is not installed,
 *   was killed (e.g. by the timeout), or failed without producing stdout.
 *   Never rejects.
 */
export async function run(cmd: string, args: string[], timeoutMs = 10000): Promise<string | null> {
  try {
    const { stdout } = await execFileAsync(cmd, args, { timeout: timeoutMs });
    return stdout;
  } catch (err: unknown) {
    if (typeof err !== "object" || err === null) return null;
    const failure: ExecFailure = err;
    if (failure.code === "ENOENT") return null; // command not installed
    if (failure.killed) return null; // timeout
    // Non-zero exit but the command still produced output: hand it back.
    if (typeof failure.stdout === "string" && failure.stdout.length > 0) return failure.stdout;
    return null;
  }
}
package/src/lib/parse.ts DELETED
@@ -1,29 +0,0 @@
1
- import { readFileSync } from "fs";
2
-
3
/**
 * Read a file as UTF-8 text.
 *
 * @param path - File to read.
 * @returns The file contents, or `null` when the read fails for any reason.
 */
export function readProcFile(path: string): string | null {
  let contents: string | null = null;
  try {
    contents = readFileSync(path, "utf-8");
  } catch {
    // Any read failure is reported as "no data" rather than thrown.
  }
  return contents;
}
10
-
11
/**
 * Parse `key: value` lines into a lookup object.
 *
 * Each line is split at its first colon; key and value are trimmed. Lines
 * without a colon are skipped, and a repeated key keeps its last value.
 *
 * @param raw - Newline-separated text.
 * @returns Object mapping each key to its value.
 */
export function parseKeyValue(raw: string): Record<string, string> {
  const pairs: Record<string, string> = {};
  raw.split("\n").forEach((entry) => {
    const colonAt = entry.indexOf(":");
    if (colonAt === -1) return;
    const key = entry.slice(0, colonAt).trim();
    pairs[key] = entry.slice(colonAt + 1).trim();
  });
  return pairs;
}
20
-
21
/**
 * Parse a value such as `"16384 kB"` (unit optional, any letter case) into a
 * number.
 *
 * @param val - Raw value text; may be undefined.
 * @returns The leading integer, or 0 when `val` is missing, empty, or does
 *   not start with a number.
 */
export function parseKb(val: string | undefined): number {
  if (!val) return 0;
  const withoutUnit = val.replace(/\s*kB$/i, "");
  const kilobytes = Number.parseInt(withoutUnit, 10);
  if (Number.isNaN(kilobytes)) return 0;
  return kilobytes;
}
26
-
27
/**
 * Pause for the given time.
 *
 * @param ms - Delay in milliseconds.
 * @returns A promise that resolves (with no value) once the timer fires.
 */
export function sleep(ms: number): Promise<void> {
  return new Promise<void>((resolve) => {
    setTimeout(resolve, ms);
  });
}
package/src/lib/reboot-marker.ts DELETED
@@ -1,88 +0,0 @@
1
- // Planned-reboot marker handling.
2
- //
3
- // An operator signals "the next reboot is expected, don't page me"
4
- // by writing a short-lived JSON file to disk BEFORE rebooting. The
5
- // collector reads and deletes it on agent startup; the first
6
- // post-boot snapshot then carries `expected_reboot: true` so Forge's
7
- // unexpected_reboot rule stays quiet.
8
- //
9
- // Single-use (deleted on read regardless of validity) and TTL-guarded
10
- // (default 10 min) so a forgotten marker cannot silence a genuine
11
- // crash reboot weeks later.
12
-
13
- import { existsSync, readFileSync, unlinkSync, writeFileSync, mkdirSync, chmodSync } from "node:fs";
14
- import { dirname } from "node:path";
15
-
16
- export const DEFAULT_MARKER_PATH = "/var/lib/crucible/reboot-expected";
17
- export const DEFAULT_TTL_MS = 10 * 60 * 1000;
18
-
19
- export interface PlannedReboot {
20
- expected: true;
21
- reason?: string;
22
- }
23
-
24
- export interface RebootMarker {
25
- expires_at: string; // ISO timestamp
26
- reason?: string;
27
- }
28
-
29
/**
 * Read and delete the planned-reboot marker at `path`.
 *
 * The file is unlinked in every branch where it existed, so a malformed or
 * stale marker is one-shot and cannot linger.
 *
 * @param path - Marker location; defaults to `DEFAULT_MARKER_PATH`.
 * @param now - Reference time for the expiry check; defaults to the current time.
 * @returns The reboot flag when the file existed, held a JSON object with a
 *   parseable `expires_at` strictly later than `now`; otherwise `null`.
 *   `reason` is forwarded only when the marker stored it as a string.
 */
export function consumeRebootMarker(
  path: string = DEFAULT_MARKER_PATH,
  now: Date = new Date(),
): PlannedReboot | null {
  if (!existsSync(path)) return null;

  let raw: string | null = null;
  try {
    raw = readFileSync(path, "utf-8");
  } catch {
    // Unreadable marker: still fall through to the unlink below.
  }
  // Always delete after the read attempt, regardless of validity.
  try { unlinkSync(path); } catch { /* best effort */ }
  if (raw === null) return null;

  // The file content is untrusted: validate its shape instead of assuming it
  // matches RebootMarker.
  let parsed: unknown;
  try { parsed = JSON.parse(raw); } catch { return null; }
  if (typeof parsed !== "object" || parsed === null) return null;

  const fields = parsed as Partial<Record<keyof RebootMarker, unknown>>;
  if (typeof fields.expires_at !== "string") return null;
  const expiresAt = new Date(fields.expires_at);
  if (Number.isNaN(expiresAt.getTime())) return null;
  if (expiresAt.getTime() <= now.getTime()) return null; // stale

  // A non-string reason (number, object, ...) is dropped so callers can rely
  // on the declared `reason?: string` type.
  const reason = typeof fields.reason === "string" ? fields.reason : undefined;
  return { expected: true, reason };
}
53
-
54
/**
 * Write a planned-reboot marker. Used by the `mark-reboot` and `reboot` CLI
 * subcommands.
 *
 * @param opts - All fields optional (the argument itself may be omitted):
 *   `reason` is stored in the marker when non-empty; `ttlMs` is the lifetime
 *   in milliseconds (default `DEFAULT_TTL_MS`); `path` is the marker location
 *   (default `DEFAULT_MARKER_PATH`); `now` is the start of the TTL window
 *   (default: current time).
 * @returns The path written and the ISO-8601 expiry timestamp stored in it.
 * @throws RangeError when `now + ttlMs` is not a representable date.
 * @throws Any error raised by writing the file (e.g. missing privileges).
 */
export function writeRebootMarker(opts: {
  reason?: string;
  ttlMs?: number;
  path?: string;
  now?: Date;
} = {}): { path: string; expires_at: string } {
  const path = opts.path ?? DEFAULT_MARKER_PATH;
  const now = opts.now ?? new Date();
  const ttlMs = opts.ttlMs ?? DEFAULT_TTL_MS;

  const expiresAt = new Date(now.getTime() + ttlMs);
  // Fail with an explicit message instead of the bare "Invalid time value"
  // that toISOString() would throw for a NaN/overflowing expiry.
  if (Number.isNaN(expiresAt.getTime())) {
    throw new RangeError(`reboot marker expiry is not a valid date (ttlMs=${ttlMs})`);
  }

  const body: RebootMarker = { expires_at: expiresAt.toISOString() };
  if (opts.reason) body.reason = opts.reason;

  // Best effort: if the directory cannot be created, the write below reports
  // the real failure.
  try { mkdirSync(dirname(path), { recursive: true, mode: 0o700 }); } catch { /* see above */ }
  writeFileSync(path, JSON.stringify(body), { mode: 0o600 });
  // The `mode` option only applies when the file is created; chmod also
  // tightens a marker file that already existed.
  try { chmodSync(path, 0o600); } catch { /* best effort */ }
  return { path, expires_at: body.expires_at };
}
77
-
78
/**
 * Parse a duration like "10m", "2h", "600s" into milliseconds. Used by the
 * CLI for the `--ttl` flag.
 *
 * Accepted form: a non-negative integer, optional whitespace, and an optional
 * unit (`ms`, `s`, `m`, `h`). A bare number is read as seconds.
 *
 * @param s - Duration text; surrounding whitespace is ignored.
 * @returns The duration in milliseconds, or `null` when the text does not
 *   match the accepted form or the duration is too large to be added to the
 *   current time and still yield a valid `Date`.
 */
export function parseDuration(s: string): number | null {
  const match = /^(\d+)\s*(ms|s|m|h)?$/.exec(s.trim());
  if (!match) return null;

  // The pattern only admits digits, so the amount cannot be negative; it can
  // still overflow to Infinity for an absurdly long digit string.
  const amount = Number.parseInt(match[1], 10);
  if (!Number.isFinite(amount)) return null;

  const unit = match[2] ?? "s";
  const mult = unit === "ms" ? 1 : unit === "s" ? 1000 : unit === "m" ? 60_000 : 3_600_000;
  const total = amount * mult;

  // ECMAScript dates cover +/-8.64e15 ms around the epoch. A TTL that pushes
  // "now + ttl" past that limit cannot produce an expiry timestamp, so treat
  // it as invalid input here rather than failing later while writing.
  const MAX_TIME_VALUE_MS = 8.64e15;
  if (total > MAX_TIME_VALUE_MS - Date.now()) return null;
  return total;
}
package/src/lib/types.ts DELETED
@@ -1,226 +0,0 @@
1
- export interface Snapshot {
2
- collector_version: string;
3
- timestamp: string;
4
- system: SystemInfo;
5
- cpu: CpuInfo;
6
- memory: MemoryInfo;
7
- disks: DiskInfo[];
8
- smart: SmartInfo[];
9
- network: NetworkInfo[];
10
- raid: RaidInfo[];
11
- ipmi: IpmiInfo;
12
- os_alerts: OsAlerts;
13
- security?: SecurityData;
14
- zfs?: ZfsData;
15
- io_errors?: { count: number; devices: string[] };
16
- io_latency?: Array<{ device: string; avg_read_latency_ms: number | null; avg_write_latency_ms: number | null; read_iops: number; write_iops: number }>;
17
- conntrack?: ConntrackData;
18
- systemd?: SystemdData;
19
- ntp?: NtpData;
20
- file_descriptors?: FileDescriptorData;
21
- // Planned-reboot flag: set only on the first snapshot after a reboot
22
- // that was marked with `crucible-agent mark-reboot` / `reboot`. Forge
23
- // reads this to suppress the `unexpected_reboot` rule. Single-use:
24
- // subsequent snapshots don't carry it.
25
- expected_reboot?: boolean;
26
- expected_reboot_reason?: string;
27
- }
28
-
29
- export interface ConntrackData {
30
- available: boolean;
31
- count: number;
32
- max: number;
33
- percent: number;
34
- }
35
-
36
- export interface SystemdData {
37
- failed_units: string[];
38
- failed_count: number;
39
- }
40
-
41
- export interface NtpData {
42
- synced: boolean;
43
- offset_seconds: number;
44
- source: string;
45
- daemon_running: boolean;
46
- }
47
-
48
- export interface FileDescriptorData {
49
- allocated: number;
50
- free: number;
51
- max: number;
52
- percent: number;
53
- }
54
-
55
- export interface ZfsPool {
56
- name: string;
57
- state: string;
58
- errors_text: string;
59
- scrub_errors?: number;
60
- scrub_repaired?: string;
61
- last_scrub_date?: string;
62
- scrub_never_run?: boolean;
63
- }
64
-
65
- export interface ZfsData {
66
- pools: ZfsPool[];
67
- }
68
-
69
- export interface SecurityData {
70
- ssh: { permitRootLogin: string; passwordAuthentication: string; rootPasswordExposed: boolean } | null;
71
- firewall: { active: boolean; source: string; details: string };
72
- pending_updates: { distro: string; pendingCount: number; available: boolean } | null;
73
- kernel_vulns: Array<{ name: string; status: string; mitigated: boolean }>;
74
- kernel_reboot: { running: string; installed: string; needsReboot: boolean } | null;
75
- auto_updates: { configured: boolean; mechanism: string; details: string };
76
- }
77
-
78
- export interface SystemInfo {
79
- hostname: string;
80
- ip: string;
81
- os: string;
82
- /** `ID=` from /etc/os-release, lowercased. e.g. "ubuntu", "debian", "rocky", "arch", "alpine". */
83
- os_id?: string;
84
- /** `ID_LIKE=` from /etc/os-release, lowercased, space-separated. Used by Forge
85
- * to pick distro-family-specific fix command variants. e.g. on Rocky this
86
- * is "rhel centos fedora"; on Ubuntu it is "debian". */
87
- os_id_like?: string;
88
- kernel: string;
89
- uptime_seconds: number;
90
- }
91
-
92
- export interface CpuCoreInfo {
93
- core: number;
94
- user_percent: number;
95
- system_percent: number;
96
- iowait_percent: number;
97
- idle_percent: number;
98
- irq_percent: number;
99
- softirq_percent: number;
100
- }
101
-
102
- export interface CpuInfo {
103
- user_percent: number;
104
- system_percent: number;
105
- iowait_percent: number;
106
- idle_percent: number;
107
- load_1m: number;
108
- load_5m: number;
109
- load_15m: number;
110
- cores?: CpuCoreInfo[];
111
- }
112
-
113
- export interface MemoryInfo {
114
- total_mb: number;
115
- used_mb: number;
116
- available_mb: number;
117
- swap_total_mb: number;
118
- swap_used_mb: number;
119
- }
120
-
121
- export interface DiskInfo {
122
- device: string;
123
- mount: string;
124
- total_gb: number;
125
- used_gb: number;
126
- available_gb: number;
127
- percent_used: number;
128
- fstype?: string;
129
- options?: string;
130
- inodes_total?: number;
131
- inodes_used?: number;
132
- inodes_free?: number;
133
- io_read_mb_s?: number;
134
- io_write_mb_s?: number;
135
- latency_p99_ms?: number;
136
- }
137
-
138
- export interface SmartInfo {
139
- device: string;
140
- model: string;
141
- health: string;
142
- temperature_c?: number;
143
- percentage_used?: number;
144
- reallocated_sectors?: number;
145
- pending_sectors?: number;
146
- power_on_hours?: number;
147
- }
148
-
149
/**
 * Per-interface network data carried in a snapshot.
 *
 * Optional fields are omitted (`undefined`) when the value is unavailable;
 * the declared types do not admit `null`.
 */
export interface NetworkInfo {
  interface: string;
  speed_mbps: number;
  rx_bytes_sec: number;
  tx_bytes_sec: number;
  /** Delta over the collection interval (rx_errors + any subtype counter). */
  rx_errors: number;
  tx_errors: number;
  rx_drops: number;
  tx_drops: number;
  /** Delta over the collection interval. Omitted if the counter is not available on this NIC. */
  rx_packets?: number;
  tx_packets?: number;
  /** Fine-grained RX hardware-error subtypes (deltas). Omitted if unavailable. */
  rx_crc_errors?: number;
  rx_frame_errors?: number;
  rx_length_errors?: number;
  /** TX physical-layer fault counter (delta). Omitted if unavailable. */
  tx_carrier_errors?: number;
  /** "up", "down", "unknown", etc. from /sys/class/net/{iface}/operstate. */
  operstate?: string;
  /** If this interface is a bond slave, the bond name. */
  bond_master?: string;
  /** True when this entry represents the bond aggregate. */
  is_bond_master?: boolean;
}
172
-
173
- export interface RaidInfo {
174
- device: string;
175
- level: string;
176
- status: string;
177
- degraded: boolean;
178
- disks: string[];
179
- failed_disks: string[];
180
- }
181
-
182
- export interface SelEvent {
183
- id: number;
184
- timestamp: string;
185
- sensor: string;
186
- sensor_type: string;
187
- event: string;
188
- direction: string;
189
- severity: string;
190
- }
191
-
192
- export interface FanStatus {
193
- name: string;
194
- rpm: number;
195
- status: string;
196
- }
197
-
198
- export interface IpmiInfo {
199
- available: boolean;
200
- sensors: Array<{
201
- name: string;
202
- value: number | string;
203
- unit: string;
204
- status: string;
205
- upper_critical?: number;
206
- }>;
207
- ecc_errors: { correctable: number; uncorrectable: number };
208
- sel_entries_count: number;
209
- sel_events_recent: SelEvent[];
210
- fans: FanStatus[];
211
- }
212
-
213
- export interface OsAlerts {
214
- oom_kills_recent: number;
215
- zombie_processes: number;
216
- time_drift_ms: number;
217
- }
218
-
219
- export interface AlertResult {
220
- type: string;
221
- severity: "critical" | "warning";
222
- title: string;
223
- message: string;
224
- evidence: Record<string, unknown>;
225
- recommendation: string;
226
- }