@glassmkr/crucible 0.7.0 → 0.8.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (111) hide show
  1. package/dist/alerts/__tests__/rules.test.d.ts +1 -0
  2. package/dist/alerts/__tests__/rules.test.js +325 -0
  3. package/dist/alerts/__tests__/rules.test.js.map +1 -0
  4. package/dist/alerts/rules.d.ts +8 -0
  5. package/dist/alerts/rules.js +139 -32
  6. package/dist/alerts/rules.js.map +1 -1
  7. package/dist/api.d.ts +2 -0
  8. package/dist/api.js +7 -0
  9. package/dist/api.js.map +1 -0
  10. package/dist/collect/__tests__/dmi.test.d.ts +1 -0
  11. package/dist/collect/__tests__/dmi.test.js +114 -0
  12. package/dist/collect/__tests__/dmi.test.js.map +1 -0
  13. package/dist/collect/__tests__/ipmi.test.js +47 -1
  14. package/dist/collect/__tests__/ipmi.test.js.map +1 -1
  15. package/dist/collect/__tests__/thermal.test.d.ts +1 -0
  16. package/dist/collect/__tests__/thermal.test.js +164 -0
  17. package/dist/collect/__tests__/thermal.test.js.map +1 -0
  18. package/dist/collect/dmi.d.ts +19 -0
  19. package/dist/collect/dmi.js +109 -0
  20. package/dist/collect/dmi.js.map +1 -0
  21. package/dist/collect/ipmi.d.ts +27 -2
  22. package/dist/collect/ipmi.js +90 -2
  23. package/dist/collect/ipmi.js.map +1 -1
  24. package/dist/collect/thermal.d.ts +10 -0
  25. package/dist/collect/thermal.js +187 -0
  26. package/dist/collect/thermal.js.map +1 -0
  27. package/dist/config.d.ts +10 -0
  28. package/dist/config.js +2 -0
  29. package/dist/config.js.map +1 -1
  30. package/dist/index.js +52 -14
  31. package/dist/index.js.map +1 -1
  32. package/dist/lib/__tests__/capability.test.d.ts +1 -0
  33. package/dist/lib/__tests__/capability.test.js +87 -0
  34. package/dist/lib/__tests__/capability.test.js.map +1 -0
  35. package/dist/lib/__tests__/vendor-sensors.test.d.ts +1 -0
  36. package/dist/lib/__tests__/vendor-sensors.test.js +49 -0
  37. package/dist/lib/__tests__/vendor-sensors.test.js.map +1 -0
  38. package/dist/lib/capability.d.ts +21 -0
  39. package/dist/lib/capability.js +110 -0
  40. package/dist/lib/capability.js.map +1 -0
  41. package/dist/lib/cpu-thermal-chips.d.ts +2 -0
  42. package/dist/lib/cpu-thermal-chips.js +28 -0
  43. package/dist/lib/cpu-thermal-chips.js.map +1 -0
  44. package/dist/lib/types.d.ts +58 -0
  45. package/dist/lib/vendor-sensors.d.ts +27 -0
  46. package/dist/lib/vendor-sensors.js +63 -0
  47. package/dist/lib/vendor-sensors.js.map +1 -0
  48. package/dist/lib/version-check.js +1 -1
  49. package/dist/lib/version-check.js.map +1 -1
  50. package/dist/lib/version.d.ts +1 -0
  51. package/dist/lib/version.js +32 -0
  52. package/dist/lib/version.js.map +1 -0
  53. package/dist/notify/email.js +2 -1
  54. package/dist/notify/email.js.map +1 -1
  55. package/dist/notify/slack.js +2 -1
  56. package/dist/notify/slack.js.map +1 -1
  57. package/dist/notify/telegram.js +1 -1
  58. package/dist/notify/telegram.js.map +1 -1
  59. package/package.json +16 -1
  60. package/rule-ids.json +29 -0
  61. package/.dockerignore +0 -13
  62. package/.github/ISSUE_TEMPLATE/bug_report.md +0 -24
  63. package/.github/ISSUE_TEMPLATE/no_data.md +0 -26
  64. package/.github/workflows/docker.yml +0 -53
  65. package/.github/workflows/publish.yml +0 -25
  66. package/Dockerfile +0 -59
  67. package/config/collector.example.yaml +0 -43
  68. package/docker-compose.yml +0 -26
  69. package/scripts/sign-release.sh +0 -29
  70. package/src/__tests__/cli.test.ts +0 -74
  71. package/src/__tests__/reboot-marker.test.ts +0 -122
  72. package/src/alerts/evaluator.ts +0 -15
  73. package/src/alerts/rules.ts +0 -283
  74. package/src/alerts/state.ts +0 -92
  75. package/src/cli.ts +0 -112
  76. package/src/collect/__tests__/ipmi.test.ts +0 -96
  77. package/src/collect/__tests__/smart.test.ts +0 -68
  78. package/src/collect/__tests__/system.test.ts +0 -29
  79. package/src/collect/__tests__/zfs.test.ts +0 -72
  80. package/src/collect/conntrack.ts +0 -27
  81. package/src/collect/cpu.ts +0 -92
  82. package/src/collect/disks.ts +0 -91
  83. package/src/collect/fd.ts +0 -31
  84. package/src/collect/io-errors.ts +0 -23
  85. package/src/collect/io-latency.ts +0 -103
  86. package/src/collect/ipmi.ts +0 -207
  87. package/src/collect/memory.ts +0 -30
  88. package/src/collect/network.ts +0 -193
  89. package/src/collect/ntp.ts +0 -114
  90. package/src/collect/os-alerts.ts +0 -43
  91. package/src/collect/raid.ts +0 -40
  92. package/src/collect/security.ts +0 -268
  93. package/src/collect/smart.ts +0 -72
  94. package/src/collect/system.ts +0 -32
  95. package/src/collect/systemd.ts +0 -33
  96. package/src/collect/zfs.ts +0 -66
  97. package/src/config.ts +0 -65
  98. package/src/index.ts +0 -233
  99. package/src/lib/__tests__/parse.test.ts +0 -28
  100. package/src/lib/exec.ts +0 -16
  101. package/src/lib/parse.ts +0 -29
  102. package/src/lib/reboot-marker.ts +0 -88
  103. package/src/lib/types.ts +0 -226
  104. package/src/lib/version-check.ts +0 -38
  105. package/src/metrics-server.ts +0 -123
  106. package/src/notify/email.ts +0 -68
  107. package/src/notify/slack.ts +0 -46
  108. package/src/notify/telegram.ts +0 -65
  109. package/src/push/forge.ts +0 -109
  110. package/tsconfig.json +0 -15
  111. package/vitest.config.ts +0 -12
@@ -1,72 +0,0 @@
1
- import { run } from "../lib/exec.js";
2
- import { readdirSync } from "fs";
3
- import type { SmartInfo } from "../lib/types.js";
4
-
5
/**
 * Collect SMART data for every disk-like entry under /sys/block.
 *
 * Entries whose name starts with "sd", "nvme" or "hd" are queried one at a
 * time with `smartctl --json --all`. Devices that produce no output or
 * unparseable output are skipped.
 *
 * @returns One SmartInfo per device that could be parsed; an empty array
 *          when /sys/block cannot be read.
 */
export async function collectSmart(): Promise<SmartInfo[]> {
  const diskPrefixes = ["sd", "nvme", "hd"];

  let blockEntries: string[];
  try {
    blockEntries = readdirSync("/sys/block");
  } catch {
    // /sys/block is unreadable: report nothing.
    return [];
  }

  const devicePaths = blockEntries
    .filter((name) => diskPrefixes.some((prefix) => name.startsWith(prefix)))
    .map((name) => `/dev/${name}`);

  const collected: SmartInfo[] = [];
  for (const devicePath of devicePaths) {
    const rawJson = await run("smartctl", ["--json", "--all", devicePath]);
    if (!rawJson) continue;

    try {
      collected.push(parseSmartctlJson(JSON.parse(rawJson), devicePath));
    } catch {
      // Output was not valid JSON (or parsing threw): skip this device.
    }
  }

  return collected;
}
34
-
35
/**
 * Convert the JSON document emitted by `smartctl --json --all` into a
 * SmartInfo record.
 *
 * @param data   Parsed smartctl JSON. Only the fields listed in the
 *               parameter type are read; everything else is ignored.
 * @param device Device path the data belongs to (copied into the result).
 * @returns The populated SmartInfo record.
 */
export function parseSmartctlJson(data: Record<string, unknown> & {
  model_name?: string;
  model_family?: string;
  smart_status?: { passed?: boolean };
  temperature?: { current?: number };
  power_on_time?: { hours?: number };
  nvme_smart_health_information_log?: { percentage_used?: number; temperature?: number };
  ata_smart_attributes?: { table?: Array<{ id?: number; name?: string; raw?: { value?: number } }> };
}, device: string): SmartInfo {
  const info: SmartInfo = {
    device,
    model: data.model_name || data.model_family || "unknown",
    // NOTE(review): a missing smart_status block is reported as "FAILED";
    // confirm that is intended for devices that do not expose SMART health.
    health: data.smart_status?.passed ? "PASSED" : "FAILED",
    temperature_c: data.temperature?.current,
    power_on_hours: data.power_on_time?.hours,
  };

  // NVMe specific
  const nvme = data.nvme_smart_health_information_log;
  if (nvme) {
    info.percentage_used = nvme.percentage_used;
    // Prefer the NVMe log temperature, but keep the generic reading when the
    // log omits it instead of overwriting it with undefined.
    info.temperature_c = nvme.temperature ?? info.temperature_c;
  }

  // SATA specific
  for (const attr of data.ata_smart_attributes?.table ?? []) {
    if (attr.id === 5 || attr.name === "Reallocated_Sector_Ct") {
      info.reallocated_sectors = attr.raw?.value ?? 0;
    }
    if (attr.id === 197 || attr.name === "Current_Pending_Sector") {
      info.pending_sectors = attr.raw?.value ?? 0;
    }
  }

  return info;
}
@@ -1,32 +0,0 @@
1
- import { hostname } from "os";
2
- import { readProcFile } from "../lib/parse.js";
3
- import { run } from "../lib/exec.js";
4
- import type { SystemInfo } from "../lib/types.js";
5
-
6
/**
 * Read a single `KEY=value` assignment from os-release style text.
 *
 * Handles unquoted (`ID=ubuntu`), double-quoted (`ID="rocky"`) and
 * single-quoted (`ID='debian'`) values. Trailing blanks after the value are
 * ignored. The key is matched literally at the start of a line.
 *
 * @param osRelease Full text of the os-release file.
 * @param key       Field name to look up, e.g. "ID" or "ID_LIKE".
 * @returns The value lower-cased, or undefined when the key is not present.
 */
export function readOsReleaseField(osRelease: string, key: string): string | undefined {
  // Escape the key so regex metacharacters in it cannot change the pattern.
  const literalKey = key.replace(/[.*+?^${}()|[\]\\]/g, "\\$&");
  // Group 1 captures the optional opening quote; \1 requires the same closer.
  const m = osRelease.match(new RegExp(`^${literalKey}=(["']?)(.+?)\\1[ \\t]*$`, "m"));
  return m ? m[2].toLowerCase() : undefined;
}
12
-
13
/**
 * Gather basic host identity: hostname, first IP address reported by
 * `hostname -I`, OS name and ids from /etc/os-release, kernel release from
 * `uname -r`, and uptime from /proc/uptime.
 *
 * Missing data falls back to "Unknown" / "unknown" / 0. The `os_id` and
 * `os_id_like` keys are only present when os-release provides them.
 */
export async function collectSystem(): Promise<SystemInfo> {
  const osReleaseText = readProcFile("/etc/os-release") || "";
  const prettyName = osReleaseText.match(/PRETTY_NAME="(.+?)"/);
  const os_id = readOsReleaseField(osReleaseText, "ID");
  const os_id_like = readOsReleaseField(osReleaseText, "ID_LIKE");

  const unameOutput = await run("uname", ["-r"]);

  // First whitespace-separated field of /proc/uptime is seconds since boot.
  const [uptimeField] = (readProcFile("/proc/uptime") || "0").split(" ");

  const addressList = await run("hostname", ["-I"]);

  return {
    hostname: hostname(),
    ip: addressList?.trim().split(" ")[0] || "unknown",
    os: prettyName?.[1] || "Unknown",
    ...(os_id ? { os_id } : {}),
    ...(os_id_like ? { os_id_like } : {}),
    kernel: unameOutput?.trim() || "unknown",
    uptime_seconds: Math.floor(parseFloat(uptimeField)),
  };
}
@@ -1,33 +0,0 @@
1
- import { run } from "../lib/exec.js";
2
-
3
/** Result of a systemd failed-unit scan. */
export interface SystemdData {
  /** Names of the failed service units that were not excluded. */
  failed_units: string[];
  /** Number of entries in `failed_units`. */
  failed_count: number;
}

/**
 * Service units that are ignored by default because they are commonly in a
 * failed state by design or misconfiguration.
 */
const DEFAULT_EXCLUDES = ["systemd-networkd-wait-online.service"];
12
-
13
/**
 * List failed systemd service units.
 *
 * Runs `systemctl list-units --type=service --state=failed` and takes the
 * first column of each output line as the unit name.
 *
 * @param extraExcludes Unit names to ignore in addition to DEFAULT_EXCLUDES.
 * @returns The failed unit names and their count; both empty/zero when the
 *          command produced no output.
 */
export async function collectSystemd(extraExcludes: string[] = []): Promise<SystemdData> {
  const listing = await run("systemctl", [
    "list-units", "--type=service", "--state=failed", "--no-legend", "--plain",
  ]);

  const text = listing ? listing.trim() : "";
  if (text === "") {
    return { failed_units: [], failed_count: 0 };
  }

  const ignored = new Set([...DEFAULT_EXCLUDES, ...extraExcludes]);
  const failed = text
    .split("\n")
    .map((row) => row.trim().split(/\s+/)[0])
    .filter((name) => Boolean(name) && name.endsWith(".service") && !ignored.has(name));

  return { failed_units: failed, failed_count: failed.length };
}
@@ -1,66 +0,0 @@
1
- import { run } from "../lib/exec.js";
2
- import type { ZfsData, ZfsPool } from "../lib/types.js";
3
-
4
/**
 * Collect ZFS pool status.
 *
 * @returns Parsed pools from `zpool status`, or null when `which zpool`
 *          finds nothing, the status command yields no output, or no pool
 *          could be parsed.
 */
export async function collectZfs(): Promise<ZfsData | null> {
  const hasText = (out: string | null): out is string => Boolean(out && out.trim());

  // Probe for the binary first (3s budget).
  const located = await run("which", ["zpool"], 3000);
  if (!hasText(located)) return null;

  const statusText = await run("zpool", ["status"], 10000);
  if (!hasText(statusText)) return null;

  const pools = parseZpoolStatus(statusText);
  return pools.length > 0 ? { pools } : null;
}
16
-
17
/**
 * Parse the text output of `zpool status` into one record per pool.
 *
 * A `pool:` line starts a new record; `state:`, `errors:` and `scan:` lines
 * that follow are attached to it. Lines before the first `pool:` line are
 * ignored.
 *
 * Scan handling:
 * - "none requested" sets `scrub_never_run`.
 * - Only scan lines that mention a scrub contribute scrub fields, so a
 *   resilver's completion date is not recorded as the last scrub date.
 * - The date is taken after the standalone word "on", so weekday names such
 *   as "Mon" in "in progress since Mon ..." are not mistaken for it.
 *
 * @param zpoolStatus Raw stdout of `zpool status`.
 * @returns Pools in the order they appear in the output.
 */
export function parseZpoolStatus(zpoolStatus: string): ZfsPool[] {
  const pools: ZfsPool[] = [];
  let current: ZfsPool | null = null;

  for (const line of zpoolStatus.split("\n")) {
    const poolMatch = line.match(/^\s*pool:\s*(.+)/);
    if (poolMatch) {
      current = {
        name: poolMatch[1].trim(),
        state: "UNKNOWN",
        errors_text: "",
      };
      pools.push(current);
      continue;
    }

    if (!current) continue;

    const stateMatch = line.match(/^\s*state:\s*(.+)/);
    if (stateMatch) {
      current.state = stateMatch[1].trim();
      continue;
    }

    const errorsMatch = line.match(/^\s*errors:\s*(.+)/);
    if (errorsMatch) {
      current.errors_text = errorsMatch[1].trim();
      continue;
    }

    // Parse scrub info
    if (!line.includes("scan:")) continue;

    if (line.includes("none requested")) {
      current.scrub_never_run = true;
      continue;
    }

    // Resilver (or other non-scrub) scan lines carry no scrub information.
    if (!/\bscrub\b/.test(line)) continue;

    const repairMatch = line.match(/scrub repaired (\S+) in .* with (\d+) errors/);
    if (repairMatch) {
      current.scrub_repaired = repairMatch[1];
      current.scrub_errors = parseInt(repairMatch[2], 10) || 0;
    }

    // \b keeps "on" from matching the tail of "Mon".
    const dateMatch = line.match(/\bon (.+)$/);
    if (dateMatch) {
      current.last_scrub_date = dateMatch[1].trim();
    }
  }

  return pools;
}
package/src/config.ts DELETED
@@ -1,65 +0,0 @@
1
- import { readFileSync } from "fs";
2
- import { parse } from "yaml";
3
- import { z } from "zod";
4
-
5
// Collection cadence and optional collectors.
const CollectionSchema = z.object({
  interval_seconds: z.number().min(60).max(3600).default(300),
  ipmi: z.boolean().default(true),
  smart: z.boolean().default(true),
});

// Forge push target and credentials.
const ForgeSchema = z.object({
  enabled: z.boolean().default(false),
  url: z.string().default("https://forge.glassmkr.com"),
  api_key: z.string().default(""),
  tls_pin: z.string().default(""),
});

// Alert thresholds passed to the alert evaluator.
const ThresholdsSchema = z.object({
  ram_percent: z.number().default(90),
  swap_alert: z.boolean().default(true),
  disk_percent: z.number().default(85),
  iowait_percent: z.number().default(20),
  nvme_wear_percent: z.number().default(85),
  disk_latency_nvme_ms: z.number().default(50),
  disk_latency_hdd_ms: z.number().default(200),
  cpu_temp_warning_c: z.number().default(80),
  cpu_temp_critical_c: z.number().default(90),
  interface_utilization_percent: z.number().default(90),
});

// Notification channels; each is disabled unless explicitly enabled.
const ChannelsSchema = z.object({
  telegram: z.object({
    enabled: z.boolean().default(false),
    bot_token: z.string().default(""),
    chat_id: z.string().default(""),
  }).default({}),
  email: z.object({
    enabled: z.boolean().default(false),
    to: z.string().default(""),
  }).default({}),
  slack: z.object({
    enabled: z.boolean().default(false),
    webhook_url: z.string().default(""),
  }).default({}),
});

// Prometheus metrics endpoint.
const PrometheusSchema = z.object({
  enabled: z.boolean().default(false),
  port: z.number().default(9101),
});

/**
 * Full collector configuration. Every section and every field has a
 * default, so an empty object validates to a complete config.
 */
const ConfigSchema = z.object({
  server_name: z.string().default("unnamed-server"),
  collection: CollectionSchema.default({}),
  forge: ForgeSchema.default({}),
  thresholds: ThresholdsSchema.default({}),
  channels: ChannelsSchema.default({}),
  prometheus: PrometheusSchema.default({}),
});
50
-
51
- export type Config = z.infer<typeof ConfigSchema>;
52
-
53
/**
 * Load and validate the collector configuration from a YAML file.
 *
 * A missing file is not an error: a notice is logged and the schema
 * defaults are returned. A file that parses to nothing (empty or
 * comment-only YAML) is treated as an empty config, so it also yields the
 * defaults rather than a validation error.
 *
 * @param path Path to the YAML config file.
 * @returns The validated configuration with defaults filled in.
 * @throws Any read error other than ENOENT, YAML syntax errors, and schema
 *         validation errors.
 */
export function loadConfig(path: string): Config {
  let raw: string;
  try {
    raw = readFileSync(path, "utf-8");
  } catch (err: unknown) {
    if ((err as { code?: unknown } | null)?.code === "ENOENT") {
      console.log(`[config] No config file at ${path}, using defaults`);
      return ConfigSchema.parse({});
    }
    throw err;
  }

  // An empty YAML document parses to null; validate {} in that case.
  return ConfigSchema.parse(parse(raw) ?? {});
}
package/src/index.ts DELETED
@@ -1,233 +0,0 @@
1
- #!/usr/bin/env node
2
-
3
- import { readFileSync } from "node:fs";
4
- import { fileURLToPath } from "node:url";
5
- import { dirname, join } from "node:path";
6
- import { parseCliArgs } from "./cli.js";
7
-
8
- const __dirname = dirname(fileURLToPath(import.meta.url));
9
// Version string read from the package.json one directory above this
// module; "0.0.0" when the file cannot be read or parsed, or carries no
// version.
function readPackageVersion() {
  try {
    const manifestPath = join(__dirname, "..", "package.json");
    const manifest = JSON.parse(readFileSync(manifestPath, "utf8"));
    return manifest.version || "0.0.0";
  } catch {
    return "0.0.0";
  }
}
const PKG_VERSION = readPackageVersion();
17
-
18
// One-shot CLI modes (--version, --help, mark-reboot, reboot) are served
// here, before the collectors are imported, the config is loaded, or the
// Prometheus server is started, so they work even on hosts without a
// config file or the external tools.
const { result: cliArgs, output: cliOutput } = parseCliArgs(process.argv.slice(2), PKG_VERSION);

if (cliArgs.mode === "version" || cliArgs.mode === "help") {
  console.log(cliOutput);
  process.exit(0);
}

if (cliArgs.mode === "mark-reboot" || cliArgs.mode === "reboot") {
  const markerLib = await import("./lib/reboot-marker.js");

  // No --ttl given: use the library default. An unparseable --ttl is a
  // usage error (exit code 2).
  const ttlMs = cliArgs.ttl ? markerLib.parseDuration(cliArgs.ttl) : markerLib.DEFAULT_TTL_MS;
  if (ttlMs === null) {
    console.error(`[mark-reboot] invalid --ttl value: ${cliArgs.ttl}. Use e.g. 10m, 2h, 600s.`);
    process.exit(2);
  }

  try {
    const written = markerLib.writeRebootMarker({ reason: cliArgs.reason, ttlMs });
    const reasonSuffix = cliArgs.reason ? `, reason: ${cliArgs.reason}` : "";
    console.log(`[${cliArgs.mode}] marker written: ${written.path} (expires ${written.expires_at}${reasonSuffix})`);
  } catch (err: any) {
    console.error(`[${cliArgs.mode}] failed to write marker: ${err?.message || err}`);
    console.error(`  Most likely cause: need root privileges to write under /var/lib/crucible/.`);
    process.exit(1);
  }

  // `reboot` additionally triggers the reboot once the marker is on disk.
  if (cliArgs.mode === "reboot") {
    const childProcess = await import("node:child_process");
    console.log("[reboot] invoking systemctl reboot");
    try {
      childProcess.execFileSync("systemctl", ["reboot"], { stdio: "inherit" });
    } catch (err: any) {
      console.error(`[reboot] systemctl reboot failed: ${err?.message || err}`);
      process.exit(1);
    }
  }

  process.exit(0);
}
56
-
57
- import { loadConfig } from "./config.js";
58
- import { checkForUpdates } from "./lib/version-check.js";
59
- import { startMetricsServer, updateMetrics } from "./metrics-server.js";
60
- import { collectSystem } from "./collect/system.js";
61
- import { collectCpu } from "./collect/cpu.js";
62
- import { collectMemory } from "./collect/memory.js";
63
- import { collectDisks } from "./collect/disks.js";
64
- import { collectSmart } from "./collect/smart.js";
65
- import { collectNetwork } from "./collect/network.js";
66
- import { collectRaid } from "./collect/raid.js";
67
- import { collectIpmi } from "./collect/ipmi.js";
68
- import { collectOsAlerts } from "./collect/os-alerts.js";
69
- import { evaluateAlerts } from "./alerts/evaluator.js";
70
- import { updateAlertState } from "./alerts/state.js";
71
- import { sendTelegram } from "./notify/telegram.js";
72
- import { sendSlack } from "./notify/slack.js";
73
- import { sendEmail } from "./notify/email.js";
74
- import { pushToForge, initForgeAgent } from "./push/forge.js";
75
- import { collectSecurity, type SecurityData } from "./collect/security.js";
76
- import { collectZfs } from "./collect/zfs.js";
77
- import { collectIoErrors } from "./collect/io-errors.js";
78
- import { collectIoLatency } from "./collect/io-latency.js";
79
- import { collectConntrack } from "./collect/conntrack.js";
80
- import { collectSystemd } from "./collect/systemd.js";
81
- import { collectNtp } from "./collect/ntp.js";
82
- import { collectFileDescriptors } from "./collect/fd.js";
83
- import type { Snapshot, IpmiInfo } from "./lib/types.js";
84
- import { consumeRebootMarker, type PlannedReboot } from "./lib/reboot-marker.js";
85
-
86
// Read (and thereby delete) the planned-reboot marker exactly once at
// startup. When it is present and valid, the first snapshot is flagged as
// an expected reboot; later snapshots are not.
const plannedRebootFlag: PlannedReboot | null = consumeRebootMarker();
if (plannedRebootFlag) {
  const reasonSuffix = plannedRebootFlag.reason ? `: ${plannedRebootFlag.reason}` : "";
  console.log(`[collector] Planned reboot acknowledged${reasonSuffix}`);
}
let plannedRebootConsumed = false;

const config = loadConfig(cliArgs.configPath);

// Startup banner describing the effective configuration.
const ipmiLabel = config.collection.ipmi ? "enabled" : "disabled";
const smartLabel = config.collection.smart ? "enabled" : "disabled";
const forgeLabel = config.forge.enabled ? config.forge.url : "disabled";
const prometheusLabel = config.prometheus.enabled ? `:${config.prometheus.port}/metrics` : "disabled";
console.log(`[collector] Starting. Server: ${config.server_name}. Interval: ${config.collection.interval_seconds}s`);
console.log(`[collector] IPMI: ${ipmiLabel}, SMART: ${smartLabel}`);
console.log(`[collector] Forge: ${forgeLabel}`);
console.log(`[collector] Prometheus: ${prometheusLabel}`);

// Prometheus metrics endpoint (opt-in).
if (config.prometheus.enabled) {
  startMetricsServer(config.prometheus.port);
}

// TLS pinning for Forge, only when a pin is configured.
if (config.forge.tls_pin) {
  initForgeAgent(config.forge.tls_pin);
  console.log("[collector] TLS pinning enabled for Forge");
}

// Placeholder reported when IPMI collection is disabled.
const emptyIpmi: IpmiInfo = {
  available: false,
  sensors: [],
  ecc_errors: { correctable: 0, uncorrectable: 0 },
  sel_entries_count: 0,
  sel_events_recent: [],
  fans: [],
};

// Security checks do not run on every cycle: collect() counts cycles and
// reuses the cached result between refreshes.
let securityCycleCount = 0;
let cachedSecurity: SecurityData | undefined;
119
-
120
/**
 * Run one collection cycle.
 *
 * Steps, in order:
 *  1. Run the core collectors in parallel (SMART and IPMI only when enabled).
 *  2. Refresh the cached security data roughly once per hour.
 *  3. Build the snapshot, flagging the first one after a planned reboot.
 *  4. Run the optional collectors; a failure in one only omits its field.
 *  5. Update Prometheus metrics, evaluate alerts, send notifications.
 *  6. Push to Forge and trigger the update check.
 *  7. Print a summary after the first cycle.
 */
async function collect() {
  const startTime = Date.now();
  console.log(`[collector] Collecting...`);

  const [system, cpu, memory, disks, smart, network, raid, ipmi, osAlerts] = await Promise.all([
    collectSystem(),
    collectCpu(),
    collectMemory(),
    collectDisks(),
    config.collection.smart ? collectSmart() : Promise.resolve([]),
    collectNetwork(),
    collectRaid(),
    config.collection.ipmi ? collectIpmi() : Promise.resolve(emptyIpmi),
    collectOsAlerts(),
  ]);

  // Security checks: run about once per hour, reuse cached data in between.
  // The cycle count is derived from the configured interval (12 cycles at
  // the default 300s) so other intervals still refresh hourly.
  const securityRefreshCycles = Math.max(1, Math.round(3600 / config.collection.interval_seconds));
  securityCycleCount++;
  if (securityCycleCount >= securityRefreshCycles || !cachedSecurity) {
    securityCycleCount = 0;
    try {
      cachedSecurity = await collectSecurity();
    } catch (err) {
      console.error("[security] Collection error:", err);
    }
  }

  const snapshot: Snapshot = {
    collector_version: PKG_VERSION,
    timestamp: new Date().toISOString(),
    system, cpu, memory, disks, smart, network, raid, ipmi, os_alerts: osAlerts,
    security: cachedSecurity,
  };

  // Single-shot: the very first snapshot after a marked reboot carries
  // the flag, subsequent snapshots do not.
  if (plannedRebootFlag && !plannedRebootConsumed) {
    Object.assign(snapshot, {
      expected_reboot: true,
      ...(plannedRebootFlag.reason ? { expected_reboot_reason: plannedRebootFlag.reason } : {}),
    });
    plannedRebootConsumed = true;
  }

  // Optional collectors, run every cycle. Each is best-effort: on error the
  // corresponding snapshot field is simply left unset.
  try { snapshot.zfs = await collectZfs() ?? undefined; } catch { /* skip if ZFS not available */ }
  try { snapshot.io_errors = await collectIoErrors() ?? undefined; } catch { /* skip on error */ }
  try { snapshot.io_latency = collectIoLatency(); } catch { /* skip on error */ }
  try { snapshot.conntrack = collectConntrack(); } catch { /* skip on error */ }
  try { snapshot.systemd = await collectSystemd(); } catch { /* skip on error */ }
  try { snapshot.ntp = await collectNtp(); } catch { /* skip on error */ }
  try { snapshot.file_descriptors = collectFileDescriptors(); } catch { /* skip on error */ }

  // Update Prometheus metrics
  updateMetrics(snapshot);

  // Evaluate alerts
  const alertResults = evaluateAlerts(snapshot, config.thresholds);
  const { newAlerts, resolvedAlerts } = updateAlertState(alertResults);

  const elapsed = Date.now() - startTime;
  console.log(`[collector] Collected in ${elapsed}ms. Alerts: ${alertResults.length} active, ${newAlerts.length} new, ${resolvedAlerts.length} resolved`);

  // Send notifications for new/resolved alerts. Each channel is isolated so
  // a failing channel cannot block the others or the Forge push below.
  if (newAlerts.length > 0 || resolvedAlerts.length > 0) {
    const deliver = async (channel: string, send: () => unknown): Promise<void> => {
      try {
        await send();
      } catch (err) {
        console.error(`[notify] ${channel} delivery failed:`, err);
      }
    };

    const { telegram, slack, email } = config.channels;
    if (telegram.enabled && telegram.bot_token && telegram.chat_id) {
      await deliver("telegram", () =>
        sendTelegram(telegram.bot_token, telegram.chat_id, newAlerts, resolvedAlerts, config.server_name));
    }
    if (slack.enabled && slack.webhook_url) {
      await deliver("slack", () =>
        sendSlack(slack.webhook_url, newAlerts, resolvedAlerts, config.server_name));
    }
    if (email.enabled && email.to) {
      await deliver("email", () =>
        sendEmail(email, newAlerts, resolvedAlerts, config.server_name));
    }
  }

  // Push to Forge (non-blocking)
  if (config.forge.enabled && config.forge.api_key) {
    pushToForge(config.forge.url, config.forge.api_key, snapshot);
  }

  // Check for updates (every 6 hours, non-blocking)
  checkForUpdates(config.forge.enabled ? config.forge.url : undefined);

  // Print summary on first run
  if (firstRun) {
    firstRun = false;
    console.log("");
    console.log("=== First collection complete ===");
    console.log(`Server: ${system.hostname} (${system.os})`);
    console.log(`CPU: ${cpu.user_percent.toFixed(1)}% (load: ${cpu.load_1m})`);
    const ramPct = memory.total_mb > 0 ? ((memory.used_mb / memory.total_mb) * 100).toFixed(1) : "0";
    console.log(`RAM: ${ramPct}% (${memory.used_mb} / ${memory.total_mb} MB)`);
    if (disks.length > 0) console.log(`Disk: ${disks[0].percent_used}% (${disks[0].mount})`);
    console.log(`SMART: ${smart.length > 0 ? `${smart.length} drive(s) checked` : "not available"}`);
    console.log(`Network: ${network.map((n) => n.interface).join(", ") || "none detected"}`);
    console.log(`IPMI: ${ipmi.available ? "available" : "not available"}`);
    console.log(`Active alerts: ${alertResults.length}`);
    console.log(`Forge: ${config.forge.enabled ? "enabled" : "disabled"}`);
    console.log("");
  }
}
216
-
217
// True until the first collection cycle has printed its summary.
let firstRun = true;

// Guards against overlapping cycles when one run outlasts the interval.
let cycleInProgress = false;

/**
 * Run collect() once. The rejection is handled here, because an unhandled
 * rejection would otherwise terminate the agent; a cycle is skipped while
 * the previous one is still running.
 */
async function runCycle(): Promise<void> {
  if (cycleInProgress) {
    console.error("[collector] Previous collection still running, skipping this cycle");
    return;
  }
  cycleInProgress = true;
  try {
    await collect();
  } catch (err) {
    console.error("[collector] Collection failed:", err);
  } finally {
    cycleInProgress = false;
  }
}

// Run immediately
void runCycle();

// Then on interval
setInterval(() => {
  void runCycle();
}, config.collection.interval_seconds * 1000);

// Exit cleanly on termination signals.
for (const signal of ["SIGTERM", "SIGINT"] as const) {
  process.on(signal, () => {
    console.log(`[collector] Received ${signal}, shutting down`);
    process.exit(0);
  });
}
@@ -1,28 +0,0 @@
1
- import { describe, it, expect } from "vitest";
2
- import { parseKeyValue, parseKb } from "../parse.js";
3
-
4
// Table-driven: [test title, input, expected record].
const keyValueCases: Array<[string, string, Record<string, string>]> = [
  ["parses colon-delimited key/value lines", "Name: foo\nVersion: 1.2.3\n", { Name: "foo", Version: "1.2.3" }],
  ["ignores lines with no colon", "no colon here\nA: 1\n", { A: "1" }],
  ["trims whitespace around keys and values", "  A  :  1  \n", { A: "1" }],
];

describe("parseKeyValue", () => {
  for (const [title, input, expected] of keyValueCases) {
    it(title, () => {
      expect(parseKeyValue(input)).toEqual(expected);
    });
  }
});

// Table-driven: [test title, list of (input, expected) pairs].
const kbCases: Array<[string, Array<[string | undefined, number]>]> = [
  ["parses a numeric kB value", [["16384 kB", 16384]]],
  ["parses without unit", [["4096", 4096]]],
  ["returns 0 for undefined/bad input", [[undefined, 0], ["not a number", 0]]],
];

describe("parseKb", () => {
  for (const [title, samples] of kbCases) {
    it(title, () => {
      for (const [input, expected] of samples) {
        expect(parseKb(input)).toBe(expected);
      }
    });
  }
});
package/src/lib/exec.ts DELETED
@@ -1,16 +0,0 @@
1
- import { execFile } from "child_process";
2
- import { promisify } from "util";
3
-
4
const execFileAsync = promisify(execFile);

/**
 * Execute a command without a shell and resolve with its stdout.
 *
 * Never rejects for command failures. Resolves to null when the binary is
 * missing (ENOENT), the process was killed (e.g. on timeout), or it failed
 * without writing to stdout. A failing command that did write to stdout
 * resolves to that output.
 *
 * @param cmd       Executable name or path.
 * @param args      Argument vector.
 * @param timeoutMs Time limit passed to execFile (default 10000 ms).
 */
export async function run(cmd: string, args: string[], timeoutMs = 10000): Promise<string | null> {
  try {
    const result = await execFileAsync(cmd, args, { timeout: timeoutMs });
    return result.stdout;
  } catch (err: any) {
    const notInstalled = err.code === "ENOENT";
    const wasKilled = Boolean(err.killed);
    if (notInstalled || wasKilled) return null;
    // Non-zero exit: hand back whatever was printed, if anything.
    return err.stdout ? err.stdout : null;
  }
}
package/src/lib/parse.ts DELETED
@@ -1,29 +0,0 @@
1
- import { readFileSync } from "fs";
2
-
3
/**
 * Read a text file as UTF-8.
 *
 * @param path File to read.
 * @returns The file contents, or null when the read fails for any reason.
 */
export function readProcFile(path: string): string | null {
  let contents: string | null = null;
  try {
    contents = readFileSync(path, { encoding: "utf-8" });
  } catch {
    // Unreadable or missing file: fall through with null.
  }
  return contents;
}
10
-
11
/**
 * Parse `key: value` lines into a record.
 *
 * Each line is split at its first colon; key and value are trimmed. Lines
 * without a colon are ignored. When a key repeats, the last value wins.
 */
export function parseKeyValue(raw: string): Record<string, string> {
  const record: Record<string, string> = {};
  raw.split("\n").forEach((row) => {
    const colonAt = row.indexOf(":");
    if (colonAt < 0) return;
    const key = row.substring(0, colonAt).trim();
    const value = row.substring(colonAt + 1).trim();
    record[key] = value;
  });
  return record;
}
20
-
21
/**
 * Parse a value such as "16384 kB" into its integer part.
 *
 * @param val Text with an optional trailing "kB" unit (case-insensitive).
 * @returns The parsed integer, or 0 when `val` is empty/undefined or does
 *          not start with a number.
 */
export function parseKb(val: string | undefined): number {
  if (!val) return 0;
  const withoutUnit = val.replace(/\s*kB$/i, "");
  const parsed = Number.parseInt(withoutUnit, 10);
  return Number.isNaN(parsed) ? 0 : parsed;
}
26
-
27
/** Resolve after `ms` milliseconds. */
export function sleep(ms: number): Promise<void> {
  return new Promise<void>((resolve) => {
    setTimeout(resolve, ms);
  });
}
@@ -1,88 +0,0 @@
1
- // Planned-reboot marker handling.
2
- //
3
- // An operator signals "the next reboot is expected, don't page me"
4
- // by writing a short-lived JSON file to disk BEFORE rebooting. The
5
- // collector reads and deletes it on agent startup; the first
6
- // post-boot snapshot then carries `expected_reboot: true` so Forge's
7
- // unexpected_reboot rule stays quiet.
8
- //
9
- // Single-use (deleted on read regardless of validity) and TTL-guarded
10
- // (default 10 min) so a forgotten marker cannot silence a genuine
11
- // crash reboot weeks later.
12
-
13
- import { existsSync, readFileSync, unlinkSync, writeFileSync, mkdirSync, chmodSync } from "node:fs";
14
- import { dirname } from "node:path";
15
-
16
/** Default on-disk location of the planned-reboot marker. */
export const DEFAULT_MARKER_PATH = "/var/lib/crucible/reboot-expected";

/** Default marker lifetime: 10 minutes, in milliseconds. */
export const DEFAULT_TTL_MS = 600_000;

/** Result of consuming a valid, unexpired marker. */
export interface PlannedReboot {
  expected: true;
  /** Free-form reason recorded when the marker was written. */
  reason?: string;
}

/** JSON shape of the marker file. */
export interface RebootMarker {
  /** ISO timestamp after which the marker is considered stale. */
  expires_at: string;
  reason?: string;
}
28
-
29
/**
 * Read and delete the marker at `path`.
 *
 * The file is unlinked in every branch where it existed, so a malformed or
 * stale marker is one-shot and cannot linger.
 *
 * @param path Marker location (defaults to DEFAULT_MARKER_PATH).
 * @param now  Reference time used for the expiry check.
 * @returns `{ expected: true }` when the file existed, was valid JSON with a
 *          parseable `expires_at` in the future; otherwise null. `reason` is
 *          included only when the marker carries a non-empty string.
 */
export function consumeRebootMarker(
  path: string = DEFAULT_MARKER_PATH,
  now: Date = new Date(),
): PlannedReboot | null {
  if (!existsSync(path)) return null;

  let raw: string;
  try {
    raw = readFileSync(path, "utf-8");
  } catch {
    try { unlinkSync(path); } catch {}
    return null;
  }
  // Always delete after read, regardless of validity.
  try { unlinkSync(path); } catch {}

  let parsed: unknown;
  try { parsed = JSON.parse(raw); } catch { return null; }
  if (!parsed || typeof parsed !== "object") return null;

  // The file content is untrusted: check each field's type before use.
  const { expires_at, reason } = parsed as { expires_at?: unknown; reason?: unknown };
  if (typeof expires_at !== "string") return null;
  const expiresAt = new Date(expires_at);
  if (isNaN(expiresAt.getTime())) return null;
  if (expiresAt.getTime() <= now.getTime()) return null; // stale

  const result: PlannedReboot = { expected: true };
  if (typeof reason === "string" && reason !== "") result.reason = reason;
  return result;
}
53
-
54
/**
 * Write a planned-reboot marker file.
 *
 * The parent directory is created when missing (mode 0700, failures
 * ignored) and the file is written with mode 0600, then chmod-ed to 0600
 * again on a best-effort basis.
 *
 * @param opts.reason Optional reason stored in the marker (omitted if empty).
 * @param opts.ttlMs  Lifetime in milliseconds; defaults to DEFAULT_TTL_MS.
 * @param opts.path   Marker location; defaults to DEFAULT_MARKER_PATH.
 * @param opts.now    Time the TTL is counted from; defaults to the current time.
 * @returns The path written and the ISO expiry timestamp.
 * @throws Whatever writeFileSync throws when the file cannot be written.
 */
export function writeRebootMarker(opts: {
  reason?: string;
  ttlMs?: number;
  path?: string;
  now?: Date;
}): { path: string; expires_at: string } {
  const target = opts.path ?? DEFAULT_MARKER_PATH;
  const issuedAt = opts.now ?? new Date();
  const lifetimeMs = opts.ttlMs ?? DEFAULT_TTL_MS;

  const expires_at = new Date(issuedAt.getTime() + lifetimeMs).toISOString();
  const marker: RebootMarker = {
    expires_at,
    ...(opts.reason ? { reason: opts.reason } : {}),
  };

  try {
    mkdirSync(dirname(target), { recursive: true, mode: 0o700 });
  } catch {}
  writeFileSync(target, JSON.stringify(marker), { mode: 0o600 });
  try {
    chmodSync(target, 0o600);
  } catch {}

  return { path: target, expires_at };
}
77
-
78
/**
 * Parse a duration such as "10m", "2h", "600s" or "500ms" into
 * milliseconds. A bare number is interpreted as seconds. Used by the CLI
 * for the `--ttl` flag.
 *
 * @returns The duration in milliseconds, or null when the text is not a
 *          non-negative integer followed by an optional ms/s/m/h unit.
 */
export function parseDuration(s: string): number | null {
  const unitToMs: Record<string, number> = { ms: 1, s: 1000, m: 60_000, h: 3_600_000 };

  const parts = /^(\d+)\s*(ms|s|m|h)?$/.exec(s.trim());
  if (parts === null) return null;

  const amount = parseInt(parts[1], 10);
  if (!Number.isFinite(amount) || amount < 0) return null;

  return amount * unitToMs[parts[2] ?? "s"];
}