@glassmkr/crucible 0.7.1 → 0.8.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (103) hide show
  1. package/dist/alerts/__tests__/rules.test.d.ts +1 -0
  2. package/dist/alerts/__tests__/rules.test.js +325 -0
  3. package/dist/alerts/__tests__/rules.test.js.map +1 -0
  4. package/dist/alerts/rules.d.ts +8 -0
  5. package/dist/alerts/rules.js +139 -32
  6. package/dist/alerts/rules.js.map +1 -1
  7. package/dist/api.d.ts +2 -0
  8. package/dist/api.js +7 -0
  9. package/dist/api.js.map +1 -0
  10. package/dist/collect/__tests__/dmi.test.d.ts +1 -0
  11. package/dist/collect/__tests__/dmi.test.js +114 -0
  12. package/dist/collect/__tests__/dmi.test.js.map +1 -0
  13. package/dist/collect/__tests__/ipmi.test.js +47 -1
  14. package/dist/collect/__tests__/ipmi.test.js.map +1 -1
  15. package/dist/collect/__tests__/thermal.test.d.ts +1 -0
  16. package/dist/collect/__tests__/thermal.test.js +164 -0
  17. package/dist/collect/__tests__/thermal.test.js.map +1 -0
  18. package/dist/collect/dmi.d.ts +19 -0
  19. package/dist/collect/dmi.js +109 -0
  20. package/dist/collect/dmi.js.map +1 -0
  21. package/dist/collect/ipmi.d.ts +27 -2
  22. package/dist/collect/ipmi.js +90 -2
  23. package/dist/collect/ipmi.js.map +1 -1
  24. package/dist/collect/thermal.d.ts +10 -0
  25. package/dist/collect/thermal.js +187 -0
  26. package/dist/collect/thermal.js.map +1 -0
  27. package/dist/config.d.ts +10 -0
  28. package/dist/config.js +2 -0
  29. package/dist/config.js.map +1 -1
  30. package/dist/index.js +51 -1
  31. package/dist/index.js.map +1 -1
  32. package/dist/lib/__tests__/capability.test.d.ts +1 -0
  33. package/dist/lib/__tests__/capability.test.js +87 -0
  34. package/dist/lib/__tests__/capability.test.js.map +1 -0
  35. package/dist/lib/__tests__/vendor-sensors.test.d.ts +1 -0
  36. package/dist/lib/__tests__/vendor-sensors.test.js +49 -0
  37. package/dist/lib/__tests__/vendor-sensors.test.js.map +1 -0
  38. package/dist/lib/capability.d.ts +21 -0
  39. package/dist/lib/capability.js +110 -0
  40. package/dist/lib/capability.js.map +1 -0
  41. package/dist/lib/cpu-thermal-chips.d.ts +2 -0
  42. package/dist/lib/cpu-thermal-chips.js +28 -0
  43. package/dist/lib/cpu-thermal-chips.js.map +1 -0
  44. package/dist/lib/types.d.ts +58 -0
  45. package/dist/lib/vendor-sensors.d.ts +27 -0
  46. package/dist/lib/vendor-sensors.js +63 -0
  47. package/dist/lib/vendor-sensors.js.map +1 -0
  48. package/dist/notify/telegram.js +1 -1
  49. package/dist/notify/telegram.js.map +1 -1
  50. package/package.json +16 -1
  51. package/rule-ids.json +29 -0
  52. package/.dockerignore +0 -13
  53. package/.github/ISSUE_TEMPLATE/bug_report.md +0 -24
  54. package/.github/ISSUE_TEMPLATE/no_data.md +0 -26
  55. package/.github/workflows/docker.yml +0 -53
  56. package/.github/workflows/publish.yml +0 -25
  57. package/Dockerfile +0 -59
  58. package/config/collector.example.yaml +0 -43
  59. package/docker-compose.yml +0 -26
  60. package/scripts/sign-release.sh +0 -29
  61. package/src/__tests__/cli.test.ts +0 -74
  62. package/src/__tests__/reboot-marker.test.ts +0 -122
  63. package/src/alerts/evaluator.ts +0 -15
  64. package/src/alerts/rules.ts +0 -283
  65. package/src/alerts/state.ts +0 -92
  66. package/src/cli.ts +0 -112
  67. package/src/collect/__tests__/ipmi.test.ts +0 -96
  68. package/src/collect/__tests__/smart.test.ts +0 -68
  69. package/src/collect/__tests__/system.test.ts +0 -29
  70. package/src/collect/__tests__/zfs.test.ts +0 -72
  71. package/src/collect/conntrack.ts +0 -27
  72. package/src/collect/cpu.ts +0 -92
  73. package/src/collect/disks.ts +0 -91
  74. package/src/collect/fd.ts +0 -31
  75. package/src/collect/io-errors.ts +0 -23
  76. package/src/collect/io-latency.ts +0 -103
  77. package/src/collect/ipmi.ts +0 -207
  78. package/src/collect/memory.ts +0 -30
  79. package/src/collect/network.ts +0 -193
  80. package/src/collect/ntp.ts +0 -114
  81. package/src/collect/os-alerts.ts +0 -43
  82. package/src/collect/raid.ts +0 -40
  83. package/src/collect/security.ts +0 -268
  84. package/src/collect/smart.ts +0 -72
  85. package/src/collect/system.ts +0 -32
  86. package/src/collect/systemd.ts +0 -33
  87. package/src/collect/zfs.ts +0 -66
  88. package/src/config.ts +0 -65
  89. package/src/index.ts +0 -221
  90. package/src/lib/__tests__/parse.test.ts +0 -28
  91. package/src/lib/exec.ts +0 -16
  92. package/src/lib/parse.ts +0 -29
  93. package/src/lib/reboot-marker.ts +0 -88
  94. package/src/lib/types.ts +0 -226
  95. package/src/lib/version-check.ts +0 -39
  96. package/src/lib/version.ts +0 -33
  97. package/src/metrics-server.ts +0 -123
  98. package/src/notify/email.ts +0 -69
  99. package/src/notify/slack.ts +0 -47
  100. package/src/notify/telegram.ts +0 -65
  101. package/src/push/forge.ts +0 -109
  102. package/tsconfig.json +0 -15
  103. package/vitest.config.ts +0 -12
@@ -1,72 +0,0 @@
1
- import { run } from "../lib/exec.js";
2
- import { readdirSync } from "fs";
3
- import type { SmartInfo } from "../lib/types.js";
4
-
5
/**
 * Collect SMART data for the block devices listed under `/sys/block`.
 *
 * Entries whose names start with `sd`, `nvme` or `hd` are queried with
 * `smartctl --json --all`. The probes are independent, so they run
 * concurrently: one unresponsive drive no longer delays every other drive
 * by a full `run` timeout.
 *
 * @returns One `SmartInfo` per device that produced parseable JSON, in
 *   directory-listing order. Returns an empty array when `/sys/block`
 *   cannot be read.
 */
export async function collectSmart(): Promise<SmartInfo[]> {
  const prefixes = ["sd", "nvme", "hd"];

  let devices: string[];
  try {
    devices = readdirSync("/sys/block")
      .filter((entry) => prefixes.some((prefix) => entry.startsWith(prefix)))
      .map((entry) => `/dev/${entry}`);
  } catch {
    // No /sys/block (non-Linux host, restricted container): nothing to probe.
    return [];
  }

  const probed = await Promise.all(
    devices.map(async (device): Promise<SmartInfo | null> => {
      const output = await run("smartctl", ["--json", "--all", device]);
      if (!output) return null;
      try {
        return parseSmartctlJson(JSON.parse(output), device);
      } catch {
        // Unparseable smartctl output: skip this device, keep the rest.
        return null;
      }
    }),
  );

  return probed.filter((info): info is SmartInfo => info !== null);
}
34
-
35
/**
 * Convert one `smartctl --json` document into a `SmartInfo` record.
 *
 * @param data Parsed smartctl JSON; only the fields named in the type are read.
 * @param device Device path the document was collected from.
 * @returns The normalized record. `health` is "PASSED" only when
 *   `smart_status.passed` is truthy; a document without `smart_status`
 *   is reported as "FAILED".
 */
export function parseSmartctlJson(data: Record<string, unknown> & {
  model_name?: string;
  model_family?: string;
  smart_status?: { passed?: boolean };
  temperature?: { current?: number };
  power_on_time?: { hours?: number };
  nvme_smart_health_information_log?: { percentage_used?: number; temperature?: number };
  ata_smart_attributes?: { table?: Array<{ id?: number; name?: string; raw?: { value?: number } }> };
}, device: string): SmartInfo {
  const info: SmartInfo = {
    device,
    model: data.model_name || data.model_family || "unknown",
    health: data.smart_status?.passed ? "PASSED" : "FAILED",
    temperature_c: data.temperature?.current,
    power_on_hours: data.power_on_time?.hours,
  };

  // NVMe specific
  const nvme = data.nvme_smart_health_information_log;
  if (nvme) {
    info.percentage_used = nvme.percentage_used;
    // Prefer the NVMe log reading, but keep the generic temperature when the
    // log has none; previously a valid reading was replaced by `undefined`.
    info.temperature_c = nvme.temperature ?? info.temperature_c;
  }

  // SATA specific: attributes are matched by id or by name.
  for (const attr of data.ata_smart_attributes?.table ?? []) {
    if (attr.id === 5 || attr.name === "Reallocated_Sector_Ct") {
      info.reallocated_sectors = attr.raw?.value ?? 0;
    }
    if (attr.id === 197 || attr.name === "Current_Pending_Sector") {
      info.pending_sectors = attr.raw?.value ?? 0;
    }
  }

  return info;
}
@@ -1,32 +0,0 @@
1
- import { hostname } from "os";
2
- import { readProcFile } from "../lib/parse.js";
3
- import { run } from "../lib/exec.js";
4
- import type { SystemInfo } from "../lib/types.js";
5
-
6
/**
 * Read one field from the text of an os-release file.
 *
 * Matches `KEY=value` at the start of a line, with the value optionally
 * wrapped in matching double or single quotes (`ID=ubuntu`, `ID="rocky"`,
 * `ID='debian'`). Trailing spaces or tabs after the value are ignored.
 *
 * @param osRelease Full text of the os-release file.
 * @param key Field name; it is matched literally, so regex metacharacters
 *   in it have no special meaning.
 * @returns The lower-cased value, or `undefined` when the field is absent.
 */
export function readOsReleaseField(osRelease: string, key: string): string | undefined {
  // Escape the key so callers cannot accidentally inject a pattern.
  const escapedKey = key.replace(/[.*+?^${}()|[\]\\]/g, "\\$&");
  const m = osRelease.match(new RegExp(`^${escapedKey}=(["']?)(.+?)\\1[ \\t]*$`, "m"));
  return m?.[2]?.toLowerCase();
}
12
-
13
/**
 * Collect basic host identity: hostname, first reported IP address, OS name
 * and ids, kernel release and uptime.
 *
 * `uname -r` and `hostname -I` are independent, so they are started
 * together. A missing or malformed `/proc/uptime` yields `0` rather than
 * `NaN`.
 *
 * @returns The populated `SystemInfo`; `os_id` / `os_id_like` are only
 *   present when the os-release file provides them.
 */
export async function collectSystem(): Promise<SystemInfo> {
  const osRelease = readProcFile("/etc/os-release") || "";
  const osName = osRelease.match(/PRETTY_NAME="(.+?)"/)?.[1] || "Unknown";
  const os_id = readOsReleaseField(osRelease, "ID");
  const os_id_like = readOsReleaseField(osRelease, "ID_LIKE");

  const [kernelOut, addressesOut] = await Promise.all([
    run("uname", ["-r"]),
    run("hostname", ["-I"]),
  ]);
  const kernel = kernelOut?.trim() || "unknown";
  // `hostname -I` prints a space-separated list; the first entry is used.
  const ip = addressesOut?.trim().split(" ")[0] || "unknown";

  const uptimeRaw = readProcFile("/proc/uptime") || "0";
  const uptimeParsed = Number.parseFloat(uptimeRaw.split(" ")[0] ?? "");
  const uptimeSeconds = Number.isFinite(uptimeParsed) ? Math.floor(uptimeParsed) : 0;

  return {
    hostname: hostname(),
    ip,
    os: osName,
    ...(os_id ? { os_id } : {}),
    ...(os_id_like ? { os_id_like } : {}),
    kernel,
    uptime_seconds: uptimeSeconds,
  };
}
@@ -1,33 +0,0 @@
1
- import { run } from "../lib/exec.js";
2
-
3
/** Failed-service summary produced by `collectSystemd`. */
export interface SystemdData {
  /** Failed `.service` unit names that remain after exclusions. */
  failed_units: string[];
  /** Number of entries in `failed_units`. */
  failed_count: number;
}

// Units that are commonly in a failed state by design or misconfiguration.
const DEFAULT_EXCLUDES = [
  "systemd-networkd-wait-online.service",
];

/**
 * List failed systemd service units via `systemctl list-units`.
 *
 * @param extraExcludes Unit names to ignore in addition to `DEFAULT_EXCLUDES`.
 * @returns The failed units and their count; both are empty/zero when
 *   systemctl produced no output.
 */
export async function collectSystemd(extraExcludes: string[] = []): Promise<SystemdData> {
  const listing = await run("systemctl", [
    "list-units", "--type=service", "--state=failed", "--no-legend", "--plain",
  ]);

  const body = listing ? listing.trim() : "";
  if (body === "") {
    return { failed_units: [], failed_count: 0 };
  }

  const ignored = new Set([...DEFAULT_EXCLUDES, ...extraExcludes]);

  // The unit name is the first whitespace-separated column of each row.
  const failed = body
    .split("\n")
    .map((row) => row.trim().split(/\s+/)[0])
    .filter((name): name is string => Boolean(name) && name!.endsWith(".service") && !ignored.has(name!));

  return { failed_units: failed, failed_count: failed.length };
}
@@ -1,66 +0,0 @@
1
- import { run } from "../lib/exec.js";
2
- import type { ZfsData, ZfsPool } from "../lib/types.js";
3
-
4
/**
 * Collect ZFS pool status when the `zpool` tool is available.
 *
 * @returns The parsed pools, or `null` when `zpool` cannot be located,
 *   `zpool status` prints nothing, or no pool is found in its output.
 */
export async function collectZfs(): Promise<ZfsData | null> {
  // Look for the binary first (3 s budget) before running the status query.
  const located = await run("which", ["zpool"], 3000);
  if (!located?.trim()) return null;

  const statusText = await run("zpool", ["status"], 10000);
  if (!statusText?.trim()) return null;

  const pools = parseZpoolStatus(statusText);
  return pools.length > 0 ? { pools } : null;
}
16
-
17
/**
 * Parse the text output of `zpool status` into one record per pool.
 *
 * A `pool:` line starts a new record; `state:`, `errors:` and `scan:` lines
 * that follow are attached to it. Lines seen before the first `pool:` line
 * are ignored.
 *
 * Scrub details are taken only from `scan:` lines that mention a scrub, so
 * a resilver completion date is not reported as a scrub date. The date is
 * anchored on the standalone word "on", so "Mon" in an in-progress line
 * ("... since Mon Jul 25 ...") is not mistaken for it.
 *
 * @param zpoolStatus Raw `zpool status` output.
 * @returns Pools in the order they appear; empty when none are found.
 */
export function parseZpoolStatus(zpoolStatus: string): ZfsPool[] {
  const pools: ZfsPool[] = [];
  let current: ZfsPool | null = null;

  for (const line of zpoolStatus.split("\n")) {
    const poolMatch = line.match(/^\s*pool:\s*(.+)/);
    if (poolMatch) {
      current = {
        name: poolMatch[1].trim(),
        state: "UNKNOWN",
        errors_text: "",
      };
      pools.push(current);
      continue;
    }

    if (!current) continue;

    const stateMatch = line.match(/^\s*state:\s*(.+)/);
    if (stateMatch) {
      current.state = stateMatch[1].trim();
      continue;
    }

    const errorsMatch = line.match(/^\s*errors:\s*(.+)/);
    if (errorsMatch) {
      current.errors_text = errorsMatch[1].trim();
      continue;
    }

    // Parse scrub info
    if (!line.includes("scan:")) continue;

    if (line.includes("none requested")) {
      current.scrub_never_run = true;
      continue;
    }

    // Resilver and other non-scrub scan lines carry no scrub information.
    if (!/\bscrub\b/.test(line)) continue;

    const repairMatch = line.match(/scrub repaired (\S+) in .* with (\d+) errors/);
    if (repairMatch) {
      current.scrub_repaired = repairMatch[1];
      current.scrub_errors = Number.parseInt(repairMatch[2], 10) || 0;
    }

    const dateMatch = line.match(/\bon (.+)$/);
    if (dateMatch) {
      current.last_scrub_date = dateMatch[1].trim();
    }
  }

  return pools;
}
package/src/config.ts DELETED
@@ -1,65 +0,0 @@
1
- import { readFileSync } from "fs";
2
- import { parse } from "yaml";
3
- import { z } from "zod";
4
-
5
// Collection cadence (60-3600 s, default 300) and optional hardware probes.
const CollectionSchema = z.object({
  interval_seconds: z.number().min(60).max(3600).default(300),
  ipmi: z.boolean().default(true),
  smart: z.boolean().default(true),
}).default({});

// Forge upload settings; disabled by default.
const ForgeSchema = z.object({
  enabled: z.boolean().default(false),
  url: z.string().default("https://forge.glassmkr.com"),
  api_key: z.string().default(""),
  tls_pin: z.string().default(""),
}).default({});

// Alert thresholds handed to the alert evaluator.
const ThresholdsSchema = z.object({
  ram_percent: z.number().default(90),
  swap_alert: z.boolean().default(true),
  disk_percent: z.number().default(85),
  iowait_percent: z.number().default(20),
  nvme_wear_percent: z.number().default(85),
  disk_latency_nvme_ms: z.number().default(50),
  disk_latency_hdd_ms: z.number().default(200),
  cpu_temp_warning_c: z.number().default(80),
  cpu_temp_critical_c: z.number().default(90),
  interface_utilization_percent: z.number().default(90),
}).default({});

// Notification channels; every channel is off unless enabled explicitly.
const TelegramSchema = z.object({
  enabled: z.boolean().default(false),
  bot_token: z.string().default(""),
  chat_id: z.string().default(""),
}).default({});

const EmailSchema = z.object({
  enabled: z.boolean().default(false),
  to: z.string().default(""),
}).default({});

const SlackSchema = z.object({
  enabled: z.boolean().default(false),
  webhook_url: z.string().default(""),
}).default({});

const ChannelsSchema = z.object({
  telegram: TelegramSchema,
  email: EmailSchema,
  slack: SlackSchema,
}).default({});

// Prometheus metrics endpoint; disabled by default, port 9101.
const PrometheusSchema = z.object({
  enabled: z.boolean().default(false),
  port: z.number().default(9101),
}).default({});

// Root schema: every section is optional and falls back to its defaults.
const ConfigSchema = z.object({
  server_name: z.string().default("unnamed-server"),
  collection: CollectionSchema,
  forge: ForgeSchema,
  thresholds: ThresholdsSchema,
  channels: ChannelsSchema,
  prometheus: PrometheusSchema,
});
50
-
51
/** Fully-defaulted configuration as produced by `ConfigSchema`. */
export type Config = z.infer<typeof ConfigSchema>;

/**
 * Load and validate the YAML configuration at `path`.
 *
 * A missing file is not an error: a notice is logged and the schema
 * defaults are returned. A file that exists but is empty (or contains only
 * comments) parses to `null`; it is treated as an empty mapping, so it also
 * yields the defaults instead of a validation error.
 *
 * @param path Location of the YAML config file.
 * @returns The validated configuration with defaults applied.
 * @throws Any read, YAML or schema-validation error other than `ENOENT`.
 */
export function loadConfig(path: string): Config {
  try {
    const raw = readFileSync(path, "utf-8");
    const parsed: unknown = parse(raw);
    return ConfigSchema.parse(parsed ?? {});
  } catch (err: unknown) {
    const code = typeof err === "object" && err !== null ? (err as { code?: unknown }).code : undefined;
    if (code === "ENOENT") {
      console.log(`[config] No config file at ${path}, using defaults`);
      return ConfigSchema.parse({});
    }
    throw err;
  }
}
package/src/index.ts DELETED
@@ -1,221 +0,0 @@
1
- #!/usr/bin/env node
2
-
3
- import { parseCliArgs } from "./cli.js";
4
- import { CRUCIBLE_VERSION as PKG_VERSION } from "./lib/version.js";
5
-
6
// Fast-path CLI modes. --version, --help and the planned-reboot
// subcommands are served before any collector is imported, before the
// config is loaded and before the Prometheus server starts, so the CLI
// still answers on hosts without a config file or external tools.
const { result: cliArgs, output: cliOutput } = parseCliArgs(process.argv.slice(2), PKG_VERSION);

if (cliArgs.mode === "help" || cliArgs.mode === "version") {
  console.log(cliOutput);
  process.exit(0);
}

if (cliArgs.mode === "reboot" || cliArgs.mode === "mark-reboot") {
  const markerLib = await import("./lib/reboot-marker.js");

  // No --ttl given: use the library default. An unparseable --ttl is fatal.
  const ttlMs = cliArgs.ttl ? markerLib.parseDuration(cliArgs.ttl) : markerLib.DEFAULT_TTL_MS;
  if (ttlMs === null) {
    console.error(`[mark-reboot] invalid --ttl value: ${cliArgs.ttl}. Use e.g. 10m, 2h, 600s.`);
    process.exit(2);
  }

  try {
    const written = markerLib.writeRebootMarker({
      reason: cliArgs.reason, ttlMs,
    });
    const reasonNote = cliArgs.reason ? `, reason: ${cliArgs.reason}` : "";
    console.log(`[${cliArgs.mode}] marker written: ${written.path} (expires ${written.expires_at}${reasonNote})`);
  } catch (err: any) {
    console.error(`[${cliArgs.mode}] failed to write marker: ${err?.message || err}`);
    console.error(` Most likely cause: need root privileges to write under /var/lib/crucible/.`);
    process.exit(1);
  }

  // `reboot` additionally triggers the reboot once the marker is in place.
  if (cliArgs.mode === "reboot") {
    const childProcess = await import("node:child_process");
    console.log("[reboot] invoking systemctl reboot");
    try {
      childProcess.execFileSync("systemctl", ["reboot"], { stdio: "inherit" });
    } catch (err: any) {
      console.error(`[reboot] systemctl reboot failed: ${err?.message || err}`);
      process.exit(1);
    }
  }

  process.exit(0);
}
44
-
45
- import { loadConfig } from "./config.js";
46
- import { checkForUpdates } from "./lib/version-check.js";
47
- import { startMetricsServer, updateMetrics } from "./metrics-server.js";
48
- import { collectSystem } from "./collect/system.js";
49
- import { collectCpu } from "./collect/cpu.js";
50
- import { collectMemory } from "./collect/memory.js";
51
- import { collectDisks } from "./collect/disks.js";
52
- import { collectSmart } from "./collect/smart.js";
53
- import { collectNetwork } from "./collect/network.js";
54
- import { collectRaid } from "./collect/raid.js";
55
- import { collectIpmi } from "./collect/ipmi.js";
56
- import { collectOsAlerts } from "./collect/os-alerts.js";
57
- import { evaluateAlerts } from "./alerts/evaluator.js";
58
- import { updateAlertState } from "./alerts/state.js";
59
- import { sendTelegram } from "./notify/telegram.js";
60
- import { sendSlack } from "./notify/slack.js";
61
- import { sendEmail } from "./notify/email.js";
62
- import { pushToForge, initForgeAgent } from "./push/forge.js";
63
- import { collectSecurity, type SecurityData } from "./collect/security.js";
64
- import { collectZfs } from "./collect/zfs.js";
65
- import { collectIoErrors } from "./collect/io-errors.js";
66
- import { collectIoLatency } from "./collect/io-latency.js";
67
- import { collectConntrack } from "./collect/conntrack.js";
68
- import { collectSystemd } from "./collect/systemd.js";
69
- import { collectNtp } from "./collect/ntp.js";
70
- import { collectFileDescriptors } from "./collect/fd.js";
71
- import type { Snapshot, IpmiInfo } from "./lib/types.js";
72
- import { consumeRebootMarker, type PlannedReboot } from "./lib/reboot-marker.js";
73
-
74
- // Consume the planned-reboot marker exactly once, at process startup. If the
75
- // operator ran `crucible-agent mark-reboot` / `reboot` before this boot, the
76
- // marker file exists; consuming it deletes the file, and only the first
77
- // snapshot of this process is flagged as an expected reboot.
78
- const plannedRebootFlag: PlannedReboot | null = consumeRebootMarker();
79
- if (plannedRebootFlag) {
80
- console.log(`[collector] Planned reboot acknowledged${plannedRebootFlag.reason ? `: ${plannedRebootFlag.reason}` : ""}`);
81
- }
82
- let plannedRebootConsumed = false; // set to true once the flag has been attached to a snapshot
83
-
84
- const config = loadConfig(cliArgs.configPath); // a missing file falls back to defaults; other load errors throw
85
-
86
- console.log(`[collector] Starting. Server: ${config.server_name}. Interval: ${config.collection.interval_seconds}s`);
87
- console.log(`[collector] IPMI: ${config.collection.ipmi ? "enabled" : "disabled"}, SMART: ${config.collection.smart ? "enabled" : "disabled"}`);
88
- console.log(`[collector] Forge: ${config.forge.enabled ? config.forge.url : "disabled"}`);
89
- console.log(`[collector] Prometheus: ${config.prometheus.enabled ? `:${config.prometheus.port}/metrics` : "disabled"}`);
90
-
91
- // Start the Prometheus metrics server on the configured port, if enabled
92
- if (config.prometheus.enabled) {
93
- startMetricsServer(config.prometheus.port);
94
- }
95
-
96
- // Initialize TLS pinning for Forge when a pin is configured (checked independently of forge.enabled)
97
- if (config.forge.tls_pin) {
98
- initForgeAgent(config.forge.tls_pin);
99
- console.log("[collector] TLS pinning enabled for Forge");
100
- }
101
-
102
- const emptyIpmi: IpmiInfo = { available: false, sensors: [], ecc_errors: { correctable: 0, uncorrectable: 0 }, sel_entries_count: 0, sel_events_recent: [], fans: [] }; // placeholder used when IPMI collection is disabled
103
-
104
- // State for the periodically refreshed security checks: cycles since the last refresh, and the cached result reused in between
105
- let securityCycleCount = 0;
106
- let cachedSecurity: SecurityData | undefined;
107
-
108
/**
 * Run one collection cycle.
 *
 * Gathers every collector's output into a snapshot, updates the Prometheus
 * metrics, evaluates the alert rules, notifies the enabled channels about
 * new and resolved alerts, pushes the snapshot to Forge when enabled, and
 * triggers the update check. The first cycle also prints a readable summary.
 *
 * A failure in one notification channel is logged and does not stop the
 * remaining channels or the Forge push.
 */
async function collect() {
  const startTime = Date.now();
  console.log(`[collector] Collecting...`);

  const [system, cpu, memory, disks, smart, network, raid, ipmi, osAlerts] = await Promise.all([
    collectSystem(),
    collectCpu(),
    collectMemory(),
    collectDisks(),
    config.collection.smart ? collectSmart() : Promise.resolve([]),
    collectNetwork(),
    collectRaid(),
    config.collection.ipmi ? collectIpmi() : Promise.resolve(emptyIpmi),
    collectOsAlerts(),
  ]);

  // Security checks: refreshed about once per hour, cached in between. The
  // cycle count is derived from the configured interval (12 cycles at the
  // default 300 s), so a non-default interval keeps the hourly cadence.
  const securityRefreshCycles = Math.max(1, Math.round(3600 / config.collection.interval_seconds));
  securityCycleCount++;
  if (securityCycleCount >= securityRefreshCycles || !cachedSecurity) {
    securityCycleCount = 0;
    try { cachedSecurity = await collectSecurity(); } catch (err) { console.error("[security] Collection error:", err); }
  }

  const snapshot: Snapshot = {
    collector_version: PKG_VERSION,
    timestamp: new Date().toISOString(),
    system, cpu, memory, disks, smart, network, raid, ipmi, os_alerts: osAlerts,
    security: cachedSecurity,
  };

  // Single-shot: the very first snapshot after a marked reboot carries
  // the flag, subsequent snapshots do not.
  if (plannedRebootFlag && !plannedRebootConsumed) {
    (snapshot as any).expected_reboot = true;
    if (plannedRebootFlag.reason) (snapshot as any).expected_reboot_reason = plannedRebootFlag.reason;
    plannedRebootConsumed = true;
  }

  // Optional collectors: run every cycle, each one skipped on its own failure.
  try { snapshot.zfs = await collectZfs() ?? undefined; } catch { /* skip if ZFS not available */ }
  try { snapshot.io_errors = await collectIoErrors() ?? undefined; } catch { /* skip on error */ }
  try { snapshot.io_latency = collectIoLatency(); } catch { /* skip on error */ }
  try { snapshot.conntrack = collectConntrack(); } catch { /* skip on error */ }
  try { snapshot.systemd = await collectSystemd(); } catch { /* skip on error */ }
  try { snapshot.ntp = await collectNtp(); } catch { /* skip on error */ }
  try { snapshot.file_descriptors = collectFileDescriptors(); } catch { /* skip on error */ }

  // Update Prometheus metrics
  updateMetrics(snapshot);

  // Evaluate alerts
  const alertResults = evaluateAlerts(snapshot, config.thresholds);
  const { newAlerts, resolvedAlerts } = updateAlertState(alertResults);

  const elapsed = Date.now() - startTime;
  console.log(`[collector] Collected in ${elapsed}ms. Alerts: ${alertResults.length} active, ${newAlerts.length} new, ${resolvedAlerts.length} resolved`);

  // Send notifications for new/resolved alerts. Each channel is isolated so
  // one failing channel cannot block the others or the Forge push below.
  if (newAlerts.length > 0 || resolvedAlerts.length > 0) {
    const { telegram, slack, email } = config.channels;
    if (telegram.enabled && telegram.bot_token && telegram.chat_id) {
      try {
        await sendTelegram(telegram.bot_token, telegram.chat_id, newAlerts, resolvedAlerts, config.server_name);
      } catch (err) {
        console.error("[notify] Telegram delivery failed:", err);
      }
    }
    if (slack.enabled && slack.webhook_url) {
      try {
        await sendSlack(slack.webhook_url, newAlerts, resolvedAlerts, config.server_name);
      } catch (err) {
        console.error("[notify] Slack delivery failed:", err);
      }
    }
    if (email.enabled && email.to) {
      try {
        await sendEmail(email, newAlerts, resolvedAlerts, config.server_name);
      } catch (err) {
        console.error("[notify] Email delivery failed:", err);
      }
    }
  }

  // Push to Forge (non-blocking)
  if (config.forge.enabled && config.forge.api_key) {
    pushToForge(config.forge.url, config.forge.api_key, snapshot);
  }

  // Check for updates (every 6 hours, non-blocking)
  checkForUpdates(config.forge.enabled ? config.forge.url : undefined);

  // Print summary on first run
  if (firstRun) {
    firstRun = false;
    console.log("");
    console.log("=== First collection complete ===");
    console.log(`Server: ${system.hostname} (${system.os})`);
    console.log(`CPU: ${cpu.user_percent.toFixed(1)}% (load: ${cpu.load_1m})`);
    const ramPct = memory.total_mb > 0 ? ((memory.used_mb / memory.total_mb) * 100).toFixed(1) : "0";
    console.log(`RAM: ${ramPct}% (${memory.used_mb} / ${memory.total_mb} MB)`);
    if (disks.length > 0) console.log(`Disk: ${disks[0].percent_used}% (${disks[0].mount})`);
    console.log(`SMART: ${smart.length > 0 ? `${smart.length} drive(s) checked` : "not available"}`);
    console.log(`Network: ${network.map((n) => n.interface).join(", ") || "none detected"}`);
    console.log(`IPMI: ${ipmi.available ? "available" : "not available"}`);
    console.log(`Active alerts: ${alertResults.length}`);
    console.log(`Forge: ${config.forge.enabled ? "enabled" : "disabled"}`);
    console.log("");
  }
}
204
-
205
// True until the first collection cycle has printed its summary.
let firstRun = true;

/**
 * Start one collection cycle and log a failure instead of letting the
 * rejected promise escape. An unhandled rejection would otherwise take the
 * whole agent down on Node versions that throw on unhandled rejections.
 */
function runCollect(): void {
  collect().catch((err: unknown) => {
    console.error("[collector] Collection cycle failed:", err);
  });
}

// Run immediately
runCollect();

// Then on interval
setInterval(runCollect, config.collection.interval_seconds * 1000);

// Exit cleanly on termination signals.
for (const signal of ["SIGTERM", "SIGINT"] as const) {
  process.on(signal, () => {
    console.log(`[collector] Received ${signal}, shutting down`);
    process.exit(0);
  });
}
@@ -1,28 +0,0 @@
1
- import { describe, it, expect } from "vitest";
2
- import { parseKeyValue, parseKb } from "../parse.js";
3
-
4
// Behavior of parseKeyValue on well-formed, colon-less and padded lines.
describe("parseKeyValue", () => {
  it("parses colon-delimited key/value lines", () => {
    const input = "Name: foo\nVersion: 1.2.3\n";
    const expected = { Name: "foo", Version: "1.2.3" };
    expect(parseKeyValue(input)).toEqual(expected);
  });

  it("ignores lines with no colon", () => {
    const parsed = parseKeyValue("no colon here\nA: 1\n");
    expect(parsed).toEqual({ A: "1" });
  });

  it("trims whitespace around keys and values", () => {
    const parsed = parseKeyValue(" A : 1 \n");
    expect(parsed).toEqual({ A: "1" });
  });
});
16
-
17
// Behavior of parseKb with a unit, without a unit, and on unusable input.
describe("parseKb", () => {
  it("parses a numeric kB value", () => {
    const kb = parseKb("16384 kB");
    expect(kb).toBe(16384);
  });

  it("parses without unit", () => {
    const kb = parseKb("4096");
    expect(kb).toBe(4096);
  });

  it("returns 0 for undefined/bad input", () => {
    const fromUndefined = parseKb(undefined);
    const fromGarbage = parseKb("not a number");
    expect(fromUndefined).toBe(0);
    expect(fromGarbage).toBe(0);
  });
});
package/src/lib/exec.ts DELETED
@@ -1,16 +0,0 @@
1
- import { execFile } from "child_process";
2
- import { promisify } from "util";
3
-
4
const execFileAsync = promisify(execFile);

/**
 * Execute `cmd` with `args` via `execFile` and resolve with its stdout.
 *
 * The returned promise resolves and does not reject for command failures.
 *
 * @param cmd Executable to run.
 * @param args Argument vector passed to the executable.
 * @param timeoutMs Timeout handed to `execFile`; defaults to 10000 ms.
 * @returns stdout on success. On failure: `null` when the command is not
 *   installed (`ENOENT`) or the child was killed (timeout), otherwise the
 *   captured stdout if there is any, else `null`.
 */
export async function run(cmd: string, args: string[], timeoutMs = 10000): Promise<string | null> {
  try {
    const result = await execFileAsync(cmd, args, { timeout: timeoutMs });
    return result.stdout;
  } catch (err: any) {
    const notInstalled = err.code === "ENOENT";
    const wasKilled = Boolean(err.killed);
    if (notInstalled || wasKilled) return null;
    // Non-zero exit: captured output is still useful to callers.
    return err.stdout ? err.stdout : null;
  }
}
package/src/lib/parse.ts DELETED
@@ -1,29 +0,0 @@
1
- import { readFileSync } from "fs";
2
-
3
/**
 * Read a text file as UTF-8 without throwing.
 *
 * @param path File to read.
 * @returns The file contents, or `null` when the read fails for any reason.
 */
export function readProcFile(path: string): string | null {
  let contents: string | null = null;
  try {
    contents = readFileSync(path, "utf-8");
  } catch {
    // Unreadable or missing file: report it as null.
  }
  return contents;
}
10
-
11
/**
 * Parse `key: value` lines into a string map.
 *
 * Each line is split at its first colon; key and value are trimmed. Lines
 * without a colon are skipped, and a repeated key keeps its last value.
 *
 * @param raw Newline-separated text.
 * @returns The collected key/value pairs.
 */
export function parseKeyValue(raw: string): Record<string, string> {
  const pairs: Record<string, string> = {};
  raw.split("\n").forEach((row) => {
    const colonAt = row.indexOf(":");
    if (colonAt < 0) return;
    const key = row.substring(0, colonAt).trim();
    pairs[key] = row.substring(colonAt + 1).trim();
  });
  return pairs;
}
20
-
21
/**
 * Parse a value such as "16384 kB" into an integer.
 *
 * @param val Text with an optional trailing `kB` unit (case-insensitive).
 * @returns The parsed integer, or `0` when `val` is empty, undefined, or
 *   does not start with a number.
 */
export function parseKb(val: string | undefined): number {
  if (!val) return 0;
  const withoutUnit = val.replace(/\s*kB$/i, "");
  const parsed = Number.parseInt(withoutUnit, 10);
  return Number.isNaN(parsed) ? 0 : parsed;
}
26
-
27
/**
 * Resolve after a delay.
 *
 * @param ms Delay passed to `setTimeout`, in milliseconds.
 * @returns A promise that resolves once the timer fires.
 */
export function sleep(ms: number): Promise<void> {
  return new Promise<void>((resolve) => {
    setTimeout(resolve, ms);
  });
}
@@ -1,88 +0,0 @@
1
- // Planned-reboot marker handling.
2
- //
3
- // An operator signals "the next reboot is expected, don't page me"
4
- // by writing a short-lived JSON file to disk BEFORE rebooting. The
5
- // collector reads and deletes it on agent startup; the first
6
- // post-boot snapshot then carries `expected_reboot: true` so Forge's
7
- // unexpected_reboot rule stays quiet.
8
- //
9
- // Single-use (deleted on read regardless of validity) and TTL-guarded
10
- // (default 10 min) so a forgotten marker cannot silence a genuine
11
- // crash reboot weeks later.
12
-
13
- import { existsSync, readFileSync, unlinkSync, writeFileSync, mkdirSync, chmodSync } from "node:fs";
14
- import { dirname } from "node:path";
15
-
16
/** Default location of the planned-reboot marker file. */
export const DEFAULT_MARKER_PATH = "/var/lib/crucible/reboot-expected";
/** Default marker lifetime: 10 minutes, in milliseconds. */
export const DEFAULT_TTL_MS = 10 * 60 * 1000;

/** Result of consuming a valid, unexpired marker. */
export interface PlannedReboot {
  expected: true;
  reason?: string;
}

/** On-disk JSON shape of the marker file. */
export interface RebootMarker {
  expires_at: string; // ISO timestamp
  reason?: string;
}

/**
 * Read and delete the marker at `path`.
 *
 * The file is unlinked in every branch where it existed, so a malformed or
 * stale marker is one-shot and cannot linger. The parsed JSON is validated
 * field by field: `expires_at` must be a string holding a parseable date in
 * the future, and `reason` is passed through only when it is a string.
 *
 * @param path Marker location; defaults to `DEFAULT_MARKER_PATH`.
 * @param now Reference time for the expiry check; defaults to the current time.
 * @returns The planned-reboot flag, or `null` when the file is missing,
 *   unreadable, malformed, or expired.
 */
export function consumeRebootMarker(
  path: string = DEFAULT_MARKER_PATH,
  now: Date = new Date(),
): PlannedReboot | null {
  if (!existsSync(path)) return null;

  let raw: string | null = null;
  try { raw = readFileSync(path, "utf-8"); } catch { /* unreadable: still deleted below */ }
  // Always delete after the read attempt, regardless of validity.
  try { unlinkSync(path); } catch { /* best effort */ }
  if (raw === null) return null;

  let parsed: unknown;
  try { parsed = JSON.parse(raw); } catch { return null; }
  if (typeof parsed !== "object" || parsed === null) return null;

  const { expires_at, reason } = parsed as { expires_at?: unknown; reason?: unknown };
  if (typeof expires_at !== "string") return null;

  const expiresAtMs = new Date(expires_at).getTime();
  if (Number.isNaN(expiresAtMs)) return null;
  if (expiresAtMs <= now.getTime()) return null; // stale

  // The marker file may have been written by hand: never forward a non-string reason.
  return { expected: true, reason: typeof reason === "string" ? reason : undefined };
}
53
-
54
/**
 * Write a planned-reboot marker file. Used by the `mark-reboot` and
 * `reboot` CLI subcommands.
 *
 * The parent directory is created when missing (creation failures are
 * ignored; the write that follows reports the real problem). The file is
 * written with mode 600 and then chmod-ed to 600 again as a best effort.
 *
 * @param opts.reason Optional free-text reason stored in the marker.
 * @param opts.ttlMs Marker lifetime in milliseconds; defaults to `DEFAULT_TTL_MS`.
 * @param opts.path Marker location; defaults to `DEFAULT_MARKER_PATH`.
 * @param opts.now Start of the lifetime; defaults to the current time.
 * @returns The path written and the ISO expiry timestamp stored in it.
 */
export function writeRebootMarker(opts: {
  reason?: string;
  ttlMs?: number;
  path?: string;
  now?: Date;
}): { path: string; expires_at: string } {
  const target = opts.path ?? DEFAULT_MARKER_PATH;
  const issuedAt = opts.now ?? new Date();
  const lifetimeMs = opts.ttlMs ?? DEFAULT_TTL_MS;

  const expires_at = new Date(issuedAt.getTime() + lifetimeMs).toISOString();
  const marker: RebootMarker = opts.reason ? { expires_at, reason: opts.reason } : { expires_at };

  try {
    mkdirSync(dirname(target), { recursive: true, mode: 0o700 });
  } catch {}

  writeFileSync(target, JSON.stringify(marker), { mode: 0o600 });

  try {
    chmodSync(target, 0o600);
  } catch {}

  return { path: target, expires_at: marker.expires_at };
}
77
-
78
/**
 * Parse a duration like "10m", "2h", "600s" or "250ms" into milliseconds.
 * Used by the CLI for the `--ttl` flag.
 *
 * A bare number is interpreted as seconds. Surrounding whitespace and
 * whitespace between the number and the unit are accepted.
 *
 * @param s Duration text.
 * @returns The duration in milliseconds, or `null` when the text is not a
 *   non-negative integer with an optional `ms`/`s`/`m`/`h` unit, or when
 *   the value is too large to be represented exactly.
 */
export function parseDuration(s: string): number | null {
  const m = /^(\d+)\s*(ms|s|m|h)?$/.exec(s.trim());
  if (!m) return null;

  const unitMs: Record<string, number> = { ms: 1, s: 1000, m: 60_000, h: 3_600_000 };
  const n = Number.parseInt(m[1] ?? "", 10);
  const total = n * (unitMs[m[2] ?? "s"] ?? 1000);

  // The pattern only admits digits, so negatives cannot reach this point.
  // Oversized inputs are rejected instead of being silently rounded.
  if (!Number.isSafeInteger(n) || !Number.isSafeInteger(total)) return null;
  return total;
}