@glassmkr/crucible 0.7.1 → 0.8.1
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/dist/alerts/__tests__/rules.test.d.ts +1 -0
- package/dist/alerts/__tests__/rules.test.js +437 -0
- package/dist/alerts/__tests__/rules.test.js.map +1 -0
- package/dist/alerts/rules.d.ts +8 -0
- package/dist/alerts/rules.js +175 -34
- package/dist/alerts/rules.js.map +1 -1
- package/dist/api.d.ts +2 -0
- package/dist/api.js +7 -0
- package/dist/api.js.map +1 -0
- package/dist/collect/__tests__/dmi.test.d.ts +1 -0
- package/dist/collect/__tests__/dmi.test.js +133 -0
- package/dist/collect/__tests__/dmi.test.js.map +1 -0
- package/dist/collect/__tests__/ipmi.test.js +47 -1
- package/dist/collect/__tests__/ipmi.test.js.map +1 -1
- package/dist/collect/__tests__/thermal.test.d.ts +1 -0
- package/dist/collect/__tests__/thermal.test.js +224 -0
- package/dist/collect/__tests__/thermal.test.js.map +1 -0
- package/dist/collect/dmi.d.ts +19 -0
- package/dist/collect/dmi.js +118 -0
- package/dist/collect/dmi.js.map +1 -0
- package/dist/collect/ipmi.d.ts +27 -2
- package/dist/collect/ipmi.js +90 -2
- package/dist/collect/ipmi.js.map +1 -1
- package/dist/collect/thermal.d.ts +10 -0
- package/dist/collect/thermal.js +232 -0
- package/dist/collect/thermal.js.map +1 -0
- package/dist/config.d.ts +10 -0
- package/dist/config.js +2 -0
- package/dist/config.js.map +1 -1
- package/dist/index.js +51 -1
- package/dist/index.js.map +1 -1
- package/dist/lib/__tests__/capability.test.d.ts +1 -0
- package/dist/lib/__tests__/capability.test.js +87 -0
- package/dist/lib/__tests__/capability.test.js.map +1 -0
- package/dist/lib/__tests__/vendor-sensors.test.d.ts +1 -0
- package/dist/lib/__tests__/vendor-sensors.test.js +49 -0
- package/dist/lib/__tests__/vendor-sensors.test.js.map +1 -0
- package/dist/lib/capability.d.ts +21 -0
- package/dist/lib/capability.js +110 -0
- package/dist/lib/capability.js.map +1 -0
- package/dist/lib/cpu-thermal-chips.d.ts +2 -0
- package/dist/lib/cpu-thermal-chips.js +28 -0
- package/dist/lib/cpu-thermal-chips.js.map +1 -0
- package/dist/lib/types.d.ts +58 -0
- package/dist/lib/vendor-sensors.d.ts +27 -0
- package/dist/lib/vendor-sensors.js +63 -0
- package/dist/lib/vendor-sensors.js.map +1 -0
- package/dist/notify/telegram.js +1 -1
- package/dist/notify/telegram.js.map +1 -1
- package/package.json +16 -1
- package/rule-ids.json +29 -0
- package/.dockerignore +0 -13
- package/.github/ISSUE_TEMPLATE/bug_report.md +0 -24
- package/.github/ISSUE_TEMPLATE/no_data.md +0 -26
- package/.github/workflows/docker.yml +0 -53
- package/.github/workflows/publish.yml +0 -25
- package/Dockerfile +0 -59
- package/config/collector.example.yaml +0 -43
- package/docker-compose.yml +0 -26
- package/scripts/sign-release.sh +0 -29
- package/src/__tests__/cli.test.ts +0 -74
- package/src/__tests__/reboot-marker.test.ts +0 -122
- package/src/alerts/evaluator.ts +0 -15
- package/src/alerts/rules.ts +0 -283
- package/src/alerts/state.ts +0 -92
- package/src/cli.ts +0 -112
- package/src/collect/__tests__/ipmi.test.ts +0 -96
- package/src/collect/__tests__/smart.test.ts +0 -68
- package/src/collect/__tests__/system.test.ts +0 -29
- package/src/collect/__tests__/zfs.test.ts +0 -72
- package/src/collect/conntrack.ts +0 -27
- package/src/collect/cpu.ts +0 -92
- package/src/collect/disks.ts +0 -91
- package/src/collect/fd.ts +0 -31
- package/src/collect/io-errors.ts +0 -23
- package/src/collect/io-latency.ts +0 -103
- package/src/collect/ipmi.ts +0 -207
- package/src/collect/memory.ts +0 -30
- package/src/collect/network.ts +0 -193
- package/src/collect/ntp.ts +0 -114
- package/src/collect/os-alerts.ts +0 -43
- package/src/collect/raid.ts +0 -40
- package/src/collect/security.ts +0 -268
- package/src/collect/smart.ts +0 -72
- package/src/collect/system.ts +0 -32
- package/src/collect/systemd.ts +0 -33
- package/src/collect/zfs.ts +0 -66
- package/src/config.ts +0 -65
- package/src/index.ts +0 -221
- package/src/lib/__tests__/parse.test.ts +0 -28
- package/src/lib/exec.ts +0 -16
- package/src/lib/parse.ts +0 -29
- package/src/lib/reboot-marker.ts +0 -88
- package/src/lib/types.ts +0 -226
- package/src/lib/version-check.ts +0 -39
- package/src/lib/version.ts +0 -33
- package/src/metrics-server.ts +0 -123
- package/src/notify/email.ts +0 -69
- package/src/notify/slack.ts +0 -47
- package/src/notify/telegram.ts +0 -65
- package/src/push/forge.ts +0 -109
- package/tsconfig.json +0 -15
- package/vitest.config.ts +0 -12
package/src/config.ts
DELETED
|
@@ -1,65 +0,0 @@
|
|
|
1
|
-
import { readFileSync } from "fs";
|
|
2
|
-
import { parse } from "yaml";
|
|
3
|
-
import { z } from "zod";
|
|
4
|
-
|
|
5
|
-
const ConfigSchema = z.object({
|
|
6
|
-
server_name: z.string().default("unnamed-server"),
|
|
7
|
-
collection: z.object({
|
|
8
|
-
interval_seconds: z.number().min(60).max(3600).default(300),
|
|
9
|
-
ipmi: z.boolean().default(true),
|
|
10
|
-
smart: z.boolean().default(true),
|
|
11
|
-
}).default({}),
|
|
12
|
-
forge: z.object({
|
|
13
|
-
enabled: z.boolean().default(false),
|
|
14
|
-
url: z.string().default("https://forge.glassmkr.com"),
|
|
15
|
-
api_key: z.string().default(""),
|
|
16
|
-
tls_pin: z.string().default(""),
|
|
17
|
-
}).default({}),
|
|
18
|
-
thresholds: z.object({
|
|
19
|
-
ram_percent: z.number().default(90),
|
|
20
|
-
swap_alert: z.boolean().default(true),
|
|
21
|
-
disk_percent: z.number().default(85),
|
|
22
|
-
iowait_percent: z.number().default(20),
|
|
23
|
-
nvme_wear_percent: z.number().default(85),
|
|
24
|
-
disk_latency_nvme_ms: z.number().default(50),
|
|
25
|
-
disk_latency_hdd_ms: z.number().default(200),
|
|
26
|
-
cpu_temp_warning_c: z.number().default(80),
|
|
27
|
-
cpu_temp_critical_c: z.number().default(90),
|
|
28
|
-
interface_utilization_percent: z.number().default(90),
|
|
29
|
-
}).default({}),
|
|
30
|
-
channels: z.object({
|
|
31
|
-
telegram: z.object({
|
|
32
|
-
enabled: z.boolean().default(false),
|
|
33
|
-
bot_token: z.string().default(""),
|
|
34
|
-
chat_id: z.string().default(""),
|
|
35
|
-
}).default({}),
|
|
36
|
-
email: z.object({
|
|
37
|
-
enabled: z.boolean().default(false),
|
|
38
|
-
to: z.string().default(""),
|
|
39
|
-
}).default({}),
|
|
40
|
-
slack: z.object({
|
|
41
|
-
enabled: z.boolean().default(false),
|
|
42
|
-
webhook_url: z.string().default(""),
|
|
43
|
-
}).default({}),
|
|
44
|
-
}).default({}),
|
|
45
|
-
prometheus: z.object({
|
|
46
|
-
enabled: z.boolean().default(false),
|
|
47
|
-
port: z.number().default(9101),
|
|
48
|
-
}).default({}),
|
|
49
|
-
});
|
|
50
|
-
|
|
51
|
-
export type Config = z.infer<typeof ConfigSchema>;
|
|
52
|
-
|
|
53
|
-
/**
 * Load and validate the collector configuration from a YAML file.
 *
 * - If the file does not exist, a notice is logged and the schema defaults
 *   are returned.
 * - If the file exists but is empty or contains only comments, the YAML
 *   parser yields `null`; that is treated as "no overrides" and the schema
 *   defaults are returned instead of failing validation.
 *
 * @param path Filesystem path of the YAML config file.
 * @returns The validated configuration with all defaults applied.
 * @throws Any read error other than ENOENT, YAML syntax errors, and schema
 *         validation errors are propagated to the caller.
 */
export function loadConfig(path: string): Config {
  let raw: string;
  try {
    raw = readFileSync(path, "utf-8");
  } catch (err: unknown) {
    const code = typeof err === "object" && err !== null ? (err as { code?: unknown }).code : undefined;
    if (code === "ENOENT") {
      console.log(`[config] No config file at ${path}, using defaults`);
      return ConfigSchema.parse({});
    }
    throw err;
  }

  // An empty / comment-only document parses to null, which the object
  // schema would reject; fall back to an empty object so defaults apply.
  const parsed: unknown = parse(raw);
  return ConfigSchema.parse(parsed ?? {});
}
|
package/src/index.ts
DELETED
|
@@ -1,221 +0,0 @@
|
|
|
1
|
-
#!/usr/bin/env node
|
|
2
|
-
|
|
3
|
-
import { parseCliArgs } from "./cli.js";
|
|
4
|
-
import { CRUCIBLE_VERSION as PKG_VERSION } from "./lib/version.js";
|
|
5
|
-
|
|
6
|
-
// Handle --version, --help, and planned-reboot subcommands before
|
|
7
|
-
// loading config or starting the Prometheus server, so the CLI stays
|
|
8
|
-
// responsive on hosts missing the config file or external tools. Note:
|
|
9
|
-
// the static imports below are hoisted (ESM), so they still load first.
|
|
10
|
-
const { result: cliArgs, output: cliOutput } = parseCliArgs(process.argv.slice(2), PKG_VERSION);
|
|
11
|
-
if (cliArgs.mode === "version" || cliArgs.mode === "help") {
|
|
12
|
-
console.log(cliOutput);
|
|
13
|
-
process.exit(0);
|
|
14
|
-
}
|
|
15
|
-
if (cliArgs.mode === "mark-reboot" || cliArgs.mode === "reboot") {
|
|
16
|
-
const { writeRebootMarker, parseDuration, DEFAULT_TTL_MS } = await import("./lib/reboot-marker.js");
|
|
17
|
-
const ttlMs = cliArgs.ttl ? parseDuration(cliArgs.ttl) : DEFAULT_TTL_MS;
|
|
18
|
-
if (ttlMs === null) {
|
|
19
|
-
console.error(`[mark-reboot] invalid --ttl value: ${cliArgs.ttl}. Use e.g. 10m, 2h, 600s.`);
|
|
20
|
-
process.exit(2);
|
|
21
|
-
}
|
|
22
|
-
try {
|
|
23
|
-
const { path, expires_at } = writeRebootMarker({
|
|
24
|
-
reason: cliArgs.reason, ttlMs,
|
|
25
|
-
});
|
|
26
|
-
console.log(`[${cliArgs.mode}] marker written: ${path} (expires ${expires_at}${cliArgs.reason ? `, reason: ${cliArgs.reason}` : ""})`);
|
|
27
|
-
} catch (err: any) {
|
|
28
|
-
console.error(`[${cliArgs.mode}] failed to write marker: ${err?.message || err}`);
|
|
29
|
-
console.error(` Most likely cause: need root privileges to write under /var/lib/crucible/.`);
|
|
30
|
-
process.exit(1);
|
|
31
|
-
}
|
|
32
|
-
if (cliArgs.mode === "reboot") {
|
|
33
|
-
const { execFileSync } = await import("node:child_process");
|
|
34
|
-
console.log("[reboot] invoking systemctl reboot");
|
|
35
|
-
try {
|
|
36
|
-
execFileSync("systemctl", ["reboot"], { stdio: "inherit" });
|
|
37
|
-
} catch (err: any) {
|
|
38
|
-
console.error(`[reboot] systemctl reboot failed: ${err?.message || err}`);
|
|
39
|
-
process.exit(1);
|
|
40
|
-
}
|
|
41
|
-
}
|
|
42
|
-
process.exit(0);
|
|
43
|
-
}
|
|
44
|
-
|
|
45
|
-
import { loadConfig } from "./config.js";
|
|
46
|
-
import { checkForUpdates } from "./lib/version-check.js";
|
|
47
|
-
import { startMetricsServer, updateMetrics } from "./metrics-server.js";
|
|
48
|
-
import { collectSystem } from "./collect/system.js";
|
|
49
|
-
import { collectCpu } from "./collect/cpu.js";
|
|
50
|
-
import { collectMemory } from "./collect/memory.js";
|
|
51
|
-
import { collectDisks } from "./collect/disks.js";
|
|
52
|
-
import { collectSmart } from "./collect/smart.js";
|
|
53
|
-
import { collectNetwork } from "./collect/network.js";
|
|
54
|
-
import { collectRaid } from "./collect/raid.js";
|
|
55
|
-
import { collectIpmi } from "./collect/ipmi.js";
|
|
56
|
-
import { collectOsAlerts } from "./collect/os-alerts.js";
|
|
57
|
-
import { evaluateAlerts } from "./alerts/evaluator.js";
|
|
58
|
-
import { updateAlertState } from "./alerts/state.js";
|
|
59
|
-
import { sendTelegram } from "./notify/telegram.js";
|
|
60
|
-
import { sendSlack } from "./notify/slack.js";
|
|
61
|
-
import { sendEmail } from "./notify/email.js";
|
|
62
|
-
import { pushToForge, initForgeAgent } from "./push/forge.js";
|
|
63
|
-
import { collectSecurity, type SecurityData } from "./collect/security.js";
|
|
64
|
-
import { collectZfs } from "./collect/zfs.js";
|
|
65
|
-
import { collectIoErrors } from "./collect/io-errors.js";
|
|
66
|
-
import { collectIoLatency } from "./collect/io-latency.js";
|
|
67
|
-
import { collectConntrack } from "./collect/conntrack.js";
|
|
68
|
-
import { collectSystemd } from "./collect/systemd.js";
|
|
69
|
-
import { collectNtp } from "./collect/ntp.js";
|
|
70
|
-
import { collectFileDescriptors } from "./collect/fd.js";
|
|
71
|
-
import type { Snapshot, IpmiInfo } from "./lib/types.js";
|
|
72
|
-
import { consumeRebootMarker, type PlannedReboot } from "./lib/reboot-marker.js";
|
|
73
|
-
|
|
74
|
-
// Consume the planned-reboot marker once at startup. If the operator ran
|
|
75
|
-
// `crucible-agent mark-reboot` / `reboot` before this boot, the marker
|
|
76
|
-
// exists, we flag it on the first snapshot, and we delete the file (so
|
|
77
|
-
// subsequent snapshots don't keep claiming the reboot was planned).
|
|
78
|
-
const plannedRebootFlag: PlannedReboot | null = consumeRebootMarker();
|
|
79
|
-
if (plannedRebootFlag) {
|
|
80
|
-
console.log(`[collector] Planned reboot acknowledged${plannedRebootFlag.reason ? `: ${plannedRebootFlag.reason}` : ""}`);
|
|
81
|
-
}
|
|
82
|
-
let plannedRebootConsumed = false;
|
|
83
|
-
|
|
84
|
-
const config = loadConfig(cliArgs.configPath);
|
|
85
|
-
|
|
86
|
-
console.log(`[collector] Starting. Server: ${config.server_name}. Interval: ${config.collection.interval_seconds}s`);
|
|
87
|
-
console.log(`[collector] IPMI: ${config.collection.ipmi ? "enabled" : "disabled"}, SMART: ${config.collection.smart ? "enabled" : "disabled"}`);
|
|
88
|
-
console.log(`[collector] Forge: ${config.forge.enabled ? config.forge.url : "disabled"}`);
|
|
89
|
-
console.log(`[collector] Prometheus: ${config.prometheus.enabled ? `:${config.prometheus.port}/metrics` : "disabled"}`);
|
|
90
|
-
|
|
91
|
-
// Start Prometheus metrics server if enabled
|
|
92
|
-
if (config.prometheus.enabled) {
|
|
93
|
-
startMetricsServer(config.prometheus.port);
|
|
94
|
-
}
|
|
95
|
-
|
|
96
|
-
// Initialize TLS pinning for Forge if configured
|
|
97
|
-
if (config.forge.tls_pin) {
|
|
98
|
-
initForgeAgent(config.forge.tls_pin);
|
|
99
|
-
console.log("[collector] TLS pinning enabled for Forge");
|
|
100
|
-
}
|
|
101
|
-
|
|
102
|
-
const emptyIpmi: IpmiInfo = { available: false, sensors: [], ecc_errors: { correctable: 0, uncorrectable: 0 }, sel_entries_count: 0, sel_events_recent: [], fans: [] };
|
|
103
|
-
|
|
104
|
-
// Security checks run once per hour (every 12th cycle at 5-min intervals)
|
|
105
|
-
let securityCycleCount = 0;
|
|
106
|
-
let cachedSecurity: SecurityData | undefined;
|
|
107
|
-
|
|
108
|
-
/**
 * Invoke `task` without awaiting it. Both a synchronous throw and an
 * asynchronous rejection are logged instead of escaping as an unhandled
 * rejection.
 */
function runDetached(label: string, task: () => unknown): void {
  const report = (err: unknown): void => console.error(`[collector] ${label} failed:`, err);
  try {
    Promise.resolve(task()).catch(report);
  } catch (err: unknown) {
    report(err);
  }
}

/**
 * Run one collection cycle.
 *
 * Gathers all metrics, builds the snapshot, updates the Prometheus metrics,
 * evaluates the alert rules, notifies every enabled channel about new or
 * resolved alerts, and finally starts the Forge push and the update check
 * in the background. The first cycle also prints a short summary.
 *
 * Rejects if one of the core collectors in the initial `Promise.all`
 * rejects; failures of the optional collectors and of individual
 * notification channels are contained within the cycle.
 */
async function collect(): Promise<void> {
  const startTime = Date.now();
  console.log(`[collector] Collecting...`);

  const [system, cpu, memory, disks, smart, network, raid, ipmi, osAlerts] = await Promise.all([
    collectSystem(),
    collectCpu(),
    collectMemory(),
    collectDisks(),
    config.collection.smart ? collectSmart() : Promise.resolve([]),
    collectNetwork(),
    collectRaid(),
    config.collection.ipmi ? collectIpmi() : Promise.resolve(emptyIpmi),
    collectOsAlerts(),
  ]);

  // Security checks: refreshed every 12th cycle (hourly at the default
  // 300 s interval) or whenever nothing is cached yet; the cached result is
  // reused in between. A failed refresh keeps the previous cached value.
  securityCycleCount++;
  if (securityCycleCount >= 12 || !cachedSecurity) {
    securityCycleCount = 0;
    try {
      cachedSecurity = await collectSecurity();
    } catch (err) {
      console.error("[security] Collection error:", err);
    }
  }

  const snapshot: Snapshot = {
    collector_version: PKG_VERSION,
    timestamp: new Date().toISOString(),
    system, cpu, memory, disks, smart, network, raid, ipmi, os_alerts: osAlerts,
    security: cachedSecurity,
  };

  // Single-shot: only the very first snapshot after a marked reboot carries
  // the flag; subsequent snapshots do not.
  if (plannedRebootFlag && !plannedRebootConsumed) {
    snapshot.expected_reboot = true;
    if (plannedRebootFlag.reason) snapshot.expected_reboot_reason = plannedRebootFlag.reason;
    plannedRebootConsumed = true;
  }

  // Optional collectors run every cycle; each one is best-effort and simply
  // leaves its snapshot field unset on failure.
  try { snapshot.zfs = await collectZfs() ?? undefined; } catch { /* skip if ZFS not available */ }
  try { snapshot.io_errors = await collectIoErrors() ?? undefined; } catch { /* skip on error */ }
  try { snapshot.io_latency = collectIoLatency(); } catch { /* skip on error */ }
  try { snapshot.conntrack = collectConntrack(); } catch { /* skip on error */ }
  try { snapshot.systemd = await collectSystemd(); } catch { /* skip on error */ }
  try { snapshot.ntp = await collectNtp(); } catch { /* skip on error */ }
  try { snapshot.file_descriptors = collectFileDescriptors(); } catch { /* skip on error */ }

  // Update Prometheus metrics
  updateMetrics(snapshot);

  // Evaluate alerts
  const alertResults = evaluateAlerts(snapshot, config.thresholds);
  const { newAlerts, resolvedAlerts } = updateAlertState(alertResults);

  const elapsed = Date.now() - startTime;
  console.log(`[collector] Collected in ${elapsed}ms. Alerts: ${alertResults.length} active, ${newAlerts.length} new, ${resolvedAlerts.length} resolved`);

  // Send notifications for new/resolved alerts. Channels are tried in order
  // and isolated from each other, so one failing channel neither blocks the
  // remaining channels nor aborts the rest of the cycle.
  if (newAlerts.length > 0 || resolvedAlerts.length > 0) {
    const { telegram, slack, email } = config.channels;
    const deliver = async (channel: string, send: () => unknown): Promise<void> => {
      try {
        await send();
      } catch (err: unknown) {
        console.error(`[collector] ${channel} notification failed:`, err);
      }
    };

    if (telegram.enabled && telegram.bot_token && telegram.chat_id) {
      await deliver("telegram", () =>
        sendTelegram(telegram.bot_token, telegram.chat_id, newAlerts, resolvedAlerts, config.server_name));
    }
    if (slack.enabled && slack.webhook_url) {
      await deliver("slack", () =>
        sendSlack(slack.webhook_url, newAlerts, resolvedAlerts, config.server_name));
    }
    if (email.enabled && email.to) {
      await deliver("email", () =>
        sendEmail(email, newAlerts, resolvedAlerts, config.server_name));
    }
  }

  // Push to Forge (non-blocking)
  if (config.forge.enabled && config.forge.api_key) {
    runDetached("Forge push", () => pushToForge(config.forge.url, config.forge.api_key, snapshot));
  }

  // Check for updates (non-blocking)
  runDetached("Update check", () => checkForUpdates(config.forge.enabled ? config.forge.url : undefined));

  // Print summary on first run
  if (firstRun) {
    firstRun = false;
    const ramPct = memory.total_mb > 0 ? ((memory.used_mb / memory.total_mb) * 100).toFixed(1) : "0";
    console.log("");
    console.log("=== First collection complete ===");
    console.log(`Server: ${system.hostname} (${system.os})`);
    console.log(`CPU: ${cpu.user_percent.toFixed(1)}% (load: ${cpu.load_1m})`);
    console.log(`RAM: ${ramPct}% (${memory.used_mb} / ${memory.total_mb} MB)`);
    if (disks.length > 0) console.log(`Disk: ${disks[0].percent_used}% (${disks[0].mount})`);
    console.log(`SMART: ${smart.length > 0 ? `${smart.length} drive(s) checked` : "not available"}`);
    console.log(`Network: ${network.map((n) => n.interface).join(", ") || "none detected"}`);
    console.log(`IPMI: ${ipmi.available ? "available" : "not available"}`);
    console.log(`Active alerts: ${alertResults.length}`);
    console.log(`Forge: ${config.forge.enabled ? "enabled" : "disabled"}`);
    console.log("");
  }
}
|
|
204
|
-
|
|
205
|
-
// Set to false by collect() after it has printed the first-run summary.
let firstRun = true;

/**
 * Start one collection cycle and log a failure instead of letting it
 * surface as an unhandled promise rejection, which would terminate the
 * agent on Node versions where unhandled rejections are fatal.
 */
function runCycle(): void {
  collect().catch((err: unknown) => {
    console.error("[collector] Collection cycle failed:", err);
  });
}

// Run immediately
runCycle();

// Then on interval
setInterval(runCycle, config.collection.interval_seconds * 1000);

// Exit cleanly on the usual termination signals.
for (const signal of ["SIGTERM", "SIGINT"] as const) {
  process.on(signal, () => {
    console.log(`[collector] Received ${signal}, shutting down`);
    process.exit(0);
  });
}
|
package/src/lib/__tests__/parse.test.ts
DELETED
|
@@ -1,28 +0,0 @@
|
|
|
1
|
-
import { describe, it, expect } from "vitest";
|
|
2
|
-
import { parseKeyValue, parseKb } from "../parse.js";
|
|
3
|
-
|
|
4
|
-
describe("parseKeyValue", () => {
|
|
5
|
-
it("parses colon-delimited key/value lines", () => {
|
|
6
|
-
const out = parseKeyValue("Name: foo\nVersion: 1.2.3\n");
|
|
7
|
-
expect(out).toEqual({ Name: "foo", Version: "1.2.3" });
|
|
8
|
-
});
|
|
9
|
-
it("ignores lines with no colon", () => {
|
|
10
|
-
expect(parseKeyValue("no colon here\nA: 1\n")).toEqual({ A: "1" });
|
|
11
|
-
});
|
|
12
|
-
it("trims whitespace around keys and values", () => {
|
|
13
|
-
expect(parseKeyValue(" A : 1 \n")).toEqual({ A: "1" });
|
|
14
|
-
});
|
|
15
|
-
});
|
|
16
|
-
|
|
17
|
-
describe("parseKb", () => {
|
|
18
|
-
it("parses a numeric kB value", () => {
|
|
19
|
-
expect(parseKb("16384 kB")).toBe(16384);
|
|
20
|
-
});
|
|
21
|
-
it("parses without unit", () => {
|
|
22
|
-
expect(parseKb("4096")).toBe(4096);
|
|
23
|
-
});
|
|
24
|
-
it("returns 0 for undefined/bad input", () => {
|
|
25
|
-
expect(parseKb(undefined)).toBe(0);
|
|
26
|
-
expect(parseKb("not a number")).toBe(0);
|
|
27
|
-
});
|
|
28
|
-
});
|
package/src/lib/exec.ts
DELETED
|
@@ -1,16 +0,0 @@
|
|
|
1
|
-
import { execFile } from "child_process";
|
|
2
|
-
import { promisify } from "util";
|
|
3
|
-
|
|
4
|
-
const execFileAsync = promisify(execFile);
|
|
5
|
-
|
|
6
|
-
/**
 * Execute `cmd` with `args` and resolve with whatever it wrote to stdout.
 *
 * Never rejects for a failed command. The result is `null` when the
 * command is not installed (ENOENT), when the child was killed (e.g. the
 * timeout elapsed), or when it failed without producing any stdout. A
 * non-zero exit that still produced stdout resolves with that output.
 *
 * @param cmd       Executable to run.
 * @param args      Arguments passed to the executable.
 * @param timeoutMs Timeout handed to `execFile`, in milliseconds.
 */
export async function run(cmd: string, args: string[], timeoutMs = 10000): Promise<string | null> {
  let output: string | null = null;
  try {
    const completed = await execFileAsync(cmd, args, { timeout: timeoutMs });
    output = completed.stdout;
  } catch (failure: any) {
    const notInstalled = failure.code === "ENOENT";
    if (!notInstalled && !failure.killed && failure.stdout) {
      // Non-zero exit, but the command still produced usable output.
      output = failure.stdout;
    }
  }
  return output;
}
|
package/src/lib/parse.ts
DELETED
|
@@ -1,29 +0,0 @@
|
|
|
1
|
-
import { readFileSync } from "fs";
|
|
2
|
-
|
|
3
|
-
/**
 * Read a file as UTF-8 text.
 *
 * @param path File to read.
 * @returns The file contents, or `null` if reading fails for any reason.
 */
export function readProcFile(path: string): string | null {
  let contents: string | null;
  try {
    contents = readFileSync(path, "utf-8");
  } catch {
    contents = null;
  }
  return contents;
}
|
|
10
|
-
|
|
11
|
-
/**
 * Parse `key: value` lines into a dictionary.
 *
 * Each line is split at its first colon; key and value are trimmed. Lines
 * without a colon are ignored, and a repeated key keeps its last value.
 *
 * @param raw Newline-separated text.
 * @returns Map of trimmed keys to trimmed values.
 */
export function parseKeyValue(raw: string): Record<string, string> {
  const pairs: Record<string, string> = {};
  raw.split("\n").forEach((entry) => {
    const colonAt = entry.indexOf(":");
    if (colonAt !== -1) {
      const key = entry.slice(0, colonAt).trim();
      pairs[key] = entry.slice(colonAt + 1).trim();
    }
  });
  return pairs;
}
|
|
20
|
-
|
|
21
|
-
/**
 * Parse a value such as `"16384 kB"` into its integer part.
 *
 * @param val Text with an optional trailing `kB` unit (case-insensitive).
 * @returns The parsed integer, or 0 when `val` is missing, empty, or does
 *          not start with a number.
 */
export function parseKb(val: string | undefined): number {
  const amount = val ? Number.parseInt(val.replace(/\s*kB$/i, ""), 10) : Number.NaN;
  return Number.isNaN(amount) ? 0 : amount;
}
|
|
26
|
-
|
|
27
|
-
/**
 * Return a promise that resolves after `ms` milliseconds.
 *
 * @param ms Delay passed to `setTimeout`.
 */
export function sleep(ms: number): Promise<void> {
  return new Promise<void>((wake) => {
    setTimeout(wake, ms);
  });
}
|
package/src/lib/reboot-marker.ts
DELETED
|
@@ -1,88 +0,0 @@
|
|
|
1
|
-
// Planned-reboot marker handling.
|
|
2
|
-
//
|
|
3
|
-
// An operator signals "the next reboot is expected, don't page me"
|
|
4
|
-
// by writing a short-lived JSON file to disk BEFORE rebooting. The
|
|
5
|
-
// collector reads and deletes it on agent startup; the first
|
|
6
|
-
// post-boot snapshot then carries `expected_reboot: true` so Forge's
|
|
7
|
-
// unexpected_reboot rule stays quiet.
|
|
8
|
-
//
|
|
9
|
-
// Single-use (deleted on read regardless of validity) and TTL-guarded
|
|
10
|
-
// (default 10 min) so a forgotten marker cannot silence a genuine
|
|
11
|
-
// crash reboot weeks later.
|
|
12
|
-
|
|
13
|
-
import { existsSync, readFileSync, unlinkSync, writeFileSync, mkdirSync, chmodSync } from "node:fs";
|
|
14
|
-
import { dirname } from "node:path";
|
|
15
|
-
|
|
16
|
-
export const DEFAULT_MARKER_PATH = "/var/lib/crucible/reboot-expected";
|
|
17
|
-
export const DEFAULT_TTL_MS = 10 * 60 * 1000;
|
|
18
|
-
|
|
19
|
-
export interface PlannedReboot {
|
|
20
|
-
expected: true;
|
|
21
|
-
reason?: string;
|
|
22
|
-
}
|
|
23
|
-
|
|
24
|
-
export interface RebootMarker {
|
|
25
|
-
expires_at: string; // ISO timestamp
|
|
26
|
-
reason?: string;
|
|
27
|
-
}
|
|
28
|
-
|
|
29
|
-
/**
 * Read and delete the planned-reboot marker at `path`.
 *
 * The file is unlinked in every branch where it existed, whether or not it
 * could be read or parsed, so a malformed or stale marker cannot linger.
 * Deletion itself is best-effort: an unlink failure is ignored.
 *
 * @param path Marker file location; defaults to `DEFAULT_MARKER_PATH`.
 * @param now  Reference time used for the expiry check; defaults to the
 *             current time.
 * @returns `{ expected: true, reason }` if the marker existed, was a JSON
 *          object with a valid ISO `expires_at` strictly later than `now`;
 *          otherwise `null`. `reason` is only carried over when it is a
 *          string, so a hand-edited marker cannot inject other value types
 *          into the snapshot.
 */
export function consumeRebootMarker(
  path: string = DEFAULT_MARKER_PATH,
  now: Date = new Date(),
): PlannedReboot | null {
  if (!existsSync(path)) return null;

  let raw: string | null = null;
  try {
    raw = readFileSync(path, "utf-8");
  } catch {
    // Unreadable marker: fall through so it is still deleted below.
  }
  // Always delete after the read attempt, regardless of validity.
  try {
    unlinkSync(path);
  } catch {
    // Best effort; the TTL still bounds how long a leftover marker is valid.
  }
  if (raw === null) return null;

  // The file content is untrusted: parse as unknown and validate each field.
  let parsed: unknown;
  try {
    parsed = JSON.parse(raw);
  } catch {
    return null;
  }
  if (typeof parsed !== "object" || parsed === null) return null;

  const { expires_at: expiresAtText, reason } = parsed as { expires_at?: unknown; reason?: unknown };
  if (typeof expiresAtText !== "string") return null;

  const expiresAtMs = new Date(expiresAtText).getTime();
  if (Number.isNaN(expiresAtMs)) return null;
  if (expiresAtMs <= now.getTime()) return null; // stale

  return { expected: true, reason: typeof reason === "string" ? reason : undefined };
}
|
|
53
|
-
|
|
54
|
-
/**
 * Write a planned-reboot marker. Used by the `mark-reboot` and `reboot`
 * CLI subcommands.
 *
 * Creates the parent directory if needed (mode 0700, best-effort) and
 * writes the marker with mode 0600. The mode is re-applied with `chmod`
 * afterwards because the `mode` option of `writeFileSync` only takes
 * effect when the file is newly created.
 *
 * @param opts.reason Optional free-text reason stored in the marker.
 * @param opts.ttlMs  Marker lifetime in milliseconds; defaults to
 *                    `DEFAULT_TTL_MS`.
 * @param opts.path   Marker location; defaults to `DEFAULT_MARKER_PATH`.
 * @param opts.now    Reference time for the expiry; defaults to now.
 * @returns The path written and the ISO expiry timestamp stored in it.
 * @throws RangeError if the TTL is not finite or the resulting expiry is
 *         not a representable date; filesystem errors from the write are
 *         propagated.
 */
export function writeRebootMarker(opts: {
  reason?: string;
  ttlMs?: number;
  path?: string;
  now?: Date;
}): { path: string; expires_at: string } {
  const path = opts.path ?? DEFAULT_MARKER_PATH;
  const now = opts.now ?? new Date();
  const ttlMs = opts.ttlMs ?? DEFAULT_TTL_MS;

  // Fail with a clear message instead of the "Invalid time value" that
  // toISOString() raises for an unrepresentable date.
  const expiresAt = new Date(now.getTime() + ttlMs);
  if (!Number.isFinite(ttlMs) || Number.isNaN(expiresAt.getTime())) {
    throw new RangeError(`invalid reboot-marker TTL (${ttlMs} ms): expiry is not a representable date`);
  }

  const body: RebootMarker = { expires_at: expiresAt.toISOString() };
  if (opts.reason) body.reason = opts.reason;

  // Best effort: if the directory cannot be created, the write below fails
  // with the more informative error.
  try { mkdirSync(dirname(path), { recursive: true, mode: 0o700 }); } catch {}
  writeFileSync(path, JSON.stringify(body), { mode: 0o600 });
  // Tighten permissions on a pre-existing file too; failure is tolerated.
  try { chmodSync(path, 0o600); } catch {}
  return { path, expires_at: body.expires_at };
}
|
|
77
|
-
|
|
78
|
-
/**
 * Parse a duration like "10m", "2h", "600s" or "250ms" into milliseconds.
 * Used by the CLI for the `--ttl` flag.
 *
 * The input is a non-negative integer followed by an optional unit
 * (`ms`, `s`, `m`, `h`); a bare number is interpreted as seconds.
 * Surrounding whitespace and whitespace between number and unit are
 * allowed.
 *
 * @param s Duration text.
 * @returns The duration in milliseconds, or `null` if the text is
 *          malformed or the duration is so large that "now + duration"
 *          would not be a representable `Date`.
 */
export function parseDuration(s: string): number | null {
  const match = /^(\d+)\s*(ms|s|m|h)?$/.exec(s.trim());
  if (!match) return null;

  const count = Number.parseInt(match[1], 10);
  // A very long digit string parses to Infinity.
  if (!Number.isFinite(count)) return null;

  const unitMs = { ms: 1, s: 1000, m: 60_000, h: 3_600_000 } as const;
  // The pattern only admits these four units, so the assertion is safe.
  const unit = (match[2] ?? "s") as keyof typeof unitMs;
  const totalMs = count * unitMs[unit];

  // ECMAScript dates cover ±8.64e15 ms around the epoch; anything that would
  // push an expiry past that limit (or lose integer precision) is rejected.
  const maxDateMs = 8.64e15;
  if (!Number.isSafeInteger(totalMs) || totalMs > maxDateMs - Date.now()) return null;

  return totalMs;
}
|
package/src/lib/types.ts
DELETED
|
@@ -1,226 +0,0 @@
|
|
|
1
|
-
export interface Snapshot {
|
|
2
|
-
collector_version: string;
|
|
3
|
-
timestamp: string;
|
|
4
|
-
system: SystemInfo;
|
|
5
|
-
cpu: CpuInfo;
|
|
6
|
-
memory: MemoryInfo;
|
|
7
|
-
disks: DiskInfo[];
|
|
8
|
-
smart: SmartInfo[];
|
|
9
|
-
network: NetworkInfo[];
|
|
10
|
-
raid: RaidInfo[];
|
|
11
|
-
ipmi: IpmiInfo;
|
|
12
|
-
os_alerts: OsAlerts;
|
|
13
|
-
security?: SecurityData;
|
|
14
|
-
zfs?: ZfsData;
|
|
15
|
-
io_errors?: { count: number; devices: string[] };
|
|
16
|
-
io_latency?: Array<{ device: string; avg_read_latency_ms: number | null; avg_write_latency_ms: number | null; read_iops: number; write_iops: number }>;
|
|
17
|
-
conntrack?: ConntrackData;
|
|
18
|
-
systemd?: SystemdData;
|
|
19
|
-
ntp?: NtpData;
|
|
20
|
-
file_descriptors?: FileDescriptorData;
|
|
21
|
-
// Planned-reboot flag: set only on the first snapshot after a reboot
|
|
22
|
-
// that was marked with `crucible-agent mark-reboot` / `reboot`. Forge
|
|
23
|
-
// reads this to suppress the `unexpected_reboot` rule. Single-use:
|
|
24
|
-
// subsequent snapshots don't carry it.
|
|
25
|
-
expected_reboot?: boolean;
|
|
26
|
-
expected_reboot_reason?: string;
|
|
27
|
-
}
|
|
28
|
-
|
|
29
|
-
export interface ConntrackData {
|
|
30
|
-
available: boolean;
|
|
31
|
-
count: number;
|
|
32
|
-
max: number;
|
|
33
|
-
percent: number;
|
|
34
|
-
}
|
|
35
|
-
|
|
36
|
-
export interface SystemdData {
|
|
37
|
-
failed_units: string[];
|
|
38
|
-
failed_count: number;
|
|
39
|
-
}
|
|
40
|
-
|
|
41
|
-
export interface NtpData {
|
|
42
|
-
synced: boolean;
|
|
43
|
-
offset_seconds: number;
|
|
44
|
-
source: string;
|
|
45
|
-
daemon_running: boolean;
|
|
46
|
-
}
|
|
47
|
-
|
|
48
|
-
export interface FileDescriptorData {
|
|
49
|
-
allocated: number;
|
|
50
|
-
free: number;
|
|
51
|
-
max: number;
|
|
52
|
-
percent: number;
|
|
53
|
-
}
|
|
54
|
-
|
|
55
|
-
export interface ZfsPool {
|
|
56
|
-
name: string;
|
|
57
|
-
state: string;
|
|
58
|
-
errors_text: string;
|
|
59
|
-
scrub_errors?: number;
|
|
60
|
-
scrub_repaired?: string;
|
|
61
|
-
last_scrub_date?: string;
|
|
62
|
-
scrub_never_run?: boolean;
|
|
63
|
-
}
|
|
64
|
-
|
|
65
|
-
export interface ZfsData {
|
|
66
|
-
pools: ZfsPool[];
|
|
67
|
-
}
|
|
68
|
-
|
|
69
|
-
export interface SecurityData {
|
|
70
|
-
ssh: { permitRootLogin: string; passwordAuthentication: string; rootPasswordExposed: boolean } | null;
|
|
71
|
-
firewall: { active: boolean; source: string; details: string };
|
|
72
|
-
pending_updates: { distro: string; pendingCount: number; available: boolean } | null;
|
|
73
|
-
kernel_vulns: Array<{ name: string; status: string; mitigated: boolean }>;
|
|
74
|
-
kernel_reboot: { running: string; installed: string; needsReboot: boolean } | null;
|
|
75
|
-
auto_updates: { configured: boolean; mechanism: string; details: string };
|
|
76
|
-
}
|
|
77
|
-
|
|
78
|
-
export interface SystemInfo {
|
|
79
|
-
hostname: string;
|
|
80
|
-
ip: string;
|
|
81
|
-
os: string;
|
|
82
|
-
/** `ID=` from /etc/os-release, lowercased. e.g. "ubuntu", "debian", "rocky", "arch", "alpine". */
|
|
83
|
-
os_id?: string;
|
|
84
|
-
/** `ID_LIKE=` from /etc/os-release, lowercased, space-separated. Used by Forge
|
|
85
|
-
* to pick distro-family-specific fix command variants. e.g. on Rocky this
|
|
86
|
-
* is "rhel centos fedora"; on Ubuntu it is "debian". */
|
|
87
|
-
os_id_like?: string;
|
|
88
|
-
kernel: string;
|
|
89
|
-
uptime_seconds: number;
|
|
90
|
-
}
|
|
91
|
-
|
|
92
|
-
export interface CpuCoreInfo {
|
|
93
|
-
core: number;
|
|
94
|
-
user_percent: number;
|
|
95
|
-
system_percent: number;
|
|
96
|
-
iowait_percent: number;
|
|
97
|
-
idle_percent: number;
|
|
98
|
-
irq_percent: number;
|
|
99
|
-
softirq_percent: number;
|
|
100
|
-
}
|
|
101
|
-
|
|
102
|
-
export interface CpuInfo {
|
|
103
|
-
user_percent: number;
|
|
104
|
-
system_percent: number;
|
|
105
|
-
iowait_percent: number;
|
|
106
|
-
idle_percent: number;
|
|
107
|
-
load_1m: number;
|
|
108
|
-
load_5m: number;
|
|
109
|
-
load_15m: number;
|
|
110
|
-
cores?: CpuCoreInfo[];
|
|
111
|
-
}
|
|
112
|
-
|
|
113
|
-
export interface MemoryInfo {
|
|
114
|
-
total_mb: number;
|
|
115
|
-
used_mb: number;
|
|
116
|
-
available_mb: number;
|
|
117
|
-
swap_total_mb: number;
|
|
118
|
-
swap_used_mb: number;
|
|
119
|
-
}
|
|
120
|
-
|
|
121
|
-
export interface DiskInfo {
|
|
122
|
-
device: string;
|
|
123
|
-
mount: string;
|
|
124
|
-
total_gb: number;
|
|
125
|
-
used_gb: number;
|
|
126
|
-
available_gb: number;
|
|
127
|
-
percent_used: number;
|
|
128
|
-
fstype?: string;
|
|
129
|
-
options?: string;
|
|
130
|
-
inodes_total?: number;
|
|
131
|
-
inodes_used?: number;
|
|
132
|
-
inodes_free?: number;
|
|
133
|
-
io_read_mb_s?: number;
|
|
134
|
-
io_write_mb_s?: number;
|
|
135
|
-
latency_p99_ms?: number;
|
|
136
|
-
}
|
|
137
|
-
|
|
138
|
-
export interface SmartInfo {
|
|
139
|
-
device: string;
|
|
140
|
-
model: string;
|
|
141
|
-
health: string;
|
|
142
|
-
temperature_c?: number;
|
|
143
|
-
percentage_used?: number;
|
|
144
|
-
reallocated_sectors?: number;
|
|
145
|
-
pending_sectors?: number;
|
|
146
|
-
power_on_hours?: number;
|
|
147
|
-
}
|
|
148
|
-
|
|
149
|
-
export interface NetworkInfo {
|
|
150
|
-
interface: string;
|
|
151
|
-
speed_mbps: number;
|
|
152
|
-
rx_bytes_sec: number;
|
|
153
|
-
tx_bytes_sec: number;
|
|
154
|
-
/** Delta over the collection interval (rx_errors + any subtype counter). */
|
|
155
|
-
rx_errors: number;
|
|
156
|
-
tx_errors: number;
|
|
157
|
-
rx_drops: number;
|
|
158
|
-
tx_drops: number;
|
|
159
|
-
/** Delta over the collection interval. Omitted if the counter is not available on this NIC. */
|
|
160
|
-
rx_packets?: number;
|
|
161
|
-
tx_packets?: number;
|
|
162
|
-
/** Fine-grained RX hardware-error subtypes (deltas). Omitted if unavailable. */
|
|
163
|
-
rx_crc_errors?: number;
|
|
164
|
-
rx_frame_errors?: number;
|
|
165
|
-
rx_length_errors?: number;
|
|
166
|
-
/** TX physical-layer fault counter (delta). Omitted if unavailable. */
|
|
167
|
-
tx_carrier_errors?: number;
|
|
168
|
-
operstate?: string; // "up", "down", "unknown", etc. from /sys/class/net/{iface}/operstate
|
|
169
|
-
bond_master?: string; // if this interface is a bond slave, the bond name
|
|
170
|
-
is_bond_master?: boolean; // true when this entry represents the bond aggregate
|
|
171
|
-
}
|
|
172
|
-
|
|
173
|
-
export interface RaidInfo {
|
|
174
|
-
device: string;
|
|
175
|
-
level: string;
|
|
176
|
-
status: string;
|
|
177
|
-
degraded: boolean;
|
|
178
|
-
disks: string[];
|
|
179
|
-
failed_disks: string[];
|
|
180
|
-
}
|
|
181
|
-
|
|
182
|
-
export interface SelEvent {
|
|
183
|
-
id: number;
|
|
184
|
-
timestamp: string;
|
|
185
|
-
sensor: string;
|
|
186
|
-
sensor_type: string;
|
|
187
|
-
event: string;
|
|
188
|
-
direction: string;
|
|
189
|
-
severity: string;
|
|
190
|
-
}
|
|
191
|
-
|
|
192
|
-
export interface FanStatus {
|
|
193
|
-
name: string;
|
|
194
|
-
rpm: number;
|
|
195
|
-
status: string;
|
|
196
|
-
}
|
|
197
|
-
|
|
198
|
-
export interface IpmiInfo {
|
|
199
|
-
available: boolean;
|
|
200
|
-
sensors: Array<{
|
|
201
|
-
name: string;
|
|
202
|
-
value: number | string;
|
|
203
|
-
unit: string;
|
|
204
|
-
status: string;
|
|
205
|
-
upper_critical?: number;
|
|
206
|
-
}>;
|
|
207
|
-
ecc_errors: { correctable: number; uncorrectable: number };
|
|
208
|
-
sel_entries_count: number;
|
|
209
|
-
sel_events_recent: SelEvent[];
|
|
210
|
-
fans: FanStatus[];
|
|
211
|
-
}
|
|
212
|
-
|
|
213
|
-
export interface OsAlerts {
|
|
214
|
-
oom_kills_recent: number;
|
|
215
|
-
zombie_processes: number;
|
|
216
|
-
time_drift_ms: number;
|
|
217
|
-
}
|
|
218
|
-
|
|
219
|
-
export interface AlertResult {
|
|
220
|
-
type: string;
|
|
221
|
-
severity: "critical" | "warning";
|
|
222
|
-
title: string;
|
|
223
|
-
message: string;
|
|
224
|
-
evidence: Record<string, unknown>;
|
|
225
|
-
recommendation: string;
|
|
226
|
-
}
|