@glassmkr/crucible 0.7.0 → 0.8.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/dist/alerts/__tests__/rules.test.d.ts +1 -0
- package/dist/alerts/__tests__/rules.test.js +325 -0
- package/dist/alerts/__tests__/rules.test.js.map +1 -0
- package/dist/alerts/rules.d.ts +8 -0
- package/dist/alerts/rules.js +139 -32
- package/dist/alerts/rules.js.map +1 -1
- package/dist/api.d.ts +2 -0
- package/dist/api.js +7 -0
- package/dist/api.js.map +1 -0
- package/dist/collect/__tests__/dmi.test.d.ts +1 -0
- package/dist/collect/__tests__/dmi.test.js +114 -0
- package/dist/collect/__tests__/dmi.test.js.map +1 -0
- package/dist/collect/__tests__/ipmi.test.js +47 -1
- package/dist/collect/__tests__/ipmi.test.js.map +1 -1
- package/dist/collect/__tests__/thermal.test.d.ts +1 -0
- package/dist/collect/__tests__/thermal.test.js +164 -0
- package/dist/collect/__tests__/thermal.test.js.map +1 -0
- package/dist/collect/dmi.d.ts +19 -0
- package/dist/collect/dmi.js +109 -0
- package/dist/collect/dmi.js.map +1 -0
- package/dist/collect/ipmi.d.ts +27 -2
- package/dist/collect/ipmi.js +90 -2
- package/dist/collect/ipmi.js.map +1 -1
- package/dist/collect/thermal.d.ts +10 -0
- package/dist/collect/thermal.js +187 -0
- package/dist/collect/thermal.js.map +1 -0
- package/dist/config.d.ts +10 -0
- package/dist/config.js +2 -0
- package/dist/config.js.map +1 -1
- package/dist/index.js +52 -14
- package/dist/index.js.map +1 -1
- package/dist/lib/__tests__/capability.test.d.ts +1 -0
- package/dist/lib/__tests__/capability.test.js +87 -0
- package/dist/lib/__tests__/capability.test.js.map +1 -0
- package/dist/lib/__tests__/vendor-sensors.test.d.ts +1 -0
- package/dist/lib/__tests__/vendor-sensors.test.js +49 -0
- package/dist/lib/__tests__/vendor-sensors.test.js.map +1 -0
- package/dist/lib/capability.d.ts +21 -0
- package/dist/lib/capability.js +110 -0
- package/dist/lib/capability.js.map +1 -0
- package/dist/lib/cpu-thermal-chips.d.ts +2 -0
- package/dist/lib/cpu-thermal-chips.js +28 -0
- package/dist/lib/cpu-thermal-chips.js.map +1 -0
- package/dist/lib/types.d.ts +58 -0
- package/dist/lib/vendor-sensors.d.ts +27 -0
- package/dist/lib/vendor-sensors.js +63 -0
- package/dist/lib/vendor-sensors.js.map +1 -0
- package/dist/lib/version-check.js +1 -1
- package/dist/lib/version-check.js.map +1 -1
- package/dist/lib/version.d.ts +1 -0
- package/dist/lib/version.js +32 -0
- package/dist/lib/version.js.map +1 -0
- package/dist/notify/email.js +2 -1
- package/dist/notify/email.js.map +1 -1
- package/dist/notify/slack.js +2 -1
- package/dist/notify/slack.js.map +1 -1
- package/dist/notify/telegram.js +1 -1
- package/dist/notify/telegram.js.map +1 -1
- package/package.json +16 -1
- package/rule-ids.json +29 -0
- package/.dockerignore +0 -13
- package/.github/ISSUE_TEMPLATE/bug_report.md +0 -24
- package/.github/ISSUE_TEMPLATE/no_data.md +0 -26
- package/.github/workflows/docker.yml +0 -53
- package/.github/workflows/publish.yml +0 -25
- package/Dockerfile +0 -59
- package/config/collector.example.yaml +0 -43
- package/docker-compose.yml +0 -26
- package/scripts/sign-release.sh +0 -29
- package/src/__tests__/cli.test.ts +0 -74
- package/src/__tests__/reboot-marker.test.ts +0 -122
- package/src/alerts/evaluator.ts +0 -15
- package/src/alerts/rules.ts +0 -283
- package/src/alerts/state.ts +0 -92
- package/src/cli.ts +0 -112
- package/src/collect/__tests__/ipmi.test.ts +0 -96
- package/src/collect/__tests__/smart.test.ts +0 -68
- package/src/collect/__tests__/system.test.ts +0 -29
- package/src/collect/__tests__/zfs.test.ts +0 -72
- package/src/collect/conntrack.ts +0 -27
- package/src/collect/cpu.ts +0 -92
- package/src/collect/disks.ts +0 -91
- package/src/collect/fd.ts +0 -31
- package/src/collect/io-errors.ts +0 -23
- package/src/collect/io-latency.ts +0 -103
- package/src/collect/ipmi.ts +0 -207
- package/src/collect/memory.ts +0 -30
- package/src/collect/network.ts +0 -193
- package/src/collect/ntp.ts +0 -114
- package/src/collect/os-alerts.ts +0 -43
- package/src/collect/raid.ts +0 -40
- package/src/collect/security.ts +0 -268
- package/src/collect/smart.ts +0 -72
- package/src/collect/system.ts +0 -32
- package/src/collect/systemd.ts +0 -33
- package/src/collect/zfs.ts +0 -66
- package/src/config.ts +0 -65
- package/src/index.ts +0 -233
- package/src/lib/__tests__/parse.test.ts +0 -28
- package/src/lib/exec.ts +0 -16
- package/src/lib/parse.ts +0 -29
- package/src/lib/reboot-marker.ts +0 -88
- package/src/lib/types.ts +0 -226
- package/src/lib/version-check.ts +0 -38
- package/src/metrics-server.ts +0 -123
- package/src/notify/email.ts +0 -68
- package/src/notify/slack.ts +0 -46
- package/src/notify/telegram.ts +0 -65
- package/src/push/forge.ts +0 -109
- package/tsconfig.json +0 -15
- package/vitest.config.ts +0 -12
package/src/collect/smart.ts
DELETED
|
@@ -1,72 +0,0 @@
|
|
|
1
|
-
import { run } from "../lib/exec.js";
|
|
2
|
-
import { readdirSync } from "fs";
|
|
3
|
-
import type { SmartInfo } from "../lib/types.js";
|
|
4
|
-
|
|
5
|
-
/**
 * Collect SMART data for the block devices listed under `/sys/block`.
 *
 * Entries whose name starts with `sd`, `nvme` or `hd` are mapped to
 * `/dev/<name>` and queried one after another with
 * `smartctl --json --all <device>`. A device is skipped when the command
 * yields no output or the output cannot be parsed.
 *
 * @returns One `SmartInfo` per successfully parsed device; an empty array
 *          when `/sys/block` cannot be read.
 */
export async function collectSmart(): Promise<SmartInfo[]> {
  const wantedPrefixes = ["sd", "nvme", "hd"];

  let blockNames: string[];
  try {
    blockNames = readdirSync("/sys/block");
  } catch {
    return [];
  }

  const devicePaths = blockNames
    .filter((name) => wantedPrefixes.some((prefix) => name.startsWith(prefix)))
    .map((name) => `/dev/${name}`);

  const collected: SmartInfo[] = [];
  for (const device of devicePaths) {
    const raw = await run("smartctl", ["--json", "--all", device]);
    if (!raw) continue;

    try {
      collected.push(parseSmartctlJson(JSON.parse(raw), device));
    } catch {
      // Failed to parse, skip this device
    }
  }

  return collected;
}
|
|
34
|
-
|
|
35
|
-
/**
 * Convert the JSON document printed by `smartctl --json --all` into a
 * `SmartInfo` record.
 *
 * - `model` prefers `model_name`, then `model_family`, then "unknown".
 * - `health` is "PASSED" only when `smart_status.passed` is truthy; a missing
 *   `smart_status` is therefore reported as "FAILED".
 * - When an NVMe health log is present, its `percentage_used` is copied and
 *   its temperature takes precedence over `temperature.current`. If the log
 *   carries no temperature, the generic reading is kept rather than erased.
 * - ATA attributes 5 (Reallocated_Sector_Ct) and 197 (Current_Pending_Sector)
 *   are matched by id or name; a matched attribute without a raw value
 *   counts as 0.
 *
 * @param data   Parsed smartctl JSON output.
 * @param device Device path the output belongs to, copied into the result.
 * @returns The populated `SmartInfo`.
 */
export function parseSmartctlJson(data: Record<string, unknown> & {
  model_name?: string;
  model_family?: string;
  smart_status?: { passed?: boolean };
  temperature?: { current?: number };
  power_on_time?: { hours?: number };
  nvme_smart_health_information_log?: { percentage_used?: number; temperature?: number };
  ata_smart_attributes?: { table?: Array<{ id?: number; name?: string; raw?: { value?: number } }> };
}, device: string): SmartInfo {
  const info: SmartInfo = {
    device,
    model: data.model_name || data.model_family || "unknown",
    health: data.smart_status?.passed ? "PASSED" : "FAILED",
    temperature_c: data.temperature?.current,
    power_on_hours: data.power_on_time?.hours,
  };

  // NVMe specific
  const nvme = data.nvme_smart_health_information_log;
  if (nvme) {
    info.percentage_used = nvme.percentage_used;
    // Only override the generic temperature when the NVMe log actually has one.
    info.temperature_c = nvme.temperature ?? info.temperature_c;
  }

  // SATA specific
  for (const attr of data.ata_smart_attributes?.table ?? []) {
    if (attr.id === 5 || attr.name === "Reallocated_Sector_Ct") {
      info.reallocated_sectors = attr.raw?.value ?? 0;
    }
    if (attr.id === 197 || attr.name === "Current_Pending_Sector") {
      info.pending_sectors = attr.raw?.value ?? 0;
    }
  }

  return info;
}
|
package/src/collect/system.ts
DELETED
|
@@ -1,32 +0,0 @@
|
|
|
1
|
-
import { hostname } from "os";
|
|
2
|
-
import { readProcFile } from "../lib/parse.js";
|
|
3
|
-
import { run } from "../lib/exec.js";
|
|
4
|
-
import type { SystemInfo } from "../lib/types.js";
|
|
5
|
-
|
|
6
|
-
/**
 * Look up a single `KEY=value` assignment in os-release text.
 *
 * The value may be bare (`ID=ubuntu`), double-quoted (`ID="rocky"`) or
 * single-quoted (`ID='arch'`); matching surrounding quotes are stripped. The
 * key is matched literally at the start of a line, so regex metacharacters in
 * `key` have no special meaning and `ID` does not match `ID_LIKE`.
 *
 * @param osRelease Full text of an os-release file.
 * @param key       Field name to look up, e.g. "ID".
 * @returns The value lower-cased, or `undefined` when the key is absent or
 *          has an empty value.
 */
export function readOsReleaseField(osRelease: string, key: string): string | undefined {
  // Escape the key so callers cannot accidentally inject regex syntax.
  const escapedKey = key.replace(/[.*+?^${}()|[\]\\]/g, "\\$&");
  const m = osRelease.match(new RegExp(`^${escapedKey}=(["']?)(.+?)\\1$`, "m"));
  return m ? m[2].toLowerCase() : undefined;
}
|
|
12
|
-
|
|
13
|
-
/**
 * Gather basic host identity: hostname, first reported IP address, OS name
 * and ids, kernel release and uptime.
 *
 * Sources: `/etc/os-release` (PRETTY_NAME, ID, ID_LIKE), `uname -r`,
 * `/proc/uptime` and `hostname -I`. Anything that cannot be read falls back
 * to "Unknown" / "unknown" / 0; `os_id` and `os_id_like` are omitted from
 * the result when not found.
 */
export async function collectSystem(): Promise<SystemInfo> {
  const osReleaseText = readProcFile("/etc/os-release") || "";
  const prettyName = /PRETTY_NAME="(.+?)"/.exec(osReleaseText);
  const os_id = readOsReleaseField(osReleaseText, "ID");
  const os_id_like = readOsReleaseField(osReleaseText, "ID_LIKE");

  const unameOutput = await run("uname", ["-r"]);

  // /proc/uptime starts with the uptime in seconds (fractional); keep whole seconds.
  const [uptimeField] = (readProcFile("/proc/uptime") || "0").split(" ");

  // `hostname -I` prints space-separated addresses; only the first is reported.
  const addressOutput = await run("hostname", ["-I"]);

  return {
    hostname: hostname(),
    ip: addressOutput?.trim().split(" ")[0] || "unknown",
    os: prettyName?.[1] || "Unknown",
    ...(os_id ? { os_id } : {}),
    ...(os_id_like ? { os_id_like } : {}),
    kernel: unameOutput?.trim() || "unknown",
    uptime_seconds: Math.floor(parseFloat(uptimeField)),
  };
}
|
package/src/collect/systemd.ts
DELETED
|
@@ -1,33 +0,0 @@
|
|
|
1
|
-
import { run } from "../lib/exec.js";
|
|
2
|
-
|
|
3
|
-
/** Result of scanning systemd for failed service units. */
export interface SystemdData {
  /** Names of the failed `.service` units that were not excluded. */
  failed_units: string[];
  /** Number of entries in `failed_units`. */
  failed_count: number;
}

// Units commonly in failed state by design or misconfiguration
const DEFAULT_EXCLUDES = ["systemd-networkd-wait-online.service"];
|
|
12
|
-
|
|
13
|
-
/**
 * List failed systemd service units.
 *
 * Runs `systemctl list-units --type=service --state=failed --no-legend
 * --plain` and takes the first whitespace-separated column of each output
 * line as the unit name. Names that do not end in `.service`, or that appear
 * in `DEFAULT_EXCLUDES` or `extraExcludes`, are dropped.
 *
 * @param extraExcludes Additional unit names to ignore.
 * @returns The remaining unit names and their count; empty when the command
 *          produced no output.
 */
export async function collectSystemd(extraExcludes: string[] = []): Promise<SystemdData> {
  const listing = await run("systemctl", [
    "list-units", "--type=service", "--state=failed", "--no-legend", "--plain",
  ]);

  const text = listing?.trim();
  if (!text) {
    return { failed_units: [], failed_count: 0 };
  }

  const ignored = new Set([...DEFAULT_EXCLUDES, ...extraExcludes]);
  const failed: string[] = [];

  for (const row of text.split("\n")) {
    const [name] = row.trim().split(/\s+/);
    if (!name || !name.endsWith(".service") || ignored.has(name)) continue;
    failed.push(name);
  }

  return { failed_units: failed, failed_count: failed.length };
}
|
package/src/collect/zfs.ts
DELETED
|
@@ -1,66 +0,0 @@
|
|
|
1
|
-
import { run } from "../lib/exec.js";
|
|
2
|
-
import type { ZfsData, ZfsPool } from "../lib/types.js";
|
|
3
|
-
|
|
4
|
-
/**
 * Collect ZFS pool status when ZFS tooling is present.
 *
 * Checks for the binary with `which zpool` (3 s timeout), then parses the
 * output of `zpool status` (10 s timeout).
 *
 * @returns `{ pools }` when at least one pool was parsed; `null` when zpool
 *          is not found, prints nothing, or no pool could be parsed.
 */
export async function collectZfs(): Promise<ZfsData | null> {
  const located = await run("which", ["zpool"], 3000);
  if (!located?.trim()) return null;

  const statusText = await run("zpool", ["status"], 10000);
  if (!statusText?.trim()) return null;

  const pools = parseZpoolStatus(statusText);
  return pools.length > 0 ? { pools } : null;
}
|
|
16
|
-
|
|
17
|
-
/**
 * Parse the text printed by `zpool status` into one record per pool.
 *
 * A `pool:` line starts a new record (state "UNKNOWN", empty errors text).
 * Subsequent `state:` and `errors:` lines fill in that record. A line
 * containing `scan:` either marks the pool as never scrubbed ("none
 * requested") or, otherwise, contributes the repaired amount / error count
 * of a "scrub repaired … with N errors" message and the text following the
 * first "on " as the last scrub date. Lines before the first `pool:` line
 * are ignored.
 *
 * @param zpoolStatus Raw `zpool status` output.
 * @returns Pools in the order they appear; empty when none are found.
 */
export function parseZpoolStatus(zpoolStatus: string): ZfsPool[] {
  const poolLine = /^\s*pool:\s*(.+)/;
  const stateLine = /^\s*state:\s*(.+)/;
  const errorsLine = /^\s*errors:\s*(.+)/;
  const scrubSummary = /scrub repaired (\S+) in .* with (\d+) errors/;
  const trailingDate = /on (.+)$/;

  const pools: ZfsPool[] = [];
  let active: ZfsPool | undefined;

  for (const line of zpoolStatus.split("\n")) {
    const poolName = poolLine.exec(line)?.[1];
    if (poolName !== undefined) {
      active = { name: poolName.trim(), state: "UNKNOWN", errors_text: "" };
      pools.push(active);
      continue;
    }

    // Nothing to attach fields to until the first pool header is seen.
    if (!active) continue;

    const state = stateLine.exec(line)?.[1];
    if (state !== undefined) {
      active.state = state.trim();
      continue;
    }

    const errorsText = errorsLine.exec(line)?.[1];
    if (errorsText !== undefined) {
      active.errors_text = errorsText.trim();
      continue;
    }

    // Parse scrub info
    if (!line.includes("scan:")) continue;

    if (line.includes("none requested")) {
      active.scrub_never_run = true;
      continue;
    }

    const summary = scrubSummary.exec(line);
    if (summary) {
      active.scrub_repaired = summary[1];
      active.scrub_errors = Number.parseInt(summary[2], 10) || 0;
    }

    const finishedOn = trailingDate.exec(line);
    if (finishedOn) {
      active.last_scrub_date = finishedOn[1].trim();
    }
  }

  return pools;
}
|
package/src/config.ts
DELETED
|
@@ -1,65 +0,0 @@
|
|
|
1
|
-
import { readFileSync } from "fs";
|
|
2
|
-
import { parse } from "yaml";
|
|
3
|
-
import { z } from "zod";
|
|
4
|
-
|
|
5
|
-
/** `collection` section: collection cadence and optional collectors. */
const collectionSchema = z.object({
  interval_seconds: z.number().min(60).max(3600).default(300),
  ipmi: z.boolean().default(true),
  smart: z.boolean().default(true),
});

/** `forge` section: snapshot push target. */
const forgeSchema = z.object({
  enabled: z.boolean().default(false),
  url: z.string().default("https://forge.glassmkr.com"),
  api_key: z.string().default(""),
  tls_pin: z.string().default(""),
});

/** `thresholds` section: alert thresholds. */
const thresholdsSchema = z.object({
  ram_percent: z.number().default(90),
  swap_alert: z.boolean().default(true),
  disk_percent: z.number().default(85),
  iowait_percent: z.number().default(20),
  nvme_wear_percent: z.number().default(85),
  disk_latency_nvme_ms: z.number().default(50),
  disk_latency_hdd_ms: z.number().default(200),
  cpu_temp_warning_c: z.number().default(80),
  cpu_temp_critical_c: z.number().default(90),
  interface_utilization_percent: z.number().default(90),
});

/** `channels.telegram` section. */
const telegramSchema = z.object({
  enabled: z.boolean().default(false),
  bot_token: z.string().default(""),
  chat_id: z.string().default(""),
});

/** `channels.email` section. */
const emailSchema = z.object({
  enabled: z.boolean().default(false),
  to: z.string().default(""),
});

/** `channels.slack` section. */
const slackSchema = z.object({
  enabled: z.boolean().default(false),
  webhook_url: z.string().default(""),
});

/** `prometheus` section: metrics endpoint. */
const prometheusSchema = z.object({
  enabled: z.boolean().default(false),
  port: z.number().default(9101),
});

/**
 * Schema for the collector configuration file. Every section and every field
 * has a default, so an empty object validates to a complete configuration.
 */
const ConfigSchema = z.object({
  server_name: z.string().default("unnamed-server"),
  collection: collectionSchema.default({}),
  forge: forgeSchema.default({}),
  thresholds: thresholdsSchema.default({}),
  channels: z.object({
    telegram: telegramSchema.default({}),
    email: emailSchema.default({}),
    slack: slackSchema.default({}),
  }).default({}),
  prometheus: prometheusSchema.default({}),
});

/** Fully-defaulted configuration as produced by `ConfigSchema`. */
export type Config = z.infer<typeof ConfigSchema>;
|
|
52
|
-
|
|
53
|
-
/**
 * Load and validate the collector configuration from a YAML file.
 *
 * - A missing file (ENOENT) is not an error: a notice is logged and the
 *   schema defaults are returned.
 * - A file that parses to nothing (empty, or comments only) is treated as an
 *   empty mapping, so it also yields the defaults instead of failing
 *   validation on a `null` document.
 *
 * @param path Location of the YAML config file.
 * @returns The validated configuration with defaults applied.
 * @throws Any other read error, YAML syntax error or schema validation error.
 */
export function loadConfig(path: string): Config {
  try {
    const raw = readFileSync(path, "utf-8");
    // The YAML parser yields null for an empty document; fall back to `{}`.
    return ConfigSchema.parse(parse(raw) ?? {});
  } catch (err: unknown) {
    const code =
      typeof err === "object" && err !== null ? (err as { code?: unknown }).code : undefined;
    if (code === "ENOENT") {
      console.log(`[config] No config file at ${path}, using defaults`);
      return ConfigSchema.parse({});
    }
    throw err;
  }
}
|
package/src/index.ts
DELETED
|
@@ -1,233 +0,0 @@
|
|
|
1
|
-
#!/usr/bin/env node
|
|
2
|
-
|
|
3
|
-
import { readFileSync } from "node:fs";
|
|
4
|
-
import { fileURLToPath } from "node:url";
|
|
5
|
-
import { dirname, join } from "node:path";
|
|
6
|
-
import { parseCliArgs } from "./cli.js";
|
|
7
|
-
|
|
8
|
-
const __dirname = dirname(fileURLToPath(import.meta.url));

/**
 * Read the `version` field of the package.json located one directory above
 * this module. Yields "0.0.0" when the file cannot be read or parsed, or has
 * no version.
 */
function readPackageVersion() {
  try {
    const manifestPath = join(__dirname, "..", "package.json");
    const manifest = JSON.parse(readFileSync(manifestPath, "utf8"));
    return manifest.version || "0.0.0";
  } catch {
    return "0.0.0";
  }
}

const PKG_VERSION = readPackageVersion();

// --version, --help and the planned-reboot subcommands are handled here and
// exit the process, before the config file is loaded or the Prometheus
// server is started. This keeps the CLI usable on hosts that lack the config
// file or external tools.
const { result: cliArgs, output: cliOutput } = parseCliArgs(process.argv.slice(2), PKG_VERSION);

if (cliArgs.mode === "version" || cliArgs.mode === "help") {
  console.log(cliOutput);
  process.exit(0);
}

if (cliArgs.mode === "mark-reboot" || cliArgs.mode === "reboot") {
  const rebootMarker = await import("./lib/reboot-marker.js");

  const ttlMs = cliArgs.ttl ? rebootMarker.parseDuration(cliArgs.ttl) : rebootMarker.DEFAULT_TTL_MS;
  if (ttlMs === null) {
    console.error(`[mark-reboot] invalid --ttl value: ${cliArgs.ttl}. Use e.g. 10m, 2h, 600s.`);
    process.exit(2);
  }

  try {
    const written = rebootMarker.writeRebootMarker({ reason: cliArgs.reason, ttlMs });
    const reasonSuffix = cliArgs.reason ? `, reason: ${cliArgs.reason}` : "";
    console.log(`[${cliArgs.mode}] marker written: ${written.path} (expires ${written.expires_at}${reasonSuffix})`);
  } catch (err: any) {
    console.error(`[${cliArgs.mode}] failed to write marker: ${err?.message || err}`);
    console.error(`  Most likely cause: need root privileges to write under /var/lib/crucible/.`);
    process.exit(1);
  }

  // `reboot` additionally triggers the reboot once the marker is in place.
  if (cliArgs.mode === "reboot") {
    const { execFileSync } = await import("node:child_process");
    console.log("[reboot] invoking systemctl reboot");
    try {
      execFileSync("systemctl", ["reboot"], { stdio: "inherit" });
    } catch (err: any) {
      console.error(`[reboot] systemctl reboot failed: ${err?.message || err}`);
      process.exit(1);
    }
  }

  process.exit(0);
}
|
|
56
|
-
|
|
57
|
-
import { loadConfig } from "./config.js";
|
|
58
|
-
import { checkForUpdates } from "./lib/version-check.js";
|
|
59
|
-
import { startMetricsServer, updateMetrics } from "./metrics-server.js";
|
|
60
|
-
import { collectSystem } from "./collect/system.js";
|
|
61
|
-
import { collectCpu } from "./collect/cpu.js";
|
|
62
|
-
import { collectMemory } from "./collect/memory.js";
|
|
63
|
-
import { collectDisks } from "./collect/disks.js";
|
|
64
|
-
import { collectSmart } from "./collect/smart.js";
|
|
65
|
-
import { collectNetwork } from "./collect/network.js";
|
|
66
|
-
import { collectRaid } from "./collect/raid.js";
|
|
67
|
-
import { collectIpmi } from "./collect/ipmi.js";
|
|
68
|
-
import { collectOsAlerts } from "./collect/os-alerts.js";
|
|
69
|
-
import { evaluateAlerts } from "./alerts/evaluator.js";
|
|
70
|
-
import { updateAlertState } from "./alerts/state.js";
|
|
71
|
-
import { sendTelegram } from "./notify/telegram.js";
|
|
72
|
-
import { sendSlack } from "./notify/slack.js";
|
|
73
|
-
import { sendEmail } from "./notify/email.js";
|
|
74
|
-
import { pushToForge, initForgeAgent } from "./push/forge.js";
|
|
75
|
-
import { collectSecurity, type SecurityData } from "./collect/security.js";
|
|
76
|
-
import { collectZfs } from "./collect/zfs.js";
|
|
77
|
-
import { collectIoErrors } from "./collect/io-errors.js";
|
|
78
|
-
import { collectIoLatency } from "./collect/io-latency.js";
|
|
79
|
-
import { collectConntrack } from "./collect/conntrack.js";
|
|
80
|
-
import { collectSystemd } from "./collect/systemd.js";
|
|
81
|
-
import { collectNtp } from "./collect/ntp.js";
|
|
82
|
-
import { collectFileDescriptors } from "./collect/fd.js";
|
|
83
|
-
import type { Snapshot, IpmiInfo } from "./lib/types.js";
|
|
84
|
-
import { consumeRebootMarker, type PlannedReboot } from "./lib/reboot-marker.js";
|
|
85
|
-
|
|
86
|
-
// The planned-reboot marker is consumed exactly once, at startup. When the
// operator ran `crucible-agent mark-reboot` / `reboot` before this boot, the
// marker is found here and flagged on the first snapshot only;
// consumeRebootMarker() removes the file so later starts do not see it again.
const plannedRebootFlag: PlannedReboot | null = consumeRebootMarker();
if (plannedRebootFlag) {
  const reasonSuffix = plannedRebootFlag.reason ? `: ${plannedRebootFlag.reason}` : "";
  console.log(`[collector] Planned reboot acknowledged${reasonSuffix}`);
}
// Set by collect() once the flag has been attached to a snapshot.
let plannedRebootConsumed = false;

const config = loadConfig(cliArgs.configPath);

const enabledLabel = (on: boolean): string => (on ? "enabled" : "disabled");

console.log(`[collector] Starting. Server: ${config.server_name}. Interval: ${config.collection.interval_seconds}s`);
console.log(`[collector] IPMI: ${enabledLabel(config.collection.ipmi)}, SMART: ${enabledLabel(config.collection.smart)}`);
console.log(`[collector] Forge: ${config.forge.enabled ? config.forge.url : "disabled"}`);
console.log(`[collector] Prometheus: ${config.prometheus.enabled ? `:${config.prometheus.port}/metrics` : "disabled"}`);

// Prometheus metrics endpoint (opt-in).
if (config.prometheus.enabled) {
  startMetricsServer(config.prometheus.port);
}

// TLS pinning for Forge, only when a pin is configured.
if (config.forge.tls_pin) {
  initForgeAgent(config.forge.tls_pin);
  console.log("[collector] TLS pinning enabled for Forge");
}

// Placeholder reported when IPMI collection is disabled.
const emptyIpmi: IpmiInfo = {
  available: false,
  sensors: [],
  ecc_errors: { correctable: 0, uncorrectable: 0 },
  sel_entries_count: 0,
  sel_events_recent: [],
  fans: [],
};

// Cycle counter and cache used by collect() so that security checks are only
// refreshed periodically (hourly at the default 5-minute interval) and the
// last result is reused in between.
let securityCycleCount = 0;
let cachedSecurity: SecurityData | undefined;
|
|
119
|
-
|
|
120
|
-
/**
 * Run one collection cycle:
 *
 * 1. gather the core collectors in parallel (SMART and IPMI only when
 *    enabled in the config),
 * 2. refresh security data about once per hour, reusing the cached result
 *    between refreshes,
 * 3. build the snapshot, attach the planned-reboot flag to the first one
 *    only, and add the optional per-cycle collectors (each is skipped on
 *    error),
 * 4. update Prometheus metrics, evaluate alerts, notify the enabled channels
 *    about new/resolved alerts, push to Forge and trigger the update check,
 * 5. print a human-readable summary after the first cycle.
 */
async function collect() {
  const startTime = Date.now();
  console.log(`[collector] Collecting...`);

  const [system, cpu, memory, disks, smart, network, raid, ipmi, osAlerts] = await Promise.all([
    collectSystem(),
    collectCpu(),
    collectMemory(),
    collectDisks(),
    config.collection.smart ? collectSmart() : Promise.resolve([]),
    collectNetwork(),
    collectRaid(),
    config.collection.ipmi ? collectIpmi() : Promise.resolve(emptyIpmi),
    collectOsAlerts(),
  ]);

  // Security checks: run about once per hour, reuse cached data between runs.
  // The number of cycles per hour follows the configured interval (12 at the
  // default 300 s), so the cadence stays hourly for other intervals too.
  const securityRefreshCycles = Math.max(1, Math.round(3600 / config.collection.interval_seconds));
  securityCycleCount++;
  if (securityCycleCount >= securityRefreshCycles || !cachedSecurity) {
    securityCycleCount = 0;
    try { cachedSecurity = await collectSecurity(); } catch (err) { console.error("[security] Collection error:", err); }
  }

  const snapshot: Snapshot = {
    collector_version: PKG_VERSION,
    timestamp: new Date().toISOString(),
    system, cpu, memory, disks, smart, network, raid, ipmi, os_alerts: osAlerts,
    security: cachedSecurity,
  };

  // Single-shot: the very first snapshot after a marked reboot carries
  // the flag, subsequent snapshots do not.
  if (plannedRebootFlag && !plannedRebootConsumed) {
    (snapshot as any).expected_reboot = true;
    if (plannedRebootFlag.reason) (snapshot as any).expected_reboot_reason = plannedRebootFlag.reason;
    plannedRebootConsumed = true;
  }

  // Optional collectors run every cycle; a failure in one leaves its field
  // unset and does not affect the others.
  try { snapshot.zfs = await collectZfs() ?? undefined; } catch { /* skip if ZFS not available */ }
  try { snapshot.io_errors = await collectIoErrors() ?? undefined; } catch { /* skip on error */ }
  try { snapshot.io_latency = collectIoLatency(); } catch { /* skip on error */ }
  try { snapshot.conntrack = collectConntrack(); } catch { /* skip on error */ }
  try { snapshot.systemd = await collectSystemd(); } catch { /* skip on error */ }
  try { snapshot.ntp = await collectNtp(); } catch { /* skip on error */ }
  try { snapshot.file_descriptors = collectFileDescriptors(); } catch { /* skip on error */ }

  // Update Prometheus metrics
  updateMetrics(snapshot);

  // Evaluate alerts
  const alertResults = evaluateAlerts(snapshot, config.thresholds);
  const { newAlerts, resolvedAlerts } = updateAlertState(alertResults);

  const elapsed = Date.now() - startTime;
  console.log(`[collector] Collected in ${elapsed}ms. Alerts: ${alertResults.length} active, ${newAlerts.length} new, ${resolvedAlerts.length} resolved`);

  // Send notifications for new/resolved alerts
  if (newAlerts.length > 0 || resolvedAlerts.length > 0) {
    if (config.channels.telegram.enabled && config.channels.telegram.bot_token && config.channels.telegram.chat_id) {
      await sendTelegram(config.channels.telegram.bot_token, config.channels.telegram.chat_id, newAlerts, resolvedAlerts, config.server_name);
    }
    if (config.channels.slack.enabled && config.channels.slack.webhook_url) {
      await sendSlack(config.channels.slack.webhook_url, newAlerts, resolvedAlerts, config.server_name);
    }
    if (config.channels.email.enabled && config.channels.email.to) {
      await sendEmail(config.channels.email, newAlerts, resolvedAlerts, config.server_name);
    }
  }

  // Push to Forge (non-blocking)
  if (config.forge.enabled && config.forge.api_key) {
    pushToForge(config.forge.url, config.forge.api_key, snapshot);
  }

  // Check for updates (every 6 hours, non-blocking)
  checkForUpdates(config.forge.enabled ? config.forge.url : undefined);

  // Print summary on first run
  if (firstRun) {
    firstRun = false;
    console.log("");
    console.log("=== First collection complete ===");
    console.log(`Server: ${system.hostname} (${system.os})`);
    console.log(`CPU: ${cpu.user_percent.toFixed(1)}% (load: ${cpu.load_1m})`);
    const ramPct = memory.total_mb > 0 ? ((memory.used_mb / memory.total_mb) * 100).toFixed(1) : "0";
    console.log(`RAM: ${ramPct}% (${memory.used_mb} / ${memory.total_mb} MB)`);
    if (disks.length > 0) console.log(`Disk: ${disks[0].percent_used}% (${disks[0].mount})`);
    console.log(`SMART: ${smart.length > 0 ? `${smart.length} drive(s) checked` : "not available"}`);
    console.log(`Network: ${network.map((n) => n.interface).join(", ") || "none detected"}`);
    console.log(`IPMI: ${ipmi.available ? "available" : "not available"}`);
    console.log(`Active alerts: ${alertResults.length}`);
    console.log(`Forge: ${config.forge.enabled ? "enabled" : "disabled"}`);
    console.log("");
  }
}
|
|
216
|
-
|
|
217
|
-
// Cleared by collect() after it prints the first-run summary.
let firstRun = true;

/**
 * Start one collection cycle and log, rather than propagate, a failure.
 * Without the catch, a rejected collect() promise would surface as an
 * unhandled rejection and take the whole agent down.
 */
function runCollectCycle(): void {
  collect().catch((err: unknown) => {
    console.error("[collector] Collection cycle failed:", err);
  });
}

// Run immediately
runCollectCycle();

// Then on interval
setInterval(runCollectCycle, config.collection.interval_seconds * 1000);

process.on("SIGTERM", () => {
  console.log("[collector] Received SIGTERM, shutting down");
  process.exit(0);
});

process.on("SIGINT", () => {
  console.log("[collector] Received SIGINT, shutting down");
  process.exit(0);
});
|
|
@@ -1,28 +0,0 @@
|
|
|
1
|
-
import { describe, it, expect } from "vitest";
|
|
2
|
-
import { parseKeyValue, parseKb } from "../parse.js";
|
|
3
|
-
|
|
4
|
-
// parseKeyValue: "key: value" lines become entries of a plain object.
describe("parseKeyValue", () => {
  it("parses colon-delimited key/value lines", () => {
    const input = ["Name: foo", "Version: 1.2.3", ""].join("\n");
    const parsed = parseKeyValue(input);
    expect(parsed).toEqual({ Name: "foo", Version: "1.2.3" });
  });

  it("ignores lines with no colon", () => {
    const input = ["no colon here", "A: 1", ""].join("\n");
    expect(parseKeyValue(input)).toEqual({ A: "1" });
  });

  it("trims whitespace around keys and values", () => {
    const parsed = parseKeyValue("  A  :   1  \n");
    expect(parsed).toEqual({ A: "1" });
  });
});
|
|
16
|
-
|
|
17
|
-
// parseKb: numeric prefix of a "<n> kB" value, 0 when nothing usable.
describe("parseKb", () => {
  it("parses a numeric kB value", () => {
    const kb = parseKb("16384 kB");
    expect(kb).toBe(16384);
  });

  it("parses without unit", () => {
    const kb = parseKb("4096");
    expect(kb).toBe(4096);
  });

  it("returns 0 for undefined/bad input", () => {
    const missing = parseKb(undefined);
    const garbage = parseKb("not a number");
    expect(missing).toBe(0);
    expect(garbage).toBe(0);
  });
});
|
package/src/lib/exec.ts
DELETED
|
@@ -1,16 +0,0 @@
|
|
|
1
|
-
import { execFile } from "child_process";
|
|
2
|
-
import { promisify } from "util";
|
|
3
|
-
|
|
4
|
-
const execFileAsync = promisify(execFile);

/**
 * Run an external command without a shell and return its stdout.
 *
 * @param cmd       Executable to run.
 * @param args      Arguments passed to the executable.
 * @param timeoutMs Timeout handed to `execFile`; defaults to 10 seconds.
 * @returns stdout on success. On failure: `null` when the command is not
 *          installed (ENOENT), when the child was killed (e.g. by the
 *          timeout), or when it produced no stdout; otherwise the stdout
 *          captured before the non-zero exit.
 */
export async function run(cmd: string, args: string[], timeoutMs = 10000): Promise<string | null> {
  try {
    const result = await execFileAsync(cmd, args, { timeout: timeoutMs });
    return result.stdout;
  } catch (err: any) {
    // Missing binary, killed child, or nothing captured: no usable output.
    const unusable = err.code === "ENOENT" || err.killed || !err.stdout;
    return unusable ? null : err.stdout;
  }
}
|
package/src/lib/parse.ts
DELETED
|
@@ -1,29 +0,0 @@
|
|
|
1
|
-
import { readFileSync } from "fs";
|
|
2
|
-
|
|
3
|
-
/**
 * Read a file as UTF-8 text.
 *
 * @param path File to read.
 * @returns The file contents, or `null` when the file cannot be read.
 */
export function readProcFile(path: string): string | null {
  let contents: string | null = null;
  try {
    contents = readFileSync(path, "utf-8");
  } catch {
    // Unreadable or missing: leave the result as null.
  }
  return contents;
}
|
|
10
|
-
|
|
11
|
-
/**
 * Parse `key: value` lines into an object.
 *
 * Each line is split at its first colon; both sides are trimmed. Lines
 * without a colon are ignored, and a repeated key keeps its last value.
 *
 * @param raw Newline-separated text.
 * @returns Map from trimmed key to trimmed value.
 */
export function parseKeyValue(raw: string): Record<string, string> {
  const pairs: Record<string, string> = {};
  raw.split("\n").forEach((line) => {
    const colonAt = line.indexOf(":");
    if (colonAt < 0) return;
    const key = line.substring(0, colonAt).trim();
    pairs[key] = line.substring(colonAt + 1).trim();
  });
  return pairs;
}
|
|
20
|
-
|
|
21
|
-
/**
 * Parse a value such as "16384 kB" into its integer part.
 *
 * A trailing "kB" unit (any letter case) is stripped before parsing.
 *
 * @param val Text to parse; may be undefined.
 * @returns The parsed integer, or 0 for missing, empty or non-numeric input.
 */
export function parseKb(val: string | undefined): number {
  if (!val) {
    return 0;
  }
  const withoutUnit = val.replace(/\s*kB$/i, "");
  const parsed = Number.parseInt(withoutUnit, 10);
  if (Number.isNaN(parsed)) {
    return 0;
  }
  return parsed;
}
|
|
26
|
-
|
|
27
|
-
/**
 * Resolve after a delay.
 *
 * @param ms Delay passed to `setTimeout`, in milliseconds.
 * @returns A promise that resolves (with no value) once the timer fires.
 */
export function sleep(ms: number): Promise<void> {
  return new Promise<void>((resolve) => {
    setTimeout(resolve, ms);
  });
}
|
package/src/lib/reboot-marker.ts
DELETED
|
@@ -1,88 +0,0 @@
|
|
|
1
|
-
// Planned-reboot marker handling.
|
|
2
|
-
//
|
|
3
|
-
// An operator signals "the next reboot is expected, don't page me"
|
|
4
|
-
// by writing a short-lived JSON file to disk BEFORE rebooting. The
|
|
5
|
-
// collector reads and deletes it on agent startup; the first
|
|
6
|
-
// post-boot snapshot then carries `expected_reboot: true` so Forge's
|
|
7
|
-
// unexpected_reboot rule stays quiet.
|
|
8
|
-
//
|
|
9
|
-
// Single-use (deleted on read regardless of validity) and TTL-guarded
|
|
10
|
-
// (default 10 min) so a forgotten marker cannot silence a genuine
|
|
11
|
-
// crash reboot weeks later.
|
|
12
|
-
|
|
13
|
-
import { existsSync, readFileSync, unlinkSync, writeFileSync, mkdirSync, chmodSync } from "node:fs";
|
|
14
|
-
import { dirname } from "node:path";
|
|
15
|
-
|
|
16
|
-
/** Default on-disk location of the planned-reboot marker file. */
export const DEFAULT_MARKER_PATH = "/var/lib/crucible/reboot-expected";

/** Default marker lifetime: 10 minutes, in milliseconds. */
export const DEFAULT_TTL_MS = 10 * 60_000;

/** What the collector learns from a valid, unexpired marker. */
export interface PlannedReboot {
  /** Always `true`; the type only exists when a reboot was announced. */
  expected: true;
  /** Optional free-text reason recorded with the marker. */
  reason?: string;
}

/** JSON payload stored in the marker file. */
export interface RebootMarker {
  /** ISO timestamp after which the marker no longer counts. */
  expires_at: string;
  /** Optional free-text reason. */
  reason?: string;
}
|
|
28
|
-
|
|
29
|
-
/**
 * Read and delete the planned-reboot marker at `path`.
 *
 * The file is unlinked whenever it exists, whether or not its contents turn
 * out to be usable, so a malformed or stale marker is one-shot and cannot
 * linger. The payload is validated field by field: it must be a JSON object
 * whose `expires_at` is a string holding a parseable timestamp strictly
 * later than `now`. `reason` is only passed on when it is a string.
 *
 * @param path Marker location; defaults to `DEFAULT_MARKER_PATH`.
 * @param now  Reference time for the expiry check; defaults to the current time.
 * @returns `{ expected: true, reason }` for a valid, unexpired marker;
 *          `null` when the file is missing, unreadable, malformed or expired.
 */
export function consumeRebootMarker(
  path: string = DEFAULT_MARKER_PATH,
  now: Date = new Date(),
): PlannedReboot | null {
  if (!existsSync(path)) return null;

  let raw: string | null = null;
  try { raw = readFileSync(path, "utf-8"); } catch { /* unreadable: still delete below */ }
  // Always delete after the read attempt, regardless of validity.
  try { unlinkSync(path); } catch { /* best effort */ }
  if (raw === null) return null;

  let parsed: unknown;
  try { parsed = JSON.parse(raw); } catch { return null; }
  if (typeof parsed !== "object" || parsed === null) return null;

  const { expires_at, reason } = parsed as { expires_at?: unknown; reason?: unknown };
  if (typeof expires_at !== "string") return null;

  const expiresMs = new Date(expires_at).getTime();
  if (Number.isNaN(expiresMs)) return null;
  if (expiresMs <= now.getTime()) return null; // stale

  // The file content is untrusted: never hand a non-string reason to callers.
  return { expected: true, reason: typeof reason === "string" ? reason : undefined };
}
|
|
53
|
-
|
|
54
|
-
/**
 * Write a planned-reboot marker file. Used by the `mark-reboot` and `reboot`
 * CLI subcommands.
 *
 * The payload is `{ expires_at }` plus `reason` when one is given, where
 * `expires_at` is `now + ttlMs` as an ISO timestamp. The parent directory is
 * created if needed (mode 0700, errors ignored), the file is written with
 * mode 0600 and then chmod-ed to 0600 again (errors ignored).
 *
 * @param opts.reason Optional free-text reason stored in the marker.
 * @param opts.ttlMs  Marker lifetime in ms; defaults to `DEFAULT_TTL_MS`.
 * @param opts.path   Marker location; defaults to `DEFAULT_MARKER_PATH`.
 * @param opts.now    Reference time; defaults to the current time.
 * @returns The path written and the expiry timestamp stored.
 * @throws Whatever `writeFileSync` throws when the file cannot be written.
 */
export function writeRebootMarker(opts: {
  reason?: string;
  ttlMs?: number;
  path?: string;
  now?: Date;
}): { path: string; expires_at: string } {
  const target = opts.path ?? DEFAULT_MARKER_PATH;
  const issuedAt = opts.now ?? new Date();
  const lifetimeMs = opts.ttlMs ?? DEFAULT_TTL_MS;

  const expires_at = new Date(issuedAt.getTime() + lifetimeMs).toISOString();
  const payload: RebootMarker = opts.reason ? { expires_at, reason: opts.reason } : { expires_at };

  try {
    mkdirSync(dirname(target), { recursive: true, mode: 0o700 });
  } catch {
    // Ignored here; a missing directory makes the write below fail instead.
  }

  writeFileSync(target, JSON.stringify(payload), { mode: 0o600 });

  try {
    chmodSync(target, 0o600);
  } catch {
    // Best effort.
  }

  return { path: target, expires_at };
}
|
|
77
|
-
|
|
78
|
-
/**
 * Parse a duration like "10m", "2h", "600s" or "250ms" into milliseconds.
 * Used by the CLI for the `--ttl` flag.
 *
 * The input is trimmed and must be a non-negative integer optionally
 * followed by a unit (`ms`, `s`, `m`, `h`); a bare number is read as seconds.
 *
 * @param s Text to parse.
 * @returns The duration in milliseconds, or `null` when the text does not
 *          match or the number is not finite.
 */
export function parseDuration(s: string): number | null {
  const parts = s.trim().match(/^(\d+)\s*(ms|s|m|h)?$/);
  if (parts === null) return null;

  const amount = Number.parseInt(parts[1], 10);
  if (!Number.isFinite(amount) || amount < 0) return null;

  switch (parts[2] ?? "s") {
    case "ms":
      return amount * 1;
    case "s":
      return amount * 1000;
    case "m":
      return amount * 60_000;
    default:
      // Only "h" remains once the regex has matched.
      return amount * 3_600_000;
  }
}
|