@glassmkr/crucible 0.6.5 → 0.7.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -13,10 +13,26 @@ interface PreviousCounters {
13
13
  tx_errors: number;
14
14
  rx_drops: number;
15
15
  tx_drops: number;
16
+ rx_packets: number;
17
+ tx_packets: number;
18
+ rx_crc_errors?: number;
19
+ rx_frame_errors?: number;
20
+ rx_length_errors?: number;
21
+ tx_carrier_errors?: number;
16
22
  }
17
23
 
18
24
  const previousCounters = new Map<string, PreviousCounters>();
19
25
 
26
/**
 * Read a single cumulative counter from
 * `/sys/class/net/<iface>/statistics/<name>`.
 *
 * @param iface Interface name used as the sysfs directory component.
 * @param name  Counter file name inside the `statistics/` directory.
 * @returns The counter parsed as a base-10 integer, or `undefined` when the
 *          file cannot be read or its content does not parse to a finite number.
 */
function readStatCounter(iface: string, name: string): number | undefined {
  let text: string;
  try {
    text = readFileSync(`/sys/class/net/${iface}/statistics/${name}`, "utf-8");
  } catch {
    // Missing interface, missing counter file, or no permission: report "unavailable".
    return undefined;
  }
  const parsed = parseInt(text.trim(), 10);
  if (!Number.isFinite(parsed)) return undefined;
  return parsed;
}
35
+
20
36
  function parseNetDev(): Record<string, IfaceStats> {
21
37
  const raw = readProcFile("/proc/net/dev") || "";
22
38
  const result: Record<string, IfaceStats> = {};
@@ -66,6 +82,14 @@ function getBondMaster(iface: string): string | undefined {
66
82
  return undefined;
67
83
  }
68
84
 
85
/**
 * Report whether `iface` has an entry under `/proc/net/bonding/`.
 *
 * @param iface Interface name to look up.
 * @returns `true` when the directory listing contains `iface`; `false` when
 *          it does not or when the directory cannot be listed.
 */
function isBondMaster(iface: string): boolean {
  let bondEntries: string[];
  try {
    bondEntries = readdirSync("/proc/net/bonding/");
  } catch {
    // Directory cannot be listed (e.g. it does not exist): treat as "not a bond master".
    return false;
  }
  return bondEntries.some((entry) => entry === iface);
}
92
+
69
93
  // Compute delta, handling counter wraps (current < previous means reset, use current as delta)
70
94
  function delta(current: number, previous: number): number {
71
95
  if (current >= previous) return current - previous;
@@ -87,17 +111,37 @@ export async function collectNetwork(): Promise<NetworkInfo[]> {
87
111
 
88
112
  const prev = previousCounters.get(name);
89
113
 
114
+ // /sys/class/net/*/statistics/ exposes finer-grained RX/TX subtype
115
+ // counters than /proc/net/dev. Read cumulative values here; delta is
116
+ // derived below against the previous cycle's snapshot.
117
+ const rxCrcCum = readStatCounter(name, "rx_crc_errors");
118
+ const rxFrameCum = readStatCounter(name, "rx_frame_errors");
119
+ const rxLenCum = readStatCounter(name, "rx_length_errors");
120
+ const txCarrierCum = readStatCounter(name, "tx_carrier_errors");
121
+
90
122
  // Compute error/drop deltas (0 on first cycle after start or new interface)
91
123
  let rxErrorsDelta = 0;
92
124
  let txErrorsDelta = 0;
93
125
  let rxDropsDelta = 0;
94
126
  let txDropsDelta = 0;
127
+ let rxPacketsDelta = 0;
128
+ let txPacketsDelta = 0;
129
+ let rxCrcDelta: number | undefined;
130
+ let rxFrameDelta: number | undefined;
131
+ let rxLenDelta: number | undefined;
132
+ let txCarrierDelta: number | undefined;
95
133
 
96
134
  if (prev) {
97
135
  rxErrorsDelta = delta(s2.rx_errors, prev.rx_errors);
98
136
  txErrorsDelta = delta(s2.tx_errors, prev.tx_errors);
99
137
  rxDropsDelta = delta(s2.rx_drops, prev.rx_drops);
100
138
  txDropsDelta = delta(s2.tx_drops, prev.tx_drops);
139
+ rxPacketsDelta = delta(s2.rx_packets, prev.rx_packets);
140
+ txPacketsDelta = delta(s2.tx_packets, prev.tx_packets);
141
+ if (rxCrcCum != null && prev.rx_crc_errors != null) rxCrcDelta = delta(rxCrcCum, prev.rx_crc_errors);
142
+ if (rxFrameCum != null && prev.rx_frame_errors != null) rxFrameDelta = delta(rxFrameCum, prev.rx_frame_errors);
143
+ if (rxLenCum != null && prev.rx_length_errors != null) rxLenDelta = delta(rxLenCum, prev.rx_length_errors);
144
+ if (txCarrierCum != null && prev.tx_carrier_errors != null) txCarrierDelta = delta(txCarrierCum, prev.tx_carrier_errors);
101
145
  }
102
146
 
103
147
  // Store current cumulative values for next cycle
@@ -106,6 +150,12 @@ export async function collectNetwork(): Promise<NetworkInfo[]> {
106
150
  tx_errors: s2.tx_errors,
107
151
  rx_drops: s2.rx_drops,
108
152
  tx_drops: s2.tx_drops,
153
+ rx_packets: s2.rx_packets,
154
+ tx_packets: s2.tx_packets,
155
+ rx_crc_errors: rxCrcCum,
156
+ rx_frame_errors: rxFrameCum,
157
+ rx_length_errors: rxLenCum,
158
+ tx_carrier_errors: txCarrierCum,
109
159
  });
110
160
 
111
161
  const entry: NetworkInfo = {
@@ -117,10 +167,18 @@ export async function collectNetwork(): Promise<NetworkInfo[]> {
117
167
  tx_errors: txErrorsDelta,
118
168
  rx_drops: rxDropsDelta,
119
169
  tx_drops: txDropsDelta,
170
+ rx_packets: rxPacketsDelta,
171
+ tx_packets: txPacketsDelta,
120
172
  operstate: getOperstate(name),
121
173
  };
174
+ if (rxCrcDelta !== undefined) entry.rx_crc_errors = rxCrcDelta;
175
+ if (rxFrameDelta !== undefined) entry.rx_frame_errors = rxFrameDelta;
176
+ if (rxLenDelta !== undefined) entry.rx_length_errors = rxLenDelta;
177
+ if (txCarrierDelta !== undefined) entry.tx_carrier_errors = txCarrierDelta;
122
178
  const master = getBondMaster(name);
123
179
  if (master) entry.bond_master = master;
180
+ // Identify bond masters (interfaces that have an entry under /proc/net/bonding/).
181
+ if (isBondMaster(name)) entry.is_bond_master = true;
124
182
  results.push(entry);
125
183
  }
126
184
 
@@ -3,9 +3,18 @@ import { readProcFile } from "../lib/parse.js";
3
3
  import { run } from "../lib/exec.js";
4
4
  import type { SystemInfo } from "../lib/types.js";
5
5
 
6
/**
 * Extract one `KEY=value` field from os-release text.
 *
 * The value may be bare or wrapped in a matching pair of double or single
 * quotes (`ID=ubuntu`, `ID="rocky"` and `ID='arch'` are all accepted).
 * Trailing spaces/tabs after the value are ignored.
 *
 * @param osRelease Full text of an os-release file.
 * @param key       Field name; matched literally at the start of a line.
 * @returns The value lowercased, or `undefined` when no line defines `key`.
 */
export function readOsReleaseField(osRelease: string, key: string): string | undefined {
  // Escape regex metacharacters so the key is matched literally.
  const escapedKey = key.replace(/[.*+?^${}()|[\]\\]/g, "\\$&");
  // Group 1 captures the optional opening quote; \1 requires the same closer.
  const m = osRelease.match(new RegExp(`^${escapedKey}=(["']?)(.+?)\\1[ \\t]*$`, "m"));
  return m ? m[2].toLowerCase() : undefined;
}
12
+
6
13
  export async function collectSystem(): Promise<SystemInfo> {
7
14
  const osRelease = readProcFile("/etc/os-release") || "";
8
15
  const osName = osRelease.match(/PRETTY_NAME="(.+?)"/)?.[1] || "Unknown";
16
+ const os_id = readOsReleaseField(osRelease, "ID");
17
+ const os_id_like = readOsReleaseField(osRelease, "ID_LIKE");
9
18
  const kernel = (await run("uname", ["-r"]))?.trim() || "unknown";
10
19
  const uptimeRaw = readProcFile("/proc/uptime") || "0";
11
20
  const uptimeSeconds = Math.floor(parseFloat(uptimeRaw.split(" ")[0]));
@@ -15,6 +24,8 @@ export async function collectSystem(): Promise<SystemInfo> {
15
24
  hostname: hostname(),
16
25
  ip,
17
26
  os: osName,
27
+ ...(os_id ? { os_id } : {}),
28
+ ...(os_id_like ? { os_id_like } : {}),
18
29
  kernel,
19
30
  uptime_seconds: uptimeSeconds,
20
31
  };
package/src/index.ts CHANGED
@@ -15,14 +15,44 @@ const PKG_VERSION = (() => {
15
15
  }
16
16
  })();
17
17
 
18
- // Handle --version and --help before importing collectors, loading config, or
19
- // starting the Prometheus server. This keeps the CLI responsive even on hosts
20
- // missing the config file or external tools.
18
+ // Handle --version, --help, and planned-reboot subcommands before
19
+ // importing collectors, loading config, or starting the Prometheus
20
+ // server. Keeps the CLI responsive even on hosts missing the config
21
+ // file or external tools.
21
22
  const { result: cliArgs, output: cliOutput } = parseCliArgs(process.argv.slice(2), PKG_VERSION);
22
- if (cliArgs.mode !== "run") {
23
+ if (cliArgs.mode === "version" || cliArgs.mode === "help") {
23
24
  console.log(cliOutput);
24
25
  process.exit(0);
25
26
  }
27
+ if (cliArgs.mode === "mark-reboot" || cliArgs.mode === "reboot") {
28
+ const { writeRebootMarker, parseDuration, DEFAULT_TTL_MS } = await import("./lib/reboot-marker.js");
29
+ const ttlMs = cliArgs.ttl ? parseDuration(cliArgs.ttl) : DEFAULT_TTL_MS;
30
+ if (ttlMs === null) {
31
+ console.error(`[mark-reboot] invalid --ttl value: ${cliArgs.ttl}. Use e.g. 10m, 2h, 600s.`);
32
+ process.exit(2);
33
+ }
34
+ try {
35
+ const { path, expires_at } = writeRebootMarker({
36
+ reason: cliArgs.reason, ttlMs,
37
+ });
38
+ console.log(`[${cliArgs.mode}] marker written: ${path} (expires ${expires_at}${cliArgs.reason ? `, reason: ${cliArgs.reason}` : ""})`);
39
+ } catch (err: any) {
40
+ console.error(`[${cliArgs.mode}] failed to write marker: ${err?.message || err}`);
41
+ console.error(` Most likely cause: need root privileges to write under /var/lib/crucible/.`);
42
+ process.exit(1);
43
+ }
44
+ if (cliArgs.mode === "reboot") {
45
+ const { execFileSync } = await import("node:child_process");
46
+ console.log("[reboot] invoking systemctl reboot");
47
+ try {
48
+ execFileSync("systemctl", ["reboot"], { stdio: "inherit" });
49
+ } catch (err: any) {
50
+ console.error(`[reboot] systemctl reboot failed: ${err?.message || err}`);
51
+ process.exit(1);
52
+ }
53
+ }
54
+ process.exit(0);
55
+ }
26
56
 
27
57
  import { loadConfig } from "./config.js";
28
58
  import { checkForUpdates } from "./lib/version-check.js";
@@ -51,6 +81,17 @@ import { collectSystemd } from "./collect/systemd.js";
51
81
  import { collectNtp } from "./collect/ntp.js";
52
82
  import { collectFileDescriptors } from "./collect/fd.js";
53
83
  import type { Snapshot, IpmiInfo } from "./lib/types.js";
84
+ import { consumeRebootMarker, type PlannedReboot } from "./lib/reboot-marker.js";
85
+
86
+ // Consume the planned-reboot marker once at startup. If the operator ran
87
+ // `crucible-agent mark-reboot` / `reboot` before this boot, the marker
88
+ // exists, we flag it on the first snapshot, and we delete the file (so
89
+ // subsequent snapshots don't keep claiming the reboot was planned).
90
+ const plannedRebootFlag: PlannedReboot | null = consumeRebootMarker();
91
+ if (plannedRebootFlag) {
92
+ console.log(`[collector] Planned reboot acknowledged${plannedRebootFlag.reason ? `: ${plannedRebootFlag.reason}` : ""}`);
93
+ }
94
+ let plannedRebootConsumed = false;
54
95
 
55
96
  const config = loadConfig(cliArgs.configPath);
56
97
 
@@ -106,6 +147,14 @@ async function collect() {
106
147
  security: cachedSecurity,
107
148
  };
108
149
 
150
+ // Single-shot: the very first snapshot after a marked reboot carries
151
+ // the flag, subsequent snapshots do not.
152
+ if (plannedRebootFlag && !plannedRebootConsumed) {
153
+ (snapshot as any).expected_reboot = true;
154
+ if (plannedRebootFlag.reason) (snapshot as any).expected_reboot_reason = plannedRebootFlag.reason;
155
+ plannedRebootConsumed = true;
156
+ }
157
+
109
158
  // ZFS and I/O errors: collect every cycle (lightweight checks)
110
159
  try { snapshot.zfs = await collectZfs() ?? undefined; } catch { /* skip if ZFS not available */ }
111
160
  try { snapshot.io_errors = await collectIoErrors() ?? undefined; } catch { /* skip on error */ }
@@ -0,0 +1,88 @@
1
+ // Planned-reboot marker handling.
2
+ //
3
+ // An operator signals "the next reboot is expected, don't page me"
4
+ // by writing a short-lived JSON file to disk BEFORE rebooting. The
5
+ // collector reads and deletes it on agent startup; the first
6
+ // post-boot snapshot then carries `expected_reboot: true` so Forge's
7
+ // unexpected_reboot rule stays quiet.
8
+ //
9
+ // Single-use (deleted on read regardless of validity) and TTL-guarded
10
+ // (default 10 min) so a forgotten marker cannot silence a genuine
11
+ // crash reboot weeks later.
12
+
13
+ import { existsSync, readFileSync, unlinkSync, writeFileSync, mkdirSync, chmodSync } from "node:fs";
14
+ import { dirname } from "node:path";
15
+
16
+ export const DEFAULT_MARKER_PATH = "/var/lib/crucible/reboot-expected";
17
+ export const DEFAULT_TTL_MS = 10 * 60 * 1000;
18
+
19
+ export interface PlannedReboot {
20
+ expected: true;
21
+ reason?: string;
22
+ }
23
+
24
+ export interface RebootMarker {
25
+ expires_at: string; // ISO timestamp
26
+ reason?: string;
27
+ }
28
+
29
/**
 * Read and delete the marker at `path`.
 *
 * The file is read first and then unlinked unconditionally, so a malformed
 * or stale marker is one-shot and cannot linger. Unlink failures are ignored.
 *
 * @param path Marker file location; defaults to `DEFAULT_MARKER_PATH`.
 * @param now  Reference time for the expiry comparison; defaults to the
 *             current time.
 * @returns `{ expected: true, reason }` when the file could be read, held a
 *          JSON object with a string `expires_at` that parses to a time
 *          strictly after `now`; otherwise `null`. `reason` is forwarded
 *          only when it is a string, and is `undefined` otherwise.
 */
export function consumeRebootMarker(
  path: string = DEFAULT_MARKER_PATH,
  now: Date = new Date(),
): PlannedReboot | null {
  // Read directly instead of checking existence first: a missing file simply
  // fails the read, which avoids a check-then-use race.
  let raw: string | null = null;
  try {
    raw = readFileSync(path, "utf-8");
  } catch {
    // Missing or unreadable marker; fall through to the best-effort unlink.
  }
  // Always delete, regardless of whether the content turns out to be valid.
  try { unlinkSync(path); } catch { /* best effort */ }
  if (raw === null) return null;

  let parsed: unknown;
  try { parsed = JSON.parse(raw); } catch { return null; }
  if (parsed === null || typeof parsed !== "object") return null;

  const { expires_at, reason } = parsed as { expires_at?: unknown; reason?: unknown };
  if (typeof expires_at !== "string") return null;
  const expiresMs = new Date(expires_at).getTime();
  if (Number.isNaN(expiresMs)) return null;
  if (expiresMs <= now.getTime()) return null; // stale
  // The marker is external input: never forward a non-string reason.
  return { expected: true, reason: typeof reason === "string" ? reason : undefined };
}
53
+
54
/**
 * Write a planned-reboot marker file. Used by the `mark-reboot` and `reboot`
 * CLI subcommands.
 *
 * @param opts.reason Optional free-text reason; stored only when non-empty.
 * @param opts.ttlMs  Marker lifetime in milliseconds; defaults to `DEFAULT_TTL_MS`.
 * @param opts.path   Marker location; defaults to `DEFAULT_MARKER_PATH`.
 * @param opts.now    Start of the lifetime window; defaults to the current time.
 * @returns The path written and the ISO expiry timestamp stored in the file.
 * @throws Whatever `writeFileSync` throws if the file cannot be written.
 *         Failures creating the parent directory or in the follow-up chmod
 *         are ignored.
 */
export function writeRebootMarker(opts: {
  reason?: string;
  ttlMs?: number;
  path?: string;
  now?: Date;
}): { path: string; expires_at: string } {
  const target = opts.path ?? DEFAULT_MARKER_PATH;
  const startMs = (opts.now ?? new Date()).getTime();
  const lifetimeMs = opts.ttlMs ?? DEFAULT_TTL_MS;
  const expires_at = new Date(startMs + lifetimeMs).toISOString();

  const marker: RebootMarker = opts.reason
    ? { expires_at, reason: opts.reason }
    : { expires_at };

  // Parent directory is created owner-only; errors are ignored here.
  try {
    mkdirSync(dirname(target), { recursive: true, mode: 0o700 });
  } catch {}
  writeFileSync(target, JSON.stringify(marker), { mode: 0o600 });
  // Explicit chmod after the write so the mode is 600 regardless of how the file was created.
  try {
    chmodSync(target, 0o600);
  } catch {}

  return { path: target, expires_at };
}
77
+
78
/**
 * Convert a duration string such as "10m", "2h", "600s" or "250ms" into
 * milliseconds. Used by the CLI for the `--ttl` flag.
 *
 * Accepted form: a non-negative integer, optional whitespace, then an
 * optional unit (`ms`, `s`, `m`, `h`). A bare number is taken as seconds.
 * Surrounding whitespace is ignored.
 *
 * @returns Milliseconds, or `null` when the input does not match that form.
 */
export function parseDuration(s: string): number | null {
  const match = s.trim().match(/^(\d+)\s*(ms|s|m|h)?$/);
  if (match === null) return null;

  const amount = parseInt(match[1], 10);
  if (!Number.isFinite(amount) || amount < 0) return null;

  let factor: number;
  switch (match[2]) {
    case "ms":
      factor = 1;
      break;
    case "m":
      factor = 60_000;
      break;
    case "h":
      factor = 3_600_000;
      break;
    default:
      // "s" or no unit at all: seconds.
      factor = 1000;
  }
  return amount * factor;
}
package/src/lib/types.ts CHANGED
@@ -18,6 +18,12 @@ export interface Snapshot {
18
18
  systemd?: SystemdData;
19
19
  ntp?: NtpData;
20
20
  file_descriptors?: FileDescriptorData;
21
+ // Planned-reboot flag: set only on the first snapshot after a reboot
22
+ // that was marked with `crucible-agent mark-reboot` / `reboot`. Forge
23
+ // reads this to suppress the `unexpected_reboot` rule. Single-use:
24
+ // subsequent snapshots don't carry it.
25
+ expected_reboot?: boolean;
26
+ expected_reboot_reason?: string;
21
27
  }
22
28
 
23
29
  export interface ConntrackData {
@@ -73,6 +79,12 @@ export interface SystemInfo {
73
79
  hostname: string;
74
80
  ip: string;
75
81
  os: string;
82
+ /** `ID=` from /etc/os-release, lowercased. e.g. "ubuntu", "debian", "rocky", "arch", "alpine". */
83
+ os_id?: string;
84
+ /** `ID_LIKE=` from /etc/os-release, lowercased, space-separated. Used by Forge
85
+ * to pick distro-family-specific fix command variants. e.g. on Rocky this
86
+ * is "rhel centos fedora"; on Ubuntu it is "debian". */
87
+ os_id_like?: string;
76
88
  kernel: string;
77
89
  uptime_seconds: number;
78
90
  }
@@ -139,12 +151,23 @@ export interface NetworkInfo {
139
151
  speed_mbps: number;
140
152
  rx_bytes_sec: number;
141
153
  tx_bytes_sec: number;
154
+ /** Delta over the collection interval (rx_errors + any subtype counter). */
142
155
  rx_errors: number;
143
156
  tx_errors: number;
144
157
  rx_drops: number;
145
158
  tx_drops: number;
159
+ /** Delta over the collection interval. Omitted (undefined) if the counter is not available on this NIC. */
160
+ rx_packets?: number;
161
+ tx_packets?: number;
162
+ /** Fine-grained RX hardware-error subtypes (deltas). Omitted (undefined) if unavailable. */
163
+ rx_crc_errors?: number;
164
+ rx_frame_errors?: number;
165
+ rx_length_errors?: number;
166
+ /** TX physical-layer fault counter (delta). Omitted (undefined) if unavailable. */
167
+ tx_carrier_errors?: number;
146
168
  operstate?: string; // "up", "down", "unknown", etc. from /sys/class/net/{iface}/operstate
147
169
  bond_master?: string; // if this interface is a bond slave, the bond name
170
+ is_bond_master?: boolean; // true when this entry represents the bond aggregate
148
171
  }
149
172
 
150
173
  export interface RaidInfo {