@glassmkr/crucible 0.10.3 → 0.11.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (39)
  1. package/dist/collect/__tests__/c1-c6.test.d.ts +1 -0
  2. package/dist/collect/__tests__/c1-c6.test.js +160 -0
  3. package/dist/collect/__tests__/c1-c6.test.js.map +1 -0
  4. package/dist/collect/__tests__/c7-c10.test.d.ts +1 -0
  5. package/dist/collect/__tests__/c7-c10.test.js +271 -0
  6. package/dist/collect/__tests__/c7-c10.test.js.map +1 -0
  7. package/dist/collect/bonding.d.ts +37 -0
  8. package/dist/collect/bonding.js +246 -0
  9. package/dist/collect/bonding.js.map +1 -0
  10. package/dist/collect/conntrack.d.ts +19 -0
  11. package/dist/collect/conntrack.js +82 -1
  12. package/dist/collect/conntrack.js.map +1 -1
  13. package/dist/collect/edac.d.ts +2 -0
  14. package/dist/collect/edac.js +104 -0
  15. package/dist/collect/edac.js.map +1 -0
  16. package/dist/collect/fd.d.ts +46 -0
  17. package/dist/collect/fd.js +148 -0
  18. package/dist/collect/fd.js.map +1 -1
  19. package/dist/collect/hardware-raid.d.ts +2 -0
  20. package/dist/collect/hardware-raid.js +152 -0
  21. package/dist/collect/hardware-raid.js.map +1 -0
  22. package/dist/collect/psi.d.ts +20 -0
  23. package/dist/collect/psi.js +90 -0
  24. package/dist/collect/psi.js.map +1 -0
  25. package/dist/collect/reboot-evidence.d.ts +2 -0
  26. package/dist/collect/reboot-evidence.js +109 -0
  27. package/dist/collect/reboot-evidence.js.map +1 -0
  28. package/dist/collect/tcp-stats.d.ts +37 -0
  29. package/dist/collect/tcp-stats.js +153 -0
  30. package/dist/collect/tcp-stats.js.map +1 -0
  31. package/dist/collect/vmstat.d.ts +22 -0
  32. package/dist/collect/vmstat.js +94 -0
  33. package/dist/collect/vmstat.js.map +1 -0
  34. package/dist/collect/zfs.js +94 -0
  35. package/dist/collect/zfs.js.map +1 -1
  36. package/dist/index.js +49 -1
  37. package/dist/index.js.map +1 -1
  38. package/dist/lib/types.d.ts +211 -0
  39. package/package.json +1 -1
@@ -31,12 +31,200 @@ export interface Snapshot {
31
31
  file_descriptors?: FileDescriptorData;
32
32
  expected_reboot?: boolean;
33
33
  expected_reboot_reason?: string;
34
+ /** EDAC memory-error counters per memory controller + DIMM. */
35
+ ecc_edac?: EdacSnapshot;
36
+ /** PSI pressure-stall counters per resource (cpu, memory, io). */
37
+ psi?: PsiSnapshot;
38
+ /** /proc/vmstat swap-in/out rates. */
39
+ vmstat?: VmstatSnapshot;
40
+ /** pstore / kdump / wtmp signals corroborating a reboot. */
41
+ reboot_evidence?: RebootEvidence;
42
+ /** Hardware RAID controllers scraped via vendor CLIs. */
43
+ hardware_raid?: HardwareRaidSnapshot;
44
+ /** Per-process FD scan (top-50 consumers + RLIMIT_NOFILE). */
45
+ process_fd?: ProcessFdSnapshot;
46
+ /** LACP / bonding driver state from /proc/net/bonding. */
47
+ bonding?: BondingSnapshot;
48
+ /** TCP segment / retransmit / listen-queue counters from
49
+ * /proc/net/snmp + /proc/net/netstat. */
50
+ tcp_stats?: TcpStatsSnapshot;
51
+ }
52
+ export interface EdacDimm {
53
+ /** dimm_label (vendor-defined string, e.g. "CPU1_DIMM_A1"). */
54
+ label: string;
55
+ /** dimm_location (slot number / chip-channel ordering). */
56
+ location: string;
57
+ /** DIMM size in MB; null if /sys did not report. */
58
+ size_mb: number | null;
59
+ ce_count: number;
60
+ ue_count: number;
61
+ }
62
+ export interface EdacSnapshot {
63
+ /** Sum of ce_count across all memory controllers. */
64
+ edac_corrected_total: number;
65
+ /** Sum of ue_count across all memory controllers. */
66
+ edac_uncorrected_total: number;
67
+ /** Per-DIMM detail. Empty array on hosts where dimm metadata
68
+ * isn't exposed (older EDAC drivers). */
69
+ dimms: EdacDimm[];
70
+ }
71
+ export interface PsiResource {
72
+ /** Rolling average % over the last 10 / 60 / 300 seconds. */
73
+ avg10: number;
74
+ avg60: number;
75
+ avg300: number;
76
+ /** Cumulative microseconds stalled since boot. */
77
+ total: number;
78
+ }
79
+ export interface PsiSnapshot {
80
+ cpu?: {
81
+ some: PsiResource;
82
+ full?: PsiResource;
83
+ };
84
+ memory?: {
85
+ some: PsiResource;
86
+ full?: PsiResource;
87
+ };
88
+ io?: {
89
+ some: PsiResource;
90
+ full?: PsiResource;
91
+ };
92
+ }
93
+ export interface VmstatSnapshot {
94
+ /** Cumulative pswpin since boot. */
95
+ pswpin_total: number;
96
+ pswpout_total: number;
97
+ /** Per-second swap-in rate over the most recent interval; null on
98
+ * the first snapshot (no baseline) or after a counter reset (host
99
+ * reboot mid-session). */
100
+ pswpin_rate: number | null;
101
+ pswpout_rate: number | null;
102
+ }
103
+ export interface RebootEvidence {
104
+ /** True if /sys/fs/pstore/ contains any dmesg-* / console-* records
105
+ * from the prior kernel. */
106
+ pstore_present: boolean;
107
+ /** Number of pstore records found (zero when pstore_present=false). */
108
+ pstore_record_count: number;
109
+ /** True if /var/crash/ contains a kdump vmcore. */
110
+ vmcore_present: boolean;
111
+ /** Most recent `last reboot -F` output line, verbatim. Null if
112
+ * `last` is unavailable or wtmp is empty. */
113
+ wtmp_reboot_record: string | null;
114
+ /** Heuristic: true when wtmp shows a `shutdown` record before the
115
+ * most recent reboot (suggests a clean shutdown). false when only
116
+ * the boot record is present (suggests hard reset or power loss). */
117
+ prior_shutdown_clean: boolean;
118
+ }
119
+ export interface HardwareRaidController {
120
+ vendor: "dell" | "hpe" | "lsi" | "adaptec";
121
+ controller_id: string;
122
+ /** Vendor-reported overall state, e.g. "Optimal", "Degraded",
123
+ * "Critical", "Failed", or "Unknown". The dashboard's
124
+ * raid_degraded evaluator pages on any state != "Optimal". */
125
+ state: string;
126
+ /** Count of physical disks the controller flagged as failed /
127
+ * degraded; null when the parser couldn't extract this. */
128
+ degraded_disks: number | null;
129
+ /** Optional vendor-text excerpt the dashboard can surface in
130
+ * evidence; null when not captured. */
131
+ raw_summary: string | null;
132
+ }
133
+ export interface HardwareRaidSnapshot {
134
+ controllers: HardwareRaidController[];
34
135
  }
35
136
  export interface ConntrackData {
36
137
  available: boolean;
37
138
  count: number;
38
139
  max: number;
39
140
  percent: number;
141
+ /** C9 (2026-05-19): cumulative insert_failed counter (sum across CPUs)
142
+ * from /proc/net/stat/nf_conntrack. Optional because pre-0.11.0
143
+ * agents omit it. */
144
+ insert_failed_total?: number;
145
+ /** C9: cumulative drop counter from /proc/net/stat/nf_conntrack. */
146
+ drop_total?: number;
147
+ /** Per-second insert_failed rate over the most recent snapshot
148
+ * interval. Null on first snapshot, on counter reset, or when the
149
+ * stat file is unavailable. */
150
+ insert_failed_rate_per_sec?: number | null;
151
+ drop_rate_per_sec?: number | null;
152
+ }
153
+ export interface ProcessFdEntry {
154
+ pid: number;
155
+ comm: string;
156
+ fd_count: number;
157
+ rlimit_nofile_soft: number;
158
+ rlimit_nofile_hard: number;
159
+ /** fd_count / rlimit_nofile_soft * 100, rounded to one decimal. Zero
160
+ * when soft limit is unlimited (no useful proximity signal). */
161
+ percent_of_soft_limit: number;
162
+ }
163
+ export interface ProcessFdSnapshot {
164
+ available: boolean;
165
+ reason?: string;
166
+ /** Top 50 processes by fd_count. */
167
+ top_consumers: ProcessFdEntry[];
168
+ /** Number of numeric /proc/<pid> entries we considered. */
169
+ total_processes_scanned: number;
170
+ /** Aggregate signal: max percent_of_soft_limit across top_consumers.
171
+ * Null when top_consumers is empty. */
172
+ highest_percent_of_limit: number | null;
173
+ }
174
+ export interface BondSlave {
175
+ name: string;
176
+ mii_status: string;
177
+ link_failure_count: number;
178
+ permanent_hw_addr: string;
179
+ aggregator_id: number | null;
180
+ partner_churn_state: string | null;
181
+ partner_lacp_port_state: number | null;
182
+ /** Convenience flag derived from the LACP port-state bitfield's
183
+ * synchronization bit (bit 3, 0x08). Null when the bond is not
184
+ * LACP or partner state was not captured. */
185
+ partner_lacp_synchronized: boolean | null;
186
+ }
187
+ export interface BondAggregator {
188
+ id: number;
189
+ number_of_ports: number;
190
+ actor_key: number | null;
191
+ partner_key: number | null;
192
+ partner_mac_address: string | null;
193
+ }
194
+ export interface Bond {
195
+ name: string;
196
+ mode: string;
197
+ is_lacp: boolean;
198
+ lacp_rate: string | null;
199
+ slaves: BondSlave[];
200
+ /** Equal to slaves.length; surfaces the "configured" port count
201
+ * alongside active_aggregator.number_of_ports so the dashboard can
202
+ * compute a shortfall. */
203
+ configured_port_count: number;
204
+ active_aggregator: BondAggregator | null;
205
+ }
206
+ export interface BondingSnapshot {
207
+ available: boolean;
208
+ reason?: string;
209
+ bonds: Bond[];
210
+ }
211
+ export interface TcpStatsSnapshot {
212
+ available: boolean;
213
+ reason?: string;
214
+ out_segs_total?: number;
215
+ retrans_segs_total?: number;
216
+ in_segs_total?: number;
217
+ /** Retransmits divided by segments sent over the most recent
218
+ * interval. Range 0.0 - 1.0. Null on first snapshot or counter
219
+ * reset. Zero when no outbound traffic in the interval. */
220
+ retrans_ratio?: number | null;
221
+ retrans_rate_per_sec?: number | null;
222
+ /** Optional listen-queue counters from /proc/net/netstat TcpExt.
223
+ * Absent when /proc/net/netstat is not readable. */
224
+ listen_overflows_total?: number;
225
+ listen_drops_total?: number;
226
+ listen_overflows_rate_per_sec?: number | null;
227
+ listen_drops_rate_per_sec?: number | null;
40
228
  }
41
229
  export interface SystemdData {
42
230
  failed_units: string[];
@@ -58,6 +246,21 @@ export interface FileDescriptorData {
58
246
  max: number;
59
247
  percent: number;
60
248
  }
249
+ export interface ZfsVdev {
250
+ /** Vdev name, e.g. "raidz2-0", "mirror-0", or a raw device for
251
+ * single-device top-level stripes. */
252
+ name: string;
253
+ /** Vdev state from `zpool status` (ONLINE, DEGRADED, FAULTED,
254
+ * REMOVED, SUSPENDED, UNAVAIL). */
255
+ state: string;
256
+ /** Redundancy class. C6 addition (2026-05-19): scaled vdev severity
257
+ * matrix on the dashboard side depends on this so a DEGRADED
258
+ * raidz1 (zero remaining tolerance) pages differently from a
259
+ * DEGRADED raidz2 (one disk-fault budget left). */
260
+ redundancy_class: "mirror" | "raidz1" | "raidz2" | "raidz3" | "draid" | "stripe";
261
+ /** Number of child devices under this vdev in a non-ONLINE state. */
262
+ degraded_disks_count: number;
263
+ }
61
264
  export interface ZfsPool {
62
265
  name: string;
63
266
  state: string;
@@ -66,6 +269,14 @@ export interface ZfsPool {
66
269
  scrub_repaired?: string;
67
270
  last_scrub_date?: string;
68
271
  scrub_never_run?: boolean;
272
+ /** Top-level data vdevs. Always present from collector v0.10.4+.
273
+ * Dashboard tolerates absent (older agents) via capability gates. */
274
+ vdevs: ZfsVdev[];
275
+ /** Separate log (SLOG / ZIL) vdevs. Empty array on pools without
276
+ * a SLOG configured. */
277
+ slog_vdevs: ZfsVdev[];
278
+ /** Cache (L2ARC) vdevs. Empty array on pools without L2ARC. */
279
+ l2arc_vdevs: ZfsVdev[];
69
280
  }
70
281
  export interface ZfsData {
71
282
  pools: ZfsPool[];
package/package.json CHANGED
@@ -1,6 +1,6 @@
1
1
  {
2
2
  "name": "@glassmkr/crucible",
3
- "version": "0.10.3",
3
+ "version": "0.11.0",
4
4
  "description": "Lightweight bare metal server monitoring. IPMI, SMART, OS, network. Opinionated alerts.",
5
5
  "type": "module",
6
6
  "main": "./dist/index.js",