@glassmkr/crucible 0.1.0 → 0.3.1
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/.github/ISSUE_TEMPLATE/bug_report.md +24 -0
- package/.github/ISSUE_TEMPLATE/no_data.md +26 -0
- package/README.md +48 -52
- package/dist/alerts/rules.js +120 -0
- package/dist/alerts/rules.js.map +1 -1
- package/dist/collect/cpu.js +57 -16
- package/dist/collect/cpu.js.map +1 -1
- package/dist/collect/disks.js +48 -2
- package/dist/collect/disks.js.map +1 -1
- package/dist/collect/ipmi.js +160 -2
- package/dist/collect/ipmi.js.map +1 -1
- package/dist/collect/security.d.ts +39 -0
- package/dist/collect/security.js +176 -0
- package/dist/collect/security.js.map +1 -0
- package/dist/config.d.ts +23 -0
- package/dist/config.js +5 -0
- package/dist/config.js.map +1 -1
- package/dist/index.js +48 -3
- package/dist/index.js.map +1 -1
- package/dist/lib/types.d.ts +64 -0
- package/dist/lib/version-check.d.ts +7 -0
- package/dist/lib/version-check.js +39 -0
- package/dist/lib/version-check.js.map +1 -0
- package/dist/metrics-server.d.ts +3 -0
- package/dist/metrics-server.js +113 -0
- package/dist/metrics-server.js.map +1 -0
- package/dist/notify/telegram.js +27 -9
- package/dist/notify/telegram.js.map +1 -1
- package/dist/push/forge.d.ts +1 -0
- package/dist/push/forge.js +80 -4
- package/dist/push/forge.js.map +1 -1
- package/package.json +1 -1
- package/scripts/sign-release.sh +29 -0
- package/src/alerts/rules.ts +99 -0
- package/src/collect/cpu.ts +64 -16
- package/src/collect/disks.ts +57 -2
- package/src/collect/ipmi.ts +147 -3
- package/src/collect/security.ts +238 -0
- package/src/config.ts +5 -0
- package/src/index.ts +49 -3
- package/src/lib/types.ts +44 -0
- package/src/lib/version-check.ts +38 -0
- package/src/metrics-server.ts +123 -0
- package/src/notify/telegram.ts +29 -9
- package/src/push/forge.ts +89 -5
package/src/index.ts
CHANGED
|
@@ -1,6 +1,21 @@
|
|
|
1
1
|
#!/usr/bin/env node
|
|
2
2
|
|
|
3
|
+
import { readFileSync } from "node:fs";
|
|
4
|
+
import { fileURLToPath } from "node:url";
|
|
5
|
+
import { dirname, join } from "node:path";
|
|
3
6
|
import { loadConfig } from "./config.js";
|
|
7
|
+
|
|
8
|
+
const __dirname = dirname(fileURLToPath(import.meta.url));
|
|
9
|
+
/**
 * Reads the `version` field from the package.json located one directory
 * above this module.
 *
 * @returns The version value, or "0.0.0" when the file cannot be read or
 *   parsed, or when the field is missing/empty.
 */
function readPackageVersion() {
  try {
    const manifestPath = join(__dirname, "..", "package.json");
    const manifest = JSON.parse(readFileSync(manifestPath, "utf8"));
    if (manifest.version) {
      return manifest.version;
    }
  } catch {
    // Unreadable or malformed manifest: fall through to the placeholder.
  }
  return "0.0.0";
}

/** Collector version reported in every snapshot. */
const PKG_VERSION = readPackageVersion();
|
|
17
|
+
import { checkForUpdates } from "./lib/version-check.js";
|
|
18
|
+
import { startMetricsServer, updateMetrics } from "./metrics-server.js";
|
|
4
19
|
import { collectSystem } from "./collect/system.js";
|
|
5
20
|
import { collectCpu } from "./collect/cpu.js";
|
|
6
21
|
import { collectMemory } from "./collect/memory.js";
|
|
@@ -15,7 +30,8 @@ import { updateAlertState } from "./alerts/state.js";
|
|
|
15
30
|
import { sendTelegram } from "./notify/telegram.js";
|
|
16
31
|
import { sendSlack } from "./notify/slack.js";
|
|
17
32
|
import { sendEmail } from "./notify/email.js";
|
|
18
|
-
import { pushToForge } from "./push/forge.js";
|
|
33
|
+
import { pushToForge, initForgeAgent } from "./push/forge.js";
|
|
34
|
+
import { collectSecurity, type SecurityData } from "./collect/security.js";
|
|
19
35
|
import type { Snapshot, IpmiInfo } from "./lib/types.js";
|
|
20
36
|
|
|
21
37
|
const configPath = process.argv[2] || "/etc/glassmkr/collector.yaml";
|
|
@@ -24,8 +40,24 @@ const config = loadConfig(configPath);
|
|
|
24
40
|
console.log(`[collector] Starting. Server: ${config.server_name}. Interval: ${config.collection.interval_seconds}s`);
|
|
25
41
|
console.log(`[collector] IPMI: ${config.collection.ipmi ? "enabled" : "disabled"}, SMART: ${config.collection.smart ? "enabled" : "disabled"}`);
|
|
26
42
|
console.log(`[collector] Forge: ${config.forge.enabled ? config.forge.url : "disabled"}`);
|
|
43
|
+
console.log(`[collector] Prometheus: ${config.prometheus.enabled ? `:${config.prometheus.port}/metrics` : "disabled"}`);
|
|
44
|
+
|
|
45
|
+
// Start Prometheus metrics server if enabled
|
|
46
|
+
if (config.prometheus.enabled) {
|
|
47
|
+
startMetricsServer(config.prometheus.port);
|
|
48
|
+
}
|
|
49
|
+
|
|
50
|
+
// Initialize TLS pinning for Forge if configured
|
|
51
|
+
if (config.forge.tls_pin) {
|
|
52
|
+
initForgeAgent(config.forge.tls_pin);
|
|
53
|
+
console.log("[collector] TLS pinning enabled for Forge");
|
|
54
|
+
}
|
|
27
55
|
|
|
28
|
-
const emptyIpmi: IpmiInfo = { available: false, sensors: [], ecc_errors: { correctable: 0, uncorrectable: 0 }, sel_entries_count: 0 };
|
|
56
|
+
const emptyIpmi: IpmiInfo = { available: false, sensors: [], ecc_errors: { correctable: 0, uncorrectable: 0 }, sel_entries_count: 0, sel_events_recent: [], fans: [] };
|
|
57
|
+
|
|
58
|
+
// Security checks run on every 12th collection cycle (once per hour when the interval is 5 minutes)
|
|
59
|
+
let securityCycleCount = 0;
|
|
60
|
+
let cachedSecurity: SecurityData | undefined;
|
|
29
61
|
|
|
30
62
|
async function collect() {
|
|
31
63
|
const startTime = Date.now();
|
|
@@ -43,12 +75,23 @@ async function collect() {
|
|
|
43
75
|
collectOsAlerts(),
|
|
44
76
|
]);
|
|
45
77
|
|
|
78
|
+
// Security checks: run once per hour, reuse cached data between runs
|
|
79
|
+
securityCycleCount++;
|
|
80
|
+
if (securityCycleCount >= 12 || !cachedSecurity) {
|
|
81
|
+
securityCycleCount = 0;
|
|
82
|
+
try { cachedSecurity = await collectSecurity(); } catch (err) { console.error("[security] Collection error:", err); }
|
|
83
|
+
}
|
|
84
|
+
|
|
46
85
|
const snapshot: Snapshot = {
|
|
47
|
-
collector_version:
|
|
86
|
+
collector_version: PKG_VERSION,
|
|
48
87
|
timestamp: new Date().toISOString(),
|
|
49
88
|
system, cpu, memory, disks, smart, network, raid, ipmi, os_alerts: osAlerts,
|
|
89
|
+
security: cachedSecurity,
|
|
50
90
|
};
|
|
51
91
|
|
|
92
|
+
// Update Prometheus metrics
|
|
93
|
+
updateMetrics(snapshot);
|
|
94
|
+
|
|
52
95
|
// Evaluate alerts
|
|
53
96
|
const alertResults = evaluateAlerts(snapshot, config.thresholds);
|
|
54
97
|
const { newAlerts, resolvedAlerts } = updateAlertState(alertResults);
|
|
@@ -74,6 +117,9 @@ async function collect() {
|
|
|
74
117
|
pushToForge(config.forge.url, config.forge.api_key, snapshot);
|
|
75
118
|
}
|
|
76
119
|
|
|
120
|
+
// Check for updates (every 6 hours, non-blocking)
|
|
121
|
+
checkForUpdates(config.forge.enabled ? config.forge.url : undefined);
|
|
122
|
+
|
|
77
123
|
// Print summary on first run
|
|
78
124
|
if (firstRun) {
|
|
79
125
|
firstRun = false;
|
package/src/lib/types.ts
CHANGED
|
@@ -10,6 +10,16 @@ export interface Snapshot {
|
|
|
10
10
|
raid: RaidInfo[];
|
|
11
11
|
ipmi: IpmiInfo;
|
|
12
12
|
os_alerts: OsAlerts;
|
|
13
|
+
security?: SecurityData;
|
|
14
|
+
}
|
|
15
|
+
|
|
16
|
+
/**
 * Security posture of the host, attached to a snapshot as the optional
 * `Snapshot.security` field.
 *
 * Several sections are nullable. NOTE(review): presumably `null` means the
 * corresponding check could not be performed on this host; confirm against
 * the collector that fills this structure.
 */
export interface SecurityData {
  /** SSH daemon settings. */
  ssh: {
    permitRootLogin: string;
    passwordAuthentication: string;
    rootPasswordExposed: boolean;
  } | null;

  /** Firewall state, plus where that state was read from. */
  firewall: {
    active: boolean;
    source: string;
    details: string;
  };

  /** Pending package updates. */
  pending_updates: {
    distro: string;
    pendingCount: number;
    available: boolean;
  } | null;

  /** One entry per reported kernel vulnerability. */
  kernel_vulns: Array<{
    name: string;
    status: string;
    mitigated: boolean;
  }>;

  /** Running versus installed kernel, and whether a reboot is needed. */
  kernel_reboot: {
    running: string;
    installed: string;
    needsReboot: boolean;
  } | null;

  /** Automatic update configuration. */
  auto_updates: {
    configured: boolean;
    mechanism: string;
    details: string;
  };
}
|
|
14
24
|
|
|
15
25
|
export interface SystemInfo {
|
|
@@ -20,6 +30,16 @@ export interface SystemInfo {
|
|
|
20
30
|
uptime_seconds: number;
|
|
21
31
|
}
|
|
22
32
|
|
|
33
|
+
/**
 * CPU time breakdown for a single core. A list of these is exposed through
 * the optional `CpuInfo.cores` field.
 */
export interface CpuCoreInfo {
  /** Core identifier. NOTE(review): presumably the zero-based core index; confirm. */
  core: number;

  /** Share of time in user mode, as a percentage. */
  user_percent: number;

  /** Share of time in kernel (system) mode, as a percentage. */
  system_percent: number;

  /** Share of time waiting on I/O, as a percentage. */
  iowait_percent: number;

  /** Share of time idle, as a percentage. */
  idle_percent: number;

  /** Share of time servicing hardware interrupts, as a percentage. */
  irq_percent: number;

  /** Share of time servicing soft interrupts, as a percentage. */
  softirq_percent: number;
}
|
|
42
|
+
|
|
23
43
|
export interface CpuInfo {
|
|
24
44
|
user_percent: number;
|
|
25
45
|
system_percent: number;
|
|
@@ -28,6 +48,7 @@ export interface CpuInfo {
|
|
|
28
48
|
load_1m: number;
|
|
29
49
|
load_5m: number;
|
|
30
50
|
load_15m: number;
|
|
51
|
+
cores?: CpuCoreInfo[];
|
|
31
52
|
}
|
|
32
53
|
|
|
33
54
|
export interface MemoryInfo {
|
|
@@ -45,6 +66,11 @@ export interface DiskInfo {
|
|
|
45
66
|
used_gb: number;
|
|
46
67
|
available_gb: number;
|
|
47
68
|
percent_used: number;
|
|
69
|
+
fstype?: string;
|
|
70
|
+
options?: string;
|
|
71
|
+
inodes_total?: number;
|
|
72
|
+
inodes_used?: number;
|
|
73
|
+
inodes_free?: number;
|
|
48
74
|
io_read_mb_s?: number;
|
|
49
75
|
io_write_mb_s?: number;
|
|
50
76
|
latency_p99_ms?: number;
|
|
@@ -81,6 +107,22 @@ export interface RaidInfo {
|
|
|
81
107
|
failed_disks: string[];
|
|
82
108
|
}
|
|
83
109
|
|
|
110
|
+
/**
 * One event-log entry, as listed in `IpmiInfo.sel_events_recent`.
 *
 * All descriptive fields are free-form strings.
 * NOTE(review): the formats of `timestamp`, `direction` and `severity`
 * are not visible here; confirm against the IPMI collector before parsing them.
 */
export interface SelEvent {
  /** Numeric identifier of the entry. */
  id: number;

  /** When the event was logged. */
  timestamp: string;

  /** Name of the sensor the event refers to. */
  sensor: string;

  /** Category of that sensor. */
  sensor_type: string;

  /** Description of what happened. */
  event: string;

  /** Direction of the event (presumably asserted/deasserted; confirm). */
  direction: string;

  /** Severity classification assigned to the event. */
  severity: string;
}
|
|
119
|
+
|
|
120
|
+
/**
 * State of a single fan, as listed in `IpmiInfo.fans`.
 */
export interface FanStatus {
  /** Fan name. */
  name: string;

  /** Rotational speed in RPM. */
  rpm: number;

  /** Health/status string for the fan. */
  status: string;
}
|
|
125
|
+
|
|
84
126
|
export interface IpmiInfo {
|
|
85
127
|
available: boolean;
|
|
86
128
|
sensors: Array<{
|
|
@@ -92,6 +134,8 @@ export interface IpmiInfo {
|
|
|
92
134
|
}>;
|
|
93
135
|
ecc_errors: { correctable: number; uncorrectable: number };
|
|
94
136
|
sel_entries_count: number;
|
|
137
|
+
sel_events_recent: SelEvent[];
|
|
138
|
+
fans: FanStatus[];
|
|
95
139
|
}
|
|
96
140
|
|
|
97
141
|
export interface OsAlerts {
|
|
@@ -0,0 +1,38 @@
|
|
|
1
|
+
/**
 * Version of this collector build, compared against the version the server
 * advertises.
 *
 * This is a hand-maintained literal and must be kept in sync with the
 * `version` field of package.json on every release; a stale value makes every
 * installation report a pending update.
 */
const CURRENT_VERSION = "0.3.1";

/** Minimum time between two version checks (6 hours, in milliseconds). */
const CHECK_INTERVAL = 6 * 60 * 60 * 1000;

/** `Date.now()` value of the last attempted check; 0 means "never". */
let lastCheckTime = 0;

/** Outcome of the last successful check; null until one has succeeded. */
let lastResult: { updateAvailable: boolean; latest: string; changelog: string } | null = null;

/** Returns the version string this build identifies itself with. */
export function getCurrentVersion(): string {
  return CURRENT_VERSION;
}

/**
 * Splits a version such as "v1.2.3-beta.1" into its numeric core [1, 2, 3].
 * Returns null when any core segment is not a plain non-negative integer.
 */
function parseVersionCore(version: string): number[] | null {
  const core = version.trim().replace(/^v/i, "").split(/[-+]/)[0] ?? "";
  const segments = core.split(".");
  const numbers: number[] = [];
  for (const segment of segments) {
    if (!/^\d+$/.test(segment)) return null;
    numbers.push(Number(segment));
  }
  return numbers;
}

/**
 * True when `candidate` is a strictly newer release than `current`.
 * Falls back to plain inequality when either side is not dotted-numeric, so
 * unusual version schemes still trigger a notice as they did before.
 */
function isNewerVersion(candidate: string, current: string): boolean {
  const a = parseVersionCore(candidate);
  const b = parseVersionCore(current);
  if (!a || !b) return candidate !== current;

  const width = Math.max(a.length, b.length);
  for (let i = 0; i < width; i++) {
    const delta = (a[i] ?? 0) - (b[i] ?? 0);
    if (delta !== 0) return delta > 0;
  }
  return false;
}

/**
 * Asks the server which collector version is current and logs upgrade
 * instructions when a newer one exists.
 *
 * The check is throttled to once per CHECK_INTERVAL. The timestamp is
 * recorded before the request is made, so a failed attempt is also not
 * retried until the interval has elapsed. All failures (network, non-2xx,
 * malformed body) are swallowed: the check is best-effort and must never
 * disturb collection.
 *
 * @param forgeUrl Base URL of the server; defaults to https://forge.glassmkr.com.
 */
export async function checkForUpdates(forgeUrl?: string): Promise<void> {
  const now = Date.now();
  if (now - lastCheckTime < CHECK_INTERVAL) return;
  lastCheckTime = now;

  const url = forgeUrl || "https://forge.glassmkr.com";
  try {
    const res = await fetch(`${url}/api/v1/version`, { signal: AbortSignal.timeout(5000) });
    if (!res.ok) return;

    const data = (await res.json()) as {
      crucible?: { latest?: unknown; min_supported?: unknown; changelog_url?: unknown };
    } | null;

    // The body is untrusted JSON: only accept string fields.
    const latest = data?.crucible?.latest;
    if (typeof latest !== "string" || !latest) return;
    const rawChangelog = data?.crucible?.changelog_url;
    const changelogUrl = typeof rawChangelog === "string" ? rawChangelog : "";

    // Only a strictly newer release counts; a build that is ahead of the
    // advertised version must not be told to "update".
    if (isNewerVersion(latest, CURRENT_VERSION)) {
      console.log(`[update] New Crucible version available: ${latest} (current: ${CURRENT_VERSION})`);
      console.log(`[update] Changelog: ${changelogUrl || "https://github.com/glassmkr/crucible/releases"}`);
      console.log(`[update] Run: npm update -g @glassmkr/crucible && sudo systemctl restart glassmkr-collector`);
      lastResult = { updateAvailable: true, latest, changelog: changelogUrl };
    } else {
      lastResult = { updateAvailable: false, latest, changelog: "" };
    }
  } catch {
    // Version check is non-critical, fail silently
  }
}

/** Returns the outcome of the last successful check, or null if none yet. */
export function getUpdateStatus() {
  return lastResult;
}
|
|
@@ -0,0 +1,123 @@
|
|
|
1
|
+
import { createServer } from "http";
|
|
2
|
+
import type { Snapshot } from "./lib/types.js";
|
|
3
|
+
|
|
4
|
+
/** Most recent snapshot handed to `updateMetrics`; null until the first call. */
let latestSnapshot: Snapshot | null = null;

/**
 * Replaces the snapshot that the metrics endpoint renders.
 *
 * @param snapshot The snapshot to serve from now on.
 */
export function updateMetrics(snapshot: Snapshot): void {
  latestSnapshot = snapshot;
}
|
|
9
|
+
|
|
10
|
+
/**
 * Starts an HTTP server exposing the latest snapshot in Prometheus text
 * format.
 *
 * Routes (query strings are ignored when matching):
 * - `GET /metrics`: 200 with the rendered snapshot, or 503 while no snapshot
 *   has been stored yet.
 * - `/health`: 200 "ok" for any method.
 * - anything else: 404.
 *
 * NOTE(review): the listener binds to 0.0.0.0 and has no authentication, so
 * the endpoint is reachable on every interface; restrict it with a firewall
 * if that is not intended.
 *
 * @param port TCP port to listen on.
 */
export function startMetricsServer(port: number): void {
  const server = createServer((req, res) => {
    // Match on the path only so that "/metrics?foo=bar" is still served.
    const path = (req.url ?? "").split("?", 1)[0];

    if (path === "/metrics" && req.method === "GET") {
      if (!latestSnapshot) {
        res.writeHead(503, { "Content-Type": "text/plain" });
        res.end("# No data collected yet\n");
        return;
      }
      res.writeHead(200, { "Content-Type": "text/plain; version=0.0.4" });
      res.end(formatPrometheus(latestSnapshot));
    } else if (path === "/health") {
      res.writeHead(200);
      res.end("ok\n");
    } else {
      res.writeHead(404);
      res.end("Not found\n");
    }
  });

  // Without an 'error' listener a bind failure (port in use, permission
  // denied) is thrown as an uncaught exception and takes the whole collector
  // down. Metrics are optional, so log and keep collecting instead.
  server.on("error", (err) => {
    console.error(`[metrics] Prometheus endpoint error on :${port}: ${err.message}`);
  });

  server.listen(port, "0.0.0.0", () => {
    console.log(`[metrics] Prometheus endpoint listening on :${port}/metrics`);
  });
}
|
|
33
|
+
|
|
34
|
+
/**
 * Escapes a label value for the Prometheus text exposition format:
 * backslash, double quote and line feed must be backslash-escaped.
 */
function escapeLabelValue(raw: unknown): string {
  return String(raw).replace(/\\/g, "\\\\").replace(/"/g, '\\"').replace(/\n/g, "\\n");
}

/**
 * Renders a sample value, or returns null when the value is not a number
 * (for example an optional field that was never collected).
 */
function formatSampleValue(value: unknown): string | null {
  if (typeof value !== "number") return null;
  if (Number.isNaN(value)) return "NaN";
  if (value === Infinity) return "+Inf";
  if (value === -Infinity) return "-Inf";
  return String(value);
}

/**
 * Renders a snapshot as Prometheus text exposition (format version 0.0.4).
 *
 * Metric names, label names and ordering are part of the scrape contract and
 * must stay stable. HELP/TYPE metadata is only emitted for a subset of the
 * metrics.
 *
 * Samples whose value is not a number are skipped rather than written as
 * "undefined"/"null", because a single unparsable line makes Prometheus
 * reject the entire scrape.
 *
 * @param snap Snapshot to render.
 * @returns The exposition text, terminated by a newline.
 */
function formatPrometheus(snap: Snapshot): string {
  const lines: string[] = [];

  const label = (key: string, value: unknown): string => `${key}="${escapeLabelValue(value)}"`;
  const sample = (name: string, value: unknown, labels = ""): void => {
    const rendered = formatSampleValue(value);
    if (rendered === null) return;
    lines.push(labels ? `${name}{${labels}} ${rendered}` : `${name} ${rendered}`);
  };

  // CPU
  lines.push("# HELP glassmkr_cpu_user_percent CPU user utilization");
  lines.push("# TYPE glassmkr_cpu_user_percent gauge");
  sample("glassmkr_cpu_user_percent", snap.cpu.user_percent);
  sample("glassmkr_cpu_system_percent", snap.cpu.system_percent);
  sample("glassmkr_cpu_iowait_percent", snap.cpu.iowait_percent);
  sample("glassmkr_cpu_idle_percent", snap.cpu.idle_percent);
  sample("glassmkr_load_1m", snap.cpu.load_1m);
  sample("glassmkr_load_5m", snap.cpu.load_5m);
  sample("glassmkr_load_15m", snap.cpu.load_15m);

  // Memory
  lines.push("# HELP glassmkr_memory_used_mb Memory used in MB");
  lines.push("# TYPE glassmkr_memory_used_mb gauge");
  sample("glassmkr_memory_used_mb", snap.memory.used_mb);
  sample("glassmkr_memory_total_mb", snap.memory.total_mb);
  sample("glassmkr_memory_available_mb", snap.memory.available_mb);
  sample("glassmkr_swap_used_mb", snap.memory.swap_used_mb);

  // Disks
  lines.push("# HELP glassmkr_disk_used_percent Disk usage percentage");
  lines.push("# TYPE glassmkr_disk_used_percent gauge");
  for (const disk of snap.disks) {
    const labels = `${label("mount", disk.mount)},${label("device", disk.device)}`;
    sample("glassmkr_disk_used_percent", disk.percent_used, labels);
    sample("glassmkr_disk_total_gb", disk.total_gb, labels);
    sample("glassmkr_disk_used_gb", disk.used_gb, labels);
  }

  // Network
  lines.push("# HELP glassmkr_net_rx_bytes_sec Network receive bytes per second");
  lines.push("# TYPE glassmkr_net_rx_bytes_sec gauge");
  for (const iface of snap.network) {
    const labels = label("interface", iface.interface);
    sample("glassmkr_net_rx_bytes_sec", iface.rx_bytes_sec, labels);
    sample("glassmkr_net_tx_bytes_sec", iface.tx_bytes_sec, labels);
    sample("glassmkr_net_rx_errors", iface.rx_errors, labels);
    sample("glassmkr_net_tx_errors", iface.tx_errors, labels);
    sample("glassmkr_net_speed_mbps", iface.speed_mbps, labels);
  }

  // SMART (attributes a drive does not report are simply absent)
  for (const drive of snap.smart) {
    const labels = `${label("device", drive.device)},${label("model", drive.model)}`;
    sample("glassmkr_smart_temperature_c", drive.temperature_c, labels);
    sample("glassmkr_smart_percentage_used", drive.percentage_used, labels);
    sample("glassmkr_smart_reallocated_sectors", drive.reallocated_sectors, labels);
  }

  // IPMI (only when the BMC was reachable)
  if (snap.ipmi?.available) {
    for (const sensor of snap.ipmi.sensors) {
      // Non-numeric readings are skipped by sample().
      sample(
        "glassmkr_ipmi_sensor",
        sensor.value,
        `${label("sensor", sensor.name)},${label("unit", sensor.unit)}`,
      );
    }
    sample("glassmkr_ipmi_ecc_correctable", snap.ipmi.ecc_errors.correctable);
    sample("glassmkr_ipmi_ecc_uncorrectable", snap.ipmi.ecc_errors.uncorrectable);

    // Fans
    for (const fan of snap.ipmi.fans ?? []) {
      sample("glassmkr_ipmi_fan_rpm", fan.rpm, `${label("fan", fan.name)},${label("status", fan.status)}`);
    }
  }

  // OS alerts
  sample("glassmkr_oom_kills_recent", snap.os_alerts.oom_kills_recent);
  sample("glassmkr_zombie_processes", snap.os_alerts.zombie_processes);

  // Security (booleans are exported as 0/1; a null section counts as 0)
  if (snap.security) {
    const sec = snap.security;
    sample("glassmkr_ssh_root_password_exposed", sec.ssh?.rootPasswordExposed ? 1 : 0);
    sample("glassmkr_firewall_active", sec.firewall.active ? 1 : 0);
    if (sec.pending_updates?.available) {
      sample("glassmkr_pending_security_updates", sec.pending_updates.pendingCount);
    }
    const unmitigated = sec.kernel_vulns.filter((v) => !v.mitigated).length;
    sample("glassmkr_kernel_vulns_unmitigated", unmitigated);
    sample("glassmkr_kernel_needs_reboot", sec.kernel_reboot?.needsReboot ? 1 : 0);
    sample("glassmkr_auto_updates_configured", sec.auto_updates.configured ? 1 : 0);
  }

  return lines.join("\n") + "\n";
}
|
package/src/notify/telegram.ts
CHANGED
|
@@ -1,5 +1,22 @@
|
|
|
1
1
|
import type { AlertResult } from "../lib/types.js";
|
|
2
2
|
|
|
3
|
+
/** Priority bucket (P1 = most urgent … P4 = least) for each known alert type. */
const PRIORITY_MAP: Record<string, string> = {
  // P1
  raid_degraded: "P1",
  smart_failing: "P1",
  ecc_errors: "P1",
  psu_redundancy_loss: "P1",
  ipmi_fan_failure: "P1",
  // P2
  oom_kills: "P2",
  ram_high: "P2",
  disk_space_high: "P2",
  ipmi_sel_critical: "P2",
  disk_io_errors: "P2",
  zfs_pool_unhealthy: "P2",
  // P3
  cpu_iowait_high: "P3",
  nvme_wear_high: "P3",
  disk_latency_high: "P3",
  cpu_temperature_high: "P3",
  ssh_root_password: "P3",
  pending_security_updates: "P3",
  kernel_vulnerabilities: "P3",
  zfs_scrub_errors: "P3",
  // P4
  swap_active: "P4",
  no_firewall: "P4",
  kernel_needs_reboot: "P4",
  unattended_upgrades_disabled: "P4",
  interface_errors: "P4",
  link_speed_mismatch: "P4",
  interface_saturation: "P4",
};

/** Heading shown for each priority bucket in a notification. */
const PRIORITY_LABELS: Record<string, string> = {
  P1: "\u{1F534} P1 Urgent",
  P2: "\u{1F7E0} P2 High",
  P3: "\u{1F7E1} P3 Medium",
  P4: "\u{1F535} P4 Low",
};

/**
 * Looks up the priority bucket of an alert type.
 *
 * Only the map's own keys are consulted: a bare `PRIORITY_MAP[alertType]`
 * would also find members inherited from `Object.prototype` (for example
 * "constructor" or "toString") and return a function instead of a priority.
 *
 * @param alertType Alert type identifier.
 * @returns "P1".."P4"; unknown types default to "P3".
 */
function getPriority(alertType: string): string {
  if (!Object.prototype.hasOwnProperty.call(PRIORITY_MAP, alertType)) {
    return "P3";
  }
  return PRIORITY_MAP[alertType] || "P3";
}
|
|
19
|
+
|
|
3
20
|
export async function sendTelegram(
|
|
4
21
|
botToken: string,
|
|
5
22
|
chatId: string,
|
|
@@ -10,16 +27,19 @@ export async function sendTelegram(
|
|
|
10
27
|
const parts: string[] = [];
|
|
11
28
|
|
|
12
29
|
if (newAlerts.length > 0) {
|
|
13
|
-
|
|
14
|
-
const
|
|
15
|
-
|
|
16
|
-
|
|
17
|
-
|
|
18
|
-
|
|
30
|
+
// Group by priority
|
|
31
|
+
const byPriority: Record<string, AlertResult[]> = {};
|
|
32
|
+
for (const a of newAlerts) {
|
|
33
|
+
const p = getPriority(a.type);
|
|
34
|
+
if (!byPriority[p]) byPriority[p] = [];
|
|
35
|
+
byPriority[p].push(a);
|
|
19
36
|
}
|
|
20
|
-
|
|
21
|
-
|
|
22
|
-
|
|
37
|
+
|
|
38
|
+
for (const p of ["P1", "P2", "P3", "P4"]) {
|
|
39
|
+
const alerts = byPriority[p];
|
|
40
|
+
if (!alerts?.length) continue;
|
|
41
|
+
parts.push(`${PRIORITY_LABELS[p]} on <b>${serverName}</b>:\n`);
|
|
42
|
+
for (const a of alerts) parts.push(` \u2022 <b>${a.title}</b>\n ${a.recommendation}\n`);
|
|
23
43
|
}
|
|
24
44
|
}
|
|
25
45
|
|
package/src/push/forge.ts
CHANGED
|
@@ -1,18 +1,55 @@
|
|
|
1
|
+
import https from "https";
|
|
2
|
+
import tls from "tls";
|
|
3
|
+
import crypto from "crypto";
|
|
1
4
|
import type { Snapshot } from "../lib/types.js";
|
|
2
5
|
|
|
6
|
+
/** HTTPS agent enforcing the configured public-key pin; undefined when pinning is off. */
let agent: https.Agent | undefined;

/**
 * Enables or disables public-key pinning for pushes.
 *
 * With a pin, an agent is installed whose identity check first runs the
 * standard hostname verification and then requires the base64-encoded
 * SHA-256 digest of the certificate's `pubkey` to equal the pin. Without a
 * pin (undefined or empty string) the agent is cleared.
 *
 * @param tlsPin Expected base64 SHA-256 digest of the server's public key.
 */
export function initForgeAgent(tlsPin?: string): void {
  if (tlsPin) {
    const verifyPinnedIdentity = (hostname: string, cert: tls.PeerCertificate): Error | undefined => {
      // Standard hostname/SAN verification comes first; the pin is an extra requirement.
      const identityError = tls.checkServerIdentity(hostname, cert);
      if (identityError) {
        return identityError;
      }

      const publicKey = cert.pubkey;
      if (!publicKey) {
        return new Error("Certificate has no public key");
      }

      const digest = crypto.createHash("sha256").update(publicKey).digest("base64");
      if (digest === tlsPin) {
        return undefined;
      }

      return new Error(
        `TLS pin mismatch for ${hostname}. ` +
          `Expected: ${tlsPin}, Got: ${digest}. ` +
          `If the server certificate was rotated with a new key, update tls_pin in collector.yaml.`,
      );
    };

    agent = new https.Agent({
      rejectUnauthorized: true,
      checkServerIdentity: verifyPinnedIdentity,
    });
    return;
  }

  // No pin configured: fall back to the default (Node built-in fetch).
  agent = undefined;
}
|
|
36
|
+
|
|
3
37
|
export async function pushToForge(url: string, apiKey: string, snapshot: Snapshot): Promise<boolean> {
|
|
38
|
+
// If TLS pinning is enabled, use https.request (fetch doesn't support custom agents)
|
|
39
|
+
if (agent) {
|
|
40
|
+
return pushWithAgent(url, apiKey, snapshot);
|
|
41
|
+
}
|
|
42
|
+
|
|
43
|
+
// Default: use fetch (no pinning)
|
|
4
44
|
try {
|
|
5
45
|
const response = await fetch(`${url}/api/v1/ingest`, {
|
|
6
46
|
method: "POST",
|
|
7
|
-
headers: {
|
|
8
|
-
Authorization: `Bearer ${apiKey}`,
|
|
9
|
-
"Content-Type": "application/json",
|
|
10
|
-
},
|
|
47
|
+
headers: { Authorization: `Bearer ${apiKey}`, "Content-Type": "application/json" },
|
|
11
48
|
body: JSON.stringify(snapshot),
|
|
12
49
|
signal: AbortSignal.timeout(10000),
|
|
13
50
|
});
|
|
14
51
|
if (response.ok) {
|
|
15
|
-
const data = await response.json() as {
|
|
52
|
+
const data = await response.json() as { active_alerts?: number };
|
|
16
53
|
console.log(`[forge] Push successful. Active alerts: ${data.active_alerts ?? 0}`);
|
|
17
54
|
} else {
|
|
18
55
|
console.error(`[forge] Push failed: ${response.status} ${response.statusText}`);
|
|
@@ -23,3 +60,50 @@ export async function pushToForge(url: string, apiKey: string, snapshot: Snapsho
|
|
|
23
60
|
return false;
|
|
24
61
|
}
|
|
25
62
|
}
|
|
63
|
+
|
|
64
|
+
/**
 * Pushes a snapshot through the pinned HTTPS agent.
 *
 * Counterpart of the fetch-based path for when TLS pinning is enabled. It
 * never rejects: every failure (invalid URL, connection or pin error,
 * timeout, non-2xx status, aborted response) is logged and reported as
 * `false`, so callers that do not await the result cannot trigger an
 * unhandled rejection.
 *
 * @param url Base URL of the server.
 * @param apiKey Bearer token sent in the Authorization header.
 * @param snapshot Snapshot to send as the JSON body.
 * @returns Resolves to true on a 2xx response, false otherwise.
 */
function pushWithAgent(url: string, apiKey: string, snapshot: Snapshot): Promise<boolean> {
  return new Promise((resolve) => {
    try {
      // new URL() throws on a malformed base URL; the surrounding try turns
      // that into a logged `false` instead of a rejected promise.
      const endpoint = new URL(`${url}/api/v1/ingest`);
      const body = JSON.stringify(snapshot);

      const req = https.request({
        hostname: endpoint.hostname,
        port: endpoint.port ? parseInt(endpoint.port, 10) : 443,
        // Keep any query string that is part of the configured base URL.
        path: endpoint.pathname + endpoint.search,
        method: "POST",
        agent,
        headers: {
          Authorization: `Bearer ${apiKey}`,
          "Content-Type": "application/json",
          "Content-Length": Buffer.byteLength(body),
        },
        timeout: 10000,
      }, (res) => {
        // Decode as UTF-8 at the stream level so a multi-byte character split
        // across two chunks is not corrupted by string concatenation.
        res.setEncoding("utf8");
        let data = "";
        res.on("data", (chunk) => data += chunk);
        res.on("end", () => {
          if (res.statusCode && res.statusCode >= 200 && res.statusCode < 300) {
            try {
              const payload = JSON.parse(data);
              console.log(`[forge] Push successful (pinned). Active alerts: ${payload.active_alerts ?? 0}`);
            } catch { /* ignore parse errors */ }
            resolve(true);
          } else {
            console.error(`[forge] Push failed (pinned): ${res.statusCode}`);
            resolve(false);
          }
        });
        // A response aborted mid-body may never emit 'end'; settle here so the
        // promise cannot hang.
        res.on("error", (err) => {
          console.error(`[forge] Push failed (pinned): ${err.message}`);
          resolve(false);
        });
      });

      req.on("error", (err) => {
        console.error(`[forge] Push failed (pinned): ${err.message}`);
        resolve(false);
      });
      req.on("timeout", () => {
        // Destroying with an error routes through the 'error' handler above,
        // which does the logging.
        req.destroy(new Error("Request timed out"));
        resolve(false);
      });
      req.write(body);
      req.end();
    } catch (err) {
      console.error(`[forge] Push failed (pinned): ${err instanceof Error ? err.message : String(err)}`);
      resolve(false);
    }
  });
}
|