bgrun 3.12.12 → 3.12.14

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
package/src/guard.ts DELETED
@@ -1,208 +0,0 @@
1
- /**
2
- * BGR Standalone Process Guard
3
- *
4
- * Runs as an independent process that monitors ALL guarded processes
5
- * (BGR_KEEP_ALIVE=true) and the dashboard itself. If the dashboard
6
- * crashes, the guard restarts it. If any guarded process dies, the
7
- * guard restarts it.
8
- *
9
- * This is the "outer shell" — it cannot be killed by a dashboard crash.
10
- *
11
- * Usage:
12
- * bgrun --guard # Start guard as a managed bgrun process
13
- * bgrun --_guard-loop # (Internal) Actually run the guard loop
14
- * bgrun --_guard-loop 30 # Check every 30 seconds
15
- */
16
-
17
- import { getAllProcesses, getProcess } from './db';
18
- import { isProcessRunning, getProcessPorts, findChildPid } from './platform';
19
- import { handleRun } from './commands/run';
20
- import { parseEnvString } from './utils';
21
- import { createHmac } from 'crypto';
22
-
23
// ── Webhook configuration (read once from the environment at module load) ──

/** Endpoint that receives guard events; an empty string disables webhook notifications. */
const WEBHOOK_URL = process.env.BGR_WEBHOOK_URL ?? '';

/** Shared secret for the HMAC-SHA256 request signature; an empty string disables signing. */
const WEBHOOK_SECRET = process.env.BGR_WEBHOOK_SECRET ?? '';

// ── Guard timing ──

/** Delay between guard cycles when the caller does not supply one (30 seconds). */
const DEFAULT_INTERVAL_MS = 30 * 1_000;

/** Upper bound for the crash-loop backoff delay (5 minutes). */
const MAX_BACKOFF_MS = 5 * 60 * 1_000;

/** Number of restarts tolerated before backoff delays start being applied. */
const CRASH_THRESHOLD = 5;

/** How long a restarted process must stay alive before its counters are reset (2 minutes). */
const STABILITY_WINDOW_MS = 2 * 60 * 1_000;
31
-
32
/**
 * In-memory bookkeeping for the guard loop. Every map is keyed by process name.
 * Nothing here is persisted, so all counters start from scratch when the guard
 * process itself is restarted.
 */
interface GuardState {
  /** Successful restarts performed for a process since its counters were last reset. */
  restartCounts: Map<string, number>;
  /** Epoch milliseconds before which a dead process must not be restarted (crash-loop backoff). */
  nextRestartTime: Map<string, number>;
  /** Epoch milliseconds at which a restarted process was first observed alive again. */
  lastSeenAlive: Map<string, number>;
}

/** Module-level singleton holding the guard's bookkeeping. */
const state: GuardState = {
  restartCounts: new Map<string, number>(),
  nextRestartTime: new Map<string, number>(),
  lastSeenAlive: new Map<string, number>(),
};
43
-
44
/**
 * Sends a guard event to the configured webhook as a JSON POST request.
 *
 * Does nothing when `WEBHOOK_URL` is empty. When `WEBHOOK_SECRET` is set, the
 * request carries an `X-BGR-Signature: sha256=<hex>` header containing the
 * HMAC-SHA256 of the exact request body.
 *
 * The call is best-effort: network errors, timeouts and non-2xx responses are
 * logged and never thrown, so callers may safely ignore the returned promise.
 *
 * @param event   Kind of event being reported.
 * @param name    Name of the process the event refers to (sent as `process`).
 * @param details Extra fields merged into the payload. They are spread last,
 *                so a key named `event`, `process` or `timestamp` overrides
 *                the built-in value.
 */
async function notifyWebhook(event: 'crash' | 'restart' | 'restart_failed', name: string, details: Record<string, any>) {
  if (!WEBHOOK_URL) return;

  // Abort the request if the endpoint has not answered within 5 seconds.
  const controller = new AbortController();
  const timeout = setTimeout(() => controller.abort(), 5000);

  try {
    const payload = JSON.stringify({
      event,
      process: name,
      timestamp: new Date().toISOString(),
      ...details,
    });

    const headers: Record<string, string> = {
      'Content-Type': 'application/json',
      'User-Agent': 'bgrun-guard/1.0',
    };

    // HMAC signature if a secret is configured
    if (WEBHOOK_SECRET) {
      const sig = createHmac('sha256', WEBHOOK_SECRET).update(payload).digest('hex');
      headers['X-BGR-Signature'] = `sha256=${sig}`;
    }

    const response = await fetch(WEBHOOK_URL, {
      method: 'POST',
      headers,
      body: payload,
      signal: controller.signal,
    });

    // fetch() only rejects on network-level failures; surface HTTP errors too.
    if (!response.ok) {
      console.error(`[guard] Webhook failed: HTTP ${response.status}`);
    }
  } catch (err: unknown) {
    const reason = err instanceof Error ? err.message : String(err);
    console.error(`[guard] Webhook failed: ${reason}`);
  } finally {
    // Always release the timer, including when fetch() rejects.
    clearTimeout(timeout);
  }
}
79
-
80
/**
 * Restarts a registered process by name via `handleRun` with `force: true`.
 *
 * @param name Name of the process to restart.
 * @returns `true` when `handleRun` completed without throwing, `false`
 *          otherwise. Failures are logged and never propagated.
 */
async function restartProcess(name: string): Promise<boolean> {
  try {
    await handleRun({
      action: 'run',
      name,
      force: true,
      remoteName: '',
    });
    return true;
  } catch (err: unknown) {
    // Narrow before reading `.message`: a rejected value is not guaranteed to
    // be an Error, and dereferencing null/undefined here would throw from
    // inside the handler and break the "never throws" contract.
    const reason = err instanceof Error ? err.message : String(err);
    console.error(`[guard] ✗ Failed to restart "${name}": ${reason}`);
    return false;
  }
}
94
-
95
/**
 * Computes the crash-loop delay to apply after a given number of restarts.
 *
 * No delay is applied for the first `CRASH_THRESHOLD` restarts. After that the
 * delay starts at 30 seconds and doubles with every further restart, capped at
 * `MAX_BACKOFF_MS`.
 *
 * @param restartCount Number of restarts recorded for the process so far.
 * @returns Delay in milliseconds; `0` means "no backoff".
 */
function getBackoffMs(restartCount: number): number {
  if (restartCount <= CRASH_THRESHOLD) {
    return 0;
  }

  // First restart past the threshold → 0 doublings → the 30 s base delay.
  const doublings = restartCount - CRASH_THRESHOLD - 1;
  const uncapped = 30_000 * 2 ** doublings;
  return Math.min(uncapped, MAX_BACKOFF_MS);
}
100
-
101
/**
 * Runs a single monitoring pass over all registered processes.
 *
 * A process is monitored when its stored env contains `BGR_KEEP_ALIVE=true`
 * or when it is named `bgr-dashboard`; the `bgr-guard` entry is always skipped.
 * For each monitored process:
 *
 * - Dead (and with a positive recorded PID): restart it unless it is still
 *   inside its crash-loop backoff window. A successful restart increments the
 *   restart counter and may schedule a backoff. A failed restart leaves the
 *   counter untouched, so it is retried on the next cycle.
 * - Alive with a non-zero restart counter: once it has been observed alive
 *   for longer than `STABILITY_WINDOW_MS`, its counters are cleared.
 *
 * Webhook notifications are fire-and-forget. This function never throws;
 * every error is logged with a `[guard]` prefix.
 */
async function guardCycle(): Promise<void> {
  try {
    const processes = getAllProcesses();
    if (processes.length === 0) return;

    // One timestamp per cycle keeps backoff and stability checks consistent.
    const now = Date.now();
    let checked = 0;
    let restarted = 0;
    let skipped = 0;

    for (const proc of processes) {
      // Skip the guard process itself
      if (proc.name === 'bgr-guard') continue;

      const env = proc.env ? parseEnvString(proc.env) : {};
      const isGuarded = env.BGR_KEEP_ALIVE === 'true';
      const isDashboard = proc.name === 'bgr-dashboard';

      // Guard both: explicitly guarded processes AND the dashboard
      if (!isGuarded && !isDashboard) continue;

      checked++;

      try {
        const alive = await isProcessRunning(proc.pid, proc.command);

        if (!alive && proc.pid > 0) {
          // The process was seen dead, so any "alive since" timestamp is
          // stale. Dropping it makes the stability window start over the next
          // time the process is observed alive, instead of counting downtime.
          state.lastSeenAlive.delete(proc.name);

          // Still inside the crash-loop backoff window: leave it for a later cycle.
          const nextRestart = state.nextRestartTime.get(proc.name) || 0;
          if (now < nextRestart) {
            skipped++;
            continue;
          }

          console.log(`[guard] ⚠ "${proc.name}" (PID ${proc.pid}) is dead — restarting...`);

          // Notify crash detected (best-effort; notifyWebhook handles its own errors)
          void notifyWebhook('crash', proc.name, { pid: proc.pid, isDashboard });

          const success = await restartProcess(proc.name);
          if (success) {
            const count = (state.restartCounts.get(proc.name) || 0) + 1;
            state.restartCounts.set(proc.name, count);

            const backoff = getBackoffMs(count);
            if (backoff > 0) {
              state.nextRestartTime.set(proc.name, now + backoff);
              console.log(`[guard] ✓ Restarted "${proc.name}" (#${count}). Crash loop: next check in ${Math.round(backoff / 1000)}s`);
            } else {
              console.log(`[guard] ✓ Restarted "${proc.name}" (#${count})`);
            }
            restarted++;

            // Notify restart success
            void notifyWebhook('restart', proc.name, { pid: proc.pid, restartCount: count, backoffMs: backoff });
          } else {
            // Notify restart failed
            void notifyWebhook('restart_failed', proc.name, { pid: proc.pid });
          }
        } else if (alive) {
          // Track stability — if alive for STABILITY_WINDOW, reset counters
          const count = state.restartCounts.get(proc.name) || 0;
          if (count > 0) {
            const lastSeen = state.lastSeenAlive.get(proc.name);
            if (!lastSeen) {
              // First alive observation since the last restart: start the window.
              state.lastSeenAlive.set(proc.name, now);
            } else if (now - lastSeen > STABILITY_WINDOW_MS) {
              state.restartCounts.delete(proc.name);
              state.nextRestartTime.delete(proc.name);
              state.lastSeenAlive.delete(proc.name);
              console.log(`[guard] ✓ "${proc.name}" stable for ${Math.round(STABILITY_WINDOW_MS / 1000)}s — reset counters`);
            }
          }
        }
      } catch (err: unknown) {
        // A failure on one process must not stop the rest from being checked.
        const reason = err instanceof Error ? err.message : String(err);
        console.error(`[guard] Error checking "${proc.name}": ${reason}`);
      }
    }

    // Only summarize cycles that actually did something, to keep the log quiet.
    if (restarted > 0) {
      console.log(`[guard] Cycle: ${checked} checked, ${restarted} restarted, ${skipped} in backoff`);
    }
  } catch (err: unknown) {
    const reason = err instanceof Error ? err.message : String(err);
    console.error(`[guard] Error in guard cycle: ${reason}`);
  }
}
189
-
190
/**
 * Starts the standalone guard: prints a startup banner, runs one guard cycle
 * immediately, then keeps running cycles on a fixed interval.
 *
 * The returned promise resolves once the first cycle has finished and the
 * timer has been scheduled; the timer itself keeps running afterwards.
 *
 * @param intervalMs Delay between cycles in milliseconds. Values that are not
 *                   finite positive numbers fall back to `DEFAULT_INTERVAL_MS`;
 *                   values above the largest delay Node timers support are
 *                   clamped to that limit.
 */
export async function startGuardLoop(intervalMs: number = DEFAULT_INTERVAL_MS): Promise<void> {
  // Node coerces delays below 1 or above 2^31-1 to 1 ms, which would turn the
  // guard into a busy loop. Reject or clamp such values up front.
  const MAX_TIMER_DELAY_MS = 2_147_483_647;
  const requested = Number.isFinite(intervalMs) && intervalMs > 0 ? intervalMs : DEFAULT_INTERVAL_MS;
  const interval = Math.min(requested, MAX_TIMER_DELAY_MS);

  console.log(`[guard] ═══════════════════════════════════════════`);
  console.log(`[guard] 🛡️ BGR Standalone Guard started`);
  console.log(`[guard] Check interval: ${interval / 1000}s`);
  console.log(`[guard] Crash backoff threshold: ${CRASH_THRESHOLD} restarts`);
  console.log(`[guard] Stability window: ${STABILITY_WINDOW_MS / 1000}s`);
  console.log(`[guard] Monitoring: BGR_KEEP_ALIVE=true + bgr-dashboard`);
  console.log(`[guard] Webhook: ${WEBHOOK_URL || '(none — set BGR_WEBHOOK_URL to enable)'}`);
  console.log(`[guard] Started: ${new Date().toLocaleString()}`);
  console.log(`[guard] ═══════════════════════════════════════════`);

  // A cycle awaits process restarts and can outlast the interval. Skipping a
  // tick while one is still running stops two cycles from restarting the same
  // process at the same time.
  let cycleInFlight = false;
  const runCycle = async (): Promise<void> => {
    if (cycleInFlight) return;
    cycleInFlight = true;
    try {
      await guardCycle();
    } finally {
      cycleInFlight = false;
    }
  };

  // Run initial check immediately
  await runCycle();

  // Then run on interval
  setInterval(() => {
    void runCycle();
  }, interval);
}