rollbridge 0.1.5 → 0.1.6
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/README.md +104 -4
- package/TODO.md +42 -40
- package/docs/cli.md +146 -6
- package/docs/config.md +139 -2
- package/docs/logging.md +77 -0
- package/docs/releasing.md +53 -0
- package/docs/tensorbuzz-runbook.md +129 -0
- package/docs/velocious.md +49 -11
- package/docs/workers.md +115 -0
- package/package.json +1 -1
- package/src/cli.js +290 -1
- package/src/config.js +169 -6
- package/src/daemon.js +216 -13
- package/src/doctor.js +177 -0
- package/src/event-log.js +47 -0
- package/src/managed-process.js +225 -16
- package/src/process-memory.js +110 -0
- package/src/recover.js +134 -0
- package/src/release-group.js +71 -21
- package/src/state-store.js +103 -0
- package/src/system-ids.js +71 -0
- package/src/template.js +32 -0
- package/test/completion.test.js +64 -0
- package/test/config-validation.test.js +227 -0
- package/test/doctor.test.js +205 -3
- package/test/event-log.test.js +46 -0
- package/test/fixtures/memory-hog.js +19 -0
- package/test/managed-process.test.js +290 -0
- package/test/process-memory.test.js +40 -0
- package/test/recover.test.js +162 -0
- package/test/release-group.test.js +22 -0
- package/test/rollbridge.test.js +523 -6
- package/test/state-store.test.js +69 -0
- package/test/system-ids.test.js +24 -0
package/src/managed-process.js
CHANGED
|
@@ -2,14 +2,16 @@
|
|
|
2
2
|
|
|
3
3
|
import {EventEmitter} from "node:events"
|
|
4
4
|
import {spawn} from "node:child_process"
|
|
5
|
+
import {processGroupMembers} from "./process-memory.js"
|
|
5
6
|
|
|
6
7
|
/**
|
|
7
8
|
* @typedef {import("./json.js").JsonValue} JsonValue
|
|
8
9
|
* @typedef {"starting" | "running" | "stopping" | "stopped" | "failed"} ManagedProcessState
|
|
10
|
+
* @typedef {"deploy" | "crash" | "manual" | "memory"} ManagedProcessStartReason
|
|
9
11
|
* @typedef {import("node:child_process").ChildProcess["signalCode"]} ProcessExitSignal
|
|
10
12
|
* @typedef {{at: string, line: string, stream: "stdout" | "stderr"}} ManagedProcessLog
|
|
11
|
-
* @typedef {{command: string, cwd: string | undefined, env: Record<string, string | undefined>, logger: (message: string, data?: Record<string, import("./json.js").JsonValue>) => void, outputLines: number, restart: import("./config.js").RestartConfig, restartDelayMs: number, shouldRestart: () => boolean, stopTimeoutMs: number}} ManagedProcessDefinition
|
|
12
|
-
* @typedef {{command: string, cwd: string | undefined, exitCode: number | null | undefined, exitSignal: ProcessExitSignal | undefined, id: string, logs: ManagedProcessLog[], pid: number | undefined, restarts: number, startedAt: string | undefined, state: ManagedProcessState, uptimeMs: number | undefined}} ManagedProcessStatus
|
|
13
|
+
* @typedef {{command: string, cwd: string | undefined, env: Record<string, string | undefined>, lifecycle: import("./config.js").LifecycleConfig, logger: (message: string, data?: Record<string, import("./json.js").JsonValue>) => void, memory: import("./config.js").MemoryConfig | undefined, outputLines: number, restart: import("./config.js").RestartConfig, restartDelayMs: number, shouldRestart: () => boolean, stopSignal: string, stopTimeoutMs: number}} ManagedProcessDefinition
|
|
14
|
+
* @typedef {{children: import("./process-memory.js").ProcessGroupMember[], command: string, cwd: string | undefined, exitCode: number | null | undefined, exitSignal: ProcessExitSignal | undefined, id: string, lastMemoryRestartAt: string | undefined, lastStartReason: ManagedProcessStartReason | undefined, logs: ManagedProcessLog[], memoryRestarts: number, pid: number | undefined, restarts: number, rssBytes: number | undefined, startedAt: string | undefined, state: ManagedProcessState, uptimeMs: number | undefined}} ManagedProcessStatus
|
|
13
15
|
*/
|
|
14
16
|
|
|
15
17
|
export default class ManagedProcess extends EventEmitter {
|
|
@@ -20,29 +22,43 @@ export default class ManagedProcess extends EventEmitter {
|
|
|
20
22
|
* @param {Record<string, string | undefined>} args.env - Environment.
|
|
21
23
|
* @param {string} args.id - Process id.
|
|
22
24
|
* @param {(message: string, data?: Record<string, JsonValue>) => void} args.logger - Logger callback.
|
|
25
|
+
* @param {import("./config.js").LifecycleConfig} [args.lifecycle] - Graceful-stop lifecycle hooks (none by default).
|
|
26
|
+
* @param {import("./config.js").MemoryConfig} [args.memory] - Memory supervision config (off when omitted).
|
|
23
27
|
* @param {number} args.outputLines - Recent stdout/stderr lines to retain and report.
|
|
24
28
|
* @param {import("./config.js").RestartConfig} [args.restart] - Restart policy (defaults to unlimited restarts with a constant delay).
|
|
25
29
|
* @param {number} args.restartDelayMs - Restart delay.
|
|
26
30
|
* @param {() => boolean} args.shouldRestart - Restart policy callback.
|
|
31
|
+
* @param {string} [args.stopSignal] - Signal sent to gracefully stop the process (default "SIGTERM").
|
|
27
32
|
* @param {number} args.stopTimeoutMs - Stop timeout.
|
|
28
33
|
*/
|
|
29
|
-
constructor({command, cwd, env, id, logger, outputLines, restart = {backoffFactor: 1, maxDelayMs: 0, maxRestarts: undefined, windowMs: 0}, restartDelayMs, shouldRestart, stopTimeoutMs}) {
|
|
34
|
+
constructor({command, cwd, env, id, lifecycle = {drainTimeoutMs: 0}, logger, memory, outputLines, restart = {backoffFactor: 1, maxDelayMs: 0, maxRestarts: undefined, windowMs: 0}, restartDelayMs, shouldRestart, stopSignal = "SIGTERM", stopTimeoutMs}) {
|
|
30
35
|
super()
|
|
31
36
|
|
|
32
37
|
this.command = command
|
|
33
38
|
this.cwd = cwd
|
|
34
39
|
this.env = env
|
|
35
40
|
this.id = id
|
|
41
|
+
this.lifecycle = lifecycle
|
|
36
42
|
this.logger = logger
|
|
43
|
+
this.memory = memory
|
|
37
44
|
this.outputLines = outputLines
|
|
38
45
|
this.restart = restart
|
|
39
46
|
this.restartDelayMs = restartDelayMs
|
|
40
47
|
this.shouldRestart = shouldRestart
|
|
48
|
+
this.stopSignal = stopSignal
|
|
41
49
|
this.stopTimeoutMs = stopTimeoutMs
|
|
42
50
|
this.state = /** @type {ManagedProcessState} */ ("stopped")
|
|
51
|
+
this.lastStartReason = /** @type {ManagedProcessStartReason | undefined} */ (undefined)
|
|
43
52
|
this.logs = /** @type {ManagedProcessLog[]} */ ([])
|
|
44
53
|
this.restarts = 0
|
|
45
54
|
this.recentRestarts = /** @type {number[]} */ ([])
|
|
55
|
+
this.rssBytes = /** @type {number | undefined} */ (undefined)
|
|
56
|
+
this.children = /** @type {import("./process-memory.js").ProcessGroupMember[]} */ ([])
|
|
57
|
+
this.memoryRestarts = 0
|
|
58
|
+
this.lastMemoryRestartAtMs = /** @type {number | undefined} */ (undefined)
|
|
59
|
+
this.memoryTimer = /** @type {ReturnType<typeof setInterval> | undefined} */ (undefined)
|
|
60
|
+
this.memoryRestarting = false
|
|
61
|
+
this.memoryWarned = false
|
|
46
62
|
this.startedAtMs = /** @type {number | undefined} */ (undefined)
|
|
47
63
|
this.intentionalStop = false
|
|
48
64
|
this.restartTimer = undefined
|
|
@@ -53,8 +69,11 @@ export default class ManagedProcess extends EventEmitter {
|
|
|
53
69
|
this.exitSignal = undefined
|
|
54
70
|
}
|
|
55
71
|
|
|
56
|
-
/**
|
|
57
|
-
|
|
72
|
+
/**
|
|
73
|
+
* @param {ManagedProcessStartReason} [reason] - Why the process is being started (deploy by default; "crash" on auto-restart, "manual" via the restart command).
|
|
74
|
+
* @returns {Promise<void>} Resolves after spawn.
|
|
75
|
+
*/
|
|
76
|
+
async start(reason = "deploy") {
|
|
58
77
|
if (this.child) return
|
|
59
78
|
|
|
60
79
|
this.intentionalStop = false
|
|
@@ -83,7 +102,9 @@ export default class ManagedProcess extends EventEmitter {
|
|
|
83
102
|
child.once("spawn", () => {
|
|
84
103
|
this.state = "running"
|
|
85
104
|
this.startedAtMs = Date.now()
|
|
86
|
-
this.
|
|
105
|
+
this.lastStartReason = reason
|
|
106
|
+
this.logger("process started", {command: this.command, id: this.id, pid: child.pid || null, reason})
|
|
107
|
+
this.startMemoryMonitor()
|
|
87
108
|
this.emit("started")
|
|
88
109
|
resolve(undefined)
|
|
89
110
|
})
|
|
@@ -107,11 +128,14 @@ export default class ManagedProcess extends EventEmitter {
|
|
|
107
128
|
this.command = definition.command
|
|
108
129
|
this.cwd = definition.cwd
|
|
109
130
|
this.env = definition.env
|
|
131
|
+
this.lifecycle = definition.lifecycle
|
|
110
132
|
this.logger = definition.logger
|
|
133
|
+
this.memory = definition.memory
|
|
111
134
|
this.outputLines = definition.outputLines
|
|
112
135
|
this.restart = definition.restart
|
|
113
136
|
this.restartDelayMs = definition.restartDelayMs
|
|
114
137
|
this.shouldRestart = definition.shouldRestart
|
|
138
|
+
this.stopSignal = definition.stopSignal
|
|
115
139
|
this.stopTimeoutMs = definition.stopTimeoutMs
|
|
116
140
|
}
|
|
117
141
|
|
|
@@ -145,6 +169,9 @@ export default class ManagedProcess extends EventEmitter {
|
|
|
145
169
|
this.child = undefined
|
|
146
170
|
this.pid = undefined
|
|
147
171
|
this.exitPromise = undefined
|
|
172
|
+
this.rssBytes = undefined
|
|
173
|
+
this.children = []
|
|
174
|
+
this.clearMemoryMonitor()
|
|
148
175
|
this.state = wasIntentional ? "stopped" : "failed"
|
|
149
176
|
this.logger("process exited", {code, id: this.id, signal})
|
|
150
177
|
this.emit("exit", {code, signal})
|
|
@@ -206,10 +233,103 @@ export default class ManagedProcess extends EventEmitter {
|
|
|
206
233
|
this.restartTimer = setTimeout(() => {
|
|
207
234
|
this.restartTimer = undefined
|
|
208
235
|
this.restarts += 1
|
|
209
|
-
this.start().catch((error) => {
|
|
236
|
+
this.start("crash").catch((error) => {
|
|
210
237
|
this.logger("process restart failed", {error: error instanceof Error ? error.message : String(error), id: this.id})
|
|
211
238
|
})
|
|
212
239
|
}, delayMs)
|
|
240
|
+
|
|
241
|
+
// The daemon's listening servers govern its lifetime; a pending restart must never be the sole
|
|
242
|
+
// handle keeping the process alive (like the memory and persist timers above). Otherwise a
|
|
243
|
+
// crashed process with an unlimited restart policy would respawn forever and block exit.
|
|
244
|
+
this.restartTimer.unref?.()
|
|
245
|
+
}
|
|
246
|
+
|
|
247
|
+
/**
|
|
248
|
+
* Starts the periodic RSS check for this process when memory supervision is configured.
|
|
249
|
+
* @returns {void}
|
|
250
|
+
*/
|
|
251
|
+
startMemoryMonitor() {
|
|
252
|
+
this.clearMemoryMonitor()
|
|
253
|
+
|
|
254
|
+
if (!this.memory) return
|
|
255
|
+
|
|
256
|
+
this.memoryTimer = setInterval(() => this.checkMemory(), this.memory.checkIntervalMs)
|
|
257
|
+
this.memoryTimer.unref?.()
|
|
258
|
+
}
|
|
259
|
+
|
|
260
|
+
/** @returns {void} Stops the periodic RSS check. */
|
|
261
|
+
clearMemoryMonitor() {
|
|
262
|
+
if (this.memoryTimer) {
|
|
263
|
+
clearInterval(this.memoryTimer)
|
|
264
|
+
this.memoryTimer = undefined
|
|
265
|
+
}
|
|
266
|
+
}
|
|
267
|
+
|
|
268
|
+
/**
|
|
269
|
+
* Measures the process group's RSS and warns or restarts when it crosses the configured thresholds.
|
|
270
|
+
* @returns {void}
|
|
271
|
+
*/
|
|
272
|
+
checkMemory() {
|
|
273
|
+
if (!this.memory || !this.pid || this.memoryRestarting) return
|
|
274
|
+
|
|
275
|
+
const members = processGroupMembers(this.pid)
|
|
276
|
+
|
|
277
|
+
if (members.length === 0) return
|
|
278
|
+
|
|
279
|
+
this.children = members
|
|
280
|
+
|
|
281
|
+
const measured = members.filter((member) => member.rssBytes !== undefined)
|
|
282
|
+
|
|
283
|
+
if (measured.length === 0) return
|
|
284
|
+
|
|
285
|
+
const rssBytes = measured.reduce((total, member) => total + (member.rssBytes ?? 0), 0)
|
|
286
|
+
|
|
287
|
+
this.rssBytes = rssBytes
|
|
288
|
+
|
|
289
|
+
if (rssBytes > this.memory.limitBytes) {
|
|
290
|
+
this.logger("memory limit exceeded", {id: this.id, limitBytes: this.memory.limitBytes, rssBytes})
|
|
291
|
+
void this.restartForMemory()
|
|
292
|
+
|
|
293
|
+
return
|
|
294
|
+
}
|
|
295
|
+
|
|
296
|
+
if (this.memory.warnBytes > 0 && rssBytes > this.memory.warnBytes) {
|
|
297
|
+
if (!this.memoryWarned) {
|
|
298
|
+
this.logger("memory warning", {id: this.id, rssBytes, warnBytes: this.memory.warnBytes})
|
|
299
|
+
this.memoryWarned = true
|
|
300
|
+
}
|
|
301
|
+
} else {
|
|
302
|
+
this.memoryWarned = false
|
|
303
|
+
}
|
|
304
|
+
}
|
|
305
|
+
|
|
306
|
+
/**
|
|
307
|
+
* Gracefully restarts the process after it exceeded its memory limit (SIGTERM, then
|
|
308
|
+
* SIGKILL after the stop timeout), recording the restart so status can report it.
|
|
309
|
+
* @returns {Promise<void>} Resolves once the process has been restarted.
|
|
310
|
+
*/
|
|
311
|
+
async restartForMemory() {
|
|
312
|
+
if (this.memoryRestarting) return
|
|
313
|
+
|
|
314
|
+
this.memoryRestarting = true
|
|
315
|
+
|
|
316
|
+
try {
|
|
317
|
+
await this.stop()
|
|
318
|
+
|
|
319
|
+
// Don't respawn if the supervising context no longer wants this process running
|
|
320
|
+
// (daemon shutting down, or the release draining/retired) — otherwise a restart racing
|
|
321
|
+
// a shutdown could leave a child running after shutdown collected its stop promises.
|
|
322
|
+
if (!this.shouldRestart()) return
|
|
323
|
+
|
|
324
|
+
this.memoryRestarts += 1
|
|
325
|
+
this.lastMemoryRestartAtMs = Date.now()
|
|
326
|
+
this.memoryWarned = false
|
|
327
|
+
await this.start("memory")
|
|
328
|
+
} catch (error) {
|
|
329
|
+
this.logger("memory restart failed", {error: error instanceof Error ? error.message : String(error), id: this.id})
|
|
330
|
+
} finally {
|
|
331
|
+
this.memoryRestarting = false
|
|
332
|
+
}
|
|
213
333
|
}
|
|
214
334
|
|
|
215
335
|
/**
|
|
@@ -218,6 +338,7 @@ export default class ManagedProcess extends EventEmitter {
|
|
|
218
338
|
*/
|
|
219
339
|
async stop(options = {}) {
|
|
220
340
|
this.intentionalStop = true
|
|
341
|
+
this.clearMemoryMonitor()
|
|
221
342
|
|
|
222
343
|
if (this.restartTimer) {
|
|
223
344
|
clearTimeout(this.restartTimer)
|
|
@@ -232,21 +353,104 @@ export default class ManagedProcess extends EventEmitter {
|
|
|
232
353
|
}
|
|
233
354
|
|
|
234
355
|
this.state = "stopping"
|
|
235
|
-
|
|
236
|
-
const
|
|
237
|
-
|
|
238
|
-
|
|
239
|
-
if (
|
|
240
|
-
|
|
241
|
-
|
|
242
|
-
|
|
356
|
+
|
|
357
|
+
const {drainCommand, drainTimeoutMs, quietCommand, stopCommand} = this.lifecycle
|
|
358
|
+
|
|
359
|
+
// 1. Quiesce: tell the process to stop accepting new work.
|
|
360
|
+
if (quietCommand) await this.runHook(quietCommand, this.stopTimeoutMs, "quiet command")
|
|
361
|
+
|
|
362
|
+
// 2. Drain: let in-flight work finish, bounded by drainTimeoutMs (0 skips the step). A
|
|
363
|
+
// drainCommand blocks until drained; otherwise wait for the process to exit on its own.
|
|
364
|
+
if (this.child && drainTimeoutMs > 0) {
|
|
365
|
+
if (drainCommand) await this.runHook(drainCommand, drainTimeoutMs, "drain command")
|
|
366
|
+
else await this.waitForExit(drainTimeoutMs)
|
|
367
|
+
}
|
|
368
|
+
|
|
369
|
+
// 3. Stop whatever is still running, then SIGKILL if it outlasts the graceful window.
|
|
370
|
+
if (this.child) {
|
|
371
|
+
if (stopCommand) await this.runHook(stopCommand, this.stopTimeoutMs, "stop command")
|
|
372
|
+
else this.killProcessGroup(this.stopSignal)
|
|
373
|
+
|
|
374
|
+
const timeoutMs = options.timeoutMs ?? this.stopTimeoutMs
|
|
375
|
+
|
|
376
|
+
if (this.child && !(await this.waitForExit(timeoutMs))) {
|
|
377
|
+
this.logger("process stop timed out; sending SIGKILL", {id: this.id, pid: this.pid})
|
|
378
|
+
this.killProcessGroup("SIGKILL")
|
|
379
|
+
await this.waitForExit(5000)
|
|
380
|
+
}
|
|
243
381
|
}
|
|
244
382
|
|
|
245
383
|
this.state = "stopped"
|
|
246
384
|
}
|
|
247
385
|
|
|
248
386
|
/**
|
|
249
|
-
*
|
|
387
|
+
* Runs a lifecycle hook command, bounded by a timeout so a hung hook can never block stop().
|
|
388
|
+
* Failures are logged and swallowed — the graceful-stop sequence proceeds (and SIGKILL is the
|
|
389
|
+
* ultimate fallback) regardless of the hook's outcome.
|
|
390
|
+
* @param {string} command - Shell command to run.
|
|
391
|
+
* @param {number} timeoutMs - Maximum time to wait for the hook before killing it.
|
|
392
|
+
* @param {string} label - Hook name, for log messages.
|
|
393
|
+
* @returns {Promise<void>} Resolves when the hook exits, errors, or times out.
|
|
394
|
+
*/
|
|
395
|
+
async runHook(command, timeoutMs, label) {
|
|
396
|
+
await new Promise((resolve) => {
|
|
397
|
+
let settled = false
|
|
398
|
+
const finish = () => { if (!settled) { settled = true; resolve(undefined) } }
|
|
399
|
+
|
|
400
|
+
/** @type {import("node:child_process").ChildProcess} */
|
|
401
|
+
let hook
|
|
402
|
+
|
|
403
|
+
try {
|
|
404
|
+
hook = spawn(command, {
|
|
405
|
+
cwd: this.cwd,
|
|
406
|
+
detached: true,
|
|
407
|
+
env: {...process.env, ...this.env, ROLLBRIDGE_PID: this.pid ? String(this.pid) : ""},
|
|
408
|
+
shell: true,
|
|
409
|
+
stdio: "ignore"
|
|
410
|
+
})
|
|
411
|
+
} catch (error) {
|
|
412
|
+
this.logger(`${label} failed`, {error: error instanceof Error ? error.message : String(error), id: this.id})
|
|
413
|
+
finish()
|
|
414
|
+
|
|
415
|
+
return
|
|
416
|
+
}
|
|
417
|
+
|
|
418
|
+
const timer = setTimeout(() => {
|
|
419
|
+
this.logger(`${label} timed out`, {id: this.id, timeoutMs})
|
|
420
|
+
|
|
421
|
+
if (hook.pid) {
|
|
422
|
+
try {
|
|
423
|
+
process.kill(-hook.pid, "SIGKILL")
|
|
424
|
+
} catch {
|
|
425
|
+
// The hook already exited.
|
|
426
|
+
}
|
|
427
|
+
}
|
|
428
|
+
|
|
429
|
+
finish()
|
|
430
|
+
}, timeoutMs)
|
|
431
|
+
|
|
432
|
+
hook.once("exit", (code, signal) => {
|
|
433
|
+
clearTimeout(timer)
|
|
434
|
+
|
|
435
|
+
// A non-zero/signalled exit is surfaced (but still non-fatal); skip when the timeout
|
|
436
|
+
// already killed the hook, which logs separately.
|
|
437
|
+
if (!settled) {
|
|
438
|
+
if (typeof code === "number" && code !== 0) this.logger(`${label} exited non-zero`, {code, id: this.id})
|
|
439
|
+
else if (signal) this.logger(`${label} exited on signal`, {id: this.id, signal})
|
|
440
|
+
}
|
|
441
|
+
|
|
442
|
+
finish()
|
|
443
|
+
})
|
|
444
|
+
hook.once("error", (error) => {
|
|
445
|
+
clearTimeout(timer)
|
|
446
|
+
this.logger(`${label} failed`, {error: error instanceof Error ? error.message : String(error), id: this.id})
|
|
447
|
+
finish()
|
|
448
|
+
})
|
|
449
|
+
})
|
|
450
|
+
}
|
|
451
|
+
|
|
452
|
+
/**
|
|
453
|
+
* @param {string} signal - Signal name to send (the configured stop signal, or "SIGKILL").
|
|
250
454
|
* @returns {void}
|
|
251
455
|
*/
|
|
252
456
|
killProcessGroup(signal) {
|
|
@@ -282,14 +486,19 @@ export default class ManagedProcess extends EventEmitter {
|
|
|
282
486
|
/** @returns {ManagedProcessStatus} Status payload. */
|
|
283
487
|
status() {
|
|
284
488
|
return {
|
|
489
|
+
children: this.children,
|
|
285
490
|
command: this.command,
|
|
286
491
|
cwd: this.cwd,
|
|
287
492
|
exitCode: this.exitCode,
|
|
288
493
|
exitSignal: this.exitSignal,
|
|
289
494
|
id: this.id,
|
|
495
|
+
lastMemoryRestartAt: this.lastMemoryRestartAtMs === undefined ? undefined : new Date(this.lastMemoryRestartAtMs).toISOString(),
|
|
496
|
+
lastStartReason: this.lastStartReason,
|
|
290
497
|
logs: this.logs.slice(-this.outputLines),
|
|
498
|
+
memoryRestarts: this.memoryRestarts,
|
|
291
499
|
pid: this.pid,
|
|
292
500
|
restarts: this.restarts,
|
|
501
|
+
rssBytes: this.rssBytes,
|
|
293
502
|
startedAt: this.startedAtMs === undefined ? undefined : new Date(this.startedAtMs).toISOString(),
|
|
294
503
|
state: this.state,
|
|
295
504
|
uptimeMs: this.state === "running" && this.startedAtMs !== undefined ? Date.now() - this.startedAtMs : undefined
|
|
@@ -0,0 +1,110 @@
|
|
|
1
|
+
// @ts-check
|
|
2
|
+
|
|
3
|
+
import fs from "node:fs"
|
|
4
|
+
|
|
5
|
+
/**
|
|
6
|
+
* @typedef {{command: string, pid: number, rssBytes: number | undefined}} ProcessGroupMember
|
|
7
|
+
*/
|
|
8
|
+
|
|
9
|
+
/**
 * Enumerates the processes that belong to one process group, with their command name and
 * resident memory.
 *
 * Every numeric entry under `/proc` is inspected and kept when its process-group id equals
 * `pgid`. When `/proc` cannot be listed the result is an empty array, as it is when no entry
 * matches.
 * @param {number} pgid - Process-group id to look for.
 * @returns {ProcessGroupMember[]} Matching processes, sorted by ascending pid.
 */
export function processGroupMembers(pgid) {
  /** @type {string[]} */
  let names

  try {
    names = fs.readdirSync("/proc")
  } catch {
    return []
  }

  const numericName = /^\d+$/

  return names
    .flatMap((name) => {
      if (!numericName.test(name) || processGroupId(name) !== pgid) return []

      return [{command: commandName(name), pid: Number(name), rssBytes: residentBytes(name)}]
    })
    .sort((left, right) => left.pid - right.pid)
}
|
|
44
|
+
|
|
45
|
+
/**
 * Adds up the resident memory of every member of a process group whose RSS is readable.
 * @param {number} pgid - Process-group id to measure.
 * @returns {number | undefined} Summed resident memory in bytes, or undefined when no member
 * yielded a measurement.
 */
export function measureProcessGroupRssBytes(pgid) {
  /** @type {number | undefined} */
  let total

  for (const {rssBytes} of processGroupMembers(pgid)) {
    if (rssBytes === undefined) continue

    total = (total ?? 0) + rssBytes
  }

  return total
}
|
|
57
|
+
|
|
58
|
+
/**
 * Reads a process's command name from `/proc/<pid>/comm`.
 * @param {string} pid - Process id (the `/proc` directory name).
 * @returns {string} The trimmed command name, or "" when the file cannot be read.
 */
function commandName(pid) {
  let raw = ""

  try {
    raw = fs.readFileSync(`/proc/${pid}/comm`, "utf8")
  } catch {
    // Unreadable (process gone or no /proc): fall through with the empty name.
  }

  return raw.trim()
}
|
|
69
|
+
|
|
70
|
+
/**
 * Extracts a process's process-group id from `/proc/<pid>/stat`.
 * @param {string} pid - Process id (the `/proc` directory name).
 * @returns {number | undefined} The process-group id, or undefined when the stat file cannot
 * be read or parsed.
 */
function processGroupId(pid) {
  /** @type {string} */
  let text

  try {
    text = fs.readFileSync(`/proc/${pid}/stat`, "utf8")
  } catch {
    return undefined
  }

  // comm is parenthesised and may itself hold spaces or parens, so parsing starts after the
  // last ")". The space-separated fields that follow begin: state, ppid, pgrp, ...
  const closingParen = text.lastIndexOf(")")

  if (closingParen === -1) return undefined

  const [, , pgrpField] = text.slice(closingParen + 2).split(" ")
  const groupId = Number(pgrpField)

  if (!Number.isInteger(groupId)) return undefined

  return groupId
}
|
|
93
|
+
|
|
94
|
+
/**
 * Reads a process's resident set size from the `VmRSS` line of `/proc/<pid>/status`.
 * @param {string} pid - Process id (the `/proc` directory name).
 * @returns {number | undefined} Resident memory in bytes (the kB figure times 1024), or
 * undefined when the file cannot be read or has no `VmRSS` line.
 */
function residentBytes(pid) {
  /** @type {string} */
  let text

  try {
    text = fs.readFileSync(`/proc/${pid}/status`, "utf8")
  } catch {
    return undefined
  }

  const found = /^VmRSS:\s+(\d+)\s+kB/m.exec(text)

  if (found === null) return undefined

  return Number(found[1]) * 1024
}
|
package/src/recover.js
ADDED
|
@@ -0,0 +1,134 @@
|
|
|
1
|
+
// @ts-check
|
|
2
|
+
|
|
3
|
+
import {inspectControlSocket} from "./daemon.js"
|
|
4
|
+
import {clearState, isProcessAlive, liveProcesses, readState} from "./state-store.js"
|
|
5
|
+
|
|
6
|
+
// How long to confirm a SIGKILL'd group has actually exited before reporting it un-stoppable.
|
|
7
|
+
const KILL_CONFIRM_TIMEOUT_MS = 500
|
|
8
|
+
|
|
9
|
+
/**
|
|
10
|
+
* @typedef {{id: string, pid: number, releaseId: string | null}} OrphanProcess
|
|
11
|
+
* @typedef {{error: string}} RecoverError
|
|
12
|
+
* @typedef {{cleared: boolean, forced: boolean, orphans: OrphanProcess[], remaining: OrphanProcess[]}} RecoverReport
|
|
13
|
+
* @typedef {RecoverError | RecoverReport} RecoverResult
|
|
14
|
+
*/
|
|
15
|
+
|
|
16
|
+
/**
 * Finds, and optionally stops, managed processes that outlived the daemon that started them.
 *
 * The persisted state at `config.statePath` is read and reduced to the processes that are
 * still alive. Without `force` this is a dry run that only lists them. With `force` each one
 * is handed to `stopGroup` in turn, and the state file is cleared only when none survive.
 * Survivors are returned in `remaining` with `cleared` false, so the file stays available
 * for a later retry.
 *
 * Returns an error result instead when no `statePath` is configured, or when the control
 * socket appears to be in use (those pids would belong to a live daemon, not a crash). A
 * recycled pid can show up as a false positive, so review the dry-run list before forcing.
 * @param {object} args - Options.
 * @param {import("./config.js").RollbridgeConfig} args.config - Normalized config.
 * @param {boolean} args.force - Whether to actually stop the orphans (otherwise list them).
 * @param {(pid: number, timeoutMs: number) => Promise<boolean>} [args.stopGroup] - Stops a process group and resolves to whether it is gone afterward (defaults to the real implementation; injectable for tests).
 * @returns {Promise<RecoverResult>} The orphans found and whether they were stopped, or an error.
 */
export async function recoverOrphans({config, force, stopGroup = stopProcessGroup}) {
  const {statePath} = config

  if (statePath === undefined) {
    return {error: "No statePath is configured; set statePath in the config to enable recovery."}
  }

  const socketInUse = await daemonIsRunning(config.control.path)

  if (socketInUse) {
    return {error: `A daemon (or another process) is using ${config.control.path}; stop it before recovering — recover is for cleaning up after a crash.`}
  }

  const persisted = await readState(statePath)
  const orphans = liveProcesses(persisted)

  if (!force) {
    return {cleared: false, forced: false, orphans, remaining: []}
  }

  /** @type {OrphanProcess[]} */
  const remaining = []

  // Stop the orphans one at a time, collecting the ones that refuse to go away.
  for (const candidate of orphans) {
    const gone = await stopGroup(candidate.pid, config.proxy.forceStopTimeoutMs)

    if (gone) continue

    remaining.push(candidate)
  }

  // The state file is the only record of the survivors, so it is removed only when there
  // are none left to retry.
  const cleared = remaining.length === 0

  if (cleared) await clearState(statePath)

  return {cleared, forced: true, orphans, remaining}
}
|
|
63
|
+
|
|
64
|
+
/**
 * Probes the control socket to decide whether something is currently serving it.
 * @param {string} socketPath - Control socket path.
 * @returns {Promise<boolean>} The probe's `alive` flag, or true when the probe itself fails.
 */
async function daemonIsRunning(socketPath) {
  try {
    const {alive} = await inspectControlSocket(socketPath)

    return alive
  } catch {
    // An inconclusive probe is treated as "in use" so no process gets stopped by mistake.
    return true
  }
}
|
|
76
|
+
|
|
77
|
+
/**
 * Stops a detached process group in two stages: SIGTERM with the caller's grace period, then
 * SIGKILL with a short confirmation window.
 *
 * At either stage the attempt ends early: with true when the group is already gone, with
 * false when the signal could not be delivered.
 * @param {number} pid - Process-group leader pid (the detached spawn's pid).
 * @param {number} timeoutMs - Grace period between SIGTERM and SIGKILL.
 * @returns {Promise<boolean>} True once the process is gone; false if it is still alive afterward (for example owned by another user, so it can't be signaled).
 */
async function stopProcessGroup(pid, timeoutMs) {
  /** @type {Array<["SIGTERM" | "SIGKILL", number]>} */
  const stages = [
    ["SIGTERM", timeoutMs],
    ["SIGKILL", KILL_CONFIRM_TIMEOUT_MS]
  ]

  for (const [signal, graceMs] of stages) {
    const outcome = sendSignal(pid, signal)

    if (outcome === "gone") return true
    if (outcome === "denied") return false

    if (await waitForExit(pid, graceMs)) return true
  }

  // Still alive even after SIGKILL and its confirmation window.
  return false
}
|
|
98
|
+
|
|
99
|
+
/**
 * Waits for a process to disappear, checking every 50 ms until the timeout runs out.
 * @param {number} pid - Process pid to watch.
 * @param {number} timeoutMs - How long to wait for it to exit.
 * @returns {Promise<boolean>} True once the process is gone, false if it is still alive at the deadline.
 */
async function waitForExit(pid, timeoutMs) {
  const deadline = Date.now() + timeoutMs
  const pause = () => new Promise((resolve) => setTimeout(resolve, 50))

  for (;;) {
    const gone = !isProcessAlive(pid)

    // Either the process exited, or time is up and this last observation is the answer.
    if (gone || Date.now() >= deadline) return gone

    await pause()
  }
}
|
|
116
|
+
|
|
117
|
+
/**
|
|
118
|
+
* Sends a signal to a detached process group, classifying the outcome.
|
|
119
|
+
* @param {number} pid - Process-group leader pid.
|
|
120
|
+
* @param {"SIGTERM" | "SIGKILL"} signal - Signal to send to the group.
|
|
121
|
+
* @returns {"denied" | "gone" | "sent"} `gone` when the group no longer exists (ESRCH), `denied` when it can't be signaled (for example EPERM), otherwise `sent`.
|
|
122
|
+
*/
|
|
123
|
+
function sendSignal(pid, signal) {
|
|
124
|
+
try {
|
|
125
|
+
process.kill(-pid, signal)
|
|
126
|
+
|
|
127
|
+
return "sent"
|
|
128
|
+
} catch (error) {
|
|
129
|
+
if (error && typeof error === "object" && "code" in error && error.code === "ESRCH") return "gone"
|
|
130
|
+
|
|
131
|
+
// EPERM (owned by another user) or anything else: we could not deliver the signal.
|
|
132
|
+
return "denied"
|
|
133
|
+
}
|
|
134
|
+
}
|