rollbridge 0.1.5 → 0.1.7
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/README.md +125 -4
- package/TODO.md +45 -43
- package/docs/cli.md +166 -6
- package/docs/config.md +172 -2
- package/docs/logging.md +77 -0
- package/docs/releasing.md +53 -0
- package/docs/tensorbuzz-runbook.md +129 -0
- package/docs/velocious.md +49 -11
- package/docs/workers.md +115 -0
- package/package.json +1 -1
- package/src/cli.js +327 -1
- package/src/config.js +268 -6
- package/src/daemon.js +216 -13
- package/src/doctor.js +177 -0
- package/src/event-log.js +47 -0
- package/src/managed-process.js +225 -16
- package/src/predeploy-cleanup.js +340 -0
- package/src/process-memory.js +110 -0
- package/src/recover.js +134 -0
- package/src/release-group.js +71 -21
- package/src/state-store.js +103 -0
- package/src/system-ids.js +71 -0
- package/src/template.js +32 -0
- package/test/completion.test.js +64 -0
- package/test/config-validation.test.js +268 -0
- package/test/doctor.test.js +205 -3
- package/test/event-log.test.js +46 -0
- package/test/fixtures/memory-hog.js +19 -0
- package/test/managed-process.test.js +290 -0
- package/test/predeploy-cleanup.test.js +131 -0
- package/test/process-memory.test.js +40 -0
- package/test/recover.test.js +162 -0
- package/test/release-group.test.js +22 -0
- package/test/rollbridge.test.js +523 -6
- package/test/state-store.test.js +69 -0
- package/test/system-ids.test.js +24 -0
package/src/managed-process.js
CHANGED
|
@@ -2,14 +2,16 @@
|
|
|
2
2
|
|
|
3
3
|
import {EventEmitter} from "node:events"
|
|
4
4
|
import {spawn} from "node:child_process"
|
|
5
|
+
import {processGroupMembers} from "./process-memory.js"
|
|
5
6
|
|
|
6
7
|
/**
|
|
7
8
|
* @typedef {import("./json.js").JsonValue} JsonValue
|
|
8
9
|
* @typedef {"starting" | "running" | "stopping" | "stopped" | "failed"} ManagedProcessState
|
|
10
|
+
* @typedef {"deploy" | "crash" | "manual" | "memory"} ManagedProcessStartReason
|
|
9
11
|
* @typedef {import("node:child_process").ChildProcess["signalCode"]} ProcessExitSignal
|
|
10
12
|
* @typedef {{at: string, line: string, stream: "stdout" | "stderr"}} ManagedProcessLog
|
|
11
|
-
* @typedef {{command: string, cwd: string | undefined, env: Record<string, string | undefined>, logger: (message: string, data?: Record<string, import("./json.js").JsonValue>) => void, outputLines: number, restart: import("./config.js").RestartConfig, restartDelayMs: number, shouldRestart: () => boolean, stopTimeoutMs: number}} ManagedProcessDefinition
|
|
12
|
-
* @typedef {{command: string, cwd: string | undefined, exitCode: number | null | undefined, exitSignal: ProcessExitSignal | undefined, id: string, logs: ManagedProcessLog[], pid: number | undefined, restarts: number, startedAt: string | undefined, state: ManagedProcessState, uptimeMs: number | undefined}} ManagedProcessStatus
|
|
13
|
+
* @typedef {{command: string, cwd: string | undefined, env: Record<string, string | undefined>, lifecycle: import("./config.js").LifecycleConfig, logger: (message: string, data?: Record<string, import("./json.js").JsonValue>) => void, memory: import("./config.js").MemoryConfig | undefined, outputLines: number, restart: import("./config.js").RestartConfig, restartDelayMs: number, shouldRestart: () => boolean, stopSignal: string, stopTimeoutMs: number}} ManagedProcessDefinition
|
|
14
|
+
* @typedef {{children: import("./process-memory.js").ProcessGroupMember[], command: string, cwd: string | undefined, exitCode: number | null | undefined, exitSignal: ProcessExitSignal | undefined, id: string, lastMemoryRestartAt: string | undefined, lastStartReason: ManagedProcessStartReason | undefined, logs: ManagedProcessLog[], memoryRestarts: number, pid: number | undefined, restarts: number, rssBytes: number | undefined, startedAt: string | undefined, state: ManagedProcessState, uptimeMs: number | undefined}} ManagedProcessStatus
|
|
13
15
|
*/
|
|
14
16
|
|
|
15
17
|
export default class ManagedProcess extends EventEmitter {
|
|
@@ -20,29 +22,43 @@ export default class ManagedProcess extends EventEmitter {
|
|
|
20
22
|
* @param {Record<string, string | undefined>} args.env - Environment.
|
|
21
23
|
* @param {string} args.id - Process id.
|
|
22
24
|
* @param {(message: string, data?: Record<string, JsonValue>) => void} args.logger - Logger callback.
|
|
25
|
+
* @param {import("./config.js").LifecycleConfig} [args.lifecycle] - Graceful-stop lifecycle hooks (none by default).
|
|
26
|
+
* @param {import("./config.js").MemoryConfig} [args.memory] - Memory supervision config (off when omitted).
|
|
23
27
|
* @param {number} args.outputLines - Recent stdout/stderr lines to retain and report.
|
|
24
28
|
* @param {import("./config.js").RestartConfig} [args.restart] - Restart policy (defaults to unlimited restarts with a constant delay).
|
|
25
29
|
* @param {number} args.restartDelayMs - Restart delay.
|
|
26
30
|
* @param {() => boolean} args.shouldRestart - Restart policy callback.
|
|
31
|
+
* @param {string} [args.stopSignal] - Signal sent to gracefully stop the process (default "SIGTERM").
|
|
27
32
|
* @param {number} args.stopTimeoutMs - Stop timeout.
|
|
28
33
|
*/
|
|
29
|
-
constructor({command, cwd, env, id, logger, outputLines, restart = {backoffFactor: 1, maxDelayMs: 0, maxRestarts: undefined, windowMs: 0}, restartDelayMs, shouldRestart, stopTimeoutMs}) {
|
|
34
|
+
constructor({command, cwd, env, id, lifecycle = {drainTimeoutMs: 0}, logger, memory, outputLines, restart = {backoffFactor: 1, maxDelayMs: 0, maxRestarts: undefined, windowMs: 0}, restartDelayMs, shouldRestart, stopSignal = "SIGTERM", stopTimeoutMs}) {
|
|
30
35
|
super()
|
|
31
36
|
|
|
32
37
|
this.command = command
|
|
33
38
|
this.cwd = cwd
|
|
34
39
|
this.env = env
|
|
35
40
|
this.id = id
|
|
41
|
+
this.lifecycle = lifecycle
|
|
36
42
|
this.logger = logger
|
|
43
|
+
this.memory = memory
|
|
37
44
|
this.outputLines = outputLines
|
|
38
45
|
this.restart = restart
|
|
39
46
|
this.restartDelayMs = restartDelayMs
|
|
40
47
|
this.shouldRestart = shouldRestart
|
|
48
|
+
this.stopSignal = stopSignal
|
|
41
49
|
this.stopTimeoutMs = stopTimeoutMs
|
|
42
50
|
this.state = /** @type {ManagedProcessState} */ ("stopped")
|
|
51
|
+
this.lastStartReason = /** @type {ManagedProcessStartReason | undefined} */ (undefined)
|
|
43
52
|
this.logs = /** @type {ManagedProcessLog[]} */ ([])
|
|
44
53
|
this.restarts = 0
|
|
45
54
|
this.recentRestarts = /** @type {number[]} */ ([])
|
|
55
|
+
this.rssBytes = /** @type {number | undefined} */ (undefined)
|
|
56
|
+
this.children = /** @type {import("./process-memory.js").ProcessGroupMember[]} */ ([])
|
|
57
|
+
this.memoryRestarts = 0
|
|
58
|
+
this.lastMemoryRestartAtMs = /** @type {number | undefined} */ (undefined)
|
|
59
|
+
this.memoryTimer = /** @type {ReturnType<typeof setInterval> | undefined} */ (undefined)
|
|
60
|
+
this.memoryRestarting = false
|
|
61
|
+
this.memoryWarned = false
|
|
46
62
|
this.startedAtMs = /** @type {number | undefined} */ (undefined)
|
|
47
63
|
this.intentionalStop = false
|
|
48
64
|
this.restartTimer = undefined
|
|
@@ -53,8 +69,11 @@ export default class ManagedProcess extends EventEmitter {
|
|
|
53
69
|
this.exitSignal = undefined
|
|
54
70
|
}
|
|
55
71
|
|
|
56
|
-
/**
|
|
57
|
-
|
|
72
|
+
/**
|
|
73
|
+
* @param {ManagedProcessStartReason} [reason] - Why the process is being started (deploy by default; "crash" on auto-restart, "manual" via the restart command).
|
|
74
|
+
* @returns {Promise<void>} Resolves after spawn.
|
|
75
|
+
*/
|
|
76
|
+
async start(reason = "deploy") {
|
|
58
77
|
if (this.child) return
|
|
59
78
|
|
|
60
79
|
this.intentionalStop = false
|
|
@@ -83,7 +102,9 @@ export default class ManagedProcess extends EventEmitter {
|
|
|
83
102
|
child.once("spawn", () => {
|
|
84
103
|
this.state = "running"
|
|
85
104
|
this.startedAtMs = Date.now()
|
|
86
|
-
this.
|
|
105
|
+
this.lastStartReason = reason
|
|
106
|
+
this.logger("process started", {command: this.command, id: this.id, pid: child.pid || null, reason})
|
|
107
|
+
this.startMemoryMonitor()
|
|
87
108
|
this.emit("started")
|
|
88
109
|
resolve(undefined)
|
|
89
110
|
})
|
|
@@ -107,11 +128,14 @@ export default class ManagedProcess extends EventEmitter {
|
|
|
107
128
|
this.command = definition.command
|
|
108
129
|
this.cwd = definition.cwd
|
|
109
130
|
this.env = definition.env
|
|
131
|
+
this.lifecycle = definition.lifecycle
|
|
110
132
|
this.logger = definition.logger
|
|
133
|
+
this.memory = definition.memory
|
|
111
134
|
this.outputLines = definition.outputLines
|
|
112
135
|
this.restart = definition.restart
|
|
113
136
|
this.restartDelayMs = definition.restartDelayMs
|
|
114
137
|
this.shouldRestart = definition.shouldRestart
|
|
138
|
+
this.stopSignal = definition.stopSignal
|
|
115
139
|
this.stopTimeoutMs = definition.stopTimeoutMs
|
|
116
140
|
}
|
|
117
141
|
|
|
@@ -145,6 +169,9 @@ export default class ManagedProcess extends EventEmitter {
|
|
|
145
169
|
this.child = undefined
|
|
146
170
|
this.pid = undefined
|
|
147
171
|
this.exitPromise = undefined
|
|
172
|
+
this.rssBytes = undefined
|
|
173
|
+
this.children = []
|
|
174
|
+
this.clearMemoryMonitor()
|
|
148
175
|
this.state = wasIntentional ? "stopped" : "failed"
|
|
149
176
|
this.logger("process exited", {code, id: this.id, signal})
|
|
150
177
|
this.emit("exit", {code, signal})
|
|
@@ -206,10 +233,103 @@ export default class ManagedProcess extends EventEmitter {
|
|
|
206
233
|
this.restartTimer = setTimeout(() => {
|
|
207
234
|
this.restartTimer = undefined
|
|
208
235
|
this.restarts += 1
|
|
209
|
-
this.start().catch((error) => {
|
|
236
|
+
this.start("crash").catch((error) => {
|
|
210
237
|
this.logger("process restart failed", {error: error instanceof Error ? error.message : String(error), id: this.id})
|
|
211
238
|
})
|
|
212
239
|
}, delayMs)
|
|
240
|
+
|
|
241
|
+
// The daemon's listening servers govern its lifetime; a pending restart must never be the sole
|
|
242
|
+
// handle keeping the process alive (like the memory and persist timers above). Otherwise a
|
|
243
|
+
// crashed process with an unlimited restart policy would respawn forever and block exit.
|
|
244
|
+
this.restartTimer.unref?.()
|
|
245
|
+
}
|
|
246
|
+
|
|
247
|
+
/**
|
|
248
|
+
* Starts the periodic RSS check for this process when memory supervision is configured.
|
|
249
|
+
* @returns {void}
|
|
250
|
+
*/
|
|
251
|
+
startMemoryMonitor() {
|
|
252
|
+
this.clearMemoryMonitor()
|
|
253
|
+
|
|
254
|
+
if (!this.memory) return
|
|
255
|
+
|
|
256
|
+
this.memoryTimer = setInterval(() => this.checkMemory(), this.memory.checkIntervalMs)
|
|
257
|
+
this.memoryTimer.unref?.()
|
|
258
|
+
}
|
|
259
|
+
|
|
260
|
+
/** @returns {void} Stops the periodic RSS check. */
|
|
261
|
+
clearMemoryMonitor() {
|
|
262
|
+
if (this.memoryTimer) {
|
|
263
|
+
clearInterval(this.memoryTimer)
|
|
264
|
+
this.memoryTimer = undefined
|
|
265
|
+
}
|
|
266
|
+
}
|
|
267
|
+
|
|
268
|
+
/**
|
|
269
|
+
* Measures the process group's RSS and warns or restarts when it crosses the configured thresholds.
|
|
270
|
+
* @returns {void}
|
|
271
|
+
*/
|
|
272
|
+
checkMemory() {
|
|
273
|
+
if (!this.memory || !this.pid || this.memoryRestarting) return
|
|
274
|
+
|
|
275
|
+
const members = processGroupMembers(this.pid)
|
|
276
|
+
|
|
277
|
+
if (members.length === 0) return
|
|
278
|
+
|
|
279
|
+
this.children = members
|
|
280
|
+
|
|
281
|
+
const measured = members.filter((member) => member.rssBytes !== undefined)
|
|
282
|
+
|
|
283
|
+
if (measured.length === 0) return
|
|
284
|
+
|
|
285
|
+
const rssBytes = measured.reduce((total, member) => total + (member.rssBytes ?? 0), 0)
|
|
286
|
+
|
|
287
|
+
this.rssBytes = rssBytes
|
|
288
|
+
|
|
289
|
+
if (rssBytes > this.memory.limitBytes) {
|
|
290
|
+
this.logger("memory limit exceeded", {id: this.id, limitBytes: this.memory.limitBytes, rssBytes})
|
|
291
|
+
void this.restartForMemory()
|
|
292
|
+
|
|
293
|
+
return
|
|
294
|
+
}
|
|
295
|
+
|
|
296
|
+
if (this.memory.warnBytes > 0 && rssBytes > this.memory.warnBytes) {
|
|
297
|
+
if (!this.memoryWarned) {
|
|
298
|
+
this.logger("memory warning", {id: this.id, rssBytes, warnBytes: this.memory.warnBytes})
|
|
299
|
+
this.memoryWarned = true
|
|
300
|
+
}
|
|
301
|
+
} else {
|
|
302
|
+
this.memoryWarned = false
|
|
303
|
+
}
|
|
304
|
+
}
|
|
305
|
+
|
|
306
|
+
/**
|
|
307
|
+
* Gracefully restarts the process after it exceeded its memory limit (SIGTERM, then
|
|
308
|
+
* SIGKILL after the stop timeout), recording the restart so status can report it.
|
|
309
|
+
* @returns {Promise<void>} Resolves once the process has been restarted.
|
|
310
|
+
*/
|
|
311
|
+
async restartForMemory() {
|
|
312
|
+
if (this.memoryRestarting) return
|
|
313
|
+
|
|
314
|
+
this.memoryRestarting = true
|
|
315
|
+
|
|
316
|
+
try {
|
|
317
|
+
await this.stop()
|
|
318
|
+
|
|
319
|
+
// Don't respawn if the supervising context no longer wants this process running
|
|
320
|
+
// (daemon shutting down, or the release draining/retired) — otherwise a restart racing
|
|
321
|
+
// a shutdown could leave a child running after shutdown collected its stop promises.
|
|
322
|
+
if (!this.shouldRestart()) return
|
|
323
|
+
|
|
324
|
+
this.memoryRestarts += 1
|
|
325
|
+
this.lastMemoryRestartAtMs = Date.now()
|
|
326
|
+
this.memoryWarned = false
|
|
327
|
+
await this.start("memory")
|
|
328
|
+
} catch (error) {
|
|
329
|
+
this.logger("memory restart failed", {error: error instanceof Error ? error.message : String(error), id: this.id})
|
|
330
|
+
} finally {
|
|
331
|
+
this.memoryRestarting = false
|
|
332
|
+
}
|
|
213
333
|
}
|
|
214
334
|
|
|
215
335
|
/**
|
|
@@ -218,6 +338,7 @@ export default class ManagedProcess extends EventEmitter {
|
|
|
218
338
|
*/
|
|
219
339
|
async stop(options = {}) {
|
|
220
340
|
this.intentionalStop = true
|
|
341
|
+
this.clearMemoryMonitor()
|
|
221
342
|
|
|
222
343
|
if (this.restartTimer) {
|
|
223
344
|
clearTimeout(this.restartTimer)
|
|
@@ -232,21 +353,104 @@ export default class ManagedProcess extends EventEmitter {
|
|
|
232
353
|
}
|
|
233
354
|
|
|
234
355
|
this.state = "stopping"
|
|
235
|
-
|
|
236
|
-
const
|
|
237
|
-
|
|
238
|
-
|
|
239
|
-
if (
|
|
240
|
-
|
|
241
|
-
|
|
242
|
-
|
|
356
|
+
|
|
357
|
+
const {drainCommand, drainTimeoutMs, quietCommand, stopCommand} = this.lifecycle
|
|
358
|
+
|
|
359
|
+
// 1. Quiesce: tell the process to stop accepting new work.
|
|
360
|
+
if (quietCommand) await this.runHook(quietCommand, this.stopTimeoutMs, "quiet command")
|
|
361
|
+
|
|
362
|
+
// 2. Drain: let in-flight work finish, bounded by drainTimeoutMs (0 skips the step). A
|
|
363
|
+
// drainCommand blocks until drained; otherwise wait for the process to exit on its own.
|
|
364
|
+
if (this.child && drainTimeoutMs > 0) {
|
|
365
|
+
if (drainCommand) await this.runHook(drainCommand, drainTimeoutMs, "drain command")
|
|
366
|
+
else await this.waitForExit(drainTimeoutMs)
|
|
367
|
+
}
|
|
368
|
+
|
|
369
|
+
// 3. Stop whatever is still running, then SIGKILL if it outlasts the graceful window.
|
|
370
|
+
if (this.child) {
|
|
371
|
+
if (stopCommand) await this.runHook(stopCommand, this.stopTimeoutMs, "stop command")
|
|
372
|
+
else this.killProcessGroup(this.stopSignal)
|
|
373
|
+
|
|
374
|
+
const timeoutMs = options.timeoutMs ?? this.stopTimeoutMs
|
|
375
|
+
|
|
376
|
+
if (this.child && !(await this.waitForExit(timeoutMs))) {
|
|
377
|
+
this.logger("process stop timed out; sending SIGKILL", {id: this.id, pid: this.pid})
|
|
378
|
+
this.killProcessGroup("SIGKILL")
|
|
379
|
+
await this.waitForExit(5000)
|
|
380
|
+
}
|
|
243
381
|
}
|
|
244
382
|
|
|
245
383
|
this.state = "stopped"
|
|
246
384
|
}
|
|
247
385
|
|
|
248
386
|
/**
|
|
249
|
-
*
|
|
387
|
+
* Runs a lifecycle hook command, bounded by a timeout so a hung hook can never block stop().
|
|
388
|
+
* Failures are logged and swallowed — the graceful-stop sequence proceeds (and SIGKILL is the
|
|
389
|
+
* ultimate fallback) regardless of the hook's outcome.
|
|
390
|
+
* @param {string} command - Shell command to run.
|
|
391
|
+
* @param {number} timeoutMs - Maximum time to wait for the hook before killing it.
|
|
392
|
+
* @param {string} label - Hook name, for log messages.
|
|
393
|
+
* @returns {Promise<void>} Resolves when the hook exits, errors, or times out.
|
|
394
|
+
*/
|
|
395
|
+
async runHook(command, timeoutMs, label) {
|
|
396
|
+
await new Promise((resolve) => {
|
|
397
|
+
let settled = false
|
|
398
|
+
const finish = () => { if (!settled) { settled = true; resolve(undefined) } }
|
|
399
|
+
|
|
400
|
+
/** @type {import("node:child_process").ChildProcess} */
|
|
401
|
+
let hook
|
|
402
|
+
|
|
403
|
+
try {
|
|
404
|
+
hook = spawn(command, {
|
|
405
|
+
cwd: this.cwd,
|
|
406
|
+
detached: true,
|
|
407
|
+
env: {...process.env, ...this.env, ROLLBRIDGE_PID: this.pid ? String(this.pid) : ""},
|
|
408
|
+
shell: true,
|
|
409
|
+
stdio: "ignore"
|
|
410
|
+
})
|
|
411
|
+
} catch (error) {
|
|
412
|
+
this.logger(`${label} failed`, {error: error instanceof Error ? error.message : String(error), id: this.id})
|
|
413
|
+
finish()
|
|
414
|
+
|
|
415
|
+
return
|
|
416
|
+
}
|
|
417
|
+
|
|
418
|
+
const timer = setTimeout(() => {
|
|
419
|
+
this.logger(`${label} timed out`, {id: this.id, timeoutMs})
|
|
420
|
+
|
|
421
|
+
if (hook.pid) {
|
|
422
|
+
try {
|
|
423
|
+
process.kill(-hook.pid, "SIGKILL")
|
|
424
|
+
} catch {
|
|
425
|
+
// The hook already exited.
|
|
426
|
+
}
|
|
427
|
+
}
|
|
428
|
+
|
|
429
|
+
finish()
|
|
430
|
+
}, timeoutMs)
|
|
431
|
+
|
|
432
|
+
hook.once("exit", (code, signal) => {
|
|
433
|
+
clearTimeout(timer)
|
|
434
|
+
|
|
435
|
+
// A non-zero/signalled exit is surfaced (but still non-fatal); skip when the timeout
|
|
436
|
+
// already killed the hook, which logs separately.
|
|
437
|
+
if (!settled) {
|
|
438
|
+
if (typeof code === "number" && code !== 0) this.logger(`${label} exited non-zero`, {code, id: this.id})
|
|
439
|
+
else if (signal) this.logger(`${label} exited on signal`, {id: this.id, signal})
|
|
440
|
+
}
|
|
441
|
+
|
|
442
|
+
finish()
|
|
443
|
+
})
|
|
444
|
+
hook.once("error", (error) => {
|
|
445
|
+
clearTimeout(timer)
|
|
446
|
+
this.logger(`${label} failed`, {error: error instanceof Error ? error.message : String(error), id: this.id})
|
|
447
|
+
finish()
|
|
448
|
+
})
|
|
449
|
+
})
|
|
450
|
+
}
|
|
451
|
+
|
|
452
|
+
/**
|
|
453
|
+
* @param {string} signal - Signal name to send (the configured stop signal, or "SIGKILL").
|
|
250
454
|
* @returns {void}
|
|
251
455
|
*/
|
|
252
456
|
killProcessGroup(signal) {
|
|
@@ -282,14 +486,19 @@ export default class ManagedProcess extends EventEmitter {
|
|
|
282
486
|
/** @returns {ManagedProcessStatus} Status payload. */
|
|
283
487
|
status() {
|
|
284
488
|
return {
|
|
489
|
+
children: this.children,
|
|
285
490
|
command: this.command,
|
|
286
491
|
cwd: this.cwd,
|
|
287
492
|
exitCode: this.exitCode,
|
|
288
493
|
exitSignal: this.exitSignal,
|
|
289
494
|
id: this.id,
|
|
495
|
+
lastMemoryRestartAt: this.lastMemoryRestartAtMs === undefined ? undefined : new Date(this.lastMemoryRestartAtMs).toISOString(),
|
|
496
|
+
lastStartReason: this.lastStartReason,
|
|
290
497
|
logs: this.logs.slice(-this.outputLines),
|
|
498
|
+
memoryRestarts: this.memoryRestarts,
|
|
291
499
|
pid: this.pid,
|
|
292
500
|
restarts: this.restarts,
|
|
501
|
+
rssBytes: this.rssBytes,
|
|
293
502
|
startedAt: this.startedAtMs === undefined ? undefined : new Date(this.startedAtMs).toISOString(),
|
|
294
503
|
state: this.state,
|
|
295
504
|
uptimeMs: this.state === "running" && this.startedAtMs !== undefined ? Date.now() - this.startedAtMs : undefined
|