rollbridge 0.1.4 → 0.1.6
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/README.md +137 -4
- package/TODO.md +47 -45
- package/docs/cli.md +169 -6
- package/docs/config.md +160 -3
- package/docs/logging.md +77 -0
- package/docs/nginx.md +104 -0
- package/docs/releasing.md +53 -0
- package/docs/tensorbuzz-runbook.md +129 -0
- package/docs/velocious.md +238 -0
- package/docs/workers.md +115 -0
- package/package.json +3 -2
- package/src/cli.js +317 -1
- package/src/config.js +240 -6
- package/src/daemon.js +284 -4
- package/src/doctor.js +177 -0
- package/src/event-log.js +47 -0
- package/src/managed-process.js +287 -22
- package/src/process-memory.js +110 -0
- package/src/recover.js +134 -0
- package/src/release-group.js +80 -21
- package/src/state-store.js +103 -0
- package/src/system-ids.js +71 -0
- package/src/template.js +32 -0
- package/test/completion.test.js +64 -0
- package/test/config-validation.test.js +267 -0
- package/test/doctor.test.js +205 -3
- package/test/event-log.test.js +46 -0
- package/test/fixtures/memory-hog.js +19 -0
- package/test/managed-process.test.js +376 -0
- package/test/process-memory.test.js +40 -0
- package/test/recover.test.js +162 -0
- package/test/release-group.test.js +22 -0
- package/test/rollbridge.test.js +716 -6
- package/test/state-store.test.js +69 -0
- package/test/system-ids.test.js +24 -0
- package/scripts/release-patch.js +0 -83
package/src/managed-process.js
CHANGED
|
@@ -2,14 +2,16 @@
|
|
|
2
2
|
|
|
3
3
|
import {EventEmitter} from "node:events"
|
|
4
4
|
import {spawn} from "node:child_process"
|
|
5
|
+
import {processGroupMembers} from "./process-memory.js"
|
|
5
6
|
|
|
6
7
|
/**
|
|
7
8
|
* @typedef {import("./json.js").JsonValue} JsonValue
|
|
8
9
|
* @typedef {"starting" | "running" | "stopping" | "stopped" | "failed"} ManagedProcessState
|
|
10
|
+
* @typedef {"deploy" | "crash" | "manual" | "memory"} ManagedProcessStartReason
|
|
9
11
|
* @typedef {import("node:child_process").ChildProcess["signalCode"]} ProcessExitSignal
|
|
10
12
|
* @typedef {{at: string, line: string, stream: "stdout" | "stderr"}} ManagedProcessLog
|
|
11
|
-
* @typedef {{command: string, cwd: string | undefined, env: Record<string, string | undefined>, logger: (message: string, data?: Record<string, import("./json.js").JsonValue>) => void, outputLines: number, restartDelayMs: number, shouldRestart: () => boolean, stopTimeoutMs: number}} ManagedProcessDefinition
|
|
12
|
-
* @typedef {{command: string, cwd: string | undefined, exitCode: number | null | undefined, exitSignal: ProcessExitSignal | undefined, id: string, logs: ManagedProcessLog[], pid: number | undefined, restarts: number, startedAt: string | undefined, state: ManagedProcessState, uptimeMs: number | undefined}} ManagedProcessStatus
|
|
13
|
+
* @typedef {{command: string, cwd: string | undefined, env: Record<string, string | undefined>, lifecycle: import("./config.js").LifecycleConfig, logger: (message: string, data?: Record<string, import("./json.js").JsonValue>) => void, memory: import("./config.js").MemoryConfig | undefined, outputLines: number, restart: import("./config.js").RestartConfig, restartDelayMs: number, shouldRestart: () => boolean, stopSignal: string, stopTimeoutMs: number}} ManagedProcessDefinition
|
|
14
|
+
* @typedef {{children: import("./process-memory.js").ProcessGroupMember[], command: string, cwd: string | undefined, exitCode: number | null | undefined, exitSignal: ProcessExitSignal | undefined, id: string, lastMemoryRestartAt: string | undefined, lastStartReason: ManagedProcessStartReason | undefined, logs: ManagedProcessLog[], memoryRestarts: number, pid: number | undefined, restarts: number, rssBytes: number | undefined, startedAt: string | undefined, state: ManagedProcessState, uptimeMs: number | undefined}} ManagedProcessStatus
|
|
13
15
|
*/
|
|
14
16
|
|
|
15
17
|
export default class ManagedProcess extends EventEmitter {
|
|
@@ -20,26 +22,43 @@ export default class ManagedProcess extends EventEmitter {
|
|
|
20
22
|
* @param {Record<string, string | undefined>} args.env - Environment.
|
|
21
23
|
* @param {string} args.id - Process id.
|
|
22
24
|
* @param {(message: string, data?: Record<string, JsonValue>) => void} args.logger - Logger callback.
|
|
25
|
+
* @param {import("./config.js").LifecycleConfig} [args.lifecycle] - Graceful-stop lifecycle hooks (none by default).
|
|
26
|
+
* @param {import("./config.js").MemoryConfig} [args.memory] - Memory supervision config (off when omitted).
|
|
23
27
|
* @param {number} args.outputLines - Recent stdout/stderr lines to retain and report.
|
|
28
|
+
* @param {import("./config.js").RestartConfig} [args.restart] - Restart policy (defaults to unlimited restarts with a constant delay).
|
|
24
29
|
* @param {number} args.restartDelayMs - Restart delay.
|
|
25
30
|
* @param {() => boolean} args.shouldRestart - Restart policy callback.
|
|
31
|
+
* @param {string} [args.stopSignal] - Signal sent to gracefully stop the process (default "SIGTERM").
|
|
26
32
|
* @param {number} args.stopTimeoutMs - Stop timeout.
|
|
27
33
|
*/
|
|
28
|
-
constructor({command, cwd, env, id, logger, outputLines, restartDelayMs, shouldRestart, stopTimeoutMs}) {
|
|
34
|
+
constructor({command, cwd, env, id, lifecycle = {drainTimeoutMs: 0}, logger, memory, outputLines, restart = {backoffFactor: 1, maxDelayMs: 0, maxRestarts: undefined, windowMs: 0}, restartDelayMs, shouldRestart, stopSignal = "SIGTERM", stopTimeoutMs}) {
|
|
29
35
|
super()
|
|
30
36
|
|
|
31
37
|
this.command = command
|
|
32
38
|
this.cwd = cwd
|
|
33
39
|
this.env = env
|
|
34
40
|
this.id = id
|
|
41
|
+
this.lifecycle = lifecycle
|
|
35
42
|
this.logger = logger
|
|
43
|
+
this.memory = memory
|
|
36
44
|
this.outputLines = outputLines
|
|
45
|
+
this.restart = restart
|
|
37
46
|
this.restartDelayMs = restartDelayMs
|
|
38
47
|
this.shouldRestart = shouldRestart
|
|
48
|
+
this.stopSignal = stopSignal
|
|
39
49
|
this.stopTimeoutMs = stopTimeoutMs
|
|
40
50
|
this.state = /** @type {ManagedProcessState} */ ("stopped")
|
|
51
|
+
this.lastStartReason = /** @type {ManagedProcessStartReason | undefined} */ (undefined)
|
|
41
52
|
this.logs = /** @type {ManagedProcessLog[]} */ ([])
|
|
42
53
|
this.restarts = 0
|
|
54
|
+
this.recentRestarts = /** @type {number[]} */ ([])
|
|
55
|
+
this.rssBytes = /** @type {number | undefined} */ (undefined)
|
|
56
|
+
this.children = /** @type {import("./process-memory.js").ProcessGroupMember[]} */ ([])
|
|
57
|
+
this.memoryRestarts = 0
|
|
58
|
+
this.lastMemoryRestartAtMs = /** @type {number | undefined} */ (undefined)
|
|
59
|
+
this.memoryTimer = /** @type {ReturnType<typeof setInterval> | undefined} */ (undefined)
|
|
60
|
+
this.memoryRestarting = false
|
|
61
|
+
this.memoryWarned = false
|
|
43
62
|
this.startedAtMs = /** @type {number | undefined} */ (undefined)
|
|
44
63
|
this.intentionalStop = false
|
|
45
64
|
this.restartTimer = undefined
|
|
@@ -50,8 +69,11 @@ export default class ManagedProcess extends EventEmitter {
|
|
|
50
69
|
this.exitSignal = undefined
|
|
51
70
|
}
|
|
52
71
|
|
|
53
|
-
/**
|
|
54
|
-
|
|
72
|
+
/**
|
|
73
|
+
* @param {ManagedProcessStartReason} [reason] - Why the process is being started (deploy by default; "crash" on auto-restart, "manual" via the restart command).
|
|
74
|
+
* @returns {Promise<void>} Resolves after spawn.
|
|
75
|
+
*/
|
|
76
|
+
async start(reason = "deploy") {
|
|
55
77
|
if (this.child) return
|
|
56
78
|
|
|
57
79
|
this.intentionalStop = false
|
|
@@ -80,7 +102,9 @@ export default class ManagedProcess extends EventEmitter {
|
|
|
80
102
|
child.once("spawn", () => {
|
|
81
103
|
this.state = "running"
|
|
82
104
|
this.startedAtMs = Date.now()
|
|
83
|
-
this.
|
|
105
|
+
this.lastStartReason = reason
|
|
106
|
+
this.logger("process started", {command: this.command, id: this.id, pid: child.pid || null, reason})
|
|
107
|
+
this.startMemoryMonitor()
|
|
84
108
|
this.emit("started")
|
|
85
109
|
resolve(undefined)
|
|
86
110
|
})
|
|
@@ -104,10 +128,14 @@ export default class ManagedProcess extends EventEmitter {
|
|
|
104
128
|
this.command = definition.command
|
|
105
129
|
this.cwd = definition.cwd
|
|
106
130
|
this.env = definition.env
|
|
131
|
+
this.lifecycle = definition.lifecycle
|
|
107
132
|
this.logger = definition.logger
|
|
133
|
+
this.memory = definition.memory
|
|
108
134
|
this.outputLines = definition.outputLines
|
|
135
|
+
this.restart = definition.restart
|
|
109
136
|
this.restartDelayMs = definition.restartDelayMs
|
|
110
137
|
this.shouldRestart = definition.shouldRestart
|
|
138
|
+
this.stopSignal = definition.stopSignal
|
|
111
139
|
this.stopTimeoutMs = definition.stopTimeoutMs
|
|
112
140
|
}
|
|
113
141
|
|
|
@@ -141,18 +169,166 @@ export default class ManagedProcess extends EventEmitter {
|
|
|
141
169
|
this.child = undefined
|
|
142
170
|
this.pid = undefined
|
|
143
171
|
this.exitPromise = undefined
|
|
172
|
+
this.rssBytes = undefined
|
|
173
|
+
this.children = []
|
|
174
|
+
this.clearMemoryMonitor()
|
|
144
175
|
this.state = wasIntentional ? "stopped" : "failed"
|
|
145
176
|
this.logger("process exited", {code, id: this.id, signal})
|
|
146
177
|
this.emit("exit", {code, signal})
|
|
147
178
|
|
|
148
179
|
if (!wasIntentional && this.shouldRestart()) {
|
|
149
|
-
this.
|
|
150
|
-
|
|
151
|
-
|
|
152
|
-
|
|
153
|
-
|
|
154
|
-
|
|
155
|
-
|
|
180
|
+
this.scheduleRestart()
|
|
181
|
+
}
|
|
182
|
+
}
|
|
183
|
+
|
|
184
|
+
/**
|
|
185
|
+
* Schedules an automatic restart per the restart policy, or gives up once the policy's limit is reached.
|
|
186
|
+
* @returns {void}
|
|
187
|
+
*/
|
|
188
|
+
scheduleRestart() {
|
|
189
|
+
const {backoffFactor, maxRestarts, windowMs} = this.restart
|
|
190
|
+
|
|
191
|
+
// Fast path: unlimited restarts with a constant delay needs no per-restart bookkeeping.
|
|
192
|
+
// The delay is constant across attempts here (backoffFactor is 1), so restartDelayFor(0)
|
|
193
|
+
// gives the right value while still applying any maxDelayMs cap.
|
|
194
|
+
if (maxRestarts === undefined && backoffFactor === 1) {
|
|
195
|
+
this.queueRestart(this.restartDelayFor(0))
|
|
196
|
+
|
|
197
|
+
return
|
|
198
|
+
}
|
|
199
|
+
|
|
200
|
+
const now = Date.now()
|
|
201
|
+
|
|
202
|
+
if (windowMs > 0) {
|
|
203
|
+
this.recentRestarts = this.recentRestarts.filter((time) => time > now - windowMs)
|
|
204
|
+
}
|
|
205
|
+
|
|
206
|
+
if (maxRestarts !== undefined && this.recentRestarts.length >= maxRestarts) {
|
|
207
|
+
this.logger("restart limit reached", {id: this.id, maxRestarts, windowMs})
|
|
208
|
+
|
|
209
|
+
return
|
|
210
|
+
}
|
|
211
|
+
|
|
212
|
+
const delay = this.restartDelayFor(this.recentRestarts.length)
|
|
213
|
+
|
|
214
|
+
this.recentRestarts.push(now)
|
|
215
|
+
this.queueRestart(delay)
|
|
216
|
+
}
|
|
217
|
+
|
|
218
|
+
/**
|
|
219
|
+
* @param {number} attempt - Number of restarts already counted in the current window.
|
|
220
|
+
* @returns {number} Backed-off restart delay in milliseconds, capped by maxDelayMs when set.
|
|
221
|
+
*/
|
|
222
|
+
restartDelayFor(attempt) {
|
|
223
|
+
const backedOff = this.restartDelayMs * this.restart.backoffFactor ** attempt
|
|
224
|
+
|
|
225
|
+
return this.restart.maxDelayMs > 0 ? Math.min(backedOff, this.restart.maxDelayMs) : backedOff
|
|
226
|
+
}
|
|
227
|
+
|
|
228
|
+
/**
|
|
229
|
+
* @param {number} delayMs - Delay before the restart attempt.
|
|
230
|
+
* @returns {void}
|
|
231
|
+
*/
|
|
232
|
+
queueRestart(delayMs) {
|
|
233
|
+
this.restartTimer = setTimeout(() => {
|
|
234
|
+
this.restartTimer = undefined
|
|
235
|
+
this.restarts += 1
|
|
236
|
+
this.start("crash").catch((error) => {
|
|
237
|
+
this.logger("process restart failed", {error: error instanceof Error ? error.message : String(error), id: this.id})
|
|
238
|
+
})
|
|
239
|
+
}, delayMs)
|
|
240
|
+
|
|
241
|
+
// The daemon's listening servers govern its lifetime; a pending restart must never be the sole
|
|
242
|
+
// handle keeping the process alive (like the memory and persist timers above). Otherwise a
|
|
243
|
+
// crashed process with an unlimited restart policy would respawn forever and block exit.
|
|
244
|
+
this.restartTimer.unref?.()
|
|
245
|
+
}
|
|
246
|
+
|
|
247
|
+
/**
|
|
248
|
+
* Starts the periodic RSS check for this process when memory supervision is configured.
|
|
249
|
+
* @returns {void}
|
|
250
|
+
*/
|
|
251
|
+
startMemoryMonitor() {
|
|
252
|
+
this.clearMemoryMonitor()
|
|
253
|
+
|
|
254
|
+
if (!this.memory) return
|
|
255
|
+
|
|
256
|
+
this.memoryTimer = setInterval(() => this.checkMemory(), this.memory.checkIntervalMs)
|
|
257
|
+
this.memoryTimer.unref?.()
|
|
258
|
+
}
|
|
259
|
+
|
|
260
|
+
/** @returns {void} Stops the periodic RSS check. */
|
|
261
|
+
clearMemoryMonitor() {
|
|
262
|
+
if (this.memoryTimer) {
|
|
263
|
+
clearInterval(this.memoryTimer)
|
|
264
|
+
this.memoryTimer = undefined
|
|
265
|
+
}
|
|
266
|
+
}
|
|
267
|
+
|
|
268
|
+
/**
|
|
269
|
+
* Measures the process group's RSS and warns or restarts when it crosses the configured thresholds.
|
|
270
|
+
* @returns {void}
|
|
271
|
+
*/
|
|
272
|
+
checkMemory() {
|
|
273
|
+
if (!this.memory || !this.pid || this.memoryRestarting) return
|
|
274
|
+
|
|
275
|
+
const members = processGroupMembers(this.pid)
|
|
276
|
+
|
|
277
|
+
if (members.length === 0) return
|
|
278
|
+
|
|
279
|
+
this.children = members
|
|
280
|
+
|
|
281
|
+
const measured = members.filter((member) => member.rssBytes !== undefined)
|
|
282
|
+
|
|
283
|
+
if (measured.length === 0) return
|
|
284
|
+
|
|
285
|
+
const rssBytes = measured.reduce((total, member) => total + (member.rssBytes ?? 0), 0)
|
|
286
|
+
|
|
287
|
+
this.rssBytes = rssBytes
|
|
288
|
+
|
|
289
|
+
if (rssBytes > this.memory.limitBytes) {
|
|
290
|
+
this.logger("memory limit exceeded", {id: this.id, limitBytes: this.memory.limitBytes, rssBytes})
|
|
291
|
+
void this.restartForMemory()
|
|
292
|
+
|
|
293
|
+
return
|
|
294
|
+
}
|
|
295
|
+
|
|
296
|
+
if (this.memory.warnBytes > 0 && rssBytes > this.memory.warnBytes) {
|
|
297
|
+
if (!this.memoryWarned) {
|
|
298
|
+
this.logger("memory warning", {id: this.id, rssBytes, warnBytes: this.memory.warnBytes})
|
|
299
|
+
this.memoryWarned = true
|
|
300
|
+
}
|
|
301
|
+
} else {
|
|
302
|
+
this.memoryWarned = false
|
|
303
|
+
}
|
|
304
|
+
}
|
|
305
|
+
|
|
306
|
+
/**
|
|
307
|
+
* Gracefully restarts the process after it exceeded its memory limit (SIGTERM, then
|
|
308
|
+
* SIGKILL after the stop timeout), recording the restart so status can report it.
|
|
309
|
+
* @returns {Promise<void>} Resolves once the process has been restarted.
|
|
310
|
+
*/
|
|
311
|
+
async restartForMemory() {
|
|
312
|
+
if (this.memoryRestarting) return
|
|
313
|
+
|
|
314
|
+
this.memoryRestarting = true
|
|
315
|
+
|
|
316
|
+
try {
|
|
317
|
+
await this.stop()
|
|
318
|
+
|
|
319
|
+
// Don't respawn if the supervising context no longer wants this process running
|
|
320
|
+
// (daemon shutting down, or the release draining/retired) — otherwise a restart racing
|
|
321
|
+
// a shutdown could leave a child running after shutdown collected its stop promises.
|
|
322
|
+
if (!this.shouldRestart()) return
|
|
323
|
+
|
|
324
|
+
this.memoryRestarts += 1
|
|
325
|
+
this.lastMemoryRestartAtMs = Date.now()
|
|
326
|
+
this.memoryWarned = false
|
|
327
|
+
await this.start("memory")
|
|
328
|
+
} catch (error) {
|
|
329
|
+
this.logger("memory restart failed", {error: error instanceof Error ? error.message : String(error), id: this.id})
|
|
330
|
+
} finally {
|
|
331
|
+
this.memoryRestarting = false
|
|
156
332
|
}
|
|
157
333
|
}
|
|
158
334
|
|
|
@@ -162,6 +338,7 @@ export default class ManagedProcess extends EventEmitter {
|
|
|
162
338
|
*/
|
|
163
339
|
async stop(options = {}) {
|
|
164
340
|
this.intentionalStop = true
|
|
341
|
+
this.clearMemoryMonitor()
|
|
165
342
|
|
|
166
343
|
if (this.restartTimer) {
|
|
167
344
|
clearTimeout(this.restartTimer)
|
|
@@ -176,21 +353,104 @@ export default class ManagedProcess extends EventEmitter {
|
|
|
176
353
|
}
|
|
177
354
|
|
|
178
355
|
this.state = "stopping"
|
|
179
|
-
|
|
180
|
-
const
|
|
181
|
-
|
|
182
|
-
|
|
183
|
-
if (
|
|
184
|
-
|
|
185
|
-
|
|
186
|
-
|
|
356
|
+
|
|
357
|
+
const {drainCommand, drainTimeoutMs, quietCommand, stopCommand} = this.lifecycle
|
|
358
|
+
|
|
359
|
+
// 1. Quiesce: tell the process to stop accepting new work.
|
|
360
|
+
if (quietCommand) await this.runHook(quietCommand, this.stopTimeoutMs, "quiet command")
|
|
361
|
+
|
|
362
|
+
// 2. Drain: let in-flight work finish, bounded by drainTimeoutMs (0 skips the step). A
|
|
363
|
+
// drainCommand blocks until drained; otherwise wait for the process to exit on its own.
|
|
364
|
+
if (this.child && drainTimeoutMs > 0) {
|
|
365
|
+
if (drainCommand) await this.runHook(drainCommand, drainTimeoutMs, "drain command")
|
|
366
|
+
else await this.waitForExit(drainTimeoutMs)
|
|
367
|
+
}
|
|
368
|
+
|
|
369
|
+
// 3. Stop whatever is still running, then SIGKILL if it outlasts the graceful window.
|
|
370
|
+
if (this.child) {
|
|
371
|
+
if (stopCommand) await this.runHook(stopCommand, this.stopTimeoutMs, "stop command")
|
|
372
|
+
else this.killProcessGroup(this.stopSignal)
|
|
373
|
+
|
|
374
|
+
const timeoutMs = options.timeoutMs ?? this.stopTimeoutMs
|
|
375
|
+
|
|
376
|
+
if (this.child && !(await this.waitForExit(timeoutMs))) {
|
|
377
|
+
this.logger("process stop timed out; sending SIGKILL", {id: this.id, pid: this.pid})
|
|
378
|
+
this.killProcessGroup("SIGKILL")
|
|
379
|
+
await this.waitForExit(5000)
|
|
380
|
+
}
|
|
187
381
|
}
|
|
188
382
|
|
|
189
383
|
this.state = "stopped"
|
|
190
384
|
}
|
|
191
385
|
|
|
192
386
|
/**
|
|
193
|
-
*
|
|
387
|
+
* Runs a lifecycle hook command, bounded by a timeout so a hung hook can never block stop().
|
|
388
|
+
* Failures are logged and swallowed — the graceful-stop sequence proceeds (and SIGKILL is the
|
|
389
|
+
* ultimate fallback) regardless of the hook's outcome.
|
|
390
|
+
* @param {string} command - Shell command to run.
|
|
391
|
+
* @param {number} timeoutMs - Maximum time to wait for the hook before killing it.
|
|
392
|
+
* @param {string} label - Hook name, for log messages.
|
|
393
|
+
* @returns {Promise<void>} Resolves when the hook exits, errors, or times out.
|
|
394
|
+
*/
|
|
395
|
+
async runHook(command, timeoutMs, label) {
|
|
396
|
+
await new Promise((resolve) => {
|
|
397
|
+
let settled = false
|
|
398
|
+
const finish = () => { if (!settled) { settled = true; resolve(undefined) } }
|
|
399
|
+
|
|
400
|
+
/** @type {import("node:child_process").ChildProcess} */
|
|
401
|
+
let hook
|
|
402
|
+
|
|
403
|
+
try {
|
|
404
|
+
hook = spawn(command, {
|
|
405
|
+
cwd: this.cwd,
|
|
406
|
+
detached: true,
|
|
407
|
+
env: {...process.env, ...this.env, ROLLBRIDGE_PID: this.pid ? String(this.pid) : ""},
|
|
408
|
+
shell: true,
|
|
409
|
+
stdio: "ignore"
|
|
410
|
+
})
|
|
411
|
+
} catch (error) {
|
|
412
|
+
this.logger(`${label} failed`, {error: error instanceof Error ? error.message : String(error), id: this.id})
|
|
413
|
+
finish()
|
|
414
|
+
|
|
415
|
+
return
|
|
416
|
+
}
|
|
417
|
+
|
|
418
|
+
const timer = setTimeout(() => {
|
|
419
|
+
this.logger(`${label} timed out`, {id: this.id, timeoutMs})
|
|
420
|
+
|
|
421
|
+
if (hook.pid) {
|
|
422
|
+
try {
|
|
423
|
+
process.kill(-hook.pid, "SIGKILL")
|
|
424
|
+
} catch {
|
|
425
|
+
// The hook already exited.
|
|
426
|
+
}
|
|
427
|
+
}
|
|
428
|
+
|
|
429
|
+
finish()
|
|
430
|
+
}, timeoutMs)
|
|
431
|
+
|
|
432
|
+
hook.once("exit", (code, signal) => {
|
|
433
|
+
clearTimeout(timer)
|
|
434
|
+
|
|
435
|
+
// A non-zero/signalled exit is surfaced (but still non-fatal); skip when the timeout
|
|
436
|
+
// already killed the hook, which logs separately.
|
|
437
|
+
if (!settled) {
|
|
438
|
+
if (typeof code === "number" && code !== 0) this.logger(`${label} exited non-zero`, {code, id: this.id})
|
|
439
|
+
else if (signal) this.logger(`${label} exited on signal`, {id: this.id, signal})
|
|
440
|
+
}
|
|
441
|
+
|
|
442
|
+
finish()
|
|
443
|
+
})
|
|
444
|
+
hook.once("error", (error) => {
|
|
445
|
+
clearTimeout(timer)
|
|
446
|
+
this.logger(`${label} failed`, {error: error instanceof Error ? error.message : String(error), id: this.id})
|
|
447
|
+
finish()
|
|
448
|
+
})
|
|
449
|
+
})
|
|
450
|
+
}
|
|
451
|
+
|
|
452
|
+
/**
|
|
453
|
+
* @param {string} signal - Signal name to send (the configured stop signal, or "SIGKILL").
|
|
194
454
|
* @returns {void}
|
|
195
455
|
*/
|
|
196
456
|
killProcessGroup(signal) {
|
|
@@ -226,14 +486,19 @@ export default class ManagedProcess extends EventEmitter {
|
|
|
226
486
|
/** @returns {ManagedProcessStatus} Status payload. */
|
|
227
487
|
status() {
|
|
228
488
|
return {
|
|
489
|
+
children: this.children,
|
|
229
490
|
command: this.command,
|
|
230
491
|
cwd: this.cwd,
|
|
231
492
|
exitCode: this.exitCode,
|
|
232
493
|
exitSignal: this.exitSignal,
|
|
233
494
|
id: this.id,
|
|
495
|
+
lastMemoryRestartAt: this.lastMemoryRestartAtMs === undefined ? undefined : new Date(this.lastMemoryRestartAtMs).toISOString(),
|
|
496
|
+
lastStartReason: this.lastStartReason,
|
|
234
497
|
logs: this.logs.slice(-this.outputLines),
|
|
498
|
+
memoryRestarts: this.memoryRestarts,
|
|
235
499
|
pid: this.pid,
|
|
236
500
|
restarts: this.restarts,
|
|
501
|
+
rssBytes: this.rssBytes,
|
|
237
502
|
startedAt: this.startedAtMs === undefined ? undefined : new Date(this.startedAtMs).toISOString(),
|
|
238
503
|
state: this.state,
|
|
239
504
|
uptimeMs: this.state === "running" && this.startedAtMs !== undefined ? Date.now() - this.startedAtMs : undefined
|
|
@@ -0,0 +1,110 @@
|
|
|
1
|
+
// @ts-check
|
|
2
|
+
|
|
3
|
+
import fs from "node:fs"
|
|
4
|
+
|
|
5
|
+
/**
|
|
6
|
+
* @typedef {{command: string, pid: number, rssBytes: number | undefined}} ProcessGroupMember
|
|
7
|
+
*/
|
|
8
|
+
|
|
9
|
+
/**
 * Enumerates the processes that belong to one process group, with each member's command
 * name and resident memory.
 *
 * Rollbridge spawns managed processes detached, so the spawned pid doubles as the group id
 * shared by the whole tree (shell wrapper, app, and any children).
 *
 * Works by scanning `/proc` (Linux). When `/proc` cannot be listed — for example on a
 * non-Linux host — or nothing matches, the result is an empty array.
 * @param {number} pgid - Process-group id (the detached spawn's pid).
 * @returns {ProcessGroupMember[]} Group members, ordered by pid.
 */
export function processGroupMembers(pgid) {
  /** @type {string[]} */
  let names

  try {
    names = fs.readdirSync("/proc")
  } catch {
    return []
  }

  /** @type {ProcessGroupMember[]} */
  const members = names.flatMap((name) => {
    // Only all-digit entries are pids; everything else under /proc is skipped.
    const isPid = /^\d+$/.test(name)

    if (!isPid || processGroupId(name) !== pgid) return []

    return [{command: commandName(name), pid: Number(name), rssBytes: residentBytes(name)}]
  })

  return members.sort((left, right) => left.pid - right.pid)
}
|
|
44
|
+
|
|
45
|
+
/**
 * Adds up the resident memory (RSS) of every measurable member of a managed process group.
 * @param {number} pgid - Process-group id (the detached spawn's pid).
 * @returns {number | undefined} Total resident memory in bytes, or undefined when no member could be measured.
 */
export function measureProcessGroupRssBytes(pgid) {
  /** @type {number | undefined} */
  let total

  for (const {rssBytes} of processGroupMembers(pgid)) {
    if (rssBytes === undefined) continue

    // Stays undefined until the first measurable member is seen.
    total = (total ?? 0) + rssBytes
  }

  return total
}
|
|
57
|
+
|
|
58
|
+
/**
 * Looks up a process's command name from `/proc/<pid>/comm`.
 * @param {string} pid - Process id.
 * @returns {string} The trimmed command name, or "" when the file cannot be read.
 */
function commandName(pid) {
  const commPath = `/proc/${pid}/comm`
  let raw

  try {
    raw = fs.readFileSync(commPath, "utf8")
  } catch {
    return ""
  }

  return raw.trim()
}
|
|
69
|
+
|
|
70
|
+
/**
|
|
71
|
+
* @param {string} pid - Process id.
|
|
72
|
+
* @returns {number | undefined} The process-group id, or undefined when the process is gone.
|
|
73
|
+
*/
|
|
74
|
+
function processGroupId(pid) {
|
|
75
|
+
let stat
|
|
76
|
+
|
|
77
|
+
try {
|
|
78
|
+
stat = fs.readFileSync(`/proc/${pid}/stat`, "utf8")
|
|
79
|
+
} catch {
|
|
80
|
+
return undefined
|
|
81
|
+
}
|
|
82
|
+
|
|
83
|
+
// The comm field is wrapped in parens and may itself contain spaces or parens, so the
|
|
84
|
+
// numeric fields are parsed from after the final ")". They are: state, ppid, pgrp, ...
|
|
85
|
+
const commEnd = stat.lastIndexOf(")")
|
|
86
|
+
|
|
87
|
+
if (commEnd < 0) return undefined
|
|
88
|
+
|
|
89
|
+
const pgrp = Number(stat.slice(commEnd + 2).split(" ")[2])
|
|
90
|
+
|
|
91
|
+
return Number.isInteger(pgrp) ? pgrp : undefined
|
|
92
|
+
}
|
|
93
|
+
|
|
94
|
+
/**
 * Reads a process's resident memory from the `VmRSS` line of `/proc/<pid>/status`.
 * @param {string} pid - Process id.
 * @returns {number | undefined} Resident memory in bytes, or undefined when the file is unreadable or has no VmRSS line.
 */
function residentBytes(pid) {
  let contents

  try {
    contents = fs.readFileSync(`/proc/${pid}/status`, "utf8")
  } catch {
    return undefined
  }

  const found = /^VmRSS:\s+(\d+)\s+kB/m.exec(contents)

  if (found === null) return undefined

  // The value is matched with a "kB" suffix; scale by 1024 to get bytes.
  return Number(found[1]) * 1024
}
|
package/src/recover.js
ADDED
|
@@ -0,0 +1,134 @@
|
|
|
1
|
+
// @ts-check
|
|
2
|
+
|
|
3
|
+
import {inspectControlSocket} from "./daemon.js"
|
|
4
|
+
import {clearState, isProcessAlive, liveProcesses, readState} from "./state-store.js"
|
|
5
|
+
|
|
6
|
+
// How long to confirm a SIGKILL'd group has actually exited before reporting it un-stoppable.
|
|
7
|
+
const KILL_CONFIRM_TIMEOUT_MS = 500
|
|
8
|
+
|
|
9
|
+
/**
|
|
10
|
+
* @typedef {{id: string, pid: number, releaseId: string | null}} OrphanProcess
|
|
11
|
+
* @typedef {{error: string}} RecoverError
|
|
12
|
+
* @typedef {{cleared: boolean, forced: boolean, orphans: OrphanProcess[], remaining: OrphanProcess[]}} RecoverReport
|
|
13
|
+
* @typedef {RecoverError | RecoverReport} RecoverResult
|
|
14
|
+
*/
|
|
15
|
+
|
|
16
|
+
/**
 * Finds — and with `force`, stops — managed processes that outlived a crashed daemon.
 *
 * The persisted state at `config.statePath` is read and reduced to the processes that are
 * still alive. Without `force` this is a dry run that only lists them. With `force` each
 * orphan's process group is stopped in turn, and the state file is cleared only when every
 * one of them is confirmed gone. Any survivor (for example a process owned by another user
 * that can't be signaled) is returned in `remaining`, `cleared` stays false, and the state
 * file is kept so recovery can be re-run.
 *
 * Recovery refuses to run while something is live on the control socket: those pids belong
 * to a running daemon, not a crash. A recycled pid can be a false positive, so review the
 * dry-run list before using `force`.
 * @param {object} args - Options.
 * @param {import("./config.js").RollbridgeConfig} args.config - Normalized config.
 * @param {boolean} args.force - Whether to actually stop the orphans (otherwise list them).
 * @param {(pid: number, timeoutMs: number) => Promise<boolean>} [args.stopGroup] - Stops a process group and resolves to whether it is gone afterward (defaults to the real implementation; injectable for tests).
 * @returns {Promise<RecoverResult>} The orphans found and whether they were stopped, or an error.
 */
export async function recoverOrphans({config, force, stopGroup = stopProcessGroup}) {
  const {statePath} = config

  if (statePath === undefined) {
    return {error: "No statePath is configured; set statePath in the config to enable recovery."}
  }

  const socketInUse = await daemonIsRunning(config.control.path)

  if (socketInUse) {
    return {error: `A daemon (or another process) is using ${config.control.path}; stop it before recovering — recover is for cleaning up after a crash.`}
  }

  const orphans = liveProcesses(await readState(statePath))

  // Dry run: report what was found without touching anything.
  if (!force) return {cleared: false, forced: false, orphans, remaining: []}

  /** @type {OrphanProcess[]} */
  const remaining = []

  for (const orphan of orphans) {
    const gone = await stopGroup(orphan.pid, config.proxy.forceStopTimeoutMs)

    if (gone) continue

    remaining.push(orphan)
  }

  // The state file is the only record of the survivors, so it is dropped only when none remain.
  const cleared = remaining.length === 0

  if (cleared) await clearState(statePath)

  return {cleared, forced: true, orphans, remaining}
}
|
|
63
|
+
|
|
64
|
+
/**
 * Probes the control socket to see whether something is serving it.
 * @param {string} socketPath - Control socket path.
 * @returns {Promise<boolean>} True when a process is live on the socket, and also when the probe itself fails.
 */
async function daemonIsRunning(socketPath) {
  try {
    const {alive} = await inspectControlSocket(socketPath)

    return alive
  } catch {
    // An inconclusive probe is treated as "running" so recovery never stops live processes.
    return true
  }
}
|
|
76
|
+
|
|
77
|
+
/**
 * Stops a detached process group by escalation: SIGTERM with the caller's grace period,
 * then SIGKILL with a short confirmation window.
 * @param {number} pid - Process-group leader pid (the detached spawn's pid).
 * @param {number} timeoutMs - Grace period before SIGKILL.
 * @returns {Promise<boolean>} True once the process is gone; false if it is still alive afterward (for example owned by another user, so it can't be signaled).
 */
async function stopProcessGroup(pid, timeoutMs) {
  /** @type {Array<["SIGTERM" | "SIGKILL", number]>} */
  const escalation = [
    ["SIGTERM", timeoutMs],
    ["SIGKILL", KILL_CONFIRM_TIMEOUT_MS]
  ]

  for (const [signal, graceMs] of escalation) {
    const outcome = sendSignal(pid, signal)

    // "gone" means there is nothing left to stop; "denied" means we cannot stop it at all.
    if (outcome !== "sent") return outcome === "gone"

    if (await waitForExit(pid, graceMs)) return true
  }

  // Still alive even after SIGKILL and its confirmation window.
  return false
}
|
|
98
|
+
|
|
99
|
+
/**
 * Waits for a process to disappear, re-checking every 50 ms until the timeout elapses.
 * @param {number} pid - Process pid to watch.
 * @param {number} timeoutMs - How long to wait for it to exit.
 * @returns {Promise<boolean>} True once the process is gone, false if it is still alive at the deadline.
 */
async function waitForExit(pid, timeoutMs) {
  const pollIntervalMs = 50
  const deadline = Date.now() + timeoutMs

  for (;;) {
    // At or past the deadline: one final liveness check decides the result.
    if (Date.now() >= deadline) return !isProcessAlive(pid)

    if (!isProcessAlive(pid)) return true

    await new Promise((resolve) => setTimeout(resolve, pollIntervalMs))
  }
}
|
|
116
|
+
|
|
117
|
+
/**
 * Sends a signal to a detached process group, classifying the outcome.
 *
 * The pid comes from a persisted state file, so it is validated before being negated into
 * a group target. Negating 0 addresses the caller's own process group, and negating 1
 * yields -1, which kill(2) treats as "every process the caller may signal". Such pids
 * (and non-integer or negative values) are refused and reported as `denied`, so a corrupt
 * state entry can never broadcast a signal.
 * @param {number} pid - Process-group leader pid.
 * @param {"SIGTERM" | "SIGKILL"} signal - Signal to send to the group.
 * @returns {"denied" | "gone" | "sent"} `gone` when the group no longer exists (ESRCH), `denied` when it can't or must not be signaled (for example EPERM, or an invalid pid), otherwise `sent`.
 */
function sendSignal(pid, signal) {
  // Never let a bad pid turn into kill(0) or kill(-1).
  if (!Number.isInteger(pid) || pid <= 1) return "denied"

  try {
    process.kill(-pid, signal)

    return "sent"
  } catch (error) {
    if (error && typeof error === "object" && "code" in error && error.code === "ESRCH") return "gone"

    // EPERM (owned by another user) or anything else: we could not deliver the signal.
    return "denied"
  }
}
|