rollbridge 0.1.5 → 0.1.6

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -2,14 +2,16 @@
2
2
 
3
3
  import {EventEmitter} from "node:events"
4
4
  import {spawn} from "node:child_process"
5
+ import {processGroupMembers} from "./process-memory.js"
5
6
 
6
7
  /**
7
8
  * @typedef {import("./json.js").JsonValue} JsonValue
8
9
  * @typedef {"starting" | "running" | "stopping" | "stopped" | "failed"} ManagedProcessState
10
+ * @typedef {"deploy" | "crash" | "manual" | "memory"} ManagedProcessStartReason
9
11
  * @typedef {import("node:child_process").ChildProcess["signalCode"]} ProcessExitSignal
10
12
  * @typedef {{at: string, line: string, stream: "stdout" | "stderr"}} ManagedProcessLog
11
- * @typedef {{command: string, cwd: string | undefined, env: Record<string, string | undefined>, logger: (message: string, data?: Record<string, import("./json.js").JsonValue>) => void, outputLines: number, restart: import("./config.js").RestartConfig, restartDelayMs: number, shouldRestart: () => boolean, stopTimeoutMs: number}} ManagedProcessDefinition
12
- * @typedef {{command: string, cwd: string | undefined, exitCode: number | null | undefined, exitSignal: ProcessExitSignal | undefined, id: string, logs: ManagedProcessLog[], pid: number | undefined, restarts: number, startedAt: string | undefined, state: ManagedProcessState, uptimeMs: number | undefined}} ManagedProcessStatus
13
+ * @typedef {{command: string, cwd: string | undefined, env: Record<string, string | undefined>, lifecycle: import("./config.js").LifecycleConfig, logger: (message: string, data?: Record<string, import("./json.js").JsonValue>) => void, memory: import("./config.js").MemoryConfig | undefined, outputLines: number, restart: import("./config.js").RestartConfig, restartDelayMs: number, shouldRestart: () => boolean, stopSignal: string, stopTimeoutMs: number}} ManagedProcessDefinition
14
+ * @typedef {{children: import("./process-memory.js").ProcessGroupMember[], command: string, cwd: string | undefined, exitCode: number | null | undefined, exitSignal: ProcessExitSignal | undefined, id: string, lastMemoryRestartAt: string | undefined, lastStartReason: ManagedProcessStartReason | undefined, logs: ManagedProcessLog[], memoryRestarts: number, pid: number | undefined, restarts: number, rssBytes: number | undefined, startedAt: string | undefined, state: ManagedProcessState, uptimeMs: number | undefined}} ManagedProcessStatus
13
15
  */
14
16
 
15
17
  export default class ManagedProcess extends EventEmitter {
@@ -20,29 +22,43 @@ export default class ManagedProcess extends EventEmitter {
20
22
  * @param {Record<string, string | undefined>} args.env - Environment.
21
23
  * @param {string} args.id - Process id.
22
24
  * @param {(message: string, data?: Record<string, JsonValue>) => void} args.logger - Logger callback.
25
+ * @param {import("./config.js").LifecycleConfig} [args.lifecycle] - Graceful-stop lifecycle hooks (none by default).
26
+ * @param {import("./config.js").MemoryConfig} [args.memory] - Memory supervision config (off when omitted).
23
27
  * @param {number} args.outputLines - Recent stdout/stderr lines to retain and report.
24
28
  * @param {import("./config.js").RestartConfig} [args.restart] - Restart policy (defaults to unlimited restarts with a constant delay).
25
29
  * @param {number} args.restartDelayMs - Restart delay.
26
30
  * @param {() => boolean} args.shouldRestart - Restart policy callback.
31
+ * @param {string} [args.stopSignal] - Signal sent to gracefully stop the process (default "SIGTERM").
27
32
  * @param {number} args.stopTimeoutMs - Stop timeout.
28
33
  */
29
- constructor({command, cwd, env, id, logger, outputLines, restart = {backoffFactor: 1, maxDelayMs: 0, maxRestarts: undefined, windowMs: 0}, restartDelayMs, shouldRestart, stopTimeoutMs}) {
34
+ constructor({command, cwd, env, id, lifecycle = {drainTimeoutMs: 0}, logger, memory, outputLines, restart = {backoffFactor: 1, maxDelayMs: 0, maxRestarts: undefined, windowMs: 0}, restartDelayMs, shouldRestart, stopSignal = "SIGTERM", stopTimeoutMs}) {
30
35
  super()
31
36
 
32
37
  this.command = command
33
38
  this.cwd = cwd
34
39
  this.env = env
35
40
  this.id = id
41
+ this.lifecycle = lifecycle
36
42
  this.logger = logger
43
+ this.memory = memory
37
44
  this.outputLines = outputLines
38
45
  this.restart = restart
39
46
  this.restartDelayMs = restartDelayMs
40
47
  this.shouldRestart = shouldRestart
48
+ this.stopSignal = stopSignal
41
49
  this.stopTimeoutMs = stopTimeoutMs
42
50
  this.state = /** @type {ManagedProcessState} */ ("stopped")
51
+ this.lastStartReason = /** @type {ManagedProcessStartReason | undefined} */ (undefined)
43
52
  this.logs = /** @type {ManagedProcessLog[]} */ ([])
44
53
  this.restarts = 0
45
54
  this.recentRestarts = /** @type {number[]} */ ([])
55
+ this.rssBytes = /** @type {number | undefined} */ (undefined)
56
+ this.children = /** @type {import("./process-memory.js").ProcessGroupMember[]} */ ([])
57
+ this.memoryRestarts = 0
58
+ this.lastMemoryRestartAtMs = /** @type {number | undefined} */ (undefined)
59
+ this.memoryTimer = /** @type {ReturnType<typeof setInterval> | undefined} */ (undefined)
60
+ this.memoryRestarting = false
61
+ this.memoryWarned = false
46
62
  this.startedAtMs = /** @type {number | undefined} */ (undefined)
47
63
  this.intentionalStop = false
48
64
  this.restartTimer = undefined
@@ -53,8 +69,11 @@ export default class ManagedProcess extends EventEmitter {
53
69
  this.exitSignal = undefined
54
70
  }
55
71
 
56
- /** @returns {Promise<void>} Resolves after spawn. */
57
- async start() {
72
+ /**
73
+ * @param {ManagedProcessStartReason} [reason] - Why the process is being started (deploy by default; "crash" on auto-restart, "manual" via the restart command).
74
+ * @returns {Promise<void>} Resolves after spawn.
75
+ */
76
+ async start(reason = "deploy") {
58
77
  if (this.child) return
59
78
 
60
79
  this.intentionalStop = false
@@ -83,7 +102,9 @@ export default class ManagedProcess extends EventEmitter {
83
102
  child.once("spawn", () => {
84
103
  this.state = "running"
85
104
  this.startedAtMs = Date.now()
86
- this.logger("process started", {command: this.command, id: this.id, pid: child.pid || null})
105
+ this.lastStartReason = reason
106
+ this.logger("process started", {command: this.command, id: this.id, pid: child.pid || null, reason})
107
+ this.startMemoryMonitor()
87
108
  this.emit("started")
88
109
  resolve(undefined)
89
110
  })
@@ -107,11 +128,14 @@ export default class ManagedProcess extends EventEmitter {
107
128
  this.command = definition.command
108
129
  this.cwd = definition.cwd
109
130
  this.env = definition.env
131
+ this.lifecycle = definition.lifecycle
110
132
  this.logger = definition.logger
133
+ this.memory = definition.memory
111
134
  this.outputLines = definition.outputLines
112
135
  this.restart = definition.restart
113
136
  this.restartDelayMs = definition.restartDelayMs
114
137
  this.shouldRestart = definition.shouldRestart
138
+ this.stopSignal = definition.stopSignal
115
139
  this.stopTimeoutMs = definition.stopTimeoutMs
116
140
  }
117
141
 
@@ -145,6 +169,9 @@ export default class ManagedProcess extends EventEmitter {
145
169
  this.child = undefined
146
170
  this.pid = undefined
147
171
  this.exitPromise = undefined
172
+ this.rssBytes = undefined
173
+ this.children = []
174
+ this.clearMemoryMonitor()
148
175
  this.state = wasIntentional ? "stopped" : "failed"
149
176
  this.logger("process exited", {code, id: this.id, signal})
150
177
  this.emit("exit", {code, signal})
@@ -206,10 +233,103 @@ export default class ManagedProcess extends EventEmitter {
206
233
  this.restartTimer = setTimeout(() => {
207
234
  this.restartTimer = undefined
208
235
  this.restarts += 1
209
- this.start().catch((error) => {
236
+ this.start("crash").catch((error) => {
210
237
  this.logger("process restart failed", {error: error instanceof Error ? error.message : String(error), id: this.id})
211
238
  })
212
239
  }, delayMs)
240
+
241
+ // The daemon's listening servers govern its lifetime; a pending restart must never be the sole
242
+ // handle keeping the process alive (like the memory and persist timers above). Otherwise a
243
+ // crashed process with an unlimited restart policy would respawn forever and block exit.
244
+ this.restartTimer.unref?.()
245
+ }
246
+
247
+ /**
248
+ * Starts the periodic RSS check for this process when memory supervision is configured.
249
+ * @returns {void}
250
+ */
251
+ startMemoryMonitor() {
252
+ this.clearMemoryMonitor()
253
+
254
+ if (!this.memory) return
255
+
256
+ this.memoryTimer = setInterval(() => this.checkMemory(), this.memory.checkIntervalMs)
257
+ this.memoryTimer.unref?.()
258
+ }
259
+
260
+ /** @returns {void} Stops the periodic RSS check. */
261
+ clearMemoryMonitor() {
262
+ if (this.memoryTimer) {
263
+ clearInterval(this.memoryTimer)
264
+ this.memoryTimer = undefined
265
+ }
266
+ }
267
+
268
+ /**
269
+ * Measures the process group's RSS and warns or restarts when it crosses the configured thresholds.
270
+ * @returns {void}
271
+ */
272
+ checkMemory() {
273
+ if (!this.memory || !this.pid || this.memoryRestarting) return
274
+
275
+ const members = processGroupMembers(this.pid)
276
+
277
+ if (members.length === 0) return
278
+
279
+ this.children = members
280
+
281
+ const measured = members.filter((member) => member.rssBytes !== undefined)
282
+
283
+ if (measured.length === 0) return
284
+
285
+ const rssBytes = measured.reduce((total, member) => total + (member.rssBytes ?? 0), 0)
286
+
287
+ this.rssBytes = rssBytes
288
+
289
+ if (rssBytes > this.memory.limitBytes) {
290
+ this.logger("memory limit exceeded", {id: this.id, limitBytes: this.memory.limitBytes, rssBytes})
291
+ void this.restartForMemory()
292
+
293
+ return
294
+ }
295
+
296
+ if (this.memory.warnBytes > 0 && rssBytes > this.memory.warnBytes) {
297
+ if (!this.memoryWarned) {
298
+ this.logger("memory warning", {id: this.id, rssBytes, warnBytes: this.memory.warnBytes})
299
+ this.memoryWarned = true
300
+ }
301
+ } else {
302
+ this.memoryWarned = false
303
+ }
304
+ }
305
+
306
+ /**
307
+ * Gracefully restarts the process after it exceeded its memory limit (SIGTERM, then
308
+ * SIGKILL after the stop timeout), recording the restart so status can report it.
309
+ * @returns {Promise<void>} Resolves once the process has been restarted.
310
+ */
311
+ async restartForMemory() {
312
+ if (this.memoryRestarting) return
313
+
314
+ this.memoryRestarting = true
315
+
316
+ try {
317
+ await this.stop()
318
+
319
+ // Don't respawn if the supervising context no longer wants this process running
320
+ // (daemon shutting down, or the release draining/retired) — otherwise a restart racing
321
+ // a shutdown could leave a child running after shutdown collected its stop promises.
322
+ if (!this.shouldRestart()) return
323
+
324
+ this.memoryRestarts += 1
325
+ this.lastMemoryRestartAtMs = Date.now()
326
+ this.memoryWarned = false
327
+ await this.start("memory")
328
+ } catch (error) {
329
+ this.logger("memory restart failed", {error: error instanceof Error ? error.message : String(error), id: this.id})
330
+ } finally {
331
+ this.memoryRestarting = false
332
+ }
213
333
  }
214
334
 
215
335
  /**
@@ -218,6 +338,7 @@ export default class ManagedProcess extends EventEmitter {
218
338
  */
219
339
  async stop(options = {}) {
220
340
  this.intentionalStop = true
341
+ this.clearMemoryMonitor()
221
342
 
222
343
  if (this.restartTimer) {
223
344
  clearTimeout(this.restartTimer)
@@ -232,21 +353,104 @@ export default class ManagedProcess extends EventEmitter {
232
353
  }
233
354
 
234
355
  this.state = "stopping"
235
- this.killProcessGroup("SIGTERM")
236
- const timeoutMs = options.timeoutMs ?? this.stopTimeoutMs
237
- const stopped = await this.waitForExit(timeoutMs)
238
-
239
- if (!stopped) {
240
- this.logger("process stop timed out; sending SIGKILL", {id: this.id, pid: child.pid})
241
- this.killProcessGroup("SIGKILL")
242
- await this.waitForExit(5000)
356
+
357
+ const {drainCommand, drainTimeoutMs, quietCommand, stopCommand} = this.lifecycle
358
+
359
+ // 1. Quiesce: tell the process to stop accepting new work.
360
+ if (quietCommand) await this.runHook(quietCommand, this.stopTimeoutMs, "quiet command")
361
+
362
+ // 2. Drain: let in-flight work finish, bounded by drainTimeoutMs (0 skips the step). A
363
+ // drainCommand blocks until drained; otherwise wait for the process to exit on its own.
364
+ if (this.child && drainTimeoutMs > 0) {
365
+ if (drainCommand) await this.runHook(drainCommand, drainTimeoutMs, "drain command")
366
+ else await this.waitForExit(drainTimeoutMs)
367
+ }
368
+
369
+ // 3. Stop whatever is still running, then SIGKILL if it outlasts the graceful window.
370
+ if (this.child) {
371
+ if (stopCommand) await this.runHook(stopCommand, this.stopTimeoutMs, "stop command")
372
+ else this.killProcessGroup(this.stopSignal)
373
+
374
+ const timeoutMs = options.timeoutMs ?? this.stopTimeoutMs
375
+
376
+ if (this.child && !(await this.waitForExit(timeoutMs))) {
377
+ this.logger("process stop timed out; sending SIGKILL", {id: this.id, pid: this.pid})
378
+ this.killProcessGroup("SIGKILL")
379
+ await this.waitForExit(5000)
380
+ }
243
381
  }
244
382
 
245
383
  this.state = "stopped"
246
384
  }
247
385
 
248
386
  /**
249
- * @param {"SIGTERM" | "SIGKILL"} signal - Signal to send.
387
+ * Runs a lifecycle hook command, bounded by a timeout so a hung hook can never block stop().
388
+ * Failures are logged and swallowed — the graceful-stop sequence proceeds (and SIGKILL is the
389
+ * ultimate fallback) regardless of the hook's outcome.
390
+ * @param {string} command - Shell command to run.
391
+ * @param {number} timeoutMs - Maximum time to wait for the hook before killing it.
392
+ * @param {string} label - Hook name, for log messages.
393
+ * @returns {Promise<void>} Resolves when the hook exits, errors, or times out.
394
+ */
395
+ async runHook(command, timeoutMs, label) {
396
+ await new Promise((resolve) => {
397
+ let settled = false
398
+ const finish = () => { if (!settled) { settled = true; resolve(undefined) } }
399
+
400
+ /** @type {import("node:child_process").ChildProcess} */
401
+ let hook
402
+
403
+ try {
404
+ hook = spawn(command, {
405
+ cwd: this.cwd,
406
+ detached: true,
407
+ env: {...process.env, ...this.env, ROLLBRIDGE_PID: this.pid ? String(this.pid) : ""},
408
+ shell: true,
409
+ stdio: "ignore"
410
+ })
411
+ } catch (error) {
412
+ this.logger(`${label} failed`, {error: error instanceof Error ? error.message : String(error), id: this.id})
413
+ finish()
414
+
415
+ return
416
+ }
417
+
418
+ const timer = setTimeout(() => {
419
+ this.logger(`${label} timed out`, {id: this.id, timeoutMs})
420
+
421
+ if (hook.pid) {
422
+ try {
423
+ process.kill(-hook.pid, "SIGKILL")
424
+ } catch {
425
+ // The hook already exited.
426
+ }
427
+ }
428
+
429
+ finish()
430
+ }, timeoutMs)
431
+
432
+ hook.once("exit", (code, signal) => {
433
+ clearTimeout(timer)
434
+
435
+ // A non-zero/signalled exit is surfaced (but still non-fatal); skip when the timeout
436
+ // already killed the hook, which logs separately.
437
+ if (!settled) {
438
+ if (typeof code === "number" && code !== 0) this.logger(`${label} exited non-zero`, {code, id: this.id})
439
+ else if (signal) this.logger(`${label} exited on signal`, {id: this.id, signal})
440
+ }
441
+
442
+ finish()
443
+ })
444
+ hook.once("error", (error) => {
445
+ clearTimeout(timer)
446
+ this.logger(`${label} failed`, {error: error instanceof Error ? error.message : String(error), id: this.id})
447
+ finish()
448
+ })
449
+ })
450
+ }
451
+
452
+ /**
453
+ * @param {string} signal - Signal name to send (the configured stop signal, or "SIGKILL").
250
454
  * @returns {void}
251
455
  */
252
456
  killProcessGroup(signal) {
@@ -282,14 +486,19 @@ export default class ManagedProcess extends EventEmitter {
282
486
  /** @returns {ManagedProcessStatus} Status payload. */
283
487
  status() {
284
488
  return {
489
+ children: this.children,
285
490
  command: this.command,
286
491
  cwd: this.cwd,
287
492
  exitCode: this.exitCode,
288
493
  exitSignal: this.exitSignal,
289
494
  id: this.id,
495
+ lastMemoryRestartAt: this.lastMemoryRestartAtMs === undefined ? undefined : new Date(this.lastMemoryRestartAtMs).toISOString(),
496
+ lastStartReason: this.lastStartReason,
290
497
  logs: this.logs.slice(-this.outputLines),
498
+ memoryRestarts: this.memoryRestarts,
291
499
  pid: this.pid,
292
500
  restarts: this.restarts,
501
+ rssBytes: this.rssBytes,
293
502
  startedAt: this.startedAtMs === undefined ? undefined : new Date(this.startedAtMs).toISOString(),
294
503
  state: this.state,
295
504
  uptimeMs: this.state === "running" && this.startedAtMs !== undefined ? Date.now() - this.startedAtMs : undefined
@@ -0,0 +1,110 @@
1
+ // @ts-check
2
+
3
+ import fs from "node:fs"
4
+
5
+ /**
6
+ * @typedef {{command: string, pid: number, rssBytes: number | undefined}} ProcessGroupMember
7
+ */
8
+
9
/**
 * Enumerates the processes that belong to a managed process group, with each one's command
 * name and resident memory.
 *
 * Rollbridge spawns each process detached, so the spawned pid is the process-group leader and
 * the whole tree (shell wrapper, app, any children) shares that group id.
 *
 * Data comes from `/proc` (Linux). When `/proc` cannot be listed (for example on non-Linux
 * systems), or nothing belongs to the group, the result is an empty array.
 * @param {number} pgid - Process-group id (the detached spawn's pid).
 * @returns {ProcessGroupMember[]} Group members, ordered by pid.
 */
export function processGroupMembers(pgid) {
  /** @type {string[]} */
  let procEntries

  try {
    procEntries = fs.readdirSync("/proc")
  } catch {
    return []
  }

  /** @type {ProcessGroupMember[]} */
  const groupMembers = procEntries.flatMap((name) => {
    // Only all-digit directory names are pids; everything else in /proc is skipped.
    const belongsToGroup = /^\d+$/.test(name) && processGroupId(name) === pgid

    return belongsToGroup ? [{command: commandName(name), pid: Number(name), rssBytes: residentBytes(name)}] : []
  })

  groupMembers.sort((left, right) => left.pid - right.pid)

  return groupMembers
}
44
+
45
/**
 * Sums the resident memory (RSS) of every measurable member of a managed process group.
 * @param {number} pgid - Process-group id (the detached spawn's pid).
 * @returns {number | undefined} Total resident memory in bytes, or undefined when no member could be measured.
 */
export function measureProcessGroupRssBytes(pgid) {
  let totalBytes = 0
  let measuredCount = 0

  for (const {rssBytes} of processGroupMembers(pgid)) {
    if (rssBytes === undefined) continue

    totalBytes += rssBytes
    measuredCount += 1
  }

  return measuredCount === 0 ? undefined : totalBytes
}
57
+
58
/**
 * Looks up a process's command name from `/proc/<pid>/comm`.
 * @param {string} pid - Process id.
 * @returns {string} The trimmed command name, or "" when the file cannot be read.
 */
function commandName(pid) {
  let rawName

  try {
    rawName = fs.readFileSync(`/proc/${pid}/comm`, "utf8")
  } catch {
    return ""
  }

  return rawName.trim()
}
69
+
70
+ /**
71
+ * @param {string} pid - Process id.
72
+ * @returns {number | undefined} The process-group id, or undefined when the process is gone.
73
+ */
74
+ function processGroupId(pid) {
75
+ let stat
76
+
77
+ try {
78
+ stat = fs.readFileSync(`/proc/${pid}/stat`, "utf8")
79
+ } catch {
80
+ return undefined
81
+ }
82
+
83
+ // The comm field is wrapped in parens and may itself contain spaces or parens, so the
84
+ // numeric fields are parsed from after the final ")". They are: state, ppid, pgrp, ...
85
+ const commEnd = stat.lastIndexOf(")")
86
+
87
+ if (commEnd < 0) return undefined
88
+
89
+ const pgrp = Number(stat.slice(commEnd + 2).split(" ")[2])
90
+
91
+ return Number.isInteger(pgrp) ? pgrp : undefined
92
+ }
93
+
94
/**
 * Reads a process's resident set size from the `VmRSS` line of `/proc/<pid>/status`.
 * @param {string} pid - Process id.
 * @returns {number | undefined} Resident memory in bytes, or undefined when the file cannot be read or has no `VmRSS` line.
 */
function residentBytes(pid) {
  let statusText

  try {
    statusText = fs.readFileSync(`/proc/${pid}/status`, "utf8")
  } catch {
    return undefined
  }

  // VmRSS is reported in kB; convert to bytes.
  const kilobytes = /^VmRSS:\s+(\d+)\s+kB/m.exec(statusText)?.[1]

  return kilobytes === undefined ? undefined : Number(kilobytes) * 1024
}
package/src/recover.js ADDED
@@ -0,0 +1,134 @@
1
+ // @ts-check
2
+
3
+ import {inspectControlSocket} from "./daemon.js"
4
+ import {clearState, isProcessAlive, liveProcesses, readState} from "./state-store.js"
5
+
6
+ // How long to confirm a SIGKILL'd group has actually exited before reporting it un-stoppable.
7
+ const KILL_CONFIRM_TIMEOUT_MS = 500
8
+
9
+ /**
10
+ * @typedef {{id: string, pid: number, releaseId: string | null}} OrphanProcess
11
+ * @typedef {{error: string}} RecoverError
12
+ * @typedef {{cleared: boolean, forced: boolean, orphans: OrphanProcess[], remaining: OrphanProcess[]}} RecoverReport
13
+ * @typedef {RecoverError | RecoverReport} RecoverResult
14
+ */
15
+
16
/**
 * Finds, and optionally stops, managed processes orphaned by a crashed daemon.
 *
 * The persisted state at `config.statePath` is read and filtered down to managed processes
 * whose pids are still alive. Without `force` this is a dry run that only reports them. With
 * `force`, each orphan's process group is stopped through `stopGroup`, one after another, and
 * the state file is cleared once every orphan is confirmed gone.
 *
 * If any orphan survives a forced run (for example a process owned by another user that can't
 * be signaled), the state file is **kept** so the operator can investigate and re-run
 * recovery; the survivors come back in `remaining` and `cleared` is false.
 *
 * Recovery refuses to run while a daemon (or another process) holds the control socket, since
 * those pids then belong to a live daemon rather than a crash. A recycled pid can be a false
 * positive, so review the dry-run list before using `force`.
 * @param {object} args - Options.
 * @param {import("./config.js").RollbridgeConfig} args.config - Normalized config.
 * @param {boolean} args.force - Whether to actually stop the orphans (otherwise list them).
 * @param {(pid: number, timeoutMs: number) => Promise<boolean>} [args.stopGroup] - Stops a process group and resolves to whether it is gone afterward (defaults to the real implementation; injectable for tests).
 * @returns {Promise<RecoverResult>} The orphans found and whether they were stopped, or an error.
 */
export async function recoverOrphans({config, force, stopGroup = stopProcessGroup}) {
  const {statePath} = config

  if (statePath === undefined) {
    return {error: "No statePath is configured; set statePath in the config to enable recovery."}
  }

  const socketInUse = await daemonIsRunning(config.control.path)

  if (socketInUse) {
    return {error: `A daemon (or another process) is using ${config.control.path}; stop it before recovering — recover is for cleaning up after a crash.`}
  }

  const persisted = await readState(statePath)
  const orphans = liveProcesses(persisted)

  // Dry run: report what was found without touching anything.
  if (!force) return {cleared: false, forced: false, orphans, remaining: []}

  /** @type {OrphanProcess[]} */
  const remaining = []

  for (const orphan of orphans) {
    const gone = await stopGroup(orphan.pid, config.proxy.forceStopTimeoutMs)

    if (!gone) remaining.push(orphan)
  }

  // The state file is the operator's only record of survivors, so drop it only when none remain.
  const cleared = remaining.length === 0

  if (cleared) await clearState(statePath)

  return {cleared, forced: true, orphans, remaining}
}
63
+
64
/**
 * Probes the control socket to decide whether something is still serving it.
 * @param {string} socketPath - Control socket path.
 * @returns {Promise<boolean>} True when the probe reports the socket alive, or when the probe itself fails.
 */
async function daemonIsRunning(socketPath) {
  try {
    const probe = await inspectControlSocket(socketPath)

    return probe.alive
  } catch {
    // An inconclusive probe is treated as "running": refusing is safer than stopping live processes.
    return true
  }
}
76
+
77
/**
 * Stops a detached process group by escalating signals: SIGTERM with the caller's grace
 * period, then SIGKILL with a short confirmation window.
 * @param {number} pid - Process-group leader pid (the detached spawn's pid).
 * @param {number} timeoutMs - Grace period before SIGKILL.
 * @returns {Promise<boolean>} True once the process is gone; false if it is still alive afterward (for example owned by another user, so it can't be signaled).
 */
async function stopProcessGroup(pid, timeoutMs) {
  /** @type {Array<["SIGTERM" | "SIGKILL", number]>} */
  const escalation = [
    ["SIGTERM", timeoutMs],
    ["SIGKILL", KILL_CONFIRM_TIMEOUT_MS]
  ]

  for (const [signal, waitMs] of escalation) {
    const outcome = sendSignal(pid, signal)

    if (outcome === "gone") return true
    if (outcome === "denied") return false

    const exited = await waitForExit(pid, waitMs)

    if (exited) return true
  }

  // Still alive after SIGKILL and its confirmation window.
  return false
}
98
+
99
/**
 * Waits for a pid to disappear, checking every 50 ms until the timeout elapses, with one final
 * check at the deadline.
 * @param {number} pid - Process pid to watch.
 * @param {number} timeoutMs - How long to wait for it to exit.
 * @returns {Promise<boolean>} True once the process is gone, false if it is still alive at the deadline.
 */
async function waitForExit(pid, timeoutMs) {
  const deadline = Date.now() + timeoutMs

  for (;;) {
    // Deadline reached: the answer is whatever a final liveness check says.
    if (Date.now() >= deadline) return !isProcessAlive(pid)

    if (!isProcessAlive(pid)) return true

    await new Promise((resolve) => setTimeout(resolve, 50))
  }
}
116
+
117
/**
 * Sends a signal to a detached process group, classifying the outcome.
 *
 * The pid is negated to address the whole group, so it must be an integer greater than 1:
 * a pid of 0 would signal the caller's own process group, and a pid of 1 would become
 * `kill(-1, …)`, which signals every process the caller is allowed to signal. Such pids (and
 * negative or non-integer values, for example from a corrupted state file) are refused as
 * `denied` without sending anything.
 * @param {number} pid - Process-group leader pid.
 * @param {"SIGTERM" | "SIGKILL"} signal - Signal to send to the group.
 * @returns {"denied" | "gone" | "sent"} `gone` when the group no longer exists (ESRCH), `denied` when it can't or must not be signaled (for example EPERM, or an unsafe pid), otherwise `sent`.
 */
function sendSignal(pid, signal) {
  // Never let a bad pid turn into a broadcast or a self-inflicted signal.
  if (!Number.isInteger(pid) || pid <= 1) return "denied"

  try {
    process.kill(-pid, signal)

    return "sent"
  } catch (error) {
    if (error && typeof error === "object" && "code" in error && error.code === "ESRCH") return "gone"

    // EPERM (owned by another user) or anything else: we could not deliver the signal.
    return "denied"
  }
}