rollbridge 0.1.5 → 0.1.7
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/README.md +125 -4
- package/TODO.md +45 -43
- package/docs/cli.md +166 -6
- package/docs/config.md +172 -2
- package/docs/logging.md +77 -0
- package/docs/releasing.md +53 -0
- package/docs/tensorbuzz-runbook.md +129 -0
- package/docs/velocious.md +49 -11
- package/docs/workers.md +115 -0
- package/package.json +1 -1
- package/src/cli.js +327 -1
- package/src/config.js +268 -6
- package/src/daemon.js +216 -13
- package/src/doctor.js +177 -0
- package/src/event-log.js +47 -0
- package/src/managed-process.js +225 -16
- package/src/predeploy-cleanup.js +340 -0
- package/src/process-memory.js +110 -0
- package/src/recover.js +134 -0
- package/src/release-group.js +71 -21
- package/src/state-store.js +103 -0
- package/src/system-ids.js +71 -0
- package/src/template.js +32 -0
- package/test/completion.test.js +64 -0
- package/test/config-validation.test.js +268 -0
- package/test/doctor.test.js +205 -3
- package/test/event-log.test.js +46 -0
- package/test/fixtures/memory-hog.js +19 -0
- package/test/managed-process.test.js +290 -0
- package/test/predeploy-cleanup.test.js +131 -0
- package/test/process-memory.test.js +40 -0
- package/test/recover.test.js +162 -0
- package/test/release-group.test.js +22 -0
- package/test/rollbridge.test.js +523 -6
- package/test/state-store.test.js +69 -0
- package/test/system-ids.test.js +24 -0
|
@@ -0,0 +1,340 @@
|
|
|
1
|
+
// @ts-check
|
|
2
|
+
|
|
3
|
+
import fs from "node:fs"
|
|
4
|
+
import path from "node:path"
|
|
5
|
+
import {spawnSync} from "node:child_process"
|
|
6
|
+
import {setTimeout as sleep} from "node:timers/promises"
|
|
7
|
+
import {inspectControlSocket} from "./daemon.js"
|
|
8
|
+
import {recoverOrphans} from "./recover.js"
|
|
9
|
+
import {sendControlCommand} from "./control-client.js"
|
|
10
|
+
|
|
11
|
+
/**
|
|
12
|
+
* @typedef {import("./json.js").JsonValue} JsonValue
|
|
13
|
+
* @typedef {{pid: number, parentPid: number, args: string}} ProcessRow
|
|
14
|
+
* @typedef {{action: string, legacyProcesses: ProcessRow[], recoveredOrphans: number}} PredeployCleanupResult
|
|
15
|
+
*/
|
|
16
|
+
|
|
17
|
+
/**
|
|
18
|
+
* Prepares a host for a Rollbridge deploy by handling the two cases that can block a fresh
|
|
19
|
+
* daemon startup: orphaned Rollbridge-managed pids from a crashed daemon, and explicitly
|
|
20
|
+
* configured legacy processes from the pre-Rollbridge supervisor.
|
|
21
|
+
* @param {object} args - Options.
|
|
22
|
+
* @param {import("./config.js").RollbridgeConfig} args.config - Rollbridge config.
|
|
23
|
+
* @param {(socketPath: string) => Promise<import("./daemon.js").ControlSocketInspection>} [args.inspectSocket] - Control socket probe.
|
|
24
|
+
* @param {string} [args.releasePath] - Pending release path, used to restart the daemon when this release changes Rollbridge itself.
|
|
25
|
+
* @param {(args: {command: Record<string, JsonValue>, path: string}) => Promise<Record<string, JsonValue>>} [args.sendCommand] - Control command sender.
|
|
26
|
+
* @param {(command: string, args: string[]) => import("node:child_process").SpawnSyncReturns<Buffer>} [args.runCommand] - Command runner.
|
|
27
|
+
* @param {(pid: number, signal: string) => void} [args.killProcess] - Signal sender.
|
|
28
|
+
* @param {(args: {config: import("./config.js").RollbridgeConfig, force: boolean}) => Promise<import("./recover.js").RecoverResult>} [args.recover] - Orphan recovery function.
|
|
29
|
+
* @returns {Promise<PredeployCleanupResult>} Cleanup result.
|
|
30
|
+
*/
|
|
31
|
+
export async function predeployCleanup({
  config,
  inspectSocket = inspectControlSocket,
  killProcess = process.kill,
  releasePath,
  recover = recoverOrphans,
  runCommand = spawnSync,
  sendCommand = sendControlCommand
}) {
  const socketPath = config.control.path
  const inspection = await inspectSocket(socketPath)
  const daemonWasAlive = inspection.alive

  if (daemonWasAlive) {
    const status = await activeDaemonStatus({config, inspection, sendCommand})

    // A daemon that already serves an active release and matches the pending release is
    // left running; nothing is cleaned up in that case.
    const reusable = Boolean(status.activeReleaseId) && daemonMatchesPendingRelease({config, releasePath, status})

    if (reusable) {
      return {action: "daemon-active", legacyProcesses: [], recoveredOrphans: 0}
    }

    // Otherwise ask it to shut down and wait until the control socket stops answering.
    await sendCommand({command: {command: "shutdown"}, path: socketPath})
    await waitForControlSocketShutdown({config, inspectSocket})
  }

  // Orphan recovery runs before the legacy takeover, in that order.
  const recoveredOrphans = await recoverConfiguredOrphans(config, recover)
  const legacyProcesses = await stopLegacyProcesses({config, killProcess, runCommand})
  const action = daemonWasAlive ? "daemon-stopped" : "no-daemon-cleaned"

  return {action, legacyProcesses, recoveredOrphans}
}
|
|
62
|
+
|
|
63
|
+
/**
|
|
64
|
+
* @param {object} args - Options.
|
|
65
|
+
* @param {import("./config.js").RollbridgeConfig} args.config - Rollbridge config.
|
|
66
|
+
* @param {import("./daemon.js").ControlSocketInspection} args.inspection - Socket inspection.
|
|
67
|
+
* @param {(args: {command: Record<string, JsonValue>, path: string}) => Promise<Record<string, JsonValue>>} args.sendCommand - Control command sender.
|
|
68
|
+
* @returns {Promise<import("./daemon.js").DaemonStatus>} Daemon status.
|
|
69
|
+
*/
|
|
70
|
+
async function activeDaemonStatus({config, inspection, sendCommand}) {
|
|
71
|
+
if (inspection.application === undefined) {
|
|
72
|
+
throw new Error(`A non-Rollbridge process is using ${config.control.path}; refusing predeploy cleanup.`)
|
|
73
|
+
}
|
|
74
|
+
|
|
75
|
+
if (inspection.application !== config.application) {
|
|
76
|
+
throw new Error(`A Rollbridge daemon for "${inspection.application}" is using ${config.control.path}; expected "${config.application}".`)
|
|
77
|
+
}
|
|
78
|
+
|
|
79
|
+
const status = await sendCommand({command: {command: "status"}, path: config.control.path})
|
|
80
|
+
|
|
81
|
+
return /** @type {import("./daemon.js").DaemonStatus} */ (status)
|
|
82
|
+
}
|
|
83
|
+
|
|
84
|
+
/**
|
|
85
|
+
* @param {object} args - Options.
|
|
86
|
+
* @param {import("./config.js").RollbridgeConfig} args.config - Rollbridge config.
|
|
87
|
+
* @param {string} [args.releasePath] - Pending release path.
|
|
88
|
+
* @param {import("./daemon.js").DaemonStatus} args.status - Active daemon status.
|
|
89
|
+
* @returns {boolean} True when the active daemon can be reused for this deploy.
|
|
90
|
+
*/
|
|
91
|
+
function daemonMatchesPendingRelease({config, releasePath, status}) {
  // A daemon whose proxy differs from the pending config cannot be reused.
  const proxyUnchanged = proxyMatchesConfig(status.proxy, config.proxy)
  if (!proxyUnchanged) return false

  // Without a pending release path the package-version comparison is skipped.
  if (releasePath === undefined) return true

  return !rollbridgePackageChanged({releasePath, status})
}
|
|
97
|
+
|
|
98
|
+
/**
|
|
99
|
+
* @param {import("./daemon.js").DaemonStatus["proxy"]} currentProxy - Current proxy status.
|
|
100
|
+
* @param {import("./config.js").ProxyConfig} expectedProxy - Pending release proxy config.
|
|
101
|
+
* @returns {boolean} True when the current daemon proxy matches the pending config.
|
|
102
|
+
*/
|
|
103
|
+
function proxyMatchesConfig(currentProxy, expectedProxy) {
  // Only host, port and upstreamHost are compared; any other proxy field is ignored.
  if (currentProxy.host !== expectedProxy.host) return false
  if (currentProxy.port !== expectedProxy.port) return false

  return currentProxy.upstreamHost === expectedProxy.upstreamHost
}
|
|
108
|
+
|
|
109
|
+
/**
|
|
110
|
+
* @param {object} args - Options.
|
|
111
|
+
* @param {string} args.releasePath - Pending release path.
|
|
112
|
+
* @param {import("./daemon.js").DaemonStatus} args.status - Active daemon status.
|
|
113
|
+
* @returns {boolean} True when the pending release uses a different Rollbridge package version.
|
|
114
|
+
*/
|
|
115
|
+
function rollbridgePackageChanged({releasePath, status}) {
  const current = status.releases.find((candidate) => candidate.releaseId === status.activeReleaseId)

  // No record of the active release means there is nothing to compare against.
  if (current === undefined) return false

  const pendingVersion = rollbridgePackageVersion(releasePath)
  const runningVersion = rollbridgePackageVersion(current.releasePath)

  // A version that cannot be read on either side is not treated as a change.
  if (pendingVersion === undefined || runningVersion === undefined) return false

  return pendingVersion !== runningVersion
}
|
|
126
|
+
|
|
127
|
+
/**
|
|
128
|
+
* @param {string} releasePath - Release path containing node_modules.
|
|
129
|
+
* @returns {string | undefined} Installed Rollbridge package version.
|
|
130
|
+
*/
|
|
131
|
+
function rollbridgePackageVersion(releasePath) {
  const manifestPath = path.join(releasePath, "node_modules", "rollbridge", "package.json")
  let manifest

  try {
    manifest = JSON.parse(fs.readFileSync(manifestPath, "utf8"))
  } catch (error) {
    // A missing manifest just means "no version"; every other failure (including
    // malformed JSON) is propagated to the caller.
    const missing = error && typeof error === "object" && "code" in error && error.code === "ENOENT"

    if (missing) return undefined

    throw error
  }

  // Only a string `version` on an object manifest counts.
  const hasVersion = manifest && typeof manifest === "object" && "version" in manifest && typeof manifest.version === "string"

  return hasVersion ? manifest.version : undefined
}
|
|
146
|
+
|
|
147
|
+
/**
|
|
148
|
+
* @param {object} args - Options.
|
|
149
|
+
* @param {import("./config.js").RollbridgeConfig} args.config - Rollbridge config.
|
|
150
|
+
* @param {(socketPath: string) => Promise<import("./daemon.js").ControlSocketInspection>} args.inspectSocket - Control socket probe.
|
|
151
|
+
* @returns {Promise<void>} Resolves after the socket stops accepting Rollbridge commands.
|
|
152
|
+
*/
|
|
153
|
+
async function waitForControlSocketShutdown({config, inspectSocket}) {
|
|
154
|
+
const deadline = Date.now() + 30000
|
|
155
|
+
|
|
156
|
+
while (true) {
|
|
157
|
+
const inspection = await inspectSocket(config.control.path)
|
|
158
|
+
if (!inspection.alive) return
|
|
159
|
+
|
|
160
|
+
if (Date.now() >= deadline) {
|
|
161
|
+
throw new Error(`Timed out waiting for Rollbridge daemon at ${config.control.path} to shut down.`)
|
|
162
|
+
}
|
|
163
|
+
|
|
164
|
+
await sleep(250)
|
|
165
|
+
}
|
|
166
|
+
}
|
|
167
|
+
|
|
168
|
+
/**
|
|
169
|
+
* @param {import("./config.js").RollbridgeConfig} config - Rollbridge config.
|
|
170
|
+
* @param {(args: {config: import("./config.js").RollbridgeConfig, force: boolean}) => Promise<import("./recover.js").RecoverResult>} recover - Orphan recovery function.
|
|
171
|
+
* @returns {Promise<number>} Number of orphans found.
|
|
172
|
+
*/
|
|
173
|
+
async function recoverConfiguredOrphans(config, recover) {
|
|
174
|
+
if (config.statePath === undefined) return 0
|
|
175
|
+
|
|
176
|
+
const result = await recover({config, force: true})
|
|
177
|
+
|
|
178
|
+
if ("error" in result) {
|
|
179
|
+
throw new Error(result.error)
|
|
180
|
+
}
|
|
181
|
+
|
|
182
|
+
if (result.remaining.length > 0) {
|
|
183
|
+
throw new Error(`Could not stop ${result.remaining.length} Rollbridge orphaned process${result.remaining.length === 1 ? "" : "es"}.`)
|
|
184
|
+
}
|
|
185
|
+
|
|
186
|
+
return result.orphans.length
|
|
187
|
+
}
|
|
188
|
+
|
|
189
|
+
/**
|
|
190
|
+
* @param {object} args - Options.
|
|
191
|
+
* @param {import("./config.js").RollbridgeConfig} args.config - Rollbridge config.
|
|
192
|
+
* @param {(command: string, args: string[]) => import("node:child_process").SpawnSyncReturns<Buffer>} args.runCommand - Command runner.
|
|
193
|
+
* @param {(pid: number, signal: string) => void} args.killProcess - Signal sender.
|
|
194
|
+
* @returns {Promise<ProcessRow[]>} Stopped legacy processes.
|
|
195
|
+
*/
|
|
196
|
+
async function stopLegacyProcesses({config, killProcess, runCommand}) {
  const takeover = config.legacyTakeover

  // Legacy takeover is opt-in: with no config, nothing is touched.
  if (takeover === undefined) return []

  // Ask every configured screen session to quit. The command result is not inspected;
  // the process-table sweep below deals with whatever is still running.
  takeover.screens.forEach((name) => {
    runCommand("screen", ["-S", name, "-X", "quit"])
  })

  const signaled = await stopProcessTree({
    killProcess,
    processRows: legacyProcesses(config),
    timeoutMs: takeover.forceStopTimeoutMs
  })

  // Re-read the process table to confirm that no legacy process is left.
  const survivors = legacyProcesses(config)

  if (survivors.length === 0) return signaled

  const details = survivors.map(({pid, args}) => `${pid} ${args}`).join("\n")

  throw new Error(`Refusing Rollbridge deploy while legacy processes are still running:\n${details}`)
}
|
|
219
|
+
|
|
220
|
+
/**
|
|
221
|
+
* @param {import("./config.js").RollbridgeConfig} config - Rollbridge config.
|
|
222
|
+
* @returns {ProcessRow[]} Legacy process rows and their descendants.
|
|
223
|
+
*/
|
|
224
|
+
function legacyProcesses(config) {
  const table = processRows()
  const shielded = protectedProcessIds(table)

  // Index the table by parent so descendants can be walked directly.
  /** @type {Map<number, number[]>} */
  const childrenOf = new Map()

  for (const {pid, parentPid} of table) {
    const siblings = childrenOf.get(parentPid)

    if (siblings === undefined) {
      childrenOf.set(parentPid, [pid])
    } else {
      siblings.push(pid)
    }
  }

  // Seeds are the rows matching the takeover config; everything reachable from a seed
  // through parent → child links is legacy too.
  /** @type {number[]} */
  const worklist = table.filter((row) => legacySeedProcess(row, config, shielded)).map((row) => row.pid)
  const matched = new Set(worklist)

  // The worklist grows while it is iterated; array iteration visits appended entries.
  for (const parentPid of worklist) {
    for (const childPid of childrenOf.get(parentPid) ?? []) {
      if (matched.has(childPid)) continue

      matched.add(childPid)
      worklist.push(childPid)
    }
  }

  // Return rows in process-table order.
  return table.filter((row) => matched.has(row.pid))
}
|
|
243
|
+
|
|
244
|
+
/**
|
|
245
|
+
* @param {ProcessRow} row - Process row.
|
|
246
|
+
* @param {import("./config.js").RollbridgeConfig} config - Rollbridge config.
|
|
247
|
+
* @param {Set<number>} protectedPids - Current cleanup process and ancestors.
|
|
248
|
+
* @returns {boolean} True when the row identifies a configured legacy process.
|
|
249
|
+
*/
|
|
250
|
+
function legacySeedProcess(row, config, protectedPids) {
|
|
251
|
+
const takeoverConfig = config.legacyTakeover
|
|
252
|
+
if (takeoverConfig === undefined || protectedPids.has(row.pid)) return false
|
|
253
|
+
|
|
254
|
+
if (takeoverConfig.screens.some((screenName) => row.args.includes(`SCREEN -dmS ${screenName}`))) {
|
|
255
|
+
return true
|
|
256
|
+
}
|
|
257
|
+
|
|
258
|
+
return takeoverConfig.processes.some((processConfig) => (
|
|
259
|
+
processConfig.includes.every((matcher) => row.args.includes(matcher))
|
|
260
|
+
))
|
|
261
|
+
}
|
|
262
|
+
|
|
263
|
+
/**
|
|
264
|
+
* @param {ProcessRow[]} rows - Current process table rows.
|
|
265
|
+
* @returns {Set<number>} Pids that belong to the running cleanup command.
|
|
266
|
+
*/
|
|
267
|
+
function protectedProcessIds(rows) {
  /** @type {Map<number, number>} */
  const parentOf = new Map()

  for (const {pid, parentPid} of rows) parentOf.set(pid, parentPid)

  const shielded = new Set([process.pid])

  // Walk up from this process's parent. The walk ends at pid 0, at a pid missing from the
  // table (its parent lookup yields 0), or when a pid repeats.
  for (let ancestor = process.ppid; ancestor > 0 && !shielded.has(ancestor); ancestor = parentOf.get(ancestor) || 0) {
    shielded.add(ancestor)
  }

  return shielded
}
|
|
279
|
+
|
|
280
|
+
/** @returns {ProcessRow[]} Current process table rows. */
|
|
281
|
+
function processRows() {
|
|
282
|
+
const result = spawnSync("ps", ["-eo", "pid=,ppid=,args="], {encoding: "utf8"})
|
|
283
|
+
|
|
284
|
+
if (result.error) throw result.error
|
|
285
|
+
if (result.status !== 0) throw new Error(`Failed to inspect running processes: ${result.stderr}`)
|
|
286
|
+
|
|
287
|
+
return result.stdout.split("\n").flatMap((line) => {
|
|
288
|
+
const match = line.match(/^\s*(\d+)\s+(\d+)\s+(.+)$/)
|
|
289
|
+
if (!match) return []
|
|
290
|
+
|
|
291
|
+
const pid = Number(match[1])
|
|
292
|
+
const parentPid = Number(match[2])
|
|
293
|
+
|
|
294
|
+
return [{args: match[3], parentPid, pid}]
|
|
295
|
+
})
|
|
296
|
+
}
|
|
297
|
+
|
|
298
|
+
/**
|
|
299
|
+
* @param {object} args - Options.
|
|
300
|
+
* @param {(pid: number, signal: string) => void} args.killProcess - Signal sender.
|
|
301
|
+
* @param {ProcessRow[]} args.processRows - Processes to stop.
|
|
302
|
+
* @param {number} args.timeoutMs - Grace period before SIGKILL.
|
|
303
|
+
* @returns {Promise<ProcessRow[]>} Processes that were signaled.
|
|
304
|
+
*/
|
|
305
|
+
async function stopProcessTree({killProcess, processRows, timeoutMs}) {
|
|
306
|
+
/** @type {ProcessRow[]} */
|
|
307
|
+
const stoppedProcesses = []
|
|
308
|
+
|
|
309
|
+
for (const row of processRows) {
|
|
310
|
+
if (sendSignal(row.pid, "SIGTERM", killProcess)) stoppedProcesses.push(row)
|
|
311
|
+
}
|
|
312
|
+
|
|
313
|
+
if (stoppedProcesses.length === 0) return []
|
|
314
|
+
|
|
315
|
+
await sleep(timeoutMs)
|
|
316
|
+
|
|
317
|
+
for (const row of processRows) {
|
|
318
|
+
sendSignal(row.pid, "SIGKILL", killProcess)
|
|
319
|
+
}
|
|
320
|
+
|
|
321
|
+
return stoppedProcesses
|
|
322
|
+
}
|
|
323
|
+
|
|
324
|
+
/**
|
|
325
|
+
* @param {number} pid - Process id.
|
|
326
|
+
* @param {string} signal - Signal name.
|
|
327
|
+
* @param {(pid: number, signal: string) => void} killProcess - Signal sender.
|
|
328
|
+
* @returns {boolean} True when the signal was sent, false when the process was already gone.
|
|
329
|
+
*/
|
|
330
|
+
function sendSignal(pid, signal, killProcess) {
|
|
331
|
+
try {
|
|
332
|
+
killProcess(pid, signal)
|
|
333
|
+
|
|
334
|
+
return true
|
|
335
|
+
} catch (error) {
|
|
336
|
+
if (error && typeof error === "object" && "code" in error && error.code === "ESRCH") return false
|
|
337
|
+
|
|
338
|
+
throw error
|
|
339
|
+
}
|
|
340
|
+
}
|
|
@@ -0,0 +1,110 @@
|
|
|
1
|
+
// @ts-check
|
|
2
|
+
|
|
3
|
+
import fs from "node:fs"
|
|
4
|
+
|
|
5
|
+
/**
|
|
6
|
+
* @typedef {{command: string, pid: number, rssBytes: number | undefined}} ProcessGroupMember
|
|
7
|
+
*/
|
|
8
|
+
|
|
9
|
+
/**
|
|
10
|
+
* Lists the members of a managed process group with each member's resident memory.
|
|
11
|
+
* Rollbridge spawns each process detached, so the spawned pid is the process-group
|
|
12
|
+
* leader and every process in the tree (the shell wrapper, the app, any children)
|
|
13
|
+
* shares that group id.
|
|
14
|
+
*
|
|
15
|
+
* Reads `/proc` (Linux); returns an empty array when unavailable (no `/proc`, e.g.
|
|
16
|
+
* non-Linux) or the group has no members.
|
|
17
|
+
* @param {number} pgid - Process-group id (the detached spawn's pid).
|
|
18
|
+
* @returns {ProcessGroupMember[]} Group members, ordered by pid.
|
|
19
|
+
*/
|
|
20
|
+
export function processGroupMembers(pgid) {
  /** @type {string[]} */
  let procEntries

  try {
    procEntries = fs.readdirSync("/proc")
  } catch {
    // /proc cannot be listed, so there is nothing to report.
    return []
  }

  const members = procEntries.flatMap((name) => {
    // Only all-digit entries are considered, and only those in the requested group.
    if (!/^\d+$/.test(name) || processGroupId(name) !== pgid) return []

    return [{command: commandName(name), pid: Number(name), rssBytes: residentBytes(name)}]
  })

  // Ascending pid order.
  return members.sort((left, right) => left.pid - right.pid)
}
|
|
44
|
+
|
|
45
|
+
/**
|
|
46
|
+
* Measures the total resident memory (RSS) of a managed process group.
|
|
47
|
+
* @param {number} pgid - Process-group id (the detached spawn's pid).
|
|
48
|
+
* @returns {number | undefined} Total resident memory in bytes, or undefined when unmeasurable.
|
|
49
|
+
*/
|
|
50
|
+
export function measureProcessGroupRssBytes(pgid) {
  let totalBytes = 0
  let measuredCount = 0

  // Sum only the members whose resident size could be read.
  for (const {rssBytes} of processGroupMembers(pgid)) {
    if (rssBytes === undefined) continue

    totalBytes += rssBytes
    measuredCount += 1
  }

  // With no measurable member the total is unknown rather than zero.
  return measuredCount === 0 ? undefined : totalBytes
}
|
|
57
|
+
|
|
58
|
+
/**
|
|
59
|
+
* @param {string} pid - Process id.
|
|
60
|
+
* @returns {string} The process command name (`/proc/<pid>/comm`), or "" when unavailable.
|
|
61
|
+
*/
|
|
62
|
+
function commandName(pid) {
  let raw

  try {
    raw = fs.readFileSync(`/proc/${pid}/comm`, "utf8")
  } catch {
    // The file could not be read, so report an empty name.
    return ""
  }

  // Strip surrounding whitespace, including the trailing newline.
  return raw.trim()
}
|
|
69
|
+
|
|
70
|
+
/**
|
|
71
|
+
* @param {string} pid - Process id.
|
|
72
|
+
* @returns {number | undefined} The process-group id, or undefined when the process is gone.
|
|
73
|
+
*/
|
|
74
|
+
function processGroupId(pid) {
  let statLine

  try {
    statLine = fs.readFileSync(`/proc/${pid}/stat`, "utf8")
  } catch {
    return undefined
  }

  // The command name sits in parentheses and may itself contain spaces or parentheses,
  // so parsing starts after the LAST ")". The fields that follow are state, ppid, pgrp, …
  const closingParen = statLine.lastIndexOf(")")

  if (closingParen === -1) return undefined

  const [, , pgrpText] = statLine.slice(closingParen + 2).split(" ")
  const pgrp = Number(pgrpText)

  return Number.isInteger(pgrp) ? pgrp : undefined
}
|
|
93
|
+
|
|
94
|
+
/**
|
|
95
|
+
* @param {string} pid - Process id.
|
|
96
|
+
* @returns {number | undefined} Resident memory in bytes, or undefined when unavailable.
|
|
97
|
+
*/
|
|
98
|
+
function residentBytes(pid) {
  let statusText

  try {
    statusText = fs.readFileSync(`/proc/${pid}/status`, "utf8")
  } catch {
    return undefined
  }

  // The VmRSS line is reported in kB; when the line is absent the size is unknown.
  const rssLine = /^VmRSS:\s+(\d+)\s+kB/m.exec(statusText)

  if (rssLine === null) return undefined

  return Number(rssLine[1]) * 1024
}
|
package/src/recover.js
ADDED
|
@@ -0,0 +1,134 @@
|
|
|
1
|
+
// @ts-check
|
|
2
|
+
|
|
3
|
+
import {inspectControlSocket} from "./daemon.js"
|
|
4
|
+
import {clearState, isProcessAlive, liveProcesses, readState} from "./state-store.js"
|
|
5
|
+
|
|
6
|
+
// How long (in milliseconds) to wait for a SIGKILL'd group to actually exit before it is
// reported as un-stoppable.
const KILL_CONFIRM_TIMEOUT_MS = 500
|
|
8
|
+
|
|
9
|
+
/**
|
|
10
|
+
* @typedef {{id: string, pid: number, releaseId: string | null}} OrphanProcess
|
|
11
|
+
* @typedef {{error: string}} RecoverError
|
|
12
|
+
* @typedef {{cleared: boolean, forced: boolean, orphans: OrphanProcess[], remaining: OrphanProcess[]}} RecoverReport
|
|
13
|
+
* @typedef {RecoverError | RecoverReport} RecoverResult
|
|
14
|
+
*/
|
|
15
|
+
|
|
16
|
+
/**
|
|
17
|
+
* Cleans up orphaned managed processes left by a crashed daemon. Reads the persisted state
|
|
18
|
+
* (config.statePath) and finds managed processes whose pids are still alive. By default it
|
|
19
|
+
* only reports them; with `force` it stops each one's process group (SIGTERM, then SIGKILL
|
|
20
|
+
* after the configured timeout) and clears the stale state file.
|
|
21
|
+
*
|
|
22
|
+
* When `force` leaves any orphan still running (for example a process owned by another user
|
|
23
|
+
* that can't be signaled), the state file is **kept** so the operator can investigate and
|
|
24
|
+
* re-run recovery — the survivors are returned in `remaining` and `cleared` stays false.
|
|
25
|
+
*
|
|
26
|
+
* Refuses to run while a daemon (or another process) holds the control socket — those pids
|
|
27
|
+
* belong to a live daemon, not a crash. A recycled pid can be a false positive, so review the
|
|
28
|
+
* dry-run list before using `force`.
|
|
29
|
+
* @param {object} args - Options.
|
|
30
|
+
* @param {import("./config.js").RollbridgeConfig} args.config - Normalized config.
|
|
31
|
+
* @param {boolean} args.force - Whether to actually stop the orphans (otherwise list them).
|
|
32
|
+
* @param {(pid: number, timeoutMs: number) => Promise<boolean>} [args.stopGroup] - Stops a process group and resolves to whether it is gone afterward (defaults to the real implementation; injectable for tests).
|
|
33
|
+
* @returns {Promise<RecoverResult>} The orphans found and whether they were stopped, or an error.
|
|
34
|
+
*/
|
|
35
|
+
export async function recoverOrphans({config, force, stopGroup = stopProcessGroup}) {
  const {statePath} = config

  if (statePath === undefined) {
    return {error: "No statePath is configured; set statePath in the config to enable recovery."}
  }

  // A live (or un-probeable) control socket means this is not a post-crash situation.
  const socketBusy = await daemonIsRunning(config.control.path)

  if (socketBusy) {
    return {error: `A daemon (or another process) is using ${config.control.path}; stop it before recovering — recover is for cleaning up after a crash.`}
  }

  const orphans = liveProcesses(await readState(statePath))

  // Dry run: report what was found and touch nothing.
  if (!force) return {cleared: false, forced: false, orphans, remaining: []}

  /** @type {OrphanProcess[]} */
  const remaining = []

  // Orphans are stopped one at a time, in order.
  for (const orphan of orphans) {
    const gone = await stopGroup(orphan.pid, config.proxy.forceStopTimeoutMs)

    if (!gone) remaining.push(orphan)
  }

  // The state file is cleared only when every orphan is confirmed gone; otherwise it is
  // kept so the survivors can still be found on the next run.
  const cleared = remaining.length === 0

  if (cleared) await clearState(statePath)

  return {cleared, forced: true, orphans, remaining}
}
|
|
63
|
+
|
|
64
|
+
/**
|
|
65
|
+
* @param {string} socketPath - Control socket path.
|
|
66
|
+
* @returns {Promise<boolean>} True when a process is live on the socket (or it can't be probed).
|
|
67
|
+
*/
|
|
68
|
+
async function daemonIsRunning(socketPath) {
  try {
    const {alive} = await inspectControlSocket(socketPath)

    return alive
  } catch {
    // The probe itself failed, so the state is unknown; answer "running" so the caller
    // refuses to stop anything.
    return true
  }
}
|
|
76
|
+
|
|
77
|
+
/**
|
|
78
|
+
* Stops a detached process group: SIGTERM, then SIGKILL if it outlives the timeout.
|
|
79
|
+
* @param {number} pid - Process-group leader pid (the detached spawn's pid).
|
|
80
|
+
* @param {number} timeoutMs - Grace period before SIGKILL.
|
|
81
|
+
* @returns {Promise<boolean>} True once the process is gone; false if it is still alive afterward (for example owned by another user, so it can't be signaled).
|
|
82
|
+
*/
|
|
83
|
+
async function stopProcessGroup(pid, timeoutMs) {
  // Escalation ladder: each step sends a signal, then waits for the pid to disappear.
  /** @type {Array<{signal: "SIGTERM" | "SIGKILL", waitMs: number}>} */
  const escalation = [
    {signal: "SIGTERM", waitMs: timeoutMs},
    {signal: "SIGKILL", waitMs: KILL_CONFIRM_TIMEOUT_MS}
  ]

  for (const {signal, waitMs} of escalation) {
    const outcome = sendSignal(pid, signal)

    // Nothing left to signal: the group is already gone.
    if (outcome === "gone") return true

    // The signal could not be delivered, so escalating further is pointless.
    if (outcome === "denied") return false

    if (await waitForExit(pid, waitMs)) return true
  }

  // Still alive after SIGKILL and its confirmation window.
  return false
}
|
|
98
|
+
|
|
99
|
+
/**
|
|
100
|
+
* Polls until the pid is no longer alive or the timeout elapses.
|
|
101
|
+
* @param {number} pid - Process pid to watch.
|
|
102
|
+
* @param {number} timeoutMs - How long to wait for it to exit.
|
|
103
|
+
* @returns {Promise<boolean>} True once the process is gone, false if it is still alive at the deadline.
|
|
104
|
+
*/
|
|
105
|
+
async function waitForExit(pid, timeoutMs) {
  const stopPollingAt = Date.now() + timeoutMs

  // Check every 50 ms until the deadline passes.
  for (let now = Date.now(); now < stopPollingAt; now = Date.now()) {
    if (!isProcessAlive(pid)) return true

    await new Promise((resolve) => setTimeout(resolve, 50))
  }

  // One final check at the deadline (also the only check when the timeout is not positive).
  return !isProcessAlive(pid)
}
|
|
116
|
+
|
|
117
|
+
/**
|
|
118
|
+
* Sends a signal to a detached process group, classifying the outcome.
|
|
119
|
+
* @param {number} pid - Process-group leader pid.
|
|
120
|
+
* @param {"SIGTERM" | "SIGKILL"} signal - Signal to send to the group.
|
|
121
|
+
* @returns {"denied" | "gone" | "sent"} `gone` when the group no longer exists (ESRCH), `denied` when it can't be signaled (for example EPERM), otherwise `sent`.
|
|
122
|
+
*/
|
|
123
|
+
function sendSignal(pid, signal) {
|
|
124
|
+
try {
|
|
125
|
+
process.kill(-pid, signal)
|
|
126
|
+
|
|
127
|
+
return "sent"
|
|
128
|
+
} catch (error) {
|
|
129
|
+
if (error && typeof error === "object" && "code" in error && error.code === "ESRCH") return "gone"
|
|
130
|
+
|
|
131
|
+
// EPERM (owned by another user) or anything else: we could not deliver the signal.
|
|
132
|
+
return "denied"
|
|
133
|
+
}
|
|
134
|
+
}
|