rollbridge 0.1.4 → 0.1.6
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/README.md +137 -4
- package/TODO.md +47 -45
- package/docs/cli.md +169 -6
- package/docs/config.md +160 -3
- package/docs/logging.md +77 -0
- package/docs/nginx.md +104 -0
- package/docs/releasing.md +53 -0
- package/docs/tensorbuzz-runbook.md +129 -0
- package/docs/velocious.md +238 -0
- package/docs/workers.md +115 -0
- package/package.json +3 -2
- package/src/cli.js +317 -1
- package/src/config.js +240 -6
- package/src/daemon.js +284 -4
- package/src/doctor.js +177 -0
- package/src/event-log.js +47 -0
- package/src/managed-process.js +287 -22
- package/src/process-memory.js +110 -0
- package/src/recover.js +134 -0
- package/src/release-group.js +80 -21
- package/src/state-store.js +103 -0
- package/src/system-ids.js +71 -0
- package/src/template.js +32 -0
- package/test/completion.test.js +64 -0
- package/test/config-validation.test.js +267 -0
- package/test/doctor.test.js +205 -3
- package/test/event-log.test.js +46 -0
- package/test/fixtures/memory-hog.js +19 -0
- package/test/managed-process.test.js +376 -0
- package/test/process-memory.test.js +40 -0
- package/test/recover.test.js +162 -0
- package/test/release-group.test.js +22 -0
- package/test/rollbridge.test.js +716 -6
- package/test/state-store.test.js +69 -0
- package/test/system-ids.test.js +24 -0
- package/scripts/release-patch.js +0 -83
package/src/daemon.js
CHANGED
|
@@ -4,13 +4,19 @@ import fs from "node:fs/promises"
|
|
|
4
4
|
import http from "node:http"
|
|
5
5
|
import net from "node:net"
|
|
6
6
|
import httpProxy from "http-proxy"
|
|
7
|
+
import EventLog from "./event-log.js"
|
|
7
8
|
import ReleaseGroup from "./release-group.js"
|
|
9
|
+
import {clearState, isProcessAlive, liveProcesses, readState, writeState} from "./state-store.js"
|
|
10
|
+
import {resolveGroupId, resolveUserId} from "./system-ids.js"
|
|
11
|
+
|
|
12
|
+
const EVENT_HISTORY_LIMIT = 1000
|
|
13
|
+
const STATE_PERSIST_INTERVAL_MS = 5000
|
|
8
14
|
|
|
9
15
|
/**
|
|
10
16
|
* @typedef {import("./json.js").JsonValue} JsonValue
|
|
11
17
|
* @typedef {{releaseId?: string, releasePath: string, revision?: string}} DeployArgs
|
|
12
18
|
* @typedef {{id: string, process: import("./managed-process.js").ManagedProcessStatus}} ProcessStatus
|
|
13
|
-
* @typedef {{activeReleaseId: string | null, application: string, control: import("./config.js").ControlConfig, proxy: {host: string, port: number | undefined, upstreamHost: string}, releases: import("./release-group.js").ReleaseStatus[], services: ProcessStatus[], singletons: ProcessStatus[]}} DaemonStatus
|
|
19
|
+
* @typedef {{activeReleaseId: string | null, application: string, control: import("./config.js").ControlConfig, orphans: {id: string, pid: number, releaseId: string | null}[], proxy: {host: string, port: number | undefined, upstreamHost: string}, releases: import("./release-group.js").ReleaseStatus[], services: ProcessStatus[], singletons: ProcessStatus[]}} DaemonStatus
|
|
14
20
|
*/
|
|
15
21
|
|
|
16
22
|
export default class RollbridgeDaemon {
|
|
@@ -21,7 +27,18 @@ export default class RollbridgeDaemon {
|
|
|
21
27
|
*/
|
|
22
28
|
constructor({config, logger}) {
|
|
23
29
|
this.config = config
|
|
24
|
-
this.
|
|
30
|
+
this.eventLog = new EventLog(EVENT_HISTORY_LIMIT)
|
|
31
|
+
|
|
32
|
+
const baseLogger = logger || ((message, data = {}) => console.log(JSON.stringify({at: new Date().toISOString(), data, message})))
|
|
33
|
+
|
|
34
|
+
// Every operational milestone is logged through this.logger, so recording here
|
|
35
|
+
// gives a structured event history for free (deploys, switches, stops, crashes,
|
|
36
|
+
// restarts, and failed commands).
|
|
37
|
+
this.logger = /** @type {(message: string, data?: Record<string, JsonValue>) => void} */ ((message, data = {}) => {
|
|
38
|
+
this.eventLog.record(message, data)
|
|
39
|
+
baseLogger(message, data)
|
|
40
|
+
})
|
|
41
|
+
|
|
25
42
|
this.releases = /** @type {Map<string, ReleaseGroup>} */ (new Map())
|
|
26
43
|
this.services = /** @type {Map<string, import("./managed-process.js").default>} */ (new Map())
|
|
27
44
|
this.servicePorts = /** @type {Record<string, number>} */ ({})
|
|
@@ -32,14 +49,22 @@ export default class RollbridgeDaemon {
|
|
|
32
49
|
this.controlServer = /** @type {net.Server | undefined} */ (undefined)
|
|
33
50
|
this.proxyPort = /** @type {number | undefined} */ (undefined)
|
|
34
51
|
this.stopping = false
|
|
52
|
+
this.statePath = config.statePath
|
|
53
|
+
this.persistTimer = /** @type {ReturnType<typeof setInterval> | undefined} */ (undefined)
|
|
54
|
+
this.pendingWrite = /** @type {Promise<void> | undefined} */ (undefined)
|
|
55
|
+
// Still-alive managed processes left by a previous daemon (from statePath), captured at
|
|
56
|
+
// startup and surfaced in status(). The daemon cannot re-manage them, only report them.
|
|
57
|
+
this.orphans = /** @type {{id: string, pid: number, releaseId: string | null}[]} */ ([])
|
|
35
58
|
|
|
36
59
|
this.proxy.on("error", (error, req, res) => this.onProxyError(error, req, res))
|
|
37
60
|
}
|
|
38
61
|
|
|
39
62
|
/** @returns {Promise<void>} Starts proxy and control listeners. */
|
|
40
63
|
async start() {
|
|
64
|
+
await this.reportOrphans()
|
|
41
65
|
await this.startProxy()
|
|
42
66
|
await this.startControlServer()
|
|
67
|
+
this.startStatePersistence()
|
|
43
68
|
}
|
|
44
69
|
|
|
45
70
|
/** @returns {Promise<void>} Starts the stable local proxy. */
|
|
@@ -78,6 +103,30 @@ export default class RollbridgeDaemon {
|
|
|
78
103
|
if (this.config.control.mode !== undefined) {
|
|
79
104
|
await fs.chmod(this.config.control.path, this.config.control.mode)
|
|
80
105
|
}
|
|
106
|
+
|
|
107
|
+
await this.applyControlSocketOwnership()
|
|
108
|
+
}
|
|
109
|
+
|
|
110
|
+
/**
|
|
111
|
+
* Applies control.owner/control.group to the bound socket via chown, resolving names to ids.
|
|
112
|
+
* @returns {Promise<void>} Resolves once ownership is applied (no-op when neither is set).
|
|
113
|
+
*/
|
|
114
|
+
async applyControlSocketOwnership() {
|
|
115
|
+
const {group, owner, path: socketPath} = this.config.control
|
|
116
|
+
|
|
117
|
+
if (owner === undefined && group === undefined) return
|
|
118
|
+
|
|
119
|
+
// -1 leaves the uid/gid unchanged (POSIX chown semantics).
|
|
120
|
+
const uid = owner === undefined ? -1 : resolveUserId(owner)
|
|
121
|
+
const gid = group === undefined ? -1 : resolveGroupId(group)
|
|
122
|
+
|
|
123
|
+
try {
|
|
124
|
+
await fs.chown(socketPath, uid, gid)
|
|
125
|
+
} catch (error) {
|
|
126
|
+
const reason = error instanceof Error ? error.message : String(error)
|
|
127
|
+
|
|
128
|
+
throw new Error(`Could not set control socket owner/group on ${socketPath}: ${reason}. Run the daemon as a user allowed to chown it (for example root, or a member of the target group).`, {cause: error})
|
|
129
|
+
}
|
|
81
130
|
}
|
|
82
131
|
|
|
83
132
|
/** @returns {Promise<void>} Removes a stale Unix socket before binding, or fails clearly when a daemon is alive. */
|
|
@@ -195,6 +244,7 @@ export default class RollbridgeDaemon {
|
|
|
195
244
|
this.executeControlLine(line)
|
|
196
245
|
.then((response) => socket.write(`${JSON.stringify({status: "success", ...response})}\n`))
|
|
197
246
|
.catch((error) => {
|
|
247
|
+
this.logger("command failed", {error: error instanceof Error ? error.message : String(error)})
|
|
198
248
|
socket.write(`${JSON.stringify({
|
|
199
249
|
error: error instanceof Error ? error.message : String(error),
|
|
200
250
|
status: "error"
|
|
@@ -228,11 +278,26 @@ export default class RollbridgeDaemon {
|
|
|
228
278
|
return this.status()
|
|
229
279
|
}
|
|
230
280
|
|
|
281
|
+
if (commandName === "events") {
|
|
282
|
+
return {events: this.eventLog.recent(typeof data.limit === "number" ? data.limit : undefined)}
|
|
283
|
+
}
|
|
284
|
+
|
|
231
285
|
if (commandName === "stop") {
|
|
232
286
|
await this.stopRelease(stringOrUndefined(data.releaseId))
|
|
233
287
|
return this.status()
|
|
234
288
|
}
|
|
235
289
|
|
|
290
|
+
if (commandName === "restart") {
|
|
291
|
+
return await this.restartProcesses({
|
|
292
|
+
policy: stringOrUndefined(data.policy),
|
|
293
|
+
processId: stringOrUndefined(data.processId)
|
|
294
|
+
})
|
|
295
|
+
}
|
|
296
|
+
|
|
297
|
+
if (commandName === "rollback") {
|
|
298
|
+
return await this.rollback({releaseId: stringOrUndefined(data.releaseId)})
|
|
299
|
+
}
|
|
300
|
+
|
|
236
301
|
if (commandName === "shutdown") {
|
|
237
302
|
setImmediate(() => {
|
|
238
303
|
this.shutdown().catch((error) => {
|
|
@@ -271,6 +336,7 @@ export default class RollbridgeDaemon {
|
|
|
271
336
|
await this.ensureServices(release, startedServices)
|
|
272
337
|
await release.start()
|
|
273
338
|
} catch (error) {
|
|
339
|
+
this.logger("deploy failed", {error: error instanceof Error ? error.message : String(error), releaseId: newReleaseId})
|
|
274
340
|
await this.stopStartedServices(startedServices)
|
|
275
341
|
throw error
|
|
276
342
|
}
|
|
@@ -289,12 +355,60 @@ export default class RollbridgeDaemon {
|
|
|
289
355
|
void this.drainAndPrune(previousRelease)
|
|
290
356
|
}
|
|
291
357
|
|
|
358
|
+
this.persistState()
|
|
359
|
+
|
|
292
360
|
return {
|
|
293
361
|
activeReleaseId: release.releaseId,
|
|
294
362
|
previousReleaseId: previousRelease ? previousRelease.releaseId : null
|
|
295
363
|
}
|
|
296
364
|
}
|
|
297
365
|
|
|
366
|
+
/**
|
|
367
|
+
* Rolls back to a previously-active release by re-running the deploy flow on its
|
|
368
|
+
* retained metadata: it re-starts the target release, health-checks it, switches
|
|
369
|
+
* traffic, replaces singletons, and drains the current release — just like a deploy,
|
|
370
|
+
* so a failed rollback leaves the current release active.
|
|
371
|
+
* @param {{releaseId?: string}} [args] - Target release id; defaults to the most recently retired release.
|
|
372
|
+
* @returns {Promise<Record<string, JsonValue>>} The rollback result.
|
|
373
|
+
*/
|
|
374
|
+
async rollback({releaseId} = {}) {
|
|
375
|
+
const target = releaseId ? this.releases.get(releaseId) : this.previousRelease()
|
|
376
|
+
|
|
377
|
+
if (!target) {
|
|
378
|
+
throw new Error(releaseId ? `No retained release "${releaseId}" to roll back to.` : "No previous release to roll back to.")
|
|
379
|
+
}
|
|
380
|
+
|
|
381
|
+
if (target === this.activeRelease) {
|
|
382
|
+
throw new Error(`Release "${target.releaseId}" is already active.`)
|
|
383
|
+
}
|
|
384
|
+
|
|
385
|
+
// The target may still be draining a prior deploy (live processes). Stop it before the
|
|
386
|
+
// deploy below re-uses its id in this.releases, otherwise the still-running instance
|
|
387
|
+
// would be dropped from status/pruning/shutdown and could be orphaned.
|
|
388
|
+
if (target.state !== "stopped" && target.state !== "failed") {
|
|
389
|
+
await target.stop()
|
|
390
|
+
}
|
|
391
|
+
|
|
392
|
+
this.logger("rollback starting", {releaseId: target.releaseId, releasePath: target.releasePath})
|
|
393
|
+
|
|
394
|
+
return await this.deploy({releaseId: target.releaseId, releasePath: target.releasePath, revision: target.revision})
|
|
395
|
+
}
|
|
396
|
+
|
|
397
|
+
/**
|
|
398
|
+
* @returns {ReleaseGroup | undefined} The most recently active release other than the current one, if any.
|
|
399
|
+
*/
|
|
400
|
+
previousRelease() {
|
|
401
|
+
/** @type {ReleaseGroup | undefined} */
|
|
402
|
+
let previous
|
|
403
|
+
|
|
404
|
+
for (const release of this.releases.values()) {
|
|
405
|
+
if (release === this.activeRelease || !release.activatedAt) continue
|
|
406
|
+
if (!previous || Date.parse(release.activatedAt) >= Date.parse(/** @type {string} */ (previous.activatedAt))) previous = release
|
|
407
|
+
}
|
|
408
|
+
|
|
409
|
+
return previous
|
|
410
|
+
}
|
|
411
|
+
|
|
298
412
|
/**
|
|
299
413
|
* Starts missing daemon-wide services before release-owned processes need them.
|
|
300
414
|
* @param {ReleaseGroup} release - Release providing templates and ports.
|
|
@@ -317,7 +431,7 @@ export default class RollbridgeDaemon {
|
|
|
317
431
|
}
|
|
318
432
|
|
|
319
433
|
try {
|
|
320
|
-
await service.start()
|
|
434
|
+
await service.start("deploy")
|
|
321
435
|
startedServices.push(processConfig.id)
|
|
322
436
|
} catch (error) {
|
|
323
437
|
this.services.delete(processConfig.id)
|
|
@@ -363,10 +477,14 @@ export default class RollbridgeDaemon {
|
|
|
363
477
|
command: nextDefinition.command,
|
|
364
478
|
cwd: nextDefinition.cwd,
|
|
365
479
|
env: nextDefinition.env,
|
|
480
|
+
lifecycle: nextDefinition.lifecycle,
|
|
366
481
|
logger: nextDefinition.logger,
|
|
482
|
+
memory: nextDefinition.memory,
|
|
367
483
|
outputLines: nextDefinition.outputLines,
|
|
484
|
+
restart: nextDefinition.restart,
|
|
368
485
|
restartDelayMs: nextDefinition.restartDelayMs,
|
|
369
486
|
shouldRestart: nextDefinition.shouldRestart,
|
|
487
|
+
stopSignal: nextDefinition.stopSignal,
|
|
370
488
|
stopTimeoutMs: nextDefinition.stopTimeoutMs
|
|
371
489
|
})
|
|
372
490
|
}
|
|
@@ -390,10 +508,90 @@ export default class RollbridgeDaemon {
|
|
|
390
508
|
const singleton = release.buildProcess(processConfig)
|
|
391
509
|
|
|
392
510
|
this.singletons.set(processConfig.id, singleton)
|
|
393
|
-
await singleton.start()
|
|
511
|
+
await singleton.start("deploy")
|
|
394
512
|
}
|
|
395
513
|
}
|
|
396
514
|
|
|
515
|
+
/**
|
|
516
|
+
* Restarts non-proxied processes selected by id or policy, or all of them: running
|
|
517
|
+
* processes are bounced (stop then start) and crashed or stopped ones are revived,
|
|
518
|
+
* matching the conventional meaning of "restart".
|
|
519
|
+
*
|
|
520
|
+
* The proxied process is never restarted in place (that would drop traffic); use a
|
|
521
|
+
* deploy for a zero-downtime replacement.
|
|
522
|
+
* @param {{policy?: string, processId?: string}} selector - Restart selector; restarts all non-proxied processes when both are omitted.
|
|
523
|
+
* @returns {Promise<Record<string, JsonValue>>} The ids that were restarted.
|
|
524
|
+
*/
|
|
525
|
+
async restartProcesses({policy, processId} = {}) {
|
|
526
|
+
if (policy === "proxied" || (processId !== undefined && this.isProxiedId(processId))) {
|
|
527
|
+
throw new Error('The proxied process cannot be restarted in place; use "rollbridge deploy" for a zero-downtime replacement.')
|
|
528
|
+
}
|
|
529
|
+
|
|
530
|
+
const targets = this.collectRestartTargets({policy, processId})
|
|
531
|
+
|
|
532
|
+
if (processId !== undefined && targets.length === 0) {
|
|
533
|
+
throw new Error(`No managed process with id "${processId}" to restart.`)
|
|
534
|
+
}
|
|
535
|
+
|
|
536
|
+
for (const target of targets) {
|
|
537
|
+
this.logger("process restart requested", {processId: target.id})
|
|
538
|
+
await target.process.stop()
|
|
539
|
+
await target.process.start("manual")
|
|
540
|
+
}
|
|
541
|
+
|
|
542
|
+
return {restarted: targets.map((target) => target.id)}
|
|
543
|
+
}
|
|
544
|
+
|
|
545
|
+
/**
|
|
546
|
+
* @param {{policy?: string, processId?: string}} selector - Restart selector.
|
|
547
|
+
* @returns {{id: string, process: import("./managed-process.js").default}[]} Running non-proxied processes matching the selector.
|
|
548
|
+
*/
|
|
549
|
+
collectRestartTargets({policy, processId}) {
|
|
550
|
+
const targets = /** @type {{id: string, process: import("./managed-process.js").default}[]} */ ([])
|
|
551
|
+
|
|
552
|
+
for (const processConfig of this.config.processes) {
|
|
553
|
+
if (processConfig.policy === "proxied") continue
|
|
554
|
+
if (policy !== undefined && processConfig.policy !== policy) continue
|
|
555
|
+
|
|
556
|
+
for (const instance of this.runningInstances(processConfig)) {
|
|
557
|
+
// A processId selector matches the base config id (all replicas) or one replica's id.
|
|
558
|
+
if (processId !== undefined && processId !== processConfig.id && processId !== instance.id) continue
|
|
559
|
+
|
|
560
|
+
targets.push(instance)
|
|
561
|
+
}
|
|
562
|
+
}
|
|
563
|
+
|
|
564
|
+
return targets
|
|
565
|
+
}
|
|
566
|
+
|
|
567
|
+
/**
|
|
568
|
+
* @param {import("./config.js").ProcessConfig} processConfig - Process definition.
|
|
569
|
+
* @returns {{id: string, process: import("./managed-process.js").default}[]} Running instances (replicas) for this config.
|
|
570
|
+
*/
|
|
571
|
+
runningInstances(processConfig) {
|
|
572
|
+
if (processConfig.policy === "service") {
|
|
573
|
+
const service = this.services.get(processConfig.id)
|
|
574
|
+
|
|
575
|
+
return service ? [{id: processConfig.id, process: service}] : []
|
|
576
|
+
}
|
|
577
|
+
|
|
578
|
+
if (processConfig.policy === "singleton") {
|
|
579
|
+
const singleton = this.singletons.get(processConfig.id)
|
|
580
|
+
|
|
581
|
+
return singleton ? [{id: processConfig.id, process: singleton}] : []
|
|
582
|
+
}
|
|
583
|
+
|
|
584
|
+
return this.activeRelease ? this.activeRelease.getProcesses(processConfig.id) : []
|
|
585
|
+
}
|
|
586
|
+
|
|
587
|
+
/**
|
|
588
|
+
* @param {string} id - Process id.
|
|
589
|
+
* @returns {boolean} True when the id belongs to the proxied process.
|
|
590
|
+
*/
|
|
591
|
+
isProxiedId(id) {
|
|
592
|
+
return this.config.processes.some((processConfig) => processConfig.policy === "proxied" && processConfig.id === id)
|
|
593
|
+
}
|
|
594
|
+
|
|
397
595
|
/**
|
|
398
596
|
* @param {string | undefined} releaseId - Release id, or active release when omitted.
|
|
399
597
|
* @returns {Promise<void>} Resolves when stopped.
|
|
@@ -405,7 +603,9 @@ export default class RollbridgeDaemon {
|
|
|
405
603
|
if (release === this.activeRelease) this.activeRelease = undefined
|
|
406
604
|
|
|
407
605
|
await release.stop()
|
|
606
|
+
this.logger("release stopped", {releaseId: release.releaseId})
|
|
408
607
|
this.pruneStoppedReleases()
|
|
608
|
+
this.persistState()
|
|
409
609
|
}
|
|
410
610
|
|
|
411
611
|
/**
|
|
@@ -416,10 +616,12 @@ export default class RollbridgeDaemon {
|
|
|
416
616
|
async drainAndPrune(release) {
|
|
417
617
|
try {
|
|
418
618
|
await release.drainAndStop(this.config.proxy.drainTimeoutMs)
|
|
619
|
+
this.logger("release drained", {releaseId: release.releaseId})
|
|
419
620
|
} catch (error) {
|
|
420
621
|
this.logger("release drain failed", {error: error instanceof Error ? error.message : String(error), releaseId: release.releaseId})
|
|
421
622
|
} finally {
|
|
422
623
|
this.pruneStoppedReleases()
|
|
624
|
+
this.persistState()
|
|
423
625
|
}
|
|
424
626
|
}
|
|
425
627
|
|
|
@@ -432,11 +634,75 @@ export default class RollbridgeDaemon {
|
|
|
432
634
|
}
|
|
433
635
|
}
|
|
434
636
|
|
|
637
|
+
/** @returns {void} Starts periodic state persistence when statePath is configured. */
|
|
638
|
+
startStatePersistence() {
|
|
639
|
+
if (!this.statePath) return
|
|
640
|
+
|
|
641
|
+
this.persistState()
|
|
642
|
+
this.persistTimer = setInterval(() => this.persistState(), STATE_PERSIST_INTERVAL_MS)
|
|
643
|
+
this.persistTimer.unref?.()
|
|
644
|
+
}
|
|
645
|
+
|
|
646
|
+
/**
|
|
647
|
+
* Persists a state snapshot (status plus recent events) to statePath, atomically and
|
|
648
|
+
* fire-and-forget. A failed write is logged but never blocks daemon operation.
|
|
649
|
+
* @returns {void}
|
|
650
|
+
*/
|
|
651
|
+
persistState() {
|
|
652
|
+
if (!this.statePath || this.stopping) return
|
|
653
|
+
|
|
654
|
+
const statePath = this.statePath
|
|
655
|
+
// Drop the orphans view from the snapshot: it reflects a *previous* daemon's leftovers, not
|
|
656
|
+
// this daemon's own managed state, and is recomputed from the persisted processes on restart.
|
|
657
|
+
const {orphans: _orphans, ...status} = this.status()
|
|
658
|
+
const snapshot = {...status, events: this.eventLog.recent(), persistedAt: new Date().toISOString()}
|
|
659
|
+
|
|
660
|
+
// Serialize writes (and track the tail) so shutdown can wait for an in-flight write before
|
|
661
|
+
// clearing the file — otherwise a write started before shutdown could recreate it afterward.
|
|
662
|
+
this.pendingWrite = Promise.resolve(this.pendingWrite)
|
|
663
|
+
.catch(() => {})
|
|
664
|
+
.then(() => writeState(statePath, snapshot))
|
|
665
|
+
.catch((error) => {
|
|
666
|
+
this.logger("state persist failed", {error: error instanceof Error ? error.message : String(error)})
|
|
667
|
+
})
|
|
668
|
+
}
|
|
669
|
+
|
|
670
|
+
/**
|
|
671
|
+
* On startup, reads any state left by a previous daemon and reports managed processes whose
|
|
672
|
+
* pids are still alive — likely orphans from a daemon that did not shut down cleanly. This is
|
|
673
|
+
* advisory (Rollbridge cannot re-adopt detached children); the operator stops the leftovers.
|
|
674
|
+
* A recycled pid could be a false positive, so reports are a prompt to investigate.
|
|
675
|
+
* @returns {Promise<void>} Resolves once orphans are reported.
|
|
676
|
+
*/
|
|
677
|
+
async reportOrphans() {
|
|
678
|
+
if (!this.statePath) return
|
|
679
|
+
|
|
680
|
+
const orphans = liveProcesses(await readState(this.statePath))
|
|
681
|
+
|
|
682
|
+
// Keep them for status() so `rollbridge status` reflects still-running children after a
|
|
683
|
+
// restart, not just the startup log below.
|
|
684
|
+
this.orphans = orphans
|
|
685
|
+
|
|
686
|
+
for (const orphan of orphans) {
|
|
687
|
+
this.logger("orphaned managed process detected", {pid: orphan.pid, processId: orphan.id, releaseId: orphan.releaseId})
|
|
688
|
+
}
|
|
689
|
+
|
|
690
|
+
if (orphans.length > 0) {
|
|
691
|
+
this.logger("orphaned processes from a previous daemon", {count: orphans.length, hint: "a previous daemon did not shut down cleanly; verify these pids and stop any leftovers"})
|
|
692
|
+
}
|
|
693
|
+
}
|
|
694
|
+
|
|
435
695
|
/** @returns {Promise<void>} Stops proxy, control socket, and child processes. */
|
|
436
696
|
async shutdown() {
|
|
437
697
|
if (this.stopping) return
|
|
438
698
|
|
|
439
699
|
this.stopping = true
|
|
700
|
+
|
|
701
|
+
if (this.persistTimer) {
|
|
702
|
+
clearInterval(this.persistTimer)
|
|
703
|
+
this.persistTimer = undefined
|
|
704
|
+
}
|
|
705
|
+
|
|
440
706
|
this.proxy.close()
|
|
441
707
|
await Promise.allSettled([...this.services.values()].map((processInstance) => processInstance.stop()))
|
|
442
708
|
await Promise.allSettled([...this.singletons.values()].map((processInstance) => processInstance.stop()))
|
|
@@ -444,6 +710,14 @@ export default class RollbridgeDaemon {
|
|
|
444
710
|
await this.closeServer(this.proxyServer)
|
|
445
711
|
await this.closeServer(this.controlServer)
|
|
446
712
|
await fs.rm(this.config.control.path, {force: true})
|
|
713
|
+
|
|
714
|
+
// A clean shutdown leaves no orphans, so remove the state file rather than leaving stale
|
|
715
|
+
// pids. Wait for any in-flight write first so it can't recreate the file afterward (no new
|
|
716
|
+
// writes start: stopping is set and the persist timer is cleared above).
|
|
717
|
+
if (this.statePath) {
|
|
718
|
+
if (this.pendingWrite) await this.pendingWrite
|
|
719
|
+
await clearState(this.statePath)
|
|
720
|
+
}
|
|
447
721
|
}
|
|
448
722
|
|
|
449
723
|
/**
|
|
@@ -463,10 +737,16 @@ export default class RollbridgeDaemon {
|
|
|
463
737
|
|
|
464
738
|
/** @returns {DaemonStatus} Status payload. */
|
|
465
739
|
status() {
|
|
740
|
+
// Re-check liveness and prune the dead permanently, so the list self-clears as the operator
|
|
741
|
+
// stops the leftovers (e.g. via `rollbridge recover`). Pruning (not just filtering) matters:
|
|
742
|
+
// a cleared orphan must not reappear if the OS later recycles its pid for an unrelated process.
|
|
743
|
+
this.orphans = this.orphans.filter((orphan) => isProcessAlive(orphan.pid))
|
|
744
|
+
|
|
466
745
|
return {
|
|
467
746
|
activeReleaseId: this.activeRelease ? this.activeRelease.releaseId : null,
|
|
468
747
|
application: this.config.application,
|
|
469
748
|
control: {...this.config.control},
|
|
749
|
+
orphans: [...this.orphans],
|
|
470
750
|
proxy: {
|
|
471
751
|
host: this.config.proxy.host,
|
|
472
752
|
port: this.proxyPort ?? this.config.proxy.port,
|
package/src/doctor.js
CHANGED
|
@@ -5,9 +5,12 @@ import fs from "node:fs/promises"
|
|
|
5
5
|
import net from "node:net"
|
|
6
6
|
import path from "node:path"
|
|
7
7
|
import {inspectControlSocket} from "./daemon.js"
|
|
8
|
+
import {liveProcesses, readState} from "./state-store.js"
|
|
9
|
+
import {processTemplateContext, renderObject, renderTemplate} from "./template.js"
|
|
8
10
|
|
|
9
11
|
/**
|
|
10
12
|
* @typedef {{detail: string, name: string, ok: boolean}} DoctorCheck
|
|
13
|
+
* @typedef {{cwd: string, id: string, ok: true} | {error: string, id: string, ok: false}} ProcessRender
|
|
11
14
|
*/
|
|
12
15
|
|
|
13
16
|
/**
|
|
@@ -25,9 +28,55 @@ export async function runEnvironmentChecks(config) {
|
|
|
25
28
|
checks.push(await controlSocketDirectoryCheck(config))
|
|
26
29
|
checks.push(await proxyPortCheck(config))
|
|
27
30
|
|
|
31
|
+
if (config.statePath !== undefined) {
|
|
32
|
+
// A live daemon persists its own (live) pids into the state file, so they are not orphans.
|
|
33
|
+
const daemonRunning = !("error" in socketInspection) && socketInspection.alive
|
|
34
|
+
|
|
35
|
+
checks.push(await statePathDirectoryCheck(config.statePath))
|
|
36
|
+
checks.push(await orphanCheck(config.statePath, daemonRunning))
|
|
37
|
+
}
|
|
38
|
+
|
|
28
39
|
return checks
|
|
29
40
|
}
|
|
30
41
|
|
|
42
|
+
/**
 * Probes the directory that would hold the state file for write and search (execute)
 * permission.
 * @param {string} statePath - Configured state file path.
 * @returns {Promise<DoctorCheck>} Whether the state file's directory is writable.
 */
async function statePathDirectoryCheck(statePath) {
  const directory = path.dirname(path.resolve(statePath))
  let writable = true

  try {
    await fs.access(directory, fsConstants.W_OK | fsConstants.X_OK)
  } catch {
    // Any access failure (missing directory, no permission) is reported as a failed check.
    writable = false
  }

  const detail = writable ? `${directory} is writable` : `${directory} is missing or not writable; state cannot be persisted`

  return {detail, name: "state path directory", ok: writable}
}
|
|
57
|
+
|
|
58
|
+
/**
 * Reports still-alive processes recorded in the state file, unless a daemon is currently
 * running (in which case the state file is not read at all).
 * @param {string} statePath - Configured state file path.
 * @param {boolean} daemonRunning - Whether a Rollbridge daemon is currently live on the control socket.
 * @returns {Promise<DoctorCheck>} Whether any orphaned managed processes from a prior daemon are still alive.
 */
async function orphanCheck(statePath, daemonRunning) {
  const name = "orphaned processes"
  const result = (/** @type {string} */ detail, /** @type {boolean} */ ok) => ({detail, name, ok})

  // The pids in the state file belong to the running daemon: they are managed, not orphaned.
  if (daemonRunning) return result("a daemon is running; its managed processes are not orphans", true)

  const leftovers = liveProcesses(await readState(statePath))
  const count = leftovers.length

  if (count === 0) return result("no leftover processes from a previous daemon", true)

  const noun = count === 1 ? "process" : "processes"
  const summary = leftovers.map(({id, pid}) => `${id} (pid ${pid})`).join(", ")

  return result(`${count} possible orphaned ${noun} still running: ${summary} — verify and stop any leftovers`, false)
}
|
|
79
|
+
|
|
31
80
|
/**
|
|
32
81
|
* @param {string} socketPath - Control socket path.
|
|
33
82
|
* @returns {Promise<{alive: boolean, application?: string} | {error: string}>} Probe result, or the probe error.
|
|
@@ -112,3 +161,131 @@ async function canBindPort(host, port) {
|
|
|
112
161
|
server.listen(port, host, () => server.close(() => resolve({ok: true})))
|
|
113
162
|
})
|
|
114
163
|
}
|
|
164
|
+
|
|
165
|
+
/**
 * Deploy-time checks for one specific release: the release directory exists, every
 * process's command/cwd/env templates resolve, and each rendered working directory exists.
 * The per-release values only exist at deploy time, so the operator passes them in (the
 * release path, optionally an id/revision). A missing id falls back to the revision and
 * then to the release directory's basename; a missing revision falls back to the id.
 * Template ports are taken from representativePorts(config).
 * @param {import("./config.js").RollbridgeConfig} config - Normalized config.
 * @param {{releaseId?: string, releasePath: string, revision?: string}} release - Release to render against.
 * @returns {Promise<DoctorCheck[]>} One check per probed aspect.
 */
export async function runReleaseChecks(config, release) {
  const releasePath = path.resolve(release.releasePath)
  const releaseId = release.releaseId || release.revision || path.basename(releasePath)
  const shared = {
    application: config.application,
    ports: representativePorts(config),
    proxy: config.proxy,
    releaseId,
    releasePath,
    revision: release.revision || releaseId
  }
  const renders = config.processes.map((processConfig) => renderProcess(processConfig, shared))

  const releaseDirectory = await releasePathCheck(releasePath)
  const templates = templateCheck(renders)
  const workingDirectories = await workingDirectoryCheck(renders)

  return [releaseDirectory, templates, workingDirectories]
}
|
|
184
|
+
|
|
185
|
+
/**
 * Maps each process id that has a port range to that range's `from` value; processes
 * without a port are omitted.
 * @param {import("./config.js").RollbridgeConfig} config - Normalized config.
 * @returns {Record<string, number>} The ports a deploy would allocate, using each range's low end.
 */
function representativePorts(config) {
  return config.processes.reduce((ports, {id, port}) => {
    if (port) ports[id] = port.from

    return ports
  }, /** @type {Record<string, number>} */ ({}))
}
|
|
199
|
+
|
|
200
|
+
/**
 * Renders one process's cwd, command, and env templates against a deploy-time context
 * built for replica index 0. Only the cwd result is kept; command and env are rendered
 * purely to surface template errors.
 * @param {import("./config.js").ProcessConfig} processConfig - Process to render.
 * @param {{application: string, ports: Record<string, number>, proxy: import("./config.js").ProxyConfig, releaseId: string, releasePath: string, revision: string}} shared - Shared render inputs.
 * @returns {ProcessRender} The rendered cwd, or the first template error.
 */
function renderProcess(processConfig, shared) {
  const {application, ports, proxy, releaseId, releasePath, revision} = shared
  const {command, cwd: cwdTemplate, env, id, replicas} = processConfig

  const context = processTemplateContext({
    application,
    ports,
    processId: id,
    proxy,
    releaseId,
    releasePath,
    replicaCount: replicas,
    replicaIndex: 0,
    revision
  })

  try {
    // Without a cwd template the process runs from the release directory itself.
    let cwd = releasePath

    if (cwdTemplate) cwd = renderTemplate(cwdTemplate, context)

    renderTemplate(command, context)
    renderObject(env, context)

    return {cwd: path.resolve(releasePath, cwd), id, ok: true}
  } catch (error) {
    const message = error instanceof Error ? error.message : String(error)

    return {error: message, id, ok: false}
  }
}
|
|
230
|
+
|
|
231
|
+
/**
 * Turns the isDirectory() result for the release path into a doctor check.
 * @param {string} releasePath - Resolved release directory.
 * @returns {Promise<DoctorCheck>} Whether the release directory exists.
 */
async function releasePathCheck(releasePath) {
  const exists = Boolean(await isDirectory(releasePath))
  const detail = exists ? `${releasePath} exists` : `${releasePath} is missing or not a directory`

  return {detail, name: "release path", ok: exists}
}
|
|
242
|
+
|
|
243
|
+
/**
 * Doctor check summarising the per-process template render results.
 * @param {ProcessRender[]} renders - Per-process render results.
 * @returns {DoctorCheck} Passing check when every render succeeded; otherwise a failing check listing each failed process id with its error.
 */
function templateCheck(renders) {
  /** @type {string[]} */
  const failures = []

  for (const render of renders) {
    if (!render.ok) failures.push(`${render.id}: ${render.error}`)
  }

  if (failures.length > 0) {
    return {detail: `unresolved templates — ${failures.join("; ")}`, name: "process templates", ok: false}
  }

  return {detail: `all ${renders.length} process command/cwd/env templates resolve`, name: "process templates", ok: true}
}
|
|
256
|
+
|
|
257
|
+
/**
 * Doctor check verifying that the working directory of every successfully rendered process exists.
 * Renders that failed are skipped and are not counted.
 * @param {ProcessRender[]} renders - Per-process render results.
 * @returns {Promise<DoctorCheck>} Passing check when no rendered working directory is missing; otherwise a failing check naming each missing one.
 */
async function workingDirectoryCheck(renders) {
  /** @type {string[]} */
  const absent = []
  let inspected = 0

  for (const render of renders) {
    if (render.ok) {
      inspected += 1

      const present = await isDirectory(render.cwd)

      if (!present) absent.push(`${render.id} (${render.cwd})`)
    }
  }

  if (absent.length > 0) {
    const noun = absent.length === 1 ? "directory" : "directories"

    return {detail: `missing working ${noun}: ${absent.join(", ")}`, name: "process working directories", ok: false}
  }

  const phrase = inspected === 1 ? "directory exists" : "directories exist"

  return {detail: `all ${inspected} process working ${phrase}`, name: "process working directories", ok: true}
}
|
|
280
|
+
|
|
281
|
+
/**
 * Best-effort directory probe: any failure while inspecting the path counts as "not a directory".
 * @param {string} target - Path to test.
 * @returns {Promise<boolean>} True when the path exists and is a directory.
 */
async function isDirectory(target) {
  let found = false

  try {
    const stats = await fs.stat(target)

    found = stats.isDirectory()
  } catch {
    // Deliberately ignored: an unreadable or missing path is reported as false.
  }

  return found
}
|
package/src/event-log.js
ADDED
|
@@ -0,0 +1,47 @@
|
|
|
1
|
+
// @ts-check
|
|
2
|
+
|
|
3
|
+
/**
|
|
4
|
+
* @typedef {import("./json.js").JsonValue} JsonValue
|
|
5
|
+
* @typedef {{at: string, data: Record<string, JsonValue>, message: string}} DaemonEvent
|
|
6
|
+
*/
|
|
7
|
+
|
|
8
|
+
/**
 * In-memory, size-capped history of structured daemon events (deploys, traffic
 * switches, stops, crashes, restarts, and failed commands). When the cap is
 * exceeded the oldest entries are discarded so only the newest remain.
 */
export default class EventLog {
  /**
   * @param {number} limit - Maximum number of events to retain.
   */
  constructor(limit) {
    this.limit = limit

    /** @type {DaemonEvent[]} */
    this.events = []
  }

  /**
   * Stores a new event stamped with the current time, then trims the history back to the limit.
   * @param {string} message - Event type/message.
   * @param {Record<string, JsonValue>} data - Structured event payload.
   * @returns {void}
   */
  record(message, data) {
    const at = new Date().toISOString()

    this.events.push({at, data, message})

    // Trim in place from the front so the array keeps only the newest entries.
    const overflow = this.events.length - this.limit

    if (overflow > 0) this.events.splice(0, overflow)
  }

  /**
   * Returns a copy of the tail of the history.
   * @param {number} [limit] - Maximum number of most-recent events to return; all when omitted or invalid.
   * @returns {DaemonEvent[]} The most recent events, oldest first.
   */
  recent(limit) {
    const total = this.events.length
    const narrows = typeof limit === "number" && Number.isFinite(limit) && limit > 0 && limit < total

    return narrows ? this.events.slice(total - limit) : this.events.slice()
  }
}
|