rollbridge 0.1.5 → 0.1.6
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/README.md +104 -4
- package/TODO.md +42 -40
- package/docs/cli.md +146 -6
- package/docs/config.md +139 -2
- package/docs/logging.md +77 -0
- package/docs/releasing.md +53 -0
- package/docs/tensorbuzz-runbook.md +129 -0
- package/docs/velocious.md +49 -11
- package/docs/workers.md +115 -0
- package/package.json +1 -1
- package/src/cli.js +290 -1
- package/src/config.js +169 -6
- package/src/daemon.js +216 -13
- package/src/doctor.js +177 -0
- package/src/event-log.js +47 -0
- package/src/managed-process.js +225 -16
- package/src/process-memory.js +110 -0
- package/src/recover.js +134 -0
- package/src/release-group.js +71 -21
- package/src/state-store.js +103 -0
- package/src/system-ids.js +71 -0
- package/src/template.js +32 -0
- package/test/completion.test.js +64 -0
- package/test/config-validation.test.js +227 -0
- package/test/doctor.test.js +205 -3
- package/test/event-log.test.js +46 -0
- package/test/fixtures/memory-hog.js +19 -0
- package/test/managed-process.test.js +290 -0
- package/test/process-memory.test.js +40 -0
- package/test/recover.test.js +162 -0
- package/test/release-group.test.js +22 -0
- package/test/rollbridge.test.js +523 -6
- package/test/state-store.test.js +69 -0
- package/test/system-ids.test.js +24 -0
package/src/daemon.js
CHANGED
|
@@ -4,13 +4,19 @@ import fs from "node:fs/promises"
|
|
|
4
4
|
import http from "node:http"
|
|
5
5
|
import net from "node:net"
|
|
6
6
|
import httpProxy from "http-proxy"
|
|
7
|
+
import EventLog from "./event-log.js"
|
|
7
8
|
import ReleaseGroup from "./release-group.js"
|
|
9
|
+
import {clearState, isProcessAlive, liveProcesses, readState, writeState} from "./state-store.js"
|
|
10
|
+
import {resolveGroupId, resolveUserId} from "./system-ids.js"
|
|
11
|
+
|
|
12
|
+
const EVENT_HISTORY_LIMIT = 1000
|
|
13
|
+
const STATE_PERSIST_INTERVAL_MS = 5000
|
|
8
14
|
|
|
9
15
|
/**
|
|
10
16
|
* @typedef {import("./json.js").JsonValue} JsonValue
|
|
11
17
|
* @typedef {{releaseId?: string, releasePath: string, revision?: string}} DeployArgs
|
|
12
18
|
* @typedef {{id: string, process: import("./managed-process.js").ManagedProcessStatus}} ProcessStatus
|
|
13
|
-
* @typedef {{activeReleaseId: string | null, application: string, control: import("./config.js").ControlConfig, proxy: {host: string, port: number | undefined, upstreamHost: string}, releases: import("./release-group.js").ReleaseStatus[], services: ProcessStatus[], singletons: ProcessStatus[]}} DaemonStatus
|
|
19
|
+
* @typedef {{activeReleaseId: string | null, application: string, control: import("./config.js").ControlConfig, orphans: {id: string, pid: number, releaseId: string | null}[], proxy: {host: string, port: number | undefined, upstreamHost: string}, releases: import("./release-group.js").ReleaseStatus[], services: ProcessStatus[], singletons: ProcessStatus[]}} DaemonStatus
|
|
14
20
|
*/
|
|
15
21
|
|
|
16
22
|
export default class RollbridgeDaemon {
|
|
@@ -21,7 +27,18 @@ export default class RollbridgeDaemon {
|
|
|
21
27
|
*/
|
|
22
28
|
constructor({config, logger}) {
|
|
23
29
|
this.config = config
|
|
24
|
-
this.
|
|
30
|
+
this.eventLog = new EventLog(EVENT_HISTORY_LIMIT)
|
|
31
|
+
|
|
32
|
+
const baseLogger = logger || ((message, data = {}) => console.log(JSON.stringify({at: new Date().toISOString(), data, message})))
|
|
33
|
+
|
|
34
|
+
// Every operational milestone is logged through this.logger, so recording here
|
|
35
|
+
// gives a structured event history for free (deploys, switches, stops, crashes,
|
|
36
|
+
// restarts, and failed commands).
|
|
37
|
+
this.logger = /** @type {(message: string, data?: Record<string, JsonValue>) => void} */ ((message, data = {}) => {
|
|
38
|
+
this.eventLog.record(message, data)
|
|
39
|
+
baseLogger(message, data)
|
|
40
|
+
})
|
|
41
|
+
|
|
25
42
|
this.releases = /** @type {Map<string, ReleaseGroup>} */ (new Map())
|
|
26
43
|
this.services = /** @type {Map<string, import("./managed-process.js").default>} */ (new Map())
|
|
27
44
|
this.servicePorts = /** @type {Record<string, number>} */ ({})
|
|
@@ -32,14 +49,22 @@ export default class RollbridgeDaemon {
|
|
|
32
49
|
this.controlServer = /** @type {net.Server | undefined} */ (undefined)
|
|
33
50
|
this.proxyPort = /** @type {number | undefined} */ (undefined)
|
|
34
51
|
this.stopping = false
|
|
52
|
+
this.statePath = config.statePath
|
|
53
|
+
this.persistTimer = /** @type {ReturnType<typeof setInterval> | undefined} */ (undefined)
|
|
54
|
+
this.pendingWrite = /** @type {Promise<void> | undefined} */ (undefined)
|
|
55
|
+
// Still-alive managed processes left by a previous daemon (from statePath), captured at
|
|
56
|
+
// startup and surfaced in status(). The daemon cannot re-manage them, only report them.
|
|
57
|
+
this.orphans = /** @type {{id: string, pid: number, releaseId: string | null}[]} */ ([])
|
|
35
58
|
|
|
36
59
|
this.proxy.on("error", (error, req, res) => this.onProxyError(error, req, res))
|
|
37
60
|
}
|
|
38
61
|
|
|
39
62
|
/** @returns {Promise<void>} Starts proxy and control listeners. */
|
|
40
63
|
async start() {
|
|
64
|
+
await this.reportOrphans()
|
|
41
65
|
await this.startProxy()
|
|
42
66
|
await this.startControlServer()
|
|
67
|
+
this.startStatePersistence()
|
|
43
68
|
}
|
|
44
69
|
|
|
45
70
|
/** @returns {Promise<void>} Starts the stable local proxy. */
|
|
@@ -78,6 +103,30 @@ export default class RollbridgeDaemon {
|
|
|
78
103
|
if (this.config.control.mode !== undefined) {
|
|
79
104
|
await fs.chmod(this.config.control.path, this.config.control.mode)
|
|
80
105
|
}
|
|
106
|
+
|
|
107
|
+
await this.applyControlSocketOwnership()
|
|
108
|
+
}
|
|
109
|
+
|
|
110
|
+
/**
|
|
111
|
+
* Applies control.owner/control.group to the bound socket via chown, resolving names to ids.
|
|
112
|
+
* @returns {Promise<void>} Resolves once ownership is applied (no-op when neither is set).
|
|
113
|
+
*/
|
|
114
|
+
async applyControlSocketOwnership() {
|
|
115
|
+
const {group, owner, path: socketPath} = this.config.control
|
|
116
|
+
|
|
117
|
+
if (owner === undefined && group === undefined) return
|
|
118
|
+
|
|
119
|
+
// -1 leaves the uid/gid unchanged (POSIX chown semantics).
|
|
120
|
+
const uid = owner === undefined ? -1 : resolveUserId(owner)
|
|
121
|
+
const gid = group === undefined ? -1 : resolveGroupId(group)
|
|
122
|
+
|
|
123
|
+
try {
|
|
124
|
+
await fs.chown(socketPath, uid, gid)
|
|
125
|
+
} catch (error) {
|
|
126
|
+
const reason = error instanceof Error ? error.message : String(error)
|
|
127
|
+
|
|
128
|
+
throw new Error(`Could not set control socket owner/group on ${socketPath}: ${reason}. Run the daemon as a user allowed to chown it (for example root, or a member of the target group).`, {cause: error})
|
|
129
|
+
}
|
|
81
130
|
}
|
|
82
131
|
|
|
83
132
|
/** @returns {Promise<void>} Removes a stale Unix socket before binding, or fails clearly when a daemon is alive. */
|
|
@@ -195,6 +244,7 @@ export default class RollbridgeDaemon {
|
|
|
195
244
|
this.executeControlLine(line)
|
|
196
245
|
.then((response) => socket.write(`${JSON.stringify({status: "success", ...response})}\n`))
|
|
197
246
|
.catch((error) => {
|
|
247
|
+
this.logger("command failed", {error: error instanceof Error ? error.message : String(error)})
|
|
198
248
|
socket.write(`${JSON.stringify({
|
|
199
249
|
error: error instanceof Error ? error.message : String(error),
|
|
200
250
|
status: "error"
|
|
@@ -228,6 +278,10 @@ export default class RollbridgeDaemon {
|
|
|
228
278
|
return this.status()
|
|
229
279
|
}
|
|
230
280
|
|
|
281
|
+
if (commandName === "events") {
|
|
282
|
+
return {events: this.eventLog.recent(typeof data.limit === "number" ? data.limit : undefined)}
|
|
283
|
+
}
|
|
284
|
+
|
|
231
285
|
if (commandName === "stop") {
|
|
232
286
|
await this.stopRelease(stringOrUndefined(data.releaseId))
|
|
233
287
|
return this.status()
|
|
@@ -240,6 +294,10 @@ export default class RollbridgeDaemon {
|
|
|
240
294
|
})
|
|
241
295
|
}
|
|
242
296
|
|
|
297
|
+
if (commandName === "rollback") {
|
|
298
|
+
return await this.rollback({releaseId: stringOrUndefined(data.releaseId)})
|
|
299
|
+
}
|
|
300
|
+
|
|
243
301
|
if (commandName === "shutdown") {
|
|
244
302
|
setImmediate(() => {
|
|
245
303
|
this.shutdown().catch((error) => {
|
|
@@ -278,6 +336,7 @@ export default class RollbridgeDaemon {
|
|
|
278
336
|
await this.ensureServices(release, startedServices)
|
|
279
337
|
await release.start()
|
|
280
338
|
} catch (error) {
|
|
339
|
+
this.logger("deploy failed", {error: error instanceof Error ? error.message : String(error), releaseId: newReleaseId})
|
|
281
340
|
await this.stopStartedServices(startedServices)
|
|
282
341
|
throw error
|
|
283
342
|
}
|
|
@@ -296,12 +355,60 @@ export default class RollbridgeDaemon {
|
|
|
296
355
|
void this.drainAndPrune(previousRelease)
|
|
297
356
|
}
|
|
298
357
|
|
|
358
|
+
this.persistState()
|
|
359
|
+
|
|
299
360
|
return {
|
|
300
361
|
activeReleaseId: release.releaseId,
|
|
301
362
|
previousReleaseId: previousRelease ? previousRelease.releaseId : null
|
|
302
363
|
}
|
|
303
364
|
}
|
|
304
365
|
|
|
366
|
+
/**
|
|
367
|
+
* Rolls back to a previously-active release by re-running the deploy flow on its
|
|
368
|
+
* retained metadata: it re-starts the target release, health-checks it, switches
|
|
369
|
+
* traffic, replaces singletons, and drains the current release — just like a deploy,
|
|
370
|
+
* so a failed rollback leaves the current release active.
|
|
371
|
+
* @param {{releaseId?: string}} [args] - Target release id; defaults to the most recently retired release.
|
|
372
|
+
* @returns {Promise<Record<string, JsonValue>>} The rollback result.
|
|
373
|
+
*/
|
|
374
|
+
async rollback({releaseId} = {}) {
|
|
375
|
+
const target = releaseId ? this.releases.get(releaseId) : this.previousRelease()
|
|
376
|
+
|
|
377
|
+
if (!target) {
|
|
378
|
+
throw new Error(releaseId ? `No retained release "${releaseId}" to roll back to.` : "No previous release to roll back to.")
|
|
379
|
+
}
|
|
380
|
+
|
|
381
|
+
if (target === this.activeRelease) {
|
|
382
|
+
throw new Error(`Release "${target.releaseId}" is already active.`)
|
|
383
|
+
}
|
|
384
|
+
|
|
385
|
+
// The target may still be draining a prior deploy (live processes). Stop it before the
|
|
386
|
+
// deploy below re-uses its id in this.releases, otherwise the still-running instance
|
|
387
|
+
// would be dropped from status/pruning/shutdown and could be orphaned.
|
|
388
|
+
if (target.state !== "stopped" && target.state !== "failed") {
|
|
389
|
+
await target.stop()
|
|
390
|
+
}
|
|
391
|
+
|
|
392
|
+
this.logger("rollback starting", {releaseId: target.releaseId, releasePath: target.releasePath})
|
|
393
|
+
|
|
394
|
+
return await this.deploy({releaseId: target.releaseId, releasePath: target.releasePath, revision: target.revision})
|
|
395
|
+
}
|
|
396
|
+
|
|
397
|
+
/**
|
|
398
|
+
* @returns {ReleaseGroup | undefined} The most recently active release other than the current one, if any.
|
|
399
|
+
*/
|
|
400
|
+
previousRelease() {
|
|
401
|
+
/** @type {ReleaseGroup | undefined} */
|
|
402
|
+
let previous
|
|
403
|
+
|
|
404
|
+
for (const release of this.releases.values()) {
|
|
405
|
+
if (release === this.activeRelease || !release.activatedAt) continue
|
|
406
|
+
if (!previous || Date.parse(release.activatedAt) >= Date.parse(/** @type {string} */ (previous.activatedAt))) previous = release
|
|
407
|
+
}
|
|
408
|
+
|
|
409
|
+
return previous
|
|
410
|
+
}
|
|
411
|
+
|
|
305
412
|
/**
|
|
306
413
|
* Starts missing daemon-wide services before release-owned processes need them.
|
|
307
414
|
* @param {ReleaseGroup} release - Release providing templates and ports.
|
|
@@ -324,7 +431,7 @@ export default class RollbridgeDaemon {
|
|
|
324
431
|
}
|
|
325
432
|
|
|
326
433
|
try {
|
|
327
|
-
await service.start()
|
|
434
|
+
await service.start("deploy")
|
|
328
435
|
startedServices.push(processConfig.id)
|
|
329
436
|
} catch (error) {
|
|
330
437
|
this.services.delete(processConfig.id)
|
|
@@ -370,11 +477,14 @@ export default class RollbridgeDaemon {
|
|
|
370
477
|
command: nextDefinition.command,
|
|
371
478
|
cwd: nextDefinition.cwd,
|
|
372
479
|
env: nextDefinition.env,
|
|
480
|
+
lifecycle: nextDefinition.lifecycle,
|
|
373
481
|
logger: nextDefinition.logger,
|
|
482
|
+
memory: nextDefinition.memory,
|
|
374
483
|
outputLines: nextDefinition.outputLines,
|
|
375
484
|
restart: nextDefinition.restart,
|
|
376
485
|
restartDelayMs: nextDefinition.restartDelayMs,
|
|
377
486
|
shouldRestart: nextDefinition.shouldRestart,
|
|
487
|
+
stopSignal: nextDefinition.stopSignal,
|
|
378
488
|
stopTimeoutMs: nextDefinition.stopTimeoutMs
|
|
379
489
|
})
|
|
380
490
|
}
|
|
@@ -398,7 +508,7 @@ export default class RollbridgeDaemon {
|
|
|
398
508
|
const singleton = release.buildProcess(processConfig)
|
|
399
509
|
|
|
400
510
|
this.singletons.set(processConfig.id, singleton)
|
|
401
|
-
await singleton.start()
|
|
511
|
+
await singleton.start("deploy")
|
|
402
512
|
}
|
|
403
513
|
}
|
|
404
514
|
|
|
@@ -426,7 +536,7 @@ export default class RollbridgeDaemon {
|
|
|
426
536
|
for (const target of targets) {
|
|
427
537
|
this.logger("process restart requested", {processId: target.id})
|
|
428
538
|
await target.process.stop()
|
|
429
|
-
await target.process.start()
|
|
539
|
+
await target.process.start("manual")
|
|
430
540
|
}
|
|
431
541
|
|
|
432
542
|
return {restarted: targets.map((target) => target.id)}
|
|
@@ -441,12 +551,14 @@ export default class RollbridgeDaemon {
|
|
|
441
551
|
|
|
442
552
|
for (const processConfig of this.config.processes) {
|
|
443
553
|
if (processConfig.policy === "proxied") continue
|
|
444
|
-
if (processId !== undefined && processConfig.id !== processId) continue
|
|
445
554
|
if (policy !== undefined && processConfig.policy !== policy) continue
|
|
446
555
|
|
|
447
|
-
const
|
|
556
|
+
for (const instance of this.runningInstances(processConfig)) {
|
|
557
|
+
// A processId selector matches the base config id (all replicas) or one replica's id.
|
|
558
|
+
if (processId !== undefined && processId !== processConfig.id && processId !== instance.id) continue
|
|
448
559
|
|
|
449
|
-
|
|
560
|
+
targets.push(instance)
|
|
561
|
+
}
|
|
450
562
|
}
|
|
451
563
|
|
|
452
564
|
return targets
|
|
@@ -454,13 +566,22 @@ export default class RollbridgeDaemon {
|
|
|
454
566
|
|
|
455
567
|
/**
|
|
456
568
|
* @param {import("./config.js").ProcessConfig} processConfig - Process definition.
|
|
457
|
-
* @returns {import("./managed-process.js").default
|
|
569
|
+
* @returns {{id: string, process: import("./managed-process.js").default}[]} Running instances (replicas) for this config.
|
|
458
570
|
*/
|
|
459
|
-
|
|
460
|
-
if (processConfig.policy === "service")
|
|
461
|
-
|
|
571
|
+
runningInstances(processConfig) {
|
|
572
|
+
if (processConfig.policy === "service") {
|
|
573
|
+
const service = this.services.get(processConfig.id)
|
|
574
|
+
|
|
575
|
+
return service ? [{id: processConfig.id, process: service}] : []
|
|
576
|
+
}
|
|
462
577
|
|
|
463
|
-
|
|
578
|
+
if (processConfig.policy === "singleton") {
|
|
579
|
+
const singleton = this.singletons.get(processConfig.id)
|
|
580
|
+
|
|
581
|
+
return singleton ? [{id: processConfig.id, process: singleton}] : []
|
|
582
|
+
}
|
|
583
|
+
|
|
584
|
+
return this.activeRelease ? this.activeRelease.getProcesses(processConfig.id) : []
|
|
464
585
|
}
|
|
465
586
|
|
|
466
587
|
/**
|
|
@@ -482,7 +603,9 @@ export default class RollbridgeDaemon {
|
|
|
482
603
|
if (release === this.activeRelease) this.activeRelease = undefined
|
|
483
604
|
|
|
484
605
|
await release.stop()
|
|
606
|
+
this.logger("release stopped", {releaseId: release.releaseId})
|
|
485
607
|
this.pruneStoppedReleases()
|
|
608
|
+
this.persistState()
|
|
486
609
|
}
|
|
487
610
|
|
|
488
611
|
/**
|
|
@@ -493,10 +616,12 @@ export default class RollbridgeDaemon {
|
|
|
493
616
|
async drainAndPrune(release) {
|
|
494
617
|
try {
|
|
495
618
|
await release.drainAndStop(this.config.proxy.drainTimeoutMs)
|
|
619
|
+
this.logger("release drained", {releaseId: release.releaseId})
|
|
496
620
|
} catch (error) {
|
|
497
621
|
this.logger("release drain failed", {error: error instanceof Error ? error.message : String(error), releaseId: release.releaseId})
|
|
498
622
|
} finally {
|
|
499
623
|
this.pruneStoppedReleases()
|
|
624
|
+
this.persistState()
|
|
500
625
|
}
|
|
501
626
|
}
|
|
502
627
|
|
|
@@ -509,11 +634,75 @@ export default class RollbridgeDaemon {
|
|
|
509
634
|
}
|
|
510
635
|
}
|
|
511
636
|
|
|
637
|
+
/** @returns {void} Starts periodic state persistence when statePath is configured. */
|
|
638
|
+
startStatePersistence() {
|
|
639
|
+
if (!this.statePath) return
|
|
640
|
+
|
|
641
|
+
this.persistState()
|
|
642
|
+
this.persistTimer = setInterval(() => this.persistState(), STATE_PERSIST_INTERVAL_MS)
|
|
643
|
+
this.persistTimer.unref?.()
|
|
644
|
+
}
|
|
645
|
+
|
|
646
|
+
/**
|
|
647
|
+
* Persists a state snapshot (status plus recent events) to statePath, atomically and
|
|
648
|
+
* fire-and-forget. A failed write is logged but never blocks daemon operation.
|
|
649
|
+
* @returns {void}
|
|
650
|
+
*/
|
|
651
|
+
persistState() {
|
|
652
|
+
if (!this.statePath || this.stopping) return
|
|
653
|
+
|
|
654
|
+
const statePath = this.statePath
|
|
655
|
+
// Drop the orphans view from the snapshot: it reflects a *previous* daemon's leftovers, not
|
|
656
|
+
// this daemon's own managed state, and is recomputed from the persisted processes on restart.
|
|
657
|
+
const {orphans: _orphans, ...status} = this.status()
|
|
658
|
+
const snapshot = {...status, events: this.eventLog.recent(), persistedAt: new Date().toISOString()}
|
|
659
|
+
|
|
660
|
+
// Serialize writes (and track the tail) so shutdown can wait for an in-flight write before
|
|
661
|
+
// clearing the file — otherwise a write started before shutdown could recreate it afterward.
|
|
662
|
+
this.pendingWrite = Promise.resolve(this.pendingWrite)
|
|
663
|
+
.catch(() => {})
|
|
664
|
+
.then(() => writeState(statePath, snapshot))
|
|
665
|
+
.catch((error) => {
|
|
666
|
+
this.logger("state persist failed", {error: error instanceof Error ? error.message : String(error)})
|
|
667
|
+
})
|
|
668
|
+
}
|
|
669
|
+
|
|
670
|
+
/**
|
|
671
|
+
* On startup, reads any state left by a previous daemon and reports managed processes whose
|
|
672
|
+
* pids are still alive — likely orphans from a daemon that did not shut down cleanly. This is
|
|
673
|
+
* advisory (Rollbridge cannot re-adopt detached children); the operator stops the leftovers.
|
|
674
|
+
* A recycled pid could be a false positive, so reports are a prompt to investigate.
|
|
675
|
+
* @returns {Promise<void>} Resolves once orphans are reported.
|
|
676
|
+
*/
|
|
677
|
+
async reportOrphans() {
|
|
678
|
+
if (!this.statePath) return
|
|
679
|
+
|
|
680
|
+
const orphans = liveProcesses(await readState(this.statePath))
|
|
681
|
+
|
|
682
|
+
// Keep them for status() so `rollbridge status` reflects still-running children after a
|
|
683
|
+
// restart, not just the startup log below.
|
|
684
|
+
this.orphans = orphans
|
|
685
|
+
|
|
686
|
+
for (const orphan of orphans) {
|
|
687
|
+
this.logger("orphaned managed process detected", {pid: orphan.pid, processId: orphan.id, releaseId: orphan.releaseId})
|
|
688
|
+
}
|
|
689
|
+
|
|
690
|
+
if (orphans.length > 0) {
|
|
691
|
+
this.logger("orphaned processes from a previous daemon", {count: orphans.length, hint: "a previous daemon did not shut down cleanly; verify these pids and stop any leftovers"})
|
|
692
|
+
}
|
|
693
|
+
}
|
|
694
|
+
|
|
512
695
|
/** @returns {Promise<void>} Stops proxy, control socket, and child processes. */
|
|
513
696
|
async shutdown() {
|
|
514
697
|
if (this.stopping) return
|
|
515
698
|
|
|
516
699
|
this.stopping = true
|
|
700
|
+
|
|
701
|
+
if (this.persistTimer) {
|
|
702
|
+
clearInterval(this.persistTimer)
|
|
703
|
+
this.persistTimer = undefined
|
|
704
|
+
}
|
|
705
|
+
|
|
517
706
|
this.proxy.close()
|
|
518
707
|
await Promise.allSettled([...this.services.values()].map((processInstance) => processInstance.stop()))
|
|
519
708
|
await Promise.allSettled([...this.singletons.values()].map((processInstance) => processInstance.stop()))
|
|
@@ -521,6 +710,14 @@ export default class RollbridgeDaemon {
|
|
|
521
710
|
await this.closeServer(this.proxyServer)
|
|
522
711
|
await this.closeServer(this.controlServer)
|
|
523
712
|
await fs.rm(this.config.control.path, {force: true})
|
|
713
|
+
|
|
714
|
+
// A clean shutdown leaves no orphans, so remove the state file rather than leaving stale
|
|
715
|
+
// pids. Wait for any in-flight write first so it can't recreate the file afterward (no new
|
|
716
|
+
// writes start: stopping is set and the persist timer is cleared above).
|
|
717
|
+
if (this.statePath) {
|
|
718
|
+
if (this.pendingWrite) await this.pendingWrite
|
|
719
|
+
await clearState(this.statePath)
|
|
720
|
+
}
|
|
524
721
|
}
|
|
525
722
|
|
|
526
723
|
/**
|
|
@@ -540,10 +737,16 @@ export default class RollbridgeDaemon {
|
|
|
540
737
|
|
|
541
738
|
/** @returns {DaemonStatus} Status payload. */
|
|
542
739
|
status() {
|
|
740
|
+
// Re-check liveness and prune the dead permanently, so the list self-clears as the operator
|
|
741
|
+
// stops the leftovers (e.g. via `rollbridge recover`). Pruning (not just filtering) matters:
|
|
742
|
+
// a cleared orphan must not reappear if the OS later recycles its pid for an unrelated process.
|
|
743
|
+
this.orphans = this.orphans.filter((orphan) => isProcessAlive(orphan.pid))
|
|
744
|
+
|
|
543
745
|
return {
|
|
544
746
|
activeReleaseId: this.activeRelease ? this.activeRelease.releaseId : null,
|
|
545
747
|
application: this.config.application,
|
|
546
748
|
control: {...this.config.control},
|
|
749
|
+
orphans: [...this.orphans],
|
|
547
750
|
proxy: {
|
|
548
751
|
host: this.config.proxy.host,
|
|
549
752
|
port: this.proxyPort ?? this.config.proxy.port,
|
package/src/doctor.js
CHANGED
|
@@ -5,9 +5,12 @@ import fs from "node:fs/promises"
|
|
|
5
5
|
import net from "node:net"
|
|
6
6
|
import path from "node:path"
|
|
7
7
|
import {inspectControlSocket} from "./daemon.js"
|
|
8
|
+
import {liveProcesses, readState} from "./state-store.js"
|
|
9
|
+
import {processTemplateContext, renderObject, renderTemplate} from "./template.js"
|
|
8
10
|
|
|
9
11
|
/**
|
|
10
12
|
* @typedef {{detail: string, name: string, ok: boolean}} DoctorCheck
|
|
13
|
+
* @typedef {{cwd: string, id: string, ok: true} | {error: string, id: string, ok: false}} ProcessRender
|
|
11
14
|
*/
|
|
12
15
|
|
|
13
16
|
/**
|
|
@@ -25,9 +28,55 @@ export async function runEnvironmentChecks(config) {
|
|
|
25
28
|
checks.push(await controlSocketDirectoryCheck(config))
|
|
26
29
|
checks.push(await proxyPortCheck(config))
|
|
27
30
|
|
|
31
|
+
if (config.statePath !== undefined) {
|
|
32
|
+
// A live daemon persists its own (live) pids into the state file, so they are not orphans.
|
|
33
|
+
const daemonRunning = !("error" in socketInspection) && socketInspection.alive
|
|
34
|
+
|
|
35
|
+
checks.push(await statePathDirectoryCheck(config.statePath))
|
|
36
|
+
checks.push(await orphanCheck(config.statePath, daemonRunning))
|
|
37
|
+
}
|
|
38
|
+
|
|
28
39
|
return checks
|
|
29
40
|
}
|
|
30
41
|
|
|
42
|
+
/**
 * Doctor check: can the directory that would hold the state file be written to and entered?
 * @param {string} statePath - Configured state file path (resolved against the current working directory).
 * @returns {Promise<DoctorCheck>} Passing check when the directory grants write and execute access, failing check otherwise.
 */
async function statePathDirectoryCheck(statePath) {
  const name = "state path directory"
  const directory = path.dirname(path.resolve(statePath))
  let writable = true

  try {
    await fs.access(directory, fsConstants.W_OK | fsConstants.X_OK)
  } catch {
    // Any access failure (missing directory, no permission) counts as "cannot persist".
    writable = false
  }

  if (writable) return {detail: `${directory} is writable`, name, ok: true}

  return {detail: `${directory} is missing or not writable; state cannot be persisted`, name, ok: false}
}
|
|
57
|
+
|
|
58
|
+
/**
 * Doctor check for processes left behind by a previous daemon.
 * When a daemon is live on the control socket the state file is not read at all: the pids in
 * it belong to that daemon. Otherwise every entry that liveProcesses() reports as alive is
 * listed as a possible orphan.
 * @param {string} statePath - Configured state file path.
 * @param {boolean} daemonRunning - Whether a Rollbridge daemon is currently live on the control socket.
 * @returns {Promise<DoctorCheck>} Failing check listing the live leftovers, or a passing check when there are none.
 */
async function orphanCheck(statePath, daemonRunning) {
  const name = "orphaned processes"

  if (daemonRunning) {
    return {detail: "a daemon is running; its managed processes are not orphans", name, ok: true}
  }

  const leftovers = liveProcesses(await readState(statePath))
  const count = leftovers.length

  if (count === 0) {
    return {detail: "no leftover processes from a previous daemon", name, ok: true}
  }

  const noun = count === 1 ? "process" : "processes"
  const listing = leftovers.map(({id, pid}) => `${id} (pid ${pid})`).join(", ")

  return {detail: `${count} possible orphaned ${noun} still running: ${listing} — verify and stop any leftovers`, name, ok: false}
}
|
|
79
|
+
|
|
31
80
|
/**
|
|
32
81
|
* @param {string} socketPath - Control socket path.
|
|
33
82
|
* @returns {Promise<{alive: boolean, application?: string} | {error: string}>} Probe result, or the probe error.
|
|
@@ -112,3 +161,131 @@ async function canBindPort(host, port) {
|
|
|
112
161
|
server.listen(port, host, () => server.close(() => resolve({ok: true})))
|
|
113
162
|
})
|
|
114
163
|
}
|
|
164
|
+
|
|
165
|
+
/**
 * Deploy-time doctor checks for one concrete release. Produces three checks, in this order:
 * the release directory exists, every process's command/cwd/env templates render, and every
 * rendered working directory exists. The release id falls back to the revision and then to
 * the release directory's basename; the revision falls back to the release id. Ports are
 * stood in for by representativePorts().
 * @param {import("./config.js").RollbridgeConfig} config - Normalized config.
 * @param {{releaseId?: string, releasePath: string, revision?: string}} release - Release to render against.
 * @returns {Promise<DoctorCheck[]>} One check per probed aspect.
 */
export async function runReleaseChecks(config, release) {
  const releasePath = path.resolve(release.releasePath)
  const releaseId = release.releaseId || release.revision || path.basename(releasePath)
  const shared = {
    application: config.application,
    ports: representativePorts(config),
    proxy: config.proxy,
    releaseId,
    releasePath,
    revision: release.revision || releaseId
  }
  const renders = config.processes.map((processConfig) => renderProcess(processConfig, shared))

  const releaseDirectory = await releasePathCheck(releasePath)
  const templates = templateCheck(renders)
  const workingDirectories = await workingDirectoryCheck(renders)

  return [releaseDirectory, templates, workingDirectories]
}
|
|
184
|
+
|
|
185
|
+
/**
 * Builds a stand-in port table for template rendering: each process that declares a port
 * range is mapped to that range's `from` value. Processes without a port are omitted.
 * @param {import("./config.js").RollbridgeConfig} config - Normalized config.
 * @returns {Record<string, number>} Process id to the low end of its configured port range.
 */
function representativePorts(config) {
  /** @type {Record<string, number>} */
  const lowEnds = {}

  config.processes.forEach(({id, port}) => {
    if (port) lowEnds[id] = port.from
  })

  return lowEnds
}
|
|
199
|
+
|
|
200
|
+
/**
 * Dry-runs the templates of one process for replica index 0. The cwd is rendered first, then
 * the command, then the env object, so the reported error is the first one hit in that order.
 * A process without a cwd template uses the release path as its working directory.
 * @param {import("./config.js").ProcessConfig} processConfig - Process to render.
 * @param {{application: string, ports: Record<string, number>, proxy: import("./config.js").ProxyConfig, releaseId: string, releasePath: string, revision: string}} shared - Shared render inputs.
 * @returns {ProcessRender} The working directory resolved against the release path, or the first template error.
 */
function renderProcess(processConfig, shared) {
  const {application, ports, proxy, releaseId, releasePath, revision} = shared
  const {id} = processConfig
  const context = processTemplateContext({
    application,
    ports,
    processId: id,
    proxy,
    releaseId,
    releasePath,
    replicaCount: processConfig.replicas,
    replicaIndex: 0,
    revision
  })

  try {
    let cwd = releasePath

    if (processConfig.cwd) cwd = renderTemplate(processConfig.cwd, context)

    // Rendered only to surface template errors; the results are not needed here.
    renderTemplate(processConfig.command, context)
    renderObject(processConfig.env, context)

    return {cwd: path.resolve(releasePath, cwd), id, ok: true}
  } catch (error) {
    const message = error instanceof Error ? error.message : String(error)

    return {error: message, id, ok: false}
  }
}
|
|
230
|
+
|
|
231
|
+
/**
 * Doctor check: does the release path point at an existing directory?
 * @param {string} releasePath - Resolved release directory.
 * @returns {Promise<DoctorCheck>} Passing check when the directory exists, failing check otherwise.
 */
async function releasePathCheck(releasePath) {
  const name = "release path"
  const exists = await isDirectory(releasePath)

  return exists
    ? {detail: `${releasePath} exists`, name, ok: true}
    : {detail: `${releasePath} is missing or not a directory`, name, ok: false}
}
|
|
242
|
+
|
|
243
|
+
/**
 * Doctor check summarising template rendering: passes when every process rendered, otherwise
 * fails and lists each failing process as `id: error`.
 * @param {ProcessRender[]} renders - Per-process render results.
 * @returns {DoctorCheck} Whether every process's templates resolved against the release context.
 */
function templateCheck(renders) {
  const name = "process templates"
  /** @type {string[]} */
  const failures = []

  for (const render of renders) {
    if (!render.ok) failures.push(`${render.id}: ${render.error}`)
  }

  if (failures.length > 0) {
    return {detail: `unresolved templates — ${failures.join("; ")}`, name, ok: false}
  }

  return {detail: `all ${renders.length} process command/cwd/env templates resolve`, name, ok: true}
}
|
|
256
|
+
|
|
257
|
+
/**
 * Doctor check: does every successfully rendered working directory exist? Processes whose
 * templates failed to render are skipped here and do not count towards the total.
 * @param {ProcessRender[]} renders - Per-process render results.
 * @returns {Promise<DoctorCheck>} Failing check listing the missing directories, or a passing check.
 */
async function workingDirectoryCheck(renders) {
  const name = "process working directories"
  const rendered = renders.filter((render) => render.ok)
  /** @type {string[]} */
  const missing = []

  for (const {cwd, id} of rendered) {
    const present = await isDirectory(cwd)

    if (!present) missing.push(`${id} (${cwd})`)
  }

  if (missing.length > 0) {
    const noun = missing.length === 1 ? "directory" : "directories"

    return {detail: `missing working ${noun}: ${missing.join(", ")}`, name, ok: false}
  }

  const checked = rendered.length
  const phrase = checked === 1 ? "directory exists" : "directories exist"

  return {detail: `all ${checked} process working ${phrase}`, name, ok: true}
}
|
|
280
|
+
|
|
281
|
+
/**
 * Tests whether a path exists and is a directory. A failing stat (missing path, no
 * permission, ...) is reported as `false` rather than thrown.
 * @param {string} target - Path to test.
 * @returns {Promise<boolean>} True when the path exists and is a directory.
 */
async function isDirectory(target) {
  let stats

  try {
    stats = await fs.stat(target)
  } catch {
    return false
  }

  return stats.isDirectory()
}
|
package/src/event-log.js
ADDED
|
@@ -0,0 +1,47 @@
|
|
|
1
|
+
// @ts-check
|
|
2
|
+
|
|
3
|
+
/**
|
|
4
|
+
* @typedef {import("./json.js").JsonValue} JsonValue
|
|
5
|
+
* @typedef {{at: string, data: Record<string, JsonValue>, message: string}} DaemonEvent
|
|
6
|
+
*/
|
|
7
|
+
|
|
8
|
+
/**
 * A bounded, in-memory history of structured daemon events (deploys, traffic
 * switches, stops, crashes, restarts, and failed commands). The newest events
 * are kept; the oldest are dropped once the limit is exceeded.
 */
export default class EventLog {
  /**
   * @param {number} limit - Maximum number of events to retain.
   */
  constructor(limit) {
    this.limit = limit
    this.events = /** @type {DaemonEvent[]} */ ([])
  }

  /**
   * Appends an event stamped with the current time, then drops the oldest events beyond the limit.
   * @param {string} message - Event type/message.
   * @param {Record<string, JsonValue>} data - Structured event payload (stored by reference).
   * @returns {void}
   */
  record(message, data) {
    this.events.push({at: new Date().toISOString(), data, message})

    const overflow = this.events.length - this.limit

    if (overflow > 0) {
      this.events.splice(0, overflow)
    }
  }

  /**
   * Returns a copy of the tail of the history.
   * @param {number} [limit] - Maximum number of most-recent events to return. Fractional values
   *   are rounded down so the result never exceeds the requested maximum. All events are
   *   returned when the limit is omitted, not a finite number, or rounds down to zero or less.
   * @returns {DaemonEvent[]} The most recent events, oldest first (a new array on every call).
   */
  recent(limit) {
    // Round down before slicing: slice() truncates a fractional start index, which would
    // otherwise hand back one event more than asked for (e.g. 2 events for a limit of 1.5).
    const requested = typeof limit === "number" && Number.isFinite(limit) ? Math.floor(limit) : 0

    if (requested <= 0 || requested >= this.events.length) {
      return [...this.events]
    }

    return this.events.slice(-requested)
  }
}
|