rollbridge 0.1.5 → 0.1.6
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/README.md +104 -4
- package/TODO.md +42 -40
- package/docs/cli.md +146 -6
- package/docs/config.md +139 -2
- package/docs/logging.md +77 -0
- package/docs/releasing.md +53 -0
- package/docs/tensorbuzz-runbook.md +129 -0
- package/docs/velocious.md +49 -11
- package/docs/workers.md +115 -0
- package/package.json +1 -1
- package/src/cli.js +290 -1
- package/src/config.js +169 -6
- package/src/daemon.js +216 -13
- package/src/doctor.js +177 -0
- package/src/event-log.js +47 -0
- package/src/managed-process.js +225 -16
- package/src/process-memory.js +110 -0
- package/src/recover.js +134 -0
- package/src/release-group.js +71 -21
- package/src/state-store.js +103 -0
- package/src/system-ids.js +71 -0
- package/src/template.js +32 -0
- package/test/completion.test.js +64 -0
- package/test/config-validation.test.js +227 -0
- package/test/doctor.test.js +205 -3
- package/test/event-log.test.js +46 -0
- package/test/fixtures/memory-hog.js +19 -0
- package/test/managed-process.test.js +290 -0
- package/test/process-memory.test.js +40 -0
- package/test/recover.test.js +162 -0
- package/test/release-group.test.js +22 -0
- package/test/rollbridge.test.js +523 -6
- package/test/state-store.test.js +69 -0
- package/test/system-ids.test.js +24 -0
|
@@ -1,6 +1,8 @@
|
|
|
1
1
|
// @ts-check
|
|
2
2
|
|
|
3
3
|
import assert from "node:assert/strict"
|
|
4
|
+
import fs from "node:fs"
|
|
5
|
+
import os from "node:os"
|
|
4
6
|
import path from "node:path"
|
|
5
7
|
import test from "node:test"
|
|
6
8
|
import {fileURLToPath} from "node:url"
|
|
@@ -105,6 +107,32 @@ test("counts automatic restarts and reports startedAt and uptime while running",
|
|
|
105
107
|
}
|
|
106
108
|
})
|
|
107
109
|
|
|
110
|
+
test("a queued auto-restart timer is unref'd so it can't keep the process alive", async () => {
  // Run the crashing fixture with the same Node binary that is running this test file.
  const crasherCommand = [process.execPath, crasherPath].map((part) => JSON.stringify(part)).join(" ")
  const supervised = new ManagedProcess({
    command: crasherCommand,
    cwd: undefined,
    env: {},
    id: "crasher",
    logger: () => {},
    outputLines: 50,
    restartDelayMs: 5000,
    shouldRestart: () => true,
    stopTimeoutMs: 500
  })

  try {
    await supervised.start()

    // Once the fixture has crashed, a restart is queued behind the 5s restart delay. With the
    // default unlimited restart policy a ref'd timer would keep respawning and hold the process
    // open, so the pending timer has to be unref'd.
    await waitFor(() => supervised.restartTimer !== undefined)

    const queuedTimer = supervised.restartTimer

    assert.equal(queuedTimer?.hasRef(), false)
  } finally {
    await supervised.stop()
  }
})
|
|
135
|
+
|
|
108
136
|
/**
|
|
109
137
|
* Builds a managed crasher with a specific restart policy.
|
|
110
138
|
* @param {import("../src/config.js").RestartConfig} restart - Restart policy.
|
|
@@ -125,6 +153,268 @@ function buildCrasher(restart) {
|
|
|
125
153
|
})
|
|
126
154
|
}
|
|
127
155
|
|
|
156
|
+
test("records the start reason, marking crash auto-restarts", async () => {
  // maxRestarts is left undefined so the crasher keeps being restarted.
  const restartPolicy = {backoffFactor: 1, maxDelayMs: 0, maxRestarts: undefined, windowMs: 0}
  const crasher = buildCrasher(restartPolicy)

  try {
    await crasher.start()

    // start() without an explicit reason is reported as "deploy".
    assert.equal(crasher.status().lastStartReason, "deploy")

    // The fixture crashes ~40ms after each start, so the next start is an automatic one.
    await waitFor(() => crasher.status().restarts >= 1)

    assert.equal(crasher.status().lastStartReason, "crash")
  } finally {
    await crasher.stop()
  }
})
|
|
172
|
+
|
|
173
|
+
test("records the manual start reason", async () => {
  // maxRestarts: 0 disables auto-restarts, so a crash cannot replace the "manual" reason.
  const restartsDisabled = {backoffFactor: 1, maxDelayMs: 0, maxRestarts: 0, windowMs: 0}
  const crasher = buildCrasher(restartsDisabled)

  try {
    await crasher.start("manual")

    const {lastStartReason} = crasher.status()

    assert.equal(lastStartReason, "manual")
  } finally {
    await crasher.stop()
  }
})
|
|
185
|
+
|
|
186
|
+
test("does not record a start reason when the spawn fails", async () => {
  // Pointing cwd at a directory that does not exist makes the spawn fail before the process runs.
  const missingCwd = "/nonexistent-rollbridge-spawn-dir"
  const broken = new ManagedProcess({
    command: "true",
    cwd: missingCwd,
    env: {},
    id: "broken",
    logger: () => {},
    outputLines: 50,
    restartDelayMs: 10,
    shouldRestart: () => false,
    stopTimeoutMs: 500
  })

  await assert.rejects(() => broken.start("manual"))

  // A start that never happened must leave the recorded reason untouched.
  const {lastStartReason} = broken.status()

  assert.equal(lastStartReason, undefined)
})
|
|
203
|
+
|
|
204
|
+
/**
 * Creates a managed "worker" process that idles on a timer until it is stopped.
 * @param {() => boolean} shouldRestart - Callback deciding whether the process may be restarted.
 * @returns {ManagedProcess} The (not yet started) managed process.
 */
function buildLongLived(shouldRestart) {
  // A Node one-liner whose interval keeps the event loop (and so the process) alive.
  const idleScript = "setInterval(() => {}, 1000)"
  const idleCommand = `${JSON.stringify(process.execPath)} -e ${JSON.stringify(idleScript)}`
  const options = {
    command: idleCommand,
    cwd: undefined,
    env: {},
    id: "worker",
    logger: () => {},
    outputLines: 50,
    restartDelayMs: 10,
    shouldRestart,
    stopTimeoutMs: 500
  }

  return new ManagedProcess(options)
}
|
|
222
|
+
|
|
223
|
+
test("runs quiet and drain lifecycle hooks before stopping", async () => {
  const tempDir = fs.mkdtempSync(path.join(os.tmpdir(), "rollbridge-hooks-"))
  const logPath = path.join(tempDir, "hooks.log")

  /**
   * @param {string} word - Marker appended to the hook log.
   * @returns {string} A /bin/sh command line that appends the marker to the log file.
   */
  const append = (word) => [JSON.stringify("/bin/sh"), "-c", JSON.stringify(`echo ${word} >> ${logPath}`)].join(" ")

  const idleCommand = `${JSON.stringify(process.execPath)} -e ${JSON.stringify("setInterval(() => {}, 1000)")}`
  const worker = new ManagedProcess({
    command: idleCommand,
    cwd: undefined,
    env: {},
    id: "worker",
    lifecycle: {drainCommand: append("drain"), drainTimeoutMs: 500, quietCommand: append("quiet")},
    logger: () => {},
    outputLines: 50,
    restartDelayMs: 10,
    shouldRestart: () => false,
    stopSignal: "SIGTERM",
    stopTimeoutMs: 2000
  })

  try {
    await worker.start()
    await worker.stop()

    assert.equal(worker.status().state, "stopped")

    // Each hook appended one line: quietCommand first, then drainCommand, before the stop signal.
    const hookOrder = fs.readFileSync(logPath, "utf8").trim().split("\n")

    assert.deepEqual(hookOrder, ["quiet", "drain"])
  } finally {
    await worker.stop()
    fs.rmSync(tempDir, {force: true, recursive: true})
  }
})
|
|
253
|
+
|
|
254
|
+
test("a configured stopCommand is used instead of the stop signal", async () => {
  const tempDir = fs.mkdtempSync(path.join(os.tmpdir(), "rollbridge-hooks-"))
  const logPath = path.join(tempDir, "stop.log")

  // The stop command logs a line and then kills the worker's process group itself, so no SIGKILL
  // fallback should be needed.
  const stopCommand = `${JSON.stringify("/bin/sh")} -c ${JSON.stringify(`echo stop >> ${logPath}; kill -KILL -$ROLLBRIDGE_PID`)}`
  const idleCommand = `${JSON.stringify(process.execPath)} -e ${JSON.stringify("setInterval(() => {}, 1000)")}`
  const worker = new ManagedProcess({
    command: idleCommand,
    cwd: undefined,
    env: {},
    id: "worker",
    lifecycle: {drainTimeoutMs: 0, stopCommand},
    logger: () => {},
    outputLines: 50,
    restartDelayMs: 10,
    shouldRestart: () => false,
    stopSignal: "SIGTERM",
    stopTimeoutMs: 2000
  })

  // Wrap killProcessGroup so every signal the supervisor sends is recorded and still delivered.
  /** @type {string[]} */
  const sentSignals = []
  const originalKill = worker.killProcessGroup.bind(worker)

  worker.killProcessGroup = (signal) => {
    sentSignals.push(signal)
    originalKill(signal)
  }

  try {
    await worker.start()
    await worker.stop()

    assert.equal(worker.status().state, "stopped")

    const logged = fs.readFileSync(logPath, "utf8").trim().split("\n")

    assert.deepEqual(logged, ["stop"])
    // The stop command stands in for the stop signal; only a SIGKILL fallback may have been sent.
    assert.ok(!sentSignals.includes("SIGTERM"), `expected no stopSignal, got ${sentSignals.join(",")}`)
  } finally {
    await worker.stop()
    fs.rmSync(tempDir, {force: true, recursive: true})
  }
})
|
|
294
|
+
|
|
295
|
+
test("a failing lifecycle hook is logged but does not fail the stop", async () => {
  // Collect every message handed to the logger so the hook failure can be asserted on.
  /** @type {string[]} */
  const logged = []
  const failingHook = `${JSON.stringify("/bin/sh")} -c "exit 3"`
  const idleCommand = `${JSON.stringify(process.execPath)} -e ${JSON.stringify("setInterval(() => {}, 1000)")}`
  const worker = new ManagedProcess({
    command: idleCommand,
    cwd: undefined,
    env: {},
    id: "worker",
    lifecycle: {drainTimeoutMs: 0, quietCommand: failingHook},
    logger: (message) => { logged.push(message) },
    outputLines: 50,
    restartDelayMs: 10,
    shouldRestart: () => false,
    stopSignal: "SIGTERM",
    stopTimeoutMs: 2000
  })

  try {
    await worker.start()
    await worker.stop()

    // The stop still completes even though the quiet hook exited with status 3.
    assert.equal(worker.status().state, "stopped")

    const sawHookFailure = logged.includes("quiet command exited non-zero")

    assert.ok(sawHookFailure, `expected a non-zero hook log, got ${logged.join(",")}`)
  } finally {
    await worker.stop()
  }
})
|
|
322
|
+
|
|
323
|
+
test("a hanging lifecycle hook is bounded so stop still completes", async () => {
  const idleCommand = `${JSON.stringify(process.execPath)} -e ${JSON.stringify("setInterval(() => {}, 1000)")}`
  const worker = new ManagedProcess({
    command: idleCommand,
    cwd: undefined,
    env: {},
    id: "worker",
    // The quiet hook sleeps far longer than stopTimeoutMs allows.
    lifecycle: {drainTimeoutMs: 0, quietCommand: "sleep 30"},
    logger: () => {},
    outputLines: 50,
    restartDelayMs: 10,
    shouldRestart: () => false,
    stopSignal: "SIGTERM",
    stopTimeoutMs: 300
  })

  try {
    await worker.start()

    const stopBegan = Date.now()

    await worker.stop()

    const elapsedMs = Date.now() - stopBegan

    assert.equal(worker.status().state, "stopped")
    // The hung quietCommand is cut off at stopTimeoutMs instead of holding up the stop for 30s.
    assert.ok(elapsedMs < 5000, "stop should not wait for the hung hook")
  } finally {
    await worker.stop()
  }
})
|
|
352
|
+
|
|
353
|
+
test("sends the configured stopSignal as the graceful stop signal", async () => {
  const managed = new ManagedProcess({
    command: `${JSON.stringify(process.execPath)} -e ${JSON.stringify("setInterval(() => {}, 1000)")}`,
    cwd: undefined,
    env: {},
    id: "worker",
    logger: () => {},
    outputLines: 50,
    restartDelayMs: 10,
    shouldRestart: () => false,
    stopSignal: "SIGINT",
    stopTimeoutMs: 500
  })

  // Record every signal sent to the process group while still delivering it.
  /** @type {string[]} */
  const signals = []
  const killProcessGroup = managed.killProcessGroup.bind(managed)

  managed.killProcessGroup = (signal) => {
    signals.push(signal)
    killProcessGroup(signal)
  }

  try {
    await managed.start()
    await managed.stop()

    // The graceful stop uses the configured signal (a SIGKILL fallback, if any, comes after).
    assert.equal(signals[0], "SIGINT")
    assert.equal(managed.status().state, "stopped")
  } finally {
    // Same cleanup guard as the sibling tests: if start() rejects part-way or the first stop()
    // throws, the worker must not be left running after the test.
    await managed.stop()
  }
})
|
|
383
|
+
|
|
384
|
+
test("a memory restart respawns and is counted when the supervisor still wants the process", async () => {
  const alwaysRestart = () => true
  const worker = buildLongLived(alwaysRestart)

  try {
    await worker.start()
    await worker.restartForMemory()

    // The worker is running again, the restart was counted, and its reason is "memory".
    const afterRestart = worker.status()

    assert.equal(afterRestart.state, "running")
    assert.equal(worker.memoryRestarts, 1)
    assert.equal(worker.status().lastStartReason, "memory")
  } finally {
    await worker.stop()
  }
})
|
|
398
|
+
|
|
399
|
+
test("a memory restart does not respawn when shouldRestart is false", async () => {
  // Mutable gate read by the shouldRestart callback on every call.
  const gate = {open: true}
  const worker = buildLongLived(() => gate.open)

  try {
    await worker.start()
    assert.equal(worker.status().state, "running")

    // Close the gate: the supervisor (e.g. daemon shutdown or a draining release) no longer
    // wants the process running.
    gate.open = false
    await worker.restartForMemory()

    // The worker was stopped but not respawned, and no memory restart was counted.
    assert.equal(worker.status().state, "stopped")
    assert.equal(worker.memoryRestarts, 0)
  } finally {
    await worker.stop()
  }
})
|
|
417
|
+
|
|
128
418
|
test("does not auto-restart when the restart policy is disabled (maxRestarts: 0)", async () => {
|
|
129
419
|
const managed = buildCrasher({backoffFactor: 1, maxDelayMs: 0, maxRestarts: 0, windowMs: 0})
|
|
130
420
|
|
|
@@ -0,0 +1,40 @@
|
|
|
1
|
+
// @ts-check
|
|
2
|
+
|
|
3
|
+
import assert from "node:assert/strict"
|
|
4
|
+
import fs from "node:fs"
|
|
5
|
+
import test from "node:test"
|
|
6
|
+
import {measureProcessGroupRssBytes, processGroupMembers} from "../src/process-memory.js"
|
|
7
|
+
|
|
8
|
+
// Skip option for the /proc-based tests: `false` on Linux (tests run), otherwise the skip reason string.
const linuxOnly = process.platform !== "linux" && "requires /proc (Linux)"
|
|
9
|
+
|
|
10
|
+
/**
|
|
11
|
+
* @returns {number} The current process's group id, read from /proc.
|
|
12
|
+
*/
|
|
13
|
+
function currentProcessGroupId() {
|
|
14
|
+
const stat = fs.readFileSync("/proc/self/stat", "utf8")
|
|
15
|
+
|
|
16
|
+
return Number(stat.slice(stat.lastIndexOf(")") + 2).split(" ")[2])
|
|
17
|
+
}
|
|
18
|
+
|
|
19
|
+
test("measures the resident memory of a live process group", {skip: linuxOnly}, () => {
  // The test runner's own process group is guaranteed to be alive.
  const groupId = currentProcessGroupId()
  const measured = measureProcessGroupRssBytes(groupId)
  const isPositiveNumber = typeof measured === "number" && measured > 0

  assert.ok(isPositiveNumber, `expected a positive RSS, got ${measured}`)
})
|
|
24
|
+
|
|
25
|
+
test("returns undefined for a process group with no members", {skip: linuxOnly}, () => {
  // NOTE(review): presumably chosen as a group id no live process uses — confirm.
  const unusedGroupId = 2147483646

  assert.equal(measureProcessGroupRssBytes(unusedGroupId), undefined)
})
|
|
28
|
+
|
|
29
|
+
test("lists process-group members with their command and resident memory", {skip: linuxOnly}, () => {
  const groupId = currentProcessGroupId()
  // Look up this very process among the members of its own group.
  const self = processGroupMembers(groupId).find(({pid}) => pid === process.pid)

  assert.ok(self, "the current process should be a group member")
  assert.ok(typeof self.rssBytes === "number" && self.rssBytes > 0)
  assert.equal(typeof self.command, "string")
})
|
|
37
|
+
|
|
38
|
+
test("returns an empty list for a process group with no members", {skip: linuxOnly}, () => {
  // NOTE(review): presumably chosen as a group id no live process uses — confirm.
  const unusedGroupId = 2147483646
  const members = processGroupMembers(unusedGroupId)

  assert.deepEqual(members, [])
})
|
|
@@ -0,0 +1,162 @@
|
|
|
1
|
+
// @ts-check
|
|
2
|
+
|
|
3
|
+
import assert from "node:assert/strict"
|
|
4
|
+
import {spawn} from "node:child_process"
|
|
5
|
+
import {once} from "node:events"
|
|
6
|
+
import fs from "node:fs/promises"
|
|
7
|
+
import os from "node:os"
|
|
8
|
+
import path from "node:path"
|
|
9
|
+
import test from "node:test"
|
|
10
|
+
import RollbridgeDaemon from "../src/daemon.js"
|
|
11
|
+
import {normalizeConfig} from "../src/config.js"
|
|
12
|
+
import {recoverOrphans} from "../src/recover.js"
|
|
13
|
+
import {isProcessAlive, readState, writeState} from "../src/state-store.js"
|
|
14
|
+
|
|
15
|
+
/**
 * Builds a normalized single-process config whose control socket lives inside `dir`.
 * @param {string} dir - Working directory.
 * @param {{statePath?: string}} [options] - Config options.
 * @returns {import("../src/config.js").RollbridgeConfig} Normalized config.
 */
function buildConfig(dir, {statePath} = {}) {
  // statePath is only added when the caller supplied one; otherwise the key is left out.
  const stateOverride = statePath ? {statePath} : {}
  const controlPath = path.join(dir, "rollbridge.sock")
  const webProcess = {command: "true", id: "web", policy: "proxied", port: {from: 0, to: 0}}

  return normalizeConfig({
    application: "recover-test",
    control: {path: controlPath},
    processes: [webProcess],
    proxy: {forceStopTimeoutMs: 1000, host: "127.0.0.1", port: 0},
    ...stateOverride
  })
}
|
|
29
|
+
|
|
30
|
+
/**
 * Starts a detached Node process that idles on a timer, and waits for it to be spawned.
 * @returns {Promise<import("node:child_process").ChildProcess>} A detached, long-lived process (its own group leader).
 */
async function spawnOrphan() {
  const idleArgs = ["-e", "setInterval(() => {}, 1000)"]
  const spawnOptions = {detached: true, stdio: "ignore"}
  const child = spawn(process.execPath, idleArgs, spawnOptions)

  // Wait for the "spawn" event before handing the child to the caller.
  await once(child, "spawn")

  return child
}
|
|
40
|
+
|
|
41
|
+
/**
 * Writes a state file describing one active release ("v1") with a single "worker" process.
 * @param {string} statePath - State file path.
 * @param {number | undefined} pid - Orphan pid to record.
 * @returns {Promise<void>} Resolves once written.
 */
async function writeOrphanState(statePath, pid) {
  const release = {processes: [{id: "worker", pid}], releaseId: "v1"}
  const state = {activeReleaseId: "v1", releases: [release], services: [], singletons: []}

  await writeState(statePath, state)
}
|
|
49
|
+
|
|
50
|
+
/**
 * Polls a synchronous probe until it returns true or the timeout elapses.
 * @param {() => boolean} probe - Condition to await.
 * @param {{intervalMs?: number, timeoutMs?: number}} [options] - Delay between probes (default 25ms) and overall timeout (default 3000ms).
 * @returns {Promise<void>} Resolves once the probe returns true.
 * @throws {Error} When the probe is still false after the timeout.
 */
async function waitFor(probe, {intervalMs = 25, timeoutMs = 3000} = {}) {
  const deadline = Date.now() + timeoutMs

  while (Date.now() < deadline) {
    if (probe()) return
    await new Promise((resolve) => setTimeout(resolve, intervalMs))
  }

  // The condition may have become true during the final sleep, so probe once more before
  // reporting a timeout.
  if (probe()) return

  throw new Error("Timed out waiting for condition")
}
|
|
64
|
+
|
|
65
|
+
test("recover requires a configured statePath", async () => {
  const tempDir = await fs.mkdtemp(path.join(os.tmpdir(), "rollbridge-recover-"))

  try {
    // The config is built without a statePath, so recovery has to report an error.
    const config = buildConfig(tempDir)
    const outcome = await recoverOrphans({config, force: true})

    assert.ok("error" in outcome && /statePath/.test(outcome.error))
  } finally {
    await fs.rm(tempDir, {force: true, recursive: true})
  }
})
|
|
76
|
+
|
|
77
|
+
test("recover lists orphans without stopping them unless forced", async () => {
  const tempDir = await fs.mkdtemp(path.join(os.tmpdir(), "rollbridge-recover-"))
  const statePath = path.join(tempDir, "state.json")
  const orphan = await spawnOrphan()

  try {
    await writeOrphanState(statePath, orphan.pid)

    // force: false is the dry run: orphans are reported, nothing is stopped or cleared.
    const config = buildConfig(tempDir, {statePath})
    const outcome = await recoverOrphans({config, force: false})

    assert.ok(!("error" in outcome))
    assert.equal(outcome.forced, false)
    assert.equal(outcome.cleared, false)
    assert.deepEqual(outcome.remaining, [])
    assert.equal(outcome.orphans.length, 1)
    assert.equal(outcome.orphans[0].pid, orphan.pid)
    assert.ok(orphan.pid !== undefined && isProcessAlive(orphan.pid), "the orphan must not be stopped by a dry run")
    assert.ok(await readState(statePath), "a dry run must not clear the state file")
  } finally {
    orphan.kill("SIGKILL")
    await fs.rm(tempDir, {force: true, recursive: true})
  }
})
|
|
100
|
+
|
|
101
|
+
test("recover --force stops orphan process groups and clears the state file", async () => {
  const tempDir = await fs.mkdtemp(path.join(os.tmpdir(), "rollbridge-recover-"))
  const statePath = path.join(tempDir, "state.json")
  const orphan = await spawnOrphan()

  try {
    await writeOrphanState(statePath, orphan.pid)

    const config = buildConfig(tempDir, {statePath})
    const outcome = await recoverOrphans({config, force: true})

    assert.ok(!("error" in outcome))
    assert.equal(outcome.forced, true)
    assert.equal(outcome.cleared, true)
    assert.deepEqual(outcome.remaining, [])

    // Wait until the orphan's pid is no longer reported alive.
    const orphanIsGone = () => orphan.pid === undefined || !isProcessAlive(orphan.pid)

    await waitFor(orphanIsGone)
    assert.equal(await readState(statePath), undefined, "the state file is cleared after a forced recovery")
  } finally {
    orphan.kill("SIGKILL")
    await fs.rm(tempDir, {force: true, recursive: true})
  }
})
|
|
122
|
+
|
|
123
|
+
test("recover --force keeps the state file when an orphan cannot be stopped", async () => {
  const tempDir = await fs.mkdtemp(path.join(os.tmpdir(), "rollbridge-recover-"))
  const statePath = path.join(tempDir, "state.json")
  const orphan = await spawnOrphan()

  try {
    await writeOrphanState(statePath, orphan.pid)

    // Stand in for an orphan that can't be signaled (for example one owned by another user):
    // the injected stopGroup always reports that the group is still alive.
    const neverStops = async () => false
    const config = buildConfig(tempDir, {statePath})
    const outcome = await recoverOrphans({config, force: true, stopGroup: neverStops})

    assert.ok(!("error" in outcome))
    assert.equal(outcome.forced, true)
    assert.equal(outcome.cleared, false, "the state file is kept when an orphan survives")
    assert.equal(outcome.remaining.length, 1)
    assert.equal(outcome.remaining[0].pid, orphan.pid)
    assert.ok(await readState(statePath), "the state file must remain so the operator can retry")
  } finally {
    orphan.kill("SIGKILL")
    await fs.rm(tempDir, {force: true, recursive: true})
  }
})
|
|
146
|
+
|
|
147
|
+
test("recover refuses while a daemon is running", async () => {
  const tempDir = await fs.mkdtemp(path.join(os.tmpdir(), "rollbridge-recover-"))
  const statePath = path.join(tempDir, "state.json")
  const config = buildConfig(tempDir, {statePath})
  const daemon = new RollbridgeDaemon({config, logger: () => {}})

  await daemon.start()

  try {
    // With a daemon started on the same config, even a forced recovery must return an error.
    const outcome = await recoverOrphans({config, force: true})
    const refused = "error" in outcome && /is using/.test(outcome.error)

    assert.ok(refused)
  } finally {
    await daemon.shutdown()
    await fs.rm(tempDir, {force: true, recursive: true})
  }
})
|
|
@@ -41,6 +41,28 @@ test("templates interpolate values from the daemon environment", () => {
|
|
|
41
41
|
}
|
|
42
42
|
})
|
|
43
43
|
|
|
44
|
+
test("replica processes get a replica index, count, and template context", () => {
  // The worker's command and env reference the replica placeholders that get interpolated.
  const webProcess = {command: "run web", id: "web", policy: "proxied", port: {from: 0, to: 0}}
  const workerProcess = {command: "worker {{replicaIndex}}/{{replicaCount}}", env: {SLOT: "{{replicaIndex}}"}, id: "worker", policy: "companion", replicas: 3}
  const config = normalizeConfig({
    application: "demo",
    control: {path: "/tmp/rollbridge-release-group.sock"},
    processes: [webProcess, workerProcess],
    proxy: {host: "127.0.0.1", port: 0}
  })
  const release = new ReleaseGroup({config, logger: () => {}, releaseId: "v1", releasePath: "/tmp/rel", revision: "v1"})

  // Build replica index 1 of 3 for the worker (the second configured process).
  const [, workerConfig] = release.config.processes
  const built = release.buildProcess(workerConfig, {count: 3, index: 1, instanceId: "worker#1"})

  assert.equal(built.id, "worker#1")
  assert.equal(built.command, "worker 1/3")
  assert.equal(built.env.ROLLBRIDGE_REPLICA_INDEX, "1")
  assert.equal(built.env.ROLLBRIDGE_REPLICA_COUNT, "3")
  assert.equal(built.env.ROLLBRIDGE_PROCESS_ID, "worker")
  assert.equal(built.env.SLOT, "1")
})
|
|
65
|
+
|
|
44
66
|
test("a referenced daemon environment variable that is unset fails fast", () => {
|
|
45
67
|
const release = buildRelease({
|
|
46
68
|
command: "run {{env.ROLLBRIDGE_ENV_MISSING}}",
|