@openparachute/hub 0.6.5-rc.6 → 0.6.5-rc.8
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/package.json +1 -1
- package/src/__tests__/hub-db-liveness.test.ts +12 -7
- package/src/__tests__/serve.test.ts +72 -0
- package/src/commands/serve.ts +81 -10
- package/src/hub-db-liveness.ts +33 -17
- package/src/hub-server.ts +5 -2
package/package.json
CHANGED
|
@@ -214,18 +214,23 @@ describe("DbHolder.probePath (#610 proactive detection)", () => {
|
|
|
214
214
|
h.cleanup();
|
|
215
215
|
});
|
|
216
216
|
|
|
217
|
-
test("path GONE (ENOENT) →
|
|
218
|
-
//
|
|
219
|
-
//
|
|
220
|
-
|
|
221
|
-
|
|
217
|
+
test("path GONE (ENOENT) → exit(1) directly, NO reopen (#619 follow-up)", () => {
|
|
218
|
+
// The genuine `rm -rf ~/.parachute` field shape. We must NOT reopen here:
|
|
219
|
+
// reopen is openHubDb, which mkdir-recursive's the dir back + opens a fresh
|
|
220
|
+
// EMPTY db, so its SELECT-1 verify would PASS and the hub would "heal" into a
|
|
221
|
+
// half-recovered state (empty db, stale in-memory state, wiped well-known files,
|
|
222
|
+
// un-respawned modules). A full wipe must exit so the platform manager does a
|
|
223
|
+
// clean restart that re-bootstraps everything. `onReopen` throws to PROVE the
|
|
224
|
+
// reopen path is never taken — if it were, this test would surface the throw.
|
|
222
225
|
const h = makeHolder({
|
|
223
226
|
initialInode: INODE_A,
|
|
224
227
|
statInode: () => undefined, // ENOENT
|
|
225
|
-
onReopen: () =>
|
|
228
|
+
onReopen: () => {
|
|
229
|
+
throw new Error("reopen must NOT be called on a gone verdict");
|
|
230
|
+
},
|
|
226
231
|
});
|
|
227
232
|
expect(h.holder.probePath()).toBe("gone");
|
|
228
|
-
expect(h.stats().reopens).toBe(
|
|
233
|
+
expect(h.stats().reopens).toBe(0);
|
|
229
234
|
expect(h.stats().exits).toBe(1);
|
|
230
235
|
expect(h.stats().exitCode).toBe(1);
|
|
231
236
|
h.cleanup();
|
|
@@ -4,6 +4,7 @@ import { tmpdir } from "node:os";
|
|
|
4
4
|
import { join } from "node:path";
|
|
5
5
|
import { _resetBootstrapTokenForTests, getBootstrapToken } from "../bootstrap-token.ts";
|
|
6
6
|
import {
|
|
7
|
+
armServeDbWatchdog,
|
|
7
8
|
formatBootstrapTokenBanner,
|
|
8
9
|
formatListeningBanner,
|
|
9
10
|
hubPortConflictMessage,
|
|
@@ -463,3 +464,74 @@ describe("resolveStartupIssuer — expose-state fallback (#531)", () => {
|
|
|
463
464
|
expect(resolveStartupIssuer({}, {}, throwing)).toBeUndefined();
|
|
464
465
|
});
|
|
465
466
|
});
|
|
467
|
+
|
|
468
|
+
describe("armServeDbWatchdog — #610/#619 ghost-fd watchdog wiring on the serve path", () => {
|
|
469
|
+
let tmp: string;
|
|
470
|
+
let realDbPath: string;
|
|
471
|
+
|
|
472
|
+
beforeEach(() => {
|
|
473
|
+
tmp = mkdtempSync(join(tmpdir(), "serve-watchdog-"));
|
|
474
|
+
realDbPath = join(tmp, "hub.db");
|
|
475
|
+
});
|
|
476
|
+
afterEach(() => {
|
|
477
|
+
rmSync(tmp, { recursive: true, force: true });
|
|
478
|
+
});
|
|
479
|
+
|
|
480
|
+
test("starts the liveness timer (without it, a wipe is never noticed)", () => {
|
|
481
|
+
let tick: (() => void) | undefined;
|
|
482
|
+
const { livenessTimer } = armServeDbWatchdog(realDbPath, {
|
|
483
|
+
openDb: () => openHubDb(realDbPath),
|
|
484
|
+
statInode: () => ({ dev: 1, ino: 42 }),
|
|
485
|
+
setIntervalFn: (cb) => {
|
|
486
|
+
tick = cb;
|
|
487
|
+
return 0;
|
|
488
|
+
},
|
|
489
|
+
clearIntervalFn: () => {},
|
|
490
|
+
});
|
|
491
|
+
// The timer must actually be armed — the captured tick callback proves
|
|
492
|
+
// startDbPathLivenessTimer ran (the #619 bug was that it never did on this path).
|
|
493
|
+
expect(tick).toBeInstanceOf(Function);
|
|
494
|
+
expect(livenessTimer).toBeDefined();
|
|
495
|
+
});
|
|
496
|
+
|
|
497
|
+
test("opens the db BEFORE snapshotting the inode, so a wipe tick self-exits (#619 ordering)", () => {
|
|
498
|
+
// The load-bearing invariant: `initialInode` must be a DEFINED baseline so
|
|
499
|
+
// a later "gone" verdict fires the direct exit. If the helper statted before
|
|
500
|
+
// opening (the bug), a fresh path would yield ENOENT → undefined baseline →
|
|
501
|
+
// probe stuck at "unknown" → NEVER exits on a wipe.
|
|
502
|
+
let opened = false;
|
|
503
|
+
let wiped = false;
|
|
504
|
+
let tick: (() => void) | undefined;
|
|
505
|
+
const exitCodes: number[] = [];
|
|
506
|
+
armServeDbWatchdog(realDbPath, {
|
|
507
|
+
openDb: () => {
|
|
508
|
+
if (wiped) throw new Error("ENOENT: state dir wiped");
|
|
509
|
+
opened = true;
|
|
510
|
+
return openHubDb(realDbPath);
|
|
511
|
+
},
|
|
512
|
+
statInode: () => {
|
|
513
|
+
if (wiped) return undefined; // path gone
|
|
514
|
+
// Proves ordering: if the helper statted before opening, this throws and
|
|
515
|
+
// the helper's catch leaves initialInode undefined (watchdog disabled).
|
|
516
|
+
if (!opened) throw Object.assign(new Error("ENOENT"), { code: "ENOENT" });
|
|
517
|
+
return { dev: 1, ino: 42 };
|
|
518
|
+
},
|
|
519
|
+
setIntervalFn: (cb) => {
|
|
520
|
+
tick = cb;
|
|
521
|
+
return 0;
|
|
522
|
+
},
|
|
523
|
+
clearIntervalFn: () => {},
|
|
524
|
+
exit: (code) => exitCodes.push(code),
|
|
525
|
+
});
|
|
526
|
+
|
|
527
|
+
// Simulate the wipe, then drive one watchdog tick.
|
|
528
|
+
wiped = true;
|
|
529
|
+
expect(tick).toBeInstanceOf(Function);
|
|
530
|
+
tick?.();
|
|
531
|
+
|
|
532
|
+
// The probe saw "gone" against a real baseline → direct exit, no reopen, i.e.
|
|
533
|
+
// exit(1). A non-empty exitCodes proves `initialInode` was a defined baseline,
|
|
534
|
+
// which proves the db was opened before the inode snapshot.
|
|
535
|
+
expect(exitCodes).toEqual([1]);
|
|
536
|
+
});
|
|
537
|
+
});
|
package/src/commands/serve.ts
CHANGED
|
@@ -34,7 +34,7 @@ import { generateBootstrapToken } from "../bootstrap-token.ts";
|
|
|
34
34
|
// path isolation.
|
|
35
35
|
import { CONFIG_DIR, SERVICES_MANIFEST_PATH } from "../config.ts";
|
|
36
36
|
import { readExposeState } from "../expose-state.ts";
|
|
37
|
-
import { createDbHolder } from "../hub-db-liveness.ts";
|
|
37
|
+
import { createDbHolder, defaultStatInode, startDbPathLivenessTimer } from "../hub-db-liveness.ts";
|
|
38
38
|
import { hubDbPath, openHubDb } from "../hub-db.ts";
|
|
39
39
|
import { hubFetch } from "../hub-server.ts";
|
|
40
40
|
import { writeHubFile } from "../hub.ts";
|
|
@@ -304,6 +304,80 @@ export function formatBootstrapTokenBanner(token: string, hubUrl?: string): stri
|
|
|
304
304
|
].join("\n");
|
|
305
305
|
}
|
|
306
306
|
|
|
307
|
+
/**
|
|
308
|
+
* Injectable seams for {@link armServeDbWatchdog} (test-only). Generic on the
|
|
309
|
+
* timer handle `H` so the scheduler seams never name `setInterval` in type
|
|
310
|
+
* position — mirrors `DbLivenessTimerDeps<H>` in hub-db-liveness.ts, which
|
|
311
|
+
* keeps the public interface portable to a types-less tsc environment.
|
|
312
|
+
*/
|
|
313
|
+
export interface ServeDbWatchdogDeps<H = unknown> {
|
|
314
|
+
log?: (line: string) => void;
|
|
315
|
+
/** Open a db handle (default {@link openHubDb}). Tests inject a fake that creates a fixture. */
|
|
316
|
+
openDb?: (path: string) => ReturnType<typeof openHubDb>;
|
|
317
|
+
/** Path stat for the inode snapshot + proactive probe (default {@link defaultStatInode}). */
|
|
318
|
+
statInode?: typeof defaultStatInode;
|
|
319
|
+
/** Injectable scheduler threaded to the liveness timer (default `setInterval`). */
|
|
320
|
+
setIntervalFn?: (cb: () => void, ms: number) => H;
|
|
321
|
+
/** Injectable clear threaded to the liveness timer (default `clearInterval`). */
|
|
322
|
+
clearIntervalFn?: (handle: H) => void;
|
|
323
|
+
/** Process-exit fn threaded into the holder's reopen-or-exit (default `process.exit`). */
|
|
324
|
+
exit?: (code: number) => void;
|
|
325
|
+
}
|
|
326
|
+
|
|
327
|
+
/**
|
|
328
|
+
* Build the self-heal DB holder (#594) + start the proactive ghost-fd watchdog
|
|
329
|
+
* (#610) for the `parachute serve` path, returning both so the caller wires
|
|
330
|
+
* `getDb`/`onDbError`/`probeDbPath` and stops the timer on shutdown.
|
|
331
|
+
*
|
|
332
|
+
* Extracted + exported so the wiring is unit-testable WITHOUT binding a real
|
|
333
|
+
* port (#619): a serve()-level test would have to `Bun.serve` and risk the
|
|
334
|
+
* hub#535 launchd-bootout hazard, so this pure helper carries the load-bearing
|
|
335
|
+
* invariants instead — (1) the db is OPENED before the inode is snapshotted, so
|
|
336
|
+
* a fresh-install first boot gets a defined baseline (an ENOENT snapshot would
|
|
337
|
+
* silently disable the proactive probe for the whole process lifetime), and
|
|
338
|
+
* (2) the liveness timer is actually started. Both were absent on this path
|
|
339
|
+
* before #619 — the watchdog was wired only into `createHubServer`.
|
|
340
|
+
*/
|
|
341
|
+
export function armServeDbWatchdog<H = unknown>(
|
|
342
|
+
dbPath: string,
|
|
343
|
+
deps: ServeDbWatchdogDeps<H> = {},
|
|
344
|
+
): {
|
|
345
|
+
dbHolder: ReturnType<typeof createDbHolder>;
|
|
346
|
+
livenessTimer: ReturnType<typeof startDbPathLivenessTimer>;
|
|
347
|
+
} {
|
|
348
|
+
const openDb = deps.openDb ?? openHubDb;
|
|
349
|
+
const statInode = deps.statInode ?? defaultStatInode;
|
|
350
|
+
// Open FIRST — `openHubDb` mkdir's + creates the file when absent, so the
|
|
351
|
+
// stat below sees a real inode on a fresh-install first boot. Reversing this
|
|
352
|
+
// would leave `initialInode` undefined (ENOENT) and the probe at "unknown"
|
|
353
|
+
// for the process lifetime. Mirrors `createHubServer`'s ordering.
|
|
354
|
+
const db = openDb(dbPath);
|
|
355
|
+
let initialInode: ReturnType<typeof defaultStatInode> | undefined;
|
|
356
|
+
try {
|
|
357
|
+
initialInode = statInode(dbPath);
|
|
358
|
+
} catch {
|
|
359
|
+
initialInode = undefined;
|
|
360
|
+
}
|
|
361
|
+
const dbHolder = createDbHolder(db, {
|
|
362
|
+
reopen: () => openDb(dbPath),
|
|
363
|
+
dbPath,
|
|
364
|
+
statInode,
|
|
365
|
+
initialInode,
|
|
366
|
+
...(deps.log !== undefined ? { log: deps.log } : {}),
|
|
367
|
+
...(deps.exit !== undefined ? { exit: deps.exit } : {}),
|
|
368
|
+
});
|
|
369
|
+
// The active `parachute serve` path (systemd / launchd / container ExecStart)
|
|
370
|
+
// MUST start the watchdog here, not only in `createHubServer` — else a
|
|
371
|
+
// `rm -rf ~/.parachute` under a running unit leaves a ghost fd that keeps
|
|
372
|
+
// SELECT 1 succeeding with no thrown error, the reactive path never fires,
|
|
373
|
+
// and the hub never self-recovers (#619).
|
|
374
|
+
const livenessTimer = startDbPathLivenessTimer<H>(dbHolder, {
|
|
375
|
+
...(deps.setIntervalFn !== undefined ? { setIntervalFn: deps.setIntervalFn } : {}),
|
|
376
|
+
...(deps.clearIntervalFn !== undefined ? { clearIntervalFn: deps.clearIntervalFn } : {}),
|
|
377
|
+
});
|
|
378
|
+
return { dbHolder, livenessTimer };
|
|
379
|
+
}
|
|
380
|
+
|
|
307
381
|
/**
|
|
308
382
|
* Run the hub fetch loop in the foreground. Resolves when `Bun.serve` is
|
|
309
383
|
* bound; the returned `stop()` shuts the server down for tests.
|
|
@@ -355,15 +429,8 @@ export async function serve(opts: ServeOpts = {}): Promise<{
|
|
|
355
429
|
writeHubFile(hubHtmlPath);
|
|
356
430
|
|
|
357
431
|
const dbPath = hubDbPath();
|
|
358
|
-
// Self-heal-or-die DB holder (#594)
|
|
359
|
-
|
|
360
|
-
// error / malformed image — e.g. the state dir deleted under a running hub)
|
|
361
|
-
// can reopen the handle once, or exit(1) for the platform manager to restart
|
|
362
|
-
// us with a fresh one. `getDb` reads the current handle from the holder.
|
|
363
|
-
const dbHolder = createDbHolder(openHubDb(dbPath), {
|
|
364
|
-
reopen: () => openHubDb(dbPath),
|
|
365
|
-
log,
|
|
366
|
-
});
|
|
432
|
+
// Self-heal-or-die DB holder (#594) + proactive ghost-fd watchdog (#610/#619).
|
|
433
|
+
const { dbHolder, livenessTimer } = armServeDbWatchdog(dbPath, { log });
|
|
367
434
|
const adminBootstrap = await seedInitialAdminIfNeeded(dbHolder.get(), env, log);
|
|
368
435
|
|
|
369
436
|
if (adminBootstrap === "needs-setup") {
|
|
@@ -401,6 +468,9 @@ export async function serve(opts: ServeOpts = {}): Promise<{
|
|
|
401
468
|
fetch: hubFetch(WELL_KNOWN_DIR, {
|
|
402
469
|
getDb: () => dbHolder.get(),
|
|
403
470
|
onDbError: (err) => dbHolder.healOrExit(err),
|
|
471
|
+
// #610: /health's db check probes the path so monitoring + the #591
|
|
472
|
+
// adoption probe see a wipe instead of the ghost-fd lie.
|
|
473
|
+
probeDbPath: () => dbHolder.probePath(),
|
|
404
474
|
issuer,
|
|
405
475
|
loopbackPort: port,
|
|
406
476
|
supervisor,
|
|
@@ -486,6 +556,7 @@ export async function serve(opts: ServeOpts = {}): Promise<{
|
|
|
486
556
|
for (const state of supervisor.list()) {
|
|
487
557
|
await supervisor.stop(state.short);
|
|
488
558
|
}
|
|
559
|
+
livenessTimer.stop();
|
|
489
560
|
await server.stop();
|
|
490
561
|
dbHolder.get().close();
|
|
491
562
|
},
|
package/src/hub-db-liveness.ts
CHANGED
|
@@ -383,25 +383,41 @@ export function createDbHolder(initial: Database, deps: DbHolderDeps): DbHolder
|
|
|
383
383
|
const verdict = classifyPathLiveness({ expected: currentInode, current: pathInode });
|
|
384
384
|
if (verdict === "ok" || verdict === "unknown") return verdict;
|
|
385
385
|
|
|
386
|
-
|
|
387
|
-
|
|
388
|
-
|
|
389
|
-
|
|
390
|
-
|
|
386
|
+
if (verdict === "gone") {
|
|
387
|
+
// The whole state dir was wiped under the running hub (`rm -rf
|
|
388
|
+
// ~/.parachute`). We must NOT reopen-in-place here: `reopen` is
|
|
389
|
+
// `openHubDb`, which `mkdirSync`'s the dir back + opens a fresh EMPTY db,
|
|
390
|
+
// so its SELECT-1 verify would PASS and we'd "heal" into a half-recovered
|
|
391
|
+
// hub — empty db, but stale in-memory state, wiped well-known files, and
|
|
392
|
+
// supervised modules whose own state dirs are gone yet never re-spawned
|
|
393
|
+
// (#619 follow-up). The correct recovery for a full wipe is a clean
|
|
394
|
+
// process exit so the platform manager (systemd / launchd / container)
|
|
395
|
+
// restarts `parachute serve`, which re-bootstraps everything (well-known,
|
|
396
|
+
// admin seed, supervisor re-spawn). This restores the #610 design intent
|
|
397
|
+
// ("we exit, letting the platform manager restart") that the shared
|
|
398
|
+
// reopen-or-exit path silently defeated via openHubDb's mkdir-recursive.
|
|
399
|
+
log(
|
|
400
|
+
`parachute hub: db path ${deps.dbPath} no longer exists (state dir wiped under a running hub, #610); exiting so the platform manager restarts the hub with a freshly bootstrapped state dir.`,
|
|
401
|
+
);
|
|
402
|
+
exit(1);
|
|
403
|
+
return verdict;
|
|
404
|
+
}
|
|
405
|
+
|
|
406
|
+
// "replaced": the db FILE was swapped underneath us (e.g. a restore copied
|
|
407
|
+
// a new file over the same path) while the rest of the state dir is intact.
|
|
408
|
+
// Adopting the fresh inode in-place via reopen-or-exit is correct here — a
|
|
409
|
+
// process restart would be heavier than needed.
|
|
391
410
|
//
|
|
392
|
-
// ONE-TICK /health ANOMALY (intentional):
|
|
393
|
-
//
|
|
394
|
-
//
|
|
395
|
-
//
|
|
396
|
-
//
|
|
397
|
-
//
|
|
398
|
-
//
|
|
399
|
-
//
|
|
400
|
-
// string can't cascade.
|
|
411
|
+
// ONE-TICK /health ANOMALY (intentional): the reopenOrExit below heals
|
|
412
|
+
// SYNCHRONOUSLY, but we still RETURN "replaced" for this one call — so the
|
|
413
|
+
// /health request that drove this probe reports `db:"error: path-replaced"`
|
|
414
|
+
// even though the handle is now healthy; the very next request reads `ok`.
|
|
415
|
+
// We don't mask it (returning "ok" here would hide that a heal just
|
|
416
|
+
// happened, which is exactly what monitoring wants to see). It's safe
|
|
417
|
+
// because #591's adoption probe checks only HTTP 200 (`res.ok`), not the
|
|
418
|
+
// specific `db` string, so a single transient error string can't cascade.
|
|
401
419
|
reopenOrExit(
|
|
402
|
-
|
|
403
|
-
? `db path ${deps.dbPath} no longer exists (state dir wiped under a running hub, #610)`
|
|
404
|
-
: `db path ${deps.dbPath} now resolves to a different inode (DB file replaced underneath the open handle, #610)`,
|
|
420
|
+
`db path ${deps.dbPath} now resolves to a different inode (DB file replaced underneath the open handle, #610)`,
|
|
405
421
|
);
|
|
406
422
|
return verdict;
|
|
407
423
|
},
|
package/src/hub-server.ts
CHANGED
|
@@ -1627,8 +1627,11 @@ export function hubFetch(
|
|
|
1627
1627
|
// succeeding, so `probeDbLiveness` alone would report `db:"ok"` on a
|
|
1628
1628
|
// database that's gone from disk (the /health lie the issue calls
|
|
1629
1629
|
// out). `probeDbPath` stat()s the path + compares inodes; on a
|
|
1630
|
-
//
|
|
1631
|
-
//
|
|
1630
|
+
// "replaced" verdict it self-heals in-place (reopen-or-exit, adopt
|
|
1631
|
+
// the new inode); on a "gone" verdict it exits the process directly
|
|
1632
|
+
// (#621 — a full wipe needs a clean platform-manager restart, not an
|
|
1633
|
+
// empty-db reopen). Either way we surface the fault so the #591
|
|
1634
|
+
// adoption probe + monitoring see it.
|
|
1632
1635
|
const pathVerdict = deps?.probeDbPath?.();
|
|
1633
1636
|
if (pathVerdict === "gone" || pathVerdict === "replaced") {
|
|
1634
1637
|
// One-request anomaly on "replaced": probeDbPath already healed the
|