@openparachute/hub 0.6.5-rc.4 → 0.6.5-rc.5

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
package/package.json CHANGED
@@ -1,6 +1,6 @@
1
1
  {
2
2
  "name": "@openparachute/hub",
3
- "version": "0.6.5-rc.4",
3
+ "version": "0.6.5-rc.5",
4
4
  "description": "parachute — the local hub for the Parachute ecosystem (discovery, ports, lifecycle, soon OAuth).",
5
5
  "license": "AGPL-3.0",
6
6
  "publishConfig": {
@@ -1,6 +1,14 @@
1
1
  import { Database } from "bun:sqlite";
2
2
  import { describe, expect, test } from "bun:test";
3
- import { classifyDbError, createDbHolder, probeDbLiveness } from "../hub-db-liveness.ts";
3
+ import {
4
+ type DbInode,
5
+ type StatInodeFn,
6
+ classifyDbError,
7
+ classifyPathLiveness,
8
+ createDbHolder,
9
+ probeDbLiveness,
10
+ startDbPathLivenessTimer,
11
+ } from "../hub-db-liveness.ts";
4
12
 
5
13
  /** Build a `SQLiteError`-shaped object with the given code + message. */
6
14
  function sqliteErr(code: string, message: string): Error & { code: string } {
@@ -137,3 +145,209 @@ describe("createDbHolder (#594)", () => {
137
145
  initial.close();
138
146
  });
139
147
  });
148
+
149
+ const INODE_A: DbInode = { dev: 1, ino: 100 };
150
+ const INODE_B: DbInode = { dev: 1, ino: 200 };
151
+
152
+ describe("classifyPathLiveness (#610)", () => {
153
+ test("same inode → ok", () => {
154
+ expect(classifyPathLiveness({ expected: INODE_A, current: INODE_A })).toBe("ok");
155
+ expect(classifyPathLiveness({ expected: INODE_A, current: { ...INODE_A } })).toBe("ok");
156
+ });
157
+ test("ENOENT on the path (current undefined) → gone", () => {
158
+ expect(classifyPathLiveness({ expected: INODE_A, current: undefined })).toBe("gone");
159
+ });
160
+ test("different inode → replaced", () => {
161
+ expect(classifyPathLiveness({ expected: INODE_A, current: INODE_B })).toBe("replaced");
162
+ // a different device counts too
163
+ expect(classifyPathLiveness({ expected: INODE_A, current: { dev: 2, ino: 100 } })).toBe(
164
+ "replaced",
165
+ );
166
+ });
167
+ test("no baseline snapshot (expected undefined) → unknown, never self-heals", () => {
168
+ expect(classifyPathLiveness({ expected: undefined, current: INODE_A })).toBe("unknown");
169
+ expect(classifyPathLiveness({ expected: undefined, current: undefined })).toBe("unknown");
170
+ });
171
+ });
172
+
173
+ describe("DbHolder.probePath (#610 proactive detection)", () => {
174
+ /** A holder whose path stat is driven by the injected `statInode`. */
175
+ function makeHolder(opts: {
176
+ initialInode: DbInode | undefined;
177
+ statInode: StatInodeFn;
178
+ onReopen?: () => Database;
179
+ }) {
180
+ const initial = new Database(":memory:");
181
+ let reopens = 0;
182
+ let exits = 0;
183
+ let exitCode: number | undefined;
184
+ const holder = createDbHolder(initial, {
185
+ dbPath: "/fake/hub.db",
186
+ initialInode: opts.initialInode,
187
+ statInode: opts.statInode,
188
+ reopen: () => {
189
+ reopens += 1;
190
+ return opts.onReopen ? opts.onReopen() : new Database(":memory:");
191
+ },
192
+ exit: (code) => {
193
+ exits += 1;
194
+ exitCode = code;
195
+ },
196
+ log: () => {},
197
+ });
198
+ return {
199
+ holder,
200
+ stats: () => ({ reopens, exits, exitCode }),
201
+ cleanup: () => {
202
+ try {
203
+ initial.close();
204
+ } catch {}
205
+ },
206
+ };
207
+ }
208
+
209
+ test("healthy path (same inode) → no reopen, no exit", () => {
210
+ const h = makeHolder({ initialInode: INODE_A, statInode: () => INODE_A });
211
+ expect(h.holder.probePath()).toBe("ok");
212
+ expect(h.stats().reopens).toBe(0);
213
+ expect(h.stats().exits).toBe(0);
214
+ h.cleanup();
215
+ });
216
+
217
+ test("path GONE (ENOENT) → reopen attempted; reopen verify fails → exit(1)", () => {
218
+ // Reopen returns a closed handle (the dir is still gone) → SELECT 1 throws
219
+ // → exit. This is the genuine `rm -rf ~/.parachute` field shape.
220
+ const dead = new Database(":memory:");
221
+ dead.close();
222
+ const h = makeHolder({
223
+ initialInode: INODE_A,
224
+ statInode: () => undefined, // ENOENT
225
+ onReopen: () => dead,
226
+ });
227
+ expect(h.holder.probePath()).toBe("gone");
228
+ expect(h.stats().reopens).toBe(1);
229
+ expect(h.stats().exits).toBe(1);
230
+ expect(h.stats().exitCode).toBe(1);
231
+ h.cleanup();
232
+ });
233
+
234
+ test("path REPLACED (different inode) → reopen + swap (heals, no exit)", () => {
235
+ const h = makeHolder({
236
+ initialInode: INODE_A,
237
+ statInode: () => INODE_B, // path now resolves to a different inode
238
+ onReopen: () => new Database(":memory:"),
239
+ });
240
+ expect(h.holder.probePath()).toBe("replaced");
241
+ expect(h.stats().reopens).toBe(1);
242
+ expect(h.stats().exits).toBe(0);
243
+ h.cleanup();
244
+ });
245
+
246
+ test("NEVER fires on a transient stat throw (EACCES) — returns ok, no reopen/exit", () => {
247
+ const h = makeHolder({
248
+ initialInode: INODE_A,
249
+ statInode: () => {
250
+ const e = new Error("permission denied") as Error & { code: string };
251
+ e.code = "EACCES";
252
+ throw e;
253
+ },
254
+ });
255
+ expect(h.holder.probePath()).toBe("ok");
256
+ expect(h.stats().reopens).toBe(0);
257
+ expect(h.stats().exits).toBe(0);
258
+ h.cleanup();
259
+ });
260
+
261
+ test("no baseline inode → unknown, never self-heals (safe degradation)", () => {
262
+ const h = makeHolder({ initialInode: undefined, statInode: () => undefined });
263
+ expect(h.holder.probePath()).toBe("unknown");
264
+ expect(h.stats().reopens).toBe(0);
265
+ expect(h.stats().exits).toBe(0);
266
+ h.cleanup();
267
+ });
268
+
269
+ test("no dbPath configured → probePath is a no-op (unknown)", () => {
270
+ const initial = new Database(":memory:");
271
+ const holder = createDbHolder(initial, {
272
+ reopen: () => new Database(":memory:"),
273
+ exit: () => {},
274
+ log: () => {},
275
+ });
276
+ expect(holder.probePath()).toBe("unknown");
277
+ initial.close();
278
+ });
279
+
280
+ test("after a heal (replaced), the inode baseline is re-snapshotted to the new file", () => {
281
+ // First probe sees INODE_B (replaced) → reopen; statInode then returns
282
+ // INODE_B again so the NEXT probe sees the SAME inode → ok (not a loop).
283
+ let exits = 0;
284
+ const initial = new Database(":memory:");
285
+ const holder = createDbHolder(initial, {
286
+ dbPath: "/fake/hub.db",
287
+ initialInode: INODE_A,
288
+ statInode: () => INODE_B,
289
+ reopen: () => new Database(":memory:"),
290
+ exit: () => {
291
+ exits += 1;
292
+ },
293
+ log: () => {},
294
+ });
295
+ expect(holder.probePath()).toBe("replaced"); // A → B, heal
296
+ expect(holder.probePath()).toBe("ok"); // B → B, no further action
297
+ expect(exits).toBe(0);
298
+ initial.close();
299
+ });
300
+ });
301
+
302
+ describe("startDbPathLivenessTimer (#610 bounded watchdog)", () => {
303
+ test("each tick calls probePath exactly once; stop() clears the timer", () => {
304
+ let probes = 0;
305
+ const fakeHolder = {
306
+ get: () => new Database(":memory:"),
307
+ healOrExit: () => "ignored" as const,
308
+ probePath: () => {
309
+ probes += 1;
310
+ return "ok" as const;
311
+ },
312
+ };
313
+ let registered: (() => void) | undefined;
314
+ let cleared = false;
315
+ const timer = startDbPathLivenessTimer<number>(fakeHolder, {
316
+ setIntervalFn: (cb) => {
317
+ registered = cb;
318
+ return 42;
319
+ },
320
+ clearIntervalFn: (h) => {
321
+ expect(h).toBe(42);
322
+ cleared = true;
323
+ },
324
+ });
325
+ expect(registered).toBeDefined();
326
+ registered?.();
327
+ registered?.();
328
+ expect(probes).toBe(2);
329
+ timer.stop();
330
+ expect(cleared).toBe(true);
331
+ });
332
+
333
+ test("a probe that throws is swallowed (the timer callback never crashes the process)", () => {
334
+ const fakeHolder = {
335
+ get: () => new Database(":memory:"),
336
+ healOrExit: () => "ignored" as const,
337
+ probePath: (): "ok" => {
338
+ throw new Error("unexpected");
339
+ },
340
+ };
341
+ let registered: (() => void) | undefined;
342
+ startDbPathLivenessTimer<number>(fakeHolder, {
343
+ setIntervalFn: (cb) => {
344
+ registered = cb;
345
+ return 1;
346
+ },
347
+ clearIntervalFn: () => {},
348
+ log: () => {},
349
+ });
350
+ // Must NOT throw out of the callback.
351
+ expect(() => registered?.()).not.toThrow();
352
+ });
353
+ });
@@ -110,6 +110,69 @@ describe("hubFetch routing", () => {
110
110
  }
111
111
  });
112
112
 
113
+ test("/health reports db:ok when getDb is live and the proactive path probe is ok (#610)", async () => {
114
+ const h = makeHarness();
115
+ try {
116
+ const db = openHubDb(hubDbPath(h.dir));
117
+ try {
118
+ const res = await hubFetch(h.dir, {
119
+ getDb: () => db,
120
+ probeDbPath: () => "ok",
121
+ })(req("/health"));
122
+ expect(res.status).toBe(200);
123
+ const body = (await res.json()) as { status: string; db: string };
124
+ expect(body.status).toBe("ok");
125
+ expect(body.db).toBe("ok");
126
+ } finally {
127
+ db.close();
128
+ }
129
+ } finally {
130
+ h.cleanup();
131
+ }
132
+ });
133
+
134
+ test("/health surfaces db:error:path-gone when the proactive probe sees a wiped path (#610)", async () => {
135
+ // The ghost-fd lie: SELECT 1 still succeeds against the unlinked inode, so
136
+ // probeDbLiveness alone would report ok. probeDbPath stat()s the PATH and
137
+ // returns "gone" → /health must report the fault instead of lying.
138
+ const h = makeHarness();
139
+ try {
140
+ const db = openHubDb(hubDbPath(h.dir));
141
+ try {
142
+ const res = await hubFetch(h.dir, {
143
+ getDb: () => db,
144
+ probeDbPath: () => "gone",
145
+ })(req("/health"));
146
+ expect(res.status).toBe(200); // /health stays 200 (process liveness)
147
+ const body = (await res.json()) as { db: string };
148
+ expect(body.db).toBe("error: path-gone");
149
+ } finally {
150
+ db.close();
151
+ }
152
+ } finally {
153
+ h.cleanup();
154
+ }
155
+ });
156
+
157
+ test("/health surfaces db:error:path-replaced when the proactive probe sees an inode swap (#610)", async () => {
158
+ const h = makeHarness();
159
+ try {
160
+ const db = openHubDb(hubDbPath(h.dir));
161
+ try {
162
+ const res = await hubFetch(h.dir, {
163
+ getDb: () => db,
164
+ probeDbPath: () => "replaced",
165
+ })(req("/health"));
166
+ const body = (await res.json()) as { db: string };
167
+ expect(body.db).toBe("error: path-replaced");
168
+ } finally {
169
+ db.close();
170
+ }
171
+ } finally {
172
+ h.cleanup();
173
+ }
174
+ });
175
+
113
176
  test("/ renders the signed-out indicator dynamically when DB is configured but no session cookie (rc.13)", async () => {
114
177
  // The dynamic path takes over from the static disk file the moment a
115
178
  // DB is configured. With no session cookie, we still render — just
@@ -377,6 +377,127 @@ describe("install", () => {
377
377
  }
378
378
  });
379
379
 
380
+ test("ADOPT-KILLS an attributable same-module orphan on the canonical port + reclaims it (#609)", async () => {
381
+ // Wipe-recovery: `rm -rf ~/.parachute` + re-`init` leaves the supervised
382
+ // vault child running on :1940. The fresh install must reclaim the canonical
383
+ // port (adopt-kill the attributable orphan) rather than port-walk to 1944.
384
+ const { path, configDir, cleanup } = makeTempPath();
385
+ try {
386
+ const logs: string[] = [];
387
+ const kills: Array<{ pid: number; signal: string | number }> = [];
388
+ // installDir = /opt/.parachute/vault → the orphan's cmdline carries it, so
389
+ // attribution (per-module marker = installDir) succeeds.
390
+ const installDirPkg = "/opt/.parachute/vault/package.json";
391
+ // Port 1940 is held UNTIL the SIGTERM lands; after the kill the re-probe
392
+ // (collectOccupiedPorts) sees it free, so the assignment lands on 1940.
393
+ let killed = false;
394
+ const code = await install("vault", {
395
+ runner: async () => 0,
396
+ manifestPath: path,
397
+ configDir,
398
+ startService: async () => 0,
399
+ isLinked: () => false,
400
+ findGlobalInstall: () => installDirPkg,
401
+ portProbe: async (p) => p === 1940 && !killed,
402
+ pidOnPort: (p) => (p === 1940 && !killed ? 7777 : undefined),
403
+ ownerOfPid: (pid) =>
404
+ pid === 7777 ? "parachute-vault --port 1940 (/opt/.parachute/vault)" : undefined,
405
+ killPid: (pid, signal) => {
406
+ kills.push({ pid, signal });
407
+ killed = true; // the orphan releases the port on SIGTERM
408
+ },
409
+ sleep: async () => {},
410
+ reclaimDelayMs: 0,
411
+ log: (l) => logs.push(l),
412
+ });
413
+ expect(code).toBe(0);
414
+ const joined = logs.join("\n");
415
+ // We adopt-killed the attributable orphan…
416
+ expect(joined).toMatch(/attributable prior vault instance \(pid 7777/);
417
+ expect(joined).toMatch(/reclaiming it \(adopt-kill\)/);
418
+ expect(kills.map((k) => k.signal)).toContain("SIGTERM");
419
+ // …and the fresh install landed on the CANONICAL port, not a fallback.
420
+ const entry = findService("parachute-vault", path);
421
+ expect(entry?.port).toBe(1940);
422
+ expect(joined).not.toMatch(/is in use; assigned/);
423
+ } finally {
424
+ cleanup();
425
+ }
426
+ });
427
+
428
+ test("escalates to SIGKILL when the orphan ignores SIGTERM (#609)", async () => {
429
+ const { path, configDir, cleanup } = makeTempPath();
430
+ try {
431
+ const logs: string[] = [];
432
+ const signals: Array<string | number> = [];
433
+ const code = await install("vault", {
434
+ runner: async () => 0,
435
+ manifestPath: path,
436
+ configDir,
437
+ startService: async () => 0,
438
+ isLinked: () => false,
439
+ findGlobalInstall: () => "/opt/.parachute/vault/package.json",
440
+ // Orphan never releases 1940 → install ultimately walks (degrades
441
+ // gracefully), but it MUST have escalated to SIGKILL first.
442
+ portProbe: async (p) => p === 1940,
443
+ pidOnPort: (p) => (p === 1940 ? 8888 : undefined),
444
+ ownerOfPid: (pid) => (pid === 8888 ? "parachute-vault (/opt/.parachute/vault)" : undefined),
445
+ killPid: (_pid, signal) => {
446
+ signals.push(signal);
447
+ },
448
+ sleep: async () => {},
449
+ reclaimDelayMs: 0,
450
+ log: (l) => logs.push(l),
451
+ });
452
+ expect(code).toBe(0);
453
+ expect(signals).toContain("SIGTERM");
454
+ expect(signals).toContain("SIGKILL");
455
+ expect(logs.join("\n")).toMatch(/escalated to SIGKILL/);
456
+ } finally {
457
+ cleanup();
458
+ }
459
+ });
460
+
461
+ test("does NOT kill a FOREIGN holder on the canonical port — walks + warns instead (#609 safety)", async () => {
462
+ // The crux: a non-attributable holder (an operator's unrelated process, or a
463
+ // sibling module) on :1940 must NEVER be killed. We fall through to the #590
464
+ // warn-and-walk path unchanged.
465
+ const { path, configDir, cleanup } = makeTempPath();
466
+ try {
467
+ const logs: string[] = [];
468
+ let killCalled = false;
469
+ const code = await install("vault", {
470
+ runner: async () => 0,
471
+ manifestPath: path,
472
+ configDir,
473
+ startService: async () => 0,
474
+ isLinked: () => false,
475
+ findGlobalInstall: () => "/opt/.parachute/vault/package.json",
476
+ portProbe: async (p) => p === 1940,
477
+ pidOnPort: (p) => (p === 1940 ? 5555 : undefined),
478
+ // Foreign cmdline — does NOT contain the vault installDir marker.
479
+ ownerOfPid: (pid) => (pid === 5555 ? "/usr/bin/python3 /opt/my-own-server.py" : undefined),
480
+ killPid: () => {
481
+ killCalled = true;
482
+ },
483
+ sleep: async () => {},
484
+ reclaimDelayMs: 0,
485
+ log: (l) => logs.push(l),
486
+ });
487
+ expect(code).toBe(0);
488
+ expect(killCalled).toBe(false); // NEVER kill a foreign process
489
+ const joined = logs.join("\n");
490
+ // The #590 warn-and-walk path is unchanged for the foreign holder.
491
+ expect(joined).toMatch(/canonical port 1940 is in use; assigned/);
492
+ expect(joined).toContain("pid 5555 (/usr/bin/python3 /opt/my-own-server.py)");
493
+ expect(joined).toMatch(/stale pre-supervisor daemon/);
494
+ const entry = findService("parachute-vault", path);
495
+ expect(entry?.port).not.toBe(1940); // walked, not reclaimed
496
+ } finally {
497
+ cleanup();
498
+ }
499
+ });
500
+
380
501
  test("squatter pid present but command line unreadable → names the pid alone (#590)", async () => {
381
502
  const { path, configDir, cleanup } = makeTempPath();
382
503
  try {
@@ -179,6 +179,46 @@ describe("deriveWizardState", () => {
179
179
  }
180
180
  });
181
181
 
182
+ test("vault step when admin exists and only the SEED placeholder vault row is present (hub#607)", async () => {
183
+ // `parachute init` seeds a `parachute-vault` placeholder into
184
+ // services.json at SEED_VERSION ("0.0.0-linked") under hub#168 Cut 1
185
+ // (`noCreate`): the MODULE is installed, but no instance exists yet.
186
+ // Pre-#607, `hasVault` keyed off a bare `findService(...) !== undefined`
187
+ // check, which the placeholder satisfied — so the wizard silently
188
+ // skipped its vault step on EVERY init'd box and the operator finished
189
+ // setup with no vault. The placeholder must NOT count as a real vault.
190
+ const db = openHubDb(hubDbPath(h.dir));
191
+ try {
192
+ await createUser(db, "owner", "pw");
193
+ writeManifest(
194
+ {
195
+ services: [
196
+ {
197
+ name: "parachute-vault",
198
+ version: "0.0.0-linked",
199
+ port: 1940,
200
+ paths: ["/vault/default"],
201
+ health: "/health",
202
+ },
203
+ ],
204
+ },
205
+ h.manifestPath,
206
+ );
207
+ const s = deriveWizardState({
208
+ db,
209
+ manifestPath: h.manifestPath,
210
+ readExposeStateFn: h.readExposeStateFn,
211
+ });
212
+ // The placeholder is module-installed-but-no-instance, so the wizard
213
+ // still owns vault creation: it presents the create/import/skip step.
214
+ expect(s.step).toBe("vault");
215
+ expect(s.hasAdmin).toBe(true);
216
+ expect(s.hasVault).toBe(false);
217
+ } finally {
218
+ db.close();
219
+ }
220
+ });
221
+
182
222
  test("expose step when admin + vault exist but expose mode not set yet (hub#268 Item 2)", async () => {
183
223
  const db = openHubDb(hubDbPath(h.dir));
184
224
  try {
@@ -1506,6 +1546,105 @@ describe("handleSetupVaultPost", () => {
1506
1546
  }
1507
1547
  });
1508
1548
 
1549
+ test("create on a SEED-placeholder box: does NOT short-circuit + drives the supervisor to start vault (hub#607 + hub#608)", async () => {
1550
+ // hub#607 + hub#608 coupled fresh-operator flow. On an init'd box,
1551
+ // services.json already carries a `parachute-vault` placeholder at
1552
+ // SEED_VERSION ("0.0.0-linked") — the MODULE is installed, no instance
1553
+ // exists. With the hub#607 `hasVault` discrimination, the wizard's vault
1554
+ // step still appears (the placeholder isn't a real vault), so a
1555
+ // `mode=create` POST must NOT be treated as "already provisioned" and
1556
+ // short-circuit to expose. It runs `runInstall`, which seeds/stamps the
1557
+ // row and — the hub#608 fix — drives `supervisor.start(...)` so the
1558
+ // freshly-created vault is ACTIVE immediately, not inactive-until-
1559
+ // restart. We assert both: the install op fired (no short-circuit) AND
1560
+ // the supervisor now reports a live vault child.
1561
+ const db = openHubDb(hubDbPath(h.dir));
1562
+ try {
1563
+ const user = await createUser(db, "owner", "pw");
1564
+ const { createSession, SESSION_COOKIE_NAME: SC } = await import("../sessions.ts");
1565
+ const session = createSession(db, { userId: user.id });
1566
+ // Simulate `parachute init`: the vault MODULE is seeded as a
1567
+ // placeholder, no supervisor entry yet (init ran with noStart).
1568
+ writeManifest(
1569
+ {
1570
+ services: [
1571
+ {
1572
+ name: "parachute-vault",
1573
+ version: "0.0.0-linked",
1574
+ port: 1940,
1575
+ paths: ["/vault/default"],
1576
+ health: "/health",
1577
+ },
1578
+ ],
1579
+ },
1580
+ h.manifestPath,
1581
+ );
1582
+ const get = handleSetupGet(req("/admin/setup"), {
1583
+ db,
1584
+ manifestPath: h.manifestPath,
1585
+ configDir: h.dir,
1586
+ readExposeStateFn: h.readExposeStateFn,
1587
+ issuer: "https://hub.example",
1588
+ registry: getDefaultOperationsRegistry(),
1589
+ });
1590
+ const csrf = setCookie(get, CSRF_COOKIE_NAME) ?? "";
1591
+ const supervisor = makeSupervisor();
1592
+ // Sanity: no vault child before the wizard create.
1593
+ expect(supervisor.get("vault")).toBeUndefined();
1594
+ const runCalls: string[][] = [];
1595
+ const stubbedRun = async (cmd: readonly string[]) => {
1596
+ runCalls.push([...cmd]);
1597
+ return 0;
1598
+ };
1599
+ const post = await handleSetupVaultPost(
1600
+ req("/admin/setup/vault", {
1601
+ method: "POST",
1602
+ body: new URLSearchParams({
1603
+ [CSRF_FIELD_NAME]: csrf,
1604
+ mode: "create",
1605
+ vault_name: "myvault",
1606
+ scribe_provider: "none",
1607
+ }).toString(),
1608
+ headers: {
1609
+ "content-type": "application/x-www-form-urlencoded",
1610
+ cookie: `${CSRF_COOKIE_NAME}=${csrf}; ${SC}=${session.id}`,
1611
+ },
1612
+ }),
1613
+ {
1614
+ db,
1615
+ manifestPath: h.manifestPath,
1616
+ configDir: h.dir,
1617
+ readExposeStateFn: h.readExposeStateFn,
1618
+ issuer: "https://hub.example",
1619
+ supervisor,
1620
+ registry: getDefaultOperationsRegistry(),
1621
+ run: stubbedRun,
1622
+ isLinked: () => false,
1623
+ },
1624
+ );
1625
+ // Not short-circuited: the placeholder is not a real vault, so the
1626
+ // POST enqueues an install op rather than redirecting to expose.
1627
+ expect(post.status).toBe(303);
1628
+ const location = post.headers.get("location") ?? "";
1629
+ expect(location).toMatch(/^\/admin\/setup\?op=/);
1630
+ // Let the background runInstall promise reach the runner + supervisor.
1631
+ await new Promise((r) => setTimeout(r, 50));
1632
+ // #607 proof: the install actually ran (not the "already provisioned"
1633
+ // short-circuit, which fires no `bun add`).
1634
+ expect(runCalls.some((c) => c.join(" ").includes("bun add -g @openparachute/vault"))).toBe(
1635
+ true,
1636
+ );
1637
+ // #608 proof: the supervisor was driven to start the vault child, so
1638
+ // the vault is live immediately after the wizard create — no manual
1639
+ // `parachute start vault` / hub restart needed.
1640
+ const vaultState = supervisor.get("vault");
1641
+ expect(vaultState).toBeDefined();
1642
+ expect(["starting", "running", "restarting"]).toContain(vaultState?.status ?? "");
1643
+ } finally {
1644
+ db.close();
1645
+ }
1646
+ });
1647
+
1509
1648
  // --- scribe cleanup sub-form (2026-05-27) -----------------------------
1510
1649
  //
1511
1650
  // The vault step's scribe sub-form was extended with a second radio
@@ -7,8 +7,12 @@ import { CONFIG_DIR, SERVICES_MANIFEST_PATH } from "../config.ts";
7
7
  import { type ExposeState, readExposeState } from "../expose-state.ts";
8
8
  import {
9
9
  HUB_DEFAULT_PORT,
10
+ type KillFn,
10
11
  type PidOnPortFn,
12
+ type SleepFn,
13
+ defaultKill,
11
14
  defaultPidOnPort,
15
+ defaultSleep,
12
16
  readHubPort,
13
17
  } from "../hub-control.ts";
14
18
  import { type HubUnitDeps, defaultHubUnitDeps, isHubUnitInstalled } from "../hub-unit.ts";
@@ -18,6 +22,7 @@ import {
18
22
  readModuleManifest,
19
23
  validateModuleManifest,
20
24
  } from "../module-manifest.ts";
25
+ import { orphanAttributable } from "../orphan-attribution.ts";
21
26
  import { assignServicePort } from "../port-assign.ts";
22
27
  import { finalizeModuleInstall, stampInstallDirOnRow } from "../post-install.ts";
23
28
  import {
@@ -323,6 +328,30 @@ export interface InstallOpts {
323
328
  * `defaultOwnerOfPid` (`ps -o command= -p <pid>`); tests inject a stub.
324
329
  */
325
330
  ownerOfPid?: OwnerProbeFn;
331
+ /**
332
+ * Test seam for the install-time canonical-port ADOPT-KILL (#609). When the
333
+ * canonical port is held by an attributable prior instance of THE SAME module
334
+ * (a surviving orphan child after `rm -rf ~/.parachute` + re-`init`), we
335
+ * SIGTERM→SIGKILL it to reclaim the canonical port instead of walking to a
336
+ * non-canonical fallback. Reuses the #601 `orphanAttributable` machinery in
337
+ * per-module mode (marker = this install's `installDir`); a foreign /
338
+ * unattributable holder is NEVER killed — it falls through to the #590
339
+ * warn-and-walk path. Production wires `defaultKill` (`process.kill`); tests
340
+ * inject a spy so no real process is signalled.
341
+ */
342
+ killPid?: KillFn;
343
+ /**
344
+ * Test seam for the grace delay between SIGTERM and the SIGKILL escalation in
345
+ * the #609 adopt-kill. Production wires `defaultSleep`; tests inject a no-op
346
+ * so the path runs instantly.
347
+ */
348
+ sleep?: SleepFn;
349
+ /**
350
+ * Test seam: ms to wait after SIGTERM before re-probing + escalating to
351
+ * SIGKILL in the #609 adopt-kill. Default 1500ms (a listener-release grace);
352
+ * tests pass 0.
353
+ */
354
+ reclaimDelayMs?: number;
326
355
  /**
327
356
  * Test seam for reading `<packageDir>/.parachute/module.json`. Production
328
357
  * uses the real file reader; tests inject a map from package-dir → manifest
@@ -452,6 +481,50 @@ async function collectOccupiedPorts(
452
481
  return ports;
453
482
  }
454
483
 
484
+ /**
485
+ * Adopt-kill an ATTRIBUTABLE orphan holding the canonical port (#609). The
486
+ * caller has ALREADY confirmed attribution (per-module marker) — this is purely
487
+ * the signal sequence, mirroring the supervisor's `adoptKillOrphanOnPort`:
488
+ * SIGTERM, a listener-release grace, then SIGKILL only if the SAME pid still
489
+ * holds the SAME port. Best-effort: a kill that doesn't free the port degrades
490
+ * to the normal warn-and-walk path (the subsequent `collectOccupiedPorts` still
491
+ * sees the port held and `assignServicePort` walks).
492
+ *
493
+ * The re-probe before SIGKILL is deliberately NOT re-attributed: we already
494
+ * attributed `holder` to this module, and only escalate if that exact pid still
495
+ * holds the port (the same accepted, vanishingly-small TOCTOU window the
496
+ * supervisor + migrate sweep carry).
497
+ */
498
+ async function adoptKillOnPort(args: {
499
+ port: number;
500
+ holder: number;
501
+ kill: KillFn;
502
+ sleep: SleepFn;
503
+ pidOnPort: PidOnPortFn;
504
+ delayMs: number;
505
+ log: (line: string) => void;
506
+ }): Promise<void> {
507
+ const { port, holder, kill, sleep, pidOnPort, delayMs, log } = args;
508
+ try {
509
+ kill(holder, "SIGTERM");
510
+ } catch {
511
+ // ESRCH (already gone) / EPERM (can't signal) — nothing more to do; the
512
+ // re-probe + walk path handles a still-held port.
513
+ return;
514
+ }
515
+ await sleep(delayMs);
516
+ if (pidOnPort(port) === holder) {
517
+ try {
518
+ kill(holder, "SIGKILL");
519
+ log(` pid ${holder} did not release ${port} on SIGTERM; escalated to SIGKILL.`);
520
+ } catch {
521
+ // Already gone / can't signal — best-effort; fall through to the walk.
522
+ }
523
+ } else {
524
+ log(` reclaimed canonical port ${port} (pid ${holder} released it).`);
525
+ }
526
+ }
527
+
455
528
  function defaultFindGlobalInstall(pkg: string): string | null {
456
529
  for (const prefix of bunGlobalPrefixes()) {
457
530
  const pkgJsonPath = join(prefix, ...pkg.split("/"), "package.json");
@@ -988,23 +1061,98 @@ export async function install(input: string, opts: InstallOpts = {}): Promise<nu
988
1061
  // future installs no longer touch them.
989
1062
  const preInitEntry = findService(entryName, manifestPath);
990
1063
  const probe = opts.portProbe ?? defaultPortProbe;
991
- const occupied = await collectOccupiedPorts(manifestPath, entryName, preInitEntry?.port, probe);
1064
+ const pidOnPort = opts.pidOnPort ?? defaultPidOnPort;
1065
+ const ownerOfPid = opts.ownerOfPid ?? defaultOwnerOfPid;
992
1066
  const canonicalPort = spec.seedEntry?.().port ?? preInitEntry?.port;
1067
+
1068
+ // #609 wipe-recovery adopt-kill: BEFORE assigning, if the canonical port is
1069
+ // held by an attributable prior instance of THE SAME module (the classic
1070
+ // `rm -rf ~/.parachute` + re-`init` case — the supervised vault child keeps
1071
+ // running on :1940 and the fresh install would otherwise port-walk to 1944),
1072
+ // reclaim the port by adopt-killing the orphan rather than walking. Reuses the
1073
+ // #601 `orphanAttributable` machinery in PER-MODULE mode (marker = THIS
1074
+ // install's installDir, the same module-specific marker the supervisor's
1075
+ // crash-restart path uses) so a FOREIGN / sibling-module / operator process is
1076
+ // NEVER killed — it falls through to the #590 warn-and-walk path below.
1077
+ // Detection + module-specific attribution only; the kill is gated hard.
1078
+ // Gate the probe on the canonical port actually being OCCUPIED — when it's
1079
+ // free there's nothing to reclaim, and probing for its holder pid would be wasted work (and
1080
+ // a false "I looked at the port" signal). `probe` is the same TCP listen probe
1081
+ // `collectOccupiedPorts` uses below; a services.json row on the canonical port
1082
+ // also counts as occupied (a prior install's lingering entry).
1083
+ const canonicalOccupied =
1084
+ canonicalPort !== undefined &&
1085
+ (preInitEntry?.port === canonicalPort ||
1086
+ (await (async () => {
1087
+ try {
1088
+ return await probe(canonicalPort);
1089
+ } catch {
1090
+ return false;
1091
+ }
1092
+ })()));
1093
+ if (canonicalPort !== undefined && installDir && canonicalOccupied) {
1094
+ const holder = pidOnPort(canonicalPort);
1095
+ if (holder !== undefined && holder !== process.pid) {
1096
+ const { attributable, cmdline } = orphanAttributable({
1097
+ orphan: holder,
1098
+ // No recorded pid to trust here — a wiped services.json carries none —
1099
+ // so attribution rides entirely on the per-module cmdline marker.
1100
+ recordedPid: undefined,
1101
+ short,
1102
+ startCmdHint: undefined,
1103
+ ownerOfPid,
1104
+ // Per-module marker = installDir (e.g. `~/.parachute/vault/`); a prior
1105
+ // instance of this module was launched from there, so its `ps` cmdline
1106
+ // carries it. NOT the broad `parachute` marker — that would let a
1107
+ // sibling module's orphan on this port be (wrongly) adopted.
1108
+ moduleMarker: installDir,
1109
+ });
1110
+ if (attributable) {
1111
+ log(
1112
+ `Canonical port ${canonicalPort} is held by an attributable prior ${short} instance (pid ${holder}${cmdline ? `, ${cmdline}` : ""}) — reclaiming it (adopt-kill) instead of walking to a fallback (#609).`,
1113
+ );
1114
+ await adoptKillOnPort({
1115
+ port: canonicalPort,
1116
+ holder,
1117
+ kill: opts.killPid ?? defaultKill,
1118
+ sleep: opts.sleep ?? defaultSleep,
1119
+ pidOnPort,
1120
+ delayMs: opts.reclaimDelayMs ?? 1500,
1121
+ log,
1122
+ });
1123
+ }
1124
+ }
1125
+ }
1126
+
1127
+ // Hub-as-port-authority (#53): pick the service's port now and reflect it
1128
+ // in services.json. Pre-hub#206 the install path also wrote `PORT=<port>`
1129
+ // into the service's `.env`; post-#206 (option A) services.json is the
1130
+ // single source of truth — services follow the 4-tier resolvePort ladder
1131
+ // (services.json → service config → bare PORT env → compiled-in default,
1132
+ // per parachute-scribe#41 / parachute-agent#146 / parachute-agent#148 /
1133
+ // parachute-patterns#45), so the duplicate `.env` PORT was at best dead
1134
+ // weight and at worst a source of drift on re-install. Existing `.env`
1135
+ // PORT lines on operator machines stay where they are — harmless — and
1136
+ // future installs no longer touch them.
1137
+ //
1138
+ // collectOccupiedPorts runs AFTER the #609 adopt-kill above so a reclaimed
1139
+ // canonical port is seen as free and the assignment lands on it (no walk).
1140
+ const occupied = await collectOccupiedPorts(manifestPath, entryName, preInitEntry?.port, probe);
993
1141
  const portResult = assignServicePort({
994
1142
  canonical: canonicalPort,
995
1143
  occupied,
996
1144
  });
997
1145
  if (portResult.warning) {
998
1146
  log(`⚠ ${portResult.warning}`);
999
- // #590 item 2: the canonical port was held, so we walked to a fallback. Name
1000
- // the squatter — the supervisor start-path does this post-#581; do it here at
1001
- // install-time too. Reuse the #581 pidOnPort / ownerOfPid seams (detection
1002
- // only; never kill). When the holder is a foreign pid (not one of OUR rows —
1003
- // which is the common case when a stale pre-supervisor daemon is squatting),
1004
- // surface its pid + command line + a hint.
1147
+ // #590 item 2: the canonical port was held by a NON-attributable holder (the
1148
+ // #609 adopt-kill above already reclaimed an attributable same-module
1149
+ // orphan), so we walked to a fallback. Name the squatter — the supervisor
1150
+ // start-path does this post-#581; do it here at install-time too. Reuse the
1151
+ // #581 pidOnPort / ownerOfPid seams (detection only; never kill). When the
1152
+ // holder is a foreign pid (not one of OUR rows — which is the common case
1153
+ // when a stale pre-supervisor daemon is squatting), surface its pid +
1154
+ // command line + a hint.
1005
1155
  if (canonicalPort !== undefined && portResult.source !== "canonical") {
1006
- const pidOnPort = opts.pidOnPort ?? defaultPidOnPort;
1007
- const ownerOfPid = opts.ownerOfPid ?? defaultOwnerOfPid;
1008
1156
  const holder = pidOnPort(canonicalPort);
1009
1157
  if (holder !== undefined) {
1010
1158
  const cmdline = ownerOfPid(holder);
@@ -1,7 +1,8 @@
1
1
  import type { Database } from "bun:sqlite";
2
+ import { statSync } from "node:fs";
2
3
 
3
4
  /**
4
- * SQLite-handle liveness + self-heal policy (#594).
5
+ * SQLite-handle liveness + self-heal policy (#594, #610).
5
6
  *
6
7
  * Field repro: an operator deleted `~/.parachute` while the hub unit was
7
8
  * running. The process kept an fd to the now-unlinked `hub.db` inode — cached
@@ -12,12 +13,33 @@ import type { Database } from "bun:sqlite";
12
13
  * /health is the worst possible failure shape — a crash-restart would have
13
14
  * self-healed in seconds (the platform manager re-`openHubDb`s a fresh handle).
14
15
  *
15
- * The policy here: on a request that hits the persistent-corruption error
16
- * class, attempt ONE reopen of the handle; if reopen fails OR the error
17
- * recurs immediately, log loudly and `process.exit(1)` so the platform
18
- * manager (launchd / systemd / container runtime) restarts with a fresh
19
- * handle. We are careful to scope "fatal" to the persistent class — a
20
- * transient `SQLITE_BUSY` (a momentary write lock) must NOT kill the hub.
16
+ * The REACTIVE policy (#594): on a request that hits the persistent-corruption
17
+ * error class, attempt ONE reopen of the handle; if reopen fails OR the error
18
+ * recurs immediately, log loudly and `process.exit(1)` so the platform manager
19
+ * (launchd / systemd / container runtime) restarts with a fresh handle. We are
20
+ * careful to scope "fatal" to the persistent class — a transient `SQLITE_BUSY`
21
+ * (a momentary write lock) must NOT kill the hub.
22
+ *
23
+ * The PROACTIVE policy (#610): the reactive path above only fires on a THROWN
24
+ * error. But on Linux, `rm -rf ~/.parachute` under a running hub does NOT throw
25
+ * — the kernel keeps the unlinked `hub.db` inode alive behind the open fd, so
26
+ * `SELECT 1` and even writes keep succeeding against the ghost (deleted) inode
27
+ * indefinitely. Nothing throws ⇒ the reactive self-heal never fires ⇒ `/health`
28
+ * lies `db:"ok"` forever against a database that's gone from disk. The proactive
29
+ * check closes this gap WITHOUT relying on a thrown error: at open time we record
30
+ * the db file's inode (`st_dev`/`st_ino`); a low-frequency probe (and `/health`'s
31
+ * db check) re-`stat()`s the configured path and compares. ENOENT on the path, or
32
+ * an inode mismatch, means the on-disk DB the handle points at is gone / replaced
33
+ * ⇒ trigger the SAME reopen-or-exit machinery (here the path is gone, so reopen's
34
+ * verify fails and we exit, letting the platform manager restart with a fresh,
35
+ * on-disk handle in seconds rather than "never").
36
+ *
37
+ * SAFETY (both policies): we only ever escalate to reopen/exit on the genuine
38
+ * persistent signal — a thrown fatal error, or a definitively gone/replaced path.
39
+ * Transient conditions (SQLITE_BUSY, a momentary lock, a stat() that fails for a
40
+ * reason OTHER than ENOENT — e.g. EACCES, EINTR) NEVER trigger it. The exit fn is
41
+ * injectable so no test can kill the test process (hub#535 precedent), and the
42
+ * proactive timer is bounded so it can't spin.
21
43
  */
22
44
 
23
45
  /**
@@ -120,12 +142,92 @@ export function probeDbLiveness(db: Database): "ok" | string {
120
142
  }
121
143
  }
122
144
 
145
+ /**
146
+ * The identity of an on-disk file — `st_dev`/`st_ino`, the only two fields that
147
+ * uniquely identify an inode across a delete+recreate. We snapshot this for the
148
+ * db path at open time so the proactive probe (#610) can tell "same file the
149
+ * handle points at" from "path now resolves to a DIFFERENT (or no) inode".
150
+ */
151
+ export interface DbInode {
152
+ dev: number;
153
+ ino: number;
154
+ }
155
+
156
+ /**
157
+ * Injectable `stat` of the db PATH (not the open handle). Production wires
158
+ * {@link defaultStatInode} (`fs.statSync`); tests inject a function that returns
159
+ * a chosen inode, `undefined` for ENOENT (path gone), or throws a non-ENOENT
160
+ * error (e.g. EACCES — a TRANSIENT failure that must NOT trigger self-heal).
161
+ *
162
+ * Contract: return the {@link DbInode} on success, `undefined` when the path
163
+ * does not exist (ENOENT — the genuine "wiped" signal), and THROW for any other
164
+ * error (so the caller can treat it as transient and leave the hub alone).
165
+ */
166
+ export type StatInodeFn = (path: string) => DbInode | undefined;
167
+
168
+ /**
169
+ * Production `stat`: returns the path's inode, or `undefined` on ENOENT. Any
170
+ * other error (EACCES, EINTR, …) is re-thrown so the caller classifies it as
171
+ * transient — we only ever self-heal on a DEFINITIVELY-gone path.
172
+ */
173
+ export const defaultStatInode: StatInodeFn = (path) => {
174
+ try {
175
+ const st = statSync(path);
176
+ return { dev: st.dev, ino: st.ino };
177
+ } catch (err) {
178
+ if (err && typeof err === "object" && (err as { code?: unknown }).code === "ENOENT") {
179
+ return undefined;
180
+ }
181
+ throw err;
182
+ }
183
+ };
184
+
185
+ /**
186
+ * The verdict of a proactive path-liveness check (#610), against the inode the
187
+ * handle was opened on:
188
+ * - `"ok"` → the path still resolves to the SAME inode the handle holds.
189
+ * - `"gone"` → the path no longer exists (ENOENT) — the state dir was wiped.
190
+ * - `"replaced"` → the path exists but resolves to a DIFFERENT inode — the DB
191
+ * file was deleted + recreated underneath the handle.
192
+ * - `"unknown"` → we couldn't snapshot the open inode (no baseline) so we
193
+ * can't compare; treated as a non-signal (never self-heals).
194
+ *
195
+ * Only `"gone"`/`"replaced"` are the genuine wipe signal that triggers self-heal.
196
+ */
197
+ export type PathLivenessClass = "ok" | "gone" | "replaced" | "unknown";
198
+
199
+ /**
200
+ * Pure classifier: compare the inode the path resolves to NOW (or `undefined`
201
+ * for ENOENT) against the inode the open handle was created on. No I/O — the
202
+ * caller does the `stat()` and the open-inode snapshot; this is the decision so
203
+ * it's trivially unit-testable and the "never fire on transient" rule is a
204
+ * single, auditable function.
205
+ *
206
+ * A non-ENOENT stat error is NOT represented here — the caller (`statInode`'s
207
+ * contract) THROWS on it, and the probe treats a thrown stat as transient
208
+ * (leaves the hub alone). Only a clean ENOENT (`current === undefined`) or a
209
+ * clean inode mismatch reaches a self-heal verdict.
210
+ */
211
+ export function classifyPathLiveness(args: {
212
+ /** The inode the open db handle was created on (snapshot at open). */
213
+ expected: DbInode | undefined;
214
+ /** The inode the path resolves to NOW, or `undefined` for ENOENT. */
215
+ current: DbInode | undefined;
216
+ }): PathLivenessClass {
217
+ const { expected, current } = args;
218
+ // No baseline → we can't compare; never self-heal on a missing snapshot.
219
+ if (expected === undefined) return "unknown";
220
+ if (current === undefined) return "gone";
221
+ if (current.dev === expected.dev && current.ino === expected.ino) return "ok";
222
+ return "replaced";
223
+ }
224
+
123
225
  /**
124
226
  * A mutable holder for the hub's `Database` handle so a request handler that
125
227
  * hits the fatal error class can swap in a freshly-reopened handle without
126
228
  * re-threading the closure-captured `db` through every call site. `getDb()`
127
- * in hub-server reads `holder.get()`; the self-heal path calls
128
- * `holder.healOrExit(err)`.
229
+ * in hub-server reads `holder.get()`; the reactive self-heal path calls
230
+ * `holder.healOrExit(err)`; the proactive (#610) path calls `holder.probePath()`.
129
231
  */
130
232
  export interface DbHolder {
131
233
  /** The current live handle. */
@@ -145,6 +247,20 @@ export interface DbHolder {
145
247
  * gone), we exit rather than loop — the platform manager owns the restart.
146
248
  */
147
249
  healOrExit(err: unknown): "ignored" | "healed" | "exited";
250
+ /**
251
+ * PROACTIVE path-liveness probe (#610). `stat()`s the configured db PATH and
252
+ * compares its inode to the one the open handle was created on. On a genuine
253
+ * wipe signal (`"gone"`/`"replaced"`) it triggers the SAME reopen-or-exit
254
+ * machinery as `healOrExit` (here the path is gone, so reopen's verify fails
255
+ * → exit → platform manager restarts with a fresh on-disk handle). On `"ok"`,
256
+ * `"unknown"`, or a thrown (transient) stat it does NOTHING.
257
+ *
258
+ * Returns the {@link PathLivenessClass} verdict so `/health` and tests can see
259
+ * what was observed; the `"healed"`/`"exited"` side effects mirror `healOrExit`.
260
+ * Wired into the bounded liveness timer in hub-server AND into `/health`'s db
261
+ * check, so monitoring + the #591 adoption probe see the fault instead of a lie.
262
+ */
263
+ probePath(): PathLivenessClass;
148
264
  }
149
265
 
150
266
  export interface DbHolderDeps {
@@ -156,6 +272,20 @@ export interface DbHolderDeps {
156
272
  exit?: (code: number) => void;
157
273
  /** Close a (presumed-dead) handle best-effort before swapping (default `db.close()`). */
158
274
  closeOld?: (db: Database) => void;
275
+ /**
276
+ * The on-disk db PATH the proactive probe (#610) stat()s. When omitted,
277
+ * `probePath()` is a no-op (`"unknown"`) — backwards-compatible for the
278
+ * reactive-only callers + tests that don't exercise the proactive path.
279
+ */
280
+ dbPath?: string;
281
+ /** Injectable path stat for the proactive probe (default {@link defaultStatInode}). */
282
+ statInode?: StatInodeFn;
283
+ /**
284
+ * The inode the INITIAL handle was opened on. Production passes the snapshot
285
+ * taken right after `openHubDb`; when omitted (or when the snapshot itself
286
+ * failed), `probePath()` returns `"unknown"` and never self-heals.
287
+ */
288
+ initialInode?: DbInode | undefined;
159
289
  }
160
290
 
161
291
  /**
@@ -166,8 +296,13 @@ export interface DbHolderDeps {
166
296
  */
167
297
  export function createDbHolder(initial: Database, deps: DbHolderDeps): DbHolder {
168
298
  let current = initial;
299
+ // The inode the CURRENT handle is bound to. Updated on every successful
300
+ // reopen so the proactive probe (#610) compares against the live handle, not
301
+ // a one-time snapshot that would go stale after a heal.
302
+ let currentInode: DbInode | undefined = deps.initialInode;
169
303
  const log = deps.log ?? ((line) => console.error(line));
170
304
  const exit = deps.exit ?? ((code) => process.exit(code));
305
+ const statInode = deps.statInode ?? defaultStatInode;
171
306
  const closeOld =
172
307
  deps.closeOld ??
173
308
  ((db) => {
@@ -178,34 +313,159 @@ export function createDbHolder(initial: Database, deps: DbHolderDeps): DbHolder
178
313
  }
179
314
  });
180
315
 
316
+ /**
317
+ * Shared reopen-once-or-exit core for BOTH the reactive (`healOrExit`) and
318
+ * proactive (`probePath`) self-heal paths. `reason` is the loud-log preamble
319
+ * describing what triggered it. Returns `"healed"` (fresh handle swapped in +
320
+ * verified) or `"exited"` (reopen failed / new handle dead → exit, which only
321
+ * returns in tests where `exit` is a non-killing spy).
322
+ */
323
+ const reopenOrExit = (reason: string): "healed" | "exited" => {
324
+ log(`parachute hub: ${reason}. Attempting one DB handle reopen…`);
325
+
326
+ let reopened: Database;
327
+ try {
328
+ reopened = deps.reopen();
329
+ // Confirm the fresh handle is actually live before trusting it.
330
+ reopened.query("SELECT 1").get();
331
+ } catch (reopenErr) {
332
+ const rd = reopenErr instanceof Error ? reopenErr.message : String(reopenErr);
333
+ log(
334
+ `parachute hub: DB reopen failed (${rd}); exiting so the platform manager restarts the hub with a fresh handle.`,
335
+ );
336
+ exit(1);
337
+ return "exited";
338
+ }
339
+
340
+ // Reopen succeeded + verified. Swap it in; the old handle is dead.
341
+ closeOld(current);
342
+ current = reopened;
343
+ // Re-snapshot the inode of the path the fresh handle now points at, so the
344
+ // proactive probe tracks the NEW file (best-effort — a failed snapshot
345
+ // leaves `currentInode` undefined → probe returns "unknown", never fires).
346
+ if (deps.dbPath !== undefined) {
347
+ try {
348
+ currentInode = statInode(deps.dbPath);
349
+ } catch {
350
+ currentInode = undefined;
351
+ }
352
+ }
353
+ log("parachute hub: DB handle reopened successfully; continuing.");
354
+ return "healed";
355
+ };
356
+
181
357
  return {
182
358
  get: () => current,
183
359
  healOrExit(err: unknown) {
184
360
  const klass = classifyDbError(err);
185
361
  if (klass !== "fatal") return "ignored";
186
-
187
362
  const detail = err instanceof Error ? err.message : String(err);
188
- log(`parachute hub: persistent SQLite failure (${detail}). Attempting one DB handle reopen…`);
363
+ return reopenOrExit(`persistent SQLite failure (${detail})`);
364
+ },
365
+ probePath(): PathLivenessClass {
366
+ // No path configured → proactive probe disabled (reactive-only callers).
367
+ if (deps.dbPath === undefined) return "unknown";
189
368
 
190
- let reopened: Database;
369
+ // `pathInode` (NOT `current`) — the inode the db PATH resolves to right
370
+ // now. Named distinctly from the outer `current` (the live Database
371
+ // handle) so a reader can't misread this as the DB handle.
372
+ let pathInode: DbInode | undefined;
191
373
  try {
192
- reopened = deps.reopen();
193
- // Confirm the fresh handle is actually live before trusting it.
194
- reopened.query("SELECT 1").get();
195
- } catch (reopenErr) {
196
- const rd = reopenErr instanceof Error ? reopenErr.message : String(reopenErr);
197
- log(
198
- `parachute hub: DB reopen failed (${rd}); exiting so the platform manager restarts the hub with a fresh handle.`,
199
- );
200
- exit(1);
201
- return "exited";
374
+ pathInode = statInode(deps.dbPath);
375
+ } catch {
376
+ // A non-ENOENT stat failure (EACCES, EINTR, a transient FS hiccup) is
377
+ // explicitly NOT a wipe signal. Leave the hub alone — the next probe
378
+ // re-reads. This is the "never fire on transient" guard for the
379
+ // proactive path; only a clean ENOENT/mismatch below self-heals.
380
+ return "ok";
202
381
  }
203
382
 
204
- // Reopen succeeded + verified. Swap it in; the old handle is dead.
205
- closeOld(current);
206
- current = reopened;
207
- log("parachute hub: DB handle reopened successfully; continuing.");
208
- return "healed";
383
+ const verdict = classifyPathLiveness({ expected: currentInode, current: pathInode });
384
+ if (verdict === "ok" || verdict === "unknown") return verdict;
385
+
386
+ // Genuine wipe signal: the on-disk DB the handle points at is gone
387
+ // ("gone") or was replaced underneath us ("replaced"). Trigger the SAME
388
+ // reopen-or-exit machinery. When the path is gone, reopen's SELECT-1
389
+ // verify fails → exit → platform manager restarts with a fresh on-disk
390
+ // handle (seconds, not "never"). When replaced, we adopt the fresh inode.
391
+ //
392
+ // ONE-TICK /health ANOMALY (intentional): on a "replaced" verdict the
393
+ // reopenOrExit below heals SYNCHRONOUSLY, but we still RETURN "replaced"
394
+ // for this one call — so the /health request that drove this probe reports
395
+ // `db:"error: path-replaced"` even though the handle is now healthy; the
396
+ // very next request reads `ok`. We don't mask it (returning "ok" here would
397
+ // hide that a heal just happened, which is exactly what monitoring wants to
398
+ // see). It's safe because #591's adoption probe checks only HTTP 200
399
+ // (`res.ok`), not the specific `db` string, so a single transient error
400
+ // string can't cascade.
401
+ reopenOrExit(
402
+ verdict === "gone"
403
+ ? `db path ${deps.dbPath} no longer exists (state dir wiped under a running hub, #610)`
404
+ : `db path ${deps.dbPath} now resolves to a different inode (DB file replaced underneath the open handle, #610)`,
405
+ );
406
+ return verdict;
407
+ },
408
+ };
409
+ }
410
+
411
+ /** Handle to stop a running proactive-liveness timer (test cleanup + shutdown). */
412
+ export interface DbLivenessTimer {
413
+ stop(): void;
414
+ }
415
+
416
+ export interface DbLivenessTimerDeps<H = unknown> {
417
+ /** Poll cadence in ms. Default 15_000 (low-frequency — this is a safety net,
418
+ * not a hot path; the cost is one `stat()` syscall per tick). */
419
+ intervalMs?: number;
420
+ /** Injectable scheduler (default `setInterval`). Tests drive ticks manually. */
421
+ setIntervalFn?: (cb: () => void, ms: number) => H;
422
+ /** Injectable clear (default `clearInterval`). */
423
+ clearIntervalFn?: (handle: H) => void;
424
+ /** Loud log sink for an unexpected probe throw (default `console.error`). */
425
+ log?: (line: string) => void;
426
+ }
427
+
428
+ /**
429
+ * Start the bounded, low-frequency PROACTIVE liveness timer (#610). Each tick
430
+ * calls `holder.probePath()` — which self-heals (reopen-or-exit) on a genuine
431
+ * wipe and no-ops otherwise. The cadence is fixed (default 15s) so it can NEVER
432
+ * spin: a tick does exactly one `stat()` then sleeps the full interval; even if
433
+ * the probe self-heals + exits, that's terminal. We swallow any unexpected
434
+ * probe throw (logged) rather than let an interval callback crash the process —
435
+ * the probe is a safety net, not a load-bearing request path.
436
+ *
437
+ * `unref()` is called so this timer never keeps the event loop alive on its own
438
+ * (it's purely a watchdog over the already-running server).
439
+ */
440
+ export function startDbPathLivenessTimer<H = ReturnType<typeof setInterval>>(
441
+ holder: DbHolder,
442
+ deps: DbLivenessTimerDeps<H> = {},
443
+ ): DbLivenessTimer {
444
+ const intervalMs = deps.intervalMs ?? 15_000;
445
+ const setIntervalFn =
446
+ deps.setIntervalFn ?? ((cb: () => void, ms: number) => setInterval(cb, ms) as unknown as H);
447
+ const clearIntervalFn =
448
+ deps.clearIntervalFn ??
449
+ ((h: H) => clearInterval(h as unknown as ReturnType<typeof setInterval>));
450
+ const log = deps.log ?? ((line) => console.error(line));
451
+
452
+ const handle = setIntervalFn(() => {
453
+ try {
454
+ holder.probePath();
455
+ } catch (err) {
456
+ // A probe should never throw (probePath catches statInode's non-ENOENT throws, the holder
457
+ // handles the rest), but if it somehow does, don't take the process down
458
+ // from inside a timer callback — log and let the next tick retry.
459
+ const detail = err instanceof Error ? err.message : String(err);
460
+ log(`parachute hub: proactive DB-liveness probe threw unexpectedly (${detail}); ignoring.`);
461
+ }
462
+ }, intervalMs);
463
+ // Don't let the watchdog alone keep the process alive.
464
+ (handle as { unref?: () => void }).unref?.();
465
+
466
+ return {
467
+ stop() {
468
+ clearIntervalFn(handle);
209
469
  },
210
470
  };
211
471
  }
package/src/hub-server.ts CHANGED
@@ -184,7 +184,13 @@ import { applyCorsHeaders, corsPreflightResponse, isCorsAllowedRoute } from "./c
184
184
  import { ensureCsrfToken } from "./csrf.ts";
185
185
  import { readExposeState } from "./expose-state.ts";
186
186
  import { HUB_DEFAULT_PORT, HUB_SVC, clearHubPort, writeHubPort } from "./hub-control.ts";
187
- import { classifyDbError, createDbHolder, probeDbLiveness } from "./hub-db-liveness.ts";
187
+ import {
188
+ classifyDbError,
189
+ createDbHolder,
190
+ defaultStatInode,
191
+ probeDbLiveness,
192
+ startDbPathLivenessTimer,
193
+ } from "./hub-db-liveness.ts";
188
194
  import { hubDbPath, openHubDb } from "./hub-db.ts";
189
195
  import { getHubOrigin } from "./hub-settings.ts";
190
196
  import { type RenderHubOpts, renderHub } from "./hub.ts";
@@ -842,6 +848,17 @@ export interface HubFetchDeps {
842
848
  * the response. Absent in tests that don't exercise the DB-error path.
843
849
  */
844
850
  onDbError?: (err: unknown) => "ignored" | "healed" | "exited";
851
+ /**
852
+ * PROACTIVE db-path liveness probe (#610). Production wires the
853
+ * {@link DbHolder}'s `probePath` so the `/health` db check `stat()`s the
854
+ * configured db path and compares its inode to the open handle's — catching
855
+ * the "operator wiped `~/.parachute` under a running hub" case that NEVER
856
+ * throws on Linux (the unlinked-but-open ghost inode keeps `SELECT 1`
857
+ * succeeding). Returns the path-liveness verdict; on a genuine wipe it ALSO
858
+ * triggers the reopen-or-exit self-heal. Absent in tests that don't exercise
859
+ * the proactive path — `/health` then falls back to the `SELECT 1` probe only.
860
+ */
861
+ probeDbPath?: () => "ok" | "gone" | "replaced" | "unknown";
845
862
  /**
846
863
  * Hub origin used as the OAuth `iss` claim and to build the authorization-
847
864
  * server metadata document. When omitted, OAuth endpoints fall back to the
@@ -1605,7 +1622,26 @@ export function hubFetch(
1605
1622
  let db: "ok" | string = "unconfigured";
1606
1623
  if (getDb) {
1607
1624
  try {
1608
- db = probeDbLiveness(getDb());
1625
+ // PROACTIVE path check FIRST (#610): on Linux a wiped state dir
1626
+ // doesn't throw — the unlinked-but-open ghost inode keeps SELECT 1
1627
+ // succeeding, so `probeDbLiveness` alone would report `db:"ok"` on a
1628
+ // database that's gone from disk (the /health lie the issue calls
1629
+ // out). `probeDbPath` stat()s the path + compares inodes; on a
1630
+ // gone/replaced verdict it ALSO self-heals (reopen-or-exit) and we
1631
+ // surface the fault so the #591 adoption probe + monitoring see it.
1632
+ const pathVerdict = deps?.probeDbPath?.();
1633
+ if (pathVerdict === "gone" || pathVerdict === "replaced") {
1634
+ // One-request anomaly on "replaced": probeDbPath already healed the
1635
+ // handle synchronously, but THIS request still reports the fault
1636
+ // (the next /health reads `db:"ok"`). Intentional — we surface that
1637
+ // a heal just occurred rather than masking it. Safe because #591's
1638
+ // adoption probe gates on HTTP 200 (`res.ok`), not the `db` string,
1639
+ // so a single transient error string can't cascade. ("gone" exits
1640
+ // the process, usually before this response is even sent.)
1641
+ db = `error: path-${pathVerdict}`;
1642
+ } else {
1643
+ db = probeDbLiveness(getDb());
1644
+ }
1609
1645
  } catch {
1610
1646
  // getDb() itself threw (e.g. openHubDb failed) — report it as an
1611
1647
  // error class without letting /health 500.
@@ -2769,12 +2805,36 @@ if (import.meta.main) {
2769
2805
  // touch the DB still works before first open. Once opened, the holder owns
2770
2806
  // reopen-once-or-exit on a persistent SQLite fault.
2771
2807
  let holder: ReturnType<typeof createDbHolder> | undefined;
2772
- const getDb = () => {
2808
+ let livenessTimer: ReturnType<typeof startDbPathLivenessTimer> | undefined;
2809
+ const ensureHolder = (): ReturnType<typeof createDbHolder> => {
2773
2810
  if (!holder) {
2774
- holder = createDbHolder(openHubDb(dbPath), { reopen: () => openHubDb(dbPath) });
2811
+ const db = openHubDb(dbPath);
2812
+ // Snapshot the inode the handle is bound to NOW, so the proactive probe
2813
+ // (#610) can later notice the path has gone / been replaced. Best-effort
2814
+ // — a failed snapshot leaves the proactive probe at "unknown" (it never
2815
+ // self-heals without a baseline), while the reactive path still covers
2816
+ // thrown faults.
2817
+ let initialInode: ReturnType<typeof defaultStatInode> | undefined;
2818
+ try {
2819
+ initialInode = defaultStatInode(dbPath);
2820
+ } catch {
2821
+ initialInode = undefined;
2822
+ }
2823
+ holder = createDbHolder(db, {
2824
+ reopen: () => openHubDb(dbPath),
2825
+ dbPath,
2826
+ statInode: defaultStatInode,
2827
+ initialInode,
2828
+ });
2829
+ // Start the bounded proactive-liveness watchdog (#610) once the handle is
2830
+ // open. It stat()s the db path on a low-frequency timer and self-heals
2831
+ // (reopen-or-exit) the moment the on-disk DB is wiped — closing the
2832
+ // ghost-fd gap the reactive path can't see (no thrown error on Linux).
2833
+ livenessTimer = startDbPathLivenessTimer(holder);
2775
2834
  }
2776
- return holder.get();
2835
+ return holder;
2777
2836
  };
2837
+ const getDb = () => ensureHolder().get();
2778
2838
  const onDbError = (err: unknown): "ignored" | "healed" | "exited" =>
2779
2839
  holder ? holder.healOrExit(err) : "ignored";
2780
2840
  Bun.serve({
@@ -2792,7 +2852,13 @@ if (import.meta.main) {
2792
2852
  // Bun's equivalent is this. 255s comfortably exceeds Render's edge
2793
2853
  // pool TTL (community-observed ~120s). Closes hub#399.
2794
2854
  idleTimeout: 255,
2795
- fetch: hubFetch(wellKnownDir, { getDb, onDbError, issuer, loopbackPort: port }),
2855
+ fetch: hubFetch(wellKnownDir, {
2856
+ getDb,
2857
+ onDbError,
2858
+ probeDbPath: () => holder?.probePath() ?? "unknown",
2859
+ issuer,
2860
+ loopbackPort: port,
2861
+ }),
2796
2862
  });
2797
2863
  // Register PID + port from the running hub itself so any startup path
2798
2864
  // (spawn-via-`ensureHubRunning` or a direct `bun src/hub-server.ts` from
@@ -2802,6 +2868,7 @@ if (import.meta.main) {
2802
2868
  writePid(HUB_SVC, process.pid);
2803
2869
  writeHubPort(port);
2804
2870
  const cleanup = () => {
2871
+ livenessTimer?.stop();
2805
2872
  clearPid(HUB_SVC);
2806
2873
  clearHubPort();
2807
2874
  };
@@ -75,6 +75,7 @@ import {
75
75
  readOperatorTokenFile,
76
76
  } from "./operator-token.ts";
77
77
  import { isHttpsRequest } from "./request-protocol.ts";
78
+ import { SEED_VERSION } from "./service-spec.ts";
78
79
  import { findService, readManifestLenient } from "./services-manifest.ts";
79
80
  import {
80
81
  SESSION_TTL_MS,
@@ -274,13 +275,29 @@ export function deriveWizardState(deps: {
274
275
  // which maps to `parachute-vault` in services.json.
275
276
  const vaultSpec = specFor(FIRST_VAULT_SHORT);
276
277
  const vaultEntry = findService(vaultSpec.manifestName, deps.manifestPath);
278
+ // hub#607: distinguish the SEED placeholder from a real vault instance.
279
+ // `parachute init` installs the vault MODULE without creating an instance
280
+ // (hub#168 Cut 1: `noCreate`), seeding a services.json entry at
281
+ // SEED_VERSION ("0.0.0-linked") with the canonical `/vault/default` mount.
282
+ // Vault's own first-boot overwrites that entry with the real instance once
283
+ // a vault is actually created. A bare `findService(...) !== undefined`
284
+ // check matches the placeholder, so on EVERY init'd box the wizard treated
285
+ // the vault step as already-done and skipped straight to expose — the
286
+ // operator finished setup with no vault and no prompt. Treat a
287
+ // SEED_VERSION row as "module installed, no instance" so the wizard still
288
+ // presents its create / import / skip step. This is the SAME
289
+ // discrimination `buildWellKnown` gained in hub#577 (it suppresses the
290
+ // phantom `vaults[]` row at SEED_VERSION); both surfaces must agree that a
291
+ // placeholder is not a real vault.
292
+ const vaultIsPlaceholder = vaultEntry !== undefined && vaultEntry.version === SEED_VERSION;
293
+ const hasRealVault = vaultEntry !== undefined && !vaultIsPlaceholder;
277
294
  // hub#168 Cut 2: `setup_vault_skipped === "true"` advances the wizard
278
295
  // past the vault step even when no vault row exists. The operator
279
296
  // explicitly chose Skip; the module is installed (Cut 1) but no
280
297
  // instance was provisioned. Treat as "vault step is done" for the
281
298
  // purposes of state-derivation so the wizard moves to expose.
282
299
  const vaultSkipped = getSetting(deps.db, "setup_vault_skipped") === "true";
283
- const hasVault = vaultEntry !== undefined || vaultSkipped;
300
+ const hasVault = hasRealVault || vaultSkipped;
284
301
  // Expose-mode is the operator's "how will this hub be reached?" answer
285
302
  // (hub#268 Item 2). Stored as a hub_setting; the wizard's expose step
286
303
  // sets it; absence means we should still ask. EXCEPT — if we're