@openparachute/hub 0.6.5-rc.4 → 0.6.5-rc.5
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/package.json +1 -1
- package/src/__tests__/hub-db-liveness.test.ts +215 -1
- package/src/__tests__/hub-server.test.ts +63 -0
- package/src/__tests__/install.test.ts +121 -0
- package/src/__tests__/setup-wizard.test.ts +139 -0
- package/src/commands/install.ts +157 -9
- package/src/hub-db-liveness.ts +287 -27
- package/src/hub-server.ts +73 -6
- package/src/setup-wizard.ts +18 -1
package/package.json
CHANGED
package/src/__tests__/hub-db-liveness.test.ts
CHANGED
|
@@ -1,6 +1,14 @@
|
|
|
1
1
|
import { Database } from "bun:sqlite";
|
|
2
2
|
import { describe, expect, test } from "bun:test";
|
|
3
|
-
import {
|
|
3
|
+
import {
|
|
4
|
+
type DbInode,
|
|
5
|
+
type StatInodeFn,
|
|
6
|
+
classifyDbError,
|
|
7
|
+
classifyPathLiveness,
|
|
8
|
+
createDbHolder,
|
|
9
|
+
probeDbLiveness,
|
|
10
|
+
startDbPathLivenessTimer,
|
|
11
|
+
} from "../hub-db-liveness.ts";
|
|
4
12
|
|
|
5
13
|
/** Build a `SQLiteError`-shaped object with the given code + message. */
|
|
6
14
|
function sqliteErr(code: string, message: string): Error & { code: string } {
|
|
@@ -137,3 +145,209 @@ describe("createDbHolder (#594)", () => {
|
|
|
137
145
|
initial.close();
|
|
138
146
|
});
|
|
139
147
|
});
|
|
148
|
+
|
|
149
|
+
const INODE_A: DbInode = { dev: 1, ino: 100 };
|
|
150
|
+
const INODE_B: DbInode = { dev: 1, ino: 200 };
|
|
151
|
+
|
|
152
|
+
describe("classifyPathLiveness (#610)", () => {
|
|
153
|
+
test("same inode → ok", () => {
|
|
154
|
+
expect(classifyPathLiveness({ expected: INODE_A, current: INODE_A })).toBe("ok");
|
|
155
|
+
expect(classifyPathLiveness({ expected: INODE_A, current: { ...INODE_A } })).toBe("ok");
|
|
156
|
+
});
|
|
157
|
+
test("ENOENT on the path (current undefined) → gone", () => {
|
|
158
|
+
expect(classifyPathLiveness({ expected: INODE_A, current: undefined })).toBe("gone");
|
|
159
|
+
});
|
|
160
|
+
test("different inode → replaced", () => {
|
|
161
|
+
expect(classifyPathLiveness({ expected: INODE_A, current: INODE_B })).toBe("replaced");
|
|
162
|
+
// a different device counts too
|
|
163
|
+
expect(classifyPathLiveness({ expected: INODE_A, current: { dev: 2, ino: 100 } })).toBe(
|
|
164
|
+
"replaced",
|
|
165
|
+
);
|
|
166
|
+
});
|
|
167
|
+
test("no baseline snapshot (expected undefined) → unknown, never self-heals", () => {
|
|
168
|
+
expect(classifyPathLiveness({ expected: undefined, current: INODE_A })).toBe("unknown");
|
|
169
|
+
expect(classifyPathLiveness({ expected: undefined, current: undefined })).toBe("unknown");
|
|
170
|
+
});
|
|
171
|
+
});
|
|
172
|
+
|
|
173
|
+
describe("DbHolder.probePath (#610 proactive detection)", () => {
|
|
174
|
+
/** A holder whose path stat is driven by the injected `statInode`. */
|
|
175
|
+
function makeHolder(opts: {
|
|
176
|
+
initialInode: DbInode | undefined;
|
|
177
|
+
statInode: StatInodeFn;
|
|
178
|
+
onReopen?: () => Database;
|
|
179
|
+
}) {
|
|
180
|
+
const initial = new Database(":memory:");
|
|
181
|
+
let reopens = 0;
|
|
182
|
+
let exits = 0;
|
|
183
|
+
let exitCode: number | undefined;
|
|
184
|
+
const holder = createDbHolder(initial, {
|
|
185
|
+
dbPath: "/fake/hub.db",
|
|
186
|
+
initialInode: opts.initialInode,
|
|
187
|
+
statInode: opts.statInode,
|
|
188
|
+
reopen: () => {
|
|
189
|
+
reopens += 1;
|
|
190
|
+
return opts.onReopen ? opts.onReopen() : new Database(":memory:");
|
|
191
|
+
},
|
|
192
|
+
exit: (code) => {
|
|
193
|
+
exits += 1;
|
|
194
|
+
exitCode = code;
|
|
195
|
+
},
|
|
196
|
+
log: () => {},
|
|
197
|
+
});
|
|
198
|
+
return {
|
|
199
|
+
holder,
|
|
200
|
+
stats: () => ({ reopens, exits, exitCode }),
|
|
201
|
+
cleanup: () => {
|
|
202
|
+
try {
|
|
203
|
+
initial.close();
|
|
204
|
+
} catch {}
|
|
205
|
+
},
|
|
206
|
+
};
|
|
207
|
+
}
|
|
208
|
+
|
|
209
|
+
test("healthy path (same inode) → no reopen, no exit", () => {
|
|
210
|
+
const h = makeHolder({ initialInode: INODE_A, statInode: () => INODE_A });
|
|
211
|
+
expect(h.holder.probePath()).toBe("ok");
|
|
212
|
+
expect(h.stats().reopens).toBe(0);
|
|
213
|
+
expect(h.stats().exits).toBe(0);
|
|
214
|
+
h.cleanup();
|
|
215
|
+
});
|
|
216
|
+
|
|
217
|
+
test("path GONE (ENOENT) → reopen attempted; reopen verify fails → exit(1)", () => {
|
|
218
|
+
// Reopen returns a closed handle (the dir is still gone) → SELECT 1 throws
|
|
219
|
+
// → exit. This is the genuine `rm -rf ~/.parachute` field shape.
|
|
220
|
+
const dead = new Database(":memory:");
|
|
221
|
+
dead.close();
|
|
222
|
+
const h = makeHolder({
|
|
223
|
+
initialInode: INODE_A,
|
|
224
|
+
statInode: () => undefined, // ENOENT
|
|
225
|
+
onReopen: () => dead,
|
|
226
|
+
});
|
|
227
|
+
expect(h.holder.probePath()).toBe("gone");
|
|
228
|
+
expect(h.stats().reopens).toBe(1);
|
|
229
|
+
expect(h.stats().exits).toBe(1);
|
|
230
|
+
expect(h.stats().exitCode).toBe(1);
|
|
231
|
+
h.cleanup();
|
|
232
|
+
});
|
|
233
|
+
|
|
234
|
+
test("path REPLACED (different inode) → reopen + swap (heals, no exit)", () => {
|
|
235
|
+
const h = makeHolder({
|
|
236
|
+
initialInode: INODE_A,
|
|
237
|
+
statInode: () => INODE_B, // path now resolves to a different inode
|
|
238
|
+
onReopen: () => new Database(":memory:"),
|
|
239
|
+
});
|
|
240
|
+
expect(h.holder.probePath()).toBe("replaced");
|
|
241
|
+
expect(h.stats().reopens).toBe(1);
|
|
242
|
+
expect(h.stats().exits).toBe(0);
|
|
243
|
+
h.cleanup();
|
|
244
|
+
});
|
|
245
|
+
|
|
246
|
+
test("NEVER fires on a transient stat throw (EACCES) — returns ok, no reopen/exit", () => {
|
|
247
|
+
const h = makeHolder({
|
|
248
|
+
initialInode: INODE_A,
|
|
249
|
+
statInode: () => {
|
|
250
|
+
const e = new Error("permission denied") as Error & { code: string };
|
|
251
|
+
e.code = "EACCES";
|
|
252
|
+
throw e;
|
|
253
|
+
},
|
|
254
|
+
});
|
|
255
|
+
expect(h.holder.probePath()).toBe("ok");
|
|
256
|
+
expect(h.stats().reopens).toBe(0);
|
|
257
|
+
expect(h.stats().exits).toBe(0);
|
|
258
|
+
h.cleanup();
|
|
259
|
+
});
|
|
260
|
+
|
|
261
|
+
test("no baseline inode → unknown, never self-heals (safe degradation)", () => {
|
|
262
|
+
const h = makeHolder({ initialInode: undefined, statInode: () => undefined });
|
|
263
|
+
expect(h.holder.probePath()).toBe("unknown");
|
|
264
|
+
expect(h.stats().reopens).toBe(0);
|
|
265
|
+
expect(h.stats().exits).toBe(0);
|
|
266
|
+
h.cleanup();
|
|
267
|
+
});
|
|
268
|
+
|
|
269
|
+
test("no dbPath configured → probePath is a no-op (unknown)", () => {
|
|
270
|
+
const initial = new Database(":memory:");
|
|
271
|
+
const holder = createDbHolder(initial, {
|
|
272
|
+
reopen: () => new Database(":memory:"),
|
|
273
|
+
exit: () => {},
|
|
274
|
+
log: () => {},
|
|
275
|
+
});
|
|
276
|
+
expect(holder.probePath()).toBe("unknown");
|
|
277
|
+
initial.close();
|
|
278
|
+
});
|
|
279
|
+
|
|
280
|
+
test("after a heal (replaced), the inode baseline is re-snapshotted to the new file", () => {
|
|
281
|
+
// First probe sees INODE_B (replaced) → reopen; statInode then returns
|
|
282
|
+
// INODE_B again so the NEXT probe sees the SAME inode → ok (not a loop).
|
|
283
|
+
let exits = 0;
|
|
284
|
+
const initial = new Database(":memory:");
|
|
285
|
+
const holder = createDbHolder(initial, {
|
|
286
|
+
dbPath: "/fake/hub.db",
|
|
287
|
+
initialInode: INODE_A,
|
|
288
|
+
statInode: () => INODE_B,
|
|
289
|
+
reopen: () => new Database(":memory:"),
|
|
290
|
+
exit: () => {
|
|
291
|
+
exits += 1;
|
|
292
|
+
},
|
|
293
|
+
log: () => {},
|
|
294
|
+
});
|
|
295
|
+
expect(holder.probePath()).toBe("replaced"); // A → B, heal
|
|
296
|
+
expect(holder.probePath()).toBe("ok"); // B → B, no further action
|
|
297
|
+
expect(exits).toBe(0);
|
|
298
|
+
initial.close();
|
|
299
|
+
});
|
|
300
|
+
});
|
|
301
|
+
|
|
302
|
+
describe("startDbPathLivenessTimer (#610 bounded watchdog)", () => {
|
|
303
|
+
test("each tick calls probePath exactly once; stop() clears the timer", () => {
|
|
304
|
+
let probes = 0;
|
|
305
|
+
const fakeHolder = {
|
|
306
|
+
get: () => new Database(":memory:"),
|
|
307
|
+
healOrExit: () => "ignored" as const,
|
|
308
|
+
probePath: () => {
|
|
309
|
+
probes += 1;
|
|
310
|
+
return "ok" as const;
|
|
311
|
+
},
|
|
312
|
+
};
|
|
313
|
+
let registered: (() => void) | undefined;
|
|
314
|
+
let cleared = false;
|
|
315
|
+
const timer = startDbPathLivenessTimer<number>(fakeHolder, {
|
|
316
|
+
setIntervalFn: (cb) => {
|
|
317
|
+
registered = cb;
|
|
318
|
+
return 42;
|
|
319
|
+
},
|
|
320
|
+
clearIntervalFn: (h) => {
|
|
321
|
+
expect(h).toBe(42);
|
|
322
|
+
cleared = true;
|
|
323
|
+
},
|
|
324
|
+
});
|
|
325
|
+
expect(registered).toBeDefined();
|
|
326
|
+
registered?.();
|
|
327
|
+
registered?.();
|
|
328
|
+
expect(probes).toBe(2);
|
|
329
|
+
timer.stop();
|
|
330
|
+
expect(cleared).toBe(true);
|
|
331
|
+
});
|
|
332
|
+
|
|
333
|
+
test("a probe that throws is swallowed (the timer callback never crashes the process)", () => {
|
|
334
|
+
const fakeHolder = {
|
|
335
|
+
get: () => new Database(":memory:"),
|
|
336
|
+
healOrExit: () => "ignored" as const,
|
|
337
|
+
probePath: (): "ok" => {
|
|
338
|
+
throw new Error("unexpected");
|
|
339
|
+
},
|
|
340
|
+
};
|
|
341
|
+
let registered: (() => void) | undefined;
|
|
342
|
+
startDbPathLivenessTimer<number>(fakeHolder, {
|
|
343
|
+
setIntervalFn: (cb) => {
|
|
344
|
+
registered = cb;
|
|
345
|
+
return 1;
|
|
346
|
+
},
|
|
347
|
+
clearIntervalFn: () => {},
|
|
348
|
+
log: () => {},
|
|
349
|
+
});
|
|
350
|
+
// Must NOT throw out of the callback.
|
|
351
|
+
expect(() => registered?.()).not.toThrow();
|
|
352
|
+
});
|
|
353
|
+
});
|
package/src/__tests__/hub-server.test.ts
CHANGED
|
@@ -110,6 +110,69 @@ describe("hubFetch routing", () => {
|
|
|
110
110
|
}
|
|
111
111
|
});
|
|
112
112
|
|
|
113
|
+
test("/health reports db:ok when getDb is live and the proactive path probe is ok (#610)", async () => {
|
|
114
|
+
const h = makeHarness();
|
|
115
|
+
try {
|
|
116
|
+
const db = openHubDb(hubDbPath(h.dir));
|
|
117
|
+
try {
|
|
118
|
+
const res = await hubFetch(h.dir, {
|
|
119
|
+
getDb: () => db,
|
|
120
|
+
probeDbPath: () => "ok",
|
|
121
|
+
})(req("/health"));
|
|
122
|
+
expect(res.status).toBe(200);
|
|
123
|
+
const body = (await res.json()) as { status: string; db: string };
|
|
124
|
+
expect(body.status).toBe("ok");
|
|
125
|
+
expect(body.db).toBe("ok");
|
|
126
|
+
} finally {
|
|
127
|
+
db.close();
|
|
128
|
+
}
|
|
129
|
+
} finally {
|
|
130
|
+
h.cleanup();
|
|
131
|
+
}
|
|
132
|
+
});
|
|
133
|
+
|
|
134
|
+
test("/health surfaces db:error:path-gone when the proactive probe sees a wiped path (#610)", async () => {
|
|
135
|
+
// The ghost-fd lie: SELECT 1 still succeeds against the unlinked inode, so
|
|
136
|
+
// probeDbLiveness alone would report ok. probeDbPath stat()s the PATH and
|
|
137
|
+
// returns "gone" → /health must report the fault instead of lying.
|
|
138
|
+
const h = makeHarness();
|
|
139
|
+
try {
|
|
140
|
+
const db = openHubDb(hubDbPath(h.dir));
|
|
141
|
+
try {
|
|
142
|
+
const res = await hubFetch(h.dir, {
|
|
143
|
+
getDb: () => db,
|
|
144
|
+
probeDbPath: () => "gone",
|
|
145
|
+
})(req("/health"));
|
|
146
|
+
expect(res.status).toBe(200); // /health stays 200 (process liveness)
|
|
147
|
+
const body = (await res.json()) as { db: string };
|
|
148
|
+
expect(body.db).toBe("error: path-gone");
|
|
149
|
+
} finally {
|
|
150
|
+
db.close();
|
|
151
|
+
}
|
|
152
|
+
} finally {
|
|
153
|
+
h.cleanup();
|
|
154
|
+
}
|
|
155
|
+
});
|
|
156
|
+
|
|
157
|
+
test("/health surfaces db:error:path-replaced when the proactive probe sees an inode swap (#610)", async () => {
|
|
158
|
+
const h = makeHarness();
|
|
159
|
+
try {
|
|
160
|
+
const db = openHubDb(hubDbPath(h.dir));
|
|
161
|
+
try {
|
|
162
|
+
const res = await hubFetch(h.dir, {
|
|
163
|
+
getDb: () => db,
|
|
164
|
+
probeDbPath: () => "replaced",
|
|
165
|
+
})(req("/health"));
|
|
166
|
+
const body = (await res.json()) as { db: string };
|
|
167
|
+
expect(body.db).toBe("error: path-replaced");
|
|
168
|
+
} finally {
|
|
169
|
+
db.close();
|
|
170
|
+
}
|
|
171
|
+
} finally {
|
|
172
|
+
h.cleanup();
|
|
173
|
+
}
|
|
174
|
+
});
|
|
175
|
+
|
|
113
176
|
test("/ renders the signed-out indicator dynamically when DB is configured but no session cookie (rc.13)", async () => {
|
|
114
177
|
// The dynamic path takes over from the static disk file the moment a
|
|
115
178
|
// DB is configured. With no session cookie, we still render — just
|
package/src/__tests__/install.test.ts
CHANGED
|
@@ -377,6 +377,127 @@ describe("install", () => {
|
|
|
377
377
|
}
|
|
378
378
|
});
|
|
379
379
|
|
|
380
|
+
test("ADOPT-KILLS an attributable same-module orphan on the canonical port + reclaims it (#609)", async () => {
|
|
381
|
+
// Wipe-recovery: `rm -rf ~/.parachute` + re-`init` leaves the supervised
|
|
382
|
+
// vault child running on :1940. The fresh install must reclaim the canonical
|
|
383
|
+
// port (adopt-kill the attributable orphan) rather than port-walk to 1944.
|
|
384
|
+
const { path, configDir, cleanup } = makeTempPath();
|
|
385
|
+
try {
|
|
386
|
+
const logs: string[] = [];
|
|
387
|
+
const kills: Array<{ pid: number; signal: string | number }> = [];
|
|
388
|
+
// installDir = /opt/.parachute/vault → the orphan's cmdline carries it, so
|
|
389
|
+
// attribution (per-module marker = installDir) succeeds.
|
|
390
|
+
const installDirPkg = "/opt/.parachute/vault/package.json";
|
|
391
|
+
// Port 1940 is held UNTIL the SIGTERM lands; after the kill the re-probe
|
|
392
|
+
// (collectOccupiedPorts) sees it free, so the assignment lands on 1940.
|
|
393
|
+
let killed = false;
|
|
394
|
+
const code = await install("vault", {
|
|
395
|
+
runner: async () => 0,
|
|
396
|
+
manifestPath: path,
|
|
397
|
+
configDir,
|
|
398
|
+
startService: async () => 0,
|
|
399
|
+
isLinked: () => false,
|
|
400
|
+
findGlobalInstall: () => installDirPkg,
|
|
401
|
+
portProbe: async (p) => p === 1940 && !killed,
|
|
402
|
+
pidOnPort: (p) => (p === 1940 && !killed ? 7777 : undefined),
|
|
403
|
+
ownerOfPid: (pid) =>
|
|
404
|
+
pid === 7777 ? "parachute-vault --port 1940 (/opt/.parachute/vault)" : undefined,
|
|
405
|
+
killPid: (pid, signal) => {
|
|
406
|
+
kills.push({ pid, signal });
|
|
407
|
+
killed = true; // the orphan releases the port on SIGTERM
|
|
408
|
+
},
|
|
409
|
+
sleep: async () => {},
|
|
410
|
+
reclaimDelayMs: 0,
|
|
411
|
+
log: (l) => logs.push(l),
|
|
412
|
+
});
|
|
413
|
+
expect(code).toBe(0);
|
|
414
|
+
const joined = logs.join("\n");
|
|
415
|
+
// We adopt-killed the attributable orphan…
|
|
416
|
+
expect(joined).toMatch(/attributable prior vault instance \(pid 7777/);
|
|
417
|
+
expect(joined).toMatch(/reclaiming it \(adopt-kill\)/);
|
|
418
|
+
expect(kills.map((k) => k.signal)).toContain("SIGTERM");
|
|
419
|
+
// …and the fresh install landed on the CANONICAL port, not a fallback.
|
|
420
|
+
const entry = findService("parachute-vault", path);
|
|
421
|
+
expect(entry?.port).toBe(1940);
|
|
422
|
+
expect(joined).not.toMatch(/is in use; assigned/);
|
|
423
|
+
} finally {
|
|
424
|
+
cleanup();
|
|
425
|
+
}
|
|
426
|
+
});
|
|
427
|
+
|
|
428
|
+
test("escalates to SIGKILL when the orphan ignores SIGTERM (#609)", async () => {
|
|
429
|
+
const { path, configDir, cleanup } = makeTempPath();
|
|
430
|
+
try {
|
|
431
|
+
const logs: string[] = [];
|
|
432
|
+
const signals: Array<string | number> = [];
|
|
433
|
+
const code = await install("vault", {
|
|
434
|
+
runner: async () => 0,
|
|
435
|
+
manifestPath: path,
|
|
436
|
+
configDir,
|
|
437
|
+
startService: async () => 0,
|
|
438
|
+
isLinked: () => false,
|
|
439
|
+
findGlobalInstall: () => "/opt/.parachute/vault/package.json",
|
|
440
|
+
// Orphan never releases 1940 → install ultimately walks (degrades
|
|
441
|
+
// gracefully), but it MUST have escalated to SIGKILL first.
|
|
442
|
+
portProbe: async (p) => p === 1940,
|
|
443
|
+
pidOnPort: (p) => (p === 1940 ? 8888 : undefined),
|
|
444
|
+
ownerOfPid: (pid) => (pid === 8888 ? "parachute-vault (/opt/.parachute/vault)" : undefined),
|
|
445
|
+
killPid: (_pid, signal) => {
|
|
446
|
+
signals.push(signal);
|
|
447
|
+
},
|
|
448
|
+
sleep: async () => {},
|
|
449
|
+
reclaimDelayMs: 0,
|
|
450
|
+
log: (l) => logs.push(l),
|
|
451
|
+
});
|
|
452
|
+
expect(code).toBe(0);
|
|
453
|
+
expect(signals).toContain("SIGTERM");
|
|
454
|
+
expect(signals).toContain("SIGKILL");
|
|
455
|
+
expect(logs.join("\n")).toMatch(/escalated to SIGKILL/);
|
|
456
|
+
} finally {
|
|
457
|
+
cleanup();
|
|
458
|
+
}
|
|
459
|
+
});
|
|
460
|
+
|
|
461
|
+
test("does NOT kill a FOREIGN holder on the canonical port — walks + warns instead (#609 safety)", async () => {
|
|
462
|
+
// The crux: a non-attributable holder (an operator's unrelated process, or a
|
|
463
|
+
// sibling module) on :1940 must NEVER be killed. We fall through to the #590
|
|
464
|
+
// warn-and-walk path unchanged.
|
|
465
|
+
const { path, configDir, cleanup } = makeTempPath();
|
|
466
|
+
try {
|
|
467
|
+
const logs: string[] = [];
|
|
468
|
+
let killCalled = false;
|
|
469
|
+
const code = await install("vault", {
|
|
470
|
+
runner: async () => 0,
|
|
471
|
+
manifestPath: path,
|
|
472
|
+
configDir,
|
|
473
|
+
startService: async () => 0,
|
|
474
|
+
isLinked: () => false,
|
|
475
|
+
findGlobalInstall: () => "/opt/.parachute/vault/package.json",
|
|
476
|
+
portProbe: async (p) => p === 1940,
|
|
477
|
+
pidOnPort: (p) => (p === 1940 ? 5555 : undefined),
|
|
478
|
+
// Foreign cmdline — does NOT contain the vault installDir marker.
|
|
479
|
+
ownerOfPid: (pid) => (pid === 5555 ? "/usr/bin/python3 /opt/my-own-server.py" : undefined),
|
|
480
|
+
killPid: () => {
|
|
481
|
+
killCalled = true;
|
|
482
|
+
},
|
|
483
|
+
sleep: async () => {},
|
|
484
|
+
reclaimDelayMs: 0,
|
|
485
|
+
log: (l) => logs.push(l),
|
|
486
|
+
});
|
|
487
|
+
expect(code).toBe(0);
|
|
488
|
+
expect(killCalled).toBe(false); // NEVER kill a foreign process
|
|
489
|
+
const joined = logs.join("\n");
|
|
490
|
+
// The #590 warn-and-walk path is unchanged for the foreign holder.
|
|
491
|
+
expect(joined).toMatch(/canonical port 1940 is in use; assigned/);
|
|
492
|
+
expect(joined).toContain("pid 5555 (/usr/bin/python3 /opt/my-own-server.py)");
|
|
493
|
+
expect(joined).toMatch(/stale pre-supervisor daemon/);
|
|
494
|
+
const entry = findService("parachute-vault", path);
|
|
495
|
+
expect(entry?.port).not.toBe(1940); // walked, not reclaimed
|
|
496
|
+
} finally {
|
|
497
|
+
cleanup();
|
|
498
|
+
}
|
|
499
|
+
});
|
|
500
|
+
|
|
380
501
|
test("squatter pid present but command line unreadable → names the pid alone (#590)", async () => {
|
|
381
502
|
const { path, configDir, cleanup } = makeTempPath();
|
|
382
503
|
try {
|
package/src/__tests__/setup-wizard.test.ts
CHANGED
|
@@ -179,6 +179,46 @@ describe("deriveWizardState", () => {
|
|
|
179
179
|
}
|
|
180
180
|
});
|
|
181
181
|
|
|
182
|
+
test("vault step when admin exists and only the SEED placeholder vault row is present (hub#607)", async () => {
|
|
183
|
+
// `parachute init` seeds a `parachute-vault` placeholder into
|
|
184
|
+
// services.json at SEED_VERSION ("0.0.0-linked") under hub#168 Cut 1
|
|
185
|
+
// (`noCreate`): the MODULE is installed, but no instance exists yet.
|
|
186
|
+
// Pre-#607, `hasVault` keyed off a bare `findService(...) !== undefined`
|
|
187
|
+
// check, which the placeholder satisfied — so the wizard silently
|
|
188
|
+
// skipped its vault step on EVERY init'd box and the operator finished
|
|
189
|
+
// setup with no vault. The placeholder must NOT count as a real vault.
|
|
190
|
+
const db = openHubDb(hubDbPath(h.dir));
|
|
191
|
+
try {
|
|
192
|
+
await createUser(db, "owner", "pw");
|
|
193
|
+
writeManifest(
|
|
194
|
+
{
|
|
195
|
+
services: [
|
|
196
|
+
{
|
|
197
|
+
name: "parachute-vault",
|
|
198
|
+
version: "0.0.0-linked",
|
|
199
|
+
port: 1940,
|
|
200
|
+
paths: ["/vault/default"],
|
|
201
|
+
health: "/health",
|
|
202
|
+
},
|
|
203
|
+
],
|
|
204
|
+
},
|
|
205
|
+
h.manifestPath,
|
|
206
|
+
);
|
|
207
|
+
const s = deriveWizardState({
|
|
208
|
+
db,
|
|
209
|
+
manifestPath: h.manifestPath,
|
|
210
|
+
readExposeStateFn: h.readExposeStateFn,
|
|
211
|
+
});
|
|
212
|
+
// The placeholder is module-installed-but-no-instance, so the wizard
|
|
213
|
+
// still owns vault creation: it presents the create/import/skip step.
|
|
214
|
+
expect(s.step).toBe("vault");
|
|
215
|
+
expect(s.hasAdmin).toBe(true);
|
|
216
|
+
expect(s.hasVault).toBe(false);
|
|
217
|
+
} finally {
|
|
218
|
+
db.close();
|
|
219
|
+
}
|
|
220
|
+
});
|
|
221
|
+
|
|
182
222
|
test("expose step when admin + vault exist but expose mode not set yet (hub#268 Item 2)", async () => {
|
|
183
223
|
const db = openHubDb(hubDbPath(h.dir));
|
|
184
224
|
try {
|
|
@@ -1506,6 +1546,105 @@ describe("handleSetupVaultPost", () => {
|
|
|
1506
1546
|
}
|
|
1507
1547
|
});
|
|
1508
1548
|
|
|
1549
|
+
test("create on a SEED-placeholder box: does NOT short-circuit + drives the supervisor to start vault (hub#607 + hub#608)", async () => {
|
|
1550
|
+
// hub#607 + hub#608 coupled fresh-operator flow. On an init'd box,
|
|
1551
|
+
// services.json already carries a `parachute-vault` placeholder at
|
|
1552
|
+
// SEED_VERSION ("0.0.0-linked") — the MODULE is installed, no instance
|
|
1553
|
+
// exists. With the hub#607 `hasVault` discrimination, the wizard's vault
|
|
1554
|
+
// step still appears (the placeholder isn't a real vault), so a
|
|
1555
|
+
// `mode=create` POST must NOT be treated as "already provisioned" and
|
|
1556
|
+
// short-circuit to expose. It runs `runInstall`, which seeds/stamps the
|
|
1557
|
+
// row and — the hub#608 fix — drives `supervisor.start(...)` so the
|
|
1558
|
+
// freshly-created vault is ACTIVE immediately, not inactive-until-
|
|
1559
|
+
// restart. We assert both: the install op fired (no short-circuit) AND
|
|
1560
|
+
// the supervisor now reports a live vault child.
|
|
1561
|
+
const db = openHubDb(hubDbPath(h.dir));
|
|
1562
|
+
try {
|
|
1563
|
+
const user = await createUser(db, "owner", "pw");
|
|
1564
|
+
const { createSession, SESSION_COOKIE_NAME: SC } = await import("../sessions.ts");
|
|
1565
|
+
const session = createSession(db, { userId: user.id });
|
|
1566
|
+
// Simulate `parachute init`: the vault MODULE is seeded as a
|
|
1567
|
+
// placeholder, no supervisor entry yet (init ran with noStart).
|
|
1568
|
+
writeManifest(
|
|
1569
|
+
{
|
|
1570
|
+
services: [
|
|
1571
|
+
{
|
|
1572
|
+
name: "parachute-vault",
|
|
1573
|
+
version: "0.0.0-linked",
|
|
1574
|
+
port: 1940,
|
|
1575
|
+
paths: ["/vault/default"],
|
|
1576
|
+
health: "/health",
|
|
1577
|
+
},
|
|
1578
|
+
],
|
|
1579
|
+
},
|
|
1580
|
+
h.manifestPath,
|
|
1581
|
+
);
|
|
1582
|
+
const get = handleSetupGet(req("/admin/setup"), {
|
|
1583
|
+
db,
|
|
1584
|
+
manifestPath: h.manifestPath,
|
|
1585
|
+
configDir: h.dir,
|
|
1586
|
+
readExposeStateFn: h.readExposeStateFn,
|
|
1587
|
+
issuer: "https://hub.example",
|
|
1588
|
+
registry: getDefaultOperationsRegistry(),
|
|
1589
|
+
});
|
|
1590
|
+
const csrf = setCookie(get, CSRF_COOKIE_NAME) ?? "";
|
|
1591
|
+
const supervisor = makeSupervisor();
|
|
1592
|
+
// Sanity: no vault child before the wizard create.
|
|
1593
|
+
expect(supervisor.get("vault")).toBeUndefined();
|
|
1594
|
+
const runCalls: string[][] = [];
|
|
1595
|
+
const stubbedRun = async (cmd: readonly string[]) => {
|
|
1596
|
+
runCalls.push([...cmd]);
|
|
1597
|
+
return 0;
|
|
1598
|
+
};
|
|
1599
|
+
const post = await handleSetupVaultPost(
|
|
1600
|
+
req("/admin/setup/vault", {
|
|
1601
|
+
method: "POST",
|
|
1602
|
+
body: new URLSearchParams({
|
|
1603
|
+
[CSRF_FIELD_NAME]: csrf,
|
|
1604
|
+
mode: "create",
|
|
1605
|
+
vault_name: "myvault",
|
|
1606
|
+
scribe_provider: "none",
|
|
1607
|
+
}).toString(),
|
|
1608
|
+
headers: {
|
|
1609
|
+
"content-type": "application/x-www-form-urlencoded",
|
|
1610
|
+
cookie: `${CSRF_COOKIE_NAME}=${csrf}; ${SC}=${session.id}`,
|
|
1611
|
+
},
|
|
1612
|
+
}),
|
|
1613
|
+
{
|
|
1614
|
+
db,
|
|
1615
|
+
manifestPath: h.manifestPath,
|
|
1616
|
+
configDir: h.dir,
|
|
1617
|
+
readExposeStateFn: h.readExposeStateFn,
|
|
1618
|
+
issuer: "https://hub.example",
|
|
1619
|
+
supervisor,
|
|
1620
|
+
registry: getDefaultOperationsRegistry(),
|
|
1621
|
+
run: stubbedRun,
|
|
1622
|
+
isLinked: () => false,
|
|
1623
|
+
},
|
|
1624
|
+
);
|
|
1625
|
+
// Not short-circuited: the placeholder is not a real vault, so the
|
|
1626
|
+
// POST enqueues an install op rather than redirecting to expose.
|
|
1627
|
+
expect(post.status).toBe(303);
|
|
1628
|
+
const location = post.headers.get("location") ?? "";
|
|
1629
|
+
expect(location).toMatch(/^\/admin\/setup\?op=/);
|
|
1630
|
+
// Let the background runInstall promise reach the runner + supervisor.
|
|
1631
|
+
await new Promise((r) => setTimeout(r, 50));
|
|
1632
|
+
// #607 proof: the install actually ran (not the "already provisioned"
|
|
1633
|
+
// short-circuit, which fires no `bun add`).
|
|
1634
|
+
expect(runCalls.some((c) => c.join(" ").includes("bun add -g @openparachute/vault"))).toBe(
|
|
1635
|
+
true,
|
|
1636
|
+
);
|
|
1637
|
+
// #608 proof: the supervisor was driven to start the vault child, so
|
|
1638
|
+
// the vault is live immediately after the wizard create — no manual
|
|
1639
|
+
// `parachute start vault` / hub restart needed.
|
|
1640
|
+
const vaultState = supervisor.get("vault");
|
|
1641
|
+
expect(vaultState).toBeDefined();
|
|
1642
|
+
expect(["starting", "running", "restarting"]).toContain(vaultState?.status ?? "");
|
|
1643
|
+
} finally {
|
|
1644
|
+
db.close();
|
|
1645
|
+
}
|
|
1646
|
+
});
|
|
1647
|
+
|
|
1509
1648
|
// --- scribe cleanup sub-form (2026-05-27) -----------------------------
|
|
1510
1649
|
//
|
|
1511
1650
|
// The vault step's scribe sub-form was extended with a second radio
|
package/src/commands/install.ts
CHANGED
|
@@ -7,8 +7,12 @@ import { CONFIG_DIR, SERVICES_MANIFEST_PATH } from "../config.ts";
|
|
|
7
7
|
import { type ExposeState, readExposeState } from "../expose-state.ts";
|
|
8
8
|
import {
|
|
9
9
|
HUB_DEFAULT_PORT,
|
|
10
|
+
type KillFn,
|
|
10
11
|
type PidOnPortFn,
|
|
12
|
+
type SleepFn,
|
|
13
|
+
defaultKill,
|
|
11
14
|
defaultPidOnPort,
|
|
15
|
+
defaultSleep,
|
|
12
16
|
readHubPort,
|
|
13
17
|
} from "../hub-control.ts";
|
|
14
18
|
import { type HubUnitDeps, defaultHubUnitDeps, isHubUnitInstalled } from "../hub-unit.ts";
|
|
@@ -18,6 +22,7 @@ import {
|
|
|
18
22
|
readModuleManifest,
|
|
19
23
|
validateModuleManifest,
|
|
20
24
|
} from "../module-manifest.ts";
|
|
25
|
+
import { orphanAttributable } from "../orphan-attribution.ts";
|
|
21
26
|
import { assignServicePort } from "../port-assign.ts";
|
|
22
27
|
import { finalizeModuleInstall, stampInstallDirOnRow } from "../post-install.ts";
|
|
23
28
|
import {
|
|
@@ -323,6 +328,30 @@ export interface InstallOpts {
|
|
|
323
328
|
* `defaultOwnerOfPid` (`ps -o command= -p <pid>`); tests inject a stub.
|
|
324
329
|
*/
|
|
325
330
|
ownerOfPid?: OwnerProbeFn;
|
|
331
|
+
/**
|
|
332
|
+
* Test seam for the install-time canonical-port ADOPT-KILL (#609). When the
|
|
333
|
+
* canonical port is held by an attributable prior instance of THE SAME module
|
|
334
|
+
* (a surviving orphan child after `rm -rf ~/.parachute` + re-`init`), we
|
|
335
|
+
* SIGTERM→SIGKILL it to reclaim the canonical port instead of walking to a
|
|
336
|
+
* non-canonical fallback. Reuses the #601 `orphanAttributable` machinery in
|
|
337
|
+
* per-module mode (marker = this install's `installDir`); a foreign /
|
|
338
|
+
* unattributable holder is NEVER killed — it falls through to the #590
|
|
339
|
+
* warn-and-walk path. Production wires `defaultKill` (`process.kill`); tests
|
|
340
|
+
* inject a spy so no real process is signalled.
|
|
341
|
+
*/
|
|
342
|
+
killPid?: KillFn;
|
|
343
|
+
/**
|
|
344
|
+
* Test seam for the grace delay between SIGTERM and the SIGKILL escalation in
|
|
345
|
+
* the #609 adopt-kill. Production wires `defaultSleep`; tests inject a no-op
|
|
346
|
+
* so the path runs instantly.
|
|
347
|
+
*/
|
|
348
|
+
sleep?: SleepFn;
|
|
349
|
+
/**
|
|
350
|
+
* Test seam: ms to wait after SIGTERM before re-probing + escalating to
|
|
351
|
+
* SIGKILL in the #609 adopt-kill. Default 1500ms (a listener-release grace);
|
|
352
|
+
* tests pass 0.
|
|
353
|
+
*/
|
|
354
|
+
reclaimDelayMs?: number;
|
|
326
355
|
/**
|
|
327
356
|
* Test seam for reading `<packageDir>/.parachute/module.json`. Production
|
|
328
357
|
* uses the real file reader; tests inject a map from package-dir → manifest
|
|
@@ -452,6 +481,50 @@ async function collectOccupiedPorts(
|
|
|
452
481
|
return ports;
|
|
453
482
|
}
|
|
454
483
|
|
|
484
|
+
/**
|
|
485
|
+
* Adopt-kill an ATTRIBUTABLE orphan holding the canonical port (#609). The
|
|
486
|
+
* caller has ALREADY confirmed attribution (per-module marker) — this is purely
|
|
487
|
+
* the signal sequence, mirroring the supervisor's `adoptKillOrphanOnPort`:
|
|
488
|
+
* SIGTERM, a listener-release grace, then SIGKILL only if the SAME pid still
|
|
489
|
+
* holds the SAME port. Best-effort: a kill that doesn't free the port degrades
|
|
490
|
+
* to the normal warn-and-walk path (the subsequent `collectOccupiedPorts` still
|
|
491
|
+
* sees the port held and `assignServicePort` walks).
|
|
492
|
+
*
|
|
493
|
+
* The re-probe before SIGKILL is deliberately NOT re-attributed: we already
|
|
494
|
+
* attributed `holder` to this module, and only escalate if that exact pid still
|
|
495
|
+
* holds the port (the same accepted, vanishingly-small TOCTOU window the
|
|
496
|
+
* supervisor + migrate sweep carry).
|
|
497
|
+
*/
|
|
498
|
+
async function adoptKillOnPort(args: {
|
|
499
|
+
port: number;
|
|
500
|
+
holder: number;
|
|
501
|
+
kill: KillFn;
|
|
502
|
+
sleep: SleepFn;
|
|
503
|
+
pidOnPort: PidOnPortFn;
|
|
504
|
+
delayMs: number;
|
|
505
|
+
log: (line: string) => void;
|
|
506
|
+
}): Promise<void> {
|
|
507
|
+
const { port, holder, kill, sleep, pidOnPort, delayMs, log } = args;
|
|
508
|
+
try {
|
|
509
|
+
kill(holder, "SIGTERM");
|
|
510
|
+
} catch {
|
|
511
|
+
// ESRCH (already gone) / EPERM (can't signal) — nothing more to do; the
|
|
512
|
+
// re-probe + walk path handles a still-held port.
|
|
513
|
+
return;
|
|
514
|
+
}
|
|
515
|
+
await sleep(delayMs);
|
|
516
|
+
if (pidOnPort(port) === holder) {
|
|
517
|
+
try {
|
|
518
|
+
kill(holder, "SIGKILL");
|
|
519
|
+
log(` pid ${holder} did not release ${port} on SIGTERM; escalated to SIGKILL.`);
|
|
520
|
+
} catch {
|
|
521
|
+
// Already gone / can't signal — best-effort; fall through to the walk.
|
|
522
|
+
}
|
|
523
|
+
} else {
|
|
524
|
+
log(` reclaimed canonical port ${port} (pid ${holder} released it).`);
|
|
525
|
+
}
|
|
526
|
+
}
|
|
527
|
+
|
|
455
528
|
function defaultFindGlobalInstall(pkg: string): string | null {
|
|
456
529
|
for (const prefix of bunGlobalPrefixes()) {
|
|
457
530
|
const pkgJsonPath = join(prefix, ...pkg.split("/"), "package.json");
|
|
@@ -988,23 +1061,98 @@ export async function install(input: string, opts: InstallOpts = {}): Promise<nu
|
|
|
988
1061
|
// future installs no longer touch them.
|
|
989
1062
|
const preInitEntry = findService(entryName, manifestPath);
|
|
990
1063
|
const probe = opts.portProbe ?? defaultPortProbe;
|
|
991
|
-
const
|
|
1064
|
+
const pidOnPort = opts.pidOnPort ?? defaultPidOnPort;
|
|
1065
|
+
const ownerOfPid = opts.ownerOfPid ?? defaultOwnerOfPid;
|
|
992
1066
|
const canonicalPort = spec.seedEntry?.().port ?? preInitEntry?.port;
|
|
1067
|
+
|
|
1068
|
+
// #609 wipe-recovery adopt-kill: BEFORE assigning, if the canonical port is
|
|
1069
|
+
// held by an attributable prior instance of THE SAME module (the classic
|
|
1070
|
+
// `rm -rf ~/.parachute` + re-`init` case — the supervised vault child keeps
|
|
1071
|
+
// running on :1940 and the fresh install would otherwise port-walk to 1944),
|
|
1072
|
+
// reclaim the port by adopt-killing the orphan rather than walking. Reuses the
|
|
1073
|
+
// #601 `orphanAttributable` machinery in PER-MODULE mode (marker = THIS
|
|
1074
|
+
// install's installDir, the same module-specific marker the supervisor's
|
|
1075
|
+
// crash-restart path uses) so a FOREIGN / sibling-module / operator process is
|
|
1076
|
+
// NEVER killed — it falls through to the #590 warn-and-walk path below.
|
|
1077
|
+
// Detection + module-specific attribution only; the kill is gated hard.
|
|
1078
|
+
// Gate the probe on the canonical port actually being OCCUPIED — when it's
|
|
1079
|
+
// free there's nothing to reclaim, and probing pid would be wasted work (and
|
|
1080
|
+
// a false "I looked at the port" signal). `probe` is the same TCP listen probe
|
|
1081
|
+
// `collectOccupiedPorts` uses below; a services.json row on the canonical port
|
|
1082
|
+
// also counts as occupied (a prior install's lingering entry).
|
|
1083
|
+
const canonicalOccupied =
|
|
1084
|
+
canonicalPort !== undefined &&
|
|
1085
|
+
(preInitEntry?.port === canonicalPort ||
|
|
1086
|
+
(await (async () => {
|
|
1087
|
+
try {
|
|
1088
|
+
return await probe(canonicalPort);
|
|
1089
|
+
} catch {
|
|
1090
|
+
return false;
|
|
1091
|
+
}
|
|
1092
|
+
})()));
|
|
1093
|
+
if (canonicalPort !== undefined && installDir && canonicalOccupied) {
|
|
1094
|
+
const holder = pidOnPort(canonicalPort);
|
|
1095
|
+
if (holder !== undefined && holder !== process.pid) {
|
|
1096
|
+
const { attributable, cmdline } = orphanAttributable({
|
|
1097
|
+
orphan: holder,
|
|
1098
|
+
// No recorded pid to trust here — a wiped services.json carries none —
|
|
1099
|
+
// so attribution rides entirely on the per-module cmdline marker.
|
|
1100
|
+
recordedPid: undefined,
|
|
1101
|
+
short,
|
|
1102
|
+
startCmdHint: undefined,
|
|
1103
|
+
ownerOfPid,
|
|
1104
|
+
// Per-module marker = installDir (e.g. `~/.parachute/vault/`); a prior
|
|
1105
|
+
// instance of this module was launched from there, so its `ps` cmdline
|
|
1106
|
+
// carries it. NOT the broad `parachute` marker — that would let a
|
|
1107
|
+
// sibling module's orphan on this port be (wrongly) adopted.
|
|
1108
|
+
moduleMarker: installDir,
|
|
1109
|
+
});
|
|
1110
|
+
if (attributable) {
|
|
1111
|
+
log(
|
|
1112
|
+
`Canonical port ${canonicalPort} is held by an attributable prior ${short} instance (pid ${holder}${cmdline ? `, ${cmdline}` : ""}) — reclaiming it (adopt-kill) instead of walking to a fallback (#609).`,
|
|
1113
|
+
);
|
|
1114
|
+
await adoptKillOnPort({
|
|
1115
|
+
port: canonicalPort,
|
|
1116
|
+
holder,
|
|
1117
|
+
kill: opts.killPid ?? defaultKill,
|
|
1118
|
+
sleep: opts.sleep ?? defaultSleep,
|
|
1119
|
+
pidOnPort,
|
|
1120
|
+
delayMs: opts.reclaimDelayMs ?? 1500,
|
|
1121
|
+
log,
|
|
1122
|
+
});
|
|
1123
|
+
}
|
|
1124
|
+
}
|
|
1125
|
+
}
|
|
1126
|
+
|
|
1127
|
+
// Hub-as-port-authority (#53): pick the service's port now and reflect it
|
|
1128
|
+
// in services.json. Pre-hub#206 the install path also wrote `PORT=<port>`
|
|
1129
|
+
// into the service's `.env`; post-#206 (option A) services.json is the
|
|
1130
|
+
// single source of truth — services follow the 4-tier resolvePort ladder
|
|
1131
|
+
// (services.json → service config → bare PORT env → compiled-in default,
|
|
1132
|
+
// per parachute-scribe#41 / parachute-agent#146 / parachute-agent#148 /
|
|
1133
|
+
// parachute-patterns#45), so the duplicate `.env` PORT was at best dead
|
|
1134
|
+
// weight and at worst a source of drift on re-install. Existing `.env`
|
|
1135
|
+
// PORT lines on operator machines stay where they are — harmless — and
|
|
1136
|
+
// future installs no longer touch them.
|
|
1137
|
+
//
|
|
1138
|
+
// collectOccupiedPorts runs AFTER the #609 adopt-kill above so a reclaimed
|
|
1139
|
+
// canonical port is seen as free and the assignment lands on it (no walk).
|
|
1140
|
+
const occupied = await collectOccupiedPorts(manifestPath, entryName, preInitEntry?.port, probe);
|
|
993
1141
|
const portResult = assignServicePort({
|
|
994
1142
|
canonical: canonicalPort,
|
|
995
1143
|
occupied,
|
|
996
1144
|
});
|
|
997
1145
|
if (portResult.warning) {
|
|
998
1146
|
log(`⚠ ${portResult.warning}`);
|
|
999
|
-
// #590 item 2: the canonical port was held
|
|
1000
|
-
//
|
|
1001
|
-
//
|
|
1002
|
-
//
|
|
1003
|
-
//
|
|
1004
|
-
//
|
|
1147
|
+
// #590 item 2: the canonical port was held by a NON-attributable holder (the
|
|
1148
|
+
// #609 adopt-kill above already reclaimed an attributable same-module
|
|
1149
|
+
// orphan), so we walked to a fallback. Name the squatter — the supervisor
|
|
1150
|
+
// start-path does this post-#581; do it here at install-time too. Reuse the
|
|
1151
|
+
// #581 pidOnPort / ownerOfPid seams (detection only; never kill). When the
|
|
1152
|
+
// holder is a foreign pid (not one of OUR rows — which is the common case
|
|
1153
|
+
// when a stale pre-supervisor daemon is squatting), surface its pid +
|
|
1154
|
+
// command line + a hint.
|
|
1005
1155
|
if (canonicalPort !== undefined && portResult.source !== "canonical") {
|
|
1006
|
-
const pidOnPort = opts.pidOnPort ?? defaultPidOnPort;
|
|
1007
|
-
const ownerOfPid = opts.ownerOfPid ?? defaultOwnerOfPid;
|
|
1008
1156
|
const holder = pidOnPort(canonicalPort);
|
|
1009
1157
|
if (holder !== undefined) {
|
|
1010
1158
|
const cmdline = ownerOfPid(holder);
|
package/src/hub-db-liveness.ts
CHANGED
|
@@ -1,7 +1,8 @@
|
|
|
1
1
|
import type { Database } from "bun:sqlite";
|
|
2
|
+
import { statSync } from "node:fs";
|
|
2
3
|
|
|
3
4
|
/**
|
|
4
|
-
* SQLite-handle liveness + self-heal policy (#594).
|
|
5
|
+
* SQLite-handle liveness + self-heal policy (#594, #610).
|
|
5
6
|
*
|
|
6
7
|
* Field repro: an operator deleted `~/.parachute` while the hub unit was
|
|
7
8
|
* running. The process kept an fd to the now-unlinked `hub.db` inode — cached
|
|
@@ -12,12 +13,33 @@ import type { Database } from "bun:sqlite";
|
|
|
12
13
|
* /health is the worst possible failure shape — a crash-restart would have
|
|
13
14
|
* self-healed in seconds (the platform manager re-`openHubDb`s a fresh handle).
|
|
14
15
|
*
|
|
15
|
-
* The policy
|
|
16
|
-
* class, attempt ONE reopen of the handle; if reopen fails OR the error
|
|
17
|
-
* recurs immediately, log loudly and `process.exit(1)` so the platform
|
|
18
|
-
*
|
|
19
|
-
*
|
|
20
|
-
*
|
|
16
|
+
* The REACTIVE policy (#594): on a request that hits the persistent-corruption
|
|
17
|
+
* error class, attempt ONE reopen of the handle; if reopen fails OR the error
|
|
18
|
+
* recurs immediately, log loudly and `process.exit(1)` so the platform manager
|
|
19
|
+
* (launchd / systemd / container runtime) restarts with a fresh handle. We are
|
|
20
|
+
* careful to scope "fatal" to the persistent class — a transient `SQLITE_BUSY`
|
|
21
|
+
* (a momentary write lock) must NOT kill the hub.
|
|
22
|
+
*
|
|
23
|
+
* The PROACTIVE policy (#610): the reactive path above only fires on a THROWN
|
|
24
|
+
* error. But on Linux, `rm -rf ~/.parachute` under a running hub does NOT throw
|
|
25
|
+
* — the kernel keeps the unlinked `hub.db` inode alive behind the open fd, so
|
|
26
|
+
* `SELECT 1` and even writes keep succeeding against the ghost (deleted) inode
|
|
27
|
+
* indefinitely. Nothing throws ⇒ the reactive self-heal never fires ⇒ `/health`
|
|
28
|
+
* lies `db:"ok"` forever against a database that's gone from disk. The proactive
|
|
29
|
+
* check closes this gap WITHOUT relying on a thrown error: at open time we record
|
|
30
|
+
* the db file's inode (`st_dev`/`st_ino`); a low-frequency probe (and `/health`'s
|
|
31
|
+
* db check) re-`stat()`s the configured path and compares. ENOENT on the path, or
|
|
32
|
+
* an inode mismatch, means the on-disk DB the handle points at is gone / replaced
|
|
33
|
+
* ⇒ trigger the SAME reopen-or-exit machinery (here the path is gone, so reopen's
|
|
34
|
+
* verify fails and we exit, letting the platform manager restart with a fresh,
|
|
35
|
+
* on-disk handle in seconds rather than "never").
|
|
36
|
+
*
|
|
37
|
+
* SAFETY (both policies): we only ever escalate to reopen/exit on the genuine
|
|
38
|
+
* persistent signal — a thrown fatal error, or a definitively gone/replaced path.
|
|
39
|
+
* Transient conditions (SQLITE_BUSY, a momentary lock, a stat() that fails for a
|
|
40
|
+
* reason OTHER than ENOENT — e.g. EACCES, EINTR) NEVER trigger it. The exit fn is
|
|
41
|
+
* injectable so no test can kill the test process (hub#535 precedent), and the
|
|
42
|
+
* proactive timer is bounded so it can't spin.
|
|
21
43
|
*/
|
|
22
44
|
|
|
23
45
|
/**
|
|
@@ -120,12 +142,92 @@ export function probeDbLiveness(db: Database): "ok" | string {
|
|
|
120
142
|
}
|
|
121
143
|
}
|
|
122
144
|
|
|
145
|
+
/**
|
|
146
|
+
* The identity of an on-disk file — `st_dev`/`st_ino`, the only two fields that
|
|
147
|
+
* uniquely identify an inode across a delete+recreate. We snapshot this for the
|
|
148
|
+
* db path at open time so the proactive probe (#610) can tell "same file the
|
|
149
|
+
* handle points at" from "path now resolves to a DIFFERENT (or no) inode".
|
|
150
|
+
*/
|
|
151
|
+
export interface DbInode {
|
|
152
|
+
/** Device id of the filesystem holding the file (`stat().dev`). */
dev: number;
|
|
153
|
+
/** Inode number of the file on that device (`stat().ino`). */
ino: number;
|
|
154
|
+
}
|
|
155
|
+
|
|
156
|
+
/**
|
|
157
|
+
* Injectable `stat` of the db PATH (not the open handle). Production wires
|
|
158
|
+
* {@link defaultStatInode} (`fs.statSync`); tests inject a function that returns
|
|
159
|
+
* a chosen inode, `undefined` for ENOENT (path gone), or throws a non-ENOENT
|
|
160
|
+
* error (e.g. EACCES — a TRANSIENT failure that must NOT trigger self-heal).
|
|
161
|
+
*
|
|
162
|
+
* Contract: return the {@link DbInode} on success, `undefined` when the path
|
|
163
|
+
* does not exist (ENOENT — the genuine "wiped" signal), and THROW for any other
|
|
164
|
+
* error (so the caller can treat it as transient and leave the hub alone).
|
|
165
|
+
*/
|
|
166
|
+
export type StatInodeFn = (path: string) => DbInode | undefined;
|
|
167
|
+
|
|
168
|
+
/**
|
|
169
|
+
* Production `stat`: returns the path's inode, or `undefined` on ENOENT. Any
|
|
170
|
+
* other error (EACCES, EINTR, …) is re-thrown so the caller classifies it as
|
|
171
|
+
* transient — we only ever self-heal on a DEFINITIVELY-gone path.
|
|
172
|
+
*/
|
|
173
|
+
export const defaultStatInode: StatInodeFn = (path) => {
  // Stat first; translate only the "no such file" failure into `undefined`.
  let stats;
  try {
    stats = statSync(path);
  } catch (err) {
    // Read `.code` defensively: a thrown value is not guaranteed to be an object.
    const code =
      typeof err === "object" && err !== null ? (err as { code?: unknown }).code : undefined;
    // Anything other than ENOENT (EACCES, EINTR, …) propagates unchanged so the
    // caller can classify it as transient.
    if (code !== "ENOENT") throw err;
    return undefined;
  }
  // Keep only the two identity fields; the rest of the stat result is irrelevant.
  return { dev: stats.dev, ino: stats.ino };
};
|
|
184
|
+
|
|
185
|
+
/**
|
|
186
|
+
* The verdict of a proactive path-liveness check (#610), against the inode the
|
|
187
|
+
* handle was opened on:
|
|
188
|
+
* - `"ok"` → the path still resolves to the SAME inode the handle holds.
|
|
189
|
+
* - `"gone"` → the path no longer exists (ENOENT) — the state dir was wiped.
|
|
190
|
+
* - `"replaced"` → the path exists but resolves to a DIFFERENT inode — the DB
|
|
191
|
+
* file was deleted + recreated underneath the handle.
|
|
192
|
+
* - `"unknown"` → we couldn't snapshot the open inode (no baseline) so we
|
|
193
|
+
* can't compare; treated as a non-signal (never self-heals).
|
|
194
|
+
*
|
|
195
|
+
* Only `"gone"`/`"replaced"` are the genuine wipe signal that triggers self-heal.
|
|
196
|
+
*/
|
|
197
|
+
export type PathLivenessClass = "ok" | "gone" | "replaced" | "unknown";
|
|
198
|
+
|
|
199
|
+
/**
|
|
200
|
+
* Pure classifier: compare the inode the path resolves to NOW (or `undefined`
|
|
201
|
+
* for ENOENT) against the inode the open handle was created on. No I/O — the
|
|
202
|
+
* caller does the `stat()` and the open-inode snapshot; this is the decision so
|
|
203
|
+
* it's trivially unit-testable and the "never fire on transient" rule is a
|
|
204
|
+
* single, auditable function.
|
|
205
|
+
*
|
|
206
|
+
* A non-ENOENT stat error is NOT represented here — the caller (`statInode`'s
|
|
207
|
+
* contract) THROWS on it, and the probe treats a thrown stat as transient
|
|
208
|
+
* (leaves the hub alone). Only a clean ENOENT (`current === undefined`) or a
|
|
209
|
+
* clean inode mismatch reaches a self-heal verdict.
|
|
210
|
+
*/
|
|
211
|
+
export function classifyPathLiveness(args: {
  /** The inode the open db handle was created on (snapshot at open). */
  expected: DbInode | undefined;
  /** The inode the path resolves to NOW, or `undefined` for ENOENT. */
  current: DbInode | undefined;
}): PathLivenessClass {
  const baseline = args.expected;
  const observed = args.current;

  // Without a baseline there is nothing to compare against, so this is a
  // non-signal; it is checked first so it wins even when the path is missing.
  if (baseline === undefined) return "unknown";

  // Baseline known but the path no longer resolves: the file was removed.
  if (observed === undefined) return "gone";

  // Both known: identity is the (device, inode) pair.
  const sameFile = observed.dev === baseline.dev && observed.ino === baseline.ino;
  return sameFile ? "ok" : "replaced";
}
|
|
224
|
+
|
|
123
225
|
/**
|
|
124
226
|
* A mutable holder for the hub's `Database` handle so a request handler that
|
|
125
227
|
* hits the fatal error class can swap in a freshly-reopened handle without
|
|
126
228
|
* re-threading the closure-captured `db` through every call site. `getDb()`
|
|
127
|
-
* in hub-server reads `holder.get()`; the self-heal path calls
|
|
128
|
-
* `holder.healOrExit(err)`.
|
|
229
|
+
* in hub-server reads `holder.get()`; the reactive self-heal path calls
|
|
230
|
+
* `holder.healOrExit(err)`; the proactive (#610) path calls `holder.probePath()`.
|
|
129
231
|
*/
|
|
130
232
|
export interface DbHolder {
|
|
131
233
|
/** The current live handle. */
|
|
@@ -145,6 +247,20 @@ export interface DbHolder {
|
|
|
145
247
|
* gone), we exit rather than loop — the platform manager owns the restart.
|
|
146
248
|
*/
|
|
147
249
|
healOrExit(err: unknown): "ignored" | "healed" | "exited";
|
|
250
|
+
/**
|
|
251
|
+
* PROACTIVE path-liveness probe (#610). `stat()`s the configured db PATH and
|
|
252
|
+
* compares its inode to the one the open handle was created on. On a genuine
|
|
253
|
+
* wipe signal (`"gone"`/`"replaced"`) it triggers the SAME reopen-or-exit
|
|
254
|
+
* machinery as `healOrExit` (here the path is gone, so reopen's verify fails
|
|
255
|
+
* → exit → platform manager restarts with a fresh on-disk handle). On `"ok"`,
|
|
256
|
+
* `"unknown"`, or a thrown (transient) stat it does NOTHING (a thrown stat is reported as `"ok"`).
|
|
257
|
+
*
|
|
258
|
+
* Returns the {@link PathLivenessClass} verdict so `/health` and tests can see
|
|
259
|
+
* what was observed; the `"healed"`/`"exited"` side effects mirror `healOrExit`.
|
|
260
|
+
* Wired into the bounded liveness timer in hub-server AND into `/health`'s db
|
|
261
|
+
* check, so monitoring + the #591 adoption probe see the fault instead of a lie.
|
|
262
|
+
*/
|
|
263
|
+
probePath(): PathLivenessClass;
|
|
148
264
|
}
|
|
149
265
|
|
|
150
266
|
export interface DbHolderDeps {
|
|
@@ -156,6 +272,20 @@ export interface DbHolderDeps {
|
|
|
156
272
|
exit?: (code: number) => void;
|
|
157
273
|
/** Close a (presumed-dead) handle best-effort before swapping (default `db.close()`). */
|
|
158
274
|
closeOld?: (db: Database) => void;
|
|
275
|
+
/**
|
|
276
|
+
* The on-disk db PATH the proactive probe (#610) stat()s. When omitted,
|
|
277
|
+
* `probePath()` is a no-op (`"unknown"`) — backwards-compatible for the
|
|
278
|
+
* reactive-only callers + tests that don't exercise the proactive path.
|
|
279
|
+
*/
|
|
280
|
+
dbPath?: string;
|
|
281
|
+
/** Injectable path stat for the proactive probe (default {@link defaultStatInode}). */
|
|
282
|
+
statInode?: StatInodeFn;
|
|
283
|
+
/**
|
|
284
|
+
* The inode the INITIAL handle was opened on. Production passes the snapshot
|
|
285
|
+
* taken right after `openHubDb`; when omitted (or when the snapshot itself
|
|
286
|
+
* failed), `probePath()` returns `"unknown"` and never self-heals.
|
|
287
|
+
*/
|
|
288
|
+
initialInode?: DbInode | undefined;
|
|
159
289
|
}
|
|
160
290
|
|
|
161
291
|
/**
|
|
@@ -166,8 +296,13 @@ export interface DbHolderDeps {
|
|
|
166
296
|
*/
|
|
167
297
|
export function createDbHolder(initial: Database, deps: DbHolderDeps): DbHolder {
|
|
168
298
|
let current = initial;
|
|
299
|
+
// The inode the CURRENT handle is bound to. Updated on every successful
|
|
300
|
+
// reopen so the proactive probe (#610) compares against the live handle, not
|
|
301
|
+
// a one-time snapshot that would go stale after a heal.
|
|
302
|
+
let currentInode: DbInode | undefined = deps.initialInode;
|
|
169
303
|
const log = deps.log ?? ((line) => console.error(line));
|
|
170
304
|
const exit = deps.exit ?? ((code) => process.exit(code));
|
|
305
|
+
const statInode = deps.statInode ?? defaultStatInode;
|
|
171
306
|
const closeOld =
|
|
172
307
|
deps.closeOld ??
|
|
173
308
|
((db) => {
|
|
@@ -178,34 +313,159 @@ export function createDbHolder(initial: Database, deps: DbHolderDeps): DbHolder
|
|
|
178
313
|
}
|
|
179
314
|
});
|
|
180
315
|
|
|
316
|
+
/**
|
|
317
|
+
* Shared reopen-once-or-exit core for BOTH the reactive (`healOrExit`) and
|
|
318
|
+
* proactive (`probePath`) self-heal paths. `reason` is the loud-log preamble
|
|
319
|
+
* describing what triggered it. Returns `"healed"` (fresh handle swapped in +
|
|
320
|
+
* verified) or `"exited"` (reopen failed / new handle dead → exit, which only
|
|
321
|
+
* returns in tests where `exit` is a non-killing spy).
|
|
322
|
+
*/
|
|
323
|
+
const reopenOrExit = (reason: string): "healed" | "exited" => {
  log(`parachute hub: ${reason}. Attempting one DB handle reopen…`);

  // Single failure exit shared by "reopen threw" and "fresh handle failed its
  // verify": log loudly, then exit. `exit` only returns when it is an injected
  // non-killing function, hence the explicit "exited" result.
  const giveUp = (cause: unknown): "exited" => {
    const rd = cause instanceof Error ? cause.message : String(cause);
    log(
      `parachute hub: DB reopen failed (${rd}); exiting so the platform manager restarts the hub with a fresh handle.`,
    );
    exit(1);
    return "exited";
  };

  let reopened: Database;
  try {
    reopened = deps.reopen();
  } catch (reopenErr) {
    return giveUp(reopenErr);
  }

  try {
    // Confirm the fresh handle is actually live before trusting it.
    reopened.query("SELECT 1").get();
  } catch (verifyErr) {
    // The handle opened but is unusable. Release it best-effort so it is not
    // leaked when `exit` does not terminate the process; a close failure
    // must not mask the verify error we are about to report.
    try {
      reopened.close();
    } catch {
      // Best-effort only.
    }
    return giveUp(verifyErr);
  }

  // Reopen succeeded + verified. Swap it in; the old handle is dead.
  closeOld(current);
  current = reopened;

  // Re-snapshot the inode of the path the fresh handle now points at, so the
  // proactive probe tracks the NEW file (best-effort — a failed snapshot
  // leaves `currentInode` undefined → probe returns "unknown", never fires).
  if (deps.dbPath !== undefined) {
    try {
      currentInode = statInode(deps.dbPath);
    } catch {
      currentInode = undefined;
    }
  }

  log("parachute hub: DB handle reopened successfully; continuing.");
  return "healed";
};
|
|
356
|
+
|
|
181
357
|
return {
|
|
182
358
|
get: () => current,
|
|
183
359
|
healOrExit(err: unknown) {
|
|
184
360
|
const klass = classifyDbError(err);
|
|
185
361
|
if (klass !== "fatal") return "ignored";
|
|
186
|
-
|
|
187
362
|
const detail = err instanceof Error ? err.message : String(err);
|
|
188
|
-
|
|
363
|
+
return reopenOrExit(`persistent SQLite failure (${detail})`);
|
|
364
|
+
},
|
|
365
|
+
probePath(): PathLivenessClass {
|
|
366
|
+
// No path configured → proactive probe disabled (reactive-only callers).
|
|
367
|
+
if (deps.dbPath === undefined) return "unknown";
|
|
189
368
|
|
|
190
|
-
|
|
369
|
+
// `pathInode` (NOT `current`) — the inode the db PATH resolves to right
|
|
370
|
+
// now. Named distinctly from the outer `current` (the live Database
|
|
371
|
+
// handle) so a reader can't misread this as the DB handle.
|
|
372
|
+
let pathInode: DbInode | undefined;
|
|
191
373
|
try {
|
|
192
|
-
|
|
193
|
-
|
|
194
|
-
|
|
195
|
-
|
|
196
|
-
|
|
197
|
-
|
|
198
|
-
|
|
199
|
-
);
|
|
200
|
-
exit(1);
|
|
201
|
-
return "exited";
|
|
374
|
+
pathInode = statInode(deps.dbPath);
|
|
375
|
+
} catch {
|
|
376
|
+
// A non-ENOENT stat failure (EACCES, EINTR, a transient FS hiccup) is
|
|
377
|
+
// explicitly NOT a wipe signal. Leave the hub alone — the next probe
|
|
378
|
+
// re-reads. This is the "never fire on transient" guard for the
|
|
379
|
+
// proactive path; only a clean ENOENT/mismatch below self-heals.
|
|
380
|
+
return "ok";
|
|
202
381
|
}
|
|
203
382
|
|
|
204
|
-
|
|
205
|
-
|
|
206
|
-
|
|
207
|
-
|
|
208
|
-
|
|
383
|
+
const verdict = classifyPathLiveness({ expected: currentInode, current: pathInode });
|
|
384
|
+
if (verdict === "ok" || verdict === "unknown") return verdict;
|
|
385
|
+
|
|
386
|
+
// Genuine wipe signal: the on-disk DB the handle points at is gone
|
|
387
|
+
// ("gone") or was replaced underneath us ("replaced"). Trigger the SAME
|
|
388
|
+
// reopen-or-exit machinery. When the path is gone, reopen's SELECT-1
|
|
389
|
+
// verify fails → exit → platform manager restarts with a fresh on-disk
|
|
390
|
+
// handle (seconds, not "never"). When replaced, we adopt the fresh inode.
|
|
391
|
+
//
|
|
392
|
+
// ONE-TICK /health ANOMALY (intentional): on a "replaced" verdict the
|
|
393
|
+
// reopenOrExit below heals SYNCHRONOUSLY, but we still RETURN "replaced"
|
|
394
|
+
// for this one call — so the /health request that drove this probe reports
|
|
395
|
+
// `db:"error: path-replaced"` even though the handle is now healthy; the
|
|
396
|
+
// very next request reads `ok`. We don't mask it (returning "ok" here would
|
|
397
|
+
// hide that a heal just happened, which is exactly what monitoring wants to
|
|
398
|
+
// see). It's safe because #591's adoption probe checks only HTTP 200
|
|
399
|
+
// (`res.ok`), not the specific `db` string, so a single transient error
|
|
400
|
+
// string can't cascade.
|
|
401
|
+
reopenOrExit(
|
|
402
|
+
verdict === "gone"
|
|
403
|
+
? `db path ${deps.dbPath} no longer exists (state dir wiped under a running hub, #610)`
|
|
404
|
+
: `db path ${deps.dbPath} now resolves to a different inode (DB file replaced underneath the open handle, #610)`,
|
|
405
|
+
);
|
|
406
|
+
return verdict;
|
|
407
|
+
},
|
|
408
|
+
};
|
|
409
|
+
}
|
|
410
|
+
|
|
411
|
+
/** Handle to stop a running proactive-liveness timer (test cleanup + shutdown). */
|
|
412
|
+
export interface DbLivenessTimer {
|
|
413
|
+
stop(): void;
|
|
414
|
+
}
|
|
415
|
+
|
|
416
|
+
export interface DbLivenessTimerDeps<H = unknown> {
|
|
417
|
+
/** Poll cadence in ms. Default 15_000 (low-frequency — this is a safety net,
|
|
418
|
+
* not a hot path; the cost is one `stat()` syscall per tick). */
|
|
419
|
+
intervalMs?: number;
|
|
420
|
+
/** Injectable scheduler (default `setInterval`). Tests drive ticks manually. */
|
|
421
|
+
setIntervalFn?: (cb: () => void, ms: number) => H;
|
|
422
|
+
/** Injectable clear (default `clearInterval`). */
|
|
423
|
+
clearIntervalFn?: (handle: H) => void;
|
|
424
|
+
/** Loud log sink for an unexpected probe throw (default `console.error`). */
|
|
425
|
+
log?: (line: string) => void;
|
|
426
|
+
}
|
|
427
|
+
|
|
428
|
+
/**
|
|
429
|
+
* Start the bounded, low-frequency PROACTIVE liveness timer (#610). Each tick
|
|
430
|
+
* calls `holder.probePath()` — which self-heals (reopen-or-exit) on a genuine
|
|
431
|
+
* wipe and no-ops otherwise. The cadence is fixed (default 15s) so it can NEVER
|
|
432
|
+
* spin: a tick does exactly one `stat()` then sleeps the full interval; even if
|
|
433
|
+
* the probe self-heals + exits, that's terminal. We swallow any unexpected
|
|
434
|
+
* probe throw (logged) rather than let an interval callback crash the process —
|
|
435
|
+
* the probe is a safety net, not a load-bearing request path.
|
|
436
|
+
*
|
|
437
|
+
* `unref()` is called so this timer never keeps the event loop alive on its own
|
|
438
|
+
* (it's purely a watchdog over the already-running server).
|
|
439
|
+
*/
|
|
440
|
+
export function startDbPathLivenessTimer<H = ReturnType<typeof setInterval>>(
  holder: DbHolder,
  deps: DbLivenessTimerDeps<H> = {},
): DbLivenessTimer {
  const log = deps.log ?? ((line) => console.error(line));

  // The cadence must be a positive, finite number of milliseconds. Timers
  // treat 0 / negative / NaN delays as "as soon as possible", which would turn
  // this low-frequency watchdog into a busy loop — so reject such values and
  // fall back to the default instead of trusting the caller.
  const fallbackIntervalMs = 15_000;
  const requestedMs = deps.intervalMs ?? fallbackIntervalMs;
  const intervalMs =
    Number.isFinite(requestedMs) && requestedMs > 0 ? requestedMs : fallbackIntervalMs;
  if (intervalMs !== requestedMs) {
    log(
      `parachute hub: invalid DB-liveness interval (${String(requestedMs)}); using ${fallbackIntervalMs}ms.`,
    );
  }

  const setIntervalFn =
    deps.setIntervalFn ?? ((cb: () => void, ms: number) => setInterval(cb, ms) as unknown as H);
  const clearIntervalFn =
    deps.clearIntervalFn ??
    ((h: H) => clearInterval(h as unknown as ReturnType<typeof setInterval>));

  const handle = setIntervalFn(() => {
    try {
      holder.probePath();
    } catch (err) {
      // A probe is not expected to throw: the stat function re-throws
      // non-ENOENT errors, but the holder's `probePath()` is expected to catch
      // those and treat them as transient. If something still escapes, don't
      // take the process down from inside a timer callback — log and let the
      // next tick retry.
      const detail = err instanceof Error ? err.message : String(err);
      log(`parachute hub: proactive DB-liveness probe threw unexpectedly (${detail}); ignoring.`);
    }
  }, intervalMs);

  // Don't let the watchdog alone keep the process alive. Injected schedulers
  // may return handles without `unref`, hence the optional call.
  (handle as { unref?: () => void }).unref?.();

  return {
    stop() {
      clearIntervalFn(handle);
    },
  };
}
|
package/src/hub-server.ts
CHANGED
|
@@ -184,7 +184,13 @@ import { applyCorsHeaders, corsPreflightResponse, isCorsAllowedRoute } from "./c
|
|
|
184
184
|
import { ensureCsrfToken } from "./csrf.ts";
|
|
185
185
|
import { readExposeState } from "./expose-state.ts";
|
|
186
186
|
import { HUB_DEFAULT_PORT, HUB_SVC, clearHubPort, writeHubPort } from "./hub-control.ts";
|
|
187
|
-
import {
|
|
187
|
+
import {
|
|
188
|
+
classifyDbError,
|
|
189
|
+
createDbHolder,
|
|
190
|
+
defaultStatInode,
|
|
191
|
+
probeDbLiveness,
|
|
192
|
+
startDbPathLivenessTimer,
|
|
193
|
+
} from "./hub-db-liveness.ts";
|
|
188
194
|
import { hubDbPath, openHubDb } from "./hub-db.ts";
|
|
189
195
|
import { getHubOrigin } from "./hub-settings.ts";
|
|
190
196
|
import { type RenderHubOpts, renderHub } from "./hub.ts";
|
|
@@ -842,6 +848,17 @@ export interface HubFetchDeps {
|
|
|
842
848
|
* the response. Absent in tests that don't exercise the DB-error path.
|
|
843
849
|
*/
|
|
844
850
|
onDbError?: (err: unknown) => "ignored" | "healed" | "exited";
|
|
851
|
+
/**
|
|
852
|
+
* PROACTIVE db-path liveness probe (#610). Production wires the
|
|
853
|
+
* {@link DbHolder}'s `probePath` so the `/health` db check `stat()`s the
|
|
854
|
+
* configured db path and compares its inode to the open handle's — catching
|
|
855
|
+
* the "operator wiped `~/.parachute` under a running hub" case that NEVER
|
|
856
|
+
* throws on Linux (the unlinked-but-open ghost inode keeps `SELECT 1`
|
|
857
|
+
* succeeding). Returns the path-liveness verdict; on a genuine wipe it ALSO
|
|
858
|
+
* triggers the reopen-or-exit self-heal. Absent in tests that don't exercise
|
|
859
|
+
* the proactive path — `/health` then falls back to the `SELECT 1` probe only.
|
|
860
|
+
*/
|
|
861
|
+
probeDbPath?: () => "ok" | "gone" | "replaced" | "unknown";
|
|
845
862
|
/**
|
|
846
863
|
* Hub origin used as the OAuth `iss` claim and to build the authorization-
|
|
847
864
|
* server metadata document. When omitted, OAuth endpoints fall back to the
|
|
@@ -1605,7 +1622,26 @@ export function hubFetch(
|
|
|
1605
1622
|
let db: "ok" | string = "unconfigured";
|
|
1606
1623
|
if (getDb) {
|
|
1607
1624
|
try {
|
|
1608
|
-
|
|
1625
|
+
// PROACTIVE path check FIRST (#610): on Linux a wiped state dir
|
|
1626
|
+
// doesn't throw — the unlinked-but-open ghost inode keeps SELECT 1
|
|
1627
|
+
// succeeding, so `probeDbLiveness` alone would report `db:"ok"` on a
|
|
1628
|
+
// database that's gone from disk (the /health lie the issue calls
|
|
1629
|
+
// out). `probeDbPath` stat()s the path + compares inodes; on a
|
|
1630
|
+
// gone/replaced verdict it ALSO self-heals (reopen-or-exit) and we
|
|
1631
|
+
// surface the fault so the #591 adoption probe + monitoring see it.
|
|
1632
|
+
const pathVerdict = deps?.probeDbPath?.();
|
|
1633
|
+
if (pathVerdict === "gone" || pathVerdict === "replaced") {
|
|
1634
|
+
// One-request anomaly on "replaced": probeDbPath already healed the
|
|
1635
|
+
// handle synchronously, but THIS request still reports the fault
|
|
1636
|
+
// (the next /health reads `db:"ok"`). Intentional — we surface that
|
|
1637
|
+
// a heal just occurred rather than masking it. Safe because #591's
|
|
1638
|
+
// adoption probe gates on HTTP 200 (`res.ok`), not the `db` string,
|
|
1639
|
+
// so a single transient error string can't cascade. ("gone" exits
|
|
1640
|
+
// the process, usually before this response is even sent.)
|
|
1641
|
+
db = `error: path-${pathVerdict}`;
|
|
1642
|
+
} else {
|
|
1643
|
+
db = probeDbLiveness(getDb());
|
|
1644
|
+
}
|
|
1609
1645
|
} catch {
|
|
1610
1646
|
// getDb() itself threw (e.g. openHubDb failed) — report it as an
|
|
1611
1647
|
// error class without letting /health 500.
|
|
@@ -2769,12 +2805,36 @@ if (import.meta.main) {
|
|
|
2769
2805
|
// touch the DB still works before first open. Once opened, the holder owns
|
|
2770
2806
|
// reopen-once-or-exit on a persistent SQLite fault.
|
|
2771
2807
|
let holder: ReturnType<typeof createDbHolder> | undefined;
|
|
2772
|
-
|
|
2808
|
+
let livenessTimer: ReturnType<typeof startDbPathLivenessTimer> | undefined;
|
|
2809
|
+
const ensureHolder = (): ReturnType<typeof createDbHolder> => {
|
|
2773
2810
|
if (!holder) {
|
|
2774
|
-
|
|
2811
|
+
const db = openHubDb(dbPath);
|
|
2812
|
+
// Snapshot the inode the handle is bound to NOW, so the proactive probe
|
|
2813
|
+
// (#610) can later notice the path has gone / been replaced. Best-effort
|
|
2814
|
+
// — a failed snapshot leaves the proactive probe at "unknown" (it never
|
|
2815
|
+
// self-heals without a baseline), while the reactive path still covers
|
|
2816
|
+
// thrown faults.
|
|
2817
|
+
let initialInode: ReturnType<typeof defaultStatInode> | undefined;
|
|
2818
|
+
try {
|
|
2819
|
+
initialInode = defaultStatInode(dbPath);
|
|
2820
|
+
} catch {
|
|
2821
|
+
initialInode = undefined;
|
|
2822
|
+
}
|
|
2823
|
+
holder = createDbHolder(db, {
|
|
2824
|
+
reopen: () => openHubDb(dbPath),
|
|
2825
|
+
dbPath,
|
|
2826
|
+
statInode: defaultStatInode,
|
|
2827
|
+
initialInode,
|
|
2828
|
+
});
|
|
2829
|
+
// Start the bounded proactive-liveness watchdog (#610) once the handle is
|
|
2830
|
+
// open. It stat()s the db path on a low-frequency timer and self-heals
|
|
2831
|
+
// (reopen-or-exit) the moment the on-disk DB is wiped — closing the
|
|
2832
|
+
// ghost-fd gap the reactive path can't see (no thrown error on Linux).
|
|
2833
|
+
livenessTimer = startDbPathLivenessTimer(holder);
|
|
2775
2834
|
}
|
|
2776
|
-
return holder
|
|
2835
|
+
return holder;
|
|
2777
2836
|
};
|
|
2837
|
+
const getDb = () => ensureHolder().get();
|
|
2778
2838
|
const onDbError = (err: unknown): "ignored" | "healed" | "exited" =>
|
|
2779
2839
|
holder ? holder.healOrExit(err) : "ignored";
|
|
2780
2840
|
Bun.serve({
|
|
@@ -2792,7 +2852,13 @@ if (import.meta.main) {
|
|
|
2792
2852
|
// Bun's equivalent is this. 255s comfortably exceeds Render's edge
|
|
2793
2853
|
// pool TTL (community-observed ~120s). Closes hub#399.
|
|
2794
2854
|
idleTimeout: 255,
|
|
2795
|
-
fetch: hubFetch(wellKnownDir, {
|
|
2855
|
+
fetch: hubFetch(wellKnownDir, {
|
|
2856
|
+
getDb,
|
|
2857
|
+
onDbError,
|
|
2858
|
+
probeDbPath: () => holder?.probePath() ?? "unknown",
|
|
2859
|
+
issuer,
|
|
2860
|
+
loopbackPort: port,
|
|
2861
|
+
}),
|
|
2796
2862
|
});
|
|
2797
2863
|
// Register PID + port from the running hub itself so any startup path
|
|
2798
2864
|
// (spawn-via-`ensureHubRunning` or a direct `bun src/hub-server.ts` from
|
|
@@ -2802,6 +2868,7 @@ if (import.meta.main) {
|
|
|
2802
2868
|
writePid(HUB_SVC, process.pid);
|
|
2803
2869
|
writeHubPort(port);
|
|
2804
2870
|
const cleanup = () => {
|
|
2871
|
+
livenessTimer?.stop();
|
|
2805
2872
|
clearPid(HUB_SVC);
|
|
2806
2873
|
clearHubPort();
|
|
2807
2874
|
};
|
package/src/setup-wizard.ts
CHANGED
|
@@ -75,6 +75,7 @@ import {
|
|
|
75
75
|
readOperatorTokenFile,
|
|
76
76
|
} from "./operator-token.ts";
|
|
77
77
|
import { isHttpsRequest } from "./request-protocol.ts";
|
|
78
|
+
import { SEED_VERSION } from "./service-spec.ts";
|
|
78
79
|
import { findService, readManifestLenient } from "./services-manifest.ts";
|
|
79
80
|
import {
|
|
80
81
|
SESSION_TTL_MS,
|
|
@@ -274,13 +275,29 @@ export function deriveWizardState(deps: {
|
|
|
274
275
|
// which maps to `parachute-vault` in services.json.
|
|
275
276
|
const vaultSpec = specFor(FIRST_VAULT_SHORT);
|
|
276
277
|
const vaultEntry = findService(vaultSpec.manifestName, deps.manifestPath);
|
|
278
|
+
// hub#607: distinguish the SEED placeholder from a real vault instance.
|
|
279
|
+
// `parachute init` installs the vault MODULE without creating an instance
|
|
280
|
+
// (hub#168 Cut 1: `noCreate`), seeding a services.json entry at
|
|
281
|
+
// SEED_VERSION ("0.0.0-linked") with the canonical `/vault/default` mount.
|
|
282
|
+
// Vault's own first-boot overwrites that entry with the real instance once
|
|
283
|
+
// a vault is actually created. A bare `findService(...) !== undefined`
|
|
284
|
+
// check matches the placeholder, so on EVERY init'd box the wizard treated
|
|
285
|
+
// the vault step as already-done and skipped straight to expose — the
|
|
286
|
+
// operator finished setup with no vault and no prompt. Treat a
|
|
287
|
+
// SEED_VERSION row as "module installed, no instance" so the wizard still
|
|
288
|
+
// presents its create / import / skip step. This is the SAME
|
|
289
|
+
// discrimination `buildWellKnown` gained in hub#577 (it suppresses the
|
|
290
|
+
// phantom `vaults[]` row at SEED_VERSION); both surfaces must agree that a
|
|
291
|
+
// placeholder is not a real vault.
|
|
292
|
+
const vaultIsPlaceholder = vaultEntry !== undefined && vaultEntry.version === SEED_VERSION;
|
|
293
|
+
const hasRealVault = vaultEntry !== undefined && !vaultIsPlaceholder;
|
|
277
294
|
// hub#168 Cut 2: `setup_vault_skipped === "true"` advances the wizard
|
|
278
295
|
// past the vault step even when no vault row exists. The operator
|
|
279
296
|
// explicitly chose Skip; the module is installed (Cut 1) but no
|
|
280
297
|
// instance was provisioned. Treat as "vault step is done" for the
|
|
281
298
|
// purposes of state-derivation so the wizard moves to expose.
|
|
282
299
|
const vaultSkipped = getSetting(deps.db, "setup_vault_skipped") === "true";
|
|
283
|
-
const hasVault =
|
|
300
|
+
const hasVault = hasRealVault || vaultSkipped;
|
|
284
301
|
// Expose-mode is the operator's "how will this hub be reached?" answer
|
|
285
302
|
// (hub#268 Item 2). Stored as a hub_setting; the wizard's expose step
|
|
286
303
|
// sets it; absence means we should still ask. EXCEPT — if we're
|