@openparachute/hub 0.6.4 → 0.6.5-rc.2
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/package.json +1 -1
- package/src/__tests__/cloudflare-tunnel.test.ts +78 -0
- package/src/__tests__/expose-cloudflare.test.ts +253 -0
- package/src/__tests__/hub-db-liveness.test.ts +139 -0
- package/src/__tests__/hub-server.test.ts +145 -6
- package/src/__tests__/hub-unit.test.ts +110 -1
- package/src/__tests__/oauth-handlers.test.ts +457 -0
- package/src/__tests__/oauth-ui.test.ts +27 -0
- package/src/cloudflare/tunnel.ts +70 -0
- package/src/commands/expose-cloudflare.ts +157 -2
- package/src/commands/serve.ts +14 -4
- package/src/hub-db-liveness.ts +211 -0
- package/src/hub-server.ts +1175 -1104
- package/src/hub-unit.ts +74 -27
- package/src/oauth-handlers.ts +69 -25
- package/src/oauth-ui.ts +28 -2
|
@@ -1,5 +1,5 @@
|
|
|
1
1
|
import { spawnSync } from "node:child_process";
|
|
2
|
-
import { mkdirSync, openSync } from "node:fs";
|
|
2
|
+
import { existsSync, mkdirSync, openSync } from "node:fs";
|
|
3
3
|
import { dirname } from "node:path";
|
|
4
4
|
import {
|
|
5
5
|
DEFAULT_TUNNEL_NAME,
|
|
@@ -37,8 +37,10 @@ import {
|
|
|
37
37
|
type Tunnel,
|
|
38
38
|
createTunnel,
|
|
39
39
|
credentialsPath,
|
|
40
|
+
deleteTunnel,
|
|
40
41
|
findTunnelByName,
|
|
41
42
|
routeDns,
|
|
43
|
+
tunnelConnectionCount,
|
|
42
44
|
} from "../cloudflare/tunnel.ts";
|
|
43
45
|
import { CONFIG_DIR, SERVICES_MANIFEST_PATH } from "../config.ts";
|
|
44
46
|
import {
|
|
@@ -267,6 +269,48 @@ export function looksLikeCloudflare(addresses: readonly string[]): boolean {
|
|
|
267
269
|
return false;
|
|
268
270
|
}
|
|
269
271
|
|
|
272
|
+
/**
 * Contract for the post-spawn connection check (#593): wait, for at most
 * `timeoutMs`, until the spawned connector holds a live edge connection.
 *
 * Resolves `true` as soon as `cloudflared tunnel info` reports at least one
 * connector connection, and `false` when the budget runs out without one.
 * A live pid is deliberately NOT treated as evidence of a connection: in the
 * field repro the pid was alive while the connector crash-looped on a missing
 * creds file and every request returned Cloudflare error 1033. This check is
 * what turns that silent false-success into an actionable failure.
 *
 * The function is injectable so tests can drive both outcomes
 * deterministically without a real cloudflared. Production uses
 * `defaultVerifyConnection` (a bounded `tunnelConnectionCount` poll).
 */
export type VerifyConnectionFn = (args: {
  /** Runner handed to the connection probe. */
  runner: Runner;
  /** Name of the tunnel whose connector is being checked. */
  tunnelName: string;
  /** Total verification budget, in milliseconds. */
  timeoutMs: number;
  /** Delay requested between consecutive probes, in milliseconds. */
  pollMs: number;
  /** Awaitable delay used between probes (tests pass a no-op). */
  sleep: (ms: number) => Promise<void>;
}) => Promise<boolean>;
|
|
292
|
+
|
|
293
|
+
/**
 * Production connection verifier: probe `tunnelConnectionCount` right away,
 * then keep re-probing every `pollMs` until a connector shows up or the
 * `timeoutMs` budget is spent.
 *
 * Resolves `true` on the first probe that reports more than zero
 * connections, `false` once the deadline has passed without one.
 */
export const defaultVerifyConnection: VerifyConnectionFn = async (args) => {
  const { runner, tunnelName, timeoutMs, pollMs, sleep } = args;
  const giveUpAt = Date.now() + timeoutMs;

  // First probe happens before any sleep. `tunnelConnectionCount` swallows its
  // own CLI/parse errors and reports 0, so a connector that isn't ready yet
  // simply costs another round. Worst case is roughly
  // ceil(timeoutMs / pollMs) rounds (one `cloudflared tunnel info` call plus
  // one sleep each) — with the production defaults (25_000 / 1_000) that is
  // ~25 probes over ~25s.
  let liveConnections = await tunnelConnectionCount(runner, tunnelName);
  // `!(x > 0)` rather than `x <= 0` so a non-numeric count keeps polling
  // instead of ending the loop early.
  while (!(liveConnections > 0) && Date.now() < giveUpAt) {
    await sleep(pollMs);
    liveConnections = await tunnelConnectionCount(runner, tunnelName);
  }
  return liveConnections > 0;
};
|
|
313
|
+
|
|
270
314
|
export interface ExposeCloudflareOpts {
|
|
271
315
|
runner?: Runner;
|
|
272
316
|
spawner?: CloudflaredSpawner;
|
|
@@ -307,6 +351,23 @@ export interface ExposeCloudflareOpts {
|
|
|
307
351
|
* Tests inject a stub; production uses `defaultResolveHost` (Bun DNS).
|
|
308
352
|
*/
|
|
309
353
|
resolveHost?: ResolveHostFn;
|
|
354
|
+
/**
|
|
355
|
+
* Verify the spawned connector actually established an edge connection
|
|
356
|
+
* before claiming "✓ Cloudflare tunnel up" (#593). Production polls
|
|
357
|
+
* `cloudflared tunnel info` for a live connector (bounded). Tests inject a
|
|
358
|
+
* stub to drive the success / timeout branches without a real cloudflared.
|
|
359
|
+
* Returns true once at least one connection is live, false on timeout.
|
|
360
|
+
* Default policy mirrors `connectorPids`/`resolveHost`: when a test injects a
|
|
361
|
+
* stub `spawner` (and no explicit seam), default to an inert "connected"
|
|
362
|
+
* stub so existing stub-spawner suites don't have to model the probe.
|
|
363
|
+
*/
|
|
364
|
+
verifyConnection?: VerifyConnectionFn;
|
|
365
|
+
/** Connection-verify budget in ms (default 25_000). */
|
|
366
|
+
verifyTimeoutMs?: number;
|
|
367
|
+
/** Poll interval for the connection-verify probe in ms (default 1_000). */
|
|
368
|
+
verifyPollMs?: number;
|
|
369
|
+
/** Sleep between connection-verify polls. Tests pin to a no-op. */
|
|
370
|
+
sleep?: (ms: number) => Promise<void>;
|
|
310
371
|
log?: (line: string) => void;
|
|
311
372
|
manifestPath?: string;
|
|
312
373
|
statePath?: string;
|
|
@@ -402,6 +463,10 @@ interface Resolved {
|
|
|
402
463
|
}) => InstallResult;
|
|
403
464
|
removeService: (args: { tunnelName: string }) => RemoveResult;
|
|
404
465
|
resolveHost: ResolveHostFn;
|
|
466
|
+
verifyConnection: VerifyConnectionFn;
|
|
467
|
+
verifyTimeoutMs: number;
|
|
468
|
+
verifyPollMs: number;
|
|
469
|
+
sleep: (ms: number) => Promise<void>;
|
|
405
470
|
log: (line: string) => void;
|
|
406
471
|
manifestPath: string;
|
|
407
472
|
statePath: string;
|
|
@@ -488,6 +553,17 @@ function resolve(opts: ExposeCloudflareOpts, tunnelNameDefault: string): Resolve
|
|
|
488
553
|
resolveHost:
|
|
489
554
|
opts.resolveHost ??
|
|
490
555
|
(opts.spawner === undefined ? defaultResolveHost : async () => ["104.16.0.1"]),
|
|
556
|
+
// Connection-verify seam (#593). Same defaulting policy as
|
|
557
|
+
// `connectorPids`/`resolveHost`: when a test injects a stub `spawner` (and
|
|
558
|
+
// no explicit seam), default to an inert "connected" stub so existing
|
|
559
|
+
// stub-spawner suites don't have to model the `tunnel info` probe.
|
|
560
|
+
// Production (no spawner override) gets the real bounded poll.
|
|
561
|
+
verifyConnection:
|
|
562
|
+
opts.verifyConnection ??
|
|
563
|
+
(opts.spawner === undefined ? defaultVerifyConnection : async () => true),
|
|
564
|
+
verifyTimeoutMs: opts.verifyTimeoutMs ?? 25_000,
|
|
565
|
+
verifyPollMs: opts.verifyPollMs ?? 1_000,
|
|
566
|
+
sleep: opts.sleep ?? ((ms) => new Promise((resolve) => setTimeout(resolve, ms))),
|
|
491
567
|
log: opts.log ?? ((line) => console.log(line)),
|
|
492
568
|
manifestPath: opts.manifestPath ?? SERVICES_MANIFEST_PATH,
|
|
493
569
|
statePath: opts.statePath ?? CLOUDFLARED_STATE_PATH,
|
|
@@ -694,7 +770,50 @@ export async function exposeCloudflareUp(
|
|
|
694
770
|
" Each machine gets its own dedicated tunnel — you don't need to run `cloudflared tunnel create` separately; expose does it.",
|
|
695
771
|
);
|
|
696
772
|
} else {
|
|
697
|
-
|
|
773
|
+
// Reuse-path credentials verification + self-heal (#593). `findTunnelByName`
|
|
774
|
+
// only proves the tunnel exists ACCOUNT-side. The connector needs the LOCAL
|
|
775
|
+
// credentials file (`~/.cloudflared/<uuid>.json`, written at `tunnel create`
|
|
776
|
+
// time) to authenticate — and that file gets lost on clean-slate flows
|
|
777
|
+
// (`rm -rf ~/.parachute` and friends) while the account-side tunnel
|
|
778
|
+
// survives. The field repro: tunnel reused, "✓ tunnel up" printed, connector
|
|
779
|
+
// crash-looping on "credentials file … doesn't exist", every request → 1033.
|
|
780
|
+
//
|
|
781
|
+
// If the creds file is missing we recreate the tunnel: delete the
|
|
782
|
+
// account-side tunnel by name (`--force`, so a stale registered connector
|
|
783
|
+
// doesn't block it), then `tunnel create` re-writes a fresh creds file. The
|
|
784
|
+
// new tunnel gets a new UUID; `routeDns` below uses `--overwrite-dns`, so the
|
|
785
|
+
// hostname's CNAME is repointed at the new UUID even though it pointed at the
|
|
786
|
+
// old one. The field case confirmed `tunnel delete` + re-run heals cleanly.
|
|
787
|
+
const existingCreds = credentialsPath(tunnel.id, r.cloudflaredHome);
|
|
788
|
+
if (existsSync(existingCreds)) {
|
|
789
|
+
r.log(`✓ Reusing existing tunnel "${r.tunnelName}" (${tunnel.id})`);
|
|
790
|
+
} else {
|
|
791
|
+
r.log(
|
|
792
|
+
`⚠ Tunnel "${r.tunnelName}" (${tunnel.id}) exists in Cloudflare, but its local credentials`,
|
|
793
|
+
);
|
|
794
|
+
r.log(` file is missing (${existingCreds}) — the connector can't authenticate from this`);
|
|
795
|
+
r.log(" machine. Recreating the tunnel so a fresh credentials file is written…");
|
|
796
|
+
try {
|
|
797
|
+
await deleteTunnel(r.runner, r.tunnelName);
|
|
798
|
+
} catch (err) {
|
|
799
|
+
if (err instanceof CloudflaredError) {
|
|
800
|
+
r.log("");
|
|
801
|
+
r.log(`✗ Couldn't delete the stale tunnel automatically: ${err.message}`);
|
|
802
|
+
r.log("");
|
|
803
|
+
r.log("Recover manually, then re-run this command:");
|
|
804
|
+
r.log(` cloudflared tunnel delete --force ${r.tunnelName}`);
|
|
805
|
+
r.log(` parachute expose public --cloudflare --domain ${hostname}`);
|
|
806
|
+
return 1;
|
|
807
|
+
}
|
|
808
|
+
throw err;
|
|
809
|
+
}
|
|
810
|
+
try {
|
|
811
|
+
tunnel = await createTunnel(r.runner, r.tunnelName);
|
|
812
|
+
} catch (err) {
|
|
813
|
+
return reportCloudflaredError(err, r.log);
|
|
814
|
+
}
|
|
815
|
+
r.log(`✓ Recreated tunnel ${tunnel.id} (fresh credentials written).`);
|
|
816
|
+
}
|
|
698
817
|
}
|
|
699
818
|
|
|
700
819
|
r.log(`Routing DNS: ${hostname} → tunnel ${tunnel.id}…`);
|
|
@@ -955,6 +1074,42 @@ export async function exposeCloudflareUp(
|
|
|
955
1074
|
}
|
|
956
1075
|
}
|
|
957
1076
|
|
|
1077
|
+
// Post-start connection verification (#593). The connector pid existing is
|
|
1078
|
+
// NOT proof it connected — the field repro had a live pid crash-looping on a
|
|
1079
|
+
// missing creds file, with every public request returning Cloudflare error
|
|
1080
|
+
// 1033 (tunnel registered, no connector) while the CLI printed "✓ tunnel up".
|
|
1081
|
+
// Poll `cloudflared tunnel info` for a live edge connection, bounded. On
|
|
1082
|
+
// timeout, fail LOUDLY with the connector log path + the crash-loop signature
|
|
1083
|
+
// to grep for, instead of claiming success.
|
|
1084
|
+
r.log("");
|
|
1085
|
+
r.log("Verifying the connector established a tunnel connection…");
|
|
1086
|
+
const connected = await r.verifyConnection({
|
|
1087
|
+
runner: r.runner,
|
|
1088
|
+
tunnelName: r.tunnelName,
|
|
1089
|
+
timeoutMs: r.verifyTimeoutMs,
|
|
1090
|
+
pollMs: r.verifyPollMs,
|
|
1091
|
+
sleep: r.sleep,
|
|
1092
|
+
});
|
|
1093
|
+
if (!connected) {
|
|
1094
|
+
r.log("");
|
|
1095
|
+
r.log(
|
|
1096
|
+
`✗ The cloudflared connector (pid ${pid}) started but never registered a tunnel connection`,
|
|
1097
|
+
);
|
|
1098
|
+
r.log(` within ${Math.round(r.verifyTimeoutMs / 1000)}s. Public requests to ${hostname} will`);
|
|
1099
|
+
r.log(" return Cloudflare error 1033 (tunnel registered, no connector) until this resolves.");
|
|
1100
|
+
r.log("");
|
|
1101
|
+
r.log("Check the connector log for the crash-loop cause:");
|
|
1102
|
+
r.log(` tail -n 50 ${r.logPath}`);
|
|
1103
|
+
r.log(' A repeating "credentials file … doesn\'t exist" line means the local credentials are');
|
|
1104
|
+
r.log(
|
|
1105
|
+
" gone — re-run this command (it auto-recreates the tunnel). Other repeating errors point",
|
|
1106
|
+
);
|
|
1107
|
+
r.log(" at the specific failure. Confirm the connector once it's healthy with:");
|
|
1108
|
+
r.log(` cloudflared tunnel info ${r.tunnelName}`);
|
|
1109
|
+
return 1;
|
|
1110
|
+
}
|
|
1111
|
+
r.log("✓ Connector connected.");
|
|
1112
|
+
|
|
958
1113
|
const baseUrl = `https://${hostname}`;
|
|
959
1114
|
let vaultUrl: string | undefined;
|
|
960
1115
|
if (vaultEntry) {
|
package/src/commands/serve.ts
CHANGED
|
@@ -34,6 +34,7 @@ import { generateBootstrapToken } from "../bootstrap-token.ts";
|
|
|
34
34
|
// path isolation.
|
|
35
35
|
import { CONFIG_DIR, SERVICES_MANIFEST_PATH } from "../config.ts";
|
|
36
36
|
import { readExposeState } from "../expose-state.ts";
|
|
37
|
+
import { createDbHolder } from "../hub-db-liveness.ts";
|
|
37
38
|
import { hubDbPath, openHubDb } from "../hub-db.ts";
|
|
38
39
|
import { hubFetch } from "../hub-server.ts";
|
|
39
40
|
import { writeHubFile } from "../hub.ts";
|
|
@@ -345,8 +346,16 @@ export async function serve(opts: ServeOpts = {}): Promise<{
|
|
|
345
346
|
if (!existsSync(hubHtmlPath)) writeHubFile(hubHtmlPath);
|
|
346
347
|
|
|
347
348
|
const dbPath = hubDbPath();
|
|
348
|
-
|
|
349
|
-
|
|
349
|
+
// Self-heal-or-die DB holder (#594). The handle lives behind a mutable
|
|
350
|
+
// holder so a request that hits the persistent-corruption class (disk I/O
|
|
351
|
+
// error / malformed image — e.g. the state dir deleted under a running hub)
|
|
352
|
+
// can reopen the handle once, or exit(1) for the platform manager to restart
|
|
353
|
+
// us with a fresh one. `getDb` reads the current handle from the holder.
|
|
354
|
+
const dbHolder = createDbHolder(openHubDb(dbPath), {
|
|
355
|
+
reopen: () => openHubDb(dbPath),
|
|
356
|
+
log,
|
|
357
|
+
});
|
|
358
|
+
const adminBootstrap = await seedInitialAdminIfNeeded(dbHolder.get(), env, log);
|
|
350
359
|
|
|
351
360
|
if (adminBootstrap === "needs-setup") {
|
|
352
361
|
log(
|
|
@@ -381,7 +390,8 @@ export async function serve(opts: ServeOpts = {}): Promise<{
|
|
|
381
390
|
// CMD), so the fix has to land here too. Closes hub#399.
|
|
382
391
|
idleTimeout: 255,
|
|
383
392
|
fetch: hubFetch(WELL_KNOWN_DIR, {
|
|
384
|
-
getDb: () =>
|
|
393
|
+
getDb: () => dbHolder.get(),
|
|
394
|
+
onDbError: (err) => dbHolder.healOrExit(err),
|
|
385
395
|
issuer,
|
|
386
396
|
loopbackPort: port,
|
|
387
397
|
supervisor,
|
|
@@ -468,7 +478,7 @@ export async function serve(opts: ServeOpts = {}): Promise<{
|
|
|
468
478
|
await supervisor.stop(state.short);
|
|
469
479
|
}
|
|
470
480
|
await server.stop();
|
|
471
|
-
|
|
481
|
+
dbHolder.get().close();
|
|
472
482
|
},
|
|
473
483
|
};
|
|
474
484
|
}
|
|
@@ -0,0 +1,211 @@
|
|
|
1
|
+
import type { Database } from "bun:sqlite";
|
|
2
|
+
|
|
3
|
+
/**
|
|
4
|
+
* SQLite-handle liveness + self-heal policy (#594).
|
|
5
|
+
*
|
|
6
|
+
* Field repro: an operator deleted `~/.parachute` while the hub unit was
|
|
7
|
+
* running. The process kept an fd to the now-unlinked `hub.db` inode — cached
|
|
8
|
+
* reads half-worked, every write / WAL op threw `SQLiteError: disk I/O error`.
|
|
9
|
+
* Result: `/health` stayed 200 (it never touched the DB), every DB-touching
|
|
10
|
+
* route 500'd indefinitely, and operator-facing CLI checks lied (served from
|
|
11
|
+
* the dead handle's cached pages). An hour of clean 500s behind a green
|
|
12
|
+
* /health is the worst possible failure shape — a crash-restart would have
|
|
13
|
+
* self-healed in seconds (the platform manager re-`openHubDb`s a fresh handle).
|
|
14
|
+
*
|
|
15
|
+
* The policy here: on a request that hits the persistent-corruption error
|
|
16
|
+
* class, attempt ONE reopen of the handle; if the reopen fails OR the fresh
|
|
17
|
+
* handle fails a `SELECT 1` check, log loudly and `process.exit(1)` so the platform
|
|
18
|
+
* manager (launchd / systemd / container runtime) restarts with a fresh
|
|
19
|
+
* handle. We are careful to scope "fatal" to the persistent class — a
|
|
20
|
+
* transient `SQLITE_BUSY` (a momentary write lock) must NOT kill the hub.
|
|
21
|
+
*/
|
|
22
|
+
|
|
23
|
+
/**
 * Verdict for a thrown DB error.
 *
 * - `fatal`: persistent corruption or a dead handle (disk I/O error, database
 *   disk image is malformed, NOTADB, CORRUPT, IOERR). This is the class that
 *   triggers the reopen-once-or-exit machinery.
 * - `transient`: a momentary lock (SQLITE_BUSY / SQLITE_LOCKED). Never fatal;
 *   the caller reports it as an ordinary error and the next request likely
 *   succeeds.
 * - `other`: not a recognized SQLite-handle failure (for example a constraint
 *   violation or a programming error). Outside the liveness concern; the
 *   caller handles it as a normal error.
 */
export type DbErrorClass =
  | "fatal"
  | "transient"
  | "other";
|
|
36
|
+
|
|
37
|
+
/**
 * Build a lowercase `"<code> <name> <message>"` signature from an unknown
 * thrown value, for substring matching.
 *
 * `bun:sqlite` throws `SQLiteError` with a `code` (e.g. `SQLITE_IOERR`,
 * `SQLITE_BUSY`) and a `message` (e.g. "disk I/O error"). All available
 * fields are included so a runtime that surfaces one but not the other still
 * classifies correctly.
 *
 * @param err The thrown value; may be anything.
 * @returns The space-joined, lowercased string fields for object values, or
 *   the lowercased `String(err)` for everything else. An object that carries
 *   no non-empty string `code`/`name`/`message` yields `""`.
 */
function errorSignature(err: unknown): string {
  if (err && typeof err === "object") {
    const e = err as { code?: unknown; message?: unknown; name?: unknown };
    // Only non-empty string fields contribute. Joining the raw fields would
    // turn "nothing to report" into a whitespace-only string, which slips past
    // an empty-signature check in the caller.
    return [e.code, e.name, e.message]
      .filter((part): part is string => typeof part === "string" && part.length > 0)
      .join(" ")
      .toLowerCase();
  }
  return String(err).toLowerCase();
}
|
|
54
|
+
|
|
55
|
+
/**
 * Sort a thrown DB error into `"fatal"`, `"transient"` or `"other"`.
 *
 * The transient (BUSY / LOCKED) markers are evaluated before the fatal ones
 * on purpose: a lock must never be mistaken for corruption, even if a future
 * message happened to share a substring with the fatal set.
 *
 * @param err The thrown value, of any shape.
 * @returns The liveness verdict for that error.
 */
export function classifyDbError(err: unknown): DbErrorClass {
  const signature = errorSignature(err);
  if (signature.length === 0) return "other";

  const mentions = (needle: string): boolean => signature.includes(needle);

  // Transient locks are explicitly NON-fatal. SQLITE_BUSY is a momentary
  // write lock under WAL contention; killing the hub on it would turn ordinary
  // concurrency into a restart loop. SQLITE_LOCKED is the same class.
  //
  // Two SQLITE_IOERR *sub-codes* are contention rather than corruption:
  // SQLITE_IOERR_BLOCKED (a legacy busy variant) and SQLITE_IOERR_LOCK (a
  // lock-acquisition failure). They have to be recognised here, ahead of the
  // generic `sqlite_ioerr` substring test below, or they would be swept into
  // the fatal class and exit the hub on transient I/O contention.
  const looksTransient =
    mentions("sqlite_busy") ||
    mentions("sqlite_locked") ||
    /\bdatabase is locked\b/.test(signature) ||
    /\bdatabase table is locked\b/.test(signature) ||
    mentions("sqlite_ioerr_blocked") ||
    mentions("sqlite_ioerr_lock");
  if (looksTransient) return "transient";

  // Persistent-corruption / dead-handle class → fatal (reopen-once-or-exit).
  // `disk I/O error` is the exact field message (state dir deleted under a
  // running hub); the malformed-image, corrupt and notadb markers are the
  // related on-disk-corruption shapes.
  //
  // `sqlite_ioerr` matches the GENERIC `SQLITE_IOERR` code, which is what Bun
  // surfaces for the dead-handle case (`code: "SQLITE_IOERR"`,
  // `message: "disk I/O error"`, no sub-code). The two transient IOERR
  // sub-codes were already handled above, so a hit here is either the generic
  // code or a corruption sub-code — both fatal. The message text is matched
  // as well so a runtime that surfaces the message but not the code still
  // classifies.
  const fatalMarkers = [
    "disk i/o error",
    "sqlite_ioerr",
    "database disk image is malformed",
    "sqlite_corrupt",
    "sqlite_notadb",
    "file is not a database",
  ];
  const looksFatal = fatalMarkers.some((marker) => signature.includes(marker));

  return looksFatal ? "fatal" : "other";
}
|
|
106
|
+
|
|
107
|
+
/**
 * Cheap DB liveness probe for `/health` (#594). Runs `SELECT 1` against the
 * given handle.
 *
 * NEVER throws — a probe that threw would make /health itself 500, defeating
 * the point (/health must stay fast + reliable).
 *
 * The return type is spelled out as a template literal instead of
 * `"ok" | string` (which TypeScript collapses to plain `string`), so callers
 * can switch on the exact values. `ReturnType<typeof classifyDbError>` is the
 * `DbErrorClass` union.
 *
 * @param db The handle to probe.
 * @returns `"ok"` when the query succeeds, otherwise `"error: <class>"` where
 *   `<class>` is the {@link classifyDbError} verdict for the thrown error, so
 *   a monitor can tell "hub up but DB dead" apart from "hub up, DB fine".
 */
export function probeDbLiveness(
  db: Database,
): "ok" | `error: ${ReturnType<typeof classifyDbError>}` {
  try {
    db.query("SELECT 1").get();
    return "ok";
  } catch (err) {
    return `error: ${classifyDbError(err)}`;
  }
}
|
|
122
|
+
|
|
123
|
+
/**
 * Mutable wrapper around the hub's `Database` handle.
 *
 * It exists so a request handler that runs into the fatal error class can
 * have a freshly reopened handle swapped in, without the closure-captured
 * `db` being re-threaded through every call site. `getDb()` in hub-server
 * reads `holder.get()`; the self-heal path calls `holder.healOrExit(err)`.
 */
export interface DbHolder {
  /** Returns the handle that is currently installed. */
  get(): Database;

  /**
   * Apply the liveness policy to a thrown DB error.
   *
   * - `transient` / `other` errors: nothing happens and `"ignored"` is
   *   returned (the caller surfaces a normal error).
   * - `fatal`, and both the reopen and a `SELECT 1` on the new handle
   *   succeed: the new handle is swapped in and `"healed"` is returned (the
   *   caller retries, or surfaces a transient error the next request clears).
   * - `fatal`, and the reopen fails OR the new handle still fails `SELECT 1`:
   *   log loudly and `exit(1)`. `"exited"` is only ever observed in tests,
   *   where the injected exit fn doesn't actually end the process.
   *
   * Reopen-once semantics: one fatal error triggers one reopen attempt. If
   * the *reopened* handle is dead as well (e.g. the underlying dir is still
   * gone) the hub exits instead of looping — the platform manager owns the
   * restart.
   */
  healOrExit(err: unknown): "ignored" | "healed" | "exited";
}
|
|
149
|
+
|
|
150
|
+
/** Collaborators injected into `createDbHolder`; only `reopen` is required. */
export interface DbHolderDeps {
  /** Opens a brand-new handle (production: `() => openHubDb(dbPath)`). */
  reopen: () => Database;

  /** Sink for the loud log lines (defaults to `console.error`). */
  log?: (line: string) => void;

  /** Ends the process (defaults to `process.exit`; tests inject a spy). */
  exit?: (code: number) => void;

  /**
   * Best-effort close of the presumed-dead handle when it is replaced
   * (defaults to calling `db.close()`).
   */
  closeOld?: (db: Database) => void;
}
|
|
160
|
+
|
|
161
|
+
/**
 * Build a {@link DbHolder} over an initial handle.
 *
 * Production wires `reopen: () => openHubDb(dbPath)` with the default
 * exit/log; tests inject a fake reopen and a non-exiting `exit` spy so the
 * fatal branch is exercised without killing the test process.
 *
 * @param initial The handle the holder starts out with.
 * @param deps Reopen function plus optional log / exit / close overrides.
 * @returns A holder whose `get()` always returns the currently installed
 *   handle and whose `healOrExit()` applies the reopen-or-exit policy.
 */
export function createDbHolder(initial: Database, deps: DbHolderDeps): DbHolder {
  let current = initial;
  const log = deps.log ?? ((line) => console.error(line));
  const exit = deps.exit ?? ((code) => process.exit(code));
  const closeOld =
    deps.closeOld ??
    ((db) => {
      try {
        db.close();
      } catch {
        // Best-effort — a dead handle may throw on close; we're replacing it.
      }
    });

  return {
    get: () => current,
    healOrExit(err: unknown) {
      // Only the persistent-corruption class triggers the heal machinery.
      // NOTE(review): no state is kept between calls, so every fatal error
      // starts a fresh reopen attempt; "once" is per call, not per process.
      const klass = classifyDbError(err);
      if (klass !== "fatal") return "ignored";

      const detail = err instanceof Error ? err.message : String(err);
      log(`parachute hub: persistent SQLite failure (${detail}). Attempting one DB handle reopen…`);

      let reopened: Database | undefined;
      try {
        reopened = deps.reopen();
        // Confirm the fresh handle is actually live before trusting it.
        reopened.query("SELECT 1").get();
      } catch (reopenErr) {
        // If the handle opened but failed verification, release it. With the
        // default `exit` the process ends anyway, but an injected `exit`
        // returns and would otherwise leave the half-open handle dangling.
        try {
          reopened?.close();
        } catch {
          // Best-effort — the handle just failed a query; close may throw too.
        }
        const rd = reopenErr instanceof Error ? reopenErr.message : String(reopenErr);
        log(
          `parachute hub: DB reopen failed (${rd}); exiting so the platform manager restarts the hub with a fresh handle.`,
        );
        exit(1);
        return "exited";
      }

      // Reopen succeeded + verified. Install the fresh handle BEFORE closing
      // the dead one, so a throwing injected `closeOld` cannot leave the dead
      // handle in place.
      const dead = current;
      current = reopened;
      closeOld(dead);
      log("parachute hub: DB handle reopened successfully; continuing.");
      return "healed";
    },
  };
}
|