@openparachute/hub 0.6.4 → 0.6.5-rc.1

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -0,0 +1,211 @@
1
+ import type { Database } from "bun:sqlite";
2
+
3
+ /**
4
+ * SQLite-handle liveness + self-heal policy (#594).
5
+ *
6
+ * Field repro: an operator deleted `~/.parachute` while the hub unit was
7
+ * running. The process kept an fd to the now-unlinked `hub.db` inode — cached
8
+ * reads half-worked, every write / WAL op threw `SQLiteError: disk I/O error`.
9
+ * Result: `/health` stayed 200 (it never touched the DB), every DB-touching
10
+ * route 500'd indefinitely, and operator-facing CLI checks lied (served from
11
+ * the dead handle's cached pages). An hour of clean 500s behind a green
12
+ * /health is the worst possible failure shape — a crash-restart would have
13
+ * self-healed in seconds (the platform manager re-`openHubDb`s a fresh handle).
14
+ *
15
+ * The policy here: on a request that hits the persistent-corruption error
16
+ * class, attempt ONE reopen of the handle; if the reopen fails OR the fresh
17
+ * handle fails its immediate `SELECT 1` check, log loudly and `process.exit(1)` so the platform
18
+ * manager (launchd / systemd / container runtime) restarts with a fresh
19
+ * handle. We are careful to scope "fatal" to the persistent class — a
20
+ * transient `SQLITE_BUSY` (a momentary write lock) must NOT kill the hub.
21
+ */
22
+
23
/**
 * Verdict assigned to a thrown database error.
 *
 * - `fatal`: the handle or the on-disk image is persistently broken (disk I/O
 *   error, malformed disk image, NOTADB, CORRUPT, IOERR). This is the class
 *   that triggers the reopen-once-or-exit machinery.
 * - `transient`: a momentary lock (SQLITE_BUSY / SQLITE_LOCKED). Never fatal;
 *   the caller reports it as an ordinary error and the next request likely
 *   succeeds.
 * - `other`: not a recognized SQLite-handle failure (for example a constraint
 *   violation or a programming error). Outside the liveness concern; the
 *   caller handles it as a normal error.
 */
export type DbErrorClass =
  | "fatal"
  | "transient"
  | "other";
36
+
37
/**
 * Flatten an unknown thrown value into one lowercase string for substring
 * matching.
 *
 * For objects, the string-valued `code`, `name` and `message` fields are
 * joined with single spaces (missing, empty or non-string fields are
 * skipped). `bun:sqlite` throws `SQLiteError` with a `code` (e.g.
 * `SQLITE_IOERR`) and a `message` (e.g. "disk I/O error"); both are kept so a
 * runtime that surfaces only one of them still classifies correctly. Any
 * other value goes through `String()`.
 *
 * The result carries no leading/trailing whitespace, so a value with no usable
 * text yields `""` rather than a whitespace-only string.
 */
function errorSignature(err: unknown): string {
  if (err && typeof err === "object") {
    const e = err as { code?: unknown; message?: unknown; name?: unknown };
    const parts = [e.code, e.name, e.message].filter(
      (part): part is string => typeof part === "string" && part.length > 0,
    );
    return parts.join(" ").trim().toLowerCase();
  }
  return String(err).trim().toLowerCase();
}

/**
 * Code substrings that mean "momentary contention", never corruption.
 *
 * `sqlite_ioerr_blocked` (a legacy busy variant) and `sqlite_ioerr_lock` (a
 * lock-acquisition failure) are listed here because the generic
 * `sqlite_ioerr` fatal marker would otherwise sweep them into the fatal class.
 */
const TRANSIENT_DB_ERROR_CODES: readonly string[] = [
  "sqlite_busy",
  "sqlite_locked",
  "sqlite_ioerr_blocked",
  "sqlite_ioerr_lock",
];

/** Message forms of the transient lock errors, matched on word boundaries. */
const TRANSIENT_DB_ERROR_MESSAGES: readonly RegExp[] = [
  /\bdatabase is locked\b/,
  /\bdatabase table is locked\b/,
];

/**
 * Substrings that mean "persistent corruption / dead handle".
 *
 * `sqlite_ioerr` matches the generic `SQLITE_IOERR` code and every remaining
 * IOERR sub-code (the two contention sub-codes are filtered out earlier).
 * `disk i/o error` and the other message texts are matched directly so a
 * runtime that surfaces the message but not the code still classifies.
 * `sqlite_readonly_dbmoved` is SQLite's extended code for "the database file
 * was moved or unlinked after it was opened" — a dead handle by definition.
 * Plain `SQLITE_READONLY` is deliberately NOT listed.
 */
const FATAL_DB_ERROR_MARKERS: readonly string[] = [
  "disk i/o error",
  "sqlite_ioerr",
  "database disk image is malformed",
  "sqlite_corrupt",
  "sqlite_notadb",
  "file is not a database",
  "sqlite_readonly_dbmoved",
];

/**
 * Classify a thrown DB error.
 *
 * Order matters: the transient (lock / contention) markers are tested FIRST
 * so they can never be mistaken for the fatal class, even when a fatal marker
 * is a prefix of a transient one (`sqlite_ioerr` vs `sqlite_ioerr_lock`).
 *
 * @param err - Any thrown value (an `Error`, an error-like object, a string, …).
 * @returns `"transient"` for lock/contention errors, `"fatal"` for
 *   persistent-corruption / dead-handle errors, `"other"` for everything else
 *   (including values that carry no text at all).
 */
export function classifyDbError(err: unknown): DbErrorClass {
  const sig = errorSignature(err);
  if (sig.length === 0) return "other";

  // Transient locks — explicitly NON-fatal. Killing the hub on a momentary
  // write lock would turn ordinary concurrency into a restart loop.
  const isTransient =
    TRANSIENT_DB_ERROR_CODES.some((marker) => sig.includes(marker)) ||
    TRANSIENT_DB_ERROR_MESSAGES.some((pattern) => pattern.test(sig));
  if (isTransient) return "transient";

  // Persistent-corruption / dead-handle class → reopen-once-or-exit.
  if (FATAL_DB_ERROR_MARKERS.some((marker) => sig.includes(marker))) {
    return "fatal";
  }

  return "other";
}
106
+
107
/**
 * Cheap DB liveness probe for `/health` (#594).
 *
 * Runs `SELECT 1` on the given handle and reports the outcome as a string so a
 * monitor can tell "hub up but DB dead" apart from "hub up, DB fine".
 *
 * The probe's own failures are caught and reported rather than thrown: a
 * probe that threw would make `/health` itself 500, defeating the point.
 *
 * NOTE(review): SQLite can evaluate a constant `SELECT 1` without reading the
 * database file, so this probe may stay green for some dead-handle states —
 * confirm against the field repro; a statement that reads the schema would be
 * a stricter (still cheap) check.
 *
 * @param db - The handle to probe.
 * @returns `"ok"` when the query succeeds, otherwise `"error: <class>"` where
 *   `<class>` is the `classifyDbError` verdict for the thrown error.
 */
export function probeDbLiveness(db: Database): "ok" | `error: ${DbErrorClass}` {
  try {
    db.query("SELECT 1").get();
    return "ok";
  } catch (err) {
    const verdict: DbErrorClass = classifyDbError(err);
    return `error: ${verdict}`;
  }
}
122
+
123
/**
 * Mutable wrapper around the hub's `Database` handle.
 *
 * Callers fetch the handle through `get()` whenever they need it instead of
 * capturing it once, so the self-heal path can swap in a freshly reopened
 * handle without re-threading a closure-captured `db` through every call
 * site. A request handler that catches a DB error passes it to
 * `healOrExit(err)`.
 */
export interface DbHolder {
  /** Returns the handle currently considered live. */
  get(): Database;

  /**
   * Apply the liveness policy to a thrown DB error.
   *
   * Reopen-once semantics: one fatal error triggers exactly one reopen
   * attempt. If the reopened handle is also dead (e.g. the underlying
   * directory is still gone) the holder exits instead of looping — the
   * platform manager owns the restart.
   *
   * @param err - The value thrown by a DB operation.
   * @returns
   *   - `"ignored"` — the error classifies as `transient` or `other`; the
   *     caller surfaces a normal error.
   *   - `"healed"` — the error is `fatal`, the reopen succeeded and a
   *     `SELECT 1` passed on the new handle, which is now the current one
   *     (the caller retries, or surfaces an error the next request clears).
   *   - `"exited"` — the error is `fatal` and the reopen failed or the new
   *     handle failed `SELECT 1`; the failure is logged loudly and `exit(1)`
   *     is called. Only observable in tests, where the injected exit
   *     function does not actually terminate the process.
   */
  healOrExit(err: unknown): "ignored" | "healed" | "exited";
}
149
+
150
/** Injectable collaborators for `createDbHolder`; only `reopen` is required. */
export interface DbHolderDeps {
  /** Opens and returns a fresh handle (production: `() => openHubDb(dbPath)`). */
  reopen: () => Database;
  /**
   * Best-effort close of the presumed-dead handle that is being replaced.
   * Defaults to calling `db.close()` and ignoring any error it throws.
   */
  closeOld?: (db: Database) => void;
  /** Sink for the loud log lines. Defaults to `console.error`. */
  log?: (line: string) => void;
  /** Terminates the process. Defaults to `process.exit`; tests inject a spy. */
  exit?: (code: number) => void;
}
160
+
161
/**
 * Build a {@link DbHolder} over an initial handle.
 *
 * Production wires `reopen: () => openHubDb(dbPath)` and the default
 * exit/log; tests inject a fake `reopen` plus a non-exiting `exit` spy so the
 * fatal branch can be exercised without killing the test process.
 *
 * Handle hygiene on the heal path:
 * - A handle that `reopen()` returned but that then failed the `SELECT 1`
 *   verification is closed best-effort before exiting, so it does not leak
 *   when `exit` is a non-terminating spy.
 * - The verified handle is swapped in BEFORE the stale one is closed, and a
 *   throwing `closeOld` is logged rather than propagated, so a failed close
 *   can neither leave the dead handle current nor leak the new one.
 *
 * @param initial - The handle the holder starts with.
 * @param deps - Reopen function plus optional log / exit / closeOld overrides.
 * @returns A holder whose `get()` always returns the current handle.
 */
export function createDbHolder(initial: Database, deps: DbHolderDeps): DbHolder {
  let current = initial;
  const log = deps.log ?? ((line: string) => console.error(line));
  const exit = deps.exit ?? ((code: number) => process.exit(code));
  const closeOld =
    deps.closeOld ??
    ((db: Database) => {
      try {
        db.close();
      } catch {
        // Best-effort — a dead handle may throw on close; we're replacing it.
      }
    });

  /** Human-readable text for a thrown value, used in log lines. */
  const describeError = (e: unknown): string => (e instanceof Error ? e.message : String(e));

  return {
    get: () => current,
    healOrExit(err: unknown) {
      // Only the persistent-corruption class triggers the heal machinery.
      if (classifyDbError(err) !== "fatal") return "ignored";

      log(
        `parachute hub: persistent SQLite failure (${describeError(err)}). Attempting one DB handle reopen…`,
      );

      let reopened: Database | undefined;
      try {
        reopened = deps.reopen();
        // Confirm the fresh handle is actually live before trusting it.
        reopened.query("SELECT 1").get();
      } catch (reopenErr) {
        if (reopened !== undefined) {
          // reopen() succeeded but verification failed: release the
          // half-trusted handle instead of leaking it.
          try {
            reopened.close();
          } catch {
            // Best-effort — we are about to exit anyway.
          }
        }
        log(
          `parachute hub: DB reopen failed (${describeError(reopenErr)}); exiting so the platform manager restarts the hub with a fresh handle.`,
        );
        exit(1);
        // Reached only when `exit` is an injected, non-terminating spy.
        return "exited";
      }

      // Reopen succeeded + verified. Publish the new handle first so a
      // failing close of the stale one cannot undo the heal.
      const stale = current;
      current = reopened;
      try {
        closeOld(stale);
      } catch (closeErr) {
        log(
          `parachute hub: closing the stale DB handle failed (${describeError(closeErr)}); continuing with the reopened handle.`,
        );
      }
      log("parachute hub: DB handle reopened successfully; continuing.");
      return "healed";
    },
  };
}