@openparachute/hub 0.6.4 → 0.6.5-rc.1

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -5,6 +5,7 @@ import { join } from "node:path";
5
5
  import { fileURLToPath } from "node:url";
6
6
  import { buildCsrfCookie, generateCsrfToken } from "../csrf.ts";
7
7
  import { HUB_SVC, hubPortPath } from "../hub-control.ts";
8
+ import { createDbHolder } from "../hub-db-liveness.ts";
8
9
  import { hubDbPath, openHubDb } from "../hub-db.ts";
9
10
  import {
10
11
  findServiceUpstream,
@@ -1240,14 +1241,10 @@ describe("hubFetch routing", () => {
1240
1241
  // open shouldn't cascade into a restart loop. The body advertises the
1241
1242
  // running version so a deploy verifier can confirm the rolled-out
1242
1243
  // image is the one it expected.
1243
- test("/health returns 200 JSON without invoking the db", async () => {
1244
+ test("/health returns 200 JSON; unconfigured db field when no getDb", async () => {
1244
1245
  const h = makeHarness();
1245
1246
  try {
1246
- const res = await hubFetch(h.dir, {
1247
- getDb: () => {
1248
- throw new Error("getDb must not be called by /health");
1249
- },
1250
- })(req("/health"));
1247
+ const res = await hubFetch(h.dir)(req("/health"));
1251
1248
  expect(res.status).toBe(200);
1252
1249
  expect(res.headers.get("content-type")).toContain("application/json");
1253
1250
  expect(res.headers.get("cache-control")).toBe("no-store");
@@ -1255,11 +1252,153 @@ describe("hubFetch routing", () => {
1255
1252
  expect(body.status).toBe("ok");
1256
1253
  expect(body.service).toBe("parachute-hub");
1257
1254
  expect(typeof body.version).toBe("string");
1255
+ expect(body.db).toBe("unconfigured");
1258
1256
  } finally {
1259
1257
  h.cleanup();
1260
1258
  }
1261
1259
  });
1262
1260
 
1261
// Readiness signal on a healthy handle (#594): when `getDb` hands back an
// open database, /health answers 200 and reports `db: "ok"`.
test("/health db field is ok on a live db (#594)", async () => {
  const harness = makeHarness();
  const liveDb = openHubDb(hubDbPath(harness.dir));
  try {
    const handler = hubFetch(harness.dir, { getDb: () => liveDb });
    const response = await handler(req("/health"));
    expect(response.status).toBe(200);
    const payload = (await response.json()) as Record<string, unknown>;
    expect(payload.status).toBe("ok");
    expect(payload.db).toBe("ok");
  } finally {
    // Close the handle before the harness cleans up.
    liveDb.close();
    harness.cleanup();
  }
});
1275
+
1276
// Readiness signal on a dead handle (#594): the HTTP status stays 200 (process
// liveness) while the `db` field carries an "error:"-prefixed verdict.
test("/health db field reports error: <class> on a dead handle, still 200 (#594)", async () => {
  const harness = makeHarness();
  const deadDb = openHubDb(hubDbPath(harness.dir));
  // Closing up front produces the dead handle — every SELECT on it throws.
  deadDb.close();
  try {
    const handler = hubFetch(harness.dir, { getDb: () => deadDb });
    const response = await handler(req("/health"));
    // Always 200 while the process is up — the status code is process
    // liveness, the db field is the readiness signal.
    expect(response.status).toBe(200);
    const payload = (await response.json()) as Record<string, unknown>;
    expect(payload.status).toBe("ok");
    expect(typeof payload.db).toBe("string");
    const dbVerdict = payload.db as string;
    expect(dbVerdict.startsWith("error:")).toBe(true);
  } finally {
    harness.cleanup();
  }
});
1293
+
1294
// Fatal-class DB failure on a normal route (#594): the error must come back as
// a structured 503 `db_unavailable` body (not a bare 500) and the `onDbError`
// hook must fire exactly once.
test("a fatal DB throw escaping a handler returns a structured body + invokes onDbError (#594)", async () => {
  const harness = makeHarness();
  try {
    const ioError = Object.assign(new Error("disk I/O error"), {
      name: "SQLiteError",
      code: "SQLITE_IOERR",
    });
    let healCalls = 0;
    // `/` is a non-/health, DB-touching route; its `getDb` throws the fatal
    // class, which the top-level self-heal wrapper is expected to catch.
    const handler = hubFetch(harness.dir, {
      manifestPath: harness.manifestPath,
      getDb: () => {
        throw ioError;
      },
      onDbError: () => {
        healCalls += 1;
        return "healed";
      },
    });
    const response = await handler(req("/", { headers: { accept: "text/html" } }));
    expect(healCalls).toBe(1);
    expect(response.status).toBe(503);
    const payload = (await response.json()) as Record<string, unknown>;
    expect(payload.error).toBe("db_unavailable");
    expect(typeof payload.error_description).toBe("string");
    expect(payload.error_description as string).toContain("reopened");
  } finally {
    harness.cleanup();
  }
});
1325
+
1326
// Guard against over-catching (#594): an ordinary Error thrown from `getDb`
// must reject out of the handler untouched, and `onDbError` must never run.
test("a non-DB throw still propagates (not swallowed by the self-heal wrapper) (#594)", async () => {
  const harness = makeHarness();
  try {
    let healCalls = 0;
    const handler = hubFetch(harness.dir, {
      manifestPath: harness.manifestPath,
      getDb: () => {
        throw new Error("some unrelated programming error");
      },
      onDbError: () => {
        healCalls += 1;
        return "ignored";
      },
    });
    const pending = handler(req("/", { headers: { accept: "text/html" } }));
    await expect(pending).rejects.toThrow("some unrelated programming error");
    expect(healCalls).toBe(0);
  } finally {
    harness.cleanup();
  }
});
1348
+
1349
// End-to-end transient path (#594) with a REAL DbHolder: a SQLITE_BUSY thrown
// on the first `getDb()` is routed through `holder.healOrExit`, and the test
// asserts a 503 `db_unavailable` body naming the transient class with neither
// a reopen nor an exit ("a momentary lock never kills the hub").
test("a transient SQLITE_BUSY escaping a handler → 503, does NOT exit the hub (#594)", async () => {
  const harness = makeHarness();
  const liveDb = openHubDb(hubDbPath(harness.dir));
  try {
    // Spies: either flag flipping to true is the regression this test guards.
    const observed = { exited: false, reopened: false };
    const holder = createDbHolder(liveDb, {
      reopen: () => {
        observed.reopened = true;
        return liveDb;
      },
      exit: () => {
        observed.exited = true;
      },
      log: () => {},
    });
    const busyError = Object.assign(new Error("database is locked"), {
      name: "SQLiteError",
      code: "SQLITE_BUSY",
    });
    // Only the very first getDb() (the wizard-redirect userCount read) throws
    // BUSY; every later call is served by the holder.
    let getDbCalls = 0;
    const handler = hubFetch(harness.dir, {
      manifestPath: harness.manifestPath,
      getDb: () => {
        getDbCalls += 1;
        if (getDbCalls === 1) {
          throw busyError;
        }
        return holder.get();
      },
      onDbError: (err) => holder.healOrExit(err),
    });
    const response = await handler(req("/", { headers: { accept: "text/html" } }));
    expect(response.status).toBe(503);
    const payload = (await response.json()) as Record<string, unknown>;
    expect(payload.error).toBe("db_unavailable");
    // The structured body names the transient class, not "reopened".
    expect(payload.error_description as string).toContain("transient");
    // The crux: no exit and no reopen on a transient lock.
    expect(observed.exited).toBe(false);
    expect(observed.reopened).toBe(false);
  } finally {
    liveDb.close();
    harness.cleanup();
  }
});
1401
+
1263
1402
  // First-boot setup wizard (hub#259, expanding hub#258's static
1264
1403
  // placeholder). When no admin exists, GET /admin/setup renders the
1265
1404
  // wizard's account-step form. Once admin + vault both exist, it 301s
@@ -44,7 +44,7 @@ function fakeDeps(
44
44
  * `null` (hub not answering) or `{ ok, version }`. Drives
45
45
  * `probeHealthVersion` across the version-check + post-restart re-probe.
46
46
  */
47
- healthVersionSeq?: ({ ok: boolean; version?: string } | null)[];
47
+ healthVersionSeq?: ({ ok: boolean; version?: string; db?: string } | null)[];
48
48
  listeningSeq?: boolean[];
49
49
  installedUnit?: boolean;
50
50
  } = {},
@@ -756,4 +756,113 @@ describe("ensureHubVersionMatches — version-check-and-restart at adoption (#59
756
756
  expect(res.outcome).toBe("restart-failed");
757
757
  expect(res.messages.join("\n")).toContain("Unit parachute-hub.service not found.");
758
758
  });
759
+
760
// #594: a matching VERSION is not sufficient. When /health also reports a db
// fault (dead handle — state dir deleted under the hub), the hub is treated as
// needing a restart and goes through the same restart-once machinery.
test("version matches but /health reports db fault → restart-once → restarted when db heals", async () => {
  // Probe order: right version with a dead DB handle first, then a live DB on
  // the re-probe that follows the restart.
  const probes = [
    { ok: true, version: INSTALLED, db: "error: fatal" },
    { ok: true, version: INSTALLED, db: "ok" },
  ];
  const fake = fakeDeps({
    platform: "darwin",
    getuid: () => 501,
    installedUnit: true,
    healthVersionSeq: probes,
  });
  const result = await ensureHubVersionMatches({
    installedVersion: INSTALLED,
    port: 1939,
    deps: fake.deps,
    readyPollMs: 0,
  });
  expect(result.outcome).toBe("restarted");
  const kickstarts = fake.calls.filter((call) => call.includes("kickstart"));
  expect(kickstarts).toHaveLength(1);
});
785
+
786
// #594: a SUSTAINED transient fault that shows up in /health (e.g. a write
// lock that never clears) is still an "error:" verdict, so adoption restarts
// the hub exactly as it does for the fatal case. This pins that
// `healthReportsDbFault` keys on the "error:" prefix rather than the fatal
// class.
test("version matches but /health reports db error: transient → restart-once (#594)", async () => {
  // Probe order: right version with a sustained transient DB fault first, then
  // a live DB on the re-probe that follows the restart.
  const probes = [
    { ok: true, version: INSTALLED, db: "error: transient" },
    { ok: true, version: INSTALLED, db: "ok" },
  ];
  const fake = fakeDeps({
    platform: "darwin",
    getuid: () => 501,
    installedUnit: true,
    healthVersionSeq: probes,
  });
  const result = await ensureHubVersionMatches({
    installedVersion: INSTALLED,
    port: 1939,
    deps: fake.deps,
    readyPollMs: 0,
  });
  expect(result.outcome).toBe("restarted");
  const kickstarts = fake.calls.filter((call) => call.includes("kickstart"));
  expect(kickstarts).toHaveLength(1);
});
812
+
813
// Healthy baseline (#594): matching version plus `db: "ok"` yields "match"
// and records no calls at all — the db check must not fire on a healthy hub.
test("version + db both ok → match, NO restart (#594 doesn't fire on a healthy hub)", async () => {
  const healthyProbe = { ok: true, version: INSTALLED, db: "ok" };
  const fake = fakeDeps({
    platform: "darwin",
    getuid: () => 501,
    installedUnit: true,
    healthVersionSeq: [healthyProbe],
  });
  const result = await ensureHubVersionMatches({
    installedVersion: INSTALLED,
    port: 1939,
    deps: fake.deps,
    readyPollMs: 0,
  });
  expect(result.outcome).toBe("match");
  expect(fake.calls).toEqual([]);
});
829
+
830
// Persistent fault (#594): when the db fault is still reported after the
// restart, the outcome is "still-mismatched", exactly one restart was issued,
// and the messages carry the db-specific wording.
test("db fault persists after the restart → still-mismatched with a db-specific message (#594)", async () => {
  // A single dead-handle probe entry (state dir still gone). The test expects
  // one restart and then a settled outcome — no loop.
  const deadProbe = { ok: true, version: INSTALLED, db: "error: fatal" };
  const fake = fakeDeps({
    platform: "darwin",
    getuid: () => 501,
    installedUnit: true,
    healthVersionSeq: [deadProbe],
  });
  const result = await ensureHubVersionMatches({
    installedVersion: INSTALLED,
    port: 1939,
    deps: fake.deps,
    readyTimeoutMs: 0,
    readyPollMs: 0,
  });
  expect(result.outcome).toBe("still-mismatched");
  const kickstarts = fake.calls.filter((call) => call.includes("kickstart"));
  expect(kickstarts).toHaveLength(1);
  const transcript = result.messages.join("\n");
  expect(transcript).toContain("database still reports a fault");
});
851
+
852
// Backward compatibility (#594): a hub whose /health payload has no `db` field
// at all (pre-#594) and whose version matches is a plain "match" — the missing
// field is not read as a fault and no calls are recorded.
test("a hub with NO db field (pre-#594) on a version match → match, not treated as a fault", async () => {
  // Deliberately no `db` key on the probe.
  const legacyProbe = { ok: true, version: INSTALLED };
  const fake = fakeDeps({
    platform: "darwin",
    getuid: () => 501,
    installedUnit: true,
    healthVersionSeq: [legacyProbe],
  });
  const result = await ensureHubVersionMatches({
    installedVersion: INSTALLED,
    port: 1939,
    deps: fake.deps,
    readyPollMs: 0,
  });
  expect(result.outcome).toBe("match");
  expect(fake.calls).toEqual([]);
});
759
868
  });
@@ -133,3 +133,73 @@ export async function routeDns(
133
133
  export function credentialsPath(uuid: string, cloudflaredHome: string): string {
134
134
  return join(cloudflaredHome, `${uuid}.json`);
135
135
  }
136
+
137
/**
 * Delete the account-side tunnel by running
 * `cloudflared tunnel delete --force <name>`.
 *
 * Part of the reuse-path self-heal (#593): an existing tunnel whose local
 * credentials file is missing cannot be used from this machine, so the caller
 * deletes it and recreates it, letting `tunnel create` write a fresh
 * `~/.cloudflared/<uuid>.json`.
 *
 * `--force` keeps the delete non-interactive and tears down any lingering
 * connector record still registered for the tunnel. Without it `tunnel delete`
 * refuses ("tunnel has active connections") when a stale connector is
 * registered account-side — precisely the crash-loop state #593 heals. Deleting
 * a tunnel that has no live local connector is safe: in the field repro,
 * `tunnel delete` followed by a re-run worked cleanly.
 *
 * @param runner Executes the argv and resolves with the command result.
 * @param name   Name of the tunnel to delete.
 * @throws CloudflaredError when the command exits with a non-zero code; the
 *         error carries the argv and the command result.
 */
export async function deleteTunnel(runner: Runner, name: string): Promise<void> {
  const argv = ["cloudflared", "tunnel", "delete", "--force", name];
  const outcome = await runner(argv);
  if (outcome.code === 0) return;
  throw new CloudflaredError(
    `cloudflared tunnel delete failed: ${combineErrStreams(outcome)}`,
    argv,
    outcome,
  );
}
162
+
163
/**
 * Report how many connector entries `cloudflared tunnel info --output json
 * <name>` lists for a tunnel.
 *
 * Backs the post-start connection verification (#593): a spawned connector
 * pid being alive does not mean the connector registered an edge connection
 * (the error-1033 field repro — pid alive, connector crash-looping on a
 * missing creds file, every request 1033).
 *
 * The entries are read from a top-level `conns` array, falling back to a
 * top-level `connections` array (described as an older shape). Every failure
 * mode — the runner throwing, a non-zero exit code, unparseable stdout, a
 * non-object payload, or neither key holding an array — yields `0`
 * ("not connected yet"). The caller polls, so a transient miss only costs one
 * more poll.
 *
 * @param runner Executes the argv and resolves with the command result.
 * @param name   Name of the tunnel to inspect.
 * @returns The number of entries found; `> 0` means at least one edge
 *          connection is live.
 */
export async function tunnelConnectionCount(runner: Runner, name: string): Promise<number> {
  const argv = ["cloudflared", "tunnel", "info", "--output", "json", name];

  let outcome: CommandResult;
  try {
    outcome = await runner(argv);
  } catch {
    // CLI could not be run at all — treat as not connected.
    return 0;
  }
  if (outcome.code !== 0) return 0;

  let payload: unknown;
  try {
    payload = JSON.parse(outcome.stdout);
  } catch {
    // Output was not JSON — treat as not connected.
    return 0;
  }
  if (typeof payload !== "object" || payload === null) return 0;

  // Prefer `conns`; fall back to `connections` only when `conns` is absent
  // (null/undefined).
  const record = payload as Record<string, unknown>;
  const entries = record.conns ?? record.connections;

  // An entry may itself carry a nested `conns` array (per-colo detail); the
  // mere presence of an entry is the signal needed here, so only the outer
  // length is counted.
  return Array.isArray(entries) ? entries.length : 0;
}
@@ -1,5 +1,5 @@
1
1
  import { spawnSync } from "node:child_process";
2
- import { mkdirSync, openSync } from "node:fs";
2
+ import { existsSync, mkdirSync, openSync } from "node:fs";
3
3
  import { dirname } from "node:path";
4
4
  import {
5
5
  DEFAULT_TUNNEL_NAME,
@@ -37,8 +37,10 @@ import {
37
37
  type Tunnel,
38
38
  createTunnel,
39
39
  credentialsPath,
40
+ deleteTunnel,
40
41
  findTunnelByName,
41
42
  routeDns,
43
+ tunnelConnectionCount,
42
44
  } from "../cloudflare/tunnel.ts";
43
45
  import { CONFIG_DIR, SERVICES_MANIFEST_PATH } from "../config.ts";
44
46
  import {
@@ -267,6 +269,48 @@ export function looksLikeCloudflare(addresses: readonly string[]): boolean {
267
269
  return false;
268
270
  }
269
271
 
272
/**
 * Poll for the spawned connector establishing a live edge connection, bounded
 * by `timeoutMs` (#593). Resolves true the first time `cloudflared tunnel
 * info` reports ≥1 connector connection; false when the budget elapses with
 * none. The pid existing is NOT proof of connection — the field repro had a
 * live pid crash-looping on a missing creds file, every request returning
 * Cloudflare error 1033. This is the loud verification that turns that silent
 * false-success into an actionable failure.
 *
 * Injectable so tests drive both branches deterministically without a real
 * cloudflared. Production uses `defaultVerifyConnection` (a bounded
 * `tunnelConnectionCount` poll).
 */
export type VerifyConnectionFn = (args: {
  runner: Runner;
  tunnelName: string;
  timeoutMs: number;
  pollMs: number;
  sleep: (ms: number) => Promise<void>;
}) => Promise<boolean>;

/**
 * Production `VerifyConnectionFn`: probe `tunnelConnectionCount` immediately,
 * then keep polling until it reports a connection or the `timeoutMs` budget
 * is spent.
 *
 * - `runner` / `tunnelName` are passed straight to `tunnelConnectionCount`.
 * - `sleep` is awaited between probes, with at most `pollMs` and never more
 *   than the time left in the budget, so the poll cannot overrun `timeoutMs`
 *   by a full interval when `pollMs` does not divide it evenly.
 * - A zero, negative, or NaN budget degrades to a single probe rather than an
 *   unbounded loop.
 *
 * Resolves true as soon as a probe reports > 0 connections, false once the
 * budget is exhausted. The final probe happens after the last (clamped)
 * sleep, i.e. at roughly the deadline.
 */
export const defaultVerifyConnection: VerifyConnectionFn = async ({
  runner,
  tunnelName,
  timeoutMs,
  pollMs,
  sleep,
}) => {
  const deadline = Date.now() + timeoutMs;
  // `tunnelConnectionCount` swallows its own CLI/parse errors → 0, so a
  // not-yet-ready connector just costs another poll. With the production
  // defaults (25_000 / 1_000) that's ~25 probes over ~25s.
  for (;;) {
    if ((await tunnelConnectionCount(runner, tunnelName)) > 0) return true;
    const remainingMs = deadline - Date.now();
    // `!(x > 0)` rather than `x <= 0` so a NaN budget also terminates.
    if (!(remainingMs > 0)) return false;
    // Never sleep past the deadline: the last wait is shortened so the final
    // probe lands at the end of the budget instead of up to `pollMs` after it.
    await sleep(Math.min(pollMs, remainingMs));
  }
};
313
+
270
314
  export interface ExposeCloudflareOpts {
271
315
  runner?: Runner;
272
316
  spawner?: CloudflaredSpawner;
@@ -307,6 +351,23 @@ export interface ExposeCloudflareOpts {
307
351
  * Tests inject a stub; production uses `defaultResolveHost` (Bun DNS).
308
352
  */
309
353
  resolveHost?: ResolveHostFn;
354
+ /**
355
+ * Verify the spawned connector actually established an edge connection
356
+ * before claiming "✓ Cloudflare tunnel up" (#593). Production polls
357
+ * `cloudflared tunnel info` for a live connector (bounded). Tests inject a
358
+ * stub to drive the success / timeout branches without a real cloudflared.
359
+ * Returns true once at least one connection is live, false on timeout.
360
+ * Default policy mirrors `connectorPids`/`resolveHost`: when a test injects a
361
+ * stub `spawner` (and no explicit seam), default to an inert "connected"
362
+ * stub so existing stub-spawner suites don't have to model the probe.
363
+ */
364
+ verifyConnection?: VerifyConnectionFn;
365
+ /** Connection-verify budget in ms (default 25_000). */
366
+ verifyTimeoutMs?: number;
367
+ /** Poll interval for the connection-verify probe in ms (default 1_000). */
368
+ verifyPollMs?: number;
369
+ /** Sleep between connection-verify polls. Tests pin to a no-op. */
370
+ sleep?: (ms: number) => Promise<void>;
310
371
  log?: (line: string) => void;
311
372
  manifestPath?: string;
312
373
  statePath?: string;
@@ -402,6 +463,10 @@ interface Resolved {
402
463
  }) => InstallResult;
403
464
  removeService: (args: { tunnelName: string }) => RemoveResult;
404
465
  resolveHost: ResolveHostFn;
466
+ verifyConnection: VerifyConnectionFn;
467
+ verifyTimeoutMs: number;
468
+ verifyPollMs: number;
469
+ sleep: (ms: number) => Promise<void>;
405
470
  log: (line: string) => void;
406
471
  manifestPath: string;
407
472
  statePath: string;
@@ -488,6 +553,17 @@ function resolve(opts: ExposeCloudflareOpts, tunnelNameDefault: string): Resolve
488
553
  resolveHost:
489
554
  opts.resolveHost ??
490
555
  (opts.spawner === undefined ? defaultResolveHost : async () => ["104.16.0.1"]),
556
+ // Connection-verify seam (#593). Same defaulting policy as
557
+ // `connectorPids`/`resolveHost`: when a test injects a stub `spawner` (and
558
+ // no explicit seam), default to an inert "connected" stub so existing
559
+ // stub-spawner suites don't have to model the `tunnel info` probe.
560
+ // Production (no spawner override) gets the real bounded poll.
561
+ verifyConnection:
562
+ opts.verifyConnection ??
563
+ (opts.spawner === undefined ? defaultVerifyConnection : async () => true),
564
+ verifyTimeoutMs: opts.verifyTimeoutMs ?? 25_000,
565
+ verifyPollMs: opts.verifyPollMs ?? 1_000,
566
+ sleep: opts.sleep ?? ((ms) => new Promise((resolve) => setTimeout(resolve, ms))),
491
567
  log: opts.log ?? ((line) => console.log(line)),
492
568
  manifestPath: opts.manifestPath ?? SERVICES_MANIFEST_PATH,
493
569
  statePath: opts.statePath ?? CLOUDFLARED_STATE_PATH,
@@ -694,7 +770,50 @@ export async function exposeCloudflareUp(
694
770
  " Each machine gets its own dedicated tunnel — you don't need to run `cloudflared tunnel create` separately; expose does it.",
695
771
  );
696
772
  } else {
697
- r.log(`✓ Reusing existing tunnel "${r.tunnelName}" (${tunnel.id})`);
773
+ // Reuse-path credentials verification + self-heal (#593). `findTunnelByName`
774
+ // only proves the tunnel exists ACCOUNT-side. The connector needs the LOCAL
775
+ // credentials file (`~/.cloudflared/<uuid>.json`, written at `tunnel create`
776
+ // time) to authenticate — and that file gets lost on clean-slate flows
777
+ // (`rm -rf ~/.parachute` and friends) while the account-side tunnel
778
+ // survives. The field repro: tunnel reused, "✓ tunnel up" printed, connector
779
+ // crash-looping on "credentials file … doesn't exist", every request → 1033.
780
+ //
781
+ // If the creds file is missing we recreate the tunnel: delete the
782
+ // account-side tunnel by name (`--force`, so a stale registered connector
783
+ // doesn't block it), then `tunnel create` re-writes a fresh creds file. The
784
+ // new tunnel gets a new UUID; `routeDns` below uses `--overwrite-dns`, so the
785
+ // hostname's CNAME is repointed at the new UUID even though it pointed at the
786
+ // old one. The field case confirmed `tunnel delete` + re-run heals cleanly.
787
+ const existingCreds = credentialsPath(tunnel.id, r.cloudflaredHome);
788
+ if (existsSync(existingCreds)) {
789
+ r.log(`✓ Reusing existing tunnel "${r.tunnelName}" (${tunnel.id})`);
790
+ } else {
791
+ r.log(
792
+ `⚠ Tunnel "${r.tunnelName}" (${tunnel.id}) exists in Cloudflare, but its local credentials`,
793
+ );
794
+ r.log(` file is missing (${existingCreds}) — the connector can't authenticate from this`);
795
+ r.log(" machine. Recreating the tunnel so a fresh credentials file is written…");
796
+ try {
797
+ await deleteTunnel(r.runner, r.tunnelName);
798
+ } catch (err) {
799
+ if (err instanceof CloudflaredError) {
800
+ r.log("");
801
+ r.log(`✗ Couldn't delete the stale tunnel automatically: ${err.message}`);
802
+ r.log("");
803
+ r.log("Recover manually, then re-run this command:");
804
+ r.log(` cloudflared tunnel delete --force ${r.tunnelName}`);
805
+ r.log(` parachute expose public --cloudflare --domain ${hostname}`);
806
+ return 1;
807
+ }
808
+ throw err;
809
+ }
810
+ try {
811
+ tunnel = await createTunnel(r.runner, r.tunnelName);
812
+ } catch (err) {
813
+ return reportCloudflaredError(err, r.log);
814
+ }
815
+ r.log(`✓ Recreated tunnel ${tunnel.id} (fresh credentials written).`);
816
+ }
698
817
  }
699
818
 
700
819
  r.log(`Routing DNS: ${hostname} → tunnel ${tunnel.id}…`);
@@ -955,6 +1074,42 @@ export async function exposeCloudflareUp(
955
1074
  }
956
1075
  }
957
1076
 
1077
+ // Post-start connection verification (#593). The connector pid existing is
1078
+ // NOT proof it connected — the field repro had a live pid crash-looping on a
1079
+ // missing creds file, with every public request returning Cloudflare error
1080
+ // 1033 (tunnel registered, no connector) while the CLI printed "✓ tunnel up".
1081
+ // Poll `cloudflared tunnel info` for a live edge connection, bounded. On
1082
+ // timeout, fail LOUDLY with the connector log path + the crash-loop signature
1083
+ // to grep for, instead of claiming success.
1084
+ r.log("");
1085
+ r.log("Verifying the connector established a tunnel connection…");
1086
+ const connected = await r.verifyConnection({
1087
+ runner: r.runner,
1088
+ tunnelName: r.tunnelName,
1089
+ timeoutMs: r.verifyTimeoutMs,
1090
+ pollMs: r.verifyPollMs,
1091
+ sleep: r.sleep,
1092
+ });
1093
+ if (!connected) {
1094
+ r.log("");
1095
+ r.log(
1096
+ `✗ The cloudflared connector (pid ${pid}) started but never registered a tunnel connection`,
1097
+ );
1098
+ r.log(` within ${Math.round(r.verifyTimeoutMs / 1000)}s. Public requests to ${hostname} will`);
1099
+ r.log(" return Cloudflare error 1033 (tunnel registered, no connector) until this resolves.");
1100
+ r.log("");
1101
+ r.log("Check the connector log for the crash-loop cause:");
1102
+ r.log(` tail -n 50 ${r.logPath}`);
1103
+ r.log(' A repeating "credentials file … doesn\'t exist" line means the local credentials are');
1104
+ r.log(
1105
+ " gone — re-run this command (it auto-recreates the tunnel). Other repeating errors point",
1106
+ );
1107
+ r.log(" at the specific failure. Confirm the connector once it's healthy with:");
1108
+ r.log(` cloudflared tunnel info ${r.tunnelName}`);
1109
+ return 1;
1110
+ }
1111
+ r.log("✓ Connector connected.");
1112
+
958
1113
  const baseUrl = `https://${hostname}`;
959
1114
  let vaultUrl: string | undefined;
960
1115
  if (vaultEntry) {
@@ -34,6 +34,7 @@ import { generateBootstrapToken } from "../bootstrap-token.ts";
34
34
  // path isolation.
35
35
  import { CONFIG_DIR, SERVICES_MANIFEST_PATH } from "../config.ts";
36
36
  import { readExposeState } from "../expose-state.ts";
37
+ import { createDbHolder } from "../hub-db-liveness.ts";
37
38
  import { hubDbPath, openHubDb } from "../hub-db.ts";
38
39
  import { hubFetch } from "../hub-server.ts";
39
40
  import { writeHubFile } from "../hub.ts";
@@ -345,8 +346,16 @@ export async function serve(opts: ServeOpts = {}): Promise<{
345
346
  if (!existsSync(hubHtmlPath)) writeHubFile(hubHtmlPath);
346
347
 
347
348
  const dbPath = hubDbPath();
348
- const db = openHubDb(dbPath);
349
- const adminBootstrap = await seedInitialAdminIfNeeded(db, env, log);
349
+ // Self-heal-or-die DB holder (#594). The handle lives behind a mutable
350
+ // holder so a request that hits the persistent-corruption class (disk I/O
351
+ // error / malformed image — e.g. the state dir deleted under a running hub)
352
+ // can reopen the handle once, or exit(1) for the platform manager to restart
353
+ // us with a fresh one. `getDb` reads the current handle from the holder.
354
+ const dbHolder = createDbHolder(openHubDb(dbPath), {
355
+ reopen: () => openHubDb(dbPath),
356
+ log,
357
+ });
358
+ const adminBootstrap = await seedInitialAdminIfNeeded(dbHolder.get(), env, log);
350
359
 
351
360
  if (adminBootstrap === "needs-setup") {
352
361
  log(
@@ -381,7 +390,8 @@ export async function serve(opts: ServeOpts = {}): Promise<{
381
390
  // CMD), so the fix has to land here too. Closes hub#399.
382
391
  idleTimeout: 255,
383
392
  fetch: hubFetch(WELL_KNOWN_DIR, {
384
- getDb: () => db,
393
+ getDb: () => dbHolder.get(),
394
+ onDbError: (err) => dbHolder.healOrExit(err),
385
395
  issuer,
386
396
  loopbackPort: port,
387
397
  supervisor,
@@ -468,7 +478,7 @@ export async function serve(opts: ServeOpts = {}): Promise<{
468
478
  await supervisor.stop(state.short);
469
479
  }
470
480
  await server.stop();
471
- db.close();
481
+ dbHolder.get().close();
472
482
  },
473
483
  };
474
484
  }