@openparachute/hub 0.6.4 → 0.6.5-rc.1
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/package.json +1 -1
- package/src/__tests__/cloudflare-tunnel.test.ts +78 -0
- package/src/__tests__/expose-cloudflare.test.ts +253 -0
- package/src/__tests__/hub-db-liveness.test.ts +139 -0
- package/src/__tests__/hub-server.test.ts +145 -6
- package/src/__tests__/hub-unit.test.ts +110 -1
- package/src/cloudflare/tunnel.ts +70 -0
- package/src/commands/expose-cloudflare.ts +157 -2
- package/src/commands/serve.ts +14 -4
- package/src/hub-db-liveness.ts +211 -0
- package/src/hub-server.ts +1175 -1104
- package/src/hub-unit.ts +74 -27
|
@@ -5,6 +5,7 @@ import { join } from "node:path";
|
|
|
5
5
|
import { fileURLToPath } from "node:url";
|
|
6
6
|
import { buildCsrfCookie, generateCsrfToken } from "../csrf.ts";
|
|
7
7
|
import { HUB_SVC, hubPortPath } from "../hub-control.ts";
|
|
8
|
+
import { createDbHolder } from "../hub-db-liveness.ts";
|
|
8
9
|
import { hubDbPath, openHubDb } from "../hub-db.ts";
|
|
9
10
|
import {
|
|
10
11
|
findServiceUpstream,
|
|
@@ -1240,14 +1241,10 @@ describe("hubFetch routing", () => {
|
|
|
1240
1241
|
// open shouldn't cascade into a restart loop. The body advertises the
|
|
1241
1242
|
// running version so a deploy verifier can confirm the rolled-out
|
|
1242
1243
|
// image is the one it expected.
|
|
1243
|
-
test("/health returns 200 JSON
|
|
1244
|
+
test("/health returns 200 JSON; unconfigured db field when no getDb", async () => {
|
|
1244
1245
|
const h = makeHarness();
|
|
1245
1246
|
try {
|
|
1246
|
-
const res = await hubFetch(h.dir
|
|
1247
|
-
getDb: () => {
|
|
1248
|
-
throw new Error("getDb must not be called by /health");
|
|
1249
|
-
},
|
|
1250
|
-
})(req("/health"));
|
|
1247
|
+
const res = await hubFetch(h.dir)(req("/health"));
|
|
1251
1248
|
expect(res.status).toBe(200);
|
|
1252
1249
|
expect(res.headers.get("content-type")).toContain("application/json");
|
|
1253
1250
|
expect(res.headers.get("cache-control")).toBe("no-store");
|
|
@@ -1255,11 +1252,153 @@ describe("hubFetch routing", () => {
|
|
|
1255
1252
|
expect(body.status).toBe("ok");
|
|
1256
1253
|
expect(body.service).toBe("parachute-hub");
|
|
1257
1254
|
expect(typeof body.version).toBe("string");
|
|
1255
|
+
expect(body.db).toBe("unconfigured");
|
|
1258
1256
|
} finally {
|
|
1259
1257
|
h.cleanup();
|
|
1260
1258
|
}
|
|
1261
1259
|
});
|
|
1262
1260
|
|
|
1261
|
+
test("/health db field is ok on a live db (#594)", async () => {
  // A harness directory plus an open hub database handle that getDb hands out.
  const harness = makeHarness();
  const liveDb = openHubDb(hubDbPath(harness.dir));
  try {
    const handler = hubFetch(harness.dir, { getDb: () => liveDb });
    const response = await handler(req("/health"));
    expect(response.status).toBe(200);

    const payload = (await response.json()) as Record<string, unknown>;
    expect(payload.status).toBe("ok");
    expect(payload.db).toBe("ok");
  } finally {
    // Close the handle before the harness is cleaned up.
    liveDb.close();
    harness.cleanup();
  }
});
|
|
1275
|
+
|
|
1276
|
+
test("/health db field reports error: <class> on a dead handle, still 200 (#594)", async () => {
  const harness = makeHarness();
  const deadDb = openHubDb(hubDbPath(harness.dir));
  // Closed up front: the handler is handed an already-dead handle.
  deadDb.close();
  try {
    const handler = hubFetch(harness.dir, { getDb: () => deadDb });
    const response = await handler(req("/health"));
    // The HTTP status is the process-liveness signal and stays 200;
    // readiness is carried by the `db` field of the body.
    expect(response.status).toBe(200);

    const payload = (await response.json()) as Record<string, unknown>;
    expect(payload.status).toBe("ok");
    expect(typeof payload.db).toBe("string");
    expect((payload.db as string).startsWith("error:")).toBe(true);
  } finally {
    harness.cleanup();
  }
});
|
|
1293
|
+
|
|
1294
|
+
test("a fatal DB throw escaping a handler returns a structured body + invokes onDbError (#594)", async () => {
  const harness = makeHarness();
  try {
    // An Error carrying the name/code of a SQLite disk I/O failure.
    const ioFailure = Object.assign(new Error("disk I/O error"), {
      name: "SQLiteError",
      code: "SQLITE_IOERR",
    });
    let healCalls = 0;

    // `/` is a non-/health route that reaches getDb; getDb throws the error
    // above, so the request exercises the onDbError hook and the structured
    // db_unavailable response instead of a bare failure.
    const handler = hubFetch(harness.dir, {
      manifestPath: harness.manifestPath,
      getDb: () => {
        throw ioFailure;
      },
      onDbError: () => {
        healCalls += 1;
        return "healed";
      },
    });
    const response = await handler(req("/", { headers: { accept: "text/html" } }));

    expect(healCalls).toBe(1);
    expect(response.status).toBe(503);
    const payload = (await response.json()) as Record<string, unknown>;
    expect(payload.error).toBe("db_unavailable");
    expect(typeof payload.error_description).toBe("string");
    expect(payload.error_description as string).toContain("reopened");
  } finally {
    harness.cleanup();
  }
});
|
|
1325
|
+
|
|
1326
|
+
test("a non-DB throw still propagates (not swallowed by the self-heal wrapper) (#594)", async () => {
  const harness = makeHarness();
  try {
    let healCalls = 0;
    const handler = hubFetch(harness.dir, {
      manifestPath: harness.manifestPath,
      // A plain Error with no SQLite name/code.
      getDb: () => {
        throw new Error("some unrelated programming error");
      },
      onDbError: () => {
        healCalls += 1;
        return "ignored";
      },
    });

    // The request must reject with the original error...
    const pending = handler(req("/", { headers: { accept: "text/html" } }));
    await expect(pending).rejects.toThrow("some unrelated programming error");
    // ...and the DB-error hook must never have been consulted.
    expect(healCalls).toBe(0);
  } finally {
    harness.cleanup();
  }
});
|
|
1348
|
+
|
|
1349
|
+
test("a transient SQLITE_BUSY escaping a handler → 503, does NOT exit the hub (#594)", async () => {
  const harness = makeHarness();
  const liveDb = openHubDb(hubDbPath(harness.dir));
  try {
    // Uses a real DbHolder so the whole transient path is exercised: the
    // thrown error is routed to holder.healOrExit via onDbError. The `reopen`
    // and `exit` callbacks only record that they ran; the assertions at the
    // end require that neither did.
    let exitCalled = false;
    let reopenCalled = false;
    const holder = createDbHolder(liveDb, {
      reopen: () => {
        reopenCalled = true;
        return liveDb;
      },
      exit: () => {
        exitCalled = true;
      },
      log: () => {},
    });

    const busy = Object.assign(new Error("database is locked"), {
      name: "SQLiteError",
      code: "SQLITE_BUSY",
    });

    // Only the very first getDb() call throws BUSY; later calls read the
    // holder's current handle.
    let busyPending = true;
    const handler = hubFetch(harness.dir, {
      manifestPath: harness.manifestPath,
      getDb: () => {
        if (!busyPending) return holder.get();
        busyPending = false;
        throw busy;
      },
      onDbError: (err) => holder.healOrExit(err),
    });
    const response = await handler(req("/", { headers: { accept: "text/html" } }));

    expect(response.status).toBe(503);
    const payload = (await response.json()) as Record<string, unknown>;
    expect(payload.error).toBe("db_unavailable");
    // The description names the transient class rather than a reopen.
    expect(payload.error_description as string).toContain("transient");
    // Core guarantee: no exit and no reopen on a momentary lock.
    expect(exitCalled).toBe(false);
    expect(reopenCalled).toBe(false);
  } finally {
    liveDb.close();
    harness.cleanup();
  }
});
|
|
1401
|
+
|
|
1263
1402
|
// First-boot setup wizard (hub#259, expanding hub#258's static
|
|
1264
1403
|
// placeholder). When no admin exists, GET /admin/setup renders the
|
|
1265
1404
|
// wizard's account-step form. Once admin + vault both exist, it 301s
|
|
@@ -44,7 +44,7 @@ function fakeDeps(
|
|
|
44
44
|
* `null` (hub not answering) or `{ ok, version }`. Drives
|
|
45
45
|
* `probeHealthVersion` across the version-check + post-restart re-probe.
|
|
46
46
|
*/
|
|
47
|
-
healthVersionSeq?: ({ ok: boolean; version?: string } | null)[];
|
|
47
|
+
healthVersionSeq?: ({ ok: boolean; version?: string; db?: string } | null)[];
|
|
48
48
|
listeningSeq?: boolean[];
|
|
49
49
|
installedUnit?: boolean;
|
|
50
50
|
} = {},
|
|
@@ -756,4 +756,113 @@ describe("ensureHubVersionMatches — version-check-and-restart at adoption (#59
|
|
|
756
756
|
expect(res.outcome).toBe("restart-failed");
|
|
757
757
|
expect(res.messages.join("\n")).toContain("Unit parachute-hub.service not found.");
|
|
758
758
|
});
|
|
759
|
+
|
|
760
|
+
// #594: a hub whose VERSION matches but whose /health reports a db fault
|
|
761
|
+
// (dead handle — state dir deleted under it) must be treated as needing a
|
|
762
|
+
// restart, through the same restart-once machinery.
|
|
763
|
+
test("version matches but /health reports db fault → restart-once → restarted when db heals", async () => {
  // Probe sequence: the first probe reports the installed version with a db
  // fault, the second (post-restart) probe reports a healthy db.
  const fake = fakeDeps({
    platform: "darwin",
    getuid: () => 501,
    installedUnit: true,
    healthVersionSeq: [
      { ok: true, version: INSTALLED, db: "error: fatal" },
      { ok: true, version: INSTALLED, db: "ok" },
    ],
  });

  const result = await ensureHubVersionMatches({
    installedVersion: INSTALLED,
    port: 1939,
    deps: fake.deps,
    readyPollMs: 0,
  });

  expect(result.outcome).toBe("restarted");
  // Exactly one recorded call mentions "kickstart".
  const kickstarts = fake.calls.filter((call) => call.includes("kickstart"));
  expect(kickstarts).toHaveLength(1);
});
|
|
785
|
+
|
|
786
|
+
// #594: a SUSTAINED transient fault visible in /health (e.g. a write lock
|
|
787
|
+
// that never clears) is still an "error:" verdict, so the adoption probe
|
|
788
|
+
// treats it as needing a restart — same as the fatal case. Pins that
|
|
789
|
+
// `healthReportsDbFault` keys on the "error:" prefix, not the fatal class.
|
|
790
|
+
test("version matches but /health reports db error: transient → restart-once (#594)", async () => {
  // Probe sequence: matching version with a transient db error first, then a
  // healthy db on the re-probe.
  const fake = fakeDeps({
    platform: "darwin",
    getuid: () => 501,
    installedUnit: true,
    healthVersionSeq: [
      { ok: true, version: INSTALLED, db: "error: transient" },
      { ok: true, version: INSTALLED, db: "ok" },
    ],
  });

  const result = await ensureHubVersionMatches({
    installedVersion: INSTALLED,
    port: 1939,
    deps: fake.deps,
    readyPollMs: 0,
  });

  expect(result.outcome).toBe("restarted");
  // Exactly one recorded call mentions "kickstart".
  const kickstarts = fake.calls.filter((call) => call.includes("kickstart"));
  expect(kickstarts).toHaveLength(1);
});
|
|
812
|
+
|
|
813
|
+
test("version + db both ok → match, NO restart (#594 doesn't fire on a healthy hub)", async () => {
  // Single probe: installed version, db reported as ok.
  const fake = fakeDeps({
    platform: "darwin",
    getuid: () => 501,
    installedUnit: true,
    healthVersionSeq: [{ ok: true, version: INSTALLED, db: "ok" }],
  });

  const result = await ensureHubVersionMatches({
    installedVersion: INSTALLED,
    port: 1939,
    deps: fake.deps,
    readyPollMs: 0,
  });

  expect(result.outcome).toBe("match");
  // No calls at all may have been recorded.
  expect(fake.calls).toEqual([]);
});
|
|
829
|
+
|
|
830
|
+
test("db fault persists after the restart → still-mismatched with a db-specific message (#594)", async () => {
  // The only probe entry reports a db fault; the test expects a single
  // restart attempt followed by a settled "still-mismatched" outcome.
  const fake = fakeDeps({
    platform: "darwin",
    getuid: () => 501,
    installedUnit: true,
    healthVersionSeq: [{ ok: true, version: INSTALLED, db: "error: fatal" }],
  });

  const result = await ensureHubVersionMatches({
    installedVersion: INSTALLED,
    port: 1939,
    deps: fake.deps,
    readyTimeoutMs: 0,
    readyPollMs: 0,
  });

  expect(result.outcome).toBe("still-mismatched");
  const kickstarts = fake.calls.filter((call) => call.includes("kickstart"));
  expect(kickstarts).toHaveLength(1);
  // The reported messages must call out the database fault specifically.
  const transcript = result.messages.join("\n");
  expect(transcript).toContain("database still reports a fault");
});
|
|
851
|
+
|
|
852
|
+
test("a hub with NO db field (pre-#594) on a version match → match, not treated as a fault", async () => {
  // The probe entry deliberately omits `db`.
  const fake = fakeDeps({
    platform: "darwin",
    getuid: () => 501,
    installedUnit: true,
    healthVersionSeq: [{ ok: true, version: INSTALLED /* no db field */ }],
  });

  const result = await ensureHubVersionMatches({
    installedVersion: INSTALLED,
    port: 1939,
    deps: fake.deps,
    readyPollMs: 0,
  });

  expect(result.outcome).toBe("match");
  // A missing field must not trigger any recorded call.
  expect(fake.calls).toEqual([]);
});
|
|
759
868
|
});
|
package/src/cloudflare/tunnel.ts
CHANGED
|
@@ -133,3 +133,73 @@ export async function routeDns(
|
|
|
133
133
|
export function credentialsPath(uuid: string, cloudflaredHome: string): string {
|
|
134
134
|
return join(cloudflaredHome, `${uuid}.json`);
|
|
135
135
|
}
|
|
136
|
+
|
|
137
|
+
/**
 * Remove the account-side tunnel by running
 * `cloudflared tunnel delete --force <name>`.
 *
 * Part of the reuse-path self-heal (#593): a tunnel that still exists in the
 * account but whose local credentials file is gone cannot be used from this
 * machine, so it is deleted and then recreated, letting `tunnel create` write
 * a fresh `~/.cloudflared/<uuid>.json`.
 *
 * `--force` keeps the delete non-interactive and lets it go through even when
 * a stale connector is still registered account-side; without the flag
 * `tunnel delete` refuses with "tunnel has active connections", which is the
 * crash-loop state #593 recovers from.
 *
 * @param runner Executes the command and resolves with its result.
 * @param name   Name of the tunnel to delete.
 * @throws CloudflaredError when the command exits with a non-zero code; the
 *         error carries the command and its result.
 */
export async function deleteTunnel(runner: Runner, name: string): Promise<void> {
  const argv = ["cloudflared", "tunnel", "delete", "--force", name];
  const outcome = await runner(argv);
  if (outcome.code === 0) return;

  // Non-zero exit: surface the combined error output in the message.
  throw new CloudflaredError(
    `cloudflared tunnel delete failed: ${combineErrStreams(outcome)}`,
    argv,
    outcome,
  );
}
|
|
162
|
+
|
|
163
|
+
/**
 * Report how many connector entries `cloudflared tunnel info --output json
 * <name>` lists for a tunnel.
 *
 * Used by the post-start connection verification (#593): a spawned connector
 * pid being alive does not prove the connector registered an edge connection
 * (the error-1033 repro had a live pid crash-looping on a missing credentials
 * file).
 *
 * The entries are read from a top-level `conns` array, falling back to a
 * top-level `connections` array when `conns` is null or absent. Every failure
 * mode (the runner throwing, a non-zero exit code, unparseable output, a
 * non-object payload, no array under either key) yields `0`, meaning "not
 * connected yet". Callers poll, so a transient miss only costs one more poll.
 *
 * @param runner Executes the command and resolves with its result.
 * @param name   Name of the tunnel to inspect.
 * @returns The number of entries found; `> 0` means at least one is listed.
 */
export async function tunnelConnectionCount(runner: Runner, name: string): Promise<number> {
  const argv = ["cloudflared", "tunnel", "info", "--output", "json", name];

  let outcome: CommandResult;
  try {
    outcome = await runner(argv);
  } catch {
    return 0;
  }
  if (outcome.code !== 0) return 0;

  let payload: unknown;
  try {
    payload = JSON.parse(outcome.stdout);
  } catch {
    return 0;
  }
  if (payload === null || typeof payload !== "object") return 0;

  // Prefer `conns`; fall back to the flat `connections` key. An entry is
  // counted simply because it exists, even if it nests its own `conns` detail.
  const record = payload as Record<string, unknown>;
  const entries = record.conns ?? record.connections;
  return Array.isArray(entries) ? entries.length : 0;
}
|
|
@@ -1,5 +1,5 @@
|
|
|
1
1
|
import { spawnSync } from "node:child_process";
|
|
2
|
-
import { mkdirSync, openSync } from "node:fs";
|
|
2
|
+
import { existsSync, mkdirSync, openSync } from "node:fs";
|
|
3
3
|
import { dirname } from "node:path";
|
|
4
4
|
import {
|
|
5
5
|
DEFAULT_TUNNEL_NAME,
|
|
@@ -37,8 +37,10 @@ import {
|
|
|
37
37
|
type Tunnel,
|
|
38
38
|
createTunnel,
|
|
39
39
|
credentialsPath,
|
|
40
|
+
deleteTunnel,
|
|
40
41
|
findTunnelByName,
|
|
41
42
|
routeDns,
|
|
43
|
+
tunnelConnectionCount,
|
|
42
44
|
} from "../cloudflare/tunnel.ts";
|
|
43
45
|
import { CONFIG_DIR, SERVICES_MANIFEST_PATH } from "../config.ts";
|
|
44
46
|
import {
|
|
@@ -267,6 +269,48 @@ export function looksLikeCloudflare(addresses: readonly string[]): boolean {
|
|
|
267
269
|
return false;
|
|
268
270
|
}
|
|
269
271
|
|
|
272
|
+
/**
 * Contract for the post-start connection check (#593): wait, for at most
 * `timeoutMs`, until the spawned connector has a live edge connection.
 *
 * Resolves `true` as soon as `cloudflared tunnel info` reports at least one
 * connector connection, and `false` if the budget runs out first. A live pid
 * is not evidence of a connection: the field repro had a running connector
 * crash-looping on a missing credentials file while every request returned
 * Cloudflare error 1033. This check turns that silent false success into an
 * actionable failure.
 *
 * The function is injectable so tests can drive both outcomes without a real
 * cloudflared; production uses `defaultVerifyConnection`, a bounded
 * `tunnelConnectionCount` poll.
 */
export type VerifyConnectionFn = (args: {
  /** Command runner used to invoke cloudflared. */
  runner: Runner;
  /** Name of the tunnel whose connector is being checked. */
  tunnelName: string;
  /** Total time budget for the check, in milliseconds. */
  timeoutMs: number;
  /** Delay between probes, in milliseconds. */
  pollMs: number;
  /** Sleep primitive used between probes. */
  sleep: (ms: number) => Promise<void>;
}) => Promise<boolean>;
|
|
292
|
+
|
|
293
|
+
/**
 * Production connection check: poll `tunnelConnectionCount` until it reports
 * at least one connection or the time budget is spent.
 *
 * The first probe happens immediately. `tunnelConnectionCount` turns its own
 * CLI/parse failures into `0`, so a connector that is not ready yet simply
 * costs another poll. Each iteration is one probe plus one sleep; with the
 * production defaults (25_000 ms budget, 1_000 ms poll) that is roughly 25
 * probes over roughly 25 seconds before `false` is returned.
 */
export const defaultVerifyConnection: VerifyConnectionFn = async (args) => {
  const { runner, tunnelName, timeoutMs, pollMs, sleep } = args;
  const giveUpAt = Date.now() + timeoutMs;

  while (true) {
    const liveConnections = await tunnelConnectionCount(runner, tunnelName);
    if (liveConnections > 0) return true;
    // The deadline is checked after the probe, so at least one probe always runs.
    if (Date.now() >= giveUpAt) return false;
    await sleep(pollMs);
  }
};
|
|
313
|
+
|
|
270
314
|
export interface ExposeCloudflareOpts {
|
|
271
315
|
runner?: Runner;
|
|
272
316
|
spawner?: CloudflaredSpawner;
|
|
@@ -307,6 +351,23 @@ export interface ExposeCloudflareOpts {
|
|
|
307
351
|
* Tests inject a stub; production uses `defaultResolveHost` (Bun DNS).
|
|
308
352
|
*/
|
|
309
353
|
resolveHost?: ResolveHostFn;
|
|
354
|
+
/**
|
|
355
|
+
* Verify the spawned connector actually established an edge connection
|
|
356
|
+
* before claiming "✓ Cloudflare tunnel up" (#593). Production polls
|
|
357
|
+
* `cloudflared tunnel info` for a live connector (bounded). Tests inject a
|
|
358
|
+
* stub to drive the success / timeout branches without a real cloudflared.
|
|
359
|
+
* Returns true once at least one connection is live, false on timeout.
|
|
360
|
+
* Default policy mirrors `connectorPids`/`resolveHost`: when a test injects a
|
|
361
|
+
* stub `spawner` (and no explicit seam), default to an inert "connected"
|
|
362
|
+
* stub so existing stub-spawner suites don't have to model the probe.
|
|
363
|
+
*/
|
|
364
|
+
verifyConnection?: VerifyConnectionFn;
|
|
365
|
+
/** Connection-verify budget in ms (default 25_000). */
|
|
366
|
+
verifyTimeoutMs?: number;
|
|
367
|
+
/** Poll interval for the connection-verify probe in ms (default 1_000). */
|
|
368
|
+
verifyPollMs?: number;
|
|
369
|
+
/** Sleep between connection-verify polls. Tests pin to a no-op. */
|
|
370
|
+
sleep?: (ms: number) => Promise<void>;
|
|
310
371
|
log?: (line: string) => void;
|
|
311
372
|
manifestPath?: string;
|
|
312
373
|
statePath?: string;
|
|
@@ -402,6 +463,10 @@ interface Resolved {
|
|
|
402
463
|
}) => InstallResult;
|
|
403
464
|
removeService: (args: { tunnelName: string }) => RemoveResult;
|
|
404
465
|
resolveHost: ResolveHostFn;
|
|
466
|
+
verifyConnection: VerifyConnectionFn;
|
|
467
|
+
verifyTimeoutMs: number;
|
|
468
|
+
verifyPollMs: number;
|
|
469
|
+
sleep: (ms: number) => Promise<void>;
|
|
405
470
|
log: (line: string) => void;
|
|
406
471
|
manifestPath: string;
|
|
407
472
|
statePath: string;
|
|
@@ -488,6 +553,17 @@ function resolve(opts: ExposeCloudflareOpts, tunnelNameDefault: string): Resolve
|
|
|
488
553
|
resolveHost:
|
|
489
554
|
opts.resolveHost ??
|
|
490
555
|
(opts.spawner === undefined ? defaultResolveHost : async () => ["104.16.0.1"]),
|
|
556
|
+
// Connection-verify seam (#593). Same defaulting policy as
|
|
557
|
+
// `connectorPids`/`resolveHost`: when a test injects a stub `spawner` (and
|
|
558
|
+
// no explicit seam), default to an inert "connected" stub so existing
|
|
559
|
+
// stub-spawner suites don't have to model the `tunnel info` probe.
|
|
560
|
+
// Production (no spawner override) gets the real bounded poll.
|
|
561
|
+
verifyConnection:
|
|
562
|
+
opts.verifyConnection ??
|
|
563
|
+
(opts.spawner === undefined ? defaultVerifyConnection : async () => true),
|
|
564
|
+
verifyTimeoutMs: opts.verifyTimeoutMs ?? 25_000,
|
|
565
|
+
verifyPollMs: opts.verifyPollMs ?? 1_000,
|
|
566
|
+
sleep: opts.sleep ?? ((ms) => new Promise((resolve) => setTimeout(resolve, ms))),
|
|
491
567
|
log: opts.log ?? ((line) => console.log(line)),
|
|
492
568
|
manifestPath: opts.manifestPath ?? SERVICES_MANIFEST_PATH,
|
|
493
569
|
statePath: opts.statePath ?? CLOUDFLARED_STATE_PATH,
|
|
@@ -694,7 +770,50 @@ export async function exposeCloudflareUp(
|
|
|
694
770
|
" Each machine gets its own dedicated tunnel — you don't need to run `cloudflared tunnel create` separately; expose does it.",
|
|
695
771
|
);
|
|
696
772
|
} else {
|
|
697
|
-
|
|
773
|
+
// Reuse-path credentials verification + self-heal (#593). `findTunnelByName`
|
|
774
|
+
// only proves the tunnel exists ACCOUNT-side. The connector needs the LOCAL
|
|
775
|
+
// credentials file (`~/.cloudflared/<uuid>.json`, written at `tunnel create`
|
|
776
|
+
// time) to authenticate — and that file gets lost on clean-slate flows
|
|
777
|
+
// (`rm -rf ~/.parachute` and friends) while the account-side tunnel
|
|
778
|
+
// survives. The field repro: tunnel reused, "✓ tunnel up" printed, connector
|
|
779
|
+
// crash-looping on "credentials file … doesn't exist", every request → 1033.
|
|
780
|
+
//
|
|
781
|
+
// If the creds file is missing we recreate the tunnel: delete the
|
|
782
|
+
// account-side tunnel by name (`--force`, so a stale registered connector
|
|
783
|
+
// doesn't block it), then `tunnel create` re-writes a fresh creds file. The
|
|
784
|
+
// new tunnel gets a new UUID; `routeDns` below uses `--overwrite-dns`, so the
|
|
785
|
+
// hostname's CNAME is repointed at the new UUID even though it pointed at the
|
|
786
|
+
// old one. The field case confirmed `tunnel delete` + re-run heals cleanly.
|
|
787
|
+
const existingCreds = credentialsPath(tunnel.id, r.cloudflaredHome);
|
|
788
|
+
if (existsSync(existingCreds)) {
|
|
789
|
+
r.log(`✓ Reusing existing tunnel "${r.tunnelName}" (${tunnel.id})`);
|
|
790
|
+
} else {
|
|
791
|
+
r.log(
|
|
792
|
+
`⚠ Tunnel "${r.tunnelName}" (${tunnel.id}) exists in Cloudflare, but its local credentials`,
|
|
793
|
+
);
|
|
794
|
+
r.log(` file is missing (${existingCreds}) — the connector can't authenticate from this`);
|
|
795
|
+
r.log(" machine. Recreating the tunnel so a fresh credentials file is written…");
|
|
796
|
+
try {
|
|
797
|
+
await deleteTunnel(r.runner, r.tunnelName);
|
|
798
|
+
} catch (err) {
|
|
799
|
+
if (err instanceof CloudflaredError) {
|
|
800
|
+
r.log("");
|
|
801
|
+
r.log(`✗ Couldn't delete the stale tunnel automatically: ${err.message}`);
|
|
802
|
+
r.log("");
|
|
803
|
+
r.log("Recover manually, then re-run this command:");
|
|
804
|
+
r.log(` cloudflared tunnel delete --force ${r.tunnelName}`);
|
|
805
|
+
r.log(` parachute expose public --cloudflare --domain ${hostname}`);
|
|
806
|
+
return 1;
|
|
807
|
+
}
|
|
808
|
+
throw err;
|
|
809
|
+
}
|
|
810
|
+
try {
|
|
811
|
+
tunnel = await createTunnel(r.runner, r.tunnelName);
|
|
812
|
+
} catch (err) {
|
|
813
|
+
return reportCloudflaredError(err, r.log);
|
|
814
|
+
}
|
|
815
|
+
r.log(`✓ Recreated tunnel ${tunnel.id} (fresh credentials written).`);
|
|
816
|
+
}
|
|
698
817
|
}
|
|
699
818
|
|
|
700
819
|
r.log(`Routing DNS: ${hostname} → tunnel ${tunnel.id}…`);
|
|
@@ -955,6 +1074,42 @@ export async function exposeCloudflareUp(
|
|
|
955
1074
|
}
|
|
956
1075
|
}
|
|
957
1076
|
|
|
1077
|
+
// Post-start connection verification (#593). The connector pid existing is
|
|
1078
|
+
// NOT proof it connected — the field repro had a live pid crash-looping on a
|
|
1079
|
+
// missing creds file, with every public request returning Cloudflare error
|
|
1080
|
+
// 1033 (tunnel registered, no connector) while the CLI printed "✓ tunnel up".
|
|
1081
|
+
// Poll `cloudflared tunnel info` for a live edge connection, bounded. On
|
|
1082
|
+
// timeout, fail LOUDLY with the connector log path + the crash-loop signature
|
|
1083
|
+
// to grep for, instead of claiming success.
|
|
1084
|
+
r.log("");
|
|
1085
|
+
r.log("Verifying the connector established a tunnel connection…");
|
|
1086
|
+
const connected = await r.verifyConnection({
|
|
1087
|
+
runner: r.runner,
|
|
1088
|
+
tunnelName: r.tunnelName,
|
|
1089
|
+
timeoutMs: r.verifyTimeoutMs,
|
|
1090
|
+
pollMs: r.verifyPollMs,
|
|
1091
|
+
sleep: r.sleep,
|
|
1092
|
+
});
|
|
1093
|
+
if (!connected) {
|
|
1094
|
+
r.log("");
|
|
1095
|
+
r.log(
|
|
1096
|
+
`✗ The cloudflared connector (pid ${pid}) started but never registered a tunnel connection`,
|
|
1097
|
+
);
|
|
1098
|
+
r.log(` within ${Math.round(r.verifyTimeoutMs / 1000)}s. Public requests to ${hostname} will`);
|
|
1099
|
+
r.log(" return Cloudflare error 1033 (tunnel registered, no connector) until this resolves.");
|
|
1100
|
+
r.log("");
|
|
1101
|
+
r.log("Check the connector log for the crash-loop cause:");
|
|
1102
|
+
r.log(` tail -n 50 ${r.logPath}`);
|
|
1103
|
+
r.log(' A repeating "credentials file … doesn\'t exist" line means the local credentials are');
|
|
1104
|
+
r.log(
|
|
1105
|
+
" gone — re-run this command (it auto-recreates the tunnel). Other repeating errors point",
|
|
1106
|
+
);
|
|
1107
|
+
r.log(" at the specific failure. Confirm the connector once it's healthy with:");
|
|
1108
|
+
r.log(` cloudflared tunnel info ${r.tunnelName}`);
|
|
1109
|
+
return 1;
|
|
1110
|
+
}
|
|
1111
|
+
r.log("✓ Connector connected.");
|
|
1112
|
+
|
|
958
1113
|
const baseUrl = `https://${hostname}`;
|
|
959
1114
|
let vaultUrl: string | undefined;
|
|
960
1115
|
if (vaultEntry) {
|
package/src/commands/serve.ts
CHANGED
|
@@ -34,6 +34,7 @@ import { generateBootstrapToken } from "../bootstrap-token.ts";
|
|
|
34
34
|
// path isolation.
|
|
35
35
|
import { CONFIG_DIR, SERVICES_MANIFEST_PATH } from "../config.ts";
|
|
36
36
|
import { readExposeState } from "../expose-state.ts";
|
|
37
|
+
import { createDbHolder } from "../hub-db-liveness.ts";
|
|
37
38
|
import { hubDbPath, openHubDb } from "../hub-db.ts";
|
|
38
39
|
import { hubFetch } from "../hub-server.ts";
|
|
39
40
|
import { writeHubFile } from "../hub.ts";
|
|
@@ -345,8 +346,16 @@ export async function serve(opts: ServeOpts = {}): Promise<{
|
|
|
345
346
|
if (!existsSync(hubHtmlPath)) writeHubFile(hubHtmlPath);
|
|
346
347
|
|
|
347
348
|
const dbPath = hubDbPath();
|
|
348
|
-
|
|
349
|
-
|
|
349
|
+
// Self-heal-or-die DB holder (#594). The handle lives behind a mutable
|
|
350
|
+
// holder so a request that hits the persistent-corruption class (disk I/O
|
|
351
|
+
// error / malformed image — e.g. the state dir deleted under a running hub)
|
|
352
|
+
// can reopen the handle once, or exit(1) for the platform manager to restart
|
|
353
|
+
// us with a fresh one. `getDb` reads the current handle from the holder.
|
|
354
|
+
const dbHolder = createDbHolder(openHubDb(dbPath), {
|
|
355
|
+
reopen: () => openHubDb(dbPath),
|
|
356
|
+
log,
|
|
357
|
+
});
|
|
358
|
+
const adminBootstrap = await seedInitialAdminIfNeeded(dbHolder.get(), env, log);
|
|
350
359
|
|
|
351
360
|
if (adminBootstrap === "needs-setup") {
|
|
352
361
|
log(
|
|
@@ -381,7 +390,8 @@ export async function serve(opts: ServeOpts = {}): Promise<{
|
|
|
381
390
|
// CMD), so the fix has to land here too. Closes hub#399.
|
|
382
391
|
idleTimeout: 255,
|
|
383
392
|
fetch: hubFetch(WELL_KNOWN_DIR, {
|
|
384
|
-
getDb: () =>
|
|
393
|
+
getDb: () => dbHolder.get(),
|
|
394
|
+
onDbError: (err) => dbHolder.healOrExit(err),
|
|
385
395
|
issuer,
|
|
386
396
|
loopbackPort: port,
|
|
387
397
|
supervisor,
|
|
@@ -468,7 +478,7 @@ export async function serve(opts: ServeOpts = {}): Promise<{
|
|
|
468
478
|
await supervisor.stop(state.short);
|
|
469
479
|
}
|
|
470
480
|
await server.stop();
|
|
471
|
-
|
|
481
|
+
dbHolder.get().close();
|
|
472
482
|
},
|
|
473
483
|
};
|
|
474
484
|
}
|