@openparachute/hub 0.5.14-rc.13 → 0.5.14-rc.14

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
package/package.json CHANGED
@@ -1,6 +1,6 @@
1
1
  {
2
2
  "name": "@openparachute/hub",
3
- "version": "0.5.14-rc.13",
3
+ "version": "0.5.14-rc.14",
4
4
  "description": "parachute — the local hub for the Parachute ecosystem (discovery, ports, lifecycle, soon OAuth).",
5
5
  "license": "AGPL-3.0",
6
6
  "publishConfig": {
@@ -551,6 +551,191 @@ describe("exposeCloudflareUp", () => {
551
551
  }
552
552
  });
553
553
 
554
+ test("hub#487: kills orphan connectors found by pgrep before spawning, not just the state pid", async () => {
555
+ // The orphan-accumulation bug: each re-expose spawned a fresh connector
556
+ // without killing prior ones, and state only tracked the most-recent pid.
557
+ // Orphans the state file lost track of (crashed mid-rewrite, started by
558
+ // hand) must still be swept — `connectorPids` finds them by UUID/config
559
+ // path. Here state knows pid 99999, but pgrep also surfaces 88888 + 77777
560
+ // serving the same tunnel; all three get SIGTERM before the new spawn.
561
+ const env = makeEnv();
562
+ try {
563
+ const uuid = "cccccccc-0000-0000-0000-000000000003";
564
+ const priorRecord: CloudflaredTunnelRecord = {
565
+ pid: 99999,
566
+ tunnelUuid: uuid,
567
+ tunnelName: "parachute",
568
+ hostname: "vault.example.com",
569
+ startedAt: "2026-04-21T00:00:00.000Z",
570
+ configPath: env.configPath,
571
+ };
572
+ writeCloudflaredState({ version: 2, tunnels: { parachute: priorRecord } }, env.statePath);
573
+
574
+ const { runner } = queueRunner([
575
+ { code: 0, stdout: "cloudflared 2024.1.0\n", stderr: "" },
576
+ { code: 0, stdout: JSON.stringify([{ id: uuid, name: "parachute" }]), stderr: "" },
577
+ { code: 0, stdout: "", stderr: "" }, // route dns
578
+ ]);
579
+ const { spawner, seen } = fakeSpawner(42010);
580
+ const killed: number[] = [];
581
+
582
+ const code = await exposeCloudflareUp("vault.example.com", {
583
+ runner,
584
+ spawner,
585
+ alive: () => true, // all candidate pids report alive
586
+ kill: (pid) => killed.push(pid),
587
+ // pgrep surfaces two orphans the state record didn't track.
588
+ connectorPids: () => [88888, 77777],
589
+ resolveHost: async () => ["104.16.0.1"], // Cloudflare — no DNS warning
590
+ log: () => {},
591
+ manifestPath: env.manifestPath,
592
+ statePath: env.statePath,
593
+ exposeStatePath: env.exposeStatePath,
594
+ configPath: env.configPath,
595
+ logPath: env.logPath,
596
+ cloudflaredHome: env.cloudflaredHome,
597
+ configDir: env.configDir,
598
+ skipHub: true,
599
+ });
600
+
601
+ expect(code).toBe(0);
602
+ // Every prior connector (state pid + both pgrep orphans) is stopped
603
+ // before the new one spawns.
604
+ expect(killed.sort()).toEqual([77777, 88888, 99999]);
605
+ // Exactly one fresh connector spawned, and it's the one recorded.
606
+ expect(seen).toHaveLength(1);
607
+ expect(findTunnelRecord(readCloudflaredState(env.statePath), "parachute")?.pid).toBe(42010);
608
+ } finally {
609
+ env.cleanup();
610
+ }
611
+ });
612
+
613
+ test("hub#487: warns when DNS doesn't resolve yet (pending zone)", async () => {
614
+ // route dns succeeded but the hostname doesn't resolve — the "pending"
615
+ // zone shape (NS not switched at the registrar). Non-fatal: still exit 0,
616
+ // still print the URLs, but add the nameserver-switch nudge.
617
+ const env = makeEnv();
618
+ try {
619
+ const uuid = "dddddddd-0000-0000-0000-000000000004";
620
+ const { runner } = queueRunner([
621
+ { code: 0, stdout: "cloudflared 2024.1.0\n", stderr: "" },
622
+ { code: 0, stdout: JSON.stringify([{ id: uuid, name: "parachute" }]), stderr: "" },
623
+ { code: 0, stdout: "", stderr: "" },
624
+ ]);
625
+ const { spawner } = fakeSpawner(42020);
626
+ const logs: string[] = [];
627
+
628
+ const code = await exposeCloudflareUp("vault.newzone.com", {
629
+ runner,
630
+ spawner,
631
+ alive: () => false,
632
+ kill: () => {},
633
+ connectorPids: () => [],
634
+ resolveHost: async () => [], // NXDOMAIN / not live yet
635
+ log: (l) => logs.push(l),
636
+ manifestPath: env.manifestPath,
637
+ statePath: env.statePath,
638
+ exposeStatePath: env.exposeStatePath,
639
+ configPath: env.configPath,
640
+ logPath: env.logPath,
641
+ cloudflaredHome: env.cloudflaredHome,
642
+ configDir: env.configDir,
643
+ skipHub: true,
644
+ });
645
+
646
+ expect(code).toBe(0); // non-fatal — the expose still completes
647
+ const joined = logs.join("\n");
648
+ expect(joined).toContain("DNS isn't live yet for vault.newzone.com");
649
+ expect(joined).toContain("dig +short newzone.com NS");
650
+ expect(joined).toContain("ns.cloudflare.com");
651
+ // The success URLs still print.
652
+ expect(joined).toContain("https://vault.newzone.com/admin/");
653
+ } finally {
654
+ env.cleanup();
655
+ }
656
+ });
657
+
658
+ test("hub#487: warns when hostname resolves but not to Cloudflare (shadowed)", async () => {
659
+ // route dns succeeded but the hostname resolves to a non-Cloudflare IP —
660
+ // a Pages project / grey-cloud A record shadowing the tunnel → edge 404.
661
+ const env = makeEnv();
662
+ try {
663
+ const uuid = "eeeeeeee-0000-0000-0000-000000000006";
664
+ const { runner } = queueRunner([
665
+ { code: 0, stdout: "cloudflared 2024.1.0\n", stderr: "" },
666
+ { code: 0, stdout: JSON.stringify([{ id: uuid, name: "parachute" }]), stderr: "" },
667
+ { code: 0, stdout: "", stderr: "" },
668
+ ]);
669
+ const { spawner } = fakeSpawner(42021);
670
+ const logs: string[] = [];
671
+
672
+ const code = await exposeCloudflareUp("docs.parachute.computer", {
673
+ runner,
674
+ spawner,
675
+ alive: () => false,
676
+ kill: () => {},
677
+ connectorPids: () => [],
678
+ resolveHost: async () => ["203.0.113.10"], // not a Cloudflare range
679
+ log: (l) => logs.push(l),
680
+ manifestPath: env.manifestPath,
681
+ statePath: env.statePath,
682
+ exposeStatePath: env.exposeStatePath,
683
+ configPath: env.configPath,
684
+ logPath: env.logPath,
685
+ cloudflaredHome: env.cloudflaredHome,
686
+ configDir: env.configDir,
687
+ skipHub: true,
688
+ });
689
+
690
+ expect(code).toBe(0);
691
+ const joined = logs.join("\n");
692
+ expect(joined).toContain("not to Cloudflare's edge");
693
+ expect(joined).toContain("shadowed");
694
+ expect(joined).toContain("Pages project");
695
+ } finally {
696
+ env.cleanup();
697
+ }
698
+ });
699
+
700
+ test("hub#487: no DNS warning when hostname resolves at Cloudflare's edge", async () => {
701
+ const env = makeEnv();
702
+ try {
703
+ const uuid = "ffffffff-0000-0000-0000-000000000007";
704
+ const { runner } = queueRunner([
705
+ { code: 0, stdout: "cloudflared 2024.1.0\n", stderr: "" },
706
+ { code: 0, stdout: JSON.stringify([{ id: uuid, name: "parachute" }]), stderr: "" },
707
+ { code: 0, stdout: "", stderr: "" },
708
+ ]);
709
+ const { spawner } = fakeSpawner(42022);
710
+ const logs: string[] = [];
711
+
712
+ const code = await exposeCloudflareUp("vault.example.com", {
713
+ runner,
714
+ spawner,
715
+ alive: () => false,
716
+ kill: () => {},
717
+ connectorPids: () => [],
718
+ resolveHost: async () => ["104.18.32.7"], // 104.16.0.0/13 — Cloudflare
719
+ log: (l) => logs.push(l),
720
+ manifestPath: env.manifestPath,
721
+ statePath: env.statePath,
722
+ exposeStatePath: env.exposeStatePath,
723
+ configPath: env.configPath,
724
+ logPath: env.logPath,
725
+ cloudflaredHome: env.cloudflaredHome,
726
+ configDir: env.configDir,
727
+ skipHub: true,
728
+ });
729
+
730
+ expect(code).toBe(0);
731
+ const joined = logs.join("\n");
732
+ expect(joined).not.toContain("DNS isn't live yet");
733
+ expect(joined).not.toContain("not to Cloudflare's edge");
734
+ } finally {
735
+ env.cleanup();
736
+ }
737
+ });
738
+
554
739
  test("two tunnels with different --tunnel-name coexist in state", async () => {
555
740
  const env = makeEnv();
556
741
  try {
@@ -929,6 +1114,46 @@ describe("exposeCloudflareOff", () => {
929
1114
  }
930
1115
  });
931
1116
 
1117
+ test("hub#487: off sweeps orphan connectors the state record didn't track", async () => {
1118
+ const env = makeEnv();
1119
+ try {
1120
+ const uuid = "abababab-0000-0000-0000-000000000009";
1121
+ writeCloudflaredState(
1122
+ {
1123
+ version: 2,
1124
+ tunnels: {
1125
+ parachute: {
1126
+ pid: 55555,
1127
+ tunnelUuid: uuid,
1128
+ tunnelName: "parachute",
1129
+ hostname: "vault.example.com",
1130
+ startedAt: "2026-04-22T12:00:00.000Z",
1131
+ configPath: env.configPath,
1132
+ },
1133
+ },
1134
+ },
1135
+ env.statePath,
1136
+ );
1137
+ const killed: number[] = [];
1138
+ const code = await exposeCloudflareOff({
1139
+ statePath: env.statePath,
1140
+ exposeStatePath: env.exposeStatePath,
1141
+ alive: () => true,
1142
+ kill: (pid) => killed.push(pid),
1143
+ // pgrep finds the tracked pid (skipped — already signalled) plus an
1144
+ // untracked orphan 66666 serving the same tunnel.
1145
+ connectorPids: () => [55555, 66666],
1146
+ log: () => {},
1147
+ });
1148
+ expect(code).toBe(0);
1149
+ // Tracked pid stopped once, orphan also stopped — no double-kill of 55555.
1150
+ expect(killed.sort()).toEqual([55555, 66666]);
1151
+ expect(existsSync(env.statePath)).toBe(false);
1152
+ } finally {
1153
+ env.cleanup();
1154
+ }
1155
+ });
1156
+
932
1157
  test("targets the named tunnel and leaves siblings intact", async () => {
933
1158
  const env = makeEnv();
934
1159
  try {
@@ -783,6 +783,159 @@ describe("parachute start", () => {
783
783
  h.cleanup();
784
784
  }
785
785
  });
786
+
787
+ // hub#487 — readiness gating beyond the bare liveness settle. Aaron hit this
788
+ // on a fresh EC2 box: `parachute start vault` printed "✓ vault started" while
789
+ // the process died ~instantly on EADDRINUSE (an orphan held 1940), and
790
+ // `parachute status` then showed it inactive.
791
+
792
+ /**
793
+ * A stub spawner that also seeds the service's log file with `content`, so
794
+ * the readiness-failure path's log-tail + EADDRINUSE detection can read a
795
+ * realistic boot error. Mirrors how the real spawner appends stdout/stderr
796
+ * to the logfile.
797
+ */
798
+ function makeSpawnerWithLog(pid: number, content: string): SpawnerStub {
799
+ const calls: SpawnerStub["calls"] = [];
800
+ return {
801
+ calls,
802
+ spawn(cmd, logFile, opts) {
803
+ calls.push({ cmd: [...cmd], logFile, env: opts?.env, cwd: opts?.cwd });
804
+ // The start path calls ensureLogPath() before spawn, so logFile's
805
+ // parent dir already exists — just write the simulated boot output.
806
+ writeFileSync(logFile, content);
807
+ return pid;
808
+ },
809
+ };
810
+ }
811
+
812
+ test("hub#487: EADDRINUSE in the log → port-in-use message + log tail, not ✓", async () => {
813
+ const h = makeHarness();
814
+ try {
815
+ seedVault(h.manifestPath);
816
+ const spawner = makeSpawnerWithLog(
817
+ 4242,
818
+ "booting vault…\nerror: listen EADDRINUSE: address already in use 0.0.0.0:1940\n",
819
+ );
820
+ const lines: string[] = [];
821
+ const code = await start("vault", {
822
+ configDir: h.configDir,
823
+ manifestPath: h.manifestPath,
824
+ spawner,
825
+ alive: () => false, // process died right after the EADDRINUSE throw
826
+ sleep: async () => {},
827
+ startSettleMs: 1,
828
+ log: (l) => lines.push(l),
829
+ });
830
+ expect(code).toBe(1);
831
+ expect(readPid("vault", h.configDir)).toBeUndefined();
832
+ const out = lines.join("\n");
833
+ expect(out).toMatch(/port 1940 is already in use/);
834
+ expect(out).toMatch(/lsof -ti:1940/);
835
+ // The real boot error is surfaced inline so the operator doesn't have to
836
+ // go tail the log themselves.
837
+ expect(out).toMatch(/EADDRINUSE/);
838
+ expect(out).not.toMatch(/✓ vault started/);
839
+ } finally {
840
+ h.cleanup();
841
+ }
842
+ });
843
+
844
+ test("hub#487: process survives settle but never binds its port → failure with log tail", async () => {
845
+ const h = makeHarness();
846
+ try {
847
+ seedVault(h.manifestPath);
848
+ const spawner = makeSpawnerWithLog(4242, "vault crashed mid-boot\n");
849
+ const lines: string[] = [];
850
+ let aliveCalls = 0;
851
+ const code = await start("vault", {
852
+ configDir: h.configDir,
853
+ manifestPath: h.manifestPath,
854
+ spawner,
855
+ // Alive through the settle + first readiness poll, then dies — the
856
+ // slow-EADDRINUSE / crash-after-boot shape.
857
+ alive: () => {
858
+ aliveCalls++;
859
+ return aliveCalls <= 1;
860
+ },
861
+ sleep: async () => {},
862
+ startSettleMs: 1,
863
+ startReadyMs: 50,
864
+ startReadyPollMs: 1,
865
+ portListening: async () => false, // never binds
866
+ log: (l) => lines.push(l),
867
+ });
868
+ expect(code).toBe(1);
869
+ expect(readPid("vault", h.configDir)).toBeUndefined();
870
+ const out = lines.join("\n");
871
+ expect(out).toMatch(/✗ vault failed to start/);
872
+ expect(out).toMatch(/exited during startup/);
873
+ expect(out).not.toMatch(/✓ vault started/);
874
+ } finally {
875
+ h.cleanup();
876
+ }
877
+ });
878
+
879
+ test("hub#487: alive but port silent past the window → non-fatal warning, exit 0", async () => {
880
+ const h = makeHarness();
881
+ try {
882
+ seedVault(h.manifestPath);
883
+ const spawner = makeSpawner([4242]);
884
+ const lines: string[] = [];
885
+ const code = await start("vault", {
886
+ configDir: h.configDir,
887
+ manifestPath: h.manifestPath,
888
+ spawner,
889
+ alive: () => true, // stays up the whole time
890
+ sleep: async () => {},
891
+ startSettleMs: 1,
892
+ startReadyMs: 10,
893
+ startReadyPollMs: 1,
894
+ portListening: async () => false, // slow boot — not listening yet
895
+ log: (l) => lines.push(l),
896
+ });
897
+ // A slow-but-alive daemon isn't a hard failure — we warn rather than fail.
898
+ expect(code).toBe(0);
899
+ expect(readPid("vault", h.configDir)).toBe(4242);
900
+ const out = lines.join("\n");
901
+ expect(out).toMatch(/port 1940 isn't accepting connections yet/);
902
+ expect(out).not.toMatch(/✓ vault started/);
903
+ } finally {
904
+ h.cleanup();
905
+ }
906
+ });
907
+
908
+ test("hub#487: alive + port listening → success", async () => {
909
+ const h = makeHarness();
910
+ try {
911
+ seedVault(h.manifestPath);
912
+ const spawner = makeSpawner([4242]);
913
+ const lines: string[] = [];
914
+ let probeCalls = 0;
915
+ const code = await start("vault", {
916
+ configDir: h.configDir,
917
+ manifestPath: h.manifestPath,
918
+ spawner,
919
+ alive: () => true,
920
+ sleep: async () => {},
921
+ startSettleMs: 1,
922
+ startReadyMs: 50,
923
+ startReadyPollMs: 1,
924
+ // Not listening on the first poll, bound on the second — exercises the
925
+ // poll loop rather than an instant true.
926
+ portListening: async () => {
927
+ probeCalls++;
928
+ return probeCalls >= 2;
929
+ },
930
+ log: (l) => lines.push(l),
931
+ });
932
+ expect(code).toBe(0);
933
+ expect(readPid("vault", h.configDir)).toBe(4242);
934
+ expect(lines.join("\n")).toMatch(/✓ vault started \(pid 4242\)/);
935
+ } finally {
936
+ h.cleanup();
937
+ }
938
+ });
786
939
  });
787
940
 
788
941
  describe("parachute stop", () => {
@@ -1,3 +1,4 @@
1
+ import { spawnSync } from "node:child_process";
1
2
  import { mkdirSync, openSync } from "node:fs";
2
3
  import { dirname } from "node:path";
3
4
  import { DEFAULT_TUNNEL_NAME, cloudflaredPathsFor, writeConfig } from "../cloudflare/config.ts";
@@ -100,11 +101,159 @@ const defaultKill: KillFn = (pid, signal) => {
100
101
  process.kill(pid, signal);
101
102
  };
102
103
 
104
+ /**
105
+ * Find the PIDs of every running `cloudflared` connector serving THIS tunnel.
106
+ * "This tunnel" is identified by either the tunnel UUID or the config.yml path
107
+ * appearing on the process command line — both are unique to Parachute's
108
+ * connector for this tunnel, so we never touch an unrelated cloudflared the
109
+ * operator may be running for a different tunnel.
110
+ *
111
+ * The motivating bug (hub#487): each `parachute expose public --cloudflare`
112
+ * "reused the tunnel" but spawned a fresh connector (new pid) without killing
113
+ * the prior ones, and the state file only tracked the most-recent pid. Orphan
114
+ * connectors accumulated — multiple `cloudflared tunnel run` processes all
115
+ * serving stale `config.yml` snapshots, so edge routing became nondeterministic
116
+ * ("silent fails"). Sweeping by UUID/config-path catches the orphans that the
117
+ * single-pid state record misses (prior runs that crashed mid-rewrite, or a
118
+ * connector the operator started by hand for this tunnel).
119
+ *
120
+ * Injectable so tests assert the sweep without a live `pgrep`.
121
+ */
122
+ export type ConnectorPidsFn = (tunnelUuid: string, configPath: string) => number[];
123
+
124
+ export const defaultConnectorPids: ConnectorPidsFn = (tunnelUuid, configPath) => {
125
+ try {
126
+ // `pgrep -fl cloudflared` lists "<pid> <full command line>" for every
127
+ // process whose command line matches "cloudflared". We then filter to the
128
+ // ones that name THIS tunnel (uuid or config path) so the kill is surgical.
129
+ // macOS + Linux ship pgrep; Windows is out of scope (mirrors hub#287's lsof
130
+ // assumption). Any failure → [] (caller falls back to state-tracked pid).
131
+ const result = spawnSync("pgrep", ["-fl", "cloudflared"], {
132
+ encoding: "utf8",
133
+ timeout: 2000,
134
+ });
135
+ if (result.status !== 0 || typeof result.stdout !== "string") return [];
136
+ const selfPid = process.pid;
137
+ const pids: number[] = [];
138
+ for (const line of result.stdout.split("\n")) {
139
+ const trimmed = line.trim();
140
+ if (trimmed.length === 0) continue;
141
+ const match = trimmed.match(/^(\d+)\s+(.*)$/);
142
+ if (!match) continue;
143
+ const pid = Number.parseInt(match[1]!, 10);
144
+ const cmdline = match[2]!;
145
+ if (!Number.isInteger(pid) || pid <= 0 || pid === selfPid) continue;
146
+ // Surgical match: only connectors that name this tunnel's UUID or its
147
+ // config path. A bare `cloudflared` (e.g. `--version`, `tunnel list`)
148
+ // or a connector for a *different* tunnel won't match either token.
149
+ if (cmdline.includes(tunnelUuid) || cmdline.includes(configPath)) {
150
+ pids.push(pid);
151
+ }
152
+ }
153
+ return pids;
154
+ } catch {
155
+ return [];
156
+ }
157
+ };
158
+
159
+ /**
160
+ * Resolve a hostname to its A/AAAA addresses. Returns [] when the name doesn't
161
+ * resolve (NXDOMAIN, SERVFAIL, no records yet) — the signal the DNS
162
+ * self-diagnosis keys on. Injectable so tests drive each case (unresolved /
163
+ * Cloudflare / non-Cloudflare) deterministically.
164
+ */
165
+ export type ResolveHostFn = (hostname: string) => Promise<string[]>;
166
+
167
+ export const defaultResolveHost: ResolveHostFn = async (hostname) => {
168
+ try {
169
+ // Bun.dns ships with the runtime; `node:dns/promises` is equally fine but
170
+ // Bun.dns.lookup returns both families in one call. `family: 0` asks for any
171
+ // address family so a partially-propagated name still surfaces an address.
172
+ const records = await Bun.dns.lookup(hostname, { family: 0 });
173
+ return records.map((r) => r.address).filter((a) => typeof a === "string" && a.length > 0);
174
+ } catch {
175
+ return [];
176
+ }
177
+ };
178
+
179
+ /**
180
+ * Cloudflare's published anycast IPv4 ranges (the proxy edge). A proxied
181
+ * (orange-cloud) record — which is what `cloudflared tunnel route dns` creates
182
+ * — resolves to one of these. If the hostname resolves to something *outside*
183
+ * these ranges, it's almost certainly shadowed: a Pages project, an A record,
184
+ * or a grey-cloud CNAME pointing elsewhere. We keep the list to the v4 ranges
185
+ * (the common case) and treat any IPv6 in Cloudflare's 2606:4700::/32 block as
186
+ * Cloudflare too. Source: https://www.cloudflare.com/ips/ (stable for years).
187
+ */
188
+ const CLOUDFLARE_V4_RANGES: ReadonlyArray<readonly [string, number]> = [
189
+ ["173.245.48.0", 20],
190
+ ["103.21.244.0", 22],
191
+ ["103.22.200.0", 22],
192
+ ["103.31.4.0", 22],
193
+ ["141.101.64.0", 18],
194
+ ["108.162.192.0", 18],
195
+ ["190.93.240.0", 20],
196
+ ["188.114.96.0", 20],
197
+ ["197.234.240.0", 22],
198
+ ["198.41.128.0", 17],
199
+ ["162.158.0.0", 15],
200
+ ["104.16.0.0", 13],
201
+ ["104.24.0.0", 14],
202
+ ["172.64.0.0", 13],
203
+ ["131.0.72.0", 22],
204
+ ];
205
+
206
+ function ipv4ToInt(ip: string): number | undefined {
207
+ const parts = ip.split(".");
208
+ if (parts.length !== 4) return undefined;
209
+ let n = 0;
210
+ for (const part of parts) {
211
+ const octet = Number.parseInt(part, 10);
212
+ if (!Number.isInteger(octet) || octet < 0 || octet > 255) return undefined;
213
+ n = n * 256 + octet;
214
+ }
215
+ return n >>> 0;
216
+ }
217
+
218
+ /** True if any resolved address belongs to Cloudflare's edge. */
219
+ export function looksLikeCloudflare(addresses: readonly string[]): boolean {
220
+ for (const addr of addresses) {
221
+ // IPv6: Cloudflare's edge lives in 2606:4700::/32.
222
+ if (addr.includes(":")) {
223
+ if (addr.toLowerCase().startsWith("2606:4700")) return true;
224
+ continue;
225
+ }
226
+ const ipInt = ipv4ToInt(addr);
227
+ if (ipInt === undefined) continue;
228
+ for (const [base, bits] of CLOUDFLARE_V4_RANGES) {
229
+ const baseInt = ipv4ToInt(base);
230
+ if (baseInt === undefined) continue;
231
+ const mask = bits === 0 ? 0 : (0xffffffff << (32 - bits)) >>> 0;
232
+ if ((ipInt & mask) === (baseInt & mask)) return true;
233
+ }
234
+ }
235
+ return false;
236
+ }
237
+
103
238
  export interface ExposeCloudflareOpts {
104
239
  runner?: Runner;
105
240
  spawner?: CloudflaredSpawner;
106
241
  alive?: AliveFn;
107
242
  kill?: KillFn;
243
+ /**
244
+ * Find every running cloudflared connector PID serving this tunnel (by UUID
245
+ * or config-path match). Used to sweep orphan connectors before spawning a
246
+ * fresh one (hub#487). Tests inject a stub; production uses
247
+ * `defaultConnectorPids` (a filtered `pgrep -fl cloudflared`).
248
+ */
249
+ connectorPids?: ConnectorPidsFn;
250
+ /**
251
+ * Resolve a hostname to its addresses, for the post-route DNS self-diagnosis
252
+ * (hub#487). Returns the resolved IPs (empty when NXDOMAIN / not yet live).
253
+ * Best-effort and non-fatal — a failure to resolve never blocks the expose.
254
+ * Tests inject a stub; production uses `defaultResolveHost` (Bun DNS).
255
+ */
256
+ resolveHost?: ResolveHostFn;
108
257
  log?: (line: string) => void;
109
258
  manifestPath?: string;
110
259
  statePath?: string;
@@ -186,6 +335,8 @@ interface Resolved {
186
335
  spawner: CloudflaredSpawner;
187
336
  alive: AliveFn;
188
337
  kill: KillFn;
338
+ connectorPids: ConnectorPidsFn;
339
+ resolveHost: ResolveHostFn;
189
340
  log: (line: string) => void;
190
341
  manifestPath: string;
191
342
  statePath: string;
@@ -217,6 +368,17 @@ function resolve(opts: ExposeCloudflareOpts): Resolved {
217
368
  spawner: opts.spawner ?? defaultCloudflaredSpawner,
218
369
  alive: opts.alive ?? defaultAlive,
219
370
  kill: opts.kill ?? defaultKill,
371
+ // Defaulting policy mirrors lifecycle's startReadyMs (hub#487): the real
372
+ // implementations shell out (`pgrep`) / hit the network (DNS). When a test
373
+ // injects a fake `spawner` but no explicit seam, fall back to inert stubs
374
+ // (no orphans found; "resolves at Cloudflare" → no DNS warning) so suites
375
+ // stay deterministic and offline. Production (no spawner override) always
376
+ // gets the real `pgrep` sweep + DNS diagnosis.
377
+ connectorPids:
378
+ opts.connectorPids ?? (opts.spawner === undefined ? defaultConnectorPids : () => []),
379
+ resolveHost:
380
+ opts.resolveHost ??
381
+ (opts.spawner === undefined ? defaultResolveHost : async () => ["104.16.0.1"]),
220
382
  log: opts.log ?? ((line) => console.log(line)),
221
383
  manifestPath: opts.manifestPath ?? SERVICES_MANIFEST_PATH,
222
384
  statePath: opts.statePath ?? CLOUDFLARED_STATE_PATH,
@@ -261,6 +423,49 @@ function printAuthGuidance(log: (line: string) => void, vaultUrl: string): void
261
423
  log(` ${AUTH_DOC_URL}`);
262
424
  }
263
425
 
426
+ /**
427
+ * Best-effort registrable-zone guess: the last two labels of the hostname
428
+ * (`vault.example.com` → `example.com`, `gitcoin.parachute.computer` →
429
+ * `parachute.computer`). This is a heuristic — multi-label public suffixes
430
+ * (`foo.co.uk`) would guess `co.uk` — but it's only used to phrase the
431
+ * `dig +short <zone> NS` remedy, where being off by a label is a harmless
432
+ * nudge. We don't ship a full public-suffix list for one warning string.
433
+ */
434
+ function guessZone(hostname: string): string {
435
+ const labels = hostname.split(".").filter((l) => l.length > 0);
436
+ if (labels.length <= 2) return hostname;
437
+ return labels.slice(-2).join(".");
438
+ }
439
+
440
+ /**
441
+ * Non-fatal post-route DNS diagnosis. Resolves `hostname` and warns when the
442
+ * result looks wrong — see the call site for the two symptoms this addresses.
443
+ * Does not catch: it relies on resolveHost swallowing its own errors (the
443
+ * default does). Never changes the exit code; the worst case is no output.
445
+ */
446
+ async function diagnoseDns(hostname: string, r: Resolved): Promise<void> {
447
+ const zone = guessZone(hostname);
448
+ const addresses = await r.resolveHost(hostname);
449
+ if (addresses.length === 0) {
450
+ r.log("");
451
+ r.log(`⚠ DNS isn't live yet for ${hostname}.`);
452
+ r.log(` If ${zone} is a new Cloudflare zone, its nameservers may not be switched at your`);
453
+ r.log(" registrar yet. Check with:");
454
+ r.log(` dig +short ${zone} NS # should list *.ns.cloudflare.com`);
455
+ r.log(" Propagation can take minutes to hours. The tunnel itself is up — the URLs below");
456
+ r.log(" will start working once DNS resolves.");
457
+ return;
458
+ }
459
+ if (!looksLikeCloudflare(addresses)) {
460
+ r.log("");
461
+ r.log(`⚠ ${hostname} resolves (${addresses.join(", ")}) but not to Cloudflare's edge.`);
462
+ r.log(` It may be shadowed by another DNS record or a Cloudflare Pages project on ${zone}.`);
463
+ r.log(" Ensure it's a proxied (orange-cloud) CNAME to the tunnel — check");
464
+ r.log(` https://dash.cloudflare.com → DNS for ${zone}. A grey-cloud / A record / Pages`);
465
+ r.log(" binding on this hostname will 404 the tunnel at the edge.");
466
+ }
467
+ }
468
+
264
469
  export async function exposeCloudflareUp(
265
470
  hostname: string,
266
471
  opts: ExposeCloudflareOpts = {},
@@ -390,6 +595,19 @@ export async function exposeCloudflareUp(
390
595
  }
391
596
  r.log("✓ DNS routed.");
392
597
 
598
+ // Post-route DNS self-diagnosis (hub#487). `cloudflared tunnel route dns`
599
+ // can succeed (the CNAME is written in Cloudflare's API) while the hostname
600
+ // is still NOT actually serving the tunnel — two shapes Aaron hit:
601
+ // (a) a "pending" zone whose nameservers aren't switched at the registrar
602
+ // yet, so the record exists in Cloudflare but nothing resolves; and
603
+ // (b) a subdomain shadowed by a Cloudflare Pages project on the same zone,
604
+ // so the edge 404s the tunnel.
605
+ // Both previously printed "✓ DNS routed" + the URLs as if fine. This check
606
+ // is best-effort and strictly NON-FATAL — it only adds a warning; it never
607
+ // changes the exit code or blocks the expose. Cost: one DNS lookup; note that
608
+ // `defaultResolveHost` sets no explicit timeout, it only swallows lookup errors.
609
+ await diagnoseDns(hostname, r);
610
+
393
611
  const credsFile = credentialsPath(tunnel.id, r.cloudflaredHome);
394
612
  writeConfig(
395
613
  {
@@ -408,12 +626,28 @@ export async function exposeCloudflareUp(
408
626
  );
409
627
  r.log(`✓ Wrote ${r.configPath}`);
410
628
 
629
+ // Orphan-connector sweep (hub#487). Before spawning a fresh connector, kill
630
+ // EVERY cloudflared connector currently serving this tunnel so exactly one
631
+ // process serves the config.yml we just wrote. Pre-fix, each re-expose
632
+ // spawned a new connector without killing the prior ones (state tracked only
633
+ // the most-recent pid), so orphans accumulated and edge routing became
634
+ // nondeterministic. We union two sources:
635
+ // - the pid recorded in cloudflared-state.json (the prior `parachute`-
636
+ // spawned connector for this tunnel name), and
637
+ // - any pid found by scanning running processes for this tunnel's UUID or
638
+ // config path (catches orphans the state file lost track of — crashed
639
+ // mid-rewrite, or started by hand for this tunnel).
411
640
  const stateBefore = readCloudflaredState(r.statePath);
412
641
  const prior = findTunnelRecord(stateBefore, r.tunnelName);
413
- if (prior && r.alive(prior.pid)) {
642
+ const toKill = new Set<number>();
643
+ if (prior && r.alive(prior.pid)) toKill.add(prior.pid);
644
+ for (const pid of r.connectorPids(tunnel.id, r.configPath)) {
645
+ if (r.alive(pid)) toKill.add(pid);
646
+ }
647
+ for (const deadPid of toKill) {
414
648
  try {
415
- r.kill(prior.pid, "SIGTERM");
416
- r.log(`Stopped prior cloudflared (pid ${prior.pid}).`);
649
+ r.kill(deadPid, "SIGTERM");
650
+ r.log(`Stopped prior cloudflared connector (pid ${deadPid}).`);
417
651
  } catch {
418
652
  // Process is already gone — safe to ignore; we replace the record below.
419
653
  }
@@ -530,6 +764,18 @@ export async function exposeCloudflareOff(opts: ExposeCloudflareOpts = {}): Prom
530
764
  } else {
531
765
  r.log(`cloudflared (pid ${record.pid}) wasn't running; clearing stale state.`);
532
766
  }
767
+ // Sweep any orphan connectors for this tunnel that the state record didn't
768
+ // track (hub#487) so `off` leaves exactly zero connectors serving it. Match
769
+ // by UUID/config-path; skip the record pid we already signalled above.
770
+ for (const orphanPid of r.connectorPids(record.tunnelUuid, record.configPath)) {
771
+ if (orphanPid === record.pid || !r.alive(orphanPid)) continue;
772
+ try {
773
+ r.kill(orphanPid, "SIGTERM");
774
+ r.log(`✓ Stopped orphan cloudflared connector (pid ${orphanPid}).`);
775
+ } catch {
776
+ // Already gone between probe and kill — fine.
777
+ }
778
+ }
533
779
  const stateAfter = withoutTunnelRecord(stateBefore, r.tunnelName);
534
780
  if (stateAfter) {
535
781
  writeCloudflaredState(stateAfter, r.statePath);
@@ -1,4 +1,5 @@
1
- import { existsSync, openSync } from "node:fs";
1
+ import { existsSync, openSync, readFileSync } from "node:fs";
2
+ import { Socket } from "node:net";
2
3
  import { join } from "node:path";
3
4
  import { CONFIG_DIR, SERVICES_MANIFEST_PATH } from "../config.ts";
4
5
  import { readEnvFileValues } from "../env-file.ts";
@@ -84,6 +85,44 @@ export const defaultSpawner: Spawner = {
84
85
  export type KillFn = (pid: number, signal: NodeJS.Signals | number) => void;
85
86
  export type SleepFn = (ms: number) => Promise<void>;
86
87
 
88
+ /**
89
+ * "Is something listening on this TCP port on loopback?" seam. Pairs with the
90
+ * spawn-then-die settle (hub#194) to catch the *other* silent-start failure
91
+ * shape (hub#487): a service that lives long enough to clear the liveness
92
+ * check but never binds its port because the port is already held (EADDRINUSE
93
+ * from an orphan). The recorded pid stays alive (vault's process supervisor
94
+ * retries / lingers) so `alive(pid)` says "running" while `parachute status`
95
+ * shows it inactive because nothing answers on the port.
96
+ *
97
+ * Tests inject a deterministic stub; production uses `defaultPortListening`.
98
+ */
99
+ export type PortListeningFn = (port: number) => Promise<boolean>;
100
+
101
+ /**
102
+ * Connect-probe: open a TCP socket to 127.0.0.1:<port> and see if it's
103
+ * accepted. A successful connect means *something* is listening; we close
104
+ * immediately. Connection refused / timeout means nothing is bound yet.
105
+ * `node:net` rather than `Bun.connect` because the latter has no clean
106
+ * "connection refused → false" without a custom socket handler, and the net
107
+ * Socket's `error`/`connect` events map directly onto the boolean we want.
108
+ */
109
+ export const defaultPortListening: PortListeningFn = (port) =>
110
+ new Promise((resolve) => {
111
+ const socket = new Socket();
112
+ let settled = false;
113
+ const done = (listening: boolean) => {
114
+ if (settled) return;
115
+ settled = true;
116
+ socket.destroy();
117
+ resolve(listening);
118
+ };
119
+ socket.setTimeout(1000);
120
+ socket.once("connect", () => done(true));
121
+ socket.once("timeout", () => done(false));
122
+ socket.once("error", () => done(false));
123
+ socket.connect(port, "127.0.0.1");
124
+ });
125
+
87
126
  /**
88
127
  * Group-aware liveness: returns true if the process group (pgid == pid)
89
128
  * still has any member. Pairs with `defaultSpawner`'s `detached: true` —
@@ -130,6 +169,35 @@ export const defaultKill: KillFn = (pid, signal) => {
130
169
 
131
170
  export const defaultSleep: SleepFn = (ms) => new Promise((r) => setTimeout(r, ms));
132
171
 
172
+ /**
173
+ * Read the trailing `n` lines of a logfile, best-effort. Used to surface the
174
+ * real boot error when a start fails — operators shouldn't have to manually
175
+ * `tail` the log to learn *why* the daemon died. Returns [] on any read
176
+ * error (missing file, permissions) so the caller falls back to the generic
177
+ * "tail the log" hint without throwing.
178
+ */
179
+ function readLogTail(logFile: string, n: number): string[] {
180
+ try {
181
+ const content = readFileSync(logFile, "utf8");
182
+ const trimmed = content.replace(/\n$/, "");
183
+ if (trimmed === "") return [];
184
+ return trimmed.split("\n").slice(-n);
185
+ } catch {
186
+ return [];
187
+ }
188
+ }
189
+
190
+ /**
191
+ * Heuristic EADDRINUSE detector over a logfile tail. cloudflared, Bun, and
192
+ * Node all surface port collisions with recognizable phrases; we match the
193
+ * common ones rather than parse a structured error (there isn't one across
194
+ * runtimes). False positives are harmless — the worst case is we *also* print
195
+ * the port-in-use remedy on an unrelated failure, which is still actionable.
196
+ */
197
+ function detectAddrInUse(logTail: readonly string[]): boolean {
198
+ return logTail.some((line) => /EADDRINUSE|address already in use|port .* in use/i.test(line));
199
+ }
200
+
133
201
  export interface LifecycleOpts {
134
202
  spawner?: Spawner;
135
203
  kill?: KillFn;
@@ -161,6 +229,30 @@ export interface LifecycleOpts {
161
229
  * settle.
162
230
  */
163
231
  startSettleMs?: number;
232
+ /**
233
+ * Probe whether the service's port is listening, post-spawn. Pairs with the
234
+ * settle (hub#194) to catch the EADDRINUSE-orphan shape (hub#487): the
235
+ * process survives the liveness window (vault lingers / retries) but never
236
+ * binds because the port is already held, so `start` would otherwise report
237
+ * "✓ started" while `status` shows it inactive. Tests inject a stub;
238
+ * production uses `defaultPortListening` (a loopback TCP connect probe).
239
+ */
240
+ portListening?: PortListeningFn;
241
+ /**
242
+ * How long `start` polls for the service to bind its port after the
243
+ * liveness settle passes. Default 4000ms in production — long enough to
244
+ * cover vault/scribe cold-boot (DB open, route registration) without making
245
+ * a healthy start feel laggy. Polled at `startReadyPollMs` intervals; the
246
+ * first time the port answers we declare success. If the window elapses
247
+ * with the process still alive but the port silent, we print a non-fatal
248
+ * warning (the daemon may still be coming up) rather than failing — only a
249
+ * *dead* process is a hard failure. Defaulting policy mirrors
250
+ * `startSettleMs`: 0 (skipped) unless `portListening` is injected or the
251
+ * production path (no spawner override) is active.
252
+ */
253
+ startReadyMs?: number;
254
+ /** Poll interval while waiting for the port to come up. Default 200ms. */
255
+ startReadyPollMs?: number;
164
256
  /**
165
257
  * Override the hub origin passed to services as PARACHUTE_HUB_ORIGIN. If
166
258
  * unset, `start` derives it from `expose-state.json` (when exposed) or
@@ -194,6 +286,9 @@ interface Resolved {
194
286
  killWaitMs: number;
195
287
  pollIntervalMs: number;
196
288
  startSettleMs: number;
289
+ portListening: PortListeningFn;
290
+ startReadyMs: number;
291
+ startReadyPollMs: number;
197
292
  hubOrigin: string | undefined;
198
293
  ensureHub: (opts: EnsureHubOpts) => Promise<EnsureHubResult>;
199
294
  stopHubFn: (opts: StopHubOpts) => Promise<boolean>;
@@ -220,6 +315,16 @@ function resolve(opts: LifecycleOpts): Resolved {
220
315
  // override `alive`, which re-enables the default 250ms.
221
316
  startSettleMs:
222
317
  opts.startSettleMs ?? (opts.spawner === undefined || opts.alive !== undefined ? 250 : 0),
318
+ portListening: opts.portListening ?? defaultPortListening,
319
+ // Same defaulting policy as startSettleMs: production (no spawner
320
+ // override) gets the real 4s readiness window; tests that inject a stub
321
+ // spawner get 0 (skipped) unless they explicitly opt in via
322
+ // `portListening` or `startReadyMs`, so existing stub-spawner tests don't
323
+ // start probing a fake port.
324
+ startReadyMs:
325
+ opts.startReadyMs ??
326
+ (opts.spawner === undefined || opts.portListening !== undefined ? 4000 : 0),
327
+ startReadyPollMs: opts.startReadyPollMs ?? 200,
223
328
  hubOrigin: resolveHubOrigin(opts.hubOrigin, configDir),
224
329
  ensureHub: opts.hub?.ensureRunning ?? ensureHubRunning,
225
330
  stopHubFn: opts.hub?.stop ?? stopHub,
@@ -464,21 +569,97 @@ export async function start(svc: string | undefined, opts: LifecycleOpts = {}):
464
569
  }
465
570
  writePid(short, pid, r.configDir);
466
571
 
467
- // Settle-poll for spawn-then-immediately-die (hub#194). A spawn returning
468
- // a pid only proves the kernel forked the process; the child may exit
469
- // microseconds later if its main code path throws before listening
470
- // (e.g. notes-serve's Bun.resolveSync failing for bun-linked installs).
471
- // Without this poll, we'd report success and the operator would chase
472
- // a phantom 502.
572
+ // Boot-readiness gating (hub#194 + hub#487). A spawn returning a pid only
573
+ // proves the kernel forked the process — it says nothing about whether the
574
+ // service survived its boot or bound its port. Two silent-start shapes:
575
+ //
576
+ // (1) spawn-then-immediately-die (hub#194): the child throws before
577
+ // listening (notes-serve's Bun.resolveSync failing for bun-linked
578
+ // installs) and exits microseconds later. Caught by the settle below.
579
+ //
580
+ // (2) alive-but-never-bound (hub#487): the port is already held by an
581
+ // orphan, the child hits EADDRINUSE, but its process *lingers* (or a
582
+ // supervisor retries) long enough to clear the liveness check. `start`
583
+ // would report "✓ started" while `parachute status` shows it inactive
584
+ // because nothing answers on the port. Aaron hit exactly this with an
585
+ // orphan holding vault's 1940 on a fresh EC2 box. Caught by the
586
+ // port-readiness poll below.
587
+ //
588
+ // On any failure we surface the tail of the logfile so the operator sees
589
+ // the real boot error inline, and we specifically call out EADDRINUSE with
590
+ // the `lsof -ti:<port>` remedy.
591
+ const reportStartFailure = (reason: string): void => {
592
+ clearPid(short, r.configDir);
593
+ failures++;
594
+ const tail = readLogTail(logFile, 20);
595
+ if (detectAddrInUse(tail)) {
596
+ r.log(
597
+ `✗ ${short} failed to start: port ${entry.port} is already in use. Stop the existing process first — find it with \`lsof -ti:${entry.port}\` (then \`kill <pid>\`), or run \`parachute restart ${short}\`.`,
598
+ );
599
+ } else {
600
+ r.log(`✗ ${short} failed to start: ${reason}`);
601
+ }
602
+ if (tail.length > 0) {
603
+ r.log(` ── last ${tail.length} log line(s) (${logFile}) ──`);
604
+ for (const line of tail) r.log(` │ ${line}`);
605
+ } else {
606
+ r.log(` Tail the log for details: tail -50 ${logFile}`);
607
+ }
608
+ };
609
+
473
610
  if (r.startSettleMs > 0) {
474
611
  await r.sleep(r.startSettleMs);
475
612
  if (!r.alive(pid)) {
476
- clearPid(short, r.configDir);
477
- failures++;
613
+ reportStartFailure(
614
+ `spawned pid ${pid} but the process exited within ${r.startSettleMs}ms.`,
615
+ );
616
+ continue;
617
+ }
618
+ }
619
+
620
+ // Port-readiness poll (hub#487). The process is alive; now confirm it
621
+ // actually bound its port before claiming success. Poll up to
622
+ // `startReadyMs`, re-checking liveness each iteration so a *later* death
623
+ // (e.g. a slow EADDRINUSE crash) is still reported as a failure. A process
624
+ // that stays alive but never binds within the window gets a non-fatal
625
+ // warning rather than a hard failure — some daemons legitimately do slow
626
+ // boot work, and we'd rather not flip a healthy-but-slow start to red.
627
+ if (r.startReadyMs > 0) {
628
+ const deadline = r.now() + r.startReadyMs;
629
+ let listening = false;
630
+ let died = false;
631
+ while (r.now() < deadline) {
632
+ if (!r.alive(pid)) {
633
+ died = true;
634
+ break;
635
+ }
636
+ if (await r.portListening(entry.port)) {
637
+ listening = true;
638
+ break;
639
+ }
640
+ await r.sleep(r.startReadyPollMs);
641
+ }
642
+ if (died) {
643
+ reportStartFailure(`spawned pid ${pid} but the process exited during startup.`);
644
+ continue;
645
+ }
646
+ if (!listening) {
647
+ // Last-chance liveness check — the loop may have exited on the
648
+ // deadline right as the process died.
649
+ if (!r.alive(pid)) {
650
+ reportStartFailure(`spawned pid ${pid} but the process exited during startup.`);
651
+ continue;
652
+ }
478
653
  r.log(
479
- `✗ ${short} failed to start: spawned pid ${pid} but the process exited within ${r.startSettleMs}ms.`,
654
+ `⚠ ${short} started (pid ${pid}) but port ${entry.port} isn't accepting connections yet after ${r.startReadyMs}ms.`,
480
655
  );
481
- r.log(` Tail the log for details: tail -50 ${logFile}`);
656
+ r.log(
657
+ ` It may still be coming up — check \`parachute status\` and \`parachute logs ${short}\`.`,
658
+ );
659
+ if (r.hubOrigin) r.log(` ${HUB_ORIGIN_ENV}=${r.hubOrigin}`);
660
+ if (short === "vault" && r.hubOrigin) {
661
+ persistVaultHubOrigin(r.configDir, r.hubOrigin, r.log);
662
+ }
482
663
  continue;
483
664
  }
484
665
  }