@openparachute/hub 0.5.14-rc.13 → 0.5.14-rc.14
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
package/package.json
CHANGED
|
@@ -551,6 +551,191 @@ describe("exposeCloudflareUp", () => {
|
|
|
551
551
|
}
|
|
552
552
|
});
|
|
553
553
|
|
|
554
|
+
test("hub#487: kills orphan connectors found by pgrep before spawning, not just the state pid", async () => {
  // Regression for the orphan-accumulation bug: each re-expose spawned a fresh
  // connector without stopping earlier ones, and state remembered only the
  // most recent pid. Connectors the state file lost track of must be swept
  // too; the injected `connectorPids` scan stands in for that discovery.
  // Here state knows pid 99999 and the scan surfaces 88888 + 77777 for the
  // same tunnel; all three are expected to be handed to `kill`.
  const env = makeEnv();
  try {
    const uuid = "cccccccc-0000-0000-0000-000000000003";
    const priorRecord: CloudflaredTunnelRecord = {
      pid: 99999,
      tunnelUuid: uuid,
      tunnelName: "parachute",
      hostname: "vault.example.com",
      startedAt: "2026-04-21T00:00:00.000Z",
      configPath: env.configPath,
    };
    writeCloudflaredState({ version: 2, tunnels: { parachute: priorRecord } }, env.statePath);

    const { runner } = queueRunner([
      { code: 0, stdout: "cloudflared 2024.1.0\n", stderr: "" },
      { code: 0, stdout: JSON.stringify([{ id: uuid, name: "parachute" }]), stderr: "" },
      { code: 0, stdout: "", stderr: "" }, // route dns
    ]);
    const { spawner, seen } = fakeSpawner(42010);
    const killed: number[] = [];

    const code = await exposeCloudflareUp("vault.example.com", {
      runner,
      spawner,
      alive: () => true, // all candidate pids report alive
      kill: (pid) => killed.push(pid),
      // The process scan surfaces two orphans the state record didn't track.
      connectorPids: () => [88888, 77777],
      resolveHost: async () => ["104.16.0.1"], // Cloudflare — no DNS warning
      log: () => {},
      manifestPath: env.manifestPath,
      statePath: env.statePath,
      exposeStatePath: env.exposeStatePath,
      configPath: env.configPath,
      logPath: env.logPath,
      cloudflaredHome: env.cloudflaredHome,
      configDir: env.configDir,
      skipHub: true,
    });

    expect(code).toBe(0);
    // Every prior connector (state pid + both scanned orphans) is stopped.
    // Sort a copy numerically: the default Array#sort compares elements as
    // strings and sorts in place, which would mutate the capture and
    // mis-order pids with differing digit counts.
    expect([...killed].sort((a, b) => a - b)).toEqual([77777, 88888, 99999]);
    // Exactly one fresh connector spawned, and it's the one recorded.
    expect(seen).toHaveLength(1);
    expect(findTunnelRecord(readCloudflaredState(env.statePath), "parachute")?.pid).toBe(42010);
  } finally {
    env.cleanup();
  }
});
|
|
612
|
+
|
|
613
|
+
test("hub#487: warns when DNS doesn't resolve yet (pending zone)", async () => {
  // Scenario: `route dns` reports success, yet the hostname resolves to
  // nothing — the shape of a "pending" zone whose nameservers have not been
  // switched at the registrar. The expose must still exit 0 and print its
  // URLs, with the nameserver-switch hint added on top.
  const sandbox = makeEnv();
  try {
    const tunnelId = "dddddddd-0000-0000-0000-000000000004";
    const { runner } = queueRunner([
      { code: 0, stdout: "cloudflared 2024.1.0\n", stderr: "" },
      { code: 0, stdout: JSON.stringify([{ id: tunnelId, name: "parachute" }]), stderr: "" },
      { code: 0, stdout: "", stderr: "" },
    ]);
    const { spawner } = fakeSpawner(42020);
    const transcript: string[] = [];

    // Filesystem locations all come from the throwaway sandbox.
    const paths = {
      manifestPath: sandbox.manifestPath,
      statePath: sandbox.statePath,
      exposeStatePath: sandbox.exposeStatePath,
      configPath: sandbox.configPath,
      logPath: sandbox.logPath,
      cloudflaredHome: sandbox.cloudflaredHome,
      configDir: sandbox.configDir,
    };

    const exitCode = await exposeCloudflareUp("vault.newzone.com", {
      runner,
      spawner,
      alive: () => false,
      kill: () => {},
      connectorPids: () => [],
      // Empty result models NXDOMAIN / a record that is not live yet.
      resolveHost: async () => [],
      log: (line) => transcript.push(line),
      ...paths,
      skipHub: true,
    });

    // The warning is advisory only — the expose itself completes.
    expect(exitCode).toBe(0);

    const text = transcript.join("\n");
    expect(text).toContain("DNS isn't live yet for vault.newzone.com");
    expect(text).toContain("dig +short newzone.com NS");
    expect(text).toContain("ns.cloudflare.com");
    // Success URLs are printed regardless of the warning.
    expect(text).toContain("https://vault.newzone.com/admin/");
  } finally {
    sandbox.cleanup();
  }
});
|
|
657
|
+
|
|
658
|
+
test("hub#487: warns when hostname resolves but not to Cloudflare (shadowed)", async () => {
  // Scenario: `route dns` reports success, but the hostname answers with an
  // address outside Cloudflare's ranges — e.g. a Pages project or grey-cloud
  // A record shadowing the tunnel, which shows up as an edge 404.
  const sandbox = makeEnv();
  try {
    const tunnelId = "eeeeeeee-0000-0000-0000-000000000006";
    const { runner } = queueRunner([
      { code: 0, stdout: "cloudflared 2024.1.0\n", stderr: "" },
      { code: 0, stdout: JSON.stringify([{ id: tunnelId, name: "parachute" }]), stderr: "" },
      { code: 0, stdout: "", stderr: "" },
    ]);
    const { spawner } = fakeSpawner(42021);
    const transcript: string[] = [];

    // Filesystem locations all come from the throwaway sandbox.
    const paths = {
      manifestPath: sandbox.manifestPath,
      statePath: sandbox.statePath,
      exposeStatePath: sandbox.exposeStatePath,
      configPath: sandbox.configPath,
      logPath: sandbox.logPath,
      cloudflaredHome: sandbox.cloudflaredHome,
      configDir: sandbox.configDir,
    };

    const exitCode = await exposeCloudflareUp("docs.parachute.computer", {
      runner,
      spawner,
      alive: () => false,
      kill: () => {},
      connectorPids: () => [],
      // 203.0.113.10 is deliberately outside every Cloudflare range.
      resolveHost: async () => ["203.0.113.10"],
      log: (line) => transcript.push(line),
      ...paths,
      skipHub: true,
    });

    expect(exitCode).toBe(0);

    const text = transcript.join("\n");
    expect(text).toContain("not to Cloudflare's edge");
    expect(text).toContain("shadowed");
    expect(text).toContain("Pages project");
  } finally {
    sandbox.cleanup();
  }
});
|
|
699
|
+
|
|
700
|
+
test("hub#487: no DNS warning when hostname resolves at Cloudflare's edge", async () => {
  // Happy path for the DNS self-diagnosis: an address inside a Cloudflare
  // range must not trigger either warning.
  const sandbox = makeEnv();
  try {
    const tunnelId = "ffffffff-0000-0000-0000-000000000007";
    const { runner } = queueRunner([
      { code: 0, stdout: "cloudflared 2024.1.0\n", stderr: "" },
      { code: 0, stdout: JSON.stringify([{ id: tunnelId, name: "parachute" }]), stderr: "" },
      { code: 0, stdout: "", stderr: "" },
    ]);
    const { spawner } = fakeSpawner(42022);
    const transcript: string[] = [];

    // Filesystem locations all come from the throwaway sandbox.
    const paths = {
      manifestPath: sandbox.manifestPath,
      statePath: sandbox.statePath,
      exposeStatePath: sandbox.exposeStatePath,
      configPath: sandbox.configPath,
      logPath: sandbox.logPath,
      cloudflaredHome: sandbox.cloudflaredHome,
      configDir: sandbox.configDir,
    };

    const exitCode = await exposeCloudflareUp("vault.example.com", {
      runner,
      spawner,
      alive: () => false,
      kill: () => {},
      connectorPids: () => [],
      // 104.18.32.7 sits inside 104.16.0.0/13, one of Cloudflare's ranges.
      resolveHost: async () => ["104.18.32.7"],
      log: (line) => transcript.push(line),
      ...paths,
      skipHub: true,
    });

    expect(exitCode).toBe(0);

    const text = transcript.join("\n");
    expect(text).not.toContain("DNS isn't live yet");
    expect(text).not.toContain("not to Cloudflare's edge");
  } finally {
    sandbox.cleanup();
  }
});
|
|
738
|
+
|
|
554
739
|
test("two tunnels with different --tunnel-name coexist in state", async () => {
|
|
555
740
|
const env = makeEnv();
|
|
556
741
|
try {
|
|
@@ -929,6 +1114,46 @@ describe("exposeCloudflareOff", () => {
|
|
|
929
1114
|
}
|
|
930
1115
|
});
|
|
931
1116
|
|
|
1117
|
+
test("hub#487: off sweeps orphan connectors the state record didn't track", async () => {
  // `off` must stop the state-tracked connector AND any additional connector
  // the process scan reports for the same tunnel, then clear the state file.
  const env = makeEnv();
  try {
    const uuid = "abababab-0000-0000-0000-000000000009";
    writeCloudflaredState(
      {
        version: 2,
        tunnels: {
          parachute: {
            pid: 55555,
            tunnelUuid: uuid,
            tunnelName: "parachute",
            hostname: "vault.example.com",
            startedAt: "2026-04-22T12:00:00.000Z",
            configPath: env.configPath,
          },
        },
      },
      env.statePath,
    );
    const killed: number[] = [];
    const code = await exposeCloudflareOff({
      statePath: env.statePath,
      exposeStatePath: env.exposeStatePath,
      alive: () => true,
      kill: (pid) => killed.push(pid),
      // The scan reports the tracked pid (skipped — already signalled) plus an
      // untracked orphan 66666 serving the same tunnel.
      connectorPids: () => [55555, 66666],
      log: () => {},
    });
    expect(code).toBe(0);
    // Tracked pid stopped once, orphan also stopped — no double-kill of 55555.
    // Sort a copy numerically: the default Array#sort compares elements as
    // strings and sorts in place, mutating the captured list.
    expect([...killed].sort((a, b) => a - b)).toEqual([55555, 66666]);
    expect(existsSync(env.statePath)).toBe(false);
  } finally {
    env.cleanup();
  }
});
|
|
1156
|
+
|
|
932
1157
|
test("targets the named tunnel and leaves siblings intact", async () => {
|
|
933
1158
|
const env = makeEnv();
|
|
934
1159
|
try {
|
|
@@ -783,6 +783,159 @@ describe("parachute start", () => {
|
|
|
783
783
|
h.cleanup();
|
|
784
784
|
}
|
|
785
785
|
});
|
|
786
|
+
|
|
787
|
+
// hub#487 — readiness gating beyond the bare liveness settle. Aaron hit this
|
|
788
|
+
// on a fresh EC2 box: `parachute start vault` printed "✓ vault started" while
|
|
789
|
+
// the process died ~instantly on EADDRINUSE (an orphan held 1940), and
|
|
790
|
+
// `parachute status` then showed it inactive.
|
|
791
|
+
|
|
792
|
+
/**
 * Build a spawner stub that, besides recording each spawn request, writes
 * `content` into the requested log file. This lets the readiness-failure
 * path (log tail + EADDRINUSE detection) read a realistic boot error.
 *
 * @param pid - pid reported for every spawn.
 * @param content - text written to the log file on each spawn.
 * @returns a stub exposing the recorded `calls` and the `spawn` function.
 */
function makeSpawnerWithLog(pid: number, content: string): SpawnerStub {
  const recorded: SpawnerStub["calls"] = [];

  const spawn: SpawnerStub["spawn"] = (cmd, logFile, opts) => {
    // Copy `cmd` so later mutation by the caller cannot alter the record.
    recorded.push({ cmd: [...cmd], logFile, env: opts?.env, cwd: opts?.cwd });
    // NOTE(review): relies on the start path having created logFile's parent
    // directory (via ensureLogPath()) before spawning — confirm against caller.
    writeFileSync(logFile, content);
    return pid;
  };

  return { calls: recorded, spawn };
}
|
|
811
|
+
|
|
812
|
+
test("hub#487: EADDRINUSE in the log → port-in-use message + log tail, not ✓", async () => {
  // The service dies immediately and its log carries an EADDRINUSE line; the
  // start command must report the port conflict instead of claiming success.
  const harness = makeHarness();
  try {
    seedVault(harness.manifestPath);
    const bootLog = "booting vault…\nerror: listen EADDRINUSE: address already in use 0.0.0.0:1940\n";
    const spawner = makeSpawnerWithLog(4242, bootLog);
    const printed: string[] = [];

    const exitCode = await start("vault", {
      configDir: harness.configDir,
      manifestPath: harness.manifestPath,
      spawner,
      // Reports dead on every probe: the process exited right after the throw.
      alive: () => false,
      sleep: async () => {},
      startSettleMs: 1,
      log: (line) => printed.push(line),
    });

    expect(exitCode).toBe(1);
    expect(readPid("vault", harness.configDir)).toBeUndefined();

    const text = printed.join("\n");
    expect(text).toMatch(/port 1940 is already in use/);
    expect(text).toMatch(/lsof -ti:1940/);
    // The underlying boot error is echoed inline, sparing the operator a trip
    // to the log file.
    expect(text).toMatch(/EADDRINUSE/);
    expect(text).not.toMatch(/✓ vault started/);
  } finally {
    harness.cleanup();
  }
});
|
|
843
|
+
|
|
844
|
+
test("hub#487: process survives settle but never binds its port → failure with log tail", async () => {
  // The process looks alive on the first liveness probe, then disappears
  // without ever listening — the slow-EADDRINUSE / crash-after-boot shape.
  const harness = makeHarness();
  try {
    seedVault(harness.manifestPath);
    const spawner = makeSpawnerWithLog(4242, "vault crashed mid-boot\n");
    const printed: string[] = [];
    let livenessProbes = 0;

    const exitCode = await start("vault", {
      configDir: harness.configDir,
      manifestPath: harness.manifestPath,
      spawner,
      // Only the very first probe reports alive; every later one reports dead.
      alive: () => ++livenessProbes <= 1,
      sleep: async () => {},
      startSettleMs: 1,
      startReadyMs: 50,
      startReadyPollMs: 1,
      // The port never starts listening.
      portListening: async () => false,
      log: (line) => printed.push(line),
    });

    expect(exitCode).toBe(1);
    expect(readPid("vault", harness.configDir)).toBeUndefined();

    const text = printed.join("\n");
    expect(text).toMatch(/✗ vault failed to start/);
    expect(text).toMatch(/exited during startup/);
    expect(text).not.toMatch(/✓ vault started/);
  } finally {
    harness.cleanup();
  }
});
|
|
878
|
+
|
|
879
|
+
test("hub#487: alive but port silent past the window → non-fatal warning, exit 0", async () => {
  // A daemon that stays alive but has not bound its port within the readiness
  // window is treated as slow, not failed: warn, keep the pid, exit 0.
  const harness = makeHarness();
  try {
    seedVault(harness.manifestPath);
    const spawner = makeSpawner([4242]);
    const printed: string[] = [];

    const exitCode = await start("vault", {
      configDir: harness.configDir,
      manifestPath: harness.manifestPath,
      spawner,
      // The process never dies for the duration of the test.
      alive: () => true,
      sleep: async () => {},
      startSettleMs: 1,
      startReadyMs: 10,
      startReadyPollMs: 1,
      // Slow boot: the port is not accepting connections on any probe.
      portListening: async () => false,
      log: (line) => printed.push(line),
    });

    // Slow-but-alive is a warning, not a hard failure.
    expect(exitCode).toBe(0);
    expect(readPid("vault", harness.configDir)).toBe(4242);

    const text = printed.join("\n");
    expect(text).toMatch(/port 1940 isn't accepting connections yet/);
    expect(text).not.toMatch(/✓ vault started/);
  } finally {
    harness.cleanup();
  }
});
|
|
907
|
+
|
|
908
|
+
test("hub#487: alive + port listening → success", async () => {
  // The process stays alive and its port starts listening on the second
  // probe, so the readiness poll loop runs at least one extra iteration
  // before success is reported.
  const harness = makeHarness();
  try {
    seedVault(harness.manifestPath);
    const spawner = makeSpawner([4242]);
    const printed: string[] = [];
    let portProbes = 0;

    const exitCode = await start("vault", {
      configDir: harness.configDir,
      manifestPath: harness.manifestPath,
      spawner,
      alive: () => true,
      sleep: async () => {},
      startSettleMs: 1,
      startReadyMs: 50,
      startReadyPollMs: 1,
      // First probe: not listening. Second probe onward: bound.
      portListening: async () => ++portProbes >= 2,
      log: (line) => printed.push(line),
    });

    expect(exitCode).toBe(0);
    expect(readPid("vault", harness.configDir)).toBe(4242);
    expect(printed.join("\n")).toMatch(/✓ vault started \(pid 4242\)/);
  } finally {
    harness.cleanup();
  }
});
|
|
786
939
|
});
|
|
787
940
|
|
|
788
941
|
describe("parachute stop", () => {
|
|
@@ -1,3 +1,4 @@
|
|
|
1
|
+
import { spawnSync } from "node:child_process";
|
|
1
2
|
import { mkdirSync, openSync } from "node:fs";
|
|
2
3
|
import { dirname } from "node:path";
|
|
3
4
|
import { DEFAULT_TUNNEL_NAME, cloudflaredPathsFor, writeConfig } from "../cloudflare/config.ts";
|
|
@@ -100,11 +101,159 @@ const defaultKill: KillFn = (pid, signal) => {
|
|
|
100
101
|
process.kill(pid, signal);
|
|
101
102
|
};
|
|
102
103
|
|
|
104
|
+
/**
 * Find the PIDs of every running `cloudflared` connector serving THIS tunnel.
 * "This tunnel" is identified by either the tunnel UUID or the config.yml path
 * appearing on the process command line, so an unrelated cloudflared the
 * operator may be running for a different tunnel is left alone.
 *
 * The motivating bug (hub#487): each `parachute expose public --cloudflare`
 * "reused the tunnel" but spawned a fresh connector (new pid) without killing
 * the prior ones, and the state file only tracked the most-recent pid. Orphan
 * connectors accumulated, so edge routing became nondeterministic. Sweeping by
 * UUID/config-path catches the orphans that the single-pid state record misses
 * (prior runs that crashed mid-rewrite, or a connector started by hand).
 *
 * Injectable so tests assert the sweep without scanning live processes.
 */
export type ConnectorPidsFn = (tunnelUuid: string, configPath: string) => number[];

/**
 * Pure filter over a `<pid> <command line>` process listing (one row per line).
 *
 * A row is selected only when ALL of the following hold:
 *  - the pid parses as a positive integer and is not `selfPid`;
 *  - some argument's basename starts with `cloudflared`, i.e. the row is a
 *    cloudflared invocation rather than, say, an editor that merely has the
 *    config path open;
 *  - the command line contains the tunnel UUID or the config path.
 *
 * Empty tokens are ignored: `"".includes` semantics would otherwise match
 * every row and turn the sweep into a kill-all.
 *
 * @returns matching pids in listing order; `[]` when nothing matches.
 */
function matchConnectorPids(
  listing: string,
  tunnelUuid: string,
  configPath: string,
  selfPid: number,
): number[] {
  const tokens = [tunnelUuid, configPath].filter((token) => token.length > 0);
  if (tokens.length === 0) return [];

  const isCloudflaredArg = (arg: string): boolean =>
    arg.slice(arg.lastIndexOf("/") + 1).startsWith("cloudflared");

  const pids: number[] = [];
  for (const line of listing.split("\n")) {
    // `ps` right-aligns the pid column, so allow leading whitespace.
    const match = /^\s*(\d+)\s+(.*)$/.exec(line);
    if (!match) continue;
    const pid = Number.parseInt(match[1]!, 10);
    const cmdline = match[2]!;
    if (!Number.isSafeInteger(pid) || pid <= 0 || pid === selfPid) continue;
    if (!cmdline.split(/\s+/).some(isCloudflaredArg)) continue;
    if (tokens.some((token) => cmdline.includes(token))) pids.push(pid);
  }
  return pids;
}

/**
 * Production {@link ConnectorPidsFn}: lists every process with its full
 * command line via `ps` and keeps the cloudflared connectors for this tunnel.
 *
 * `ps -A -o pid= -o args=` is used instead of `pgrep -fl`: on procps-ng
 * (Linux) `pgrep -l` prints only the process name even with `-f`, so the
 * UUID/config-path filter could never match there, while the flag that does
 * print arguments (`-a`) means something else on BSD/macOS. Separate `-o`
 * options avoid the header-parsing ambiguity of `-o pid=,args=`; `-ww` asks
 * for untruncated command lines. Windows is out of scope.
 *
 * Any failure (missing `ps`, non-zero exit, timeout, oversized output) yields
 * `[]`, leaving the caller with just the state-tracked pid.
 */
export const defaultConnectorPids: ConnectorPidsFn = (tunnelUuid, configPath) => {
  try {
    const result = spawnSync("ps", ["-A", "-ww", "-o", "pid=", "-o", "args="], {
      encoding: "utf8",
      timeout: 2000,
      // Full command lines for every process can exceed the 1 MiB default.
      maxBuffer: 16 * 1024 * 1024,
    });
    if (result.status !== 0 || typeof result.stdout !== "string") return [];
    return matchConnectorPids(result.stdout, tunnelUuid, configPath, process.pid);
  } catch {
    return [];
  }
};
|
|
158
|
+
|
|
159
|
+
/**
 * Resolve a hostname to its addresses. Returns [] when the name doesn't
 * resolve (lookup error, no records, or no answer in time) — the signal the
 * DNS self-diagnosis keys on. Injectable so tests drive each case
 * (unresolved / Cloudflare / non-Cloudflare) deterministically.
 */
export type ResolveHostFn = (hostname: string) => Promise<string[]>;

/** Upper bound on how long the default resolver waits for an answer. */
const RESOLVE_HOST_TIMEOUT_MS = 3000;

/**
 * Production {@link ResolveHostFn} backed by `Bun.dns.lookup`.
 *
 * `family: 0` requests both address families in one call. The lookup is raced
 * against a timer so a hung resolver cannot stall the expose: on timeout, as
 * on any lookup error, the result is `[]`. Never rejects.
 *
 * @param hostname - name to resolve.
 * @returns the non-empty string addresses from the lookup, or `[]` on failure.
 */
export const defaultResolveHost: ResolveHostFn = async (hostname) => {
  let timer: ReturnType<typeof setTimeout> | undefined;
  try {
    const timedOut = new Promise<never>((_, reject) => {
      timer = setTimeout(() => reject(new Error("DNS lookup timed out")), RESOLVE_HOST_TIMEOUT_MS);
    });
    const records = await Promise.race([Bun.dns.lookup(hostname, { family: 0 }), timedOut]);
    return records.map((r) => r.address).filter((a) => typeof a === "string" && a.length > 0);
  } catch {
    return [];
  } finally {
    // Always cancel the timer so it neither fires late nor keeps the process alive.
    if (timer !== undefined) clearTimeout(timer);
  }
};
|
|
178
|
+
|
|
179
|
+
/**
 * Cloudflare's published anycast IPv4 ranges (the proxy edge), as
 * `[network, prefixLength]` pairs. A proxied (orange-cloud) record — which is
 * what `cloudflared tunnel route dns` creates — resolves to one of these. If
 * the hostname resolves to something *outside* Cloudflare's ranges, it's
 * almost certainly shadowed: a Pages project, an A record, or a grey-cloud
 * CNAME pointing elsewhere. Source: https://www.cloudflare.com/ips/
 */
const CLOUDFLARE_V4_RANGES: ReadonlyArray<readonly [string, number]> = [
  ["173.245.48.0", 20],
  ["103.21.244.0", 22],
  ["103.22.200.0", 22],
  ["103.31.4.0", 22],
  ["141.101.64.0", 18],
  ["108.162.192.0", 18],
  ["190.93.240.0", 20],
  ["188.114.96.0", 20],
  ["197.234.240.0", 22],
  ["198.41.128.0", 17],
  ["162.158.0.0", 15],
  ["104.16.0.0", 13],
  ["104.24.0.0", 14],
  ["172.64.0.0", 13],
  ["131.0.72.0", 22],
];

/**
 * Cloudflare's published IPv6 ranges, same `[network, prefixLength]` shape.
 * Every prefix is at most 32 bits, so membership only needs the first two
 * hextets of an address. Source: https://www.cloudflare.com/ips/
 */
const CLOUDFLARE_V6_RANGES: ReadonlyArray<readonly [string, number]> = [
  ["2400:cb00::", 32],
  ["2606:4700::", 32],
  ["2803:f800::", 32],
  ["2405:b500::", 32],
  ["2405:8100::", 32],
  ["2a06:98c0::", 29],
  ["2c0f:f248::", 32],
];

/**
 * Parse a dotted-quad IPv4 address into an unsigned 32-bit number.
 * Each octet must be 1–3 decimal digits in 0..255; anything else (trailing
 * junk, signs, too few or too many parts) yields `undefined`.
 */
function ipv4ToInt(ip: string): number | undefined {
  const parts = ip.split(".");
  if (parts.length !== 4) return undefined;
  let n = 0;
  for (const part of parts) {
    // Strict digit check: parseInt alone would accept inputs like "1junk".
    if (!/^\d{1,3}$/.test(part)) return undefined;
    const octet = Number(part);
    if (octet > 255) return undefined;
    n = n * 256 + octet;
  }
  return n;
}

/**
 * Return the top 32 bits (first two hextets) of an IPv6 address as a number,
 * or `undefined` when those hextets cannot be read. Groups elided by a
 * leading `::` count as zero; a `%zone` suffix is ignored. The remainder of
 * the address is not validated.
 */
function ipv6Top32(address: string): number | undefined {
  const bare = (address.split("%")[0] ?? "").toLowerCase();
  const pieces = bare.split("::");
  if (pieces.length > 2) return undefined; // "::" may appear at most once
  const head = pieces[0] ?? "";
  const groups = head.length === 0 ? [] : head.split(":");
  // Without "::" nothing is elided, so both hextets must be spelled out.
  if (pieces.length === 1 && groups.length < 2) return undefined;
  const hextet = (group: string | undefined): number | undefined => {
    if (group === undefined) return 0; // elided by "::"
    return /^[0-9a-f]{1,4}$/.test(group) ? Number.parseInt(group, 16) : undefined;
  };
  const hi = hextet(groups[0]);
  const lo = hextet(groups[1]);
  if (hi === undefined || lo === undefined) return undefined;
  return hi * 0x10000 + lo;
}

/**
 * True when the 32-bit `value` shares its first `bits` bits with `base`.
 * Uses division rather than bitwise masks to stay clear of signed 32-bit
 * arithmetic.
 */
function inCidr32(value: number, base: number, bits: number): boolean {
  const span = 2 ** (32 - bits);
  return Math.floor(value / span) === Math.floor(base / span);
}

/** True when `addr` is a well-formed IPv4 address inside a Cloudflare range. */
function isCloudflareV4(addr: string): boolean {
  const ip = ipv4ToInt(addr);
  if (ip === undefined) return false;
  return CLOUDFLARE_V4_RANGES.some(([base, bits]) => {
    const baseInt = ipv4ToInt(base);
    return baseInt !== undefined && inCidr32(ip, baseInt, bits);
  });
}

/** True when `addr` is an IPv6 address inside a Cloudflare range. */
function isCloudflareV6(addr: string): boolean {
  const top = ipv6Top32(addr);
  if (top === undefined) return false;
  return CLOUDFLARE_V6_RANGES.some(([base, bits]) => {
    const baseTop = ipv6Top32(base);
    return baseTop !== undefined && inCidr32(top, baseTop, bits);
  });
}

/**
 * True if any resolved address belongs to Cloudflare's edge.
 *
 * IPv4 addresses are checked against the v4 ranges, IPv6 addresses against
 * the v6 ranges, and IPv4-mapped IPv6 addresses (`::ffff:a.b.c.d`) by their
 * embedded IPv4 part. Malformed entries are ignored; an empty list is false.
 */
export function looksLikeCloudflare(addresses: readonly string[]): boolean {
  return addresses.some((addr) => {
    if (!addr.includes(":")) return isCloudflareV4(addr);
    const mapped = /^::ffff:(\d{1,3}(?:\.\d{1,3}){3})$/i.exec(addr);
    if (mapped) return isCloudflareV4(mapped[1]!);
    return isCloudflareV6(addr);
  });
}
|
|
237
|
+
|
|
103
238
|
export interface ExposeCloudflareOpts {
|
|
104
239
|
runner?: Runner;
|
|
105
240
|
spawner?: CloudflaredSpawner;
|
|
106
241
|
alive?: AliveFn;
|
|
107
242
|
kill?: KillFn;
|
|
243
|
+
/**
|
|
244
|
+
* Find every running cloudflared connector PID serving this tunnel (by UUID
|
|
245
|
+
* or config-path match). Used to sweep orphan connectors before spawning a
|
|
246
|
+
* fresh one (hub#487). Tests inject a stub; production uses
|
|
247
|
+
* `defaultConnectorPids` (a filtered `pgrep -fl cloudflared`).
|
|
248
|
+
*/
|
|
249
|
+
connectorPids?: ConnectorPidsFn;
|
|
250
|
+
/**
|
|
251
|
+
* Resolve a hostname to its addresses, for the post-route DNS self-diagnosis
|
|
252
|
+
* (hub#487). Returns the resolved IPs (empty when NXDOMAIN / not yet live).
|
|
253
|
+
* Best-effort and non-fatal — a failure to resolve never blocks the expose.
|
|
254
|
+
* Tests inject a stub; production uses `defaultResolveHost` (Bun DNS).
|
|
255
|
+
*/
|
|
256
|
+
resolveHost?: ResolveHostFn;
|
|
108
257
|
log?: (line: string) => void;
|
|
109
258
|
manifestPath?: string;
|
|
110
259
|
statePath?: string;
|
|
@@ -186,6 +335,8 @@ interface Resolved {
|
|
|
186
335
|
spawner: CloudflaredSpawner;
|
|
187
336
|
alive: AliveFn;
|
|
188
337
|
kill: KillFn;
|
|
338
|
+
connectorPids: ConnectorPidsFn;
|
|
339
|
+
resolveHost: ResolveHostFn;
|
|
189
340
|
log: (line: string) => void;
|
|
190
341
|
manifestPath: string;
|
|
191
342
|
statePath: string;
|
|
@@ -217,6 +368,17 @@ function resolve(opts: ExposeCloudflareOpts): Resolved {
|
|
|
217
368
|
spawner: opts.spawner ?? defaultCloudflaredSpawner,
|
|
218
369
|
alive: opts.alive ?? defaultAlive,
|
|
219
370
|
kill: opts.kill ?? defaultKill,
|
|
371
|
+
// Defaulting policy mirrors lifecycle's startReadyMs (hub#487): the real
|
|
372
|
+
// implementations shell out (`pgrep`) / hit the network (DNS). When a test
|
|
373
|
+
// injects a fake `spawner` but no explicit seam, fall back to inert stubs
|
|
374
|
+
// (no orphans found; "resolves at Cloudflare" → no DNS warning) so suites
|
|
375
|
+
// stay deterministic and offline. Production (no spawner override) always
|
|
376
|
+
// gets the real `pgrep` sweep + DNS diagnosis.
|
|
377
|
+
connectorPids:
|
|
378
|
+
opts.connectorPids ?? (opts.spawner === undefined ? defaultConnectorPids : () => []),
|
|
379
|
+
resolveHost:
|
|
380
|
+
opts.resolveHost ??
|
|
381
|
+
(opts.spawner === undefined ? defaultResolveHost : async () => ["104.16.0.1"]),
|
|
220
382
|
log: opts.log ?? ((line) => console.log(line)),
|
|
221
383
|
manifestPath: opts.manifestPath ?? SERVICES_MANIFEST_PATH,
|
|
222
384
|
statePath: opts.statePath ?? CLOUDFLARED_STATE_PATH,
|
|
@@ -261,6 +423,49 @@ function printAuthGuidance(log: (line: string) => void, vaultUrl: string): void
|
|
|
261
423
|
log(` ${AUTH_DOC_URL}`);
|
|
262
424
|
}
|
|
263
425
|
|
|
426
|
+
/**
 * Best-effort registrable-zone guess: the last two non-empty labels of the
 * hostname (`vault.example.com` → `example.com`, `gitcoin.parachute.computer`
 * → `parachute.computer`). Hostnames with two or fewer labels come back as
 * those labels joined by dots, so a fully-qualified trailing dot
 * (`example.com.`) is dropped the same way it is for longer names.
 *
 * This is a heuristic — multi-label public suffixes (`foo.co.uk`) would guess
 * `co.uk` — but it's only used to phrase the `dig +short <zone> NS` remedy,
 * where being off by a label is a harmless nudge.
 *
 * @param hostname - hostname being exposed.
 * @returns the guessed zone; an empty string when the input has no labels.
 */
function guessZone(hostname: string): string {
  const labels = hostname.split(".").filter((label) => label.length > 0);
  // slice(-2) yields every label when there are two or fewer.
  return labels.slice(-2).join(".");
}
|
|
439
|
+
|
|
440
|
+
/**
 * Non-fatal post-route DNS diagnosis. Resolves `hostname` through
 * `r.resolveHost` and logs a warning via `r.log` in two cases:
 *  - nothing resolves (e.g. a pending zone whose nameservers aren't switched);
 *  - the name resolves, but to no address inside Cloudflare's ranges (e.g.
 *    shadowed by another record or a Pages project).
 *
 * Never throws and never affects the exit code: a resolver that rejects is
 * treated as "nothing to report", so the worst case is no output.
 *
 * @param hostname - public hostname that was just routed to the tunnel.
 * @param r - resolved options; only `resolveHost` and `log` are used.
 */
async function diagnoseDns(hostname: string, r: Resolved): Promise<void> {
  let addresses: string[];
  try {
    addresses = await r.resolveHost(hostname);
  } catch {
    // The default resolver swallows its own errors, but an injected one may
    // reject. Diagnosis is best-effort, so stay silent rather than abort.
    return;
  }

  const zone = guessZone(hostname);

  if (addresses.length === 0) {
    r.log("");
    r.log(`⚠ DNS isn't live yet for ${hostname}.`);
    r.log(`  If ${zone} is a new Cloudflare zone, its nameservers may not be switched at your`);
    r.log("  registrar yet. Check with:");
    r.log(`    dig +short ${zone} NS    # should list *.ns.cloudflare.com`);
    r.log("  Propagation can take minutes to hours. The tunnel itself is up — the URLs below");
    r.log("  will start working once DNS resolves.");
    return;
  }

  if (!looksLikeCloudflare(addresses)) {
    r.log("");
    r.log(`⚠ ${hostname} resolves (${addresses.join(", ")}) but not to Cloudflare's edge.`);
    r.log(`  It may be shadowed by another DNS record or a Cloudflare Pages project on ${zone}.`);
    r.log("  Ensure it's a proxied (orange-cloud) CNAME to the tunnel — check");
    r.log(`    https://dash.cloudflare.com → DNS for ${zone}. A grey-cloud / A record / Pages`);
    r.log("    binding on this hostname will 404 the tunnel at the edge.");
  }
}
|
|
468
|
+
|
|
264
469
|
export async function exposeCloudflareUp(
|
|
265
470
|
hostname: string,
|
|
266
471
|
opts: ExposeCloudflareOpts = {},
|
|
@@ -390,6 +595,19 @@ export async function exposeCloudflareUp(
|
|
|
390
595
|
}
|
|
391
596
|
r.log("✓ DNS routed.");
|
|
392
597
|
|
|
598
|
+
// Post-route DNS self-diagnosis (hub#487). `cloudflared tunnel route dns`
|
|
599
|
+
// can succeed (the CNAME is written in Cloudflare's API) while the hostname
|
|
600
|
+
// is still NOT actually serving the tunnel — two shapes Aaron hit:
|
|
601
|
+
// (a) a "pending" zone whose nameservers aren't switched at the registrar
|
|
602
|
+
// yet, so the record exists in Cloudflare but nothing resolves; and
|
|
603
|
+
// (b) a subdomain shadowed by a Cloudflare Pages project on the same zone,
|
|
604
|
+
// so the edge 404s the tunnel.
|
|
605
|
+
// Both previously printed "✓ DNS routed" + the URLs as if fine. This check
|
|
606
|
+
// is best-effort and strictly NON-FATAL — it only adds a warning; it never
|
|
607
|
+
// changes the exit code or blocks the expose. Fast: one DNS lookup with a
|
|
608
|
+
// built-in timeout in `resolveHost`.
|
|
609
|
+
await diagnoseDns(hostname, r);
|
|
610
|
+
|
|
393
611
|
const credsFile = credentialsPath(tunnel.id, r.cloudflaredHome);
|
|
394
612
|
writeConfig(
|
|
395
613
|
{
|
|
@@ -408,12 +626,28 @@ export async function exposeCloudflareUp(
|
|
|
408
626
|
);
|
|
409
627
|
r.log(`✓ Wrote ${r.configPath}`);
|
|
410
628
|
|
|
629
|
+
// Orphan-connector sweep (hub#487). Before spawning a fresh connector, kill
|
|
630
|
+
// EVERY cloudflared connector currently serving this tunnel so exactly one
|
|
631
|
+
// process serves the config.yml we just wrote. Pre-fix, each re-expose
|
|
632
|
+
// spawned a new connector without killing the prior ones (state tracked only
|
|
633
|
+
// the most-recent pid), so orphans accumulated and edge routing became
|
|
634
|
+
// nondeterministic. We union two sources:
|
|
635
|
+
// - the pid recorded in cloudflared-state.json (the prior `parachute`-
|
|
636
|
+
// spawned connector for this tunnel name), and
|
|
637
|
+
// - any pid found by scanning running processes for this tunnel's UUID or
|
|
638
|
+
// config path (catches orphans the state file lost track of — crashed
|
|
639
|
+
// mid-rewrite, or started by hand for this tunnel).
|
|
411
640
|
const stateBefore = readCloudflaredState(r.statePath);
|
|
412
641
|
const prior = findTunnelRecord(stateBefore, r.tunnelName);
|
|
413
|
-
|
|
642
|
+
const toKill = new Set<number>();
|
|
643
|
+
if (prior && r.alive(prior.pid)) toKill.add(prior.pid);
|
|
644
|
+
for (const pid of r.connectorPids(tunnel.id, r.configPath)) {
|
|
645
|
+
if (r.alive(pid)) toKill.add(pid);
|
|
646
|
+
}
|
|
647
|
+
for (const deadPid of toKill) {
|
|
414
648
|
try {
|
|
415
|
-
r.kill(
|
|
416
|
-
r.log(`Stopped prior cloudflared (pid ${
|
|
649
|
+
r.kill(deadPid, "SIGTERM");
|
|
650
|
+
r.log(`Stopped prior cloudflared connector (pid ${deadPid}).`);
|
|
417
651
|
} catch {
|
|
418
652
|
// Process is already gone — safe to ignore; we replace the record below.
|
|
419
653
|
}
|
|
@@ -530,6 +764,18 @@ export async function exposeCloudflareOff(opts: ExposeCloudflareOpts = {}): Prom
|
|
|
530
764
|
} else {
|
|
531
765
|
r.log(`cloudflared (pid ${record.pid}) wasn't running; clearing stale state.`);
|
|
532
766
|
}
|
|
767
|
+
// Sweep any orphan connectors for this tunnel that the state record didn't
|
|
768
|
+
// track (hub#487) so `off` leaves exactly zero connectors serving it. Match
|
|
769
|
+
// by UUID/config-path; skip the record pid we already signalled above.
|
|
770
|
+
for (const orphanPid of r.connectorPids(record.tunnelUuid, record.configPath)) {
|
|
771
|
+
if (orphanPid === record.pid || !r.alive(orphanPid)) continue;
|
|
772
|
+
try {
|
|
773
|
+
r.kill(orphanPid, "SIGTERM");
|
|
774
|
+
r.log(`✓ Stopped orphan cloudflared connector (pid ${orphanPid}).`);
|
|
775
|
+
} catch {
|
|
776
|
+
// Already gone between probe and kill — fine.
|
|
777
|
+
}
|
|
778
|
+
}
|
|
533
779
|
const stateAfter = withoutTunnelRecord(stateBefore, r.tunnelName);
|
|
534
780
|
if (stateAfter) {
|
|
535
781
|
writeCloudflaredState(stateAfter, r.statePath);
|
|
@@ -1,4 +1,5 @@
|
|
|
1
|
-
import { existsSync, openSync } from "node:fs";
|
|
1
|
+
import { existsSync, openSync, readFileSync } from "node:fs";
|
|
2
|
+
import { Socket } from "node:net";
|
|
2
3
|
import { join } from "node:path";
|
|
3
4
|
import { CONFIG_DIR, SERVICES_MANIFEST_PATH } from "../config.ts";
|
|
4
5
|
import { readEnvFileValues } from "../env-file.ts";
|
|
@@ -84,6 +85,44 @@ export const defaultSpawner: Spawner = {
|
|
|
84
85
|
/** Shape of the injectable "send `signal` to process `pid`" function; it returns nothing. */
export type KillFn = (pid: number, signal: NodeJS.Signals | number) => void;
|
|
85
86
|
/** Shape of the injectable delay function: takes a duration in milliseconds and returns a promise that resolves with no value. */
export type SleepFn = (ms: number) => Promise<void>;
|
|
86
87
|
|
|
88
|
+
/**
 * "Is something listening on this TCP port on loopback?" seam. Pairs with the
 * spawn-then-die settle (hub#194) to catch the *other* silent-start failure
 * shape (hub#487): a service that lives long enough to clear the liveness
 * check but never binds its port because the port is already held (EADDRINUSE
 * from an orphan). The recorded pid stays alive (vault's process supervisor
 * retries / lingers) so `alive(pid)` says "running" while `parachute status`
 * shows it inactive because nothing answers on the port.
 *
 * Implementations resolve to a boolean and are expected not to reject.
 * Tests inject a deterministic stub; production uses `defaultPortListening`.
 */
export type PortListeningFn = (port: number) => Promise<boolean>;

/**
 * Connect-probe: open a TCP socket to 127.0.0.1:<port> and see if it's
 * accepted. A successful connect means *something* is listening; the socket
 * is destroyed immediately afterwards. Connection refused, a 1000ms socket
 * timeout, or any other socket error resolves `false`.
 *
 * `node:net` rather than `Bun.connect` because the latter has no clean
 * "connection refused → false" without a custom socket handler, and the net
 * Socket's `error`/`connect` events map directly onto the boolean we want.
 *
 * @param port TCP port to probe on 127.0.0.1.
 * @returns `true` if the connection was accepted, `false` otherwise —
 *   including when `port` is not a valid port number. The promise never
 *   rejects.
 */
export const defaultPortListening: PortListeningFn = (port) =>
  new Promise((resolve) => {
    const socket = new Socket();
    let settled = false;
    // Single exit point: the first outcome wins, later events are ignored.
    const done = (listening: boolean): void => {
      if (settled) return;
      settled = true;
      socket.destroy();
      resolve(listening);
    };
    socket.setTimeout(1000);
    socket.once("connect", () => done(true));
    socket.once("timeout", () => done(false));
    socket.once("error", () => done(false));
    try {
      socket.connect(port, "127.0.0.1");
    } catch {
      // `connect` validates the port synchronously and throws on an
      // out-of-range / non-numeric value. Thrown inside this executor that
      // would reject the promise; an unusable port simply means "nothing is
      // listening there", so report `false` instead.
      done(false);
    }
  });
|
|
125
|
+
|
|
87
126
|
/**
|
|
88
127
|
* Group-aware liveness: returns true if the process group (pgid == pid)
|
|
89
128
|
* still has any member. Pairs with `defaultSpawner`'s `detached: true` —
|
|
@@ -130,6 +169,35 @@ export const defaultKill: KillFn = (pid, signal) => {
|
|
|
130
169
|
|
|
131
170
|
/** Timer-backed delay: the returned promise resolves (with no value) once `ms` milliseconds have elapsed. */
export const defaultSleep: SleepFn = (ms) =>
  new Promise((wake) => {
    setTimeout(wake, ms);
  });
|
|
132
171
|
|
|
172
|
+
/**
 * Read the trailing `n` lines of a logfile, best-effort. Used to surface the
 * real boot error when a start fails — operators shouldn't have to manually
 * `tail` the log to learn *why* the daemon died.
 *
 * The whole file is read as UTF-8; one trailing line terminator is dropped so
 * a normally-terminated log does not yield a phantom empty last line. Both
 * `\n` and `\r\n` line endings are accepted.
 *
 * @param logFile Path of the logfile to read.
 * @param n Maximum number of trailing lines to return.
 * @returns Up to `n` trailing lines, oldest first. Returns [] when the file
 *   is empty, when `n` is not a positive number, or on any read error
 *   (missing file, permissions) so the caller falls back to the generic
 *   "tail the log" hint without throwing.
 */
function readLogTail(logFile: string, n: number): string[] {
  // `slice(-0)` is `slice(0)`: without this guard a request for zero lines
  // would return the entire file. `!(n > 0)` also rejects NaN.
  if (!(n > 0)) return [];
  try {
    const content = readFileSync(logFile, "utf8");
    const trimmed = content.replace(/\r?\n$/, "");
    if (trimmed === "") return [];
    return trimmed.split(/\r?\n/).slice(-n);
  } catch {
    // Deliberate best-effort: an unreadable log is reported as "no tail".
    return [];
  }
}
|
|
189
|
+
|
|
190
|
+
/**
 * Heuristic port-collision detector over a logfile tail. There is no
 * structured error shared across cloudflared, Bun, and Node, so this matches
 * the recognizable phrases instead: `EADDRINUSE`, "address already in use",
 * or "port … in use", case-insensitively, anywhere in a line.
 *
 * False positives are harmless — the worst case is that the port-in-use
 * remedy is *also* printed on an unrelated failure, which is still actionable.
 *
 * @param logTail Lines from the end of a service logfile.
 * @returns `true` as soon as any line matches, `false` otherwise (including
 *   for an empty tail).
 */
function detectAddrInUse(logTail: readonly string[]): boolean {
  const portCollision = /EADDRINUSE|address already in use|port .* in use/i;
  for (const entry of logTail) {
    if (portCollision.test(entry)) return true;
  }
  return false;
}
|
|
200
|
+
|
|
133
201
|
export interface LifecycleOpts {
|
|
134
202
|
spawner?: Spawner;
|
|
135
203
|
kill?: KillFn;
|
|
@@ -161,6 +229,30 @@ export interface LifecycleOpts {
|
|
|
161
229
|
* settle.
|
|
162
230
|
*/
|
|
163
231
|
startSettleMs?: number;
|
|
232
|
+
/**
|
|
233
|
+
* Probe whether the service's port is listening, post-spawn. Pairs with the
|
|
234
|
+
* settle (hub#194) to catch the EADDRINUSE-orphan shape (hub#487): the
|
|
235
|
+
* process survives the liveness window (vault lingers / retries) but never
|
|
236
|
+
* binds because the port is already held, so `start` would otherwise report
|
|
237
|
+
* "✓ started" while `status` shows it inactive. Tests inject a stub;
|
|
238
|
+
* production uses `defaultPortListening` (a loopback TCP connect probe).
|
|
239
|
+
*/
|
|
240
|
+
portListening?: PortListeningFn;
|
|
241
|
+
/**
|
|
242
|
+
* How long `start` polls for the service to bind its port after the
|
|
243
|
+
* liveness settle passes. Default 4000ms in production — long enough to
|
|
244
|
+
* cover vault/scribe cold-boot (DB open, route registration) without making
|
|
245
|
+
* a healthy start feel laggy. Polled at `startReadyPollMs` intervals; the
|
|
246
|
+
* first time the port answers we declare success. If the window elapses
|
|
247
|
+
* with the process still alive but the port silent, we print a non-fatal
|
|
248
|
+
* warning (the daemon may still be coming up) rather than failing — only a
|
|
249
|
+
* *dead* process is a hard failure. Defaulting policy mirrors
|
|
250
|
+
* `startSettleMs`: 0 (skipped) unless `portListening` is injected or the
|
|
251
|
+
* production path (no spawner override) is active.
|
|
252
|
+
*/
|
|
253
|
+
startReadyMs?: number;
|
|
254
|
+
/** Poll interval while waiting for the port to come up. Default 200ms. */
|
|
255
|
+
startReadyPollMs?: number;
|
|
164
256
|
/**
|
|
165
257
|
* Override the hub origin passed to services as PARACHUTE_HUB_ORIGIN. If
|
|
166
258
|
* unset, `start` derives it from `expose-state.json` (when exposed) or
|
|
@@ -194,6 +286,9 @@ interface Resolved {
|
|
|
194
286
|
killWaitMs: number;
|
|
195
287
|
pollIntervalMs: number;
|
|
196
288
|
startSettleMs: number;
|
|
289
|
+
portListening: PortListeningFn;
|
|
290
|
+
startReadyMs: number;
|
|
291
|
+
startReadyPollMs: number;
|
|
197
292
|
hubOrigin: string | undefined;
|
|
198
293
|
ensureHub: (opts: EnsureHubOpts) => Promise<EnsureHubResult>;
|
|
199
294
|
stopHubFn: (opts: StopHubOpts) => Promise<boolean>;
|
|
@@ -220,6 +315,16 @@ function resolve(opts: LifecycleOpts): Resolved {
|
|
|
220
315
|
// override `alive`, which re-enables the default 250ms.
|
|
221
316
|
startSettleMs:
|
|
222
317
|
opts.startSettleMs ?? (opts.spawner === undefined || opts.alive !== undefined ? 250 : 0),
|
|
318
|
+
portListening: opts.portListening ?? defaultPortListening,
|
|
319
|
+
// Same defaulting policy as startSettleMs: production (no spawner
|
|
320
|
+
// override) gets the real 4s readiness window; tests that inject a stub
|
|
321
|
+
// spawner get 0 (skipped) unless they explicitly opt in via
|
|
322
|
+
// `portListening` or `startReadyMs`, so existing stub-spawner tests don't
|
|
323
|
+
// start probing a fake port.
|
|
324
|
+
startReadyMs:
|
|
325
|
+
opts.startReadyMs ??
|
|
326
|
+
(opts.spawner === undefined || opts.portListening !== undefined ? 4000 : 0),
|
|
327
|
+
startReadyPollMs: opts.startReadyPollMs ?? 200,
|
|
223
328
|
hubOrigin: resolveHubOrigin(opts.hubOrigin, configDir),
|
|
224
329
|
ensureHub: opts.hub?.ensureRunning ?? ensureHubRunning,
|
|
225
330
|
stopHubFn: opts.hub?.stop ?? stopHub,
|
|
@@ -464,21 +569,97 @@ export async function start(svc: string | undefined, opts: LifecycleOpts = {}):
|
|
|
464
569
|
}
|
|
465
570
|
writePid(short, pid, r.configDir);
|
|
466
571
|
|
|
467
|
-
//
|
|
468
|
-
//
|
|
469
|
-
//
|
|
470
|
-
//
|
|
471
|
-
//
|
|
472
|
-
//
|
|
572
|
+
// Boot-readiness gating (hub#194 + hub#487). A spawn returning a pid only
|
|
573
|
+
// proves the kernel forked the process — it says nothing about whether the
|
|
574
|
+
// service survived its boot or bound its port. Two silent-start shapes:
|
|
575
|
+
//
|
|
576
|
+
// (1) spawn-then-immediately-die (hub#194): the child throws before
|
|
577
|
+
// listening (notes-serve's Bun.resolveSync failing for bun-linked
|
|
578
|
+
// installs) and exits microseconds later. Caught by the settle below.
|
|
579
|
+
//
|
|
580
|
+
// (2) alive-but-never-bound (hub#487): the port is already held by an
|
|
581
|
+
// orphan, the child hits EADDRINUSE, but its process *lingers* (or a
|
|
582
|
+
// supervisor retries) long enough to clear the liveness check. `start`
|
|
583
|
+
// would report "✓ started" while `parachute status` shows it inactive
|
|
584
|
+
// because nothing answers on the port. Aaron hit exactly this with an
|
|
585
|
+
// orphan holding vault's 1940 on a fresh EC2 box. Caught by the
|
|
586
|
+
// port-readiness poll below.
|
|
587
|
+
//
|
|
588
|
+
// On any failure we surface the tail of the logfile so the operator sees
|
|
589
|
+
// the real boot error inline, and we specifically call out EADDRINUSE with
|
|
590
|
+
// the `lsof -ti:<port>` remedy.
|
|
591
|
+
const reportStartFailure = (reason: string): void => {
|
|
592
|
+
clearPid(short, r.configDir);
|
|
593
|
+
failures++;
|
|
594
|
+
const tail = readLogTail(logFile, 20);
|
|
595
|
+
if (detectAddrInUse(tail)) {
|
|
596
|
+
r.log(
|
|
597
|
+
`✗ ${short} failed to start: port ${entry.port} is already in use. Stop the existing process first — find it with \`lsof -ti:${entry.port}\` (then \`kill <pid>\`), or run \`parachute restart ${short}\`.`,
|
|
598
|
+
);
|
|
599
|
+
} else {
|
|
600
|
+
r.log(`✗ ${short} failed to start: ${reason}`);
|
|
601
|
+
}
|
|
602
|
+
if (tail.length > 0) {
|
|
603
|
+
r.log(` ── last ${tail.length} log line(s) (${logFile}) ──`);
|
|
604
|
+
for (const line of tail) r.log(` │ ${line}`);
|
|
605
|
+
} else {
|
|
606
|
+
r.log(` Tail the log for details: tail -50 ${logFile}`);
|
|
607
|
+
}
|
|
608
|
+
};
|
|
609
|
+
|
|
473
610
|
if (r.startSettleMs > 0) {
|
|
474
611
|
await r.sleep(r.startSettleMs);
|
|
475
612
|
if (!r.alive(pid)) {
|
|
476
|
-
|
|
477
|
-
|
|
613
|
+
reportStartFailure(
|
|
614
|
+
`spawned pid ${pid} but the process exited within ${r.startSettleMs}ms.`,
|
|
615
|
+
);
|
|
616
|
+
continue;
|
|
617
|
+
}
|
|
618
|
+
}
|
|
619
|
+
|
|
620
|
+
// Port-readiness poll (hub#487). The process is alive; now confirm it
|
|
621
|
+
// actually bound its port before claiming success. Poll up to
|
|
622
|
+
// `startReadyMs`, re-checking liveness each iteration so a *later* death
|
|
623
|
+
// (e.g. a slow EADDRINUSE crash) is still reported as a failure. A process
|
|
624
|
+
// that stays alive but never binds within the window gets a non-fatal
|
|
625
|
+
// warning rather than a hard failure — some daemons legitimately do slow
|
|
626
|
+
// boot work, and we'd rather not flip a healthy-but-slow start to red.
|
|
627
|
+
if (r.startReadyMs > 0) {
|
|
628
|
+
const deadline = r.now() + r.startReadyMs;
|
|
629
|
+
let listening = false;
|
|
630
|
+
let died = false;
|
|
631
|
+
while (r.now() < deadline) {
|
|
632
|
+
if (!r.alive(pid)) {
|
|
633
|
+
died = true;
|
|
634
|
+
break;
|
|
635
|
+
}
|
|
636
|
+
if (await r.portListening(entry.port)) {
|
|
637
|
+
listening = true;
|
|
638
|
+
break;
|
|
639
|
+
}
|
|
640
|
+
await r.sleep(r.startReadyPollMs);
|
|
641
|
+
}
|
|
642
|
+
if (died) {
|
|
643
|
+
reportStartFailure(`spawned pid ${pid} but the process exited during startup.`);
|
|
644
|
+
continue;
|
|
645
|
+
}
|
|
646
|
+
if (!listening) {
|
|
647
|
+
// Last-chance liveness check — the loop may have exited on the
|
|
648
|
+
// deadline right as the process died.
|
|
649
|
+
if (!r.alive(pid)) {
|
|
650
|
+
reportStartFailure(`spawned pid ${pid} but the process exited during startup.`);
|
|
651
|
+
continue;
|
|
652
|
+
}
|
|
478
653
|
r.log(
|
|
479
|
-
|
|
654
|
+
`⚠ ${short} started (pid ${pid}) but port ${entry.port} isn't accepting connections yet after ${r.startReadyMs}ms.`,
|
|
480
655
|
);
|
|
481
|
-
r.log(
|
|
656
|
+
r.log(
|
|
657
|
+
` It may still be coming up — check \`parachute status\` and \`parachute logs ${short}\`.`,
|
|
658
|
+
);
|
|
659
|
+
if (r.hubOrigin) r.log(` ${HUB_ORIGIN_ENV}=${r.hubOrigin}`);
|
|
660
|
+
if (short === "vault" && r.hubOrigin) {
|
|
661
|
+
persistVaultHubOrigin(r.configDir, r.hubOrigin, r.log);
|
|
662
|
+
}
|
|
482
663
|
continue;
|
|
483
664
|
}
|
|
484
665
|
}
|