@openparachute/hub 0.6.5-rc.2 → 0.6.5-rc.3

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
package/package.json CHANGED
@@ -1,6 +1,6 @@
1
1
  {
2
2
  "name": "@openparachute/hub",
3
- "version": "0.6.5-rc.2",
3
+ "version": "0.6.5-rc.3",
4
4
  "description": "parachute — the local hub for the Parachute ecosystem (discovery, ports, lifecycle, soon OAuth).",
5
5
  "license": "AGPL-3.0",
6
6
  "publishConfig": {
@@ -936,6 +936,42 @@ describe("POST /api/modules/:short/start", () => {
936
936
  expect(spawns[0]?.env?.MY_CUSTOM_VAR).toBe("sentinel123");
937
937
  });
938
938
 
939
+ test("#519 surface orphan: start surfaces the structured port_squatter error (not a bare failure)", async () => {
940
+ // The #519 field signature: after a hub restart, a module (surface on the
941
+ // box; vault here) is orphaned — listening on its port but NOT a supervised
942
+ // child. The restart-surface API path (`parachute restart <svc>` → 404
943
+ // fallthrough → start, and the boot reconcile) calls `supervisor.start()`,
944
+ // whose #581 squatter detection must surface the structured `port_squatter`
945
+ // error in the response body so the operator gets an actionable next step,
946
+ // not an opaque "request failed". This pins that propagation.
947
+ seedVault(1940);
948
+ // A real Supervisor with the squatter seams injected: pid 95870 (the #519
949
+ // orphan) holds :1940 and is NOT one of the supervisor's children.
950
+ const supervisor = new Supervisor({
951
+ spawnFn: () => {
952
+ throw new Error("should not spawn — the port is squatted");
953
+ },
954
+ pidOnPort: (port) => (port === 1940 ? 95870 : undefined),
955
+ ownerOfPid: (pid) => (pid === 95870 ? "bun /x/.parachute/surface/server.ts" : undefined),
956
+ });
957
+ const bearer = await mintBearer(h, [API_MODULES_OPS_REQUIRED_SCOPE]);
958
+ const res = await handleStart(
959
+ postReq("/api/modules/vault/start", { authorization: `Bearer ${bearer}` }),
960
+ "vault",
961
+ { db: h.db, issuer: ISSUER, manifestPath: h.manifestPath, configDir: h.dir, supervisor },
962
+ );
963
+ // 200 with the structured error riding in state.startError — the SPA/CLI
964
+ // render the actionable squatter message instead of a 500 "request failed".
965
+ expect(res.status).toBe(200);
966
+ const body = (await res.json()) as {
967
+ short: string;
968
+ state: { status: string; startError?: { error_type: string; error_description: string } };
969
+ };
970
+ expect(body.state.status).toBe("crashed");
971
+ expect(body.state.startError?.error_type).toBe("port_squatter");
972
+ expect(body.state.startError?.error_description).toContain("port 1940 is held by pid 95870");
973
+ });
974
+
939
975
  test("400 not_installed when the module isn't in services.json (no silent install)", async () => {
940
976
  // No seedVault — services.json has no vault row.
941
977
  const { supervisor, spawns } = makeIdleSupervisor();
@@ -0,0 +1,98 @@
1
+ import { describe, expect, test } from "bun:test";
2
+ import { orphanAttributable } from "../orphan-attribution.ts";
3
+
4
+ describe("orphanAttributable — two attribution modes (#601 review)", () => {
5
+ const ownerOfPid = (cmdlines: Record<number, string | undefined>) => (pid: number) =>
6
+ cmdlines[pid];
7
+
8
+ test("recorded-pid match → attributable in BOTH modes (cmdline not even read)", () => {
9
+ // No cmdline available, but the orphan IS the recorded pid → trivially ours.
10
+ const probe = ownerOfPid({});
11
+ const broad = orphanAttributable({
12
+ orphan: 100,
13
+ recordedPid: 100,
14
+ short: "vault",
15
+ startCmdHint: undefined,
16
+ ownerOfPid: probe,
17
+ });
18
+ const perModule = orphanAttributable({
19
+ orphan: 100,
20
+ recordedPid: 100,
21
+ short: "vault",
22
+ startCmdHint: undefined,
23
+ ownerOfPid: probe,
24
+ moduleMarker: "parachute-vault",
25
+ });
26
+ expect(broad.attributable).toBe(true);
27
+ expect(perModule.attributable).toBe(true);
28
+ });
29
+
30
+ test("broad mode (no moduleMarker): any `parachute` cmdline is attributable", () => {
31
+ const res = orphanAttributable({
32
+ orphan: 200,
33
+ recordedPid: undefined,
34
+ short: "vault",
35
+ startCmdHint: undefined,
36
+ ownerOfPid: ownerOfPid({ 200: "parachute-scribe serve" }),
37
+ });
38
+ // Migrate-sweep width: a sibling parachute process still counts.
39
+ expect(res.attributable).toBe(true);
40
+ expect(res.cmdline).toBe("parachute-scribe serve");
41
+ });
42
+
43
+ test("per-module mode: own marker matches → attributable", () => {
44
+ const res = orphanAttributable({
45
+ orphan: 300,
46
+ recordedPid: undefined,
47
+ short: "vault",
48
+ startCmdHint: undefined,
49
+ ownerOfPid: ownerOfPid({ 300: "parachute-vault serve" }),
50
+ moduleMarker: "parachute-vault",
51
+ });
52
+ expect(res.attributable).toBe(true);
53
+ });
54
+
55
+ test("per-module mode: a SIBLING parachute module is NOT attributable (cross-module-kill guard)", () => {
56
+ const res = orphanAttributable({
57
+ orphan: 400,
58
+ recordedPid: undefined,
59
+ short: "vault",
60
+ startCmdHint: undefined,
61
+ // A real parachute process (carries `parachute`) — but it's SCRIBE, not
62
+ // vault. The broad mode would attribute it; per-module must not.
63
+ ownerOfPid: ownerOfPid({ 400: "parachute-scribe serve" }),
64
+ moduleMarker: "parachute-vault",
65
+ });
66
+ expect(res.attributable).toBe(false);
67
+ // The cmdline is still returned so the caller can surface it in the message.
68
+ expect(res.cmdline).toBe("parachute-scribe serve");
69
+ });
70
+
71
+ test("either mode: unreadable cmdline + non-matching pid → NOT attributable", () => {
72
+ for (const moduleMarker of [undefined, "parachute-vault"]) {
73
+ const res = orphanAttributable({
74
+ orphan: 500,
75
+ recordedPid: 999, // different from orphan
76
+ short: "vault",
77
+ startCmdHint: undefined,
78
+ ownerOfPid: ownerOfPid({}), // returns undefined
79
+ moduleMarker,
80
+ });
81
+ expect(res.attributable).toBe(false);
82
+ expect(res.cmdline).toBeUndefined();
83
+ }
84
+ });
85
+
86
+ test("startCmdHint is an additional needle in per-module mode", () => {
87
+ const res = orphanAttributable({
88
+ orphan: 600,
89
+ recordedPid: undefined,
90
+ short: "vault",
91
+ startCmdHint: "my-custom-server.ts",
92
+ // cmdline lacks the module binary but carries the explicit hint.
93
+ ownerOfPid: ownerOfPid({ 600: "node /opt/my-custom-server.ts" }),
94
+ moduleMarker: "parachute-vault",
95
+ });
96
+ expect(res.attributable).toBe(true);
97
+ });
98
+ });
@@ -478,6 +478,369 @@ describe("Supervisor restart-on-crash", () => {
478
478
  });
479
479
  });
480
480
 
481
+ describe("Supervisor crash-restart port reclamation (#522 / #582)", () => {
482
+ // A killFn that records its (pid, signal) calls so a test can prove an
483
+ // adopt-kill happened (or didn't). Does NOT forward to a fake — these tests
484
+ // drive the orphan's "death" by flipping the injected pidOnPort.
485
+ function recordingKill(): { killFn: KillFn; calls: Array<{ pid: number; signal: unknown }> } {
486
+ const calls: Array<{ pid: number; signal: unknown }> = [];
487
+ return { calls, killFn: (pid, signal) => calls.push({ pid, signal }) };
488
+ }
489
+
490
+ test("attributable orphan + kill frees the port → adopt-kill then respawn", async () => {
491
+ const first = makeFakeProc(900);
492
+ const second = makeFakeProc(901);
493
+ const spawner = makeQueueSpawner();
494
+ spawner.enqueue(first);
495
+ spawner.enqueue(second);
496
+ // The orphan (pid 5000) holds :1940 right after the crash; the SIGTERM
497
+ // adopt-kill frees it — model that by clearing the holder inside the kill
498
+ // stub (so the SIGKILL-escalation re-probe sees a freed port).
499
+ const calls: Array<{ pid: number; signal: unknown }> = [];
500
+ let holder: number | undefined = undefined;
501
+ const killFn: KillFn = (pid, signal) => {
502
+ calls.push({ pid, signal });
503
+ if (pid === 5000 && signal === "SIGTERM") holder = undefined; // the orphan died
504
+ };
505
+ const sup = new Supervisor({
506
+ spawnFn: spawner.spawn,
507
+ killFn,
508
+ restartDelayMs: 0,
509
+ sleep: () => Promise.resolve(),
510
+ pidOnPort: (port) => (port === 1940 ? holder : undefined),
511
+ // Attributable PER-MODULE: the cmdline carries THIS module's start binary
512
+ // (`parachute-vault`), not just a bare `parachute` marker.
513
+ ownerOfPid: (pid) => (pid === 5000 ? "parachute-vault serve" : undefined),
514
+ });
515
+
516
+ await sup.start({ short: "vault", cmd: ["parachute-vault", "serve"], env: { PORT: "1940" } });
517
+ // Orphan grabs the port, then the child crashes. `handleExit` detects the
518
+ // attributable orphan + adopt-kills it (the stub clears `holder`), then
519
+ // falls through to a normal restart that re-spawns onto the freed port.
520
+ holder = 5000;
521
+ first.closeStreams();
522
+ first.resolveExit(1);
523
+ await tick();
524
+
525
+ // The orphan got SIGTERM'd, and the module respawned onto the freed port.
526
+ expect(calls.some((c) => c.pid === 5000 && c.signal === "SIGTERM")).toBe(true);
527
+ expect(spawner.calls).toHaveLength(2);
528
+ const state = sup.get("vault");
529
+ expect(state?.status).toBe("running");
530
+ expect(state?.pid).toBe(901);
531
+ // It WAS counted as a normal restart (the module did crash + we reclaimed).
532
+ expect(state?.restartsInWindow).toBe(1);
533
+
534
+ second.closeStreams();
535
+ sup.stop("vault");
536
+ second.resolveExit(0);
537
+ });
538
+
539
+ test("attributable orphan + kill fails (ESRCH) → respawn still attempted", async () => {
540
+ const first = makeFakeProc(910);
541
+ const second = makeFakeProc(911);
542
+ const spawner = makeQueueSpawner();
543
+ spawner.enqueue(first);
544
+ spawner.enqueue(second);
545
+ // killFn throws ESRCH (the orphan vanished between probe + signal) — the
546
+ // adopt-kill swallows it and the respawn proceeds best-effort.
547
+ const killFn: KillFn = () => {
548
+ const err = new Error("no such process") as NodeJS.ErrnoException;
549
+ err.code = "ESRCH";
550
+ throw err;
551
+ };
552
+ // Port free at the initial start; the orphan appears only after the crash.
553
+ let holder: number | undefined = undefined;
554
+ const sup = new Supervisor({
555
+ spawnFn: spawner.spawn,
556
+ killFn,
557
+ restartDelayMs: 0,
558
+ sleep: () => Promise.resolve(),
559
+ // Holder present at crash time; the (failed) kill doesn't change it here,
560
+ // but the respawn is still attempted — that's the invariant under test.
561
+ pidOnPort: (port) => (port === 1940 ? holder : undefined),
562
+ // Attributable per-module: cmdline carries this module's start binary.
563
+ ownerOfPid: () => "parachute-vault serve",
564
+ });
565
+
566
+ await sup.start({ short: "vault", cmd: ["parachute-vault", "serve"], env: { PORT: "1940" } });
567
+ holder = 5001;
568
+ first.closeStreams();
569
+ first.resolveExit(1);
570
+ await tick();
571
+
572
+ // Respawn was attempted despite the kill throwing ESRCH (best-effort).
573
+ expect(spawner.calls).toHaveLength(2);
574
+ const state = sup.get("vault");
575
+ expect(state?.status).toBe("running");
576
+
577
+ second.closeStreams();
578
+ sup.stop("vault");
579
+ second.resolveExit(0);
580
+ });
581
+
582
+ test("foreign holder with readable cmdline → port_squatter error, no kill, no budget tick", async () => {
583
+ const first = makeFakeProc(920);
584
+ const spawner = makeQueueSpawner();
585
+ spawner.enqueue(first); // ONLY one proc — a respawn would throw "unexpected spawn".
586
+ const kill = recordingKill();
587
+ let holder: number | undefined = undefined; // free at start; orphan after crash.
588
+ const sup = new Supervisor({
589
+ spawnFn: spawner.spawn,
590
+ killFn: kill.killFn,
591
+ restartDelayMs: 0,
592
+ sleep: () => Promise.resolve(),
593
+ pidOnPort: (port) => (port === 1940 ? holder : undefined),
594
+ // NOT attributable: an operator's unrelated dev server (no `parachute-vault`
595
+ // marker in its cmdline).
596
+ ownerOfPid: (pid) => (pid === 6000 ? "node /home/op/my-app/server.js" : undefined),
597
+ });
598
+
599
+ await sup.start({ short: "vault", cmd: ["parachute-vault", "serve"], env: { PORT: "1940" } });
600
+ holder = 6000;
601
+ first.closeStreams();
602
+ first.resolveExit(1);
603
+ await tick();
604
+
605
+ const state = sup.get("vault");
606
+ expect(state?.status).toBe("crashed");
607
+ expect(state?.startError?.error_type).toBe("port_squatter");
608
+ expect(state?.startError?.error_description).toContain("port 1940 is held by pid 6000");
609
+ // No kill (foreign process), no respawn, and the crash budget was NOT
610
+ // ticked (the module didn't crash — a foreign process is blocking its port).
611
+ expect(kill.calls).toHaveLength(0);
612
+ expect(spawner.calls).toHaveLength(1);
613
+ expect(state?.restartsInWindow).toBe(0);
614
+ });
615
+
616
+ test("foreign holder with UNREADABLE cmdline → port_squatter error, no kill, no budget tick", async () => {
617
+ const first = makeFakeProc(930);
618
+ const spawner = makeQueueSpawner();
619
+ spawner.enqueue(first); // only one — no respawn expected.
620
+ const kill = recordingKill();
621
+ let holder: number | undefined = undefined; // free at start; orphan after crash.
622
+ const sup = new Supervisor({
623
+ spawnFn: spawner.spawn,
624
+ killFn: kill.killFn,
625
+ restartDelayMs: 0,
626
+ sleep: () => Promise.resolve(),
627
+ pidOnPort: (port) => (port === 1940 ? holder : undefined),
628
+ // Unreadable cmdline + a non-matching pid → NOT attributable (never kill).
629
+ ownerOfPid: () => undefined,
630
+ });
631
+
632
+ await sup.start({ short: "vault", cmd: ["parachute-vault", "serve"], env: { PORT: "1940" } });
633
+ holder = 7000;
634
+ first.closeStreams();
635
+ first.resolveExit(1);
636
+ await tick();
637
+
638
+ const state = sup.get("vault");
639
+ expect(state?.status).toBe("crashed");
640
+ expect(state?.startError?.error_type).toBe("port_squatter");
641
+ expect(kill.calls).toHaveLength(0);
642
+ expect(spawner.calls).toHaveLength(1);
643
+ expect(state?.restartsInWindow).toBe(0);
644
+ });
645
+
646
+ test("no squatter after a crash → normal restart (unchanged behavior)", async () => {
647
+ const first = makeFakeProc(940);
648
+ const second = makeFakeProc(941);
649
+ const spawner = makeQueueSpawner();
650
+ spawner.enqueue(first);
651
+ spawner.enqueue(second);
652
+ const kill = recordingKill();
653
+ const sup = new Supervisor({
654
+ spawnFn: spawner.spawn,
655
+ killFn: kill.killFn,
656
+ restartDelayMs: 0,
657
+ sleep: () => Promise.resolve(),
658
+ // Port free at crash time (no squatter).
659
+ pidOnPort: () => undefined,
660
+ ownerOfPid: () => undefined,
661
+ });
662
+
663
+ await sup.start({ short: "vault", cmd: ["bun", "vault.ts"], env: { PORT: "1940" } });
664
+ first.closeStreams();
665
+ first.resolveExit(1);
666
+ await tick();
667
+
668
+ expect(kill.calls).toHaveLength(0);
669
+ expect(spawner.calls).toHaveLength(2);
670
+ const state = sup.get("vault");
671
+ expect(state?.status).toBe("running");
672
+ expect(state?.restartsInWindow).toBe(1);
673
+
674
+ second.closeStreams();
675
+ sup.stop("vault");
676
+ second.resolveExit(0);
677
+ });
678
+
679
+ test('reclaimPolicy "prompt" → never adopt-kills, surfaces even an attributable orphan', async () => {
680
+ const first = makeFakeProc(950);
681
+ const spawner = makeQueueSpawner();
682
+ spawner.enqueue(first); // only one — prompt halts, no respawn.
683
+ const kill = recordingKill();
684
+ let holder: number | undefined = undefined; // free at start; orphan after crash.
685
+ const sup = new Supervisor({
686
+ spawnFn: spawner.spawn,
687
+ killFn: kill.killFn,
688
+ restartDelayMs: 0,
689
+ sleep: () => Promise.resolve(),
690
+ reclaimPolicy: "prompt",
691
+ pidOnPort: (port) => (port === 1940 ? holder : undefined),
692
+ // Attributable per-module (parachute-vault marker) — but "prompt" still refuses to kill.
693
+ ownerOfPid: () => "parachute-vault serve",
694
+ });
695
+
696
+ await sup.start({ short: "vault", cmd: ["parachute-vault", "serve"], env: { PORT: "1940" } });
697
+ holder = 8000;
698
+ first.closeStreams();
699
+ first.resolveExit(1);
700
+ await tick();
701
+
702
+ const state = sup.get("vault");
703
+ expect(state?.status).toBe("crashed");
704
+ expect(state?.startError?.error_type).toBe("port_squatter");
705
+ expect(kill.calls).toHaveLength(0); // prompt never kills
706
+ expect(spawner.calls).toHaveLength(1);
707
+ expect(state?.restartsInWindow).toBe(0);
708
+ });
709
+
710
+ test("foreign SIBLING parachute module on the port → NOT adopt-killed, port_squatter error (per-module attribution, #601 review)", async () => {
711
+ // The cross-module-kill hazard the per-module attribution closes: vault's
712
+ // crash-restart finds its port held by a SCRIBE orphan (a genuine parachute
713
+ // process — its cmdline carries `parachute` AND scribe's own binary — but
714
+ // NOT vault's). The broad `parachute` marker would have adopt-KILLED scribe;
715
+ // the per-module marker (`parachute-vault`) does not match scribe's cmdline,
716
+ // so scribe is "not attributable" → surfaced, never killed.
717
+ const first = makeFakeProc(960);
718
+ const spawner = makeQueueSpawner();
719
+ spawner.enqueue(first); // only one — a kill+respawn would consume a second.
720
+ const kill = recordingKill();
721
+ let holder: number | undefined = undefined; // free at start; sibling after crash.
722
+ const sup = new Supervisor({
723
+ spawnFn: spawner.spawn,
724
+ killFn: kill.killFn,
725
+ restartDelayMs: 0,
726
+ sleep: () => Promise.resolve(),
727
+ pidOnPort: (port) => (port === 1940 ? holder : undefined),
728
+ // A real parachute process — but it's SCRIBE, not vault. Carries the broad
729
+ // `parachute` marker (and `parachute-scribe`) yet NOT `parachute-vault`.
730
+ ownerOfPid: (pid) => (pid === 9000 ? "parachute-scribe serve" : undefined),
731
+ });
732
+
733
+ await sup.start({ short: "vault", cmd: ["parachute-vault", "serve"], env: { PORT: "1940" } });
734
+ holder = 9000;
735
+ first.closeStreams();
736
+ first.resolveExit(1);
737
+ await tick();
738
+
739
+ const state = sup.get("vault");
740
+ // The sibling was NOT killed (per-module marker mismatch), surfaced instead.
741
+ expect(kill.calls).toHaveLength(0);
742
+ expect(spawner.calls).toHaveLength(1); // no respawn — halted, no port reclaim
743
+ expect(state?.status).toBe("crashed");
744
+ expect(state?.startError?.error_type).toBe("port_squatter");
745
+ expect(state?.startError?.error_description).toContain("port 1940 is held by pid 9000");
746
+ // Sanity: the sibling WOULD have matched the broad `parachute` marker —
747
+ // proving the per-module marker is what spared it.
748
+ expect("parachute-scribe serve").toContain("parachute");
749
+ });
750
+
751
+ test("generic-runtime startCmd (bun server.ts): a FOREIGN bun on the port is NOT over-attributed (#601 re-review)", async () => {
752
+ // A custom operator startCmd whose cmd[0] is a generic runtime (`bun`). The
753
+ // marker must NOT be "bun" (that would adopt-KILL any bun process on the
754
+ // port) — it falls through to the module-specific cwd (the installDir). A
755
+ // foreign bun process with a DIFFERENT cwd in its cmdline is then NOT
756
+ // attributable → surfaced, never killed.
757
+ const first = makeFakeProc(970);
758
+ const spawner = makeQueueSpawner();
759
+ spawner.enqueue(first); // only one — no kill+respawn expected.
760
+ const kill = recordingKill();
761
+ let holder: number | undefined = undefined;
762
+ const sup = new Supervisor({
763
+ spawnFn: spawner.spawn,
764
+ killFn: kill.killFn,
765
+ restartDelayMs: 0,
766
+ sleep: () => Promise.resolve(),
767
+ pidOnPort: (port) => (port === 1940 ? holder : undefined),
768
+ // A foreign bun process — its cmdline carries `bun` (which WOULD match a
769
+ // naive cmd[0] marker) but NOT vault's installDir.
770
+ ownerOfPid: (pid) => (pid === 9100 ? "bun /home/op/other-project/server.ts" : undefined),
771
+ });
772
+
773
+ await sup.start({
774
+ short: "vault",
775
+ cmd: ["bun", "server.ts"],
776
+ cwd: "/x/.parachute/vault",
777
+ env: { PORT: "1940" },
778
+ });
779
+ holder = 9100;
780
+ first.closeStreams();
781
+ first.resolveExit(1);
782
+ await tick();
783
+
784
+ const state = sup.get("vault");
785
+ // Not over-attributed: no kill, no respawn — surfaced as a squatter.
786
+ expect(kill.calls).toHaveLength(0);
787
+ expect(spawner.calls).toHaveLength(1);
788
+ expect(state?.status).toBe("crashed");
789
+ expect(state?.startError?.error_type).toBe("port_squatter");
790
+ // Sanity: the foreign cmdline WOULD have matched a naive "bun" marker —
791
+ // proving the generic-runtime fall-through to the cwd marker is what spared it.
792
+ expect("bun /home/op/other-project/server.ts").toContain("bun");
793
+ });
794
+
795
+ test("generic-runtime startCmd: a GENUINE prior instance (same installDir cwd) IS adopted (positive control)", async () => {
796
+ // The other side of the fall-through: with cmd[0]=`bun`, the marker is the
797
+ // module's cwd (`/x/.parachute/vault`). A genuine prior vault instance was
798
+ // launched from that installDir, so its cmdline carries the path → it IS
799
+ // attributable and gets adopt-killed.
800
+ const first = makeFakeProc(980);
801
+ const second = makeFakeProc(981);
802
+ const spawner = makeQueueSpawner();
803
+ spawner.enqueue(first);
804
+ spawner.enqueue(second);
805
+ const calls: Array<{ pid: number; signal: unknown }> = [];
806
+ let holder: number | undefined = undefined;
807
+ const killFn: KillFn = (pid, signal) => {
808
+ calls.push({ pid, signal });
809
+ if (pid === 9200 && signal === "SIGTERM") holder = undefined; // orphan died
810
+ };
811
+ const sup = new Supervisor({
812
+ spawnFn: spawner.spawn,
813
+ killFn,
814
+ restartDelayMs: 0,
815
+ sleep: () => Promise.resolve(),
816
+ pidOnPort: (port) => (port === 1940 ? holder : undefined),
817
+ // Genuine prior vault instance — launched from vault's installDir, so the
818
+ // cwd marker appears in its cmdline.
819
+ ownerOfPid: (pid) => (pid === 9200 ? "bun /x/.parachute/vault/server.ts" : undefined),
820
+ });
821
+
822
+ await sup.start({
823
+ short: "vault",
824
+ cmd: ["bun", "server.ts"],
825
+ cwd: "/x/.parachute/vault",
826
+ env: { PORT: "1940" },
827
+ });
828
+ holder = 9200;
829
+ first.closeStreams();
830
+ first.resolveExit(1);
831
+ await tick();
832
+
833
+ // Adopted: SIGTERM'd the genuine prior instance, then respawned.
834
+ expect(calls.some((c) => c.pid === 9200 && c.signal === "SIGTERM")).toBe(true);
835
+ expect(spawner.calls).toHaveLength(2);
836
+ expect(sup.get("vault")?.status).toBe("running");
837
+
838
+ second.closeStreams();
839
+ sup.stop("vault");
840
+ second.resolveExit(0);
841
+ });
842
+ });
843
+
481
844
  describe("Supervisor.stop", () => {
482
845
  test("operator stop is not a crash — does not restart", async () => {
483
846
  const proc = makeFakeProc(101);
@@ -82,6 +82,7 @@ import {
82
82
  installManagedUnit,
83
83
  removeManagedUnit,
84
84
  } from "../managed-unit.ts";
85
+ import { type OwnerProbeFn, orphanAttributable } from "../orphan-attribution.ts";
85
86
  import { type PortListeningFn, defaultPortListening } from "../port-probe.ts";
86
87
  import { type AliveFn, clearPid, readPid } from "../process-state.ts";
87
88
  import { shortNameForManifest } from "../service-spec.ts";
@@ -103,12 +104,12 @@ export function defaultHubCliPath(): string {
103
104
  return fileURLToPath(new URL("../cli.ts", import.meta.url));
104
105
  }
105
106
 
106
- /**
107
- * Best-effort command-line probe for a pid (the orphan-sweep ownership check).
108
- * Returns the process's command line, or undefined when it can't be read. See
109
- * `CutoverDeps.ownerOfPid`.
110
- */
111
- export type OwnerProbeFn = (pid: number) => string | undefined;
107
+ // `OwnerProbeFn` + the attribution heuristic (`orphanAttributable`) now live in
108
+ // the shared `src/orphan-attribution.ts` so the migrate orphan-sweep and the
109
+ // supervisor's crash-restart adopt-kill share ONE implementation (no drift on
110
+ // the safety-critical "is this mine?" check). Re-exported here for the existing
111
+ // `migrate-cutover` import surface.
112
+ export type { OwnerProbeFn } from "../orphan-attribution.ts";
112
113
 
113
114
  /**
114
115
  * Production `ownerOfPid`: `ps -o command= -p <pid>` returns the full argv of the
@@ -454,49 +455,10 @@ async function stopDetachedModule(
454
455
  log(` ✓ stopped ${target.short}`);
455
456
  }
456
457
 
457
- /**
458
- * Decide whether an orphan pid bound to a MODULE port is plausibly attributable
459
- * to that parachute module the MUST-FIX-2 guard against blind-killing an
460
- * operator's unrelated process that merely squats a declared port. Attributable
461
- * when ANY of:
462
- * - the orphan pid equals the module's RECORDED pid (services.json/pidfile);
463
- * - its command line mentions `parachute` (any parachute-managed process —
464
- * the `~/.parachute/...` install path and the `@openparachute/<mod>`
465
- * package name both carry this marker, so it catches every genuine
466
- * parachute-managed module);
467
- * - its command line mentions the module's start command (when a hint is
468
- * supplied — currently always unset at the call site, the seam is kept
469
- * for a future services.json-derived start command).
470
- * An unreadable command line (probe returned undefined) + a non-matching pid is
471
- * NOT attributable — we refuse to kill it.
472
- *
473
- * NOTE: the bare module short-name needle (`vault`/`runner`/`scribe`/`notes`)
474
- * was deliberately dropped — on the most destructive command (a process KILL),
475
- * a bare short-name is too loose: a `runner` substring matches an unrelated CI
476
- * runner squatting the port. The `parachute` marker already attributes every
477
- * genuine parachute-managed process, so the short-name arm only widened the
478
- * false-positive surface.
479
- */
480
- function orphanAttributable(args: {
481
- orphan: number;
482
- recordedPid: number | undefined;
483
- short: string;
484
- startCmdHint: string | undefined;
485
- ownerOfPid: OwnerProbeFn;
486
- }): { attributable: boolean; cmdline: string | undefined } {
487
- const { orphan, recordedPid, startCmdHint, ownerOfPid } = args;
488
- if (recordedPid !== undefined && orphan === recordedPid) {
489
- return { attributable: true, cmdline: undefined };
490
- }
491
- const cmdline = ownerOfPid(orphan);
492
- if (cmdline === undefined) return { attributable: false, cmdline: undefined };
493
- const haystack = cmdline.toLowerCase();
494
- const needles = ["parachute", ...(startCmdHint ? [startCmdHint.toLowerCase()] : [])].filter(
495
- (n) => n.length > 0,
496
- );
497
- const attributable = needles.some((n) => haystack.includes(n));
498
- return { attributable, cmdline };
499
- }
458
+ // `orphanAttributable` — the safety-critical "is this orphan plausibly this
459
+ // module?" heuristic now lives in the shared `src/orphan-attribution.ts`
460
+ // (imported above), so the supervisor's crash-restart adopt-kill uses the same
461
+ // implementation. See that file for the full attribution contract.
500
462
 
501
463
  /**
502
464
  * §7.2 orphan sweep: lsof a port, and if a live process is bound to it, adopt +
@@ -0,0 +1,102 @@
1
+ /**
2
+ * Shared port-orphan ATTRIBUTION — the safety crux behind every adopt-kill in
3
+ * the hub.
4
+ *
5
+ * Two lifecycle sites reclaim a module's port from a process the supervisor
6
+ * doesn't directly own:
7
+ * - the `parachute migrate --to-supervised` orphan sweep
8
+ * (`commands/migrate-cutover.ts:sweepOrphanOnPort`), and
9
+ * - the supervisor's crash-restart path
10
+ * (`supervisor.ts:handleExit` → `adoptKillOrphanOnPort`).
11
+ *
12
+ * Both must answer the SAME question before sending a signal: is the process
13
+ * holding the module's port plausibly THIS parachute module (a leftover
14
+ * instance / orphan we may adopt-kill), or an UNRELATED process the operator is
15
+ * running on the same port (which we must never touch)? Sharing one
16
+ * implementation keeps the two sites from drifting — a loosened needle in one
17
+ * place can't widen the kill surface in the other without the other noticing.
18
+ *
19
+ * The function is intentionally CONSERVATIVE: when in any doubt (unreadable
20
+ * command line + a non-matching pid) it returns `attributable: false`, and the
21
+ * caller refuses to kill. False-negatives cost a surfaced `port_squatter`
22
+ * error (the operator resolves it); a false-positive costs killing someone
23
+ * else's process — a far worse failure, so we bias hard toward not-attributable.
24
+ */
25
+
26
+ /**
27
+ * Best-effort command line of a pid. Returns the process's argv (one line) or
28
+ * undefined when it can't be read (pid gone, permission, no `ps`). Both
29
+ * supervisor + migrate wire a `ps -o command= -p <pid>` shell-out; the seam is
30
+ * injectable so tests drive attribution without shelling out.
31
+ */
32
+ export type OwnerProbeFn = (pid: number) => string | undefined;
33
+
34
+ /**
35
+ * Decide whether an orphan pid bound to a MODULE port is plausibly attributable
36
+ * to that parachute module — the guard against blind-killing an operator's
37
+ * unrelated process that merely squats a declared port. Attributable when ANY
38
+ * of:
39
+ * - the orphan pid equals the module's RECORDED pid (services.json/pidfile,
40
+ * or a supervisor entry's recorded pid);
41
+ * - (the cmdline arm) it matches the configured needle set — see `moduleMarker`.
42
+ *
43
+ * An unreadable command line (probe returned undefined) + a non-matching pid is
44
+ * NOT attributable — we refuse to kill it.
45
+ *
46
+ * TWO ATTRIBUTION MODES (the `moduleMarker` knob):
47
+ *
48
+ * - **Broad ("parachute") — the migrate orphan-sweep.** `moduleMarker`
49
+ * OMITTED: the cmdline needle is the bare `parachute` marker (the
50
+ * `~/.parachute/...` install path + the `@openparachute/<mod>` package name
51
+ * both carry it). The sweep runs ecosystem-wide during a cutover, so
52
+ * "is it ANY parachute-managed process?" is the right, field-tested width.
53
+ *
54
+ * - **Per-module — the supervisor's crash-restart adopt-kill.** `moduleMarker`
55
+ * PROVIDED (the module's own start binary / installDir, e.g.
56
+ * `parachute-vault` or `~/.parachute/vault/`): the cmdline must contain THAT
57
+ * marker. The supervisor is always restarting ONE specific module and knows
58
+ * its identity, so a bare `parachute` match is too loose — it would let
59
+ * vault's restart adopt-KILL a sibling `scribe`/`runner` orphan that happens
60
+ * to hold vault's port (a cross-module kill). Requiring the module-specific
61
+ * marker means the supervisor can only ever reclaim a prior instance of the
62
+ * SAME module; a sibling's process is "not attributable" → surfaced, never
63
+ * killed.
64
+ *
65
+ * The bare module short-NAME (`vault`/`scribe`/…) is deliberately NOT a needle
66
+ * in either mode — on a process KILL a bare short-name is too loose (a `runner`
67
+ * substring matches an unrelated CI runner). The per-module marker is the
68
+ * fully-qualified binary/path, not the short name.
69
+ *
70
+ * `startCmdHint` is an additional optional cmdline needle (currently unset at
71
+ * both call sites; a seam for a future services.json-derived start command).
72
+ */
73
+ export function orphanAttributable(args: {
74
+ orphan: number;
75
+ recordedPid: number | undefined;
76
+ short: string;
77
+ startCmdHint: string | undefined;
78
+ ownerOfPid: OwnerProbeFn;
79
+ /**
80
+ * When provided, the cmdline arm requires THIS module-specific marker (start
81
+ * binary / installDir) instead of the broad `parachute` marker — see the
82
+ * "two attribution modes" note above. Omitted (or an empty string, which is falsy) → broad `parachute` (migrate).
83
+ */
84
+ moduleMarker?: string;
85
+ }): { attributable: boolean; cmdline: string | undefined } {
86
+ const { orphan, recordedPid, startCmdHint, ownerOfPid, moduleMarker } = args;
87
+ if (recordedPid !== undefined && orphan === recordedPid) {
88
+ return { attributable: true, cmdline: undefined };
89
+ }
90
+ const cmdline = ownerOfPid(orphan);
91
+ if (cmdline === undefined) return { attributable: false, cmdline: undefined };
92
+ const haystack = cmdline.toLowerCase();
93
+ // Per-module mode (moduleMarker set) uses the module-specific marker as the
94
+ // base needle; broad mode (migrate sweep) uses "parachute". `startCmdHint` is
95
+ // an extra needle in either mode.
96
+ const baseNeedle = moduleMarker ? moduleMarker.toLowerCase() : "parachute";
97
+ const needles = [baseNeedle, ...(startCmdHint ? [startCmdHint.toLowerCase()] : [])].filter(
98
+ (n) => n.length > 0,
99
+ );
100
+ const attributable = needles.some((n) => haystack.includes(n));
101
+ return { attributable, cmdline };
102
+ }
package/src/supervisor.ts CHANGED
@@ -42,6 +42,7 @@ import {
42
42
  rethrowIfMissing,
43
43
  } from "@openparachute/depcheck";
44
44
  import { defaultPidOnPort } from "./hub-control.ts";
45
+ import { orphanAttributable } from "./orphan-attribution.ts";
45
46
  import { type PortListeningFn, defaultPortListening } from "./port-probe.ts";
46
47
 
47
48
  /**
@@ -285,6 +286,28 @@ export interface SupervisorOpts {
285
286
  * stub-spawner test path defaults to "unknown" (returns undefined).
286
287
  */
287
288
  readonly ownerOfPid?: OwnerProbeFn;
289
+ /**
290
+ * Port-reclamation POLICY for the CRASH-RESTART path (#522 / #582). When a
291
+ * supervised child crashes and a foreign process now holds its declared port,
292
+ * `handleExit` must decide what to do with an ATTRIBUTABLE orphan (one whose
293
+ * command line carries THIS module's own marker (start binary / installDir) or matches a recorded module
294
+ * pid — see `orphan-attribution.ts`):
295
+ * - `"adopt"` (default): adopt-kill the attributable orphan (SIGTERM →
296
+ * SIGKILL escalation, all idempotent) and proceed to re-spawn. This
297
+ * extends the migrate orphan-sweep's field-tested auto-adopt behavior to
298
+ * the crash-restart path — closing the recurring "port 1940 taken"
299
+ * crash-loop (#522) for good.
300
+ * - `"prompt"`: NEVER auto-kill; record the structured `port_squatter`
301
+ * start-error (same surface a NON-attributable squatter gets) so the
302
+ * operator resolves it manually.
303
+ *
304
+ * A NON-attributable holder is ALWAYS surfaced (never killed) regardless of
305
+ * policy — `"adopt"` only ever escalates to a kill on a holder we can
306
+ * attribute to this very module. Default `"adopt"`; the flag is the one-line
307
+ * lever to flip the whole crash-restart behavior to detect-and-prompt if the
308
+ * auto-kill default is later vetoed.
309
+ */
310
+ readonly reclaimPolicy?: "adopt" | "prompt";
288
311
  }
289
312
 
290
313
  /**
@@ -322,6 +345,42 @@ const DEFAULT_START_READY_POLL_MS = 200;
322
345
  const DEFAULT_LATE_BIND_WATCH_MS = 60_000;
323
346
  const DEFAULT_LATE_BIND_POLL_MS = 1_000;
324
347
 
348
/**
 * Generic language runtimes that can front a custom operator startCmd (e.g.
 * `bun server.ts`, `python3 -m app`). When one of these is `cmd[0]` it is NOT a
 * module-specific marker — using it as the adopt-kill attribution needle would
 * match ANY such process on the port (over-broad kill, #601 re-review). The
 * per-module marker then falls through to the module's installDir/cwd instead.
 * First-party modules (`parachute-vault`, `parachute-scribe`, …) are unaffected
 * — their `cmd[0]` isn't in this set.
 */
const GENERIC_RUNTIMES = new Set<string>([
  "bun",
  "node",
  "nodejs",
  "deno",
  "python",
  "python2",
  "python3",
  "ruby",
  "sh",
  "bash",
  "zsh",
  "dash",
  "env",
]);

/**
 * Is `cmd0` a generic language runtime rather than a module-specific binary?
 *
 * The check is made on the BASENAME: the directory part is stripped on either
 * `/` or `\` (so a Windows path such as `C:\tools\node.exe` is handled, not
 * just POSIX `/usr/bin/bun`), the result is lowercased, and a trailing `.exe`
 * is removed. A versioned interpreter name (`python3.11`, `node18`) is also
 * recognised by retrying the lookup with the trailing version digits/dots
 * removed. Erring towards "generic" is the conservative direction: it only
 * makes the caller fall back to a more specific marker.
 *
 * @param cmd0 The first token of a module's start command.
 * @returns true when the basename is in {@link GENERIC_RUNTIMES}.
 */
function isGenericRuntime(cmd0: string): boolean {
  const base = (cmd0.split(/[\\/]/).pop() ?? cmd0).toLowerCase().replace(/\.exe$/, "");
  if (GENERIC_RUNTIMES.has(base)) return true;
  // `python3.11` → `python`, `node18` → `node`.
  const unversioned = base.replace(/[\d.]+$/, "");
  return unversioned.length > 0 && GENERIC_RUNTIMES.has(unversioned);
}
383
+
325
384
  /**
326
385
  * Bounded, line-oriented ring buffer (§6.5). Holds the most-recent lines of a
327
386
  * module's output up to `maxBytes`; pushing past the cap drops whole lines
@@ -406,6 +465,12 @@ export class Supervisor {
406
465
  // opt in by injecting `pidOnPort` / `ownerOfPid`.
407
466
  pidOnPort: opts.pidOnPort ?? (isProductionPath ? defaultPidOnPort : () => undefined),
408
467
  ownerOfPid: opts.ownerOfPid ?? (isProductionPath ? defaultOwnerOfPid : () => undefined),
468
+ // Crash-restart port-reclamation policy (#522 / #582). Default "adopt"
469
+ // everywhere (production + tests) — the migrate precedent already
470
+ // auto-kills attributable orphans, and the attribution check is
471
+ // conservative. The flag exists so a future veto of auto-kill is a
472
+ // one-line "prompt" flip.
473
+ reclaimPolicy: opts.reclaimPolicy ?? "adopt",
409
474
  };
410
475
  }
411
476
 
@@ -467,13 +532,13 @@ export class Supervisor {
467
532
  // spawning — the operator sees the offending pid + cmdline + a copy-paste
468
533
  // recovery in `status` / the SPA. Detection only: we never kill someone
469
534
  // else's process (it may be the operator's unrelated dev server).
470
- const squatter = this.detectPortSquatter(entry);
535
+ const squatter = this.checkPortSquatter(entry);
471
536
  if (squatter) {
472
537
  entry.state = {
473
538
  ...entry.state,
474
539
  status: "crashed",
475
540
  pid: undefined,
476
- startError: squatter,
541
+ startError: this.portSquatterError(entry, squatter),
477
542
  };
478
543
  return entry.state;
479
544
  }
@@ -532,9 +597,14 @@ export class Supervisor {
532
597
  * replaced) and must not vouch for whoever now holds the port. An entry with
533
598
  * no `proc` (never spawned) contributes no pid either.
534
599
  */
535
- private supervisedPids(): Set<number> {
600
+ private supervisedPids(exclude?: ModuleEntry): Set<number> {
536
601
  const pids = new Set<number>();
537
602
  for (const entry of this.modules.values()) {
603
+ // The just-crashed entry on the `handleExit` path is still `running`
604
+ // (status hasn't been updated yet) with `entry.proc.pid` pointing at the
605
+ // now-DEAD child — it must not vouch for whoever holds the port (the same
606
+ // N1 stale-pid hazard, here for an exiting-but-not-yet-restated child).
607
+ if (exclude !== undefined && entry === exclude) continue;
538
608
  if (entry.state.status !== "running" && entry.state.status !== "starting") continue;
539
609
  const pid = entry.proc?.pid;
540
610
  if (typeof pid === "number" && pid > 0) pids.add(pid);
@@ -543,28 +613,52 @@ export class Supervisor {
543
613
  }
544
614
 
545
615
  /**
546
- * Pre-spawn port-squatter check (#580 item 4). Returns a structured
547
- * `port_squatter` start-error when the module's declared port is held by a
616
+ * Pure pre-spawn port-squatter PROBE (#580 item 4, refactored for #522/#582).
617
+ * Returns the squatter detail when the module's declared port is held by a
548
618
  * process the supervisor does NOT own; undefined when the port is free, the
549
619
  * holder is one of our own children, or detection isn't available on this
550
620
  * platform (no `lsof` → `pidOnPort` returns undefined → we degrade to the
551
621
  * existing started-but-unbound path post-spawn).
552
622
  *
623
+ * This is DETECTION ONLY — it records nothing and kills nothing. The two
624
+ * callers decide what to do with the result:
625
+ * - `start()` (#581) records the structured `port_squatter` start-error and
626
+ * refuses to spawn (a foreign pid on a module port may be the operator's
627
+ * unrelated process — never auto-killed on the operator-initiated path);
628
+ * - `handleExit` (#522/#582) additionally runs attribution and, for an
629
+ * ATTRIBUTABLE orphan under the "adopt" policy, adopt-kills + re-spawns.
630
+ *
553
631
  * Ownership precedent mirrors `migrate-cutover.ts:sweepOrphanOnPort`'s "is
554
632
  * this mine?" check — here the discriminant is "is the holder one of my live
555
- * children's pids?". We deliberately do NOT kill the holder (detection only):
556
- * a foreign pid on a module port may be the operator's unrelated process.
633
+ * children's pids?".
557
634
  */
558
- private detectPortSquatter(entry: ModuleEntry): ModuleStartError | undefined {
635
+ private checkPortSquatter(
636
+ entry: ModuleEntry,
637
+ excludeCrashingEntry = false,
638
+ ): { port: number; holder: number; cmdline: string | undefined } | undefined {
559
639
  const portStr = entry.req.env?.PORT;
560
640
  const port = portStr ? Number(portStr) : Number.NaN;
561
641
  if (!Number.isFinite(port) || port <= 0) return undefined; // No declared port.
562
642
 
563
643
  const holder = this.opts.pidOnPort(port);
564
644
  if (holder === undefined) return undefined; // Port free, or detection unavailable.
565
- if (this.supervisedPids().has(holder)) return undefined; // Our own child.
645
+ // On the crash-restart path the crashing entry is still `running` with a
646
+ // stale (dead) pid — exclude it so it can't vouch for the holder.
647
+ if (this.supervisedPids(excludeCrashingEntry ? entry : undefined).has(holder)) return undefined;
648
+
649
+ return { port, holder, cmdline: this.opts.ownerOfPid(holder) };
650
+ }
566
651
 
567
- const cmdline = this.opts.ownerOfPid(holder);
652
+ /**
653
+ * Build the structured, actionable `port_squatter` start-error from a probe
654
+ * result (#581). Shared by `start()` and the NON-attributable / "prompt"
655
+ * branch of `handleExit` so the wire shape stays identical.
656
+ */
657
+ private portSquatterError(
658
+ entry: ModuleEntry,
659
+ squatter: { port: number; holder: number; cmdline: string | undefined },
660
+ ): ModuleStartError {
661
+ const { port, holder, cmdline } = squatter;
568
662
  const who = cmdline ? `pid ${holder} (${cmdline})` : `pid ${holder}`;
569
663
  const short = entry.req.short;
570
664
  return {
@@ -576,6 +670,163 @@ export class Supervisor {
576
670
  };
577
671
  }
578
672
 
673
+ /**
674
+ * Adopt-kill an orphan holding a module's port on the crash-restart path
675
+ * (#522 / #582). Best-effort + idempotent: SIGTERM the group, brief wait, then
676
+ * a SIGKILL escalation if it's still bound — every signal is try-caught so an
677
+ * ESRCH (the orphan already exited between probe + signal) is a no-op, not a
678
+ * throw. Modeled on `migrate-cutover.ts:sweepOrphanOnPort`'s adopt arm, using
679
+ * the supervisor's group-aware `killFn`. If the kill doesn't free the port the
680
+ * subsequent re-spawn just EADDRINUSE-crashes again and the normal restart
681
+ * budget eventually halts the loop — so a failed kill degrades gracefully.
682
+ */
683
+ private async adoptKillOrphanOnPort(port: number, holder: number): Promise<void> {
684
+ try {
685
+ this.opts.killFn(holder, "SIGTERM");
686
+ } catch {
687
+ // ESRCH (already gone) or EPERM (can't signal) — best-effort: nothing
688
+ // more to do, the re-spawn surfaces a still-held port as a normal crash.
689
+ return;
690
+ }
691
+ // Give the orphan a moment to drop its listener before escalating. Reuse the
692
+ // restart delay (also the socket-release grace) so we don't add a new knob.
693
+ await this.opts.sleep(this.opts.restartDelayMs);
694
+ // Still holding the port? Escalate to SIGKILL (idempotent — if it already
695
+ // exited under the SIGTERM the port is free and we skip the escalation).
696
+ // N1: this re-check is deliberately NOT re-attributed — we already
697
+ // attributed `holder` to this module before the SIGTERM, and only escalate
698
+ // if the SAME pid still holds the SAME port. The TOCTOU window (the
699
+ // originally-attributed pid exits and the OS recycles its number onto a new,
700
+ // foreign holder of this port between the SIGTERM and this re-probe) is the
701
+ // same accepted, vanishingly-small risk the migrate sweep's SIGKILL
702
+ // follow-up carries (`sweepOrphanOnPort`); not worth a second `ps` round-trip.
703
+ if (this.opts.pidOnPort(port) === holder) {
704
+ try {
705
+ this.opts.killFn(holder, "SIGKILL");
706
+ } catch {
707
+ // Already gone / can't signal — best-effort; fall through to re-spawn.
708
+ }
709
+ }
710
+ }
711
+
712
+ /**
713
+ * Crash-restart squatter resolution (#522 / #582). Called from `handleExit`
714
+ * when a foreign process holds the crashed module's port. Returns:
715
+ * - `true` → the loop should HALT: we recorded a structured `port_squatter`
716
+ * start-error + set status `crashed` WITHOUT touching the crash budget
717
+ * (the module didn't crash — a foreign process is blocking its port, so a
718
+ * budget tick would wrongly bring us closer to "giving up"). Applies to a
719
+ * NON-attributable holder always, and to an attributable holder under the
720
+ * `"prompt"` policy.
721
+ * - `false` → we ADOPT-KILLED an attributable orphan (under the default
722
+ * `"adopt"` policy); the caller falls through to the normal restart, which
723
+ * re-spawns onto the now-freed port (counting as a normal restart).
724
+ *
725
+ * Attribution is the safety crux: REUSE the shared `orphanAttributable`
726
+ * (`orphan-attribution.ts`) — but in its PER-MODULE mode (`moduleMarker` set),
727
+ * NOT the migrate sweep's broad `parachute` mode. The supervisor is always
728
+ * restarting ONE specific module and knows its identity, so it requires the
729
+ * orphan's cmdline to carry THIS module's own start binary / installDir before
730
+ * killing — a bare `parachute` match would let vault's restart adopt-kill a
731
+ * sibling `scribe`/`runner` orphan on vault's port (a cross-module kill). So a
732
+ * sibling module's process (or an operator's unrelated process) is "not
733
+ * attributable" → surfaced, never killed. Only a genuine prior instance of the
734
+ * SAME module is reclaimable.
735
+ */
736
+ private async handleCrashRestartSquatter(
737
+ entry: ModuleEntry,
738
+ squatter: { port: number; holder: number; cmdline: string | undefined },
739
+ exitCode: number | null,
740
+ ): Promise<boolean> {
741
+ const { port, holder } = squatter;
742
+ const short = entry.req.short;
743
+
744
+ const recordSquatterError = (): true => {
745
+ entry.state = {
746
+ ...entry.state,
747
+ status: "crashed",
748
+ pid: undefined,
749
+ lastExitCode: exitCode,
750
+ // NB: restartsInWindow is left as-is — we deliberately do NOT push a
751
+ // crash stamp for a port-blocked module (it didn't crash).
752
+ startError: this.portSquatterError(entry, squatter),
753
+ };
754
+ return true;
755
+ };
756
+
757
+ // Policy gate: "prompt" never auto-kills — surface every squatter for the
758
+ // operator (the one-line lever to flip off auto-kill if it's vetoed).
759
+ if (this.opts.reclaimPolicy === "prompt") {
760
+ this.opts.output(
761
+ `[supervisor] ${short} crashed; port ${port} held by pid ${holder} (reclaim policy "prompt") — surfacing instead of adopting.\n`,
762
+ );
763
+ return recordSquatterError();
764
+ }
765
+
766
+ // "adopt": adopt-kill only an ATTRIBUTABLE orphan. The recorded pid arm uses
767
+ // the entry's last-known pid (the just-crashed child's) — if the SAME pid
768
+ // somehow still holds the port it's trivially ours to reclaim; otherwise the
769
+ // PER-MODULE cmdline marker (this module's own start binary / installDir)
770
+ // decides — NOT the broad `parachute` marker, so a sibling module's orphan
771
+ // on this port is not attributable.
772
+ const { attributable, cmdline } = orphanAttributable({
773
+ orphan: holder,
774
+ recordedPid: entry.proc?.pid,
775
+ short,
776
+ startCmdHint: undefined,
777
+ ownerOfPid: this.opts.ownerOfPid,
778
+ moduleMarker: this.moduleMarkerFor(entry),
779
+ });
780
+ if (!attributable) {
781
+ const desc = cmdline ?? squatter.cmdline ?? "command line unavailable";
782
+ this.opts.output(
783
+ `[supervisor] ${short} crashed; port ${port} held by an unrelated process (pid ${holder}, ${desc}) — refusing to kill it; surfacing.\n`,
784
+ );
785
+ return recordSquatterError();
786
+ }
787
+
788
+ // Attributable orphan under "adopt": reclaim the port, then fall through to
789
+ // the normal restart (return false). Best-effort — if the kill doesn't free
790
+ // the port, the re-spawn EADDRINUSE-crashes as a normal crash and the budget
791
+ // eventually halts the loop.
792
+ this.opts.output(
793
+ `[supervisor] ${short} crashed; port ${port} held by an attributable orphan (pid ${holder}${cmdline ? `, ${cmdline}` : ""}) — adopting + killing it before restart.\n`,
794
+ );
795
+ await this.adoptKillOrphanOnPort(port, holder);
796
+ return false;
797
+ }
798
+
799
+ /**
800
+ * The module-specific cmdline marker for the per-module adopt-kill attribution
801
+ * (#601 review). A genuine prior instance of THIS module was launched with
802
+ * this module's start binary (`req.cmd[0]`, e.g. `parachute-vault`) and from
803
+ * its installDir (`req.cwd`, e.g. `~/.parachute/vault/`) — both appear in the
804
+ * orphan's `ps` cmdline.
805
+ *
806
+ * Prefer the start binary (it's the most module-distinctive token) — BUT only
807
+ * when it's actually module-specific. A custom operator startCmd like
808
+ * `bun server.ts` has a GENERIC RUNTIME at `cmd[0]` (`bun`/`node`/`python`/…);
809
+ * using "bun" as the marker would attribute ANY bun process on the port — the
810
+ * exact over-broad adopt-kill per-module attribution exists to prevent
811
+ * (#601 re-review). So when `cmd[0]`'s basename is a known generic runtime,
812
+ * fall through to the cwd / installDir marker, which IS module-specific.
813
+ *
814
+ * Returns undefined only when neither a non-generic `cmd[0]` nor a usable cwd
815
+ * is available — attribution then falls back to the recorded-pid arm only (the
816
+ * cmdline arm can't match an empty needle → the safe, conservative degradation:
817
+ * never a false-positive kill).
818
+ *
819
+ * Note we pass the FULL `cmd[0]` (e.g. `parachute-vault`, or an absolute
820
+ * `/path/to/parachute-vault`), not a bare short name — the short name
821
+ * (`vault`) is deliberately too loose for a kill decision.
822
+ */
823
+ private moduleMarkerFor(entry: ModuleEntry): string | undefined {
824
+ const binary = entry.req.cmd[0];
825
+ if (binary && binary.length > 0 && !isGenericRuntime(binary)) return binary;
826
+ if (entry.req.cwd && entry.req.cwd.length > 0) return entry.req.cwd;
827
+ return undefined;
828
+ }
829
+
579
830
  /**
580
831
  * Poll the module's port until it binds or `startReadyMs` elapses (§6.5).
581
832
  * Skipped when the gate is disabled (stub-spawner test path) or the request
@@ -813,6 +1064,26 @@ export class Supervisor {
813
1064
  return;
814
1065
  }
815
1066
 
1067
+ // Crash-restart port reconciliation (#522 / #582). Before counting this
1068
+ // crash and re-spawning, check whether the module's declared port is now
1069
+ // held by a process the supervisor doesn't own. The `start()` squatter
1070
+ // check (#581) only runs on the operator-initiated path; the crash-restart
1071
+ // loop bypassed it, so a foreign process that grabbed the port between the
1072
+ // crash and the auto-restart kept EADDRINUSE-crash-looping into a bare
1073
+ // `crashed` with no clue why (#582), and a leftover-autostart orphan from a
1074
+ // prior instance re-took the port forever (#522). `excludeCrashingEntry`
1075
+ // drops the just-crashed child's stale pid from the "ours" set (N1).
1076
+ const squatter = this.checkPortSquatter(entry, /* excludeCrashingEntry */ true);
1077
+ if (squatter) {
1078
+ const handled = await this.handleCrashRestartSquatter(entry, squatter, exitCode);
1079
+ // `handled` true → we surfaced a structured error and halted the loop
1080
+ // WITHOUT counting this against the crash budget (the module didn't crash
1081
+ // — a foreign process is blocking its port). `false` → we adopt-killed an
1082
+ // attributable orphan and fall through to the normal restart below, which
1083
+ // re-spawns onto the now-freed port (counting as a normal restart).
1084
+ if (handled) return;
1085
+ }
1086
+
816
1087
  const now = this.opts.now();
817
1088
  // Drop crashes older than the window before counting.
818
1089
  const cutoff = now - this.opts.restartWindowMs;