@openparachute/hub 0.6.5-rc.2 → 0.6.5-rc.4
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/package.json +4 -1
- package/src/__tests__/api-modules-ops.test.ts +36 -0
- package/src/__tests__/hub.test.ts +27 -1
- package/src/__tests__/notes-serve.test.ts +10 -0
- package/src/__tests__/orphan-attribution.test.ts +98 -0
- package/src/__tests__/supervisor.test.ts +363 -0
- package/src/commands/migrate-cutover.ts +11 -49
- package/src/commands/serve.ts +11 -2
- package/src/notes-serve.ts +24 -4
- package/src/orphan-attribution.ts +102 -0
- package/src/supervisor.ts +281 -10
package/package.json
CHANGED
|
@@ -1,6 +1,6 @@
|
|
|
1
1
|
{
|
|
2
2
|
"name": "@openparachute/hub",
|
|
3
|
-
"version": "0.6.5-rc.2",
|
|
3
|
+
"version": "0.6.5-rc.4",
|
|
4
4
|
"description": "parachute — the local hub for the Parachute ecosystem (discovery, ports, lifecycle, soon OAuth).",
|
|
5
5
|
"license": "AGPL-3.0",
|
|
6
6
|
"publishConfig": {
|
|
@@ -26,6 +26,9 @@
|
|
|
26
26
|
},
|
|
27
27
|
"scripts": {
|
|
28
28
|
"start": "bun src/cli.ts",
|
|
29
|
+
"build:depcheck": "[ ! -f packages/depcheck/package.json ] || [ -f packages/depcheck/dist/index.js ] || bun run --cwd packages/depcheck build",
|
|
30
|
+
"prepare": "bun run build:depcheck",
|
|
31
|
+
"pretest": "bun run build:depcheck",
|
|
29
32
|
"test": "bun test ./src",
|
|
30
33
|
"lint": "biome check .",
|
|
31
34
|
"lint:fix": "biome check --write .",
|
|
@@ -936,6 +936,42 @@ describe("POST /api/modules/:short/start", () => {
|
|
|
936
936
|
expect(spawns[0]?.env?.MY_CUSTOM_VAR).toBe("sentinel123");
|
|
937
937
|
});
|
|
938
938
|
|
|
939
|
+
test("#519 surface orphan: start surfaces the structured port_squatter error (not a bare failure)", async () => {
|
|
940
|
+
// The #519 field signature: after a hub restart, a module (surface on the
|
|
941
|
+
// box; vault here) is orphaned — listening on its port but NOT a supervised
|
|
942
|
+
// child. The restart-surface API path (`parachute restart <svc>` → 404
|
|
943
|
+
// fallthrough → start, and the boot reconcile) calls `supervisor.start()`,
|
|
944
|
+
// whose #581 squatter detection must surface the structured `port_squatter`
|
|
945
|
+
// error in the response body so the operator gets an actionable next step,
|
|
946
|
+
// not an opaque "request failed". This pins that propagation.
|
|
947
|
+
seedVault(1940);
|
|
948
|
+
// A real Supervisor with the squatter seams injected: pid 95870 (the #519
|
|
949
|
+
// orphan) holds :1940 and is NOT one of the supervisor's children.
|
|
950
|
+
const supervisor = new Supervisor({
|
|
951
|
+
spawnFn: () => {
|
|
952
|
+
throw new Error("should not spawn — the port is squatted");
|
|
953
|
+
},
|
|
954
|
+
pidOnPort: (port) => (port === 1940 ? 95870 : undefined),
|
|
955
|
+
ownerOfPid: (pid) => (pid === 95870 ? "bun /x/.parachute/surface/server.ts" : undefined),
|
|
956
|
+
});
|
|
957
|
+
const bearer = await mintBearer(h, [API_MODULES_OPS_REQUIRED_SCOPE]);
|
|
958
|
+
const res = await handleStart(
|
|
959
|
+
postReq("/api/modules/vault/start", { authorization: `Bearer ${bearer}` }),
|
|
960
|
+
"vault",
|
|
961
|
+
{ db: h.db, issuer: ISSUER, manifestPath: h.manifestPath, configDir: h.dir, supervisor },
|
|
962
|
+
);
|
|
963
|
+
// 200 with the structured error riding in state.startError — the SPA/CLI
|
|
964
|
+
// render the actionable squatter message instead of a 500 "request failed".
|
|
965
|
+
expect(res.status).toBe(200);
|
|
966
|
+
const body = (await res.json()) as {
|
|
967
|
+
short: string;
|
|
968
|
+
state: { status: string; startError?: { error_type: string; error_description: string } };
|
|
969
|
+
};
|
|
970
|
+
expect(body.state.status).toBe("crashed");
|
|
971
|
+
expect(body.state.startError?.error_type).toBe("port_squatter");
|
|
972
|
+
expect(body.state.startError?.error_description).toContain("port 1940 is held by pid 95870");
|
|
973
|
+
});
|
|
974
|
+
|
|
939
975
|
test("400 not_installed when the module isn't in services.json (no silent install)", async () => {
|
|
940
976
|
// No seedVault — services.json has no vault row.
|
|
941
977
|
const { supervisor, spawns } = makeIdleSupervisor();
|
|
@@ -1,5 +1,5 @@
|
|
|
1
1
|
import { describe, expect, test } from "bun:test";
|
|
2
|
-
import { existsSync, mkdtempSync, readFileSync, rmSync } from "node:fs";
|
|
2
|
+
import { existsSync, mkdtempSync, readFileSync, rmSync, writeFileSync } from "node:fs";
|
|
3
3
|
import { tmpdir } from "node:os";
|
|
4
4
|
import { join } from "node:path";
|
|
5
5
|
import { renderHub, writeHubFile } from "../hub.ts";
|
|
@@ -301,4 +301,30 @@ describe("writeHubFile", () => {
|
|
|
301
301
|
rmSync(dir, { recursive: true, force: true });
|
|
302
302
|
}
|
|
303
303
|
});
|
|
304
|
+
|
|
305
|
+
// hub#171: serve regenerates hub.html on EVERY start (the `!existsSync`
|
|
306
|
+
// guard in commands/serve.ts was dropped), so writeHubFile must be safe to
|
|
307
|
+
// call when the file already exists and must overwrite stale content with
|
|
308
|
+
// a fresh render from current code — otherwise an upgrade serves the old
|
|
309
|
+
// page until an unrelated `parachute expose` re-runs.
|
|
310
|
+
test("overwrites a pre-existing stale hub.html with a fresh render (hub#171)", () => {
|
|
311
|
+
const dir = mkdtempSync(join(tmpdir(), "pcli-hub-"));
|
|
312
|
+
try {
|
|
313
|
+
const path = join(dir, "well-known", "hub.html");
|
|
314
|
+
// First write creates the well-known dir + a real file; then plant
|
|
315
|
+
// stale content to simulate an old hub.html left over from a prior code
|
|
316
|
+
// version after `git pull` + `parachute restart hub`.
|
|
317
|
+
writeHubFile(path);
|
|
318
|
+
writeFileSync(path, "<!-- stale pre-upgrade hub.html -->");
|
|
319
|
+
expect(readFileSync(path, "utf8")).toContain("stale");
|
|
320
|
+
|
|
321
|
+
const written = writeHubFile(path);
|
|
322
|
+
expect(written).toBe(path);
|
|
323
|
+
const content = readFileSync(path, "utf8");
|
|
324
|
+
expect(content).not.toContain("stale");
|
|
325
|
+
expect(content).toBe(renderHub());
|
|
326
|
+
} finally {
|
|
327
|
+
rmSync(dir, { recursive: true, force: true });
|
|
328
|
+
}
|
|
329
|
+
});
|
|
304
330
|
});
|
|
@@ -6,6 +6,7 @@ import {
|
|
|
6
6
|
normalizeMount,
|
|
7
7
|
notesDistCandidates,
|
|
8
8
|
notesFetch,
|
|
9
|
+
notesServeOptions,
|
|
9
10
|
resolveNotesDistFrom,
|
|
10
11
|
} from "../notes-serve.ts";
|
|
11
12
|
|
|
@@ -26,6 +27,15 @@ function req(path: string): Request {
|
|
|
26
27
|
return new Request(`http://127.0.0.1${path}`);
|
|
27
28
|
}
|
|
28
29
|
|
|
30
|
+
describe("notesServeOptions (hub#399 residual)", () => {
|
|
31
|
+
test("sets idleTimeout: 255 to outlast edge keep-alive pools, matching hub-server.ts", () => {
|
|
32
|
+
const opts = notesServeOptions(5173, "/tmp/dist", "/notes");
|
|
33
|
+
expect(opts.idleTimeout).toBe(255);
|
|
34
|
+
expect(opts.port).toBe(5173);
|
|
35
|
+
expect(typeof opts.fetch).toBe("function");
|
|
36
|
+
});
|
|
37
|
+
});
|
|
38
|
+
|
|
29
39
|
describe("normalizeMount", () => {
|
|
30
40
|
test("strips trailing slashes", () => {
|
|
31
41
|
expect(normalizeMount("/notes/")).toBe("/notes");
|
|
@@ -0,0 +1,98 @@
|
|
|
1
|
+
import { describe, expect, test } from "bun:test";
|
|
2
|
+
import { orphanAttributable } from "../orphan-attribution.ts";
|
|
3
|
+
|
|
4
|
+
describe("orphanAttributable — two attribution modes (#601 review)", () => {
|
|
5
|
+
const ownerOfPid = (cmdlines: Record<number, string | undefined>) => (pid: number) =>
|
|
6
|
+
cmdlines[pid];
|
|
7
|
+
|
|
8
|
+
test("recorded-pid match → attributable in BOTH modes (cmdline not even read)", () => {
|
|
9
|
+
// No cmdline available, but the orphan IS the recorded pid → trivially ours.
|
|
10
|
+
const probe = ownerOfPid({});
|
|
11
|
+
const broad = orphanAttributable({
|
|
12
|
+
orphan: 100,
|
|
13
|
+
recordedPid: 100,
|
|
14
|
+
short: "vault",
|
|
15
|
+
startCmdHint: undefined,
|
|
16
|
+
ownerOfPid: probe,
|
|
17
|
+
});
|
|
18
|
+
const perModule = orphanAttributable({
|
|
19
|
+
orphan: 100,
|
|
20
|
+
recordedPid: 100,
|
|
21
|
+
short: "vault",
|
|
22
|
+
startCmdHint: undefined,
|
|
23
|
+
ownerOfPid: probe,
|
|
24
|
+
moduleMarker: "parachute-vault",
|
|
25
|
+
});
|
|
26
|
+
expect(broad.attributable).toBe(true);
|
|
27
|
+
expect(perModule.attributable).toBe(true);
|
|
28
|
+
});
|
|
29
|
+
|
|
30
|
+
test("broad mode (no moduleMarker): any `parachute` cmdline is attributable", () => {
|
|
31
|
+
const res = orphanAttributable({
|
|
32
|
+
orphan: 200,
|
|
33
|
+
recordedPid: undefined,
|
|
34
|
+
short: "vault",
|
|
35
|
+
startCmdHint: undefined,
|
|
36
|
+
ownerOfPid: ownerOfPid({ 200: "parachute-scribe serve" }),
|
|
37
|
+
});
|
|
38
|
+
// Migrate-sweep width: a sibling parachute process still counts.
|
|
39
|
+
expect(res.attributable).toBe(true);
|
|
40
|
+
expect(res.cmdline).toBe("parachute-scribe serve");
|
|
41
|
+
});
|
|
42
|
+
|
|
43
|
+
test("per-module mode: own marker matches → attributable", () => {
|
|
44
|
+
const res = orphanAttributable({
|
|
45
|
+
orphan: 300,
|
|
46
|
+
recordedPid: undefined,
|
|
47
|
+
short: "vault",
|
|
48
|
+
startCmdHint: undefined,
|
|
49
|
+
ownerOfPid: ownerOfPid({ 300: "parachute-vault serve" }),
|
|
50
|
+
moduleMarker: "parachute-vault",
|
|
51
|
+
});
|
|
52
|
+
expect(res.attributable).toBe(true);
|
|
53
|
+
});
|
|
54
|
+
|
|
55
|
+
test("per-module mode: a SIBLING parachute module is NOT attributable (cross-module-kill guard)", () => {
|
|
56
|
+
const res = orphanAttributable({
|
|
57
|
+
orphan: 400,
|
|
58
|
+
recordedPid: undefined,
|
|
59
|
+
short: "vault",
|
|
60
|
+
startCmdHint: undefined,
|
|
61
|
+
// A real parachute process (carries `parachute`) — but it's SCRIBE, not
|
|
62
|
+
// vault. The broad mode would attribute it; per-module must not.
|
|
63
|
+
ownerOfPid: ownerOfPid({ 400: "parachute-scribe serve" }),
|
|
64
|
+
moduleMarker: "parachute-vault",
|
|
65
|
+
});
|
|
66
|
+
expect(res.attributable).toBe(false);
|
|
67
|
+
// The cmdline is still returned so the caller can surface it in the message.
|
|
68
|
+
expect(res.cmdline).toBe("parachute-scribe serve");
|
|
69
|
+
});
|
|
70
|
+
|
|
71
|
+
test("either mode: unreadable cmdline + non-matching pid → NOT attributable", () => {
|
|
72
|
+
for (const moduleMarker of [undefined, "parachute-vault"]) {
|
|
73
|
+
const res = orphanAttributable({
|
|
74
|
+
orphan: 500,
|
|
75
|
+
recordedPid: 999, // different from orphan
|
|
76
|
+
short: "vault",
|
|
77
|
+
startCmdHint: undefined,
|
|
78
|
+
ownerOfPid: ownerOfPid({}), // returns undefined
|
|
79
|
+
moduleMarker,
|
|
80
|
+
});
|
|
81
|
+
expect(res.attributable).toBe(false);
|
|
82
|
+
expect(res.cmdline).toBeUndefined();
|
|
83
|
+
}
|
|
84
|
+
});
|
|
85
|
+
|
|
86
|
+
test("startCmdHint is an additional needle in per-module mode", () => {
|
|
87
|
+
const res = orphanAttributable({
|
|
88
|
+
orphan: 600,
|
|
89
|
+
recordedPid: undefined,
|
|
90
|
+
short: "vault",
|
|
91
|
+
startCmdHint: "my-custom-server.ts",
|
|
92
|
+
// cmdline lacks the module binary but carries the explicit hint.
|
|
93
|
+
ownerOfPid: ownerOfPid({ 600: "node /opt/my-custom-server.ts" }),
|
|
94
|
+
moduleMarker: "parachute-vault",
|
|
95
|
+
});
|
|
96
|
+
expect(res.attributable).toBe(true);
|
|
97
|
+
});
|
|
98
|
+
});
|
|
@@ -478,6 +478,369 @@ describe("Supervisor restart-on-crash", () => {
|
|
|
478
478
|
});
|
|
479
479
|
});
|
|
480
480
|
|
|
481
|
+
describe("Supervisor crash-restart port reclamation (#522 / #582)", () => {
|
|
482
|
+
// A killFn that records its (pid, signal) calls so a test can prove an
|
|
483
|
+
// adopt-kill happened (or didn't). Does NOT forward to a fake — these tests
|
|
484
|
+
// drive the orphan's "death" by flipping the injected pidOnPort.
|
|
485
|
+
function recordingKill(): { killFn: KillFn; calls: Array<{ pid: number; signal: unknown }> } {
|
|
486
|
+
const calls: Array<{ pid: number; signal: unknown }> = [];
|
|
487
|
+
return { calls, killFn: (pid, signal) => calls.push({ pid, signal }) };
|
|
488
|
+
}
|
|
489
|
+
|
|
490
|
+
test("attributable orphan + kill frees the port → adopt-kill then respawn", async () => {
|
|
491
|
+
const first = makeFakeProc(900);
|
|
492
|
+
const second = makeFakeProc(901);
|
|
493
|
+
const spawner = makeQueueSpawner();
|
|
494
|
+
spawner.enqueue(first);
|
|
495
|
+
spawner.enqueue(second);
|
|
496
|
+
// The orphan (pid 5000) holds :1940 right after the crash; the SIGTERM
|
|
497
|
+
// adopt-kill frees it — model that by clearing the holder inside the kill
|
|
498
|
+
// stub (so the SIGKILL-escalation re-probe sees a freed port).
|
|
499
|
+
const calls: Array<{ pid: number; signal: unknown }> = [];
|
|
500
|
+
let holder: number | undefined = undefined;
|
|
501
|
+
const killFn: KillFn = (pid, signal) => {
|
|
502
|
+
calls.push({ pid, signal });
|
|
503
|
+
if (pid === 5000 && signal === "SIGTERM") holder = undefined; // the orphan died
|
|
504
|
+
};
|
|
505
|
+
const sup = new Supervisor({
|
|
506
|
+
spawnFn: spawner.spawn,
|
|
507
|
+
killFn,
|
|
508
|
+
restartDelayMs: 0,
|
|
509
|
+
sleep: () => Promise.resolve(),
|
|
510
|
+
pidOnPort: (port) => (port === 1940 ? holder : undefined),
|
|
511
|
+
// Attributable PER-MODULE: the cmdline carries THIS module's start binary
|
|
512
|
+
// (`parachute-vault`), not just a bare `parachute` marker.
|
|
513
|
+
ownerOfPid: (pid) => (pid === 5000 ? "parachute-vault serve" : undefined),
|
|
514
|
+
});
|
|
515
|
+
|
|
516
|
+
await sup.start({ short: "vault", cmd: ["parachute-vault", "serve"], env: { PORT: "1940" } });
|
|
517
|
+
// Orphan grabs the port, then the child crashes. `handleExit` detects the
|
|
518
|
+
// attributable orphan + adopt-kills it (the stub clears `holder`), then
|
|
519
|
+
// falls through to a normal restart that re-spawns onto the freed port.
|
|
520
|
+
holder = 5000;
|
|
521
|
+
first.closeStreams();
|
|
522
|
+
first.resolveExit(1);
|
|
523
|
+
await tick();
|
|
524
|
+
|
|
525
|
+
// The orphan got SIGTERM'd, and the module respawned onto the freed port.
|
|
526
|
+
expect(calls.some((c) => c.pid === 5000 && c.signal === "SIGTERM")).toBe(true);
|
|
527
|
+
expect(spawner.calls).toHaveLength(2);
|
|
528
|
+
const state = sup.get("vault");
|
|
529
|
+
expect(state?.status).toBe("running");
|
|
530
|
+
expect(state?.pid).toBe(901);
|
|
531
|
+
// It WAS counted as a normal restart (the module did crash + we reclaimed).
|
|
532
|
+
expect(state?.restartsInWindow).toBe(1);
|
|
533
|
+
|
|
534
|
+
second.closeStreams();
|
|
535
|
+
sup.stop("vault");
|
|
536
|
+
second.resolveExit(0);
|
|
537
|
+
});
|
|
538
|
+
|
|
539
|
+
test("attributable orphan + kill fails (ESRCH) → respawn still attempted", async () => {
|
|
540
|
+
const first = makeFakeProc(910);
|
|
541
|
+
const second = makeFakeProc(911);
|
|
542
|
+
const spawner = makeQueueSpawner();
|
|
543
|
+
spawner.enqueue(first);
|
|
544
|
+
spawner.enqueue(second);
|
|
545
|
+
// killFn throws ESRCH (the orphan vanished between probe + signal) — the
|
|
546
|
+
// adopt-kill swallows it and the respawn proceeds best-effort.
|
|
547
|
+
const killFn: KillFn = () => {
|
|
548
|
+
const err = new Error("no such process") as NodeJS.ErrnoException;
|
|
549
|
+
err.code = "ESRCH";
|
|
550
|
+
throw err;
|
|
551
|
+
};
|
|
552
|
+
// Port free at the initial start; the orphan appears only after the crash.
|
|
553
|
+
let holder: number | undefined = undefined;
|
|
554
|
+
const sup = new Supervisor({
|
|
555
|
+
spawnFn: spawner.spawn,
|
|
556
|
+
killFn,
|
|
557
|
+
restartDelayMs: 0,
|
|
558
|
+
sleep: () => Promise.resolve(),
|
|
559
|
+
// Holder present at crash time; the (failed) kill doesn't change it here,
|
|
560
|
+
// but the respawn is still attempted — that's the invariant under test.
|
|
561
|
+
pidOnPort: (port) => (port === 1940 ? holder : undefined),
|
|
562
|
+
// Attributable per-module: cmdline carries this module's start binary.
|
|
563
|
+
ownerOfPid: () => "parachute-vault serve",
|
|
564
|
+
});
|
|
565
|
+
|
|
566
|
+
await sup.start({ short: "vault", cmd: ["parachute-vault", "serve"], env: { PORT: "1940" } });
|
|
567
|
+
holder = 5001;
|
|
568
|
+
first.closeStreams();
|
|
569
|
+
first.resolveExit(1);
|
|
570
|
+
await tick();
|
|
571
|
+
|
|
572
|
+
// Respawn was attempted despite the kill throwing ESRCH (best-effort).
|
|
573
|
+
expect(spawner.calls).toHaveLength(2);
|
|
574
|
+
const state = sup.get("vault");
|
|
575
|
+
expect(state?.status).toBe("running");
|
|
576
|
+
|
|
577
|
+
second.closeStreams();
|
|
578
|
+
sup.stop("vault");
|
|
579
|
+
second.resolveExit(0);
|
|
580
|
+
});
|
|
581
|
+
|
|
582
|
+
test("foreign holder with readable cmdline → port_squatter error, no kill, no budget tick", async () => {
|
|
583
|
+
const first = makeFakeProc(920);
|
|
584
|
+
const spawner = makeQueueSpawner();
|
|
585
|
+
spawner.enqueue(first); // ONLY one proc — a respawn would throw "unexpected spawn".
|
|
586
|
+
const kill = recordingKill();
|
|
587
|
+
let holder: number | undefined = undefined; // free at start; orphan after crash.
|
|
588
|
+
const sup = new Supervisor({
|
|
589
|
+
spawnFn: spawner.spawn,
|
|
590
|
+
killFn: kill.killFn,
|
|
591
|
+
restartDelayMs: 0,
|
|
592
|
+
sleep: () => Promise.resolve(),
|
|
593
|
+
pidOnPort: (port) => (port === 1940 ? holder : undefined),
|
|
594
|
+
// NOT attributable: an operator's unrelated dev server (no `parachute-vault`
|
|
595
|
+
// marker in its cmdline).
|
|
596
|
+
ownerOfPid: (pid) => (pid === 6000 ? "node /home/op/my-app/server.js" : undefined),
|
|
597
|
+
});
|
|
598
|
+
|
|
599
|
+
await sup.start({ short: "vault", cmd: ["parachute-vault", "serve"], env: { PORT: "1940" } });
|
|
600
|
+
holder = 6000;
|
|
601
|
+
first.closeStreams();
|
|
602
|
+
first.resolveExit(1);
|
|
603
|
+
await tick();
|
|
604
|
+
|
|
605
|
+
const state = sup.get("vault");
|
|
606
|
+
expect(state?.status).toBe("crashed");
|
|
607
|
+
expect(state?.startError?.error_type).toBe("port_squatter");
|
|
608
|
+
expect(state?.startError?.error_description).toContain("port 1940 is held by pid 6000");
|
|
609
|
+
// No kill (foreign process), no respawn, and the crash budget was NOT
|
|
610
|
+
// ticked (the module didn't crash — a foreign process is blocking its port).
|
|
611
|
+
expect(kill.calls).toHaveLength(0);
|
|
612
|
+
expect(spawner.calls).toHaveLength(1);
|
|
613
|
+
expect(state?.restartsInWindow).toBe(0);
|
|
614
|
+
});
|
|
615
|
+
|
|
616
|
+
test("foreign holder with UNREADABLE cmdline → port_squatter error, no kill, no budget tick", async () => {
|
|
617
|
+
const first = makeFakeProc(930);
|
|
618
|
+
const spawner = makeQueueSpawner();
|
|
619
|
+
spawner.enqueue(first); // only one — no respawn expected.
|
|
620
|
+
const kill = recordingKill();
|
|
621
|
+
let holder: number | undefined = undefined; // free at start; orphan after crash.
|
|
622
|
+
const sup = new Supervisor({
|
|
623
|
+
spawnFn: spawner.spawn,
|
|
624
|
+
killFn: kill.killFn,
|
|
625
|
+
restartDelayMs: 0,
|
|
626
|
+
sleep: () => Promise.resolve(),
|
|
627
|
+
pidOnPort: (port) => (port === 1940 ? holder : undefined),
|
|
628
|
+
// Unreadable cmdline + a non-matching pid → NOT attributable (never kill).
|
|
629
|
+
ownerOfPid: () => undefined,
|
|
630
|
+
});
|
|
631
|
+
|
|
632
|
+
await sup.start({ short: "vault", cmd: ["parachute-vault", "serve"], env: { PORT: "1940" } });
|
|
633
|
+
holder = 7000;
|
|
634
|
+
first.closeStreams();
|
|
635
|
+
first.resolveExit(1);
|
|
636
|
+
await tick();
|
|
637
|
+
|
|
638
|
+
const state = sup.get("vault");
|
|
639
|
+
expect(state?.status).toBe("crashed");
|
|
640
|
+
expect(state?.startError?.error_type).toBe("port_squatter");
|
|
641
|
+
expect(kill.calls).toHaveLength(0);
|
|
642
|
+
expect(spawner.calls).toHaveLength(1);
|
|
643
|
+
expect(state?.restartsInWindow).toBe(0);
|
|
644
|
+
});
|
|
645
|
+
|
|
646
|
+
test("no squatter after a crash → normal restart (unchanged behavior)", async () => {
|
|
647
|
+
const first = makeFakeProc(940);
|
|
648
|
+
const second = makeFakeProc(941);
|
|
649
|
+
const spawner = makeQueueSpawner();
|
|
650
|
+
spawner.enqueue(first);
|
|
651
|
+
spawner.enqueue(second);
|
|
652
|
+
const kill = recordingKill();
|
|
653
|
+
const sup = new Supervisor({
|
|
654
|
+
spawnFn: spawner.spawn,
|
|
655
|
+
killFn: kill.killFn,
|
|
656
|
+
restartDelayMs: 0,
|
|
657
|
+
sleep: () => Promise.resolve(),
|
|
658
|
+
// Port free at crash time (no squatter).
|
|
659
|
+
pidOnPort: () => undefined,
|
|
660
|
+
ownerOfPid: () => undefined,
|
|
661
|
+
});
|
|
662
|
+
|
|
663
|
+
await sup.start({ short: "vault", cmd: ["bun", "vault.ts"], env: { PORT: "1940" } });
|
|
664
|
+
first.closeStreams();
|
|
665
|
+
first.resolveExit(1);
|
|
666
|
+
await tick();
|
|
667
|
+
|
|
668
|
+
expect(kill.calls).toHaveLength(0);
|
|
669
|
+
expect(spawner.calls).toHaveLength(2);
|
|
670
|
+
const state = sup.get("vault");
|
|
671
|
+
expect(state?.status).toBe("running");
|
|
672
|
+
expect(state?.restartsInWindow).toBe(1);
|
|
673
|
+
|
|
674
|
+
second.closeStreams();
|
|
675
|
+
sup.stop("vault");
|
|
676
|
+
second.resolveExit(0);
|
|
677
|
+
});
|
|
678
|
+
|
|
679
|
+
test('reclaimPolicy "prompt" → never adopt-kills, surfaces even an attributable orphan', async () => {
|
|
680
|
+
const first = makeFakeProc(950);
|
|
681
|
+
const spawner = makeQueueSpawner();
|
|
682
|
+
spawner.enqueue(first); // only one — prompt halts, no respawn.
|
|
683
|
+
const kill = recordingKill();
|
|
684
|
+
let holder: number | undefined = undefined; // free at start; orphan after crash.
|
|
685
|
+
const sup = new Supervisor({
|
|
686
|
+
spawnFn: spawner.spawn,
|
|
687
|
+
killFn: kill.killFn,
|
|
688
|
+
restartDelayMs: 0,
|
|
689
|
+
sleep: () => Promise.resolve(),
|
|
690
|
+
reclaimPolicy: "prompt",
|
|
691
|
+
pidOnPort: (port) => (port === 1940 ? holder : undefined),
|
|
692
|
+
// Attributable per-module (parachute-vault marker) — but "prompt" still refuses to kill.
|
|
693
|
+
ownerOfPid: () => "parachute-vault serve",
|
|
694
|
+
});
|
|
695
|
+
|
|
696
|
+
await sup.start({ short: "vault", cmd: ["parachute-vault", "serve"], env: { PORT: "1940" } });
|
|
697
|
+
holder = 8000;
|
|
698
|
+
first.closeStreams();
|
|
699
|
+
first.resolveExit(1);
|
|
700
|
+
await tick();
|
|
701
|
+
|
|
702
|
+
const state = sup.get("vault");
|
|
703
|
+
expect(state?.status).toBe("crashed");
|
|
704
|
+
expect(state?.startError?.error_type).toBe("port_squatter");
|
|
705
|
+
expect(kill.calls).toHaveLength(0); // prompt never kills
|
|
706
|
+
expect(spawner.calls).toHaveLength(1);
|
|
707
|
+
expect(state?.restartsInWindow).toBe(0);
|
|
708
|
+
});
|
|
709
|
+
|
|
710
|
+
test("foreign SIBLING parachute module on the port → NOT adopt-killed, port_squatter error (per-module attribution, #601 review)", async () => {
|
|
711
|
+
// The cross-module-kill hazard the per-module attribution closes: vault's
|
|
712
|
+
// crash-restart finds its port held by a SCRIBE orphan (a genuine parachute
|
|
713
|
+
// process — its cmdline carries `parachute` AND scribe's own binary — but
|
|
714
|
+
// NOT vault's). The broad `parachute` marker would have adopt-KILLED scribe;
|
|
715
|
+
// the per-module marker (`parachute-vault`) does not match scribe's cmdline,
|
|
716
|
+
// so scribe is "not attributable" → surfaced, never killed.
|
|
717
|
+
const first = makeFakeProc(960);
|
|
718
|
+
const spawner = makeQueueSpawner();
|
|
719
|
+
spawner.enqueue(first); // only one — a kill+respawn would consume a second.
|
|
720
|
+
const kill = recordingKill();
|
|
721
|
+
let holder: number | undefined = undefined; // free at start; sibling after crash.
|
|
722
|
+
const sup = new Supervisor({
|
|
723
|
+
spawnFn: spawner.spawn,
|
|
724
|
+
killFn: kill.killFn,
|
|
725
|
+
restartDelayMs: 0,
|
|
726
|
+
sleep: () => Promise.resolve(),
|
|
727
|
+
pidOnPort: (port) => (port === 1940 ? holder : undefined),
|
|
728
|
+
// A real parachute process — but it's SCRIBE, not vault. Carries the broad
|
|
729
|
+
// `parachute` marker (and `parachute-scribe`) yet NOT `parachute-vault`.
|
|
730
|
+
ownerOfPid: (pid) => (pid === 9000 ? "parachute-scribe serve" : undefined),
|
|
731
|
+
});
|
|
732
|
+
|
|
733
|
+
await sup.start({ short: "vault", cmd: ["parachute-vault", "serve"], env: { PORT: "1940" } });
|
|
734
|
+
holder = 9000;
|
|
735
|
+
first.closeStreams();
|
|
736
|
+
first.resolveExit(1);
|
|
737
|
+
await tick();
|
|
738
|
+
|
|
739
|
+
const state = sup.get("vault");
|
|
740
|
+
// The sibling was NOT killed (per-module marker mismatch), surfaced instead.
|
|
741
|
+
expect(kill.calls).toHaveLength(0);
|
|
742
|
+
expect(spawner.calls).toHaveLength(1); // no respawn — halted, no port reclaim
|
|
743
|
+
expect(state?.status).toBe("crashed");
|
|
744
|
+
expect(state?.startError?.error_type).toBe("port_squatter");
|
|
745
|
+
expect(state?.startError?.error_description).toContain("port 1940 is held by pid 9000");
|
|
746
|
+
// Sanity: the sibling WOULD have matched the broad `parachute` marker —
|
|
747
|
+
// proving the per-module marker is what spared it.
|
|
748
|
+
expect("parachute-scribe serve").toContain("parachute");
|
|
749
|
+
});
|
|
750
|
+
|
|
751
|
+
test("generic-runtime startCmd (bun server.ts): a FOREIGN bun on the port is NOT over-attributed (#601 re-review)", async () => {
|
|
752
|
+
// A custom operator startCmd whose cmd[0] is a generic runtime (`bun`). The
|
|
753
|
+
// marker must NOT be "bun" (that would adopt-KILL any bun process on the
|
|
754
|
+
// port) — it falls through to the module-specific cwd (the installDir). A
|
|
755
|
+
// foreign bun process with a DIFFERENT cwd in its cmdline is then NOT
|
|
756
|
+
// attributable → surfaced, never killed.
|
|
757
|
+
const first = makeFakeProc(970);
|
|
758
|
+
const spawner = makeQueueSpawner();
|
|
759
|
+
spawner.enqueue(first); // only one — no kill+respawn expected.
|
|
760
|
+
const kill = recordingKill();
|
|
761
|
+
let holder: number | undefined = undefined;
|
|
762
|
+
const sup = new Supervisor({
|
|
763
|
+
spawnFn: spawner.spawn,
|
|
764
|
+
killFn: kill.killFn,
|
|
765
|
+
restartDelayMs: 0,
|
|
766
|
+
sleep: () => Promise.resolve(),
|
|
767
|
+
pidOnPort: (port) => (port === 1940 ? holder : undefined),
|
|
768
|
+
// A foreign bun process — its cmdline carries `bun` (which WOULD match a
|
|
769
|
+
// naive cmd[0] marker) but NOT vault's installDir.
|
|
770
|
+
ownerOfPid: (pid) => (pid === 9100 ? "bun /home/op/other-project/server.ts" : undefined),
|
|
771
|
+
});
|
|
772
|
+
|
|
773
|
+
await sup.start({
|
|
774
|
+
short: "vault",
|
|
775
|
+
cmd: ["bun", "server.ts"],
|
|
776
|
+
cwd: "/x/.parachute/vault",
|
|
777
|
+
env: { PORT: "1940" },
|
|
778
|
+
});
|
|
779
|
+
holder = 9100;
|
|
780
|
+
first.closeStreams();
|
|
781
|
+
first.resolveExit(1);
|
|
782
|
+
await tick();
|
|
783
|
+
|
|
784
|
+
const state = sup.get("vault");
|
|
785
|
+
// Not over-attributed: no kill, no respawn — surfaced as a squatter.
|
|
786
|
+
expect(kill.calls).toHaveLength(0);
|
|
787
|
+
expect(spawner.calls).toHaveLength(1);
|
|
788
|
+
expect(state?.status).toBe("crashed");
|
|
789
|
+
expect(state?.startError?.error_type).toBe("port_squatter");
|
|
790
|
+
// Sanity: the foreign cmdline WOULD have matched a naive "bun" marker —
|
|
791
|
+
// proving the generic-runtime fall-through to the cwd marker is what spared it.
|
|
792
|
+
expect("bun /home/op/other-project/server.ts").toContain("bun");
|
|
793
|
+
});
|
|
794
|
+
|
|
795
|
+
test("generic-runtime startCmd: a GENUINE prior instance (same installDir cwd) IS adopted (positive control)", async () => {
|
|
796
|
+
// The other side of the fall-through: with cmd[0]=`bun`, the marker is the
|
|
797
|
+
// module's cwd (`/x/.parachute/vault`). A genuine prior vault instance was
|
|
798
|
+
// launched from that installDir, so its cmdline carries the path → it IS
|
|
799
|
+
// attributable and gets adopt-killed.
|
|
800
|
+
const first = makeFakeProc(980);
|
|
801
|
+
const second = makeFakeProc(981);
|
|
802
|
+
const spawner = makeQueueSpawner();
|
|
803
|
+
spawner.enqueue(first);
|
|
804
|
+
spawner.enqueue(second);
|
|
805
|
+
const calls: Array<{ pid: number; signal: unknown }> = [];
|
|
806
|
+
let holder: number | undefined = undefined;
|
|
807
|
+
const killFn: KillFn = (pid, signal) => {
|
|
808
|
+
calls.push({ pid, signal });
|
|
809
|
+
if (pid === 9200 && signal === "SIGTERM") holder = undefined; // orphan died
|
|
810
|
+
};
|
|
811
|
+
const sup = new Supervisor({
|
|
812
|
+
spawnFn: spawner.spawn,
|
|
813
|
+
killFn,
|
|
814
|
+
restartDelayMs: 0,
|
|
815
|
+
sleep: () => Promise.resolve(),
|
|
816
|
+
pidOnPort: (port) => (port === 1940 ? holder : undefined),
|
|
817
|
+
// Genuine prior vault instance — launched from vault's installDir, so the
|
|
818
|
+
// cwd marker appears in its cmdline.
|
|
819
|
+
ownerOfPid: (pid) => (pid === 9200 ? "bun /x/.parachute/vault/server.ts" : undefined),
|
|
820
|
+
});
|
|
821
|
+
|
|
822
|
+
await sup.start({
|
|
823
|
+
short: "vault",
|
|
824
|
+
cmd: ["bun", "server.ts"],
|
|
825
|
+
cwd: "/x/.parachute/vault",
|
|
826
|
+
env: { PORT: "1940" },
|
|
827
|
+
});
|
|
828
|
+
holder = 9200;
|
|
829
|
+
first.closeStreams();
|
|
830
|
+
first.resolveExit(1);
|
|
831
|
+
await tick();
|
|
832
|
+
|
|
833
|
+
// Adopted: SIGTERM'd the genuine prior instance, then respawned.
|
|
834
|
+
expect(calls.some((c) => c.pid === 9200 && c.signal === "SIGTERM")).toBe(true);
|
|
835
|
+
expect(spawner.calls).toHaveLength(2);
|
|
836
|
+
expect(sup.get("vault")?.status).toBe("running");
|
|
837
|
+
|
|
838
|
+
second.closeStreams();
|
|
839
|
+
sup.stop("vault");
|
|
840
|
+
second.resolveExit(0);
|
|
841
|
+
});
|
|
842
|
+
});
|
|
843
|
+
|
|
481
844
|
describe("Supervisor.stop", () => {
|
|
482
845
|
test("operator stop is not a crash — does not restart", async () => {
|
|
483
846
|
const proc = makeFakeProc(101);
|
|
@@ -82,6 +82,7 @@ import {
|
|
|
82
82
|
installManagedUnit,
|
|
83
83
|
removeManagedUnit,
|
|
84
84
|
} from "../managed-unit.ts";
|
|
85
|
+
import { type OwnerProbeFn, orphanAttributable } from "../orphan-attribution.ts";
|
|
85
86
|
import { type PortListeningFn, defaultPortListening } from "../port-probe.ts";
|
|
86
87
|
import { type AliveFn, clearPid, readPid } from "../process-state.ts";
|
|
87
88
|
import { shortNameForManifest } from "../service-spec.ts";
|
|
@@ -103,12 +104,12 @@ export function defaultHubCliPath(): string {
|
|
|
103
104
|
return fileURLToPath(new URL("../cli.ts", import.meta.url));
|
|
104
105
|
}
|
|
105
106
|
|
|
106
|
-
|
|
107
|
-
|
|
108
|
-
|
|
109
|
-
|
|
110
|
-
|
|
111
|
-
export type OwnerProbeFn
|
|
107
|
+
// `OwnerProbeFn` + the attribution heuristic (`orphanAttributable`) now live in
|
|
108
|
+
// the shared `src/orphan-attribution.ts` so the migrate orphan-sweep and the
|
|
109
|
+
// supervisor's crash-restart adopt-kill share ONE implementation (no drift on
|
|
110
|
+
// the safety-critical "is this mine?" check). Re-exported here for the existing
|
|
111
|
+
// `migrate-cutover` import surface.
|
|
112
|
+
export type { OwnerProbeFn } from "../orphan-attribution.ts";
|
|
112
113
|
|
|
113
114
|
/**
|
|
114
115
|
* Production `ownerOfPid`: `ps -o command= -p <pid>` returns the full argv of the
|
|
@@ -454,49 +455,10 @@ async function stopDetachedModule(
|
|
|
454
455
|
log(` ✓ stopped ${target.short}`);
|
|
455
456
|
}
|
|
456
457
|
|
|
457
|
-
|
|
458
|
-
|
|
459
|
-
|
|
460
|
-
|
|
461
|
-
* when ANY of:
|
|
462
|
-
* - the orphan pid equals the module's RECORDED pid (services.json/pidfile);
|
|
463
|
-
* - its command line mentions `parachute` (any parachute-managed process —
|
|
464
|
-
* the `~/.parachute/...` install path and the `@openparachute/<mod>`
|
|
465
|
-
* package name both carry this marker, so it catches every genuine
|
|
466
|
-
* parachute-managed module);
|
|
467
|
-
* - its command line mentions the module's start command (when a hint is
|
|
468
|
-
* supplied — currently always unset at the call site, the seam is kept
|
|
469
|
-
* for a future services.json-derived start command).
|
|
470
|
-
* An unreadable command line (probe returned undefined) + a non-matching pid is
|
|
471
|
-
* NOT attributable — we refuse to kill it.
|
|
472
|
-
*
|
|
473
|
-
* NOTE: the bare module short-name needle (`vault`/`runner`/`scribe`/`notes`)
|
|
474
|
-
* was deliberately dropped — on the most destructive command (a process KILL),
|
|
475
|
-
* a bare short-name is too loose: a `runner` substring matches an unrelated CI
|
|
476
|
-
* runner squatting the port. The `parachute` marker already attributes every
|
|
477
|
-
* genuine parachute-managed process, so the short-name arm only widened the
|
|
478
|
-
* false-positive surface.
|
|
479
|
-
*/
|
|
480
|
-
function orphanAttributable(args: {
|
|
481
|
-
orphan: number;
|
|
482
|
-
recordedPid: number | undefined;
|
|
483
|
-
short: string;
|
|
484
|
-
startCmdHint: string | undefined;
|
|
485
|
-
ownerOfPid: OwnerProbeFn;
|
|
486
|
-
}): { attributable: boolean; cmdline: string | undefined } {
|
|
487
|
-
const { orphan, recordedPid, startCmdHint, ownerOfPid } = args;
|
|
488
|
-
if (recordedPid !== undefined && orphan === recordedPid) {
|
|
489
|
-
return { attributable: true, cmdline: undefined };
|
|
490
|
-
}
|
|
491
|
-
const cmdline = ownerOfPid(orphan);
|
|
492
|
-
if (cmdline === undefined) return { attributable: false, cmdline: undefined };
|
|
493
|
-
const haystack = cmdline.toLowerCase();
|
|
494
|
-
const needles = ["parachute", ...(startCmdHint ? [startCmdHint.toLowerCase()] : [])].filter(
|
|
495
|
-
(n) => n.length > 0,
|
|
496
|
-
);
|
|
497
|
-
const attributable = needles.some((n) => haystack.includes(n));
|
|
498
|
-
return { attributable, cmdline };
|
|
499
|
-
}
|
|
458
|
+
// `orphanAttributable` — the safety-critical "is this orphan plausibly this
|
|
459
|
+
// module?" heuristic — now lives in the shared `src/orphan-attribution.ts`
|
|
460
|
+
// (imported above), so the supervisor's crash-restart adopt-kill uses the same
|
|
461
|
+
// implementation. See that file for the full attribution contract.
|
|
500
462
|
|
|
501
463
|
/**
|
|
502
464
|
* §7.2 orphan sweep: lsof a port, and if a live process is bound to it, adopt +
|
package/src/commands/serve.ts
CHANGED
|
@@ -337,13 +337,22 @@ export async function serve(opts: ServeOpts = {}): Promise<{
|
|
|
337
337
|
// hatch for setups that want loopback-only inside a sidecar.
|
|
338
338
|
const hostname = env.PARACHUTE_BIND_HOST || "0.0.0.0";
|
|
339
339
|
|
|
340
|
-
// Ensure the well-known dir exists, and
|
|
340
|
+
// Ensure the well-known dir exists, and (re)write the static hub.html so `/`
|
|
341
341
|
// serves something coherent on a fresh disk (the dynamic path through
|
|
342
342
|
// `hubFetch` takes over once a DB row exists; the disk file is the
|
|
343
343
|
// signed-out fallback).
|
|
344
|
+
//
|
|
345
|
+
// Regenerate on EVERY serve start, not just when the file is absent (#171):
|
|
346
|
+
// hub.html is a served artifact built from current code, and code ships via
|
|
347
|
+
// `git pull` + `parachute restart hub`. Guarding on `!existsSync` left the
|
|
348
|
+
// stale post-upgrade file on disk until an unrelated `parachute expose`
|
|
349
|
+
// re-ran — so operators saw old hub.html after an upgrade. The write is a
|
|
350
|
+
// cheap, deterministic, atomic (tmp+rename) render of static signed-out
|
|
351
|
+
// HTML with no expose-state or DB dependency, so it's safe to call every
|
|
352
|
+
// start.
|
|
344
353
|
if (!existsSync(WELL_KNOWN_DIR)) mkdirSync(WELL_KNOWN_DIR, { recursive: true });
|
|
345
354
|
const hubHtmlPath = join(WELL_KNOWN_DIR, "hub.html");
|
|
346
|
-
|
|
355
|
+
writeHubFile(hubHtmlPath);
|
|
347
356
|
|
|
348
357
|
const dbPath = hubDbPath();
|
|
349
358
|
// Self-heal-or-die DB holder (#594). The handle lives behind a mutable
|
package/src/notes-serve.ts
CHANGED
|
@@ -176,6 +176,29 @@ export function notesFetch(dist: string, mount: string): (req: Request) => Respo
|
|
|
176
176
|
};
|
|
177
177
|
}
|
|
178
178
|
|
|
179
|
+
/**
 * Assemble the options object passed to `Bun.serve` for the notes static
 * server.
 *
 * The result carries the listen `port`, the request handler produced by
 * `notesFetch(dist, mount)`, and a fixed `idleTimeout` of 255 seconds.
 *
 * Why 255: per the original author's note this mirrors hub-server.ts. When the
 * static server sits behind an edge proxy that pools keep-alive connections
 * (Render, Cloudflare, fly proxy), the proxy's idle timeout can outlast Bun's
 * default; the proxy then reuses a connection this side already closed and
 * answers with a seemingly random 502. 255s sits comfortably above Render's
 * community-observed ~120s edge pool TTL (hub#399 residual, Notes PWA path).
 *
 * Exported so a test can assert the option is present without booting a
 * server.
 *
 * @param port  TCP port the static server listens on.
 * @param dist  Directory of built assets, forwarded to `notesFetch`.
 * @param mount URL mount prefix, forwarded to `notesFetch`.
 * @returns The `{ port, idleTimeout, fetch }` config for `Bun.serve`.
 */
export function notesServeOptions(
  port: number,
  dist: string,
  mount: string,
): { port: number; idleTimeout: number; fetch: (req: Request) => Response } {
  const idleTimeout = 255;
  const fetch = notesFetch(dist, mount);
  return { port, idleTimeout, fetch };
}
|
|
201
|
+
|
|
179
202
|
if (import.meta.main) {
|
|
180
203
|
const { port, dist: distArg, mount } = parseArgs(process.argv.slice(2));
|
|
181
204
|
|
|
@@ -187,10 +210,7 @@ if (import.meta.main) {
|
|
|
187
210
|
process.exit(1);
|
|
188
211
|
}
|
|
189
212
|
|
|
190
|
-
Bun.serve(
|
|
191
|
-
port,
|
|
192
|
-
fetch: notesFetch(dist, mount),
|
|
193
|
-
});
|
|
213
|
+
Bun.serve(notesServeOptions(port, dist, mount));
|
|
194
214
|
|
|
195
215
|
console.log(`notes static-serve listening on :${port} (dist=${dist}, mount=${mount || "/"})`);
|
|
196
216
|
}
|
|
@@ -0,0 +1,102 @@
|
|
|
1
|
+
/**
|
|
2
|
+
* Shared port-orphan ATTRIBUTION — the safety crux behind every adopt-kill in
|
|
3
|
+
* the hub.
|
|
4
|
+
*
|
|
5
|
+
* Two lifecycle sites reclaim a module's port from a process the supervisor
|
|
6
|
+
* doesn't directly own:
|
|
7
|
+
* - the `parachute migrate --to-supervised` orphan sweep
|
|
8
|
+
* (`commands/migrate-cutover.ts:sweepOrphanOnPort`), and
|
|
9
|
+
* - the supervisor's crash-restart path
|
|
10
|
+
* (`supervisor.ts:handleExit` → `adoptKillOrphanOnPort`).
|
|
11
|
+
*
|
|
12
|
+
* Both must answer the SAME question before sending a signal: is the process
|
|
13
|
+
* holding the module's port plausibly THIS parachute module (a leftover
|
|
14
|
+
* instance / orphan we may adopt-kill), or an UNRELATED process the operator is
|
|
15
|
+
* running on the same port (which we must never touch)? Sharing one
|
|
16
|
+
* implementation keeps the two sites from drifting — a loosened needle in one
|
|
17
|
+
* place can't widen the kill surface in the other without the other noticing.
|
|
18
|
+
*
|
|
19
|
+
* The function is intentionally CONSERVATIVE: when in any doubt (unreadable
|
|
20
|
+
* command line + a non-matching pid) it returns `attributable: false`, and the
|
|
21
|
+
* caller refuses to kill. False-negatives cost a surfaced `port_squatter`
|
|
22
|
+
* error (the operator resolves it); a false-positive costs killing someone
|
|
23
|
+
* else's process — a far worse failure, so we bias hard toward not-attributable.
|
|
24
|
+
*/
|
|
25
|
+
|
|
26
|
+
/**
 * Best-effort command line of a pid. Returns the process's argv (one line) or
 * undefined when it can't be read (pid gone, permission, no `ps`). Both
 * supervisor + migrate wire a `ps -o command= -p <pid>` shell-out; the seam is
 * injectable so tests drive attribution without shelling out.
 */
export type OwnerProbeFn = (pid: number) => string | undefined;

/**
 * Decide whether an orphan pid bound to a MODULE port is plausibly attributable
 * to that parachute module — the guard against blind-killing an operator's
 * unrelated process that merely squats a declared port.
 *
 * Attributable when ANY of:
 *   - the orphan pid equals the module's RECORDED pid (checked first; the
 *     command line is not probed in that case, so `cmdline` is undefined);
 *   - the orphan's command line contains one of the configured needles
 *     (case-insensitive substring match).
 *
 * An unreadable command line (probe returned undefined) + a non-matching pid is
 * NOT attributable — the caller must refuse to kill it.
 *
 * TWO ATTRIBUTION MODES (the `moduleMarker` knob):
 *   - **Broad** — `moduleMarker` OMITTED (`undefined`): the base needle is the
 *     bare `parachute` marker. Used by the migrate orphan-sweep.
 *   - **Per-module** — `moduleMarker` PROVIDED: the base needle is that marker
 *     (the module's own start binary / installDir), so only a prior instance
 *     of the SAME module matches. Used by the supervisor's crash-restart path.
 *
 * A marker that is PROVIDED but empty or whitespace-only is treated as
 * "per-module mode with no usable needle", NOT as "omitted": it contributes no
 * needle, so the command-line arm cannot match on it. (Previously an empty
 * string silently widened to the broad `parachute` mode — the wrong direction
 * for a kill decision.) Blank needles are dropped for the same reason: a
 * single space would match virtually any command line.
 *
 * The bare module short-NAME (`vault`/`scribe`/…) is deliberately NOT a needle
 * in either mode; `short` is accepted for interface compatibility but unused.
 *
 * @param args.orphan        Pid currently holding the module's port.
 * @param args.recordedPid   The module's last recorded pid, if any.
 * @param args.short         Module short name (unused by the heuristic).
 * @param args.startCmdHint  Optional extra command-line needle.
 * @param args.ownerOfPid    Probe returning the pid's command line.
 * @param args.moduleMarker  Optional module-specific base needle (see above).
 * @returns `attributable` plus the probed `cmdline` (undefined when the probe
 *          was skipped or failed).
 */
export function orphanAttributable(args: {
  orphan: number;
  recordedPid: number | undefined;
  short: string;
  startCmdHint: string | undefined;
  ownerOfPid: OwnerProbeFn;
  /**
   * When provided, the cmdline arm requires THIS module-specific marker (start
   * binary / installDir) instead of the broad `parachute` marker. Omitted →
   * broad `parachute` (migrate). Empty/blank → no base needle at all.
   */
  moduleMarker?: string;
}): { attributable: boolean; cmdline: string | undefined } {
  const { orphan, recordedPid, startCmdHint, ownerOfPid, moduleMarker } = args;

  // Recorded-pid arm: an exact pid match needs no command-line evidence.
  if (recordedPid !== undefined && orphan === recordedPid) {
    return { attributable: true, cmdline: undefined };
  }

  const cmdline = ownerOfPid(orphan);
  if (cmdline === undefined) return { attributable: false, cmdline: undefined };

  // Only a truly OMITTED marker selects broad mode. An explicit "" stays in
  // per-module mode and is filtered out below, so it can never match.
  const baseNeedle = moduleMarker !== undefined ? moduleMarker : "parachute";
  const needles = [baseNeedle, startCmdHint ?? ""]
    .map((needle) => needle.toLowerCase())
    .filter((needle) => needle.trim().length > 0);

  const haystack = cmdline.toLowerCase();
  const attributable = needles.some((needle) => haystack.includes(needle));
  return { attributable, cmdline };
}
|
package/src/supervisor.ts
CHANGED
|
@@ -42,6 +42,7 @@ import {
|
|
|
42
42
|
rethrowIfMissing,
|
|
43
43
|
} from "@openparachute/depcheck";
|
|
44
44
|
import { defaultPidOnPort } from "./hub-control.ts";
|
|
45
|
+
import { orphanAttributable } from "./orphan-attribution.ts";
|
|
45
46
|
import { type PortListeningFn, defaultPortListening } from "./port-probe.ts";
|
|
46
47
|
|
|
47
48
|
/**
|
|
@@ -285,6 +286,28 @@ export interface SupervisorOpts {
|
|
|
285
286
|
* stub-spawner test path defaults to "unknown" (returns undefined).
|
|
286
287
|
*/
|
|
287
288
|
readonly ownerOfPid?: OwnerProbeFn;
|
|
289
|
+
/**
|
|
290
|
+
* Port-reclamation POLICY for the CRASH-RESTART path (#522 / #582). When a
|
|
291
|
+
* supervised child crashes and a foreign process now holds its declared port,
|
|
292
|
+
* `handleExit` must decide what to do with an ATTRIBUTABLE orphan (one whose
|
|
293
|
+
* command line carries the `parachute` marker or matches a recorded module
|
|
294
|
+
* pid — see `orphan-attribution.ts`):
|
|
295
|
+
* - `"adopt"` (default): adopt-kill the attributable orphan (SIGTERM →
|
|
296
|
+
* SIGKILL escalation, all idempotent) and proceed to re-spawn. This
|
|
297
|
+
* extends the migrate orphan-sweep's field-tested auto-adopt behavior to
|
|
298
|
+
* the crash-restart path — closing the recurring "port 1940 taken"
|
|
299
|
+
* crash-loop (#522) for good.
|
|
300
|
+
* - `"prompt"`: NEVER auto-kill; record the structured `port_squatter`
|
|
301
|
+
* start-error (same surface a NON-attributable squatter gets) so the
|
|
302
|
+
* operator resolves it manually.
|
|
303
|
+
*
|
|
304
|
+
* A NON-attributable holder is ALWAYS surfaced (never killed) regardless of
|
|
305
|
+
* policy — `"adopt"` only ever escalates to a kill on a holder we can
|
|
306
|
+
* attribute to this very module. Default `"adopt"`; the flag is the one-line
|
|
307
|
+
* lever to flip the whole crash-restart behavior to detect-and-prompt if the
|
|
308
|
+
* auto-kill default is later vetoed.
|
|
309
|
+
*/
|
|
310
|
+
readonly reclaimPolicy?: "adopt" | "prompt";
|
|
288
311
|
}
|
|
289
312
|
|
|
290
313
|
/**
|
|
@@ -322,6 +345,42 @@ const DEFAULT_START_READY_POLL_MS = 200;
|
|
|
322
345
|
const DEFAULT_LATE_BIND_WATCH_MS = 60_000;
|
|
323
346
|
const DEFAULT_LATE_BIND_POLL_MS = 1_000;
|
|
324
347
|
|
|
348
|
+
/**
 * Generic language runtimes that can front a custom operator startCmd (e.g.
 * `bun server.ts`, `python3 -m app`). When one of these is `cmd[0]` it is NOT a
 * module-specific marker — using it as the adopt-kill attribution needle would
 * match ANY such process on the port (over-broad kill). The per-module marker
 * is then expected to come from the module's installDir/cwd instead.
 * First-party module binaries (`parachute-vault`, `parachute-scribe`, …) are
 * not in this set. Entries are lowercase basenames without an `.exe` suffix.
 */
const GENERIC_RUNTIMES = new Set([
  "bun",
  "node",
  "nodejs",
  "deno",
  "python",
  "python2",
  "python3",
  "ruby",
  "sh",
  "bash",
  "zsh",
  "dash",
  "env",
]);

/**
 * Is `cmd0` a generic language runtime rather than a module-specific binary?
 *
 * Normalisation, in order:
 *   1. take the basename, splitting on BOTH `/` and `\` (the `.exe` handling
 *      targets Windows, where paths use backslashes — splitting on `/` alone
 *      left `C:\tools\bun.exe` unrecognised);
 *   2. lowercase;
 *   3. strip a trailing `.exe`;
 *   4. additionally try the name with a trailing version suffix removed, so
 *      versioned binaries such as `python3.11` or `node18` are recognised.
 *
 * Erring toward "generic" is the conservative direction: the caller then uses
 * a more specific marker instead of the runtime name.
 *
 * @param cmd0 First element of a start command (bare name or a path).
 * @returns true when the basename is a known generic runtime.
 */
function isGenericRuntime(cmd0: string): boolean {
  const base = (cmd0.split(/[\\/]/).pop() ?? cmd0).toLowerCase().replace(/\.exe$/, "");
  if (GENERIC_RUNTIMES.has(base)) return true;
  // `python3.11` → `python`, `node18` → `node`; no-op when there is no suffix.
  const unversioned = base.replace(/\d+(?:\.\d+)*$/, "");
  return unversioned.length > 0 && GENERIC_RUNTIMES.has(unversioned);
}
|
|
383
|
+
|
|
325
384
|
/**
|
|
326
385
|
* Bounded, line-oriented ring buffer (§6.5). Holds the most-recent lines of a
|
|
327
386
|
* module's output up to `maxBytes`; pushing past the cap drops whole lines
|
|
@@ -406,6 +465,12 @@ export class Supervisor {
|
|
|
406
465
|
// opt in by injecting `pidOnPort` / `ownerOfPid`.
|
|
407
466
|
pidOnPort: opts.pidOnPort ?? (isProductionPath ? defaultPidOnPort : () => undefined),
|
|
408
467
|
ownerOfPid: opts.ownerOfPid ?? (isProductionPath ? defaultOwnerOfPid : () => undefined),
|
|
468
|
+
// Crash-restart port-reclamation policy (#522 / #582). Default "adopt"
|
|
469
|
+
// everywhere (production + tests) — the migrate precedent already
|
|
470
|
+
// auto-kills attributable orphans, and the attribution check is
|
|
471
|
+
// conservative. The flag exists so a future veto of auto-kill is a
|
|
472
|
+
// one-line "prompt" flip.
|
|
473
|
+
reclaimPolicy: opts.reclaimPolicy ?? "adopt",
|
|
409
474
|
};
|
|
410
475
|
}
|
|
411
476
|
|
|
@@ -467,13 +532,13 @@ export class Supervisor {
|
|
|
467
532
|
// spawning — the operator sees the offending pid + cmdline + a copy-paste
|
|
468
533
|
// recovery in `status` / the SPA. Detection only: we never kill someone
|
|
469
534
|
// else's process (it may be the operator's unrelated dev server).
|
|
470
|
-
const squatter = this.
|
|
535
|
+
const squatter = this.checkPortSquatter(entry);
|
|
471
536
|
if (squatter) {
|
|
472
537
|
entry.state = {
|
|
473
538
|
...entry.state,
|
|
474
539
|
status: "crashed",
|
|
475
540
|
pid: undefined,
|
|
476
|
-
startError: squatter,
|
|
541
|
+
startError: this.portSquatterError(entry, squatter),
|
|
477
542
|
};
|
|
478
543
|
return entry.state;
|
|
479
544
|
}
|
|
@@ -532,9 +597,14 @@ export class Supervisor {
|
|
|
532
597
|
* replaced) and must not vouch for whoever now holds the port. An entry with
|
|
533
598
|
* no `proc` (never spawned) contributes no pid either.
|
|
534
599
|
*/
|
|
535
|
-
private supervisedPids(): Set<number> {
|
|
600
|
+
private supervisedPids(exclude?: ModuleEntry): Set<number> {
|
|
536
601
|
const pids = new Set<number>();
|
|
537
602
|
for (const entry of this.modules.values()) {
|
|
603
|
+
// The just-crashed entry on the `handleExit` path is still `running`
|
|
604
|
+
// (status hasn't been updated yet) with `entry.proc.pid` pointing at the
|
|
605
|
+
// now-DEAD child — it must not vouch for whoever holds the port (the same
|
|
606
|
+
// N1 stale-pid hazard, here for an exiting-but-not-yet-restated child).
|
|
607
|
+
if (exclude !== undefined && entry === exclude) continue;
|
|
538
608
|
if (entry.state.status !== "running" && entry.state.status !== "starting") continue;
|
|
539
609
|
const pid = entry.proc?.pid;
|
|
540
610
|
if (typeof pid === "number" && pid > 0) pids.add(pid);
|
|
@@ -543,28 +613,52 @@ export class Supervisor {
|
|
|
543
613
|
}
|
|
544
614
|
|
|
545
615
|
/**
 * Pre-spawn port-squatter PROBE. Pure detection: it records nothing and
 * signals nothing; the caller decides what to do with the answer.
 *
 * Reads the module's declared port from `entry.req.env.PORT`, asks
 * `pidOnPort` who holds it, and reports that holder unless it is one of the
 * supervisor's own live children (`supervisedPids`).
 *
 * Returns undefined when:
 *   - no usable port is declared (missing, non-numeric, or <= 0);
 *   - `pidOnPort` yields undefined (port free, or detection unavailable);
 *   - the holder is a pid the supervisor currently vouches for.
 *
 * @param entry                The module whose declared port is probed.
 * @param excludeCrashingEntry When true, `entry` itself is left out of the
 *        "ours" set. The crash-restart path passes true because the crashing
 *        entry still carries its dead child's pid and must not vouch for
 *        whoever now holds the port.
 * @returns `{ port, holder, cmdline }` for a foreign holder (`cmdline` is the
 *          `ownerOfPid` result and may be undefined), otherwise undefined.
 */
private checkPortSquatter(
  entry: ModuleEntry,
  excludeCrashingEntry = false,
): { port: number; holder: number; cmdline: string | undefined } | undefined {
  const declared = entry.req.env?.PORT;
  const port = declared ? Number(declared) : Number.NaN;
  const hasUsablePort = Number.isFinite(port) && port > 0;
  if (!hasUsablePort) return undefined; // No declared port.

  const holder = this.opts.pidOnPort(port);
  if (holder === undefined) return undefined; // Port free, or detection unavailable.

  // Optionally drop the probed entry from the set of pids we vouch for.
  const skipped = excludeCrashingEntry ? entry : undefined;
  const ownPids = this.supervisedPids(skipped);
  if (ownPids.has(holder)) return undefined;

  const cmdline = this.opts.ownerOfPid(holder);
  return { port, holder, cmdline };
}
|
|
566
651
|
|
|
567
|
-
|
|
652
|
+
/**
|
|
653
|
+
* Build the structured, actionable `port_squatter` start-error from a probe
|
|
654
|
+
* result (#581). Shared by `start()` and the NON-attributable / "prompt"
|
|
655
|
+
* branch of `handleExit` so the wire shape stays identical.
|
|
656
|
+
*/
|
|
657
|
+
private portSquatterError(
|
|
658
|
+
entry: ModuleEntry,
|
|
659
|
+
squatter: { port: number; holder: number; cmdline: string | undefined },
|
|
660
|
+
): ModuleStartError {
|
|
661
|
+
const { port, holder, cmdline } = squatter;
|
|
568
662
|
const who = cmdline ? `pid ${holder} (${cmdline})` : `pid ${holder}`;
|
|
569
663
|
const short = entry.req.short;
|
|
570
664
|
return {
|
|
@@ -576,6 +670,163 @@ export class Supervisor {
|
|
|
576
670
|
};
|
|
577
671
|
}
|
|
578
672
|
|
|
673
|
+
/**
 * Reclaim a module's port from an already-attributed orphan on the
 * crash-restart path. Best-effort: no signal failure is ever propagated.
 *
 * Sequence:
 *   1. send SIGTERM via `killFn`; if that throws (e.g. the pid is already gone
 *      or cannot be signalled) stop here — a still-held port will surface as
 *      an ordinary crash on the re-spawn;
 *   2. sleep for `restartDelayMs` so the orphan can release its listener
 *      (reusing the restart delay avoids introducing a new knob);
 *   3. if `pidOnPort(port)` still reports the SAME pid, send SIGKILL.
 *
 * The re-probe in step 3 is deliberately not re-attributed: escalation only
 * happens when the same pid still holds the same port. The remaining window
 * (pid exits and its number is recycled onto a new holder of this port during
 * the sleep) is accepted as negligible by the original design.
 *
 * @param port   The module's declared port.
 * @param holder Pid previously attributed to this module.
 */
private async adoptKillOrphanOnPort(port: number, holder: number): Promise<void> {
  // Returns whether the signal was delivered without `killFn` throwing.
  const trySignal = (sig: "SIGTERM" | "SIGKILL"): boolean => {
    try {
      this.opts.killFn(holder, sig);
      return true;
    } catch {
      // ESRCH (already gone) or EPERM (can't signal) — best-effort.
      return false;
    }
  };

  if (!trySignal("SIGTERM")) return;

  // Grace period before escalating.
  await this.opts.sleep(this.opts.restartDelayMs);

  // Port released, or now held by someone else: nothing left to escalate.
  if (this.opts.pidOnPort(port) !== holder) return;

  trySignal("SIGKILL");
}
|
|
711
|
+
|
|
712
|
+
/**
 * Crash-restart squatter resolution. Called when a foreign process holds the
 * crashed module's port.
 *
 * Returns:
 *   - `true`  → the restart loop should HALT. The entry is set to `crashed`
 *     with a structured `port_squatter` start-error; `restartsInWindow` is not
 *     modified here (the module is port-blocked, not crash-looping). Happens
 *     for every holder under the `"prompt"` policy and for any
 *     NON-attributable holder under `"adopt"`.
 *   - `false` → an ATTRIBUTABLE orphan was adopt-killed (best-effort) under
 *     the `"adopt"` policy; the caller falls through to its normal restart.
 *
 * Attribution uses `orphanAttributable` in its PER-MODULE mode: the orphan's
 * command line must carry this module's own marker (`moduleMarkerFor`), so a
 * sibling module's process or an operator's unrelated process is surfaced,
 * never killed.
 *
 * When NO module marker can be derived, only the recorded-pid arm is
 * consulted. Passing an undefined marker to `orphanAttributable` would select
 * its BROAD `parachute` mode and re-open the cross-module kill this method
 * exists to prevent, so that case is decided locally instead.
 *
 * @param entry    The crashed module.
 * @param squatter Probe result describing the foreign port holder.
 * @param exitCode Exit code of the crashed child, stored as `lastExitCode`
 *                 when the squatter is surfaced.
 */
private async handleCrashRestartSquatter(
  entry: ModuleEntry,
  squatter: { port: number; holder: number; cmdline: string | undefined },
  exitCode: number | null,
): Promise<boolean> {
  const { port, holder } = squatter;
  const short = entry.req.short;

  const recordSquatterError = (): true => {
    entry.state = {
      ...entry.state,
      status: "crashed",
      pid: undefined,
      lastExitCode: exitCode,
      // NB: restartsInWindow is left as-is — we deliberately do NOT push a
      // crash stamp for a port-blocked module (it didn't crash).
      startError: this.portSquatterError(entry, squatter),
    };
    return true;
  };

  // Policy gate: "prompt" never auto-kills — surface every squatter.
  if (this.opts.reclaimPolicy === "prompt") {
    this.opts.output(
      `[supervisor] ${short} crashed; port ${port} held by pid ${holder} (reclaim policy "prompt") — surfacing instead of adopting.\n`,
    );
    return recordSquatterError();
  }

  // "adopt": adopt-kill only an ATTRIBUTABLE orphan.
  const recordedPid = entry.proc?.pid;
  const moduleMarker = this.moduleMarkerFor(entry);
  const verdict: { attributable: boolean; cmdline: string | undefined } =
    moduleMarker === undefined
      ? // No per-module marker: recorded-pid arm only, never the broad needle.
        { attributable: recordedPid !== undefined && holder === recordedPid, cmdline: undefined }
      : orphanAttributable({
          orphan: holder,
          recordedPid,
          short,
          startCmdHint: undefined,
          ownerOfPid: this.opts.ownerOfPid,
          moduleMarker,
        });
  const { attributable, cmdline } = verdict;

  if (!attributable) {
    const desc = cmdline ?? squatter.cmdline ?? "command line unavailable";
    this.opts.output(
      `[supervisor] ${short} crashed; port ${port} held by an unrelated process (pid ${holder}, ${desc}) — refusing to kill it; surfacing.\n`,
    );
    return recordSquatterError();
  }

  // Attributable orphan under "adopt": reclaim the port, then let the caller
  // restart. If the kill doesn't free the port, the re-spawn fails as a
  // normal crash and the restart budget eventually halts the loop.
  this.opts.output(
    `[supervisor] ${short} crashed; port ${port} held by an attributable orphan (pid ${holder}${cmdline ? `, ${cmdline}` : ""}) — adopting + killing it before restart.\n`,
  );
  await this.adoptKillOrphanOnPort(port, holder);
  return false;
}
|
|
798
|
+
|
|
799
|
+
/**
 * Pick the module-specific command-line marker used for per-module adopt-kill
 * attribution.
 *
 * Preference order:
 *   1. the start binary `req.cmd[0]` (e.g. `parachute-vault`, or an absolute
 *      path to it) — but only when `isGenericRuntime` says it is NOT a generic
 *      runtime. A custom start command such as `bun server.ts` has `bun` at
 *      `cmd[0]`, and using that as the needle would match any bun process;
 *   2. otherwise the module's `req.cwd`, when non-empty;
 *   3. otherwise undefined.
 *
 * The FULL `cmd[0]` is returned, never the bare short name (`vault`), which
 * is too loose for a kill decision.
 *
 * NOTE(review): `orphanAttributable` treats an omitted marker as its broad
 * `parachute` mode, so callers should handle an undefined result explicitly
 * rather than forwarding it.
 *
 * @param entry The module whose marker is wanted.
 * @returns The marker string, or undefined when none can be derived.
 */
private moduleMarkerFor(entry: ModuleEntry): string | undefined {
  const startBinary = entry.req.cmd[0];
  if (startBinary) {
    // A non-empty, module-specific binary is the most distinctive token.
    if (!isGenericRuntime(startBinary)) return startBinary;
  }

  // Generic runtime (or no binary): fall back to the install directory.
  const installDir = entry.req.cwd;
  return installDir ? installDir : undefined;
}
|
|
829
|
+
|
|
579
830
|
/**
|
|
580
831
|
* Poll the module's port until it binds or `startReadyMs` elapses (§6.5).
|
|
581
832
|
* Skipped when the gate is disabled (stub-spawner test path) or the request
|
|
@@ -813,6 +1064,26 @@ export class Supervisor {
|
|
|
813
1064
|
return;
|
|
814
1065
|
}
|
|
815
1066
|
|
|
1067
|
+
// Crash-restart port reconciliation (#522 / #582). Before counting this
|
|
1068
|
+
// crash and re-spawning, check whether the module's declared port is now
|
|
1069
|
+
// held by a process the supervisor doesn't own. The `start()` squatter
|
|
1070
|
+
// check (#581) only runs on the operator-initiated path; the crash-restart
|
|
1071
|
+
// loop bypassed it, so a foreign process that grabbed the port between the
|
|
1072
|
+
// crash and the auto-restart kept the module EADDRINUSE-crash-looping into a bare
|
|
1073
|
+
// `crashed` with no clue why (#582), and a leftover-autostart orphan from a
|
|
1074
|
+
// prior instance re-took the port forever (#522). `excludeCrashingEntry`
|
|
1075
|
+
// drops the just-crashed child's stale pid from the "ours" set (N1).
|
|
1076
|
+
const squatter = this.checkPortSquatter(entry, /* excludeCrashingEntry */ true);
|
|
1077
|
+
if (squatter) {
|
|
1078
|
+
const handled = await this.handleCrashRestartSquatter(entry, squatter, exitCode);
|
|
1079
|
+
// `handled` true → we surfaced a structured error and halted the loop
|
|
1080
|
+
// WITHOUT counting this against the crash budget (the module didn't crash
|
|
1081
|
+
// — a foreign process is blocking its port). `false` → we adopt-killed an
|
|
1082
|
+
// attributable orphan and fall through to the normal restart below, which
|
|
1083
|
+
// re-spawns onto the now-freed port (counting as a normal restart).
|
|
1084
|
+
if (handled) return;
|
|
1085
|
+
}
|
|
1086
|
+
|
|
816
1087
|
const now = this.opts.now();
|
|
817
1088
|
// Drop crashes older than the window before counting.
|
|
818
1089
|
const cutoff = now - this.opts.restartWindowMs;
|