@openparachute/hub 0.6.2 → 0.6.3-rc.2

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (58)
  1. package/README.md +87 -35
  2. package/package.json +1 -1
  3. package/src/__tests__/api-hub-upgrade.test.ts +690 -0
  4. package/src/__tests__/api-modules-ops.test.ts +359 -3
  5. package/src/__tests__/api-modules.test.ts +54 -0
  6. package/src/__tests__/expose-cloudflare.test.ts +163 -72
  7. package/src/__tests__/expose-off-auto.test.ts +26 -1
  8. package/src/__tests__/expose.test.ts +260 -240
  9. package/src/__tests__/hub-control.test.ts +1 -242
  10. package/src/__tests__/hub-server.test.ts +64 -0
  11. package/src/__tests__/hub-unit.test.ts +574 -0
  12. package/src/__tests__/init.test.ts +219 -2
  13. package/src/__tests__/lifecycle.test.ts +416 -1448
  14. package/src/__tests__/managed-unit.test.ts +575 -0
  15. package/src/__tests__/migrate-cutover.test.ts +840 -0
  16. package/src/__tests__/migrate-offer.test.ts +240 -0
  17. package/src/__tests__/migrate.test.ts +132 -0
  18. package/src/__tests__/module-ops-client.test.ts +556 -0
  19. package/src/__tests__/port-probe.test.ts +23 -0
  20. package/src/__tests__/setup-wizard.test.ts +130 -0
  21. package/src/__tests__/status-supervisor.test.ts +504 -0
  22. package/src/__tests__/status.test.ts +157 -708
  23. package/src/__tests__/supervisor.test.ts +471 -6
  24. package/src/__tests__/upgrade.test.ts +351 -5
  25. package/src/api-hub-upgrade.ts +384 -0
  26. package/src/api-hub.ts +2 -1
  27. package/src/api-modules-ops.ts +221 -0
  28. package/src/api-modules.ts +18 -2
  29. package/src/cli.ts +97 -12
  30. package/src/cloudflare/connector-service.ts +117 -322
  31. package/src/commands/expose-cloudflare.ts +63 -71
  32. package/src/commands/expose-supervisor.ts +247 -0
  33. package/src/commands/expose.ts +59 -48
  34. package/src/commands/init.ts +225 -12
  35. package/src/commands/lifecycle.ts +455 -816
  36. package/src/commands/migrate-cutover.ts +837 -0
  37. package/src/commands/migrate.ts +71 -2
  38. package/src/commands/serve-boot.ts +71 -25
  39. package/src/commands/status.ts +535 -235
  40. package/src/commands/upgrade.ts +100 -2
  41. package/src/help.ts +128 -68
  42. package/src/hub-control.ts +23 -162
  43. package/src/hub-server.ts +39 -0
  44. package/src/hub-unit.ts +735 -0
  45. package/src/hub-upgrade-helper.ts +306 -0
  46. package/src/hub-upgrade-mode.ts +209 -0
  47. package/src/hub-upgrade-status.ts +150 -0
  48. package/src/managed-unit.ts +692 -0
  49. package/src/migrate-offer.ts +186 -0
  50. package/src/module-ops-client.ts +457 -0
  51. package/src/port-probe.ts +50 -0
  52. package/src/process-state.ts +19 -3
  53. package/src/setup-wizard.ts +80 -1
  54. package/src/supervisor.ts +389 -38
  55. package/web/ui/dist/assets/index-D_6AFvZy.js +61 -0
  56. package/web/ui/dist/assets/{index-BiBlvEaj.css → index-mz8XcVPP.css} +1 -1
  57. package/web/ui/dist/index.html +2 -2
  58. package/web/ui/dist/assets/index-CIN3mnmf.js +0 -61
@@ -0,0 +1,840 @@
1
import { describe, expect, test } from "bun:test";
import { mkdtempSync, rmSync, writeFileSync } from "node:fs";
import { tmpdir } from "node:os";
import { join } from "node:path";
import {
  type CutoverDeps,
  type WriteUnitResult,
  cutoverToSupervised,
  defaultCutoverDeps,
  teardownHubUnit,
} from "../commands/migrate-cutover.ts";
import type { HubUnitDeps, InstallAndStartHubUnitResult } from "../hub-unit.ts";
import type { ManagedUnitRemoveResult } from "../managed-unit.ts";
import { writePid } from "../process-state.ts";
15
+
16
/**
 * ALL destructive-path tests run in a FRESH sandbox `PARACHUTE_HOME` with stubbed
 * seams — NO real Bun.spawn / systemctl / launchctl / lsof / process kills, NO
 * touching the operator's `~/.parachute`. The sandbox dir is only used to seed
 * services.json + pidfiles (real `writePid`) so the detector + per-module stop
 * read genuine on-disk state; everything that would touch a live process is a
 * fake.
 */
interface Harness {
  /** Freshly created temp directory used as the config dir for one test. */
  configDir: string;
  /** Path of `services.json` inside `configDir` (not created by the harness). */
  manifestPath: string;
  /** Recursively removes `configDir`; safe to call when it is already gone. */
  cleanup: () => void;
}

/**
 * Create a per-test sandbox directory under the OS temp dir.
 *
 * @returns The sandbox paths plus a `cleanup` callback the test must invoke
 *   (typically in `finally`).
 */
function makeHarness(): Harness {
  const configDir = mkdtempSync(join(tmpdir(), "pcli-cutover-"));
  const manifestPath = join(configDir, "services.json");
  // `force: true` keeps cleanup from throwing when the directory no longer exists.
  const cleanup = (): void => rmSync(configDir, { recursive: true, force: true });
  return { configDir, manifestPath, cleanup };
}
38
+
39
/**
 * Write a `services.json` manifest containing one entry per requested service.
 *
 * Each entry gets `paths: ["/<name>"]`, `health: "/health"` and
 * `version: "1.0.0"` alongside the supplied name and port.
 *
 * The write is SYNCHRONOUS on purpose: this helper returns `void` and callers
 * read the manifest immediately afterwards, so a promise-returning write that is
 * never awaited could race the reader and would drop any write error.
 *
 * @param manifestPath Destination file path.
 * @param services Name/port pairs to record (an empty array yields `{ "services": [] }`).
 */
function seedManifest(manifestPath: string, services: Array<{ name: string; port: number }>): void {
  const full = services.map((s) => ({
    name: s.name,
    port: s.port,
    paths: [`/${s.name}`],
    health: "/health",
    version: "1.0.0",
  }));
  writeFileSync(manifestPath, JSON.stringify({ services: full }));
}
49
+
50
/** A trace-recording set of cutover deps, with sane defaults for the happy path. */
interface FakeCutover {
  /** Stubbed dependency seams handed to the code under test. */
  deps: Partial<CutoverDeps>;
  /** Ordered log of the side-effecting fake calls (e.g. "stopHub", "kill <pid> <signal>"). */
  trace: string[];
  /** The stubbed hub-unit deps that are also embedded in `deps`. */
  hubUnitDeps: HubUnitDeps;
}
56
+
57
/**
 * Build inert hub-unit deps: a Linux, uid-1000 host where every command
 * "succeeds" with empty output, no file exists or is written, and health/port
 * probes always answer `false`.
 */
function fakeHubUnitDeps(): HubUnitDeps {
  return {
    platform: "linux",
    getuid: () => 1000,
    homeDir: () => "/home/op",
    userName: () => "op",
    // `bun` resolves into the fake home; any other binary resolves under /usr/bin.
    which: (b) => (b === "bun" ? "/home/op/.bun/bin/bun" : `/usr/bin/${b}`),
    run: () => ({ code: 0, stdout: "", stderr: "" }),
    writeFile: () => {},
    removeFile: () => {},
    readFile: () => undefined,
    exists: () => false,
    probeHealth: async () => false,
    portListening: async () => false,
    sleep: async () => {},
  };
}
74
+
75
/**
 * Build a hermetic set of cutover deps backed by a small mutable "world".
 *
 * The fakes read and mutate the world so tests can model state transitions, and
 * the side-effecting ones append to `trace` so tests can assert ordering.
 *
 * @param over Per-test overrides; spread LAST so they replace the defaults.
 * @returns The deps, the shared trace array, and the hub-unit deps stub. The
 *   world is stashed on the deps object and retrieved with `getWorld`.
 */
function makeFakeCutover(over: Partial<CutoverDeps> = {}): FakeCutover {
  const trace: string[] = [];
  const hubUnitDeps = fakeHubUnitDeps();
  // Mutable "world" the fakes read so we can model state transitions (a port
  // that frees after the stop, a unit that becomes installed after a write).
  const world = {
    unitInstalled: false,
    hubHealthy: false,
    /** Ports currently "listening" (held). The verify-free step polls this. */
    listening: new Set<number>(),
    /** Ports an orphan (lsof) reports a pid on. */
    orphanPorts: new Map<number, number>(),
    alivePids: new Set<number>(),
  };
  const deps: Partial<CutoverDeps> = {
    hubUnitDeps,
    alive: (pid) => world.alivePids.has(pid),
    kill: (pid, signal) => {
      trace.push(`kill ${pid} ${signal}`);
      // Any signal removes the process from the world. It does NOT free a port;
      // tests that need that wrap `kill` themselves.
      world.alivePids.delete(pid);
    },
    pidOnPort: (port) => world.orphanPorts.get(port),
    // Default ownership probe: unattributable (returns undefined) — hermetic, no
    // real `ps` shell-out. Tests that exercise the ownership check override it.
    ownerOfPid: () => undefined,
    portListening: async (port) => world.listening.has(port),
    stopHub: async () => {
      trace.push("stopHub");
      // Stopping the hub frees the hub port (1939) in the fake world.
      world.listening.delete(1939);
      return true;
    },
    isHubUnitInstalled: () => world.unitInstalled,
    probeHealth: async () => world.hubHealthy,
    sleep: async () => {},
    writeUnitWithoutStarting: (): WriteUnitResult => {
      trace.push("writeUnit");
      world.unitInstalled = true;
      return { written: true, outcome: "installed", messages: ["wrote unit (not started)"] };
    },
    installAndStartHubUnit: async (): Promise<InstallAndStartHubUnitResult> => {
      trace.push("startUnit");
      world.hubHealthy = true;
      return {
        outcome: "started",
        port: 1939,
        install: { outcome: "installed", kind: "systemd-user", messages: [] },
        messages: ["started unit"],
      };
    },
    ...over,
  };
  // Expose the world via a side property for tests that want to manipulate it.
  (deps as { _world?: typeof world })._world = world;
  return { deps, trace, hubUnitDeps };
}
131
+
132
/**
 * Retrieve the mutable fake "world" stashed on a deps object under `_world`.
 *
 * @param deps A deps object carrying a `_world` side property.
 * @returns The same world object the fakes read, so mutations are visible to them.
 * @throws Error("no world") when `deps` carries no `_world` property.
 */
function getWorld(deps: Partial<CutoverDeps>): {
  unitInstalled: boolean;
  hubHealthy: boolean;
  listening: Set<number>;
  orphanPorts: Map<number, number>;
  alivePids: Set<number>;
} {
  // `_world` is not part of CutoverDeps, hence the single narrowing cast.
  const w = (deps as { _world?: ReturnType<typeof getWorld> })._world;
  if (!w) throw new Error("no world");
  return w;
}
143
+
144
// Happy path: a detached hub + one pidfile-tracked module are migrated, and the
// trace proves the write → stop → start ordering.
describe("cutoverToSupervised — happy path (§7.1)", () => {
  test("detached box with running hub + modules migrates end-to-end", async () => {
    const h = makeHarness();
    try {
      seedManifest(h.manifestPath, [{ name: "vault", port: 1940 }]);
      const fc = makeFakeCutover();
      const w = getWorld(fc.deps);
      // The detached hub + vault are alive + bound to their ports. vault is
      // tracked by a real pidfile (pid 5555); stopping it frees port 1940.
      w.listening.add(1939);
      w.listening.add(1940);
      w.alivePids.add(5555);
      writePid("vault", 5555, h.configDir);
      const baseKill = fc.deps.kill;
      fc.deps.kill = (pid, signal) => {
        baseKill?.(pid, signal);
        if (pid === 5555) w.listening.delete(1940);
      };
      const result = await cutoverToSupervised({
        configDir: h.configDir,
        manifestPath: h.manifestPath,
        deps: fc.deps,
        log: () => {},
        pollMs: 0,
      });
      expect(result.outcome).toBe("migrated");
      // ORDERING: write the unit BEFORE stopping detached, verify ports free
      // BEFORE starting the unit, start AFTER stop. The trace proves the order.
      const writeIdx = fc.trace.indexOf("writeUnit");
      const stopIdx = fc.trace.indexOf("stopHub");
      const startIdx = fc.trace.indexOf("startUnit");
      expect(writeIdx).toBeGreaterThanOrEqual(0);
      expect(writeIdx).toBeLessThan(stopIdx); // unit written before stop
      expect(stopIdx).toBeLessThan(startIdx); // detached stopped before unit start
      // The hub is now supervised + healthy.
      expect(w.hubHealthy).toBe(true);
      expect(w.unitInstalled).toBe(true);
    } finally {
      h.cleanup();
    }
  });

  test("verify-ports-free runs before start (start never races a held port)", async () => {
    const h = makeHarness();
    try {
      seedManifest(h.manifestPath, [{ name: "vault", port: 1940 }]);
      const fc = makeFakeCutover();
      const w = getWorld(fc.deps);
      w.listening.add(1939);
      w.listening.add(1940);
      w.alivePids.add(5555);
      writePid("vault", 5555, h.configDir);
      const baseKill = fc.deps.kill;
      fc.deps.kill = (pid, signal) => {
        baseKill?.(pid, signal);
        if (pid === 5555) w.listening.delete(1940);
      };
      // Record the world's listening state at the instant startUnit is called.
      let listeningAtStart: number[] = [];
      const baseStart = fc.deps.installAndStartHubUnit;
      fc.deps.installAndStartHubUnit = async (opts) => {
        listeningAtStart = [...w.listening];
        return baseStart?.(opts) as Promise<InstallAndStartHubUnitResult>;
      };
      const result = await cutoverToSupervised({
        configDir: h.configDir,
        manifestPath: h.manifestPath,
        deps: fc.deps,
        log: () => {},
        pollMs: 0,
      });
      expect(result.outcome).toBe("migrated");
      // By the time the unit starts, both ports must be free.
      expect(listeningAtStart).toEqual([]);
    } finally {
      h.cleanup();
    }
  });
});
224
+
225
// Re-running the cutover: a fully migrated box is untouched, a half-finished one resumes.
describe("cutoverToSupervised — idempotency + resumability", () => {
  test("already-supervised box is a no-op (unit installed + /health answers)", async () => {
    const h = makeHarness();
    try {
      const fc = makeFakeCutover();
      const w = getWorld(fc.deps);
      w.unitInstalled = true;
      w.hubHealthy = true;
      const result = await cutoverToSupervised({
        configDir: h.configDir,
        manifestPath: h.manifestPath,
        deps: fc.deps,
        log: () => {},
        pollMs: 0,
      });
      expect(result.outcome).toBe("already-migrated");
      // No destructive step ran.
      expect(fc.trace).not.toContain("stopHub");
      expect(fc.trace).not.toContain("startUnit");
      expect(fc.trace).not.toContain("writeUnit");
    } finally {
      h.cleanup();
    }
  });

  test("resumes a partial cutover (unit written but hub not healthy)", async () => {
    const h = makeHarness();
    try {
      seedManifest(h.manifestPath, []);
      const fc = makeFakeCutover();
      const w = getWorld(fc.deps);
      // Unit on disk from a prior aborted run, but the hub never came up.
      w.unitInstalled = true;
      w.hubHealthy = false;
      const result = await cutoverToSupervised({
        configDir: h.configDir,
        manifestPath: h.manifestPath,
        deps: fc.deps,
        log: () => {},
        pollMs: 0,
      });
      // NOT a no-op — it resumes (re-write idempotent, stop no-ops, start).
      expect(result.outcome).toBe("migrated");
      expect(fc.trace).toContain("writeUnit");
      expect(fc.trace).toContain("startUnit");
    } finally {
      h.cleanup();
    }
  });
});
275
+
276
// Orphan sweep: a module port held by a pid that no pidfile records.
describe("cutoverToSupervised — §7.2 orphan sweep", () => {
  test("a process bound to a module port (stale pidfile) is adopted + killed before start", async () => {
    const h = makeHarness();
    try {
      seedManifest(h.manifestPath, [{ name: "vault", port: 1940 }]);
      // The orphan IS the stale vault module (its command line is attributable),
      // so the ownership check (MUST-FIX 2) adopts + kills it.
      const fc = makeFakeCutover({
        ownerOfPid: (pid) =>
          pid === 4242
            ? "bun /home/op/.bun/install/global/@openparachute/vault/server.ts --port 1940"
            : undefined,
      });
      const w = getWorld(fc.deps);
      // vault's pidfile is gone, but an orphan PID 4242 still holds port 1940.
      w.listening.add(1939);
      w.listening.add(1940);
      w.orphanPorts.set(1940, 4242);
      w.alivePids.add(4242);
      // When the orphan is killed, free its port (model the OS releasing it).
      const baseKill = fc.deps.kill;
      fc.deps.kill = (pid, signal) => {
        baseKill?.(pid, signal);
        if (pid === 4242) w.listening.delete(1940);
      };
      const result = await cutoverToSupervised({
        configDir: h.configDir,
        manifestPath: h.manifestPath,
        deps: fc.deps,
        log: () => {},
        pollMs: 0,
      });
      expect(result.outcome).toBe("migrated");
      // The orphan was killed (adopted from lsof, not from a pidfile).
      expect(fc.trace).toContain("kill 4242 SIGTERM");
    } finally {
      h.cleanup();
    }
  });
});
316
+
317
// Failure outcomes: each test pins which steps did / did not run (via the trace)
// and that the result messages carry the expected recovery hint.
describe("cutoverToSupervised — fail-safe recovery states", () => {
  test("port-stuck: a port that won't free fails with the unit written-but-not-started", async () => {
    const h = makeHarness();
    try {
      seedManifest(h.manifestPath, [{ name: "vault", port: 1940 }]);
      const fc = makeFakeCutover();
      const w = getWorld(fc.deps);
      w.listening.add(1939);
      w.listening.add(1940);
      // stopHub frees 1939, but NOTHING frees 1940 (no orphan to adopt, the
      // detached stop didn't release it) — the port stays held forever.
      const result = await cutoverToSupervised({
        configDir: h.configDir,
        manifestPath: h.manifestPath,
        deps: fc.deps,
        log: () => {},
        pollMs: 0,
        timeoutMs: 0,
      });
      expect(result.outcome).toBe("port-stuck");
      // FAIL-SAFE: the unit WAS written (recoverable), and the unit was NOT
      // started (we never raced the held port).
      expect(fc.trace).toContain("writeUnit");
      expect(fc.trace).not.toContain("startUnit");
      expect(w.unitInstalled).toBe(true);
      expect(result.messages.join("\n")).toContain("re-run");
    } finally {
      h.cleanup();
    }
  });

  test("write-failed (bun unresolvable): bails BEFORE stopping anything", async () => {
    const h = makeHarness();
    try {
      seedManifest(h.manifestPath, [{ name: "vault", port: 1940 }]);
      const fc = makeFakeCutover({
        writeUnitWithoutStarting: () => ({
          written: false,
          outcome: "fallback",
          cause: "write-failed",
          messages: ["cannot build hub unit: 'bun' not found on PATH"],
        }),
      });
      const w = getWorld(fc.deps);
      w.listening.add(1939);
      const result = await cutoverToSupervised({
        configDir: h.configDir,
        manifestPath: h.manifestPath,
        deps: fc.deps,
        log: () => {},
        pollMs: 0,
      });
      // A write failure is distinct from no-manager (MUST-FIX NIT) — a manager
      // may exist; we just couldn't compose/write the unit.
      expect(result.outcome).toBe("write-failed");
      // FAIL-SAFE: nothing was stopped — we never reached step 3.
      expect(fc.trace).not.toContain("stopHub");
      expect(fc.trace).not.toContain("startUnit");
    } finally {
      h.cleanup();
    }
  });

  test("start-failed: unit start degrades → recoverable (unit written, re-runnable)", async () => {
    const h = makeHarness();
    try {
      seedManifest(h.manifestPath, []);
      const fc = makeFakeCutover({
        installAndStartHubUnit: async () => ({
          outcome: "no-manager",
          port: 1939,
          install: { outcome: "fallback", messages: ["manager gone"] },
          messages: ["manager gone"],
        }),
      });
      const w = getWorld(fc.deps);
      w.listening.add(1939);
      const result = await cutoverToSupervised({
        configDir: h.configDir,
        manifestPath: h.manifestPath,
        deps: fc.deps,
        log: () => {},
        pollMs: 0,
      });
      expect(result.outcome).toBe("start-failed");
      expect(result.messages.join("\n")).toContain("re-run");
      // The unit was written (recoverable); we stopped detached but the unit is
      // on disk so a re-run is clean.
      expect(w.unitInstalled).toBe(true);
    } finally {
      h.cleanup();
    }
  });

  test("verify-timeout: unit starts but /health never answers → recoverable", async () => {
    const h = makeHarness();
    try {
      seedManifest(h.manifestPath, []);
      const fc = makeFakeCutover({
        // The unit "starts" but the world's hubHealthy stays false.
        installAndStartHubUnit: async () => ({
          outcome: "started",
          port: 1939,
          install: { outcome: "installed", kind: "systemd-user", messages: [] },
          messages: ["started"],
        }),
      });
      const w = getWorld(fc.deps);
      w.listening.add(1939);
      // Note: probeHealth reads world.hubHealthy which stays false.
      const result = await cutoverToSupervised({
        configDir: h.configDir,
        manifestPath: h.manifestPath,
        deps: fc.deps,
        log: () => {},
        pollMs: 0,
        timeoutMs: 0,
      });
      expect(result.outcome).toBe("verify-timeout");
      expect(result.messages.join("\n")).toContain("logs hub");
    } finally {
      h.cleanup();
    }
  });
});
442
+
443
// teardownHubUnit with an injected `remove` seam — no real service manager is touched.
describe("teardownHubUnit (§7.4)", () => {
  test("removes the hub unit (idempotent success path)", () => {
    let removeArgs: { launchdLabel: string; systemdUnitName: string } | undefined;
    const log: string[] = [];
    const res = teardownHubUnit({
      log: (l) => log.push(l),
      remove: (opts): ManagedUnitRemoveResult => {
        // Capture the identifiers the teardown asked the remover to act on.
        removeArgs = { launchdLabel: opts.launchdLabel, systemdUnitName: opts.systemdUnitName };
        return { removed: true, messages: [opts.removedSystemdMessage(opts.systemdUnitName)] };
      },
    });
    expect(res.removed).toBe(true);
    expect(removeArgs?.launchdLabel).toBe("computer.parachute.hub");
    expect(removeArgs?.systemdUnitName).toBe("parachute-hub.service");
    // Surfaces the fallback hint.
    expect(log.join("\n")).toContain("parachute serve");
  });

  test("no unit installed → no-op, friendly message", () => {
    const log: string[] = [];
    const res = teardownHubUnit({
      log: (l) => log.push(l),
      remove: (): ManagedUnitRemoveResult => ({ removed: false, messages: [] }),
    });
    expect(res.removed).toBe(false);
    expect(log.join("\n")).toContain("nothing to tear down");
  });
});
471
+
472
// ===========================================================================
// MUST-FIX 1 — group-aware kill (hub#88 re-opened in the cutover).
//
// Modules are spawned `detached: true` → the recorded pid is a process-GROUP
// leader. A wrapper startCmd (`pnpm exec tsx server.ts`) leaves the real server
// as a GRANDCHILD in that group. The cutover used the BARE-pid kill, which
// signals only the wrapper → the tsx grandchild survives, keeps holding the
// module port → `waitPortFree` times out → `port-stuck` on the FIRST run. These
// tests pin the fix: `defaultCutoverDeps.kill` is GROUP-aware (negative pid).
// ===========================================================================

/**
 * Report whether a process with the given pid currently exists.
 *
 * Sends signal 0, which performs the existence/permission check without
 * delivering a signal.
 *
 * @param pid Process id to probe.
 * @returns `true` when the probe succeeds or fails with EPERM (the process
 *   exists but belongs to someone else); `false` for any other failure.
 */
function pidAlive(pid: number): boolean {
  try {
    process.kill(pid, 0);
    return true;
  } catch (err) {
    // EPERM means "exists, but we may not signal it" — that is still alive.
    return (err as NodeJS.ErrnoException | undefined)?.code === "EPERM";
  }
}
491
+
492
// Pins `defaultCutoverDeps.kill`: it must signal the process GROUP (negative
// pid), falling back to the bare pid only when the group send reports ESRCH.
describe("MUST-FIX 1: group-aware kill targets the process GROUP (negative pid)", () => {
  type KillCall = { pid: number; signal: NodeJS.Signals | number };
  type KillSpy = (pid: number, signal?: NodeJS.Signals | number) => ReturnType<typeof process.kill>;

  /**
   * Run `body` with `process.kill` replaced by `spy`, then put back the ORIGINAL
   * function object (not a bound copy) so the global is left exactly as found.
   */
  function withKillSpy(spy: KillSpy, body: () => void): void {
    const originalKill = process.kill;
    try {
      // biome-ignore lint/suspicious/noExplicitAny: test spy on process.kill
      (process as any).kill = spy;
      body();
    } finally {
      // biome-ignore lint/suspicious/noExplicitAny: restore
      (process as any).kill = originalKill;
    }
  }

  test("defaultCutoverDeps.kill signals -pid (the whole group), with an ESRCH bare-pid fallback", () => {
    const calls: KillCall[] = [];
    withKillSpy(
      (pid, signal) => {
        calls.push({ pid, signal: signal ?? 0 });
        // Make the group send (negative pid) succeed so no fallback is taken.
        return true as unknown as ReturnType<typeof process.kill>;
      },
      () => defaultCutoverDeps.kill(4242, "SIGTERM"),
    );
    // The group send fired with the NEGATIVE pid — not the bare pid.
    expect(calls).toEqual([{ pid: -4242, signal: "SIGTERM" }]);
  });

  test("ESRCH on the group send falls back to a bare-pid signal (legacy pidfile)", () => {
    const calls: KillCall[] = [];
    withKillSpy(
      (pid, signal) => {
        calls.push({ pid, signal: signal ?? 0 });
        if (pid < 0) {
          const err = new Error("no such process") as NodeJS.ErrnoException;
          err.code = "ESRCH";
          throw err;
        }
        return true as unknown as ReturnType<typeof process.kill>;
      },
      () => defaultCutoverDeps.kill(777, "SIGKILL"),
    );
    // First the group send (ESRCH), then the bare-pid fallback.
    expect(calls).toEqual([
      { pid: -777, signal: "SIGKILL" },
      { pid: 777, signal: "SIGKILL" },
    ]);
  });

  test("LOAD-BEARING: a wrapper-startCmd grandchild holding the module port is reaped → cutover migrates (NOT port-stuck)", async () => {
    const h = makeHarness();
    // -1 sentinels: if the spawn itself throws, cleanup must neither leak the
    // sandbox nor signal pid/group 1 (`-(-1)`), so both kills are guarded on > 0.
    let leaderPid = -1;
    let grandchildPid = -1;
    try {
      // Spawn a REAL detached wrapper that backgrounds a long-lived grandchild and
      // prints the grandchild's pid — faithfully modeling `pnpm exec tsx server.ts`
      // (a wrapper whose tsx grandchild is the thing actually holding the port). The
      // wrapper is its own process-group leader (detached: true → pid == pgid).
      const proc = Bun.spawn(["sh", "-c", "sleep 30 & echo $!; wait"], {
        stdio: ["ignore", "pipe", "ignore"],
        detached: true,
        env: process.env,
      });
      leaderPid = proc.pid; // group leader (the "wrapper")
      // One read is enough for the single short pid line. Do not wait for EOF:
      // the wrapper blocks in `wait`, so the stream stays open.
      const { value } = await proc.stdout.getReader().read();
      grandchildPid = Number.parseInt(
        new TextDecoder().decode(value ?? new Uint8Array()).trim(),
        10,
      );
      expect(Number.isInteger(grandchildPid)).toBe(true);
      expect(pidAlive(grandchildPid)).toBe(true);

      seedManifest(h.manifestPath, [{ name: "vault", port: 1940 }]);
      // vault's pidfile points at the WRAPPER leader (what `parachute start`
      // recorded); the GRANDCHILD is what actually holds port 1940.
      writePid("vault", leaderPid, h.configDir);

      const fc = makeFakeCutover({
        // PRODUCTION group-aware kill + alive (the fix under test) — NOT the
        // harness fakes. Everything else stays stubbed (no real unit / health).
        kill: defaultCutoverDeps.kill,
        alive: defaultCutoverDeps.alive,
        // The module port reads as HELD while the grandchild is still alive, and
        // FREE the instant the grandchild dies — real-process backed, so only a
        // correct group-kill (which reaps the grandchild) frees it.
        portListening: async (port) => (port === 1940 ? pidAlive(grandchildPid) : false),
        // No lsof orphan beyond the pidfile path — the stop reaps the group.
        pidOnPort: () => undefined,
      });
      const w = getWorld(fc.deps);
      w.listening.add(1939); // hub port held; stopHub frees it.

      const result = await cutoverToSupervised({
        configDir: h.configDir,
        manifestPath: h.manifestPath,
        deps: fc.deps,
        log: () => {},
        pollMs: 1,
        timeoutMs: 2000,
      });

      // The grandchild was reaped by the GROUP kill → port 1940 freed → the
      // cutover proceeded. A bare-pid kill would have left the grandchild alive,
      // 1940 held, and the outcome `port-stuck` (the hub#88 footgun).
      expect(result.outcome).toBe("migrated");
      expect(pidAlive(grandchildPid)).toBe(false);
    } finally {
      // Defensive cleanup — both should already be gone on the happy path.
      if (leaderPid > 0) {
        try {
          process.kill(-leaderPid, "SIGKILL");
        } catch {
          // group already gone
        }
      }
      if (grandchildPid > 0) {
        try {
          process.kill(grandchildPid, "SIGKILL");
        } catch {
          // grandchild already gone
        }
      }
      h.cleanup();
    }
  });
});
610
+
611
// ===========================================================================
// MUST-FIX 2 — ownership check in the per-module orphan sweep (no blind-kill).
//
// `sweepOrphanOnPort` must NOT kill whatever holds a declared MODULE port — only
// processes plausibly attributable to that parachute module. An operator's own
// dev server squatting a module port must survive (warning + port-stuck), not be
// nuked.
// ===========================================================================

describe("MUST-FIX 2: orphan-sweep ownership check on module ports", () => {
  test("an orphan attributable to the module (cmdline mentions it) is adopted + killed", async () => {
    const h = makeHarness();
    try {
      seedManifest(h.manifestPath, [{ name: "vault", port: 1940 }]);
      const fc = makeFakeCutover({
        // The orphan's command line looks like a parachute vault process.
        ownerOfPid: (pid) =>
          pid === 4242
            ? "bun /home/op/.bun/install/global/@openparachute/vault/server.ts"
            : undefined,
      });
      const w = getWorld(fc.deps);
      w.listening.add(1939);
      w.listening.add(1940);
      w.orphanPorts.set(1940, 4242);
      w.alivePids.add(4242);
      const baseKill = fc.deps.kill;
      fc.deps.kill = (pid, signal) => {
        baseKill?.(pid, signal);
        if (pid === 4242) w.listening.delete(1940);
      };
      const result = await cutoverToSupervised({
        configDir: h.configDir,
        manifestPath: h.manifestPath,
        deps: fc.deps,
        log: () => {},
        pollMs: 0,
      });
      expect(result.outcome).toBe("migrated");
      // Attributable → adopted + killed.
      expect(fc.trace).toContain("kill 4242 SIGTERM");
    } finally {
      h.cleanup();
    }
  });

  test("an UNATTRIBUTABLE process on a module port is NOT killed → warning + port-stuck", async () => {
    const h = makeHarness();
    try {
      seedManifest(h.manifestPath, [{ name: "vault", port: 1940 }]);
      const fc = makeFakeCutover({
        // The orphan is an operator's own dev server — nothing parachute-ish.
        ownerOfPid: (pid) => (pid === 7777 ? "node /Users/op/my-app/dev-server.js" : undefined),
      });
      const w = getWorld(fc.deps);
      w.listening.add(1939);
      w.listening.add(1940);
      // No vault pidfile (so no recorded-pid match) — the squatter is 7777.
      w.orphanPorts.set(1940, 7777);
      w.alivePids.add(7777);
      const log: string[] = [];
      const result = await cutoverToSupervised({
        configDir: h.configDir,
        manifestPath: h.manifestPath,
        deps: fc.deps,
        log: (l) => log.push(l),
        pollMs: 0,
        timeoutMs: 0,
      });
      // The cutover refused to nuke the unrelated process → the port stays held →
      // port-stuck (the operator resolves it).
      expect(result.outcome).toBe("port-stuck");
      // The squatter was NEVER signalled.
      expect(fc.trace).not.toContain("kill 7777 SIGTERM");
      expect(fc.trace).not.toContain("kill 7777 SIGKILL");
      expect(w.alivePids.has(7777)).toBe(true);
      // A clear warning names the unrelated process + refuses.
      const out = log.join("\n");
      expect(out).toContain("held by an unrelated process");
      expect(out).toContain("7777");
      expect(out).toContain("dev-server.js");
    } finally {
      h.cleanup();
    }
  });

  test("an orphan whose command line is unreadable is treated as UNATTRIBUTABLE", async () => {
    const h = makeHarness();
    try {
      seedManifest(h.manifestPath, [{ name: "vault", port: 1940 }]);
      const fc = makeFakeCutover({
        // ps failed / pid gone → no cmdline, and the pid doesn't match a record.
        ownerOfPid: () => undefined,
      });
      const w = getWorld(fc.deps);
      w.listening.add(1939);
      w.listening.add(1940);
      w.orphanPorts.set(1940, 5050);
      w.alivePids.add(5050);
      const result = await cutoverToSupervised({
        configDir: h.configDir,
        manifestPath: h.manifestPath,
        deps: fc.deps,
        log: () => {},
        pollMs: 0,
        timeoutMs: 0,
      });
      expect(result.outcome).toBe("port-stuck");
      // Never signalled with EITHER signal, and still alive in the fake world —
      // the same three checks the sibling "not killed" tests make.
      expect(fc.trace).not.toContain("kill 5050 SIGTERM");
      expect(fc.trace).not.toContain("kill 5050 SIGKILL");
      expect(w.alivePids.has(5050)).toBe(true);
    } finally {
      h.cleanup();
    }
  });

  test("a process whose cmdline contains ONLY the bare module short-name (no 'parachute') is NOT attributed → not killed → port-stuck", async () => {
    // Regression guard for the #507 re-review nit: the bare short-name needle was
    // dropped because it false-attributed unrelated processes. The canonical
    // footgun: a CI `gitlab-runner` squatting the `runner` module's port. Its
    // cmdline contains the bare short-name "runner" but NOT "parachute", so under
    // the old loose match the cutover would have KILLED it. Attribution now needs
    // the `parachute` marker (or a recorded-pid / start-cmd-hint match).
    const h = makeHarness();
    try {
      seedManifest(h.manifestPath, [{ name: "runner", port: 1943 }]);
      const fc = makeFakeCutover({
        // A genuine, unrelated CI runner — cmdline carries the bare short-name
        // "runner" but nothing parachute-ish.
        ownerOfPid: (pid) =>
          pid === 8888
            ? "/usr/local/bin/gitlab-runner run --config /etc/gitlab-runner/config.toml"
            : undefined,
      });
      const w = getWorld(fc.deps);
      w.listening.add(1939);
      w.listening.add(1943);
      // No runner pidfile (no recorded-pid match) — the squatter is 8888.
      w.orphanPorts.set(1943, 8888);
      w.alivePids.add(8888);
      const log: string[] = [];
      const result = await cutoverToSupervised({
        configDir: h.configDir,
        manifestPath: h.manifestPath,
        deps: fc.deps,
        log: (l) => log.push(l),
        pollMs: 0,
        timeoutMs: 0,
      });
      // Refused to kill → port stays held → port-stuck.
      expect(result.outcome).toBe("port-stuck");
      expect(fc.trace).not.toContain("kill 8888 SIGTERM");
      expect(fc.trace).not.toContain("kill 8888 SIGKILL");
      expect(w.alivePids.has(8888)).toBe(true);
      const out = log.join("\n");
      expect(out).toContain("held by an unrelated process");
      expect(out).toContain("8888");
      expect(out).toContain("gitlab-runner");
    } finally {
      h.cleanup();
    }
  });
});
772
+
773
// ===========================================================================
// MUST-FIX NIT — distinguish no-manager from write-failed in the cutover.
// ===========================================================================

describe("MUST-FIX NIT: no-manager vs write-failed are distinct outcomes", () => {
  test("bun-not-found (write-failed cause) → write-failed outcome with an accurate message", async () => {
    const h = makeHarness();
    try {
      seedManifest(h.manifestPath, []);
      const fc = makeFakeCutover({
        writeUnitWithoutStarting: () => ({
          written: false,
          outcome: "fallback",
          cause: "write-failed",
          messages: ["cannot build hub unit: 'bun' not found on PATH"],
        }),
      });
      const w = getWorld(fc.deps);
      w.listening.add(1939);
      const result = await cutoverToSupervised({
        configDir: h.configDir,
        manifestPath: h.manifestPath,
        deps: fc.deps,
        log: () => {},
        pollMs: 0,
      });
      expect(result.outcome).toBe("write-failed");
      // NOT the "no service manager" message — names bun / the write failure.
      const out = result.messages.join("\n");
      expect(out).toContain("Could not write the hub unit file");
      expect(out).not.toContain("This host has no service manager");
      // FAIL-SAFE: nothing stopped (still before step 3).
      expect(fc.trace).not.toContain("stopHub");
      expect(fc.trace).not.toContain("startUnit");
    } finally {
      h.cleanup();
    }
  });

  test("genuine no-manager (no systemd/launchd) → no-manager outcome + service-manager message", async () => {
    const h = makeHarness();
    try {
      seedManifest(h.manifestPath, []);
      const fc = makeFakeCutover({
        writeUnitWithoutStarting: () => ({
          written: false,
          outcome: "fallback",
          cause: "no-manager",
          messages: ["no service manager (launchctl) found on this host"],
        }),
      });
      const w = getWorld(fc.deps);
      w.listening.add(1939);
      const result = await cutoverToSupervised({
        configDir: h.configDir,
        manifestPath: h.manifestPath,
        deps: fc.deps,
        log: () => {},
        pollMs: 0,
      });
      expect(result.outcome).toBe("no-manager");
      expect(result.messages.join("\n")).toContain("This host has no service manager");
      expect(fc.trace).not.toContain("stopHub");
    } finally {
      h.cleanup();
    }
  });
});