npm - @openparachute/hub - Versions diffs - 0.7.4-rc.4 → 0.7.4-rc.5 - Mend

@openparachute/hub 0.7.4-rc.4 → 0.7.4-rc.5

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (8)

package/package.json +1 -1
package/src/__tests__/api-hub-upgrade.test.ts +59 -3
package/src/__tests__/cloudflare-connector-service.test.ts +3 -1
package/src/__tests__/managed-unit.test.ts +62 -0
package/src/__tests__/supervisor.test.ts +25 -0
package/src/api-hub-upgrade.ts +38 -3
package/src/managed-unit.ts +30 -1
package/src/supervisor.ts +46 -2

package/package.json CHANGED Viewed

@@ -1,6 +1,6 @@
 {
   "name": "@openparachute/hub",
-  "version": "0.7.4-rc.4",
+  "version": "0.7.4-rc.5",
   "description": "parachute — the local hub for the Parachute ecosystem (discovery, ports, lifecycle, soon OAuth).",
   "license": "AGPL-3.0",
   "publishConfig": {

package/src/__tests__/api-hub-upgrade.test.ts CHANGED Viewed

@@ -323,8 +323,15 @@ describe("POST /api/hub/upgrade — redeploy-required short-circuit (§5.3)", ()
 });
 describe("POST /api/hub/upgrade — 409 in-flight guard (concurrent-upgrade)", () => {
-  /** Seed the status file with a prior op in the given phase. */
-  function seedStatus(dir: string, phase: HubUpgradeStatus["phase"], opId = "prior-op"): void {
+  /** Seed the status file with a prior op in the given phase. `startedAt`
+   * defaults to now (a FRESH in-flight slot); pass an old ISO string to seed a
+   * stale / abandoned slot for the #506 TTL tests. */
+  function seedStatus(
+    dir: string,
+    phase: HubUpgradeStatus["phase"],
+    opId = "prior-op",
+    startedAt: string = new Date().toISOString(),
+  ): void {
     writeHubUpgradeStatus(dir, {
       operation_id: opId,
       phase,
@@ -333,7 +340,7 @@ describe("POST /api/hub/upgrade — 409 in-flight guard (concurrent-upgrade)", (
       target_version: "0.6.3-rc.2",
       channel: "rc",
       log: [],
-      started_at: new Date().toISOString(),
+      started_at: startedAt,
     });
   }
@@ -385,6 +392,55 @@ describe("POST /api/hub/upgrade — 409 in-flight guard (concurrent-upgrade)", (
     expect(res.status).toBe(202);
     expect(spawned.length).toBe(1);
   });
+  // #506: a crashed helper leaves an in-flight slot stuck forever — without a
+  // TTL it 409-deadlocks every future upgrade. A STALE in-flight slot must be
+  // treated as abandoned so the new request proceeds.
+  for (const phase of ["pending", "running", "restarting"] as const) {
+    test(`#506: STALE in-flight slot (phase=${phase}, started 30m ago) → proceeds, not 409`, async () => {
+      const bearer = await mintBearer(harness, ["parachute:host:admin"]);
+      const thirtyMinAgo = new Date(Date.now() - 30 * 60 * 1000).toISOString();
+      seedStatus(harness.dir, phase, "crashed-op", thirtyMinAgo);
+      const { deps, spawned } = baseDeps(harness);
+      const res = await handleHubUpgrade(
+        postReq({ authorization: `Bearer ${bearer}` }, { channel: "rc" }),
+        deps,
+      );
+      // Abandoned slot freed: a fresh op took over + spawned its helper.
+      expect(res.status).toBe(202);
+      expect(spawned.length).toBe(1);
+      const status = readHubUpgradeStatus(harness.dir);
+      expect(status?.operation_id).not.toBe("crashed-op");
+      expect(spawned[0]?.operationId).toBe(status?.operation_id);
+    });
+  }
+  test("#506: FRESH in-flight slot (started just now) → still 409", async () => {
+    const bearer = await mintBearer(harness, ["parachute:host:admin"]);
+    // Just-started (well within the 15m TTL) → a real, live upgrade → 409.
+    seedStatus(harness.dir, "running", "live-op", new Date().toISOString());
+    const { deps, spawned } = baseDeps(harness);
+    const res = await handleHubUpgrade(
+      postReq({ authorization: `Bearer ${bearer}` }, { channel: "rc" }),
+      deps,
+    );
+    expect(res.status).toBe(409);
+    expect(spawned.length).toBe(0);
+    expect(readHubUpgradeStatus(harness.dir)?.operation_id).toBe("live-op");
+  });
+  test("#506: in-flight slot with a malformed started_at → treated as stale, proceeds", async () => {
+    const bearer = await mintBearer(harness, ["parachute:host:admin"]);
+    seedStatus(harness.dir, "running", "garbage-op", "not-a-date");
+    const { deps, spawned } = baseDeps(harness);
+    const res = await handleHubUpgrade(
+      postReq({ authorization: `Bearer ${bearer}` }, { channel: "rc" }),
+      deps,
+    );
+    // An unparseable timestamp must not deadlock — treat as abandoned.
+    expect(res.status).toBe(202);
+    expect(spawned.length).toBe(1);
+  });
 });
 describe("appendHubUpgradeStatus — operation_id guard (stale-helper isolation)", () => {

package/src/__tests__/cloudflare-connector-service.test.ts CHANGED Viewed

@@ -258,8 +258,10 @@ describe("installConnectorService — Linux systemd", () => {
       platform: "linux",
       getuid: () => 1000,
       userName: () => "op",
-      // enable-linger FAIL, then daemon-reload OK, enable --now OK.
+      // #528 probe: show-user → Linger=no (off, so we proceed to enable);
+      // then enable-linger FAIL, daemon-reload OK, enable --now OK.
       runResults: [
+        { code: 0, stdout: "Linger=no\n", stderr: "" },
         { code: 1, stdout: "", stderr: "Failed to enable linger" },
         { code: 0, stdout: "", stderr: "" },
         { code: 0, stdout: "", stderr: "" },

package/src/__tests__/managed-unit.test.ts CHANGED Viewed

@@ -398,6 +398,68 @@ describe("installManagedUnit — start:boolean (§7.1)", () => {
     expect(f.calls).toContainEqual(["systemctl", "--user", "daemon-reload"]);
     expect(f.calls.some((c) => c.includes("enable"))).toBe(false);
   });
+  // #528: a per-command fake `run` so the linger probe + enable-linger can return
+  // distinct results. Non-linger commands (systemctl daemon-reload / enable) all
+  // succeed; only the linger sequence is scripted via `linger`.
+  function lingerDeps(linger: {
+    probe?: ServiceCommandResult;
+    enable?: ServiceCommandResult;
+  }): FakeDepsState {
+    const ok: ServiceCommandResult = { code: 0, stdout: "", stderr: "" };
+    return fakeDeps({
+      platform: "linux",
+      getuid: () => 1000,
+      userName: () => "op",
+      run: ((cmd: readonly string[]) => {
+        // `calls` is recorded by the default run; here we record into a closure
+        // list returned alongside via the returned FakeDepsState — but fakeDeps
+        // only records in its OWN default run. So push into a shared array.
+        recorded.push([...cmd]);
+        if (cmd[0] === "loginctl" && cmd[1] === "show-user") return linger.probe ?? ok;
+        if (cmd[0] === "loginctl" && cmd[1] === "enable-linger") return linger.enable ?? ok;
+        return ok;
+      }) as ManagedUnitDeps["run"],
+    });
+  }
+  // Shared recorder for the per-command run above (fakeDeps's own `calls` array
+  // isn't populated when we override `run`).
+  let recorded: string[][] = [];
+  test("#528: linger ALREADY on → no enable attempt, no warning (false-alarm fix)", () => {
+    recorded = [];
+    const f = lingerDeps({ probe: { code: 0, stdout: "Linger=yes\n", stderr: "" } });
+    const result = installManagedUnit({
+      unit: hubUnit(f.deps),
+      deps: f.deps,
+      messages: HUB_MESSAGES,
+      start: false,
+    });
+    // Probed current state...
+    expect(recorded).toContainEqual(["loginctl", "show-user", "op", "--property=Linger"]);
+    // ...and because it's already on, did NOT try to enable it.
+    expect(recorded.some((c) => c[0] === "loginctl" && c[1] === "enable-linger")).toBe(false);
+    // ...and emitted NO scary linger warning.
+    expect(result.messages).not.toContain(HUB_MESSAGES.lingerWarning);
+  });
+  test("#528: linger OFF + enable-linger fails → warning surfaces", () => {
+    recorded = [];
+    const f = lingerDeps({
+      probe: { code: 0, stdout: "Linger=no\n", stderr: "" },
+      enable: { code: 1, stdout: "", stderr: "operation not permitted" },
+    });
+    const result = installManagedUnit({
+      unit: hubUnit(f.deps),
+      deps: f.deps,
+      messages: HUB_MESSAGES,
+      start: false,
+    });
+    // Off → did attempt to enable...
+    expect(recorded).toContainEqual(["loginctl", "enable-linger", "op"]);
+    // ...and the genuine failure warns.
+    expect(result.messages).toContain(HUB_MESSAGES.lingerWarning);
+  });
 });
 // ---------------------------------------------------------------------------

package/src/__tests__/supervisor.test.ts CHANGED Viewed

@@ -1591,6 +1591,31 @@ describe("Supervisor port-readiness + structured start-error (§6.5)", () => {
     expect(spawner.calls).toHaveLength(0);
   });
+  test("(#634) preflight non-executable binary → non_executable start-error, NO spawn", async () => {
+    const spawner = makeQueueSpawner();
+    const sup = new Supervisor({
+      spawnFn: spawner.spawn,
+      killFn: noopKill,
+      // `which` requires X_OK so it returns null for a 100644 bin...
+      which: () => null,
+      // ...but the secondary probe finds it present-but-non-executable.
+      findNonExecutable: () => "/x/vault/bin/parachute-vault",
+      portListening: async () => true,
+      startReadyMs: 50,
+      sleep: () => Promise.resolve(),
+    });
+    const state = await sup.start(reqWithPort("vault", 1940));
+    expect(state.status).toBe("crashed");
+    expect(state.startError?.error_type).toBe("non_executable");
+    expect(state.startError?.error_description).toContain(
+      "but is not executable — run chmod +x /x/vault/bin/parachute-vault",
+    );
+    // No misleading "not installed" install card, and never spawned.
+    expect(state.startError?.binary).toBe("parachute-vault");
+    expect(spawner.calls).toHaveLength(0);
+  });
   test("a clean re-start clears a prior started-but-unbound start-error", async () => {
     const first = makeFakeProc(201);
     const second = makeFakeProc(202);

package/src/api-hub-upgrade.ts CHANGED Viewed

@@ -67,6 +67,34 @@ export const HUB_UPGRADE_REQUIRED_SCOPE = "parachute:host:admin";
  */
 const IN_FLIGHT_PHASES = new Set<HubUpgradeStatus["phase"]>(["pending", "running", "restarting"]);
+/**
+ * #506: TTL for the 409 in-flight guard. The status file is single-slot, and a
+ * helper that CRASHES (OOM, killed mid-rewrite, host reboot) never reaches a
+ * terminal phase — leaving the slot stuck in `pending`/`running`/`restarting`
+ * FOREVER and 409-deadlocking every future upgrade. So: an in-flight slot whose
+ * `started_at` is older than this bound is treated as ABANDONED and the new
+ * request proceeds (overwriting the stale slot).
+ *
+ * 15 minutes — comfortably past the longest expected in-place upgrade (an
+ * `npm view` + `bun add -g` rewrite + restart is seconds-to-low-minutes even on
+ * a slow box / cold cache). A live upgrade finishing under the bound is never
+ * mistaken for abandoned; a crashed one frees the slot within 15 min instead of
+ * never. (A missing/garbage `started_at` is treated as stale → not 409, so a
+ * malformed file can't deadlock either.)
+ */
+const IN_FLIGHT_TTL_MS = 15 * 60 * 1000;
+/**
+ * Is an in-flight slot still FRESH (within the TTL), so a second POST must be
+ * rejected 409? An unparseable / missing `started_at` is treated as stale
+ * (not fresh) so a malformed file frees the slot rather than deadlocking it.
+ */
+function isInFlightFresh(existing: HubUpgradeStatus, now: Date): boolean {
+  const startedMs = Date.parse(existing.started_at);
+  if (Number.isNaN(startedMs)) return false;
+  return now.getTime() - startedMs < IN_FLIGHT_TTL_MS;
+}
 export interface SpawnHelperArgs {
   operationId: string;
   channel: "rc" | "latest";
@@ -213,7 +241,9 @@ export async function handleHubUpgrade(req: Request, deps: ApiHubUpgradeDeps): P
   const parsed = await parseBody(req);
   if (parsed instanceof Response) return parsed;
-  // ── 409 in-flight guard ────────────────────────────────────────────────────
+  const now = (deps.now ?? (() => new Date()))();
+  // ── 409 in-flight guard (TTL-bounded) ──────────────────────────────────────
   // The status file is single-slot (one hub, one upgrade). If a prior upgrade
   // is still in a non-terminal phase (pending/running/restarting), starting a
   // SECOND would overwrite its operation_id — and a still-running first helper
@@ -222,9 +252,15 @@ export async function handleHubUpgrade(req: Request, deps: ApiHubUpgradeDeps): P
   // server-side too (a second tab, a stale page, a scripted POST). Reject with
   // 409 unless the slot is free (no file) or the prior op reached a terminal
   // phase (failed / redeploy-required / succeeded).
+  //
+  // #506: BUT a non-terminal slot is only a real block while it's FRESH. A
+  // helper that crashed (OOM / killed / host reboot) leaves the slot stuck
+  // in-flight forever and would 409-deadlock every future upgrade. So an
+  // in-flight slot older than IN_FLIGHT_TTL_MS is treated as ABANDONED and the
+  // request proceeds (the seeded status below overwrites the stale slot).
   const readStatus = deps.readStatus ?? readHubUpgradeStatus;
   const existing = readStatus(deps.configDir);
-  if (existing && IN_FLIGHT_PHASES.has(existing.phase)) {
+  if (existing && IN_FLIGHT_PHASES.has(existing.phase) && isInFlightFresh(existing, now)) {
     return jsonError(
       409,
       "upgrade_in_flight",
@@ -234,7 +270,6 @@ export async function handleHubUpgrade(req: Request, deps: ApiHubUpgradeDeps): P
   const hubSrcDir = deps.hubSrcDir ?? dirname(fileURLToPath(import.meta.url));
   const env = deps.env ?? process.env;
-  const now = (deps.now ?? (() => new Date()))();
   const currentVersion = (deps.currentVersion ?? (() => defaultCurrentVersion(hubSrcDir)))();
   // Auto-detect the channel from the current version when not explicitly set —

package/src/managed-unit.ts CHANGED Viewed

@@ -454,6 +454,25 @@ function installLaunchdUnit(opts: InstallManagedUnitOpts): ManagedUnitInstallRes
   };
 }
+/**
+ * #528: is `loginctl` linger already enabled for `userName`? Best-effort probe:
+ * `loginctl show-user <user> --property=Linger` prints `Linger=yes` / `Linger=no`.
+ * Returns true ONLY on a clear `Linger=yes`; ANY ambiguity (non-zero exit, a
+ * throw, or unparseable output) returns false so the caller falls through to the
+ * enable attempt — we never SKIP enabling on a guess, only when linger is
+ * provably already on. (`show-user` of a user with no session can itself exit
+ * non-zero; treat that as "unknown → try to enable".)
+ */
+function lingerAlreadyOn(deps: ManagedUnitDeps, userName: string): boolean {
+  try {
+    const probe = deps.run(["loginctl", "show-user", userName, "--property=Linger"]);
+    if (probe.code !== 0) return false;
+    return /(^|\n)\s*Linger=yes\s*(\n|$)/i.test(probe.stdout);
+  } catch {
+    return false;
+  }
+}
 function installSystemdUnit(opts: InstallManagedUnitOpts): ManagedUnitInstallResult {
   const { unit, deps, messages } = opts;
   const start = opts.start ?? true;
@@ -490,10 +509,20 @@ function installSystemdUnit(opts: InstallManagedUnitOpts): ManagedUnitInstallRes
   // systemctl but not loginctl would propagate the spawn error out and hard-fail
   // the calling command. (Run on both start + install-without-start: linger is a
   // boot-survival nicety independent of whether we start the unit now.)
+  //
+  // #528: pre-check the CURRENT linger state before trying to enable it. When
+  // linger is ALREADY on (the common re-install / re-migrate case on a box
+  // whose owner already enabled it), `enable-linger` is a no-op we don't need —
+  // and on some systemd builds it can return non-zero even though linger is
+  // genuinely on, raising a scary "couldn't enable lingering, your hub won't
+  // survive reboot" warning that is a FALSE ALARM. So: probe first; if linger
+  // is on, skip both the enable AND the warning. Only when linger is genuinely
+  // OFF and the enable attempt then fails do we warn. This is the single-owner
+  // self-host reboot-survival happy path — keep it quiet when it's already good.
   if (!root && userName) {
     if (deps.which("loginctl") === null) {
       outMessages.push(messages.lingerWarning);
-    } else {
+    } else if (!lingerAlreadyOn(deps, userName)) {
       try {
         const linger = deps.run(["loginctl", "enable-linger", userName]);
         if (linger.code !== 0) outMessages.push(messages.lingerWarning);

package/src/supervisor.ts CHANGED Viewed

@@ -38,6 +38,7 @@ import { spawnSync } from "node:child_process";
 import {
   MissingDependencyError,
   type MissingDependencyWire,
+  NonExecutableError,
   ensureExecutable,
   rethrowIfMissing,
 } from "@openparachute/depcheck";
@@ -263,6 +264,14 @@ export interface SupervisorOpts {
    * Tests exercising the missing-binary branch inject `which: () => null`.
    */
   readonly which?: (cmd: string) => string | null;
+  /**
+   * #634 secondary-probe seam for `ensureExecutable`: when `which` returns null,
+   * walk PATH IGNORING X_OK to detect a present-but-non-executable binary (a
+   * `bin` that lost its +x bit). Production leaves this undefined so depcheck's
+   * real PATH walk runs (gated to the real `Bun.which`); tests inject it to
+   * exercise the non-executable preflight branch through a stubbed `which`.
+   */
+  readonly findNonExecutable?: (binary: string) => string | null;
   /**
    * Pre-spawn port-squatter detection (#580 item 4). Returns the pid holding a
    * TCP LISTEN on the module's port, or undefined when the port is free /
@@ -427,8 +436,11 @@ export class LogRingBuffer {
  * boot and threads it into the API handlers.
  */
 export class Supervisor {
-  private readonly opts: Required<Omit<SupervisorOpts, "spawnFn">> & {
+  private readonly opts: Required<Omit<SupervisorOpts, "spawnFn" | "findNonExecutable">> & {
     readonly spawnFn: SpawnFn;
+    // Optional #634 probe seam — undefined on the production path so depcheck's
+    // own real PATH walk runs (gated to the real `Bun.which`).
+    readonly findNonExecutable?: (binary: string) => string | null;
   };
   private readonly modules = new Map<string, ModuleEntry>();
@@ -459,6 +471,9 @@ export class Supervisor {
       lateBindWatchMs: opts.lateBindWatchMs ?? DEFAULT_LATE_BIND_WATCH_MS,
       lateBindPollMs: opts.lateBindPollMs ?? DEFAULT_LATE_BIND_POLL_MS,
       which: opts.which ?? (isProductionPath ? Bun.which : () => "/stub/bin/preflight-skipped"),
+      // #634: undefined on production so depcheck's real PATH walk runs (its
+      // gate keys on the real `Bun.which`); tests inject it to drive the branch.
+      findNonExecutable: opts.findNonExecutable,
       // Squatter detection (#580 item 4): real probes on the production path;
       // the stub-spawner test path defaults to "no squatter / unknown owner" so
       // fake-proc tests (which never hold a real port) aren't tripped. Tests
@@ -509,7 +524,9 @@ export class Supervisor {
     const startBinary = req.cmd[0];
     if (startBinary) {
       try {
-        ensureExecutable(startBinary, { which: this.opts.which });
+        const ensureOpts: Parameters<typeof ensureExecutable>[1] = { which: this.opts.which };
+        if (this.opts.findNonExecutable) ensureOpts.findNonExecutable = this.opts.findNonExecutable;
+        ensureExecutable(startBinary, ensureOpts);
       } catch (err) {
         if (err instanceof MissingDependencyError) {
           entry.state = {
@@ -520,6 +537,18 @@ export class Supervisor {
           };
           return entry.state;
         }
+        // #634: the binary IS present but not executable (a `bin` that lost its
+        // +x bit). Record the actionable chmod hint instead of a misleading
+        // "not installed" — and never throw out of `start`.
+        if (err instanceof NonExecutableError) {
+          entry.state = {
+            ...entry.state,
+            status: "crashed",
+            pid: undefined,
+            startError: nonExecutableStartError(err, this.opts.now),
+          };
+          return entry.state;
+        }
         throw err;
       }
     }
@@ -1243,6 +1272,21 @@ function startErrorFromWire(wire: MissingDependencyWire, now: () => number): Mod
   };
 }
+/**
+ * #634: map a `NonExecutableError` (binary present on PATH but not +x) onto the
+ * `ModuleStartError` shape. `error_type: "non_executable"` so a UI can branch;
+ * `error_description` is the formatted `chmod +x` block. No install card — the
+ * fix is a permission flip, not a reinstall.
+ */
+function nonExecutableStartError(err: NonExecutableError, now: () => number): ModuleStartError {
+  return {
+    error_type: err.errorType,
+    error_description: err.message,
+    binary: err.binary,
+    at: new Date(now()).toISOString(),
+  };
+}
 /**
  * Production group-aware kill (hub#88). Sends `signal` to the entire process
  * group rooted at `pid` (the negative-pid syscall) so a wrapped startCmd's