@openparachute/hub 0.6.4 → 0.6.5-rc.1

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
package/src/hub-unit.ts CHANGED
@@ -85,7 +85,9 @@ export interface HubUnitDeps extends ManagedUnitDeps {
85
85
  * `null` when the hub doesn't answer at all (connection-refused / timeout).
86
86
  * Production uses a bounded `fetch`; tests inject a deterministic stub.
87
87
  */
88
- probeHealthVersion: (port: number) => Promise<{ ok: boolean; version?: string } | null>;
88
+ probeHealthVersion: (
89
+ port: number,
90
+ ) => Promise<{ ok: boolean; version?: string; db?: string } | null>;
89
91
  /** TCP connect-probe for readiness polling (reuses `defaultPortListening`). */
90
92
  portListening: PortListeningFn;
91
93
  /** Sleep between readiness polls (tests pin to 0). */
@@ -118,27 +120,48 @@ async function defaultProbeHealth(port: number): Promise<boolean> {
118
120
  */
119
121
  async function defaultProbeHealthVersion(
120
122
  port: number,
121
- ): Promise<{ ok: boolean; version?: string } | null> {
123
+ ): Promise<{ ok: boolean; version?: string; db?: string } | null> {
122
124
  try {
123
125
  const res = await fetch(`http://127.0.0.1:${port}/health`, {
124
126
  signal: AbortSignal.timeout(1500),
125
127
  });
126
128
  let version: string | undefined;
129
+ let db: string | undefined;
127
130
  try {
128
131
  const body = (await res.json()) as unknown;
129
- if (body && typeof body === "object" && "version" in body) {
132
+ if (body && typeof body === "object") {
130
133
  const v = (body as { version?: unknown }).version;
131
134
  if (typeof v === "string" && v.length > 0) version = v;
135
+ // `db` liveness verdict (#594): "ok" / "error: <class>" / "unconfigured".
136
+ // Threaded through so the adoption probe can treat a db-error hub as
137
+ // needing a restart even when its version matches.
138
+ const d = (body as { db?: unknown }).db;
139
+ if (typeof d === "string" && d.length > 0) db = d;
132
140
  }
133
141
  } catch {
134
- // Non-JSON body → no version. Leave `version` undefined (→ mismatch).
142
+ // Non-JSON body → no version/db. Leave undefined (→ mismatch / unknown db).
135
143
  }
136
- return version !== undefined ? { ok: res.ok, version } : { ok: res.ok };
144
+ const out: { ok: boolean; version?: string; db?: string } = { ok: res.ok };
145
+ if (version !== undefined) out.version = version;
146
+ if (db !== undefined) out.db = db;
147
+ return out;
137
148
  } catch {
138
149
  return null;
139
150
  }
140
151
  }
141
152
 
153
+ /**
154
+ * True when a `/health` `db` field reports a non-recoverable liveness fault
155
+ * (#594) — anything starting with "error:" (e.g. "error: fatal" from the
156
+ * dead-handle field repro). "ok" and "unconfigured" are not faults: a
157
+ * pre-wizard hub with no DB rows still reports a working handle. A missing
158
+ * `db` field (an older hub that predates #594) reads as "unknown → don't
159
+ * treat as a fault" so we never restart a hub merely for lacking the field.
160
+ */
161
+ function healthReportsDbFault(db: string | undefined): boolean {
162
+ return typeof db === "string" && db.startsWith("error:");
163
+ }
164
+
142
165
  export const defaultHubUnitDeps: HubUnitDeps = {
143
166
  ...defaultManagedUnitDeps,
144
167
  probeHealth: defaultProbeHealth,
@@ -510,13 +533,22 @@ export async function ensureHubVersionMatches(
510
533
  }
511
534
 
512
535
  const runningVersion = probe.version;
513
- if (runningVersion === installedVersion) {
514
- // Exactly today's behavior — versions agree, no extra restart.
536
+ const dbFault = healthReportsDbFault(probe.db);
537
+ if (runningVersion === installedVersion && !dbFault) {
538
+ // Versions agree AND the DB handle is live — today's behavior, no restart.
515
539
  return { outcome: "match", runningVersion, installedVersion, messages: [] };
516
540
  }
517
541
 
518
- // Mismatch (includes the no-`version`-field very-old-hub case, where `runningVersion` is undefined).
519
- const runningLabel = runningVersion ?? "an older version (no version field)";
542
+ // From here we know the running hub needs a restart: EITHER its version is
543
+ // stale (the #590 zombie-adoption case) OR it's reporting a dead DB handle
544
+ // (#594 — a hub that adopted-as-version-match but whose state dir was deleted
545
+ // under it; /health stays 200 while every DB route 500s). Both run through
546
+ // the same restart-once machinery. `runningLabel` describes whichever fault
547
+ // we're acting on so the operator sees an accurate reason.
548
+ const versionMismatch = runningVersion !== installedVersion;
549
+ const runningLabel = versionMismatch
550
+ ? (runningVersion ?? "an older version (no version field)")
551
+ : `${runningVersion} with a dead database handle (${probe.db})`;
520
552
 
521
553
  // Is this hub one we can restart through the manager? If there's no manager,
522
554
  // or no unit installed, the running hub is a legacy detached pid / a dev
@@ -556,45 +588,60 @@ export async function ensureHubVersionMatches(
556
588
  outcome: "restarted",
557
589
  runningVersion: v,
558
590
  installedVersion,
559
- messages: [`✓ hub unit restarted; now running ${installedVersion}.`],
591
+ messages: [`✓ hub unit restarted; now running ${installedVersion} with a live database.`],
560
592
  });
561
- const stillMismatchedResult = (last: string | undefined): EnsureHubVersionMatchesResult => {
562
- const reports = last ? ` (reports ${last})` : "";
593
+ const stillMismatchedResult = (
594
+ last: { version?: string; db?: string } | undefined,
595
+ ): EnsureHubVersionMatchesResult => {
596
+ const lastVersion = last?.version;
597
+ const reports = lastVersion ? ` (reports ${lastVersion})` : "";
598
+ const dbStillBad = healthReportsDbFault(last?.db);
563
599
  return {
564
600
  outcome: "still-mismatched",
565
- ...(last !== undefined ? { runningVersion: last } : {}),
601
+ ...(lastVersion !== undefined ? { runningVersion: lastVersion } : {}),
566
602
  installedVersion,
567
- messages: [
568
- `⚠ restarted the hub unit, but it is still not reporting ${installedVersion}${reports}.`,
569
- " This can happen with a bun-linked checkout on a feature branch whose package.json version trails the running code.",
570
- ` Continuing — verify with \`parachute status\` / \`curl http://127.0.0.1:${port}/health\` if the hub should be on a specific version.`,
571
- ],
603
+ messages: dbStillBad
604
+ ? [
605
+ `⚠ restarted the hub unit, but its database still reports a fault (${last?.db}).`,
606
+ " The state directory may still be missing or the database file corrupted.",
607
+ ` Check it with \`curl http://127.0.0.1:${port}/health\` and ensure ~/.parachute exists.`,
608
+ ]
609
+ : [
610
+ `⚠ restarted the hub unit, but it is still not reporting ${installedVersion}${reports}.`,
611
+ " This can happen with a bun-linked checkout on a feature branch whose package.json version trails the running code.",
612
+ ` Continuing — verify with \`parachute status\` / \`curl http://127.0.0.1:${port}/health\` if the hub should be on a specific version.`,
613
+ ],
572
614
  };
573
615
  };
574
616
 
575
- // Re-probe `/health` until the running version matches the installed version
576
- // or the readiness budget elapses. Restart-loop guard: we restart AT MOST
577
- // once — if it still mismatches after this single restart (e.g. a bun-linked
578
- // checkout on a branch), we warn + continue rather than looping.
617
+ // A re-probe counts as "healed" only when the version matches AND the DB
618
+ // handle is live — a restart that came back on the right version but with a
619
+ // still-dead handle hasn't actually fixed the #594 fault.
620
+ const probeHealed = (p: { version?: string; db?: string } | null): boolean =>
621
+ p !== null && p.version === installedVersion && !healthReportsDbFault(p.db);
622
+
623
+ // Re-probe `/health` until the hub is healed or the readiness budget elapses.
624
+ // Restart-loop guard: we restart AT MOST once — if it still mismatches /
625
+ // db-faults after this single restart (e.g. a bun-linked checkout on a
626
+ // branch, or a still-missing state dir), we warn + continue rather than loop.
579
627
  const deadline = Date.now() + readyTimeoutMs;
580
628
  for (;;) {
581
629
  const after = await deps.probeHealthVersion(port);
582
- if (after !== null && after.version === installedVersion) {
630
+ if (probeHealed(after)) {
583
631
  return restartedResult(installedVersion);
584
632
  }
585
633
  if (Date.now() >= deadline) {
586
- // Report the last-observed (still-stale) version if the hub came back.
587
- return stillMismatchedResult(after?.version ?? runningVersion);
634
+ return stillMismatchedResult(after ?? { version: runningVersion });
588
635
  }
589
636
  if (readyPollMs > 0) await deps.sleep(readyPollMs);
590
637
  else break;
591
638
  }
592
639
  // readyPollMs === 0 fast-path: one more probe, then settle.
593
640
  const finalProbe = await deps.probeHealthVersion(port);
594
- if (finalProbe !== null && finalProbe.version === installedVersion) {
641
+ if (probeHealed(finalProbe)) {
595
642
  return restartedResult(installedVersion);
596
643
  }
597
- return stillMismatchedResult(finalProbe?.version ?? runningVersion);
644
+ return stillMismatchedResult(finalProbe ?? { version: runningVersion });
598
645
  }
599
646
 
600
647
  /**