npm - @shogo-ai/worker - Versions diffs - 1.7.14 → 1.8.3 - Mend

@shogo-ai/worker 1.7.14 → 1.8.3

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (3) hide show

package/package.json +2 -2
package/src/lib/__tests__/runtime-manager-restart-cap.test.ts +310 -0
package/src/lib/runtime-manager.ts +229 -4

package/package.json CHANGED Viewed

@@ -1,6 +1,6 @@
 {
   "name": "@shogo-ai/worker",
-  "version": "1.7.14",
+  "version": "1.8.3",
   "description": "Shogo Cloud Agent Worker — run Shogo agents on your own machine (laptop, devbox, CI).",
   "license": "MIT",
   "author": "Shogo Technologies, Inc.",
@@ -42,7 +42,7 @@
     "prepublishOnly": "bun run typecheck"
   },
   "dependencies": {
-    "@shogo-ai/sdk": "workspace:*",
+    "@shogo-ai/sdk": "^1.4.1",
     "commander": "^12.1.0",
     "picocolors": "^1.1.1"
   },

package/src/lib/__tests__/runtime-manager-restart-cap.test.ts ADDED Viewed

@@ -0,0 +1,310 @@
+// SPDX-License-Identifier: MIT
+// Copyright (C) 2026 Shogo Technologies, Inc.
+/**
+ * Coverage for the {@link WorkerRuntimeManager} circuit breaker and
+ * process-group reaping introduced after a single user's project ran
+ * away with 54 consecutive `signal=SIGKILL` jetsam exits, each cycle
+ * leaking the agent-runtime's vite / preview-manager / LSP children
+ * (see main.log lines 5258–7093 for the prod reproduction).
+ *
+ * The unfixed code:
+ *   - Spawned the runtime with `detached: false` and only ever called
+ *     `proc.kill(...)` on the parent, so the kernel left the child
+ *     subprocess tree alive after each jetsam SIGKILL of the parent.
+ *     Symptom: `Force-killed leaked process N on port 37273` 10 times
+ *     in a row, EADDRINUSE on the next spawn.
+ *   - Had no max-restart cap; `handleExit` incremented forever, so
+ *     once jetsam fired the first time the worker was locked into a
+ *     ~1/minute respawn loop until the user quit the app. Symptom:
+ *     `restart #54` in the same project's log.
+ *
+ * This file pins three contracts:
+ *
+ *   1. Every non-clean exit reaps the runtime's process group via the
+ *      recorded PID (the orphan-sweep that wasn't happening before).
+ *   2. After {@link MAX_CONSECUTIVE_RESTARTS} consecutive non-clean
+ *      exits within {@link RESTART_FAILURE_WINDOW_MS}, the slot
+ *      transitions to `'failed'` and the manager stops scheduling
+ *      restarts. `ensureRunning(...)` then refuses to silently
+ *      respawn until the operator calls `resetFailure(...)`.
+ *   3. A clean exit (code=0, signal=null) does NOT increment the
+ *      failure budget — bounded chat sessions that exit normally
+ *      after their work is done should leave the slot in the same
+ *      "healthy" state they found it in.
+ */
+import { afterEach, beforeEach, describe, expect, it } from 'bun:test';
+import { WorkerRuntimeManager } from '../runtime-manager.ts';
+interface FakeProc {
+  exitCode: number | null;
+  signalCode: NodeJS.Signals | null;
+  killed: boolean;
+  pid: number;
+  kill: (signal?: NodeJS.Signals | number) => boolean;
+  once: (event: string, cb: (...args: unknown[]) => void) => FakeProc;
+  on: (event: string, cb: (...args: unknown[]) => void) => FakeProc;
+}
+function makeFakeProc(pid = 99999): FakeProc {
+  const proc: FakeProc = {
+    exitCode: null,
+    signalCode: null,
+    killed: false,
+    pid,
+    kill(_signal?: NodeJS.Signals | number) {
+      proc.killed = true;
+      return true;
+    },
+    once(_event, _cb) {
+      return proc;
+    },
+    on(_event, _cb) {
+      return proc;
+    },
+  };
+  return proc;
+}
+function insertRunningSlot(mgr: WorkerRuntimeManager, projectId: string, pid = 99999) {
+  const proc = makeFakeProc(pid);
+  const slot = {
+    projectId,
+    agentPort: 0,
+    apiServerPort: 0,
+    status: 'running' as const,
+    proc,
+    pid,
+    startedAt: Date.now(),
+    lastUsedAt: Date.now(),
+    restarts: 0,
+    consecutiveFailures: 0,
+    lastFailureAt: 0,
+    graceTimer: null,
+    restartTimer: null,
+    idleTimer: null,
+    spawnConfig: {} as never,
+    startPromise: null,
+  };
+  (mgr as unknown as { runtimes: Map<string, typeof slot> }).runtimes.set(projectId, slot);
+  return slot;
+}
+function handleExit(
+  mgr: WorkerRuntimeManager,
+  slot: ReturnType<typeof insertRunningSlot>,
+  code: number | null,
+  signal: NodeJS.Signals | null,
+): void {
+  (mgr as unknown as {
+    handleExit: (s: typeof slot, c: number | null, sg: NodeJS.Signals | null) => void;
+  }).handleExit(slot, code, signal);
+}
+const SILENT = { log: () => {}, warn: () => {}, error: () => {} } as const;
+// Capture process.kill calls so we can pin that handleExit reaps the
+// process group (negative pid argument) instead of just the parent.
+type KillCall = { target: number; signal: NodeJS.Signals | number | undefined };
+let killCalls: KillCall[] = [];
+const realProcessKill = process.kill;
+beforeEach(() => {
+  killCalls = [];
+  // Replace process.kill with a spy that ALSO doesn't actually signal
+  // anything (the fake pid 99999 might collide with a real process on
+  // the CI host).
+  (process as unknown as { kill: typeof process.kill }).kill = ((
+    pid: number,
+    signal?: NodeJS.Signals | number,
+  ) => {
+    killCalls.push({ target: pid, signal });
+    return true;
+  }) as typeof process.kill;
+});
+afterEach(() => {
+  (process as unknown as { kill: typeof process.kill }).kill = realProcessKill;
+});
+describe('WorkerRuntimeManager process-group reaping on non-clean exit', () => {
+  it('non-clean exit fires SIGKILL at the negative pid (the process group)', () => {
+    const mgr = new WorkerRuntimeManager({ logger: SILENT });
+    const slot = insertRunningSlot(mgr, 'proj-reap', 12345);
+    handleExit(mgr, slot, null, 'SIGKILL');
+    const pgKills = killCalls.filter((c) => c.target === -12345);
+    if (process.platform === 'win32') {
+      // killProcessGroup is a no-op on Windows; the inner preview-manager
+      // owns its own port-scoped reaper there.
+      expect(pgKills).toHaveLength(0);
+    } else {
+      expect(pgKills).toHaveLength(1);
+      expect(pgKills[0]!.signal).toBe('SIGKILL');
+    }
+  });
+  it('clean exit (code=0, signal=null) does NOT reap the process group', () => {
+    const mgr = new WorkerRuntimeManager({ logger: SILENT });
+    const slot = insertRunningSlot(mgr, 'proj-clean', 12346);
+    handleExit(mgr, slot, 0, null);
+    expect(killCalls.filter((c) => c.target === -12346)).toHaveLength(0);
+    // Slot is removed from the map on clean exit.
+    const runtimes = (mgr as unknown as { runtimes: Map<string, unknown> }).runtimes;
+    expect(runtimes.has('proj-clean')).toBe(false);
+  });
+  it('handleExit clears the recorded pid so a follow-up call is a no-op', () => {
+    const mgr = new WorkerRuntimeManager({ logger: SILENT });
+    const slot = insertRunningSlot(mgr, 'proj-pid-clear', 12347);
+    handleExit(mgr, slot, null, 'SIGKILL');
+    expect(slot.pid).toBeNull();
+    killCalls = [];
+    handleExit(mgr, slot, null, 'SIGKILL');
+    // No pid recorded → no further -pid kills should be issued.
+    expect(killCalls.filter((c) => c.target === -12347)).toHaveLength(0);
+  });
+});
+describe('WorkerRuntimeManager circuit breaker on consecutive failures', () => {
+  it('parks the slot in failed after MAX_CONSECUTIVE_RESTARTS consecutive jetsam SIGKILLs', () => {
+    const mgr = new WorkerRuntimeManager({ logger: SILENT });
+    const slot = insertRunningSlot(mgr, 'proj-jetsam', 22222);
+    // 8 is the constant in runtime-manager.ts; if it changes the
+    // assertion below has to follow. Hard-coding the expected count
+    // here surfaces the contract change at review time.
+    for (let i = 0; i < 8; i++) {
+      handleExit(mgr, slot, null, 'SIGKILL');
+    }
+    expect(slot.status).toBe('failed');
+    expect(slot.consecutiveFailures).toBe(8);
+    expect(slot.restartTimer).toBeNull();
+    expect(slot.lastError).toMatch(/Circuit breaker tripped/);
+    // The port is released so a future resetFailure() can re-allocate
+    // from a clean state.
+    expect(slot.agentPort).toBe(0);
+    expect(slot.apiServerPort).toBe(0);
+  });
+  it('still schedules a restart on the Nth non-clean exit when N < MAX', () => {
+    const mgr = new WorkerRuntimeManager({ logger: SILENT });
+    const slot = insertRunningSlot(mgr, 'proj-retry', 22223);
+    for (let i = 0; i < 3; i++) {
+      handleExit(mgr, slot, null, 'SIGKILL');
+    }
+    expect(slot.status).toBe('restarting');
+    expect(slot.consecutiveFailures).toBe(3);
+    expect(slot.restartTimer).not.toBeNull();
+    // Clean up the scheduled timer so the test runner exits.
+    if (slot.restartTimer) {
+      clearTimeout(slot.restartTimer);
+      slot.restartTimer = null;
+    }
+  });
+  it('clean exit between failures does NOT increment the failure budget', () => {
+    const mgr = new WorkerRuntimeManager({ logger: SILENT });
+    const slot = insertRunningSlot(mgr, 'proj-mixed', 22224);
+    handleExit(mgr, slot, null, 'SIGKILL');
+    expect(slot.consecutiveFailures).toBe(1);
+    // A clean exit removes the slot from the map. Re-insert a fresh
+    // slot to model "user reopened the project after a healthy
+    // shutdown" — its failure budget should be the default zero, not
+    // a continuation of the previous slot's counter.
+    handleExit(mgr, slot, 0, null);
+    const runtimes = (mgr as unknown as { runtimes: Map<string, unknown> }).runtimes;
+    expect(runtimes.has('proj-mixed')).toBe(false);
+    const freshSlot = insertRunningSlot(mgr, 'proj-mixed', 22225);
+    expect(freshSlot.consecutiveFailures).toBe(0);
+  });
+  it('a failure OUTSIDE the rolling window resets the counter to 1', () => {
+    const mgr = new WorkerRuntimeManager({ logger: SILENT });
+    const slot = insertRunningSlot(mgr, 'proj-window', 22226);
+    handleExit(mgr, slot, null, 'SIGKILL');
+    expect(slot.consecutiveFailures).toBe(1);
+    if (slot.restartTimer) { clearTimeout(slot.restartTimer); slot.restartTimer = null; }
+    // Backdate the last failure to outside the window (default 5min).
+    // Date.now() is stable enough inside one test run; we just shift
+    // the recorded timestamp into the past.
+    slot.lastFailureAt = Date.now() - (10 * 60 * 1000);
+    handleExit(mgr, slot, null, 'SIGKILL');
+    expect(slot.consecutiveFailures).toBe(1);
+    if (slot.restartTimer) { clearTimeout(slot.restartTimer); slot.restartTimer = null; }
+  });
+  it('exits triggered by stop() (status=stopping) do NOT count toward the failure budget', () => {
+    const mgr = new WorkerRuntimeManager({ logger: SILENT });
+    const slot = insertRunningSlot(mgr, 'proj-stop', 22227);
+    slot.status = 'stopping';
+    handleExit(mgr, slot, null, 'SIGTERM');
+    expect(slot.consecutiveFailures).toBe(0);
+    expect(slot.status).toBe('stopped');
+    // And the process group is NOT re-reaped — stop() already did that
+    // before signalling the parent.
+    expect(killCalls.filter((c) => c.target === -22227)).toHaveLength(0);
+  });
+});
+describe('WorkerRuntimeManager resetFailure', () => {
+  it('clears a failed slot so the next ensureRunning can re-spawn', () => {
+    const mgr = new WorkerRuntimeManager({ logger: SILENT });
+    const slot = insertRunningSlot(mgr, 'proj-reset', 33333);
+    for (let i = 0; i < 8; i++) handleExit(mgr, slot, null, 'SIGKILL');
+    expect(slot.status).toBe('failed');
+    const reset = mgr.resetFailure('proj-reset');
+    expect(reset).toBe(true);
+    const runtimes = (mgr as unknown as { runtimes: Map<string, unknown> }).runtimes;
+    expect(runtimes.has('proj-reset')).toBe(false);
+  });
+  it('returns false for a running slot — guard against accidental resets', () => {
+    const mgr = new WorkerRuntimeManager({ logger: SILENT });
+    insertRunningSlot(mgr, 'proj-running', 33334);
+    expect(mgr.resetFailure('proj-running')).toBe(false);
+    const runtimes = (mgr as unknown as { runtimes: Map<string, unknown> }).runtimes;
+    // Slot is preserved when resetFailure is a no-op.
+    expect(runtimes.has('proj-running')).toBe(true);
+  });
+  it('returns false for an unknown projectId', () => {
+    const mgr = new WorkerRuntimeManager({ logger: SILENT });
+    expect(mgr.resetFailure('does-not-exist')).toBe(false);
+  });
+});
+describe('WorkerRuntimeManager.ensureRunning refuses failed slots', () => {
+  it('throws a directive error pointing at resetFailure when the slot is parked', async () => {
+    const mgr = new WorkerRuntimeManager({ logger: SILENT });
+    const slot = insertRunningSlot(mgr, 'proj-locked', 44444);
+    for (let i = 0; i < 8; i++) handleExit(mgr, slot, null, 'SIGKILL');
+    // The fake spawn config isn't reached — ensureRunning guards on
+    // status before doStart — so we can pass an empty object.
+    await expect(
+      mgr.ensureRunning('proj-locked', {
+        cloudUrl: 'https://example.invalid',
+        apiKey: 'unused',
+      }),
+    ).rejects.toThrow(/Circuit breaker tripped|resetFailure/);
+  });
+});

package/src/lib/runtime-manager.ts CHANGED Viewed

@@ -54,12 +54,53 @@ const RUNTIME_IDLE_MS = 15 * 60 * 1000;
 const RESTART_BACKOFF_BASE_MS = 1_000;
 const RESTART_BACKOFF_MAX_MS = 60_000;
+/**
+ * Circuit breaker. After this many *consecutive* non-clean exits within
+ * {@link RESTART_FAILURE_WINDOW_MS}, the manager stops respawning and
+ * parks the slot in `'failed'`. Without this cap a project that the OS
+ * keeps OOM-killing (macOS jetsam SIGKILL is the canonical case) burns
+ * forever at ~1/minute, with every cycle re-spawning bun + vite +
+ * tsserver + pyright and (on posix) leaking the children of any
+ * incarnation the manager couldn't kill in its own process group.
+ *
+ * 8 across a 5-minute window means "we tried for at least
+ * (8 * BASE_BACKOFF capped at MAX) = ~7m of escalating backoff before
+ * we gave up", which is long enough to ride out a transient port-bind
+ * race or a one-shot dependency upgrade and short enough that an
+ * operator chasing runaway RSS on their laptop notices the loop
+ * stopping before the next memory-pressure cycle.
+ */
+const MAX_CONSECUTIVE_RESTARTS = 8;
+const RESTART_FAILURE_WINDOW_MS = 5 * 60 * 1000;
+/**
+ * If a runtime stays up at least this long after the /health-gated
+ * `'running'` transition, we treat it as "recovered" and reset the
+ * consecutive-failure counter. This is the contract that lets a
+ * project that crashed twice on cold start (e.g. waiting for the API
+ * port to release) but then ran healthily for ten minutes start over
+ * with a fresh budget the next time it hiccups.
+ */
+const STARTUP_GRACE_MS = 60_000;
 /** Health check poll interval while waiting for /health. */
 const HEALTH_POLL_MS = 500;
 /** Total timeout waiting for first /health success after spawn. */
 const HEALTH_BOOT_TIMEOUT_MS = 30_000;
-export type RuntimeStatus = 'starting' | 'running' | 'restarting' | 'stopping' | 'stopped' | 'error';
+export type RuntimeStatus =
+  | 'starting'
+  | 'running'
+  | 'restarting'
+  | 'stopping'
+  | 'stopped'
+  | 'error'
+  /**
+   * Terminal state: the circuit breaker tripped. The slot stays in the
+   * `runtimes` map (so `status(projectId)` keeps reporting it) but no
+   * more spawns will happen until {@link WorkerRuntimeManager.resetFailure}
+   * is called or the slot is explicitly `stop()`'d.
+   */
+  | 'failed';
 export interface RuntimeStatusInfo {
   projectId: string;
@@ -215,9 +256,27 @@ interface InternalRuntime {
   apiServerPort: number;
   status: RuntimeStatus;
   proc: ChildProcess | null;
+  /**
+   * PID of the most recent spawn, retained after `proc` is nulled in
+   * {@link WorkerRuntimeManager.handleExit}. On posix, the runtime is
+   * spawned as a process group leader (`detached: true`), so this is
+   * also the PGID — `process.kill(-pid, ...)` cascades to vite, the
+   * preview-manager's inner API server, tsserver and pyright that the
+   * runtime spawned, which otherwise survive a SIGKILL of the parent
+   * (jetsam OOM) and accumulate as orphans until app restart.
+   */
+  pid: number | null;
   startedAt: number;
   lastUsedAt: number;
   restarts: number;
+  /** Consecutive non-clean exits since the last healthy run. */
+  consecutiveFailures: number;
+  /** Timestamp (Date.now) of the most recent non-clean exit. Used to
+   *  detect "loop within the failure window" for the circuit breaker. */
+  lastFailureAt: number;
+  /** Timer that resets `consecutiveFailures` to 0 once a fresh run has
+   *  survived for {@link STARTUP_GRACE_MS}. */
+  graceTimer: ReturnType<typeof setTimeout> | null;
   restartTimer: ReturnType<typeof setTimeout> | null;
   idleTimer: ReturnType<typeof setTimeout> | null;
   lastError?: string;
@@ -469,6 +528,20 @@ export class WorkerRuntimeManager implements RuntimeResolver {
   async ensureRunning(projectId: string, config: ProjectSpawnConfig): Promise<RuntimeStatusInfo> {
     if (this.stopped) throw new Error('WorkerRuntimeManager is stopped');
+    // Refuse circuit-broken slots BEFORE auto-pull so we don't churn
+    // the network/disk on a project we already know we won't spawn.
+    // Surfacing the parked-state message lets the caller (tunnel
+    // proxy, desktop UI) render an actionable error instead of the
+    // generic auto-pull / spawn failure.
+    const failedExisting = this.runtimes.get(projectId);
+    if (failedExisting?.status === 'failed') {
+      throw new Error(
+        `[WorkerRuntimeManager] cannot ensureRunning(${projectId}): ` +
+        `${failedExisting.lastError ?? 'runtime is in failed state'}. ` +
+        `Call resetFailure(${projectId}) or stop(${projectId}) before retrying.`,
+      );
+    }
     // Apply auto-pull before any runtime spawn so the runtime's PROJECT_DIR
     // points at a fully-cloned workspace. Idempotent: subsequent calls hit
     // the `pulledProjects` short-circuit.
@@ -761,14 +834,46 @@ export class WorkerRuntimeManager implements RuntimeResolver {
     r.status = 'stopping';
     if (r.restartTimer) { clearTimeout(r.restartTimer); r.restartTimer = null; }
     if (r.idleTimer) { clearTimeout(r.idleTimer); r.idleTimer = null; }
+    if (r.graceTimer) { clearTimeout(r.graceTimer); r.graceTimer = null; }
     if (r.proc) {
+      // Send the requested signal to the whole process group first so
+      // children (vite, preview-manager's API server, LSPs) start their
+      // own graceful shutdown in parallel with the parent.
+      this.killProcessGroup(r, signal);
       try { r.proc.kill(signal); } catch { /* already gone */ }
       await this.waitForExit(r.proc, 5000);
+      // Belt-and-suspenders: if the grace window elapsed without a
+      // clean exit, `waitForExit` already SIGKILL'd the parent — chase
+      // the rest of the group too in case any child ignored SIGTERM.
+      this.killProcessGroup(r, 'SIGKILL');
     }
+    r.pid = null;
     this.releasePort(r.agentPort);
     this.runtimes.delete(projectId);
   }
+  /**
+   * Re-arm a runtime that the circuit breaker parked in `'failed'`.
+   * Drops the slot from the map so the next `ensureRunning(projectId, …)`
+   * call performs a fresh `doStart()` with a zeroed failure budget.
+   *
+   * Intended for the desktop's "reopen project" flow and for operators
+   * who fixed whatever was crashing the runtime (e.g. freed memory,
+   * deleted a corrupted workspace file) and want to retry without
+   * tearing down the whole worker. No-op if the project isn't in
+   * `'failed'`.
+   */
+  resetFailure(projectId: string): boolean {
+    const r = this.runtimes.get(projectId);
+    if (!r || r.status !== 'failed') return false;
+    if (r.restartTimer) { clearTimeout(r.restartTimer); r.restartTimer = null; }
+    if (r.idleTimer) { clearTimeout(r.idleTimer); r.idleTimer = null; }
+    if (r.graceTimer) { clearTimeout(r.graceTimer); r.graceTimer = null; }
+    this.runtimes.delete(projectId);
+    this.log.log(`[WorkerRuntimeManager] resetFailure: ${projectId} cleared, next ensureRunning will respawn`);
+    return true;
+  }
   async stopAll(signal: NodeJS.Signals = 'SIGTERM'): Promise<void> {
     this.stopped = true;
     // Stop watchers FIRST so their final flush has a chance to PUT before
@@ -801,9 +906,13 @@ export class WorkerRuntimeManager implements RuntimeResolver {
       apiServerPort: 0,
       status: 'starting',
       proc: null,
+      pid: null,
       startedAt: 0,
       lastUsedAt: Date.now(),
       restarts: 0,
+      consecutiveFailures: 0,
+      lastFailureAt: 0,
+      graceTimer: null,
       restartTimer: null,
       idleTimer: null,
       spawnConfig: config,
@@ -833,14 +942,35 @@ export class WorkerRuntimeManager implements RuntimeResolver {
         `via ${command} ${args.join(' ')} (port=${slot.agentPort}, source=${resolved.source})`,
     );
+    // Spawn the runtime as its own process group leader (posix only —
+    // Windows has no equivalent and Node's child_process docs warn that
+    // `detached: true` there gives you a separate console window, not a
+    // PGID). Mirrors what apps/desktop/src/local-server.ts already does
+    // for the outer API server so a single kill at teardown reaches the
+    // bun child + every subprocess it spawned (vite, the inner
+    // preview-manager API server, tsserver, pyright). Without this,
+    // jetsam SIGKILL of the bun parent leaves all of those orphans
+    // alive — they keep their listening sockets, the next respawn
+    // races into EADDRINUSE, and RSS climbs forever (see the storm
+    // pattern in main.log lines 5258–7093 where preview-manager has to
+    // walk the API port and `Force-killed leaked process` 10 times in
+    // a row).
+    const useProcessGroup = process.platform !== 'win32';
     const proc = spawn(command, args, {
       cwd,
       env,
-      detached: false,
+      detached: useProcessGroup,
       stdio: ['ignore', 'pipe', 'pipe'],
     });
+    if (useProcessGroup) {
+      // The detached child would otherwise keep the parent's event
+      // loop alive even after we've removed it from `runtimes` —
+      // matches the apps/desktop precedent at local-server.ts:412.
+      try { proc.unref(); } catch { /* unref is best-effort */ }
+    }
     slot.proc = proc;
+    slot.pid = proc.pid ?? null;
     slot.status = 'starting';
     slot.startedAt = Date.now();
@@ -870,11 +1000,17 @@ export class WorkerRuntimeManager implements RuntimeResolver {
       slot.status = 'running';
       slot.lastUsedAt = Date.now();
       this.armIdleTimer(slot);
+      this.armGraceTimer(slot);
       return slot;
     } catch (err: any) {
       slot.status = 'error';
       slot.lastError = err?.message ?? String(err);
-      try { proc.kill('SIGTERM'); } catch { /* nothing */ }
+      // The /health wait timed out (or the spawn itself failed). Tear
+      // down the whole process group rather than just the parent so we
+      // don't leave a half-booted preview-manager + vite running on
+      // the allocated ports.
+      this.killProcessGroup(slot, 'SIGTERM');
+      try { proc.kill('SIGTERM'); } catch { /* already gone */ }
       this.releasePort(slot.agentPort);
       slot.agentPort = 0;
       slot.apiServerPort = 0;
@@ -882,6 +1018,51 @@ export class WorkerRuntimeManager implements RuntimeResolver {
     }
   }
+  /**
+   * Kill every process in `slot.pid`'s process group. Best-effort:
+   * if the group is already gone (everyone exited cleanly), or the
+   * platform doesn't support PGID kills (Windows), this is a no-op.
+   *
+   * Why we use the recorded PID and not `slot.proc.pid`: by the time
+   * {@link handleExit} runs, `proc` has already fired its `'exit'`
+   * event and we've nulled it. The kernel keeps the process group
+   * intact until the *last* member of the group exits, so the PGID
+   * we captured at spawn is still valid for reaping the orphans even
+   * after the group leader is gone.
+   */
+  private killProcessGroup(slot: InternalRuntime, signal: NodeJS.Signals): void {
+    if (process.platform === 'win32') return;
+    if (!slot.pid) return;
+    try {
+      process.kill(-slot.pid, signal);
+    } catch {
+      // ESRCH (no such process group) is the happy path here — it
+      // means every child exited with their parent and the kernel
+      // already reaped the group. EPERM is the only other plausible
+      // case; swallow it because the caller has no recourse anyway.
+    }
+  }
+  /**
+   * Arm a timer that resets `consecutiveFailures` once a fresh run has
+   * survived for {@link STARTUP_GRACE_MS}. Re-armed on every successful
+   * /health transition; cleared on any non-clean exit so a crash
+   * inside the grace window counts toward the circuit breaker.
+   */
+  private armGraceTimer(slot: InternalRuntime): void {
+    if (slot.graceTimer) {
+      clearTimeout(slot.graceTimer);
+      slot.graceTimer = null;
+    }
+    slot.graceTimer = setTimeout(() => {
+      slot.graceTimer = null;
+      if (slot.consecutiveFailures > 0) {
+        slot.consecutiveFailures = 0;
+      }
+    }, STARTUP_GRACE_MS);
+    try { slot.graceTimer.unref?.(); } catch { /* unref is best-effort */ }
+  }
   private buildEnv(slot: InternalRuntime, runtimeBinPath: string): NodeJS.ProcessEnv {
     const cfg = slot.spawnConfig;
     const env: NodeJS.ProcessEnv = {
@@ -956,9 +1137,16 @@ export class WorkerRuntimeManager implements RuntimeResolver {
       `[WorkerRuntimeManager] runtime ${slot.projectId} exited (code=${code}, signal=${signal})`,
     );
     slot.proc = null;
+    if (slot.graceTimer) {
+      clearTimeout(slot.graceTimer);
+      slot.graceTimer = null;
+    }
     if (slot.status === 'stopping' || this.stopped) {
+      // We initiated the stop; the orphan reap was already done by
+      // stop()/stopAll(). Just clear bookkeeping.
       slot.status = 'stopped';
+      slot.pid = null;
       this.releasePort(slot.agentPort);
       slot.agentPort = 0;
       slot.apiServerPort = 0;
@@ -967,6 +1155,7 @@ export class WorkerRuntimeManager implements RuntimeResolver {
     if (exitedClean) {
       slot.status = 'stopped';
+      slot.pid = null;
       this.releasePort(slot.agentPort);
       slot.agentPort = 0;
       slot.apiServerPort = 0;
@@ -974,17 +1163,52 @@ export class WorkerRuntimeManager implements RuntimeResolver {
       return;
     }
+    // Non-clean exit. Two failure shapes we care about:
+    //   1. Parent died but children (vite, preview-manager API server,
+    //      LSPs) are still alive in the same process group. Without
+    //      reaping, the next doStart() races into EADDRINUSE on the
+    //      same agent port and starts the storm.
+    //   2. Repeated jetsam SIGKILL under memory pressure. Each cycle
+    //      respawns the full child tree and leaks more RSS. The
+    //      circuit breaker below stops that loop.
+    this.killProcessGroup(slot, 'SIGKILL');
+    slot.pid = null;
+    const now = Date.now();
+    const withinWindow = now - slot.lastFailureAt <= RESTART_FAILURE_WINDOW_MS;
+    slot.consecutiveFailures = withinWindow ? slot.consecutiveFailures + 1 : 1;
+    slot.lastFailureAt = now;
     slot.restarts += 1;
     slot.lastError = `exited code=${code} signal=${signal}`;
+    if (slot.consecutiveFailures >= MAX_CONSECUTIVE_RESTARTS) {
+      slot.status = 'failed';
+      slot.lastError =
+        `Circuit breaker tripped: ${slot.consecutiveFailures} consecutive non-clean exits ` +
+        `within ${Math.round(RESTART_FAILURE_WINDOW_MS / 1000)}s (last: code=${code} signal=${signal}). ` +
+        `Most recent on macOS is jetsam OOM (signal=SIGKILL with code=null); ` +
+        `the previous incarnation's vite/tsserver/preview-manager children were reaped to ` +
+        `prevent further RSS growth. Stop, fix the workspace, and call resetFailure(projectId) ` +
+        `(or stop(projectId)) to allow another spawn attempt.`;
+      this.releasePort(slot.agentPort);
+      slot.agentPort = 0;
+      slot.apiServerPort = 0;
+      if (slot.restartTimer) { clearTimeout(slot.restartTimer); slot.restartTimer = null; }
+      if (slot.idleTimer) { clearTimeout(slot.idleTimer); slot.idleTimer = null; }
+      this.log.error(`[WorkerRuntimeManager] ${slot.lastError}`);
+      return;
+    }
     const delay = this.restartBackoffMs(slot.restarts);
     slot.status = 'restarting';
     this.log.warn(
       `[WorkerRuntimeManager] restarting ${slot.projectId} in ${Math.round(delay / 1000)}s ` +
-        `(restart #${slot.restarts})`,
+        `(restart #${slot.restarts}, consecutive failures ${slot.consecutiveFailures}/${MAX_CONSECUTIVE_RESTARTS})`,
     );
     if (slot.restartTimer) clearTimeout(slot.restartTimer);
     slot.restartTimer = setTimeout(() => {
       slot.restartTimer = null;
+      if (slot.status === 'failed' || this.stopped) return;
       slot.startPromise = this.doStart(slot).then((r) => {
         slot.startPromise = null;
         return r;
@@ -994,6 +1218,7 @@ export class WorkerRuntimeManager implements RuntimeResolver {
         return slot;
       });
     }, delay);
+    try { slot.restartTimer.unref?.(); } catch { /* unref is best-effort */ }
   }
   private restartBackoffMs(restarts: number): number {