@shogo-ai/worker 1.7.14 → 1.8.3

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
package/package.json CHANGED
@@ -1,6 +1,6 @@
1
1
  {
2
2
  "name": "@shogo-ai/worker",
3
- "version": "1.7.14",
3
+ "version": "1.8.3",
4
4
  "description": "Shogo Cloud Agent Worker — run Shogo agents on your own machine (laptop, devbox, CI).",
5
5
  "license": "MIT",
6
6
  "author": "Shogo Technologies, Inc.",
@@ -42,7 +42,7 @@
42
42
  "prepublishOnly": "bun run typecheck"
43
43
  },
44
44
  "dependencies": {
45
- "@shogo-ai/sdk": "workspace:*",
45
+ "@shogo-ai/sdk": "^1.4.1",
46
46
  "commander": "^12.1.0",
47
47
  "picocolors": "^1.1.1"
48
48
  },
@@ -0,0 +1,310 @@
1
+ // SPDX-License-Identifier: MIT
2
+ // Copyright (C) 2026 Shogo Technologies, Inc.
3
+ /**
4
+ * Coverage for the {@link WorkerRuntimeManager} circuit breaker and
5
+ * process-group reaping introduced after a single user's project ran
6
+ * away with 54 consecutive `signal=SIGKILL` jetsam exits, each cycle
7
+ * leaking the agent-runtime's vite / preview-manager / LSP children
8
+ * (see main.log lines 5258–7093 for the prod reproduction).
9
+ *
10
+ * The unfixed code:
11
+ * - Spawned the runtime with `detached: false` and only ever called
12
+ * `proc.kill(...)` on the parent, so the kernel left the child
13
+ * subprocess tree alive after each jetsam SIGKILL of the parent.
14
+ * Symptom: `Force-killed leaked process N on port 37273` 10 times
15
+ * in a row, EADDRINUSE on the next spawn.
16
+ * - Had no max-restart cap; `handleExit` incremented forever, so
17
+ * once jetsam fired the first time the worker was locked into a
18
+ * ~1/minute respawn loop until the user quit the app. Symptom:
19
+ * `restart #54` in the same project's log.
20
+ *
21
+ * This file pins three contracts:
22
+ *
23
+ * 1. Every non-clean exit reaps the runtime's process group via the
24
+ * recorded PID (the orphan-sweep that wasn't happening before).
25
+ * 2. After {@link MAX_CONSECUTIVE_RESTARTS} consecutive non-clean
26
+ * exits within {@link RESTART_FAILURE_WINDOW_MS}, the slot
27
+ * transitions to `'failed'` and the manager stops scheduling
28
+ * restarts. `ensureRunning(...)` then refuses to silently
29
+ * respawn until the operator calls `resetFailure(...)`.
30
+ * 3. A clean exit (code=0, signal=null) does NOT increment the
31
+ * failure budget — bounded chat sessions that exit normally
32
+ * after their work is done should leave the slot in the same
33
+ * "healthy" state they found it in.
34
+ */
35
+ import { afterEach, beforeEach, describe, expect, it } from 'bun:test';
36
+ import { WorkerRuntimeManager } from '../runtime-manager.ts';
37
+
38
+ interface FakeProc {
39
+ exitCode: number | null;
40
+ signalCode: NodeJS.Signals | null;
41
+ killed: boolean;
42
+ pid: number;
43
+ kill: (signal?: NodeJS.Signals | number) => boolean;
44
+ once: (event: string, cb: (...args: unknown[]) => void) => FakeProc;
45
+ on: (event: string, cb: (...args: unknown[]) => void) => FakeProc;
46
+ }
47
+
48
+ function makeFakeProc(pid = 99999): FakeProc {
49
+ const proc: FakeProc = {
50
+ exitCode: null,
51
+ signalCode: null,
52
+ killed: false,
53
+ pid,
54
+ kill(_signal?: NodeJS.Signals | number) {
55
+ proc.killed = true;
56
+ return true;
57
+ },
58
+ once(_event, _cb) {
59
+ return proc;
60
+ },
61
+ on(_event, _cb) {
62
+ return proc;
63
+ },
64
+ };
65
+ return proc;
66
+ }
67
+
68
+ function insertRunningSlot(mgr: WorkerRuntimeManager, projectId: string, pid = 99999) {
69
+ const proc = makeFakeProc(pid);
70
+ const slot = {
71
+ projectId,
72
+ agentPort: 0,
73
+ apiServerPort: 0,
74
+ status: 'running' as const,
75
+ proc,
76
+ pid,
77
+ startedAt: Date.now(),
78
+ lastUsedAt: Date.now(),
79
+ restarts: 0,
80
+ consecutiveFailures: 0,
81
+ lastFailureAt: 0,
82
+ graceTimer: null,
83
+ restartTimer: null,
84
+ idleTimer: null,
85
+ spawnConfig: {} as never,
86
+ startPromise: null,
87
+ };
88
+ (mgr as unknown as { runtimes: Map<string, typeof slot> }).runtimes.set(projectId, slot);
89
+ return slot;
90
+ }
91
+
92
+ function handleExit(
93
+ mgr: WorkerRuntimeManager,
94
+ slot: ReturnType<typeof insertRunningSlot>,
95
+ code: number | null,
96
+ signal: NodeJS.Signals | null,
97
+ ): void {
98
+ (mgr as unknown as {
99
+ handleExit: (s: typeof slot, c: number | null, sg: NodeJS.Signals | null) => void;
100
+ }).handleExit(slot, code, signal);
101
+ }
102
+
103
+ const SILENT = { log: () => {}, warn: () => {}, error: () => {} } as const;
104
+
105
+ // Capture process.kill calls so we can pin that handleExit reaps the
106
+ // process group (negative pid argument) instead of just the parent.
107
+ type KillCall = { target: number; signal: NodeJS.Signals | number | undefined };
108
+ let killCalls: KillCall[] = [];
109
+ const realProcessKill = process.kill;
110
+
111
+ beforeEach(() => {
112
+ killCalls = [];
113
+ // Replace process.kill with a spy that ALSO doesn't actually signal
114
+ // anything (the fake pid 99999 might collide with a real process on
115
+ // the CI host).
116
+ (process as unknown as { kill: typeof process.kill }).kill = ((
117
+ pid: number,
118
+ signal?: NodeJS.Signals | number,
119
+ ) => {
120
+ killCalls.push({ target: pid, signal });
121
+ return true;
122
+ }) as typeof process.kill;
123
+ });
124
+
125
+ afterEach(() => {
126
+ (process as unknown as { kill: typeof process.kill }).kill = realProcessKill;
127
+ });
128
+
129
+ describe('WorkerRuntimeManager process-group reaping on non-clean exit', () => {
130
+ it('non-clean exit fires SIGKILL at the negative pid (the process group)', () => {
131
+ const mgr = new WorkerRuntimeManager({ logger: SILENT });
132
+ const slot = insertRunningSlot(mgr, 'proj-reap', 12345);
133
+
134
+ handleExit(mgr, slot, null, 'SIGKILL');
135
+
136
+ const pgKills = killCalls.filter((c) => c.target === -12345);
137
+ if (process.platform === 'win32') {
138
+ // killProcessGroup is a no-op on Windows; the inner preview-manager
139
+ // owns its own port-scoped reaper there.
140
+ expect(pgKills).toHaveLength(0);
141
+ } else {
142
+ expect(pgKills).toHaveLength(1);
143
+ expect(pgKills[0]!.signal).toBe('SIGKILL');
144
+ }
145
+ });
146
+
147
+ it('clean exit (code=0, signal=null) does NOT reap the process group', () => {
148
+ const mgr = new WorkerRuntimeManager({ logger: SILENT });
149
+ const slot = insertRunningSlot(mgr, 'proj-clean', 12346);
150
+
151
+ handleExit(mgr, slot, 0, null);
152
+
153
+ expect(killCalls.filter((c) => c.target === -12346)).toHaveLength(0);
154
+ // Slot is removed from the map on clean exit.
155
+ const runtimes = (mgr as unknown as { runtimes: Map<string, unknown> }).runtimes;
156
+ expect(runtimes.has('proj-clean')).toBe(false);
157
+ });
158
+
159
+ it('handleExit clears the recorded pid so a follow-up call is a no-op', () => {
160
+ const mgr = new WorkerRuntimeManager({ logger: SILENT });
161
+ const slot = insertRunningSlot(mgr, 'proj-pid-clear', 12347);
162
+
163
+ handleExit(mgr, slot, null, 'SIGKILL');
164
+ expect(slot.pid).toBeNull();
165
+
166
+ killCalls = [];
167
+ handleExit(mgr, slot, null, 'SIGKILL');
168
+ // No pid recorded → no further -pid kills should be issued.
169
+ expect(killCalls.filter((c) => c.target === -12347)).toHaveLength(0);
170
+ });
171
+ });
172
+
173
+ describe('WorkerRuntimeManager circuit breaker on consecutive failures', () => {
174
+ it('parks the slot in failed after MAX_CONSECUTIVE_RESTARTS consecutive jetsam SIGKILLs', () => {
175
+ const mgr = new WorkerRuntimeManager({ logger: SILENT });
176
+ const slot = insertRunningSlot(mgr, 'proj-jetsam', 22222);
177
+
178
+ // 8 is the constant in runtime-manager.ts; if it changes the
179
+ // assertion below has to follow. Hard-coding the expected count
180
+ // here surfaces the contract change at review time.
181
+ for (let i = 0; i < 8; i++) {
182
+ handleExit(mgr, slot, null, 'SIGKILL');
183
+ }
184
+
185
+ expect(slot.status).toBe('failed');
186
+ expect(slot.consecutiveFailures).toBe(8);
187
+ expect(slot.restartTimer).toBeNull();
188
+ expect(slot.lastError).toMatch(/Circuit breaker tripped/);
189
+ // The port is released so a future resetFailure() can re-allocate
190
+ // from a clean state.
191
+ expect(slot.agentPort).toBe(0);
192
+ expect(slot.apiServerPort).toBe(0);
193
+ });
194
+
195
+ it('still schedules a restart on the Nth non-clean exit when N < MAX', () => {
196
+ const mgr = new WorkerRuntimeManager({ logger: SILENT });
197
+ const slot = insertRunningSlot(mgr, 'proj-retry', 22223);
198
+
199
+ for (let i = 0; i < 3; i++) {
200
+ handleExit(mgr, slot, null, 'SIGKILL');
201
+ }
202
+
203
+ expect(slot.status).toBe('restarting');
204
+ expect(slot.consecutiveFailures).toBe(3);
205
+ expect(slot.restartTimer).not.toBeNull();
206
+ // Clean up the scheduled timer so the test runner exits.
207
+ if (slot.restartTimer) {
208
+ clearTimeout(slot.restartTimer);
209
+ slot.restartTimer = null;
210
+ }
211
+ });
212
+
213
+ it('clean exit between failures does NOT increment the failure budget', () => {
214
+ const mgr = new WorkerRuntimeManager({ logger: SILENT });
215
+ const slot = insertRunningSlot(mgr, 'proj-mixed', 22224);
216
+
217
+ handleExit(mgr, slot, null, 'SIGKILL');
218
+ expect(slot.consecutiveFailures).toBe(1);
219
+
220
+ // A clean exit removes the slot from the map. Re-insert a fresh
221
+ // slot to model "user reopened the project after a healthy
222
+ // shutdown" — its failure budget should be the default zero, not
223
+ // a continuation of the previous slot's counter.
224
+ handleExit(mgr, slot, 0, null);
225
+ const runtimes = (mgr as unknown as { runtimes: Map<string, unknown> }).runtimes;
226
+ expect(runtimes.has('proj-mixed')).toBe(false);
227
+
228
+ const freshSlot = insertRunningSlot(mgr, 'proj-mixed', 22225);
229
+ expect(freshSlot.consecutiveFailures).toBe(0);
230
+ });
231
+
232
+ it('a failure OUTSIDE the rolling window resets the counter to 1', () => {
233
+ const mgr = new WorkerRuntimeManager({ logger: SILENT });
234
+ const slot = insertRunningSlot(mgr, 'proj-window', 22226);
235
+
236
+ handleExit(mgr, slot, null, 'SIGKILL');
237
+ expect(slot.consecutiveFailures).toBe(1);
238
+ if (slot.restartTimer) { clearTimeout(slot.restartTimer); slot.restartTimer = null; }
239
+
240
+ // Backdate the last failure to outside the window (default 5min).
241
+ // Date.now() is stable enough inside one test run; we just shift
242
+ // the recorded timestamp into the past.
243
+ slot.lastFailureAt = Date.now() - (10 * 60 * 1000);
244
+
245
+ handleExit(mgr, slot, null, 'SIGKILL');
246
+ expect(slot.consecutiveFailures).toBe(1);
247
+ if (slot.restartTimer) { clearTimeout(slot.restartTimer); slot.restartTimer = null; }
248
+ });
249
+
250
+ it('exits triggered by stop() (status=stopping) do NOT count toward the failure budget', () => {
251
+ const mgr = new WorkerRuntimeManager({ logger: SILENT });
252
+ const slot = insertRunningSlot(mgr, 'proj-stop', 22227);
253
+ slot.status = 'stopping';
254
+
255
+ handleExit(mgr, slot, null, 'SIGTERM');
256
+
257
+ expect(slot.consecutiveFailures).toBe(0);
258
+ expect(slot.status).toBe('stopped');
259
+ // And the process group is NOT re-reaped — stop() already did that
260
+ // before signalling the parent.
261
+ expect(killCalls.filter((c) => c.target === -22227)).toHaveLength(0);
262
+ });
263
+ });
264
+
265
+ describe('WorkerRuntimeManager resetFailure', () => {
266
+ it('clears a failed slot so the next ensureRunning can re-spawn', () => {
267
+ const mgr = new WorkerRuntimeManager({ logger: SILENT });
268
+ const slot = insertRunningSlot(mgr, 'proj-reset', 33333);
269
+ for (let i = 0; i < 8; i++) handleExit(mgr, slot, null, 'SIGKILL');
270
+ expect(slot.status).toBe('failed');
271
+
272
+ const reset = mgr.resetFailure('proj-reset');
273
+ expect(reset).toBe(true);
274
+
275
+ const runtimes = (mgr as unknown as { runtimes: Map<string, unknown> }).runtimes;
276
+ expect(runtimes.has('proj-reset')).toBe(false);
277
+ });
278
+
279
+ it('returns false for a running slot — guard against accidental resets', () => {
280
+ const mgr = new WorkerRuntimeManager({ logger: SILENT });
281
+ insertRunningSlot(mgr, 'proj-running', 33334);
282
+
283
+ expect(mgr.resetFailure('proj-running')).toBe(false);
284
+ const runtimes = (mgr as unknown as { runtimes: Map<string, unknown> }).runtimes;
285
+ // Slot is preserved when resetFailure is a no-op.
286
+ expect(runtimes.has('proj-running')).toBe(true);
287
+ });
288
+
289
+ it('returns false for an unknown projectId', () => {
290
+ const mgr = new WorkerRuntimeManager({ logger: SILENT });
291
+ expect(mgr.resetFailure('does-not-exist')).toBe(false);
292
+ });
293
+ });
294
+
295
+ describe('WorkerRuntimeManager.ensureRunning refuses failed slots', () => {
296
+ it('throws a directive error pointing at resetFailure when the slot is parked', async () => {
297
+ const mgr = new WorkerRuntimeManager({ logger: SILENT });
298
+ const slot = insertRunningSlot(mgr, 'proj-locked', 44444);
299
+ for (let i = 0; i < 8; i++) handleExit(mgr, slot, null, 'SIGKILL');
300
+
301
+ // The spawn config is never used here: ensureRunning rejects a slot in
302
+ // 'failed' before auto-pull or doStart, so placeholder values suffice.
303
+ await expect(
304
+ mgr.ensureRunning('proj-locked', {
305
+ cloudUrl: 'https://example.invalid',
306
+ apiKey: 'unused',
307
+ }),
308
+ ).rejects.toThrow(/Circuit breaker tripped|resetFailure/);
309
+ });
310
+ });
@@ -54,12 +54,53 @@ const RUNTIME_IDLE_MS = 15 * 60 * 1000;
54
54
  const RESTART_BACKOFF_BASE_MS = 1_000;
55
55
  const RESTART_BACKOFF_MAX_MS = 60_000;
56
56
 
57
+ /**
58
+ * Circuit breaker. After this many *consecutive* non-clean exits — each
59
+ * at most {@link RESTART_FAILURE_WINDOW_MS} after the previous one — the
60
+ * manager stops respawning and parks the slot in `'failed'`. Without
61
+ * this cap a project that the OS keeps OOM-killing (macOS jetsam SIGKILL
62
+ * is the canonical case) burns forever at ~1/minute, with every cycle
63
+ * re-spawning bun + vite + tsserver + pyright and (on posix) leaking the
64
+ * children of any incarnation the manager couldn't kill in its process group.
65
+ *
66
+ * 8 with a 5-minute gap limit means "we tried for at least
67
+ * (8 * BASE_BACKOFF capped at MAX) = ~7m of escalating backoff before
68
+ * we gave up", which is long enough to ride out a transient port-bind
69
+ * race or a one-shot dependency upgrade and short enough that an
70
+ * operator chasing runaway RSS on their laptop notices the loop
71
+ * stopping before the next memory-pressure cycle.
72
+ */
73
+ const MAX_CONSECUTIVE_RESTARTS = 8;
74
+ const RESTART_FAILURE_WINDOW_MS = 5 * 60 * 1000; // max gap between two failures that still counts as consecutive
75
+ /**
76
+ * If a runtime stays up at least this long after the /health-gated
77
+ * `'running'` transition, we treat it as "recovered" and reset the
78
+ * consecutive-failure counter. This is the contract that lets a
79
+ * project that crashed twice on cold start (e.g. waiting for the API
80
+ * port to release) but then ran healthily for ten minutes start over
81
+ * with a fresh budget the next time it hiccups.
82
+ */
83
+ const STARTUP_GRACE_MS = 60_000;
84
+
57
85
  /** Health check poll interval while waiting for /health. */
58
86
  const HEALTH_POLL_MS = 500;
59
87
  /** Total timeout waiting for first /health success after spawn. */
60
88
  const HEALTH_BOOT_TIMEOUT_MS = 30_000;
61
89
 
62
- export type RuntimeStatus = 'starting' | 'running' | 'restarting' | 'stopping' | 'stopped' | 'error';
90
+ export type RuntimeStatus =
91
+ | 'starting'
92
+ | 'running'
93
+ | 'restarting'
94
+ | 'stopping'
95
+ | 'stopped'
96
+ | 'error'
97
+ /**
98
+ * Terminal state: the circuit breaker tripped. The slot stays in the
99
+ * `runtimes` map (so `status(projectId)` keeps reporting it) but no
100
+ * more spawns will happen until {@link WorkerRuntimeManager.resetFailure}
101
+ * is called or the slot is explicitly `stop()`'d.
102
+ */
103
+ | 'failed';
63
104
 
64
105
  export interface RuntimeStatusInfo {
65
106
  projectId: string;
@@ -215,9 +256,27 @@ interface InternalRuntime {
215
256
  apiServerPort: number;
216
257
  status: RuntimeStatus;
217
258
  proc: ChildProcess | null;
259
+ /**
260
+ * PID of the most recent spawn, retained after `proc` is nulled in
261
+ * {@link WorkerRuntimeManager.handleExit}. On posix, the runtime is
262
+ * spawned as a process group leader (`detached: true`), so this is
263
+ * also the PGID — `process.kill(-pid, ...)` cascades to vite, the
264
+ * preview-manager's inner API server, tsserver and pyright that the
265
+ * runtime spawned, which otherwise survive a SIGKILL of the parent
266
+ * (jetsam OOM) and accumulate as orphans until app restart.
267
+ */
268
+ pid: number | null;
218
269
  startedAt: number;
219
270
  lastUsedAt: number;
220
271
  restarts: number;
272
+ /** Consecutive non-clean exits since the last healthy run. */
273
+ consecutiveFailures: number;
274
+ /** Timestamp (Date.now) of the most recent non-clean exit. Used to
275
+ * detect "loop within the failure window" for the circuit breaker. */
276
+ lastFailureAt: number;
277
+ /** Timer that resets `consecutiveFailures` to 0 once a fresh run has
278
+ * survived for {@link STARTUP_GRACE_MS}. */
279
+ graceTimer: ReturnType<typeof setTimeout> | null;
221
280
  restartTimer: ReturnType<typeof setTimeout> | null;
222
281
  idleTimer: ReturnType<typeof setTimeout> | null;
223
282
  lastError?: string;
@@ -469,6 +528,20 @@ export class WorkerRuntimeManager implements RuntimeResolver {
469
528
  async ensureRunning(projectId: string, config: ProjectSpawnConfig): Promise<RuntimeStatusInfo> {
470
529
  if (this.stopped) throw new Error('WorkerRuntimeManager is stopped');
471
530
 
531
+ // Refuse circuit-broken slots BEFORE auto-pull so we don't churn
532
+ // the network/disk on a project we already know we won't spawn.
533
+ // Surfacing the parked-state message lets the caller (tunnel
534
+ // proxy, desktop UI) render an actionable error instead of the
535
+ // generic auto-pull / spawn failure.
536
+ const failedExisting = this.runtimes.get(projectId);
537
+ if (failedExisting?.status === 'failed') {
538
+ throw new Error(
539
+ `[WorkerRuntimeManager] cannot ensureRunning(${projectId}): ` +
540
+ `${failedExisting.lastError ?? 'runtime is in failed state'}. ` +
541
+ `Call resetFailure(${projectId}) or stop(${projectId}) before retrying.`,
542
+ );
543
+ }
544
+
472
545
  // Apply auto-pull before any runtime spawn so the runtime's PROJECT_DIR
473
546
  // points at a fully-cloned workspace. Idempotent: subsequent calls hit
474
547
  // the `pulledProjects` short-circuit.
@@ -761,14 +834,46 @@ export class WorkerRuntimeManager implements RuntimeResolver {
761
834
  r.status = 'stopping';
762
835
  if (r.restartTimer) { clearTimeout(r.restartTimer); r.restartTimer = null; }
763
836
  if (r.idleTimer) { clearTimeout(r.idleTimer); r.idleTimer = null; }
837
+ if (r.graceTimer) { clearTimeout(r.graceTimer); r.graceTimer = null; }
764
838
  if (r.proc) {
839
+ // Send the requested signal to the whole process group first so
840
+ // children (vite, preview-manager's API server, LSPs) start their
841
+ // own graceful shutdown in parallel with the parent.
842
+ this.killProcessGroup(r, signal);
765
843
  try { r.proc.kill(signal); } catch { /* already gone */ }
766
844
  await this.waitForExit(r.proc, 5000);
845
+ // Belt-and-suspenders: if the grace window elapsed without a
846
+ // clean exit, `waitForExit` already SIGKILL'd the parent — chase
847
+ // the rest of the group too in case any child ignored SIGTERM.
848
+ this.killProcessGroup(r, 'SIGKILL');
767
849
  }
850
+ r.pid = null;
768
851
  this.releasePort(r.agentPort);
769
852
  this.runtimes.delete(projectId);
770
853
  }
771
854
 
855
+ /**
856
+ * Re-arm a runtime that the circuit breaker parked in `'failed'`.
857
+ * Drops the slot from the map so the next `ensureRunning(projectId, …)`
858
+ * call performs a fresh `doStart()` with a zeroed failure budget.
859
+ *
860
+ * Intended for the desktop's "reopen project" flow and for operators
861
+ * who fixed whatever was crashing the runtime (e.g. freed memory,
862
+ * deleted a corrupted workspace file) and want to retry without
863
+ * tearing down the whole worker. Returns `true` if a `'failed'` slot was
864
+ * dropped; for an unknown id or any other status it is a no-op returning `false`.
865
+ */
866
+ resetFailure(projectId: string): boolean {
867
+ const r = this.runtimes.get(projectId);
868
+ if (!r || r.status !== 'failed') return false;
869
+ if (r.restartTimer) { clearTimeout(r.restartTimer); r.restartTimer = null; }
870
+ if (r.idleTimer) { clearTimeout(r.idleTimer); r.idleTimer = null; }
871
+ if (r.graceTimer) { clearTimeout(r.graceTimer); r.graceTimer = null; }
872
+ this.runtimes.delete(projectId);
873
+ this.log.log(`[WorkerRuntimeManager] resetFailure: ${projectId} cleared, next ensureRunning will respawn`);
874
+ return true;
875
+ }
876
+
772
877
  async stopAll(signal: NodeJS.Signals = 'SIGTERM'): Promise<void> {
773
878
  this.stopped = true;
774
879
  // Stop watchers FIRST so their final flush has a chance to PUT before
@@ -801,9 +906,13 @@ export class WorkerRuntimeManager implements RuntimeResolver {
801
906
  apiServerPort: 0,
802
907
  status: 'starting',
803
908
  proc: null,
909
+ pid: null,
804
910
  startedAt: 0,
805
911
  lastUsedAt: Date.now(),
806
912
  restarts: 0,
913
+ consecutiveFailures: 0,
914
+ lastFailureAt: 0,
915
+ graceTimer: null,
807
916
  restartTimer: null,
808
917
  idleTimer: null,
809
918
  spawnConfig: config,
@@ -833,14 +942,35 @@ export class WorkerRuntimeManager implements RuntimeResolver {
833
942
  `via ${command} ${args.join(' ')} (port=${slot.agentPort}, source=${resolved.source})`,
834
943
  );
835
944
 
945
+ // Spawn the runtime as its own process group leader (posix only —
946
+ // Windows has no equivalent and Node's child_process docs warn that
947
+ // `detached: true` there gives you a separate console window, not a
948
+ // PGID). Mirrors what apps/desktop/src/local-server.ts already does
949
+ // for the outer API server so a single kill at teardown reaches the
950
+ // bun child + every subprocess it spawned (vite, the inner
951
+ // preview-manager API server, tsserver, pyright). Without this,
952
+ // jetsam SIGKILL of the bun parent leaves all of those orphans
953
+ // alive — they keep their listening sockets, the next respawn
954
+ // races into EADDRINUSE, and RSS climbs forever (see the storm
955
+ // pattern in main.log lines 5258–7093 where preview-manager has to
956
+ // walk the API port and `Force-killed leaked process` 10 times in
957
+ // a row).
958
+ const useProcessGroup = process.platform !== 'win32';
836
959
  const proc = spawn(command, args, {
837
960
  cwd,
838
961
  env,
839
- detached: false,
962
+ detached: useProcessGroup,
840
963
  stdio: ['ignore', 'pipe', 'pipe'],
841
964
  });
965
+ if (useProcessGroup) {
966
+ // The detached child would otherwise keep the parent's event
967
+ // loop alive even after we've removed it from `runtimes` —
968
+ // matches the apps/desktop precedent at local-server.ts:412.
969
+ try { proc.unref(); } catch { /* unref is best-effort */ }
970
+ }
842
971
 
843
972
  slot.proc = proc;
973
+ slot.pid = proc.pid ?? null;
844
974
  slot.status = 'starting';
845
975
  slot.startedAt = Date.now();
846
976
 
@@ -870,11 +1000,17 @@ export class WorkerRuntimeManager implements RuntimeResolver {
870
1000
  slot.status = 'running';
871
1001
  slot.lastUsedAt = Date.now();
872
1002
  this.armIdleTimer(slot);
1003
+ this.armGraceTimer(slot);
873
1004
  return slot;
874
1005
  } catch (err: any) {
875
1006
  slot.status = 'error';
876
1007
  slot.lastError = err?.message ?? String(err);
877
- try { proc.kill('SIGTERM'); } catch { /* nothing */ }
1008
+ // The /health wait timed out (or the spawn itself failed). Tear
1009
+ // down the whole process group rather than just the parent so we
1010
+ // don't leave a half-booted preview-manager + vite running on
1011
+ // the allocated ports.
1012
+ this.killProcessGroup(slot, 'SIGTERM');
1013
+ try { proc.kill('SIGTERM'); } catch { /* already gone */ }
878
1014
  this.releasePort(slot.agentPort);
879
1015
  slot.agentPort = 0;
880
1016
  slot.apiServerPort = 0;
@@ -882,6 +1018,51 @@ export class WorkerRuntimeManager implements RuntimeResolver {
882
1018
  }
883
1019
  }
884
1020
 
1021
+ /**
1022
+ * Send `signal` to every process in `slot.pid`'s process group. Best-effort:
1023
+ * if the group is already gone (everyone exited cleanly), or the
1024
+ * platform doesn't support PGID kills (Windows), this is a no-op.
1025
+ *
1026
+ * Why we use the recorded PID and not `slot.proc.pid`: by the time
1027
+ * {@link handleExit} runs, `proc` has already fired its `'exit'`
1028
+ * event and we've nulled it. The kernel keeps the process group
1029
+ * intact until the *last* member of the group exits, so the PGID
1030
+ * we captured at spawn is still valid for reaping the orphans even
1031
+ * after the group leader is gone.
1032
+ */
1033
+ private killProcessGroup(slot: InternalRuntime, signal: NodeJS.Signals): void {
1034
+ if (process.platform === 'win32') return;
1035
+ if (!slot.pid) return; // NOTE(review): handleExit nulls slot.pid when status is 'stopping', so stop()'s follow-up group SIGKILL may return here — confirm intended
1036
+ try {
1037
+ process.kill(-slot.pid, signal); // negative pid addresses the whole process group
1038
+ } catch {
1039
+ // ESRCH (no such process group) is the happy path here — it
1040
+ // means every child exited with their parent and the kernel
1041
+ // already reaped the group. EPERM is the only other plausible
1042
+ // case; swallow it because the caller has no recourse anyway.
1043
+ }
1044
+ }
1045
+
1046
+ /**
1047
+ * Arm a timer that resets `consecutiveFailures` once a fresh run has
1048
+ * survived for {@link STARTUP_GRACE_MS}. Re-armed on every successful
1049
+ * /health transition; cleared on every exit (clean or not), so a crash
1050
+ * inside the grace window still counts toward the circuit breaker.
1051
+ */
1052
+ private armGraceTimer(slot: InternalRuntime): void {
1053
+ if (slot.graceTimer) { // replace any timer left over from a previous arm
1054
+ clearTimeout(slot.graceTimer);
1055
+ slot.graceTimer = null;
1056
+ }
1057
+ slot.graceTimer = setTimeout(() => {
1058
+ slot.graceTimer = null;
1059
+ if (slot.consecutiveFailures > 0) {
1060
+ slot.consecutiveFailures = 0;
1061
+ }
1062
+ }, STARTUP_GRACE_MS);
1063
+ try { slot.graceTimer.unref?.(); } catch { /* unref is best-effort */ }
1064
+ }
1065
+
885
1066
  private buildEnv(slot: InternalRuntime, runtimeBinPath: string): NodeJS.ProcessEnv {
886
1067
  const cfg = slot.spawnConfig;
887
1068
  const env: NodeJS.ProcessEnv = {
@@ -956,9 +1137,16 @@ export class WorkerRuntimeManager implements RuntimeResolver {
956
1137
  `[WorkerRuntimeManager] runtime ${slot.projectId} exited (code=${code}, signal=${signal})`,
957
1138
  );
958
1139
  slot.proc = null;
1140
+ if (slot.graceTimer) {
1141
+ clearTimeout(slot.graceTimer);
1142
+ slot.graceTimer = null;
1143
+ }
959
1144
 
960
1145
  if (slot.status === 'stopping' || this.stopped) {
1146
+ // We initiated the stop; the orphan reap was already done by
1147
+ // stop()/stopAll(). Just clear bookkeeping.
961
1148
  slot.status = 'stopped';
1149
+ slot.pid = null;
962
1150
  this.releasePort(slot.agentPort);
963
1151
  slot.agentPort = 0;
964
1152
  slot.apiServerPort = 0;
@@ -967,6 +1155,7 @@ export class WorkerRuntimeManager implements RuntimeResolver {
967
1155
 
968
1156
  if (exitedClean) {
969
1157
  slot.status = 'stopped';
1158
+ slot.pid = null;
970
1159
  this.releasePort(slot.agentPort);
971
1160
  slot.agentPort = 0;
972
1161
  slot.apiServerPort = 0;
@@ -974,17 +1163,52 @@ export class WorkerRuntimeManager implements RuntimeResolver {
974
1163
  return;
975
1164
  }
976
1165
 
1166
+ // Non-clean exit. Two failure shapes we care about:
1167
+ // 1. Parent died but children (vite, preview-manager API server,
1168
+ // LSPs) are still alive in the same process group. Without
1169
+ // reaping, the next doStart() races into EADDRINUSE on the
1170
+ // same agent port and starts the storm.
1171
+ // 2. Repeated jetsam SIGKILL under memory pressure. Each cycle
1172
+ // respawns the full child tree and leaks more RSS. The
1173
+ // circuit breaker below stops that loop.
1174
+ this.killProcessGroup(slot, 'SIGKILL');
1175
+ slot.pid = null;
1176
+
1177
+ const now = Date.now();
1178
+ const withinWindow = now - slot.lastFailureAt <= RESTART_FAILURE_WINDOW_MS;
1179
+ slot.consecutiveFailures = withinWindow ? slot.consecutiveFailures + 1 : 1;
1180
+ slot.lastFailureAt = now;
977
1181
  slot.restarts += 1;
978
1182
  slot.lastError = `exited code=${code} signal=${signal}`;
1183
+
1184
+ if (slot.consecutiveFailures >= MAX_CONSECUTIVE_RESTARTS) {
1185
+ slot.status = 'failed';
1186
+ slot.lastError =
1187
+ `Circuit breaker tripped: ${slot.consecutiveFailures} consecutive non-clean exits ` +
1188
+ `within ${Math.round(RESTART_FAILURE_WINDOW_MS / 1000)}s (last: code=${code} signal=${signal}). ` +
1189
+ `Most recent on macOS is jetsam OOM (signal=SIGKILL with code=null); ` +
1190
+ `the previous incarnation's vite/tsserver/preview-manager children were reaped to ` +
1191
+ `prevent further RSS growth. Stop, fix the workspace, and call resetFailure(projectId) ` +
1192
+ `(or stop(projectId)) to allow another spawn attempt.`;
1193
+ this.releasePort(slot.agentPort);
1194
+ slot.agentPort = 0;
1195
+ slot.apiServerPort = 0;
1196
+ if (slot.restartTimer) { clearTimeout(slot.restartTimer); slot.restartTimer = null; }
1197
+ if (slot.idleTimer) { clearTimeout(slot.idleTimer); slot.idleTimer = null; }
1198
+ this.log.error(`[WorkerRuntimeManager] ${slot.lastError}`);
1199
+ return;
1200
+ }
1201
+
979
1202
  const delay = this.restartBackoffMs(slot.restarts);
980
1203
  slot.status = 'restarting';
981
1204
  this.log.warn(
982
1205
  `[WorkerRuntimeManager] restarting ${slot.projectId} in ${Math.round(delay / 1000)}s ` +
983
- `(restart #${slot.restarts})`,
1206
+ `(restart #${slot.restarts}, consecutive failures ${slot.consecutiveFailures}/${MAX_CONSECUTIVE_RESTARTS})`,
984
1207
  );
985
1208
  if (slot.restartTimer) clearTimeout(slot.restartTimer);
986
1209
  slot.restartTimer = setTimeout(() => {
987
1210
  slot.restartTimer = null;
1211
+ if (slot.status === 'failed' || this.stopped) return;
988
1212
  slot.startPromise = this.doStart(slot).then((r) => {
989
1213
  slot.startPromise = null;
990
1214
  return r;
@@ -994,6 +1218,7 @@ export class WorkerRuntimeManager implements RuntimeResolver {
994
1218
  return slot;
995
1219
  });
996
1220
  }, delay);
1221
+ try { slot.restartTimer.unref?.(); } catch { /* unref is best-effort */ }
997
1222
  }
998
1223
 
999
1224
  private restartBackoffMs(restarts: number): number {