@shogo-ai/worker 1.7.14 → 1.8.2
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
package/package.json
CHANGED
|
@@ -0,0 +1,310 @@
|
|
|
1
|
+
// SPDX-License-Identifier: MIT
|
|
2
|
+
// Copyright (C) 2026 Shogo Technologies, Inc.
|
|
3
|
+
/**
|
|
4
|
+
* Coverage for the {@link WorkerRuntimeManager} circuit breaker and
|
|
5
|
+
* process-group reaping introduced after a single user's project ran
|
|
6
|
+
* away with 54 consecutive `signal=SIGKILL` jetsam exits, each cycle
|
|
7
|
+
* leaking the agent-runtime's vite / preview-manager / LSP children
|
|
8
|
+
* (see main.log lines 5258–7093 for the prod reproduction).
|
|
9
|
+
*
|
|
10
|
+
* The unfixed code:
|
|
11
|
+
* - Spawned the runtime with `detached: false` and only ever called
|
|
12
|
+
* `proc.kill(...)` on the parent, so the kernel left the child
|
|
13
|
+
* subprocess tree alive after each jetsam SIGKILL of the parent.
|
|
14
|
+
* Symptom: `Force-killed leaked process N on port 37273` 10 times
|
|
15
|
+
* in a row, EADDRINUSE on the next spawn.
|
|
16
|
+
* - Had no max-restart cap; `handleExit` incremented forever, so
|
|
17
|
+
* once jetsam fired the first time the worker was locked into a
|
|
18
|
+
* ~1/minute respawn loop until the user quit the app. Symptom:
|
|
19
|
+
* `restart #54` in the same project's log.
|
|
20
|
+
*
|
|
21
|
+
* This file pins three contracts:
|
|
22
|
+
*
|
|
23
|
+
* 1. Every non-clean exit reaps the runtime's process group via the
|
|
24
|
+
* recorded PID (the orphan-sweep that wasn't happening before).
|
|
25
|
+
* 2. After {@link MAX_CONSECUTIVE_RESTARTS} consecutive non-clean
|
|
26
|
+
* exits within {@link RESTART_FAILURE_WINDOW_MS}, the slot
|
|
27
|
+
* transitions to `'failed'` and the manager stops scheduling
|
|
28
|
+
* restarts. `ensureRunning(...)` then refuses to silently
|
|
29
|
+
* respawn until the operator calls `resetFailure(...)`.
|
|
30
|
+
* 3. A clean exit (code=0, signal=null) does NOT increment the
|
|
31
|
+
* failure budget — bounded chat sessions that exit normally
|
|
32
|
+
* after their work is done should leave the slot in the same
|
|
33
|
+
* "healthy" state they found it in.
|
|
34
|
+
*/
|
|
35
|
+
import { afterEach, beforeEach, describe, expect, it } from 'bun:test';
|
|
36
|
+
import { WorkerRuntimeManager } from '../runtime-manager.ts';
|
|
37
|
+
|
|
38
|
+
/** Listener signature accepted (and ignored) by the fake's `on` / `once`. */
type FakeProcListener = (...args: unknown[]) => void;

/**
 * The slice of a child-process handle that these tests put on a slot:
 * exit bookkeeping fields, the pid, `kill`, and chainable `on` / `once`.
 */
interface FakeProc {
  exitCode: number | null;
  signalCode: NodeJS.Signals | null;
  killed: boolean;
  pid: number;
  kill: (signal?: NodeJS.Signals | number) => boolean;
  once: (event: string, cb: FakeProcListener) => FakeProc;
  on: (event: string, cb: FakeProcListener) => FakeProc;
}

/**
 * Build an inert child-process stand-in.
 *
 * `kill` only flips the `killed` flag and reports success; `on` / `once`
 * drop the listener and return the fake itself so calls can be chained.
 *
 * @param pid pid to report on the fake (defaults to 99999).
 * @returns the fake process handle.
 */
function makeFakeProc(pid = 99999): FakeProc {
  const fake: FakeProc = {
    exitCode: null,
    signalCode: null,
    killed: false,
    pid,
    kill: (_signal?: NodeJS.Signals | number): boolean => {
      fake.killed = true;
      return true;
    },
    once: (_event: string, _cb: FakeProcListener): FakeProc => fake,
    on: (_event: string, _cb: FakeProcListener): FakeProc => fake,
  };
  return fake;
}
|
|
67
|
+
|
|
68
|
+
/**
 * Shape of the slot these tests plant in the manager's private `runtimes`
 * map. Declared explicitly (instead of being inferred from the literal
 * below) because the manager mutates the slot after insertion: `status`
 * moves through other `RuntimeStatus` values, `pid` is cleared to `null`,
 * timers are assigned, and `lastError` is added. An inferred literal type
 * (`status: 'running'`, `restartTimer: null`, no `lastError`) rejects the
 * assertions and assignments the tests make against those fields.
 */
interface FakeSlot {
  projectId: string;
  agentPort: number;
  apiServerPort: number;
  status: import('../runtime-manager.ts').RuntimeStatus;
  proc: ReturnType<typeof makeFakeProc> | null;
  pid: number | null;
  startedAt: number;
  lastUsedAt: number;
  restarts: number;
  consecutiveFailures: number;
  lastFailureAt: number;
  graceTimer: ReturnType<typeof setTimeout> | null;
  restartTimer: ReturnType<typeof setTimeout> | null;
  idleTimer: ReturnType<typeof setTimeout> | null;
  lastError?: string;
  spawnConfig: unknown;
  startPromise: Promise<unknown> | null;
}

/**
 * Plant a slot in `'running'` state directly into the manager's private
 * `runtimes` map, bypassing any real spawn.
 *
 * The slot starts with zeroed ports, zeroed restart/failure counters, no
 * timers, and a fake process handle reporting `pid`.
 *
 * @param mgr manager whose private map receives the slot.
 * @param projectId key under which the slot is stored.
 * @param pid pid recorded on both the slot and its fake process.
 * @returns the inserted slot, so tests can observe the manager's mutations.
 */
function insertRunningSlot(mgr: WorkerRuntimeManager, projectId: string, pid = 99999): FakeSlot {
  const now = Date.now();
  const slot: FakeSlot = {
    projectId,
    agentPort: 0,
    apiServerPort: 0,
    status: 'running',
    proc: makeFakeProc(pid),
    pid,
    startedAt: now,
    lastUsedAt: now,
    restarts: 0,
    consecutiveFailures: 0,
    lastFailureAt: 0,
    graceTimer: null,
    restartTimer: null,
    idleTimer: null,
    // Never read on the code paths these tests drive; an empty placeholder is enough.
    spawnConfig: {},
    startPromise: null,
  };
  // `runtimes` is private on the manager; reach in through a structural cast.
  (mgr as unknown as { runtimes: Map<string, FakeSlot> }).runtimes.set(projectId, slot);
  return slot;
}
|
|
91
|
+
|
|
92
|
+
/**
 * Invoke the manager's private `handleExit` as if the slot's child process
 * had just exited.
 *
 * @param mgr manager under test.
 * @param slot slot previously returned by `insertRunningSlot`.
 * @param code exit code to report (`null` when killed by a signal).
 * @param signal terminating signal to report (`null` on a normal exit).
 */
function handleExit(
  mgr: WorkerRuntimeManager,
  slot: ReturnType<typeof insertRunningSlot>,
  code: number | null,
  signal: NodeJS.Signals | null,
): void {
  type ExitHandlerHost = {
    handleExit: (s: typeof slot, c: number | null, sg: NodeJS.Signals | null) => void;
  };
  // Call through the manager object so `this` stays bound to it.
  const host = mgr as unknown as ExitHandlerHost;
  host.handleExit(slot, code, signal);
}
|
|
102
|
+
|
|
103
|
+
/** Logger stub handed to every manager so test output stays quiet. */
const noop = (): void => {};
const SILENT = { log: noop, warn: noop, error: noop } as const;

/** One recorded `process.kill(target, signal)` invocation. */
interface KillCall {
  target: number;
  signal: NodeJS.Signals | number | undefined;
}

// Every process.kill call made during the current test. The tests look for
// a negative `target`, which is how a process group (rather than a single
// process) is addressed.
let killCalls: KillCall[] = [];
const realProcessKill = process.kill;

/**
 * Stand-in for process.kill: records the call and reports success without
 * signalling anything, since a fake pid could belong to a real process on
 * the CI host.
 */
const recordKill = (pid: number, signal?: NodeJS.Signals | number): true => {
  killCalls.push({ target: pid, signal });
  return true;
};

beforeEach(() => {
  killCalls = [];
  process.kill = recordKill as typeof process.kill;
});

afterEach(() => {
  process.kill = realProcessKill;
});
|
|
128
|
+
|
|
129
|
+
describe('WorkerRuntimeManager process-group reaping on non-clean exit', () => {
  // A non-clean exit below the circuit-breaker cap leaves a restart timer
  // armed on the slot. If it fires while the test process is still alive it
  // calls the real doStart() on a fake slot, after process.kill has been
  // restored. Cancel it once the assertions that need the exit are done.
  const cancelPendingRestart = (slot: ReturnType<typeof insertRunningSlot>): void => {
    if (slot.restartTimer) {
      clearTimeout(slot.restartTimer);
      slot.restartTimer = null;
    }
  };

  it('non-clean exit fires SIGKILL at the negative pid (the process group)', () => {
    const mgr = new WorkerRuntimeManager({ logger: SILENT });
    const slot = insertRunningSlot(mgr, 'proj-reap', 12345);

    handleExit(mgr, slot, null, 'SIGKILL');
    cancelPendingRestart(slot);

    const pgKills = killCalls.filter((c) => c.target === -12345);
    if (process.platform === 'win32') {
      // killProcessGroup is a no-op on Windows; the inner preview-manager
      // owns its own port-scoped reaper there.
      expect(pgKills).toHaveLength(0);
    } else {
      expect(pgKills).toHaveLength(1);
      expect(pgKills[0]!.signal).toBe('SIGKILL');
    }
  });

  it('clean exit (code=0, signal=null) does NOT reap the process group', () => {
    const mgr = new WorkerRuntimeManager({ logger: SILENT });
    const slot = insertRunningSlot(mgr, 'proj-clean', 12346);

    handleExit(mgr, slot, 0, null);

    expect(killCalls.filter((c) => c.target === -12346)).toHaveLength(0);
    // Slot is removed from the map on clean exit.
    const runtimes = (mgr as unknown as { runtimes: Map<string, unknown> }).runtimes;
    expect(runtimes.has('proj-clean')).toBe(false);
  });

  it('handleExit clears the recorded pid so a follow-up call is a no-op', () => {
    const mgr = new WorkerRuntimeManager({ logger: SILENT });
    const slot = insertRunningSlot(mgr, 'proj-pid-clear', 12347);

    handleExit(mgr, slot, null, 'SIGKILL');
    expect(slot.pid).toBeNull();

    killCalls = [];
    handleExit(mgr, slot, null, 'SIGKILL');
    // The second exit re-arms the restart timer; make sure none outlives the test.
    cancelPendingRestart(slot);
    // No pid recorded → no further -pid kills should be issued.
    expect(killCalls.filter((c) => c.target === -12347)).toHaveLength(0);
  });
});
|
|
172
|
+
|
|
173
|
+
describe('WorkerRuntimeManager circuit breaker on consecutive failures', () => {
  // Every non-clean exit below the cap arms a restart timer on the slot.
  // Cancel it so a real doStart() never runs against a fake slot after the
  // test has finished.
  const cancelPendingRestart = (slot: ReturnType<typeof insertRunningSlot>): void => {
    if (slot.restartTimer) {
      clearTimeout(slot.restartTimer);
      slot.restartTimer = null;
    }
  };

  it('parks the slot in failed after MAX_CONSECUTIVE_RESTARTS consecutive jetsam SIGKILLs', () => {
    const mgr = new WorkerRuntimeManager({ logger: SILENT });
    const slot = insertRunningSlot(mgr, 'proj-jetsam', 22222);

    // 8 is the constant in runtime-manager.ts; if it changes the
    // assertion below has to follow. Hard-coding the expected count
    // here surfaces the contract change at review time.
    for (let i = 0; i < 8; i++) {
      handleExit(mgr, slot, null, 'SIGKILL');
    }

    expect(slot.status).toBe('failed');
    expect(slot.consecutiveFailures).toBe(8);
    expect(slot.restartTimer).toBeNull();
    expect(slot.lastError).toMatch(/Circuit breaker tripped/);
    // The port is released so a future resetFailure() can re-allocate
    // from a clean state.
    expect(slot.agentPort).toBe(0);
    expect(slot.apiServerPort).toBe(0);
  });

  it('still schedules a restart on the Nth non-clean exit when N < MAX', () => {
    const mgr = new WorkerRuntimeManager({ logger: SILENT });
    const slot = insertRunningSlot(mgr, 'proj-retry', 22223);

    for (let i = 0; i < 3; i++) {
      handleExit(mgr, slot, null, 'SIGKILL');
    }

    expect(slot.status).toBe('restarting');
    expect(slot.consecutiveFailures).toBe(3);
    expect(slot.restartTimer).not.toBeNull();
    // Clean up the scheduled timer so it cannot fire after the test.
    cancelPendingRestart(slot);
  });

  it('clean exit between failures does NOT increment the failure budget', () => {
    const mgr = new WorkerRuntimeManager({ logger: SILENT });
    const slot = insertRunningSlot(mgr, 'proj-mixed', 22224);

    handleExit(mgr, slot, null, 'SIGKILL');
    expect(slot.consecutiveFailures).toBe(1);
    // The failure above armed a restart timer; drop it before going on.
    cancelPendingRestart(slot);

    handleExit(mgr, slot, 0, null);
    // The clean exit itself must leave the counter where the failure put it.
    expect(slot.consecutiveFailures).toBe(1);

    // A clean exit removes the slot from the map. Re-insert a fresh
    // slot to model "user reopened the project after a healthy
    // shutdown" — its failure budget should be the default zero, not
    // a continuation of the previous slot's counter.
    const runtimes = (mgr as unknown as { runtimes: Map<string, unknown> }).runtimes;
    expect(runtimes.has('proj-mixed')).toBe(false);

    const freshSlot = insertRunningSlot(mgr, 'proj-mixed', 22225);
    expect(freshSlot.consecutiveFailures).toBe(0);
  });

  it('a failure OUTSIDE the rolling window resets the counter to 1', () => {
    const mgr = new WorkerRuntimeManager({ logger: SILENT });
    const slot = insertRunningSlot(mgr, 'proj-window', 22226);

    handleExit(mgr, slot, null, 'SIGKILL');
    expect(slot.consecutiveFailures).toBe(1);
    cancelPendingRestart(slot);

    // Backdate the last failure to outside the window (default 5min).
    // Date.now() is stable enough inside one test run; we just shift
    // the recorded timestamp into the past.
    slot.lastFailureAt = Date.now() - (10 * 60 * 1000);

    handleExit(mgr, slot, null, 'SIGKILL');
    expect(slot.consecutiveFailures).toBe(1);
    cancelPendingRestart(slot);
  });

  it('exits triggered by stop() (status=stopping) do NOT count toward the failure budget', () => {
    const mgr = new WorkerRuntimeManager({ logger: SILENT });
    const slot = insertRunningSlot(mgr, 'proj-stop', 22227);
    slot.status = 'stopping';

    handleExit(mgr, slot, null, 'SIGTERM');

    expect(slot.consecutiveFailures).toBe(0);
    expect(slot.status).toBe('stopped');
    // And the process group is NOT re-reaped — stop() already did that
    // before signalling the parent.
    expect(killCalls.filter((c) => c.target === -22227)).toHaveLength(0);
  });
});
|
|
264
|
+
|
|
265
|
+
describe('WorkerRuntimeManager resetFailure', () => {
  /** True when the manager's private `runtimes` map still holds `projectId`. */
  const hasSlot = (mgr: WorkerRuntimeManager, projectId: string): boolean => {
    const { runtimes } = mgr as unknown as { runtimes: Map<string, unknown> };
    return runtimes.has(projectId);
  };

  it('clears a failed slot so the next ensureRunning can re-spawn', () => {
    const mgr = new WorkerRuntimeManager({ logger: SILENT });
    const slot = insertRunningSlot(mgr, 'proj-reset', 33333);

    // Eight SIGKILL exits in a row trip the circuit breaker.
    let crashes = 0;
    while (crashes < 8) {
      handleExit(mgr, slot, null, 'SIGKILL');
      crashes += 1;
    }
    expect(slot.status).toBe('failed');

    const reset = mgr.resetFailure('proj-reset');

    expect(reset).toBe(true);
    expect(hasSlot(mgr, 'proj-reset')).toBe(false);
  });

  it('returns false for a running slot — guard against accidental resets', () => {
    const mgr = new WorkerRuntimeManager({ logger: SILENT });
    insertRunningSlot(mgr, 'proj-running', 33334);

    const reset = mgr.resetFailure('proj-running');

    expect(reset).toBe(false);
    // The slot must still be there when resetFailure declines to act.
    expect(hasSlot(mgr, 'proj-running')).toBe(true);
  });

  it('returns false for an unknown projectId', () => {
    const mgr = new WorkerRuntimeManager({ logger: SILENT });
    const reset = mgr.resetFailure('does-not-exist');
    expect(reset).toBe(false);
  });
});
|
|
294
|
+
|
|
295
|
+
describe('WorkerRuntimeManager.ensureRunning refuses failed slots', () => {
  it('throws a directive error pointing at resetFailure when the slot is parked', async () => {
    const mgr = new WorkerRuntimeManager({ logger: SILENT });
    const slot = insertRunningSlot(mgr, 'proj-locked', 44444);

    // Trip the circuit breaker with eight SIGKILL exits in a row.
    Array.from({ length: 8 }).forEach(() => {
      handleExit(mgr, slot, null, 'SIGKILL');
    });

    // ensureRunning checks for the failed status before it reaches
    // doStart, so the spawn config is never used — placeholder values
    // are enough here.
    const attempt = mgr.ensureRunning('proj-locked', {
      cloudUrl: 'https://example.invalid',
      apiKey: 'unused',
    });

    await expect(attempt).rejects.toThrow(/Circuit breaker tripped|resetFailure/);
  });
});
|
|
@@ -54,12 +54,53 @@ const RUNTIME_IDLE_MS = 15 * 60 * 1000;
|
|
|
54
54
|
const RESTART_BACKOFF_BASE_MS = 1_000;
|
|
55
55
|
const RESTART_BACKOFF_MAX_MS = 60_000;
|
|
56
56
|
|
|
57
|
+
/**
 * Circuit-breaker budget: once a slot has accumulated this many
 * *consecutive* non-clean exits, the manager stops respawning it and parks
 * it in `'failed'`.
 *
 * Without a cap, a project the OS keeps OOM-killing (macOS jetsam SIGKILL
 * is the canonical case) would respawn indefinitely, each cycle bringing
 * up bun + vite + tsserver + pyright again and, on posix, leaking the
 * children of any incarnation that could not be killed through its
 * process group.
 *
 * The value is meant to be large enough to ride out a transient port-bind
 * race or a one-shot dependency upgrade, yet small enough that an operator
 * chasing runaway RSS sees the loop stop before the next memory-pressure
 * cycle.
 * NOTE(review): the earlier comment estimated "~7m of escalating backoff"
 * for 8 attempts — confirm against `restartBackoffMs`.
 */
const MAX_CONSECUTIVE_RESTARTS = 8;

/**
 * Largest gap allowed between two successive non-clean exits for the
 * second one to extend the current failure streak. Each failure is
 * compared against the timestamp of the previous failure; one that
 * arrives later than this starts a new streak at 1.
 */
const RESTART_FAILURE_WINDOW_MS = 5 * 60_000;

/**
 * How long a runtime must stay up after the /health-gated `'running'`
 * transition before it counts as "recovered" and its consecutive-failure
 * counter is reset. This lets a project that crashed a couple of times on
 * cold start (e.g. while waiting for the API port to be released), and
 * then ran healthily, begin with a fresh budget the next time it hiccups.
 */
const STARTUP_GRACE_MS = 60 * 1_000;
|
|
84
|
+
|
|
57
85
|
/** Health check poll interval while waiting for /health. */
|
|
58
86
|
const HEALTH_POLL_MS = 500;
|
|
59
87
|
/** Total timeout waiting for first /health success after spawn. */
|
|
60
88
|
const HEALTH_BOOT_TIMEOUT_MS = 30_000;
|
|
61
89
|
|
|
62
|
-
export type RuntimeStatus =
|
|
90
|
+
/** Lifecycle states a runtime slot can report. */
export type RuntimeStatus =
  | 'starting'
  | 'running'
  | 'restarting'
  | 'stopping'
  | 'stopped'
  | 'error'
  // Terminal: the circuit breaker tripped. The slot is kept in the
  // `runtimes` map (so `status(projectId)` keeps reporting it), but nothing
  // is spawned for it again until `WorkerRuntimeManager.resetFailure` is
  // called or the slot is explicitly `stop()`'d.
  | 'failed';
|
|
63
104
|
|
|
64
105
|
export interface RuntimeStatusInfo {
|
|
65
106
|
projectId: string;
|
|
@@ -215,9 +256,27 @@ interface InternalRuntime {
|
|
|
215
256
|
apiServerPort: number;
|
|
216
257
|
status: RuntimeStatus;
|
|
217
258
|
proc: ChildProcess | null;
|
|
259
|
+
/**
|
|
260
|
+
* PID of the most recent spawn, retained after `proc` is nulled in
|
|
261
|
+
* {@link WorkerRuntimeManager.handleExit}. On posix, the runtime is
|
|
262
|
+
* spawned as a process group leader (`detached: true`), so this is
|
|
263
|
+
* also the PGID — `process.kill(-pid, ...)` cascades to vite, the
|
|
264
|
+
* preview-manager's inner API server, tsserver and pyright that the
|
|
265
|
+
* runtime spawned, which otherwise survive a SIGKILL of the parent
|
|
266
|
+
* (jetsam OOM) and accumulate as orphans until app restart.
|
|
267
|
+
*/
|
|
268
|
+
pid: number | null;
|
|
218
269
|
startedAt: number;
|
|
219
270
|
lastUsedAt: number;
|
|
220
271
|
restarts: number;
|
|
272
|
+
/** Consecutive non-clean exits since the last healthy run. */
|
|
273
|
+
consecutiveFailures: number;
|
|
274
|
+
/** Timestamp (Date.now) of the most recent non-clean exit. Used to
|
|
275
|
+
* detect "loop within the failure window" for the circuit breaker. */
|
|
276
|
+
lastFailureAt: number;
|
|
277
|
+
/** Timer that resets `consecutiveFailures` to 0 once a fresh run has
|
|
278
|
+
* survived for {@link STARTUP_GRACE_MS}. */
|
|
279
|
+
graceTimer: ReturnType<typeof setTimeout> | null;
|
|
221
280
|
restartTimer: ReturnType<typeof setTimeout> | null;
|
|
222
281
|
idleTimer: ReturnType<typeof setTimeout> | null;
|
|
223
282
|
lastError?: string;
|
|
@@ -469,6 +528,20 @@ export class WorkerRuntimeManager implements RuntimeResolver {
|
|
|
469
528
|
async ensureRunning(projectId: string, config: ProjectSpawnConfig): Promise<RuntimeStatusInfo> {
|
|
470
529
|
if (this.stopped) throw new Error('WorkerRuntimeManager is stopped');
|
|
471
530
|
|
|
531
|
+
// Refuse circuit-broken slots BEFORE auto-pull so we don't churn
|
|
532
|
+
// the network/disk on a project we already know we won't spawn.
|
|
533
|
+
// Surfacing the parked-state message lets the caller (tunnel
|
|
534
|
+
// proxy, desktop UI) render an actionable error instead of the
|
|
535
|
+
// generic auto-pull / spawn failure.
|
|
536
|
+
const failedExisting = this.runtimes.get(projectId);
|
|
537
|
+
if (failedExisting?.status === 'failed') {
|
|
538
|
+
throw new Error(
|
|
539
|
+
`[WorkerRuntimeManager] cannot ensureRunning(${projectId}): ` +
|
|
540
|
+
`${failedExisting.lastError ?? 'runtime is in failed state'}. ` +
|
|
541
|
+
`Call resetFailure(${projectId}) or stop(${projectId}) before retrying.`,
|
|
542
|
+
);
|
|
543
|
+
}
|
|
544
|
+
|
|
472
545
|
// Apply auto-pull before any runtime spawn so the runtime's PROJECT_DIR
|
|
473
546
|
// points at a fully-cloned workspace. Idempotent: subsequent calls hit
|
|
474
547
|
// the `pulledProjects` short-circuit.
|
|
@@ -761,14 +834,46 @@ export class WorkerRuntimeManager implements RuntimeResolver {
|
|
|
761
834
|
r.status = 'stopping';
|
|
762
835
|
if (r.restartTimer) { clearTimeout(r.restartTimer); r.restartTimer = null; }
|
|
763
836
|
if (r.idleTimer) { clearTimeout(r.idleTimer); r.idleTimer = null; }
|
|
837
|
+
if (r.graceTimer) { clearTimeout(r.graceTimer); r.graceTimer = null; }
|
|
764
838
|
if (r.proc) {
|
|
839
|
+
// Send the requested signal to the whole process group first so
|
|
840
|
+
// children (vite, preview-manager's API server, LSPs) start their
|
|
841
|
+
// own graceful shutdown in parallel with the parent.
|
|
842
|
+
this.killProcessGroup(r, signal);
|
|
765
843
|
try { r.proc.kill(signal); } catch { /* already gone */ }
|
|
766
844
|
await this.waitForExit(r.proc, 5000);
|
|
845
|
+
// Belt-and-suspenders: if the grace window elapsed without a
|
|
846
|
+
// clean exit, `waitForExit` already SIGKILL'd the parent — chase
|
|
847
|
+
// the rest of the group too in case any child ignored SIGTERM.
|
|
848
|
+
this.killProcessGroup(r, 'SIGKILL');
|
|
767
849
|
}
|
|
850
|
+
r.pid = null;
|
|
768
851
|
this.releasePort(r.agentPort);
|
|
769
852
|
this.runtimes.delete(projectId);
|
|
770
853
|
}
|
|
771
854
|
|
|
855
|
+
/**
 * Release a slot that the circuit breaker parked in `'failed'`.
 *
 * The slot is removed from the `runtimes` map, so the next
 * `ensureRunning(projectId, …)` no longer finds a failed slot and starts
 * over with a fresh one. Meant for the desktop's "reopen project" flow and
 * for operators who have fixed whatever was crashing the runtime (freed
 * memory, deleted a corrupted workspace file, …) and want to retry without
 * tearing down the whole worker.
 *
 * @param projectId project whose failed slot should be cleared.
 * @returns `true` if a failed slot was cleared; `false` (and nothing is
 *   touched) when the project is unknown or not in `'failed'`.
 */
resetFailure(projectId: string): boolean {
  const parked = this.runtimes.get(projectId);
  if (parked?.status !== 'failed') return false;

  // Cancel any timer still attached to the slot before it is dropped.
  for (const timerKey of ['restartTimer', 'idleTimer', 'graceTimer'] as const) {
    const handle = parked[timerKey];
    if (handle) {
      clearTimeout(handle);
      parked[timerKey] = null;
    }
  }

  this.runtimes.delete(projectId);
  this.log.log(`[WorkerRuntimeManager] resetFailure: ${projectId} cleared, next ensureRunning will respawn`);
  return true;
}
|
|
876
|
+
|
|
772
877
|
async stopAll(signal: NodeJS.Signals = 'SIGTERM'): Promise<void> {
|
|
773
878
|
this.stopped = true;
|
|
774
879
|
// Stop watchers FIRST so their final flush has a chance to PUT before
|
|
@@ -801,9 +906,13 @@ export class WorkerRuntimeManager implements RuntimeResolver {
|
|
|
801
906
|
apiServerPort: 0,
|
|
802
907
|
status: 'starting',
|
|
803
908
|
proc: null,
|
|
909
|
+
pid: null,
|
|
804
910
|
startedAt: 0,
|
|
805
911
|
lastUsedAt: Date.now(),
|
|
806
912
|
restarts: 0,
|
|
913
|
+
consecutiveFailures: 0,
|
|
914
|
+
lastFailureAt: 0,
|
|
915
|
+
graceTimer: null,
|
|
807
916
|
restartTimer: null,
|
|
808
917
|
idleTimer: null,
|
|
809
918
|
spawnConfig: config,
|
|
@@ -833,14 +942,35 @@ export class WorkerRuntimeManager implements RuntimeResolver {
|
|
|
833
942
|
`via ${command} ${args.join(' ')} (port=${slot.agentPort}, source=${resolved.source})`,
|
|
834
943
|
);
|
|
835
944
|
|
|
945
|
+
// Spawn the runtime as its own process group leader (posix only —
|
|
946
|
+
// Windows has no equivalent and Node's child_process docs warn that
|
|
947
|
+
// `detached: true` there gives you a separate console window, not a
|
|
948
|
+
// PGID). Mirrors what apps/desktop/src/local-server.ts already does
|
|
949
|
+
// for the outer API server so a single kill at teardown reaches the
|
|
950
|
+
// bun child + every subprocess it spawned (vite, the inner
|
|
951
|
+
// preview-manager API server, tsserver, pyright). Without this,
|
|
952
|
+
// jetsam SIGKILL of the bun parent leaves all of those orphans
|
|
953
|
+
// alive — they keep their listening sockets, the next respawn
|
|
954
|
+
// races into EADDRINUSE, and RSS climbs forever (see the storm
|
|
955
|
+
// pattern in main.log lines 5258–7093 where preview-manager has to
|
|
956
|
+
// walk the API port and `Force-killed leaked process` 10 times in
|
|
957
|
+
// a row).
|
|
958
|
+
const useProcessGroup = process.platform !== 'win32';
|
|
836
959
|
const proc = spawn(command, args, {
|
|
837
960
|
cwd,
|
|
838
961
|
env,
|
|
839
|
-
detached:
|
|
962
|
+
detached: useProcessGroup,
|
|
840
963
|
stdio: ['ignore', 'pipe', 'pipe'],
|
|
841
964
|
});
|
|
965
|
+
if (useProcessGroup) {
|
|
966
|
+
// The detached child would otherwise keep the parent's event
|
|
967
|
+
// loop alive even after we've removed it from `runtimes` —
|
|
968
|
+
// matches the apps/desktop precedent at local-server.ts:412.
|
|
969
|
+
try { proc.unref(); } catch { /* unref is best-effort */ }
|
|
970
|
+
}
|
|
842
971
|
|
|
843
972
|
slot.proc = proc;
|
|
973
|
+
slot.pid = proc.pid ?? null;
|
|
844
974
|
slot.status = 'starting';
|
|
845
975
|
slot.startedAt = Date.now();
|
|
846
976
|
|
|
@@ -870,11 +1000,17 @@ export class WorkerRuntimeManager implements RuntimeResolver {
|
|
|
870
1000
|
slot.status = 'running';
|
|
871
1001
|
slot.lastUsedAt = Date.now();
|
|
872
1002
|
this.armIdleTimer(slot);
|
|
1003
|
+
this.armGraceTimer(slot);
|
|
873
1004
|
return slot;
|
|
874
1005
|
} catch (err: any) {
|
|
875
1006
|
slot.status = 'error';
|
|
876
1007
|
slot.lastError = err?.message ?? String(err);
|
|
877
|
-
|
|
1008
|
+
// The /health wait timed out (or the spawn itself failed). Tear
|
|
1009
|
+
// down the whole process group rather than just the parent so we
|
|
1010
|
+
// don't leave a half-booted preview-manager + vite running on
|
|
1011
|
+
// the allocated ports.
|
|
1012
|
+
this.killProcessGroup(slot, 'SIGTERM');
|
|
1013
|
+
try { proc.kill('SIGTERM'); } catch { /* already gone */ }
|
|
878
1014
|
this.releasePort(slot.agentPort);
|
|
879
1015
|
slot.agentPort = 0;
|
|
880
1016
|
slot.apiServerPort = 0;
|
|
@@ -882,6 +1018,51 @@ export class WorkerRuntimeManager implements RuntimeResolver {
|
|
|
882
1018
|
}
|
|
883
1019
|
}
|
|
884
1020
|
|
|
1021
|
+
/**
 * Send `signal` to every process in the group led by `slot.pid`.
 *
 * Best-effort: does nothing on Windows, does nothing when no pid is
 * recorded, and ignores any error from the kill call.
 *
 * The recorded `slot.pid` is used rather than `slot.proc.pid` because
 * `handleExit` nulls `proc` before reaping. The process group outlives its
 * leader until the last member exits, so the id captured at spawn can
 * still address the orphans.
 *
 * @param slot runtime whose process group should be signalled.
 * @param signal signal to deliver to the group.
 */
private killProcessGroup(slot: InternalRuntime, signal: NodeJS.Signals): void {
  const groupsSupported = process.platform !== 'win32';
  const leaderPid = slot.pid;
  if (!groupsSupported || !leaderPid) return;

  try {
    // A negative pid addresses the whole process group.
    process.kill(-leaderPid, signal);
  } catch {
    // ESRCH (no such process group) is the expected outcome when every
    // child already exited with the parent. EPERM is the only other
    // plausible failure, and the caller could not act on it anyway.
  }
}
|
|
1045
|
+
|
|
1046
|
+
/**
 * (Re)start the recovery timer for `slot`.
 *
 * Any timer already pending is cancelled first. When the new timer fires,
 * {@link STARTUP_GRACE_MS} after this call, it clears its own handle and
 * resets a non-zero `consecutiveFailures` to 0. The timer is unref'd so it
 * does not keep the process alive by itself.
 *
 * @param slot runtime whose failure streak should be forgiven after the
 *   grace period.
 */
private armGraceTimer(slot: InternalRuntime): void {
  const pending = slot.graceTimer;
  if (pending) {
    clearTimeout(pending);
    slot.graceTimer = null;
  }

  const forgiveFailures = (): void => {
    slot.graceTimer = null;
    if (slot.consecutiveFailures > 0) {
      slot.consecutiveFailures = 0;
    }
  };

  const timer = setTimeout(forgiveFailures, STARTUP_GRACE_MS);
  slot.graceTimer = timer;
  try {
    timer.unref?.();
  } catch {
    /* unref is best-effort */
  }
}
|
|
1065
|
+
|
|
885
1066
|
private buildEnv(slot: InternalRuntime, runtimeBinPath: string): NodeJS.ProcessEnv {
|
|
886
1067
|
const cfg = slot.spawnConfig;
|
|
887
1068
|
const env: NodeJS.ProcessEnv = {
|
|
@@ -956,9 +1137,16 @@ export class WorkerRuntimeManager implements RuntimeResolver {
|
|
|
956
1137
|
`[WorkerRuntimeManager] runtime ${slot.projectId} exited (code=${code}, signal=${signal})`,
|
|
957
1138
|
);
|
|
958
1139
|
slot.proc = null;
|
|
1140
|
+
if (slot.graceTimer) {
|
|
1141
|
+
clearTimeout(slot.graceTimer);
|
|
1142
|
+
slot.graceTimer = null;
|
|
1143
|
+
}
|
|
959
1144
|
|
|
960
1145
|
if (slot.status === 'stopping' || this.stopped) {
|
|
1146
|
+
// We initiated the stop; the orphan reap was already done by
|
|
1147
|
+
// stop()/stopAll(). Just clear bookkeeping.
|
|
961
1148
|
slot.status = 'stopped';
|
|
1149
|
+
slot.pid = null;
|
|
962
1150
|
this.releasePort(slot.agentPort);
|
|
963
1151
|
slot.agentPort = 0;
|
|
964
1152
|
slot.apiServerPort = 0;
|
|
@@ -967,6 +1155,7 @@ export class WorkerRuntimeManager implements RuntimeResolver {
|
|
|
967
1155
|
|
|
968
1156
|
if (exitedClean) {
|
|
969
1157
|
slot.status = 'stopped';
|
|
1158
|
+
slot.pid = null;
|
|
970
1159
|
this.releasePort(slot.agentPort);
|
|
971
1160
|
slot.agentPort = 0;
|
|
972
1161
|
slot.apiServerPort = 0;
|
|
@@ -974,17 +1163,52 @@ export class WorkerRuntimeManager implements RuntimeResolver {
|
|
|
974
1163
|
return;
|
|
975
1164
|
}
|
|
976
1165
|
|
|
1166
|
+
// Non-clean exit. Two failure shapes we care about:
|
|
1167
|
+
// 1. Parent died but children (vite, preview-manager API server,
|
|
1168
|
+
// LSPs) are still alive in the same process group. Without
|
|
1169
|
+
// reaping, the next doStart() races into EADDRINUSE on the
|
|
1170
|
+
// same agent port and starts the storm.
|
|
1171
|
+
// 2. Repeated jetsam SIGKILL under memory pressure. Each cycle
|
|
1172
|
+
// respawns the full child tree and leaks more RSS. The
|
|
1173
|
+
// circuit breaker below stops that loop.
|
|
1174
|
+
this.killProcessGroup(slot, 'SIGKILL');
|
|
1175
|
+
slot.pid = null;
|
|
1176
|
+
|
|
1177
|
+
const now = Date.now();
|
|
1178
|
+
const withinWindow = now - slot.lastFailureAt <= RESTART_FAILURE_WINDOW_MS;
|
|
1179
|
+
slot.consecutiveFailures = withinWindow ? slot.consecutiveFailures + 1 : 1;
|
|
1180
|
+
slot.lastFailureAt = now;
|
|
977
1181
|
slot.restarts += 1;
|
|
978
1182
|
slot.lastError = `exited code=${code} signal=${signal}`;
|
|
1183
|
+
|
|
1184
|
+
if (slot.consecutiveFailures >= MAX_CONSECUTIVE_RESTARTS) {
|
|
1185
|
+
slot.status = 'failed';
|
|
1186
|
+
slot.lastError =
|
|
1187
|
+
`Circuit breaker tripped: ${slot.consecutiveFailures} consecutive non-clean exits ` +
|
|
1188
|
+
`within ${Math.round(RESTART_FAILURE_WINDOW_MS / 1000)}s (last: code=${code} signal=${signal}). ` +
|
|
1189
|
+
`Most recent on macOS is jetsam OOM (signal=SIGKILL with code=null); ` +
|
|
1190
|
+
`the previous incarnation's vite/tsserver/preview-manager children were reaped to ` +
|
|
1191
|
+
`prevent further RSS growth. Stop, fix the workspace, and call resetFailure(projectId) ` +
|
|
1192
|
+
`(or stop(projectId)) to allow another spawn attempt.`;
|
|
1193
|
+
this.releasePort(slot.agentPort);
|
|
1194
|
+
slot.agentPort = 0;
|
|
1195
|
+
slot.apiServerPort = 0;
|
|
1196
|
+
if (slot.restartTimer) { clearTimeout(slot.restartTimer); slot.restartTimer = null; }
|
|
1197
|
+
if (slot.idleTimer) { clearTimeout(slot.idleTimer); slot.idleTimer = null; }
|
|
1198
|
+
this.log.error(`[WorkerRuntimeManager] ${slot.lastError}`);
|
|
1199
|
+
return;
|
|
1200
|
+
}
|
|
1201
|
+
|
|
979
1202
|
const delay = this.restartBackoffMs(slot.restarts);
|
|
980
1203
|
slot.status = 'restarting';
|
|
981
1204
|
this.log.warn(
|
|
982
1205
|
`[WorkerRuntimeManager] restarting ${slot.projectId} in ${Math.round(delay / 1000)}s ` +
|
|
983
|
-
`(restart #${slot.restarts})`,
|
|
1206
|
+
`(restart #${slot.restarts}, consecutive failures ${slot.consecutiveFailures}/${MAX_CONSECUTIVE_RESTARTS})`,
|
|
984
1207
|
);
|
|
985
1208
|
if (slot.restartTimer) clearTimeout(slot.restartTimer);
|
|
986
1209
|
slot.restartTimer = setTimeout(() => {
|
|
987
1210
|
slot.restartTimer = null;
|
|
1211
|
+
if (slot.status === 'failed' || this.stopped) return;
|
|
988
1212
|
slot.startPromise = this.doStart(slot).then((r) => {
|
|
989
1213
|
slot.startPromise = null;
|
|
990
1214
|
return r;
|
|
@@ -994,6 +1218,7 @@ export class WorkerRuntimeManager implements RuntimeResolver {
|
|
|
994
1218
|
return slot;
|
|
995
1219
|
});
|
|
996
1220
|
}, delay);
|
|
1221
|
+
try { slot.restartTimer.unref?.(); } catch { /* unref is best-effort */ }
|
|
997
1222
|
}
|
|
998
1223
|
|
|
999
1224
|
private restartBackoffMs(restarts: number): number {
|