@shogo-ai/worker 1.8.8 → 1.8.10
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
package/package.json
CHANGED
|
@@ -0,0 +1,251 @@
|
|
|
1
|
+
// SPDX-License-Identifier: MIT
|
|
2
|
+
// Copyright (C) 2026 Shogo Technologies, Inc.
|
|
3
|
+
/**
|
|
4
|
+
* Pins {@link WorkerRuntimeManager.waitForHealth} against the three
|
|
5
|
+
* readiness signals it now considers and the diagnostic behavior on
|
|
6
|
+
* timeout. The historical bug this protects against: agent-runtime
|
|
7
|
+
* cold boots on Windows with `bun --conditions=development run
|
|
8
|
+
* packages/agent-runtime/src/server.ts` routinely take 30-45s just
|
|
9
|
+
* to JIT-compile the TS dep graph (shared-runtime + generators +
|
|
10
|
+
* tools + hooks + gateway). The kernel-level TCP listener is up
|
|
11
|
+
* within ~1-2s of spawn (the moment `Bun.serve()` reads its default
|
|
12
|
+
* export), but the event loop is still saturated for tens of seconds
|
|
13
|
+
* after that, so the old HTTP-only /health gate timed out, SIGTERM'd
|
|
14
|
+
* the still-booting child, and the restart-loop chewed through
|
|
15
|
+
* `MAX_CONSECUTIVE_RESTARTS` before giving up — leaving chat stuck
|
|
16
|
+
* with "Connection timed out — The agent runtime could not be
|
|
17
|
+
* reached".
|
|
18
|
+
*
|
|
19
|
+
* The contracts pinned here:
|
|
20
|
+
*
|
|
21
|
+
* 1. HTTP /health 200 returns immediately — the happy path is
|
|
22
|
+
* unchanged for healthy runtimes.
|
|
23
|
+
* 2. TCP-listening + recent stdout activity returns successfully
|
|
24
|
+
* even when /health never responds — handles the cold-boot
|
|
25
|
+
* saturation case.
|
|
26
|
+
* 3. TCP-listening but stdout silent past STDOUT_PROGRESS_WINDOW_MS
|
|
27
|
+
* does NOT short-circuit — a truly-wedged process still hits
|
|
28
|
+
* the hard timeout.
|
|
29
|
+
* 4. Process death (exitCode/signalCode/killed) short-circuits
|
|
30
|
+
* immediately with the exit info, instead of waiting the full
|
|
31
|
+
* timeout window.
|
|
32
|
+
*/
|
|
33
|
+
import { afterEach, beforeEach, describe, expect, it } from 'bun:test';
|
|
34
|
+
import { WorkerRuntimeManager } from '../runtime-manager.ts';
|
|
35
|
+
|
|
36
|
+
/**
 * Minimal stand-in for the child-process handle stored on a slot.
 * The tests flip `exitCode` / `signalCode` to simulate the child dying
 * while `waitForHealth` is still polling.
 */
interface FakeProc {
  /** Numeric exit status; `null` while the fake child is considered alive. */
  exitCode: number | null;
  /** Terminating signal name; `null` unless a signal death is simulated. */
  signalCode: NodeJS.Signals | null;
  /** Kill flag; the fixtures leave it `false`. */
  killed: boolean;
  /** Process id of the fake child. */
  pid: number;
}
|
|
42
|
+
|
|
43
|
+
/**
 * Test-side shape of the runtime slot handed to `waitForHealth`.
 * NOTE(review): presumably mirrors the manager's internal slot type;
 * that declaration is not fully visible from this file, so confirm
 * the two stay in sync when fields are added.
 */
interface SlotShape {
  // --- identity / addressing ---
  projectId: string;
  /** Port quoted in the timeout / exit error messages the tests match. */
  agentPort: number;
  apiServerPort: number;

  // --- process state ---
  status: string;
  proc: FakeProc;
  pid: number;

  // --- timestamps (epoch milliseconds, seeded from Date.now() by makeSlot) ---
  startedAt: number;
  /** Tests bump or pre-age this to open or close the stdout progress window. */
  lastStdoutAt: number;
  lastUsedAt: number;

  // --- restart bookkeeping ---
  restarts: number;
  consecutiveFailures: number;
  lastFailureAt: number;

  // --- timers / pending work: always null in these fixtures ---
  graceTimer: null;
  restartTimer: null;
  idleTimer: null;
  spawnConfig: unknown;
  startPromise: null;
}
|
|
62
|
+
|
|
63
|
+
/**
 * Builds a slot fixture in the `'starting'` state with a live fake child.
 *
 * All timestamps are seeded from a single `Date.now()` reading, so stdout
 * counts as fresh by default. Any field can be replaced via `overrides`,
 * which are applied last and win over the defaults.
 *
 * @param overrides Partial slot whose fields replace the defaults.
 * @returns A fully-populated slot.
 */
function makeSlot(overrides: Partial<SlotShape> = {}): SlotShape {
  const createdAt = Date.now();
  const fakeChild: FakeProc = { exitCode: null, signalCode: null, killed: false, pid: 12345 };

  const defaults: SlotShape = {
    projectId: 'proj-test',
    agentPort: 41234,
    apiServerPort: 41235,
    status: 'starting',
    proc: fakeChild,
    pid: fakeChild.pid,
    startedAt: createdAt,
    lastStdoutAt: createdAt,
    lastUsedAt: createdAt,
    restarts: 0,
    consecutiveFailures: 0,
    lastFailureAt: 0,
    graceTimer: null,
    restartTimer: null,
    idleTimer: null,
    spawnConfig: {},
    startPromise: null,
  };

  return { ...defaults, ...overrides };
}
|
|
92
|
+
|
|
93
|
+
/** Logger whose methods all do nothing, keeping manager output out of the test run. */
const SILENT = {
  log() {},
  warn() {},
  error() {},
} as const;
|
|
94
|
+
|
|
95
|
+
/**
 * Invokes the manager's private `waitForHealth` from test code.
 *
 * The manager is re-typed through `unknown` to a structural type that
 * exposes the method; the call stays a method call on the same object,
 * so `this` inside `waitForHealth` is still the manager.
 *
 * @param mgr Manager instance under test.
 * @param slot Slot fixture to wait on.
 * @param timeoutMs Timeout forwarded to `waitForHealth`.
 * @returns The promise returned by `waitForHealth`.
 */
function callWaitForHealth(
  mgr: WorkerRuntimeManager,
  slot: SlotShape,
  timeoutMs: number,
): Promise<void> {
  type WithWaitForHealth = {
    waitForHealth: (s: SlotShape, t: number) => Promise<void>;
  };
  const exposed = mgr as unknown as WithWaitForHealth;
  return exposed.waitForHealth(slot, timeoutMs);
}
|
|
104
|
+
|
|
105
|
+
/**
 * Overwrites the manager's `tcpProbe` with a stub so no real socket is opened.
 *
 * @param mgr Manager instance whose probe is replaced.
 * @param listening Fixed probe result, or a callback evaluated on every
 *   probe call to produce the result.
 */
function stubTcpProbe(mgr: WorkerRuntimeManager, listening: boolean | (() => boolean)): void {
  const patchable = mgr as unknown as { tcpProbe: (port: number) => Promise<boolean> };
  patchable.tcpProbe = async () => {
    if (typeof listening === 'function') {
      return listening();
    }
    return listening;
  };
}
|
|
109
|
+
|
|
110
|
+
/** Real `fetch`, captured at module load so every test can restore it after stubbing. */
const ORIGINAL_FETCH = globalThis.fetch;

describe('WorkerRuntimeManager.waitForHealth (private)', () => {
  let mgr: WorkerRuntimeManager;

  /** Replaces global `fetch` with a stub that rejects every call, so /health never succeeds. */
  const stubFetchRefused = (): void => {
    globalThis.fetch = (async () => {
      throw new Error('ECONNREFUSED');
    }) as typeof fetch;
  };

  beforeEach(() => {
    mgr = new WorkerRuntimeManager({ logger: SILENT });
  });

  afterEach(() => {
    globalThis.fetch = ORIGINAL_FETCH;
  });

  it('returns immediately when /health responds 200', async () => {
    globalThis.fetch = (async () => new Response('ok', { status: 200 })) as typeof fetch;
    // Even with TCP probe returning false the HTTP fast path wins.
    stubTcpProbe(mgr, false);

    const slot = makeSlot();
    const startedAt = Date.now();
    await callWaitForHealth(mgr, slot, 5_000);
    expect(Date.now() - startedAt).toBeLessThan(2_000);
  });

  it(
    'accepts TCP-listening + recent stdout activity as ready when /health keeps failing',
    async () => {
      // Reproduces the Windows `--conditions=development` cold-boot:
      // Bun.serve has bound the port (kernel-level listener up) and
      // the child is still emitting log lines (LSP-TS init,
      // preview-manager spawn, vite build), but /health is starved
      // because the event loop is busy JIT-compiling. The wait
      // should accept this as ready instead of timing out for 30s
      // and SIGTERM'ing a still-booting process.
      stubFetchRefused();
      stubTcpProbe(mgr, true);

      const slot = makeSlot();
      // Keep stdout "fresh" so the progress window stays open.
      const bumper = setInterval(() => {
        slot.lastStdoutAt = Date.now();
      }, 200);

      const startedAt = Date.now();
      try {
        await callWaitForHealth(mgr, slot, 30_000);
      } finally {
        clearInterval(bumper);
      }
      const elapsed = Date.now() - startedAt;
      // Should return within the first iteration or two — TCP probe
      // succeeds + stdout fresh = ready. Hard cap well below the 30s
      // timeout to catch a regression to the legacy 30s spin.
      expect(elapsed).toBeLessThan(5_000);
    },
    15_000,
  );

  it(
    'does NOT accept TCP-listening alone when stdout is silent past the progress window',
    async () => {
      // The silent-but-bound case mimics a process that bound the
      // port and then wedged (infinite loop in top-level code, native
      // crash mid-init that didn't propagate to the parent). The
      // progress window correctly refuses to short-circuit here so
      // the restart loop has a chance to recover.
      stubFetchRefused();
      stubTcpProbe(mgr, true);

      const slot = makeSlot({
        // Pre-age the stdout timestamp so the window is already
        // closed before the first iteration.
        lastStdoutAt: Date.now() - 60_000,
      });

      await expect(callWaitForHealth(mgr, slot, 1_200)).rejects.toThrow(
        /Timeout waiting for agent-runtime \/health on port 41234.*tcpListening=true/s,
      );
    },
    5_000,
  );

  it('times out cleanly when neither TCP nor HTTP ever come up', async () => {
    stubFetchRefused();
    stubTcpProbe(mgr, false);

    const slot = makeSlot();
    await expect(callWaitForHealth(mgr, slot, 1_200)).rejects.toThrow(
      /Timeout waiting for agent-runtime \/health on port 41234.*tcpListening=false/s,
    );
  });

  it('short-circuits with exit info when the child dies mid-wait', async () => {
    // Without the short-circuit the wait would spin for the full
    // 30s and surface a generic timeout — the operator's first clue
    // to "the process crashed during boot" should be a clear exit
    // code, not the same boilerplate error as a network failure.
    stubFetchRefused();
    stubTcpProbe(mgr, false);

    const slot = makeSlot();
    const exitTimer = setTimeout(() => {
      slot.proc.exitCode = 1;
    }, 400);

    const startedAt = Date.now();
    try {
      await expect(callWaitForHealth(mgr, slot, 30_000)).rejects.toThrow(
        /agent-runtime exited \(code=1, signal=null\) before becoming healthy on port 41234/,
      );
    } finally {
      // Don't leave the timer pending if the wait settled before it fired.
      clearTimeout(exitTimer);
    }
    // 2.5s is a comfortable ceiling: the inner setTimeout for exitCode
    // fires at 400ms and the wait should observe it within one
    // HEALTH_POLL_MS (500ms) tick.
    expect(Date.now() - startedAt).toBeLessThan(2_500);
  });

  it('detects SIGKILL-style deaths where exitCode stays null', async () => {
    // Mirrors the jetsam-on-macOS case and the `cleanupStaleProcesses`
    // SIGKILL of a previous run that the manager itself fired —
    // exitCode never gets a number, only signalCode flips.
    stubFetchRefused();
    stubTcpProbe(mgr, false);

    const slot = makeSlot();
    const killTimer = setTimeout(() => {
      slot.proc.signalCode = 'SIGKILL';
    }, 400);

    try {
      await expect(callWaitForHealth(mgr, slot, 30_000)).rejects.toThrow(
        /agent-runtime exited \(code=null, signal=SIGKILL\) before becoming healthy on port 41234/,
      );
    } finally {
      // Don't leave the timer pending if the wait settled before it fired.
      clearTimeout(killTimer);
    }
  });
});
|
|
@@ -34,6 +34,7 @@
|
|
|
34
34
|
import { type ChildProcess, spawn } from 'node:child_process';
|
|
35
35
|
import { createHmac, randomBytes } from 'node:crypto';
|
|
36
36
|
import { existsSync, mkdirSync, readdirSync } from 'node:fs';
|
|
37
|
+
import { createConnection } from 'node:net';
|
|
37
38
|
import { tmpdir } from 'node:os';
|
|
38
39
|
import { dirname, join } from 'node:path';
|
|
39
40
|
import { resolveRuntime, type ResolvedRuntime } from './runtime-resolver.ts';
|
|
@@ -84,8 +85,41 @@ const STARTUP_GRACE_MS = 60_000;
|
|
|
84
85
|
|
|
85
86
|
/** Health check poll interval while waiting for /health. */
|
|
86
87
|
const HEALTH_POLL_MS = 500;
|
|
87
|
-
/**
|
|
88
|
-
|
|
88
|
+
/**
 * Hard upper bound on how long a freshly-spawned agent-runtime may take
 * to clear the boot gate, whether via a 200 from /health or via the
 * TCP-listening + stdout-progress fallback.
 *
 * Previously 30s. The Windows `--conditions=development` cold-boot
 * investigation showed Bun routinely spending 30-45s JIT-compiling the
 * agent-runtime's TS dep graph (shared-runtime + generators + tools +
 * hooks + gateway) on a cold workspace, which left no budget for
 * /health to answer. 60s leaves room for that slow path; a truly-hung
 * process is still caught independently by the progress detector.
 */
const HEALTH_BOOT_TIMEOUT_MS = 60 * 1_000;

/**
 * Freshness window for the stdout-progress readiness signal. When the
 * child's TCP listener is up AND it has emitted output within this
 * window, it is accepted as ready even though /health has not answered.
 *
 * This models "is the process making forward progress?": a Bun runtime
 * in the middle of `optimizeDeps` / LSP-spawn / hook registration does
 * not answer HTTP but keeps printing `[LSP-TS]`, `[preview-manager]`,
 * `[AgentGateway]` etc. lines as it walks its boot sequence. A
 * truly-hung child (event loop wedged, zero log output) stays silent
 * past this window and falls through to the normal /health timeout +
 * SIGTERM + restart loop.
 */
const STDOUT_PROGRESS_WINDOW_MS = 10 * 1_000;

/**
 * Budget for a single TCP connect attempt made by the kernel-level
 * readiness probe. A loopback connect only needs a SYN/SYN-ACK
 * roundtrip, so anything slower than ~50ms means the listener is not
 * bound yet (the agent-runtime has not reached the Bun.serve()
 * default-export evaluation). 500ms sits comfortably above the noise
 * floor without stretching probe iterations.
 */
const TCP_CONNECT_TIMEOUT_MS = 500;
|
|
89
123
|
|
|
90
124
|
export type RuntimeStatus =
|
|
91
125
|
| 'starting'
|
|
@@ -267,6 +301,18 @@ interface InternalRuntime {
|
|
|
267
301
|
*/
|
|
268
302
|
pid: number | null;
|
|
269
303
|
startedAt: number;
|
|
304
|
+
/**
|
|
305
|
+
* Wall-clock timestamp of the most recent stdout/stderr line emitted
|
|
306
|
+
* by `proc`. Read by {@link WorkerRuntimeManager.waitForHealth} as a
|
|
307
|
+
* "process is making forward progress" signal that lets a slow boot
|
|
308
|
+
* clear the readiness gate even when /health hasn't responded yet.
|
|
309
|
+
* See the {@link STDOUT_PROGRESS_WINDOW_MS} doc and the comment block
|
|
310
|
+
* inside `waitForHealth` for the full rationale (Windows
|
|
311
|
+
* `--conditions=development` cold-boot pattern where Bun
|
|
312
|
+
* JIT-compiles the entire TS dep graph on first request and the
|
|
313
|
+
* 30s HTTP /health budget vanishes before the first response).
|
|
314
|
+
*/
|
|
315
|
+
lastStdoutAt: number;
|
|
270
316
|
lastUsedAt: number;
|
|
271
317
|
restarts: number;
|
|
272
318
|
/** Consecutive non-clean exits since the last healthy run. */
|
|
@@ -908,6 +954,7 @@ export class WorkerRuntimeManager implements RuntimeResolver {
|
|
|
908
954
|
proc: null,
|
|
909
955
|
pid: null,
|
|
910
956
|
startedAt: 0,
|
|
957
|
+
lastStdoutAt: 0,
|
|
911
958
|
lastUsedAt: Date.now(),
|
|
912
959
|
restarts: 0,
|
|
913
960
|
consecutiveFailures: 0,
|
|
@@ -973,6 +1020,11 @@ export class WorkerRuntimeManager implements RuntimeResolver {
|
|
|
973
1020
|
slot.pid = proc.pid ?? null;
|
|
974
1021
|
slot.status = 'starting';
|
|
975
1022
|
slot.startedAt = Date.now();
|
|
1023
|
+
// Seed the progress timestamp at spawn so a child that emits its
|
|
1024
|
+
// first line within `STDOUT_PROGRESS_WINDOW_MS` is treated as
|
|
1025
|
+
// "making progress since spawn" without the readiness check having
|
|
1026
|
+
// to special-case the cold-start gap.
|
|
1027
|
+
slot.lastStdoutAt = slot.startedAt;
|
|
976
1028
|
|
|
977
1029
|
proc.on('error', (err) => {
|
|
978
1030
|
slot.lastError = err?.message ?? String(err);
|
|
@@ -984,19 +1036,25 @@ export class WorkerRuntimeManager implements RuntimeResolver {
|
|
|
984
1036
|
});
|
|
985
1037
|
|
|
986
1038
|
const prefix = `[runtime:${slot.projectId.slice(0, 8)}]`;
|
|
1039
|
+
// Each output line bumps `lastStdoutAt` — used by waitForHealth as
|
|
1040
|
+
// a forward-progress signal so a long-but-still-booting child
|
|
1041
|
+
// (LSP spawn, optimizeDeps, hook registration) doesn't get
|
|
1042
|
+
// SIGTERM'd mid-boot just because /health hasn't responded yet.
|
|
987
1043
|
proc.stdout?.on('data', (data) => {
|
|
1044
|
+
slot.lastStdoutAt = Date.now();
|
|
988
1045
|
for (const line of data.toString().trimEnd().split('\n')) {
|
|
989
1046
|
if (line) this.log.log(`${prefix} ${line}`);
|
|
990
1047
|
}
|
|
991
1048
|
});
|
|
992
1049
|
proc.stderr?.on('data', (data) => {
|
|
1050
|
+
slot.lastStdoutAt = Date.now();
|
|
993
1051
|
for (const line of data.toString().trimEnd().split('\n')) {
|
|
994
1052
|
if (line) this.log.error(`${prefix} ${line}`);
|
|
995
1053
|
}
|
|
996
1054
|
});
|
|
997
1055
|
|
|
998
1056
|
try {
|
|
999
|
-
await this.waitForHealth(slot
|
|
1057
|
+
await this.waitForHealth(slot, HEALTH_BOOT_TIMEOUT_MS);
|
|
1000
1058
|
slot.status = 'running';
|
|
1001
1059
|
slot.lastUsedAt = Date.now();
|
|
1002
1060
|
this.armIdleTimer(slot);
|
|
@@ -1288,18 +1346,101 @@ export class WorkerRuntimeManager implements RuntimeResolver {
|
|
|
1288
1346
|
}
|
|
1289
1347
|
}
|
|
1290
1348
|
|
|
1291
|
-
|
|
1292
|
-
|
|
1293
|
-
|
|
1294
|
-
|
|
1295
|
-
|
|
1296
|
-
|
|
1349
|
+
/**
 * Kernel-level TCP-accept probe. Resolves `true` if a TCP connection
 * to `127.0.0.1:port` succeeds within {@link TCP_CONNECT_TIMEOUT_MS},
 * `false` for any failure mode (refused, timed out, host unreachable,
 * or the socket could not even be created). The returned promise never
 * rejects.
 *
 * This is intentionally distinct from {@link isPortListening} (which
 * fires an HTTP HEAD): we want to know whether `Bun.serve()` has
 * bound the socket, NOT whether its request handler is responding.
 * Once the kernel has the listener, the agent-runtime's
 * `export default { port, fetch }` has been evaluated — which is
 * sufficient evidence to clear the boot gate even if the event loop
 * is still busy JIT-compiling the rest of the TS dep graph. Without
 * this distinction every `--conditions=development` cold boot on
 * Windows hits the HEALTH_BOOT_TIMEOUT_MS ceiling because /health
 * is the very last thing the saturated event loop gets to.
 *
 * @param port Loopback port to probe.
 * @returns `true` when the connect succeeded, otherwise `false`.
 */
private tcpProbe(port: number): Promise<boolean> {
  return new Promise<boolean>((resolve) => {
    let socket: ReturnType<typeof createConnection> | null = null;
    let settled = false;

    // Settle exactly once and always release the socket, whichever of
    // connect / error / timeout wins the race.
    const finish = (ok: boolean): void => {
      if (settled) return;
      settled = true;
      try {
        socket?.destroy();
      } catch {
        /* socket may already be torn down */
      }
      resolve(ok);
    };

    try {
      socket = createConnection({ host: '127.0.0.1', port });
      socket.setTimeout(TCP_CONNECT_TIMEOUT_MS);
      socket.once('connect', () => finish(true));
      socket.once('error', () => finish(false));
      socket.once('timeout', () => finish(false));
    } catch {
      // Socket setup can throw synchronously (e.g. an out-of-range
      // port). Inside a promise executor that would reject and escape
      // the caller's polling loop, so report it as "not listening"
      // like every other failure mode.
      finish(false);
    }
  });
}
|
|
1385
|
+
|
|
1386
|
+
/**
|
|
1387
|
+
* Wait for a freshly-spawned agent-runtime to clear the boot gate.
|
|
1388
|
+
*
|
|
1389
|
+
* Three independent readiness signals, in priority order:
|
|
1390
|
+
*
|
|
1391
|
+
* 1. **HTTP /health returns 2xx.** Happy path — the runtime is
|
|
1392
|
+
* fully booted and its event loop is responsive. We return
|
|
1393
|
+
* immediately and the caller transitions the slot to
|
|
1394
|
+
* `'running'`.
|
|
1395
|
+
*
|
|
1396
|
+
* 2. **TCP listener bound + recent stdout activity.** The kernel
|
|
1397
|
+
* has the port (so `Bun.serve()` evaluated its default export)
|
|
1398
|
+
* AND the child has printed a log line in the last
|
|
1399
|
+
* {@link STDOUT_PROGRESS_WINDOW_MS}. The process is alive,
|
|
1400
|
+
* bound, and making forward progress — just slow because Bun
|
|
1401
|
+
* is JIT-compiling the rest of the TS dep graph (the
|
|
1402
|
+
* `--conditions=development` cold-boot pattern) and /health
|
|
1403
|
+
* hasn't gotten a turn on the event loop yet. Accepting this
|
|
1404
|
+
* as ready unblocks the AgentProxy retry storm that would
|
|
1405
|
+
* otherwise compound the saturation. Once the child starts
|
|
1406
|
+
* handling requests for real, /health will respond and the
|
|
1407
|
+
* AgentProxy retries will succeed independently of this gate.
|
|
1408
|
+
*
|
|
1409
|
+
* 3. **Hard timeout at `timeoutMs`.** Neither HTTP nor the
|
|
1410
|
+
* TCP-plus-progress fallback has cleared. Throw the
|
|
1411
|
+
* descriptive "Timeout waiting for agent-runtime /health" so
|
|
1412
|
+
* the restart-with-backoff loop in `handleExit()` SIGTERMs the
|
|
1413
|
+
* child and respawns.
|
|
1414
|
+
*
|
|
1415
|
+
* Process-death short-circuit applies throughout: if the child exits
|
|
1416
|
+
* mid-wait (e.g. spawn-time port conflict, missing native binding),
|
|
1417
|
+
* we throw immediately with the exit code/signal rather than spinning
|
|
1418
|
+
* for the full `timeoutMs`.
|
|
1419
|
+
*/
|
|
1420
|
+
private async waitForHealth(slot: InternalRuntime, timeoutMs: number): Promise<void> {
|
|
1421
|
+
const port = slot.agentPort;
|
|
1422
|
+
const proc = slot.proc;
|
|
1423
|
+
if (!proc) {
|
|
1424
|
+
throw new Error(`waitForHealth: slot ${slot.projectId} has no spawned process`);
|
|
1425
|
+
}
|
|
1426
|
+
const startedAt = Date.now();
|
|
1427
|
+
const deadline = startedAt + timeoutMs;
|
|
1428
|
+
let lastError: string | null = null;
|
|
1429
|
+
let lastTcpListening = false;
|
|
1430
|
+
let httpAttempts = 0;
|
|
1431
|
+
let tcpAttempts = 0;
|
|
1432
|
+
let iteration = 0;
|
|
1433
|
+
|
|
1297
1434
|
while (Date.now() < deadline) {
|
|
1435
|
+
iteration++;
|
|
1298
1436
|
if (proc.exitCode !== null || proc.signalCode != null || proc.killed) {
|
|
1299
1437
|
throw new Error(
|
|
1300
1438
|
`agent-runtime exited (code=${proc.exitCode}, signal=${proc.signalCode}) before becoming healthy on port ${port}`,
|
|
1301
1439
|
);
|
|
1302
1440
|
}
|
|
1441
|
+
|
|
1442
|
+
// Signal 1: HTTP /health (happy path).
|
|
1443
|
+
httpAttempts++;
|
|
1303
1444
|
const controller = new AbortController();
|
|
1304
1445
|
const t = setTimeout(() => controller.abort(), 1500);
|
|
1305
1446
|
try {
|
|
@@ -1308,13 +1449,56 @@ export class WorkerRuntimeManager implements RuntimeResolver {
|
|
|
1308
1449
|
signal: controller.signal,
|
|
1309
1450
|
});
|
|
1310
1451
|
clearTimeout(t);
|
|
1311
|
-
if (resp.ok)
|
|
1312
|
-
|
|
1452
|
+
if (resp.ok) {
|
|
1453
|
+
this.log.log(
|
|
1454
|
+
`[WorkerRuntimeManager] /health ready for ${slot.projectId} on port ${port} ` +
|
|
1455
|
+
`(HTTP ${resp.status} after ${Date.now() - startedAt}ms, ${iteration} iter, ${httpAttempts} http)`,
|
|
1456
|
+
);
|
|
1457
|
+
return;
|
|
1458
|
+
}
|
|
1459
|
+
// Non-2xx is recorded but doesn't short-circuit — the runtime
|
|
1460
|
+
// may briefly serve 503 while initializing post-bind, and the
|
|
1461
|
+
// TCP-progress fallback below covers that window too.
|
|
1462
|
+
lastError = `HTTP /health returned ${resp.status}`;
|
|
1463
|
+
} catch (err: any) {
|
|
1313
1464
|
clearTimeout(t);
|
|
1465
|
+
const name = err?.name ?? 'Error';
|
|
1466
|
+
const code = err?.code ?? err?.cause?.code;
|
|
1467
|
+
lastError = `HTTP /health failed: ${name}${code ? `(${code})` : ''}: ${err?.message ?? err}`;
|
|
1314
1468
|
}
|
|
1469
|
+
|
|
1470
|
+
// Signal 2: TCP listener bound + recent stdout activity.
|
|
1471
|
+
tcpAttempts++;
|
|
1472
|
+
lastTcpListening = await this.tcpProbe(port);
|
|
1473
|
+
if (lastTcpListening) {
|
|
1474
|
+
const sinceStdoutMs = Date.now() - slot.lastStdoutAt;
|
|
1475
|
+
if (sinceStdoutMs < STDOUT_PROGRESS_WINDOW_MS) {
|
|
1476
|
+
this.log.log(
|
|
1477
|
+
`[WorkerRuntimeManager] TCP-listening + stdout-active for ${slot.projectId} on ` +
|
|
1478
|
+
`port ${port} (last stdout ${sinceStdoutMs}ms ago, ` +
|
|
1479
|
+
`${Date.now() - startedAt}ms since spawn, ${httpAttempts} http, ${tcpAttempts} tcp). ` +
|
|
1480
|
+
`Accepting as ready — /health still warming up but the child is bound and making progress.`,
|
|
1481
|
+
);
|
|
1482
|
+
return;
|
|
1483
|
+
}
|
|
1484
|
+
lastError =
|
|
1485
|
+
`TCP listening but stdout silent for ${sinceStdoutMs}ms ` +
|
|
1486
|
+
`(> ${STDOUT_PROGRESS_WINDOW_MS}ms window); last http: ${lastError ?? 'n/a'}`;
|
|
1487
|
+
}
|
|
1488
|
+
|
|
1315
1489
|
await new Promise((r) => setTimeout(r, HEALTH_POLL_MS));
|
|
1316
1490
|
}
|
|
1317
|
-
|
|
1491
|
+
|
|
1492
|
+
if (proc.exitCode !== null || proc.signalCode != null || proc.killed) {
|
|
1493
|
+
throw new Error(
|
|
1494
|
+
`agent-runtime exited (code=${proc.exitCode}, signal=${proc.signalCode}) before becoming healthy on port ${port}`,
|
|
1495
|
+
);
|
|
1496
|
+
}
|
|
1497
|
+
throw new Error(
|
|
1498
|
+
`Timeout waiting for agent-runtime /health on port ${port} ` +
|
|
1499
|
+
`after ${iteration} iter (httpAttempts=${httpAttempts}, tcpAttempts=${tcpAttempts}, ` +
|
|
1500
|
+
`tcpListening=${lastTcpListening}, lastError=${lastError ?? 'n/a'})`,
|
|
1501
|
+
);
|
|
1318
1502
|
}
|
|
1319
1503
|
|
|
1320
1504
|
private async waitForExit(proc: ChildProcess, timeoutMs: number): Promise<void> {
|