@shogo-ai/worker 1.8.8 → 1.8.10

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
package/package.json CHANGED
@@ -1,6 +1,6 @@
1
1
  {
2
2
  "name": "@shogo-ai/worker",
3
- "version": "1.8.8",
3
+ "version": "1.8.10",
4
4
  "description": "Shogo Cloud Agent Worker — run Shogo agents on your own machine (laptop, devbox, CI).",
5
5
  "license": "MIT",
6
6
  "author": "Shogo Technologies, Inc.",
@@ -0,0 +1,251 @@
1
+ // SPDX-License-Identifier: MIT
2
+ // Copyright (C) 2026 Shogo Technologies, Inc.
3
+ /**
4
+ * Pins {@link WorkerRuntimeManager.waitForHealth} against the three
5
+ * readiness signals it now considers and the diagnostic behavior on
6
+ * timeout. The historical bug this protects against: agent-runtime
7
+ * cold boots on Windows with `bun --conditions=development run
8
+ * packages/agent-runtime/src/server.ts` routinely take 30-45s just
9
+ * to JIT-compile the TS dep graph (shared-runtime + generators +
10
+ * tools + hooks + gateway). The kernel-level TCP listener is up
11
+ * within ~1-2s of spawn (the moment `Bun.serve()` reads its default
12
+ * export), but the event loop is still saturated for tens of seconds
13
+ * after that, so the old HTTP-only /health gate timed out, SIGTERM'd
14
+ * the still-booting child, and the restart-loop chewed through
15
+ * `MAX_CONSECUTIVE_RESTARTS` before giving up — leaving chat stuck
16
+ * with "Connection timed out — The agent runtime could not be
17
+ * reached".
18
+ *
19
+ * The contracts pinned here:
20
+ *
21
+ * 1. HTTP /health 200 returns immediately — the happy path is
22
+ * unchanged for healthy runtimes.
23
+ * 2. TCP-listening + recent stdout activity returns successfully
24
+ * even when /health never responds — handles the cold-boot
25
+ * saturation case.
26
+ * 3. TCP-listening but stdout silent past STDOUT_PROGRESS_WINDOW_MS
27
+ * does NOT short-circuit — a truly-wedged process still hits
28
+ * the hard timeout.
29
+ * 4. Process death (exitCode/signalCode/killed) short-circuits
30
+ * immediately with the exit info, instead of waiting the full
31
+ * timeout window.
32
+ */
33
+ import { afterEach, beforeEach, describe, expect, it } from 'bun:test';
34
+ import { WorkerRuntimeManager } from '../runtime-manager.ts';
35
+
36
+ interface FakeProc {
37
+ exitCode: number | null;
38
+ signalCode: NodeJS.Signals | null;
39
+ killed: boolean;
40
+ pid: number;
41
+ }
42
+
43
+ interface SlotShape {
44
+ projectId: string;
45
+ agentPort: number;
46
+ apiServerPort: number;
47
+ status: string;
48
+ proc: FakeProc;
49
+ pid: number;
50
+ startedAt: number;
51
+ lastStdoutAt: number;
52
+ lastUsedAt: number;
53
+ restarts: number;
54
+ consecutiveFailures: number;
55
+ lastFailureAt: number;
56
+ graceTimer: null;
57
+ restartTimer: null;
58
+ idleTimer: null;
59
+ spawnConfig: unknown;
60
+ startPromise: null;
61
+ }
62
+
63
+ function makeSlot(overrides: Partial<SlotShape> = {}): SlotShape {
64
+ const now = Date.now();
65
+ const proc: FakeProc = {
66
+ exitCode: null,
67
+ signalCode: null,
68
+ killed: false,
69
+ pid: 12345,
70
+ };
71
+ return {
72
+ projectId: 'proj-test',
73
+ agentPort: 41234,
74
+ apiServerPort: 41235,
75
+ status: 'starting',
76
+ proc,
77
+ pid: proc.pid,
78
+ startedAt: now,
79
+ lastStdoutAt: now,
80
+ lastUsedAt: now,
81
+ restarts: 0,
82
+ consecutiveFailures: 0,
83
+ lastFailureAt: 0,
84
+ graceTimer: null,
85
+ restartTimer: null,
86
+ idleTimer: null,
87
+ spawnConfig: {},
88
+ startPromise: null,
89
+ ...overrides,
90
+ };
91
+ }
92
+
93
+ const SILENT = { log: () => {}, warn: () => {}, error: () => {} } as const;
94
+
95
+ function callWaitForHealth(
96
+ mgr: WorkerRuntimeManager,
97
+ slot: SlotShape,
98
+ timeoutMs: number,
99
+ ): Promise<void> {
100
+ return (mgr as unknown as {
101
+ waitForHealth: (s: SlotShape, t: number) => Promise<void>;
102
+ }).waitForHealth(slot, timeoutMs);
103
+ }
104
+
105
+ function stubTcpProbe(mgr: WorkerRuntimeManager, listening: boolean | (() => boolean)): void {
106
+ (mgr as unknown as { tcpProbe: (port: number) => Promise<boolean> }).tcpProbe = async () =>
107
+ typeof listening === 'function' ? listening() : listening;
108
+ }
109
+
110
+ const ORIGINAL_FETCH = globalThis.fetch;
111
+
112
+ describe('WorkerRuntimeManager.waitForHealth (private)', () => {
113
+ let mgr: WorkerRuntimeManager;
114
+
115
+ beforeEach(() => {
116
+ mgr = new WorkerRuntimeManager({ logger: SILENT });
117
+ });
118
+
119
+ afterEach(() => {
120
+ globalThis.fetch = ORIGINAL_FETCH;
121
+ });
122
+
123
+ it('returns immediately when /health responds 200', async () => {
124
+ globalThis.fetch = (async () => new Response('ok', { status: 200 })) as typeof fetch;
125
+ // Even with TCP probe returning false the HTTP fast path wins.
126
+ stubTcpProbe(mgr, false);
127
+
128
+ const slot = makeSlot();
129
+ const startedAt = Date.now();
130
+ await callWaitForHealth(mgr, slot, 5_000);
131
+ expect(Date.now() - startedAt).toBeLessThan(2_000);
132
+ });
133
+
134
+ it(
135
+ 'accepts TCP-listening + recent stdout activity as ready when /health keeps failing',
136
+ async () => {
137
+ // Reproduces the Windows `--conditions=development` cold-boot:
138
+ // Bun.serve has bound the port (kernel-level listener up) and
139
+ // the child is still emitting log lines (LSP-TS init,
140
+ // preview-manager spawn, vite build), but /health is starved
141
+ // because the event loop is busy JIT-compiling. The wait
142
+ // should accept this as ready instead of timing out for 30s
143
+ // and SIGTERM'ing a still-booting process.
144
+ globalThis.fetch = (async () => {
145
+ throw new Error('ECONNREFUSED');
146
+ }) as typeof fetch;
147
+ stubTcpProbe(mgr, true);
148
+
149
+ const slot = makeSlot();
150
+ // Keep stdout "fresh" so the progress window stays open.
151
+ const bumper = setInterval(() => {
152
+ slot.lastStdoutAt = Date.now();
153
+ }, 200);
154
+
155
+ const startedAt = Date.now();
156
+ try {
157
+ await callWaitForHealth(mgr, slot, 30_000);
158
+ } finally {
159
+ clearInterval(bumper);
160
+ }
161
+ const elapsed = Date.now() - startedAt;
162
+ // Should return within the first iteration or two — TCP probe
163
+ // succeeds + stdout fresh = ready. Hard cap well below the 30s
164
+ // timeout to catch a regression to the legacy 30s spin.
165
+ expect(elapsed).toBeLessThan(5_000);
166
+ },
167
+ 15_000,
168
+ );
169
+
170
+ it(
171
+ 'does NOT accept TCP-listening alone when stdout is silent past the progress window',
172
+ async () => {
173
+ // The silent-but-bound case mimics a process that bound the
174
+ // port and then wedged (infinite loop in top-level code, native
175
+ // crash mid-init that didn't propagate to the parent). The
176
+ // progress window correctly refuses to short-circuit here so
177
+ // the restart loop has a chance to recover.
178
+ globalThis.fetch = (async () => {
179
+ throw new Error('ECONNREFUSED');
180
+ }) as typeof fetch;
181
+ stubTcpProbe(mgr, true);
182
+
183
+ const slot = makeSlot({
184
+ // Pre-age the stdout timestamp so the window is already
185
+ // closed before the first iteration.
186
+ lastStdoutAt: Date.now() - 60_000,
187
+ });
188
+
189
+ await expect(callWaitForHealth(mgr, slot, 1_200)).rejects.toThrow(
190
+ /Timeout waiting for agent-runtime \/health on port 41234.*tcpListening=true/s,
191
+ );
192
+ },
193
+ 5_000,
194
+ );
195
+
196
+ it('times out cleanly when neither TCP nor HTTP ever come up', async () => {
197
+ globalThis.fetch = (async () => {
198
+ throw new Error('ECONNREFUSED');
199
+ }) as typeof fetch;
200
+ stubTcpProbe(mgr, false);
201
+
202
+ const slot = makeSlot();
203
+ await expect(callWaitForHealth(mgr, slot, 1_200)).rejects.toThrow(
204
+ /Timeout waiting for agent-runtime \/health on port 41234.*tcpListening=false/s,
205
+ );
206
+ });
207
+
208
+ it('short-circuits with exit info when the child dies mid-wait', async () => {
209
+ // Without the short-circuit the wait would spin for the full
210
+ // 30s and surface a generic timeout — the operator's first clue
211
+ // to "the process crashed during boot" should be a clear exit
212
+ // code, not the same boilerplate error as a network failure.
213
+ globalThis.fetch = (async () => {
214
+ throw new Error('ECONNREFUSED');
215
+ }) as typeof fetch;
216
+ stubTcpProbe(mgr, false);
217
+
218
+ const slot = makeSlot();
219
+ setTimeout(() => {
220
+ slot.proc.exitCode = 1;
221
+ }, 400);
222
+
223
+ const startedAt = Date.now();
224
+ await expect(callWaitForHealth(mgr, slot, 30_000)).rejects.toThrow(
225
+ /agent-runtime exited \(code=1, signal=null\) before becoming healthy on port 41234/,
226
+ );
227
+ // 2.5s is a comfortable ceiling: the inner setTimeout for exitCode
228
+ // fires at 400ms and the wait should observe it within one
229
+ // HEALTH_POLL_MS (500ms) tick.
230
+ expect(Date.now() - startedAt).toBeLessThan(2_500);
231
+ });
232
+
233
+ it('detects SIGKILL-style deaths where exitCode stays null', async () => {
234
+ // Mirrors the jetsam-on-macOS case and the `cleanupStaleProcesses`
235
+ // SIGKILL of a previous run that the manager itself fired —
236
+ // exitCode never gets a number, only signalCode flips.
237
+ globalThis.fetch = (async () => {
238
+ throw new Error('ECONNREFUSED');
239
+ }) as typeof fetch;
240
+ stubTcpProbe(mgr, false);
241
+
242
+ const slot = makeSlot();
243
+ setTimeout(() => {
244
+ slot.proc.signalCode = 'SIGKILL';
245
+ }, 400);
246
+
247
+ await expect(callWaitForHealth(mgr, slot, 30_000)).rejects.toThrow(
248
+ /agent-runtime exited \(code=null, signal=SIGKILL\) before becoming healthy on port 41234/,
249
+ );
250
+ });
251
+ });
@@ -34,6 +34,7 @@
34
34
  import { type ChildProcess, spawn } from 'node:child_process';
35
35
  import { createHmac, randomBytes } from 'node:crypto';
36
36
  import { existsSync, mkdirSync, readdirSync } from 'node:fs';
37
+ import { createConnection } from 'node:net';
37
38
  import { tmpdir } from 'node:os';
38
39
  import { dirname, join } from 'node:path';
39
40
  import { resolveRuntime, type ResolvedRuntime } from './runtime-resolver.ts';
@@ -84,8 +85,41 @@ const STARTUP_GRACE_MS = 60_000;
84
85
 
85
86
  /** Health check poll interval while waiting for /health. */
86
87
  const HEALTH_POLL_MS = 500;
87
- /** Total timeout waiting for first /health success after spawn. */
88
- const HEALTH_BOOT_TIMEOUT_MS = 30_000;
88
+ /**
89
+ * Absolute ceiling on how long we'll wait for a freshly-spawned
90
+ * agent-runtime to clear the boot gate (either a 200 /health response
91
+ * or the TCP-listening + stdout-progress fallback below). Was 30s
92
+ * before the Windows `--conditions=development` cold-boot
93
+ * investigation surfaced that Bun routinely takes 30-45s just to JIT
94
+ * the agent-runtime's TS dep graph (shared-runtime + generators +
95
+ * tools + hooks + gateway) on a cold workspace, leaving zero budget
96
+ * for /health to actually respond. 60s gives the slow path room
97
+ * without giving a truly-hung process a free pass — the progress
98
+ * detector below catches that case independently.
99
+ */
100
+ const HEALTH_BOOT_TIMEOUT_MS = 60_000;
101
+ /**
102
+ * If the child's TCP listener is up AND its stdout has emitted a line
103
+ * within this window, we accept it as ready even though /health
104
+ * hasn't returned yet. Models "is the process making forward
105
+ * progress?" — a Bun runtime mid `optimizeDeps` / LSP-spawn / hook
106
+ * registration is unresponsive to HTTP but still emits `[LSP-TS]`,
107
+ * `[preview-manager]`, `[AgentGateway]` etc. log lines as it walks
108
+ * its boot sequence, so stdout activity is a reliable liveness
109
+ * signal. A truly-hung child (event-loop wedged, zero log output)
110
+ * stays silent past this window and falls through to the normal
111
+ * /health timeout + SIGTERM + restart loop.
112
+ */
113
+ const STDOUT_PROGRESS_WINDOW_MS = 10_000;
114
+ /**
115
+ * Per-attempt TCP connect budget for the kernel-level readiness
116
+ * probe. Connect attempts only need a TCP SYN/SYN-ACK roundtrip on
117
+ * loopback — anything past ~50ms means the kernel doesn't have the
118
+ * listener bound yet (the agent-runtime hasn't reached the
119
+ * Bun.serve() default-export evaluation), so 500ms is comfortably
120
+ * above the noise floor without prolonging probe iterations.
121
+ */
122
+ const TCP_CONNECT_TIMEOUT_MS = 500;
89
123
 
90
124
  export type RuntimeStatus =
91
125
  | 'starting'
@@ -267,6 +301,18 @@ interface InternalRuntime {
267
301
  */
268
302
  pid: number | null;
269
303
  startedAt: number;
304
+ /**
305
+ * Wall-clock timestamp of the most recent stdout/stderr line emitted
306
+ * by `proc`. Read by {@link WorkerRuntimeManager.waitForHealth} as a
307
+ * "process is making forward progress" signal that lets a slow boot
308
+ * clear the readiness gate even when /health hasn't responded yet.
309
+ * See the {@link STDOUT_PROGRESS_WINDOW_MS} doc and the comment block
310
+ * inside `waitForHealth` for the full rationale (Windows
311
+ * `--conditions=development` cold-boot pattern where Bun
312
+ * JIT-compiles the entire TS dep graph on first request and the
313
+ * former 30s HTTP /health budget vanished before the first response).
314
+ */
315
+ lastStdoutAt: number;
270
316
  lastUsedAt: number;
271
317
  restarts: number;
272
318
  /** Consecutive non-clean exits since the last healthy run. */
@@ -908,6 +954,7 @@ export class WorkerRuntimeManager implements RuntimeResolver {
908
954
  proc: null,
909
955
  pid: null,
910
956
  startedAt: 0,
957
+ lastStdoutAt: 0,
911
958
  lastUsedAt: Date.now(),
912
959
  restarts: 0,
913
960
  consecutiveFailures: 0,
@@ -973,6 +1020,11 @@ export class WorkerRuntimeManager implements RuntimeResolver {
973
1020
  slot.pid = proc.pid ?? null;
974
1021
  slot.status = 'starting';
975
1022
  slot.startedAt = Date.now();
1023
+ // Seed the progress timestamp at spawn so a child that emits its
1024
+ // first line within `STDOUT_PROGRESS_WINDOW_MS` is treated as
1025
+ // "making progress since spawn" without the readiness check having
1026
+ // to special-case the cold-start gap.
1027
+ slot.lastStdoutAt = slot.startedAt;
976
1028
 
977
1029
  proc.on('error', (err) => {
978
1030
  slot.lastError = err?.message ?? String(err);
@@ -984,19 +1036,25 @@ export class WorkerRuntimeManager implements RuntimeResolver {
984
1036
  });
985
1037
 
986
1038
  const prefix = `[runtime:${slot.projectId.slice(0, 8)}]`;
1039
+ // Each stdout/stderr chunk bumps `lastStdoutAt` — used by waitForHealth as
1040
+ // a forward-progress signal so a long-but-still-booting child
1041
+ // (LSP spawn, optimizeDeps, hook registration) doesn't get
1042
+ // SIGTERM'd mid-boot just because /health hasn't responded yet.
987
1043
  proc.stdout?.on('data', (data) => {
1044
+ slot.lastStdoutAt = Date.now();
988
1045
  for (const line of data.toString().trimEnd().split('\n')) {
989
1046
  if (line) this.log.log(`${prefix} ${line}`);
990
1047
  }
991
1048
  });
992
1049
  proc.stderr?.on('data', (data) => {
1050
+ slot.lastStdoutAt = Date.now();
993
1051
  for (const line of data.toString().trimEnd().split('\n')) {
994
1052
  if (line) this.log.error(`${prefix} ${line}`);
995
1053
  }
996
1054
  });
997
1055
 
998
1056
  try {
999
- await this.waitForHealth(slot.agentPort, slot.proc, HEALTH_BOOT_TIMEOUT_MS);
1057
+ await this.waitForHealth(slot, HEALTH_BOOT_TIMEOUT_MS);
1000
1058
  slot.status = 'running';
1001
1059
  slot.lastUsedAt = Date.now();
1002
1060
  this.armIdleTimer(slot);
@@ -1288,18 +1346,101 @@ export class WorkerRuntimeManager implements RuntimeResolver {
1288
1346
  }
1289
1347
  }
1290
1348
 
1291
- private async waitForHealth(
1292
- port: number,
1293
- proc: ChildProcess,
1294
- timeoutMs: number,
1295
- ): Promise<void> {
1296
- const deadline = Date.now() + timeoutMs;
1349
+ /**
1350
+ * Kernel-level TCP-accept probe. Resolves `true` if a TCP connection
1351
+ * to `127.0.0.1:port` succeeds within {@link TCP_CONNECT_TIMEOUT_MS},
1352
+ * `false` for any failure mode (refused, timed out, host unreachable).
1353
+ *
1354
+ * This is intentionally distinct from {@link isPortListening} (which
1355
+ * fires an HTTP HEAD): we want to know whether `Bun.serve()` has
1356
+ * bound the socket, NOT whether its request handler is responding.
1357
+ * Once the kernel has the listener, the agent-runtime's
1358
+ * `export default { port, fetch }` has been evaluated — which is
1359
+ * sufficient evidence to clear the boot gate even if the event loop
1360
+ * is still busy JIT-compiling the rest of the TS dep graph. Without
1361
+ * this distinction every `--conditions=development` cold boot on
1362
+ * Windows hits the HEALTH_BOOT_TIMEOUT_MS ceiling because /health
1363
+ * is the very last thing the saturated event loop gets to.
1364
+ */
1365
+ private tcpProbe(port: number): Promise<boolean> {
1366
+ return new Promise<boolean>((resolve) => {
1367
+ let settled = false;
1368
+ const finish = (ok: boolean) => {
1369
+ if (settled) return;
1370
+ settled = true;
1371
+ try {
1372
+ socket.destroy();
1373
+ } catch {
1374
+ /* socket may already be torn down */
1375
+ }
1376
+ resolve(ok);
1377
+ };
1378
+ const socket = createConnection({ host: '127.0.0.1', port });
1379
+ socket.setTimeout(TCP_CONNECT_TIMEOUT_MS);
1380
+ socket.once('connect', () => finish(true));
1381
+ socket.once('error', () => finish(false));
1382
+ socket.once('timeout', () => finish(false));
1383
+ });
1384
+ }
1385
+
1386
+ /**
1387
+ * Wait for a freshly-spawned agent-runtime to clear the boot gate.
1388
+ *
1389
+ * Three outcomes, checked in priority order (two readiness signals, then a hard timeout):
1390
+ *
1391
+ * 1. **HTTP /health returns 2xx.** Happy path — the runtime is
1392
+ * fully booted and its event loop is responsive. We return
1393
+ * immediately and the caller transitions the slot to
1394
+ * `'running'`.
1395
+ *
1396
+ * 2. **TCP listener bound + recent stdout activity.** The kernel
1397
+ * has the port (so `Bun.serve()` evaluated its default export)
1398
+ * AND the child has printed a log line in the last
1399
+ * {@link STDOUT_PROGRESS_WINDOW_MS}. The process is alive,
1400
+ * bound, and making forward progress — just slow because Bun
1401
+ * is JIT-compiling the rest of the TS dep graph (the
1402
+ * `--conditions=development` cold-boot pattern) and /health
1403
+ * hasn't gotten a turn on the event loop yet. Accepting this
1404
+ * as ready unblocks the AgentProxy retry storm that would
1405
+ * otherwise compound the saturation. Once the child starts
1406
+ * handling requests for real, /health will respond and the
1407
+ * AgentProxy retries will succeed independently of this gate.
1408
+ *
1409
+ * 3. **Hard timeout at `timeoutMs`.** Neither HTTP nor the
1410
+ * TCP-plus-progress fallback has cleared. Throw the
1411
+ * descriptive "Timeout waiting for agent-runtime /health" so
1412
+ * the restart-with-backoff loop in `handleExit()` SIGTERMs the
1413
+ * child and respawns.
1414
+ *
1415
+ * Process-death short-circuit applies throughout: if the child exits
1416
+ * mid-wait (e.g. spawn-time port conflict, missing native binding),
1417
+ * we throw immediately with the exit code/signal rather than spinning
1418
+ * for the full `timeoutMs`.
1419
+ */
1420
+ private async waitForHealth(slot: InternalRuntime, timeoutMs: number): Promise<void> {
1421
+ const port = slot.agentPort;
1422
+ const proc = slot.proc;
1423
+ if (!proc) {
1424
+ throw new Error(`waitForHealth: slot ${slot.projectId} has no spawned process`);
1425
+ }
1426
+ const startedAt = Date.now();
1427
+ const deadline = startedAt + timeoutMs;
1428
+ let lastError: string | null = null;
1429
+ let lastTcpListening = false;
1430
+ let httpAttempts = 0;
1431
+ let tcpAttempts = 0;
1432
+ let iteration = 0;
1433
+
1297
1434
  while (Date.now() < deadline) {
1435
+ iteration++;
1298
1436
  if (proc.exitCode !== null || proc.signalCode != null || proc.killed) {
1299
1437
  throw new Error(
1300
1438
  `agent-runtime exited (code=${proc.exitCode}, signal=${proc.signalCode}) before becoming healthy on port ${port}`,
1301
1439
  );
1302
1440
  }
1441
+
1442
+ // Signal 1: HTTP /health (happy path).
1443
+ httpAttempts++;
1303
1444
  const controller = new AbortController();
1304
1445
  const t = setTimeout(() => controller.abort(), 1500);
1305
1446
  try {
@@ -1308,13 +1449,56 @@ export class WorkerRuntimeManager implements RuntimeResolver {
1308
1449
  signal: controller.signal,
1309
1450
  });
1310
1451
  clearTimeout(t);
1311
- if (resp.ok) return;
1312
- } catch {
1452
+ if (resp.ok) {
1453
+ this.log.log(
1454
+ `[WorkerRuntimeManager] /health ready for ${slot.projectId} on port ${port} ` +
1455
+ `(HTTP ${resp.status} after ${Date.now() - startedAt}ms, ${iteration} iter, ${httpAttempts} http)`,
1456
+ );
1457
+ return;
1458
+ }
1459
+ // Non-2xx is recorded but doesn't short-circuit — the runtime
1460
+ // may briefly serve 503 while initializing post-bind, and the
1461
+ // TCP-progress fallback below covers that window too.
1462
+ lastError = `HTTP /health returned ${resp.status}`;
1463
+ } catch (err: any) {
1313
1464
  clearTimeout(t);
1465
+ const name = err?.name ?? 'Error';
1466
+ const code = err?.code ?? err?.cause?.code;
1467
+ lastError = `HTTP /health failed: ${name}${code ? `(${code})` : ''}: ${err?.message ?? err}`;
1314
1468
  }
1469
+
1470
+ // Signal 2: TCP listener bound + recent stdout activity.
1471
+ tcpAttempts++;
1472
+ lastTcpListening = await this.tcpProbe(port);
1473
+ if (lastTcpListening) {
1474
+ const sinceStdoutMs = Date.now() - slot.lastStdoutAt;
1475
+ if (sinceStdoutMs < STDOUT_PROGRESS_WINDOW_MS) {
1476
+ this.log.log(
1477
+ `[WorkerRuntimeManager] TCP-listening + stdout-active for ${slot.projectId} on ` +
1478
+ `port ${port} (last stdout ${sinceStdoutMs}ms ago, ` +
1479
+ `${Date.now() - startedAt}ms since spawn, ${httpAttempts} http, ${tcpAttempts} tcp). ` +
1480
+ `Accepting as ready — /health still warming up but the child is bound and making progress.`,
1481
+ );
1482
+ return;
1483
+ }
1484
+ lastError =
1485
+ `TCP listening but stdout silent for ${sinceStdoutMs}ms ` +
1486
+ `(> ${STDOUT_PROGRESS_WINDOW_MS}ms window); last http: ${lastError ?? 'n/a'}`;
1487
+ }
1488
+
1315
1489
  await new Promise((r) => setTimeout(r, HEALTH_POLL_MS));
1316
1490
  }
1317
- throw new Error(`Timeout waiting for agent-runtime /health on port ${port}`);
1491
+
1492
+ if (proc.exitCode !== null || proc.signalCode != null || proc.killed) {
1493
+ throw new Error(
1494
+ `agent-runtime exited (code=${proc.exitCode}, signal=${proc.signalCode}) before becoming healthy on port ${port}`,
1495
+ );
1496
+ }
1497
+ throw new Error(
1498
+ `Timeout waiting for agent-runtime /health on port ${port} ` +
1499
+ `after ${iteration} iter (httpAttempts=${httpAttempts}, tcpAttempts=${tcpAttempts}, ` +
1500
+ `tcpListening=${lastTcpListening}, lastError=${lastError ?? 'n/a'})`,
1501
+ );
1318
1502
  }
1319
1503
 
1320
1504
  private async waitForExit(proc: ChildProcess, timeoutMs: number): Promise<void> {