@shogo-ai/worker 1.8.10 → 1.8.12

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -87,30 +87,50 @@ const STARTUP_GRACE_MS = 60_000;
87
87
  const HEALTH_POLL_MS = 500;
88
88
  /**
89
89
  * Absolute ceiling on how long we'll wait for a freshly-spawned
90
- * agent-runtime to clear the boot gate (either a 200 /health response
91
- * or the TCP-listening + stdout-progress fallback below). Was 30s
92
- * before the Windows `--conditions=development` cold-boot
93
- * investigation surfaced that Bun routinely takes 30-45s just to JIT
94
- * the agent-runtime's TS dep graph (shared-runtime + generators +
95
- * tools + hooks + gateway) on a cold workspace, leaving zero budget
96
- * for /health to actually respond. 60s gives the slow path room
97
- * without giving a truly-hung process a free pass — the progress
98
- * detector below catches that case independently.
90
+ * agent-runtime to respond 2xx on /health. After the 2026-05 fix
91
+ * (`runtime-log-writer` async batched writes, deferred LSP +
92
+ * IndexEngine start, /health fast-path in the outer fetch handler),
93
+ * a Windows cold boot is ~8-12 s; macOS/Linux are ~3-5 s. 30 s gives
94
+ * a comfortable margin without hiding real hangs.
95
+ *
96
+ * Why we don't short-circuit on TCP-listening + stdout activity any
97
+ * more (the historical fast path that bypassed /health entirely):
98
+ * declaring "ready" while the event loop is saturated tricks the UI
99
+ * into starting to load the preview iframe and the API server into
100
+ * starting the agent-proxy, both of which then hit their own (much
101
+ * shorter) timeouts and surface as the "Connection timed out — The
102
+ * agent runtime could not be reached" toast and "[AgentProxy] PATCH
103
+ * /agent/config timeout" retries. Waiting for a real 2xx is slower
104
+ * but the resulting `'running'` status actually means "responsive to
105
+ * HTTP", which is what every caller of `status === 'running'` already
106
+ * assumes. The progress logger keeps devs informed during the slow
107
+ * path so the wait isn't a silent black-box.
108
+ */
109
+ const HEALTH_BOOT_TIMEOUT_MS = 30_000;
110
+ /**
111
+ * If `STDOUT_PROGRESS_WINDOW_MS` elapses without any new stdout AND
112
+ * TCP-listening is true AND /health still hasn't responded, the
113
+ * child is considered wedged and the wait short-circuits with the
114
+ * "stdout silent" error so the restart loop has a chance to recover.
115
+ * Note: this is the ONLY purpose of the window now — it does not
116
+ * declare the runtime ready on its own. See HEALTH_BOOT_TIMEOUT_MS.
117
+ *
118
+ * Sized at 25 s — short of the 30 s HEALTH_BOOT_TIMEOUT_MS so a
119
+ * wedged child surfaces with the more informative "stdout silent for
120
+ * Xms" message instead of the generic timeout. After the 2026-05
121
+ * cold-boot investigation we deferred LSP startup + IndexEngine
122
+ * pre-warm out of the critical path and got Windows cold boot down
123
+ * to ~8-12 s (with macOS at ~3-5 s), so a 25 s silent window is now
124
+ * a real anomaly worth respawning for.
99
125
  */
100
- const HEALTH_BOOT_TIMEOUT_MS = 60_000;
126
+ const STDOUT_PROGRESS_WINDOW_MS = 25_000;
101
127
  /**
102
- * If the child's TCP listener is up AND its stdout has emitted a line
103
- * within this window, we accept it as ready even though /health
104
- * hasn't returned yet. Models "is the process making forward
105
- * progress?" — a Bun runtime mid `optimizeDeps` / LSP-spawn / hook
106
- * registration is unresponsive to HTTP but still emits `[LSP-TS]`,
107
- * `[preview-manager]`, `[AgentGateway]` etc. log lines as it walks
108
- * its boot sequence, so stdout activity is a reliable liveness
109
- * signal. A truly-hung child (event-loop wedged, zero log output)
110
- * stays silent past this window and falls through to the normal
111
- * /health timeout + SIGTERM + restart loop.
128
+ * Log a "still waiting for /health" progress line at most this
129
+ * often, so operators / devs watching `[dev:all]` see something
130
+ * happening during the slow Windows cold-boot path instead of a
131
+ * silent gap lasting up to the full boot timeout.
112
132
  */
113
- const STDOUT_PROGRESS_WINDOW_MS = 10_000;
133
+ const HEALTH_PROGRESS_LOG_MS = 5_000;
114
134
  /**
115
135
  * Per-attempt TCP connect budget for the kernel-level readiness
116
136
  * probe. Connect attempts only need a TCP SYN/SYN-ACK roundtrip on
@@ -1384,38 +1404,51 @@ export class WorkerRuntimeManager implements RuntimeResolver {
1384
1404
  }
1385
1405
 
1386
1406
  /**
1387
- * Wait for a freshly-spawned agent-runtime to clear the boot gate.
1407
+ * Wait for a freshly-spawned agent-runtime to be HTTP-responsive on
1408
+ * /health. This is the gate that drives the slot's `'starting'` →
1409
+ * `'running'` transition; every consumer of `status === 'running'`
1410
+ * (`/sandbox/url`, AgentProxy, the iframe preview readiness probe)
1411
+ * assumes the runtime can actually serve requests, so this gate
1412
+ * must mean exactly that — not "the kernel accepted a TCP listener
1413
+ * on the port".
1414
+ *
1415
+ * Loop semantics:
1416
+ *
1417
+ * 1. **HTTP /health 2xx → return.** Happy path; the event loop
1418
+ * is responsive and the slot can transition to `'running'`.
1388
1419
  *
1389
- * Three independent readiness signals, in priority order:
1420
+ * 2. **Child exits → throw.** Same as before — surface the
1421
+ * exit code/signal so restart-with-backoff can recover from
1422
+ * crashes-during-boot without burning the full timeout.
1390
1423
  *
1391
- * 1. **HTTP /health returns 2xx.** Happy path the runtime is
1392
- * fully booted and its event loop is responsive. We return
1393
- * immediately and the caller transitions the slot to
1394
- * `'running'`.
1424
+ * 3. **TCP listening + stdout silent > {@link STDOUT_PROGRESS_WINDOW_MS}
1425
+ * → throw.** TCP up but no log lines for 25 s is "wedged":
1426
+ * Bun is past `Bun.serve()` (so the kernel has the port) but
1427
+ * something inside the runtime spun the event loop to death
1428
+ * (infinite loop, deadlock). Bail so the restart loop can
1429
+ * SIGTERM and respawn instead of spinning the full timeoutMs.
1395
1430
  *
1396
- * 2. **TCP listener bound + recent stdout activity.** The kernel
1397
- * has the port (so `Bun.serve()` evaluated its default export)
1398
- * AND the child has printed a log line in the last
1399
- * {@link STDOUT_PROGRESS_WINDOW_MS}. The process is alive,
1400
- * bound, and making forward progress — just slow because Bun
1401
- * is JIT-compiling the rest of the TS dep graph (the
1402
- * `--conditions=development` cold-boot pattern) and /health
1403
- * hasn't gotten a turn on the event loop yet. Accepting this
1404
- * as ready unblocks the AgentProxy retry storm that would
1405
- * otherwise compound the saturation. Once the child starts
1406
- * handling requests for real, /health will respond and the
1407
- * AgentProxy retries will succeed independently of this gate.
1431
+ * 4. **`timeoutMs` elapsed → throw.** Final ceiling. After the
1432
+ * 2026-05 fix that deferred LSP + IndexEngine out of the
1433
+ * critical-path boot sequence in agent-runtime, a healthy
1434
+ * Windows cold boot is ~8-12 s; the 30 s budget gives a
1435
+ * comfortable margin without hiding real hangs.
1408
1436
  *
1409
- * 3. **Hard timeout at `timeoutMs`.** Neither HTTP nor the
1410
- * TCP-plus-progress fallback has cleared. Throw the
1411
- * descriptive "Timeout waiting for agent-runtime /health" so
1412
- * the restart-with-backoff loop in `handleExit()` SIGTERMs the
1413
- * child and respawns.
1437
+ * Progress logging: every {@link HEALTH_PROGRESS_LOG_MS} we emit a
1438
+ * single line summarizing where the wait stands so the dev/operator
1439
+ * watching `[dev:all]` sees something happen during the slow path
1440
+ * instead of staring at a silent terminal for the whole wait.
1414
1441
  *
1415
- * Process-death short-circuit applies throughout: if the child exits
1416
- * mid-wait (e.g. spawn-time port conflict, missing native binding),
1417
- * we throw immediately with the exit code/signal rather than spinning
1418
- * for the full `timeoutMs`.
1442
+ * What changed vs the pre-2026-05 fast path: that revision returned
1443
+ * as soon as TCP-listening + recent stdout was true, treating "the
1444
+ * child is making progress" as ready. In practice the child was
1445
+ * still saturated for tens of seconds after that point, so the UI
1446
+ * would start its preview iframe load and the API server its agent
1447
+ * proxy, both of which then hit their own (4-15s) timeouts and
1448
+ * surfaced as the "Connection timed out — The agent runtime could
1449
+ * not be reached" toast plus "[AgentProxy] timeout, retrying" log
1450
+ * spam. The new gate trades some additional boot wait (bounded by `timeoutMs`) for a
1451
+ * `'running'` signal the rest of the stack can actually trust.
1419
1452
  */
1420
1453
  private async waitForHealth(slot: InternalRuntime, timeoutMs: number): Promise<void> {
1421
1454
  const port = slot.agentPort;
@@ -1427,9 +1460,11 @@ export class WorkerRuntimeManager implements RuntimeResolver {
1427
1460
  const deadline = startedAt + timeoutMs;
1428
1461
  let lastError: string | null = null;
1429
1462
  let lastTcpListening = false;
1463
+ let lastTcpAt = 0;
1430
1464
  let httpAttempts = 0;
1431
1465
  let tcpAttempts = 0;
1432
1466
  let iteration = 0;
1467
+ let lastProgressLogAt = startedAt;
1433
1468
 
1434
1469
  while (Date.now() < deadline) {
1435
1470
  iteration++;
@@ -1439,7 +1474,9 @@ export class WorkerRuntimeManager implements RuntimeResolver {
1439
1474
  );
1440
1475
  }
1441
1476
 
1442
- // Signal 1: HTTP /health (happy path).
1477
+ // Primary signal: HTTP /health. This is the only signal that
1478
+ // declares the runtime ready — TCP-listening alone is not enough
1479
+ // (see the `waitForHealth` docstring above for the rationale).
1443
1480
  httpAttempts++;
1444
1481
  const controller = new AbortController();
1445
1482
  const t = setTimeout(() => controller.abort(), 1500);
@@ -1457,8 +1494,7 @@ export class WorkerRuntimeManager implements RuntimeResolver {
1457
1494
  return;
1458
1495
  }
1459
1496
  // Non-2xx is recorded but doesn't short-circuit — the runtime
1460
- // may briefly serve 503 while initializing post-bind, and the
1461
- // TCP-progress fallback below covers that window too.
1497
+ // may briefly serve 503 while initializing post-bind.
1462
1498
  lastError = `HTTP /health returned ${resp.status}`;
1463
1499
  } catch (err: any) {
1464
1500
  clearTimeout(t);
@@ -1467,23 +1503,37 @@ export class WorkerRuntimeManager implements RuntimeResolver {
1467
1503
  lastError = `HTTP /health failed: ${name}${code ? `(${code})` : ''}: ${err?.message ?? err}`;
1468
1504
  }
1469
1505
 
1470
- // Signal 2: TCP listener bound + recent stdout activity.
1506
+ // Secondary signal: TCP listener + stdout activity. Now used ONLY
1507
+ // as a wedge detector — if TCP is up but stdout has been silent
1508
+ // for the progress window, abandon the wait so the restart loop
1509
+ // can recover. Never declares the runtime ready on its own.
1471
1510
  tcpAttempts++;
1472
1511
  lastTcpListening = await this.tcpProbe(port);
1473
1512
  if (lastTcpListening) {
1513
+ lastTcpAt = Date.now();
1474
1514
  const sinceStdoutMs = Date.now() - slot.lastStdoutAt;
1475
- if (sinceStdoutMs < STDOUT_PROGRESS_WINDOW_MS) {
1476
- this.log.log(
1477
- `[WorkerRuntimeManager] TCP-listening + stdout-active for ${slot.projectId} on ` +
1478
- `port ${port} (last stdout ${sinceStdoutMs}ms ago, ` +
1479
- `${Date.now() - startedAt}ms since spawn, ${httpAttempts} http, ${tcpAttempts} tcp). ` +
1480
- `Accepting as ready — /health still warming up but the child is bound and making progress.`,
1515
+ if (sinceStdoutMs >= STDOUT_PROGRESS_WINDOW_MS) {
1516
+ throw new Error(
1517
+ `agent-runtime wedged on port ${port}: TCP listening but stdout silent for ` +
1518
+ `${sinceStdoutMs}ms (> ${STDOUT_PROGRESS_WINDOW_MS}ms window); ` +
1519
+ `${httpAttempts} /health attempts, last error: ${lastError ?? 'n/a'}`,
1481
1520
  );
1482
- return;
1483
1521
  }
1484
- lastError =
1485
- `TCP listening but stdout silent for ${sinceStdoutMs}ms ` +
1486
- `(> ${STDOUT_PROGRESS_WINDOW_MS}ms window); last http: ${lastError ?? 'n/a'}`;
1522
+ }
1523
+
1524
+ const now = Date.now();
1525
+ if (now - lastProgressLogAt >= HEALTH_PROGRESS_LOG_MS) {
1526
+ const elapsedMs = now - startedAt;
1527
+ const sinceStdoutMs = now - slot.lastStdoutAt;
1528
+ const sinceTcpMs = lastTcpAt > 0 ? now - lastTcpAt : null;
1529
+ this.log.log(
1530
+ `[WorkerRuntimeManager] still waiting for /health on ${slot.projectId} ` +
1531
+ `port ${port} (${(elapsedMs / 1000).toFixed(1)}s elapsed, ` +
1532
+ `tcpListening=${lastTcpListening}${sinceTcpMs != null ? `(${sinceTcpMs}ms ago)` : ''}, ` +
1533
+ `lastStdout=${sinceStdoutMs}ms ago, ${httpAttempts} http, ${tcpAttempts} tcp, ` +
1534
+ `lastError=${lastError ?? 'n/a'})`,
1535
+ );
1536
+ lastProgressLogAt = now;
1487
1537
  }
1488
1538
 
1489
1539
  await new Promise((r) => setTimeout(r, HEALTH_POLL_MS));
package/src/lib/tunnel.ts CHANGED
@@ -202,10 +202,6 @@ export class WorkerTunnel {
202
202
  this.pollTimer = null;
203
203
  }
204
204
  this.cleanupWs();
205
- if (this.ws) {
206
- try { this.ws.close(1000, 'Tunnel stopped'); } catch { /* already closed */ }
207
- this.ws = null;
208
- }
209
205
  this.log.log('[WorkerTunnel] Tunnel stopped');
210
206
  }
211
207