@shogo-ai/worker 1.8.9 → 1.8.11

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
package/package.json CHANGED
@@ -1,6 +1,6 @@
1
1
  {
2
2
  "name": "@shogo-ai/worker",
3
- "version": "1.8.9",
3
+ "version": "1.8.11",
4
4
  "description": "Shogo Cloud Agent Worker — run Shogo agents on your own machine (laptop, devbox, CI).",
5
5
  "license": "MIT",
6
6
  "author": "Shogo Technologies, Inc.",
@@ -1,34 +1,42 @@
1
1
  // SPDX-License-Identifier: MIT
2
2
  // Copyright (C) 2026 Shogo Technologies, Inc.
3
3
  /**
4
- * Pins {@link WorkerRuntimeManager.waitForHealth} against the three
5
- * readiness signals it now considers and the diagnostic behavior on
6
- * timeout. The historical bug this protects against: agent-runtime
7
- * cold boots on Windows with `bun --conditions=development run
8
- * packages/agent-runtime/src/server.ts` routinely take 30-45s just
9
- * to JIT-compile the TS dep graph (shared-runtime + generators +
10
- * tools + hooks + gateway). The kernel-level TCP listener is up
11
- * within ~1-2s of spawn (the moment `Bun.serve()` reads its default
12
- * export), but the event loop is still saturated for tens of seconds
13
- * after that, so the old HTTP-only /health gate timed out, SIGTERM'd
14
- * the still-booting child, and the restart-loop chewed through
15
- * `MAX_CONSECUTIVE_RESTARTS` before giving up — leaving chat stuck
16
- * with "Connection timed out — The agent runtime could not be
17
- * reached".
4
+ * Pins {@link WorkerRuntimeManager.waitForHealth} against the readiness
5
+ * contract every downstream caller of `status === 'running'` assumes:
6
+ * the agent runtime can actually serve HTTP, not just "the kernel
7
+ * accepted a TCP listener on the port". The historical context:
8
+ *
9
+ * - First pass (pre-2026-05): HTTP /health was the only signal. Bun
10
+ * on Windows takes 30-45s to JIT the runtime TS dep graph during
11
+ * cold boot, and /health was starved on the busy event loop the
12
+ * whole time, so the 30s timeout SIGTERM'd the still-booting
13
+ * child and the restart loop chewed through `MAX_CONSECUTIVE_RESTARTS`.
14
+ * - Second pass: added a TCP-listening + recent stdout fast path
15
+ * that returned "ready" as soon as `Bun.serve()` bound the port
16
+ * and the child kept emitting log lines. This stopped the SIGTERM
17
+ * storm but tricked `/sandbox/url` into reporting ready before
18
+ * HTTP actually worked, so the canvas iframe and AgentProxy
19
+ * started chasing a black hole and surfaced as "Connection timed
20
+ * out — The agent runtime could not be reached".
21
+ * - Current pass (2026-05): keep waiting for a real /health 2xx with
22
+ * HEALTH_BOOT_TIMEOUT_MS=30 s (now realistic on Windows because
23
+ * LSP + IndexEngine were moved out of the critical-path boot
24
+ * sequence in agent-runtime). TCP-listening + silent stdout >
25
+ * STDOUT_PROGRESS_WINDOW_MS=25 s acts as a *wedge detector* —
26
+ * a child that bound the port but stopped producing output for
27
+ * 25 s is genuinely stuck and the restart loop should recover
28
+ * it instead of waiting for the hard timeout.
18
29
  *
19
30
  * The contracts pinned here:
20
31
  *
21
- * 1. HTTP /health 200 returns immediately — the happy path is
22
- * unchanged for healthy runtimes.
23
- * 2. TCP-listening + recent stdout activity returns successfully
24
- * even when /health never responds — handles the cold-boot
25
- * saturation case.
32
+ * 1. HTTP /health 200 returns immediately — the happy path.
33
+ * 2. /health that never responds (with TCP up + stdout fresh) keeps
34
+ * waiting until the timeout, instead of declaring ready early.
26
35
  * 3. TCP-listening but stdout silent past STDOUT_PROGRESS_WINDOW_MS
27
- * does NOT short-circuit — a truly-wedged process still hits
28
- * the hard timeout.
29
- * 4. Process death (exitCode/signalCode/killed) short-circuits
30
- * immediately with the exit info, instead of waiting the full
31
- * timeout window.
36
+ * throws the "wedged" error so restart-with-backoff fires.
37
+ * 4. Neither TCP nor HTTP ever come up → clean timeout error.
38
+ * 5. Process death (exitCode/signalCode/killed) short-circuits
39
+ * immediately with the exit info.
32
40
  */
33
41
  import { afterEach, beforeEach, describe, expect, it } from 'bun:test';
34
42
  import { WorkerRuntimeManager } from '../runtime-manager.ts';
@@ -132,63 +140,73 @@ describe('WorkerRuntimeManager.waitForHealth (private)', () => {
132
140
  });
133
141
 
134
142
  it(
135
- 'accepts TCP-listening + recent stdout activity as ready when /health keeps failing',
143
+ 'keeps waiting for /health even when TCP is up and stdout is fresh',
136
144
  async () => {
137
- // Reproduces the Windows `--conditions=development` cold-boot:
138
- // Bun.serve has bound the port (kernel-level listener up) and
139
- // the child is still emitting log lines (LSP-TS init,
140
- // preview-manager spawn, vite build), but /health is starved
141
- // because the event loop is busy JIT-compiling. The wait
142
- // should accept this as ready instead of timing out for 30s
143
- // and SIGTERM'ing a still-booting process.
145
+ // Reproduces the Windows cold-boot saturation pattern: Bun.serve
146
+ // has bound the port and the child is still emitting log lines
147
+ // (LSP-TS init, preview-manager spawn, vite build), but /health
148
+ // never gets event-loop time. The wait must NOT declare the
149
+ // runtime ready on TCP+stdout alone any more — see the file
150
+ // docstring for the canvas-iframe "Connection timed out"
151
+ // regression that fast-path caused. It should keep polling
152
+ // /health and eventually hit the deadline.
144
153
  globalThis.fetch = (async () => {
145
154
  throw new Error('ECONNREFUSED');
146
155
  }) as typeof fetch;
147
156
  stubTcpProbe(mgr, true);
148
157
 
149
158
  const slot = makeSlot();
150
- // Keep stdout "fresh" so the progress window stays open.
159
+ // Keep stdout "fresh" so the wedge detector stays quiet — we
160
+ // want to exercise the hard timeout, not the wedge bail-out.
151
161
  const bumper = setInterval(() => {
152
162
  slot.lastStdoutAt = Date.now();
153
163
  }, 200);
154
164
 
155
165
  const startedAt = Date.now();
156
166
  try {
157
- await callWaitForHealth(mgr, slot, 30_000);
167
+ await expect(callWaitForHealth(mgr, slot, 1_500)).rejects.toThrow(
168
+ /Timeout waiting for agent-runtime \/health on port 41234.*tcpListening=true/s,
169
+ );
158
170
  } finally {
159
171
  clearInterval(bumper);
160
172
  }
161
173
  const elapsed = Date.now() - startedAt;
162
- // Should return within the first iteration or two — TCP probe
163
- // succeeds + stdout fresh = ready. Hard cap well below the 30s
164
- // timeout to catch a regression to the legacy 30s spin.
165
- expect(elapsed).toBeLessThan(5_000);
174
+ // Should have spent close to the full 1.5s window polling
175
+ // /health, not short-circuited early.
176
+ expect(elapsed).toBeGreaterThanOrEqual(1_400);
166
177
  },
167
- 15_000,
178
+ 5_000,
168
179
  );
169
180
 
170
181
  it(
171
- 'does NOT accept TCP-listening alone when stdout is silent past the progress window',
182
+ 'throws "wedged" when TCP is up and stdout is silent past the progress window',
172
183
  async () => {
173
184
  // The silent-but-bound case mimics a process that bound the
174
185
  // port and then wedged (infinite loop in top-level code, native
175
- // crash mid-init that didn't propagate to the parent). The
176
- // progress window correctly refuses to short-circuit here so
177
- // the restart loop has a chance to recover.
186
+ // crash mid-init that didn't propagate to the parent). Bail
187
+ // out fast with a "wedged" error so the restart-with-backoff
188
+ // loop in handleExit() can SIGTERM and respawn, instead of
189
+ // spinning the full HEALTH_BOOT_TIMEOUT_MS for a process that
190
+ // will never recover.
178
191
  globalThis.fetch = (async () => {
179
192
  throw new Error('ECONNREFUSED');
180
193
  }) as typeof fetch;
181
194
  stubTcpProbe(mgr, true);
182
195
 
183
196
  const slot = makeSlot({
184
- // Pre-age the stdout timestamp so the window is already
185
- // closed before the first iteration.
186
- lastStdoutAt: Date.now() - 60_000,
197
+ // Pre-age the stdout timestamp so the wedge detector trips
198
+ // on the first iteration. Must exceed STDOUT_PROGRESS_WINDOW_MS
199
+ // (25 s) to fire.
200
+ lastStdoutAt: Date.now() - 30_000,
187
201
  });
188
202
 
189
- await expect(callWaitForHealth(mgr, slot, 1_200)).rejects.toThrow(
190
- /Timeout waiting for agent-runtime \/health on port 41234.*tcpListening=true/s,
203
+ // Generous outer timeout — the wedge detector should fire
204
+ // immediately, well before the 30s timeout would.
205
+ const startedAt = Date.now();
206
+ await expect(callWaitForHealth(mgr, slot, 30_000)).rejects.toThrow(
207
+ /agent-runtime wedged on port 41234.*stdout silent for/s,
191
208
  );
209
+ expect(Date.now() - startedAt).toBeLessThan(2_500);
192
210
  },
193
211
  5_000,
194
212
  );
@@ -87,30 +87,50 @@ const STARTUP_GRACE_MS = 60_000;
87
87
  const HEALTH_POLL_MS = 500;
88
88
  /**
89
89
  * Absolute ceiling on how long we'll wait for a freshly-spawned
90
- * agent-runtime to clear the boot gate (either a 200 /health response
91
- * or the TCP-listening + stdout-progress fallback below). Was 30s
92
- * before the Windows `--conditions=development` cold-boot
93
- * investigation surfaced that Bun routinely takes 30-45s just to JIT
94
- * the agent-runtime's TS dep graph (shared-runtime + generators +
95
- * tools + hooks + gateway) on a cold workspace, leaving zero budget
96
- * for /health to actually respond. 60s gives the slow path room
97
- * without giving a truly-hung process a free pass — the progress
98
- * detector below catches that case independently.
90
+ * agent-runtime to respond 2xx on /health. After the 2026-05 fix
91
+ * (`runtime-log-writer` async batched writes, deferred LSP +
92
+ * IndexEngine start, /health fast-path in the outer fetch handler),
93
+ * a Windows cold boot is ~8-12 s; macOS/Linux are ~3-5 s. 30 s gives
94
+ * a comfortable margin without hiding real hangs.
95
+ *
96
+ * Why we don't short-circuit on TCP-listening + stdout activity any
97
+ * more (the historical fast path that bypassed /health entirely):
98
+ * declaring "ready" while the event loop is saturated tricks the UI
99
+ * into starting to load the preview iframe and the API server into
100
+ * starting the agent-proxy, both of which then hit their own (much
101
+ * shorter) timeouts and surface as the "Connection timed out — The
102
+ * agent runtime could not be reached" toast and "[AgentProxy] PATCH
103
+ * /agent/config timeout" retries. Waiting for a real 2xx is slower
104
+ * but the resulting `'running'` status actually means "responsive to
105
+ * HTTP", which is what every caller of `status === 'running'` already
106
+ * assumes. The progress logger keeps devs informed during the slow
107
+ * path so the wait isn't a silent black-box.
108
+ */
109
+ const HEALTH_BOOT_TIMEOUT_MS = 30_000;
110
+ /**
111
+ * If `STDOUT_PROGRESS_WINDOW_MS` elapses without any new stdout AND
112
+ * TCP-listening is true AND /health still hasn't responded, the
113
+ * child is considered wedged and the wait short-circuits with the
114
+ * "stdout silent" error so the restart loop has a chance to recover.
115
+ * Note: this is the ONLY purpose of the window now — it does not
116
+ * declare the runtime ready on its own. See HEALTH_BOOT_TIMEOUT_MS.
117
+ *
118
+ * Sized at 25 s — short of the 30 s HEALTH_BOOT_TIMEOUT_MS so a
119
+ * wedged child surfaces with the more informative "stdout silent for
120
+ * Xms" message instead of the generic timeout. After the 2026-05
121
+ * cold-boot investigation we deferred LSP startup + IndexEngine
122
+ * pre-warm out of the critical path and got Windows cold boot down
123
+ * to ~8-12 s (with macOS at ~3-5 s), so a 25 s silent window is now
124
+ * a real anomaly worth respawning for.
99
125
  */
100
- const HEALTH_BOOT_TIMEOUT_MS = 60_000;
126
+ const STDOUT_PROGRESS_WINDOW_MS = 25_000;
101
127
  /**
102
- * If the child's TCP listener is up AND its stdout has emitted a line
103
- * within this window, we accept it as ready even though /health
104
- * hasn't returned yet. Models "is the process making forward
105
- * progress?" — a Bun runtime mid `optimizeDeps` / LSP-spawn / hook
106
- * registration is unresponsive to HTTP but still emits `[LSP-TS]`,
107
- * `[preview-manager]`, `[AgentGateway]` etc. log lines as it walks
108
- * its boot sequence, so stdout activity is a reliable liveness
109
- * signal. A truly-hung child (event-loop wedged, zero log output)
110
- * stays silent past this window and falls through to the normal
111
- * /health timeout + SIGTERM + restart loop.
128
+ * Log a "still waiting for /health" progress line at most this
129
+ * often, so operators / devs watching `[dev:all]` see something
130
+ * happening during the slow Windows cold-boot path instead of a
131
+ * silent 30-90s gap.
112
132
  */
113
- const STDOUT_PROGRESS_WINDOW_MS = 10_000;
133
+ const HEALTH_PROGRESS_LOG_MS = 5_000;
114
134
  /**
115
135
  * Per-attempt TCP connect budget for the kernel-level readiness
116
136
  * probe. Connect attempts only need a TCP SYN/SYN-ACK roundtrip on
@@ -1384,38 +1404,51 @@ export class WorkerRuntimeManager implements RuntimeResolver {
1384
1404
  }
1385
1405
 
1386
1406
  /**
1387
- * Wait for a freshly-spawned agent-runtime to clear the boot gate.
1407
+ * Wait for a freshly-spawned agent-runtime to be HTTP-responsive on
1408
+ * /health. This is the gate that drives the slot's `'starting'` →
1409
+ * `'running'` transition; every consumer of `status === 'running'`
1410
+ * (`/sandbox/url`, AgentProxy, the iframe preview readiness probe)
1411
+ * assumes the runtime can actually serve requests, so this gate
1412
+ * must mean exactly that — not "the kernel accepted a TCP listener
1413
+ * on the port".
1414
+ *
1415
+ * Loop semantics:
1416
+ *
1417
+ * 1. **HTTP /health 2xx → return.** Happy path; the event loop
1418
+ * is responsive and the slot can transition to `'running'`.
1388
1419
  *
1389
- * Three independent readiness signals, in priority order:
1420
+ * 2. **Child exits → throw.** Same as before — surface the
1421
+ * exit code/signal so restart-with-backoff can recover from
1422
+ * crashes-during-boot without burning the full timeout.
1390
1423
  *
1391
- * 1. **HTTP /health returns 2xx.** Happy path — the runtime is
1392
- * fully booted and its event loop is responsive. We return
1393
- * immediately and the caller transitions the slot to
1394
- * `'running'`.
1424
+ * 3. **TCP listening + stdout silent > {@link STDOUT_PROGRESS_WINDOW_MS}
1425
+ * → throw.** TCP up but no log lines for 25 s is "wedged":
1426
+ * Bun is past `Bun.serve()` (so the kernel has the port) but
1427
+ * something inside the runtime spun the event loop to death
1428
+ * (infinite loop, deadlock). Bail so the restart loop can
1429
+ * SIGTERM and respawn instead of spinning the full timeoutMs.
1395
1430
  *
1396
- * 2. **TCP listener bound + recent stdout activity.** The kernel
1397
- * has the port (so `Bun.serve()` evaluated its default export)
1398
- * AND the child has printed a log line in the last
1399
- * {@link STDOUT_PROGRESS_WINDOW_MS}. The process is alive,
1400
- * bound, and making forward progress — just slow because Bun
1401
- * is JIT-compiling the rest of the TS dep graph (the
1402
- * `--conditions=development` cold-boot pattern) and /health
1403
- * hasn't gotten a turn on the event loop yet. Accepting this
1404
- * as ready unblocks the AgentProxy retry storm that would
1405
- * otherwise compound the saturation. Once the child starts
1406
- * handling requests for real, /health will respond and the
1407
- * AgentProxy retries will succeed independently of this gate.
1431
+ * 4. **`timeoutMs` elapsed → throw.** Final ceiling. After the
1432
+ * 2026-05 fix that deferred LSP + IndexEngine out of the
1433
+ * critical-path boot sequence in agent-runtime, a healthy
1434
+ * Windows cold boot is ~8-12 s; the 30 s budget gives a
1435
+ * comfortable margin without hiding real hangs.
1408
1436
  *
1409
- * 3. **Hard timeout at `timeoutMs`.** Neither HTTP nor the
1410
- * TCP-plus-progress fallback has cleared. Throw the
1411
- * descriptive "Timeout waiting for agent-runtime /health" so
1412
- * the restart-with-backoff loop in `handleExit()` SIGTERMs the
1413
- * child and respawns.
1437
+ * Progress logging: every {@link HEALTH_PROGRESS_LOG_MS} we emit a
1438
+ * single line summarizing where the wait stands so the dev/operator
1439
+ * watching `[dev:all]` sees something happen during the slow path
1440
+ * instead of staring at a silent terminal for a minute.
1414
1441
  *
1415
- * Process-death short-circuit applies throughout: if the child exits
1416
- * mid-wait (e.g. spawn-time port conflict, missing native binding),
1417
- * we throw immediately with the exit code/signal rather than spinning
1418
- * for the full `timeoutMs`.
1442
+ * What changed vs the pre-2026-05 fast path: that revision returned
1443
+ * as soon as TCP-listening + recent stdout was true, treating "the
1444
+ * child is making progress" as ready. In practice the child was
1445
+ * still saturated for tens of seconds after that point, so the UI
1446
+ * would start its preview iframe load and the API server its agent
1447
+ * proxy, both of which then hit their own (4-15s) timeouts and
1448
+ * surfaced as the "Connection timed out — The agent runtime could
1449
+ * not be reached" toast plus "[AgentProxy] timeout, retrying" log
1450
+ * spam. The new gate trades 20-60s of additional boot wait for a
1451
+ * `'running'` signal the rest of the stack can actually trust.
1419
1452
  */
1420
1453
  private async waitForHealth(slot: InternalRuntime, timeoutMs: number): Promise<void> {
1421
1454
  const port = slot.agentPort;
@@ -1427,9 +1460,11 @@ export class WorkerRuntimeManager implements RuntimeResolver {
1427
1460
  const deadline = startedAt + timeoutMs;
1428
1461
  let lastError: string | null = null;
1429
1462
  let lastTcpListening = false;
1463
+ let lastTcpAt = 0;
1430
1464
  let httpAttempts = 0;
1431
1465
  let tcpAttempts = 0;
1432
1466
  let iteration = 0;
1467
+ let lastProgressLogAt = startedAt;
1433
1468
 
1434
1469
  while (Date.now() < deadline) {
1435
1470
  iteration++;
@@ -1439,7 +1474,9 @@ export class WorkerRuntimeManager implements RuntimeResolver {
1439
1474
  );
1440
1475
  }
1441
1476
 
1442
- // Signal 1: HTTP /health (happy path).
1477
+ // Primary signal: HTTP /health. This is the only signal that
1478
+ // declares the runtime ready — TCP-listening alone is not enough
1479
+ // (see the method docstring above for the rationale).
1443
1480
  httpAttempts++;
1444
1481
  const controller = new AbortController();
1445
1482
  const t = setTimeout(() => controller.abort(), 1500);
@@ -1457,8 +1494,7 @@ export class WorkerRuntimeManager implements RuntimeResolver {
1457
1494
  return;
1458
1495
  }
1459
1496
  // Non-2xx is recorded but doesn't short-circuit — the runtime
1460
- // may briefly serve 503 while initializing post-bind, and the
1461
- // TCP-progress fallback below covers that window too.
1497
+ // may briefly serve 503 while initializing post-bind.
1462
1498
  lastError = `HTTP /health returned ${resp.status}`;
1463
1499
  } catch (err: any) {
1464
1500
  clearTimeout(t);
@@ -1467,23 +1503,37 @@ export class WorkerRuntimeManager implements RuntimeResolver {
1467
1503
  lastError = `HTTP /health failed: ${name}${code ? `(${code})` : ''}: ${err?.message ?? err}`;
1468
1504
  }
1469
1505
 
1470
- // Signal 2: TCP listener bound + recent stdout activity.
1506
+ // Secondary signal: TCP listener + stdout activity. Now used ONLY
1507
+ // as a wedge detector — if TCP is up but stdout has been silent
1508
+ // for the progress window, abandon the wait so the restart loop
1509
+ // can recover. Never declares the runtime ready on its own.
1471
1510
  tcpAttempts++;
1472
1511
  lastTcpListening = await this.tcpProbe(port);
1473
1512
  if (lastTcpListening) {
1513
+ lastTcpAt = Date.now();
1474
1514
  const sinceStdoutMs = Date.now() - slot.lastStdoutAt;
1475
- if (sinceStdoutMs < STDOUT_PROGRESS_WINDOW_MS) {
1476
- this.log.log(
1477
- `[WorkerRuntimeManager] TCP-listening + stdout-active for ${slot.projectId} on ` +
1478
- `port ${port} (last stdout ${sinceStdoutMs}ms ago, ` +
1479
- `${Date.now() - startedAt}ms since spawn, ${httpAttempts} http, ${tcpAttempts} tcp). ` +
1480
- `Accepting as ready — /health still warming up but the child is bound and making progress.`,
1515
+ if (sinceStdoutMs >= STDOUT_PROGRESS_WINDOW_MS) {
1516
+ throw new Error(
1517
+ `agent-runtime wedged on port ${port}: TCP listening but stdout silent for ` +
1518
+ `${sinceStdoutMs}ms (> ${STDOUT_PROGRESS_WINDOW_MS}ms window); ` +
1519
+ `${httpAttempts} /health attempts, last error: ${lastError ?? 'n/a'}`,
1481
1520
  );
1482
- return;
1483
1521
  }
1484
- lastError =
1485
- `TCP listening but stdout silent for ${sinceStdoutMs}ms ` +
1486
- `(> ${STDOUT_PROGRESS_WINDOW_MS}ms window); last http: ${lastError ?? 'n/a'}`;
1522
+ }
1523
+
1524
+ const now = Date.now();
1525
+ if (now - lastProgressLogAt >= HEALTH_PROGRESS_LOG_MS) {
1526
+ const elapsedMs = now - startedAt;
1527
+ const sinceStdoutMs = now - slot.lastStdoutAt;
1528
+ const sinceTcpMs = lastTcpAt > 0 ? now - lastTcpAt : null;
1529
+ this.log.log(
1530
+ `[WorkerRuntimeManager] still waiting for /health on ${slot.projectId} ` +
1531
+ `port ${port} (${(elapsedMs / 1000).toFixed(1)}s elapsed, ` +
1532
+ `tcpListening=${lastTcpListening}${sinceTcpMs != null ? `(${sinceTcpMs}ms ago)` : ''}, ` +
1533
+ `lastStdout=${sinceStdoutMs}ms ago, ${httpAttempts} http, ${tcpAttempts} tcp, ` +
1534
+ `lastError=${lastError ?? 'n/a'})`,
1535
+ );
1536
+ lastProgressLogAt = now;
1487
1537
  }
1488
1538
 
1489
1539
  await new Promise((r) => setTimeout(r, HEALTH_POLL_MS));