@shogo-ai/worker 1.8.10 → 1.8.12

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -1,34 +1,42 @@
1
1
  // SPDX-License-Identifier: MIT
2
2
  // Copyright (C) 2026 Shogo Technologies, Inc.
3
3
  /**
4
- * Pins {@link WorkerRuntimeManager.waitForHealth} against the three
5
- * readiness signals it now considers and the diagnostic behavior on
6
- * timeout. The historical bug this protects against: agent-runtime
7
- * cold boots on Windows with `bun --conditions=development run
8
- * packages/agent-runtime/src/server.ts` routinely take 30-45s just
9
- * to JIT-compile the TS dep graph (shared-runtime + generators +
10
- * tools + hooks + gateway). The kernel-level TCP listener is up
11
- * within ~1-2s of spawn (the moment `Bun.serve()` reads its default
12
- * export), but the event loop is still saturated for tens of seconds
13
- * after that, so the old HTTP-only /health gate timed out, SIGTERM'd
14
- * the still-booting child, and the restart-loop chewed through
15
- * `MAX_CONSECUTIVE_RESTARTS` before giving up — leaving chat stuck
16
- * with "Connection timed out — The agent runtime could not be
17
- * reached".
4
+ * Pins {@link WorkerRuntimeManager.waitForHealth} against the readiness
5
+ * contract every downstream caller of `status === 'running'` assumes:
6
+ * the agent runtime can actually serve HTTP, not just "the kernel
7
+ * accepted a TCP listener on the port". The historical context:
8
+ *
9
+ * - First pass (pre-2026-05): HTTP /health was the only signal. Bun
10
+ * on Windows takes 30-45s to JIT the runtime TS dep graph during
11
+ * cold boot, and /health was starved on the busy event loop the
12
+ * whole time, so the 30s timeout SIGTERM'd the still-booting
13
+ * child and the restart loop chewed through `MAX_CONSECUTIVE_RESTARTS`.
14
+ * - Second pass: added a TCP-listening + recent stdout fast path
15
+ * that returned "ready" as soon as `Bun.serve()` bound the port
16
+ * and the child kept emitting log lines. This stopped the SIGTERM
17
+ * storm but tricked `/sandbox/url` into reporting ready before
18
+ * HTTP actually worked, so the canvas iframe and AgentProxy
19
+ * started chasing a black hole and surfaced as "Connection timed
20
+ * out — The agent runtime could not be reached".
21
+ * - Current pass (2026-05): keep waiting for a real /health 2xx with
22
+ * HEALTH_BOOT_TIMEOUT_MS=30 s (now realistic on Windows because
23
+ * LSP + IndexEngine were moved out of the critical-path boot
24
+ * sequence in agent-runtime). TCP-listening + silent stdout >
25
+ * STDOUT_PROGRESS_WINDOW_MS=25 s acts as a *wedge detector* —
26
+ * a child that bound the port but stopped producing output for
27
+ * 25 s is genuinely stuck and the restart loop should recover
28
+ * it instead of waiting for the hard timeout.
18
29
  *
19
30
  * The contracts pinned here:
20
31
  *
21
- * 1. HTTP /health 200 returns immediately — the happy path is
22
- * unchanged for healthy runtimes.
23
- * 2. TCP-listening + recent stdout activity returns successfully
24
- * even when /health never responds — handles the cold-boot
25
- * saturation case.
32
+ * 1. HTTP /health 200 returns immediately — the happy path.
33
+ * 2. /health that never responds (with TCP up + stdout fresh) keeps
34
+ * waiting until the timeout, instead of declaring ready early.
26
35
  * 3. TCP-listening but stdout silent past STDOUT_PROGRESS_WINDOW_MS
27
- * does NOT short-circuit — a truly-wedged process still hits
28
- * the hard timeout.
29
- * 4. Process death (exitCode/signalCode/killed) short-circuits
30
- * immediately with the exit info, instead of waiting the full
31
- * timeout window.
36
+ * throws the "wedged" error so restart-with-backoff fires.
37
+ * 4. Neither TCP nor HTTP ever come up → clean timeout error.
38
+ * 5. Process death (exitCode/signalCode/killed) short-circuits
39
+ * immediately with the exit info.
32
40
  */
33
41
  import { afterEach, beforeEach, describe, expect, it } from 'bun:test';
34
42
  import { WorkerRuntimeManager } from '../runtime-manager.ts';
@@ -132,63 +140,73 @@ describe('WorkerRuntimeManager.waitForHealth (private)', () => {
132
140
  });
133
141
 
134
142
  it(
135
- 'accepts TCP-listening + recent stdout activity as ready when /health keeps failing',
143
+ 'keeps waiting for /health even when TCP is up and stdout is fresh',
136
144
  async () => {
137
- // Reproduces the Windows `--conditions=development` cold-boot:
138
- // Bun.serve has bound the port (kernel-level listener up) and
139
- // the child is still emitting log lines (LSP-TS init,
140
- // preview-manager spawn, vite build), but /health is starved
141
- // because the event loop is busy JIT-compiling. The wait
142
- // should accept this as ready instead of timing out for 30s
143
- // and SIGTERM'ing a still-booting process.
145
+ // Reproduces the Windows cold-boot saturation pattern: Bun.serve
146
+ // has bound the port and the child is still emitting log lines
147
+ // (LSP-TS init, preview-manager spawn, vite build), but /health
148
+ // never gets event-loop time. The wait must NOT declare the
149
+ // runtime ready on TCP+stdout alone any more — see the file
150
+ // docstring for the canvas-iframe "Connection timed out"
151
+ // regression that fast-path caused. It should keep polling
152
+ // /health and eventually hit the deadline.
144
153
  globalThis.fetch = (async () => {
145
154
  throw new Error('ECONNREFUSED');
146
155
  }) as typeof fetch;
147
156
  stubTcpProbe(mgr, true);
148
157
 
149
158
  const slot = makeSlot();
150
- // Keep stdout "fresh" so the progress window stays open.
159
+ // Keep stdout "fresh" so the wedge detector stays quiet — we
160
+ // want to exercise the hard timeout, not the wedge bail-out.
151
161
  const bumper = setInterval(() => {
152
162
  slot.lastStdoutAt = Date.now();
153
163
  }, 200);
154
164
 
155
165
  const startedAt = Date.now();
156
166
  try {
157
- await callWaitForHealth(mgr, slot, 30_000);
167
+ await expect(callWaitForHealth(mgr, slot, 1_500)).rejects.toThrow(
168
+ /Timeout waiting for agent-runtime \/health on port 41234.*tcpListening=true/s,
169
+ );
158
170
  } finally {
159
171
  clearInterval(bumper);
160
172
  }
161
173
  const elapsed = Date.now() - startedAt;
162
- // Should return within the first iteration or two — TCP probe
163
- // succeeds + stdout fresh = ready. Hard cap well below the 30s
164
- // timeout to catch a regression to the legacy 30s spin.
165
- expect(elapsed).toBeLessThan(5_000);
174
+ // Should have spent close to the full 1.5s window polling
175
+ // /health, not short-circuited early.
176
+ expect(elapsed).toBeGreaterThanOrEqual(1_400);
166
177
  },
167
- 15_000,
178
+ 5_000,
168
179
  );
169
180
 
170
181
  it(
171
- 'does NOT accept TCP-listening alone when stdout is silent past the progress window',
182
+ 'throws "wedged" when TCP is up and stdout is silent past the progress window',
172
183
  async () => {
173
184
  // The silent-but-bound case mimics a process that bound the
174
185
  // port and then wedged (infinite loop in top-level code, native
175
- // crash mid-init that didn't propagate to the parent). The
176
- // progress window correctly refuses to short-circuit here so
177
- // the restart loop has a chance to recover.
186
+ // crash mid-init that didn't propagate to the parent). Bail
187
+ // out fast with a "wedged" error so the restart-with-backoff
188
+ // loop in handleExit() can SIGTERM and respawn, instead of
189
+ // spinning the full HEALTH_BOOT_TIMEOUT_MS for a process that
190
+ // will never recover.
178
191
  globalThis.fetch = (async () => {
179
192
  throw new Error('ECONNREFUSED');
180
193
  }) as typeof fetch;
181
194
  stubTcpProbe(mgr, true);
182
195
 
183
196
  const slot = makeSlot({
184
- // Pre-age the stdout timestamp so the window is already
185
- // closed before the first iteration.
186
- lastStdoutAt: Date.now() - 60_000,
197
+ // Pre-age the stdout timestamp so the wedge detector trips
198
+ // on the first iteration. Must exceed STDOUT_PROGRESS_WINDOW_MS
199
+ // (25 s) to fire.
200
+ lastStdoutAt: Date.now() - 30_000,
187
201
  });
188
202
 
189
- await expect(callWaitForHealth(mgr, slot, 1_200)).rejects.toThrow(
190
- /Timeout waiting for agent-runtime \/health on port 41234.*tcpListening=true/s,
203
+ // Generous outer timeout — the wedge detector should fire
204
+ // immediately, well before the 30s timeout would.
205
+ const startedAt = Date.now();
206
+ await expect(callWaitForHealth(mgr, slot, 30_000)).rejects.toThrow(
207
+ /agent-runtime wedged on port 41234.*stdout silent for/s,
191
208
  );
209
+ expect(Date.now() - startedAt).toBeLessThan(2_500);
192
210
  },
193
211
  5_000,
194
212
  );