@shogo-ai/worker 1.8.10 → 1.8.12
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/package.json +1 -1
- package/src/lib/__tests__/cloud-login-coverage.test.ts +210 -0
- package/src/lib/__tests__/git-cloner.test.ts +130 -6
- package/src/lib/__tests__/runtime-manager-coverage-gaps.test.ts +768 -0
- package/src/lib/__tests__/runtime-manager-wait-for-health.test.ts +66 -48
- package/src/lib/__tests__/tunnel-coverage.test.ts +1094 -0
- package/src/lib/runtime-manager.ts +112 -62
- package/src/lib/tunnel.ts +0 -4
|
@@ -1,34 +1,42 @@
|
|
|
1
1
|
// SPDX-License-Identifier: MIT
|
|
2
2
|
// Copyright (C) 2026 Shogo Technologies, Inc.
|
|
3
3
|
/**
|
|
4
|
-
* Pins {@link WorkerRuntimeManager.waitForHealth} against the
|
|
5
|
-
*
|
|
6
|
-
*
|
|
7
|
-
*
|
|
8
|
-
*
|
|
9
|
-
*
|
|
10
|
-
*
|
|
11
|
-
*
|
|
12
|
-
*
|
|
13
|
-
*
|
|
14
|
-
*
|
|
15
|
-
*
|
|
16
|
-
*
|
|
17
|
-
*
|
|
4
|
+
* Pins {@link WorkerRuntimeManager.waitForHealth} against the readiness
|
|
5
|
+
* contract every downstream caller of `status === 'running'` assumes:
|
|
6
|
+
* the agent runtime can actually serve HTTP, not just "the kernel
|
|
7
|
+
* accepted a TCP listener on the port". The historical context:
|
|
8
|
+
*
|
|
9
|
+
* - First pass (pre-2026-05): HTTP /health was the only signal. Bun
|
|
10
|
+
* on Windows takes 30-45s to JIT the runtime TS dep graph during
|
|
11
|
+
* cold boot, and /health was starved on the busy event loop the
|
|
12
|
+
* whole time, so the 30s timeout SIGTERM'd the still-booting
|
|
13
|
+
* child and the restart loop chewed through `MAX_CONSECUTIVE_RESTARTS`.
|
|
14
|
+
* - Second pass: added a TCP-listening + recent stdout fast path
|
|
15
|
+
* that returned "ready" as soon as `Bun.serve()` bound the port
|
|
16
|
+
* and the child kept emitting log lines. This stopped the SIGTERM
|
|
17
|
+
* storm but tricked `/sandbox/url` into reporting ready before
|
|
18
|
+
* HTTP actually worked, so the canvas iframe and AgentProxy
|
|
19
|
+
* started chasing a black hole and surfaced as "Connection timed
|
|
20
|
+
* out — The agent runtime could not be reached".
|
|
21
|
+
* - Current pass (2026-05): keep waiting for a real /health 2xx with
|
|
22
|
+
* HEALTH_BOOT_TIMEOUT_MS=30 s (now realistic on Windows because
|
|
23
|
+
* LSP + IndexEngine were moved out of the critical-path boot
|
|
24
|
+
* sequence in agent-runtime). TCP-listening + silent stdout >
|
|
25
|
+
* STDOUT_PROGRESS_WINDOW_MS=25 s acts as a *wedge detector* —
|
|
26
|
+
* a child that bound the port but stopped producing output for
|
|
27
|
+
* 25 s is genuinely stuck and the restart loop should recover
|
|
28
|
+
* it instead of waiting for the hard timeout.
|
|
18
29
|
*
|
|
19
30
|
* The contracts pinned here:
|
|
20
31
|
*
|
|
21
|
-
* 1. HTTP /health 200 returns immediately — the happy path
|
|
22
|
-
*
|
|
23
|
-
*
|
|
24
|
-
* even when /health never responds — handles the cold-boot
|
|
25
|
-
* saturation case.
|
|
32
|
+
* 1. HTTP /health 200 returns immediately — the happy path.
|
|
33
|
+
* 2. /health that never responds (with TCP up + stdout fresh) keeps
|
|
34
|
+
* waiting until the timeout, instead of declaring ready early.
|
|
26
35
|
* 3. TCP-listening but stdout silent past STDOUT_PROGRESS_WINDOW_MS
|
|
27
|
-
*
|
|
28
|
-
*
|
|
29
|
-
*
|
|
30
|
-
* immediately with the exit info
|
|
31
|
-
* timeout window.
|
|
36
|
+
* throws the "wedged" error so restart-with-backoff fires.
|
|
37
|
+
* 4. Neither TCP nor HTTP ever comes up → clean timeout error.
|
|
38
|
+
* 5. Process death (exitCode/signalCode/killed) short-circuits
|
|
39
|
+
* immediately with the exit info.
|
|
32
40
|
*/
|
|
33
41
|
import { afterEach, beforeEach, describe, expect, it } from 'bun:test';
|
|
34
42
|
import { WorkerRuntimeManager } from '../runtime-manager.ts';
|
|
@@ -132,63 +140,73 @@ describe('WorkerRuntimeManager.waitForHealth (private)', () => {
|
|
|
132
140
|
});
|
|
133
141
|
|
|
134
142
|
it(
|
|
135
|
-
'
|
|
143
|
+
'keeps waiting for /health even when TCP is up and stdout is fresh',
|
|
136
144
|
async () => {
|
|
137
|
-
// Reproduces the Windows
|
|
138
|
-
//
|
|
139
|
-
//
|
|
140
|
-
//
|
|
141
|
-
//
|
|
142
|
-
//
|
|
143
|
-
//
|
|
145
|
+
// Reproduces the Windows cold-boot saturation pattern: Bun.serve
|
|
146
|
+
// has bound the port and the child is still emitting log lines
|
|
147
|
+
// (LSP-TS init, preview-manager spawn, vite build), but /health
|
|
148
|
+
// never gets event-loop time. The wait must NOT declare the
|
|
149
|
+
// runtime ready on TCP+stdout alone any more — see the file
|
|
150
|
+
// docstring for the canvas-iframe "Connection timed out"
|
|
151
|
+
// regression that the fast path caused. It should keep polling
|
|
152
|
+
// /health and eventually hit the deadline.
|
|
144
153
|
globalThis.fetch = (async () => {
|
|
145
154
|
throw new Error('ECONNREFUSED');
|
|
146
155
|
}) as typeof fetch;
|
|
147
156
|
stubTcpProbe(mgr, true);
|
|
148
157
|
|
|
149
158
|
const slot = makeSlot();
|
|
150
|
-
// Keep stdout "fresh" so the
|
|
159
|
+
// Keep stdout "fresh" so the wedge detector stays quiet — we
|
|
160
|
+
// want to exercise the hard timeout, not the wedge bail-out.
|
|
151
161
|
const bumper = setInterval(() => {
|
|
152
162
|
slot.lastStdoutAt = Date.now();
|
|
153
163
|
}, 200);
|
|
154
164
|
|
|
155
165
|
const startedAt = Date.now();
|
|
156
166
|
try {
|
|
157
|
-
await callWaitForHealth(mgr, slot,
|
|
167
|
+
await expect(callWaitForHealth(mgr, slot, 1_500)).rejects.toThrow(
|
|
168
|
+
/Timeout waiting for agent-runtime \/health on port 41234.*tcpListening=true/s,
|
|
169
|
+
);
|
|
158
170
|
} finally {
|
|
159
171
|
clearInterval(bumper);
|
|
160
172
|
}
|
|
161
173
|
const elapsed = Date.now() - startedAt;
|
|
162
|
-
// Should
|
|
163
|
-
//
|
|
164
|
-
|
|
165
|
-
expect(elapsed).toBeLessThan(5_000);
|
|
174
|
+
// Should have spent close to the full 1.5s window polling
|
|
175
|
+
// /health, not short-circuited early.
|
|
176
|
+
expect(elapsed).toBeGreaterThanOrEqual(1_400);
|
|
166
177
|
},
|
|
167
|
-
|
|
178
|
+
5_000,
|
|
168
179
|
);
|
|
169
180
|
|
|
170
181
|
it(
|
|
171
|
-
'
|
|
182
|
+
'throws "wedged" when TCP is up and stdout is silent past the progress window',
|
|
172
183
|
async () => {
|
|
173
184
|
// The silent-but-bound case mimics a process that bound the
|
|
174
185
|
// port and then wedged (infinite loop in top-level code, native
|
|
175
|
-
// crash mid-init that didn't propagate to the parent).
|
|
176
|
-
//
|
|
177
|
-
//
|
|
186
|
+
// crash mid-init that didn't propagate to the parent). Bail
|
|
187
|
+
// out fast with a "wedged" error so the restart-with-backoff
|
|
188
|
+
// loop in handleExit() can SIGTERM and respawn, instead of
|
|
189
|
+
// spinning the full HEALTH_BOOT_TIMEOUT_MS for a process that
|
|
190
|
+
// will never recover.
|
|
178
191
|
globalThis.fetch = (async () => {
|
|
179
192
|
throw new Error('ECONNREFUSED');
|
|
180
193
|
}) as typeof fetch;
|
|
181
194
|
stubTcpProbe(mgr, true);
|
|
182
195
|
|
|
183
196
|
const slot = makeSlot({
|
|
184
|
-
// Pre-age the stdout timestamp so the
|
|
185
|
-
//
|
|
186
|
-
|
|
197
|
+
// Pre-age the stdout timestamp so the wedge detector trips
|
|
198
|
+
// on the first iteration. Must exceed STDOUT_PROGRESS_WINDOW_MS
|
|
199
|
+
// (25 s) to fire.
|
|
200
|
+
lastStdoutAt: Date.now() - 30_000,
|
|
187
201
|
});
|
|
188
202
|
|
|
189
|
-
|
|
190
|
-
|
|
203
|
+
// Generous outer timeout — the wedge detector should fire
|
|
204
|
+
// immediately, well before the 30s timeout would.
|
|
205
|
+
const startedAt = Date.now();
|
|
206
|
+
await expect(callWaitForHealth(mgr, slot, 30_000)).rejects.toThrow(
|
|
207
|
+
/agent-runtime wedged on port 41234.*stdout silent for/s,
|
|
191
208
|
);
|
|
209
|
+
expect(Date.now() - startedAt).toBeLessThan(2_500);
|
|
192
210
|
},
|
|
193
211
|
5_000,
|
|
194
212
|
);
|