@shogo-ai/worker 1.7.10 → 1.7.11
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
package/package.json
CHANGED
|
@@ -0,0 +1,196 @@
|
|
|
1
|
+
// SPDX-License-Identifier: MIT
|
|
2
|
+
// Copyright (C) 2026 Shogo Technologies, Inc.
|
|
3
|
+
/**
|
|
4
|
+
* Coverage for `WorkerRuntimeManager` idle eviction.
|
|
5
|
+
*
|
|
6
|
+
* The "chat cut mid-stream" symptom in production was: the agent-proxy
|
|
7
|
+
* never refreshed `lastUsedAt` on a long stream, so a 16-minute Opus
|
|
8
|
+
* turn looked identical to a 16-minute idle slot to the manager and
|
|
9
|
+
* `stop()` fired mid-stream. The reaper itself is correct — it's
|
|
10
|
+
* `lastUsedAt` that wasn't being kept fresh by the surrounding HTTP
|
|
11
|
+
* code.
|
|
12
|
+
*
|
|
13
|
+
* The reproduction here pins both halves of the contract:
|
|
14
|
+
*
|
|
15
|
+
* 1. With NO `touch()` calls inside the idle window, the reaper
|
|
16
|
+
* DOES fire — the slot is killed, the process is SIGTERM'd, and
|
|
17
|
+
* the runtimes map is cleared. This is the bug surface.
|
|
18
|
+
*
|
|
19
|
+
* 2. With periodic `touch()` calls (modelling the agent-proxy
|
|
20
|
+
* forwarding chunks / the AI proxy receiving tool-call requests),
|
|
21
|
+
* the reaper NEVER fires across multiple idle windows — each
|
|
22
|
+
* touch resets the timer.
|
|
23
|
+
*
|
|
24
|
+
* The remaining tests pin the desktop / `SHOGO_LOCAL_MODE=true` opt-out:
|
|
25
|
+
* `idleMs: 0` (or any negative or non-finite value) disables the reaper without
|
|
26
|
+
* ever arming a timer, so a long chat in local mode is not bounded by
|
|
27
|
+
* the cloud's 15-minute window even if a touch hook breaks somewhere.
|
|
28
|
+
*/
|
|
29
|
+
import { describe, expect, it } from 'bun:test';
|
|
30
|
+
import { WorkerRuntimeManager } from '../runtime-manager.ts';
|
|
31
|
+
|
|
32
|
+
/** Callback shape accepted by the fake's `once` / `on` registrars. */
type FakeProcListener = (...args: unknown[]) => void;

/**
 * Minimal stand-in for the child-process handle stored on a runtime slot.
 *
 * Only the members the fake in this file provides are declared: exit state,
 * a `kill()` that reports whether the signal was delivered, and chainable
 * `once` / `on` listener registrars.
 */
interface FakeProc {
  /** Exit status, or `null` when no exit has been recorded. */
  exitCode: number | null;
  /** Terminating signal, or `null` when there is none. */
  signalCode: NodeJS.Signals | null;
  /** Flag the tests inspect to see whether `kill()` was invoked. */
  killed: boolean;
  /** Process id reported by the fake. */
  pid: number;
  /** Requests termination; the boolean result reports success. */
  kill(signal?: NodeJS.Signals | number): boolean;
  /** Registers a one-shot listener and returns the handle for chaining. */
  once(event: string, cb: FakeProcListener): FakeProc;
  /** Registers a listener and returns the handle for chaining. */
  on(event: string, cb: FakeProcListener): FakeProc;
}
|
|
41
|
+
|
|
42
|
+
/**
 * Builds a fresh fake process handle that already looks exited.
 *
 * `exitCode` is `0` rather than `null` on purpose: `waitForExit`
 * short-circuits when `exitCode !== null`, so `stop()` never blocks waiting
 * for an exit listener that this fake would never invoke.
 *
 * @returns A new handle whose `kill()` flips `killed` to `true` and whose
 *   `once` / `on` ignore their arguments and return the handle itself.
 */
function makeFakeProc(): FakeProc {
  // Shared no-op registrar: drops the listener and hands back the fake so
  // chained `.once(...).on(...)` calls keep working.
  const chain = (): FakeProc => fake;

  const fake: FakeProc = {
    exitCode: 0,
    signalCode: null,
    killed: false,
    pid: 99999,
    kill: (_signal?: NodeJS.Signals | number): boolean => {
      fake.killed = true;
      return true;
    },
    once: chain,
    on: chain,
  };

  return fake;
}
|
|
64
|
+
|
|
65
|
+
/**
 * Registers a fake slot with status `'running'` for `projectId` directly in
 * the manager's private `runtimes` map and returns that slot.
 *
 * The tests reach into private state deliberately: the public entry point
 * (`ensureRunning`) requires a real runtime binary on disk.
 *
 * @param mgr - Manager whose private `runtimes` map receives the slot.
 * @param projectId - Key under which the slot is stored.
 * @returns The inserted slot, so tests can inspect `idleTimer` and `proc`.
 */
function insertRunningSlot(mgr: WorkerRuntimeManager, projectId: string) {
  // Whatever `setTimeout` returns in the runtime executing the tests.
  type TimerHandle = ReturnType<typeof setTimeout>;

  const proc = makeFakeProc();
  const slot = {
    projectId,
    // 0 short-circuits releasePort and avoids polluting usedPorts.
    agentPort: 0,
    apiServerPort: 0,
    status: 'running' as const,
    proc,
    startedAt: Date.now(),
    lastUsedAt: Date.now(),
    restarts: 0,
    // Widened from the literal type `null`: the manager stores a live timer
    // handle here, and the tests clear it via `clearTimeout(slot.idleTimer)`.
    restartTimer: null as TimerHandle | null,
    idleTimer: null as TimerHandle | null,
    spawnConfig: {} as never,
    startPromise: null,
  };
  // The runtimes map and armIdleTimer are private; tests reach in
  // deliberately because the public surface (ensureRunning) requires a
  // real binary on disk.
  (mgr as unknown as { runtimes: Map<string, typeof slot> }).runtimes.set(
    projectId,
    slot,
  );
  return slot;
}
|
|
91
|
+
|
|
92
|
+
/**
 * Invokes the manager's private `armIdleTimer` for `slot`.
 *
 * @param mgr - Manager whose private method is called.
 * @param slot - Slot previously produced by `insertRunningSlot`.
 */
function armIdle(mgr: WorkerRuntimeManager, slot: ReturnType<typeof insertRunningSlot>): void {
  // Structural view exposing just the one private member this helper needs.
  type IdleTimerAccess = { armIdleTimer: (s: typeof slot) => void };

  const internals = mgr as unknown as IdleTimerAccess;
  // Called as a method so `this` inside `armIdleTimer` is still the manager.
  internals.armIdleTimer(slot);
}
|
|
95
|
+
|
|
96
|
+
/** Logger whose `log` / `warn` / `error` all do nothing, keeping test output quiet. */
const SILENT = {
  log() {},
  warn() {},
  error() {},
} as const;
|
|
97
|
+
|
|
98
|
+
describe('WorkerRuntimeManager idle eviction', () => {
  /** Resolves once `ms` milliseconds of real time have elapsed. */
  const sleep = (ms: number): Promise<void> =>
    new Promise<void>((resolve) => {
      setTimeout(resolve, ms);
    });

  /** Test-only peek at the manager's private `runtimes` map. */
  const runtimesOf = (mgr: WorkerRuntimeManager): Map<string, unknown> =>
    (mgr as unknown as { runtimes: Map<string, unknown> }).runtimes;

  it('reproduction: with no touches inside the idle window, the reaper kills the runtime (mid-stream cut)', async () => {
    // The 50ms window is a stand-in for the production 15min. What is being
    // pinned is "no `touch()` for >= idleMs ⇒ stop() fires", not the
    // magnitude of the window.
    const mgr = new WorkerRuntimeManager({ idleMs: 50, logger: SILENT });
    const slot = insertRunningSlot(mgr, 'proj-stream');

    armIdle(mgr, slot);
    expect(slot.idleTimer).not.toBeNull();

    // Stands in for an agent-proxy that keeps forwarding bytes for longer
    // than idleMs without ever calling `touch()` — the production bug. The
    // manager has no notion of in-flight HTTP, so it evicts.
    await sleep(120);

    const live = runtimesOf(mgr);
    expect(live.has('proj-stream')).toBe(false);
    expect(slot.proc.killed).toBe(true);
  });

  it('fix: periodic touch() inside the idle window keeps the runtime alive across multiple windows', async () => {
    const idleMs = 50;
    // Tick every (idleMs / 5), but never faster than every 5ms.
    const tickMs = Math.max(5, idleMs / 5);
    const mgr = new WorkerRuntimeManager({ idleMs, logger: SILENT });
    const slot = insertRunningSlot(mgr, 'proj-active');

    armIdle(mgr, slot);

    // Models the activity hooks the API server will install:
    //   - agent-proxy `touch(projectId)` on each forwarded SSE chunk,
    //   - AI proxy `touch(projectId)` after token decode.
    // Ticking continues for ~3x the idle window so the assertion lands well
    // past the point where an unfixed reaper would have fired.
    const ticker = setInterval(() => {
      mgr.touch('proj-active');
    }, tickMs);
    await sleep(idleMs * 3);
    clearInterval(ticker);

    const live = runtimesOf(mgr);
    expect(live.has('proj-active')).toBe(true);
    expect(slot.proc.killed).toBe(false);

    // Sanity: once activity stops the reaper does fire, so the slot is not
    // somehow "stuck alive". This pins the resume-eviction-on-idle half of
    // the contract, catching a future regression that disables the reaper
    // outright.
    await sleep(idleMs * 3);
    expect(live.has('proj-active')).toBe(false);
    expect(slot.proc.killed).toBe(true);
  });

  it('touch() on an unknown projectId is a safe no-op', () => {
    const mgr = new WorkerRuntimeManager({ idleMs: 50, logger: SILENT });
    const touchUnknown = (): void => {
      mgr.touch('does-not-exist');
    };
    expect(touchUnknown).not.toThrow();
  });

  it('idleMs=0 disables the reaper (no timer is armed) — desktop opt-out', async () => {
    const mgr = new WorkerRuntimeManager({ idleMs: 0, logger: SILENT });
    const slot = insertRunningSlot(mgr, 'proj-local');

    armIdle(mgr, slot);
    expect(slot.idleTimer).toBeNull();

    // Give a would-be timer time to fire; the slot must still be present.
    await sleep(50);
    const live = runtimesOf(mgr);
    expect(live.has('proj-local')).toBe(true);
    expect(slot.proc.killed).toBe(false);
  });

  // Both remaining "disabled" inputs share one shape: arming must leave
  // `idleTimer` as null.
  const disabledCases = [
    {
      title: 'idleMs=Infinity also disables the reaper',
      idleMs: Number.POSITIVE_INFINITY,
      projectId: 'proj-inf',
    },
    {
      title: 'negative idleMs disables the reaper (defensive)',
      idleMs: -1,
      projectId: 'proj-neg',
    },
  ];
  for (const { title, idleMs, projectId } of disabledCases) {
    it(title, () => {
      const mgr = new WorkerRuntimeManager({ idleMs, logger: SILENT });
      const slot = insertRunningSlot(mgr, projectId);

      armIdle(mgr, slot);
      expect(slot.idleTimer).toBeNull();
    });
  }

  it('idleMs > 0 still arms a timer (cloud regression)', () => {
    const mgr = new WorkerRuntimeManager({ idleMs: 1_000, logger: SILENT });
    const slot = insertRunningSlot(mgr, 'proj-cloud');

    armIdle(mgr, slot);
    expect(slot.idleTimer).not.toBeNull();

    // Disarm the 1s timer so it does not outlive the test.
    const pending = slot.idleTimer;
    if (!pending) return;
    clearTimeout(pending);
    slot.idleTimer = null;
  });
});
|
|
@@ -116,7 +116,16 @@ export type RuntimeBinResolver = () => ResolvedRuntime | null;
|
|
|
116
116
|
export interface WorkerRuntimeManagerOptions {
|
|
117
117
|
/** `--runtime-bin <path>` flag value if any (forwarded to resolveRuntime). */
|
|
118
118
|
runtimeBin?: string;
|
|
119
|
-
/**
|
|
119
|
+
/**
|
|
120
|
+
* Idle window in ms before evicting an unused runtime (default 15min).
|
|
121
|
+
*
|
|
122
|
+
* Pass `0`, a negative number, or `Infinity` to disable idle eviction
|
|
123
|
+
* entirely. The desktop / `SHOGO_LOCAL_MODE=true` path uses this to
|
|
124
|
+
* keep long-running chat streams alive past 15min of agent-proxy
|
|
125
|
+
* silence — eviction in that environment cuts the user's stream
|
|
126
|
+
* mid-flight (only one user, no resource pressure to recycle for).
|
|
127
|
+
* Cloud workers leave this unset so the default still fires.
|
|
128
|
+
*/
|
|
120
129
|
idleMs?: number;
|
|
121
130
|
/** Optional logger. Defaults to console. */
|
|
122
131
|
logger?: Pick<Console, 'log' | 'warn' | 'error'>;
|
|
@@ -994,8 +1003,15 @@ export class WorkerRuntimeManager implements RuntimeResolver {
|
|
|
994
1003
|
}
|
|
995
1004
|
|
|
996
1005
|
private armIdleTimer(slot: InternalRuntime): void {
|
|
997
|
-
if (slot.idleTimer)
|
|
1006
|
+
if (slot.idleTimer) {
|
|
1007
|
+
clearTimeout(slot.idleTimer);
|
|
1008
|
+
slot.idleTimer = null;
|
|
1009
|
+
}
|
|
998
1010
|
const idleMs = this.opts.idleMs ?? RUNTIME_IDLE_MS;
|
|
1011
|
+
// `idleMs <= 0` or non-finite disables eviction. Used by desktop /
|
|
1012
|
+
// `SHOGO_LOCAL_MODE=true` where reaping a "stale" runtime really
|
|
1013
|
+
// means killing the in-flight chat stream of the only user.
|
|
1014
|
+
if (!Number.isFinite(idleMs) || idleMs <= 0) return;
|
|
999
1015
|
slot.idleTimer = setTimeout(() => {
|
|
1000
1016
|
const since = Date.now() - slot.lastUsedAt;
|
|
1001
1017
|
if (since < idleMs) {
|