alvin-bot 4.14.1 → 4.14.2
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/CHANGELOG.md +43 -0
- package/dist/services/async-agent-watcher.js +35 -1
- package/package.json +1 -1
- package/test/watcher-zombie-fix.test.ts +252 -0
package/CHANGELOG.md
CHANGED
|
@@ -2,6 +2,49 @@
|
|
|
2
2
|
|
|
3
3
|
All notable changes to Alvin Bot are documented here.
|
|
4
4
|
|
|
5
|
+
## [4.14.2] — 2026-04-16
|
|
6
|
+
|
|
7
|
+
### 🐛 Patch: watcher zombie-entry fix (missing outputFile > 10 min = failed)
|
|
8
|
+
|
|
9
|
+
**Edge case Ali caught today:** a pending async-agent entry stuck in `/subagents list` for 3+ hours showing "running" — but the underlying `alvin_dispatch_agent` subprocess had already died (its output file was gone). The entry would have continued haunting the list until the 12-hour `giveUpAt` ceiling fired.
|
|
10
|
+
|
|
11
|
+
**Root cause:** `async-agent-watcher`'s `pollOnce` handled four states from `parseOutputFileStatus` — `completed` / `failed` / `running` / `missing`. For `missing` (file doesn't exist or is empty), the watcher just kept polling forever, on the assumption that a slow subprocess might eventually write. If the subprocess crashed before writing ANY output, the file never appeared, and we polled for 12 hours before timing out.
|
|
12
|
+
|
|
13
|
+
**Fix:** when `status.state === "missing"` AND `now - entry.startedAt > MISSING_FILE_FAILURE_MS` (default 10 min, configurable via `ALVIN_MISSING_FILE_FAILURE_MS` env var), deliver as failed with an explicit message:
|
|
14
|
+
|
|
15
|
+
> *Dispatched subprocess never wrote its output file (N m after start). Likely crashed before initializing, or the file was removed externally.*
|
|
16
|
+
|
|
17
|
+
10 minutes is well above any legitimate `claude -p` startup variance (normal first-write latency is seconds) and well below the 12-hour hard ceiling.
|
|
18
|
+
|
|
19
|
+
### What's preserved (regression-guard tested)
|
|
20
|
+
|
|
21
|
+
- Running agents (file has content but no `end_turn`/`result` yet) are untouched by this path — they still keep polling as before.
|
|
22
|
+
- Completed agents (clean `end_turn` or `stream-json result` event) still deliver normally.
|
|
23
|
+
- Explicit `failed` state from the parser (if ever used) still delivers error normally.
|
|
24
|
+
- v4.12.4's "file is stale but has text → deliver partial" path takes precedence over the new zombie check (the file has content, so not "missing").
|
|
25
|
+
- 12-hour `giveUpAt` hard ceiling still applies as the ultimate safety net.
|
|
26
|
+
- Session's `pendingBackgroundCount` decrement fires on zombie failure, same as every other delivery path.
|
|
27
|
+
|
|
28
|
+
### Testing
|
|
29
|
+
|
|
30
|
+
- **Baseline**: 498 tests (v4.14.1)
|
|
31
|
+
- **New**: `test/watcher-zombie-fix.test.ts` — 6 tests:
|
|
32
|
+
- Young missing file (<threshold) stays pending
|
|
33
|
+
- Old missing file (>threshold) delivers failed + removes from pending
|
|
34
|
+
- Default threshold is 10 min when env var unset
|
|
35
|
+
- Running file (has content) is unaffected by zombie check
|
|
36
|
+
- Completed file delivers as completed (regression guard)
|
|
37
|
+
- Session's `pendingBackgroundCount` decrements on zombie delivery
|
|
38
|
+
- **Total**: 504 tests, all green, TSC clean
|
|
39
|
+
|
|
40
|
+
### Files changed
|
|
41
|
+
|
|
42
|
+
- **Modified**: `src/services/async-agent-watcher.ts` (new `getMissingFileFailureMs()` + zombie branch in `pollOnce`)
|
|
43
|
+
- **NEW tests**: `test/watcher-zombie-fix.test.ts`
|
|
44
|
+
- **Version**: `package.json` 4.14.1 → 4.14.2
|
|
45
|
+
|
|
46
|
+
---
|
|
47
|
+
|
|
5
48
|
## [4.14.1] — 2026-04-16
|
|
6
49
|
|
|
7
50
|
### 🐛 Patch: `/subagents list` now shows v4.13+ dispatch agents too
|
|
@@ -33,6 +33,31 @@ const POLL_INTERVAL_MS = 15_000;
|
|
|
33
33
|
* a timeout banner. SEO audits historically take ~13 min, so 12h
|
|
34
34
|
* is absurdly generous and protects against state-file growth. */
|
|
35
35
|
const MAX_AGENT_AGE_MS = 12 * 60 * 60 * 1000;
|
|
36
|
+
/**
|
|
37
|
+
* v4.14.2 — When a dispatched subprocess never creates its outputFile
|
|
38
|
+
* (spawn failure, crash before first write, file deleted externally),
|
|
39
|
+
* `parseOutputFileStatus` returns "missing" on every poll. Pre-v4.14.2
|
|
40
|
+
* that meant waiting the full 12h MAX_AGENT_AGE_MS before delivering a
|
|
41
|
+
* timeout — a 12-hour zombie in `/subagents list`.
|
|
42
|
+
*
|
|
43
|
+
* This threshold caps how long we tolerate a missing file before
|
|
44
|
+
* declaring the agent failed. `claude -p` normally writes its first
|
|
45
|
+
* JSONL line within seconds of spawn; 10 minutes is way above any
|
|
46
|
+
* legitimate startup variance and well below the 12h ceiling.
|
|
47
|
+
*
|
|
48
|
+
* Configurable via the ALVIN_MISSING_FILE_FAILURE_MS env var. Tests
|
|
49
|
+
* use shorter values via the same hook. Only the getter is exposed
|
|
50
|
+
* so callers always see the current env value, not a stale constant.
|
|
51
|
+
*/
|
|
52
|
+
function getMissingFileFailureMs() {
    // Used whenever the env override is unset, empty, non-numeric,
    // non-finite, zero, or negative.
    const fallbackMs = 10 * 60 * 1000; // default 10 min
    const override = process.env.ALVIN_MISSING_FILE_FAILURE_MS;
    if (!override) {
        return fallbackMs;
    }
    // Read on every call (no caching) so a changed env value takes
    // effect on the next invocation.
    const parsedMs = Number(override);
    const usable = Number.isFinite(parsedMs) && parsedMs > 0;
    return usable ? parsedMs : fallbackMs;
}
|
|
36
61
|
// ── Module state ──────────────────────────────────────────────────
|
|
37
62
|
const pending = new Map();
|
|
38
63
|
let pollTimer = null;
|
|
@@ -139,6 +164,7 @@ export function stopWatcher() {
|
|
|
139
164
|
export async function pollOnce() {
|
|
140
165
|
const now = Date.now();
|
|
141
166
|
const toRemove = [];
|
|
167
|
+
const missingFileFailureMs = getMissingFileFailureMs();
|
|
142
168
|
for (const entry of pending.values()) {
|
|
143
169
|
entry.lastCheckedAt = now;
|
|
144
170
|
// Timeout check first — if the agent is past its giveUpAt, give up
|
|
@@ -157,7 +183,15 @@ export async function pollOnce() {
|
|
|
157
183
|
await deliverAsFailure(entry, "error", status.error);
|
|
158
184
|
toRemove.push(entry.agentId);
|
|
159
185
|
}
|
|
160
|
-
|
|
186
|
+
else if (status.state === "missing" &&
|
|
187
|
+
now - entry.startedAt > missingFileFailureMs) {
|
|
188
|
+
// v4.14.2 — Zombie guard: the subprocess never created its
|
|
189
|
+
// output file within `missingFileFailureMs` (default 10 min).
|
|
190
|
+
// Declare failed instead of polling until the 12h giveUpAt.
|
|
191
|
+
await deliverAsFailure(entry, "error", `Dispatched subprocess never wrote its output file (${Math.round((now - entry.startedAt) / 60_000)}m after start). Likely crashed before initializing, or the file was removed externally.`);
|
|
192
|
+
toRemove.push(entry.agentId);
|
|
193
|
+
}
|
|
194
|
+
// running / missing-but-young → keep polling next cycle
|
|
161
195
|
}
|
|
162
196
|
if (toRemove.length > 0) {
|
|
163
197
|
for (const id of toRemove)
|
package/package.json
CHANGED
|
@@ -0,0 +1,252 @@
|
|
|
1
|
+
/**
|
|
2
|
+
* v4.14.2 — zombie-entry fix for async-agent-watcher.
|
|
3
|
+
*
|
|
4
|
+
* Problem: when the dispatched `claude -p` subprocess never produces
|
|
5
|
+
* its outputFile (crashed before the first write, spawn failed, file
|
|
6
|
+
* got deleted externally), `parseOutputFileStatus` returns "missing"
|
|
7
|
+
* on every poll. The watcher keeps polling forever until `giveUpAt`
|
|
8
|
+
* (12 hours) fires, then delivers a timeout banner. Meanwhile the
|
|
9
|
+
* entry hangs in `/subagents list` as a permanent "running" zombie.
|
|
10
|
+
*
|
|
11
|
+
* Fix: when status is "missing" for longer than
|
|
12
|
+
* `ALVIN_MISSING_FILE_FAILURE_MS` (env var, default 10 min), the
|
|
13
|
+
* watcher declares the agent failed with a clear "output file never
|
|
14
|
+
* appeared" reason, delivers the failure banner, and removes the
|
|
15
|
+
* entry. 10 minutes is well above normal startup variance (seconds)
|
|
16
|
+
* and well below the 12h hard ceiling.
|
|
17
|
+
*
|
|
18
|
+
* Invariants preserved:
|
|
19
|
+
* - An agent whose output file DOES appear, even slowly, continues
|
|
20
|
+
* normally (missing on first poll, running on second, completed
|
|
21
|
+
* on third — same as v4.14.1).
|
|
22
|
+
* - The `completed` path (end_turn or stream-json result) is
|
|
23
|
+
* unchanged.
|
|
24
|
+
* - The `failed` path (existing "error" state from parser) is
|
|
25
|
+
* unchanged.
|
|
26
|
+
* - The 12h giveUpAt ceiling still applies — it's now just less
|
|
27
|
+
* likely to be hit because missing-file zombies resolve earlier.
|
|
28
|
+
*/
|
|
29
|
+
import { describe, it, expect, beforeEach, afterEach, vi } from "vitest";
|
|
30
|
+
import fs from "fs";
|
|
31
|
+
import os from "os";
|
|
32
|
+
import { resolve } from "path";
|
|
33
|
+
|
|
34
|
+
const TEST_DATA_DIR = resolve(
|
|
35
|
+
os.tmpdir(),
|
|
36
|
+
`alvin-zombie-${process.pid}-${Date.now()}`,
|
|
37
|
+
);
|
|
38
|
+
|
|
39
|
+
interface Delivered {
|
|
40
|
+
info: { name: string; status: string };
|
|
41
|
+
result: { status: string; output: string; error?: string };
|
|
42
|
+
}
|
|
43
|
+
let delivered: Delivered[] = [];
|
|
44
|
+
|
|
45
|
+
beforeEach(async () => {
  // Start every test from an empty scratch directory.
  const scratchExists = fs.existsSync(TEST_DATA_DIR);
  if (scratchExists) {
    fs.rmSync(TEST_DATA_DIR, { recursive: true, force: true });
  }
  fs.mkdirSync(TEST_DATA_DIR, { recursive: true });
  process.env.ALVIN_DATA_DIR = TEST_DATA_DIR;

  // No threshold override and no recorded deliveries carry over
  // from the previous test.
  delete process.env.ALVIN_MISSING_FILE_FAILURE_MS;
  delivered = [];

  // Clear the module cache first, then register the delivery stub, so
  // the dynamic imports inside each test are evaluated fresh.
  vi.resetModules();
  const deliveryStubFactory = () => ({
    // Records each delivery in `delivered` instead of sending it.
    deliverSubAgentResult: async (info: unknown, result: unknown) => {
      const record: Delivered = {
        info: info as Delivered["info"],
        result: result as Delivered["result"],
      };
      delivered.push(record);
    },
    attachBotApi: () => {},
    __setBotApiForTest: () => {},
  });
  vi.doMock("../src/services/subagent-delivery.js", deliveryStubFactory);
});
|
|
66
|
+
|
|
67
|
+
afterEach(async () => {
  // Best-effort teardown: stop the watcher and reset its state. Any
  // failure here is intentionally ignored so it cannot mask the
  // result of the test that just ran.
  try {
    const { stopWatcher, __resetForTest } = await import(
      "../src/services/async-agent-watcher.js"
    );
    stopWatcher();
    __resetForTest();
  } catch {}
  // Never leak a per-test threshold override into the next test.
  delete process.env.ALVIN_MISSING_FILE_FAILURE_MS;
});
|
|
75
|
+
|
|
76
|
+
describe("watcher zombie fix (v4.14.2)", () => {
|
|
77
|
+
it("missing file younger than threshold stays pending (no premature fail)", async () => {
  // Default threshold applies (10 min); the entry is only 2 min old,
  // so the poll must leave it alone.
  const watcher = await import("../src/services/async-agent-watcher.js");
  const neverCreated = `${TEST_DATA_DIR}/nonexistent.jsonl`;
  watcher.registerPendingAgent({
    agentId: "young-zombie",
    outputFile: neverCreated,
    description: "young",
    prompt: "p",
    chatId: 1,
    userId: 1,
    toolUseId: null,
  });

  // Backdate the registered entry to 2 min ago.
  const entries = watcher.listPendingAgents();
  expect(entries).toHaveLength(1);
  const ageMs = 2 * 60_000;
  (entries[0] as { startedAt: number }).startedAt = Date.now() - ageMs;

  await watcher.pollOnce();

  // Nothing delivered, entry still tracked.
  expect(delivered).toHaveLength(0);
  expect(watcher.listPendingAgents()).toHaveLength(1);
});
|
|
99
|
+
|
|
100
|
+
it("missing file older than threshold delivers failed + removes from pending", async () => {
  // Shrink the threshold to 2 min for this test.
  process.env.ALVIN_MISSING_FILE_FAILURE_MS = "120000"; // 2 min for test
  const watcher = await import("../src/services/async-agent-watcher.js");
  const neverCreated = `${TEST_DATA_DIR}/never-appears.jsonl`;
  watcher.registerPendingAgent({
    agentId: "old-zombie",
    outputFile: neverCreated,
    description: "stuck crash zombie",
    prompt: "p",
    chatId: 1,
    userId: 1,
    toolUseId: null,
  });

  // Backdate 5 min, which exceeds the 2 min threshold.
  const entries = watcher.listPendingAgents();
  const ageMs = 5 * 60_000;
  (entries[0] as { startedAt: number }).startedAt = Date.now() - ageMs;

  await watcher.pollOnce();

  expect(delivered).toHaveLength(1);
  const [{ result }] = delivered;
  expect(result.status).toBe("error");
  // The message must tell the user what actually went wrong.
  expect(result.error).toMatch(/output file|never appeared|never wrote/i);
  expect(watcher.listPendingAgents()).toHaveLength(0);
});
|
|
124
|
+
|
|
125
|
+
it("default threshold is 10 min when env var is not set", async () => {
  const watcher = await import("../src/services/async-agent-watcher.js");
  watcher.registerPendingAgent({
    agentId: "at-default",
    outputFile: `${TEST_DATA_DIR}/z.jsonl`,
    description: "default threshold",
    prompt: "p",
    chatId: 1,
    userId: 1,
    toolUseId: null,
  });

  // Re-reads the pending list each time and moves the first entry's
  // start time `minutes` into the past.
  const backdateFirstEntry = (minutes: number): void => {
    const entries = watcher.listPendingAgents();
    (entries[0] as { startedAt: number }).startedAt = Date.now() - minutes * 60_000;
  };

  // 9 min old: still under the 10 min default, so it stays pending.
  backdateFirstEntry(9);
  await watcher.pollOnce();
  expect(delivered).toHaveLength(0);
  expect(watcher.listPendingAgents()).toHaveLength(1);

  // 11 min old: over the default, so the failure is delivered.
  backdateFirstEntry(11);
  await watcher.pollOnce();
  expect(delivered).toHaveLength(1);
});
|
|
149
|
+
|
|
150
|
+
it("running file (has content, no end_turn) is unaffected by zombie check", async () => {
  // A file WITH content should never trigger the missing-file path
  // regardless of age.
  const outPath = `${TEST_DATA_DIR}/running.jsonl`;
  fs.writeFileSync(
    outPath,
    JSON.stringify({
      type: "assistant",
      isSidechain: true,
      agentId: "x",
      message: {
        role: "assistant",
        content: [{ type: "tool_use", name: "Bash", input: {} }],
        stop_reason: "tool_use",
      },
    }) + "\n",
    "utf-8",
  );
  const mod = await import("../src/services/async-agent-watcher.js");
  mod.registerPendingAgent({
    agentId: "active-work",
    outputFile: outPath,
    description: "legitimately running",
    prompt: "p",
    chatId: 1,
    userId: 1,
    toolUseId: null,
  });
  const p = mod.listPendingAgents();
  (p[0] as { startedAt: number }).startedAt = Date.now() - 30 * 60_000; // 30 min old

  await mod.pollOnce();

  // v4.12.4 staleness detection COULD fire here because the file has
  // content and is stale. That's a different (benign) path. Either
  // way, the zombie-fix error path must NOT fire.
  //
  // The pattern must match the text the watcher really emits for a
  // zombie: "Dispatched subprocess never wrote its output file (...)".
  // The earlier pattern /output file never/i could not match that
  // wording, so this guard passed even when the zombie path fired.
  const zombieMessage = /never wrote its output file/i;
  const anyZombieError = delivered.some(
    (d) => typeof d.result.error === "string" && zombieMessage.test(d.result.error),
  );
  expect(anyZombieError).toBe(false);
});
|
|
192
|
+
|
|
193
|
+
it("completed file delivers as completed (unchanged)", async () => {
  // One assistant line that ends with a clean `end_turn`.
  const finishedLine = JSON.stringify({
    type: "assistant",
    agentId: "x",
    message: {
      content: [{ type: "text", text: "all good" }],
      stop_reason: "end_turn",
    },
  });
  const outPath = `${TEST_DATA_DIR}/done.jsonl`;
  fs.writeFileSync(outPath, finishedLine + "\n", "utf-8");

  const watcher = await import("../src/services/async-agent-watcher.js");
  watcher.registerPendingAgent({
    agentId: "done-agent",
    outputFile: outPath,
    description: "clean completion",
    prompt: "p",
    chatId: 1,
    userId: 1,
    toolUseId: null,
  });

  // Backdate 1h: old enough to trip the zombie check if it were
  // wrongly applied to a file that exists.
  const entries = watcher.listPendingAgents();
  const oneHourMs = 60 * 60_000;
  (entries[0] as { startedAt: number }).startedAt = Date.now() - oneHourMs;

  await watcher.pollOnce();

  expect(delivered).toHaveLength(1);
  const [{ result }] = delivered;
  expect(result.status).toBe("completed");
});
|
|
226
|
+
|
|
227
|
+
it("decrements session counter on zombie failure delivery", async () => {
  const sessionKey = "zombie-session";
  process.env.ALVIN_MISSING_FILE_FAILURE_MS = "1000"; // 1 sec for fast test

  // Seed the session with exactly one outstanding background agent.
  const { getSession } = await import("../src/services/session.js");
  const session = getSession(sessionKey);
  session.pendingBackgroundCount = 1;

  const watcher = await import("../src/services/async-agent-watcher.js");
  watcher.registerPendingAgent({
    agentId: "session-zombie",
    outputFile: `${TEST_DATA_DIR}/gone.jsonl`,
    description: "zombie for counter",
    prompt: "p",
    chatId: 1,
    userId: 1,
    toolUseId: null,
    sessionKey,
  });

  // Started 5 sec ago, which exceeds the 1 sec threshold.
  const entries = watcher.listPendingAgents();
  (entries[0] as { startedAt: number }).startedAt = Date.now() - 5000; // 5 sec ago, > 1sec threshold

  await watcher.pollOnce();

  // The failure is delivered and the session counter returns to zero.
  expect(delivered).toHaveLength(1);
  expect(session.pendingBackgroundCount).toBe(0);
});
|
|
252
|
+
});
|