alvin-bot 4.14.1 → 4.14.2

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
package/CHANGELOG.md CHANGED
@@ -2,6 +2,49 @@
2
2
 
3
3
  All notable changes to Alvin Bot are documented here.
4
4
 
5
+ ## [4.14.2] — 2026-04-16
6
+
7
+ ### 🐛 Patch: watcher zombie-entry fix (missing outputFile > 10 min = failed)
8
+
9
+ **Edge case Ali caught today:** a pending async-agent entry stuck in `/subagents list` for 3+ hours showing "running" — but the underlying `alvin_dispatch_agent` subprocess had already died (its output file was gone). The entry would have continued haunting the list until the 12-hour `giveUpAt` ceiling fired.
10
+
11
+ **Root cause:** `async-agent-watcher`'s `pollOnce` handled four states from `parseOutputFileStatus` — `completed` / `failed` / `running` / `missing`. For `missing` (file doesn't exist or is empty), the watcher just kept polling forever, on the assumption that a slow subprocess might eventually write. If the subprocess crashed before writing ANY output, the file never appeared, and we polled for 12 hours before timing out.
12
+
13
+ **Fix:** when `status.state === "missing"` AND `now - entry.startedAt` exceeds the threshold returned by `getMissingFileFailureMs()` (default 10 min, configurable via the `ALVIN_MISSING_FILE_FAILURE_MS` env var), the watcher delivers the entry as failed with an explicit message:
14
+
15
+ > *Dispatched subprocess never wrote its output file (N m after start). Likely crashed before initializing, or the file was removed externally.*
16
+
17
+ 10 minutes is well above any legitimate `claude -p` startup variance (normal first-write latency is seconds) and well below the 12-hour hard ceiling.
18
+
19
+ ### What's preserved (regression-guard tested)
20
+
21
+ - Running agents (file has content but no `end_turn`/`result` yet) are untouched by this path — they still keep polling as before.
22
+ - Completed agents (clean `end_turn` or `stream-json result` event) still deliver normally.
23
+ - Explicit `failed` state from the parser (if ever used) still delivers error normally.
24
+ - v4.12.4's "file is stale but has text → deliver partial" path takes precedence over the new zombie check (the file has content, so not "missing").
25
+ - 12-hour `giveUpAt` hard ceiling still applies as the ultimate safety net.
26
+ - Session's `pendingBackgroundCount` decrement fires on zombie failure, same as every other delivery path.
27
+
28
+ ### Testing
29
+
30
+ - **Baseline**: 498 tests (v4.14.1)
31
+ - **New**: `test/watcher-zombie-fix.test.ts` — 6 tests:
32
+ - Young missing file (<threshold) stays pending
33
+ - Old missing file (>threshold) delivers failed + removes from pending
34
+ - Default threshold is 10 min when env var unset
35
+ - Running file (has content) is unaffected by zombie check
36
+ - Completed file delivers as completed (regression guard)
37
+ - Session's `pendingBackgroundCount` decrements on zombie delivery
38
+ - **Total**: 504 tests, all green, TSC clean
39
+
40
+ ### Files changed
41
+
42
+ - **Modified**: `src/services/async-agent-watcher.ts` (new `getMissingFileFailureMs()` + zombie branch in `pollOnce`)
43
+ - **NEW tests**: `test/watcher-zombie-fix.test.ts`
44
+ - **Version**: `package.json` 4.14.1 → 4.14.2
45
+
46
+ ---
47
+
5
48
  ## [4.14.1] — 2026-04-16
6
49
 
7
50
  ### 🐛 Patch: `/subagents list` now shows v4.13+ dispatch agents too
@@ -33,6 +33,31 @@ const POLL_INTERVAL_MS = 15_000;
33
33
  * a timeout banner. SEO audits historically take ~13 min, so 12h
34
34
  * is absurdly generous and protects against state-file growth. */
35
35
  const MAX_AGENT_AGE_MS = 12 * 60 * 60 * 1000;
36
/**
 * v4.14.2 — Upper bound, in milliseconds, on how long an agent's output
 * file may stay "missing" before the watcher treats the agent as failed.
 *
 * A dispatched subprocess that never creates its outputFile (spawn
 * failure, crash before the first write, file deleted externally) makes
 * `parseOutputFileStatus` report "missing" on every poll. Before v4.14.2
 * such an entry lingered until the 12h MAX_AGENT_AGE_MS ceiling — a
 * 12-hour zombie in `/subagents list`. `claude -p` normally writes its
 * first JSONL line within seconds, so the 10-minute default sits far
 * above legitimate startup variance and far below that ceiling.
 *
 * The ALVIN_MISSING_FILE_FAILURE_MS env var overrides the default (tests
 * use it to shorten the window). It is re-read on every call so callers
 * never observe a stale value.
 *
 * @returns {number} The override when it parses to a finite number
 *   greater than zero; otherwise 600000 (10 minutes).
 */
function getMissingFileFailureMs() {
    const fallbackMs = 10 * 60 * 1000; // default 10 min
    const override = process.env.ALVIN_MISSING_FILE_FAILURE_MS;
    // Unset or empty → nothing to parse.
    if (!override) {
        return fallbackMs;
    }
    const parsed = Number(override);
    // Reject NaN, ±Infinity, zero and negatives; they are not usable windows.
    const usable = Number.isFinite(parsed) && parsed > 0;
    return usable ? parsed : fallbackMs;
}
36
61
  // ── Module state ──────────────────────────────────────────────────
37
62
  const pending = new Map();
38
63
  let pollTimer = null;
@@ -139,6 +164,7 @@ export function stopWatcher() {
139
164
  export async function pollOnce() {
140
165
  const now = Date.now();
141
166
  const toRemove = [];
167
+ const missingFileFailureMs = getMissingFileFailureMs();
142
168
  for (const entry of pending.values()) {
143
169
  entry.lastCheckedAt = now;
144
170
  // Timeout check first — if the agent is past its giveUpAt, give up
@@ -157,7 +183,15 @@ export async function pollOnce() {
157
183
  await deliverAsFailure(entry, "error", status.error);
158
184
  toRemove.push(entry.agentId);
159
185
  }
160
- // running / missing → keep polling next cycle
186
+ else if (status.state === "missing" &&
187
+ now - entry.startedAt > missingFileFailureMs) {
188
+ // v4.14.2 — Zombie guard: the subprocess never created its
189
+ // output file within `missingFileFailureMs` (default 10 min).
190
+ // Declare failed instead of polling until the 12h giveUpAt.
191
+ await deliverAsFailure(entry, "error", `Dispatched subprocess never wrote its output file (${Math.round((now - entry.startedAt) / 60_000)}m after start). Likely crashed before initializing, or the file was removed externally.`);
192
+ toRemove.push(entry.agentId);
193
+ }
194
+ // running / missing-but-young → keep polling next cycle
161
195
  }
162
196
  if (toRemove.length > 0) {
163
197
  for (const id of toRemove)
package/package.json CHANGED
@@ -1,6 +1,6 @@
1
1
  {
2
2
  "name": "alvin-bot",
3
- "version": "4.14.1",
3
+ "version": "4.14.2",
4
4
  "description": "Alvin Bot \u2014 Your personal AI agent on Telegram, WhatsApp, Discord, Signal, and Web.",
5
5
  "type": "module",
6
6
  "main": "dist/index.js",
@@ -0,0 +1,252 @@
1
+ /**
2
+ * v4.14.2 — zombie-entry fix for async-agent-watcher.
3
+ *
4
+ * Problem: when the dispatched `claude -p` subprocess never produces
5
+ * its outputFile (crashed before the first write, spawn failed, file
6
+ * got deleted externally), `parseOutputFileStatus` returns "missing"
7
+ * on every poll. The watcher keeps polling forever until `giveUpAt`
8
+ * (12 hours) fires, then delivers a timeout banner. Meanwhile the
9
+ * entry hangs in `/subagents list` as a permanent "running" zombie.
10
+ *
11
+ * Fix: when status is "missing" for longer than
12
+ * the `ALVIN_MISSING_FILE_FAILURE_MS` threshold (default 10 min), the
13
+ * watcher declares the agent failed with a clear "never wrote its
14
+ * output file" reason, delivers the failure banner, and removes the
15
+ * entry. 10 minutes is well above normal startup variance (seconds)
16
+ * and well below the 12h hard ceiling.
17
+ *
18
+ * Invariants preserved:
19
+ * - An agent whose output file DOES appear, even slowly, continues
20
+ * normally (missing on first poll, running on second, completed
21
+ * on third — same as v4.14.1).
22
+ * - The `completed` path (end_turn or stream-json result) is
23
+ * unchanged.
24
+ * - The `failed` path (existing "error" state from parser) is
25
+ * unchanged.
26
+ * - The 12h giveUpAt ceiling still applies — it's now just less
27
+ * likely to be hit because missing-file zombies resolve earlier.
28
+ */
29
+ import { describe, it, expect, beforeEach, afterEach, vi } from "vitest";
30
+ import fs from "fs";
31
+ import os from "os";
32
+ import { resolve } from "path";
33
+
34
// Scratch directory for this test file, placed under the OS temp dir.
// The pid + timestamp suffix keeps separate runs from sharing a path.
const TEST_DATA_DIR = resolve(
  os.tmpdir(),
  `alvin-zombie-${process.pid}-${Date.now()}`,
);

/** Shape of one call captured by the mocked `deliverSubAgentResult`. */
interface Delivered {
  info: { name: string; status: string };
  result: { status: string; output: string; error?: string };
}
// Deliveries recorded so far in the current test; reset in `beforeEach`.
let delivered: Delivered[] = [];
44
+
45
beforeEach(async () => {
  delivered = [];

  // Start from an empty scratch directory. `force: true` makes the removal
  // a no-op when the directory does not exist yet.
  fs.rmSync(TEST_DATA_DIR, { recursive: true, force: true });
  fs.mkdirSync(TEST_DATA_DIR, { recursive: true });

  process.env.ALVIN_DATA_DIR = TEST_DATA_DIR;
  // Reset the env override between tests
  delete process.env.ALVIN_MISSING_FILE_FAILURE_MS;

  // Drop cached modules so each test imports a fresh watcher that is wired
  // to the recording delivery mock registered below.
  vi.resetModules();
  const recordDelivery = async (info: unknown, result: unknown) => {
    const captured: Delivered = {
      info: info as Delivered["info"],
      result: result as Delivered["result"],
    };
    delivered.push(captured);
  };
  vi.doMock("../src/services/subagent-delivery.js", () => ({
    deliverSubAgentResult: recordDelivery,
    attachBotApi: () => {},
    __setBotApiForTest: () => {},
  }));
});
66
+
67
afterEach(async () => {
  try {
    const watcher = await import("../src/services/async-agent-watcher.js");
    watcher.stopWatcher();
    watcher.__resetForTest();
  } catch {
    // Best-effort teardown: if the watcher module cannot be imported or
    // reset, there is nothing left to stop, and a cleanup hiccup must not
    // mask the real test result.
  }
  delete process.env.ALVIN_MISSING_FILE_FAILURE_MS;
  // Remove the scratch directory so runs do not leave `alvin-zombie-*`
  // folders behind in the OS temp dir. `beforeEach` recreates it, and
  // `force: true` tolerates a directory that is already gone.
  fs.rmSync(TEST_DATA_DIR, { recursive: true, force: true });
});
75
+
76
+ describe("watcher zombie fix (v4.14.2)", () => {
77
it("missing file younger than threshold stays pending (no premature fail)", async () => {
  // With no env override the 10-minute default applies, so an agent that
  // is only 2 minutes old and has no output file must be left alone.
  const watcher = await import("../src/services/async-agent-watcher.js");
  const youngAgent = {
    agentId: "young-zombie",
    outputFile: `${TEST_DATA_DIR}/nonexistent.jsonl`,
    description: "young",
    prompt: "p",
    chatId: 1,
    userId: 1,
    toolUseId: null,
  };
  watcher.registerPendingAgent(youngAgent);

  // Backdate the registered entry to 2 minutes ago.
  const entries = watcher.listPendingAgents();
  expect(entries).toHaveLength(1);
  const twoMinutesAgo = Date.now() - 2 * 60_000;
  (entries[0] as { startedAt: number }).startedAt = twoMinutesAgo;

  await watcher.pollOnce();

  expect(delivered).toHaveLength(0);
  expect(watcher.listPendingAgents()).toHaveLength(1);
});
99
+
100
it("missing file older than threshold delivers failed + removes from pending", async () => {
  // Shrink the window to 2 minutes, then make the agent 5 minutes old.
  process.env.ALVIN_MISSING_FILE_FAILURE_MS = "120000"; // 2 min for test
  const watcher = await import("../src/services/async-agent-watcher.js");
  const staleAgent = {
    agentId: "old-zombie",
    outputFile: `${TEST_DATA_DIR}/never-appears.jsonl`,
    description: "stuck crash zombie",
    prompt: "p",
    chatId: 1,
    userId: 1,
    toolUseId: null,
  };
  watcher.registerPendingAgent(staleAgent);

  const entries = watcher.listPendingAgents();
  const fiveMinutesAgo = Date.now() - 5 * 60_000;
  (entries[0] as { startedAt: number }).startedAt = fiveMinutesAgo;

  await watcher.pollOnce();

  expect(delivered).toHaveLength(1);
  const failure = delivered[0].result;
  expect(failure.status).toBe("error");
  // The message must be explicit enough for the user to understand why.
  expect(failure.error).toMatch(/output file|never appeared|never wrote/i);
  expect(watcher.listPendingAgents()).toHaveLength(0);
});
124
+
125
it("default threshold is 10 min when env var is not set", async () => {
  const watcher = await import("../src/services/async-agent-watcher.js");
  watcher.registerPendingAgent({
    agentId: "at-default",
    outputFile: `${TEST_DATA_DIR}/z.jsonl`,
    description: "default threshold",
    prompt: "p",
    chatId: 1,
    userId: 1,
    toolUseId: null,
  });

  // Re-fetch the pending list each time and rewrite the first entry's
  // start time so it appears `minutes` old.
  const backdateFirstEntry = (minutes: number) => {
    const entries = watcher.listPendingAgents();
    (entries[0] as { startedAt: number }).startedAt = Date.now() - minutes * 60_000;
  };

  // 9 minutes old: still under the 10-minute default, so it stays pending.
  backdateFirstEntry(9);
  await watcher.pollOnce();
  expect(delivered).toHaveLength(0);
  expect(watcher.listPendingAgents()).toHaveLength(1);

  // 11 minutes old: past the default, so the failure must be delivered.
  backdateFirstEntry(11);
  await watcher.pollOnce();
  expect(delivered).toHaveLength(1);
});
149
+
150
it("running file (has content, no end_turn) is unaffected by zombie check", async () => {
  // A file WITH content should never trigger the missing-file path
  // regardless of age.
  const outPath = `${TEST_DATA_DIR}/running.jsonl`;
  fs.writeFileSync(
    outPath,
    JSON.stringify({
      type: "assistant",
      isSidechain: true,
      agentId: "x",
      message: {
        role: "assistant",
        content: [{ type: "tool_use", name: "Bash", input: {} }],
        stop_reason: "tool_use",
      },
    }) + "\n",
    "utf-8",
  );
  const mod = await import("../src/services/async-agent-watcher.js");
  mod.registerPendingAgent({
    agentId: "active-work",
    outputFile: outPath,
    description: "legitimately running",
    prompt: "p",
    chatId: 1,
    userId: 1,
    toolUseId: null,
  });
  const p = mod.listPendingAgents();
  (p[0] as { startedAt: number }).startedAt = Date.now() - 30 * 60_000; // 30 min old

  await mod.pollOnce();

  // v4.12.4 staleness detection COULD fire here because the file is
  // stale. That's a different (benign) path. Either way, the zombie-fix
  // error path must NOT fire.
  //
  // The pattern has to match the watcher's actual wording ("Dispatched
  // subprocess never wrote its output file ..."). The earlier pattern
  // /output file never/ could not match that sentence, so this guard
  // passed even when the zombie path fired.
  const anyZombieError = delivered.some(
    (d) =>
      typeof d.result.error === "string" &&
      /never wrote its output file/i.test(d.result.error),
  );
  expect(anyZombieError).toBe(false);
});
192
+
193
it("completed file delivers as completed (unchanged)", async () => {
  // One assistant record that ends with `end_turn`, i.e. a clean finish.
  const finishedRecord = {
    type: "assistant",
    agentId: "x",
    message: {
      content: [{ type: "text", text: "all good" }],
      stop_reason: "end_turn",
    },
  };
  const outPath = `${TEST_DATA_DIR}/done.jsonl`;
  fs.writeFileSync(outPath, JSON.stringify(finishedRecord) + "\n", "utf-8");

  const watcher = await import("../src/services/async-agent-watcher.js");
  watcher.registerPendingAgent({
    agentId: "done-agent",
    outputFile: outPath,
    description: "clean completion",
    prompt: "p",
    chatId: 1,
    userId: 1,
    toolUseId: null,
  });

  // Make the entry an hour old: the zombie guard would fire here if it
  // were wrongly applied to a file that has content.
  const entries = watcher.listPendingAgents();
  const oneHourAgo = Date.now() - 60 * 60_000;
  (entries[0] as { startedAt: number }).startedAt = oneHourAgo;

  await watcher.pollOnce();

  expect(delivered).toHaveLength(1);
  expect(delivered[0].result.status).toBe("completed");
});
226
+
227
it("decrements session counter on zombie failure delivery", async () => {
  process.env.ALVIN_MISSING_FILE_FAILURE_MS = "1000"; // 1 sec for fast test

  // Seed the session with one outstanding background agent.
  const { getSession } = await import("../src/services/session.js");
  const session = getSession("zombie-session");
  session.pendingBackgroundCount = 1;

  const watcher = await import("../src/services/async-agent-watcher.js");
  watcher.registerPendingAgent({
    agentId: "session-zombie",
    outputFile: `${TEST_DATA_DIR}/gone.jsonl`,
    description: "zombie for counter",
    prompt: "p",
    chatId: 1,
    userId: 1,
    toolUseId: null,
    sessionKey: "zombie-session",
  });

  // 5 seconds old, which is past the 1-second window configured above.
  const entries = watcher.listPendingAgents();
  const fiveSecondsAgo = Date.now() - 5000;
  (entries[0] as { startedAt: number }).startedAt = fiveSecondsAgo;

  await watcher.pollOnce();

  expect(delivered).toHaveLength(1);
  expect(session.pendingBackgroundCount).toBe(0);
});
252
+ });