alvin-bot 4.8.8 → 4.9.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -27,15 +27,13 @@ import { resolve } from "path";
27
27
  import os from "os";
28
28
  import { execSync } from "child_process";
29
29
  import { BOT_VERSION } from "../version.js";
30
+ import { decideBrakeAction, shouldResetCrashCounter, DEFAULTS, } from "./watchdog-brake.js";
30
31
  const DATA_DIR = process.env.ALVIN_DATA_DIR || resolve(os.homedir(), ".alvin-bot");
31
32
  const STATE_DIR = resolve(DATA_DIR, "state");
32
33
  const BEACON_FILE = resolve(STATE_DIR, "watchdog.json");
33
34
  const ALERT_FILE = resolve(STATE_DIR, "crash-loop.alert");
34
35
  const BEACON_INTERVAL_MS = 30_000; // write a beacon every 30 s
35
- const CRASH_WINDOW_MS = 10 * 60 * 1000; // 10 min crashes within this count toward the brake
36
- const CRASH_BRAKE_THRESHOLD = 10; // after this many crashes in the window, brake
37
- const STALE_BEACON_MS = 90_000; // a beacon older than this is considered "old enough that previous process really exited"
38
- const RECOVERY_UPTIME_MS = 5 * 60 * 1000; // 5 min of clean uptime resets the counter
36
+ // Thresholds and windows live in watchdog-brake.ts DEFAULTS.
39
37
  let beaconTimer = null;
40
38
  let resetTimer = null;
41
39
  let bootTime = 0;
@@ -57,7 +55,21 @@ function readBeacon() {
57
55
  typeof parsed.crashCount === "number" &&
58
56
  typeof parsed.crashWindowStart === "number" &&
59
57
  typeof parsed.version === "string") {
60
- return parsed;
58
+ // Older beacons don't have daily-counter fields — default them to
59
+ // 0/now so the brake logic treats this run as the start of the
60
+ // first daily window.
61
+ return {
62
+ lastBeat: parsed.lastBeat,
63
+ pid: parsed.pid,
64
+ bootTime: parsed.bootTime,
65
+ crashCount: parsed.crashCount,
66
+ crashWindowStart: parsed.crashWindowStart,
67
+ version: parsed.version,
68
+ dailyCrashCount: typeof parsed.dailyCrashCount === "number" ? parsed.dailyCrashCount : 0,
69
+ dailyCrashWindowStart: typeof parsed.dailyCrashWindowStart === "number"
70
+ ? parsed.dailyCrashWindowStart
71
+ : Date.now(),
72
+ };
61
73
  }
62
74
  return null;
63
75
  }
@@ -78,8 +90,9 @@ function writeAlert(reason, crashCount) {
78
90
  const content = [
79
91
  `Alvin Bot crash-loop brake hit at ${new Date().toISOString()}`,
80
92
  `Version: ${BOT_VERSION}`,
81
- `Crashes in the last ${CRASH_WINDOW_MS / 60_000} minutes: ${crashCount}`,
82
- `Threshold: ${CRASH_BRAKE_THRESHOLD}`,
93
+ `Crashes in the last ${DEFAULTS.SHORT_WINDOW_MS / 60_000} minutes: ${crashCount}`,
94
+ `Short-window threshold: ${DEFAULTS.SHORT_BRAKE_THRESHOLD}`,
95
+ `Daily threshold: ${DEFAULTS.DAILY_BRAKE_THRESHOLD}`,
83
96
  ``,
84
97
  `Reason: ${reason}`,
85
98
  ``,
@@ -147,36 +160,25 @@ export function startWatchdog() {
147
160
  ensureStateDir();
148
161
  bootTime = Date.now();
149
162
  const previous = readBeacon();
150
- let crashCount = 0;
151
- let crashWindowStart = bootTime;
163
+ const decision = decideBrakeAction(previous, bootTime);
164
+ if (decision.action === "brake") {
165
+ console.error(`[watchdog] crash-loop brake triggered: ${decision.reason}`);
166
+ writeAlert(decision.reason, previous?.crashCount ?? 0);
167
+ checkCrashLoopBrake();
168
+ // checkCrashLoopBrake calls process.exit — execution never reaches here.
169
+ return;
170
+ }
171
+ let crashCount = decision.crashCount;
172
+ let crashWindowStart = decision.crashWindowStart;
173
+ let dailyCrashCount = decision.dailyCrashCount;
174
+ let dailyCrashWindowStart = decision.dailyCrashWindowStart;
152
175
  if (previous) {
153
176
  const timeSinceLastBeat = bootTime - previous.lastBeat;
154
- const inWindow = bootTime - previous.crashWindowStart < CRASH_WINDOW_MS;
155
- if (timeSinceLastBeat < STALE_BEACON_MS) {
156
- // Previous process exited very recently → that's a crash (or a
157
- // graceful exit immediately followed by a restart, which we treat
158
- // the same way for the brake — the goal is to detect rapid cycles).
159
- if (inWindow) {
160
- crashCount = previous.crashCount + 1;
161
- crashWindowStart = previous.crashWindowStart;
162
- }
163
- else {
164
- // Previous crash was outside the window → reset counter
165
- crashCount = 1;
166
- }
167
- console.log(`[watchdog] detected restart after ${Math.round(timeSinceLastBeat / 1000)}s — crash ${crashCount}/${CRASH_BRAKE_THRESHOLD} in current ${CRASH_WINDOW_MS / 60_000}min window`);
168
- if (crashCount >= CRASH_BRAKE_THRESHOLD) {
169
- console.error(`[watchdog] crash-loop brake triggered (${crashCount} crashes in ${CRASH_WINDOW_MS / 60_000}min)`);
170
- writeAlert(`Process restarted ${crashCount} times within ${CRASH_WINDOW_MS / 60_000} minutes. Last beacon was ${Math.round(timeSinceLastBeat / 1000)}s ago. Most likely a deterministic crash on startup.`, crashCount);
171
- // Re-use the brake check to unload + exit cleanly
172
- checkCrashLoopBrake();
173
- }
174
- }
175
- else {
176
- // Previous beacon was old → process had clean uptime before exit,
177
- // OR system was rebooted between runs. Reset crash count.
178
- crashCount = 0;
179
- crashWindowStart = bootTime;
177
+ if (timeSinceLastBeat < DEFAULTS.STALE_BEACON_MS) {
178
+ console.log(`[watchdog] detected restart after ${Math.round(timeSinceLastBeat / 1000)}s — ` +
179
+ `crash ${crashCount}/${DEFAULTS.SHORT_BRAKE_THRESHOLD} in current ` +
180
+ `${DEFAULTS.SHORT_WINDOW_MS / 60_000}min window, ` +
181
+ `${dailyCrashCount}/${DEFAULTS.DAILY_BRAKE_THRESHOLD} in current 24h window`);
180
182
  }
181
183
  }
182
184
  // Write the first beacon immediately so a fresh restart updates the file
@@ -186,6 +188,8 @@ export function startWatchdog() {
186
188
  bootTime,
187
189
  crashCount,
188
190
  crashWindowStart,
191
+ dailyCrashCount,
192
+ dailyCrashWindowStart,
189
193
  version: BOT_VERSION,
190
194
  });
191
195
  // Periodic beacon writer
@@ -196,15 +200,20 @@ export function startWatchdog() {
196
200
  bootTime,
197
201
  crashCount,
198
202
  crashWindowStart,
203
+ dailyCrashCount,
204
+ dailyCrashWindowStart,
199
205
  version: BOT_VERSION,
200
206
  });
201
207
  }, BEACON_INTERVAL_MS);
202
- // Schedule a recovery counter reset after RECOVERY_UPTIME_MS of clean
203
- // uptime. If we make it that far without dying, the bot is healthy
204
- // again and we shouldn't penalize a future single crash.
208
+ // Schedule a recovery counter reset after RESET_AFTER_MS (1 h by default)
209
+ // of clean uptime. The old policy was 5 min too short because chronic
210
+ // crashes often had 5-10 min gaps and never tripped the brake.
205
211
  resetTimer = setTimeout(() => {
206
- if (crashCount > 0) {
207
- console.log(`[watchdog] ${RECOVERY_UPTIME_MS / 60_000}min clean uptime resetting crash counter from ${crashCount} to 0`);
212
+ const uptime = Date.now() - bootTime;
213
+ if (shouldResetCrashCounter(uptime) && crashCount > 0) {
214
+ console.log(`[watchdog] ${Math.round(uptime / 60_000)}min clean uptime — ` +
215
+ `resetting short-window crash counter from ${crashCount} to 0 ` +
216
+ `(daily counter ${dailyCrashCount} stays)`);
208
217
  crashCount = 0;
209
218
  crashWindowStart = Date.now();
210
219
  writeBeacon({
@@ -213,11 +222,16 @@ export function startWatchdog() {
213
222
  bootTime,
214
223
  crashCount,
215
224
  crashWindowStart,
225
+ dailyCrashCount,
226
+ dailyCrashWindowStart,
216
227
  version: BOT_VERSION,
217
228
  });
218
229
  }
219
- }, RECOVERY_UPTIME_MS);
220
- console.log(`[watchdog] started — beacon every ${BEACON_INTERVAL_MS / 1000}s, brake at ${CRASH_BRAKE_THRESHOLD} crashes per ${CRASH_WINDOW_MS / 60_000}min, recovery after ${RECOVERY_UPTIME_MS / 60_000}min uptime`);
230
+ }, DEFAULTS.RESET_AFTER_MS);
231
+ console.log(`[watchdog] started — beacon every ${BEACON_INTERVAL_MS / 1000}s, ` +
232
+ `brake at ${DEFAULTS.SHORT_BRAKE_THRESHOLD} crashes / ${DEFAULTS.SHORT_WINDOW_MS / 60_000}min ` +
233
+ `or ${DEFAULTS.DAILY_BRAKE_THRESHOLD} / 24h, ` +
234
+ `recovery after ${DEFAULTS.RESET_AFTER_MS / 60_000}min uptime`);
221
235
  }
222
236
  /**
223
237
  * Stop the watchdog cleanly. Called from the shutdown handler in
@@ -0,0 +1,109 @@
1
+ /**
2
+ * Console formatter — adds ISO timestamps to every console.log /
3
+ * console.warn / console.error call, and drops high-volume noise
4
+ * (libsignal session dumps, Claude CLI native-binary banner).
5
+ *
6
+ * Installed once at bootstrap time from src/index.ts. Idempotent.
7
+ *
8
+ * Why not pino / winston: those pull in several MB of deps and change
9
+ * the call-site ergonomics. Every caller in the bot today uses plain
10
+ * `console.log`; monkey-patching those is a 40-line change instead of
11
+ * a refactor of every file.
12
+ */
13
+ import util from "node:util";
14
+ let snapshot = null;
15
+ /**
16
+ * Noise patterns from production logs that fill out.log/err.log with
17
+ * tens of KB per day without carrying useful signal. Added sparingly —
18
+ * every entry here is a line a human will never need to grep for.
19
+ */
20
+ const NOISE_PATTERNS = [
21
+ // libsignal session dump header — the multi-line body following this
22
+ // line is silenced by the first-line detector below.
23
+ /^Closing session: SessionEntry \{/,
24
+ // libsignal prekey bundle swap notification
25
+ /^Closing open session in favor of incoming prekey bundle/,
26
+ // Claude CLI startup banner — spammed once per query
27
+ /^\[claude\] Native binary: /,
28
+ // libsignal Bad MAC — session desync, harmless, repeats endlessly
29
+ /^Session error:Error: Bad MAC Error: Bad MAC/,
30
+ ];
31
+ /** Exported for testing. */
32
+ export function isNoisyLine(line) {
33
+ return NOISE_PATTERNS.some((re) => re.test(line));
34
+ }
35
+ /**
36
+ * Track whether we're currently inside a libsignal multi-line dump. The
37
+ * dumps look like `Closing session: SessionEntry {` followed by several
38
+ * lines of buffer hex, closing with `}`. We swallow everything from the
39
+ * opening brace to its matching `}` line.
40
+ */
41
+ let suppressDepth = 0;
42
+ function shouldSuppress(raw) {
43
+ const line = raw.trimEnd();
44
+ if (suppressDepth > 0) {
45
+ // Inside a multi-line dump — count braces on this line. The dumps
46
+ // only contain ASCII braces in the structural positions, so this
47
+ // is safe enough for production noise.
48
+ const opens = (line.match(/\{/g) || []).length;
49
+ const closes = (line.match(/\}/g) || []).length;
50
+ suppressDepth += opens;
51
+ suppressDepth -= closes;
52
+ if (suppressDepth < 0)
53
+ suppressDepth = 0;
54
+ return true;
55
+ }
56
+ if (isNoisyLine(line)) {
57
+ // If the noisy header opens a block, start suppressing its body.
58
+ const opens = (line.match(/\{/g) || []).length;
59
+ const closes = (line.match(/\}/g) || []).length;
60
+ suppressDepth = Math.max(0, opens - closes);
61
+ return true;
62
+ }
63
+ return false;
64
+ }
65
+ function formatWithTimestamp(method, stream) {
66
+ return (...args) => {
67
+ // Render args the same way console does — util.format handles %s / %d / objects.
68
+ const text = renderArgs(args);
69
+ if (shouldSuppress(text))
70
+ return;
71
+ const stamp = new Date().toISOString();
72
+ // Write directly to the stream so we don't recurse through console.
73
+ stream.write(`${stamp} ${text}\n`);
74
+ void method; // keep original ref alive for uninstall
75
+ };
76
+ }
77
+ function renderArgs(args) {
78
+ // Use Node's built-in util.format — it matches console.* exactly.
79
+ return util.format(...args);
80
+ }
81
+ /**
82
+ * Install timestamp + noise-filter formatters on console.log/warn/info/error.
83
+ * Safe to call multiple times.
84
+ */
85
+ export function installConsoleFormatter() {
86
+ if (snapshot)
87
+ return; // already installed
88
+ snapshot = {
89
+ log: console.log.bind(console),
90
+ warn: console.warn.bind(console),
91
+ error: console.error.bind(console),
92
+ info: console.info.bind(console),
93
+ };
94
+ console.log = formatWithTimestamp(snapshot.log, process.stdout);
95
+ console.info = formatWithTimestamp(snapshot.info, process.stdout);
96
+ console.warn = formatWithTimestamp(snapshot.warn, process.stderr);
97
+ console.error = formatWithTimestamp(snapshot.error, process.stderr);
98
+ }
99
+ /** Restore the original console methods. Used by tests + shutdown. */
100
+ export function uninstallConsoleFormatter() {
101
+ if (!snapshot)
102
+ return;
103
+ console.log = snapshot.log;
104
+ console.info = snapshot.info;
105
+ console.warn = snapshot.warn;
106
+ console.error = snapshot.error;
107
+ snapshot = null;
108
+ suppressDepth = 0;
109
+ }
@@ -0,0 +1,24 @@
1
+ /**
2
+ * Trailing-edge debounce. Delays `fn` until `waitMs` has elapsed since
3
+ * the most recent call. Coalesces bursts into a single invocation with
4
+ * the most recent arguments.
5
+ *
6
+ * Used by fs.watch consumers (skills, plugins) where macOS FSEvents
7
+ * delivers many duplicate events for a single logical change.
8
+ */
9
+ export function debounce(fn, waitMs) {
10
+ let timer = null;
11
+ let lastArgs = null;
12
+ return function debounced(...args) {
13
+ lastArgs = args;
14
+ if (timer)
15
+ clearTimeout(timer);
16
+ timer = setTimeout(() => {
17
+ timer = null;
18
+ const call = lastArgs;
19
+ lastArgs = null;
20
+ if (call)
21
+ fn(...call);
22
+ }, waitMs);
23
+ };
24
+ }
@@ -0,0 +1,62 @@
1
+ /**
2
+ * Telegram error filter — single source of truth for "which grammy
3
+ * errors are harmless and should never reach the end user as a
4
+ * 'Fehler: ...' reply."
5
+ *
6
+ * Context: grammy's Bot API wrapper surfaces these as plain Error
7
+ * objects with the description baked into `.message`. Some call sites
8
+ * (live-stream edit races, callback-answer races after a modal was
9
+ * already dismissed, message-to-edit-gone races when the user just
10
+ * deleted the message) produce errors that are 100% benign — they
11
+ * just mean the UI state we were about to write is already there.
12
+ *
13
+ * This file centralises the list so we can update one regex and have
14
+ * the filter apply everywhere. Used by bot.catch(), by the streaming
15
+ * `telegram.ts` finalize path, by handlers/message.ts, and by any
16
+ * future caller that needs to decide "report this to the user or
17
+ * drop it silently."
18
+ */
19
+ const HARMLESS_PATTERNS = [
20
+ // The big one — live-stream edit races
21
+ /message is not modified/i,
22
+ /specified new message content and reply markup are exactly the same/i,
23
+ // Callback-answer race: the user tapped a stale inline button
24
+ /query is too old and response timeout expired/i,
25
+ /query ID is invalid/i,
26
+ // The user deleted the message we were about to edit
27
+ /message to edit not found/i,
28
+ /message to delete not found/i,
29
+ /MESSAGE_ID_INVALID/i,
30
+ ];
31
+ /**
32
+ * True if the error is one of the known-harmless Telegram races.
33
+ * Accepts Error objects, grammy's GrammyError (which has an additional
34
+ * `description` field), and plain strings. `null` / `undefined` return
35
+ * false so callers can use this directly in catch blocks.
36
+ */
37
+ export function isHarmlessTelegramError(err) {
38
+ if (err === null || err === undefined)
39
+ return false;
40
+ let haystack = "";
41
+ if (typeof err === "string") {
42
+ haystack = err;
43
+ }
44
+ else if (err instanceof Error) {
45
+ haystack = err.message || "";
46
+ // grammy's GrammyError carries the server's reason on .description
47
+ const desc = err.description;
48
+ if (typeof desc === "string")
49
+ haystack += " " + desc;
50
+ }
51
+ else if (typeof err === "object") {
52
+ // Plain object — look for message/description fields
53
+ const obj = err;
54
+ if (typeof obj.message === "string")
55
+ haystack += obj.message;
56
+ if (typeof obj.description === "string")
57
+ haystack += " " + obj.description;
58
+ }
59
+ if (!haystack)
60
+ return false;
61
+ return HARMLESS_PATTERNS.some((re) => re.test(haystack));
62
+ }
@@ -31,6 +31,9 @@ import { BOT_ROOT, ENV_FILE, PUBLIC_DIR, MEMORY_DIR, MEMORY_FILE, SOUL_FILE, DAT
31
31
  import { broadcast } from "../services/broadcast.js";
32
32
  import { BOT_VERSION } from "../version.js";
33
33
  const WEB_PORT = parseInt(process.env.WEB_PORT || "3100");
34
+ /** Module-scope reference to the WebSocket server so stopWebServer() can
35
+ * tear it down together with the HTTP server. Set inside startWebServer(). */
36
+ let wsServerRef = null;
34
37
  const WEB_PASSWORD = process.env.WEB_PASSWORD || "";
35
38
  /** The actual port the Web UI is running on (may differ from WEB_PORT if busy). */
36
39
  let actualWebPort = WEB_PORT;
@@ -1426,6 +1429,7 @@ export function startWebServer() {
1426
1429
  });
1427
1430
  });
1428
1431
  const wss = new WebSocketServer({ server });
1432
+ wsServerRef = wss;
1429
1433
  handleWebSocket(wss);
1430
1434
  // Smart port: try WEB_PORT, increment if busy (up to +20)
1431
1435
  const MAX_TRIES = 20;
@@ -1449,6 +1453,58 @@ export function startWebServer() {
1449
1453
  tryListen(WEB_PORT);
1450
1454
  return server;
1451
1455
  }
1456
+ /**
1457
+ * Gracefully stop the web server so the port is released.
1458
+ *
1459
+ * Why this exists: `shutdown()` in src/index.ts used to stop grammy and the
1460
+ * scheduler but leave the HTTP server listening. macOS then held the
1461
+ * listening socket in the socket table, so launchd's next boot of the bot
1462
+ * hit `EADDRINUSE :::3100`, threw an Uncaught exception and crash-looped.
1463
+ *
1464
+ * What this does:
1465
+ * 1. Force-close idle keep-alive sockets (otherwise close() hangs on them).
1466
+ * 2. Force-close active open requests (long-poll clients, WebSocket
1467
+ * upgrades that never completed).
1468
+ * 3. Tear down the WebSocket server so its own sockets don't linger.
1469
+ * 4. Await `server.close()` so the listening socket is truly released
1470
+ * before the caller's shutdown continues.
1471
+ *
1472
+ * Safe to call multiple times; no-op when the server is already closed or
1473
+ * never listened. Never throws.
1474
+ */
1475
+ export async function stopWebServer(server) {
1476
+ try {
1477
+ if (wsServerRef) {
1478
+ for (const client of wsServerRef.clients) {
1479
+ try {
1480
+ client.terminate();
1481
+ }
1482
+ catch { /* ignore */ }
1483
+ }
1484
+ await new Promise((resolve) => wsServerRef.close(() => resolve()));
1485
+ wsServerRef = null;
1486
+ }
1487
+ }
1488
+ catch { /* ignore */ }
1489
+ if (!server.listening)
1490
+ return;
1491
+ try {
1492
+ // Node 18.2+ APIs — break any keep-alive / long-poll stalls so
1493
+ // server.close() can actually resolve.
1494
+ const s = server;
1495
+ if (typeof s.closeIdleConnections === "function")
1496
+ s.closeIdleConnections();
1497
+ if (typeof s.closeAllConnections === "function")
1498
+ s.closeAllConnections();
1499
+ }
1500
+ catch { /* ignore */ }
1501
+ await new Promise((resolve) => {
1502
+ // close() callback fires with an Error arg when the server wasn't
1503
+ // listening — we just resolve in either case. The caller only cares
1504
+ // that the port is free when this awaits.
1505
+ server.close(() => resolve());
1506
+ });
1507
+ }
1452
1508
  /** Get the actual port the Web UI is running on. */
1453
1509
  export function getWebPort() {
1454
1510
  return actualWebPort;
package/package.json CHANGED
@@ -1,6 +1,6 @@
1
1
  {
2
2
  "name": "alvin-bot",
3
- "version": "4.8.8",
3
+ "version": "4.9.0",
4
4
  "description": "Alvin Bot — Your personal AI agent on Telegram, WhatsApp, Discord, Signal, and Web.",
5
5
  "type": "module",
6
6
  "main": "dist/index.js",