alvin-bot 5.1.4 → 5.1.5

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
package/CHANGELOG.md CHANGED
@@ -2,6 +2,21 @@
2
2
 
3
3
  All notable changes to Alvin Bot are documented here.
4
4
 
5
+ ## [5.1.5] — 2026-05-15
6
+
7
+ ### Health monitor no longer cries wolf about its own log lines
8
+
9
+ The bot watches its own error and crash counts over 24 hours and warns you if they keep climbing. It turned out that monitor was largely measuring itself. v5.1.4 fixed one harmless thing being mislabelled as an error; this release fixes the root cause so the whole class of false alarms stops.
10
+
11
+ Two things were wrong:
12
+
13
+ - **"Errors" counted every line written to the error log — even harmless ones.** Some parts of the bot deliberately write benign status notes there (a self-healing message-format retry; the alert system logging whether its own notification went out). Every one of those inflated the error count, including the alert system flagging its own output — a loop that kept the warning alive no matter what. The error count now ignores these known-harmless notes and still counts every real error, including ones it has never seen before.
14
+ - **Every intentional restart was counted as a crash.** When the bot updates itself or you run `/update` or `/restart`, it exits on purpose and is immediately relaunched. The crash detector saw the quick exit and scored it as a crash, so simply shipping updates made the crash graph creep upward and trip the alarm. Planned restarts are now recognised as planned and no longer counted as crashes.
15
+
16
+ ### What this means for you
17
+
18
+ If you saw a "trend anomaly: errors/crashes steadily climbing" alert shortly after updating, that was the monitor reacting to the update itself, not a real regression. After this release the trend reflects reality. No action needed — update as usual.
19
+
5
20
  ## [5.1.4] — 2026-05-15
6
21
 
7
22
  ### No more false "errors are climbing" health alerts
@@ -27,6 +27,7 @@ import { BOT_VERSION } from "../version.js";
27
27
  import { getWebPort } from "../web/server.js";
28
28
  import { getUsageSummary, getAllRateLimits, formatTokens } from "../services/usage-tracker.js";
29
29
  import { runUpdate, getAutoUpdate, setAutoUpdate, startAutoUpdateLoop } from "../services/updater.js";
30
+ import { markExpectedRestart } from "../services/watchdog.js";
30
31
  import { getReleaseHighlights } from "../services/release-highlights.js";
31
32
  import { runCleanup, getCleanupPolicy } from "../services/disk-cleanup.js";
32
33
  import { getHealthStatus, isFailedOver } from "../services/heartbeat.js";
@@ -1912,6 +1913,8 @@ export function registerCommands(bot) {
1912
1913
  bot.command("restart", async (ctx) => {
1913
1914
  const lang = getSession(ctx.from.id).language;
1914
1915
  await ctx.reply(t("bot.restart.triggered", lang));
1916
+ // Intentional restart — don't let the watchdog score it as a crash.
1917
+ markExpectedRestart();
1915
1918
  // Small delay so the Telegram message is actually delivered before exit
1916
1919
  setTimeout(() => process.exit(0), 500);
1917
1920
  });
@@ -1936,6 +1939,8 @@ export function registerCommands(bot) {
1936
1939
  }
1937
1940
  if (result.requiresRestart) {
1938
1941
  await ctx.reply(t("bot.update.restarting", lang));
1942
+ // Intentional restart — don't let the watchdog score it as a crash.
1943
+ markExpectedRestart();
1939
1944
  setTimeout(() => process.exit(0), 500);
1940
1945
  }
1941
1946
  }
@@ -42,6 +42,30 @@ const TRENDS_PATH = join(homedir(), ".alvin-bot", "state", "trends.jsonl");
42
42
  const DEFAULT_INTERVAL_HOURS = 24;
43
43
  const DEFAULT_AI_THRESHOLD_DAYS = 7;
44
44
  const MAX_RETAIN_DAYS = 90;
45
+ /**
46
+ * What counts as an "error" line in alvin-bot.err.log for the
47
+ * errors_24h metric.
48
+ *
49
+ * stderr IS the bot's error channel, so the default is: count every
50
+ * timestamped line. But a few subsystems deliberately write *benign*
51
+ * operational diagnostics to stderr:
52
+ *
53
+ * - subagent-delivery's self-healing Markdown→plaintext retry
54
+ * (a successful, expected fallback — not an error)
55
+ * - critical-notify's own delivery-outcome line, kept on stderr on
56
+ * purpose so it stays visible even in brake/crash context
57
+ *
58
+ * Counting those turned this very monitor into a false-alarm generator:
59
+ * it flagged its OWN log lines plus every release's restart churn, so
60
+ * the alert kept firing even after the underlying issue was fixed.
61
+ *
62
+ * This is a BLACKLIST (count everything except the known benign
63
+ * emitters), not a whitelist of error signatures — a health monitor
64
+ * must never silently miss a novel real error. New benign emitters, if
65
+ * any, get added here in one place instead of being chased across the
66
+ * codebase.
67
+ */
68
+ export const ERR_LOG_PATTERN = /^(?!.*(?:\[critical-notify\]|\[subagent-delivery\] Markdown parse failed)).+/;
45
69
  let trendsTimer = null;
46
70
  function isDisabled() {
47
71
  return (process.env.ALVIN_DISABLE_TRENDS === "true" ||
@@ -114,7 +138,7 @@ function takeSnapshot(activeProvider) {
114
138
  heap_mb: Math.round(mem.heapUsed / 1024 / 1024),
115
139
  crashes_24h: readWatchdogCrashes24h(),
116
140
  diag_24h: countDiagnosticBundlesLast24h(),
117
- errors_24h: countLogLinesLast24h("alvin-bot.err.log"),
141
+ errors_24h: countLogLinesLast24h("alvin-bot.err.log", ERR_LOG_PATTERN),
118
142
  provider: activeProvider,
119
143
  version: BOT_VERSION,
120
144
  };
@@ -7,8 +7,10 @@
7
7
  * - startAutoUpdateLoop(): periodic check every 6h if enabled
8
8
  *
9
9
  * After a successful update that produces new artifacts, the bot calls
10
- * process.exit(0) and PM2 auto-restarts it with fresh code. This is the
11
- * only safe self-restart path we never re-exec the Node process directly.
10
+ * process.exit(0) and relies on its supervising process manager to
11
+ * restart it with fresh code (launchd KeepAlive, systemd Restart=, PM2,
12
+ * Docker restart policy, etc.). This is the only safe self-restart path
13
+ * — we never re-exec the Node process directly.
12
14
  *
13
15
  * The auto-update flag is persisted to ~/.alvin-bot/auto-update.flag
14
16
  * (a plain text file containing "on" or "off"), so it survives restarts.
@@ -20,6 +22,7 @@ import { fileURLToPath } from "url";
20
22
  import fs from "fs";
21
23
  import os from "os";
22
24
  import { BOT_VERSION } from "../version.js";
25
+ import { markExpectedRestart } from "./watchdog.js";
23
26
  const execAsync = promisify(exec);
24
27
  const PROJECT_ROOT = resolve(dirname(fileURLToPath(import.meta.url)), "../..");
25
28
  const DATA_DIR = process.env.ALVIN_DATA_DIR || resolve(os.homedir(), ".alvin-bot");
@@ -261,7 +264,11 @@ export function startAutoUpdateLoop() {
261
264
  autoTimer = setInterval(async () => {
262
265
  const result = await runUpdate();
263
266
  if (result.ok && result.requiresRestart) {
264
- console.log(`[auto-update] ${result.message} — exiting for PM2 restart`);
267
+ console.log(`[auto-update] ${result.message} — exiting for process-manager restart`);
268
+ // Flag this as an intentional restart so the watchdog doesn't
269
+ // count the planned exit(0) as a crash (would inflate crashes_24h
270
+ // every release and trip the trend monitor).
271
+ markExpectedRestart();
265
272
  // Small delay so any in-flight log write completes
266
273
  setTimeout(() => process.exit(0), 1_000);
267
274
  }
@@ -59,10 +59,12 @@ export function decideBrakeAction(previous, now, opts = {}) {
59
59
  }
60
60
  const timeSinceLastBeat = now - previous.lastBeat;
61
61
  const previousExitedRecently = timeSinceLastBeat < staleMs;
62
- if (!previousExitedRecently) {
63
- // Clean exit (or machine reboot between runs) short-window counter
64
- // resets, but the daily counter keeps going unless its own window
65
- // already expired above.
62
+ if (!previousExitedRecently || previous.expectedRestart) {
63
+ // Clean exit, intentional restart (auto-update / `/update` / `/restart`), or a
64
+ // machine reboot between runs — short-window counter resets, but the
65
+ // daily counter keeps going unless its own window already expired
66
+ // above. expectedRestart is honored even when the beacon is fresh:
67
+ // a planned process.exit(0) is not a crash.
66
68
  return {
67
69
  action: "proceed",
68
70
  crashCount: 0,
@@ -87,6 +87,22 @@ function writeBeacon(data) {
87
87
  console.error("[watchdog] failed to write beacon:", err);
88
88
  }
89
89
  }
90
+ /**
91
+ * Mark the imminent process exit as an INTENTIONAL restart so the next
92
+ * boot's decideBrakeAction does not count it as a crash. Called right
93
+ * before process.exit(0) for auto-update, `/update`, and `/restart`.
94
+ *
95
+ * Read-modify-write: preserves the live crash counters; refreshes
96
+ * lastBeat and sets the expectedRestart flag. Best-effort and synchronous — if the beacon
97
+ * can't be read (first run, disk issue) we simply skip; worst case the
98
+ * restart is counted as one crash, which is the pre-fix behavior.
99
+ */
100
+ export function markExpectedRestart() {
101
+ const current = readBeacon();
102
+ if (!current)
103
+ return;
104
+ writeBeacon({ ...current, lastBeat: Date.now(), expectedRestart: true });
105
+ }
90
106
  function writeAlert(reason, crashCount) {
91
107
  try {
92
108
  const content = [
package/package.json CHANGED
@@ -1,6 +1,6 @@
1
1
  {
2
2
  "name": "alvin-bot",
3
- "version": "5.1.4",
3
+ "version": "5.1.5",
4
4
  "description": "Alvin Bot — Your personal AI agent on Telegram, WhatsApp, Discord, Signal, and Web.",
5
5
  "type": "module",
6
6
  "main": "dist/index.js",