npm - alvin-bot - Versions diffs - 5.1.4 → 5.1.5 - Mend

alvin-bot 5.1.4 → 5.1.5

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (7) hide show

package/CHANGELOG.md +15 -0
package/dist/handlers/commands.js +5 -0
package/dist/services/trends.js +25 -1
package/dist/services/updater.js +10 -3
package/dist/services/watchdog-brake.js +6 -4
package/dist/services/watchdog.js +16 -0
package/package.json +1 -1

package/CHANGELOG.md CHANGED Viewed

@@ -2,6 +2,21 @@
 All notable changes to Alvin Bot are documented here.
+## [5.1.5] — 2026-05-15
+### Health monitor no longer cries wolf about its own log lines
+The bot watches its own error and crash counts over 24 hours and warns you if they keep climbing. It turned out that monitor was largely measuring itself. v5.1.4 fixed one harmless thing being mislabelled as an error; this release fixes the root cause so the whole class of false alarms stops.
+Two things were wrong:
+- **"Errors" counted every line written to the error log — even harmless ones.** Some parts of the bot deliberately write benign status notes there (a self-healing message-format retry; the alert system logging whether its own notification went out). Every one of those inflated the error count, including the alert system flagging its own output — a loop that kept the warning alive no matter what. The error count now ignores these known-harmless notes and still counts every real error, including ones it has never seen before.
+- **Every intentional restart was counted as a crash.** When the bot updates itself or you run `/update` or `/restart`, it exits on purpose and is immediately relaunched. The crash detector saw the quick exit and scored it as a crash, so simply shipping updates made the crash graph creep upward and trip the alarm. Planned restarts are now recognised as planned and no longer counted as crashes.
+### What this means for you
+If you saw a "trend anomaly: errors/crashes steadily climbing" alert shortly after updating, that was the monitor reacting to the update itself, not a real regression. After this release the trend reflects reality. No action needed — update as usual.
 ## [5.1.4] — 2026-05-15
 ### No more false "errors are climbing" health alerts

package/dist/handlers/commands.js CHANGED Viewed

@@ -27,6 +27,7 @@ import { BOT_VERSION } from "../version.js";
 import { getWebPort } from "../web/server.js";
 import { getUsageSummary, getAllRateLimits, formatTokens } from "../services/usage-tracker.js";
 import { runUpdate, getAutoUpdate, setAutoUpdate, startAutoUpdateLoop } from "../services/updater.js";
+import { markExpectedRestart } from "../services/watchdog.js";
 import { getReleaseHighlights } from "../services/release-highlights.js";
 import { runCleanup, getCleanupPolicy } from "../services/disk-cleanup.js";
 import { getHealthStatus, isFailedOver } from "../services/heartbeat.js";
@@ -1912,6 +1913,8 @@ export function registerCommands(bot) {
     bot.command("restart", async (ctx) => {
         const lang = getSession(ctx.from.id).language;
         await ctx.reply(t("bot.restart.triggered", lang));
+        // Intentional restart — don't let the watchdog score it as a crash.
+        markExpectedRestart();
         // Small delay so the Telegram message is actually delivered before exit
         setTimeout(() => process.exit(0), 500);
     });
@@ -1936,6 +1939,8 @@ export function registerCommands(bot) {
                 }
                 if (result.requiresRestart) {
                     await ctx.reply(t("bot.update.restarting", lang));
+                    // Intentional restart — don't let the watchdog score it as a crash.
+                    markExpectedRestart();
                     setTimeout(() => process.exit(0), 500);
                 }
             }

package/dist/services/trends.js CHANGED Viewed

@@ -42,6 +42,30 @@ const TRENDS_PATH = join(homedir(), ".alvin-bot", "state", "trends.jsonl");
 const DEFAULT_INTERVAL_HOURS = 24;
 const DEFAULT_AI_THRESHOLD_DAYS = 7;
 const MAX_RETAIN_DAYS = 90;
+/**
+ * What counts as an "error" line in alvin-bot.err.log for the
+ * errors_24h metric.
+ *
+ * stderr IS the bot's error channel, so the default is: count every
+ * timestamped line. But a few subsystems deliberately write *benign*
+ * operational diagnostics to stderr:
+ *
+ *   - subagent-delivery's self-healing Markdown→plaintext retry
+ *     (a successful, expected fallback — not an error)
+ *   - critical-notify's own delivery-outcome line, kept on stderr on
+ *     purpose so it stays visible even in brake/crash context
+ *
+ * Counting those turned this very monitor into a false-alarm generator:
+ * it flagged its OWN log lines plus every release's restart churn, so
+ * the alert kept firing even after the underlying issue was fixed.
+ *
+ * This is a BLACKLIST (count everything except the known benign
+ * emitters), not a whitelist of error signatures — a health monitor
+ * must never silently miss a novel real error. New benign emitters, if
+ * any, get added here in one place instead of being chased across the
+ * codebase.
+ */
+export const ERR_LOG_PATTERN = /^(?!.*(?:\[critical-notify\]|\[subagent-delivery\] Markdown parse failed)).+/;
 let trendsTimer = null;
 function isDisabled() {
     return (process.env.ALVIN_DISABLE_TRENDS === "true" ||
@@ -114,7 +138,7 @@ function takeSnapshot(activeProvider) {
         heap_mb: Math.round(mem.heapUsed / 1024 / 1024),
         crashes_24h: readWatchdogCrashes24h(),
         diag_24h: countDiagnosticBundlesLast24h(),
-        errors_24h: countLogLinesLast24h("alvin-bot.err.log"),
+        errors_24h: countLogLinesLast24h("alvin-bot.err.log", ERR_LOG_PATTERN),
         provider: activeProvider,
         version: BOT_VERSION,
     };

package/dist/services/updater.js CHANGED Viewed

@@ -7,8 +7,10 @@
  *   - startAutoUpdateLoop(): periodic check every 6h if enabled
  *
  * After a successful update that produces new artifacts, the bot calls
- * process.exit(0) and PM2 auto-restarts it with fresh code. This is the
- * only safe self-restart path — we never re-exec the Node process directly.
+ * process.exit(0) and relies on its supervising process manager to
+ * restart it with fresh code (launchd KeepAlive, systemd Restart=, PM2,
+ * Docker restart policy, etc.). This is the only safe self-restart path
+ * — we never re-exec the Node process directly.
  *
  * The auto-update flag is persisted to ~/.alvin-bot/auto-update.flag
  * (a plain text file containing "on" or "off"), so it survives restarts.
@@ -20,6 +22,7 @@ import { fileURLToPath } from "url";
 import fs from "fs";
 import os from "os";
 import { BOT_VERSION } from "../version.js";
+import { markExpectedRestart } from "./watchdog.js";
 const execAsync = promisify(exec);
 const PROJECT_ROOT = resolve(dirname(fileURLToPath(import.meta.url)), "../..");
 const DATA_DIR = process.env.ALVIN_DATA_DIR || resolve(os.homedir(), ".alvin-bot");
@@ -261,7 +264,11 @@ export function startAutoUpdateLoop() {
     autoTimer = setInterval(async () => {
         const result = await runUpdate();
         if (result.ok && result.requiresRestart) {
-            console.log(`[auto-update] ${result.message} — exiting for PM2 restart`);
+            console.log(`[auto-update] ${result.message} — exiting for process-manager restart`);
+            // Flag this as an intentional restart so the watchdog doesn't
+            // count the planned exit(0) as a crash (would inflate crashes_24h
+            // every release and trip the trend monitor).
+            markExpectedRestart();
             // Small delay so any in-flight log write completes
             setTimeout(() => process.exit(0), 1_000);
         }

package/dist/services/watchdog-brake.js CHANGED Viewed

@@ -59,10 +59,12 @@ export function decideBrakeAction(previous, now, opts = {}) {
     }
     const timeSinceLastBeat = now - previous.lastBeat;
     const previousExitedRecently = timeSinceLastBeat < staleMs;
-    if (!previousExitedRecently) {
-        // Clean exit (or machine reboot between runs) → short-window counter
-        // resets, but the daily counter keeps going unless its own window
-        // already expired above.
+    if (!previousExitedRecently || previous.expectedRestart) {
+        // Clean exit, intentional restart (auto-update, `/update`, or
+        // `/restart`), or a machine reboot between runs → short-window
+        // counter resets, but the daily counter keeps going unless its own
+        // window already expired above. expectedRestart is honored even when
+        // the beacon is fresh: a planned process.exit(0) is not a crash.
         return {
             action: "proceed",
             crashCount: 0,

package/dist/services/watchdog.js CHANGED Viewed

@@ -87,6 +87,22 @@ function writeBeacon(data) {
         console.error("[watchdog] failed to write beacon:", err);
     }
 }
+/**
+ * Mark the imminent process exit as an INTENTIONAL restart so the next
+ * boot's decideBrakeAction does not count it as a crash. Called right
+ * before process.exit(0) by the auto-update loop and by the `/update`
+ * and `/restart` command handlers.
+ *
+ * Read-modify-write: every field of the current beacon (including the
+ * live crash counters) is carried over; only lastBeat is refreshed to
+ * now and expectedRestart is set to true. Best-effort and synchronous —
+ * if the beacon can't be read (first run, disk issue) we simply skip;
+ * worst case the restart is counted as one crash, which is the pre-fix
+ * behavior.
+ */
+export function markExpectedRestart() {
+    const current = readBeacon();
+    if (!current)
+        return;
+    writeBeacon({ ...current, lastBeat: Date.now(), expectedRestart: true });
+}
 function writeAlert(reason, crashCount) {
     try {
         const content = [

package/package.json CHANGED Viewed

@@ -1,6 +1,6 @@
 {
   "name": "alvin-bot",
-  "version": "5.1.4",
+  "version": "5.1.5",
   "description": "Alvin Bot — Your personal AI agent on Telegram, WhatsApp, Discord, Signal, and Web.",
   "type": "module",
   "main": "dist/index.js",