alvin-bot 5.1.4 → 5.1.5
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/CHANGELOG.md +15 -0
- package/dist/handlers/commands.js +5 -0
- package/dist/services/trends.js +25 -1
- package/dist/services/updater.js +10 -3
- package/dist/services/watchdog-brake.js +6 -4
- package/dist/services/watchdog.js +16 -0
- package/package.json +1 -1
package/CHANGELOG.md
CHANGED
|
@@ -2,6 +2,21 @@
|
|
|
2
2
|
|
|
3
3
|
All notable changes to Alvin Bot are documented here.
|
|
4
4
|
|
|
5
|
+
## [5.1.5] — 2026-05-15
|
|
6
|
+
|
|
7
|
+
### Health monitor no longer cries wolf about its own log lines
|
|
8
|
+
|
|
9
|
+
The bot watches its own error and crash counts over 24 hours and warns you if they keep climbing. It turned out that monitor was largely measuring itself. v5.1.4 fixed one harmless thing being mislabelled as an error; this release fixes the root cause so the whole class of false alarms stops.
|
|
10
|
+
|
|
11
|
+
Two things were wrong:
|
|
12
|
+
|
|
13
|
+
- **"Errors" counted every line written to the error log — even harmless ones.** Some parts of the bot deliberately write benign status notes there (a self-healing message-format retry; the alert system logging whether its own notification went out). Every one of those inflated the error count, including the alert system flagging its own output — a loop that kept the warning alive no matter what. The error count now ignores these known-harmless notes and still counts every real error, including ones it has never seen before.
|
|
14
|
+
- **Every intentional restart was counted as a crash.** When the bot updates itself or you run `/update` or `/restart`, it exits on purpose and is immediately relaunched. The crash detector saw the quick exit and scored it as a crash, so simply shipping updates made the crash graph creep upward and trip the alarm. Planned restarts are now recognised as planned and no longer counted as crashes.
|
|
15
|
+
|
|
16
|
+
### What this means for you
|
|
17
|
+
|
|
18
|
+
If you saw a "trend anomaly: errors/crashes steadily climbing" alert shortly after updating, that was the monitor reacting to the update itself, not a real regression. After this release the trend reflects reality. No action needed — update as usual.
|
|
19
|
+
|
|
5
20
|
## [5.1.4] — 2026-05-15
|
|
6
21
|
|
|
7
22
|
### No more false "errors are climbing" health alerts
|
package/dist/handlers/commands.js
CHANGED
|
@@ -27,6 +27,7 @@ import { BOT_VERSION } from "../version.js";
|
|
|
27
27
|
import { getWebPort } from "../web/server.js";
|
|
28
28
|
import { getUsageSummary, getAllRateLimits, formatTokens } from "../services/usage-tracker.js";
|
|
29
29
|
import { runUpdate, getAutoUpdate, setAutoUpdate, startAutoUpdateLoop } from "../services/updater.js";
|
|
30
|
+
import { markExpectedRestart } from "../services/watchdog.js";
|
|
30
31
|
import { getReleaseHighlights } from "../services/release-highlights.js";
|
|
31
32
|
import { runCleanup, getCleanupPolicy } from "../services/disk-cleanup.js";
|
|
32
33
|
import { getHealthStatus, isFailedOver } from "../services/heartbeat.js";
|
|
@@ -1912,6 +1913,8 @@ export function registerCommands(bot) {
|
|
|
1912
1913
|
bot.command("restart", async (ctx) => {
|
|
1913
1914
|
const lang = getSession(ctx.from.id).language;
|
|
1914
1915
|
await ctx.reply(t("bot.restart.triggered", lang));
|
|
1916
|
+
// Intentional restart — don't let the watchdog score it as a crash.
|
|
1917
|
+
markExpectedRestart();
|
|
1915
1918
|
// Small delay so the Telegram message is actually delivered before exit
|
|
1916
1919
|
setTimeout(() => process.exit(0), 500);
|
|
1917
1920
|
});
|
|
@@ -1936,6 +1939,8 @@ export function registerCommands(bot) {
|
|
|
1936
1939
|
}
|
|
1937
1940
|
if (result.requiresRestart) {
|
|
1938
1941
|
await ctx.reply(t("bot.update.restarting", lang));
|
|
1942
|
+
// Intentional restart — don't let the watchdog score it as a crash.
|
|
1943
|
+
markExpectedRestart();
|
|
1939
1944
|
setTimeout(() => process.exit(0), 500);
|
|
1940
1945
|
}
|
|
1941
1946
|
}
|
package/dist/services/trends.js
CHANGED
|
@@ -42,6 +42,30 @@ const TRENDS_PATH = join(homedir(), ".alvin-bot", "state", "trends.jsonl");
|
|
|
42
42
|
const DEFAULT_INTERVAL_HOURS = 24;
|
|
43
43
|
const DEFAULT_AI_THRESHOLD_DAYS = 7;
|
|
44
44
|
const MAX_RETAIN_DAYS = 90;
|
|
45
|
+
/**
|
|
46
|
+
* What counts as an "error" line in alvin-bot.err.log for the
|
|
47
|
+
* errors_24h metric.
|
|
48
|
+
*
|
|
49
|
+
* stderr IS the bot's error channel, so the default is: count every
|
|
50
|
+
* timestamped line. But a few subsystems deliberately write *benign*
|
|
51
|
+
* operational diagnostics to stderr:
|
|
52
|
+
*
|
|
53
|
+
* - subagent-delivery's self-healing Markdown→plaintext retry
|
|
54
|
+
* (a successful, expected fallback — not an error)
|
|
55
|
+
* - critical-notify's own delivery-outcome line, kept on stderr on
|
|
56
|
+
* purpose so it stays visible even in brake/crash context
|
|
57
|
+
*
|
|
58
|
+
* Counting those turned this very monitor into a false-alarm generator:
|
|
59
|
+
* it flagged its OWN log lines plus every release's restart churn, so
|
|
60
|
+
* the alert kept firing even after the underlying issue was fixed.
|
|
61
|
+
*
|
|
62
|
+
* This is a BLACKLIST (count everything except the known benign
|
|
63
|
+
* emitters), not a whitelist of error signatures — a health monitor
|
|
64
|
+
* must never silently miss a novel real error. New benign emitters, if
|
|
65
|
+
* any, get added here in one place instead of being chased across the
|
|
66
|
+
* codebase.
|
|
67
|
+
*/
|
|
68
|
+
export const ERR_LOG_PATTERN = /^(?!.*(?:\[critical-notify\]|\[subagent-delivery\] Markdown parse failed)).+/;
|
|
45
69
|
let trendsTimer = null;
|
|
46
70
|
function isDisabled() {
|
|
47
71
|
return (process.env.ALVIN_DISABLE_TRENDS === "true" ||
|
|
@@ -114,7 +138,7 @@ function takeSnapshot(activeProvider) {
|
|
|
114
138
|
heap_mb: Math.round(mem.heapUsed / 1024 / 1024),
|
|
115
139
|
crashes_24h: readWatchdogCrashes24h(),
|
|
116
140
|
diag_24h: countDiagnosticBundlesLast24h(),
|
|
117
|
-
errors_24h: countLogLinesLast24h("alvin-bot.err.log"),
|
|
141
|
+
errors_24h: countLogLinesLast24h("alvin-bot.err.log", ERR_LOG_PATTERN),
|
|
118
142
|
provider: activeProvider,
|
|
119
143
|
version: BOT_VERSION,
|
|
120
144
|
};
|
package/dist/services/updater.js
CHANGED
|
@@ -7,8 +7,10 @@
|
|
|
7
7
|
* - startAutoUpdateLoop(): periodic check every 6h if enabled
|
|
8
8
|
*
|
|
9
9
|
* After a successful update that produces new artifacts, the bot calls
|
|
10
|
-
* process.exit(0) and
|
|
11
|
-
*
|
|
10
|
+
* process.exit(0) and relies on its supervising process manager to
|
|
11
|
+
* restart it with fresh code (launchd KeepAlive, systemd Restart=, PM2,
|
|
12
|
+
* Docker restart policy, etc.). This is the only safe self-restart path
|
|
13
|
+
* — we never re-exec the Node process directly.
|
|
12
14
|
*
|
|
13
15
|
* The auto-update flag is persisted to ~/.alvin-bot/auto-update.flag
|
|
14
16
|
* (a plain text file containing "on" or "off"), so it survives restarts.
|
|
@@ -20,6 +22,7 @@ import { fileURLToPath } from "url";
|
|
|
20
22
|
import fs from "fs";
|
|
21
23
|
import os from "os";
|
|
22
24
|
import { BOT_VERSION } from "../version.js";
|
|
25
|
+
import { markExpectedRestart } from "./watchdog.js";
|
|
23
26
|
const execAsync = promisify(exec);
|
|
24
27
|
const PROJECT_ROOT = resolve(dirname(fileURLToPath(import.meta.url)), "../..");
|
|
25
28
|
const DATA_DIR = process.env.ALVIN_DATA_DIR || resolve(os.homedir(), ".alvin-bot");
|
|
@@ -261,7 +264,11 @@ export function startAutoUpdateLoop() {
|
|
|
261
264
|
autoTimer = setInterval(async () => {
|
|
262
265
|
const result = await runUpdate();
|
|
263
266
|
if (result.ok && result.requiresRestart) {
|
|
264
|
-
console.log(`[auto-update] ${result.message} — exiting for
|
|
267
|
+
console.log(`[auto-update] ${result.message} — exiting for process-manager restart`);
|
|
268
|
+
// Flag this as an intentional restart so the watchdog doesn't
|
|
269
|
+
// count the planned exit(0) as a crash (would inflate crashes_24h
|
|
270
|
+
// every release and trip the trend monitor).
|
|
271
|
+
markExpectedRestart();
|
|
265
272
|
// Small delay so any in-flight log write completes
|
|
266
273
|
setTimeout(() => process.exit(0), 1_000);
|
|
267
274
|
}
|
package/dist/services/watchdog-brake.js
CHANGED
|
@@ -59,10 +59,12 @@ export function decideBrakeAction(previous, now, opts = {}) {
|
|
|
59
59
|
}
|
|
60
60
|
const timeSinceLastBeat = now - previous.lastBeat;
|
|
61
61
|
const previousExitedRecently = timeSinceLastBeat < staleMs;
|
|
62
|
-
if (!previousExitedRecently) {
|
|
63
|
-
// Clean exit
|
|
64
|
-
//
|
|
65
|
-
// already expired
|
|
62
|
+
if (!previousExitedRecently || previous.expectedRestart) {
|
|
63
|
+
// Clean exit, intentional restart (auto-update / `/update` / `/restart`), or a
|
|
64
|
+
// machine reboot between runs → short-window counter resets, but the
|
|
65
|
+
// daily counter keeps going unless its own window already expired
|
|
66
|
+
// above. expectedRestart is honored even when the beacon is fresh:
|
|
67
|
+
// a planned process.exit(0) is not a crash.
|
|
66
68
|
return {
|
|
67
69
|
action: "proceed",
|
|
68
70
|
crashCount: 0,
|
package/dist/services/watchdog.js
CHANGED
|
@@ -87,6 +87,22 @@ function writeBeacon(data) {
|
|
|
87
87
|
console.error("[watchdog] failed to write beacon:", err);
|
|
88
88
|
}
|
|
89
89
|
}
|
|
90
|
+
/**
|
|
91
|
+
* Mark the imminent process exit as an INTENTIONAL restart so the next
|
|
92
|
+
* boot's decideBrakeAction does not count it as a crash. Called by the
|
|
93
|
+
* updater (auto-update) and by the `/update` and `/restart` command handlers right before process.exit(0).
|
|
94
|
+
*
|
|
95
|
+
* Read-modify-write: preserves the live crash counters; only flips the
|
|
96
|
+
* expectedRestart flag. Best-effort and synchronous — if the beacon
|
|
97
|
+
* can't be read (first run, disk issue) we simply skip; worst case the
|
|
98
|
+
* restart is counted as one crash, which is the pre-fix behavior.
|
|
99
|
+
*/
|
|
100
|
+
export function markExpectedRestart() {
|
|
101
|
+
const current = readBeacon();
|
|
102
|
+
if (!current)
|
|
103
|
+
return;
|
|
104
|
+
writeBeacon({ ...current, lastBeat: Date.now(), expectedRestart: true });
|
|
105
|
+
}
|
|
90
106
|
function writeAlert(reason, crashCount) {
|
|
91
107
|
try {
|
|
92
108
|
const content = [
|