alvin-bot 5.4.0 → 5.6.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
package/CHANGELOG.md CHANGED
@@ -2,6 +2,60 @@
2
2
 
3
3
  All notable changes to Alvin Bot are documented here.
4
4
 
5
+ ## [5.6.0] — 2026-05-18
6
+
7
+ ### Background-task reports are now clean and to the point
8
+
9
+ When a scheduled or background task finishes, Alvin now sends you
10
+ just the result — a tight header (what ran, how long, tokens, success)
11
+ and the actual answer — instead of a wall of its working notes. If a
12
+ result is unusually long, the chat message stays short and the
13
+ complete output comes attached as a file, so you never lose anything
14
+ and never have to scroll through a transcript.
15
+
16
+ ### A clear confirmation when you stop something
17
+
18
+ Press ⛔ Stop (or use /cancel) while Alvin is genuinely working and
19
+ you now get a short, plain confirmation in your language that the work
20
+ was halted — not just a fleeting button flash. If nothing was running,
21
+ Alvin still tells you that honestly instead of pretending it stopped
22
+ something.
23
+
24
+ ### Health alerts that don't cry wolf
25
+
26
+ Alvin's self-monitoring now judges its health on recent activity, so a
27
+ one-off rough patch no longer keeps it flagging a problem for weeks. A
28
+ real issue still raises a flag promptly; a quiet, healthy bot stays
29
+ quiet.
30
+
31
+ As always, this shipped after a full multi-pass review and a
32
+ fresh-install + stress verification on a clean separate machine.
33
+
34
+ ## [5.5.0] — 2026-05-18
35
+
36
+ ### The ⛔ Stop button now responds instantly — and honestly
37
+
38
+ Stopping a task is now crisp and truthful. The moment a task finishes,
39
+ the Stop button disappears, so you're never tapping a control for
40
+ something that's already done. And the feedback always matches reality:
41
+ if you tap Stop while Alvin is genuinely working, it stops and says so;
42
+ if the task had already completed, Alvin tells you that plainly instead
43
+ of implying it cut something short. If you hit Stop in that brief moment
44
+ while an answer is being prepared, that answer is now held back — "I
45
+ stopped it" means nothing more arrives. Anything Alvin had already
46
+ shown you stays exactly as it was.
47
+
48
+ ### Fewer false alerts — smarter health monitoring
49
+
50
+ Alvin's self-monitoring got a lot more trustworthy. A planned restart
51
+ or an update is no longer mistaken for a problem, and the daily health
52
+ summary only raises a flag when there's real evidence something is
53
+ actually wrong — so the alerts you do get are ones worth reading.
54
+ Routine background housekeeping no longer shows up as noise.
55
+
56
+ As always, this shipped after a full multi-pass review and a
57
+ fresh-install + stress verification on a clean separate machine.
58
+
5
59
  ## [5.4.0] — 2026-05-18
6
60
 
7
61
  ### Smoother background tasks — and Alvin always tells you the truth
@@ -1918,6 +1918,10 @@ export function registerCommands(bot) {
1918
1918
  if (session.isProcessing) {
1919
1919
  requestStop(session, "soft", buildStopDeps(session));
1920
1920
  await ctx.reply(t("bot.cancel.cancelling", lang));
1921
+ // V56-T2c — a real stop fired: follow the "cancelling…" notice with a
1922
+ // brief confirmation that the work was actually halted (consistent UX
1923
+ // with the ⛔ button). Best-effort — must never throw into the handler.
1924
+ await ctx.reply(t("bot.cancel.confirmed", lang)).catch(() => { });
1921
1925
  }
1922
1926
  else {
1923
1927
  await ctx.reply(t("bot.cancel.noRunning", lang));
@@ -1946,17 +1950,34 @@ export function registerCommands(bot) {
1946
1950
  const sessionKey = ctx.match[1];
1947
1951
  const session = getSession(sessionKey);
1948
1952
  const lang = session.language;
1949
- if (session.isProcessing) {
1953
+ // A1 — Capture isProcessing BEFORE requestStop (which sets it false)
1954
+ // so we can show the right toast: "stopped" vs "already finished".
1955
+ const wasProcessing = session.isProcessing;
1956
+ if (wasProcessing) {
1950
1957
  requestStop(session, "soft", buildStopDeps(session));
1951
1958
  }
1959
+ // A1 — Honest toast: if the turn had already finished when the button was
1960
+ // tapped, don't claim "stopped" — tell the user it was already done.
1961
+ const toastKey = wasProcessing
1962
+ ? "bot.cancel.stoppedToast"
1963
+ : "bot.cancel.alreadyDone";
1952
1964
  try {
1953
- await ctx.answerCallbackQuery({ text: t("bot.cancel.stoppedToast", lang) });
1965
+ await ctx.answerCallbackQuery({ text: t(toastKey, lang) });
1954
1966
  }
1955
1967
  catch { /* harmless grammy race */ }
1956
1968
  try {
1957
1969
  await ctx.editMessageReplyMarkup({});
1958
1970
  }
1959
1971
  catch { /* harmless grammy race — message may already be gone */ }
1972
+ // V56-T2c — when a real stop genuinely fired (wasProcessing), also send a
1973
+ // short in-chat confirmation in the session language so the user gets a
1974
+ // persistent acknowledgement, not only the ephemeral toast. When nothing
1975
+ // was running we deliberately stay silent here (v5.5.0 honesty: the
1976
+ // alreadyDone toast already told the truth). Best-effort — must never
1977
+ // throw into the handler.
1978
+ if (wasProcessing) {
1979
+ await ctx.reply(t("bot.cancel.confirmed", lang)).catch(() => { });
1980
+ }
1960
1981
  });
1961
1982
  // /restart — trigger a PM2-managed restart by exiting the process.
1962
1983
  // The PM2 supervisor picks up the exit and respawns with --update-env.
@@ -122,6 +122,37 @@ const TOOL_ICONS = {
122
122
  WebFetch: "📡",
123
123
  Task: "🤖",
124
124
  };
125
+ // ── A3 — stop-suppress-undelivered pure predicate ────────────────────────────
126
+ /**
127
+ * Determine whether the final answer send should be suppressed because a stop
128
+ * was requested and no visible text has yet been delivered to the user.
129
+ *
130
+ * This closes the gap behind "I clicked Stop but it answered anyway": the
131
+ * Claude SDK delivers short answers atomically, so the for-await loop parks
132
+ * on IPC the whole time, and the complete answer arrives as one block. By the
133
+ * time the consumer bail fires at the top of the loop, the answer is computed
134
+ * and about to be sent. This guard is the only stoppable moment for atomic
135
+ * answers.
136
+ *
137
+ * HARD CONSTRAINT — no-retract invariant: if ANY visible text has already
138
+ * been streamed/committed to the user (visibleTextAlreadySent=true), the
139
+ * predicate returns false regardless of stop state. Partial output that
140
+ * already reached the user is NEVER retracted. The consumer bail in the
141
+ * for-await loop already handles mid-stream stops; this guard only acts on
142
+ * the final commit step.
143
+ *
144
+ * Truth table:
145
+ * stopRequested=truthy + visibleTextAlreadySent=false → true (suppress)
146
+ * stopRequested=truthy + visibleTextAlreadySent=true → false (no-retract)
147
+ * stopRequested=falsy + * → false (normal)
148
+ */
149
+ export function shouldSuppressFinalSend(args) {
150
+ if (!args.stopRequested)
151
+ return false;
152
+ if (args.visibleTextAlreadySent)
153
+ return false;
154
+ return true;
155
+ }
125
156
  // ── v5.2 live steering — pure routing helper ─────────────────────────────────
126
157
  /**
127
158
  * Decide how a mid-task message (arriving while `session.isProcessing`) should
@@ -785,19 +816,45 @@ export async function handleMessage(ctx) {
785
816
  /* harmless — notice is best-effort */
786
817
  }
787
818
  }
788
- // v5.1 stop: user stopped this query — do NOT finalize partial output
789
- // as a successful answer, no 👍, no history commit. The stop trigger
790
- // (/cancel | /stopall | ⛔ button) already acknowledged to the user.
791
- // The `finally` still runs (clears isProcessing/_qHandle/_stopRequested
792
- // + typing indicator).
793
- if (session._stopRequested) {
794
- return;
795
- }
796
819
  if (bypassAborted) {
797
820
  // v4.12.3 — Bypass path took over; don't finalize, don't react 👍.
798
821
  // Just clean up and return. The finally block still fires.
799
822
  return;
800
823
  }
824
+ // A3 — Suppress-or-finalize gate for stopped turns.
825
+ //
826
+ // shouldSuppressFinalSend is the SINGLE gate controlling whether finalize runs:
827
+ //
828
+ // stop + no visible text (suppress=true):
829
+ // Skip finalize and all side-effects. Nothing reached the user — correct.
830
+ // The stop trigger (/cancel | /stopall | ⛔) already acknowledged this.
831
+ // The `finally` still runs (clears isProcessing/_qHandle/_stopRequested
832
+ // + typing indicator).
833
+ //
834
+ // stop + visible text already sent (suppress=false, _stopRequested truthy):
835
+ // The no-retract invariant applies — partial output already shown must not
836
+ // be left visually unfinished. Run streamer.finalize to flush the throttle
837
+ // timer and drop the status line, then return BEFORE the completed-answer
838
+ // side-effects (👍 / broadcastResponseDone / addToHistory). A stopped turn
839
+ // is NOT a successfully completed turn.
840
+ //
841
+ // no stop (suppress=false, _stopRequested falsy):
842
+ // Normal path — fall through to finalize + all side-effects.
843
+ if (shouldSuppressFinalSend({
844
+ stopRequested: session._stopRequested,
845
+ visibleTextAlreadySent: streamer.hasSentText,
846
+ })) {
847
+ // Branch A: stop + no visible text → suppress entirely.
848
+ return;
849
+ }
850
+ if (session._stopRequested && streamer.hasSentText) {
851
+ // Branch B: stop + visible text already sent → finalize the partial cleanly
852
+ // (flushes throttle timer, clears status line) but do NOT emit the
853
+ // completed-answer signals or commit to history.
854
+ await streamer.finalize(finalText);
855
+ return;
856
+ }
857
+ // Branch C: normal (no stop) — fall through.
801
858
  await streamer.finalize(finalText);
802
859
  emit("message:sent", { userId, text: finalText, platform: "telegram" });
803
860
  // v4.5.0: tell observers the response is complete.
@@ -874,6 +931,15 @@ export async function handleMessage(ctx) {
874
931
  // but if a new turn started and re-populated _qHandle via onQueryHandle we
875
932
  // must NOT null it here — that would break Cycle-1 stop teeth for the new turn.
876
933
  if (session._turnId === _thisTurnId) {
934
+ // A2 — Remove the ⛔ Stop control message as the FIRST action when the
935
+ // turn ends, so the stale button disappears before any post-turn work.
936
+ // Best-effort: if it was already deleted or the bot lacks permission, ignore.
937
+ if (stopMsgId !== null) {
938
+ try {
939
+ await ctx.api.deleteMessage(ctx.chat.id, stopMsgId);
940
+ }
941
+ catch { /* harmless grammy race */ }
942
+ }
877
943
  session.isProcessing = false;
878
944
  session.abortController = null;
879
945
  // v5.2 — Close and clear the SteerChannel; reset per-turn ack flag.
@@ -887,14 +953,6 @@ export async function handleMessage(ctx) {
887
953
  session._stopRequested = null; // safe: token matches → no newer turn has set this
888
954
  session._turnId = null;
889
955
  }
890
- // v5.1 — Remove the ⛔ Stop control message (sent at processing start).
891
- // Best-effort: if it was already deleted or the bot lacks permission, ignore.
892
- if (stopMsgId !== null) {
893
- try {
894
- await ctx.api.deleteMessage(ctx.chat.id, stopMsgId);
895
- }
896
- catch { /* harmless grammy race */ }
897
- }
898
956
  // Check for queued messages — they'll be prepended to the next real message
899
957
  // Queue stays in session and gets consumed on next handleMessage call
900
958
  }
package/dist/i18n.js CHANGED
@@ -378,6 +378,21 @@ const strings = {
378
378
  es: "⛔ Detenido",
379
379
  fr: "⛔ Arrêté",
380
380
  },
381
+ "bot.cancel.alreadyDone": {
382
+ en: "Nothing running — that already finished.",
383
+ de: "Nichts läuft — das war schon fertig.",
384
+ es: "Nada en curso — eso ya terminó.",
385
+ fr: "Rien en cours — c'était déjà terminé.",
386
+ },
387
+ // Sent as a brief in-chat confirmation only when a stop GENUINELY halted
388
+ // running work (⛔ button / /cancel with work actually in progress). Not
389
+ // sent when nothing was running — that honest behavior stays unchanged.
390
+ "bot.cancel.confirmed": {
391
+ en: "⛔ Stopped — further work was halted.",
392
+ de: "⛔ Gestoppt — die weitere Arbeit wurde angehalten.",
393
+ es: "⛔ Detenido — se interrumpió el trabajo en curso.",
394
+ fr: "⛔ Arrêté — le travail en cours a été interrompu.",
395
+ },
381
396
  // /model
382
397
  "bot.model.chooseHeader": {
383
398
  en: "🤖 *Choose model:*",
package/dist/index.js CHANGED
@@ -187,7 +187,7 @@ import { loadSkills } from "./services/skills.js";
187
187
  import { loadHooks } from "./services/hooks.js";
188
188
  import { registerShutdownHandler } from "./services/restart.js";
189
189
  import { cancelAllSubAgents } from "./services/subagents.js";
190
- import { startWatchdog, stopWatchdog, checkCrashLoopBrake } from "./services/watchdog.js";
190
+ import { startWatchdog, stopWatchdog, checkCrashLoopBrake, markExpectedRestart } from "./services/watchdog.js";
191
191
  import { getRegistry } from "./engine.js";
192
192
  import { scanAssets } from "./services/asset-index.js";
193
193
  // Scan asset directory and generate INDEX.json + INDEX.md
@@ -383,6 +383,12 @@ const shutdown = async () => {
383
383
  return;
384
384
  isShuttingDown = true;
385
385
  console.log("Graceful shutdown initiated...");
386
+ // Mark the imminent exit as an intentional restart so the next boot's
387
+ // decideBrakeAction does not count it as a crash. This covers launchctl
388
+ // unload/load (SIGTERM from launchd) in addition to /restart and /update
389
+ // which call markExpectedRestart() themselves before process.exit(0).
390
+ // Must run before stopWatchdog() (which just clears timers, not the beacon).
391
+ markExpectedRestart();
386
392
  // E2: shutdown-notification — await the async cancellation so running
387
393
  // agents can post a cancellation message to Telegram before the bot
388
394
  // stops. Capped at 5s internally so a hang can't block shutdown.
@@ -446,9 +446,23 @@ export class ClaudeSDKProvider {
446
446
  sessionResetRequested: true,
447
447
  };
448
448
  }
449
+ // V56-T1 — Surface the SDK's authoritative final answer
450
+ // separately from the accumulated narration. SDKResultSuccess
451
+ // carries a single `result: string` that is the agent's actual
452
+ // outcome (NOT the concatenation of every assistant turn).
453
+ // SDKResultError has no `result` field — leave finalResult
454
+ // undefined there so consumers fall back to buffered text.
455
+ // This is the same source the detached-dispatch path already
456
+ // prefers (`{"type":"result"}.result` in async-agent-parser).
457
+ const finalResult = "subtype" in resultMsg &&
458
+ resultMsg.subtype === "success" &&
459
+ typeof resultMsg.result === "string"
460
+ ? resultMsg.result
461
+ : undefined;
449
462
  yield {
450
463
  type: "done",
451
464
  text: accumulatedText || "",
465
+ ...(finalResult !== undefined ? { finalResult } : {}),
452
466
  sessionId: resultMsg.session_id || capturedSessionId,
453
467
  costUsd: "total_cost_usd" in resultMsg ? resultMsg.total_cost_usd : 0,
454
468
  inputTokens: inputTok,
@@ -27,6 +27,25 @@ import { dirname } from "path";
27
27
  import { parseOutputFileStatus } from "./async-agent-parser.js";
28
28
  import { ASYNC_AGENTS_STATE_FILE } from "../paths.js";
29
29
  import { getAllSessions } from "./session.js";
30
+ /**
31
+ * B3 — Detect a permanent "target chat does not exist" delivery failure
32
+ * (Telegram 400 "Bad Request: chat not found"), e.g. the stale chat_id:1
33
+ * test agent. Such an agent must be abandoned, not retried forever.
34
+ *
35
+ * Kept as a local predicate (mirrors isChatNotFoundError in
36
+ * subagent-delivery.ts) so the watcher does NOT take a new hard
37
+ * dependency on a fresh subagent-delivery export — many test suites mock
38
+ * that module with only deliverSubAgentResult, and a destructured import
39
+ * of a non-mocked symbol would throw. Matched narrowly on the
40
+ * chat-not-found signature only.
41
+ */
42
+ function isChatNotFoundError(err) {
43
+ if (!err || typeof err !== "object")
44
+ return false;
45
+ const e = err;
46
+ const haystack = `${e.message ?? ""} ${e.description ?? ""}`;
47
+ return /chat not found/i.test(haystack);
48
+ }
30
49
  /** How often the polling loop runs against each pending agent. */
31
50
  const POLL_INTERVAL_MS = 15_000;
32
51
  /** Hard ceiling per agent — 12h. After this, give up and deliver
@@ -199,22 +218,38 @@ export async function pollOnce() {
199
218
  const now = Date.now();
200
219
  const toRemove = [];
201
220
  const missingFileFailureMs = getMissingFileFailureMs();
221
+ // B3 — when a delivery attempt proves the target chat is permanently
222
+ // invalid ("chat not found", e.g. the stale chat_id:1 test agent),
223
+ // abandon the agent so the watcher never retries it. Without this, a
224
+ // pending agent with an invalid target spams stderr on every poll
225
+ // cycle (inflating errors_24h) and lingers until the 12h giveUpAt.
226
+ const abandonIfInvalidTarget = (entry, outcome) => {
227
+ if (!outcome.chatNotFound)
228
+ return;
229
+ if (!toRemove.includes(entry.agentId))
230
+ toRemove.push(entry.agentId);
231
+ console.warn(`[async-watcher] abandoning agent ${entry.agentId} — delivery target ` +
232
+ `chat ${String(entry.chatId)} not found (invalid/stale); will not retry`);
233
+ };
202
234
  for (const entry of pending.values()) {
203
235
  entry.lastCheckedAt = now;
204
236
  // Timeout check first — if the agent is past its giveUpAt, give up
205
237
  // regardless of whether the file shows progress.
206
238
  if (now >= entry.giveUpAt) {
207
- await deliverAsFailure(entry, "timeout", "Agent ran longer than 12h — giving up");
239
+ const outcome = await deliverAsFailure(entry, "timeout", "Agent ran longer than 12h — giving up");
240
+ abandonIfInvalidTarget(entry, outcome);
208
241
  toRemove.push(entry.agentId);
209
242
  continue;
210
243
  }
211
244
  const status = await parseOutputFileStatus(entry.outputFile);
212
245
  if (status.state === "completed") {
213
- await deliverAsCompleted(entry, status.output, status.tokensUsed);
246
+ const outcome = await deliverAsCompleted(entry, status.output, status.tokensUsed);
247
+ abandonIfInvalidTarget(entry, outcome);
214
248
  toRemove.push(entry.agentId);
215
249
  }
216
250
  else if (status.state === "failed") {
217
- await deliverAsFailure(entry, "error", status.error);
251
+ const outcome = await deliverAsFailure(entry, "error", status.error);
252
+ abandonIfInvalidTarget(entry, outcome);
218
253
  toRemove.push(entry.agentId);
219
254
  }
220
255
  else if (status.state === "missing" &&
@@ -222,7 +257,8 @@ export async function pollOnce() {
222
257
  // v4.14.2 — Zombie guard: the subprocess never created its
223
258
  // output file within `missingFileFailureMs` (default 10 min).
224
259
  // Declare failed instead of polling until the 12h giveUpAt.
225
- await deliverAsFailure(entry, "error", `Dispatched subprocess never wrote its output file (${Math.round((now - entry.startedAt) / 60_000)}m after start). Likely crashed before initializing, or the file was removed externally.`);
260
+ const outcome = await deliverAsFailure(entry, "error", `Dispatched subprocess never wrote its output file (${Math.round((now - entry.startedAt) / 60_000)}m after start). Likely crashed before initializing, or the file was removed externally.`);
261
+ abandonIfInvalidTarget(entry, outcome);
226
262
  toRemove.push(entry.agentId);
227
263
  }
228
264
  // running / missing-but-young → keep polling next cycle
@@ -254,13 +290,20 @@ async function deliverAsCompleted(entry, output, tokensUsed) {
254
290
  tokensUsed: tokensUsed ?? { input: 0, output: 0 },
255
291
  duration: Date.now() - entry.startedAt,
256
292
  };
293
+ let chatNotFound = false;
257
294
  try {
258
- await deliverSubAgentResult(info, result);
295
+ const outcome = await deliverSubAgentResult(info, result);
296
+ chatNotFound = !!outcome?.chatNotFound;
259
297
  }
260
298
  catch (err) {
261
299
  console.error(`[async-watcher] delivery failed for ${entry.agentId}:`, err);
300
+ // deliverSubAgentResult normally swallows send errors and reports
301
+ // chatNotFound via its return value; if it ever throws, still detect
302
+ // the permanent invalid-target case here.
303
+ chatNotFound = isChatNotFoundError(err);
262
304
  }
263
305
  decrementPendingCount(entry.sessionKey);
306
+ return { chatNotFound };
264
307
  }
265
308
  async function deliverAsFailure(entry, status, error) {
266
309
  const { deliverSubAgentResult } = await import("./subagent-delivery.js");
@@ -283,13 +326,17 @@ async function deliverAsFailure(entry, status, error) {
283
326
  duration: Date.now() - entry.startedAt,
284
327
  error,
285
328
  };
329
+ let chatNotFound = false;
286
330
  try {
287
- await deliverSubAgentResult(info, result);
331
+ const outcome = await deliverSubAgentResult(info, result);
332
+ chatNotFound = !!outcome?.chatNotFound;
288
333
  }
289
334
  catch (err) {
290
335
  console.error(`[async-watcher] failure delivery failed for ${entry.agentId}:`, err);
336
+ chatNotFound = isChatNotFoundError(err);
291
337
  }
292
338
  decrementPendingCount(entry.sessionKey);
339
+ return { chatNotFound };
293
340
  }
294
341
  // ── Test helpers ──────────────────────────────────────────────────
295
342
  /**
@@ -24,6 +24,22 @@ function isTelegramParseError(err) {
24
24
  const haystack = `${e.message ?? ""} ${e.description ?? ""}`;
25
25
  return /can't parse entities|can't find end of the entity/i.test(haystack);
26
26
  }
27
+ /**
28
+ * B3 — A Telegram send rejected because the TARGET CHAT DOES NOT EXIST
29
+ * (HTTP 400 "Bad Request: chat not found"). This is a permanent,
30
+ * non-recoverable condition: the chat id is invalid (e.g. the stale
31
+ * chat_id:1 test agent), so every retry will fail identically and just
32
+ * spam stderr. Distinct from transient failures (network, rate-limit)
33
+ * which ARE worth retrying. Matched narrowly on the chat-not-found
34
+ * signature only — never on generic Bad Request.
35
+ */
36
+ export function isChatNotFoundError(err) {
37
+ if (!err || typeof err !== "object")
38
+ return false;
39
+ const e = err;
40
+ const haystack = `${e.message ?? ""} ${e.description ?? ""}`;
41
+ return /chat not found/i.test(haystack);
42
+ }
27
43
  /**
28
44
  * Send a Markdown message with an automatic plain-text retry on parse
29
45
  * errors. Any other error propagates to the caller's outer catch.
@@ -40,7 +56,52 @@ async function sendWithMarkdownFallback(api, chatId, text) {
40
56
  }
41
57
  }
42
58
  const MAX_TG_CHUNK = 3800; // below Telegram's 4096 limit with headroom
43
- const FILE_UPLOAD_THRESHOLD = 20_000; // switch to .md file upload above this
59
+ // V56-T2 honesty fix — the .md file attachment is no longer gated on a
60
+ // separate 20k threshold. It now triggers whenever the cap actually
61
+ // truncates (isTruncated → body.length > BODY_CAP), so every truncated
62
+ // delivery carries the full output as a file and the marker is honest.
63
+ // (The prior 20k-only behavior is fully subsumed by isTruncated.)
64
+ /**
65
+ * V56-T2 (Layer-2) — honest hard cap on the INLINE delivered body.
66
+ *
67
+ * V56-T1 made delivery carry the SDK final result instead of the whole
68
+ * transcript, but a final result can itself occasionally be very long.
69
+ * This bounds the inline-message body so a single agent answer can't
70
+ * flood the chat, while staying HONEST.
71
+ *
72
+ * Honesty contract (fixed after a review found a self-defeating
73
+ * regression): whenever `capBody` actually truncates — i.e. the body is
74
+ * non-empty AND longer than BODY_CAP — the delivery ALSO attaches the
75
+ * COMPLETE uncapped output as a `.md` file via the same upload
76
+ * mechanism the old >20000-char path already used. The marker
77
+ * therefore truthfully says the full output is *attached*, instead of
78
+ * the previous wording that pointed at a `~/.alvin-bot/logs/` file the
79
+ * cap path never actually wrote. Net effect: any truncated delivery =
80
+ * bounded inline message + full `.md` attachment; no lossy inline-only
81
+ * range remains. The old >20000 path is unchanged (it already attached
82
+ * the full body); this just extends "attach the full file" down to
83
+ * "whenever the cap truncated".
84
+ *
85
+ * This is a pure bounded slice + a fixed marker — NOT a structure-
86
+ * guessing heuristic. It no-ops on empty/whitespace so the
87
+ * `(empty output)` truncated-run signal keeps working (and no spurious
88
+ * file is attached for it).
89
+ */
90
+ const BODY_CAP = 1800;
91
+ const TRUNCATION_MARKER = "…(truncated for chat — full output attached)";
92
+ /**
93
+ * True when `capBody` would actually truncate this body — the single
94
+ * source of truth for "did we drop content, so the full output must be
95
+ * attached as a file". Mirrors the `length > BODY_CAP` test in capBody.
96
+ */
97
+ function isTruncated(body) {
98
+ return body.length > BODY_CAP;
99
+ }
100
+ function capBody(body) {
101
+ if (body.length <= BODY_CAP)
102
+ return body;
103
+ return `${body.slice(0, BODY_CAP)}\n\n${TRUNCATION_MARKER}`;
104
+ }
44
105
  let injectedApi = null;
45
106
  let runtimeApi = null;
46
107
  /** Test-only hook for injecting a fake bot API. Production code must NEVER call this. */
@@ -251,28 +312,29 @@ export function createLiveStream(chatId, agentName) {
251
312
  * - "slack" / "discord" / "whatsapp" → delivery-registry lookup
252
313
  */
253
314
  export async function deliverSubAgentResult(info, result, opts = {}) {
315
+ const OK = { chatNotFound: false };
254
316
  // Implicit spawns: the Task-tool bridge in the main stream has already
255
317
  // surfaced the output; extra delivery would be duplication.
256
318
  if (info.source === "implicit")
257
- return;
319
+ return OK;
258
320
  const effective = opts.visibility ?? getVisibility();
259
321
  if (effective === "silent")
260
- return;
322
+ return OK;
261
323
  if (!info.parentChatId) {
262
324
  console.warn(`[subagent-delivery] missing parentChatId for ${info.name} (source=${info.source})`);
263
- return;
325
+ return OK;
264
326
  }
265
327
  // v4.14 — Platform routing. Telegram is the default path (unchanged).
266
328
  const platform = info.platform ?? "telegram";
267
329
  if (platform !== "telegram") {
268
330
  await deliverViaRegistry(platform, info, result);
269
- return;
331
+ return OK;
270
332
  }
271
333
  // ── Telegram path (v4.12.x behavior, unchanged) ──────────────────
272
334
  const api = getBotApi();
273
335
  if (!api) {
274
336
  console.warn(`[subagent-delivery] no bot api available for ${info.name}`);
275
- return;
337
+ return OK;
276
338
  }
277
339
  // Telegram's chatId is always a number at runtime; defensive cast.
278
340
  const tgChatId = typeof info.parentChatId === "number"
@@ -280,40 +342,70 @@ export async function deliverSubAgentResult(info, result, opts = {}) {
280
342
  : Number(info.parentChatId);
281
343
  if (!Number.isFinite(tgChatId)) {
282
344
  console.warn(`[subagent-delivery] invalid telegram chatId for ${info.name}`);
283
- return;
345
+ return OK;
284
346
  }
285
347
  const banner = buildBanner(info, result);
286
348
  const body = result.output?.trim() || `(empty output)`;
349
+ // V56-T2 — bounded variant for the INLINE message path. Whenever this
350
+ // actually truncates (isTruncated), the FULL uncapped `body` is also
351
+ // attached as a .md file below, so the cap never costs the user
352
+ // access to the complete result and the marker stays truthful.
353
+ const inlineBody = capBody(body);
287
354
  try {
288
- // Case 1: very long output → file upload with a short banner
289
- if (body.length > FILE_UPLOAD_THRESHOLD) {
355
+ // Truncated honest delivery: short banner + bounded inline body
356
+ // (with the truthful "full output attached" marker) + the COMPLETE
357
+ // uncapped body as a .md file. This single branch covers the whole
358
+ // truncated range (mid-size AND the old > 20000-char range): there
359
+ // is no lossy inline-only range anymore. (The old >20000 behavior
360
+ // is unchanged — it already attached the full body; the change is
361
+ // that mid-size now also attaches it and the marker no longer
362
+ // points at a logs file that was never written.)
363
+ if (isTruncated(body)) {
290
364
  await sendWithMarkdownFallback(api, tgChatId, banner);
365
+ // The bounded inline body fits in one message (BODY_CAP=1800 plus
366
+ // the short marker is well under MAX_TG_CHUNK); send it as plain
367
+ // text so an unbalanced markdown slice can't crash the send.
368
+ await api.sendMessage(tgChatId, inlineBody.slice(0, MAX_TG_CHUNK));
291
369
  try {
292
370
  const { InputFile } = await import("grammy");
293
371
  const buf = Buffer.from(body, "utf-8");
294
372
  await api.sendDocument(tgChatId, new InputFile(buf, `${info.name}.md`));
295
373
  }
296
374
  catch (err) {
375
+ // Upload failed → the bounded inline body was already delivered
376
+ // above, so the user still has something honest (banner + capped
377
+ // text + marker). The marker slightly over-promises here (file
378
+ // didn't attach) but this is the rare failure path, not the
379
+ // normal one, and there is no silent data loss.
297
380
  console.error(`[subagent-delivery] file upload failed:`, err);
298
- await api.sendMessage(tgChatId, body.slice(0, MAX_TG_CHUNK));
299
381
  }
300
- return;
382
+ return OK;
301
383
  }
302
- // Case 2: fits in a single message → banner + body joined
303
- if (body.length + banner.length + 2 <= MAX_TG_CHUNK) {
304
- await sendWithMarkdownFallback(api, tgChatId, `${banner}\n\n${body}`);
305
- return;
384
+ // Not truncated (body ≤ BODY_CAP) — unchanged passthrough.
385
+ // inlineBody === body here (capBody is a no-op), no marker, no file.
386
+ // Case A: fits in a single message → banner + body joined
387
+ if (inlineBody.length + banner.length + 2 <= MAX_TG_CHUNK) {
388
+ await sendWithMarkdownFallback(api, tgChatId, `${banner}\n\n${inlineBody}`);
389
+ return OK;
306
390
  }
307
- // Case 3: medium output → banner as its own message, body chunked
391
+ // Case B: defensive — a ≤1800-char body still under-runs MAX_TG_CHUNK
392
+ // with the banner, but keep the banner-then-chunk fallback for
393
+ // safety against an unusually long banner.
308
394
  await sendWithMarkdownFallback(api, tgChatId, banner);
309
- for (let i = 0; i < body.length; i += MAX_TG_CHUNK) {
395
+ for (let i = 0; i < inlineBody.length; i += MAX_TG_CHUNK) {
310
396
  // Body chunks are always sent as plain text — markdown across
311
397
  // arbitrary chunk boundaries would be inconsistent anyway.
312
- await api.sendMessage(tgChatId, body.slice(i, i + MAX_TG_CHUNK));
398
+ await api.sendMessage(tgChatId, inlineBody.slice(i, i + MAX_TG_CHUNK));
313
399
  }
400
+ return OK;
314
401
  }
315
402
  catch (err) {
316
403
  console.error(`[subagent-delivery] send failed for ${info.name}:`, err);
404
+ // B3 — report a permanent invalid-target failure so the watcher can
405
+ // abandon this agent instead of retrying it forever. Any other error
406
+ // (network, rate-limit, parse) is NOT reported as chatNotFound, so the
407
+ // agent's normal retry/timeout lifecycle is unchanged.
408
+ return { chatNotFound: isChatNotFoundError(err) };
317
409
  }
318
410
  }
319
411
  /**
@@ -336,36 +428,45 @@ async function deliverViaRegistry(platform, info, result) {
336
428
  const chatId = info.parentChatId;
337
429
  const banner = buildBannerPlain(info, result);
338
430
  const body = result.output?.trim() || `(empty output)`;
431
+ // V56-T2 — same honest contract as the Telegram path. Whenever the
432
+ // cap truncates, the FULL uncapped `body` is attached as a .md file
433
+ // (if the adapter supports uploads) so the marker stays truthful and
434
+ // the complete output remains accessible.
435
+ const inlineBody = capBody(body);
339
436
  const NON_TG_CHUNK = 3800;
340
- const FILE_THRESHOLD = 20_000;
341
437
  try {
342
- // Very long output → file upload if supported, else truncated text
343
- if (body.length > FILE_THRESHOLD) {
438
+ // Truncated → honest delivery: banner + bounded inline body (with
439
+ // the truthful "full output attached" marker) + the COMPLETE
440
+ // uncapped body as a .md file. Covers the whole truncated range
441
+ // (mid-size AND > the old 20k threshold) — no lossy inline-only
442
+ // range remains. If the adapter has no sendDocument or the upload
443
+ // fails, the bounded inline body still went out (honest, just no
444
+ // file) — no silent data loss.
445
+ if (isTruncated(body)) {
344
446
  await adapter.sendText(chatId, banner);
447
+ for (let i = 0; i < inlineBody.length; i += NON_TG_CHUNK) {
448
+ await adapter.sendText(chatId, inlineBody.slice(i, i + NON_TG_CHUNK));
449
+ }
345
450
  if (adapter.sendDocument) {
346
451
  try {
347
452
  await adapter.sendDocument(chatId, Buffer.from(body, "utf-8"), `${info.name}.md`);
348
- return;
349
453
  }
350
454
  catch (err) {
351
455
  console.error(`[subagent-delivery] ${platform} file upload failed:`, err);
352
456
  }
353
457
  }
354
- // Fallback: chunked text if no file upload or upload failed
355
- for (let i = 0; i < body.length; i += NON_TG_CHUNK) {
356
- await adapter.sendText(chatId, body.slice(i, i + NON_TG_CHUNK));
357
- }
358
458
  return;
359
459
  }
360
- // Fits in one message → combined
361
- if (body.length + banner.length + 2 <= NON_TG_CHUNK) {
362
- await adapter.sendText(chatId, `${banner}\n\n${body}`);
460
+ // Not truncated (body ≤ BODY_CAP) → unchanged passthrough.
461
+ // inlineBody === body here, no marker, no file.
462
+ if (inlineBody.length + banner.length + 2 <= NON_TG_CHUNK) {
463
+ await adapter.sendText(chatId, `${banner}\n\n${inlineBody}`);
363
464
  return;
364
465
  }
365
- // Medium → banner first, then chunked body
466
+ // Defensive banner-then-chunk fallback (e.g. unusually long banner).
366
467
  await adapter.sendText(chatId, banner);
367
- for (let i = 0; i < body.length; i += NON_TG_CHUNK) {
368
- await adapter.sendText(chatId, body.slice(i, i + NON_TG_CHUNK));
468
+ for (let i = 0; i < inlineBody.length; i += NON_TG_CHUNK) {
469
+ await adapter.sendText(chatId, inlineBody.slice(i, i + NON_TG_CHUNK));
369
470
  }
370
471
  }
371
472
  catch (err) {
@@ -288,7 +288,9 @@ async function runSubAgent(id, agentConfig, abort, resolvedName) {
288
288
  : os.homedir();
289
289
  const systemPrompt = `You are a sub-agent named "${resolvedName}". Complete the following task autonomously. Working directory: ${effectiveCwd}
290
290
 
291
- When done, return ONLY the final result/outcome, concisely. Do NOT narrate your intermediate steps, your reasoning, your tool calls, or a play-by-play of what you did the orchestrator only needs the outcome (the answer, the report, the list, the artifact path), and on failure the error plus what was and wasn't done. No preamble, no "Here's what I did", no step-by-step recap. Run status, duration and token usage are reported separately, so don't restate them.`;
291
+ Do NOT send your own Telegram/chat/notification messages as a step, and do NOT use any tool or skill to message the user or post your progress your final return value is the SOLE delivery path and the orchestrator delivers it for you. A self-sent message causes a duplicate the user sees twice.
292
+
293
+ When done, return ONLY the final result/outcome itself, concisely — nothing else. Do NOT narrate, summarize, or recap your intermediate steps, your reasoning, your tool calls, your plan, or a play-by-play of what you did. The orchestrator needs ONLY the outcome (the answer, the report, the list, the artifact path); on failure, return the error plus exactly what was and wasn't done. No preamble, no meta-commentary, no "Here's what I did", no "I will now…", no step-by-step recap. Run status, duration and token usage are reported separately, so don't restate them.`;
292
294
  // v4.12.2 — Map the toolset preset to an explicit allowedTools list.
293
295
  // The provider honors this override (see src/providers/claude-sdk-provider.ts
294
296
  // line ~140). Passing undefined = full access (provider default).
@@ -326,10 +328,22 @@ When done, return ONLY the final result/outcome, concisely. Do NOT narrate your
326
328
  }
327
329
  }
328
330
  if (chunk.type === "done") {
329
- // done.text is the authoritative final accumulated text from
330
- // the provider. Prefer it over the buffered value so runs that
331
- // end on a tool_use don't leave us with a pre-tool snippet.
332
- if (chunk.text && chunk.text.length > 0) {
331
+ // V56-T1 — Prefer the SDK's authoritative FINAL result over the
332
+ // accumulated narration. The Claude Agent SDK emits a terminal
333
+ // `result` message whose single `result` field IS the agent's
334
+ // actual outcome; the provider surfaces it as `chunk.finalResult`.
335
+ // Using it here excludes the step-by-step narration BY
336
+ // CONSTRUCTION (it's a distinct SDK field, not a heuristic over
337
+ // concatenated text), matching what the detached-dispatch path
338
+ // already does. When the provider has no distinct final-result
339
+ // message (non-SDK providers, SDK error results), finalResult is
340
+ // undefined and we fall back to done.text — the previous
341
+ // authoritative-accumulated-text behaviour, so streamed-text
342
+ // consumers and the Fix #5 contract are unaffected.
343
+ if (typeof chunk.finalResult === "string" && chunk.finalResult.length > 0) {
344
+ finalText = chunk.finalResult;
345
+ }
346
+ else if (chunk.text && chunk.text.length > 0) {
333
347
  finalText = chunk.text;
334
348
  }
335
349
  inputTokens = chunk.inputTokens || 0;
@@ -17,6 +17,15 @@ export class TelegramStreamer {
17
17
  this.api = api;
18
18
  this.replyTo = replyToMessageId;
19
19
  }
20
+ /**
21
+ * True when at least one message has been sent to the user (i.e. messageId
22
+ * is set). Used by the A3 suppress-undelivered guard in message.ts to
23
+ * determine whether visible text has already reached the user — if so, the
24
+ * no-retract invariant prevents suppressing the final send.
25
+ */
26
+ get hasSentText() {
27
+ return this.messageId !== null;
28
+ }
20
29
  /**
21
30
  * Set a transient status line (e.g. "📖 Read file.html…") that gets
22
31
  * appended to the current accumulated text. Passing null clears it.
@@ -33,12 +33,81 @@
33
33
  * ALVIN_TRENDS_INTERVAL_HOURS=24 → snapshot cadence
34
34
  * ALVIN_TRENDS_AI_AFTER_DAYS=7 → days of data before AI analysis kicks in
35
35
  */
36
- import { appendFileSync, existsSync, readFileSync, mkdirSync } from "fs";
36
+ import { appendFileSync, existsSync, readFileSync, writeFileSync, mkdirSync } from "fs";
37
37
  import { join, dirname } from "path";
38
38
  import { homedir } from "os";
39
39
  import { BOT_VERSION } from "../version.js";
40
40
  import { emitCritical } from "./critical-notify.js";
41
41
  const TRENDS_PATH = join(homedir(), ".alvin-bot", "state", "trends.jsonl");
42
+ /**
43
+ * B2 — peak-uptime high-water mark. The trends collector takes its FIRST
44
+ * snapshot ~60s after every boot (startTrendsCollector schedules it at
45
+ * 60_000ms). takeSnapshot() records uptime_s = process.uptime(), so the
46
+ * first post-restart sample is structurally ≈ 62s. With deliberate
47
+ * restarts (/update, launchctl reload) those ~62s samples dominate
48
+ * trends.jsonl, so the 30-day AI pass perpetually concludes "restart
49
+ * loop, never lives past ~62s" even when the process has actually been
50
+ * continuously up for hours by the time the daily snapshot fires.
51
+ *
52
+ * Fix: persist the MAXIMUM real uptime this bot has ever observed (across
53
+ * process generations) and record it on every snapshot as uptime_peak_s.
54
+ * The peak only ever derives from process.uptime() — it is never
55
+ * fabricated or extrapolated. The anomaly evaluation then keys on the
56
+ * peak (hasRepresentativeUptime), so a process that genuinely lived for
57
+ * hours is not flagged as a ~62s loop, while a genuine fast-restart loop
58
+ * (peak never climbs past the startup transient) still fires.
59
+ *
60
+ * Stored next to trends.jsonl (state/), honoring ALVIN_DATA_DIR so tests
61
+ * and non-default installs work. Survives restarts by design — that is
62
+ * the whole point of a high-water mark.
63
+ */
64
+ function trendsStateDir() {
65
+ const base = process.env.ALVIN_DATA_DIR || join(homedir(), ".alvin-bot");
66
+ return join(base, "state");
67
+ }
68
+ function uptimePeakPath() {
69
+ return join(trendsStateDir(), "uptime-peak.json");
70
+ }
71
+ /**
72
+ * The startup transient: takeSnapshot's first sample is taken ~60s after
73
+ * boot, so any uptime at/under this is indistinguishable from "just
74
+ * restarted". An uptime ABOVE this proves the process actually lived past
75
+ * the post-restart sampling window. 600s (10 min) is comfortably above
76
+ * the 60s first-sample delay + scheduling jitter and far below the 24h
77
+ * cron cadence, so a healthy bot trivially clears it while a real
78
+ * crash-loop (exits within seconds/a couple minutes) never does.
79
+ */
80
+ export const STARTUP_TRANSIENT_S = 600;
81
+ /**
82
+ * Read the persisted peak uptime, fold in the CURRENT real uptime, persist
83
+ * the (possibly larger) high-water mark, and return it. Pure w.r.t. time
84
+ * sources: the only uptime input is process.uptime() — nothing invented.
85
+ * Disk failures degrade gracefully to the current real uptime.
86
+ */
87
+ function bumpAndReadUptimePeak() {
88
+ const currentReal = Math.round(process.uptime());
89
+ let stored = 0;
90
+ try {
91
+ const raw = readFileSync(uptimePeakPath(), "utf-8");
92
+ const parsed = JSON.parse(raw);
93
+ if (typeof parsed.peak_s === "number" && Number.isFinite(parsed.peak_s) && parsed.peak_s > 0) {
94
+ stored = parsed.peak_s;
95
+ }
96
+ }
97
+ catch {
98
+ // No file yet / unreadable — start the high-water mark from the
99
+ // current real uptime. Not an error.
100
+ }
101
+ const peak = Math.max(stored, currentReal);
102
+ try {
103
+ mkdirSync(trendsStateDir(), { recursive: true });
104
+ writeFileSync(uptimePeakPath(), JSON.stringify({ peak_s: peak }), "utf-8");
105
+ }
106
+ catch {
107
+ // Disk full / permissions — non-fatal; we still return the in-memory peak.
108
+ }
109
+ return peak;
110
+ }
42
111
  const DEFAULT_INTERVAL_HOURS = 24;
43
112
  const DEFAULT_AI_THRESHOLD_DAYS = 7;
44
113
  const MAX_RETAIN_DAYS = 90;
@@ -54,6 +123,18 @@ const MAX_RETAIN_DAYS = 90;
54
123
  * (a successful, expected fallback — not an error)
55
124
  * - critical-notify's own delivery-outcome line, kept on stderr on
56
125
  * purpose so it stays visible even in brake/crash context
126
+ * - B3: subagent-delivery's "send failed … chat not found" line for a
127
+ * stale/test async-agent whose delivery target chat no longer exists
128
+ * (e.g. the recurring chat_id:1 test agent). This is benign noise,
129
+ * not a real fault: the target chat is invalid, the watcher now
130
+ * abandons such agents (see async-agent-watcher.ts), and counting it
131
+ * made errors_24h creep upward indefinitely on every poll cycle.
132
+ * The match is DELIBERATELY narrow — it requires BOTH the
133
+ * `[subagent-delivery] send failed` prefix AND a `chat not found`
134
+ * cause on the same line. A subagent-delivery failure for ANY other
135
+ * reason (network, rate-limit, parse) is still counted, and a
136
+ * `chat not found` from ANY OTHER subsystem (a real misconfigured
137
+ * target) is still counted.
57
138
  *
58
139
  * Counting those turned this very monitor into a false-alarm generator:
59
140
  * it flagged its OWN log lines plus every release's restart churn, so
@@ -65,7 +146,7 @@ const MAX_RETAIN_DAYS = 90;
65
146
  * any, get added here in one place instead of being chased across the
66
147
  * codebase.
67
148
  */
68
- export const ERR_LOG_PATTERN = /^(?!.*(?:\[critical-notify\]|\[subagent-delivery\] Markdown parse failed)).+/;
149
+ export const ERR_LOG_PATTERN = /^(?!.*(?:\[critical-notify\]|\[subagent-delivery\] Markdown parse failed|\[subagent-delivery\] send failed.*chat not found)).+/;
69
150
  let trendsTimer = null;
70
151
  function isDisabled() {
71
152
  return (process.env.ALVIN_DISABLE_TRENDS === "true" ||
@@ -134,6 +215,7 @@ function takeSnapshot(activeProvider) {
134
215
  return {
135
216
  ts: new Date().toISOString(),
136
217
  uptime_s: Math.round(process.uptime()),
218
+ uptime_peak_s: bumpAndReadUptimePeak(),
137
219
  rss_mb: Math.round(mem.rss / 1024 / 1024),
138
220
  heap_mb: Math.round(mem.heapUsed / 1024 / 1024),
139
221
  crashes_24h: readWatchdogCrashes24h(),
@@ -195,6 +277,139 @@ SUGGESTION: <one shell command OR observation for the operator>
195
277
  --- LAST {N} DAYS OF SNAPSHOTS ---
196
278
  {SNAPSHOTS}
197
279
  --- END ---`;
280
+ /**
281
+ * V56 — Recent crash-evidence window.
282
+ *
283
+ * hasRealCrashEvidence keys the WARN-suppression gate on whether ANY
284
+ * persisted snapshot recorded a real crash. Snapshots persist for up to
285
+ * MAX_RETAIN_DAYS and the AI pass reads the last 30 (≈30 days at the 24h
286
+ * cadence). If the WHOLE 30-day history is considered, a history briefly
287
+ * poisoned by miscounted deliberate restarts (pre-v5.5.0 accounting bug,
288
+ * fixed in v5.5.0 for NEW snapshots but the bad lines persist ~30 days)
289
+ * keeps crash-evidence "true" — so the B2/B4 gate never suppresses and the
290
+ * false WARN fires for ~a month instead of self-healing.
291
+ *
292
+ * Restricting the evidence check to the most recent ~48h means: once
293
+ * v5.5.0's correct accounting produces clean recent snapshots
294
+ * (crashes_24h=0), the false WARN clears within ~a day — while a GENUINE
295
+ * crash loop (real crashes in the recent window) still returns true and
296
+ * the WARN still fires (the protective purpose is intact).
297
+ *
298
+ * 48h (not 24h) is chosen because the snapshot cadence is ~24h
299
+ * (DEFAULT_INTERVAL_HOURS): a 48h window reliably retains the last 1–2
300
+ * daily snapshots even across day-boundary jitter / a skipped cron tick,
301
+ * so a genuine recent crash loop is never missed, while crash evidence
302
+ * older than ~2 days (the poisoned history) ages out and self-heals. A
303
+ * timestamp window (not "last N snapshots") is used so self-healing keys
304
+ * on real wall-clock time and is robust to cadence changes / test-tuned
305
+ * ALVIN_TRENDS_INTERVAL_HOURS.
306
+ */
307
+ export const RECENT_CRASH_WINDOW_MS = 48 * 60 * 60 * 1000;
308
+ /**
309
+ * Returns true if at least one snapshot WITHIN THE RECENT WINDOW has a
310
+ * non-zero crashes_24h value, meaning a REAL crash (not an
311
+ * expected/deliberate restart) was recorded recently.
312
+ *
313
+ * After the B1 fix, deliberate restarts (SIGTERM / launchctl reload /
314
+ * /restart / /update) write the expectedRestart beacon flag and are NOT
315
+ * counted in dailyCrashCount. So crashes_24h === 0 across the recent
316
+ * snapshots means the bot was only restarted intentionally — no real
317
+ * crash evidence — even if OLDER snapshots were poisoned by the
318
+ * pre-v5.5.0 miscount (those age out of the window and the false WARN
319
+ * self-heals; see RECENT_CRASH_WINDOW_MS).
320
+ *
321
+ * Recency is determined from each snapshot's `ts` (ISO 8601, written by
322
+ * takeSnapshot via new Date().toISOString()). FAIL-SAFE: a snapshot whose
323
+ * `ts` is missing or unparseable is treated as in-window (counted) — a
324
+ * health monitor must fail toward "visible", never go blind on bad data.
325
+ *
326
+ * Pure function, exported for unit testing.
327
+ */
328
+ export function hasRealCrashEvidence(snaps, nowMs = Date.now()) {
329
+ const cutoff = nowMs - RECENT_CRASH_WINDOW_MS;
330
+ return snaps.some((s) => {
331
+ if (!(typeof s.crashes_24h === "number" && s.crashes_24h > 0))
332
+ return false;
333
+ // FAIL-SAFE: no/garbage ts → treat as recent (never silence on bad data).
334
+ if (typeof s.ts !== "string")
335
+ return true;
336
+ const t = Date.parse(s.ts);
337
+ if (!Number.isFinite(t))
338
+ return true;
339
+ return t >= cutoff;
340
+ });
341
+ }
342
+ /**
343
+ * B2 — Returns true if AT LEAST ONE snapshot proves the bot process
344
+ * genuinely lived past the startup transient (i.e. it is NOT a ~62s
345
+ * restart loop).
346
+ *
347
+ * The first per-boot snapshot is structurally taken ~60s after boot, so
348
+ * its raw uptime_s is always ≈ 62 regardless of how long the process
349
+ * subsequently runs. uptime_peak_s is the high-water mark of REAL
350
+ * process.uptime() carried across process generations, so a single
351
+ * snapshot whose peak exceeds STARTUP_TRANSIENT_S is hard evidence the
352
+ * process did live for a representative duration. Legacy pre-B2 lines
353
+ * have no uptime_peak_s — we fall back to their raw uptime_s, so a legacy
354
+ * 24h cron snapshot still counts as representative on its own.
355
+ *
356
+ * A genuine fast-restart loop never lets the peak climb past the
357
+ * transient, so it correctly returns false and the WARN still fires.
358
+ *
359
+ * Pure function, exported for unit testing.
360
+ */
361
+ export function hasRepresentativeUptime(snaps) {
362
+ return snaps.some((s) => {
363
+ const peak = typeof s.uptime_peak_s === "number" && Number.isFinite(s.uptime_peak_s)
364
+ ? s.uptime_peak_s
365
+ : typeof s.uptime_s === "number" && Number.isFinite(s.uptime_s)
366
+ ? s.uptime_s
367
+ : 0;
368
+ return peak > STARTUP_TRANSIENT_S;
369
+ });
370
+ }
371
+ /**
372
+ * B2/B4 — Pure crash/restart WARN suppression decision.
373
+ *
374
+ * Encodes the SAME two gates, in the SAME precedence, that dailyTask
375
+ * applies inline (B2 before B4). Extracted as a pure function purely so
376
+ * the gate COMPOSITION (not just each helper in isolation) is unit
377
+ * testable — the helpers are individually correct but the interaction
378
+ * is where the real-crash-loop-after-a-healthy-period regression lives.
379
+ *
380
+ * Returns the suppression reason, or "none" when the WARN must fire.
381
+ *
382
+ * - "representative-uptime" (B2): a deliberate-restart / sampling
383
+ * artifact — the AI saw ~62s uptimes but a snapshot peak proves the
384
+ * process actually lived past the startup transient. ONLY applies
385
+ * when there is no real crash evidence: a genuine crash loop after a
386
+ * prior healthy period still carries the persisted high peak, so
387
+ * without the crash-evidence guard B2 would permanently and silently
388
+ * swallow it. With the guard, crashes_24h>0 falls through to B4.
389
+ * - "no-crash-evidence" (B4): crash/restart pattern but crashes_24h===0
390
+ * everywhere (deliberate-restart-only, not a real crash loop).
391
+ * - "none": the WARN is real and must be emitted.
392
+ *
393
+ * Pure function, exported for unit testing.
394
+ */
395
+ export function evaluateCrashRestartSuppression(isCrashRestartPattern, snaps) {
396
+ if (!isCrashRestartPattern)
397
+ return "none";
398
+ const realCrash = hasRealCrashEvidence(snaps);
399
+ // B2: only the deliberate-restart / sampling-artifact case. A real
400
+ // crash loop (crashes_24h>0) must NOT be suppressed here even though
401
+ // the persisted uptime high-water mark still reads representative.
402
+ if (!realCrash && hasRepresentativeUptime(snaps))
403
+ return "representative-uptime";
404
+ // B4: crash/restart pattern with zero real crash evidence.
405
+ if (!realCrash)
406
+ return "no-crash-evidence";
407
+ return "none";
408
+ }
409
+ /** Test-only: take a snapshot without writing to trends.jsonl. */
410
+ export function __takeSnapshotForTest(activeProvider) {
411
+ return takeSnapshot(activeProvider);
412
+ }
198
413
  function parseTrendResponse(text) {
199
414
  if (/^ANOMALY:\s*NONE/im.test(text)) {
200
415
  return {
@@ -296,6 +511,38 @@ async function dailyTask(registry) {
296
511
  console.log(`📊 Trends AI: no anomaly detected`);
297
512
  return;
298
513
  }
514
+ const recentSnaps = readSnapshots(30);
515
+ const isCrashRestartPattern = /crash|restart|loop|uptime/i.test(result.description);
516
+ // B2 gate: suppress an "uptime stuck at ~62s / restart loop" WARN when
517
+ // the snapshots PROVE the process actually lived past the startup
518
+ // transient. The first per-boot snapshot is structurally sampled ~60s
519
+ // after boot, so raw uptime_s reads ≈62 even for a perfectly healthy
520
+ // bot that has been up for hours by the time the daily snapshot fires.
521
+ // uptime_peak_s is the high-water mark of real process.uptime() across
522
+ // process generations: if ANY snapshot's peak exceeds the transient,
523
+ // the "~62s loop" conclusion is factually false. A genuine fast-restart
524
+ // loop never lets the peak climb, so it is NOT suppressed here.
525
+ if (isCrashRestartPattern && !hasRealCrashEvidence(recentSnaps) && hasRepresentativeUptime(recentSnaps)) {
526
+ console.log(`📊 Trends AI: suppressed WARN "${result.description}" — ` +
527
+ `uptime/restart pattern flagged but at least one snapshot shows a ` +
528
+ `representative peak uptime (>${STARTUP_TRANSIENT_S}s); the process ` +
529
+ `did live well past the post-restart sampling window, not a ~62s loop`);
530
+ return;
531
+ }
532
+ // B4 gate: suppress WARN when the AI flags a crash/restart-loop pattern
533
+ // but the recent-window snapshots contain ZERO real crash evidence
534
+ // (crashes_24h === 0 within RECENT_CRASH_WINDOW_MS). This happens when the bot was
535
+ // restarted deliberately (launchctl reload / /update / /restart) — those
536
+ // produce low uptimes that the AI reads as "restart loop", but the
537
+ // crash counter stays at 0 because markExpectedRestart() was written
538
+ // on each clean shutdown. A real crash loop WILL have crashes_24h > 0
539
+ // in at least one snapshot and will still fire the WARN.
540
+ if (isCrashRestartPattern && !hasRealCrashEvidence(recentSnaps)) {
541
+ console.log(`📊 Trends AI: suppressed WARN "${result.description}" — ` +
542
+ `crash/restart pattern detected but crashes_24h=0 across all snapshots ` +
543
+ `(deliberate-restart-only, not a real crash loop)`);
544
+ return;
545
+ }
299
546
  console.log(`📊 Trends AI: ANOMALY (${result.severity}) — ${result.description}`);
300
547
  emitCritical({
301
548
  category: "custom",
package/package.json CHANGED
@@ -1,6 +1,6 @@
1
1
  {
2
2
  "name": "alvin-bot",
3
- "version": "5.4.0",
3
+ "version": "5.6.0",
4
4
  "description": "Alvin Bot — Your personal AI agent on Telegram, WhatsApp, Discord, Signal, and Web.",
5
5
  "type": "module",
6
6
  "main": "dist/index.js",