alvin-bot 5.4.0 → 5.6.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/CHANGELOG.md +54 -0
- package/dist/handlers/commands.js +23 -2
- package/dist/handlers/message.js +74 -16
- package/dist/i18n.js +15 -0
- package/dist/index.js +7 -1
- package/dist/providers/claude-sdk-provider.js +14 -0
- package/dist/services/async-agent-watcher.js +53 -6
- package/dist/services/subagent-delivery.js +133 -32
- package/dist/services/subagents.js +19 -5
- package/dist/services/telegram.js +9 -0
- package/dist/services/trends.js +249 -2
- package/package.json +1 -1
package/CHANGELOG.md
CHANGED
|
@@ -2,6 +2,60 @@
|
|
|
2
2
|
|
|
3
3
|
All notable changes to Alvin Bot are documented here.
|
|
4
4
|
|
|
5
|
+
## [5.6.0] — 2026-05-18
|
|
6
|
+
|
|
7
|
+
### Background-task reports are now clean and to the point
|
|
8
|
+
|
|
9
|
+
When a scheduled or background task finishes, Alvin now sends you
|
|
10
|
+
just the result — a tight header (what ran, how long, tokens, success)
|
|
11
|
+
and the actual answer — instead of a wall of its working notes. If a
|
|
12
|
+
result is unusually long, the chat message stays short and the
|
|
13
|
+
complete output comes attached as a file, so you never lose anything
|
|
14
|
+
and never have to scroll through a transcript.
|
|
15
|
+
|
|
16
|
+
### A clear confirmation when you stop something
|
|
17
|
+
|
|
18
|
+
Press ⛔ Stop (or use /cancel) while Alvin is genuinely working and
|
|
19
|
+
you now get a short, plain confirmation in your language that the work
|
|
20
|
+
was halted — not just a fleeting button flash. If nothing was running,
|
|
21
|
+
Alvin still tells you that honestly instead of pretending it stopped
|
|
22
|
+
something.
|
|
23
|
+
|
|
24
|
+
### Health alerts that don't cry wolf
|
|
25
|
+
|
|
26
|
+
Alvin's self-monitoring now judges its health on recent activity, so a
|
|
27
|
+
one-off rough patch no longer keeps it flagging a problem for weeks. A
|
|
28
|
+
real issue still raises a flag promptly; a quiet, healthy bot stays
|
|
29
|
+
quiet.
|
|
30
|
+
|
|
31
|
+
As always, this shipped after a full multi-pass review and a
|
|
32
|
+
fresh-install + stress verification on a clean separate machine.
|
|
33
|
+
|
|
34
|
+
## [5.5.0] — 2026-05-18
|
|
35
|
+
|
|
36
|
+
### The ⛔ Stop button now responds instantly — and honestly
|
|
37
|
+
|
|
38
|
+
Stopping a task is now crisp and truthful. The moment a task finishes,
|
|
39
|
+
the Stop button disappears, so you're never tapping a control for
|
|
40
|
+
something that's already done. And the feedback always matches reality:
|
|
41
|
+
if you tap Stop while Alvin is genuinely working, it stops and says so;
|
|
42
|
+
if the task had already completed, Alvin tells you that plainly instead
|
|
43
|
+
of implying it cut something short. If you hit Stop in that brief moment
|
|
44
|
+
while an answer is being prepared, that answer is now held back — "I
|
|
45
|
+
stopped it" means nothing more arrives. Anything Alvin had already
|
|
46
|
+
shown you stays exactly as it was.
|
|
47
|
+
|
|
48
|
+
### Fewer false alerts — smarter health monitoring
|
|
49
|
+
|
|
50
|
+
Alvin's self-monitoring got a lot more trustworthy. A planned restart
|
|
51
|
+
or an update is no longer mistaken for a problem, and the daily health
|
|
52
|
+
summary only raises a flag when there's real evidence something is
|
|
53
|
+
actually wrong — so the alerts you do get are ones worth reading.
|
|
54
|
+
Routine background housekeeping no longer shows up as noise.
|
|
55
|
+
|
|
56
|
+
As always, this shipped after a full multi-pass review and a
|
|
57
|
+
fresh-install + stress verification on a clean separate machine.
|
|
58
|
+
|
|
5
59
|
## [5.4.0] — 2026-05-18
|
|
6
60
|
|
|
7
61
|
### Smoother background tasks — and Alvin always tells you the truth
|
|
package/dist/handlers/commands.js
CHANGED
|
@@ -1918,6 +1918,10 @@ export function registerCommands(bot) {
|
|
|
1918
1918
|
if (session.isProcessing) {
|
|
1919
1919
|
requestStop(session, "soft", buildStopDeps(session));
|
|
1920
1920
|
await ctx.reply(t("bot.cancel.cancelling", lang));
|
|
1921
|
+
// V56-T2c — a real stop fired: follow the "cancelling…" notice with a
|
|
1922
|
+
// brief confirmation that the work was actually halted (consistent UX
|
|
1923
|
+
// with the ⛔ button). Best-effort — must never throw into the handler.
|
|
1924
|
+
await ctx.reply(t("bot.cancel.confirmed", lang)).catch(() => { });
|
|
1921
1925
|
}
|
|
1922
1926
|
else {
|
|
1923
1927
|
await ctx.reply(t("bot.cancel.noRunning", lang));
|
|
@@ -1946,17 +1950,34 @@ export function registerCommands(bot) {
|
|
|
1946
1950
|
const sessionKey = ctx.match[1];
|
|
1947
1951
|
const session = getSession(sessionKey);
|
|
1948
1952
|
const lang = session.language;
|
|
1949
|
-
|
|
1953
|
+
// A1 — Capture isProcessing BEFORE requestStop (which sets it false)
|
|
1954
|
+
// so we can show the right toast: "stopped" vs "already finished".
|
|
1955
|
+
const wasProcessing = session.isProcessing;
|
|
1956
|
+
if (wasProcessing) {
|
|
1950
1957
|
requestStop(session, "soft", buildStopDeps(session));
|
|
1951
1958
|
}
|
|
1959
|
+
// A1 — Honest toast: if the turn had already finished when the button was
|
|
1960
|
+
// tapped, don't claim "stopped" — tell the user it was already done.
|
|
1961
|
+
const toastKey = wasProcessing
|
|
1962
|
+
? "bot.cancel.stoppedToast"
|
|
1963
|
+
: "bot.cancel.alreadyDone";
|
|
1952
1964
|
try {
|
|
1953
|
-
await ctx.answerCallbackQuery({ text: t(
|
|
1965
|
+
await ctx.answerCallbackQuery({ text: t(toastKey, lang) });
|
|
1954
1966
|
}
|
|
1955
1967
|
catch { /* harmless grammy race */ }
|
|
1956
1968
|
try {
|
|
1957
1969
|
await ctx.editMessageReplyMarkup({});
|
|
1958
1970
|
}
|
|
1959
1971
|
catch { /* harmless grammy race — message may already be gone */ }
|
|
1972
|
+
// V56-T2c — when a real stop genuinely fired (wasProcessing), also send a
|
|
1973
|
+
// short in-chat confirmation in the session language so the user gets a
|
|
1974
|
+
// persistent acknowledgement, not only the ephemeral toast. When nothing
|
|
1975
|
+
// was running we deliberately stay silent here (v5.5.0 honesty: the
|
|
1976
|
+
// alreadyDone toast already told the truth). Best-effort — must never
|
|
1977
|
+
// throw into the handler.
|
|
1978
|
+
if (wasProcessing) {
|
|
1979
|
+
await ctx.reply(t("bot.cancel.confirmed", lang)).catch(() => { });
|
|
1980
|
+
}
|
|
1960
1981
|
});
|
|
1961
1982
|
// /restart — trigger a PM2-managed restart by exiting the process.
|
|
1962
1983
|
// The PM2 supervisor picks up the exit and respawns with --update-env.
|
package/dist/handlers/message.js
CHANGED
|
@@ -122,6 +122,37 @@ const TOOL_ICONS = {
|
|
|
122
122
|
WebFetch: "📡",
|
|
123
123
|
Task: "🤖",
|
|
124
124
|
};
|
|
125
|
+
// ── A3 — stop-suppress-undelivered pure predicate ────────────────────────────
|
|
126
|
+
/**
 * Determine whether the final answer send should be suppressed because a stop
 * was requested and no visible text has yet been delivered to the user.
 *
 * This closes the gap behind "I clicked Stop but it answered anyway": the
 * Claude SDK delivers short answers atomically, so the for-await loop parks
 * on IPC the whole time, and the complete answer arrives as one block. By the
 * time the consumer bail fires at the top of the loop, the answer is computed
 * and about to be sent. This guard is the only stoppable moment for atomic
 * answers.
 *
 * HARD CONSTRAINT — no-retract invariant: if ANY visible text has already
 * been streamed/committed to the user (visibleTextAlreadySent=true), the
 * predicate returns false regardless of stop state. Partial output that
 * already reached the user is NEVER retracted. The consumer bail in the
 * for-await loop already handles mid-stream stops; this guard only acts on
 * the final commit step.
 *
 * Truth table:
 *   stopRequested=truthy  + visibleTextAlreadySent=false → true  (suppress)
 *   stopRequested=truthy  + visibleTextAlreadySent=true  → false (no-retract)
 *   stopRequested=falsy   + *                            → false (normal)
 *
 * @param {{ stopRequested: unknown, visibleTextAlreadySent: unknown }} args
 *   Both fields are evaluated by truthiness only, so any truthy stop marker
 *   counts as "stop requested".
 * @returns {boolean} `true` only when a stop was requested and nothing
 *   visible has been sent yet; always a strict boolean.
 */
export function shouldSuppressFinalSend(args) {
    // Short-circuit keeps the original evaluation order: the "already sent"
    // flag is only consulted once a stop is known to be requested.
    return Boolean(args.stopRequested) && !args.visibleTextAlreadySent;
}
|
|
125
156
|
// ── v5.2 live steering — pure routing helper ─────────────────────────────────
|
|
126
157
|
/**
|
|
127
158
|
* Decide how a mid-task message (arriving while `session.isProcessing`) should
|
|
@@ -785,19 +816,45 @@ export async function handleMessage(ctx) {
|
|
|
785
816
|
/* harmless — notice is best-effort */
|
|
786
817
|
}
|
|
787
818
|
}
|
|
788
|
-
// v5.1 stop: user stopped this query — do NOT finalize partial output
|
|
789
|
-
// as a successful answer, no 👍, no history commit. The stop trigger
|
|
790
|
-
// (/cancel | /stopall | ⛔ button) already acknowledged to the user.
|
|
791
|
-
// The `finally` still runs (clears isProcessing/_qHandle/_stopRequested
|
|
792
|
-
// + typing indicator).
|
|
793
|
-
if (session._stopRequested) {
|
|
794
|
-
return;
|
|
795
|
-
}
|
|
796
819
|
if (bypassAborted) {
|
|
797
820
|
// v4.12.3 — Bypass path took over; don't finalize, don't react 👍.
|
|
798
821
|
// Just clean up and return. The finally block still fires.
|
|
799
822
|
return;
|
|
800
823
|
}
|
|
824
|
+
// A3 — Suppress-or-finalize gate for stopped turns.
|
|
825
|
+
//
|
|
826
|
+
// shouldSuppressFinalSend is the SINGLE gate controlling whether finalize runs:
|
|
827
|
+
//
|
|
828
|
+
// stop + no visible text (suppress=true):
|
|
829
|
+
// Skip finalize and all side-effects. Nothing reached the user — correct.
|
|
830
|
+
// The stop trigger (/cancel | /stopall | ⛔) already acknowledged this.
|
|
831
|
+
// The `finally` still runs (clears isProcessing/_qHandle/_stopRequested
|
|
832
|
+
// + typing indicator).
|
|
833
|
+
//
|
|
834
|
+
// stop + visible text already sent (suppress=false, _stopRequested truthy):
|
|
835
|
+
// The no-retract invariant applies — partial output already shown must not
|
|
836
|
+
// be left visually unfinished. Run streamer.finalize to flush the throttle
|
|
837
|
+
// timer and drop the status line, then return BEFORE the completed-answer
|
|
838
|
+
// side-effects (👍 / broadcastResponseDone / addToHistory). A stopped turn
|
|
839
|
+
// is NOT a successfully completed turn.
|
|
840
|
+
//
|
|
841
|
+
// no stop (suppress=false, _stopRequested falsy):
|
|
842
|
+
// Normal path — fall through to finalize + all side-effects.
|
|
843
|
+
if (shouldSuppressFinalSend({
|
|
844
|
+
stopRequested: session._stopRequested,
|
|
845
|
+
visibleTextAlreadySent: streamer.hasSentText,
|
|
846
|
+
})) {
|
|
847
|
+
// Branch A: stop + no visible text → suppress entirely.
|
|
848
|
+
return;
|
|
849
|
+
}
|
|
850
|
+
if (session._stopRequested && streamer.hasSentText) {
|
|
851
|
+
// Branch B: stop + visible text already sent → finalize the partial cleanly
|
|
852
|
+
// (flushes throttle timer, clears status line) but do NOT emit the
|
|
853
|
+
// completed-answer signals or commit to history.
|
|
854
|
+
await streamer.finalize(finalText);
|
|
855
|
+
return;
|
|
856
|
+
}
|
|
857
|
+
// Branch C: normal (no stop) — fall through.
|
|
801
858
|
await streamer.finalize(finalText);
|
|
802
859
|
emit("message:sent", { userId, text: finalText, platform: "telegram" });
|
|
803
860
|
// v4.5.0: tell observers the response is complete.
|
|
@@ -874,6 +931,15 @@ export async function handleMessage(ctx) {
|
|
|
874
931
|
// but if a new turn started and re-populated _qHandle via onQueryHandle we
|
|
875
932
|
// must NOT null it here — that would break Cycle-1 stop teeth for the new turn.
|
|
876
933
|
if (session._turnId === _thisTurnId) {
|
|
934
|
+
// A2 — Remove the ⛔ Stop control message as the FIRST action when the
|
|
935
|
+
// turn ends, so the stale button disappears before any post-turn work.
|
|
936
|
+
// Best-effort: if it was already deleted or the bot lacks permission, ignore.
|
|
937
|
+
if (stopMsgId !== null) {
|
|
938
|
+
try {
|
|
939
|
+
await ctx.api.deleteMessage(ctx.chat.id, stopMsgId);
|
|
940
|
+
}
|
|
941
|
+
catch { /* harmless grammy race */ }
|
|
942
|
+
}
|
|
877
943
|
session.isProcessing = false;
|
|
878
944
|
session.abortController = null;
|
|
879
945
|
// v5.2 — Close and clear the SteerChannel; reset per-turn ack flag.
|
|
@@ -887,14 +953,6 @@ export async function handleMessage(ctx) {
|
|
|
887
953
|
session._stopRequested = null; // safe: token matches → no newer turn has set this
|
|
888
954
|
session._turnId = null;
|
|
889
955
|
}
|
|
890
|
-
// v5.1 — Remove the ⛔ Stop control message (sent at processing start).
|
|
891
|
-
// Best-effort: if it was already deleted or the bot lacks permission, ignore.
|
|
892
|
-
if (stopMsgId !== null) {
|
|
893
|
-
try {
|
|
894
|
-
await ctx.api.deleteMessage(ctx.chat.id, stopMsgId);
|
|
895
|
-
}
|
|
896
|
-
catch { /* harmless grammy race */ }
|
|
897
|
-
}
|
|
898
956
|
// Check for queued messages — they'll be prepended to the next real message
|
|
899
957
|
// Queue stays in session and gets consumed on next handleMessage call
|
|
900
958
|
}
|
package/dist/i18n.js
CHANGED
|
@@ -378,6 +378,21 @@ const strings = {
|
|
|
378
378
|
es: "⛔ Detenido",
|
|
379
379
|
fr: "⛔ Arrêté",
|
|
380
380
|
},
|
|
381
|
+
"bot.cancel.alreadyDone": {
|
|
382
|
+
en: "Nothing running — that already finished.",
|
|
383
|
+
de: "Nichts läuft — das war schon fertig.",
|
|
384
|
+
es: "Nada en curso — eso ya terminó.",
|
|
385
|
+
fr: "Rien en cours — c'était déjà terminé.",
|
|
386
|
+
},
|
|
387
|
+
// Sent as a brief in-chat confirmation only when a stop GENUINELY halted
|
|
388
|
+
// running work (⛔ button / /cancel with work actually in progress). Not
|
|
389
|
+
// sent when nothing was running — that honest behavior stays unchanged.
|
|
390
|
+
"bot.cancel.confirmed": {
|
|
391
|
+
en: "⛔ Stopped — further work was halted.",
|
|
392
|
+
de: "⛔ Gestoppt — die weitere Arbeit wurde angehalten.",
|
|
393
|
+
es: "⛔ Detenido — se interrumpió el trabajo en curso.",
|
|
394
|
+
fr: "⛔ Arrêté — le travail en cours a été interrompu.",
|
|
395
|
+
},
|
|
381
396
|
// /model
|
|
382
397
|
"bot.model.chooseHeader": {
|
|
383
398
|
en: "🤖 *Choose model:*",
|
package/dist/index.js
CHANGED
|
@@ -187,7 +187,7 @@ import { loadSkills } from "./services/skills.js";
|
|
|
187
187
|
import { loadHooks } from "./services/hooks.js";
|
|
188
188
|
import { registerShutdownHandler } from "./services/restart.js";
|
|
189
189
|
import { cancelAllSubAgents } from "./services/subagents.js";
|
|
190
|
-
import { startWatchdog, stopWatchdog, checkCrashLoopBrake } from "./services/watchdog.js";
|
|
190
|
+
import { startWatchdog, stopWatchdog, checkCrashLoopBrake, markExpectedRestart } from "./services/watchdog.js";
|
|
191
191
|
import { getRegistry } from "./engine.js";
|
|
192
192
|
import { scanAssets } from "./services/asset-index.js";
|
|
193
193
|
// Scan asset directory and generate INDEX.json + INDEX.md
|
|
@@ -383,6 +383,12 @@ const shutdown = async () => {
|
|
|
383
383
|
return;
|
|
384
384
|
isShuttingDown = true;
|
|
385
385
|
console.log("Graceful shutdown initiated...");
|
|
386
|
+
// Mark the imminent exit as an intentional restart so the next boot's
|
|
387
|
+
// decideBrakeAction does not count it as a crash. This covers launchctl
|
|
388
|
+
// unload/load (SIGTERM from launchd) in addition to /restart and /update
|
|
389
|
+
// which call markExpectedRestart() themselves before process.exit(0).
|
|
390
|
+
// Must run before stopWatchdog() (which just clears timers, not the beacon).
|
|
391
|
+
markExpectedRestart();
|
|
386
392
|
// E2: shutdown-notification — await the async cancellation so running
|
|
387
393
|
// agents can post a cancellation message to Telegram before the bot
|
|
388
394
|
// stops. Capped at 5s internally so a hang can't block shutdown.
|
|
package/dist/providers/claude-sdk-provider.js
CHANGED
|
@@ -446,9 +446,23 @@ export class ClaudeSDKProvider {
|
|
|
446
446
|
sessionResetRequested: true,
|
|
447
447
|
};
|
|
448
448
|
}
|
|
449
|
+
// V56-T1 — Surface the SDK's authoritative final answer
|
|
450
|
+
// separately from the accumulated narration. SDKResultSuccess
|
|
451
|
+
// carries a single `result: string` that is the agent's actual
|
|
452
|
+
// outcome (NOT the concatenation of every assistant turn).
|
|
453
|
+
// SDKResultError has no `result` field — leave finalResult
|
|
454
|
+
// undefined there so consumers fall back to buffered text.
|
|
455
|
+
// This is the same source the detached-dispatch path already
|
|
456
|
+
// prefers (`{"type":"result"}.result` in async-agent-parser).
|
|
457
|
+
const finalResult = "subtype" in resultMsg &&
|
|
458
|
+
resultMsg.subtype === "success" &&
|
|
459
|
+
typeof resultMsg.result === "string"
|
|
460
|
+
? resultMsg.result
|
|
461
|
+
: undefined;
|
|
449
462
|
yield {
|
|
450
463
|
type: "done",
|
|
451
464
|
text: accumulatedText || "",
|
|
465
|
+
...(finalResult !== undefined ? { finalResult } : {}),
|
|
452
466
|
sessionId: resultMsg.session_id || capturedSessionId,
|
|
453
467
|
costUsd: "total_cost_usd" in resultMsg ? resultMsg.total_cost_usd : 0,
|
|
454
468
|
inputTokens: inputTok,
|
|
package/dist/services/async-agent-watcher.js
CHANGED
|
@@ -27,6 +27,25 @@ import { dirname } from "path";
|
|
|
27
27
|
import { parseOutputFileStatus } from "./async-agent-parser.js";
|
|
28
28
|
import { ASYNC_AGENTS_STATE_FILE } from "../paths.js";
|
|
29
29
|
import { getAllSessions } from "./session.js";
|
|
30
|
+
/**
 * B3 — Detect a permanent "target chat does not exist" delivery failure
 * (Telegram 400 "Bad Request: chat not found"), e.g. the stale chat_id:1
 * test agent. Such an agent must be abandoned, not retried forever.
 *
 * Kept as a local predicate (mirrors isChatNotFoundError in
 * subagent-delivery.ts) so the watcher does NOT take a new hard
 * dependency on a fresh subagent-delivery export — many test suites mock
 * that module with only deliverSubAgentResult, and a destructured import
 * of a non-mocked symbol would throw. Matched narrowly on the
 * chat-not-found signature only.
 *
 * @param {unknown} err - Value caught from a delivery attempt.
 * @returns {boolean} `true` when `err` is an object whose `message` or
 *   `description` contains "chat not found" (case-insensitive); `false`
 *   for non-objects, null/undefined, and every other error.
 */
function isChatNotFoundError(err) {
    if (!err || typeof err !== "object")
        return false;
    // Both fields are searched in one pass; a missing field contributes "".
    const haystack = `${err.message ?? ""} ${err.description ?? ""}`;
    return /chat not found/i.test(haystack);
}
|
|
30
49
|
/** How often the polling loop runs against each pending agent. */
|
|
31
50
|
const POLL_INTERVAL_MS = 15_000;
|
|
32
51
|
/** Hard ceiling per agent — 12h. After this, give up and deliver
|
|
@@ -199,22 +218,38 @@ export async function pollOnce() {
|
|
|
199
218
|
const now = Date.now();
|
|
200
219
|
const toRemove = [];
|
|
201
220
|
const missingFileFailureMs = getMissingFileFailureMs();
|
|
221
|
+
// B3 — when a delivery attempt proves the target chat is permanently
|
|
222
|
+
// invalid ("chat not found", e.g. the stale chat_id:1 test agent),
|
|
223
|
+
// abandon the agent so the watcher never retries it. Without this, a
|
|
224
|
+
// pending agent with an invalid target spams stderr on every poll
|
|
225
|
+
// cycle (inflating errors_24h) and lingers until the 12h giveUpAt.
|
|
226
|
+
const abandonIfInvalidTarget = (entry, outcome) => {
|
|
227
|
+
if (!outcome.chatNotFound)
|
|
228
|
+
return;
|
|
229
|
+
if (!toRemove.includes(entry.agentId))
|
|
230
|
+
toRemove.push(entry.agentId);
|
|
231
|
+
console.warn(`[async-watcher] abandoning agent ${entry.agentId} — delivery target ` +
|
|
232
|
+
`chat ${String(entry.chatId)} not found (invalid/stale); will not retry`);
|
|
233
|
+
};
|
|
202
234
|
for (const entry of pending.values()) {
|
|
203
235
|
entry.lastCheckedAt = now;
|
|
204
236
|
// Timeout check first — if the agent is past its giveUpAt, give up
|
|
205
237
|
// regardless of whether the file shows progress.
|
|
206
238
|
if (now >= entry.giveUpAt) {
|
|
207
|
-
await deliverAsFailure(entry, "timeout", "Agent ran longer than 12h — giving up");
|
|
239
|
+
const outcome = await deliverAsFailure(entry, "timeout", "Agent ran longer than 12h — giving up");
|
|
240
|
+
abandonIfInvalidTarget(entry, outcome);
|
|
208
241
|
toRemove.push(entry.agentId);
|
|
209
242
|
continue;
|
|
210
243
|
}
|
|
211
244
|
const status = await parseOutputFileStatus(entry.outputFile);
|
|
212
245
|
if (status.state === "completed") {
|
|
213
|
-
await deliverAsCompleted(entry, status.output, status.tokensUsed);
|
|
246
|
+
const outcome = await deliverAsCompleted(entry, status.output, status.tokensUsed);
|
|
247
|
+
abandonIfInvalidTarget(entry, outcome);
|
|
214
248
|
toRemove.push(entry.agentId);
|
|
215
249
|
}
|
|
216
250
|
else if (status.state === "failed") {
|
|
217
|
-
await deliverAsFailure(entry, "error", status.error);
|
|
251
|
+
const outcome = await deliverAsFailure(entry, "error", status.error);
|
|
252
|
+
abandonIfInvalidTarget(entry, outcome);
|
|
218
253
|
toRemove.push(entry.agentId);
|
|
219
254
|
}
|
|
220
255
|
else if (status.state === "missing" &&
|
|
@@ -222,7 +257,8 @@ export async function pollOnce() {
|
|
|
222
257
|
// v4.14.2 — Zombie guard: the subprocess never created its
|
|
223
258
|
// output file within `missingFileFailureMs` (default 10 min).
|
|
224
259
|
// Declare failed instead of polling until the 12h giveUpAt.
|
|
225
|
-
await deliverAsFailure(entry, "error", `Dispatched subprocess never wrote its output file (${Math.round((now - entry.startedAt) / 60_000)}m after start). Likely crashed before initializing, or the file was removed externally.`);
|
|
260
|
+
const outcome = await deliverAsFailure(entry, "error", `Dispatched subprocess never wrote its output file (${Math.round((now - entry.startedAt) / 60_000)}m after start). Likely crashed before initializing, or the file was removed externally.`);
|
|
261
|
+
abandonIfInvalidTarget(entry, outcome);
|
|
226
262
|
toRemove.push(entry.agentId);
|
|
227
263
|
}
|
|
228
264
|
// running / missing-but-young → keep polling next cycle
|
|
@@ -254,13 +290,20 @@ async function deliverAsCompleted(entry, output, tokensUsed) {
|
|
|
254
290
|
tokensUsed: tokensUsed ?? { input: 0, output: 0 },
|
|
255
291
|
duration: Date.now() - entry.startedAt,
|
|
256
292
|
};
|
|
293
|
+
let chatNotFound = false;
|
|
257
294
|
try {
|
|
258
|
-
await deliverSubAgentResult(info, result);
|
|
295
|
+
const outcome = await deliverSubAgentResult(info, result);
|
|
296
|
+
chatNotFound = !!outcome?.chatNotFound;
|
|
259
297
|
}
|
|
260
298
|
catch (err) {
|
|
261
299
|
console.error(`[async-watcher] delivery failed for ${entry.agentId}:`, err);
|
|
300
|
+
// deliverSubAgentResult normally swallows send errors and reports
|
|
301
|
+
// chatNotFound via its return value; if it ever throws, still detect
|
|
302
|
+
// the permanent invalid-target case here.
|
|
303
|
+
chatNotFound = isChatNotFoundError(err);
|
|
262
304
|
}
|
|
263
305
|
decrementPendingCount(entry.sessionKey);
|
|
306
|
+
return { chatNotFound };
|
|
264
307
|
}
|
|
265
308
|
async function deliverAsFailure(entry, status, error) {
|
|
266
309
|
const { deliverSubAgentResult } = await import("./subagent-delivery.js");
|
|
@@ -283,13 +326,17 @@ async function deliverAsFailure(entry, status, error) {
|
|
|
283
326
|
duration: Date.now() - entry.startedAt,
|
|
284
327
|
error,
|
|
285
328
|
};
|
|
329
|
+
let chatNotFound = false;
|
|
286
330
|
try {
|
|
287
|
-
await deliverSubAgentResult(info, result);
|
|
331
|
+
const outcome = await deliverSubAgentResult(info, result);
|
|
332
|
+
chatNotFound = !!outcome?.chatNotFound;
|
|
288
333
|
}
|
|
289
334
|
catch (err) {
|
|
290
335
|
console.error(`[async-watcher] failure delivery failed for ${entry.agentId}:`, err);
|
|
336
|
+
chatNotFound = isChatNotFoundError(err);
|
|
291
337
|
}
|
|
292
338
|
decrementPendingCount(entry.sessionKey);
|
|
339
|
+
return { chatNotFound };
|
|
293
340
|
}
|
|
294
341
|
// ── Test helpers ──────────────────────────────────────────────────
|
|
295
342
|
/**
|
|
package/dist/services/subagent-delivery.js
CHANGED
|
@@ -24,6 +24,22 @@ function isTelegramParseError(err) {
|
|
|
24
24
|
const haystack = `${e.message ?? ""} ${e.description ?? ""}`;
|
|
25
25
|
return /can't parse entities|can't find end of the entity/i.test(haystack);
|
|
26
26
|
}
|
|
27
|
+
/**
 * B3 — A Telegram send rejected because the TARGET CHAT DOES NOT EXIST
 * (HTTP 400 "Bad Request: chat not found"). This is a permanent,
 * non-recoverable condition: the chat id is invalid (e.g. the stale
 * chat_id:1 test agent), so every retry will fail identically and just
 * spam stderr. Distinct from transient failures (network, rate-limit)
 * which ARE worth retrying. Matched narrowly on the chat-not-found
 * signature only — never on generic Bad Request.
 *
 * @param {unknown} err - Value caught from a send attempt.
 * @returns {boolean} `true` when `err` is an object whose `message` or
 *   `description` contains "chat not found" (case-insensitive); `false`
 *   for non-objects, null/undefined, and every other error.
 */
export function isChatNotFoundError(err) {
    if (!err || typeof err !== "object")
        return false;
    // Both fields are searched in one pass; a missing field contributes "".
    const haystack = `${err.message ?? ""} ${err.description ?? ""}`;
    return /chat not found/i.test(haystack);
}
|
|
27
43
|
/**
|
|
28
44
|
* Send a Markdown message with an automatic plain-text retry on parse
|
|
29
45
|
* errors. Any other error propagates to the caller's outer catch.
|
|
@@ -40,7 +56,52 @@ async function sendWithMarkdownFallback(api, chatId, text) {
|
|
|
40
56
|
}
|
|
41
57
|
}
|
|
42
58
|
const MAX_TG_CHUNK = 3800; // below Telegram's 4096 limit with headroom
|
|
43
|
-
|
|
59
|
+
// V56-T2 honesty fix — the .md file attachment is no longer gated on a
|
|
60
|
+
// separate 20k threshold. It now triggers whenever the cap actually
|
|
61
|
+
// truncates (isTruncated → body.length > BODY_CAP), so every truncated
|
|
62
|
+
// delivery carries the full output as a file and the marker is honest.
|
|
63
|
+
// (The prior 20k-only behavior is fully subsumed by isTruncated.)
|
|
64
|
+
/**
|
|
65
|
+
* V56-T2 (Layer-2) — honest hard cap on the INLINE delivered body.
|
|
66
|
+
*
|
|
67
|
+
* V56-T1 made delivery carry the SDK final result instead of the whole
|
|
68
|
+
* transcript, but a final result can itself occasionally be very long.
|
|
69
|
+
* This bounds the inline-message body so a single agent answer can't
|
|
70
|
+
* flood the chat, while staying HONEST.
|
|
71
|
+
*
|
|
72
|
+
* Honesty contract (fixed after a review found a self-defeating
|
|
73
|
+
* regression): whenever `capBody` actually truncates — i.e. the body is
|
|
74
|
+
* non-empty AND longer than BODY_CAP — the delivery ALSO attaches the
|
|
75
|
+
* COMPLETE uncapped output as a `.md` file via the same upload
|
|
76
|
+
* mechanism the old >20000-char path already used. The marker
|
|
77
|
+
* therefore truthfully says the full output is *attached*, instead of
|
|
78
|
+
* the previous wording that pointed at a `~/.alvin-bot/logs/` file the
|
|
79
|
+
* cap path never actually wrote. Net effect: any truncated delivery =
|
|
80
|
+
* bounded inline message + full `.md` attachment; no lossy inline-only
|
|
81
|
+
* range remains. The old >20000 path is unchanged (it already attached
|
|
82
|
+
* the full body); this just extends "attach the full file" down to
|
|
83
|
+
* "whenever the cap truncated".
|
|
84
|
+
*
|
|
85
|
+
* This is a pure bounded slice + a fixed marker — NOT a structure-
|
|
86
|
+
* guessing heuristic. It no-ops on empty/whitespace so the
|
|
87
|
+
* `(empty output)` truncated-run signal keeps working (and no spurious
|
|
88
|
+
* file is attached for it).
|
|
89
|
+
*/
|
|
90
|
+
const BODY_CAP = 1800;
const TRUNCATION_MARKER = "…(truncated for chat — full output attached)";
/**
 * True when `capBody` would actually truncate this body — the single
 * source of truth for "did we drop content, so the full output must be
 * attached as a file".
 *
 * @param {string} body - Full delivery body.
 * @returns {boolean} `true` when `body.length` (UTF-16 code units) exceeds
 *   BODY_CAP.
 */
function isTruncated(body) {
    return body.length > BODY_CAP;
}
/**
 * Bound `body` for inline delivery. Bodies of at most BODY_CAP UTF-16 code
 * units (including the empty string) are returned unchanged; longer bodies
 * are cut and followed by a blank line and TRUNCATION_MARKER.
 *
 * @param {string} body - Full delivery body.
 * @returns {string} `body` itself, or its bounded prefix plus the marker.
 */
function capBody(body) {
    // Reuse isTruncated so the "was it cut?" test cannot drift from the cut.
    if (!isTruncated(body))
        return body;
    // If the cap lands between the two halves of a surrogate pair (e.g. an
    // emoji straddling the boundary), back off one code unit so the prefix
    // never ends in a lone high surrogate.
    const lastKept = body.charCodeAt(BODY_CAP - 1);
    const firstDropped = body.charCodeAt(BODY_CAP);
    const splitsPair = lastKept >= 0xd800 && lastKept <= 0xdbff &&
        firstDropped >= 0xdc00 && firstDropped <= 0xdfff;
    const end = splitsPair ? BODY_CAP - 1 : BODY_CAP;
    return `${body.slice(0, end)}\n\n${TRUNCATION_MARKER}`;
}
|
|
44
105
|
let injectedApi = null;
|
|
45
106
|
let runtimeApi = null;
|
|
46
107
|
/** Test-only hook for injecting a fake bot API. Production code must NEVER call this. */
|
|
@@ -251,28 +312,29 @@ export function createLiveStream(chatId, agentName) {
|
|
|
251
312
|
* - "slack" / "discord" / "whatsapp" → delivery-registry lookup
|
|
252
313
|
*/
|
|
253
314
|
export async function deliverSubAgentResult(info, result, opts = {}) {
|
|
315
|
+
const OK = { chatNotFound: false };
|
|
254
316
|
// Implicit spawns: the Task-tool bridge in the main stream has already
|
|
255
317
|
// surfaced the output; extra delivery would be duplication.
|
|
256
318
|
if (info.source === "implicit")
|
|
257
|
-
return;
|
|
319
|
+
return OK;
|
|
258
320
|
const effective = opts.visibility ?? getVisibility();
|
|
259
321
|
if (effective === "silent")
|
|
260
|
-
return;
|
|
322
|
+
return OK;
|
|
261
323
|
if (!info.parentChatId) {
|
|
262
324
|
console.warn(`[subagent-delivery] missing parentChatId for ${info.name} (source=${info.source})`);
|
|
263
|
-
return;
|
|
325
|
+
return OK;
|
|
264
326
|
}
|
|
265
327
|
// v4.14 — Platform routing. Telegram is the default path (unchanged).
|
|
266
328
|
const platform = info.platform ?? "telegram";
|
|
267
329
|
if (platform !== "telegram") {
|
|
268
330
|
await deliverViaRegistry(platform, info, result);
|
|
269
|
-
return;
|
|
331
|
+
return OK;
|
|
270
332
|
}
|
|
271
333
|
// ── Telegram path (v4.12.x behavior, unchanged) ──────────────────
|
|
272
334
|
const api = getBotApi();
|
|
273
335
|
if (!api) {
|
|
274
336
|
console.warn(`[subagent-delivery] no bot api available for ${info.name}`);
|
|
275
|
-
return;
|
|
337
|
+
return OK;
|
|
276
338
|
}
|
|
277
339
|
// Telegram's chatId is always a number at runtime; defensive cast.
|
|
278
340
|
const tgChatId = typeof info.parentChatId === "number"
|
|
@@ -280,40 +342,70 @@ export async function deliverSubAgentResult(info, result, opts = {}) {
|
|
|
280
342
|
: Number(info.parentChatId);
|
|
281
343
|
if (!Number.isFinite(tgChatId)) {
|
|
282
344
|
console.warn(`[subagent-delivery] invalid telegram chatId for ${info.name}`);
|
|
283
|
-
return;
|
|
345
|
+
return OK;
|
|
284
346
|
}
|
|
285
347
|
const banner = buildBanner(info, result);
|
|
286
348
|
const body = result.output?.trim() || `(empty output)`;
|
|
349
|
+
// V56-T2 — bounded variant for the INLINE message path. Whenever this
|
|
350
|
+
// actually truncates (isTruncated), the FULL uncapped `body` is also
|
|
351
|
+
// attached as a .md file below, so the cap never costs the user
|
|
352
|
+
// access to the complete result and the marker stays truthful.
|
|
353
|
+
const inlineBody = capBody(body);
|
|
287
354
|
try {
|
|
288
|
-
//
|
|
289
|
-
|
|
355
|
+
// Truncated → honest delivery: short banner + bounded inline body
|
|
356
|
+
// (with the truthful "full output attached" marker) + the COMPLETE
|
|
357
|
+
// uncapped body as a .md file. This single branch covers the whole
|
|
358
|
+
// truncated range (mid-size AND the old > 20000-char range): there
|
|
359
|
+
// is no lossy inline-only range anymore. (The old >20000 behavior
|
|
360
|
+
// is unchanged — it already attached the full body; the change is
|
|
361
|
+
// that mid-size now also attaches it and the marker no longer
|
|
362
|
+
// points at a logs file that was never written.)
|
|
363
|
+
if (isTruncated(body)) {
|
|
290
364
|
await sendWithMarkdownFallback(api, tgChatId, banner);
|
|
365
|
+
// The bounded inline body fits in one message (BODY_CAP=1800 plus
|
|
366
|
+
// the short marker is well under MAX_TG_CHUNK); send it as plain
|
|
367
|
+
// text so an unbalanced markdown slice can't crash the send.
|
|
368
|
+
await api.sendMessage(tgChatId, inlineBody.slice(0, MAX_TG_CHUNK));
|
|
291
369
|
try {
|
|
292
370
|
const { InputFile } = await import("grammy");
|
|
293
371
|
const buf = Buffer.from(body, "utf-8");
|
|
294
372
|
await api.sendDocument(tgChatId, new InputFile(buf, `${info.name}.md`));
|
|
295
373
|
}
|
|
296
374
|
catch (err) {
|
|
375
|
+
// Upload failed → the bounded inline body was already delivered
|
|
376
|
+
// above, so the user still has something honest (banner + capped
|
|
377
|
+
// text + marker). The marker slightly over-promises here (file
|
|
378
|
+
// didn't attach) but this is the rare failure path, not the
|
|
379
|
+
// normal one, and there is no silent data loss.
|
|
297
380
|
console.error(`[subagent-delivery] file upload failed:`, err);
|
|
298
|
-
await api.sendMessage(tgChatId, body.slice(0, MAX_TG_CHUNK));
|
|
299
381
|
}
|
|
300
|
-
return;
|
|
382
|
+
return OK;
|
|
301
383
|
}
|
|
302
|
-
//
|
|
303
|
-
|
|
304
|
-
|
|
305
|
-
|
|
384
|
+
// Not truncated (body ≤ BODY_CAP) → unchanged passthrough.
|
|
385
|
+
// inlineBody === body here (capBody is a no-op), no marker, no file.
|
|
386
|
+
// Case A: fits in a single message → banner + body joined
|
|
387
|
+
if (inlineBody.length + banner.length + 2 <= MAX_TG_CHUNK) {
|
|
388
|
+
await sendWithMarkdownFallback(api, tgChatId, `${banner}\n\n${inlineBody}`);
|
|
389
|
+
return OK;
|
|
306
390
|
}
|
|
307
|
-
// Case
|
|
391
|
+
// Case B: defensive — a ≤1800-char body normally fits within MAX_TG_CHUNK
|
|
392
|
+
// with the banner, but keep the banner-then-chunk fallback for
|
|
393
|
+
// safety against an unusually long banner.
|
|
308
394
|
await sendWithMarkdownFallback(api, tgChatId, banner);
|
|
309
|
-
for (let i = 0; i <
|
|
395
|
+
for (let i = 0; i < inlineBody.length; i += MAX_TG_CHUNK) {
|
|
310
396
|
// Body chunks are always sent as plain text — markdown across
|
|
311
397
|
// arbitrary chunk boundaries would be inconsistent anyway.
|
|
312
|
-
await api.sendMessage(tgChatId,
|
|
398
|
+
await api.sendMessage(tgChatId, inlineBody.slice(i, i + MAX_TG_CHUNK));
|
|
313
399
|
}
|
|
400
|
+
return OK;
|
|
314
401
|
}
|
|
315
402
|
catch (err) {
|
|
316
403
|
console.error(`[subagent-delivery] send failed for ${info.name}:`, err);
|
|
404
|
+
// B3 — report a permanent invalid-target failure so the watcher can
|
|
405
|
+
// abandon this agent instead of retrying it forever. Any other error
|
|
406
|
+
// (network, rate-limit, parse) is NOT reported as chatNotFound, so the
|
|
407
|
+
// agent's normal retry/timeout lifecycle is unchanged.
|
|
408
|
+
return { chatNotFound: isChatNotFoundError(err) };
|
|
317
409
|
}
|
|
318
410
|
}
|
|
319
411
|
/**
|
|
@@ -336,36 +428,45 @@ async function deliverViaRegistry(platform, info, result) {
|
|
|
336
428
|
const chatId = info.parentChatId;
|
|
337
429
|
const banner = buildBannerPlain(info, result);
|
|
338
430
|
const body = result.output?.trim() || `(empty output)`;
|
|
431
|
+
// V56-T2 — same honest contract as the Telegram path. Whenever the
|
|
432
|
+
// cap truncates, the FULL uncapped `body` is attached as a .md file
|
|
433
|
+
// (if the adapter supports uploads) so the marker stays truthful and
|
|
434
|
+
// the complete output remains accessible.
|
|
435
|
+
const inlineBody = capBody(body);
|
|
339
436
|
const NON_TG_CHUNK = 3800;
|
|
340
|
-
const FILE_THRESHOLD = 20_000;
|
|
341
437
|
try {
|
|
342
|
-
//
|
|
343
|
-
|
|
438
|
+
// Truncated → honest delivery: banner + bounded inline body (with
|
|
439
|
+
// the truthful "full output attached" marker) + the COMPLETE
|
|
440
|
+
// uncapped body as a .md file. Covers the whole truncated range
|
|
441
|
+
// (mid-size AND > the old 20k threshold) — no lossy inline-only
|
|
442
|
+
// range remains. If the adapter has no sendDocument or the upload
|
|
443
|
+
// fails, the bounded inline body still went out (honest, just no
|
|
444
|
+
// file) — no silent data loss.
|
|
445
|
+
if (isTruncated(body)) {
|
|
344
446
|
await adapter.sendText(chatId, banner);
|
|
447
|
+
for (let i = 0; i < inlineBody.length; i += NON_TG_CHUNK) {
|
|
448
|
+
await adapter.sendText(chatId, inlineBody.slice(i, i + NON_TG_CHUNK));
|
|
449
|
+
}
|
|
345
450
|
if (adapter.sendDocument) {
|
|
346
451
|
try {
|
|
347
452
|
await adapter.sendDocument(chatId, Buffer.from(body, "utf-8"), `${info.name}.md`);
|
|
348
|
-
return;
|
|
349
453
|
}
|
|
350
454
|
catch (err) {
|
|
351
455
|
console.error(`[subagent-delivery] ${platform} file upload failed:`, err);
|
|
352
456
|
}
|
|
353
457
|
}
|
|
354
|
-
// Fallback: chunked text if no file upload or upload failed
|
|
355
|
-
for (let i = 0; i < body.length; i += NON_TG_CHUNK) {
|
|
356
|
-
await adapter.sendText(chatId, body.slice(i, i + NON_TG_CHUNK));
|
|
357
|
-
}
|
|
358
458
|
return;
|
|
359
459
|
}
|
|
360
|
-
//
|
|
361
|
-
|
|
362
|
-
|
|
460
|
+
// Not truncated (body ≤ BODY_CAP) → unchanged passthrough.
|
|
461
|
+
// inlineBody === body here, no marker, no file.
|
|
462
|
+
if (inlineBody.length + banner.length + 2 <= NON_TG_CHUNK) {
|
|
463
|
+
await adapter.sendText(chatId, `${banner}\n\n${inlineBody}`);
|
|
363
464
|
return;
|
|
364
465
|
}
|
|
365
|
-
//
|
|
466
|
+
// Defensive banner-then-chunk fallback (e.g. unusually long banner).
|
|
366
467
|
await adapter.sendText(chatId, banner);
|
|
367
|
-
for (let i = 0; i <
|
|
368
|
-
await adapter.sendText(chatId,
|
|
468
|
+
for (let i = 0; i < inlineBody.length; i += NON_TG_CHUNK) {
|
|
469
|
+
await adapter.sendText(chatId, inlineBody.slice(i, i + NON_TG_CHUNK));
|
|
369
470
|
}
|
|
370
471
|
}
|
|
371
472
|
catch (err) {
|
|
@@ -288,7 +288,9 @@ async function runSubAgent(id, agentConfig, abort, resolvedName) {
|
|
|
288
288
|
: os.homedir();
|
|
289
289
|
const systemPrompt = `You are a sub-agent named "${resolvedName}". Complete the following task autonomously. Working directory: ${effectiveCwd}
|
|
290
290
|
|
|
291
|
-
|
|
291
|
+
Do NOT send your own Telegram/chat/notification messages as a step, and do NOT use any tool or skill to message the user or post your progress — your final return value is the SOLE delivery path and the orchestrator delivers it for you. A self-sent message causes a duplicate the user sees twice.
|
|
292
|
+
|
|
293
|
+
When done, return ONLY the final result/outcome itself, concisely — nothing else. Do NOT narrate, summarize, or recap your intermediate steps, your reasoning, your tool calls, your plan, or a play-by-play of what you did. The orchestrator needs ONLY the outcome (the answer, the report, the list, the artifact path); on failure, return the error plus exactly what was and wasn't done. No preamble, no meta-commentary, no "Here's what I did", no "I will now…", no step-by-step recap. Run status, duration and token usage are reported separately, so don't restate them.`;
|
|
292
294
|
// v4.12.2 — Map the toolset preset to an explicit allowedTools list.
|
|
293
295
|
// The provider honors this override (see src/providers/claude-sdk-provider.ts
|
|
294
296
|
// line ~140). Passing undefined = full access (provider default).
|
|
@@ -326,10 +328,22 @@ When done, return ONLY the final result/outcome, concisely. Do NOT narrate your
|
|
|
326
328
|
}
|
|
327
329
|
}
|
|
328
330
|
if (chunk.type === "done") {
|
|
329
|
-
//
|
|
330
|
-
//
|
|
331
|
-
//
|
|
332
|
-
|
|
331
|
+
// V56-T1 — Prefer the SDK's authoritative FINAL result over the
|
|
332
|
+
// accumulated narration. The Claude Agent SDK emits a terminal
|
|
333
|
+
// `result` message whose single `result` field IS the agent's
|
|
334
|
+
// actual outcome; the provider surfaces it as `chunk.finalResult`.
|
|
335
|
+
// Using it here excludes the step-by-step narration BY
|
|
336
|
+
// CONSTRUCTION (it's a distinct SDK field, not a heuristic over
|
|
337
|
+
// concatenated text), matching what the detached-dispatch path
|
|
338
|
+
// already does. When the provider has no distinct final-result
|
|
339
|
+
// message (non-SDK providers, SDK error results), finalResult is
|
|
340
|
+
// undefined and we fall back to done.text — the previous
|
|
341
|
+
// authoritative-accumulated-text behaviour, so streamed-text
|
|
342
|
+
// consumers and the Fix #5 contract are unaffected.
|
|
343
|
+
if (typeof chunk.finalResult === "string" && chunk.finalResult.length > 0) {
|
|
344
|
+
finalText = chunk.finalResult;
|
|
345
|
+
}
|
|
346
|
+
else if (chunk.text && chunk.text.length > 0) {
|
|
333
347
|
finalText = chunk.text;
|
|
334
348
|
}
|
|
335
349
|
inputTokens = chunk.inputTokens || 0;
|
|
@@ -17,6 +17,15 @@ export class TelegramStreamer {
|
|
|
17
17
|
this.api = api;
|
|
18
18
|
this.replyTo = replyToMessageId;
|
|
19
19
|
}
|
|
20
|
+
/**
|
|
21
|
+
* True when at least one message has been sent to the user (i.e. messageId
|
|
22
|
+
* is set). Used by the A3 suppress-undelivered guard in message.ts to
|
|
23
|
+
* determine whether visible text has already reached the user — if so, the
|
|
24
|
+
* no-retract invariant prevents suppressing the final send.
|
|
25
|
+
*/
|
|
26
|
+
get hasSentText() {
|
|
27
|
+
return this.messageId !== null;
|
|
28
|
+
}
|
|
20
29
|
/**
|
|
21
30
|
* Set a transient status line (e.g. "📖 Read file.html…") that gets
|
|
22
31
|
* appended to the current accumulated text. Passing null clears it.
|
package/dist/services/trends.js
CHANGED
|
@@ -33,12 +33,81 @@
|
|
|
33
33
|
* ALVIN_TRENDS_INTERVAL_HOURS=24 → snapshot cadence
|
|
34
34
|
* ALVIN_TRENDS_AI_AFTER_DAYS=7 → days of data before AI analysis kicks in
|
|
35
35
|
*/
|
|
36
|
-
import { appendFileSync, existsSync, readFileSync, mkdirSync } from "fs";
|
|
36
|
+
import { appendFileSync, existsSync, readFileSync, writeFileSync, mkdirSync } from "fs";
|
|
37
37
|
import { join, dirname } from "path";
|
|
38
38
|
import { homedir } from "os";
|
|
39
39
|
import { BOT_VERSION } from "../version.js";
|
|
40
40
|
import { emitCritical } from "./critical-notify.js";
|
|
41
41
|
const TRENDS_PATH = join(homedir(), ".alvin-bot", "state", "trends.jsonl");
|
|
42
|
+
/**
|
|
43
|
+
* B2 — peak-uptime high-water mark. The trends collector takes its FIRST
|
|
44
|
+
* snapshot ~60s after every boot (startTrendsCollector schedules it at
|
|
45
|
+
* 60_000ms). takeSnapshot() records uptime_s = process.uptime(), so the
|
|
46
|
+
* first post-restart sample is structurally ≈ 62s. With deliberate
|
|
47
|
+
* restarts (/update, launchctl reload) those ~62s samples dominate
|
|
48
|
+
* trends.jsonl, so the 30-day AI pass perpetually concludes "restart
|
|
49
|
+
* loop, never lives past ~62s" even when the process has actually been
|
|
50
|
+
* continuously up for hours by the time the daily snapshot fires.
|
|
51
|
+
*
|
|
52
|
+
* Fix: persist the MAXIMUM real uptime this bot has ever observed (across
|
|
53
|
+
* process generations) and record it on every snapshot as uptime_peak_s.
|
|
54
|
+
* The peak only ever derives from process.uptime() — it is never
|
|
55
|
+
* fabricated or extrapolated. The anomaly evaluation then keys on the
|
|
56
|
+
* peak (hasRepresentativeUptime), so a process that genuinely lived for
|
|
57
|
+
* hours is not flagged as a ~62s loop, while a genuine fast-restart loop
|
|
58
|
+
* (peak never climbs past the startup transient) still fires.
|
|
59
|
+
*
|
|
60
|
+
* Stored next to trends.jsonl (state/), honoring ALVIN_DATA_DIR so tests
|
|
61
|
+
* and non-default installs work. Survives restarts by design — that is
|
|
62
|
+
* the whole point of a high-water mark.
|
|
63
|
+
*/
|
|
64
|
+
function trendsStateDir() {
|
|
65
|
+
const base = process.env.ALVIN_DATA_DIR || join(homedir(), ".alvin-bot");
|
|
66
|
+
return join(base, "state");
|
|
67
|
+
}
|
|
68
|
+
function uptimePeakPath() {
|
|
69
|
+
return join(trendsStateDir(), "uptime-peak.json");
|
|
70
|
+
}
|
|
71
|
+
/**
|
|
72
|
+
* The startup transient: takeSnapshot's first sample is taken ~60s after
|
|
73
|
+
* boot, so any uptime at/under this is indistinguishable from "just
|
|
74
|
+
* restarted". An uptime ABOVE this proves the process actually lived past
|
|
75
|
+
* the post-restart sampling window. 600s (10 min) is comfortably above
|
|
76
|
+
* the 60s first-sample delay + scheduling jitter and far below the 24h
|
|
77
|
+
* cron cadence, so a healthy bot trivially clears it while a real
|
|
78
|
+
* crash-loop (exits within seconds/a couple minutes) never does.
|
|
79
|
+
*/
|
|
80
|
+
export const STARTUP_TRANSIENT_S = 600;
|
|
81
|
+
/**
|
|
82
|
+
* Read the persisted peak uptime, fold in the CURRENT real uptime, persist
|
|
83
|
+
* the (possibly larger) high-water mark, and return it. Pure w.r.t. time
|
|
84
|
+
* sources: the only uptime input is process.uptime() — nothing invented.
|
|
85
|
+
* Disk failures degrade gracefully to the current real uptime.
|
|
86
|
+
*/
|
|
87
|
+
function bumpAndReadUptimePeak() {
|
|
88
|
+
const currentReal = Math.round(process.uptime());
|
|
89
|
+
let stored = 0;
|
|
90
|
+
try {
|
|
91
|
+
const raw = readFileSync(uptimePeakPath(), "utf-8");
|
|
92
|
+
const parsed = JSON.parse(raw);
|
|
93
|
+
if (typeof parsed.peak_s === "number" && Number.isFinite(parsed.peak_s) && parsed.peak_s > 0) {
|
|
94
|
+
stored = parsed.peak_s;
|
|
95
|
+
}
|
|
96
|
+
}
|
|
97
|
+
catch {
|
|
98
|
+
// No file yet / unreadable — start the high-water mark from the
|
|
99
|
+
// current real uptime. Not an error.
|
|
100
|
+
}
|
|
101
|
+
const peak = Math.max(stored, currentReal);
|
|
102
|
+
try {
|
|
103
|
+
mkdirSync(trendsStateDir(), { recursive: true });
|
|
104
|
+
writeFileSync(uptimePeakPath(), JSON.stringify({ peak_s: peak }), "utf-8");
|
|
105
|
+
}
|
|
106
|
+
catch {
|
|
107
|
+
// Disk full / permissions — non-fatal; we still return the in-memory peak.
|
|
108
|
+
}
|
|
109
|
+
return peak;
|
|
110
|
+
}
|
|
42
111
|
const DEFAULT_INTERVAL_HOURS = 24;
|
|
43
112
|
const DEFAULT_AI_THRESHOLD_DAYS = 7;
|
|
44
113
|
const MAX_RETAIN_DAYS = 90;
|
|
@@ -54,6 +123,18 @@ const MAX_RETAIN_DAYS = 90;
|
|
|
54
123
|
* (a successful, expected fallback — not an error)
|
|
55
124
|
* - critical-notify's own delivery-outcome line, kept on stderr on
|
|
56
125
|
* purpose so it stays visible even in brake/crash context
|
|
126
|
+
* - B3: subagent-delivery's "send failed … chat not found" line for a
|
|
127
|
+
* stale/test async-agent whose delivery target chat no longer exists
|
|
128
|
+
* (e.g. the recurring chat_id:1 test agent). This is benign noise,
|
|
129
|
+
* not a real fault: the target chat is invalid, the watcher now
|
|
130
|
+
* abandons such agents (see async-agent-watcher.ts), and counting it
|
|
131
|
+
* made errors_24h creep upward indefinitely on every poll cycle.
|
|
132
|
+
* The match is DELIBERATELY narrow — it requires BOTH the
|
|
133
|
+
* `[subagent-delivery] send failed` prefix AND a `chat not found`
|
|
134
|
+
* cause on the same line. A subagent-delivery failure for ANY other
|
|
135
|
+
* reason (network, rate-limit, parse) is still counted, and a
|
|
136
|
+
* `chat not found` from ANY OTHER subsystem (a real misconfigured
|
|
137
|
+
* target) is still counted.
|
|
57
138
|
*
|
|
58
139
|
* Counting those turned this very monitor into a false-alarm generator:
|
|
59
140
|
* it flagged its OWN log lines plus every release's restart churn, so
|
|
@@ -65,7 +146,7 @@ const MAX_RETAIN_DAYS = 90;
|
|
|
65
146
|
* any, get added here in one place instead of being chased across the
|
|
66
147
|
* codebase.
|
|
67
148
|
*/
|
|
68
|
-
export const ERR_LOG_PATTERN = /^(?!.*(?:\[critical-notify\]|\[subagent-delivery\] Markdown parse failed)).+/;
|
|
149
|
+
export const ERR_LOG_PATTERN = /^(?!.*(?:\[critical-notify\]|\[subagent-delivery\] Markdown parse failed|\[subagent-delivery\] send failed.*chat not found)).+/;
|
|
69
150
|
let trendsTimer = null;
|
|
70
151
|
function isDisabled() {
|
|
71
152
|
return (process.env.ALVIN_DISABLE_TRENDS === "true" ||
|
|
@@ -134,6 +215,7 @@ function takeSnapshot(activeProvider) {
|
|
|
134
215
|
return {
|
|
135
216
|
ts: new Date().toISOString(),
|
|
136
217
|
uptime_s: Math.round(process.uptime()),
|
|
218
|
+
uptime_peak_s: bumpAndReadUptimePeak(),
|
|
137
219
|
rss_mb: Math.round(mem.rss / 1024 / 1024),
|
|
138
220
|
heap_mb: Math.round(mem.heapUsed / 1024 / 1024),
|
|
139
221
|
crashes_24h: readWatchdogCrashes24h(),
|
|
@@ -195,6 +277,139 @@ SUGGESTION: <one shell command OR observation for the operator>
|
|
|
195
277
|
--- LAST {N} DAYS OF SNAPSHOTS ---
|
|
196
278
|
{SNAPSHOTS}
|
|
197
279
|
--- END ---`;
|
|
280
|
+
/**
|
|
281
|
+
* V56 — Recent crash-evidence window.
|
|
282
|
+
*
|
|
283
|
+
* hasRealCrashEvidence keys the WARN-suppression gate on whether ANY
|
|
284
|
+
* persisted snapshot recorded a real crash. Snapshots persist for up to
|
|
285
|
+
* MAX_RETAIN_DAYS and the AI pass reads the last 30 (≈30 days at the 24h
|
|
286
|
+
* cadence). If the WHOLE 30-day history is considered, a history briefly
|
|
287
|
+
* poisoned by miscounted deliberate restarts (pre-v5.5.0 accounting bug,
|
|
288
|
+
* fixed in v5.5.0 for NEW snapshots but the bad lines persist ~30 days)
|
|
289
|
+
* keeps crash-evidence "true" — so the B2/B4 gate never suppresses and the
|
|
290
|
+
* false WARN fires for ~a month instead of self-healing.
|
|
291
|
+
*
|
|
292
|
+
* Restricting the evidence check to the most recent ~48h means: once
|
|
293
|
+
* v5.5.0's correct accounting produces clean recent snapshots
|
|
294
|
+
* (crashes_24h=0), the false WARN clears within ~a day — while a GENUINE
|
|
295
|
+
* crash loop (real crashes in the recent window) still returns true and
|
|
296
|
+
* the WARN still fires (the protective purpose is intact).
|
|
297
|
+
*
|
|
298
|
+
* 48h (not 24h) is chosen because the snapshot cadence is ~24h
|
|
299
|
+
* (DEFAULT_INTERVAL_HOURS): a 48h window reliably retains the last 1–2
|
|
300
|
+
* daily snapshots even across day-boundary jitter / a skipped cron tick,
|
|
301
|
+
* so a genuine recent crash loop is never missed, while crash evidence
|
|
302
|
+
* older than ~2 days (the poisoned history) ages out and self-heals. A
|
|
303
|
+
* timestamp window (not "last N snapshots") is used so self-healing keys
|
|
304
|
+
* on real wall-clock time and is robust to cadence changes / test-tuned
|
|
305
|
+
* ALVIN_TRENDS_INTERVAL_HOURS.
|
|
306
|
+
*/
|
|
307
|
+
export const RECENT_CRASH_WINDOW_MS = 48 * 60 * 60 * 1000;
|
|
308
|
+
/**
|
|
309
|
+
* Returns true if at least one snapshot WITHIN THE RECENT WINDOW has a
|
|
310
|
+
* non-zero crashes_24h value, meaning a REAL crash (not an
|
|
311
|
+
* expected/deliberate restart) was recorded recently.
|
|
312
|
+
*
|
|
313
|
+
* After the B1 fix, deliberate restarts (SIGTERM / launchctl reload /
|
|
314
|
+
* /restart / /update) write the expectedRestart beacon flag and are NOT
|
|
315
|
+
* counted in dailyCrashCount. So crashes_24h === 0 across the recent
|
|
316
|
+
* snapshots means the bot was only restarted intentionally — no real
|
|
317
|
+
* crash evidence — even if OLDER snapshots were poisoned by the
|
|
318
|
+
* pre-v5.5.0 miscount (those age out of the window and the false WARN
|
|
319
|
+
* self-heals; see RECENT_CRASH_WINDOW_MS).
|
|
320
|
+
*
|
|
321
|
+
* Recency is determined from each snapshot's `ts` (ISO 8601, written by
|
|
322
|
+
* takeSnapshot via new Date().toISOString()). FAIL-SAFE: a snapshot whose
|
|
323
|
+
* `ts` is missing or unparseable is treated as in-window (counted) — a
|
|
324
|
+
* health monitor must fail toward "visible", never go blind on bad data.
|
|
325
|
+
*
|
|
326
|
+
* Pure function, exported for unit testing.
|
|
327
|
+
*/
|
|
328
|
+
export function hasRealCrashEvidence(snaps, nowMs = Date.now()) {
|
|
329
|
+
const cutoff = nowMs - RECENT_CRASH_WINDOW_MS;
|
|
330
|
+
return snaps.some((s) => {
|
|
331
|
+
if (!(typeof s.crashes_24h === "number" && s.crashes_24h > 0))
|
|
332
|
+
return false;
|
|
333
|
+
// FAIL-SAFE: no/garbage ts → treat as recent (never silence on bad data).
|
|
334
|
+
if (typeof s.ts !== "string")
|
|
335
|
+
return true;
|
|
336
|
+
const t = Date.parse(s.ts);
|
|
337
|
+
if (!Number.isFinite(t))
|
|
338
|
+
return true;
|
|
339
|
+
return t >= cutoff;
|
|
340
|
+
});
|
|
341
|
+
}
|
|
342
|
+
/**
|
|
343
|
+
* B2 — Returns true if AT LEAST ONE snapshot proves the bot process
|
|
344
|
+
* genuinely lived past the startup transient (i.e. it is NOT a ~62s
|
|
345
|
+
* restart loop).
|
|
346
|
+
*
|
|
347
|
+
* The first per-boot snapshot is structurally taken ~60s after boot, so
|
|
348
|
+
* its raw uptime_s is always ≈ 62 regardless of how long the process
|
|
349
|
+
* subsequently runs. uptime_peak_s is the high-water mark of REAL
|
|
350
|
+
* process.uptime() carried across process generations, so a single
|
|
351
|
+
* snapshot whose peak exceeds STARTUP_TRANSIENT_S is hard evidence the
|
|
352
|
+
* process did live for a representative duration. Legacy pre-B2 lines
|
|
353
|
+
* have no uptime_peak_s — we fall back to their raw uptime_s, so a legacy
|
|
354
|
+
* 24h cron snapshot still counts as representative on its own.
|
|
355
|
+
*
|
|
356
|
+
* A fast-restart loop with no prior healthy run never lets the peak climb past the
|
|
357
|
+
* transient, so it correctly returns false and the WARN still fires.
|
|
358
|
+
*
|
|
359
|
+
* Pure function, exported for unit testing.
|
|
360
|
+
*/
|
|
361
|
+
export function hasRepresentativeUptime(snaps) {
|
|
362
|
+
return snaps.some((s) => {
|
|
363
|
+
const peak = typeof s.uptime_peak_s === "number" && Number.isFinite(s.uptime_peak_s)
|
|
364
|
+
? s.uptime_peak_s
|
|
365
|
+
: typeof s.uptime_s === "number" && Number.isFinite(s.uptime_s)
|
|
366
|
+
? s.uptime_s
|
|
367
|
+
: 0;
|
|
368
|
+
return peak > STARTUP_TRANSIENT_S;
|
|
369
|
+
});
|
|
370
|
+
}
|
|
371
|
+
/**
|
|
372
|
+
* B2/B4 — Pure crash/restart WARN suppression decision.
|
|
373
|
+
*
|
|
374
|
+
* Encodes the SAME two gates, in the SAME precedence, that dailyTask
|
|
375
|
+
* applies inline (B2 before B4). Extracted as a pure function purely so
|
|
376
|
+
* the gate COMPOSITION (not just each helper in isolation) is unit
|
|
377
|
+
* testable — the helpers are individually correct but the interaction
|
|
378
|
+
* is where the real-crash-loop-after-a-healthy-period regression lives.
|
|
379
|
+
*
|
|
380
|
+
* Returns the suppression reason, or "none" when the WARN must fire.
|
|
381
|
+
*
|
|
382
|
+
* - "representative-uptime" (B2): a deliberate-restart / sampling
|
|
383
|
+
* artifact — the AI saw ~62s uptimes but a snapshot peak proves the
|
|
384
|
+
* process actually lived past the startup transient. ONLY applies
|
|
385
|
+
* when there is no real crash evidence: a genuine crash loop after a
|
|
386
|
+
* prior healthy period still carries the persisted high peak, so
|
|
387
|
+
* without the crash-evidence guard B2 would permanently and silently
|
|
388
|
+
* swallow it. With the guard, crashes_24h>0 falls through to B4.
|
|
389
|
+
* - "no-crash-evidence" (B4): crash/restart pattern but crashes_24h===0
|
|
390
|
+
* in every snapshot inside the recent crash-evidence window (deliberate-restart-only, not a real crash loop).
|
|
391
|
+
* - "none": the WARN is real and must be emitted.
|
|
392
|
+
*
|
|
393
|
+
* Pure function, exported for unit testing.
|
|
394
|
+
*/
|
|
395
|
+
export function evaluateCrashRestartSuppression(isCrashRestartPattern, snaps) {
|
|
396
|
+
if (!isCrashRestartPattern)
|
|
397
|
+
return "none";
|
|
398
|
+
const realCrash = hasRealCrashEvidence(snaps);
|
|
399
|
+
// B2: only the deliberate-restart / sampling-artifact case. A real
|
|
400
|
+
// crash loop (crashes_24h>0) must NOT be suppressed here even though
|
|
401
|
+
// the persisted uptime high-water mark still reads representative.
|
|
402
|
+
if (!realCrash && hasRepresentativeUptime(snaps))
|
|
403
|
+
return "representative-uptime";
|
|
404
|
+
// B4: crash/restart pattern with zero real crash evidence.
|
|
405
|
+
if (!realCrash)
|
|
406
|
+
return "no-crash-evidence";
|
|
407
|
+
return "none";
|
|
408
|
+
}
|
|
409
|
+
/** Test-only: take a snapshot without writing to trends.jsonl. */
|
|
410
|
+
export function __takeSnapshotForTest(activeProvider) {
|
|
411
|
+
return takeSnapshot(activeProvider);
|
|
412
|
+
}
|
|
198
413
|
function parseTrendResponse(text) {
|
|
199
414
|
if (/^ANOMALY:\s*NONE/im.test(text)) {
|
|
200
415
|
return {
|
|
@@ -296,6 +511,38 @@ async function dailyTask(registry) {
|
|
|
296
511
|
console.log(`📊 Trends AI: no anomaly detected`);
|
|
297
512
|
return;
|
|
298
513
|
}
|
|
514
|
+
const recentSnaps = readSnapshots(30);
|
|
515
|
+
const isCrashRestartPattern = /crash|restart|loop|uptime/i.test(result.description);
|
|
516
|
+
// B2 gate: suppress an "uptime stuck at ~62s / restart loop" WARN when
|
|
517
|
+
// the snapshots PROVE the process actually lived past the startup
|
|
518
|
+
// transient. The first per-boot snapshot is structurally sampled ~60s
|
|
519
|
+
// after boot, so raw uptime_s reads ≈62 even for a perfectly healthy
|
|
520
|
+
// bot that has been up for hours by the time the daily snapshot fires.
|
|
521
|
+
// uptime_peak_s is the high-water mark of real process.uptime() across
|
|
522
|
+
// process generations: if ANY snapshot's peak exceeds the transient,
|
|
523
|
+
// the "~62s loop" conclusion is factually false. A genuine fast-restart
|
|
524
|
+
// loop is expected to show recent crashes_24h > 0 (real crash evidence), so it is NOT suppressed here.
|
|
525
|
+
if (isCrashRestartPattern && !hasRealCrashEvidence(recentSnaps) && hasRepresentativeUptime(recentSnaps)) {
|
|
526
|
+
console.log(`📊 Trends AI: suppressed WARN "${result.description}" — ` +
|
|
527
|
+
`uptime/restart pattern flagged but at least one snapshot shows a ` +
|
|
528
|
+
`representative peak uptime (>${STARTUP_TRANSIENT_S}s); the process ` +
|
|
529
|
+
`did live well past the post-restart sampling window, not a ~62s loop`);
|
|
530
|
+
return;
|
|
531
|
+
}
|
|
532
|
+
// B4 gate: suppress WARN when the AI flags a crash/restart-loop pattern
|
|
533
|
+
// but the snapshots in the recent (~48h) window contain ZERO real crash evidence
|
|
534
|
+
// (crashes_24h === 0 across the board). This happens when the bot was
|
|
535
|
+
// restarted deliberately (launchctl reload / /update / /restart) — those
|
|
536
|
+
// produce low uptimes that the AI reads as "restart loop", but the
|
|
537
|
+
// crash counter stays at 0 because markExpectedRestart() was written
|
|
538
|
+
// on each clean shutdown. A real crash loop WILL have crashes_24h > 0
|
|
539
|
+
// in at least one recent (≤48h) snapshot and will still fire the WARN.
|
|
540
|
+
if (isCrashRestartPattern && !hasRealCrashEvidence(recentSnaps)) {
|
|
541
|
+
console.log(`📊 Trends AI: suppressed WARN "${result.description}" — ` +
|
|
542
|
+
`crash/restart pattern detected but crashes_24h=0 across all snapshots ` +
|
|
543
|
+
`(deliberate-restart-only, not a real crash loop)`);
|
|
544
|
+
return;
|
|
545
|
+
}
|
|
299
546
|
console.log(`📊 Trends AI: ANOMALY (${result.severity}) — ${result.description}`);
|
|
300
547
|
emitCritical({
|
|
301
548
|
category: "custom",
|