alvin-bot 4.8.9 → 4.9.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/CHANGELOG.md +44 -0
- package/dist/handlers/message.js +5 -2
- package/dist/index.js +14 -10
- package/dist/platforms/whatsapp-auth-helpers.js +53 -0
- package/dist/platforms/whatsapp.js +6 -2
- package/dist/services/browser-manager.js +82 -10
- package/dist/services/browser-webfetch.js +93 -0
- package/dist/services/cron-scheduling.js +142 -0
- package/dist/services/cron.js +32 -6
- package/dist/services/skills.js +15 -11
- package/dist/services/subagent-delivery.js +8 -2
- package/dist/services/subagents.js +49 -8
- package/dist/services/telegram.js +12 -3
- package/dist/services/watchdog-brake.js +113 -0
- package/dist/services/watchdog.js +56 -42
- package/dist/util/console-formatter.js +109 -0
- package/dist/util/debounce.js +24 -0
- package/dist/util/telegram-error-filter.js +62 -0
- package/dist/web/server.js +56 -0
- package/package.json +1 -1
- package/test/browser-webfetch.test.ts +121 -0
- package/test/console-timestamps.test.ts +98 -0
- package/test/cron-restart-resilience.test.ts +191 -0
- package/test/debounce.test.ts +60 -0
- package/test/subagent-final-text.test.ts +132 -0
- package/test/telegram-error-filter.test.ts +85 -0
- package/test/watchdog-brake.test.ts +157 -0
- package/test/web-server-shutdown.test.ts +111 -0
- package/test/whatsapp-auth-resilience.test.ts +96 -0
package/dist/services/cron.js
CHANGED
|
@@ -13,6 +13,7 @@ import fs from "fs";
|
|
|
13
13
|
import { execSync } from "child_process";
|
|
14
14
|
import { dirname } from "path";
|
|
15
15
|
import { CRON_FILE, BOT_ROOT } from "../paths.js";
|
|
16
|
+
import { prepareForExecution, handleStartupCatchup, calculateNextRunFrom, } from "./cron-scheduling.js";
|
|
16
17
|
// ── Storage ─────────────────────────────────────────────
|
|
17
18
|
function loadJobs() {
|
|
18
19
|
try {
|
|
@@ -240,6 +241,25 @@ const runningJobs = new Set(); // Guard against overlapping executions
|
|
|
240
241
|
export function startScheduler() {
|
|
241
242
|
if (schedulerTimer)
|
|
242
243
|
return;
|
|
244
|
+
// Startup catch-up — re-run jobs whose last attempt crashed within
|
|
245
|
+
// the grace window. Must run BEFORE the first scheduler tick so the
|
|
246
|
+
// catch-up nextRunAt rewind is visible on the very next pass.
|
|
247
|
+
try {
|
|
248
|
+
const bootJobs = loadJobs();
|
|
249
|
+
const caught = handleStartupCatchup(bootJobs, Date.now());
|
|
250
|
+
// Only persist if something actually changed to avoid needless writes
|
|
251
|
+
const mutated = caught.some((j, i) => j.nextRunAt !== bootJobs[i].nextRunAt);
|
|
252
|
+
if (mutated) {
|
|
253
|
+
saveJobs(caught);
|
|
254
|
+
const names = caught
|
|
255
|
+
.filter((j, i) => j.nextRunAt !== bootJobs[i].nextRunAt)
|
|
256
|
+
.map((j) => j.name);
|
|
257
|
+
console.log(`⏰ Cron startup catch-up: rewound ${names.length} job(s): ${names.join(", ")}`);
|
|
258
|
+
}
|
|
259
|
+
}
|
|
260
|
+
catch (err) {
|
|
261
|
+
console.error("⏰ Cron startup catch-up failed:", err);
|
|
262
|
+
}
|
|
243
263
|
// Check every 30 seconds for due jobs
|
|
244
264
|
schedulerTimer = setInterval(async () => {
|
|
245
265
|
const jobs = loadJobs();
|
|
@@ -248,7 +268,7 @@ export function startScheduler() {
|
|
|
248
268
|
for (const job of jobs) {
|
|
249
269
|
if (!job.enabled)
|
|
250
270
|
continue;
|
|
251
|
-
// Skip if this job is already running
|
|
271
|
+
// Skip if this job is already running in THIS bot instance
|
|
252
272
|
if (runningJobs.has(job.id))
|
|
253
273
|
continue;
|
|
254
274
|
// Calculate next run if not set
|
|
@@ -258,9 +278,13 @@ export function startScheduler() {
|
|
|
258
278
|
}
|
|
259
279
|
if (job.nextRunAt && now >= job.nextRunAt) {
|
|
260
280
|
console.log(`Cron: Running job "${job.name}" (${job.id})`);
|
|
261
|
-
//
|
|
281
|
+
// Pre-execution state update: advance nextRunAt to the NEXT regular
|
|
282
|
+
// trigger (NOT null) and stamp lastAttemptAt. If the bot crashes
|
|
283
|
+
// mid-execution, handleStartupCatchup will notice the attempt
|
|
284
|
+
// without completion and re-run the job within the grace window.
|
|
262
285
|
runningJobs.add(job.id);
|
|
263
|
-
|
|
286
|
+
const prepared = prepareForExecution(job, now);
|
|
287
|
+
Object.assign(job, prepared);
|
|
264
288
|
saveJobs(jobs);
|
|
265
289
|
try {
|
|
266
290
|
const result = await executeJob(job);
|
|
@@ -268,8 +292,8 @@ export function startScheduler() {
|
|
|
268
292
|
const freshJobs = loadJobs();
|
|
269
293
|
const freshJob = freshJobs.find(j => j.id === job.id);
|
|
270
294
|
if (freshJob) {
|
|
271
|
-
freshJob.lastRunAt = now;
|
|
272
|
-
freshJob.lastResult = result.output.slice(0,
|
|
295
|
+
freshJob.lastRunAt = Date.now();
|
|
296
|
+
freshJob.lastResult = result.output.slice(0, 4000);
|
|
273
297
|
freshJob.lastError = result.error || null;
|
|
274
298
|
freshJob.runCount++;
|
|
275
299
|
if (freshJob.oneShot) {
|
|
@@ -277,7 +301,9 @@ export function startScheduler() {
|
|
|
277
301
|
freshJob.nextRunAt = null;
|
|
278
302
|
}
|
|
279
303
|
else {
|
|
280
|
-
|
|
304
|
+
// nextRunAt already set pre-execution, but recalculate in case
|
|
305
|
+
// the schedule or enabled state changed during execution.
|
|
306
|
+
freshJob.nextRunAt = calculateNextRunFrom(freshJob, Date.now());
|
|
281
307
|
}
|
|
282
308
|
saveJobs(freshJobs);
|
|
283
309
|
}
|
package/dist/services/skills.js
CHANGED
|
@@ -21,6 +21,7 @@ import { resolve } from "path";
|
|
|
21
21
|
import { SKILLS_DIR } from "../paths.js";
|
|
22
22
|
import { USER_SKILLS_DIR } from "../paths.js";
|
|
23
23
|
import { loadAssetIndex } from "./asset-index.js";
|
|
24
|
+
import { debounce } from "../util/debounce.js";
|
|
24
25
|
// ── Skill Registry ──────────────────────────────────────
|
|
25
26
|
let cachedSkills = [];
|
|
26
27
|
let lastScanAt = 0;
|
|
@@ -143,23 +144,26 @@ function reloadAllSkills() {
|
|
|
143
144
|
*/
|
|
144
145
|
export function loadSkills() {
|
|
145
146
|
reloadAllSkills();
|
|
146
|
-
// Hot-reload watchers
|
|
147
|
+
// Hot-reload watchers — macOS FSEvents delivers many duplicate events
|
|
148
|
+
// for a single logical change, so we coalesce bursts into one reload.
|
|
149
|
+
const bundledReload = debounce(() => {
|
|
150
|
+
console.log("Skills changed (bundled) \u2014 reloading");
|
|
151
|
+
reloadAllSkills();
|
|
152
|
+
}, 300);
|
|
153
|
+
const userReload = debounce(() => {
|
|
154
|
+
console.log("Skills changed (user) \u2014 reloading");
|
|
155
|
+
reloadAllSkills();
|
|
156
|
+
}, 300);
|
|
147
157
|
try {
|
|
148
|
-
watch(SKILLS_DIR, { recursive: true }, () =>
|
|
149
|
-
console.log("Skills changed (bundled) \u2014 reloading");
|
|
150
|
-
reloadAllSkills();
|
|
151
|
-
});
|
|
158
|
+
watch(SKILLS_DIR, { recursive: true }, () => bundledReload());
|
|
152
159
|
}
|
|
153
|
-
catch { }
|
|
160
|
+
catch { /* ignore — watcher failures fall back to manual reload */ }
|
|
154
161
|
try {
|
|
155
162
|
if (existsSync(USER_SKILLS_DIR)) {
|
|
156
|
-
watch(USER_SKILLS_DIR, { recursive: true }, () =>
|
|
157
|
-
console.log("Skills changed (user) \u2014 reloading");
|
|
158
|
-
reloadAllSkills();
|
|
159
|
-
});
|
|
163
|
+
watch(USER_SKILLS_DIR, { recursive: true }, () => userReload());
|
|
160
164
|
}
|
|
161
165
|
}
|
|
162
|
-
catch { }
|
|
166
|
+
catch { /* ignore */ }
|
|
163
167
|
return cachedSkills;
|
|
164
168
|
}
|
|
165
169
|
/**
|
|
@@ -47,11 +47,17 @@ function statusIcon(status) {
|
|
|
47
47
|
}
|
|
48
48
|
}
|
|
49
49
|
function buildBanner(info, result) {
|
|
50
|
-
|
|
50
|
+
// A "completed" run that produced zero output is almost always a
|
|
51
|
+
// silent failure — a truncated stream, a tool-only final turn, a
|
|
52
|
+
// provider that swallowed its response. Call that out explicitly so
|
|
53
|
+
// the user sees a clear signal instead of a green tick on nothing.
|
|
54
|
+
const truncated = result.status === "completed" && (!result.output || result.output.trim().length === 0);
|
|
55
|
+
const icon = truncated ? "⚠️" : statusIcon(result.status);
|
|
56
|
+
const statusLabel = truncated ? "completed · empty output" : result.status;
|
|
51
57
|
const dur = formatDuration(result.duration);
|
|
52
58
|
const ti = formatTokens(result.tokensUsed.input);
|
|
53
59
|
const to = formatTokens(result.tokensUsed.output);
|
|
54
|
-
return `${icon} *${info.name}* ${
|
|
60
|
+
return `${icon} *${info.name}* ${statusLabel} · ${dur} · ${ti} in / ${to} out`;
|
|
55
61
|
}
|
|
56
62
|
// ── A4 Live-Stream ──────────────────────────────────────────
|
|
57
63
|
/**
|
|
@@ -231,6 +231,13 @@ async function runSubAgent(id, agentConfig, abort, resolvedName) {
|
|
|
231
231
|
console.error(`[subagent ${id}] live-stream init failed:`, err);
|
|
232
232
|
}
|
|
233
233
|
}
|
|
234
|
+
// These live OUTSIDE the try block so the catch handler can read
|
|
235
|
+
// whatever was buffered before the stream failed. Moving them into
|
|
236
|
+
// the try scope was the cause of the "output: ''" regression.
|
|
237
|
+
let finalText = "";
|
|
238
|
+
let inputTokens = 0;
|
|
239
|
+
let outputTokens = 0;
|
|
240
|
+
let streamError = null;
|
|
234
241
|
try {
|
|
235
242
|
const { getRegistry } = await import("../engine.js");
|
|
236
243
|
const registry = getRegistry();
|
|
@@ -243,9 +250,6 @@ async function runSubAgent(id, agentConfig, abort, resolvedName) {
|
|
|
243
250
|
? agentConfig.workingDir || os.homedir()
|
|
244
251
|
: os.homedir();
|
|
245
252
|
const systemPrompt = `You are a sub-agent named "${resolvedName}". Complete the following task autonomously and report your results clearly when done. Working directory: ${effectiveCwd}`;
|
|
246
|
-
let finalText = "";
|
|
247
|
-
let inputTokens = 0;
|
|
248
|
-
let outputTokens = 0;
|
|
249
253
|
for await (const chunk of registry.queryWithFallback({
|
|
250
254
|
prompt: agentConfig.prompt,
|
|
251
255
|
systemPrompt,
|
|
@@ -254,16 +258,33 @@ async function runSubAgent(id, agentConfig, abort, resolvedName) {
|
|
|
254
258
|
abortSignal: abort.signal,
|
|
255
259
|
})) {
|
|
256
260
|
if (chunk.type === "text") {
|
|
257
|
-
|
|
258
|
-
//
|
|
261
|
+
// Both SDK providers emit `text` as the accumulated string.
|
|
262
|
+
// Keep the last non-empty one we've seen so a final tool-only
|
|
263
|
+
// turn doesn't wipe our buffer.
|
|
264
|
+
if (chunk.text && chunk.text.length > 0) {
|
|
265
|
+
finalText = chunk.text;
|
|
266
|
+
}
|
|
259
267
|
if (liveStream && !liveStream.failed) {
|
|
260
268
|
liveStream.update(finalText);
|
|
261
269
|
}
|
|
262
270
|
}
|
|
263
271
|
if (chunk.type === "done") {
|
|
272
|
+
// done.text is the authoritative final accumulated text from
|
|
273
|
+
// the provider. Prefer it over the buffered value so runs that
|
|
274
|
+
// end on a tool_use don't leave us with a pre-tool snippet.
|
|
275
|
+
if (chunk.text && chunk.text.length > 0) {
|
|
276
|
+
finalText = chunk.text;
|
|
277
|
+
}
|
|
264
278
|
inputTokens = chunk.inputTokens || 0;
|
|
265
279
|
outputTokens = chunk.outputTokens || 0;
|
|
266
280
|
}
|
|
281
|
+
if (chunk.type === "error") {
|
|
282
|
+
// Providers surface mid-stream errors as an `error` chunk
|
|
283
|
+
// instead of throwing. Capture the reason so the post-loop
|
|
284
|
+
// status resolution below can distinguish this from a clean
|
|
285
|
+
// finish, and keep whatever text we already buffered.
|
|
286
|
+
streamError = chunk.error || "stream error";
|
|
287
|
+
}
|
|
267
288
|
}
|
|
268
289
|
// If cancelAllSubAgents has already taken over (shutdown path), don't
|
|
269
290
|
// overwrite the cancelled result it synthesised. Also: if the generator
|
|
@@ -285,6 +306,21 @@ async function runSubAgent(id, agentConfig, abort, resolvedName) {
|
|
|
285
306
|
};
|
|
286
307
|
entry.info.status = "cancelled";
|
|
287
308
|
}
|
|
309
|
+
else if (streamError) {
|
|
310
|
+
// Provider emitted an error chunk but the generator ended cleanly —
|
|
311
|
+
// record it as an error, but preserve the text buffered before the
|
|
312
|
+
// failure so the caller sees useful partial output instead of "".
|
|
313
|
+
entry.result = {
|
|
314
|
+
id,
|
|
315
|
+
name: resolvedName,
|
|
316
|
+
status: "error",
|
|
317
|
+
output: finalText,
|
|
318
|
+
tokensUsed: { input: inputTokens, output: outputTokens },
|
|
319
|
+
duration: Date.now() - startTime,
|
|
320
|
+
error: streamError,
|
|
321
|
+
};
|
|
322
|
+
entry.info.status = "error";
|
|
323
|
+
}
|
|
288
324
|
else {
|
|
289
325
|
entry.result = {
|
|
290
326
|
id,
|
|
@@ -312,6 +348,9 @@ async function runSubAgent(id, agentConfig, abort, resolvedName) {
|
|
|
312
348
|
}
|
|
313
349
|
}
|
|
314
350
|
catch (err) {
|
|
351
|
+
// If cancelAllSubAgents already set a cancelled result, keep it.
|
|
352
|
+
if (entry.result && entry.result.status === "cancelled")
|
|
353
|
+
return;
|
|
315
354
|
const isAbort = err instanceof Error && err.message.includes("abort");
|
|
316
355
|
const isTimeout = abort.signal.aborted;
|
|
317
356
|
const status = isTimeout
|
|
@@ -322,11 +361,13 @@ async function runSubAgent(id, agentConfig, abort, resolvedName) {
|
|
|
322
361
|
entry.result = {
|
|
323
362
|
id,
|
|
324
363
|
name: resolvedName,
|
|
325
|
-
|
|
326
|
-
output
|
|
327
|
-
|
|
364
|
+
// Preserve whatever text was buffered before the failure.
|
|
365
|
+
// Empty output here used to throw away multi-minute runs.
|
|
366
|
+
output: finalText,
|
|
367
|
+
tokensUsed: { input: inputTokens, output: outputTokens },
|
|
328
368
|
duration: Date.now() - startTime,
|
|
329
369
|
error: err instanceof Error ? err.message : String(err),
|
|
370
|
+
status,
|
|
330
371
|
};
|
|
331
372
|
entry.info.status = status;
|
|
332
373
|
}
|
|
@@ -1,5 +1,6 @@
|
|
|
1
1
|
import { config } from "../config.js";
|
|
2
2
|
import { sanitizeTelegramMarkdown } from "./markdown.js";
|
|
3
|
+
import { isHarmlessTelegramError } from "../util/telegram-error-filter.js";
|
|
3
4
|
export class TelegramStreamer {
|
|
4
5
|
messageId = null;
|
|
5
6
|
chatId;
|
|
@@ -94,9 +95,17 @@ export class TelegramStreamer {
|
|
|
94
95
|
// If text fits in one message, just update the existing one
|
|
95
96
|
if (safeText.length <= config.telegramMaxLength && this.messageId) {
|
|
96
97
|
if (safeText !== this.lastSentText) {
|
|
97
|
-
|
|
98
|
-
|
|
99
|
-
|
|
98
|
+
try {
|
|
99
|
+
await this.api.editMessageText(this.chatId, this.messageId, safeText, {
|
|
100
|
+
parse_mode: "Markdown",
|
|
101
|
+
}).catch(() => this.api.editMessageText(this.chatId, this.messageId, safeText));
|
|
102
|
+
}
|
|
103
|
+
catch (err) {
|
|
104
|
+
// Drop "message is not modified" / "message to edit not found"
|
|
105
|
+
// races silently — they're harmless and always race-based.
|
|
106
|
+
if (!isHarmlessTelegramError(err))
|
|
107
|
+
throw err;
|
|
108
|
+
}
|
|
100
109
|
}
|
|
101
110
|
return;
|
|
102
111
|
}
|
|
@@ -0,0 +1,113 @@
|
|
|
1
|
+
/**
|
|
2
|
+
* Pure crash-loop brake logic, extracted from watchdog.ts so it can be
|
|
3
|
+
* unit-tested without touching the filesystem or launchctl.
|
|
4
|
+
*
|
|
5
|
+
* See test/watchdog-brake.test.ts for the regression this closes:
|
|
6
|
+
* chronic crashes with >5 min of uptime between them used to reset
|
|
7
|
+
* the counter before it could trip the brake, so the bot cycled
|
|
8
|
+
* indefinitely. The new policy enforces TWO thresholds — a fast
|
|
9
|
+
* short-window brake and a hard 24h daily cap — and only resets the
|
|
10
|
+
* counter after a real 1 h of clean uptime.
|
|
11
|
+
*/
|
|
12
|
+
/**
 * Default thresholds and windows for the crash-loop brake.
 *
 * Frozen on purpose: this object is exported and shared by every importer,
 * so an accidental write would silently change the brake policy everywhere.
 * Callers that need different values pass per-call overrides via `opts`.
 */
export const DEFAULTS = Object.freeze({
    /** Beacon older than this → previous process exited cleanly (or the
     *  machine was rebooted); do not count as a crash. */
    STALE_BEACON_MS: 90_000,
    /** Short-window crash tracking — N crashes in SHORT_WINDOW_MS. */
    SHORT_WINDOW_MS: 10 * 60_000,
    SHORT_BRAKE_THRESHOLD: 10,
    /** Daily crash cap — hard ceiling regardless of gaps. Tripping this
     *  means the bot has been restarting >20 times per day, which is
     *  almost certainly a chronic issue worth freezing and alerting. */
    DAILY_WINDOW_MS: 24 * 60 * 60 * 1000,
    DAILY_BRAKE_THRESHOLD: 20,
    /** Uptime required before the short-window counter resets. Was 5 min
     *  in the buggy version — but 5 min is shorter than the typical
     *  sub-agent lifetime (the daily job-alert takes 10+ min), so chronic
     *  crashes with ≥5 min gaps sailed right past the brake. 1 h is safer. */
    RESET_AFTER_MS: 60 * 60_000,
});
|
|
30
|
+
/**
 * Given the previous beacon (or null on first boot) and the current time,
 * decide whether the bot should proceed with boot or engage the crash-loop
 * brake.
 *
 * PURE: no fs, no launchctl, no clock — `now` is an explicit parameter.
 *
 * @param {object|null} previous - Beacon left by the previous process, or a
 *   falsy value when there is none. Fields read: `lastBeat`, `crashCount`,
 *   `crashWindowStart`, `dailyCrashCount`, `dailyCrashWindowStart`. The two
 *   daily fields may be missing (beacons written before the daily cap
 *   existed); they then default to 0 / `now`.
 * @param {number} now - Current time in milliseconds, on the same clock as
 *   the beacon timestamps.
 * @param {object} [opts] - Per-call overrides: `staleBeaconMs`,
 *   `shortWindowMs`, `shortBrakeThreshold`, `dailyWindowMs`,
 *   `dailyBrakeThreshold`. Anything null/undefined falls back to DEFAULTS.
 * @returns {object} Either `{ action: "brake", reason }` or
 *   `{ action: "proceed", crashCount, crashWindowStart, dailyCrashCount,
 *   dailyCrashWindowStart }` holding the counters to persist.
 */
export function decideBrakeAction(previous, now, opts = {}) {
    const staleMs = opts.staleBeaconMs ?? DEFAULTS.STALE_BEACON_MS;
    const shortWindow = opts.shortWindowMs ?? DEFAULTS.SHORT_WINDOW_MS;
    const shortBrake = opts.shortBrakeThreshold ?? DEFAULTS.SHORT_BRAKE_THRESHOLD;
    const dailyWindow = opts.dailyWindowMs ?? DEFAULTS.DAILY_WINDOW_MS;
    const dailyBrake = opts.dailyBrakeThreshold ?? DEFAULTS.DAILY_BRAKE_THRESHOLD;
    // First boot or no beacon file → clean start
    if (!previous) {
        return {
            action: "proceed",
            crashCount: 0,
            crashWindowStart: now,
            dailyCrashCount: 0,
            dailyCrashWindowStart: now,
        };
    }
    // Beacons without the daily fields would otherwise poison the daily
    // counter with NaN: `undefined + 1` is NaN and `NaN >= threshold` is
    // always false, so the daily brake could never trip. Treat a missing or
    // non-finite value as "daily window starts now with zero crashes".
    let dailyCount = Number.isFinite(previous.dailyCrashCount)
        ? previous.dailyCrashCount
        : 0;
    let dailyStart = Number.isFinite(previous.dailyCrashWindowStart)
        ? previous.dailyCrashWindowStart
        : now;
    // Daily window roll-over first — it's independent of short window.
    if (now - dailyStart >= dailyWindow) {
        dailyCount = 0;
        dailyStart = now;
    }
    const timeSinceLastBeat = now - previous.lastBeat;
    const previousExitedRecently = timeSinceLastBeat < staleMs;
    if (!previousExitedRecently) {
        // Clean exit (or machine reboot between runs) → short-window counter
        // resets, but the daily counter keeps going unless its own window
        // already expired above.
        return {
            action: "proceed",
            crashCount: 0,
            crashWindowStart: now,
            dailyCrashCount: dailyCount,
            dailyCrashWindowStart: dailyStart,
        };
    }
    // Short-window logic: start a new window once the old one has expired,
    // otherwise keep counting inside the existing one.
    const shortWindowExpired = now - previous.crashWindowStart >= shortWindow;
    let crashCount;
    let crashWindowStart;
    if (shortWindowExpired) {
        crashCount = 1;
        crashWindowStart = now;
    }
    else {
        crashCount = previous.crashCount + 1;
        crashWindowStart = previous.crashWindowStart;
    }
    // Increment daily count since we treat this as a crash
    dailyCount += 1;
    if (crashCount >= shortBrake) {
        return {
            action: "brake",
            reason: `${crashCount} crashes within short window (${Math.round(shortWindow / 60_000)}min) — threshold is ${shortBrake}`,
        };
    }
    if (dailyCount >= dailyBrake) {
        return {
            action: "brake",
            reason: `${dailyCount} crashes within daily window (${Math.round(dailyWindow / 3_600_000)}h) — threshold is ${dailyBrake}`,
        };
    }
    return {
        action: "proceed",
        crashCount,
        crashWindowStart,
        dailyCrashCount: dailyCount,
        dailyCrashWindowStart: dailyStart,
    };
}
|
|
108
|
+
/**
 * Report whether the process has been up long enough for the short-window
 * crash counter to be cleared.
 *
 * @param {number} uptimeMs - Uptime of the current process in milliseconds.
 * @param {object} [opts] - Optional `resetAfterMs` override; when it is
 *   null/undefined, DEFAULTS.RESET_AFTER_MS (1 h) is used.
 * @returns {boolean} true once `uptimeMs` has reached the required uptime.
 */
export function shouldResetCrashCounter(uptimeMs, opts = {}) {
    const requiredUptimeMs = opts.resetAfterMs ?? DEFAULTS.RESET_AFTER_MS;
    if (uptimeMs >= requiredUptimeMs) {
        return true;
    }
    return false;
}
|
|
@@ -27,15 +27,13 @@ import { resolve } from "path";
|
|
|
27
27
|
import os from "os";
|
|
28
28
|
import { execSync } from "child_process";
|
|
29
29
|
import { BOT_VERSION } from "../version.js";
|
|
30
|
+
import { decideBrakeAction, shouldResetCrashCounter, DEFAULTS, } from "./watchdog-brake.js";
|
|
30
31
|
const DATA_DIR = process.env.ALVIN_DATA_DIR || resolve(os.homedir(), ".alvin-bot");
|
|
31
32
|
const STATE_DIR = resolve(DATA_DIR, "state");
|
|
32
33
|
const BEACON_FILE = resolve(STATE_DIR, "watchdog.json");
|
|
33
34
|
const ALERT_FILE = resolve(STATE_DIR, "crash-loop.alert");
|
|
34
35
|
const BEACON_INTERVAL_MS = 30_000; // write a beacon every 30 s
|
|
35
|
-
|
|
36
|
-
const CRASH_BRAKE_THRESHOLD = 10; // after this many crashes in the window, brake
|
|
37
|
-
const STALE_BEACON_MS = 90_000; // a beacon older than this is considered "old enough that previous process really exited"
|
|
38
|
-
const RECOVERY_UPTIME_MS = 5 * 60 * 1000; // 5 min of clean uptime resets the counter
|
|
36
|
+
// Thresholds and windows live in watchdog-brake.ts DEFAULTS.
|
|
39
37
|
let beaconTimer = null;
|
|
40
38
|
let resetTimer = null;
|
|
41
39
|
let bootTime = 0;
|
|
@@ -57,7 +55,21 @@ function readBeacon() {
|
|
|
57
55
|
typeof parsed.crashCount === "number" &&
|
|
58
56
|
typeof parsed.crashWindowStart === "number" &&
|
|
59
57
|
typeof parsed.version === "string") {
|
|
60
|
-
|
|
58
|
+
// Older beacons don't have daily-counter fields — default them to
|
|
59
|
+
// 0/now so the brake logic treats this run as the start of the
|
|
60
|
+
// first daily window.
|
|
61
|
+
return {
|
|
62
|
+
lastBeat: parsed.lastBeat,
|
|
63
|
+
pid: parsed.pid,
|
|
64
|
+
bootTime: parsed.bootTime,
|
|
65
|
+
crashCount: parsed.crashCount,
|
|
66
|
+
crashWindowStart: parsed.crashWindowStart,
|
|
67
|
+
version: parsed.version,
|
|
68
|
+
dailyCrashCount: typeof parsed.dailyCrashCount === "number" ? parsed.dailyCrashCount : 0,
|
|
69
|
+
dailyCrashWindowStart: typeof parsed.dailyCrashWindowStart === "number"
|
|
70
|
+
? parsed.dailyCrashWindowStart
|
|
71
|
+
: Date.now(),
|
|
72
|
+
};
|
|
61
73
|
}
|
|
62
74
|
return null;
|
|
63
75
|
}
|
|
@@ -78,8 +90,9 @@ function writeAlert(reason, crashCount) {
|
|
|
78
90
|
const content = [
|
|
79
91
|
`Alvin Bot crash-loop brake hit at ${new Date().toISOString()}`,
|
|
80
92
|
`Version: ${BOT_VERSION}`,
|
|
81
|
-
`Crashes in the last ${
|
|
82
|
-
`
|
|
93
|
+
`Crashes in the last ${DEFAULTS.SHORT_WINDOW_MS / 60_000} minutes: ${crashCount}`,
|
|
94
|
+
`Short-window threshold: ${DEFAULTS.SHORT_BRAKE_THRESHOLD}`,
|
|
95
|
+
`Daily threshold: ${DEFAULTS.DAILY_BRAKE_THRESHOLD}`,
|
|
83
96
|
``,
|
|
84
97
|
`Reason: ${reason}`,
|
|
85
98
|
``,
|
|
@@ -147,36 +160,25 @@ export function startWatchdog() {
|
|
|
147
160
|
ensureStateDir();
|
|
148
161
|
bootTime = Date.now();
|
|
149
162
|
const previous = readBeacon();
|
|
150
|
-
|
|
151
|
-
|
|
163
|
+
const decision = decideBrakeAction(previous, bootTime);
|
|
164
|
+
if (decision.action === "brake") {
|
|
165
|
+
console.error(`[watchdog] crash-loop brake triggered: ${decision.reason}`);
|
|
166
|
+
writeAlert(decision.reason, previous?.crashCount ?? 0);
|
|
167
|
+
checkCrashLoopBrake();
|
|
168
|
+
// checkCrashLoopBrake calls process.exit — execution never reaches here.
|
|
169
|
+
return;
|
|
170
|
+
}
|
|
171
|
+
let crashCount = decision.crashCount;
|
|
172
|
+
let crashWindowStart = decision.crashWindowStart;
|
|
173
|
+
let dailyCrashCount = decision.dailyCrashCount;
|
|
174
|
+
let dailyCrashWindowStart = decision.dailyCrashWindowStart;
|
|
152
175
|
if (previous) {
|
|
153
176
|
const timeSinceLastBeat = bootTime - previous.lastBeat;
|
|
154
|
-
|
|
155
|
-
|
|
156
|
-
|
|
157
|
-
|
|
158
|
-
|
|
159
|
-
if (inWindow) {
|
|
160
|
-
crashCount = previous.crashCount + 1;
|
|
161
|
-
crashWindowStart = previous.crashWindowStart;
|
|
162
|
-
}
|
|
163
|
-
else {
|
|
164
|
-
// Previous crash was outside the window → reset counter
|
|
165
|
-
crashCount = 1;
|
|
166
|
-
}
|
|
167
|
-
console.log(`[watchdog] detected restart after ${Math.round(timeSinceLastBeat / 1000)}s — crash ${crashCount}/${CRASH_BRAKE_THRESHOLD} in current ${CRASH_WINDOW_MS / 60_000}min window`);
|
|
168
|
-
if (crashCount >= CRASH_BRAKE_THRESHOLD) {
|
|
169
|
-
console.error(`[watchdog] crash-loop brake triggered (${crashCount} crashes in ${CRASH_WINDOW_MS / 60_000}min)`);
|
|
170
|
-
writeAlert(`Process restarted ${crashCount} times within ${CRASH_WINDOW_MS / 60_000} minutes. Last beacon was ${Math.round(timeSinceLastBeat / 1000)}s ago. Most likely a deterministic crash on startup.`, crashCount);
|
|
171
|
-
// Re-use the brake check to unload + exit cleanly
|
|
172
|
-
checkCrashLoopBrake();
|
|
173
|
-
}
|
|
174
|
-
}
|
|
175
|
-
else {
|
|
176
|
-
// Previous beacon was old → process had clean uptime before exit,
|
|
177
|
-
// OR system was rebooted between runs. Reset crash count.
|
|
178
|
-
crashCount = 0;
|
|
179
|
-
crashWindowStart = bootTime;
|
|
177
|
+
if (timeSinceLastBeat < DEFAULTS.STALE_BEACON_MS) {
|
|
178
|
+
console.log(`[watchdog] detected restart after ${Math.round(timeSinceLastBeat / 1000)}s — ` +
|
|
179
|
+
`crash ${crashCount}/${DEFAULTS.SHORT_BRAKE_THRESHOLD} in current ` +
|
|
180
|
+
`${DEFAULTS.SHORT_WINDOW_MS / 60_000}min window, ` +
|
|
181
|
+
`${dailyCrashCount}/${DEFAULTS.DAILY_BRAKE_THRESHOLD} in current 24h window`);
|
|
180
182
|
}
|
|
181
183
|
}
|
|
182
184
|
// Write the first beacon immediately so a fresh restart updates the file
|
|
@@ -186,6 +188,8 @@ export function startWatchdog() {
|
|
|
186
188
|
bootTime,
|
|
187
189
|
crashCount,
|
|
188
190
|
crashWindowStart,
|
|
191
|
+
dailyCrashCount,
|
|
192
|
+
dailyCrashWindowStart,
|
|
189
193
|
version: BOT_VERSION,
|
|
190
194
|
});
|
|
191
195
|
// Periodic beacon writer
|
|
@@ -196,15 +200,20 @@ export function startWatchdog() {
|
|
|
196
200
|
bootTime,
|
|
197
201
|
crashCount,
|
|
198
202
|
crashWindowStart,
|
|
203
|
+
dailyCrashCount,
|
|
204
|
+
dailyCrashWindowStart,
|
|
199
205
|
version: BOT_VERSION,
|
|
200
206
|
});
|
|
201
207
|
}, BEACON_INTERVAL_MS);
|
|
202
|
-
// Schedule a recovery counter reset after
|
|
203
|
-
// uptime.
|
|
204
|
-
//
|
|
208
|
+
// Schedule a recovery counter reset after RESET_AFTER_MS (1 h by default)
|
|
209
|
+
// of clean uptime. The old policy was 5 min — too short because chronic
|
|
210
|
+
// crashes often had 5-10 min gaps and never tripped the brake.
|
|
205
211
|
resetTimer = setTimeout(() => {
|
|
206
|
-
|
|
207
|
-
|
|
212
|
+
const uptime = Date.now() - bootTime;
|
|
213
|
+
if (shouldResetCrashCounter(uptime) && crashCount > 0) {
|
|
214
|
+
console.log(`[watchdog] ${Math.round(uptime / 60_000)}min clean uptime — ` +
|
|
215
|
+
`resetting short-window crash counter from ${crashCount} to 0 ` +
|
|
216
|
+
`(daily counter ${dailyCrashCount} stays)`);
|
|
208
217
|
crashCount = 0;
|
|
209
218
|
crashWindowStart = Date.now();
|
|
210
219
|
writeBeacon({
|
|
@@ -213,11 +222,16 @@ export function startWatchdog() {
|
|
|
213
222
|
bootTime,
|
|
214
223
|
crashCount,
|
|
215
224
|
crashWindowStart,
|
|
225
|
+
dailyCrashCount,
|
|
226
|
+
dailyCrashWindowStart,
|
|
216
227
|
version: BOT_VERSION,
|
|
217
228
|
});
|
|
218
229
|
}
|
|
219
|
-
},
|
|
220
|
-
console.log(`[watchdog] started — beacon every ${BEACON_INTERVAL_MS / 1000}s,
|
|
230
|
+
}, DEFAULTS.RESET_AFTER_MS);
|
|
231
|
+
console.log(`[watchdog] started — beacon every ${BEACON_INTERVAL_MS / 1000}s, ` +
|
|
232
|
+
`brake at ${DEFAULTS.SHORT_BRAKE_THRESHOLD} crashes / ${DEFAULTS.SHORT_WINDOW_MS / 60_000}min ` +
|
|
233
|
+
`or ${DEFAULTS.DAILY_BRAKE_THRESHOLD} / 24h, ` +
|
|
234
|
+
`recovery after ${DEFAULTS.RESET_AFTER_MS / 60_000}min uptime`);
|
|
221
235
|
}
|
|
222
236
|
/**
|
|
223
237
|
* Stop the watchdog cleanly. Called from the shutdown handler in
|
|
@@ -0,0 +1,109 @@
|
|
|
1
|
+
/**
|
|
2
|
+
* Console formatter — adds ISO timestamps to every console.log /
|
|
3
|
+
* console.warn / console.error call, and drops high-volume noise
|
|
4
|
+
* (libsignal session dumps, Claude CLI native-binary banner).
|
|
5
|
+
*
|
|
6
|
+
* Installed once at bootstrap time from src/index.ts. Idempotent.
|
|
7
|
+
*
|
|
8
|
+
* Why not pino / winston: those pull in several MB of deps and change
|
|
9
|
+
* the call-site ergonomics. Every caller in the bot today uses plain
|
|
10
|
+
* `console.log`; monkey-patching those is a 40-line change instead of
|
|
11
|
+
* a refactor of every file.
|
|
12
|
+
*/
|
|
13
|
+
import util from "node:util";
|
|
14
|
+
// Original console methods captured by installConsoleFormatter(); stays
// null while the formatter is not installed.
let snapshot = null;
/**
 * Start-of-line patterns for production log noise: output that bloats
 * out.log/err.log by tens of KB per day and carries no useful signal.
 * Keep this list short — anything listed here becomes impossible to grep for.
 */
const NOISE_PATTERNS = [
    // Header of a libsignal session dump; the multi-line body that follows
    // is handled by the brace-depth tracking in shouldSuppress().
    /^Closing session: SessionEntry \{/,
    // libsignal notice about swapping to an incoming prekey bundle
    /^Closing open session in favor of incoming prekey bundle/,
    // Claude CLI startup banner, printed once per query
    /^\[claude\] Native binary: /,
    // libsignal Bad MAC (session desync) — harmless and endlessly repeated
    /^Session error:Error: Bad MAC Error: Bad MAC/,
];
/**
 * Check a rendered log line against the noise list. Exported for testing.
 *
 * @param {string} line - Text to check; patterns are anchored at its start.
 * @returns {boolean} true when any noise pattern matches.
 */
export function isNoisyLine(line) {
    for (const pattern of NOISE_PATTERNS) {
        if (pattern.test(line)) {
            return true;
        }
    }
    return false;
}
|
|
35
|
+
/**
 * Brace depth of the libsignal multi-line dump currently being swallowed.
 * The dumps look like `Closing session: SessionEntry {` followed by several
 * lines of buffer hex, closing with `}`. While this is > 0, everything up to
 * the matching `}` line is dropped.
 */
let suppressDepth = 0;
/** Consecutive console calls swallowed as the body of the current dump. */
let suppressedBodyLines = 0;
/**
 * Safety valve: if a dump header is seen but its closing brace never
 * arrives, brace counting alone would swallow every later log line for the
 * rest of the process. After this many consecutive swallowed calls,
 * suppression is abandoned and normal logging resumes.
 */
const MAX_SUPPRESSED_BODY_LINES = 200;
/** Net brace balance (`{` count minus `}` count) of one rendered line. */
function braceBalance(line) {
    const opens = (line.match(/\{/g) || []).length;
    const closes = (line.match(/\}/g) || []).length;
    return opens - closes;
}
/**
 * Decide whether one rendered console message must be dropped.
 *
 * Stateful: updates the module-level dump-tracking counters.
 *
 * @param {string} raw - Fully rendered console message.
 * @returns {boolean} true when the message is noise (a listed noise line or
 *   part of a multi-line dump body) and must not be written.
 */
function shouldSuppress(raw) {
    const line = raw.trimEnd();
    if (suppressDepth > 0) {
        if (suppressedBodyLines < MAX_SUPPRESSED_BODY_LINES) {
            // Inside a multi-line dump — count braces on this line. The dumps
            // only contain ASCII braces in the structural positions, so this
            // is safe enough for production noise.
            suppressedBodyLines += 1;
            suppressDepth = Math.max(0, suppressDepth + braceBalance(line));
            if (suppressDepth === 0) {
                suppressedBodyLines = 0;
            }
            return true;
        }
        // The dump never closed: stop swallowing and treat this line normally.
        suppressDepth = 0;
        suppressedBodyLines = 0;
    }
    if (isNoisyLine(line)) {
        // If the noisy header opens a block, start suppressing its body.
        suppressDepth = Math.max(0, braceBalance(line));
        suppressedBodyLines = 0;
        return true;
    }
    return false;
}
|
|
65
|
+
/**
 * Build the replacement for a single console method.
 *
 * The returned function renders its arguments with renderArgs(), drops the
 * message when shouldSuppress() says it is noise, and otherwise writes one
 * `<ISO timestamp> <text>\n` record straight to `stream`.
 *
 * @param {Function} method - The original console method. Not called here.
 * @param {{write: Function}} stream - Destination for the formatted output.
 * @returns {Function} Variadic drop-in replacement; always returns undefined.
 */
function formatWithTimestamp(method, stream) {
    // The original method is accepted but never invoked by the replacement.
    void method;
    return (...args) => {
        const rendered = renderArgs(args);
        if (!shouldSuppress(rendered)) {
            const isoNow = new Date().toISOString();
            // Go to the stream directly; routing through console would recurse.
            stream.write(`${isoNow} ${rendered}\n`);
        }
    };
}
|
|
77
|
+
/**
 * Turn a console call's argument list into one string.
 *
 * Delegates to Node's util.format, so %s / %d placeholders and object
 * arguments are rendered by the same built-in formatter.
 *
 * @param {Array} args - Arguments as passed to the console method.
 * @returns {string} The formatted text.
 */
function renderArgs(args) {
    const rendered = util.format.apply(util, args);
    return rendered;
}
|
|
81
|
+
/**
 * Replace console.log / console.info / console.warn / console.error with
 * timestamping, noise-filtering wrappers.
 *
 * log and info write to process.stdout; warn and error write to
 * process.stderr. The originals are kept in `snapshot` so that
 * uninstallConsoleFormatter() can restore them. Calling this while already
 * installed does nothing.
 */
export function installConsoleFormatter() {
    if (snapshot) {
        // Already installed — leave the existing wrappers in place.
        return;
    }
    snapshot = {
        log: console.log.bind(console),
        warn: console.warn.bind(console),
        error: console.error.bind(console),
        info: console.info.bind(console),
    };
    const routing = [
        ["log", process.stdout],
        ["info", process.stdout],
        ["warn", process.stderr],
        ["error", process.stderr],
    ];
    for (const [methodName, target] of routing) {
        console[methodName] = formatWithTimestamp(snapshot[methodName], target);
    }
}
|
|
99
|
+
/**
 * Undo installConsoleFormatter(): restore the console methods saved in
 * `snapshot`, clear the snapshot, and reset the dump-suppression depth.
 * Does nothing when the formatter is not installed. Used by tests + shutdown.
 */
export function uninstallConsoleFormatter() {
    if (!snapshot) {
        return;
    }
    const { log, info, warn, error } = snapshot;
    Object.assign(console, { log, info, warn, error });
    snapshot = null;
    suppressDepth = 0;
}
|