alvin-bot 5.1.5 → 5.1.7
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/CHANGELOG.md +40 -0
- package/dist/services/cron.js +94 -1
- package/dist/services/watchdog-brake.js +39 -0
- package/dist/services/watchdog.js +2 -25
- package/package.json +1 -1
package/CHANGELOG.md
CHANGED
|
@@ -2,6 +2,46 @@
|
|
|
2
2
|
|
|
3
3
|
All notable changes to Alvin Bot are documented here.
|
|
4
4
|
|
|
5
|
+
## [5.1.7] — 2026-05-17
|
|
6
|
+
|
|
7
|
+
### Scheduled jobs no longer run twice after a restart
|
|
8
|
+
|
|
9
|
+
If two bot instances were briefly alive at the same time — for example
|
|
10
|
+
right after an auto-update or a restart, while the old process was still
|
|
11
|
+
shutting down — a scheduled job could fire twice within the same minute.
|
|
12
|
+
One real case: a weekly report job sent its email, then sent an empty
|
|
13
|
+
duplicate 30 seconds later. The old overlap guard only worked inside a
|
|
14
|
+
single process, so a second instance never saw the first one's claim.
|
|
15
|
+
|
|
16
|
+
Jobs are now claimed with a small cross-process lock before they run, so
|
|
17
|
+
only one instance can execute a given job for a given slot. A crashed
|
|
18
|
+
run can't wedge the lock — it is reclaimed automatically once the owning
|
|
19
|
+
process is gone. Manual `/cron run` honours the same lock. No
|
|
20
|
+
configuration changes; existing jobs just stop double-firing.
|
|
21
|
+
|
|
22
|
+
## [5.1.6] — 2026-05-15
|
|
23
|
+
|
|
24
|
+
### Planned restarts really stop counting as crashes now
|
|
25
|
+
|
|
26
|
+
v5.1.5 added a flag that marks self-updates and `/update` / `/restart`
|
|
27
|
+
as intentional so they don't inflate the crash counter. Half of it
|
|
28
|
+
didn't actually work: the code that reads the saved state back on the
|
|
29
|
+
next boot rebuilt it field by field and silently dropped that very
|
|
30
|
+
flag, so the crash detector never saw it and planned restarts were
|
|
31
|
+
still scored as crashes. (The other half of v5.1.5 — not counting
|
|
32
|
+
benign log lines as errors — was unaffected and has been working.)
|
|
33
|
+
|
|
34
|
+
This release makes the read path preserve the flag, so a planned
|
|
35
|
+
restart is now genuinely treated as a clean exit. The state-parsing
|
|
36
|
+
logic was pulled into a tested pure function so the read-back round
|
|
37
|
+
trip can't silently regress like this again.
|
|
38
|
+
|
|
39
|
+
### What this means for you
|
|
40
|
+
|
|
41
|
+
If you updated to 5.1.5 and still saw the crash count tick up by one
|
|
42
|
+
each time the bot updated itself, that stops now. The error-trend
|
|
43
|
+
half of the 5.1.5 fix already worked; this completes the crash half.
|
|
44
|
+
|
|
5
45
|
## [5.1.5] — 2026-05-15
|
|
6
46
|
|
|
7
47
|
### Health monitor no longer cries wolf about its own log lines
|
package/dist/services/cron.js
CHANGED
|
@@ -11,7 +11,7 @@
|
|
|
11
11
|
*/
|
|
12
12
|
import fs from "fs";
|
|
13
13
|
import { execSync } from "child_process";
|
|
14
|
-
import { dirname } from "path";
|
|
14
|
+
import { resolve, dirname } from "path";
|
|
15
15
|
import { CRON_FILE, BOT_ROOT } from "../paths.js";
|
|
16
16
|
import { prepareForExecution, handleStartupCatchup, calculateNextRunFrom, } from "./cron-scheduling.js";
|
|
17
17
|
import { resolveJobByNameOrId } from "./cron-resolver.js";
|
|
@@ -256,6 +256,85 @@ async function executeJob(job) {
|
|
|
256
256
|
// ── Scheduler Loop ──────────────────────────────────────
|
|
257
257
|
let schedulerTimer = null;
|
|
258
258
|
const runningJobs = new Set(); // Guard against overlapping executions
|
|
259
|
+
// ── Cross-process job lock ──────────────────────────────
|
|
260
|
+
//
|
|
261
|
+
// `runningJobs` only guards overlap WITHIN this process. If two bot
|
|
262
|
+
// instances are briefly alive at once (a launchd/pm2 restart that left
|
|
263
|
+
// the old process running, or startup-catchup racing the normal tick),
|
|
264
|
+
// each has its own in-memory Set and the same job can fire twice —
|
|
265
|
+
// observed in the wild: a weekly job mailed its report, then mailed an
|
|
266
|
+
// empty duplicate 30 s later. This atomic `mkdir` lock makes the claim
|
|
267
|
+
// cross-process: the second instance sees the lock and skips the slot
|
|
268
|
+
// instead of double-firing. Stale locks (owning PID gone, or — when the
|
|
269
|
+
// meta is unreadable — older than the catch-up grace) are reclaimed so a
|
|
270
|
+
// crash can never wedge a job forever. No deps, cross-platform.
|
|
271
|
+
// Directory that holds one lock sub-directory per job; it lives next to the cron file.
const CRON_LOCK_DIR = resolve(dirname(CRON_FILE), ".cron-locks");
// Age (6 h) beyond which a lock whose meta cannot be read counts as abandoned.
const CRON_LOCK_MAX_AGE_MS = 6 * 60 * 60 * 1000;
/**
 * Build the path of the lock directory for a job.
 *
 * Every character of the id other than ASCII letters, digits, `_` and `-`
 * is replaced by `_`, so the id always maps to a single path segment
 * directly inside CRON_LOCK_DIR.
 *
 * @param {string} jobId - Job identifier.
 * @returns {string} Absolute path of `<CRON_LOCK_DIR>/<sanitized id>.lock`.
 */
function cronLockPath(jobId) {
    const safeId = jobId.replace(/[^A-Za-z0-9_-]/g, "_");
    const lockName = `${safeId}.lock`;
    return resolve(CRON_LOCK_DIR, lockName);
}
|
|
276
|
+
/**
 * Decide whether an existing lock directory has been abandoned and may be
 * reclaimed.
 *
 * @param {string} lock - Path of the job's lock directory.
 * @returns {boolean} true when the lock is stale, false when it must be
 *   treated as held.
 */
function isJobLockStale(lock) {
    let pid;
    try {
        pid = JSON.parse(fs.readFileSync(resolve(lock, "meta"), "utf-8")).pid;
    }
    catch {
        // meta missing/corrupt → fall back to lock-dir age. A lock that was
        // created a moment ago and has no meta yet is therefore still "held".
        try {
            return Date.now() - fs.statSync(lock).mtimeMs > CRON_LOCK_MAX_AGE_MS;
        }
        catch {
            return false; // can't stat → treat as held (skip rather than double-fire)
        }
    }
    // No usable pid recorded. 0 and negative values address process groups
    // and fractional values make process.kill throw a non-ESRCH error, so
    // none of them could ever be detected as "owner gone" by the probe below.
    if (!Number.isInteger(pid) || pid <= 0)
        return true;
    // The lock names our own PID. The visible call sites add the job to
    // `runningJobs` before calling acquireJobLock, so this process is not
    // expected to hold the lock at this point; the entry is then a leftover
    // from an earlier process that had the same PID (PIDs get recycled,
    // e.g. PID 1 in a container). Probing ourselves would always succeed
    // and wedge the job.
    // NOTE(review): relies on every caller guarding with `runningJobs` first.
    if (pid === process.pid)
        return true;
    try {
        process.kill(pid, 0); // same-host liveness probe (no signal sent)
        return false; // owner is alive
    }
    catch (e) {
        // ESRCH = no such process. Anything else (e.g. EPERM) means the
        // process exists or cannot be judged → keep treating the lock as held.
        return e.code === "ESRCH";
    }
}
/**
 * Try to claim the cross-process lock for a job.
 *
 * The claim is an atomic `mkdir` of the job's lock directory; a `meta` file
 * with the owner's pid and timestamp is written into it on a best-effort
 * basis. If the directory already exists, the lock is reclaimed only when
 * it is stale (see isJobLockStale).
 *
 * NOTE(review): two instances that both judge the same lock stale can still
 * race between the rmSync and the mkdirSync of the reclaim step.
 *
 * @param {string} jobId - Job identifier.
 * @returns {boolean} true if this process now owns the lock, false if it is
 *   held elsewhere or could not be created.
 */
function acquireJobLock(jobId) {
    const lock = cronLockPath(jobId);
    const claim = () => {
        fs.mkdirSync(lock); // atomic: throws EEXIST if another instance holds it
        try {
            fs.writeFileSync(resolve(lock, "meta"), JSON.stringify({ pid: process.pid, at: Date.now() }));
        }
        catch { /* meta is best-effort */ }
        return true;
    };
    try {
        fs.mkdirSync(CRON_LOCK_DIR, { recursive: true });
    }
    catch { /* ignore: the claim below fails if the directory is unusable */ }
    try {
        return claim();
    }
    catch {
        if (!isJobLockStale(lock))
            return false;
    }
    // Stale lock: remove it and claim again. If another instance wins the
    // re-creation, mkdirSync throws and we report the lock as held.
    try {
        fs.rmSync(lock, { recursive: true, force: true });
        return claim();
    }
    catch {
        return false;
    }
}
|
|
332
|
+
/**
 * Remove the lock directory of a job, together with anything inside it.
 *
 * Never throws: a lock that is already gone, or any other failure while
 * resolving or deleting the path, is silently ignored.
 *
 * @param {string} jobId - Job identifier.
 * @returns {void}
 */
function releaseJobLock(jobId) {
    const removeOptions = { recursive: true, force: true };
    try {
        const lockDir = cronLockPath(jobId);
        fs.rmSync(lockDir, removeOptions);
    }
    catch {
        // Deliberately swallowed: releasing is best-effort.
    }
}
|
|
259
338
|
export function startScheduler() {
|
|
260
339
|
if (schedulerTimer)
|
|
261
340
|
return;
|
|
@@ -301,6 +380,13 @@ export function startScheduler() {
|
|
|
301
380
|
// mid-execution, handleStartupCatchup will notice the attempt
|
|
302
381
|
// without completion and catch up within the grace window.
|
|
303
382
|
runningJobs.add(job.id);
|
|
383
|
+
// Cross-process claim: if another bot instance already owns this
|
|
384
|
+
// slot, skip instead of double-firing (the duplicate-report bug).
|
|
385
|
+
if (!acquireJobLock(job.id)) {
|
|
386
|
+
runningJobs.delete(job.id);
|
|
387
|
+
console.log(`Cron: job "${job.name}" (${job.id}) already claimed by another instance — skipping to avoid double-fire`);
|
|
388
|
+
continue;
|
|
389
|
+
}
|
|
304
390
|
const prepared = prepareForExecution(job, now);
|
|
305
391
|
Object.assign(job, prepared);
|
|
306
392
|
saveJobs(jobs);
|
|
@@ -328,6 +414,7 @@ export function startScheduler() {
|
|
|
328
414
|
}
|
|
329
415
|
finally {
|
|
330
416
|
runningJobs.delete(job.id);
|
|
417
|
+
releaseJobLock(job.id);
|
|
331
418
|
}
|
|
332
419
|
continue; // Skip the outer changed/save since we save inside
|
|
333
420
|
}
|
|
@@ -422,6 +509,11 @@ export async function runJobNow(nameOrId) {
|
|
|
422
509
|
return { status: "already-running", job };
|
|
423
510
|
}
|
|
424
511
|
runningJobs.add(job.id);
|
|
512
|
+
// Cross-process: another bot instance may already be running this job.
|
|
513
|
+
if (!acquireJobLock(job.id)) {
|
|
514
|
+
runningJobs.delete(job.id);
|
|
515
|
+
return { status: "already-running", job };
|
|
516
|
+
}
|
|
425
517
|
try {
|
|
426
518
|
// executeJob catches its own errors and returns { output, error }.
|
|
427
519
|
// The inner try/catch here is a defensive belt against future
|
|
@@ -460,6 +552,7 @@ export async function runJobNow(nameOrId) {
|
|
|
460
552
|
}
|
|
461
553
|
finally {
|
|
462
554
|
runningJobs.delete(job.id);
|
|
555
|
+
releaseJobLock(job.id);
|
|
463
556
|
}
|
|
464
557
|
}
|
|
465
558
|
/**
|
|
@@ -27,6 +27,45 @@ export const DEFAULTS = {
|
|
|
27
27
|
* crashes with ≥5 min gaps sailed right past the brake. 1 h is safer. */
|
|
28
28
|
RESET_AFTER_MS: 60 * 60_000,
|
|
29
29
|
};
|
|
30
|
+
/**
 * Turn a parsed beacon JSON value into a BeaconData object.
 *
 * Returns null when the value is falsy or when any core field has the wrong
 * type: `lastBeat`, `pid`, `bootTime`, `crashCount` and `crashWindowStart`
 * must be numbers and `version` must be a string. Unknown extra fields are
 * not copied.
 *
 * Kept pure so the read-path field mapping can be unit-tested. It was
 * extracted after v5.1.5, where readBeacon() rebuilt the object field by
 * field and dropped `expectedRestart`, so planned restarts were still
 * counted as crashes. Only what this function returns reaches the brake.
 *
 * @param {*} parsed - Result of JSON.parse on the beacon file.
 * @returns {object|null} The normalized beacon, or null if it is invalid.
 */
export function normalizeBeacon(parsed) {
    if (!parsed) {
        return null;
    }
    const numericCoreFields = ["lastBeat", "pid", "bootTime", "crashCount", "crashWindowStart"];
    const hasNumericCore = numericCoreFields.every((field) => typeof parsed[field] === "number");
    if (!hasNumericCore || typeof parsed.version !== "string") {
        return null;
    }
    const { lastBeat, pid, bootTime, crashCount, crashWindowStart, version } = parsed;
    // Beacons written by older versions lack the daily-counter fields; start
    // a fresh daily window (count 0, beginning now) for them.
    let dailyCrashCount = 0;
    if (typeof parsed.dailyCrashCount === "number") {
        dailyCrashCount = parsed.dailyCrashCount;
    }
    let dailyCrashWindowStart;
    if (typeof parsed.dailyCrashWindowStart === "number") {
        dailyCrashWindowStart = parsed.dailyCrashWindowStart;
    }
    else {
        dailyCrashWindowStart = Date.now();
    }
    // v5.1.6 fix: carry expectedRestart through (strictly `true` only) so a
    // planned restart is not scored as a crash on the next boot.
    const expectedRestart = parsed.expectedRestart === true;
    return {
        lastBeat,
        pid,
        bootTime,
        crashCount,
        crashWindowStart,
        version,
        dailyCrashCount,
        dailyCrashWindowStart,
        expectedRestart,
    };
}
|
|
30
69
|
/**
|
|
31
70
|
* Given the previous beacon (or null on first boot) and the current time,
|
|
32
71
|
* decide whether the bot should proceed with boot or engage the crash-loop
|
|
@@ -29,7 +29,7 @@ import { execSync } from "child_process";
|
|
|
29
29
|
import { BOT_VERSION } from "../version.js";
|
|
30
30
|
import { emitCritical } from "./critical-notify.js";
|
|
31
31
|
import { writeDiagnosticBundle } from "./auto-diagnostic.js";
|
|
32
|
-
import { decideBrakeAction, shouldResetCrashCounter, DEFAULTS, } from "./watchdog-brake.js";
|
|
32
|
+
import { decideBrakeAction, shouldResetCrashCounter, normalizeBeacon, DEFAULTS, } from "./watchdog-brake.js";
|
|
33
33
|
const DATA_DIR = process.env.ALVIN_DATA_DIR || resolve(os.homedir(), ".alvin-bot");
|
|
34
34
|
const STATE_DIR = resolve(DATA_DIR, "state");
|
|
35
35
|
const BEACON_FILE = resolve(STATE_DIR, "watchdog.json");
|
|
@@ -50,30 +50,7 @@ function ensureStateDir() {
|
|
|
50
50
|
function readBeacon() {
|
|
51
51
|
try {
|
|
52
52
|
const raw = fs.readFileSync(BEACON_FILE, "utf-8");
|
|
53
|
-
|
|
54
|
-
if (typeof parsed.lastBeat === "number" &&
|
|
55
|
-
typeof parsed.pid === "number" &&
|
|
56
|
-
typeof parsed.bootTime === "number" &&
|
|
57
|
-
typeof parsed.crashCount === "number" &&
|
|
58
|
-
typeof parsed.crashWindowStart === "number" &&
|
|
59
|
-
typeof parsed.version === "string") {
|
|
60
|
-
// Older beacons don't have daily-counter fields — default them to
|
|
61
|
-
// 0/now so the brake logic treats this run as the start of the
|
|
62
|
-
// first daily window.
|
|
63
|
-
return {
|
|
64
|
-
lastBeat: parsed.lastBeat,
|
|
65
|
-
pid: parsed.pid,
|
|
66
|
-
bootTime: parsed.bootTime,
|
|
67
|
-
crashCount: parsed.crashCount,
|
|
68
|
-
crashWindowStart: parsed.crashWindowStart,
|
|
69
|
-
version: parsed.version,
|
|
70
|
-
dailyCrashCount: typeof parsed.dailyCrashCount === "number" ? parsed.dailyCrashCount : 0,
|
|
71
|
-
dailyCrashWindowStart: typeof parsed.dailyCrashWindowStart === "number"
|
|
72
|
-
? parsed.dailyCrashWindowStart
|
|
73
|
-
: Date.now(),
|
|
74
|
-
};
|
|
75
|
-
}
|
|
76
|
-
return null;
|
|
53
|
+
return normalizeBeacon(JSON.parse(raw));
|
|
77
54
|
}
|
|
78
55
|
catch {
|
|
79
56
|
return null;
|