alvin-bot 5.1.5 → 5.1.7

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
package/CHANGELOG.md CHANGED
@@ -2,6 +2,46 @@
2
2
 
3
3
  All notable changes to Alvin Bot are documented here.
4
4
 
5
+ ## [5.1.7] — 2026-05-17
6
+
7
+ ### Scheduled jobs no longer run twice after a restart
8
+
9
+ If two bot instances were briefly alive at the same time — for example
10
+ right after an auto-update or a restart, while the old process was still
11
+ shutting down — a scheduled job could fire twice within the same minute.
12
+ One real case: a weekly report job sent its email, then sent an empty
13
+ duplicate 30 seconds later. The old overlap guard only worked inside a
14
+ single process, so a second instance never saw the first one's claim.
15
+
16
+ Jobs are now claimed with a small cross-process lock before they run, so
17
+ only one instance can execute a given job for a given slot. A crashed
18
+ run can't wedge the lock — it is reclaimed automatically once the owning
19
+ process is gone. Manual `/cron run` honours the same lock. No
20
+ configuration changes; existing jobs just stop double-firing.
21
+
22
+ ## [5.1.6] — 2026-05-15
23
+
24
+ ### Planned restarts really stop counting as crashes now
25
+
26
+ v5.1.5 added a flag that marks self-updates and `/update` / `/restart`
27
+ as intentional so they don't inflate the crash counter. Half of it
28
+ didn't actually work: the code that reads the saved state back on the
29
+ next boot rebuilt it field by field and silently dropped that very
30
+ flag, so the crash detector never saw it and planned restarts were
31
+ still scored as crashes. (The other half of v5.1.5 — not counting
32
+ benign log lines as errors — was unaffected and has been working.)
33
+
34
+ This release makes the read path preserve the flag, so a planned
35
+ restart is now genuinely treated as a clean exit. The state-parsing
36
+ logic was pulled into a tested pure function so the read-back round
37
+ trip can't silently regress like this again.
38
+
39
+ ### What this means for you
40
+
41
+ If you updated to 5.1.5 and still saw the crash count tick up by one
42
+ each time the bot updated itself, that stops now. The error-trend
43
+ half of the 5.1.5 fix already worked; this completes the crash half.
44
+
5
45
  ## [5.1.5] — 2026-05-15
6
46
 
7
47
  ### Health monitor no longer cries wolf about its own log lines
@@ -11,7 +11,7 @@
11
11
  */
12
12
  import fs from "fs";
13
13
  import { execSync } from "child_process";
14
- import { dirname } from "path";
14
+ import { resolve, dirname } from "path";
15
15
  import { CRON_FILE, BOT_ROOT } from "../paths.js";
16
16
  import { prepareForExecution, handleStartupCatchup, calculateNextRunFrom, } from "./cron-scheduling.js";
17
17
  import { resolveJobByNameOrId } from "./cron-resolver.js";
@@ -256,6 +256,85 @@ async function executeJob(job) {
256
256
  // ── Scheduler Loop ──────────────────────────────────────
257
257
  let schedulerTimer = null;
258
258
  const runningJobs = new Set(); // Guard against overlapping executions
259
+ // ── Cross-process job lock ──────────────────────────────
260
+ //
261
+ // `runningJobs` only guards overlap WITHIN this process. If two bot
262
+ // instances are briefly alive at once (a launchd/pm2 restart that left
263
+ // the old process running, or startup-catchup racing the normal tick),
264
+ // each has its own in-memory Set and the same job can fire twice —
265
+ // observed in the wild: a weekly job mailed its report, then mailed an
266
+ // empty duplicate 30 s later. This atomic `mkdir` lock makes the claim
267
+ // cross-process: the second instance sees the lock and skips the slot
268
+ // instead of double-firing. Stale locks (owning PID gone, or — when the
269
+ // meta is unreadable — older than CRON_LOCK_MAX_AGE_MS) are reclaimed so a
270
+ // crash can never wedge a job forever. No deps, cross-platform.
271
// Directory that holds one lock sub-directory per job; it lives next to
// the cron file.
const CRON_LOCK_DIR = resolve(dirname(CRON_FILE), ".cron-locks");
// Age beyond which a lock whose meta file is missing or unparsable is
// considered abandoned.
const CRON_LOCK_MAX_AGE_MS = 6 * 60 * 60 * 1000; // backstop for corrupt meta
/**
 * Map a job id to the path of its lock directory inside CRON_LOCK_DIR.
 *
 * Every character outside `[A-Za-z0-9_-]` is replaced with `_` so the id
 * cannot introduce path separators or other special characters into the
 * file name. Note that ids differing only in replaced characters
 * (e.g. "a.b" and "a/b") therefore share one lock.
 *
 * @param {string} jobId Job identifier. Non-string values are coerced with
 *   `String()` instead of throwing, because acquireJobLock calls this
 *   outside of any try/catch.
 * @returns {string} Absolute path of the job's lock directory.
 */
function cronLockPath(jobId) {
    const safeName = String(jobId).replace(/[^A-Za-z0-9_-]/g, "_");
    return resolve(CRON_LOCK_DIR, `${safeName}.lock`);
}
276
/**
 * Decide whether an existing lock directory may be reclaimed.
 *
 * @param {string} lock Path of the lock directory.
 * @returns {boolean} true when the lock looks abandoned.
 */
function isJobLockStale(lock) {
    try {
        const meta = JSON.parse(fs.readFileSync(resolve(lock, "meta"), "utf-8"));
        // Reading `.pid` throws when the file holds JSON `null`; that lands in
        // the age fallback below, same as an unparsable file.
        const ownerPid = meta.pid;
        // process.kill() only probes a single process for a positive 32-bit
        // integer: 0 and negatives address process groups, and anything else
        // makes it throw a non-ESRCH error. Without this check such a value
        // would keep the lock "held" forever.
        const usablePid = Number.isInteger(ownerPid) && ownerPid > 0 && ownerPid <= 0x7fffffff;
        if (!usablePid)
            return true; // no usable pid recorded
        // A lock that names this very process is a leftover from an earlier
        // process that happened to have the same pid (typical in containers,
        // where the bot gets the same pid on every start). A liveness probe
        // would always succeed against ourselves.
        // NOTE(review): assumes callers consult the in-process `runningJobs`
        // guard before calling acquireJobLock — confirm for any new caller.
        if (ownerPid === process.pid)
            return true;
        try {
            process.kill(ownerPid, 0); // same-host liveness probe (no signal sent)
            return false; // owner is alive
        }
        catch (probeErr) {
            // ESRCH: no such process → owner gone. Any other error (e.g.
            // EPERM) is treated as "still alive".
            return probeErr.code === "ESRCH";
        }
    }
    catch {
        // meta missing/corrupt → fall back to lock-dir age
        try {
            return Date.now() - fs.statSync(lock).mtimeMs > CRON_LOCK_MAX_AGE_MS;
        }
        catch {
            return false; // can't stat → treat as held (skip rather than double-fire)
        }
    }
}
/**
 * Try to claim the cross-process lock for a job.
 *
 * The lock is a directory created with a non-recursive `mkdirSync`, which
 * throws when the directory already exists, so only one caller can create
 * it. A best-effort `meta` file inside records the owner's pid and the
 * claim time. An existing lock is taken over only when isJobLockStale()
 * says it is abandoned; otherwise it is treated as held.
 *
 * NOTE(review): reclaiming is `rmSync` followed by `mkdirSync`, i.e. two
 * separate steps — two processes reclaiming the same stale lock at the same
 * moment are not strictly serialised.
 *
 * @param {string} jobId Job identifier (passed to cronLockPath).
 * @returns {boolean} true when this process now owns the lock, false when
 *   it is held elsewhere or could not be created.
 */
function acquireJobLock(jobId) {
    const lock = cronLockPath(jobId);
    const writeMeta = () => {
        try {
            fs.writeFileSync(resolve(lock, "meta"), JSON.stringify({ pid: process.pid, at: Date.now() }));
        }
        catch { /* meta is best-effort */ }
    };
    try {
        fs.mkdirSync(CRON_LOCK_DIR, { recursive: true });
    }
    catch { /* ignore — the mkdir below surfaces any real problem */ }
    try {
        fs.mkdirSync(lock); // atomic: throws EEXIST if another instance holds it
        writeMeta();
        return true;
    }
    catch (err) {
        const code = err instanceof Error ? err.code : undefined;
        if (code !== "EEXIST") {
            // Not a contention case (e.g. unwritable directory). Callers log
            // a "claimed by another instance" message on false, so make the
            // real cause visible here.
            console.warn(`Cron: could not create job lock ${lock}: ${err instanceof Error ? err.message : String(err)}`);
        }
    }
    if (!isJobLockStale(lock))
        return false;
    try {
        fs.rmSync(lock, { recursive: true, force: true });
        fs.mkdirSync(lock);
        writeMeta();
        return true;
    }
    catch {
        // Lost the race to another reclaimer, or the filesystem refused.
        return false;
    }
}
332
/**
 * Release the cross-process lock for a job by removing its lock directory.
 *
 * Never throws. A lock that is already gone is not an error (`force: true`).
 * A removal that genuinely fails is logged rather than swallowed: the
 * directory that stays behind still names this live process as owner, so
 * later acquire attempts would keep treating the job as claimed.
 *
 * @param {string} jobId Job identifier (passed to cronLockPath).
 * @returns {void}
 */
function releaseJobLock(jobId) {
    try {
        fs.rmSync(cronLockPath(jobId), { recursive: true, force: true });
    }
    catch (err) {
        const reason = err instanceof Error ? err.message : String(err);
        console.warn(`Cron: could not release job lock for "${String(jobId)}": ${reason}`);
    }
}
259
338
  export function startScheduler() {
260
339
  if (schedulerTimer)
261
340
  return;
@@ -301,6 +380,13 @@ export function startScheduler() {
301
380
  // mid-execution, handleStartupCatchup will notice the attempt
302
381
  // without completion and catch up on it within the grace window.
303
382
  runningJobs.add(job.id);
383
+ // Cross-process claim: if another bot instance already owns this
384
+ // slot, skip instead of double-firing (the duplicate-report bug).
385
+ if (!acquireJobLock(job.id)) {
386
+ runningJobs.delete(job.id);
387
+ console.log(`Cron: job "${job.name}" (${job.id}) already claimed by another instance — skipping to avoid double-fire`);
388
+ continue;
389
+ }
304
390
  const prepared = prepareForExecution(job, now);
305
391
  Object.assign(job, prepared);
306
392
  saveJobs(jobs);
@@ -328,6 +414,7 @@ export function startScheduler() {
328
414
  }
329
415
  finally {
330
416
  runningJobs.delete(job.id);
417
+ releaseJobLock(job.id);
331
418
  }
332
419
  continue; // Skip the outer changed/save since we save inside
333
420
  }
@@ -422,6 +509,11 @@ export async function runJobNow(nameOrId) {
422
509
  return { status: "already-running", job };
423
510
  }
424
511
  runningJobs.add(job.id);
512
+ // Cross-process: another bot instance may already be running this job.
513
+ if (!acquireJobLock(job.id)) {
514
+ runningJobs.delete(job.id);
515
+ return { status: "already-running", job };
516
+ }
425
517
  try {
426
518
  // executeJob catches its own errors and returns { output, error }.
427
519
  // The inner try/catch here is a defensive belt against future
@@ -460,6 +552,7 @@ export async function runJobNow(nameOrId) {
460
552
  }
461
553
  finally {
462
554
  runningJobs.delete(job.id);
555
+ releaseJobLock(job.id);
463
556
  }
464
557
  }
465
558
  /**
@@ -27,6 +27,45 @@ export const DEFAULTS = {
27
27
  * crashes with ≥5 min gaps sailed right past the brake. 1 h is safer. */
28
28
  RESET_AFTER_MS: 60 * 60_000,
29
29
  };
30
/**
 * Validate + normalize a parsed beacon JSON into a BeaconData (or null
 * if the core fields are missing/wrong-typed). Pure so the read-path
 * field mapping is unit-testable — extracted after v5.1.5 shipped a
 * broken expectedRestart: the old readBeacon() rebuilt the object
 * field-by-field and silently dropped expectedRestart, so the flag
 * never reached decideBrakeAction and intentional restarts were still
 * counted as crashes. Whatever round-trips here is what the brake sees.
 *
 * Numeric fields must be finite: `typeof x === "number"` alone also
 * accepts NaN and ±Infinity (a corrupt file containing `1e999` parses to
 * Infinity), so such a beacon is rejected like any other malformed one.
 *
 * @param {*} parsed Result of JSON.parse on the beacon file.
 * @returns {object|null} A fresh object with exactly the known beacon
 *   fields, or null when `parsed` is falsy or a core field is invalid.
 */
export function normalizeBeacon(parsed) {
    if (!parsed)
        return null;
    const { lastBeat, pid, bootTime, crashCount, crashWindowStart, version } = parsed;
    const coreNumbers = [lastBeat, pid, bootTime, crashCount, crashWindowStart];
    const coreValid = coreNumbers.every((value) => Number.isFinite(value)) &&
        typeof version === "string";
    if (!coreValid)
        return null;
    return {
        lastBeat,
        pid,
        bootTime,
        crashCount,
        crashWindowStart,
        version,
        // Older beacons don't have daily-counter fields — default them to
        // 0/now so the brake logic treats this run as the start of the
        // first daily window.
        dailyCrashCount: Number.isFinite(parsed.dailyCrashCount) ? parsed.dailyCrashCount : 0,
        dailyCrashWindowStart: Number.isFinite(parsed.dailyCrashWindowStart)
            ? parsed.dailyCrashWindowStart
            : Date.now(),
        // The whole point of the v5.1.6 fix: propagate expectedRestart so
        // a planned restart is not scored as a crash on the next boot.
        // Only a literal `true` counts.
        expectedRestart: parsed.expectedRestart === true,
    };
}
30
69
  /**
31
70
  * Given the previous beacon (or null on first boot) and the current time,
32
71
  * decide whether the bot should proceed with boot or engage the crash-loop
@@ -29,7 +29,7 @@ import { execSync } from "child_process";
29
29
  import { BOT_VERSION } from "../version.js";
30
30
  import { emitCritical } from "./critical-notify.js";
31
31
  import { writeDiagnosticBundle } from "./auto-diagnostic.js";
32
- import { decideBrakeAction, shouldResetCrashCounter, DEFAULTS, } from "./watchdog-brake.js";
32
+ import { decideBrakeAction, shouldResetCrashCounter, normalizeBeacon, DEFAULTS, } from "./watchdog-brake.js";
33
33
  const DATA_DIR = process.env.ALVIN_DATA_DIR || resolve(os.homedir(), ".alvin-bot");
34
34
  const STATE_DIR = resolve(DATA_DIR, "state");
35
35
  const BEACON_FILE = resolve(STATE_DIR, "watchdog.json");
@@ -50,30 +50,7 @@ function ensureStateDir() {
50
50
  function readBeacon() {
51
51
  try {
52
52
  const raw = fs.readFileSync(BEACON_FILE, "utf-8");
53
- const parsed = JSON.parse(raw);
54
- if (typeof parsed.lastBeat === "number" &&
55
- typeof parsed.pid === "number" &&
56
- typeof parsed.bootTime === "number" &&
57
- typeof parsed.crashCount === "number" &&
58
- typeof parsed.crashWindowStart === "number" &&
59
- typeof parsed.version === "string") {
60
- // Older beacons don't have daily-counter fields — default them to
61
- // 0/now so the brake logic treats this run as the start of the
62
- // first daily window.
63
- return {
64
- lastBeat: parsed.lastBeat,
65
- pid: parsed.pid,
66
- bootTime: parsed.bootTime,
67
- crashCount: parsed.crashCount,
68
- crashWindowStart: parsed.crashWindowStart,
69
- version: parsed.version,
70
- dailyCrashCount: typeof parsed.dailyCrashCount === "number" ? parsed.dailyCrashCount : 0,
71
- dailyCrashWindowStart: typeof parsed.dailyCrashWindowStart === "number"
72
- ? parsed.dailyCrashWindowStart
73
- : Date.now(),
74
- };
75
- }
76
- return null;
53
+ return normalizeBeacon(JSON.parse(raw));
77
54
  }
78
55
  catch {
79
56
  return null;
package/package.json CHANGED
@@ -1,6 +1,6 @@
1
1
  {
2
2
  "name": "alvin-bot",
3
- "version": "5.1.5",
3
+ "version": "5.1.7",
4
4
  "description": "Alvin Bot — Your personal AI agent on Telegram, WhatsApp, Discord, Signal, and Web.",
5
5
  "type": "module",
6
6
  "main": "dist/index.js",