alvin-bot 5.1.6 → 5.1.7
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/CHANGELOG.md +17 -0
- package/dist/services/cron.js +94 -1
- package/package.json +1 -1
package/CHANGELOG.md
CHANGED
|
@@ -2,6 +2,23 @@
|
|
|
2
2
|
|
|
3
3
|
All notable changes to Alvin Bot are documented here.
|
|
4
4
|
|
|
5
|
+
## [5.1.7] — 2026-05-17
|
|
6
|
+
|
|
7
|
+
### Scheduled jobs no longer run twice after a restart
|
|
8
|
+
|
|
9
|
+
If two bot instances were briefly alive at the same time — for example
|
|
10
|
+
right after an auto-update or a restart, while the old process was still
|
|
11
|
+
shutting down — a scheduled job could fire twice within the same minute.
|
|
12
|
+
One real case: a weekly report job sent its email, then sent an empty
|
|
13
|
+
duplicate 30 seconds later. The old overlap guard only worked inside a
|
|
14
|
+
single process, so a second instance never saw the first one's claim.
|
|
15
|
+
|
|
16
|
+
Jobs are now claimed with a small cross-process lock before they run, so
|
|
17
|
+
only one instance can execute a given job for a given slot. A crashed
|
|
18
|
+
run can't wedge the lock — it is reclaimed automatically once the owning
|
|
19
|
+
process is gone. Manual `/cron run` honours the same lock. No
|
|
20
|
+
configuration changes; existing jobs just stop double-firing.
|
|
21
|
+
|
|
5
22
|
## [5.1.6] — 2026-05-15
|
|
6
23
|
|
|
7
24
|
### Planned restarts really stop counting as crashes now
|
package/dist/services/cron.js
CHANGED
|
@@ -11,7 +11,7 @@
|
|
|
11
11
|
*/
|
|
12
12
|
import fs from "fs";
|
|
13
13
|
import { execSync } from "child_process";
|
|
14
|
-
import { dirname } from "path";
|
|
14
|
+
import { resolve, dirname } from "path";
|
|
15
15
|
import { CRON_FILE, BOT_ROOT } from "../paths.js";
|
|
16
16
|
import { prepareForExecution, handleStartupCatchup, calculateNextRunFrom, } from "./cron-scheduling.js";
|
|
17
17
|
import { resolveJobByNameOrId } from "./cron-resolver.js";
|
|
@@ -256,6 +256,85 @@ async function executeJob(job) {
|
|
|
256
256
|
// ── Scheduler Loop ──────────────────────────────────────
|
|
257
257
|
let schedulerTimer = null;
|
|
258
258
|
const runningJobs = new Set(); // Guard against overlapping executions
|
|
259
|
+
// ── Cross-process job lock ──────────────────────────────
|
|
260
|
+
//
|
|
261
|
+
// `runningJobs` only guards overlap WITHIN this process. If two bot
|
|
262
|
+
// instances are briefly alive at once (a launchd/pm2 restart that left
|
|
263
|
+
// the old process running, or startup-catchup racing the normal tick),
|
|
264
|
+
// each has its own in-memory Set and the same job can fire twice —
|
|
265
|
+
// observed in the wild: a weekly job mailed its report, then mailed an
|
|
266
|
+
// empty duplicate 30 s later. This atomic `mkdir` lock makes the claim
|
|
267
|
+
// cross-process: the second instance sees the lock and skips the slot
|
|
268
|
+
// instead of double-firing. Stale locks (owning PID gone, or — when the
|
|
269
|
+
// meta is unreadable — older than the catch-up grace) are reclaimed so a
|
|
270
|
+
// crash can never wedge a job forever. No deps, cross-platform.
|
|
271
|
+
// Directory holding one `<jobId>.lock` sub-directory per claimed job. It is
// created next to the cron file so every instance sharing that file also
// shares the locks.
const CRON_LOCK_DIR = resolve(dirname(CRON_FILE), ".cron-locks");
// Six hours in milliseconds. Only consulted when a lock's meta file cannot be
// read: a lock directory older than this is then considered abandoned.
const CRON_LOCK_MAX_AGE_MS = 6 * 60 * 60 * 1000; // backstop for corrupt meta
/**
 * Build the absolute path of the lock directory for a job.
 *
 * Every character outside `A-Z a-z 0-9 _ -` is replaced with `_` so the job
 * id can be used as a single path segment.
 *
 * @param {string} jobId - Identifier of the cron job.
 * @returns {string} Absolute path `<CRON_LOCK_DIR>/<sanitised id>.lock`.
 */
function cronLockPath(jobId) {
    const safeName = jobId.replace(/[^A-Za-z0-9_-]/g, "_");
    return resolve(CRON_LOCK_DIR, `${safeName}.lock`);
}
|
|
276
|
+
/**
 * Decide whether an existing lock directory has been abandoned by its owner.
 *
 * Rules, in order:
 *  - meta readable but without a usable pid (not a positive integer) → stale;
 *  - meta carries OUR pid but was written before this process started → stale
 *    (a previous incarnation with the same pid crashed, e.g. fixed pids in a
 *    container);
 *  - otherwise probe the pid with signal 0: only ESRCH ("no such process")
 *    means stale; EPERM or any other failure is treated as "still held";
 *  - meta missing/corrupt → stale only if the directory is older than
 *    CRON_LOCK_MAX_AGE_MS; if it cannot even be stat'ed, treat it as held.
 *
 * @param {string} lockDir - Path of the lock directory to inspect.
 * @returns {boolean} true when the lock may be reclaimed.
 */
function isCronLockStale(lockDir) {
    try {
        const meta = JSON.parse(fs.readFileSync(resolve(lockDir, "meta"), "utf-8"));
        if (!Number.isInteger(meta.pid) || meta.pid <= 0) {
            return true; // no usable pid recorded
        }
        // NOTE(review): relies on callers never re-acquiring a lock this
        // process still holds (they add the job to `runningJobs` first).
        const processStartedAt = Date.now() - process.uptime() * 1000;
        if (meta.pid === process.pid && typeof meta.at === "number" && meta.at < processStartedAt) {
            return true; // written by an earlier process that had our pid
        }
        try {
            process.kill(meta.pid, 0); // same-host liveness probe (no signal sent)
            return false;
        } catch (err) {
            return err.code === "ESRCH"; // owner gone
        }
    } catch {
        // meta missing/corrupt → fall back to lock-dir age
        try {
            return Date.now() - fs.statSync(lockDir).mtimeMs > CRON_LOCK_MAX_AGE_MS;
        } catch {
            return false; // can't stat → treat as held (skip rather than double-fire)
        }
    }
}

/**
 * Try to claim the cross-process lock for a job.
 *
 * The lock is a directory created with a non-recursive `mkdir`, which fails
 * with EEXIST when another holder already created it. A best-effort `meta`
 * file inside records `{ pid, at }` of the owner. A stale lock is reclaimed by
 * first renaming it aside: of several instances that all judged the lock
 * stale, only one can win that rename, so they cannot delete each other's
 * freshly created lock.
 *
 * @param {string} jobId - Identifier of the cron job to claim.
 * @returns {boolean} true when this process now owns the lock; false when it
 *   is held elsewhere or could not be created (fail-closed: skip rather than
 *   double-fire).
 */
function acquireJobLock(jobId) {
    const lock = cronLockPath(jobId);
    const writeMeta = () => {
        try {
            fs.writeFileSync(resolve(lock, "meta"), JSON.stringify({ pid: process.pid, at: Date.now() }));
        } catch { /* meta is best-effort */ }
    };
    const removeQuietly = (dir) => {
        try {
            fs.rmSync(dir, { recursive: true, force: true });
        } catch { /* a leftover directory is harmless */ }
    };
    try {
        fs.mkdirSync(CRON_LOCK_DIR, { recursive: true });
    } catch { /* ignore: the mkdir below surfaces real problems */ }
    try {
        fs.mkdirSync(lock); // atomic: throws EEXIST if another instance holds it
        writeMeta();
        return true;
    } catch (err) {
        if (err.code !== "EEXIST") {
            // Not contention: the lock could not be created at all. Stay
            // fail-closed, but say so instead of looking like a held lock.
            console.error(`Cron: cannot create job lock ${lock}: ${err.message}`);
            return false;
        }
    }
    if (!isCronLockStale(lock)) {
        return false;
    }
    // Reclaim. Move the stale directory aside atomically before deleting it.
    const tombstone = `${lock}.stale-${process.pid}-${Date.now()}`;
    let discard = lock; // fallback: delete in place if rename is unavailable
    try {
        fs.renameSync(lock, tombstone);
        discard = tombstone;
    } catch (err) {
        if (err.code === "ENOENT") {
            return false; // another reclaimer already moved it aside
        }
    }
    if (discard === tombstone && !isCronLockStale(tombstone)) {
        // We moved a lock that a faster instance re-created after our
        // staleness check — hand it back and back off.
        try {
            fs.renameSync(tombstone, lock);
        } catch {
            removeQuietly(tombstone);
        }
        return false;
    }
    try {
        fs.rmSync(discard, { recursive: true, force: true });
        fs.mkdirSync(lock); // EEXIST here means someone claimed the free slot first
    } catch {
        return false;
    }
    writeMeta();
    return true;
}
|
|
332
|
+
/**
 * Release the cross-process lock for a job, if this process still owns it.
 *
 * The lock directory is removed unless its `meta` file names a different pid:
 * in that case another instance has reclaimed the lock in the meantime and
 * deleting it would let a third instance double-fire the job. A missing or
 * unreadable meta file does not block removal, because meta is written
 * best-effort and may legitimately be absent on our own lock.
 *
 * Never throws; every failure is swallowed, since this runs in `finally`
 * blocks.
 *
 * @param {string} jobId - Identifier of the cron job whose lock to release.
 * @returns {void}
 */
function releaseJobLock(jobId) {
    try {
        const lock = cronLockPath(jobId);
        let ownedByOther = false;
        try {
            const meta = JSON.parse(fs.readFileSync(resolve(lock, "meta"), "utf-8"));
            ownedByOther = meta !== null && typeof meta.pid === "number" && meta.pid !== process.pid;
        } catch { /* meta missing/corrupt → assume the lock is ours */ }
        if (ownedByOther) {
            return; // someone else holds it now; leave their claim intact
        }
        fs.rmSync(lock, { recursive: true, force: true });
    } catch { /* ignore */ }
}
|
|
259
338
|
export function startScheduler() {
|
|
260
339
|
if (schedulerTimer)
|
|
261
340
|
return;
|
|
@@ -301,6 +380,13 @@ export function startScheduler() {
|
|
|
301
380
|
// mid-execution, handleStartupCatchup will notice the attempt
|
|
302
381
|
// without completion and catch up within the grace window.
|
|
303
382
|
runningJobs.add(job.id);
|
|
383
|
+
// Cross-process claim: if another bot instance already owns this
|
|
384
|
+
// slot, skip instead of double-firing (the duplicate-report bug).
|
|
385
|
+
if (!acquireJobLock(job.id)) {
|
|
386
|
+
runningJobs.delete(job.id);
|
|
387
|
+
console.log(`Cron: job "${job.name}" (${job.id}) already claimed by another instance — skipping to avoid double-fire`);
|
|
388
|
+
continue;
|
|
389
|
+
}
|
|
304
390
|
const prepared = prepareForExecution(job, now);
|
|
305
391
|
Object.assign(job, prepared);
|
|
306
392
|
saveJobs(jobs);
|
|
@@ -328,6 +414,7 @@ export function startScheduler() {
|
|
|
328
414
|
}
|
|
329
415
|
finally {
|
|
330
416
|
runningJobs.delete(job.id);
|
|
417
|
+
releaseJobLock(job.id);
|
|
331
418
|
}
|
|
332
419
|
continue; // Skip the outer changed/save since we save inside
|
|
333
420
|
}
|
|
@@ -422,6 +509,11 @@ export async function runJobNow(nameOrId) {
|
|
|
422
509
|
return { status: "already-running", job };
|
|
423
510
|
}
|
|
424
511
|
runningJobs.add(job.id);
|
|
512
|
+
// Cross-process: another bot instance may already be running this job.
|
|
513
|
+
if (!acquireJobLock(job.id)) {
|
|
514
|
+
runningJobs.delete(job.id);
|
|
515
|
+
return { status: "already-running", job };
|
|
516
|
+
}
|
|
425
517
|
try {
|
|
426
518
|
// executeJob catches its own errors and returns { output, error }.
|
|
427
519
|
// The inner try/catch here is a defensive belt against future
|
|
@@ -460,6 +552,7 @@ export async function runJobNow(nameOrId) {
|
|
|
460
552
|
}
|
|
461
553
|
finally {
|
|
462
554
|
runningJobs.delete(job.id);
|
|
555
|
+
releaseJobLock(job.id);
|
|
463
556
|
}
|
|
464
557
|
}
|
|
465
558
|
/**
|