alvin-bot 5.1.6 → 5.1.8
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/CHANGELOG.md +39 -0
- package/dist/services/cron-scheduling.js +34 -6
- package/dist/services/cron.js +98 -2
- package/dist/services/subagents.js +3 -1
- package/dist/services/watchdog.js +17 -0
- package/package.json +1 -1
package/CHANGELOG.md
CHANGED
|
@@ -2,6 +2,45 @@
|
|
|
2
2
|
|
|
3
3
|
All notable changes to Alvin Bot are documented here.
|
|
4
4
|
|
|
5
|
+
## [5.1.8] — 2026-05-17
|
|
6
|
+
|
|
7
|
+
### Interrupted jobs auto-resume after a controlled restart
|
|
8
|
+
|
|
9
|
+
If an auto-update, `/update` or `/restart` interrupted a job while it
|
|
10
|
+
was running, the bot used to give up on that run and ask you to
|
|
11
|
+
re-trigger it manually. Now, when the interruption was a controlled
|
|
12
|
+
restart (not a crash) and it happened within the last 15 minutes, the
|
|
13
|
+
job is re-run automatically on the next tick after the bot is back.
|
|
14
|
+
Crash-loops are deliberately excluded — a job that keeps crashing the
|
|
15
|
+
bot will never resume itself — and each interrupted run is resumed at
|
|
16
|
+
most once. You can opt a single job out with `autoResume: false`.
|
|
17
|
+
|
|
18
|
+
### Sub-agents report a result, not a play-by-play
|
|
19
|
+
|
|
20
|
+
Finished sub-agents (background tasks, cron jobs) no longer dump a long
|
|
21
|
+
step-by-step recap of how they thought and what tools they called. You
|
|
22
|
+
get the compact status line (success/failed · duration · tokens) plus
|
|
23
|
+
the actual result — nothing more. Cron reports still arrive in full;
|
|
24
|
+
only the meta-narration is gone, since the orchestrator processes the
|
|
25
|
+
real output anyway.
|
|
26
|
+
|
|
27
|
+
## [5.1.7] — 2026-05-17
|
|
28
|
+
|
|
29
|
+
### Scheduled jobs no longer run twice after a restart
|
|
30
|
+
|
|
31
|
+
If two bot instances were briefly alive at the same time — for example
|
|
32
|
+
right after an auto-update or a restart, while the old process was still
|
|
33
|
+
shutting down — a scheduled job could fire twice within the same minute.
|
|
34
|
+
One real case: a weekly report job sent its email, then sent an empty
|
|
35
|
+
duplicate 30 seconds later. The old overlap guard only worked inside a
|
|
36
|
+
single process, so a second instance never saw the first one's claim.
|
|
37
|
+
|
|
38
|
+
Jobs are now claimed with a small cross-process lock before they run, so
|
|
39
|
+
only one instance can execute a given job for a given slot. A crashed
|
|
40
|
+
run can't wedge the lock — it is reclaimed automatically once the owning
|
|
41
|
+
process is gone. Manual `/cron run` honours the same lock. No
|
|
42
|
+
configuration changes; existing jobs just stop double-firing.
|
|
43
|
+
|
|
5
44
|
## [5.1.6] — 2026-05-15
|
|
6
45
|
|
|
7
46
|
### Planned restarts really stop counting as crashes now
|
|
@@ -149,6 +149,15 @@ export function prepareForExecution(job, now) {
|
|
|
149
149
|
// ── Startup catch-up ───────────────────────────────────────
|
|
150
150
|
/** Default grace window for catching up an interrupted attempt on boot. */
|
|
151
151
|
export const DEFAULT_CATCHUP_GRACE_MS = 6 * 60 * 60 * 1000; // 6 h
|
|
152
|
+
/**
|
|
153
|
+
* Short window for *fast-resume*: when a controlled restart (auto-update,
|
|
154
|
+
* /update, /restart) interrupts a running job, re-run it immediately on
|
|
155
|
+
* the next boot instead of telling the user to re-trigger — but only if
|
|
156
|
+
* the interruption is this fresh. Deliberately minutes, not hours: a
|
|
157
|
+
* boot many hours later is the "surprise rerun" case slotAlreadyAttempted
|
|
158
|
+
* is designed to suppress; fast-resume is the immediate self-heal.
|
|
159
|
+
*/
|
|
160
|
+
export const DEFAULT_FAST_RESUME_MS = 15 * 60 * 1000; // 15 min
|
|
152
161
|
/**
|
|
153
162
|
* Returns true when the job's *current* schedule slot has already been
|
|
154
163
|
* attempted — i.e. the bot fired the job for this slot before the
|
|
@@ -195,7 +204,8 @@ function slotAlreadyAttempted(job, now) {
|
|
|
195
204
|
*
|
|
196
205
|
* PURE: returns a fresh array, never mutates the input.
|
|
197
206
|
*/
|
|
198
|
-
export function handleStartupCatchup(jobs, now, graceMs = DEFAULT_CATCHUP_GRACE_MS) {
|
|
207
|
+
export function handleStartupCatchup(jobs, now, graceMs = DEFAULT_CATCHUP_GRACE_MS, opts = {}) {
|
|
208
|
+
const fastResumeMs = opts.fastResumeMs ?? DEFAULT_FAST_RESUME_MS;
|
|
199
209
|
return jobs.map((job) => {
|
|
200
210
|
if (!job.enabled)
|
|
201
211
|
return job;
|
|
@@ -210,11 +220,29 @@ export function handleStartupCatchup(jobs, now, graceMs = DEFAULT_CATCHUP_GRACE_
|
|
|
210
220
|
return job; // clock weirdness — skip
|
|
211
221
|
if (ageMs > graceMs)
|
|
212
222
|
return job; // outside grace — give up
|
|
213
|
-
|
|
214
|
-
|
|
215
|
-
|
|
216
|
-
|
|
217
|
-
|
|
223
|
+
if (slotAlreadyAttempted(job, now)) {
|
|
224
|
+
// The slot was already attempted. Normally we leave it alone so the
|
|
225
|
+
// user doesn't get a surprise rerun hours later. EXCEPTION —
|
|
226
|
+
// fast-resume: a *controlled* restart interrupted it just minutes
|
|
227
|
+
// ago. Re-run immediately instead of "please re-trigger".
|
|
228
|
+
// • expectedRestart → never true after a crash, so a job that
|
|
229
|
+
// crashes the bot can't loop-resume itself.
|
|
230
|
+
// • autoResume === false → per-job opt-out.
|
|
231
|
+
// • fresh within fastResumeMs → not the "hours later" case.
|
|
232
|
+
// • lastFastResumeAttemptAt !== lastAttemptAt → resume a given
|
|
233
|
+
// interrupted attempt at most once.
|
|
234
|
+
const canFastResume = opts.expectedRestart === true &&
|
|
235
|
+
job.autoResume !== false &&
|
|
236
|
+
ageMs < fastResumeMs &&
|
|
237
|
+
job.lastFastResumeAttemptAt !== job.lastAttemptAt;
|
|
238
|
+
if (!canFastResume)
|
|
239
|
+
return job;
|
|
240
|
+
return {
|
|
241
|
+
...job,
|
|
242
|
+
nextRunAt: now,
|
|
243
|
+
lastFastResumeAttemptAt: job.lastAttemptAt,
|
|
244
|
+
};
|
|
245
|
+
}
|
|
218
246
|
// Within grace, never completed, and current slot hasn't been
|
|
219
247
|
// attempted yet → catch up on next tick.
|
|
220
248
|
return { ...job, nextRunAt: now };
|
package/dist/services/cron.js
CHANGED
|
@@ -11,10 +11,11 @@
|
|
|
11
11
|
*/
|
|
12
12
|
import fs from "fs";
|
|
13
13
|
import { execSync } from "child_process";
|
|
14
|
-
import { dirname } from "path";
|
|
14
|
+
import { resolve, dirname } from "path";
|
|
15
15
|
import { CRON_FILE, BOT_ROOT } from "../paths.js";
|
|
16
16
|
import { prepareForExecution, handleStartupCatchup, calculateNextRunFrom, } from "./cron-scheduling.js";
|
|
17
17
|
import { resolveJobByNameOrId } from "./cron-resolver.js";
|
|
18
|
+
import { bootWasExpectedRestart } from "./watchdog.js";
|
|
18
19
|
// ── Storage ─────────────────────────────────────────────
|
|
19
20
|
function loadJobs() {
|
|
20
21
|
try {
|
|
@@ -256,6 +257,85 @@ async function executeJob(job) {
|
|
|
256
257
|
// ── Scheduler Loop ──────────────────────────────────────
|
|
257
258
|
let schedulerTimer = null;
|
|
258
259
|
const runningJobs = new Set(); // Guard against overlapping executions
|
|
260
|
+
// ── Cross-process job lock ──────────────────────────────
|
|
261
|
+
//
|
|
262
|
+
// `runningJobs` only guards overlap WITHIN this process. If two bot
|
|
263
|
+
// instances are briefly alive at once (a launchd/pm2 restart that left
|
|
264
|
+
// the old process running, or startup-catchup racing the normal tick),
|
|
265
|
+
// each has its own in-memory Set and the same job can fire twice —
|
|
266
|
+
// observed in the wild: a weekly job mailed its report, then mailed an
|
|
267
|
+
// empty duplicate 30 s later. This atomic `mkdir` lock makes the claim
|
|
268
|
+
// cross-process: the second instance sees the lock and skips the slot
|
|
269
|
+
// instead of double-firing. Stale locks (owning PID gone, or — when the
|
|
270
|
+
// meta is unreadable — older than the catch-up grace) are reclaimed so a
|
|
271
|
+
// crash can never wedge a job forever. No deps, cross-platform.
|
|
272
|
+
const CRON_LOCK_DIR = resolve(dirname(CRON_FILE), ".cron-locks");
|
|
273
|
+
const CRON_LOCK_MAX_AGE_MS = 6 * 60 * 60 * 1000; // backstop for corrupt meta
|
|
274
|
+
/**
 * Map a job id to the path of its lock directory inside CRON_LOCK_DIR.
 *
 * The id is coerced to a string first so a non-string id (e.g. a numeric id)
 * cannot throw here; callers invoke this outside any try/catch. Every
 * character outside [A-Za-z0-9_-] is replaced with "_", which keeps path
 * separators and dots out of the file name, so the result always stays
 * directly inside CRON_LOCK_DIR.
 *
 * NOTE(review): ids that differ only in replaced characters ("a b" vs "a_b")
 * map to the same lock — confirm job ids are generated from the safe alphabet.
 *
 * @param {string|number} jobId - Job identifier.
 * @returns {string} Absolute path of the job's lock directory.
 */
function cronLockPath(jobId) {
    const safeName = String(jobId).replace(/[^A-Za-z0-9_-]/g, "_");
    return resolve(CRON_LOCK_DIR, `${safeName}.lock`);
}
|
|
277
|
+
/**
 * Decide whether an existing lock directory may be reclaimed.
 *
 * @param {string} lock - Path of the lock directory.
 * @param {string} metaPath - Path of the lock's `meta` file.
 * @returns {boolean} True when the lock is stale and may be taken over.
 */
function isJobLockStale(lock, metaPath) {
    let pid;
    let at;
    try {
        // Destructuring throws on `null`, so a literal "null" meta is handled
        // like any other corrupt meta (age fallback below).
        ({ pid, at } = JSON.parse(fs.readFileSync(metaPath, "utf-8")));
    }
    catch {
        // meta missing/corrupt → fall back to lock-dir age
        try {
            return Date.now() - fs.statSync(lock).mtimeMs > CRON_LOCK_MAX_AGE_MS;
        }
        catch {
            return false; // can't stat → treat as held (skip rather than double-fire)
        }
    }
    // process.kill() only accepts int32 pids, and pid <= 0 addresses process
    // groups rather than one process. Anything else is "no usable pid recorded".
    if (!Number.isInteger(pid) || pid <= 0 || pid > 0x7fffffff)
        return true;
    if (pid === process.pid) {
        // The liveness probe would always succeed against ourselves. A lock
        // stamped before this process started was left by an earlier
        // incarnation that happened to get the same pid (e.g. PID 1 in a
        // container) — without this check it could never be reclaimed.
        const startedAt = Date.now() - process.uptime() * 1000;
        return typeof at === "number" && at < startedAt;
    }
    try {
        process.kill(pid, 0); // same-host liveness probe (no signal sent)
        return false;
    }
    catch (e) {
        // ESRCH → owner gone. Anything else (e.g. EPERM) means it still exists.
        return e?.code === "ESRCH";
    }
}
/**
 * Try to claim the cross-process lock for a job.
 *
 * The lock is a directory created with an atomic `mkdir`; a best-effort
 * `meta` file inside records the owning pid and the claim time. An existing
 * lock is taken over only when isJobLockStale() says so.
 *
 * NOTE(review): the stale takeover (rm + mkdir) is not atomic — two instances
 * that both judge the same lock stale at the same moment could both end up
 * claiming it. The window is tiny, but confirm it is acceptable.
 *
 * @param {string} jobId - Job identifier.
 * @returns {boolean} True when this process now holds the lock, false when it
 *   is held elsewhere or could not be created.
 */
function acquireJobLock(jobId) {
    const lock = cronLockPath(jobId);
    const metaPath = resolve(lock, "meta");
    const writeMeta = () => {
        try {
            fs.writeFileSync(metaPath, JSON.stringify({ pid: process.pid, at: Date.now() }));
        }
        catch { /* meta is best-effort */ }
    };
    try {
        fs.mkdirSync(CRON_LOCK_DIR, { recursive: true });
    }
    catch { /* ignore — the mkdir below surfaces the real failure */ }
    try {
        fs.mkdirSync(lock); // atomic: throws EEXIST if another instance holds it
        writeMeta();
        return true;
    }
    catch (err) {
        if (err?.code !== "EEXIST") {
            // Not contention (permissions, read-only fs, missing parent…).
            // Still skip the run, but say why instead of failing silently.
            console.error(`Cron: cannot create job lock ${lock}: ${err?.message ?? err}`);
            return false;
        }
    }
    if (!isJobLockStale(lock, metaPath))
        return false;
    try {
        fs.rmSync(lock, { recursive: true, force: true });
        fs.mkdirSync(lock);
        writeMeta();
        return true;
    }
    catch {
        return false; // lost the takeover race or the fs refused — treat as held
    }
}
|
|
333
|
+
/**
 * Release the cross-process lock for a job.
 *
 * The lock directory is removed unless its `meta` file names a different,
 * valid pid: in that case another instance has taken the lock over in the
 * meantime and deleting it would let a third instance double-fire the job.
 * A missing or unreadable meta cannot prove foreign ownership, so the lock
 * is removed as before. Never throws.
 *
 * @param {string} jobId - Job identifier.
 * @returns {void}
 */
function releaseJobLock(jobId) {
    const lock = cronLockPath(jobId);
    try {
        const owner = JSON.parse(fs.readFileSync(resolve(lock, "meta"), "utf-8"))?.pid;
        if (Number.isInteger(owner) && owner > 0 && owner !== process.pid)
            return; // owned by another instance now — leave it alone
    }
    catch { /* meta missing/corrupt → fall through and remove */ }
    try {
        fs.rmSync(lock, { recursive: true, force: true });
    }
    catch { /* ignore */ }
}
|
|
259
339
|
export function startScheduler() {
|
|
260
340
|
if (schedulerTimer)
|
|
261
341
|
return;
|
|
@@ -264,7 +344,9 @@ export function startScheduler() {
|
|
|
264
344
|
// catch-up nextRunAt rewind is visible on the very next pass.
|
|
265
345
|
try {
|
|
266
346
|
const bootJobs = loadJobs();
|
|
267
|
-
const caught = handleStartupCatchup(bootJobs, Date.now()
|
|
347
|
+
const caught = handleStartupCatchup(bootJobs, Date.now(), undefined, {
|
|
348
|
+
expectedRestart: bootWasExpectedRestart(),
|
|
349
|
+
});
|
|
268
350
|
// Only persist if something actually changed to avoid needless writes
|
|
269
351
|
const mutated = caught.some((j, i) => j.nextRunAt !== bootJobs[i].nextRunAt);
|
|
270
352
|
if (mutated) {
|
|
@@ -301,6 +383,13 @@ export function startScheduler() {
|
|
|
301
383
|
// mid-execution, handleStartupCatchup will notice the attempt
|
|
302
384
|
// without completion and catch it up within the grace window.
|
|
303
385
|
runningJobs.add(job.id);
|
|
386
|
+
// Cross-process claim: if another bot instance already owns this
|
|
387
|
+
// slot, skip instead of double-firing (the duplicate-report bug).
|
|
388
|
+
if (!acquireJobLock(job.id)) {
|
|
389
|
+
runningJobs.delete(job.id);
|
|
390
|
+
console.log(`Cron: job "${job.name}" (${job.id}) already claimed by another instance — skipping to avoid double-fire`);
|
|
391
|
+
continue;
|
|
392
|
+
}
|
|
304
393
|
const prepared = prepareForExecution(job, now);
|
|
305
394
|
Object.assign(job, prepared);
|
|
306
395
|
saveJobs(jobs);
|
|
@@ -328,6 +417,7 @@ export function startScheduler() {
|
|
|
328
417
|
}
|
|
329
418
|
finally {
|
|
330
419
|
runningJobs.delete(job.id);
|
|
420
|
+
releaseJobLock(job.id);
|
|
331
421
|
}
|
|
332
422
|
continue; // Skip the outer changed/save since we save inside
|
|
333
423
|
}
|
|
@@ -422,6 +512,11 @@ export async function runJobNow(nameOrId) {
|
|
|
422
512
|
return { status: "already-running", job };
|
|
423
513
|
}
|
|
424
514
|
runningJobs.add(job.id);
|
|
515
|
+
// Cross-process: another bot instance may already be running this job.
|
|
516
|
+
if (!acquireJobLock(job.id)) {
|
|
517
|
+
runningJobs.delete(job.id);
|
|
518
|
+
return { status: "already-running", job };
|
|
519
|
+
}
|
|
425
520
|
try {
|
|
426
521
|
// executeJob catches its own errors and returns { output, error }.
|
|
427
522
|
// The inner try/catch here is a defensive belt against future
|
|
@@ -460,6 +555,7 @@ export async function runJobNow(nameOrId) {
|
|
|
460
555
|
}
|
|
461
556
|
finally {
|
|
462
557
|
runningJobs.delete(job.id);
|
|
558
|
+
releaseJobLock(job.id);
|
|
463
559
|
}
|
|
464
560
|
}
|
|
465
561
|
/**
|
|
@@ -286,7 +286,9 @@ async function runSubAgent(id, agentConfig, abort, resolvedName) {
|
|
|
286
286
|
const effectiveCwd = inheritCwd
|
|
287
287
|
? agentConfig.workingDir || os.homedir()
|
|
288
288
|
: os.homedir();
|
|
289
|
-
const systemPrompt = `You are a sub-agent named "${resolvedName}". Complete the following task autonomously
|
|
289
|
+
const systemPrompt = `You are a sub-agent named "${resolvedName}". Complete the following task autonomously. Working directory: ${effectiveCwd}
|
|
290
|
+
|
|
291
|
+
When done, return ONLY the final result/outcome, concisely. Do NOT narrate your intermediate steps, your reasoning, your tool calls, or a play-by-play of what you did — the orchestrator only needs the outcome (the answer, the report, the list, the artifact path), and on failure the error plus what was and wasn't done. No preamble, no "Here's what I did", no step-by-step recap. Run status, duration and token usage are reported separately, so don't restate them.`;
|
|
290
292
|
// v4.12.2 — Map the toolset preset to an explicit allowedTools list.
|
|
291
293
|
// The provider honors this override (see src/providers/claude-sdk-provider.ts
|
|
292
294
|
// line ~140). Passing undefined = full access (provider default).
|
|
@@ -39,6 +39,18 @@ const BEACON_INTERVAL_MS = 30_000; // write a beacon every 30 s
|
|
|
39
39
|
let beaconTimer = null;
|
|
40
40
|
let resetTimer = null;
|
|
41
41
|
let bootTime = 0;
|
|
42
|
+
/** Captured in startWatchdog(): did the previous process exit via a
|
|
43
|
+
* controlled restart? Read by the cron scheduler for fast-resume. */
|
|
44
|
+
let bootExpectedRestart = false;
|
|
45
|
+
/**
|
|
46
|
+
* True when this boot was preceded by a *controlled* restart
|
|
47
|
+
* (`markExpectedRestart` had set the beacon flag) rather than a crash.
|
|
48
|
+
* Returns false until startWatchdog() has run, and false after a crash —
|
|
49
|
+
* the safe default (no fast-resume) in both cases.
|
|
50
|
+
*/
|
|
51
|
+
export function bootWasExpectedRestart() {
    // Module-level flag: starts out false and is assigned in startWatchdog()
    // from the previous process's beacon, so it stays false until then.
    const previousExitWasControlled = bootExpectedRestart;
    return previousExitWasControlled;
}
|
|
42
54
|
function ensureStateDir() {
|
|
43
55
|
try {
|
|
44
56
|
fs.mkdirSync(STATE_DIR, { recursive: true });
|
|
@@ -155,6 +167,11 @@ export function startWatchdog() {
|
|
|
155
167
|
ensureStateDir();
|
|
156
168
|
bootTime = Date.now();
|
|
157
169
|
const previous = readBeacon();
|
|
170
|
+
// Capture whether the *previous* process exited via a controlled
|
|
171
|
+
// restart (markExpectedRestart set the flag) BEFORE writeBeacon below
|
|
172
|
+
// resets it. The cron scheduler uses this to fast-resume a job that a
|
|
173
|
+
// controlled restart interrupted, while never resuming after a crash.
|
|
174
|
+
bootExpectedRestart = previous?.expectedRestart === true;
|
|
158
175
|
const decision = decideBrakeAction(previous, bootTime);
|
|
159
176
|
if (decision.action === "brake") {
|
|
160
177
|
console.error(`[watchdog] crash-loop brake triggered: ${decision.reason}`);
|