alvin-bot 4.8.9 → 4.9.1

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -0,0 +1,142 @@
1
+ /**
2
+ * Pure scheduling helpers for the cron service.
3
+ *
4
+ * Extracted from cron.ts so the startup-catchup and pre-execution state
5
+ * updates can be unit-tested without booting the full scheduler loop.
6
+ * This module is side-effect-free: it does not touch the filesystem, the
7
+ * clock, or the sub-agent registry. Give it jobs + a `now` value and it
8
+ * returns what the next state should look like.
9
+ *
10
+ * Background — see test/cron-restart-resilience.test.ts for the exact
11
+ * contract and the regression it closes.
12
+ */
13
+ // ── Pure parsers ────────────────────────────────────────────
14
+ //
15
+ // These mirror parseInterval / nextCronRun from cron.ts. We duplicate them
16
+ // intentionally instead of importing — cron.ts is the scheduler-with-side-
17
+ // effects, and importing it from a "pure" helper would reintroduce the
18
+ // circular dependency we just broke. The duplication is small and well
19
+ // covered by tests; keep the two in sync when editing.
20
/**
 * Parse a human-style interval such as "30s", "5 min", "1.5h" or "2 days"
 * into milliseconds.
 *
 * Accepted units (case-insensitive, optional trailing "s"):
 * s/sec, m/min, h/hr, d/day. Anything else — including cron expressions —
 * is not an interval and yields null so the caller can try the cron parser.
 *
 * @param {string} input - Schedule string to interpret.
 * @returns {number|null} Interval length in ms, or null when `input` is not
 *   a string, is not an interval, or describes a zero / non-finite length.
 */
function parseInterval(input) {
  // A missing or non-string schedule is "not an interval", not a crash.
  if (typeof input !== "string") {
    return null;
  }
  const match = /^(\d+(?:\.\d+)?)\s*(s|sec|m|min|h|hr|d|day)s?$/i.exec(input);
  if (!match) {
    return null;
  }
  // The regex only admits these unit spellings, so the lookup cannot miss.
  const unitMs = {
    s: 1000, sec: 1000,
    m: 60_000, min: 60_000,
    h: 3_600_000, hr: 3_600_000,
    d: 86_400_000, day: 86_400_000,
  };
  const ms = Number.parseFloat(match[1]) * unitMs[match[2].toLowerCase()];
  // A zero-length interval would mean "due again immediately"; report it as
  // unparseable instead of handing callers a falsy-but-numeric 0.
  return Number.isFinite(ms) && ms > 0 ? ms : null;
}
32
/**
 * Find the first whole minute strictly after `after` that satisfies a
 * five-field cron expression (minute, hour, day-of-month, month,
 * day-of-week), evaluated in the process's local time zone.
 *
 * All five fields must match at once: day-of-month and day-of-week are
 * AND-ed together.
 *
 * @param {string} expression - Five whitespace-separated cron fields.
 * @param {Date|number} after - Reference instant; it is copied, never mutated.
 * @returns {Date|null} The next matching minute, or null when the expression
 *   is malformed or nothing matches within the next 366 days.
 */
function nextCronRun(expression, after) {
  if (typeof expression !== "string") {
    return null;
  }
  const parts = expression.trim().split(/\s+/);
  if (parts.length !== 5) {
    return null;
  }

  // Expand one field into the Set of values it allows, or null when the
  // field is invalid. Items may be comma-separated and freely combined:
  //   "*"     every value             "a"      a single value
  //   "a-b"   inclusive range         "a-b/n"  every n-th value starting at a
  //   "a/n"   shorthand for a-max/n   "*" + "/n"  values divisible by n
  // NOTE: the wildcard step keeps the historical "value % n === 0" rule so
  // existing schedules do not shift; for 1-based fields (day, month) this
  // differs from classic cron, which counts from the field minimum.
  function parseField(expr, min, max) {
    const allowed = new Set();
    for (const item of expr.split(",")) {
      const pieces = item.split("/");
      if (pieces.length > 2) {
        return null;
      }
      const [rangeExpr, stepExpr] = pieces;

      let step = 1;
      if (stepExpr !== undefined) {
        if (!/^\d+$/.test(stepExpr)) {
          return null;
        }
        step = Number.parseInt(stepExpr, 10);
        // A zero step can never advance; reject it instead of matching nothing.
        if (step === 0) {
          return null;
        }
      }

      if (rangeExpr === "*") {
        for (let v = min; v <= max; v++) {
          if (v % step === 0) {
            allowed.add(v);
          }
        }
        continue;
      }

      const range = /^(\d+)(?:-(\d+))?$/.exec(rangeExpr);
      if (!range) {
        return null;
      }
      const low = Number.parseInt(range[1], 10);
      let high = low;
      if (range[2] !== undefined) {
        high = Number.parseInt(range[2], 10);
      } else if (stepExpr !== undefined) {
        high = max;
      }
      if (low < min || high > max || low > high) {
        return null;
      }
      for (let v = low; v <= high; v += step) {
        allowed.add(v);
      }
    }
    return allowed.size > 0 ? allowed : null;
  }

  const minutes = parseField(parts[0], 0, 59);
  const hours = parseField(parts[1], 0, 23);
  const days = parseField(parts[2], 1, 31);
  const months = parseField(parts[3], 1, 12);
  const weekdays = parseField(parts[4], 0, 6);
  // Fail fast: an unsatisfiable field would otherwise cost a full-year scan
  // only to arrive at the same null.
  if (!minutes || !hours || !days || !months || !weekdays) {
    return null;
  }

  const candidate = new Date(after);
  if (Number.isNaN(candidate.getTime())) {
    return null;
  }
  // Start at the next whole minute so the result is strictly after `after`.
  candidate.setSeconds(0, 0);
  candidate.setMinutes(candidate.getMinutes() + 1);

  // Minute-by-minute scan, capped at 366 days' worth of minutes.
  const maxSteps = 366 * 24 * 60;
  for (let i = 0; i < maxSteps; i++) {
    if (
      minutes.has(candidate.getMinutes()) &&
      hours.has(candidate.getHours()) &&
      days.has(candidate.getDate()) &&
      months.has(candidate.getMonth() + 1) && // getMonth() is 0-based
      weekdays.has(candidate.getDay())
    ) {
      return candidate;
    }
    candidate.setMinutes(candidate.getMinutes() + 1);
  }
  return null;
}
78
/**
 * Work out when `job` should next fire, measured from an explicit base
 * timestamp rather than from the job's own `lastRunAt`.
 *
 * The schedule is first tried as an interval (via parseInterval); if that
 * yields nothing truthy it is treated as a cron expression (via nextCronRun).
 *
 * @param {object} job - Job record; reads `enabled` and `schedule`.
 * @param {number} base - Epoch milliseconds to count from.
 * @returns {number|null} Epoch ms of the next run, or null when the job is
 *   disabled or the schedule produces no next run.
 */
export function calculateNextRunFrom(job, base) {
  if (job.enabled) {
    const everyMs = parseInterval(job.schedule);
    if (everyMs) {
      return base + everyMs;
    }
    const upcoming = nextCronRun(job.schedule, new Date(base));
    if (upcoming) {
      return upcoming.getTime();
    }
  }
  return null;
}
90
+ // ── Pre-execution state update ─────────────────────────────
91
/**
 * Build the "about to run" version of a job: stamp `lastAttemptAt` with
 * `now` and move `nextRunAt` forward to the next regular trigger as
 * computed by calculateNextRunFrom(job, now). The input job is left
 * untouched; a new object is returned.
 *
 * Why `nextRunAt` is advanced rather than set to null: if the bot crashes
 * between this call and the post-execution save, the next regular run is
 * still known and the scheduler simply won't re-trigger. The resulting
 * `lastAttemptAt > lastRunAt` asymmetry is the signal handleStartupCatchup
 * uses to catch up on the interrupted attempt at the next boot.
 *
 * @param {object} job - Job record to derive the new state from.
 * @param {number} now - Epoch milliseconds of the attempt.
 * @returns {object} Shallow copy of `job` with the two fields replaced.
 */
export function prepareForExecution(job, now) {
  const attempted = { ...job };
  attempted.lastAttemptAt = now;
  attempted.nextRunAt = calculateNextRunFrom(job, now);
  return attempted;
}
108
+ // ── Startup catch-up ───────────────────────────────────────
109
/**
 * How long after an interrupted attempt a boot-time catch-up is still
 * worthwhile when the caller does not supply its own window: six hours,
 * expressed in milliseconds.
 */
export const DEFAULT_CATCHUP_GRACE_MS = 6 * 3_600_000;
111
/**
 * Boot-time catch-up: for every enabled job whose latest attempt was never
 * followed by a completed run, and whose attempt is still inside the grace
 * window, pull `nextRunAt` back to `now` so the next scheduler tick picks
 * it up again. Jobs that did finish are left alone, so nothing fires twice.
 *
 * An attempt older than the grace window is considered too stale to repeat
 * (yesterday's "daily" run is not what the user wants this afternoon);
 * such jobs keep whatever `nextRunAt` they already had.
 *
 * The input array and its job objects are never mutated: changed jobs are
 * returned as copies, unchanged jobs as the same object reference.
 *
 * @param {object[]} jobs - Job records; reads `enabled`, `lastAttemptAt`
 *   and `lastRunAt`.
 * @param {number} now - Current time in epoch milliseconds.
 * @param {number} [graceMs=DEFAULT_CATCHUP_GRACE_MS] - Maximum age of an
 *   interrupted attempt that is still caught up.
 * @returns {object[]} New array, same length and order as `jobs`.
 */
export function handleStartupCatchup(jobs, now, graceMs = DEFAULT_CATCHUP_GRACE_MS) {
  const wasInterruptedRecently = (job) => {
    if (!job.enabled || !job.lastAttemptAt) {
      return false;
    }
    // A run counts as finished once lastRunAt has caught up with the attempt.
    const finished =
      typeof job.lastRunAt === "number" && job.lastRunAt >= job.lastAttemptAt;
    if (finished) {
      return false;
    }
    const elapsed = now - job.lastAttemptAt;
    const fromTheFuture = elapsed <= 0; // clock weirdness
    const tooStale = elapsed > graceMs; // outside the grace window
    return !fromTheFuture && !tooStale;
  };

  const result = [];
  for (const job of jobs) {
    result.push(wasInterruptedRecently(job) ? { ...job, nextRunAt: now } : job);
  }
  return result;
}
@@ -13,6 +13,8 @@ import fs from "fs";
13
13
  import { execSync } from "child_process";
14
14
  import { dirname } from "path";
15
15
  import { CRON_FILE, BOT_ROOT } from "../paths.js";
16
+ import { prepareForExecution, handleStartupCatchup, calculateNextRunFrom, } from "./cron-scheduling.js";
17
+ import { resolveJobByNameOrId } from "./cron-resolver.js";
16
18
  // ── Storage ─────────────────────────────────────────────
17
19
  function loadJobs() {
18
20
  try {
@@ -240,6 +242,25 @@ const runningJobs = new Set(); // Guard against overlapping executions
240
242
  export function startScheduler() {
241
243
  if (schedulerTimer)
242
244
  return;
245
+ // Startup catch-up — re-run jobs whose last attempt crashed within
246
+ // the grace window. Must run BEFORE the first scheduler tick so the
247
+ // catch-up nextRunAt rewind is visible on the very next pass.
248
+ try {
249
+ const bootJobs = loadJobs();
250
+ const caught = handleStartupCatchup(bootJobs, Date.now());
251
+ // Only persist if something actually changed to avoid needless writes
252
+ const mutated = caught.some((j, i) => j.nextRunAt !== bootJobs[i].nextRunAt);
253
+ if (mutated) {
254
+ saveJobs(caught);
255
+ const names = caught
256
+ .filter((j, i) => j.nextRunAt !== bootJobs[i].nextRunAt)
257
+ .map((j) => j.name);
258
+ console.log(`⏰ Cron startup catch-up: rewound ${names.length} job(s): ${names.join(", ")}`);
259
+ }
260
+ }
261
+ catch (err) {
262
+ console.error("⏰ Cron startup catch-up failed:", err);
263
+ }
243
264
  // Check every 30 seconds for due jobs
244
265
  schedulerTimer = setInterval(async () => {
245
266
  const jobs = loadJobs();
@@ -248,7 +269,7 @@ export function startScheduler() {
248
269
  for (const job of jobs) {
249
270
  if (!job.enabled)
250
271
  continue;
251
- // Skip if this job is already running
272
+ // Skip if this job is already running in THIS bot instance
252
273
  if (runningJobs.has(job.id))
253
274
  continue;
254
275
  // Calculate next run if not set
@@ -258,9 +279,13 @@ export function startScheduler() {
258
279
  }
259
280
  if (job.nextRunAt && now >= job.nextRunAt) {
260
281
  console.log(`Cron: Running job "${job.name}" (${job.id})`);
261
- // Mark as running + clear nextRunAt BEFORE async execution to prevent re-trigger
282
+ // Pre-execution state update: advance nextRunAt to the NEXT regular
283
+ // trigger (NOT null) and stamp lastAttemptAt. If the bot crashes
284
+ // mid-execution, handleStartupCatchup will notice the attempt
285
+ // without completion and re-run the job within the grace window.
262
286
  runningJobs.add(job.id);
263
- job.nextRunAt = null;
287
+ const prepared = prepareForExecution(job, now);
288
+ Object.assign(job, prepared);
264
289
  saveJobs(jobs);
265
290
  try {
266
291
  const result = await executeJob(job);
@@ -268,8 +293,8 @@ export function startScheduler() {
268
293
  const freshJobs = loadJobs();
269
294
  const freshJob = freshJobs.find(j => j.id === job.id);
270
295
  if (freshJob) {
271
- freshJob.lastRunAt = now;
272
- freshJob.lastResult = result.output.slice(0, 500);
296
+ freshJob.lastRunAt = Date.now();
297
+ freshJob.lastResult = result.output.slice(0, 4000);
273
298
  freshJob.lastError = result.error || null;
274
299
  freshJob.runCount++;
275
300
  if (freshJob.oneShot) {
@@ -277,7 +302,9 @@ export function startScheduler() {
277
302
  freshJob.nextRunAt = null;
278
303
  }
279
304
  else {
280
- freshJob.nextRunAt = calculateNextRun(freshJob);
305
+ // nextRunAt already set pre-execution, but recalculate in case
306
+ // the schedule or enabled state changed during execution.
307
+ freshJob.nextRunAt = calculateNextRunFrom(freshJob, Date.now());
281
308
  }
282
309
  saveJobs(freshJobs);
283
310
  }
@@ -365,11 +392,44 @@ export function toggleJob(id) {
365
392
  saveJobs(jobs);
366
393
  return job;
367
394
  }
368
- export function runJobNow(id) {
369
- const job = getJob(id);
395
+ /**
396
+ * Manual /cron run — resolves `nameOrId` against the job list, then
397
+ * executes the job while honouring the in-memory `runningJobs` guard
398
+ * so a simultaneous scheduler-trigger can't overlap.
399
+ */
400
+ export async function runJobNow(nameOrId) {
401
+ const job = resolveJobByNameOrId(loadJobs(), nameOrId);
370
402
  if (!job)
371
- return null;
372
- return executeJob(job);
403
+ return { status: "not-found" };
404
+ if (runningJobs.has(job.id)) {
405
+ return { status: "already-running", job };
406
+ }
407
+ runningJobs.add(job.id);
408
+ try {
409
+ const result = await executeJob(job);
410
+ // Persist the manual run the same way the scheduler does so the
411
+ // timeline stays honest: lastAttemptAt + lastRunAt + runCount bump.
412
+ try {
413
+ const freshJobs = loadJobs();
414
+ const freshJob = freshJobs.find((j) => j.id === job.id);
415
+ if (freshJob) {
416
+ const now = Date.now();
417
+ freshJob.lastAttemptAt = now;
418
+ freshJob.lastRunAt = now;
419
+ freshJob.lastResult = result.output.slice(0, 4000);
420
+ freshJob.lastError = result.error || null;
421
+ freshJob.runCount++;
422
+ saveJobs(freshJobs);
423
+ }
424
+ }
425
+ catch (err) {
426
+ console.error("[cron] failed to persist manual run state:", err);
427
+ }
428
+ return { status: "ran", job, output: result.output, error: result.error };
429
+ }
430
+ finally {
431
+ runningJobs.delete(job.id);
432
+ }
373
433
  }
374
434
  /**
375
435
  * Convert a cron expression or interval string to a human-readable German description.
@@ -21,6 +21,7 @@ import { resolve } from "path";
21
21
  import { SKILLS_DIR } from "../paths.js";
22
22
  import { USER_SKILLS_DIR } from "../paths.js";
23
23
  import { loadAssetIndex } from "./asset-index.js";
24
+ import { debounce } from "../util/debounce.js";
24
25
  // ── Skill Registry ──────────────────────────────────────
25
26
  let cachedSkills = [];
26
27
  let lastScanAt = 0;
@@ -143,23 +144,26 @@ function reloadAllSkills() {
143
144
  */
144
145
  export function loadSkills() {
145
146
  reloadAllSkills();
146
- // Hot-reload watchers
147
+ // Hot-reload watchers — macOS FSEvents delivers many duplicate events
148
+ // for a single logical change, so we coalesce bursts into one reload.
149
+ const bundledReload = debounce(() => {
150
+ console.log("Skills changed (bundled) \u2014 reloading");
151
+ reloadAllSkills();
152
+ }, 300);
153
+ const userReload = debounce(() => {
154
+ console.log("Skills changed (user) \u2014 reloading");
155
+ reloadAllSkills();
156
+ }, 300);
147
157
  try {
148
- watch(SKILLS_DIR, { recursive: true }, () => {
149
- console.log("Skills changed (bundled) \u2014 reloading");
150
- reloadAllSkills();
151
- });
158
+ watch(SKILLS_DIR, { recursive: true }, () => bundledReload());
152
159
  }
153
- catch { }
160
+ catch { /* ignore — watcher failures fall back to manual reload */ }
154
161
  try {
155
162
  if (existsSync(USER_SKILLS_DIR)) {
156
- watch(USER_SKILLS_DIR, { recursive: true }, () => {
157
- console.log("Skills changed (user) \u2014 reloading");
158
- reloadAllSkills();
159
- });
163
+ watch(USER_SKILLS_DIR, { recursive: true }, () => userReload());
160
164
  }
161
165
  }
162
- catch { }
166
+ catch { /* ignore */ }
163
167
  return cachedSkills;
164
168
  }
165
169
  /**
@@ -47,11 +47,17 @@ function statusIcon(status) {
47
47
  }
48
48
  }
49
49
  function buildBanner(info, result) {
50
- const icon = statusIcon(result.status);
50
+ // A "completed" run that produced zero output is almost always a
51
+ // silent failure — a truncated stream, a tool-only final turn, a
52
+ // provider that swallowed its response. Call that out explicitly so
53
+ // the user sees a clear signal instead of a green tick on nothing.
54
+ const truncated = result.status === "completed" && (!result.output || result.output.trim().length === 0);
55
+ const icon = truncated ? "⚠️" : statusIcon(result.status);
56
+ const statusLabel = truncated ? "completed · empty output" : result.status;
51
57
  const dur = formatDuration(result.duration);
52
58
  const ti = formatTokens(result.tokensUsed.input);
53
59
  const to = formatTokens(result.tokensUsed.output);
54
- return `${icon} *${info.name}* ${result.status} · ${dur} · ${ti} in / ${to} out`;
60
+ return `${icon} *${info.name}* ${statusLabel} · ${dur} · ${ti} in / ${to} out`;
55
61
  }
56
62
  // ── A4 Live-Stream ──────────────────────────────────────────
57
63
  /**
@@ -231,6 +231,13 @@ async function runSubAgent(id, agentConfig, abort, resolvedName) {
231
231
  console.error(`[subagent ${id}] live-stream init failed:`, err);
232
232
  }
233
233
  }
234
+ // These live OUTSIDE the try block so the catch handler can read
235
+ // whatever was buffered before the stream failed. Moving them into
236
+ // the try scope was the cause of the "output: ''" regression.
237
+ let finalText = "";
238
+ let inputTokens = 0;
239
+ let outputTokens = 0;
240
+ let streamError = null;
234
241
  try {
235
242
  const { getRegistry } = await import("../engine.js");
236
243
  const registry = getRegistry();
@@ -243,9 +250,6 @@ async function runSubAgent(id, agentConfig, abort, resolvedName) {
243
250
  ? agentConfig.workingDir || os.homedir()
244
251
  : os.homedir();
245
252
  const systemPrompt = `You are a sub-agent named "${resolvedName}". Complete the following task autonomously and report your results clearly when done. Working directory: ${effectiveCwd}`;
246
- let finalText = "";
247
- let inputTokens = 0;
248
- let outputTokens = 0;
249
253
  for await (const chunk of registry.queryWithFallback({
250
254
  prompt: agentConfig.prompt,
251
255
  systemPrompt,
@@ -254,16 +258,33 @@ async function runSubAgent(id, agentConfig, abort, resolvedName) {
254
258
  abortSignal: abort.signal,
255
259
  })) {
256
260
  if (chunk.type === "text") {
257
- finalText = chunk.text || "";
258
- // A4: push text updates into the throttled live-stream
261
+ // Both SDK providers emit `text` as the accumulated string.
262
+ // Keep the last non-empty one we've seen so a final tool-only
263
+ // turn doesn't wipe our buffer.
264
+ if (chunk.text && chunk.text.length > 0) {
265
+ finalText = chunk.text;
266
+ }
259
267
  if (liveStream && !liveStream.failed) {
260
268
  liveStream.update(finalText);
261
269
  }
262
270
  }
263
271
  if (chunk.type === "done") {
272
+ // done.text is the authoritative final accumulated text from
273
+ // the provider. Prefer it over the buffered value so runs that
274
+ // end on a tool_use don't leave us with a pre-tool snippet.
275
+ if (chunk.text && chunk.text.length > 0) {
276
+ finalText = chunk.text;
277
+ }
264
278
  inputTokens = chunk.inputTokens || 0;
265
279
  outputTokens = chunk.outputTokens || 0;
266
280
  }
281
+ if (chunk.type === "error") {
282
+ // Providers surface mid-stream errors as an `error` chunk
283
+ // instead of throwing. Capture the reason so the post-loop
284
+ // status resolution below can distinguish this from a clean
285
+ // finish, and keep whatever text we already buffered.
286
+ streamError = chunk.error || "stream error";
287
+ }
267
288
  }
268
289
  // If cancelAllSubAgents has already taken over (shutdown path), don't
269
290
  // overwrite the cancelled result it synthesised. Also: if the generator
@@ -285,6 +306,21 @@ async function runSubAgent(id, agentConfig, abort, resolvedName) {
285
306
  };
286
307
  entry.info.status = "cancelled";
287
308
  }
309
+ else if (streamError) {
310
+ // Provider emitted an error chunk but the generator ended cleanly —
311
+ // record it as an error, but preserve the text buffered before the
312
+ // failure so the caller sees useful partial output instead of "".
313
+ entry.result = {
314
+ id,
315
+ name: resolvedName,
316
+ status: "error",
317
+ output: finalText,
318
+ tokensUsed: { input: inputTokens, output: outputTokens },
319
+ duration: Date.now() - startTime,
320
+ error: streamError,
321
+ };
322
+ entry.info.status = "error";
323
+ }
288
324
  else {
289
325
  entry.result = {
290
326
  id,
@@ -312,6 +348,9 @@ async function runSubAgent(id, agentConfig, abort, resolvedName) {
312
348
  }
313
349
  }
314
350
  catch (err) {
351
+ // If cancelAllSubAgents already set a cancelled result, keep it.
352
+ if (entry.result && entry.result.status === "cancelled")
353
+ return;
315
354
  const isAbort = err instanceof Error && err.message.includes("abort");
316
355
  const isTimeout = abort.signal.aborted;
317
356
  const status = isTimeout
@@ -322,11 +361,13 @@ async function runSubAgent(id, agentConfig, abort, resolvedName) {
322
361
  entry.result = {
323
362
  id,
324
363
  name: resolvedName,
325
- status,
326
- output: "",
327
- tokensUsed: { input: 0, output: 0 },
364
+ // Preserve whatever text was buffered before the failure.
365
+ // Empty output here used to throw away multi-minute runs.
366
+ output: finalText,
367
+ tokensUsed: { input: inputTokens, output: outputTokens },
328
368
  duration: Date.now() - startTime,
329
369
  error: err instanceof Error ? err.message : String(err),
370
+ status,
330
371
  };
331
372
  entry.info.status = status;
332
373
  }
@@ -1,5 +1,6 @@
1
1
  import { config } from "../config.js";
2
2
  import { sanitizeTelegramMarkdown } from "./markdown.js";
3
+ import { isHarmlessTelegramError } from "../util/telegram-error-filter.js";
3
4
  export class TelegramStreamer {
4
5
  messageId = null;
5
6
  chatId;
@@ -94,9 +95,17 @@ export class TelegramStreamer {
94
95
  // If text fits in one message, just update the existing one
95
96
  if (safeText.length <= config.telegramMaxLength && this.messageId) {
96
97
  if (safeText !== this.lastSentText) {
97
- await this.api.editMessageText(this.chatId, this.messageId, safeText, {
98
- parse_mode: "Markdown",
99
- }).catch(() => this.api.editMessageText(this.chatId, this.messageId, safeText));
98
+ try {
99
+ await this.api.editMessageText(this.chatId, this.messageId, safeText, {
100
+ parse_mode: "Markdown",
101
+ }).catch(() => this.api.editMessageText(this.chatId, this.messageId, safeText));
102
+ }
103
+ catch (err) {
104
+ // Drop "message is not modified" / "message to edit not found"
105
+ // races silently — they're harmless and always race-based.
106
+ if (!isHarmlessTelegramError(err))
107
+ throw err;
108
+ }
100
109
  }
101
110
  return;
102
111
  }
@@ -0,0 +1,113 @@
1
+ /**
2
+ * Pure crash-loop brake logic, extracted from watchdog.ts so it can be
3
+ * unit-tested without touching the filesystem or launchctl.
4
+ *
5
+ * See test/watchdog-brake.test.ts for the regression this closes:
6
+ * chronic crashes with >5 min of uptime between them used to reset
7
+ * the counter before it could trip the brake, so the bot cycled
8
+ * indefinitely. The new policy enforces TWO thresholds — a fast
9
+ * short-window brake and a hard 24h daily cap — and only resets the
10
+ * counter after a real 1 h of clean uptime.
11
+ */
12
/** Default thresholds for the crash-loop brake; all durations are in ms. */
export const DEFAULTS = {
  // A beacon older than this means the previous process exited cleanly (or
  // the machine was rebooted), so the boot is not counted as a crash.
  STALE_BEACON_MS: 90 * 1000,

  // Short-window tracking: SHORT_BRAKE_THRESHOLD crashes inside
  // SHORT_WINDOW_MS trips the brake.
  SHORT_WINDOW_MS: 600_000,
  SHORT_BRAKE_THRESHOLD: 10,

  // Daily cap: a hard ceiling that applies no matter how the crashes are
  // spaced. Reaching it means the bot keeps restarting all day, which is
  // almost certainly a chronic issue worth freezing and alerting on.
  DAILY_WINDOW_MS: 86_400_000,
  DAILY_BRAKE_THRESHOLD: 20,

  // Clean uptime required before the short-window counter resets. The
  // earlier 5-minute value was shorter than a typical sub-agent lifetime
  // (the daily job-alert takes 10+ min), so chronic crashes spaced 5 or
  // more minutes apart slipped past the brake. One hour is safer.
  RESET_AFTER_MS: 3_600_000,
};
30
/**
 * Decide, from the previous heartbeat beacon and the current time, whether
 * the bot may boot or the crash-loop brake must engage.
 *
 * PURE: no fs, no launchctl, no clock — `now` is an explicit parameter.
 *
 * @param {object|null} previous - Beacon left by the previous process, or a
 *   falsy value on first boot. Reads `lastBeat`, `crashCount`,
 *   `crashWindowStart`, `dailyCrashCount` and `dailyCrashWindowStart`.
 * @param {number} now - Current time in epoch milliseconds.
 * @param {object} [opts] - Optional overrides: `staleBeaconMs`,
 *   `shortWindowMs`, `shortBrakeThreshold`, `dailyWindowMs`,
 *   `dailyBrakeThreshold`. Nullish entries fall back to DEFAULTS.
 * @returns {object} Either `{ action: "proceed", crashCount,
 *   crashWindowStart, dailyCrashCount, dailyCrashWindowStart }` carrying the
 *   counters for the new beacon, or `{ action: "brake", reason }`.
 */
export function decideBrakeAction(previous, now, opts = {}) {
  const staleMs = opts.staleBeaconMs ?? DEFAULTS.STALE_BEACON_MS;
  const shortWindow = opts.shortWindowMs ?? DEFAULTS.SHORT_WINDOW_MS;
  const shortBrake = opts.shortBrakeThreshold ?? DEFAULTS.SHORT_BRAKE_THRESHOLD;
  const dailyWindow = opts.dailyWindowMs ?? DEFAULTS.DAILY_WINDOW_MS;
  const dailyBrake = opts.dailyBrakeThreshold ?? DEFAULTS.DAILY_BRAKE_THRESHOLD;

  // First boot or no beacon file → clean start.
  if (!previous) {
    return {
      action: "proceed",
      crashCount: 0,
      crashWindowStart: now,
      dailyCrashCount: 0,
      dailyCrashWindowStart: now,
    };
  }

  // A beacon may lack some counters (e.g. one written before the daily cap
  // existed, or a partially written file). Arithmetic on the missing values
  // would yield NaN, which never compares >= a threshold and never lets a
  // window expire — so a missing or non-finite field is treated as
  // "no history" instead.
  const countOrZero = (value) => (Number.isFinite(value) ? value : 0);

  // Daily window roll-over first — it is independent of the short window.
  const hasDailyWindow = Number.isFinite(previous.dailyCrashWindowStart);
  let dailyStart = hasDailyWindow ? previous.dailyCrashWindowStart : now;
  let dailyCount = hasDailyWindow ? countOrZero(previous.dailyCrashCount) : 0;
  if (now - dailyStart >= dailyWindow) {
    dailyCount = 0;
    dailyStart = now;
  }

  // A fresh beacon means the previous process died moments ago (a crash).
  // A stale one means a clean exit or a machine reboot between runs; a
  // missing `lastBeat` makes the comparison false and is treated as stale.
  const previousExitedRecently = now - previous.lastBeat < staleMs;
  if (!previousExitedRecently) {
    // The short-window counter resets; the daily counter keeps going unless
    // its own window already expired above.
    return {
      action: "proceed",
      crashCount: 0,
      crashWindowStart: now,
      dailyCrashCount: dailyCount,
      dailyCrashWindowStart: dailyStart,
    };
  }

  // Short-window logic: continue the current window or open a new one.
  const hasShortWindow = Number.isFinite(previous.crashWindowStart);
  const shortWindowExpired =
    !hasShortWindow || now - previous.crashWindowStart >= shortWindow;
  const crashCount = shortWindowExpired ? 1 : countOrZero(previous.crashCount) + 1;
  const crashWindowStart = shortWindowExpired ? now : previous.crashWindowStart;

  // This boot counts as a crash for the daily tally as well.
  dailyCount += 1;

  if (crashCount >= shortBrake) {
    return {
      action: "brake",
      reason: `${crashCount} crashes within short window (${Math.round(shortWindow / 60_000)}min) — threshold is ${shortBrake}`,
    };
  }
  if (dailyCount >= dailyBrake) {
    return {
      action: "brake",
      reason: `${dailyCount} crashes within daily window (${Math.round(dailyWindow / 3_600_000)}h) — threshold is ${dailyBrake}`,
    };
  }
  return {
    action: "proceed",
    crashCount,
    crashWindowStart,
    dailyCrashCount: dailyCount,
    dailyCrashWindowStart: dailyStart,
  };
}
108
/**
 * Tell whether the process has been up long enough for the short-window
 * crash counter to be cleared.
 *
 * @param {number} uptimeMs - Clean uptime so far, in milliseconds.
 * @param {object} [opts] - Optional override: `resetAfterMs`. When nullish,
 *   DEFAULTS.RESET_AFTER_MS is used.
 * @returns {boolean} True once `uptimeMs` has reached the threshold.
 */
export function shouldResetCrashCounter(uptimeMs, opts = {}) {
  const { resetAfterMs } = opts;
  const requiredUptime = resetAfterMs ?? DEFAULTS.RESET_AFTER_MS;
  return !(uptimeMs < requiredUptime) && uptimeMs >= requiredUptime;
}