clementine-agent 1.0.30 → 1.0.31

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -65,4 +65,22 @@ export declare function computeBrokenJobs(now?: number): BrokenJob[];
65
65
  * Returns the jobs that triggered a fresh notification (mostly for tests/logs).
66
66
  */
67
67
  export declare function runFailureSweep(send: (text: string) => Promise<unknown>, gateway?: import('./router.js').Gateway, now?: number): Promise<BrokenJob[]>;
68
+ export interface StaleCronJob {
69
+ jobName: string;
70
+ agentSlug?: string;
71
+ schedule: string; // schedule string exactly as it appears on the job (not normalized)
72
+ /** ISO timestamp of the job's most recent run, or null if it has never run. */
73
+ lastRunAt: string | null;
74
+ /** Expected time between consecutive scheduled runs, in ms. */
75
+ expectedIntervalMs: number;
76
+ /** For a job with a recorded last run: minutes elapsed beyond one expected interval after that run (i.e. time since the first missed tick), rounded. */
77
+ overdueMinutes: number;
78
+ }
79
+ /**
80
+ * Walk enabled cron jobs and find ones that haven't run in at least
81
+ * SLA_MISSED_TICKS × their expected interval. Distinct from the broken-job
82
+ * detector which needs actual error entries — this catches the opposite
83
+ * failure mode: a job that silently stopped firing at all.
+ *
+ * The staleness threshold is never shorter than a fixed floor
+ * (SLA_MIN_OVERDUE_MS, 30 minutes in the implementation), so very frequent
+ * schedules are not flagged after only a few missed ticks.
+ *
+ * @param now Epoch milliseconds to evaluate staleness against; the
+ * implementation defaults it to Date.now().
+ * @returns A promise resolving to one StaleCronJob record per stale job.
84
+ */
85
+ export declare function computeStaleCronJobs(now?: number): Promise<StaleCronJob[]>;
68
86
  //# sourceMappingURL=failure-monitor.d.ts.map
@@ -20,6 +20,7 @@ import path from 'node:path';
20
20
  import Database from 'better-sqlite3';
21
21
  import pino from 'pino';
22
22
  import { BASE_DIR, MEMORY_DB_PATH } from '../config.js';
23
+ import { logAuditJsonl } from '../agent/hooks.js';
23
24
  const logger = pino({ name: 'clementine.failure-monitor' });
24
25
  const RUNS_DIR = path.join(BASE_DIR, 'cron', 'runs');
25
26
  const ADVISOR_EVENTS_FILE = path.join(BASE_DIR, 'cron', 'advisor-events.jsonl');
@@ -40,12 +41,15 @@ const NOTIFY_COOLDOWN_HOURS = 24;
40
41
  function loadState() {
41
42
  try {
42
43
  if (!existsSync(STATE_FILE))
43
- return { notified: {} };
44
+ return { notified: {}, staleNotified: {} };
44
45
  const raw = JSON.parse(readFileSync(STATE_FILE, 'utf-8'));
45
- return { notified: raw.notified ?? {} };
46
+ return {
47
+ notified: raw.notified ?? {},
48
+ staleNotified: raw.staleNotified ?? {},
49
+ };
46
50
  }
47
51
  catch {
48
- return { notified: {} };
52
+ return { notified: {}, staleNotified: {} };
49
53
  }
50
54
  }
51
55
  function saveState(state) {
@@ -470,6 +474,15 @@ export async function runFailureSweep(send, gateway, now = Date.now()) {
470
474
  logger.warn({ err }, 'Suspicious-run grading pre-pass failed (non-fatal)');
471
475
  }
472
476
  }
477
+ // SLA sweep runs unconditionally — catches jobs that silently stopped
478
+ // firing entirely, which the broken-job detector can't see because those
479
+ // jobs produce no error entries.
480
+ try {
481
+ await runSlaSweep(send, now);
482
+ }
483
+ catch (err) {
484
+ logger.warn({ err }, 'SLA sweep failed (non-fatal)');
485
+ }
473
486
  const broken = computeBrokenJobs(now);
474
487
  if (broken.length === 0) {
475
488
  // Clear cooldowns AND diagnostic cache entries for jobs that recovered.
@@ -545,6 +558,67 @@ export async function runFailureSweep(send, gateway, now = Date.now()) {
545
558
  }
546
559
  return fresh;
547
560
  }
561
/**
 * Detect jobs that have silently stopped running and notify the owner.
 *
 * Steps, in order:
 *  1. Ask computeStaleCronJobs(now) for the currently stale jobs.
 *  2. Emit one `cron_sla_breach` audit event per stale job via logAuditJsonl,
 *     regardless of notification cooldown, so the trace record is complete.
 *  3. Drop cooldown entries for every job that is no longer stale. This runs
 *     on every sweep, not only when nothing is stale; otherwise a job that
 *     recovers and then breaks again while some *other* job stays stale would
 *     have its fresh breach suppressed by the old cooldown.
 *  4. Send a single report covering the stale jobs that are outside their
 *     SLA_NOTIFY_COOLDOWN_HOURS window, then record the notification time.
 *
 * A failed send is logged and swallowed; the cooldown is not stamped in that
 * case, so the next sweep retries.
 *
 * @param send Async callback that delivers the report text to the owner.
 * @param now Epoch milliseconds treated as the current time.
 */
async function runSlaSweep(send, now) {
    const stale = await computeStaleCronJobs(now);
    // Emit audit events for every stale detection so downstream
    // tooling (dashboards, alerts) has a complete record.
    for (const job of stale) {
        logAuditJsonl({
            event_type: 'cron_sla_breach',
            jobName: job.jobName,
            agent_slug: job.agentSlug,
            schedule: job.schedule,
            lastRunAt: job.lastRunAt,
            expectedIntervalMs: job.expectedIntervalMs,
            overdueMinutes: job.overdueMinutes,
        });
    }
    // Clear cooldowns for jobs that recovered (are no longer stale).
    const state = loadState();
    const staleNames = new Set(stale.map(job => job.jobName));
    let pruned = false;
    for (const name of Object.keys(state.staleNotified)) {
        if (!staleNames.has(name)) {
            delete state.staleNotified[name];
            pruned = true;
        }
    }
    if (pruned)
        saveState(state);
    if (stale.length === 0)
        return;
    // Apply per-job notify cooldown so we don't spam.
    const cooldownMs = SLA_NOTIFY_COOLDOWN_HOURS * 60 * 60 * 1000;
    const fresh = stale.filter(job => {
        const prev = state.staleNotified[job.jobName];
        const coolingDown = prev && now - Date.parse(prev.lastNotifiedAt) < cooldownMs;
        return !coolingDown;
    });
    if (fresh.length === 0)
        return;
    try {
        await send(formatStaleReport(fresh));
        // Stamp only after a successful send so a delivery failure is retried.
        const stamp = new Date(now).toISOString();
        for (const job of fresh) {
            state.staleNotified[job.jobName] = { lastNotifiedAt: stamp, lastRunAt: job.lastRunAt };
        }
        saveState(state);
        appendAuditLog('sla_notified', fresh.map(j => j.jobName));
        logger.info({ count: fresh.length, jobs: fresh.map(j => j.jobName) }, 'SLA monitor: notified owner about stale jobs');
    }
    catch (err) {
        logger.warn({ err }, 'SLA monitor: notification dispatch failed');
    }
}
548
622
  function appendAuditLog(action, jobNames) {
549
623
  try {
550
624
  const auditPath = path.join(BASE_DIR, 'cron', 'failure-monitor.log');
@@ -606,4 +680,73 @@ async function loadJobPrompt(jobName) {
606
680
  return null;
607
681
  }
608
682
  }
683
+ /** Staleness multiplier: a job is flagged once the time since its last run exceeds this many expected intervals (subject to the SLA_MIN_OVERDUE_MS floor). */
684
+ const SLA_MISSED_TICKS = 3;
685
+ /** Minimum staleness threshold (30 minutes). Combined with the tick-based threshold via Math.max so sub-hourly cron jobs don't alert too aggressively. */
686
+ const SLA_MIN_OVERDUE_MS = 30 * 60 * 1000;
687
+ /** Cooldown in hours: don't re-DM the owner about the same stale job within this window. */
688
+ const SLA_NOTIFY_COOLDOWN_HOURS = 12;
689
/**
 * Walk enabled cron jobs and find ones whose most recent run is older than
 * max(SLA_MISSED_TICKS × expected interval, SLA_MIN_OVERDUE_MS). Distinct
 * from the broken-job detector which needs actual error entries — this
 * catches the opposite failure mode: a job that silently stopped firing.
 *
 * The expected interval is the gap between the scheduled ticks immediately
 * after and at-or-before `now`, so it is evaluated relative to the supplied
 * `now` rather than the wall clock.
 *
 * Jobs with no usable last-run timestamp (never run, or an unparseable
 * `startedAt`) are always reported. Their `overdueMinutes` is measured from
 * the most recent scheduled tick instead of from the Unix epoch, which would
 * otherwise produce a meaningless multi-decade figure.
 * NOTE(review): a freshly created job is therefore flagged before its first
 * tick's grace period elapses — confirm this is intended.
 *
 * @param now Epoch milliseconds treated as the current time; defaults to Date.now().
 * @returns One record per stale job, in the order the jobs were parsed.
 */
export async function computeStaleCronJobs(now = Date.now()) {
    const { parseCronJobs, parseAgentCronJobs, CronRunLog } = await import('./cron-scheduler.js');
    const { AGENTS_DIR } = await import('../config.js');
    const cronParser = await import('cron-parser');
    const jobs = [...parseCronJobs(), ...parseAgentCronJobs(AGENTS_DIR)];
    const runLog = new CronRunLog();
    const stale = [];
    for (const job of jobs) {
        if (!job.enabled)
            continue;
        if (job.mode === 'unleashed')
            continue; // one-shot, no recurring SLA
        // Normalize schedule: a 6-field expression carries a leading seconds
        // field, which is dropped so the interval is computed at minute
        // granularity.
        const fields = job.schedule.trim().split(/\s+/);
        const expr = fields.length === 6 ? fields.slice(1).join(' ') : job.schedule;
        let intervalMs;
        let prevTickMs;
        try {
            // Anchor the iterator at `now` so injected timestamps give
            // deterministic results.
            const parser = cronParser.CronExpressionParser.parse(expr, { currentDate: new Date(now) });
            const nextTickMs = parser.next().toDate().getTime();
            // prev() steps back from the tick just returned by next().
            prevTickMs = parser.prev().toDate().getTime();
            intervalMs = nextTickMs - prevTickMs;
        }
        catch {
            continue; // malformed schedule — separate concern
        }
        if (!(intervalMs > 0))
            continue;
        const recent = runLog.readRecent(job.name, 1);
        const lastRunAt = recent[0]?.startedAt ?? null;
        const lastRunMs = lastRunAt ? Date.parse(lastRunAt) : NaN;
        const threshold = Math.max(intervalMs * SLA_MISSED_TICKS, SLA_MIN_OVERDUE_MS);
        let overdueMs;
        if (Number.isFinite(lastRunMs)) {
            const sinceLastRun = now - lastRunMs;
            if (sinceLastRun <= threshold)
                continue;
            // Time past the first tick the job should have hit after its last run.
            overdueMs = sinceLastRun - intervalMs;
        }
        else {
            // No anchor to measure from; fall back to the latest scheduled tick.
            overdueMs = Math.max(0, now - prevTickMs);
        }
        stale.push({
            jobName: job.name,
            agentSlug: job.agentSlug,
            schedule: job.schedule,
            lastRunAt,
            expectedIntervalMs: intervalMs,
            overdueMinutes: Math.round(overdueMs / 60_000),
        });
    }
    return stale;
}
741
/**
 * Render the owner-facing markdown report for a list of stale cron jobs.
 *
 * Output is a bold header line, a blank line, one bullet per job, a blank
 * line, and a closing hint, joined with "\n". Each bullet shows the job name,
 * its schedule, the expected interval in whole minutes, when it last ran
 * (UTC, minute precision) and how overdue it is.
 *
 * A `lastRunAt` that cannot be parsed as a date is reported as unknown
 * instead of being passed to toISOString(), which throws RangeError on an
 * invalid date and would abort the whole notification.
 *
 * @param stale Stale-job records (jobName, schedule, lastRunAt,
 *   expectedIntervalMs, overdueMinutes are read).
 * @returns The formatted report text.
 */
function formatStaleReport(stale) {
    const lines = ['**Cron SLA breach — jobs that should have run but didn\'t:**', ''];
    for (const job of stale) {
        let last;
        if (!job.lastRunAt) {
            last = 'never run';
        }
        else {
            const lastRunMs = Date.parse(job.lastRunAt);
            last = Number.isNaN(lastRunMs)
                ? 'last run time unknown'
                : `last ran ${new Date(lastRunMs).toISOString().slice(0, 16).replace('T', ' ')}`;
        }
        const intervalMin = Math.round(job.expectedIntervalMs / 60_000);
        lines.push(`- **${job.jobName}** (${job.schedule}, every ${intervalMin}m) — ${last}, overdue by ~${job.overdueMinutes}m`);
    }
    lines.push('');
    lines.push('The scheduler may be stuck, the job may have thrown before logging, or it may have been silently disabled. Check the dashboard Scheduled Tasks panel.');
    return lines.join('\n');
}
609
752
  //# sourceMappingURL=failure-monitor.js.map
package/package.json CHANGED
@@ -1,6 +1,6 @@
1
1
  {
2
2
  "name": "clementine-agent",
3
- "version": "1.0.30",
3
+ "version": "1.0.31",
4
4
  "description": "Clementine — Personal AI Assistant (TypeScript)",
5
5
  "type": "module",
6
6
  "main": "dist/index.js",