npm - clementine-agent - Versions diffs - 1.0.14 → 1.0.15 - Mend

clementine-agent 1.0.14 → 1.0.15

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (10) hide show

package/dist/agent/self-improve.js +23 -0
package/dist/cli/dashboard.js +78 -1
package/dist/gateway/cron-scheduler.d.ts +5 -0
package/dist/gateway/cron-scheduler.js +32 -5
package/dist/gateway/failure-monitor.d.ts +40 -0
package/dist/gateway/failure-monitor.js +416 -0
package/dist/gateway/fix-verification.d.ts +39 -0
package/dist/gateway/fix-verification.js +144 -0
package/dist/gateway/heartbeat-scheduler.js +42 -4
package/package.json +1 -1

package/dist/agent/self-improve.js CHANGED Viewed

@@ -168,6 +168,29 @@ export class SelfImproveLoop {
             logger.info('Captured SOUL.md baseline for drift detection');
         }
         const state = this.loadState();
+        // If a prior run aborted on an infrastructure error that can't be fixed
+        // by retrying (malformed MCP tool schema, bad auth, etc.), don't spin
+        // the loop pointlessly. Wait at least 24h before re-probing — this gives
+        // the owner time to fix the infra and prevents us from writing dozens
+        // of identical error experiments. The failure monitor surfaces the
+        // infraError to the owner via the broken-jobs pipeline.
+        if (state.infraError && state.lastRunAt) {
+            const hoursSinceRun = (Date.now() - Date.parse(state.lastRunAt)) / 3_600_000;
+            if (Number.isFinite(hoursSinceRun) && hoursSinceRun < 24) {
+                logger.warn({
+                    category: state.infraError.category,
+                    diagnostic: state.infraError.diagnostic,
+                    hoursSinceRun: Math.round(hoursSinceRun),
+                }, 'Self-improve skipped — prior infra error still in cooldown. See Broken Jobs panel.');
+                state.status = 'completed';
+                this.saveState(state);
+                return state;
+            }
+            // Past cooldown — clear the flag and probe fresh. If it still errors,
+            // the loop will set it again.
+            logger.info('Self-improve: infra error cooldown elapsed, probing again');
+            delete state.infraError;
+        }
         state.status = 'running';
         state.lastRunAt = new Date().toISOString();
         state.currentIteration = 0;

package/dist/cli/dashboard.js CHANGED Viewed

@@ -2075,6 +2075,16 @@ export async function cmdDashboard(opts) {
             res.status(500).json({ error: String(err) });
         }
     });
+    // ── Broken jobs (failure monitor) ───────────────────────────────
+    app.get('/api/cron/broken-jobs', async (_req, res) => {
+        try {
+            const { computeBrokenJobs } = await import('../gateway/failure-monitor.js');
+            res.json({ jobs: computeBrokenJobs() });
+        }
+        catch (err) {
+            res.status(500).json({ error: String(err) });
+        }
+    });
     // ── Cron trace viewer ──────────────────────────────────────────
     app.get('/api/cron/traces/:job', (req, res) => {
         try {
@@ -9075,6 +9085,7 @@ if('serviceWorker' in navigator){navigator.serviceWorker.getRegistrations().then
       <div class="page-title">Scheduled Tasks</div>
       <div class="tab-bar" id="automations-tabs">
         <button class="active" onclick="switchTab('automations','scheduled')">Scheduled Tasks</button>
+        <button onclick="switchTab('automations','broken')">Broken Jobs <span class="tab-badge" id="tab-broken-count" title="repeatedly failing" style="display:none;background:#ef4444;color:#fff">0</span></button>
         <button onclick="switchTab('automations','timers')">Timers <span class="tab-badge" id="tab-timer-count" style="display:none">0</span></button>
         <button onclick="switchTab('automations','self-improve')">Self-Improve <span class="tab-badge" id="tab-si-pending" style="display:none">0</span></button>
         <button onclick="switchTab('automations','skills')">Skills <span class="tab-badge" id="tab-skill-count" style="display:none">0</span><span class="tab-badge" id="tab-pending-skill-count" title="pending approval" style="display:none;background:#f59e0b;color:#000">0</span></button>
@@ -9084,6 +9095,15 @@ if('serviceWorker' in navigator){navigator.serviceWorker.getRegistrations().then
         <div class="tab-pane active" id="tab-automations-scheduled">
           <div id="panel-cron"><div class="empty-state">Loading...</div></div>
         </div>
+        <div class="tab-pane" id="tab-automations-broken">
+          <div class="card">
+            <div class="card-header" style="display:flex;align-items:center;justify-content:space-between">
+              <span>Repeatedly Failing Jobs (last 48h)</span>
+              <span class="badge badge-gray" id="broken-count-badge" style="font-size:10px">0 jobs</span>
+            </div>
+            <div class="card-body" id="panel-broken-jobs"><div class="empty-state">Loading...</div></div>
+          </div>
+        </div>
         <div class="tab-pane" id="tab-automations-timers">
           <div class="card">
             <div class="card-body" id="panel-timers"><div class="empty-state">Loading...</div></div>
@@ -10307,7 +10327,7 @@ function navigateTo(page, opts) {
     updateBuilderMode();
     document.getElementById('builder-input').focus();
   }
-  if (page === 'automations') { refreshCron(); refreshTimers(); refreshSelfImprove(); refreshSkills(); }
+  if (page === 'automations') { refreshCron(); refreshTimers(); refreshSelfImprove(); refreshSkills(); refreshBrokenJobs(); }
   if (page === 'intelligence') { refreshMemory(); }
   if (page === 'settings') { refreshSettings(); refreshRemoteAccess(); refreshSalesforce(); refreshClaudeIntegrations(); refreshMcpServers(); }
   if (page === 'logs') refreshLogs();
@@ -10348,6 +10368,7 @@ function switchTab(group, tab) {
   // Tab-specific refresh
   if (group === 'automations') {
     if (tab === 'scheduled') refreshCron();
+    if (tab === 'broken') refreshBrokenJobs();
     if (tab === 'timers') refreshTimers();
     if (tab === 'self-improve') refreshSelfImprove();
     if (tab === 'workflows') refreshWorkflows();
@@ -16141,6 +16162,62 @@ async function expandSkill(name) {
   } catch(e) { toast('Failed to load skill', 'error'); }
 }
+// Fetch /api/cron/broken-jobs and render the "Broken Jobs" tab: updates the
+// red tab badge, the "N jobs" header badge, and the card list inside
+// #panel-broken-jobs. On any failure the panel shows an error placeholder;
+// the badges are left as they were.
+// NOTE(review): the doubled backslash in the middle-dot escape below suggests
+// this script is emitted from inside a string literal - confirm before
+// touching any escape sequences here.
+async function refreshBrokenJobs() {
+  try {
+    var r = await apiFetch('/api/cron/broken-jobs');
+    var d = await r.json();
+    var jobs = d.jobs || [];
+    // Tab badge is hidden entirely when nothing is broken.
+    var tabBadge = document.getElementById('tab-broken-count');
+    if (tabBadge) {
+      tabBadge.textContent = String(jobs.length);
+      tabBadge.style.display = jobs.length > 0 ? '' : 'none';
+    }
+    var countBadge = document.getElementById('broken-count-badge');
+    if (countBadge) countBadge.textContent = jobs.length + ' job' + (jobs.length !== 1 ? 's' : '');
+    var container = document.getElementById('panel-broken-jobs');
+    if (!container) return;
+    if (jobs.length === 0) {
+      container.innerHTML = '<div class="empty-state">All jobs healthy in the last 48h.</div>';
+      return;
+    }
+    // One card per broken job; text fields from the API go through esc()
+    // before being concatenated into the HTML string.
+    var html = '<div style="display:flex;flex-direction:column;gap:12px">';
+    for (var j of jobs) {
+      var breaker = j.circuitBreakerEngagedAt
+        ? '<span class="badge" style="background:rgba(239,68,68,0.15);color:#ef4444;font-size:10px">circuit broken</span>'
+        : '';
+      var lastErrorAt = j.lastErrorAt ? timeAgo(j.lastErrorAt) : 'unknown';
+      // NOTE(review): failureRatio and lastErrorAt are inserted without esc();
+      // presumably numeric counts and a timeAgo() string - verify.
+      var failureRatio = j.errorCount48h + '/' + j.totalRuns48h;
+      var advisorLine = j.lastAdvisorOpinion
+        ? '<div style="font-size:11px;color:var(--text-muted);margin-top:6px"><strong>Advisor:</strong> ' + esc(j.lastAdvisorOpinion) + '</div>'
+        : '';
+      var errorsHtml = '';
+      if (j.lastErrors && j.lastErrors.length > 0) {
+        errorsHtml = '<div style="margin-top:8px;display:flex;flex-direction:column;gap:4px">';
+        for (var e of j.lastErrors) {
+          errorsHtml += '<pre style="font-size:11px;color:var(--text-secondary);background:var(--bg-tertiary);padding:6px 8px;border-radius:4px;white-space:pre-wrap;word-break:break-word;margin:0;max-height:120px;overflow-y:auto">' + esc(e) + '</pre>';
+        }
+        errorsHtml += '</div>';
+      }
+      var agentTag = j.agentSlug
+        ? '<span class="badge badge-blue" style="font-size:10px">' + esc(j.agentSlug) + '</span>'
+        : '';
+      html += '<div style="padding:12px;border:1px solid var(--border);border-radius:8px;background:var(--bg-secondary)">'
+        + '<div style="display:flex;align-items:center;gap:8px;flex-wrap:wrap">'
+        + '<strong>' + esc(j.jobName) + '</strong> ' + agentTag + ' ' + breaker
+        + '<span style="margin-left:auto;font-size:11px;color:var(--text-muted)">' + failureRatio + ' failed \\u00b7 last error ' + lastErrorAt + '</span>'
+        + '</div>'
+        + errorsHtml
+        + advisorLine
+        + '</div>';
+    }
+    html += '</div>';
+    container.innerHTML = html;
+  } catch(e) {
+    // Fetch, JSON, or render failure: replace the panel body with an error note.
+    var c = document.getElementById('panel-broken-jobs');
+    if (c) c.innerHTML = '<div class="empty-state" style="color:var(--red)">Failed to load broken jobs</div>';
+  }
+}
 async function refreshPendingSkills() {
   try {
     var r = await apiFetch('/api/skills/pending');

package/dist/gateway/cron-scheduler.d.ts CHANGED Viewed

@@ -87,6 +87,11 @@ export declare class CronScheduler {
     private watchAgentsDir;
     private unwatchAgentsDir;
     reloadJobs(): void;
+    /**
+     * Wrap runLog.append so every completion also checks whether a fix
+     * verification is pending and DMs the verdict if so.
+     */
+    private _logRun;
     private runJob;
     /**
      * Log an advisor event to the events JSONL file for dashboard surfacing.

package/dist/gateway/cron-scheduler.js CHANGED Viewed

@@ -491,6 +491,9 @@ export class CronScheduler {
         this.watchingAgents = false;
     }
     reloadJobs() {
+        // Snapshot the pre-reload job definitions so fix-verification can diff
+        // and flag any currently-failing job whose config just changed.
+        const oldJobs = this.jobs.map(j => ({ ...j }));
         // Stop existing scheduled tasks (but NOT the file watcher)
         for (const [name, task] of this.scheduledTasks) {
             task.stop();
@@ -580,6 +583,30 @@ export class CronScheduler {
                 logger.info(`Cron job '${def.name}' scheduled: ${def.schedule} (${SYSTEM_TIMEZONE})`);
             }
         }
+        // Fix-verification: detect any currently-failing job whose definition just
+        // changed, and record a pending verification for their next run.
+        // Skipped on the first load (oldJobs empty) since there's no edit to verify.
+        if (oldJobs.length > 0) {
+            import('./fix-verification.js').then(({ recordEditsForFailingJobs }) => {
+                try {
+                    recordEditsForFailingJobs(oldJobs, this.jobs);
+                }
+                catch (err) {
+                    logger.warn({ err }, 'Fix-verification capture failed');
+                }
+            }).catch(err => logger.warn({ err }, 'Fix-verification import failed'));
+        }
+    }
+    /**
+     * Append a run entry to the run log, then pass it to the fix-verification
+     * module so a pending verification for this job can be checked and its
+     * verdict sent through the dispatcher.
+     *
+     * Fire-and-forget: the module is loaded lazily, nothing is awaited, and
+     * both import and delivery failures are only logged as warnings.
+     */
+    _logRun(entry) {
+        this.runLog.append(entry);
+        const sendVerdict = (text) => this.dispatcher.send(text, {});
+        const onDeliveryFailure = (err) => logger.warn({ err, job: entry.jobName }, 'Fix verification DM failed');
+        const onImportFailure = (err) => logger.warn({ err }, 'Fix-verification import failed');
+        const onLoaded = ({ checkAndDeliverVerification: deliver }) => {
+            deliver(entry, sendVerdict).catch(onDeliveryFailure);
+        };
+        import('./fix-verification.js').then(onLoaded).catch(onImportFailure);
+    }
     async runJob(job) {
         // Agent status check — skip if agent is paused/terminated
@@ -649,7 +676,7 @@ export class CronScheduler {
                 // Non-zero exit or timeout → skip the job
                 const exitCode = preCheckErr.status ?? 1;
                 logger.info({ job: job.name, exitCode }, 'Pre-check failed — skipping job (no work to do)');
-                this.runLog.append({
+                this._logRun({
                     jobName: job.name,
                     startedAt: new Date().toISOString(),
                     finishedAt: new Date().toISOString(),
@@ -690,7 +717,7 @@ export class CronScheduler {
             });
             if (!approved) {
                 logger.info({ job: job.name }, 'Cron job skipped by owner');
-                this.runLog.append({
+                this._logRun({
                     jobName: job.name,
                     startedAt: new Date().toISOString(),
                     finishedAt: new Date().toISOString(),
@@ -709,7 +736,7 @@ export class CronScheduler {
         const advice = getExecutionAdvice(job.name, job);
         if (advice.shouldSkip) {
             logger.info({ job: job.name, reason: advice.skipReason }, 'Execution advisor: circuit breaker — skipping job');
-            this.runLog.append({
+            this._logRun({
                 jobName: job.name,
                 startedAt: new Date().toISOString(),
                 finishedAt: new Date().toISOString(),
@@ -876,7 +903,7 @@ export class CronScheduler {
                             this.gateway.injectContext(`discord:user:${DISCORD_OWNER_ID}`, `[Scheduled cron: ${job.name}]`, response);
                         }
                     }
-                    this.runLog.append(entry);
+                    this._logRun(entry);
                     // Fire-and-forget: extract procedural skill from successful long-running cron jobs
                     if (entry.status === 'ok' && entry.durationMs > 30_000 && response && response.length > 500) {
                         this.gateway.extractCronSkill(job.name, job.prompt, response, entry.durationMs, job.agentSlug)
@@ -902,7 +929,7 @@ export class CronScheduler {
                     const errorType = errTerminalReason
                         ? classifyTerminalReason(errTerminalReason)
                         : classifyError(err);
-                    this.runLog.append({
+                    this._logRun({
                         jobName: job.name,
                         startedAt: startedAt.toISOString(),
                         finishedAt: finishedAt.toISOString(),

package/dist/gateway/failure-monitor.d.ts ADDED Viewed

@@ -0,0 +1,40 @@
+/**
+ * Clementine TypeScript — Cron failure monitor.
+ *
+ * Surfaces cron jobs that have been failing repeatedly so they don't sit
+ * silently broken (which is what happened to ross-the-sdr:reply-detection —
+ * the existing circuit breaker fired ONCE at consErrors=5 and then went
+ * quiet for days).
+ *
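+ * Threshold: a job is "broken" if any of
+ *   - it has >= 3 failing entries in the last 48h (error, retried, or an
+ *     `ok` run whose output self-reports a block/failure), OR
+ *   - its 2+ most recent non-skipped runs all failed, however far apart, OR
+ *   - the circuit breaker is currently engaged for it (no recovery logged
+ *     since), regardless of how long ago it fired.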
+ *
+ * Per-job 24h cooldown prevents re-spamming the owner with the same news.
+ *
+ * Read-only with respect to the cron run logs and advisor events; mutates
+ * only its own state file (cron/failure-monitor.json).
+ */
+/** One entry in the broken-jobs report (cron job or the `self-improve` pseudo-job). */
+export interface BrokenJob {
+    /** Job name as recorded in its run log, or `self-improve` for the self-improve loop. */
+    jobName: string;
+    /** Portion of `jobName` before the first `:` when the name contains one. */
+    agentSlug?: string;
+    /** Number of failing runs counted inside the lookback window. */
+    errorCount48h: number;
+    /** Total runs counted inside the lookback window. */
+    totalRuns48h: number;
+    /** `startedAt` of the most recent failing run, or null when none is known. */
+    lastErrorAt: string | null;
+    /** Up to three recent error messages, newest first, each truncated to 400 chars. */
+    lastErrors: string[];
+    /** Timestamp of the still-engaged circuit breaker, or null when not engaged. */
+    circuitBreakerEngagedAt: string | null;
+    /** Most recent advisor event for the job rendered as `type: detail`, or null. */
+    lastAdvisorOpinion: string | null;
+}
+/**
+ * Compute the current set of broken jobs by scanning all run logs (plus the
+ * self-improve loop's own state and log). Reads files only and does not touch
+ * the monitor's notification state — used both by the monitor sweep and the
+ * dashboard endpoint.
+ */
+export declare function computeBrokenJobs(now?: number): BrokenJob[];
+/**
+ * Run a sweep: identify currently-broken jobs, pick the ones we haven't
+ * notified about recently, and dispatch one consolidated DM.
+ *
+ * Returns the jobs that triggered a fresh notification (mostly for tests/logs).
+ */
+export declare function runFailureSweep(send: (text: string) => Promise<unknown>, now?: number): Promise<BrokenJob[]>;
+//# sourceMappingURL=failure-monitor.d.ts.map

package/dist/gateway/failure-monitor.js ADDED Viewed

@@ -0,0 +1,416 @@
+/**
+ * Clementine TypeScript — Cron failure monitor.
+ *
+ * Surfaces cron jobs that have been failing repeatedly so they don't sit
+ * silently broken (which is what happened to ross-the-sdr:reply-detection —
+ * the existing circuit breaker fired ONCE at consErrors=5 and then went
+ * quiet for days).
+ *
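+ * Threshold: a job is "broken" if any of
+ *   - it has >= 3 failing entries in the last 48h (error, retried, or an
+ *     `ok` run whose output self-reports a block/failure), OR
+ *   - its 2+ most recent non-skipped runs all failed, however far apart, OR
+ *   - the circuit breaker is currently engaged for it (no recovery logged
+ *     since), regardless of how long ago it fired.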
+ *
+ * Per-job 24h cooldown prevents re-spamming the owner with the same news.
+ *
+ * Read-only with respect to the cron run logs and advisor events; mutates
+ * only its own state file (cron/failure-monitor.json).
+ */
+import { appendFileSync, existsSync, mkdirSync, readFileSync, readdirSync, writeFileSync, } from 'node:fs';
+import path from 'node:path';
+import pino from 'pino';
+import { BASE_DIR } from '../config.js';
+// Module-scoped logger for the failure monitor.
+const logger = pino({ name: 'clementine.failure-monitor' });
+// Directory scanned for per-job run logs (*.jsonl files).
+const RUNS_DIR = path.join(BASE_DIR, 'cron', 'runs');
+// Advisor events log; read for circuit-breaker state and the latest opinion.
+const ADVISOR_EVENTS_FILE = path.join(BASE_DIR, 'cron', 'advisor-events.jsonl');
+// The monitor's own state file: per-job last-notified stamps.
+const STATE_FILE = path.join(BASE_DIR, 'cron', 'failure-monitor.json');
+// Self-improve loop state and experiment log, read by detectSelfImproveBreakage.
+const SELF_IMPROVE_STATE_FILE = path.join(BASE_DIR, 'self-improve', 'state.json');
+const SELF_IMPROVE_LOG_FILE = path.join(BASE_DIR, 'self-improve', 'experiment-log.jsonl');
+/** A job is broken if it crosses any of these thresholds in the lookback window. */
+const ERRORS_IN_WINDOW = 3;
+const WINDOW_HOURS = 48;
+/**
+ * Independent of the window — a job whose last N runs are all failures is
+ * broken even if they're spread over days (daily cron jobs can't accumulate
+ * 3 failures in 48h, but 2 consecutive BLOCKED days is still broken).
+ */
+const CONSECUTIVE_FAILURES = 2;
+/** Don't re-DM the owner about the same broken job within this window. */
+const NOTIFY_COOLDOWN_HOURS = 24;
+/**
+ * Load the monitor's notification state from STATE_FILE.
+ *
+ * Returns `{ notified }`, falling back to an empty map when the file is
+ * missing, unreadable, not valid JSON, or has no `notified` key.
+ */
+function loadState() {
+    let notified = {};
+    try {
+        if (existsSync(STATE_FILE)) {
+            const parsed = JSON.parse(readFileSync(STATE_FILE, 'utf-8'));
+            notified = parsed.notified ?? {};
+        }
+    }
+    catch {
+        notified = {};
+    }
+    return { notified };
+}
+/**
+ * Persist the monitor state to STATE_FILE as pretty-printed JSON, creating
+ * the parent directory if needed. Best-effort: a failure is logged as a
+ * warning and never thrown to the caller.
+ */
+function saveState(state) {
+    try {
+        const stateDir = path.dirname(STATE_FILE);
+        mkdirSync(stateDir, { recursive: true });
+        const payload = JSON.stringify(state, null, 2);
+        writeFileSync(STATE_FILE, payload);
+    }
+    catch (err) {
+        logger.warn({ err }, 'Failed to persist failure-monitor state');
+    }
+}
+/**
+ * Read a JSONL run log and return its parsed entries in file order.
+ *
+ * Blank lines, lines that are not valid JSON, and lines that parse to `null`
+ * are dropped. An unreadable file yields an empty array.
+ */
+function readRunLog(filePath) {
+    let text;
+    try {
+        text = readFileSync(filePath, 'utf-8');
+    }
+    catch {
+        return [];
+    }
+    const entries = [];
+    for (const line of text.trim().split('\n')) {
+        if (!line)
+            continue;
+        let parsed;
+        try {
+            parsed = JSON.parse(line);
+        }
+        catch {
+            continue;
+        }
+        if (parsed !== null)
+            entries.push(parsed);
+    }
+    return entries;
+}
+/**
+ * True when a run entry counts as a failure: its status is `error` or
+ * `retried`, or it is an `ok` run that isSemanticFailure flags.
+ */
+function isFailure(entry) {
+    if (entry.status === 'error' || entry.status === 'retried')
+        return true;
+    return isSemanticFailure(entry);
+}
+/**
+ * Detect a "semantic failure": a run recorded as `ok` whose output preview
+ * nevertheless self-reports that the task did not complete.
+ *
+ * Only explicit block/failure markers in the preview are used. A
+ * duration-vs-output heuristic was tried against the live corpus and flagged
+ * too many legitimately quiet jobs (healthchecks, inbox probes that return
+ * nothing when there is nothing to report).
+ *
+ * The markers come from failure modes observed in Ross's cron jobs
+ * (kernel-vs-local Bash, "BLOCKED (no local bash access)") plus generic
+ * agent self-reports.
+ *
+ * Returns false for any non-`ok` status and for a missing or blank preview.
+ */
+function isSemanticFailure(entry) {
+    if (entry.status !== 'ok')
+        return false;
+    const text = (entry.outputPreview ?? '').trim().toLowerCase();
+    if (text === '')
+        return false;
+    // Word boundaries keep "Result: BLOCKED" matching while a stray
+    // "blockedBy" key in a JSON fragment does not.
+    const markers = [
+        /\b(blocked|task_blocked|task_incomplete)\b/,
+        /\b(failed|could not|unable to|no local bash|permission denied)\b/,
+        /__nothing__/,
+    ];
+    return markers.some(pattern => pattern.test(text));
+}
+/**
+ * Replay the whole advisor events log (not just the 48h window) for one job
+ * and report its circuit-breaker state. A breaker that fired weeks ago still
+ * counts: while it is engaged the job stops running and produces no new
+ * failure entries.
+ *
+ * Returns `{ engagedAt, lastOpinion }`:
+ *   - engagedAt: timestamp of the latest `circuit-breaker` event that has no
+ *     later `circuit-recovery` or `auto-disabled` event, else null.
+ *   - lastOpinion: the job's most recent event of any type as `type: detail`,
+ *     else null.
+ * A missing or unreadable log yields nulls; malformed lines are skipped.
+ */
+function lastCircuitBreakerEvent(jobName) {
+    const result = { engagedAt: null, lastOpinion: null };
+    if (!existsSync(ADVISOR_EVENTS_FILE))
+        return result;
+    let rawLines;
+    try {
+        rawLines = readFileSync(ADVISOR_EVENTS_FILE, 'utf-8').trim().split('\n');
+    }
+    catch {
+        // Non-fatal: treat an unreadable log as "no events".
+        return result;
+    }
+    for (const raw of rawLines) {
+        try {
+            const evt = JSON.parse(raw);
+            if (evt.jobName === jobName) {
+                // Every matching event, whatever its type, becomes the latest opinion.
+                result.lastOpinion = `${evt.type}: ${evt.detail}`;
+                switch (evt.type) {
+                    case 'circuit-breaker':
+                        result.engagedAt = evt.timestamp;
+                        break;
+                    case 'circuit-recovery':
+                    case 'auto-disabled':
+                        result.engagedAt = null;
+                        break;
+                    default:
+                        break;
+                }
+            }
+        }
+        catch { /* skip malformed */ }
+    }
+    return result;
+}
+/**
+ * Compute the current set of broken jobs by scanning every `*.jsonl` run log
+ * in RUNS_DIR, then adding the self-improve pseudo-job if it looks broken.
+ * Reads files only and never touches the monitor's notification state — used
+ * both by the monitor sweep and the dashboard endpoint.
+ *
+ * @param now Reference time in epoch milliseconds (defaults to Date.now()).
+ * @returns Broken jobs, sorted with the most recent `lastErrorAt` first.
+ *
+ * NOTE(review): when RUNS_DIR is missing or unreadable this returns []
+ * before the self-improve check runs, so self-improve breakage is not
+ * reported in that case — confirm this is intended.
+ */
+export function computeBrokenJobs(now = Date.now()) {
+    if (!existsSync(RUNS_DIR))
+        return [];
+    const sinceMs = now - WINDOW_HOURS * 60 * 60 * 1000;
+    const broken = [];
+    let files = [];
+    try {
+        files = readdirSync(RUNS_DIR).filter(f => f.endsWith('.jsonl'));
+    }
+    catch {
+        return [];
+    }
+    const dormantCutoffMs = now - 7 * 24 * 60 * 60 * 1000;
+    for (const file of files) {
+        const entries = readRunLog(path.join(RUNS_DIR, file));
+        if (entries.length === 0)
+            continue;
+        // The job name is taken from the log's first entry, not the file name.
+        const jobName = entries[0].jobName;
+        // Skip dormant jobs — if the last run is >7 days old the job is
+        // probably removed or renamed and its historical failures aren't
+        // actionable. Circuit breaker still counts because an engaged breaker
+        // is itself "the job stopped running".
+        const lastEntry = entries[entries.length - 1];
+        const lastRunMs = Date.parse(lastEntry.startedAt);
+        // Always consult the breaker state — a stuck breaker is the primary
+        // signal for "job has been silently broken for days".
+        const cb = lastCircuitBreakerEvent(jobName);
+        // An unparseable startedAt (NaN) is not treated as dormant.
+        if (!cb.engagedAt && Number.isFinite(lastRunMs) && lastRunMs < dormantCutoffMs) {
+            continue;
+        }
+        const inWindow = entries.filter(e => {
+            const ts = Date.parse(e.startedAt);
+            return Number.isFinite(ts) && ts >= sinceMs;
+        });
+        const failures = inWindow.filter(isFailure);
+        // Consecutive-failure signal: scan from most recent entry backward.
+        // Stops at the first non-failure (ignoring 'skipped' which is neither
+        // signal). Catches daily jobs that fail every run without accumulating
+        // 3 in a 48h window.
+        let consecutiveFailures = 0;
+        for (let i = entries.length - 1; i >= 0; i--) {
+            const e = entries[i];
+            if (e.status === 'skipped')
+                continue;
+            if (isFailure(e))
+                consecutiveFailures++;
+            else
+                break;
+        }
+        // Any one of the three signals is enough to report the job.
+        const meetsThreshold = failures.length >= ERRORS_IN_WINDOW
+            || consecutiveFailures >= CONSECUTIVE_FAILURES
+            || !!cb.engagedAt;
+        if (!meetsThreshold)
+            continue;
+        // Gather up to 3 distinct error messages, newest first. Prefer in-window
+        // errors; if the breaker is engaged and there are no recent runs, fall
+        // back to the most recent errors anywhere in the log.
+        const errSource = failures.length > 0
+            ? failures
+            : entries.filter(isFailure);
+        const distinctErrors = [];
+        const seen = new Set();
+        for (let i = errSource.length - 1; i >= 0 && distinctErrors.length < 3; i--) {
+            // Entries without an `error` string contribute no message.
+            const err = (errSource[i].error ?? '').trim();
+            if (!err)
+                continue;
+            // Messages sharing their first 120 chars are treated as duplicates.
+            const key = err.slice(0, 120);
+            if (seen.has(key))
+                continue;
+            seen.add(key);
+            distinctErrors.push(err.slice(0, 400));
+        }
+        const lastFailureEntry = failures[failures.length - 1] ?? errSource[errSource.length - 1] ?? null;
+        // Names of the form "agent:job" carry the agent slug before the colon.
+        const agentSlug = jobName.includes(':') ? jobName.split(':')[0] : undefined;
+        broken.push({
+            jobName,
+            agentSlug,
+            errorCount48h: failures.length,
+            totalRuns48h: inWindow.length,
+            lastErrorAt: lastFailureEntry?.startedAt ?? null,
+            lastErrors: distinctErrors,
+            circuitBreakerEngagedAt: cb.engagedAt,
+            lastAdvisorOpinion: cb.lastOpinion,
+        });
+    }
+    // Also check the self-improve loop — it has its own log (not cron/runs/).
+    const siBroken = detectSelfImproveBreakage(now);
+    if (siBroken)
+        broken.push(siBroken);
+    // Most recently failing first; jobs with no lastErrorAt sort as time 0.
+    broken.sort((a, b) => {
+        const aT = a.lastErrorAt ? Date.parse(a.lastErrorAt) : 0;
+        const bT = b.lastErrorAt ? Date.parse(b.lastErrorAt) : 0;
+        return bT - aT;
+    });
+    return broken;
+}
+/**
+ * Detect breakage of the self-improve loop, which writes to its own
+ * experiment-log.jsonl rather than cron/runs/.
+ *
+ * The loop is reported as broken when any of these holds:
+ *   a. state.infraError is set (the loop detected an unfixable infra issue);
+ *   b. there are 3+ experiments from the last 7 days (among the last 10 log
+ *      lines) and every one of them errored — i.e. was denied with a reason
+ *      starting with "Error". Ordinary rejections do not count;
+ *   c. state.lastRunAt is within the last 48h but no experiment from the
+ *      last 7 days exists (silent early exit before any iteration).
+ *
+ * Malformed input is tolerated rather than thrown, because this runs inside
+ * both the dashboard endpoint and the notification sweep: a `null` state
+ * file is treated as empty state, non-object log lines are skipped, and
+ * non-string `error` / `reason` / `diagnostic` values are handled.
+ *
+ * @param now Reference time in epoch milliseconds.
+ * @returns A synthetic BrokenJob named `self-improve`, or null when the loop
+ *          looks healthy or there is no state file.
+ */
+function detectSelfImproveBreakage(now) {
+    if (!existsSync(SELF_IMPROVE_STATE_FILE))
+        return null;
+    let state;
+    try {
+        // A file containing literal `null` parses fine; treat it as empty state.
+        state = JSON.parse(readFileSync(SELF_IMPROVE_STATE_FILE, 'utf-8')) ?? {};
+    }
+    catch {
+        return null;
+    }
+    const experiments = [];
+    if (existsSync(SELF_IMPROVE_LOG_FILE)) {
+        try {
+            const lines = readFileSync(SELF_IMPROVE_LOG_FILE, 'utf-8').trim().split('\n').filter(Boolean);
+            // Only the 10 most recent log lines are considered.
+            for (const line of lines.slice(-10)) {
+                try {
+                    const parsed = JSON.parse(line);
+                    if (parsed !== null && typeof parsed === 'object')
+                        experiments.push(parsed);
+                }
+                catch { /* skip */ }
+            }
+        }
+        catch { /* non-fatal */ }
+    }
+    const lastRunMs = state.lastRunAt ? Date.parse(state.lastRunAt) : 0;
+    const lookback48h = now - 48 * 60 * 60 * 1000;
+    const staleLookback = now - 7 * 24 * 60 * 60 * 1000; // 7 days
+    // Single definition of "this experiment errored", shared by the count
+    // shown to the owner and by break mode (b).
+    const isErrored = (e) => e.approvalStatus === 'denied'
+        && typeof e.reason === 'string'
+        && e.reason.startsWith('Error');
+    const errorText = (e) => (typeof e.error === 'string' ? e.error.trim() : '');
+    const recentExperiments = experiments.filter(e => {
+        const ts = e.startedAt ? Date.parse(e.startedAt) : 0;
+        return Number.isFinite(ts) && ts >= staleLookback;
+    });
+    const recentErrors = recentExperiments.filter(isErrored);
+    const hasInfraError = !!state.infraError;
+    const allRecentErrored = recentExperiments.length >= 3
+        && recentErrors.length === recentExperiments.length;
+    const silentEarlyExit = lastRunMs > lookback48h
+        && recentExperiments.length === 0;
+    if (!hasInfraError && !allRecentErrored && !silentEarlyExit)
+        return null;
+    // Up to 3 most recent non-empty error messages, newest first.
+    const lastErrors = [];
+    for (let i = experiments.length - 1; i >= 0 && lastErrors.length < 3; i--) {
+        const err = errorText(experiments[i]);
+        if (!err)
+            continue;
+        lastErrors.push(err.slice(0, 400));
+    }
+    // If we don't have an explicit infraError but the last recorded error
+    // looks schema-related, surface it — this captures the state where all
+    // iterations died with the same API 400 but state.infraError never got
+    // persisted (happens when MAX_INFRA_ERRORS isn't crossed within a run).
+    const lastExperiment = experiments[experiments.length - 1];
+    const lastLoggedError = lastExperiment ? errorText(lastExperiment) : '';
+    const inferredInfraSchema = /input_schema|tools\.\d+\.custom/i.test(lastLoggedError);
+    let opinion;
+    if (hasInfraError) {
+        const { category, diagnostic } = state.infraError;
+        opinion = `infra: ${category} — ${String(diagnostic ?? '').slice(0, 200)}`;
+    }
+    else if (silentEarlyExit && inferredInfraSchema) {
+        opinion = 'loop ran but produced no experiments — last logged error was an MCP tool schema validation (API 400). Check external MCP servers (claude_desktop_config.json, Claude Code settings) for a recently-updated package exposing a malformed input_schema.';
+    }
+    else if (silentEarlyExit) {
+        opinion = 'loop ran but produced no experiments — likely crashing before iteration (check metrics gathering or hypothesis generation)';
+    }
+    else {
+        opinion = `${recentErrors.length}/${recentExperiments.length} recent iterations errored`;
+    }
+    return {
+        jobName: 'self-improve',
+        agentSlug: undefined,
+        errorCount48h: recentErrors.length,
+        totalRuns48h: recentExperiments.length,
+        lastErrorAt: lastExperiment?.startedAt ?? state.lastRunAt ?? null,
+        lastErrors,
+        // The self-improve loop has no breaker of its own; an infra error is
+        // reported in the same slot, stamped with the last run time.
+        circuitBreakerEngagedAt: hasInfraError ? state.lastRunAt ?? null : null,
+        lastAdvisorOpinion: opinion,
+    };
+}
+/**
+ * Build the owner DM for a list of broken jobs: a headline with the job
+ * count, one bullet per job (failed/total runs, plus a breaker note when
+ * engaged), an optional first line of the newest error and an optional
+ * advisor opinion (each capped at 140 chars), and a pointer to the dashboard.
+ */
+function formatReport(jobs) {
+    const plural = jobs.length === 1 ? '' : 's';
+    const headline = `🚨 **${jobs.length} cron job${plural} repeatedly failing** (last ${WINDOW_HOURS}h)`;
+    const sections = jobs.flatMap((job) => {
+        const breakerNote = job.circuitBreakerEngagedAt ? ' · circuit breaker engaged' : '';
+        const section = [
+            `• \`${job.jobName}\` — ${job.errorCount48h}/${job.totalRuns48h} runs failed${breakerNote}`,
+        ];
+        if (job.lastErrors.length > 0) {
+            const firstLine = job.lastErrors[0].split('\n')[0].slice(0, 140);
+            section.push(`  Last error: ${firstLine}`);
+        }
+        if (job.lastAdvisorOpinion) {
+            section.push(`  Advisor: ${job.lastAdvisorOpinion.slice(0, 140)}`);
+        }
+        return section;
+    });
+    const footer = 'Open the dashboard → Broken Jobs panel for the full picture.';
+    return [headline, '', ...sections, '', footer].join('\n');
+}
+/**
+ * Run a sweep: identify currently-broken jobs, pick the ones we haven't
+ * notified about within the cooldown, and dispatch one consolidated DM.
+ *
+ * Cooldown stamps of jobs that are no longer broken are dropped on every
+ * sweep, so a job that recovers and then breaks again notifies promptly
+ * even while other jobs remain broken.
+ *
+ * @param send Delivers the report text to the owner.
+ * @param now  Reference time in epoch milliseconds (defaults to Date.now()).
+ * @returns The jobs selected for a fresh notification (mostly for
+ *          tests/logs). They are returned even when dispatch fails; in that
+ *          case the failure is only logged and no cooldown is recorded, so
+ *          the same jobs are picked again on the next sweep.
+ */
+export async function runFailureSweep(send, now = Date.now()) {
+    const broken = computeBrokenJobs(now);
+    const state = loadState();
+    // Forget recovered jobs. Persist right away so the cleanup survives
+    // regardless of whether a notification is sent below.
+    const brokenNames = new Set(broken.map(job => job.jobName));
+    let pruned = false;
+    for (const name of Object.keys(state.notified)) {
+        if (!brokenNames.has(name)) {
+            delete state.notified[name];
+            pruned = true;
+        }
+    }
+    if (pruned)
+        saveState(state);
+    const cooldownMs = NOTIFY_COOLDOWN_HOURS * 60 * 60 * 1000;
+    const fresh = [];
+    for (const job of broken) {
+        const prev = state.notified[job.jobName];
+        // An unparseable stamp compares as NaN, so the job is notified again.
+        if (prev && now - Date.parse(prev.lastNotifiedAt) < cooldownMs)
+            continue;
+        fresh.push(job);
+    }
+    if (fresh.length === 0)
+        return [];
+    try {
+        await send(formatReport(fresh));
+        // Stamp cooldowns only after the DM actually went out.
+        const stamp = new Date(now).toISOString();
+        for (const job of fresh) {
+            state.notified[job.jobName] = { lastNotifiedAt: stamp, lastErrorCount: job.errorCount48h };
+        }
+        saveState(state);
+        appendAuditLog('notified', fresh.map(j => j.jobName));
+        logger.info({ count: fresh.length, jobs: fresh.map(j => j.jobName) }, 'Failure monitor: notified owner');
+    }
+    catch (err) {
+        logger.warn({ err }, 'Failure monitor: notification dispatch failed');
+    }
+    return fresh;
+}
+/**
+ * Append one JSON line ({ action, jobs, timestamp }) to
+ * cron/failure-monitor.log under BASE_DIR. Best-effort: any error is ignored.
+ */
+function appendAuditLog(action, jobNames) {
+    try {
+        const record = {
+            action,
+            jobs: jobNames,
+            timestamp: new Date().toISOString(),
+        };
+        const auditPath = path.join(BASE_DIR, 'cron', 'failure-monitor.log');
+        appendFileSync(auditPath, `${JSON.stringify(record)}\n`);
+    }
+    catch { /* non-fatal */ }
+}
+//# sourceMappingURL=failure-monitor.js.map

package/dist/gateway/fix-verification.d.ts ADDED Viewed

@@ -0,0 +1,39 @@
+/**
+ * Clementine TypeScript — Cron fix verification.
+ *
+ * When a CRON.md (global or per-agent) is edited, we record a "pending
+ * verification" for any job whose definition changed AND that is currently
+ * in a failing state. After that job's next non-skipped run, we DM the
+ * owner with the verdict — succeeded or still failing — so a self-reported
+ * "fix" can't go untested again.
+ */
+import type { CronJobDefinition, CronRunEntry } from '../types.js';
+interface PendingVerification { // One queued fix check for a failing job whose definition was edited.
+    jobName: string; // Cron job name; also the key the entry is stored under.
+    recordedAt: string; // ISO-8601 timestamp taken when the edit was recorded.
+    preFailureCount: number; // The job's errorCount48h from the failure monitor at record time.
+    preLastError: string | null; // First entry of the monitor's lastErrors at record time, or null if none.
+}
+/**
+ * Compare an old and new jobs list and record verifications for any job that:
+ *   - exists in both lists (new jobs aren't "fixes" of existing problems)
+ *   - has its definition hash changed
+ *   - is currently in a failing state per failure-monitor
+ *
+ * Removed and disabled jobs are handled too: if a currently failing job is
+ * removed or disabled by the edit, any pending verification stored for it
+ * is cleared (and the event is logged) rather than waiting for a run that
+ * will never come. No message is sent to the owner in that case.
+ *
+ * @param oldJobs Job definitions before the edit.
+ * @param newJobs Job definitions after the edit.
+ */
+export declare function recordEditsForFailingJobs(oldJobs: CronJobDefinition[], newJobs: CronJobDefinition[]): void;
+/**
+ * After a cron run completes, check whether we were waiting on a fix
+ * verification for this job. If so, send the owner a verdict and clear it.
+ *
+ * Skipped runs (circuit breaker, pre-check exit, etc.) don't carry signal
+ * and shouldn't count as a verdict either way.
+ *
+ * The pending entry is cleared before the message is sent, and a rejected
+ * `send` is logged rather than rethrown, so a verdict is attempted at most
+ * once per recorded edit.
+ *
+ * @param entry Run-log entry of the run that just finished; a status of
+ *   'ok' is reported as success, any other non-skipped status as failure.
+ * @param send Async callback that delivers the verdict text to the owner.
+ */
+export declare function checkAndDeliverVerification(entry: CronRunEntry, send: (text: string) => Promise<unknown>): Promise<void>;
+/** Read-only accessor for dashboards or debugging: returns every pending verification currently in the persisted state. */
+export declare function listPendingVerifications(): PendingVerification[];
+export {};
+//# sourceMappingURL=fix-verification.d.ts.map

package/dist/gateway/fix-verification.js ADDED Viewed

@@ -0,0 +1,144 @@
+/**
+ * Clementine TypeScript — Cron fix verification.
+ *
+ * When a CRON.md (global or per-agent) is edited, we record a "pending
+ * verification" for any job whose definition changed AND that is currently
+ * in a failing state. After that job's next non-skipped run, we DM the
+ * owner with the verdict — succeeded or still failing — so a self-reported
+ * "fix" can't go untested again.
+ */
+import { existsSync, mkdirSync, readFileSync, writeFileSync, } from 'node:fs';
+import path from 'node:path';
+import crypto from 'node:crypto';
+import pino from 'pino';
+import { BASE_DIR } from '../config.js';
+import { computeBrokenJobs } from './failure-monitor.js';
+const logger = pino({ name: 'clementine.fix-verification' });
+const STATE_FILE = path.join(BASE_DIR, 'cron', 'fix-verifications.json');
+function loadState() {
+    try {
+        if (!existsSync(STATE_FILE))
+            return { pending: {} };
+        const raw = JSON.parse(readFileSync(STATE_FILE, 'utf-8'));
+        return { pending: raw.pending ?? {} };
+    }
+    catch {
+        return { pending: {} };
+    }
+}
+function saveState(state) {
+    try {
+        mkdirSync(path.dirname(STATE_FILE), { recursive: true });
+        writeFileSync(STATE_FILE, JSON.stringify(state, null, 2));
+    }
+    catch (err) {
+        logger.warn({ err }, 'Failed to persist fix-verification state');
+    }
+}
+/**
+ * Hash the job fields a fix could touch. Schedule + prompt + tier + mode +
+ * model + maxTurns + maxHours + workDir + preCheck + successCriteria are the
+ * only fields a "fix" would realistically change. We deliberately ignore
+ * `enabled` because disabling isn't a fix.
+ */
+function jobHash(j) {
+    const data = JSON.stringify({
+        schedule: j.schedule,
+        prompt: j.prompt,
+        tier: j.tier,
+        maxTurns: j.maxTurns,
+        model: j.model,
+        workDir: j.workDir,
+        mode: j.mode,
+        maxHours: j.maxHours,
+        preCheck: j.preCheck,
+        successCriteria: j.successCriteria,
+    });
+    return crypto.createHash('sha1').update(data).digest('hex').slice(0, 12);
+}
+/**
+ * Compare an old and new jobs list and record verifications for any job that:
+ *   - exists in both lists (new jobs aren't "fixes" of existing problems)
+ *   - has its definition hash changed
+ *   - is currently in a failing state per failure-monitor
+ *
+ * Disabled jobs and removed jobs are tracked too: if a previously failing
+ * job gets disabled or removed in the edit, we surface that as a "removed
+ * pending verification" rather than waiting for a run that will never come.
+ */
+export function recordEditsForFailingJobs(oldJobs, newJobs) {
+    const oldByName = new Map(oldJobs.map(j => [j.name, j]));
+    const newByName = new Map(newJobs.map(j => [j.name, j]));
+    const broken = computeBrokenJobs();
+    const brokenByName = new Map(broken.map(b => [b.jobName, b]));
+    const state = loadState();
+    const stamp = new Date().toISOString();
+    let mutated = false;
+    for (const [name, oj] of oldByName) {
+        const b = brokenByName.get(name);
+        if (!b)
+            continue; // not currently broken — nothing to verify
+        const nj = newByName.get(name);
+        if (!nj) {
+            // Job removed entirely. Treat as resolved by removal.
+            delete state.pending[name];
+            mutated = true;
+            logger.info({ job: name }, 'Failing job removed from CRON.md — verification cleared');
+            continue;
+        }
+        if (!nj.enabled) {
+            // Job disabled. Don't wait for a run; clear and note.
+            delete state.pending[name];
+            mutated = true;
+            logger.info({ job: name }, 'Failing job disabled in CRON.md — verification cleared');
+            continue;
+        }
+        if (jobHash(oj) === jobHash(nj))
+            continue; // no relevant changes
+        state.pending[name] = {
+            jobName: name,
+            recordedAt: stamp,
+            preFailureCount: b.errorCount48h,
+            preLastError: b.lastErrors[0] ?? null,
+        };
+        mutated = true;
+        logger.info({ job: name, preFailureCount: b.errorCount48h }, 'Recorded pending fix verification');
+    }
+    if (mutated)
+        saveState(state);
+}
+/**
+ * After a cron run completes, check whether we were waiting on a fix
+ * verification for this job. If so, send the owner a verdict and clear it.
+ *
+ * Skipped runs (circuit breaker, pre-check exit, etc.) don't carry signal
+ * and shouldn't count as a verdict either way.
+ */
+export async function checkAndDeliverVerification(entry, send) {
+    if (entry.status === 'skipped')
+        return;
+    const state = loadState();
+    const pending = state.pending[entry.jobName];
+    if (!pending)
+        return;
+    delete state.pending[entry.jobName];
+    saveState(state);
+    const ok = entry.status === 'ok';
+    const verdict = ok ? '✅ succeeded' : '⚠️ still failing';
+    const ageMin = Math.max(1, Math.round((Date.now() - Date.parse(pending.recordedAt)) / 60000));
+    const detail = ok
+        ? ''
+        : `\nError: ${(entry.error ?? 'unknown').split('\n')[0].slice(0, 200)}`;
+    const msg = `**[Fix verification]** \`${entry.jobName}\` ${verdict} on its first run after edit (${ageMin}m later).${detail}`;
+    try {
+        await send(msg);
+    }
+    catch (err) {
+        logger.warn({ err, job: entry.jobName }, 'Failed to send fix verification DM');
+    }
+}
+/** Read-only accessor for dashboards or debugging. */
+export function listPendingVerifications() {
+    return Object.values(loadState().pending);
+}
+//# sourceMappingURL=fix-verification.js.map

package/dist/gateway/heartbeat-scheduler.js CHANGED Viewed

@@ -103,6 +103,13 @@ export class HeartbeatScheduler {
         catch (err) {
             logger.warn({ err }, 'Session eviction failed');
         }
+        // Cron failure sweep — surface jobs that have been silently failing.
+        // Runs every tick; per-job 24h cooldown lives inside the monitor.
+        import('./failure-monitor.js').then(({ runFailureSweep }) => {
+            runFailureSweep((text) => this.dispatcher.send(text, {})).catch(err => {
+                logger.warn({ err }, 'Failure sweep failed');
+            });
+        }).catch(err => logger.warn({ err }, 'Failure sweep import failed'));
         const now = new Date();
         const hour = now.getHours();
         // ── Nightly tasks: run regardless of active hours ─────────────────
@@ -626,10 +633,41 @@ export class HeartbeatScheduler {
         const prompt = buildInsightPrompt(signals);
         if (!prompt)
             return;
-        // Run lightweight LLM call via gateway
-        const response = await this.gateway.handleCronJob('insight-check', prompt, 1, // tier 1
-        1, // max 1 turn (just rating + message)
-        'haiku');
+        // Run lightweight LLM call via gateway. Log success AND failure to the
+        // cron run log so the failure monitor can see hourly breakage.
+        // maxTurns bumped 1 → 3 because the agent needs to fan out ~4 parallel
+        // tool calls (activity_history, outlook_inbox, goal_list, task_list)
+        // before composing its rating — at 1 turn it always crashes with
+        // "Reached maximum number of turns".
+        const icStartedAt = new Date();
+        let response = null;
+        try {
+            response = await this.gateway.handleCronJob('insight-check', prompt, 1, // tier 1
+            3, // max 3 turns (parallel tool fan-out + synthesis)
+            'haiku');
+            this.runLog.append({
+                jobName: 'insight-check',
+                startedAt: icStartedAt.toISOString(),
+                finishedAt: new Date().toISOString(),
+                status: 'ok',
+                durationMs: Date.now() - icStartedAt.getTime(),
+                attempt: 1,
+                outputPreview: (response ?? '').slice(0, 200),
+            });
+        }
+        catch (err) {
+            this.runLog.append({
+                jobName: 'insight-check',
+                startedAt: icStartedAt.toISOString(),
+                finishedAt: new Date().toISOString(),
+                status: 'error',
+                durationMs: Date.now() - icStartedAt.getTime(),
+                attempt: 1,
+                error: String(err).slice(0, 400),
+                errorType: 'transient',
+            });
+            throw err;
+        }
         if (!response)
             return;
         const insight = parseInsightResponse(response);

package/package.json CHANGED Viewed

@@ -1,6 +1,6 @@
 {
   "name": "clementine-agent",
-  "version": "1.0.14",
+  "version": "1.0.15",
   "description": "Clementine — Personal AI Assistant (TypeScript)",
   "type": "module",
   "main": "dist/index.js",