clementine-agent 1.0.14 → 1.0.15
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/dist/agent/self-improve.js +23 -0
- package/dist/cli/dashboard.js +78 -1
- package/dist/gateway/cron-scheduler.d.ts +5 -0
- package/dist/gateway/cron-scheduler.js +32 -5
- package/dist/gateway/failure-monitor.d.ts +40 -0
- package/dist/gateway/failure-monitor.js +416 -0
- package/dist/gateway/fix-verification.d.ts +39 -0
- package/dist/gateway/fix-verification.js +144 -0
- package/dist/gateway/heartbeat-scheduler.js +42 -4
- package/package.json +1 -1
|
@@ -168,6 +168,29 @@ export class SelfImproveLoop {
|
|
|
168
168
|
logger.info('Captured SOUL.md baseline for drift detection');
|
|
169
169
|
}
|
|
170
170
|
const state = this.loadState();
|
|
171
|
+
// If a prior run aborted on an infrastructure error that can't be fixed
|
|
172
|
+
// by retrying (malformed MCP tool schema, bad auth, etc.), don't spin
|
|
173
|
+
// the loop pointlessly. Wait at least 24h before re-probing — this gives
|
|
174
|
+
// the owner time to fix the infra and prevents us from writing dozens
|
|
175
|
+
// of identical error experiments. The failure monitor surfaces the
|
|
176
|
+
// infraError to the owner via the broken-jobs pipeline.
|
|
177
|
+
if (state.infraError && state.lastRunAt) {
|
|
178
|
+
const hoursSinceRun = (Date.now() - Date.parse(state.lastRunAt)) / 3_600_000;
|
|
179
|
+
if (Number.isFinite(hoursSinceRun) && hoursSinceRun < 24) {
|
|
180
|
+
logger.warn({
|
|
181
|
+
category: state.infraError.category,
|
|
182
|
+
diagnostic: state.infraError.diagnostic,
|
|
183
|
+
hoursSinceRun: Math.round(hoursSinceRun),
|
|
184
|
+
}, 'Self-improve skipped — prior infra error still in cooldown. See Broken Jobs panel.');
|
|
185
|
+
state.status = 'completed';
|
|
186
|
+
this.saveState(state);
|
|
187
|
+
return state;
|
|
188
|
+
}
|
|
189
|
+
// Past cooldown — clear the flag and probe fresh. If it still errors,
|
|
190
|
+
// the loop will set it again.
|
|
191
|
+
logger.info('Self-improve: infra error cooldown elapsed, probing again');
|
|
192
|
+
delete state.infraError;
|
|
193
|
+
}
|
|
171
194
|
state.status = 'running';
|
|
172
195
|
state.lastRunAt = new Date().toISOString();
|
|
173
196
|
state.currentIteration = 0;
|
package/dist/cli/dashboard.js
CHANGED
|
@@ -2075,6 +2075,16 @@ export async function cmdDashboard(opts) {
|
|
|
2075
2075
|
res.status(500).json({ error: String(err) });
|
|
2076
2076
|
}
|
|
2077
2077
|
});
|
|
2078
|
+
// ── Broken jobs (failure monitor) ───────────────────────────────
|
|
2079
|
+
app.get('/api/cron/broken-jobs', async (_req, res) => {
|
|
2080
|
+
try {
|
|
2081
|
+
const { computeBrokenJobs } = await import('../gateway/failure-monitor.js');
|
|
2082
|
+
res.json({ jobs: computeBrokenJobs() });
|
|
2083
|
+
}
|
|
2084
|
+
catch (err) {
|
|
2085
|
+
res.status(500).json({ error: String(err) });
|
|
2086
|
+
}
|
|
2087
|
+
});
|
|
2078
2088
|
// ── Cron trace viewer ──────────────────────────────────────────
|
|
2079
2089
|
app.get('/api/cron/traces/:job', (req, res) => {
|
|
2080
2090
|
try {
|
|
@@ -9075,6 +9085,7 @@ if('serviceWorker' in navigator){navigator.serviceWorker.getRegistrations().then
|
|
|
9075
9085
|
<div class="page-title">Scheduled Tasks</div>
|
|
9076
9086
|
<div class="tab-bar" id="automations-tabs">
|
|
9077
9087
|
<button class="active" onclick="switchTab('automations','scheduled')">Scheduled Tasks</button>
|
|
9088
|
+
<button onclick="switchTab('automations','broken')">Broken Jobs <span class="tab-badge" id="tab-broken-count" title="repeatedly failing" style="display:none;background:#ef4444;color:#fff">0</span></button>
|
|
9078
9089
|
<button onclick="switchTab('automations','timers')">Timers <span class="tab-badge" id="tab-timer-count" style="display:none">0</span></button>
|
|
9079
9090
|
<button onclick="switchTab('automations','self-improve')">Self-Improve <span class="tab-badge" id="tab-si-pending" style="display:none">0</span></button>
|
|
9080
9091
|
<button onclick="switchTab('automations','skills')">Skills <span class="tab-badge" id="tab-skill-count" style="display:none">0</span><span class="tab-badge" id="tab-pending-skill-count" title="pending approval" style="display:none;background:#f59e0b;color:#000">0</span></button>
|
|
@@ -9084,6 +9095,15 @@ if('serviceWorker' in navigator){navigator.serviceWorker.getRegistrations().then
|
|
|
9084
9095
|
<div class="tab-pane active" id="tab-automations-scheduled">
|
|
9085
9096
|
<div id="panel-cron"><div class="empty-state">Loading...</div></div>
|
|
9086
9097
|
</div>
|
|
9098
|
+
<div class="tab-pane" id="tab-automations-broken">
|
|
9099
|
+
<div class="card">
|
|
9100
|
+
<div class="card-header" style="display:flex;align-items:center;justify-content:space-between">
|
|
9101
|
+
<span>Repeatedly Failing Jobs (last 48h)</span>
|
|
9102
|
+
<span class="badge badge-gray" id="broken-count-badge" style="font-size:10px">0 jobs</span>
|
|
9103
|
+
</div>
|
|
9104
|
+
<div class="card-body" id="panel-broken-jobs"><div class="empty-state">Loading...</div></div>
|
|
9105
|
+
</div>
|
|
9106
|
+
</div>
|
|
9087
9107
|
<div class="tab-pane" id="tab-automations-timers">
|
|
9088
9108
|
<div class="card">
|
|
9089
9109
|
<div class="card-body" id="panel-timers"><div class="empty-state">Loading...</div></div>
|
|
@@ -10307,7 +10327,7 @@ function navigateTo(page, opts) {
|
|
|
10307
10327
|
updateBuilderMode();
|
|
10308
10328
|
document.getElementById('builder-input').focus();
|
|
10309
10329
|
}
|
|
10310
|
-
if (page === 'automations') { refreshCron(); refreshTimers(); refreshSelfImprove(); refreshSkills(); }
|
|
10330
|
+
if (page === 'automations') { refreshCron(); refreshTimers(); refreshSelfImprove(); refreshSkills(); refreshBrokenJobs(); }
|
|
10311
10331
|
if (page === 'intelligence') { refreshMemory(); }
|
|
10312
10332
|
if (page === 'settings') { refreshSettings(); refreshRemoteAccess(); refreshSalesforce(); refreshClaudeIntegrations(); refreshMcpServers(); }
|
|
10313
10333
|
if (page === 'logs') refreshLogs();
|
|
@@ -10348,6 +10368,7 @@ function switchTab(group, tab) {
|
|
|
10348
10368
|
// Tab-specific refresh
|
|
10349
10369
|
if (group === 'automations') {
|
|
10350
10370
|
if (tab === 'scheduled') refreshCron();
|
|
10371
|
+
if (tab === 'broken') refreshBrokenJobs();
|
|
10351
10372
|
if (tab === 'timers') refreshTimers();
|
|
10352
10373
|
if (tab === 'self-improve') refreshSelfImprove();
|
|
10353
10374
|
if (tab === 'workflows') refreshWorkflows();
|
|
@@ -16141,6 +16162,62 @@ async function expandSkill(name) {
|
|
|
16141
16162
|
} catch(e) { toast('Failed to load skill', 'error'); }
|
|
16142
16163
|
}
|
|
16143
16164
|
|
|
16165
|
+
/**
 * Fetch the broken-jobs report and render it into the Broken Jobs tab.
 *
 * Updates three DOM nodes when present: the tab badge (#tab-broken-count),
 * the card badge (#broken-count-badge) and the list (#panel-broken-jobs).
 * Any fetch/parse failure, or a response that carries no jobs array, is
 * rendered as an error state instead of the "all healthy" message.
 *
 * NOTE(review): the doubled backslash in the middle-dot escape below suggests
 * this script is emitted from a string template - keep backticks and
 * single backslashes out of this function; confirm against the enclosing file.
 */
async function refreshBrokenJobs() {
  try {
    var r = await apiFetch('/api/cron/broken-jobs');
    // Strict comparison so a wrapper object without an ok flag still passes.
    if (r.ok === false) throw new Error('HTTP ' + r.status);
    var d = await r.json();
    // The endpoint answers { jobs: [...] } on success and { error } on failure.
    // A payload without a jobs array must not be shown as "All jobs healthy".
    if (!d || !Array.isArray(d.jobs)) throw new Error('Malformed broken-jobs response');
    var jobs = d.jobs;
    var tabBadge = document.getElementById('tab-broken-count');
    if (tabBadge) {
      tabBadge.textContent = String(jobs.length);
      // Only show the red badge when there is something to look at.
      tabBadge.style.display = jobs.length > 0 ? '' : 'none';
    }
    var countBadge = document.getElementById('broken-count-badge');
    if (countBadge) countBadge.textContent = jobs.length + ' job' + (jobs.length !== 1 ? 's' : '');
    var container = document.getElementById('panel-broken-jobs');
    if (!container) return;
    if (jobs.length === 0) {
      container.innerHTML = '<div class="empty-state">All jobs healthy in the last 48h.</div>';
      return;
    }
    var html = '<div style="display:flex;flex-direction:column;gap:12px">';
    for (var j of jobs) {
      var breaker = j.circuitBreakerEngagedAt
        ? '<span class="badge" style="background:rgba(239,68,68,0.15);color:#ef4444;font-size:10px">circuit broken</span>'
        : '';
      var lastErrorAt = j.lastErrorAt ? timeAgo(j.lastErrorAt) : 'unknown';
      var failureRatio = j.errorCount48h + '/' + j.totalRuns48h;
      // Free-text fields (advisor opinion, error messages, names) go through esc().
      var advisorLine = j.lastAdvisorOpinion
        ? '<div style="font-size:11px;color:var(--text-muted);margin-top:6px"><strong>Advisor:</strong> ' + esc(j.lastAdvisorOpinion) + '</div>'
        : '';
      var errorsHtml = '';
      if (j.lastErrors && j.lastErrors.length > 0) {
        errorsHtml = '<div style="margin-top:8px;display:flex;flex-direction:column;gap:4px">';
        for (var e of j.lastErrors) {
          errorsHtml += '<pre style="font-size:11px;color:var(--text-secondary);background:var(--bg-tertiary);padding:6px 8px;border-radius:4px;white-space:pre-wrap;word-break:break-word;margin:0;max-height:120px;overflow-y:auto">' + esc(e) + '</pre>';
        }
        errorsHtml += '</div>';
      }
      var agentTag = j.agentSlug
        ? '<span class="badge badge-blue" style="font-size:10px">' + esc(j.agentSlug) + '</span>'
        : '';
      html += '<div style="padding:12px;border:1px solid var(--border);border-radius:8px;background:var(--bg-secondary)">'
        + '<div style="display:flex;align-items:center;gap:8px;flex-wrap:wrap">'
        + '<strong>' + esc(j.jobName) + '</strong> ' + agentTag + ' ' + breaker
        + '<span style="margin-left:auto;font-size:11px;color:var(--text-muted)">' + failureRatio + ' failed \\u00b7 last error ' + lastErrorAt + '</span>'
        + '</div>'
        + errorsHtml
        + advisorLine
        + '</div>';
    }
    html += '</div>';
    container.innerHTML = html;
  } catch(e) {
    var c = document.getElementById('panel-broken-jobs');
    if (c) c.innerHTML = '<div class="empty-state" style="color:var(--red)">Failed to load broken jobs</div>';
  }
}
|
|
16220
|
+
|
|
16144
16221
|
async function refreshPendingSkills() {
|
|
16145
16222
|
try {
|
|
16146
16223
|
var r = await apiFetch('/api/skills/pending');
|
|
@@ -87,6 +87,11 @@ export declare class CronScheduler {
|
|
|
87
87
|
private watchAgentsDir;
|
|
88
88
|
private unwatchAgentsDir;
|
|
89
89
|
reloadJobs(): void;
|
|
90
|
+
/**
|
|
91
|
+
* Wrap runLog.append so every completion also checks whether a fix
|
|
92
|
+
* verification is pending and DMs the verdict if so.
|
|
93
|
+
*/
|
|
94
|
+
private _logRun;
|
|
90
95
|
private runJob;
|
|
91
96
|
/**
|
|
92
97
|
* Log an advisor event to the events JSONL file for dashboard surfacing.
|
|
@@ -491,6 +491,9 @@ export class CronScheduler {
|
|
|
491
491
|
this.watchingAgents = false;
|
|
492
492
|
}
|
|
493
493
|
reloadJobs() {
|
|
494
|
+
// Snapshot the pre-reload job definitions so fix-verification can diff
|
|
495
|
+
// and flag any currently-failing job whose config just changed.
|
|
496
|
+
const oldJobs = this.jobs.map(j => ({ ...j }));
|
|
494
497
|
// Stop existing scheduled tasks (but NOT the file watcher)
|
|
495
498
|
for (const [name, task] of this.scheduledTasks) {
|
|
496
499
|
task.stop();
|
|
@@ -580,6 +583,30 @@ export class CronScheduler {
|
|
|
580
583
|
logger.info(`Cron job '${def.name}' scheduled: ${def.schedule} (${SYSTEM_TIMEZONE})`);
|
|
581
584
|
}
|
|
582
585
|
}
|
|
586
|
+
// Fix-verification: detect any currently-failing job whose definition just
|
|
587
|
+
// changed, and record a pending verification for their next run.
|
|
588
|
+
// Skipped on the first load (oldJobs empty) since there's no edit to verify.
|
|
589
|
+
if (oldJobs.length > 0) {
|
|
590
|
+
import('./fix-verification.js').then(({ recordEditsForFailingJobs }) => {
|
|
591
|
+
try {
|
|
592
|
+
recordEditsForFailingJobs(oldJobs, this.jobs);
|
|
593
|
+
}
|
|
594
|
+
catch (err) {
|
|
595
|
+
logger.warn({ err }, 'Fix-verification capture failed');
|
|
596
|
+
}
|
|
597
|
+
}).catch(err => logger.warn({ err }, 'Fix-verification import failed'));
|
|
598
|
+
}
|
|
599
|
+
}
|
|
600
|
+
/**
|
|
601
|
+
* Wrap runLog.append so every completion also checks whether a fix
|
|
602
|
+
* verification is pending and DMs the verdict if so.
|
|
603
|
+
*/
|
|
604
|
+
_logRun(entry) {
|
|
605
|
+
this.runLog.append(entry);
|
|
606
|
+
import('./fix-verification.js').then(({ checkAndDeliverVerification }) => {
|
|
607
|
+
checkAndDeliverVerification(entry, (text) => this.dispatcher.send(text, {}))
|
|
608
|
+
.catch(err => logger.warn({ err, job: entry.jobName }, 'Fix verification DM failed'));
|
|
609
|
+
}).catch(err => logger.warn({ err }, 'Fix-verification import failed'));
|
|
583
610
|
}
|
|
584
611
|
async runJob(job) {
|
|
585
612
|
// Agent status check — skip if agent is paused/terminated
|
|
@@ -649,7 +676,7 @@ export class CronScheduler {
|
|
|
649
676
|
// Non-zero exit or timeout → skip the job
|
|
650
677
|
const exitCode = preCheckErr.status ?? 1;
|
|
651
678
|
logger.info({ job: job.name, exitCode }, 'Pre-check failed — skipping job (no work to do)');
|
|
652
|
-
this.
|
|
679
|
+
this._logRun({
|
|
653
680
|
jobName: job.name,
|
|
654
681
|
startedAt: new Date().toISOString(),
|
|
655
682
|
finishedAt: new Date().toISOString(),
|
|
@@ -690,7 +717,7 @@ export class CronScheduler {
|
|
|
690
717
|
});
|
|
691
718
|
if (!approved) {
|
|
692
719
|
logger.info({ job: job.name }, 'Cron job skipped by owner');
|
|
693
|
-
this.
|
|
720
|
+
this._logRun({
|
|
694
721
|
jobName: job.name,
|
|
695
722
|
startedAt: new Date().toISOString(),
|
|
696
723
|
finishedAt: new Date().toISOString(),
|
|
@@ -709,7 +736,7 @@ export class CronScheduler {
|
|
|
709
736
|
const advice = getExecutionAdvice(job.name, job);
|
|
710
737
|
if (advice.shouldSkip) {
|
|
711
738
|
logger.info({ job: job.name, reason: advice.skipReason }, 'Execution advisor: circuit breaker — skipping job');
|
|
712
|
-
this.
|
|
739
|
+
this._logRun({
|
|
713
740
|
jobName: job.name,
|
|
714
741
|
startedAt: new Date().toISOString(),
|
|
715
742
|
finishedAt: new Date().toISOString(),
|
|
@@ -876,7 +903,7 @@ export class CronScheduler {
|
|
|
876
903
|
this.gateway.injectContext(`discord:user:${DISCORD_OWNER_ID}`, `[Scheduled cron: ${job.name}]`, response);
|
|
877
904
|
}
|
|
878
905
|
}
|
|
879
|
-
this.
|
|
906
|
+
this._logRun(entry);
|
|
880
907
|
// Fire-and-forget: extract procedural skill from successful long-running cron jobs
|
|
881
908
|
if (entry.status === 'ok' && entry.durationMs > 30_000 && response && response.length > 500) {
|
|
882
909
|
this.gateway.extractCronSkill(job.name, job.prompt, response, entry.durationMs, job.agentSlug)
|
|
@@ -902,7 +929,7 @@ export class CronScheduler {
|
|
|
902
929
|
const errorType = errTerminalReason
|
|
903
930
|
? classifyTerminalReason(errTerminalReason)
|
|
904
931
|
: classifyError(err);
|
|
905
|
-
this.
|
|
932
|
+
this._logRun({
|
|
906
933
|
jobName: job.name,
|
|
907
934
|
startedAt: startedAt.toISOString(),
|
|
908
935
|
finishedAt: finishedAt.toISOString(),
|
|
@@ -0,0 +1,40 @@
|
|
|
1
|
+
/**
|
|
2
|
+
* Clementine TypeScript — Cron failure monitor.
|
|
3
|
+
*
|
|
4
|
+
* Surfaces cron jobs that have been failing repeatedly so they don't sit
|
|
5
|
+
* silently broken (which is what happened to ross-the-sdr:reply-detection —
|
|
6
|
+
* the existing circuit breaker fired ONCE at consErrors=5 and then went
|
|
7
|
+
* quiet for days).
|
|
8
|
+
*
|
|
9
|
+
* Threshold: a job is "broken" if either
|
|
10
|
+
* - it has >= 3 failed runs (error, retried, or an `ok` run whose output self-reports failure) in the last 48h, OR
|
|
11
|
+
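* - its 2 most recent non-skipped runs both failed (even outside the 48h window), OR
* - the circuit breaker is currently engaged for it (no later recovery event), however long ago it engaged.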
|
|
12
|
+
*
|
|
13
|
+
* Per-job 24h cooldown prevents re-spamming the owner with the same news.
|
|
14
|
+
*
|
|
15
|
+
* Read-only with respect to the cron run logs and advisor events; mutates
|
|
16
|
+
* only its own state file (cron/failure-monitor.json).
|
|
17
|
+
*/
|
|
18
|
+
/** One entry of the broken-jobs report returned by computeBrokenJobs. */
export interface BrokenJob {
    /** Job name as recorded in the run log; 'self-improve' for the self-improve pseudo-job. */
    jobName: string;
    /** Part of jobName before the first ':' when the name contains one; otherwise undefined. */
    agentSlug?: string;
    /** Number of failed runs in the lookback window (for 'self-improve': errored experiments in its own lookback). */
    errorCount48h: number;
    /** Number of runs of any status in the lookback window (for 'self-improve': experiments in its own lookback). */
    totalRuns48h: number;
    /** startedAt of the most recent failing entry, or null when none is known. */
    lastErrorAt: string | null;
    /** Up to 3 recent error messages, newest first, each truncated to 400 characters. */
    lastErrors: string[];
    /** Timestamp of the circuit-breaker engagement when it is still engaged; null otherwise. */
    circuitBreakerEngagedAt: string | null;
    /** Most recent advisor event rendered as "type: detail" (or a synthesized diagnosis for 'self-improve'); null if none. */
    lastAdvisorOpinion: string | null;
}
|
|
28
|
+
/**
|
|
29
|
+
* Compute the current set of broken jobs by scanning all run logs.
|
|
30
|
+
* Pure function (state-free) — used both by the monitor sweep and the dashboard endpoint.
|
|
31
|
+
*/
|
|
32
|
+
export declare function computeBrokenJobs(now?: number): BrokenJob[];
|
|
33
|
+
/**
|
|
34
|
+
* Run a sweep: identify currently-broken jobs, pick the ones we haven't
|
|
35
|
+
* notified about recently, and dispatch one consolidated DM.
|
|
36
|
+
*
|
|
37
|
+
* Returns the jobs that triggered a fresh notification (mostly for tests/logs).
|
|
38
|
+
*/
|
|
39
|
+
export declare function runFailureSweep(send: (text: string) => Promise<unknown>, now?: number): Promise<BrokenJob[]>;
|
|
40
|
+
//# sourceMappingURL=failure-monitor.d.ts.map
|
|
@@ -0,0 +1,416 @@
|
|
|
1
|
+
/**
|
|
2
|
+
* Clementine TypeScript — Cron failure monitor.
|
|
3
|
+
*
|
|
4
|
+
* Surfaces cron jobs that have been failing repeatedly so they don't sit
|
|
5
|
+
* silently broken (which is what happened to ross-the-sdr:reply-detection —
|
|
6
|
+
* the existing circuit breaker fired ONCE at consErrors=5 and then went
|
|
7
|
+
* quiet for days).
|
|
8
|
+
*
|
|
9
|
+
* Threshold: a job is "broken" if either
|
|
10
|
+
* - it has >= 3 failed runs (error, retried, or an `ok` run whose output self-reports failure) in the last 48h, OR
|
|
11
|
+
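* - its 2 most recent non-skipped runs both failed (even outside the 48h window), OR
* - the circuit breaker is currently engaged for it (no later recovery event), however long ago it engaged.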
|
|
12
|
+
*
|
|
13
|
+
* Per-job 24h cooldown prevents re-spamming the owner with the same news.
|
|
14
|
+
*
|
|
15
|
+
* Read-only with respect to the cron run logs and advisor events; mutates
|
|
16
|
+
* only its own state file (cron/failure-monitor.json).
|
|
17
|
+
*/
|
|
18
|
+
import { appendFileSync, existsSync, mkdirSync, readFileSync, readdirSync, writeFileSync, } from 'node:fs';
|
|
19
|
+
import path from 'node:path';
|
|
20
|
+
import pino from 'pino';
|
|
21
|
+
import { BASE_DIR } from '../config.js';
|
|
22
|
+
const logger = pino({ name: 'clementine.failure-monitor' });
|
|
23
|
+
const RUNS_DIR = path.join(BASE_DIR, 'cron', 'runs');
|
|
24
|
+
const ADVISOR_EVENTS_FILE = path.join(BASE_DIR, 'cron', 'advisor-events.jsonl');
|
|
25
|
+
const STATE_FILE = path.join(BASE_DIR, 'cron', 'failure-monitor.json');
|
|
26
|
+
const SELF_IMPROVE_STATE_FILE = path.join(BASE_DIR, 'self-improve', 'state.json');
|
|
27
|
+
const SELF_IMPROVE_LOG_FILE = path.join(BASE_DIR, 'self-improve', 'experiment-log.jsonl');
|
|
28
|
+
/** A job is broken if it crosses any of these thresholds in the lookback window. */
|
|
29
|
+
const ERRORS_IN_WINDOW = 3;
|
|
30
|
+
const WINDOW_HOURS = 48;
|
|
31
|
+
/**
|
|
32
|
+
* Independent of the window — a job whose last N runs are all failures is
|
|
33
|
+
* broken even if they're spread over days (daily cron jobs can't accumulate
|
|
34
|
+
* 3 failures in 48h, but 2 consecutive BLOCKED days is still broken).
|
|
35
|
+
*/
|
|
36
|
+
const CONSECUTIVE_FAILURES = 2;
|
|
37
|
+
/** Don't re-DM the owner about the same broken job within this window. */
|
|
38
|
+
const NOTIFY_COOLDOWN_HOURS = 24;
|
|
39
|
+
/**
 * Read the monitor's persisted notification state.
 * Falls back to an empty `{ notified: {} }` record when the state file is
 * absent, unreadable, or not valid JSON.
 */
function loadState() {
    const emptyState = () => ({ notified: {} });
    if (!existsSync(STATE_FILE))
        return emptyState();
    try {
        const { notified } = JSON.parse(readFileSync(STATE_FILE, 'utf-8'));
        return { notified: notified ?? {} };
    }
    catch {
        return emptyState();
    }
}
|
|
50
|
+
/**
 * Persist the monitor state as pretty-printed JSON, creating the parent
 * directory first. Best-effort: any failure is logged as a warning and
 * never thrown to the caller.
 */
function saveState(state) {
    try {
        const targetDir = path.dirname(STATE_FILE);
        mkdirSync(targetDir, { recursive: true });
        const serialized = JSON.stringify(state, null, 2);
        writeFileSync(STATE_FILE, serialized);
    }
    catch (err) {
        logger.warn({ err }, 'Failed to persist failure-monitor state');
    }
}
|
|
59
|
+
/**
 * Parse a JSONL run log into an array of entries.
 * Blank lines, lines that are not valid JSON, and literal `null` lines are
 * dropped; an unreadable file yields an empty array.
 */
function readRunLog(filePath) {
    let contents;
    try {
        contents = readFileSync(filePath, 'utf-8');
    }
    catch {
        return [];
    }
    const parsed = [];
    for (const rawLine of contents.trim().split('\n')) {
        if (!rawLine)
            continue;
        let record;
        try {
            record = JSON.parse(rawLine);
        }
        catch {
            continue; // skip malformed line
        }
        if (record !== null)
            parsed.push(record);
    }
    return parsed;
}
|
|
79
|
+
/**
 * A run counts as a failure when the scheduler recorded it as `error` or
 * `retried`, or when it was recorded `ok` but its output self-reports failure.
 */
function isFailure(entry) {
    switch (entry.status) {
        case 'error':
        case 'retried':
            return true;
        default:
            return isSemanticFailure(entry);
    }
}
/**
 * Detect a "semantic failure": a run with status `ok` whose output preview
 * contains an explicit block/failure marker.
 *
 * Only explicit markers are used; an earlier duration-vs-output heuristic
 * produced too many false positives on legitimately quiet jobs
 * (healthchecks, inbox probes with nothing to report).
 *
 * Markers come from failure modes observed in real cron jobs
 * ("BLOCKED (no local bash access)") plus generic agent self-reports.
 */
function isSemanticFailure(entry) {
    if (entry.status !== 'ok')
        return false;
    const normalized = (entry.outputPreview ?? '').trim().toLowerCase();
    if (normalized === '')
        return false;
    // Word boundaries keep "Result: BLOCKED" matching while a stray
    // "blockedBy" JSON key (lower-cased to "blockedby") does not.
    const failureMarkers = [
        /\b(blocked|task_blocked|task_incomplete)\b/,
        /\b(failed|could not|unable to|no local bash|permission denied)\b/,
        /__nothing__/,
    ];
    return failureMarkers.some(marker => marker.test(normalized));
}
|
|
114
|
+
/**
 * Replay the whole advisor event log (not just the 48h window) for one job.
 *
 * An engaged breaker with no later recovery marks the job as broken even if
 * it fired long ago: while engaged, the job stops running and produces no
 * new failure entries.
 *
 * @param jobName job whose advisor events should be replayed
 * @returns `engagedAt` - timestamp of the latest `circuit-breaker` event that
 *   was not followed by `circuit-recovery` or `auto-disabled` (else null);
 *   `lastOpinion` - the job's most recent event as "type: detail" (else null).
 */
function lastCircuitBreakerEvent(jobName) {
    const summary = { engagedAt: null, lastOpinion: null };
    if (!existsSync(ADVISOR_EVENTS_FILE))
        return summary;
    let rows;
    try {
        rows = readFileSync(ADVISOR_EVENTS_FILE, 'utf-8').trim().split('\n');
    }
    catch {
        // non-fatal: treat an unreadable log as "no events"
        return summary;
    }
    for (const row of rows) {
        try {
            const event = JSON.parse(row);
            if (event.jobName !== jobName)
                continue;
            // Every event for this job, whatever its type, becomes the latest opinion.
            summary.lastOpinion = `${event.type}: ${event.detail}`;
            switch (event.type) {
                case 'circuit-breaker':
                    summary.engagedAt = event.timestamp;
                    break;
                case 'circuit-recovery':
                case 'auto-disabled':
                    summary.engagedAt = null;
                    break;
            }
        }
        catch { /* skip malformed */ }
    }
    return summary;
}
|
|
148
|
+
/**
 * Compute the current set of broken jobs by scanning every `*.jsonl` run log
 * in the runs directory, plus the self-improve pseudo-job.
 *
 * Does not touch the monitor's own state file; used both by the monitor
 * sweep and the dashboard endpoint.
 *
 * A job is reported when any of these hold:
 *   - at least ERRORS_IN_WINDOW failed runs inside the WINDOW_HOURS window;
 *   - its most recent CONSECUTIVE_FAILURES non-skipped runs all failed;
 *   - its circuit breaker is engaged with no later recovery.
 * Jobs whose last run is older than 7 days are skipped unless the breaker
 * is still engaged.
 *
 * @param now reference time in epoch milliseconds (defaults to Date.now())
 * @returns broken jobs, most recently failing first
 */
export function computeBrokenJobs(now = Date.now()) {
    if (!existsSync(RUNS_DIR))
        return [];
    const sinceMs = now - WINDOW_HOURS * 60 * 60 * 1000;
    const dormantCutoffMs = now - 7 * 24 * 60 * 60 * 1000;
    const broken = [];
    let files = [];
    try {
        files = readdirSync(RUNS_DIR).filter(f => f.endsWith('.jsonl'));
    }
    catch {
        return [];
    }
    for (const file of files) {
        const entries = readRunLog(path.join(RUNS_DIR, file));
        if (entries.length === 0)
            continue;
        // Use the first entry that carries a usable name. A log whose entries
        // lack jobName previously crashed the whole report further down
        // (jobName.includes on undefined); fall back to the file's base name.
        const named = entries.find(e => typeof e?.jobName === 'string' && e.jobName !== '');
        const jobName = named ? named.jobName : path.basename(file, '.jsonl');
        // Skip dormant jobs — if the last run is >7 days old the job is
        // probably removed or renamed and its historical failures aren't
        // actionable. An engaged breaker still counts, because an engaged
        // breaker is itself "the job stopped running".
        const lastEntry = entries[entries.length - 1];
        const lastRunMs = Date.parse(lastEntry.startedAt);
        const cb = lastCircuitBreakerEvent(jobName);
        if (!cb.engagedAt && Number.isFinite(lastRunMs) && lastRunMs < dormantCutoffMs) {
            continue;
        }
        const inWindow = entries.filter(e => {
            const ts = Date.parse(e.startedAt);
            return Number.isFinite(ts) && ts >= sinceMs;
        });
        const failures = inWindow.filter(isFailure);
        // Consecutive-failure signal: walk back from the newest entry and stop
        // at the first non-failure. 'skipped' entries are ignored (neither
        // signal). Catches daily jobs that fail every run without ever
        // accumulating 3 failures in a 48h window.
        let consecutiveFailures = 0;
        for (let i = entries.length - 1; i >= 0; i--) {
            const e = entries[i];
            if (e.status === 'skipped')
                continue;
            if (isFailure(e))
                consecutiveFailures++;
            else
                break;
        }
        const meetsThreshold = failures.length >= ERRORS_IN_WINDOW
            || consecutiveFailures >= CONSECUTIVE_FAILURES
            || !!cb.engagedAt;
        if (!meetsThreshold)
            continue;
        // Gather up to 3 distinct error messages, newest first. Prefer in-window
        // errors; if there are none (e.g. breaker engaged, no recent runs), fall
        // back to the most recent errors anywhere in the log.
        const errSource = failures.length > 0
            ? failures
            : entries.filter(isFailure);
        const distinctErrors = [];
        const seen = new Set();
        for (let i = errSource.length - 1; i >= 0 && distinctErrors.length < 3; i--) {
            const rawErr = errSource[i].error;
            // A non-string error field (object, number) must not throw on .trim().
            const err = (typeof rawErr === 'string'
                ? rawErr
                : rawErr == null ? '' : JSON.stringify(rawErr)).trim();
            if (!err)
                continue;
            // De-duplicate on a prefix so messages differing only in a long tail collapse.
            const key = err.slice(0, 120);
            if (seen.has(key))
                continue;
            seen.add(key);
            distinctErrors.push(err.slice(0, 400));
        }
        const lastFailureEntry = failures[failures.length - 1] ?? errSource[errSource.length - 1] ?? null;
        const agentSlug = jobName.includes(':') ? jobName.split(':')[0] : undefined;
        broken.push({
            jobName,
            agentSlug,
            errorCount48h: failures.length,
            totalRuns48h: inWindow.length,
            lastErrorAt: lastFailureEntry?.startedAt ?? null,
            lastErrors: distinctErrors,
            circuitBreakerEngagedAt: cb.engagedAt,
            lastAdvisorOpinion: cb.lastOpinion,
        });
    }
    // Also check the self-improve loop — it has its own log (not cron/runs/).
    const siBroken = detectSelfImproveBreakage(now);
    if (siBroken)
        broken.push(siBroken);
    // Most recently failing first. Missing or unparseable timestamps sort as 0
    // so the comparator never returns NaN.
    const sortKey = (job) => {
        const ts = job.lastErrorAt ? Date.parse(job.lastErrorAt) : 0;
        return Number.isFinite(ts) ? ts : 0;
    };
    broken.sort((a, b) => sortKey(b) - sortKey(a));
    return broken;
}
|
|
249
|
+
/**
 * Detect breakage of the self-improve loop, which writes to its own
 * experiment-log.jsonl rather than cron/runs/.
 *
 * Three break modes are recognised:
 *   a. state.infraError is set (the loop detected an unfixable infra issue);
 *   b. there are 3+ experiments in the 7-day lookback and every one errored
 *      (denied with a reason starting with "Error");
 *   c. the loop ran within the last 48h but the last 10 logged experiments
 *      contain none from the 7-day lookback (silent early exit).
 *
 * @param now reference time in epoch milliseconds
 * @returns a synthetic BrokenJob named 'self-improve', or null when the loop
 *   looks healthy or there is no readable state.
 */
function detectSelfImproveBreakage(now) {
    if (!existsSync(SELF_IMPROVE_STATE_FILE))
        return null;
    let state;
    try {
        state = JSON.parse(readFileSync(SELF_IMPROVE_STATE_FILE, 'utf-8'));
    }
    catch {
        return null;
    }
    // A state file holding `null` or a primitive carries no usable data.
    if (state === null || typeof state !== 'object')
        return null;
    const experiments = [];
    if (existsSync(SELF_IMPROVE_LOG_FILE)) {
        try {
            const lines = readFileSync(SELF_IMPROVE_LOG_FILE, 'utf-8').trim().split('\n').filter(Boolean);
            // Only the tail of the log matters for "recent" health.
            for (const line of lines.slice(-10)) {
                try {
                    const record = JSON.parse(line);
                    // Skip `null`/primitive lines; property reads below assume objects.
                    if (record !== null && typeof record === 'object')
                        experiments.push(record);
                }
                catch { /* skip malformed line */ }
            }
        }
        catch { /* non-fatal: treat an unreadable log as empty */ }
    }
    const lastRunMs = state.lastRunAt ? Date.parse(state.lastRunAt) : 0;
    const lookback48h = now - 48 * 60 * 60 * 1000;
    const staleLookback = now - 7 * 24 * 60 * 60 * 1000; // 7 days
    const recentExperiments = experiments.filter(e => {
        const ts = e.startedAt ? Date.parse(e.startedAt) : 0;
        return Number.isFinite(ts) && ts >= staleLookback;
    });
    // An experiment "errored" when it was denied with an Error... reason; a
    // plain denial (e.g. a rejected proposal) is not a loop failure.
    const isErrored = (e) => e.approvalStatus === 'denied'
        && typeof e.reason === 'string'
        && e.reason.startsWith('Error');
    const recentErrors = recentExperiments.filter(isErrored);
    const hasInfraError = !!state.infraError;
    // Uses the same error definition as recentErrors so the "N/M recent
    // iterations errored" opinion below can never read "0/3".
    const allRecentErrored = recentExperiments.length >= 3
        && recentErrors.length === recentExperiments.length;
    const silentEarlyExit = lastRunMs > lookback48h
        && recentExperiments.length === 0;
    if (!hasInfraError && !allRecentErrored && !silentEarlyExit)
        return null;
    const lastErrors = [];
    for (let i = experiments.length - 1; i >= 0 && lastErrors.length < 3; i--) {
        const rawErr = experiments[i].error;
        const err = (typeof rawErr === 'string'
            ? rawErr
            : rawErr == null ? '' : JSON.stringify(rawErr)).trim();
        if (!err)
            continue;
        lastErrors.push(err.slice(0, 400));
    }
    // If we don't have an explicit infraError but the last recorded error
    // looks schema-related, surface it — this captures the state where all
    // iterations died with the same API 400 but state.infraError never got
    // persisted.
    const lastExperiment = experiments[experiments.length - 1];
    const lastLoggedError = typeof lastExperiment?.error === 'string' ? lastExperiment.error : '';
    const inferredInfraSchema = /input_schema|tools\.\d+\.custom/i.test(lastLoggedError);
    let opinion;
    if (hasInfraError) {
        // Either field may be missing from a hand-edited or older state file.
        const category = String(state.infraError.category ?? 'unknown');
        const diagnostic = String(state.infraError.diagnostic ?? '').slice(0, 200);
        opinion = `infra: ${category} — ${diagnostic}`;
    }
    else if (silentEarlyExit && inferredInfraSchema) {
        opinion = 'loop ran but produced no experiments — last logged error was an MCP tool schema validation (API 400). Check external MCP servers (claude_desktop_config.json, Claude Code settings) for a recently-updated package exposing a malformed input_schema.';
    }
    else if (silentEarlyExit) {
        opinion = 'loop ran but produced no experiments — likely crashing before iteration (check metrics gathering or hypothesis generation)';
    }
    else {
        opinion = `${recentErrors.length}/${recentExperiments.length} recent iterations errored`;
    }
    return {
        jobName: 'self-improve',
        agentSlug: undefined,
        errorCount48h: recentErrors.length,
        totalRuns48h: recentExperiments.length,
        lastErrorAt: lastExperiment?.startedAt ?? state.lastRunAt ?? null,
        lastErrors,
        // An infra error is reported like an engaged breaker, dated at the last run.
        circuitBreakerEngagedAt: hasInfraError ? state.lastRunAt ?? null : null,
        lastAdvisorOpinion: opinion,
    };
}
|
|
337
|
+
/**
 * Render the consolidated owner DM for a set of broken jobs.
 *
 * Layout: a headline with the job count and the monitoring window, a blank
 * line, one bullet per job (optionally followed by a one-line preview of the
 * first recorded error and the advisor opinion, each capped at 140 chars),
 * a blank line, and a pointer to the dashboard.
 *
 * @param {Array<object>} jobs - Broken-job records; each is read for jobName,
 *   errorCount48h, totalRuns48h, circuitBreakerEngagedAt, lastErrors and
 *   lastAdvisorOpinion.
 * @returns {string} The newline-joined report text.
 */
function formatReport(jobs) {
    const plural = jobs.length === 1 ? '' : 's';
    const headline = `🚨 **${jobs.length} cron job${plural} repeatedly failing** (last ${WINDOW_HOURS}h)`;
    const perJob = jobs.flatMap((job) => {
        const breakerNote = job.circuitBreakerEngagedAt ? ' · circuit breaker engaged' : '';
        const section = [`• \`${job.jobName}\` — ${job.errorCount48h}/${job.totalRuns48h} runs failed${breakerNote}`];
        if (job.lastErrors.length > 0) {
            // Only the first line of the first recorded error, to keep the DM compact.
            const firstLine = job.lastErrors[0].split('\n')[0];
            section.push(` Last error: ${firstLine.slice(0, 140)}`);
        }
        if (job.lastAdvisorOpinion) {
            section.push(` Advisor: ${job.lastAdvisorOpinion.slice(0, 140)}`);
        }
        return section;
    });
    return [
        headline,
        '',
        ...perJob,
        '',
        'Open the dashboard → Broken Jobs panel for the full picture.',
    ].join('\n');
}
|
|
357
|
+
/**
 * Run a sweep: identify currently-broken jobs, pick the ones we haven't
 * notified about recently, and dispatch one consolidated DM.
 *
 * Cooldown entries for jobs that are no longer broken are pruned on every
 * sweep — not only when nothing at all is broken — so a job that recovers
 * and later relapses notifies promptly even while other jobs keep failing.
 *
 * @param {(text: string) => Promise<unknown>} send - Delivers the report text to the owner.
 * @param {number} [now=Date.now()] - Sweep time in epoch milliseconds (injectable for tests).
 * @returns {Promise<Array<object>>} The jobs selected for a fresh notification
 *   (mostly for tests/logs). The list is returned even if dispatch failed; in
 *   that case no cooldown is recorded, so the next sweep retries.
 */
export async function runFailureSweep(send, now = Date.now()) {
    const broken = computeBrokenJobs(now);
    const state = loadState();
    // Clear cooldowns for jobs that recovered so future failures notify promptly.
    const brokenNames = new Set(broken.map(b => b.jobName));
    let pruned = false;
    for (const name of Object.keys(state.notified)) {
        if (!brokenNames.has(name)) {
            delete state.notified[name];
            pruned = true;
        }
    }
    if (pruned)
        saveState(state);
    const cooldownMs = NOTIFY_COOLDOWN_HOURS * 60 * 60 * 1000;
    const fresh = broken.filter((job) => {
        const prev = state.notified[job.jobName];
        // An unparseable timestamp yields NaN, which fails the comparison and
        // therefore (deliberately) counts as "not in cooldown".
        return !(prev && now - Date.parse(prev.lastNotifiedAt) < cooldownMs);
    });
    if (fresh.length === 0)
        return [];
    try {
        await send(formatReport(fresh));
        // Stamp cooldowns only after a successful send so a failed dispatch is retried.
        const stamp = new Date(now).toISOString();
        for (const job of fresh) {
            state.notified[job.jobName] = { lastNotifiedAt: stamp, lastErrorCount: job.errorCount48h };
        }
        saveState(state);
        appendAuditLog('notified', fresh.map(j => j.jobName));
        logger.info({ count: fresh.length, jobs: fresh.map(j => j.jobName) }, 'Failure monitor: notified owner');
    }
    catch (err) {
        logger.warn({ err }, 'Failure monitor: notification dispatch failed');
    }
    return fresh;
}
|
|
405
|
+
/**
 * Append one JSON line describing a monitor action to
 * `<BASE_DIR>/cron/failure-monitor.log`.
 *
 * @param {string} action - Short action label (e.g. 'notified').
 * @param {string[]} jobNames - Names of the jobs the action applied to.
 */
function appendAuditLog(action, jobNames) {
    try {
        const destination = path.join(BASE_DIR, 'cron', 'failure-monitor.log');
        const record = {
            action,
            jobs: jobNames,
            timestamp: new Date().toISOString(),
        };
        appendFileSync(destination, `${JSON.stringify(record)}\n`);
    }
    catch {
        // Non-fatal: the audit trail is best-effort and must never break a sweep.
    }
}
|
|
416
|
+
//# sourceMappingURL=failure-monitor.js.map
|
|
@@ -0,0 +1,39 @@
|
|
|
1
|
+
/**
|
|
2
|
+
* Clementine TypeScript — Cron fix verification.
|
|
3
|
+
*
|
|
4
|
+
* When a CRON.md (global or per-agent) is edited, we record a "pending
|
|
5
|
+
* verification" for any job whose definition changed AND that is currently
|
|
6
|
+
* in a failing state. After that job's next non-skipped run, we DM the
|
|
7
|
+
* owner with the verdict — succeeded or still failing — so a self-reported
|
|
8
|
+
* "fix" can't go untested again.
|
|
9
|
+
*/
|
|
10
|
+
import type { CronJobDefinition, CronRunEntry } from '../types.js';
|
|
11
|
+
/**
 * A recorded edit to a failing cron job that is awaiting a verdict from the
 * job's next non-skipped run.
 *
 * - jobName: name of the edited job.
 * - recordedAt: ISO-8601 timestamp taken when the edit was recorded.
 * - preFailureCount: the failure monitor's error count for the job at record time.
 * - preLastError: first entry of the monitor's recent-error list at record
 *   time, or null when that list was empty.
 */
interface PendingVerification {
|
|
12
|
+
jobName: string;
|
|
13
|
+
recordedAt: string;
|
|
14
|
+
preFailureCount: number;
|
|
15
|
+
preLastError: string | null;
|
|
16
|
+
}
|
|
17
|
+
/**
|
|
18
|
+
* Compare an old and new jobs list and record verifications for any job that:
|
|
19
|
+
* - exists in both lists (new jobs aren't "fixes" of existing problems)
|
|
20
|
+
* - has its definition hash changed
|
|
21
|
+
* - is currently in a failing state per failure-monitor
|
|
22
|
+
*
|
|
23
|
+
* Disabled jobs and removed jobs are tracked too: if a previously failing
|
|
24
|
+
* job gets disabled or removed in the edit, we surface that as a "removed
|
|
25
|
+
* pending verification" rather than waiting for a run that will never come.
|
|
26
|
+
*/
|
|
27
|
+
export declare function recordEditsForFailingJobs(oldJobs: CronJobDefinition[], newJobs: CronJobDefinition[]): void;
|
|
28
|
+
/**
|
|
29
|
+
* After a cron run completes, check whether we were waiting on a fix
|
|
30
|
+
* verification for this job. If so, send the owner a verdict and clear it.
|
|
31
|
+
*
|
|
32
|
+
* Skipped runs (circuit breaker, pre-check exit, etc.) don't carry signal
|
|
33
|
+
* and shouldn't count as a verdict either way.
|
|
34
|
+
*/
|
|
35
|
+
export declare function checkAndDeliverVerification(entry: CronRunEntry, send: (text: string) => Promise<unknown>): Promise<void>;
|
|
36
|
+
/** Read-only accessor for dashboards or debugging. */
|
|
37
|
+
export declare function listPendingVerifications(): PendingVerification[];
|
|
38
|
+
export {};
|
|
39
|
+
//# sourceMappingURL=fix-verification.d.ts.map
|
|
@@ -0,0 +1,144 @@
|
|
|
1
|
+
/**
|
|
2
|
+
* Clementine TypeScript — Cron fix verification.
|
|
3
|
+
*
|
|
4
|
+
* When a CRON.md (global or per-agent) is edited, we record a "pending
|
|
5
|
+
* verification" for any job whose definition changed AND that is currently
|
|
6
|
+
* in a failing state. After that job's next non-skipped run, we DM the
|
|
7
|
+
* owner with the verdict — succeeded or still failing — so a self-reported
|
|
8
|
+
* "fix" can't go untested again.
|
|
9
|
+
*/
|
|
10
|
+
import { existsSync, mkdirSync, readFileSync, writeFileSync, } from 'node:fs';
|
|
11
|
+
import path from 'node:path';
|
|
12
|
+
import crypto from 'node:crypto';
|
|
13
|
+
import pino from 'pino';
|
|
14
|
+
import { BASE_DIR } from '../config.js';
|
|
15
|
+
import { computeBrokenJobs } from './failure-monitor.js';
|
|
16
|
+
const logger = pino({ name: 'clementine.fix-verification' });
|
|
17
|
+
const STATE_FILE = path.join(BASE_DIR, 'cron', 'fix-verifications.json');
|
|
18
|
+
/**
 * Load the persisted fix-verification state from STATE_FILE.
 *
 * Never throws: a missing file, unreadable file, invalid JSON, or a
 * `pending` value that is not a plain object all yield an empty state.
 * Rejecting malformed `pending` values (arrays, strings, numbers) keeps
 * later keyed reads/writes on `state.pending` from misbehaving.
 *
 * @returns {{ pending: Record<string, object> }} Pending verifications keyed by job name.
 */
function loadState() {
    try {
        if (!existsSync(STATE_FILE))
            return { pending: {} };
        const raw = JSON.parse(readFileSync(STATE_FILE, 'utf-8'));
        const pending = raw?.pending;
        const isPlainObject = pending !== null
            && typeof pending === 'object'
            && !Array.isArray(pending);
        return { pending: isPlainObject ? pending : {} };
    }
    catch (err) {
        // Best-effort: fall back to an empty state, but leave a trace so a
        // corrupted state file doesn't silently drop pending verifications.
        logger.warn({ err }, 'Failed to read fix-verification state — starting empty');
        return { pending: {} };
    }
}
|
|
29
|
+
/**
 * Persist the fix-verification state to STATE_FILE as pretty-printed JSON,
 * creating the parent directory if needed.
 *
 * Failures are logged at warn level and otherwise ignored.
 *
 * @param {{ pending: Record<string, object> }} state - State to write.
 */
function saveState(state) {
    try {
        const parentDir = path.dirname(STATE_FILE);
        mkdirSync(parentDir, { recursive: true });
        const serialized = JSON.stringify(state, null, 2);
        writeFileSync(STATE_FILE, serialized);
    }
    catch (err) {
        logger.warn({ err }, 'Failed to persist fix-verification state');
    }
}
|
|
38
|
+
/**
 * Fingerprint the parts of a job definition that a "fix" could plausibly
 * touch: schedule, prompt, tier, maxTurns, model, workDir, mode, maxHours,
 * preCheck and successCriteria. `enabled` is intentionally left out —
 * disabling a job is not a fix.
 *
 * @param {object} j - Cron job definition.
 * @returns {string} First 12 hex characters of the SHA-1 of the selected fields.
 */
function jobHash(j) {
    // Order matters: it determines the serialized form and therefore the hash.
    const hashedFields = [
        'schedule',
        'prompt',
        'tier',
        'maxTurns',
        'model',
        'workDir',
        'mode',
        'maxHours',
        'preCheck',
        'successCriteria',
    ];
    const snapshot = {};
    for (const field of hashedFields) {
        snapshot[field] = j[field];
    }
    const hasher = crypto.createHash('sha1');
    hasher.update(JSON.stringify(snapshot));
    return hasher.digest('hex').slice(0, 12);
}
|
|
59
|
+
/**
 * Compare an old and new jobs list and record verifications for any job that:
 *   - exists in both lists (new jobs aren't "fixes" of existing problems)
 *   - has its definition hash changed
 *   - is currently in a failing state per failure-monitor
 *
 * A failing job that was removed or disabled by the edit will never produce
 * a verifying run, so any pending verification for it is cleared instead.
 * The state file is only rewritten when an entry was actually added,
 * replaced, or deleted.
 *
 * @param {Array<object>} oldJobs - Job definitions before the edit.
 * @param {Array<object>} newJobs - Job definitions after the edit.
 * @returns {void}
 */
export function recordEditsForFailingJobs(oldJobs, newJobs) {
    const oldByName = new Map(oldJobs.map(j => [j.name, j]));
    const newByName = new Map(newJobs.map(j => [j.name, j]));
    const brokenByName = new Map(computeBrokenJobs().map(b => [b.jobName, b]));
    const state = loadState();
    const stamp = new Date().toISOString();
    let mutated = false;
    for (const [name, oj] of oldByName) {
        const b = brokenByName.get(name);
        if (!b)
            continue; // not currently broken — nothing to verify
        const nj = newByName.get(name);
        if (!nj || !nj.enabled) {
            // Removed or disabled: don't wait for a run that will never come.
            // Only mark the state dirty if there was something to clear.
            if (Object.prototype.hasOwnProperty.call(state.pending, name)) {
                delete state.pending[name];
                mutated = true;
            }
            logger.info({ job: name }, nj
                ? 'Failing job disabled in CRON.md — verification cleared'
                : 'Failing job removed from CRON.md — verification cleared');
            continue;
        }
        if (jobHash(oj) === jobHash(nj))
            continue; // no relevant changes
        state.pending[name] = {
            jobName: name,
            recordedAt: stamp,
            preFailureCount: b.errorCount48h,
            preLastError: b.lastErrors[0] ?? null,
        };
        mutated = true;
        logger.info({ job: name, preFailureCount: b.errorCount48h }, 'Recorded pending fix verification');
    }
    if (mutated)
        saveState(state);
}
|
|
110
|
+
/**
 * After a cron run completes, check whether we were waiting on a fix
 * verification for this job. If so, send the owner a verdict and clear it.
 *
 * Skipped runs (circuit breaker, pre-check exit, etc.) don't carry signal
 * and shouldn't count as a verdict either way.
 *
 * The pending entry is removed before the DM is attempted, so the verdict is
 * one-shot: a failed send is logged but not retried on later runs.
 *
 * @param {object} entry - Run-log entry; read for jobName, status and error.
 * @param {(text: string) => Promise<unknown>} send - Delivers the verdict text to the owner.
 * @returns {Promise<void>}
 */
export async function checkAndDeliverVerification(entry, send) {
    if (entry.status === 'skipped')
        return;
    const state = loadState();
    const pending = state.pending[entry.jobName];
    if (!pending)
        return;
    delete state.pending[entry.jobName];
    saveState(state);
    const ok = entry.status === 'ok';
    const verdict = ok ? '✅ succeeded' : '⚠️ still failing';
    // A corrupt or missing recordedAt parses to NaN; omit the age rather
    // than showing "NaNm later" to the owner.
    const ageMs = Date.now() - Date.parse(pending.recordedAt);
    const ageNote = Number.isFinite(ageMs)
        ? ` (${Math.max(1, Math.round(ageMs / 60000))}m later)`
        : '';
    // `||` (not `??`) so an empty error string also falls back to 'unknown';
    // String() guards against a non-string error slipping through.
    const detail = ok
        ? ''
        : `\nError: ${String(entry.error || 'unknown').split('\n')[0].slice(0, 200)}`;
    const msg = `**[Fix verification]** \`${entry.jobName}\` ${verdict} on its first run after edit${ageNote}.${detail}`;
    try {
        await send(msg);
    }
    catch (err) {
        logger.warn({ err, job: entry.jobName }, 'Failed to send fix verification DM');
    }
}
|
|
140
|
+
/**
 * Read-only accessor for dashboards or debugging.
 *
 * @returns {Array<object>} Every pending verification currently in the loaded state.
 */
export function listPendingVerifications() {
    const { pending } = loadState();
    return Object.values(pending);
}
|
|
144
|
+
//# sourceMappingURL=fix-verification.js.map
|
|
@@ -103,6 +103,13 @@ export class HeartbeatScheduler {
|
|
|
103
103
|
catch (err) {
|
|
104
104
|
logger.warn({ err }, 'Session eviction failed');
|
|
105
105
|
}
|
|
106
|
+
// Cron failure sweep — surface jobs that have been silently failing.
|
|
107
|
+
// Runs every tick; per-job 24h cooldown lives inside the monitor.
|
|
108
|
+
import('./failure-monitor.js').then(({ runFailureSweep }) => {
|
|
109
|
+
runFailureSweep((text) => this.dispatcher.send(text, {})).catch(err => {
|
|
110
|
+
logger.warn({ err }, 'Failure sweep failed');
|
|
111
|
+
});
|
|
112
|
+
}).catch(err => logger.warn({ err }, 'Failure sweep import failed'));
|
|
106
113
|
const now = new Date();
|
|
107
114
|
const hour = now.getHours();
|
|
108
115
|
// ── Nightly tasks: run regardless of active hours ─────────────────
|
|
@@ -626,10 +633,41 @@ export class HeartbeatScheduler {
|
|
|
626
633
|
const prompt = buildInsightPrompt(signals);
|
|
627
634
|
if (!prompt)
|
|
628
635
|
return;
|
|
629
|
-
// Run lightweight LLM call via gateway
|
|
630
|
-
|
|
631
|
-
|
|
632
|
-
|
|
636
|
+
// Run lightweight LLM call via gateway. Log success AND failure to the
|
|
637
|
+
// cron run log so the failure monitor can see hourly breakage.
|
|
638
|
+
// maxTurns bumped 1 → 3 because the agent needs to fan out ~4 parallel
|
|
639
|
+
// tool calls (activity_history, outlook_inbox, goal_list, task_list)
|
|
640
|
+
// before composing its rating — at 1 turn it always crashes with
|
|
641
|
+
// "Reached maximum number of turns".
|
|
642
|
+
const icStartedAt = new Date();
|
|
643
|
+
let response = null;
|
|
644
|
+
try {
|
|
645
|
+
response = await this.gateway.handleCronJob('insight-check', prompt, 1, // tier 1
|
|
646
|
+
3, // max 3 turns (parallel tool fan-out + synthesis)
|
|
647
|
+
'haiku');
|
|
648
|
+
this.runLog.append({
|
|
649
|
+
jobName: 'insight-check',
|
|
650
|
+
startedAt: icStartedAt.toISOString(),
|
|
651
|
+
finishedAt: new Date().toISOString(),
|
|
652
|
+
status: 'ok',
|
|
653
|
+
durationMs: Date.now() - icStartedAt.getTime(),
|
|
654
|
+
attempt: 1,
|
|
655
|
+
outputPreview: (response ?? '').slice(0, 200),
|
|
656
|
+
});
|
|
657
|
+
}
|
|
658
|
+
catch (err) {
|
|
659
|
+
this.runLog.append({
|
|
660
|
+
jobName: 'insight-check',
|
|
661
|
+
startedAt: icStartedAt.toISOString(),
|
|
662
|
+
finishedAt: new Date().toISOString(),
|
|
663
|
+
status: 'error',
|
|
664
|
+
durationMs: Date.now() - icStartedAt.getTime(),
|
|
665
|
+
attempt: 1,
|
|
666
|
+
error: String(err).slice(0, 400),
|
|
667
|
+
errorType: 'transient',
|
|
668
|
+
});
|
|
669
|
+
throw err;
|
|
670
|
+
}
|
|
633
671
|
if (!response)
|
|
634
672
|
return;
|
|
635
673
|
const insight = parseInsightResponse(response);
|