clementine-agent 1.0.13 → 1.0.15
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/dist/agent/assistant.js +32 -2
- package/dist/agent/self-improve.js +23 -0
- package/dist/agent/skill-extractor.d.ts +10 -0
- package/dist/agent/skill-extractor.js +61 -0
- package/dist/channels/discord-agent-bot.d.ts +4 -0
- package/dist/channels/discord-agent-bot.js +35 -0
- package/dist/channels/discord-bot-manager.d.ts +4 -0
- package/dist/channels/discord-bot-manager.js +16 -0
- package/dist/channels/discord.js +141 -0
- package/dist/channels/slack.js +51 -1
- package/dist/channels/telegram.js +28 -1
- package/dist/cli/dashboard.js +299 -5
- package/dist/gateway/cron-scheduler.d.ts +5 -0
- package/dist/gateway/cron-scheduler.js +32 -5
- package/dist/gateway/failure-monitor.d.ts +40 -0
- package/dist/gateway/failure-monitor.js +416 -0
- package/dist/gateway/fix-verification.d.ts +39 -0
- package/dist/gateway/fix-verification.js +144 -0
- package/dist/gateway/heartbeat-scheduler.js +61 -4
- package/dist/gateway/notifications.js +26 -1
- package/dist/gateway/router.js +2 -2
- package/dist/memory/store.d.ts +20 -0
- package/dist/memory/store.js +64 -0
- package/dist/types.d.ts +3 -0
- package/package.json +1 -1
|
@@ -0,0 +1,416 @@
|
|
|
1
|
+
/**
|
|
2
|
+
* Clementine TypeScript — Cron failure monitor.
|
|
3
|
+
*
|
|
4
|
+
* Surfaces cron jobs that have been failing repeatedly so they don't sit
|
|
5
|
+
* silently broken (which is what happened to ross-the-sdr:reply-detection —
|
|
6
|
+
* the existing circuit breaker fired ONCE at consErrors=5 and then went
|
|
7
|
+
* quiet for days).
|
|
8
|
+
*
|
|
9
|
+
* Threshold: a job is "broken" if either
|
|
10
|
+
* - it has >= 3 error/retried entries in the last 48h, OR
|
|
11
|
+
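* - its circuit breaker is engaged with no later recovery event (at any age, not only the last 48h); the code also flags jobs whose last 2 non-skipped runs both failed.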
|
|
12
|
+
*
|
|
13
|
+
* Per-job 24h cooldown prevents re-spamming the owner with the same news.
|
|
14
|
+
*
|
|
15
|
+
* Read-only with respect to the cron run logs and advisor events; mutates
|
|
16
|
+
* only its own state file (cron/failure-monitor.json).
|
|
17
|
+
*/
|
|
18
|
+
import { appendFileSync, existsSync, mkdirSync, readFileSync, readdirSync, writeFileSync, } from 'node:fs';
|
|
19
|
+
import path from 'node:path';
|
|
20
|
+
import pino from 'pino';
|
|
21
|
+
import { BASE_DIR } from '../config.js';
|
|
22
|
+
const logger = pino({ name: 'clementine.failure-monitor' });
|
|
23
|
+
const RUNS_DIR = path.join(BASE_DIR, 'cron', 'runs');
|
|
24
|
+
const ADVISOR_EVENTS_FILE = path.join(BASE_DIR, 'cron', 'advisor-events.jsonl');
|
|
25
|
+
const STATE_FILE = path.join(BASE_DIR, 'cron', 'failure-monitor.json');
|
|
26
|
+
const SELF_IMPROVE_STATE_FILE = path.join(BASE_DIR, 'self-improve', 'state.json');
|
|
27
|
+
const SELF_IMPROVE_LOG_FILE = path.join(BASE_DIR, 'self-improve', 'experiment-log.jsonl');
|
|
28
|
+
/** A job is broken if it crosses any of these thresholds in the lookback window. */
|
|
29
|
+
const ERRORS_IN_WINDOW = 3;
|
|
30
|
+
const WINDOW_HOURS = 48;
|
|
31
|
+
/**
|
|
32
|
+
* Independent of the window — a job whose last N runs are all failures is
|
|
33
|
+
* broken even if they're spread over days (daily cron jobs can't accumulate
|
|
34
|
+
* 3 failures in 48h, but 2 consecutive BLOCKED days is still broken).
|
|
35
|
+
*/
|
|
36
|
+
const CONSECUTIVE_FAILURES = 2;
|
|
37
|
+
/** Don't re-DM the owner about the same broken job within this window. */
|
|
38
|
+
const NOTIFY_COOLDOWN_HOURS = 24;
|
|
39
|
+
/**
 * Read the monitor's persisted notification state from STATE_FILE.
 *
 * @returns {{ notified: object }} The stored `notified` map, or an empty one
 *   when the file is missing, unreadable, not valid JSON, or has no
 *   `notified` key.
 */
function loadState() {
    let notified = {};
    try {
        if (existsSync(STATE_FILE)) {
            const parsed = JSON.parse(readFileSync(STATE_FILE, 'utf-8'));
            notified = parsed.notified ?? {};
        }
    }
    catch {
        // Any read/parse problem degrades to "nothing notified yet".
        notified = {};
    }
    return { notified };
}
|
|
50
|
+
/**
 * Persist the monitor state to STATE_FILE as pretty-printed JSON, creating
 * the parent directory if needed. Failures are logged as warnings and never
 * thrown.
 *
 * @param {object} state State object to serialize.
 */
function saveState(state) {
    try {
        const parentDir = path.dirname(STATE_FILE);
        mkdirSync(parentDir, { recursive: true });
        const payload = JSON.stringify(state, null, 2);
        writeFileSync(STATE_FILE, payload);
    }
    catch (err) {
        logger.warn({ err }, 'Failed to persist failure-monitor state');
    }
}
|
|
59
|
+
/**
 * Parse a JSON-lines run log into an array of entries.
 *
 * Lines that are empty, fail to parse, or parse to `null` are dropped.
 *
 * @param {string} filePath Path of the `.jsonl` file to read.
 * @returns {Array} Parsed entries in file order; `[]` if the file cannot be read.
 */
function readRunLog(filePath) {
    let text;
    try {
        text = readFileSync(filePath, 'utf-8');
    }
    catch {
        return [];
    }
    const parsedEntries = [];
    for (const rawLine of text.trim().split('\n')) {
        if (!rawLine)
            continue;
        let value;
        try {
            value = JSON.parse(rawLine);
        }
        catch {
            continue; // malformed line — skip it
        }
        if (value !== null)
            parsedEntries.push(value);
    }
    return parsedEntries;
}
|
|
79
|
+
/**
 * Decide whether a run-log entry counts as a failure: either its status is
 * `error` / `retried`, or isSemanticFailure() flags it.
 *
 * @param {object} entry Run-log entry.
 * @returns {boolean}
 */
function isFailure(entry) {
    if (entry.status === 'error' || entry.status === 'retried') {
        return true;
    }
    return isSemanticFailure(entry);
}
|
|
82
|
+
/**
 * Detect a "semantic failure": a run recorded with status `ok` whose
 * `outputPreview` nevertheless contains an explicit block/failure marker.
 *
 * Only entries with status `ok` and a non-blank preview are examined. The
 * preview is lower-cased and tested against a fixed list of marker patterns;
 * word boundaries keep e.g. "blockedBy" from matching "blocked".
 *
 * @param {object} entry Run-log entry (reads `status` and `outputPreview`).
 * @returns {boolean} True when any marker pattern matches.
 */
function isSemanticFailure(entry) {
    if (entry.status !== 'ok') {
        return false;
    }
    const text = (entry.outputPreview ?? '').trim().toLowerCase();
    if (text === '') {
        return false;
    }
    const markerPatterns = [
        /\b(blocked|task_blocked|task_incomplete)\b/,
        /\b(failed|could not|unable to|no local bash|permission denied)\b/,
        /__nothing__/,
    ];
    return markerPatterns.some(pattern => pattern.test(text));
}
|
|
114
|
+
/**
 * Replay the whole advisor event log for one job and report its breaker state.
 *
 * Events are processed in file order. A `circuit-breaker` event records its
 * timestamp as the engagement time; a later `circuit-recovery` or
 * `auto-disabled` event clears it. The last event of any type for the job is
 * kept as the "opinion" string (`<type>: <detail>`).
 *
 * @param {string} jobName Job whose events should be considered.
 * @returns {{ engagedAt: (string|null), lastOpinion: (string|null) }}
 *   `engagedAt` is non-null only while the breaker is engaged with no
 *   subsequent recovery; both fields are null when the log is missing or
 *   unreadable.
 */
function lastCircuitBreakerEvent(jobName) {
    if (!existsSync(ADVISOR_EVENTS_FILE)) {
        return { engagedAt: null, lastOpinion: null };
    }
    let rawLines;
    try {
        rawLines = readFileSync(ADVISOR_EVENTS_FILE, 'utf-8').trim().split('\n');
    }
    catch {
        // Unreadable log is non-fatal: report "no breaker, no opinion".
        return { engagedAt: null, lastOpinion: null };
    }
    let engagedAt = null;
    let lastOpinion = null;
    rawLines.forEach(rawLine => {
        try {
            const event = JSON.parse(rawLine);
            if (event.jobName !== jobName) {
                return;
            }
            // The most recent event of any type becomes the opinion.
            lastOpinion = `${event.type}: ${event.detail}`;
            switch (event.type) {
                case 'circuit-breaker':
                    engagedAt = event.timestamp;
                    break;
                case 'circuit-recovery':
                case 'auto-disabled':
                    engagedAt = null;
                    break;
                default:
                    break;
            }
        }
        catch { /* skip malformed */ }
    });
    return { engagedAt, lastOpinion };
}
|
|
148
|
+
/** Count failures at the tail of a run log, ignoring `skipped` entries. */
function trailingFailureCount(entries) {
    let count = 0;
    for (let i = entries.length - 1; i >= 0; i--) {
        const entry = entries[i];
        if (entry.status === 'skipped')
            continue; // skipped runs carry no signal either way
        if (!isFailure(entry))
            break;
        count++;
    }
    return count;
}
/**
 * Collect up to `limit` distinct failure texts, newest first. Uses `error`
 * when present and falls back to `outputPreview` so semantic failures (status
 * `ok`, no `error` field) still yield something to show.
 */
function distinctFailureTexts(source, limit = 3) {
    const texts = [];
    const seen = new Set();
    for (let i = source.length - 1; i >= 0 && texts.length < limit; i--) {
        const text = [source[i].error, source[i].outputPreview]
            .map(value => (typeof value === 'string' ? value.trim() : ''))
            .find(Boolean) ?? '';
        if (!text)
            continue;
        const key = text.slice(0, 120); // de-duplicate on the leading 120 chars
        if (seen.has(key))
            continue;
        seen.add(key);
        texts.push(text.slice(0, 400));
    }
    return texts;
}
/**
 * Compute the current set of broken jobs by scanning every `.jsonl` run log
 * in RUNS_DIR, then appending the self-improve pseudo-job if it is broken.
 *
 * Keeps no state of its own, but it does read the filesystem (run logs, the
 * advisor event log via lastCircuitBreakerEvent, and the self-improve files).
 *
 * A job is reported when any of these holds:
 *   - it has >= ERRORS_IN_WINDOW failures within the last WINDOW_HOURS,
 *   - its last CONSECUTIVE_FAILURES non-skipped runs all failed, or
 *   - its circuit breaker is engaged with no later recovery event.
 * Jobs whose last run is more than 7 days old are skipped unless the breaker
 * is engaged. Run logs whose first entry has no string `jobName` are skipped
 * instead of aborting the whole scan.
 *
 * @param {number} [now] Reference time in epoch milliseconds.
 * @returns {Array<object>} Broken-job records, most recently failing first;
 *   `[]` when RUNS_DIR is missing or unreadable.
 */
export function computeBrokenJobs(now = Date.now()) {
    if (!existsSync(RUNS_DIR))
        return [];
    let files = [];
    try {
        files = readdirSync(RUNS_DIR).filter(f => f.endsWith('.jsonl'));
    }
    catch {
        return [];
    }
    const sinceMs = now - WINDOW_HOURS * 60 * 60 * 1000;
    const dormantCutoffMs = now - 7 * 24 * 60 * 60 * 1000;
    const broken = [];
    for (const file of files) {
        const entries = readRunLog(path.join(RUNS_DIR, file));
        if (entries.length === 0)
            continue;
        const jobName = entries[0].jobName;
        if (typeof jobName !== 'string' || jobName === '') {
            // A corrupt log must not take down detection for every other job.
            logger.debug({ file }, 'Run log has no usable jobName — skipping');
            continue;
        }
        const lastEntry = entries[entries.length - 1];
        const lastRunMs = Date.parse(lastEntry.startedAt);
        // Always consult the breaker state — a stuck breaker is the primary
        // signal for "job has been silently broken for days".
        const cb = lastCircuitBreakerEvent(jobName);
        // Skip dormant jobs (last run >7 days ago): probably removed or
        // renamed. An engaged breaker overrides this, because an engaged
        // breaker is itself the reason the job stopped running.
        if (!cb.engagedAt && Number.isFinite(lastRunMs) && lastRunMs < dormantCutoffMs) {
            continue;
        }
        const inWindow = entries.filter(e => {
            const ts = Date.parse(e.startedAt);
            return Number.isFinite(ts) && ts >= sinceMs;
        });
        const failures = inWindow.filter(isFailure);
        // Catches e.g. daily jobs that fail every run without ever
        // accumulating ERRORS_IN_WINDOW failures inside the window.
        const consecutiveFailures = trailingFailureCount(entries);
        const meetsThreshold = failures.length >= ERRORS_IN_WINDOW
            || consecutiveFailures >= CONSECUTIVE_FAILURES
            || !!cb.engagedAt;
        if (!meetsThreshold)
            continue;
        // Prefer in-window failures; if there are none (e.g. breaker engaged
        // and no recent runs), fall back to failures anywhere in the log.
        const errSource = failures.length > 0
            ? failures
            : entries.filter(isFailure);
        const lastFailureEntry = errSource[errSource.length - 1] ?? null;
        const agentSlug = jobName.includes(':') ? jobName.split(':')[0] : undefined;
        broken.push({
            jobName,
            agentSlug,
            errorCount48h: failures.length,
            totalRuns48h: inWindow.length,
            lastErrorAt: lastFailureEntry?.startedAt ?? null,
            lastErrors: distinctFailureTexts(errSource),
            circuitBreakerEngagedAt: cb.engagedAt,
            lastAdvisorOpinion: cb.lastOpinion,
        });
    }
    // Also check the self-improve loop — it has its own log (not cron/runs/).
    const siBroken = detectSelfImproveBreakage(now);
    if (siBroken)
        broken.push(siBroken);
    // Most recently failing first. Missing or unparsable timestamps sort as 0
    // so the comparator never returns NaN.
    const toMs = (iso) => {
        const ms = iso ? Date.parse(iso) : 0;
        return Number.isFinite(ms) ? ms : 0;
    };
    broken.sort((a, b) => toMs(b.lastErrorAt) - toMs(a.lastErrorAt));
    return broken;
}
|
|
249
|
+
/**
 * Detect breakage of the self-improve loop, which logs to its own
 * experiment-log.jsonl rather than cron/runs/.
 *
 * Reads the state file and the last 10 lines of the experiment log, then
 * reports the loop as broken when any of these holds:
 *   a. `state.infraError` is set,
 *   b. there are 3+ experiments from the last 7 days and every one of them
 *      has approvalStatus `denied`,
 *   c. `state.lastRunAt` is within the last 48h but no experiment from the
 *      last 7 days exists (silent early exit).
 *
 * Malformed input (a state file or log line that parses to a non-object, an
 * `infraError` of unexpected shape, a non-string `error`) is tolerated
 * rather than thrown on.
 *
 * @param {number} now Reference time in epoch milliseconds.
 * @returns {object|null} A synthetic broken-job record named `self-improve`,
 *   or null when healthy or when no usable state file exists.
 */
function detectSelfImproveBreakage(now) {
    if (!existsSync(SELF_IMPROVE_STATE_FILE))
        return null;
    let state;
    try {
        state = JSON.parse(readFileSync(SELF_IMPROVE_STATE_FILE, 'utf-8'));
    }
    catch {
        return null;
    }
    // JSON.parse may legally return null or a primitive; treat as "no data".
    if (state === null || typeof state !== 'object')
        return null;
    const experiments = [];
    if (existsSync(SELF_IMPROVE_LOG_FILE)) {
        try {
            const lines = readFileSync(SELF_IMPROVE_LOG_FILE, 'utf-8').trim().split('\n').filter(Boolean);
            for (const line of lines.slice(-10)) {
                try {
                    const parsed = JSON.parse(line);
                    // Keep only object entries so later property reads are safe.
                    if (parsed !== null && typeof parsed === 'object')
                        experiments.push(parsed);
                }
                catch { /* skip */ }
            }
        }
        catch { /* non-fatal */ }
    }
    const errorText = (experiment) => (typeof experiment.error === 'string' ? experiment.error.trim() : '');
    const lastRunMs = state.lastRunAt ? Date.parse(state.lastRunAt) : 0;
    const lookback48h = now - 48 * 60 * 60 * 1000;
    const staleLookback = now - 7 * 24 * 60 * 60 * 1000; // 7 days
    const recentExperiments = experiments.filter(e => {
        const ts = e.startedAt ? Date.parse(e.startedAt) : 0;
        return Number.isFinite(ts) && ts >= staleLookback;
    });
    const recentErrors = recentExperiments.filter(e => e.approvalStatus === 'denied'
        && typeof e.reason === 'string' && e.reason.startsWith('Error'));
    const hasInfraError = !!state.infraError;
    // NOTE(review): this counts every `denied` experiment, whereas
    // recentErrors (used for the reported counts) additionally requires a
    // reason starting with "Error" — confirm which definition is intended.
    const allRecentErrored = recentExperiments.length >= 3
        && recentExperiments.every(e => e.approvalStatus === 'denied');
    const silentEarlyExit = lastRunMs > lookback48h
        && recentExperiments.length === 0;
    if (!hasInfraError && !allRecentErrored && !silentEarlyExit)
        return null;
    const lastErrors = [];
    for (let i = experiments.length - 1; i >= 0 && lastErrors.length < 3; i--) {
        const err = errorText(experiments[i]);
        if (!err)
            continue;
        lastErrors.push(err.slice(0, 400));
    }
    // Without an explicit infraError, a schema-looking last error is still
    // worth surfacing: it covers runs where every iteration died on the same
    // API 400 but state.infraError was never persisted.
    const lastLoggedError = experiments.length > 0 ? errorText(experiments[experiments.length - 1]) : '';
    const inferredInfraSchema = /input_schema|tools\.\d+\.custom/i.test(lastLoggedError);
    let opinion;
    if (hasInfraError) {
        const infra = state.infraError;
        if (typeof infra === 'string') {
            opinion = `infra: ${infra.slice(0, 200)}`;
        }
        else {
            const category = infra?.category ?? 'unknown';
            const diagnostic = typeof infra?.diagnostic === 'string' ? infra.diagnostic : '';
            opinion = `infra: ${category} — ${diagnostic.slice(0, 200)}`;
        }
    }
    else if (silentEarlyExit && inferredInfraSchema) {
        opinion = 'loop ran but produced no experiments — last logged error was an MCP tool schema validation (API 400). Check external MCP servers (claude_desktop_config.json, Claude Code settings) for a recently-updated package exposing a malformed input_schema.';
    }
    else if (silentEarlyExit) {
        opinion = 'loop ran but produced no experiments — likely crashing before iteration (check metrics gathering or hypothesis generation)';
    }
    else {
        opinion = `${recentErrors.length}/${recentExperiments.length} recent iterations errored`;
    }
    return {
        jobName: 'self-improve',
        agentSlug: undefined,
        errorCount48h: recentErrors.length,
        totalRuns48h: recentExperiments.length,
        lastErrorAt: experiments[experiments.length - 1]?.startedAt ?? state.lastRunAt ?? null,
        lastErrors,
        circuitBreakerEngagedAt: hasInfraError ? state.lastRunAt ?? null : null,
        lastAdvisorOpinion: opinion,
    };
}
|
|
337
|
+
/**
 * Build the owner-facing text report for a list of broken jobs.
 *
 * Layout: a headline with the job count and window, a blank line, then per
 * job one bullet (failed/total runs, plus a breaker note when engaged)
 * followed by optional "Last error" (first line, max 140 chars) and
 * "Advisor" (max 140 chars) lines, a blank line, and a dashboard pointer.
 *
 * @param {Array<object>} jobs Broken-job records.
 * @returns {string} Newline-joined report.
 */
function formatReport(jobs) {
    const plural = jobs.length === 1 ? '' : 's';
    const headline = `🚨 **${jobs.length} cron job${plural} repeatedly failing** (last ${WINDOW_HOURS}h)`;
    const jobRows = jobs.flatMap(job => {
        const breakerNote = job.circuitBreakerEngagedAt ? ' · circuit breaker engaged' : '';
        const rows = [`• \`${job.jobName}\` — ${job.errorCount48h}/${job.totalRuns48h} runs failed${breakerNote}`];
        if (job.lastErrors.length > 0) {
            const [firstLine] = job.lastErrors[0].split('\n');
            rows.push(` Last error: ${firstLine.slice(0, 140)}`);
        }
        if (job.lastAdvisorOpinion) {
            rows.push(` Advisor: ${job.lastAdvisorOpinion.slice(0, 140)}`);
        }
        return rows;
    });
    return [
        headline,
        '',
        ...jobRows,
        '',
        'Open the dashboard → Broken Jobs panel for the full picture.',
    ].join('\n');
}
|
|
357
|
+
/**
 * Run one sweep: find currently-broken jobs, drop cooldown records for jobs
 * that are no longer broken, and send one consolidated message covering the
 * broken jobs that are outside their notification cooldown.
 *
 * Cooldown records are pruned on every sweep — not only when nothing at all
 * is broken — so a job that recovers while another stays broken will alert
 * promptly the next time it breaks.
 *
 * @param {(text: string) => Promise<unknown>} send Delivers the report text.
 * @param {number} [now] Reference time in epoch milliseconds.
 * @returns {Promise<Array<object>>} Jobs selected for notification in this
 *   sweep (`[]` if none). The list is returned even when `send` rejects; in
 *   that case the failure is logged and no cooldown is recorded, so the next
 *   sweep retries.
 */
export async function runFailureSweep(send, now = Date.now()) {
    const broken = computeBrokenJobs(now);
    const state = loadState();
    // Forget cooldowns for jobs that have recovered.
    const brokenNames = new Set(broken.map(job => job.jobName));
    let pruned = false;
    for (const name of Object.keys(state.notified)) {
        if (!brokenNames.has(name)) {
            delete state.notified[name];
            pruned = true;
        }
    }
    const cooldownMs = NOTIFY_COOLDOWN_HOURS * 60 * 60 * 1000;
    const fresh = broken.filter(job => {
        const prev = state.notified[job.jobName];
        // An unparsable lastNotifiedAt yields NaN, which fails the comparison
        // and therefore counts as "cooldown expired".
        return !(prev && now - Date.parse(prev.lastNotifiedAt) < cooldownMs);
    });
    if (fresh.length === 0) {
        if (pruned)
            saveState(state);
        return [];
    }
    try {
        await send(formatReport(fresh));
        const stamp = new Date(now).toISOString();
        for (const job of fresh) {
            state.notified[job.jobName] = { lastNotifiedAt: stamp, lastErrorCount: job.errorCount48h };
        }
        saveState(state);
        appendAuditLog('notified', fresh.map(j => j.jobName));
        logger.info({ count: fresh.length, jobs: fresh.map(j => j.jobName) }, 'Failure monitor: notified owner');
    }
    catch (err) {
        logger.warn({ err }, 'Failure monitor: notification dispatch failed');
        // Still persist the pruning; the cooldown stamps were never applied.
        if (pruned)
            saveState(state);
    }
    return fresh;
}
|
|
405
|
+
/**
 * Append one JSON line (`action`, `jobs`, current ISO `timestamp`) to
 * cron/failure-monitor.log under BASE_DIR. Best-effort: any error is ignored.
 *
 * @param {string} action Label for the audited action.
 * @param {string[]} jobNames Job names the action applied to.
 */
function appendAuditLog(action, jobNames) {
    try {
        const auditPath = path.join(BASE_DIR, 'cron', 'failure-monitor.log');
        const record = {
            action,
            jobs: jobNames,
            timestamp: new Date().toISOString(),
        };
        appendFileSync(auditPath, `${JSON.stringify(record)}\n`);
    }
    catch { /* non-fatal */ }
}
|
|
416
|
+
//# sourceMappingURL=failure-monitor.js.map
|
|
@@ -0,0 +1,39 @@
|
|
|
1
|
+
/**
|
|
2
|
+
* Clementine TypeScript — Cron fix verification.
|
|
3
|
+
*
|
|
4
|
+
* When a CRON.md (global or per-agent) is edited, we record a "pending
|
|
5
|
+
* verification" for any job whose definition changed AND that is currently
|
|
6
|
+
* in a failing state. After that job's next non-skipped run, we DM the
|
|
7
|
+
* owner with the verdict — succeeded or still failing — so a self-reported
|
|
8
|
+
* "fix" can't go untested again.
|
|
9
|
+
*/
|
|
10
|
+
import type { CronJobDefinition, CronRunEntry } from '../types.js';
|
|
11
|
+
interface PendingVerification {
|
|
12
|
+
jobName: string;
|
|
13
|
+
recordedAt: string;
|
|
14
|
+
preFailureCount: number;
|
|
15
|
+
preLastError: string | null;
|
|
16
|
+
}
|
|
17
|
+
/**
|
|
18
|
+
* Compare an old and new jobs list and record verifications for any job that:
|
|
19
|
+
* - exists in both lists (new jobs aren't "fixes" of existing problems)
|
|
20
|
+
* - has its definition hash changed
|
|
21
|
+
* - is currently in a failing state per failure-monitor
|
|
22
|
+
*
|
|
23
|
+
* Disabled jobs and removed jobs are tracked too: if a previously failing
|
|
24
|
+
* job gets disabled or removed in the edit, we surface that as a "removed
|
|
25
|
+
* pending verification" rather than waiting for a run that will never come.
|
|
26
|
+
*/
|
|
27
|
+
export declare function recordEditsForFailingJobs(oldJobs: CronJobDefinition[], newJobs: CronJobDefinition[]): void;
|
|
28
|
+
/**
|
|
29
|
+
* After a cron run completes, check whether we were waiting on a fix
|
|
30
|
+
* verification for this job. If so, send the owner a verdict and clear it.
|
|
31
|
+
*
|
|
32
|
+
* Skipped runs (circuit breaker, pre-check exit, etc.) don't carry signal
|
|
33
|
+
* and shouldn't count as a verdict either way.
|
|
34
|
+
*/
|
|
35
|
+
export declare function checkAndDeliverVerification(entry: CronRunEntry, send: (text: string) => Promise<unknown>): Promise<void>;
|
|
36
|
+
/** Read-only accessor for dashboards or debugging. */
|
|
37
|
+
export declare function listPendingVerifications(): PendingVerification[];
|
|
38
|
+
export {};
|
|
39
|
+
//# sourceMappingURL=fix-verification.d.ts.map
|
|
@@ -0,0 +1,144 @@
|
|
|
1
|
+
/**
|
|
2
|
+
* Clementine TypeScript — Cron fix verification.
|
|
3
|
+
*
|
|
4
|
+
* When a CRON.md (global or per-agent) is edited, we record a "pending
|
|
5
|
+
* verification" for any job whose definition changed AND that is currently
|
|
6
|
+
* in a failing state. After that job's next non-skipped run, we DM the
|
|
7
|
+
* owner with the verdict — succeeded or still failing — so a self-reported
|
|
8
|
+
* "fix" can't go untested again.
|
|
9
|
+
*/
|
|
10
|
+
import { existsSync, mkdirSync, readFileSync, writeFileSync, } from 'node:fs';
|
|
11
|
+
import path from 'node:path';
|
|
12
|
+
import crypto from 'node:crypto';
|
|
13
|
+
import pino from 'pino';
|
|
14
|
+
import { BASE_DIR } from '../config.js';
|
|
15
|
+
import { computeBrokenJobs } from './failure-monitor.js';
|
|
16
|
+
const logger = pino({ name: 'clementine.fix-verification' });
|
|
17
|
+
const STATE_FILE = path.join(BASE_DIR, 'cron', 'fix-verifications.json');
|
|
18
|
+
/**
 * Read the pending fix-verification state from STATE_FILE.
 *
 * @returns {{ pending: object }} The stored `pending` map, or an empty one
 *   when the file is missing, unreadable, not valid JSON, or has no
 *   `pending` key.
 */
function loadState() {
    let pending = {};
    try {
        if (existsSync(STATE_FILE)) {
            const parsed = JSON.parse(readFileSync(STATE_FILE, 'utf-8'));
            pending = parsed.pending ?? {};
        }
    }
    catch {
        // Any read/parse problem degrades to "nothing pending".
        pending = {};
    }
    return { pending };
}
|
|
29
|
+
/**
 * Persist the fix-verification state to STATE_FILE as pretty-printed JSON,
 * creating the parent directory if needed. Failures are logged as warnings
 * and never thrown.
 *
 * @param {object} state State object to serialize.
 */
function saveState(state) {
    try {
        const parentDir = path.dirname(STATE_FILE);
        mkdirSync(parentDir, { recursive: true });
        const payload = JSON.stringify(state, null, 2);
        writeFileSync(STATE_FILE, payload);
    }
    catch (err) {
        logger.warn({ err }, 'Failed to persist fix-verification state');
    }
}
|
|
38
|
+
/**
|
|
39
|
+
* Hash the job fields a fix could touch. Schedule + prompt + tier + mode +
|
|
40
|
+
* model + maxTurns + maxHours + workDir + preCheck + successCriteria are the
|
|
41
|
+
* only fields a "fix" would realistically change. We deliberately ignore
|
|
42
|
+
* `enabled` because disabling isn't a fix.
|
|
43
|
+
*/
|
|
44
|
+
function jobHash(j) {
|
|
45
|
+
const data = JSON.stringify({
|
|
46
|
+
schedule: j.schedule,
|
|
47
|
+
prompt: j.prompt,
|
|
48
|
+
tier: j.tier,
|
|
49
|
+
maxTurns: j.maxTurns,
|
|
50
|
+
model: j.model,
|
|
51
|
+
workDir: j.workDir,
|
|
52
|
+
mode: j.mode,
|
|
53
|
+
maxHours: j.maxHours,
|
|
54
|
+
preCheck: j.preCheck,
|
|
55
|
+
successCriteria: j.successCriteria,
|
|
56
|
+
});
|
|
57
|
+
return crypto.createHash('sha1').update(data).digest('hex').slice(0, 12);
|
|
58
|
+
}
|
|
59
|
+
/**
 * Diff an old and a new job list and update pending fix verifications.
 *
 * Only jobs present in the old list AND currently reported by
 * computeBrokenJobs() are considered. For each such job:
 *   - missing from the new list, or present with a falsy `enabled`: any
 *     pending verification is cleared and the fact is logged (no run will
 *     come to verify against);
 *   - definition hash unchanged: nothing happens;
 *   - otherwise: a pending verification is recorded with the current
 *     timestamp, the job's failure count, and its most recent error.
 * State is written back only if something was cleared or recorded.
 *
 * @param {Array<object>} oldJobs Job definitions before the edit.
 * @param {Array<object>} newJobs Job definitions after the edit.
 */
export function recordEditsForFailingJobs(oldJobs, newJobs) {
    const previousByName = new Map(oldJobs.map(job => [job.name, job]));
    const currentByName = new Map(newJobs.map(job => [job.name, job]));
    const brokenByName = new Map(computeBrokenJobs().map(record => [record.jobName, record]));
    const state = loadState();
    const stamp = new Date().toISOString();
    let dirty = false;
    previousByName.forEach((before, name) => {
        const failing = brokenByName.get(name);
        if (!failing) {
            return; // not currently broken — nothing to verify
        }
        const after = currentByName.get(name);
        if (!after || !after.enabled) {
            // Removed or disabled: don't wait for a run that will never come.
            delete state.pending[name];
            dirty = true;
            const message = after
                ? 'Failing job disabled in CRON.md — verification cleared'
                : 'Failing job removed from CRON.md — verification cleared';
            logger.info({ job: name }, message);
            return;
        }
        if (jobHash(before) === jobHash(after)) {
            return; // no relevant changes
        }
        state.pending[name] = {
            jobName: name,
            recordedAt: stamp,
            preFailureCount: failing.errorCount48h,
            preLastError: failing.lastErrors[0] ?? null,
        };
        dirty = true;
        logger.info({ job: name, preFailureCount: failing.errorCount48h }, 'Recorded pending fix verification');
    });
    if (dirty) {
        saveState(state);
    }
}
|
|
110
|
+
/**
 * After a cron run, deliver the verdict for a pending fix verification.
 *
 * Skipped runs are ignored. If a verification is pending for the entry's
 * job, it is removed from state first (and the state saved), then a message
 * saying whether the run succeeded or is still failing is passed to `send`.
 * A failing `send` is logged and not rethrown; the pending record is not
 * restored in that case.
 *
 * NOTE(review): success here means `status === 'ok'` only. The failure
 * monitor also treats some `ok` runs as failures based on their output, so
 * the two modules can disagree — confirm whether that is intended.
 *
 * @param {object} entry Run-log entry for the completed run.
 * @param {(text: string) => Promise<unknown>} send Delivers the verdict text.
 * @returns {Promise<void>}
 */
export async function checkAndDeliverVerification(entry, send) {
    if (entry.status === 'skipped') {
        return;
    }
    const state = loadState();
    const { [entry.jobName]: pending } = state.pending;
    if (!pending) {
        return;
    }
    delete state.pending[entry.jobName];
    saveState(state);
    let verdict;
    let detail;
    if (entry.status === 'ok') {
        verdict = '✅ succeeded';
        detail = '';
    }
    else {
        const [firstLine] = (entry.error ?? 'unknown').split('\n');
        verdict = '⚠️ still failing';
        detail = `\nError: ${firstLine.slice(0, 200)}`;
    }
    // Minutes since the edit was recorded, floored at 1 for display.
    const elapsedMs = Date.now() - Date.parse(pending.recordedAt);
    const ageMin = Math.max(1, Math.round(elapsedMs / 60000));
    const msg = `**[Fix verification]** \`${entry.jobName}\` ${verdict} on its first run after edit (${ageMin}m later).${detail}`;
    try {
        await send(msg);
    }
    catch (err) {
        logger.warn({ err, job: entry.jobName }, 'Failed to send fix verification DM');
    }
}
|
|
140
|
+
/**
 * List every pending fix verification currently stored on disk.
 * Read-only: loads the state and returns the values of its `pending` map.
 *
 * @returns {Array<object>} Pending verification records.
 */
export function listPendingVerifications() {
    const { pending } = loadState();
    return Object.values(pending);
}
|
|
144
|
+
//# sourceMappingURL=fix-verification.js.map
|
|
@@ -103,6 +103,13 @@ export class HeartbeatScheduler {
|
|
|
103
103
|
catch (err) {
|
|
104
104
|
logger.warn({ err }, 'Session eviction failed');
|
|
105
105
|
}
|
|
106
|
+
// Cron failure sweep — surface jobs that have been silently failing.
|
|
107
|
+
// Runs every tick; per-job 24h cooldown lives inside the monitor.
|
|
108
|
+
import('./failure-monitor.js').then(({ runFailureSweep }) => {
|
|
109
|
+
runFailureSweep((text) => this.dispatcher.send(text, {})).catch(err => {
|
|
110
|
+
logger.warn({ err }, 'Failure sweep failed');
|
|
111
|
+
});
|
|
112
|
+
}).catch(err => logger.warn({ err }, 'Failure sweep import failed'));
|
|
106
113
|
const now = new Date();
|
|
107
114
|
const hour = now.getHours();
|
|
108
115
|
// ── Nightly tasks: run regardless of active hours ─────────────────
|
|
@@ -174,6 +181,25 @@ export class HeartbeatScheduler {
|
|
|
174
181
|
logger.warn({ err }, 'Per-agent self-improvement scheduling error');
|
|
175
182
|
}
|
|
176
183
|
}
|
|
184
|
+
// Daily stale-skill archival: run once per day at 3 AM. Skills never
|
|
185
|
+
// retrieved in 90+ days (both frontmatter useCount and skill_usage empty)
|
|
186
|
+
// get moved to .archive/ so they stop competing in trigger matching.
|
|
187
|
+
if (hour === 3 && this.lastState.lastSkillDecayDate !== todayISO()) {
|
|
188
|
+
this.lastState.lastSkillDecayDate = todayISO();
|
|
189
|
+
this.saveState();
|
|
190
|
+
import('../agent/skill-extractor.js').then(({ archiveStaleSkills }) => {
|
|
191
|
+
try {
|
|
192
|
+
const store = this.gateway.getMemoryStore();
|
|
193
|
+
const archived = archiveStaleSkills(90, store ? (name) => store.skillRetrievalCount(name) : undefined);
|
|
194
|
+
if (archived.length > 0) {
|
|
195
|
+
logger.info({ count: archived.length, names: archived.slice(0, 5) }, 'Archived stale skills');
|
|
196
|
+
}
|
|
197
|
+
}
|
|
198
|
+
catch (err) {
|
|
199
|
+
logger.warn({ err }, 'Stale skill archival failed');
|
|
200
|
+
}
|
|
201
|
+
}).catch(err => logger.warn({ err }, 'Stale skill archival import failed'));
|
|
202
|
+
}
|
|
177
203
|
// Evening memory consolidation: once per day between 7-9 PM
|
|
178
204
|
if (hour >= 19 && hour < 21 && this.lastConsolidationDate !== todayISO()) {
|
|
179
205
|
this.lastConsolidationDate = todayISO();
|
|
@@ -607,10 +633,41 @@ export class HeartbeatScheduler {
|
|
|
607
633
|
const prompt = buildInsightPrompt(signals);
|
|
608
634
|
if (!prompt)
|
|
609
635
|
return;
|
|
610
|
-
// Run lightweight LLM call via gateway
|
|
611
|
-
|
|
612
|
-
|
|
613
|
-
|
|
636
|
+
// Run lightweight LLM call via gateway. Log success AND failure to the
|
|
637
|
+
// cron run log so the failure monitor can see hourly breakage.
|
|
638
|
+
// maxTurns bumped 1 → 3 because the agent needs to fan out ~4 parallel
|
|
639
|
+
// tool calls (activity_history, outlook_inbox, goal_list, task_list)
|
|
640
|
+
// before composing its rating — at 1 turn it always crashes with
|
|
641
|
+
// "Reached maximum number of turns".
|
|
642
|
+
const icStartedAt = new Date();
|
|
643
|
+
let response = null;
|
|
644
|
+
try {
|
|
645
|
+
response = await this.gateway.handleCronJob('insight-check', prompt, 1, // tier 1
|
|
646
|
+
3, // max 3 turns (parallel tool fan-out + synthesis)
|
|
647
|
+
'haiku');
|
|
648
|
+
this.runLog.append({
|
|
649
|
+
jobName: 'insight-check',
|
|
650
|
+
startedAt: icStartedAt.toISOString(),
|
|
651
|
+
finishedAt: new Date().toISOString(),
|
|
652
|
+
status: 'ok',
|
|
653
|
+
durationMs: Date.now() - icStartedAt.getTime(),
|
|
654
|
+
attempt: 1,
|
|
655
|
+
outputPreview: (response ?? '').slice(0, 200),
|
|
656
|
+
});
|
|
657
|
+
}
|
|
658
|
+
catch (err) {
|
|
659
|
+
this.runLog.append({
|
|
660
|
+
jobName: 'insight-check',
|
|
661
|
+
startedAt: icStartedAt.toISOString(),
|
|
662
|
+
finishedAt: new Date().toISOString(),
|
|
663
|
+
status: 'error',
|
|
664
|
+
durationMs: Date.now() - icStartedAt.getTime(),
|
|
665
|
+
attempt: 1,
|
|
666
|
+
error: String(err).slice(0, 400),
|
|
667
|
+
errorType: 'transient',
|
|
668
|
+
});
|
|
669
|
+
throw err;
|
|
670
|
+
}
|
|
614
671
|
if (!response)
|
|
615
672
|
return;
|
|
616
673
|
const insight = parseInsightResponse(response);
|