clementine-agent 1.0.17 → 1.0.19
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/dist/cli/dashboard.js +142 -0
- package/dist/gateway/failure-diagnostics.d.ts +64 -0
- package/dist/gateway/failure-diagnostics.js +332 -0
- package/dist/gateway/failure-monitor.d.ts +30 -2
- package/dist/gateway/failure-monitor.js +85 -8
- package/dist/gateway/fix-applier.d.ts +34 -0
- package/dist/gateway/fix-applier.js +308 -0
- package/dist/gateway/heartbeat-scheduler.js +3 -1
- package/package.json +1 -1
package/dist/cli/dashboard.js
CHANGED
|
@@ -2085,6 +2085,54 @@ export async function cmdDashboard(opts) {
|
|
|
2085
2085
|
res.status(500).json({ error: String(err) });
|
|
2086
2086
|
}
|
|
2087
2087
|
});
|
|
2088
|
+
/**
 * POST /api/cron/broken-jobs/:jobName/apply-fix
 *
 * Applies the autoApply operations of the cached diagnosis for `jobName`.
 * Guards, in order:
 *   - 404 when there is no fresh cached diagnosis,
 *   - 409 when the diagnosis carries no autoApply payload,
 *   - 409 when riskLevel is anything other than 'low'.
 * A body of `{ dryRun: true }` is forwarded to applyFix as `{ dryRun: true }`.
 * Responds 200 with the applyFix result when `result.ok`, 400 otherwise,
 * and 500 if anything throws.
 */
app.post('/api/cron/broken-jobs/:jobName/apply-fix', async (req, res) => {
    const { jobName } = req.params;
    try {
        const diagnostics = await import('../gateway/failure-diagnostics.js');
        const fixApplier = await import('../gateway/fix-applier.js');
        const diagnosis = diagnostics.getDiagnosisIfFresh(jobName);
        if (!diagnosis) {
            res.status(404).json({ error: 'No fresh diagnosis for this job. Wait for the next sweep.' });
            return;
        }
        const plan = diagnosis.proposedFix.autoApply;
        if (!plan) {
            res.status(409).json({ error: 'Diagnosis has no auto-applicable operations — review manually.' });
            return;
        }
        if (diagnosis.riskLevel !== 'low') {
            res.status(409).json({ error: `riskLevel is '${diagnosis.riskLevel}' — only 'low' is auto-apply-able.` });
            return;
        }
        // Only a literal boolean true enables dry-run; anything else applies for real.
        const dryRun = req.body?.dryRun === true;
        const result = fixApplier.applyFix(jobName, plan, { dryRun });
        const appliedForReal = result.ok && !dryRun;
        if (appliedForReal) {
            // Drop the cached diagnosis so the next sweep re-evaluates the job
            // against the edited config.
            diagnostics.clearDiagnosis(jobName);
        }
        const statusCode = result.ok ? 200 : 400;
        res.status(statusCode).json(result);
    }
    catch (err) {
        res.status(500).json({ error: String(err) });
    }
});
|
|
2125
|
+
/**
 * POST /api/cron/broken-jobs/:jobName/dismiss-diagnosis
 *
 * Discards the cached diagnosis for the job without applying anything.
 * Responds `{ ok: true }` on success, or 500 with the stringified error.
 */
app.post('/api/cron/broken-jobs/:jobName/dismiss-diagnosis', async (req, res) => {
    try {
        const diagnostics = await import('../gateway/failure-diagnostics.js');
        diagnostics.clearDiagnosis(req.params.jobName);
        res.json({ ok: true });
    }
    catch (err) {
        const message = String(err);
        res.status(500).json({ error: message });
    }
});
|
|
2088
2136
|
// ── Cron trace viewer ──────────────────────────────────────────
|
|
2089
2137
|
app.get('/api/cron/traces/:job', (req, res) => {
|
|
2090
2138
|
try {
|
|
@@ -16162,6 +16210,49 @@ async function expandSkill(name) {
|
|
|
16162
16210
|
} catch(e) { toast('Failed to load skill', 'error'); }
|
|
16163
16211
|
}
|
|
16164
16212
|
|
|
16213
|
+
/**
 * Two-step apply flow for a broken job's proposed fix:
 *   1. POST apply-fix with { dryRun: true } to obtain the file, op list and diff.
 *   2. Show them in a confirm() dialog; on approval POST apply-fix again with {}.
 * Outcomes are reported through toast(); a successful apply refreshes the panel.
 *
 * NOTE(review): sibling code in this script double-escapes sequences
 * (\\x27, \\u00b7), which suggests the script is embedded in a template
 * literal — confirm the single-escaped '\n' below survives that embedding.
 */
async function applyBrokenJobFix(jobName) {
  try {
    var endpoint = '/api/cron/broken-jobs/' + encodeURIComponent(jobName) + '/apply-fix';

    // Step 1: dry-run so the dialog shows the real diff.
    var preview = await apiJson('POST', endpoint, { dryRun: true });
    if (!(preview && preview.ok)) {
      var why = (preview && (preview.message || preview.error)) || 'unknown error';
      toast('Cannot apply: ' + why, 'error');
      return;
    }

    var shownDiff = (preview.diff || '(no diff)').slice(0, 1200);
    var plannedOps = (preview.appliedOps || []).length;
    var question = [
      'Apply this fix to ' + jobName + '?',
      '',
      'File: ' + (preview.file || 'unknown'),
      'Operations: ' + plannedOps,
      '',
      shownDiff,
      '',
      'A .bak will be written. The daemon auto-reloads; the next run will be fix-verified.'
    ].join('\n');
    if (!confirm(question)) {
      return;
    }

    // Step 2: the real apply.
    var outcome = await apiJson('POST', endpoint, {});
    if (!(outcome && outcome.ok)) {
      toast('Apply failed: ' + ((outcome && (outcome.message || outcome.error)) || 'unknown'), 'error');
      return;
    }
    toast('Applied ' + (outcome.appliedOps || []).length + ' op(s) to ' + jobName, 'success');
    refreshBrokenJobs();
  } catch (e) {
    toast('Apply failed: ' + String(e), 'error');
  }
}
|
|
16240
|
+
|
|
16241
|
+
/**
 * Ask for confirmation, then POST dismiss-diagnosis for the job.
 * Shows an info toast and refreshes the panel on success; shows an error
 * toast when the response is not ok or the request throws.
 */
async function dismissBrokenJobDiagnosis(jobName) {
  var question = 'Clear the cached diagnosis for ' + jobName + '? It will be re-diagnosed on the next sweep if still failing.';
  var approved = confirm(question);
  if (!approved) {
    return;
  }
  try {
    var endpoint = '/api/cron/broken-jobs/' + encodeURIComponent(jobName) + '/dismiss-diagnosis';
    var reply = await apiJson('POST', endpoint, {});
    if (!(reply && reply.ok)) {
      toast('Failed to dismiss: ' + ((reply && reply.error) || 'unknown'), 'error');
      return;
    }
    toast('Diagnosis dismissed', 'info');
    refreshBrokenJobs();
  } catch (e) {
    toast('Failed to dismiss: ' + String(e), 'error');
  }
}
|
|
16255
|
+
|
|
16165
16256
|
async function refreshBrokenJobs() {
|
|
16166
16257
|
try {
|
|
16167
16258
|
var r = await apiFetch('/api/cron/broken-jobs');
|
|
@@ -16201,11 +16292,62 @@ async function refreshBrokenJobs() {
|
|
|
16201
16292
|
var agentTag = j.agentSlug
|
|
16202
16293
|
? '<span class="badge badge-blue" style="font-size:10px">' + esc(j.agentSlug) + '</span>'
|
|
16203
16294
|
: '';
|
|
16295
|
+
|
|
16296
|
+
// Diagnosis block — root cause + proposed fix + diff preview +
|
|
16297
|
+
// Apply/Dismiss buttons when autoApply is present and risk is low.
|
|
16298
|
+
var diagnosisHtml = '';
|
|
16299
|
+
if (j.diagnosis) {
|
|
16300
|
+
var riskColor = j.diagnosis.riskLevel === 'high' ? '#ef4444'
|
|
16301
|
+
: j.diagnosis.riskLevel === 'medium' ? '#f59e0b' : '#22c55e';
|
|
16302
|
+
var confLabel = j.diagnosis.confidence !== 'high'
|
|
16303
|
+
? ' <span style="font-size:10px;color:var(--text-muted)">(' + esc(j.diagnosis.confidence) + ' confidence)</span>'
|
|
16304
|
+
: '';
|
|
16305
|
+
var diffHtml = '';
|
|
16306
|
+
if (j.diagnosis.proposedFix.diff) {
|
|
16307
|
+
diffHtml = '<pre style="font-size:11px;background:#0f172a;color:#e2e8f0;padding:8px;border-radius:4px;margin:6px 0 0;white-space:pre-wrap;word-break:break-word;max-height:200px;overflow-y:auto">'
|
|
16308
|
+
+ esc(j.diagnosis.proposedFix.diff) + '</pre>';
|
|
16309
|
+
}
|
|
16310
|
+
|
|
16311
|
+
var canAutoApply = !!j.diagnosis.proposedFix.autoApply
|
|
16312
|
+
&& j.diagnosis.riskLevel === 'low';
|
|
16313
|
+
var actionsHtml = '';
|
|
16314
|
+
if (canAutoApply) {
|
|
16315
|
+
var opCount = (j.diagnosis.proposedFix.autoApply.operations || []).length;
|
|
16316
|
+
actionsHtml = '<div style="margin-top:10px;display:flex;gap:8px;align-items:center">'
|
|
16317
|
+
+ '<button onclick="applyBrokenJobFix(\\x27' + esc(j.jobName) + '\\x27)" '
|
|
16318
|
+
+ 'style="background:var(--accent);border:1px solid var(--accent);color:white;padding:4px 12px;border-radius:4px;font-size:11px;cursor:pointer">'
|
|
16319
|
+
+ 'Apply fix (' + opCount + ' op' + (opCount === 1 ? '' : 's') + ')</button>'
|
|
16320
|
+
+ '<button onclick="dismissBrokenJobDiagnosis(\\x27' + esc(j.jobName) + '\\x27)" '
|
|
16321
|
+
+ 'style="background:none;border:1px solid var(--border);color:var(--text-secondary);padding:4px 12px;border-radius:4px;font-size:11px;cursor:pointer">'
|
|
16322
|
+
+ 'Dismiss</button>'
|
|
16323
|
+
+ '<span style="font-size:10px;color:var(--text-muted);margin-left:auto">auto-verified after next run</span>'
|
|
16324
|
+
+ '</div>';
|
|
16325
|
+
} else if (j.diagnosis.proposedFix.autoApply && j.diagnosis.riskLevel !== 'low') {
|
|
16326
|
+
actionsHtml = '<div style="margin-top:10px;font-size:11px;color:var(--text-muted);font-style:italic">'
|
|
16327
|
+
+ 'Not auto-applicable (risk: ' + esc(j.diagnosis.riskLevel) + ') — review manually'
|
|
16328
|
+
+ '</div>';
|
|
16329
|
+
}
|
|
16330
|
+
|
|
16331
|
+
diagnosisHtml = '<div style="margin-top:10px;padding:10px;border-left:3px solid ' + riskColor
|
|
16332
|
+
+ ';background:var(--bg-tertiary);border-radius:4px">'
|
|
16333
|
+
+ '<div style="font-size:12px;margin-bottom:4px"><strong>Root cause' + confLabel + ':</strong> '
|
|
16334
|
+
+ esc(j.diagnosis.rootCause) + '</div>'
|
|
16335
|
+
+ '<div style="font-size:12px"><strong>Proposed fix:</strong> '
|
|
16336
|
+
+ esc(j.diagnosis.proposedFix.details) + '</div>'
|
|
16337
|
+
+ diffHtml
|
|
16338
|
+
+ actionsHtml
|
|
16339
|
+
+ '<div style="font-size:10px;color:var(--text-muted);margin-top:6px">'
|
|
16340
|
+
+ esc(j.diagnosis.proposedFix.type) + ' \\u00b7 ' + esc(j.diagnosis.riskLevel) + ' risk \\u00b7 diagnosed ' + timeAgo(j.diagnosis.generatedAt)
|
|
16341
|
+
+ '</div>'
|
|
16342
|
+
+ '</div>';
|
|
16343
|
+
}
|
|
16344
|
+
|
|
16204
16345
|
html += '<div style="padding:12px;border:1px solid var(--border);border-radius:8px;background:var(--bg-secondary)">'
|
|
16205
16346
|
+ '<div style="display:flex;align-items:center;gap:8px;flex-wrap:wrap">'
|
|
16206
16347
|
+ '<strong>' + esc(j.jobName) + '</strong> ' + agentTag + ' ' + breaker
|
|
16207
16348
|
+ '<span style="margin-left:auto;font-size:11px;color:var(--text-muted)">' + failureRatio + ' failed \\u00b7 last error ' + lastErrorAt + '</span>'
|
|
16208
16349
|
+ '</div>'
|
|
16350
|
+
+ diagnosisHtml
|
|
16209
16351
|
+ errorsHtml
|
|
16210
16352
|
+ advisorLine
|
|
16211
16353
|
+ '</div>';
|
|
@@ -0,0 +1,64 @@
|
|
|
1
|
+
/**
|
|
2
|
+
* Clementine TypeScript — Broken-job diagnostic agent.
|
|
3
|
+
*
|
|
4
|
+
* When the failure monitor flags a job, this runs a cheap Haiku-level
|
|
5
|
+
* analysis over the job definition, agent profile, and recent runs to
|
|
6
|
+
* propose a root cause and a specific fix. Read-only: it never writes
|
|
7
|
+
* anything except its own cache.
|
|
8
|
+
*
|
|
9
|
+
* Output surfaces in the Broken Jobs dashboard panel and the owner DM
|
|
10
|
+
* so the response to a silent failure is "here's what's wrong and
|
|
11
|
+
* here's what to change" rather than "go investigate."
|
|
12
|
+
*/
|
|
13
|
+
import type { Gateway } from './router.js';
|
|
14
|
+
import type { BrokenJob } from './failure-monitor.js';
|
|
15
|
+
/**
|
|
16
|
+
* Fields safe for one-click auto-apply. Limited to simple scalar YAML
|
|
17
|
+
* fields on cron jobs — nothing multi-line (prompt, pre_check, context,
|
|
18
|
+
* success_criteria), nothing structural (schedule edits would re-schedule
|
|
19
|
+
* a running job, handled manually).
|
|
20
|
+
*/
|
|
21
|
+
export declare const EDITABLE_FIELDS: Set<string>;
|
|
22
|
+
export type FixOperation = {
|
|
23
|
+
op: 'set';
|
|
24
|
+
field: string;
|
|
25
|
+
value: string | number | boolean;
|
|
26
|
+
} | {
|
|
27
|
+
op: 'remove';
|
|
28
|
+
field: string;
|
|
29
|
+
};
|
|
30
|
+
export interface Diagnosis {
|
|
31
|
+
rootCause: string;
|
|
32
|
+
confidence: 'high' | 'medium' | 'low';
|
|
33
|
+
proposedFix: {
|
|
34
|
+
type: 'config_change' | 'prompt_change' | 'agent_scope' | 'disable' | 'credential_refresh' | 'escalate_to_owner';
|
|
35
|
+
details: string;
|
|
36
|
+
diff?: string;
|
|
37
|
+
/**
|
|
38
|
+
* When present, the fix can be applied with one click via the
|
|
39
|
+
* /api/cron/broken-jobs/:jobName/apply-fix endpoint. Operations are
|
|
40
|
+
* silently filtered against EDITABLE_FIELDS — a proposal that mixes
|
|
41
|
+
* safe and unsafe edits gets the unsafe ones dropped.
|
|
42
|
+
*/
|
|
43
|
+
autoApply?: {
|
|
44
|
+
agentSlug?: string;
|
|
45
|
+
operations: FixOperation[];
|
|
46
|
+
};
|
|
47
|
+
};
|
|
48
|
+
riskLevel: 'low' | 'medium' | 'high';
|
|
49
|
+
generatedAt: string;
|
|
50
|
+
}
|
|
51
|
+
/**
|
|
52
|
+
* Diagnose one broken job. Returns a cached diagnosis if one exists and is
|
|
53
|
+
* fresher than 24h; otherwise invokes the LLM. Always best-effort — returns
|
|
54
|
+
* null instead of throwing so failure detection stays robust.
|
|
55
|
+
*/
|
|
56
|
+
export declare function diagnoseBrokenJob(broken: BrokenJob, gateway: Gateway): Promise<Diagnosis | null>;
|
|
57
|
+
/**
|
|
58
|
+
* Clear cached diagnosis for a job (e.g., after the owner applies a fix).
|
|
59
|
+
* Called opportunistically when a broken job disappears from the live set.
|
|
60
|
+
*/
|
|
61
|
+
export declare function clearDiagnosis(jobName: string): void;
|
|
62
|
+
/** Read-only accessor for the dashboard. */
|
|
63
|
+
export declare function getDiagnosisIfFresh(jobName: string): Diagnosis | null;
|
|
64
|
+
//# sourceMappingURL=failure-diagnostics.d.ts.map
|
|
@@ -0,0 +1,332 @@
|
|
|
1
|
+
/**
|
|
2
|
+
* Clementine TypeScript — Broken-job diagnostic agent.
|
|
3
|
+
*
|
|
4
|
+
* When the failure monitor flags a job, this runs a cheap Haiku-level
|
|
5
|
+
* analysis over the job definition, agent profile, and recent runs to
|
|
6
|
+
* propose a root cause and a specific fix. Read-only: it never writes
|
|
7
|
+
* anything except its own cache.
|
|
8
|
+
*
|
|
9
|
+
* Output surfaces in the Broken Jobs dashboard panel and the owner DM
|
|
10
|
+
* so the response to a silent failure is "here's what's wrong and
|
|
11
|
+
* here's what to change" rather than "go investigate."
|
|
12
|
+
*/
|
|
13
|
+
import { existsSync, mkdirSync, readFileSync, writeFileSync, } from 'node:fs';
|
|
14
|
+
import path from 'node:path';
|
|
15
|
+
import pino from 'pino';
|
|
16
|
+
import { AGENTS_DIR, BASE_DIR, CRON_FILE } from '../config.js';
|
|
17
|
+
const logger = pino({ name: 'clementine.failure-diagnostics' });
|
|
18
|
+
const CACHE_FILE = path.join(BASE_DIR, 'cron', 'failure-diagnostics.json');
|
|
19
|
+
const CACHE_TTL_MS = 24 * 60 * 60 * 1000;
|
|
20
|
+
const RUNS_DIR = path.join(BASE_DIR, 'cron', 'runs');
|
|
21
|
+
/**
 * Allowlist of cron-job fields that a diagnosis may change through one-click
 * auto-apply. Only simple scalar YAML fields belong here: multi-line fields
 * (prompt, pre_check, context, success_criteria) and structural edits such as
 * the schedule are deliberately absent and must be handled manually.
 */
export const EDITABLE_FIELDS = new Set([
    'tier', 'mode',
    'max_hours', 'max_turns', 'max_retries',
    'enabled',
    'agentSlug', 'work_dir', 'model',
    'always_deliver', 'after',
    'timeout_ms',
]);
|
|
41
|
+
/**
 * Load the diagnosis cache from CACHE_FILE.
 *
 * Best-effort: a missing file, unreadable file, invalid JSON, or JSON whose
 * top-level value is not a plain object (e.g. `null` or an array) all yield
 * an empty cache, so callers can always index the result by job name.
 *
 * @returns {Record<string, object>} map of job name to cached diagnosis.
 */
function loadCache() {
    try {
        if (!existsSync(CACHE_FILE))
            return {};
        const parsed = JSON.parse(readFileSync(CACHE_FILE, 'utf-8'));
        // JSON.parse happily returns null, arrays and scalars; indexing null
        // throws and an array is not a job-name map, so reject both.
        const isPlainObject = parsed !== null && typeof parsed === 'object' && !Array.isArray(parsed);
        return isPlainObject ? parsed : {};
    }
    catch {
        return {};
    }
}
|
|
52
|
+
/**
 * Persist the diagnosis cache to CACHE_FILE as pretty-printed JSON, creating
 * the parent directory if needed. Failures are logged as warnings and never
 * propagated.
 *
 * @param {object} cache - the full cache map to write.
 */
function saveCache(cache) {
    try {
        const cacheDir = path.dirname(CACHE_FILE);
        mkdirSync(cacheDir, { recursive: true });
        const payload = JSON.stringify(cache, null, 2);
        writeFileSync(CACHE_FILE, payload);
    }
    catch (err) {
        logger.warn({ err }, 'Failed to persist diagnostic cache');
    }
}
|
|
61
|
+
/**
 * Pull the raw YAML entry for a cron job out of CRON.md.
 *
 * A job name of the form "slug:name" is looked up first in that agent's
 * CRON.md under AGENTS_DIR and then in the global CRON_FILE; a bare name is
 * looked up in the global file only. The entry is returned as text (not
 * parsed) so the diagnostic prompt shows the exact fields the user edits.
 *
 * @param {string} jobName - bare job name, or "agentSlug:jobName".
 * @returns {string|null} the entry text, capped at 6000 characters, or null
 *   when no candidate file contains the job.
 */
function readJobDefinition(jobName) {
    const [maybeSlug, ...rest] = jobName.split(':');
    const agentScoped = rest.length > 0;
    const bareName = agentScoped ? rest.join(':') : maybeSlug;
    const candidateFiles = agentScoped
        // agent-scoped: ross-the-sdr:reply-detection
        ? [path.join(AGENTS_DIR, maybeSlug, 'CRON.md'), CRON_FILE]
        : [CRON_FILE];
    // Match the block that starts at "- name: <bareName>" and runs until the
    // next "- name:" at the same indent or the end of the file. JavaScript has
    // no \z anchor (it would match a literal "z"), so end-of-input is spelled
    // as the negative lookahead (?![\s\S]).
    const escapedName = bareName.replace(/[.*+?^${}()|[\]\\]/g, '\\$&');
    const pattern = new RegExp(`^( - name: ${escapedName}\\s*$[\\s\\S]*?)(?=^ - name: |(?![\\s\\S]))`, 'm');
    for (const file of candidateFiles) {
        if (!existsSync(file))
            continue;
        try {
            const m = readFileSync(file, 'utf-8').match(pattern);
            if (m)
                return m[1].slice(0, 6000);
        }
        catch { /* unreadable candidate: fall through to the next file */ }
    }
    return null;
}
|
|
91
|
+
/**
 * Read the head of an agent's profile (AGENTS_DIR/<agentSlug>/agent.md).
 *
 * @param {string} agentSlug - directory name of the agent under AGENTS_DIR.
 * @returns {string|null} the first 2500 characters of the profile, or null
 *   when the file does not exist or cannot be read.
 */
function readAgentProfile(agentSlug) {
    const profilePath = path.join(AGENTS_DIR, agentSlug, 'agent.md');
    if (existsSync(profilePath)) {
        try {
            const text = readFileSync(profilePath, 'utf-8');
            return text.slice(0, 2500);
        }
        catch {
            return null;
        }
    }
    return null;
}
|
|
106
|
+
/**
 * Summarise the tail of a job's run log (RUNS_DIR/<sanitised name>.jsonl) as
 * one line per run, in file order.
 *
 * Each JSON line becomes "<startedAt> <status> (<seconds>s) <detail>", where
 * the detail is an output preview for status "ok" and the first line of the
 * error otherwise. Lines that cannot be summarised are passed through,
 * truncated to 160 characters.
 *
 * @param {string} jobName - job name; characters outside [a-zA-Z0-9_-] are
 *   replaced with "_" to form the file name.
 * @param {number} [limit=10] - how many trailing entries to include.
 * @returns {string} newline-joined summaries, or a parenthesised placeholder
 *   when the log is missing or unreadable.
 */
function readRecentRuns(jobName, limit = 10) {
    const fileStem = jobName.replace(/[^a-zA-Z0-9_-]/g, '_');
    const logPath = path.join(RUNS_DIR, `${fileStem}.jsonl`);
    if (!existsSync(logPath))
        return '(no run log)';

    const summarise = (line) => {
        try {
            const run = JSON.parse(line);
            let detail;
            if (run.status === 'ok') {
                const preview = (run.outputPreview ?? '').slice(0, 120).replace(/\n/g, ' ');
                detail = `preview="${preview}"`;
            }
            else {
                const firstErrorLine = (run.error ?? '').split('\n')[0].slice(0, 160);
                detail = `error="${firstErrorLine}"`;
            }
            const seconds = Math.round(run.durationMs / 1000);
            return `${run.startedAt} ${run.status} (${seconds}s) ${detail}`;
        }
        catch {
            return line.slice(0, 160);
        }
    };

    try {
        const entries = readFileSync(logPath, 'utf-8').trim().split('\n').filter(Boolean);
        const out = [];
        for (const entry of entries.slice(-limit)) {
            out.push(summarise(entry));
        }
        return out.join('\n');
    }
    catch {
        return '(failed to read run log)';
    }
}
|
|
133
|
+
/**
 * Assemble the diagnostic prompt for one broken job.
 *
 * The prompt is built as a list of sections; optional sections (advisor
 * notes, agent profile, distinct errors) are omitted when their data is
 * absent, and the remaining sections are separated by a single blank line.
 * (Filtering the flat line list with Boolean, as before, also removed every
 * intentional blank separator and fused all sections together.)
 *
 * @param {object} broken - broken-job record; reads jobName, agentSlug,
 *   errorCount48h, totalRuns48h, circuitBreakerEngagedAt, lastAdvisorOpinion
 *   and lastErrors (array of strings).
 * @param {string|null} jobDef - raw CRON.md entry text, or null if not found.
 * @param {string|null} agentProfile - truncated agent.md text, or null.
 * @param {string} recentRuns - pre-formatted run summaries.
 * @returns {string} the complete prompt text.
 */
function buildPrompt(broken, jobDef, agentProfile, recentRuns) {
    const breakerLine = broken.circuitBreakerEngagedAt
        ? `Circuit breaker engaged at ${broken.circuitBreakerEngagedAt}.`
        : 'No active circuit breaker.';
    // Each inner array is one section; null sections and null lines are skipped.
    const sections = [
        [
            `You are a reliability engineer diagnosing a cron job that's been failing in Clementine (a personal AI assistant framework).`,
            `Your output must be a single JSON object with the schema shown at the end. No preamble, no postscript.`,
        ],
        [
            `## Job name: ${broken.jobName}`,
            broken.agentSlug ? `## Agent scope: ${broken.agentSlug}` : '## Scope: global (no agent)',
            `## Failure stats: ${broken.errorCount48h}/${broken.totalRuns48h} runs failed in last 48h. ${breakerLine}`,
            broken.lastAdvisorOpinion ? `## Advisor notes: ${broken.lastAdvisorOpinion}` : null,
        ],
        [
            '## Job definition (CURRENT state of CRON.md):',
            // trimEnd keeps the entry's leading indent but avoids stacking blank lines.
            jobDef ? jobDef.trimEnd() : '(not found in CRON.md — may be a heartbeat pseudo-job like insight-check)',
        ],
        agentProfile
            ? ['## Agent profile (agent.md, truncated):', agentProfile.trimEnd()]
            : null,
        [
            '## Recent runs (oldest → newest):',
            recentRuns,
        ],
        broken.lastErrors.length > 0
            ? ['## Distinct recent errors:', ...broken.lastErrors.map(e => '- ' + e.slice(0, 400))]
            : null,
        ['## Critical reasoning rules'],
        ['**The CURRENT job definition above may differ from the config at the time of past failures.** If you see old errors (e.g. "timeout kill") but the current config ALREADY contains the fields that would have caused those errors, treat those errors as resolved by a recent fix — do NOT propose re-adding the fields that caused them.'],
        ['**Look at the MOST RECENT runs specifically.** If the last 2+ runs succeeded, the job has recovered — propose `escalate_to_owner` with "appears recovered, no fix needed" as details, confidence: high, risk: low.'],
        ['**Don\'t propose reverting a fix.** If the current config does NOT contain `mode: unleashed` but recent runs show "Claude Code process aborted by user", do NOT propose adding `mode: unleashed` back. That error pattern occurs in BOTH unleashed (hit max_hours) and standard (hit timeoutMs) modes. Without strong evidence the current config is wrong, prefer raising `timeoutMs` or adding `max_turns` over toggling `mode`.'],
        ['## Diagnostic patterns (use these as priors)'],
        [
            '- **"API 400 input_schema"** → external MCP server exposes a malformed tool. Propose checking claude_desktop_config.json and ~/.claude.json for recently-updated packages. Type: escalate_to_owner.',
            '- **401/403 errors** → credential refresh needed. Type: credential_refresh. Name the specific service if possible.',
            '- **"Claude Code process aborted by user" with long durations (>60s)** → timeout kill. If current config has `mode: unleashed`, propose removing it + adding `max_turns: 25`. If current config is already standard, propose raising `timeoutMs` or investigating the prompt for infinite loops.',
            '- **"Reached maximum number of turns (N)"** → maxTurns set too low for the job\'s tool fan-out. Propose raising `max_turns` to 3×N.',
            '- **Output preview contains BLOCKED / "no local bash" / "permission denied"** → agent picked the wrong tool. Propose either scoping the job to an agent whose allowedTools excludes the bad MCP, or adding explicit tool-choice guidance in the prompt.',
            '- **No clear pattern** → escalate_to_owner with what you would need to know.',
        ],
        ['## Auto-apply contract'],
        ['When (and ONLY when) the fix is a simple edit to one of these scalar fields — tier, mode, max_hours, max_turns, max_retries, enabled, agentSlug, work_dir, model, always_deliver, after, timeout_ms — also populate `proposedFix.autoApply`. The owner can one-click approve it from the dashboard.'],
        ['For multi-line fields (prompt, pre_check, context, success_criteria), or for credential refreshes, or any change you are not very confident about: OMIT autoApply entirely. The owner will handle those manually.'],
        [`If the job is agent-scoped (job name includes ":"), set autoApply.agentSlug to the part BEFORE the colon. Otherwise omit it (global CRON.md).`],
        ['Operations use the shape { "op": "set", "field": "<name>", "value": <scalar> } or { "op": "remove", "field": "<name>" }. Values are strings, numbers, or booleans.'],
        [
            'Examples:',
            '- Remove unleashed mode + its companion: operations: [{"op":"remove","field":"mode"}, {"op":"remove","field":"max_hours"}, {"op":"set","field":"max_turns","value":25}]',
            '- Scope a broken global job to Ross\'s profile: operations: [{"op":"set","field":"agentSlug","value":"ross-the-sdr"}]',
            '- Bump maxTurns on an under-resourced job: operations: [{"op":"set","field":"max_turns","value":10}]',
        ],
        [
            '## Output schema (JSON only, no markdown fences):',
            '{',
            ' "rootCause": "1-2 sentences explaining WHY the job is failing, referencing specific fields or error patterns from the CURRENT config",',
            ' "confidence": "high|medium|low",',
            ' "proposedFix": {',
            ' "type": "config_change|prompt_change|agent_scope|disable|credential_refresh|escalate_to_owner",',
            ' "details": "prose description of the fix, citing the exact field(s) to change",',
            ' "diff": "optional: exact before/after diff",',
            ' "autoApply": "optional: { agentSlug?, operations: [...] } — ONLY for simple scalar-field edits on the allowlist"',
            ' },',
            ' "riskLevel": "low|medium|high"',
            '}',
        ],
    ];
    return sections
        .filter(section => section !== null)
        .map(section => section.filter(line => line !== null).join('\n'))
        .join('\n\n');
}
|
|
201
|
+
// Allowed enum values, mirroring the Diagnosis type declaration.
const DIAGNOSIS_LEVELS = ['low', 'medium', 'high'];
const DIAGNOSIS_FIX_TYPES = [
    'config_change', 'prompt_change', 'agent_scope',
    'disable', 'credential_refresh', 'escalate_to_owner',
];
/**
 * Turn the model's raw text reply into a Diagnosis object.
 *
 * The JSON is taken as the span from the first "{" to the last "}" in the
 * reply, which tolerates markdown fences or prose around the object.
 * Free-text fields are stringified and length-capped; enum fields
 * (confidence, riskLevel, proposedFix.type) that are missing or not one of
 * the allowed values fall back to 'medium' / 'medium' / 'escalate_to_owner'
 * so downstream checks such as `riskLevel === 'low'` only ever see declared
 * values. autoApply is passed through sanitizeAutoApply and included only if
 * something valid remains.
 *
 * @param {string} raw - raw model output.
 * @returns {object|null} the diagnosis, or null when no JSON object is found,
 *   parsing fails, or rootCause / proposedFix is missing.
 */
function parseResponse(raw) {
    try {
        const match = raw.match(/\{[\s\S]*\}/);
        if (!match)
            return null;
        const parsed = JSON.parse(match[0]);
        if (!parsed.rootCause || !parsed.proposedFix)
            return null;
        const fix = parsed.proposedFix;
        const autoApply = sanitizeAutoApply(fix.autoApply);
        const oneOf = (value, allowed, fallback) => (allowed.includes(value) ? value : fallback);
        return {
            rootCause: String(parsed.rootCause).slice(0, 500),
            confidence: oneOf(parsed.confidence, DIAGNOSIS_LEVELS, 'medium'),
            proposedFix: {
                type: oneOf(fix.type, DIAGNOSIS_FIX_TYPES, 'escalate_to_owner'),
                details: String(fix.details ?? '').slice(0, 800),
                diff: fix.diff ? String(fix.diff).slice(0, 1000) : undefined,
                ...(autoApply ? { autoApply } : {}),
            },
            riskLevel: oneOf(parsed.riskLevel, DIAGNOSIS_LEVELS, 'medium'),
            generatedAt: new Date().toISOString(),
        };
    }
    catch (err) {
        logger.warn({ err }, 'Failed to parse diagnostic JSON');
        return null;
    }
}
|
|
230
|
+
/**
 * Strictly validate and filter a model-proposed autoApply payload.
 *
 * Keeps only operations that
 *   - target a field listed in EDITABLE_FIELDS, and
 *   - are either { op: 'remove' } or { op: 'set' } with a scalar value:
 *     a boolean, a finite number, or a string with no line breaks.
 * Invalid operations are dropped silently rather than rejecting the whole
 * diagnosis. Multi-line strings are rejected because the allowlist is meant
 * for single-line scalar fields only; a value carrying a line break is not a
 * simple scalar edit. agentSlug is kept only when it consists solely of
 * letters, digits and hyphens.
 *
 * @param {unknown} raw - the untrusted `proposedFix.autoApply` value.
 * @returns {{agentSlug?: string, operations: object[]}|null} the sanitized
 *   payload, or null if nothing valid remains.
 */
function sanitizeAutoApply(raw) {
    if (!raw || typeof raw !== 'object')
        return null;
    if (!Array.isArray(raw.operations))
        return null;
    // CR, LF and the Unicode line separators (NEL, LS, PS).
    const LINE_BREAK = /[\r\n\u0085\u2028\u2029]/;
    const isSafeScalar = (v) => typeof v === 'boolean'
        || (typeof v === 'number' && Number.isFinite(v))
        || (typeof v === 'string' && !LINE_BREAK.test(v));
    const operations = [];
    for (const candidate of raw.operations) {
        if (!candidate || typeof candidate !== 'object')
            continue;
        const { op, field, value } = candidate;
        if (typeof field !== 'string' || !EDITABLE_FIELDS.has(field))
            continue;
        if (op === 'remove') {
            operations.push({ op: 'remove', field });
        }
        else if (op === 'set' && isSafeScalar(value)) {
            operations.push({ op: 'set', field, value });
        }
    }
    if (operations.length === 0)
        return null;
    const slugOk = typeof raw.agentSlug === 'string' && /^[a-z0-9-]+$/i.test(raw.agentSlug);
    return slugOk ? { agentSlug: raw.agentSlug, operations } : { operations };
}
|
|
267
|
+
/**
 * Diagnose one broken job.
 *
 * Returns the cached diagnosis when one exists and is younger than
 * CACHE_TTL_MS; otherwise builds a prompt from the job definition, agent
 * profile and recent runs, sends it through `gateway.handleCronJob`, parses
 * the reply and stores the result in the cache. Best-effort: a failed LLM
 * call or an unparseable reply is logged and yields null.
 *
 * @param {object} broken - broken-job record (reads jobName and agentSlug
 *   here; the whole record is passed to buildPrompt).
 * @param {object} gateway - must provide
 *   handleCronJob(name, prompt, tier, maxTurns, model).
 * @returns {Promise<object|null>} the diagnosis, or null.
 */
export async function diagnoseBrokenJob(broken, gateway) {
    const jobName = broken.jobName;
    const cached = loadCache()[jobName];
    if (cached) {
        const age = Date.now() - Date.parse(cached.generatedAt);
        if (Number.isFinite(age) && age < CACHE_TTL_MS) {
            logger.debug({ job: jobName, ageMin: Math.round(age / 60000) }, 'Using cached diagnosis');
            return cached;
        }
    }
    const jobDef = readJobDefinition(jobName);
    const agentProfile = broken.agentSlug ? readAgentProfile(broken.agentSlug) : null;
    const recentRuns = readRecentRuns(jobName, 10);
    const prompt = buildPrompt(broken, jobDef, agentProfile, recentRuns);
    let rawResponse;
    try {
        rawResponse = await gateway.handleCronJob(`diagnose:${jobName}`, prompt, 1, // tier 1 — cheap
        5, // maxTurns — diagnosis doesn't need tools typically
        'haiku');
    }
    catch (err) {
        logger.warn({ err, job: jobName }, 'Diagnostic LLM call failed');
        return null;
    }
    const diagnosis = parseResponse(rawResponse);
    if (!diagnosis) {
        // The reply may not be a string at all; stringify before slicing so
        // logging can never throw.
        const rawHead = String(rawResponse ?? '').slice(0, 200);
        logger.warn({ job: jobName, rawHead }, 'Diagnosis returned unparseable response');
        return null;
    }
    // Re-read the cache before writing: the LLM call above is awaited, so the
    // file may have changed meanwhile (another diagnosis stored, or an entry
    // cleared). Writing back the snapshot taken before the await would undo
    // those changes.
    const latest = loadCache();
    latest[jobName] = diagnosis;
    saveCache(latest);
    logger.info({
        job: jobName,
        confidence: diagnosis.confidence,
        fixType: diagnosis.proposedFix.type,
    }, 'Broken-job diagnosis generated');
    return diagnosis;
}
|
|
310
|
+
/**
 * Remove the cached diagnosis for a job, if one is stored. The cache file is
 * rewritten only when an entry was actually removed.
 *
 * @param {string} jobName - cache key of the job.
 */
export function clearDiagnosis(jobName) {
    const entries = loadCache();
    if (!entries[jobName])
        return;
    delete entries[jobName];
    saveCache(entries);
}
|
|
321
|
+
/**
 * Read-only accessor for the dashboard: look up the cached diagnosis for a
 * job and return it only while it is younger than CACHE_TTL_MS.
 *
 * @param {string} jobName - cache key of the job.
 * @returns {object|null} the cached diagnosis, or null when there is none,
 *   its generatedAt timestamp is unparseable, or it has expired.
 */
export function getDiagnosisIfFresh(jobName) {
    const entry = loadCache()[jobName];
    if (!entry)
        return null;
    const ageMs = Date.now() - Date.parse(entry.generatedAt);
    const stillFresh = Number.isFinite(ageMs) && ageMs < CACHE_TTL_MS;
    return stillFresh ? entry : null;
}
|
|
332
|
+
//# sourceMappingURL=failure-diagnostics.js.map
|
|
@@ -24,6 +24,29 @@ export interface BrokenJob {
|
|
|
24
24
|
lastErrors: string[];
|
|
25
25
|
circuitBreakerEngagedAt: string | null;
|
|
26
26
|
lastAdvisorOpinion: string | null;
|
|
27
|
+
/** Populated asynchronously by the diagnostic agent when available. */
|
|
28
|
+
diagnosis?: {
|
|
29
|
+
rootCause: string;
|
|
30
|
+
confidence: 'high' | 'medium' | 'low';
|
|
31
|
+
proposedFix: {
|
|
32
|
+
type: string;
|
|
33
|
+
details: string;
|
|
34
|
+
diff?: string;
|
|
35
|
+
autoApply?: {
|
|
36
|
+
agentSlug?: string;
|
|
37
|
+
operations: Array<{
|
|
38
|
+
op: 'set';
|
|
39
|
+
field: string;
|
|
40
|
+
value: string | number | boolean;
|
|
41
|
+
} | {
|
|
42
|
+
op: 'remove';
|
|
43
|
+
field: string;
|
|
44
|
+
}>;
|
|
45
|
+
};
|
|
46
|
+
};
|
|
47
|
+
riskLevel: 'low' | 'medium' | 'high';
|
|
48
|
+
generatedAt: string;
|
|
49
|
+
};
|
|
27
50
|
}
|
|
28
51
|
/**
|
|
29
52
|
* Compute the current set of broken jobs by scanning all run logs.
|
|
@@ -32,9 +55,14 @@ export interface BrokenJob {
|
|
|
32
55
|
export declare function computeBrokenJobs(now?: number): BrokenJob[];
|
|
33
56
|
/**
|
|
34
57
|
* Run a sweep: identify currently-broken jobs, pick the ones we haven't
|
|
35
|
-
* notified about recently,
|
|
58
|
+
* notified about recently, invoke the diagnostic agent for new entries,
|
|
59
|
+
* and dispatch one consolidated DM.
|
|
60
|
+
*
|
|
61
|
+
* `gateway` is optional — omitted for tests that want to skip the LLM call.
|
|
62
|
+
* When present, we diagnose fresh broken jobs before notifying, so the
|
|
63
|
+
* report includes a root-cause + proposed fix for each.
|
|
36
64
|
*
|
|
37
65
|
* Returns the jobs that triggered a fresh notification (mostly for tests/logs).
|
|
38
66
|
*/
|
|
39
|
-
export declare function runFailureSweep(send: (text: string) => Promise<unknown>, now?: number): Promise<BrokenJob[]>;
|
|
67
|
+
export declare function runFailureSweep(send: (text: string) => Promise<unknown>, gateway?: import('./router.js').Gateway, now?: number): Promise<BrokenJob[]>;
|
|
40
68
|
//# sourceMappingURL=failure-monitor.d.ts.map
|
|
@@ -257,8 +257,30 @@ export function computeBrokenJobs(now = Date.now()) {
|
|
|
257
257
|
const bT = b.lastErrorAt ? Date.parse(b.lastErrorAt) : 0;
|
|
258
258
|
return bT - aT;
|
|
259
259
|
});
|
|
260
|
+
// Attach any cached diagnosis (fresh within 24h). Reads the cache file
|
|
261
|
+
// directly — avoids circular imports with failure-diagnostics.
|
|
262
|
+
attachCachedDiagnoses(broken, now);
|
|
260
263
|
return broken;
|
|
261
264
|
}
|
|
265
|
+
const DIAGNOSTICS_CACHE_FILE = path.join(BASE_DIR, 'cron', 'failure-diagnostics.json');
|
|
266
|
+
const DIAGNOSIS_TTL_MS = 24 * 60 * 60 * 1000;
|
|
267
|
+
/**
 * Decorate broken-job records with their cached diagnosis, when one exists
 * and is younger than DIAGNOSIS_TTL_MS relative to `now`.
 *
 * Mutates the entries of `jobs` in place (sets `job.diagnosis`). A missing,
 * unreadable or malformed cache file leaves the jobs untouched.
 *
 * @param {Array<{jobName: string, diagnosis?: object}>} jobs - Jobs to decorate.
 * @param {number} now - Reference time in epoch milliseconds.
 */
function attachCachedDiagnoses(jobs, now) {
    if (!existsSync(DIAGNOSTICS_CACHE_FILE)) {
        return;
    }
    try {
        const cached = JSON.parse(readFileSync(DIAGNOSTICS_CACHE_FILE, 'utf-8'));
        jobs.forEach((job) => {
            const entry = cached[job.jobName];
            if (!entry) {
                return;
            }
            const ageMs = now - Date.parse(entry.generatedAt);
            if (Number.isFinite(ageMs) && ageMs < DIAGNOSIS_TTL_MS) {
                job.diagnosis = entry;
            }
        });
    }
    catch { /* cache may be malformed — ignore */ }
}
|
|
262
284
|
/**
|
|
263
285
|
* The self-improve loop writes to its own experiment-log.jsonl, not cron/runs/.
|
|
264
286
|
* Its breakage pattern is: state.lastRunAt keeps getting updated nightly but
|
|
@@ -351,12 +373,28 @@ function formatReport(jobs) {
|
|
|
351
373
|
for (const j of jobs) {
|
|
352
374
|
const breaker = j.circuitBreakerEngagedAt ? ' · circuit breaker engaged' : '';
|
|
353
375
|
lines.push(`• \`${j.jobName}\` — ${j.errorCount48h}/${j.totalRuns48h} runs failed${breaker}`);
|
|
354
|
-
|
|
355
|
-
|
|
356
|
-
|
|
376
|
+
// Prefer the diagnostic agent's analysis when available — it's more
|
|
377
|
+
// actionable than the raw error. Fall back to error + advisor lines.
|
|
378
|
+
if (j.diagnosis) {
|
|
379
|
+
const conf = j.diagnosis.confidence === 'high' ? '' : ` (${j.diagnosis.confidence} confidence)`;
|
|
380
|
+
lines.push(` **Cause${conf}:** ${j.diagnosis.rootCause.slice(0, 240)}`);
|
|
381
|
+
lines.push(` **Proposed fix:** ${j.diagnosis.proposedFix.details.slice(0, 240)}`);
|
|
382
|
+
if (j.diagnosis.proposedFix.diff) {
|
|
383
|
+
// Show a short diff preview inline; full diff in the dashboard.
|
|
384
|
+
const diffShort = j.diagnosis.proposedFix.diff.split('\n').slice(0, 4).join('\n');
|
|
385
|
+
lines.push(' ```diff');
|
|
386
|
+
lines.push(' ' + diffShort.replace(/\n/g, '\n '));
|
|
387
|
+
lines.push(' ```');
|
|
388
|
+
}
|
|
357
389
|
}
|
|
358
|
-
|
|
359
|
-
|
|
390
|
+
else {
|
|
391
|
+
if (j.lastErrors.length > 0) {
|
|
392
|
+
const preview = j.lastErrors[0].split('\n')[0].slice(0, 140);
|
|
393
|
+
lines.push(` Last error: ${preview}`);
|
|
394
|
+
}
|
|
395
|
+
if (j.lastAdvisorOpinion) {
|
|
396
|
+
lines.push(` Advisor: ${j.lastAdvisorOpinion.slice(0, 140)}`);
|
|
397
|
+
}
|
|
360
398
|
}
|
|
361
399
|
}
|
|
362
400
|
lines.push('');
|
|
@@ -365,24 +403,40 @@ function formatReport(jobs) {
|
|
|
365
403
|
}
|
|
366
404
|
/**
|
|
367
405
|
* Run a sweep: identify currently-broken jobs, pick the ones we haven't
|
|
368
|
-
* notified about recently,
|
|
406
|
+
* notified about recently, invoke the diagnostic agent for new entries,
|
|
407
|
+
* and dispatch one consolidated DM.
|
|
408
|
+
*
|
|
409
|
+
* `gateway` is optional — omitted for tests that want to skip the LLM call.
|
|
410
|
+
* When present, we diagnose fresh broken jobs before notifying, so the
|
|
411
|
+
* report includes a root-cause + proposed fix for each.
|
|
369
412
|
*
|
|
370
413
|
* Returns the jobs that triggered a fresh notification (mostly for tests/logs).
|
|
371
414
|
*/
|
|
372
|
-
export async function runFailureSweep(send, now = Date.now()) {
|
|
415
|
+
export async function runFailureSweep(send, gateway, now = Date.now()) {
|
|
373
416
|
const broken = computeBrokenJobs(now);
|
|
374
417
|
if (broken.length === 0) {
|
|
375
|
-
// Clear cooldowns for jobs that recovered
|
|
418
|
+
// Clear cooldowns AND diagnostic cache entries for jobs that recovered.
|
|
376
419
|
const state = loadState();
|
|
377
420
|
let mutated = false;
|
|
421
|
+
const healedJobs = [];
|
|
378
422
|
for (const name of Object.keys(state.notified)) {
|
|
379
423
|
if (!broken.find(b => b.jobName === name)) {
|
|
380
424
|
delete state.notified[name];
|
|
425
|
+
healedJobs.push(name);
|
|
381
426
|
mutated = true;
|
|
382
427
|
}
|
|
383
428
|
}
|
|
384
429
|
if (mutated)
|
|
385
430
|
saveState(state);
|
|
431
|
+
// Opportunistically drop diagnosis cache for healed jobs
|
|
432
|
+
if (healedJobs.length > 0) {
|
|
433
|
+
try {
|
|
434
|
+
const { clearDiagnosis } = await import('./failure-diagnostics.js');
|
|
435
|
+
for (const name of healedJobs)
|
|
436
|
+
clearDiagnosis(name);
|
|
437
|
+
}
|
|
438
|
+
catch { /* non-fatal */ }
|
|
439
|
+
}
|
|
386
440
|
return [];
|
|
387
441
|
}
|
|
388
442
|
const state = loadState();
|
|
@@ -396,6 +450,29 @@ export async function runFailureSweep(send, now = Date.now()) {
|
|
|
396
450
|
}
|
|
397
451
|
if (fresh.length === 0)
|
|
398
452
|
return [];
|
|
453
|
+
// Diagnose fresh broken jobs before DMing. Each call is cached 24h, so a
|
|
454
|
+
// recurring failure doesn't re-invoke the LLM. Diagnosis is best-effort —
|
|
455
|
+
// if it fails or the gateway isn't wired, the report still goes out.
|
|
456
|
+
if (gateway) {
|
|
457
|
+
try {
|
|
458
|
+
const { diagnoseBrokenJob } = await import('./failure-diagnostics.js');
|
|
459
|
+
for (const job of fresh) {
|
|
460
|
+
if (job.diagnosis)
|
|
461
|
+
continue; // already attached from cache
|
|
462
|
+
try {
|
|
463
|
+
const d = await diagnoseBrokenJob(job, gateway);
|
|
464
|
+
if (d)
|
|
465
|
+
job.diagnosis = d;
|
|
466
|
+
}
|
|
467
|
+
catch (err) {
|
|
468
|
+
logger.warn({ err, job: job.jobName }, 'Diagnosis attempt failed');
|
|
469
|
+
}
|
|
470
|
+
}
|
|
471
|
+
}
|
|
472
|
+
catch (err) {
|
|
473
|
+
logger.warn({ err }, 'Failed to load diagnostics module');
|
|
474
|
+
}
|
|
475
|
+
}
|
|
399
476
|
try {
|
|
400
477
|
await send(formatReport(fresh));
|
|
401
478
|
const stamp = new Date(now).toISOString();
|
|
@@ -0,0 +1,34 @@
|
|
|
1
|
+
/**
|
|
2
|
+
* Clementine TypeScript — Deterministic cron job fix applier.
|
|
3
|
+
*
|
|
4
|
+
* Applies the `autoApply` operations from a Diagnosis to a CRON.md file
|
|
5
|
+
* (global or agent-scoped). Strictly scoped to:
|
|
6
|
+
* - Allowlisted scalar fields only (enforced by the diagnostics module
|
|
7
|
+
* before they arrive here, and re-checked here for safety).
|
|
8
|
+
* - A single job's YAML block, identified by `- name: <jobName>`.
|
|
9
|
+
* - Line-level edits — never touches multi-line fields like `prompt`.
|
|
10
|
+
*
|
|
11
|
+
* Every apply writes a .bak next to the CRON.md and appends to an audit
|
|
12
|
+
* log before touching the file.
|
|
13
|
+
*/
|
|
14
|
+
import { type FixOperation } from './failure-diagnostics.js';
|
|
15
|
+
export interface ApplyResult {
|
|
16
|
+
ok: boolean;
|
|
17
|
+
message: string;
|
|
18
|
+
file?: string;
|
|
19
|
+
appliedOps?: FixOperation[];
|
|
20
|
+
skippedOps?: FixOperation[];
|
|
21
|
+
diff?: string;
|
|
22
|
+
}
|
|
23
|
+
/**
|
|
24
|
+
* Apply a proposed fix to the right CRON.md file. Idempotent with respect
|
|
25
|
+
* to already-applied ops (remove on a missing field is a no-op, set on a
|
|
26
|
+
* matching value is a no-op).
|
|
27
|
+
*/
|
|
28
|
+
export declare function applyFix(jobName: string, autoApply: {
|
|
29
|
+
agentSlug?: string;
|
|
30
|
+
operations: FixOperation[];
|
|
31
|
+
}, opts?: {
|
|
32
|
+
dryRun?: boolean;
|
|
33
|
+
}): ApplyResult;
|
|
34
|
+
//# sourceMappingURL=fix-applier.d.ts.map
|
|
@@ -0,0 +1,308 @@
|
|
|
1
|
+
/**
|
|
2
|
+
* Clementine TypeScript — Deterministic cron job fix applier.
|
|
3
|
+
*
|
|
4
|
+
* Applies the `autoApply` operations from a Diagnosis to a CRON.md file
|
|
5
|
+
* (global or agent-scoped). Strictly scoped to:
|
|
6
|
+
* - Allowlisted scalar fields only (enforced by the diagnostics module
|
|
7
|
+
* before they arrive here, and re-checked here for safety).
|
|
8
|
+
* - A single job's YAML block, identified by `- name: <jobName>`.
|
|
9
|
+
* - Line-level edits — never touches multi-line fields like `prompt`.
|
|
10
|
+
*
|
|
11
|
+
* Every apply writes a .bak next to the CRON.md and appends to an audit
|
|
12
|
+
* log before touching the file.
|
|
13
|
+
*/
|
|
14
|
+
import { appendFileSync, copyFileSync, existsSync, mkdirSync, readFileSync, writeFileSync, } from 'node:fs';
|
|
15
|
+
import path from 'node:path';
|
|
16
|
+
import pino from 'pino';
|
|
17
|
+
import { AGENTS_DIR, BASE_DIR, CRON_FILE } from '../config.js';
|
|
18
|
+
import { EDITABLE_FIELDS } from './failure-diagnostics.js';
|
|
19
|
+
const logger = pino({ name: 'clementine.fix-applier' });
|
|
20
|
+
const AUDIT_FILE = path.join(BASE_DIR, 'cron', 'fix-applier.log');
|
|
21
|
+
/**
 * Build the CRON.md path for an agent slug, or return null when the slug is
 * not a plain single directory name.
 *
 * The slug originates from LLM output (`autoApply.agentSlug`) or from the
 * prefix of a job name, so it is treated as untrusted: path separators, NUL
 * bytes and the `.` / `..` segments are rejected so the result can never
 * point outside AGENTS_DIR.
 *
 * @param {unknown} slug - Candidate agent directory name.
 * @returns {string|null} `<AGENTS_DIR>/<slug>/CRON.md`, or null if unsafe.
 */
function agentCronPath(slug) {
    if (typeof slug !== 'string' || slug === '' || slug === '.' || slug === '..') {
        return null;
    }
    if (/[\\/\0]/.test(slug)) {
        return null;
    }
    return path.join(AGENTS_DIR, slug, 'CRON.md');
}
/**
 * Resolve which CRON.md to edit for this job.
 *
 * Resolution order:
 *  1. If `autoApply.agentSlug` is set, only that agent's CRON.md is
 *     considered; a missing file or an unsafe slug yields null (no fallback).
 *  2. Otherwise, if the job name has an `<agent>:` prefix and that agent's
 *     CRON.md exists, use it.
 *  3. Otherwise fall back to the global CRON_FILE if it exists.
 *
 * @param {string} jobName - Job name, optionally prefixed with `<agent>:`.
 * @param {{agentSlug?: string}} autoApply - Auto-apply payload of a diagnosis.
 * @returns {string|null} Path of the CRON.md to edit, or null if none applies.
 */
function resolveCronFile(jobName, autoApply) {
    if (autoApply.agentSlug) {
        const explicit = agentCronPath(autoApply.agentSlug);
        if (explicit === null) {
            logger.warn({ agentSlug: autoApply.agentSlug }, 'rejected unsafe agentSlug');
            return null;
        }
        if (existsSync(explicit)) {
            return explicit;
        }
        logger.warn({ agentSlug: autoApply.agentSlug, expected: explicit }, 'agent-scoped CRON.md not found');
        return null;
    }
    // Infer from jobName prefix (e.g., "ross-the-sdr:reply-detection")
    if (jobName.includes(':')) {
        const inferred = agentCronPath(jobName.split(':')[0]);
        if (inferred !== null && existsSync(inferred)) {
            return inferred;
        }
    }
    return existsSync(CRON_FILE) ? CRON_FILE : null;
}
|
|
44
|
+
/**
 * Strip the `<agent>:` prefix from a job name.
 *
 * Agent-scoped cron jobs are written in their own file without the prefix;
 * the prefix is added programmatically when the scheduler merges them into
 * the global job list. Only the first colon is treated as the separator.
 *
 * @param {string} jobName - Possibly prefixed job name.
 * @returns {string} Everything after the first colon, or the input unchanged
 *   when it contains no colon.
 */
function bareJobName(jobName) {
    const colonAt = jobName.indexOf(':');
    if (colonAt < 0) {
        return jobName;
    }
    return jobName.substring(colonAt + 1);
}
|
|
53
|
+
/**
 * Find the line range of a job's YAML block in a CRON.md file.
 *
 * A block starts at `  - name: <bareName>` (two-space indent; the name may
 * be wrapped in matching single or double quotes) and runs until the first
 * following line that has left the item body: any non-blank, non-comment
 * line indented by fewer than four spaces. That covers both the next list
 * item and the end of the jobs array (e.g. the closing `---` of the front
 * matter), so content after the array is never treated as part of the job.
 *
 * Indentation is written with explicit `{n}` quantifiers so the intended
 * widths are unambiguous.
 *
 * @param {string[]} lines - File content split into lines.
 * @param {string} bareName - Job name without any agent prefix.
 * @returns {{start: number, end: number}|null} Half-open range
 *   `[start, end)` of the block, or null when the job is not present.
 */
function findJobBlock(lines, bareName) {
    const nameEsc = bareName.replace(/[.*+?^${}()|[\]\\]/g, '\\$&');
    const startRe = new RegExp(`^ {2}- name:\\s+(["']?)${nameEsc}\\1\\s*$`);
    // Non-blank, non-comment line with indent < 4: next item or end of array.
    const outsideItemRe = /^ {0,3}[^\s#]/;
    const start = lines.findIndex((line) => startRe.test(line));
    if (start === -1) {
        return null;
    }
    let end = lines.length;
    for (let i = start + 1; i < lines.length; i++) {
        if (outsideItemRe.test(lines[i])) {
            end = i;
            break;
        }
    }
    return { start, end };
}
|
|
81
|
+
/**
 * Search a job block for a top-level scalar field (four-space indent,
 * single line `key: value`).
 *
 * Lines belonging to a block scalar (`key: |`, `key: >`, with optional
 * chomping/indentation indicators such as `>-` or `|2`) are skipped: once
 * such a key is seen, every following line is ignored until a non-blank
 * line indented by four spaces or fewer appears. A field that is itself
 * declared as a block scalar is deliberately never reported, so multi-line
 * values such as `prompt` cannot be edited through this path.
 *
 * Only the `    key:` prefix is matched, so lines that still carry a
 * trailing carriage return (CRLF files) are found as well.
 *
 * @param {string[]} lines - File content split into lines.
 * @param {number} blockStart - Index of the block's `- name:` line.
 * @param {number} blockEnd - Exclusive end index of the block.
 * @param {string} field - Field name to look for.
 * @returns {number} Index of the field's line, or -1 if not present.
 */
function findFieldLine(lines, blockStart, blockEnd, field) {
    const fieldEsc = field.replace(/[.*+?^${}()|[\]\\]/g, '\\$&');
    const fieldRe = new RegExp(`^ {4}${fieldEsc}:`);
    // Block-scalar header: `    key: |`, `    key: >-`, `    key: |2+`, ...
    const multiLineStartRe = /^ {4}\w[\w-]*:\s*[|>][-+\d]*\s*$/;
    // Non-blank line at field indent or shallower: block-scalar content ended.
    const dedentRe = /^ {0,4}\S/;
    let inMultiLine = false;
    for (let i = blockStart + 1; i < blockEnd; i++) {
        const line = lines[i];
        if (inMultiLine) {
            if (!dedentRe.test(line)) {
                continue;
            }
            inMultiLine = false;
            // Fall through: this line may itself be the field we want.
        }
        if (multiLineStartRe.test(line)) {
            inMultiLine = true;
            continue;
        }
        if (fieldRe.test(line)) {
            return i;
        }
    }
    return -1;
}
|
|
115
|
+
/**
 * Serialize a scalar value to a single-line YAML scalar.
 *
 * Booleans and numbers are emitted bare. A string is emitted bare only when
 * it cannot be re-read as anything but that string: it must start with a
 * letter, `_` or `/`, contain only word characters, `-`, `.` and `/`, and
 * must not be one of the YAML boolean/null keywords (`true`, `false`, `yes`,
 * `no`, `on`, `off`, `y`, `n`, `null`, any case). Everything else — the
 * empty string, values with colons or spaces, values starting with a dash,
 * dot or digit — is double-quoted so its type survives a round trip.
 *
 * Inside quotes, backslashes, double quotes and control characters
 * (including newlines) are escaped, so the result never spans more than one
 * line.
 *
 * @param {string|number|boolean} value - Value to serialize.
 * @returns {string} YAML text for the value, without a trailing newline.
 */
function yamlScalar(value) {
    if (typeof value === 'boolean') {
        return value ? 'true' : 'false';
    }
    if (typeof value === 'number') {
        return String(value);
    }
    const s = String(value);
    const looksPlain = /^[A-Za-z_/][\w\-./]*$/.test(s);
    const isKeyword = /^(?:true|false|yes|no|on|off|y|n|null)$/i.test(s);
    if (looksPlain && !isKeyword) {
        return s;
    }
    const namedEscapes = { '\n': '\\n', '\r': '\\r', '\t': '\\t' };
    const escaped = s
        .replace(/\\/g, '\\\\')
        .replace(/"/g, '\\"')
        .replace(/[\u0000-\u001f\u007f\u0085\u2028\u2029]/g, (ch) => {
            if (namedEscapes[ch] !== undefined) {
                return namedEscapes[ch];
            }
            const code = ch.charCodeAt(0);
            return code <= 0xff
                ? `\\x${code.toString(16).padStart(2, '0')}`
                : `\\u${code.toString(16).padStart(4, '0')}`;
        });
    return `"${escaped}"`;
}
|
|
131
|
+
/**
 * Apply operations to a single job block and return the edited lines.
 *
 * The input array is not modified; edits are made on a copy. Each operation
 * ends up in exactly one of the two result lists:
 *  - `applied`: the operation changed the file content;
 *  - `skipped`: the field is not in EDITABLE_FIELDS (defense in depth — the
 *    diagnostics parser filters these too), the operation is malformed or of
 *    an unknown kind, a `remove` targets a field that is absent, or a `set`
 *    would write a line identical to the existing one.
 *
 * New fields are inserted directly after the `- name:` line, using the
 * four-space field indent.
 *
 * @param {string[]} lines - File content split into lines.
 * @param {{start: number, end: number}} block - Half-open range of the job block.
 * @param {Array<{op: string, field: string, value?: string|number|boolean}>} operations
 * @returns {{newLines: string[], applied: object[], skipped: object[]}}
 */
function applyOperations(lines, block, operations) {
    const FIELD_INDENT = ' '.repeat(4);
    // Work on a copy; blockEnd tracks the block's end as lines are added/removed.
    const working = lines.slice();
    let blockEnd = block.end;
    const applied = [];
    const skipped = [];
    for (const op of operations) {
        if (!op || typeof op.field !== 'string' || !EDITABLE_FIELDS.has(op.field)) {
            skipped.push(op);
            continue;
        }
        const existing = findFieldLine(working, block.start, blockEnd, op.field);
        if (op.op === 'remove') {
            if (existing === -1) {
                skipped.push(op); // nothing to remove
                continue;
            }
            working.splice(existing, 1);
            blockEnd -= 1;
            applied.push(op);
        }
        else if (op.op === 'set') {
            const newLine = `${FIELD_INDENT}${op.field}: ${yamlScalar(op.value)}`;
            if (existing === -1) {
                // Insert right after the name line so field order stays predictable.
                working.splice(block.start + 1, 0, newLine);
                blockEnd += 1;
            }
            else if (working[existing].trimEnd() === newLine) {
                skipped.push(op); // already has this exact value
                continue;
            }
            else {
                working[existing] = newLine;
            }
            applied.push(op);
        }
        else {
            skipped.push(op); // unknown operation kind
        }
    }
    return { newLines: working, applied, skipped };
}
|
|
172
|
+
/**
 * Produce a compact, human-readable diff of the scalar fields of one job
 * block, for confirm dialogs and audit logs.
 *
 * Only the single-line fields returned by extractScalarFields are compared;
 * multi-line content such as embedded prompts is ignored. The first output
 * line is a `@@ … @@` header built from the block's first line in `after`.
 *
 * @param {string[]} before - Lines of the file prior to the edit.
 * @param {string[]} after - Lines of the file after the edit.
 * @param {number} blockStart - Index of the block's first line (same in both).
 * @param {number} newBlockEnd - Exclusive end index of the block in `after`.
 * @returns {string} Newline-joined diff text.
 */
function makeDiff(before, after, blockStart, newBlockEnd) {
    const oldFields = extractScalarFields(before.slice(blockStart, findBlockEnd(before, blockStart)));
    const newFields = extractScalarFields(after.slice(blockStart, newBlockEnd));
    const out = [`@@ ${after[blockStart].trim()} @@`];
    const keys = new Set([...oldFields.keys(), ...newFields.keys()]);
    keys.forEach((key) => {
        const was = oldFields.get(key);
        const now = newFields.get(key);
        if (was === now) {
            return;
        }
        if (was !== undefined) {
            out.push(`- ${was}`);
        }
        if (now !== undefined) {
            out.push(`+ ${now}`);
        }
    });
    return out.join('\n');
}
|
|
197
|
+
/**
 * Extract the single-line scalar `    key: value` fields of a job block.
 *
 * Only lines indented by exactly four spaces are considered, so the
 * `  - name:` line is never included. Block-scalar headers (`key: |`,
 * `key: >`, with optional indicators) and their content are skipped; the
 * content ends at the first non-blank line indented by four spaces or
 * fewer. A trailing carriage return is stripped from each line before it is
 * stored.
 *
 * @param {string[]} blockLines - The lines of one job block.
 * @returns {Map<string, string>} Field name → full source line, in file
 *   order; a repeated key keeps its last occurrence.
 */
function extractScalarFields(blockLines) {
    const out = new Map();
    const scalarRe = /^ {4}([\w-]+):/;
    const multiStartRe = /^ {4}[\w-]+:\s*[|>][-+\d]*\s*$/;
    // Non-blank line at field indent or shallower: block-scalar content ended.
    const dedentRe = /^ {0,4}\S/;
    let inMulti = false;
    for (const rawLine of blockLines) {
        const line = rawLine.replace(/\r$/, '');
        if (inMulti) {
            if (!dedentRe.test(line)) {
                continue;
            }
            inMulti = false;
        }
        if (multiStartRe.test(line)) {
            inMulti = true;
            continue;
        }
        const m = scalarRe.exec(line);
        if (m) {
            out.set(m[1], line);
        }
    }
    return out;
}
|
|
226
|
+
/**
 * Return the exclusive end index of the job block that starts at `start`.
 *
 * The block ends at the first following non-blank, non-comment line that is
 * indented by fewer than four spaces — i.e. the next `  - …` list item or
 * the end of the jobs array (such as the closing `---` of the front
 * matter) — or at the end of the file when no such line exists.
 *
 * @param {string[]} lines - File content split into lines.
 * @param {number} start - Index of the block's `- name:` line.
 * @returns {number} Index one past the block's last line.
 */
function findBlockEnd(lines, start) {
    const outsideItemRe = /^ {0,3}[^\s#]/;
    for (let i = start + 1; i < lines.length; i++) {
        if (outsideItemRe.test(lines[i])) {
            return i;
        }
    }
    return lines.length;
}
|
|
234
|
+
/**
 * Apply a proposed fix to the right CRON.md file.
 *
 * Steps: resolve the target file, locate the job's block, apply the
 * operations in memory, and — unless `opts.dryRun` is set — copy the file to
 * `<file>.bak`, write the new content, and append an audit record.
 *
 * Safety properties:
 *  - The `<agent>:` prefix is only stripped from the job name when an
 *    agent-scoped file was resolved. If resolution fell back to the global
 *    CRON_FILE, the full name is searched, so a prefixed job can never be
 *    confused with an unrelated global job sharing its suffix.
 *  - Nothing is written when the backup copy cannot be created.
 *  - The file's line-ending convention (LF or CRLF) is preserved.
 *
 * Already-satisfied operations are reported in `skippedOps`; when no
 * operation changes anything, the file is left untouched and `ok` is false.
 *
 * @param {string} jobName - Job name, optionally prefixed with `<agent>:`.
 * @param {{agentSlug?: string, operations: object[]}} autoApply - Operations to apply.
 * @param {{dryRun?: boolean}} [opts] - With `dryRun`, compute the result and
 *   diff without touching the file system beyond reading the CRON.md.
 * @returns {{ok: boolean, message: string, file?: string,
 *   appliedOps?: object[], skippedOps?: object[], diff?: string}}
 * @throws Propagates file-system errors from reading or writing the CRON.md.
 */
export function applyFix(jobName, autoApply, opts = {}) {
    if (!autoApply || !Array.isArray(autoApply.operations)) {
        return { ok: false, message: `No operations to apply for ${jobName}` };
    }
    const cronFile = resolveCronFile(jobName, autoApply);
    if (!cronFile) {
        return { ok: false, message: `No CRON.md found for ${jobName}` };
    }
    // Global jobs are stored under their full name; only agent files omit the prefix.
    const bare = cronFile === CRON_FILE ? jobName : bareJobName(jobName);
    const original = readFileSync(cronFile, 'utf-8');
    const eol = original.includes('\r\n') ? '\r\n' : '\n';
    const lines = original.split(/\r?\n/);
    const block = findJobBlock(lines, bare);
    if (!block) {
        return { ok: false, message: `Job '${bare}' not found in ${cronFile}`, file: cronFile };
    }
    const { newLines, applied, skipped } = applyOperations(lines, block, autoApply.operations);
    if (applied.length === 0) {
        return {
            ok: false,
            message: 'Nothing to apply (all ops were no-ops or on disallowed fields)',
            file: cronFile,
            appliedOps: applied,
            skippedOps: skipped,
        };
    }
    const newBlockEnd = findBlockEnd(newLines, block.start);
    const diff = makeDiff(lines, newLines, block.start, newBlockEnd);
    if (opts.dryRun) {
        return {
            ok: true,
            message: `Dry run: ${applied.length} op(s) would apply`,
            file: cronFile,
            appliedOps: applied,
            skippedOps: skipped,
            diff,
        };
    }
    // Backup before write; refuse to edit a file we cannot restore.
    try {
        copyFileSync(cronFile, cronFile + '.bak');
    }
    catch (err) {
        logger.warn({ err, file: cronFile }, 'Failed to write .bak before applying fix');
        return {
            ok: false,
            message: `Could not write backup for ${path.basename(cronFile)}; fix not applied`,
            file: cronFile,
            appliedOps: [],
            skippedOps: skipped,
            diff,
        };
    }
    writeFileSync(cronFile, newLines.join(eol));
    appendAudit({
        jobName,
        file: cronFile,
        applied,
        skipped,
        diff,
    });
    logger.info({ jobName, file: cronFile, applied: applied.length }, 'Applied cron job fix');
    return {
        ok: true,
        message: `Applied ${applied.length} op(s) to ${path.basename(cronFile)}`,
        file: cronFile,
        appliedOps: applied,
        skippedOps: skipped,
        diff,
    };
}
|
|
299
|
+
/**
 * Append one JSON line describing an applied fix to AUDIT_FILE.
 *
 * The entry is extended with an ISO-8601 `timestamp`. The audit directory is
 * created on demand. Any failure is logged as a warning and otherwise
 * ignored, so auditing never makes an apply fail.
 *
 * @param {object} entry - Fields to record (job name, file, operations, diff).
 */
function appendAudit(entry) {
    try {
        mkdirSync(path.dirname(AUDIT_FILE), { recursive: true });
        const record = { ...entry, timestamp: new Date().toISOString() };
        appendFileSync(AUDIT_FILE, `${JSON.stringify(record)}\n`);
    }
    catch (err) {
        logger.warn({ err }, 'Failed to append fix-applier audit');
    }
}
|
|
308
|
+
//# sourceMappingURL=fix-applier.js.map
|
|
@@ -105,8 +105,10 @@ export class HeartbeatScheduler {
|
|
|
105
105
|
}
|
|
106
106
|
// Cron failure sweep — surface jobs that have been silently failing.
|
|
107
107
|
// Runs every tick; per-job 24h cooldown lives inside the monitor.
|
|
108
|
+
// Passes the gateway so freshly-broken jobs get a diagnostic LLM call
|
|
109
|
+
// (cached 24h) before the DM goes out.
|
|
108
110
|
import('./failure-monitor.js').then(({ runFailureSweep }) => {
|
|
109
|
-
runFailureSweep((text) => this.dispatcher.send(text, {})).catch(err => {
|
|
111
|
+
runFailureSweep((text) => this.dispatcher.send(text, {}), this.gateway).catch(err => {
|
|
110
112
|
logger.warn({ err }, 'Failure sweep failed');
|
|
111
113
|
});
|
|
112
114
|
}).catch(err => logger.warn({ err }, 'Failure sweep import failed'));
|