clementine-agent 1.0.16 → 1.0.18

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -16201,11 +16201,39 @@ async function refreshBrokenJobs() {
16201
16201
  var agentTag = j.agentSlug
16202
16202
  ? '<span class="badge badge-blue" style="font-size:10px">' + esc(j.agentSlug) + '</span>'
16203
16203
  : '';
16204
+
16205
+ // Diagnosis block — root cause + proposed fix + diff preview.
16206
+ var diagnosisHtml = '';
16207
+ if (j.diagnosis) {
16208
+ var riskColor = j.diagnosis.riskLevel === 'high' ? '#ef4444'
16209
+ : j.diagnosis.riskLevel === 'medium' ? '#f59e0b' : '#22c55e';
16210
+ var confLabel = j.diagnosis.confidence !== 'high'
16211
+ ? ' <span style="font-size:10px;color:var(--text-muted)">(' + esc(j.diagnosis.confidence) + ' confidence)</span>'
16212
+ : '';
16213
+ var diffHtml = '';
16214
+ if (j.diagnosis.proposedFix.diff) {
16215
+ diffHtml = '<pre style="font-size:11px;background:#0f172a;color:#e2e8f0;padding:8px;border-radius:4px;margin:6px 0 0;white-space:pre-wrap;word-break:break-word;max-height:200px;overflow-y:auto">'
16216
+ + esc(j.diagnosis.proposedFix.diff) + '</pre>';
16217
+ }
16218
+ diagnosisHtml = '<div style="margin-top:10px;padding:10px;border-left:3px solid ' + riskColor
16219
+ + ';background:var(--bg-tertiary);border-radius:4px">'
16220
+ + '<div style="font-size:12px;margin-bottom:4px"><strong>Root cause' + confLabel + ':</strong> '
16221
+ + esc(j.diagnosis.rootCause) + '</div>'
16222
+ + '<div style="font-size:12px"><strong>Proposed fix:</strong> '
16223
+ + esc(j.diagnosis.proposedFix.details) + '</div>'
16224
+ + diffHtml
16225
+ + '<div style="font-size:10px;color:var(--text-muted);margin-top:6px">'
16226
+ + esc(j.diagnosis.proposedFix.type) + ' \\u00b7 ' + esc(j.diagnosis.riskLevel) + ' risk \\u00b7 diagnosed ' + timeAgo(j.diagnosis.generatedAt)
16227
+ + '</div>'
16228
+ + '</div>';
16229
+ }
16230
+
16204
16231
  html += '<div style="padding:12px;border:1px solid var(--border);border-radius:8px;background:var(--bg-secondary)">'
16205
16232
  + '<div style="display:flex;align-items:center;gap:8px;flex-wrap:wrap">'
16206
16233
  + '<strong>' + esc(j.jobName) + '</strong> ' + agentTag + ' ' + breaker
16207
16234
  + '<span style="margin-left:auto;font-size:11px;color:var(--text-muted)">' + failureRatio + ' failed \\u00b7 last error ' + lastErrorAt + '</span>'
16208
16235
  + '</div>'
16236
+ + diagnosisHtml
16209
16237
  + errorsHtml
16210
16238
  + advisorLine
16211
16239
  + '</div>';
@@ -0,0 +1,39 @@
1
+ /**
2
+ * Clementine TypeScript — Broken-job diagnostic agent.
3
+ *
4
+ * When the failure monitor flags a job, this runs a cheap Haiku-level
5
+ * analysis over the job definition, agent profile, and recent runs to
6
+ * propose a root cause and a specific fix. Read-only: it never writes
7
+ * anything except its own cache.
8
+ *
9
+ * Output surfaces in the Broken Jobs dashboard panel and the owner DM
10
+ * so the response to a silent failure is "here's what's wrong and
11
+ * here's what to change" rather than "go investigate."
12
+ */
13
+ import type { Gateway } from './router.js';
14
+ import type { BrokenJob } from './failure-monitor.js';
15
+ export interface Diagnosis { // One diagnostic result for a broken job, as cached on disk and attached to BrokenJob.diagnosis.
16
+ rootCause: string; // Why the job is failing; the parser truncates this to 500 characters.
17
+ confidence: 'high' | 'medium' | 'low'; // Model's self-reported confidence; the parser falls back to 'medium' when the reply omits it.
18
+ proposedFix: {
19
+ type: 'config_change' | 'prompt_change' | 'agent_scope' | 'disable' | 'credential_refresh' | 'escalate_to_owner'; // Category of fix; the parser falls back to 'escalate_to_owner' when the reply omits it.
20
+ details: string; // Prose description of the fix; truncated to 800 characters.
21
+ diff?: string; // Optional before/after diff for small config edits; truncated to 1000 characters.
22
+ };
23
+ riskLevel: 'low' | 'medium' | 'high'; // Risk of applying the fix; selects the accent colour in the Broken Jobs panel.
24
+ generatedAt: string; // ISO-8601 timestamp of when the diagnosis was produced; used for the 24h freshness check.
25
+ }
26
+ /**
27
+ * Diagnose one broken job. Returns a cached diagnosis if one exists and is
28
+ * fresher than 24h; otherwise invokes the LLM. Always best-effort — returns
29
+ * null instead of throwing so failure detection stays robust.
30
+ */
31
+ export declare function diagnoseBrokenJob(broken: BrokenJob, gateway: Gateway): Promise<Diagnosis | null>;
32
+ /**
33
+ * Clear cached diagnosis for a job (e.g., after the owner applies a fix).
34
+ * Called opportunistically when a broken job disappears from the live set.
35
+ */
36
+ export declare function clearDiagnosis(jobName: string): void;
37
+ /** Read-only accessor for the dashboard. */
38
+ export declare function getDiagnosisIfFresh(jobName: string): Diagnosis | null;
39
+ //# sourceMappingURL=failure-diagnostics.d.ts.map
@@ -0,0 +1,257 @@
1
+ /**
2
+ * Clementine TypeScript — Broken-job diagnostic agent.
3
+ *
4
+ * When the failure monitor flags a job, this runs a cheap Haiku-level
5
+ * analysis over the job definition, agent profile, and recent runs to
6
+ * propose a root cause and a specific fix. Read-only: it never writes
7
+ * anything except its own cache.
8
+ *
9
+ * Output surfaces in the Broken Jobs dashboard panel and the owner DM
10
+ * so the response to a silent failure is "here's what's wrong and
11
+ * here's what to change" rather than "go investigate."
12
+ */
13
+ import { existsSync, mkdirSync, readFileSync, writeFileSync, } from 'node:fs';
14
+ import path from 'node:path';
15
+ import pino from 'pino';
16
+ import { AGENTS_DIR, BASE_DIR, CRON_FILE } from '../config.js';
17
+ const logger = pino({ name: 'clementine.failure-diagnostics' });
18
+ const CACHE_FILE = path.join(BASE_DIR, 'cron', 'failure-diagnostics.json');
19
+ const CACHE_TTL_MS = 24 * 60 * 60 * 1000;
20
+ const RUNS_DIR = path.join(BASE_DIR, 'cron', 'runs');
21
+ function loadCache() {
22
+ try {
23
+ if (!existsSync(CACHE_FILE))
24
+ return {};
25
+ const raw = JSON.parse(readFileSync(CACHE_FILE, 'utf-8'));
26
+ return raw;
27
+ }
28
+ catch {
29
+ return {};
30
+ }
31
+ }
32
+ function saveCache(cache) {
33
+ try {
34
+ mkdirSync(path.dirname(CACHE_FILE), { recursive: true });
35
+ writeFileSync(CACHE_FILE, JSON.stringify(cache, null, 2));
36
+ }
37
+ catch (err) {
38
+ logger.warn({ err }, 'Failed to persist diagnostic cache');
39
+ }
40
+ }
41
/**
 * Pull the raw YAML entry for a cron job from CRON.md (global or agent-scoped).
 * Returns the text of the entry, not the parsed object, so the diagnostic
 * agent sees the exact fields the user edits.
 *
 * A name of the form `slug:job` is looked up in that agent's CRON.md first
 * and then in the global file; a bare name is looked up in the global file only.
 *
 * @param jobName Job name, optionally prefixed with `<agentSlug>:`.
 * @returns The entry text (capped at 6000 characters), or null when no
 *   candidate file contains a matching `- name:` entry.
 */
function readJobDefinition(jobName) {
    const [maybeSlug, ...rest] = jobName.split(':');
    const bareName = rest.length > 0 ? rest.join(':') : maybeSlug;
    const candidateFiles = [];
    if (rest.length > 0) {
        // agent-scoped: ross-the-sdr:reply-detection
        candidateFiles.push(path.join(AGENTS_DIR, maybeSlug, 'CRON.md'));
    }
    candidateFiles.push(CRON_FILE);
    // Match the block starting at "- name: bareName" up to the next "- name:"
    // at the same indent, or the end of the file. JavaScript has no `\z`
    // anchor (without the `u` flag it matches a literal "z", which truncated
    // the entry at the first "z"), so end-of-input is written as `(?![\s\S])`.
    const escapedName = bareName.replace(/[.*+?^${}()|[\]\\]/g, '\\$&');
    const pattern = new RegExp(`^( - name: ${escapedName}\\s*$[\\s\\S]*?)(?=^ - name: |(?![\\s\\S]))`, 'm');
    for (const file of candidateFiles) {
        if (!existsSync(file))
            continue;
        try {
            const raw = readFileSync(file, 'utf-8');
            const m = raw.match(pattern);
            if (m)
                return m[1].slice(0, 6000);
        }
        catch { /* unreadable file — try the next candidate */ }
    }
    return null;
}
71
/**
 * Load the leading portion of an agent's profile (`agent.md`) for use in
 * the diagnostic prompt.
 *
 * @param agentSlug Directory name of the agent under AGENTS_DIR.
 * @returns At most the first 2500 characters of the profile, or null when
 *   the file does not exist or cannot be read.
 */
function readAgentProfile(agentSlug) {
    const profilePath = path.join(AGENTS_DIR, agentSlug, 'agent.md');
    try {
        if (existsSync(profilePath)) {
            const contents = readFileSync(profilePath, 'utf-8');
            return contents.slice(0, 2500);
        }
    }
    catch {
        // Unreadable profile: fall through and report it as absent.
    }
    return null;
}
86
/**
 * Summarise the last `limit` entries of a job's run log, one line per run,
 * ordered oldest → newest for the prompt.
 *
 * The log is `<RUNS_DIR>/<sanitised job name>.jsonl`. Each parsable entry
 * becomes `<startedAt> <status> (<seconds>s) <preview or first error line>`;
 * entries that fail to parse are passed through truncated to 160 characters.
 *
 * @returns The joined summaries, or a parenthesised placeholder when the
 *   log is missing or unreadable.
 */
function readRecentRuns(jobName, limit = 10) {
    const logName = jobName.replace(/[^a-zA-Z0-9_-]/g, '_');
    const logPath = path.join(RUNS_DIR, `${logName}.jsonl`);
    if (!existsSync(logPath))
        return '(no run log)';
    const summarize = (entryLine) => {
        try {
            const run = JSON.parse(entryLine);
            let detail;
            if (run.status === 'ok') {
                const preview = (run.outputPreview ?? '').slice(0, 120).replace(/\n/g, ' ');
                detail = `preview="${preview}"`;
            }
            else {
                const firstErrorLine = (run.error ?? '').split('\n')[0].slice(0, 160);
                detail = `error="${firstErrorLine}"`;
            }
            const seconds = Math.round(run.durationMs / 1000);
            return `${run.startedAt} ${run.status} (${seconds}s) ${detail}`;
        }
        catch {
            return entryLine.slice(0, 160);
        }
    };
    try {
        const entries = readFileSync(logPath, 'utf-8').trim().split('\n').filter(Boolean);
        return entries.slice(-limit).map(entry => summarize(entry)).join('\n');
    }
    catch {
        return '(failed to read run log)';
    }
}
113
+ function buildPrompt(broken, jobDef, agentProfile, recentRuns) {
114
+ const breakerLine = broken.circuitBreakerEngagedAt
115
+ ? `Circuit breaker engaged at ${broken.circuitBreakerEngagedAt}.`
116
+ : 'No active circuit breaker.';
117
+ return [
118
+ `You are a reliability engineer diagnosing a cron job that's been failing in Clementine (a personal AI assistant framework).`,
119
+ `Your output must be a single JSON object with the schema shown at the end. No preamble, no postscript.`,
120
+ '',
121
+ `## Job name: ${broken.jobName}`,
122
+ broken.agentSlug ? `## Agent scope: ${broken.agentSlug}` : '## Scope: global (no agent)',
123
+ `## Failure stats: ${broken.errorCount48h}/${broken.totalRuns48h} runs failed in last 48h. ${breakerLine}`,
124
+ broken.lastAdvisorOpinion ? `## Advisor notes: ${broken.lastAdvisorOpinion}` : '',
125
+ '',
126
+ '## Job definition (CURRENT state of CRON.md):',
127
+ jobDef ?? '(not found in CRON.md — may be a heartbeat pseudo-job like insight-check)',
128
+ '',
129
+ agentProfile ? '## Agent profile (agent.md, truncated):\n' + agentProfile + '\n' : '',
130
+ '## Recent runs (oldest → newest):',
131
+ recentRuns,
132
+ '',
133
+ broken.lastErrors.length > 0 ? '## Distinct recent errors:\n' + broken.lastErrors.map(e => '- ' + e.slice(0, 400)).join('\n') : '',
134
+ '',
135
+ '## Critical reasoning rules',
136
+ '',
137
+ '**The CURRENT job definition above may differ from the config at the time of past failures.** If you see old errors (e.g. "timeout kill") but the current config ALREADY contains the fields that would have caused those errors, treat those errors as resolved by a recent fix — do NOT propose re-adding the fields that caused them.',
138
+ '',
139
+ '**Look at the MOST RECENT runs specifically.** If the last 2+ runs succeeded, the job has recovered — propose `escalate_to_owner` with "appears recovered, no fix needed" as details, confidence: high, risk: low.',
140
+ '',
141
+ '**Don\'t propose reverting a fix.** If the current config does NOT contain `mode: unleashed` but recent runs show "Claude Code process aborted by user", do NOT propose adding `mode: unleashed` back. That error pattern occurs in BOTH unleashed (hit max_hours) and standard (hit timeoutMs) modes. Without strong evidence the current config is wrong, prefer raising `timeoutMs` or adding `max_turns` over toggling `mode`.',
142
+ '',
143
+ '## Diagnostic patterns (use these as priors)',
144
+ '',
145
+ '- **"API 400 input_schema"** → external MCP server exposes a malformed tool. Propose checking claude_desktop_config.json and ~/.claude.json for recently-updated packages. Type: escalate_to_owner.',
146
+ '- **401/403 errors** → credential refresh needed. Type: credential_refresh. Name the specific service if possible.',
147
+ '- **"Claude Code process aborted by user" with long durations (>60s)** → timeout kill. If current config has `mode: unleashed`, propose removing it + adding `max_turns: 25`. If current config is already standard, propose raising `timeoutMs` or investigating the prompt for infinite loops.',
148
+ '- **"Reached maximum number of turns (N)"** → maxTurns set too low for the job\'s tool fan-out. Propose raising `max_turns` to 3×N.',
149
+ '- **Output preview contains BLOCKED / "no local bash" / "permission denied"** → agent picked the wrong tool. Propose either scoping the job to an agent whose allowedTools excludes the bad MCP, or adding explicit tool-choice guidance in the prompt.',
150
+ '- **No clear pattern** → escalate_to_owner with what you would need to know.',
151
+ '',
152
+ '## Output schema (JSON only, no markdown fences):',
153
+ '{',
154
+ ' "rootCause": "1-2 sentences explaining WHY the job is failing, referencing specific fields or error patterns from the CURRENT config",',
155
+ ' "confidence": "high|medium|low",',
156
+ ' "proposedFix": {',
157
+ ' "type": "config_change|prompt_change|agent_scope|disable|credential_refresh|escalate_to_owner",',
158
+ ' "details": "prose description of the fix, citing the exact field(s) to change",',
159
+ ' "diff": "optional: exact before/after diff if it is a small config edit"',
160
+ ' },',
161
+ ' "riskLevel": "low|medium|high"',
162
+ '}',
163
+ ].filter(Boolean).join('\n');
164
+ }
165
+ function parseResponse(raw) {
166
+ try {
167
+ // The model sometimes wraps the JSON in markdown fences; extract the
168
+ // first top-level {...} object.
169
+ const match = raw.match(/\{[\s\S]*\}/);
170
+ if (!match)
171
+ return null;
172
+ const parsed = JSON.parse(match[0]);
173
+ if (!parsed.rootCause || !parsed.proposedFix)
174
+ return null;
175
+ return {
176
+ rootCause: String(parsed.rootCause).slice(0, 500),
177
+ confidence: (parsed.confidence ?? 'medium'),
178
+ proposedFix: {
179
+ type: (parsed.proposedFix.type ?? 'escalate_to_owner'),
180
+ details: String(parsed.proposedFix.details ?? '').slice(0, 800),
181
+ diff: parsed.proposedFix.diff ? String(parsed.proposedFix.diff).slice(0, 1000) : undefined,
182
+ },
183
+ riskLevel: (parsed.riskLevel ?? 'medium'),
184
+ generatedAt: new Date().toISOString(),
185
+ };
186
+ }
187
+ catch (err) {
188
+ logger.warn({ err }, 'Failed to parse diagnostic JSON');
189
+ return null;
190
+ }
191
+ }
192
/**
 * Diagnose one broken job. Returns a cached diagnosis if one exists and is
 * fresher than 24h; otherwise invokes the LLM. Always best-effort — returns
 * null instead of throwing so failure detection stays robust.
 *
 * @param broken Failure-monitor record for the job to diagnose.
 * @param gateway Gateway used to run the diagnostic prompt via `handleCronJob`.
 * @returns The cached or freshly generated diagnosis, or null when the LLM
 *   call fails or its reply cannot be parsed.
 */
export async function diagnoseBrokenJob(broken, gateway) {
    const cached = loadCache()?.[broken.jobName];
    if (cached) {
        const age = Date.now() - Date.parse(cached.generatedAt);
        if (Number.isFinite(age) && age < CACHE_TTL_MS) {
            logger.debug({ job: broken.jobName, ageMin: Math.round(age / 60000) }, 'Using cached diagnosis');
            return cached;
        }
    }
    const jobDef = readJobDefinition(broken.jobName);
    const agentProfile = broken.agentSlug ? readAgentProfile(broken.agentSlug) : null;
    const recentRuns = readRecentRuns(broken.jobName, 10);
    const prompt = buildPrompt(broken, jobDef, agentProfile, recentRuns);
    let rawResponse;
    try {
        rawResponse = await gateway.handleCronJob(`diagnose:${broken.jobName}`, prompt, 1, // tier 1 — cheap
        5, // maxTurns — diagnosis doesn't need tools typically
        'haiku');
    }
    catch (err) {
        logger.warn({ err, job: broken.jobName }, 'Diagnostic LLM call failed');
        return null;
    }
    // Coerce before use: a non-string reply would otherwise make the
    // `.slice` in the warning below throw, breaking the never-throws contract.
    const responseText = typeof rawResponse === 'string' ? rawResponse : String(rawResponse ?? '');
    const diagnosis = parseResponse(responseText);
    if (!diagnosis) {
        logger.warn({ job: broken.jobName, rawHead: responseText.slice(0, 200) }, 'Diagnosis returned unparseable response');
        return null;
    }
    // Re-read the cache before writing. The LLM call above can take a long
    // time, and saving the snapshot taken before it would silently undo any
    // entries added or cleared in the meantime.
    const loaded = loadCache();
    const latest = loaded !== null && typeof loaded === 'object' ? loaded : {};
    latest[broken.jobName] = diagnosis;
    saveCache(latest);
    logger.info({
        job: broken.jobName,
        confidence: diagnosis.confidence,
        fixType: diagnosis.proposedFix.type,
    }, 'Broken-job diagnosis generated');
    return diagnosis;
}
235
+ /**
236
+ * Clear cached diagnosis for a job (e.g., after the owner applies a fix).
237
+ * Called opportunistically when a broken job disappears from the live set.
238
+ */
239
+ export function clearDiagnosis(jobName) {
240
+ const cache = loadCache();
241
+ if (cache[jobName]) {
242
+ delete cache[jobName];
243
+ saveCache(cache);
244
+ }
245
+ }
246
+ /** Read-only accessor for the dashboard. */
247
+ export function getDiagnosisIfFresh(jobName) {
248
+ const cache = loadCache();
249
+ const d = cache[jobName];
250
+ if (!d)
251
+ return null;
252
+ const age = Date.now() - Date.parse(d.generatedAt);
253
+ if (!Number.isFinite(age) || age >= CACHE_TTL_MS)
254
+ return null;
255
+ return d;
256
+ }
257
+ //# sourceMappingURL=failure-diagnostics.js.map
@@ -24,6 +24,18 @@ export interface BrokenJob {
24
24
  lastErrors: string[];
25
25
  circuitBreakerEngagedAt: string | null;
26
26
  lastAdvisorOpinion: string | null;
27
+ /** Populated asynchronously by the diagnostic agent when available. */
28
+ diagnosis?: {
29
+ rootCause: string;
30
+ confidence: 'high' | 'medium' | 'low';
31
+ proposedFix: {
32
+ type: string;
33
+ details: string;
34
+ diff?: string;
35
+ };
36
+ riskLevel: 'low' | 'medium' | 'high';
37
+ generatedAt: string;
38
+ };
27
39
  }
28
40
  /**
29
41
  * Compute the current set of broken jobs by scanning all run logs.
@@ -32,9 +44,14 @@ export interface BrokenJob {
32
44
  export declare function computeBrokenJobs(now?: number): BrokenJob[];
33
45
  /**
34
46
  * Run a sweep: identify currently-broken jobs, pick the ones we haven't
35
- * notified about recently, and dispatch one consolidated DM.
47
+ * notified about recently, invoke the diagnostic agent for new entries,
48
+ * and dispatch one consolidated DM.
49
+ *
50
+ * `gateway` is optional — omitted for tests that want to skip the LLM call.
51
+ * When present, we diagnose fresh broken jobs before notifying, so the
52
+ * report includes a root-cause + proposed fix for each.
36
53
  *
37
54
  * Returns the jobs that triggered a fresh notification (mostly for tests/logs).
38
55
  */
39
- export declare function runFailureSweep(send: (text: string) => Promise<unknown>, now?: number): Promise<BrokenJob[]>;
56
+ export declare function runFailureSweep(send: (text: string) => Promise<unknown>, gateway?: import('./router.js').Gateway, now?: number): Promise<BrokenJob[]>;
40
57
  //# sourceMappingURL=failure-monitor.d.ts.map
@@ -175,7 +175,20 @@ export function computeBrokenJobs(now = Date.now()) {
175
175
  const lastRunMs = Date.parse(lastEntry.startedAt);
176
176
  // Always consult the breaker state — a stuck breaker is the primary
177
177
  // signal for "job has been silently broken for days".
178
- const cb = lastCircuitBreakerEvent(jobName);
178
+ let cb = lastCircuitBreakerEvent(jobName);
179
+ // Clear a "stuck" breaker flag if we see an ok run AFTER the last
180
+ // breaker engagement. The scheduler only logs a circuit-recovery
181
+ // event when consecutiveErrors >= 5 at recovery time — but a
182
+ // successful manual/probe run resets consecutiveErrors to 0 first,
183
+ // so the recovery branch never fires and the advisor log keeps the
184
+ // breaker appearing engaged forever. Fix: use run-log truth instead.
185
+ if (cb.engagedAt) {
186
+ const engagedMs = Date.parse(cb.engagedAt);
187
+ const hasOkSinceBreaker = entries.some(e => e.status === 'ok' && Date.parse(e.startedAt) > engagedMs);
188
+ if (hasOkSinceBreaker) {
189
+ cb = { engagedAt: null, lastOpinion: cb.lastOpinion };
190
+ }
191
+ }
179
192
  if (!cb.engagedAt && Number.isFinite(lastRunMs) && lastRunMs < dormantCutoffMs) {
180
193
  continue;
181
194
  }
@@ -244,8 +257,30 @@ export function computeBrokenJobs(now = Date.now()) {
244
257
  const bT = b.lastErrorAt ? Date.parse(b.lastErrorAt) : 0;
245
258
  return bT - aT;
246
259
  });
260
+ // Attach any cached diagnosis (fresh within 24h). Reads the cache file
261
+ // directly — avoids circular imports with failure-diagnostics.
262
+ attachCachedDiagnoses(broken, now);
247
263
  return broken;
248
264
  }
265
+ const DIAGNOSTICS_CACHE_FILE = path.join(BASE_DIR, 'cron', 'failure-diagnostics.json');
266
+ const DIAGNOSIS_TTL_MS = 24 * 60 * 60 * 1000;
267
+ function attachCachedDiagnoses(jobs, now) {
268
+ if (!existsSync(DIAGNOSTICS_CACHE_FILE))
269
+ return;
270
+ try {
271
+ const cache = JSON.parse(readFileSync(DIAGNOSTICS_CACHE_FILE, 'utf-8'));
272
+ for (const j of jobs) {
273
+ const d = cache[j.jobName];
274
+ if (!d)
275
+ continue;
276
+ const age = now - Date.parse(d.generatedAt);
277
+ if (Number.isFinite(age) && age < DIAGNOSIS_TTL_MS) {
278
+ j.diagnosis = d;
279
+ }
280
+ }
281
+ }
282
+ catch { /* cache may be malformed — ignore */ }
283
+ }
249
284
  /**
250
285
  * The self-improve loop writes to its own experiment-log.jsonl, not cron/runs/.
251
286
  * Its breakage pattern is: state.lastRunAt keeps getting updated nightly but
@@ -338,12 +373,28 @@ function formatReport(jobs) {
338
373
  for (const j of jobs) {
339
374
  const breaker = j.circuitBreakerEngagedAt ? ' · circuit breaker engaged' : '';
340
375
  lines.push(`• \`${j.jobName}\` — ${j.errorCount48h}/${j.totalRuns48h} runs failed${breaker}`);
341
- if (j.lastErrors.length > 0) {
342
- const preview = j.lastErrors[0].split('\n')[0].slice(0, 140);
343
- lines.push(` Last error: ${preview}`);
376
+ // Prefer the diagnostic agent's analysis when available — it's more
377
+ // actionable than the raw error. Fall back to error + advisor lines.
378
+ if (j.diagnosis) {
379
+ const conf = j.diagnosis.confidence === 'high' ? '' : ` (${j.diagnosis.confidence} confidence)`;
380
+ lines.push(` **Cause${conf}:** ${j.diagnosis.rootCause.slice(0, 240)}`);
381
+ lines.push(` **Proposed fix:** ${j.diagnosis.proposedFix.details.slice(0, 240)}`);
382
+ if (j.diagnosis.proposedFix.diff) {
383
+ // Show a short diff preview inline; full diff in the dashboard.
384
+ const diffShort = j.diagnosis.proposedFix.diff.split('\n').slice(0, 4).join('\n');
385
+ lines.push(' ```diff');
386
+ lines.push(' ' + diffShort.replace(/\n/g, '\n '));
387
+ lines.push(' ```');
388
+ }
344
389
  }
345
- if (j.lastAdvisorOpinion) {
346
- lines.push(` Advisor: ${j.lastAdvisorOpinion.slice(0, 140)}`);
390
+ else {
391
+ if (j.lastErrors.length > 0) {
392
+ const preview = j.lastErrors[0].split('\n')[0].slice(0, 140);
393
+ lines.push(` Last error: ${preview}`);
394
+ }
395
+ if (j.lastAdvisorOpinion) {
396
+ lines.push(` Advisor: ${j.lastAdvisorOpinion.slice(0, 140)}`);
397
+ }
347
398
  }
348
399
  }
349
400
  lines.push('');
@@ -352,24 +403,40 @@ function formatReport(jobs) {
352
403
  }
353
404
  /**
354
405
  * Run a sweep: identify currently-broken jobs, pick the ones we haven't
355
- * notified about recently, and dispatch one consolidated DM.
406
+ * notified about recently, invoke the diagnostic agent for new entries,
407
+ * and dispatch one consolidated DM.
408
+ *
409
+ * `gateway` is optional — omitted for tests that want to skip the LLM call.
410
+ * When present, we diagnose fresh broken jobs before notifying, so the
411
+ * report includes a root-cause + proposed fix for each.
356
412
  *
357
413
  * Returns the jobs that triggered a fresh notification (mostly for tests/logs).
358
414
  */
359
- export async function runFailureSweep(send, now = Date.now()) {
415
+ export async function runFailureSweep(send, gateway, now = Date.now()) {
360
416
  const broken = computeBrokenJobs(now);
361
417
  if (broken.length === 0) {
362
- // Clear cooldowns for jobs that recovered so future failures notify promptly.
418
+ // Clear cooldowns AND diagnostic cache entries for jobs that recovered.
363
419
  const state = loadState();
364
420
  let mutated = false;
421
+ const healedJobs = [];
365
422
  for (const name of Object.keys(state.notified)) {
366
423
  if (!broken.find(b => b.jobName === name)) {
367
424
  delete state.notified[name];
425
+ healedJobs.push(name);
368
426
  mutated = true;
369
427
  }
370
428
  }
371
429
  if (mutated)
372
430
  saveState(state);
431
+ // Opportunistically drop diagnosis cache for healed jobs
432
+ if (healedJobs.length > 0) {
433
+ try {
434
+ const { clearDiagnosis } = await import('./failure-diagnostics.js');
435
+ for (const name of healedJobs)
436
+ clearDiagnosis(name);
437
+ }
438
+ catch { /* non-fatal */ }
439
+ }
373
440
  return [];
374
441
  }
375
442
  const state = loadState();
@@ -383,6 +450,29 @@ export async function runFailureSweep(send, now = Date.now()) {
383
450
  }
384
451
  if (fresh.length === 0)
385
452
  return [];
453
+ // Diagnose fresh broken jobs before DMing. Each call is cached 24h, so a
454
+ // recurring failure doesn't re-invoke the LLM. Diagnosis is best-effort —
455
+ // if it fails or the gateway isn't wired, the report still goes out.
456
+ if (gateway) {
457
+ try {
458
+ const { diagnoseBrokenJob } = await import('./failure-diagnostics.js');
459
+ for (const job of fresh) {
460
+ if (job.diagnosis)
461
+ continue; // already attached from cache
462
+ try {
463
+ const d = await diagnoseBrokenJob(job, gateway);
464
+ if (d)
465
+ job.diagnosis = d;
466
+ }
467
+ catch (err) {
468
+ logger.warn({ err, job: job.jobName }, 'Diagnosis attempt failed');
469
+ }
470
+ }
471
+ }
472
+ catch (err) {
473
+ logger.warn({ err }, 'Failed to load diagnostics module');
474
+ }
475
+ }
386
476
  try {
387
477
  await send(formatReport(fresh));
388
478
  const stamp = new Date(now).toISOString();
@@ -105,8 +105,10 @@ export class HeartbeatScheduler {
105
105
  }
106
106
  // Cron failure sweep — surface jobs that have been silently failing.
107
107
  // Runs every tick; per-job 24h cooldown lives inside the monitor.
108
+ // Passes the gateway so freshly-broken jobs get a diagnostic LLM call
109
+ // (cached 24h) before the DM goes out.
108
110
  import('./failure-monitor.js').then(({ runFailureSweep }) => {
109
- runFailureSweep((text) => this.dispatcher.send(text, {})).catch(err => {
111
+ runFailureSweep((text) => this.dispatcher.send(text, {}), this.gateway).catch(err => {
110
112
  logger.warn({ err }, 'Failure sweep failed');
111
113
  });
112
114
  }).catch(err => logger.warn({ err }, 'Failure sweep import failed'));
package/package.json CHANGED
@@ -1,6 +1,6 @@
1
1
  {
2
2
  "name": "clementine-agent",
3
- "version": "1.0.16",
3
+ "version": "1.0.18",
4
4
  "description": "Clementine — Personal AI Assistant (TypeScript)",
5
5
  "type": "module",
6
6
  "main": "dist/index.js",