clementine-agent 1.0.21 → 1.0.22
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/dist/agent/self-improve.js +74 -29
- package/dist/gateway/claim-tracker.d.ts +8 -0
- package/dist/gateway/claim-tracker.js +145 -1
- package/dist/gateway/failure-monitor.js +108 -5
- package/dist/gateway/heartbeat-scheduler.js +16 -3
- package/dist/gateway/outcome-grader.d.ts +41 -0
- package/dist/gateway/outcome-grader.js +173 -0
- package/dist/memory/store.js +11 -0
- package/package.json +1 -1
|
@@ -611,63 +611,108 @@ export class SelfImproveLoop {
|
|
|
611
611
|
async hypothesize(metrics, history) {
|
|
612
612
|
// Read targeted triggers (written by cron scheduler when jobs fail repeatedly)
|
|
613
613
|
let targetedTriggers = '';
|
|
614
|
+
const triggerBullets = [];
|
|
615
|
+
// Source 1: explicit triggers written by the cron scheduler at 3+
|
|
616
|
+
// consecutive errors (legacy path — we still honor and drain).
|
|
614
617
|
const triggersDir = path.join(SELF_IMPROVE_DIR, 'triggers');
|
|
615
618
|
if (existsSync(triggersDir)) {
|
|
616
619
|
const triggerFiles = readdirSync(triggersDir).filter(f => f.endsWith('.json'));
|
|
617
|
-
|
|
618
|
-
|
|
619
|
-
|
|
620
|
-
|
|
621
|
-
|
|
622
|
-
|
|
623
|
-
|
|
624
|
-
|
|
625
|
-
catch {
|
|
626
|
-
return null;
|
|
627
|
-
}
|
|
628
|
-
}).filter(Boolean);
|
|
629
|
-
if (triggers.length > 0) {
|
|
630
|
-
targetedTriggers = `\n\n## PRIORITY: Failing Jobs Needing Attention\n` +
|
|
631
|
-
`These jobs have been failing repeatedly and need prompt/config fixes:\n` +
|
|
632
|
-
triggers.map((t) => `- **${t.jobName}**: ${t.consecutiveErrors} consecutive errors. Recent: ${(t.recentErrors ?? []).join('; ')}`).join('\n') +
|
|
633
|
-
`\n\nFocus your improvement hypothesis on fixing these jobs first.\n`;
|
|
620
|
+
const triggers = triggerFiles.slice(0, 3).map(f => {
|
|
621
|
+
try {
|
|
622
|
+
const t = JSON.parse(readFileSync(path.join(triggersDir, f), 'utf-8'));
|
|
623
|
+
unlinkSync(path.join(triggersDir, f));
|
|
624
|
+
return t;
|
|
625
|
+
}
|
|
626
|
+
catch {
|
|
627
|
+
return null;
|
|
634
628
|
}
|
|
629
|
+
}).filter(Boolean);
|
|
630
|
+
for (const t of triggers) {
|
|
631
|
+
triggerBullets.push(`- **${t.jobName}**: ${t.consecutiveErrors} consecutive errors. Recent: ${(t.recentErrors ?? []).join('; ')}`);
|
|
635
632
|
}
|
|
636
633
|
}
|
|
634
|
+
// Source 2: broken-jobs from the failure monitor. These are jobs the
|
|
635
|
+
// user hasn't applied a fix for yet — real, current gaps the hypothesizer
|
|
636
|
+
// should target. Complements the diversity constraint: even if the area
|
|
637
|
+
// has been over-targeted historically, a specific broken job is a fresh
|
|
638
|
+
// concrete signal.
|
|
639
|
+
try {
|
|
640
|
+
const { computeBrokenJobs } = await import('../gateway/failure-monitor.js');
|
|
641
|
+
const broken = computeBrokenJobs();
|
|
642
|
+
for (const b of broken.slice(0, 3)) {
|
|
643
|
+
const diagHint = b.diagnosis
|
|
644
|
+
? ` Diagnosis: ${b.diagnosis.rootCause.slice(0, 120)}`
|
|
645
|
+
: '';
|
|
646
|
+
triggerBullets.push(`- **${b.jobName}**: ${b.errorCount48h}/${b.totalRuns48h} failed in 48h${b.circuitBreakerEngagedAt ? ' (breaker engaged)' : ''}.${diagHint}`);
|
|
647
|
+
}
|
|
648
|
+
}
|
|
649
|
+
catch { /* failure-monitor module optional */ }
|
|
650
|
+
if (triggerBullets.length > 0) {
|
|
651
|
+
targetedTriggers = `\n\n## PRIORITY: Failing Jobs Needing Attention\n` +
|
|
652
|
+
`These jobs have been failing recently and need prompt/config fixes:\n` +
|
|
653
|
+
triggerBullets.join('\n') +
|
|
654
|
+
`\n\nFocus your improvement hypothesis on fixing these jobs first.\n`;
|
|
655
|
+
}
|
|
637
656
|
// Format experiment history for the prompt
|
|
638
657
|
const historyText = history.slice(-20).map(e => `#${e.iteration} | ${e.area} | "${e.hypothesis.slice(0, 60)}" | ${(e.score * 10).toFixed(1)}/10 ${e.accepted ? '✅' : '❌'}`).join('\n') || '(no prior experiments)';
|
|
639
|
-
// Enforce diversity: count recent proposals per area:target AND per area
|
|
658
|
+
// Enforce diversity: count recent proposals per area:target AND per area.
|
|
659
|
+
// A pair is only "over-targeted" if its MOST RECENT attempt was within
|
|
660
|
+
// the last 30 days — otherwise it's fair game to retry with fresh data.
|
|
661
|
+
// Stops the saturation state where after ~60 experiments the loop has
|
|
662
|
+
// blocked every area:target pair permanently and produces no new
|
|
663
|
+
// hypotheses (the Apr 11-19 plateau).
|
|
664
|
+
const DIVERSITY_WINDOW_MS = 30 * 24 * 60 * 60 * 1000;
|
|
665
|
+
const diversityCutoff = Date.now() - DIVERSITY_WINDOW_MS;
|
|
640
666
|
const recentTargets = new Map();
|
|
641
667
|
const recentAreas = new Map();
|
|
642
|
-
for (const e of history.slice(-
|
|
668
|
+
for (const e of history.slice(-50)) {
|
|
643
669
|
const key = `${e.area}:${e.target}`;
|
|
644
|
-
|
|
645
|
-
|
|
670
|
+
const ts = Date.parse(e.startedAt);
|
|
671
|
+
const tsMs = Number.isFinite(ts) ? ts : 0;
|
|
672
|
+
const cur = recentTargets.get(key);
|
|
673
|
+
recentTargets.set(key, {
|
|
674
|
+
count: (cur?.count ?? 0) + 1,
|
|
675
|
+
newestMs: Math.max(cur?.newestMs ?? 0, tsMs),
|
|
676
|
+
});
|
|
677
|
+
const curA = recentAreas.get(e.area);
|
|
678
|
+
recentAreas.set(e.area, {
|
|
679
|
+
count: (curA?.count ?? 0) + 1,
|
|
680
|
+
newestMs: Math.max(curA?.newestMs ?? 0, tsMs),
|
|
681
|
+
});
|
|
646
682
|
}
|
|
647
683
|
for (const p of this.getPendingChanges()) {
|
|
648
684
|
const key = `${p.area}:${p.target}`;
|
|
649
|
-
|
|
650
|
-
|
|
685
|
+
const now = Date.now();
|
|
686
|
+
const cur = recentTargets.get(key);
|
|
687
|
+
recentTargets.set(key, {
|
|
688
|
+
count: (cur?.count ?? 0) + 1,
|
|
689
|
+
newestMs: Math.max(cur?.newestMs ?? 0, now),
|
|
690
|
+
});
|
|
691
|
+
const curA = recentAreas.get(p.area);
|
|
692
|
+
recentAreas.set(p.area, {
|
|
693
|
+
count: (curA?.count ?? 0) + 1,
|
|
694
|
+
newestMs: Math.max(curA?.newestMs ?? 0, now),
|
|
695
|
+
});
|
|
651
696
|
}
|
|
652
|
-
// Block
|
|
697
|
+
// Block only when both (a) count is high enough AND (b) the last attempt
|
|
698
|
+
// was within the diversity window.
|
|
653
699
|
const overTargeted = [...recentTargets.entries()]
|
|
654
|
-
.filter(([,
|
|
700
|
+
.filter(([, v]) => v.count >= 2 && v.newestMs > diversityCutoff)
|
|
655
701
|
.map(([key]) => key);
|
|
656
|
-
// Block entire areas with >= 3 recent proposals
|
|
657
702
|
const overTargetedAreas = [...recentAreas.entries()]
|
|
658
|
-
.filter(([,
|
|
703
|
+
.filter(([, v]) => v.count >= 3 && v.newestMs > diversityCutoff)
|
|
659
704
|
.map(([area]) => area);
|
|
660
705
|
// Build area coverage stats to nudge the LLM toward unexplored areas
|
|
661
706
|
const allAreas = this.config.areas;
|
|
662
707
|
const areaCoverage = allAreas.map(area => {
|
|
663
|
-
const count = recentAreas.get(area) ?? 0;
|
|
708
|
+
const count = recentAreas.get(area)?.count ?? 0;
|
|
664
709
|
return `- ${area}: ${count} recent proposals`;
|
|
665
710
|
}).join('\n');
|
|
666
711
|
const diversityConstraint = `\n\n## AREA COVERAGE (target under-explored areas)\n${areaCoverage}\n` +
|
|
667
712
|
(overTargeted.length > 0 || overTargetedAreas.length > 0
|
|
668
713
|
? `\n## DIVERSITY CONSTRAINT\n` +
|
|
669
714
|
(overTargetedAreas.length > 0
|
|
670
|
-
? `These AREAS have been over-targeted and MUST NOT be chosen:\n${overTargetedAreas.map(a => `- ${a} (${recentAreas.get(a)} proposals)`).join('\n')}\n`
|
|
715
|
+
? `These AREAS have been over-targeted and MUST NOT be chosen:\n${overTargetedAreas.map(a => `- ${a} (${recentAreas.get(a)?.count ?? 0} proposals)`).join('\n')}\n`
|
|
671
716
|
: '') +
|
|
672
717
|
(overTargeted.length > 0
|
|
673
718
|
? `These specific targets MUST NOT be re-targeted:\n${overTargeted.map(t => `- ${t}`).join('\n')}\n`
|
|
@@ -13,6 +13,7 @@
|
|
|
13
13
|
* Extraction is regex-only to keep cost at $0 per DM. For nuanced
|
|
14
14
|
* claims the dashboard's manual verify/fail path covers the gap.
|
|
15
15
|
*/
|
|
16
|
+
import type { Gateway } from './router.js';
|
|
16
17
|
export type ClaimType = 'scheduled' | 'fixed' | 'will_do' | 'sent' | 'added' | 'unknown';
|
|
17
18
|
export type VerifyStrategy = 'cron_run_check' | 'config_inspect' | 'manual';
|
|
18
19
|
export type ClaimStatus = 'pending' | 'verified' | 'failed' | 'expired' | 'dismissed';
|
|
@@ -35,6 +36,13 @@ export interface Claim {
|
|
|
35
36
|
* Caller supplies sessionKey for traceability. Never throws.
|
|
36
37
|
*/
|
|
37
38
|
export declare function extractClaims(text: string, sessionKey?: string | null, agentSlug?: string | null): Omit<Claim, 'status' | 'extractedAt' | 'verifiedAt' | 'verdict'>[];
|
|
39
|
+
/**
|
|
40
|
+
* Drain the LLM-fallback queue: pick up to N enqueued DMs, ask Haiku
|
|
41
|
+
* to extract claims via the same shape the regex patterns use, persist
|
|
42
|
+
* any found. Best-effort — each DM is removed from the queue before it is
* processed, so one that fails is dropped rather than retried on the next sweep.
|
|
44
|
+
*/
|
|
45
|
+
export declare function drainLLMFallback(gateway: Gateway, maxPerSweep?: number): Promise<number>;
|
|
38
46
|
export declare function recordClaims(claims: Omit<Claim, 'status' | 'extractedAt' | 'verifiedAt' | 'verdict'>[]): Promise<void>;
|
|
39
47
|
export declare function listClaims(opts?: {
|
|
40
48
|
status?: ClaimStatus;
|
|
@@ -13,7 +13,7 @@
|
|
|
13
13
|
* Extraction is regex-only to keep cost at $0 per DM. For nuanced
|
|
14
14
|
* claims the dashboard's manual verify/fail path covers the gap.
|
|
15
15
|
*/
|
|
16
|
-
import { randomBytes } from 'node:crypto';
|
|
16
|
+
import { createHash, randomBytes } from 'node:crypto';
|
|
17
17
|
import path from 'node:path';
|
|
18
18
|
import pino from 'pino';
|
|
19
19
|
import { BASE_DIR, MEMORY_DB_PATH, VAULT_DIR } from '../config.js';
|
|
@@ -115,6 +115,42 @@ const PATTERNS = [
|
|
|
115
115
|
verifyStrategy: 'manual',
|
|
116
116
|
},
|
|
117
117
|
];
|
|
118
|
+
/**
|
|
119
|
+
* In-memory queue of DMs that regex-extraction missed but that look like
|
|
120
|
+
* they might contain claims (long enough, user-facing session). The
|
|
121
|
+
* heartbeat sweep drains this queue and runs the LLM fallback.
|
|
122
|
+
*
|
|
123
|
+
* Bounded to prevent memory growth — oldest entries are evicted.
|
|
124
|
+
*/
|
|
125
|
+
const MAX_PENDING_LLM = 20;
|
|
126
|
+
const pendingLLMExtraction = [];
|
|
127
|
+
/**
 * Add a DM to the bounded LLM-fallback queue.
 *
 * A DM whose text is already queued is ignored. After the push, the oldest
 * entries are evicted until the queue holds at most MAX_PENDING_LLM items.
 *
 * @param {string} text - Full message text.
 * @param {string|null} sessionKey - Session the message was sent in.
 * @param {string|null} agentSlug - Agent that sent the message, if any.
 */
function enqueueForLLM(text, sessionKey, agentSlug) {
    // Comparing the text directly yields the same de-dup decision as
    // comparing SHA-1 digests, without re-hashing every queued entry on
    // each call.
    if (pendingLLMExtraction.some((queued) => queued.text === text))
        return;
    pendingLLMExtraction.push({ text, sessionKey, agentSlug, queuedAt: Date.now() });
    // Evict from the front (oldest first) to restore the bound.
    const overflow = pendingLLMExtraction.length - MAX_PENDING_LLM;
    if (overflow > 0)
        pendingLLMExtraction.splice(0, overflow);
}
|
|
136
|
+
/**
 * Hex-encoded SHA-1 digest of the given input.
 *
 * @param s - Data accepted by `Hash#update` (a string in the visible callers).
 * @returns {string} 40-character lowercase hex digest.
 */
function sha1(s) {
    const hasher = createHash('sha1');
    hasher.update(s);
    return hasher.digest('hex');
}
|
|
139
|
+
/**
 * Decide whether a DM that produced no regex claims should be queued for the
 * LLM fallback.
 *
 * A candidate must have a session key, be at least 100 characters long, come
 * from a `discord:`, `slack:` or `telegram:` session, and must not look like a
 * heartbeat check-in (text starting with `**[` and containing `check-in]`).
 *
 * @param {string} text - Message text.
 * @param {string|null} sessionKey - Session key, or null when unknown.
 * @returns {boolean} True when the DM should be enqueued.
 */
function isLLMFallbackCandidate(text, sessionKey) {
    if (!sessionKey || text.length < 100)
        return false;
    // Owner-facing channels only; anything else is the system talking to itself.
    const ownerFacing = ['discord:', 'slack:', 'telegram:'].some((prefix) => sessionKey.startsWith(prefix));
    if (!ownerFacing)
        return false;
    // Heartbeat check-ins have their own gate, so they are excluded here.
    const looksLikeCheckIn = text.startsWith('**[') && text.includes('check-in]');
    return !looksLikeCheckIn;
}
|
|
118
154
|
/**
|
|
119
155
|
* Extract claims from a message. Returns empty array if nothing matched.
|
|
120
156
|
* Caller supplies sessionKey for traceability. Never throws.
|
|
@@ -157,8 +193,116 @@ export function extractClaims(text, sessionKey, agentSlug) {
|
|
|
157
193
|
agentSlug: agentSlug ?? null,
|
|
158
194
|
});
|
|
159
195
|
}
|
|
196
|
+
// Regex missed this DM but it looks like it could contain a claim the
|
|
197
|
+
// regex patterns can't catch ("Got that done", "Sent it, you should see
|
|
198
|
+
// it in a minute"). Queue for LLM fallback on the next heartbeat.
|
|
199
|
+
if (out.length === 0 && isLLMFallbackCandidate(text, sessionKey ?? null)) {
|
|
200
|
+
enqueueForLLM(text, sessionKey ?? null, agentSlug ?? null);
|
|
201
|
+
}
|
|
160
202
|
return out;
|
|
161
203
|
}
|
|
204
|
+
/**
 * Drain the LLM-fallback queue: take up to `maxPerSweep` queued DMs, ask the
 * LLM to extract claims in the same shape the regex patterns produce, and
 * persist any that are found.
 *
 * Best-effort and at-most-once: the batch is removed from the queue BEFORE it
 * is processed, so a DM whose extraction or persistence fails is logged at
 * debug level and dropped — it is not retried on the next sweep.
 *
 * @param gateway - Gateway passed through to llmExtractClaims for the model call.
 * @param {number} [maxPerSweep=3] - Upper bound on DMs processed per call.
 * @returns {Promise<number>} Number of claims recorded (not the number of DMs
 *   processed).
 */
export async function drainLLMFallback(gateway, maxPerSweep = 3) {
    const batch = pendingLLMExtraction.splice(0, Math.min(maxPerSweep, pendingLLMExtraction.length));
    let drained = 0;
    for (const item of batch) {
        try {
            const claims = await llmExtractClaims(item.text, gateway);
            if (claims.length === 0)
                continue;
            // Attach per-DM context and a fresh random id to each extracted claim.
            const toRecord = claims.map((claim) => ({
                id: randomBytes(6).toString('hex'),
                sessionKey: item.sessionKey,
                messageSnippet: item.text.slice(0, 400),
                claimType: claim.claimType,
                subject: claim.subject,
                dueAt: claim.dueAt,
                verifyStrategy: claim.verifyStrategy,
                agentSlug: item.agentSlug,
            }));
            await recordClaims(toRecord);
            drained += claims.length;
        }
        catch (err) {
            // One bad DM must not stop the rest of the batch.
            logger.debug({ err }, 'LLM fallback extraction failed for one DM');
        }
    }
    return drained;
}
|
|
237
|
+
/**
 * Ask the model whether a message contains concrete commitments and
 * normalise its JSON reply into claim records.
 *
 * Never throws: a failed model call, a non-string or unparsable reply, or a
 * reply whose `claims` is not an array all yield an empty list. Individual
 * malformed entries are skipped without discarding the valid ones.
 *
 * @param {string} text - Message text; only the first 1500 characters are sent.
 * @param gateway - Object exposing `handleCronJob(name, prompt, tier, maxTurns, model)`.
 * @returns {Promise<Array<{claimType: string, subject: string, dueAt: string|null, verifyStrategy: string}>>}
 */
async function llmExtractClaims(text, gateway) {
    const prompt = [
        'You are analyzing a chat message Clementine (an AI assistant) just sent to her owner. Did Clementine make any commitments, promises, or claims about something she did or will do?',
        '',
        'Only extract claims where there\'s a clear, concrete action. Do NOT extract:',
        '- Status updates ("inbox has 5 messages")',
        '- Questions ("Should I proceed?")',
        '- Suggestions ("You might want to check X")',
        '- Routine check-ins or greetings',
        '',
        'DO extract:',
        '- "I scheduled X" / "I added Y to your tasks" / "I fixed Z"',
        '- "I\'ll send X at Ypm" / "Will run Y tomorrow"',
        '- "Sent email to X" / "Posted to #channel"',
        '',
        '## Message:',
        text.slice(0, 1500),
        '',
        'Output a JSON object only (no fences):',
        '{',
        ' "claims": [',
        ' {',
        ' "claimType": "scheduled|fixed|will_do|sent|added",',
        ' "subject": "short description of what (the noun phrase)",',
        ' "dueAt": "ISO timestamp if a specific time was mentioned, else null"',
        ' }',
        ' ]',
        '}',
        'Empty array if no real commitments.',
    ].join('\n');
    let raw;
    try {
        raw = await gateway.handleCronJob('llm-claim-extract', prompt, 1, // tier 1
        3, // tight maxTurns
        'haiku');
    }
    catch {
        return [];
    }
    if (typeof raw !== 'string')
        return [];
    // The model may wrap the JSON in prose; grab the outermost {...} span.
    const jsonMatch = raw.match(/\{[\s\S]*\}/);
    if (!jsonMatch)
        return [];
    let parsed;
    try {
        parsed = JSON.parse(jsonMatch[0]);
    }
    catch {
        return [];
    }
    if (!Array.isArray(parsed.claims))
        return [];
    const validTypes = new Set(['scheduled', 'fixed', 'will_do', 'sent', 'added']);
    const out = [];
    for (const c of parsed.claims) {
        // Skip junk entries (null, strings, numbers) individually so that one
        // malformed item cannot discard the valid claims around it.
        if (c === null || typeof c !== 'object')
            continue;
        if (typeof c.subject !== 'string' || !c.subject.trim())
            continue;
        if (typeof c.claimType !== 'string' || !validTypes.has(c.claimType))
            continue;
        // Accept only timestamps that look ISO-like AND actually parse.
        const dueAt = typeof c.dueAt === 'string'
            && /^\d{4}-\d{2}-\d{2}T/.test(c.dueAt)
            && Number.isFinite(Date.parse(c.dueAt))
            ? c.dueAt
            : null;
        out.push({
            claimType: c.claimType,
            subject: c.subject.trim().slice(0, 200),
            dueAt,
            verifyStrategy: c.claimType === 'scheduled' ? 'cron_run_check' : 'manual',
        });
    }
    return out;
}
|
|
162
306
|
// ── Persistence ──────────────────────────────────────────────────────
|
|
163
307
|
async function getStore() {
|
|
164
308
|
const { MemoryStore } = await import('../memory/store.js');
|
|
@@ -76,8 +76,48 @@ function readRunLog(filePath) {
|
|
|
76
76
|
return [];
|
|
77
77
|
}
|
|
78
78
|
}
|
|
79
|
-
function isFailure(entry) {
|
|
80
|
-
|
|
79
|
+
/**
 * Report whether a run-log entry counts as a failure.
 *
 * An entry fails when its status is `error` or `retried`, when
 * isSemanticFailure flags it, or when the optional grade cache holds an
 * explicit `false` verdict for it.
 *
 * @param entry - Run-log entry (reads `status`, `jobName`, `startedAt`).
 * @param {Map<string, boolean>} [gradeCache] - Verdicts keyed by
 *   `${jobName}|${startedAt}`; omitted when no grades were loaded.
 * @returns {boolean}
 */
function isFailure(entry, gradeCache) {
    const { status } = entry;
    const hardFailure = status === 'error' || status === 'retried';
    if (hardFailure || isSemanticFailure(entry))
        return true;
    if (!gradeCache)
        return false;
    // Only an explicit `false` verdict marks an apparent-ok run as failed;
    // a missing grade (undefined) or a pass (true) leaves it alone.
    const verdict = gradeCache.get(`${entry.jobName}|${entry.startedAt}`);
    return verdict === false;
}
|
|
95
|
+
/**
 * Pre-load outcome grades so the synchronous isFailure check can consult
 * them without a database query per call.
 *
 * Reads rows from `graded_runs` whose `graded_at` falls within the last 14
 * days. Any failure (missing DB file, missing table, load error) is non-fatal
 * and results in an empty or partial cache.
 *
 * @returns {Map<string, boolean>} Map keyed by `${jobName}|${startedAt}` with
 *   the `passed` verdict.
 */
function loadGradeCache() {
    const cache = new Map();
    try {
        // NOTE(review): `require` only exists here if this module is loaded as
        // CommonJS or the file creates one (e.g. via createRequire) — confirm.
        // In a plain ES module these calls throw and the cache stays empty,
        // which is why the outer catch now logs instead of staying silent.
        const { MEMORY_DB_PATH } = require('../config.js');
        if (!existsSync(MEMORY_DB_PATH))
            return cache;
        // eslint-disable-next-line @typescript-eslint/no-require-imports
        const Database = require('better-sqlite3');
        const db = new Database(MEMORY_DB_PATH, { readonly: true });
        try {
            const rows = db.prepare(`SELECT job_name, started_at, passed FROM graded_runs
                WHERE graded_at >= datetime('now', '-14 days')`).all();
            for (const r of rows) {
                cache.set(`${r.job_name}|${r.started_at}`, r.passed === 1);
            }
        }
        catch { /* graded_runs may not exist on older DBs */ }
        finally {
            // Always release the read-only handle, even if iteration failed.
            db.close();
        }
    }
    catch (err) {
        // Non-fatal, but surfaced so a permanently empty cache is diagnosable.
        logger.debug({ err }, 'Grade cache unavailable; continuing without outcome grades');
    }
    return cache;
}
|
|
82
122
|
/**
|
|
83
123
|
* "Semantic failure" — a run the scheduler called `ok` but whose agent output
|
|
@@ -162,6 +202,7 @@ export function computeBrokenJobs(now = Date.now()) {
|
|
|
162
202
|
return [];
|
|
163
203
|
}
|
|
164
204
|
const dormantCutoffMs = now - 7 * 24 * 60 * 60 * 1000;
|
|
205
|
+
const gradeCache = loadGradeCache();
|
|
165
206
|
for (const file of files) {
|
|
166
207
|
const entries = readRunLog(path.join(RUNS_DIR, file));
|
|
167
208
|
if (entries.length === 0)
|
|
@@ -196,7 +237,7 @@ export function computeBrokenJobs(now = Date.now()) {
|
|
|
196
237
|
const ts = Date.parse(e.startedAt);
|
|
197
238
|
return Number.isFinite(ts) && ts >= sinceMs;
|
|
198
239
|
});
|
|
199
|
-
const failures = inWindow.filter(isFailure);
|
|
240
|
+
const failures = inWindow.filter(e => isFailure(e, gradeCache));
|
|
200
241
|
// Consecutive-failure signal: scan from most recent entry backward.
|
|
201
242
|
// Stops at the first non-failure (ignoring 'skipped' which is neither
|
|
202
243
|
// signal). Catches daily jobs that fail every run without accumulating
|
|
@@ -206,7 +247,7 @@ export function computeBrokenJobs(now = Date.now()) {
|
|
|
206
247
|
const e = entries[i];
|
|
207
248
|
if (e.status === 'skipped')
|
|
208
249
|
continue;
|
|
209
|
-
if (isFailure(e))
|
|
250
|
+
if (isFailure(e, gradeCache))
|
|
210
251
|
consecutiveFailures++;
|
|
211
252
|
else
|
|
212
253
|
break;
|
|
@@ -221,7 +262,7 @@ export function computeBrokenJobs(now = Date.now()) {
|
|
|
221
262
|
// back to the most recent errors anywhere in the log.
|
|
222
263
|
const errSource = failures.length > 0
|
|
223
264
|
? failures
|
|
224
|
-
: entries.filter(isFailure);
|
|
265
|
+
: entries.filter(e => isFailure(e, gradeCache));
|
|
225
266
|
const distinctErrors = [];
|
|
226
267
|
const seen = new Set();
|
|
227
268
|
for (let i = errSource.length - 1; i >= 0 && distinctErrors.length < 3; i--) {
|
|
@@ -413,6 +454,18 @@ function formatReport(jobs) {
|
|
|
413
454
|
* Returns the jobs that triggered a fresh notification (mostly for tests/logs).
|
|
414
455
|
*/
|
|
415
456
|
export async function runFailureSweep(send, gateway, now = Date.now()) {
|
|
457
|
+
// Opportunistically grade suspicious ok runs BEFORE computing broken
|
|
458
|
+
// jobs, so fresh grades feed into this same sweep's detection.
|
|
459
|
+
// Scoped to a handful of recent suspicious entries per job to keep cost
|
|
460
|
+
// bounded (~$0.01 per grade; cached forever).
|
|
461
|
+
if (gateway) {
|
|
462
|
+
try {
|
|
463
|
+
await gradeSuspiciousRecentRuns(gateway, now);
|
|
464
|
+
}
|
|
465
|
+
catch (err) {
|
|
466
|
+
logger.warn({ err }, 'Suspicious-run grading pre-pass failed (non-fatal)');
|
|
467
|
+
}
|
|
468
|
+
}
|
|
416
469
|
const broken = computeBrokenJobs(now);
|
|
417
470
|
if (broken.length === 0) {
|
|
418
471
|
// Clear cooldowns AND diagnostic cache entries for jobs that recovered.
|
|
@@ -499,4 +552,54 @@ function appendAuditLog(action, jobNames) {
|
|
|
499
552
|
}
|
|
500
553
|
catch { /* non-fatal */ }
|
|
501
554
|
}
|
|
555
|
+
/**
 * Scan each run-log file for recent apparent-ok entries that isSuspicious
 * flags, and grade the ones that have no stored grade yet.
 *
 * At most 2 entries per log file are graded per sweep (the last two in file
 * order — presumably the newest; confirm the log is append-only). Entries
 * that already have a stored grade are skipped.
 *
 * @param gateway - Gateway passed through to gradeRun for the model call.
 * @param {number} now - Current time in epoch milliseconds.
 * @returns {Promise<void>}
 */
async function gradeSuspiciousRecentRuns(gateway, now) {
    const { isSuspicious, gradeRun, getGrade } = await import('./outcome-grader.js');
    if (!existsSync(RUNS_DIR))
        return;
    // Only look at the last 48h so we're not burning grades on ancient entries.
    const sinceMs = now - 48 * 60 * 60 * 1000;
    let files = [];
    try {
        files = readdirSync(RUNS_DIR).filter(f => f.endsWith('.jsonl'));
    }
    catch {
        return;
    }
    // loadJobPrompt re-parses every cron definition on each call, so resolve
    // each job's prompt at most once per sweep.
    const promptByJob = new Map();
    for (const file of files) {
        const entries = readRunLog(path.join(RUNS_DIR, file));
        const recent = entries
            .filter(e => {
            const ts = Date.parse(e.startedAt);
            return Number.isFinite(ts) && ts >= sinceMs;
        })
            .filter(isSuspicious);
        // Budget: at most 2 per log file per sweep, taken from the end.
        for (const entry of recent.slice(-2)) {
            const cached = await getGrade(entry.jobName, entry.startedAt);
            if (cached)
                continue;
            if (!promptByJob.has(entry.jobName)) {
                // The job prompt gives the grader richer context when available.
                promptByJob.set(entry.jobName, await loadJobPrompt(entry.jobName));
            }
            await gradeRun(entry, gateway, promptByJob.get(entry.jobName) ?? undefined);
        }
    }
}
|
|
592
|
+
/**
 * Look up the prompt currently configured for a cron job.
 *
 * Searches the jobs returned by parseCronJobs() followed by
 * parseAgentCronJobs(AGENTS_DIR) and returns the first one whose `name`
 * matches.
 *
 * @param {string} jobName - Name of the job to look up.
 * @returns {Promise<string|null>} The job's prompt, or null when the job is
 *   not found, has no prompt, or loading/parsing throws.
 */
async function loadJobPrompt(jobName) {
    try {
        const { parseCronJobs, parseAgentCronJobs } = await import('./cron-scheduler.js');
        const { AGENTS_DIR } = await import('../config.js');
        for (const candidate of [...parseCronJobs(), ...parseAgentCronJobs(AGENTS_DIR)]) {
            if (candidate.name === jobName)
                return candidate.prompt ?? null;
        }
        return null;
    }
    catch {
        // Any load or parse problem is treated as "prompt unavailable".
        return null;
    }
}
|
|
502
605
|
//# sourceMappingURL=failure-monitor.js.map
|
|
@@ -114,12 +114,25 @@ export class HeartbeatScheduler {
|
|
|
114
114
|
}).catch(err => logger.warn({ err }, 'Failure sweep import failed'));
|
|
115
115
|
// Claim verification sweep — auto-verify pending claims whose due
|
|
116
116
|
// times have passed (e.g. "I scheduled X for 8am" → check at 9am).
|
|
117
|
-
import('./claim-tracker.js').then(({ verifyDueClaims }) => {
|
|
118
|
-
|
|
117
|
+
import('./claim-tracker.js').then(async ({ verifyDueClaims, drainLLMFallback }) => {
|
|
118
|
+
try {
|
|
119
|
+
const { verified, failed, expired } = await verifyDueClaims();
|
|
119
120
|
if (verified + failed + expired > 0) {
|
|
120
121
|
logger.info({ verified, failed, expired }, 'Claim verification sweep complete');
|
|
121
122
|
}
|
|
122
|
-
}
|
|
123
|
+
}
|
|
124
|
+
catch (err) {
|
|
125
|
+
logger.warn({ err }, 'Claim verification sweep failed');
|
|
126
|
+
}
|
|
127
|
+
// LLM fallback for regex-missed DMs — bounded batch per sweep
|
|
128
|
+
try {
|
|
129
|
+
const drained = await drainLLMFallback(this.gateway, 3);
|
|
130
|
+
if (drained > 0)
|
|
131
|
+
logger.info({ count: drained }, 'LLM claim fallback extracted');
|
|
132
|
+
}
|
|
133
|
+
catch (err) {
|
|
134
|
+
logger.debug({ err }, 'LLM claim fallback failed (non-fatal)');
|
|
135
|
+
}
|
|
123
136
|
}).catch(err => logger.warn({ err }, 'Claim tracker import failed'));
|
|
124
137
|
const now = new Date();
|
|
125
138
|
const hour = now.getHours();
|
|
@@ -0,0 +1,41 @@
|
|
|
1
|
+
/**
|
|
2
|
+
* Clementine TypeScript — Outcome grader.
|
|
3
|
+
*
|
|
4
|
+
* Second-pass check for cron runs the scheduler called `ok` but that
|
|
5
|
+
* might actually be semantic failures. Covers the gap between the
|
|
6
|
+
* marker-based semantic detection in failure-monitor (which caught
|
|
7
|
+
* BLOCKED / FAILED / etc. in output) and the empty-output-too-short
|
|
8
|
+
* case (which false-positives on legitimate quiet healthchecks).
|
|
9
|
+
*
|
|
10
|
+
* Strategy: only invoke the LLM when the run is SUSPICIOUS (empty
|
|
11
|
+
* preview with non-trivial duration, or ambiguous content). Cost:
|
|
12
|
+
* bounded to ~$0.01 per suspicious run, cached forever per
|
|
13
|
+
* (job_name, started_at) tuple.
|
|
14
|
+
*/
|
|
15
|
+
import type { CronRunEntry } from '../types.js';
|
|
16
|
+
import type { Gateway } from './router.js';
|
|
17
|
+
export interface Grade {
|
|
18
|
+
jobName: string;
|
|
19
|
+
startedAt: string;
|
|
20
|
+
passed: boolean;
|
|
21
|
+
score: number;
|
|
22
|
+
reasoning: string;
|
|
23
|
+
gradedAt: string;
|
|
24
|
+
}
|
|
25
|
+
/**
|
|
26
|
+
* Decide whether a run warrants LLM grading. Heuristic — designed to
|
|
27
|
+
* fire on the exact pattern that slipped through today: apparent-ok
|
|
28
|
+
* runs with empty output + duration suggesting real work happened.
|
|
29
|
+
*/
|
|
30
|
+
export declare function isSuspicious(entry: CronRunEntry): boolean;
|
|
31
|
+
export declare function getGrade(jobName: string, startedAt: string): Promise<Grade | null>;
|
|
32
|
+
export declare function recordGrade(grade: Grade): Promise<void>;
|
|
33
|
+
/**
|
|
34
|
+
* Grade a single cron run. Returns cached grade if we've already graded
|
|
35
|
+
* this (job, startedAt) tuple. Returns null if grading fails — caller
|
|
36
|
+
* should fall back to existing signals.
|
|
37
|
+
*/
|
|
38
|
+
export declare function gradeRun(entry: CronRunEntry, gateway: Gateway, jobPrompt?: string): Promise<Grade | null>;
|
|
39
|
+
/** Look up recent grades for a job — used by the dashboard. */
|
|
40
|
+
export declare function recentGrades(jobName: string, limit?: number): Promise<Grade[]>;
|
|
41
|
+
//# sourceMappingURL=outcome-grader.d.ts.map
|
|
@@ -0,0 +1,173 @@
|
|
|
1
|
+
/**
|
|
2
|
+
* Clementine TypeScript — Outcome grader.
|
|
3
|
+
*
|
|
4
|
+
* Second-pass check for cron runs the scheduler called `ok` but that
|
|
5
|
+
* might actually be semantic failures. Covers the gap between the
|
|
6
|
+
* marker-based semantic detection in failure-monitor (which caught
|
|
7
|
+
* BLOCKED / FAILED / etc. in output) and the empty-output-too-short
|
|
8
|
+
* case (which false-positives on legitimate quiet healthchecks).
|
|
9
|
+
*
|
|
10
|
+
* Strategy: only invoke the LLM when the run is SUSPICIOUS (empty
|
|
11
|
+
* preview with non-trivial duration, or ambiguous content). Cost:
|
|
12
|
+
* bounded to ~$0.01 per suspicious run, cached forever per
|
|
13
|
+
* (job_name, started_at) tuple.
|
|
14
|
+
*/
|
|
15
|
+
import pino from 'pino';
|
|
16
|
+
import { MEMORY_DB_PATH, VAULT_DIR } from '../config.js';
|
|
17
|
+
const logger = pino({ name: 'clementine.outcome-grader' });
|
|
18
|
+
/**
 * Create and initialise a MemoryStore for MEMORY_DB_PATH / VAULT_DIR.
 *
 * The store module is imported on demand. Every call constructs a new
 * instance, which the callers in this file close when they are done.
 *
 * @returns {Promise<object>} An initialised MemoryStore.
 */
async function getStore() {
    const storeModule = await import('../memory/store.js');
    const memoryStore = new storeModule.MemoryStore(MEMORY_DB_PATH, VAULT_DIR);
    memoryStore.initialize();
    return memoryStore;
}
|
|
24
|
+
/**
 * Decide whether a run warrants LLM grading.
 *
 * Only runs with status `ok` are considered. A run is suspicious when either:
 *  1. its trimmed output preview is shorter than 20 characters and it ran for
 *     more than 20 seconds, or
 *  2. its preview contains soft-negative wording (partial, "skipped N",
 *     could not complete, insufficient, timeout, not enough, attempting,
 *     retrying).
 *
 * @param entry - Run-log entry (reads `status`, `outputPreview`, `durationMs`).
 * @returns {boolean} True when the run should be sent to the grader.
 */
export function isSuspicious(entry) {
    if (entry.status === 'ok') {
        const preview = (entry.outputPreview ?? '').trim();
        // Case 1: (near-)empty output despite a duration that suggests real work.
        const nearEmpty = preview.length < 20;
        const ranLong = entry.durationMs > 20_000;
        if (nearEmpty && ranLong)
            return true;
        // Case 2: hedging language that marker-based detection might miss.
        return /\b(partial|skipped\s+\d+|could\s+not\s+complete|insufficient|timeout(?!ed)|not\s+enough|attempting|retrying)\b/.test(preview.toLowerCase());
    }
    return false;
}
|
|
49
|
+
/**
 * Fetch the stored grade for a (jobName, startedAt) pair.
 *
 * Never throws; any store or query error is reported as "no grade".
 *
 * @param {string} jobName - Job name the run belongs to.
 * @param {string} startedAt - Run start timestamp, exactly as stored.
 * @returns {Promise<object|null>} The grade row with `passed` converted to a
 *   boolean, or null when there is no row or the lookup fails.
 */
export async function getGrade(jobName, startedAt) {
    let store;
    try {
        store = await getStore();
        const row = store.conn.prepare(`SELECT job_name AS jobName, started_at AS startedAt, passed, score, reasoning, graded_at AS gradedAt
            FROM graded_runs WHERE job_name = ? AND started_at = ?`).get(jobName, startedAt);
        if (!row)
            return null;
        // SQLite stores the verdict as 0/1; expose it as a boolean.
        return { ...row, passed: row.passed === 1 };
    }
    catch {
        return null;
    }
    finally {
        // Release the store on every path — previously a failing query left
        // it open.
        try {
            store?.close();
        }
        catch { /* a close failure must not mask the lookup result */ }
    }
}
|
|
64
|
+
/**
 * Persist a grade, replacing any existing row for the same
 * (job_name, started_at) pair.
 *
 * Never throws; failures are logged at warn level.
 *
 * @param grade - Reads `jobName`, `startedAt`, `passed`, `score`, `reasoning`.
 *   `passed` is stored as 1/0.
 * @returns {Promise<void>}
 */
export async function recordGrade(grade) {
    let store;
    try {
        store = await getStore();
        store.conn.prepare(`INSERT OR REPLACE INTO graded_runs (job_name, started_at, passed, score, reasoning)
            VALUES (?, ?, ?, ?, ?)`).run(grade.jobName, grade.startedAt, grade.passed ? 1 : 0, grade.score, grade.reasoning);
    }
    catch (err) {
        logger.warn({ err, jobName: grade.jobName }, 'Failed to record grade');
    }
    finally {
        // Release the store on every path — previously a failing insert left
        // it open.
        try {
            store?.close();
        }
        catch (err) {
            logger.warn({ err, jobName: grade.jobName }, 'Failed to close store after recording grade');
        }
    }
}
|
|
76
|
+
/**
 * Build the grading prompt sent to the LLM for one cron run.
 *
 * @param {object} entry - Run record; reads jobName, durationMs and
 *   outputPreview.
 * @param {string|null} jobPrompt - The job's instructions; only the first
 *   2000 characters are included. A falsy value yields a placeholder line.
 * @returns {string} Newline-joined prompt text.
 */
function buildPrompt(entry, jobPrompt) {
    // A missing or non-numeric duration would otherwise render as "NaNs".
    const duration = Number.isFinite(entry.durationMs)
        ? `${Math.round(entry.durationMs / 1000)}s`
        : 'unknown';
    // Show the explicit placeholder for empty and whitespace-only output too,
    // not only for null/undefined: an empty preview is the main reason a run
    // gets graded, so the judge should see that stated unambiguously.
    const previewText = String(entry.outputPreview ?? '');
    const preview = previewText.trim() === '' ? '(empty)' : previewText.slice(0, 1500);
    return [
        'You are judging whether a cron job execution actually accomplished its intent.',
        '',
        `## Job: ${entry.jobName}`,
        `## Duration: ${duration}`,
        '',
        '## Job instructions (the prompt the agent was given):',
        jobPrompt ? jobPrompt.slice(0, 2000) : '(instructions unavailable)',
        '',
        '## What the agent produced (output preview, may be truncated):',
        preview,
        '',
        '## Your job',
        'Decide: did the agent actually accomplish the task, or did it superficially succeed while failing semantically?',
        'Examples of semantic success that looks like failure: a healthcheck that returns nothing because everything is healthy; a reply-detection sweep that returns nothing because no replies came in.',
        'Examples of semantic failure that looks like success: the agent hits a blocker, logs status=ok, returns empty; the agent fails auth and returns a generic "cannot proceed"; the agent reports "attempting X" but never actually does X.',
        '',
        'Output ONLY a JSON object, no fences:',
        '{',
        ' "passed": true|false,',
        ' "score": 0-5 (5 = clearly accomplished, 0 = clearly failed),',
        ' "reasoning": "one sentence explaining your judgment"',
        '}',
    ].join('\n');
}
|
|
102
|
+
/**
 * Extract a grade from the grader's raw text response.
 *
 * Takes the span from the first `{` to the last `}` in `raw`, parses it as
 * JSON, and normalizes the fields. Any error along the way yields null.
 *
 * @param {string} raw - Raw response text.
 * @returns {{passed: boolean, score: number, reasoning: string}|null}
 *   null when no brace-delimited span is found, it is not valid JSON, or
 *   `passed` is not a boolean. `score` is rounded and clamped to 0..5 (or
 *   defaults to 4 for pass / 1 for fail when not a number); `reasoning` is
 *   cut to 300 characters (or a placeholder when not a string).
 */
function parseGrade(raw) {
    try {
        const jsonSpan = raw.match(/\{[\s\S]*\}/);
        if (!jsonSpan) {
            return null;
        }
        const { passed, score: rawScore, reasoning: rawReasoning } = JSON.parse(jsonSpan[0]);
        if (typeof passed !== 'boolean') {
            return null;
        }
        let score;
        if (typeof rawScore === 'number') {
            // Round first, then clamp into the 0..5 range.
            score = Math.min(5, Math.max(0, Math.round(rawScore)));
        }
        else {
            score = passed ? 4 : 1;
        }
        let reasoning = '(no reasoning)';
        if (typeof rawReasoning === 'string') {
            reasoning = rawReasoning.slice(0, 300);
        }
        return { passed, score, reasoning };
    }
    catch {
        return null;
    }
}
|
|
121
|
+
/**
 * Grade a single cron run.
 *
 * Returns the cached grade if this (jobName, startedAt) tuple was already
 * graded. Returns null when the run is not suspicious enough to grade, when
 * the LLM call fails, or when its response cannot be parsed — the caller
 * should then fall back to existing signals.
 *
 * @param {object} entry - Run record; reads jobName and startedAt here and
 *   is passed whole to isSuspicious and buildPrompt.
 * @param {object} gateway - Must expose an async handleCronJob method.
 * @param {string} [jobPrompt] - The job's instructions, forwarded to the
 *   prompt builder (null when omitted).
 * @returns {Promise<object|null>} The grade (jobName, startedAt, passed,
 *   score, reasoning, gradedAt) or null.
 */
export async function gradeRun(entry, gateway, jobPrompt) {
    // Cache lookup
    const cached = await getGrade(entry.jobName, entry.startedAt);
    if (cached) {
        return cached;
    }
    if (!isSuspicious(entry)) {
        return null;
    }
    const prompt = buildPrompt(entry, jobPrompt ?? null);
    let raw;
    try {
        raw = await gateway.handleCronJob(
            `grade:${entry.jobName}`,
            prompt,
            1, // tier 1
            3, // maxTurns — tight
            'haiku', // presumably the model to use — confirm against handleCronJob
        );
    }
    catch (err) {
        logger.warn({ err, jobName: entry.jobName }, 'Outcome grader LLM call failed');
        return null;
    }
    // A non-string response (e.g. undefined) is treated as unparseable rather
    // than being sliced, which would throw and break the "returns null on
    // failure" contract.
    const isText = typeof raw === 'string';
    const parsed = isText ? parseGrade(raw) : null;
    if (!parsed) {
        const rawHead = isText ? raw.slice(0, 200) : String(raw).slice(0, 200);
        logger.warn({ jobName: entry.jobName, rawHead }, 'Outcome grader returned unparseable response');
        return null;
    }
    const grade = {
        jobName: entry.jobName,
        startedAt: entry.startedAt,
        ...parsed,
        gradedAt: new Date().toISOString(),
    };
    await recordGrade(grade);
    logger.info({ jobName: grade.jobName, passed: grade.passed, score: grade.score }, 'Graded run');
    return grade;
}
|
|
159
|
+
/**
 * Look up recent grades for a job — used by the dashboard.
 *
 * Best-effort: any failure is logged and reported as an empty list.
 *
 * @param {string} jobName - Value matched against graded_runs.job_name.
 * @param {number} [limit=10] - Maximum number of rows to return.
 * @returns {Promise<object[]>} Rows ordered by started_at descending, each
 *   with `passed` converted from the stored 0/1 integer to a boolean.
 */
export async function recentGrades(jobName, limit = 10) {
    let store;
    try {
        store = await getStore();
        const rows = store.conn
            .prepare(`SELECT job_name AS jobName, started_at AS startedAt, passed, score, reasoning, graded_at AS gradedAt
                FROM graded_runs WHERE job_name = ? ORDER BY started_at DESC LIMIT ?`)
            .all(jobName, limit);
        return rows.map((row) => ({ ...row, passed: row.passed === 1 }));
    }
    catch (err) {
        logger.warn({ err, jobName }, 'Failed to read recent grades');
        return [];
    }
    finally {
        // Always release the store handle, including when the query throws.
        try {
            store?.close();
        }
        catch {
            // A failed close must not mask the query result.
        }
    }
}
|
|
173
|
+
//# sourceMappingURL=outcome-grader.js.map
|
package/dist/memory/store.js
CHANGED
|
@@ -436,6 +436,17 @@ export class MemoryStore {
|
|
|
436
436
|
CREATE INDEX IF NOT EXISTS idx_claims_status ON claims(status, extracted_at DESC);
|
|
437
437
|
CREATE INDEX IF NOT EXISTS idx_claims_due ON claims(due_at) WHERE status = 'pending';
|
|
438
438
|
CREATE INDEX IF NOT EXISTS idx_claims_extracted ON claims(extracted_at DESC);
|
|
439
|
+
|
|
440
|
+
CREATE TABLE IF NOT EXISTS graded_runs (
|
|
441
|
+
job_name TEXT NOT NULL,
|
|
442
|
+
started_at TEXT NOT NULL,
|
|
443
|
+
passed INTEGER NOT NULL,
|
|
444
|
+
score INTEGER NOT NULL,
|
|
445
|
+
reasoning TEXT,
|
|
446
|
+
graded_at TEXT NOT NULL DEFAULT (datetime('now')),
|
|
447
|
+
PRIMARY KEY (job_name, started_at)
|
|
448
|
+
);
|
|
449
|
+
CREATE INDEX IF NOT EXISTS idx_graded_runs_job ON graded_runs(job_name, started_at DESC);
|
|
439
450
|
`);
|
|
440
451
|
}
|
|
441
452
|
// ── Skill usage telemetry ─────────────────────────────────────────
|