clementine-agent 1.0.15 → 1.0.16
This diff shows the changes between publicly available package versions that have been released to one of the supported registries. It is provided for informational purposes only and reflects those package versions as they appear in their respective public registries.
|
@@ -220,6 +220,26 @@ export class SelfImproveLoop {
|
|
|
220
220
|
// Check plateau
|
|
221
221
|
if (consecutiveLow >= this.config.plateauLimit) {
|
|
222
222
|
logger.info({ consecutiveLow }, 'Plateau detected — stopping');
|
|
223
|
+
// Record the plateau in the experiment log so it's not silently
|
|
224
|
+
// invisible. Helps the dashboard and failure monitor distinguish
|
|
225
|
+
// "exhausted diverse hypotheses" from "crashed mid-run".
|
|
226
|
+
const plateauExperiment = {
|
|
227
|
+
id: randomBytes(4).toString('hex'),
|
|
228
|
+
iteration: i,
|
|
229
|
+
startedAt: new Date(loopStart).toISOString(),
|
|
230
|
+
finishedAt: new Date().toISOString(),
|
|
231
|
+
durationMs: Date.now() - loopStart,
|
|
232
|
+
area: 'soul',
|
|
233
|
+
target: 'n/a',
|
|
234
|
+
hypothesis: 'No new hypothesis — diversity constraint exhausted',
|
|
235
|
+
proposedChange: '',
|
|
236
|
+
baselineScore: 0,
|
|
237
|
+
score: 0,
|
|
238
|
+
accepted: false,
|
|
239
|
+
approvalStatus: 'denied',
|
|
240
|
+
reason: 'Plateau: no novel improvement area remaining',
|
|
241
|
+
};
|
|
242
|
+
this.appendExperimentLog(plateauExperiment);
|
|
223
243
|
break;
|
|
224
244
|
}
|
|
225
245
|
state.currentIteration = i;
|
|
@@ -278,24 +278,29 @@ function detectSelfImproveBreakage(now) {
|
|
|
278
278
|
}
|
|
279
279
|
catch { /* non-fatal */ }
|
|
280
280
|
}
|
|
281
|
-
const lastRunMs = state.lastRunAt ? Date.parse(state.lastRunAt) : 0;
|
|
282
|
-
const lookback48h = now - 48 * 60 * 60 * 1000;
|
|
283
281
|
const staleLookback = now - 7 * 24 * 60 * 60 * 1000; // 7 days
|
|
284
282
|
const recentExperiments = experiments.filter(e => {
|
|
285
283
|
const ts = e.startedAt ? Date.parse(e.startedAt) : 0;
|
|
286
284
|
return Number.isFinite(ts) && ts >= staleLookback;
|
|
287
285
|
});
|
|
288
286
|
const recentErrors = recentExperiments.filter(e => e.approvalStatus === 'denied' && (e.reason?.startsWith('Error') ?? false));
|
|
289
|
-
//
|
|
290
|
-
// a. state.infraError is set
|
|
291
|
-
// b.
|
|
292
|
-
// c.
|
|
287
|
+
// Break modes we care about:
|
|
288
|
+
// a. state.infraError is set — loop detected unfixable infra issue
|
|
289
|
+
// b. state.status === 'failed' — run threw, didn't complete normally
|
|
290
|
+
// c. all 3+ most recent experiments are errors — persistent iteration failures
|
|
291
|
+
//
|
|
292
|
+
// Deliberately NOT flagging "silent early exit" (lastRunAt recent but no new
|
|
293
|
+
// experiments) when state.status === 'completed'. That's the expected
|
|
294
|
+
// plateau state: the hypothesizer returns null for every iteration because
|
|
295
|
+
// the diversity constraint has blocked every previously-targeted area, the
|
|
296
|
+
// loop skips, plateau triggers, loop exits cleanly. Not broken — saturated.
|
|
297
|
+
// Forcing alarm on a saturated-but-healthy loop would make the monitor
|
|
298
|
+
// unusable long-term.
|
|
293
299
|
const hasInfraError = !!state.infraError;
|
|
300
|
+
const runFailed = state.status === 'failed';
|
|
294
301
|
const allRecentErrored = recentExperiments.length >= 3
|
|
295
302
|
&& recentExperiments.every(e => e.approvalStatus === 'denied');
|
|
296
|
-
|
|
297
|
-
&& recentExperiments.length === 0;
|
|
298
|
-
if (!hasInfraError && !allRecentErrored && !silentEarlyExit)
|
|
303
|
+
if (!hasInfraError && !runFailed && !allRecentErrored)
|
|
299
304
|
return null;
|
|
300
305
|
const lastErrors = [];
|
|
301
306
|
for (let i = experiments.length - 1; i >= 0 && lastErrors.length < 3; i--) {
|
|
@@ -304,21 +309,12 @@ function detectSelfImproveBreakage(now) {
|
|
|
304
309
|
continue;
|
|
305
310
|
lastErrors.push(err.slice(0, 400));
|
|
306
311
|
}
|
|
307
|
-
// If we don't have an explicit infraError but the last recorded error
|
|
308
|
-
// looks schema-related, surface it — this captures the state where all
|
|
309
|
-
// iterations died with the same API 400 but state.infraError never got
|
|
310
|
-
// persisted (happens when MAX_INFRA_ERRORS isn't crossed within a run).
|
|
311
|
-
const lastLoggedError = experiments.length > 0 ? (experiments[experiments.length - 1].error ?? '') : '';
|
|
312
|
-
const inferredInfraSchema = /input_schema|tools\.\d+\.custom/i.test(lastLoggedError);
|
|
313
312
|
let opinion;
|
|
314
313
|
if (hasInfraError) {
|
|
315
314
|
opinion = `infra: ${state.infraError.category} — ${state.infraError.diagnostic.slice(0, 200)}`;
|
|
316
315
|
}
|
|
317
|
-
else if (
|
|
318
|
-
opinion = 'loop
|
|
319
|
-
}
|
|
320
|
-
else if (silentEarlyExit) {
|
|
321
|
-
opinion = 'loop ran but produced no experiments — likely crashing before iteration (check metrics gathering or hypothesis generation)';
|
|
316
|
+
else if (runFailed) {
|
|
317
|
+
opinion = 'loop exited with status=failed — check daemon log for the thrown error';
|
|
322
318
|
}
|
|
323
319
|
else {
|
|
324
320
|
opinion = `${recentErrors.length}/${recentExperiments.length} recent iterations errored`;
|