clementine-agent 1.18.133 → 1.18.134
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
|
@@ -98,7 +98,7 @@ export interface SelfImproveLoopOptions {
|
|
|
98
98
|
allowAutoApplySafeFixes?: boolean;
|
|
99
99
|
}
|
|
100
100
|
export declare function classifyFailure(recentErrors: string[]): FixRecipe;
|
|
101
|
-
export declare class
|
|
101
|
+
export declare class FailureFixConsumer {
|
|
102
102
|
private readonly tickMs;
|
|
103
103
|
private readonly triggersDir;
|
|
104
104
|
private readonly pendingDir;
|
|
@@ -133,4 +133,4 @@ export declare class SelfImproveLoop {
|
|
|
133
133
|
private processOne;
|
|
134
134
|
private notifyAgent;
|
|
135
135
|
}
|
|
136
|
-
//# sourceMappingURL=
|
|
136
|
+
//# sourceMappingURL=failure-fix-consumer.d.ts.map
|
|
@@ -29,7 +29,7 @@ import path from 'node:path';
|
|
|
29
29
|
import matter from 'gray-matter';
|
|
30
30
|
import pino from 'pino';
|
|
31
31
|
import { AGENTS_DIR, BASE_DIR, SYSTEM_DIR } from '../config.js';
|
|
32
|
-
const logger = pino({ name: 'clementine.
|
|
32
|
+
const logger = pino({ name: 'clementine.failure-fix-consumer' });
|
|
33
33
|
/**
|
|
34
34
|
* Fallback tick interval. The loop is primarily event-driven via fs.watch
|
|
35
35
|
* on the triggers directory — this is just a slow safety net for cases
|
|
@@ -231,7 +231,7 @@ function writePendingChange(record, dir) {
|
|
|
231
231
|
return file;
|
|
232
232
|
}
|
|
233
233
|
// ── Main loop ────────────────────────────────────────────────────────
|
|
234
|
-
export class
|
|
234
|
+
export class FailureFixConsumer {
|
|
235
235
|
tickMs;
|
|
236
236
|
triggersDir;
|
|
237
237
|
pendingDir;
|
|
@@ -503,4 +503,4 @@ export class SelfImproveLoop {
|
|
|
503
503
|
}
|
|
504
504
|
}
|
|
505
505
|
}
|
|
506
|
-
//# sourceMappingURL=
|
|
506
|
+
//# sourceMappingURL=failure-fix-consumer.js.map
|
|
@@ -411,12 +411,18 @@ export class SelfImproveLoop {
|
|
|
411
411
|
consecutiveLow++;
|
|
412
412
|
continue;
|
|
413
413
|
}
|
|
414
|
-
// Diversity safety net: skip if hypothesis targets an over-represented area:target
|
|
414
|
+
// Diversity safety net: skip if hypothesis targets an over-represented area:target.
|
|
415
|
+
// 1.18.134 — loosened cap from 3 to 5. The old cap caused the
|
|
416
|
+
// loop to plateau immediately whenever it had ~3 ideas about
|
|
417
|
+
// SOUL.md (which is a frequent attractor). At 5 the
|
|
418
|
+
// hypothesizer gets a few more swings at the same area before
|
|
419
|
+
// diversity kicks in — but still avoids monomania.
|
|
420
|
+
const DIVERSITY_CAP = 5;
|
|
415
421
|
const proposalKey = `${proposal.area}:${proposal.target}`;
|
|
416
422
|
const proposalCount = history.filter(e => `${e.area}:${e.target}` === proposalKey).length
|
|
417
423
|
+ this.getPendingChanges().filter(p => `${p.area}:${p.target}` === proposalKey).length;
|
|
418
|
-
if (proposalCount >=
|
|
419
|
-
logger.warn({ area: proposal.area, target: proposal.target, count: proposalCount }, 'Hypothesis over-targeted — skipping');
|
|
424
|
+
if (proposalCount >= DIVERSITY_CAP) {
|
|
425
|
+
logger.warn({ area: proposal.area, target: proposal.target, count: proposalCount, cap: DIVERSITY_CAP }, 'Hypothesis over-targeted — skipping');
|
|
420
426
|
consecutiveLow++;
|
|
421
427
|
continue;
|
|
422
428
|
}
|
|
@@ -440,8 +446,37 @@ export class SelfImproveLoop {
|
|
|
440
446
|
const before = await this.readCurrentState(proposal.area, proposal.target);
|
|
441
447
|
// Step 5: Evaluate
|
|
442
448
|
const evaluation = await this.withTimeout(this.evaluate(before, proposal.proposedChange, proposal.hypothesis), 60_000);
|
|
443
|
-
const
|
|
444
|
-
const
|
|
449
|
+
const llmScore = evaluation?.score ?? 0;
|
|
450
|
+
const normalizedLlmScore = llmScore / 10; // Convert 0-10 to 0-1
|
|
451
|
+
// 1.18.134 — blend the LLM evaluator score with an objective
|
|
452
|
+
// signal pulled from real metrics. Karpathy's autoresearch uses
|
|
453
|
+
// ONE objective metric (val_bpb); Clementine's LLM-only score
|
|
454
|
+
// can drift, especially when proposals affect SOUL.md or
|
|
455
|
+
// cosmetic prompt fields. The objective floor is computed from
|
|
456
|
+
// the baseline metrics gathered at the start of THIS run:
|
|
457
|
+
//
|
|
458
|
+
// objective = cronSuccessRate * feedbackPositiveRatio
|
|
459
|
+
//
|
|
460
|
+
// Both 0..1; product preserves the multiplicative penalty of
|
|
461
|
+
// either metric being weak. We weight 70% LLM / 30% objective
|
|
462
|
+
// — the LLM still drives most decisions, but a proposal can't
|
|
463
|
+
// sail through on a 9/10 evaluator score when feedback is at
|
|
464
|
+
// 35% positive (which is the current reality). Set the env
|
|
465
|
+
// var SELF_IMPROVE_OBJECTIVE_WEIGHT to override (0..1).
|
|
466
|
+
const objectiveWeightRaw = parseFloat(process.env.SELF_IMPROVE_OBJECTIVE_WEIGHT ?? '0.3');
|
|
467
|
+
const objectiveWeight = Number.isFinite(objectiveWeightRaw) && objectiveWeightRaw >= 0 && objectiveWeightRaw <= 1
|
|
468
|
+
? objectiveWeightRaw : 0.3;
|
|
469
|
+
const llmWeight = 1 - objectiveWeight;
|
|
470
|
+
const objectiveScore = (state.baselineMetrics.cronSuccessRate || 0)
|
|
471
|
+
* (state.baselineMetrics.feedbackPositiveRatio || 0);
|
|
472
|
+
const normalizedScore = (normalizedLlmScore * llmWeight) + (objectiveScore * objectiveWeight);
|
|
473
|
+
const score = normalizedScore * 10; // For display + reason text
|
|
474
|
+
logger.debug({
|
|
475
|
+
llm: normalizedLlmScore.toFixed(3),
|
|
476
|
+
objective: objectiveScore.toFixed(3),
|
|
477
|
+
blended: normalizedScore.toFixed(3),
|
|
478
|
+
weight: objectiveWeight,
|
|
479
|
+
}, 'Score blend');
|
|
445
480
|
const accepted = normalizedScore >= this.config.acceptThreshold;
|
|
446
481
|
// Surface gate: even when accepted, only score >= surfaceThreshold
|
|
447
482
|
// reaches the user's pending-changes inbox. Below that floor we
|
package/dist/index.js
CHANGED
|
@@ -754,11 +754,14 @@ async function asyncMain() {
|
|
|
754
754
|
// output to their Discord channel.
|
|
755
755
|
const { AgentHeartbeatManager } = await import('./gateway/agent-heartbeat-manager.js');
|
|
756
756
|
const agentHeartbeats = new AgentHeartbeatManager(gateway.getAgentManager(), gateway);
|
|
757
|
-
//
|
|
758
|
-
//
|
|
759
|
-
//
|
|
760
|
-
|
|
761
|
-
|
|
757
|
+
// Failure-fix consumer (1.18.134 — renamed from "self-improve-loop"
|
|
758
|
+
// to disambiguate from the Karpathy autoresearch SelfImproveLoop in
|
|
759
|
+
// src/agent/self-improve.ts). Every 10 min, scans
|
|
760
|
+
// self-improve/triggers/, classifies failures, auto-applies safe
|
|
761
|
+
// cron-config fixes, escalates risky ones. Different concern from
|
|
762
|
+
// the autoresearch hypothesize/evaluate loop.
|
|
763
|
+
const { FailureFixConsumer } = await import('./agent/failure-fix-consumer.js');
|
|
764
|
+
const failureFixConsumer = new FailureFixConsumer(dispatcher);
|
|
762
765
|
// ── Build channel tasks ──────────────────────────────────────────
|
|
763
766
|
const channelTasks = [];
|
|
764
767
|
const activeChannels = [];
|
|
@@ -856,7 +859,38 @@ async function asyncMain() {
|
|
|
856
859
|
heartbeat.start();
|
|
857
860
|
cronScheduler.start();
|
|
858
861
|
agentHeartbeats.start();
|
|
859
|
-
|
|
862
|
+
failureFixConsumer.start();
|
|
863
|
+
// 1.18.134 — nightly Karpathy-autoresearch self-improve trigger.
|
|
864
|
+
// The Karpathy SelfImproveLoop (src/agent/self-improve.ts) was
|
|
865
|
+
// previously only triggered by /self-improve run or CLI. With no
|
|
866
|
+
// automatic schedule the loop ran ~3 times in the prior 4 days and
|
|
867
|
+
// sat plateaued. This wires a daily 3am trigger so it iterates on
|
|
868
|
+
// its own — matching Karpathy's continuous-iteration model.
|
|
869
|
+
// SELF_IMPROVE_HOUR env var overrides (0–23, default 3).
|
|
870
|
+
const selfImproveHour = (() => {
|
|
871
|
+
const raw = parseInt(process.env.SELF_IMPROVE_HOUR ?? '3', 10);
|
|
872
|
+
if (Number.isFinite(raw) && raw >= 0 && raw <= 23)
|
|
873
|
+
return raw;
|
|
874
|
+
return 3;
|
|
875
|
+
})();
|
|
876
|
+
// node-cron is already a dependency (used by cron-scheduler). Schedule
|
|
877
|
+
// a single daily tick — the SelfImproveLoop's own time/iteration caps
|
|
878
|
+
// and plateau detection bound the work; we don't need finer granularity.
|
|
879
|
+
try {
|
|
880
|
+
const nodeCron = (await import('node-cron')).default;
|
|
881
|
+
nodeCron.schedule(`0 ${selfImproveHour} * * *`, () => {
|
|
882
|
+
logger.info({ hour: selfImproveHour }, 'Nightly self-improve trigger firing');
|
|
883
|
+
gateway.handleSelfImprove('run').then((summary) => {
|
|
884
|
+
logger.info({ summary }, 'Nightly self-improve trigger complete');
|
|
885
|
+
}).catch((err) => {
|
|
886
|
+
logger.error({ err }, 'Nightly self-improve trigger failed');
|
|
887
|
+
});
|
|
888
|
+
}, { timezone: process.env.TZ || 'America/Los_Angeles' });
|
|
889
|
+
logger.info({ hour: selfImproveHour }, `Self-improve nightly trigger scheduled (${selfImproveHour}:00 daily)`);
|
|
890
|
+
}
|
|
891
|
+
catch (err) {
|
|
892
|
+
logger.warn({ err }, 'Failed to schedule nightly self-improve trigger');
|
|
893
|
+
}
|
|
860
894
|
// Background-task hygiene: any task left in 'running' is from a prior
|
|
861
895
|
// process. Mark them aborted so the lifecycle is honest. (P6b will add
|
|
862
896
|
// resumability; for now fail-fast is clearer than silently re-running.)
|
|
@@ -1086,7 +1120,7 @@ async function asyncMain() {
|
|
|
1086
1120
|
heartbeat.stop();
|
|
1087
1121
|
cronScheduler.stop();
|
|
1088
1122
|
agentHeartbeats.stop();
|
|
1089
|
-
|
|
1123
|
+
failureFixConsumer.stop();
|
|
1090
1124
|
// ── Self-restart (enhanced with health check + rollback) ────────
|
|
1091
1125
|
if (restartRequested) {
|
|
1092
1126
|
// Clear our PID file BEFORE spawning the child, so ensureSingleton()
|