clementine-agent 1.18.133 → 1.18.135

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -98,7 +98,7 @@ export interface SelfImproveLoopOptions {
98
98
  allowAutoApplySafeFixes?: boolean;
99
99
  }
100
100
  export declare function classifyFailure(recentErrors: string[]): FixRecipe;
101
- export declare class SelfImproveLoop {
101
+ export declare class FailureFixConsumer {
102
102
  private readonly tickMs;
103
103
  private readonly triggersDir;
104
104
  private readonly pendingDir;
@@ -133,4 +133,4 @@ export declare class SelfImproveLoop {
133
133
  private processOne;
134
134
  private notifyAgent;
135
135
  }
136
- //# sourceMappingURL=self-improve-loop.d.ts.map
136
+ //# sourceMappingURL=failure-fix-consumer.d.ts.map
@@ -29,7 +29,7 @@ import path from 'node:path';
29
29
  import matter from 'gray-matter';
30
30
  import pino from 'pino';
31
31
  import { AGENTS_DIR, BASE_DIR, SYSTEM_DIR } from '../config.js';
32
- const logger = pino({ name: 'clementine.self-improve-loop' });
32
+ const logger = pino({ name: 'clementine.failure-fix-consumer' });
33
33
  /**
34
34
  * Fallback tick interval. The loop is primarily event-driven via fs.watch
35
35
  * on the triggers directory — this is just a slow safety net for cases
@@ -231,7 +231,7 @@ function writePendingChange(record, dir) {
231
231
  return file;
232
232
  }
233
233
  // ── Main loop ────────────────────────────────────────────────────────
234
- export class SelfImproveLoop {
234
+ export class FailureFixConsumer {
235
235
  tickMs;
236
236
  triggersDir;
237
237
  pendingDir;
@@ -503,4 +503,4 @@ export class SelfImproveLoop {
503
503
  }
504
504
  }
505
505
  }
506
- //# sourceMappingURL=self-improve-loop.js.map
506
+ //# sourceMappingURL=failure-fix-consumer.js.map
@@ -411,12 +411,18 @@ export class SelfImproveLoop {
411
411
  consecutiveLow++;
412
412
  continue;
413
413
  }
414
- // Diversity safety net: skip if hypothesis targets an over-represented area:target
414
+ // Diversity safety net: skip if hypothesis targets an over-represented area:target.
415
+ // 1.18.134 — loosened cap from 3 to 5. The old cap caused the
416
+ // loop to plateau immediately whenever it had ~3 ideas about
417
+ // SOUL.md (which is a frequent attractor). At 5 the
418
+ // hypothesizer gets a few more swings at the same area before
419
+ // diversity kicks in — but still avoids monomania.
420
+ const DIVERSITY_CAP = 5;
415
421
  const proposalKey = `${proposal.area}:${proposal.target}`;
416
422
  const proposalCount = history.filter(e => `${e.area}:${e.target}` === proposalKey).length
417
423
  + this.getPendingChanges().filter(p => `${p.area}:${p.target}` === proposalKey).length;
418
- if (proposalCount >= 3) {
419
- logger.warn({ area: proposal.area, target: proposal.target, count: proposalCount }, 'Hypothesis over-targeted — skipping');
424
+ if (proposalCount >= DIVERSITY_CAP) {
425
+ logger.warn({ area: proposal.area, target: proposal.target, count: proposalCount, cap: DIVERSITY_CAP }, 'Hypothesis over-targeted — skipping');
420
426
  consecutiveLow++;
421
427
  continue;
422
428
  }
@@ -440,8 +446,37 @@ export class SelfImproveLoop {
440
446
  const before = await this.readCurrentState(proposal.area, proposal.target);
441
447
  // Step 5: Evaluate
442
448
  const evaluation = await this.withTimeout(this.evaluate(before, proposal.proposedChange, proposal.hypothesis), 60_000);
443
- const score = evaluation?.score ?? 0;
444
- const normalizedScore = score / 10; // Convert 0-10 to 0-1
449
+ const llmScore = evaluation?.score ?? 0;
450
+ const normalizedLlmScore = llmScore / 10; // Convert 0-10 to 0-1
451
+ // 1.18.134 — blend the LLM evaluator score with an objective
452
+ // signal pulled from real metrics. Karpathy's autoresearch uses
453
+ // ONE objective metric (val_bpb); Clementine's LLM-only score
454
+ // can drift, especially when proposals affect SOUL.md or
455
+ // cosmetic prompt fields. The objective floor is computed from
456
+ // the baseline metrics gathered at the start of THIS run:
457
+ //
458
+ // objective = cronSuccessRate * feedbackPositiveRatio
459
+ //
460
+ // Both 0..1; product preserves the multiplicative penalty of
461
+ // either metric being weak. We weight 70% LLM / 30% objective
462
+ // — the LLM still drives most decisions, but a proposal can't
463
+ // sail through on a 9/10 evaluator score when feedback is at
464
+ // 35% positive (which is the current reality). Set the env
465
+ // var SELF_IMPROVE_OBJECTIVE_WEIGHT to override (0..1).
466
+ const objectiveWeightRaw = parseFloat(process.env.SELF_IMPROVE_OBJECTIVE_WEIGHT ?? '0.3');
467
+ const objectiveWeight = Number.isFinite(objectiveWeightRaw) && objectiveWeightRaw >= 0 && objectiveWeightRaw <= 1
468
+ ? objectiveWeightRaw : 0.3;
469
+ const llmWeight = 1 - objectiveWeight;
470
+ const objectiveScore = (state.baselineMetrics.cronSuccessRate || 0)
471
+ * (state.baselineMetrics.feedbackPositiveRatio || 0);
472
+ const normalizedScore = (normalizedLlmScore * llmWeight) + (objectiveScore * objectiveWeight);
473
+ const score = normalizedScore * 10; // For display + reason text
474
+ logger.debug({
475
+ llm: normalizedLlmScore.toFixed(3),
476
+ objective: objectiveScore.toFixed(3),
477
+ blended: normalizedScore.toFixed(3),
478
+ weight: objectiveWeight,
479
+ }, 'Score blend');
445
480
  const accepted = normalizedScore >= this.config.acceptThreshold;
446
481
  // Surface gate: even when accepted, only score >= surfaceThreshold
447
482
  // reaches the user's pending-changes inbox. Below that floor we
@@ -19771,10 +19771,15 @@ if('serviceWorker' in navigator){navigator.serviceWorker.getRegistrations().then
19771
19771
  <div class="tab-pane" id="tab-intelligence-learning">
19772
19772
  <div style="display:flex;justify-content:space-between;align-items:center;margin-bottom:12px;gap:12px;flex-wrap:wrap">
19773
19773
  <div style="font-size:13px;color:var(--text-secondary);max-width:680px">
19774
- Self-improvement runs nightly at 1 AM. The autonomous loop also auto-fixes failing crons (3+ consecutive errors) and verifies each fix over the next 3 runs &mdash; reverting automatically if it doesn't help.
19774
+ Karpathy-style autoresearch loop. Hypothesizes improvements to SOUL, prompts, crons; evaluates each with a blended score (LLM &times; 0.7 + objective &times; 0.3). Fires nightly at <strong id="si-schedule-hour">3 AM</strong> (override via SELF_IMPROVE_HOUR env). The auto-fix consumer also handles failing crons (3+ consecutive errors) and verifies each fix over the next 3 runs &mdash; reverting automatically if it doesn't help.
19775
19775
  </div>
19776
19776
  <button class="btn-sm btn-primary" onclick="siRunCycle()" id="si-run-btn">Run Now</button>
19777
19777
  </div>
19778
+ <!-- 1.18.135 — last-diagnostic banner. Surfaces the loop's
19779
+ own self-report ("Plateau: no novel improvement area remaining"
19780
+ etc.) so the user knows WHY recent runs were quiet without
19781
+ digging into state.json. -->
19782
+ <div id="si-diagnostic-banner" style="display:none;margin-bottom:12px;padding:10px 14px;border-radius:6px;font-size:12px"></div>
19778
19783
  <div class="grid-2" id="si-status-cards">
19779
19784
  <div class="skel-block"><div class="skel-row med"></div><div class="skel-row short"></div></div>
19780
19785
  <div class="skel-block"><div class="skel-row med"></div><div class="skel-row short"></div></div>
@@ -40193,17 +40198,61 @@ async function refreshSelfImprove() {
40193
40198
  }
40194
40199
  }
40195
40200
 
40201
+ // 1.18.135 — diagnostic banner. Shows the loop's own last self-report
40202
+ // (plateau, cooldown, infra error). Color-coded so the user can tell
40203
+ // at a glance whether the silence is intentional ("plateau, no fresh
40204
+ // signal") vs broken ("infra error, in 24h cooldown").
40205
+ const diagEl = document.getElementById('si-diagnostic-banner');
40206
+ if (diagEl) {
40207
+ const diag = state && state.lastDiagnostic ? String(state.lastDiagnostic) : '';
40208
+ const infra = state && state.infraError ? state.infraError : null;
40209
+ if (infra) {
40210
+ diagEl.style.display = '';
40211
+ diagEl.style.background = 'rgba(239,68,68,0.08)';
40212
+ diagEl.style.border = '1px solid var(--red)';
40213
+ diagEl.style.color = 'var(--text-primary)';
40214
+ diagEl.innerHTML = '<strong style="color:var(--red)">⚠ Infrastructure error</strong> &middot; ' + esc(infra.diagnostic || infra.category || 'unknown') + ' &middot; in 24h cooldown until ' + (infra.cooldownUntil ? new Date(infra.cooldownUntil).toLocaleString() : 'next probe');
40215
+ } else if (diag) {
40216
+ diagEl.style.display = '';
40217
+ const isPlateau = /plateau/i.test(diag);
40218
+ diagEl.style.background = isPlateau ? 'rgba(245,158,11,0.08)' : 'var(--bg-tertiary)';
40219
+ diagEl.style.border = isPlateau ? '1px solid var(--yellow)' : '1px solid var(--border)';
40220
+ diagEl.style.color = 'var(--text-secondary)';
40221
+ diagEl.innerHTML = (isPlateau ? '<strong style="color:var(--yellow)">⏸ Last run note:</strong> ' : '<strong>Last run note:</strong> ') + esc(diag);
40222
+ } else {
40223
+ diagEl.style.display = 'none';
40224
+ }
40225
+ }
40226
+
40196
40227
  // Status cards
40197
40228
  const cards = document.getElementById('si-status-cards');
40198
40229
  if (cards && state) {
40199
40230
  const m = state.baselineMetrics || {};
40231
+ // 1.18.135 — show the blended objective signal (cronSuccessRate ×
40232
+ // feedbackPositiveRatio) explicitly so the user sees the floor that
40233
+ // every LLM-rated proposal is now blended with (weighted sum, default 30% weight; 1.18.134).
40234
+ const objectiveScore = (m.cronSuccessRate || 0) * (m.feedbackPositiveRatio || 0);
40235
+ // Compute next scheduled run time. Defaults to 3am unless the env var
40236
+ // SELF_IMPROVE_HOUR was set on the daemon — we don't have visibility
40237
+ // into env from the dashboard, so we always show the default.
40238
+ const nextRunAt = (() => {
40239
+ const next = new Date();
40240
+ next.setHours(3, 0, 0, 0);
40241
+ if (next.getTime() <= Date.now()) next.setDate(next.getDate() + 1);
40242
+ return next;
40243
+ })();
40244
+ const nextRunMs = nextRunAt.getTime() - Date.now();
40245
+ const nextRunHrs = Math.max(0, Math.floor(nextRunMs / 3600000));
40246
+ const nextRunLabel = nextRunHrs >= 24 ? Math.floor(nextRunHrs / 24) + 'd' : nextRunHrs + 'h';
40200
40247
  cards.innerHTML =
40201
40248
  '<div class="stat-card"><div class="stat-value">' + (state.status || 'idle') + '</div><div class="stat-label">Status</div></div>' +
40202
40249
  '<div class="stat-card"><div class="stat-value">' + (state.totalExperiments || 0) + '</div><div class="stat-label">Total Experiments</div></div>' +
40203
40250
  '<div class="stat-card"><div class="stat-value">' + (pending.length || 0) + '</div><div class="stat-label">Pending Approvals</div></div>' +
40204
40251
  '<div class="stat-card"><div class="stat-value">' + (state.lastRunAt ? new Date(state.lastRunAt).toLocaleDateString() : 'Never') + '</div><div class="stat-label">Last Run</div></div>' +
40252
+ '<div class="stat-card" title="Next nightly trigger (3am default)"><div class="stat-value">in ' + nextRunLabel + '</div><div class="stat-label">Next Run</div></div>' +
40205
40253
  '<div class="stat-card"><div class="stat-value">' + ((m.feedbackPositiveRatio || 0) * 100).toFixed(0) + '%</div><div class="stat-label">Feedback Positive</div></div>' +
40206
- '<div class="stat-card"><div class="stat-value">' + ((m.cronSuccessRate || 0) * 100).toFixed(0) + '%</div><div class="stat-label">Cron Success</div></div>';
40254
+ '<div class="stat-card"><div class="stat-value">' + ((m.cronSuccessRate || 0) * 100).toFixed(0) + '%</div><div class="stat-label">Cron Success</div></div>' +
40255
+ '<div class="stat-card" title="cronSuccessRate × feedbackPositiveRatio. Blended into every proposal score at 30% weight (override via SELF_IMPROVE_OBJECTIVE_WEIGHT)."><div class="stat-value">' + (objectiveScore * 10).toFixed(1) + '/10</div><div class="stat-label">Objective Floor</div></div>';
40207
40256
  } else if (cards) {
40208
40257
  cards.innerHTML = '<div class="stat-card"><div class="stat-value">idle</div><div class="stat-label">Status</div></div>' +
40209
40258
  '<div class="stat-card"><div class="stat-value">0</div><div class="stat-label">Total Experiments</div></div>';
package/dist/index.js CHANGED
@@ -754,11 +754,14 @@ async function asyncMain() {
754
754
  // output to their Discord channel.
755
755
  const { AgentHeartbeatManager } = await import('./gateway/agent-heartbeat-manager.js');
756
756
  const agentHeartbeats = new AgentHeartbeatManager(gateway.getAgentManager(), gateway);
757
- // Self-improve loop — closes the gap between "trigger written" and
758
- // "fix applied." Every 10 min, scans self-improve/triggers/, classifies
759
- // failures, auto-applies safe cron-config fixes, escalates risky ones.
760
- const { SelfImproveLoop } = await import('./agent/self-improve-loop.js');
761
- const selfImproveLoop = new SelfImproveLoop(dispatcher);
757
+ // Failure-fix consumer (1.18.134 renamed from "self-improve-loop"
758
+ // to disambiguate from the Karpathy autoresearch SelfImproveLoop in
759
+ // src/agent/self-improve.ts). Every 10 min, scans
760
+ // self-improve/triggers/, classifies failures, auto-applies safe
761
+ // cron-config fixes, escalates risky ones. Different concern from
762
+ // the autoresearch hypothesize/evaluate loop.
763
+ const { FailureFixConsumer } = await import('./agent/failure-fix-consumer.js');
764
+ const failureFixConsumer = new FailureFixConsumer(dispatcher);
762
765
  // ── Build channel tasks ──────────────────────────────────────────
763
766
  const channelTasks = [];
764
767
  const activeChannels = [];
@@ -856,7 +859,38 @@ async function asyncMain() {
856
859
  heartbeat.start();
857
860
  cronScheduler.start();
858
861
  agentHeartbeats.start();
859
- selfImproveLoop.start();
862
+ failureFixConsumer.start();
863
+ // 1.18.134 — nightly Karpathy-autoresearch self-improve trigger.
864
+ // The Karpathy SelfImproveLoop (src/agent/self-improve.ts) was
865
+ // previously only triggered by /self-improve run or CLI. With no
866
+ // automatic schedule the loop ran ~3 times in the prior 4 days and
867
+ // sat plateaued. This wires a daily 3am trigger so it iterates on
868
+ // its own — matching Karpathy's continuous-iteration model.
869
+ // SELF_IMPROVE_HOUR env var overrides (0–23, default 3).
870
+ const selfImproveHour = (() => {
871
+ const raw = parseInt(process.env.SELF_IMPROVE_HOUR ?? '3', 10);
872
+ if (Number.isFinite(raw) && raw >= 0 && raw <= 23)
873
+ return raw;
874
+ return 3;
875
+ })();
876
+ // node-cron is already a dependency (used by cron-scheduler). Schedule
877
+ // a single daily tick — the SelfImproveLoop's own time/iteration caps
878
+ // and plateau detection bound the work; we don't need finer granularity.
879
+ try {
880
+ const nodeCron = (await import('node-cron')).default;
881
+ nodeCron.schedule(`0 ${selfImproveHour} * * *`, () => {
882
+ logger.info({ hour: selfImproveHour }, 'Nightly self-improve trigger firing');
883
+ gateway.handleSelfImprove('run').then((summary) => {
884
+ logger.info({ summary }, 'Nightly self-improve trigger complete');
885
+ }).catch((err) => {
886
+ logger.error({ err }, 'Nightly self-improve trigger failed');
887
+ });
888
+ }, { timezone: process.env.TZ || 'America/Los_Angeles' });
889
+ logger.info({ hour: selfImproveHour }, `Self-improve nightly trigger scheduled (${selfImproveHour}:00 daily)`);
890
+ }
891
+ catch (err) {
892
+ logger.warn({ err }, 'Failed to schedule nightly self-improve trigger');
893
+ }
860
894
  // Background-task hygiene: any task left in 'running' is from a prior
861
895
  // process. Mark them aborted so the lifecycle is honest. (P6b will add
862
896
  // resumability; for now fail-fast is clearer than silently re-running.)
@@ -1086,7 +1120,7 @@ async function asyncMain() {
1086
1120
  heartbeat.stop();
1087
1121
  cronScheduler.stop();
1088
1122
  agentHeartbeats.stop();
1089
- selfImproveLoop.stop();
1123
+ failureFixConsumer.stop();
1090
1124
  // ── Self-restart (enhanced with health check + rollback) ────────
1091
1125
  if (restartRequested) {
1092
1126
  // Clear our PID file BEFORE spawning the child, so ensureSingleton()
package/package.json CHANGED
@@ -1,6 +1,6 @@
1
1
  {
2
2
  "name": "clementine-agent",
3
- "version": "1.18.133",
3
+ "version": "1.18.135",
4
4
  "description": "Clementine — Personal AI Assistant (TypeScript)",
5
5
  "type": "module",
6
6
  "main": "dist/index.js",