npm - clementine-agent - Versions diffs - 1.18.133 → 1.18.135 - Mend

clementine-agent 1.18.133 → 1.18.135

This diff shows the changes between publicly available versions of the package that have been released to one of the supported registries. It is provided for informational purposes only and reflects the package versions as they appear in their respective public registries.

Files changed (6) hide show

package/dist/agent/{self-improve-loop.d.ts → failure-fix-consumer.d.ts} +2 -2
package/dist/agent/{self-improve-loop.js → failure-fix-consumer.js} +3 -3
package/dist/agent/self-improve.js +40 -5
package/dist/cli/dashboard.js +51 -2
package/dist/index.js +41 -7
package/package.json +1 -1

package/dist/agent/{self-improve-loop.d.ts → failure-fix-consumer.d.ts} RENAMED Viewed

@@ -98,7 +98,7 @@ export interface SelfImproveLoopOptions {
     allowAutoApplySafeFixes?: boolean;
 }
 export declare function classifyFailure(recentErrors: string[]): FixRecipe;
-export declare class SelfImproveLoop {
+export declare class FailureFixConsumer {
     private readonly tickMs;
     private readonly triggersDir;
     private readonly pendingDir;
@@ -133,4 +133,4 @@ export declare class SelfImproveLoop {
     private processOne;
     private notifyAgent;
 }
-//# sourceMappingURL=self-improve-loop.d.ts.map
+//# sourceMappingURL=failure-fix-consumer.d.ts.map

package/dist/agent/{self-improve-loop.js → failure-fix-consumer.js} RENAMED Viewed

@@ -29,7 +29,7 @@ import path from 'node:path';
 import matter from 'gray-matter';
 import pino from 'pino';
 import { AGENTS_DIR, BASE_DIR, SYSTEM_DIR } from '../config.js';
-const logger = pino({ name: 'clementine.self-improve-loop' });
+const logger = pino({ name: 'clementine.failure-fix-consumer' });
 /**
  * Fallback tick interval. The loop is primarily event-driven via fs.watch
  * on the triggers directory — this is just a slow safety net for cases
@@ -231,7 +231,7 @@ function writePendingChange(record, dir) {
     return file;
 }
 // ── Main loop ────────────────────────────────────────────────────────
-export class SelfImproveLoop {
+export class FailureFixConsumer {
     tickMs;
     triggersDir;
     pendingDir;
@@ -503,4 +503,4 @@ export class SelfImproveLoop {
         }
     }
 }
-//# sourceMappingURL=self-improve-loop.js.map
+//# sourceMappingURL=failure-fix-consumer.js.map

package/dist/agent/self-improve.js CHANGED Viewed

@@ -411,12 +411,18 @@ export class SelfImproveLoop {
                         consecutiveLow++;
                         continue;
                     }
-                    // Diversity safety net: skip if hypothesis targets an over-represented area:target
+                    // Diversity safety net: skip if hypothesis targets an over-represented area:target.
+                    // 1.18.134 — loosened cap from 3 to 5. The old cap caused the
+                    // loop to plateau immediately whenever it had ~3 ideas about
+                    // SOUL.md (which is a frequent attractor). At 5 the
+                    // hypothesizer gets a few more swings at the same area before
+                    // diversity kicks in — but still avoids monomania.
+                    const DIVERSITY_CAP = 5;
                     const proposalKey = `${proposal.area}:${proposal.target}`;
                     const proposalCount = history.filter(e => `${e.area}:${e.target}` === proposalKey).length
                         + this.getPendingChanges().filter(p => `${p.area}:${p.target}` === proposalKey).length;
-                    if (proposalCount >= 3) {
-                        logger.warn({ area: proposal.area, target: proposal.target, count: proposalCount }, 'Hypothesis over-targeted — skipping');
+                    if (proposalCount >= DIVERSITY_CAP) {
+                        logger.warn({ area: proposal.area, target: proposal.target, count: proposalCount, cap: DIVERSITY_CAP }, 'Hypothesis over-targeted — skipping');
                         consecutiveLow++;
                         continue;
                     }
@@ -440,8 +446,37 @@ export class SelfImproveLoop {
                     const before = await this.readCurrentState(proposal.area, proposal.target);
                     // Step 5: Evaluate
                     const evaluation = await this.withTimeout(this.evaluate(before, proposal.proposedChange, proposal.hypothesis), 60_000);
-                    const score = evaluation?.score ?? 0;
-                    const normalizedScore = score / 10; // Convert 0-10 to 0-1
+                    const llmScore = evaluation?.score ?? 0;
+                    const normalizedLlmScore = llmScore / 10; // Convert 0-10 to 0-1
+                    // 1.18.134 — blend the LLM evaluator score with an objective
+                    // signal pulled from real metrics. Karpathy's autoresearch uses
+                    // ONE objective metric (val_bpb); Clementine's LLM-only score
+                    // can drift, especially when proposals affect SOUL.md or
+                    // cosmetic prompt fields. The objective floor is computed from
+                    // the baseline metrics gathered at the start of THIS run:
+                    //
+                    //   objective = cronSuccessRate * feedbackPositiveRatio
+                    //
+                    // Both 0..1; product preserves the multiplicative penalty of
+                    // either metric being weak. We weight 70% LLM / 30% objective
+                    // — the LLM still drives most decisions, but a proposal can't
+                    // sail through on a 9/10 evaluator score when feedback is at
+                    // 35% positive (which is the current reality). Set the env
+                    // var SELF_IMPROVE_OBJECTIVE_WEIGHT to override (0..1).
+                    const objectiveWeightRaw = parseFloat(process.env.SELF_IMPROVE_OBJECTIVE_WEIGHT ?? '0.3');
+                    const objectiveWeight = Number.isFinite(objectiveWeightRaw) && objectiveWeightRaw >= 0 && objectiveWeightRaw <= 1
+                        ? objectiveWeightRaw : 0.3;
+                    const llmWeight = 1 - objectiveWeight;
+                    const objectiveScore = (state.baselineMetrics.cronSuccessRate || 0)
+                        * (state.baselineMetrics.feedbackPositiveRatio || 0);
+                    const normalizedScore = (normalizedLlmScore * llmWeight) + (objectiveScore * objectiveWeight);
+                    const score = normalizedScore * 10; // For display + reason text
+                    logger.debug({
+                        llm: normalizedLlmScore.toFixed(3),
+                        objective: objectiveScore.toFixed(3),
+                        blended: normalizedScore.toFixed(3),
+                        weight: objectiveWeight,
+                    }, 'Score blend');
                     const accepted = normalizedScore >= this.config.acceptThreshold;
                     // Surface gate: even when accepted, only score >= surfaceThreshold
                     // reaches the user's pending-changes inbox. Below that floor we

package/dist/cli/dashboard.js CHANGED Viewed

@@ -19771,10 +19771,15 @@ if('serviceWorker' in navigator){navigator.serviceWorker.getRegistrations().then
         <div class="tab-pane" id="tab-intelligence-learning">
           <div style="display:flex;justify-content:space-between;align-items:center;margin-bottom:12px;gap:12px;flex-wrap:wrap">
             <div style="font-size:13px;color:var(--text-secondary);max-width:680px">
-              Self-improvement runs nightly at 1 AM. The autonomous loop also auto-fixes failing crons (3+ consecutive errors) and verifies each fix over the next 3 runs &mdash; reverting automatically if it doesn't help.
+              Karpathy-style autoresearch loop. Hypothesizes improvements to SOUL, prompts, crons; evaluates each with a blended score (LLM &times; 0.7 + objective &times; 0.3). Fires nightly at <strong id="si-schedule-hour">3 AM</strong> (override via SELF_IMPROVE_HOUR env). The auto-fix consumer also handles failing crons (3+ consecutive errors) and verifies each fix over the next 3 runs &mdash; reverting automatically if it doesn't help.
             </div>
             <button class="btn-sm btn-primary" onclick="siRunCycle()" id="si-run-btn">Run Now</button>
           </div>
+          <!-- 1.18.135 — last-diagnostic banner. Surfaces the loop's
+               own self-report ("Plateau: no novel improvement area remaining"
+               etc.) so the user knows WHY recent runs were quiet without
+               digging into state.json. -->
+          <div id="si-diagnostic-banner" style="display:none;margin-bottom:12px;padding:10px 14px;border-radius:6px;font-size:12px"></div>
           <div class="grid-2" id="si-status-cards">
             <div class="skel-block"><div class="skel-row med"></div><div class="skel-row short"></div></div>
             <div class="skel-block"><div class="skel-row med"></div><div class="skel-row short"></div></div>
@@ -40193,17 +40198,61 @@ async function refreshSelfImprove() {
       }
     }
+    // 1.18.135 — diagnostic banner. Shows the loop's own last self-report
+    // (plateau, cooldown, infra error). Color-coded so the user can tell
+    // at a glance whether the silence is intentional ("plateau, no fresh
+    // signal") vs broken ("infra error, in 24h cooldown").
+    const diagEl = document.getElementById('si-diagnostic-banner');
+    if (diagEl) {
+      const diag = state && state.lastDiagnostic ? String(state.lastDiagnostic) : '';
+      const infra = state && state.infraError ? state.infraError : null;
+      if (infra) {
+        diagEl.style.display = '';
+        diagEl.style.background = 'rgba(239,68,68,0.08)';
+        diagEl.style.border = '1px solid var(--red)';
+        diagEl.style.color = 'var(--text-primary)';
+        diagEl.innerHTML = '<strong style="color:var(--red)">⚠ Infrastructure error</strong> &middot; ' + esc(infra.diagnostic || infra.category || 'unknown') + ' &middot; in 24h cooldown until ' + (infra.cooldownUntil ? new Date(infra.cooldownUntil).toLocaleString() : 'next probe');
+      } else if (diag) {
+        diagEl.style.display = '';
+        const isPlateau = /plateau/i.test(diag);
+        diagEl.style.background = isPlateau ? 'rgba(245,158,11,0.08)' : 'var(--bg-tertiary)';
+        diagEl.style.border = isPlateau ? '1px solid var(--yellow)' : '1px solid var(--border)';
+        diagEl.style.color = 'var(--text-secondary)';
+        diagEl.innerHTML = (isPlateau ? '<strong style="color:var(--yellow)">⏸ Last run note:</strong> ' : '<strong>Last run note:</strong> ') + esc(diag);
+      } else {
+        diagEl.style.display = 'none';
+      }
+    }
     // Status cards
     const cards = document.getElementById('si-status-cards');
     if (cards && state) {
       const m = state.baselineMetrics || {};
+      // 1.18.135 — show the objective signal (cronSuccessRate ×
+      // feedbackPositiveRatio) explicitly so the user sees the value that
+      // is now blended into every LLM-rated proposal score at the
+      // objective weight (30% by default), as introduced in 1.18.134.
+      const objectiveScore = (m.cronSuccessRate || 0) * (m.feedbackPositiveRatio || 0);
+      // Compute next scheduled run time. Defaults to 3am unless the env var
+      // SELF_IMPROVE_HOUR was set on the daemon — we don't have visibility
+      // into env from the dashboard, so we always show the default.
+      const nextRunAt = (() => {
+        const next = new Date();
+        next.setHours(3, 0, 0, 0);
+        if (next.getTime() <= Date.now()) next.setDate(next.getDate() + 1);
+        return next;
+      })();
+      const nextRunMs = nextRunAt.getTime() - Date.now();
+      const nextRunHrs = Math.max(0, Math.floor(nextRunMs / 3600000));
+      const nextRunLabel = nextRunHrs >= 24 ? Math.floor(nextRunHrs / 24) + 'd' : nextRunHrs + 'h';
       cards.innerHTML =
         '<div class="stat-card"><div class="stat-value">' + (state.status || 'idle') + '</div><div class="stat-label">Status</div></div>' +
         '<div class="stat-card"><div class="stat-value">' + (state.totalExperiments || 0) + '</div><div class="stat-label">Total Experiments</div></div>' +
         '<div class="stat-card"><div class="stat-value">' + (pending.length || 0) + '</div><div class="stat-label">Pending Approvals</div></div>' +
         '<div class="stat-card"><div class="stat-value">' + (state.lastRunAt ? new Date(state.lastRunAt).toLocaleDateString() : 'Never') + '</div><div class="stat-label">Last Run</div></div>' +
+        '<div class="stat-card" title="Next nightly trigger (3am default)"><div class="stat-value">in ' + nextRunLabel + '</div><div class="stat-label">Next Run</div></div>' +
         '<div class="stat-card"><div class="stat-value">' + ((m.feedbackPositiveRatio || 0) * 100).toFixed(0) + '%</div><div class="stat-label">Feedback Positive</div></div>' +
-        '<div class="stat-card"><div class="stat-value">' + ((m.cronSuccessRate || 0) * 100).toFixed(0) + '%</div><div class="stat-label">Cron Success</div></div>';
+        '<div class="stat-card"><div class="stat-value">' + ((m.cronSuccessRate || 0) * 100).toFixed(0) + '%</div><div class="stat-label">Cron Success</div></div>' +
+        '<div class="stat-card" title="cronSuccessRate × feedbackPositiveRatio. Blended into every proposal score at 30% weight (override via SELF_IMPROVE_OBJECTIVE_WEIGHT)."><div class="stat-value">' + (objectiveScore * 10).toFixed(1) + '/10</div><div class="stat-label">Objective Floor</div></div>';
     } else if (cards) {
       cards.innerHTML = '<div class="stat-card"><div class="stat-value">idle</div><div class="stat-label">Status</div></div>' +
         '<div class="stat-card"><div class="stat-value">0</div><div class="stat-label">Total Experiments</div></div>';

package/dist/index.js CHANGED Viewed

@@ -754,11 +754,14 @@ async function asyncMain() {
     // output to their Discord channel.
     const { AgentHeartbeatManager } = await import('./gateway/agent-heartbeat-manager.js');
     const agentHeartbeats = new AgentHeartbeatManager(gateway.getAgentManager(), gateway);
-    // Self-improve loop — closes the gap between "trigger written" and
-    // "fix applied." Every 10 min, scans self-improve/triggers/, classifies
-    // failures, auto-applies safe cron-config fixes, escalates risky ones.
-    const { SelfImproveLoop } = await import('./agent/self-improve-loop.js');
-    const selfImproveLoop = new SelfImproveLoop(dispatcher);
+    // Failure-fix consumer (1.18.134 — renamed from "self-improve-loop"
+    // to disambiguate from the Karpathy autoresearch SelfImproveLoop in
+    // src/agent/self-improve.ts). Every 10 min, scans
+    // self-improve/triggers/, classifies failures, auto-applies safe
+    // cron-config fixes, escalates risky ones. Different concern from
+    // the autoresearch hypothesize/evaluate loop.
+    const { FailureFixConsumer } = await import('./agent/failure-fix-consumer.js');
+    const failureFixConsumer = new FailureFixConsumer(dispatcher);
     // ── Build channel tasks ──────────────────────────────────────────
     const channelTasks = [];
     const activeChannels = [];
@@ -856,7 +859,38 @@ async function asyncMain() {
     heartbeat.start();
     cronScheduler.start();
     agentHeartbeats.start();
-    selfImproveLoop.start();
+    failureFixConsumer.start();
+    // 1.18.134 — nightly Karpathy-autoresearch self-improve trigger.
+    // The Karpathy SelfImproveLoop (src/agent/self-improve.ts) was
+    // previously only triggered by /self-improve run or CLI. With no
+    // automatic schedule the loop ran ~3 times in the prior 4 days and
+    // sat plateaued. This wires a daily 3am trigger so it iterates on
+    // its own — matching Karpathy's continuous-iteration model.
+    // SELF_IMPROVE_HOUR env var overrides (0–23, default 3).
+    const selfImproveHour = (() => {
+        const raw = parseInt(process.env.SELF_IMPROVE_HOUR ?? '3', 10);
+        if (Number.isFinite(raw) && raw >= 0 && raw <= 23)
+            return raw;
+        return 3;
+    })();
+    // node-cron is already a dependency (used by cron-scheduler). Schedule
+    // a single daily tick — the SelfImproveLoop's own time/iteration caps
+    // and plateau detection bound the work; we don't need finer granularity.
+    try {
+        const nodeCron = (await import('node-cron')).default;
+        nodeCron.schedule(`0 ${selfImproveHour} * * *`, () => {
+            logger.info({ hour: selfImproveHour }, 'Nightly self-improve trigger firing');
+            gateway.handleSelfImprove('run').then((summary) => {
+                logger.info({ summary }, 'Nightly self-improve trigger complete');
+            }).catch((err) => {
+                logger.error({ err }, 'Nightly self-improve trigger failed');
+            });
+        }, { timezone: process.env.TZ || 'America/Los_Angeles' });
+        logger.info({ hour: selfImproveHour }, `Self-improve nightly trigger scheduled (${selfImproveHour}:00 daily)`);
+    }
+    catch (err) {
+        logger.warn({ err }, 'Failed to schedule nightly self-improve trigger');
+    }
     // Background-task hygiene: any task left in 'running' is from a prior
     // process. Mark them aborted so the lifecycle is honest. (P6b will add
     // resumability; for now fail-fast is clearer than silently re-running.)
@@ -1086,7 +1120,7 @@ async function asyncMain() {
     heartbeat.stop();
     cronScheduler.stop();
     agentHeartbeats.stop();
-    selfImproveLoop.stop();
+    failureFixConsumer.stop();
     // ── Self-restart (enhanced with health check + rollback) ────────
     if (restartRequested) {
         // Clear our PID file BEFORE spawning the child, so ensureSingleton()

package/package.json CHANGED Viewed

@@ -1,6 +1,6 @@
 {
   "name": "clementine-agent",
-  "version": "1.18.133",
+  "version": "1.18.135",
   "description": "Clementine — Personal AI Assistant (TypeScript)",
   "type": "module",
   "main": "dist/index.js",