npm - clementine-agent - Versions diffs - 1.18.162 → 1.18.163 - Mend

clementine-agent 1.18.162 → 1.18.163

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (5) hide show

package/dist/agent/self-improve.js +14 -0
package/dist/cli/dashboard.js +51 -2
package/dist/gateway/failure-clustering.d.ts +94 -0
package/dist/gateway/failure-clustering.js +190 -0
package/package.json +1 -1

package/dist/agent/self-improve.js CHANGED Viewed

@@ -19,6 +19,7 @@ import { listAllGoals } from '../tools/shared.js';
 import { MemoryStore } from '../memory/store.js';
 import { ANTHROPIC_SKILL_NAME_PATTERN } from './skill-store.js';
 import { recordApprovalSignal, formatApprovalSignalsForHypothesizer } from './approval-signals.js';
+import { clusterBrokenJobs, formatClustersForHypothesizer } from '../gateway/failure-clustering.js';
 const logger = pino({ name: 'clementine.self-improve' });
 // ── Defaults ─────────────────────────────────────────────────────────
 const DEFAULT_CONFIG = {
@@ -1102,6 +1103,18 @@ export class SelfImproveLoop {
         // owner has approved, away from those they've denied. Empty string for
         // fresh installs, which keeps the prompt clean.
         const approvalSignalsText = formatApprovalSignalsForHypothesizer();
+        // Cross-job failure clusters (1.18.163) — when ≥3 jobs hit the same
+        // normalized error pattern in 48h, surface ONE cluster summary so
+        // the hypothesizer proposes a root-cause fix instead of N per-job
+        // patches. Empty string when no cluster meets the threshold.
+        let failureClusterText = '';
+        try {
+            const clusters = clusterBrokenJobs();
+            failureClusterText = formatClustersForHypothesizer(clusters);
+        }
+        catch (err) {
+            logger.warn({ err }, 'Failed to compute failure clusters — proceeding without them');
+        }
         // ── Step 1: Analysis — identify top opportunities from metrics (no config dumps) ──
         const analysisPrompt = `You are Clementine's self-improvement strategist. Analyze the performance data below and identify the top 3 improvement opportunities.\n\n` +
             `## Recent Performance Data (last 7 days)\n` +
@@ -1119,6 +1132,7 @@ export class SelfImproveLoop {
             diversityConstraint +
             agentFocusText +
             soulCandidatesText +
+            (failureClusterText ? `\n${failureClusterText}` : '') +
             (approvalSignalsText ? `\n${approvalSignalsText}` : '') +
             `\n## Instructions\n` +
             `Propose **1-3 concrete, high-impact improvements** the owner should review today — no fewer (aim for at least one actionable suggestion when data warrants it), no more (the owner reads each proposal manually and you'll overwhelm them). Rank by expected impact; drop anything below "solid idea".\n\n` +

package/dist/cli/dashboard.js CHANGED Viewed

@@ -11407,7 +11407,7 @@ If the tool returns nothing or errors, return an empty array \`[]\`.`,
             res.status(500).json({ error: String(err) });
         }
     });
-    app.get('/api/self-improve', (_req, res) => {
+    app.get('/api/self-improve', async (_req, res) => {
         const siDir = path.join(BASE_DIR, 'self-improve');
         const stateFile = path.join(siDir, 'state.json');
         const logFile = path.join(siDir, 'experiment-log.jsonl');
@@ -11472,7 +11472,18 @@ If the tool returns nothing or errors, return an empty array \`[]\`.`,
             }
             catch { /* ignore */ }
         }
-        res.json({ state, experiments, pending, triggers, verifications });
+        // 1.18.163 — cross-job failure clusters (≥3 jobs hitting the same
+        // normalized error pattern in 48h). Computed on demand from
+        // computeBrokenJobs(); no schema, no persistence. The Self-Improve
+        // tab surfaces this so the owner sees "5 jobs hit X — propose one
+        // root-cause fix" instead of N per-job rows.
+        let clusters = [];
+        try {
+            const { clusterBrokenJobs } = await import('../gateway/failure-clustering.js');
+            clusters = clusterBrokenJobs();
+        }
+        catch { /* non-fatal — empty clusters list */ }
+        res.json({ state, experiments, pending, triggers, verifications, clusters });
     });
     app.post('/api/self-improve/run', async (_req, res) => {
         try {
@@ -19940,6 +19951,13 @@ if('serviceWorker' in navigator){navigator.serviceWorker.getRegistrations().then
               <div class="empty-state" style="padding:14px">No active failures &mdash; nothing has tripped 3+ consecutive errors.</div>
             </div>
           </div>
+          <div class="card" style="margin-top:16px" id="si-clusters-card" hidden>
+            <div class="card-header" style="display:flex;align-items:center;justify-content:space-between">
+              <span>Cross-job failure clusters <span style="font-weight:normal;font-size:11px;color:var(--text-muted)">&middot; 3+ jobs hitting the same error pattern (last 48h)</span></span>
+              <span class="tab-badge" id="tab-si-clusters" style="background:#a855f7;color:#fff">0</span>
+            </div>
+            <div class="card-body" id="si-clusters-list" style="padding:0"></div>
+          </div>
           <div class="card" style="margin-top:16px">
             <div class="card-header" style="display:flex;align-items:center;justify-content:space-between">
               <span>Verifying fixes</span>
@@ -40500,6 +40518,7 @@ async function refreshSelfImprove() {
     const pending = d.pending || [];
     const triggers = d.triggers || [];
     const verifications = d.verifications || [];
+    const clusters = d.clusters || [];
     // Update tab badge — combine human-attention queues so the sidebar
     // count reflects "things that need you to look at", not just proposals.
@@ -40537,6 +40556,36 @@ async function refreshSelfImprove() {
       }
     }
+    // 1.18.163 — cross-job failure clusters (≥3 jobs hitting the same
+    // normalized pattern). Hidden when the list is empty so the card
+    // doesn't take up space on a healthy install.
+    const clustersCard = document.getElementById('si-clusters-card');
+    const clustersList = document.getElementById('si-clusters-list');
+    const clustersBadge = document.getElementById('tab-si-clusters');
+    if (clustersCard && clustersList) {
+      if (clusters.length === 0) {
+        clustersCard.hidden = true;
+      } else {
+        clustersCard.hidden = false;
+        if (clustersBadge) clustersBadge.textContent = clusters.length;
+        clustersList.innerHTML = clusters.map(function(c) {
+          var rep = String(c.representative || '').slice(0, 200);
+          var jobsList = (c.jobs || []).slice(0, 5).map(function(j) {
+            return '<span class="badge" style="margin-right:4px;font-size:11px">' + esc(j.jobName) + ' &times;' + (j.errorCount48h || 0) + '</span>';
+          }).join('');
+          var more = (c.jobs && c.jobs.length > 5) ? '<span style="font-size:11px;color:var(--text-muted)">+' + (c.jobs.length - 5) + ' more</span>' : '';
+          return '<div style="padding:12px;border-bottom:1px solid var(--border)">' +
+            '<div style="display:flex;justify-content:space-between;align-items:baseline;gap:8px;flex-wrap:wrap">' +
+              '<div><strong>' + (c.jobs ? c.jobs.length : 0) + ' jobs</strong> &middot; ' +
+              '<span style="font-size:11px;color:var(--text-muted)">' + (c.totalErrors || 0) + ' total errors (48h)</span></div>' +
+            '</div>' +
+            '<div style="margin-top:6px;font-size:12px;color:var(--text-secondary);font-family:ui-monospace,monospace">' + esc(rep) + '</div>' +
+            '<div style="margin-top:8px">' + jobsList + ' ' + more + '</div>' +
+          '</div>';
+        }).join('');
+      }
+    }
     // Pending fix verifications — auto-fixes soaking through the 3-run window.
     const verifyEl = document.getElementById('si-verifying-list');
     if (verifyEl) {

package/dist/gateway/failure-clustering.d.ts ADDED Viewed

@@ -0,0 +1,94 @@
+/**
+ * Cross-job failure clustering (1.18.163).
+ *
+ * Today the failure pipeline is per-job:
+ *   broken-job(jobName) → classifyFailure(lastErrors) → 1 fix proposal
+ *
+ * That means when 5 different cron jobs all hit the same root cause
+ * (e.g. all 5 fail with "Prompt is too long"), the system generates
+ * 5 isolated patches instead of 1 root-cause fix. The owner sees
+ * 5 separate proposals in the Self-Improve tab and either approves all
+ * 5 (busywork) or denies them (and the underlying issue persists).
+ *
+ * This module groups recent broken jobs by *normalized error pattern*.
+ * When ≥3 distinct jobs hit the same cluster, the owner gets ONE
+ * "5 jobs all hit X — propose Y for all of them" suggestion instead of
+ * N separate ones.
+ *
+ * This is purely a *suggestion / presentation* layer — clusters are
+ * surfaced as a hint to the hypothesizer + dashboard. The existing
+ * per-job `failure-fix-consumer` continues to handle individual patches
+ * unchanged. Clustering is additive observability, not a replacement
+ * for per-job fixes.
+ *
+ * Reads from the existing `computeBrokenJobs()` source — no new schema,
+ * no new persistence, computed on demand.
+ */
+import type { BrokenJob } from './failure-monitor.js';
+/**
+ * Minimum distinct jobs required to form a cluster. Below this we don't
+ * bother — a single repeated error is just a per-job problem. Compared
+ * against the number of distinct job names that share one pattern.
+ * 3 is conservative: 2 looks coincidental, 3 is "this is a systemic
+ * thing." Tunable if we get noise.
+ */
+export declare const MIN_CLUSTER_SIZE = 3;
+/**
+ * Normalize an error message into a clustering key.
+ *
+ * Goals:
+ *  - "Prompt is too long (12345 tokens)" and "Prompt is too long (45678
+ *    tokens)" should collapse to the same key.
+ *  - Job-specific tokens (UUIDs, timestamps, paths with the job name)
+ *    should be stripped.
+ *  - The result should still be human-readable (we surface it in the UI).
+ *
+ * Strategy (substitutions applied by the implementation):
+ *  1. Lowercase + trim; whitespace runs are collapsed at the very end
+ *  2. ISO timestamps + UNIX epochs → "<ts>"
+ *  3. UUIDs → "<uuid>", long hex tokens (16+ chars) → "<hex>"
+ *  4. Parenthesized numbers ("(12345 tokens)" → "(N tokens)")
+ *  5. Absolute paths → "<path>/<basename>"; other 4+ digit numbers → "<N>"
+ *  6. Truncate to ERROR_NORMALIZE_LEN
+ */
+export declare function normalizeErrorMessage(raw: string): string;
+export interface FailureCluster {
+    /** The normalized pattern key. Stable across jobs/runs. */
+    pattern: string;
+    /** A representative human-readable error message (one of the original
+     *  uncleaned strings, picked by frequency; ties keep the first seen). */
+    representative: string;
+    /** Distinct jobs in this cluster, sorted by errorCount48h desc (copied from the job as-is, not split per pattern). */
+    jobs: Array<{
+        jobName: string;
+        agentSlug?: string;
+        errorCount48h: number;
+        lastErrorAt: string | null;
+    }>;
+    /** Sum of jobs[].errorCount48h (last 48h); a job that sits in two clusters is counted in both. */
+    totalErrors: number;
+}
+/**
+ * Group broken jobs by normalized error pattern. Only returns clusters
+ * with ≥ MIN_CLUSTER_SIZE distinct jobs, largest first (by distinct-job
+ * count, then total error count, then pattern ascending).
+ * When `jobs` is omitted the list comes from computeBrokenJobs().
+ *
+ * Each broken job contributes UP TO 3 patterns (its `lastErrors[]`).
+ * A job that hits two distinct patterns counts toward both clusters
+ * — that's by design, since a job with two root causes really does
+ * need both fixes.
+ */
+export declare function clusterBrokenJobs(jobs?: BrokenJob[]): FailureCluster[];
+/**
+ * Render a cluster summary for the hypothesizer prompt block. Empty
+ * string when no clusters meet the threshold.
+ *
+ * Format (at most 5 clusters, at most 5 job names per cluster):
+ *   ### Cross-job failure clusters (last 48h)
+ *   - "<representative raw error, cut to 100 chars + …>" — 6 jobs (17 total errors): insight-check, ..., +1 more
+ *   - "<representative raw error>" — 3 jobs (9 total errors): ...
+ *   When a cluster of 3+ jobs hits the same pattern, prefer ONE root-cause proposal (...) over N per-job patches.
+ */
+export declare function formatClustersForHypothesizer(clusters: FailureCluster[]): string;
+//# sourceMappingURL=failure-clustering.d.ts.map

package/dist/gateway/failure-clustering.js ADDED Viewed

@@ -0,0 +1,190 @@
+/**
+ * Cross-job failure clustering (1.18.163).
+ *
+ * Today the failure pipeline is per-job:
+ *   broken-job(jobName) → classifyFailure(lastErrors) → 1 fix proposal
+ *
+ * That means when 5 different cron jobs all hit the same root cause
+ * (e.g. all 5 fail with "Prompt is too long"), the system generates
+ * 5 isolated patches instead of 1 root-cause fix. The owner sees
+ * 5 separate proposals in the Self-Improve tab and either approves all
+ * 5 (busywork) or denies them (and the underlying issue persists).
+ *
+ * This module groups recent broken jobs by *normalized error pattern*.
+ * When ≥3 distinct jobs hit the same cluster, the owner gets ONE
+ * "5 jobs all hit X — propose Y for all of them" suggestion instead of
+ * N separate ones.
+ *
+ * This is purely a *suggestion / presentation* layer — clusters are
+ * surfaced as a hint to the hypothesizer + dashboard. The existing
+ * per-job `failure-fix-consumer` continues to handle individual patches
+ * unchanged. Clustering is additive observability, not a replacement
+ * for per-job fixes.
+ *
+ * Reads from the existing `computeBrokenJobs()` source — no new schema,
+ * no new persistence, computed on demand.
+ */
+import pino from 'pino';
+import { computeBrokenJobs } from './failure-monitor.js';
+const logger = pino({ name: 'clementine.failure-clustering' });
+// ── Tunables ─────────────────────────────────────────────────────────
+/**
+ * Minimum distinct jobs required to form a cluster. Below this we don't
+ * bother — a single repeated error is just a per-job problem. Compared
+ * against the number of distinct job names in a pattern bucket.
+ * 3 is conservative: 2 looks coincidental, 3 is "this is a systemic
+ * thing." Tunable if we get noise.
+ */
+export const MIN_CLUSTER_SIZE = 3;
+/** Max chars kept of a *normalized* error key (the cut is applied after
+ *  all substitutions, not to the raw input). The important signal is in the
+ *  first ~200 chars; later text is usually stack traces or per-call IDs. */
+const ERROR_NORMALIZE_LEN = 200;
+// ── Normalization ────────────────────────────────────────────────────
+/**
+ * Normalize an error message into a clustering key.
+ *
+ * Goals:
+ *  - "Prompt is too long (12345 tokens)" and "Prompt is too long (45678
+ *    tokens)" should collapse to the same key, whatever the digit count.
+ *  - Job-specific tokens (UUIDs, timestamps, paths with the job name)
+ *    should be stripped.
+ *  - The result should still be human-readable (we surface it in the UI).
+ *
+ * Strategy (order matters): lowercase → ISO timestamps (`T` or space
+ * separated) → UUIDs → long hex → parenthesized numbers ("(12345 tokens)"
+ * → "(N tokens)") → UNIX epochs → absolute paths reduced to
+ * `<path>/basename` → remaining 4+ digit numbers → collapse whitespace →
+ * truncate to ERROR_NORMALIZE_LEN.
+ * @param {unknown} raw Raw error text; non-strings and '' yield ''.
+ * @returns {string} Normalized key, at most ERROR_NORMALIZE_LEN chars.
+ */
+export function normalizeErrorMessage(raw) {
+    // A stray non-string entry must not throw and abort clustering for every other job.
+    if (typeof raw !== 'string' || !raw)
+        return '';
+    let s = raw.toLowerCase().trim();
+    // ISO timestamps: 2026-05-10T14:23:00.000Z or "2026-05-10 14:23:00" (optional millis/tz)
+    s = s.replace(/\d{4}-\d{2}-\d{2}[t ]\d{2}:\d{2}:\d{2}(\.\d+)?(z|[+-]\d{2}:?\d{2})?/g, '<ts>');
+    // UUIDs, then long hex (16+ chars, like commit SHAs / session ids)
+    s = s.replace(/[0-9a-f]{8}-[0-9a-f]{4}-[0-9a-f]{4}-[0-9a-f]{4}-[0-9a-f]{12}/g, '<uuid>');
+    s = s.replace(/\b[0-9a-f]{16,}\b/g, '<hex>');
+    // Parenthesized numbers: (12345) → (N) ; (12345 tokens) → (N tokens). Runs BEFORE the
+    // epoch rules so "(1234567890 bytes)" becomes "(N bytes)", not "(<ts> bytes)".
+    s = s.replace(/\(\s*\d[\d,_.]*\s*([a-z]*)\s*\)/g, (_m, suffix) => suffix ? `(N ${suffix})` : '(N)');
+    // Unix epoch ms (13-digit) + sec (10-digit) — must come BEFORE plain numbers
+    s = s.replace(/\b\d{13}\b/g, '<ts>');
+    s = s.replace(/\b\d{10}\b/g, '<ts>');
+    // Absolute paths — keep just the basename
+    s = s.replace(/\/[\w./-]+\/([\w.-]+)/g, '<path>/$1');
+    // Remaining standalone large numbers, then collapse whitespace
+    s = s.replace(/\b\d{4,}\b/g, '<N>');
+    s = s.replace(/\s+/g, ' ').trim();
+    return s.slice(0, ERROR_NORMALIZE_LEN);
+}
+// ── Clusterer ────────────────────────────────────────────────────────
+/**
+ * Group the current broken jobs by normalized error pattern. Only
+ * returns clusters with ≥ MIN_CLUSTER_SIZE distinct jobs, largest first
+ * (distinct-job count, then total error count, then pattern).
+ *
+ * Each broken job contributes UP TO 3 patterns (its `lastErrors[]`).
+ * A job that hits two distinct patterns counts toward both clusters
+ * — by design — and its whole `errorCount48h` is credited to each.
+ *
+ * @param {object[]} [jobs] Broken jobs; defaults to computeBrokenJobs().
+ * @returns {object[]} Clusters shaped { pattern, representative, jobs, totalErrors }.
+ */
+export function clusterBrokenJobs(jobs) {
+    const source = jobs ?? computeBrokenJobs();
+    if (!Array.isArray(source) || source.length === 0)
+        return [];
+    // pattern → { representative (most common raw), rawCounts, jobs keyed by jobName }
+    const buckets = new Map();
+    for (const job of source) {
+        // A missing or non-numeric count would turn the totals and both
+        // sort comparators into NaN, so it is treated as zero.
+        const jobErrors = Number.isFinite(job.errorCount48h) ? job.errorCount48h : 0;
+        const seenForThisJob = new Set();
+        for (const raw of job.lastErrors ?? []) {
+            // Non-string entries get no key and are skipped rather than thrown on.
+            const key = typeof raw === 'string' ? normalizeErrorMessage(raw) : '';
+            if (!key)
+                continue;
+            // Don't double-count this job for the same pattern even if
+            // lastErrors contains two near-identical messages.
+            if (seenForThisJob.has(key))
+                continue;
+            seenForThisJob.add(key);
+            let bucket = buckets.get(key);
+            if (!bucket) {
+                bucket = { representative: raw, rawCounts: new Map(), jobs: new Map() };
+                buckets.set(key, bucket);
+            }
+            // Keep the most-common raw form as the representative (ties: first seen).
+            const cur = (bucket.rawCounts.get(raw) ?? 0) + 1;
+            bucket.rawCounts.set(raw, cur);
+            if (cur > (bucket.rawCounts.get(bucket.representative) ?? 0))
+                bucket.representative = raw;
+            const existing = bucket.jobs.get(job.jobName);
+            if (existing) {
+                existing.errorCount48h += jobErrors;
+            }
+            else {
+                bucket.jobs.set(job.jobName, {
+                    jobName: job.jobName,
+                    ...(job.agentSlug ? { agentSlug: job.agentSlug } : {}),
+                    errorCount48h: jobErrors,
+                    lastErrorAt: job.lastErrorAt,
+                });
+            }
+        }
+    }
+    const clusters = [];
+    for (const [pattern, bucket] of buckets) {
+        if (bucket.jobs.size < MIN_CLUSTER_SIZE)
+            continue;
+        const jobsArr = [...bucket.jobs.values()].sort((a, b) => b.errorCount48h - a.errorCount48h);
+        const totalErrors = jobsArr.reduce((acc, j) => acc + j.errorCount48h, 0);
+        clusters.push({
+            pattern,
+            representative: bucket.representative,
+            jobs: jobsArr,
+            totalErrors,
+        });
+    }
+    // Sort: distinct-job count desc, then total errors desc, then pattern asc
+    clusters.sort((a, b) => (b.jobs.length - a.jobs.length) ||
+        (b.totalErrors - a.totalErrors) ||
+        a.pattern.localeCompare(b.pattern));
+    if (clusters.length > 0) {
+        logger.info({ count: clusters.length, top: clusters[0]?.pattern.slice(0, 80), topJobs: clusters[0]?.jobs.length }, 'Failure clusters detected');
+    }
+    return clusters;
+}
+/**
+ * Render a cluster summary for the hypothesizer prompt block. Empty
+ * string when no clusters meet the threshold. Whitespace runs in the raw
+ * representative are flattened so every cluster stays a single bullet.
+ * Format (top 5 clusters, first 5 job names each, then a guidance line):
+ *   ### Cross-job failure clusters (last 48h)
+ *   - "Prompt is too long (12345 tokens)" — 6 jobs (17 total errors): insight-check, ..., +1 more
+ * @param {object[]} clusters Output of clusterBrokenJobs(); null/empty gives ''.
+ * @returns {string} Text block terminated by a blank line, or ''.
+ */
+export function formatClustersForHypothesizer(clusters) {
+    if (!clusters || clusters.length === 0)
+        return '';
+    const lines = ['### Cross-job failure clusters (last 48h)'];
+    for (const c of clusters.slice(0, 5)) {
+        const jobNames = c.jobs.slice(0, 5).map(j => j.jobName).join(', ');
+        const more = c.jobs.length > 5 ? `, +${c.jobs.length - 5} more` : '';
+        const flat = String(c.representative ?? '').replace(/\s+/g, ' ').trim();
+        const rep = flat.length > 100 ? flat.slice(0, 100) + '…' : flat;
+        lines.push(`- "${rep}" — ${c.jobs.length} jobs (${c.totalErrors} total errors): ${jobNames}${more}`);
+    }
+    lines.push('When a cluster of 3+ jobs hits the same pattern, prefer ONE root-cause proposal ' +
+        '(e.g. an advisor-rule, a prompt-override at agent or global scope, or a shared config change) over N per-job patches.');
+    return lines.join('\n') + '\n\n';
+}
+//# sourceMappingURL=failure-clustering.js.map

package/package.json CHANGED Viewed

@@ -1,6 +1,6 @@
 {
   "name": "clementine-agent",
-  "version": "1.18.162",
+  "version": "1.18.163",
   "description": "Clementine — Personal AI Assistant (TypeScript)",
   "type": "module",
   "main": "dist/index.js",