npm - tuna-agent - Versions diffs - 0.1.164 → 0.1.165 - Mend

tuna-agent 0.1.164 → 0.1.165

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (3) hide show

package/dist/agents/claude-code-adapter.d.ts +20 -0
package/dist/agents/claude-code-adapter.js +93 -1
package/package.json +1 -1

package/dist/agents/claude-code-adapter.d.ts CHANGED Viewed

@@ -22,6 +22,8 @@ export interface AgentMetrics {
     lastReflectionAt: string | null;
     lastPatternAt: string | null;
     latestLearnedRule: string | null;
+    lastScore: number | null;
+    scoreTrend: string | null;
     upSince: string;
 }
 export declare class ClaudeCodeAdapter implements AgentAdapter {
@@ -76,6 +78,24 @@ export declare class ClaudeCodeAdapter implements AgentAdapter {
     private static isSimilarRule;
     /** Cosine similarity between two equal-length embedding vectors. */
     private static cosine;
+    /** Path to the per-agent self-improvement score log (one JSON line per scored task). */
+    private static scoresFile;
+    /**
+     * Derive a 0-10 quality score for a completed task — the signal the self-improvement
+     * loop optimizes against (SIA-style: a measurable score gates whether changes help).
+     * Prefers an explicit rubric score the agent emits ("[SCORE: N]" / "QUALITY_SCORE: N");
+     * otherwise falls back to a deterministic proxy from status + result keywords.
+     */
+    static deriveScore(status: 'done' | 'failed', resultSummary: string): number;
+    /** Append one score record to the agent's score log; update in-memory metrics. */
+    recordRunScore(cwd: string, agentId: string, score: number, note: string): void;
+    /** Read the last N scores from the agent's score log (oldest→newest). */
+    static readRecentScores(cwd: string, n: number): number[];
+    /**
+     * Classify the recent score trend. Compares the latest score to the mean of the
+     * prior window. Needs >=3 points; otherwise 'insufficient'.
+     */
+    static computeScoreTrend(scores: number[]): 'up' | 'flat' | 'down' | 'insufficient';
     runSelfImprovement(cwd: string, agentId: string): Promise<void>;
     /**
      * Parse "## Learned Rules" section from CLAUDE.md and store in learnedRulesMap.

package/dist/agents/claude-code-adapter.js CHANGED Viewed

@@ -36,6 +36,8 @@ export class ClaudeCodeAdapter {
                 lastReflectionAt: null,
                 lastPatternAt: null,
                 latestLearnedRule: null,
+                lastScore: null,
+                scoreTrend: null,
                 upSince: new Date().toISOString(),
             });
         }
@@ -844,6 +846,10 @@ export class ClaudeCodeAdapter {
             return;
         const agentName = path.basename(cwd);
         const agentId = task.agentId || '';
+        // Record a quality score for this task — the signal the self-improvement loop
+        // optimizes against (closes the loop: score in -> trend gates rule changes out).
+        const runScore = ClaudeCodeAdapter.deriveScore(status, resultSummary);
+        this.recordRunScore(cwd, agentId, runScore, `${status}: ${task.description.substring(0, 80)}`);
         try {
             // Step 1: Generate AI-powered reflection via Ollama
             console.log(`[Reflection] Generating AI reflection for task ${task.id} (${status}), input: ${resultSummary.substring(0, 150)}...`);
@@ -934,6 +940,85 @@ export class ClaudeCodeAdapter {
         }
         return (na && nb) ? dot / (Math.sqrt(na) * Math.sqrt(nb)) : 0;
     }
+    /** Path to the per-agent self-improvement score log (one JSON line per scored task). */
+    static scoresFile(cwd) {
+        return path.join(cwd, 'self-improve-scores.jsonl');
+    }
+    /**
+     * Derive a 0-10 quality score for a completed task — the signal the self-improvement
+     * loop optimizes against (SIA-style: a measurable score gates whether changes help).
+     * Prefers an explicit rubric score the agent emits ("[SCORE: N]" / "QUALITY_SCORE: N");
+     * otherwise falls back to a deterministic proxy from status + result keywords.
+     */
+    static deriveScore(status, resultSummary) {
+        const text = resultSummary || '';
+        const m = text.match(/(?:\[score:?\s*|quality_score:?\s*|score:\s*)(\d+(?:\.\d+)?)\s*(?:\/\s*10)?/i);
+        if (m) {
+            const s = parseFloat(m[1]);
+            if (!isNaN(s))
+                return Math.max(0, Math.min(10, s));
+        }
+        if (status === 'failed')
+            return 2;
+        let score = 6; // baseline for a completed task
+        const low = text.toLowerCase();
+        if (/\b(verified|confirmed|passed|deployed|completed successfully|all tests pass)\b/.test(low))
+            score += 2;
+        if (/\b(improved|optimi[sz]ed|fixed|resolved)\b/.test(low))
+            score += 1;
+        if (/\b(partial|incomplete|could not|unable|skipped|blocked|warning|fallback|degraded)\b/.test(low))
+            score -= 2;
+        if (/\b(error|failed|exception|timeout)\b/.test(low))
+            score -= 1;
+        return Math.max(0, Math.min(10, score));
+    }
+    /** Append one score record to the agent's score log; update in-memory metrics. */
+    recordRunScore(cwd, agentId, score, note) {
+        try {
+            const m = this.getMetricsForAgent(agentId);
+            const entry = { date: new Date().toISOString(), score, rulesCount: m.rulesCount, note: note.substring(0, 200) };
+            fs.appendFileSync(ClaudeCodeAdapter.scoresFile(cwd), JSON.stringify(entry) + '\n');
+            m.lastScore = score;
+        }
+        catch (err) {
+            console.warn(`[Self-Improve] recordRunScore failed:`, err instanceof Error ? err.message : err);
+        }
+    }
+    /** Read the last N scores from the agent's score log (oldest→newest). */
+    static readRecentScores(cwd, n) {
+        try {
+            const f = ClaudeCodeAdapter.scoresFile(cwd);
+            if (!fs.existsSync(f))
+                return [];
+            const lines = fs.readFileSync(f, 'utf-8').trim().split('\n').filter(Boolean);
+            return lines.slice(-n).map(l => { try {
+                return JSON.parse(l).score;
+            }
+            catch {
+                return NaN;
+            } }).filter(s => !isNaN(s));
+        }
+        catch {
+            return [];
+        }
+    }
+    /**
+     * Classify the recent score trend. Compares the latest score to the mean of the
+     * prior window. Needs >=3 points; otherwise 'insufficient'.
+     */
+    static computeScoreTrend(scores) {
+        if (scores.length < 3)
+            return 'insufficient';
+        const recent = scores[scores.length - 1];
+        const prior = scores.slice(0, -1);
+        const priorAvg = prior.reduce((a, b) => a + b, 0) / prior.length;
+        const delta = recent - priorAvg;
+        if (delta > 0.5)
+            return 'up';
+        if (delta < -0.5)
+            return 'down';
+        return 'flat';
+    }
     async runSelfImprovement(cwd, agentId) {
         if (!process.env.MEM0_SSH_HOST)
             return;
@@ -975,7 +1060,14 @@ export class ClaudeCodeAdapter {
             }
             // Filter 1: Quality gate — reject garbage rules
             const MAX_LEARNED_RULES = 50;
-            const MIN_CONFIDENCE = 2;
+            // Score-gate (SIA-style): if recent self-improvements have NOT improved the agent's
+            // score (flat/down trend), raise the confidence bar so we stop piling on rules that
+            // aren't helping. Only loosen the bar when the trend is actually improving.
+            const recentScores = ClaudeCodeAdapter.readRecentScores(cwd, 5);
+            const trend = ClaudeCodeAdapter.computeScoreTrend(recentScores);
+            this.getMetricsForAgent(agentId).scoreTrend = trend;
+            const MIN_CONFIDENCE = (trend === 'down' || trend === 'flat') ? 4 : 2;
+            console.log(`[Self-Improve] Score trend '${trend}' (recent=[${recentScores.join(',')}]) → confidence bar ${MIN_CONFIDENCE}${(trend === 'down' || trend === 'flat') ? ' (raised — recent rule additions not improving outcomes)' : ''}`);
             const qualityPatterns = patterns.filter(p => {
                 const r = p.rule.trim();
                 // Confidence gate — only persist rules seen 2+ times

package/package.json CHANGED Viewed

@@ -1,6 +1,6 @@
 {
   "name": "tuna-agent",
-  "version": "0.1.164",
+  "version": "0.1.165",
   "description": "Tuna Agent - Run AI coding tasks on your machine",
   "bin": {
     "tuna-agent": "dist/cli/index.js"