tuna-agent 0.1.164 → 0.1.165

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -22,6 +22,8 @@ export interface AgentMetrics {
22
22
  lastReflectionAt: string | null;
23
23
  lastPatternAt: string | null;
24
24
  latestLearnedRule: string | null;
25
+ lastScore: number | null;
26
+ scoreTrend: string | null;
25
27
  upSince: string;
26
28
  }
27
29
  export declare class ClaudeCodeAdapter implements AgentAdapter {
@@ -76,6 +78,24 @@ export declare class ClaudeCodeAdapter implements AgentAdapter {
76
78
  private static isSimilarRule;
77
79
  /** Cosine similarity between two equal-length embedding vectors. */
78
80
  private static cosine;
81
+ /** Path to the per-agent self-improvement score log (one JSON line per scored task). */
82
+ private static scoresFile;
83
+ /**
84
+ * Derive a 0-10 quality score for a completed task — the signal the self-improvement
85
+ * loop optimizes against (SIA-style: a measurable score gates whether changes help).
86
+ * Prefers an explicit rubric score the agent emits ("[SCORE: N]" / "QUALITY_SCORE: N");
87
+ * otherwise falls back to a deterministic proxy from status + result keywords.
88
+ */
89
+ static deriveScore(status: 'done' | 'failed', resultSummary: string): number;
90
+ /** Append one score record to the agent's score log; update in-memory metrics. */
91
+ recordRunScore(cwd: string, agentId: string, score: number, note: string): void;
92
+ /** Read the last N scores from the agent's score log (oldest→newest). */
93
+ static readRecentScores(cwd: string, n: number): number[];
94
+ /**
95
+ * Classify the recent score trend. Compares the latest score to the mean of the
96
+ * prior window. Needs >=3 points; otherwise 'insufficient'.
97
+ */
98
+ static computeScoreTrend(scores: number[]): 'up' | 'flat' | 'down' | 'insufficient';
79
99
  runSelfImprovement(cwd: string, agentId: string): Promise<void>;
80
100
  /**
81
101
  * Parse "## Learned Rules" section from CLAUDE.md and store in learnedRulesMap.
@@ -36,6 +36,8 @@ export class ClaudeCodeAdapter {
36
36
  lastReflectionAt: null,
37
37
  lastPatternAt: null,
38
38
  latestLearnedRule: null,
39
+ lastScore: null,
40
+ scoreTrend: null,
39
41
  upSince: new Date().toISOString(),
40
42
  });
41
43
  }
@@ -844,6 +846,10 @@ export class ClaudeCodeAdapter {
844
846
  return;
845
847
  const agentName = path.basename(cwd);
846
848
  const agentId = task.agentId || '';
849
+ // Record a quality score for this task — the signal the self-improvement loop
850
+ // optimizes against (closes the loop: score in -> trend gates rule changes out).
851
+ const runScore = ClaudeCodeAdapter.deriveScore(status, resultSummary);
852
+ this.recordRunScore(cwd, agentId, runScore, `${status}: ${task.description.substring(0, 80)}`);
847
853
  try {
848
854
  // Step 1: Generate AI-powered reflection via Ollama
849
855
  console.log(`[Reflection] Generating AI reflection for task ${task.id} (${status}), input: ${resultSummary.substring(0, 150)}...`);
@@ -934,6 +940,85 @@ export class ClaudeCodeAdapter {
934
940
  }
935
941
  return (na && nb) ? dot / (Math.sqrt(na) * Math.sqrt(nb)) : 0;
936
942
  }
943
+ /** Path to the per-agent self-improvement score log (one JSON line per scored task). */
944
+ static scoresFile(cwd) {
945
+ return path.join(cwd, 'self-improve-scores.jsonl');
946
+ }
947
+ /**
948
+ * Derive a 0-10 quality score for a completed task — the signal the self-improvement
949
+ * loop optimizes against (SIA-style: a measurable score gates whether changes help).
950
+ * Prefers an explicit rubric score the agent emits ("[SCORE: N]" / "QUALITY_SCORE: N");
951
+ * otherwise falls back to a deterministic proxy from status + result keywords.
952
+ */
953
+ static deriveScore(status, resultSummary) {
954
+ const text = resultSummary || '';
955
+ const m = text.match(/(?:\[score:?\s*|quality_score:?\s*|score:\s*)(\d+(?:\.\d+)?)\s*(?:\/\s*10)?/i);
956
+ if (m) {
957
+ const s = parseFloat(m[1]);
958
+ if (!isNaN(s))
959
+ return Math.max(0, Math.min(10, s));
960
+ }
961
+ if (status === 'failed')
962
+ return 2;
963
+ let score = 6; // baseline for a completed task
964
+ const low = text.toLowerCase();
965
+ if (/\b(verified|confirmed|passed|deployed|completed successfully|all tests pass)\b/.test(low))
966
+ score += 2;
967
+ if (/\b(improved|optimi[sz]ed|fixed|resolved)\b/.test(low))
968
+ score += 1;
969
+ if (/\b(partial|incomplete|could not|unable|skipped|blocked|warning|fallback|degraded)\b/.test(low))
970
+ score -= 2;
971
+ if (/\b(error|failed|exception|timeout)\b/.test(low))
972
+ score -= 1;
973
+ return Math.max(0, Math.min(10, score));
974
+ }
975
+ /** Append one score record to the agent's score log; update in-memory metrics. */
976
+ recordRunScore(cwd, agentId, score, note) {
977
+ try {
978
+ const m = this.getMetricsForAgent(agentId);
979
+ const entry = { date: new Date().toISOString(), score, rulesCount: m.rulesCount, note: note.substring(0, 200) };
980
+ fs.appendFileSync(ClaudeCodeAdapter.scoresFile(cwd), JSON.stringify(entry) + '\n');
981
+ m.lastScore = score;
982
+ }
983
+ catch (err) {
984
+ console.warn(`[Self-Improve] recordRunScore failed:`, err instanceof Error ? err.message : err);
985
+ }
986
+ }
987
+ /** Read the last N scores from the agent's score log (oldest→newest). */
988
+ static readRecentScores(cwd, n) {
989
+ try {
990
+ const f = ClaudeCodeAdapter.scoresFile(cwd);
991
+ if (!fs.existsSync(f))
992
+ return [];
993
+ const lines = fs.readFileSync(f, 'utf-8').trim().split('\n').filter(Boolean);
994
+ return lines.slice(-n).map(l => { try {
995
+ return JSON.parse(l).score;
996
+ }
997
+ catch {
998
+ return NaN;
999
+ } }).filter(s => !isNaN(s));
1000
+ }
1001
+ catch {
1002
+ return [];
1003
+ }
1004
+ }
1005
+ /**
1006
+ * Classify the recent score trend. Compares the latest score to the mean of the
1007
+ * prior window. Needs >=3 points; otherwise 'insufficient'.
1008
+ */
1009
+ static computeScoreTrend(scores) {
1010
+ if (scores.length < 3)
1011
+ return 'insufficient';
1012
+ const recent = scores[scores.length - 1];
1013
+ const prior = scores.slice(0, -1);
1014
+ const priorAvg = prior.reduce((a, b) => a + b, 0) / prior.length;
1015
+ const delta = recent - priorAvg;
1016
+ if (delta > 0.5)
1017
+ return 'up';
1018
+ if (delta < -0.5)
1019
+ return 'down';
1020
+ return 'flat';
1021
+ }
937
1022
  async runSelfImprovement(cwd, agentId) {
938
1023
  if (!process.env.MEM0_SSH_HOST)
939
1024
  return;
@@ -975,7 +1060,14 @@ export class ClaudeCodeAdapter {
975
1060
  }
976
1061
  // Filter 1: Quality gate — reject garbage rules
977
1062
  const MAX_LEARNED_RULES = 50;
978
- const MIN_CONFIDENCE = 2;
1063
+ // Score-gate (SIA-style): if recent self-improvements have NOT improved the agent's
1064
+ // score (flat/down trend), raise the confidence bar so we stop piling on rules that
1065
+ // aren't helping. The default bar (2) applies when the trend is 'up' or there is too little data ('insufficient').
1066
+ const recentScores = ClaudeCodeAdapter.readRecentScores(cwd, 5);
1067
+ const trend = ClaudeCodeAdapter.computeScoreTrend(recentScores);
1068
+ this.getMetricsForAgent(agentId).scoreTrend = trend;
1069
+ const MIN_CONFIDENCE = (trend === 'down' || trend === 'flat') ? 4 : 2;
1070
+ console.log(`[Self-Improve] Score trend '${trend}' (recent=[${recentScores.join(',')}]) → confidence bar ${MIN_CONFIDENCE}${(trend === 'down' || trend === 'flat') ? ' (raised — recent rule additions not improving outcomes)' : ''}`);
979
1071
  const qualityPatterns = patterns.filter(p => {
980
1072
  const r = p.rule.trim();
981
1073
  // Confidence gate — only persist rules seen MIN_CONFIDENCE+ times (2, or 4 when the score trend is flat/down)
package/package.json CHANGED
@@ -1,6 +1,6 @@
1
1
  {
2
2
  "name": "tuna-agent",
3
- "version": "0.1.164",
3
+ "version": "0.1.165",
4
4
  "description": "Tuna Agent - Run AI coding tasks on your machine",
5
5
  "bin": {
6
6
  "tuna-agent": "dist/cli/index.js"