tuna-agent 0.1.164 → 0.1.166
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
|
@@ -22,6 +22,8 @@ export interface AgentMetrics {
|
|
|
22
22
|
lastReflectionAt: string | null;
|
|
23
23
|
lastPatternAt: string | null;
|
|
24
24
|
latestLearnedRule: string | null;
|
|
25
|
+
lastScore: number | null;
|
|
26
|
+
scoreTrend: string | null;
|
|
25
27
|
upSince: string;
|
|
26
28
|
}
|
|
27
29
|
export declare class ClaudeCodeAdapter implements AgentAdapter {
|
|
@@ -76,6 +78,24 @@ export declare class ClaudeCodeAdapter implements AgentAdapter {
|
|
|
76
78
|
private static isSimilarRule;
|
|
77
79
|
/** Cosine similarity between two equal-length embedding vectors. */
|
|
78
80
|
private static cosine;
|
|
81
|
+
/** Path to the per-agent self-improvement score log (one JSON line per scored task). */
|
|
82
|
+
private static scoresFile;
|
|
83
|
+
/**
|
|
84
|
+
* Derive a 0-10 quality score for a completed task — the signal the self-improvement
|
|
85
|
+
* loop optimizes against (SIA-style: a measurable score gates whether changes help).
|
|
86
|
+
* Prefers an explicit rubric score the agent emits ("[SCORE: N]" / "QUALITY_SCORE: N");
|
|
87
|
+
* otherwise falls back to a deterministic proxy from status + result keywords.
|
|
88
|
+
*/
|
|
89
|
+
static deriveScore(status: 'done' | 'failed', resultSummary: string): number;
|
|
90
|
+
/** Append one score record to the agent's score log; update in-memory metrics. */
|
|
91
|
+
recordRunScore(cwd: string, agentId: string, score: number, note: string): void;
|
|
92
|
+
/** Read the last N scores from the agent's score log (oldest→newest). */
|
|
93
|
+
static readRecentScores(cwd: string, n: number): number[];
|
|
94
|
+
/**
|
|
95
|
+
* Classify the recent score trend. Compares the latest score to the mean of the
|
|
96
|
+
* prior window. Needs >=3 points; otherwise 'insufficient'.
|
|
97
|
+
*/
|
|
98
|
+
static computeScoreTrend(scores: number[]): 'up' | 'flat' | 'down' | 'insufficient';
|
|
79
99
|
runSelfImprovement(cwd: string, agentId: string): Promise<void>;
|
|
80
100
|
/**
|
|
81
101
|
* Parse "## Learned Rules" section from CLAUDE.md and store in learnedRulesMap.
|
|
@@ -36,6 +36,8 @@ export class ClaudeCodeAdapter {
|
|
|
36
36
|
lastReflectionAt: null,
|
|
37
37
|
lastPatternAt: null,
|
|
38
38
|
latestLearnedRule: null,
|
|
39
|
+
lastScore: null,
|
|
40
|
+
scoreTrend: null,
|
|
39
41
|
upSince: new Date().toISOString(),
|
|
40
42
|
});
|
|
41
43
|
}
|
|
@@ -844,6 +846,10 @@ export class ClaudeCodeAdapter {
|
|
|
844
846
|
return;
|
|
845
847
|
const agentName = path.basename(cwd);
|
|
846
848
|
const agentId = task.agentId || '';
|
|
849
|
+
// Record a quality score for this task — the signal the self-improvement loop
|
|
850
|
+
// optimizes against (closes the loop: score in -> trend gates rule changes out).
|
|
851
|
+
const runScore = ClaudeCodeAdapter.deriveScore(status, resultSummary);
|
|
852
|
+
this.recordRunScore(cwd, agentId, runScore, `${status}: ${task.description.substring(0, 80)}`);
|
|
847
853
|
try {
|
|
848
854
|
// Step 1: Generate AI-powered reflection via Ollama
|
|
849
855
|
console.log(`[Reflection] Generating AI reflection for task ${task.id} (${status}), input: ${resultSummary.substring(0, 150)}...`);
|
|
@@ -934,6 +940,85 @@ export class ClaudeCodeAdapter {
|
|
|
934
940
|
}
|
|
935
941
|
return (na && nb) ? dot / (Math.sqrt(na) * Math.sqrt(nb)) : 0;
|
|
936
942
|
}
|
|
943
|
+
/** Path to the per-agent self-improvement score log (one JSON line per scored task). */
|
|
944
|
+
static scoresFile(cwd) {
|
|
945
|
+
return path.join(cwd, 'self-improve-scores.jsonl');
|
|
946
|
+
}
|
|
947
|
+
/**
|
|
948
|
+
* Derive a 0-10 quality score for a completed task — the signal the self-improvement
|
|
949
|
+
* loop optimizes against (SIA-style: a measurable score gates whether changes help).
|
|
950
|
+
* Prefers an explicit rubric score the agent emits ("[SCORE: N]" / "QUALITY_SCORE: N");
|
|
951
|
+
* otherwise falls back to a deterministic proxy from status + result keywords.
|
|
952
|
+
*/
|
|
953
|
+
static deriveScore(status, resultSummary) {
|
|
954
|
+
const text = resultSummary || '';
|
|
955
|
+
const m = text.match(/(?:\[score:?\s*|quality_score:?\s*|score:\s*)(\d+(?:\.\d+)?)\s*(?:\/\s*10)?/i);
|
|
956
|
+
if (m) {
|
|
957
|
+
const s = parseFloat(m[1]);
|
|
958
|
+
if (!isNaN(s))
|
|
959
|
+
return Math.max(0, Math.min(10, s));
|
|
960
|
+
}
|
|
961
|
+
if (status === 'failed')
|
|
962
|
+
return 2;
|
|
963
|
+
let score = 6; // baseline for a completed task
|
|
964
|
+
const low = text.toLowerCase();
|
|
965
|
+
if (/\b(verified|confirmed|passed|deployed|completed successfully|all tests pass)\b/.test(low))
|
|
966
|
+
score += 2;
|
|
967
|
+
if (/\b(improved|optimi[sz]ed|fixed|resolved)\b/.test(low))
|
|
968
|
+
score += 1;
|
|
969
|
+
if (/\b(partial|incomplete|could not|unable|skipped|blocked|warning|fallback|degraded)\b/.test(low))
|
|
970
|
+
score -= 2;
|
|
971
|
+
if (/\b(error|failed|exception|timeout)\b/.test(low))
|
|
972
|
+
score -= 1;
|
|
973
|
+
return Math.max(0, Math.min(10, score));
|
|
974
|
+
}
|
|
975
|
+
/** Append one score record to the agent's score log; update in-memory metrics. */
|
|
976
|
+
recordRunScore(cwd, agentId, score, note) {
|
|
977
|
+
try {
|
|
978
|
+
const m = this.getMetricsForAgent(agentId);
|
|
979
|
+
const entry = { date: new Date().toISOString(), score, rulesCount: m.rulesCount, note: note.substring(0, 200) };
|
|
980
|
+
fs.appendFileSync(ClaudeCodeAdapter.scoresFile(cwd), JSON.stringify(entry) + '\n');
|
|
981
|
+
m.lastScore = score;
|
|
982
|
+
}
|
|
983
|
+
catch (err) {
|
|
984
|
+
console.warn(`[Self-Improve] recordRunScore failed:`, err instanceof Error ? err.message : err);
|
|
985
|
+
}
|
|
986
|
+
}
|
|
987
|
+
/** Read the last N scores from the agent's score log (oldest→newest). */
|
|
988
|
+
static readRecentScores(cwd, n) {
|
|
989
|
+
try {
|
|
990
|
+
const f = ClaudeCodeAdapter.scoresFile(cwd);
|
|
991
|
+
if (!fs.existsSync(f))
|
|
992
|
+
return [];
|
|
993
|
+
const lines = fs.readFileSync(f, 'utf-8').trim().split('\n').filter(Boolean);
|
|
994
|
+
return lines.slice(-n).map(l => { try {
|
|
995
|
+
return JSON.parse(l).score;
|
|
996
|
+
}
|
|
997
|
+
catch {
|
|
998
|
+
return NaN;
|
|
999
|
+
} }).filter(s => !isNaN(s));
|
|
1000
|
+
}
|
|
1001
|
+
catch {
|
|
1002
|
+
return [];
|
|
1003
|
+
}
|
|
1004
|
+
}
|
|
1005
|
+
/**
|
|
1006
|
+
* Classify the recent score trend. Compares the latest score to the mean of the
|
|
1007
|
+
* prior window. Needs >=3 points; otherwise 'insufficient'.
|
|
1008
|
+
*/
|
|
1009
|
+
static computeScoreTrend(scores) {
|
|
1010
|
+
if (scores.length < 3)
|
|
1011
|
+
return 'insufficient';
|
|
1012
|
+
const recent = scores[scores.length - 1];
|
|
1013
|
+
const prior = scores.slice(0, -1);
|
|
1014
|
+
const priorAvg = prior.reduce((a, b) => a + b, 0) / prior.length;
|
|
1015
|
+
const delta = recent - priorAvg;
|
|
1016
|
+
if (delta > 0.5)
|
|
1017
|
+
return 'up';
|
|
1018
|
+
if (delta < -0.5)
|
|
1019
|
+
return 'down';
|
|
1020
|
+
return 'flat';
|
|
1021
|
+
}
|
|
937
1022
|
async runSelfImprovement(cwd, agentId) {
|
|
938
1023
|
if (!process.env.MEM0_SSH_HOST)
|
|
939
1024
|
return;
|
|
@@ -975,7 +1060,14 @@ export class ClaudeCodeAdapter {
|
|
|
975
1060
|
}
|
|
976
1061
|
// Filter 1: Quality gate — reject garbage rules
|
|
977
1062
|
const MAX_LEARNED_RULES = 50;
|
|
978
|
-
|
|
1063
|
+
// Score-gate (SIA-style): if recent self-improvements have NOT improved the agent's
|
|
1064
|
+
// score (flat/down trend), raise the confidence bar so we stop piling on rules that
|
|
1065
|
+
// aren't helping. Only loosen the bar when the trend is actually improving.
|
|
1066
|
+
const recentScores = ClaudeCodeAdapter.readRecentScores(cwd, 5);
|
|
1067
|
+
const trend = ClaudeCodeAdapter.computeScoreTrend(recentScores);
|
|
1068
|
+
this.getMetricsForAgent(agentId).scoreTrend = trend;
|
|
1069
|
+
const MIN_CONFIDENCE = (trend === 'down' || trend === 'flat') ? 4 : 2;
|
|
1070
|
+
console.log(`[Self-Improve] Score trend '${trend}' (recent=[${recentScores.join(',')}]) → confidence bar ${MIN_CONFIDENCE}${(trend === 'down' || trend === 'flat') ? ' (raised — recent rule additions not improving outcomes)' : ''}`);
|
|
979
1071
|
const qualityPatterns = patterns.filter(p => {
|
|
980
1072
|
const r = p.rule.trim();
|
|
981
1073
|
// Confidence gate — only persist rules seen at least MIN_CONFIDENCE times (2, or 4 when the score trend is flat/down)
|
package/dist/daemon/index.js
CHANGED
|
@@ -137,6 +137,7 @@ export async function startDaemon(config) {
|
|
|
137
137
|
const pendingPermissionResolvers = new Map();
|
|
138
138
|
// Track active tasks per agent (agentId → taskId)
|
|
139
139
|
const activeAgentTasks = new Map();
|
|
140
|
+
const agentQueues = new Map();
|
|
140
141
|
// Track abort controllers per task
|
|
141
142
|
const taskAbortControllers = new Map();
|
|
142
143
|
// Note: currentTaskId/currentTaskAbort removed — use taskAbortControllers + activeAgentTasks instead
|
|
@@ -218,41 +219,13 @@ export async function startDaemon(config) {
|
|
|
218
219
|
if (task.repoPath?.startsWith('~/')) {
|
|
219
220
|
task.repoPath = path.join(os.homedir(), task.repoPath.slice(2));
|
|
220
221
|
}
|
|
221
|
-
//
|
|
222
|
+
// Per-agent concurrency: if busy, queue instead of rejecting.
|
|
222
223
|
const agentId = task.agentId || '__default__';
|
|
223
224
|
if (activeAgentTasks.has(agentId)) {
|
|
224
|
-
|
|
225
|
-
ws.send({ action: 'task_rejected', taskId: task.id, reason: 'agent_busy' });
|
|
225
|
+
enqueueForAgent(agentId, { kind: 'task', task });
|
|
226
226
|
break;
|
|
227
227
|
}
|
|
228
|
-
|
|
229
|
-
activeAgentTasks.set(agentId, task.id);
|
|
230
|
-
const abort = new AbortController();
|
|
231
|
-
taskAbortControllers.set(task.id, abort);
|
|
232
|
-
console.log(`[Daemon] Received task: ${task.id} agent=${agentId} — ${task.description.slice(0, 80)} (attachments: ${task.attachments?.length ?? 0}) [active: ${activeTasks}]`);
|
|
233
|
-
// Run task in background (non-blocking) to allow parallel agent execution
|
|
234
|
-
(async () => {
|
|
235
|
-
try {
|
|
236
|
-
await adapter.handleTask(task, ws, pendingInputResolvers, abort.signal, pendingPermissionResolvers);
|
|
237
|
-
}
|
|
238
|
-
catch (err) {
|
|
239
|
-
const errMsg = err instanceof Error ? err.message : String(err);
|
|
240
|
-
if (abort.signal.aborted) {
|
|
241
|
-
console.log(`[Daemon] Task ${task.id} cancelled`);
|
|
242
|
-
}
|
|
243
|
-
else {
|
|
244
|
-
ws.sendTaskFailed(task.id, errMsg);
|
|
245
|
-
console.error(`[Daemon] Task ${task.id} error:`, errMsg);
|
|
246
|
-
}
|
|
247
|
-
}
|
|
248
|
-
finally {
|
|
249
|
-
activeTasks--;
|
|
250
|
-
activeAgentTasks.delete(agentId);
|
|
251
|
-
taskAbortControllers.delete(task.id);
|
|
252
|
-
pendingInputResolvers.delete(task.id);
|
|
253
|
-
ws.send({ action: 'agent_ready', agentId });
|
|
254
|
-
}
|
|
255
|
-
})();
|
|
228
|
+
runTaskNow(task);
|
|
256
229
|
break;
|
|
257
230
|
}
|
|
258
231
|
case 'task_cancelled': {
|
|
@@ -479,19 +452,15 @@ ${skillContent.slice(0, 15000)}`;
|
|
|
479
452
|
// No resolver — check if we have a persisted session to resume
|
|
480
453
|
const savedState = loadPMState(taskId);
|
|
481
454
|
if (savedState) {
|
|
482
|
-
//
|
|
455
|
+
// Per-agent concurrency: if busy, queue the reply instead of dropping it.
|
|
483
456
|
const resumeAgentId = savedState.agentId || '__default__';
|
|
484
457
|
if (activeAgentTasks.has(resumeAgentId)) {
|
|
485
458
|
const busyTask = activeAgentTasks.get(resumeAgentId);
|
|
486
|
-
console.
|
|
487
|
-
|
|
459
|
+
console.log(`[Daemon] Agent ${resumeAgentId} busy with ${busyTask} — queuing reply for task ${taskId}`);
|
|
460
|
+
enqueueForAgent(resumeAgentId, { kind: 'input', taskId, answer, attachments, savedState });
|
|
488
461
|
ws.sendPMMessage(taskId, {
|
|
489
462
|
sender: 'pm',
|
|
490
|
-
content: '
|
|
491
|
-
});
|
|
492
|
-
ws.sendTaskDone(taskId, {
|
|
493
|
-
result: 'Agent busy — message not processed',
|
|
494
|
-
durationMs: 0,
|
|
463
|
+
content: '⏳ Đang xử lý việc khác, sẽ trả lời ngay khi xong.',
|
|
495
464
|
});
|
|
496
465
|
break;
|
|
497
466
|
}
|
|
@@ -697,6 +666,67 @@ ${skillContent.slice(0, 15000)}`;
|
|
|
697
666
|
console.log(`[Daemon] Unknown message type: ${type}`);
|
|
698
667
|
}
|
|
699
668
|
}, onAuthFailed);
|
|
669
|
+
/**
 * Park a work item for an agent that is currently busy.
 *
 * Items are appended to the agent's FIFO list in `agentQueues` (created on first
 * use) and are later picked up by `processNextForAgent`.
 *
 * @param agentId Agent the item belongs to.
 * @param item Queued work; its `kind` field is included in the log line.
 */
function enqueueForAgent(agentId, item) {
    if (!agentQueues.has(agentId)) {
        agentQueues.set(agentId, []);
    }
    const pending = agentQueues.get(agentId);
    const depth = pending.push(item); // push() returns the new length
    console.log(`[Daemon] Agent ${agentId} busy — queued ${item.kind} (queue: ${depth})`);
}
|
|
679
|
+
/**
 * Start the next queued item for an agent, if there is one and the agent is free.
 *
 * - 'task' items are handed to `runTaskNow`.
 * - Any other item is treated as a queued user reply and resumed through
 *   `resumeAgentTeamChat` (saved mode 'agent_team') or `resumePMChat`.
 *
 * The resume calls are fire-and-forget, so a synchronous throw or a rejected
 * promise is caught here, logged, and the next queued item is attempted; otherwise
 * one bad item would surface as an unhandled rejection and stall the queue.
 * A fully drained queue is removed from `agentQueues` so idle agents do not
 * accumulate empty entries.
 *
 * @param agentId Agent whose queue should be advanced.
 */
function processNextForAgent(agentId) {
    if (activeAgentTasks.has(agentId))
        return; // still busy
    const q = agentQueues.get(agentId);
    if (!q || q.length === 0) {
        agentQueues.delete(agentId);
        return;
    }
    const item = q.shift();
    if (q.length === 0)
        agentQueues.delete(agentId);
    console.log(`[Daemon] Dequeue ${item.kind} for agent ${agentId} (remaining: ${q.length})`);
    if (item.kind === 'task') {
        runTaskNow(item.task);
        return;
    }
    const onResumeError = (err) => {
        console.error(`[Daemon] Queued reply for task ${item.taskId} failed:`, err instanceof Error ? err.message : String(err));
        // No-op if the agent is still marked busy or nothing is left to run.
        processNextForAgent(agentId);
    };
    try {
        // NOTE(review): the resume helpers are assumed to do their own busy/ready
        // bookkeeping for the agent — their bodies are not visible from here.
        const resume = item.savedState.mode === 'agent_team' ? resumeAgentTeamChat : resumePMChat;
        Promise.resolve(resume(item.taskId, item.answer, item.attachments, item.savedState, ws, pendingInputResolvers)).catch(onResumeError);
    }
    catch (err) {
        onResumeError(err);
    }
}
|
|
698
|
+
/**
 * Run a task immediately (the agent is assumed to be free) and drain the agent's
 * queue when it finishes.
 *
 * Marks the agent busy, registers an AbortController for the task, then runs
 * `adapter.handleTask` in the background. When the run settles — success, failure
 * or cancellation — the bookkeeping is cleared, `agent_ready` is announced and
 * `processNextForAgent` is invoked.
 *
 * Hardening compared with a plain fire-and-forget:
 *  - a missing `task.description` no longer throws after the agent has already
 *    been marked busy (which would leave it busy forever);
 *  - the queue is drained even if sending `agent_ready` throws;
 *  - errors escaping the cleanup path are logged instead of becoming unhandled
 *    promise rejections.
 *
 * @param task Task to execute; `task.agentId` defaults to '__default__'.
 */
function runTaskNow(task) {
    const agentId = task.agentId || '__default__';
    activeTasks++;
    activeAgentTasks.set(agentId, task.id);
    const abort = new AbortController();
    taskAbortControllers.set(task.id, abort);
    const preview = typeof task.description === 'string' ? task.description.slice(0, 80) : '';
    console.log(`[Daemon] Received task: ${task.id} agent=${agentId} — ${preview} (attachments: ${task.attachments?.length ?? 0}) [active: ${activeTasks}]`);
    (async () => {
        try {
            await adapter.handleTask(task, ws, pendingInputResolvers, abort.signal, pendingPermissionResolvers);
        }
        catch (err) {
            const errMsg = err instanceof Error ? err.message : String(err);
            if (abort.signal.aborted) {
                console.log(`[Daemon] Task ${task.id} cancelled`);
            }
            else {
                // Log before notifying so the error is recorded even if the send fails.
                console.error(`[Daemon] Task ${task.id} error:`, errMsg);
                ws.sendTaskFailed(task.id, errMsg);
            }
        }
        finally {
            activeTasks--;
            activeAgentTasks.delete(agentId);
            taskAbortControllers.delete(task.id);
            pendingInputResolvers.delete(task.id);
            try {
                ws.send({ action: 'agent_ready', agentId });
            }
            finally {
                // Always advance the queue, even when the ready notification failed.
                processNextForAgent(agentId);
            }
        }
    })().catch((err) => {
        console.error(`[Daemon] Task ${task.id} cleanup error:`, err instanceof Error ? err.message : String(err));
    });
}
|
|
700
730
|
/**
|
|
701
731
|
* Resume PM chat for a task after agent restart or done task reopen.
|
|
702
732
|
* Loads persisted pmSessionId and runs a chat loop.
|
|
@@ -803,6 +833,7 @@ ${skillContent.slice(0, 15000)}`;
|
|
|
803
833
|
resolvers.delete(taskId);
|
|
804
834
|
cleanupAttachments(taskId);
|
|
805
835
|
wsClient.send({ action: 'agent_ready', agentId });
|
|
836
|
+
processNextForAgent(agentId);
|
|
806
837
|
}
|
|
807
838
|
}
|
|
808
839
|
/**
|
|
@@ -1049,6 +1080,7 @@ ${skillContent.slice(0, 15000)}`;
|
|
|
1049
1080
|
resolvers.delete(taskId);
|
|
1050
1081
|
cleanupAttachments(taskId);
|
|
1051
1082
|
wsClient.send({ action: 'agent_ready', agentId });
|
|
1083
|
+
processNextForAgent(agentId);
|
|
1052
1084
|
}
|
|
1053
1085
|
}
|
|
1054
1086
|
// Wire up agent metrics to heartbeat
|