engram-sdk 0.4.2 → 0.4.4
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/Dockerfile +12 -5
- package/LICENSE +85 -80
- package/dist/accounts.d.ts +0 -1
- package/dist/accounts.d.ts.map +1 -1
- package/dist/accounts.js +0 -14
- package/dist/accounts.js.map +1 -1
- package/dist/cli.js +0 -0
- package/dist/hosted.d.ts.map +1 -1
- package/dist/hosted.js +1 -52
- package/dist/hosted.js.map +1 -1
- package/dist/server.js +0 -0
- package/eval-codebase-v2-NOTE.md +22 -0
- package/package.json +3 -2
- package/rescore-codebase.ts +165 -94
- package/rescore-vscode.ts +142 -0
- package/server.json +29 -0
- package/EVAL-FRAMEWORK.md +0 -70
- package/EVAL.md +0 -127
- package/agent-eval.md +0 -144
- package/dist/telemetry.d.ts +0 -36
- package/dist/telemetry.d.ts.map +0 -1
- package/dist/telemetry.js +0 -140
- package/dist/telemetry.js.map +0 -1
package/rescore-codebase.ts
CHANGED
|
@@ -1,113 +1,184 @@
|
|
|
1
1
|
#!/usr/bin/env npx tsx
|
|
2
2
|
/**
|
|
3
|
-
* Re-score existing codebase eval results using LLM
|
|
3
|
+
* rescore-codebase.ts — Re-score existing codebase eval results using LLM judge
|
|
4
|
+
* Uses the saved answers + ground truth, just re-runs scoring
|
|
4
5
|
*/
|
|
5
6
|
|
|
6
7
|
import { readFileSync, writeFileSync } from 'fs';
|
|
7
|
-
import {
|
|
8
|
-
import {
|
|
8
|
+
import { resolve, dirname } from 'path';
|
|
9
|
+
import { fileURLToPath } from 'url';
|
|
9
10
|
|
|
10
|
-
const
|
|
11
|
-
const
|
|
12
|
-
const
|
|
11
|
+
// Directory containing this module, derived from import.meta.url
// (ES modules do not provide a built-in __dirname).
const __dirname = dirname(fileURLToPath(import.meta.url));
|
|
12
|
+
// Gemini API key, read synchronously at module load from
// $HOME/.config/engram/gemini-key and trimmed of surrounding whitespace.
// NOTE(review): the `!` on process.env.HOME only silences the type checker;
// presumably this fails at startup when HOME is unset (e.g. on Windows) — confirm.
const GEMINI_KEY = readFileSync(resolve(process.env.HOME!, '.config/engram/gemini-key'), 'utf8').trim();
|
|
13
|
+
// Input: previously saved eval results, resolved relative to this script's directory.
const RESULTS_PATH = resolve(__dirname, 'eval-scale-data/codebase-results-vscode.json');
|
|
14
|
+
// Output: location of the summary report written at the end of main().
const REPORT_PATH = resolve(__dirname, 'eval-scale-data/codebase-report-vscode-v2.json');
|
|
13
15
|
|
|
14
|
-
function
|
|
15
|
-
|
|
16
|
-
|
|
17
|
-
|
|
18
|
-
|
|
19
|
-
|
|
20
|
-
|
|
21
|
-
|
|
22
|
-
|
|
23
|
-
|
|
24
|
-
|
|
25
|
-
|
|
26
|
-
|
|
27
|
-
|
|
28
|
-
|
|
29
|
-
|
|
30
|
-
|
|
31
|
-
|
|
32
|
-
|
|
33
|
-
|
|
34
|
-
|
|
35
|
-
|
|
16
|
+
/**
 * Send a single-turn prompt to the Gemini `generateContent` endpoint and
 * return the text of the first candidate.
 *
 * Up to three attempts are made. A 429 response waits for the interval given
 * by the `Retry-After` header (10 s when the header is absent or unparseable);
 * any other non-OK status or a thrown fetch error waits 2 s. Failures are
 * logged, never thrown.
 *
 * @param prompt    Text sent as the only content part of the request.
 * @param maxTokens Value passed as `generationConfig.maxOutputTokens`.
 * @returns The first candidate's text, or `''` when the response carries no
 *          text or all attempts failed.
 */
async function geminiCall(prompt: string, maxTokens = 100): Promise<string> {
  const MAX_ATTEMPTS = 3;
  const DEFAULT_RETRY_AFTER_S = 10;
  const ERROR_BACKOFF_MS = 2000;

  const pause = (ms: number) => new Promise<void>(done => setTimeout(done, ms));

  // Retry-After is either a number of seconds or an HTTP-date. Parsing the
  // date form with parseInt produced NaN (an immediate retry), so handle both
  // forms and fall back to the default for anything else.
  const retryAfterSeconds = (header: string | null): number => {
    const raw = header?.trim() ?? '';
    if (/^\d+(?:\.\d+)?$/.test(raw)) return Number(raw);
    const when = Date.parse(raw);
    if (Number.isNaN(when)) return DEFAULT_RETRY_AFTER_S;
    return Math.max(0, Math.ceil((when - Date.now()) / 1000));
  };

  for (let attempt = 0; attempt < MAX_ATTEMPTS; attempt++) {
    try {
      const response = await fetch(
        `https://generativelanguage.googleapis.com/v1beta/models/gemini-2.5-flash:generateContent?key=${GEMINI_KEY}`,
        {
          method: 'POST',
          headers: { 'Content-Type': 'application/json' },
          body: JSON.stringify({
            contents: [{ parts: [{ text: prompt }] }],
            generationConfig: { maxOutputTokens: maxTokens, temperature: 0 },
          }),
        }
      );

      if (response.status === 429) {
        const retryAfter = retryAfterSeconds(response.headers.get('retry-after'));
        console.log(` Rate limited, waiting ${retryAfter}s...`);
        await pause(retryAfter * 1000);
        continue;
      }

      if (!response.ok) {
        console.log(` API error ${response.status}, retrying...`);
        await pause(ERROR_BACKOFF_MS);
        continue;
      }

      // Only the fields read below are described; the rest of the payload is ignored.
      const data = (await response.json()) as {
        candidates?: Array<{ content?: { parts?: Array<{ text?: string }> } }>;
      };
      return data.candidates?.[0]?.content?.parts?.[0]?.text ?? '';
    } catch (e: unknown) {
      // A rejection is not guaranteed to be an Error instance.
      const reason = e instanceof Error ? e.message : String(e);
      console.log(` Fetch error: ${reason}, retrying...`);
      await pause(ERROR_BACKOFF_MS);
    }
  }

  // Every attempt failed; callers treat the empty string as "no answer".
  return '';
}
|
|
50
|
+
|
|
51
|
+
/**
 * Ask the LLM judge to grade one answer against the ground truth.
 *
 * The reply is first parsed as a bare number. If that fails or is out of
 * range, a standalone number in [0, 1] is searched for inside the reply.
 * The search refuses to carve a digit out of a larger number, so "1.5",
 * "10" or "7/10" count as unparseable instead of being read as 1.
 *
 * @param question Question that was asked about the codebase.
 * @param truth    Ground-truth answer.
 * @param answer   Answer produced by the system under evaluation.
 * @returns A score in [0, 1], or -1 when no valid score could be extracted
 *          (the caller decides how to treat failures).
 */
async function scoreAnswer(question: string, truth: string, answer: string): Promise<number> {
  const prompt = `You are evaluating an AI's answer about a codebase. Score it from 0.0 to 1.0.

- 1.0 = Correct and complete
- 0.7 = Mostly correct, minor gaps
- 0.5 = Partially correct
- 0.3 = Mentions something relevant but mostly wrong
- 0.0 = Wrong or "I don't know"

Question: ${question}
Ground Truth: ${truth}
AI's Answer: ${answer}

Respond with ONLY a decimal number (e.g. 0.7). Nothing else.`;

  const reply = (await geminiCall(prompt)).trim();

  // Fast path: the judge followed instructions and replied with just a number.
  const direct = parseFloat(reply);
  if (!Number.isNaN(direct) && direct >= 0 && direct <= 1) return direct;

  // Fallback: a number in [0, 1] embedded in other text. The lookbehind and
  // lookahead reject digits that belong to a longer number.
  const embedded = reply.match(/(?<!\d\.?)(0(?:\.\d+)?|1(?:\.0+)?)(?!\d|\.\d)/);
  if (embedded) return parseFloat(embedded[1]);

  console.log(` Failed to parse score: "${reply}"`);
  return -1; // Mark as failed, don't default to 0
}
|
|
43
80
|
|
|
44
81
|
/**
 * Re-score every saved eval result with the LLM judge, then print and save
 * a summary report.
 *
 * For each result and each system, the saved answer is graded by
 * `scoreAnswer` and the grade is stored on the entry as `score`. Entries with
 * no answer are given 0 without calling the judge. Replies the judge could
 * not be parsed from are counted in `failedParses` and also stored as 0.
 * Progress is written to a "-rescored2.json" file next to the input every
 * five questions and after the last one. The report goes to REPORT_PATH.
 */
async function main() {
  const results = JSON.parse(readFileSync(RESULTS_PATH, 'utf8'));
  console.log(`Rescoring ${results.length} results...\n`);

  const systems = ['engram', 'cappedContext', 'naiveRag', 'grepSearch'] as const;
  // Anchor on the extension so a ".json" elsewhere in the path is never touched.
  const progressPath = RESULTS_PATH.replace(/\.json$/, '-rescored2.json');
  let totalScored = 0;
  let totalFailed = 0;

  for (let i = 0; i < results.length; i++) {
    const r = results[i];
    console.log(`[${i+1}/${results.length}] (${r.category}/${r.difficulty}) ${r.question.slice(0, 70)}...`);

    const scores: Record<string, number> = {};
    for (const sys of systems) {
      if (!r[sys]?.answer) { scores[sys] = 0; continue; }
      const score = await scoreAnswer(r.question, r.groundTruth, r[sys].answer);
      if (score === -1) {
        totalFailed++;
        scores[sys] = 0;
      } else {
        // Count only answers the judge actually graded, so the
        // "Scored" total no longer includes skipped or failed entries.
        totalScored++;
        scores[sys] = score;
      }
      r[sys].score = scores[sys];
    }

    const line = systems.map(s => `${s[0].toUpperCase()}:${scores[s].toFixed(2)}`).join(' ');
    console.log(` ${line}`);

    // Save progress every 5 questions
    if ((i + 1) % 5 === 0 || i === results.length - 1) {
      writeFileSync(progressPath, JSON.stringify(results, null, 2));
    }
  }

  console.log(`\nScored: ${totalScored}, Failed parses: ${totalFailed}\n`);

  // --- Aggregation helpers (shared by the console output and the saved report) ---

  // Arithmetic mean that yields 0 for an empty set instead of NaN.
  const mean = (rows: any[], pick: (row: any) => number): number =>
    rows.length === 0 ? 0 : rows.reduce((sum: number, row: any) => sum + pick(row), 0) / rows.length;
  // Mean score of one system over the given rows, as a percentage string.
  const accuracyPct = (rows: any[], sys: string): string =>
    (mean(rows, row => row[sys]?.score ?? 0) * 100).toFixed(1);
  // Mean tokens used by one system over all results, rounded.
  const avgTokens = (sys: string): number =>
    Math.round(mean(results, row => row[sys]?.tokensUsed ?? 0));
  // Partition results by the distinct values of a field, in first-seen order.
  const groupBy = (field: 'category' | 'difficulty') => {
    const keys: string[] = [...new Set<string>(results.map((r: any) => r[field]))];
    return keys.map(key => ({
      key,
      rows: results.filter((r: any) => r[field] === key) as any[],
    }));
  };
  const tagLine = (rows: any[]): string =>
    systems.map(s => `${s[0].toUpperCase()}:${accuracyPct(rows, s)}%`).join(' ');

  const byCategory = groupBy('category');
  const byDifficulty = groupBy('difficulty');

  // --- Console report ---
  console.log('=== VS Code Codebase Evaluation Report ===\n');
  // Report the real question count rather than a hard-coded 50.
  console.log(`OVERALL (${results.length} questions)`);
  console.log(`${'System'.padEnd(20)} ${'Accuracy'.padEnd(12)} Avg Tokens`);
  for (const sys of systems) {
    const acc = accuracyPct(results, sys);
    console.log(`${sys.padEnd(20)} ${(acc + '%').padEnd(12)} ${avgTokens(sys)}`);
  }

  // Per category
  for (const { key, rows } of byCategory) {
    console.log(`\n ${String(key).toUpperCase()} (n=${rows.length}): ${tagLine(rows)}`);
  }

  // Per difficulty
  for (const { key, rows } of byDifficulty) {
    console.log(`\n ${String(key).toUpperCase()} (n=${rows.length}): ${tagLine(rows)}`);
  }

  // Guard the ratio: with no capped-context token data the savings are reported as 0.
  const cappedTokens = avgTokens('cappedContext');
  const tokenSavings = cappedTokens > 0 ? (1 - avgTokens('engram') / cappedTokens) * 100 : 0;
  console.log(`\n Token savings vs capped context: ${tokenSavings.toFixed(1)}%`);

  // Save report
  const perSystem = (rows: any[]) =>
    Object.fromEntries(systems.map(s => [s, accuracyPct(rows, s)]));
  const report = {
    timestamp: new Date().toISOString(),
    totalQuestions: results.length,
    failedParses: totalFailed,
    overall: Object.fromEntries(systems.map(s => [s, {
      accuracy: accuracyPct(results, s),
      avgTokens: avgTokens(s),
    }])),
    byCategory: Object.fromEntries(byCategory.map(({ key, rows }) => [key, perSystem(rows)])),
    byDifficulty: Object.fromEntries(byDifficulty.map(({ key, rows }) => [key, perSystem(rows)])),
    tokenSavingsVsCapped: tokenSavings.toFixed(1) + '%',
  };
  writeFileSync(REPORT_PATH, JSON.stringify(report, null, 2));
  console.log(`\nReport saved: ${REPORT_PATH}`);
}
|
|
112
183
|
|
|
113
184
|
// Run the script. On failure, log the error and set a non-zero exit status so
// shells and CI can tell the run did not succeed.
main().catch((err: unknown) => {
  console.error(err);
  process.exitCode = 1;
});
|
|
@@ -0,0 +1,142 @@
|
|
|
1
|
+
#!/usr/bin/env npx tsx
|
|
2
|
+
/**
|
|
3
|
+
* rescore-vscode.ts -- Re-score the VS Code codebase eval results
|
|
4
|
+
*
|
|
5
|
+
* The original eval generated good answers but the judge scoring returned
|
|
6
|
+
* unparseable responses (all 0s). This script re-runs ONLY the scoring step.
|
|
7
|
+
*/
|
|
8
|
+
|
|
9
|
+
import { readFileSync, writeFileSync } from 'fs';
|
|
10
|
+
import { homedir } from 'os';
|
|
11
|
+
import { join } from 'path';
|
|
12
|
+
|
|
13
|
+
// Gemini API key, read synchronously at module load from
// ~/.config/engram/gemini-key and trimmed of surrounding whitespace.
const GEMINI_KEY = readFileSync(join(homedir(), '.config/engram/gemini-key'), 'utf8').trim();
|
|
14
|
+
// Directory holding the eval data, under the current user's home directory.
// NOTE(review): this looks like a machine-specific workspace path — confirm it
// exists wherever the script is expected to run.
const EVAL_DIR = join(homedir(), '.openclaw/workspace/engram/eval-scale-data');
|
|
15
|
+
// Input: previously saved eval results read by main().
const RESULTS_PATH = join(EVAL_DIR, 'codebase-results-vscode.json');
|
|
16
|
+
// Output: results with refreshed scores, written by main().
const RESCORED_PATH = join(EVAL_DIR, 'codebase-results-vscode-rescored.json');
|
|
17
|
+
// Pause, in milliseconds, that main() inserts before each scoring call.
const RATE_LIMIT_MS = 1500;
|
|
18
|
+
|
|
19
|
+
/** Resolve (with no value) once `ms` milliseconds have elapsed. */
async function sleep(ms: number) {
  const timer = new Promise(wake => {
    setTimeout(wake, ms);
  });
  return timer;
}
|
|
20
|
+
|
|
21
|
+
/**
 * Run `fn`, retrying with exponential backoff when it fails with a
 * rate-limit error.
 *
 * A failure is retried only when its `message` contains "429" and attempts
 * remain. The wait doubles each time (3000 ms, 6000 ms, ...) and is capped at
 * 60000 ms. Any other failure, including a rejection that is not an Error,
 * is rethrown unchanged.
 *
 * @param fn      Operation to attempt.
 * @param retries Maximum number of attempts.
 * @returns Whatever `fn` resolves to.
 * @throws The last error from `fn`, or `Error('Exhausted retries')` when
 *         `retries` is not positive and `fn` is never called.
 */
async function withRetry<T>(fn: () => Promise<T>, retries = 5): Promise<T> {
  // Read `.message` defensively: a rejection may be null, a string, or any
  // other value, and dereferencing it blindly would replace the original
  // failure with a TypeError.
  const messageOf = (err: unknown): string => {
    if (typeof err !== 'object' || err === null || !('message' in err)) return '';
    const { message } = err as { message: unknown };
    return typeof message === 'string' ? message : '';
  };

  for (let attempt = 1; attempt <= retries; attempt++) {
    try {
      return await fn();
    } catch (err: unknown) {
      const rateLimited = messageOf(err).includes('429');
      if (!rateLimited || attempt >= retries) {
        throw err;
      }
      const backoff = Math.min(1500 * Math.pow(2, attempt), 60000);
      console.log(` [Retry ${attempt}/${retries}] 429, waiting ${backoff}ms...`);
      await sleep(backoff);
    }
  }
  throw new Error('Exhausted retries');
}
|
|
37
|
+
|
|
38
|
+
/**
 * Send a single-turn prompt to the Gemini `generateContent` endpoint.
 *
 * @param prompt    Text sent as the only content part.
 * @param maxTokens Value passed as `generationConfig.maxOutputTokens`.
 * @returns The trimmed text of the first candidate, or `''` when there is none.
 * @throws Error with the HTTP status and the first 200 characters of the
 *         response body when the response is not OK.
 */
async function geminiCall(prompt: string, maxTokens = 200): Promise<string> {
  const endpoint =
    `https://generativelanguage.googleapis.com/v1beta/models/gemini-2.5-flash:generateContent?key=${GEMINI_KEY}`;
  const payload = {
    contents: [{ parts: [{ text: prompt }] }],
    generationConfig: { maxOutputTokens: maxTokens, temperature: 0.0 },
  };

  const res = await fetch(endpoint, {
    method: 'POST',
    headers: { 'Content-Type': 'application/json' },
    body: JSON.stringify(payload),
  });

  if (res.ok) {
    const data = await res.json() as any;
    const text = data.candidates?.[0]?.content?.parts?.[0]?.text?.trim();
    return text || '';
  }

  // Surface the status plus a bounded slice of the body for diagnosis.
  const errorText = await res.text();
  throw new Error(`Gemini ${res.status}: ${errorText.slice(0, 200)}`);
}
|
|
57
|
+
|
|
58
|
+
/**
 * Ask the LLM judge to grade one answer about the VS Code codebase.
 *
 * The reply is first parsed as a bare number. If that fails or is out of
 * range, a standalone number in [0, 1] embedded in the reply (for example
 * "Score: 0.7") is accepted, so a short preamble no longer turns a valid
 * grade into 0.
 *
 * @param question    Question that was asked.
 * @param groundTruth Reference answer.
 * @param answer      Answer produced by the system under evaluation.
 * @returns A score in [0, 1]; 0 when no valid score can be extracted.
 */
async function scoreAnswer(question: string, groundTruth: string, answer: string): Promise<number> {
  const prompt = `You are a strict code knowledge evaluator. Score this answer about the VS Code codebase.

Question: ${question}

Correct Answer: ${groundTruth}

Given Answer: ${answer}

Score from 0.0 to 1.0:
- 1.0 = Completely correct, mentions the right files/classes/functions
- 0.7 = Mostly correct, minor details missing
- 0.5 = Partially correct, gets the general area but misses specifics
- 0.3 = Vaguely related but mostly wrong
- 0.0 = Completely wrong or says "insufficient context"

Respond with ONLY a single number between 0.0 and 1.0. Nothing else.`;

  // NOTE(review): 10 output tokens is a tight budget; if the model emits
  // anything before the number the reply may be cut short — confirm.
  const response = await withRetry(() => geminiCall(prompt, 10));

  // Fast path: the judge replied with just a number.
  const direct = parseFloat(response);
  if (!Number.isNaN(direct) && direct >= 0 && direct <= 1) return direct;

  // Fallback: a number in [0, 1] inside other text. The lookbehind and
  // lookahead keep it from carving a digit out of a longer number such as
  // "1.5" or "10".
  const embedded = response.match(/(?<!\d\.?)(0(?:\.\d+)?|1(?:\.0+)?)(?!\d|\.\d)/);
  if (embedded) return parseFloat(embedded[1]);

  console.log(` Warning: unparseable score "${response}", defaulting to 0`);
  return 0;
}
|
|
84
|
+
|
|
85
|
+
/**
 * Re-run only the judge-scoring step over the saved VS Code eval results.
 *
 * For every result, each system's saved answer is graded by `scoreAnswer`
 * after a RATE_LIMIT_MS pause, and the grade is stored on the entry as
 * `score`. A system entry that is missing is skipped, and one with no answer
 * is given 0 without calling the judge. Results are written to RESCORED_PATH
 * every five questions and once more at the end, then a summary is printed.
 */
async function main() {
  console.log('=== Re-scoring VS Code Codebase Eval ===\n');

  const results = JSON.parse(readFileSync(RESULTS_PATH, 'utf8'));
  console.log(`Loaded ${results.length} results\n`);

  // One table drives scoring, the per-question line and the summary, in
  // place of four copy-pasted stanzas.
  const systems = [
    { key: 'engram', tag: 'E', label: 'Engram' },
    { key: 'cappedContext', tag: 'C', label: 'Capped Context' },
    { key: 'naiveRag', tag: 'R', label: 'Naive RAG' },
    { key: 'grepSearch', tag: 'G', label: 'Grep Search' },
  ] as const;

  for (let i = 0; i < results.length; i++) {
    const r = results[i];
    console.log(`[${i + 1}/${results.length}] (${r.category}/${r.difficulty}) ${r.question.slice(0, 65)}...`);

    // Score each system individually (more reliable than 4-at-once)
    for (const { key } of systems) {
      const entry = r[key];
      if (!entry) continue; // system missing from this result: nothing to score
      if (!entry.answer) { entry.score = 0; continue; }
      await sleep(RATE_LIMIT_MS);
      entry.score = await scoreAnswer(r.question, r.groundTruth, entry.answer);
    }

    const line = systems
      .map(({ key, tag }) => `${tag}:${(r[key]?.score ?? 0).toFixed(2)}`)
      .join(' ');
    console.log(` ${line}`);

    // Save every 5
    if ((i + 1) % 5 === 0) {
      writeFileSync(RESCORED_PATH, JSON.stringify(results, null, 2));
    }
  }

  writeFileSync(RESCORED_PATH, JSON.stringify(results, null, 2));
  console.log(`\nSaved to ${RESCORED_PATH}`);

  // Print summary
  // Mean that yields 0 for an empty set instead of NaN; missing entries count as 0.
  const mean = (rows: any[], pick: (row: any) => number): number =>
    rows.length === 0 ? 0 : rows.reduce((sum: number, row: any) => sum + pick(row), 0) / rows.length;
  const pct = (rows: any[], key: string): string =>
    (mean(rows, row => row[key]?.score ?? 0) * 100).toFixed(1);
  const avgTok = (key: string): number =>
    Math.round(mean(results, row => row[key]?.tokensUsed ?? 0));

  console.log(`\n=== RESULTS (${results.length} questions) ===`);
  for (const { key, label } of systems) {
    console.log(`${label}: ${pct(results, key)}% (${avgTok(key)} tok/q)`);
  }

  // By category
  const cats = [...new Set(results.map((r: any) => r.category))];
  for (const cat of cats) {
    const cr = results.filter((r: any) => r.category === cat);
    const tags = systems.map(({ key, tag }) => `${tag}:${pct(cr, key)}%`).join(' ');
    console.log(`\n${String(cat).toUpperCase()} (n=${cr.length}): ${tags}`);
  }
}
|
|
138
|
+
|
|
139
|
+
// Entry point: run main(); on any failure report it and exit with status 1.
function abortOnFailure(failure: unknown): never {
  console.error('Fatal:', failure);
  process.exit(1);
}

main().catch(abortOnFailure);
|
package/server.json
ADDED
|
@@ -0,0 +1,29 @@
|
|
|
1
|
+
{
|
|
2
|
+
"$schema": "https://static.modelcontextprotocol.io/schemas/2025-12-11/server.schema.json",
|
|
3
|
+
"name": "io.github.tstockham96/engram",
|
|
4
|
+
"description": "Intelligent memory layer for AI agents. Automatic extraction, consolidation, bi-temporal recall, and proactive context surfacing. Scores 80% on LOCOMO benchmark at 93% fewer tokens than full context.",
|
|
5
|
+
"repository": {
|
|
6
|
+
"url": "https://github.com/tstockham96/engram",
|
|
7
|
+
"source": "github"
|
|
8
|
+
},
|
|
9
|
+
"version": "0.4.4",
|
|
10
|
+
"packages": [
|
|
11
|
+
{
|
|
12
|
+
"registryType": "npm",
|
|
13
|
+
"identifier": "engram-sdk",
|
|
14
|
+
"version": "0.4.4",
|
|
15
|
+
"transport": {
|
|
16
|
+
"type": "stdio"
|
|
17
|
+
},
|
|
18
|
+
"environmentVariables": [
|
|
19
|
+
{
|
|
20
|
+
"description": "Google Gemini API key for embeddings and LLM operations",
|
|
21
|
+
"isRequired": true,
|
|
22
|
+
"format": "string",
|
|
23
|
+
"isSecret": true,
|
|
24
|
+
"name": "GEMINI_API_KEY"
|
|
25
|
+
}
|
|
26
|
+
]
|
|
27
|
+
}
|
|
28
|
+
]
|
|
29
|
+
}
|
package/EVAL-FRAMEWORK.md
DELETED
|
@@ -1,70 +0,0 @@
|
|
|
1
|
-
# Engram Eval Framework
|
|
2
|
-
|
|
3
|
-
## The Question
|
|
4
|
-
Does Engram make an AI agent meaningfully better at its job compared to flat-file memory (MEMORY.md + daily notes + vector search)?
|
|
5
|
-
|
|
6
|
-
## What "Better" Means
|
|
7
|
-
1. **Fewer repeated questions** — Agent doesn't ask things it should already know
|
|
8
|
-
2. **Surprise recall** — Agent surfaces relevant context the human didn't prompt for
|
|
9
|
-
3. **Contradiction prevention** — Agent catches itself before giving conflicting info
|
|
10
|
-
4. **Reduced curation toil** — Human spends less time maintaining memory files
|
|
11
|
-
5. **Token efficiency** — Same or better context quality with fewer tokens
|
|
12
|
-
|
|
13
|
-
## Daily Automated Eval (runs via cron)
|
|
14
|
-
|
|
15
|
-
### Recall Accuracy Test
|
|
16
|
-
Run a fixed set of 20 questions with known answers. Score: % of correct top-3 results.
|
|
17
|
-
Track over time as memories accumulate and consolidation runs.
|
|
18
|
-
|
|
19
|
-
Questions should cover:
|
|
20
|
-
- Factual (What is Thomas's job? → Senior PM at BambooHR)
|
|
21
|
-
- Procedural (How do I deploy the site? → cd engram-site && npx vercel --prod)
|
|
22
|
-
- Relational (Who are Engram's competitors? → Mem0, Zep, Letta, LangMem)
|
|
23
|
-
- Temporal (What did we build yesterday? → depends on date)
|
|
24
|
-
- Personal preference (How does Thomas like communication? → direct, no fluff)
|
|
25
|
-
|
|
26
|
-
### Token Comparison
|
|
27
|
-
- Measure: bytes of MEMORY.md + loaded daily files vs Engram briefing() output
|
|
28
|
-
- Track both as they grow over time
|
|
29
|
-
- Calculate: tokens saved per request if using Engram briefing instead of file dump
|
|
30
|
-
|
|
31
|
-
### Consolidation Quality
|
|
32
|
-
After each consolidation run, evaluate:
|
|
33
|
-
- Are new semantic memories factually accurate?
|
|
34
|
-
- Are they non-redundant with existing memories?
|
|
35
|
-
- Do they capture something the raw episodes didn't explicitly state? (insight generation)
|
|
36
|
-
|
|
37
|
-
### Memory Freshness
|
|
38
|
-
- Count: memories with status "pending" that are actually fulfilled
|
|
39
|
-
- Count: memories that contradict each other
|
|
40
|
-
- Track whether lifecycle management is happening
|
|
41
|
-
|
|
42
|
-
## Weekly Human Eval
|
|
43
|
-
|
|
44
|
-
Thomas rates (1-5) on a weekly basis:
|
|
45
|
-
1. Did the agent remember things it should have?
|
|
46
|
-
2. Did the agent forget things it shouldn't have?
|
|
47
|
-
3. Did the agent surprise you with relevant context?
|
|
48
|
-
4. Was there less repetitive Q&A this week vs last?
|
|
49
|
-
|
|
50
|
-
## Success Criteria
|
|
51
|
-
|
|
52
|
-
### Engram is WORKING when:
|
|
53
|
-
- Recall accuracy consistently >80% on the fixed question set
|
|
54
|
-
- Agent surfaces relevant context Thomas didn't ask for at least 2x/week
|
|
55
|
-
- Agent catches a contradiction or stale commitment at least 1x/week
|
|
56
|
-
- MEMORY.md stops growing because Engram captures what it used to
|
|
57
|
-
|
|
58
|
-
### Engram is NOT WORKING when:
|
|
59
|
-
- MEMORY.md remains the primary useful memory source
|
|
60
|
-
- Recall accuracy stays flat or degrades as memories accumulate
|
|
61
|
-
- Consolidation produces redundant summaries instead of insights
|
|
62
|
-
- Token cost increases without quality improvement
|
|
63
|
-
|
|
64
|
-
## Scale Milestones
|
|
65
|
-
Track value at each milestone:
|
|
66
|
-
- 220 memories (current) — baseline
|
|
67
|
-
- 500 memories — first consolidation cycle quality check
|
|
68
|
-
- 1,000 memories — graph density threshold
|
|
69
|
-
- 5,000 memories — flat-file memory should be breaking down by here
|
|
70
|
-
- 10,000+ memories — Engram must be clearly better or it's not working
|
package/EVAL.md
DELETED
|
@@ -1,127 +0,0 @@
|
|
|
1
|
-
# Engram Self-Eval: Agent Performance With vs Without Memory
|
|
2
|
-
|
|
3
|
-
*Designed Feb 16, 2026. Running over 72 hours.*
|
|
4
|
-
|
|
5
|
-
## Hypothesis
|
|
6
|
-
|
|
7
|
-
An AI agent with Engram memory performs measurably better at tasks requiring context from prior conversations than the same agent without it.
|
|
8
|
-
|
|
9
|
-
## Design
|
|
10
|
-
|
|
11
|
-
**Subject:** Jarvis (Claude Opus via OpenClaw) — the same agent, tested in two conditions:
|
|
12
|
-
|
|
13
|
-
1. **Baseline (no Engram):** Agent uses only OpenClaw's default memory system (MEMORY.md flat files + vector search over markdown)
|
|
14
|
-
2. **Engram-enhanced:** Agent uses Engram vault (auto-ingest, semantic recall, consolidation, knowledge graph)
|
|
15
|
-
|
|
16
|
-
**Interface:** REST API + auto-ingest pipeline — the same way any OpenClaw user would run it. No MCP needed. Just the Engram server running alongside the agent, auto-ingesting conversations, and serving recall queries via HTTP.
|
|
17
|
-
|
|
18
|
-
**Method:** Over 72 hours of normal use with Thomas, I'll log specific moments where memory matters. At each moment, I'll:
|
|
19
|
-
1. Query Engram's REST API for relevant context
|
|
20
|
-
2. Note what my flat-file memory (MEMORY.md + vector search) would have returned
|
|
21
|
-
3. Compare the quality of recall
|
|
22
|
-
|
|
23
|
-
This isn't a synthetic benchmark — it's real-world usage on the same setup any OpenClaw user would have. More ecologically valid than a lab test.
|
|
24
|
-
|
|
25
|
-
## Metrics
|
|
26
|
-
|
|
27
|
-
### 1. Recall Accuracy (quantitative)
|
|
28
|
-
At each memory-dependent moment, score:
|
|
29
|
-
- **Hit**: Correct, relevant memory surfaced → 1 point
|
|
30
|
-
- **Partial**: Related but incomplete or slightly wrong → 0.5 points
|
|
31
|
-
- **Miss**: Relevant memory existed but wasn't recalled → 0 points
|
|
32
|
-
- **Hallucination**: Confidently wrong memory → -1 point
|
|
33
|
-
|
|
34
|
-
Score both systems independently. Compare hit rates.
|
|
35
|
-
|
|
36
|
-
### 2. Context Richness (qualitative, 1-5 scale)
|
|
37
|
-
When recalling context, rate the depth:
|
|
38
|
-
- 1: No useful context
|
|
39
|
-
- 2: Basic facts only ("Thomas works at BambooHR")
|
|
40
|
-
- 3: Facts + preferences ("Thomas works at BambooHR, prefers direct communication")
|
|
41
|
-
- 4: Facts + preferences + history ("...and he pivoted from Kin to Engram because thin-margin hosting has no moat")
|
|
42
|
-
- 5: Full situational awareness including decisions, reasoning, and emotional context
|
|
43
|
-
|
|
44
|
-
### 3. Consolidation Quality (qualitative)
|
|
45
|
-
After each consolidation run:
|
|
46
|
-
- How many genuinely useful semantic memories were created?
|
|
47
|
-
- Were any contradictions correctly identified?
|
|
48
|
-
- Were any stale memories correctly superseded?
|
|
49
|
-
- Did the knowledge graph capture meaningful relationships?
|
|
50
|
-
|
|
51
|
-
### 4. Time-to-Context (qualitative)
|
|
52
|
-
How quickly does the agent reach useful context?
|
|
53
|
-
- Engram: single API call, structured results
|
|
54
|
-
- Flat files: grep through markdown, parse manually, hope the right section is there
|
|
55
|
-
|
|
56
|
-
### 5. "Surprise" Moments
|
|
57
|
-
Log any moment where Engram surfaced context the agent wouldn't have thought to look for. These are the moments that demonstrate proactive value.
|
|
58
|
-
|
|
59
|
-
## Test Scenarios
|
|
60
|
-
|
|
61
|
-
Run these naturally over the 72-hour period:
|
|
62
|
-
|
|
63
|
-
### Day 1 (Feb 16-17): Foundation
|
|
64
|
-
- [x] Seed vault from existing conversations
|
|
65
|
-
- [x] Run auto-ingest on current session
|
|
66
|
-
- [x] First consolidation run
|
|
67
|
-
- [ ] Evening: Ask about decisions made in morning (test same-day recall)
|
|
68
|
-
- [ ] Evening: Run consolidation, note what it produces
|
|
69
|
-
|
|
70
|
-
### Day 2 (Feb 17-18): Cross-Session Recall
|
|
71
|
-
- [ ] Morning: Can Engram recall yesterday's context without being told?
|
|
72
|
-
- [ ] Ask about pricing decisions (tests semantic memory from consolidation)
|
|
73
|
-
- [ ] Ask about Thomas's preferences (tests across multiple conversations)
|
|
74
|
-
- [ ] Work on a task that references past decisions — compare with/without
|
|
75
|
-
- [ ] Run consolidation, note cross-day patterns
|
|
76
|
-
|
|
77
|
-
### Day 3 (Feb 18-19): Stress Tests
|
|
78
|
-
- [ ] Ask a question that requires connecting 3+ separate memories
|
|
79
|
-
- [ ] Ask about something mentioned once 3 days ago (tests decay/persistence)
|
|
80
|
-
- [ ] Introduce contradictory information — does consolidation catch it?
|
|
81
|
-
- [ ] Ask an ambiguous question — does the knowledge graph help disambiguate?
|
|
82
|
-
- [ ] Final consolidation + full analysis
|
|
83
|
-
|
|
84
|
-
## Logging Format
|
|
85
|
-
|
|
86
|
-
Each test moment gets logged as:
|
|
87
|
-
|
|
88
|
-
```markdown
|
|
89
|
-
### Test #N: [Description]
|
|
90
|
-
**Time:** YYYY-MM-DD HH:MM
|
|
91
|
-
**Query:** "What I asked / needed to recall"
|
|
92
|
-
**Engram result:** [what Engram returned]
|
|
93
|
-
**Flat-file result:** [what MEMORY.md search would return]
|
|
94
|
-
**Recall accuracy:** Hit / Partial / Miss / Hallucination (for each system)
|
|
95
|
-
**Context richness:** X/5 (for each system)
|
|
96
|
-
**Notes:** [observations]
|
|
97
|
-
```
|
|
98
|
-
|
|
99
|
-
## Success Criteria
|
|
100
|
-
|
|
101
|
-
Engram is **better** if:
|
|
102
|
-
- Recall accuracy is ≥20% higher than flat files
|
|
103
|
-
- Context richness averages ≥1 point higher
|
|
104
|
-
- At least 3 "surprise" moments where Engram surfaced unexpected relevant context
|
|
105
|
-
- Consolidation produces ≥10 genuinely useful semantic memories over 72 hours
|
|
106
|
-
- Zero hallucinated memories (confidence should prevent false recalls)
|
|
107
|
-
|
|
108
|
-
Engram is **equivalent** if scores are within 10%. Even equivalence is a win if Engram requires less manual effort (no hand-editing MEMORY.md).
|
|
109
|
-
|
|
110
|
-
Engram **fails** if:
|
|
111
|
-
- Recall accuracy is lower than flat files
|
|
112
|
-
- Consolidation produces mostly noise
|
|
113
|
-
- The agent performs worse on tasks due to bad memory surfacing
|
|
114
|
-
|
|
115
|
-
## Results
|
|
116
|
-
|
|
117
|
-
*Will be filled in as the eval progresses.*
|
|
118
|
-
|
|
119
|
-
### Summary (to be completed Feb 19)
|
|
120
|
-
- Total test moments:
|
|
121
|
-
- Engram recall accuracy:
|
|
122
|
-
- Flat-file recall accuracy:
|
|
123
|
-
- Engram context richness avg:
|
|
124
|
-
- Flat-file context richness avg:
|
|
125
|
-
- Surprise moments:
|
|
126
|
-
- Semantic memories from consolidation:
|
|
127
|
-
- Verdict:
|