engram-sdk 0.4.3 → 0.4.4
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/package.json +2 -1
- package/rescore-codebase.ts +165 -94
- package/server.json +29 -0
package/package.json
CHANGED
package/rescore-codebase.ts
CHANGED
|
@@ -1,113 +1,184 @@
|
|
|
1
1
|
#!/usr/bin/env npx tsx
|
|
2
2
|
/**
|
|
3
|
-
* Re-score existing codebase eval results using LLM
|
|
3
|
+
* rescore-codebase.ts — Re-score existing codebase eval results using LLM judge
|
|
4
|
+
* Uses the saved answers + ground truth, just re-runs scoring
|
|
4
5
|
*/
|
|
5
6
|
|
|
6
7
|
import { readFileSync, writeFileSync } from 'fs';
|
|
7
|
-
import {
|
|
8
|
-
import {
|
|
8
|
+
import { resolve, dirname } from 'path';
|
|
9
|
+
import { fileURLToPath } from 'url';
|
|
9
10
|
|
|
10
|
-
const
|
|
11
|
-
const
|
|
12
|
-
const
|
|
11
|
+
// Directory containing this script (ES modules provide no built-in __dirname).
const __dirname = dirname(fileURLToPath(import.meta.url));

// Home directory used to locate the key file. HOME is preferred; USERPROFILE
// is the fallback for environments (e.g. Windows) where HOME is not set.
const HOME_DIR = process.env.HOME ?? process.env.USERPROFILE;
if (!HOME_DIR) {
  throw new Error('Cannot locate the Gemini key file: neither HOME nor USERPROFILE is set');
}

// Gemini API key, read once at module load from <home>/.config/engram/gemini-key.
const GEMINI_KEY = readFileSync(resolve(HOME_DIR, '.config/engram/gemini-key'), 'utf8').trim();
// Saved eval results that main() reads.
const RESULTS_PATH = resolve(__dirname, 'eval-scale-data/codebase-results-vscode.json');
// Destination of the aggregated report that main() writes.
const REPORT_PATH = resolve(__dirname, 'eval-scale-data/codebase-report-vscode-v2.json');
|
|
13
15
|
|
|
14
|
-
function
|
|
15
|
-
|
|
16
|
-
|
|
17
|
-
|
|
18
|
-
|
|
19
|
-
|
|
20
|
-
|
|
21
|
-
|
|
22
|
-
|
|
23
|
-
|
|
24
|
-
|
|
25
|
-
|
|
26
|
-
|
|
27
|
-
|
|
28
|
-
|
|
29
|
-
|
|
30
|
-
|
|
31
|
-
|
|
32
|
-
|
|
33
|
-
|
|
34
|
-
|
|
35
|
-
|
|
16
|
+
/**
 * Send one prompt to the Gemini `gemini-2.5-flash` generateContent endpoint
 * and return the text of the first candidate.
 *
 * Up to three attempts are made:
 * - HTTP 429: wait for the number of seconds in the `retry-after` header
 *   (10 s when the header is absent or not a plain number), then retry.
 * - Any other non-OK status, or a thrown fetch/JSON error: wait 2 s, then retry.
 *
 * @param prompt - Text sent as the only content part of the request.
 * @param maxTokens - Value passed as `generationConfig.maxOutputTokens`.
 * @returns The first candidate's text, or `''` when the response carries no
 *   text or all attempts fail. This function never rejects.
 */
async function geminiCall(prompt: string, maxTokens = 100): Promise<string> {
  const sleep = (ms: number) => new Promise<void>(r => setTimeout(r, ms));

  for (let attempt = 0; attempt < 3; attempt++) {
    try {
      const response = await fetch(
        'https://generativelanguage.googleapis.com/v1beta/models/gemini-2.5-flash:generateContent',
        {
          method: 'POST',
          // The key travels in a header rather than the query string so it
          // does not end up in URLs that get logged or echoed in errors.
          headers: { 'Content-Type': 'application/json', 'x-goog-api-key': GEMINI_KEY },
          body: JSON.stringify({
            contents: [{ parts: [{ text: prompt }] }],
            generationConfig: { maxOutputTokens: maxTokens, temperature: 0 },
          }),
        }
      );

      if (response.status === 429) {
        // Retry-After may be missing or an HTTP-date; both parse to NaN, which
        // would make setTimeout fire immediately, so fall back to 10 seconds.
        const header = response.headers.get('retry-after');
        const parsed = header === null ? NaN : Number.parseInt(header, 10);
        const retryAfter = Number.isFinite(parsed) && parsed >= 0 ? parsed : 10;
        console.log(` Rate limited, waiting ${retryAfter}s...`);
        await sleep(retryAfter * 1000);
        continue;
      }

      if (!response.ok) {
        console.log(` API error ${response.status}, retrying...`);
        await sleep(2000);
        continue;
      }

      const data = await response.json() as any;
      return data.candidates?.[0]?.content?.parts?.[0]?.text ?? '';
    } catch (e: unknown) {
      const message = e instanceof Error ? e.message : String(e);
      console.log(` Fetch error: ${message}, retrying...`);
      await sleep(2000);
    }
  }

  // All attempts exhausted; callers treat the empty string as "no answer".
  return '';
}
|
|
50
|
+
|
|
51
|
+
/**
 * Ask the LLM judge to grade one answer against the ground truth.
 *
 * The judge is told to reply with a bare decimal between 0.0 and 1.0. The
 * reply is parsed in two steps: first as a leading float, then by searching
 * the text for a standalone number in [0, 1].
 *
 * @param question - The question that was asked about the codebase.
 * @param truth - The ground-truth answer.
 * @param answer - The answer being evaluated.
 * @returns A score in [0, 1], or -1 when no valid score can be extracted.
 */
async function scoreAnswer(question: string, truth: string, answer: string): Promise<number> {
  const prompt = `You are evaluating an AI's answer about a codebase. Score it from 0.0 to 1.0.

- 1.0 = Correct and complete
- 0.7 = Mostly correct, minor gaps
- 0.5 = Partially correct
- 0.3 = Mentions something relevant but mostly wrong
- 0.0 = Wrong or "I don't know"

Question: ${question}
Ground Truth: ${truth}
AI's Answer: ${answer}

Respond with ONLY a decimal number (e.g. 0.7). Nothing else.`;

  const response = await geminiCall(prompt);
  const cleaned = response.trim();

  // Try direct float parse first
  const direct = parseFloat(cleaned);
  if (!isNaN(direct) && direct >= 0 && direct <= 1) return direct;

  // Otherwise look for a standalone 0, 0.x, 1 or 1.0 anywhere in the reply.
  // The lookarounds stop the pattern from picking a digit out of a larger
  // number: without them "1.5" or "10" would be reported as a score of 1.
  const standaloneScore = /(?<![\d.])(?:0(?:\.\d+)?|1(?:\.0+)?)(?!\d|\.\d)/;
  const match = cleaned.match(standaloneScore);
  if (match) return parseFloat(match[0]);

  console.log(` Failed to parse score: "${cleaned}"`);
  return -1; // Mark as failed, don't default to 0
}
|
|
43
80
|
|
|
44
81
|
/**
 * Re-score every saved eval result with the LLM judge, print a summary and
 * write the aggregated report.
 *
 * Steps:
 * 1. Load the results array from RESULTS_PATH.
 * 2. For each result and each system, call scoreAnswer() on the saved answer
 *    and store the outcome on `r[sys].score`. Systems with no saved answer are
 *    skipped. A failed parse (-1) is recorded as 0 and counted in
 *    `totalFailed`, so it lowers that system's average; `failedParses` in the
 *    report shows how many such entries there are.
 * 3. Every 5 questions, and after the last one, write the updated results to
 *    a sibling "-rescored2.json" file. RESULTS_PATH itself is not overwritten.
 * 4. Print overall, per-category and per-difficulty accuracy plus token
 *    savings, and write the same figures to REPORT_PATH.
 */
async function main() {
  const results = JSON.parse(readFileSync(RESULTS_PATH, 'utf8'));
  console.log(`Rescoring ${results.length} results...\n`);

  const systems = ['engram', 'cappedContext', 'naiveRag', 'grepSearch'] as const;
  // One-letter tag used in the compact per-question and per-group lines.
  const label = (sys: string) => sys.charAt(0).toUpperCase();
  const rescoredPath = RESULTS_PATH.replace('.json', '-rescored2.json');
  let totalScored = 0;
  let totalFailed = 0;

  for (let i = 0; i < results.length; i++) {
    const r = results[i];
    console.log(`[${i+1}/${results.length}] (${r.category}/${r.difficulty}) ${r.question.slice(0, 70)}...`);

    const scores: Record<string, number> = {};
    for (const sys of systems) {
      // Nothing to judge when this system has no saved answer; show 0.
      if (!r[sys]?.answer) { scores[sys] = 0; continue; }
      const score = await scoreAnswer(r.question, r.groundTruth, r[sys].answer);
      if (score === -1) {
        totalFailed++;
        scores[sys] = 0;
      } else {
        // Count only answers that actually received a judge score.
        totalScored++;
        scores[sys] = score;
      }
      r[sys].score = scores[sys];
    }

    const line = systems.map(s => `${label(s)}:${scores[s].toFixed(2)}`).join(' ');
    console.log(` ${line}`);

    // Save progress every 5 questions
    if ((i + 1) % 5 === 0 || i === results.length - 1) {
      writeFileSync(rescoredPath, JSON.stringify(results, null, 2));
    }
  }

  console.log(`\nScored: ${totalScored}, Failed parses: ${totalFailed}\n`);

  // --- Aggregation helpers -------------------------------------------------

  // Arithmetic mean; 0 for an empty list so the report never contains NaN.
  const mean = (vals: number[]): number =>
    vals.length === 0 ? 0 : vals.reduce((a, b) => a + b, 0) / vals.length;
  // Mean score of one system over the given rows, as a percentage string.
  const accuracyPct = (rows: any[], sys: string): string =>
    (mean(rows.map((r: any) => r[sys]?.score ?? 0)) * 100).toFixed(1);
  // Mean tokensUsed of one system over all results, rounded to an integer.
  const avgTokens = (sys: string): number =>
    Math.round(mean(results.map((r: any) => r[sys]?.tokensUsed ?? 0)));
  // Distinct values of `field`, in first-seen order, with their result rows.
  const groupBy = (field: 'category' | 'difficulty'): Array<[string, any[]]> => {
    const keys = [...new Set<string>(results.map((r: any) => String(r[field])))];
    return keys.map((k): [string, any[]] => [k, results.filter((r: any) => String(r[field]) === k)]);
  };

  // --- Console report ------------------------------------------------------

  console.log('=== VS Code Codebase Evaluation Report ===\n');
  console.log(`OVERALL (${results.length} questions)`);
  console.log(`${'System'.padEnd(20)} ${'Accuracy'.padEnd(12)} Avg Tokens`);
  for (const sys of systems) {
    const acc = accuracyPct(results, sys);
    console.log(`${sys.padEnd(20)} ${(acc + '%').padEnd(12)} ${avgTokens(sys)}`);
  }

  // Per category
  const byCategory = groupBy('category');
  for (const [cat, rows] of byCategory) {
    console.log(`\n ${cat.toUpperCase()} (n=${rows.length}): ${systems.map(s => `${label(s)}:${accuracyPct(rows, s)}%`).join(' ')}`);
  }

  // Per difficulty
  const byDifficulty = groupBy('difficulty');
  for (const [diff, rows] of byDifficulty) {
    console.log(`\n ${diff.toUpperCase()} (n=${rows.length}): ${systems.map(s => `${label(s)}:${accuracyPct(rows, s)}%`).join(' ')}`);
  }

  // Savings are undefined when the baseline used no tokens; report "n/a"
  // instead of dividing by zero.
  const cappedTokens = avgTokens('cappedContext');
  const tokenSavings = cappedTokens > 0
    ? `${((1 - avgTokens('engram') / cappedTokens) * 100).toFixed(1)}%`
    : 'n/a';
  console.log(`\n Token savings vs capped context: ${tokenSavings}`);

  // Save report
  const report = {
    timestamp: new Date().toISOString(),
    totalQuestions: results.length,
    failedParses: totalFailed,
    overall: Object.fromEntries(systems.map(s => [s, {
      accuracy: accuracyPct(results, s),
      avgTokens: avgTokens(s),
    }])),
    byCategory: Object.fromEntries(byCategory.map(([cat, rows]) =>
      [cat, Object.fromEntries(systems.map(s => [s, accuracyPct(rows, s)]))])),
    byDifficulty: Object.fromEntries(byDifficulty.map(([diff, rows]) =>
      [diff, Object.fromEntries(systems.map(s => [s, accuracyPct(rows, s)]))])),
    tokenSavingsVsCapped: tokenSavings,
  };
  writeFileSync(REPORT_PATH, JSON.stringify(report, null, 2));
  console.log(`\nReport saved: ${REPORT_PATH}`);
}
|
|
112
183
|
|
|
113
184
|
// Script entry point. On failure, log the error and set a non-zero exit code
// so shells and CI can tell the run did not complete.
main().catch((err: unknown) => {
  console.error(err);
  process.exitCode = 1;
});
|
package/server.json
ADDED
|
@@ -0,0 +1,29 @@
|
|
|
1
|
+
{
|
|
2
|
+
"$schema": "https://static.modelcontextprotocol.io/schemas/2025-12-11/server.schema.json",
|
|
3
|
+
"name": "io.github.tstockham96/engram",
|
|
4
|
+
"description": "Intelligent memory layer for AI agents. Automatic extraction, consolidation, bi-temporal recall, and proactive context surfacing. Scores 80% on LOCOMO benchmark at 93% fewer tokens than full context.",
|
|
5
|
+
"repository": {
|
|
6
|
+
"url": "https://github.com/tstockham96/engram",
|
|
7
|
+
"source": "github"
|
|
8
|
+
},
|
|
9
|
+
"version": "0.4.4",
|
|
10
|
+
"packages": [
|
|
11
|
+
{
|
|
12
|
+
"registryType": "npm",
|
|
13
|
+
"identifier": "engram-sdk",
|
|
14
|
+
"version": "0.4.4",
|
|
15
|
+
"transport": {
|
|
16
|
+
"type": "stdio"
|
|
17
|
+
},
|
|
18
|
+
"environmentVariables": [
|
|
19
|
+
{
|
|
20
|
+
"description": "Google Gemini API key for embeddings and LLM operations",
|
|
21
|
+
"isRequired": true,
|
|
22
|
+
"format": "string",
|
|
23
|
+
"isSecret": true,
|
|
24
|
+
"name": "GEMINI_API_KEY"
|
|
25
|
+
}
|
|
26
|
+
]
|
|
27
|
+
}
|
|
28
|
+
]
|
|
29
|
+
}
|