engram-sdk 0.4.3 → 0.4.4

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
package/package.json CHANGED
@@ -1,6 +1,7 @@
1
1
  {
2
2
  "name": "engram-sdk",
3
- "version": "0.4.3",
3
+ "mcpName": "io.github.tstockham96/engram",
4
+ "version": "0.4.4",
4
5
  "description": "Universal memory layer for AI agents. Remember, recall, consolidate.",
5
6
  "type": "module",
6
7
  "main": "dist/index.js",
@@ -1,113 +1,184 @@
1
1
  #!/usr/bin/env npx tsx
2
2
  /**
3
- * Re-score existing codebase eval results using LLM-as-judge
3
+ * rescore-codebase.ts — Re-score existing codebase eval results using LLM judge
4
+ * Uses the saved answers + ground truth, just re-runs scoring
4
5
  */
5
6
 
6
7
  import { readFileSync, writeFileSync } from 'fs';
7
- import { join, dirname } from 'path';
8
- import { homedir } from 'os';
8
+ import { resolve, dirname } from 'path';
9
+ import { fileURLToPath } from 'url';
9
10
 
10
- const GEMINI_KEY = readFileSync(join(homedir(), '.config/engram/gemini-key'), 'utf8').trim();
11
- const RESULTS_PATH = join(homedir(), '.openclaw/workspace/engram/eval-scale-data/codebase-results-openclaw.json');
12
- const RATE_LIMIT_MS = 6000;
11
+ const __dirname = dirname(fileURLToPath(import.meta.url));
12
+ const GEMINI_KEY = readFileSync(resolve(process.env.HOME!, '.config/engram/gemini-key'), 'utf8').trim();
13
+ const RESULTS_PATH = resolve(__dirname, 'eval-scale-data/codebase-results-vscode.json');
14
+ const REPORT_PATH = resolve(__dirname, 'eval-scale-data/codebase-report-vscode-v2.json');
13
15
 
14
- function sleep(ms: number) { return new Promise(r => setTimeout(r, ms)); }
15
-
16
- async function geminiCall(prompt: string, maxTokens = 200, retries = 3, jsonMode = false): Promise<string> {
17
- for (let attempt = 0; attempt < retries; attempt++) {
18
- const response = await fetch(
19
- `https://generativelanguage.googleapis.com/v1beta/models/gemini-2.5-flash-lite:generateContent?key=${GEMINI_KEY}`,
20
- {
21
- method: 'POST',
22
- headers: { 'Content-Type': 'application/json' },
23
- body: JSON.stringify({
24
- contents: [{ parts: [{ text: prompt }] }],
25
- generationConfig: {
26
- maxOutputTokens: maxTokens,
27
- ...(jsonMode ? { responseMimeType: 'application/json' } : {}),
28
- },
29
- }),
30
- },
31
- );
32
- if (response.status === 429) {
33
- console.log(` Rate limited, waiting ${(attempt + 1) * 10}s...`);
34
- await sleep((attempt + 1) * 10000);
35
- continue;
16
+ async function geminiCall(prompt: string, maxTokens = 100): Promise<string> {
17
+ for (let attempt = 0; attempt < 3; attempt++) {
18
+ try {
19
+ const response = await fetch(
20
+ `https://generativelanguage.googleapis.com/v1beta/models/gemini-2.5-flash:generateContent?key=${GEMINI_KEY}`,
21
+ {
22
+ method: 'POST',
23
+ headers: { 'Content-Type': 'application/json' },
24
+ body: JSON.stringify({
25
+ contents: [{ parts: [{ text: prompt }] }],
26
+ generationConfig: { maxOutputTokens: maxTokens, temperature: 0 },
27
+ }),
28
+ }
29
+ );
30
+ if (response.status === 429) {
31
+ const retryAfter = parseInt(response.headers.get('retry-after') || '10');
32
+ console.log(` Rate limited, waiting ${retryAfter}s...`);
33
+ await new Promise(r => setTimeout(r, retryAfter * 1000));
34
+ continue;
35
+ }
36
+ if (!response.ok) {
37
+ console.log(` API error ${response.status}, retrying...`);
38
+ await new Promise(r => setTimeout(r, 2000));
39
+ continue;
40
+ }
41
+ const data = await response.json() as any;
42
+ return data.candidates?.[0]?.content?.parts?.[0]?.text ?? '';
43
+ } catch (e: any) {
44
+ console.log(` Fetch error: ${e.message}, retrying...`);
45
+ await new Promise(r => setTimeout(r, 2000));
36
46
  }
37
- if (!response.ok) throw new Error(`Gemini API error: ${response.status}`);
38
- const data = await response.json() as any;
39
- return data.candidates?.[0]?.content?.parts?.[0]?.text ?? '';
40
47
  }
41
- throw new Error('Max retries exceeded');
48
+ return '';
49
+ }
50
+
51
+ async function scoreAnswer(question: string, truth: string, answer: string): Promise<number> {
52
+ const prompt = `You are evaluating an AI's answer about a codebase. Score it from 0.0 to 1.0.
53
+
54
+ - 1.0 = Correct and complete
55
+ - 0.7 = Mostly correct, minor gaps
56
+ - 0.5 = Partially correct
57
+ - 0.3 = Mentions something relevant but mostly wrong
58
+ - 0.0 = Wrong or "I don't know"
59
+
60
+ Question: ${question}
61
+ Ground Truth: ${truth}
62
+ AI's Answer: ${answer}
63
+
64
+ Respond with ONLY a decimal number (e.g. 0.7). Nothing else.`;
65
+
66
+ const response = await geminiCall(prompt);
67
+ const cleaned = response.trim();
68
+
69
+ // Try direct float parse first
70
+ const direct = parseFloat(cleaned);
71
+ if (!isNaN(direct) && direct >= 0 && direct <= 1) return direct;
72
+
73
+ // Try regex
74
+ const match = cleaned.match(/(0\.\d+|1\.0|0|1)/);
75
+ if (match) return parseFloat(match[1]);
76
+
77
+ console.log(` Failed to parse score: "${cleaned}"`);
78
+ return -1; // Mark as failed, don't default to 0
42
79
  }
43
80
 
44
81
  async function main() {
45
82
  const results = JSON.parse(readFileSync(RESULTS_PATH, 'utf8'));
46
- console.log(`Rescoring ${results.length} codebase questions...\n`);
47
-
48
- let scored = 0;
49
- for (const r of results) {
50
- // Skip if already scored
51
- if (r.engram.correct > 0 || r.fullContext.correct > 0 || r.grepSearch?.correct > 0) {
52
- console.log(` [${r.index}] Already scored, skipping`);
53
- scored++;
54
- continue;
55
- }
56
-
57
- try {
58
- await sleep(RATE_LIMIT_MS);
59
- const scorePrompt = `Score these three answers to a codebase question on a scale of 0.0 to 1.0.
60
- 0.0 = completely wrong or irrelevant
61
- 0.5 = partially correct, missing key details
62
- 1.0 = fully correct and complete
63
-
64
- Question: ${r.question}
65
- Ground Truth: ${r.groundTruth}
66
-
67
- Answer A (Engram): ${r.engram.answer}
68
- Answer B (Full Context): ${r.fullContext.answer}
69
- Answer C (Grep Search): ${r.grepSearch.answer}
70
-
71
- Output ONLY a JSON object: {"a": <score>, "b": <score>, "c": <score>}`;
72
-
73
- const scoreResponse = await geminiCall(scorePrompt, 200, 3, true);
74
- // Aggressively extract JSON from any preamble/wrapper text
75
- const firstBrace = scoreResponse.indexOf('{');
76
- const lastBrace = scoreResponse.lastIndexOf('}');
77
- const scores = (firstBrace >= 0 && lastBrace > firstBrace)
78
- ? JSON.parse(scoreResponse.slice(firstBrace, lastBrace + 1))
79
- : null;
80
-
81
- if (!scores) {
82
- console.log(` [${r.index}] Failed to parse scores: ${scoreResponse.slice(0, 100)}`);
83
- continue;
84
- }
85
-
86
- r.engram.correct = scores.a;
87
- r.fullContext.correct = scores.b;
88
- if (r.grepSearch) r.grepSearch.correct = scores.c;
89
-
90
- scored++;
91
- console.log(` [${r.index}] E: ${scores.a.toFixed(1)} | F: ${scores.b.toFixed(1)} | G: ${scores.c.toFixed(1)} — ${r.question.slice(0, 60)}...`);
92
-
93
- // Auto-save every 5
94
- if (scored % 5 === 0) {
95
- writeFileSync(RESULTS_PATH, JSON.stringify(results, null, 2));
83
+ console.log(`Rescoring ${results.length} results...\n`);
84
+
85
+ const systems = ['engram', 'cappedContext', 'naiveRag', 'grepSearch'] as const;
86
+ let totalScored = 0;
87
+ let totalFailed = 0;
88
+
89
+ for (let i = 0; i < results.length; i++) {
90
+ const r = results[i];
91
+ console.log(`[${i+1}/${results.length}] (${r.category}/${r.difficulty}) ${r.question.slice(0, 70)}...`);
92
+
93
+ const scores: Record<string, number> = {};
94
+ for (const sys of systems) {
95
+ if (!r[sys]?.answer) { scores[sys] = 0; continue; }
96
+ const score = await scoreAnswer(r.question, r.groundTruth, r[sys].answer);
97
+ if (score === -1) {
98
+ totalFailed++;
99
+ scores[sys] = 0;
100
+ } else {
101
+ scores[sys] = score;
96
102
  }
97
- } catch (err: any) {
98
- console.error(` [${r.index}] Error: ${err.message}`);
99
- await sleep(15000);
103
+ r[sys].score = scores[sys];
104
+ }
105
+
106
+ totalScored += systems.length;
107
+ const line = systems.map(s => `${s[0].toUpperCase()}:${scores[s].toFixed(2)}`).join(' ');
108
+ console.log(` ${line}`);
109
+
110
+ // Save progress every 5 questions
111
+ if ((i + 1) % 5 === 0 || i === results.length - 1) {
112
+ writeFileSync(RESULTS_PATH.replace('.json', '-rescored2.json'), JSON.stringify(results, null, 2));
100
113
  }
101
114
  }
102
-
103
- writeFileSync(RESULTS_PATH, JSON.stringify(results, null, 2));
104
- console.log(`\n✅ Rescored ${scored}/${results.length} questions`);
105
-
106
- // Quick summary
107
- const avgE = results.reduce((s: number, r: any) => s + r.engram.correct, 0) / results.length * 100;
108
- const avgF = results.reduce((s: number, r: any) => s + r.fullContext.correct, 0) / results.length * 100;
109
- const avgG = results.reduce((s: number, r: any) => s + (r.grepSearch?.correct || 0), 0) / results.length * 100;
110
- console.log(`\nEngram: ${avgE.toFixed(1)}% | Full Context: ${avgF.toFixed(1)}% | Grep: ${avgG.toFixed(1)}%`);
115
+
116
+ console.log(`\nScored: ${totalScored}, Failed parses: ${totalFailed}\n`);
117
+
118
+ // Generate report
119
+ const avg = (sys: string) => {
120
+ const vals = results.map((r: any) => r[sys]?.score ?? 0);
121
+ return vals.reduce((a: number, b: number) => a + b, 0) / vals.length;
122
+ };
123
+ const avgTokens = (sys: string) => {
124
+ const vals = results.map((r: any) => r[sys]?.tokensUsed ?? 0);
125
+ return Math.round(vals.reduce((a: number, b: number) => a + b, 0) / vals.length);
126
+ };
127
+
128
+ console.log('=== VS Code Codebase Evaluation Report ===\n');
129
+ console.log('OVERALL (50 questions)');
130
+ console.log(`${'System'.padEnd(20)} ${'Accuracy'.padEnd(12)} Avg Tokens`);
131
+ for (const sys of systems) {
132
+ const acc = (avg(sys) * 100).toFixed(1);
133
+ console.log(`${sys.padEnd(20)} ${(acc + '%').padEnd(12)} ${avgTokens(sys)}`);
134
+ }
135
+
136
+ // Per category
137
+ const categories = [...new Set(results.map((r: any) => r.category))];
138
+ for (const cat of categories) {
139
+ const catResults = results.filter((r: any) => r.category === cat);
140
+ const catAvg = (sys: string) => {
141
+ const vals = catResults.map((r: any) => r[sys]?.score ?? 0);
142
+ return (vals.reduce((a: number, b: number) => a + b, 0) / vals.length * 100).toFixed(1);
143
+ };
144
+ console.log(`\n ${cat.toUpperCase()} (n=${catResults.length}): ${systems.map(s => `${s[0].toUpperCase()}:${catAvg(s)}%`).join(' ')}`);
145
+ }
146
+
147
+ // Per difficulty
148
+ const diffs = [...new Set(results.map((r: any) => r.difficulty))];
149
+ for (const diff of diffs) {
150
+ const diffResults = results.filter((r: any) => r.difficulty === diff);
151
+ const diffAvg = (sys: string) => {
152
+ const vals = diffResults.map((r: any) => r[sys]?.score ?? 0);
153
+ return (vals.reduce((a: number, b: number) => a + b, 0) / vals.length * 100).toFixed(1);
154
+ };
155
+ console.log(`\n ${diff.toUpperCase()} (n=${diffResults.length}): ${systems.map(s => `${s[0].toUpperCase()}:${diffAvg(s)}%`).join(' ')}`);
156
+ }
157
+
158
+ const tokenSavings = (1 - avgTokens('engram') / avgTokens('cappedContext')) * 100;
159
+ console.log(`\n Token savings vs capped context: ${tokenSavings.toFixed(1)}%`);
160
+
161
+ // Save report
162
+ const report = {
163
+ timestamp: new Date().toISOString(),
164
+ totalQuestions: results.length,
165
+ failedParses: totalFailed,
166
+ overall: Object.fromEntries(systems.map(s => [s, {
167
+ accuracy: (avg(s) * 100).toFixed(1),
168
+ avgTokens: avgTokens(s),
169
+ }])),
170
+ byCategory: Object.fromEntries(categories.map(c => {
171
+ const cr = results.filter((r: any) => r.category === c);
172
+ return [c, Object.fromEntries(systems.map(s => [s, (cr.reduce((a: number, r: any) => a + (r[s]?.score ?? 0), 0) / cr.length * 100).toFixed(1)]))];
173
+ })),
174
+ byDifficulty: Object.fromEntries(diffs.map(d => {
175
+ const dr = results.filter((r: any) => r.difficulty === d);
176
+ return [d, Object.fromEntries(systems.map(s => [s, (dr.reduce((a: number, r: any) => a + (r[s]?.score ?? 0), 0) / dr.length * 100).toFixed(1)]))];
177
+ })),
178
+ tokenSavingsVsCapped: tokenSavings.toFixed(1) + '%',
179
+ };
180
+ writeFileSync(REPORT_PATH, JSON.stringify(report, null, 2));
181
+ console.log(`\nReport saved: ${REPORT_PATH}`);
111
182
  }
112
183
 
113
184
  main().catch(console.error);
package/server.json ADDED
@@ -0,0 +1,29 @@
1
+ {
2
+ "$schema": "https://static.modelcontextprotocol.io/schemas/2025-12-11/server.schema.json",
3
+ "name": "io.github.tstockham96/engram",
4
+ "description": "Intelligent memory layer for AI agents. Automatic extraction, consolidation, bi-temporal recall, and proactive context surfacing. Scores 80% on LOCOMO benchmark at 93% fewer tokens than full context.",
5
+ "repository": {
6
+ "url": "https://github.com/tstockham96/engram",
7
+ "source": "github"
8
+ },
9
+ "version": "0.4.4",
10
+ "packages": [
11
+ {
12
+ "registryType": "npm",
13
+ "identifier": "engram-sdk",
14
+ "version": "0.4.4",
15
+ "transport": {
16
+ "type": "stdio"
17
+ },
18
+ "environmentVariables": [
19
+ {
20
+ "description": "Google Gemini API key for embeddings and LLM operations",
21
+ "isRequired": true,
22
+ "format": "string",
23
+ "isSecret": true,
24
+ "name": "GEMINI_API_KEY"
25
+ }
26
+ ]
27
+ }
28
+ ]
29
+ }