engram-sdk 0.5.5 → 0.5.6
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/BADGE.md +47 -0
- package/README.md +13 -1
- package/assets/BADGE-USAGE.md +39 -0
- package/assets/badge-made-with-engram.svg +23 -0
- package/dist/accounts.d.ts +30 -1
- package/dist/accounts.d.ts.map +1 -1
- package/dist/accounts.js +132 -1
- package/dist/accounts.js.map +1 -1
- package/dist/cli.js +257 -11
- package/dist/cli.js.map +1 -1
- package/dist/hosted-client.d.ts +71 -0
- package/dist/hosted-client.d.ts.map +1 -0
- package/dist/hosted-client.js +154 -0
- package/dist/hosted-client.js.map +1 -0
- package/dist/hosted.d.ts.map +1 -1
- package/dist/hosted.js +152 -11
- package/dist/hosted.js.map +1 -1
- package/dist/mcp.js +127 -15
- package/dist/mcp.js.map +1 -1
- package/dist/store.d.ts.map +1 -1
- package/dist/store.js +10 -3
- package/dist/store.js.map +1 -1
- package/dist/types.d.ts +4 -0
- package/dist/types.d.ts.map +1 -1
- package/dist/types.js +1 -0
- package/dist/types.js.map +1 -1
- package/eval-rescore-judge.mjs +158 -0
- package/package.json +1 -1
|
@@ -0,0 +1,158 @@
|
|
|
1
|
+
#!/usr/bin/env node
|
|
2
|
+
/**
|
|
3
|
+
* Re-score existing LOCOMO results with GPT-4o-mini judge (matches Mem0 methodology).
|
|
4
|
+
*
|
|
5
|
+
* Usage:
|
|
6
|
+
* OPENAI_API_KEY=sk-... node eval-rescore-judge.mjs # Run re-judge
|
|
7
|
+
* node eval-rescore-judge.mjs --report # Compare judges
|
|
8
|
+
*/
|
|
9
|
+
|
|
10
|
+
import { readFileSync, writeFileSync, existsSync } from 'fs';
|
|
11
|
+
import { join } from 'path';
|
|
12
|
+
import { homedir } from 'os';
|
|
13
|
+
|
|
14
|
+
// Directory holding the LOCOMO evaluation artifacts read and written by this script.
const EVAL_DIR = join(homedir(), '.openclaw/workspace/engram/eval-scale-data');
// Input: previously scored LOCOMO results.
const RESULTS_PATH = join(EVAL_DIR, 'locomo-results.json');

/**
 * Resolve the OpenAI API key: the OPENAI_API_KEY environment variable wins,
 * otherwise the trimmed contents of ~/.config/engram/openai-key are used.
 *
 * `--report` only reads local result files, so a missing key is tolerated in
 * that mode (an empty string is returned). In every other mode a missing key
 * fails fast with an actionable message instead of a raw ENOENT.
 *
 * @returns {string} The API key, or '' in --report mode when none is configured.
 * @throws {Error} When no key is configured and the script is not in --report mode.
 */
function loadOpenAIKey() {
  if (process.env.OPENAI_API_KEY) return process.env.OPENAI_API_KEY;

  const keyPath = join(homedir(), '.config/engram/openai-key');
  if (existsSync(keyPath)) {
    const fileKey = readFileSync(keyPath, 'utf8').trim();
    if (fileKey) return fileKey;
  }

  if (process.argv.includes('--report')) return '';
  throw new Error(`OpenAI API key not found: set OPENAI_API_KEY or put the key in ${keyPath}`);
}

const OPENAI_KEY = loadOpenAIKey();
// Output: the same results re-scored by the OpenAI judge (also used as the resume file).
const OUTPUT_PATH = join(EVAL_DIR, 'locomo-results-rejudge-openai.json');

// Promise-based delay, in milliseconds.
const sleep = ms => new Promise(r => setTimeout(r, ms));
|
|
20
|
+
|
|
21
|
+
/**
 * Ask GPT-4o-mini to grade a system answer against the ground truth.
 *
 * Sends one chat-completions request per attempt and expects the reply to be a
 * JSON object with a numeric `score`. Up to 5 attempts are made:
 *   - HTTP 429 waits `attempt * 15` seconds before the next attempt;
 *   - any other failure (network error, non-2xx status, unparsable reply,
 *     missing or non-numeric score) waits `attempt * 5` seconds.
 * No wait is performed after the final attempt.
 *
 * @param {string} question - The question that was asked.
 * @param {string} groundTruth - The reference answer.
 * @param {string} systemAnswer - The answer being graded.
 * @returns {Promise<number>} Score clamped to [0, 1]; 0 when every attempt fails.
 */
async function judgeOpenAI(question, groundTruth, systemAnswer) {
  const MAX_ATTEMPTS = 5;
  const prompt = `You are evaluating the quality of an AI system's answer about a conversation.

Question: ${question}
Ground Truth Answer: ${groundTruth}
System Answer: ${systemAnswer}

Rate the system's answer on a scale from 0.0 to 1.0 based on:
- Factual accuracy compared to the ground truth
- Relevance to the question asked
- Completeness of the answer
- Whether it contains any incorrect information

Provide your evaluation as a JSON object:
{"score": 0.85, "reason": "The answer correctly identifies..."}`;

  for (let attempt = 1; attempt <= MAX_ATTEMPTS; attempt++) {
    const isLastAttempt = attempt === MAX_ATTEMPTS;
    try {
      const resp = await fetch('https://api.openai.com/v1/chat/completions', {
        method: 'POST',
        headers: {
          'Content-Type': 'application/json',
          'Authorization': `Bearer ${OPENAI_KEY}`,
        },
        body: JSON.stringify({
          model: 'gpt-4o-mini',
          messages: [{ role: 'user', content: prompt }],
          temperature: 0.1,
          max_tokens: 500,
          response_format: { type: 'json_object' },
        }),
      });

      if (resp.status === 429) {
        if (isLastAttempt) {
          // Nothing left to retry, so do not sit through another back-off.
          console.warn(' [429] Rate limited on final attempt, giving up.');
          break;
        }
        const wait = attempt * 15;
        console.warn(` [429] Retry in ${wait}s...`);
        await sleep(wait * 1000);
        continue;
      }

      if (!resp.ok) {
        // Surface the real HTTP failure instead of a downstream JSON parse error.
        const detail = await resp.text().catch(() => '');
        throw new Error(`HTTP ${resp.status}${detail ? `: ${detail.slice(0, 200)}` : ''}`);
      }

      const data = await resp.json();
      const text = data.choices?.[0]?.message?.content || '';
      const rawScore = JSON.parse(text)?.score;
      // Accept numeric strings such as "0.8", but never let NaN escape into the averages.
      const score = typeof rawScore === 'string' ? Number(rawScore) : rawScore;
      if (typeof score !== 'number' || !Number.isFinite(score)) {
        throw new Error(`Judge returned a non-numeric score: ${JSON.stringify(rawScore)}`);
      }
      return Math.max(0, Math.min(1, score));
    } catch (err) {
      console.warn(` [ERR attempt ${attempt}]`, err.message);
      if (isLastAttempt) return 0;
      await sleep(attempt * 5000);
    }
  }
  return 0;
}
|
|
71
|
+
|
|
72
|
+
/**
 * Print a per-category score table for each results file (the original
 * results and the GPT-4o-mini re-judged results).
 *
 * For every file that exists, scores are grouped by category and averaged as
 * percentages with one decimal. Only the four named categories are listed and
 * folded into the "Overall" row; a file that does not exist is reported as
 * "not found".
 *
 * @returns {Promise<void>}
 */
async function report() {
  const labelsByCategory = { '1': 'Single-Hop', '2': 'Temporal', '3': 'Open-Domain', '4': 'Multi-Hop' };
  const rowOrder = ['Single-Hop', 'Temporal', 'Multi-Hop', 'Open-Domain'];
  const inputs = [
    ['Original (Gemini judge)', RESULTS_PATH],
    ['GPT-4o-mini judge', OUTPUT_PATH],
  ];
  const rule = '-'.repeat(50);

  // Mean of the scores as a percentage string, or 'N/A' for an empty list.
  const percent = (scores) => {
    if (!scores.length) return 'N/A';
    const total = scores.reduce((sum, s) => sum + s, 0);
    return (total / scores.length * 100).toFixed(1);
  };

  // One fixed-width table line; every cell is already a string.
  const line = (name, engram, full, md, count) =>
    `${name.padEnd(15)} ${engram.padStart(8)} ${full.padStart(8)} ${md.padStart(8)} ${count.padStart(6)}`;

  for (const [title, file] of inputs) {
    if (!existsSync(file)) {
      console.log(`${title}: not found`);
      continue;
    }

    const records = JSON.parse(readFileSync(file, 'utf8'));

    // Group the three systems' scores under each category's display name.
    const byCategory = {};
    for (const rec of records) {
      const rawKey = String(rec.category);
      const name = labelsByCategory[rawKey] || rawKey;
      const bucket = byCategory[name] || (byCategory[name] = { engram: [], full: [], md: [] });
      bucket.engram.push(rec.results.engram.score);
      bucket.full.push(rec.results.fullContext.score);
      bucket.md.push(rec.results.memoryMd.score);
    }

    console.log(`\n=== ${title} (${records.length} questions) ===`);
    console.log(line('Category', 'Engram', 'Full', 'MD', 'Count'));
    console.log(rule);

    const overall = { engram: [], full: [], md: [] };
    for (const name of rowOrder) {
      const bucket = byCategory[name];
      if (bucket) {
        console.log(line(name, percent(bucket.engram), percent(bucket.full), percent(bucket.md), String(bucket.engram.length)));
        overall.engram.push(...bucket.engram);
        overall.full.push(...bucket.full);
        overall.md.push(...bucket.md);
      }
    }

    console.log(rule);
    console.log(line('Overall', percent(overall.engram), percent(overall.full), percent(overall.md), String(overall.engram.length)));
  }
}
|
|
105
|
+
|
|
106
|
+
/**
 * Script entry point.
 *
 * With `--report` on the command line, delegates to report(). Otherwise it
 * re-judges every entry of RESULTS_PATH with judgeOpenAI (engram, fullContext
 * and memoryMd answers, one request at a time with a 200 ms pause after each)
 * and stores the re-scored entries in OUTPUT_PATH.
 *
 * The run is resumable: entries whose questionId is already present in
 * OUTPUT_PATH are skipped. Progress is checkpointed whenever the running count
 * reaches a multiple of 10 or the input size, and once more after the loop so
 * a trailing partial batch is never lost when the counter does not land on
 * either value (e.g. the resume file holds IDs that are not in the input).
 *
 * @returns {Promise<void>}
 */
async function main() {
  if (process.argv.includes('--report')) return report();

  console.log('Re-judging LOCOMO results with GPT-4o-mini...');
  console.log(`Input: ${RESULTS_PATH}`);
  console.log(`Output: ${OUTPUT_PATH}\n`);

  const results = JSON.parse(readFileSync(RESULTS_PATH, 'utf8'));

  // Resume support
  let rescored = [];
  const doneSet = new Set();
  if (existsSync(OUTPUT_PATH)) {
    rescored = JSON.parse(readFileSync(OUTPUT_PATH, 'utf8'));
    for (const r of rescored) doneSet.add(r.questionId);
    console.log(`Resuming: ${rescored.length}/${results.length} already done\n`);
  }

  let done = rescored.length;
  const total = results.length;
  let unsaved = 0; // entries judged since the last write to OUTPUT_PATH
  let last = null; // scores of the most recently judged entry, for the progress line

  // Persist everything judged so far and print running averages.
  const saveCheckpoint = () => {
    writeFileSync(OUTPUT_PATH, JSON.stringify(rescored, null, 2));
    unsaved = 0;
    const eAvg = (rescored.reduce((s, x) => s + x.results.engram.score, 0) / rescored.length * 100).toFixed(1);
    const fAvg = (rescored.reduce((s, x) => s + x.results.fullContext.score, 0) / rescored.length * 100).toFixed(1);
    console.log(`[${done}/${total}] Engram: ${eAvg}% | Full: ${fAvg}% | Last: E=${last.eScore.toFixed(2)} F=${last.fScore.toFixed(2)} M=${last.mScore.toFixed(2)}`);
  };

  for (const r of results) {
    if (doneSet.has(r.questionId)) continue;

    // Judge calls stay sequential with a short pause to go easy on the API rate limit.
    const eScore = await judgeOpenAI(r.question, r.groundTruth, r.results.engram.answer);
    await sleep(200);
    const fScore = await judgeOpenAI(r.question, r.groundTruth, r.results.fullContext.answer);
    await sleep(200);
    const mScore = await judgeOpenAI(r.question, r.groundTruth, r.results.memoryMd.answer);
    await sleep(200);

    rescored.push({
      ...r,
      results: {
        engram: { ...r.results.engram, score: eScore },
        fullContext: { ...r.results.fullContext, score: fScore },
        memoryMd: { ...r.results.memoryMd, score: mScore },
      },
    });
    done++;
    unsaved++;
    last = { eScore, fScore, mScore };

    if (done % 10 === 0 || done === total) saveCheckpoint();
  }

  // Final flush: covers runs where `done` never equals `total` or a multiple of 10.
  if (unsaved > 0) saveCheckpoint();

  console.log('\nDone! Run with --report to compare.');
}
|
|
157
|
+
|
|
158
|
+
// Run the script; if main() rejects, print the error and exit with status 1.
main().catch(err => { console.error(err); process.exit(1); });
|
package/package.json
CHANGED