@eidentic/bench 0.1.1 → 0.1.2
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/dist/chunk-KOMVTEKE.js +98 -0
- package/dist/index.cjs +684 -3
- package/dist/index.d.cts +281 -2
- package/dist/index.d.ts +281 -2
- package/dist/index.js +575 -3
- package/dist/lme-loader-WSJ72GEP.js +10 -0
- package/package.json +4 -4
package/dist/index.cjs
CHANGED
|
@@ -136,22 +136,132 @@ var init_locomo_loader = __esm({
|
|
|
136
136
|
}
|
|
137
137
|
});
|
|
138
138
|
|
|
139
|
+
// src/lme-loader.ts
|
|
140
|
+
var lme_loader_exports = {};
|
|
141
|
+
__export(lme_loader_exports, {
|
|
142
|
+
LONGMEMEVAL_SOURCE: () => LONGMEMEVAL_SOURCE,
|
|
143
|
+
loadLongMemEval: () => loadLongMemEval2,
|
|
144
|
+
parseLmeDateTimeString: () => parseLmeDateTimeString
|
|
145
|
+
});
|
|
146
|
+
/**
 * Reject files that exceed a byte budget before they are read into memory.
 *
 * @param {string} filePath - Path of the file to check.
 * @param {number} maxBytes - Largest accepted size in bytes.
 * @returns {Promise<void>} Resolves when the file can be stat'ed and fits the budget.
 * @throws {Error} When the file cannot be stat'ed (the underlying error is
 *   attached as `cause`) or when it is larger than `maxBytes`.
 */
async function assertFileSize3(filePath, maxBytes) {
  let fileSize;
  try {
    const stats = await import_promises4.stat(filePath);
    fileSize = stats.size;
  } catch (err) {
    // Keep the original error reachable for callers that inspect `code`/stack.
    const reason = err instanceof Error ? err.message : String(err);
    throw new Error(
      `bench loader: cannot stat file "${filePath}": ${reason}`,
      { cause: err }
    );
  }
  if (fileSize > maxBytes) {
    const bytesPerMiB = 1024 * 1024;
    const mb = (fileSize / bytesPerMiB).toFixed(1);
    const capMb = (maxBytes / bytesPerMiB).toFixed(0);
    throw new Error(
      `bench loader: file "${filePath}" is ${mb} MiB, which exceeds the ${capMb} MiB cap. Pass a larger maxBytes option if this is intentional.`
    );
  }
}
|
|
164
|
+
/**
 * Convert a LongMemEval timestamp such as "2023/05/20 (Sat) 02:21" to epoch
 * milliseconds.
 *
 * The dataset form (`YYYY/MM/DD`, optional "(Day)" marker, optional
 * `HH:MM[:SS]`) is parsed explicitly and interpreted as UTC, so the result does
 * not depend on the host time zone. Handing a non-ISO "YYYY-MM-DD HH:MM" string
 * to `Date.parse` is implementation-defined and is read as local time by V8,
 * while the date-only form is read as UTC. Any other input falls back to
 * `Date.parse`.
 *
 * @param {string} raw - Timestamp text; non-strings and "" are treated as missing.
 * @returns {number} Epoch milliseconds, or 0 when the value is missing or unparseable.
 */
function parseLmeDateTimeString(raw) {
  if (typeof raw !== "string" || raw === "") return 0;
  // Drop the weekday marker, e.g. " (Sat) ".
  const cleaned = raw.replace(/\s*\([A-Za-z]+\)\s*/, " ").trim();
  const match = /^(\d{4})\/(\d{2})\/(\d{2})(?:\s+(\d{1,2}):(\d{2})(?::(\d{2}))?)?$/.exec(cleaned);
  if (match) {
    const year = Number(match[1]);
    const month = Number(match[2]);
    const day = Number(match[3]);
    const hour = Number(match[4] ?? 0);
    const minute = Number(match[5] ?? 0);
    const second = Number(match[6] ?? 0);
    const ms = Date.UTC(year, month - 1, day, hour, minute, second);
    // Date.UTC silently rolls over out-of-range fields (month 13, day 45, ...);
    // a round-trip comparison rejects those instead of returning a shifted date.
    const check = new Date(ms);
    const isExact =
      check.getUTCFullYear() === year &&
      check.getUTCMonth() === month - 1 &&
      check.getUTCDate() === day &&
      check.getUTCHours() === hour &&
      check.getUTCMinutes() === minute &&
      check.getUTCSeconds() === second;
    return isExact ? ms : 0;
  }
  const iso = cleaned.replace(/^(\d{4})\/(\d{2})\/(\d{2})/, "$1-$2-$3");
  const parsed = Date.parse(iso);
  return Number.isNaN(parsed) ? 0 : parsed;
}
|
|
172
|
+
/**
 * Strip the "_abs" abstention marker from a question type.
 *
 * @param {string} rawType - Question type as stored in the dataset.
 * @returns {string} The type without a trailing "_abs"; unchanged otherwise.
 */
function extractBaseType(rawType) {
  const abstentionSuffix = "_abs";
  return rawType.endsWith(abstentionSuffix)
    ? rawType.slice(0, -abstentionSuffix.length)
    : rawType;
}
|
|
178
|
+
/**
 * Build a normalized session record from raw dataset turns.
 *
 * @param {string} id - Session identifier.
 * @param {string} dateTime - Raw session timestamp text (may be "").
 * @param {Array<object>} rawTurns - Raw turns with `role`, `content`, `has_answer`.
 * @returns {{id: string, dateTime: string, dateTimeMs: number, turns: Array<object>}}
 *   Session with the parsed timestamp and normalized turns.
 */
function parseSession(id, dateTime, rawTurns) {
  // Any role other than "assistant" is treated as the user; a missing content
  // becomes "" and only a literal `true` counts as an answer-bearing turn.
  const normalizeTurn = ({ role, content, has_answer: hasAnswerFlag }) => ({
    role: role === "assistant" ? "assistant" : "user",
    content: content ?? "",
    hasAnswer: hasAnswerFlag === true
  });
  const dateTimeMs = parseLmeDateTimeString(dateTime);
  return {
    id,
    dateTime,
    dateTimeMs,
    turns: rawTurns.map((rawTurn) => normalizeTurn(rawTurn))
  };
}
|
|
191
|
+
/**
 * Load a LongMemEval JSON file and normalize it into benchmark questions.
 *
 * Missing fields fall back to defaults (id = array index, type =
 * "single-session-user", empty strings / arrays). Sessions of each question are
 * sorted by ascending parsed timestamp; sessions whose date cannot be parsed
 * carry 0 and therefore sort first.
 *
 * @param {string} jsonPath - Path to the dataset JSON file.
 * @param {{maxBytes?: number}} [opts] - Optional file-size cap in bytes.
 * @returns {Promise<{questions: Array<object>}>} Normalized dataset.
 * @throws {Error} When the file is too large or unreadable, is not valid JSON,
 *   its root is not an array, or an entry is not an object.
 */
async function loadLongMemEval2(jsonPath, opts) {
  await assertFileSize3(jsonPath, opts?.maxBytes ?? DEFAULT_MAX_BYTES3);
  const fileText = await import_promises4.readFile(jsonPath, "utf-8");
  let raw;
  try {
    raw = JSON.parse(fileText);
  } catch (err) {
    // Surface which file is broken instead of a bare SyntaxError.
    const reason = err instanceof Error ? err.message : String(err);
    throw new Error(
      `bench loader: file "${jsonPath}" is not valid JSON: ${reason}`,
      { cause: err }
    );
  }
  if (!Array.isArray(raw)) {
    // `typeof null` is "object", which would make the message misleading.
    const actual = raw === null ? "null" : typeof raw;
    throw new Error(
      `bench loader: expected the LongMemEval JSON root to be an array, but got ${actual}. Did you pass the correct file?`
    );
  }
  const asArray = (value) => (Array.isArray(value) ? value : []);
  const questions = raw.map((q, i) => {
    if (q === null || typeof q !== "object") {
      throw new Error(
        `bench loader: LongMemEval entry at index ${i} is not an object. Did you pass the correct file?`
      );
    }
    // A non-string type would crash on `.endsWith`; use the same default as a missing one.
    const rawType = typeof q.question_type === "string" ? q.question_type : "single-session-user";
    const dates = asArray(q.haystack_dates);
    const sessionIds = asArray(q.haystack_session_ids);
    const sessions = asArray(q.haystack_sessions).map(
      (turns, idx) => parseSession(sessionIds[idx] ?? `sess-${idx}`, dates[idx] ?? "", asArray(turns))
    );
    sessions.sort((a, b) => a.dateTimeMs - b.dateTimeMs);
    const questionDate = q.question_date ?? "";
    return {
      id: q.question_id ?? String(i),
      type: rawType,
      baseType: extractBaseType(rawType),
      isAbstention: rawType.endsWith("_abs"),
      question: q.question ?? "",
      answer: q.answer ?? "",
      questionDate,
      questionDateMs: parseLmeDateTimeString(questionDate),
      sessions,
      answerSessionIds: asArray(q.answer_session_ids)
    };
  });
  return { questions };
}
|
|
229
|
+
// Bindings of src/lme-loader.ts. They are assigned inside the initializer
// below and are undefined until init_lme_loader() has been called.
var import_promises4;
var LONGMEMEVAL_SOURCE;
var DEFAULT_MAX_BYTES3;
var init_lme_loader = __esm({
  "src/lme-loader.ts"() {
    "use strict";
    import_promises4 = require("node:fs/promises");
    // Provenance of the dataset snapshot that results are reported against.
    LONGMEMEVAL_SOURCE = {
      url: "https://huggingface.co/datasets/xiaowu0162/longmemeval",
      snapshotSha: "2ec2a557f339b6c0369619b1ed5793734cc87533",
      file: "longmemeval_s",
      license: "MIT"
    };
    // Default size cap for dataset files: 512 MiB.
    DEFAULT_MAX_BYTES3 = 512 * 1024 ** 2;
  }
});
|
|
243
|
+
|
|
139
244
|
// src/index.ts
|
|
140
245
|
var index_exports = {};
|
|
141
246
|
__export(index_exports, {
|
|
142
247
|
CONTRADICTION_FIXTURES: () => CONTRADICTION_FIXTURES,
|
|
143
248
|
JUNK_STREAM_FIXTURES: () => JUNK_STREAM_FIXTURES,
|
|
144
249
|
LOCOMO_SOURCE_SHA: () => LOCOMO_SOURCE_SHA,
|
|
250
|
+
LONGMEMEVAL_SOURCE: () => LONGMEMEVAL_SOURCE,
|
|
145
251
|
factRecall: () => factRecall,
|
|
146
252
|
loadLoCoMo: () => loadLoCoMo2,
|
|
147
253
|
loadLoCoMoLegacy: () => loadLoCoMo,
|
|
148
|
-
loadLongMemEval: () =>
|
|
254
|
+
loadLongMemEval: () => loadLongMemEval2,
|
|
255
|
+
loadLongMemEvalLegacy: () => loadLongMemEval,
|
|
149
256
|
normalizeText: () => normalizeText,
|
|
150
257
|
normalizedIncludes: () => normalizedIncludes,
|
|
258
|
+
parseLmeDateTimeString: () => parseLmeDateTimeString,
|
|
151
259
|
recallAtK: () => recallAtK,
|
|
152
260
|
renderLocomoReportMarkdown: () => renderLocomoReportMarkdown,
|
|
261
|
+
renderLongMemEvalReportMarkdown: () => renderLongMemEvalReportMarkdown,
|
|
153
262
|
resolveEvidence: () => resolveEvidence,
|
|
154
263
|
runLocomoBench: () => runLocomoBench,
|
|
264
|
+
runLongMemEvalBench: () => runLongMemEvalBench,
|
|
155
265
|
runMemoryBench: () => runMemoryBench,
|
|
156
266
|
runTemporalBench: () => runTemporalBench,
|
|
157
267
|
runWriteQualityBench: () => runWriteQualityBench,
|
|
@@ -1094,6 +1204,572 @@ function resolveEvidence(sample, diaIds) {
|
|
|
1094
1204
|
return results;
|
|
1095
1205
|
}
|
|
1096
1206
|
|
|
1207
|
+
// src/index.ts
|
|
1208
|
+
init_lme_loader();
|
|
1209
|
+
|
|
1210
|
+
// src/lme-run.ts
|
|
1211
|
+
var import_promises5 = require("node:fs/promises");
|
|
1212
|
+
var import_node_fs2 = require("node:fs");
|
|
1213
|
+
init_lme_loader();
|
|
1214
|
+
/**
 * Create a deterministic pseudo-random generator (32-bit xorshift with the
 * 13 / 17 / 5 shift triple).
 *
 * @param {number} seed - Seed, coerced to an unsigned 32-bit integer; 0 is replaced by 1.
 * @returns {() => number} Function returning the next value as `state / 2^32`.
 */
function makeRng2(seed) {
  // An all-zero state would stay zero forever, so 0 is remapped to 1.
  let state = (seed >>> 0) || 1;
  return function next() {
    state = (state ^ (state << 13)) >>> 0;
    state = (state ^ (state >>> 17)) >>> 0;
    state = (state ^ (state << 5)) >>> 0;
    return state / 4294967296;
  };
}
|
|
1225
|
+
/**
 * Return a shuffled copy of `arr` (Fisher-Yates, walking from the end), using
 * the supplied generator so the order is reproducible for a given seed.
 *
 * @param {Array} arr - Items to shuffle; not modified.
 * @param {() => number} rng - Generator returning values in [0, 1).
 * @returns {Array} New array holding the same items in shuffled order.
 */
function seededShuffle2(arr, rng) {
  const shuffled = [...arr];
  let last = shuffled.length - 1;
  while (last > 0) {
    const pick = Math.floor(rng() * (last + 1));
    const held = shuffled[last];
    shuffled[last] = shuffled[pick];
    shuffled[pick] = held;
    last -= 1;
  }
  return shuffled;
}
|
|
1233
|
+
var DEFAULT_FULL_CONTEXT_MAX_CHARS = 48e4;
|
|
1234
|
+
// Phrases that indicate the model declined to answer. "don't" is accepted with
// a straight apostrophe, a typographic one (U+2019), or none at all.
var DECLINE_PATTERNS2 = /\bno information available\b|\bi (don['\u2019]?t|do not) (know|have)\b|\bcannot (find|answer|determine|provide)\b|\bnot (mentioned|stated|provided|found|available)\b|\bunable to (find|answer|determine)\b|\bno relevant (information|data)\b/i;
/**
 * Heuristically detect whether an answer reads as a refusal / "no information".
 *
 * @param {string} answer - Model answer text.
 * @returns {boolean} True when the text matches one of the decline phrases.
 */
function appearsToDecline2(answer) {
  return DECLINE_PATTERNS2.test(answer);
}
|
|
1238
|
+
// Closing instruction shared by the answer prompts.
var ANSWER_INSTRUCTION2 = "Answer the question concisely and specifically based only on the information provided. If the provided context does not contain the answer, reply exactly: No information available";
/**
 * Build the answer prompt for memory mode from retrieved snippets.
 *
 * @param {string[]} snippets - Retrieved texts, numbered from [1] in the prompt.
 * @param {string} question - Question to answer.
 * @param {string} currentDate - Question date; omitted from the prompt when falsy.
 * @returns {string} Prompt text.
 */
function buildMemoryAnswerPrompt2(snippets, question, currentDate) {
  let context = "(no relevant context retrieved)";
  if (snippets.length > 0) {
    context = snippets
      .map((snippet, index) => `[${index + 1}] ${snippet}`)
      .join("\n\n");
  }
  const dateNote = currentDate ? `\nCurrent date: ${currentDate}` : "";
  const promptLines = [
    "Context from conversation history:",
    `${context}${dateNote}`,
    "",
    `Question: ${question}`,
    "",
    ANSWER_INSTRUCTION2
  ];
  return promptLines.join("\n");
}
|
|
1250
|
+
/**
 * Build the answer prompt for full-context mode.
 *
 * @param {string} haystackText - Rendered conversation history.
 * @param {string} question - Question to answer.
 * @param {string} currentDate - Question date; omitted from the prompt when falsy.
 * @returns {string} Prompt text.
 */
function buildFullContextAnswerPrompt2(haystackText, question, currentDate) {
  const dateNote = currentDate ? `\nCurrent date: ${currentDate}` : "";
  const promptLines = [
    "Conversation history:",
    `${haystackText}${dateNote}`,
    "",
    `Question: ${question}`,
    "",
    `${ANSWER_INSTRUCTION2}`
  ];
  return promptLines.join("\n");
}
|
|
1260
|
+
/**
 * Build the grading prompt sent to the judge model.
 *
 * Abstention questions are graded on whether the model declined; the gold
 * answer is not part of that prompt. All other questions are graded against the
 * gold answer, with an extra leniency note for "temporal-reasoning".
 *
 * @param {string} question - Question text.
 * @param {string} goldAnswer - Reference answer (unused for abstention questions).
 * @param {string} modelAnswer - Answer produced by the model under test.
 * @param {string} questionType - Base question type.
 * @param {boolean} isAbstention - Whether this is an abstention question.
 * @returns {string} Prompt asking for a JSON verdict `{"correct": boolean}`.
 */
function buildJudgePrompt2(question, goldAnswer, modelAnswer, questionType, isAbstention) {
  if (isAbstention) {
    return [
      `You are a strict judge. This question requires the model to recognize that the information is not present in the conversation history, or that the premise is flawed. A CORRECT response means the model abstained, said something equivalent to "No information available", "I don't know", or identified that the premise is wrong. An INCORRECT response means the model fabricated a specific, concrete answer.`,
      "",
      `Question: ${question}`,
      `Model answer: ${modelAnswer}`,
      "",
      `Respond with JSON only: {"correct": true} if the model appropriately abstained, {"correct": false} if it fabricated a specific answer.`
    ].join("\n");
  }
  let temporalNote = "";
  if (questionType === "temporal-reasoning") {
    temporalNote = "\nFor time/date questions: equivalent date expressions referring to the same date or duration are correct.";
  }
  return [
    "You are a strict judge evaluating whether a model answer is correct.",
    `CORRECT: the model answer contains the specific information in the gold answer (exact match or clear paraphrase is fine).${temporalNote}`,
    `WRONG: the model answer is vague, only topically related, contradicts the gold answer, or says "no information" when a specific answer exists.`,
    "",
    `Question: ${question}`,
    `Gold answer: ${goldAnswer}`,
    `Model answer: ${modelAnswer}`,
    "",
    `Respond with JSON only: {"correct": true} or {"correct": false}`
  ].join("\n");
}
|
|
1280
|
+
/**
 * Render sessions as plain text: a "Session N — <date or id>" header, one
 * "[User]: ..." / "[Assistant]: ..." line per turn, and a blank line between
 * sessions.
 *
 * @param {Array<{id: string, dateTime: string, turns: Array<{role: string, content: string}>}>} sessions
 *   Sessions in the order they should appear.
 * @returns {string} Rendered transcript, trimmed of surrounding whitespace.
 */
function renderHaystack(sessions) {
  const rendered = sessions.flatMap((sess, index) => {
    // Sessions without a date string are labelled by id instead.
    const header = `Session ${index + 1} \u2014 ${sess.dateTime || sess.id}`;
    const turnLines = sess.turns.map((turn) => {
      const speaker = turn.role === "user" ? "User" : "Assistant";
      return `[${speaker}]: ${turn.content}`;
    });
    return [header, ...turnLines, ""];
  });
  return rendered.join("\n").trim();
}
|
|
1294
|
+
/**
 * Render sessions within a character budget, dropping sessions from the front
 * of the list until the text fits.
 *
 * If even the last session alone is too long, its text is cut to `maxChars`.
 * Remaining sessions are renumbered from 1 because each attempt re-renders the
 * shortened list.
 *
 * @param {Array<object>} sessions - Sessions to render.
 * @param {number} maxChars - Maximum length of the returned text.
 * @returns {{text: string, truncated: boolean}} Rendered text and whether anything was dropped or cut.
 */
function renderHaystackCapped(sessions, maxChars) {
  const complete = renderHaystack(sessions);
  if (complete.length <= maxChars) {
    return { text: complete, truncated: false };
  }
  for (let firstKept = 1; firstKept < sessions.length; firstKept++) {
    const candidate = renderHaystack(sessions.slice(firstKept));
    if (candidate.length <= maxChars) {
      return { text: candidate, truncated: true };
    }
  }
  // Nothing fits whole: hard-cut the final session (or the empty list).
  const lastOnly = sessions.slice(-1);
  return { text: renderHaystack(lastOnly).slice(0, maxChars), truncated: true };
}
|
|
1305
|
+
// Maximum number of characters kept for a single ingested text.
var EMBED_CHAR_CAP = 20000;
/**
 * Cut a text to at most EMBED_CHAR_CAP characters.
 *
 * @param {string} text - Text about to be ingested.
 * @returns {string} The text itself when short enough, otherwise its leading slice.
 */
function capForEmbedding(text) {
  if (text.length > EMBED_CHAR_CAP) {
    return text.slice(0, EMBED_CHAR_CAP);
  }
  return text;
}
|
|
1309
|
+
/**
 * Ingest one question's haystack into a memory instance.
 *
 * For every session, one event per turn (prefixed with the session label) is
 * emitted, followed by one session-level "chunk" event holding the whole
 * session text. All events are handed to `memory.ingest` in a single call.
 *
 * @param {object} question - Normalized question with `id` and `sessions`.
 * @param {{ingest: (events: Array<object>) => Promise<unknown>}} memory - Target memory.
 * @param {object} scope - Scope attached to every event.
 * @returns {Promise<void>}
 */
async function ingestQuestionIntoMemory(question, memory, scope) {
  const speakerOf = (role) => (role === "user" ? "User" : "Assistant");
  const events = question.sessions.flatMap((sess, sessIdx) => {
    const sessLabel = `Session ${sessIdx + 1} \u2014 ${sess.dateTime || sess.id}`;
    // A timestamp of 0 (unparsed date) is reported as "not set".
    const ingestedAt = sess.dateTimeMs || void 0;
    const turnEvents = sess.turns.map((turn, turnIdx) => ({
      id: `${question.id}:sess${sessIdx}:turn${turnIdx}`,
      scope,
      text: capForEmbedding(`[${sessLabel}] [${speakerOf(turn.role)}]: ${turn.content}`),
      metadata: {
        sessionId: sess.id,
        sessionIndex: sessIdx,
        turnRole: turn.role,
        ingestedAt
      }
    }));
    const transcript = sess.turns.map((turn) => `[${speakerOf(turn.role)}]: ${turn.content}`);
    const chunkEvent = {
      id: `${question.id}:sess${sessIdx}:chunk`,
      scope,
      text: capForEmbedding([sessLabel, ...transcript].join("\n")),
      metadata: {
        sessionId: sess.id,
        sessionIndex: sessIdx,
        ingestedAt
      }
    };
    return [...turnEvents, chunkEvent];
  });
  await memory.ingest(events);
}
|
|
1346
|
+
/**
 * Ask the judge model for a verdict and normalize it to a boolean.
 *
 * Verdict resolution, in order:
 *   1. `response.object.correct` when the model returned a structured object.
 *   2. A `"correct": true|false` pair found in the text blocks.
 *   3. Last resort: the standalone word "true", but only when the word "false"
 *      does not also appear. Ambiguous or empty text counts as incorrect, so
 *      replies such as "untrue" or "true or false" no longer pass.
 *
 * @param {{complete: (req: object) => Promise<object>}} judgeModel - Judge model client.
 * @param {string} prompt - Grading prompt.
 * @returns {Promise<{correct: boolean, inputTokens: number, outputTokens: number}>}
 *   Verdict plus token usage (0 when the response carries no usage).
 */
async function callJudge2(judgeModel, prompt) {
  const response = await judgeModel.complete({
    messages: [{ role: "user", content: prompt }],
    tools: [],
    outputSchema: {
      type: "object",
      properties: { correct: { type: "boolean" } },
      required: ["correct"],
      // OpenAI strict structured-output mode requires this to be explicit.
      additionalProperties: false
    }
  });
  let correct;
  if (response.object && typeof response.object.correct === "boolean") {
    correct = response.object.correct;
  } else {
    // A response without a content array is treated as empty text.
    const blocks = Array.isArray(response.content) ? response.content : [];
    const text = blocks
      .filter((block) => block?.type === "text")
      .map((block) => block.text ?? "")
      .join("")
      .toLowerCase();
    if (/"correct"\s*:\s*true/.test(text)) {
      correct = true;
    } else if (/"correct"\s*:\s*false/.test(text)) {
      correct = false;
    } else {
      correct = /\btrue\b/.test(text) && !/\bfalse\b/.test(text);
    }
  }
  return {
    correct,
    inputTokens: response.usage?.inputTokens ?? 0,
    outputTokens: response.usage?.outputTokens ?? 0
  };
}
|
|
1373
|
+
/**
 * Read a JSONL checkpoint and collect the ids of questions already processed.
 *
 * The file is read in one step instead of an exists-check followed by a read,
 * so a file that disappears in between cannot cause a spurious failure. A
 * missing file yields an empty set; other read errors are rethrown.
 *
 * @param {string} path - Checkpoint file path.
 * @returns {Promise<Set<string>>} Ids found in rows that carry a truthy `questionId`.
 * @throws {Error} When the file exists but cannot be read.
 */
async function loadCheckpoint2(path) {
  const done = new Set();
  let raw;
  try {
    raw = await import_promises5.readFile(path, "utf-8");
  } catch (err) {
    // No checkpoint yet (or its parent path is missing): nothing is done.
    if (err?.code === "ENOENT" || err?.code === "ENOTDIR") return done;
    throw err;
  }
  for (const line of raw.split("\n")) {
    const trimmed = line.trim();
    if (!trimmed) continue;
    try {
      const row = JSON.parse(trimmed);
      if (row.questionId) done.add(row.questionId);
    } catch {
      // Best effort: a torn or corrupt line (e.g. from an interrupted append)
      // is skipped, so that question is simply processed again.
    }
  }
  return done;
}
|
|
1388
|
+
/**
 * Append one result row to the JSONL checkpoint file.
 *
 * @param {string} path - Checkpoint file path.
 * @param {object} row - Result row, written as a single JSON line.
 * @returns {Promise<void>}
 */
async function appendCheckpointRow2(path, row) {
  const line = `${JSON.stringify(row)}\n`;
  await import_promises5.appendFile(path, line, "utf-8");
}
|
|
1391
|
+
/**
 * Run the LongMemEval benchmark and aggregate the results.
 *
 * Each selected question is answered either from memory (`mode === "memory"`:
 * a memory created by `opts.memoryFactory` is filled with the question's
 * haystack and queried with at most 10 snippets) or, for any other mode, from
 * the rendered haystack capped at `fullContextMaxChars`. The answer is then
 * graded by the judge model. A failure on one question is recorded on its row
 * (`error`, `correct: false`) and does not abort the run.
 *
 * Checkpointing: when `checkpointPath` is set, questions whose id is already in
 * the checkpoint are skipped and their stored rows are merged into the report;
 * every newly processed row is appended to the file. Only rows that belong to
 * the currently selected questions are merged, once per question id, and
 * `onProgress` receives the number of questions actually left to process.
 *
 * @param {object} opts - Run options.
 * @param {object} opts.answerModel - Model under test (`complete`, optional `modelId`).
 * @param {object} opts.judgeModel - Grading model (`complete`, optional `modelId`).
 * @param {string} opts.mode - "memory" or a full-context mode.
 * @param {Function} [opts.memoryFactory] - Creates a memory per question id; required in memory mode.
 * @param {object} [opts.dataset] - Pre-loaded dataset; takes precedence over `dataPath`.
 * @param {string} [opts.dataPath] - Path of the dataset JSON file.
 * @param {string[]} [opts.types] - Keep only questions whose type or base type is listed.
 * @param {number} [opts.questionLimit] - Evaluate a seeded random subset of this size.
 * @param {number} [opts.seed=42] - Seed for the subset selection.
 * @param {number} [opts.concurrency=1] - Questions processed per batch.
 * @param {number} [opts.topK=10] - Snippets to retrieve; values above 10 are clamped to 10.
 * @param {(done: number, total: number) => void} [opts.onProgress] - Called after each processed question.
 * @param {string} [opts.checkpointPath] - JSONL checkpoint file.
 * @param {number} [opts.fullContextMaxChars] - Character cap for the full-context haystack.
 * @returns {Promise<object>} Report with `config`, `overall`, `byType`, optional
 *   `abstentionAccuracy`, `tokens`, `wallClockMs`, `questions` and `errorCount`.
 * @throws {Error} When memory mode lacks a `memoryFactory`, or neither `dataset`
 *   nor `dataPath` is given.
 */
async function runLongMemEvalBench(opts) {
  const {
    answerModel,
    judgeModel,
    mode,
    types,
    questionLimit,
    seed = 42,
    concurrency = 1,
    onProgress,
    checkpointPath,
    fullContextMaxChars = DEFAULT_FULL_CONTEXT_MAX_CHARS
  } = opts;
  const topK = Math.min(opts.topK ?? 10, 10);
  if (mode === "memory" && !opts.memoryFactory) {
    throw new Error("runLongMemEvalBench: memoryFactory is required when mode='memory'");
  }
  let dataset;
  if (opts.dataset) {
    dataset = opts.dataset;
  } else if (opts.dataPath) {
    // Lazily initialised loader module (bundler-generated access pattern).
    const { loadLongMemEval: loader } = await Promise.resolve().then(() => (init_lme_loader(), lme_loader_exports));
    dataset = await loader(opts.dataPath);
  } else {
    throw new Error("runLongMemEvalBench: either dataPath or dataset must be provided");
  }
  let questions = dataset.questions;
  if (types && types.length > 0) {
    questions = questions.filter((q) => types.includes(q.type) || types.includes(q.baseType));
  }
  if (questionLimit !== void 0 && questionLimit < questions.length) {
    questions = seededShuffle2(questions, makeRng2(seed)).slice(0, questionLimit);
  }
  const checkpoint = checkpointPath ? await loadCheckpoint2(checkpointPath) : new Set();
  const selectedIds = new Set(questions.map((q) => q.id));
  const results = [];
  let totalAnswerInputTokens = 0;
  let totalAnswerOutputTokens = 0;
  let totalJudgeInputTokens = 0;
  let totalJudgeOutputTokens = 0;
  // Nothing in this function measures embedding tokens; the field is always reported as 0.
  const ingestEmbedTokens = 0;
  let errorCount = 0;
  let done = 0;
  // Count only the questions that will really run: the checkpoint may contain
  // ids from other selections, so subtracting its size can go wrong or negative.
  const total = questions.filter((q) => !checkpoint.has(q.id)).length;
  const startTime = Date.now();
  const processQuestion = async (q) => {
    if (checkpoint.has(q.id)) return;
    let modelAnswer = "";
    let answerIn = 0;
    let answerOut = 0;
    let judgeIn = 0;
    let judgeOut = 0;
    let correct = false;
    let contextTruncated = false;
    let errorMsg;
    try {
      let prompt;
      if (mode === "memory") {
        const memory = await opts.memoryFactory(q.id);
        const scope = { kind: "agent", agentId: `lme:${q.id}` };
        await ingestQuestionIntoMemory(q, memory, scope);
        const retrieved = await memory.retrieve({ text: q.question, scope, topK });
        const snippets = retrieved.snippets.map((s) => s.text);
        prompt = buildMemoryAnswerPrompt2(snippets, q.question, q.questionDate);
      } else {
        const { text, truncated } = renderHaystackCapped(q.sessions, fullContextMaxChars);
        contextTruncated = truncated;
        prompt = buildFullContextAnswerPrompt2(text, q.question, q.questionDate);
      }
      const resp = await answerModel.complete({
        messages: [{ role: "user", content: prompt }],
        tools: []
      });
      modelAnswer = resp.content
        .filter((b) => b.type === "text")
        .map((b) => b.text)
        .join("")
        .trim();
      answerIn = resp.usage?.inputTokens ?? 0;
      answerOut = resp.usage?.outputTokens ?? 0;
      const judgePrompt = buildJudgePrompt2(
        q.question,
        q.answer,
        modelAnswer,
        q.baseType,
        q.isAbstention
      );
      const judgeResult = await callJudge2(judgeModel, judgePrompt);
      correct = judgeResult.correct;
      judgeIn = judgeResult.inputTokens;
      judgeOut = judgeResult.outputTokens;
    } catch (err) {
      // A failure is recorded on the row so one bad question cannot abort the run.
      errorMsg = err instanceof Error ? err.message : String(err);
      errorCount++;
      correct = false;
    }
    const appearedToAbstain = appearsToDecline2(modelAnswer);
    const row = {
      questionId: q.id,
      questionType: q.type,
      isAbstention: q.isAbstention,
      question: q.question,
      goldAnswer: q.answer,
      modelAnswer,
      correct,
      appearedToAbstain,
      ...contextTruncated ? { contextTruncated } : {},
      ...errorMsg !== void 0 ? { error: errorMsg } : {},
      answerInputTokens: answerIn,
      answerOutputTokens: answerOut,
      judgeInputTokens: judgeIn,
      judgeOutputTokens: judgeOut
    };
    results.push(row);
    totalAnswerInputTokens += answerIn;
    totalAnswerOutputTokens += answerOut;
    totalJudgeInputTokens += judgeIn;
    totalJudgeOutputTokens += judgeOut;
    if (checkpointPath) {
      await appendCheckpointRow2(checkpointPath, row);
    }
    done++;
    if (onProgress) onProgress(done, total);
  };
  // Questions run in fixed-size batches; each batch is awaited as a whole.
  const concurrencyLimit = Math.max(1, concurrency);
  const pending = [];
  for (const q of questions) {
    pending.push(processQuestion(q));
    if (pending.length >= concurrencyLimit) {
      await Promise.all(pending.splice(0, concurrencyLimit));
    }
  }
  if (pending.length > 0) await Promise.all(pending);
  if (checkpointPath && checkpoint.size > 0) {
    const raw = await import_promises5.readFile(checkpointPath, "utf-8").catch(() => "");
    const restored = new Set();
    for (const line of raw.split("\n")) {
      const trimmed = line.trim();
      if (!trimmed) continue;
      try {
        const row = JSON.parse(trimmed);
        const rowId = row.questionId;
        // Rows appended during this run are not in `checkpoint`, so they are
        // never counted twice. Rows of questions outside the current selection
        // and repeated rows for one id are skipped as well.
        if (!checkpoint.has(rowId) || !selectedIds.has(rowId) || restored.has(rowId)) continue;
        restored.add(rowId);
        results.push(row);
        totalAnswerInputTokens += row.answerInputTokens ?? 0;
        totalAnswerOutputTokens += row.answerOutputTokens ?? 0;
        totalJudgeInputTokens += row.judgeInputTokens ?? 0;
        totalJudgeOutputTokens += row.judgeOutputTokens ?? 0;
        // Keep the error count consistent with the rows in the report.
        if (row.error !== void 0) errorCount++;
      } catch {
        // Best effort: torn or corrupt checkpoint lines are ignored.
      }
    }
  }
  const byTypeMap = {};
  let overallCorrect = 0;
  let overallTotal = 0;
  let abstentionCorrect = 0;
  let abstentionTotal = 0;
  for (const row of results) {
    if (row.isAbstention) {
      // Abstention questions are scored separately from overall accuracy.
      abstentionTotal++;
      if (row.correct) abstentionCorrect++;
    } else {
      const bt = row.questionType.endsWith("_abs") ? row.questionType.slice(0, -4) : row.questionType;
      if (!byTypeMap[bt]) byTypeMap[bt] = { correct: 0, total: 0 };
      overallTotal++;
      byTypeMap[bt].total++;
      if (row.correct) {
        overallCorrect++;
        byTypeMap[bt].correct++;
      }
    }
  }
  const byType = {};
  for (const [t, stats] of Object.entries(byTypeMap)) {
    byType[t] = {
      ...stats,
      accuracy: stats.total > 0 ? stats.correct / stats.total : 0
    };
  }
  const abstentionAccuracy = abstentionTotal > 0 ? {
    correct: abstentionCorrect,
    total: abstentionTotal,
    accuracy: abstentionCorrect / abstentionTotal
  } : void 0;
  const wallClockMs = Date.now() - startTime;
  const allTypes = [...new Set(questions.map((q) => q.type))].sort();
  return {
    config: {
      mode,
      topK,
      answerModelId: answerModel.modelId ?? "(unknown)",
      judgeModelId: judgeModel.modelId ?? "(unknown)",
      datasetSource: LONGMEMEVAL_SOURCE,
      seed,
      types: allTypes,
      questionsRun: results.length
    },
    overall: {
      correct: overallCorrect,
      total: overallTotal,
      accuracy: overallTotal > 0 ? overallCorrect / overallTotal : 0
    },
    byType,
    ...abstentionAccuracy !== void 0 ? { abstentionAccuracy } : {},
    tokens: {
      ingestEmbedTokens,
      answerInputTokens: totalAnswerInputTokens,
      answerOutputTokens: totalAnswerOutputTokens,
      judgeInputTokens: totalJudgeInputTokens,
      judgeOutputTokens: totalJudgeOutputTokens,
      totalInputTokens: totalAnswerInputTokens + totalJudgeInputTokens,
      totalOutputTokens: totalAnswerOutputTokens + totalJudgeOutputTokens
    },
    wallClockMs,
    questions: results,
    errorCount
  };
}
|
|
1611
|
+
|
|
1612
|
+
// src/lme-render.ts
|
|
1613
|
+
var QUESTION_TYPE_LABELS = {
|
|
1614
|
+
"single-session-user": "Single-session (user)",
|
|
1615
|
+
"single-session-assistant": "Single-session (asst.)",
|
|
1616
|
+
"single-session-preference": "Single-session (pref.)",
|
|
1617
|
+
"multi-session": "Multi-session",
|
|
1618
|
+
"temporal-reasoning": "Temporal reasoning",
|
|
1619
|
+
"knowledge-update": "Knowledge update"
|
|
1620
|
+
};
|
|
1621
|
+
/**
 * Format a ratio as a percentage with one decimal place.
 *
 * @param {number} n - Ratio, e.g. 0.5.
 * @returns {string} Text such as "50.0%".
 */
function pct2(n) {
  const percent = n * 100;
  return `${percent.toFixed(1)}%`;
}
|
|
1624
|
+
/**
 * Format a number with en-US grouping separators.
 *
 * @param {number} n - Value to format.
 * @returns {string} Text such as "1,234,567".
 */
function fmtNum2(n) {
  const locale = "en-US";
  return n.toLocaleString(locale);
}
|
|
1627
|
+
/**
 * Estimate the cost of a run from its total token counts.
 *
 * @param {{totalInputTokens: number, totalOutputTokens: number}} tokens - Token totals.
 * @param {{inputPer1M: number, outputPer1M: number}} [prices] - Price per million tokens.
 * @returns {string} Dollar amount with four decimals, or an em dash when no prices are given.
 */
function estimateCost2(tokens, prices) {
  if (!prices) {
    return "\u2014";
  }
  const inputCost = tokens.totalInputTokens / 1e6 * prices.inputPer1M;
  const outputCost = tokens.totalOutputTokens / 1e6 * prices.outputPer1M;
  const cost = inputCost + outputCost;
  return `$${cost.toFixed(4)}`;
}
|
|
1632
|
+
/**
 * Format an accuracy statistic as "<pct> (<correct>/<total>)".
 *
 * @param {{accuracy: number, correct: number, total: number}} [s] - Statistic, if any.
 * @returns {string} Formatted text, or an em dash when there is no data.
 */
function fmtStat(s) {
  const hasData = Boolean(s) && s.total !== 0;
  if (!hasData) {
    return "\u2014";
  }
  const { accuracy, correct, total } = s;
  return `${pct2(accuracy)} (${correct}/${total})`;
}
|
|
1636
|
+
/**
 * Render one or more LongMemEval reports as a Markdown document: a results
 * table (one row per report), a per-report run configuration section, and the
 * methodology notes.
 *
 * Question-type columns follow the canonical LongMemEval order; any other type
 * present in the reports is appended in alphabetical order.
 *
 * @param {Array<object>} reports - Reports as returned by runLongMemEvalBench.
 * @param {{inputPer1M: number, outputPer1M: number}} [prices] - Prices for the cost column.
 * @returns {string} Markdown text.
 */
function renderLongMemEvalReportMarkdown(reports, prices) {
  const tableRow = (cells) => "| " + cells.join(" | ") + " |";
  const lines = [
    "# LongMemEval Benchmark Results",
    "",
    "Dataset: [LongMemEval](https://github.com/xiaowu0162/LongMemEval) (Wu et al.) \xB7 MIT License",
    "Raw data is not redistributed. Only aggregate results are published here.",
    ""
  ];
  if (reports.length === 0) {
    lines.push("_No results yet._");
    return lines.join("\n");
  }

  // Column set: every base type that appears in at least one report.
  const allBaseTypes = new Set(reports.flatMap((report) => Object.keys(report.byType)));
  const canonicalOrder = [
    "single-session-user",
    "single-session-assistant",
    "single-session-preference",
    "multi-session",
    "temporal-reasoning",
    "knowledge-update"
  ];
  const knownTypes = canonicalOrder.filter((t) => allBaseTypes.has(t));
  const extraTypes = [...allBaseTypes].sort().filter((t) => !canonicalOrder.includes(t));
  const sortedTypes = [...knownTypes, ...extraTypes];
  const headers = [
    "System / Mode",
    ...sortedTypes.map((t) => QUESTION_TYPE_LABELS[t] ?? t),
    "Overall accuracy",
    "Abstention accuracy",
    "Tokens/query",
    "Est. cost/run",
    "Answer model",
    "Judge model",
    "topK",
    "n-Q",
    "Seed",
    "Dataset provenance"
  ];

  lines.push("## Results", "");
  lines.push(tableRow(headers));
  lines.push(tableRow(headers.map(() => "---")));
  for (const report of reports) {
    const cfg = report.config;
    const questionCount = report.questions.length;
    let tokensPerQuery = 0;
    if (questionCount > 0) {
      tokensPerQuery = Math.round(
        (report.tokens.totalInputTokens + report.tokens.totalOutputTokens) / questionCount
      );
    }
    const provenance = `${cfg.datasetSource.url.replace("https://", "")} @ ${cfg.datasetSource.snapshotSha.slice(0, 8)}`;
    lines.push(tableRow([
      `${cfg.answerModelId} / ${cfg.mode}`,
      ...sortedTypes.map((t) => fmtStat(report.byType[t])),
      fmtStat(report.overall),
      fmtStat(report.abstentionAccuracy),
      fmtNum2(tokensPerQuery),
      estimateCost2(report.tokens, prices),
      cfg.answerModelId,
      cfg.judgeModelId,
      // topK only applies to memory mode.
      cfg.mode === "memory" ? String(cfg.topK) : "\u2014",
      fmtNum2(report.config.questionsRun),
      String(cfg.seed),
      provenance
    ]));
  }
  lines.push("");

  lines.push("## Run Configuration", "");
  for (const report of reports) {
    const cfg = report.config;
    lines.push(`### ${cfg.answerModelId} / ${cfg.mode}`, "");
    lines.push(`- **Mode**: ${cfg.mode}`);
    lines.push(`- **Answer model**: ${cfg.answerModelId}`);
    lines.push(`- **Judge model**: ${cfg.judgeModelId}`);
    if (cfg.mode === "memory") {
      lines.push(`- **topK**: ${cfg.topK}`);
    }
    lines.push(`- **Dataset source**: ${cfg.datasetSource.url}`);
    lines.push(`- **Dataset snapshot SHA**: \`${cfg.datasetSource.snapshotSha}\``);
    lines.push(`- **Dataset file**: ${cfg.datasetSource.file}`);
    lines.push(`- **Dataset license**: ${cfg.datasetSource.license}`);
    lines.push(`- **Seed**: ${cfg.seed}`);
    lines.push(`- **Types**: ${cfg.types.join(", ") || "all"}`);
    lines.push(`- **Questions run**: ${cfg.questionsRun}`);
    lines.push(`- **Wall-clock**: ${(report.wallClockMs / 1e3).toFixed(1)}s`);
    lines.push(`- **Errors**: ${report.errorCount}`);
    lines.push(`- **Tokens** (in/out): ${fmtNum2(report.tokens.totalInputTokens)} / ${fmtNum2(report.tokens.totalOutputTokens)}`);
    lines.push("");
  }

  lines.push(
    "## Methodology Notes",
    "",
    "These results were produced using the Eidentic LongMemEval fair-run harness. The following rules apply:",
    "",
    "1. **Per-question memory scope.** Each question has its own haystack (~50 sessions on average). A fresh Memory instance is created per question; no cross-question contamination.",
    "2. **Dual-granularity ingest.** Each turn is ingested with its session date in the text (temporally anchored). An additional session-level chunk entry captures multi-turn context.",
    "3. **Current date in prompt.** The `question_date` is passed to the answer prompt so temporal questions can reason about recency.",
    "4. **topK \u2264 10 in memory mode.** Larger topK values trivialise retrieval quality and are not permitted.",
    "5. **Full-context baseline is required** alongside any memory-mode result.",
    "6. **Judge is strict**: a model answer is correct only when it contains the gold answer's specific information. Vague/topical-only answers are wrong. Equivalent date expressions for the same date/duration are correct (temporal-reasoning type).",
    "7. **Abstention questions** (not present in longmemeval_s.json standard split): correct = model declined / said no-info / identified a flawed premise; fabricating a specific answer = wrong. Abstention accuracy is reported separately and not folded into overall accuracy.",
    "8. **Dataset license**: MIT \u2014 raw data is not redistributed; only aggregate results are published.",
    "",
    "> Per-type question counts in longmemeval_s.json (500 total):",
    "> single-session-user 70, single-session-assistant 56, single-session-preference 30,",
    "> multi-session 133, temporal-reasoning 133, knowledge-update 78.",
    "> No abstention variants in the standard _s split.",
    ""
  );
  return lines.join("\n");
}
|
|
1772
|
+
|
|
1097
1773
|
// src/write-quality.ts
|
|
1098
1774
|
var CONTRADICTION_FIXTURES = [
|
|
1099
1775
|
{
|
|
@@ -1471,7 +2147,7 @@ async function runTemporalBench(memory, dataset, opts = {}) {
|
|
|
1471
2147
|
}
|
|
1472
2148
|
|
|
1473
2149
|
// src/datasets/temporal.ts
|
|
1474
|
-
function
|
|
2150
|
+
function makeRng3(seed) {
|
|
1475
2151
|
let s = seed >>> 0;
|
|
1476
2152
|
if (s === 0) s = 1;
|
|
1477
2153
|
return () => {
|
|
@@ -1554,7 +2230,7 @@ function syntheticTemporalDataset(opts = {}) {
|
|
|
1554
2230
|
const entityCount = opts.entityCount ?? 4;
|
|
1555
2231
|
const seed = opts.seed ?? 42;
|
|
1556
2232
|
const changesPerProperty = opts.changesPerProperty ?? 3;
|
|
1557
|
-
const rng =
|
|
2233
|
+
const rng = makeRng3(seed);
|
|
1558
2234
|
const entities = [];
|
|
1559
2235
|
const asserts = [];
|
|
1560
2236
|
const questions = [];
|
|
@@ -1642,16 +2318,21 @@ function syntheticTemporalDataset(opts = {}) {
|
|
|
1642
2318
|
CONTRADICTION_FIXTURES,
|
|
1643
2319
|
JUNK_STREAM_FIXTURES,
|
|
1644
2320
|
LOCOMO_SOURCE_SHA,
|
|
2321
|
+
LONGMEMEVAL_SOURCE,
|
|
1645
2322
|
factRecall,
|
|
1646
2323
|
loadLoCoMo,
|
|
1647
2324
|
loadLoCoMoLegacy,
|
|
1648
2325
|
loadLongMemEval,
|
|
2326
|
+
loadLongMemEvalLegacy,
|
|
1649
2327
|
normalizeText,
|
|
1650
2328
|
normalizedIncludes,
|
|
2329
|
+
parseLmeDateTimeString,
|
|
1651
2330
|
recallAtK,
|
|
1652
2331
|
renderLocomoReportMarkdown,
|
|
2332
|
+
renderLongMemEvalReportMarkdown,
|
|
1653
2333
|
resolveEvidence,
|
|
1654
2334
|
runLocomoBench,
|
|
2335
|
+
runLongMemEvalBench,
|
|
1655
2336
|
runMemoryBench,
|
|
1656
2337
|
runTemporalBench,
|
|
1657
2338
|
runWriteQualityBench,
|