@eidentic/bench 0.1.1 → 0.1.3
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/dist/chunk-KOMVTEKE.js +98 -0
- package/dist/index.cjs +684 -3
- package/dist/index.d.cts +281 -2
- package/dist/index.d.ts +281 -2
- package/dist/index.js +575 -3
- package/dist/lme-loader-WSJ72GEP.js +10 -0
- package/package.json +4 -4
package/dist/index.js
CHANGED
|
@@ -2,6 +2,11 @@ import {
|
|
|
2
2
|
LOCOMO_SOURCE_SHA,
|
|
3
3
|
loadLoCoMo
|
|
4
4
|
} from "./chunk-PVIWNXCY.js";
|
|
5
|
+
import {
|
|
6
|
+
LONGMEMEVAL_SOURCE,
|
|
7
|
+
loadLongMemEval,
|
|
8
|
+
parseLmeDateTimeString
|
|
9
|
+
} from "./chunk-KOMVTEKE.js";
|
|
5
10
|
|
|
6
11
|
// src/recall.ts
|
|
7
12
|
function normalizeText(text) {
|
|
@@ -341,7 +346,7 @@ async function assertFileSize(filePath, maxBytes = DEFAULT_MAX_BYTES) {
|
|
|
341
346
|
);
|
|
342
347
|
}
|
|
343
348
|
}
|
|
344
|
-
async function
|
|
349
|
+
async function loadLongMemEval2(jsonPath, opts) {
|
|
345
350
|
await assertFileSize(jsonPath, opts?.maxBytes);
|
|
346
351
|
const raw = JSON.parse(await readFile(jsonPath, "utf-8"));
|
|
347
352
|
const cases = Array.isArray(raw) ? raw : [];
|
|
@@ -933,6 +938,568 @@ function resolveEvidence(sample, diaIds) {
|
|
|
933
938
|
return results;
|
|
934
939
|
}
|
|
935
940
|
|
|
941
|
+
// src/lme-run.ts
|
|
942
|
+
import { readFile as readFile3, appendFile as appendFile2 } from "node:fs/promises";
|
|
943
|
+
import { existsSync as existsSync2 } from "node:fs";
|
|
944
|
+
/**
 * Create a deterministic pseudo-random generator (32-bit xorshift) from a seed.
 *
 * @param {number} seed - Coerced to an unsigned 32-bit integer; a zero state is replaced by 1.
 * @returns {() => number} Function returning the next value, i.e. the unsigned
 *   32-bit state divided by 2^32.
 */
function makeRng2(seed) {
  const initial = seed >>> 0;
  // A zero state would stay zero forever under xor/shift, so nudge it to 1.
  let state = initial === 0 ? 1 : initial;
  const next = () => {
    state = state ^ (state << 13);
    state = state ^ (state >>> 17);
    state = state ^ (state << 5);
    // Reinterpret the signed 32-bit result as unsigned before scaling.
    state >>>= 0;
    return state / 4294967296;
  };
  return next;
}
|
|
955
|
+
/**
 * Return a shuffled copy of `arr` (Fisher-Yates, walking from the last index down).
 * The input is not modified.
 *
 * @param {Iterable<*>} arr - Items to shuffle.
 * @param {() => number} rng - Source of numbers used to pick swap positions.
 * @returns {Array<*>} New array holding the same items in shuffled order.
 */
function seededShuffle2(arr, rng) {
  const shuffled = [...arr];
  let i = shuffled.length;
  while (--i > 0) {
    const pick = Math.floor(rng() * (i + 1));
    const held = shuffled[i];
    shuffled[i] = shuffled[pick];
    shuffled[pick] = held;
  }
  return shuffled;
}
|
|
963
|
+
// Default character budget for the rendered haystack in full-context mode.
var DEFAULT_FULL_CONTEXT_MAX_CHARS = 48e4;

// Phrases that indicate the model declined to answer. The contraction accepts
// both the ASCII apostrophe and the typographic one (U+2019), since model
// output frequently uses the latter ("I don’t know").
var DECLINE_PATTERNS2 = /\bno information available\b|\bi (don['\u2019]?t|do not) (know|have)\b|\bcannot (find|answer|determine|provide)\b|\bnot (mentioned|stated|provided|found|available)\b|\bunable to (find|answer|determine)\b|\bno relevant (information|data)\b/i;

/**
 * Heuristically detect whether an answer reads as a refusal / "no information" reply.
 *
 * @param {string} answer - Model answer text.
 * @returns {boolean} True when any decline phrase occurs in the answer (case-insensitive).
 */
function appearsToDecline2(answer) {
  return DECLINE_PATTERNS2.test(answer);
}
|
|
968
|
+
// Closing instruction appended to every answer prompt.
var ANSWER_INSTRUCTION2 = "Answer the question concisely and specifically based only on the information provided. If the provided context does not contain the answer, reply exactly: No information available";

/**
 * Build the answer prompt for memory mode from retrieved snippets.
 *
 * @param {string[]} snippets - Retrieved snippet texts; numbered from 1 in the prompt.
 * @param {string} question - Question text.
 * @param {string} [currentDate] - When truthy, added as a "Current date" line after the context.
 * @returns {string} Prompt text.
 */
function buildMemoryAnswerPrompt2(snippets, question, currentDate) {
  let context = "(no relevant context retrieved)";
  if (snippets.length > 0) {
    context = snippets.map((snippet, idx) => `[${idx + 1}] ${snippet}`).join("\n\n");
  }
  const dateNote = currentDate ? `\nCurrent date: ${currentDate}` : "";
  return [
    "Context from conversation history:",
    `${context}${dateNote}`,
    "",
    `Question: ${question}`,
    "",
    ANSWER_INSTRUCTION2
  ].join("\n");
}
|
|
980
|
+
/**
 * Build the answer prompt for full-context mode from the rendered haystack.
 *
 * @param {string} haystackText - Rendered conversation history.
 * @param {string} question - Question text.
 * @param {string} [currentDate] - When truthy, added as a "Current date" line after the history.
 * @returns {string} Prompt text ending with the shared answer instruction.
 */
function buildFullContextAnswerPrompt2(haystackText, question, currentDate) {
  let dateNote = "";
  if (currentDate) {
    dateNote = `\nCurrent date: ${currentDate}`;
  }
  const sections = [
    "Conversation history:",
    `${haystackText}${dateNote}`,
    "",
    `Question: ${question}`,
    "",
    ANSWER_INSTRUCTION2
  ];
  return sections.join("\n");
}
|
|
990
|
+
/**
 * Build the prompt sent to the judge model.
 *
 * Abstention questions get a prompt that rewards declining and omits the gold
 * answer; all other questions get a gold-vs-model comparison prompt, with an
 * extra leniency note for the "temporal-reasoning" type.
 *
 * @param {string} question - Question text.
 * @param {string} goldAnswer - Reference answer (unused for abstention questions).
 * @param {string} modelAnswer - Answer produced by the model under test.
 * @param {string} questionType - Question type; only "temporal-reasoning" changes the prompt.
 * @param {boolean} isAbstention - Whether the question expects the model to abstain.
 * @returns {string} Judge prompt text.
 */
function buildJudgePrompt2(question, goldAnswer, modelAnswer, questionType, isAbstention) {
  if (!isAbstention) {
    let temporalNote = "";
    if (questionType === "temporal-reasoning") {
      temporalNote = "\nFor time/date questions: equivalent date expressions referring to the same date or duration are correct.";
    }
    return [
      "You are a strict judge evaluating whether a model answer is correct.",
      `CORRECT: the model answer contains the specific information in the gold answer (exact match or clear paraphrase is fine).${temporalNote}`,
      `WRONG: the model answer is vague, only topically related, contradicts the gold answer, or says "no information" when a specific answer exists.`,
      "",
      `Question: ${question}`,
      `Gold answer: ${goldAnswer}`,
      `Model answer: ${modelAnswer}`,
      "",
      `Respond with JSON only: {"correct": true} or {"correct": false}`
    ].join("\n");
  }
  return [
    `You are a strict judge. This question requires the model to recognize that the information is not present in the conversation history, or that the premise is flawed. A CORRECT response means the model abstained, said something equivalent to "No information available", "I don't know", or identified that the premise is wrong. An INCORRECT response means the model fabricated a specific, concrete answer.`,
    "",
    `Question: ${question}`,
    `Model answer: ${modelAnswer}`,
    "",
    `Respond with JSON only: {"correct": true} if the model appropriately abstained, {"correct": false} if it fabricated a specific answer.`
  ].join("\n");
}
|
|
1010
|
+
/**
 * Render sessions as plain text: a "Session N — <dateTime or id>" header,
 * one "[User]: ..." / "[Assistant]: ..." line per turn, and a blank line
 * between sessions. Surrounding whitespace is trimmed from the result.
 *
 * @param {Array<{id: string, dateTime?: string, turns: Array<{role: string, content: string}>}>} sessions
 * @returns {string} Rendered transcript.
 */
function renderHaystack(sessions) {
  const lines = sessions.flatMap((session, idx) => {
    const header = `Session ${idx + 1} \u2014 ${session.dateTime || session.id}`;
    const turnLines = session.turns.map((turn) => {
      // Any role other than "user" is rendered as the assistant.
      const speaker = turn.role === "user" ? "User" : "Assistant";
      return `[${speaker}]: ${turn.content}`;
    });
    return [header, ...turnLines, ""];
  });
  return lines.join("\n").trim();
}
|
|
1024
|
+
/**
 * Render the haystack within a character budget.
 *
 * If the full rendering is too long, sessions are dropped from the front one
 * at a time until the rendering fits. If even the final remaining session is
 * too long, its rendering is cut to `maxChars` characters.
 *
 * @param {Array<object>} sessions - Sessions accepted by `renderHaystack`.
 * @param {number} maxChars - Maximum length of the returned text.
 * @returns {{text: string, truncated: boolean}} Rendered text and whether anything was dropped or cut.
 */
function renderHaystackCapped(sessions, maxChars) {
  const everything = renderHaystack(sessions);
  if (everything.length <= maxChars) {
    return { text: everything, truncated: false };
  }
  for (let start = 1; start < sessions.length; start++) {
    const rendered = renderHaystack(sessions.slice(start));
    if (rendered.length <= maxChars) {
      return { text: rendered, truncated: true };
    }
  }
  // Nothing fit: hard-cut whatever is left (the last session, if there is more than one).
  const remaining = sessions.length > 1 ? sessions.slice(-1) : sessions.slice();
  return { text: renderHaystack(remaining).slice(0, maxChars), truncated: true };
}
|
|
1035
|
+
// Maximum number of characters kept from a text before it is ingested.
var EMBED_CHAR_CAP = 2e4;

/**
 * Cut a text down to at most `EMBED_CHAR_CAP` characters.
 *
 * @param {string} text - Text to cap.
 * @returns {string} The text itself when short enough, otherwise its leading slice.
 */
function capForEmbedding(text) {
  if (text.length > EMBED_CHAR_CAP) {
    return text.slice(0, EMBED_CHAR_CAP);
  }
  return text;
}
|
|
1039
|
+
/**
 * Ingest one question's sessions into a memory instance.
 *
 * For every session this emits one event per turn (text prefixed with the
 * session label and speaker) followed by one session-level "chunk" event
 * holding the whole session transcript. All events go to `memory.ingest` in a
 * single call.
 *
 * @param {{id: string, sessions: Array<object>}} question - Question whose sessions are ingested.
 * @param {{ingest: (events: object[]) => Promise<*>}} memory - Target memory.
 * @param {object} scope - Scope attached to every event.
 * @returns {Promise<void>}
 */
async function ingestQuestionIntoMemory(question, memory, scope) {
  const speakerOf = (role) => (role === "user" ? "User" : "Assistant");
  const events = [];
  question.sessions.forEach((session, sessionIndex) => {
    const sessLabel = `Session ${sessionIndex + 1} \u2014 ${session.dateTime || session.id}`;
    // A falsy timestamp (missing or 0) is recorded as undefined.
    const ingestedAt = session.dateTimeMs || void 0;
    session.turns.forEach((turn, turnIndex) => {
      events.push({
        id: `${question.id}:sess${sessionIndex}:turn${turnIndex}`,
        scope,
        text: capForEmbedding(`[${sessLabel}] [${speakerOf(turn.role)}]: ${turn.content}`),
        metadata: {
          sessionId: session.id,
          sessionIndex,
          turnRole: turn.role,
          ingestedAt
        }
      });
    });
    const transcript = [
      sessLabel,
      ...session.turns.map((turn) => `[${speakerOf(turn.role)}]: ${turn.content}`)
    ].join("\n");
    events.push({
      id: `${question.id}:sess${sessionIndex}:chunk`,
      scope,
      text: capForEmbedding(transcript),
      metadata: {
        sessionId: session.id,
        sessionIndex,
        ingestedAt
      }
    });
  });
  await memory.ingest(events);
}
|
|
1076
|
+
/**
 * Ask the judge model for a verdict and normalise it to a boolean.
 *
 * The structured `response.object.correct` value is used when it is a boolean.
 * Otherwise the text blocks are scanned: first for an explicit
 * `"correct": true` / `"correct": false` pair, then, as a last resort, for a
 * standalone word "true" with no standalone "false" in the reply.
 *
 * @param {{complete: (req: object) => Promise<object>}} judgeModel - Judge model client.
 * @param {string} prompt - Judge prompt.
 * @returns {Promise<{correct: boolean, inputTokens: number, outputTokens: number}>}
 *   Verdict plus token usage (0 when the response reports none).
 */
async function callJudge2(judgeModel, prompt) {
  const response = await judgeModel.complete({
    messages: [{ role: "user", content: prompt }],
    tools: [],
    outputSchema: {
      type: "object",
      properties: { correct: { type: "boolean" } },
      required: ["correct"],
      // OpenAI strict structured-output mode requires this to be explicit.
      additionalProperties: false
    }
  });
  let correct = false;
  if (response.object && typeof response.object.correct === "boolean") {
    correct = response.object.correct;
  } else {
    const text = response.content
      .filter((b) => b.type === "text")
      .map((b) => b.text ?? "")
      .join("")
      .toLowerCase()
      .trim();
    if (/"correct"\s*:\s*true/.test(text)) {
      correct = true;
    } else if (/"correct"\s*:\s*false/.test(text)) {
      correct = false;
    } else {
      // Last resort for free-form replies. Match "true" as a whole word only,
      // so "untrue" is not graded correct, and treat a reply that also says
      // "false" as ambiguous; the strict judge defaults to incorrect.
      correct = /\btrue\b/.test(text) && !/\bfalse\b/.test(text);
    }
  }
  return {
    correct,
    inputTokens: response.usage?.inputTokens ?? 0,
    outputTokens: response.usage?.outputTokens ?? 0
  };
}
|
|
1103
|
+
/**
 * Read a JSONL checkpoint file and collect the question ids it already contains.
 *
 * @param {string} path - Checkpoint file path.
 * @returns {Promise<Set<string>>} Ids found in the file; empty when the file does not exist.
 */
async function loadCheckpoint2(path) {
  const done = new Set();
  if (!existsSync2(path)) {
    return done;
  }
  const contents = await readFile3(path, "utf-8");
  const serializedRows = contents
    .split("\n")
    .map((line) => line.trim())
    .filter((line) => line.length > 0);
  for (const serialized of serializedRows) {
    try {
      const { questionId } = JSON.parse(serialized);
      if (questionId) done.add(questionId);
    } catch {
      // Best effort: a line that does not parse is skipped.
    }
  }
  return done;
}
|
|
1118
|
+
/**
 * Append one result row to the checkpoint file as a single JSON line.
 *
 * @param {string} path - Checkpoint file path.
 * @param {object} row - Row to serialise.
 * @returns {Promise<void>}
 */
async function appendCheckpointRow2(path, row) {
  const line = `${JSON.stringify(row)}\n`;
  await appendFile2(path, line, "utf-8");
}
|
|
1121
|
+
/**
 * Run the LongMemEval benchmark and return an aggregate report.
 *
 * Each selected question is answered by `answerModel` (from retrieved memory
 * snippets when `mode === "memory"`, otherwise from the rendered haystack),
 * graded by `judgeModel`, and optionally appended to a JSONL checkpoint so an
 * interrupted run can resume.
 *
 * @param {object} opts
 * @param {object} opts.answerModel - Model client with `complete()`; `modelId` is reported if present.
 * @param {object} opts.judgeModel - Judge model client passed to `callJudge2`.
 * @param {string} opts.mode - "memory" uses `memoryFactory`; any other value runs full-context.
 * @param {Function} [opts.memoryFactory] - Called with the question id; required in memory mode.
 * @param {object} [opts.dataset] - Pre-loaded dataset (`{ questions }`); takes precedence over `dataPath`.
 * @param {string} [opts.dataPath] - Dataset file to load when `dataset` is absent.
 * @param {string[]} [opts.types] - Keep only questions whose `type` or `baseType` is listed.
 * @param {number} [opts.questionLimit] - Seeded random subset size.
 * @param {number} [opts.seed=42] - Seed for the subset shuffle.
 * @param {number} [opts.concurrency=1] - Questions processed per batch.
 * @param {number} [opts.topK] - Snippets retrieved in memory mode; clamped to at most 10.
 * @param {(done: number, total: number) => void} [opts.onProgress] - Called after each processed question.
 * @param {string} [opts.checkpointPath] - JSONL checkpoint file.
 * @param {number} [opts.fullContextMaxChars] - Character budget for the full-context haystack.
 * @returns {Promise<object>} Report with `config`, `overall`, `byType`, optional
 *   `abstentionAccuracy`, `tokens`, `wallClockMs`, `questions` and `errorCount`.
 * @throws {Error} When memory mode lacks `memoryFactory`, or neither `dataset` nor `dataPath` is given.
 */
async function runLongMemEvalBench(opts) {
  const {
    answerModel,
    judgeModel,
    mode,
    types,
    questionLimit,
    seed = 42,
    concurrency = 1,
    onProgress,
    checkpointPath,
    fullContextMaxChars = DEFAULT_FULL_CONTEXT_MAX_CHARS
  } = opts;
  const topK = Math.min(opts.topK ?? 10, 10);
  if (mode === "memory" && !opts.memoryFactory) {
    throw new Error("runLongMemEvalBench: memoryFactory is required when mode='memory'");
  }
  let dataset;
  if (opts.dataset) {
    dataset = opts.dataset;
  } else if (opts.dataPath) {
    const { loadLongMemEval: loader } = await import("./lme-loader-WSJ72GEP.js");
    dataset = await loader(opts.dataPath);
  } else {
    throw new Error("runLongMemEvalBench: either dataPath or dataset must be provided");
  }
  let questions = dataset.questions;
  if (types && types.length > 0) {
    questions = questions.filter((q) => types.includes(q.type) || types.includes(q.baseType));
  }
  if (questionLimit !== void 0 && questionLimit < questions.length) {
    questions = seededShuffle2(questions, makeRng2(seed)).slice(0, questionLimit);
  }
  const checkpoint = checkpointPath ? await loadCheckpoint2(checkpointPath) : new Set();
  const selectedIds = new Set(questions.map((q) => q.id));
  const results = [];
  let totalAnswerInputTokens = 0;
  let totalAnswerOutputTokens = 0;
  let totalJudgeInputTokens = 0;
  let totalJudgeOutputTokens = 0;
  // Never updated in this function; reported as 0.
  let ingestEmbedTokens = 0;
  let errorCount = 0;
  let done = 0;
  // Count only the selected questions that still need work. The checkpoint may
  // hold ids from an earlier run with a different selection.
  const total = questions.reduce((n, q) => (checkpoint.has(q.id) ? n : n + 1), 0);
  const startTime = Date.now();
  const processQuestion = async (q) => {
    if (checkpoint.has(q.id)) return;
    let modelAnswer = "";
    let answerIn = 0;
    let answerOut = 0;
    let judgeIn = 0;
    let judgeOut = 0;
    let correct = false;
    let contextTruncated = false;
    let errorMsg;
    try {
      let prompt;
      if (mode === "memory") {
        const memory = await opts.memoryFactory(q.id);
        const scope = { kind: "agent", agentId: `lme:${q.id}` };
        await ingestQuestionIntoMemory(q, memory, scope);
        const retrieved = await memory.retrieve({ text: q.question, scope, topK });
        const snippets = retrieved.snippets.map((s) => s.text);
        prompt = buildMemoryAnswerPrompt2(snippets, q.question, q.questionDate);
      } else {
        const { text, truncated } = renderHaystackCapped(q.sessions, fullContextMaxChars);
        contextTruncated = truncated;
        prompt = buildFullContextAnswerPrompt2(text, q.question, q.questionDate);
      }
      const resp = await answerModel.complete({
        messages: [{ role: "user", content: prompt }],
        tools: []
      });
      const textBlocks = resp.content.filter((b) => b.type === "text");
      modelAnswer = textBlocks.map((b) => b.text).join("").trim();
      answerIn = resp.usage?.inputTokens ?? 0;
      answerOut = resp.usage?.outputTokens ?? 0;
      const judgePrompt = buildJudgePrompt2(
        q.question,
        q.answer,
        modelAnswer,
        q.baseType,
        q.isAbstention
      );
      const judgeResult = await callJudge2(judgeModel, judgePrompt);
      correct = judgeResult.correct;
      judgeIn = judgeResult.inputTokens;
      judgeOut = judgeResult.outputTokens;
    } catch (err) {
      // A rejection is not necessarily an Error; keep a message either way so
      // the row always records why it failed.
      errorMsg = err instanceof Error ? err.message : String(err);
      errorCount++;
      correct = false;
    }
    const appearedToAbstain = appearsToDecline2(modelAnswer);
    const row = {
      questionId: q.id,
      questionType: q.type,
      isAbstention: q.isAbstention,
      question: q.question,
      goldAnswer: q.answer,
      modelAnswer,
      correct,
      appearedToAbstain,
      ...contextTruncated ? { contextTruncated } : {},
      ...errorMsg !== void 0 ? { error: errorMsg } : {},
      answerInputTokens: answerIn,
      answerOutputTokens: answerOut,
      judgeInputTokens: judgeIn,
      judgeOutputTokens: judgeOut
    };
    results.push(row);
    totalAnswerInputTokens += answerIn;
    totalAnswerOutputTokens += answerOut;
    totalJudgeInputTokens += judgeIn;
    totalJudgeOutputTokens += judgeOut;
    if (checkpointPath) {
      await appendCheckpointRow2(checkpointPath, row);
    }
    done++;
    if (onProgress) onProgress(done, total);
  };
  // Whole number >= 1: a fractional or NaN value would break the batching below.
  const concurrencyLimit = Math.max(1, Math.floor(concurrency) || 1);
  const pending = [];
  for (const q of questions) {
    pending.push(processQuestion(q));
    if (pending.length >= concurrencyLimit) {
      await Promise.all(pending.splice(0, concurrencyLimit));
    }
  }
  if (pending.length > 0) await Promise.all(pending);
  if (checkpointPath && checkpoint.size > 0) {
    // Merge rows from the earlier run. Only ids loaded before this run started
    // AND part of this run's selection are taken, each at most once.
    const raw = await readFile3(checkpointPath, "utf-8").catch(() => "");
    const merged = new Set();
    for (const line of raw.split("\n")) {
      const trimmed = line.trim();
      if (!trimmed) continue;
      let row;
      try {
        row = JSON.parse(trimmed);
      } catch {
        // Best effort: skip a line that does not parse.
        continue;
      }
      const id = row?.questionId;
      if (!checkpoint.has(id) || !selectedIds.has(id) || merged.has(id)) continue;
      merged.add(id);
      results.push(row);
      totalAnswerInputTokens += row.answerInputTokens ?? 0;
      totalAnswerOutputTokens += row.answerOutputTokens ?? 0;
      totalJudgeInputTokens += row.judgeInputTokens ?? 0;
      totalJudgeOutputTokens += row.judgeOutputTokens ?? 0;
      // Keep errorCount consistent with the rows returned in `questions`.
      if (row.error !== void 0) errorCount++;
    }
  }
  const byTypeMap = {};
  let overallCorrect = 0;
  let overallTotal = 0;
  let abstentionCorrect = 0;
  let abstentionTotal = 0;
  for (const row of results) {
    if (row.isAbstention) {
      // Abstention rows are scored separately and kept out of the overall figures.
      abstentionTotal++;
      if (row.correct) abstentionCorrect++;
    } else {
      const bt = row.questionType.endsWith("_abs") ? row.questionType.slice(0, -4) : row.questionType;
      if (!byTypeMap[bt]) byTypeMap[bt] = { correct: 0, total: 0 };
      overallTotal++;
      if (row.correct) overallCorrect++;
      byTypeMap[bt].total++;
      if (row.correct) byTypeMap[bt].correct++;
    }
  }
  const byType = {};
  for (const [t, stats] of Object.entries(byTypeMap)) {
    byType[t] = {
      ...stats,
      accuracy: stats.total > 0 ? stats.correct / stats.total : 0
    };
  }
  const abstentionAccuracy = abstentionTotal > 0 ? {
    correct: abstentionCorrect,
    total: abstentionTotal,
    accuracy: abstentionCorrect / abstentionTotal
  } : void 0;
  const wallClockMs = Date.now() - startTime;
  const allTypes = [...new Set(questions.map((q) => q.type))].sort();
  return {
    config: {
      mode,
      topK,
      answerModelId: answerModel.modelId ?? "(unknown)",
      judgeModelId: judgeModel.modelId ?? "(unknown)",
      datasetSource: LONGMEMEVAL_SOURCE,
      seed,
      types: allTypes,
      questionsRun: results.length
    },
    overall: {
      correct: overallCorrect,
      total: overallTotal,
      accuracy: overallTotal > 0 ? overallCorrect / overallTotal : 0
    },
    byType,
    ...abstentionAccuracy !== void 0 ? { abstentionAccuracy } : {},
    tokens: {
      ingestEmbedTokens,
      answerInputTokens: totalAnswerInputTokens,
      answerOutputTokens: totalAnswerOutputTokens,
      judgeInputTokens: totalJudgeInputTokens,
      judgeOutputTokens: totalJudgeOutputTokens,
      totalInputTokens: totalAnswerInputTokens + totalJudgeInputTokens,
      totalOutputTokens: totalAnswerOutputTokens + totalJudgeOutputTokens
    },
    wallClockMs,
    questions: results,
    errorCount
  };
}
|
|
1341
|
+
|
|
1342
|
+
// src/lme-render.ts
|
|
1343
|
+
// Column headings used in the report table, keyed by question type.
// A type without an entry here is shown under its raw type name.
var QUESTION_TYPE_LABELS = {
  "single-session-user": "Single-session (user)",
  "single-session-assistant": "Single-session (asst.)",
  "single-session-preference": "Single-session (pref.)",
  "multi-session": "Multi-session",
  "temporal-reasoning": "Temporal reasoning",
  "knowledge-update": "Knowledge update"
};
|
|
1351
|
+
/**
 * Format a fraction as a percentage with one decimal place.
 *
 * @param {number} n - Fraction (0.5 means 50%).
 * @returns {string} e.g. "50.0%".
 */
function pct2(n) {
  const percent = n * 100;
  return `${percent.toFixed(1)}%`;
}
|
|
1354
|
+
/**
 * Format a number using en-US locale conventions.
 *
 * @param {number} n - Number to format.
 * @returns {string} e.g. "1,234,567".
 */
function fmtNum2(n) {
  const locale = "en-US";
  return n.toLocaleString(locale);
}
|
|
1357
|
+
/**
 * Estimate the cost of a run from its total token counts.
 *
 * @param {{totalInputTokens: number, totalOutputTokens: number}} tokens - Token totals.
 * @param {{inputPer1M: number, outputPer1M: number}} [prices] - Prices per million tokens.
 * @returns {string} "$" plus the cost with four decimals, or an em dash when no prices are given.
 */
function estimateCost2(tokens, prices) {
  if (!prices) {
    return "\u2014";
  }
  const inputCost = tokens.totalInputTokens / 1e6 * prices.inputPer1M;
  const outputCost = tokens.totalOutputTokens / 1e6 * prices.outputPer1M;
  const cost = inputCost + outputCost;
  return `$${cost.toFixed(4)}`;
}
|
|
1362
|
+
/**
 * Format an accuracy record as "<pct> (<correct>/<total>)".
 *
 * @param {{accuracy: number, correct: number, total: number}} [s] - Accuracy record.
 * @returns {string} Formatted cell, or an em dash when the record is missing or has no samples.
 */
function fmtStat(s) {
  const hasSamples = Boolean(s) && s.total !== 0;
  if (!hasSamples) {
    return "\u2014";
  }
  const { accuracy, correct, total } = s;
  return `${pct2(accuracy)} (${correct}/${total})`;
}
|
|
1366
|
+
/**
 * Render one or more LongMemEval reports as a Markdown document: a results
 * table (one row per report), a per-report run configuration section, and
 * fixed methodology notes.
 *
 * @param {Array<object>} reports - Reports as returned by the benchmark runner.
 * @param {{inputPer1M: number, outputPer1M: number}} [prices] - Prices used for the cost column.
 * @returns {string} Markdown text. With no reports, only the heading block and "_No results yet._".
 */
function renderLongMemEvalReportMarkdown(reports, prices) {
  const lines = [
    "# LongMemEval Benchmark Results",
    "",
    "Dataset: [LongMemEval](https://github.com/xiaowu0162/LongMemEval) (Wu et al.) \xB7 MIT License",
    "Raw data is not redistributed. Only aggregate results are published here.",
    ""
  ];
  if (reports.length === 0) {
    lines.push("_No results yet._");
    return lines.join("\n");
  }

  // Column order: the known types first (in this fixed order), then any other
  // type seen in the reports, alphabetically.
  const seenTypes = new Set(reports.flatMap((report) => Object.keys(report.byType)));
  const canonicalOrder = [
    "single-session-user",
    "single-session-assistant",
    "single-session-preference",
    "multi-session",
    "temporal-reasoning",
    "knowledge-update"
  ];
  const knownTypes = canonicalOrder.filter((t) => seenTypes.has(t));
  const extraTypes = [...seenTypes].sort().filter((t) => !knownTypes.includes(t));
  const sortedTypes = [...knownTypes, ...extraTypes];

  const headers = [
    "System / Mode",
    ...sortedTypes.map((t) => QUESTION_TYPE_LABELS[t] ?? t),
    "Overall accuracy",
    "Abstention accuracy",
    "Tokens/query",
    "Est. cost/run",
    "Answer model",
    "Judge model",
    "topK",
    "n-Q",
    "Seed",
    "Dataset provenance"
  ];
  const toTableRow = (cells) => `| ${cells.join(" | ")} |`;
  lines.push("## Results", "", toTableRow(headers), toTableRow(headers.map(() => "---")));

  for (const report of reports) {
    const cfg = report.config;
    const questionCount = report.questions.length;
    let tokensPerQuery = 0;
    if (questionCount > 0) {
      tokensPerQuery = Math.round(
        (report.tokens.totalInputTokens + report.tokens.totalOutputTokens) / questionCount
      );
    }
    const provenance = `${cfg.datasetSource.url.replace("https://", "")} @ ${cfg.datasetSource.snapshotSha.slice(0, 8)}`;
    lines.push(
      toTableRow([
        `${cfg.answerModelId} / ${cfg.mode}`,
        ...sortedTypes.map((t) => fmtStat(report.byType[t])),
        fmtStat(report.overall),
        fmtStat(report.abstentionAccuracy),
        fmtNum2(tokensPerQuery),
        estimateCost2(report.tokens, prices),
        cfg.answerModelId,
        cfg.judgeModelId,
        cfg.mode === "memory" ? String(cfg.topK) : "\u2014",
        fmtNum2(report.config.questionsRun),
        String(cfg.seed),
        provenance
      ])
    );
  }

  lines.push("", "## Run Configuration", "");
  for (const report of reports) {
    const cfg = report.config;
    lines.push(
      `### ${cfg.answerModelId} / ${cfg.mode}`,
      "",
      `- **Mode**: ${cfg.mode}`,
      `- **Answer model**: ${cfg.answerModelId}`,
      `- **Judge model**: ${cfg.judgeModelId}`
    );
    if (cfg.mode === "memory") {
      lines.push(`- **topK**: ${cfg.topK}`);
    }
    lines.push(
      `- **Dataset source**: ${cfg.datasetSource.url}`,
      `- **Dataset snapshot SHA**: \`${cfg.datasetSource.snapshotSha}\``,
      `- **Dataset file**: ${cfg.datasetSource.file}`,
      `- **Dataset license**: ${cfg.datasetSource.license}`,
      `- **Seed**: ${cfg.seed}`,
      `- **Types**: ${cfg.types.join(", ") || "all"}`,
      `- **Questions run**: ${cfg.questionsRun}`,
      `- **Wall-clock**: ${(report.wallClockMs / 1e3).toFixed(1)}s`,
      `- **Errors**: ${report.errorCount}`,
      `- **Tokens** (in/out): ${fmtNum2(report.tokens.totalInputTokens)} / ${fmtNum2(report.tokens.totalOutputTokens)}`,
      ""
    );
  }

  lines.push(
    "## Methodology Notes",
    "",
    "These results were produced using the Eidentic LongMemEval fair-run harness. The following rules apply:",
    "",
    "1. **Per-question memory scope.** Each question has its own haystack (~50 sessions on average). A fresh Memory instance is created per question; no cross-question contamination.",
    "2. **Dual-granularity ingest.** Each turn is ingested with its session date in the text (temporally anchored). An additional session-level chunk entry captures multi-turn context.",
    "3. **Current date in prompt.** The `question_date` is passed to the answer prompt so temporal questions can reason about recency.",
    "4. **topK \u2264 10 in memory mode.** Larger topK values trivialise retrieval quality and are not permitted.",
    "5. **Full-context baseline is required** alongside any memory-mode result.",
    "6. **Judge is strict**: a model answer is correct only when it contains the gold answer's specific information. Vague/topical-only answers are wrong. Equivalent date expressions for the same date/duration are correct (temporal-reasoning type).",
    "7. **Abstention questions** (not present in longmemeval_s.json standard split): correct = model declined / said no-info / identified a flawed premise; fabricating a specific answer = wrong. Abstention accuracy is reported separately and not folded into overall accuracy.",
    "8. **Dataset license**: MIT \u2014 raw data is not redistributed; only aggregate results are published.",
    "",
    "> Per-type question counts in longmemeval_s.json (500 total):",
    "> single-session-user 70, single-session-assistant 56, single-session-preference 30,",
    "> multi-session 133, temporal-reasoning 133, knowledge-update 78.",
    "> No abstention variants in the standard _s split.",
    ""
  );
  return lines.join("\n");
}
|
|
1502
|
+
|
|
936
1503
|
// src/write-quality.ts
|
|
937
1504
|
var CONTRADICTION_FIXTURES = [
|
|
938
1505
|
{
|
|
@@ -1310,7 +1877,7 @@ async function runTemporalBench(memory, dataset, opts = {}) {
|
|
|
1310
1877
|
}
|
|
1311
1878
|
|
|
1312
1879
|
// src/datasets/temporal.ts
|
|
1313
|
-
function
|
|
1880
|
+
function makeRng3(seed) {
|
|
1314
1881
|
let s = seed >>> 0;
|
|
1315
1882
|
if (s === 0) s = 1;
|
|
1316
1883
|
return () => {
|
|
@@ -1393,7 +1960,7 @@ function syntheticTemporalDataset(opts = {}) {
|
|
|
1393
1960
|
const entityCount = opts.entityCount ?? 4;
|
|
1394
1961
|
const seed = opts.seed ?? 42;
|
|
1395
1962
|
const changesPerProperty = opts.changesPerProperty ?? 3;
|
|
1396
|
-
const rng =
|
|
1963
|
+
const rng = makeRng3(seed);
|
|
1397
1964
|
const entities = [];
|
|
1398
1965
|
const asserts = [];
|
|
1399
1966
|
const questions = [];
|
|
@@ -1480,16 +2047,21 @@ export {
|
|
|
1480
2047
|
CONTRADICTION_FIXTURES,
|
|
1481
2048
|
JUNK_STREAM_FIXTURES,
|
|
1482
2049
|
LOCOMO_SOURCE_SHA,
|
|
2050
|
+
LONGMEMEVAL_SOURCE,
|
|
1483
2051
|
factRecall,
|
|
1484
2052
|
loadLoCoMo,
|
|
1485
2053
|
loadLoCoMo2 as loadLoCoMoLegacy,
|
|
1486
2054
|
loadLongMemEval,
|
|
2055
|
+
loadLongMemEval2 as loadLongMemEvalLegacy,
|
|
1487
2056
|
normalizeText,
|
|
1488
2057
|
normalizedIncludes,
|
|
2058
|
+
parseLmeDateTimeString,
|
|
1489
2059
|
recallAtK,
|
|
1490
2060
|
renderLocomoReportMarkdown,
|
|
2061
|
+
renderLongMemEvalReportMarkdown,
|
|
1491
2062
|
resolveEvidence,
|
|
1492
2063
|
runLocomoBench,
|
|
2064
|
+
runLongMemEvalBench,
|
|
1493
2065
|
runMemoryBench,
|
|
1494
2066
|
runTemporalBench,
|
|
1495
2067
|
runWriteQualityBench,
|
package/package.json
CHANGED
|
@@ -1,6 +1,6 @@
|
|
|
1
1
|
{
|
|
2
2
|
"name": "@eidentic/bench",
|
|
3
|
-
"version": "0.1.
|
|
3
|
+
"version": "0.1.3",
|
|
4
4
|
"type": "module",
|
|
5
5
|
"license": "Apache-2.0",
|
|
6
6
|
"publishConfig": {
|
|
@@ -28,9 +28,9 @@
|
|
|
28
28
|
"README.md"
|
|
29
29
|
],
|
|
30
30
|
"dependencies": {
|
|
31
|
-
"@eidentic/
|
|
32
|
-
"@eidentic/
|
|
33
|
-
"@eidentic/
|
|
31
|
+
"@eidentic/types": "0.2.1",
|
|
32
|
+
"@eidentic/memory": "0.1.3",
|
|
33
|
+
"@eidentic/eval": "0.1.3"
|
|
34
34
|
},
|
|
35
35
|
"description": "Memory benchmark harness for Eidentic — run LongMemEval / LoCoMo / temporal-reasoning benchmarks with deterministic recall metrics.",
|
|
36
36
|
"keywords": [
|