@eidentic/bench 0.1.1 → 0.1.2

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
package/dist/index.cjs CHANGED
@@ -136,22 +136,132 @@ var init_locomo_loader = __esm({
136
136
  }
137
137
  });
138
138
 
139
+ // src/lme-loader.ts
140
+ var lme_loader_exports = {};
141
+ __export(lme_loader_exports, {
142
+ LONGMEMEVAL_SOURCE: () => LONGMEMEVAL_SOURCE,
143
+ loadLongMemEval: () => loadLongMemEval2,
144
+ parseLmeDateTimeString: () => parseLmeDateTimeString
145
+ });
146
/**
 * Guards a loader against oversized input files.
 *
 * Stats `filePath` and rejects when the file cannot be stat'ed or when its
 * size in bytes is greater than `maxBytes`. Resolves with no value otherwise.
 *
 * @param {string} filePath - Path of the file to check.
 * @param {number} maxBytes - Largest accepted size in bytes.
 * @returns {Promise<void>}
 * @throws {Error} "cannot stat file" when stat fails; "exceeds the ... MiB cap"
 *   when the file is larger than `maxBytes`.
 */
async function assertFileSize3(filePath, maxBytes) {
  const BYTES_PER_MIB = 1024 * 1024;
  const readSize = async () => {
    try {
      const info = await (0, import_promises4.stat)(filePath);
      return info.size;
    } catch (err) {
      throw new Error(
        `bench loader: cannot stat file "${filePath}": ${err.message}`
      );
    }
  };
  const fileSize = await readSize();
  const overCap = fileSize > maxBytes;
  if (!overCap) {
    return;
  }
  // Actual size is shown with one decimal, the cap as a whole number of MiB.
  const mb = (fileSize / BYTES_PER_MIB).toFixed(1);
  const capMb = (maxBytes / BYTES_PER_MIB).toFixed(0);
  throw new Error(
    `bench loader: file "${filePath}" is ${mb} MiB, which exceeds the ${capMb} MiB cap. Pass a larger maxBytes option if this is intentional.`
  );
}
164
/**
 * Converts a LongMemEval date string such as "2023/05/20 (Sat) 02:21" into
 * epoch milliseconds.
 *
 * Steps: drop the first parenthesised alphabetic token (the weekday), turn a
 * leading "YYYY/MM/DD" into "YYYY-MM-DD", and join date and time with "T" so
 * the string matches the ECMAScript date-time format instead of relying on
 * implementation-defined parsing. Per that format, a date-time without a UTC
 * offset is read as local time and a date-only string as UTC.
 *
 * @param {unknown} raw - Raw date string from the dataset.
 * @returns {number} Epoch milliseconds, or 0 when `raw` is empty, is not a
 *   string, or cannot be parsed.
 */
function parseLmeDateTimeString(raw) {
  // Non-string input (null, numbers, objects) has no date to parse; report 0
  // like any other unparseable value rather than throwing on `.replace`.
  if (typeof raw !== "string" || raw === "") return 0;
  const cleaned = raw.replace(/\s*\([A-Za-z]+\)\s*/, " ").trim();
  const dashed = cleaned.replace(/^(\d{4})\/(\d{2})\/(\d{2})/, "$1-$2-$3");
  // "YYYY-MM-DD HH:MM[:SS]" -> "YYYY-MM-DDTHH:MM[:SS]"; other shapes are left
  // untouched and handed to Date.parse as they are.
  const iso = dashed.replace(
    /^(\d{4}-\d{2}-\d{2})\s+(\d{2}:\d{2}(?::\d{2})?)$/,
    "$1T$2"
  );
  const ms = Date.parse(iso);
  return Number.isNaN(ms) ? 0 : ms;
}
172
/**
 * Strips the "_abs" suffix from a question type.
 *
 * @param {string} rawType - Question type as stored in the dataset.
 * @returns {string} `rawType` without a trailing "_abs", or `rawType`
 *   unchanged when it has no such suffix.
 */
function extractBaseType(rawType) {
  const suffix = "_abs";
  return rawType.endsWith(suffix) ? rawType.slice(0, -suffix.length) : rawType;
}
178
/**
 * Builds a normalised session record from raw dataset turns.
 *
 * @param {string} id - Session identifier.
 * @param {string} dateTime - Raw session date string; kept as-is and also
 *   parsed into `dateTimeMs` via `parseLmeDateTimeString`.
 * @param {Array<object>} rawTurns - Raw turns carrying `role`, `content` and
 *   `has_answer` fields.
 * @returns {{id: string, dateTime: string, dateTimeMs: number, turns: Array}}
 */
function parseSession(id, dateTime, rawTurns) {
  const toTurn = (rawTurn) => {
    // Anything that is not explicitly "assistant" is treated as a user turn.
    const role = rawTurn.role === "assistant" ? "assistant" : "user";
    return {
      role,
      content: rawTurn.content ?? "",
      hasAnswer: rawTurn.has_answer === true
    };
  };
  const turns = rawTurns.map(toTurn);
  const dateTimeMs = parseLmeDateTimeString(dateTime);
  return { id, dateTime, dateTimeMs, turns };
}
191
/**
 * Loads a LongMemEval JSON file and normalises it into `{ questions }`.
 *
 * The file size is checked first against `opts.maxBytes` (falling back to
 * DEFAULT_MAX_BYTES3). The JSON root must be an array; each entry becomes one
 * question whose sessions are sorted by ascending `dateTimeMs`.
 *
 * @param {string} jsonPath - Path of the dataset JSON file.
 * @param {{maxBytes?: number}} [opts] - Optional size cap override.
 * @returns {Promise<{questions: Array<object>}>}
 * @throws {Error} When the size check fails or the JSON root is not an array.
 */
async function loadLongMemEval2(jsonPath, opts) {
  const maxBytes = opts?.maxBytes ?? DEFAULT_MAX_BYTES3;
  await assertFileSize3(jsonPath, maxBytes);
  const fileText = await (0, import_promises4.readFile)(jsonPath, "utf-8");
  const raw = JSON.parse(fileText);
  if (!Array.isArray(raw)) {
    throw new Error(
      `bench loader: expected the LongMemEval JSON root to be an array, but got ${typeof raw}. Did you pass the correct file?`
    );
  }
  // Missing or malformed list fields are treated as empty lists.
  const listOrEmpty = (value) => (Array.isArray(value) ? value : []);
  const toQuestion = (q, i) => {
    const rawType = q.question_type ?? "single-session-user";
    const dates = listOrEmpty(q.haystack_dates);
    const sessionIds = listOrEmpty(q.haystack_session_ids);
    const sessions = listOrEmpty(q.haystack_sessions).map(
      (turns, idx) => parseSession(
        sessionIds[idx] ?? `sess-${idx}`,
        dates[idx] ?? "",
        listOrEmpty(turns)
      )
    );
    sessions.sort((a, b) => a.dateTimeMs - b.dateTimeMs);
    const questionDate = q.question_date ?? "";
    return {
      id: q.question_id ?? String(i),
      type: rawType,
      baseType: extractBaseType(rawType),
      isAbstention: rawType.endsWith("_abs"),
      question: q.question ?? "",
      answer: q.answer ?? "",
      questionDate,
      questionDateMs: parseLmeDateTimeString(questionDate),
      sessions,
      answerSessionIds: listOrEmpty(q.answer_session_ids)
    };
  };
  return { questions: raw.map(toQuestion) };
}
229
+ var import_promises4, LONGMEMEVAL_SOURCE, DEFAULT_MAX_BYTES3;
230
+ var init_lme_loader = __esm({
231
+ "src/lme-loader.ts"() {
232
+ "use strict";
233
+ import_promises4 = require("node:fs/promises");
234
+ LONGMEMEVAL_SOURCE = {
235
+ url: "https://huggingface.co/datasets/xiaowu0162/longmemeval",
236
+ snapshotSha: "2ec2a557f339b6c0369619b1ed5793734cc87533",
237
+ file: "longmemeval_s",
238
+ license: "MIT"
239
+ };
240
+ DEFAULT_MAX_BYTES3 = 512 * 1024 * 1024;
241
+ }
242
+ });
243
+
139
244
  // src/index.ts
140
245
  var index_exports = {};
141
246
  __export(index_exports, {
142
247
  CONTRADICTION_FIXTURES: () => CONTRADICTION_FIXTURES,
143
248
  JUNK_STREAM_FIXTURES: () => JUNK_STREAM_FIXTURES,
144
249
  LOCOMO_SOURCE_SHA: () => LOCOMO_SOURCE_SHA,
250
+ LONGMEMEVAL_SOURCE: () => LONGMEMEVAL_SOURCE,
145
251
  factRecall: () => factRecall,
146
252
  loadLoCoMo: () => loadLoCoMo2,
147
253
  loadLoCoMoLegacy: () => loadLoCoMo,
148
- loadLongMemEval: () => loadLongMemEval,
254
+ loadLongMemEval: () => loadLongMemEval2,
255
+ loadLongMemEvalLegacy: () => loadLongMemEval,
149
256
  normalizeText: () => normalizeText,
150
257
  normalizedIncludes: () => normalizedIncludes,
258
+ parseLmeDateTimeString: () => parseLmeDateTimeString,
151
259
  recallAtK: () => recallAtK,
152
260
  renderLocomoReportMarkdown: () => renderLocomoReportMarkdown,
261
+ renderLongMemEvalReportMarkdown: () => renderLongMemEvalReportMarkdown,
153
262
  resolveEvidence: () => resolveEvidence,
154
263
  runLocomoBench: () => runLocomoBench,
264
+ runLongMemEvalBench: () => runLongMemEvalBench,
155
265
  runMemoryBench: () => runMemoryBench,
156
266
  runTemporalBench: () => runTemporalBench,
157
267
  runWriteQualityBench: () => runWriteQualityBench,
@@ -1094,6 +1204,572 @@ function resolveEvidence(sample, diaIds) {
1094
1204
  return results;
1095
1205
  }
1096
1206
 
1207
+ // src/index.ts
1208
+ init_lme_loader();
1209
+
1210
+ // src/lme-run.ts
1211
+ var import_promises5 = require("node:fs/promises");
1212
+ var import_node_fs2 = require("node:fs");
1213
+ init_lme_loader();
1214
/**
 * Creates a deterministic pseudo-random generator from a 32-bit seed.
 *
 * The state is advanced with three shift/xor steps (13 left, 17 right,
 * 5 left) and scaled by 2^32, so each call returns a value in [0, 1).
 *
 * @param {number} seed - Seed; coerced to an unsigned 32-bit integer. A zero
 *   seed is replaced by 1 because a zero state would never change.
 * @returns {() => number} Function returning the next pseudo-random number.
 */
function makeRng2(seed) {
  const UINT32_RANGE = 4294967296;
  let state = seed >>> 0;
  if (state === 0) {
    state = 1;
  }
  return function next() {
    state ^= state << 13;
    state ^= state >>> 17;
    state ^= state << 5;
    // Bring the state back into the unsigned 32-bit range before scaling.
    state >>>= 0;
    return state / UINT32_RANGE;
  };
}
1225
/**
 * Returns a shuffled copy of `arr` using a Fisher-Yates pass driven by `rng`.
 *
 * @param {Array} arr - Items to shuffle; not modified.
 * @param {() => number} rng - Source of numbers in [0, 1).
 * @returns {Array} New array holding the same items in shuffled order.
 */
function seededShuffle2(arr, rng) {
  const shuffled = [...arr];
  let last = shuffled.length - 1;
  while (last > 0) {
    // Pick a position in [0, last] and swap it into the tail slot.
    const pick = Math.floor(rng() * (last + 1));
    const held = shuffled[last];
    shuffled[last] = shuffled[pick];
    shuffled[pick] = held;
    last -= 1;
  }
  return shuffled;
}
1233
+ var DEFAULT_FULL_CONTEXT_MAX_CHARS = 48e4;
1234
// Case-insensitive phrases that indicate the model declined to answer.
var DECLINE_PATTERNS2 = /\bno information available\b|\bi (don'?t|do not) (know|have)\b|\bcannot (find|answer|determine|provide)\b|\bnot (mentioned|stated|provided|found|available)\b|\bunable to (find|answer|determine)\b|\bno relevant (information|data)\b/i;
/**
 * Reports whether an answer contains one of the decline phrases.
 *
 * @param {string} answer - Model answer text.
 * @returns {boolean} True when any DECLINE_PATTERNS2 alternative matches.
 */
function appearsToDecline2(answer) {
  const hit = DECLINE_PATTERNS2.exec(answer);
  return hit !== null;
}
1238
// Closing instruction shared by the answer prompts.
var ANSWER_INSTRUCTION2 = "Answer the question concisely and specifically based only on the information provided. If the provided context does not contain the answer, reply exactly: No information available";
/**
 * Builds the answer prompt for memory mode from retrieved snippets.
 *
 * @param {string[]} snippets - Retrieved snippet texts; rendered as a list
 *   numbered from 1. An empty list yields a "(no relevant context retrieved)"
 *   placeholder.
 * @param {string} question - Question text.
 * @param {string} currentDate - When truthy, added as a "Current date:" line
 *   directly after the context.
 * @returns {string} The complete prompt.
 */
function buildMemoryAnswerPrompt2(snippets, question, currentDate) {
  let context = "(no relevant context retrieved)";
  if (snippets.length > 0) {
    context = snippets.map((snippet, idx) => `[${idx + 1}] ${snippet}`).join("\n\n");
  }
  const promptLines = ["Context from conversation history:", `${context}`];
  if (currentDate) {
    promptLines.push(`Current date: ${currentDate}`);
  }
  promptLines.push("", `Question: ${question}`, "", `${ANSWER_INSTRUCTION2}`);
  return promptLines.join("\n");
}
1250
/**
 * Builds the answer prompt for full-context mode.
 *
 * @param {string} haystackText - Rendered conversation history.
 * @param {string} question - Question text.
 * @param {string} currentDate - When truthy, added as a "Current date:" line
 *   directly after the history.
 * @returns {string} The complete prompt, ending with ANSWER_INSTRUCTION2.
 */
function buildFullContextAnswerPrompt2(haystackText, question, currentDate) {
  const promptLines = ["Conversation history:", `${haystackText}`];
  if (currentDate) {
    promptLines.push(`Current date: ${currentDate}`);
  }
  promptLines.push("", `Question: ${question}`, "", `${ANSWER_INSTRUCTION2}`);
  return promptLines.join("\n");
}
1260
/**
 * Builds the grading prompt sent to the judge model.
 *
 * Abstention questions get a prompt that rewards declining and omits the gold
 * answer. All other questions get a strict gold-answer comparison prompt, with
 * an extra note on equivalent date expressions when `questionType` is
 * "temporal-reasoning".
 *
 * @param {string} question - Question text.
 * @param {string} goldAnswer - Reference answer (unused for abstention).
 * @param {string} modelAnswer - Answer produced by the answer model.
 * @param {string} questionType - Question type used for the temporal note.
 * @param {boolean} isAbstention - Whether this is an abstention question.
 * @returns {string} The judge prompt.
 */
function buildJudgePrompt2(question, goldAnswer, modelAnswer, questionType, isAbstention) {
  if (isAbstention) {
    const abstentionLines = [
      `You are a strict judge. This question requires the model to recognize that the information is not present in the conversation history, or that the premise is flawed. A CORRECT response means the model abstained, said something equivalent to "No information available", "I don't know", or identified that the premise is wrong. An INCORRECT response means the model fabricated a specific, concrete answer.`,
      "",
      `Question: ${question}`,
      `Model answer: ${modelAnswer}`,
      "",
      `Respond with JSON only: {"correct": true} if the model appropriately abstained, {"correct": false} if it fabricated a specific answer.`
    ];
    return abstentionLines.join("\n");
  }
  let temporalNote = "";
  if (questionType === "temporal-reasoning") {
    temporalNote = "\nFor time/date questions: equivalent date expressions referring to the same date or duration are correct.";
  }
  const gradingLines = [
    "You are a strict judge evaluating whether a model answer is correct.",
    `CORRECT: the model answer contains the specific information in the gold answer (exact match or clear paraphrase is fine).${temporalNote}`,
    `WRONG: the model answer is vague, only topically related, contradicts the gold answer, or says "no information" when a specific answer exists.`,
    "",
    `Question: ${question}`,
    `Gold answer: ${goldAnswer}`,
    `Model answer: ${modelAnswer}`,
    "",
    `Respond with JSON only: {"correct": true} or {"correct": false}`
  ];
  return gradingLines.join("\n");
}
1280
/**
 * Renders sessions as plain text for a full-context prompt.
 *
 * Each session becomes a "Session N — <dateTime or id>" header line, one
 * "[User]: ..." or "[Assistant]: ..." line per turn, and a blank separator
 * line. Leading and trailing whitespace of the whole text is trimmed.
 *
 * @param {Array<{id: string, dateTime: string, turns: Array}>} sessions
 * @returns {string} Rendered text; empty string for no sessions.
 */
function renderHaystack(sessions) {
  const renderedSessions = sessions.map((sess, idx) => {
    const header = `Session ${idx + 1} \u2014 ${sess.dateTime || sess.id}`;
    const turnLines = sess.turns.map((turn) => {
      // Any role other than "user" is shown as Assistant.
      const speaker = turn.role === "user" ? "User" : "Assistant";
      return `[${speaker}]: ${turn.content}`;
    });
    // The trailing empty entry produces the blank line after each session.
    return [header, ...turnLines, ""].join("\n");
  });
  return renderedSessions.join("\n").trim();
}
1294
/**
 * Renders sessions while keeping the text within `maxChars` characters.
 *
 * When the full rendering is too long, sessions are dropped from the front of
 * the list one at a time until the remainder fits. If even the last session
 * alone is too long, its rendering is cut to `maxChars` characters.
 *
 * @param {Array<object>} sessions - Sessions in the order they are rendered.
 * @param {number} maxChars - Maximum length of the returned text.
 * @returns {{text: string, truncated: boolean}} `truncated` is true whenever
 *   anything was dropped or cut.
 */
function renderHaystackCapped(sessions, maxChars) {
  const complete = renderHaystack(sessions);
  if (complete.length <= maxChars) {
    return { text: complete, truncated: false };
  }
  for (let firstKept = 1; firstKept < sessions.length; firstKept++) {
    const candidate = renderHaystack(sessions.slice(firstKept));
    if (candidate.length <= maxChars) {
      return { text: candidate, truncated: true };
    }
  }
  // Nothing fits whole: hard-cut the rendering of the final session.
  const lastOnly = renderHaystack(sessions.slice(-1));
  return { text: lastOnly.slice(0, maxChars), truncated: true };
}
1305
// Maximum number of characters of one text handed to memory ingest.
var EMBED_CHAR_CAP = 2e4;
/**
 * Limits a text to EMBED_CHAR_CAP characters.
 *
 * @param {string} text - Text to cap.
 * @returns {string} `text` itself when short enough, otherwise its first
 *   EMBED_CHAR_CAP characters.
 */
function capForEmbedding(text) {
  if (text.length <= EMBED_CHAR_CAP) {
    return text;
  }
  return text.slice(0, EMBED_CHAR_CAP);
}
1309
/**
 * Ingests one question's haystack into a memory instance.
 *
 * For every session it emits one event per turn (the text carries the session
 * label and the speaker) followed by one session-level "chunk" event holding
 * the whole session text. All texts pass through `capForEmbedding`. The events
 * are handed to `memory.ingest` in a single call.
 *
 * @param {object} question - Question with `id` and `sessions`.
 * @param {{ingest: Function}} memory - Memory instance receiving the events.
 * @param {object} scope - Scope attached to every event.
 * @returns {Promise<void>}
 */
async function ingestQuestionIntoMemory(question, memory, scope) {
  const speakerOf = (turn) => (turn.role === "user" ? "User" : "Assistant");
  const events = [];
  question.sessions.forEach((sess, sessIdx) => {
    const sessLabel = `Session ${sessIdx + 1} \u2014 ${sess.dateTime || sess.id}`;
    // A zero timestamp (unparseable date) is reported as undefined.
    const ingestedAt = sess.dateTimeMs || void 0;
    sess.turns.forEach((turn, turnIdx) => {
      events.push({
        id: `${question.id}:sess${sessIdx}:turn${turnIdx}`,
        scope,
        text: capForEmbedding(`[${sessLabel}] [${speakerOf(turn)}]: ${turn.content}`),
        metadata: {
          sessionId: sess.id,
          sessionIndex: sessIdx,
          turnRole: turn.role,
          ingestedAt
        }
      });
    });
    const turnLines = sess.turns.map((turn) => `[${speakerOf(turn)}]: ${turn.content}`);
    const sessionText = [sessLabel, ...turnLines].join("\n");
    events.push({
      id: `${question.id}:sess${sessIdx}:chunk`,
      scope,
      text: capForEmbedding(sessionText),
      metadata: {
        sessionId: sess.id,
        sessionIndex: sessIdx,
        ingestedAt
      }
    });
  });
  await memory.ingest(events);
}
1346
/**
 * Sends a grading prompt to the judge model and extracts the verdict.
 *
 * The structured `response.object.correct` boolean is used when present.
 * Otherwise the lower-cased text blocks are searched for `"correct": true` or
 * `"correct": false`; failing that, the verdict is true when the text
 * contains "true" anywhere.
 *
 * @param {{complete: Function}} judgeModel - Model exposing `complete`.
 * @param {string} prompt - Judge prompt.
 * @returns {Promise<{correct: boolean, inputTokens: number, outputTokens: number}>}
 *   Token counts default to 0 when the response reports no usage.
 */
async function callJudge2(judgeModel, prompt) {
  const verdictSchema = {
    type: "object",
    properties: { correct: { type: "boolean" } },
    required: ["correct"],
    // OpenAI strict structured-output mode requires this to be explicit.
    additionalProperties: false
  };
  const response = await judgeModel.complete({
    messages: [{ role: "user", content: prompt }],
    tools: [],
    outputSchema: verdictSchema
  });
  const verdictFromText = () => {
    const text = response.content
      .filter((block) => block.type === "text")
      .map((block) => block.text ?? "")
      .join("")
      .toLowerCase()
      .trim();
    if (/"correct"\s*:\s*true/i.test(text)) return true;
    if (/"correct"\s*:\s*false/i.test(text)) return false;
    return text.includes("true");
  };
  const structured = response.object;
  const hasStructuredVerdict = Boolean(structured) && typeof structured.correct === "boolean";
  const correct = hasStructuredVerdict ? structured.correct : verdictFromText();
  const usage = response.usage;
  return {
    correct,
    inputTokens: usage?.inputTokens ?? 0,
    outputTokens: usage?.outputTokens ?? 0
  };
}
1373
/**
 * Reads a JSONL checkpoint file and collects the ids of finished questions.
 *
 * @param {string} path - Checkpoint file path.
 * @returns {Promise<Set<string>>} Every truthy `questionId` found in the
 *   file; an empty set when the file does not exist.
 */
async function loadCheckpoint2(path) {
  const finishedIds = new Set();
  if (!(0, import_node_fs2.existsSync)(path)) {
    return finishedIds;
  }
  const contents = await (0, import_promises5.readFile)(path, "utf-8");
  const candidates = contents
    .split("\n")
    .map((line) => line.trim())
    .filter((line) => line.length > 0);
  for (const candidate of candidates) {
    try {
      const row = JSON.parse(candidate);
      if (row.questionId) finishedIds.add(row.questionId);
    } catch {
      // Lines that are not usable JSON rows are skipped on purpose.
    }
  }
  return finishedIds;
}
1388
/**
 * Appends one result row to the checkpoint file as a single JSON line.
 *
 * @param {string} path - Checkpoint file path.
 * @param {object} row - Row to serialise with JSON.stringify.
 * @returns {Promise<void>}
 */
async function appendCheckpointRow2(path, row) {
  const serialized = JSON.stringify(row);
  const line = serialized + "\n";
  await (0, import_promises5.appendFile)(path, line, "utf-8");
}
1391
/**
 * Runs the LongMemEval benchmark: answers each selected question with
 * `answerModel`, grades the answer with `judgeModel`, and aggregates accuracy
 * and token usage into a report.
 *
 * Options read from `opts`:
 * - `answerModel`, `judgeModel`: objects exposing `complete(...)`; their
 *   `modelId` is reported when present.
 * - `mode`: "memory" builds the prompt from snippets retrieved out of a
 *   per-question memory created by `opts.memoryFactory(questionId)`; any other
 *   value sends the (capped) rendered haystack as full context.
 * - `dataset` or `dataPath`: an already loaded dataset, or a JSON path to load.
 * - `types`: optional filter matched against a question's `type` or `baseType`.
 * - `questionLimit`: optional cap; a seeded shuffle picks the subset.
 * - `seed` (default 42), `concurrency` (default 1), `topK` (default 10, never
 *   more than 10), `fullContextMaxChars`.
 * - `onProgress(done, total)`: called after each question processed in this
 *   run; `total` counts the selected questions not already checkpointed.
 * - `checkpointPath`: JSONL file; questions already recorded there are skipped
 *   and their stored rows are merged back into the report.
 *
 * A failure while answering or judging a question does not abort the run: the
 * question is recorded as incorrect with an `error` message and counted in
 * `errorCount`.
 *
 * @param {object} opts - Run options, see above.
 * @returns {Promise<object>} Report with `config`, `overall`, `byType`,
 *   optional `abstentionAccuracy`, `tokens`, `wallClockMs`, `questions`,
 *   `errorCount`.
 * @throws {Error} When mode is "memory" without `memoryFactory`, or when
 *   neither `dataPath` nor `dataset` is given.
 */
async function runLongMemEvalBench(opts) {
  const {
    answerModel,
    judgeModel,
    mode,
    types,
    questionLimit,
    seed = 42,
    concurrency = 1,
    onProgress,
    checkpointPath,
    fullContextMaxChars = DEFAULT_FULL_CONTEXT_MAX_CHARS
  } = opts;
  const topK = Math.min(opts.topK ?? 10, 10);
  if (mode === "memory" && !opts.memoryFactory) {
    throw new Error("runLongMemEvalBench: memoryFactory is required when mode='memory'");
  }
  let dataset;
  if (opts.dataset) {
    dataset = opts.dataset;
  } else if (opts.dataPath) {
    const { loadLongMemEval: loader } = await Promise.resolve().then(() => (init_lme_loader(), lme_loader_exports));
    dataset = await loader(opts.dataPath);
  } else {
    throw new Error("runLongMemEvalBench: either dataPath or dataset must be provided");
  }
  let questions = dataset.questions;
  if (types && types.length > 0) {
    questions = questions.filter((q) => types.includes(q.type) || types.includes(q.baseType));
  }
  if (questionLimit !== void 0 && questionLimit < questions.length) {
    questions = seededShuffle2(questions, makeRng2(seed)).slice(0, questionLimit);
  }
  const checkpoint = checkpointPath ? await loadCheckpoint2(checkpointPath) : /* @__PURE__ */ new Set();
  // Ids selected for this run; used to keep unrelated checkpoint rows out of
  // the report.
  const selectedIds = new Set(questions.map((q) => q.id));
  const results = [];
  let totalAnswerInputTokens = 0;
  let totalAnswerOutputTokens = 0;
  let totalJudgeInputTokens = 0;
  let totalJudgeOutputTokens = 0;
  // Embedding tokens are not measured by this runner; reported as 0.
  const ingestEmbedTokens = 0;
  let errorCount = 0;
  let done = 0;
  // Only questions that will actually be processed count towards progress. The
  // checkpoint may hold ids that are not part of the current selection.
  const total = questions.filter((q) => !checkpoint.has(q.id)).length;
  const startTime = Date.now();
  const processQuestion = async (q) => {
    if (checkpoint.has(q.id)) return;
    let modelAnswer = "";
    let answerIn = 0;
    let answerOut = 0;
    let judgeIn = 0;
    let judgeOut = 0;
    let correct = false;
    let contextTruncated = false;
    let errorMsg;
    try {
      let prompt;
      if (mode === "memory") {
        const memory = await opts.memoryFactory(q.id);
        const scope = { kind: "agent", agentId: `lme:${q.id}` };
        await ingestQuestionIntoMemory(q, memory, scope);
        const retrieved = await memory.retrieve({ text: q.question, scope, topK });
        const snippets = retrieved.snippets.map((s) => s.text);
        prompt = buildMemoryAnswerPrompt2(snippets, q.question, q.questionDate);
      } else {
        const { text, truncated } = renderHaystackCapped(q.sessions, fullContextMaxChars);
        contextTruncated = truncated;
        prompt = buildFullContextAnswerPrompt2(text, q.question, q.questionDate);
      }
      const resp = await answerModel.complete({
        messages: [{ role: "user", content: prompt }],
        tools: []
      });
      const textBlocks = resp.content.filter((b) => b.type === "text");
      modelAnswer = textBlocks.map((b) => b.text).join("").trim();
      answerIn = resp.usage?.inputTokens ?? 0;
      answerOut = resp.usage?.outputTokens ?? 0;
      const judgePrompt = buildJudgePrompt2(
        q.question,
        q.answer,
        modelAnswer,
        q.baseType,
        q.isAbstention
      );
      const judgeResult = await callJudge2(judgeModel, judgePrompt);
      correct = judgeResult.correct;
      judgeIn = judgeResult.inputTokens;
      judgeOut = judgeResult.outputTokens;
    } catch (err) {
      // Thrown values are not always Error instances; always keep a message so
      // the row carries an `error` field whenever errorCount is incremented.
      errorMsg = err instanceof Error ? err.message : String(err);
      errorCount++;
      correct = false;
    }
    const appearedToAbstain = appearsToDecline2(modelAnswer);
    const row = {
      questionId: q.id,
      questionType: q.type,
      isAbstention: q.isAbstention,
      question: q.question,
      goldAnswer: q.answer,
      modelAnswer,
      correct,
      appearedToAbstain,
      ...contextTruncated ? { contextTruncated } : {},
      ...errorMsg !== void 0 ? { error: errorMsg } : {},
      answerInputTokens: answerIn,
      answerOutputTokens: answerOut,
      judgeInputTokens: judgeIn,
      judgeOutputTokens: judgeOut
    };
    results.push(row);
    totalAnswerInputTokens += answerIn;
    totalAnswerOutputTokens += answerOut;
    totalJudgeInputTokens += judgeIn;
    totalJudgeOutputTokens += judgeOut;
    if (checkpointPath) {
      await appendCheckpointRow2(checkpointPath, row);
    }
    done++;
    if (onProgress) onProgress(done, total);
  };
  // Questions are started in batches of `concurrencyLimit`; a batch finishes
  // before the next one starts. A non-finite value falls back to 1 because it
  // would otherwise disable batching entirely.
  const concurrencyLimit = Number.isFinite(concurrency) ? Math.max(1, Math.floor(concurrency)) : 1;
  const pending = [];
  for (const q of questions) {
    const p = processQuestion(q);
    pending.push(p);
    if (pending.length >= concurrencyLimit) {
      await Promise.all(pending.splice(0, concurrencyLimit));
    }
  }
  if (pending.length > 0) await Promise.all(pending);
  if (checkpointPath && checkpoint.size > 0) {
    // Merge rows finished by an earlier run. Rows appended during this run are
    // also in the file but are not in `checkpoint`, so they are skipped here.
    const raw = await (0, import_promises5.readFile)(checkpointPath, "utf-8").catch(() => "");
    const mergedIds = /* @__PURE__ */ new Set();
    for (const line of raw.split("\n")) {
      const trimmed = line.trim();
      if (!trimmed) continue;
      try {
        const row = JSON.parse(trimmed);
        const rowId = row.questionId;
        // Keep only rows that were checkpointed before this run, belong to the
        // current selection, and have not been merged already.
        if (!checkpoint.has(rowId) || !selectedIds.has(rowId) || mergedIds.has(rowId)) {
          continue;
        }
        mergedIds.add(rowId);
        results.push(row);
        totalAnswerInputTokens += row.answerInputTokens ?? 0;
        totalAnswerOutputTokens += row.answerOutputTokens ?? 0;
        totalJudgeInputTokens += row.judgeInputTokens ?? 0;
        totalJudgeOutputTokens += row.judgeOutputTokens ?? 0;
        // Restored rows that failed earlier still count as errors.
        if (row.error !== void 0) errorCount++;
      } catch {
        // Unparseable checkpoint lines are skipped.
      }
    }
  }
  const byTypeMap = {};
  let overallCorrect = 0;
  let overallTotal = 0;
  let abstentionCorrect = 0;
  let abstentionTotal = 0;
  for (const row of results) {
    if (row.isAbstention) {
      // Abstention rows are tallied separately and stay out of overall/byType.
      abstentionTotal++;
      if (row.correct) abstentionCorrect++;
    } else {
      const bt = row.questionType.endsWith("_abs") ? row.questionType.slice(0, -4) : row.questionType;
      if (!byTypeMap[bt]) byTypeMap[bt] = { correct: 0, total: 0 };
      overallTotal++;
      if (row.correct) overallCorrect++;
      byTypeMap[bt].total++;
      if (row.correct) byTypeMap[bt].correct++;
    }
  }
  const byType = {};
  for (const [t, stats] of Object.entries(byTypeMap)) {
    byType[t] = {
      ...stats,
      accuracy: stats.total > 0 ? stats.correct / stats.total : 0
    };
  }
  const abstentionAccuracy = abstentionTotal > 0 ? {
    correct: abstentionCorrect,
    total: abstentionTotal,
    accuracy: abstentionCorrect / abstentionTotal
  } : void 0;
  const wallClockMs = Date.now() - startTime;
  const allTypes = [...new Set(questions.map((q) => q.type))].sort();
  return {
    config: {
      mode,
      topK,
      answerModelId: answerModel.modelId ?? "(unknown)",
      judgeModelId: judgeModel.modelId ?? "(unknown)",
      datasetSource: LONGMEMEVAL_SOURCE,
      seed,
      types: allTypes,
      questionsRun: results.length
    },
    overall: {
      correct: overallCorrect,
      total: overallTotal,
      accuracy: overallTotal > 0 ? overallCorrect / overallTotal : 0
    },
    byType,
    ...abstentionAccuracy !== void 0 ? { abstentionAccuracy } : {},
    tokens: {
      ingestEmbedTokens,
      answerInputTokens: totalAnswerInputTokens,
      answerOutputTokens: totalAnswerOutputTokens,
      judgeInputTokens: totalJudgeInputTokens,
      judgeOutputTokens: totalJudgeOutputTokens,
      totalInputTokens: totalAnswerInputTokens + totalJudgeInputTokens,
      totalOutputTokens: totalAnswerOutputTokens + totalJudgeOutputTokens
    },
    wallClockMs,
    questions: results,
    errorCount
  };
}
1611
+
1612
+ // src/lme-render.ts
1613
+ var QUESTION_TYPE_LABELS = {
1614
+ "single-session-user": "Single-session (user)",
1615
+ "single-session-assistant": "Single-session (asst.)",
1616
+ "single-session-preference": "Single-session (pref.)",
1617
+ "multi-session": "Multi-session",
1618
+ "temporal-reasoning": "Temporal reasoning",
1619
+ "knowledge-update": "Knowledge update"
1620
+ };
1621
/**
 * Formats a fraction as a percentage with one decimal place.
 *
 * @param {number} n - Fraction, e.g. 0.5.
 * @returns {string} Percentage text, e.g. "50.0%".
 */
function pct2(n) {
  const scaled = n * 100;
  return `${scaled.toFixed(1)}%`;
}
1624
/**
 * Formats a number using en-US locale conventions.
 *
 * @param {number} n - Number to format.
 * @returns {string} Result of `n.toLocaleString("en-US")`.
 */
function fmtNum2(n) {
  const locale = "en-US";
  const formatted = n.toLocaleString(locale);
  return formatted;
}
1627
/**
 * Estimates the dollar cost of a run from token totals and per-million prices.
 *
 * @param {{totalInputTokens: number, totalOutputTokens: number}} tokens
 * @param {{inputPer1M: number, outputPer1M: number}} [prices] - Prices per
 *   one million tokens.
 * @returns {string} "$" followed by the cost with four decimals, or an em
 *   dash when no prices are given.
 */
function estimateCost2(tokens, prices) {
  if (!prices) {
    return "\u2014";
  }
  const inputCost = tokens.totalInputTokens / 1e6 * prices.inputPer1M;
  const outputCost = tokens.totalOutputTokens / 1e6 * prices.outputPer1M;
  const cost = inputCost + outputCost;
  return `$${cost.toFixed(4)}`;
}
1632
/**
 * Formats an accuracy stat as "<pct>% (<correct>/<total>)".
 *
 * @param {{accuracy: number, correct: number, total: number}} [s]
 * @returns {string} Formatted stat, or an em dash when `s` is missing or has
 *   a total of 0.
 */
function fmtStat(s) {
  const hasData = Boolean(s) && s.total !== 0;
  if (!hasData) {
    return "\u2014";
  }
  // Percentage with one decimal, same formatting as pct2.
  const percent = (s.accuracy * 100).toFixed(1) + "%";
  return `${percent} (${s.correct}/${s.total})`;
}
1636
/**
 * Renders LongMemEval run reports as a Markdown document.
 *
 * The document holds a title and dataset attribution, a results table with
 * one row per report (per-type accuracy columns come first in a fixed order,
 * followed by any other observed types alphabetically), a "Run Configuration"
 * section per report, and a fixed "Methodology Notes" section. With no
 * reports, only the header and a "_No results yet._" line are produced.
 *
 * @param {Array<object>} reports - Reports as returned by runLongMemEvalBench.
 * @param {{inputPer1M: number, outputPer1M: number}} [prices] - Optional
 *   prices used for the estimated-cost column.
 * @returns {string} Markdown text with lines joined by "\n".
 */
function renderLongMemEvalReportMarkdown(reports, prices) {
  const out = [
    "# LongMemEval Benchmark Results",
    "",
    "Dataset: [LongMemEval](https://github.com/xiaowu0162/LongMemEval) (Wu et al.) \xB7 MIT License",
    "Raw data is not redistributed. Only aggregate results are published here.",
    ""
  ];
  if (reports.length === 0) {
    out.push("_No results yet._");
    return out.join("\n");
  }
  const tableRow = (cells) => "| " + cells.join(" | ") + " |";

  // Column order: known types in canonical order, then any others sorted.
  const canonicalOrder = [
    "single-session-user",
    "single-session-assistant",
    "single-session-preference",
    "multi-session",
    "temporal-reasoning",
    "knowledge-update"
  ];
  const observedTypes = new Set(reports.flatMap((report) => Object.keys(report.byType)));
  const knownTypes = canonicalOrder.filter((t) => observedTypes.has(t));
  const otherTypes = [...observedTypes].filter((t) => !knownTypes.includes(t)).sort();
  const sortedTypes = [...knownTypes, ...otherTypes];

  const headers = [
    "System / Mode",
    ...sortedTypes.map((t) => QUESTION_TYPE_LABELS[t] ?? t),
    "Overall accuracy",
    "Abstention accuracy",
    "Tokens/query",
    "Est. cost/run",
    "Answer model",
    "Judge model",
    "topK",
    "n-Q",
    "Seed",
    "Dataset provenance"
  ];
  out.push("## Results", "", tableRow(headers), tableRow(headers.map(() => "---")));

  for (const report of reports) {
    const cfg = report.config;
    const questionCount = report.questions.length;
    let tokensPerQuery = 0;
    if (questionCount > 0) {
      tokensPerQuery = Math.round(
        (report.tokens.totalInputTokens + report.tokens.totalOutputTokens) / questionCount
      );
    }
    const provenance = `${cfg.datasetSource.url.replace("https://", "")} @ ${cfg.datasetSource.snapshotSha.slice(0, 8)}`;
    out.push(tableRow([
      `${cfg.answerModelId} / ${cfg.mode}`,
      ...sortedTypes.map((t) => fmtStat(report.byType[t])),
      fmtStat(report.overall),
      fmtStat(report.abstentionAccuracy),
      fmtNum2(tokensPerQuery),
      estimateCost2(report.tokens, prices),
      cfg.answerModelId,
      cfg.judgeModelId,
      cfg.mode === "memory" ? String(cfg.topK) : "\u2014",
      fmtNum2(report.config.questionsRun),
      String(cfg.seed),
      provenance
    ]));
  }

  out.push("", "## Run Configuration", "");
  for (const report of reports) {
    const cfg = report.config;
    out.push(
      `### ${cfg.answerModelId} / ${cfg.mode}`,
      "",
      `- **Mode**: ${cfg.mode}`,
      `- **Answer model**: ${cfg.answerModelId}`,
      `- **Judge model**: ${cfg.judgeModelId}`
    );
    if (cfg.mode === "memory") {
      out.push(`- **topK**: ${cfg.topK}`);
    }
    out.push(
      `- **Dataset source**: ${cfg.datasetSource.url}`,
      `- **Dataset snapshot SHA**: \`${cfg.datasetSource.snapshotSha}\``,
      `- **Dataset file**: ${cfg.datasetSource.file}`,
      `- **Dataset license**: ${cfg.datasetSource.license}`,
      `- **Seed**: ${cfg.seed}`,
      `- **Types**: ${cfg.types.join(", ") || "all"}`,
      `- **Questions run**: ${cfg.questionsRun}`,
      `- **Wall-clock**: ${(report.wallClockMs / 1e3).toFixed(1)}s`,
      `- **Errors**: ${report.errorCount}`,
      `- **Tokens** (in/out): ${fmtNum2(report.tokens.totalInputTokens)} / ${fmtNum2(report.tokens.totalOutputTokens)}`,
      ""
    );
  }

  out.push(
    "## Methodology Notes",
    "",
    "These results were produced using the Eidentic LongMemEval fair-run harness. The following rules apply:",
    "",
    "1. **Per-question memory scope.** Each question has its own haystack (~50 sessions on average). A fresh Memory instance is created per question; no cross-question contamination.",
    "2. **Dual-granularity ingest.** Each turn is ingested with its session date in the text (temporally anchored). An additional session-level chunk entry captures multi-turn context.",
    "3. **Current date in prompt.** The `question_date` is passed to the answer prompt so temporal questions can reason about recency.",
    "4. **topK \u2264 10 in memory mode.** Larger topK values trivialise retrieval quality and are not permitted.",
    "5. **Full-context baseline is required** alongside any memory-mode result.",
    "6. **Judge is strict**: a model answer is correct only when it contains the gold answer's specific information. Vague/topical-only answers are wrong. Equivalent date expressions for the same date/duration are correct (temporal-reasoning type).",
    "7. **Abstention questions** (not present in longmemeval_s.json standard split): correct = model declined / said no-info / identified a flawed premise; fabricating a specific answer = wrong. Abstention accuracy is reported separately and not folded into overall accuracy.",
    "8. **Dataset license**: MIT \u2014 raw data is not redistributed; only aggregate results are published.",
    "",
    "> Per-type question counts in longmemeval_s.json (500 total):",
    "> single-session-user 70, single-session-assistant 56, single-session-preference 30,",
    "> multi-session 133, temporal-reasoning 133, knowledge-update 78.",
    "> No abstention variants in the standard _s split.",
    ""
  );
  return out.join("\n");
}
1772
+
1097
1773
  // src/write-quality.ts
1098
1774
  var CONTRADICTION_FIXTURES = [
1099
1775
  {
@@ -1471,7 +2147,7 @@ async function runTemporalBench(memory, dataset, opts = {}) {
1471
2147
  }
1472
2148
 
1473
2149
  // src/datasets/temporal.ts
1474
- function makeRng2(seed) {
2150
+ function makeRng3(seed) {
1475
2151
  let s = seed >>> 0;
1476
2152
  if (s === 0) s = 1;
1477
2153
  return () => {
@@ -1554,7 +2230,7 @@ function syntheticTemporalDataset(opts = {}) {
1554
2230
  const entityCount = opts.entityCount ?? 4;
1555
2231
  const seed = opts.seed ?? 42;
1556
2232
  const changesPerProperty = opts.changesPerProperty ?? 3;
1557
- const rng = makeRng2(seed);
2233
+ const rng = makeRng3(seed);
1558
2234
  const entities = [];
1559
2235
  const asserts = [];
1560
2236
  const questions = [];
@@ -1642,16 +2318,21 @@ function syntheticTemporalDataset(opts = {}) {
1642
2318
  CONTRADICTION_FIXTURES,
1643
2319
  JUNK_STREAM_FIXTURES,
1644
2320
  LOCOMO_SOURCE_SHA,
2321
+ LONGMEMEVAL_SOURCE,
1645
2322
  factRecall,
1646
2323
  loadLoCoMo,
1647
2324
  loadLoCoMoLegacy,
1648
2325
  loadLongMemEval,
2326
+ loadLongMemEvalLegacy,
1649
2327
  normalizeText,
1650
2328
  normalizedIncludes,
2329
+ parseLmeDateTimeString,
1651
2330
  recallAtK,
1652
2331
  renderLocomoReportMarkdown,
2332
+ renderLongMemEvalReportMarkdown,
1653
2333
  resolveEvidence,
1654
2334
  runLocomoBench,
2335
+ runLongMemEvalBench,
1655
2336
  runMemoryBench,
1656
2337
  runTemporalBench,
1657
2338
  runWriteQualityBench,