@eidentic/bench 0.1.0 → 0.1.2
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/README.md +39 -2
- package/dist/chunk-KOMVTEKE.js +98 -0
- package/dist/chunk-PVIWNXCY.js +99 -0
- package/dist/index.cjs +1339 -4
- package/dist/index.d.cts +535 -3
- package/dist/index.d.ts +535 -3
- package/dist/index.js +1117 -4
- package/dist/lme-loader-WSJ72GEP.js +10 -0
- package/dist/locomo-loader-YA3IEOND.js +8 -0
- package/package.json +4 -4
package/dist/index.js
CHANGED
|
@@ -1,3 +1,13 @@
|
|
|
1
|
+
import {
|
|
2
|
+
LOCOMO_SOURCE_SHA,
|
|
3
|
+
loadLoCoMo
|
|
4
|
+
} from "./chunk-PVIWNXCY.js";
|
|
5
|
+
import {
|
|
6
|
+
LONGMEMEVAL_SOURCE,
|
|
7
|
+
loadLongMemEval,
|
|
8
|
+
parseLmeDateTimeString
|
|
9
|
+
} from "./chunk-KOMVTEKE.js";
|
|
10
|
+
|
|
1
11
|
// src/recall.ts
|
|
2
12
|
function normalizeText(text) {
|
|
3
13
|
return text.toLowerCase().replace(/[''`]/g, "").replace(/[^\w\s]/g, " ").replace(/\s+/g, " ").trim();
|
|
@@ -336,7 +346,7 @@ async function assertFileSize(filePath, maxBytes = DEFAULT_MAX_BYTES) {
|
|
|
336
346
|
);
|
|
337
347
|
}
|
|
338
348
|
}
|
|
339
|
-
async function
|
|
349
|
+
async function loadLongMemEval2(jsonPath, opts) {
|
|
340
350
|
await assertFileSize(jsonPath, opts?.maxBytes);
|
|
341
351
|
const raw = JSON.parse(await readFile(jsonPath, "utf-8"));
|
|
342
352
|
const cases = Array.isArray(raw) ? raw : [];
|
|
@@ -362,7 +372,7 @@ function mapLoCoMoType(t) {
|
|
|
362
372
|
if (t.includes("multi")) return "multi-session";
|
|
363
373
|
return "single-session";
|
|
364
374
|
}
|
|
365
|
-
async function
|
|
375
|
+
async function loadLoCoMo2(jsonPath, opts) {
|
|
366
376
|
await assertFileSize(jsonPath, opts?.maxBytes);
|
|
367
377
|
const raw = JSON.parse(await readFile(jsonPath, "utf-8"));
|
|
368
378
|
let cases;
|
|
@@ -397,6 +407,1099 @@ async function loadLoCoMo(jsonPath, opts) {
|
|
|
397
407
|
return { name: "LoCoMo", cases: benchCases };
|
|
398
408
|
}
|
|
399
409
|
|
|
410
|
+
// src/locomo-run.ts
|
|
411
|
+
import { readFile as readFile2, appendFile } from "node:fs/promises";
|
|
412
|
+
import { existsSync } from "node:fs";
|
|
413
|
+
/**
 * Creates a deterministic pseudo-random number generator (32-bit xorshift
 * with the 13 / 17 / 5 shift triple).
 *
 * @param {number} seed - Coerced to an unsigned 32-bit integer. A zero state
 *   is replaced by 1, because xorshift can never leave the all-zero state.
 * @returns {() => number} Function returning the next value, computed as the
 *   unsigned 32-bit state divided by 2^32.
 */
function makeRng(seed) {
  const TWO_POW_32 = 4294967296;
  let state = (seed >>> 0) || 1;
  const next = () => {
    state ^= state << 13;
    state ^= state >>> 17;
    state ^= state << 5;
    // Bitwise operators produce signed 32-bit values; fold back to unsigned.
    state >>>= 0;
    return state / TWO_POW_32;
  };
  return next;
}
|
|
424
|
+
/**
 * Returns a shuffled copy of `arr` using a Fisher-Yates pass driven by `rng`.
 * The input is not modified.
 *
 * @param {Iterable<*>} arr - Items to shuffle (copied via spread).
 * @param {() => number} rng - Source of numbers used to pick swap positions.
 * @returns {Array<*>} New array holding the same items in shuffled order.
 */
function seededShuffle(arr, rng) {
  const shuffled = [...arr];
  let i = shuffled.length - 1;
  while (i > 0) {
    // Pick a partner position in [0, i] and swap it with position i.
    const j = Math.floor(rng() * (i + 1));
    const held = shuffled[i];
    shuffled[i] = shuffled[j];
    shuffled[j] = held;
    i -= 1;
  }
  return shuffled;
}
|
|
432
|
+
// Closing instruction appended to every LoCoMo answer prompt in this module.
var ANSWER_INSTRUCTION = "Answer the question concisely and specifically based only on the information provided. If the provided context contains no answer to the question, reply exactly: No information available";

/**
 * Builds the answer prompt for memory mode from retrieved snippets.
 *
 * @param {string[]} snippets - Retrieved snippet texts; numbered from 1 in the prompt.
 * @param {string} question - The question to answer.
 * @returns {string} Prompt consisting of the context, the question and the answer instruction.
 */
function buildMemoryAnswerPrompt(snippets, question) {
  let context = "(no relevant context retrieved)";
  if (snippets.length > 0) {
    const numbered = snippets.map((snippet, position) => `[${position + 1}] ${snippet}`);
    context = numbered.join("\n\n");
  }
  const sections = [
    "Context from conversation:",
    context,
    "",
    `Question: ${question}`,
    "",
    ANSWER_INSTRUCTION
  ];
  return sections.join("\n");
}
|
|
442
|
+
/**
 * Builds the answer prompt for full-context mode, embedding the whole
 * rendered conversation.
 *
 * @param {string} conversationText - Rendered conversation transcript.
 * @param {string} question - The question to answer.
 * @returns {string} Prompt consisting of the conversation, the question and
 *   the module-level ANSWER_INSTRUCTION.
 */
function buildFullContextAnswerPrompt(conversationText, question) {
  const sections = [
    `Conversation:\n${conversationText}`,
    `Question: ${question}`,
    ANSWER_INSTRUCTION
  ];
  // Sections are separated by one blank line.
  return sections.join("\n\n");
}
|
|
450
|
+
/**
 * Builds the prompt sent to the judge model for one LoCoMo question.
 *
 * @param {string} question - The benchmark question.
 * @param {*} goldAnswer - Reference answer; rendered as "(none)" when null/undefined.
 *   Not used for category-5 prompts.
 * @param {string} modelAnswer - The answer under evaluation.
 * @param {string} [adversarialAnswer] - Trap answer; when truthy, a warning line is
 *   added to the category-5 prompt.
 * @param {boolean} isCategory5 - Selects the "model must decline" prompt.
 * @returns {string} The judge prompt, asking for a `{"correct": boolean}` JSON reply.
 */
function buildJudgePrompt(question, goldAnswer, modelAnswer, adversarialAnswer, isCategory5) {
  if (!isCategory5) {
    return [
      "You are a strict judge evaluating whether a model answer is correct.",
      "CORRECT: the model answer contains the specific information in the gold answer (exact match or clear paraphrase).",
      "WRONG: the model answer is vague, only topically related, or contradicts the gold answer.",
      "For date/time questions: equivalent date expressions referring to the same date are correct.",
      "",
      `Question: ${question}`,
      `Gold answer: ${goldAnswer ?? "(none)"}`,
      `Model answer: ${modelAnswer}`,
      "",
      'Respond with JSON only: {"correct": true} or {"correct": false}'
    ].join("\n");
  }

  // Unanswerable question: the only correct behaviour is to decline.
  let trapNote = "";
  if (adversarialAnswer) {
    trapNote = `\nWARNING \u2014 trap answer to watch for: "${adversarialAnswer}". If the model answer closely matches this trap, mark incorrect.`;
  }
  return [
    `You are a strict judge. For this unanswerable question, a CORRECT response means the model refused to answer or said something equivalent to "No information available" or "I don't know".${trapNote}`,
    "",
    `Question: ${question}`,
    `Model answer: ${modelAnswer}`,
    "",
    'Respond with JSON only: {"correct": true} if the model declined (correct), {"correct": false} if it made a claim.'
  ].join("\n");
}
|
|
472
|
+
/**
 * Renders every session of a sample as plain text for the full-context prompt.
 *
 * Each session becomes a `Session <index>` header (suffixed with the session's
 * dateTime when it is truthy), one `[speaker]: text` line per turn, and a
 * trailing blank line. The final result is trimmed.
 *
 * @param {{sessions: Array<{index: *, dateTime?: string, turns: Array<{speaker: string, text: string}>}>}} sample
 * @returns {string} The rendered transcript.
 */
function renderConversation(sample) {
  const blocks = [];
  for (const session of sample.sessions) {
    const header = session.dateTime
      ? `Session ${session.index} \u2014 ${session.dateTime}`
      : `Session ${session.index}`;
    const block = [header];
    for (const turn of session.turns) {
      block.push(`[${turn.speaker}]: ${turn.text}`);
    }
    // The empty last element yields the blank separator line after each session.
    block.push("");
    blocks.push(block.join("\n"));
  }
  return blocks.join("\n").trim();
}
|
|
484
|
+
/**
 * Converts a LoCoMo sample into memory events and ingests them in one call.
 *
 * For every session this emits one event per turn (text prefixed with the
 * session header) followed by one "chunk" event containing the whole session
 * transcript. `ingestedAt` carries `dateTimeMs` when it is truthy, otherwise
 * it is left undefined.
 *
 * @param {{sampleId: string, sessions: Array<object>}} sample - Sample to ingest.
 * @param {{ingest: (events: object[]) => Promise<*>}} memory - Target memory.
 * @param {object} scope - Scope object attached verbatim to every event.
 * @returns {Promise<void>}
 */
async function ingestSampleIntoMemory(sample, memory, scope) {
  const { sampleId } = sample;
  const events = [];
  for (const session of sample.sessions) {
    const header = session.dateTime
      ? `Session ${session.index} \u2014 ${session.dateTime}`
      : `Session ${session.index}`;
    const ingestedAt = session.dateTimeMs || void 0;
    const transcript = [header];
    for (const turn of session.turns) {
      const utterance = `[${turn.speaker}]: ${turn.text}`;
      transcript.push(utterance);
      events.push({
        id: `${sampleId}:turn:${turn.diaId}`,
        scope,
        text: `[${header}] ${utterance}`,
        metadata: {
          diaId: turn.diaId,
          sessionIndex: session.index,
          ingestedAt
        }
      });
    }
    // Session-level chunk is emitted after that session's turn events.
    events.push({
      id: `${sampleId}:sess:${session.index}:chunk`,
      scope,
      text: transcript.join("\n"),
      metadata: { sessionIndex: session.index, ingestedAt }
    });
  }
  await memory.ingest(events);
}
|
|
514
|
+
/**
 * Asks the judge model for a verdict and extracts a boolean from its reply.
 *
 * Verdict extraction, in order of preference:
 *   1. `response.object.correct` when it is a boolean (structured output);
 *   2. a `"correct": true|false` pair found in the concatenated text blocks;
 *   3. the bare word `true` appearing as a whole word in the text.
 * Anything else counts as incorrect.
 *
 * @param {{complete: (req: object) => Promise<object>}} judgeModel - Model client.
 * @param {string} prompt - Judge prompt, sent as a single user message.
 * @returns {Promise<{correct: boolean, inputTokens: number, outputTokens: number}>}
 *   Verdict plus token usage (0 when the response reports none).
 */
async function callJudge(judgeModel, prompt) {
  const response = await judgeModel.complete({
    messages: [{ role: "user", content: prompt }],
    tools: [],
    outputSchema: {
      type: "object",
      properties: { correct: { type: "boolean" } },
      required: ["correct"],
      // OpenAI strict structured-output mode requires this to be explicit.
      additionalProperties: false
    }
  });
  let correct = false;
  if (response.object && typeof response.object.correct === "boolean") {
    correct = response.object.correct;
  } else {
    const text = response.content
      .filter((b) => b.type === "text")
      .map((b) => b.text ?? "")
      .join("")
      .toLowerCase()
      .trim();
    if (/"correct"\s*:\s*true/.test(text)) {
      correct = true;
    } else if (/"correct"\s*:\s*false/.test(text)) {
      correct = false;
    } else {
      // Whole-word match only: a plain substring test would grade replies
      // such as "untrue" as correct.
      correct = /\btrue\b/.test(text);
    }
  }
  return {
    correct,
    inputTokens: response.usage?.inputTokens ?? 0,
    outputTokens: response.usage?.outputTokens ?? 0
  };
}
|
|
541
|
+
/**
 * Reads a JSONL checkpoint file and returns the keys of the rows it contains.
 *
 * A row contributes the key `<sampleId>:<questionIndex>` when `sampleId` is
 * truthy and `questionIndex` is defined. Blank lines and lines that fail to
 * parse (or parse to a value without those properties) are ignored.
 *
 * @param {string} path - Checkpoint file path; a missing file yields an empty set.
 * @returns {Promise<Set<string>>} Keys found in the file.
 */
async function loadCheckpoint(path) {
  const completed = /* @__PURE__ */ new Set();
  if (existsSync(path)) {
    const contents = await readFile2(path, "utf-8");
    const candidates = contents
      .split("\n")
      .map((line) => line.trim())
      .filter((line) => line.length > 0);
    for (const candidate of candidates) {
      try {
        const { sampleId, questionIndex } = JSON.parse(candidate);
        if (sampleId && questionIndex !== void 0) {
          completed.add(`${sampleId}:${questionIndex}`);
        }
      } catch {
        // Malformed line: deliberately skipped so the remaining rows still load.
      }
    }
  }
  return completed;
}
|
|
558
|
+
/**
 * Appends one result row to the checkpoint file as a single JSON line.
 *
 * @param {string} path - Checkpoint file path (created by appendFile if absent).
 * @param {object} row - Row to serialize with JSON.stringify.
 * @returns {Promise<void>}
 */
async function appendCheckpointRow(path, row) {
  const line = `${JSON.stringify(row)}\n`;
  await appendFile(path, line, "utf-8");
}
|
|
561
|
+
// Phrases treated as the model declining to answer (case-insensitive, whole words).
var DECLINE_PATTERNS = /\bno information available\b|\bi (don'?t|do not) know\b|\bcannot (find|answer|determine)\b|\bunavailable\b/i;

/**
 * Reports whether an answer contains one of the DECLINE_PATTERNS phrases.
 *
 * @param {string} answer - Model answer text.
 * @returns {boolean} True when a decline phrase is found.
 */
function appearsToDecline(answer) {
  const found = DECLINE_PATTERNS.exec(answer);
  return found !== null;
}
|
|
565
|
+
/**
 * Runs the LoCoMo question-answering benchmark and returns an aggregate report.
 *
 * Pipeline: load the dataset, optionally subsample samples/questions with a
 * seeded shuffle, prepare per-sample context (rendered transcript for
 * full-context mode, ingested memory for memory mode), answer every queued
 * question with `answerModel`, grade it with `judgeModel`, and aggregate
 * accuracy per category.
 *
 * @param {object} opts
 * @param {object} opts.answerModel - Model exposing `complete()`; answers questions.
 * @param {object} opts.judgeModel - Model exposing `complete()`; grades answers.
 * @param {string} opts.mode - "memory" uses retrieval; any other value sends the
 *   rendered conversation (which is only populated for "full-context").
 * @param {number[]} [opts.categories=[1,2,3,4,5]] - Question categories to run.
 * @param {number} [opts.sampleLimit] - Keep at most this many samples (seeded shuffle).
 * @param {number} [opts.questionLimit] - Keep at most this many questions per sample.
 * @param {number} [opts.seed=42] - Seed for both subsampling shuffles.
 * @param {number} [opts.topK=10] - Retrieval depth in memory mode; capped at 10.
 * @param {number} [opts.concurrency=1] - Batch size of questions processed together.
 * @param {(done: number, total: number) => void} [opts.onProgress] - Called after each
 *   processed question; `total` counts only questions not already checkpointed.
 * @param {string} [opts.checkpointPath] - JSONL file used to skip and restore
 *   already-processed questions and to persist new rows.
 * @param {object} [opts.dataset] - Pre-loaded dataset; otherwise loaded from `opts.dataPath`.
 * @param {(sampleId: string) => object} [opts.memoryFactory] - Required in memory mode.
 * @returns {Promise<object>} Report with config, per-category stats, token totals,
 *   per-question rows, wall-clock time and error count.
 * @throws {Error} When mode is "memory" and no memoryFactory is supplied.
 */
async function runLocomoBench(opts) {
  const {
    answerModel,
    judgeModel,
    mode,
    categories = [1, 2, 3, 4, 5],
    sampleLimit,
    questionLimit,
    seed = 42,
    topK: rawTopK = 10,
    concurrency = 1,
    onProgress,
    checkpointPath
  } = opts;
  const topK = Math.min(rawTopK, 10);
  const keyOf = (sampleId, questionIndex) => `${sampleId}:${questionIndex}`;
  const scopeFor = (sampleId) => ({ kind: "agent", agentId: `locomo:${sampleId}` });

  let dataset;
  if (opts.dataset) {
    dataset = opts.dataset;
  } else {
    const { loadLoCoMo: loadLoCoMo3 } = await import("./locomo-loader-YA3IEOND.js");
    dataset = await loadLoCoMo3(opts.dataPath);
  }

  let samples = [...dataset.samples];
  if (sampleLimit !== void 0 && sampleLimit < samples.length) {
    samples = seededShuffle(samples, makeRng(seed)).slice(0, sampleLimit);
  }

  const queue = [];
  for (const sample of samples) {
    // Record each question's original position before filtering/shuffling so
    // the checkpoint key stays stable without an O(n) indexOf per question.
    let qaList = sample.qa
      .map((qa, qaIndex) => ({ qa, qaIndex }))
      .filter(({ qa }) => categories.includes(qa.category));
    if (questionLimit !== void 0 && questionLimit < qaList.length) {
      qaList = seededShuffle(qaList, makeRng(seed + 1)).slice(0, questionLimit);
    }
    for (const { qa, qaIndex } of qaList) {
      queue.push({ sample, qaIndex, qa });
    }
  }

  const checkpoint = checkpointPath ? await loadCheckpoint(checkpointPath) : /* @__PURE__ */ new Set();
  const queueKeys = new Set(queue.map((item) => keyOf(item.sample.sampleId, item.qaIndex)));

  const conversationText = /* @__PURE__ */ new Map();
  if (mode === "full-context") {
    for (const sample of samples) {
      conversationText.set(sample.sampleId, renderConversation(sample));
    }
  }

  const memories = /* @__PURE__ */ new Map();
  // Ingest token usage is not measured by this harness; reported as zero.
  const ingestInputTokens = 0;
  const ingestOutputTokens = 0;
  if (mode === "memory") {
    if (!opts.memoryFactory) {
      throw new Error("runLocomoBench: memoryFactory is required when mode='memory'");
    }
    for (const sample of samples) {
      const memory = await opts.memoryFactory(sample.sampleId);
      await ingestSampleIntoMemory(sample, memory, scopeFor(sample.sampleId));
      memories.set(sample.sampleId, memory);
    }
  }

  const results = [];
  let totalAnswerInputTokens = 0;
  let totalAnswerOutputTokens = 0;
  let totalJudgeInputTokens = 0;
  let totalJudgeOutputTokens = 0;
  let errorCount = 0;
  let done = 0;
  // Count only queued questions that still need work; the checkpoint may hold
  // keys that are not part of this run's queue.
  const total = queue.reduce(
    (count, item) => (checkpoint.has(keyOf(item.sample.sampleId, item.qaIndex)) ? count : count + 1),
    0
  );
  const startTime = Date.now();

  const processItem = async (item) => {
    const { sampleId } = item.sample;
    const { question } = item.qa;
    if (checkpoint.has(keyOf(sampleId, item.qaIndex))) return;

    let modelAnswer = "";
    let answerIn = 0;
    let answerOut = 0;
    let judgeIn = 0;
    let judgeOut = 0;
    let correct = false;
    let trapTriggered;
    let errorMsg;
    try {
      let prompt;
      if (mode === "memory") {
        const memory = memories.get(sampleId);
        const retrieved = await memory.retrieve({ text: question, scope: scopeFor(sampleId), topK });
        prompt = buildMemoryAnswerPrompt(retrieved.snippets.map((s) => s.text), question);
      } else {
        prompt = buildFullContextAnswerPrompt(conversationText.get(sampleId) ?? "", question);
      }
      const resp = await answerModel.complete({
        messages: [{ role: "user", content: prompt }],
        tools: []
      });
      modelAnswer = resp.content
        .filter((b) => b.type === "text")
        .map((b) => b.text)
        .join("")
        .trim();
      answerIn = resp.usage?.inputTokens ?? 0;
      answerOut = resp.usage?.outputTokens ?? 0;

      const isCategory5 = item.qa.category === 5;
      const judgePrompt = buildJudgePrompt(
        question,
        item.qa.answer,
        modelAnswer,
        item.qa.adversarialAnswer,
        isCategory5
      );
      const judgeResult = await callJudge(judgeModel, judgePrompt);
      correct = judgeResult.correct;
      judgeIn = judgeResult.inputTokens;
      judgeOut = judgeResult.outputTokens;

      if (isCategory5 && item.qa.adversarialAnswer) {
        const adversarialLower = String(item.qa.adversarialAnswer).toLowerCase();
        const answerLower = modelAnswer.toLowerCase();
        const answerPrefix = answerLower.slice(0, 30);
        // An empty prefix is contained in every string, so it must not count
        // as a match against the trap answer.
        trapTriggered =
          answerLower.includes(adversarialLower) ||
          (answerPrefix.length > 0 && adversarialLower.includes(answerPrefix));
        if (trapTriggered && correct) correct = false;
        // An explicit refusal overrides both the judge and the trap heuristic.
        if (!correct && appearsToDecline(modelAnswer)) {
          correct = true;
          trapTriggered = false;
        }
      }
    } catch (err) {
      // Keep a message even when something other than an Error was thrown.
      errorMsg = err instanceof Error ? err.message : String(err);
      errorCount++;
      correct = false;
    }

    const row = {
      sampleId,
      questionIndex: item.qaIndex,
      question,
      goldAnswer: item.qa.answer,
      category: item.qa.category,
      modelAnswer,
      correct,
      ...(trapTriggered !== void 0 ? { trapTriggered } : {}),
      ...(errorMsg !== void 0 ? { error: errorMsg } : {}),
      answerInputTokens: answerIn,
      answerOutputTokens: answerOut,
      judgeInputTokens: judgeIn,
      judgeOutputTokens: judgeOut
    };
    results.push(row);
    totalAnswerInputTokens += answerIn;
    totalAnswerOutputTokens += answerOut;
    totalJudgeInputTokens += judgeIn;
    totalJudgeOutputTokens += judgeOut;
    if (checkpointPath) {
      await appendCheckpointRow(checkpointPath, row);
    }
    done++;
    if (onProgress) onProgress(done, total);
  };

  // Questions are processed in batches of `concurrencyLimit`; each batch is
  // awaited in full before the next one starts.
  const concurrencyLimit = Math.max(1, concurrency);
  const pending = [];
  for (const item of queue) {
    pending.push(processItem(item));
    if (pending.length >= concurrencyLimit) {
      await Promise.all(pending.splice(0, concurrencyLimit));
    }
  }
  if (pending.length > 0) await Promise.all(pending);

  // Restore rows completed by earlier runs. Only rows belonging to this run's
  // queue are restored, and each key at most once, so a stale or duplicated
  // checkpoint cannot inflate the statistics.
  if (checkpointPath && checkpoint.size > 0) {
    const raw = await readFile2(checkpointPath, "utf-8").catch(() => "");
    const restored = /* @__PURE__ */ new Set();
    for (const line of raw.split("\n")) {
      const trimmed = line.trim();
      if (!trimmed) continue;
      try {
        const row = JSON.parse(trimmed);
        const key = keyOf(row.sampleId, row.questionIndex);
        if (!checkpoint.has(key) || !queueKeys.has(key) || restored.has(key)) continue;
        restored.add(key);
        results.push(row);
        totalAnswerInputTokens += row.answerInputTokens ?? 0;
        totalAnswerOutputTokens += row.answerOutputTokens ?? 0;
        totalJudgeInputTokens += row.judgeInputTokens ?? 0;
        totalJudgeOutputTokens += row.judgeOutputTokens ?? 0;
        // Restored rows are part of the report, so their errors count too.
        if (row.error !== void 0) errorCount++;
      } catch {
        // Malformed checkpoint line: skip it, matching loadCheckpoint.
      }
    }
  }

  const byCategory = {};
  for (const row of results) {
    const cat = String(row.category);
    if (!byCategory[cat]) byCategory[cat] = { correct: 0, total: 0, accuracy: 0 };
    const stats = byCategory[cat];
    stats.total++;
    if (row.correct) stats.correct++;
    stats.accuracy = stats.correct / stats.total;
  }

  // J(1-4): pooled accuracy over categories 1 through 4.
  let j14Correct = 0;
  let j14Total = 0;
  for (const [cat, stats] of Object.entries(byCategory)) {
    const n = Number.parseInt(cat, 10);
    if (n >= 1 && n <= 4) {
      j14Correct += stats.correct;
      j14Total += stats.total;
    }
  }
  const cat5Stats = byCategory["5"];
  const cat5RefusalRate = cat5Stats
    ? { correct: cat5Stats.correct, total: cat5Stats.total, rate: cat5Stats.accuracy }
    : void 0;
  const wallClockMs = Date.now() - startTime;

  return {
    config: {
      mode,
      topK,
      answerModelId: answerModel.modelId ?? "(unknown)",
      judgeModelId: judgeModel.modelId ?? "(unknown)",
      datasetSha: LOCOMO_SOURCE_SHA,
      seed,
      categories: [...categories].sort((a, b) => a - b),
      samplesRun: samples.length,
      questionsRun: results.length
    },
    overallJ14: {
      correct: j14Correct,
      total: j14Total,
      accuracy: j14Total > 0 ? j14Correct / j14Total : 0
    },
    byCategory,
    cat5RefusalRate,
    tokens: {
      ingestInputTokens,
      ingestOutputTokens,
      answerInputTokens: totalAnswerInputTokens,
      answerOutputTokens: totalAnswerOutputTokens,
      judgeInputTokens: totalJudgeInputTokens,
      judgeOutputTokens: totalJudgeOutputTokens,
      totalInputTokens: ingestInputTokens + totalAnswerInputTokens + totalJudgeInputTokens,
      totalOutputTokens: ingestOutputTokens + totalAnswerOutputTokens + totalJudgeOutputTokens
    },
    wallClockMs,
    questions: results,
    errorCount
  };
}
|
|
812
|
+
|
|
813
|
+
// src/locomo-render.ts
|
|
814
|
+
/**
 * Formats a fraction as a percentage with one decimal place.
 *
 * @param {number} n - Fraction (e.g. 0.5 for fifty percent).
 * @returns {string} Percentage text such as "50.0%".
 */
function pct(n) {
  const scaled = n * 100;
  return `${scaled.toFixed(1)}%`;
}
|
|
817
|
+
/**
 * Formats a number using en-US locale conventions.
 *
 * @param {number} n - Value to format.
 * @returns {string} Result of `n.toLocaleString("en-US")`.
 */
function fmtNum(n) {
  const formatted = n.toLocaleString("en-US");
  return formatted;
}
|
|
820
|
+
/**
 * Estimates the dollar cost of a run from total token counts.
 *
 * @param {{totalInputTokens: number, totalOutputTokens: number}} tokens - Token totals.
 * @param {{inputPer1M: number, outputPer1M: number}} [prices] - Price per million tokens.
 * @returns {string} "$" followed by the cost with four decimals, or an em dash
 *   when no prices are supplied.
 */
function estimateCost(tokens, prices) {
  if (prices) {
    const inputCost = tokens.totalInputTokens / 1e6 * prices.inputPer1M;
    const outputCost = tokens.totalOutputTokens / 1e6 * prices.outputPer1M;
    return `$${(inputCost + outputCost).toFixed(4)}`;
  }
  return "\u2014";
}
|
|
825
|
+
/**
 * Renders one or more LoCoMo benchmark reports as a Markdown document.
 *
 * The document contains a dataset notice, a results table (one row per
 * report), a per-report run configuration list, and fixed methodology notes.
 * With no reports, only the notice and a "_No results yet._" line are emitted.
 *
 * @param {object[]} reports - Reports in the shape returned by runLocomoBench.
 * @param {{inputPer1M: number, outputPer1M: number}} [prices] - Passed to
 *   estimateCost for the cost column.
 * @returns {string} Markdown text.
 */
function renderLocomoReportMarkdown(reports, prices) {
  const toTableLine = (cells) => "| " + cells.join(" | ") + " |";

  const intro = [
    "# LoCoMo Benchmark Results",
    "",
    "Dataset: [LoCoMo](https://github.com/snap-research/locomo) (Snap Research) \xB7 CC BY-NC 4.0",
    "Raw data is not redistributed. Only aggregate results are published here.",
    ""
  ];
  if (reports.length === 0) {
    return [...intro, "_No results yet._"].join("\n");
  }

  const headers = [
    "System / Mode",
    "Cat1 (multi-hop)",
    "Cat2 (temporal)",
    "Cat3 (open-domain)",
    "Cat4 (single-hop)",
    "J(1\u20134) overall",
    "Cat5 refusal rate",
    "Tokens/query",
    "Est. cost/run",
    "Answer model",
    "Judge model",
    "topK",
    "n-Q",
    "Seed",
    "Dataset SHA"
  ];

  // --- Results table -------------------------------------------------------
  const table = [
    "## Results",
    "",
    toTableLine(headers),
    toTableLine(headers.map(() => "---"))
  ];
  for (const report of reports) {
    const cfg = report.config;
    const categoryCell = (n) => {
      const stats = report.byCategory[String(n)];
      return stats ? `${pct(stats.accuracy)} (${stats.correct}/${stats.total})` : "\u2014";
    };
    let refusalCell = "\u2014";
    if (report.cat5RefusalRate) {
      const refusal = report.cat5RefusalRate;
      refusalCell = `${pct(refusal.rate)} (${refusal.correct}/${refusal.total})`;
    }
    const questionCount = report.questions.length;
    let tokensPerQuery = 0;
    if (questionCount > 0) {
      tokensPerQuery = Math.round(
        (report.tokens.totalInputTokens + report.tokens.totalOutputTokens) / questionCount
      );
    }
    table.push(toTableLine([
      `${cfg.answerModelId} / ${cfg.mode}`,
      categoryCell(1),
      categoryCell(2),
      categoryCell(3),
      categoryCell(4),
      `${pct(report.overallJ14.accuracy)} (${report.overallJ14.correct}/${report.overallJ14.total})`,
      refusalCell,
      fmtNum(tokensPerQuery),
      estimateCost(report.tokens, prices),
      cfg.answerModelId,
      cfg.judgeModelId,
      String(cfg.topK),
      fmtNum(report.config.questionsRun),
      String(cfg.seed),
      cfg.datasetSha.slice(0, 8)
    ]));
  }
  table.push("");

  // --- Per-report configuration ---------------------------------------------
  const configuration = ["## Run Configuration", ""];
  for (const report of reports) {
    const cfg = report.config;
    configuration.push(
      `### ${cfg.answerModelId} / ${cfg.mode}`,
      "",
      `- **Mode**: ${cfg.mode}`,
      `- **Answer model**: ${cfg.answerModelId}`,
      `- **Judge model**: ${cfg.judgeModelId}`,
      `- **topK**: ${cfg.topK}`,
      `- **Dataset SHA**: \`${cfg.datasetSha}\``,
      `- **Seed**: ${cfg.seed}`,
      `- **Categories**: ${cfg.categories.join(", ")}`,
      `- **Samples run**: ${cfg.samplesRun}`,
      `- **Questions run**: ${cfg.questionsRun}`,
      `- **Wall-clock**: ${(report.wallClockMs / 1e3).toFixed(1)}s`,
      `- **Errors**: ${report.errorCount}`,
      `- **Tokens** (in/out): ${fmtNum(report.tokens.totalInputTokens)} / ${fmtNum(report.tokens.totalOutputTokens)}`,
      ""
    );
  }

  // --- Fixed methodology notes ----------------------------------------------
  const methodology = [
    "## Methodology Notes",
    "",
    "These results were produced using the Eidentic LoCoMo fair-run harness. The following rules apply:",
    "",
    "1. **Both speakers are treated as humans.** Turns are ingested as `[SpeakerName]: text` \u2014 never mapped to user/assistant roles.",
    "2. **Timestamps are structural.** Each session is prefixed with a header line `Session N \u2014 <date>` and `ingestedAt` metadata carries the epoch-ms.",
    "3. **topK \u2264 10 in memory mode.** Larger topK values trivialise retrieval quality and are not permitted.",
    "4. **Full-context baseline is required** alongside any memory-mode result.",
    "5. **Judge is strict**: a model answer is correct only when it contains the gold answer's specific information. Vague/topical-only answers are wrong.",
    "6. **Category 5 (adversarial)**: correct = model declined; adversarial-trap match = wrong.",
    "7. **Primary metric J(1\u20134)**: denominator is the number of cat 1\u20134 questions actually run (max 1540 on full dataset).",
    "8. **Dataset license**: CC BY-NC 4.0 \u2014 raw data is not redistributed; only aggregate results are published.",
    "",
    "> Category mapping in locomo10.json: 1=multi-hop (282), 2=temporal (321), 3=open-domain (96), 4=single-hop (841), 5=adversarial (446).",
    ""
  ];

  return [...intro, ...table, ...configuration, ...methodology].join("\n");
}
|
|
924
|
+
|
|
925
|
+
// src/locomo-types.ts
|
|
926
|
+
/**
 * Looks up the text of the turns identified by `diaIds` within a sample.
 *
 * @param {{sessions: Array<{turns: Array<{diaId: string, text: string}>}>}} sample
 * @param {Iterable<string>} diaIds - Turn identifiers to resolve, in output order.
 * @returns {string[]} Texts of the turns that were found; unknown ids are skipped.
 */
function resolveEvidence(sample, diaIds) {
  // Index every turn by its diaId; a later turn with the same id replaces an earlier one.
  const turnsById = /* @__PURE__ */ new Map();
  for (const { turns } of sample.sessions) {
    for (const turn of turns) {
      turnsById.set(turn.diaId, turn);
    }
  }
  const texts = [];
  for (const diaId of diaIds) {
    const match = turnsById.get(diaId);
    if (!match) continue;
    texts.push(match.text);
  }
  return texts;
}
|
|
940
|
+
|
|
941
|
+
// src/lme-run.ts
|
|
942
|
+
import { readFile as readFile3, appendFile as appendFile2 } from "node:fs/promises";
|
|
943
|
+
import { existsSync as existsSync2 } from "node:fs";
|
|
944
|
+
/**
 * Creates a deterministic pseudo-random number generator (32-bit xorshift,
 * shifts 13 / 17 / 5) for the LongMemEval runner.
 *
 * @param {number} seed - Coerced to an unsigned 32-bit integer; zero becomes 1
 *   so the generator does not get stuck in the all-zero state.
 * @returns {() => number} Function returning the unsigned 32-bit state divided by 2^32.
 */
function makeRng2(seed) {
  let word = seed >>> 0;
  if (word === 0) {
    word = 1;
  }
  return function nextUnitFloat() {
    word = word ^ (word << 13);
    word = word ^ (word >>> 17);
    // Final xor step, folded back to an unsigned 32-bit value.
    word = (word ^ (word << 5)) >>> 0;
    return word / 2 ** 32;
  };
}
|
|
955
|
+
/**
 * Returns a shuffled copy of `arr` (Fisher-Yates, walking from the last
 * position down to position 1) using `rng` to choose swap partners.
 *
 * @param {Iterable<*>} arr - Items to shuffle; left untouched.
 * @param {() => number} rng - Source of numbers used to pick swap positions.
 * @returns {Array<*>} New shuffled array.
 */
function seededShuffle2(arr, rng) {
  const copy = [...arr];
  for (let last = copy.length - 1; last >= 1; last -= 1) {
    const pick = Math.floor(rng() * (last + 1));
    const moved = copy[pick];
    copy[pick] = copy[last];
    copy[last] = moved;
  }
  return copy;
}
|
|
963
|
+
// Default character budget for full-context prompts.
// NOTE(review): the consumer of this constant is outside the visible code — confirm usage.
var DEFAULT_FULL_CONTEXT_MAX_CHARS = 48e4;

// Phrases treated as the model abstaining (case-insensitive, whole words).
var DECLINE_PATTERNS2 = /\bno information available\b|\bi (don'?t|do not) (know|have)\b|\bcannot (find|answer|determine|provide)\b|\bnot (mentioned|stated|provided|found|available)\b|\bunable to (find|answer|determine)\b|\bno relevant (information|data)\b/i;

/**
 * Reports whether an answer contains one of the DECLINE_PATTERNS2 phrases.
 *
 * @param {string} answer - Model answer text.
 * @returns {boolean} True when an abstention phrase is found.
 */
function appearsToDecline2(answer) {
  const found = DECLINE_PATTERNS2.exec(answer);
  return found !== null;
}
|
|
968
|
+
// Closing instruction appended to every LongMemEval answer prompt in this module.
var ANSWER_INSTRUCTION2 = "Answer the question concisely and specifically based only on the information provided. If the provided context does not contain the answer, reply exactly: No information available";

/**
 * Builds the memory-mode answer prompt from retrieved snippets.
 *
 * @param {string[]} snippets - Retrieved snippet texts; numbered from 1 in the prompt.
 * @param {string} question - The question to answer.
 * @param {string} [currentDate] - When truthy, a "Current date" line follows the context.
 * @returns {string} Prompt with context, optional date, question and answer instruction.
 */
function buildMemoryAnswerPrompt2(snippets, question, currentDate) {
  let context = "(no relevant context retrieved)";
  if (snippets.length > 0) {
    const numbered = snippets.map((snippet, position) => `[${position + 1}] ${snippet}`);
    context = numbered.join("\n\n");
  }
  let dateNote = "";
  if (currentDate) {
    dateNote = `\nCurrent date: ${currentDate}`;
  }
  const sections = [
    `Context from conversation history:\n${context}${dateNote}`,
    `Question: ${question}`,
    ANSWER_INSTRUCTION2
  ];
  return sections.join("\n\n");
}
|
|
980
|
+
/**
 * Builds the full-context answer prompt embedding the rendered haystack.
 *
 * @param {string} haystackText - Rendered conversation history.
 * @param {string} question - The question to answer.
 * @param {string} [currentDate] - When truthy, a "Current date" line follows the history.
 * @returns {string} Prompt with history, optional date, question and the
 *   module-level ANSWER_INSTRUCTION2.
 */
function buildFullContextAnswerPrompt2(haystackText, question, currentDate) {
  let dateNote = "";
  if (currentDate) {
    dateNote = `\nCurrent date: ${currentDate}`;
  }
  const sections = [
    `Conversation history:\n${haystackText}${dateNote}`,
    `Question: ${question}`,
    ANSWER_INSTRUCTION2
  ];
  return sections.join("\n\n");
}
|
|
990
|
+
/**
 * Builds the prompt sent to the judge model for one LongMemEval question.
 *
 * @param {string} question - The benchmark question.
 * @param {*} goldAnswer - Reference answer (unused for abstention questions).
 * @param {string} modelAnswer - The answer under evaluation.
 * @param {string} questionType - When equal to "temporal-reasoning", a note on
 *   equivalent date expressions is added to the standard prompt.
 * @param {boolean} isAbstention - Selects the "model must abstain" prompt.
 * @returns {string} The judge prompt, asking for a `{"correct": boolean}` JSON reply.
 */
function buildJudgePrompt2(question, goldAnswer, modelAnswer, questionType, isAbstention) {
  if (isAbstention) {
    return [
      `You are a strict judge. This question requires the model to recognize that the information is not present in the conversation history, or that the premise is flawed. A CORRECT response means the model abstained, said something equivalent to "No information available", "I don't know", or identified that the premise is wrong. An INCORRECT response means the model fabricated a specific, concrete answer.`,
      "",
      `Question: ${question}`,
      `Model answer: ${modelAnswer}`,
      "",
      'Respond with JSON only: {"correct": true} if the model appropriately abstained, {"correct": false} if it fabricated a specific answer.'
    ].join("\n");
  }

  let temporalNote = "";
  if (questionType === "temporal-reasoning") {
    temporalNote = "\nFor time/date questions: equivalent date expressions referring to the same date or duration are correct.";
  }
  return [
    "You are a strict judge evaluating whether a model answer is correct.",
    `CORRECT: the model answer contains the specific information in the gold answer (exact match or clear paraphrase is fine).${temporalNote}`,
    'WRONG: the model answer is vague, only topically related, contradicts the gold answer, or says "no information" when a specific answer exists.',
    "",
    `Question: ${question}`,
    `Gold answer: ${goldAnswer}`,
    `Model answer: ${modelAnswer}`,
    "",
    'Respond with JSON only: {"correct": true} or {"correct": false}'
  ].join("\n");
}
|
|
1010
|
+
/**
 * Renders haystack sessions as plain text.
 *
 * Each session becomes a `Session <n>` header (n counts from 1 within the
 * given array) labelled with the session's dateTime, or its id when dateTime
 * is falsy, followed by one `[User]` / `[Assistant]` line per turn and a
 * blank line. Any role other than "user" is rendered as "Assistant". The
 * final result is trimmed.
 *
 * @param {Array<{id: string, dateTime?: string, turns: Array<{role: string, content: string}>}>} sessions
 * @returns {string} The rendered transcript.
 */
function renderHaystack(sessions) {
  const rendered = [];
  for (const [position, session] of sessions.entries()) {
    rendered.push(`Session ${position + 1} \u2014 ${session.dateTime || session.id}`);
    for (const turn of session.turns) {
      const who = turn.role === "user" ? "User" : "Assistant";
      rendered.push(`[${who}]: ${turn.content}`);
    }
    rendered.push("");
  }
  return rendered.join("\n").trim();
}
|
|
1024
|
+
/**
 * Renders the haystack while keeping the text within `maxChars` characters.
 *
 * If the full rendering fits it is returned as-is. Otherwise sessions are
 * dropped from the front one at a time until the rendering of the remaining
 * sessions fits. If even the last remaining session is too long, its
 * rendering is cut to `maxChars` characters.
 *
 * @param {Array<object>} sessions - Sessions accepted by renderHaystack.
 * @param {number} maxChars - Maximum length of the returned text.
 * @returns {{text: string, truncated: boolean}} Rendered text and whether
 *   anything was dropped or cut.
 */
function renderHaystackCapped(sessions, maxChars) {
  const complete = renderHaystack(sessions);
  if (complete.length <= maxChars) {
    return { text: complete, truncated: false };
  }
  // Try progressively shorter tails: skip the first 1, 2, ... sessions.
  for (let skipped = 1; skipped < sessions.length; skipped += 1) {
    const candidate = renderHaystack(sessions.slice(skipped));
    if (candidate.length <= maxChars) {
      return { text: candidate, truncated: true };
    }
  }
  // Nothing fit: hard-cut the rendering of the last session (or of the empty list).
  const remainder = renderHaystack(sessions.slice(-1));
  return { text: remainder.slice(0, maxChars), truncated: true };
}
|
|
1035
|
+
// Maximum number of characters kept per event text by capForEmbedding.
var EMBED_CHAR_CAP = 2e4;

/**
 * Cuts `text` to at most EMBED_CHAR_CAP characters.
 *
 * @param {string} text - Text destined for an event.
 * @returns {string} The original text when short enough, otherwise its leading
 *   EMBED_CHAR_CAP characters.
 */
function capForEmbedding(text) {
  if (text.length > EMBED_CHAR_CAP) {
    return text.slice(0, EMBED_CHAR_CAP);
  }
  return text;
}
|
|
1039
|
+
/**
 * Converts a LongMemEval question's haystack into memory events and ingests
 * them in one call.
 *
 * For every session this emits one event per turn (text prefixed with the
 * session label) followed by one "chunk" event holding the whole session
 * transcript. All texts pass through capForEmbedding. Event ids use the
 * zero-based session/turn positions, while the label counts sessions from 1.
 * `ingestedAt` carries `dateTimeMs` when it is truthy, otherwise undefined.
 *
 * @param {{id: string, sessions: Array<object>}} question - Question with its haystack sessions.
 * @param {{ingest: (events: object[]) => Promise<*>}} memory - Target memory.
 * @param {object} scope - Scope object attached verbatim to every event.
 * @returns {Promise<void>}
 */
async function ingestQuestionIntoMemory(question, memory, scope) {
  const events = [];
  for (const [sessionPos, session] of question.sessions.entries()) {
    const label = `Session ${sessionPos + 1} \u2014 ${session.dateTime || session.id}`;
    const ingestedAt = session.dateTimeMs || void 0;
    const transcript = [label];
    for (const [turnPos, turn] of session.turns.entries()) {
      const who = turn.role === "user" ? "User" : "Assistant";
      const utterance = `[${who}]: ${turn.content}`;
      transcript.push(utterance);
      events.push({
        id: `${question.id}:sess${sessionPos}:turn${turnPos}`,
        scope,
        text: capForEmbedding(`[${label}] ${utterance}`),
        metadata: {
          sessionId: session.id,
          sessionIndex: sessionPos,
          turnRole: turn.role,
          ingestedAt
        }
      });
    }
    // Session-level chunk is emitted after that session's turn events.
    events.push({
      id: `${question.id}:sess${sessionPos}:chunk`,
      scope,
      text: capForEmbedding(transcript.join("\n")),
      metadata: {
        sessionId: session.id,
        sessionIndex: sessionPos,
        ingestedAt
      }
    });
  }
  await memory.ingest(events);
}
|
|
1076
|
+
/**
 * Sends a judge prompt to `judgeModel` and extracts a boolean verdict.
 *
 * The verdict is resolved in this order:
 *  1. `response.object.correct` when it is a boolean (structured output);
 *  2. a `"correct": true` / `"correct": false` pair found in the text blocks;
 *  3. a bare verdict: the standalone word "true" with no standalone "false"
 *     anywhere in the reply. Anything else counts as incorrect, so that
 *     replies such as "untrue" or "true or false?" are not scored as correct.
 *
 * @param {{complete: Function}} judgeModel Model exposing `complete(request)`.
 * @param {string} prompt Full judge prompt, sent as a single user message.
 * @returns {Promise<{correct: boolean, inputTokens: number, outputTokens: number}>}
 *   Token counts default to 0 when the response carries no usage data.
 */
async function callJudge2(judgeModel, prompt) {
  const response = await judgeModel.complete({
    messages: [{ role: "user", content: prompt }],
    tools: [],
    outputSchema: {
      type: "object",
      properties: { correct: { type: "boolean" } },
      required: ["correct"],
      // OpenAI strict structured-output mode requires this to be explicit.
      additionalProperties: false
    }
  });
  let correct = false;
  if (response.object && typeof response.object.correct === "boolean") {
    correct = response.object.correct;
  } else {
    // Text is lower-cased once, so the patterns below need no /i flag.
    const text = response.content
      .filter((b) => b.type === "text")
      .map((b) => b.text ?? "")
      .join("")
      .toLowerCase()
      .trim();
    if (/"correct"\s*:\s*true/.test(text)) {
      correct = true;
    } else if (/"correct"\s*:\s*false/.test(text)) {
      correct = false;
    } else {
      // Word boundaries keep "untrue" from matching; a reply that mentions
      // both words is ambiguous and is scored as incorrect.
      correct = /\btrue\b/.test(text) && !/\bfalse\b/.test(text);
    }
  }
  return {
    correct,
    inputTokens: response.usage?.inputTokens ?? 0,
    outputTokens: response.usage?.outputTokens ?? 0
  };
}
|
|
1103
|
+
/**
 * Reads a JSON-lines checkpoint file and collects the `questionId` of every row.
 *
 * A missing file yields an empty set. Blank lines, lines that are not valid
 * JSON, and rows without a truthy `questionId` are ignored.
 *
 * @param {string} path Location of the checkpoint file.
 * @returns {Promise<Set<*>>} Ids of the questions recorded in the file.
 */
async function loadCheckpoint2(path) {
  const completed = new Set();
  if (!existsSync2(path)) {
    return completed;
  }
  const contents = await readFile3(path, "utf-8");
  const entries = contents
    .split("\n")
    .map((rawLine) => rawLine.trim())
    .filter((entry) => entry !== "");
  for (const entry of entries) {
    try {
      const parsed = JSON.parse(entry);
      if (parsed.questionId) {
        completed.add(parsed.questionId);
      }
    } catch {
      // Unparseable rows are deliberately skipped.
    }
  }
  return completed;
}
|
|
1118
|
+
/**
 * Appends one result row to the checkpoint file as a single JSON line.
 *
 * @param {string} path Location of the checkpoint file.
 * @param {*} row Value serialised with `JSON.stringify`.
 * @returns {Promise<void>}
 */
async function appendCheckpointRow2(path, row) {
  const line = `${JSON.stringify(row)}\n`;
  await appendFile2(path, line, "utf-8");
}
|
|
1121
|
+
/**
 * Runs the LongMemEval question set against an answer model and scores every
 * answer with a judge model.
 *
 * Flow:
 *  1. Resolve the dataset (`opts.dataset`, else load from `opts.dataPath`).
 *  2. Filter by `opts.types` (matched against `type` or `baseType`) and, when
 *     `questionLimit` is smaller than the remaining set, take a seeded-shuffle
 *     subset.
 *  3. For each question not already present in the checkpoint: build the answer
 *     prompt (retrieved snippets in "memory" mode, capped full transcript
 *     otherwise), call the answer model, then call the judge.
 *  4. Merge previously checkpointed rows that belong to the selected questions.
 *  5. Aggregate accuracy overall and per base type; abstention questions are
 *     tallied separately and are not part of the overall figure.
 *
 * A failure while answering or judging one question is recorded on that
 * question's row (`error`) and counted in `errorCount`; it does not abort the run.
 *
 * @param {object} opts
 * @param {object} opts.answerModel Model exposing `complete()`; `modelId` is reported.
 * @param {object} opts.judgeModel Model passed to the judge call.
 * @param {string} opts.mode "memory" for retrieval; any other value uses full context.
 * @param {Function} [opts.memoryFactory] `(questionId) => memory`; required in memory mode.
 * @param {object} [opts.dataset] Pre-loaded dataset with a `questions` array.
 * @param {string} [opts.dataPath] Path handed to the dataset loader when `dataset` is absent.
 * @param {string[]} [opts.types] Question types to keep; empty/absent keeps all.
 * @param {number} [opts.questionLimit] Maximum number of questions to run.
 * @param {number} [opts.seed=42] Seed for the subset shuffle.
 * @param {number} [opts.concurrency=1] Questions started per batch (minimum 1).
 * @param {number} [opts.topK] Snippets to retrieve; defaults to 10 and is capped at 10.
 * @param {Function} [opts.onProgress] Called as `(done, total)` after each question.
 * @param {string} [opts.checkpointPath] JSON-lines file used to resume and record rows.
 * @param {number} [opts.fullContextMaxChars] Character cap for the full-context transcript.
 * @returns {Promise<object>} Report with `config`, `overall`, `byType`, optional
 *   `abstentionAccuracy`, `tokens`, `wallClockMs`, `questions` and `errorCount`.
 * @throws {Error} When memory mode lacks `memoryFactory`, or neither `dataset`
 *   nor `dataPath` is given.
 */
async function runLongMemEvalBench(opts) {
  const {
    answerModel,
    judgeModel,
    mode,
    types,
    questionLimit,
    seed = 42,
    concurrency = 1,
    onProgress,
    checkpointPath,
    fullContextMaxChars = DEFAULT_FULL_CONTEXT_MAX_CHARS
  } = opts;
  const topK = Math.min(opts.topK ?? 10, 10);
  if (mode === "memory" && !opts.memoryFactory) {
    throw new Error("runLongMemEvalBench: memoryFactory is required when mode='memory'");
  }

  let dataset;
  if (opts.dataset) {
    dataset = opts.dataset;
  } else if (opts.dataPath) {
    const { loadLongMemEval: loader } = await import("./lme-loader-WSJ72GEP.js");
    dataset = await loader(opts.dataPath);
  } else {
    throw new Error("runLongMemEvalBench: either dataPath or dataset must be provided");
  }

  let questions = dataset.questions;
  if (types && types.length > 0) {
    questions = questions.filter((q) => types.includes(q.type) || types.includes(q.baseType));
  }
  if (questionLimit !== void 0 && questionLimit < questions.length) {
    questions = seededShuffle2(questions, makeRng2(seed)).slice(0, questionLimit);
  }

  const checkpoint = checkpointPath ? await loadCheckpoint2(checkpointPath) : /* @__PURE__ */ new Set();
  const selectedIds = new Set(questions.map((q) => q.id));
  const results = [];
  let totalAnswerInputTokens = 0;
  let totalAnswerOutputTokens = 0;
  let totalJudgeInputTokens = 0;
  let totalJudgeOutputTokens = 0;
  // Nothing in this function measures embedding usage, so this is reported as 0.
  const ingestEmbedTokens = 0;
  let errorCount = 0;
  let done = 0;
  // Count only selected questions that still need to run; the checkpoint may
  // contain ids that are not part of the current selection.
  const total = questions.reduce((count, q) => (checkpoint.has(q.id) ? count : count + 1), 0);
  const startTime = Date.now();

  const processQuestion = async (q) => {
    if (checkpoint.has(q.id)) return;
    let modelAnswer = "";
    let answerIn = 0;
    let answerOut = 0;
    let judgeIn = 0;
    let judgeOut = 0;
    let correct = false;
    let contextTruncated = false;
    let errorMsg;
    try {
      let prompt;
      if (mode === "memory") {
        const memory = await opts.memoryFactory(q.id);
        const scope = { kind: "agent", agentId: `lme:${q.id}` };
        await ingestQuestionIntoMemory(q, memory, scope);
        const retrieved = await memory.retrieve({ text: q.question, scope, topK });
        const snippets = retrieved.snippets.map((s) => s.text);
        prompt = buildMemoryAnswerPrompt2(snippets, q.question, q.questionDate);
      } else {
        const { text, truncated } = renderHaystackCapped(q.sessions, fullContextMaxChars);
        contextTruncated = truncated;
        prompt = buildFullContextAnswerPrompt2(text, q.question, q.questionDate);
      }
      const resp = await answerModel.complete({
        messages: [{ role: "user", content: prompt }],
        tools: []
      });
      modelAnswer = resp.content
        .filter((b) => b.type === "text")
        .map((b) => b.text)
        .join("")
        .trim();
      answerIn = resp.usage?.inputTokens ?? 0;
      answerOut = resp.usage?.outputTokens ?? 0;

      const judgePrompt = buildJudgePrompt2(
        q.question,
        q.answer,
        modelAnswer,
        q.baseType,
        q.isAbstention
      );
      const judgeResult = await callJudge2(judgeModel, judgePrompt);
      correct = judgeResult.correct;
      judgeIn = judgeResult.inputTokens;
      judgeOut = judgeResult.outputTokens;
    } catch (err) {
      // Thrown values are not guaranteed to be Error instances.
      errorMsg = err instanceof Error ? err.message : String(err);
      errorCount++;
      correct = false;
    }
    const appearedToAbstain = appearsToDecline2(modelAnswer);
    const row = {
      questionId: q.id,
      questionType: q.type,
      isAbstention: q.isAbstention,
      question: q.question,
      goldAnswer: q.answer,
      modelAnswer,
      correct,
      appearedToAbstain,
      ...contextTruncated ? { contextTruncated } : {},
      ...errorMsg !== void 0 ? { error: errorMsg } : {},
      answerInputTokens: answerIn,
      answerOutputTokens: answerOut,
      judgeInputTokens: judgeIn,
      judgeOutputTokens: judgeOut
    };
    results.push(row);
    totalAnswerInputTokens += answerIn;
    totalAnswerOutputTokens += answerOut;
    totalJudgeInputTokens += judgeIn;
    totalJudgeOutputTokens += judgeOut;
    if (checkpointPath) {
      await appendCheckpointRow2(checkpointPath, row);
    }
    done++;
    if (onProgress) onProgress(done, total);
  };

  // Questions are started in batches of `concurrencyLimit`; each batch is
  // awaited in full before the next one begins.
  const concurrencyLimit = Math.max(1, concurrency);
  const pending = [];
  for (const q of questions) {
    pending.push(processQuestion(q));
    if (pending.length >= concurrencyLimit) {
      await Promise.all(pending.splice(0, concurrencyLimit));
    }
  }
  if (pending.length > 0) await Promise.all(pending);

  // Fold in rows recorded by earlier runs. Rows written during this run are
  // skipped because their ids were not in `checkpoint` when it was loaded.
  if (checkpointPath && checkpoint.size > 0) {
    const raw = await readFile3(checkpointPath, "utf-8").catch(() => "");
    const merged = /* @__PURE__ */ new Set();
    for (const line of raw.split("\n")) {
      const trimmed = line.trim();
      if (!trimmed) continue;
      let row;
      try {
        row = JSON.parse(trimmed);
      } catch {
        // Unparseable rows are skipped, matching loadCheckpoint2.
        continue;
      }
      const id = row?.questionId;
      // Keep only resumed rows that belong to this run's selection, once each.
      if (!checkpoint.has(id) || !selectedIds.has(id) || merged.has(id)) continue;
      merged.add(id);
      results.push(row);
      totalAnswerInputTokens += row.answerInputTokens ?? 0;
      totalAnswerOutputTokens += row.answerOutputTokens ?? 0;
      totalJudgeInputTokens += row.judgeInputTokens ?? 0;
      totalJudgeOutputTokens += row.judgeOutputTokens ?? 0;
      if (row.error !== void 0) errorCount++;
    }
  }

  const byTypeMap = {};
  let overallCorrect = 0;
  let overallTotal = 0;
  let abstentionCorrect = 0;
  let abstentionTotal = 0;
  for (const row of results) {
    if (row.isAbstention) {
      abstentionTotal++;
      if (row.correct) abstentionCorrect++;
    } else {
      // Strip a trailing "_abs" marker so variants share their base type bucket.
      const bt = row.questionType.endsWith("_abs") ? row.questionType.slice(0, -4) : row.questionType;
      if (!byTypeMap[bt]) byTypeMap[bt] = { correct: 0, total: 0 };
      overallTotal++;
      byTypeMap[bt].total++;
      if (row.correct) {
        overallCorrect++;
        byTypeMap[bt].correct++;
      }
    }
  }
  const byType = {};
  for (const [t, stats] of Object.entries(byTypeMap)) {
    byType[t] = {
      ...stats,
      accuracy: stats.total > 0 ? stats.correct / stats.total : 0
    };
  }
  const abstentionAccuracy = abstentionTotal > 0 ? {
    correct: abstentionCorrect,
    total: abstentionTotal,
    accuracy: abstentionCorrect / abstentionTotal
  } : void 0;
  const wallClockMs = Date.now() - startTime;
  const allTypes = [...new Set(questions.map((q) => q.type))].sort();
  return {
    config: {
      mode,
      topK,
      answerModelId: answerModel.modelId ?? "(unknown)",
      judgeModelId: judgeModel.modelId ?? "(unknown)",
      datasetSource: LONGMEMEVAL_SOURCE,
      seed,
      types: allTypes,
      questionsRun: results.length
    },
    overall: {
      correct: overallCorrect,
      total: overallTotal,
      accuracy: overallTotal > 0 ? overallCorrect / overallTotal : 0
    },
    byType,
    ...abstentionAccuracy !== void 0 ? { abstentionAccuracy } : {},
    tokens: {
      ingestEmbedTokens,
      answerInputTokens: totalAnswerInputTokens,
      answerOutputTokens: totalAnswerOutputTokens,
      judgeInputTokens: totalJudgeInputTokens,
      judgeOutputTokens: totalJudgeOutputTokens,
      totalInputTokens: totalAnswerInputTokens + totalJudgeInputTokens,
      totalOutputTokens: totalAnswerOutputTokens + totalJudgeOutputTokens
    },
    wallClockMs,
    questions: results,
    errorCount
  };
}
|
|
1341
|
+
|
|
1342
|
+
// src/lme-render.ts
|
|
1343
|
+
// Column headings used in the Markdown report, keyed by question base type.
// Types without an entry here are shown under their raw type name.
var QUESTION_TYPE_LABELS = {
  // Single-session families
  "single-session-user": "Single-session (user)",
  "single-session-assistant": "Single-session (asst.)",
  "single-session-preference": "Single-session (pref.)",
  // Cross-session and reasoning families
  "multi-session": "Multi-session",
  "temporal-reasoning": "Temporal reasoning",
  "knowledge-update": "Knowledge update"
};
|
|
1351
|
+
/**
 * Formats a fraction as a percentage with one decimal place.
 *
 * @param {number} n Fraction, e.g. 0.5 for fifty percent.
 * @returns {string} e.g. "50.0%".
 */
function pct2(n) {
  const scaled = n * 100;
  return `${scaled.toFixed(1)}%`;
}
|
|
1354
|
+
/**
 * Formats a number using en-US conventions (comma thousands separators).
 *
 * @param {number} n
 * @returns {string} e.g. "1,234,567".
 */
function fmtNum2(n) {
  const locale = "en-US";
  return n.toLocaleString(locale);
}
|
|
1357
|
+
/**
 * Estimates the dollar cost of a run from its total token counts.
 *
 * @param {{totalInputTokens: number, totalOutputTokens: number}} tokens
 * @param {{inputPer1M: number, outputPer1M: number}} [prices] Price per one
 *   million input / output tokens.
 * @returns {string} "$" followed by the cost with four decimals, or an em dash
 *   when no prices are supplied.
 */
function estimateCost2(tokens, prices) {
  if (!prices) {
    return "\u2014";
  }
  const inputCost = tokens.totalInputTokens / 1e6 * prices.inputPer1M;
  const outputCost = tokens.totalOutputTokens / 1e6 * prices.outputPer1M;
  const total = inputCost + outputCost;
  return `$${total.toFixed(4)}`;
}
|
|
1362
|
+
/**
 * Formats an accuracy record as "<percent> (<correct>/<total>)".
 *
 * @param {{accuracy: number, correct: number, total: number}} [s]
 * @returns {string} The formatted statistic, or an em dash when the record is
 *   missing or has a `total` of 0.
 */
function fmtStat(s) {
  const placeholder = "\u2014";
  if (!s) {
    return placeholder;
  }
  if (s.total === 0) {
    return placeholder;
  }
  return `${pct2(s.accuracy)} (${s.correct}/${s.total})`;
}
|
|
1366
|
+
/**
 * Renders one or more LongMemEval benchmark reports as a Markdown document.
 *
 * The document contains a title and dataset attribution, a results table (one
 * row per report, one column per question type seen in any report), a
 * per-report configuration section, and fixed methodology notes. With no
 * reports, only the title block and a "_No results yet._" line are produced.
 *
 * @param {Array<object>} reports Reports exposing `config`, `byType`, `overall`,
 *   `abstentionAccuracy`, `tokens`, `questions`, `wallClockMs` and `errorCount`.
 * @param {{inputPer1M: number, outputPer1M: number}} [prices] Optional token
 *   prices used for the estimated-cost column.
 * @returns {string} Markdown text with lines joined by "\n".
 */
function renderLongMemEvalReportMarkdown(reports, prices) {
  const out = [
    "# LongMemEval Benchmark Results",
    "",
    "Dataset: [LongMemEval](https://github.com/xiaowu0162/LongMemEval) (Wu et al.) \xB7 MIT License",
    "Raw data is not redistributed. Only aggregate results are published here.",
    ""
  ];
  if (reports.length === 0) {
    out.push("_No results yet._");
    return out.join("\n");
  }

  // Column order: known types in a fixed order first, then any other types
  // found in the reports, sorted.
  const canonicalOrder = [
    "single-session-user",
    "single-session-assistant",
    "single-session-preference",
    "multi-session",
    "temporal-reasoning",
    "knowledge-update"
  ];
  const seenTypes = /* @__PURE__ */ new Set();
  for (const report of reports) {
    for (const typeName of Object.keys(report.byType)) seenTypes.add(typeName);
  }
  const extraTypes = [...seenTypes].filter((t) => !canonicalOrder.includes(t)).sort();
  const columnTypes = [...canonicalOrder.filter((t) => seenTypes.has(t)), ...extraTypes];

  const headers = [
    "System / Mode",
    ...columnTypes.map((t) => QUESTION_TYPE_LABELS[t] ?? t),
    "Overall accuracy",
    "Abstention accuracy",
    "Tokens/query",
    "Est. cost/run",
    "Answer model",
    "Judge model",
    "topK",
    "n-Q",
    "Seed",
    "Dataset provenance"
  ];
  const toTableRow = (cells) => `| ${cells.join(" | ")} |`;

  out.push(
    "## Results",
    "",
    toTableRow(headers),
    toTableRow(headers.map(() => "---"))
  );
  for (const r of reports) {
    const c = r.config;
    const questionCount = r.questions.length;
    const tokensPerQuery = questionCount > 0
      ? Math.round((r.tokens.totalInputTokens + r.tokens.totalOutputTokens) / questionCount)
      : 0;
    const provenance = `${c.datasetSource.url.replace("https://", "")} @ ${c.datasetSource.snapshotSha.slice(0, 8)}`;
    out.push(
      toTableRow([
        `${c.answerModelId} / ${c.mode}`,
        ...columnTypes.map((t) => fmtStat(r.byType[t])),
        fmtStat(r.overall),
        fmtStat(r.abstentionAccuracy),
        fmtNum2(tokensPerQuery),
        estimateCost2(r.tokens, prices),
        c.answerModelId,
        c.judgeModelId,
        // topK is only shown for memory-mode runs.
        c.mode === "memory" ? String(c.topK) : "\u2014",
        fmtNum2(r.config.questionsRun),
        String(c.seed),
        provenance
      ])
    );
  }

  out.push("", "## Run Configuration", "");
  for (const r of reports) {
    const c = r.config;
    out.push(
      `### ${c.answerModelId} / ${c.mode}`,
      "",
      `- **Mode**: ${c.mode}`,
      `- **Answer model**: ${c.answerModelId}`,
      `- **Judge model**: ${c.judgeModelId}`
    );
    if (c.mode === "memory") {
      out.push(`- **topK**: ${c.topK}`);
    }
    out.push(
      `- **Dataset source**: ${c.datasetSource.url}`,
      `- **Dataset snapshot SHA**: \`${c.datasetSource.snapshotSha}\``,
      `- **Dataset file**: ${c.datasetSource.file}`,
      `- **Dataset license**: ${c.datasetSource.license}`,
      `- **Seed**: ${c.seed}`,
      `- **Types**: ${c.types.join(", ") || "all"}`,
      `- **Questions run**: ${c.questionsRun}`,
      `- **Wall-clock**: ${(r.wallClockMs / 1e3).toFixed(1)}s`,
      `- **Errors**: ${r.errorCount}`,
      `- **Tokens** (in/out): ${fmtNum2(r.tokens.totalInputTokens)} / ${fmtNum2(r.tokens.totalOutputTokens)}`,
      ""
    );
  }

  out.push(
    "## Methodology Notes",
    "",
    "These results were produced using the Eidentic LongMemEval fair-run harness. The following rules apply:",
    "",
    "1. **Per-question memory scope.** Each question has its own haystack (~50 sessions on average). A fresh Memory instance is created per question; no cross-question contamination.",
    "2. **Dual-granularity ingest.** Each turn is ingested with its session date in the text (temporally anchored). An additional session-level chunk entry captures multi-turn context.",
    "3. **Current date in prompt.** The `question_date` is passed to the answer prompt so temporal questions can reason about recency.",
    "4. **topK \u2264 10 in memory mode.** Larger topK values trivialise retrieval quality and are not permitted.",
    "5. **Full-context baseline is required** alongside any memory-mode result.",
    "6. **Judge is strict**: a model answer is correct only when it contains the gold answer's specific information. Vague/topical-only answers are wrong. Equivalent date expressions for the same date/duration are correct (temporal-reasoning type).",
    "7. **Abstention questions** (not present in longmemeval_s.json standard split): correct = model declined / said no-info / identified a flawed premise; fabricating a specific answer = wrong. Abstention accuracy is reported separately and not folded into overall accuracy.",
    "8. **Dataset license**: MIT \u2014 raw data is not redistributed; only aggregate results are published.",
    "",
    "> Per-type question counts in longmemeval_s.json (500 total):",
    "> single-session-user 70, single-session-assistant 56, single-session-preference 30,",
    "> multi-session 133, temporal-reasoning 133, knowledge-update 78.",
    "> No abstention variants in the standard _s split.",
    ""
  );
  return out.join("\n");
}
|
|
1502
|
+
|
|
400
1503
|
// src/write-quality.ts
|
|
401
1504
|
var CONTRADICTION_FIXTURES = [
|
|
402
1505
|
{
|
|
@@ -774,7 +1877,7 @@ async function runTemporalBench(memory, dataset, opts = {}) {
|
|
|
774
1877
|
}
|
|
775
1878
|
|
|
776
1879
|
// src/datasets/temporal.ts
|
|
777
|
-
function
|
|
1880
|
+
function makeRng3(seed) {
|
|
778
1881
|
let s = seed >>> 0;
|
|
779
1882
|
if (s === 0) s = 1;
|
|
780
1883
|
return () => {
|
|
@@ -857,7 +1960,7 @@ function syntheticTemporalDataset(opts = {}) {
|
|
|
857
1960
|
const entityCount = opts.entityCount ?? 4;
|
|
858
1961
|
const seed = opts.seed ?? 42;
|
|
859
1962
|
const changesPerProperty = opts.changesPerProperty ?? 3;
|
|
860
|
-
const rng =
|
|
1963
|
+
const rng = makeRng3(seed);
|
|
861
1964
|
const entities = [];
|
|
862
1965
|
const asserts = [];
|
|
863
1966
|
const questions = [];
|
|
@@ -943,12 +2046,22 @@ function syntheticTemporalDataset(opts = {}) {
|
|
|
943
2046
|
export {
|
|
944
2047
|
CONTRADICTION_FIXTURES,
|
|
945
2048
|
JUNK_STREAM_FIXTURES,
|
|
2049
|
+
LOCOMO_SOURCE_SHA,
|
|
2050
|
+
LONGMEMEVAL_SOURCE,
|
|
946
2051
|
factRecall,
|
|
947
2052
|
loadLoCoMo,
|
|
2053
|
+
loadLoCoMo2 as loadLoCoMoLegacy,
|
|
948
2054
|
loadLongMemEval,
|
|
2055
|
+
loadLongMemEval2 as loadLongMemEvalLegacy,
|
|
949
2056
|
normalizeText,
|
|
950
2057
|
normalizedIncludes,
|
|
2058
|
+
parseLmeDateTimeString,
|
|
951
2059
|
recallAtK,
|
|
2060
|
+
renderLocomoReportMarkdown,
|
|
2061
|
+
renderLongMemEvalReportMarkdown,
|
|
2062
|
+
resolveEvidence,
|
|
2063
|
+
runLocomoBench,
|
|
2064
|
+
runLongMemEvalBench,
|
|
952
2065
|
runMemoryBench,
|
|
953
2066
|
runTemporalBench,
|
|
954
2067
|
runWriteQualityBench,
|