@eidentic/bench 0.1.0 → 0.1.1
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/README.md +39 -2
- package/dist/chunk-PVIWNXCY.js +99 -0
- package/dist/index.cjs +657 -3
- package/dist/index.d.cts +255 -2
- package/dist/index.d.ts +255 -2
- package/dist/index.js +544 -3
- package/dist/locomo-loader-YA3IEOND.js +8 -0
- package/package.json +4 -4
package/dist/index.cjs
CHANGED
|
@@ -5,6 +5,9 @@ var __getOwnPropDesc = Object.getOwnPropertyDescriptor;
|
|
|
5
5
|
var __getOwnPropNames = Object.getOwnPropertyNames;
|
|
6
6
|
var __getProtoOf = Object.getPrototypeOf;
|
|
7
7
|
var __hasOwnProp = Object.prototype.hasOwnProperty;
|
|
8
|
+
/**
 * Bundler helper: wraps a single-entry `{ "module/path"() { ... } }` object
 * into a run-once initializer. The first call runs the entry and caches its
 * return value; later calls return the cached value without re-running it.
 */
var __esm = (fn, res) => function __init() {
  if (fn) {
    const [entryName] = __getOwnPropNames(fn);
    const initializer = fn[entryName];
    // Clear the holder before invoking so the initializer never runs twice,
    // even when it throws or re-enters __init().
    fn = 0;
    res = initializer(0);
  }
  return res;
};
|
|
8
11
|
var __export = (target, all) => {
|
|
9
12
|
for (var name in all)
|
|
10
13
|
__defProp(target, name, { get: all[name], enumerable: true });
|
|
@@ -27,17 +30,128 @@ var __toESM = (mod, isNodeMode, target) => (target = mod != null ? __create(__ge
|
|
|
27
30
|
));
|
|
28
31
|
var __toCommonJS = (mod) => __copyProps(__defProp({}, "__esModule", { value: true }), mod);
|
|
29
32
|
|
|
33
|
+
// src/locomo-loader.ts
|
|
34
|
+
var locomo_loader_exports = {};
|
|
35
|
+
__export(locomo_loader_exports, {
|
|
36
|
+
LOCOMO_SOURCE_SHA: () => LOCOMO_SOURCE_SHA,
|
|
37
|
+
loadLoCoMo: () => loadLoCoMo2
|
|
38
|
+
});
|
|
39
|
+
/**
 * Check that a file can be stat'ed and is not larger than a size cap.
 *
 * @param {string} filePath - Path passed to `fs.promises.stat`.
 * @param {number} maxBytes - Largest acceptable file size, in bytes.
 * @returns {Promise<void>} Resolves when the file is within the cap.
 * @throws {Error} When the file cannot be stat'ed (the underlying failure is
 *   attached as `cause`) or when it is larger than `maxBytes`.
 */
async function assertFileSize2(filePath, maxBytes) {
  let fileSize;
  try {
    ({ size: fileSize } = await (0, import_promises2.stat)(filePath));
  } catch (err) {
    // Non-Error throwables have no `.message`; stringify them instead of
    // printing "undefined".
    const reason = err instanceof Error ? err.message : String(err);
    // Keep the original error reachable so callers can still inspect
    // `cause.code` (ENOENT, EACCES, ...).
    throw new Error(
      `bench loader: cannot stat file "${filePath}": ${reason}`,
      { cause: err }
    );
  }
  if (fileSize > maxBytes) {
    const mb = (fileSize / (1024 * 1024)).toFixed(1);
    const capMb = (maxBytes / (1024 * 1024)).toFixed(0);
    throw new Error(
      `bench loader: file "${filePath}" is ${mb} MiB, which exceeds the ${capMb} MiB cap. Pass a larger maxBytes option if this is intentional.`
    );
  }
}
|
|
57
|
+
/**
 * Convert a LoCoMo session timestamp such as "1:56 pm on 8 May, 2023" into
 * epoch milliseconds.
 *
 * The LoCoMo shape (`h:mm am|pm [on] D Month, YYYY`) is parsed explicitly and
 * interpreted in the local time zone, so the result does not depend on how
 * the JavaScript engine's `Date.parse` treats non-ISO strings. Any other
 * format falls back to `Date.parse`, first as-is (minus a standalone "on"),
 * then with the leading time moved behind the date.
 *
 * @param {string} raw - Timestamp text.
 * @returns {number} Epoch milliseconds, or 0 when the text cannot be parsed.
 */
function parseLocomoDateTime(raw) {
  const MONTHS = [
    "january", "february", "march", "april", "may", "june",
    "july", "august", "september", "october", "november", "december"
  ];
  const structured = /^\s*(\d{1,2}):(\d{2})\s*(am|pm)\s+(?:on\s+)?(\d{1,2})\s+([A-Za-z]{3,9})\.?,?\s+(\d{4})\s*$/i.exec(raw);
  if (structured) {
    const [, hh, mm, meridiem, dd, monthName, yyyy] = structured;
    const wanted = monthName.toLowerCase();
    const monthIndex = MONTHS.findIndex((m) => m === wanted || m.slice(0, 3) === wanted);
    const hour12 = Number(hh);
    const minute = Number(mm);
    const day = Number(dd);
    if (monthIndex >= 0 && hour12 >= 1 && hour12 <= 12 && minute <= 59) {
      // 12 am is hour 0 and 12 pm is hour 12.
      const hour24 = hour12 % 12 + (meridiem.toLowerCase() === "pm" ? 12 : 0);
      const date = new Date(Number(yyyy), monthIndex, day, hour24, minute);
      // `new Date` rolls invalid days over (31 Feb -> 3 Mar); reject those.
      if (date.getMonth() === monthIndex && date.getDate() === day) {
        return date.getTime();
      }
    }
  }
  // Only strip "on" when it is a standalone word, never the tail of one
  // (e.g. "Mon 8 May").
  const cleaned = raw.replace(/(?:^|\s+)on\s+/i, " ").trim();
  const ms = Date.parse(cleaned);
  if (!Number.isNaN(ms)) return ms;
  const match = /^(\d{1,2}:\d{2}\s*(?:am|pm))\s+(.+)$/i.exec(cleaned);
  if (!match) return 0;
  const swapped = Date.parse(`${match[2]} ${match[1]}`);
  return Number.isNaN(swapped) ? 0 : swapped;
}
|
|
70
|
+
/**
 * Extract the ordered list of sessions from a LoCoMo `conversation` object.
 *
 * Sessions live under keys `session_<n>` (an array of turns) with an optional
 * sibling `session_<n>_date_time` string. Keys that normalise to the same
 * number (e.g. `session_01` and `session_1`) are emitted once, and entries
 * whose value is not an array are skipped.
 *
 * @param {object} conv - Raw `conversation` object of one sample.
 * @returns {Array<{index: number, dateTime: string, dateTimeMs: number, turns: Array<{speaker: *, diaId: *, text: *}>}>}
 *   Sessions sorted by ascending index. `dateTimeMs` is 0 when no date string
 *   is present.
 */
function parseSessions(conv) {
  // A Set prevents the same session from being emitted twice when two keys
  // parse to the same integer.
  const seen = new Set();
  for (const key of Object.keys(conv)) {
    const m = /^session_(\d+)$/.exec(key);
    if (m) seen.add(Number.parseInt(m[1], 10));
  }
  const indices = [...seen].sort((a, b) => a - b);
  const sessions = [];
  for (const idx of indices) {
    const turnsRaw = conv[`session_${idx}`];
    if (!Array.isArray(turnsRaw)) continue;
    const dateTimeValue = conv[`session_${idx}_date_time`];
    const dateTimeRaw = typeof dateTimeValue === "string" ? dateTimeValue : "";
    // Drop null / non-object entries instead of throwing on property access.
    const turns = turnsRaw
      .filter((t) => t !== null && typeof t === "object")
      .map((t) => ({
        speaker: t.speaker ?? "",
        diaId: t.dia_id ?? "",
        text: t.text ?? ""
      }));
    sessions.push({
      index: idx,
      dateTime: dateTimeRaw,
      dateTimeMs: dateTimeRaw ? parseLocomoDateTime(dateTimeRaw) : 0,
      turns
    });
  }
  return sessions;
}
|
|
96
|
+
/**
 * Load a LoCoMo JSON file and normalise it into `{ samples }`.
 *
 * @param {string} jsonPath - Path of the LoCoMo JSON file.
 * @param {{maxBytes?: number}} [opts] - `maxBytes` overrides the default
 *   file-size cap checked before reading.
 * @returns {Promise<{samples: Array<object>}>} One entry per root-array
 *   element with `sampleId`, `speakerA`, `speakerB`, `sessions` and `qa`.
 * @throws {SyntaxError} When the file is not valid JSON (parser error attached
 *   as `cause`).
 * @throws {TypeError} When a root-array element is not an object.
 * @throws {Error} When the size check fails or the JSON root is not an array.
 */
async function loadLoCoMo2(jsonPath, opts) {
  await assertFileSize2(jsonPath, opts?.maxBytes ?? DEFAULT_MAX_BYTES2);
  const fileText = await (0, import_promises2.readFile)(jsonPath, "utf-8");
  let raw;
  try {
    // A UTF-8 byte-order mark is not valid JSON; strip it before parsing.
    raw = JSON.parse(fileText.replace(/^\uFEFF/, ""));
  } catch (err) {
    throw new SyntaxError(
      `bench loader: file "${jsonPath}" is not valid JSON: ${err instanceof Error ? err.message : String(err)}`,
      { cause: err }
    );
  }
  if (!Array.isArray(raw)) {
    throw new Error(
      `bench loader: expected the LoCoMo JSON root to be an array, but got ${typeof raw}. Did you pass the correct file?`
    );
  }
  const samples = raw.map((s, i) => {
    if (s === null || typeof s !== "object") {
      throw new TypeError(
        `bench loader: LoCoMo sample at index ${i} is not an object (got ${s === null ? "null" : typeof s}).`
      );
    }
    const sampleId = s.sample_id ?? String(i);
    const conv = s.conversation ?? {};
    const sessions = parseSessions(conv);
    // Tolerate a missing or malformed `qa` field by treating it as empty.
    const qaRaw = Array.isArray(s.qa) ? s.qa : [];
    const qa = qaRaw.map((q) => ({
      question: q.question ?? "",
      // `== null` also covers JSON null, which would otherwise become "null".
      answer: q.answer == null ? void 0 : String(q.answer),
      category: q.category,
      evidence: Array.isArray(q.evidence) ? q.evidence : [],
      adversarialAnswer: q.adversarial_answer
    }));
    return {
      sampleId,
      speakerA: String(conv.speaker_a ?? ""),
      speakerB: String(conv.speaker_b ?? ""),
      sessions,
      qa
    };
  });
  return { samples };
}
|
|
129
|
+
var import_promises2, LOCOMO_SOURCE_SHA, DEFAULT_MAX_BYTES2;
|
|
130
|
+
var init_locomo_loader = __esm({
|
|
131
|
+
"src/locomo-loader.ts"() {
|
|
132
|
+
"use strict";
|
|
133
|
+
import_promises2 = require("node:fs/promises");
|
|
134
|
+
LOCOMO_SOURCE_SHA = "3eb6f2c585f5e1699204e3c3bdf7adc5c28cb376";
|
|
135
|
+
DEFAULT_MAX_BYTES2 = 256 * 1024 * 1024;
|
|
136
|
+
}
|
|
137
|
+
});
|
|
138
|
+
|
|
30
139
|
// src/index.ts
|
|
31
140
|
var index_exports = {};
|
|
32
141
|
__export(index_exports, {
|
|
33
142
|
CONTRADICTION_FIXTURES: () => CONTRADICTION_FIXTURES,
|
|
34
143
|
JUNK_STREAM_FIXTURES: () => JUNK_STREAM_FIXTURES,
|
|
144
|
+
LOCOMO_SOURCE_SHA: () => LOCOMO_SOURCE_SHA,
|
|
35
145
|
factRecall: () => factRecall,
|
|
36
|
-
loadLoCoMo: () =>
|
|
146
|
+
loadLoCoMo: () => loadLoCoMo2,
|
|
147
|
+
loadLoCoMoLegacy: () => loadLoCoMo,
|
|
37
148
|
loadLongMemEval: () => loadLongMemEval,
|
|
38
149
|
normalizeText: () => normalizeText,
|
|
39
150
|
normalizedIncludes: () => normalizedIncludes,
|
|
40
151
|
recallAtK: () => recallAtK,
|
|
152
|
+
renderLocomoReportMarkdown: () => renderLocomoReportMarkdown,
|
|
153
|
+
resolveEvidence: () => resolveEvidence,
|
|
154
|
+
runLocomoBench: () => runLocomoBench,
|
|
41
155
|
runMemoryBench: () => runMemoryBench,
|
|
42
156
|
runTemporalBench: () => runTemporalBench,
|
|
43
157
|
runWriteQualityBench: () => runWriteQualityBench,
|
|
@@ -445,6 +559,541 @@ async function loadLoCoMo(jsonPath, opts) {
|
|
|
445
559
|
return { name: "LoCoMo", cases: benchCases };
|
|
446
560
|
}
|
|
447
561
|
|
|
562
|
+
// src/index.ts
|
|
563
|
+
init_locomo_loader();
|
|
564
|
+
|
|
565
|
+
// src/locomo-run.ts
|
|
566
|
+
var import_promises3 = require("node:fs/promises");
|
|
567
|
+
var import_node_fs = require("node:fs");
|
|
568
|
+
init_locomo_loader();
|
|
569
|
+
/**
 * Create a deterministic pseudo-random generator (32-bit xorshift with the
 * 13 / 17 / 5 shift triple).
 *
 * @param {number} seed - Coerced to an unsigned 32-bit integer; 0 is replaced
 *   by 1 because an all-zero state would never change.
 * @returns {() => number} Function returning the next value, computed as the
 *   unsigned 32-bit state divided by 2^32.
 */
function makeRng(seed) {
  let state = seed >>> 0 || 1;
  const next = () => {
    state ^= state << 13;
    state ^= state >>> 17;
    state ^= state << 5;
    // Bitwise ops yield signed int32; reinterpret as unsigned before scaling.
    state >>>= 0;
    return state / 4294967296;
  };
  return next;
}
|
|
580
|
+
/**
 * Return a shuffled copy of `arr` using a Fisher-Yates pass driven by `rng`.
 * The input is left untouched.
 *
 * @param {Iterable<*>} arr - Items to shuffle.
 * @param {() => number} rng - Source of numbers in [0, 1).
 * @returns {Array<*>} New array holding the same items in shuffled order.
 */
function seededShuffle(arr, rng) {
  const shuffled = [...arr];
  let hi = shuffled.length - 1;
  while (hi > 0) {
    const pick = Math.floor(rng() * (hi + 1));
    const held = shuffled[hi];
    shuffled[hi] = shuffled[pick];
    shuffled[pick] = held;
    hi -= 1;
  }
  return shuffled;
}
|
|
588
|
+
var ANSWER_INSTRUCTION = "Answer the question concisely and specifically based only on the information provided. If the provided context contains no answer to the question, reply exactly: No information available";
|
|
589
|
+
/**
 * Build the answer prompt for memory mode: numbered retrieved snippets, then
 * the question, then the shared answer instruction.
 *
 * @param {string[]} snippets - Retrieved context texts, in rank order.
 * @param {string} question - Question to answer.
 * @returns {string} Prompt text.
 */
function buildMemoryAnswerPrompt(snippets, question) {
  let context = "(no relevant context retrieved)";
  if (snippets.length > 0) {
    // Snippets are numbered from 1 and separated by a blank line.
    context = snippets
      .map((snippet, position) => `[${position + 1}] ${snippet}`)
      .join("\n\n");
  }
  return [
    "Context from conversation:",
    `${context}`,
    "",
    `Question: ${question}`,
    "",
    `${ANSWER_INSTRUCTION}`
  ].join("\n");
}
|
|
598
|
+
/**
 * Build the answer prompt for full-context mode: the whole rendered
 * conversation, then the question, then the shared answer instruction.
 *
 * @param {string} conversationText - Rendered conversation transcript.
 * @param {string} question - Question to answer.
 * @returns {string} Prompt text.
 */
function buildFullContextAnswerPrompt(conversationText, question) {
  const sections = [
    "Conversation:",
    `${conversationText}`,
    "",
    `Question: ${question}`,
    "",
    `${ANSWER_INSTRUCTION}`
  ];
  return sections.join("\n");
}
|
|
606
|
+
/**
 * Build the prompt sent to the judge model.
 *
 * For category-5 (unanswerable) questions the judge is asked whether the
 * model declined, optionally warned about a known trap answer. For all other
 * categories the judge compares the model answer with the gold answer.
 *
 * @param {string} question - Question that was asked.
 * @param {string|undefined} goldAnswer - Reference answer; rendered as
 *   "(none)" when null or undefined.
 * @param {string} modelAnswer - Answer produced by the model under test.
 * @param {string|undefined} adversarialAnswer - Trap answer for category 5.
 * @param {boolean} isCategory5 - Selects the refusal-style prompt.
 * @returns {string} Judge prompt text.
 */
function buildJudgePrompt(question, goldAnswer, modelAnswer, adversarialAnswer, isCategory5) {
  if (!isCategory5) {
    return [
      "You are a strict judge evaluating whether a model answer is correct.",
      "CORRECT: the model answer contains the specific information in the gold answer (exact match or clear paraphrase).",
      "WRONG: the model answer is vague, only topically related, or contradicts the gold answer.",
      "For date/time questions: equivalent date expressions referring to the same date are correct.",
      "",
      `Question: ${question}`,
      `Gold answer: ${goldAnswer ?? "(none)"}`,
      `Model answer: ${modelAnswer}`,
      "",
      'Respond with JSON only: {"correct": true} or {"correct": false}'
    ].join("\n");
  }
  let trapNote = "";
  if (adversarialAnswer) {
    // The warning is appended directly after the first sentence on a new line.
    trapNote = `\nWARNING \u2014 trap answer to watch for: "${adversarialAnswer}". If the model answer closely matches this trap, mark incorrect.`;
  }
  return [
    `You are a strict judge. For this unanswerable question, a CORRECT response means the model refused to answer or said something equivalent to "No information available" or "I don't know".${trapNote}`,
    "",
    `Question: ${question}`,
    `Model answer: ${modelAnswer}`,
    "",
    'Respond with JSON only: {"correct": true} if the model declined (correct), {"correct": false} if it made a claim.'
  ].join("\n");
}
|
|
628
|
+
/**
 * Render every session of a sample as plain text: a `Session N` header
 * (followed by an em dash and the date when one is known), one
 * `[speaker]: text` line per turn, and a blank line between sessions.
 *
 * @param {{sessions: Array<{index: number, dateTime: string, turns: Array<{speaker: string, text: string}>}>}} sample
 * @returns {string} Transcript with surrounding whitespace trimmed.
 */
function renderConversation(sample) {
  const transcript = [];
  for (const { index, dateTime, turns } of sample.sessions) {
    const suffix = dateTime ? ` \u2014 ${dateTime}` : "";
    transcript.push(`Session ${index}${suffix}`);
    for (const { speaker, text } of turns) {
      transcript.push(`[${speaker}]: ${text}`);
    }
    // Blank separator line after each session.
    transcript.push("");
  }
  return transcript.join("\n").trim();
}
|
|
640
|
+
/**
 * Turn one sample into memory events and hand them to `memory.ingest` in a
 * single call.
 *
 * Two kinds of events are produced per session: one per turn (text prefixed
 * with the session header and speaker) and one "chunk" holding the whole
 * session transcript.
 *
 * @param {{sampleId: *, sessions: Array<object>}} sample - Parsed sample.
 * @param {{ingest: (events: object[]) => Promise<*>}} memory - Receiver.
 * @param {object} scope - Scope value copied onto every event.
 * @returns {Promise<void>}
 */
async function ingestSampleIntoMemory(sample, memory, scope) {
  const events = [];
  for (const sess of sample.sessions) {
    const dateLabel = sess.dateTime ? ` \u2014 ${sess.dateTime}` : "";
    const header = `Session ${sess.index}${dateLabel}`;
    // A dateTimeMs of 0 means "unknown"; omit the timestamp in that case.
    const ingestedAt = sess.dateTimeMs || void 0;
    for (const [position, turn] of sess.turns.entries()) {
      // Turns without a dia_id would otherwise all share one event id; fall
      // back to a positional key so every event stays unique.
      const turnKey = turn.diaId ? `turn:${turn.diaId}` : `sess:${sess.index}:turn:${position}`;
      events.push({
        id: `${sample.sampleId}:${turnKey}`,
        scope,
        text: `[${header}] [${turn.speaker}]: ${turn.text}`,
        metadata: {
          diaId: turn.diaId,
          sessionIndex: sess.index,
          ingestedAt
        }
      });
    }
    const sessionText = [
      header,
      ...sess.turns.map((t) => `[${t.speaker}]: ${t.text}`)
    ].join("\n");
    events.push({
      id: `${sample.sampleId}:sess:${sess.index}:chunk`,
      scope,
      text: sessionText,
      metadata: { sessionIndex: sess.index, ingestedAt }
    });
  }
  await memory.ingest(events);
}
|
|
670
|
+
/**
 * Ask the judge model for a verdict and normalise it to a boolean.
 *
 * The structured `response.object.correct` is used when it is a boolean.
 * Otherwise the text blocks are scanned: first for a `"correct": true|false`
 * pair, then for a bare `true` that is a whole word and is not accompanied by
 * `false`. Anything else counts as incorrect.
 *
 * @param {{complete: (req: object) => Promise<object>}} judgeModel
 * @param {string} prompt - Judge prompt text.
 * @returns {Promise<{correct: boolean, inputTokens: number, outputTokens: number}>}
 *   Token counts default to 0 when the response carries no usage data.
 */
async function callJudge(judgeModel, prompt) {
  const response = await judgeModel.complete({
    messages: [{ role: "user", content: prompt }],
    tools: [],
    outputSchema: {
      type: "object",
      properties: { correct: { type: "boolean" } },
      required: ["correct"],
      // OpenAI strict structured-output mode requires this to be explicit.
      additionalProperties: false
    }
  });
  let correct = false;
  if (response.object && typeof response.object.correct === "boolean") {
    correct = response.object.correct;
  } else {
    // Tolerate responses without a content array instead of throwing.
    const blocks = Array.isArray(response.content) ? response.content : [];
    const text = blocks
      .filter((b) => b?.type === "text")
      .map((b) => b.text ?? "")
      .join("")
      .toLowerCase()
      .trim();
    if (/"correct"\s*:\s*true/.test(text)) {
      correct = true;
    } else if (/"correct"\s*:\s*false/.test(text)) {
      correct = false;
    } else {
      // Whole-word match only: "untrue" or "false, not true" must not be
      // scored as a positive verdict.
      correct = /\btrue\b/.test(text) && !/\bfalse\b/.test(text);
    }
  }
  return {
    correct,
    inputTokens: response.usage?.inputTokens ?? 0,
    outputTokens: response.usage?.outputTokens ?? 0
  };
}
|
|
697
|
+
/**
 * Read a JSON-lines checkpoint file and collect the keys of questions that
 * were already processed.
 *
 * @param {string} path - Checkpoint file path.
 * @returns {Promise<Set<string>>} Keys of the form `<sampleId>:<questionIndex>`.
 *   Empty when the file does not exist. Blank lines, lines that are not valid
 *   JSON, and rows missing either field are ignored.
 * @throws {Error} Any read failure other than "file not found".
 */
async function loadCheckpoint(path) {
  const done = /* @__PURE__ */ new Set();
  let raw;
  try {
    // Read directly instead of checking existence first: avoids a blocking
    // call and the window between the check and the read.
    raw = await (0, import_promises3.readFile)(path, "utf-8");
  } catch (err) {
    if (err?.code === "ENOENT") return done;
    throw err;
  }
  for (const line of raw.split("\n")) {
    const trimmed = line.trim();
    if (!trimmed) continue;
    let row;
    try {
      row = JSON.parse(trimmed);
    } catch {
      // Deliberately skipped: a line that does not parse (for example one
      // cut short by an interrupted write) carries no usable progress.
      continue;
    }
    if (row === null || typeof row !== "object") continue;
    // Nullish/empty checks rather than truthiness so a numeric id of 0 counts.
    const hasSampleId = row.sampleId != null && row.sampleId !== "";
    if (hasSampleId && row.questionIndex != null) {
      done.add(`${row.sampleId}:${row.questionIndex}`);
    }
  }
  return done;
}
|
|
714
|
+
/**
 * Append one result row to the checkpoint file as a single JSON line.
 *
 * @param {string} path - Checkpoint file path.
 * @param {object} row - Row to serialise with `JSON.stringify`.
 * @returns {Promise<void>}
 */
async function appendCheckpointRow(path, row) {
  const line = `${JSON.stringify(row)}\n`;
  await (0, import_promises3.appendFile)(path, line, "utf-8");
}
|
|
717
|
+
// Phrases treated as the model declining to answer. The apostrophe in
// "don't" may be absent, ASCII ('), or typographic (U+2019), since model
// output frequently uses the curly form.
var DECLINE_PATTERNS = /\bno information available\b|\bi (don['\u2019]?t|do not) know\b|\bcannot (find|answer|determine)\b|\bunavailable\b/i;

/**
 * Report whether an answer contains one of the known refusal phrases.
 *
 * @param {string} answer - Model answer text.
 * @returns {boolean} True when any pattern in DECLINE_PATTERNS matches.
 */
function appearsToDecline(answer) {
  return DECLINE_PATTERNS.test(answer);
}
|
|
721
|
+
/**
 * Run the LoCoMo question-answering benchmark and aggregate a report.
 *
 * Flow: load or accept the dataset, optionally down-sample samples and
 * questions with seeded shuffles, prepare per-sample context (rendered
 * transcript in "full-context" mode, ingested memory in "memory" mode),
 * answer and judge every queued question, then compute per-category and
 * overall accuracy plus token totals.
 *
 * @param {object} opts
 * @param {object} opts.answerModel - Model with `complete()` used to answer.
 * @param {object} opts.judgeModel - Model with `complete()` used to judge.
 * @param {"memory"|"full-context"} opts.mode - Context strategy.
 * @param {number[]} [opts.categories=[1,2,3,4,5]] - Question categories to run.
 * @param {number} [opts.sampleLimit] - Max number of samples (seeded pick).
 * @param {number} [opts.questionLimit] - Max questions per sample (seeded pick).
 * @param {number} [opts.seed=42] - Seed for both down-sampling shuffles.
 * @param {number} [opts.topK=10] - Retrieval depth; capped at 10.
 * @param {number} [opts.concurrency=1] - Questions processed per batch.
 * @param {(done: number, total: number) => void} [opts.onProgress]
 * @param {string} [opts.checkpointPath] - JSON-lines file used to resume.
 * @param {object} [opts.dataset] - Pre-loaded dataset; otherwise `opts.dataPath` is loaded.
 * @param {(sampleId: *) => Promise<object>} [opts.memoryFactory] - Required in memory mode.
 * @returns {Promise<object>} Report with `config`, `overallJ14`, `byCategory`,
 *   `cat5RefusalRate`, `tokens`, `wallClockMs`, `questions` and `errorCount`.
 * @throws {Error} When mode is "memory" and no `memoryFactory` is supplied.
 */
async function runLocomoBench(opts) {
  const {
    answerModel,
    judgeModel,
    mode,
    categories = [1, 2, 3, 4, 5],
    sampleLimit,
    questionLimit,
    seed = 42,
    topK: rawTopK = 10,
    concurrency = 1,
    onProgress,
    checkpointPath
  } = opts;
  const topK = Math.min(rawTopK, 10);

  let dataset;
  if (opts.dataset) {
    dataset = opts.dataset;
  } else {
    const { loadLoCoMo: loadLoCoMo3 } = await Promise.resolve().then(() => (init_locomo_loader(), locomo_loader_exports));
    dataset = await loadLoCoMo3(opts.dataPath);
  }

  let samples = [...dataset.samples];
  if (sampleLimit !== void 0 && sampleLimit < samples.length) {
    samples = seededShuffle(samples, makeRng(seed)).slice(0, sampleLimit);
  }

  const keyOf = (sampleId, questionIndex) => `${sampleId}:${questionIndex}`;

  // Remember each question's position in `sample.qa` while filtering so it
  // does not have to be searched for again afterwards.
  const queue = [];
  for (const sample of samples) {
    let entries = sample.qa
      .map((qa, qaIndex) => ({ sample, qaIndex, qa }))
      .filter((entry) => categories.includes(entry.qa.category));
    if (questionLimit !== void 0 && questionLimit < entries.length) {
      entries = seededShuffle(entries, makeRng(seed + 1)).slice(0, questionLimit);
    }
    for (const entry of entries) queue.push(entry);
  }
  const queueKeys = new Set(queue.map((item) => keyOf(item.sample.sampleId, item.qaIndex)));

  const checkpoint = checkpointPath ? await loadCheckpoint(checkpointPath) : /* @__PURE__ */ new Set();

  const conversationText = /* @__PURE__ */ new Map();
  if (mode === "full-context") {
    for (const sample of samples) {
      conversationText.set(sample.sampleId, renderConversation(sample));
    }
  }

  const memories = /* @__PURE__ */ new Map();
  // Ingestion token usage is not reported by this harness; kept in the report
  // shape as zeros.
  const ingestInputTokens = 0;
  const ingestOutputTokens = 0;
  if (mode === "memory") {
    if (!opts.memoryFactory) {
      throw new Error("runLocomoBench: memoryFactory is required when mode='memory'");
    }
    for (const sample of samples) {
      const memory = await opts.memoryFactory(sample.sampleId);
      const scope = { kind: "agent", agentId: `locomo:${sample.sampleId}` };
      await ingestSampleIntoMemory(sample, memory, scope);
      memories.set(sample.sampleId, memory);
    }
  }

  const results = [];
  let totalAnswerInputTokens = 0;
  let totalAnswerOutputTokens = 0;
  let totalJudgeInputTokens = 0;
  let totalJudgeOutputTokens = 0;
  let errorCount = 0;
  let done = 0;
  // Count only queued questions that still need work; the checkpoint may hold
  // rows that are not part of this run's queue.
  const total = queue.filter((item) => !checkpoint.has(keyOf(item.sample.sampleId, item.qaIndex))).length;
  const startTime = Date.now();

  const processItem = async (item) => {
    const { sample, qa, qaIndex } = item;
    if (checkpoint.has(keyOf(sample.sampleId, qaIndex))) return;
    let modelAnswer = "";
    let answerIn = 0;
    let answerOut = 0;
    let judgeIn = 0;
    let judgeOut = 0;
    let correct = false;
    let trapTriggered;
    let errorMsg;
    try {
      let prompt;
      if (mode === "memory") {
        const memory = memories.get(sample.sampleId);
        const scope = { kind: "agent", agentId: `locomo:${sample.sampleId}` };
        const retrieved = await memory.retrieve({ text: qa.question, scope, topK });
        prompt = buildMemoryAnswerPrompt(retrieved.snippets.map((s) => s.text), qa.question);
      } else {
        prompt = buildFullContextAnswerPrompt(conversationText.get(sample.sampleId) ?? "", qa.question);
      }
      const resp = await answerModel.complete({
        messages: [{ role: "user", content: prompt }],
        tools: []
      });
      modelAnswer = resp.content.filter((b) => b.type === "text").map((b) => b.text).join("").trim();
      answerIn = resp.usage?.inputTokens ?? 0;
      answerOut = resp.usage?.outputTokens ?? 0;

      const isCategory5 = qa.category === 5;
      const judgePrompt = buildJudgePrompt(
        qa.question,
        qa.answer,
        modelAnswer,
        qa.adversarialAnswer,
        isCategory5
      );
      const judgeResult = await callJudge(judgeModel, judgePrompt);
      correct = judgeResult.correct;
      judgeIn = judgeResult.inputTokens;
      judgeOut = judgeResult.outputTokens;

      if (isCategory5 && qa.adversarialAnswer) {
        const adversarialLower = String(qa.adversarialAnswer).toLowerCase();
        const answerLower = modelAnswer.toLowerCase();
        // An empty answer is a substring of everything; it must not count as
        // having repeated the trap.
        trapTriggered = answerLower.length > 0 && (answerLower.includes(adversarialLower) || adversarialLower.includes(answerLower.slice(0, 30)));
        if (trapTriggered && correct) correct = false;
        // An explicit refusal wins over both the judge and the trap heuristic.
        if (!correct && appearsToDecline(modelAnswer)) {
          correct = true;
          trapTriggered = false;
        }
      }
    } catch (err) {
      // Record the failure on the row and keep going with the other questions.
      errorMsg = err instanceof Error ? err.message : String(err);
      errorCount++;
      correct = false;
    }
    const row = {
      sampleId: sample.sampleId,
      questionIndex: qaIndex,
      question: qa.question,
      goldAnswer: qa.answer,
      category: qa.category,
      modelAnswer,
      correct,
      ...trapTriggered !== void 0 ? { trapTriggered } : {},
      ...errorMsg !== void 0 ? { error: errorMsg } : {},
      answerInputTokens: answerIn,
      answerOutputTokens: answerOut,
      judgeInputTokens: judgeIn,
      judgeOutputTokens: judgeOut
    };
    results.push(row);
    totalAnswerInputTokens += answerIn;
    totalAnswerOutputTokens += answerOut;
    totalJudgeInputTokens += judgeIn;
    totalJudgeOutputTokens += judgeOut;
    if (checkpointPath) {
      await appendCheckpointRow(checkpointPath, row);
    }
    done++;
    if (onProgress) onProgress(done, total);
  };

  // Process the queue in batches of `concurrencyLimit`, waiting for a whole
  // batch before starting the next one.
  const concurrencyLimit = Math.max(1, concurrency);
  const pending = [];
  for (const item of queue) {
    pending.push(processItem(item));
    if (pending.length >= concurrencyLimit) {
      await Promise.all(pending.splice(0, concurrencyLimit));
    }
  }
  if (pending.length > 0) await Promise.all(pending);

  // Merge rows completed by an earlier run. Only rows that belong to this
  // run's queue are restored, and each key at most once, so the report matches
  // the configured samples/categories even if the file holds other rows.
  if (checkpointPath && checkpoint.size > 0) {
    const raw = await (0, import_promises3.readFile)(checkpointPath, "utf-8").catch(() => "");
    const restored = new Set();
    for (const line of raw.split("\n")) {
      const trimmed = line.trim();
      if (!trimmed) continue;
      let row;
      try {
        row = JSON.parse(trimmed);
      } catch {
        // Unparseable line (e.g. interrupted write): nothing to restore.
        continue;
      }
      if (row === null || typeof row !== "object") continue;
      const key = keyOf(row.sampleId, row.questionIndex);
      if (!checkpoint.has(key) || !queueKeys.has(key) || restored.has(key)) continue;
      restored.add(key);
      results.push(row);
      totalAnswerInputTokens += row.answerInputTokens ?? 0;
      totalAnswerOutputTokens += row.answerOutputTokens ?? 0;
      totalJudgeInputTokens += row.judgeInputTokens ?? 0;
      totalJudgeOutputTokens += row.judgeOutputTokens ?? 0;
    }
  }

  const byCategoryMap = {};
  for (const row of results) {
    const cat = String(row.category);
    if (!byCategoryMap[cat]) byCategoryMap[cat] = { correct: 0, total: 0 };
    byCategoryMap[cat].total++;
    if (row.correct) byCategoryMap[cat].correct++;
  }
  const byCategory = {};
  for (const [cat, stats] of Object.entries(byCategoryMap)) {
    byCategory[cat] = {
      ...stats,
      accuracy: stats.total > 0 ? stats.correct / stats.total : 0
    };
  }

  // J(1-4): pooled accuracy over categories 1 through 4 only.
  let j14Correct = 0;
  let j14Total = 0;
  for (const [cat, stats] of Object.entries(byCategory)) {
    const n = Number.parseInt(cat, 10);
    if (n >= 1 && n <= 4) {
      j14Correct += stats.correct;
      j14Total += stats.total;
    }
  }
  const cat5Stats = byCategory["5"];
  const cat5RefusalRate = cat5Stats ? { correct: cat5Stats.correct, total: cat5Stats.total, rate: cat5Stats.accuracy } : void 0;
  const wallClockMs = Date.now() - startTime;

  return {
    config: {
      mode,
      topK,
      answerModelId: answerModel.modelId ?? "(unknown)",
      judgeModelId: judgeModel.modelId ?? "(unknown)",
      datasetSha: LOCOMO_SOURCE_SHA,
      seed,
      categories: [...categories].sort((a, b) => a - b),
      samplesRun: samples.length,
      questionsRun: results.length
    },
    overallJ14: {
      correct: j14Correct,
      total: j14Total,
      accuracy: j14Total > 0 ? j14Correct / j14Total : 0
    },
    byCategory,
    cat5RefusalRate,
    tokens: {
      ingestInputTokens,
      ingestOutputTokens,
      answerInputTokens: totalAnswerInputTokens,
      answerOutputTokens: totalAnswerOutputTokens,
      judgeInputTokens: totalJudgeInputTokens,
      judgeOutputTokens: totalJudgeOutputTokens,
      totalInputTokens: ingestInputTokens + totalAnswerInputTokens + totalJudgeInputTokens,
      totalOutputTokens: ingestOutputTokens + totalAnswerOutputTokens + totalJudgeOutputTokens
    },
    wallClockMs,
    questions: results,
    errorCount
  };
}
|
|
968
|
+
|
|
969
|
+
// src/locomo-render.ts
|
|
970
|
+
/**
 * Format a fraction as a percentage with one decimal place.
 *
 * @param {number} n - Fraction, e.g. 0.5.
 * @returns {string} Percentage text, e.g. "50.0%".
 */
function pct(n) {
  const scaled = n * 100;
  return `${scaled.toFixed(1)}%`;
}
|
|
973
|
+
/**
 * Format a number with en-US digit grouping.
 *
 * @param {number} n - Value to format.
 * @returns {string} Formatted text, e.g. "1,234,567".
 */
function fmtNum(n) {
  const locale = "en-US";
  return n.toLocaleString(locale);
}
|
|
976
|
+
/**
 * Estimate the dollar cost of a run from total token counts.
 *
 * @param {{totalInputTokens: number, totalOutputTokens: number}} tokens
 * @param {{inputPer1M: number, outputPer1M: number}} [prices] - Price per one
 *   million input / output tokens.
 * @returns {string} "$" followed by the cost with four decimals, or an em
 *   dash when no prices are given.
 */
function estimateCost(tokens, prices) {
  if (!prices) {
    return "\u2014";
  }
  const inputCost = tokens.totalInputTokens / 1e6 * prices.inputPer1M;
  const outputCost = tokens.totalOutputTokens / 1e6 * prices.outputPer1M;
  const cost = inputCost + outputCost;
  return `$${cost.toFixed(4)}`;
}
|
|
981
|
+
/**
 * Render one or more benchmark reports as a Markdown document: a title and
 * dataset notice, a results table (one row per report), a per-report run
 * configuration list, and fixed methodology notes.
 *
 * @param {Array<object>} reports - Reports as returned by the LoCoMo runner.
 * @param {{inputPer1M: number, outputPer1M: number}} [prices] - Optional
 *   token prices used for the estimated-cost column.
 * @returns {string} Markdown text. With no reports, only the heading, the
 *   notice and a "_No results yet._" line are produced.
 */
function renderLocomoReportMarkdown(reports, prices) {
  const DASH = "\u2014";
  const tableRow = (cells) => "| " + cells.join(" | ") + " |";
  const out = [
    "# LoCoMo Benchmark Results",
    "",
    "Dataset: [LoCoMo](https://github.com/snap-research/locomo) (Snap Research) \xB7 CC BY-NC 4.0",
    "Raw data is not redistributed. Only aggregate results are published here.",
    ""
  ];
  if (reports.length === 0) {
    out.push("_No results yet._");
    return out.join("\n");
  }

  const columns = [
    "System / Mode",
    "Cat1 (multi-hop)",
    "Cat2 (temporal)",
    "Cat3 (open-domain)",
    "Cat4 (single-hop)",
    "J(1\u20134) overall",
    "Cat5 refusal rate",
    "Tokens/query",
    "Est. cost/run",
    "Answer model",
    "Judge model",
    "topK",
    "n-Q",
    "Seed",
    "Dataset SHA"
  ];
  out.push("## Results", "", tableRow(columns), tableRow(columns.map(() => "---")));

  for (const report of reports) {
    const cfg = report.config;
    // "accuracy% (correct/total)" for one category, or a dash when not run.
    const categoryCell = (n) => {
      const stats = report.byCategory[String(n)];
      return stats ? `${pct(stats.accuracy)} (${stats.correct}/${stats.total})` : DASH;
    };
    const refusal = report.cat5RefusalRate;
    const refusalCell = refusal ? `${pct(refusal.rate)} (${refusal.correct}/${refusal.total})` : DASH;
    const questionCount = report.questions.length;
    let tokensPerQuery = 0;
    if (questionCount > 0) {
      tokensPerQuery = Math.round((report.tokens.totalInputTokens + report.tokens.totalOutputTokens) / questionCount);
    }
    const overall = report.overallJ14;
    out.push(tableRow([
      `${cfg.answerModelId} / ${cfg.mode}`,
      categoryCell(1),
      categoryCell(2),
      categoryCell(3),
      categoryCell(4),
      `${pct(overall.accuracy)} (${overall.correct}/${overall.total})`,
      refusalCell,
      fmtNum(tokensPerQuery),
      estimateCost(report.tokens, prices),
      cfg.answerModelId,
      cfg.judgeModelId,
      String(cfg.topK),
      fmtNum(report.config.questionsRun),
      String(cfg.seed),
      cfg.datasetSha.slice(0, 8)
    ]));
  }

  out.push("", "## Run Configuration", "");
  for (const report of reports) {
    const cfg = report.config;
    out.push(
      `### ${cfg.answerModelId} / ${cfg.mode}`,
      "",
      `- **Mode**: ${cfg.mode}`,
      `- **Answer model**: ${cfg.answerModelId}`,
      `- **Judge model**: ${cfg.judgeModelId}`,
      `- **topK**: ${cfg.topK}`,
      `- **Dataset SHA**: \`${cfg.datasetSha}\``,
      `- **Seed**: ${cfg.seed}`,
      `- **Categories**: ${cfg.categories.join(", ")}`,
      `- **Samples run**: ${cfg.samplesRun}`,
      `- **Questions run**: ${cfg.questionsRun}`,
      `- **Wall-clock**: ${(report.wallClockMs / 1e3).toFixed(1)}s`,
      `- **Errors**: ${report.errorCount}`,
      `- **Tokens** (in/out): ${fmtNum(report.tokens.totalInputTokens)} / ${fmtNum(report.tokens.totalOutputTokens)}`,
      ""
    );
  }

  out.push(
    "## Methodology Notes",
    "",
    "These results were produced using the Eidentic LoCoMo fair-run harness. The following rules apply:",
    "",
    "1. **Both speakers are treated as humans.** Turns are ingested as `[SpeakerName]: text` \u2014 never mapped to user/assistant roles.",
    "2. **Timestamps are structural.** Each session is prefixed with a header line `Session N \u2014 <date>` and `ingestedAt` metadata carries the epoch-ms.",
    "3. **topK \u2264 10 in memory mode.** Larger topK values trivialise retrieval quality and are not permitted.",
    "4. **Full-context baseline is required** alongside any memory-mode result.",
    "5. **Judge is strict**: a model answer is correct only when it contains the gold answer's specific information. Vague/topical-only answers are wrong.",
    "6. **Category 5 (adversarial)**: correct = model declined; adversarial-trap match = wrong.",
    "7. **Primary metric J(1\u20134)**: denominator is the number of cat 1\u20134 questions actually run (max 1540 on full dataset).",
    "8. **Dataset license**: CC BY-NC 4.0 \u2014 raw data is not redistributed; only aggregate results are published.",
    "",
    "> Category mapping in locomo10.json: 1=multi-hop (282), 2=temporal (321), 3=open-domain (96), 4=single-hop (841), 5=adversarial (446).",
    ""
  );
  return out.join("\n");
}
|
|
1080
|
+
|
|
1081
|
+
// src/locomo-types.ts
|
|
1082
|
+
/**
 * Look up the text of the turns referenced by a list of dialogue ids.
 *
 * @param {{sessions: Array<{turns: Array<{diaId: *, text: *}>}>}} sample
 * @param {Iterable<*>} diaIds - Dialogue ids to resolve, in the desired order.
 * @returns {Array<*>} Turn texts in the order of `diaIds`; ids with no
 *   matching turn are skipped. When several turns share an id, the last one
 *   in session order is used.
 */
function resolveEvidence(sample, diaIds) {
  const turnsById = /* @__PURE__ */ new Map();
  for (const { turns } of sample.sessions) {
    for (const turn of turns) {
      turnsById.set(turn.diaId, turn);
    }
  }
  const texts = [];
  for (const id of diaIds) {
    const hit = turnsById.get(id);
    if (!hit) continue;
    texts.push(hit.text);
  }
  return texts;
}
|
|
1096
|
+
|
|
448
1097
|
// src/write-quality.ts
|
|
449
1098
|
var CONTRADICTION_FIXTURES = [
|
|
450
1099
|
{
|
|
@@ -822,7 +1471,7 @@ async function runTemporalBench(memory, dataset, opts = {}) {
|
|
|
822
1471
|
}
|
|
823
1472
|
|
|
824
1473
|
// src/datasets/temporal.ts
|
|
825
|
-
function
|
|
1474
|
+
function makeRng2(seed) {
|
|
826
1475
|
let s = seed >>> 0;
|
|
827
1476
|
if (s === 0) s = 1;
|
|
828
1477
|
return () => {
|
|
@@ -905,7 +1554,7 @@ function syntheticTemporalDataset(opts = {}) {
|
|
|
905
1554
|
const entityCount = opts.entityCount ?? 4;
|
|
906
1555
|
const seed = opts.seed ?? 42;
|
|
907
1556
|
const changesPerProperty = opts.changesPerProperty ?? 3;
|
|
908
|
-
const rng =
|
|
1557
|
+
const rng = makeRng2(seed);
|
|
909
1558
|
const entities = [];
|
|
910
1559
|
const asserts = [];
|
|
911
1560
|
const questions = [];
|
|
@@ -992,12 +1641,17 @@ function syntheticTemporalDataset(opts = {}) {
|
|
|
992
1641
|
0 && (module.exports = {
|
|
993
1642
|
CONTRADICTION_FIXTURES,
|
|
994
1643
|
JUNK_STREAM_FIXTURES,
|
|
1644
|
+
LOCOMO_SOURCE_SHA,
|
|
995
1645
|
factRecall,
|
|
996
1646
|
loadLoCoMo,
|
|
1647
|
+
loadLoCoMoLegacy,
|
|
997
1648
|
loadLongMemEval,
|
|
998
1649
|
normalizeText,
|
|
999
1650
|
normalizedIncludes,
|
|
1000
1651
|
recallAtK,
|
|
1652
|
+
renderLocomoReportMarkdown,
|
|
1653
|
+
resolveEvidence,
|
|
1654
|
+
runLocomoBench,
|
|
1001
1655
|
runMemoryBench,
|
|
1002
1656
|
runTemporalBench,
|
|
1003
1657
|
runWriteQualityBench,
|