@eidentic/bench 0.1.0 → 0.1.2

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
package/dist/index.cjs CHANGED
@@ -5,6 +5,9 @@ var __getOwnPropDesc = Object.getOwnPropertyDescriptor;
5
5
  var __getOwnPropNames = Object.getOwnPropertyNames;
6
6
  var __getProtoOf = Object.getPrototypeOf;
7
7
  var __hasOwnProp = Object.prototype.hasOwnProperty;
8
// Bundler runtime helper: wraps a module body so that it runs at most once.
// `fn` is an object whose first own property is the module's init function;
// the returned `__init` runs it on the first call and afterwards keeps
// returning the value stored in `res`.
var __esm = (fn, res) => function __init() {
  if (fn) {
    const runModuleBody = fn[__getOwnPropNames(fn)[0]];
    // Clear `fn` before running the body so that a re-entrant call made
    // while the body is executing returns early instead of running it twice.
    fn = 0;
    res = runModuleBody(0);
  }
  return res;
};
8
11
  var __export = (target, all) => {
9
12
  for (var name in all)
10
13
  __defProp(target, name, { get: all[name], enumerable: true });
@@ -27,17 +30,238 @@ var __toESM = (mod, isNodeMode, target) => (target = mod != null ? __create(__ge
27
30
  ));
28
31
  var __toCommonJS = (mod) => __copyProps(__defProp({}, "__esModule", { value: true }), mod);
29
32
 
33
+ // src/locomo-loader.ts
34
// Export table for src/locomo-loader.ts. Each entry is registered through
// `__export` as a getter, so the bindings are read lazily at access time
// rather than when this statement runs.
var locomo_loader_exports = {};
__export(locomo_loader_exports, {
  LOCOMO_SOURCE_SHA: function () {
    return LOCOMO_SOURCE_SHA;
  },
  loadLoCoMo: function () {
    return loadLoCoMo2;
  }
});
39
/**
 * Guards a dataset load by rejecting files larger than `maxBytes` before
 * they are read into memory.
 *
 * @param {string} filePath - Path handed to `fs.promises.stat`.
 * @param {number} maxBytes - Largest accepted file size, in bytes.
 * @returns {Promise<void>} Resolves when the file exists and is within the cap.
 * @throws {Error} When the file cannot be stat-ed (the underlying failure is
 *   attached as `cause`) or when it is larger than `maxBytes`.
 */
async function assertFileSize2(filePath, maxBytes) {
  let fileSize;
  try {
    const stats = await (0, import_promises2.stat)(filePath);
    fileSize = stats.size;
  } catch (err) {
    // Non-Error throwables have no `.message`; stringify them instead of
    // reporting "undefined", and keep the original failure reachable.
    const reason = err instanceof Error ? err.message : String(err);
    throw new Error(
      `bench loader: cannot stat file "${filePath}": ${reason}`,
      { cause: err }
    );
  }
  if (fileSize > maxBytes) {
    const bytesPerMiB = 1024 * 1024;
    const mb = (fileSize / bytesPerMiB).toFixed(1);
    const capMb = (maxBytes / bytesPerMiB).toFixed(0);
    throw new Error(
      `bench loader: file "${filePath}" is ${mb} MiB, which exceeds the ${capMb} MiB cap. Pass a larger maxBytes option if this is intentional.`
    );
  }
}
57
/**
 * Converts a LoCoMo session timestamp string into epoch milliseconds.
 *
 * The first "on" separator is replaced with a space and the result is handed
 * to `Date.parse`. If that fails and the string starts with a clock time
 * ("h:mm am/pm"), the time is moved behind the rest of the string and parsing
 * is retried.
 *
 * @param {string} raw - Timestamp text.
 * @returns {number} Epoch milliseconds, or 0 when neither attempt parses.
 */
function parseLocomoDateTime(raw) {
  const cleaned = raw.replace(/\s*on\s+/, " ").trim();

  const direct = Date.parse(cleaned);
  if (!Number.isNaN(direct)) {
    return direct;
  }

  const timeFirst = /^(\d{1,2}:\d{2}\s*(?:am|pm))\s+(.+)$/i.exec(cleaned);
  if (timeFirst === null) {
    return 0;
  }

  const [, timePart, datePart] = timeFirst;
  const swapped = Date.parse(`${datePart} ${timePart}`);
  return Number.isNaN(swapped) ? 0 : swapped;
}
70
/**
 * Extracts the numbered sessions from a LoCoMo `conversation` object.
 *
 * Keys of the form `session_<n>` are collected and processed in ascending
 * numeric order; entries whose value is not an array are skipped. The
 * matching `session_<n>_date_time` value is used as the timestamp when it is
 * a string, otherwise the timestamp is empty.
 *
 * @param {object} conv - Raw conversation record.
 * @returns {Array<{index: number, dateTime: string, dateTimeMs: number, turns: Array<{speaker: string, diaId: string, text: string}>}>}
 */
function parseSessions(conv) {
  const sessionNumbers = Object.keys(conv)
    .map((key) => /^session_(\d+)$/.exec(key))
    .filter((hit) => hit !== null)
    .map((hit) => parseInt(hit[1], 10));
  sessionNumbers.sort((left, right) => left - right);

  const parsed = [];
  for (const number of sessionNumbers) {
    const rawTurns = conv[`session_${number}`];
    if (!Array.isArray(rawTurns)) {
      continue;
    }

    const rawStamp = conv[`session_${number}_date_time`];
    const dateTime = typeof rawStamp === "string" ? rawStamp : "";

    // Missing turn fields become empty strings so consumers never see undefined.
    const turns = rawTurns.map((turn) => {
      return {
        speaker: turn.speaker ?? "",
        diaId: turn.dia_id ?? "",
        text: turn.text ?? ""
      };
    });

    parsed.push({
      index: number,
      dateTime,
      // An empty timestamp is never passed to the parser; it maps straight to 0.
      dateTimeMs: dateTime ? parseLocomoDateTime(dateTime) : 0,
      turns
    });
  }
  return parsed;
}
96
/**
 * Loads a LoCoMo JSON file and converts it into the bench's sample shape.
 *
 * The file size is checked against `opts.maxBytes` (or the module default)
 * before the file is read and parsed.
 *
 * @param {string} jsonPath - Path of the JSON file.
 * @param {{maxBytes?: number}} [opts] - Optional size cap override.
 * @returns {Promise<{samples: Array<object>}>} One entry per root array element.
 * @throws {Error} When the size check fails or the JSON root is not an array.
 */
async function loadLoCoMo2(jsonPath, opts) {
  const byteCap = opts?.maxBytes ?? DEFAULT_MAX_BYTES2;
  await assertFileSize2(jsonPath, byteCap);

  const fileText = await (0, import_promises2.readFile)(jsonPath, "utf-8");
  const raw = JSON.parse(fileText);
  if (!Array.isArray(raw)) {
    throw new Error(
      `bench loader: expected the LoCoMo JSON root to be an array, but got ${typeof raw}. Did you pass the correct file?`
    );
  }

  // A present answer is always stringified; an absent one stays undefined.
  const toQa = (entry) => ({
    question: entry.question ?? "",
    answer: entry.answer !== void 0 ? String(entry.answer) : void 0,
    category: entry.category,
    evidence: Array.isArray(entry.evidence) ? entry.evidence : [],
    adversarialAnswer: entry.adversarial_answer
  });

  const toSample = (record, position) => {
    const conv = record.conversation ?? {};
    return {
      // Records without a sample_id fall back to their position in the file.
      sampleId: record.sample_id ?? String(position),
      speakerA: String(conv.speaker_a ?? ""),
      speakerB: String(conv.speaker_b ?? ""),
      sessions: parseSessions(conv),
      qa: (record.qa ?? []).map((entry) => toQa(entry))
    };
  };

  return { samples: raw.map((record, position) => toSample(record, position)) };
}
129
// Module-level bindings of src/locomo-loader.ts. They are declared with `var`
// and left unassigned here; the assignments happen inside the lazy initializer.
var import_promises2;
var LOCOMO_SOURCE_SHA;
var DEFAULT_MAX_BYTES2;

// Lazy initializer for src/locomo-loader.ts, wrapped by `__esm`.
var init_locomo_loader = __esm({
  "src/locomo-loader.ts": function () {
    "use strict";
    import_promises2 = require("node:fs/promises");
    LOCOMO_SOURCE_SHA = "3eb6f2c585f5e1699204e3c3bdf7adc5c28cb376";
    // Default size cap for loadLoCoMo: 256 MiB.
    DEFAULT_MAX_BYTES2 = 256 * 1024 ** 2;
  }
});
138
+
139
+ // src/lme-loader.ts
140
// Export table for src/lme-loader.ts. Each entry is registered through
// `__export` as a getter, so the bindings are read lazily at access time
// rather than when this statement runs.
var lme_loader_exports = {};
__export(lme_loader_exports, {
  LONGMEMEVAL_SOURCE: function () {
    return LONGMEMEVAL_SOURCE;
  },
  loadLongMemEval: function () {
    return loadLongMemEval2;
  },
  parseLmeDateTimeString: function () {
    return parseLmeDateTimeString;
  }
});
146
/**
 * Guards a dataset load by rejecting files larger than `maxBytes` before
 * they are read into memory.
 *
 * @param {string} filePath - Path handed to `fs.promises.stat`.
 * @param {number} maxBytes - Largest accepted file size, in bytes.
 * @returns {Promise<void>} Resolves when the file exists and is within the cap.
 * @throws {Error} When the file cannot be stat-ed (the underlying failure is
 *   attached as `cause`) or when it is larger than `maxBytes`.
 */
async function assertFileSize3(filePath, maxBytes) {
  let fileSize;
  try {
    const stats = await (0, import_promises4.stat)(filePath);
    fileSize = stats.size;
  } catch (err) {
    // Non-Error throwables have no `.message`; stringify them instead of
    // reporting "undefined", and keep the original failure reachable.
    const reason = err instanceof Error ? err.message : String(err);
    throw new Error(
      `bench loader: cannot stat file "${filePath}": ${reason}`,
      { cause: err }
    );
  }
  if (fileSize > maxBytes) {
    const bytesPerMiB = 1024 * 1024;
    const mb = (fileSize / bytesPerMiB).toFixed(1);
    const capMb = (maxBytes / bytesPerMiB).toFixed(0);
    throw new Error(
      `bench loader: file "${filePath}" is ${mb} MiB, which exceeds the ${capMb} MiB cap. Pass a larger maxBytes option if this is intentional.`
    );
  }
}
164
/**
 * Converts a LongMemEval timestamp such as "2023/05/20 (Sat) 02:21" into
 * epoch milliseconds.
 *
 * The first parenthesised alphabetic tag is dropped, a leading "YYYY/MM/DD"
 * is rewritten with dashes, and a plain "YYYY-MM-DD HH:mm[:ss[.sss]]" value
 * gets a "T" separator so that `Date.parse` receives the ISO 8601 form it is
 * specified to accept instead of relying on an engine-specific fallback.
 * A date-time without an offset is interpreted as local time by `Date.parse`;
 * a date-only value is interpreted as UTC.
 *
 * @param {string} raw - Timestamp text.
 * @returns {number} Epoch milliseconds, or 0 for empty, non-string or
 *   unparseable input.
 */
function parseLmeDateTimeString(raw) {
  if (typeof raw !== "string" || raw === "") return 0;
  const withoutWeekday = raw.replace(/\s*\([A-Za-z]+\)\s*/, " ").trim();
  const dashed = withoutWeekday.replace(/^(\d{4})\/(\d{2})\/(\d{2})/, "$1-$2-$3");
  // Only rewrite the exact date + clock-time shape; anything else is passed
  // to Date.parse unchanged.
  const iso = dashed.replace(
    /^(\d{4}-\d{2}-\d{2})\s+(\d{2}:\d{2}(?::\d{2}(?:\.\d{1,3})?)?)$/,
    "$1T$2"
  );
  const ms = Date.parse(iso);
  return Number.isNaN(ms) ? 0 : ms;
}
172
/**
 * Strips the "_abs" suffix from a question type.
 *
 * @param {string} rawType - Question type, possibly ending in "_abs".
 * @returns {string} The type without the suffix; unchanged when there is none.
 */
function extractBaseType(rawType) {
  const suffix = "_abs";
  return rawType.endsWith(suffix) ? rawType.slice(0, -suffix.length) : rawType;
}
178
/**
 * Builds one normalised LongMemEval session.
 *
 * @param {string} id - Session identifier.
 * @param {string} dateTime - Raw session timestamp.
 * @param {Array<object>} rawTurns - Raw turn records.
 * @returns {{id: string, dateTime: string, dateTimeMs: number, turns: Array<{role: string, content: string, hasAnswer: boolean}>}}
 */
function parseSession(id, dateTime, rawTurns) {
  const turns = [];
  for (const rawTurn of rawTurns) {
    turns.push({
      // Any role other than "assistant" is treated as "user".
      role: rawTurn.role === "assistant" ? "assistant" : "user",
      content: rawTurn.content ?? "",
      // Only the literal boolean true counts; truthy strings do not.
      hasAnswer: rawTurn.has_answer === true
    });
  }
  const dateTimeMs = parseLmeDateTimeString(dateTime);
  return { id, dateTime, dateTimeMs, turns };
}
191
/**
 * Loads a LongMemEval JSON file and converts it into the bench's question
 * shape.
 *
 * The file size is checked against `opts.maxBytes` (or the module default)
 * before the file is read and parsed. Each question's haystack sessions are
 * sorted by ascending `dateTimeMs`.
 *
 * @param {string} jsonPath - Path of the JSON file.
 * @param {{maxBytes?: number}} [opts] - Optional size cap override.
 * @returns {Promise<{questions: Array<object>}>} One entry per root array element.
 * @throws {Error} When the size check fails or the JSON root is not an array.
 */
async function loadLongMemEval2(jsonPath, opts) {
  const byteCap = opts?.maxBytes ?? DEFAULT_MAX_BYTES3;
  await assertFileSize3(jsonPath, byteCap);

  const fileText = await (0, import_promises4.readFile)(jsonPath, "utf-8");
  const raw = JSON.parse(fileText);
  if (!Array.isArray(raw)) {
    throw new Error(
      `bench loader: expected the LongMemEval JSON root to be an array, but got ${typeof raw}. Did you pass the correct file?`
    );
  }

  const listOrEmpty = (value) => (Array.isArray(value) ? value : []);

  const toQuestion = (record, position) => {
    const type = record.question_type ?? "single-session-user";
    const baseType = extractBaseType(type);
    const dates = listOrEmpty(record.haystack_dates);
    const sessionIds = listOrEmpty(record.haystack_session_ids);

    // The three haystack arrays are parallel: index i of each describes session i.
    const sessions = listOrEmpty(record.haystack_sessions).map((turns, idx) =>
      parseSession(sessionIds[idx] ?? `sess-${idx}`, dates[idx] ?? "", listOrEmpty(turns))
    );
    sessions.sort((earlier, later) => earlier.dateTimeMs - later.dateTimeMs);

    const questionDate = record.question_date ?? "";
    return {
      id: record.question_id ?? String(position),
      type,
      baseType,
      isAbstention: type.endsWith("_abs"),
      question: record.question ?? "",
      answer: record.answer ?? "",
      questionDate,
      questionDateMs: parseLmeDateTimeString(questionDate),
      sessions,
      answerSessionIds: listOrEmpty(record.answer_session_ids)
    };
  };

  return { questions: raw.map((record, position) => toQuestion(record, position)) };
}
229
// Module-level bindings of src/lme-loader.ts. They are declared with `var`
// and left unassigned here; the assignments happen inside the lazy initializer.
var import_promises4;
var LONGMEMEVAL_SOURCE;
var DEFAULT_MAX_BYTES3;

// Lazy initializer for src/lme-loader.ts, wrapped by `__esm`.
var init_lme_loader = __esm({
  "src/lme-loader.ts": function () {
    "use strict";
    import_promises4 = require("node:fs/promises");
    // Provenance record for the LongMemEval dataset snapshot.
    LONGMEMEVAL_SOURCE = {
      url: "https://huggingface.co/datasets/xiaowu0162/longmemeval",
      snapshotSha: "2ec2a557f339b6c0369619b1ed5793734cc87533",
      file: "longmemeval_s",
      license: "MIT"
    };
    // Default size cap for loadLongMemEval: 512 MiB.
    DEFAULT_MAX_BYTES3 = 512 * 1024 ** 2;
  }
});
243
+
30
244
  // src/index.ts
31
245
  var index_exports = {};
32
246
  __export(index_exports, {
33
247
  CONTRADICTION_FIXTURES: () => CONTRADICTION_FIXTURES,
34
248
  JUNK_STREAM_FIXTURES: () => JUNK_STREAM_FIXTURES,
249
+ LOCOMO_SOURCE_SHA: () => LOCOMO_SOURCE_SHA,
250
+ LONGMEMEVAL_SOURCE: () => LONGMEMEVAL_SOURCE,
35
251
  factRecall: () => factRecall,
36
- loadLoCoMo: () => loadLoCoMo,
37
- loadLongMemEval: () => loadLongMemEval,
252
+ loadLoCoMo: () => loadLoCoMo2,
253
+ loadLoCoMoLegacy: () => loadLoCoMo,
254
+ loadLongMemEval: () => loadLongMemEval2,
255
+ loadLongMemEvalLegacy: () => loadLongMemEval,
38
256
  normalizeText: () => normalizeText,
39
257
  normalizedIncludes: () => normalizedIncludes,
258
+ parseLmeDateTimeString: () => parseLmeDateTimeString,
40
259
  recallAtK: () => recallAtK,
260
+ renderLocomoReportMarkdown: () => renderLocomoReportMarkdown,
261
+ renderLongMemEvalReportMarkdown: () => renderLongMemEvalReportMarkdown,
262
+ resolveEvidence: () => resolveEvidence,
263
+ runLocomoBench: () => runLocomoBench,
264
+ runLongMemEvalBench: () => runLongMemEvalBench,
41
265
  runMemoryBench: () => runMemoryBench,
42
266
  runTemporalBench: () => runTemporalBench,
43
267
  runWriteQualityBench: () => runWriteQualityBench,
@@ -445,6 +669,1107 @@ async function loadLoCoMo(jsonPath, opts) {
445
669
  return { name: "LoCoMo", cases: benchCases };
446
670
  }
447
671
 
672
+ // src/index.ts
673
+ init_locomo_loader();
674
+
675
+ // src/locomo-run.ts
676
+ var import_promises3 = require("node:fs/promises");
677
+ var import_node_fs = require("node:fs");
678
+ init_locomo_loader();
679
/**
 * Creates a deterministic pseudo-random generator from a 32-bit seed.
 *
 * The state is advanced with three xor-shift steps (left 13, right 17,
 * left 5) and scaled by 2^32.
 *
 * @param {number} seed - Coerced to an unsigned 32-bit integer; 0 is replaced by 1.
 * @returns {() => number} Function returning the next value in [0, 1).
 */
function makeRng(seed) {
  // An all-zero state would stay zero under xor-shift, so 0 is replaced by 1.
  let state = (seed >>> 0) || 1;
  return function nextUnitFloat() {
    state ^= state << 13;
    state ^= state >>> 17;
    state ^= state << 5;
    // Bitwise ops yield signed 32-bit values; fold back to unsigned.
    state >>>= 0;
    return state / 0x100000000;
  };
}
690
/**
 * Returns a shuffled copy of `arr`, drawing swap positions from `rng`.
 * The input array is left untouched.
 *
 * @param {Array} arr - Items to shuffle.
 * @param {() => number} rng - Source of numbers in [0, 1).
 * @returns {Array} New array holding the same items in shuffled order.
 */
function seededShuffle(arr, rng) {
  const shuffled = arr.slice();
  let last = shuffled.length - 1;
  // Walk from the end, swapping each slot with a randomly chosen slot at or before it.
  while (last > 0) {
    const pick = Math.floor(rng() * (last + 1));
    const held = shuffled[last];
    shuffled[last] = shuffled[pick];
    shuffled[pick] = held;
    last -= 1;
  }
  return shuffled;
}
698
// Closing instruction shared by the LoCoMo answer prompts.
var ANSWER_INSTRUCTION = "Answer the question concisely and specifically based only on the information provided. If the provided context contains no answer to the question, reply exactly: No information available";

/**
 * Builds the answer prompt for memory mode from retrieved snippets.
 *
 * @param {string[]} snippets - Retrieved texts; numbered from 1 in the prompt.
 * @param {string} question - Question to answer.
 * @returns {string} Prompt text.
 */
function buildMemoryAnswerPrompt(snippets, question) {
  let context = "(no relevant context retrieved)";
  if (snippets.length > 0) {
    context = snippets
      .map((snippet, position) => `[${position + 1}] ${snippet}`)
      .join("\n\n");
  }
  return [
    "Context from conversation:",
    context,
    "",
    `Question: ${question}`,
    "",
    ANSWER_INSTRUCTION
  ].join("\n");
}
708
/**
 * Builds the answer prompt for full-context mode.
 *
 * @param {string} conversationText - Rendered conversation.
 * @param {string} question - Question to answer.
 * @returns {string} Prompt text.
 */
function buildFullContextAnswerPrompt(conversationText, question) {
  const sections = [
    `Conversation:\n${conversationText}`,
    `Question: ${question}`,
    `${ANSWER_INSTRUCTION}`
  ];
  // Sections are separated by a blank line.
  return sections.join("\n\n");
}
716
/**
 * Builds the judge prompt for one LoCoMo question.
 *
 * Two prompt shapes exist: the category-5 shape, where declining is the
 * correct behaviour and an optional trap answer is called out, and the
 * regular shape, which compares the model answer against the gold answer.
 *
 * @param {string} question - Question text.
 * @param {string|undefined} goldAnswer - Gold answer; rendered as "(none)" when null or undefined.
 * @param {string} modelAnswer - Answer produced by the model under test.
 * @param {string|undefined} adversarialAnswer - Trap answer; only used when `isCategory5` is true.
 * @param {boolean} isCategory5 - Selects the category-5 prompt shape.
 * @returns {string} Prompt text.
 */
function buildJudgePrompt(question, goldAnswer, modelAnswer, adversarialAnswer, isCategory5) {
  if (!isCategory5) {
    return [
      "You are a strict judge evaluating whether a model answer is correct.",
      "CORRECT: the model answer contains the specific information in the gold answer (exact match or clear paraphrase).",
      "WRONG: the model answer is vague, only topically related, or contradicts the gold answer.",
      "For date/time questions: equivalent date expressions referring to the same date are correct.",
      "",
      `Question: ${question}`,
      `Gold answer: ${goldAnswer ?? "(none)"}`,
      `Model answer: ${modelAnswer}`,
      "",
      'Respond with JSON only: {"correct": true} or {"correct": false}'
    ].join("\n");
  }

  // The trap warning, when present, goes on its own line directly after the opening sentence.
  let trapNote = "";
  if (adversarialAnswer) {
    trapNote = `\nWARNING \u2014 trap answer to watch for: "${adversarialAnswer}". If the model answer closely matches this trap, mark incorrect.`;
  }
  return [
    `You are a strict judge. For this unanswerable question, a CORRECT response means the model refused to answer or said something equivalent to "No information available" or "I don't know".${trapNote}`,
    "",
    `Question: ${question}`,
    `Model answer: ${modelAnswer}`,
    "",
    'Respond with JSON only: {"correct": true} if the model declined (correct), {"correct": false} if it made a claim.'
  ].join("\n");
}
738
/**
 * Renders every session of a sample as plain text.
 *
 * Each session contributes a heading line ("Session N", followed by an em
 * dash and the timestamp when one is present), one line per turn, and a
 * trailing blank line. The joined text is trimmed.
 *
 * @param {{sessions: Array<object>}} sample - Sample to render.
 * @returns {string} Conversation text.
 */
function renderConversation(sample) {
  const rendered = sample.sessions.flatMap((session) => {
    const heading = session.dateTime
      ? `Session ${session.index} \u2014 ${session.dateTime}`
      : `Session ${session.index}`;
    const turnLines = session.turns.map((turn) => `[${turn.speaker}]: ${turn.text}`);
    return [heading, ...turnLines, ""];
  });
  return rendered.join("\n").trim();
}
750
/**
 * Turns a sample into memory events and ingests them in a single call.
 *
 * For every session, one event is created per turn (prefixed with the
 * session heading) followed by one "chunk" event holding the whole session
 * text.
 *
 * @param {object} sample - Sample with `sampleId` and `sessions`.
 * @param {{ingest: Function}} memory - Target memory; `ingest` receives the full event list.
 * @param {object} scope - Scope object attached to every event.
 * @returns {Promise<void>}
 */
async function ingestSampleIntoMemory(sample, memory, scope) {
  const events = sample.sessions.flatMap((session) => {
    const heading = session.dateTime
      ? `Session ${session.index} \u2014 ${session.dateTime}`
      : `Session ${session.index}`;
    // A zero timestamp means "unknown" and is stored as undefined.
    const ingestedAt = session.dateTimeMs || void 0;

    const turnEvents = session.turns.map((turn) => ({
      id: `${sample.sampleId}:turn:${turn.diaId}`,
      scope,
      text: `[${heading}] [${turn.speaker}]: ${turn.text}`,
      metadata: {
        diaId: turn.diaId,
        sessionIndex: session.index,
        ingestedAt
      }
    }));

    const chunkLines = [heading];
    for (const turn of session.turns) {
      chunkLines.push(`[${turn.speaker}]: ${turn.text}`);
    }
    const chunkEvent = {
      id: `${sample.sampleId}:sess:${session.index}:chunk`,
      scope,
      text: chunkLines.join("\n"),
      metadata: { sessionIndex: session.index, ingestedAt }
    };

    return [...turnEvents, chunkEvent];
  });
  await memory.ingest(events);
}
780
/**
 * Asks the judge model for a verdict and normalises its reply.
 *
 * A structured reply (`response.object.correct` as a boolean) is used when
 * present. Otherwise the lower-cased text blocks are searched for a
 * `"correct": true` / `"correct": false` pair, and as a last resort the
 * verdict is whether the text contains "true".
 *
 * @param {{complete: Function}} judgeModel - Model used as judge.
 * @param {string} prompt - Judge prompt.
 * @returns {Promise<{correct: boolean, inputTokens: number, outputTokens: number}>}
 *   Token counts default to 0 when the response carries no usage data.
 */
async function callJudge(judgeModel, prompt) {
  const verdictSchema = {
    type: "object",
    properties: { correct: { type: "boolean" } },
    required: ["correct"],
    // OpenAI strict structured-output mode requires this to be explicit.
    additionalProperties: false
  };
  const response = await judgeModel.complete({
    messages: [{ role: "user", content: prompt }],
    tools: [],
    outputSchema: verdictSchema
  });

  const structured = response.object;
  let correct;
  if (structured && typeof structured.correct === "boolean") {
    correct = structured.correct;
  } else {
    const text = response.content
      .filter((block) => block.type === "text")
      .map((block) => block.text ?? "")
      .join("")
      .toLowerCase()
      .trim();
    const saysTrue = /"correct"\s*:\s*true/i.test(text);
    const saysFalse = /"correct"\s*:\s*false/i.test(text);
    // An explicit true wins over an explicit false; with neither, fall back to a substring check.
    correct = saysTrue || (!saysFalse && text.includes("true"));
  }

  return {
    correct,
    inputTokens: response.usage?.inputTokens ?? 0,
    outputTokens: response.usage?.outputTokens ?? 0
  };
}
807
/**
 * Reads a JSONL checkpoint file and returns the keys of the rows it holds.
 *
 * @param {string} path - Checkpoint file path.
 * @returns {Promise<Set<string>>} Keys of the form "<sampleId>:<questionIndex>";
 *   empty when the file does not exist.
 */
async function loadCheckpoint(path) {
  const done = new Set();
  if (!(0, import_node_fs.existsSync)(path)) {
    return done;
  }

  const contents = await (0, import_promises3.readFile)(path, "utf-8");
  const rows = contents
    .split("\n")
    .map((line) => line.trim())
    .filter((line) => line !== "");

  for (const serialized of rows) {
    try {
      const row = JSON.parse(serialized);
      const hasKey = row.sampleId && row.questionIndex !== void 0;
      if (hasKey) {
        done.add(`${row.sampleId}:${row.questionIndex}`);
      }
    } catch {
      // Lines that are not valid JSON objects are skipped; the remaining rows still count.
    }
  }
  return done;
}
824
/**
 * Appends one result row to the checkpoint file as a single JSON line.
 *
 * @param {string} path - Checkpoint file path.
 * @param {object} row - Row to serialise.
 * @returns {Promise<void>}
 */
async function appendCheckpointRow(path, row) {
  const jsonLine = `${JSON.stringify(row)}\n`;
  await (0, import_promises3.appendFile)(path, jsonLine, "utf-8");
}
827
// Phrases that count as the model declining to answer (case-insensitive).
var DECLINE_PATTERNS = /\bno information available\b|\bi (don'?t|do not) know\b|\bcannot (find|answer|determine)\b|\bunavailable\b/i;

/**
 * Reports whether an answer contains one of the decline phrases.
 *
 * @param {string} answer - Model answer.
 * @returns {boolean} True when a decline phrase is found.
 */
function appearsToDecline(answer) {
  const hit = DECLINE_PATTERNS.exec(answer);
  return hit !== null;
}
831
/**
 * Runs the LoCoMo benchmark: answers each selected question with
 * `answerModel`, grades the answer with `judgeModel`, and aggregates the
 * verdicts per category.
 *
 * Options read from `opts`:
 * - `answerModel`, `judgeModel`: objects with `complete()` and an optional `modelId`.
 * - `mode`: "memory" answers from retrieved snippets; any other value answers
 *   from the full rendered conversation.
 * - `dataset` or `dataPath`: a pre-loaded dataset, or a path handed to the loader.
 * - `categories` (default [1, 2, 3, 4, 5]): question categories to run.
 * - `sampleLimit`, `questionLimit`: optional caps; the kept subset is chosen
 *   by a seeded shuffle (`questionLimit` applies per sample).
 * - `seed` (default 42): seed for those shuffles.
 * - `topK` (default 10): snippets retrieved in memory mode, capped at 10.
 * - `concurrency` (default 1): batch size for processing questions.
 * - `onProgress(done, total)`: called after each question processed in this run.
 * - `checkpointPath`: JSONL file; rows found there are not re-run and are
 *   merged back into the report.
 * - `memoryFactory(sampleId)`: required in memory mode.
 *
 * @param {object} opts - Run options (see above).
 * @returns {Promise<object>} Report with `config`, `overallJ14`, `byCategory`,
 *   `cat5RefusalRate`, `tokens`, `wallClockMs`, `questions` and `errorCount`.
 * @throws {Error} When `mode` is "memory" and no `memoryFactory` is given.
 */
async function runLocomoBench(opts) {
  const {
    answerModel,
    judgeModel,
    mode,
    categories = [1, 2, 3, 4, 5],
    sampleLimit,
    questionLimit,
    seed = 42,
    topK: rawTopK = 10,
    concurrency = 1,
    onProgress,
    checkpointPath
  } = opts;
  // Retrieval depth is capped at 10 regardless of what the caller asks for.
  const topK = Math.min(rawTopK, 10);

  let dataset;
  if (opts.dataset) {
    dataset = opts.dataset;
  } else {
    const { loadLoCoMo: loadLoCoMo3 } = await Promise.resolve().then(() => (init_locomo_loader(), locomo_loader_exports));
    dataset = await loadLoCoMo3(opts.dataPath);
  }

  let samples = [...dataset.samples];
  if (sampleLimit !== void 0 && sampleLimit < samples.length) {
    samples = seededShuffle(samples, makeRng(seed)).slice(0, sampleLimit);
  }

  const keyOf = (sampleId, questionIndex) => `${sampleId}:${questionIndex}`;
  const scopeOf = (sampleId) => ({ kind: "agent", agentId: `locomo:${sampleId}` });

  // Build the work queue. The original position of each question is captured
  // up front so it does not have to be searched for again after filtering.
  const queue = [];
  for (const sample of samples) {
    let selected = sample.qa
      .map((qa, qaIndex) => ({ sample, qaIndex, qa }))
      .filter((entry) => categories.includes(entry.qa.category));
    if (questionLimit !== void 0 && questionLimit < selected.length) {
      selected = seededShuffle(selected, makeRng(seed + 1)).slice(0, questionLimit);
    }
    queue.push(...selected);
  }

  const checkpoint = checkpointPath ? await loadCheckpoint(checkpointPath) : new Set();

  // Only checkpoint rows that belong to this run's queue are relevant, both
  // for the progress total and for the rows merged back in at the end.
  const queuedKeys = new Set();
  let total = 0;
  for (const item of queue) {
    const key = keyOf(item.sample.sampleId, item.qaIndex);
    queuedKeys.add(key);
    if (!checkpoint.has(key)) total++;
  }

  const conversationText = new Map();
  if (mode === "full-context") {
    for (const sample of samples) {
      conversationText.set(sample.sampleId, renderConversation(sample));
    }
  }

  const memories = new Map();
  // Nothing in this function measures ingestion token usage, so both stay 0.
  const ingestInputTokens = 0;
  const ingestOutputTokens = 0;
  if (mode === "memory") {
    if (!opts.memoryFactory) {
      throw new Error("runLocomoBench: memoryFactory is required when mode='memory'");
    }
    for (const sample of samples) {
      const memory = await opts.memoryFactory(sample.sampleId);
      await ingestSampleIntoMemory(sample, memory, scopeOf(sample.sampleId));
      memories.set(sample.sampleId, memory);
    }
  }

  const results = [];
  let totalAnswerInputTokens = 0;
  let totalAnswerOutputTokens = 0;
  let totalJudgeInputTokens = 0;
  let totalJudgeOutputTokens = 0;
  let errorCount = 0;
  let done = 0;
  const startTime = Date.now();

  // Builds the answer prompt for one queue item according to the run mode.
  const buildAnswerPrompt = async (item) => {
    if (mode === "memory") {
      const memory = memories.get(item.sample.sampleId);
      const retrieved = await memory.retrieve({
        text: item.qa.question,
        scope: scopeOf(item.sample.sampleId),
        topK
      });
      const snippets = retrieved.snippets.map((s) => s.text);
      return buildMemoryAnswerPrompt(snippets, item.qa.question);
    }
    const convText = conversationText.get(item.sample.sampleId) ?? "";
    return buildFullContextAnswerPrompt(convText, item.qa.question);
  };

  const processItem = async (item) => {
    const key = keyOf(item.sample.sampleId, item.qaIndex);
    if (checkpoint.has(key)) return;
    let modelAnswer = "";
    let answerIn = 0;
    let answerOut = 0;
    let judgeIn = 0;
    let judgeOut = 0;
    let correct = false;
    let trapTriggered;
    let errorMsg;
    try {
      const prompt = await buildAnswerPrompt(item);
      const resp = await answerModel.complete({
        messages: [{ role: "user", content: prompt }],
        tools: []
      });
      modelAnswer = resp.content
        .filter((b) => b.type === "text")
        .map((b) => b.text)
        .join("")
        .trim();
      answerIn = resp.usage?.inputTokens ?? 0;
      answerOut = resp.usage?.outputTokens ?? 0;

      const isCategory5 = item.qa.category === 5;
      const judgePrompt = buildJudgePrompt(
        item.qa.question,
        item.qa.answer,
        modelAnswer,
        item.qa.adversarialAnswer,
        isCategory5
      );
      const judgeResult = await callJudge(judgeModel, judgePrompt);
      correct = judgeResult.correct;
      judgeIn = judgeResult.inputTokens;
      judgeOut = judgeResult.outputTokens;

      if (isCategory5 && item.qa.adversarialAnswer) {
        const adversarialLower = item.qa.adversarialAnswer.toLowerCase();
        const answerLower = modelAnswer.toLowerCase();
        const answerPrefix = answerLower.slice(0, 30);
        // Every string contains the empty string, so an empty answer must not
        // be treated as a match for the trap.
        trapTriggered = answerLower.includes(adversarialLower) || (answerPrefix.length > 0 && adversarialLower.includes(answerPrefix));
        if (trapTriggered && correct) correct = false;
        // An explicit decline overrides both the judge and the trap heuristic.
        if (!correct && appearsToDecline(modelAnswer)) {
          correct = true;
          trapTriggered = false;
        }
      }
    } catch (err) {
      // Anything can be thrown; keep a message even for non-Error values.
      errorMsg = err instanceof Error ? err.message : String(err);
      errorCount++;
      correct = false;
    }
    const row = {
      sampleId: item.sample.sampleId,
      questionIndex: item.qaIndex,
      question: item.qa.question,
      goldAnswer: item.qa.answer,
      category: item.qa.category,
      modelAnswer,
      correct,
      ...trapTriggered !== void 0 ? { trapTriggered } : {},
      ...errorMsg !== void 0 ? { error: errorMsg } : {},
      answerInputTokens: answerIn,
      answerOutputTokens: answerOut,
      judgeInputTokens: judgeIn,
      judgeOutputTokens: judgeOut
    };
    results.push(row);
    totalAnswerInputTokens += answerIn;
    totalAnswerOutputTokens += answerOut;
    totalJudgeInputTokens += judgeIn;
    totalJudgeOutputTokens += judgeOut;
    if (checkpointPath) {
      await appendCheckpointRow(checkpointPath, row);
    }
    done++;
    if (onProgress) onProgress(done, total);
  };

  // Questions are processed in batches of `concurrency`; each batch finishes
  // before the next one starts.
  const concurrencyLimit = Math.max(1, concurrency);
  const pending = [];
  for (const item of queue) {
    pending.push(processItem(item));
    if (pending.length >= concurrencyLimit) {
      await Promise.all(pending.splice(0, concurrencyLimit));
    }
  }
  if (pending.length > 0) await Promise.all(pending);

  // Merge rows completed by an earlier run back into the report.
  if (checkpointPath && checkpoint.size > 0) {
    const raw = await (0, import_promises3.readFile)(checkpointPath, "utf-8").catch(() => "");
    for (const line of raw.split("\n")) {
      const trimmed = line.trim();
      if (!trimmed) continue;
      try {
        const row = JSON.parse(trimmed);
        const key = keyOf(row.sampleId, row.questionIndex);
        // `checkpoint` was loaded before this run, so rows appended above are
        // not counted twice; rows outside this run's queue are ignored.
        if (checkpoint.has(key) && queuedKeys.has(key)) {
          results.push(row);
          totalAnswerInputTokens += row.answerInputTokens ?? 0;
          totalAnswerOutputTokens += row.answerOutputTokens ?? 0;
          totalJudgeInputTokens += row.judgeInputTokens ?? 0;
          totalJudgeOutputTokens += row.judgeOutputTokens ?? 0;
          if (row.error !== void 0) errorCount++;
        }
      } catch {
        // Malformed checkpoint lines are skipped, matching loadCheckpoint.
      }
    }
  }

  // Tally verdicts per category. A Map keeps category labels from clashing
  // with Object.prototype property names.
  const tallies = new Map();
  for (const row of results) {
    const cat = String(row.category);
    let tally = tallies.get(cat);
    if (!tally) {
      tally = { correct: 0, total: 0 };
      tallies.set(cat, tally);
    }
    tally.total++;
    if (row.correct) tally.correct++;
  }

  const byCategory = {};
  let j14Correct = 0;
  let j14Total = 0;
  for (const [cat, tally] of tallies) {
    byCategory[cat] = {
      ...tally,
      accuracy: tally.total > 0 ? tally.correct / tally.total : 0
    };
    // Categories 1 to 4 make up the headline score; category 5 is reported separately.
    const n = Number.parseInt(cat, 10);
    if (n >= 1 && n <= 4) {
      j14Correct += tally.correct;
      j14Total += tally.total;
    }
  }

  const cat5Stats = byCategory["5"];
  const cat5RefusalRate = cat5Stats ? { correct: cat5Stats.correct, total: cat5Stats.total, rate: cat5Stats.accuracy } : void 0;
  const wallClockMs = Date.now() - startTime;

  return {
    config: {
      mode,
      topK,
      answerModelId: answerModel.modelId ?? "(unknown)",
      judgeModelId: judgeModel.modelId ?? "(unknown)",
      datasetSha: LOCOMO_SOURCE_SHA,
      seed,
      categories: [...categories].sort((a, b) => a - b),
      samplesRun: samples.length,
      questionsRun: results.length
    },
    overallJ14: {
      correct: j14Correct,
      total: j14Total,
      accuracy: j14Total > 0 ? j14Correct / j14Total : 0
    },
    byCategory,
    cat5RefusalRate,
    tokens: {
      ingestInputTokens,
      ingestOutputTokens,
      answerInputTokens: totalAnswerInputTokens,
      answerOutputTokens: totalAnswerOutputTokens,
      judgeInputTokens: totalJudgeInputTokens,
      judgeOutputTokens: totalJudgeOutputTokens,
      totalInputTokens: ingestInputTokens + totalAnswerInputTokens + totalJudgeInputTokens,
      totalOutputTokens: ingestOutputTokens + totalAnswerOutputTokens + totalJudgeOutputTokens
    },
    wallClockMs,
    questions: results,
    errorCount
  };
}
1078
+
1079
+ // src/locomo-render.ts
1080
/**
 * Formats a fraction as a percentage with one decimal place.
 *
 * @param {number} n - Fraction, e.g. 0.5.
 * @returns {string} Percentage text, e.g. "50.0%".
 */
function pct(n) {
  const scaled = n * 100;
  return `${scaled.toFixed(1)}%`;
}
1083
/**
 * Formats a number using en-US locale conventions.
 *
 * @param {number} n - Value to format.
 * @returns {string} Formatted text, e.g. "1,234,567".
 */
function fmtNum(n) {
  const locale = "en-US";
  return n.toLocaleString(locale);
}
1086
/**
 * Estimates the dollar cost of a run from its token totals.
 *
 * @param {{totalInputTokens: number, totalOutputTokens: number}} tokens - Token totals.
 * @param {{inputPer1M: number, outputPer1M: number}} [prices] - Price per million tokens.
 * @returns {string} Cost with four decimals (e.g. "$7.0000"), or an em dash when no prices are given.
 */
function estimateCost(tokens, prices) {
  if (!prices) {
    return "\u2014";
  }
  const inputCost = tokens.totalInputTokens / 1e6 * prices.inputPer1M;
  const outputCost = tokens.totalOutputTokens / 1e6 * prices.outputPer1M;
  return `$${(inputCost + outputCost).toFixed(4)}`;
}
1091
/**
 * Renders LoCoMo reports as a Markdown document.
 *
 * The document has a results table (one row per report), a per-report run
 * configuration section, and fixed methodology notes. With no reports, only
 * the header and a "no results" placeholder are produced.
 *
 * @param {Array<object>} reports - Reports with `config`, `byCategory`,
 *   `overallJ14`, `cat5RefusalRate`, `tokens`, `questions`, `wallClockMs`
 *   and `errorCount`.
 * @param {{inputPer1M: number, outputPer1M: number}} [prices] - Optional
 *   prices used for the cost column.
 * @returns {string} Markdown text.
 */
function renderLocomoReportMarkdown(reports, prices) {
  const out = [
    "# LoCoMo Benchmark Results",
    "",
    "Dataset: [LoCoMo](https://github.com/snap-research/locomo) (Snap Research) \xB7 CC BY-NC 4.0",
    "Raw data is not redistributed. Only aggregate results are published here.",
    ""
  ];
  if (reports.length === 0) {
    out.push("_No results yet._");
    return out.join("\n");
  }

  const tableRow = (cells) => `| ${cells.join(" | ")} |`;
  // "accuracy (correct/total)" cell used for per-category and overall columns.
  const scoreCell = (accuracy, correct, total) => `${pct(accuracy)} (${correct}/${total})`;

  const headers = [
    "System / Mode",
    "Cat1 (multi-hop)",
    "Cat2 (temporal)",
    "Cat3 (open-domain)",
    "Cat4 (single-hop)",
    "J(1\u20134) overall",
    "Cat5 refusal rate",
    "Tokens/query",
    "Est. cost/run",
    "Answer model",
    "Judge model",
    "topK",
    "n-Q",
    "Seed",
    "Dataset SHA"
  ];
  out.push(
    "## Results",
    "",
    tableRow(headers),
    tableRow(headers.map(() => "---"))
  );

  for (const report of reports) {
    const config = report.config;
    // Categories that were not run are shown as an em dash.
    const categoryCell = (number) => {
      const stats = report.byCategory[String(number)];
      return stats ? scoreCell(stats.accuracy, stats.correct, stats.total) : "\u2014";
    };
    const refusal = report.cat5RefusalRate;
    const questionCount = report.questions.length;
    const tokenSum = report.tokens.totalInputTokens + report.tokens.totalOutputTokens;
    const tokensPerQuery = questionCount > 0 ? Math.round(tokenSum / questionCount) : 0;

    out.push(tableRow([
      `${config.answerModelId} / ${config.mode}`,
      categoryCell(1),
      categoryCell(2),
      categoryCell(3),
      categoryCell(4),
      scoreCell(report.overallJ14.accuracy, report.overallJ14.correct, report.overallJ14.total),
      refusal ? scoreCell(refusal.rate, refusal.correct, refusal.total) : "\u2014",
      fmtNum(tokensPerQuery),
      estimateCost(report.tokens, prices),
      config.answerModelId,
      config.judgeModelId,
      String(config.topK),
      fmtNum(config.questionsRun),
      String(config.seed),
      config.datasetSha.slice(0, 8)
    ]));
  }

  out.push("", "## Run Configuration", "");
  for (const report of reports) {
    const config = report.config;
    out.push(
      `### ${config.answerModelId} / ${config.mode}`,
      "",
      `- **Mode**: ${config.mode}`,
      `- **Answer model**: ${config.answerModelId}`,
      `- **Judge model**: ${config.judgeModelId}`,
      `- **topK**: ${config.topK}`,
      `- **Dataset SHA**: \`${config.datasetSha}\``,
      `- **Seed**: ${config.seed}`,
      `- **Categories**: ${config.categories.join(", ")}`,
      `- **Samples run**: ${config.samplesRun}`,
      `- **Questions run**: ${config.questionsRun}`,
      `- **Wall-clock**: ${(report.wallClockMs / 1e3).toFixed(1)}s`,
      `- **Errors**: ${report.errorCount}`,
      `- **Tokens** (in/out): ${fmtNum(report.tokens.totalInputTokens)} / ${fmtNum(report.tokens.totalOutputTokens)}`,
      ""
    );
  }

  out.push(
    "## Methodology Notes",
    "",
    "These results were produced using the Eidentic LoCoMo fair-run harness. The following rules apply:",
    "",
    "1. **Both speakers are treated as humans.** Turns are ingested as `[SpeakerName]: text` \u2014 never mapped to user/assistant roles.",
    "2. **Timestamps are structural.** Each session is prefixed with a header line `Session N \u2014 <date>` and `ingestedAt` metadata carries the epoch-ms.",
    "3. **topK \u2264 10 in memory mode.** Larger topK values trivialise retrieval quality and are not permitted.",
    "4. **Full-context baseline is required** alongside any memory-mode result.",
    "5. **Judge is strict**: a model answer is correct only when it contains the gold answer's specific information. Vague/topical-only answers are wrong.",
    "6. **Category 5 (adversarial)**: correct = model declined; adversarial-trap match = wrong.",
    "7. **Primary metric J(1\u20134)**: denominator is the number of cat 1\u20134 questions actually run (max 1540 on full dataset).",
    "8. **Dataset license**: CC BY-NC 4.0 \u2014 raw data is not redistributed; only aggregate results are published.",
    "",
    "> Category mapping in locomo10.json: 1=multi-hop (282), 2=temporal (321), 3=open-domain (96), 4=single-hop (841), 5=adversarial (446).",
    ""
  );
  return out.join("\n");
}
1190
+
1191
+ // src/locomo-types.ts
1192
/**
 * Looks up the text of the turns referenced by a list of dialogue ids.
 *
 * @param {{sessions: Array<{turns: Array<{diaId: string, text: string}>}>}} sample - Sample to search.
 * @param {string[]} diaIds - Dialogue ids, in the order the texts should be returned.
 * @returns {string[]} Texts of the turns that were found; unknown ids are skipped.
 */
function resolveEvidence(sample, diaIds) {
  // When two turns share an id, the later one wins.
  const turnsById = new Map(
    sample.sessions.flatMap((session) => session.turns.map((turn) => [turn.diaId, turn]))
  );
  return diaIds
    .map((id) => turnsById.get(id))
    .filter(Boolean)
    .map((turn) => turn.text);
}
1206
+
1207
+ // src/index.ts
1208
+ init_lme_loader();
1209
+
1210
+ // src/lme-run.ts
1211
+ var import_promises5 = require("node:fs/promises");
1212
+ var import_node_fs2 = require("node:fs");
1213
+ init_lme_loader();
1214
/**
 * Creates a deterministic pseudo-random generator from a 32-bit seed.
 *
 * The state is advanced with three xor-shift steps (left 13, right 17,
 * left 5) and scaled by 2^32.
 *
 * @param {number} seed - Coerced to an unsigned 32-bit integer; 0 is replaced by 1.
 * @returns {() => number} Function returning the next value in [0, 1).
 */
function makeRng2(seed) {
  const unsignedSeed = seed >>> 0;
  // An all-zero state would stay zero under xor-shift, so 0 is replaced by 1.
  let state = unsignedSeed === 0 ? 1 : unsignedSeed;
  const advance = () => {
    state = state ^ (state << 13);
    state = state ^ (state >>> 17);
    state = state ^ (state << 5);
    // Bitwise ops yield signed 32-bit values; fold back to unsigned.
    state = state >>> 0;
    return state / 4294967296;
  };
  return advance;
}
1225
/**
 * Returns a shuffled copy of `arr`, drawing swap positions from `rng`.
 * The input array is left untouched.
 *
 * @param {Array} arr - Items to shuffle.
 * @param {() => number} rng - Source of numbers in [0, 1).
 * @returns {Array} New array holding the same items in shuffled order.
 */
function seededShuffle2(arr, rng) {
  const result = Array.from(arr);
  // Walk from the end, swapping each slot with a randomly chosen slot at or before it.
  for (let upper = result.length - 1; upper >= 1; upper -= 1) {
    const chosen = Math.floor(rng() * (upper + 1));
    const displaced = result[upper];
    result[upper] = result[chosen];
    result[chosen] = displaced;
  }
  return result;
}
1233
// Default character budget for full-context prompts in the LongMemEval runner.
var DEFAULT_FULL_CONTEXT_MAX_CHARS = 480000;

// Phrases that count as the model declining to answer (case-insensitive).
var DECLINE_PATTERNS2 = /\bno information available\b|\bi (don'?t|do not) (know|have)\b|\bcannot (find|answer|determine|provide)\b|\bnot (mentioned|stated|provided|found|available)\b|\bunable to (find|answer|determine)\b|\bno relevant (information|data)\b/i;

/**
 * Reports whether an answer contains one of the decline phrases.
 *
 * @param {string} answer - Model answer.
 * @returns {boolean} True when a decline phrase is found.
 */
function appearsToDecline2(answer) {
  const hit = DECLINE_PATTERNS2.exec(answer);
  return hit !== null;
}
1238
// Closing instruction appended to every answer prompt.
var ANSWER_INSTRUCTION2 = "Answer the question concisely and specifically based only on the information provided. If the provided context does not contain the answer, reply exactly: No information available";
/**
 * Build the answer prompt for memory mode from retrieved snippets.
 *
 * @param {string[]} snippets - Retrieved texts; rendered as a numbered list
 *   separated by blank lines, or a placeholder when empty.
 * @param {string} question - The question to answer.
 * @param {string} [currentDate] - When truthy, added as a "Current date" line
 *   directly after the context.
 * @returns {string} The complete prompt.
 */
function buildMemoryAnswerPrompt2(snippets, question, currentDate) {
  let context = "(no relevant context retrieved)";
  if (snippets.length > 0) {
    context = snippets.map((snippet, index) => `[${index + 1}] ${snippet}`).join("\n\n");
  }
  let dateNote = "";
  if (currentDate) {
    dateNote = `\nCurrent date: ${currentDate}`;
  }
  return [
    "Context from conversation history:",
    `${context}${dateNote}`,
    "",
    `Question: ${question}`,
    "",
    `${ANSWER_INSTRUCTION2}`
  ].join("\n");
}
1250
/**
 * Build the answer prompt for full-context mode, embedding the rendered
 * conversation history verbatim.
 *
 * @param {string} haystackText - Rendered conversation history.
 * @param {string} question - The question to answer.
 * @param {string} [currentDate] - When truthy, added as a "Current date" line
 *   directly after the history.
 * @returns {string} The complete prompt.
 */
function buildFullContextAnswerPrompt2(haystackText, question, currentDate) {
  let dateNote = "";
  if (currentDate) {
    dateNote = `\nCurrent date: ${currentDate}`;
  }
  return [
    "Conversation history:",
    `${haystackText}${dateNote}`,
    "",
    `Question: ${question}`,
    "",
    `${ANSWER_INSTRUCTION2}`
  ].join("\n");
}
1260
/**
 * Build the prompt sent to the judge model.
 *
 * Abstention questions get a prompt that omits the gold answer and asks only
 * whether the model abstained. All other questions get a gold-vs-model
 * comparison prompt, with an extra leniency note on equivalent date
 * expressions when `questionType` is "temporal-reasoning".
 *
 * @param {string} question
 * @param {string} goldAnswer - Unused for abstention questions.
 * @param {string} modelAnswer
 * @param {string} questionType
 * @param {boolean} isAbstention
 * @returns {string} The judge prompt, requesting a JSON `{"correct": ...}` reply.
 */
function buildJudgePrompt2(question, goldAnswer, modelAnswer, questionType, isAbstention) {
  if (!isAbstention) {
    let temporalNote = "";
    if (questionType === "temporal-reasoning") {
      temporalNote = "\nFor time/date questions: equivalent date expressions referring to the same date or duration are correct.";
    }
    return [
      "You are a strict judge evaluating whether a model answer is correct.",
      `CORRECT: the model answer contains the specific information in the gold answer (exact match or clear paraphrase is fine).${temporalNote}`,
      `WRONG: the model answer is vague, only topically related, contradicts the gold answer, or says "no information" when a specific answer exists.`,
      "",
      `Question: ${question}`,
      `Gold answer: ${goldAnswer}`,
      `Model answer: ${modelAnswer}`,
      "",
      `Respond with JSON only: {"correct": true} or {"correct": false}`
    ].join("\n");
  }
  return [
    `You are a strict judge. This question requires the model to recognize that the information is not present in the conversation history, or that the premise is flawed. A CORRECT response means the model abstained, said something equivalent to "No information available", "I don't know", or identified that the premise is wrong. An INCORRECT response means the model fabricated a specific, concrete answer.`,
    "",
    `Question: ${question}`,
    `Model answer: ${modelAnswer}`,
    "",
    `Respond with JSON only: {"correct": true} if the model appropriately abstained, {"correct": false} if it fabricated a specific answer.`
  ].join("\n");
}
1280
/**
 * Render sessions as plain text: one "Session N — <date or id>" header per
 * session, one "[User]: ..." / "[Assistant]: ..." line per turn, and a blank
 * line after each session. The final string is trimmed.
 *
 * @param {Array<{id: string, dateTime?: string, turns: Array<{role: string, content: string}>}>} sessions
 * @returns {string} The rendered transcript.
 */
function renderHaystack(sessions) {
  const out = [];
  for (const [index, session] of sessions.entries()) {
    // Fall back to the session id when no date string is available.
    out.push(`Session ${index + 1} \u2014 ${session.dateTime || session.id}`);
    for (const { role, content } of session.turns) {
      const speaker = role === "user" ? "User" : "Assistant";
      out.push(`[${speaker}]: ${content}`);
    }
    out.push("");
  }
  return out.join("\n").trim();
}
1294
/**
 * Render sessions within a character budget.
 *
 * If the full rendering fits it is returned as-is. Otherwise sessions are
 * dropped from the front one at a time until the rendering fits; if even the
 * last session alone is too long, its rendering is cut to `maxChars`.
 *
 * @param {Array<object>} sessions - Sessions accepted by renderHaystack.
 * @param {number} maxChars - Maximum length of the returned text.
 * @returns {{text: string, truncated: boolean}} Rendered text and whether
 *   anything was dropped or cut.
 */
function renderHaystackCapped(sessions, maxChars) {
  const everything = renderHaystack(sessions);
  if (everything.length <= maxChars) {
    return { text: everything, truncated: false };
  }
  // Try progressively shorter suffixes (earliest sessions are dropped first).
  for (let start = 1; start < sessions.length; start++) {
    const candidate = renderHaystack(sessions.slice(start));
    if (candidate.length <= maxChars) {
      return { text: candidate, truncated: true };
    }
  }
  // Nothing fits whole: hard-cut whatever single session remains.
  const lastOnly = renderHaystack(sessions.slice(-1));
  return { text: lastOnly.slice(0, maxChars), truncated: true };
}
1305
// Maximum number of characters kept in any single ingested event text.
var EMBED_CHAR_CAP = 20000;
/**
 * Cut `text` to at most EMBED_CHAR_CAP characters (UTF-16 code units).
 *
 * @param {string} text
 * @returns {string} `text` unchanged when short enough, otherwise its prefix.
 */
function capForEmbedding(text) {
  if (text.length > EMBED_CHAR_CAP) {
    return text.slice(0, EMBED_CHAR_CAP);
  }
  return text;
}
1309
/**
 * Ingest a question's haystack into `memory` at two granularities:
 * one event per turn (prefixed with the session label) plus one event per
 * session containing the whole session transcript. Every text is passed
 * through capForEmbedding, and all events are sent in a single
 * `memory.ingest` call.
 *
 * @param {{id: string, sessions: Array<{id: string, dateTime?: string, dateTimeMs?: number, turns: Array<{role: string, content: string}>}>}} question
 * @param {{ingest: (events: object[]) => Promise<unknown>}} memory
 * @param {object} scope - Attached unchanged to every event.
 * @returns {Promise<void>}
 */
async function ingestQuestionIntoMemory(question, memory, scope) {
  const speakerOf = (role) => role === "user" ? "User" : "Assistant";
  const events = [];
  for (const [sessionIndex, sess] of question.sessions.entries()) {
    const sessLabel = `Session ${sessionIndex + 1} \u2014 ${sess.dateTime || sess.id}`;
    // A falsy timestamp (missing or 0) is recorded as undefined.
    const ingestedAt = sess.dateTimeMs || void 0;
    for (const [turnIndex, turn] of sess.turns.entries()) {
      events.push({
        id: `${question.id}:sess${sessionIndex}:turn${turnIndex}`,
        scope,
        text: capForEmbedding(`[${sessLabel}] [${speakerOf(turn.role)}]: ${turn.content}`),
        metadata: {
          sessionId: sess.id,
          sessionIndex,
          turnRole: turn.role,
          ingestedAt
        }
      });
    }
    const transcript = sess.turns.map((turn) => `[${speakerOf(turn.role)}]: ${turn.content}`);
    events.push({
      id: `${question.id}:sess${sessionIndex}:chunk`,
      scope,
      text: capForEmbedding([sessLabel, ...transcript].join("\n")),
      metadata: {
        sessionId: sess.id,
        sessionIndex,
        ingestedAt
      }
    });
  }
  await memory.ingest(events);
}
1346
/**
 * Ask the judge model for a verdict and normalise it to a boolean.
 *
 * Verdict resolution order:
 *   1. `response.object.correct` when the structured output is a boolean;
 *   2. a `"correct": true|false` pair found in the text blocks;
 *   3. last resort: the bare word "true" with no bare word "false" present.
 * Anything else (empty, ambiguous, or missing content) counts as incorrect.
 *
 * @param {{complete: (req: object) => Promise<object>}} judgeModel
 * @param {string} prompt - Judge prompt, sent as a single user message.
 * @returns {Promise<{correct: boolean, inputTokens: number, outputTokens: number}>}
 *   Verdict plus token usage (0 when the response reports none).
 */
async function callJudge2(judgeModel, prompt) {
  const response = await judgeModel.complete({
    messages: [{ role: "user", content: prompt }],
    tools: [],
    outputSchema: {
      type: "object",
      properties: { correct: { type: "boolean" } },
      required: ["correct"],
      // OpenAI strict structured-output mode requires this to be explicit.
      additionalProperties: false
    }
  });
  let correct = false;
  if (response.object && typeof response.object.correct === "boolean") {
    correct = response.object.correct;
  } else {
    // A response without a content array is treated as an empty reply
    // instead of crashing the whole question.
    const blocks = Array.isArray(response.content) ? response.content : [];
    const text = blocks.filter((b) => b.type === "text").map((b) => b.text ?? "").join("").toLowerCase().trim();
    if (/"correct"\s*:\s*true/.test(text)) correct = true;
    else if (/"correct"\s*:\s*false/.test(text)) correct = false;
    // Whole-word match only: a substring test would accept "untrue", and a
    // reply mentioning both words is too ambiguous to count as correct.
    else correct = /\btrue\b/.test(text) && !/\bfalse\b/.test(text);
  }
  return {
    correct,
    inputTokens: response.usage?.inputTokens ?? 0,
    outputTokens: response.usage?.outputTokens ?? 0
  };
}
1373
/**
 * Read a JSONL checkpoint file and return the ids of questions already done.
 *
 * The file is read directly, with no separate existence check: that avoids a
 * blocking filesystem call inside an async function and the window in which
 * the file could vanish between the check and the read. A missing file yields
 * an empty set; any other read error is rethrown.
 *
 * @param {string} path - Checkpoint file, one JSON row per line.
 * @returns {Promise<Set<string>>} The truthy `questionId` values found.
 */
async function loadCheckpoint2(path) {
  const done = /* @__PURE__ */ new Set();
  let raw;
  try {
    raw = await (0, import_promises5.readFile)(path, "utf-8");
  } catch (err) {
    // No checkpoint yet (or the parent path is not a directory): start fresh.
    if (err && (err.code === "ENOENT" || err.code === "ENOTDIR")) return done;
    throw err;
  }
  for (const line of raw.split("\n")) {
    const trimmed = line.trim();
    if (!trimmed) continue;
    try {
      const row = JSON.parse(trimmed);
      if (row.questionId) done.add(row.questionId);
    } catch {
      // Deliberately best-effort: a malformed or non-object line (e.g. a
      // partially written row) is skipped so the rest of the file still counts.
    }
  }
  return done;
}
1388
/**
 * Append one result row to the checkpoint file as a single JSON line.
 *
 * @param {string} path - Checkpoint file path.
 * @param {object} row - Row to serialise with JSON.stringify.
 * @returns {Promise<void>}
 */
async function appendCheckpointRow2(path, row) {
  const line = `${JSON.stringify(row)}\n`;
  await (0, import_promises5.appendFile)(path, line, "utf-8");
}
1391
/**
 * Run the LongMemEval benchmark and return an aggregate report.
 *
 * For every selected question the answer model is prompted either with
 * snippets retrieved from a per-question memory (`mode === "memory"`) or with
 * the rendered haystack capped at `fullContextMaxChars` (any other mode); the
 * judge model then grades the answer. A failure while answering or judging is
 * recorded on the row (`error`) and the row counts as incorrect.
 *
 * Options read from `opts`:
 * - `answerModel`, `judgeModel`: objects exposing `complete()` and, optionally, `modelId`.
 * - `mode`: "memory" requires `memoryFactory(questionId)`.
 * - `dataset` or `dataPath`: one of the two must be provided.
 * - `types`: keep only questions whose `type` or `baseType` is listed.
 * - `questionLimit`, `seed` (default 42): seeded random subset when the limit
 *   is below the number of questions.
 * - `topK` (default 10): values above 10 are reduced to 10.
 * - `concurrency` (default 1): questions are processed in batches of this size.
 * - `onProgress(done, total)`: called after each newly processed question.
 * - `checkpointPath`: JSONL file; rows found there are not re-run and are
 *   merged into the report.
 * - `fullContextMaxChars`: haystack character budget in full-context mode.
 *
 * @param {object} opts
 * @returns {Promise<object>} Report with `config`, `overall`, `byType`,
 *   optional `abstentionAccuracy`, `tokens`, `wallClockMs`, `questions` and
 *   `errorCount`.
 * @throws {Error} When `memoryFactory` is missing in memory mode, or neither
 *   `dataset` nor `dataPath` is given.
 */
async function runLongMemEvalBench(opts) {
  const {
    answerModel,
    judgeModel,
    mode,
    types,
    questionLimit,
    seed = 42,
    concurrency = 1,
    onProgress,
    checkpointPath,
    fullContextMaxChars = DEFAULT_FULL_CONTEXT_MAX_CHARS
  } = opts;
  const topK = Math.min(opts.topK ?? 10, 10);
  if (mode === "memory" && !opts.memoryFactory) {
    throw new Error("runLongMemEvalBench: memoryFactory is required when mode='memory'");
  }
  let dataset;
  if (opts.dataset) {
    dataset = opts.dataset;
  } else if (opts.dataPath) {
    const { loadLongMemEval: loader } = await Promise.resolve().then(() => (init_lme_loader(), lme_loader_exports));
    dataset = await loader(opts.dataPath);
  } else {
    throw new Error("runLongMemEvalBench: either dataPath or dataset must be provided");
  }
  let questions = dataset.questions;
  if (types && types.length > 0) {
    questions = questions.filter((q) => types.includes(q.type) || types.includes(q.baseType));
  }
  if (questionLimit !== void 0 && questionLimit < questions.length) {
    questions = seededShuffle2(questions, makeRng2(seed)).slice(0, questionLimit);
  }
  const checkpoint = checkpointPath ? await loadCheckpoint2(checkpointPath) : /* @__PURE__ */ new Set();
  const selectedIds = new Set(questions.map((q) => q.id));
  const results = [];
  let totalAnswerInputTokens = 0;
  let totalAnswerOutputTokens = 0;
  let totalJudgeInputTokens = 0;
  let totalJudgeOutputTokens = 0;
  // Never updated in this function; reported as 0.
  const ingestEmbedTokens = 0;
  let errorCount = 0;
  let done = 0;
  // Count only the selected questions still to run. The checkpoint may hold
  // ids outside the current selection, so subtracting its size would
  // under-count (or even go negative).
  const total = questions.filter((q) => !checkpoint.has(q.id)).length;
  const startTime = Date.now();
  const processQuestion = async (q) => {
    if (checkpoint.has(q.id)) return;
    let modelAnswer = "";
    let answerIn = 0;
    let answerOut = 0;
    let judgeIn = 0;
    let judgeOut = 0;
    let correct = false;
    let contextTruncated = false;
    let errorMsg;
    try {
      let prompt;
      if (mode === "memory") {
        const memory = await opts.memoryFactory(q.id);
        const scope = { kind: "agent", agentId: `lme:${q.id}` };
        await ingestQuestionIntoMemory(q, memory, scope);
        const retrieved = await memory.retrieve({ text: q.question, scope, topK });
        const snippets = retrieved.snippets.map((s) => s.text);
        prompt = buildMemoryAnswerPrompt2(snippets, q.question, q.questionDate);
      } else {
        const { text, truncated } = renderHaystackCapped(q.sessions, fullContextMaxChars);
        contextTruncated = truncated;
        prompt = buildFullContextAnswerPrompt2(text, q.question, q.questionDate);
      }
      const resp = await answerModel.complete({
        messages: [{ role: "user", content: prompt }],
        tools: []
      });
      const textBlocks = resp.content.filter((b) => b.type === "text");
      modelAnswer = textBlocks.map((b) => b.text).join("").trim();
      answerIn = resp.usage?.inputTokens ?? 0;
      answerOut = resp.usage?.outputTokens ?? 0;
      const judgePrompt = buildJudgePrompt2(
        q.question,
        q.answer,
        modelAnswer,
        q.baseType,
        q.isAbstention
      );
      const judgeResult = await callJudge2(judgeModel, judgePrompt);
      correct = judgeResult.correct;
      judgeIn = judgeResult.inputTokens;
      judgeOut = judgeResult.outputTokens;
    } catch (err) {
      // Non-Error throwables have no `.message`; stringify them so the row
      // still records why the question failed.
      errorMsg = err instanceof Error ? err.message : String(err);
      errorCount++;
      correct = false;
    }
    const appearedToAbstain = appearsToDecline2(modelAnswer);
    const row = {
      questionId: q.id,
      questionType: q.type,
      isAbstention: q.isAbstention,
      question: q.question,
      goldAnswer: q.answer,
      modelAnswer,
      correct,
      appearedToAbstain,
      ...contextTruncated ? { contextTruncated } : {},
      ...errorMsg !== void 0 ? { error: errorMsg } : {},
      answerInputTokens: answerIn,
      answerOutputTokens: answerOut,
      judgeInputTokens: judgeIn,
      judgeOutputTokens: judgeOut
    };
    results.push(row);
    totalAnswerInputTokens += answerIn;
    totalAnswerOutputTokens += answerOut;
    totalJudgeInputTokens += judgeIn;
    totalJudgeOutputTokens += judgeOut;
    if (checkpointPath) {
      await appendCheckpointRow2(checkpointPath, row);
    }
    done++;
    if (onProgress) onProgress(done, total);
  };
  // A NaN limit would make the batch-size comparison below always false and
  // silently launch every question at once; fall back to sequential instead.
  const requestedConcurrency = Math.floor(Number(concurrency));
  const concurrencyLimit = Number.isNaN(requestedConcurrency) ? 1 : Math.max(1, requestedConcurrency);
  const pending = [];
  for (const q of questions) {
    pending.push(processQuestion(q));
    if (pending.length >= concurrencyLimit) {
      await Promise.all(pending.splice(0, concurrencyLimit));
    }
  }
  if (pending.length > 0) await Promise.all(pending);
  if (checkpointPath && checkpoint.size > 0) {
    const raw = await (0, import_promises5.readFile)(checkpointPath, "utf-8").catch(() => "");
    // Keep one row per question (the last one written wins) and only rows for
    // questions in the current selection, so resumed rows are neither
    // double-counted nor pulled in from an unrelated earlier run.
    const restored = /* @__PURE__ */ new Map();
    for (const line of raw.split("\n")) {
      const trimmed = line.trim();
      if (!trimmed) continue;
      try {
        const row = JSON.parse(trimmed);
        if (checkpoint.has(row.questionId) && selectedIds.has(row.questionId)) {
          restored.set(row.questionId, row);
        }
      } catch {
        // Best-effort: skip malformed checkpoint lines.
      }
    }
    for (const row of restored.values()) {
      results.push(row);
      totalAnswerInputTokens += row.answerInputTokens ?? 0;
      totalAnswerOutputTokens += row.answerOutputTokens ?? 0;
      totalJudgeInputTokens += row.judgeInputTokens ?? 0;
      totalJudgeOutputTokens += row.judgeOutputTokens ?? 0;
      // Keep errorCount in step with the rows that carry an `error` field.
      if (row.error !== void 0) errorCount++;
    }
  }
  const byTypeMap = {};
  let overallCorrect = 0;
  let overallTotal = 0;
  let abstentionCorrect = 0;
  let abstentionTotal = 0;
  for (const row of results) {
    if (row.isAbstention) {
      // Abstention rows are tallied separately and excluded from overall/byType.
      abstentionTotal++;
      if (row.correct) abstentionCorrect++;
    } else {
      const bt = row.questionType.endsWith("_abs") ? row.questionType.slice(0, -4) : row.questionType;
      if (!byTypeMap[bt]) byTypeMap[bt] = { correct: 0, total: 0 };
      overallTotal++;
      if (row.correct) overallCorrect++;
      byTypeMap[bt].total++;
      if (row.correct) byTypeMap[bt].correct++;
    }
  }
  const byType = {};
  for (const [t, stats] of Object.entries(byTypeMap)) {
    byType[t] = {
      ...stats,
      accuracy: stats.total > 0 ? stats.correct / stats.total : 0
    };
  }
  const abstentionAccuracy = abstentionTotal > 0 ? {
    correct: abstentionCorrect,
    total: abstentionTotal,
    accuracy: abstentionCorrect / abstentionTotal
  } : void 0;
  const wallClockMs = Date.now() - startTime;
  const allTypes = [...new Set(questions.map((q) => q.type))].sort();
  return {
    config: {
      mode,
      topK,
      answerModelId: answerModel.modelId ?? "(unknown)",
      judgeModelId: judgeModel.modelId ?? "(unknown)",
      datasetSource: LONGMEMEVAL_SOURCE,
      seed,
      types: allTypes,
      questionsRun: results.length
    },
    overall: {
      correct: overallCorrect,
      total: overallTotal,
      accuracy: overallTotal > 0 ? overallCorrect / overallTotal : 0
    },
    byType,
    ...abstentionAccuracy !== void 0 ? { abstentionAccuracy } : {},
    tokens: {
      ingestEmbedTokens,
      answerInputTokens: totalAnswerInputTokens,
      answerOutputTokens: totalAnswerOutputTokens,
      judgeInputTokens: totalJudgeInputTokens,
      judgeOutputTokens: totalJudgeOutputTokens,
      totalInputTokens: totalAnswerInputTokens + totalJudgeInputTokens,
      totalOutputTokens: totalAnswerOutputTokens + totalJudgeOutputTokens
    },
    wallClockMs,
    questions: results,
    errorCount
  };
}
1611
+
1612
+ // src/lme-render.ts
1613
// Column headers used in the report table for each known base question type.
var QUESTION_TYPE_LABELS = Object.fromEntries([
  ["single-session-user", "Single-session (user)"],
  ["single-session-assistant", "Single-session (asst.)"],
  ["single-session-preference", "Single-session (pref.)"],
  ["multi-session", "Multi-session"],
  ["temporal-reasoning", "Temporal reasoning"],
  ["knowledge-update", "Knowledge update"]
]);
1621
/**
 * Format a fraction as a percentage with one decimal place.
 *
 * @param {number} n - Fraction, e.g. 0.5.
 * @returns {string} e.g. "50.0%".
 */
function pct2(n) {
  const scaled = n * 100;
  return `${scaled.toFixed(1)}%`;
}
1624
/**
 * Format a number using en-US locale conventions.
 *
 * @param {number} n
 * @returns {string} e.g. "1,234,567".
 */
function fmtNum2(n) {
  const locale = "en-US";
  return n.toLocaleString(locale);
}
1627
/**
 * Estimate the dollar cost of a run from its token totals.
 *
 * @param {{totalInputTokens: number, totalOutputTokens: number}} tokens
 * @param {{inputPer1M: number, outputPer1M: number}} [prices] - Price per one
 *   million tokens; when absent no estimate is made.
 * @returns {string} "$" followed by the cost with four decimals, or an em dash
 *   when `prices` is falsy.
 */
function estimateCost2(tokens, prices) {
  if (!prices) return "\u2014";
  const inputCost = tokens.totalInputTokens / 1e6 * prices.inputPer1M;
  const outputCost = tokens.totalOutputTokens / 1e6 * prices.outputPer1M;
  const cost = inputCost + outputCost;
  return "$" + cost.toFixed(4);
}
1632
/**
 * Format an accuracy stat as "<pct>% (<correct>/<total>)".
 *
 * @param {{accuracy: number, correct: number, total: number}} [s]
 * @returns {string} The formatted stat, or an em dash when `s` is missing or
 *   has a total of 0.
 */
function fmtStat(s) {
  const hasData = Boolean(s) && s.total !== 0;
  if (!hasData) {
    return "\u2014";
  }
  // Percentage with one decimal place.
  const percent = (s.accuracy * 100).toFixed(1) + "%";
  return `${percent} (${s.correct}/${s.total})`;
}
1636
/**
 * Render one or more LongMemEval reports as a Markdown document containing a
 * results table (one row per report), a per-report run configuration section
 * and fixed methodology notes.
 *
 * Type columns follow a fixed canonical order; any other base type found in
 * the reports is appended in sorted order.
 *
 * @param {Array<object>} reports - Reports as returned by runLongMemEvalBench.
 * @param {{inputPer1M: number, outputPer1M: number}} [prices] - Optional token
 *   prices used for the cost column.
 * @returns {string} Markdown text.
 */
function renderLongMemEvalReportMarkdown(reports, prices) {
  const tableRow = (cells) => "| " + cells.join(" | ") + " |";
  const lines = [
    "# LongMemEval Benchmark Results",
    "",
    "Dataset: [LongMemEval](https://github.com/xiaowu0162/LongMemEval) (Wu et al.) \xB7 MIT License",
    "Raw data is not redistributed. Only aggregate results are published here.",
    ""
  ];
  if (reports.length === 0) {
    lines.push("_No results yet._");
    return lines.join("\n");
  }
  const allBaseTypes = /* @__PURE__ */ new Set();
  for (const report of reports) {
    for (const typeName of Object.keys(report.byType)) allBaseTypes.add(typeName);
  }
  const canonicalOrder = [
    "single-session-user",
    "single-session-assistant",
    "single-session-preference",
    "multi-session",
    "temporal-reasoning",
    "knowledge-update"
  ];
  const knownTypes = canonicalOrder.filter((t) => allBaseTypes.has(t));
  const extraTypes = [...allBaseTypes].filter((t) => !canonicalOrder.includes(t)).sort();
  const sortedTypes = [...knownTypes, ...extraTypes];
  const headers = [
    "System / Mode",
    ...sortedTypes.map((t) => QUESTION_TYPE_LABELS[t] ?? t),
    "Overall accuracy",
    "Abstention accuracy",
    "Tokens/query",
    "Est. cost/run",
    "Answer model",
    "Judge model",
    "topK",
    "n-Q",
    "Seed",
    "Dataset provenance"
  ];
  lines.push(
    "## Results",
    "",
    tableRow(headers),
    tableRow(headers.map(() => "---"))
  );
  for (const r of reports) {
    const c = r.config;
    const totalQ = r.questions.length;
    let tokensPerQuery = 0;
    if (totalQ > 0) {
      tokensPerQuery = Math.round(
        (r.tokens.totalInputTokens + r.tokens.totalOutputTokens) / totalQ
      );
    }
    const provenance = `${c.datasetSource.url.replace("https://", "")} @ ${c.datasetSource.snapshotSha.slice(0, 8)}`;
    lines.push(tableRow([
      `${c.answerModelId} / ${c.mode}`,
      ...sortedTypes.map((t) => fmtStat(r.byType[t])),
      fmtStat(r.overall),
      fmtStat(r.abstentionAccuracy),
      fmtNum2(tokensPerQuery),
      estimateCost2(r.tokens, prices),
      c.answerModelId,
      c.judgeModelId,
      // topK only applies to memory mode.
      c.mode === "memory" ? String(c.topK) : "\u2014",
      fmtNum2(r.config.questionsRun),
      String(c.seed),
      provenance
    ]));
  }
  lines.push("", "## Run Configuration", "");
  for (const r of reports) {
    const c = r.config;
    lines.push(
      `### ${c.answerModelId} / ${c.mode}`,
      "",
      `- **Mode**: ${c.mode}`,
      `- **Answer model**: ${c.answerModelId}`,
      `- **Judge model**: ${c.judgeModelId}`,
      ...c.mode === "memory" ? [`- **topK**: ${c.topK}`] : [],
      `- **Dataset source**: ${c.datasetSource.url}`,
      `- **Dataset snapshot SHA**: \`${c.datasetSource.snapshotSha}\``,
      `- **Dataset file**: ${c.datasetSource.file}`,
      `- **Dataset license**: ${c.datasetSource.license}`,
      `- **Seed**: ${c.seed}`,
      `- **Types**: ${c.types.join(", ") || "all"}`,
      `- **Questions run**: ${c.questionsRun}`,
      `- **Wall-clock**: ${(r.wallClockMs / 1e3).toFixed(1)}s`,
      `- **Errors**: ${r.errorCount}`,
      `- **Tokens** (in/out): ${fmtNum2(r.tokens.totalInputTokens)} / ${fmtNum2(r.tokens.totalOutputTokens)}`,
      ""
    );
  }
  lines.push(
    "## Methodology Notes",
    "",
    "These results were produced using the Eidentic LongMemEval fair-run harness. The following rules apply:",
    "",
    "1. **Per-question memory scope.** Each question has its own haystack (~50 sessions on average). A fresh Memory instance is created per question; no cross-question contamination.",
    "2. **Dual-granularity ingest.** Each turn is ingested with its session date in the text (temporally anchored). An additional session-level chunk entry captures multi-turn context.",
    "3. **Current date in prompt.** The `question_date` is passed to the answer prompt so temporal questions can reason about recency.",
    "4. **topK \u2264 10 in memory mode.** Larger topK values trivialise retrieval quality and are not permitted.",
    "5. **Full-context baseline is required** alongside any memory-mode result.",
    "6. **Judge is strict**: a model answer is correct only when it contains the gold answer's specific information. Vague/topical-only answers are wrong. Equivalent date expressions for the same date/duration are correct (temporal-reasoning type).",
    "7. **Abstention questions** (not present in longmemeval_s.json standard split): correct = model declined / said no-info / identified a flawed premise; fabricating a specific answer = wrong. Abstention accuracy is reported separately and not folded into overall accuracy.",
    "8. **Dataset license**: MIT \u2014 raw data is not redistributed; only aggregate results are published.",
    "",
    "> Per-type question counts in longmemeval_s.json (500 total):",
    "> single-session-user 70, single-session-assistant 56, single-session-preference 30,",
    "> multi-session 133, temporal-reasoning 133, knowledge-update 78.",
    "> No abstention variants in the standard _s split.",
    ""
  );
  return lines.join("\n");
}
1772
+
448
1773
  // src/write-quality.ts
449
1774
  var CONTRADICTION_FIXTURES = [
450
1775
  {
@@ -822,7 +2147,7 @@ async function runTemporalBench(memory, dataset, opts = {}) {
822
2147
  }
823
2148
 
824
2149
  // src/datasets/temporal.ts
825
- function makeRng(seed) {
2150
+ function makeRng3(seed) {
826
2151
  let s = seed >>> 0;
827
2152
  if (s === 0) s = 1;
828
2153
  return () => {
@@ -905,7 +2230,7 @@ function syntheticTemporalDataset(opts = {}) {
905
2230
  const entityCount = opts.entityCount ?? 4;
906
2231
  const seed = opts.seed ?? 42;
907
2232
  const changesPerProperty = opts.changesPerProperty ?? 3;
908
- const rng = makeRng(seed);
2233
+ const rng = makeRng3(seed);
909
2234
  const entities = [];
910
2235
  const asserts = [];
911
2236
  const questions = [];
@@ -992,12 +2317,22 @@ function syntheticTemporalDataset(opts = {}) {
992
2317
  0 && (module.exports = {
993
2318
  CONTRADICTION_FIXTURES,
994
2319
  JUNK_STREAM_FIXTURES,
2320
+ LOCOMO_SOURCE_SHA,
2321
+ LONGMEMEVAL_SOURCE,
995
2322
  factRecall,
996
2323
  loadLoCoMo,
2324
+ loadLoCoMoLegacy,
997
2325
  loadLongMemEval,
2326
+ loadLongMemEvalLegacy,
998
2327
  normalizeText,
999
2328
  normalizedIncludes,
2329
+ parseLmeDateTimeString,
1000
2330
  recallAtK,
2331
+ renderLocomoReportMarkdown,
2332
+ renderLongMemEvalReportMarkdown,
2333
+ resolveEvidence,
2334
+ runLocomoBench,
2335
+ runLongMemEvalBench,
1001
2336
  runMemoryBench,
1002
2337
  runTemporalBench,
1003
2338
  runWriteQualityBench,