@eidentic/bench 0.1.0 → 0.1.2

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
package/README.md CHANGED
@@ -103,10 +103,47 @@ const report = await runWriteQualityBench(memory, {
103
103
  });
104
104
  ```
105
105
 
106
- ### Real dataset loaders (gated)
106
+ ### LoCoMo fair-run harness
107
+
108
+ The LoCoMo harness is a rigorous end-to-end benchmark: ingest → retrieve → answer → LLM judge.
109
+ It enforces strict fair-run rules (both speakers as humans, structural timestamps, topK ≤ 10,
110
+ mandatory full-context baseline). See [BASELINES.md](./BASELINES.md) for the full methodology.
111
+
112
+ ```ts
113
+ import { loadLoCoMo, runLocomoBench, renderLocomoReportMarkdown } from "@eidentic/bench";
114
+ import { Memory } from "@eidentic/memory";
115
+ import { InMemoryStore, InMemoryVectorStore, FakeEmbedder } from "@eidentic/types/testing";
116
+
117
+ // Download data/locomo10.json first — CC BY-NC 4.0, do not commit
118
+ const dataset = await loadLoCoMo("data/locomo10.json");
119
+
120
+ const report = await runLocomoBench({
121
+ dataPath: "data/locomo10.json",
122
+ dataset,
123
+ answerModel: myModel,
124
+ judgeModel: myJudgeModel,
125
+ mode: "full-context", // or "memory" with memoryFactory
126
+ categories: [1, 2, 3, 4, 5],
127
+ memoryFactory: (sampleId) => new Memory({
128
+ store: new InMemoryStore(),
129
+ vector: new InMemoryVectorStore(),
130
+ embedder: new FakeEmbedder(32),
131
+ }),
132
+ });
133
+
134
+ console.log("J(1-4):", report.overallJ14.accuracy);
135
+ console.log(renderLocomoReportMarkdown([report]));
136
+ ```
137
+
138
+ Run the full pilot:
139
+ ```bash
140
+ ANTHROPIC_API_KEY=... pnpm --filter eidentic-examples bench:locomo -- --mode full-context --samples 2
141
+ ```
142
+
143
+ ### LongMemEval loader (retrieval benchmark, gated)
107
144
 
108
145
  ```ts
109
- import { loadLongMemEval, loadLoCoMo } from "@eidentic/bench";
146
+ import { loadLongMemEval } from "@eidentic/bench";
110
147
 
111
148
  const dataset = await loadLongMemEval("/path/to/longmemeval.json");
112
149
  const report = await runMemoryBench(() => myMemory(), dataset, { topK: 10 });
@@ -0,0 +1,98 @@
1
+ // src/lme-loader.ts
2
+ import { readFile, stat } from "node:fs/promises";
3
/**
 * Source metadata for the LongMemEval dataset: where it is hosted, a snapshot
 * SHA, the file name, and the license string.
 */
const LONGMEMEVAL_SOURCE = {
  url: "https://huggingface.co/datasets/xiaowu0162/longmemeval",
  snapshotSha: "2ec2a557f339b6c0369619b1ed5793734cc87533",
  file: "longmemeval_s",
  license: "MIT",
};

/** Default file-size cap in bytes (512 MiB), used when no `maxBytes` option is given. */
const DEFAULT_MAX_BYTES = 512 * 2 ** 20;
10
/**
 * Verifies that the file at `filePath` exists and is no larger than `maxBytes`.
 *
 * @param {string|URL} filePath - Path handed to `fs.promises.stat`.
 * @param {number} maxBytes - Largest accepted file size, in bytes.
 * @returns {Promise<void>} Resolves when the file is within the cap.
 * @throws {Error} When the file cannot be stat'ed (the underlying error is
 *   attached as `cause`) or when its size exceeds `maxBytes`.
 */
async function assertFileSize(filePath, maxBytes) {
  let fileSize;
  try {
    ({ size: fileSize } = await stat(filePath));
  } catch (err) {
    // Attach the original error as `cause` so its `code` (e.g. ENOENT) and
    // stack stay available to callers instead of being flattened to a string.
    const reason = err instanceof Error ? err.message : String(err);
    throw new Error(
      `bench loader: cannot stat file "${filePath}": ${reason}`,
      { cause: err }
    );
  }
  if (fileSize > maxBytes) {
    const bytesPerMiB = 1024 * 1024;
    const mb = (fileSize / bytesPerMiB).toFixed(1);
    const capMb = (maxBytes / bytesPerMiB).toFixed(0);
    throw new Error(
      `bench loader: file "${filePath}" is ${mb} MiB, which exceeds the ${capMb} MiB cap. Pass a larger maxBytes option if this is intentional.`
    );
  }
}
28
/**
 * Converts a LongMemEval-style timestamp string to epoch milliseconds.
 *
 * The first parenthesised alphabetic token (e.g. "(Sat)") is removed and a
 * leading `YYYY/MM/DD` date is rewritten to `YYYY-MM-DD` before the string is
 * handed to `Date.parse`.
 *
 * NOTE(review): a date-time without a UTC offset is resolved by `Date.parse`
 * in an engine-dependent way (typically local time), so results can vary with
 * the host time zone — confirm this is acceptable for the benchmark.
 *
 * @param {unknown} raw - Timestamp text, e.g. "2023/05/20 (Sat) 12:30".
 * @returns {number} Epoch milliseconds, or 0 when `raw` is empty, not a
 *   string, or cannot be parsed.
 */
function parseLmeDateTimeString(raw) {
  // Malformed JSON may supply numbers/objects here; calling `.replace` on them
  // would throw, so treat every non-string like the empty string.
  if (typeof raw !== "string" || raw === "") return 0;
  const withoutWeekday = raw.replace(/\s*\([A-Za-z]+\)\s*/, " ").trim();
  const dashed = withoutWeekday.replace(/^(\d{4})\/(\d{2})\/(\d{2})/, "$1-$2-$3");
  const ms = Date.parse(dashed);
  return Number.isNaN(ms) ? 0 : ms;
}
36
/**
 * Returns the question type without a trailing "_abs" marker.
 *
 * @param {string} rawType - Question type as it appears in the dataset.
 * @returns {string} `rawType` minus its final four characters when it ends
 *   with "_abs"; otherwise `rawType` unchanged.
 */
function extractBaseType(rawType) {
  return rawType.endsWith("_abs") ? rawType.slice(0, -4) : rawType;
}
42
/**
 * Builds a normalised session record from one raw haystack session.
 *
 * @param {string} id - Session identifier.
 * @param {string} dateTime - Raw session timestamp text.
 * @param {Array<object>} rawTurns - Raw turn objects; `role`, `content` and
 *   `has_answer` are the only fields read.
 * @returns {{id: string, dateTime: string, dateTimeMs: number, turns: Array<object>}}
 */
function parseSession(id, dateTime, rawTurns) {
  const turns = rawTurns.map((rawTurn) => {
    // Anything other than an explicit "assistant" role is treated as "user".
    const role = rawTurn.role === "assistant" ? "assistant" : "user";
    const content = rawTurn.content ?? "";
    // Only a literal `true` counts; truthy non-boolean values do not.
    const hasAnswer = rawTurn.has_answer === true;
    return { role, content, hasAnswer };
  });
  const dateTimeMs = parseLmeDateTimeString(dateTime);
  return { id, dateTime, dateTimeMs, turns };
}
55
/**
 * Reads a LongMemEval JSON file and normalises it into `{ questions }`.
 *
 * The file size is checked first (against `opts.maxBytes`, falling back to
 * `DEFAULT_MAX_BYTES`), then the file is read as UTF-8 and parsed as JSON.
 * Each question's sessions are sorted by ascending `dateTimeMs`.
 *
 * @param {string} jsonPath - Path to the dataset JSON file.
 * @param {{maxBytes?: number}} [opts] - Optional size cap override.
 * @returns {Promise<{questions: Array<object>}>}
 * @throws {Error} When the size check fails or the JSON root is not an array.
 */
async function loadLongMemEval(jsonPath, opts) {
  const maxBytes = opts?.maxBytes ?? DEFAULT_MAX_BYTES;
  await assertFileSize(jsonPath, maxBytes);

  const text = await readFile(jsonPath, "utf-8");
  const raw = JSON.parse(text);
  if (!Array.isArray(raw)) {
    throw new Error(
      `bench loader: expected the LongMemEval JSON root to be an array, but got ${typeof raw}. Did you pass the correct file?`
    );
  }

  // Missing or malformed list fields are treated as empty lists.
  const asArray = (value) => (Array.isArray(value) ? value : []);

  const questions = raw.map((q, i) => {
    const id = q.question_id ?? String(i);
    const rawType = q.question_type ?? "single-session-user";
    const baseType = extractBaseType(rawType);
    const isAbstention = rawType.endsWith("_abs");

    const rawSessions = asArray(q.haystack_sessions);
    const dates = asArray(q.haystack_dates);
    const sessionIds = asArray(q.haystack_session_ids);

    // The three haystack arrays are matched up by position.
    const sessions = rawSessions.map((turns, idx) =>
      parseSession(sessionIds[idx] ?? `sess-${idx}`, dates[idx] ?? "", asArray(turns))
    );
    sessions.sort((a, b) => a.dateTimeMs - b.dateTimeMs);

    const questionDate = q.question_date ?? "";
    return {
      id,
      type: rawType,
      baseType,
      isAbstention,
      question: q.question ?? "",
      answer: q.answer ?? "",
      questionDate,
      questionDateMs: parseLmeDateTimeString(questionDate),
      sessions,
      answerSessionIds: asArray(q.answer_session_ids),
    };
  });

  return { questions };
}
93
+
94
+ export {
95
+ LONGMEMEVAL_SOURCE,
96
+ parseLmeDateTimeString,
97
+ loadLongMemEval
98
+ };
@@ -0,0 +1,99 @@
1
+ // src/locomo-loader.ts
2
+ import { readFile, stat } from "node:fs/promises";
3
// SHA identifying the LoCoMo source this loader refers to.
// NOTE(review): presumably an upstream commit hash — confirm against the repo.
const LOCOMO_SOURCE_SHA = "3eb6f2c585f5e1699204e3c3bdf7adc5c28cb376";

/** Default file-size cap in bytes (256 MiB), used when no `maxBytes` option is given. */
const DEFAULT_MAX_BYTES = 256 * 2 ** 20;
5
/**
 * Verifies that the file at `filePath` exists and is no larger than `maxBytes`.
 *
 * @param {string|URL} filePath - Path handed to `fs.promises.stat`.
 * @param {number} maxBytes - Largest accepted file size, in bytes.
 * @returns {Promise<void>} Resolves when the file is within the cap.
 * @throws {Error} When the file cannot be stat'ed (the underlying error is
 *   attached as `cause`) or when its size exceeds `maxBytes`.
 */
async function assertFileSize(filePath, maxBytes) {
  let fileSize;
  try {
    ({ size: fileSize } = await stat(filePath));
  } catch (err) {
    // Attach the original error as `cause` so its `code` (e.g. ENOENT) and
    // stack stay available to callers instead of being flattened to a string.
    const reason = err instanceof Error ? err.message : String(err);
    throw new Error(
      `bench loader: cannot stat file "${filePath}": ${reason}`,
      { cause: err }
    );
  }
  if (fileSize > maxBytes) {
    const bytesPerMiB = 1024 * 1024;
    const mb = (fileSize / bytesPerMiB).toFixed(1);
    const capMb = (maxBytes / bytesPerMiB).toFixed(0);
    throw new Error(
      `bench loader: file "${filePath}" is ${mb} MiB, which exceeds the ${capMb} MiB cap. Pass a larger maxBytes option if this is intentional.`
    );
  }
}
23
/**
 * Converts a LoCoMo session timestamp such as "1:56 pm on 8 May, 2023" to
 * epoch milliseconds.
 *
 * The first "on" connective is collapsed to a single space and the result is
 * given to `Date.parse`. If that fails and the text starts with a
 * `h:mm am|pm` time, the time is moved after the remainder and parsing is
 * retried once.
 *
 * NOTE(review): these strings carry no UTC offset, so `Date.parse` resolves
 * them in an engine-dependent way (typically local time) — confirm.
 *
 * @param {string} raw - Timestamp text.
 * @returns {number} Epoch milliseconds, or 0 when neither attempt parses.
 */
function parseLocomoDateTime(raw) {
  const cleaned = raw.replace(/\s*on\s+/, " ").trim();

  const direct = Date.parse(cleaned);
  if (!Number.isNaN(direct)) {
    return direct;
  }

  // Retry with the date first: "<time> <date>" becomes "<date> <time>".
  const timeFirst = /^(\d{1,2}:\d{2}\s*(?:am|pm))\s+(.+)$/i.exec(cleaned);
  if (!timeFirst) {
    return 0;
  }
  const [, timePart, datePart] = timeFirst;
  const swapped = Date.parse(`${datePart} ${timePart}`);
  return Number.isNaN(swapped) ? 0 : swapped;
}
36
/**
 * Collects the `session_<n>` entries of a LoCoMo conversation object into an
 * array ordered by ascending `<n>`.
 *
 * Keys whose value is not an array are skipped. The matching
 * `session_<n>_date_time` value is used as the timestamp only when it is a
 * string; otherwise the timestamp is "" and `dateTimeMs` is 0.
 *
 * @param {object} conv - Raw `conversation` object from the dataset.
 * @returns {Array<{index: number, dateTime: string, dateTimeMs: number, turns: Array<object>}>}
 */
function parseSessions(conv) {
  const sessionKey = /^session_(\d+)$/;
  const indices = Object.keys(conv)
    .map((key) => sessionKey.exec(key))
    .filter((hit) => hit !== null)
    .map((hit) => parseInt(hit[1], 10))
    .sort((a, b) => a - b);

  const sessions = [];
  for (const index of indices) {
    const turnsRaw = conv[`session_${index}`];
    if (Array.isArray(turnsRaw)) {
      const dateValue = conv[`session_${index}_date_time`];
      const dateTime = typeof dateValue === "string" ? dateValue : "";
      const turns = turnsRaw.map((rawTurn) => ({
        speaker: rawTurn.speaker ?? "",
        diaId: rawTurn.dia_id ?? "",
        text: rawTurn.text ?? "",
      }));
      // An empty timestamp is never handed to the parser; it maps straight to 0.
      const dateTimeMs = dateTime ? parseLocomoDateTime(dateTime) : 0;
      sessions.push({ index, dateTime, dateTimeMs, turns });
    }
  }
  return sessions;
}
62
/**
 * Reads a LoCoMo JSON file and normalises it into `{ samples }`.
 *
 * The file size is checked first (against `opts.maxBytes`, falling back to
 * `DEFAULT_MAX_BYTES`), then the file is read as UTF-8 and parsed as JSON.
 *
 * @param {string} jsonPath - Path to the dataset JSON file.
 * @param {{maxBytes?: number}} [opts] - Optional size cap override.
 * @returns {Promise<{samples: Array<object>}>} One entry per sample with
 *   `sampleId`, `speakerA`, `speakerB`, `sessions` and `qa`.
 * @throws {Error} When the size check fails or the JSON root is not an array.
 */
async function loadLoCoMo(jsonPath, opts) {
  await assertFileSize(jsonPath, opts?.maxBytes ?? DEFAULT_MAX_BYTES);
  const raw = JSON.parse(await readFile(jsonPath, "utf-8"));
  if (!Array.isArray(raw)) {
    throw new Error(
      `bench loader: expected the LoCoMo JSON root to be an array, but got ${typeof raw}. Did you pass the correct file?`
    );
  }
  const samples = raw.map((s, i) => {
    const sampleId = s.sample_id ?? String(i);
    const conv = s.conversation ?? {};
    const sessions = parseSessions(conv);
    // Guard like `evidence` below: a non-array `qa` yields no questions
    // rather than a "map is not a function" crash.
    const rawQa = Array.isArray(s.qa) ? s.qa : [];
    const qa = rawQa.map((q) => ({
      question: q.question ?? "",
      // `!= null` covers both a missing key and an explicit JSON null, so a
      // null answer stays undefined instead of becoming the string "null".
      answer: q.answer != null ? String(q.answer) : undefined,
      category: q.category,
      evidence: Array.isArray(q.evidence) ? q.evidence : [],
      adversarialAnswer: q.adversarial_answer,
    }));
    return {
      sampleId,
      speakerA: String(conv.speaker_a ?? ""),
      speakerB: String(conv.speaker_b ?? ""),
      sessions,
      qa,
    };
  });
  return { samples };
}
95
+
96
+ export {
97
+ LOCOMO_SOURCE_SHA,
98
+ loadLoCoMo
99
+ };