@eidentic/bench 0.1.0 → 0.1.1
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/README.md +39 -2
- package/dist/chunk-PVIWNXCY.js +99 -0
- package/dist/index.cjs +657 -3
- package/dist/index.d.cts +255 -2
- package/dist/index.d.ts +255 -2
- package/dist/index.js +544 -3
- package/dist/locomo-loader-YA3IEOND.js +8 -0
- package/package.json +4 -4
package/README.md
CHANGED
|
@@ -103,10 +103,47 @@ const report = await runWriteQualityBench(memory, {
|
|
|
103
103
|
});
|
|
104
104
|
```
|
|
105
105
|
|
|
106
|
-
###
|
|
106
|
+
### LoCoMo fair-run harness
|
|
107
|
+
|
|
108
|
+
The LoCoMo harness is a rigorous end-to-end benchmark: ingest → retrieve → answer → LLM judge.
|
|
109
|
+
It enforces strict fair-run rules (both speakers as humans, structural timestamps, topK ≤ 10,
|
|
110
|
+
mandatory full-context baseline). See [BASELINES.md](./BASELINES.md) for the full methodology.
|
|
111
|
+
|
|
112
|
+
```ts
|
|
113
|
+
import { loadLoCoMo, runLocomoBench, renderLocomoReportMarkdown } from "@eidentic/bench";
|
|
114
|
+
import { Memory } from "@eidentic/memory";
|
|
115
|
+
import { InMemoryStore, InMemoryVectorStore, FakeEmbedder } from "@eidentic/types/testing";
|
|
116
|
+
|
|
117
|
+
// Download data/locomo10.json first — CC BY-NC 4.0, do not commit
|
|
118
|
+
const dataset = await loadLoCoMo("data/locomo10.json");
|
|
119
|
+
|
|
120
|
+
const report = await runLocomoBench({
|
|
121
|
+
dataPath: "data/locomo10.json",
|
|
122
|
+
dataset,
|
|
123
|
+
answerModel: myModel,
|
|
124
|
+
judgeModel: myJudgeModel,
|
|
125
|
+
mode: "full-context", // or "memory" with memoryFactory
|
|
126
|
+
categories: [1, 2, 3, 4, 5],
|
|
127
|
+
memoryFactory: (sampleId) => new Memory({
|
|
128
|
+
store: new InMemoryStore(),
|
|
129
|
+
vector: new InMemoryVectorStore(),
|
|
130
|
+
embedder: new FakeEmbedder(32),
|
|
131
|
+
}),
|
|
132
|
+
});
|
|
133
|
+
|
|
134
|
+
console.log("J(1-4):", report.overallJ14.accuracy);
|
|
135
|
+
console.log(renderLocomoReportMarkdown([report]));
|
|
136
|
+
```
|
|
137
|
+
|
|
138
|
+
Run the full pilot:
|
|
139
|
+
```bash
|
|
140
|
+
ANTHROPIC_API_KEY=... pnpm --filter eidentic-examples bench:locomo -- --mode full-context --samples 2
|
|
141
|
+
```
|
|
142
|
+
|
|
143
|
+
### LongMemEval loader (retrieval benchmark, gated)
|
|
107
144
|
|
|
108
145
|
```ts
|
|
109
|
-
import { loadLongMemEval
|
|
146
|
+
import { loadLongMemEval, runMemoryBench } from "@eidentic/bench";
|
|
110
147
|
|
|
111
148
|
const dataset = await loadLongMemEval("/path/to/longmemeval.json");
|
|
112
149
|
const report = await runMemoryBench(() => myMemory(), dataset, { topK: 10 });
|
|
@@ -0,0 +1,99 @@
|
|
|
1
|
+
// src/locomo-loader.ts
|
|
2
|
+
import { readFile, stat } from "node:fs/promises";
|
|
3
|
+
/**
 * Identifier pinned for the LoCoMo source data; exported from this module.
 * NOTE(review): presumably the upstream git commit SHA this loader was written against — confirm.
 */
const LOCOMO_SOURCE_SHA = "3eb6f2c585f5e1699204e3c3bdf7adc5c28cb376";

/** Default file-size cap in bytes (256 MiB) used by loadLoCoMo when no `maxBytes` option is passed. */
const DEFAULT_MAX_BYTES = 256 * 1024 * 1024;
|
|
5
|
+
/**
 * Reject a file that is larger than `maxBytes` before it is read into memory.
 *
 * @param {string} filePath - Path of the file to check.
 * @param {number} maxBytes - Largest permitted size in bytes (a file of exactly this size passes).
 * @returns {Promise<void>} Resolves when the file can be stat'ed and is within the cap.
 * @throws {Error} When the file cannot be stat'ed (the underlying error is attached as
 *   `cause`) or when its size exceeds `maxBytes`.
 */
async function assertFileSize(filePath, maxBytes) {
  let fileSize;
  try {
    ({ size: fileSize } = await stat(filePath));
  } catch (err) {
    // Anything can be thrown in JS; only read `.message` off real Error objects.
    const reason = err instanceof Error ? err.message : String(err);
    // Keep the original error reachable so callers can still inspect `cause.code` (e.g. ENOENT).
    throw new Error(
      `bench loader: cannot stat file "${filePath}": ${reason}`,
      { cause: err }
    );
  }
  if (fileSize > maxBytes) {
    const bytesPerMiB = 1024 * 1024;
    const mb = (fileSize / bytesPerMiB).toFixed(1);
    const capMb = (maxBytes / bytesPerMiB).toFixed(0);
    throw new Error(
      `bench loader: file "${filePath}" is ${mb} MiB, which exceeds the ${capMb} MiB cap. Pass a larger maxBytes option if this is intentional.`
    );
  }
}
|
|
23
|
+
/**
 * Convert a session timestamp string into epoch milliseconds.
 *
 * The first standalone word "on" is removed (so "<time> on <date>" becomes
 * "<time> <date>") and the result is handed to `Date.parse`. If that fails and the
 * string starts with an "h:mm am/pm" time, the time is moved behind the date and
 * parsing is retried.
 *
 * NOTE: these are not ISO 8601 strings, so how `Date.parse` reads them (including
 * which time zone it assumes when none is given) is up to the host engine.
 *
 * @param {string} raw - Timestamp text.
 * @returns {number} Epoch milliseconds, or 0 when the text cannot be parsed
 *   (or is not a string).
 */
function parseLocomoDateTime(raw) {
  if (typeof raw !== "string") return 0;

  // `\b` keeps the match to the word "on" itself; without it the tail of words
  // such as "Mon" would be stripped as well.
  const cleaned = raw.replace(/\s*\bon\s+/i, " ").trim();

  const direct = Date.parse(cleaned);
  if (!Number.isNaN(direct)) return direct;

  // Fallback for engines that reject a leading time: retry as "<date> <time>".
  const match = /^(\d{1,2}:\d{2}\s*(?:am|pm))\s+(.+)$/i.exec(cleaned);
  if (!match) return 0;

  const [, time, date] = match;
  const reordered = Date.parse(`${date} ${time}`);
  return Number.isNaN(reordered) ? 0 : reordered;
}
|
|
36
|
+
/**
 * Collect the `session_<n>` entries of a conversation object into an ordered list.
 *
 * Only keys of the exact form `session_<digits>` whose value is an array are used;
 * the matching `<key>_date_time` entry supplies the timestamp when it is a string.
 *
 * @param {object} conv - Conversation record holding `session_<n>` arrays and
 *   optional `session_<n>_date_time` strings.
 * @returns {Array<{index: number, dateTime: string, dateTimeMs: number,
 *   turns: Array<{speaker: string, diaId: string, text: string}>}>}
 *   Sessions sorted by ascending numeric index. `dateTimeMs` is 0 when there is no
 *   timestamp string.
 */
function parseSessions(conv) {
  // Remember the real key next to its numeric index so a key such as
  // "session_01" is read back under its own name rather than as "session_1".
  const sessionKeys = [];
  for (const key of Object.keys(conv)) {
    const m = /^session_(\d+)$/.exec(key);
    if (m) sessionKeys.push({ key, index: Number.parseInt(m[1], 10) });
  }
  sessionKeys.sort((a, b) => a.index - b.index);

  const sessions = [];
  for (const { key, index } of sessionKeys) {
    const turnsRaw = conv[key];
    if (!Array.isArray(turnsRaw)) continue;

    const dateTimeValue = conv[`${key}_date_time`];
    const dateTime = typeof dateTimeValue === "string" ? dateTimeValue : "";

    // `?.` lets a null / non-object turn degrade to empty fields instead of throwing.
    const turns = turnsRaw.map((t) => ({
      speaker: t?.speaker ?? "",
      diaId: t?.dia_id ?? "",
      text: t?.text ?? "",
    }));

    sessions.push({
      index,
      dateTime,
      dateTimeMs: dateTime ? parseLocomoDateTime(dateTime) : 0,
      turns,
    });
  }
  return sessions;
}
|
|
62
|
+
/**
 * Read a LoCoMo JSON file and normalise it into `{ samples }`.
 *
 * The file size is checked against a cap before the file is read. Each element of
 * the root array becomes one sample with its speakers, parsed sessions and QA items;
 * missing fields fall back to empty strings / empty arrays.
 *
 * @param {string} jsonPath - Path to the JSON file.
 * @param {{ maxBytes?: number }} [opts] - `maxBytes` overrides the default size cap
 *   (DEFAULT_MAX_BYTES).
 * @returns {Promise<{ samples: Array<{ sampleId: string, speakerA: string,
 *   speakerB: string, sessions: Array<object>, qa: Array<object> }> }>}
 * @throws {Error} When the file cannot be stat'ed, exceeds the cap, or its JSON root
 *   is not an array. Read and JSON syntax errors propagate unchanged.
 */
async function loadLoCoMo(jsonPath, opts) {
  await assertFileSize(jsonPath, opts?.maxBytes ?? DEFAULT_MAX_BYTES);

  const raw = JSON.parse(await readFile(jsonPath, "utf-8"));
  if (!Array.isArray(raw)) {
    // `typeof null` is "object", which would make the message misleading.
    const actual = raw === null ? "null" : typeof raw;
    throw new Error(
      `bench loader: expected the LoCoMo JSON root to be an array, but got ${actual}. Did you pass the correct file?`
    );
  }

  const samples = raw.map((s, i) => {
    // `?.` throughout: a null sample / QA entry yields defaults instead of a TypeError.
    const conv = s?.conversation ?? {};
    const qaRaw = Array.isArray(s?.qa) ? s.qa : [];

    const qa = qaRaw.map((q) => ({
      question: q?.question ?? "",
      // JSON has no `undefined`; an explicit `null` answer must stay "no answer"
      // rather than turning into the literal string "null".
      answer: q?.answer != null ? String(q.answer) : undefined,
      category: q?.category,
      evidence: Array.isArray(q?.evidence) ? q.evidence : [],
      adversarialAnswer: q?.adversarial_answer,
    }));

    return {
      // Always a string, whether it comes from the file or from the array position.
      sampleId: String(s?.sample_id ?? i),
      speakerA: String(conv.speaker_a ?? ""),
      speakerB: String(conv.speaker_b ?? ""),
      sessions: parseSessions(conv),
      qa,
    };
  });

  return { samples };
}
|
|
95
|
+
|
|
96
|
+
export {
|
|
97
|
+
LOCOMO_SOURCE_SHA,
|
|
98
|
+
loadLoCoMo
|
|
99
|
+
};
|