@eidentic/bench 0.1.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -0,0 +1,496 @@
1
+ import { Memory } from '@eidentic/memory';
2
+ import { ModelPort, Scope, AssertFactInput } from '@eidentic/types';
3
+
4
+ /**
5
+ * Core dataset shapes for the Eidentic memory benchmark harness (§6.10).
6
+ *
7
+ * A BenchDataset is a named collection of BenchCase objects. Each case represents a
8
+ * multi-session conversation (turns) followed by a set of retrieval questions. Each question
9
+ * specifies one or more gold facts — substrings that MUST appear in the retrieved context for
10
+ * the answer to be considered recalled.
11
+ */
12
+ /** A single conversation turn ingested into Memory before recall is evaluated: `role` is the speaker ("user" or "assistant") and `text` is the turn's content. */
13
+ interface BenchTurn {
14
+ role: "user" | "assistant";
15
+ text: string;
16
+ /** Optional session identifier — used to group turns into multi-session conversations. */
17
+ sessionId?: string;
18
+ /** Optional ISO timestamp string (presumably ISO 8601 — confirm) — used for temporal ordering and knowledge-update cases. */
19
+ ts?: string;
20
+ }
21
+ /** A single retrieval question (`question` is the query text) with gold evidence. */
22
+ interface BenchQuestion {
23
+ question: string;
24
+ /**
25
+ * Substrings that must appear (normalized) in the top-K retrieved context for this question to
26
+ * be considered correctly recalled. Recall@K = fraction of goldFacts that are present. The exported recallAtK documents an empty list as scoring 1.0.
27
+ */
28
+ goldFacts: string[];
29
+ /** Optional ground-truth answer string (used by the optional LLM judge for answer-correctness). */
30
+ answer?: string;
31
+ /** Category for the per-category breakdown (`byCategory`) in BenchReport. */
32
+ category?: "single-session" | "multi-session" | "temporal" | "knowledge-update";
33
+ }
34
+ /** A single benchmark case: an `id`, the `turns` to ingest, and the `questions` evaluated afterwards. runMemoryBench is documented as using a fresh Memory instance per case. */
35
+ interface BenchCase {
36
+ id: string;
37
+ turns: BenchTurn[];
38
+ questions: BenchQuestion[];
39
+ }
40
+ /** A named benchmark dataset containing one or more cases. `name` is presumably what BenchReport.dataset reports — confirm. */
41
+ interface BenchDataset {
42
+ name: string;
43
+ cases: BenchCase[];
44
+ }
45
+ /** Per-question result recorded in BenchReport (nested under CaseResult.questions). `category` mirrors the optional BenchQuestion category; `recallAtK` is presumably foundFacts / totalFacts — confirm. */
46
+ interface QuestionResult {
47
+ caseId: string;
48
+ question: string;
49
+ category?: BenchQuestion["category"];
50
+ recallAtK: number;
51
+ /** Snippets retrieved (text only, for inspection). */
52
+ retrieved: string[];
53
+ /** Gold facts found (`foundFacts`) vs expected (`totalFacts`, the next field). */
54
+ foundFacts: number;
55
+ totalFacts: number;
56
+ }
57
+ /** Per-case summary in BenchReport (one entry of `perCase`): the case's mean recall plus its per-question results. `n` is presumably the number of questions averaged — confirm. */
58
+ interface CaseResult {
59
+ caseId: string;
60
+ recallAtK: {
61
+ mean: number;
62
+ n: number;
63
+ };
64
+ questions: QuestionResult[];
65
+ }
66
+ /** The complete report produced by runMemoryBench: overall recall (`recallAtK`), a breakdown keyed by category (`byCategory`) and per-case results (`perCase`). NOTE(review): how questions without a `category` are keyed in `byCategory` is not visible here — confirm. */
67
+ interface BenchReport {
68
+ dataset: string;
69
+ recallAtK: {
70
+ mean: number;
71
+ n: number;
72
+ };
73
+ byCategory: Record<string, {
74
+ mean: number;
75
+ n: number;
76
+ }>;
77
+ perCase: CaseResult[];
78
+ }
79
+
80
+ /**
81
+ * Deterministic recall metric for the memory benchmark harness (§6.10).
82
+ *
83
+ * recallAtK: fraction of gold facts whose text appears (normalized substring match) in the top-K
84
+ * retrieved context snippets. No model required — fully deterministic.
85
+ */
86
+ /**
87
+ * Normalize a string for comparison: lowercase, collapse whitespace, strip leading/trailing
88
+ * punctuation from each word so that "Alice's" and "Alice" both match. This is intentionally
89
+ * lenient — the point is to detect whether the semantic content was retrieved, not to penalize
90
+ * minor formatting differences. NOTE(review): the apostrophe in "Alice's" sits inside the word, so it is unclear how edge-stripping alone maps it to "Alice" — confirm against the implementation.
91
+ */
92
+ declare function normalizeText(text: string): string;
93
+ /**
94
+ * Check whether `needle` appears in `haystack` after normalization (presumably both arguments go through normalizeText — confirm). Note the argument order: haystack first, then needle.
95
+ * Uses substring matching so partial matches count (e.g. "TypeScript" found in a longer sentence).
96
+ */
97
+ declare function normalizedIncludes(haystack: string, needle: string): boolean;
98
+ /**
99
+ * Compute recall@K: the fraction of gold facts that appear as substrings in the retrieved context.
100
+ *
101
+ * @param retrievedSnippets - The text of the top-K retrieved memory snippets. No K parameter is taken here, so the caller supplies the already-truncated list.
102
+ * @param goldFacts - The gold evidence substrings that must be found in the retrieved context.
103
+ * @returns A value in [0, 1]. Returns 1.0 when goldFacts is empty (vacuously true).
104
+ */
105
+ declare function recallAtK(retrievedSnippets: string[], goldFacts: string[]): number;
106
+ /**
107
+ * Compute fact recall against an asserted knowledge graph (KG).
108
+ *
109
+ * Given a set of expected (subject, predicate, object) triples as plain text strings (e.g.
110
+ * "Alice lives in London"), checks how many appear in the provided fact texts. Useful when the
111
+ * benchmark asserts facts into the temporal KG and queries them back.
112
+ *
113
+ * @param factTexts - Texts of facts retrieved from the KG (e.g. "subject predicate object" strings).
114
+ * @param goldFacts - Expected fact substrings.
115
+ * @returns Fraction found in [0, 1]. NOTE(review): the result for an empty goldFacts list is not stated here (recallAtK documents 1.0) — confirm.
116
+ */
117
+ declare function factRecall(factTexts: string[], goldFacts: string[]): number;
118
+
119
+ /**
120
+ * Core benchmark runner for the Eidentic memory benchmark harness (§6.10).
121
+ *
122
+ * runMemoryBench: ingests a multi-session conversation into a fresh Memory instance, then for each
123
+ * question retrieves and checks whether the gold evidence was recalled. Aggregates into BenchReport.
124
+ */
125
+
126
+ /** Options for runMemoryBench; every field is optional. */
127
+ interface BenchOptions {
128
+ /** Number of top snippets to retrieve per question. Default 8. */
129
+ topK?: number;
130
+ /**
131
+ * Optional LLM judge (a ModelPort). When provided AND a question has an `answer` field,
132
+ * also scores answer-correctness via eval's llmJudge pattern (gated/optional).
133
+ * No judge means deterministic recall-only mode — safe for CI. NOTE(review): the BenchReport / QuestionResult shapes declared in this file expose no field for the judge's score — confirm where it is reported.
134
+ */
135
+ judge?: ModelPort;
136
+ }
137
+ /**
138
+ * Run a BenchDataset through a fresh Memory instance per case and produce a BenchReport.
139
+ *
140
+ * @param makeMemory - Factory called once per case to produce a fresh, empty Memory; it may return the instance directly or a Promise of it.
141
+ * @param dataset - The benchmark dataset to evaluate.
142
+ * @param opts - Optional settings: `topK` (documented default 8) and `judge` (see BenchOptions).
143
+ */
144
+ declare function runMemoryBench(makeMemory: () => Promise<Memory> | Memory, dataset: BenchDataset, opts?: BenchOptions): Promise<BenchReport>;
145
+
146
+ /**
147
+ * Bundled synthetic BenchDataset for CI (§6.10); its type makes it usable as the `dataset` argument of runMemoryBench.
148
+ *
149
+ * Covers all four BenchQuestion categories: single-session, multi-session, temporal, knowledge-update.
150
+ * Deterministic, hand-written, no real models or large files needed.
151
+ *
152
+ * Design notes for FakeEmbedder compatibility:
153
+ * FakeEmbedder hashes word tokens into a fixed-dim bag-of-words vector. Recall is highest
154
+ * when the gold fact shares tokens with the question OR the ingested turn. Gold facts are
155
+ * therefore written as verbatim substrings of ingested turns so that lexical recall works,
156
+ * and questions use overlapping vocabulary so semantic (vector) recall also fires.
157
+ */
158
+
159
+ declare const syntheticDataset: BenchDataset;
160
+
161
+ /**
162
+ * Load a LongMemEval JSON file and convert it to a BenchDataset (asynchronously; the result is delivered through a Promise).
163
+ *
164
+ * @param jsonPath - Absolute or relative path to the LongMemEval JSON file.
165
+ * **Security note:** callers must validate untrusted paths before passing
166
+ * them here; this function does not perform path-traversal checks.
167
+ * @param opts.maxBytes - Maximum allowed file size in bytes (default 256 MiB).
168
+ * Increase this only if you are loading a vetted, trusted dataset file. NOTE(review): what happens when the file exceeds this limit (presumably the Promise rejects) is not stated — confirm.
169
+ */
170
+ declare function loadLongMemEval(jsonPath: string, opts?: {
171
+ maxBytes?: number;
172
+ }): Promise<BenchDataset>;
173
+ /**
174
+ * Load a LoCoMo JSON file and convert it to a BenchDataset (asynchronously; the result is delivered through a Promise).
175
+ *
176
+ * @param jsonPath - Absolute or relative path to the LoCoMo JSON file.
177
+ * **Security note:** callers must validate untrusted paths before passing
178
+ * them here; this function does not perform path-traversal checks.
179
+ * @param opts.maxBytes - Maximum allowed file size in bytes (default 256 MiB).
180
+ * Increase this only if you are loading a vetted, trusted dataset file. NOTE(review): what happens when the file exceeds this limit (presumably the Promise rejects) is not stated — confirm.
181
+ */
182
+ declare function loadLoCoMo(jsonPath: string, opts?: {
183
+ maxBytes?: number;
184
+ }): Promise<BenchDataset>;
185
+
186
+ /**
187
+ * Write-quality benchmark for the Eidentic memory harness.
188
+ *
189
+ * Measures three orthogonal write-side properties that retrieval-only benchmarks miss:
190
+ *
191
+ * 1. Contradiction suppression — when a newer fact contradicts an older one, the
192
+ * stale fact should be suppressed (invalidated) and the current fact should win.
193
+ *
194
+ * 2. Junk resistance — the system should NOT store system-prompt dumps, tool outputs,
195
+ * transient task chatter, or other non-durable content. Measured as:
196
+ * junkRate = junk items stored / junk items fed (want: 0)
197
+ * factRecall = real facts stored / real facts fed (want: 1)
198
+ *
199
+ * 3. Duplicate resistance — re-ingesting the same events (across simulated sessions,
200
+ * including recalled-echo patterns) should not inflate the store.
201
+ * duplicateRate = extra copies written / total re-ingest events (want: 0)
202
+ *
203
+ * All of these metrics are reported alongside cost-transparency fields: llmCallsPerWrite and
204
+ * tokensUsedIfAny. With MockModel these are deterministic counts.
205
+ *
206
+ * None of these metrics require a real LLM or external infrastructure.
207
+ */
208
+
209
+ /** A single contradiction fixture: subject + predicate, initial value, then an updated value. */
210
+ interface ContradictionFixture {
211
+ subject: string;
212
+ predicate: string;
213
+ /** The stale (earlier) value. Should be invalidated after the update is asserted. */
214
+ staleObject: string;
215
+ /** The current (later) value. Should win after assertion. */
216
+ currentObject: string;
217
+ /** ISO timestamps establishing temporal order for the stale and current values (this comment covers both `staleFrom` and `currentFrom`). Invariant: staleFrom < currentFrom. */
218
+ staleFrom: string;
219
+ currentFrom: string;
220
+ }
221
+ /** A single junk-resistance item in the mixed-stream fixture; `text` is the item's raw content. */
222
+ interface JunkItem {
223
+ /**
224
+ * "real" — a genuine durable user fact that SHOULD be recorded.
225
+ * "junk" — content that SHOULD NOT be stored (system prompt, tool output, transient state, agent scratchpad — see `junkKind`).
226
+ */
227
+ kind: "real" | "junk";
228
+ /** Sub-classification of junk items for reporting (not used in scoring); presumably left unset for "real" items — confirm. */
229
+ junkKind?: "system-prompt" | "tool-output" | "transient-state" | "agent-scratchpad";
230
+ text: string;
231
+ /** For "real" items: the expected fact triple that proves storage. */
232
+ expectedFact?: {
233
+ subject: string;
234
+ predicate: string;
235
+ object: string;
236
+ };
237
+ }
238
+ /** Per-item detail entry in WriteQualityReport.details. `kind` names which of the three write-quality checks (contradiction, junk, duplicate) the entry belongs to; `passed` presumably records whether that item met its expectation, with `label` / `note` as free-form text — confirm. */
239
+ interface WriteQualityDetail {
240
+ kind: "contradiction" | "junk" | "duplicate";
241
+ label: string;
242
+ passed: boolean;
243
+ note?: string;
244
+ }
245
+ /** Full report produced by runWriteQualityBench. NOTE(review): the value of each ratio when its denominator is zero (e.g. an empty fixture list) is not visible here — confirm. */
246
+ interface WriteQualityReport {
247
+ /**
248
+ * Fraction of contradiction pairs where the CURRENT fact wins and the stale fact is
249
+ * suppressed / invalidated. Range [0, 1]. Higher is better.
250
+ */
251
+ contradictionAccuracy: number;
252
+ /**
253
+ * Fraction of junk items that were stored (lower is better — want 0).
254
+ * junkRate = junk items stored / junk items fed.
255
+ */
256
+ junkRate: number;
257
+ /**
258
+ * Fraction of real facts that were stored (higher is better — want 1).
259
+ * factRecall = real facts stored / real facts fed. Distinct from the exported factRecall() function, which scores gold substrings against fact texts.
260
+ */
261
+ factRecall: number;
262
+ /**
263
+ * Fraction of re-ingested duplicate events that resulted in an additional write
264
+ * (lower is better — want 0).
265
+ * duplicateRate = extra copies written / total re-ingest events.
266
+ */
267
+ duplicateRate: number;
268
+ /**
269
+ * LLM calls issued per ingest write (averaged across all writes in the bench run).
270
+ * With MockModel this is a deterministic count.
271
+ */
272
+ llmCallsPerWrite: number;
273
+ /**
274
+ * Total tokens consumed across all LLM calls in the bench run (input + output).
275
+ * With MockModel this is a deterministic count.
276
+ */
277
+ tokensUsedIfAny: number;
278
+ /** Per-item details for inspection and debugging. */
279
+ details: WriteQualityDetail[];
280
+ }
281
+ /**
282
+ * Built-in contradiction fixtures; documented as the default for WriteQualityOptions.contradictionFixtures.
283
+ * Each pair represents a fact that changes over time: employer, city, job title.
284
+ */
285
+ declare const CONTRADICTION_FIXTURES: ContradictionFixture[];
286
+ /**
287
+ * Built-in mixed-stream fixture set for junk-resistance testing; documented as the default for WriteQualityOptions.junkStreamFixtures.
288
+ *
289
+ * Designed to exercise the Consolidator / passive-extraction rejection gates:
290
+ * - System-prompt content → REJECT (config, not user fact)
291
+ * - Tool output / API response → REJECT (derived data, not stated fact)
292
+ * - Transient in-progress state → REJECT (ephemeral)
293
+ * - Agent scratchpad / reasoning → REJECT (internal)
294
+ * - Real user facts → KEEP (items whose `kind` is "real")
295
+ */
296
+ declare const JUNK_STREAM_FIXTURES: JunkItem[];
297
+ interface WriteQualityOptions {
298
+ /**
299
+ * Custom contradiction fixtures, used in place of the built-in set when provided. Defaults to CONTRADICTION_FIXTURES.
300
+ */
301
+ contradictionFixtures?: ContradictionFixture[];
302
+ /**
303
+ * Custom junk-stream fixtures, used in place of the built-in set when provided. Defaults to JUNK_STREAM_FIXTURES.
304
+ */
305
+ junkStreamFixtures?: JunkItem[];
306
+ /**
307
+ * Number of simulated sessions for duplicate-resistance measurement.
308
+ * Each session re-ingests the same real facts (presumably the fixture items whose kind is "real" — confirm). Default: 3.
309
+ */
310
+ duplicateSessions?: number;
311
+ /**
312
+ * Scope to use for all bench operations. Defaults to a stable bench scope (the concrete default value is not shown in this declaration).
313
+ */
314
+ scope?: Scope;
315
+ }
316
+ /**
317
+ * Run the write-quality benchmark against a Memory instance.
318
+ *
319
+ * The Memory instance must have a graph configured (to use assertFact / queryFacts). NOTE(review): the failure mode when no graph is configured is not stated — confirm.
320
+ * For junk-resistance, the test exercises the passive-extraction path via
321
+ * assertFact directly using a MockModel-driven Consolidator pattern — or, when no
322
+ * model is configured, falls back to direct assertFact to verify the KG behavior.
323
+ *
324
+ * Cost transparency: every LLM call and token count is tracked and reported (as llmCallsPerWrite and tokensUsedIfAny on the returned report).
325
+ *
326
+ * @param memory - A fresh Memory instance with a graph backend.
327
+ * @param opts - Optional customization: fixtures, duplicate-session count and scope (see WriteQualityOptions).
328
+ */
329
+ declare function runWriteQualityBench(memory: Memory, opts?: WriteQualityOptions): Promise<WriteQualityReport>;
330
+
331
+ /**
332
+ * Synthetic temporal benchmark dataset generator.
333
+ *
334
+ * Generates K entities with state histories (employer / city / preference changing
335
+ * over time at known dates). Produces:
336
+ * - A sequence of AssertFactInput calls that establish the temporal record.
337
+ * - A question set "what was X's <property> at <date>?" with gold answers.
338
+ * Covers dates:
339
+ * • BETWEEN two changes (mid-interval)
340
+ * • AT the exact change boundary
341
+ * • BEFORE first known fact (expect: no answer)
342
+ * • AT the current/latest state
343
+ *
344
+ * This benchmark is ONLY passable by systems with timestamped fact validity (validAt).
345
+ * Pure retrieval systems that do not record fact-level temporal intervals cannot pass it —
346
+ * this distinction is documented plainly in BASELINES.md.
347
+ *
348
+ * The generator is seeded for determinism (seed parameter controls entity names +
349
+ * timestamps so the same seed always produces the same question/answer pairs).
350
+ */
351
+
352
+ /** A single state transition for one entity property; transitions are collected per predicate, in order, in TemporalEntity.history. */
353
+ interface StateTransition {
354
+ /** ISO timestamp at which this state becomes valid. */
355
+ validFrom: string;
356
+ /** The value of the property from `validFrom` onward (presumably until the next transition — confirm). */
357
+ value: string;
358
+ }
359
+ /** One entity with one or more tracked properties and their state history. */
360
+ interface TemporalEntity {
361
+ /** Stable entity name (e.g. "entity_0"); presumably the `subject` used in TemporalQuestion — confirm. */
362
+ name: string;
363
+ /** Map from predicate name (e.g. "employer") to that predicate's ordered history of transitions (presumably ascending by validFrom — confirm). */
364
+ history: Record<string, StateTransition[]>;
365
+ }
366
+ /** A single point-in-time question (what was `subject`'s `predicate` at `askedAt`?) with gold answer. */
367
+ interface TemporalQuestion {
368
+ subject: string;
369
+ predicate: string;
370
+ /** The ISO date at which the question is asked; runTemporalBench is documented as passing it to queryFacts as `validAt`. */
371
+ askedAt: string;
372
+ /**
373
+ * Gold answer: the value of the property at askedAt, or null if no fact was known
374
+ * before askedAt (query before first known state). NOTE(review): whether a transition whose validFrom equals askedAt already counts as known (inclusive boundary) is not stated — confirm.
375
+ */
376
+ goldAnswer: string | null;
377
+ /** Human-readable description of why this answer is correct. */
378
+ rationale: string;
379
+ }
380
+ /** The complete synthetic temporal dataset, as returned by syntheticTemporalDataset() and accepted by runTemporalBench(). */
381
+ interface SyntheticTemporalDataset {
382
+ /** Human-readable name for display (presumably what TemporalBenchReport.datasetName reports — confirm). */
383
+ name: string;
384
+ /** Seed used to generate this dataset (for reproducibility). */
385
+ seed: number;
386
+ /** Generated entities with full state histories. */
387
+ entities: TemporalEntity[];
388
+ /**
389
+ * AssertFactInput objects ready to be passed to memory.assertFact, in ascending
390
+ * validFrom order. Call with the scope of your choice.
391
+ */
392
+ asserts: AssertFactInput[];
393
+ /** Point-in-time QA pairs for evaluation. */
394
+ questions: TemporalQuestion[];
395
+ }
396
+ /**
397
+ * Generate a synthetic temporal dataset (returned synchronously; every option is optional).
398
+ *
399
+ * @param opts.entityCount - Number of entities (default 4).
400
+ * @param opts.seed - Seed for determinism (default 42).
401
+ * @param opts.changesPerProperty - Number of state changes per property per entity (default 3).
402
+ */
403
+ declare function syntheticTemporalDataset(opts?: {
404
+ entityCount?: number;
405
+ seed?: number;
406
+ changesPerProperty?: number;
407
+ }): SyntheticTemporalDataset;
408
+
409
+ /**
410
+ * Temporal point-in-time QA benchmark (`runTemporalBench`).
411
+ *
412
+ * Evaluates whether a Memory instance with a graph backend can answer point-in-time
413
+ * questions: "what was X's <property> at <date>?" using the `validAt` parameter of
414
+ * queryFacts. No LLM is required — answers are compared directly to gold.
415
+ *
416
+ * Headline metrics (TemporalBenchReport additionally reports beforeFirstFactAccuracy):
417
+ * - pointInTimeAccuracy: fraction of non-current-state questions answered correctly.
418
+ * - currentStateAccuracy: fraction of "latest state" questions answered correctly.
419
+ *
420
+ * Cost transparency: llmCallsPerWrite and tokensUsedIfAny are always 0 for this
421
+ * deterministic benchmark (no model calls). Reported for API consistency.
422
+ *
423
+ * This benchmark is ONLY passable by systems with timestamped fact validity. A system
424
+ * that stores facts without validFrom/validUntil intervals cannot correctly answer
425
+ * "what was X at <past date>?" — it will always return the current state.
426
+ */
427
+
428
+ /** Per-question result in TemporalBenchReport.results: the TemporalQuestion fields plus the system's answer. `systemAnswer` is presumably null when no fact was returned, and `correct` presumably means systemAnswer matched goldAnswer — confirm. */
429
+ interface TemporalQuestionResult {
430
+ subject: string;
431
+ predicate: string;
432
+ askedAt: string;
433
+ goldAnswer: string | null;
434
+ systemAnswer: string | null;
435
+ correct: boolean;
436
+ /** Question classification. Documented values: "before-first-fact" | "at-boundary" | "mid-interval" | "current-state" (the field is declared as plain `string`, so the compiler does not enforce them). */
437
+ questionType: string;
438
+ rationale: string;
439
+ }
440
+ /** Full report produced by runTemporalBench. */
441
+ interface TemporalBenchReport {
442
+ datasetName: string;
443
+ /**
444
+ * Accuracy on questions whose askedAt is BEFORE or BETWEEN known states
445
+ * (excludes current-state questions). Range [0, 1]. Higher is better. NOTE(review): "at-boundary" questions are not mentioned here; presumably they count toward this metric — confirm.
446
+ */
447
+ pointInTimeAccuracy: number;
448
+ /**
449
+ * Accuracy on questions asking for the latest / current state.
450
+ * Range [0, 1]. Higher is better.
451
+ */
452
+ currentStateAccuracy: number;
453
+ /**
454
+ * Accuracy on questions asked BEFORE the first known fact (correct answer: null/no result).
455
+ * Range [0, 1]. A system that returns stale facts here fails this metric. As worded, these questions also fall under the "BEFORE" part of pointInTimeAccuracy.
456
+ */
457
+ beforeFirstFactAccuracy: number;
458
+ /** Total questions evaluated. */
459
+ totalQuestions: number;
460
+ /**
461
+ * LLM calls per write. Always 0 for this deterministic benchmark.
462
+ * Present for cost-transparency API consistency (same field name as on WriteQualityReport).
463
+ */
464
+ llmCallsPerWrite: number;
465
+ /**
466
+ * Tokens used. Always 0 for this deterministic benchmark.
467
+ * Present for cost-transparency API consistency (same field name as on WriteQualityReport).
468
+ */
469
+ tokensUsedIfAny: number;
470
+ /** Per-question results for inspection. */
471
+ results: TemporalQuestionResult[];
472
+ }
473
+ interface TemporalBenchOptions {
474
+ /**
475
+ * Scope to use for the assertFact and queryFacts calls made by runTemporalBench.
476
+ * Default when omitted: { kind: "agent", agentId: "bench:temporal" }
477
+ */
478
+ scope?: Scope;
479
+ }
480
+ /**
481
+ * Run the temporal point-in-time QA benchmark.
482
+ *
483
+ * Workflow:
484
+ * 1. Assert all facts from dataset.asserts into the memory graph, in ascending
485
+ * validFrom order (required for temporal-order correctness).
486
+ * 2. For each question in dataset.questions, call queryFacts({ validAt: q.askedAt }).
487
+ * 3. Compare the returned fact's object to q.goldAnswer (a null gold answer expects no result).
488
+ * 4. Aggregate into TemporalBenchReport.
489
+ *
490
+ * @param memory - A Memory instance with a configured graph backend (required).
491
+ * @param dataset - The synthetic temporal dataset produced by syntheticTemporalDataset().
492
+ * @param opts - Optional scope configuration (see TemporalBenchOptions).
493
+ */
494
+ declare function runTemporalBench(memory: Memory, dataset: SyntheticTemporalDataset, opts?: TemporalBenchOptions): Promise<TemporalBenchReport>;
495
+
496
+ export { type BenchCase, type BenchDataset, type BenchOptions, type BenchQuestion, type BenchReport, type BenchTurn, CONTRADICTION_FIXTURES, type CaseResult, type ContradictionFixture, JUNK_STREAM_FIXTURES, type JunkItem, type QuestionResult, type StateTransition, type SyntheticTemporalDataset, type TemporalBenchOptions, type TemporalBenchReport, type TemporalEntity, type TemporalQuestion, type TemporalQuestionResult, type WriteQualityDetail, type WriteQualityOptions, type WriteQualityReport, factRecall, loadLoCoMo, loadLongMemEval, normalizeText, normalizedIncludes, recallAtK, runMemoryBench, runTemporalBench, runWriteQualityBench, syntheticDataset, syntheticTemporalDataset };