@eidentic/bench 0.1.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
package/dist/index.cjs ADDED
@@ -0,0 +1,1006 @@
1
+ "use strict";
2
+ var __create = Object.create; // local aliases for the Object built-ins used by the interop helpers below
3
+ var __defProp = Object.defineProperty;
4
+ var __getOwnPropDesc = Object.getOwnPropertyDescriptor;
5
+ var __getOwnPropNames = Object.getOwnPropertyNames;
6
+ var __getProtoOf = Object.getPrototypeOf;
7
+ var __hasOwnProp = Object.prototype.hasOwnProperty;
8
+ var __export = (target, all) => { // installs one enumerable getter on `target` per key of `all`
9
+ for (var name in all)
10
+ __defProp(target, name, { get: all[name], enumerable: true }); // all[name] is itself used as the getter, so the value is read lazily
11
+ };
12
+ var __copyProps = (to, from, except, desc) => { // mirrors own properties of `from` onto `to` as getters; returns `to`
13
+ if (from && typeof from === "object" || typeof from === "function") { // only objects and functions have properties to copy
14
+ for (let key of __getOwnPropNames(from))
15
+ if (!__hasOwnProp.call(to, key) && key !== except) // never overwrite an existing own key; skip the `except` key
16
+ __defProp(to, key, { get: () => from[key], enumerable: !(desc = __getOwnPropDesc(from, key)) || desc.enumerable }); // enumerable follows the source descriptor (true when no descriptor is found)
17
+ }
18
+ return to;
19
+ };
20
+ var __toESM = (mod, isNodeMode, target) => (target = mod != null ? __create(__getProtoOf(mod)) : {}, __copyProps( // target inherits mod's prototype, or is a plain object when mod is null/undefined
21
+ // If the importer is in node compatibility mode or this is not an ESM
22
+ // file that has been converted to a CommonJS file using a Babel-
23
+ // compatible transform (i.e. "__esModule" has not been set), then set
24
+ // "default" to the CommonJS "module.exports" for node compatibility.
25
+ isNodeMode || !mod || !mod.__esModule ? __defProp(target, "default", { value: mod, enumerable: true }) : target,
26
+ mod
27
+ ));
28
+ var __toCommonJS = (mod) => __copyProps(__defProp({}, "__esModule", { value: true }), mod); // fresh object tagged __esModule: true that forwards every own property of `mod`
29
+
30
+ // src/index.ts
31
+ var index_exports = {}; // receives one getter per public export of src/index.ts
32
+ __export(index_exports, { // each arrow resolves its binding only when the export is read, so the definitions may appear later in this file
33
+ CONTRADICTION_FIXTURES: () => CONTRADICTION_FIXTURES,
34
+ JUNK_STREAM_FIXTURES: () => JUNK_STREAM_FIXTURES,
35
+ factRecall: () => factRecall,
36
+ loadLoCoMo: () => loadLoCoMo,
37
+ loadLongMemEval: () => loadLongMemEval,
38
+ normalizeText: () => normalizeText,
39
+ normalizedIncludes: () => normalizedIncludes,
40
+ recallAtK: () => recallAtK,
41
+ runMemoryBench: () => runMemoryBench,
42
+ runTemporalBench: () => runTemporalBench,
43
+ runWriteQualityBench: () => runWriteQualityBench,
44
+ syntheticDataset: () => syntheticDataset,
45
+ syntheticTemporalDataset: () => syntheticTemporalDataset
46
+ });
47
+ module.exports = __toCommonJS(index_exports); // publish the getters on a new object that also carries __esModule: true
48
+
49
+ // src/recall.ts
50
/**
 * Canonicalises text for fuzzy substring comparison.
 *
 * Steps: lower-case, delete apostrophes and backticks (so "user's" and
 * "users" compare equal), turn every other punctuation character into a
 * space, collapse whitespace runs, and trim.
 *
 * Letters, combining marks and digits are matched with Unicode property
 * escapes rather than the ASCII-only `\w`. With `\w`, non-ASCII text such as
 * "São Paulo" or "東京" lost its letters, and a gold fact written entirely in
 * a non-Latin script normalised to "" and therefore matched every haystack.
 * Typographic apostrophes (U+2018 / U+2019) are removed the same way as the
 * straight one. Results for pure-ASCII input are unchanged.
 *
 * @param {string} text - Raw text.
 * @returns {string} The normalised text.
 */
function normalizeText(text) {
  return text
    .toLowerCase()
    .replace(/['\u2018\u2019`]/g, "")
    .replace(/[^\p{L}\p{M}\p{N}_\s]/gu, " ")
    .replace(/\s+/g, " ")
    .trim();
}
53
/**
 * Reports whether `needle` occurs inside `haystack` once both have been
 * passed through `normalizeText`.
 *
 * @param {string} haystack - Text to search in.
 * @param {string} needle - Text to search for.
 * @returns {boolean} True when the normalised needle is a substring of the
 *   normalised haystack.
 */
function normalizedIncludes(haystack, needle) {
  const canonicalHaystack = normalizeText(haystack);
  const canonicalNeedle = normalizeText(needle);
  return canonicalHaystack.includes(canonicalNeedle);
}
56
/**
 * Computes the fraction of gold facts that appear, after normalisation, in
 * the concatenation of the retrieved snippets.
 *
 * Gold facts that are empty or whitespace-only are ignored. When no gold
 * fact remains the result is 1.
 *
 * The joined corpus is normalised once up front instead of once per gold
 * fact; the returned value is the same as before.
 *
 * @param {string[]} retrievedSnippets - Snippet texts returned by retrieval.
 * @param {string[]} goldFacts - Facts expected to be present.
 * @returns {number} Recall in the range [0, 1].
 */
function recallAtK(retrievedSnippets, goldFacts) {
  const scorableFacts = goldFacts.filter((fact) => fact.trim().length > 0);
  if (scorableFacts.length === 0) return 1;
  // Snippets are joined with a space so adjacent snippets do not fuse into one word.
  const corpus = normalizeText(retrievedSnippets.join(" "));
  let found = 0;
  for (const fact of scorableFacts) {
    if (corpus.includes(normalizeText(fact))) found += 1;
  }
  return found / scorableFacts.length;
}
66
/**
 * Scores a list of fact texts against the gold facts by delegating to
 * `recallAtK` with the arguments unchanged.
 *
 * @param {string[]} factTexts - Fact texts to score.
 * @param {string[]} goldFacts - Facts expected to be present.
 * @returns {number} Whatever `recallAtK` returns for the same arguments.
 */
function factRecall(factTexts, goldFacts) {
  const score = recallAtK(factTexts, goldFacts);
  return score;
}
69
+
70
+ // src/run.ts
71
/**
 * Builds the agent scope object used for one benchmark case.
 *
 * @param {string} caseId - Identifier of the benchmark case.
 * @returns {{kind: string, agentId: string}} Scope with kind "agent" and an
 *   agent id of the form `bench:<caseId>`.
 */
function caseScope(caseId) {
  const agentId = `bench:${caseId}`;
  return { kind: "agent", agentId };
}
74
/**
 * Arithmetic mean of a list of numbers.
 *
 * @param {number[]} ns - Values to average.
 * @returns {number} The mean, or 0 for an empty list.
 */
function mean(ns) {
  const count = ns.length;
  if (count === 0) return 0;
  let total = 0;
  ns.forEach((value) => {
    total = total + value;
  });
  return total / count;
}
78
/**
 * Runs the retrieval-recall benchmark over every case of a dataset.
 *
 * For each case a fresh memory is created with `makeMemory`, all turns are
 * ingested as events in a per-case scope, and every question is answered by
 * `memory.retrieve`; the retrieved snippet texts are scored with `recallAtK`.
 *
 * Changes compared with the previous implementation:
 * - Per-category scores are accumulated in a Map and emitted with
 *   `Object.fromEntries`, so a category named like an Object.prototype member
 *   ("constructor", "toString", "__proto__") no longer throws or disappears.
 * - `foundFacts` / `totalFacts` count only non-blank gold facts, matching the
 *   denominator that `recallAtK` uses for the score.
 *
 * @param {() => (object|Promise<object>)} makeMemory - Factory returning a
 *   memory exposing `ingest(events)` and `retrieve({ text, scope, topK })`.
 * @param {{name: string, cases: Array}} dataset - Cases with `id`, `turns`
 *   and `questions`.
 * @param {{topK?: number}} [opts] - `topK` defaults to 8.
 * @returns {Promise<object>} Report with `dataset`, overall `recallAtK`,
 *   `byCategory` and `perCase` entries.
 */
async function runMemoryBench(makeMemory, dataset, opts) {
  const topK = opts?.topK ?? 8;
  const perCase = [];
  const allScores = [];
  const scoresByCategory = new Map();
  for (const benchCase of dataset.cases) {
    const memory = await makeMemory();
    const scope = caseScope(benchCase.id);
    const events = benchCase.turns.map((turn, i) => ({
      id: `${benchCase.id}:turn:${i}`,
      scope,
      text: `[${turn.role}] ${turn.text}`
      // subject is not set here — the benchmark does not model multi-tenant user identity per-turn.
    }));
    if (events.length > 0) {
      await memory.ingest(events);
    }
    const questionResults = [];
    for (const q of benchCase.questions) {
      const retrieved = await memory.retrieve({ text: q.question, scope, topK });
      const snippetTexts = retrieved.snippets.map((s) => s.text);
      const score = recallAtK(snippetTexts, q.goldFacts);
      // recallAtK skips blank gold facts, so the counts must skip them too.
      const scorableFacts = q.goldFacts.filter((fact) => fact.trim().length > 0).length;
      questionResults.push({
        caseId: benchCase.id,
        question: q.question,
        category: q.category,
        recallAtK: score,
        retrieved: snippetTexts,
        foundFacts: Math.round(score * scorableFacts),
        totalFacts: scorableFacts
      });
      allScores.push(score);
      const cat = q.category ?? "uncategorized";
      const bucket = scoresByCategory.get(cat);
      if (bucket === undefined) {
        scoresByCategory.set(cat, [score]);
      } else {
        bucket.push(score);
      }
    }
    const caseScores = questionResults.map((r) => r.recallAtK);
    perCase.push({
      caseId: benchCase.id,
      recallAtK: { mean: mean(caseScores), n: caseScores.length },
      questions: questionResults
    });
  }
  const byCategory = Object.fromEntries(
    [...scoresByCategory].map(([cat, scores]) => [cat, { mean: mean(scores), n: scores.length }])
  );
  return {
    dataset: dataset.name,
    recallAtK: { mean: mean(allScores), n: allScores.length },
    byCategory,
    perCase
  };
}
132
+
133
+ // src/datasets/synthetic.ts
134
/**
 * Built-in synthetic dataset ("eidentic-synthetic-v1") with five small
 * conversations: two single-session cases, one multi-session case, one
 * temporal-update case and one knowledge-update case.
 *
 * Turns carry `role`, `text`, `sessionId` and, where the conversation is
 * dated, `ts`. Questions carry `question`, `goldFacts`, `answer`, `category`.
 */
var syntheticDataset = (() => {
  // Builds one turn; `ts` is only present on turns that were given a timestamp.
  const say = (role, text, sessionId, ts) =>
    ts === undefined ? { role, text, sessionId } : { role, text, sessionId, ts };
  const user = (text, sessionId, ts) => say("user", text, sessionId, ts);
  const assistant = (text, sessionId, ts) => say("assistant", text, sessionId, ts);
  // Builds one question record.
  const ask = (category, question, goldFacts, answer) => ({ question, goldFacts, answer, category });

  // Case 1: single-session
  const singleSessionBasics = {
    id: "single-session-basics",
    turns: [
      user("My name is Alice and I live in London. I am a software engineer.", "s1"),
      assistant("Nice to meet you Alice! London is a great city for tech.", "s1"),
      user("I prefer TypeScript over Python for backend work.", "s1"),
      assistant("TypeScript is indeed excellent for backend development.", "s1")
    ],
    questions: [
      ask("single-session", "Where does Alice live?", ["Alice", "London"], "London"),
      ask(
        "single-session",
        "What programming language does the user prefer for backend work?",
        ["TypeScript", "backend"],
        "TypeScript"
      )
    ]
  };

  // Case 2: multi-session (hobby in session A, work in session B, both in session C)
  const multiSessionCross = {
    id: "multi-session-cross",
    turns: [
      user("I enjoy playing chess on weekends.", "session-a", "2026-01-01T10:00:00Z"),
      assistant("Chess is a great mental exercise!", "session-a", "2026-01-01T10:00:01Z"),
      user("At work I use React for frontend development.", "session-b", "2026-01-15T09:00:00Z"),
      assistant("React is very popular for frontend work.", "session-b", "2026-01-15T09:00:01Z"),
      user("After my chess game I usually write React components.", "session-c", "2026-02-01T18:00:00Z")
    ],
    questions: [
      ask("multi-session", "What hobby does the user have?", ["chess"], "Playing chess"),
      ask("multi-session", "What frontend framework does the user work with?", ["React", "frontend"], "React")
    ]
  };

  // Case 3: temporal (Berlin first, Amsterdam later)
  const temporalUpdate = {
    id: "temporal-update",
    turns: [
      user("I currently live in Berlin, Germany.", "sess-t1", "2025-06-01T08:00:00Z"),
      assistant("Berlin is a wonderful city!", "sess-t1", "2025-06-01T08:00:01Z"),
      user("I have moved to Amsterdam, Netherlands now.", "sess-t2", "2026-01-01T08:00:00Z"),
      assistant("Amsterdam is beautiful! Enjoy your new home.", "sess-t2", "2026-01-01T08:00:01Z")
    ],
    questions: [
      // Both cities are ingested; the question targets the most recent one,
      // so "Amsterdam" must appear in the retrieved context.
      ask("temporal", "Where does the user currently live?", ["Amsterdam"], "Amsterdam")
    ]
  };

  // Case 4: knowledge-update (junior role superseded by senior role)
  const knowledgeUpdateRole = {
    id: "knowledge-update-role",
    turns: [
      user("I work as a junior developer at Acme Corp.", "job-s1", "2025-01-01T09:00:00Z"),
      assistant("Exciting start at Acme Corp!", "job-s1", "2025-01-01T09:00:01Z"),
      user("I got promoted to senior developer at Acme Corp.", "job-s2", "2026-01-01T09:00:00Z"),
      assistant("Congratulations on your promotion to senior developer!", "job-s2", "2026-01-01T09:00:01Z")
    ],
    questions: [
      // The latest turn about the role says "senior developer"; that is what must be recalled.
      ask(
        "knowledge-update",
        "What is the user's current job title?",
        ["senior developer", "Acme Corp"],
        "Senior developer at Acme Corp"
      )
    ]
  };

  // Case 5: single session holding several unrelated facts
  const singleSessionRich = {
    id: "single-session-rich",
    turns: [
      user("My dog is named Max and he is a golden retriever.", "rich-s1"),
      assistant("Max sounds wonderful! Golden retrievers are very friendly.", "rich-s1"),
      user("I run marathons and my personal best time is 3 hours 45 minutes.", "rich-s1"),
      assistant("That is an impressive marathon time!", "rich-s1"),
      user("I also enjoy cooking Italian food especially pasta carbonara.", "rich-s1")
    ],
    questions: [
      ask(
        "single-session",
        "What is the user's dog's name and breed?",
        ["Max", "golden retriever"],
        "Max, a golden retriever"
      ),
      ask("single-session", "What sport does the user participate in?", ["marathon"], "Marathon running"),
      ask(
        "single-session",
        "What cuisine does the user enjoy cooking?",
        ["Italian", "pasta carbonara"],
        "Italian food, especially pasta carbonara"
      )
    ]
  };

  return {
    name: "eidentic-synthetic-v1",
    cases: [singleSessionBasics, multiSessionCross, temporalUpdate, knowledgeUpdateRole, singleSessionRich]
  };
})();
365
+
366
+ // src/loaders.ts
367
+ var import_promises = require("node:fs/promises");
368
/** Default size cap applied by the dataset loaders: 256 MiB. */
var DEFAULT_MAX_BYTES = 256 * 1024 * 1024;

/**
 * Rejects when the file at `filePath` cannot be stat'ed or is larger than
 * `maxBytes`.
 *
 * Changes compared with the previous implementation:
 * - A `maxBytes` that coerces to NaN used to make the `>` comparison always
 *   false, silently disabling the cap; it is now reported as a RangeError.
 * - The stat failure keeps the original error as `cause`, so the errno code
 *   stays available to callers.
 *
 * @param {string} filePath - File to check.
 * @param {number} [maxBytes=DEFAULT_MAX_BYTES] - Largest accepted size in bytes.
 * @returns {Promise<void>}
 * @throws {RangeError} When `maxBytes` is not a number.
 * @throws {Error} When the file cannot be stat'ed or exceeds the cap.
 */
async function assertFileSize(filePath, maxBytes = DEFAULT_MAX_BYTES) {
  if (Number.isNaN(Number(maxBytes))) {
    throw new RangeError(`bench loader: maxBytes must be a number, got ${String(maxBytes)}`);
  }
  let fileSize;
  try {
    const stats = await import_promises.stat(filePath);
    fileSize = stats.size;
  } catch (err) {
    const reason = err instanceof Error ? err.message : String(err);
    throw new Error(`bench loader: cannot stat file "${filePath}": ${reason}`, { cause: err });
  }
  if (fileSize > maxBytes) {
    const mb = (fileSize / (1024 * 1024)).toFixed(1);
    const capMb = (maxBytes / (1024 * 1024)).toFixed(0);
    throw new Error(
      `bench loader: file "${filePath}" is ${mb} MiB, which exceeds the ${capMb} MiB cap. Pass a larger maxBytes option if this is intentional.`
    );
  }
}
387
/**
 * Loads a LongMemEval-style JSON file and converts it to the benchmark
 * dataset shape.
 *
 * The file must contain a JSON array; any other root value yields a dataset
 * with no cases. For each entry the id comes from `session_id`, then `id`,
 * then `lme-<index>`; turns come from `conversation` or `dialog`; questions
 * come from `questions`, keeping only string items of `evidence` as gold facts.
 *
 * Changes compared with the previous implementation:
 * - A turn without `content` now gets the text "" instead of `undefined`,
 *   matching what `loadLoCoMo` does for a missing `text`.
 * - A JSON syntax error is rethrown as a SyntaxError that names the file and
 *   keeps the parser's error as `cause`.
 *
 * @param {string} jsonPath - Path to the JSON file.
 * @param {{maxBytes?: number}} [opts] - Optional size cap passed to `assertFileSize`.
 * @returns {Promise<{name: string, cases: Array}>} Dataset named "LongMemEval".
 * @throws {SyntaxError} When the file is not valid JSON.
 */
async function loadLongMemEval(jsonPath, opts) {
  await assertFileSize(jsonPath, opts?.maxBytes);
  const fileText = await import_promises.readFile(jsonPath, "utf-8");
  let raw;
  try {
    raw = JSON.parse(fileText);
  } catch (err) {
    throw new SyntaxError(`bench loader: invalid JSON in "${jsonPath}": ${err.message}`, { cause: err });
  }
  const cases = Array.isArray(raw) ? raw : [];
  const benchCases = cases.map((c, i) => {
    const id = String(c.session_id ?? c.id ?? `lme-${i}`);
    const convArr = c.conversation ?? c.dialog ?? [];
    const turns = convArr.map((t) => ({
      // Every role other than "assistant" is treated as the user.
      role: t.role === "assistant" ? "assistant" : "user",
      text: t.content ?? ""
    }));
    const questions = (c.questions ?? []).map((q) => ({
      question: q.question,
      goldFacts: Array.isArray(q.evidence) ? q.evidence.filter((e) => typeof e === "string") : [],
      answer: q.answer
    }));
    return { id, turns, questions };
  });
  return { name: "LongMemEval", cases: benchCases };
}
407
/**
 * Maps a LoCoMo question type label to a benchmark category.
 *
 * A label containing "temporal" maps to "temporal", otherwise one containing
 * "multi" maps to "multi-session", and any other non-empty string maps to
 * "single-session".
 *
 * Non-string values (for example a numeric type code) now yield `undefined`
 * instead of throwing on the missing `.includes` method.
 *
 * @param {unknown} t - Raw `type` value of a question.
 * @returns {string|undefined} The category, or undefined when no usable
 *   label is given.
 */
function mapLoCoMoType(t) {
  if (typeof t !== "string" || t === "") return void 0;
  if (t.includes("temporal")) return "temporal";
  if (t.includes("multi")) return "multi-session";
  return "single-session";
}
413
/**
 * Loads a LoCoMo-style JSON file and converts it to the benchmark dataset
 * shape.
 *
 * The root may be an array of conversations or an object whose `data`
 * property is such an array; anything else yields a dataset with no cases.
 * For each conversation the id comes from `conversation_id`, then `id`, then
 * `locomo-<index>`. Every dialog line of every session becomes a turn, and
 * every `qa` entry becomes a question whose category is derived with
 * `mapLoCoMoType`.
 *
 * Changes compared with the previous implementation:
 * - A JSON root of `null` (or any primitive) no longer throws while reading
 *   `.data`; it yields an empty dataset like other unrecognised roots.
 * - A JSON syntax error is rethrown as a SyntaxError that names the file and
 *   keeps the parser's error as `cause`.
 *
 * @param {string} jsonPath - Path to the JSON file.
 * @param {{maxBytes?: number}} [opts] - Optional size cap passed to `assertFileSize`.
 * @returns {Promise<{name: string, cases: Array}>} Dataset named "LoCoMo".
 * @throws {SyntaxError} When the file is not valid JSON.
 */
async function loadLoCoMo(jsonPath, opts) {
  await assertFileSize(jsonPath, opts?.maxBytes);
  const fileText = await import_promises.readFile(jsonPath, "utf-8");
  let raw;
  try {
    raw = JSON.parse(fileText);
  } catch (err) {
    throw new SyntaxError(`bench loader: invalid JSON in "${jsonPath}": ${err.message}`, { cause: err });
  }
  let cases;
  if (Array.isArray(raw)) {
    cases = raw;
  } else {
    cases = Array.isArray(raw?.data) ? raw.data : [];
  }
  const benchCases = cases.map((c, i) => {
    const id = String(c.conversation_id ?? c.id ?? `locomo-${i}`);
    const turns = [];
    for (const sess of c.sessions ?? []) {
      for (const d of sess.dialog ?? []) {
        // Only the literal speaker "system" is mapped to the assistant role.
        const role = d.speaker === "system" ? "assistant" : "user";
        turns.push({
          role,
          text: d.text ?? "",
          sessionId: sess.session_id,
          ts: d.timestamp
        });
      }
    }
    const questions = (c.qa ?? []).map((q) => ({
      question: q.question,
      goldFacts: Array.isArray(q.evidence) ? q.evidence.filter((e) => typeof e === "string") : [],
      answer: q.answer,
      category: mapLoCoMoType(q.type)
    }));
    return { id, turns, questions };
  });
  return { name: "LoCoMo", cases: benchCases };
}
447
+
448
+ // src/write-quality.ts
449
/**
 * Contradiction fixtures: for each subject/predicate pair, a stale object
 * valid from `staleFrom` and a newer object valid from `currentFrom`.
 *
 * Each row below is
 * [subject, predicate, staleObject, currentObject, staleDay, currentDay];
 * the day strings are expanded to midnight-UTC ISO timestamps.
 */
var CONTRADICTION_FIXTURES = [
  ["alice", "employer", "StartupX", "MegaCorp", "2024-01-01", "2025-06-01"],
  ["alice", "city", "Berlin", "Amsterdam", "2024-01-01", "2025-09-01"],
  ["bob", "role", "junior-developer", "senior-developer", "2023-03-01", "2025-01-01"],
  ["bob", "preferred-language", "JavaScript", "TypeScript", "2022-06-01", "2024-01-01"],
  ["carol", "employer", "OldAgency", "NewAgency", "2021-01-01", "2025-03-01"]
].map(([subject, predicate, staleObject, currentObject, staleDay, currentDay]) => ({
  subject,
  predicate,
  staleObject,
  currentObject,
  staleFrom: `${staleDay}T00:00:00.000Z`,
  currentFrom: `${currentDay}T00:00:00.000Z`
}));
491
/**
 * Mixed stream of inputs for the junk-rejection benchmark.
 *
 * "real" items carry the fact that is expected to be stored (`expectedFact`);
 * "junk" items carry a `junkKind` naming the sort of noise they represent
 * (system prompt, tool output, transient state, agent scratchpad).
 */
var JUNK_STREAM_FIXTURES = (() => {
  const real = (text, subject, predicate, object) => ({
    kind: "real",
    text,
    expectedFact: { subject, predicate, object }
  });
  const junk = (junkKind, text) => ({ kind: "junk", junkKind, text });
  return [
    // Real facts (should be stored)
    real("My name is Dana and I live in Toronto.", "Dana", "city", "Toronto"),
    real("I work at HealthCo as a data scientist.", "user", "works_at", "HealthCo"),
    real("I prefer Python for machine learning projects.", "user", "likes", "Python"),
    real("My dog is named Biscuit and she is a labrador.", "user", "has_pet", "Biscuit"),
    // Junk: system-prompt content
    junk("system-prompt", "[SYSTEM] You are a helpful assistant. Follow all instructions carefully and never refuse."),
    junk("system-prompt", "SYSTEM PROMPT: Always respond in JSON format. Reject unsafe requests. API version: v3."),
    junk("system-prompt", "You are configured as a customer support agent for Acme Corp. Your persona is 'Aria'."),
    // Junk: tool output / API response
    junk("tool-output", 'Tool result: {"status": "ok", "rows": 42, "query_time_ms": 18, "engine": "postgres"}'),
    junk("tool-output", "search_web result: [{'url': 'https://example.com', 'snippet': 'weather today...'}]"),
    junk("tool-output", "Function call returned: EXIT_CODE=0 STDOUT='build successful' STDERR='' in 3.2s"),
    // Junk: transient in-progress state
    junk("transient-state", "Currently processing your request, please wait. Step 2 of 5 in progress."),
    junk("transient-state", "Task queued. Estimated completion: 30 seconds. Job ID: jb_8f2a1c."),
    junk("transient-state", "Right now I am running the database migration. This will complete shortly."),
    // Junk: agent scratchpad / reasoning
    junk("agent-scratchpad", "Let me think about this step by step. First I need to consider the constraints..."),
    junk(
      "agent-scratchpad",
      "<thinking>The user asked about their schedule. I should check the calendar tool.</thinking>"
    ),
    junk("agent-scratchpad", "Internal note: uncertainty is high here. Re-ask for clarification before proceeding.")
  ];
})();
578
/**
 * Runs the write-quality benchmark and returns its metrics.
 *
 * Three passes are executed:
 * 1. Contradictions — for each fixture a stale fact and then a newer fact
 *    are asserted on `memory`. The fixture passes when the newer fact is
 *    returned without `validUntil`, the stale fact is listed as invalidated,
 *    and `queryFacts` returns exactly the newer fact.
 * 2. Junk stream — the `expectedFact` of every "real" item is asserted and
 *    then read back with `queryFacts`.
 * 3. Duplicates — a separate in-memory `Memory` with `dedupeOnWrite: true`
 *    re-ingests the same reference sentences and counts how often more than
 *    one exact-text copy is retrievable.
 *
 * Changes compared with the previous implementation:
 * - A missing or non-array `invalidated` result is treated as "nothing
 *   invalidated" instead of throwing out of the whole run.
 * - When a real fact fails, the failure note now includes the error message.
 * - The number of real items with an expected fact is computed once.
 * - `llmCalls` / `tokensUsed` are constants; nothing in this function
 *   changes them.
 *
 * NOTE(review): junk items are never written to `memory` and
 * `junkItemsStored` is never incremented, so `junkRate` is always 0 and
 * every junk detail is reported as passed. This looks unfinished; behaviour
 * is left as it was because the intended junk-gate API is not visible here.
 *
 * @param {object} memory - Memory under test exposing `assertFact(scope, input)`
 *   and `queryFacts(query)`.
 * @param {object} [opts] - Optional `scope`, `contradictionFixtures`,
 *   `junkStreamFixtures` and `duplicateSessions` (default 3).
 * @returns {Promise<object>} `contradictionAccuracy`, `junkRate`, `factRecall`,
 *   `duplicateRate`, `llmCallsPerWrite`, `tokensUsedIfAny` and `details`.
 */
async function runWriteQualityBench(memory, opts = {}) {
  const scope = opts.scope ?? { kind: "agent", agentId: "bench:write-quality" };
  const contradictionFixtures = opts.contradictionFixtures ?? CONTRADICTION_FIXTURES;
  const junkItems = opts.junkStreamFixtures ?? JUNK_STREAM_FIXTURES;
  const duplicateSessions = opts.duplicateSessions ?? 3;
  const details = [];
  // No code path below performs an LLM call, so both counters stay at zero.
  const llmCalls = 0;
  const tokensUsed = 0;
  let totalWrites = 0;

  // Pass 1: contradictions
  let contradictionCorrect = 0;
  for (const fix of contradictionFixtures) {
    const staleInput = {
      subject: fix.subject,
      predicate: fix.predicate,
      object: fix.staleObject,
      objectKind: "literal",
      confidence: 1,
      validFrom: fix.staleFrom
    };
    await memory.assertFact(scope, staleInput);
    totalWrites += 1;
    const currentInput = {
      subject: fix.subject,
      predicate: fix.predicate,
      object: fix.currentObject,
      objectKind: "literal",
      confidence: 1,
      validFrom: fix.currentFrom
    };
    const { asserted, invalidated } = await memory.assertFact(scope, currentInput);
    totalWrites += 1;
    const currentWins = asserted.object === fix.currentObject && asserted.validUntil === void 0;
    const staleInvalidated =
      Array.isArray(invalidated) && invalidated.some((f) => f.object === fix.staleObject);
    const activeFacts = await memory.queryFacts({
      scope,
      subject: fix.subject,
      predicate: fix.predicate
    });
    const onlyCurrentActive = activeFacts.length === 1 && activeFacts[0].object === fix.currentObject;
    const passed = currentWins && staleInvalidated && onlyCurrentActive;
    if (passed) contradictionCorrect += 1;
    details.push({
      kind: "contradiction",
      label: `${fix.subject}.${fix.predicate}: ${fix.staleObject} \u2192 ${fix.currentObject}`,
      passed,
      note: passed ? void 0 : `currentWins=${currentWins} staleInvalidated=${staleInvalidated} onlyCurrentActive=${onlyCurrentActive}`
    });
  }
  const contradictionAccuracy = contradictionFixtures.length > 0 ? contradictionCorrect / contradictionFixtures.length : 1;

  // Pass 2: junk stream
  const junkScope = { kind: "agent", agentId: "bench:write-quality:junk" };
  const realItems = junkItems.filter((j) => j.kind === "real");
  const junkOnlyItems = junkItems.filter((j) => j.kind === "junk");
  const expectedRealCount = realItems.filter((r) => r.expectedFact).length;
  let realFactsStored = 0;
  let junkItemsStored = 0;
  for (const item of realItems) {
    if (!item.expectedFact) continue;
    const label = `real: "${item.text.slice(0, 60)}"`;
    try {
      await memory.assertFact(junkScope, {
        subject: item.expectedFact.subject,
        predicate: item.expectedFact.predicate,
        object: item.expectedFact.object,
        objectKind: "literal",
        confidence: 0.9,
        validFrom: "2026-01-01T00:00:00.000Z"
      });
      totalWrites += 1;
      const stored = await memory.queryFacts({
        scope: junkScope,
        subject: item.expectedFact.subject,
        predicate: item.expectedFact.predicate
      });
      const found = stored.some((f) => f.object === item.expectedFact.object);
      if (found) realFactsStored += 1;
      details.push({
        kind: "junk",
        label,
        passed: found,
        note: found ? void 0 : "fact asserted but not queryable"
      });
    } catch (err) {
      // Keep the run going, but record why this item failed.
      const reason = err instanceof Error ? err.message : String(err);
      details.push({
        kind: "junk",
        label,
        passed: false,
        note: `assertFact threw unexpectedly for real fact: ${reason}`
      });
    }
  }
  for (const item of junkOnlyItems) {
    // See NOTE(review) in the header: nothing is written or checked here.
    details.push({
      kind: "junk",
      label: `junk(${item.junkKind ?? "?"}): "${item.text.slice(0, 60)}"`,
      passed: true,
      // "passed" means correctly NOT stored
      note: "correctly suppressed by REJECT gate"
    });
  }
  const junkRate = junkOnlyItems.length > 0 ? junkItemsStored / junkOnlyItems.length : 0;
  const factRecallScore = expectedRealCount > 0 ? realFactsStored / expectedRealCount : 1;

  // Pass 3: duplicates, measured on a dedicated in-memory Memory instance
  const dedupScope = { kind: "agent", agentId: "bench:write-quality:dedup" };
  const { InMemoryStore, InMemoryVectorStore, FakeEmbedder } = await import("@eidentic/types/testing");
  const { Memory: MemoryCtor } = await import("@eidentic/memory");
  const dedupMemory = new MemoryCtor({
    store: new InMemoryStore(),
    vector: new InMemoryVectorStore(),
    embedder: new FakeEmbedder(16),
    dedupeOnWrite: true
  });
  const referenceEvents = [
    "I enjoy hiking in the mountains on weekends.",
    "My favorite food is sushi especially salmon rolls.",
    "I have been learning Rust for the past six months.",
    "My partner's name is Sam and we have two cats."
  ];
  await dedupMemory.ingest(
    referenceEvents.map((text, i) => ({
      id: `dedup-s1-${i}`,
      scope: dedupScope,
      text
    }))
  );
  totalWrites += referenceEvents.length;
  let duplicatesThrough = 0;
  // Session 1 is the initial ingest above; sessions 2..duplicateSessions replay it.
  for (let session = 2; session <= duplicateSessions; session++) {
    for (let i = 0; i < referenceEvents.length; i++) {
      const id = `dedup-s${session}-${i}`;
      await dedupMemory.ingest([{ id, scope: dedupScope, text: referenceEvents[i] }]);
      totalWrites += 1;
      const retrieved = await dedupMemory.retrieve({
        text: referenceEvents[i],
        scope: dedupScope,
        topK: 20
      });
      const exactMatches = retrieved.snippets.filter(
        (s) => s.text.trim().toLowerCase() === referenceEvents[i].trim().toLowerCase()
      );
      const label = `session ${session}, event ${i}: "${referenceEvents[i].slice(0, 50)}"`;
      if (exactMatches.length > 1) {
        duplicatesThrough += 1;
        details.push({
          kind: "duplicate",
          label,
          passed: false,
          note: `${exactMatches.length} exact-text copies in store`
        });
      } else {
        details.push({
          kind: "duplicate",
          label,
          passed: true
        });
      }
    }
  }
  const reIngestTotal = referenceEvents.length * (duplicateSessions - 1);
  const duplicateRate = reIngestTotal > 0 ? duplicatesThrough / reIngestTotal : 0;
  const llmCallsPerWrite = totalWrites > 0 ? llmCalls / totalWrites : 0;
  return {
    contradictionAccuracy,
    junkRate,
    factRecall: factRecallScore,
    duplicateRate,
    llmCallsPerWrite,
    tokensUsedIfAny: tokensUsed,
    details
  };
}
744
+
745
+ // src/temporal.ts
746
/**
 * Classifies a temporal question into one of four types.
 *
 * A null gold answer means "before-first-fact". Otherwise the type is read
 * from keywords in the question's rationale text: "exactly" gives
 * "at-boundary"; "latest" or "last transition" gives "current-state";
 * anything else is "mid-interval".
 *
 * A question without a string `rationale` is now classified as
 * "mid-interval" instead of throwing on `.includes`.
 *
 * @param {{goldAnswer: (string|null), rationale?: string}} q - The question.
 * @returns {string} The question type.
 */
function classifyQuestion(q) {
  if (q.goldAnswer === null) return "before-first-fact";
  const rationale = typeof q.rationale === "string" ? q.rationale : "";
  if (rationale.includes("exactly")) return "at-boundary";
  if (rationale.includes("latest") || rationale.includes("last transition")) {
    return "current-state";
  }
  return "mid-interval";
}
754
/**
 * Runs the temporal (point-in-time) benchmark.
 *
 * All dataset asserts are sorted by `validFrom`, grouped per
 * subject/predicate, and written with `memory.assertFact`. Each question is
 * then answered with `memory.queryFacts({ ..., validAt: askedAt })`; the
 * object of the last returned fact (or null when none is returned) is
 * compared with the gold answer.
 *
 * A failing `assertFact` still does not abort the run, but it is no longer
 * discarded silently: every failure is recorded in the additive
 * `assertErrors` field of the result, so a score computed over partly
 * loaded data can be recognised as such.
 *
 * @param {object} memory - Memory with `graphEnabled`, `assertFact` and `queryFacts`.
 * @param {{name: string, asserts: Array, questions: Array}} dataset - Temporal dataset.
 * @param {{scope?: object}} [opts] - Optional scope override.
 * @returns {Promise<object>} Accuracy figures, per-question `results`, and
 *   `assertErrors`.
 * @throws {Error} When `memory.graphEnabled` is falsy.
 */
async function runTemporalBench(memory, dataset, opts = {}) {
  if (!memory.graphEnabled) {
    throw new Error(
      "runTemporalBench: Memory must have a graph configured (pass `graph` to Memory constructor). Temporal point-in-time queries require timestamped fact validity."
    );
  }
  const scope = opts.scope ?? { kind: "agent", agentId: "bench:temporal" };
  // Chronological order; asserts without validFrom sort first.
  const sortedAsserts = [...dataset.asserts].sort((a, b) => {
    const va = a.validFrom ?? "";
    const vb = b.validFrom ?? "";
    return va < vb ? -1 : va > vb ? 1 : 0;
  });
  const grouped = new Map();
  for (const a of sortedAsserts) {
    const key = `${a.subject}::${a.predicate}`;
    const list = grouped.get(key) ?? [];
    list.push(a);
    grouped.set(key, list);
  }
  const assertErrors = [];
  for (const assertList of grouped.values()) {
    for (const input of assertList) {
      try {
        await memory.assertFact(scope, input);
      } catch (err) {
        assertErrors.push({
          subject: input.subject,
          predicate: input.predicate,
          validFrom: input.validFrom,
          message: err instanceof Error ? err.message : String(err)
        });
      }
    }
  }
  const results = [];
  for (const q of dataset.questions) {
    const facts = await memory.queryFacts({
      scope,
      subject: q.subject,
      predicate: q.predicate,
      validAt: q.askedAt
    });
    const systemAnswer = facts.length > 0 ? facts[facts.length - 1]?.object ?? null : null;
    const correct = q.goldAnswer === null ? systemAnswer === null : systemAnswer === q.goldAnswer;
    results.push({
      subject: q.subject,
      predicate: q.predicate,
      askedAt: q.askedAt,
      goldAnswer: q.goldAnswer,
      systemAnswer,
      correct,
      questionType: classifyQuestion(q),
      rationale: q.rationale
    });
  }
  // Share of correct results; an empty group counts as fully correct.
  const accuracy = (rs) => (rs.length === 0 ? 1 : rs.filter((r) => r.correct).length / rs.length);
  const currentStateResults = results.filter((r) => r.questionType === "current-state");
  // Point-in-time covers every question that is not current-state, including before-first-fact.
  const pointInTimeResults = results.filter((r) => r.questionType !== "current-state");
  const beforeFirstResults = results.filter((r) => r.questionType === "before-first-fact");
  return {
    datasetName: dataset.name,
    pointInTimeAccuracy: accuracy(pointInTimeResults),
    currentStateAccuracy: accuracy(currentStateResults),
    beforeFirstFactAccuracy: accuracy(beforeFirstResults),
    totalQuestions: results.length,
    llmCallsPerWrite: 0,
    // deterministic — no LLM calls
    tokensUsedIfAny: 0,
    results,
    assertErrors
  };
}
823
+
824
+ // src/datasets/temporal.ts
825
/**
 * Creates a deterministic pseudo-random generator from an integer seed.
 *
 * The seed is reduced to an unsigned 32-bit value (0 is replaced by 1) and
 * each call applies three shift-and-xor steps (<< 13, >>> 17, << 5) to the
 * state before returning it divided by 2^32.
 *
 * @param {number} seed - Seed value.
 * @returns {() => number} Generator whose results are state / 4294967296.
 */
function makeRng(seed) {
  const initial = seed >>> 0;
  let state = initial === 0 ? 1 : initial;
  return function next() {
    state = state ^ (state << 13);
    state = state ^ (state >>> 17);
    state = state ^ (state << 5);
    // The bitwise steps work on signed 32-bit values; bring the state back to unsigned.
    state >>>= 0;
    return state / 4294967296;
  };
}
836
// Candidate values for each generated property; every pool has ten entries.
var EMPLOYERS = [
  "Acme Corp", "StartupX", "MegaCorp", "HealthCo", "EduInc",
  "FinTechLtd", "RetailCo", "CloudSystems", "OpenSource Inc", "ConsultingGroup"
];
var CITIES = [
  "Amsterdam", "Berlin", "Toronto", "London", "Sydney",
  "Tokyo", "Paris", "Seoul", "Nairobi", "Sao Paulo"
];
var LANGUAGES = [
  "TypeScript", "Rust", "Python", "Go", "Kotlin",
  "Swift", "Elixir", "Scala", "Java", "Clojure"
];
var ROLES = [
  "junior-developer", "mid-developer", "senior-developer", "staff-engineer", "principal-engineer",
  "engineering-manager", "director-of-engineering", "vp-engineering", "cto", "founder"
];
// Pairs each predicate name with the pool its values are drawn from.
var PROPERTY_POOLS = [
  ["employer", EMPLOYERS],
  ["city", CITIES],
  ["preferred_language", LANGUAGES],
  ["role", ROLES]
].map(([predicate, values]) => ({ predicate, values }));
890
/**
 * Produces `count` ISO timestamps spaced six months apart, each on the first
 * day of its month at midnight UTC.
 *
 * @param {number} count - Number of timestamps to produce.
 * @param {number} [startYear=2022] - Year of the first timestamp.
 * @param {number} [startMonth=1] - Month (1-12) of the first timestamp.
 * @returns {string[]} Timestamps in ascending order.
 */
function generateTimestamps(count, startYear = 2022, startMonth = 1) {
  const stamps = [];
  let y = startYear;
  let m = startMonth;
  while (stamps.length < count) {
    const paddedMonth = String(m).padStart(2, "0");
    stamps.push(`${y}-${paddedMonth}-01T00:00:00.000Z`);
    m += 6;
    // Past December: wrap into the next year.
    if (m > 12) {
      m -= 12;
      y += 1;
    }
  }
  return stamps;
}
904
/**
 * Generates a deterministic temporal dataset: entities whose properties
 * change over time, the fact assertions describing those changes, and
 * point-in-time questions with gold answers.
 *
 * For every entity and every pool in `PROPERTY_POOLS`, the pool is shuffled
 * with the seeded generator and its first values become successive
 * transitions dated by `generateTimestamps`. Up to four questions are
 * produced per property: before the first fact (gold answer null), exactly
 * at the first fact, midway between the first two facts (only when there are
 * at least two transitions), and 30 days after the last transition.
 *
 * The number of transitions per property is now truncated to an integer and
 * clamped to [0, pool size]. Previously a `changesPerProperty` larger than
 * the pool, or a fractional one, produced asserts whose `object` was
 * undefined. Output for valid integer settings, including the defaults, is
 * unchanged.
 *
 * @param {{entityCount?: number, seed?: number, changesPerProperty?: number}} [opts]
 *   Defaults: 4 entities, seed 42, 3 changes per property.
 * @returns {{name: string, seed: number, entities: Array, asserts: Array, questions: Array}}
 */
function syntheticTemporalDataset(opts = {}) {
  const entityCount = opts.entityCount ?? 4;
  const seed = opts.seed ?? 42;
  const changesPerProperty = opts.changesPerProperty ?? 3;
  const rng = makeRng(seed);
  const entities = [];
  const asserts = [];
  const questions = [];
  const allTimestamps = generateTimestamps(changesPerProperty + 2, 2022, 1);
  // Strips any sub-second part so every askedAt ends in ".000Z".
  const toWholeSecondIso = (ms) => new Date(ms).toISOString().replace(/\.\d{3}Z$/, ".000Z");
  for (let ei = 0; ei < entityCount; ei++) {
    const entityName = `entity_${ei}`;
    const history = {};
    for (const pool of PROPERTY_POOLS) {
      const transitions = [];
      // Shuffle a copy of the pool with the seeded generator (swap each slot with a random earlier-or-equal slot).
      const poolCopy = [...pool.values];
      for (let i = poolCopy.length - 1; i > 0; i--) {
        const j = Math.floor(rng() * (i + 1));
        [poolCopy[i], poolCopy[j]] = [poolCopy[j], poolCopy[i]];
      }
      // Never take more values than the pool holds, and never a negative or fractional count.
      const take = Math.max(0, Math.min(Math.trunc(changesPerProperty), poolCopy.length));
      const chosen = poolCopy.slice(0, take);
      for (let ci = 0; ci < chosen.length; ci++) {
        const validFrom = allTimestamps[ci];
        const value = chosen[ci];
        transitions.push({ validFrom, value });
        asserts.push({
          subject: entityName,
          predicate: pool.predicate,
          object: value,
          objectKind: "literal",
          confidence: 1,
          validFrom
        });
      }
      history[pool.predicate] = transitions;
    }
    entities.push({ name: entityName, history });
    for (const pool of PROPERTY_POOLS) {
      const transitions = history[pool.predicate];
      if (transitions.length === 0) continue;
      const first = transitions[0];
      questions.push({
        subject: entityName,
        predicate: pool.predicate,
        askedAt: "2021-01-01T00:00:00.000Z",
        // before the 2022 start
        goldAnswer: null,
        rationale: `No fact for ${entityName}.${pool.predicate} exists before ${first.validFrom}`
      });
      questions.push({
        subject: entityName,
        predicate: pool.predicate,
        askedAt: first.validFrom,
        goldAnswer: first.value,
        rationale: `At exactly ${first.validFrom}, the first fact is active`
      });
      if (transitions.length >= 2) {
        const second = transitions[1];
        const t1 = new Date(first.validFrom).getTime();
        const t2 = new Date(second.validFrom).getTime();
        questions.push({
          subject: entityName,
          predicate: pool.predicate,
          askedAt: toWholeSecondIso(Math.floor((t1 + t2) / 2)),
          goldAnswer: first.value,
          rationale: `Between ${first.validFrom} and ${second.validFrom}, the first fact is still active`
        });
      }
      const latest = transitions[transitions.length - 1];
      // Ask 30 days after the final transition.
      const laterMs = new Date(latest.validFrom).getTime() + 30 * 24 * 60 * 60 * 1e3;
      questions.push({
        subject: entityName,
        predicate: pool.predicate,
        askedAt: toWholeSecondIso(laterMs),
        goldAnswer: latest.value,
        rationale: `After the last transition at ${latest.validFrom}, the latest fact is active`
      });
    }
  }
  return {
    name: `eidentic-temporal-synthetic-v1 (seed=${seed}, entities=${entityCount})`,
    seed,
    entities,
    asserts,
    questions
  };
}
991
+ // Annotate the CommonJS export names for ESM import in node. The leading `0 &&` short-circuits, so the assignment below is never executed; the statement only spells out the export names (presumably for static export detection — confirm against the bundler documentation).
992
+ 0 && (module.exports = {
993
+ CONTRADICTION_FIXTURES,
994
+ JUNK_STREAM_FIXTURES,
995
+ factRecall,
996
+ loadLoCoMo,
997
+ loadLongMemEval,
998
+ normalizeText,
999
+ normalizedIncludes,
1000
+ recallAtK,
1001
+ runMemoryBench,
1002
+ runTemporalBench,
1003
+ runWriteQualityBench,
1004
+ syntheticDataset,
1005
+ syntheticTemporalDataset
1006
+ }); // same thirteen names as the getter table passed to __export near the top of the file