@1mbrain/benchmarks 0.1.1

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (69) hide show
  1. package/README.md +85 -0
  2. package/fixtures/1mbrain-focused-mini/1mbrain-focused-mini.json +928 -0
  3. package/fixtures/1mbrain-focused-mini/README.md +45 -0
  4. package/fixtures/adversarial-memory/dataset_claude_adversarial.json +3333 -0
  5. package/fixtures/adversarial-memory/dataset_gemini_adversarial_memory.json +2984 -0
  6. package/fixtures/balanced-mini/dataset_claude_balanced_mini.json +2077 -0
  7. package/fixtures/balanced-mini/dataset_gemini_balanced_mini.json +1995 -0
  8. package/fixtures/generate_datasets.js +1741 -0
  9. package/fixtures/graph-stress-hard/README.md +43 -0
  10. package/fixtures/graph-stress-hard/dataset_graph_stress_hard.json +4374 -0
  11. package/fixtures/graph-stress-hard/generate_graph_stress_hard.js +526 -0
  12. package/fixtures/realistic-medium/dataset_claude_realistic_medium.json +7462 -0
  13. package/fixtures/realistic-medium/dataset_gemini_realistic_medium.json +7277 -0
  14. package/fixtures/realistic-medium/gen_claude_medium.js +600 -0
  15. package/package.json +22 -0
  16. package/reports/benchmark_report.md +48 -0
  17. package/reports/benchmark_report_claude_adversarial.md +42 -0
  18. package/reports/benchmark_report_claude_adversarial_adaptive.md +42 -0
  19. package/reports/benchmark_report_claude_adversarial_adaptive2_fast.md +42 -0
  20. package/reports/benchmark_report_claude_adversarial_adaptive_fast.md +42 -0
  21. package/reports/benchmark_report_claude_adversarial_rerank.md +42 -0
  22. package/reports/benchmark_report_claude_balanced_mini.md +42 -0
  23. package/reports/benchmark_report_claude_balanced_mini_adaptive.md +42 -0
  24. package/reports/benchmark_report_claude_balanced_mini_adaptive2_fast.md +42 -0
  25. package/reports/benchmark_report_claude_balanced_mini_adaptive_fast.md +42 -0
  26. package/reports/benchmark_report_claude_balanced_mini_rerank.md +42 -0
  27. package/reports/benchmark_report_claude_realistic_medium.md +42 -0
  28. package/reports/benchmark_report_claude_realistic_medium_adaptive.md +42 -0
  29. package/reports/benchmark_report_claude_realistic_medium_adaptive2_fast.md +42 -0
  30. package/reports/benchmark_report_claude_realistic_medium_adaptive_fast.md +42 -0
  31. package/reports/benchmark_report_claude_realistic_medium_evidence_rerank_local.md +42 -0
  32. package/reports/benchmark_report_claude_realistic_medium_openai_evidence_rerank.md +41 -0
  33. package/reports/benchmark_report_claude_realistic_medium_openai_multi_signal.md +41 -0
  34. package/reports/benchmark_report_claude_realistic_medium_openai_multi_signal_scoped.md +41 -0
  35. package/reports/benchmark_report_claude_realistic_medium_openai_phase8_no_judge.md +42 -0
  36. package/reports/benchmark_report_claude_realistic_medium_openai_rankingpolicy.md +41 -0
  37. package/reports/benchmark_report_claude_realistic_medium_openai_stale_filter.md +41 -0
  38. package/reports/benchmark_report_claude_realistic_medium_openai_stale_filter_absence_fix.md +41 -0
  39. package/reports/benchmark_report_claude_realistic_medium_openai_write_time_invalidation.md +41 -0
  40. package/reports/benchmark_report_claude_realistic_medium_rerank.md +42 -0
  41. package/reports/benchmark_report_claude_realistic_medium_stale_filter_local.md +42 -0
  42. package/reports/benchmark_report_graph_stress_hard.md +42 -0
  43. package/reports/benchmark_report_graph_stress_hard_absence_fix.md +42 -0
  44. package/reports/benchmark_report_graph_stress_hard_adaptive.md +42 -0
  45. package/reports/benchmark_report_graph_stress_hard_evidence_rerank.md +42 -0
  46. package/reports/benchmark_report_graph_stress_hard_multi_signal_current_guardrail.md +42 -0
  47. package/reports/benchmark_report_graph_stress_hard_multi_signal_guardrail_fixed.md +42 -0
  48. package/reports/benchmark_report_graph_stress_hard_multi_signal_local.md +42 -0
  49. package/reports/benchmark_report_graph_stress_hard_multi_signal_scoped_guardrail.md +42 -0
  50. package/reports/benchmark_report_graph_stress_hard_multi_signal_vector_pure_guardrail.md +42 -0
  51. package/reports/benchmark_report_graph_stress_hard_phase8_sdk_guardrail.md +42 -0
  52. package/reports/benchmark_report_graph_stress_hard_rerank.md +42 -0
  53. package/reports/benchmark_report_graph_stress_hard_stale_filter.md +42 -0
  54. package/reports/benchmark_report_graph_stress_hard_write_time_invalidation.md +42 -0
  55. package/results/.gitignore +2 -0
  56. package/src/adapters/1mbrain.ts +317 -0
  57. package/src/adapters/keyword-embedding.ts +48 -0
  58. package/src/adapters/mem0.ts +124 -0
  59. package/src/adapters/qdrant.ts +214 -0
  60. package/src/adapters/unavailable.ts +49 -0
  61. package/src/adapters/vector-baseline.ts +149 -0
  62. package/src/datasets/focused-mini.ts +158 -0
  63. package/src/datasets/synthetic-agent-memory.ts +532 -0
  64. package/src/llm-evaluator.ts +262 -0
  65. package/src/metrics.ts +482 -0
  66. package/src/provider.ts +151 -0
  67. package/src/runner.ts +635 -0
  68. package/tsconfig.json +10 -0
  69. package/tsconfig.tsbuildinfo +1 -0
@@ -0,0 +1,532 @@
1
+ import type {
2
+ BenchmarkCase,
3
+ BenchmarkDataset,
4
+ BenchmarkMemoryRecord,
5
+ BenchmarkMemoryType,
6
+ } from '../provider.js';
7
+
8
/**
 * Recall options reused by the generated cases and recall probes in this file
 * (`limit` of 5, `minScore` of 0.08). The same object instance is shared by
 * every case that references it.
 */
const DEFAULT_RECALL_OPTIONS = { limit: 5, minScore: 0.08 };
12
+
13
/**
 * Assembles the complete synthetic agent-memory benchmark dataset.
 *
 * Cases from each scenario builder are concatenated in a fixed order, and
 * `generatedAt` is a hard-coded instant rather than the current time, so the
 * returned dataset is the same on every call.
 *
 * @returns The dataset named `synthetic-memory-benchmark-v2`.
 */
export function createSyntheticAgentMemoryDataset(): BenchmarkDataset {
  // Order matters: it defines the order of cases in the dataset.
  const scenarioBuilders: Array<() => BenchmarkCase[]> = [
    buildBasicSemanticRecallCases,
    buildMultiHopRecallCases,
    buildMemoryUpdateCases,
    buildNoiseResistanceCases,
    buildSelectiveForgettingCases,
    buildDecayRefreshCases,
    buildPortabilityCases,
    buildAgentTaskContextCases,
  ];

  const cases: BenchmarkCase[] = [];
  for (const buildScenario of scenarioBuilders) {
    cases.push(...buildScenario());
  }

  const generatedAt = new Date('2026-06-18T00:00:00.000Z').toISOString();

  return {
    name: 'synthetic-memory-benchmark-v2',
    generatedAt,
    cases,
  };
}
31
+
32
/**
 * Generates 30 `basic_semantic_recall` cases.
 *
 * Each case holds one semantic memory stating the preferred language/database
 * stack for a topic, plus three generic noise memories. The ten topic rows are
 * cycled, so each row is used three times under different case ids.
 */
function buildBasicSemanticRecallCases(): BenchmarkCase[] {
  // [topic, language, database]
  const stackPreferences = [
    ['local agent tooling', 'TypeScript', 'PostgreSQL'],
    ['retrieval api', 'TypeScript', 'SQLite'],
    ['memory dashboard', 'React', 'PostgreSQL'],
    ['analytics ingest worker', 'Python', 'PostgreSQL'],
    ['evaluation harness', 'TypeScript', 'SQLite'],
    ['offline assistant', 'Rust', 'SQLite'],
    ['workflow orchestrator', 'TypeScript', 'Redis'],
    ['plugin runtime', 'TypeScript', 'PostgreSQL'],
    ['docs indexing service', 'Python', 'SQLite'],
    ['agent audit trail', 'TypeScript', 'PostgreSQL'],
  ] as const;

  const cases: BenchmarkCase[] = [];
  for (let i = 0; i < 30; i += 1) {
    const ordinal = i + 1;
    const [topic, language, database] = stackPreferences[i % stackPreferences.length];
    const scenarioId = `basic_${pad(ordinal)}`;
    const stackMemoryId = `${scenarioId}_stack`;

    const stackMemory = createMemory(stackMemoryId, 'semantic', isoDay(i), [
      `Preferred stack for ${topic}: ${language} with ${database} for the production build.`,
      [topicToken(topic), language.toLowerCase(), database.toLowerCase(), 'preferred', 'stack'],
    ]);
    const noise = createGenericNoise(scenarioId, 3, [`${topic} meeting notes`, 'calendar sync reminder']);

    cases.push({
      scenarioId,
      scenarioType: 'basic_semantic_recall',
      title: `Basic semantic recall ${ordinal}`,
      description: `Retrieve the preferred stack for ${topic}.`,
      agentId: `${scenarioId}_agent`,
      memories: [stackMemory, ...noise],
      operations: [],
      question: `What stack is preferred for ${topic}?`,
      expectedAnswer: `${language} and ${database}.`,
      recallOptions: DEFAULT_RECALL_OPTIONS,
      expectations: {
        requiredMemoryIds: [stackMemoryId],
        forbiddenMemoryIds: [],
      },
    });
  }
  return cases;
}
76
+
77
/**
 * Generates 30 `multi_hop_recall` cases.
 *
 * Each case chains three memories through explicit associations
 * (identity -> workflow -> weakness) and adds two generic noise memories.
 * The question names only the project, while the expected answer is the text
 * of the weakness memory; all three chained memories are required.
 */
function buildMultiHopRecallCases(): BenchmarkCase[] {
  // [project, projectType, benefit, weakness]
  const projectProfiles = [
    ['Kreasa', 'ai mentor app', 'teach users with generated tasks', 'guidance stays too generic'],
    ['PulseBoard', 'ops dashboard', 'coordinate oncall responses', 'alerts lack owner context'],
    ['TraceLamp', 'debug assistant', 'summarize failures for developers', 'root cause summaries are too shallow'],
    ['ForgeFlow', 'coding copilot', 'generate implementation plans', 'diffs ignore repo conventions'],
    ['AtlasNote', 'research workspace', 'connect project findings', 'linked notes miss contradictory evidence'],
    ['RelayDesk', 'support assistant', 'draft customer replies', 'handoff context loses urgency details'],
  ] as const;

  const cases: BenchmarkCase[] = [];
  for (let i = 0; i < 30; i += 1) {
    const [project, projectType, benefit, weakness] = projectProfiles[i % projectProfiles.length];
    const scenarioId = `multihop_${pad(i + 1)}`;
    const identityId = `${scenarioId}_identity`;
    const workflowId = `${scenarioId}_workflow`;
    const weaknessId = `${scenarioId}_weakness`;
    const projectTag = project.toLowerCase();

    const identityMemory = createMemory(
      identityId,
      'semantic',
      isoDay(i),
      [`Project ${project} is the user's ${projectType}.`, [projectTag, ...projectType.split(' '), 'project']],
      [{ targetId: workflowId, strength: 0.92 }],
    );
    const workflowMemory = createMemory(
      workflowId,
      'procedural',
      isoDay(i + 1),
      [`${project} helps users ${benefit}.`, [projectTag, ...benefit.split(' '), 'workflow']],
      [{ targetId: weaknessId, strength: 0.9 }],
    );
    // End of the chain: no outgoing associations.
    const weaknessMemory = createMemory(weaknessId, 'episodic', isoDay(i + 2), [
      `Main weakness this quarter: ${weakness}.`,
      [...weakness.split(' '), 'weakness'],
    ]);
    const noise = createGenericNoise(scenarioId, 2, ['marketing banner text', 'cookie notice footer']);

    cases.push({
      scenarioId,
      scenarioType: 'multi_hop_recall',
      title: `Multi-hop recall ${i + 1}`,
      description: `Connect ${project} to its weakness through explicit associations.`,
      agentId: `${scenarioId}_agent`,
      memories: [identityMemory, workflowMemory, weaknessMemory, ...noise],
      operations: [],
      question: `What is the main weakness of the user's ${projectType} project ${project}?`,
      expectedAnswer: weakness,
      recallOptions: { ...DEFAULT_RECALL_OPTIONS, limit: 5 },
      expectations: {
        requiredMemoryIds: [identityId, workflowId, weaknessId],
        forbiddenMemoryIds: [],
      },
    });
  }
  return cases;
}
130
+
131
/**
 * Generates 30 `memory_update` cases.
 *
 * Each case pairs an archived model decision with a newer one for the same
 * project (the newer memory is timestamped 20 days later) and adds two generic
 * noise memories. The newer memory is required and must be preferred over the
 * archived one.
 */
function buildMemoryUpdateCases(): BenchmarkCase[] {
  // [project, archived model, current model]; rows are cycled by case index.
  const modelDecisions = [
    ['core benchmark', 'Gemini 2.5 Flash', 'DeepSeek V4 Pro'],
    ['agent sdk', 'Claude Sonnet', 'Claude Opus'],
    ['support copilot', 'Llama 3.1', 'Qwen Coder'],
    ['memory passport', 'GPT-4.1 mini', 'GPT-4.1'],
    ['dashboard build', 'Mixtral', 'DeepSeek R1'],
  ] as const;

  const cases: BenchmarkCase[] = [];
  for (let i = 0; i < 30; i += 1) {
    const scenarioId = `update_${pad(i + 1)}`;
    const [project, previousModel, currentModel] = modelDecisions[i % modelDecisions.length];
    const archivedId = `${scenarioId}_old`;
    const currentId = `${scenarioId}_new`;
    // Day-of-month label embedded in the memory text; wraps after 20.
    const dayLabel = pad((i % 20) + 1);
    const projectTag = topicToken(project);

    const archivedMemory = createMemory(archivedId, 'semantic', isoDay(i), [
      `Archived note 2026-05-${dayLabel}: ${project} used ${previousModel} for coding tasks before the review.`,
      [projectTag, ...tokenBag(previousModel), 'archived', 'coding', 'tasks'],
    ]);
    const currentMemory = createMemory(currentId, 'semantic', isoDay(i + 20), [
      `Current plan 2026-06-${dayLabel}: ${project} uses ${currentModel} for coding focused tasks after the review.`,
      [projectTag, ...tokenBag(currentModel), 'current', 'coding', 'focused', 'tasks'],
    ]);
    const noise = createGenericNoise(scenarioId, 2, ['incident retrospective', 'billing reminder']);

    cases.push({
      scenarioId,
      scenarioType: 'memory_update',
      title: `Memory update ${i + 1}`,
      description: `Prefer the newer model decision for ${project}.`,
      agentId: `${scenarioId}_agent`,
      memories: [archivedMemory, currentMemory, ...noise],
      operations: [],
      question: `Which model is currently planned for coding focused tasks in ${project}?`,
      expectedAnswer: currentModel,
      recallOptions: DEFAULT_RECALL_OPTIONS,
      expectations: {
        requiredMemoryIds: [currentId],
        forbiddenMemoryIds: [],
        preferredOver: [{ preferredId: currentId, competingIds: [archivedId] }],
      },
    });
  }
  return cases;
}
179
+
180
/**
 * Generates 20 `noise_resistance` cases.
 *
 * Each case contains one real storage-decision memory and the six-message
 * boilerplate cluster from `createNoiseCluster`, whose text reuses words such
 * as "project", "decision", "memory" and "storage". Only the decision memory
 * is required.
 */
function buildNoiseResistanceCases(): BenchmarkCase[] {
  // [project, storage decision]
  const storageDecisions = [
    ['Atlas Memory', 'SQLite with PostgreSQL fallback'],
    ['Kreasa Assist', 'PostgreSQL with pgvector'],
    ['Pulse Brain', 'SQLite with local backup'],
    ['Hermes Cache', 'Redis for ephemeral cache only'],
    ['Trace Relay', 'PostgreSQL with nightly export'],
  ] as const;

  const cases: BenchmarkCase[] = [];
  for (let i = 0; i < 20; i += 1) {
    const [project, storageDecision] = storageDecisions[i % storageDecisions.length];
    const scenarioId = `noise_${pad(i + 1)}`;
    const decisionId = `${scenarioId}_decision`;
    // Only the first word of the project name is used as a tag.
    const projectTag = project.toLowerCase().split(' ')[0];

    const decisionMemory = createMemory(decisionId, 'semantic', isoDay(i), [
      `Decision for ${project}: memory storage will use ${storageDecision} as the primary backend.`,
      [projectTag, 'decision', 'memory', 'storage', ...tokenBag(storageDecision)],
    ]);
    const distractors = createNoiseCluster(scenarioId, project);

    cases.push({
      scenarioId,
      scenarioType: 'noise_resistance',
      title: `Noise resistance ${i + 1}`,
      description: `Ignore project boilerplate and retrieve the real storage decision for ${project}.`,
      agentId: `${scenarioId}_agent`,
      memories: [decisionMemory, ...distractors],
      operations: [],
      question: `What is the actual project decision about memory storage in ${project}?`,
      expectedAnswer: storageDecision,
      recallOptions: DEFAULT_RECALL_OPTIONS,
      expectations: {
        requiredMemoryIds: [decisionId],
        forbiddenMemoryIds: [],
      },
    });
  }
  return cases;
}
219
+
220
/**
 * Generates 20 `selective_forgetting` cases.
 *
 * Each case stores one prototype storage preference plus three generic noise
 * memories, then schedules a `forget` operation for that preference. The
 * forgotten memory id is listed as forbidden and the case expects abstention.
 */
function buildSelectiveForgettingCases(): BenchmarkCase[] {
  // [prototype, backend]
  const prototypeBackends = [
    ['Kreasa prototype', 'Firebase'],
    ['Atlas note prototype', 'Supabase'],
    ['Relay desk prototype', 'MongoDB'],
    ['Pulse memory prototype', 'Redis'],
    ['Trace lamp prototype', 'DynamoDB'],
  ] as const;

  const cases: BenchmarkCase[] = [];
  for (let i = 0; i < 20; i += 1) {
    const [prototype, backend] = prototypeBackends[i % prototypeBackends.length];
    const scenarioId = `forget_${pad(i + 1)}`;
    const removedId = `${scenarioId}_removed`;

    const removedMemory = createMemory(removedId, 'semantic', isoDay(i), [
      `Prototype storage preference for ${prototype}: ${backend} for the first internal demo.`,
      [topicToken(prototype), ...tokenBag(backend), 'prototype', 'storage', 'preference'],
    ]);
    const noise = createGenericNoise(scenarioId, 3, ['team lunch reminder', 'wireframe review comment']);

    cases.push({
      scenarioId,
      scenarioType: 'selective_forgetting',
      title: `Selective forgetting ${i + 1}`,
      description: `Deleted storage preference for ${prototype} should not leak into recall.`,
      agentId: `${scenarioId}_agent`,
      memories: [removedMemory, ...noise],
      operations: [{ kind: 'forget', memoryId: removedId }],
      question: `What backend storage was preferred for the ${prototype}?`,
      expectedAnswer: 'No current memory should be returned after deletion.',
      recallOptions: DEFAULT_RECALL_OPTIONS,
      expectations: {
        requiredMemoryIds: [],
        forbiddenMemoryIds: [removedId],
        shouldAbstain: true,
      },
    });
  }
  return cases;
}
260
+
261
/**
 * Generates 20 `decay_refresh` cases.
 *
 * Each case holds an approved backend memory, an older experiment memory
 * (timestamped five days earlier) and an unrelated side note. The operation
 * script probes recall once, repeats a targeted query four times, applies a
 * decay step, then probes again. Expectations require the approved memory,
 * prefer it over the old experiment, and compare it between the
 * `before_refresh` and `after_refresh` probes.
 */
function buildDecayRefreshCases(): BenchmarkCase[] {
  // [project, approved choice, old experiment]
  const persistenceChoices = [
    ['Atlas memory', 'SQLite with pgvector fallback', 'Redis scratch cache'],
    ['Kreasa planner', 'PostgreSQL with nightly export', 'Firebase scratch store'],
    ['Pulse board', 'SQLite local-first mode', 'MongoDB prototype store'],
    ['Trace relay', 'PostgreSQL durable log', 'Redis event scratchpad'],
    ['Forge flow', 'SQLite embedded mode', 'DynamoDB experiment'],
  ] as const;

  const cases: BenchmarkCase[] = [];
  for (let i = 0; i < 20; i += 1) {
    const [project, approvedChoice, oldChoice] = persistenceChoices[i % persistenceChoices.length];
    const scenarioId = `decay_${pad(i + 1)}`;
    const approvedId = `${scenarioId}_approved`;
    const experimentId = `${scenarioId}_old`;
    const sideNoteId = `${scenarioId}_side`;
    const projectTag = topicToken(project);
    const question = `What memory persistence should ${project} use?`;

    const approvedMemory = createMemory(approvedId, 'semantic', isoDay(i), [
      `Approved memory platform for ${project}: ${approvedChoice} is the canonical backend after review.`,
      [projectTag, ...tokenBag(approvedChoice), 'approved', 'canonical', 'backend'],
    ]);
    const experimentMemory = createMemory(experimentId, 'episodic', isoDay(i - 5), [
      `Old experiment for ${project}: ${oldChoice} for memory persistence during prototype tests.`,
      [projectTag, ...tokenBag(oldChoice), 'old', 'experiment', 'memory', 'persistence', 'prototype'],
    ]);
    const sideNoteMemory = createMemory(sideNoteId, 'semantic', isoDay(i + 1), [
      `Side note for ${project}: dashboard color review stayed unchanged.`,
      [projectTag, 'dashboard', 'color', 'review'],
    ]);

    cases.push({
      scenarioId,
      scenarioType: 'decay_refresh',
      title: `Decay and refresh ${i + 1}`,
      description: `Repeated access should keep the approved choice ahead of the old experiment for ${project}.`,
      agentId: `${scenarioId}_agent`,
      memories: [approvedMemory, experimentMemory, sideNoteMemory],
      operations: [
        // Baseline probe with the real question.
        { kind: 'recall_probe', label: 'before_refresh', query: question, options: DEFAULT_RECALL_OPTIONS },
        // Repeated, targeted access to the approved memory.
        {
          kind: 'recall_probe',
          label: 'refresh_target',
          query: `approved memory platform ${project} ${approvedChoice}`,
          repeat: 4,
          options: DEFAULT_RECALL_OPTIONS,
        },
        { kind: 'decay', cycles: 5, decayRate: 0.2, minScore: 0.01 },
        // Same question again, compared against the baseline probe.
        { kind: 'recall_probe', label: 'after_refresh', query: question, options: DEFAULT_RECALL_OPTIONS },
      ],
      question,
      expectedAnswer: approvedChoice,
      recallOptions: DEFAULT_RECALL_OPTIONS,
      expectations: {
        requiredMemoryIds: [approvedId],
        forbiddenMemoryIds: [],
        preferredOver: [{ preferredId: approvedId, competingIds: [experimentId] }],
        probeComparisons: [
          { labelBefore: 'before_refresh', labelAfter: 'after_refresh', memoryId: approvedId },
        ],
      },
    });
  }
  return cases;
}
349
+
350
/**
 * Generates 10 `portability` cases.
 *
 * Each case links three memories about a product (storage -> portable ->
 * recall), probes recall under the label `before_export`, then runs an
 * `export_import` operation into a separate target agent id. All three
 * memories are required and listed in `preserveAfterImportIds`.
 */
function buildPortabilityCases(): BenchmarkCase[] {
  // [product, storage note, portability note, recall note]
  const productNotes = [
    ['Atlas Memory', 'local first storage', 'portable memory passport', 'association graph recall'],
    ['Kreasa Mentor', 'specific coaching notes', 'portable project memory', 'semantic summary recall'],
    ['Pulse Brain', 'dashboard event memory', 'portable backup workflow', 'graph expansion recall'],
    ['Trace Relay', 'debug audit memory', 'portable incident ledger', 'linked evidence recall'],
    ['Forge Flow', 'coding decision memory', 'portable build checkpoint', 'task history recall'],
  ] as const;

  const cases: BenchmarkCase[] = [];
  for (let i = 0; i < 10; i += 1) {
    const [product, storageNote, portabilityNote, recallNote] = productNotes[i % productNotes.length];
    const scenarioId = `passport_${pad(i + 1)}`;
    const storageId = `${scenarioId}_storage`;
    const portableId = `${scenarioId}_portable`;
    const recallId = `${scenarioId}_recall`;
    // Only the first word of the product name is used as a tag.
    const productTag = product.toLowerCase().split(' ')[0];
    const trackedIds = [storageId, portableId, recallId];

    const storageMemory = createMemory(
      storageId,
      'semantic',
      isoDay(i),
      [`${product} keeps ${storageNote}.`, [productTag, ...storageNote.split(' ')]],
      [{ targetId: portableId, strength: 0.9 }],
    );
    const portableMemory = createMemory(
      portableId,
      'semantic',
      isoDay(i + 1),
      [`${product} ships ${portabilityNote}.`, [productTag, ...portabilityNote.split(' ')]],
      [{ targetId: recallId, strength: 0.88 }],
    );
    const recallMemory = createMemory(recallId, 'procedural', isoDay(i + 2), [
      `${product} uses ${recallNote}.`,
      [productTag, ...recallNote.split(' ')],
    ]);

    const question = `Which portable memory capabilities does ${product} keep after export and import?`;

    cases.push({
      scenarioId,
      scenarioType: 'portability',
      title: `Portability ${i + 1}`,
      description: `Export and import should preserve retrievable memory for ${product}.`,
      agentId: `${scenarioId}_agent`,
      memories: [storageMemory, portableMemory, recallMemory],
      operations: [
        { kind: 'recall_probe', label: 'before_export', query: question, options: DEFAULT_RECALL_OPTIONS },
        { kind: 'export_import', targetAgentId: `${scenarioId}_imported_agent` },
      ],
      question,
      expectedAnswer: `${storageNote}, ${portabilityNote}, and ${recallNote}.`,
      recallOptions: DEFAULT_RECALL_OPTIONS,
      expectations: {
        // Separate array instances, as the two lists are independent fields.
        requiredMemoryIds: [...trackedIds],
        forbiddenMemoryIds: [],
        preserveAfterImportIds: [...trackedIds],
      },
    });
  }
  return cases;
}
411
+
412
/**
 * Generates 20 `agent_task_context` cases.
 *
 * Each case stores four facts about a product (audience, storage, retrieval,
 * tone) plus two generic noise memories. All four fact memories are required;
 * the noise memories are neither required nor forbidden.
 */
function buildAgentTaskContextCases(): BenchmarkCase[] {
  // [product, audience, storage, retrieval, tone]
  const productBriefs = [
    ['Atlas Memory', 'engineering teams', 'SQLite and PostgreSQL', 'association graph retrieval', 'quiet operator tone'],
    ['Kreasa Mentor', 'learners and mentors', 'PostgreSQL with backup export', 'semantic coaching recall', 'specific helpful tone'],
    ['Pulse Brain', 'ops teams', 'SQLite local-first storage', 'event graph recall', 'concise operational tone'],
    ['Trace Relay', 'debugging teams', 'PostgreSQL durable audit store', 'linked evidence recall', 'plain diagnostic tone'],
    ['Forge Flow', 'coding agents', 'SQLite embedded mode', 'task history recall', 'direct engineering tone'],
  ] as const;

  const cases: BenchmarkCase[] = [];
  for (let i = 0; i < 20; i += 1) {
    const [product, audience, storage, retrieval, tone] = productBriefs[i % productBriefs.length];
    const scenarioId = `task_${pad(i + 1)}`;
    // Only the first word of the product name is used as a tag.
    const productTag = product.toLowerCase().split(' ')[0];

    const contextMemories = [
      createMemory(`${scenarioId}_audience`, 'semantic', isoDay(i), [
        `${product} serves ${audience}.`,
        [productTag, ...audience.split(' '), 'audience'],
      ]),
      createMemory(`${scenarioId}_storage`, 'semantic', isoDay(i + 1), [
        `${product} uses ${storage}.`,
        [productTag, ...storage.split(' '), 'storage'],
      ]),
      createMemory(`${scenarioId}_retrieval`, 'procedural', isoDay(i + 2), [
        `${product} relies on ${retrieval}.`,
        [productTag, ...retrieval.split(' '), 'retrieval'],
      ]),
      createMemory(`${scenarioId}_tone`, 'semantic', isoDay(i + 3), [
        `${product} should keep a ${tone}.`,
        [productTag, ...tone.split(' '), 'tone'],
      ]),
    ];
    const noise = createGenericNoise(scenarioId, 2, ['coupon banner', 'social share prompt']);
    const requiredMemoryIds = contextMemories.map(({ id }) => id);

    cases.push({
      scenarioId,
      scenarioType: 'agent_task_context',
      title: `Agent task context ${i + 1}`,
      description: `Retrieve the full context needed to draft positioning for ${product}.`,
      agentId: `${scenarioId}_agent`,
      memories: [...contextMemories, ...noise],
      operations: [],
      question: `Prepare positioning context for ${product}: audience storage retrieval tone.`,
      expectedAnswer: `Context should mention ${audience}, ${storage}, ${retrieval}, and ${tone}.`,
      recallOptions: DEFAULT_RECALL_OPTIONS,
      expectations: {
        requiredMemoryIds,
        forbiddenMemoryIds: [],
      },
    });
  }
  return cases;
}
463
+
464
/**
 * Builds one benchmark memory record.
 *
 * Importance is derived from the memory type: 0.9 for `procedural`, 0.82 for
 * `semantic`, and 0.7 for anything else. The timestamp is stored both at the
 * top level and inside `metadata`.
 *
 * @param id - Identifier of the memory.
 * @param type - Memory type; also selects the importance value.
 * @param timestamp - Timestamp string copied into the record and its metadata.
 * @param param3 - Tuple of `[content, tags]`.
 * @param associations - Optional outgoing links; left `undefined` when omitted.
 * @returns The assembled record.
 */
function createMemory(
  id: string,
  type: BenchmarkMemoryType,
  timestamp: string,
  [content, tags]: [string, string[]],
  associations?: Array<{ targetId: string; strength: number }>,
): BenchmarkMemoryRecord {
  let importance: number;
  switch (type) {
    case 'procedural':
      importance = 0.9;
      break;
    case 'semantic':
      importance = 0.82;
      break;
    default:
      importance = 0.7;
      break;
  }

  const metadata = { source: 'synthetic-benchmark', timestamp };

  return { id, type, timestamp, content, tags, importance, metadata, associations };
}
485
+
486
/**
 * Builds `count` episodic noise memories for a case.
 *
 * Memory ids are `${caseId}_noise_1` .. `${caseId}_noise_${count}`, timestamps
 * start at day offset 40, and subjects are cycled when `count` exceeds
 * `subjects.length`. Each memory is tagged `noise` plus the tokens of its
 * subject.
 *
 * @param caseId - Case identifier used as the id prefix.
 * @param count - Number of noise memories to create; values below 1 yield `[]`.
 * @param subjects - Subject phrases to cycle through.
 * @returns The generated noise memories, in index order.
 * @throws TypeError when at least one memory is requested but `subjects` is
 *   empty. Without this guard the modulo-by-zero lookup yields `undefined` and
 *   the failure surfaces later as an opaque "cannot read properties of
 *   undefined" error.
 */
function createGenericNoise(caseId: string, count: number, subjects: string[]): BenchmarkMemoryRecord[] {
  if (count >= 1 && subjects.length === 0) {
    throw new TypeError(
      `createGenericNoise: "subjects" must contain at least one entry to build ${count} noise memories for case "${caseId}".`,
    );
  }

  return Array.from({ length: count }, (_, index) => {
    // Look the subject up once; it feeds both the content and the tags.
    const subject = subjects[index % subjects.length];
    return createMemory(`${caseId}_noise_${index + 1}`, 'episodic', isoDay(index + 40), [
      `Noise note ${index + 1}: ${subject}.`,
      ['noise', ...tokenBag(subject)],
    ]);
  });
}
494
+
495
/**
 * Builds six episodic boilerplate memories (subscription, cookie, sponsored
 * and advertising text) for a noise-resistance case.
 *
 * The first three messages mention the project by name. Ids are
 * `${caseId}_noise_1` .. `${caseId}_noise_6`, timestamps start at day offset
 * 50, and each memory is tagged `noise` plus the tokens of its own message.
 *
 * @param caseId - Case identifier used as the id prefix.
 * @param project - Project name interpolated into the first three messages.
 */
function createNoiseCluster(caseId: string, project: string): BenchmarkMemoryRecord[] {
  const boilerplate = [
    `Subscribe now for ${project} project decision updates.`,
    `This website uses cookies for ${project} analytics.`,
    `Sponsored content about ${project} storage tutorials.`,
    `Advertisement for project memory storage discounts.`,
    `Click here to unlock the full project decision guide.`,
    `Promotional banner for memory storage webinars.`,
  ];

  const cluster: BenchmarkMemoryRecord[] = [];
  boilerplate.forEach((text, position) => {
    const noiseId = `${caseId}_noise_${position + 1}`;
    const tags = ['noise', ...tokenBag(text)];
    cluster.push(createMemory(noiseId, 'episodic', isoDay(position + 50), [text, tags]));
  });
  return cluster;
}
512
+
513
/**
 * Returns the ISO-8601 timestamp for midnight UTC on 1 June 2026 shifted by
 * `dayOffset` days. Negative offsets land in May; large ones roll into later
 * months.
 *
 * @param dayOffset - Number of days to add to 2026-06-01.
 */
function isoDay(dayOffset: number): string {
  const anchor = new Date(Date.UTC(2026, 5, 1));
  // Out-of-range day numbers roll over into adjacent months.
  anchor.setUTCDate(1 + dayOffset);
  return anchor.toISOString();
}
517
+
518
/**
 * Formats a number as a string of at least two characters, prefixing a single
 * `0` when its decimal form is only one character long.
 *
 * @param value - Number to format.
 */
function pad(value: number): string {
  const digits = String(value);
  if (digits.length >= 2) {
    return digits;
  }
  return `0${digits}`;
}
521
+
522
/**
 * Turns a phrase into a single lowercase token: every run of characters other
 * than `a-z` and `0-9` (after lowercasing) becomes one underscore. Leading or
 * trailing runs also become underscores.
 *
 * @param input - Phrase to convert.
 */
function topicToken(input: string): string {
  const lowered = input.toLowerCase();
  const segments = lowered.split(/[^a-z0-9]+/);
  return segments.join('_');
}
525
+
526
/**
 * Splits text into lowercase tokens, where a token is a maximal run of `a-z`
 * or `0-9` characters. All other characters act as separators, and text with
 * no such characters yields an empty array.
 *
 * @param text - Text to tokenize.
 * @returns Tokens in order of appearance; duplicates are kept.
 */
function tokenBag(text: string): string[] {
  const runs = text.toLowerCase().match(/[a-z0-9]+/g);
  // `match` returns null when nothing matches; copy into a plain array otherwise.
  return runs === null ? [] : Array.from(runs);
}