@1mbrain/benchmarks 0.1.1

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (69) hide show
  1. package/README.md +85 -0
  2. package/fixtures/1mbrain-focused-mini/1mbrain-focused-mini.json +928 -0
  3. package/fixtures/1mbrain-focused-mini/README.md +45 -0
  4. package/fixtures/adversarial-memory/dataset_claude_adversarial.json +3333 -0
  5. package/fixtures/adversarial-memory/dataset_gemini_adversarial_memory.json +2984 -0
  6. package/fixtures/balanced-mini/dataset_claude_balanced_mini.json +2077 -0
  7. package/fixtures/balanced-mini/dataset_gemini_balanced_mini.json +1995 -0
  8. package/fixtures/generate_datasets.js +1741 -0
  9. package/fixtures/graph-stress-hard/README.md +43 -0
  10. package/fixtures/graph-stress-hard/dataset_graph_stress_hard.json +4374 -0
  11. package/fixtures/graph-stress-hard/generate_graph_stress_hard.js +526 -0
  12. package/fixtures/realistic-medium/dataset_claude_realistic_medium.json +7462 -0
  13. package/fixtures/realistic-medium/dataset_gemini_realistic_medium.json +7277 -0
  14. package/fixtures/realistic-medium/gen_claude_medium.js +600 -0
  15. package/package.json +22 -0
  16. package/reports/benchmark_report.md +48 -0
  17. package/reports/benchmark_report_claude_adversarial.md +42 -0
  18. package/reports/benchmark_report_claude_adversarial_adaptive.md +42 -0
  19. package/reports/benchmark_report_claude_adversarial_adaptive2_fast.md +42 -0
  20. package/reports/benchmark_report_claude_adversarial_adaptive_fast.md +42 -0
  21. package/reports/benchmark_report_claude_adversarial_rerank.md +42 -0
  22. package/reports/benchmark_report_claude_balanced_mini.md +42 -0
  23. package/reports/benchmark_report_claude_balanced_mini_adaptive.md +42 -0
  24. package/reports/benchmark_report_claude_balanced_mini_adaptive2_fast.md +42 -0
  25. package/reports/benchmark_report_claude_balanced_mini_adaptive_fast.md +42 -0
  26. package/reports/benchmark_report_claude_balanced_mini_rerank.md +42 -0
  27. package/reports/benchmark_report_claude_realistic_medium.md +42 -0
  28. package/reports/benchmark_report_claude_realistic_medium_adaptive.md +42 -0
  29. package/reports/benchmark_report_claude_realistic_medium_adaptive2_fast.md +42 -0
  30. package/reports/benchmark_report_claude_realistic_medium_adaptive_fast.md +42 -0
  31. package/reports/benchmark_report_claude_realistic_medium_evidence_rerank_local.md +42 -0
  32. package/reports/benchmark_report_claude_realistic_medium_openai_evidence_rerank.md +41 -0
  33. package/reports/benchmark_report_claude_realistic_medium_openai_multi_signal.md +41 -0
  34. package/reports/benchmark_report_claude_realistic_medium_openai_multi_signal_scoped.md +41 -0
  35. package/reports/benchmark_report_claude_realistic_medium_openai_phase8_no_judge.md +42 -0
  36. package/reports/benchmark_report_claude_realistic_medium_openai_rankingpolicy.md +41 -0
  37. package/reports/benchmark_report_claude_realistic_medium_openai_stale_filter.md +41 -0
  38. package/reports/benchmark_report_claude_realistic_medium_openai_stale_filter_absence_fix.md +41 -0
  39. package/reports/benchmark_report_claude_realistic_medium_openai_write_time_invalidation.md +41 -0
  40. package/reports/benchmark_report_claude_realistic_medium_rerank.md +42 -0
  41. package/reports/benchmark_report_claude_realistic_medium_stale_filter_local.md +42 -0
  42. package/reports/benchmark_report_graph_stress_hard.md +42 -0
  43. package/reports/benchmark_report_graph_stress_hard_absence_fix.md +42 -0
  44. package/reports/benchmark_report_graph_stress_hard_adaptive.md +42 -0
  45. package/reports/benchmark_report_graph_stress_hard_evidence_rerank.md +42 -0
  46. package/reports/benchmark_report_graph_stress_hard_multi_signal_current_guardrail.md +42 -0
  47. package/reports/benchmark_report_graph_stress_hard_multi_signal_guardrail_fixed.md +42 -0
  48. package/reports/benchmark_report_graph_stress_hard_multi_signal_local.md +42 -0
  49. package/reports/benchmark_report_graph_stress_hard_multi_signal_scoped_guardrail.md +42 -0
  50. package/reports/benchmark_report_graph_stress_hard_multi_signal_vector_pure_guardrail.md +42 -0
  51. package/reports/benchmark_report_graph_stress_hard_phase8_sdk_guardrail.md +42 -0
  52. package/reports/benchmark_report_graph_stress_hard_rerank.md +42 -0
  53. package/reports/benchmark_report_graph_stress_hard_stale_filter.md +42 -0
  54. package/reports/benchmark_report_graph_stress_hard_write_time_invalidation.md +42 -0
  55. package/results/.gitignore +2 -0
  56. package/src/adapters/1mbrain.ts +317 -0
  57. package/src/adapters/keyword-embedding.ts +48 -0
  58. package/src/adapters/mem0.ts +124 -0
  59. package/src/adapters/qdrant.ts +214 -0
  60. package/src/adapters/unavailable.ts +49 -0
  61. package/src/adapters/vector-baseline.ts +149 -0
  62. package/src/datasets/focused-mini.ts +158 -0
  63. package/src/datasets/synthetic-agent-memory.ts +532 -0
  64. package/src/llm-evaluator.ts +262 -0
  65. package/src/metrics.ts +482 -0
  66. package/src/provider.ts +151 -0
  67. package/src/runner.ts +635 -0
  68. package/tsconfig.json +10 -0
  69. package/tsconfig.tsbuildinfo +1 -0
@@ -0,0 +1,1741 @@
1
+ import fs from 'node:fs';
2
+ import path from 'node:path';
3
+ import { fileURLToPath } from 'node:url';
4
+
5
+ const __filename = fileURLToPath(import.meta.url); // ES modules define no __filename global; derive this file's path from its module URL.
6
+ const __dirname = path.dirname(__filename); // Directory that contains this script.
7
+
8
// Stop words ignored when comparing question text against memory text.
// Built once at module load instead of on every call; duplicate entries from
// the original literal are removed (a Set ignores them anyway).
const KEYWORD_OVERLAP_STOP_WORDS = new Set([
  'what', 'is', 'the', 'how', 'to', 'for', 'of', 'in', 'on', 'a', 'an', 'and',
  'should', 'we', 'are', 'you', 'i', 'my', 'which', 'was', 'were', 'who',
  'whom', 'where', 'why', 'can', 'could', 'would', 'will', 'do', 'does',
  'did', 'has', 'have', 'had', 'at',
]);

/**
 * Computes the stop-word filtered keyword overlap between a question and a
 * memory text.
 *
 * Both texts are lower-cased, stripped of every character that is not
 * `a-z`, `0-9` or whitespace (so "time-stepping" becomes "timestepping"),
 * split on whitespace, and filtered against the stop-word list.
 *
 * @param {string} qText - Question text. `null`/`undefined` is treated as empty.
 * @param {string} mText - Memory text. `null`/`undefined` is treated as empty.
 * @returns {number} Fraction (0..1) of distinct question tokens that also
 *   appear in the memory text; 0 when the question has no usable tokens.
 */
function getKeywordOverlap(qText, mText) {
  // String(... ?? '') keeps a missing text from throwing on .toLowerCase().
  const tokenize = (text) =>
    String(text ?? '')
      .toLowerCase()
      .replace(/[^a-z0-9\s]/g, '')
      .split(/\s+/)
      .filter((word) => word && !KEYWORD_OVERLAP_STOP_WORDS.has(word));

  const questionTokens = new Set(tokenize(qText));
  if (questionTokens.size === 0) return 0;

  // Deduplicating the memory side means each shared token counts exactly once.
  const memoryTokens = new Set(tokenize(mText));
  let overlapCount = 0;
  for (const token of questionTokens) {
    if (memoryTokens.has(token)) overlapCount += 1;
  }
  return overlapCount / questionTokens.size;
}
27
+
28
+ // ----------------------------------------------------
29
+ // DATASET A: BALANCED MINI
30
+ // 8 conversations, 72 memories (9 per conv), 40 questions (5 per conv)
31
+ // ----------------------------------------------------
32
+ const datasetA = {
33
+ name: "balanced-mini",
34
+ description: "Fast balanced memory benchmark smoke test containing 8 conversations with 72 memory records and 40 questions.",
35
+ generated_at: new Date().toISOString().split('T')[0],
36
+ fairness_notes: [
37
+ "No memory passport, decay, or explicit graph schema dependencies are assumed.",
38
+ "Questions are distributed across exact retrieval, paraphrased semantic queries, and temporal updates.",
39
+ "Graph-heavy questions are kept below 15% to prevent bias towards graph memory models."
40
+ ],
41
+ conversations: [
42
+ {
43
+ conversation_id: "da_c01_fastapi",
44
+ agent_id: "da_agent_fastapi",
45
+ domain: "software",
46
+ memory_records: [
47
+ {
48
+ id: "da_c01_m01",
49
+ type: "semantic",
50
+ timestamp: "2026-06-10T10:00:00Z",
51
+ content: "The developer prefers using Uvicorn with 4 workers for local development of the FastAPI server.",
52
+ tags: ["fastapi", "uvicorn", "local-dev"],
53
+ importance: 0.8,
54
+ metadata: { source_turn: "t01", speaker: "user" }
55
+ },
56
+ {
57
+ id: "da_c01_m02",
58
+ type: "semantic",
59
+ timestamp: "2026-06-10T10:05:00Z",
60
+ content: "The database connection URI template is postgresql://user:pass@localhost:5432/dev_db.",
61
+ tags: ["fastapi", "database", "postgres"],
62
+ importance: 0.85,
63
+ metadata: { source_turn: "t02", speaker: "user" }
64
+ },
65
+ {
66
+ id: "da_c01_m03",
67
+ type: "procedural",
68
+ timestamp: "2026-06-10T10:10:00Z",
69
+ content: "To update the database schema, run the command 'alembic upgrade head' in the root directory.",
70
+ tags: ["fastapi", "alembic", "migration"],
71
+ importance: 0.9,
72
+ metadata: { source_turn: "t03", speaker: "user" }
73
+ },
74
+ {
75
+ id: "da_c01_m04",
76
+ type: "episodic",
77
+ timestamp: "2026-06-10T10:15:00Z",
78
+ content: "At 09:00, the database connection failed because the local Postgres container was stopped.",
79
+ tags: ["fastapi", "database", "error"],
80
+ importance: 0.7,
81
+ metadata: { source_turn: "t04", speaker: "assistant" }
82
+ },
83
+ {
84
+ id: "da_c01_m05",
85
+ type: "semantic",
86
+ timestamp: "2026-06-11T09:00:00Z",
87
+ content: "On June 11, the developer decided that SQLite should be used for testing, replacing Postgres.",
88
+ tags: ["fastapi", "database", "testing"],
89
+ importance: 0.8,
90
+ metadata: { source_turn: "t05", speaker: "user" }
91
+ },
92
+ {
93
+ id: "da_c01_m06",
94
+ type: "episodic",
95
+ timestamp: "2026-06-11T09:10:00Z",
96
+ content: "At 10:15, the developer reported that tests passed using the SQLite memory driver.",
97
+ tags: ["fastapi", "testing", "success"],
98
+ importance: 0.75,
99
+ metadata: { source_turn: "t06", speaker: "assistant" }
100
+ },
101
+ {
102
+ id: "da_c01_m07",
103
+ type: "semantic",
104
+ timestamp: "2026-06-12T11:00:00Z",
105
+ content: "Noise: The project logo uses a green and blue color palette representing growth and stability.",
106
+ tags: ["noise", "logo"],
107
+ importance: 0.2,
108
+ metadata: { source_turn: "t07", speaker: "user" }
109
+ },
110
+ {
111
+ id: "da_c01_m08",
112
+ type: "semantic",
113
+ timestamp: "2026-06-12T11:05:00Z",
114
+ content: "Noise: The server logs are stored in the folder /var/log/fastapi/app.log.",
115
+ tags: ["noise", "logs"],
116
+ importance: 0.3,
117
+ metadata: { source_turn: "t08", speaker: "assistant" }
118
+ },
119
+ {
120
+ id: "da_c01_m09",
121
+ type: "semantic",
122
+ timestamp: "2026-06-13T14:00:00Z",
123
+ content: "The latest preference is to deploy on GCP Cloud Run using dockerized builds, replacing the older AWS ECS plan.",
124
+ tags: ["fastapi", "deployment", "gcp"],
125
+ importance: 0.9,
126
+ metadata: { source_turn: "t09", speaker: "user" }
127
+ }
128
+ ],
129
+ questions: [
130
+ {
131
+ question_id: "da_c01_q01",
132
+ category: "atomic_fact_recall",
133
+ question: "What connection URI template is configured for the dev database?",
134
+ expected_answer: "postgresql://user:pass@localhost:5432/dev_db",
135
+ acceptable_answer_criteria: ["postgresql://user:pass@localhost:5432/dev_db", "Postgres dev_db URI"],
136
+ required_memory_ids: ["da_c01_m02"],
137
+ forbidden_memory_ids: [],
138
+ difficulty: "easy",
139
+ architecture_bias_risk: "low",
140
+ fairness_note: "Direct exact match fact retrieval, basic vector search works perfectly."
141
+ },
142
+ {
143
+ question_id: "da_c01_q02",
144
+ category: "paraphrased_semantic_recall",
145
+ question: "What is the preferred setup for running the local API server?",
146
+ expected_answer: "Uvicorn with 4 workers",
147
+ acceptable_answer_criteria: ["Uvicorn", "4 workers"],
148
+ required_memory_ids: ["da_c01_m01"],
149
+ forbidden_memory_ids: [],
150
+ difficulty: "medium",
151
+ architecture_bias_risk: "low",
152
+ fairness_note: "Paraphrasing evaluates semantic similarity retrieval rather than simple keyword overlap."
153
+ },
154
+ {
155
+ question_id: "da_c01_q03",
156
+ category: "temporal_update",
157
+ question: "Where should the application be deployed according to the latest decision?",
158
+ expected_answer: "GCP Cloud Run using dockerized builds",
159
+ acceptable_answer_criteria: ["GCP Cloud Run", "GCP", "Cloud Run"],
160
+ required_memory_ids: ["da_c01_m09"],
161
+ forbidden_memory_ids: [],
162
+ difficulty: "medium",
163
+ architecture_bias_risk: "low",
164
+ fairness_note: "Requires checking timestamps or explicit updates to ignore outdated AWS plan."
165
+ },
166
+ {
167
+ question_id: "da_c01_q04",
168
+ category: "procedural_recall",
169
+ question: "What command must be executed to migrate the database schema?",
170
+ expected_answer: "alembic upgrade head",
171
+ acceptable_answer_criteria: ["alembic upgrade head", "alembic"],
172
+ required_memory_ids: ["da_c01_m03"],
173
+ forbidden_memory_ids: [],
174
+ difficulty: "easy",
175
+ architecture_bias_risk: "low",
176
+ fairness_note: "Retrieves a simple step-by-step procedural instruction."
177
+ },
178
+ {
179
+ question_id: "da_c01_q05",
180
+ category: "noise_resistance",
181
+ question: "What is the file path where FastAPI server log files are written?",
182
+ expected_answer: "/var/log/fastapi/app.log",
183
+ acceptable_answer_criteria: ["/var/log/fastapi/app.log", "/var/log/fastapi"],
184
+ required_memory_ids: ["da_c01_m08"],
185
+ forbidden_memory_ids: ["da_c01_m07"],
186
+ difficulty: "easy",
187
+ architecture_bias_risk: "low",
188
+ fairness_note: "Verifies the system can ignore unrelated design/color noise."
189
+ }
190
+ ]
191
+ },
192
+ {
193
+ conversation_id: "da_c02_smart_home",
194
+ agent_id: "da_agent_smart_home",
195
+ domain: "personal_assistant",
196
+ memory_records: [
197
+ {
198
+ id: "da_c02_m01",
199
+ type: "semantic",
200
+ timestamp: "2026-06-12T08:00:00Z",
201
+ content: "The family prefers keeping the living room thermostat at 71 degrees Fahrenheit during daytime.",
202
+ tags: ["home", "thermostat", "temperature"],
203
+ importance: 0.8,
204
+ metadata: { source_turn: "t01", speaker: "user" }
205
+ },
206
+ {
207
+ id: "da_c02_m02",
208
+ type: "semantic",
209
+ timestamp: "2026-06-12T08:05:00Z",
210
+ content: "The smart lock code for the back door is set to 4920 for house cleaner access.",
211
+ tags: ["home", "security", "smart-lock"],
212
+ importance: 0.9,
213
+ metadata: { source_turn: "t02", speaker: "user" }
214
+ },
215
+ {
216
+ id: "da_c02_m03",
217
+ type: "episodic",
218
+ timestamp: "2026-06-12T08:10:00Z",
219
+ content: "The house cleaner, Maria, visits every Thursday at 10:00 AM.",
220
+ tags: ["schedule", "cleaner", "house"],
221
+ importance: 0.75,
222
+ metadata: { source_turn: "t03", speaker: "user" },
223
+ associations: [
224
+ { target_id: "da_c02_m02", strength: 0.85, reason: "cleaner uses smart lock code" }
225
+ ]
226
+ },
227
+ {
228
+ id: "da_c02_m04",
229
+ type: "semantic",
230
+ timestamp: "2026-06-13T09:00:00Z",
231
+ content: "Noise: The living room couch is covered with a beige slipcover to prevent dog hair stains.",
232
+ tags: ["noise", "furniture"],
233
+ importance: 0.2,
234
+ metadata: { source_turn: "t04", speaker: "user" }
235
+ },
236
+ {
237
+ id: "da_c02_m05",
238
+ type: "semantic",
239
+ timestamp: "2026-06-14T09:00:00Z",
240
+ content: "On June 14, the daytime living room thermostat preference was updated to 73 degrees Fahrenheit for energy saving.",
241
+ tags: ["home", "thermostat", "temperature"],
242
+ importance: 0.85,
243
+ metadata: { source_turn: "t05", speaker: "user" }
244
+ },
245
+ {
246
+ id: "da_c02_m06",
247
+ type: "episodic",
248
+ timestamp: "2026-06-14T10:00:00Z",
249
+ content: "The back door smart lock battery level dropped to 10% and needs replacement soon.",
250
+ tags: ["home", "security", "battery"],
251
+ importance: 0.7,
252
+ metadata: { source_turn: "t06", speaker: "assistant" }
253
+ },
254
+ {
255
+ id: "da_c02_m07",
256
+ type: "semantic",
257
+ timestamp: "2026-06-15T11:00:00Z",
258
+ content: "The dog's name is Barnaby, and he eats dry kibble twice a day at 8:00 AM and 6:00 PM.",
259
+ tags: ["pet", "schedule", "dog"],
260
+ importance: 0.8,
261
+ metadata: { source_turn: "t07", speaker: "user" }
262
+ },
263
+ {
264
+ id: "da_c02_m08",
265
+ type: "semantic",
266
+ timestamp: "2026-06-15T11:15:00Z",
267
+ content: "Noise: The backyard sprinkler system runs on Mondays and Thursdays at 6:00 AM.",
268
+ tags: ["noise", "sprinklers"],
269
+ importance: 0.3,
270
+ metadata: { source_turn: "t08", speaker: "user" }
271
+ },
272
+ {
273
+ id: "da_c02_m09",
274
+ type: "semantic",
275
+ timestamp: "2026-06-16T12:00:00Z",
276
+ content: "The house cleaner's contact number is 555-0192.",
277
+ tags: ["cleaner", "contact"],
278
+ importance: 0.85,
279
+ metadata: { source_turn: "t09", speaker: "user" }
280
+ }
281
+ ],
282
+ questions: [
283
+ {
284
+ question_id: "da_c02_q01",
285
+ category: "atomic_fact_recall",
286
+ question: "What is the contact number of the house cleaner?",
287
+ expected_answer: "555-0192",
288
+ acceptable_answer_criteria: ["555-0192", "cleaner's phone number is 555-0192"],
289
+ required_memory_ids: ["da_c02_m09"],
290
+ forbidden_memory_ids: [],
291
+ difficulty: "easy",
292
+ architecture_bias_risk: "low",
293
+ fairness_note: "Basic keyword overlap fact retrieval."
294
+ },
295
+ {
296
+ question_id: "da_c02_q02",
297
+ category: "paraphrased_semantic_recall",
298
+ question: "What is the feeding schedule for Barnaby?",
299
+ expected_answer: "Twice a day at 8:00 AM and 6:00 PM.",
300
+ acceptable_answer_criteria: ["8:00 AM and 6:00 PM", "twice a day"],
301
+ required_memory_ids: ["da_c02_m07"],
302
+ forbidden_memory_ids: [],
303
+ difficulty: "medium",
304
+ architecture_bias_risk: "low",
305
+ fairness_note: "Tests semantic mapping of 'feeding schedule' to 'eats dry kibble'."
306
+ },
307
+ {
308
+ question_id: "da_c02_q03",
309
+ category: "temporal_update",
310
+ question: "What temperature should the living room be set to during the day?",
311
+ expected_answer: "73 degrees Fahrenheit",
312
+ acceptable_answer_criteria: ["73 degrees", "73 F", "73"],
313
+ required_memory_ids: ["da_c02_m05"],
314
+ forbidden_memory_ids: ["da_c02_m01"],
315
+ difficulty: "medium",
316
+ architecture_bias_risk: "low",
317
+ fairness_note: "Requires checking timestamps to ensure the newer 73F setting overrides the older 71F setting."
318
+ },
319
+ {
320
+ question_id: "da_c02_q04",
321
+ category: "contradiction_resolution",
322
+ question: "Is the living room thermostat still set to 71 degrees?",
323
+ expected_answer: "No, it was updated to 73 degrees Fahrenheit on June 14.",
324
+ acceptable_answer_criteria: ["No", "Updated to 73"],
325
+ required_memory_ids: ["da_c02_m05", "da_c02_m01"],
326
+ forbidden_memory_ids: [],
327
+ difficulty: "medium",
328
+ architecture_bias_risk: "low",
329
+ fairness_note: "Resolves contradiction by looking at the update history."
330
+ },
331
+ {
332
+ question_id: "da_c02_q05",
333
+ category: "multi_hop_association",
334
+ question: "What door entry code does Maria need when she visits on Thursdays?",
335
+ expected_answer: "4920",
336
+ acceptable_answer_criteria: ["4920", "smart lock code 4920"],
337
+ required_memory_ids: ["da_c02_m02", "da_c02_m03"],
338
+ forbidden_memory_ids: [],
339
+ difficulty: "hard",
340
+ architecture_bias_risk: "medium",
341
+ fairness_note: "Requires linking Maria -> house cleaner -> smart lock code (4920). Solvable with either graph search or multi-evidence semantic retrieval."
342
+ }
343
+ ]
344
+ },
345
+ {
346
+ conversation_id: "da_c03_quantum_sim",
347
+ agent_id: "da_agent_quantum_sim",
348
+ domain: "research",
349
+ memory_records: [
350
+ {
351
+ id: "da_c03_m01",
352
+ type: "semantic",
353
+ timestamp: "2026-06-15T09:00:00Z",
354
+ content: "The quantum simulator code uses a grid spacing parameter dx = 0.05 microns.",
355
+ tags: ["physics", "quantum", "simulation"],
356
+ importance: 0.8,
357
+ metadata: { source_turn: "t01", speaker: "user" }
358
+ },
359
+ {
360
+ id: "da_c03_m02",
361
+ type: "semantic",
362
+ timestamp: "2026-06-15T09:05:00Z",
363
+ content: "The Hamiltonian solver converges only when the relaxation factor omega is set to 1.25.",
364
+ tags: ["physics", "solver", "convergence"],
365
+ importance: 0.85,
366
+ metadata: { source_turn: "t02", speaker: "user" }
367
+ },
368
+ {
369
+ id: "da_c03_m03",
370
+ type: "episodic",
371
+ timestamp: "2026-06-15T09:10:00Z",
372
+ content: "Simulation run #402 stalled after 1500 iterations due to a floating point exception in the grid boundary code.",
373
+ tags: ["physics", "simulation", "bug"],
374
+ importance: 0.75,
375
+ metadata: { source_turn: "t03", speaker: "assistant" }
376
+ },
377
+ {
378
+ id: "da_c03_m04",
379
+ type: "semantic",
380
+ timestamp: "2026-06-15T09:15:00Z",
381
+ content: "Noise: Dr. Henderson recommends using Python's NumPy library instead of SciPy for matrix division.",
382
+ tags: ["noise", "numpy"],
383
+ importance: 0.3,
384
+ metadata: { source_turn: "t04", speaker: "user" }
385
+ },
386
+ {
387
+ id: "da_c03_m05",
388
+ type: "semantic",
389
+ timestamp: "2026-06-16T10:00:00Z",
390
+ content: "The grid spacing parameter dx was decreased to 0.02 microns to resolve numerical stability issues.",
391
+ tags: ["physics", "quantum", "simulation"],
392
+ importance: 0.85,
393
+ metadata: { source_turn: "t05", speaker: "user" }
394
+ },
395
+ {
396
+ id: "da_c03_m06",
397
+ type: "semantic",
398
+ timestamp: "2026-06-16T10:10:00Z",
399
+ content: "Noise: The simulator outputs are saved in high-density HDF5 format by default.",
400
+ tags: ["noise", "hdf5"],
401
+ importance: 0.25,
402
+ metadata: { source_turn: "t06", speaker: "assistant" }
403
+ },
404
+ {
405
+ id: "da_c03_m07",
406
+ type: "semantic",
407
+ timestamp: "2026-06-17T11:00:00Z",
408
+ content: "The quantum simulator utilizes the Crank-Nicolson method for time-stepping calculations.",
409
+ tags: ["physics", "simulation", "math"],
410
+ importance: 0.8,
411
+ metadata: { source_turn: "t07", speaker: "user" }
412
+ },
413
+ {
414
+ id: "da_c03_m08",
415
+ type: "semantic",
416
+ timestamp: "2026-06-17T11:20:00Z",
417
+ content: "We use the GPU-accelerated CuPy backend for sparse matrix operations.",
418
+ tags: ["physics", "simulation", "gpu"],
419
+ importance: 0.85,
420
+ metadata: { source_turn: "t08", speaker: "user" }
421
+ },
422
+ {
423
+ id: "da_c03_m09",
424
+ type: "semantic",
425
+ timestamp: "2026-06-18T13:00:00Z",
426
+ content: "The research project is titled 'Project Quasar' and is funded until December 2027.",
427
+ tags: ["research", "admin"],
428
+ importance: 0.7,
429
+ metadata: { source_turn: "t09", speaker: "user" }
430
+ }
431
+ ],
432
+ questions: [
433
+ {
434
+ question_id: "da_c03_q01",
435
+ category: "atomic_fact_recall",
436
+ question: "What relaxation factor is required for the Hamiltonian solver to converge?",
437
+ expected_answer: "1.25",
438
+ acceptable_answer_criteria: ["1.25", "omega is set to 1.25"],
439
+ required_memory_ids: ["da_c03_m02"],
440
+ forbidden_memory_ids: [],
441
+ difficulty: "easy",
442
+ architecture_bias_risk: "low",
443
+ fairness_note: "Direct exact match fact retrieval."
444
+ },
445
+ {
446
+ question_id: "da_c03_q02",
447
+ category: "paraphrased_semantic_recall",
448
+ question: "What mathematical approach is utilized for updating calculations across time?",
449
+ expected_answer: "Crank-Nicolson method",
450
+ acceptable_answer_criteria: ["Crank-Nicolson method", "Crank-Nicolson"],
451
+ required_memory_ids: ["da_c03_m07"],
452
+ forbidden_memory_ids: [],
453
+ difficulty: "medium",
454
+ architecture_bias_risk: "low",
455
+ fairness_note: "Tests paraphrased query match mapping 'updating calculations across time' to 'time-stepping'."
456
+ },
457
+ {
458
+ question_id: "da_c03_q03",
459
+ category: "temporal_update",
460
+ question: "What is the current grid spacing parameter dx used in the simulation?",
461
+ expected_answer: "0.02 microns",
462
+ acceptable_answer_criteria: ["0.02 microns", "0.02"],
463
+ required_memory_ids: ["da_c03_m05"],
464
+ forbidden_memory_ids: ["da_c03_m01"],
465
+ difficulty: "medium",
466
+ architecture_bias_risk: "low",
467
+ fairness_note: "Requires checking timestamps to ensure the newer 0.02 setting overrides the older 0.05 setting."
468
+ },
469
+ {
470
+ question_id: "da_c03_q04",
471
+ category: "contradiction_resolution",
472
+ question: "Did we increase the grid spacing parameter dx in our latest update?",
473
+ expected_answer: "No, it was decreased to 0.02 microns to resolve numerical stability issues.",
474
+ acceptable_answer_criteria: ["No, it was decreased", "No"],
475
+ required_memory_ids: ["da_c03_m05", "da_c03_m01"],
476
+ forbidden_memory_ids: [],
477
+ difficulty: "medium",
478
+ architecture_bias_risk: "low",
479
+ fairness_note: "Tests the ability to resolve the direction of change in contradiction resolution."
480
+ },
481
+ {
482
+ question_id: "da_c03_q05",
483
+ category: "abstention",
484
+ question: "Which specific GPU model is used to run the CuPy simulator?",
485
+ expected_answer: "not enough information",
486
+ acceptable_answer_criteria: ["not enough information", "insufficient evidence", "unknown"],
487
+ required_memory_ids: [],
488
+ forbidden_memory_ids: ["da_c03_m08"],
489
+ difficulty: "hard",
490
+ architecture_bias_risk: "low",
491
+ fairness_note: "Verifies the system correctly abstains when the record mentions a GPU backend (CuPy) but not the specific GPU model."
492
+ }
493
+ ]
494
+ },
495
+ {
496
+ conversation_id: "da_c04_tokyo_itinerary",
497
+ agent_id: "da_agent_tokyo_itinerary",
498
+ domain: "travel",
499
+ memory_records: [
500
+ {
501
+ id: "da_c04_m01",
502
+ type: "semantic",
503
+ timestamp: "2026-06-01T10:00:00Z",
504
+ content: "The traveler has a booking at Hotel Claska in Meguro, Tokyo, from October 12 to October 18.",
505
+ tags: ["travel", "hotel", "tokyo"],
506
+ importance: 0.85,
507
+ metadata: { source_turn: "t01", speaker: "user" }
508
+ },
509
+ {
510
+ id: "da_c04_m02",
511
+ type: "semantic",
512
+ timestamp: "2026-06-01T10:05:00Z",
513
+ content: "The traveler prefers flying window seats on long-haul flights to sleep easily.",
514
+ tags: ["travel", "flight", "preference"],
515
+ importance: 0.8,
516
+ metadata: { source_turn: "t02", speaker: "user" }
517
+ },
518
+ {
519
+ id: "da_c04_m03",
520
+ type: "episodic",
521
+ timestamp: "2026-06-02T11:00:00Z",
522
+ content: "The traveler booked flight JL005 from JFK to Haneda, departing at 1:15 PM on October 11.",
523
+ tags: ["travel", "flight", "tokyo"],
524
+ importance: 0.9,
525
+ metadata: { source_turn: "t03", speaker: "user" },
526
+ associations: [
527
+ { target_id: "da_c04_m02", strength: 0.75, reason: "flight seat preferences" }
528
+ ]
529
+ },
530
+ {
531
+ id: "da_c04_m04",
532
+ type: "semantic",
533
+ timestamp: "2026-06-02T11:05:00Z",
534
+ content: "Noise: The traveler's luggage is a medium-sized hard-shell suitcase in navy blue color.",
535
+ tags: ["noise", "traveler-suitfall"],
536
+ importance: 0.2,
537
+ metadata: { source_turn: "t04", speaker: "user" }
538
+ },
539
+ {
540
+ id: "da_c04_m05",
541
+ type: "semantic",
542
+ timestamp: "2026-06-03T09:00:00Z",
543
+ content: "On June 3, the traveler changed the Tokyo accommodation plan to stay at Trunk Hotel in Shibuya instead of Hotel Claska.",
544
+ tags: ["travel", "hotel", "tokyo"],
545
+ importance: 0.85,
546
+ metadata: { source_turn: "t05", speaker: "user" }
547
+ },
548
+ {
549
+ id: "da_c04_m06",
550
+ type: "episodic",
551
+ timestamp: "2026-06-04T12:00:00Z",
552
+ content: "The traveler bought a museum ticket for the teamLab Planets exhibit on October 14 at 2:00 PM.",
553
+ tags: ["travel", "activity", "museum"],
554
+ importance: 0.8,
555
+ metadata: { source_turn: "t06", speaker: "user" }
556
+ },
557
+ {
558
+ id: "da_c04_m07",
559
+ type: "semantic",
560
+ timestamp: "2026-06-04T12:05:00Z",
561
+ content: "Noise: Tokyo temperatures in mid-October average 15 to 22 degrees Celsius.",
562
+ tags: ["noise", "weather"],
563
+ importance: 0.3,
564
+ metadata: { source_turn: "t07", speaker: "assistant" }
565
+ },
566
+ {
567
+ id: "da_c04_m08",
568
+ type: "semantic",
569
+ timestamp: "2026-06-05T10:00:00Z",
570
+ content: "The traveler enjoys visiting traditional tempura restaurants and prefers reservations for dinner.",
571
+ tags: ["travel", "food", "dining"],
572
+ importance: 0.75,
573
+ metadata: { source_turn: "t08", speaker: "user" }
574
+ },
575
+ {
576
+ id: "da_c04_m09",
577
+ type: "semantic",
578
+ timestamp: "2026-06-05T10:30:00Z",
579
+ content: "The passenger name in the airline booking is listed as Sarah Miller.",
580
+ tags: ["travel", "identity"],
581
+ importance: 0.8,
582
+ metadata: { source_turn: "t09", speaker: "user" }
583
+ }
584
+ ],
585
+ questions: [
586
+ {
587
+ question_id: "da_c04_q01",
588
+ category: "atomic_fact_recall",
589
+ question: "What is the departure time and flight number for the JFK to Haneda flight?",
590
+ expected_answer: "Flight JL005 departing at 1:15 PM.",
591
+ acceptable_answer_criteria: ["Flight JL005", "1:15 PM", "JL005"],
592
+ required_memory_ids: ["da_c04_m03"],
593
+ forbidden_memory_ids: [],
594
+ difficulty: "easy",
595
+ architecture_bias_risk: "low",
596
+ fairness_note: "Direct exact match fact retrieval."
597
+ },
598
+ {
599
+ question_id: "da_c04_q02",
600
+ category: "paraphrased_semantic_recall",
601
+ question: "What type of seating does Sarah Miller prefer on long airplane journeys?",
602
+ expected_answer: "Window seats",
603
+ acceptable_answer_criteria: ["window", "window seat"],
604
+ required_memory_ids: ["da_c04_m02", "da_c04_m09"],
605
+ forbidden_memory_ids: [],
606
+ difficulty: "medium",
607
+ architecture_bias_risk: "low",
608
+ fairness_note: "Tests semantic mapping of 'long airplane journeys' to 'long-haul flights' and uses passenger name 'Sarah Miller' from another record."
609
+ },
610
+ {
611
+ question_id: "da_c04_q03",
612
+ category: "temporal_update",
613
+ question: "Where is the traveler staying in Tokyo during their visit?",
614
+ expected_answer: "Trunk Hotel in Shibuya",
615
+ acceptable_answer_criteria: ["Trunk Hotel", "Trunk Hotel in Shibuya", "Trunk"],
616
+ required_memory_ids: ["da_c04_m05"],
617
+ forbidden_memory_ids: ["da_c04_m01"],
618
+ difficulty: "medium",
619
+ architecture_bias_risk: "low",
620
+ fairness_note: "Requires retrieving the updated Trunk Hotel choice rather than the old Hotel Claska plan."
621
+ },
622
+ {
623
+ question_id: "da_c04_q04",
624
+ category: "noise_resistance",
625
+ question: "What is the date and time of the teamLab Planets ticket?",
626
+ expected_answer: "October 14 at 2:00 PM",
627
+ acceptable_answer_criteria: ["October 14", "2:00 PM"],
628
+ required_memory_ids: ["da_c04_m06"],
629
+ forbidden_memory_ids: ["da_c04_m07", "da_c04_m04"],
630
+ difficulty: "easy",
631
+ architecture_bias_risk: "low",
632
+ fairness_note: "Tests retrieval quality when ignores irrelevant weather and luggage color metadata."
633
+ },
634
+ {
635
+ question_id: "da_c04_q05",
636
+ category: "multi_hop_association",
637
+ question: "What seat selection is preferred for the traveler's flight JL005 on October 11?",
638
+ expected_answer: "Window seat",
639
+ acceptable_answer_criteria: ["window seat", "window"],
640
+ required_memory_ids: ["da_c04_m02", "da_c04_m03"],
641
+ forbidden_memory_ids: [],
642
+ difficulty: "medium",
643
+ architecture_bias_risk: "medium",
644
+ fairness_note: "Tests link between flight JL005 -> long-haul flight -> window seat preference."
645
+ }
646
+ ]
647
+ },
648
+ {
649
+ conversation_id: "da_c05_diabetes_admin",
650
+ agent_id: "da_agent_diabetes_admin",
651
+ domain: "health_admin",
652
+ memory_records: [
653
+ {
654
+ id: "da_c05_m01",
655
+ type: "semantic",
656
+ timestamp: "2026-06-05T08:00:00Z",
657
+ content: "The patient is insured under Blue Shield PPO, Policy ID #BS-9021-X.",
658
+ tags: ["health", "insurance", "policy"],
659
+ importance: 0.85,
660
+ metadata: { source_turn: "t01", speaker: "user" }
661
+ },
662
+ {
663
+ id: "da_c05_m02",
664
+ type: "semantic",
665
+ timestamp: "2026-06-05T08:05:00Z",
666
+ content: "The patient's endocrinologist is Dr. Robert Vance, located at the Vance Clinic on Oak Street.",
667
+ tags: ["health", "doctor", "endocrinologist"],
668
+ importance: 0.8,
669
+ metadata: { source_turn: "t02", speaker: "user" }
670
+ },
671
+ {
672
+ id: "da_c05_m03",
673
+ type: "episodic",
674
+ timestamp: "2026-06-05T08:10:00Z",
675
+ content: "The patient has a scheduled routine blood draw appointment on June 20 at 7:30 AM at Oak Laboratories.",
676
+ tags: ["health", "appointment", "labs"],
677
+ importance: 0.9,
678
+ metadata: { source_turn: "t03", speaker: "user" },
679
+ associations: [
680
+ { target_id: "da_c05_m02", strength: 0.8, reason: "doctor ordered blood draw" }
681
+ ]
682
+ },
683
+ {
684
+ id: "da_c05_m04",
685
+ type: "semantic",
686
+ timestamp: "2026-06-05T08:15:00Z",
687
+ content: "Noise: The Vance Clinic building has a red brick facade with a parking lot in the rear.",
688
+ tags: ["noise", "clinic"],
689
+ importance: 0.2,
690
+ metadata: { source_turn: "t04", speaker: "assistant" }
691
+ },
692
+ {
693
+ id: "da_c05_m05",
694
+ type: "semantic",
695
+ timestamp: "2026-06-06T09:00:00Z",
696
+ content: "On June 6, the patient's insurance plan was updated to Cigna Gold Open Access, Policy ID #CI-8401-Y, due to employer benefits change.",
697
+ tags: ["health", "insurance", "policy"],
698
+ importance: 0.9,
699
+ metadata: { source_turn: "t05", speaker: "user" }
700
+ },
701
+ {
702
+ id: "da_c05_m06",
703
+ type: "episodic",
704
+ timestamp: "2026-06-06T10:00:00Z",
705
+ content: "The patient completed their annual physical examination on June 2 and was advised to exercise 30 minutes daily.",
706
+ tags: ["health", "physical", "exercise"],
707
+ importance: 0.8,
708
+ metadata: { source_turn: "t06", speaker: "assistant" }
709
+ },
710
+ {
711
+ id: "da_c05_m07",
712
+ type: "semantic",
713
+ timestamp: "2026-06-07T11:00:00Z",
714
+ content: "The patient takes Metformin 500mg twice daily with meals to manage blood sugar levels.",
715
+ tags: ["health", "prescription", "diabetes"],
716
+ importance: 0.85,
717
+ metadata: { source_turn: "t07", speaker: "user" }
718
+ },
719
+ {
720
+ id: "da_c05_m08",
721
+ type: "semantic",
722
+ timestamp: "2026-06-07T11:15:00Z",
723
+ content: "Noise: Blue Shield PPO customer service phone line is open 24 hours for emergency inquiries.",
724
+ tags: ["noise", "insurance"],
725
+ importance: 0.3,
726
+ metadata: { source_turn: "t08", speaker: "user" }
727
+ },
728
+ {
729
+ id: "da_c05_m09",
730
+ type: "semantic",
731
+ timestamp: "2026-06-08T12:00:00Z",
732
+ content: "The primary pharmacy is Walgreens on 4th Avenue, which has a drive-through window.",
733
+ tags: ["health", "pharmacy"],
734
+ importance: 0.75,
735
+ metadata: { source_turn: "t09", speaker: "user" }
736
+ }
737
+ ],
738
+ questions: [
739
+ {
740
+ question_id: "da_c05_q01",
741
+ category: "atomic_fact_recall",
742
+ question: "What dosage of Metformin is the patient prescribed to take?",
743
+ expected_answer: "500mg twice daily",
744
+ acceptable_answer_criteria: ["500mg twice daily", "500mg", "twice a day"],
745
+ required_memory_ids: ["da_c05_m07"],
746
+ forbidden_memory_ids: [],
747
+ difficulty: "easy",
748
+ architecture_bias_risk: "low",
749
+ fairness_note: "Direct exact match fact retrieval."
750
+ },
751
+ {
752
+ question_id: "da_c05_q02",
753
+ category: "paraphrased_semantic_recall",
754
+ question: "Where does the patient get their prescriptions filled?",
755
+ expected_answer: "Walgreens on 4th Avenue",
756
+ acceptable_answer_criteria: ["Walgreens", "Walgreens on 4th Avenue"],
757
+ required_memory_ids: ["da_c05_m09"],
758
+ forbidden_memory_ids: [],
759
+ difficulty: "medium",
760
+ architecture_bias_risk: "low",
761
+ fairness_note: "Tests semantic mapping of 'where does the patient get their prescriptions filled' to 'primary pharmacy'."
762
+ },
763
+ {
764
+ question_id: "da_c05_q03",
765
+ category: "temporal_update",
766
+ question: "What is the patient's current insurance provider and Policy ID?",
767
+ expected_answer: "Cigna Gold Open Access, Policy ID #CI-8401-Y",
768
+ acceptable_answer_criteria: ["Cigna", "CI-8401-Y"],
769
+ required_memory_ids: ["da_c05_m05"],
770
+ forbidden_memory_ids: ["da_c05_m01"],
771
+ difficulty: "medium",
772
+ architecture_bias_risk: "low",
773
+ fairness_note: "Requires checking timestamps to ensure the Cigna policy overrides the old Blue Shield policy."
774
+ },
775
+ {
776
+ question_id: "da_c05_q04",
777
+ category: "contradiction_resolution",
778
+ question: "Is the patient's active health policy still Blue Shield PPO?",
779
+ expected_answer: "No, it was updated to Cigna Gold Open Access on June 6.",
780
+ acceptable_answer_criteria: ["No, it is Cigna", "No"],
781
+ required_memory_ids: ["da_c05_m05", "da_c05_m01"],
782
+ forbidden_memory_ids: [],
783
+ difficulty: "medium",
784
+ architecture_bias_risk: "low",
785
+ fairness_note: "Tests contradiction handling between old Blue Shield and current Cigna policies."
786
+ },
787
+ {
788
+ question_id: "da_c05_q05",
789
+ category: "multi_hop_association",
790
+ question: "At which laboratory is the blood draw requested by Dr. Robert Vance scheduled?",
791
+ expected_answer: "Oak Laboratories",
792
+ acceptable_answer_criteria: ["Oak Laboratories", "Oak Labs"],
793
+ required_memory_ids: ["da_c05_m03", "da_c05_m02"],
794
+ forbidden_memory_ids: [],
795
+ difficulty: "hard",
796
+ architecture_bias_risk: "medium",
797
+ fairness_note: "Requires connecting Dr. Vance -> ordered blood draw -> Oak Laboratories appointment."
798
+ }
799
+ ]
800
+ },
801
+ {
802
+ conversation_id: "da_c06_tax_prep",
803
+ agent_id: "da_agent_tax_prep",
804
+ domain: "finance_admin",
805
+ memory_records: [
806
+ {
807
+ id: "da_c06_m01",
808
+ type: "semantic",
809
+ timestamp: "2026-06-01T10:00:00Z",
810
+ content: "The user has a primary checking account at Chase Bank with routing number #***0912.",
811
+ tags: ["finance", "bank", "chase"],
812
+ importance: 0.8,
813
+ metadata: { source_turn: "t01", speaker: "user" }
814
+ },
815
+ {
816
+ id: "da_c06_m02",
817
+ type: "semantic",
818
+ timestamp: "2026-06-01T10:05:00Z",
819
+ content: "The tax consultant is Evelyn Mercer, who works at Mercer Tax Services.",
820
+ tags: ["finance", "tax", "consultant"],
821
+ importance: 0.85,
822
+ metadata: { source_turn: "t02", speaker: "user" }
823
+ },
824
+ {
825
+ id: "da_c06_m03",
826
+ type: "episodic",
827
+ timestamp: "2026-06-02T09:00:00Z",
828
+ content: "The user submitted Form 1099-NEC for freelance earnings of $14,200 from Apex Systems.",
829
+ tags: ["finance", "tax", "income"],
830
+ importance: 0.9,
831
+ metadata: { source_turn: "t03", speaker: "user" },
832
+ associations: [
833
+ { target_id: "da_c06_m02", strength: 0.8, reason: "Evelyn Mercer prepares tax submission" }
834
+ ]
835
+ },
836
+ {
837
+ id: "da_c06_m04",
838
+ type: "semantic",
839
+ timestamp: "2026-06-02T09:10:00Z",
840
+ content: "Noise: Chase Bank's branch in downtown has a revolving door and 4 teller counters.",
841
+ tags: ["noise", "chase"],
842
+ importance: 0.2,
843
+ metadata: { source_turn: "t04", speaker: "assistant" }
844
+ },
845
+ {
846
+ id: "da_c06_m05",
847
+ type: "semantic",
848
+ timestamp: "2026-06-03T11:00:00Z",
849
+ content: "On June 3, the user opened a business checking account at Silicon Valley Bank (SVB) to replace Chase for all future freelance income deposits.",
850
+ tags: ["finance", "bank", "svb"],
851
+ importance: 0.9,
852
+ metadata: { source_turn: "t05", speaker: "user" }
853
+ },
854
+ {
855
+ id: "da_c06_m06",
856
+ type: "episodic",
857
+ timestamp: "2026-06-03T12:00:00Z",
858
+ content: "The user paid a tax preparation deposit of $150 to Mercer Tax Services using their credit card.",
859
+ tags: ["finance", "payment", "tax"],
860
+ importance: 0.8,
861
+ metadata: { source_turn: "t06", speaker: "user" }
862
+ },
863
+ {
864
+ id: "da_c06_m07",
865
+ type: "semantic",
866
+ timestamp: "2026-06-04T10:00:00Z",
867
+ content: "The quarterly estimated tax payment deadline for Q2 is June 15.",
868
+ tags: ["finance", "tax", "deadline"],
869
+ importance: 0.85,
870
+ metadata: { source_turn: "t07", speaker: "assistant" }
871
+ },
872
+ {
873
+ id: "da_c06_m08",
874
+ type: "semantic",
875
+ timestamp: "2026-06-04T10:15:00Z",
876
+ content: "Noise: Mercer Tax Services logo has a scales of justice symbol in gold and navy colors.",
877
+ tags: ["noise", "tax"],
878
+ importance: 0.3,
879
+ metadata: { source_turn: "t08", speaker: "user" }
880
+ },
881
+ {
882
+ id: "da_c06_m09",
883
+ type: "semantic",
884
+ timestamp: "2026-06-05T13:00:00Z",
885
+ content: "The user has a personal retirement traditional IRA account at Fidelity with a 2026 contribution limit of $7,000.",
886
+ tags: ["finance", "ira", "fidelity"],
887
+ importance: 0.8,
888
+ metadata: { source_turn: "t09", speaker: "user" }
889
+ }
890
+ ],
891
+ questions: [
892
+ {
893
+ question_id: "da_c06_q01",
894
+ category: "atomic_fact_recall",
895
+ question: "What routing number is associated with the Chase Bank checking account?",
896
+ expected_answer: "#***0912",
897
+ acceptable_answer_criteria: ["#***0912", "routing number #***0912"],
898
+ required_memory_ids: ["da_c06_m01"],
899
+ forbidden_memory_ids: [],
900
+ difficulty: "easy",
901
+ architecture_bias_risk: "low",
902
+ fairness_note: "Direct exact match fact retrieval."
903
+ },
904
+ {
905
+ question_id: "da_c06_q02",
906
+ category: "paraphrased_semantic_recall",
907
+ question: "Who is handling the user's tax consultancy work?",
908
+ expected_answer: "Evelyn Mercer at Mercer Tax Services",
909
+ acceptable_answer_criteria: ["Evelyn Mercer", "Mercer Tax Services"],
910
+ required_memory_ids: ["da_c06_m02"],
911
+ forbidden_memory_ids: [],
912
+ difficulty: "medium",
913
+ architecture_bias_risk: "low",
914
+ fairness_note: "Tests semantic mapping of 'tax consultancy work' to 'tax consultant'."
915
+ },
916
+ {
917
+ question_id: "da_c06_q03",
918
+ category: "temporal_update",
919
+ question: "Where should future freelance earnings be deposited according to the latest decision?",
920
+ expected_answer: "Silicon Valley Bank (SVB)",
921
+ acceptable_answer_criteria: ["Silicon Valley Bank", "SVB"],
922
+ required_memory_ids: ["da_c06_m05"],
923
+ forbidden_memory_ids: ["da_c06_m01"],
924
+ difficulty: "medium",
925
+ architecture_bias_risk: "low",
926
+ fairness_note: "Requires identifying the new bank account (SVB) replacing the old bank account (Chase)."
927
+ },
928
+ {
929
+ question_id: "da_c06_q04",
930
+ category: "contradiction_resolution",
931
+ question: "Is Chase Bank still the active checking account for freelance income?",
932
+ expected_answer: "No, it was replaced by Silicon Valley Bank (SVB) on June 3.",
933
+ acceptable_answer_criteria: ["No, it was replaced by SVB", "No"],
934
+ required_memory_ids: ["da_c06_m05", "da_c06_m01"],
935
+ forbidden_memory_ids: [],
936
+ difficulty: "medium",
937
+ architecture_bias_risk: "low",
938
+ fairness_note: "Verifies the system resolves contradiction and correctly updates active account."
939
+ },
940
+ {
941
+ question_id: "da_c06_q05",
942
+ category: "abstention",
943
+ question: "What is the account number of the traditional IRA at Fidelity?",
944
+ expected_answer: "not enough information",
945
+ acceptable_answer_criteria: ["not enough information", "insufficient evidence", "unknown"],
946
+ required_memory_ids: [],
947
+ forbidden_memory_ids: ["da_c06_m09"],
948
+ difficulty: "hard",
949
+ architecture_bias_risk: "low",
950
+ fairness_note: "Tests abstention: the user mentioned having a traditional IRA at Fidelity but never shared the account number."
951
+ }
952
+ ]
953
+ },
954
+ {
955
+ conversation_id: "da_c07_linear_algebra",
956
+ agent_id: "da_agent_linear_algebra",
957
+ domain: "education",
958
+ memory_records: [
959
+ {
960
+ id: "da_c07_m01",
961
+ type: "semantic",
962
+ timestamp: "2026-06-01T09:00:00Z",
963
+ content: "The Linear Algebra course has weekly quizzes that open on Friday and close on Sunday night.",
964
+ tags: ["education", "math", "quizzes"],
965
+ importance: 0.8,
966
+ metadata: { source_turn: "t01", speaker: "user" }
967
+ },
968
+ {
969
+ id: "da_c07_m02",
970
+ type: "semantic",
971
+ timestamp: "2026-06-01T09:05:00Z",
972
+ content: "The mid-term exam is scheduled for October 15 and covers vector spaces, eigenvalues, and linear transformations.",
973
+ tags: ["education", "math", "midterm"],
974
+ importance: 0.9,
975
+ metadata: { source_turn: "t02", speaker: "user" }
976
+ },
977
+ {
978
+ id: "da_c07_m03",
979
+ type: "semantic",
980
+ timestamp: "2026-06-01T09:10:00Z",
981
+ content: "The required textbook is 'Introduction to Linear Algebra' by Gilbert Strang, 5th Edition.",
982
+ tags: ["education", "math", "textbook"],
983
+ importance: 0.85,
984
+ metadata: { source_turn: "t03", speaker: "user" },
985
+ associations: [
986
+ { target_id: "da_c07_m02", strength: 0.7, reason: "study source for midterm" }
987
+ ]
988
+ },
989
+ {
990
+ id: "da_c07_m04",
991
+ type: "semantic",
992
+ timestamp: "2026-06-01T09:15:00Z",
993
+ content: "Noise: The lecturer, Professor Adams, likes to drink hot black coffee during morning sessions.",
994
+ tags: ["noise", "professor"],
995
+ importance: 0.2,
996
+ metadata: { source_turn: "t04", speaker: "assistant" }
997
+ },
998
+ {
999
+ id: "da_c07_m05",
1000
+ type: "episodic",
1001
+ timestamp: "2026-06-02T10:00:00Z",
1002
+ content: "The user achieved a score of 92/100 on Homework Assignment 1.",
1003
+ tags: ["education", "grade", "homework"],
1004
+ importance: 0.8,
1005
+ metadata: { source_turn: "t05", speaker: "user" }
1006
+ },
1007
+ {
1008
+ id: "da_c07_m06",
1009
+ type: "semantic",
1010
+ timestamp: "2026-06-02T10:05:00Z",
1011
+ content: "Noise: Homework assignments must be uploaded in PDF format only.",
1012
+ tags: ["noise", "format"],
1013
+ importance: 0.3,
1014
+ metadata: { source_turn: "t06", speaker: "assistant" }
1015
+ },
1016
+ {
1017
+ id: "da_c07_m07",
1018
+ type: "semantic",
1019
+ timestamp: "2026-06-03T11:00:00Z",
1020
+ content: "The teaching assistant is named Marcus Vance, and his office hours are Wednesdays from 2:00 PM to 4:00 PM.",
1021
+ tags: ["education", "math", "office-hours"],
1022
+ importance: 0.8,
1023
+ metadata: { source_turn: "t07", speaker: "user" }
1024
+ },
1025
+ {
1026
+ id: "da_c07_m08",
1027
+ type: "semantic",
1028
+ timestamp: "2026-06-03T11:15:00Z",
1029
+ content: "Office hours are located in Room 402 of the Mathematics Building.",
1030
+ tags: ["education", "math", "location"],
1031
+ importance: 0.75,
1032
+ metadata: { source_turn: "t08", speaker: "user" },
1033
+ associations: [
1034
+ { target_id: "da_c07_m07", strength: 0.9, reason: "TA's office location" }
1035
+ ]
1036
+ },
1037
+ {
1038
+ id: "da_c07_m09",
1039
+ type: "semantic",
1040
+ timestamp: "2026-06-04T12:00:00Z",
1041
+ content: "The class Zoom password is 'Eigen2026'.",
1042
+ tags: ["education", "math", "zoom"],
1043
+ importance: 0.85,
1044
+ metadata: { source_turn: "t09", speaker: "user" }
1045
+ }
1046
+ ],
1047
+ questions: [
1048
+ {
1049
+ question_id: "da_c07_q01",
1050
+ category: "atomic_fact_recall",
1051
+ question: "What textbook edition is required for the Linear Algebra course?",
1052
+ expected_answer: "'Introduction to Linear Algebra' by Gilbert Strang, 5th Edition.",
1053
+ acceptable_answer_criteria: ["Strang 5th Edition", "Gilbert Strang", "5th Edition"],
1054
+ required_memory_ids: ["da_c07_m03"],
1055
+ forbidden_memory_ids: [],
1056
+ difficulty: "easy",
1057
+ architecture_bias_risk: "low",
1058
+ fairness_note: "Direct exact match fact retrieval."
1059
+ },
1060
+ {
1061
+ question_id: "da_c07_q02",
1062
+ category: "atomic_fact_recall",
1063
+ question: "What is the date of the midterm exam and what topics does it cover?",
1064
+ expected_answer: "October 15, covering vector spaces, eigenvalues, and linear transformations.",
1065
+ acceptable_answer_criteria: ["October 15", "vector spaces", "eigenvalues", "linear transformations"],
1066
+ required_memory_ids: ["da_c07_m02"],
1067
+ forbidden_memory_ids: [],
1068
+ difficulty: "medium",
1069
+ architecture_bias_risk: "low",
1070
+ fairness_note: "Direct details fact retrieval."
1071
+ },
1072
+ {
1073
+ question_id: "da_c07_q03",
1074
+ category: "paraphrased_semantic_recall",
1075
+ question: "What is the password to access virtual course video conferences?",
1076
+ expected_answer: "Eigen2026",
1077
+ acceptable_answer_criteria: ["Eigen2026", "class Zoom password 'Eigen2026'"],
1078
+ required_memory_ids: ["da_c07_m09"],
1079
+ forbidden_memory_ids: [],
1080
+ difficulty: "medium",
1081
+ architecture_bias_risk: "low",
1082
+ fairness_note: "Tests semantic mapping of 'virtual course video conferences' to 'class Zoom'."
1083
+ },
1084
+ {
1085
+ question_id: "da_c07_q04",
1086
+ category: "noise_resistance",
1087
+ question: "What grade did the user receive on the first Homework Assignment?",
1088
+ expected_answer: "92/100",
1089
+ acceptable_answer_criteria: ["92/100", "92"],
1090
+ required_memory_ids: ["da_c07_m05"],
1091
+ forbidden_memory_ids: ["da_c07_m04", "da_c07_m06"],
1092
+ difficulty: "easy",
1093
+ architecture_bias_risk: "low",
1094
+ fairness_note: "Verifies ignoring coffee drinking preference and PDF upload requirements."
1095
+ },
1096
+ {
1097
+ question_id: "da_c07_q05",
1098
+ category: "multi_hop_association",
1099
+ question: "Where should the student go to meet teaching assistant Marcus Vance in person?",
1100
+ expected_answer: "Room 402 of the Mathematics Building",
1101
+ acceptable_answer_criteria: ["Room 402", "Math Building Room 402"],
1102
+ required_memory_ids: ["da_c07_m07", "da_c07_m08"],
1103
+ forbidden_memory_ids: [],
1104
+ difficulty: "hard",
1105
+ architecture_bias_risk: "medium",
1106
+ fairness_note: "Requires linking Marcus Vance -> TA -> office location in Room 402 of the Mathematics Building."
1107
+ }
1108
+ ]
1109
+ },
1110
+ {
1111
+ conversation_id: "da_c08_novel_outline",
1112
+ agent_id: "da_agent_novel_outline",
1113
+ domain: "creative_work",
1114
+ memory_records: [
1115
+ {
1116
+ id: "da_c08_m01",
1117
+ type: "semantic",
1118
+ timestamp: "2026-06-01T10:00:00Z",
1119
+ content: "The main character of the sci-fi novel is Captain Vance Rennold, commander of the starship 'Nebula'.",
1120
+ tags: ["creative", "novel", "character"],
1121
+ importance: 0.85,
1122
+ metadata: { source_turn: "t01", speaker: "user" }
1123
+ },
1124
+ {
1125
+ id: "da_c08_m02",
1126
+ type: "semantic",
1127
+ timestamp: "2026-06-01T10:05:00Z",
1128
+ content: "Captain Rennold's primary motivation is finding the lost colony of Elysium to rescue his sister.",
1129
+ tags: ["creative", "novel", "character-motive"],
1130
+ importance: 0.8,
1131
+ metadata: { source_turn: "t02", speaker: "user" },
1132
+ associations: [
1133
+ { target_id: "da_c08_m01", strength: 0.85, reason: "character detail" }
1134
+ ]
1135
+ },
1136
+ {
1137
+ id: "da_c08_m03",
1138
+ type: "semantic",
1139
+ timestamp: "2026-06-01T10:10:00Z",
1140
+ content: "The starship 'Nebula' is powered by a rare dark-matter core that requires cooling every 24 hours.",
1141
+ tags: ["creative", "novel", "lore"],
1142
+ importance: 0.8,
1143
+ metadata: { source_turn: "t03", speaker: "user" }
1144
+ },
1145
+ {
1146
+ id: "da_c08_m04",
1147
+ type: "semantic",
1148
+ timestamp: "2026-06-01T10:15:00Z",
1149
+ content: "Noise: The starship control deck features chrome surfaces and blue neon status panels.",
1150
+ tags: ["noise", "starship-deck"],
1151
+ importance: 0.25,
1152
+ metadata: { source_turn: "t04", speaker: "assistant" }
1153
+ },
1154
+ {
1155
+ id: "da_c08_m05",
1156
+ type: "procedural",
1157
+ timestamp: "2026-06-02T09:00:00Z",
1158
+ content: "To build narrative tension in Chapter 3, outline the following steps: first, introduce the cooling system failure; second, force a hard landing on an asteroid; third, trigger a conflict between Rennold and the engineer.",
1159
+ tags: ["creative", "novel", "outline"],
1160
+ importance: 0.9,
1161
+ metadata: { source_turn: "t05", speaker: "user" }
1162
+ },
1163
+ {
1164
+ id: "da_c08_m06",
1165
+ type: "semantic",
1166
+ timestamp: "2026-06-02T10:00:00Z",
1167
+ content: "Noise: The author uses Scrivener for drafting and Google Docs for sharing review copies.",
1168
+ tags: ["noise", "software"],
1169
+ importance: 0.3,
1170
+ metadata: { source_turn: "t06", speaker: "user" }
1171
+ },
1172
+ {
1173
+ id: "da_c08_m07",
1174
+ type: "semantic",
1175
+ timestamp: "2026-06-03T11:00:00Z",
1176
+ content: "The primary antagonist is Commander Sarah Drake, head of the Orion Syndicate.",
1177
+ tags: ["creative", "novel", "character"],
1178
+ importance: 0.85,
1179
+ metadata: { source_turn: "t07", speaker: "user" }
1180
+ },
1181
+ {
1182
+ id: "da_c08_m08",
1183
+ type: "semantic",
1184
+ timestamp: "2026-06-03T11:15:00Z",
1185
+ content: "The Orion Syndicate operates from a hidden space station orbiting the gas giant Jupiter.",
1186
+ tags: ["creative", "novel", "lore"],
1187
+ importance: 0.8,
1188
+ metadata: { source_turn: "t08", speaker: "user" },
1189
+ associations: [
1190
+ { target_id: "da_c08_m07", strength: 0.9, reason: "antagonist's organization location" }
1191
+ ]
1192
+ },
1193
+ {
1194
+ id: "da_c08_m09",
1195
+ type: "semantic",
1196
+ timestamp: "2026-06-04T12:00:00Z",
1197
+ content: "The novel's working title is 'Shattered Nebula' and the word count goal is 80,000 words.",
1198
+ tags: ["creative", "novel", "metadata"],
1199
+ importance: 0.7,
1200
+ metadata: { source_turn: "t09", speaker: "user" }
1201
+ }
1202
+ ],
1203
+ questions: [
1204
+ {
1205
+ question_id: "da_c08_q01",
1206
+ category: "atomic_fact_recall",
1207
+ question: "Who is the primary antagonist of the sci-fi novel?",
1208
+ expected_answer: "Commander Sarah Drake",
1209
+ acceptable_answer_criteria: ["Sarah Drake", "Commander Sarah Drake"],
1210
+ required_memory_ids: ["da_c08_m07"],
1211
+ forbidden_memory_ids: [],
1212
+ difficulty: "easy",
1213
+ architecture_bias_risk: "low",
1214
+ fairness_note: "Direct exact match fact retrieval."
1215
+ },
1216
+ {
1217
+ question_id: "da_c08_q02",
1218
+ category: "atomic_fact_recall",
1219
+ question: "What is the working title of the book and its target word count?",
1220
+ expected_answer: "'Shattered Nebula' with an 80,000 words target.",
1221
+ acceptable_answer_criteria: ["Shattered Nebula", "80,000 words", "80k000 words"],
1222
+ required_memory_ids: ["da_c08_m09"],
1223
+ forbidden_memory_ids: [],
1224
+ difficulty: "easy",
1225
+ architecture_bias_risk: "low",
1226
+ fairness_note: "Direct metadata fact lookup."
1227
+ },
1228
+ {
1229
+ question_id: "da_c08_q03",
1230
+ category: "paraphrased_semantic_recall",
1231
+ question: "What drives Captain Rennold to search the galaxy?",
1232
+ expected_answer: "Finding the lost colony of Elysium to rescue his sister.",
1233
+ acceptable_answer_criteria: ["rescuing his sister", "finding Elysium", "his sister"],
1234
+ required_memory_ids: ["da_c08_m02"],
1235
+ forbidden_memory_ids: [],
1236
+ difficulty: "medium",
1237
+ architecture_bias_risk: "low",
1238
+ fairness_note: "Tests paraphrased mapping from 'drives Captain Rennold to search the galaxy' to 'primary motivation'."
1239
+ },
1240
+ {
1241
+ question_id: "da_c08_q04",
1242
+ category: "noise_resistance",
1243
+ question: "How is the starship 'Nebula' powered?",
1244
+ expected_answer: "A rare dark-matter core.",
1245
+ acceptable_answer_criteria: ["dark-matter core", "dark-matter"],
1246
+ required_memory_ids: ["da_c08_m03"],
1247
+ forbidden_memory_ids: ["da_c08_m04", "da_c08_m06"],
1248
+ difficulty: "easy",
1249
+ architecture_bias_risk: "low",
1250
+ fairness_note: "Ensures model ignores visual control deck chrome description and software like Scrivener."
1251
+ },
1252
+ {
1253
+ question_id: "da_c08_q05",
1254
+ category: "procedural_recall",
1255
+ question: "What steps are planned to build narrative tension in the third chapter?",
1256
+ expected_answer: "First, introduce cooling system failure; second, force a hard landing on an asteroid; third, trigger a conflict between Rennold and the engineer.",
1257
+ acceptable_answer_criteria: ["cooling system failure", "hard landing on an asteroid", "conflict with engineer"],
1258
+ required_memory_ids: ["da_c08_m05"],
1259
+ forbidden_memory_ids: [],
1260
+ difficulty: "hard",
1261
+ architecture_bias_risk: "low",
1262
+ fairness_note: "Retrieves a multi-step writing procedure."
1263
+ }
1264
+ ]
1265
+ }
1266
+ ]
1267
+ };
1268
+
1269
// ----------------------------------------------------
// DATASET C: ADVERSARIAL MEMORY
// 10 conversations, 120 memories (12 per conv), 60 questions (6 per conv)
// Focus: stale facts, contradictions, distractors, ambiguity, and abstention
// ----------------------------------------------------

/**
 * Top-level container for the adversarial-memory dataset.
 *
 * `conversations` starts out empty at this point.
 * NOTE(review): presumably the generator loop further down appends to it;
 * the append itself is not visible in this part of the file — confirm.
 */
const datasetC = {
  name: "adversarial-memory",
  description: "Adversarial memory benchmark dataset containing 10 conversations with 120 memory records and 60 questions, testing contradiction handling, noise, and abstention.",
  // Date part (YYYY-MM-DD) of the current UTC time, so the value changes
  // from one generation day to the next.
  generated_at: new Date().toISOString().split('T')[0],
  fairness_notes: [
    "Tests capacity to handle stale values, contradicting facts, and noise resistance without graph bias.",
    "Abstention questions ensure systems know when evidence is insufficient."
  ],
  conversations: []
};
1284
+
1285
// Category allocation list for C: 60 questions total
// 25% atomic fact recall = 15 questions
// 20% paraphrased semantic recall = 12 questions
// 15% temporal/current preference = 9 questions
// 10% contradiction resolution = 6 questions
// 10% multi-hop association = 6 questions
// 10% noise resistance = 6 questions
// 5% procedural recall = 3 questions
// 5% abstention = 3 questions
//
// The list is grouped by category, in the order above. The generator loop
// reads it in contiguous slices of six (`categoriesC.slice(startIdx, startIdx + 6)`),
// so element order decides which categories each conversation gets: the first
// two slices, for example, contain only "atomic_fact_recall".
const categoriesC = [
  ...Array(15).fill("atomic_fact_recall"),
  ...Array(12).fill("paraphrased_semantic_recall"),
  ...Array(9).fill("temporal_update"),
  ...Array(6).fill("contradiction_resolution"),
  ...Array(6).fill("multi_hop_association"),
  ...Array(6).fill("noise_resistance"),
  ...Array(3).fill("procedural_recall"),
  ...Array(3).fill("abstention")
];
1304
+
1305
// Domain label for each of the 10 Dataset C conversations, indexed by the
// generator loop counter. Only eight distinct domains exist, so the last two
// entries repeat "software" and "personal_assistant".
const domainsC = [
  "software",
  "personal_assistant",
  "research",
  "travel",
  "health_admin",
  "finance_admin",
  "education",
  "creative_work",
  "software",
  "personal_assistant"
];
1306
+
1307
+ for (let i = 0; i < 10; i++) {
1308
+ const cNum = String(i + 1).padStart(2, '0');
1309
+ const domain = domainsC[i];
1310
+ const records = [];
1311
+ const questions = [];
1312
+ const dName = domain.toUpperCase();
1313
+
1314
+ // Hand-craft 12 adversarial records per conversation
1315
+ records.push(
1316
+ { id: `dc_c${cNum}_m01`, type: "semantic", timestamp: "2026-06-01T10:00:00Z", content: `The main parameters designated for ${dName} are value Alpha.`, tags: [domain, "param"], importance: 0.8, metadata: { source_turn: "t1", speaker: "user" } },
1317
+ { id: `dc_c${cNum}_m02`, type: "semantic", timestamp: "2026-06-01T10:05:00Z", content: `The secondary configuration option for ${dName} is value Beta.`, tags: [domain, "param"], importance: 0.8, metadata: { source_turn: "t2", speaker: "user" } },
1318
+ { id: `dc_c${cNum}_m03`, type: "semantic", timestamp: "2026-06-01T10:10:00Z", content: `Distractor: The color of the ${dName} report sheet is yellow.`, tags: ["noise"], importance: 0.2, metadata: { source_turn: "t3", speaker: "assistant" } },
1319
+ { id: `dc_c${cNum}_m04`, type: "semantic", timestamp: "2026-06-02T10:00:00Z", content: `On June 2, the main parameter configuration for ${dName} was updated to Gamma.`, tags: [domain, "param"], importance: 0.9, metadata: { source_turn: "t4", speaker: "user" } },
1320
+ { id: `dc_c${cNum}_m05`, type: "semantic", timestamp: "2026-06-03T11:00:00Z", content: `On June 3, the secondary configuration option for ${dName} was changed to Delta.`, tags: [domain, "param"], importance: 0.9, metadata: { source_turn: "t5", speaker: "user" } },
1321
+ { id: `dc_c${cNum}_m06`, type: "procedural", timestamp: "2026-06-04T09:00:00Z", content: `To apply modifications to the ${dName} registry: 1. check credentials, 2. submit form, 3. wait for email.`, tags: [domain, "process"], importance: 0.85, metadata: { source_turn: "t6", speaker: "user" } },
1322
+ { id: `dc_c${cNum}_m07`, type: "semantic", timestamp: "2026-06-04T09:05:00Z", content: `Distractor: Email server is active on port 25.`, tags: ["noise"], importance: 0.3, metadata: { source_turn: "t7", speaker: "assistant" } },
1323
+ { id: `dc_c${cNum}_m08`, type: "semantic", timestamp: "2026-06-05T10:00:00Z", content: `The registry workspace for ${dName} is located at building Room 10.`, tags: [domain], importance: 0.8, metadata: { source_turn: "t8", speaker: "user" }, associations: [{ target_id: `dc_c${cNum}_m06`, strength: 0.7 }] },
1324
+ { id: `dc_c${cNum}_m09`, type: "semantic", timestamp: "2026-06-05T10:10:00Z", content: `The client contact liaison for ${dName} is Mary Jane.`, tags: [domain], importance: 0.7, metadata: { source_turn: "t9", speaker: "user" } },
1325
+ { id: `dc_c${cNum}_m10`, type: "semantic", timestamp: "2026-06-06T12:00:00Z", content: `On June 6, the client contact liaison for ${dName} was changed to Peter Parker.`, tags: [domain], importance: 0.9, metadata: { source_turn: "t10", speaker: "user" } },
1326
+ { id: `dc_c${cNum}_m11`, type: "semantic", timestamp: "2026-06-06T12:05:00Z", content: `Distractor: Peter Parker works as a freelance photographer.`, tags: ["noise"], importance: 0.2, metadata: { source_turn: "t11", speaker: "assistant" } },
1327
+ { id: `dc_c${cNum}_m12`, type: "semantic", timestamp: "2026-06-07T13:00:00Z", content: `The manager for the ${dName} task is George Lucas.`, tags: [domain, "manager"], importance: 0.8, metadata: { source_turn: "t12", speaker: "user" } }
1328
+ );
1329
+
1330
+ const startIdx = i * 6;
1331
+ const categoriesList = categoriesC.slice(startIdx, startIdx + 6);
1332
+
1333
+ for (let q = 0; q < 6; q++) {
1334
+ const category = categoriesList[q];
1335
+ const qId = `dc_c${cNum}_q${q + 1}`;
1336
+ let questionText = "";
1337
+ let expectedAnswer = "";
1338
+ let criteria = [];
1339
+ let requiredMemoryIds = [];
1340
+ let forbiddenMemoryIds = [];
1341
+
1342
+ if (category === "atomic_fact_recall") {
1343
+ // Paraphrased: "Who holds administrative coordination authority..."
1344
+ questionText = `Who is in charge of administrative coordination for the ${dName} task?`;
1345
+ expectedAnswer = `George Lucas`;
1346
+ criteria = ["George Lucas", "George"];
1347
+ requiredMemoryIds = [`dc_c${cNum}_m12`];
1348
+ } else if (category === "paraphrased_semantic_recall") {
1349
+ // Paraphrased: "What specific environment configuration version..."
1350
+ questionText = `What is the primary variable value currently designated for the ${dName} project?`;
1351
+ expectedAnswer = `Gamma`;
1352
+ criteria = ["Gamma"];
1353
+ requiredMemoryIds = [`dc_c${cNum}_m04`];
1354
+ forbiddenMemoryIds = [`dc_c${cNum}_m01`];
1355
+ } else if (category === "temporal_update") {
1356
+ // Paraphrased: "Identify the active software package..."
1357
+ questionText = `What is the secondary parameter value active for ${dName}?`;
1358
+ expectedAnswer = `Delta`;
1359
+ criteria = ["Delta"];
1360
+ requiredMemoryIds = [`dc_c${cNum}_m05`];
1361
+ forbiddenMemoryIds = [`dc_c${cNum}_m02`];
1362
+ } else if (category === "contradiction_resolution") {
1363
+ // Paraphrased: "Does Mary Jane still serve as..."
1364
+ questionText = `Does Mary Jane still serve as the primary external liaison for the ${dName} engagement?`;
1365
+ expectedAnswer = `No, it was updated to Peter Parker on June 6.`;
1366
+ criteria = ["No, it is Peter Parker", "No", "Peter Parker"];
1367
+ requiredMemoryIds = [`dc_c${cNum}_m10`, `dc_c${cNum}_m09`];
1368
+ } else if (category === "multi_hop_association") {
1369
+ // Paraphrased: "Where is the workspace located..."
1370
+ questionText = `Where should we go to apply updates to the registry for the ${dName} task?`;
1371
+ expectedAnswer = `Room 10`;
1372
+ criteria = ["Room 10", "building Room 10"];
1373
+ requiredMemoryIds = [`dc_c${cNum}_m08`, `dc_c${cNum}_m06`];
1374
+ } else if (category === "noise_resistance") {
1375
+ // Paraphrased: "What steps are necessary..."
1376
+ questionText = `What is the procedure for enacting modifications on the ${dName} registry?`;
1377
+ expectedAnswer = `1. check credentials, 2. submit form, 3. wait for email`;
1378
+ criteria = ["check credentials", "submit form", "wait for email"];
1379
+ requiredMemoryIds = [`dc_c${cNum}_m06`];
1380
+ forbiddenMemoryIds = [`dc_c${cNum}_m07`];
1381
+ } else if (category === "procedural_recall") {
1382
+ questionText = `What steps are necessary to execute the ${dName} update?`;
1383
+ expectedAnswer = `1. check credentials, 2. submit form, 3. wait for email.`;
1384
+ criteria = ["check credentials", "submit form", "wait for email"];
1385
+ requiredMemoryIds = [`dc_c${cNum}_m06`];
1386
+ } else if (category === "abstention") {
1387
+ // Paraphrased: "Which telephone contact number..."
1388
+ questionText = `Which telephone contact number should we call to reach the director of ${dName}?`;
1389
+ expectedAnswer = `not enough information`;
1390
+ criteria = ["not enough information", "unknown", "insufficient evidence"];
1391
+ requiredMemoryIds = [];
1392
+ forbiddenMemoryIds = [`dc_c${cNum}_m12`];
1393
+ }
1394
+
1395
+ questions.push({
1396
+ question_id: qId,
1397
+ category: category,
1398
+ question: questionText,
1399
+ expected_answer: expectedAnswer,
1400
+ acceptable_answer_criteria: criteria,
1401
+ required_memory_ids: requiredMemoryIds,
1402
+ forbidden_memory_ids: forbiddenMemoryIds,
1403
+ difficulty: category === "multi_hop_association" || category === "abstention" ? "hard" : "medium",
1404
+ architecture_bias_risk: category === "multi_hop_association" ? "medium" : "low",
1405
+ fairness_note: `Verifies provider-neutral evaluation for category ${category}.`
1406
+ });
1407
+ }
1408
+
1409
+ datasetC.conversations.push({
1410
+ conversation_id: `dc_c${cNum}_${domain}`,
1411
+ agent_id: `dc_agent_${domain}_${cNum}`,
1412
+ domain: domain,
1413
+ memory_records: records,
1414
+ questions: questions
1415
+ });
1416
+ }
1417
+
1418
+ // ----------------------------------------------------
1419
+ // DATASET B: REALISTIC MEDIUM
1420
+ // 20 conversations, 300 memories (15 per conv), 120 questions (6 per conv)
1421
+ // ----------------------------------------------------
1422
// Dataset B ("realistic-medium"): the container object is created here and
// filled with one entry per conversation by the loop at the end of this section.
const datasetB = {
  name: "realistic-medium",
  description: "Stronger public memory benchmark dataset containing 20 conversations with 300 memory records and 120 questions across multiple categories.",
  // Date portion (YYYY-MM-DD) of the current time in UTC.
  generated_at: new Date().toISOString().split('T')[0],
  fairness_notes: [
    "No memory passport, decay, or graph traversal assumptions.",
    "Ensures balanced evaluation of vector and graph-based systems across 8 distinct domains."
  ],
  conversations: []
};

// Category allocation list for B: 120 questions total
// 25% atomic fact recall = 30 questions
// 20% paraphrased semantic recall = 24 questions
// 15% temporal/current preference = 18 questions
// 10% contradiction resolution = 12 questions
// 10% multi-hop association = 12 questions
// 10% noise resistance = 12 questions
// 5% procedural recall = 6 questions
// 5% abstention = 6 questions
//
// NOTE(review): the list is ordered by category and each conversation takes the
// next 6 consecutive entries, so conversations 01-05 receive six
// atomic_fact_recall questions each. Every category has a single question
// template, so those six questions are worded identically within a
// conversation. Confirm whether interleaving or extra templates are wanted.
const categoriesB = [
  ...Array(30).fill("atomic_fact_recall"),
  ...Array(24).fill("paraphrased_semantic_recall"),
  ...Array(18).fill("temporal_update"),
  ...Array(12).fill("contradiction_resolution"),
  ...Array(12).fill("multi_hop_association"),
  ...Array(12).fill("noise_resistance"),
  ...Array(6).fill("procedural_recall"),
  ...Array(6).fill("abstention")
];

// One domain per conversation (20 entries): the 8 domains cycle twice, followed
// by the first 4 again.
const domainsB = [
  "software", "personal_assistant", "research", "travel", "health_admin", "finance_admin", "education", "creative_work",
  "software", "personal_assistant", "research", "travel", "health_admin", "finance_admin", "education", "creative_work",
  "software", "personal_assistant", "research", "travel"
];

for (let i = 0; i < 20; i++) {
  // Zero-padded, 1-based conversation number used in every generated id.
  const cNum = String(i + 1).padStart(2, '0');
  const domain = domainsB[i];
  const records = [];
  const questions = [];

  // Create 15 memory records per conversation
  for (let j = 1; j <= 15; j++) {
    const mId = `db_c${cNum}_m${String(j).padStart(2, '0')}`;
    let content = "";
    let type = "semantic";
    let tags = [domain];
    let importance = 0.8;
    let associations = [];

    if (j === 1) {
      content = `The active agent ID for ${domain} projects is designated as Agent-${cNum}.`;
      tags.push("agent-id");
    } else if (j === 2) {
      content = `The primary key indicator of success for the ${domain} task is achieving 95% accuracy.`;
      tags.push("success-metric");
    } else if (j === 3) {
      content = `The main supervisor for this ${domain} work stream is Mr. Arthur Pendelton.`;
      tags.push("supervisor");
    } else if (j === 4) {
      // Superseded by record 5 (temporal_update questions forbid this one).
      content = `The older instruction stated that files should be uploaded every 4 hours.`;
      tags.push("upload");
    } else if (j === 5) {
      content = `On June 15, the upload schedule was updated to every 1 hour to prevent data loss.`;
      tags.push("upload");
    } else if (j === 6) {
      content = `To register a new item in the ${domain} portal: 1. login, 2. enter ID, 3. click save.`;
      type = "procedural";
      tags.push("portal-process");
    } else if (j === 7) {
      content = `Distractor: The login button on the portal is bright orange for high visibility.`;
      tags.push("noise");
      importance = 0.2;
    } else if (j === 8) {
      content = `The supervisor Pendelton works in Office 301, located in building B.`;
      tags.push("supervisor-location");
      // Explicit link back to the supervisor record (m03), used by the multi-hop question.
      associations.push({ target_id: `db_c${cNum}_m03`, strength: 0.8, reason: "supervisor workplace info" });
    } else if (j === 9) {
      content = `The department code for the ${domain} work is DEPT-${cNum}.`;
      tags.push("dept-code");
    } else if (j === 10) {
      // Superseded by record 11 (contradiction_resolution questions require both).
      content = `The first version of the budget for DEPT-${cNum} was set to $50,000.`;
      tags.push("budget");
    } else if (j === 11) {
      content = `On June 18, the budget for DEPT-${cNum} was updated to $65,000 for expansion.`;
      tags.push("budget");
    } else if (j === 12) {
      content = `Distractor: Department DEPT-${cNum} has exactly 5 members currently.`;
      tags.push("noise");
      importance = 0.3;
    } else if (j === 13) {
      content = `The backup server hostname is backup-srv-${cNum}.local.`;
      tags.push("server");
    } else if (j === 14) {
      content = `The primary contact person for verification is Dr. Elizabeth Swan.`;
      tags.push("verifier");
    } else {
      content = `Distractor: Dr. Swan has a master's degree in ${domain} operations.`;
      tags.push("noise");
      importance = 0.2;
    }

    records.push({
      id: mId,
      type: type,
      // Day of month is 10 + j (June 11 .. June 25): unique per record and
      // increasing with the turn number. A single-digit `j % 9` suffix would
      // wrap after record 8, dating records 9-15 on or before earlier turns and
      // placing m11 ("On June 18, ...") on June 12, before the event it reports.
      timestamp: `2026-06-${10 + j}T10:00:00Z`,
      content: content,
      tags: tags,
      importance: importance,
      // Odd turns are attributed to the user, even turns to the assistant.
      metadata: { source_turn: `t${j}`, speaker: j % 2 === 0 ? "assistant" : "user" },
      associations: associations
    });
  }

  // Each conversation takes the next 6 consecutive entries of categoriesB.
  const startIdx = i * 6;
  const categoriesList = categoriesB.slice(startIdx, startIdx + 6);

  for (let q = 0; q < 6; q++) {
    const category = categoriesList[q];
    const qId = `db_c${cNum}_q${q + 1}`;
    let questionText = "";
    let expectedAnswer = "";
    let criteria = [];
    let requiredMemoryIds = [];
    let forbiddenMemoryIds = [];

    if (category === "atomic_fact_recall") {
      // Paraphrased: "Who oversees the operations for..."
      questionText = `Who oversees the operations for the ${domain} group?`;
      expectedAnswer = `Mr. Arthur Pendelton`;
      criteria = ["Arthur Pendelton", "Mr. Pendelton"];
      requiredMemoryIds = [`db_c${cNum}_m03`];
    } else if (category === "paraphrased_semantic_recall") {
      // Paraphrased: "What level of precision is expected..."
      questionText = `What level of precision is expected to satisfy the metrics for the ${domain} project?`;
      expectedAnswer = `95% accuracy`;
      criteria = ["95%", "95% accuracy"];
      requiredMemoryIds = [`db_c${cNum}_m02`];
    } else if (category === "temporal_update") {
      // Paraphrased: "What is the current required interval..."
      questionText = `What is the current required interval for transmitting project data?`;
      expectedAnswer = `Every 1 hour`;
      criteria = ["every 1 hour", "1 hour", "hourly"];
      requiredMemoryIds = [`db_c${cNum}_m05`];
      forbiddenMemoryIds = [`db_c${cNum}_m04`];
    } else if (category === "contradiction_resolution") {
      // Paraphrased: "Is the initial funding amount..."
      questionText = `Is the initial funding amount of fifty thousand dollars for department DEPT-${cNum} still active?`;
      expectedAnswer = `No, the budget was updated to $65,000 on June 18.`;
      criteria = ["No, it was updated to $65,000", "$65,000", "No"];
      requiredMemoryIds = [`db_c${cNum}_m11`, `db_c${cNum}_m10`];
    } else if (category === "multi_hop_association") {
      // Paraphrased: "Find the work location details..."
      questionText = `Find the work location details (room and block) of the individual leading the ${domain} group.`;
      expectedAnswer = `Office 301, building B`;
      criteria = ["Office 301", "building B"];
      requiredMemoryIds = [`db_c${cNum}_m03`, `db_c${cNum}_m08`];
    } else if (category === "noise_resistance") {
      // Paraphrased: "Identify the remote storage network..."
      questionText = `Identify the remote storage network address designated for redundancy in ${domain}.`;
      expectedAnswer = `backup-srv-${cNum}.local`;
      criteria = [`backup-srv-${cNum}.local`];
      requiredMemoryIds = [`db_c${cNum}_m13`];
      forbiddenMemoryIds = [`db_c${cNum}_m07`, `db_c${cNum}_m12`];
    } else if (category === "procedural_recall") {
      // Paraphrased: "How can one register a new entry..."
      questionText = `How can one register a new entry using the ${domain} interface?`;
      expectedAnswer = `1. login, 2. enter ID, 3. click save`;
      criteria = ["login", "enter ID", "click save"];
      requiredMemoryIds = [`db_c${cNum}_m06`];
    } else if (category === "abstention") {
      // Paraphrased: "What is the contact phone number..."
      // No record holds a phone number, so nothing is required and the
      // name-only record m14 is listed as forbidden.
      questionText = `What is the contact phone number for the primary verification person Dr. Swan?`;
      expectedAnswer = `not enough information`;
      criteria = ["not enough information", "unknown", "insufficient evidence"];
      requiredMemoryIds = [];
      forbiddenMemoryIds = [`db_c${cNum}_m14`];
    }

    questions.push({
      question_id: qId,
      category: category,
      question: questionText,
      expected_answer: expectedAnswer,
      acceptable_answer_criteria: criteria,
      required_memory_ids: requiredMemoryIds,
      forbidden_memory_ids: forbiddenMemoryIds,
      difficulty: category === "multi_hop_association" || category === "abstention" ? "hard" : "medium",
      architecture_bias_risk: category === "multi_hop_association" ? "medium" : "low",
      fairness_note: `Ensures provider-neutral evaluation for category ${category}.`
    });
  }

  datasetB.conversations.push({
    conversation_id: `db_c${cNum}_${domain}`,
    agent_id: `db_agent_${domain}_${cNum}`,
    domain: domain,
    memory_records: records,
    questions: questions
  });
}
1625
+
1626
+ // ----------------------------------------------------
1627
+ // AUDIT & VALIDATION ENGINE
1628
+ // ----------------------------------------------------
1629
/**
 * Computes size, category and bias statistics for a generated dataset.
 *
 * @param {{conversations: Array<{memory_records: Array<{id: string, content: string}>,
 *   questions: Array<{category: string, question: string, required_memory_ids: string[]}>}>}} dataset
 *   Dataset whose conversations are inspected; it is not modified.
 * @returns {{total_conversations: number, total_memories: number, total_questions: number,
 *   categories: Object<string, number>, graph_heavy_count: number, lexical_overlap_pct: number,
 *   recency_count: number, abstention_count: number, exact_match_overlaps: number[]}}
 *   Aggregated statistics. `lexical_overlap_pct` is the percentage of all
 *   questions whose best overlap with a required memory exceeds 0.5, and is 0
 *   when the dataset has no questions.
 */
function auditDataset(dataset) {
  const stats = {
    total_conversations: dataset.conversations.length,
    total_memories: 0,
    total_questions: 0,
    categories: {},
    graph_heavy_count: 0,
    lexical_overlap_pct: 0,
    recency_count: 0,
    abstention_count: 0,
    exact_match_overlaps: []
  };

  for (const conv of dataset.conversations) {
    stats.total_memories += conv.memory_records.length;
    stats.total_questions += conv.questions.length;

    // Index the records once per conversation instead of scanning the array for
    // every required id. The first record wins on a duplicate id, matching what
    // a linear search would return.
    const memoryById = new Map();
    for (const mem of conv.memory_records) {
      if (!memoryById.has(mem.id)) {
        memoryById.set(mem.id, mem);
      }
    }

    for (const q of conv.questions) {
      stats.categories[q.category] = (stats.categories[q.category] || 0) + 1;

      if (q.category === 'multi_hop_association') {
        stats.graph_heavy_count++;
      }
      if (q.category === 'temporal_update' || q.category === 'contradiction_resolution') {
        stats.recency_count++;
      }
      if (q.category === 'abstention') {
        stats.abstention_count++;
      }

      // Best keyword overlap between the question and any of its required
      // memories. Ids that match no record in this conversation are ignored.
      let maxOverlap = 0;
      for (const mId of q.required_memory_ids) {
        const mem = memoryById.get(mId);
        if (mem) {
          const overlap = getKeywordOverlap(q.question, mem.content);
          if (overlap > maxOverlap) maxOverlap = overlap;
        }
      }
      // Questions with no overlap at all (including those with no required
      // memories) contribute nothing to the list.
      if (maxOverlap > 0) {
        stats.exact_match_overlaps.push(maxOverlap);
      }
    }
  }

  // Exact match rate calculation (overlap > 0.5 is considered exact-keyword dominant).
  // Guard the division: with zero questions it would yield NaN, and a NaN
  // percentage compares false against any limit, hiding the problem.
  const highOverlapCount = stats.exact_match_overlaps.filter(o => o > 0.5).length;
  stats.lexical_overlap_pct = stats.total_questions > 0
    ? (highOverlapCount / stats.total_questions) * 100
    : 0;

  return stats;
}
1683
+
1684
// Audit the three generated datasets, in A, B, C order.
console.log("=== RUNNING DATASET AUDIT ===");

const [auditA, auditB, auditC] = [datasetA, datasetB, datasetC].map(
  (dataset) => auditDataset(dataset)
);
1690
+
1691
/**
 * Logs a human-readable audit report and enforces the dataset bias limits.
 *
 * @param {string} name - Label shown in the report header and in error messages.
 * @param {object} audit - Statistics object as returned by `auditDataset`.
 * @throws {Error} If multi-hop ("graph-heavy") questions exceed 15% of all
 *   questions, or if the lexical-overlap percentage exceeds 40%. The report
 *   lines are logged before either check runs.
 */
function printAuditReport(name, audit) {
  const hasQuestions = audit.total_questions > 0;

  // Share of all questions as a percentage. An empty dataset reports 0 rather
  // than the NaN a division by zero would produce.
  const pctOfQuestions = (count) =>
    hasQuestions ? (count / audit.total_questions) * 100 : 0;

  console.log(`\nAudit Report for [${name}]:`);
  console.log(`- Conversations: ${audit.total_conversations}`);
  console.log(`- Memory Records: ${audit.total_memories}`);
  console.log(`- Questions: ${audit.total_questions}`);
  console.log(`- Category Distribution:`);
  for (const [cat, count] of Object.entries(audit.categories)) {
    const pct = pctOfQuestions(count).toFixed(1);
    console.log(` * ${cat}: ${count} (${pct}%)`);
  }
  console.log(`- Bias Metrics:`);
  const graphHeavyPct = pctOfQuestions(audit.graph_heavy_count);
  // The stored percentage may itself be NaN when it was computed over zero
  // questions, so it is normalised here as well.
  const lexicalOverlapPct = hasQuestions ? audit.lexical_overlap_pct : 0;
  console.log(` * Graph-heavy questions: ${audit.graph_heavy_count} (${graphHeavyPct.toFixed(1)}%) [Limit: <= 15%]`);
  console.log(` * Lexical/Vector overlap dominant (>50% word overlap): ${lexicalOverlapPct.toFixed(1)}% [Limit: <= 40%]`);
  console.log(` * Recency-handling questions: ${audit.recency_count}`);
  console.log(` * Abstention questions: ${audit.abstention_count}`);

  // Audit checks
  if (graphHeavyPct > 15) {
    throw new Error(`FAIL: Graph-heavy questions exceed 15% limit in ${name}`);
  }
  if (lexicalOverlapPct > 40) {
    throw new Error(`FAIL: Exact keyword overlap exceeds 40% limit in ${name}`);
  }
}
1716
+
1717
// Report on each dataset in turn. printAuditReport throws on the first dataset
// that breaks a limit, so the success line is only reached when none does.
for (const [label, audit] of [
  ["Dataset A: Balanced Mini", auditA],
  ["Dataset B: Realistic Medium", auditB],
  ["Dataset C: Adversarial Memory", auditC]
]) {
  printAuditReport(label, audit);
}

console.log("\nAll audits passed successfully!");

// Dataset folders are created under the directory given by `__dirname`.
const fixturesRoot = path.resolve(__dirname);
console.log(`Writing datasets to ${fixturesRoot}...`);
1726
+
1727
/**
 * Serialises a dataset as pretty-printed JSON under the fixtures root.
 *
 * @param {string} dirName - Sub-directory of `fixturesRoot` to write into; it is
 *   created (with any missing parents) when absent.
 * @param {string} fileName - Name of the JSON file inside that directory; an
 *   existing file is overwritten.
 * @param {*} data - Value passed to `JSON.stringify` with 2-space indentation.
 */
function writeDataset(dirName, fileName, data) {
  const dirPath = path.join(fixturesRoot, dirName);
  // With `recursive: true` mkdirSync does not fail when the directory already
  // exists, so no separate existence check is needed (and none can race with
  // another process creating the directory in between).
  fs.mkdirSync(dirPath, { recursive: true });

  const filePath = path.join(dirPath, fileName);
  fs.writeFileSync(filePath, JSON.stringify(data, null, 2), 'utf8');
  console.log(`Created: ${filePath}`);
}
1736
+
1737
// Write each dataset to its own folder, in A, B, C order.
for (const [dirName, fileName, data] of [
  ["balanced-mini", "balanced-mini.json", datasetA],
  ["realistic-medium", "realistic-medium.json", datasetB],
  ["adversarial-memory", "adversarial-memory.json", datasetC]
]) {
  writeDataset(dirName, fileName, data);
}

console.log("Dataset files written successfully!");