@1mbrain/benchmarks 0.1.1

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (69)
  1. package/README.md +85 -0
  2. package/fixtures/1mbrain-focused-mini/1mbrain-focused-mini.json +928 -0
  3. package/fixtures/1mbrain-focused-mini/README.md +45 -0
  4. package/fixtures/adversarial-memory/dataset_claude_adversarial.json +3333 -0
  5. package/fixtures/adversarial-memory/dataset_gemini_adversarial_memory.json +2984 -0
  6. package/fixtures/balanced-mini/dataset_claude_balanced_mini.json +2077 -0
  7. package/fixtures/balanced-mini/dataset_gemini_balanced_mini.json +1995 -0
  8. package/fixtures/generate_datasets.js +1741 -0
  9. package/fixtures/graph-stress-hard/README.md +43 -0
  10. package/fixtures/graph-stress-hard/dataset_graph_stress_hard.json +4374 -0
  11. package/fixtures/graph-stress-hard/generate_graph_stress_hard.js +526 -0
  12. package/fixtures/realistic-medium/dataset_claude_realistic_medium.json +7462 -0
  13. package/fixtures/realistic-medium/dataset_gemini_realistic_medium.json +7277 -0
  14. package/fixtures/realistic-medium/gen_claude_medium.js +600 -0
  15. package/package.json +22 -0
  16. package/reports/benchmark_report.md +48 -0
  17. package/reports/benchmark_report_claude_adversarial.md +42 -0
  18. package/reports/benchmark_report_claude_adversarial_adaptive.md +42 -0
  19. package/reports/benchmark_report_claude_adversarial_adaptive2_fast.md +42 -0
  20. package/reports/benchmark_report_claude_adversarial_adaptive_fast.md +42 -0
  21. package/reports/benchmark_report_claude_adversarial_rerank.md +42 -0
  22. package/reports/benchmark_report_claude_balanced_mini.md +42 -0
  23. package/reports/benchmark_report_claude_balanced_mini_adaptive.md +42 -0
  24. package/reports/benchmark_report_claude_balanced_mini_adaptive2_fast.md +42 -0
  25. package/reports/benchmark_report_claude_balanced_mini_adaptive_fast.md +42 -0
  26. package/reports/benchmark_report_claude_balanced_mini_rerank.md +42 -0
  27. package/reports/benchmark_report_claude_realistic_medium.md +42 -0
  28. package/reports/benchmark_report_claude_realistic_medium_adaptive.md +42 -0
  29. package/reports/benchmark_report_claude_realistic_medium_adaptive2_fast.md +42 -0
  30. package/reports/benchmark_report_claude_realistic_medium_adaptive_fast.md +42 -0
  31. package/reports/benchmark_report_claude_realistic_medium_evidence_rerank_local.md +42 -0
  32. package/reports/benchmark_report_claude_realistic_medium_openai_evidence_rerank.md +41 -0
  33. package/reports/benchmark_report_claude_realistic_medium_openai_multi_signal.md +41 -0
  34. package/reports/benchmark_report_claude_realistic_medium_openai_multi_signal_scoped.md +41 -0
  35. package/reports/benchmark_report_claude_realistic_medium_openai_phase8_no_judge.md +42 -0
  36. package/reports/benchmark_report_claude_realistic_medium_openai_rankingpolicy.md +41 -0
  37. package/reports/benchmark_report_claude_realistic_medium_openai_stale_filter.md +41 -0
  38. package/reports/benchmark_report_claude_realistic_medium_openai_stale_filter_absence_fix.md +41 -0
  39. package/reports/benchmark_report_claude_realistic_medium_openai_write_time_invalidation.md +41 -0
  40. package/reports/benchmark_report_claude_realistic_medium_rerank.md +42 -0
  41. package/reports/benchmark_report_claude_realistic_medium_stale_filter_local.md +42 -0
  42. package/reports/benchmark_report_graph_stress_hard.md +42 -0
  43. package/reports/benchmark_report_graph_stress_hard_absence_fix.md +42 -0
  44. package/reports/benchmark_report_graph_stress_hard_adaptive.md +42 -0
  45. package/reports/benchmark_report_graph_stress_hard_evidence_rerank.md +42 -0
  46. package/reports/benchmark_report_graph_stress_hard_multi_signal_current_guardrail.md +42 -0
  47. package/reports/benchmark_report_graph_stress_hard_multi_signal_guardrail_fixed.md +42 -0
  48. package/reports/benchmark_report_graph_stress_hard_multi_signal_local.md +42 -0
  49. package/reports/benchmark_report_graph_stress_hard_multi_signal_scoped_guardrail.md +42 -0
  50. package/reports/benchmark_report_graph_stress_hard_multi_signal_vector_pure_guardrail.md +42 -0
  51. package/reports/benchmark_report_graph_stress_hard_phase8_sdk_guardrail.md +42 -0
  52. package/reports/benchmark_report_graph_stress_hard_rerank.md +42 -0
  53. package/reports/benchmark_report_graph_stress_hard_stale_filter.md +42 -0
  54. package/reports/benchmark_report_graph_stress_hard_write_time_invalidation.md +42 -0
  55. package/results/.gitignore +2 -0
  56. package/src/adapters/1mbrain.ts +317 -0
  57. package/src/adapters/keyword-embedding.ts +48 -0
  58. package/src/adapters/mem0.ts +124 -0
  59. package/src/adapters/qdrant.ts +214 -0
  60. package/src/adapters/unavailable.ts +49 -0
  61. package/src/adapters/vector-baseline.ts +149 -0
  62. package/src/datasets/focused-mini.ts +158 -0
  63. package/src/datasets/synthetic-agent-memory.ts +532 -0
  64. package/src/llm-evaluator.ts +262 -0
  65. package/src/metrics.ts +482 -0
  66. package/src/provider.ts +151 -0
  67. package/src/runner.ts +635 -0
  68. package/tsconfig.json +10 -0
  69. package/tsconfig.tsbuildinfo +1 -0
@@ -0,0 +1,2077 @@
1
+ {
2
+ "name": "memory-bench-balanced-mini",
3
+ "description": "Dataset A (Balanced Mini): an 8-conversation, provider-neutral smoke-test benchmark spanning 8 domains, covering atomic recall, paraphrase, temporal updates, contradiction resolution, multi-hop association, noise resistance, procedural recall, and abstention.",
4
+ "generated_at": "2026-06-19",
5
+ "fairness_notes": [
6
+ "No memory record includes graph-specific fields that a system must consume in order to retrieve it, such as mandatory edge weights, decay scores, or memory-passport metadata; the optional 'associations' field (including its 'strength' and 'reason' values) is descriptive context, not a required index, so vector-only and flat-store systems can ignore it without losing answerable information.",
7
+ "Every question is answerable purely from the natural-language content of the listed required_memory_ids; no question depends on a system having pre-built a graph structure to succeed.",
8
+ "Atomic and paraphrased categories (45% combined) are intentionally the largest share, since plain recall and semantic-similarity retrieval are exactly what vector-only systems are expected to handle well.",
9
+ "Multi-hop questions are capped at 10% of the dataset (4 of 40) so that graph-native systems do not get a structurally inflated advantage.",
10
+ "Contradiction and temporal-update questions (25% combined) test recency handling, which any system with timestamped storage can address regardless of architecture.",
11
+ "Distractor memories are semantically similar but topically distinct (a different app, a different city, a different person, a one-off event), so noise resistance tests retrieval precision rather than rewarding any one indexing strategy."
12
+ ],
13
+ "conversations": [
14
+ {
15
+ "conversation_id": "convA-software-01",
16
+ "agent_id": "agent-dev-assist",
17
+ "domain": "software",
18
+ "memory_records": [
19
+ {
20
+ "id": "memA-sw-001",
21
+ "type": "semantic",
22
+ "timestamp": "2025-11-02T09:10:00Z",
23
+ "content": "Maya decided to build Stridewell, a habit-tracking Android app, using Kotlin with Jetpack Compose for the UI layer.",
24
+ "tags": [
25
+ "stridewell",
26
+ "tech-stack"
27
+ ],
28
+ "importance": 0.8,
29
+ "metadata": {
30
+ "source_turn": "turn-001",
31
+ "speaker": "user"
32
+ },
33
+ "associations": []
34
+ },
35
+ {
36
+ "id": "memA-sw-002",
37
+ "type": "semantic",
38
+ "timestamp": "2025-11-05T14:20:00Z",
39
+ "content": "Maya picked a teal-and-cream color palette for Stridewell because she wanted the app to feel calm rather than gamified and aggressive.",
40
+ "tags": [
41
+ "stridewell",
42
+ "design"
43
+ ],
44
+ "importance": 0.6,
45
+ "metadata": {
46
+ "source_turn": "turn-002",
47
+ "speaker": "user"
48
+ },
49
+ "associations": []
50
+ },
51
+ {
52
+ "id": "memA-sw-003",
53
+ "type": "episodic",
54
+ "timestamp": "2025-11-20T11:00:00Z",
55
+ "content": "Maya is storing habit logs in SharedPreferences as a quick prototype solution while she gets the streak-counting logic working.",
56
+ "tags": [
57
+ "stridewell",
58
+ "storage"
59
+ ],
60
+ "importance": 0.5,
61
+ "metadata": {
62
+ "source_turn": "turn-003",
63
+ "speaker": "user"
64
+ },
65
+ "associations": []
66
+ },
67
+ {
68
+ "id": "memA-sw-004",
69
+ "type": "semantic",
70
+ "timestamp": "2026-01-14T16:45:00Z",
71
+ "content": "After hitting performance issues with large habit histories, Maya migrated Stridewell's local storage from SharedPreferences to a Room database backed by SQLite.",
72
+ "tags": [
73
+ "stridewell",
74
+ "storage"
75
+ ],
76
+ "importance": 0.85,
77
+ "metadata": {
78
+ "source_turn": "turn-004",
79
+ "speaker": "user"
80
+ },
81
+ "associations": [
82
+ {
83
+ "target_id": "memA-sw-003",
84
+ "strength": 0.9,
85
+ "reason": "replaces the earlier SharedPreferences storage approach"
86
+ }
87
+ ]
88
+ },
89
+ {
90
+ "id": "memA-sw-005",
91
+ "type": "episodic",
92
+ "timestamp": "2026-01-15T10:05:00Z",
93
+ "content": "Maya mentioned that an earlier prototype of hers, a recipe-sharing app called Panfry, had used Firebase Firestore for storage, though that project has been shelved for months.",
94
+ "tags": [
95
+ "panfry",
96
+ "storage"
97
+ ],
98
+ "importance": 0.3,
99
+ "metadata": {
100
+ "source_turn": "turn-005",
101
+ "speaker": "user"
102
+ },
103
+ "associations": []
104
+ },
105
+ {
106
+ "id": "memA-sw-006",
107
+ "type": "procedural",
108
+ "timestamp": "2026-02-02T13:00:00Z",
109
+ "content": "Maya's workflow for adding a new habit type to Stridewell is: define the data model as a Room entity, write a migration script, then wire up the new icon set in the UI.",
110
+ "tags": [
111
+ "stridewell",
112
+ "workflow"
113
+ ],
114
+ "importance": 0.55,
115
+ "metadata": {
116
+ "source_turn": "turn-006",
117
+ "speaker": "user"
118
+ },
119
+ "associations": []
120
+ },
121
+ {
122
+ "id": "memA-sw-007",
123
+ "type": "episodic",
124
+ "timestamp": "2026-03-10T15:30:00Z",
125
+ "content": "Maya's mentor, a senior Android developer named Oskar, reviewed Stridewell's codebase and recommended she adopt an MVVM architecture with a single source-of-truth ViewModel.",
126
+ "tags": [
127
+ "stridewell",
128
+ "architecture",
129
+ "oskar"
130
+ ],
131
+ "importance": 0.7,
132
+ "metadata": {
133
+ "source_turn": "turn-007",
134
+ "speaker": "user"
135
+ },
136
+ "associations": []
137
+ },
138
+ {
139
+ "id": "memA-sw-008",
140
+ "type": "semantic",
141
+ "timestamp": "2026-03-12T09:00:00Z",
142
+ "content": "Following Oskar's review, Maya restructured Stridewell around MVVM, which made the streak-counting logic testable in isolation for the first time.",
143
+ "tags": [
144
+ "stridewell",
145
+ "architecture"
146
+ ],
147
+ "importance": 0.75,
148
+ "metadata": {
149
+ "source_turn": "turn-008",
150
+ "speaker": "user"
151
+ },
152
+ "associations": [
153
+ {
154
+ "target_id": "memA-sw-007",
155
+ "strength": 0.85,
156
+ "reason": "implements the mentor's recommendation"
157
+ }
158
+ ]
159
+ },
160
+ {
161
+ "id": "memA-sw-009",
162
+ "type": "episodic",
163
+ "timestamp": "2026-05-20T18:10:00Z",
164
+ "content": "Stridewell crashed for users with more than 200 logged habits, traced to an unindexed query in the Room database.",
165
+ "tags": [
166
+ "stridewell",
167
+ "bug"
168
+ ],
169
+ "importance": 0.65,
170
+ "metadata": {
171
+ "source_turn": "turn-009",
172
+ "speaker": "user"
173
+ },
174
+ "associations": [
175
+ {
176
+ "target_id": "memA-sw-004",
177
+ "strength": 0.6,
178
+ "reason": "the bug occurs in the storage layer introduced by this migration"
179
+ }
180
+ ]
181
+ }
182
+ ],
183
+ "questions": [
184
+ {
185
+ "question_id": "qA-sw-01",
186
+ "category": "atomic_fact_recall",
187
+ "question": "What UI toolkit is Maya using to build Stridewell?",
188
+ "expected_answer": "Jetpack Compose",
189
+ "acceptable_answer_criteria": [
190
+ "mentions Jetpack Compose",
191
+ "associates it with Kotlin/Android"
192
+ ],
193
+ "required_memory_ids": [
194
+ "memA-sw-001"
195
+ ],
196
+ "forbidden_memory_ids": [],
197
+ "difficulty": "easy",
198
+ "architecture_bias_risk": "low",
199
+ "fairness_note": "Single direct fact stated once; equally retrievable by vector similarity or graph lookup."
200
+ },
201
+ {
202
+ "question_id": "qA-sw-02",
203
+ "category": "atomic_fact_recall",
204
+ "question": "Who reviewed Stridewell's codebase and what architecture did they recommend?",
205
+ "expected_answer": "Oskar, her mentor, recommended an MVVM architecture with a single source-of-truth ViewModel.",
206
+ "acceptable_answer_criteria": [
207
+ "names Oskar",
208
+ "mentions MVVM"
209
+ ],
210
+ "required_memory_ids": [
211
+ "memA-sw-007"
212
+ ],
213
+ "forbidden_memory_ids": [],
214
+ "difficulty": "easy",
215
+ "architecture_bias_risk": "low",
216
+ "fairness_note": "Directly stated fact in one record; no relational reasoning required."
217
+ },
218
+ {
219
+ "question_id": "qA-sw-03",
220
+ "category": "paraphrased_semantic_recall",
221
+ "question": "Why did Maya go with a soft, muted look for the app instead of something flashier?",
222
+ "expected_answer": "She wanted Stridewell to feel calm rather than gamified or aggressive.",
223
+ "acceptable_answer_criteria": [
224
+ "captures the calm-vs-gamified rationale"
225
+ ],
226
+ "required_memory_ids": [
227
+ "memA-sw-002"
228
+ ],
229
+ "forbidden_memory_ids": [],
230
+ "difficulty": "medium",
231
+ "architecture_bias_risk": "low",
232
+ "fairness_note": "Question rewords 'teal-and-cream/calm' without reusing the source phrasing; testable by any embedding-based retriever."
233
+ },
234
+ {
235
+ "question_id": "qA-sw-04",
236
+ "category": "contradiction_resolution",
237
+ "question": "What does Stridewell currently use to store habit logs locally?",
238
+ "expected_answer": "A Room database (SQLite-backed). The earlier SharedPreferences approach was just an early prototype and has been replaced.",
239
+ "acceptable_answer_criteria": [
240
+ "identifies Room/SQLite as current",
241
+ "recognizes SharedPreferences is outdated"
242
+ ],
243
+ "required_memory_ids": [
244
+ "memA-sw-004"
245
+ ],
246
+ "forbidden_memory_ids": [
247
+ "memA-sw-003"
248
+ ],
249
+ "difficulty": "medium",
250
+ "architecture_bias_risk": "medium",
251
+ "fairness_note": "Requires preferring the more recent record over a stale one; doable by any system that tracks timestamps, not just graph stores."
252
+ },
253
+ {
254
+ "question_id": "qA-sw-05",
255
+ "category": "noise_resistance",
256
+ "question": "Did Stridewell ever use Firebase Firestore for storage?",
257
+ "expected_answer": "No. Firebase Firestore was used in a different, unrelated project of Maya's (Panfry), not in Stridewell.",
258
+ "acceptable_answer_criteria": [
259
+ "correctly separates Panfry from Stridewell",
260
+ "answers no for Stridewell specifically"
261
+ ],
262
+ "required_memory_ids": [
263
+ "memA-sw-001",
264
+ "memA-sw-005"
265
+ ],
266
+ "forbidden_memory_ids": [],
267
+ "difficulty": "medium",
268
+ "architecture_bias_risk": "medium",
269
+ "fairness_note": "Tests whether retrieval over-matches on a semantically similar but irrelevant distractor (storage tech mentioned for a different app)."
270
+ }
271
+ ]
272
+ },
273
+ {
274
+ "conversation_id": "convA-personal-assistant-01",
275
+ "agent_id": "agent-pa",
276
+ "domain": "personal_assistant",
277
+ "memory_records": [
278
+ {
279
+ "id": "memA-pa-001",
280
+ "type": "semantic",
281
+ "timestamp": "2025-09-10T08:00:00Z",
282
+ "content": "Tomas works a hybrid schedule: in the office Tuesdays and Thursdays, remote the rest of the week.",
283
+ "tags": [
284
+ "schedule",
285
+ "work"
286
+ ],
287
+ "importance": 0.6,
288
+ "metadata": {
289
+ "source_turn": "turn-001",
290
+ "speaker": "user"
291
+ },
292
+ "associations": []
293
+ },
294
+ {
295
+ "id": "memA-pa-002",
296
+ "type": "semantic",
297
+ "timestamp": "2025-09-12T09:00:00Z",
298
+ "content": "Tomas asked to be reminded to take a short walk every day at 3pm to break up long stretches of sitting.",
299
+ "tags": [
300
+ "reminder",
301
+ "health"
302
+ ],
303
+ "importance": 0.5,
304
+ "metadata": {
305
+ "source_turn": "turn-002",
306
+ "speaker": "user"
307
+ },
308
+ "associations": []
309
+ },
310
+ {
311
+ "id": "memA-pa-003",
312
+ "type": "episodic",
313
+ "timestamp": "2025-10-01T12:30:00Z",
314
+ "content": "Tomas started a vegetarian diet for health reasons and wants recipe suggestions without meat.",
315
+ "tags": [
316
+ "diet"
317
+ ],
318
+ "importance": 0.55,
319
+ "metadata": {
320
+ "source_turn": "turn-003",
321
+ "speaker": "user"
322
+ },
323
+ "associations": []
324
+ },
325
+ {
326
+ "id": "memA-pa-004",
327
+ "type": "semantic",
328
+ "timestamp": "2026-02-18T19:00:00Z",
329
+ "content": "Tomas loosened his diet to pescatarian, now eating fish a couple of times a week while still avoiding red meat and poultry.",
330
+ "tags": [
331
+ "diet"
332
+ ],
333
+ "importance": 0.7,
334
+ "metadata": {
335
+ "source_turn": "turn-004",
336
+ "speaker": "user"
337
+ },
338
+ "associations": [
339
+ {
340
+ "target_id": "memA-pa-003",
341
+ "strength": 0.9,
342
+ "reason": "updates the earlier vegetarian-only preference"
343
+ }
344
+ ]
345
+ },
346
+ {
347
+ "id": "memA-pa-005",
348
+ "type": "episodic",
349
+ "timestamp": "2026-02-20T17:00:00Z",
350
+ "content": "Tomas asked for help picking a restaurant for his sister's birthday dinner, a one-off occasion unrelated to his regular eating routine.",
351
+ "tags": [
352
+ "one-off",
353
+ "dining"
354
+ ],
355
+ "importance": 0.3,
356
+ "metadata": {
357
+ "source_turn": "turn-005",
358
+ "speaker": "user"
359
+ },
360
+ "associations": []
361
+ },
362
+ {
363
+ "id": "memA-pa-006",
364
+ "type": "semantic",
365
+ "timestamp": "2026-03-05T08:15:00Z",
366
+ "content": "Tomas has a recurring team standup every weekday at 9:30am that should never be scheduled over.",
367
+ "tags": [
368
+ "schedule",
369
+ "work"
370
+ ],
371
+ "importance": 0.65,
372
+ "metadata": {
373
+ "source_turn": "turn-006",
374
+ "speaker": "user"
375
+ },
376
+ "associations": []
377
+ },
378
+ {
379
+ "id": "memA-pa-007",
380
+ "type": "episodic",
381
+ "timestamp": "2026-03-22T11:00:00Z",
382
+ "content": "Tomas booked an annual physical with Dr. Whitfield for the first week of July.",
383
+ "tags": [
384
+ "health",
385
+ "appointment"
386
+ ],
387
+ "importance": 0.6,
388
+ "metadata": {
389
+ "source_turn": "turn-007",
390
+ "speaker": "user"
391
+ },
392
+ "associations": []
393
+ },
394
+ {
395
+ "id": "memA-pa-008",
396
+ "type": "procedural",
397
+ "timestamp": "2026-04-02T10:00:00Z",
398
+ "content": "When Tomas asks to 'block focus time,' the assistant should reserve two-hour quiet blocks in the morning and avoid placing meetings during them.",
399
+ "tags": [
400
+ "workflow",
401
+ "calendar"
402
+ ],
403
+ "importance": 0.55,
404
+ "metadata": {
405
+ "source_turn": "turn-008",
406
+ "speaker": "user"
407
+ },
408
+ "associations": []
409
+ },
410
+ {
411
+ "id": "memA-pa-009",
412
+ "type": "episodic",
413
+ "timestamp": "2026-05-30T14:00:00Z",
414
+ "content": "Tomas confirmed his office days are changing starting June to Mondays and Wednesdays because his team reorganized.",
415
+ "tags": [
416
+ "schedule",
417
+ "work"
418
+ ],
419
+ "importance": 0.75,
420
+ "metadata": {
421
+ "source_turn": "turn-009",
422
+ "speaker": "user"
423
+ },
424
+ "associations": [
425
+ {
426
+ "target_id": "memA-pa-001",
427
+ "strength": 0.85,
428
+ "reason": "updates the earlier hybrid-schedule days"
429
+ }
430
+ ]
431
+ }
432
+ ],
433
+ "questions": [
434
+ {
435
+ "question_id": "qA-pa-01",
436
+ "category": "atomic_fact_recall",
437
+ "question": "What time does Tomas like to be reminded to take a walk?",
438
+ "expected_answer": "3pm",
439
+ "acceptable_answer_criteria": [
440
+ "states 3pm"
441
+ ],
442
+ "required_memory_ids": [
443
+ "memA-pa-002"
444
+ ],
445
+ "forbidden_memory_ids": [],
446
+ "difficulty": "easy",
447
+ "architecture_bias_risk": "low",
448
+ "fairness_note": "Single explicit fact, no temporal conflict involved."
449
+ },
450
+ {
451
+ "question_id": "qA-pa-02",
452
+ "category": "atomic_fact_recall",
453
+ "question": "Which doctor did Tomas book his annual physical with?",
454
+ "expected_answer": "Dr. Whitfield",
455
+ "acceptable_answer_criteria": [
456
+ "names Dr. Whitfield"
457
+ ],
458
+ "required_memory_ids": [
459
+ "memA-pa-007"
460
+ ],
461
+ "forbidden_memory_ids": [],
462
+ "difficulty": "easy",
463
+ "architecture_bias_risk": "low",
464
+ "fairness_note": "Directly stated once; equally accessible to any retrieval method."
465
+ },
466
+ {
467
+ "question_id": "qA-pa-03",
468
+ "category": "paraphrased_semantic_recall",
469
+ "question": "How should the assistant handle Tomas's calendar when he wants quiet, uninterrupted work time?",
470
+ "expected_answer": "Reserve two-hour quiet blocks in the morning and keep meetings out of them.",
471
+ "acceptable_answer_criteria": [
472
+ "mentions two-hour blocks",
473
+ "mentions morning placement",
474
+ "no meetings during the block"
475
+ ],
476
+ "required_memory_ids": [
477
+ "memA-pa-008"
478
+ ],
479
+ "forbidden_memory_ids": [],
480
+ "difficulty": "medium",
481
+ "architecture_bias_risk": "low",
482
+ "fairness_note": "Reworded from 'focus time' procedural memory without reusing its exact phrasing."
483
+ },
484
+ {
485
+ "question_id": "qA-pa-04",
486
+ "category": "contradiction_resolution",
487
+ "question": "Which days is Tomas currently in the office?",
488
+ "expected_answer": "Mondays and Wednesdays. This replaced his earlier Tuesday/Thursday hybrid schedule after his team reorganized.",
489
+ "acceptable_answer_criteria": [
490
+ "gives Monday/Wednesday",
491
+ "notes the change from the old schedule"
492
+ ],
493
+ "required_memory_ids": [
494
+ "memA-pa-009"
495
+ ],
496
+ "forbidden_memory_ids": [
497
+ "memA-pa-001"
498
+ ],
499
+ "difficulty": "medium",
500
+ "architecture_bias_risk": "medium",
501
+ "fairness_note": "Tests whether the system updates a previously stable fact rather than returning the older, now-stale schedule."
502
+ },
503
+ {
504
+ "question_id": "qA-pa-05",
505
+ "category": "noise_resistance",
506
+ "question": "Is picking a restaurant for his sister's birthday part of Tomas's regular eating routine?",
507
+ "expected_answer": "No, that was a one-off event for a birthday dinner, unrelated to his standing pescatarian diet.",
508
+ "acceptable_answer_criteria": [
509
+ "identifies it as a one-off, not a standing preference"
510
+ ],
511
+ "required_memory_ids": [
512
+ "memA-pa-005",
513
+ "memA-pa-004"
514
+ ],
515
+ "forbidden_memory_ids": [],
516
+ "difficulty": "medium",
517
+ "architecture_bias_risk": "medium",
518
+ "fairness_note": "Distractor shares the 'food' topic with his diet preference but describes an unrelated single event."
519
+ }
520
+ ]
521
+ },
522
+ {
523
+ "conversation_id": "convA-research-01",
524
+ "agent_id": "agent-research",
525
+ "domain": "research",
526
+ "memory_records": [
527
+ {
528
+ "id": "memA-res-001",
529
+ "type": "semantic",
530
+ "timestamp": "2025-12-01T10:00:00Z",
531
+ "content": "Elena's thesis focuses on how tree canopy coverage affects surface temperatures in mid-sized cities.",
532
+ "tags": [
533
+ "thesis",
534
+ "topic"
535
+ ],
536
+ "importance": 0.7,
537
+ "metadata": {
538
+ "source_turn": "turn-001",
539
+ "speaker": "user"
540
+ },
541
+ "associations": []
542
+ },
543
+ {
544
+ "id": "memA-res-002",
545
+ "type": "episodic",
546
+ "timestamp": "2025-12-03T11:00:00Z",
547
+ "content": "Elena's advisor, Professor Nakamura, suggested she look at a 2019 satellite-imagery methodology paper by Choudhury et al. for measuring canopy density.",
548
+ "tags": [
549
+ "advisor",
550
+ "methodology"
551
+ ],
552
+ "importance": 0.65,
553
+ "metadata": {
554
+ "source_turn": "turn-002",
555
+ "speaker": "user"
556
+ },
557
+ "associations": []
558
+ },
559
+ {
560
+ "id": "memA-res-003",
561
+ "type": "semantic",
562
+ "timestamp": "2025-12-10T09:30:00Z",
563
+ "content": "Reading the Choudhury paper led Elena to adopt NDVI (Normalized Difference Vegetation Index) thresholds as her canopy-density measure.",
564
+ "tags": [
565
+ "methodology",
566
+ "ndvi"
567
+ ],
568
+ "importance": 0.75,
569
+ "metadata": {
570
+ "source_turn": "turn-003",
571
+ "speaker": "user"
572
+ },
573
+ "associations": [
574
+ {
575
+ "target_id": "memA-res-002",
576
+ "strength": 0.8,
577
+ "reason": "methodology adopted because of the advisor's suggested paper"
578
+ }
579
+ ]
580
+ },
581
+ {
582
+ "id": "memA-res-004",
583
+ "type": "episodic",
584
+ "timestamp": "2026-01-08T15:00:00Z",
585
+ "content": "Elena narrowed her thesis scope from three cities down to just one, Cordoba, after her committee flagged the original scope as too broad for a one-year thesis.",
586
+ "tags": [
587
+ "scope"
588
+ ],
589
+ "importance": 0.7,
590
+ "metadata": {
591
+ "source_turn": "turn-004",
592
+ "speaker": "user"
593
+ },
594
+ "associations": []
595
+ },
596
+ {
597
+ "id": "memA-res-005",
598
+ "type": "episodic",
599
+ "timestamp": "2026-01-09T10:00:00Z",
600
+ "content": "Elena briefly considered a side project on coastal erosion with a labmate but decided not to pursue it due to time constraints.",
601
+ "tags": [
602
+ "side-project"
603
+ ],
604
+ "importance": 0.25,
605
+ "metadata": {
606
+ "source_turn": "turn-005",
607
+ "speaker": "user"
608
+ },
609
+ "associations": []
610
+ },
611
+ {
612
+ "id": "memA-res-006",
613
+ "type": "semantic",
614
+ "timestamp": "2026-02-14T13:00:00Z",
615
+ "content": "Elena's preliminary results show neighborhoods with over 30% canopy coverage run roughly 2.5°C cooler on average during summer afternoons.",
616
+ "tags": [
617
+ "results"
618
+ ],
619
+ "importance": 0.8,
620
+ "metadata": {
621
+ "source_turn": "turn-006",
622
+ "speaker": "user"
623
+ },
624
+ "associations": []
625
+ },
626
+ {
627
+ "id": "memA-res-007",
628
+ "type": "episodic",
629
+ "timestamp": "2026-03-01T09:00:00Z",
630
+ "content": "Elena's early hypothesis was that building material, asphalt versus concrete, would matter more than tree cover, but her data later showed canopy coverage was the stronger predictor.",
631
+ "tags": [
632
+ "hypothesis",
633
+ "results"
634
+ ],
635
+ "importance": 0.75,
636
+ "metadata": {
637
+ "source_turn": "turn-007",
638
+ "speaker": "user"
639
+ },
640
+ "associations": []
641
+ },
642
+ {
643
+ "id": "memA-res-008",
644
+ "type": "procedural",
645
+ "timestamp": "2026-03-15T14:00:00Z",
646
+ "content": "Elena's data-cleaning routine for new satellite passes is: filter out cloud cover above 10%, normalize for time-of-day, then run the NDVI threshold script.",
647
+ "tags": [
648
+ "workflow"
649
+ ],
650
+ "importance": 0.5,
651
+ "metadata": {
652
+ "source_turn": "turn-008",
653
+ "speaker": "user"
654
+ },
655
+ "associations": []
656
+ },
657
+ {
658
+ "id": "memA-res-009",
659
+ "type": "episodic",
660
+ "timestamp": "2026-05-02T16:00:00Z",
661
+ "content": "Elena submitted her draft methodology section, crediting the Choudhury et al. paper as the basis for her NDVI approach, itself originally suggested by Professor Nakamura.",
662
+ "tags": [
663
+ "methodology",
664
+ "writing"
665
+ ],
666
+ "importance": 0.6,
667
+ "metadata": {
668
+ "source_turn": "turn-009",
669
+ "speaker": "user"
670
+ },
671
+ "associations": [
672
+ {
673
+ "target_id": "memA-res-003",
674
+ "strength": 0.7,
675
+ "reason": "restates the adopted methodology"
676
+ },
677
+ {
678
+ "target_id": "memA-res-002",
679
+ "strength": 0.6,
680
+ "reason": "traces the methodology back to the advisor's original suggestion"
681
+ }
682
+ ]
683
+ }
684
+ ],
685
+ "questions": [
686
+ {
687
+ "question_id": "qA-res-01",
688
+ "category": "atomic_fact_recall",
689
+ "question": "Which city did Elena ultimately choose to focus her thesis on?",
690
+ "expected_answer": "Cordoba",
691
+ "acceptable_answer_criteria": [
692
+ "names Cordoba"
693
+ ],
694
+ "required_memory_ids": [
695
+ "memA-res-004"
696
+ ],
697
+ "forbidden_memory_ids": [],
698
+ "difficulty": "easy",
699
+ "architecture_bias_risk": "low",
700
+ "fairness_note": "Single stated fact; no relational chain needed."
701
+ },
702
+ {
703
+ "question_id": "qA-res-02",
704
+ "category": "paraphrased_semantic_recall",
705
+ "question": "What approach does Elena use to quantify how much tree canopy an area has?",
706
+ "expected_answer": "NDVI (Normalized Difference Vegetation Index) thresholds, drawn from the Choudhury et al. methodology.",
707
+ "acceptable_answer_criteria": [
708
+ "mentions NDVI",
709
+ "credits the methodology source"
710
+ ],
711
+ "required_memory_ids": [
712
+ "memA-res-003"
713
+ ],
714
+ "forbidden_memory_ids": [],
715
+ "difficulty": "medium",
716
+ "architecture_bias_risk": "low",
717
+ "fairness_note": "Rewords 'canopy density measure' as 'how much tree canopy an area has'; answerable via semantic similarity alone."
718
+ },
719
+ {
720
+ "question_id": "qA-res-03",
721
+ "category": "temporal_update",
722
+ "question": "How many cities is Elena currently studying for her thesis?",
723
+ "expected_answer": "One — Cordoba. She originally planned three but narrowed the scope after committee feedback.",
724
+ "acceptable_answer_criteria": [
725
+ "states one/Cordoba",
726
+ "notes the reduction from three"
727
+ ],
728
+ "required_memory_ids": [
729
+ "memA-res-004"
730
+ ],
731
+ "forbidden_memory_ids": [],
732
+ "difficulty": "medium",
733
+ "architecture_bias_risk": "medium",
734
+ "fairness_note": "Tests whether the system reports the current, narrowed scope rather than an outdated broader plan."
735
+ },
736
+ {
737
+ "question_id": "qA-res-04",
738
+ "category": "contradiction_resolution",
739
+ "question": "Did Elena's data end up supporting her original hypothesis about building material?",
740
+ "expected_answer": "No. She initially hypothesized building material mattered more, but her data showed canopy coverage was the stronger predictor of cooling.",
741
+ "acceptable_answer_criteria": [
742
+ "names the original hypothesis",
743
+ "states the data contradicted it"
744
+ ],
745
+ "required_memory_ids": [
746
+ "memA-res-007"
747
+ ],
748
+ "forbidden_memory_ids": [],
749
+ "difficulty": "medium",
750
+ "architecture_bias_risk": "medium",
751
+ "fairness_note": "The contradiction is between a stated hypothesis and a later stated result within one record; any system reading the full record can resolve it without graph traversal."
752
+ },
753
+ {
754
+ "question_id": "qA-res-05",
755
+ "category": "noise_resistance",
756
+ "question": "Did Elena's coastal erosion side-project become part of her thesis?",
757
+ "expected_answer": "No, she decided not to pursue it due to time constraints; it stayed a brief side consideration, not part of the urban heat island thesis.",
758
+ "acceptable_answer_criteria": [
759
+ "identifies it as not pursued/unrelated"
760
+ ],
761
+ "required_memory_ids": [
762
+ "memA-res-005",
763
+ "memA-res-001"
764
+ ],
765
+ "forbidden_memory_ids": [],
766
+ "difficulty": "medium",
767
+ "architecture_bias_risk": "medium",
768
+ "fairness_note": "Distractor topic (a different environmental research idea) is thematically close but explicitly dropped, testing resistance to topical noise."
769
+ }
770
+ ]
771
+ },
772
+ {
773
+ "conversation_id": "convA-travel-01",
774
+ "agent_id": "agent-travel",
775
+ "domain": "travel",
776
+ "memory_records": [
777
+ {
778
+ "id": "memA-tr-001",
779
+ "type": "episodic",
780
+ "timestamp": "2026-01-05T10:00:00Z",
781
+ "content": "Raf started planning a one-week trip to Lisbon, originally for early September.",
782
+ "tags": [
783
+ "lisbon",
784
+ "planning"
785
+ ],
786
+ "importance": 0.6,
787
+ "metadata": {
788
+ "source_turn": "turn-001",
789
+ "speaker": "user"
790
+ },
791
+ "associations": []
792
+ },
793
+ {
794
+ "id": "memA-tr-002",
795
+ "type": "episodic",
796
+ "timestamp": "2026-01-06T11:00:00Z",
797
+ "content": "Raf booked a budget hostel near Alfama to keep costs low.",
798
+ "tags": [
799
+ "lisbon",
800
+ "lodging"
801
+ ],
802
+ "importance": 0.5,
803
+ "metadata": {
804
+ "source_turn": "turn-002",
805
+ "speaker": "user"
806
+ },
807
+ "associations": []
808
+ },
809
+ {
810
+ "id": "memA-tr-003",
811
+ "type": "semantic",
812
+ "timestamp": "2026-01-20T09:00:00Z",
813
+ "content": "Raf is allergic to shellfish, which matters for restaurant recommendations in a coastal city like Lisbon.",
814
+ "tags": [
815
+ "allergy",
816
+ "dining"
817
+ ],
818
+ "importance": 0.75,
819
+ "metadata": {
820
+ "source_turn": "turn-003",
821
+ "speaker": "user"
822
+ },
823
+ "associations": []
824
+ },
825
+ {
826
+ "id": "memA-tr-004",
827
+ "type": "episodic",
828
+ "timestamp": "2026-02-15T15:00:00Z",
829
+ "content": "Raf upgraded from the hostel to a boutique hotel in Chiado, deciding to treat himself since his 30th birthday falls during the trip.",
830
+ "tags": [
831
+ "lisbon",
832
+ "lodging"
833
+ ],
834
+ "importance": 0.7,
835
+ "metadata": {
836
+ "source_turn": "turn-004",
837
+ "speaker": "user"
838
+ },
839
+ "associations": [
840
+ {
841
+ "target_id": "memA-tr-002",
842
+ "strength": 0.9,
843
+ "reason": "replaces the originally booked hostel"
844
+ }
845
+ ]
846
+ },
847
+ {
848
+ "id": "memA-tr-005",
849
+ "type": "episodic",
850
+ "timestamp": "2026-02-16T16:00:00Z",
851
+ "content": "Raf recalled that on a trip to Madrid two years earlier he stayed in a similar boutique hotel and loved the central location.",
852
+ "tags": [
853
+ "madrid",
854
+ "past-trip"
855
+ ],
856
+ "importance": 0.3,
857
+ "metadata": {
858
+ "source_turn": "turn-005",
859
+ "speaker": "user"
860
+ },
861
+ "associations": []
862
+ },
863
+ {
864
+ "id": "memA-tr-006",
865
+ "type": "episodic",
866
+ "timestamp": "2026-04-10T12:00:00Z",
867
+ "content": "Raf moved his Lisbon trip dates from early September to late September because of a work conflict.",
868
+ "tags": [
869
+ "lisbon",
870
+ "dates"
871
+ ],
872
+ "importance": 0.65,
873
+ "metadata": {
874
+ "source_turn": "turn-006",
875
+ "speaker": "user"
876
+ },
877
+ "associations": []
878
+ },
879
+ {
880
+ "id": "memA-tr-007",
881
+ "type": "procedural",
882
+ "timestamp": "2026-04-12T13:00:00Z",
883
+ "content": "Raf prefers to book day trips no more than a week in advance so he can adjust plans based on the weather.",
884
+ "tags": [
885
+ "workflow"
886
+ ],
887
+ "importance": 0.45,
888
+ "metadata": {
889
+ "source_turn": "turn-007",
890
+ "speaker": "user"
891
+ },
892
+ "associations": []
893
+ },
894
+ {
895
+ "id": "memA-tr-008",
896
+ "type": "episodic",
897
+ "timestamp": "2026-05-01T10:00:00Z",
898
+ "content": "Raf added a day trip to Sintra to see Pena Palace, booked through a small local tour operator his hotel concierge recommended.",
899
+ "tags": [
900
+ "lisbon",
901
+ "sintra"
902
+ ],
903
+ "importance": 0.55,
904
+ "metadata": {
905
+ "source_turn": "turn-008",
906
+ "speaker": "user"
907
+ },
908
+ "associations": [
909
+ {
910
+ "target_id": "memA-tr-004",
911
+ "strength": 0.5,
912
+ "reason": "the concierge recommendation came from the Chiado hotel stay"
913
+ }
914
+ ]
915
+ },
916
+ {
917
+ "id": "memA-tr-009",
918
+ "type": "episodic",
919
+ "timestamp": "2026-06-02T09:00:00Z",
920
+ "content": "Raf confirmed his flight: arriving in Lisbon September 28th, departing October 5th.",
921
+ "tags": [
922
+ "lisbon",
923
+ "dates"
924
+ ],
925
+ "importance": 0.8,
926
+ "metadata": {
927
+ "source_turn": "turn-009",
928
+ "speaker": "user"
929
+ },
930
+ "associations": [
931
+ {
932
+ "target_id": "memA-tr-006",
933
+ "strength": 0.85,
934
+ "reason": "confirms the previously moved dates"
935
+ }
936
+ ]
937
+ }
938
+ ],
939
+ "questions": [
940
+ {
941
+ "question_id": "qA-tr-01",
942
+ "category": "atomic_fact_recall",
943
+ "question": "What food allergy does Raf have that matters for restaurant planning?",
944
+ "expected_answer": "Shellfish",
945
+ "acceptable_answer_criteria": [
946
+ "states shellfish"
947
+ ],
948
+ "required_memory_ids": [
949
+ "memA-tr-003"
950
+ ],
951
+ "forbidden_memory_ids": [],
952
+ "difficulty": "easy",
953
+ "architecture_bias_risk": "low",
954
+ "fairness_note": "Single explicit fact."
955
+ },
956
+ {
957
+ "question_id": "qA-tr-02",
958
+ "category": "paraphrased_semantic_recall",
959
+ "question": "How far ahead does Raf like to lock in his day trips?",
960
+ "expected_answer": "No more than a week ahead, so he can adjust based on the weather.",
961
+ "acceptable_answer_criteria": [
962
+ "mentions about a week",
963
+ "mentions weather flexibility"
964
+ ],
965
+ "required_memory_ids": [
966
+ "memA-tr-007"
967
+ ],
968
+ "forbidden_memory_ids": [],
969
+ "difficulty": "medium",
970
+ "architecture_bias_risk": "low",
971
+ "fairness_note": "Reworded from the procedural memory's phrasing; testable by semantic similarity."
972
+ },
973
+ {
974
+ "question_id": "qA-tr-03",
975
+ "category": "temporal_update",
976
+ "question": "When is Raf's Lisbon trip currently scheduled?",
977
+ "expected_answer": "September 28th to October 5th — moved from the original early-September plan.",
978
+ "acceptable_answer_criteria": [
979
+ "gives the new dates",
980
+ "notes they were moved"
981
+ ],
982
+ "required_memory_ids": [
983
+ "memA-tr-009",
984
+ "memA-tr-006"
985
+ ],
986
+ "forbidden_memory_ids": [],
987
+ "difficulty": "medium",
988
+ "architecture_bias_risk": "medium",
989
+ "fairness_note": "Requires surfacing the latest confirmed dates over the earlier 'early September' plan."
990
+ },
991
+ {
992
+ "question_id": "qA-tr-04",
993
+ "category": "contradiction_resolution",
994
+ "question": "Where is Raf staying in Lisbon?",
995
+ "expected_answer": "A boutique hotel in Chiado — he upgraded from the hostel near Alfama he originally booked.",
996
+ "acceptable_answer_criteria": [
997
+ "names Chiado boutique hotel",
998
+ "notes the upgrade from the hostel"
999
+ ],
1000
+ "required_memory_ids": [
1001
+ "memA-tr-004"
1002
+ ],
1003
+ "forbidden_memory_ids": [
1004
+ "memA-tr-002"
1005
+ ],
1006
+ "difficulty": "medium",
1007
+ "architecture_bias_risk": "medium",
1008
+ "fairness_note": "Tests preference for the superseding lodging record over the stale hostel booking."
1009
+ },
1010
+ {
1011
+ "question_id": "qA-tr-05",
1012
+ "category": "noise_resistance",
1013
+ "question": "Did Raf stay in a boutique hotel in Lisbon on an earlier trip?",
1014
+ "expected_answer": "No — the boutique hotel he recalled was in Madrid on a previous trip, not in Lisbon.",
1015
+ "acceptable_answer_criteria": [
1016
+ "correctly attributes the memory to Madrid, not Lisbon"
1017
+ ],
1018
+ "required_memory_ids": [
1019
+ "memA-tr-005",
1020
+ "memA-tr-004"
1021
+ ],
1022
+ "forbidden_memory_ids": [],
1023
+ "difficulty": "medium",
1024
+ "architecture_bias_risk": "medium",
1025
+ "fairness_note": "Distractor mentions a boutique hotel and is thematically close to the current Lisbon hotel, but refers to a different city/trip."
1026
+ }
1027
+ ]
1028
+ },
1029
+ {
1030
+ "conversation_id": "convA-health-admin-01",
1031
+ "agent_id": "agent-health",
1032
+ "domain": "health_admin",
1033
+ "memory_records": [
1034
+ {
1035
+ "id": "memA-ha-001",
1036
+ "type": "episodic",
1037
+ "timestamp": "2025-10-02T09:00:00Z",
1038
+ "content": "Priya started coordinating medical care for her father, Vikram, after his recent diagnosis of high blood pressure.",
1039
+ "tags": [
1040
+ "vikram",
1041
+ "care-coordination"
1042
+ ],
1043
+ "importance": 0.7,
1044
+ "metadata": {
1045
+ "source_turn": "turn-001",
1046
+ "speaker": "user"
1047
+ },
1048
+ "associations": []
1049
+ },
1050
+ {
1051
+ "id": "memA-ha-002",
1052
+ "type": "semantic",
1053
+ "timestamp": "2025-10-05T10:00:00Z",
1054
+ "content": "Vikram's doctor, Dr. Alam, initially prescribed 5mg of amlodipine daily for his blood pressure.",
1055
+ "tags": [
1056
+ "vikram",
1057
+ "medication"
1058
+ ],
1059
+ "importance": 0.7,
1060
+ "metadata": {
1061
+ "source_turn": "turn-002",
1062
+ "speaker": "user"
1063
+ },
1064
+ "associations": []
1065
+ },
1066
+ {
1067
+ "id": "memA-ha-003",
1068
+ "type": "episodic",
1069
+ "timestamp": "2026-01-12T11:00:00Z",
1070
+ "content": "At a follow-up, Dr. Alam increased Vikram's amlodipine dosage to 10mg daily after his readings stayed high.",
1071
+ "tags": [
1072
+ "vikram",
1073
+ "medication"
1074
+ ],
1075
+ "importance": 0.8,
1076
+ "metadata": {
1077
+ "source_turn": "turn-003",
1078
+ "speaker": "user"
1079
+ },
1080
+ "associations": [
1081
+ {
1082
+ "target_id": "memA-ha-002",
1083
+ "strength": 0.9,
1084
+ "reason": "updates the original 5mg dosage"
1085
+ }
1086
+ ]
1087
+ },
1088
+ {
1089
+ "id": "memA-ha-004",
1090
+ "type": "episodic",
1091
+ "timestamp": "2026-01-13T12:00:00Z",
1092
+ "content": "Priya mentioned her own annual dental cleaning was due around the same time, unrelated to her father's care.",
1093
+ "tags": [
1094
+ "priya",
1095
+ "personal"
1096
+ ],
1097
+ "importance": 0.2,
1098
+ "metadata": {
1099
+ "source_turn": "turn-004",
1100
+ "speaker": "user"
1101
+ },
1102
+ "associations": []
1103
+ },
1104
+ {
1105
+ "id": "memA-ha-005",
1106
+ "type": "episodic",
1107
+ "timestamp": "2026-02-20T14:00:00Z",
1108
+ "content": "Dr. Alam referred Vikram to a cardiologist, Dr. Reyes, after an EKG showed an irregular heartbeat.",
1109
+ "tags": [
1110
+ "vikram",
1111
+ "referral"
1112
+ ],
1113
+ "importance": 0.75,
1114
+ "metadata": {
1115
+ "source_turn": "turn-005",
1116
+ "speaker": "user"
1117
+ },
1118
+ "associations": [
1119
+ {
1120
+ "target_id": "memA-ha-003",
1121
+ "strength": 0.6,
1122
+ "reason": "referral followed continued high readings after the dosage increase"
1123
+ }
1124
+ ]
1125
+ },
1126
+ {
1127
+ "id": "memA-ha-006",
1128
+ "type": "episodic",
1129
+ "timestamp": "2026-02-28T15:00:00Z",
1130
+ "content": "Dr. Reyes reviewed the EKG and recommended a 24-hour Holter monitor test before deciding on further treatment.",
1131
+ "tags": [
1132
+ "vikram",
1133
+ "cardiology"
1134
+ ],
1135
+ "importance": 0.7,
1136
+ "metadata": {
1137
+ "source_turn": "turn-006",
1138
+ "speaker": "user"
1139
+ },
1140
+ "associations": [
1141
+ {
1142
+ "target_id": "memA-ha-005",
1143
+ "strength": 0.85,
1144
+ "reason": "follow-up test ordered after the referral"
1145
+ }
1146
+ ]
1147
+ },
1148
+ {
1149
+ "id": "memA-ha-007",
1150
+ "type": "semantic",
1151
+ "timestamp": "2026-03-10T09:00:00Z",
1152
+ "content": "Vikram's insurance switched from Plan B to Plan C in March, changing his specialist-visit copay from $40 to $25.",
1153
+ "tags": [
1154
+ "vikram",
1155
+ "insurance"
1156
+ ],
1157
+ "importance": 0.6,
1158
+ "metadata": {
1159
+ "source_turn": "turn-007",
1160
+ "speaker": "user"
1161
+ },
1162
+ "associations": []
1163
+ },
1164
+ {
1165
+ "id": "memA-ha-008",
1166
+ "type": "procedural",
1167
+ "timestamp": "2026-03-15T10:00:00Z",
1168
+ "content": "To refill Vikram's amlodipine prescription, Priya calls the pharmacy two days before it runs out and confirms Dr. Alam's office has approved the refill.",
1169
+ "tags": [
1170
+ "vikram",
1171
+ "workflow"
1172
+ ],
1173
+ "importance": 0.55,
1174
+ "metadata": {
1175
+ "source_turn": "turn-008",
1176
+ "speaker": "user"
1177
+ },
1178
+ "associations": []
1179
+ },
1180
+ {
1181
+ "id": "memA-ha-009",
1182
+ "type": "episodic",
1183
+ "timestamp": "2026-05-18T16:00:00Z",
1184
+ "content": "The Holter monitor results came back normal, and Dr. Reyes cleared Vikram to continue his current blood pressure regimen without further cardiac intervention.",
1185
+ "tags": [
1186
+ "vikram",
1187
+ "cardiology",
1188
+ "results"
1189
+ ],
1190
+ "importance": 0.75,
1191
+ "metadata": {
1192
+ "source_turn": "turn-009",
1193
+ "speaker": "user"
1194
+ },
1195
+ "associations": [
1196
+ {
1197
+ "target_id": "memA-ha-006",
1198
+ "strength": 0.85,
1199
+ "reason": "this is the result of the ordered Holter monitor test"
1200
+ }
1201
+ ]
1202
+ }
1203
+ ],
1204
+ "questions": [
1205
+ {
1206
+ "question_id": "qA-ha-01",
1207
+ "category": "atomic_fact_recall",
1208
+ "question": "What is Vikram's current amlodipine dosage?",
1209
+ "expected_answer": "10mg daily",
1210
+ "acceptable_answer_criteria": [
1211
+ "states 10mg"
1212
+ ],
1213
+ "required_memory_ids": [
1214
+ "memA-ha-003"
1215
+ ],
1216
+ "forbidden_memory_ids": [],
1217
+ "difficulty": "easy",
1218
+ "architecture_bias_risk": "low",
1219
+ "fairness_note": "Most recent dosage record states this directly."
1220
+ },
1221
+ {
1222
+ "question_id": "qA-ha-02",
1223
+ "category": "paraphrased_semantic_recall",
1224
+ "question": "What changed about Vikram's specialist-visit costs in March?",
1225
+ "expected_answer": "His copay dropped from $40 to $25 after switching insurance plans.",
1226
+ "acceptable_answer_criteria": [
1227
+ "mentions the copay drop",
1228
+ "mentions the insurance switch"
1229
+ ],
1230
+ "required_memory_ids": [
1231
+ "memA-ha-007"
1232
+ ],
1233
+ "forbidden_memory_ids": [],
1234
+ "difficulty": "medium",
1235
+ "architecture_bias_risk": "low",
1236
+ "fairness_note": "Reworded from 'copay' language without copying it verbatim."
1237
+ },
1238
+ {
1239
+ "question_id": "qA-ha-03",
1240
+ "category": "temporal_update",
1241
+ "question": "What is Vikram's blood pressure medication dosage now, and how did it get there?",
1242
+ "expected_answer": "10mg daily now, increased by Dr. Alam from the original 5mg dose at a January follow-up.",
1243
+ "acceptable_answer_criteria": [
1244
+ "gives current 10mg",
1245
+ "traces it back to the 5mg starting dose"
1246
+ ],
1247
+ "required_memory_ids": [
1248
+ "memA-ha-003",
1249
+ "memA-ha-002"
1250
+ ],
1251
+ "forbidden_memory_ids": [],
1252
+ "difficulty": "medium",
1253
+ "architecture_bias_risk": "medium",
1254
+ "fairness_note": "Tests resolving the current value while still being able to reference the prior one for context."
1255
+ },
1256
+ {
1257
+ "question_id": "qA-ha-04",
1258
+ "category": "multi_hop_association",
1259
+ "question": "Why did Vikram end up getting a Holter monitor test?",
1260
+ "expected_answer": "An EKG showed an irregular heartbeat, which led Dr. Alam to refer him to cardiologist Dr. Reyes, who then ordered the Holter monitor.",
1261
+ "acceptable_answer_criteria": [
1262
+ "mentions the EKG finding",
1263
+ "mentions the referral to Dr. Reyes",
1264
+ "connects it to the Holter monitor order"
1265
+ ],
1266
+ "required_memory_ids": [
1267
+ "memA-ha-005",
1268
+ "memA-ha-006"
1269
+ ],
1270
+ "forbidden_memory_ids": [],
1271
+ "difficulty": "hard",
1272
+ "architecture_bias_risk": "high",
1273
+ "fairness_note": "Requires chaining two linked records (referral -> follow-up test); a pure single-vector lookup on the question text alone would likely retrieve only one of the two records."
1274
+ },
1275
+ {
1276
+ "question_id": "qA-ha-05",
1277
+ "category": "procedural_recall",
1278
+ "question": "What does Priya do to refill her father's blood pressure medication?",
1279
+ "expected_answer": "She calls the pharmacy two days before it runs out and confirms Dr. Alam's office approved the refill.",
1280
+ "acceptable_answer_criteria": [
1281
+ "mentions calling two days ahead",
1282
+ "mentions confirming approval with the doctor's office"
1283
+ ],
1284
+ "required_memory_ids": [
1285
+ "memA-ha-008"
1286
+ ],
1287
+ "forbidden_memory_ids": [],
1288
+ "difficulty": "easy",
1289
+ "architecture_bias_risk": "low",
1290
+ "fairness_note": "Procedure is stated in a single record; no relational reasoning needed."
1291
+ }
1292
+ ]
1293
+ },
1294
+ {
1295
+ "conversation_id": "convA-finance-admin-01",
1296
+ "agent_id": "agent-finance",
1297
+ "domain": "finance_admin",
1298
+ "memory_records": [
1299
+ {
1300
+ "id": "memA-fa-001",
1301
+ "type": "semantic",
1302
+ "timestamp": "2025-11-01T08:00:00Z",
1303
+ "content": "Jonas set a monthly savings goal of $400 when he started budgeting with the assistant.",
1304
+ "tags": [
1305
+ "budget",
1306
+ "goal"
1307
+ ],
1308
+ "importance": 0.65,
1309
+ "metadata": {
1310
+ "source_turn": "turn-001",
1311
+ "speaker": "user"
1312
+ },
1313
+ "associations": []
1314
+ },
1315
+ {
1316
+ "id": "memA-fa-002",
1317
+ "type": "episodic",
1318
+ "timestamp": "2025-11-10T09:00:00Z",
1319
+ "content": "Jonas switched his primary checking account from Northfield Bank to Harbor Credit Union for better fees.",
1320
+ "tags": [
1321
+ "banking"
1322
+ ],
1323
+ "importance": 0.6,
1324
+ "metadata": {
1325
+ "source_turn": "turn-002",
1326
+ "speaker": "user"
1327
+ },
1328
+ "associations": []
1329
+ },
1330
+ {
1331
+ "id": "memA-fa-003",
1332
+ "type": "episodic",
1333
+ "timestamp": "2026-01-05T10:00:00Z",
1334
+ "content": "Jonas canceled a $15/month streaming subscription he rarely used.",
1335
+ "tags": [
1336
+ "expenses"
1337
+ ],
1338
+ "importance": 0.5,
1339
+ "metadata": {
1340
+ "source_turn": "turn-003",
1341
+ "speaker": "user"
1342
+ },
1343
+ "associations": []
1344
+ },
1345
+ {
1346
+ "id": "memA-fa-004",
1347
+ "type": "episodic",
1348
+ "timestamp": "2026-01-06T11:00:00Z",
1349
+ "content": "Jonas also canceled a $30/month gym membership after switching to home workouts, noting it freed up budget room.",
1350
+ "tags": [
1351
+ "expenses"
1352
+ ],
1353
+ "importance": 0.5,
1354
+ "metadata": {
1355
+ "source_turn": "turn-004",
1356
+ "speaker": "user"
1357
+ },
1358
+ "associations": []
1359
+ },
1360
+ {
1361
+ "id": "memA-fa-005",
1362
+ "type": "episodic",
1363
+ "timestamp": "2026-01-20T12:00:00Z",
1364
+ "content": "With the combined $45/month in cancellations, Jonas raised his monthly savings goal from $400 to $450.",
1365
+ "tags": [
1366
+ "budget",
1367
+ "goal"
1368
+ ],
1369
+ "importance": 0.75,
1370
+ "metadata": {
1371
+ "source_turn": "turn-005",
1372
+ "speaker": "user"
1373
+ },
1374
+ "associations": [
1375
+ {
1376
+ "target_id": "memA-fa-003",
1377
+ "strength": 0.7,
1378
+ "reason": "freed-up subscription money funded the higher goal"
1379
+ },
1380
+ {
1381
+ "target_id": "memA-fa-004",
1382
+ "strength": 0.7,
1383
+ "reason": "freed-up gym money funded the higher goal"
1384
+ },
1385
+ {
1386
+ "target_id": "memA-fa-001",
1387
+ "strength": 0.85,
1388
+ "reason": "updates the original $400 goal"
1389
+ }
1390
+ ]
1391
+ },
1392
+ {
1393
+ "id": "memA-fa-006",
1394
+ "type": "episodic",
1395
+ "timestamp": "2026-02-02T13:00:00Z",
1396
+ "content": "Jonas's sister asked him for advice on her own student loan refinancing, which he helped research separately.",
1397
+ "tags": [
1398
+ "other-person"
1399
+ ],
1400
+ "importance": 0.2,
1401
+ "metadata": {
1402
+ "source_turn": "turn-006",
1403
+ "speaker": "user"
1404
+ },
1405
+ "associations": []
1406
+ },
1407
+ {
1408
+ "id": "memA-fa-007",
1409
+ "type": "procedural",
1410
+ "timestamp": "2026-02-10T14:00:00Z",
1411
+ "content": "Jonas's process for disputing a charge is to first call Harbor Credit Union's support line, then follow up in writing within 10 days if it isn't resolved.",
1412
+ "tags": [
1413
+ "workflow",
1414
+ "banking"
1415
+ ],
1416
+ "importance": 0.55,
1417
+ "metadata": {
1418
+ "source_turn": "turn-007",
1419
+ "speaker": "user"
1420
+ },
1421
+ "associations": []
1422
+ },
1423
+ {
1424
+ "id": "memA-fa-008",
1425
+ "type": "semantic",
1426
+ "timestamp": "2026-04-01T09:00:00Z",
1427
+ "content": "Jonas started allocating an extra $100/month toward an emergency fund, separate from his regular savings goal.",
1428
+ "tags": [
1429
+ "budget",
1430
+ "emergency-fund"
1431
+ ],
1432
+ "importance": 0.7,
1433
+ "metadata": {
1434
+ "source_turn": "turn-008",
1435
+ "speaker": "user"
1436
+ },
1437
+ "associations": []
1438
+ },
1439
+ {
1440
+ "id": "memA-fa-009",
1441
+ "type": "episodic",
1442
+ "timestamp": "2026-05-15T15:00:00Z",
1443
+ "content": "Jonas disputed an incorrect $60 charge from a hardware store and got it reversed within a week using his usual process.",
1444
+ "tags": [
1445
+ "banking",
1446
+ "dispute"
1447
+ ],
1448
+ "importance": 0.6,
1449
+ "metadata": {
1450
+ "source_turn": "turn-009",
1451
+ "speaker": "user"
1452
+ },
1453
+ "associations": [
1454
+ {
1455
+ "target_id": "memA-fa-007",
1456
+ "strength": 0.8,
1457
+ "reason": "followed his established dispute procedure"
1458
+ }
1459
+ ]
1460
+ }
1461
+ ],
1462
+ "questions": [
1463
+ {
1464
+ "question_id": "qA-fa-01",
1465
+ "category": "atomic_fact_recall",
1466
+ "question": "Which bank does Jonas currently use for checking?",
1467
+ "expected_answer": "Harbor Credit Union",
1468
+ "acceptable_answer_criteria": [
1469
+ "names Harbor Credit Union"
1470
+ ],
1471
+ "required_memory_ids": [
1472
+ "memA-fa-002"
1473
+ ],
1474
+ "forbidden_memory_ids": [],
1475
+ "difficulty": "easy",
1476
+ "architecture_bias_risk": "low",
1477
+ "fairness_note": "Directly stated single fact."
1478
+ },
1479
+ {
1480
+ "question_id": "qA-fa-02",
1481
+ "category": "paraphrased_semantic_recall",
1482
+ "question": "What is Jonas currently doing to build up a financial cushion for emergencies?",
1483
+ "expected_answer": "He puts an extra $100/month into an emergency fund, separate from his regular savings goal.",
1484
+ "acceptable_answer_criteria": [
1485
+ "mentions $100/month",
1486
+ "mentions it's separate from regular savings"
1487
+ ],
1488
+ "required_memory_ids": [
1489
+ "memA-fa-008"
1490
+ ],
1491
+ "forbidden_memory_ids": [],
1492
+ "difficulty": "medium",
1493
+ "architecture_bias_risk": "low",
1494
+ "fairness_note": "Rewords 'emergency fund' question without echoing the source sentence."
1495
+ },
1496
+ {
1497
+ "question_id": "qA-fa-03",
1498
+ "category": "temporal_update",
1499
+ "question": "What is Jonas's monthly savings goal right now?",
1500
+ "expected_answer": "$450 — raised from the original $400 goal.",
1501
+ "acceptable_answer_criteria": [
1502
+ "states $450",
1503
+ "notes it was raised from $400"
1504
+ ],
1505
+ "required_memory_ids": [
1506
+ "memA-fa-005",
1507
+ "memA-fa-001"
1508
+ ],
1509
+ "forbidden_memory_ids": [],
1510
+ "difficulty": "medium",
1511
+ "architecture_bias_risk": "medium",
1512
+ "fairness_note": "Requires returning the updated goal rather than the original starting figure."
1513
+ },
1514
+ {
1515
+ "question_id": "qA-fa-04",
1516
+ "category": "multi_hop_association",
1517
+ "question": "Why did Jonas raise his savings goal to $450?",
1518
+ "expected_answer": "Canceling his streaming subscription and gym membership freed up $45/month combined, which he redirected into a higher savings goal.",
1519
+ "acceptable_answer_criteria": [
1520
+ "mentions both cancellations",
1521
+ "connects the freed-up amount to the goal increase"
1522
+ ],
1523
+ "required_memory_ids": [
1524
+ "memA-fa-003",
1525
+ "memA-fa-004",
1526
+ "memA-fa-005"
1527
+ ],
1528
+ "forbidden_memory_ids": [],
1529
+ "difficulty": "hard",
1530
+ "architecture_bias_risk": "high",
1531
+ "fairness_note": "Requires combining two separate cancellation records with the goal-change record; a single nearest-neighbor lookup on the question alone may miss one of the three."
1532
+ },
1533
+ {
1534
+ "question_id": "qA-fa-05",
1535
+ "category": "procedural_recall",
1536
+ "question": "What does Jonas do when he needs to dispute a charge?",
1537
+ "expected_answer": "He calls Harbor Credit Union's support line first, then follows up in writing within 10 days if it's still unresolved.",
1538
+ "acceptable_answer_criteria": [
1539
+ "mentions calling support first",
1540
+ "mentions the 10-day written follow-up"
1541
+ ],
1542
+ "required_memory_ids": [
1543
+ "memA-fa-007"
1544
+ ],
1545
+ "forbidden_memory_ids": [],
1546
+ "difficulty": "easy",
1547
+ "architecture_bias_risk": "low",
1548
+ "fairness_note": "Single procedural record states this directly."
1549
+ }
1550
+ ]
1551
+ },
1552
+ {
1553
+ "conversation_id": "convA-education-01",
1554
+ "agent_id": "agent-tutor",
1555
+ "domain": "education",
1556
+ "memory_records": [
1557
+ {
1558
+ "id": "memA-ed-001",
1559
+ "type": "episodic",
1560
+ "timestamp": "2025-12-01T09:00:00Z",
1561
+ "content": "Aisha started studying for the PMP (Project Management Professional) certification exam.",
1562
+ "tags": [
1563
+ "pmp",
1564
+ "study"
1565
+ ],
1566
+ "importance": 0.7,
1567
+ "metadata": {
1568
+ "source_turn": "turn-001",
1569
+ "speaker": "user"
1570
+ },
1571
+ "associations": []
1572
+ },
1573
+ {
1574
+ "id": "memA-ed-002",
1575
+ "type": "episodic",
1576
+ "timestamp": "2025-12-05T10:00:00Z",
1577
+ "content": "Aisha originally scheduled her PMP exam for March 15.",
1578
+ "tags": [
1579
+ "pmp",
1580
+ "schedule"
1581
+ ],
1582
+ "importance": 0.6,
1583
+ "metadata": {
1584
+ "source_turn": "turn-002",
1585
+ "speaker": "user"
1586
+ },
1587
+ "associations": []
1588
+ },
1589
+ {
1590
+ "id": "memA-ed-003",
1591
+ "type": "episodic",
1592
+ "timestamp": "2026-02-01T11:00:00Z",
1593
+ "content": "Aisha joined a weekly study group organized through a local professional association.",
1594
+ "tags": [
1595
+ "pmp",
1596
+ "study-group"
1597
+ ],
1598
+ "importance": 0.55,
1599
+ "metadata": {
1600
+ "source_turn": "turn-003",
1601
+ "speaker": "user"
1602
+ },
1603
+ "associations": []
1604
+ },
1605
+ {
1606
+ "id": "memA-ed-004",
1607
+ "type": "episodic",
1608
+ "timestamp": "2026-02-08T12:00:00Z",
1609
+ "content": "A study group member recommended a practice-exam question bank from a provider called PrepMaster.",
1610
+ "tags": [
1611
+ "pmp",
1612
+ "resource"
1613
+ ],
1614
+ "importance": 0.5,
1615
+ "metadata": {
1616
+ "source_turn": "turn-004",
1617
+ "speaker": "user"
1618
+ },
1619
+ "associations": [
1620
+ {
1621
+ "target_id": "memA-ed-003",
1622
+ "strength": 0.8,
1623
+ "reason": "recommendation came from the study group"
1624
+ }
1625
+ ]
1626
+ },
1627
+ {
1628
+ "id": "memA-ed-005",
1629
+ "type": "episodic",
1630
+ "timestamp": "2026-02-15T13:00:00Z",
1631
+ "content": "Using the PrepMaster question bank, Aisha discovered she was consistently weak on risk-management questions.",
1632
+ "tags": [
1633
+ "pmp",
1634
+ "weak-area"
1635
+ ],
1636
+ "importance": 0.65,
1637
+ "metadata": {
1638
+ "source_turn": "turn-005",
1639
+ "speaker": "user"
1640
+ },
1641
+ "associations": [
1642
+ {
1643
+ "target_id": "memA-ed-004",
1644
+ "strength": 0.8,
1645
+ "reason": "weakness surfaced through this recommended resource"
1646
+ }
1647
+ ]
1648
+ },
1649
+ {
1650
+ "id": "memA-ed-006",
1651
+ "type": "episodic",
1652
+ "timestamp": "2026-02-20T14:00:00Z",
1653
+ "content": "Aisha pushed her PMP exam date back to April 20 to allow more time to shore up her risk-management knowledge.",
1654
+ "tags": [
1655
+ "pmp",
1656
+ "schedule"
1657
+ ],
1658
+ "importance": 0.75,
1659
+ "metadata": {
1660
+ "source_turn": "turn-006",
1661
+ "speaker": "user"
1662
+ },
1663
+ "associations": [
1664
+ {
1665
+ "target_id": "memA-ed-005",
1666
+ "strength": 0.85,
1667
+ "reason": "weak area discovery led to rescheduling"
1668
+ },
1669
+ {
1670
+ "target_id": "memA-ed-002",
1671
+ "strength": 0.7,
1672
+ "reason": "updates the original March 15 date"
1673
+ }
1674
+ ]
1675
+ },
1676
+ {
1677
+ "id": "memA-ed-007",
1678
+ "type": "episodic",
1679
+ "timestamp": "2026-03-01T09:00:00Z",
1680
+ "content": "Aisha mentioned she'd earned a different certification, Six Sigma Green Belt, two years earlier for a previous job.",
1681
+ "tags": [
1682
+ "other-certification"
1683
+ ],
1684
+ "importance": 0.25,
1685
+ "metadata": {
1686
+ "source_turn": "turn-007",
1687
+ "speaker": "user"
1688
+ },
1689
+ "associations": []
1690
+ },
1691
+ {
1692
+ "id": "memA-ed-008",
1693
+ "type": "procedural",
1694
+ "timestamp": "2026-03-10T10:00:00Z",
1695
+ "content": "Aisha's study routine is 20 practice questions every morning with immediate review of wrong answers, then one PMBOK chapter in the evening.",
1696
+ "tags": [
1697
+ "pmp",
1698
+ "workflow"
1699
+ ],
1700
+ "importance": 0.55,
1701
+ "metadata": {
1702
+ "source_turn": "turn-008",
1703
+ "speaker": "user"
1704
+ },
1705
+ "associations": []
1706
+ },
1707
+ {
1708
+ "id": "memA-ed-009",
1709
+ "type": "episodic",
1710
+ "timestamp": "2026-04-05T11:00:00Z",
1711
+ "content": "Aisha said she felt much more confident on risk-management topics after several weeks of focused practice.",
1712
+ "tags": [
1713
+ "pmp",
1714
+ "progress"
1715
+ ],
1716
+ "importance": 0.55,
1717
+ "metadata": {
1718
+ "source_turn": "turn-009",
1719
+ "speaker": "user"
1720
+ },
1721
+ "associations": []
1722
+ }
1723
+ ],
1724
+ "questions": [
1725
+ {
1726
+ "question_id": "qA-ed-01",
1727
+ "category": "atomic_fact_recall",
1728
+ "question": "What certification is Aisha studying for?",
1729
+ "expected_answer": "PMP (Project Management Professional)",
1730
+ "acceptable_answer_criteria": [
1731
+ "names PMP"
1732
+ ],
1733
+ "required_memory_ids": [
1734
+ "memA-ed-001"
1735
+ ],
1736
+ "forbidden_memory_ids": [],
1737
+ "difficulty": "easy",
1738
+ "architecture_bias_risk": "low",
1739
+ "fairness_note": "Direct single-fact recall."
1740
+ },
1741
+ {
1742
+ "question_id": "qA-ed-02",
1743
+ "category": "paraphrased_semantic_recall",
1744
+ "question": "What does Aisha's daily exam-prep routine look like?",
1745
+ "expected_answer": "20 practice questions each morning with immediate review of mistakes, plus a PMBOK chapter in the evening.",
1746
+ "acceptable_answer_criteria": [
1747
+ "mentions the morning practice questions",
1748
+ "mentions the evening reading"
1749
+ ],
1750
+ "required_memory_ids": [
1751
+ "memA-ed-008"
1752
+ ],
1753
+ "forbidden_memory_ids": [],
1754
+ "difficulty": "medium",
1755
+ "architecture_bias_risk": "low",
1756
+ "fairness_note": "Reworded from the source procedural record's wording."
1757
+ },
1758
+ {
1759
+ "question_id": "qA-ed-03",
1760
+ "category": "temporal_update",
1761
+ "question": "When is Aisha's PMP exam currently scheduled?",
1762
+ "expected_answer": "April 20 — moved from the original March 15 date.",
1763
+ "acceptable_answer_criteria": [
1764
+ "states April 20",
1765
+ "notes the change from March 15"
1766
+ ],
1767
+ "required_memory_ids": [
1768
+ "memA-ed-006",
1769
+ "memA-ed-002"
1770
+ ],
1771
+ "forbidden_memory_ids": [],
1772
+ "difficulty": "medium",
1773
+ "architecture_bias_risk": "medium",
1774
+ "fairness_note": "Requires surfacing the rescheduled date over the originally stated one."
1775
+ },
1776
+ {
1777
+ "question_id": "qA-ed-04",
1778
+ "category": "multi_hop_association",
1779
+ "question": "How did Aisha discover she needed to push back her exam date?",
1780
+ "expected_answer": "Her study group recommended the PrepMaster question bank; using it revealed she was weak on risk-management, which is why she rescheduled.",
1781
+ "acceptable_answer_criteria": [
1782
+ "mentions the study group",
1783
+ "mentions PrepMaster",
1784
+ "mentions the risk-management weakness",
1785
+ "connects it to the rescheduling"
1786
+ ],
1787
+ "required_memory_ids": [
1788
+ "memA-ed-003",
1789
+ "memA-ed-004",
1790
+ "memA-ed-005",
1791
+ "memA-ed-006"
1792
+ ],
1793
+ "forbidden_memory_ids": [],
1794
+ "difficulty": "hard",
1795
+ "architecture_bias_risk": "high",
1796
+ "fairness_note": "Four-record causal chain; answering well requires connecting study-group -> resource -> discovered weakness -> reschedule, which stresses any system's ability to chain related facts."
1797
+ },
1798
+ {
1799
+ "question_id": "qA-ed-05",
1800
+ "category": "abstention",
1801
+ "question": "Did Aisha pass her PMP exam?",
1802
+ "expected_answer": "Not enough information — there's no record of her having taken the exam or received a result yet.",
1803
+ "acceptable_answer_criteria": [
1804
+ "states that the outcome is unknown/not recorded",
1805
+ "does not fabricate a pass or fail result"
1806
+ ],
1807
+ "required_memory_ids": [],
1808
+ "forbidden_memory_ids": [
1809
+ "memA-ed-009"
1810
+ ],
1811
+ "difficulty": "medium",
1812
+ "architecture_bias_risk": "low",
1813
+ "fairness_note": "Tests whether the system resists treating 'felt more confident' as proof of passing; correct behavior is to abstain rather than guess."
1814
+ }
1815
+ ]
1816
+ },
1817
+ {
1818
+ "conversation_id": "convA-creative-work-01",
1819
+ "agent_id": "agent-writing",
1820
+ "domain": "creative_work",
1821
+ "memory_records": [
1822
+ {
1823
+ "id": "memA-cw-001",
1824
+ "type": "episodic",
1825
+ "timestamp": "2025-10-01T09:00:00Z",
1826
+ "content": "Leo started drafting a fantasy novel originally titled 'The Glass Throne.'",
1827
+ "tags": [
1828
+ "novel",
1829
+ "title"
1830
+ ],
1831
+ "importance": 0.65,
1832
+ "metadata": {
1833
+ "source_turn": "turn-001",
1834
+ "speaker": "user"
1835
+ },
1836
+ "associations": []
1837
+ },
1838
+ {
1839
+ "id": "memA-cw-002",
1840
+ "type": "semantic",
1841
+ "timestamp": "2025-10-10T10:00:00Z",
1842
+ "content": "Leo's protagonist was originally named Kestrel, a runaway blacksmith's apprentice.",
1843
+ "tags": [
1844
+ "novel",
1845
+ "character"
1846
+ ],
1847
+ "importance": 0.6,
1848
+ "metadata": {
1849
+ "source_turn": "turn-002",
1850
+ "speaker": "user"
1851
+ },
1852
+ "associations": []
1853
+ },
1854
+ {
1855
+ "id": "memA-cw-003",
1856
+ "type": "episodic",
1857
+ "timestamp": "2025-12-01T11:00:00Z",
1858
+ "content": "Leo planted an early detail that the protagonist's missing mother left behind a strange iron key, intending it to matter later.",
1859
+ "tags": [
1860
+ "novel",
1861
+ "plot-seed"
1862
+ ],
1863
+ "importance": 0.6,
1864
+ "metadata": {
1865
+ "source_turn": "turn-003",
1866
+ "speaker": "user"
1867
+ },
1868
+ "associations": []
1869
+ },
1870
+ {
1871
+ "id": "memA-cw-004",
1872
+ "type": "episodic",
1873
+ "timestamp": "2026-01-15T12:00:00Z",
1874
+ "content": "Leo renamed the protagonist from Kestrel to Wren partway through drafting because Kestrel sounded too similar to another character's name.",
1875
+ "tags": [
1876
+ "novel",
1877
+ "character"
1878
+ ],
1879
+ "importance": 0.6,
1880
+ "metadata": {
1881
+ "source_turn": "turn-004",
1882
+ "speaker": "user"
1883
+ },
1884
+ "associations": [
1885
+ {
1886
+ "target_id": "memA-cw-002",
1887
+ "strength": 0.85,
1888
+ "reason": "renames the original protagonist"
1889
+ }
1890
+ ]
1891
+ },
1892
+ {
1893
+ "id": "memA-cw-005",
1894
+ "type": "episodic",
1895
+ "timestamp": "2026-02-10T13:00:00Z",
1896
+ "content": "Leo sketched out an unrelated short story idea about a lighthouse keeper, just for fun between novel-writing sessions.",
1897
+ "tags": [
1898
+ "side-project"
1899
+ ],
1900
+ "importance": 0.2,
1901
+ "metadata": {
1902
+ "source_turn": "turn-005",
1903
+ "speaker": "user"
1904
+ },
1905
+ "associations": []
1906
+ },
1907
+ {
1908
+ "id": "memA-cw-006",
1909
+ "type": "episodic",
1910
+ "timestamp": "2026-03-20T14:00:00Z",
1911
+ "content": "In the midpoint twist, the iron key from the protagonist's mother unlocks a hidden chamber beneath the blacksmith's forge where the novel's villain was once imprisoned.",
1912
+ "tags": [
1913
+ "novel",
1914
+ "plot-payoff"
1915
+ ],
1916
+ "importance": 0.75,
1917
+ "metadata": {
1918
+ "source_turn": "turn-006",
1919
+ "speaker": "user"
1920
+ },
1921
+ "associations": [
1922
+ {
1923
+ "target_id": "memA-cw-003",
1924
+ "strength": 0.85,
1925
+ "reason": "pays off the earlier planted detail"
1926
+ }
1927
+ ]
1928
+ },
1929
+ {
1930
+ "id": "memA-cw-007",
1931
+ "type": "procedural",
1932
+ "timestamp": "2026-04-01T09:00:00Z",
1933
+ "content": "Leo's revision process is to finish a full draft first, then do a structural pass focused only on plot, then a final pass focused on prose style.",
1934
+ "tags": [
1935
+ "novel",
1936
+ "workflow"
1937
+ ],
1938
+ "importance": 0.5,
1939
+ "metadata": {
1940
+ "source_turn": "turn-007",
1941
+ "speaker": "user"
1942
+ },
1943
+ "associations": []
1944
+ },
1945
+ {
1946
+ "id": "memA-cw-008",
1947
+ "type": "episodic",
1948
+ "timestamp": "2026-05-01T10:00:00Z",
1949
+ "content": "Leo changed the novel's title from 'The Glass Throne' to 'The Forge Beneath' because the original no longer fit the story.",
1950
+ "tags": [
1951
+ "novel",
1952
+ "title"
1953
+ ],
1954
+ "importance": 0.65,
1955
+ "metadata": {
1956
+ "source_turn": "turn-008",
1957
+ "speaker": "user"
1958
+ },
1959
+ "associations": [
1960
+ {
1961
+ "target_id": "memA-cw-001",
1962
+ "strength": 0.85,
1963
+ "reason": "renames the originally chosen title"
1964
+ }
1965
+ ]
1966
+ },
1967
+ {
1968
+ "id": "memA-cw-009",
1969
+ "type": "episodic",
1970
+ "timestamp": "2026-06-05T11:00:00Z",
1971
+ "content": "Leo finished his structural revision pass and started the prose-style pass, following his usual process.",
1972
+ "tags": [
1973
+ "novel",
1974
+ "progress"
1975
+ ],
1976
+ "importance": 0.55,
1977
+ "metadata": {
1978
+ "source_turn": "turn-009",
1979
+ "speaker": "user"
1980
+ },
1981
+ "associations": [
1982
+ {
1983
+ "target_id": "memA-cw-007",
1984
+ "strength": 0.8,
1985
+ "reason": "following the established revision process"
1986
+ }
1987
+ ]
1988
+ }
1989
+ ],
1990
+ "questions": [
1991
+ {
1992
+ "question_id": "qA-cw-01",
1993
+ "category": "atomic_fact_recall",
1994
+ "question": "Who left behind the iron key in Leo's novel?",
1995
+ "expected_answer": "The protagonist's missing mother",
1996
+ "acceptable_answer_criteria": [
1997
+ "identifies the mother"
1998
+ ],
1999
+ "required_memory_ids": [
2000
+ "memA-cw-003"
2001
+ ],
2002
+ "forbidden_memory_ids": [],
2003
+ "difficulty": "easy",
2004
+ "architecture_bias_risk": "low",
2005
+ "fairness_note": "Single explicit fact."
2006
+ },
2007
+ {
2008
+ "question_id": "qA-cw-02",
2009
+ "category": "paraphrased_semantic_recall",
2010
+ "question": "What's Leo's general approach to revising a finished draft?",
2011
+ "expected_answer": "Finish the full draft first, then a structural/plot-focused pass, then a final pass focused on prose style.",
2012
+ "acceptable_answer_criteria": [
2013
+ "mentions the three-stage order: draft, plot pass, prose pass"
2014
+ ],
2015
+ "required_memory_ids": [
2016
+ "memA-cw-007"
2017
+ ],
2018
+ "forbidden_memory_ids": [],
2019
+ "difficulty": "medium",
2020
+ "architecture_bias_risk": "low",
2021
+ "fairness_note": "Rewords the procedural memory without echoing its exact phrasing."
2022
+ },
2023
+ {
2024
+ "question_id": "qA-cw-03",
2025
+ "category": "temporal_update",
2026
+ "question": "What is the novel currently titled?",
2027
+ "expected_answer": "'The Forge Beneath' — changed from the original title 'The Glass Throne.'",
2028
+ "acceptable_answer_criteria": [
2029
+ "gives the current title",
2030
+ "notes the change from the old title"
2031
+ ],
2032
+ "required_memory_ids": [
2033
+ "memA-cw-008",
2034
+ "memA-cw-001"
2035
+ ],
2036
+ "forbidden_memory_ids": [],
2037
+ "difficulty": "medium",
2038
+ "architecture_bias_risk": "medium",
2039
+ "fairness_note": "Requires surfacing the renamed title over the originally stated one."
2040
+ },
2041
+ {
2042
+ "question_id": "qA-cw-04",
2043
+ "category": "multi_hop_association",
2044
+ "question": "What turns out to be the significance of the iron key the protagonist's mother left behind?",
2045
+ "expected_answer": "It unlocks a hidden chamber beneath the blacksmith's forge where the novel's villain was once imprisoned — the payoff of an early planted detail.",
2046
+ "acceptable_answer_criteria": [
2047
+ "connects the key to the hidden chamber",
2048
+ "mentions the villain's imprisonment"
2049
+ ],
2050
+ "required_memory_ids": [
2051
+ "memA-cw-003",
2052
+ "memA-cw-006"
2053
+ ],
2054
+ "forbidden_memory_ids": [],
2055
+ "difficulty": "hard",
2056
+ "architecture_bias_risk": "high",
2057
+ "fairness_note": "Requires linking a plot seed planted months earlier to its payoff in a separate record; a flat similarity search on 'iron key' alone might surface only the planting record."
2058
+ },
2059
+ {
2060
+ "question_id": "qA-cw-05",
2061
+ "category": "abstention",
2062
+ "question": "Has Leo found a publisher for the novel?",
2063
+ "expected_answer": "Not enough information — no record discusses publishing, submission, or a publisher.",
2064
+ "acceptable_answer_criteria": [
2065
+ "states the information is not available",
2066
+ "does not invent a publisher or deal"
2067
+ ],
2068
+ "required_memory_ids": [],
2069
+ "forbidden_memory_ids": [],
2070
+ "difficulty": "medium",
2071
+ "architecture_bias_risk": "low",
2072
+ "fairness_note": "No memory addresses publishing status at all, so the correct behavior is abstention rather than fabrication."
2073
+ }
2074
+ ]
2075
+ }
2076
+ ]
2077
+ }