@1mbrain/benchmarks 0.1.1

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (69) hide show
  1. package/README.md +85 -0
  2. package/fixtures/1mbrain-focused-mini/1mbrain-focused-mini.json +928 -0
  3. package/fixtures/1mbrain-focused-mini/README.md +45 -0
  4. package/fixtures/adversarial-memory/dataset_claude_adversarial.json +3333 -0
  5. package/fixtures/adversarial-memory/dataset_gemini_adversarial_memory.json +2984 -0
  6. package/fixtures/balanced-mini/dataset_claude_balanced_mini.json +2077 -0
  7. package/fixtures/balanced-mini/dataset_gemini_balanced_mini.json +1995 -0
  8. package/fixtures/generate_datasets.js +1741 -0
  9. package/fixtures/graph-stress-hard/README.md +43 -0
  10. package/fixtures/graph-stress-hard/dataset_graph_stress_hard.json +4374 -0
  11. package/fixtures/graph-stress-hard/generate_graph_stress_hard.js +526 -0
  12. package/fixtures/realistic-medium/dataset_claude_realistic_medium.json +7462 -0
  13. package/fixtures/realistic-medium/dataset_gemini_realistic_medium.json +7277 -0
  14. package/fixtures/realistic-medium/gen_claude_medium.js +600 -0
  15. package/package.json +22 -0
  16. package/reports/benchmark_report.md +48 -0
  17. package/reports/benchmark_report_claude_adversarial.md +42 -0
  18. package/reports/benchmark_report_claude_adversarial_adaptive.md +42 -0
  19. package/reports/benchmark_report_claude_adversarial_adaptive2_fast.md +42 -0
  20. package/reports/benchmark_report_claude_adversarial_adaptive_fast.md +42 -0
  21. package/reports/benchmark_report_claude_adversarial_rerank.md +42 -0
  22. package/reports/benchmark_report_claude_balanced_mini.md +42 -0
  23. package/reports/benchmark_report_claude_balanced_mini_adaptive.md +42 -0
  24. package/reports/benchmark_report_claude_balanced_mini_adaptive2_fast.md +42 -0
  25. package/reports/benchmark_report_claude_balanced_mini_adaptive_fast.md +42 -0
  26. package/reports/benchmark_report_claude_balanced_mini_rerank.md +42 -0
  27. package/reports/benchmark_report_claude_realistic_medium.md +42 -0
  28. package/reports/benchmark_report_claude_realistic_medium_adaptive.md +42 -0
  29. package/reports/benchmark_report_claude_realistic_medium_adaptive2_fast.md +42 -0
  30. package/reports/benchmark_report_claude_realistic_medium_adaptive_fast.md +42 -0
  31. package/reports/benchmark_report_claude_realistic_medium_evidence_rerank_local.md +42 -0
  32. package/reports/benchmark_report_claude_realistic_medium_openai_evidence_rerank.md +41 -0
  33. package/reports/benchmark_report_claude_realistic_medium_openai_multi_signal.md +41 -0
  34. package/reports/benchmark_report_claude_realistic_medium_openai_multi_signal_scoped.md +41 -0
  35. package/reports/benchmark_report_claude_realistic_medium_openai_phase8_no_judge.md +42 -0
  36. package/reports/benchmark_report_claude_realistic_medium_openai_rankingpolicy.md +41 -0
  37. package/reports/benchmark_report_claude_realistic_medium_openai_stale_filter.md +41 -0
  38. package/reports/benchmark_report_claude_realistic_medium_openai_stale_filter_absence_fix.md +41 -0
  39. package/reports/benchmark_report_claude_realistic_medium_openai_write_time_invalidation.md +41 -0
  40. package/reports/benchmark_report_claude_realistic_medium_rerank.md +42 -0
  41. package/reports/benchmark_report_claude_realistic_medium_stale_filter_local.md +42 -0
  42. package/reports/benchmark_report_graph_stress_hard.md +42 -0
  43. package/reports/benchmark_report_graph_stress_hard_absence_fix.md +42 -0
  44. package/reports/benchmark_report_graph_stress_hard_adaptive.md +42 -0
  45. package/reports/benchmark_report_graph_stress_hard_evidence_rerank.md +42 -0
  46. package/reports/benchmark_report_graph_stress_hard_multi_signal_current_guardrail.md +42 -0
  47. package/reports/benchmark_report_graph_stress_hard_multi_signal_guardrail_fixed.md +42 -0
  48. package/reports/benchmark_report_graph_stress_hard_multi_signal_local.md +42 -0
  49. package/reports/benchmark_report_graph_stress_hard_multi_signal_scoped_guardrail.md +42 -0
  50. package/reports/benchmark_report_graph_stress_hard_multi_signal_vector_pure_guardrail.md +42 -0
  51. package/reports/benchmark_report_graph_stress_hard_phase8_sdk_guardrail.md +42 -0
  52. package/reports/benchmark_report_graph_stress_hard_rerank.md +42 -0
  53. package/reports/benchmark_report_graph_stress_hard_stale_filter.md +42 -0
  54. package/reports/benchmark_report_graph_stress_hard_write_time_invalidation.md +42 -0
  55. package/results/.gitignore +2 -0
  56. package/src/adapters/1mbrain.ts +317 -0
  57. package/src/adapters/keyword-embedding.ts +48 -0
  58. package/src/adapters/mem0.ts +124 -0
  59. package/src/adapters/qdrant.ts +214 -0
  60. package/src/adapters/unavailable.ts +49 -0
  61. package/src/adapters/vector-baseline.ts +149 -0
  62. package/src/datasets/focused-mini.ts +158 -0
  63. package/src/datasets/synthetic-agent-memory.ts +532 -0
  64. package/src/llm-evaluator.ts +262 -0
  65. package/src/metrics.ts +482 -0
  66. package/src/provider.ts +151 -0
  67. package/src/runner.ts +635 -0
  68. package/tsconfig.json +10 -0
  69. package/tsconfig.tsbuildinfo +1 -0
@@ -0,0 +1,3333 @@
1
+ {
2
+ "name": "memory-bench-adversarial",
3
+ "description": "Dataset C (Adversarial Memory): a 10-conversation, provider-neutral benchmark stress-testing stale facts, multi-step contradictions, near-duplicate-name distractors, ambiguity, and abstention, while keeping the same category distribution as Datasets A and B.",
4
+ "generated_at": "2026-06-19",
5
+ "fairness_notes": [
6
+ "Adversarial difficulty comes from the conversation content itself (multi-step contradiction chains, near-duplicate names like Sam/Sami, Adam/Adnan, Alpha/Beta, and corrected misconceptions vs. real-world changes) rather than from skewing the category distribution toward any one architecture's strength.",
7
+ "Multi-hop questions remain capped at 10% (6 of 60), the same ratio as Datasets A and B, so this harder dataset does not become a de facto graph-memory benchmark.",
8
+ "Contradiction-resolution questions intentionally mix two distinct patterns -- a real-world fact that changed (e.g. a moved schedule) and a person's belief that was simply corrected (e.g. a misunderstood renewal date or scoring breakdown) -- so the dataset doesn't only reward systems that track timestamps; it also rewards systems that track which statement is the final, corrected one.",
9
+ "Near-duplicate-name distractors (Sam/Sami, Adam/Adnan, Alpha/Beta) are designed to penalize over-eager similarity matching equally regardless of whether the underlying index is a vector store or a graph store -- both must disambiguate by content, not just by string or embedding proximity.",
10
+ "Abstention questions are split between true information gaps (no record addresses the topic) and a stated-but-unresolved status (e.g. 'hasn't decided yet'), so abstention isn't trivially solvable by checking for the mere absence of a keyword match."
11
+ ],
12
+ "conversations": [
13
+ {
14
+ "conversation_id": "convC-software-01",
15
+ "agent_id": "agent-dev-assist",
16
+ "domain": "software",
17
+ "memory_records": [
18
+ {
19
+ "id": "memC-sw-001",
20
+ "type": "semantic",
21
+ "timestamp": "2025-10-01T09:00:00Z",
22
+ "content": "Nadia is building Ticketly, a bug-tracking tool for small software teams, with a Next.js frontend and a Postgres backend.",
23
+ "tags": [
24
+ "ticketly",
25
+ "tech-stack"
26
+ ],
27
+ "importance": 0.7,
28
+ "metadata": {
29
+ "source_turn": "turn-001",
30
+ "speaker": "user"
31
+ },
32
+ "associations": []
33
+ },
34
+ {
35
+ "id": "memC-sw-002",
36
+ "type": "semantic",
37
+ "timestamp": "2025-10-05T10:00:00Z",
38
+ "content": "Ticketly's pricing model started out free for all teams under 5 users.",
39
+ "tags": [
40
+ "ticketly",
41
+ "pricing"
42
+ ],
43
+ "importance": 0.5,
44
+ "metadata": {
45
+ "source_turn": "turn-002",
46
+ "speaker": "user"
47
+ },
48
+ "associations": []
49
+ },
50
+ {
51
+ "id": "memC-sw-003",
52
+ "type": "episodic",
53
+ "timestamp": "2026-01-05T11:00:00Z",
54
+ "content": "After three months, Nadia switched Ticketly to a paid-only model at $9/user/month because the free tier wasn't converting to revenue.",
55
+ "tags": [
56
+ "ticketly",
57
+ "pricing"
58
+ ],
59
+ "importance": 0.6,
60
+ "metadata": {
61
+ "source_turn": "turn-003",
62
+ "speaker": "user"
63
+ },
64
+ "associations": [
65
+ {
66
+ "target_id": "memC-sw-002",
67
+ "strength": 0.8,
68
+ "reason": "replaces the original free pricing"
69
+ }
70
+ ]
71
+ },
72
+ {
73
+ "id": "memC-sw-004",
74
+ "type": "episodic",
75
+ "timestamp": "2026-02-16T12:00:00Z",
76
+ "content": "Six weeks later, Nadia reverted Ticketly to a freemium model: free for teams under 3 users, $9/user/month above that.",
77
+ "tags": [
78
+ "ticketly",
79
+ "pricing"
80
+ ],
81
+ "importance": 0.75,
82
+ "metadata": {
83
+ "source_turn": "turn-004",
84
+ "speaker": "user"
85
+ },
86
+ "associations": [
87
+ {
88
+ "target_id": "memC-sw-003",
89
+ "strength": 0.85,
90
+ "reason": "replaces the brief paid-only period"
91
+ }
92
+ ]
93
+ },
94
+ {
95
+ "id": "memC-sw-005",
96
+ "type": "episodic",
97
+ "timestamp": "2026-02-20T13:00:00Z",
98
+ "content": "Nadia hired a contractor named Sam to handle customer support tickets.",
99
+ "tags": [
100
+ "ticketly",
101
+ "team"
102
+ ],
103
+ "importance": 0.4,
104
+ "metadata": {
105
+ "source_turn": "turn-005",
106
+ "speaker": "user"
107
+ },
108
+ "associations": []
109
+ },
110
+ {
111
+ "id": "memC-sw-006",
112
+ "type": "episodic",
113
+ "timestamp": "2026-02-25T14:00:00Z",
114
+ "content": "Nadia also brought on a part-time designer named Sami to redesign Ticketly's dashboard.",
115
+ "tags": [
116
+ "ticketly",
117
+ "team"
118
+ ],
119
+ "importance": 0.4,
120
+ "metadata": {
121
+ "source_turn": "turn-006",
122
+ "speaker": "user"
123
+ },
124
+ "associations": []
125
+ },
126
+ {
127
+ "id": "memC-sw-007",
128
+ "type": "procedural",
129
+ "timestamp": "2026-03-01T09:00:00Z",
130
+ "content": "Nadia's release process is: merge to main, run the CI test suite, then deploy to staging for 24 hours before pushing to production.",
131
+ "tags": [
132
+ "ticketly",
133
+ "workflow"
134
+ ],
135
+ "importance": 0.55,
136
+ "metadata": {
137
+ "source_turn": "turn-007",
138
+ "speaker": "user"
139
+ },
140
+ "associations": []
141
+ },
142
+ {
143
+ "id": "memC-sw-008",
144
+ "type": "episodic",
145
+ "timestamp": "2026-03-05T10:00:00Z",
146
+ "content": "Nadia mentioned a different past project, a Chrome extension called 'Tabsy,' which she sold two years ago and no longer maintains.",
147
+ "tags": [
148
+ "tabsy",
149
+ "past-project"
150
+ ],
151
+ "importance": 0.2,
152
+ "metadata": {
153
+ "source_turn": "turn-008",
154
+ "speaker": "user"
155
+ },
156
+ "associations": []
157
+ },
158
+ {
159
+ "id": "memC-sw-009",
160
+ "type": "semantic",
161
+ "timestamp": "2026-03-20T11:00:00Z",
162
+ "content": "Ticketly currently supports integrations with GitHub and Slack, but not Jira yet.",
163
+ "tags": [
164
+ "ticketly",
165
+ "integrations"
166
+ ],
167
+ "importance": 0.6,
168
+ "metadata": {
169
+ "source_turn": "turn-009",
170
+ "speaker": "user"
171
+ },
172
+ "associations": []
173
+ },
174
+ {
175
+ "id": "memC-sw-010",
176
+ "type": "episodic",
177
+ "timestamp": "2026-04-02T12:00:00Z",
178
+ "content": "A beta user reported that Ticketly's email notifications were arriving up to 20 minutes late.",
179
+ "tags": [
180
+ "ticketly",
181
+ "bug"
182
+ ],
183
+ "importance": 0.5,
184
+ "metadata": {
185
+ "source_turn": "turn-010",
186
+ "speaker": "user"
187
+ },
188
+ "associations": []
189
+ },
190
+ {
191
+ "id": "memC-sw-011",
192
+ "type": "episodic",
193
+ "timestamp": "2026-04-03T13:00:00Z",
194
+ "content": "Investigating the delay, Nadia found the cause was a misconfigured queue worker, not the email provider itself.",
195
+ "tags": [
196
+ "ticketly",
197
+ "bug"
198
+ ],
199
+ "importance": 0.55,
200
+ "metadata": {
201
+ "source_turn": "turn-011",
202
+ "speaker": "user"
203
+ },
204
+ "associations": [
205
+ {
206
+ "target_id": "memC-sw-010",
207
+ "strength": 0.85,
208
+ "reason": "diagnosis of the reported delay"
209
+ }
210
+ ]
211
+ },
212
+ {
213
+ "id": "memC-sw-012",
214
+ "type": "episodic",
215
+ "timestamp": "2026-04-05T14:00:00Z",
216
+ "content": "Nadia fixed the queue worker configuration, bringing notification delays down to under 10 seconds.",
217
+ "tags": [
218
+ "ticketly",
219
+ "bug"
220
+ ],
221
+ "importance": 0.55,
222
+ "metadata": {
223
+ "source_turn": "turn-012",
224
+ "speaker": "user"
225
+ },
226
+ "associations": [
227
+ {
228
+ "target_id": "memC-sw-011",
229
+ "strength": 0.85,
230
+ "reason": "resolves the diagnosed cause"
231
+ }
232
+ ]
233
+ }
234
+ ],
235
+ "questions": [
236
+ {
237
+ "question_id": "qC-sw-01",
238
+ "category": "atomic_fact_recall",
239
+ "question": "What backend database does Ticketly use?",
240
+ "expected_answer": "Postgres",
241
+ "acceptable_answer_criteria": [
242
+ "names Postgres"
243
+ ],
244
+ "required_memory_ids": [
245
+ "memC-sw-001"
246
+ ],
247
+ "forbidden_memory_ids": [],
248
+ "difficulty": "easy",
249
+ "architecture_bias_risk": "low",
250
+ "fairness_note": "Single stated fact in one record."
251
+ },
252
+ {
253
+ "question_id": "qC-sw-02",
254
+ "category": "atomic_fact_recall",
255
+ "question": "Which two integrations does Ticketly currently support?",
256
+ "expected_answer": "GitHub and Slack (not Jira yet)",
257
+ "acceptable_answer_criteria": [
258
+ "names GitHub and Slack"
259
+ ],
260
+ "required_memory_ids": [
261
+ "memC-sw-009"
262
+ ],
263
+ "forbidden_memory_ids": [],
264
+ "difficulty": "easy",
265
+ "architecture_bias_risk": "low",
266
+ "fairness_note": "Directly stated; no architecture-specific advantage either way."
267
+ },
268
+ {
269
+ "question_id": "qC-sw-03",
270
+ "category": "paraphrased_semantic_recall",
271
+ "question": "What does Nadia do before a release goes live?",
272
+ "expected_answer": "Merge to main, run the CI suite, then sit on staging for 24 hours before production.",
273
+ "acceptable_answer_criteria": [
274
+ "mentions CI",
275
+ "mentions the 24-hour staging step"
276
+ ],
277
+ "required_memory_ids": [
278
+ "memC-sw-007"
279
+ ],
280
+ "forbidden_memory_ids": [],
281
+ "difficulty": "medium",
282
+ "architecture_bias_risk": "low",
283
+ "fairness_note": "Reworded from the procedural memory's exact phrasing."
284
+ },
285
+ {
286
+ "question_id": "qC-sw-04",
287
+ "category": "temporal_update",
288
+ "question": "What is Ticketly's current pricing model?",
289
+ "expected_answer": "Freemium: free for teams under 3 users, $9/user/month above that.",
290
+ "acceptable_answer_criteria": [
291
+ "states the freemium structure",
292
+ "gives the 3-user threshold"
293
+ ],
294
+ "required_memory_ids": [
295
+ "memC-sw-004"
296
+ ],
297
+ "forbidden_memory_ids": [],
298
+ "difficulty": "medium",
299
+ "architecture_bias_risk": "medium",
300
+ "fairness_note": "Pricing changed twice; the system must surface the most recent state, not either earlier one."
301
+ },
302
+ {
303
+ "question_id": "qC-sw-05",
304
+ "category": "contradiction_resolution",
305
+ "question": "Is Ticketly currently free for all teams under 5 users?",
306
+ "expected_answer": "No. That was the original pricing; it has since changed twice and is now free only for teams under 3 users, with paid tiers above that.",
307
+ "acceptable_answer_criteria": [
308
+ "says no",
309
+ "gives the current 3-user freemium threshold"
310
+ ],
311
+ "required_memory_ids": [
312
+ "memC-sw-004"
313
+ ],
314
+ "forbidden_memory_ids": [
315
+ "memC-sw-002"
316
+ ],
317
+ "difficulty": "medium",
318
+ "architecture_bias_risk": "medium",
319
+ "fairness_note": "Tests resistance to a stale fact that was true at one point but has been superseded twice since."
320
+ },
321
+ {
322
+ "question_id": "qC-sw-06",
323
+ "category": "noise_resistance",
324
+ "question": "Did Sam, the customer-support contractor, redesign Ticketly's dashboard?",
325
+ "expected_answer": "No, that was Sami, the part-time designer -- a different person with a similar-sounding name.",
326
+ "acceptable_answer_criteria": [
327
+ "distinguishes Sam from Sami",
328
+ "correctly attributes the dashboard redesign to Sami"
329
+ ],
330
+ "required_memory_ids": [
331
+ "memC-sw-005",
332
+ "memC-sw-006"
333
+ ],
334
+ "forbidden_memory_ids": [],
335
+ "difficulty": "hard",
336
+ "architecture_bias_risk": "medium",
337
+ "fairness_note": "Near-duplicate names (Sam/Sami) with different roles are a deliberate adversarial trap for similarity-based retrieval that doesn't disambiguate carefully."
338
+ }
339
+ ]
340
+ },
341
+ {
342
+ "conversation_id": "convC-personal-assistant-01",
343
+ "agent_id": "agent-pa",
344
+ "domain": "personal_assistant",
345
+ "memory_records": [
346
+ {
347
+ "id": "memC-pa-001",
348
+ "type": "semantic",
349
+ "timestamp": "2025-09-01T08:00:00Z",
350
+ "content": "Soren is based in Berlin but travels frequently for work, often to Sao Paulo and Singapore.",
351
+ "tags": [
352
+ "soren",
353
+ "travel"
354
+ ],
355
+ "importance": 0.5,
356
+ "metadata": {
357
+ "source_turn": "turn-001",
358
+ "speaker": "user"
359
+ },
360
+ "associations": []
361
+ },
362
+ {
363
+ "id": "memC-pa-002",
364
+ "type": "semantic",
365
+ "timestamp": "2025-09-02T09:00:00Z",
366
+ "content": "Soren prefers all meeting times communicated in his local time zone, not UTC.",
367
+ "tags": [
368
+ "soren",
369
+ "preference"
370
+ ],
371
+ "importance": 0.5,
372
+ "metadata": {
373
+ "source_turn": "turn-002",
374
+ "speaker": "user"
375
+ },
376
+ "associations": []
377
+ },
378
+ {
379
+ "id": "memC-pa-003",
380
+ "type": "episodic",
381
+ "timestamp": "2025-09-10T10:00:00Z",
382
+ "content": "Soren set up a recurring client call every Wednesday at 4pm Berlin time.",
383
+ "tags": [
384
+ "soren",
385
+ "schedule"
386
+ ],
387
+ "importance": 0.6,
388
+ "metadata": {
389
+ "source_turn": "turn-003",
390
+ "speaker": "user"
391
+ },
392
+ "associations": []
393
+ },
394
+ {
395
+ "id": "memC-pa-004",
396
+ "type": "episodic",
397
+ "timestamp": "2026-03-03T11:00:00Z",
398
+ "content": "While traveling to Sao Paulo in March, Soren asked to shift that Wednesday call to 11am Sao Paulo time for the duration of the trip.",
399
+ "tags": [
400
+ "soren",
401
+ "schedule"
402
+ ],
403
+ "importance": 0.55,
404
+ "metadata": {
405
+ "source_turn": "turn-004",
406
+ "speaker": "user"
407
+ },
408
+ "associations": [
409
+ {
410
+ "target_id": "memC-pa-003",
411
+ "strength": 0.7,
412
+ "reason": "temporary override while traveling"
413
+ }
414
+ ]
415
+ },
416
+ {
417
+ "id": "memC-pa-005",
418
+ "type": "episodic",
419
+ "timestamp": "2026-04-01T12:00:00Z",
420
+ "content": "Soren returned to Berlin in April and the Wednesday call reverted to 4pm Berlin time.",
421
+ "tags": [
422
+ "soren",
423
+ "schedule"
424
+ ],
425
+ "importance": 0.55,
426
+ "metadata": {
427
+ "source_turn": "turn-005",
428
+ "speaker": "user"
429
+ },
430
+ "associations": [
431
+ {
432
+ "target_id": "memC-pa-004",
433
+ "strength": 0.7,
434
+ "reason": "reverts the temporary travel override"
435
+ }
436
+ ]
437
+ },
438
+ {
439
+ "id": "memC-pa-006",
440
+ "type": "episodic",
441
+ "timestamp": "2026-04-05T13:00:00Z",
442
+ "content": "Soren mentioned his colleague Lena also has a recurring Wednesday call, but hers is at 2pm and unrelated to his.",
443
+ "tags": [
444
+ "lena",
445
+ "distractor"
446
+ ],
447
+ "importance": 0.2,
448
+ "metadata": {
449
+ "source_turn": "turn-006",
450
+ "speaker": "user"
451
+ },
452
+ "associations": []
453
+ },
454
+ {
455
+ "id": "memC-pa-007",
456
+ "type": "semantic",
457
+ "timestamp": "2025-09-15T08:00:00Z",
458
+ "content": "Soren is allergic to penicillin, noted for any travel medical forms.",
459
+ "tags": [
460
+ "soren",
461
+ "health"
462
+ ],
463
+ "importance": 0.6,
464
+ "metadata": {
465
+ "source_turn": "turn-007",
466
+ "speaker": "user"
467
+ },
468
+ "associations": []
469
+ },
470
+ {
471
+ "id": "memC-pa-008",
472
+ "type": "procedural",
473
+ "timestamp": "2025-09-20T09:00:00Z",
474
+ "content": "When booking Soren's flights, the assistant should always default to aisle seats and avoid red-eye flights longer than 6 hours.",
475
+ "tags": [
476
+ "soren",
477
+ "workflow"
478
+ ],
479
+ "importance": 0.5,
480
+ "metadata": {
481
+ "source_turn": "turn-008",
482
+ "speaker": "user"
483
+ },
484
+ "associations": []
485
+ },
486
+ {
487
+ "id": "memC-pa-009",
488
+ "type": "episodic",
489
+ "timestamp": "2026-05-06T10:00:00Z",
490
+ "content": "In May, Soren's recurring Wednesday call was permanently moved to Thursdays at 10am Berlin time after the client requested the change.",
491
+ "tags": [
492
+ "soren",
493
+ "schedule"
494
+ ],
495
+ "importance": 0.7,
496
+ "metadata": {
497
+ "source_turn": "turn-009",
498
+ "speaker": "user"
499
+ },
500
+ "associations": [
501
+ {
502
+ "target_id": "memC-pa-005",
503
+ "strength": 0.8,
504
+ "reason": "permanent change replacing the Wednesday slot"
505
+ }
506
+ ]
507
+ },
508
+ {
509
+ "id": "memC-pa-010",
510
+ "type": "episodic",
511
+ "timestamp": "2026-06-10T11:00:00Z",
512
+ "content": "Soren asked for a one-time exception in June to push a Thursday call to Friday because of a public holiday.",
513
+ "tags": [
514
+ "soren",
515
+ "schedule"
516
+ ],
517
+ "importance": 0.4,
518
+ "metadata": {
519
+ "source_turn": "turn-010",
520
+ "speaker": "user"
521
+ },
522
+ "associations": []
523
+ },
524
+ {
525
+ "id": "memC-pa-011",
526
+ "type": "semantic",
527
+ "timestamp": "2025-09-25T08:00:00Z",
528
+ "content": "Soren's emergency contact is his partner, Mikael.",
529
+ "tags": [
530
+ "soren",
531
+ "contact"
532
+ ],
533
+ "importance": 0.5,
534
+ "metadata": {
535
+ "source_turn": "turn-011",
536
+ "speaker": "user"
537
+ },
538
+ "associations": []
539
+ },
540
+ {
541
+ "id": "memC-pa-012",
542
+ "type": "episodic",
543
+ "timestamp": "2026-06-15T12:00:00Z",
544
+ "content": "Soren confirmed that, aside from the one-time June exception, his recurring client call remains on Thursdays at 10am Berlin time.",
545
+ "tags": [
546
+ "soren",
547
+ "schedule"
548
+ ],
549
+ "importance": 0.7,
550
+ "metadata": {
551
+ "source_turn": "turn-012",
552
+ "speaker": "user"
553
+ },
554
+ "associations": [
555
+ {
556
+ "target_id": "memC-pa-009",
557
+ "strength": 0.85,
558
+ "reason": "confirms the current standing schedule"
559
+ }
560
+ ]
561
+ }
562
+ ],
563
+ "questions": [
564
+ {
565
+ "question_id": "qC-pa-01",
566
+ "category": "atomic_fact_recall",
567
+ "question": "What is Soren allergic to?",
568
+ "expected_answer": "Penicillin",
569
+ "acceptable_answer_criteria": [
570
+ "states penicillin"
571
+ ],
572
+ "required_memory_ids": [
573
+ "memC-pa-007"
574
+ ],
575
+ "forbidden_memory_ids": [],
576
+ "difficulty": "easy",
577
+ "architecture_bias_risk": "low",
578
+ "fairness_note": "Single stated fact."
579
+ },
580
+ {
581
+ "question_id": "qC-pa-02",
582
+ "category": "paraphrased_semantic_recall",
583
+ "question": "What's the assistant's rule of thumb when booking Soren's flights?",
584
+ "expected_answer": "Default to aisle seats and avoid red-eyes longer than 6 hours.",
585
+ "acceptable_answer_criteria": [
586
+ "mentions aisle seats",
587
+ "mentions the red-eye limit"
588
+ ],
589
+ "required_memory_ids": [
590
+ "memC-pa-008"
591
+ ],
592
+ "forbidden_memory_ids": [],
593
+ "difficulty": "medium",
594
+ "architecture_bias_risk": "low",
595
+ "fairness_note": "Reworded from the procedural memory's phrasing."
596
+ },
597
+ {
598
+ "question_id": "qC-pa-03",
599
+ "category": "temporal_update",
600
+ "question": "What day and time is Soren's recurring client call currently?",
601
+ "expected_answer": "Thursdays at 10am Berlin time -- moved from the original Wednesday 4pm slot in May.",
602
+ "acceptable_answer_criteria": [
603
+ "gives Thursday 10am",
604
+ "notes it changed from Wednesday"
605
+ ],
606
+ "required_memory_ids": [
607
+ "memC-pa-009",
608
+ "memC-pa-012"
609
+ ],
610
+ "forbidden_memory_ids": [],
611
+ "difficulty": "medium",
612
+ "architecture_bias_risk": "medium",
613
+ "fairness_note": "Schedule changed multiple times (temporary travel shift, then permanent move); requires surfacing the latest standing state."
614
+ },
615
+ {
616
+ "question_id": "qC-pa-04",
617
+ "category": "contradiction_resolution",
618
+ "question": "Is Soren's recurring client call still on Wednesdays?",
619
+ "expected_answer": "No -- it was permanently moved to Thursdays at 10am Berlin time in May. The earlier Sao Paulo time shift was only a temporary travel override, not the current state.",
620
+ "acceptable_answer_criteria": [
621
+ "says no",
622
+ "gives the current Thursday slot",
623
+ "doesn't get tripped up by the temporary Sao Paulo override"
624
+ ],
625
+ "required_memory_ids": [
626
+ "memC-pa-009"
627
+ ],
628
+ "forbidden_memory_ids": [
629
+ "memC-pa-003"
630
+ ],
631
+ "difficulty": "hard",
632
+ "architecture_bias_risk": "medium",
633
+ "fairness_note": "Three sequential changes to the same fact (temporary override, revert, permanent move) make this a harder recency test than a single before/after pair."
634
+ },
635
+ {
636
+ "question_id": "qC-pa-05",
637
+ "category": "noise_resistance",
638
+ "question": "Is Lena's Wednesday call the same one Soren reschedules when he travels?",
639
+ "expected_answer": "No -- Lena's Wednesday 2pm call is a separate, unrelated recurring meeting that belongs to a colleague, not Soren.",
640
+ "acceptable_answer_criteria": [
641
+ "identifies Lena's call as unrelated"
642
+ ],
643
+ "required_memory_ids": [
644
+ "memC-pa-006",
645
+ "memC-pa-003"
646
+ ],
647
+ "forbidden_memory_ids": [],
648
+ "difficulty": "medium",
649
+ "architecture_bias_risk": "medium",
650
+ "fairness_note": "Distractor shares the 'recurring Wednesday call' framing but belongs to a different person entirely."
651
+ },
652
+ {
653
+ "question_id": "qC-pa-06",
654
+ "category": "atomic_fact_recall",
655
+ "question": "Where is Soren based?",
656
+ "expected_answer": "Berlin",
657
+ "acceptable_answer_criteria": [
658
+ "states Berlin"
659
+ ],
660
+ "required_memory_ids": [
661
+ "memC-pa-001"
662
+ ],
663
+ "forbidden_memory_ids": [],
664
+ "difficulty": "easy",
665
+ "architecture_bias_risk": "low",
666
+ "fairness_note": "Single stated fact; a second, separate atomic-recall check distinct from the schedule/contradiction questions above."
667
+ }
668
+ ]
669
+ },
670
+ {
671
+ "conversation_id": "convC-research-01",
672
+ "agent_id": "agent-research",
673
+ "domain": "research",
674
+ "memory_records": [
675
+ {
676
+ "id": "memC-res-001",
677
+ "type": "semantic",
678
+ "timestamp": "2025-11-01T09:00:00Z",
679
+ "content": "Farah's research examines microplastic concentration in freshwater lakes near urban centers.",
680
+ "tags": [
681
+ "farah",
682
+ "topic"
683
+ ],
684
+ "importance": 0.6,
685
+ "metadata": {
686
+ "source_turn": "turn-001",
687
+ "speaker": "user"
688
+ },
689
+ "associations": []
690
+ },
691
+ {
692
+ "id": "memC-res-002",
693
+ "type": "episodic",
694
+ "timestamp": "2025-11-05T10:00:00Z",
695
+ "content": "Farah's collaborator, Dr. Lindqvist, shared a sampling protocol originally developed for ocean microplastics.",
696
+ "tags": [
697
+ "protocol"
698
+ ],
699
+ "importance": 0.5,
700
+ "metadata": {
701
+ "source_turn": "turn-002",
702
+ "speaker": "user"
703
+ },
704
+ "associations": []
705
+ },
706
+ {
707
+ "id": "memC-res-003",
708
+ "type": "episodic",
709
+ "timestamp": "2025-11-20T11:00:00Z",
710
+ "content": "Farah adapted Dr. Lindqvist's ocean protocol for freshwater by reducing the filter mesh size from 333 to 100 micrometers.",
711
+ "tags": [
712
+ "protocol"
713
+ ],
714
+ "importance": 0.7,
715
+ "metadata": {
716
+ "source_turn": "turn-003",
717
+ "speaker": "user"
718
+ },
719
+ "associations": [
720
+ {
721
+ "target_id": "memC-res-002",
722
+ "strength": 0.85,
723
+ "reason": "adapted from this original protocol"
724
+ }
725
+ ]
726
+ },
727
+ {
728
+ "id": "memC-res-004",
729
+ "type": "semantic",
730
+ "timestamp": "2025-10-15T08:00:00Z",
731
+ "content": "Farah originally estimated Lake Halvorsen's microplastic count using older 333-micrometer mesh data from a 2021 regional survey.",
732
+ "tags": [
733
+ "estimate"
734
+ ],
735
+ "importance": 0.5,
736
+ "metadata": {
737
+ "source_turn": "turn-004",
738
+ "speaker": "user"
739
+ },
740
+ "associations": []
741
+ },
742
+ {
743
+ "id": "memC-res-005",
744
+ "type": "episodic",
745
+ "timestamp": "2026-01-10T12:00:00Z",
746
+ "content": "Using the adapted 100-micrometer protocol, Farah found microplastic counts in Lake Halvorsen were nearly triple her initial estimate.",
747
+ "tags": [
748
+ "results"
749
+ ],
750
+ "importance": 0.75,
751
+ "metadata": {
752
+ "source_turn": "turn-005",
753
+ "speaker": "user"
754
+ },
755
+ "associations": [
756
+ {
757
+ "target_id": "memC-res-003",
758
+ "strength": 0.85,
759
+ "reason": "result of the adapted protocol"
760
+ },
761
+ {
762
+ "target_id": "memC-res-004",
763
+ "strength": 0.6,
764
+ "reason": "compared against this original estimate"
765
+ }
766
+ ]
767
+ },
768
+ {
769
+ "id": "memC-res-006",
770
+ "type": "episodic",
771
+ "timestamp": "2026-01-25T13:00:00Z",
772
+ "content": "The unexpectedly high count led Farah's team to add two sampling sites downstream of a wastewater treatment plant.",
773
+ "tags": [
774
+ "expansion"
775
+ ],
776
+ "importance": 0.65,
777
+ "metadata": {
778
+ "source_turn": "turn-006",
779
+ "speaker": "user"
780
+ },
781
+ "associations": [
782
+ {
783
+ "target_id": "memC-res-005",
784
+ "strength": 0.85,
785
+ "reason": "new sites added because of this high count"
786
+ }
787
+ ]
788
+ },
789
+ {
790
+ "id": "memC-res-007",
791
+ "type": "episodic",
792
+ "timestamp": "2026-02-10T14:00:00Z",
793
+ "content": "Sampling at the new downstream sites confirmed the treatment plant as a major contributor, with counts dropping 70% just 2km past the outflow.",
794
+ "tags": [
795
+ "results"
796
+ ],
797
+ "importance": 0.7,
798
+ "metadata": {
799
+ "source_turn": "turn-007",
800
+ "speaker": "user"
801
+ },
802
+ "associations": [
803
+ {
804
+ "target_id": "memC-res-006",
805
+ "strength": 0.85,
806
+ "reason": "result of the site expansion"
807
+ }
808
+ ]
809
+ },
810
+ {
811
+ "id": "memC-res-008",
812
+ "type": "episodic",
813
+ "timestamp": "2026-02-15T09:00:00Z",
814
+ "content": "Farah mentioned a separate, smaller dataset her undergrad assistant collected on plastic in soil samples, which is not part of the lake study.",
815
+ "tags": [
816
+ "distractor"
817
+ ],
818
+ "importance": 0.2,
819
+ "metadata": {
820
+ "source_turn": "turn-008",
821
+ "speaker": "user"
822
+ },
823
+ "associations": []
824
+ },
825
+ {
826
+ "id": "memC-res-009",
827
+ "type": "episodic",
828
+ "timestamp": "2025-12-01T10:00:00Z",
829
+ "content": "Farah's funding agency initially approved a one-year grant.",
830
+ "tags": [
831
+ "funding"
832
+ ],
833
+ "importance": 0.5,
834
+ "metadata": {
835
+ "source_turn": "turn-009",
836
+ "speaker": "user"
837
+ },
838
+ "associations": []
839
+ },
840
+ {
841
+ "id": "memC-res-010",
842
+ "type": "episodic",
843
+ "timestamp": "2026-02-20T11:00:00Z",
844
+ "content": "The grant was extended to 18 months after the downstream sampling expansion required more time.",
845
+ "tags": [
846
+ "funding"
847
+ ],
848
+ "importance": 0.6,
849
+ "metadata": {
850
+ "source_turn": "turn-010",
851
+ "speaker": "user"
852
+ },
853
+ "associations": [
854
+ {
855
+ "target_id": "memC-res-009",
856
+ "strength": 0.8,
857
+ "reason": "extends the original one-year grant"
858
+ }
859
+ ]
860
+ },
861
+ {
862
+ "id": "memC-res-011",
863
+ "type": "procedural",
864
+ "timestamp": "2025-11-10T09:00:00Z",
865
+ "content": "Farah's lab protocol requires triple-rinsing all sampling jars with deionized water before each collection trip.",
866
+ "tags": [
867
+ "workflow"
868
+ ],
869
+ "importance": 0.45,
870
+ "metadata": {
871
+ "source_turn": "turn-011",
872
+ "speaker": "user"
873
+ },
874
+ "associations": []
875
+ },
876
+ {
877
+ "id": "memC-res-012",
878
+ "type": "episodic",
879
+ "timestamp": "2026-04-05T12:00:00Z",
880
+ "content": "A lab tech accidentally mixed up sample labels from Lake Halvorsen and a nearby control lake, Lake Persson, in April, requiring Farah to redo that batch; the corrected results matched the earlier finding of nearly triple the original estimate.",
881
+ "tags": [
882
+ "correction"
883
+ ],
884
+ "importance": 0.6,
885
+ "metadata": {
886
+ "source_turn": "turn-012",
887
+ "speaker": "user"
888
+ },
889
+ "associations": [
890
+ {
891
+ "target_id": "memC-res-005",
892
+ "strength": 0.7,
893
+ "reason": "redo confirms this earlier finding"
894
+ }
895
+ ]
896
+ }
897
+ ],
898
+ "questions": [
899
+ {
900
+ "question_id": "qC-res-01",
901
+ "category": "atomic_fact_recall",
902
+ "question": "What mesh size does Farah currently use for freshwater sampling?",
903
+ "expected_answer": "100 micrometers",
904
+ "acceptable_answer_criteria": [
905
+ "states 100 micrometers"
906
+ ],
907
+ "required_memory_ids": [
908
+ "memC-res-003"
909
+ ],
910
+ "forbidden_memory_ids": [],
911
+ "difficulty": "easy",
912
+ "architecture_bias_risk": "low",
913
+ "fairness_note": "Single stated fact, the adapted/current protocol value."
914
+ },
915
+ {
916
+ "question_id": "qC-res-02",
917
+ "category": "atomic_fact_recall",
918
+ "question": "Who originally developed the ocean sampling protocol Farah adapted?",
919
+ "expected_answer": "Dr. Lindqvist",
920
+ "acceptable_answer_criteria": [
921
+ "names Dr. Lindqvist"
922
+ ],
923
+ "required_memory_ids": [
924
+ "memC-res-002"
925
+ ],
926
+ "forbidden_memory_ids": [],
927
+ "difficulty": "easy",
928
+ "architecture_bias_risk": "low",
929
+ "fairness_note": "Directly stated single fact."
930
+ },
931
+ {
932
+ "question_id": "qC-res-03",
933
+ "category": "paraphrased_semantic_recall",
934
+ "question": "What's the standard cleaning step before Farah's team heads out on a collection trip?",
935
+ "expected_answer": "Triple-rinse all sampling jars with deionized water beforehand.",
936
+ "acceptable_answer_criteria": [
937
+ "mentions triple-rinsing",
938
+ "mentions deionized water"
939
+ ],
940
+ "required_memory_ids": [
941
+ "memC-res-011"
942
+ ],
943
+ "forbidden_memory_ids": [],
944
+ "difficulty": "medium",
945
+ "architecture_bias_risk": "low",
946
+ "fairness_note": "Reworded from the procedural record's exact phrasing."
947
+ },
948
+ {
949
+ "question_id": "qC-res-04",
950
+ "category": "temporal_update",
951
+ "question": "How long is Farah's research grant currently funded for?",
952
+ "expected_answer": "18 months -- extended from the original one-year approval.",
953
+ "acceptable_answer_criteria": [
954
+ "states 18 months",
955
+ "notes the extension"
956
+ ],
957
+ "required_memory_ids": [
958
+ "memC-res-010",
959
+ "memC-res-009"
960
+ ],
961
+ "forbidden_memory_ids": [],
962
+ "difficulty": "medium",
963
+ "architecture_bias_risk": "medium",
964
+ "fairness_note": "Requires surfacing the extended duration over the original approval."
965
+ },
966
+ {
967
+ "question_id": "qC-res-05",
968
+ "category": "multi_hop_association",
969
+ "question": "Why did Farah's team add two new sampling sites downstream of the wastewater treatment plant?",
970
+ "expected_answer": "The adapted 100-micrometer protocol showed Lake Halvorsen's microplastic count was nearly triple the original estimate, prompting the team to investigate further by sampling downstream of the treatment plant.",
971
+ "acceptable_answer_criteria": [
972
+ "mentions the protocol/result that revealed the high count",
973
+ "connects it to the downstream expansion decision"
974
+ ],
975
+ "required_memory_ids": [
976
+ "memC-res-003",
977
+ "memC-res-005",
978
+ "memC-res-006"
979
+ ],
980
+ "forbidden_memory_ids": [],
981
+ "difficulty": "hard",
982
+ "architecture_bias_risk": "high",
983
+ "fairness_note": "Three-record causal chain (protocol change -> surprising result -> site expansion); rewards systems that can connect a sequence of related facts rather than just the nearest single match."
984
+ },
985
+ {
986
+ "question_id": "qC-res-06",
987
+ "category": "noise_resistance",
988
+ "question": "Is the soil microplastic dataset collected by Farah's undergrad assistant part of the lake study?",
989
+ "expected_answer": "No, it's a separate, smaller dataset unrelated to the freshwater lake research.",
990
+ "acceptable_answer_criteria": [
991
+ "identifies it as separate/unrelated"
992
+ ],
993
+ "required_memory_ids": [
994
+ "memC-res-008",
995
+ "memC-res-001"
996
+ ],
997
+ "forbidden_memory_ids": [],
998
+ "difficulty": "medium",
999
+ "architecture_bias_risk": "medium",
1000
+ "fairness_note": "Distractor shares the 'microplastics' topic but is an explicitly separate dataset, testing precision over topical similarity."
1001
+ }
1002
+ ]
1003
+ },
1004
+ {
1005
+ "conversation_id": "convC-travel-01",
1006
+ "agent_id": "agent-travel",
1007
+ "domain": "travel",
1008
+ "memory_records": [
1009
+ {
1010
+ "id": "memC-tr-001",
1011
+ "type": "episodic",
1012
+ "timestamp": "2025-12-01T09:00:00Z",
1013
+ "content": "Bram is planning a 6-week backpacking trip through Vietnam, Thailand, and Cambodia starting in November.",
1014
+ "tags": [
1015
+ "bram",
1016
+ "planning"
1017
+ ],
1018
+ "importance": 0.6,
1019
+ "metadata": {
1020
+ "source_turn": "turn-001",
1021
+ "speaker": "user"
1022
+ },
1023
+ "associations": []
1024
+ },
1025
+ {
1026
+ "id": "memC-tr-002",
1027
+ "type": "episodic",
1028
+ "timestamp": "2025-12-02T10:00:00Z",
1029
+ "content": "Bram originally planned to spend 3 weeks in Vietnam, 2 in Thailand, and 1 in Cambodia.",
1030
+ "tags": [
1031
+ "bram",
1032
+ "itinerary"
1033
+ ],
1034
+ "importance": 0.5,
1035
+ "metadata": {
1036
+ "source_turn": "turn-002",
1037
+ "speaker": "user"
1038
+ },
1039
+ "associations": []
1040
+ },
1041
+ {
1042
+ "id": "memC-tr-003",
1043
+ "type": "episodic",
1044
+ "timestamp": "2026-01-15T11:00:00Z",
1045
+ "content": "After a friend's recommendation, Bram revised the split to 2 weeks Vietnam, 2 weeks Thailand, 2 weeks Cambodia.",
1046
+ "tags": [
1047
+ "bram",
1048
+ "itinerary"
1049
+ ],
1050
+ "importance": 0.65,
1051
+ "metadata": {
1052
+ "source_turn": "turn-003",
1053
+ "speaker": "user"
1054
+ },
1055
+ "associations": [
1056
+ {
1057
+ "target_id": "memC-tr-002",
1058
+ "strength": 0.8,
1059
+ "reason": "replaces the original time split"
1060
+ }
1061
+ ]
1062
+ },
1063
+ {
1064
+ "id": "memC-tr-004",
1065
+ "type": "semantic",
1066
+ "timestamp": "2025-12-05T08:00:00Z",
1067
+ "content": "Bram is vegan and needs restaurant recommendations that accommodate that across all three countries.",
1068
+ "tags": [
1069
+ "bram",
1070
+ "diet"
1071
+ ],
1072
+ "importance": 0.6,
1073
+ "metadata": {
1074
+ "source_turn": "turn-004",
1075
+ "speaker": "user"
1076
+ },
1077
+ "associations": []
1078
+ },
1079
+ {
1080
+ "id": "memC-tr-005",
1081
+ "type": "episodic",
1082
+ "timestamp": "2026-01-14T12:00:00Z",
1083
+ "content": "Bram's friend Ingrid, who recommended more time in Cambodia, had visited Siem Reap the year before and said the temple sites needed at least 4 full days, not the 2 Bram initially budgeted.",
1084
+ "tags": [
1085
+ "bram",
1086
+ "ingrid"
1087
+ ],
1088
+ "importance": 0.55,
1089
+ "metadata": {
1090
+ "source_turn": "turn-005",
1091
+ "speaker": "user"
1092
+ },
1093
+ "associations": []
1094
+ },
1095
+ {
1096
+ "id": "memC-tr-006",
1097
+ "type": "episodic",
1098
+ "timestamp": "2026-01-16T13:00:00Z",
1099
+ "content": "Because of Ingrid's recommendation about Siem Reap specifically, Bram now plans to dedicate 4 of his Cambodia days to the Angkor temple complex.",
1100
+ "tags": [
1101
+ "bram",
1102
+ "itinerary"
1103
+ ],
1104
+ "importance": 0.6,
1105
+ "metadata": {
1106
+ "source_turn": "turn-006",
1107
+ "speaker": "user"
1108
+ },
1109
+ "associations": [
1110
+ {
1111
+ "target_id": "memC-tr-005",
1112
+ "strength": 0.85,
1113
+ "reason": "Angkor day allocation driven by this tip"
1114
+ }
1115
+ ]
1116
+ },
1117
+ {
1118
+ "id": "memC-tr-007",
1119
+ "type": "episodic",
1120
+ "timestamp": "2026-02-01T09:00:00Z",
1121
+ "content": "Bram mentioned a separate weekend trip to Portugal in August, before the Southeast Asia trip even starts.",
1122
+ "tags": [
1123
+ "bram",
1124
+ "portugal",
1125
+ "distractor"
1126
+ ],
1127
+ "importance": 0.2,
1128
+ "metadata": {
1129
+ "source_turn": "turn-007",
1130
+ "speaker": "user"
1131
+ },
1132
+ "associations": []
1133
+ },
1134
+ {
1135
+ "id": "memC-tr-008",
1136
+ "type": "procedural",
1137
+ "timestamp": "2025-12-10T10:00:00Z",
1138
+ "content": "Bram's packing rule is to fit everything into one 40-liter backpack, no checked luggage, ever.",
1139
+ "tags": [
1140
+ "bram",
1141
+ "workflow"
1142
+ ],
1143
+ "importance": 0.45,
1144
+ "metadata": {
1145
+ "source_turn": "turn-008",
1146
+ "speaker": "user"
1147
+ },
1148
+ "associations": []
1149
+ },
1150
+ {
1151
+ "id": "memC-tr-009",
1152
+ "type": "episodic",
1153
+ "timestamp": "2026-03-01T11:00:00Z",
1154
+ "content": "Bram bought travel insurance covering medical evacuation up to $100,000, valid for the full 6-week trip.",
1155
+ "tags": [
1156
+ "bram",
1157
+ "insurance"
1158
+ ],
1159
+ "importance": 0.6,
1160
+ "metadata": {
1161
+ "source_turn": "turn-009",
1162
+ "speaker": "user"
1163
+ },
1164
+ "associations": []
1165
+ },
1166
+ {
1167
+ "id": "memC-tr-010",
1168
+ "type": "episodic",
1169
+ "timestamp": "2026-04-10T12:00:00Z",
1170
+ "content": "Bram's departure date shifted from November 1st to November 8th after visa processing took longer than expected.",
1171
+ "tags": [
1172
+ "bram",
1173
+ "dates"
1174
+ ],
1175
+ "importance": 0.65,
1176
+ "metadata": {
1177
+ "source_turn": "turn-010",
1178
+ "speaker": "user"
1179
+ },
1180
+ "associations": [
1181
+ {
1182
+ "target_id": "memC-tr-001",
1183
+ "strength": 0.7,
1184
+ "reason": "updates the originally planned start date"
1185
+ }
1186
+ ]
1187
+ },
1188
+ {
1189
+ "id": "memC-tr-011",
1190
+ "type": "episodic",
1191
+ "timestamp": "2026-04-11T13:00:00Z",
1192
+ "content": "With the new November 8th departure, Bram's return date also shifted, now landing in mid-to-late December instead of his original target of returning before December 15th.",
1193
+ "tags": [
1194
+ "bram",
1195
+ "dates"
1196
+ ],
1197
+ "importance": 0.6,
1198
+ "metadata": {
1199
+ "source_turn": "turn-011",
1200
+ "speaker": "user"
1201
+ },
1202
+ "associations": [
1203
+ {
1204
+ "target_id": "memC-tr-010",
1205
+ "strength": 0.75,
1206
+ "reason": "consequence of the departure date shift"
1207
+ }
1208
+ ]
1209
+ },
1210
+ {
1211
+ "id": "memC-tr-012",
1212
+ "type": "episodic",
1213
+ "timestamp": "2026-04-15T14:00:00Z",
1214
+ "content": "Bram confirmed his final itinerary: depart November 8th, return December 20th, with the revised 2-2-2 week country split.",
1215
+ "tags": [
1216
+ "bram",
1217
+ "itinerary",
1218
+ "dates"
1219
+ ],
1220
+ "importance": 0.75,
1221
+ "metadata": {
1222
+ "source_turn": "turn-012",
1223
+ "speaker": "user"
1224
+ },
1225
+ "associations": [
1226
+ {
1227
+ "target_id": "memC-tr-011",
1228
+ "strength": 0.8,
1229
+ "reason": "finalizes the shifted dates"
1230
+ },
1231
+ {
1232
+ "target_id": "memC-tr-003",
1233
+ "strength": 0.8,
1234
+ "reason": "finalizes the revised country split"
1235
+ }
1236
+ ]
1237
+ }
1238
+ ],
1239
+ "questions": [
1240
+ {
1241
+ "question_id": "qC-tr-01",
1242
+ "category": "atomic_fact_recall",
1243
+ "question": "What dietary requirement does Bram have for restaurant planning?",
1244
+ "expected_answer": "Vegan",
1245
+ "acceptable_answer_criteria": [
1246
+ "states vegan"
1247
+ ],
1248
+ "required_memory_ids": [
1249
+ "memC-tr-004"
1250
+ ],
1251
+ "forbidden_memory_ids": [],
1252
+ "difficulty": "easy",
1253
+ "architecture_bias_risk": "low",
1254
+ "fairness_note": "Single stated fact."
1255
+ },
1256
+ {
1257
+ "question_id": "qC-tr-02",
1258
+ "category": "atomic_fact_recall",
1259
+ "question": "What's Bram's rule about luggage for the trip?",
1260
+ "expected_answer": "Everything must fit in one 40-liter backpack, no checked luggage.",
1261
+ "acceptable_answer_criteria": [
1262
+ "mentions 40-liter backpack",
1263
+ "mentions no checked luggage"
1264
+ ],
1265
+ "required_memory_ids": [
1266
+ "memC-tr-008"
1267
+ ],
1268
+ "forbidden_memory_ids": [],
1269
+ "difficulty": "easy",
1270
+ "architecture_bias_risk": "low",
1271
+ "fairness_note": "Directly stated procedural fact."
1272
+ },
1273
+ {
1274
+ "question_id": "qC-tr-03",
1275
+ "category": "paraphrased_semantic_recall",
1276
+ "question": "What level of medical coverage does Bram's travel insurance include?",
1277
+ "expected_answer": "Medical evacuation coverage up to $100,000 for the whole trip.",
1278
+ "acceptable_answer_criteria": [
1279
+ "mentions $100,000",
1280
+ "mentions medical evacuation"
1281
+ ],
1282
+ "required_memory_ids": [
1283
+ "memC-tr-009"
1284
+ ],
1285
+ "forbidden_memory_ids": [],
1286
+ "difficulty": "medium",
1287
+ "architecture_bias_risk": "low",
1288
+ "fairness_note": "Reworded from the source record."
1289
+ },
1290
+ {
1291
+ "question_id": "qC-tr-04",
1292
+ "category": "temporal_update",
1293
+ "question": "When does Bram's trip currently start and end?",
1294
+ "expected_answer": "Departs November 8th, returns December 20th -- shifted from the original November 1st departure and pre-December-15th return target.",
1295
+ "acceptable_answer_criteria": [
1296
+ "gives Nov 8 / Dec 20",
1297
+ "notes these are shifted from earlier dates"
1298
+ ],
1299
+ "required_memory_ids": [
1300
+ "memC-tr-010",
1301
+ "memC-tr-011",
1302
+ "memC-tr-012"
1303
+ ],
1304
+ "forbidden_memory_ids": [],
1305
+ "difficulty": "medium",
1306
+ "architecture_bias_risk": "medium",
1307
+ "fairness_note": "Dates shifted twice in sequence (departure, then consequent return); requires surfacing the final confirmed dates."
1308
+ },
1309
+ {
1310
+ "question_id": "qC-tr-05",
1311
+ "category": "multi_hop_association",
1312
+ "question": "Why is Bram now planning to spend 4 days specifically around the Angkor temple complex?",
1313
+ "expected_answer": "His friend Ingrid, who'd visited Siem Reap before, told him the temple sites needed at least 4 full days, which led Bram to dedicate that much time to Angkor specifically.",
1314
+ "acceptable_answer_criteria": [
1315
+ "mentions Ingrid's prior visit/recommendation",
1316
+ "connects it to the 4-day Angkor allocation"
1317
+ ],
1318
+ "required_memory_ids": [
1319
+ "memC-tr-005",
1320
+ "memC-tr-006"
1321
+ ],
1322
+ "forbidden_memory_ids": [],
1323
+ "difficulty": "hard",
1324
+ "architecture_bias_risk": "high",
1325
+ "fairness_note": "Requires linking a friend's secondhand travel tip to a specific itinerary decision made later."
1326
+ },
1327
+ {
1328
+ "question_id": "qC-tr-06",
1329
+ "category": "noise_resistance",
1330
+ "question": "Is Bram's August Portugal weekend trip part of his Southeast Asia backpacking itinerary?",
1331
+ "expected_answer": "No, it's a separate, unrelated trip happening before the Southeast Asia trip even begins.",
1332
+ "acceptable_answer_criteria": [
1333
+ "identifies Portugal trip as unrelated"
1334
+ ],
1335
+ "required_memory_ids": [
1336
+ "memC-tr-007",
1337
+ "memC-tr-001"
1338
+ ],
1339
+ "forbidden_memory_ids": [],
1340
+ "difficulty": "medium",
1341
+ "architecture_bias_risk": "medium",
1342
+ "fairness_note": "Distractor is a real but unrelated trip mentioned in the same conversation, testing topical precision."
1343
+ }
1344
+ ]
1345
+ },
1346
+ {
1347
+ "conversation_id": "convC-health-admin-01",
1348
+ "agent_id": "agent-health",
1349
+ "domain": "health_admin",
1350
+ "memory_records": [
1351
+ {
1352
+ "id": "memC-ha-001",
1353
+ "type": "episodic",
1354
+ "timestamp": "2025-08-01T09:00:00Z",
1355
+ "content": "Noor started seeing a neurologist, Dr. Castellano, for chronic migraines.",
1356
+ "tags": [
1357
+ "noor",
1358
+ "care"
1359
+ ],
1360
+ "importance": 0.6,
1361
+ "metadata": {
1362
+ "source_turn": "turn-001",
1363
+ "speaker": "user"
1364
+ },
1365
+ "associations": []
1366
+ },
1367
+ {
1368
+ "id": "memC-ha-002",
1369
+ "type": "semantic",
1370
+ "timestamp": "2025-08-05T10:00:00Z",
1371
+ "content": "Dr. Castellano initially prescribed sumatriptan as an as-needed medication for migraine attacks.",
1372
+ "tags": [
1373
+ "noor",
1374
+ "medication"
1375
+ ],
1376
+ "importance": 0.6,
1377
+ "metadata": {
1378
+ "source_turn": "turn-002",
1379
+ "speaker": "user"
1380
+ },
1381
+ "associations": []
1382
+ },
1383
+ {
1384
+ "id": "memC-ha-003",
1385
+ "type": "episodic",
1386
+ "timestamp": "2025-11-10T11:00:00Z",
1387
+ "content": "After migraines persisted at more than 8 days a month, Dr. Castellano added propranolol as a daily preventive medication.",
1388
+ "tags": [
1389
+ "noor",
1390
+ "medication"
1391
+ ],
1392
+ "importance": 0.65,
1393
+ "metadata": {
1394
+ "source_turn": "turn-003",
1395
+ "speaker": "user"
1396
+ },
1397
+ "associations": []
1398
+ },
1399
+ {
1400
+ "id": "memC-ha-004",
1401
+ "type": "episodic",
1402
+ "timestamp": "2026-01-15T12:00:00Z",
1403
+ "content": "Propranolol caused fatigue, so Dr. Castellano switched the preventive medication to topiramate.",
1404
+ "tags": [
1405
+ "noor",
1406
+ "medication"
1407
+ ],
1408
+ "importance": 0.75,
1409
+ "metadata": {
1410
+ "source_turn": "turn-004",
1411
+ "speaker": "user"
1412
+ },
1413
+ "associations": [
1414
+ {
1415
+ "target_id": "memC-ha-003",
1416
+ "strength": 0.85,
1417
+ "reason": "replaces propranolol as the preventive medication"
1418
+ }
1419
+ ]
1420
+ },
1421
+ {
1422
+ "id": "memC-ha-005",
1423
+ "type": "episodic",
1424
+ "timestamp": "2026-04-10T13:00:00Z",
1425
+ "content": "On topiramate, Noor's migraine frequency dropped from over 8 days a month to about 3 days a month.",
1426
+ "tags": [
1427
+ "noor",
1428
+ "results"
1429
+ ],
1430
+ "importance": 0.7,
1431
+ "metadata": {
1432
+ "source_turn": "turn-005",
1433
+ "speaker": "user"
1434
+ },
1435
+ "associations": [
1436
+ {
1437
+ "target_id": "memC-ha-004",
1438
+ "strength": 0.85,
1439
+ "reason": "result of the medication switch"
1440
+ }
1441
+ ]
1442
+ },
1443
+ {
1444
+ "id": "memC-ha-006",
1445
+ "type": "episodic",
1446
+ "timestamp": "2026-02-01T09:00:00Z",
1447
+ "content": "Noor's primary care doctor, Dr. Femi, separately monitors her blood pressure, which has been stable and unrelated to the migraine treatment changes.",
1448
+ "tags": [
1449
+ "noor",
1450
+ "distractor"
1451
+ ],
1452
+ "importance": 0.25,
1453
+ "metadata": {
1454
+ "source_turn": "turn-006",
1455
+ "speaker": "user"
1456
+ },
1457
+ "associations": []
1458
+ },
1459
+ {
1460
+ "id": "memC-ha-007",
1461
+ "type": "episodic",
1462
+ "timestamp": "2025-12-01T10:00:00Z",
1463
+ "content": "Dr. Castellano recommended Noor keep a migraine diary, which led to discovering her attacks frequently followed nights with less than 6 hours of sleep.",
1464
+ "tags": [
1465
+ "noor",
1466
+ "diary"
1467
+ ],
1468
+ "importance": 0.65,
1469
+ "metadata": {
1470
+ "source_turn": "turn-007",
1471
+ "speaker": "user"
1472
+ },
1473
+ "associations": []
1474
+ },
1475
+ {
1476
+ "id": "memC-ha-008",
1477
+ "type": "episodic",
1478
+ "timestamp": "2025-12-15T11:00:00Z",
1479
+ "content": "After connecting poor sleep to her migraine triggers, Noor started a consistent 10:30pm bedtime, which her diary later showed reduced trigger-related attacks further.",
1480
+ "tags": [
1481
+ "noor",
1482
+ "habit"
1483
+ ],
1484
+ "importance": 0.65,
1485
+ "metadata": {
1486
+ "source_turn": "turn-008",
1487
+ "speaker": "user"
1488
+ },
1489
+ "associations": [
1490
+ {
1491
+ "target_id": "memC-ha-007",
1492
+ "strength": 0.85,
1493
+ "reason": "sleep insight led to this bedtime change"
1494
+ }
1495
+ ]
1496
+ },
1497
+ {
1498
+ "id": "memC-ha-009",
1499
+ "type": "semantic",
1500
+ "timestamp": "2025-08-10T09:00:00Z",
1501
+ "content": "Noor is also mildly lactose intolerant, noted in her general health profile but not connected to her migraine care.",
1502
+ "tags": [
1503
+ "noor",
1504
+ "distractor"
1505
+ ],
1506
+ "importance": 0.2,
1507
+ "metadata": {
1508
+ "source_turn": "turn-009",
1509
+ "speaker": "user"
1510
+ },
1511
+ "associations": []
1512
+ },
1513
+ {
1514
+ "id": "memC-ha-010",
1515
+ "type": "procedural",
1516
+ "timestamp": "2025-12-05T10:00:00Z",
1517
+ "content": "Noor's process for logging a migraine is to note the start time, suspected trigger, medication taken, and relief time within an hour of the attack starting.",
1518
+ "tags": [
1519
+ "noor",
1520
+ "workflow"
1521
+ ],
1522
+ "importance": 0.5,
1523
+ "metadata": {
1524
+ "source_turn": "turn-010",
1525
+ "speaker": "user"
1526
+ },
1527
+ "associations": []
1528
+ },
1529
+ {
1530
+ "id": "memC-ha-011",
1531
+ "type": "episodic",
1532
+ "timestamp": "2026-01-10T11:00:00Z",
1533
+ "content": "Noor's insurance required prior authorization for topiramate, which took two weeks to approve before she could start taking it.",
1534
+ "tags": [
1535
+ "noor",
1536
+ "insurance"
1537
+ ],
1538
+ "importance": 0.45,
1539
+ "metadata": {
1540
+ "source_turn": "turn-011",
1541
+ "speaker": "user"
1542
+ },
1543
+ "associations": []
1544
+ },
1545
+ {
1546
+ "id": "memC-ha-012",
1547
+ "type": "episodic",
1548
+ "timestamp": "2026-06-10T12:00:00Z",
1549
+ "content": "As of her June check-in, Noor remains on topiramate with sumatriptan as the as-needed rescue medication, and her migraine frequency has held steady at around 3 days a month.",
1550
+ "tags": [
1551
+ "noor",
1552
+ "status"
1553
+ ],
1554
+ "importance": 0.7,
1555
+ "metadata": {
1556
+ "source_turn": "turn-012",
1557
+ "speaker": "user"
1558
+ },
1559
+ "associations": [
1560
+ {
1561
+ "target_id": "memC-ha-004",
1562
+ "strength": 0.8,
1563
+ "reason": "confirms topiramate is still the current preventive medication"
1564
+ },
1565
+ {
1566
+ "target_id": "memC-ha-002",
1567
+ "strength": 0.7,
1568
+ "reason": "confirms sumatriptan is still the rescue medication"
1569
+ }
1570
+ ]
1571
+ }
1572
+ ],
1573
+ "questions": [
1574
+ {
1575
+ "question_id": "qC-ha-01",
1576
+ "category": "atomic_fact_recall",
1577
+ "question": "What as-needed medication does Noor take for migraine attacks?",
1578
+ "expected_answer": "Sumatriptan",
1579
+ "acceptable_answer_criteria": [
1580
+ "states sumatriptan"
1581
+ ],
1582
+ "required_memory_ids": [
1583
+ "memC-ha-002",
1584
+ "memC-ha-012"
1585
+ ],
1586
+ "forbidden_memory_ids": [],
1587
+ "difficulty": "easy",
1588
+ "architecture_bias_risk": "low",
1589
+ "fairness_note": "Stated directly and reconfirmed later; no architecture-specific advantage."
1590
+ },
1591
+ {
1592
+ "question_id": "qC-ha-02",
1593
+ "category": "atomic_fact_recall",
1594
+ "question": "What other health condition does Noor's primary care doctor monitor, separate from her migraines?",
1595
+ "expected_answer": "Blood pressure",
1596
+ "acceptable_answer_criteria": [
1597
+ "states blood pressure"
1598
+ ],
1599
+ "required_memory_ids": [
1600
+ "memC-ha-006"
1601
+ ],
1602
+ "forbidden_memory_ids": [],
1603
+ "difficulty": "easy",
1604
+ "architecture_bias_risk": "low",
1605
+ "fairness_note": "Single stated fact about an unrelated condition."
1606
+ },
1607
+ {
1608
+ "question_id": "qC-ha-03",
1609
+ "category": "paraphrased_semantic_recall",
1610
+ "question": "What sleep-related habit did Noor adopt after noticing a pattern in her migraine diary?",
1611
+ "expected_answer": "A consistent 10:30pm bedtime.",
1612
+ "acceptable_answer_criteria": [
1613
+ "mentions 10:30pm bedtime"
1614
+ ],
1615
+ "required_memory_ids": [
1616
+ "memC-ha-008"
1617
+ ],
1618
+ "forbidden_memory_ids": [],
1619
+ "difficulty": "medium",
1620
+ "architecture_bias_risk": "low",
1621
+ "fairness_note": "Reworded from the source record's phrasing."
1622
+ },
1623
+ {
1624
+ "question_id": "qC-ha-04",
1625
+ "category": "temporal_update",
1626
+ "question": "What is Noor's current daily preventive medication for migraines?",
1627
+ "expected_answer": "Topiramate -- switched from propranolol after propranolol caused fatigue.",
1628
+ "acceptable_answer_criteria": [
1629
+ "states topiramate",
1630
+ "notes the switch from propranolol"
1631
+ ],
1632
+ "required_memory_ids": [
1633
+ "memC-ha-004"
1634
+ ],
1635
+ "forbidden_memory_ids": [
1636
+ "memC-ha-003"
1637
+ ],
1638
+ "difficulty": "medium",
1639
+ "architecture_bias_risk": "medium",
1640
+ "fairness_note": "Two preventive medications were tried in sequence; the system must surface the current one, not the first one tried."
1641
+ },
1642
+ {
1643
+ "question_id": "qC-ha-05",
1644
+ "category": "multi_hop_association",
1645
+ "question": "How did Noor's migraine diary lead to a change in her sleep habits?",
1646
+ "expected_answer": "Dr. Castellano recommended the diary, which revealed her attacks often followed nights with less than 6 hours of sleep, leading Noor to adopt a consistent 10:30pm bedtime.",
1647
+ "acceptable_answer_criteria": [
1648
+ "mentions the diary recommendation",
1649
+ "mentions the sleep-attack pattern",
1650
+ "connects it to the bedtime change"
1651
+ ],
1652
+ "required_memory_ids": [
1653
+ "memC-ha-007",
1654
+ "memC-ha-008"
1655
+ ],
1656
+ "forbidden_memory_ids": [],
1657
+ "difficulty": "hard",
1658
+ "architecture_bias_risk": "high",
1659
+ "fairness_note": "Requires connecting a recommendation, a discovered pattern, and a resulting behavior change across two records."
1660
+ },
1661
+ {
1662
+ "question_id": "qC-ha-06",
1663
+ "category": "procedural_recall",
1664
+ "question": "What does Noor record when logging a migraine attack?",
1665
+ "expected_answer": "Start time, suspected trigger, medication taken, and relief time, logged within an hour of the attack starting.",
1666
+ "acceptable_answer_criteria": [
1667
+ "mentions all four elements",
1668
+ "mentions the one-hour logging window"
1669
+ ],
1670
+ "required_memory_ids": [
1671
+ "memC-ha-010"
1672
+ ],
1673
+ "forbidden_memory_ids": [],
1674
+ "difficulty": "easy",
1675
+ "architecture_bias_risk": "low",
1676
+ "fairness_note": "Single procedural record states this directly."
1677
+ }
1678
+ ]
1679
+ },
1680
+ {
1681
+ "conversation_id": "convC-finance-admin-01",
1682
+ "agent_id": "agent-finance",
1683
+ "domain": "finance_admin",
1684
+ "memory_records": [
1685
+ {
1686
+ "id": "memC-fa-001",
1687
+ "type": "semantic",
1688
+ "timestamp": "2025-09-01T08:00:00Z",
1689
+ "content": "Theo is a freelance graphic designer who invoices clients monthly and has irregular income.",
1690
+ "tags": [
1691
+ "theo",
1692
+ "profile"
1693
+ ],
1694
+ "importance": 0.5,
1695
+ "metadata": {
1696
+ "source_turn": "turn-001",
1697
+ "speaker": "user"
1698
+ },
1699
+ "associations": []
1700
+ },
1701
+ {
1702
+ "id": "memC-fa-002",
1703
+ "type": "semantic",
1704
+ "timestamp": "2025-09-02T09:00:00Z",
1705
+ "content": "Theo sets aside a percentage of every invoice payment into a separate tax savings account, originally 30%.",
1706
+ "tags": [
1707
+ "theo",
1708
+ "tax"
1709
+ ],
1710
+ "importance": 0.6,
1711
+ "metadata": {
1712
+ "source_turn": "turn-002",
1713
+ "speaker": "user"
1714
+ },
1715
+ "associations": []
1716
+ },
1717
+ {
1718
+ "id": "memC-fa-003",
1719
+ "type": "episodic",
1720
+ "timestamp": "2026-02-01T10:00:00Z",
1721
+ "content": "Theo switched accounting software from a spreadsheet system to an app called LedgerLeaf.",
1722
+ "tags": [
1723
+ "theo",
1724
+ "tools"
1725
+ ],
1726
+ "importance": 0.55,
1727
+ "metadata": {
1728
+ "source_turn": "turn-003",
1729
+ "speaker": "user"
1730
+ },
1731
+ "associations": []
1732
+ },
1733
+ {
1734
+ "id": "memC-fa-004",
1735
+ "type": "semantic",
1736
+ "timestamp": "2025-09-05T08:00:00Z",
1737
+ "content": "Theo's biggest recurring client, a marketing agency called Brightwell, pays on a 45-day cycle instead of the standard 30 days most of his other clients use.",
1738
+ "tags": [
1739
+ "theo",
1740
+ "brightwell"
1741
+ ],
1742
+ "importance": 0.55,
1743
+ "metadata": {
1744
+ "source_turn": "turn-004",
1745
+ "speaker": "user"
1746
+ },
1747
+ "associations": []
1748
+ },
1749
+ {
1750
+ "id": "memC-fa-005",
1751
+ "type": "episodic",
1752
+ "timestamp": "2025-09-10T09:00:00Z",
1753
+ "content": "Because of Brightwell's longer payment cycle, Theo built up a one-month cash buffer to cover the gap between invoicing and getting paid.",
1754
+ "tags": [
1755
+ "theo",
1756
+ "buffer"
1757
+ ],
1758
+ "importance": 0.6,
1759
+ "metadata": {
1760
+ "source_turn": "turn-005",
1761
+ "speaker": "user"
1762
+ },
1763
+ "associations": [
1764
+ {
1765
+ "target_id": "memC-fa-004",
1766
+ "strength": 0.8,
1767
+ "reason": "buffer exists specifically because of this client's payment cycle"
1768
+ }
1769
+ ]
1770
+ },
1771
+ {
1772
+ "id": "memC-fa-006",
1773
+ "type": "episodic",
1774
+ "timestamp": "2025-11-01T10:00:00Z",
1775
+ "content": "Theo also does occasional one-off logo design gigs through a marketplace site, which he doesn't track with the same monthly invoicing rhythm.",
1776
+ "tags": [
1777
+ "theo",
1778
+ "distractor"
1779
+ ],
1780
+ "importance": 0.2,
1781
+ "metadata": {
1782
+ "source_turn": "turn-006",
1783
+ "speaker": "user"
1784
+ },
1785
+ "associations": []
1786
+ },
1787
+ {
1788
+ "id": "memC-fa-007",
1789
+ "type": "procedural",
1790
+ "timestamp": "2025-09-15T08:00:00Z",
1791
+ "content": "Theo's quarterly estimated tax process is to total income in LedgerLeaf, subtract business expenses, then pay 25% of the net via the IRS online portal.",
1792
+ "tags": [
1793
+ "theo",
1794
+ "workflow"
1795
+ ],
1796
+ "importance": 0.55,
1797
+ "metadata": {
1798
+ "source_turn": "turn-007",
1799
+ "speaker": "user"
1800
+ },
1801
+ "associations": []
1802
+ },
1803
+ {
1804
+ "id": "memC-fa-008",
1805
+ "type": "episodic",
1806
+ "timestamp": "2026-05-01T11:00:00Z",
1807
+ "content": "Theo raised his tax savings rate from 30% to 35% after underestimating his Q1 tax payment.",
1808
+ "tags": [
1809
+ "theo",
1810
+ "tax"
1811
+ ],
1812
+ "importance": 0.7,
1813
+ "metadata": {
1814
+ "source_turn": "turn-008",
1815
+ "speaker": "user"
1816
+ },
1817
+ "associations": [
1818
+ {
1819
+ "target_id": "memC-fa-002",
1820
+ "strength": 0.8,
1821
+ "reason": "updates the original 30% savings rate"
1822
+ }
1823
+ ]
1824
+ },
1825
+ {
1826
+ "id": "memC-fa-009",
1827
+ "type": "semantic",
1828
+ "timestamp": "2025-09-20T09:00:00Z",
1829
+ "content": "Theo's standard payment terms for new clients are net-30, with a 2% late fee after 15 days overdue.",
1830
+ "tags": [
1831
+ "theo",
1832
+ "terms"
1833
+ ],
1834
+ "importance": 0.5,
1835
+ "metadata": {
1836
+ "source_turn": "turn-009",
1837
+ "speaker": "user"
1838
+ },
1839
+ "associations": []
1840
+ },
1841
+ {
1842
+ "id": "memC-fa-010",
1843
+ "type": "episodic",
1844
+ "timestamp": "2026-06-01T10:00:00Z",
1845
+ "content": "Theo hired a part-time bookkeeper, Priya, to help reconcile invoices in LedgerLeaf monthly.",
1846
+ "tags": [
1847
+ "theo",
1848
+ "team"
1849
+ ],
1850
+ "importance": 0.55,
1851
+ "metadata": {
1852
+ "source_turn": "turn-010",
1853
+ "speaker": "user"
1854
+ },
1855
+ "associations": []
1856
+ },
1857
+ {
1858
+ "id": "memC-fa-011",
1859
+ "type": "episodic",
1860
+ "timestamp": "2026-03-01T09:00:00Z",
1861
+ "content": "Theo's average monthly income over the past year has ranged between $4,200 and $7,800 depending on the season.",
1862
+ "tags": [
1863
+ "theo",
1864
+ "income"
1865
+ ],
1866
+ "importance": 0.5,
1867
+ "metadata": {
1868
+ "source_turn": "turn-011",
1869
+ "speaker": "user"
1870
+ },
1871
+ "associations": []
1872
+ },
1873
+ {
1874
+ "id": "memC-fa-012",
1875
+ "type": "episodic",
1876
+ "timestamp": "2026-06-15T11:00:00Z",
1877
+ "content": "After the rate increase, Theo confirmed in his June check-in that the 35% tax savings rate has made his Q2 estimated payment much easier to cover.",
1878
+ "tags": [
1879
+ "theo",
1880
+ "tax"
1881
+ ],
1882
+ "importance": 0.6,
1883
+ "metadata": {
1884
+ "source_turn": "turn-012",
1885
+ "speaker": "user"
1886
+ },
1887
+ "associations": [
1888
+ {
1889
+ "target_id": "memC-fa-008",
1890
+ "strength": 0.8,
1891
+ "reason": "confirms the raised rate is still in effect"
1892
+ }
1893
+ ]
1894
+ }
1895
+ ],
1896
+ "questions": [
1897
+ {
1898
+ "question_id": "qC-fa-01",
1899
+ "category": "atomic_fact_recall",
1900
+ "question": "What accounting software does Theo currently use?",
1901
+ "expected_answer": "LedgerLeaf",
1902
+ "acceptable_answer_criteria": [
1903
+ "names LedgerLeaf"
1904
+ ],
1905
+ "required_memory_ids": [
1906
+ "memC-fa-003"
1907
+ ],
1908
+ "forbidden_memory_ids": [],
1909
+ "difficulty": "easy",
1910
+ "architecture_bias_risk": "low",
1911
+ "fairness_note": "Single stated fact."
1912
+ },
1913
+ {
1914
+ "question_id": "qC-fa-02",
1915
+ "category": "paraphrased_semantic_recall",
1916
+ "question": "What are Theo's default payment terms for a new client?",
1917
+ "expected_answer": "Net-30, with a 2% late fee if more than 15 days overdue.",
1918
+ "acceptable_answer_criteria": [
1919
+ "mentions net-30",
1920
+ "mentions the 2% late fee"
1921
+ ],
1922
+ "required_memory_ids": [
1923
+ "memC-fa-009"
1924
+ ],
1925
+ "forbidden_memory_ids": [],
1926
+ "difficulty": "medium",
1927
+ "architecture_bias_risk": "low",
1928
+ "fairness_note": "Reworded from the source record."
1929
+ },
1930
+ {
1931
+ "question_id": "qC-fa-03",
1932
+ "category": "paraphrased_semantic_recall",
1933
+ "question": "Who did Theo bring on to help with monthly invoice reconciliation?",
1934
+ "expected_answer": "A part-time bookkeeper, Priya.",
1935
+ "acceptable_answer_criteria": [
1936
+ "names Priya",
1937
+ "mentions the bookkeeper role"
1938
+ ],
1939
+ "required_memory_ids": [
1940
+ "memC-fa-010"
1941
+ ],
1942
+ "forbidden_memory_ids": [],
1943
+ "difficulty": "medium",
1944
+ "architecture_bias_risk": "low",
1945
+ "fairness_note": "Reworded from 'help reconcile invoices' phrasing."
1946
+ },
1947
+ {
1948
+ "question_id": "qC-fa-04",
1949
+ "category": "temporal_update",
1950
+ "question": "What percentage of each invoice does Theo currently set aside for taxes?",
1951
+ "expected_answer": "35% -- raised from the original 30% after underestimating his Q1 payment.",
1952
+ "acceptable_answer_criteria": [
1953
+ "states 35%",
1954
+ "notes the change from 30%"
1955
+ ],
1956
+ "required_memory_ids": [
1957
+ "memC-fa-008",
1958
+ "memC-fa-002"
1959
+ ],
1960
+ "forbidden_memory_ids": [],
1961
+ "difficulty": "medium",
1962
+ "architecture_bias_risk": "medium",
1963
+ "fairness_note": "Requires returning the updated rate, not the original starting figure."
1964
+ },
1965
+ {
1966
+ "question_id": "qC-fa-05",
1967
+ "category": "multi_hop_association",
1968
+ "question": "Why did Theo build up a one-month cash buffer?",
1969
+ "expected_answer": "His biggest client, Brightwell, pays on a 45-day cycle instead of the standard 30 days, so the buffer covers the gap between invoicing and getting paid.",
1970
+ "acceptable_answer_criteria": [
1971
+ "mentions Brightwell's 45-day cycle",
1972
+ "connects it to the buffer's purpose"
1973
+ ],
1974
+ "required_memory_ids": [
1975
+ "memC-fa-004",
1976
+ "memC-fa-005"
1977
+ ],
1978
+ "forbidden_memory_ids": [],
1979
+ "difficulty": "hard",
1980
+ "architecture_bias_risk": "high",
1981
+ "fairness_note": "Requires connecting a client-specific payment term to a financial decision made because of it."
1982
+ },
1983
+ {
1984
+ "question_id": "qC-fa-06",
1985
+ "category": "procedural_recall",
1986
+ "question": "What is Theo's process for paying quarterly estimated taxes?",
1987
+ "expected_answer": "Total income in LedgerLeaf, subtract business expenses, then pay 25% of the net through the IRS online portal.",
1988
+ "acceptable_answer_criteria": [
1989
+ "mentions LedgerLeaf totals",
1990
+ "mentions subtracting expenses",
1991
+ "mentions the 25% portal payment"
1992
+ ],
1993
+ "required_memory_ids": [
1994
+ "memC-fa-007"
1995
+ ],
1996
+ "forbidden_memory_ids": [],
1997
+ "difficulty": "easy",
1998
+ "architecture_bias_risk": "low",
1999
+ "fairness_note": "Single procedural record states this directly."
2000
+ }
2001
+ ]
2002
+ },
2003
+ {
2004
+ "conversation_id": "convC-education-01",
2005
+ "agent_id": "agent-tutor",
2006
+ "domain": "education",
2007
+ "memory_records": [
2008
+ {
2009
+ "id": "memC-ed-001",
2010
+ "type": "episodic",
2011
+ "timestamp": "2026-01-05T09:00:00Z",
2012
+ "content": "Mira started studying for the state bar exam, six months before the original July test date.",
2013
+ "tags": [
2014
+ "mira",
2015
+ "study"
2016
+ ],
2017
+ "importance": 0.6,
2018
+ "metadata": {
2019
+ "source_turn": "turn-001",
2020
+ "speaker": "user"
2021
+ },
2022
+ "associations": []
2023
+ },
2024
+ {
2025
+ "id": "memC-ed-002",
2026
+ "type": "episodic",
2027
+ "timestamp": "2026-01-06T10:00:00Z",
2028
+ "content": "Mira originally planned to study primarily from a commercial outline service called BarReady.",
2029
+ "tags": [
2030
+ "mira",
2031
+ "materials"
2032
+ ],
2033
+ "importance": 0.5,
2034
+ "metadata": {
2035
+ "source_turn": "turn-002",
2036
+ "speaker": "user"
2037
+ },
2038
+ "associations": []
2039
+ },
2040
+ {
2041
+ "id": "memC-ed-003",
2042
+ "type": "episodic",
2043
+ "timestamp": "2026-03-10T11:00:00Z",
2044
+ "content": "After a practice test on BarReady material scored lower than expected, Mira switched her primary study material to a different outline service, ExamCore.",
2045
+ "tags": [
2046
+ "mira",
2047
+ "materials"
2048
+ ],
2049
+ "importance": 0.65,
2050
+ "metadata": {
2051
+ "source_turn": "turn-003",
2052
+ "speaker": "user"
2053
+ },
2054
+ "associations": [
2055
+ {
2056
+ "target_id": "memC-ed-002",
2057
+ "strength": 0.8,
2058
+ "reason": "replaces the original outline service"
2059
+ }
2060
+ ]
2061
+ },
2062
+ {
2063
+ "id": "memC-ed-004",
2064
+ "type": "episodic",
2065
+ "timestamp": "2026-03-15T12:00:00Z",
2066
+ "content": "Mira's study partner, Devon, also switched to ExamCore around the same time after hearing about Mira's experience.",
2067
+ "tags": [
2068
+ "mira",
2069
+ "devon",
2070
+ "distractor"
2071
+ ],
2072
+ "importance": 0.25,
2073
+ "metadata": {
2074
+ "source_turn": "turn-004",
2075
+ "speaker": "user"
2076
+ },
2077
+ "associations": []
2078
+ },
2079
+ {
2080
+ "id": "memC-ed-005",
2081
+ "type": "episodic",
2082
+ "timestamp": "2026-01-10T09:00:00Z",
2083
+ "content": "Mira initially believed the bar exam's essay portion counted for 50% of her total score.",
2084
+ "tags": [
2085
+ "mira",
2086
+ "scoring"
2087
+ ],
2088
+ "importance": 0.4,
2089
+ "metadata": {
2090
+ "source_turn": "turn-005",
2091
+ "speaker": "user"
2092
+ },
2093
+ "associations": []
2094
+ },
2095
+ {
2096
+ "id": "memC-ed-006",
2097
+ "type": "episodic",
2098
+ "timestamp": "2026-02-01T10:00:00Z",
2099
+ "content": "After reviewing the exact scoring breakdown, Mira learned the essay portion actually counts for 30% of the total score, with multiple-choice worth 50% and the performance test worth 20%.",
2100
+ "tags": [
2101
+ "mira",
2102
+ "scoring"
2103
+ ],
2104
+ "importance": 0.65,
2105
+ "metadata": {
2106
+ "source_turn": "turn-006",
2107
+ "speaker": "user"
2108
+ },
2109
+ "associations": [
2110
+ {
2111
+ "target_id": "memC-ed-005",
2112
+ "strength": 0.85,
2113
+ "reason": "corrects this earlier mistaken belief"
2114
+ }
2115
+ ]
2116
+ },
2117
+ {
2118
+ "id": "memC-ed-007",
2119
+ "type": "procedural",
2120
+ "timestamp": "2026-01-20T09:00:00Z",
2121
+ "content": "Mira's weekly study routine is three practice essays, one full practice multiple-choice set, and one performance-test simulation, every Sunday.",
2122
+ "tags": [
2123
+ "mira",
2124
+ "workflow"
2125
+ ],
2126
+ "importance": 0.55,
2127
+ "metadata": {
2128
+ "source_turn": "turn-007",
2129
+ "speaker": "user"
2130
+ },
2131
+ "associations": []
2132
+ },
2133
+ {
2134
+ "id": "memC-ed-008",
2135
+ "type": "episodic",
2136
+ "timestamp": "2026-04-01T11:00:00Z",
2137
+ "content": "Mira's bar exam date moved from July 28th to August 4th after the state bar association rescheduled due to a venue conflict.",
2138
+ "tags": [
2139
+ "mira",
2140
+ "schedule"
2141
+ ],
2142
+ "importance": 0.6,
2143
+ "metadata": {
2144
+ "source_turn": "turn-008",
2145
+ "speaker": "user"
2146
+ },
2147
+ "associations": []
2148
+ },
2149
+ {
2150
+ "id": "memC-ed-009",
2151
+ "type": "episodic",
2152
+ "timestamp": "2026-02-10T10:00:00Z",
2153
+ "content": "Mira mentioned she'd already passed a separate licensing exam in an unrelated field, real estate, three years earlier.",
2154
+ "tags": [
2155
+ "mira",
2156
+ "distractor"
2157
+ ],
2158
+ "importance": 0.2,
2159
+ "metadata": {
2160
+ "source_turn": "turn-009",
2161
+ "speaker": "user"
2162
+ },
2163
+ "associations": []
2164
+ },
2165
+ {
2166
+ "id": "memC-ed-010",
2167
+ "type": "episodic",
2168
+ "timestamp": "2026-03-20T12:00:00Z",
2169
+ "content": "Switching to ExamCore's essay-focused materials specifically helped Mira improve her practice essay scores, since BarReady had been comparatively light on essay technique coverage.",
2170
+ "tags": [
2171
+ "mira",
2172
+ "progress"
2173
+ ],
2174
+ "importance": 0.6,
2175
+ "metadata": {
2176
+ "source_turn": "turn-010",
2177
+ "speaker": "user"
2178
+ },
2179
+ "associations": [
2180
+ {
2181
+ "target_id": "memC-ed-003",
2182
+ "strength": 0.8,
2183
+ "reason": "result of the switch to ExamCore"
2184
+ }
2185
+ ]
2186
+ },
2187
+ {
2188
+ "id": "memC-ed-011",
2189
+ "type": "episodic",
2190
+ "timestamp": "2026-03-25T13:00:00Z",
2191
+ "content": "Because her essay scores improved most after switching materials, Mira added a second weekly essay-focused study session, on Wednesdays.",
2192
+ "tags": [
2193
+ "mira",
2194
+ "workflow"
2195
+ ],
2196
+ "importance": 0.65,
2197
+ "metadata": {
2198
+ "source_turn": "turn-011",
2199
+ "speaker": "user"
2200
+ },
2201
+ "associations": [
2202
+ {
2203
+ "target_id": "memC-ed-010",
2204
+ "strength": 0.85,
2205
+ "reason": "added because of this improvement"
2206
+ }
2207
+ ]
2208
+ },
2209
+ {
2210
+ "id": "memC-ed-012",
2211
+ "type": "episodic",
2212
+ "timestamp": "2026-05-05T14:00:00Z",
2213
+ "content": "Mira confirmed in her May check-in that her current routine is the original Sunday session plus the new Wednesday essay-focused session, continuing through to her August 4th exam date.",
2214
+ "tags": [
2215
+ "mira",
2216
+ "status"
2217
+ ],
2218
+ "importance": 0.65,
2219
+ "metadata": {
2220
+ "source_turn": "turn-012",
2221
+ "speaker": "user"
2222
+ },
2223
+ "associations": [
2224
+ {
2225
+ "target_id": "memC-ed-011",
2226
+ "strength": 0.8,
2227
+ "reason": "confirms the Wednesday session is still active"
2228
+ },
2229
+ {
2230
+ "target_id": "memC-ed-008",
2231
+ "strength": 0.7,
2232
+ "reason": "confirms the current exam date"
2233
+ }
2234
+ ]
2235
+ }
2236
+ ],
2237
+ "questions": [
2238
+ {
2239
+ "question_id": "qC-ed-01",
2240
+ "category": "atomic_fact_recall",
2241
+ "question": "Which outline service is Mira currently using as her primary study material?",
2242
+ "expected_answer": "ExamCore",
2243
+ "acceptable_answer_criteria": [
2244
+ "names ExamCore"
2245
+ ],
2246
+ "required_memory_ids": [
2247
+ "memC-ed-003"
2248
+ ],
2249
+ "forbidden_memory_ids": [],
2250
+ "difficulty": "easy",
2251
+ "architecture_bias_risk": "low",
2252
+ "fairness_note": "Most recent record names the current resource directly."
2253
+ },
2254
+ {
2255
+ "question_id": "qC-ed-02",
2256
+ "category": "paraphrased_semantic_recall",
2257
+ "question": "What did a typical Sunday study session look like for Mira before her schedule changed?",
2258
+ "expected_answer": "Three practice essays, one full multiple-choice set, and one performance-test simulation.",
2259
+ "acceptable_answer_criteria": [
2260
+ "mentions the three components"
2261
+ ],
2262
+ "required_memory_ids": [
2263
+ "memC-ed-007"
2264
+ ],
2265
+ "forbidden_memory_ids": [],
2266
+ "difficulty": "medium",
2267
+ "architecture_bias_risk": "low",
2268
+ "fairness_note": "Reworded from the procedural record's phrasing."
2269
+ },
2270
+ {
2271
+ "question_id": "qC-ed-03",
2272
+ "category": "paraphrased_semantic_recall",
2273
+ "question": "What other professional exam had Mira already completed before starting bar exam prep?",
2274
+ "expected_answer": "A real estate licensing exam, passed three years earlier.",
2275
+ "acceptable_answer_criteria": [
2276
+ "mentions real estate",
2277
+ "mentions it was already passed"
2278
+ ],
2279
+ "required_memory_ids": [
2280
+ "memC-ed-009"
2281
+ ],
2282
+ "forbidden_memory_ids": [],
2283
+ "difficulty": "medium",
2284
+ "architecture_bias_risk": "low",
2285
+ "fairness_note": "Reworded; tests retrieving a minor but explicit fact rather than confusing it with bar exam progress."
2286
+ },
2287
+ {
2288
+ "question_id": "qC-ed-04",
2289
+ "category": "contradiction_resolution",
2290
+ "question": "Does the essay portion of the bar exam count for 50% of Mira's total score?",
2291
+ "expected_answer": "No -- that was Mira's initial mistaken belief. The essay portion actually counts for 30%, with multiple-choice at 50% and the performance test at 20%.",
2292
+ "acceptable_answer_criteria": [
2293
+ "says no",
2294
+ "gives the corrected 30/50/20 breakdown"
2295
+ ],
2296
+ "required_memory_ids": [
2297
+ "memC-ed-006"
2298
+ ],
2299
+ "forbidden_memory_ids": [
2300
+ "memC-ed-005"
2301
+ ],
2302
+ "difficulty": "medium",
2303
+ "architecture_bias_risk": "medium",
2304
+ "fairness_note": "The 'stale' fact here is an initial misunderstanding rather than an outdated state of the world, testing whether corrections are tracked as well as updates."
2305
+ },
2306
+ {
2307
+ "question_id": "qC-ed-05",
2308
+ "category": "multi_hop_association",
2309
+ "question": "Why did Mira add a second weekly study session on Wednesdays?",
2310
+ "expected_answer": "Switching from BarReady to ExamCore's essay-focused materials improved her practice essay scores, and because of that improvement she added a Wednesday essay-focused session on top of her usual Sunday routine.",
2311
+ "acceptable_answer_criteria": [
2312
+ "mentions the materials switch",
2313
+ "mentions the essay-score improvement",
2314
+ "connects it to adding the Wednesday session"
2315
+ ],
2316
+ "required_memory_ids": [
2317
+ "memC-ed-010",
2318
+ "memC-ed-011"
2319
+ ],
2320
+ "forbidden_memory_ids": [],
2321
+ "difficulty": "hard",
2322
+ "architecture_bias_risk": "high",
2323
+ "fairness_note": "Requires connecting a materials switch, its measured effect, and a resulting schedule change."
2324
+ },
2325
+ {
2326
+ "question_id": "qC-ed-06",
2327
+ "category": "procedural_recall",
2328
+ "question": "What is Mira's current weekly study routine?",
2329
+ "expected_answer": "Her original Sunday session (three essays, one multiple-choice set, one performance test) plus a new Wednesday essay-focused session.",
2330
+ "acceptable_answer_criteria": [
2331
+ "mentions both Sunday and Wednesday sessions"
2332
+ ],
2333
+ "required_memory_ids": [
2334
+ "memC-ed-007",
2335
+ "memC-ed-011",
2336
+ "memC-ed-012"
2337
+ ],
2338
+ "forbidden_memory_ids": [],
2339
+ "difficulty": "medium",
2340
+ "architecture_bias_risk": "medium",
2341
+ "fairness_note": "Requires combining the original procedural record with a later addition to describe the current full routine."
2342
+ }
2343
+ ]
2344
+ },
2345
+ {
2346
+ "conversation_id": "convC-creative-work-01",
2347
+ "agent_id": "agent-writing",
2348
+ "domain": "creative_work",
2349
+ "memory_records": [
2350
+ {
2351
+ "id": "memC-cw-001",
2352
+ "type": "episodic",
2353
+ "timestamp": "2025-09-01T09:00:00Z",
2354
+ "content": "Iskandar started writing songs for a concept album about migration, originally planned as 10 tracks.",
2355
+ "tags": [
2356
+ "iskandar",
2357
+ "album"
2358
+ ],
2359
+ "importance": 0.6,
2360
+ "metadata": {
2361
+ "source_turn": "turn-001",
2362
+ "speaker": "user"
2363
+ },
2364
+ "associations": []
2365
+ },
2366
+ {
2367
+ "id": "memC-cw-002",
2368
+ "type": "episodic",
2369
+ "timestamp": "2026-01-10T10:00:00Z",
2370
+ "content": "Iskandar expanded the album to 12 tracks after writing two additional songs that fit the theme better than two he'd originally planned to cut.",
2371
+ "tags": [
2372
+ "iskandar",
2373
+ "album"
2374
+ ],
2375
+ "importance": 0.55,
2376
+ "metadata": {
2377
+ "source_turn": "turn-002",
2378
+ "speaker": "user"
2379
+ },
2380
+ "associations": [
2381
+ {
2382
+ "target_id": "memC-cw-001",
2383
+ "strength": 0.7,
2384
+ "reason": "updates the original 10-track plan"
2385
+ }
2386
+ ]
2387
+ },
2388
+ {
2389
+ "id": "memC-cw-003",
2390
+ "type": "semantic",
2391
+ "timestamp": "2025-09-05T08:00:00Z",
2392
+ "content": "The album's working title is 'Crossing Lines.'",
2393
+ "tags": [
2394
+ "iskandar",
2395
+ "title"
2396
+ ],
2397
+ "importance": 0.6,
2398
+ "metadata": {
2399
+ "source_turn": "turn-003",
2400
+ "speaker": "user"
2401
+ },
2402
+ "associations": []
2403
+ },
2404
+ {
2405
+ "id": "memC-cw-004",
2406
+ "type": "episodic",
2407
+ "timestamp": "2025-10-01T09:00:00Z",
2408
+ "content": "Iskandar originally planned the album's opening track to be a song called 'Harbor Lights.'",
2409
+ "tags": [
2410
+ "iskandar",
2411
+ "tracklist"
2412
+ ],
2413
+ "importance": 0.5,
2414
+ "metadata": {
2415
+ "source_turn": "turn-004",
2416
+ "speaker": "user"
2417
+ },
2418
+ "associations": []
2419
+ },
2420
+ {
2421
+ "id": "memC-cw-005",
2422
+ "type": "episodic",
2423
+ "timestamp": "2026-02-15T11:00:00Z",
2424
+ "content": "After feedback from his producer, Iskandar moved 'Harbor Lights' to track 4 and made a newer song, 'First Light,' the opening track instead.",
2425
+ "tags": [
2426
+ "iskandar",
2427
+ "tracklist"
2428
+ ],
2429
+ "importance": 0.65,
2430
+ "metadata": {
2431
+ "source_turn": "turn-005",
2432
+ "speaker": "user"
2433
+ },
2434
+ "associations": [
2435
+ {
2436
+ "target_id": "memC-cw-004",
2437
+ "strength": 0.8,
2438
+ "reason": "replaces the original opening-track plan"
2439
+ }
2440
+ ]
2441
+ },
2442
+ {
2443
+ "id": "memC-cw-006",
2444
+ "type": "episodic",
2445
+ "timestamp": "2026-02-14T10:00:00Z",
2446
+ "content": "Iskandar's producer, Mette, specifically said 'Harbor Lights' felt more like a midpoint emotional beat than an opener.",
2447
+ "tags": [
2448
+ "iskandar",
2449
+ "mette"
2450
+ ],
2451
+ "importance": 0.55,
2452
+ "metadata": {
2453
+ "source_turn": "turn-006",
2454
+ "speaker": "user"
2455
+ },
2456
+ "associations": []
2457
+ },
2458
+ {
2459
+ "id": "memC-cw-007",
2460
+ "type": "episodic",
2461
+ "timestamp": "2026-03-01T09:00:00Z",
2462
+ "content": "Iskandar mentioned he'd also written an unrelated jingle for a friend's small business, completely separate from the album project.",
2463
+ "tags": [
2464
+ "iskandar",
2465
+ "distractor"
2466
+ ],
2467
+ "importance": 0.2,
2468
+ "metadata": {
2469
+ "source_turn": "turn-007",
2470
+ "speaker": "user"
2471
+ },
2472
+ "associations": []
2473
+ },
2474
+ {
2475
+ "id": "memC-cw-008",
2476
+ "type": "procedural",
2477
+ "timestamp": "2025-09-10T08:00:00Z",
2478
+ "content": "Iskandar's recording routine is to track scratch vocals first, then build instrumentation around them, then record final vocals last.",
2479
+ "tags": [
2480
+ "iskandar",
2481
+ "workflow"
2482
+ ],
2483
+ "importance": 0.5,
2484
+ "metadata": {
2485
+ "source_turn": "turn-008",
2486
+ "speaker": "user"
2487
+ },
2488
+ "associations": []
2489
+ },
2490
+ {
2491
+ "id": "memC-cw-009",
2492
+ "type": "episodic",
2493
+ "timestamp": "2026-04-01T10:00:00Z",
2494
+ "content": "Iskandar pushed the album's release date from October to January after deciding two tracks needed additional production time.",
2495
+ "tags": [
2496
+ "iskandar",
2497
+ "release"
2498
+ ],
2499
+ "importance": 0.6,
2500
+ "metadata": {
2501
+ "source_turn": "turn-009",
2502
+ "speaker": "user"
2503
+ },
2504
+ "associations": []
2505
+ },
2506
+ {
2507
+ "id": "memC-cw-010",
2508
+ "type": "episodic",
2509
+ "timestamp": "2026-04-10T11:00:00Z",
2510
+ "content": "Iskandar said the closing track, still untitled as of his last update, will feature a string arrangement recorded with a local chamber ensemble.",
2511
+ "tags": [
2512
+ "iskandar",
2513
+ "tracklist"
2514
+ ],
2515
+ "importance": 0.5,
2516
+ "metadata": {
2517
+ "source_turn": "turn-010",
2518
+ "speaker": "user"
2519
+ },
2520
+ "associations": []
2521
+ },
2522
+ {
2523
+ "id": "memC-cw-011",
2524
+ "type": "episodic",
2525
+ "timestamp": "2026-02-20T12:00:00Z",
2526
+ "content": "Following the track-order change, Iskandar confirmed the album's sequence has 'First Light' as track 1 and 'Harbor Lights' as track 4.",
2527
+ "tags": [
2528
+ "iskandar",
2529
+ "tracklist"
2530
+ ],
2531
+ "importance": 0.7,
2532
+ "metadata": {
2533
+ "source_turn": "turn-011",
2534
+ "speaker": "user"
2535
+ },
2536
+ "associations": [
2537
+ {
2538
+ "target_id": "memC-cw-005",
2539
+ "strength": 0.85,
2540
+ "reason": "confirms the reordering"
2541
+ }
2542
+ ]
2543
+ },
2544
+ {
2545
+ "id": "memC-cw-012",
2546
+ "type": "episodic",
2547
+ "timestamp": "2026-05-01T13:00:00Z",
2548
+ "content": "Iskandar has not yet decided on a record label or distribution method for the album.",
2549
+ "tags": [
2550
+ "iskandar",
2551
+ "release"
2552
+ ],
2553
+ "importance": 0.4,
2554
+ "metadata": {
2555
+ "source_turn": "turn-012",
2556
+ "speaker": "user"
2557
+ },
2558
+ "associations": []
2559
+ }
2560
+ ],
2561
+ "questions": [
2562
+ {
2563
+ "question_id": "qC-cw-01",
2564
+ "category": "atomic_fact_recall",
2565
+ "question": "What is the working title of Iskandar's album?",
2566
+ "expected_answer": "Crossing Lines",
2567
+ "acceptable_answer_criteria": [
2568
+ "names Crossing Lines"
2569
+ ],
2570
+ "required_memory_ids": [
2571
+ "memC-cw-003"
2572
+ ],
2573
+ "forbidden_memory_ids": [],
2574
+ "difficulty": "easy",
2575
+ "architecture_bias_risk": "low",
2576
+ "fairness_note": "Single stated fact."
2577
+ },
2578
+ {
2579
+ "question_id": "qC-cw-02",
2580
+ "category": "paraphrased_semantic_recall",
2581
+ "question": "What's Iskandar's typical order of operations when recording a track?",
2582
+ "expected_answer": "Scratch vocals first, then instrumentation, then final vocals last.",
2583
+ "acceptable_answer_criteria": [
2584
+ "mentions all three steps in order"
2585
+ ],
2586
+ "required_memory_ids": [
2587
+ "memC-cw-008"
2588
+ ],
2589
+ "forbidden_memory_ids": [],
2590
+ "difficulty": "medium",
2591
+ "architecture_bias_risk": "low",
2592
+ "fairness_note": "Reworded from the procedural record."
2593
+ },
2594
+ {
2595
+ "question_id": "qC-cw-03",
2596
+ "category": "temporal_update",
2597
+ "question": "When is the album currently scheduled to release?",
2598
+ "expected_answer": "January -- pushed back from the original October target.",
2599
+ "acceptable_answer_criteria": [
2600
+ "states January",
2601
+ "notes the change from October"
2602
+ ],
2603
+ "required_memory_ids": [
2604
+ "memC-cw-009"
2605
+ ],
2606
+ "forbidden_memory_ids": [],
2607
+ "difficulty": "medium",
2608
+ "architecture_bias_risk": "medium",
2609
+ "fairness_note": "Requires surfacing the updated release date."
2610
+ },
2611
+ {
2612
+ "question_id": "qC-cw-04",
2613
+ "category": "contradiction_resolution",
2614
+ "question": "Is 'Harbor Lights' currently the album's opening track?",
2615
+ "expected_answer": "No -- it was originally planned as the opener but was moved to track 4; 'First Light' is now the opening track.",
2616
+ "acceptable_answer_criteria": [
2617
+ "says no",
2618
+ "names First Light as the current opener"
2619
+ ],
2620
+ "required_memory_ids": [
2621
+ "memC-cw-011"
2622
+ ],
2623
+ "forbidden_memory_ids": [
2624
+ "memC-cw-004"
2625
+ ],
2626
+ "difficulty": "medium",
2627
+ "architecture_bias_risk": "medium",
2628
+ "fairness_note": "Tests resistance to an early, stale plan that was explicitly changed later in the conversation."
2629
+ },
2630
+ {
2631
+ "question_id": "qC-cw-05",
2632
+ "category": "multi_hop_association",
2633
+ "question": "Why did Iskandar move 'Harbor Lights' out of the opening slot?",
2634
+ "expected_answer": "His producer, Mette, felt it played more like a midpoint emotional beat than an opener, so Iskandar swapped in 'First Light' as the opening track instead.",
2635
+ "acceptable_answer_criteria": [
2636
+ "mentions Mette's feedback",
2637
+ "connects it to the track-order change"
2638
+ ],
2639
+ "required_memory_ids": [
2640
+ "memC-cw-006",
2641
+ "memC-cw-005"
2642
+ ],
2643
+ "forbidden_memory_ids": [],
2644
+ "difficulty": "hard",
2645
+ "architecture_bias_risk": "high",
2646
+ "fairness_note": "Requires linking a producer's feedback to the resulting creative decision."
2647
+ },
2648
+ {
2649
+ "question_id": "qC-cw-06",
2650
+ "category": "abstention",
2651
+ "question": "Which record label will release Iskandar's album?",
2652
+ "expected_answer": "Not enough information -- Iskandar hasn't decided on a label or distribution method yet.",
2653
+ "acceptable_answer_criteria": [
2654
+ "states this is undecided",
2655
+ "does not invent a label"
2656
+ ],
2657
+ "required_memory_ids": [
2658
+ "memC-cw-012"
2659
+ ],
2660
+ "forbidden_memory_ids": [],
2661
+ "difficulty": "medium",
2662
+ "architecture_bias_risk": "low",
2663
+ "fairness_note": "The correct answer is explicitly 'not yet decided,' which is itself stated in a record, rather than a true information gap -- testing that abstention-worthy answers are still grounded in evidence when such evidence exists."
2664
+ }
2665
+ ]
2666
+ },
2667
+ {
2668
+ "conversation_id": "convC-software-02",
2669
+ "agent_id": "agent-dev-assist",
2670
+ "domain": "software",
2671
+ "memory_records": [
2672
+ {
2673
+ "id": "memC-sw2-001",
2674
+ "type": "semantic",
2675
+ "timestamp": "2025-10-01T09:00:00Z",
2676
+ "content": "Petra is building Reviewlytic, an automated code-review tool that posts inline comments on pull requests, written in TypeScript with a Node.js backend.",
2677
+ "tags": [
2678
+ "reviewlytic",
2679
+ "tech-stack"
2680
+ ],
2681
+ "importance": 0.7,
2682
+ "metadata": {
2683
+ "source_turn": "turn-001",
2684
+ "speaker": "user"
2685
+ },
2686
+ "associations": []
2687
+ },
2688
+ {
2689
+ "id": "memC-sw2-002",
2690
+ "type": "episodic",
2691
+ "timestamp": "2025-10-05T10:00:00Z",
2692
+ "content": "Reviewlytic originally only supported GitHub pull requests.",
2693
+ "tags": [
2694
+ "reviewlytic",
2695
+ "platforms"
2696
+ ],
2697
+ "importance": 0.5,
2698
+ "metadata": {
2699
+ "source_turn": "turn-002",
2700
+ "speaker": "user"
2701
+ },
2702
+ "associations": []
2703
+ },
2704
+ {
2705
+ "id": "memC-sw2-003",
2706
+ "type": "episodic",
2707
+ "timestamp": "2026-02-10T11:00:00Z",
2708
+ "content": "Petra added GitLab merge request support, making Reviewlytic work across both GitHub and GitLab.",
2709
+ "tags": [
2710
+ "reviewlytic",
2711
+ "platforms"
2712
+ ],
2713
+ "importance": 0.6,
2714
+ "metadata": {
2715
+ "source_turn": "turn-003",
2716
+ "speaker": "user"
2717
+ },
2718
+ "associations": [
2719
+ {
2720
+ "target_id": "memC-sw2-002",
2721
+ "strength": 0.7,
2722
+ "reason": "extends platform support beyond GitHub"
2723
+ }
2724
+ ]
2725
+ },
2726
+ {
2727
+ "id": "memC-sw2-004",
2728
+ "type": "episodic",
2729
+ "timestamp": "2026-04-15T12:00:00Z",
2730
+ "content": "Petra also added Bitbucket support, so Reviewlytic now supports all three major platforms: GitHub, GitLab, and Bitbucket.",
2731
+ "tags": [
2732
+ "reviewlytic",
2733
+ "platforms"
2734
+ ],
2735
+ "importance": 0.65,
2736
+ "metadata": {
2737
+ "source_turn": "turn-004",
2738
+ "speaker": "user"
2739
+ },
2740
+ "associations": [
2741
+ {
2742
+ "target_id": "memC-sw2-003",
2743
+ "strength": 0.7,
2744
+ "reason": "extends platform support to a third platform"
2745
+ }
2746
+ ]
2747
+ },
2748
+ {
2749
+ "id": "memC-sw2-005",
2750
+ "type": "episodic",
2751
+ "timestamp": "2025-11-01T09:00:00Z",
2752
+ "content": "Petra's earliest internal build was nicknamed 'Reviewlytic Alpha,' strictly for her own testing and never released to anyone else.",
2753
+ "tags": [
2754
+ "reviewlytic",
2755
+ "alpha"
2756
+ ],
2757
+ "importance": 0.3,
2758
+ "metadata": {
2759
+ "source_turn": "turn-005",
2760
+ "speaker": "user"
2761
+ },
2762
+ "associations": []
2763
+ },
2764
+ {
2765
+ "id": "memC-sw2-006",
2766
+ "type": "episodic",
2767
+ "timestamp": "2026-01-01T10:00:00Z",
2768
+ "content": "A separate, actually-released early-access version was called 'Reviewlytic Beta,' used by a small group of 20 external testers starting in March.",
2769
+ "tags": [
2770
+ "reviewlytic",
2771
+ "beta"
2772
+ ],
2773
+ "importance": 0.5,
2774
+ "metadata": {
2775
+ "source_turn": "turn-006",
2776
+ "speaker": "user"
2777
+ },
2778
+ "associations": []
2779
+ },
2780
+ {
2781
+ "id": "memC-sw2-007",
2782
+ "type": "episodic",
2783
+ "timestamp": "2025-12-01T09:00:00Z",
2784
+ "content": "Petra initially set the per-seat price at $12/month for the planned public launch.",
2785
+ "tags": [
2786
+ "reviewlytic",
2787
+ "pricing"
2788
+ ],
2789
+ "importance": 0.5,
2790
+ "metadata": {
2791
+ "source_turn": "turn-007",
2792
+ "speaker": "user"
2793
+ },
2794
+ "associations": []
2795
+ },
2796
+ {
2797
+ "id": "memC-sw2-008",
2798
+ "type": "episodic",
2799
+ "timestamp": "2026-02-20T11:00:00Z",
2800
+ "content": "After surveying beta testers, Petra lowered the price to $8/month before the public launch to be more competitive.",
2801
+ "tags": [
2802
+ "reviewlytic",
2803
+ "pricing"
2804
+ ],
2805
+ "importance": 0.65,
2806
+ "metadata": {
2807
+ "source_turn": "turn-008",
2808
+ "speaker": "user"
2809
+ },
2810
+ "associations": [
2811
+ {
2812
+ "target_id": "memC-sw2-007",
2813
+ "strength": 0.8,
2814
+ "reason": "replaces the original $12 price"
2815
+ }
2816
+ ]
2817
+ },
2818
+ {
2819
+ "id": "memC-sw2-009",
2820
+ "type": "semantic",
2821
+ "timestamp": "2026-03-01T09:00:00Z",
2822
+ "content": "Reviewlytic's comment-posting feature currently has a known limitation: it cannot yet flag issues across multiple files in a single combined comment.",
2823
+ "tags": [
2824
+ "reviewlytic",
2825
+ "limitation"
2826
+ ],
2827
+ "importance": 0.5,
2828
+ "metadata": {
2829
+ "source_turn": "turn-009",
2830
+ "speaker": "user"
2831
+ },
2832
+ "associations": []
2833
+ },
2834
+ {
2835
+ "id": "memC-sw2-010",
2836
+ "type": "procedural",
2837
+ "timestamp": "2025-11-15T08:00:00Z",
2838
+ "content": "Petra's process for triaging a bug report is to reproduce it locally first, label it by severity, then assign it to the next sprint if it's not a security issue.",
2839
+ "tags": [
2840
+ "reviewlytic",
2841
+ "workflow"
2842
+ ],
2843
+ "importance": 0.5,
2844
+ "metadata": {
2845
+ "source_turn": "turn-010",
2846
+ "speaker": "user"
2847
+ },
2848
+ "associations": []
2849
+ },
2850
+ {
2851
+ "id": "memC-sw2-011",
2852
+ "type": "episodic",
2853
+ "timestamp": "2026-03-10T10:00:00Z",
2854
+ "content": "A tester reported a false-positive bug pattern, which Petra traced to an outdated linting ruleset bundled with Reviewlytic, not an actual flaw in the core analysis engine.",
2855
+ "tags": [
2856
+ "reviewlytic",
2857
+ "bug"
2858
+ ],
2859
+ "importance": 0.5,
2860
+ "metadata": {
2861
+ "source_turn": "turn-011",
2862
+ "speaker": "user"
2863
+ },
2864
+ "associations": []
2865
+ },
2866
+ {
2867
+ "id": "memC-sw2-012",
2868
+ "type": "episodic",
2869
+ "timestamp": "2026-05-01T11:00:00Z",
2870
+ "content": "Petra confirmed the current public per-seat price for Reviewlytic is $8/month, unchanged since the pre-launch adjustment.",
2871
+ "tags": [
2872
+ "reviewlytic",
2873
+ "pricing"
2874
+ ],
2875
+ "importance": 0.65,
2876
+ "metadata": {
2877
+ "source_turn": "turn-012",
2878
+ "speaker": "user"
2879
+ },
2880
+ "associations": [
2881
+ {
2882
+ "target_id": "memC-sw2-008",
2883
+ "strength": 0.85,
2884
+ "reason": "confirms this is still the current price"
2885
+ }
2886
+ ]
2887
+ }
2888
+ ],
2889
+ "questions": [
2890
+ {
2891
+ "question_id": "qC-sw2-01",
2892
+ "category": "atomic_fact_recall",
2893
+ "question": "What language is Reviewlytic written in?",
2894
+ "expected_answer": "TypeScript, with a Node.js backend",
2895
+ "acceptable_answer_criteria": [
2896
+ "names TypeScript",
2897
+ "names Node.js"
2898
+ ],
2899
+ "required_memory_ids": [
2900
+ "memC-sw2-001"
2901
+ ],
2902
+ "forbidden_memory_ids": [],
2903
+ "difficulty": "easy",
2904
+ "architecture_bias_risk": "low",
2905
+ "fairness_note": "Single stated fact."
2906
+ },
2907
+ {
2908
+ "question_id": "qC-sw2-02",
2909
+ "category": "paraphrased_semantic_recall",
2910
+ "question": "What's Petra's process when she gets a new bug report?",
2911
+ "expected_answer": "Reproduce it locally, label its severity, then schedule it for the next sprint unless it's a security issue.",
2912
+ "acceptable_answer_criteria": [
2913
+ "mentions reproducing locally",
2914
+ "mentions severity labeling",
2915
+ "mentions the sprint/security distinction"
2916
+ ],
2917
+ "required_memory_ids": [
2918
+ "memC-sw2-010"
2919
+ ],
2920
+ "forbidden_memory_ids": [],
2921
+ "difficulty": "medium",
2922
+ "architecture_bias_risk": "low",
2923
+ "fairness_note": "Reworded from the procedural record."
2924
+ },
2925
+ {
2926
+ "question_id": "qC-sw2-03",
2927
+ "category": "temporal_update",
2928
+ "question": "Which platforms does Reviewlytic currently support?",
2929
+ "expected_answer": "GitHub, GitLab, and Bitbucket.",
2930
+ "acceptable_answer_criteria": [
2931
+ "names all three platforms"
2932
+ ],
2933
+ "required_memory_ids": [
2934
+ "memC-sw2-004"
2935
+ ],
2936
+ "forbidden_memory_ids": [],
2937
+ "difficulty": "medium",
2938
+ "architecture_bias_risk": "medium",
2939
+ "fairness_note": "Platform support was added incrementally across three records; requires surfacing the cumulative current state, not just the first or second addition."
2940
+ },
2941
+ {
2942
+ "question_id": "qC-sw2-04",
2943
+ "category": "contradiction_resolution",
2944
+ "question": "Is Reviewlytic's per-seat price currently $12/month?",
2945
+ "expected_answer": "No -- that was the originally planned price. It was lowered to $8/month before public launch and has stayed there.",
2946
+ "acceptable_answer_criteria": [
2947
+ "says no",
2948
+ "gives the current $8 price"
2949
+ ],
2950
+ "required_memory_ids": [
2951
+ "memC-sw2-012"
2952
+ ],
2953
+ "forbidden_memory_ids": [
2954
+ "memC-sw2-007"
2955
+ ],
2956
+ "difficulty": "medium",
2957
+ "architecture_bias_risk": "medium",
2958
+ "fairness_note": "Tests resistance to an early planned price that was explicitly changed before launch."
2959
+ },
2960
+ {
2961
+ "question_id": "qC-sw2-05",
2962
+ "category": "noise_resistance",
2963
+ "question": "Did the 'Reviewlytic Alpha' build get used by external testers?",
2964
+ "expected_answer": "No -- Alpha was Petra's private internal testing build only; the externally-tested version was called 'Reviewlytic Beta.'",
2965
+ "acceptable_answer_criteria": [
2966
+ "distinguishes Alpha from Beta",
2967
+ "correctly attributes external testing to Beta"
2968
+ ],
2969
+ "required_memory_ids": [
2970
+ "memC-sw2-005",
2971
+ "memC-sw2-006"
2972
+ ],
2973
+ "forbidden_memory_ids": [],
2974
+ "difficulty": "hard",
2975
+ "architecture_bias_risk": "medium",
2976
+ "fairness_note": "Near-duplicate naming (Alpha/Beta) for the same product line, where only one was ever externally released, is a deliberate retrieval-precision trap."
2977
+ },
2978
+ {
2979
+ "question_id": "qC-sw2-06",
2980
+ "category": "abstention",
2981
+ "question": "How many paying customers does Reviewlytic currently have?",
2982
+ "expected_answer": "Not enough information -- no record states a customer count, only a tester count and pricing.",
2983
+ "acceptable_answer_criteria": [
2984
+ "states this isn't recorded",
2985
+ "doesn't conflate the 20 testers with paying customers"
2986
+ ],
2987
+ "required_memory_ids": [],
2988
+ "forbidden_memory_ids": [
2989
+ "memC-sw2-006"
2990
+ ],
2991
+ "difficulty": "medium",
2992
+ "architecture_bias_risk": "low",
2993
+ "fairness_note": "Tests whether a stated tester count gets misread as a customer count when no such figure actually exists."
2994
+ }
2995
+ ]
2996
+ },
2997
+ {
2998
+ "conversation_id": "convC-personal-assistant-02",
2999
+ "agent_id": "agent-pa",
3000
+ "domain": "personal_assistant",
3001
+ "memory_records": [
3002
+ {
3003
+ "id": "memC-pa2-001",
3004
+ "type": "semantic",
3005
+ "timestamp": "2025-09-01T09:00:00Z",
3006
+ "content": "Yusuf manages the shared family calendar and subscriptions for his household of four.",
3007
+ "tags": [
3008
+ "yusuf",
3009
+ "profile"
3010
+ ],
3011
+ "importance": 0.5,
3012
+ "metadata": {
3013
+ "source_turn": "turn-001",
3014
+ "speaker": "user"
3015
+ },
3016
+ "associations": []
3017
+ },
3018
+ {
3019
+ "id": "memC-pa2-002",
3020
+ "type": "episodic",
3021
+ "timestamp": "2025-09-05T10:00:00Z",
3022
+ "content": "Yusuf set up the family's streaming subscription on an annual plan in March rather than monthly, to save money.",
3023
+ "tags": [
3024
+ "yusuf",
3025
+ "subscriptions"
3026
+ ],
3027
+ "importance": 0.5,
3028
+ "metadata": {
3029
+ "source_turn": "turn-002",
3030
+ "speaker": "user"
3031
+ },
3032
+ "associations": []
3033
+ },
3034
+ {
3035
+ "id": "memC-pa2-003",
3036
+ "type": "episodic",
3037
+ "timestamp": "2025-09-10T11:00:00Z",
3038
+ "content": "Yusuf's son, Adam, has Tuesday evening soccer practice at 6pm.",
3039
+ "tags": [
3040
+ "yusuf",
3041
+ "adam",
3042
+ "schedule"
3043
+ ],
3044
+ "importance": 0.6,
3045
+ "metadata": {
3046
+ "source_turn": "turn-003",
3047
+ "speaker": "user"
3048
+ },
3049
+ "associations": []
3050
+ },
3051
+ {
3052
+ "id": "memC-pa2-004",
3053
+ "type": "episodic",
3054
+ "timestamp": "2025-09-12T12:00:00Z",
3055
+ "content": "Yusuf's nephew, Adnan, who visits on alternating weekends, has his own separate Saturday morning soccer practice that Yusuf isn't responsible for driving to.",
3056
+ "tags": [
3057
+ "yusuf",
3058
+ "adnan",
3059
+ "distractor"
3060
+ ],
3061
+ "importance": 0.3,
3062
+ "metadata": {
3063
+ "source_turn": "turn-004",
3064
+ "speaker": "user"
3065
+ },
3066
+ "associations": []
3067
+ },
3068
+ {
3069
+ "id": "memC-pa2-005",
3070
+ "type": "episodic",
3071
+ "timestamp": "2026-05-01T09:00:00Z",
3072
+ "content": "Adam's soccer practice moved from Tuesday evenings to Wednesday evenings because the league changed field availability.",
3073
+ "tags": [
3074
+ "yusuf",
3075
+ "adam",
3076
+ "schedule"
3077
+ ],
3078
+ "importance": 0.6,
3079
+ "metadata": {
3080
+ "source_turn": "turn-005",
3081
+ "speaker": "user"
3082
+ },
3083
+ "associations": [
3084
+ {
3085
+ "target_id": "memC-pa2-003",
3086
+ "strength": 0.8,
3087
+ "reason": "replaces the original Tuesday slot"
3088
+ }
3089
+ ]
3090
+ },
3091
+ {
3092
+ "id": "memC-pa2-006",
3093
+ "type": "semantic",
3094
+ "timestamp": "2025-09-15T08:00:00Z",
3095
+ "content": "Yusuf's daughter, Lina, is allergic to peanuts, which matters for any snacks he packs for family outings.",
3096
+ "tags": [
3097
+ "yusuf",
3098
+ "lina",
3099
+ "health"
3100
+ ],
3101
+ "importance": 0.6,
3102
+ "metadata": {
3103
+ "source_turn": "turn-006",
3104
+ "speaker": "user"
3105
+ },
3106
+ "associations": []
3107
+ },
3108
+ {
3109
+ "id": "memC-pa2-007",
3110
+ "type": "episodic",
3111
+ "timestamp": "2025-10-01T09:00:00Z",
3112
+ "content": "Yusuf originally believed the family's internet plan renewal date was the 1st of each month.",
3113
+ "tags": [
3114
+ "yusuf",
3115
+ "internet"
3116
+ ],
3117
+ "importance": 0.4,
3118
+ "metadata": {
3119
+ "source_turn": "turn-007",
3120
+ "speaker": "user"
3121
+ },
3122
+ "associations": []
3123
+ },
3124
+ {
3125
+ "id": "memC-pa2-008",
3126
+ "type": "episodic",
3127
+ "timestamp": "2025-11-15T10:00:00Z",
3128
+ "content": "After checking the actual bill, Yusuf found the internet plan actually renews on the 15th of each month, not the 1st as he'd assumed.",
3129
+ "tags": [
3130
+ "yusuf",
3131
+ "internet"
3132
+ ],
3133
+ "importance": 0.55,
3134
+ "metadata": {
3135
+ "source_turn": "turn-008",
3136
+ "speaker": "user"
3137
+ },
3138
+ "associations": [
3139
+ {
3140
+ "target_id": "memC-pa2-007",
3141
+ "strength": 0.85,
3142
+ "reason": "corrects this earlier mistaken assumption"
3143
+ }
3144
+ ]
3145
+ },
3146
+ {
3147
+ "id": "memC-pa2-009",
3148
+ "type": "procedural",
3149
+ "timestamp": "2025-09-20T08:00:00Z",
3150
+ "content": "Yusuf's process for canceling a subscription he no longer wants is to check the renewal date first, then cancel at least 3 days before that date to avoid being charged for another cycle.",
3151
+ "tags": [
3152
+ "yusuf",
3153
+ "workflow"
3154
+ ],
3155
+ "importance": 0.5,
3156
+ "metadata": {
3157
+ "source_turn": "turn-009",
3158
+ "speaker": "user"
3159
+ },
3160
+ "associations": []
3161
+ },
3162
+ {
3163
+ "id": "memC-pa2-010",
3164
+ "type": "episodic",
3165
+ "timestamp": "2026-04-01T11:00:00Z",
3166
+ "content": "Yusuf canceled a meal-kit subscription in April using that process, successfully avoiding the next billing cycle.",
3167
+ "tags": [
3168
+ "yusuf",
3169
+ "subscriptions"
3170
+ ],
3171
+ "importance": 0.5,
3172
+ "metadata": {
3173
+ "source_turn": "turn-010",
3174
+ "speaker": "user"
3175
+ },
3176
+ "associations": [
3177
+ {
3178
+ "target_id": "memC-pa2-009",
3179
+ "strength": 0.8,
3180
+ "reason": "followed the established cancellation process"
3181
+ }
3182
+ ]
3183
+ },
3184
+ {
3185
+ "id": "memC-pa2-011",
3186
+ "type": "episodic",
3187
+ "timestamp": "2026-03-01T09:00:00Z",
3188
+ "content": "Yusuf mentioned he's considering switching internet providers but hasn't made a decision yet.",
3189
+ "tags": [
3190
+ "yusuf",
3191
+ "internet"
3192
+ ],
3193
+ "importance": 0.35,
3194
+ "metadata": {
3195
+ "source_turn": "turn-011",
3196
+ "speaker": "user"
3197
+ },
3198
+ "associations": []
3199
+ },
3200
+ {
3201
+ "id": "memC-pa2-012",
3202
+ "type": "episodic",
3203
+ "timestamp": "2026-06-01T10:00:00Z",
3204
+ "content": "Yusuf confirmed that, as of June, Adam's soccer practice remains on Wednesday evenings at 6pm.",
3205
+ "tags": [
3206
+ "yusuf",
3207
+ "adam",
3208
+ "schedule"
3209
+ ],
3210
+ "importance": 0.65,
3211
+ "metadata": {
3212
+ "source_turn": "turn-012",
3213
+ "speaker": "user"
3214
+ },
3215
+ "associations": [
3216
+ {
3217
+ "target_id": "memC-pa2-005",
3218
+ "strength": 0.85,
3219
+ "reason": "confirms the moved Wednesday slot is still current"
3220
+ }
3221
+ ]
3222
+ }
3223
+ ],
3224
+ "questions": [
3225
+ {
3226
+ "question_id": "qC-pa2-01",
3227
+ "category": "atomic_fact_recall",
3228
+ "question": "What is Lina allergic to?",
3229
+ "expected_answer": "Peanuts",
3230
+ "acceptable_answer_criteria": [
3231
+ "states peanuts"
3232
+ ],
3233
+ "required_memory_ids": [
3234
+ "memC-pa2-006"
3235
+ ],
3236
+ "forbidden_memory_ids": [],
3237
+ "difficulty": "easy",
3238
+ "architecture_bias_risk": "low",
3239
+ "fairness_note": "Single stated fact."
3240
+ },
3241
+ {
3242
+ "question_id": "qC-pa2-02",
3243
+ "category": "paraphrased_semantic_recall",
3244
+ "question": "What does Yusuf do before canceling a subscription he doesn't want anymore?",
3245
+ "expected_answer": "He checks the renewal date and cancels at least 3 days before it to avoid another billing cycle.",
3246
+ "acceptable_answer_criteria": [
3247
+ "mentions checking the renewal date",
3248
+ "mentions the 3-day buffer"
3249
+ ],
3250
+ "required_memory_ids": [
3251
+ "memC-pa2-009"
3252
+ ],
3253
+ "forbidden_memory_ids": [],
3254
+ "difficulty": "medium",
3255
+ "architecture_bias_risk": "low",
3256
+ "fairness_note": "Reworded from the procedural record."
3257
+ },
3258
+ {
3259
+ "question_id": "qC-pa2-03",
3260
+ "category": "temporal_update",
3261
+ "question": "What day is Adam's soccer practice currently on?",
3262
+ "expected_answer": "Wednesday evenings at 6pm -- moved from the original Tuesday slot.",
3263
+ "acceptable_answer_criteria": [
3264
+ "states Wednesday 6pm",
3265
+ "notes the change from Tuesday"
3266
+ ],
3267
+ "required_memory_ids": [
3268
+ "memC-pa2-005",
3269
+ "memC-pa2-012"
3270
+ ],
3271
+ "forbidden_memory_ids": [],
3272
+ "difficulty": "medium",
3273
+ "architecture_bias_risk": "medium",
3274
+ "fairness_note": "Requires surfacing the updated day over the originally stated one."
3275
+ },
3276
+ {
3277
+ "question_id": "qC-pa2-04",
3278
+ "category": "contradiction_resolution",
3279
+ "question": "Does the family's internet plan renew on the 1st of the month?",
3280
+ "expected_answer": "No -- Yusuf initially assumed that, but checking the bill showed it actually renews on the 15th.",
3281
+ "acceptable_answer_criteria": [
3282
+ "says no",
3283
+ "gives the corrected 15th date"
3284
+ ],
3285
+ "required_memory_ids": [
3286
+ "memC-pa2-008"
3287
+ ],
3288
+ "forbidden_memory_ids": [
3289
+ "memC-pa2-007"
3290
+ ],
3291
+ "difficulty": "medium",
3292
+ "architecture_bias_risk": "medium",
3293
+ "fairness_note": "The 'stale' fact here is a mistaken assumption rather than a changed real-world state, testing whether corrections are handled as well as updates."
3294
+ },
3295
+ {
3296
+ "question_id": "qC-pa2-05",
3297
+ "category": "noise_resistance",
3298
+ "question": "Is Yusuf responsible for driving Adnan to his Saturday soccer practice?",
3299
+ "expected_answer": "No -- Adnan is Yusuf's nephew with his own separate practice; Yusuf isn't responsible for that drive. Adam, Yusuf's son, is the one whose soccer schedule Yusuf manages.",
3300
+ "acceptable_answer_criteria": [
3301
+ "distinguishes Adam from Adnan",
3302
+ "correctly states Yusuf isn't responsible for Adnan's practice"
3303
+ ],
3304
+ "required_memory_ids": [
3305
+ "memC-pa2-003",
3306
+ "memC-pa2-004"
3307
+ ],
3308
+ "forbidden_memory_ids": [],
3309
+ "difficulty": "hard",
3310
+ "architecture_bias_risk": "medium",
3311
+ "fairness_note": "Near-duplicate names (Adam/Adnan) referring to a son versus a nephew, with different responsibilities, is a deliberate adversarial trap."
3312
+ },
3313
+ {
3314
+ "question_id": "qC-pa2-06",
3315
+ "category": "abstention",
3316
+ "question": "Which internet provider did Yusuf switch to?",
3317
+ "expected_answer": "Not enough information -- he's only considering switching and hasn't decided or made a switch yet.",
3318
+ "acceptable_answer_criteria": [
3319
+ "states this isn't decided yet",
3320
+ "does not name a provider"
3321
+ ],
3322
+ "required_memory_ids": [
3323
+ "memC-pa2-011"
3324
+ ],
3325
+ "forbidden_memory_ids": [],
3326
+ "difficulty": "medium",
3327
+ "architecture_bias_risk": "low",
3328
+ "fairness_note": "Tests whether 'considering switching' gets incorrectly treated as evidence that a switch already happened."
3329
+ }
3330
+ ]
3331
+ }
3332
+ ]
3333
+ }