@1mbrain/benchmarks 0.1.1

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (69) hide show
  1. package/README.md +85 -0
  2. package/fixtures/1mbrain-focused-mini/1mbrain-focused-mini.json +928 -0
  3. package/fixtures/1mbrain-focused-mini/README.md +45 -0
  4. package/fixtures/adversarial-memory/dataset_claude_adversarial.json +3333 -0
  5. package/fixtures/adversarial-memory/dataset_gemini_adversarial_memory.json +2984 -0
  6. package/fixtures/balanced-mini/dataset_claude_balanced_mini.json +2077 -0
  7. package/fixtures/balanced-mini/dataset_gemini_balanced_mini.json +1995 -0
  8. package/fixtures/generate_datasets.js +1741 -0
  9. package/fixtures/graph-stress-hard/README.md +43 -0
  10. package/fixtures/graph-stress-hard/dataset_graph_stress_hard.json +4374 -0
  11. package/fixtures/graph-stress-hard/generate_graph_stress_hard.js +526 -0
  12. package/fixtures/realistic-medium/dataset_claude_realistic_medium.json +7462 -0
  13. package/fixtures/realistic-medium/dataset_gemini_realistic_medium.json +7277 -0
  14. package/fixtures/realistic-medium/gen_claude_medium.js +600 -0
  15. package/package.json +22 -0
  16. package/reports/benchmark_report.md +48 -0
  17. package/reports/benchmark_report_claude_adversarial.md +42 -0
  18. package/reports/benchmark_report_claude_adversarial_adaptive.md +42 -0
  19. package/reports/benchmark_report_claude_adversarial_adaptive2_fast.md +42 -0
  20. package/reports/benchmark_report_claude_adversarial_adaptive_fast.md +42 -0
  21. package/reports/benchmark_report_claude_adversarial_rerank.md +42 -0
  22. package/reports/benchmark_report_claude_balanced_mini.md +42 -0
  23. package/reports/benchmark_report_claude_balanced_mini_adaptive.md +42 -0
  24. package/reports/benchmark_report_claude_balanced_mini_adaptive2_fast.md +42 -0
  25. package/reports/benchmark_report_claude_balanced_mini_adaptive_fast.md +42 -0
  26. package/reports/benchmark_report_claude_balanced_mini_rerank.md +42 -0
  27. package/reports/benchmark_report_claude_realistic_medium.md +42 -0
  28. package/reports/benchmark_report_claude_realistic_medium_adaptive.md +42 -0
  29. package/reports/benchmark_report_claude_realistic_medium_adaptive2_fast.md +42 -0
  30. package/reports/benchmark_report_claude_realistic_medium_adaptive_fast.md +42 -0
  31. package/reports/benchmark_report_claude_realistic_medium_evidence_rerank_local.md +42 -0
  32. package/reports/benchmark_report_claude_realistic_medium_openai_evidence_rerank.md +41 -0
  33. package/reports/benchmark_report_claude_realistic_medium_openai_multi_signal.md +41 -0
  34. package/reports/benchmark_report_claude_realistic_medium_openai_multi_signal_scoped.md +41 -0
  35. package/reports/benchmark_report_claude_realistic_medium_openai_phase8_no_judge.md +42 -0
  36. package/reports/benchmark_report_claude_realistic_medium_openai_rankingpolicy.md +41 -0
  37. package/reports/benchmark_report_claude_realistic_medium_openai_stale_filter.md +41 -0
  38. package/reports/benchmark_report_claude_realistic_medium_openai_stale_filter_absence_fix.md +41 -0
  39. package/reports/benchmark_report_claude_realistic_medium_openai_write_time_invalidation.md +41 -0
  40. package/reports/benchmark_report_claude_realistic_medium_rerank.md +42 -0
  41. package/reports/benchmark_report_claude_realistic_medium_stale_filter_local.md +42 -0
  42. package/reports/benchmark_report_graph_stress_hard.md +42 -0
  43. package/reports/benchmark_report_graph_stress_hard_absence_fix.md +42 -0
  44. package/reports/benchmark_report_graph_stress_hard_adaptive.md +42 -0
  45. package/reports/benchmark_report_graph_stress_hard_evidence_rerank.md +42 -0
  46. package/reports/benchmark_report_graph_stress_hard_multi_signal_current_guardrail.md +42 -0
  47. package/reports/benchmark_report_graph_stress_hard_multi_signal_guardrail_fixed.md +42 -0
  48. package/reports/benchmark_report_graph_stress_hard_multi_signal_local.md +42 -0
  49. package/reports/benchmark_report_graph_stress_hard_multi_signal_scoped_guardrail.md +42 -0
  50. package/reports/benchmark_report_graph_stress_hard_multi_signal_vector_pure_guardrail.md +42 -0
  51. package/reports/benchmark_report_graph_stress_hard_phase8_sdk_guardrail.md +42 -0
  52. package/reports/benchmark_report_graph_stress_hard_rerank.md +42 -0
  53. package/reports/benchmark_report_graph_stress_hard_stale_filter.md +42 -0
  54. package/reports/benchmark_report_graph_stress_hard_write_time_invalidation.md +42 -0
  55. package/results/.gitignore +2 -0
  56. package/src/adapters/1mbrain.ts +317 -0
  57. package/src/adapters/keyword-embedding.ts +48 -0
  58. package/src/adapters/mem0.ts +124 -0
  59. package/src/adapters/qdrant.ts +214 -0
  60. package/src/adapters/unavailable.ts +49 -0
  61. package/src/adapters/vector-baseline.ts +149 -0
  62. package/src/datasets/focused-mini.ts +158 -0
  63. package/src/datasets/synthetic-agent-memory.ts +532 -0
  64. package/src/llm-evaluator.ts +262 -0
  65. package/src/metrics.ts +482 -0
  66. package/src/provider.ts +151 -0
  67. package/src/runner.ts +635 -0
  68. package/tsconfig.json +10 -0
  69. package/tsconfig.tsbuildinfo +1 -0
@@ -0,0 +1,1995 @@
1
+ {
2
+ "name": "balanced-mini",
3
+ "description": "Fast balanced memory benchmark smoke test containing 8 conversations with 72 memory records and 40 questions.",
4
+ "generated_at": "2026-06-19",
5
+ "fairness_notes": [
6
+ "No memory passport, decay, or explicit graph schema dependencies are assumed.",
7
+ "Questions are distributed across exact retrieval, paraphrased semantic queries, and temporal updates.",
8
+ "Graph-heavy questions are kept below 15% to prevent bias towards graph memory models."
9
+ ],
10
+ "conversations": [
11
+ {
12
+ "conversation_id": "da_c01_fastapi",
13
+ "agent_id": "da_agent_fastapi",
14
+ "domain": "software",
15
+ "memory_records": [
16
+ {
17
+ "id": "da_c01_m01",
18
+ "type": "semantic",
19
+ "timestamp": "2026-06-10T10:00:00Z",
20
+ "content": "The developer prefers using Uvicorn with 4 workers for local development of the FastAPI server.",
21
+ "tags": [
22
+ "fastapi",
23
+ "uvicorn",
24
+ "local-dev"
25
+ ],
26
+ "importance": 0.8,
27
+ "metadata": {
28
+ "source_turn": "t01",
29
+ "speaker": "user"
30
+ }
31
+ },
32
+ {
33
+ "id": "da_c01_m02",
34
+ "type": "semantic",
35
+ "timestamp": "2026-06-10T10:05:00Z",
36
+ "content": "The database connection URI template is postgresql://user:pass@localhost:5432/dev_db.",
37
+ "tags": [
38
+ "fastapi",
39
+ "database",
40
+ "postgres"
41
+ ],
42
+ "importance": 0.85,
43
+ "metadata": {
44
+ "source_turn": "t02",
45
+ "speaker": "user"
46
+ }
47
+ },
48
+ {
49
+ "id": "da_c01_m03",
50
+ "type": "procedural",
51
+ "timestamp": "2026-06-10T10:10:00Z",
52
+ "content": "To update the database schema, run the command 'alembic upgrade head' in the root directory.",
53
+ "tags": [
54
+ "fastapi",
55
+ "alembic",
56
+ "migration"
57
+ ],
58
+ "importance": 0.9,
59
+ "metadata": {
60
+ "source_turn": "t03",
61
+ "speaker": "user"
62
+ }
63
+ },
64
+ {
65
+ "id": "da_c01_m04",
66
+ "type": "episodic",
67
+ "timestamp": "2026-06-10T10:15:00Z",
68
+ "content": "At 09:00, the database connection failed because the local Postgres container was stopped.",
69
+ "tags": [
70
+ "fastapi",
71
+ "database",
72
+ "error"
73
+ ],
74
+ "importance": 0.7,
75
+ "metadata": {
76
+ "source_turn": "t04",
77
+ "speaker": "assistant"
78
+ }
79
+ },
80
+ {
81
+ "id": "da_c01_m05",
82
+ "type": "semantic",
83
+ "timestamp": "2026-06-11T09:00:00Z",
84
+ "content": "On June 11, the developer decided that SQLite should be used for testing, replacing Postgres.",
85
+ "tags": [
86
+ "fastapi",
87
+ "database",
88
+ "testing"
89
+ ],
90
+ "importance": 0.8,
91
+ "metadata": {
92
+ "source_turn": "t05",
93
+ "speaker": "user"
94
+ }
95
+ },
96
+ {
97
+ "id": "da_c01_m06",
98
+ "type": "episodic",
99
+ "timestamp": "2026-06-11T09:10:00Z",
100
+ "content": "At 10:15, the developer reported that tests passed using the SQLite memory driver.",
101
+ "tags": [
102
+ "fastapi",
103
+ "testing",
104
+ "success"
105
+ ],
106
+ "importance": 0.75,
107
+ "metadata": {
108
+ "source_turn": "t06",
109
+ "speaker": "assistant"
110
+ }
111
+ },
112
+ {
113
+ "id": "da_c01_m07",
114
+ "type": "semantic",
115
+ "timestamp": "2026-06-12T11:00:00Z",
116
+ "content": "Noise: The project logo uses a green and blue color palette representing growth and stability.",
117
+ "tags": [
118
+ "noise",
119
+ "logo"
120
+ ],
121
+ "importance": 0.2,
122
+ "metadata": {
123
+ "source_turn": "t07",
124
+ "speaker": "user"
125
+ }
126
+ },
127
+ {
128
+ "id": "da_c01_m08",
129
+ "type": "semantic",
130
+ "timestamp": "2026-06-12T11:05:00Z",
131
+ "content": "Noise: The server logs are stored in the folder /var/log/fastapi/app.log.",
132
+ "tags": [
133
+ "noise",
134
+ "logs"
135
+ ],
136
+ "importance": 0.3,
137
+ "metadata": {
138
+ "source_turn": "t08",
139
+ "speaker": "assistant"
140
+ }
141
+ },
142
+ {
143
+ "id": "da_c01_m09",
144
+ "type": "semantic",
145
+ "timestamp": "2026-06-13T14:00:00Z",
146
+ "content": "The latest preference is to deploy on GCP Cloud Run using dockerized builds, replacing the older AWS ECS plan.",
147
+ "tags": [
148
+ "fastapi",
149
+ "deployment",
150
+ "gcp"
151
+ ],
152
+ "importance": 0.9,
153
+ "metadata": {
154
+ "source_turn": "t09",
155
+ "speaker": "user"
156
+ }
157
+ }
158
+ ],
159
+ "questions": [
160
+ {
161
+ "question_id": "da_c01_q01",
162
+ "category": "atomic_fact_recall",
163
+ "question": "What connection URI template is configured for the dev database?",
164
+ "expected_answer": "postgresql://user:pass@localhost:5432/dev_db",
165
+ "acceptable_answer_criteria": [
166
+ "postgresql://user:pass@localhost:5432/dev_db",
167
+ "Postgres dev_db URI"
168
+ ],
169
+ "required_memory_ids": [
170
+ "da_c01_m02"
171
+ ],
172
+ "forbidden_memory_ids": [],
173
+ "difficulty": "easy",
174
+ "architecture_bias_risk": "low",
175
+ "fairness_note": "Direct exact-match fact retrieval; basic vector search works perfectly."
176
+ },
177
+ {
178
+ "question_id": "da_c01_q02",
179
+ "category": "paraphrased_semantic_recall",
180
+ "question": "What is the preferred setup for running the local API server?",
181
+ "expected_answer": "Uvicorn with 4 workers",
182
+ "acceptable_answer_criteria": [
183
+ "Uvicorn",
184
+ "4 workers"
185
+ ],
186
+ "required_memory_ids": [
187
+ "da_c01_m01"
188
+ ],
189
+ "forbidden_memory_ids": [],
190
+ "difficulty": "medium",
191
+ "architecture_bias_risk": "low",
192
+ "fairness_note": "Paraphrasing evaluates semantic similarity retrieval rather than simple keyword overlap."
193
+ },
194
+ {
195
+ "question_id": "da_c01_q03",
196
+ "category": "temporal_update",
197
+ "question": "Where should the application be deployed according to the latest decision?",
198
+ "expected_answer": "GCP Cloud Run using dockerized builds",
199
+ "acceptable_answer_criteria": [
200
+ "GCP Cloud Run",
201
+ "GCP",
202
+ "Cloud Run"
203
+ ],
204
+ "required_memory_ids": [
205
+ "da_c01_m09"
206
+ ],
207
+ "forbidden_memory_ids": [],
208
+ "difficulty": "medium",
209
+ "architecture_bias_risk": "low",
210
+ "fairness_note": "Requires checking timestamps or explicit updates to ignore the outdated AWS plan."
211
+ },
212
+ {
213
+ "question_id": "da_c01_q04",
214
+ "category": "procedural_recall",
215
+ "question": "What command must be executed to migrate the database schema?",
216
+ "expected_answer": "alembic upgrade head",
217
+ "acceptable_answer_criteria": [
218
+ "alembic upgrade head",
219
+ "alembic"
220
+ ],
221
+ "required_memory_ids": [
222
+ "da_c01_m03"
223
+ ],
224
+ "forbidden_memory_ids": [],
225
+ "difficulty": "easy",
226
+ "architecture_bias_risk": "low",
227
+ "fairness_note": "Retrieves a simple step-by-step procedural instruction."
228
+ },
229
+ {
230
+ "question_id": "da_c01_q05",
231
+ "category": "noise_resistance",
232
+ "question": "What is the file path where FastAPI server log files are written?",
233
+ "expected_answer": "/var/log/fastapi/app.log",
234
+ "acceptable_answer_criteria": [
235
+ "/var/log/fastapi/app.log",
236
+ "/var/log/fastapi"
237
+ ],
238
+ "required_memory_ids": [
239
+ "da_c01_m08"
240
+ ],
241
+ "forbidden_memory_ids": [
242
+ "da_c01_m07"
243
+ ],
244
+ "difficulty": "easy",
245
+ "architecture_bias_risk": "low",
246
+ "fairness_note": "Verifies the system can ignore unrelated design/color noise."
247
+ }
248
+ ]
249
+ },
250
+ {
251
+ "conversation_id": "da_c02_smart_home",
252
+ "agent_id": "da_agent_smart_home",
253
+ "domain": "personal_assistant",
254
+ "memory_records": [
255
+ {
256
+ "id": "da_c02_m01",
257
+ "type": "semantic",
258
+ "timestamp": "2026-06-12T08:00:00Z",
259
+ "content": "The family prefers keeping the living room thermostat at 71 degrees Fahrenheit during daytime.",
260
+ "tags": [
261
+ "home",
262
+ "thermostat",
263
+ "temperature"
264
+ ],
265
+ "importance": 0.8,
266
+ "metadata": {
267
+ "source_turn": "t01",
268
+ "speaker": "user"
269
+ }
270
+ },
271
+ {
272
+ "id": "da_c02_m02",
273
+ "type": "semantic",
274
+ "timestamp": "2026-06-12T08:05:00Z",
275
+ "content": "The smart lock code for the back door is set to 4920 for house cleaner access.",
276
+ "tags": [
277
+ "home",
278
+ "security",
279
+ "smart-lock"
280
+ ],
281
+ "importance": 0.9,
282
+ "metadata": {
283
+ "source_turn": "t02",
284
+ "speaker": "user"
285
+ }
286
+ },
287
+ {
288
+ "id": "da_c02_m03",
289
+ "type": "episodic",
290
+ "timestamp": "2026-06-12T08:10:00Z",
291
+ "content": "The house cleaner, Maria, visits every Thursday at 10:00 AM.",
292
+ "tags": [
293
+ "schedule",
294
+ "cleaner",
295
+ "house"
296
+ ],
297
+ "importance": 0.75,
298
+ "metadata": {
299
+ "source_turn": "t03",
300
+ "speaker": "user"
301
+ },
302
+ "associations": [
303
+ {
304
+ "target_id": "da_c02_m02",
305
+ "strength": 0.85,
306
+ "reason": "cleaner uses smart lock code"
307
+ }
308
+ ]
309
+ },
310
+ {
311
+ "id": "da_c02_m04",
312
+ "type": "semantic",
313
+ "timestamp": "2026-06-13T09:00:00Z",
314
+ "content": "Noise: The living room couch is covered with a beige slipcover to prevent dog hair stains.",
315
+ "tags": [
316
+ "noise",
317
+ "furniture"
318
+ ],
319
+ "importance": 0.2,
320
+ "metadata": {
321
+ "source_turn": "t04",
322
+ "speaker": "user"
323
+ }
324
+ },
325
+ {
326
+ "id": "da_c02_m05",
327
+ "type": "semantic",
328
+ "timestamp": "2026-06-14T09:00:00Z",
329
+ "content": "On June 14, the daytime living room thermostat preference was updated to 73 degrees Fahrenheit for energy saving.",
330
+ "tags": [
331
+ "home",
332
+ "thermostat",
333
+ "temperature"
334
+ ],
335
+ "importance": 0.85,
336
+ "metadata": {
337
+ "source_turn": "t05",
338
+ "speaker": "user"
339
+ }
340
+ },
341
+ {
342
+ "id": "da_c02_m06",
343
+ "type": "episodic",
344
+ "timestamp": "2026-06-14T10:00:00Z",
345
+ "content": "The back door smart lock battery level dropped to 10% and needs replacement soon.",
346
+ "tags": [
347
+ "home",
348
+ "security",
349
+ "battery"
350
+ ],
351
+ "importance": 0.7,
352
+ "metadata": {
353
+ "source_turn": "t06",
354
+ "speaker": "assistant"
355
+ }
356
+ },
357
+ {
358
+ "id": "da_c02_m07",
359
+ "type": "semantic",
360
+ "timestamp": "2026-06-15T11:00:00Z",
361
+ "content": "The dog's name is Barnaby, and he eats dry kibble twice a day at 8:00 AM and 6:00 PM.",
362
+ "tags": [
363
+ "pet",
364
+ "schedule",
365
+ "dog"
366
+ ],
367
+ "importance": 0.8,
368
+ "metadata": {
369
+ "source_turn": "t07",
370
+ "speaker": "user"
371
+ }
372
+ },
373
+ {
374
+ "id": "da_c02_m08",
375
+ "type": "semantic",
376
+ "timestamp": "2026-06-15T11:15:00Z",
377
+ "content": "Noise: The backyard sprinkler system runs on Mondays and Thursdays at 6:00 AM.",
378
+ "tags": [
379
+ "noise",
380
+ "sprinklers"
381
+ ],
382
+ "importance": 0.3,
383
+ "metadata": {
384
+ "source_turn": "t08",
385
+ "speaker": "user"
386
+ }
387
+ },
388
+ {
389
+ "id": "da_c02_m09",
390
+ "type": "semantic",
391
+ "timestamp": "2026-06-16T12:00:00Z",
392
+ "content": "The house cleaner's contact number is 555-0192.",
393
+ "tags": [
394
+ "cleaner",
395
+ "contact"
396
+ ],
397
+ "importance": 0.85,
398
+ "metadata": {
399
+ "source_turn": "t09",
400
+ "speaker": "user"
401
+ }
402
+ }
403
+ ],
404
+ "questions": [
405
+ {
406
+ "question_id": "da_c02_q01",
407
+ "category": "atomic_fact_recall",
408
+ "question": "What is the contact number of the house cleaner?",
409
+ "expected_answer": "555-0192",
410
+ "acceptable_answer_criteria": [
411
+ "555-0192",
412
+ "cleaner's phone number is 555-0192"
413
+ ],
414
+ "required_memory_ids": [
415
+ "da_c02_m09"
416
+ ],
417
+ "forbidden_memory_ids": [],
418
+ "difficulty": "easy",
419
+ "architecture_bias_risk": "low",
420
+ "fairness_note": "Basic keyword overlap fact retrieval."
421
+ },
422
+ {
423
+ "question_id": "da_c02_q02",
424
+ "category": "paraphrased_semantic_recall",
425
+ "question": "What is the feeding schedule for Barnaby?",
426
+ "expected_answer": "Twice a day at 8:00 AM and 6:00 PM.",
427
+ "acceptable_answer_criteria": [
428
+ "8:00 AM and 6:00 PM",
429
+ "twice a day"
430
+ ],
431
+ "required_memory_ids": [
432
+ "da_c02_m07"
433
+ ],
434
+ "forbidden_memory_ids": [],
435
+ "difficulty": "medium",
436
+ "architecture_bias_risk": "low",
437
+ "fairness_note": "Tests semantic mapping of 'feeding schedule' to 'eats dry kibble'."
438
+ },
439
+ {
440
+ "question_id": "da_c02_q03",
441
+ "category": "temporal_update",
442
+ "question": "What temperature should the living room be set to during the day?",
443
+ "expected_answer": "73 degrees Fahrenheit",
444
+ "acceptable_answer_criteria": [
445
+ "73 degrees",
446
+ "73 F",
447
+ "73"
448
+ ],
449
+ "required_memory_ids": [
450
+ "da_c02_m05"
451
+ ],
452
+ "forbidden_memory_ids": [
453
+ "da_c02_m01"
454
+ ],
455
+ "difficulty": "medium",
456
+ "architecture_bias_risk": "low",
457
+ "fairness_note": "Requires checking timestamps to ensure the newer 73F setting overrides the older 71F setting."
458
+ },
459
+ {
460
+ "question_id": "da_c02_q04",
461
+ "category": "contradiction_resolution",
462
+ "question": "Is the living room thermostat still set to 71 degrees?",
463
+ "expected_answer": "No, it was updated to 73 degrees Fahrenheit on June 14.",
464
+ "acceptable_answer_criteria": [
465
+ "No",
466
+ "Updated to 73"
467
+ ],
468
+ "required_memory_ids": [
469
+ "da_c02_m05",
470
+ "da_c02_m01"
471
+ ],
472
+ "forbidden_memory_ids": [],
473
+ "difficulty": "medium",
474
+ "architecture_bias_risk": "low",
475
+ "fairness_note": "Resolves contradiction by looking at the update history."
476
+ },
477
+ {
478
+ "question_id": "da_c02_q05",
479
+ "category": "multi_hop_association",
480
+ "question": "What door entry code does Maria need when she visits on Thursdays?",
481
+ "expected_answer": "4920",
482
+ "acceptable_answer_criteria": [
483
+ "4920",
484
+ "smart lock code 4920"
485
+ ],
486
+ "required_memory_ids": [
487
+ "da_c02_m02",
488
+ "da_c02_m03"
489
+ ],
490
+ "forbidden_memory_ids": [],
491
+ "difficulty": "hard",
492
+ "architecture_bias_risk": "medium",
493
+ "fairness_note": "Requires linking Maria -> house cleaner -> smart lock code (4920). Solvable with either graph search or multi-evidence semantic retrieval."
494
+ }
495
+ ]
496
+ },
497
+ {
498
+ "conversation_id": "da_c03_quantum_sim",
499
+ "agent_id": "da_agent_quantum_sim",
500
+ "domain": "research",
501
+ "memory_records": [
502
+ {
503
+ "id": "da_c03_m01",
504
+ "type": "semantic",
505
+ "timestamp": "2026-06-15T09:00:00Z",
506
+ "content": "The quantum simulator code uses a grid spacing parameter dx = 0.05 microns.",
507
+ "tags": [
508
+ "physics",
509
+ "quantum",
510
+ "simulation"
511
+ ],
512
+ "importance": 0.8,
513
+ "metadata": {
514
+ "source_turn": "t01",
515
+ "speaker": "user"
516
+ }
517
+ },
518
+ {
519
+ "id": "da_c03_m02",
520
+ "type": "semantic",
521
+ "timestamp": "2026-06-15T09:05:00Z",
522
+ "content": "The Hamiltonian solver converges only when the relaxation factor omega is set to 1.25.",
523
+ "tags": [
524
+ "physics",
525
+ "solver",
526
+ "convergence"
527
+ ],
528
+ "importance": 0.85,
529
+ "metadata": {
530
+ "source_turn": "t02",
531
+ "speaker": "user"
532
+ }
533
+ },
534
+ {
535
+ "id": "da_c03_m03",
536
+ "type": "episodic",
537
+ "timestamp": "2026-06-15T09:10:00Z",
538
+ "content": "Simulation run #402 stalled after 1500 iterations due to a floating point exception in the grid boundary code.",
539
+ "tags": [
540
+ "physics",
541
+ "simulation",
542
+ "bug"
543
+ ],
544
+ "importance": 0.75,
545
+ "metadata": {
546
+ "source_turn": "t03",
547
+ "speaker": "assistant"
548
+ }
549
+ },
550
+ {
551
+ "id": "da_c03_m04",
552
+ "type": "semantic",
553
+ "timestamp": "2026-06-15T09:15:00Z",
554
+ "content": "Noise: Dr. Henderson recommends using Python's NumPy library instead of SciPy for matrix division.",
555
+ "tags": [
556
+ "noise",
557
+ "numpy"
558
+ ],
559
+ "importance": 0.3,
560
+ "metadata": {
561
+ "source_turn": "t04",
562
+ "speaker": "user"
563
+ }
564
+ },
565
+ {
566
+ "id": "da_c03_m05",
567
+ "type": "semantic",
568
+ "timestamp": "2026-06-16T10:00:00Z",
569
+ "content": "The grid spacing parameter dx was decreased to 0.02 microns to resolve numerical stability issues.",
570
+ "tags": [
571
+ "physics",
572
+ "quantum",
573
+ "simulation"
574
+ ],
575
+ "importance": 0.85,
576
+ "metadata": {
577
+ "source_turn": "t05",
578
+ "speaker": "user"
579
+ }
580
+ },
581
+ {
582
+ "id": "da_c03_m06",
583
+ "type": "semantic",
584
+ "timestamp": "2026-06-16T10:10:00Z",
585
+ "content": "Noise: The simulator outputs are saved in high-density HDF5 format by default.",
586
+ "tags": [
587
+ "noise",
588
+ "hdf5"
589
+ ],
590
+ "importance": 0.25,
591
+ "metadata": {
592
+ "source_turn": "t06",
593
+ "speaker": "assistant"
594
+ }
595
+ },
596
+ {
597
+ "id": "da_c03_m07",
598
+ "type": "semantic",
599
+ "timestamp": "2026-06-17T11:00:00Z",
600
+ "content": "The quantum simulator utilizes the Crank-Nicolson method for time-stepping calculations.",
601
+ "tags": [
602
+ "physics",
603
+ "simulation",
604
+ "math"
605
+ ],
606
+ "importance": 0.8,
607
+ "metadata": {
608
+ "source_turn": "t07",
609
+ "speaker": "user"
610
+ }
611
+ },
612
+ {
613
+ "id": "da_c03_m08",
614
+ "type": "semantic",
615
+ "timestamp": "2026-06-17T11:20:00Z",
616
+ "content": "We use the GPU-accelerated CuPy backend for sparse matrix operations.",
617
+ "tags": [
618
+ "physics",
619
+ "simulation",
620
+ "gpu"
621
+ ],
622
+ "importance": 0.85,
623
+ "metadata": {
624
+ "source_turn": "t08",
625
+ "speaker": "user"
626
+ }
627
+ },
628
+ {
629
+ "id": "da_c03_m09",
630
+ "type": "semantic",
631
+ "timestamp": "2026-06-18T13:00:00Z",
632
+ "content": "The research project is titled 'Project Quasar' and is funded until December 2027.",
633
+ "tags": [
634
+ "research",
635
+ "admin"
636
+ ],
637
+ "importance": 0.7,
638
+ "metadata": {
639
+ "source_turn": "t09",
640
+ "speaker": "user"
641
+ }
642
+ }
643
+ ],
644
+ "questions": [
645
+ {
646
+ "question_id": "da_c03_q01",
647
+ "category": "atomic_fact_recall",
648
+ "question": "What relaxation factor is required for the Hamiltonian solver to converge?",
649
+ "expected_answer": "1.25",
650
+ "acceptable_answer_criteria": [
651
+ "1.25",
652
+ "omega is set to 1.25"
653
+ ],
654
+ "required_memory_ids": [
655
+ "da_c03_m02"
656
+ ],
657
+ "forbidden_memory_ids": [],
658
+ "difficulty": "easy",
659
+ "architecture_bias_risk": "low",
660
+ "fairness_note": "Direct exact match fact retrieval."
661
+ },
662
+ {
663
+ "question_id": "da_c03_q02",
664
+ "category": "paraphrased_semantic_recall",
665
+ "question": "What mathematical approach is utilized for updating calculations across time?",
666
+ "expected_answer": "Crank-Nicolson method",
667
+ "acceptable_answer_criteria": [
668
+ "Crank-Nicolson method",
669
+ "Crank-Nicolson"
670
+ ],
671
+ "required_memory_ids": [
672
+ "da_c03_m07"
673
+ ],
674
+ "forbidden_memory_ids": [],
675
+ "difficulty": "medium",
676
+ "architecture_bias_risk": "low",
677
+ "fairness_note": "Tests paraphrased query matching by mapping 'updating calculations across time' to 'time-stepping'."
678
+ },
679
+ {
680
+ "question_id": "da_c03_q03",
681
+ "category": "temporal_update",
682
+ "question": "What is the current grid spacing parameter dx used in the simulation?",
683
+ "expected_answer": "0.02 microns",
684
+ "acceptable_answer_criteria": [
685
+ "0.02 microns",
686
+ "0.02"
687
+ ],
688
+ "required_memory_ids": [
689
+ "da_c03_m05"
690
+ ],
691
+ "forbidden_memory_ids": [
692
+ "da_c03_m01"
693
+ ],
694
+ "difficulty": "medium",
695
+ "architecture_bias_risk": "low",
696
+ "fairness_note": "Requires checking timestamps to ensure the newer 0.02 setting overrides the older 0.05 setting."
697
+ },
698
+ {
699
+ "question_id": "da_c03_q04",
700
+ "category": "contradiction_resolution",
701
+ "question": "Did we increase the grid spacing parameter dx in our latest update?",
702
+ "expected_answer": "No, it was decreased to 0.02 microns to resolve numerical stability issues.",
703
+ "acceptable_answer_criteria": [
704
+ "No, it was decreased",
705
+ "No"
706
+ ],
707
+ "required_memory_ids": [
708
+ "da_c03_m05",
709
+ "da_c03_m01"
710
+ ],
711
+ "forbidden_memory_ids": [],
712
+ "difficulty": "medium",
713
+ "architecture_bias_risk": "low",
714
+ "fairness_note": "Tests the ability to identify the direction of change when resolving a contradiction."
715
+ },
716
+ {
717
+ "question_id": "da_c03_q05",
718
+ "category": "abstention",
719
+ "question": "Which specific GPU model is used to run the CuPy simulator?",
720
+ "expected_answer": "not enough information",
721
+ "acceptable_answer_criteria": [
722
+ "not enough information",
723
+ "insufficient evidence",
724
+ "unknown"
725
+ ],
726
+ "required_memory_ids": [],
727
+ "forbidden_memory_ids": [
728
+ "da_c03_m08"
729
+ ],
730
+ "difficulty": "hard",
731
+ "architecture_bias_risk": "low",
732
+ "fairness_note": "Verifies the system correctly abstains when the record mentions a GPU backend (CuPy) but not the specific GPU model."
733
+ }
734
+ ]
735
+ },
736
+ {
737
+ "conversation_id": "da_c04_tokyo_itinerary",
738
+ "agent_id": "da_agent_tokyo_itinerary",
739
+ "domain": "travel",
740
+ "memory_records": [
741
+ {
742
+ "id": "da_c04_m01",
743
+ "type": "semantic",
744
+ "timestamp": "2026-06-01T10:00:00Z",
745
+ "content": "The traveler has a booking at Hotel Claska in Meguro, Tokyo, from October 12 to October 18.",
746
+ "tags": [
747
+ "travel",
748
+ "hotel",
749
+ "tokyo"
750
+ ],
751
+ "importance": 0.85,
752
+ "metadata": {
753
+ "source_turn": "t01",
754
+ "speaker": "user"
755
+ }
756
+ },
757
+ {
758
+ "id": "da_c04_m02",
759
+ "type": "semantic",
760
+ "timestamp": "2026-06-01T10:05:00Z",
761
+ "content": "The traveler prefers flying window seats on long-haul flights to sleep easily.",
762
+ "tags": [
763
+ "travel",
764
+ "flight",
765
+ "preference"
766
+ ],
767
+ "importance": 0.8,
768
+ "metadata": {
769
+ "source_turn": "t02",
770
+ "speaker": "user"
771
+ }
772
+ },
773
+ {
774
+ "id": "da_c04_m03",
775
+ "type": "episodic",
776
+ "timestamp": "2026-06-02T11:00:00Z",
777
+ "content": "The traveler booked flight JL005 from JFK to Haneda, departing at 1:15 PM on October 11.",
778
+ "tags": [
779
+ "travel",
780
+ "flight",
781
+ "tokyo"
782
+ ],
783
+ "importance": 0.9,
784
+ "metadata": {
785
+ "source_turn": "t03",
786
+ "speaker": "user"
787
+ },
788
+ "associations": [
789
+ {
790
+ "target_id": "da_c04_m02",
791
+ "strength": 0.75,
792
+ "reason": "flight seat preferences"
793
+ }
794
+ ]
795
+ },
796
+ {
797
+ "id": "da_c04_m04",
798
+ "type": "semantic",
799
+ "timestamp": "2026-06-02T11:05:00Z",
800
+ "content": "Noise: The traveler's luggage is a medium-sized hard-shell suitcase in navy blue color.",
801
+ "tags": [
802
+ "noise",
803
+ "luggage"
804
+ ],
805
+ "importance": 0.2,
806
+ "metadata": {
807
+ "source_turn": "t04",
808
+ "speaker": "user"
809
+ }
810
+ },
811
+ {
812
+ "id": "da_c04_m05",
813
+ "type": "semantic",
814
+ "timestamp": "2026-06-03T09:00:00Z",
815
+ "content": "On June 3, the traveler changed the Tokyo accommodation plan to stay at Trunk Hotel in Shibuya instead of Hotel Claska.",
816
+ "tags": [
817
+ "travel",
818
+ "hotel",
819
+ "tokyo"
820
+ ],
821
+ "importance": 0.85,
822
+ "metadata": {
823
+ "source_turn": "t05",
824
+ "speaker": "user"
825
+ }
826
+ },
827
+ {
828
+ "id": "da_c04_m06",
829
+ "type": "episodic",
830
+ "timestamp": "2026-06-04T12:00:00Z",
831
+ "content": "The traveler bought a museum ticket for the teamLab Planets exhibit on October 14 at 2:00 PM.",
832
+ "tags": [
833
+ "travel",
834
+ "activity",
835
+ "museum"
836
+ ],
837
+ "importance": 0.8,
838
+ "metadata": {
839
+ "source_turn": "t06",
840
+ "speaker": "user"
841
+ }
842
+ },
843
+ {
844
+ "id": "da_c04_m07",
845
+ "type": "semantic",
846
+ "timestamp": "2026-06-04T12:05:00Z",
847
+ "content": "Noise: Tokyo temperatures in mid-October average 15 to 22 degrees Celsius.",
848
+ "tags": [
849
+ "noise",
850
+ "weather"
851
+ ],
852
+ "importance": 0.3,
853
+ "metadata": {
854
+ "source_turn": "t07",
855
+ "speaker": "assistant"
856
+ }
857
+ },
858
+ {
859
+ "id": "da_c04_m08",
860
+ "type": "semantic",
861
+ "timestamp": "2026-06-05T10:00:00Z",
862
+ "content": "The traveler enjoys visiting traditional tempura restaurants and prefers reservations for dinner.",
863
+ "tags": [
864
+ "travel",
865
+ "food",
866
+ "dining"
867
+ ],
868
+ "importance": 0.75,
869
+ "metadata": {
870
+ "source_turn": "t08",
871
+ "speaker": "user"
872
+ }
873
+ },
874
+ {
875
+ "id": "da_c04_m09",
876
+ "type": "semantic",
877
+ "timestamp": "2026-06-05T10:30:00Z",
878
+ "content": "The passenger name in the airline booking is listed as Sarah Miller.",
879
+ "tags": [
880
+ "travel",
881
+ "identity"
882
+ ],
883
+ "importance": 0.8,
884
+ "metadata": {
885
+ "source_turn": "t09",
886
+ "speaker": "user"
887
+ }
888
+ }
889
+ ],
890
+ "questions": [
891
+ {
892
+ "question_id": "da_c04_q01",
893
+ "category": "atomic_fact_recall",
894
+ "question": "What is the departure time and flight number for the JFK to Haneda flight?",
895
+ "expected_answer": "Flight JL005 departing at 1:15 PM.",
896
+ "acceptable_answer_criteria": [
897
+ "Flight JL005",
898
+ "1:15 PM",
899
+ "JL005"
900
+ ],
901
+ "required_memory_ids": [
902
+ "da_c04_m03"
903
+ ],
904
+ "forbidden_memory_ids": [],
905
+ "difficulty": "easy",
906
+ "architecture_bias_risk": "low",
907
+ "fairness_note": "Direct exact match fact retrieval."
908
+ },
909
+ {
910
+ "question_id": "da_c04_q02",
911
+ "category": "paraphrased_semantic_recall",
912
+ "question": "What type of seating does Sarah Miller prefer on long airplane journeys?",
913
+ "expected_answer": "Window seats",
914
+ "acceptable_answer_criteria": [
915
+ "window",
916
+ "window seat"
917
+ ],
918
+ "required_memory_ids": [
919
+ "da_c04_m02",
920
+ "da_c04_m09"
921
+ ],
922
+ "forbidden_memory_ids": [],
923
+ "difficulty": "medium",
924
+ "architecture_bias_risk": "low",
925
+ "fairness_note": "Tests semantic mapping of 'long airplane journeys' to 'long-haul flights' and uses passenger name 'Sarah Miller' from another record."
926
+ },
927
+ {
928
+ "question_id": "da_c04_q03",
929
+ "category": "temporal_update",
930
+ "question": "Where is the traveler staying in Tokyo during their visit?",
931
+ "expected_answer": "Trunk Hotel in Shibuya",
932
+ "acceptable_answer_criteria": [
933
+ "Trunk Hotel",
934
+ "Trunk Hotel in Shibuya",
935
+ "Trunk"
936
+ ],
937
+ "required_memory_ids": [
938
+ "da_c04_m05"
939
+ ],
940
+ "forbidden_memory_ids": [
941
+ "da_c04_m01"
942
+ ],
943
+ "difficulty": "medium",
944
+ "architecture_bias_risk": "low",
945
+ "fairness_note": "Requires retrieving the updated Trunk Hotel choice rather than the old Hotel Claska plan."
946
+ },
947
+ {
948
+ "question_id": "da_c04_q04",
949
+ "category": "noise_resistance",
950
+ "question": "What is the date and time of the teamLab Planets ticket?",
951
+ "expected_answer": "October 14 at 2:00 PM",
952
+ "acceptable_answer_criteria": [
953
+ "October 14",
954
+ "2:00 PM"
955
+ ],
956
+ "required_memory_ids": [
957
+ "da_c04_m06"
958
+ ],
959
+ "forbidden_memory_ids": [
960
+ "da_c04_m07",
961
+ "da_c04_m04"
962
+ ],
963
+ "difficulty": "easy",
964
+ "architecture_bias_risk": "low",
965
+ "fairness_note": "Tests retrieval quality while ignoring irrelevant weather and luggage color metadata."
966
+ },
967
+ {
968
+ "question_id": "da_c04_q05",
969
+ "category": "multi_hop_association",
970
+ "question": "What seat selection is preferred for the traveler's flight JL005 on October 11?",
971
+ "expected_answer": "Window seat",
972
+ "acceptable_answer_criteria": [
973
+ "window seat",
974
+ "window"
975
+ ],
976
+ "required_memory_ids": [
977
+ "da_c04_m02",
978
+ "da_c04_m03"
979
+ ],
980
+ "forbidden_memory_ids": [],
981
+ "difficulty": "medium",
982
+ "architecture_bias_risk": "medium",
983
+ "fairness_note": "Tests link between flight JL005 -> long-haul flight -> window seat preference."
984
+ }
985
+ ]
986
+ },
987
+ {
988
+ "conversation_id": "da_c05_diabetes_admin",
989
+ "agent_id": "da_agent_diabetes_admin",
990
+ "domain": "health_admin",
991
+ "memory_records": [
992
+ {
993
+ "id": "da_c05_m01",
994
+ "type": "semantic",
995
+ "timestamp": "2026-06-05T08:00:00Z",
996
+ "content": "The patient is insured under Blue Shield PPO, Policy ID #BS-9021-X.",
997
+ "tags": [
998
+ "health",
999
+ "insurance",
1000
+ "policy"
1001
+ ],
1002
+ "importance": 0.85,
1003
+ "metadata": {
1004
+ "source_turn": "t01",
1005
+ "speaker": "user"
1006
+ }
1007
+ },
1008
+ {
1009
+ "id": "da_c05_m02",
1010
+ "type": "semantic",
1011
+ "timestamp": "2026-06-05T08:05:00Z",
1012
+ "content": "The patient's endocrinologist is Dr. Robert Vance, located at the Vance Clinic on Oak Street.",
1013
+ "tags": [
1014
+ "health",
1015
+ "doctor",
1016
+ "endocrinologist"
1017
+ ],
1018
+ "importance": 0.8,
1019
+ "metadata": {
1020
+ "source_turn": "t02",
1021
+ "speaker": "user"
1022
+ }
1023
+ },
1024
+ {
1025
+ "id": "da_c05_m03",
1026
+ "type": "episodic",
1027
+ "timestamp": "2026-06-05T08:10:00Z",
1028
+ "content": "The patient has a scheduled routine blood draw appointment on June 20 at 7:30 AM at Oak Laboratories.",
1029
+ "tags": [
1030
+ "health",
1031
+ "appointment",
1032
+ "labs"
1033
+ ],
1034
+ "importance": 0.9,
1035
+ "metadata": {
1036
+ "source_turn": "t03",
1037
+ "speaker": "user"
1038
+ },
1039
+ "associations": [
1040
+ {
1041
+ "target_id": "da_c05_m02",
1042
+ "strength": 0.8,
1043
+ "reason": "doctor ordered blood draw"
1044
+ }
1045
+ ]
1046
+ },
1047
+ {
1048
+ "id": "da_c05_m04",
1049
+ "type": "semantic",
1050
+ "timestamp": "2026-06-05T08:15:00Z",
1051
+ "content": "Noise: The Vance Clinic building has a red brick facade with a parking lot in the rear.",
1052
+ "tags": [
1053
+ "noise",
1054
+ "clinic"
1055
+ ],
1056
+ "importance": 0.2,
1057
+ "metadata": {
1058
+ "source_turn": "t04",
1059
+ "speaker": "assistant"
1060
+ }
1061
+ },
1062
+ {
1063
+ "id": "da_c05_m05",
1064
+ "type": "semantic",
1065
+ "timestamp": "2026-06-06T09:00:00Z",
1066
+ "content": "On June 6, the patient's insurance plan was updated to Cigna Gold Open Access, Policy ID #CI-8401-Y, due to employer benefits change.",
1067
+ "tags": [
1068
+ "health",
1069
+ "insurance",
1070
+ "policy"
1071
+ ],
1072
+ "importance": 0.9,
1073
+ "metadata": {
1074
+ "source_turn": "t05",
1075
+ "speaker": "user"
1076
+ }
1077
+ },
1078
+ {
1079
+ "id": "da_c05_m06",
1080
+ "type": "episodic",
1081
+ "timestamp": "2026-06-06T10:00:00Z",
1082
+ "content": "The patient completed their annual physical examination on June 2 and was advised to exercise 30 minutes daily.",
1083
+ "tags": [
1084
+ "health",
1085
+ "physical",
1086
+ "exercise"
1087
+ ],
1088
+ "importance": 0.8,
1089
+ "metadata": {
1090
+ "source_turn": "t06",
1091
+ "speaker": "assistant"
1092
+ }
1093
+ },
1094
+ {
1095
+ "id": "da_c05_m07",
1096
+ "type": "semantic",
1097
+ "timestamp": "2026-06-07T11:00:00Z",
1098
+ "content": "The patient takes Metformin 500mg twice daily with meals to manage blood sugar levels.",
1099
+ "tags": [
1100
+ "health",
1101
+ "prescription",
1102
+ "diabetes"
1103
+ ],
1104
+ "importance": 0.85,
1105
+ "metadata": {
1106
+ "source_turn": "t07",
1107
+ "speaker": "user"
1108
+ }
1109
+ },
1110
+ {
1111
+ "id": "da_c05_m08",
1112
+ "type": "semantic",
1113
+ "timestamp": "2026-06-07T11:15:00Z",
1114
+ "content": "Noise: Blue Shield PPO customer service phone line is open 24 hours for emergency inquiries.",
1115
+ "tags": [
1116
+ "noise",
1117
+ "insurance"
1118
+ ],
1119
+ "importance": 0.3,
1120
+ "metadata": {
1121
+ "source_turn": "t08",
1122
+ "speaker": "user"
1123
+ }
1124
+ },
1125
+ {
1126
+ "id": "da_c05_m09",
1127
+ "type": "semantic",
1128
+ "timestamp": "2026-06-08T12:00:00Z",
1129
+ "content": "The primary pharmacy is Walgreens on 4th Avenue, which has a drive-through window.",
1130
+ "tags": [
1131
+ "health",
1132
+ "pharmacy"
1133
+ ],
1134
+ "importance": 0.75,
1135
+ "metadata": {
1136
+ "source_turn": "t09",
1137
+ "speaker": "user"
1138
+ }
1139
+ }
1140
+ ],
1141
+ "questions": [
1142
+ {
1143
+ "question_id": "da_c05_q01",
1144
+ "category": "atomic_fact_recall",
1145
+ "question": "What dosage of Metformin is the patient prescribed to take?",
1146
+ "expected_answer": "500mg twice daily",
1147
+ "acceptable_answer_criteria": [
1148
+ "500mg twice daily",
1149
+ "500mg",
1150
+ "twice a day"
1151
+ ],
1152
+ "required_memory_ids": [
1153
+ "da_c05_m07"
1154
+ ],
1155
+ "forbidden_memory_ids": [],
1156
+ "difficulty": "easy",
1157
+ "architecture_bias_risk": "low",
1158
+ "fairness_note": "Direct exact match fact retrieval."
1159
+ },
1160
+ {
1161
+ "question_id": "da_c05_q02",
1162
+ "category": "paraphrased_semantic_recall",
1163
+ "question": "Where does the patient get their prescriptions filled?",
1164
+ "expected_answer": "Walgreens on 4th Avenue",
1165
+ "acceptable_answer_criteria": [
1166
+ "Walgreens",
1167
+ "Walgreens on 4th Avenue"
1168
+ ],
1169
+ "required_memory_ids": [
1170
+ "da_c05_m09"
1171
+ ],
1172
+ "forbidden_memory_ids": [],
1173
+ "difficulty": "medium",
1174
+ "architecture_bias_risk": "low",
1175
+ "fairness_note": "Tests semantic mapping of 'where does the patient get their prescriptions filled' to 'primary pharmacy'."
1176
+ },
1177
+ {
1178
+ "question_id": "da_c05_q03",
1179
+ "category": "temporal_update",
1180
+ "question": "What is the patient's current insurance provider and Policy ID?",
1181
+ "expected_answer": "Cigna Gold Open Access, Policy ID #CI-8401-Y",
1182
+ "acceptable_answer_criteria": [
1183
+ "Cigna",
1184
+ "CI-8401-Y"
1185
+ ],
1186
+ "required_memory_ids": [
1187
+ "da_c05_m05"
1188
+ ],
1189
+ "forbidden_memory_ids": [
1190
+ "da_c05_m01"
1191
+ ],
1192
+ "difficulty": "medium",
1193
+ "architecture_bias_risk": "low",
1194
+ "fairness_note": "Requires checking timestamps to ensure the Cigna policy overrides the old Blue Shield policy."
1195
+ },
1196
+ {
1197
+ "question_id": "da_c05_q04",
1198
+ "category": "contradiction_resolution",
1199
+ "question": "Is the patient's active health policy still Blue Shield PPO?",
1200
+ "expected_answer": "No, it was updated to Cigna Gold Open Access on June 6.",
1201
+ "acceptable_answer_criteria": [
1202
+ "No, it is Cigna",
1203
+ "No"
1204
+ ],
1205
+ "required_memory_ids": [
1206
+ "da_c05_m05",
1207
+ "da_c05_m01"
1208
+ ],
1209
+ "forbidden_memory_ids": [],
1210
+ "difficulty": "medium",
1211
+ "architecture_bias_risk": "low",
1212
+ "fairness_note": "Tests contradiction handling between old Blue Shield and current Cigna policies."
1213
+ },
1214
+ {
1215
+ "question_id": "da_c05_q05",
1216
+ "category": "multi_hop_association",
1217
+ "question": "At which laboratory is the blood draw requested by Dr. Robert Vance scheduled?",
1218
+ "expected_answer": "Oak Laboratories",
1219
+ "acceptable_answer_criteria": [
1220
+ "Oak Laboratories",
1221
+ "Oak Labs"
1222
+ ],
1223
+ "required_memory_ids": [
1224
+ "da_c05_m03",
1225
+ "da_c05_m02"
1226
+ ],
1227
+ "forbidden_memory_ids": [],
1228
+ "difficulty": "hard",
1229
+ "architecture_bias_risk": "medium",
1230
+ "fairness_note": "Requires connecting Dr. Vance -> ordered blood draw -> Oak Laboratories appointment."
1231
+ }
1232
+ ]
1233
+ },
1234
+ {
1235
+ "conversation_id": "da_c06_tax_prep",
1236
+ "agent_id": "da_agent_tax_prep",
1237
+ "domain": "finance_admin",
1238
+ "memory_records": [
1239
+ {
1240
+ "id": "da_c06_m01",
1241
+ "type": "semantic",
1242
+ "timestamp": "2026-06-01T10:00:00Z",
1243
+ "content": "The user has a primary checking account at Chase Bank with routing number #***0912.",
1244
+ "tags": [
1245
+ "finance",
1246
+ "bank",
1247
+ "chase"
1248
+ ],
1249
+ "importance": 0.8,
1250
+ "metadata": {
1251
+ "source_turn": "t01",
1252
+ "speaker": "user"
1253
+ }
1254
+ },
1255
+ {
1256
+ "id": "da_c06_m02",
1257
+ "type": "semantic",
1258
+ "timestamp": "2026-06-01T10:05:00Z",
1259
+ "content": "The tax consultant is Evelyn Mercer, who works at Mercer Tax Services.",
1260
+ "tags": [
1261
+ "finance",
1262
+ "tax",
1263
+ "consultant"
1264
+ ],
1265
+ "importance": 0.85,
1266
+ "metadata": {
1267
+ "source_turn": "t02",
1268
+ "speaker": "user"
1269
+ }
1270
+ },
1271
+ {
1272
+ "id": "da_c06_m03",
1273
+ "type": "episodic",
1274
+ "timestamp": "2026-06-02T09:00:00Z",
1275
+ "content": "The user submitted Form 1099-NEC for freelance earnings of $14,200 from Apex Systems.",
1276
+ "tags": [
1277
+ "finance",
1278
+ "tax",
1279
+ "income"
1280
+ ],
1281
+ "importance": 0.9,
1282
+ "metadata": {
1283
+ "source_turn": "t03",
1284
+ "speaker": "user"
1285
+ },
1286
+ "associations": [
1287
+ {
1288
+ "target_id": "da_c06_m02",
1289
+ "strength": 0.8,
1290
+ "reason": "Evelyn Mercer prepares tax submission"
1291
+ }
1292
+ ]
1293
+ },
1294
+ {
1295
+ "id": "da_c06_m04",
1296
+ "type": "semantic",
1297
+ "timestamp": "2026-06-02T09:10:00Z",
1298
+ "content": "Noise: Chase Bank's branch in downtown has a revolving door and 4 teller counters.",
1299
+ "tags": [
1300
+ "noise",
1301
+ "chase"
1302
+ ],
1303
+ "importance": 0.2,
1304
+ "metadata": {
1305
+ "source_turn": "t04",
1306
+ "speaker": "assistant"
1307
+ }
1308
+ },
1309
+ {
1310
+ "id": "da_c06_m05",
1311
+ "type": "semantic",
1312
+ "timestamp": "2026-06-03T11:00:00Z",
1313
+ "content": "On June 3, the user opened a business checking account at Silicon Valley Bank (SVB) to replace Chase for all future freelance income deposits.",
1314
+ "tags": [
1315
+ "finance",
1316
+ "bank",
1317
+ "svb"
1318
+ ],
1319
+ "importance": 0.9,
1320
+ "metadata": {
1321
+ "source_turn": "t05",
1322
+ "speaker": "user"
1323
+ }
1324
+ },
1325
+ {
1326
+ "id": "da_c06_m06",
1327
+ "type": "episodic",
1328
+ "timestamp": "2026-06-03T12:00:00Z",
1329
+ "content": "The user paid a tax preparation deposit of $150 to Mercer Tax Services using their credit card.",
1330
+ "tags": [
1331
+ "finance",
1332
+ "payment",
1333
+ "tax"
1334
+ ],
1335
+ "importance": 0.8,
1336
+ "metadata": {
1337
+ "source_turn": "t06",
1338
+ "speaker": "user"
1339
+ }
1340
+ },
1341
+ {
1342
+ "id": "da_c06_m07",
1343
+ "type": "semantic",
1344
+ "timestamp": "2026-06-04T10:00:00Z",
1345
+ "content": "The quarterly estimated tax payment deadline for Q2 is June 15.",
1346
+ "tags": [
1347
+ "finance",
1348
+ "tax",
1349
+ "deadline"
1350
+ ],
1351
+ "importance": 0.85,
1352
+ "metadata": {
1353
+ "source_turn": "t07",
1354
+ "speaker": "assistant"
1355
+ }
1356
+ },
1357
+ {
1358
+ "id": "da_c06_m08",
1359
+ "type": "semantic",
1360
+ "timestamp": "2026-06-04T10:15:00Z",
1361
+ "content": "Noise: Mercer Tax Services logo has a scales of justice symbol in gold and navy colors.",
1362
+ "tags": [
1363
+ "noise",
1364
+ "tax"
1365
+ ],
1366
+ "importance": 0.3,
1367
+ "metadata": {
1368
+ "source_turn": "t08",
1369
+ "speaker": "user"
1370
+ }
1371
+ },
1372
+ {
1373
+ "id": "da_c06_m09",
1374
+ "type": "semantic",
1375
+ "timestamp": "2026-06-05T13:00:00Z",
1376
+ "content": "The user has a personal retirement traditional IRA account at Fidelity with a 2026 contribution limit of $7,000.",
1377
+ "tags": [
1378
+ "finance",
1379
+ "ira",
1380
+ "fidelity"
1381
+ ],
1382
+ "importance": 0.8,
1383
+ "metadata": {
1384
+ "source_turn": "t09",
1385
+ "speaker": "user"
1386
+ }
1387
+ }
1388
+ ],
1389
+ "questions": [
1390
+ {
1391
+ "question_id": "da_c06_q01",
1392
+ "category": "atomic_fact_recall",
1393
+ "question": "What routing number is associated with the Chase Bank checking account?",
1394
+ "expected_answer": "#***0912",
1395
+ "acceptable_answer_criteria": [
1396
+ "#***0912",
1397
+ "routing number #***0912"
1398
+ ],
1399
+ "required_memory_ids": [
1400
+ "da_c06_m01"
1401
+ ],
1402
+ "forbidden_memory_ids": [],
1403
+ "difficulty": "easy",
1404
+ "architecture_bias_risk": "low",
1405
+ "fairness_note": "Direct exact match fact retrieval."
1406
+ },
1407
+ {
1408
+ "question_id": "da_c06_q02",
1409
+ "category": "paraphrased_semantic_recall",
1410
+ "question": "Who is handling the user's tax consultancy work?",
1411
+ "expected_answer": "Evelyn Mercer at Mercer Tax Services",
1412
+ "acceptable_answer_criteria": [
1413
+ "Evelyn Mercer",
1414
+ "Mercer Tax Services"
1415
+ ],
1416
+ "required_memory_ids": [
1417
+ "da_c06_m02"
1418
+ ],
1419
+ "forbidden_memory_ids": [],
1420
+ "difficulty": "medium",
1421
+ "architecture_bias_risk": "low",
1422
+ "fairness_note": "Tests semantic mapping of 'tax consultancy work' to 'tax consultant'."
1423
+ },
1424
+ {
1425
+ "question_id": "da_c06_q03",
1426
+ "category": "temporal_update",
1427
+ "question": "Where should future freelance earnings be deposited according to the latest decision?",
1428
+ "expected_answer": "Silicon Valley Bank (SVB)",
1429
+ "acceptable_answer_criteria": [
1430
+ "Silicon Valley Bank",
1431
+ "SVB"
1432
+ ],
1433
+ "required_memory_ids": [
1434
+ "da_c06_m05"
1435
+ ],
1436
+ "forbidden_memory_ids": [
1437
+ "da_c06_m01"
1438
+ ],
1439
+ "difficulty": "medium",
1440
+ "architecture_bias_risk": "low",
1441
+ "fairness_note": "Requires identifying the new bank account (SVB) replacing the old bank account (Chase)."
1442
+ },
1443
+ {
1444
+ "question_id": "da_c06_q04",
1445
+ "category": "contradiction_resolution",
1446
+ "question": "Is Chase Bank still the active checking account for freelance income?",
1447
+ "expected_answer": "No, it was replaced by Silicon Valley Bank (SVB) on June 3.",
1448
+ "acceptable_answer_criteria": [
1449
+ "No, it was replaced by SVB",
1450
+ "No"
1451
+ ],
1452
+ "required_memory_ids": [
1453
+ "da_c06_m05",
1454
+ "da_c06_m01"
1455
+ ],
1456
+ "forbidden_memory_ids": [],
1457
+ "difficulty": "medium",
1458
+ "architecture_bias_risk": "low",
1459
+ "fairness_note": "Verifies the system resolves contradiction and correctly updates active account."
1460
+ },
1461
+ {
1462
+ "question_id": "da_c06_q05",
1463
+ "category": "abstention",
1464
+ "question": "What is the account number of the traditional IRA at Fidelity?",
1465
+ "expected_answer": "not enough information",
1466
+ "acceptable_answer_criteria": [
1467
+ "not enough information",
1468
+ "insufficient evidence",
1469
+ "unknown"
1470
+ ],
1471
+ "required_memory_ids": [],
1472
+ "forbidden_memory_ids": [
1473
+ "da_c06_m09"
1474
+ ],
1475
+ "difficulty": "hard",
1476
+ "architecture_bias_risk": "low",
1477
+ "fairness_note": "Tests abstention: the user mentioned having a traditional IRA at Fidelity but never shared the account number."
1478
+ }
1479
+ ]
1480
+ },
1481
+ {
1482
+ "conversation_id": "da_c07_linear_algebra",
1483
+ "agent_id": "da_agent_linear_algebra",
1484
+ "domain": "education",
1485
+ "memory_records": [
1486
+ {
1487
+ "id": "da_c07_m01",
1488
+ "type": "semantic",
1489
+ "timestamp": "2026-06-01T09:00:00Z",
1490
+ "content": "The Linear Algebra course has weekly quizzes that open on Friday and close on Sunday night.",
1491
+ "tags": [
1492
+ "education",
1493
+ "math",
1494
+ "quizzes"
1495
+ ],
1496
+ "importance": 0.8,
1497
+ "metadata": {
1498
+ "source_turn": "t01",
1499
+ "speaker": "user"
1500
+ }
1501
+ },
1502
+ {
1503
+ "id": "da_c07_m02",
1504
+ "type": "semantic",
1505
+ "timestamp": "2026-06-01T09:05:00Z",
1506
+ "content": "The mid-term exam is scheduled for October 15 and covers vector spaces, eigenvalues, and linear transformations.",
1507
+ "tags": [
1508
+ "education",
1509
+ "math",
1510
+ "midterm"
1511
+ ],
1512
+ "importance": 0.9,
1513
+ "metadata": {
1514
+ "source_turn": "t02",
1515
+ "speaker": "user"
1516
+ }
1517
+ },
1518
+ {
1519
+ "id": "da_c07_m03",
1520
+ "type": "semantic",
1521
+ "timestamp": "2026-06-01T09:10:00Z",
1522
+ "content": "The required textbook is 'Introduction to Linear Algebra' by Gilbert Strang, 5th Edition.",
1523
+ "tags": [
1524
+ "education",
1525
+ "math",
1526
+ "textbook"
1527
+ ],
1528
+ "importance": 0.85,
1529
+ "metadata": {
1530
+ "source_turn": "t03",
1531
+ "speaker": "user"
1532
+ },
1533
+ "associations": [
1534
+ {
1535
+ "target_id": "da_c07_m02",
1536
+ "strength": 0.7,
1537
+ "reason": "study source for midterm"
1538
+ }
1539
+ ]
1540
+ },
1541
+ {
1542
+ "id": "da_c07_m04",
1543
+ "type": "semantic",
1544
+ "timestamp": "2026-06-01T09:15:00Z",
1545
+ "content": "Noise: The lecturer, Professor Adams, likes to drink hot black coffee during morning sessions.",
1546
+ "tags": [
1547
+ "noise",
1548
+ "professor"
1549
+ ],
1550
+ "importance": 0.2,
1551
+ "metadata": {
1552
+ "source_turn": "t04",
1553
+ "speaker": "assistant"
1554
+ }
1555
+ },
1556
+ {
1557
+ "id": "da_c07_m05",
1558
+ "type": "episodic",
1559
+ "timestamp": "2026-06-02T10:00:00Z",
1560
+ "content": "The user achieved a score of 92/100 on Homework Assignment 1.",
1561
+ "tags": [
1562
+ "education",
1563
+ "grade",
1564
+ "homework"
1565
+ ],
1566
+ "importance": 0.8,
1567
+ "metadata": {
1568
+ "source_turn": "t05",
1569
+ "speaker": "user"
1570
+ }
1571
+ },
1572
+ {
1573
+ "id": "da_c07_m06",
1574
+ "type": "semantic",
1575
+ "timestamp": "2026-06-02T10:05:00Z",
1576
+ "content": "Noise: Homework assignments must be uploaded in PDF format only.",
1577
+ "tags": [
1578
+ "noise",
1579
+ "format"
1580
+ ],
1581
+ "importance": 0.3,
1582
+ "metadata": {
1583
+ "source_turn": "t06",
1584
+ "speaker": "assistant"
1585
+ }
1586
+ },
1587
+ {
1588
+ "id": "da_c07_m07",
1589
+ "type": "semantic",
1590
+ "timestamp": "2026-06-03T11:00:00Z",
1591
+ "content": "The teaching assistant is named Marcus Vance, and his office hours are Wednesdays from 2:00 PM to 4:00 PM.",
1592
+ "tags": [
1593
+ "education",
1594
+ "math",
1595
+ "office-hours"
1596
+ ],
1597
+ "importance": 0.8,
1598
+ "metadata": {
1599
+ "source_turn": "t07",
1600
+ "speaker": "user"
1601
+ }
1602
+ },
1603
+ {
1604
+ "id": "da_c07_m08",
1605
+ "type": "semantic",
1606
+ "timestamp": "2026-06-03T11:15:00Z",
1607
+ "content": "Office hours are located in Room 402 of the Mathematics Building.",
1608
+ "tags": [
1609
+ "education",
1610
+ "math",
1611
+ "location"
1612
+ ],
1613
+ "importance": 0.75,
1614
+ "metadata": {
1615
+ "source_turn": "t08",
1616
+ "speaker": "user"
1617
+ },
1618
+ "associations": [
1619
+ {
1620
+ "target_id": "da_c07_m07",
1621
+ "strength": 0.9,
1622
+ "reason": "TA's office location"
1623
+ }
1624
+ ]
1625
+ },
1626
+ {
1627
+ "id": "da_c07_m09",
1628
+ "type": "semantic",
1629
+ "timestamp": "2026-06-04T12:00:00Z",
1630
+ "content": "The class Zoom password is 'Eigen2026'.",
1631
+ "tags": [
1632
+ "education",
1633
+ "math",
1634
+ "zoom"
1635
+ ],
1636
+ "importance": 0.85,
1637
+ "metadata": {
1638
+ "source_turn": "t09",
1639
+ "speaker": "user"
1640
+ }
1641
+ }
1642
+ ],
1643
+ "questions": [
1644
+ {
1645
+ "question_id": "da_c07_q01",
1646
+ "category": "atomic_fact_recall",
1647
+ "question": "What textbook edition is required for the Linear Algebra course?",
1648
+ "expected_answer": "'Introduction to Linear Algebra' by Gilbert Strang, 5th Edition.",
1649
+ "acceptable_answer_criteria": [
1650
+ "Strang 5th Edition",
1651
+ "Gilbert Strang",
1652
+ "5th Edition"
1653
+ ],
1654
+ "required_memory_ids": [
1655
+ "da_c07_m03"
1656
+ ],
1657
+ "forbidden_memory_ids": [],
1658
+ "difficulty": "easy",
1659
+ "architecture_bias_risk": "low",
1660
+ "fairness_note": "Direct exact match fact retrieval."
1661
+ },
1662
+ {
1663
+ "question_id": "da_c07_q02",
1664
+ "category": "atomic_fact_recall",
1665
+ "question": "What is the date of the midterm exam and what topics does it cover?",
1666
+ "expected_answer": "October 15, covering vector spaces, eigenvalues, and linear transformations.",
1667
+ "acceptable_answer_criteria": [
1668
+ "October 15",
1669
+ "vector spaces",
1670
+ "eigenvalues",
1671
+ "linear transformations"
1672
+ ],
1673
+ "required_memory_ids": [
1674
+ "da_c07_m02"
1675
+ ],
1676
+ "forbidden_memory_ids": [],
1677
+ "difficulty": "medium",
1678
+ "architecture_bias_risk": "low",
1679
+ "fairness_note": "Direct retrieval of detailed facts."
1680
+ },
1681
+ {
1682
+ "question_id": "da_c07_q03",
1683
+ "category": "paraphrased_semantic_recall",
1684
+ "question": "What is the password to access virtual course video conferences?",
1685
+ "expected_answer": "Eigen2026",
1686
+ "acceptable_answer_criteria": [
1687
+ "Eigen2026",
1688
+ "class Zoom password 'Eigen2026'"
1689
+ ],
1690
+ "required_memory_ids": [
1691
+ "da_c07_m09"
1692
+ ],
1693
+ "forbidden_memory_ids": [],
1694
+ "difficulty": "medium",
1695
+ "architecture_bias_risk": "low",
1696
+ "fairness_note": "Tests semantic mapping of 'virtual course video conferences' to 'class Zoom'."
1697
+ },
1698
+ {
1699
+ "question_id": "da_c07_q04",
1700
+ "category": "noise_resistance",
1701
+ "question": "What grade did the user receive on the first Homework Assignment?",
1702
+ "expected_answer": "92/100",
1703
+ "acceptable_answer_criteria": [
1704
+ "92/100",
1705
+ "92"
1706
+ ],
1707
+ "required_memory_ids": [
1708
+ "da_c07_m05"
1709
+ ],
1710
+ "forbidden_memory_ids": [
1711
+ "da_c07_m04",
1712
+ "da_c07_m06"
1713
+ ],
1714
+ "difficulty": "easy",
1715
+ "architecture_bias_risk": "low",
1716
+ "fairness_note": "Verifies ignoring coffee drinking preference and PDF upload requirements."
1717
+ },
1718
+ {
1719
+ "question_id": "da_c07_q05",
1720
+ "category": "multi_hop_association",
1721
+ "question": "Where should the student go to meet teaching assistant Marcus Vance in person?",
1722
+ "expected_answer": "Room 402 of the Mathematics Building",
1723
+ "acceptable_answer_criteria": [
1724
+ "Room 402",
1725
+ "Math Building Room 402"
1726
+ ],
1727
+ "required_memory_ids": [
1728
+ "da_c07_m07",
1729
+ "da_c07_m08"
1730
+ ],
1731
+ "forbidden_memory_ids": [],
1732
+ "difficulty": "hard",
1733
+ "architecture_bias_risk": "medium",
1734
+ "fairness_note": "Requires linking Marcus Vance -> TA -> office location in Room 402 of the Mathematics Building."
1735
+ }
1736
+ ]
1737
+ },
1738
+ {
1739
+ "conversation_id": "da_c08_novel_outline",
1740
+ "agent_id": "da_agent_novel_outline",
1741
+ "domain": "creative_work",
1742
+ "memory_records": [
1743
+ {
1744
+ "id": "da_c08_m01",
1745
+ "type": "semantic",
1746
+ "timestamp": "2026-06-01T10:00:00Z",
1747
+ "content": "The main character of the sci-fi novel is Captain Vance Rennold, commander of the starship 'Nebula'.",
1748
+ "tags": [
1749
+ "creative",
1750
+ "novel",
1751
+ "character"
1752
+ ],
1753
+ "importance": 0.85,
1754
+ "metadata": {
1755
+ "source_turn": "t01",
1756
+ "speaker": "user"
1757
+ }
1758
+ },
1759
+ {
1760
+ "id": "da_c08_m02",
1761
+ "type": "semantic",
1762
+ "timestamp": "2026-06-01T10:05:00Z",
1763
+ "content": "Captain Rennold's primary motivation is finding the lost colony of Elysium to rescue his sister.",
1764
+ "tags": [
1765
+ "creative",
1766
+ "novel",
1767
+ "character-motive"
1768
+ ],
1769
+ "importance": 0.8,
1770
+ "metadata": {
1771
+ "source_turn": "t02",
1772
+ "speaker": "user"
1773
+ },
1774
+ "associations": [
1775
+ {
1776
+ "target_id": "da_c08_m01",
1777
+ "strength": 0.85,
1778
+ "reason": "character detail"
1779
+ }
1780
+ ]
1781
+ },
1782
+ {
1783
+ "id": "da_c08_m03",
1784
+ "type": "semantic",
1785
+ "timestamp": "2026-06-01T10:10:00Z",
1786
+ "content": "The starship 'Nebula' is powered by a rare dark-matter core that requires cooling every 24 hours.",
1787
+ "tags": [
1788
+ "creative",
1789
+ "novel",
1790
+ "lore"
1791
+ ],
1792
+ "importance": 0.8,
1793
+ "metadata": {
1794
+ "source_turn": "t03",
1795
+ "speaker": "user"
1796
+ }
1797
+ },
1798
+ {
1799
+ "id": "da_c08_m04",
1800
+ "type": "semantic",
1801
+ "timestamp": "2026-06-01T10:15:00Z",
1802
+ "content": "Noise: The starship control deck features chrome surfaces and blue neon status panels.",
1803
+ "tags": [
1804
+ "noise",
1805
+ "starship-deck"
1806
+ ],
1807
+ "importance": 0.25,
1808
+ "metadata": {
1809
+ "source_turn": "t04",
1810
+ "speaker": "assistant"
1811
+ }
1812
+ },
1813
+ {
1814
+ "id": "da_c08_m05",
1815
+ "type": "procedural",
1816
+ "timestamp": "2026-06-02T09:00:00Z",
1817
+ "content": "To build narrative tension in Chapter 3, outline the following steps: first, introduce the cooling system failure; second, force a hard landing on an asteroid; third, trigger a conflict between Rennold and the engineer.",
1818
+ "tags": [
1819
+ "creative",
1820
+ "novel",
1821
+ "outline"
1822
+ ],
1823
+ "importance": 0.9,
1824
+ "metadata": {
1825
+ "source_turn": "t05",
1826
+ "speaker": "user"
1827
+ }
1828
+ },
1829
+ {
1830
+ "id": "da_c08_m06",
1831
+ "type": "semantic",
1832
+ "timestamp": "2026-06-02T10:00:00Z",
1833
+ "content": "Noise: The author uses Scrivener for drafting and Google Docs for sharing review copies.",
1834
+ "tags": [
1835
+ "noise",
1836
+ "software"
1837
+ ],
1838
+ "importance": 0.3,
1839
+ "metadata": {
1840
+ "source_turn": "t06",
1841
+ "speaker": "user"
1842
+ }
1843
+ },
1844
+ {
1845
+ "id": "da_c08_m07",
1846
+ "type": "semantic",
1847
+ "timestamp": "2026-06-03T11:00:00Z",
1848
+ "content": "The primary antagonist is Commander Sarah Drake, head of the Orion Syndicate.",
1849
+ "tags": [
1850
+ "creative",
1851
+ "novel",
1852
+ "character"
1853
+ ],
1854
+ "importance": 0.85,
1855
+ "metadata": {
1856
+ "source_turn": "t07",
1857
+ "speaker": "user"
1858
+ }
1859
+ },
1860
+ {
1861
+ "id": "da_c08_m08",
1862
+ "type": "semantic",
1863
+ "timestamp": "2026-06-03T11:15:00Z",
1864
+ "content": "The Orion Syndicate operates from a hidden space station orbiting the gas giant Jupiter.",
1865
+ "tags": [
1866
+ "creative",
1867
+ "novel",
1868
+ "lore"
1869
+ ],
1870
+ "importance": 0.8,
1871
+ "metadata": {
1872
+ "source_turn": "t08",
1873
+ "speaker": "user"
1874
+ },
1875
+ "associations": [
1876
+ {
1877
+ "target_id": "da_c08_m07",
1878
+ "strength": 0.9,
1879
+ "reason": "antagonist's organization location"
1880
+ }
1881
+ ]
1882
+ },
1883
+ {
1884
+ "id": "da_c08_m09",
1885
+ "type": "semantic",
1886
+ "timestamp": "2026-06-04T12:00:00Z",
1887
+ "content": "The novel's working title is 'Shattered Nebula' and the word count goal is 80,000 words.",
1888
+ "tags": [
1889
+ "creative",
1890
+ "novel",
1891
+ "metadata"
1892
+ ],
1893
+ "importance": 0.7,
1894
+ "metadata": {
1895
+ "source_turn": "t09",
1896
+ "speaker": "user"
1897
+ }
1898
+ }
1899
+ ],
1900
+ "questions": [
1901
+ {
1902
+ "question_id": "da_c08_q01",
1903
+ "category": "atomic_fact_recall",
1904
+ "question": "Who is the primary antagonist of the sci-fi novel?",
1905
+ "expected_answer": "Commander Sarah Drake",
1906
+ "acceptable_answer_criteria": [
1907
+ "Sarah Drake",
1908
+ "Commander Sarah Drake"
1909
+ ],
1910
+ "required_memory_ids": [
1911
+ "da_c08_m07"
1912
+ ],
1913
+ "forbidden_memory_ids": [],
1914
+ "difficulty": "easy",
1915
+ "architecture_bias_risk": "low",
1916
+ "fairness_note": "Direct exact match fact retrieval."
1917
+ },
1918
+ {
1919
+ "question_id": "da_c08_q02",
1920
+ "category": "atomic_fact_recall",
1921
+ "question": "What is the working title of the book and its target word count?",
1922
+ "expected_answer": "'Shattered Nebula' with an 80,000 words target.",
1923
+ "acceptable_answer_criteria": [
1924
+ "Shattered Nebula",
1925
+ "80,000 words",
1926
+ "80k words"
1927
+ ],
1928
+ "required_memory_ids": [
1929
+ "da_c08_m09"
1930
+ ],
1931
+ "forbidden_memory_ids": [],
1932
+ "difficulty": "easy",
1933
+ "architecture_bias_risk": "low",
1934
+ "fairness_note": "Direct metadata fact lookup."
1935
+ },
1936
+ {
1937
+ "question_id": "da_c08_q03",
1938
+ "category": "paraphrased_semantic_recall",
1939
+ "question": "What drives Captain Rennold to search the galaxy?",
1940
+ "expected_answer": "Finding the lost colony of Elysium to rescue his sister.",
1941
+ "acceptable_answer_criteria": [
1942
+ "rescuing his sister",
1943
+ "finding Elysium",
1944
+ "his sister"
1945
+ ],
1946
+ "required_memory_ids": [
1947
+ "da_c08_m02"
1948
+ ],
1949
+ "forbidden_memory_ids": [],
1950
+ "difficulty": "medium",
1951
+ "architecture_bias_risk": "low",
1952
+ "fairness_note": "Tests paraphrased mapping from 'drives Captain Rennold to search the galaxy' to 'primary motivation'."
1953
+ },
1954
+ {
1955
+ "question_id": "da_c08_q04",
1956
+ "category": "noise_resistance",
1957
+ "question": "How is the starship 'Nebula' powered?",
1958
+ "expected_answer": "A rare dark-matter core.",
1959
+ "acceptable_answer_criteria": [
1960
+ "dark-matter core",
1961
+ "dark-matter"
1962
+ ],
1963
+ "required_memory_ids": [
1964
+ "da_c08_m03"
1965
+ ],
1966
+ "forbidden_memory_ids": [
1967
+ "da_c08_m04",
1968
+ "da_c08_m06"
1969
+ ],
1970
+ "difficulty": "easy",
1971
+ "architecture_bias_risk": "low",
1972
+ "fairness_note": "Ensures model ignores visual control deck chrome description and software like Scrivener."
1973
+ },
1974
+ {
1975
+ "question_id": "da_c08_q05",
1976
+ "category": "procedural_recall",
1977
+ "question": "What steps are planned to build narrative tension in the third chapter?",
1978
+ "expected_answer": "First, introduce cooling system failure; second, force a hard landing on an asteroid; third, trigger a conflict between Rennold and the engineer.",
1979
+ "acceptable_answer_criteria": [
1980
+ "cooling system failure",
1981
+ "hard landing on an asteroid",
1982
+ "conflict with engineer"
1983
+ ],
1984
+ "required_memory_ids": [
1985
+ "da_c08_m05"
1986
+ ],
1987
+ "forbidden_memory_ids": [],
1988
+ "difficulty": "hard",
1989
+ "architecture_bias_risk": "low",
1990
+ "fairness_note": "Retrieves a multi-step writing procedure."
1991
+ }
1992
+ ]
1993
+ }
1994
+ ]
1995
+ }