@1mbrain/benchmarks 0.1.1

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (69) hide show
  1. package/README.md +85 -0
  2. package/fixtures/1mbrain-focused-mini/1mbrain-focused-mini.json +928 -0
  3. package/fixtures/1mbrain-focused-mini/README.md +45 -0
  4. package/fixtures/adversarial-memory/dataset_claude_adversarial.json +3333 -0
  5. package/fixtures/adversarial-memory/dataset_gemini_adversarial_memory.json +2984 -0
  6. package/fixtures/balanced-mini/dataset_claude_balanced_mini.json +2077 -0
  7. package/fixtures/balanced-mini/dataset_gemini_balanced_mini.json +1995 -0
  8. package/fixtures/generate_datasets.js +1741 -0
  9. package/fixtures/graph-stress-hard/README.md +43 -0
  10. package/fixtures/graph-stress-hard/dataset_graph_stress_hard.json +4374 -0
  11. package/fixtures/graph-stress-hard/generate_graph_stress_hard.js +526 -0
  12. package/fixtures/realistic-medium/dataset_claude_realistic_medium.json +7462 -0
  13. package/fixtures/realistic-medium/dataset_gemini_realistic_medium.json +7277 -0
  14. package/fixtures/realistic-medium/gen_claude_medium.js +600 -0
  15. package/package.json +22 -0
  16. package/reports/benchmark_report.md +48 -0
  17. package/reports/benchmark_report_claude_adversarial.md +42 -0
  18. package/reports/benchmark_report_claude_adversarial_adaptive.md +42 -0
  19. package/reports/benchmark_report_claude_adversarial_adaptive2_fast.md +42 -0
  20. package/reports/benchmark_report_claude_adversarial_adaptive_fast.md +42 -0
  21. package/reports/benchmark_report_claude_adversarial_rerank.md +42 -0
  22. package/reports/benchmark_report_claude_balanced_mini.md +42 -0
  23. package/reports/benchmark_report_claude_balanced_mini_adaptive.md +42 -0
  24. package/reports/benchmark_report_claude_balanced_mini_adaptive2_fast.md +42 -0
  25. package/reports/benchmark_report_claude_balanced_mini_adaptive_fast.md +42 -0
  26. package/reports/benchmark_report_claude_balanced_mini_rerank.md +42 -0
  27. package/reports/benchmark_report_claude_realistic_medium.md +42 -0
  28. package/reports/benchmark_report_claude_realistic_medium_adaptive.md +42 -0
  29. package/reports/benchmark_report_claude_realistic_medium_adaptive2_fast.md +42 -0
  30. package/reports/benchmark_report_claude_realistic_medium_adaptive_fast.md +42 -0
  31. package/reports/benchmark_report_claude_realistic_medium_evidence_rerank_local.md +42 -0
  32. package/reports/benchmark_report_claude_realistic_medium_openai_evidence_rerank.md +41 -0
  33. package/reports/benchmark_report_claude_realistic_medium_openai_multi_signal.md +41 -0
  34. package/reports/benchmark_report_claude_realistic_medium_openai_multi_signal_scoped.md +41 -0
  35. package/reports/benchmark_report_claude_realistic_medium_openai_phase8_no_judge.md +42 -0
  36. package/reports/benchmark_report_claude_realistic_medium_openai_rankingpolicy.md +41 -0
  37. package/reports/benchmark_report_claude_realistic_medium_openai_stale_filter.md +41 -0
  38. package/reports/benchmark_report_claude_realistic_medium_openai_stale_filter_absence_fix.md +41 -0
  39. package/reports/benchmark_report_claude_realistic_medium_openai_write_time_invalidation.md +41 -0
  40. package/reports/benchmark_report_claude_realistic_medium_rerank.md +42 -0
  41. package/reports/benchmark_report_claude_realistic_medium_stale_filter_local.md +42 -0
  42. package/reports/benchmark_report_graph_stress_hard.md +42 -0
  43. package/reports/benchmark_report_graph_stress_hard_absence_fix.md +42 -0
  44. package/reports/benchmark_report_graph_stress_hard_adaptive.md +42 -0
  45. package/reports/benchmark_report_graph_stress_hard_evidence_rerank.md +42 -0
  46. package/reports/benchmark_report_graph_stress_hard_multi_signal_current_guardrail.md +42 -0
  47. package/reports/benchmark_report_graph_stress_hard_multi_signal_guardrail_fixed.md +42 -0
  48. package/reports/benchmark_report_graph_stress_hard_multi_signal_local.md +42 -0
  49. package/reports/benchmark_report_graph_stress_hard_multi_signal_scoped_guardrail.md +42 -0
  50. package/reports/benchmark_report_graph_stress_hard_multi_signal_vector_pure_guardrail.md +42 -0
  51. package/reports/benchmark_report_graph_stress_hard_phase8_sdk_guardrail.md +42 -0
  52. package/reports/benchmark_report_graph_stress_hard_rerank.md +42 -0
  53. package/reports/benchmark_report_graph_stress_hard_stale_filter.md +42 -0
  54. package/reports/benchmark_report_graph_stress_hard_write_time_invalidation.md +42 -0
  55. package/results/.gitignore +2 -0
  56. package/src/adapters/1mbrain.ts +317 -0
  57. package/src/adapters/keyword-embedding.ts +48 -0
  58. package/src/adapters/mem0.ts +124 -0
  59. package/src/adapters/qdrant.ts +214 -0
  60. package/src/adapters/unavailable.ts +49 -0
  61. package/src/adapters/vector-baseline.ts +149 -0
  62. package/src/datasets/focused-mini.ts +158 -0
  63. package/src/datasets/synthetic-agent-memory.ts +532 -0
  64. package/src/llm-evaluator.ts +262 -0
  65. package/src/metrics.ts +482 -0
  66. package/src/provider.ts +151 -0
  67. package/src/runner.ts +635 -0
  68. package/tsconfig.json +10 -0
  69. package/tsconfig.tsbuildinfo +1 -0
@@ -0,0 +1,4374 @@
1
+ {
2
+ "name": "graph-stress-hard",
3
+ "description": "Diagnostic benchmark for multi-hop association, conflict resolution, graph traversal with weak lexical overlap, near-entity distractors, and abstention under tempting distractors.",
4
+ "generated_at": "2026-06-19",
5
+ "generated_by": "deterministic-template",
6
+ "version": "1.0.0",
7
+ "fairness_notes": [
8
+ "This is intentionally not a provider-neutral public leaderboard fixture; it is a graph-stress diagnostic for tuning association traversal and conflict semantics.",
9
+ "Many questions require two or three required memories, and no single memory contains both the query anchor and final answer.",
10
+ "Conflict questions include stale, interim, and final states linked by replaces/replaced_by associations.",
11
+ "Weak-overlap graph traversal questions use different surface wording for the query anchor and the target memory.",
12
+ "Abstention questions contain tempting distractor facts for similar entities but no answer for the target entity."
13
+ ],
14
+ "stats": {
15
+ "conversations": 12,
16
+ "memory_records": 144,
17
+ "questions": 60,
18
+ "question_categories": {
19
+ "multi_hop_association": 24,
20
+ "contradiction_resolution": 12,
21
+ "graph_traversal": 12,
22
+ "entity_disambiguation": 7,
23
+ "abstention": 5
24
+ }
25
+ },
26
+ "conversations": [
27
+ {
28
+ "conversation_id": "gsh-c01",
29
+ "agent_id": "agent-launch-ops",
30
+ "domain": "software",
31
+ "memory_records": [
32
+ {
33
+ "id": "gsh-c01-m01",
34
+ "type": "semantic",
35
+ "timestamp": "2026-05-01T09:00:00Z",
36
+ "content": "Rina uses codename Aurora for the active workstream, but the codename itself does not state the operational dependency.",
37
+ "tags": [
38
+ "aurora",
39
+ "codename"
40
+ ],
41
+ "importance": 0.7,
42
+ "metadata": {
43
+ "role": "anchor",
44
+ "source_turn": "t01"
45
+ },
46
+ "associations": [
47
+ {
48
+ "target_id": "gsh-c01-m02",
49
+ "strength": 0.9,
50
+ "relation": "alias_of"
51
+ }
52
+ ]
53
+ },
54
+ {
55
+ "id": "gsh-c01-m02",
56
+ "type": "semantic",
57
+ "timestamp": "2026-05-01T10:00:00Z",
58
+ "content": "Codename Aurora refers to the customer-retention project, which is governed through the renewal-risk initiative.",
59
+ "tags": [
60
+ "customer-retention",
61
+ "project-link"
62
+ ],
63
+ "importance": 0.8,
64
+ "metadata": {
65
+ "role": "bridge",
66
+ "source_turn": "t02"
67
+ },
68
+ "associations": [
69
+ {
70
+ "target_id": "gsh-c01-m01",
71
+ "strength": 0.9,
72
+ "relation": "alias_of"
73
+ },
74
+ {
75
+ "target_id": "gsh-c01-m03",
76
+ "strength": 0.9,
77
+ "relation": "depends_on"
78
+ }
79
+ ]
80
+ },
81
+ {
82
+ "id": "gsh-c01-m03",
83
+ "type": "semantic",
84
+ "timestamp": "2026-05-01T11:00:00Z",
85
+ "content": "The renewal-risk initiative depends on the Postgres event pipeline; this dependency is not named in the codename discussion.",
86
+ "tags": [
87
+ "renewal-risk",
88
+ "dependency"
89
+ ],
90
+ "importance": 0.9,
91
+ "metadata": {
92
+ "role": "answer",
93
+ "source_turn": "t03"
94
+ },
95
+ "associations": [
96
+ {
97
+ "target_id": "gsh-c01-m02",
98
+ "strength": 0.9,
99
+ "relation": "supports"
100
+ }
101
+ ]
102
+ },
103
+ {
104
+ "id": "gsh-c01-m04",
105
+ "type": "episodic",
106
+ "timestamp": "2026-05-01T12:00:00Z",
107
+ "content": "Rina said the approval ritual is internally called Copper Gate; outside notes describe it as the release approval meeting.",
108
+ "tags": [
109
+ "approval",
110
+ "ritual"
111
+ ],
112
+ "importance": 0.75,
113
+ "metadata": {
114
+ "role": "weak-overlap-anchor",
115
+ "source_turn": "t04"
116
+ },
117
+ "associations": [
118
+ {
119
+ "target_id": "gsh-c01-m05",
120
+ "strength": 0.9,
121
+ "relation": "owned_by"
122
+ }
123
+ ]
124
+ },
125
+ {
126
+ "id": "gsh-c01-m05",
127
+ "type": "semantic",
128
+ "timestamp": "2026-05-01T13:00:00Z",
129
+ "content": "Copper Gate is owned by the release assurance team, not by the team whose name resembles the project codename.",
130
+ "tags": [
131
+ "approval",
132
+ "team"
133
+ ],
134
+ "importance": 0.82,
135
+ "metadata": {
136
+ "role": "weak-overlap-bridge",
137
+ "source_turn": "t05"
138
+ },
139
+ "associations": [
140
+ {
141
+ "target_id": "gsh-c01-m04",
142
+ "strength": 0.9,
143
+ "relation": "owns"
144
+ },
145
+ {
146
+ "target_id": "gsh-c01-m06",
147
+ "strength": 0.9,
148
+ "relation": "requires"
149
+ }
150
+ ]
151
+ },
152
+ {
153
+ "id": "gsh-c01-m06",
154
+ "type": "procedural",
155
+ "timestamp": "2026-05-01T14:00:00Z",
156
+ "content": "Before release assurance team signs off, they require the signed rollback matrix.",
157
+ "tags": [
158
+ "approval",
159
+ "artifact"
160
+ ],
161
+ "importance": 0.88,
162
+ "metadata": {
163
+ "role": "weak-overlap-answer",
164
+ "source_turn": "t06"
165
+ },
166
+ "associations": [
167
+ {
168
+ "target_id": "gsh-c01-m05",
169
+ "strength": 0.9,
170
+ "relation": "required_by"
171
+ }
172
+ ]
173
+ },
174
+ {
175
+ "id": "gsh-c01-m07",
176
+ "type": "semantic",
177
+ "timestamp": "2026-05-01T15:00:00Z",
178
+ "content": "Initial state for customer-retention project: email-only weekly digest.",
179
+ "tags": [
180
+ "current-state",
181
+ "stale"
182
+ ],
183
+ "importance": 0.55,
184
+ "metadata": {
185
+ "role": "stale",
186
+ "source_turn": "t07"
187
+ },
188
+ "associations": [
189
+ {
190
+ "target_id": "gsh-c01-m08",
191
+ "strength": 0.95,
192
+ "relation": "replaced_by"
193
+ }
194
+ ]
195
+ },
196
+ {
197
+ "id": "gsh-c01-m08",
198
+ "type": "episodic",
199
+ "timestamp": "2026-05-01T16:00:00Z",
200
+ "content": "Interim update for customer-retention project: Slack digest with email fallback, replacing the initial state.",
201
+ "tags": [
202
+ "current-state",
203
+ "interim"
204
+ ],
205
+ "importance": 0.65,
206
+ "metadata": {
207
+ "role": "interim",
208
+ "source_turn": "t08"
209
+ },
210
+ "associations": [
211
+ {
212
+ "target_id": "gsh-c01-m07",
213
+ "strength": 0.95,
214
+ "relation": "replaces"
215
+ },
216
+ {
217
+ "target_id": "gsh-c01-m09",
218
+ "strength": 0.98,
219
+ "relation": "replaced_by"
220
+ }
221
+ ]
222
+ },
223
+ {
224
+ "id": "gsh-c01-m09",
225
+ "type": "semantic",
226
+ "timestamp": "2026-05-01T17:00:00Z",
227
+ "content": "Final resolved state for customer-retention project: in-app digest with Slack fallback. This supersedes all earlier versions.",
228
+ "tags": [
229
+ "current-state",
230
+ "final"
231
+ ],
232
+ "importance": 0.95,
233
+ "metadata": {
234
+ "role": "final",
235
+ "source_turn": "t09"
236
+ },
237
+ "associations": [
238
+ {
239
+ "target_id": "gsh-c01-m08",
240
+ "strength": 0.98,
241
+ "relation": "replaces"
242
+ },
243
+ {
244
+ "target_id": "gsh-c01-m07",
245
+ "strength": 0.98,
246
+ "relation": "replaces"
247
+ }
248
+ ]
249
+ },
250
+ {
251
+ "id": "gsh-c01-m10",
252
+ "type": "semantic",
253
+ "timestamp": "2026-05-01T18:00:00Z",
254
+ "content": "Distractor: Aurelia runs a similar-sounding effort with Redis queue, but it is not Rina's customer-retention project.",
255
+ "tags": [
256
+ "near-entity",
257
+ "distractor"
258
+ ],
259
+ "importance": 0.5,
260
+ "metadata": {
261
+ "role": "distractor",
262
+ "source_turn": "t10"
263
+ },
264
+ "associations": []
265
+ },
266
+ {
267
+ "id": "gsh-c01-m11",
268
+ "type": "semantic",
269
+ "timestamp": "2026-05-01T19:00:00Z",
270
+ "content": "For Rina's customer-retention project, the accountable owner is Sofia Marin.",
271
+ "tags": [
272
+ "near-entity",
273
+ "owner"
274
+ ],
275
+ "importance": 0.86,
276
+ "metadata": {
277
+ "role": "entity-answer",
278
+ "source_turn": "t11"
279
+ },
280
+ "associations": [
281
+ {
282
+ "target_id": "gsh-c01-m02",
283
+ "strength": 0.9,
284
+ "relation": "owned_by"
285
+ }
286
+ ]
287
+ },
288
+ {
289
+ "id": "gsh-c01-m12",
290
+ "type": "semantic",
291
+ "timestamp": "2026-05-01T20:00:00Z",
292
+ "content": "Tempting gap: Aurelia's similar effort mentions budget cap, but Rina's records do not state that value.",
293
+ "tags": [
294
+ "abstention",
295
+ "tempting-distractor"
296
+ ],
297
+ "importance": 0.45,
298
+ "metadata": {
299
+ "role": "abstention-distractor",
300
+ "source_turn": "t12"
301
+ },
302
+ "associations": []
303
+ }
304
+ ],
305
+ "questions": [
306
+ {
307
+ "question_id": "gsh-c01-q01",
308
+ "category": "multi_hop_association",
309
+ "question": "What operational dependency is ultimately used by Rina's codename Aurora?",
310
+ "expected_answer": "Postgres event pipeline",
311
+ "required_memory_ids": [
312
+ "gsh-c01-m01",
313
+ "gsh-c01-m02",
314
+ "gsh-c01-m03"
315
+ ],
316
+ "forbidden_memory_ids": [
317
+ "gsh-c01-m10"
318
+ ],
319
+ "difficulty": "hard",
320
+ "architecture_bias_risk": "intentional_graph_stress",
321
+ "fairness_note": "Requires codename to project to bridge to dependency; no single memory contains both codename and answer."
322
+ },
323
+ {
324
+ "question_id": "gsh-c01-q02",
325
+ "category": "multi_hop_association",
326
+ "question": "Which artifact is needed before the release approval meeting can be signed off for Rina's work?",
327
+ "expected_answer": "signed rollback matrix",
328
+ "required_memory_ids": [
329
+ "gsh-c01-m04",
330
+ "gsh-c01-m05",
331
+ "gsh-c01-m06"
332
+ ],
333
+ "forbidden_memory_ids": [
334
+ "gsh-c01-m10"
335
+ ],
336
+ "difficulty": "hard",
337
+ "architecture_bias_risk": "intentional_graph_stress",
338
+ "fairness_note": "Uses weak lexical overlap between release approval meeting and the internal ritual name."
339
+ },
340
+ {
341
+ "question_id": "gsh-c01-q03",
342
+ "category": "contradiction_resolution",
343
+ "question": "What is the current resolved state for customer-retention project?",
344
+ "expected_answer": "in-app digest with Slack fallback",
345
+ "required_memory_ids": [
346
+ "gsh-c01-m09"
347
+ ],
348
+ "forbidden_memory_ids": [
349
+ "gsh-c01-m07",
350
+ "gsh-c01-m08"
351
+ ],
352
+ "difficulty": "hard",
353
+ "architecture_bias_risk": "intentional_graph_stress",
354
+ "fairness_note": "Final memory must beat stale and interim states connected by replaces/replaced_by edges."
355
+ },
356
+ {
357
+ "question_id": "gsh-c01-q04",
358
+ "category": "graph_traversal",
359
+ "question": "For the approval chain in Rina's project, which team owns the internal ritual?",
360
+ "expected_answer": "release assurance team",
361
+ "required_memory_ids": [
362
+ "gsh-c01-m04",
363
+ "gsh-c01-m05"
364
+ ],
365
+ "forbidden_memory_ids": [
366
+ "gsh-c01-m10"
367
+ ],
368
+ "difficulty": "medium",
369
+ "architecture_bias_risk": "intentional_graph_stress",
370
+ "fairness_note": "Retrieval should cross from the approval phrasing to the internal ritual and owner team."
371
+ },
372
+ {
373
+ "question_id": "gsh-c01-q05",
374
+ "category": "entity_disambiguation",
375
+ "question": "Who is the accountable owner for Rina's project, not Aurelia's similar effort?",
376
+ "expected_answer": "Sofia Marin",
377
+ "required_memory_ids": [
378
+ "gsh-c01-m11"
379
+ ],
380
+ "forbidden_memory_ids": [
381
+ "gsh-c01-m10",
382
+ "gsh-c01-m12"
383
+ ],
384
+ "difficulty": "medium",
385
+ "architecture_bias_risk": "intentional_graph_stress",
386
+ "fairness_note": "Near-name distractors are intentionally tempting but wrong."
387
+ }
388
+ ]
389
+ },
390
+ {
391
+ "conversation_id": "gsh-c02",
392
+ "agent_id": "agent-clinical-research",
393
+ "domain": "research",
394
+ "memory_records": [
395
+ {
396
+ "id": "gsh-c02-m01",
397
+ "type": "semantic",
398
+ "timestamp": "2026-05-02T09:00:00Z",
399
+ "content": "Devon uses codename Blue Lantern for the active workstream, but the codename itself does not state the operational dependency.",
400
+ "tags": [
401
+ "blue-lantern",
402
+ "codename"
403
+ ],
404
+ "importance": 0.7,
405
+ "metadata": {
406
+ "role": "anchor",
407
+ "source_turn": "t01"
408
+ },
409
+ "associations": [
410
+ {
411
+ "target_id": "gsh-c02-m02",
412
+ "strength": 0.9,
413
+ "relation": "alias_of"
414
+ }
415
+ ]
416
+ },
417
+ {
418
+ "id": "gsh-c02-m02",
419
+ "type": "semantic",
420
+ "timestamp": "2026-05-02T10:00:00Z",
421
+ "content": "Codename Blue Lantern refers to the rural asthma cohort, which is governed through the inhaler-adherence study.",
422
+ "tags": [
423
+ "rural",
424
+ "project-link"
425
+ ],
426
+ "importance": 0.8,
427
+ "metadata": {
428
+ "role": "bridge",
429
+ "source_turn": "t02"
430
+ },
431
+ "associations": [
432
+ {
433
+ "target_id": "gsh-c02-m01",
434
+ "strength": 0.9,
435
+ "relation": "alias_of"
436
+ },
437
+ {
438
+ "target_id": "gsh-c02-m03",
439
+ "strength": 0.9,
440
+ "relation": "depends_on"
441
+ }
442
+ ]
443
+ },
444
+ {
445
+ "id": "gsh-c02-m03",
446
+ "type": "semantic",
447
+ "timestamp": "2026-05-02T11:00:00Z",
448
+ "content": "The inhaler-adherence study depends on the FHIR observation export; this dependency is not named in the codename discussion.",
449
+ "tags": [
450
+ "inhaler-adherence",
451
+ "dependency"
452
+ ],
453
+ "importance": 0.9,
454
+ "metadata": {
455
+ "role": "answer",
456
+ "source_turn": "t03"
457
+ },
458
+ "associations": [
459
+ {
460
+ "target_id": "gsh-c02-m02",
461
+ "strength": 0.9,
462
+ "relation": "supports"
463
+ }
464
+ ]
465
+ },
466
+ {
467
+ "id": "gsh-c02-m04",
468
+ "type": "episodic",
469
+ "timestamp": "2026-05-02T12:00:00Z",
470
+ "content": "Devon said the approval ritual is internally called Silver File; outside notes describe it as the release approval meeting.",
471
+ "tags": [
472
+ "approval",
473
+ "ritual"
474
+ ],
475
+ "importance": 0.75,
476
+ "metadata": {
477
+ "role": "weak-overlap-anchor",
478
+ "source_turn": "t04"
479
+ },
480
+ "associations": [
481
+ {
482
+ "target_id": "gsh-c02-m05",
483
+ "strength": 0.9,
484
+ "relation": "owned_by"
485
+ }
486
+ ]
487
+ },
488
+ {
489
+ "id": "gsh-c02-m05",
490
+ "type": "semantic",
491
+ "timestamp": "2026-05-02T13:00:00Z",
492
+ "content": "Silver File is owned by the ethics submission team, not by the team whose name resembles the project codename.",
493
+ "tags": [
494
+ "approval",
495
+ "team"
496
+ ],
497
+ "importance": 0.82,
498
+ "metadata": {
499
+ "role": "weak-overlap-bridge",
500
+ "source_turn": "t05"
501
+ },
502
+ "associations": [
503
+ {
504
+ "target_id": "gsh-c02-m04",
505
+ "strength": 0.9,
506
+ "relation": "owns"
507
+ },
508
+ {
509
+ "target_id": "gsh-c02-m06",
510
+ "strength": 0.9,
511
+ "relation": "requires"
512
+ }
513
+ ]
514
+ },
515
+ {
516
+ "id": "gsh-c02-m06",
517
+ "type": "procedural",
518
+ "timestamp": "2026-05-02T14:00:00Z",
519
+ "content": "Before ethics submission team signs off, they require the redacted consent ledger.",
520
+ "tags": [
521
+ "approval",
522
+ "artifact"
523
+ ],
524
+ "importance": 0.88,
525
+ "metadata": {
526
+ "role": "weak-overlap-answer",
527
+ "source_turn": "t06"
528
+ },
529
+ "associations": [
530
+ {
531
+ "target_id": "gsh-c02-m05",
532
+ "strength": 0.9,
533
+ "relation": "required_by"
534
+ }
535
+ ]
536
+ },
537
+ {
538
+ "id": "gsh-c02-m07",
539
+ "type": "semantic",
540
+ "timestamp": "2026-05-02T15:00:00Z",
541
+ "content": "Initial state for rural asthma cohort: manual spreadsheet tracking.",
542
+ "tags": [
543
+ "current-state",
544
+ "stale"
545
+ ],
546
+ "importance": 0.55,
547
+ "metadata": {
548
+ "role": "stale",
549
+ "source_turn": "t07"
550
+ },
551
+ "associations": [
552
+ {
553
+ "target_id": "gsh-c02-m08",
554
+ "strength": 0.95,
555
+ "relation": "replaced_by"
556
+ }
557
+ ]
558
+ },
559
+ {
560
+ "id": "gsh-c02-m08",
561
+ "type": "episodic",
562
+ "timestamp": "2026-05-02T16:00:00Z",
563
+ "content": "Interim update for rural asthma cohort: Airtable tracking, replacing the initial state.",
564
+ "tags": [
565
+ "current-state",
566
+ "interim"
567
+ ],
568
+ "importance": 0.65,
569
+ "metadata": {
570
+ "role": "interim",
571
+ "source_turn": "t08"
572
+ },
573
+ "associations": [
574
+ {
575
+ "target_id": "gsh-c02-m07",
576
+ "strength": 0.95,
577
+ "relation": "replaces"
578
+ },
579
+ {
580
+ "target_id": "gsh-c02-m09",
581
+ "strength": 0.98,
582
+ "relation": "replaced_by"
583
+ }
584
+ ]
585
+ },
586
+ {
587
+ "id": "gsh-c02-m09",
588
+ "type": "semantic",
589
+ "timestamp": "2026-05-02T17:00:00Z",
590
+ "content": "Final resolved state for rural asthma cohort: REDCap tracking with nightly export. This supersedes all earlier versions.",
591
+ "tags": [
592
+ "current-state",
593
+ "final"
594
+ ],
595
+ "importance": 0.95,
596
+ "metadata": {
597
+ "role": "final",
598
+ "source_turn": "t09"
599
+ },
600
+ "associations": [
601
+ {
602
+ "target_id": "gsh-c02-m08",
603
+ "strength": 0.98,
604
+ "relation": "replaces"
605
+ },
606
+ {
607
+ "target_id": "gsh-c02-m07",
608
+ "strength": 0.98,
609
+ "relation": "replaces"
610
+ }
611
+ ]
612
+ },
613
+ {
614
+ "id": "gsh-c02-m10",
615
+ "type": "semantic",
616
+ "timestamp": "2026-05-02T18:00:00Z",
617
+ "content": "Distractor: Devan runs a similar-sounding effort with HL7 batch import, but it is not Devon's rural asthma cohort.",
618
+ "tags": [
619
+ "near-entity",
620
+ "distractor"
621
+ ],
622
+ "importance": 0.5,
623
+ "metadata": {
624
+ "role": "distractor",
625
+ "source_turn": "t10"
626
+ },
627
+ "associations": []
628
+ },
629
+ {
630
+ "id": "gsh-c02-m11",
631
+ "type": "semantic",
632
+ "timestamp": "2026-05-02T19:00:00Z",
633
+ "content": "For Devon's rural asthma cohort, the accountable owner is Priya Shah.",
634
+ "tags": [
635
+ "near-entity",
636
+ "owner"
637
+ ],
638
+ "importance": 0.86,
639
+ "metadata": {
640
+ "role": "entity-answer",
641
+ "source_turn": "t11"
642
+ },
643
+ "associations": [
644
+ {
645
+ "target_id": "gsh-c02-m02",
646
+ "strength": 0.9,
647
+ "relation": "owned_by"
648
+ }
649
+ ]
650
+ },
651
+ {
652
+ "id": "gsh-c02-m12",
653
+ "type": "semantic",
654
+ "timestamp": "2026-05-02T20:00:00Z",
655
+ "content": "Tempting gap: Devan's similar effort mentions participant compensation amount, but Devon's records do not state that value.",
656
+ "tags": [
657
+ "abstention",
658
+ "tempting-distractor"
659
+ ],
660
+ "importance": 0.45,
661
+ "metadata": {
662
+ "role": "abstention-distractor",
663
+ "source_turn": "t12"
664
+ },
665
+ "associations": []
666
+ }
667
+ ],
668
+ "questions": [
669
+ {
670
+ "question_id": "gsh-c02-q01",
671
+ "category": "multi_hop_association",
672
+ "question": "What operational dependency is ultimately used by Devon's codename Blue Lantern?",
673
+ "expected_answer": "FHIR observation export",
674
+ "required_memory_ids": [
675
+ "gsh-c02-m01",
676
+ "gsh-c02-m02",
677
+ "gsh-c02-m03"
678
+ ],
679
+ "forbidden_memory_ids": [
680
+ "gsh-c02-m10"
681
+ ],
682
+ "difficulty": "hard",
683
+ "architecture_bias_risk": "intentional_graph_stress",
684
+ "fairness_note": "Requires codename to project to bridge to dependency; no single memory contains both codename and answer."
685
+ },
686
+ {
687
+ "question_id": "gsh-c02-q02",
688
+ "category": "multi_hop_association",
689
+ "question": "Which artifact is needed before the release approval meeting can be signed off for Devon's work?",
690
+ "expected_answer": "redacted consent ledger",
691
+ "required_memory_ids": [
692
+ "gsh-c02-m04",
693
+ "gsh-c02-m05",
694
+ "gsh-c02-m06"
695
+ ],
696
+ "forbidden_memory_ids": [
697
+ "gsh-c02-m10"
698
+ ],
699
+ "difficulty": "hard",
700
+ "architecture_bias_risk": "intentional_graph_stress",
701
+ "fairness_note": "Uses weak lexical overlap between release approval meeting and the internal ritual name."
702
+ },
703
+ {
704
+ "question_id": "gsh-c02-q03",
705
+ "category": "contradiction_resolution",
706
+ "question": "What is the current resolved state for rural asthma cohort?",
707
+ "expected_answer": "REDCap tracking with nightly export",
708
+ "required_memory_ids": [
709
+ "gsh-c02-m09"
710
+ ],
711
+ "forbidden_memory_ids": [
712
+ "gsh-c02-m07",
713
+ "gsh-c02-m08"
714
+ ],
715
+ "difficulty": "hard",
716
+ "architecture_bias_risk": "intentional_graph_stress",
717
+ "fairness_note": "Final memory must beat stale and interim states connected by replaces/replaced_by edges."
718
+ },
719
+ {
720
+ "question_id": "gsh-c02-q04",
721
+ "category": "graph_traversal",
722
+ "question": "For the approval chain in Devon's project, which team owns the internal ritual?",
723
+ "expected_answer": "ethics submission team",
724
+ "required_memory_ids": [
725
+ "gsh-c02-m04",
726
+ "gsh-c02-m05"
727
+ ],
728
+ "forbidden_memory_ids": [
729
+ "gsh-c02-m10"
730
+ ],
731
+ "difficulty": "medium",
732
+ "architecture_bias_risk": "intentional_graph_stress",
733
+ "fairness_note": "Retrieval should cross from the approval phrasing to the internal ritual and owner team."
734
+ },
735
+ {
736
+ "question_id": "gsh-c02-q05",
737
+ "category": "entity_disambiguation",
738
+ "question": "Who is the accountable owner for Devon's project, not Devan's similar effort?",
739
+ "expected_answer": "Priya Shah",
740
+ "required_memory_ids": [
741
+ "gsh-c02-m11"
742
+ ],
743
+ "forbidden_memory_ids": [
744
+ "gsh-c02-m10",
745
+ "gsh-c02-m12"
746
+ ],
747
+ "difficulty": "medium",
748
+ "architecture_bias_risk": "intentional_graph_stress",
749
+ "fairness_note": "Near-name distractors are intentionally tempting but wrong."
750
+ }
751
+ ]
752
+ },
753
+ {
754
+ "conversation_id": "gsh-c03",
755
+ "agent_id": "agent-finance-ops",
756
+ "domain": "finance",
757
+ "memory_records": [
758
+ {
759
+ "id": "gsh-c03-m01",
760
+ "type": "semantic",
761
+ "timestamp": "2026-05-03T09:00:00Z",
762
+ "content": "Mika uses codename Northstar for the active workstream, but the codename itself does not state the operational dependency.",
763
+ "tags": [
764
+ "northstar",
765
+ "codename"
766
+ ],
767
+ "importance": 0.7,
768
+ "metadata": {
769
+ "role": "anchor",
770
+ "source_turn": "t01"
771
+ },
772
+ "associations": [
773
+ {
774
+ "target_id": "gsh-c03-m02",
775
+ "strength": 0.9,
776
+ "relation": "alias_of"
777
+ }
778
+ ]
779
+ },
780
+ {
781
+ "id": "gsh-c03-m02",
782
+ "type": "semantic",
783
+ "timestamp": "2026-05-03T10:00:00Z",
784
+ "content": "Codename Northstar refers to the invoice anomaly monitor, which is governed through the vendor-risk workflow.",
785
+ "tags": [
786
+ "invoice",
787
+ "project-link"
788
+ ],
789
+ "importance": 0.8,
790
+ "metadata": {
791
+ "role": "bridge",
792
+ "source_turn": "t02"
793
+ },
794
+ "associations": [
795
+ {
796
+ "target_id": "gsh-c03-m01",
797
+ "strength": 0.9,
798
+ "relation": "alias_of"
799
+ },
800
+ {
801
+ "target_id": "gsh-c03-m03",
802
+ "strength": 0.9,
803
+ "relation": "depends_on"
804
+ }
805
+ ]
806
+ },
807
+ {
808
+ "id": "gsh-c03-m03",
809
+ "type": "semantic",
810
+ "timestamp": "2026-05-03T11:00:00Z",
811
+ "content": "The vendor-risk workflow depends on the BigQuery audit table; this dependency is not named in the codename discussion.",
812
+ "tags": [
813
+ "vendor-risk",
814
+ "dependency"
815
+ ],
816
+ "importance": 0.9,
817
+ "metadata": {
818
+ "role": "answer",
819
+ "source_turn": "t03"
820
+ },
821
+ "associations": [
822
+ {
823
+ "target_id": "gsh-c03-m02",
824
+ "strength": 0.9,
825
+ "relation": "supports"
826
+ }
827
+ ]
828
+ },
829
+ {
830
+ "id": "gsh-c03-m04",
831
+ "type": "episodic",
832
+ "timestamp": "2026-05-03T12:00:00Z",
833
+ "content": "Mika said the approval ritual is internally called Pine Review; outside notes describe it as the release approval meeting.",
834
+ "tags": [
835
+ "approval",
836
+ "ritual"
837
+ ],
838
+ "importance": 0.75,
839
+ "metadata": {
840
+ "role": "weak-overlap-anchor",
841
+ "source_turn": "t04"
842
+ },
843
+ "associations": [
844
+ {
845
+ "target_id": "gsh-c03-m05",
846
+ "strength": 0.9,
847
+ "relation": "owned_by"
848
+ }
849
+ ]
850
+ },
851
+ {
852
+ "id": "gsh-c03-m05",
853
+ "type": "semantic",
854
+ "timestamp": "2026-05-03T13:00:00Z",
855
+ "content": "Pine Review is owned by the controls validation team, not by the team whose name resembles the project codename.",
856
+ "tags": [
857
+ "approval",
858
+ "team"
859
+ ],
860
+ "importance": 0.82,
861
+ "metadata": {
862
+ "role": "weak-overlap-bridge",
863
+ "source_turn": "t05"
864
+ },
865
+ "associations": [
866
+ {
867
+ "target_id": "gsh-c03-m04",
868
+ "strength": 0.9,
869
+ "relation": "owns"
870
+ },
871
+ {
872
+ "target_id": "gsh-c03-m06",
873
+ "strength": 0.9,
874
+ "relation": "requires"
875
+ }
876
+ ]
877
+ },
878
+ {
879
+ "id": "gsh-c03-m06",
880
+ "type": "procedural",
881
+ "timestamp": "2026-05-03T14:00:00Z",
882
+ "content": "Before controls validation team signs off, they require the variance exception register.",
883
+ "tags": [
884
+ "approval",
885
+ "artifact"
886
+ ],
887
+ "importance": 0.88,
888
+ "metadata": {
889
+ "role": "weak-overlap-answer",
890
+ "source_turn": "t06"
891
+ },
892
+ "associations": [
893
+ {
894
+ "target_id": "gsh-c03-m05",
895
+ "strength": 0.9,
896
+ "relation": "required_by"
897
+ }
898
+ ]
899
+ },
900
+ {
901
+ "id": "gsh-c03-m07",
902
+ "type": "semantic",
903
+ "timestamp": "2026-05-03T15:00:00Z",
904
+ "content": "Initial state for invoice anomaly monitor: threshold at USD 500.",
905
+ "tags": [
906
+ "current-state",
907
+ "stale"
908
+ ],
909
+ "importance": 0.55,
910
+ "metadata": {
911
+ "role": "stale",
912
+ "source_turn": "t07"
913
+ },
914
+ "associations": [
915
+ {
916
+ "target_id": "gsh-c03-m08",
917
+ "strength": 0.95,
918
+ "relation": "replaced_by"
919
+ }
920
+ ]
921
+ },
922
+ {
923
+ "id": "gsh-c03-m08",
924
+ "type": "episodic",
925
+ "timestamp": "2026-05-03T16:00:00Z",
926
+ "content": "Interim update for invoice anomaly monitor: threshold at USD 750, replacing the initial state.",
927
+ "tags": [
928
+ "current-state",
929
+ "interim"
930
+ ],
931
+ "importance": 0.65,
932
+ "metadata": {
933
+ "role": "interim",
934
+ "source_turn": "t08"
935
+ },
936
+ "associations": [
937
+ {
938
+ "target_id": "gsh-c03-m07",
939
+ "strength": 0.95,
940
+ "relation": "replaces"
941
+ },
942
+ {
943
+ "target_id": "gsh-c03-m09",
944
+ "strength": 0.98,
945
+ "relation": "replaced_by"
946
+ }
947
+ ]
948
+ },
949
+ {
950
+ "id": "gsh-c03-m09",
951
+ "type": "semantic",
952
+ "timestamp": "2026-05-03T17:00:00Z",
953
+ "content": "Final resolved state for invoice anomaly monitor: threshold at USD 1,200 with CFO override. This supersedes all earlier versions.",
954
+ "tags": [
955
+ "current-state",
956
+ "final"
957
+ ],
958
+ "importance": 0.95,
959
+ "metadata": {
960
+ "role": "final",
961
+ "source_turn": "t09"
962
+ },
963
+ "associations": [
964
+ {
965
+ "target_id": "gsh-c03-m08",
966
+ "strength": 0.98,
967
+ "relation": "replaces"
968
+ },
969
+ {
970
+ "target_id": "gsh-c03-m07",
971
+ "strength": 0.98,
972
+ "relation": "replaces"
973
+ }
974
+ ]
975
+ },
976
+ {
977
+ "id": "gsh-c03-m10",
978
+ "type": "semantic",
979
+ "timestamp": "2026-05-03T18:00:00Z",
980
+ "content": "Distractor: Mica runs a similar-sounding effort with Snowflake staging table, but it is not Mika's invoice anomaly monitor.",
981
+ "tags": [
982
+ "near-entity",
983
+ "distractor"
984
+ ],
985
+ "importance": 0.5,
986
+ "metadata": {
987
+ "role": "distractor",
988
+ "source_turn": "t10"
989
+ },
990
+ "associations": []
991
+ },
992
+ {
993
+ "id": "gsh-c03-m11",
994
+ "type": "semantic",
995
+ "timestamp": "2026-05-03T19:00:00Z",
996
+ "content": "For Mika's invoice anomaly monitor, the accountable owner is Elena Costa.",
997
+ "tags": [
998
+ "near-entity",
999
+ "owner"
1000
+ ],
1001
+ "importance": 0.86,
1002
+ "metadata": {
1003
+ "role": "entity-answer",
1004
+ "source_turn": "t11"
1005
+ },
1006
+ "associations": [
1007
+ {
1008
+ "target_id": "gsh-c03-m02",
1009
+ "strength": 0.9,
1010
+ "relation": "owned_by"
1011
+ }
1012
+ ]
1013
+ },
1014
+ {
1015
+ "id": "gsh-c03-m12",
1016
+ "type": "semantic",
1017
+ "timestamp": "2026-05-03T20:00:00Z",
1018
+ "content": "Tempting gap: Mica's similar effort mentions insurance carrier, but Mika's records do not state that value.",
1019
+ "tags": [
1020
+ "abstention",
1021
+ "tempting-distractor"
1022
+ ],
1023
+ "importance": 0.45,
1024
+ "metadata": {
1025
+ "role": "abstention-distractor",
1026
+ "source_turn": "t12"
1027
+ },
1028
+ "associations": []
1029
+ }
1030
+ ],
1031
+ "questions": [
1032
+ {
1033
+ "question_id": "gsh-c03-q01",
1034
+ "category": "multi_hop_association",
1035
+ "question": "What operational dependency is ultimately used by Mika's codename Northstar?",
1036
+ "expected_answer": "BigQuery audit table",
1037
+ "required_memory_ids": [
1038
+ "gsh-c03-m01",
1039
+ "gsh-c03-m02",
1040
+ "gsh-c03-m03"
1041
+ ],
1042
+ "forbidden_memory_ids": [
1043
+ "gsh-c03-m10"
1044
+ ],
1045
+ "difficulty": "hard",
1046
+ "architecture_bias_risk": "intentional_graph_stress",
1047
+ "fairness_note": "Requires codename to project to bridge to dependency; no single memory contains both codename and answer."
1048
+ },
1049
+ {
1050
+ "question_id": "gsh-c03-q02",
1051
+ "category": "multi_hop_association",
1052
+ "question": "Which artifact is needed before the release approval meeting can be signed off for Mika's work?",
1053
+ "expected_answer": "variance exception register",
1054
+ "required_memory_ids": [
1055
+ "gsh-c03-m04",
1056
+ "gsh-c03-m05",
1057
+ "gsh-c03-m06"
1058
+ ],
1059
+ "forbidden_memory_ids": [
1060
+ "gsh-c03-m10"
1061
+ ],
1062
+ "difficulty": "hard",
1063
+ "architecture_bias_risk": "intentional_graph_stress",
1064
+ "fairness_note": "Uses weak lexical overlap between release approval meeting and the internal ritual name."
1065
+ },
1066
+ {
1067
+ "question_id": "gsh-c03-q03",
1068
+ "category": "contradiction_resolution",
1069
+ "question": "What is the current resolved state for invoice anomaly monitor?",
1070
+ "expected_answer": "threshold at USD 1,200 with CFO override",
1071
+ "required_memory_ids": [
1072
+ "gsh-c03-m09"
1073
+ ],
1074
+ "forbidden_memory_ids": [
1075
+ "gsh-c03-m07",
1076
+ "gsh-c03-m08"
1077
+ ],
1078
+ "difficulty": "hard",
1079
+ "architecture_bias_risk": "intentional_graph_stress",
1080
+ "fairness_note": "Final memory must beat stale and interim states connected by replaces/replaced_by edges."
1081
+ },
1082
+ {
1083
+ "question_id": "gsh-c03-q04",
1084
+ "category": "graph_traversal",
1085
+ "question": "For the approval chain in Mika's project, which team owns the internal ritual?",
1086
+ "expected_answer": "controls validation team",
1087
+ "required_memory_ids": [
1088
+ "gsh-c03-m04",
1089
+ "gsh-c03-m05"
1090
+ ],
1091
+ "forbidden_memory_ids": [
1092
+ "gsh-c03-m10"
1093
+ ],
1094
+ "difficulty": "medium",
1095
+ "architecture_bias_risk": "intentional_graph_stress",
1096
+ "fairness_note": "Retrieval should cross from the approval phrasing to the internal ritual and owner team."
1097
+ },
1098
+ {
1099
+ "question_id": "gsh-c03-q05",
1100
+ "category": "entity_disambiguation",
1101
+ "question": "Who is the accountable owner for Mika's project, not Mica's similar effort?",
1102
+ "expected_answer": "Elena Costa",
1103
+ "required_memory_ids": [
1104
+ "gsh-c03-m11"
1105
+ ],
1106
+ "forbidden_memory_ids": [
1107
+ "gsh-c03-m10",
1108
+ "gsh-c03-m12"
1109
+ ],
1110
+ "difficulty": "medium",
1111
+ "architecture_bias_risk": "intentional_graph_stress",
1112
+ "fairness_note": "Near-name distractors are intentionally tempting but wrong."
1113
+ }
1114
+ ]
1115
+ },
1116
+ {
1117
+ "conversation_id": "gsh-c04",
1118
+ "agent_id": "agent-legal-casework",
1119
+ "domain": "legal",
1120
+ "memory_records": [
1121
+ {
1122
+ "id": "gsh-c04-m01",
1123
+ "type": "semantic",
1124
+ "timestamp": "2026-05-04T09:00:00Z",
1125
+ "content": "Amina uses codename Harbor for the active workstream, but the codename itself does not state the operational dependency.",
1126
+ "tags": [
1127
+ "harbor",
1128
+ "codename"
1129
+ ],
1130
+ "importance": 0.7,
1131
+ "metadata": {
1132
+ "role": "anchor",
1133
+ "source_turn": "t01"
1134
+ },
1135
+ "associations": [
1136
+ {
1137
+ "target_id": "gsh-c04-m02",
1138
+ "strength": 0.9,
1139
+ "relation": "alias_of"
1140
+ }
1141
+ ]
1142
+ },
1143
+ {
1144
+ "id": "gsh-c04-m02",
1145
+ "type": "semantic",
1146
+ "timestamp": "2026-05-04T10:00:00Z",
1147
+ "content": "Codename Harbor refers to the lease-renewal dispute, which is governed through the tenant-notice timeline.",
1148
+ "tags": [
1149
+ "lease-renewal",
1150
+ "project-link"
1151
+ ],
1152
+ "importance": 0.8,
1153
+ "metadata": {
1154
+ "role": "bridge",
1155
+ "source_turn": "t02"
1156
+ },
1157
+ "associations": [
1158
+ {
1159
+ "target_id": "gsh-c04-m01",
1160
+ "strength": 0.9,
1161
+ "relation": "alias_of"
1162
+ },
1163
+ {
1164
+ "target_id": "gsh-c04-m03",
1165
+ "strength": 0.9,
1166
+ "relation": "depends_on"
1167
+ }
1168
+ ]
1169
+ },
1170
+ {
1171
+ "id": "gsh-c04-m03",
1172
+ "type": "semantic",
1173
+ "timestamp": "2026-05-04T11:00:00Z",
1174
+ "content": "The tenant-notice timeline depends on the Clause 14 cure-period memo; this dependency is not named in the codename discussion.",
1175
+ "tags": [
1176
+ "tenant-notice",
1177
+ "dependency"
1178
+ ],
1179
+ "importance": 0.9,
1180
+ "metadata": {
1181
+ "role": "answer",
1182
+ "source_turn": "t03"
1183
+ },
1184
+ "associations": [
1185
+ {
1186
+ "target_id": "gsh-c04-m02",
1187
+ "strength": 0.9,
1188
+ "relation": "supports"
1189
+ }
1190
+ ]
1191
+ },
1192
+ {
1193
+ "id": "gsh-c04-m04",
1194
+ "type": "episodic",
1195
+ "timestamp": "2026-05-04T12:00:00Z",
1196
+ "content": "Amina said the approval ritual is internally called Green Tab; outside notes describe it as the release approval meeting.",
1197
+ "tags": [
1198
+ "approval",
1199
+ "ritual"
1200
+ ],
1201
+ "importance": 0.75,
1202
+ "metadata": {
1203
+ "role": "weak-overlap-anchor",
1204
+ "source_turn": "t04"
1205
+ },
1206
+ "associations": [
1207
+ {
1208
+ "target_id": "gsh-c04-m05",
1209
+ "strength": 0.9,
1210
+ "relation": "owned_by"
1211
+ }
1212
+ ]
1213
+ },
1214
+ {
1215
+ "id": "gsh-c04-m05",
1216
+ "type": "semantic",
1217
+ "timestamp": "2026-05-04T13:00:00Z",
1218
+ "content": "Green Tab is owned by the paralegal review team, not by the team whose name resembles the project codename.",
1219
+ "tags": [
1220
+ "approval",
1221
+ "team"
1222
+ ],
1223
+ "importance": 0.82,
1224
+ "metadata": {
1225
+ "role": "weak-overlap-bridge",
1226
+ "source_turn": "t05"
1227
+ },
1228
+ "associations": [
1229
+ {
1230
+ "target_id": "gsh-c04-m04",
1231
+ "strength": 0.9,
1232
+ "relation": "owns"
1233
+ },
1234
+ {
1235
+ "target_id": "gsh-c04-m06",
1236
+ "strength": 0.9,
1237
+ "relation": "requires"
1238
+ }
1239
+ ]
1240
+ },
1241
+ {
1242
+ "id": "gsh-c04-m06",
1243
+ "type": "procedural",
1244
+ "timestamp": "2026-05-04T14:00:00Z",
1245
+ "content": "Before paralegal review team signs off, they require the service affidavit packet.",
1246
+ "tags": [
1247
+ "approval",
1248
+ "artifact"
1249
+ ],
1250
+ "importance": 0.88,
1251
+ "metadata": {
1252
+ "role": "weak-overlap-answer",
1253
+ "source_turn": "t06"
1254
+ },
1255
+ "associations": [
1256
+ {
1257
+ "target_id": "gsh-c04-m05",
1258
+ "strength": 0.9,
1259
+ "relation": "required_by"
1260
+ }
1261
+ ]
1262
+ },
1263
+ {
1264
+ "id": "gsh-c04-m07",
1265
+ "type": "semantic",
1266
+ "timestamp": "2026-05-04T15:00:00Z",
1267
+ "content": "Initial state for lease-renewal dispute: hearing expected in March.",
1268
+ "tags": [
1269
+ "current-state",
1270
+ "stale"
1271
+ ],
1272
+ "importance": 0.55,
1273
+ "metadata": {
1274
+ "role": "stale",
1275
+ "source_turn": "t07"
1276
+ },
1277
+ "associations": [
1278
+ {
1279
+ "target_id": "gsh-c04-m08",
1280
+ "strength": 0.95,
1281
+ "relation": "replaced_by"
1282
+ }
1283
+ ]
1284
+ },
1285
+ {
1286
+ "id": "gsh-c04-m08",
1287
+ "type": "episodic",
1288
+ "timestamp": "2026-05-04T16:00:00Z",
1289
+ "content": "Interim update for lease-renewal dispute: hearing moved to April, replacing the initial state.",
1290
+ "tags": [
1291
+ "current-state",
1292
+ "interim"
1293
+ ],
1294
+ "importance": 0.65,
1295
+ "metadata": {
1296
+ "role": "interim",
1297
+ "source_turn": "t08"
1298
+ },
1299
+ "associations": [
1300
+ {
1301
+ "target_id": "gsh-c04-m07",
1302
+ "strength": 0.95,
1303
+ "relation": "replaces"
1304
+ },
1305
+ {
1306
+ "target_id": "gsh-c04-m09",
1307
+ "strength": 0.98,
1308
+ "relation": "replaced_by"
1309
+ }
1310
+ ]
1311
+ },
1312
+ {
1313
+ "id": "gsh-c04-m09",
1314
+ "type": "semantic",
1315
+ "timestamp": "2026-05-04T17:00:00Z",
1316
+ "content": "Final resolved state for lease-renewal dispute: hearing stayed pending mediation. This supersedes all earlier versions.",
1317
+ "tags": [
1318
+ "current-state",
1319
+ "final"
1320
+ ],
1321
+ "importance": 0.95,
1322
+ "metadata": {
1323
+ "role": "final",
1324
+ "source_turn": "t09"
1325
+ },
1326
+ "associations": [
1327
+ {
1328
+ "target_id": "gsh-c04-m08",
1329
+ "strength": 0.98,
1330
+ "relation": "replaces"
1331
+ },
1332
+ {
1333
+ "target_id": "gsh-c04-m07",
1334
+ "strength": 0.98,
1335
+ "relation": "replaces"
1336
+ }
1337
+ ]
1338
+ },
1339
+ {
1340
+ "id": "gsh-c04-m10",
1341
+ "type": "semantic",
1342
+ "timestamp": "2026-05-04T18:00:00Z",
1343
+ "content": "Distractor: Ameena runs a similar-sounding effort with Clause 9 insurance memo, but it is not Amina's lease-renewal dispute.",
1344
+ "tags": [
1345
+ "near-entity",
1346
+ "distractor"
1347
+ ],
1348
+ "importance": 0.5,
1349
+ "metadata": {
1350
+ "role": "distractor",
1351
+ "source_turn": "t10"
1352
+ },
1353
+ "associations": []
1354
+ },
1355
+ {
1356
+ "id": "gsh-c04-m11",
1357
+ "type": "semantic",
1358
+ "timestamp": "2026-05-04T19:00:00Z",
1359
+ "content": "For Amina's lease-renewal dispute, the accountable owner is Marco Bell.",
1360
+ "tags": [
1361
+ "near-entity",
1362
+ "owner"
1363
+ ],
1364
+ "importance": 0.86,
1365
+ "metadata": {
1366
+ "role": "entity-answer",
1367
+ "source_turn": "t11"
1368
+ },
1369
+ "associations": [
1370
+ {
1371
+ "target_id": "gsh-c04-m02",
1372
+ "strength": 0.9,
1373
+ "relation": "owned_by"
1374
+ }
1375
+ ]
1376
+ },
1377
+ {
1378
+ "id": "gsh-c04-m12",
1379
+ "type": "semantic",
1380
+ "timestamp": "2026-05-04T20:00:00Z",
1381
+ "content": "Tempting gap: Ameena's similar effort mentions settlement floor, but Amina's records do not state that value.",
1382
+ "tags": [
1383
+ "abstention",
1384
+ "tempting-distractor"
1385
+ ],
1386
+ "importance": 0.45,
1387
+ "metadata": {
1388
+ "role": "abstention-distractor",
1389
+ "source_turn": "t12"
1390
+ },
1391
+ "associations": []
1392
+ }
1393
+ ],
1394
+ "questions": [
1395
+ {
1396
+ "question_id": "gsh-c04-q01",
1397
+ "category": "multi_hop_association",
1398
+ "question": "What operational dependency is ultimately used by Amina's codename Harbor?",
1399
+ "expected_answer": "Clause 14 cure-period memo",
1400
+ "required_memory_ids": [
1401
+ "gsh-c04-m01",
1402
+ "gsh-c04-m02",
1403
+ "gsh-c04-m03"
1404
+ ],
1405
+ "forbidden_memory_ids": [
1406
+ "gsh-c04-m10"
1407
+ ],
1408
+ "difficulty": "hard",
1409
+ "architecture_bias_risk": "intentional_graph_stress",
1410
+ "fairness_note": "Requires codename to project to bridge to dependency; no single memory contains both codename and answer."
1411
+ },
1412
+ {
1413
+ "question_id": "gsh-c04-q02",
1414
+ "category": "multi_hop_association",
1415
+ "question": "Which artifact is needed before the release approval meeting can be signed off for Amina's work?",
1416
+ "expected_answer": "service affidavit packet",
1417
+ "required_memory_ids": [
1418
+ "gsh-c04-m04",
1419
+ "gsh-c04-m05",
1420
+ "gsh-c04-m06"
1421
+ ],
1422
+ "forbidden_memory_ids": [
1423
+ "gsh-c04-m10"
1424
+ ],
1425
+ "difficulty": "hard",
1426
+ "architecture_bias_risk": "intentional_graph_stress",
1427
+ "fairness_note": "Uses weak lexical overlap between release approval meeting and the internal ritual name."
1428
+ },
1429
+ {
1430
+ "question_id": "gsh-c04-q03",
1431
+ "category": "contradiction_resolution",
1432
+ "question": "What is the current resolved state for lease-renewal dispute?",
1433
+ "expected_answer": "hearing stayed pending mediation",
1434
+ "required_memory_ids": [
1435
+ "gsh-c04-m09"
1436
+ ],
1437
+ "forbidden_memory_ids": [
1438
+ "gsh-c04-m07",
1439
+ "gsh-c04-m08"
1440
+ ],
1441
+ "difficulty": "hard",
1442
+ "architecture_bias_risk": "intentional_graph_stress",
1443
+ "fairness_note": "Final memory must beat stale and interim states connected by replaces/replaced_by edges."
1444
+ },
1445
+ {
1446
+ "question_id": "gsh-c04-q04",
1447
+ "category": "graph_traversal",
1448
+ "question": "For the approval chain in Amina's project, which team owns the internal ritual?",
1449
+ "expected_answer": "paralegal review team",
1450
+ "required_memory_ids": [
1451
+ "gsh-c04-m04",
1452
+ "gsh-c04-m05"
1453
+ ],
1454
+ "forbidden_memory_ids": [
1455
+ "gsh-c04-m10"
1456
+ ],
1457
+ "difficulty": "medium",
1458
+ "architecture_bias_risk": "intentional_graph_stress",
1459
+ "fairness_note": "Retrieval should cross from the approval phrasing to the internal ritual and owner team."
1460
+ },
1461
+ {
1462
+ "question_id": "gsh-c04-q05",
1463
+ "category": "entity_disambiguation",
1464
+ "question": "Who is the accountable owner for Amina's project, not Ameena's similar effort?",
1465
+ "expected_answer": "Marco Bell",
1466
+ "required_memory_ids": [
1467
+ "gsh-c04-m11"
1468
+ ],
1469
+ "forbidden_memory_ids": [
1470
+ "gsh-c04-m10",
1471
+ "gsh-c04-m12"
1472
+ ],
1473
+ "difficulty": "medium",
1474
+ "architecture_bias_risk": "intentional_graph_stress",
1475
+ "fairness_note": "Near-name distractors are intentionally tempting but wrong."
1476
+ }
1477
+ ]
1478
+ },
1479
+ {
1480
+ "conversation_id": "gsh-c05",
1481
+ "agent_id": "agent-education-design",
1482
+ "domain": "education",
1483
+ "memory_records": [
1484
+ {
1485
+ "id": "gsh-c05-m01",
1486
+ "type": "semantic",
1487
+ "timestamp": "2026-05-05T09:00:00Z",
1488
+ "content": "Leo uses codename Riverglass for the active workstream, but the codename itself does not state the operational dependency.",
1489
+ "tags": [
1490
+ "riverglass",
1491
+ "codename"
1492
+ ],
1493
+ "importance": 0.7,
1494
+ "metadata": {
1495
+ "role": "anchor",
1496
+ "source_turn": "t01"
1497
+ },
1498
+ "associations": [
1499
+ {
1500
+ "target_id": "gsh-c05-m02",
1501
+ "strength": 0.9,
1502
+ "relation": "alias_of"
1503
+ }
1504
+ ]
1505
+ },
1506
+ {
1507
+ "id": "gsh-c05-m02",
1508
+ "type": "semantic",
1509
+ "timestamp": "2026-05-05T10:00:00Z",
1510
+ "content": "Codename Riverglass refers to the ninth-grade climate unit, which is governed through the field-data practicum.",
1511
+ "tags": [
1512
+ "ninth-grade",
1513
+ "project-link"
1514
+ ],
1515
+ "importance": 0.8,
1516
+ "metadata": {
1517
+ "role": "bridge",
1518
+ "source_turn": "t02"
1519
+ },
1520
+ "associations": [
1521
+ {
1522
+ "target_id": "gsh-c05-m01",
1523
+ "strength": 0.9,
1524
+ "relation": "alias_of"
1525
+ },
1526
+ {
1527
+ "target_id": "gsh-c05-m03",
1528
+ "strength": 0.9,
1529
+ "relation": "depends_on"
1530
+ }
1531
+ ]
1532
+ },
1533
+ {
1534
+ "id": "gsh-c05-m03",
1535
+ "type": "semantic",
1536
+ "timestamp": "2026-05-05T11:00:00Z",
1537
+ "content": "The field-data practicum depends on the watershed sensor dataset; this dependency is not named in the codename discussion.",
1538
+ "tags": [
1539
+ "field-data",
1540
+ "dependency"
1541
+ ],
1542
+ "importance": 0.9,
1543
+ "metadata": {
1544
+ "role": "answer",
1545
+ "source_turn": "t03"
1546
+ },
1547
+ "associations": [
1548
+ {
1549
+ "target_id": "gsh-c05-m02",
1550
+ "strength": 0.9,
1551
+ "relation": "supports"
1552
+ }
1553
+ ]
1554
+ },
1555
+ {
1556
+ "id": "gsh-c05-m04",
1557
+ "type": "episodic",
1558
+ "timestamp": "2026-05-05T12:00:00Z",
1559
+ "content": "Leo said the approval ritual is internally called Lantern Check; outside notes describe it as the release approval meeting.",
1560
+ "tags": [
1561
+ "approval",
1562
+ "ritual"
1563
+ ],
1564
+ "importance": 0.75,
1565
+ "metadata": {
1566
+ "role": "weak-overlap-anchor",
1567
+ "source_turn": "t04"
1568
+ },
1569
+ "associations": [
1570
+ {
1571
+ "target_id": "gsh-c05-m05",
1572
+ "strength": 0.9,
1573
+ "relation": "owned_by"
1574
+ }
1575
+ ]
1576
+ },
1577
+ {
1578
+ "id": "gsh-c05-m05",
1579
+ "type": "semantic",
1580
+ "timestamp": "2026-05-05T13:00:00Z",
1581
+ "content": "Lantern Check is owned by the curriculum alignment team, not by the team whose name resembles the project codename.",
1582
+ "tags": [
1583
+ "approval",
1584
+ "team"
1585
+ ],
1586
+ "importance": 0.82,
1587
+ "metadata": {
1588
+ "role": "weak-overlap-bridge",
1589
+ "source_turn": "t05"
1590
+ },
1591
+ "associations": [
1592
+ {
1593
+ "target_id": "gsh-c05-m04",
1594
+ "strength": 0.9,
1595
+ "relation": "owns"
1596
+ },
1597
+ {
1598
+ "target_id": "gsh-c05-m06",
1599
+ "strength": 0.9,
1600
+ "relation": "requires"
1601
+ }
1602
+ ]
1603
+ },
1604
+ {
1605
+ "id": "gsh-c05-m06",
1606
+ "type": "procedural",
1607
+ "timestamp": "2026-05-05T14:00:00Z",
1608
+ "content": "Before curriculum alignment team signs off, they require the rubric calibration sheet.",
1609
+ "tags": [
1610
+ "approval",
1611
+ "artifact"
1612
+ ],
1613
+ "importance": 0.88,
1614
+ "metadata": {
1615
+ "role": "weak-overlap-answer",
1616
+ "source_turn": "t06"
1617
+ },
1618
+ "associations": [
1619
+ {
1620
+ "target_id": "gsh-c05-m05",
1621
+ "strength": 0.9,
1622
+ "relation": "required_by"
1623
+ }
1624
+ ]
1625
+ },
1626
+ {
1627
+ "id": "gsh-c05-m07",
1628
+ "type": "semantic",
1629
+ "timestamp": "2026-05-05T15:00:00Z",
1630
+ "content": "Initial state for ninth-grade climate unit: final project as a poster.",
1631
+ "tags": [
1632
+ "current-state",
1633
+ "stale"
1634
+ ],
1635
+ "importance": 0.55,
1636
+ "metadata": {
1637
+ "role": "stale",
1638
+ "source_turn": "t07"
1639
+ },
1640
+ "associations": [
1641
+ {
1642
+ "target_id": "gsh-c05-m08",
1643
+ "strength": 0.95,
1644
+ "relation": "replaced_by"
1645
+ }
1646
+ ]
1647
+ },
1648
+ {
1649
+ "id": "gsh-c05-m08",
1650
+ "type": "episodic",
1651
+ "timestamp": "2026-05-05T16:00:00Z",
1652
+ "content": "Interim update for ninth-grade climate unit: final project as a slide deck, replacing the initial state.",
1653
+ "tags": [
1654
+ "current-state",
1655
+ "interim"
1656
+ ],
1657
+ "importance": 0.65,
1658
+ "metadata": {
1659
+ "role": "interim",
1660
+ "source_turn": "t08"
1661
+ },
1662
+ "associations": [
1663
+ {
1664
+ "target_id": "gsh-c05-m07",
1665
+ "strength": 0.95,
1666
+ "relation": "replaces"
1667
+ },
1668
+ {
1669
+ "target_id": "gsh-c05-m09",
1670
+ "strength": 0.98,
1671
+ "relation": "replaced_by"
1672
+ }
1673
+ ]
1674
+ },
1675
+ {
1676
+ "id": "gsh-c05-m09",
1677
+ "type": "semantic",
1678
+ "timestamp": "2026-05-05T17:00:00Z",
1679
+ "content": "Final resolved state for ninth-grade climate unit: final project as a data story notebook. This supersedes all earlier versions.",
1680
+ "tags": [
1681
+ "current-state",
1682
+ "final"
1683
+ ],
1684
+ "importance": 0.95,
1685
+ "metadata": {
1686
+ "role": "final",
1687
+ "source_turn": "t09"
1688
+ },
1689
+ "associations": [
1690
+ {
1691
+ "target_id": "gsh-c05-m08",
1692
+ "strength": 0.98,
1693
+ "relation": "replaces"
1694
+ },
1695
+ {
1696
+ "target_id": "gsh-c05-m07",
1697
+ "strength": 0.98,
1698
+ "relation": "replaces"
1699
+ }
1700
+ ]
1701
+ },
1702
+ {
1703
+ "id": "gsh-c05-m10",
1704
+ "type": "semantic",
1705
+ "timestamp": "2026-05-05T18:00:00Z",
1706
+ "content": "Distractor: Lio runs a similar-sounding effort with weather-station photo set, but it is not Leo's ninth-grade climate unit.",
1707
+ "tags": [
1708
+ "near-entity",
1709
+ "distractor"
1710
+ ],
1711
+ "importance": 0.5,
1712
+ "metadata": {
1713
+ "role": "distractor",
1714
+ "source_turn": "t10"
1715
+ },
1716
+ "associations": []
1717
+ },
1718
+ {
1719
+ "id": "gsh-c05-m11",
1720
+ "type": "semantic",
1721
+ "timestamp": "2026-05-05T19:00:00Z",
1722
+ "content": "For Leo's ninth-grade climate unit, the accountable owner is Nora Iqbal.",
1723
+ "tags": [
1724
+ "near-entity",
1725
+ "owner"
1726
+ ],
1727
+ "importance": 0.86,
1728
+ "metadata": {
1729
+ "role": "entity-answer",
1730
+ "source_turn": "t11"
1731
+ },
1732
+ "associations": [
1733
+ {
1734
+ "target_id": "gsh-c05-m02",
1735
+ "strength": 0.9,
1736
+ "relation": "owned_by"
1737
+ }
1738
+ ]
1739
+ },
1740
+ {
1741
+ "id": "gsh-c05-m12",
1742
+ "type": "semantic",
1743
+ "timestamp": "2026-05-05T20:00:00Z",
1744
+ "content": "Tempting gap: Lio's similar effort mentions bus pickup location, but Leo's records do not state that value.",
1745
+ "tags": [
1746
+ "abstention",
1747
+ "tempting-distractor"
1748
+ ],
1749
+ "importance": 0.45,
1750
+ "metadata": {
1751
+ "role": "abstention-distractor",
1752
+ "source_turn": "t12"
1753
+ },
1754
+ "associations": []
1755
+ }
1756
+ ],
1757
+ "questions": [
1758
+ {
1759
+ "question_id": "gsh-c05-q01",
1760
+ "category": "multi_hop_association",
1761
+ "question": "What operational dependency is ultimately used by Leo's codename Riverglass?",
1762
+ "expected_answer": "watershed sensor dataset",
1763
+ "required_memory_ids": [
1764
+ "gsh-c05-m01",
1765
+ "gsh-c05-m02",
1766
+ "gsh-c05-m03"
1767
+ ],
1768
+ "forbidden_memory_ids": [
1769
+ "gsh-c05-m10"
1770
+ ],
1771
+ "difficulty": "hard",
1772
+ "architecture_bias_risk": "intentional_graph_stress",
1773
+ "fairness_note": "Requires codename to project to bridge to dependency; no single memory contains both codename and answer."
1774
+ },
1775
+ {
1776
+ "question_id": "gsh-c05-q02",
1777
+ "category": "multi_hop_association",
1778
+ "question": "Which artifact is needed before the release approval meeting can be signed off for Leo's work?",
1779
+ "expected_answer": "rubric calibration sheet",
1780
+ "required_memory_ids": [
1781
+ "gsh-c05-m04",
1782
+ "gsh-c05-m05",
1783
+ "gsh-c05-m06"
1784
+ ],
1785
+ "forbidden_memory_ids": [
1786
+ "gsh-c05-m10"
1787
+ ],
1788
+ "difficulty": "hard",
1789
+ "architecture_bias_risk": "intentional_graph_stress",
1790
+ "fairness_note": "Uses weak lexical overlap between release approval meeting and the internal ritual name."
1791
+ },
1792
+ {
1793
+ "question_id": "gsh-c05-q03",
1794
+ "category": "contradiction_resolution",
1795
+ "question": "What is the current resolved state for ninth-grade climate unit?",
1796
+ "expected_answer": "final project as a data story notebook",
1797
+ "required_memory_ids": [
1798
+ "gsh-c05-m09"
1799
+ ],
1800
+ "forbidden_memory_ids": [
1801
+ "gsh-c05-m07",
1802
+ "gsh-c05-m08"
1803
+ ],
1804
+ "difficulty": "hard",
1805
+ "architecture_bias_risk": "intentional_graph_stress",
1806
+ "fairness_note": "Final memory must beat stale and interim states connected by replaces/replaced_by edges."
1807
+ },
1808
+ {
1809
+ "question_id": "gsh-c05-q04",
1810
+ "category": "graph_traversal",
1811
+ "question": "For the approval chain in Leo's project, which team owns the internal ritual?",
1812
+ "expected_answer": "curriculum alignment team",
1813
+ "required_memory_ids": [
1814
+ "gsh-c05-m04",
1815
+ "gsh-c05-m05"
1816
+ ],
1817
+ "forbidden_memory_ids": [
1818
+ "gsh-c05-m10"
1819
+ ],
1820
+ "difficulty": "medium",
1821
+ "architecture_bias_risk": "intentional_graph_stress",
1822
+ "fairness_note": "Retrieval should cross from the approval phrasing to the internal ritual and owner team."
1823
+ },
1824
+ {
1825
+ "question_id": "gsh-c05-q05",
1826
+ "category": "entity_disambiguation",
1827
+ "question": "Who is the accountable owner for Leo's project, not Lio's similar effort?",
1828
+ "expected_answer": "Nora Iqbal",
1829
+ "required_memory_ids": [
1830
+ "gsh-c05-m11"
1831
+ ],
1832
+ "forbidden_memory_ids": [
1833
+ "gsh-c05-m10",
1834
+ "gsh-c05-m12"
1835
+ ],
1836
+ "difficulty": "medium",
1837
+ "architecture_bias_risk": "intentional_graph_stress",
1838
+ "fairness_note": "Near-name distractors are intentionally tempting but wrong."
1839
+ }
1840
+ ]
1841
+ },
1842
+ {
1843
+ "conversation_id": "gsh-c06",
1844
+ "agent_id": "agent-health-coach",
1845
+ "domain": "healthcare",
1846
+ "memory_records": [
1847
+ {
1848
+ "id": "gsh-c06-m01",
1849
+ "type": "semantic",
1850
+ "timestamp": "2026-05-06T09:00:00Z",
1851
+ "content": "Samir uses codename Quiet Mile for the active workstream, but the codename itself does not state the operational dependency.",
1852
+ "tags": [
1853
+ "quiet-mile",
1854
+ "codename"
1855
+ ],
1856
+ "importance": 0.7,
1857
+ "metadata": {
1858
+ "role": "anchor",
1859
+ "source_turn": "t01"
1860
+ },
1861
+ "associations": [
1862
+ {
1863
+ "target_id": "gsh-c06-m02",
1864
+ "strength": 0.9,
1865
+ "relation": "alias_of"
1866
+ }
1867
+ ]
1868
+ },
1869
+ {
1870
+ "id": "gsh-c06-m02",
1871
+ "type": "semantic",
1872
+ "timestamp": "2026-05-06T10:00:00Z",
1873
+ "content": "Codename Quiet Mile refers to the post-surgery walking plan, which is governed through the mobility milestone tracker.",
1874
+ "tags": [
1875
+ "post-surgery",
1876
+ "project-link"
1877
+ ],
1878
+ "importance": 0.8,
1879
+ "metadata": {
1880
+ "role": "bridge",
1881
+ "source_turn": "t02"
1882
+ },
1883
+ "associations": [
1884
+ {
1885
+ "target_id": "gsh-c06-m01",
1886
+ "strength": 0.9,
1887
+ "relation": "alias_of"
1888
+ },
1889
+ {
1890
+ "target_id": "gsh-c06-m03",
1891
+ "strength": 0.9,
1892
+ "relation": "depends_on"
1893
+ }
1894
+ ]
1895
+ },
1896
+ {
1897
+ "id": "gsh-c06-m03",
1898
+ "type": "semantic",
1899
+ "timestamp": "2026-05-06T11:00:00Z",
1900
+ "content": "The mobility milestone tracker depends on the 10-minute evening walk; this dependency is not named in the codename discussion.",
1901
+ "tags": [
1902
+ "mobility",
1903
+ "dependency"
1904
+ ],
1905
+ "importance": 0.9,
1906
+ "metadata": {
1907
+ "role": "answer",
1908
+ "source_turn": "t03"
1909
+ },
1910
+ "associations": [
1911
+ {
1912
+ "target_id": "gsh-c06-m02",
1913
+ "strength": 0.9,
1914
+ "relation": "supports"
1915
+ }
1916
+ ]
1917
+ },
1918
+ {
1919
+ "id": "gsh-c06-m04",
1920
+ "type": "episodic",
1921
+ "timestamp": "2026-05-06T12:00:00Z",
1922
+ "content": "Samir said the approval ritual is internally called White Card; outside notes describe it as the release approval meeting.",
1923
+ "tags": [
1924
+ "approval",
1925
+ "ritual"
1926
+ ],
1927
+ "importance": 0.75,
1928
+ "metadata": {
1929
+ "role": "weak-overlap-anchor",
1930
+ "source_turn": "t04"
1931
+ },
1932
+ "associations": [
1933
+ {
1934
+ "target_id": "gsh-c06-m05",
1935
+ "strength": 0.9,
1936
+ "relation": "owned_by"
1937
+ }
1938
+ ]
1939
+ },
1940
+ {
1941
+ "id": "gsh-c06-m05",
1942
+ "type": "semantic",
1943
+ "timestamp": "2026-05-06T13:00:00Z",
1944
+ "content": "White Card is owned by the care-transition team, not by the team whose name resembles the project codename.",
1945
+ "tags": [
1946
+ "approval",
1947
+ "team"
1948
+ ],
1949
+ "importance": 0.82,
1950
+ "metadata": {
1951
+ "role": "weak-overlap-bridge",
1952
+ "source_turn": "t05"
1953
+ },
1954
+ "associations": [
1955
+ {
1956
+ "target_id": "gsh-c06-m04",
1957
+ "strength": 0.9,
1958
+ "relation": "owns"
1959
+ },
1960
+ {
1961
+ "target_id": "gsh-c06-m06",
1962
+ "strength": 0.9,
1963
+ "relation": "requires"
1964
+ }
1965
+ ]
1966
+ },
1967
+ {
1968
+ "id": "gsh-c06-m06",
1969
+ "type": "procedural",
1970
+ "timestamp": "2026-05-06T14:00:00Z",
1971
+ "content": "Before care-transition team signs off, they require the pain-escalation checklist.",
1972
+ "tags": [
1973
+ "approval",
1974
+ "artifact"
1975
+ ],
1976
+ "importance": 0.88,
1977
+ "metadata": {
1978
+ "role": "weak-overlap-answer",
1979
+ "source_turn": "t06"
1980
+ },
1981
+ "associations": [
1982
+ {
1983
+ "target_id": "gsh-c06-m05",
1984
+ "strength": 0.9,
1985
+ "relation": "required_by"
1986
+ }
1987
+ ]
1988
+ },
1989
+ {
1990
+ "id": "gsh-c06-m07",
1991
+ "type": "semantic",
1992
+ "timestamp": "2026-05-06T15:00:00Z",
1993
+ "content": "Initial state for post-surgery walking plan: stairs avoided completely.",
1994
+ "tags": [
1995
+ "current-state",
1996
+ "stale"
1997
+ ],
1998
+ "importance": 0.55,
1999
+ "metadata": {
2000
+ "role": "stale",
2001
+ "source_turn": "t07"
2002
+ },
2003
+ "associations": [
2004
+ {
2005
+ "target_id": "gsh-c06-m08",
2006
+ "strength": 0.95,
2007
+ "relation": "replaced_by"
2008
+ }
2009
+ ]
2010
+ },
2011
+ {
2012
+ "id": "gsh-c06-m08",
2013
+ "type": "episodic",
2014
+ "timestamp": "2026-05-06T16:00:00Z",
2015
+ "content": "Interim update for post-surgery walking plan: stairs allowed with supervision, replacing the initial state.",
2016
+ "tags": [
2017
+ "current-state",
2018
+ "interim"
2019
+ ],
2020
+ "importance": 0.65,
2021
+ "metadata": {
2022
+ "role": "interim",
2023
+ "source_turn": "t08"
2024
+ },
2025
+ "associations": [
2026
+ {
2027
+ "target_id": "gsh-c06-m07",
2028
+ "strength": 0.95,
2029
+ "relation": "replaces"
2030
+ },
2031
+ {
2032
+ "target_id": "gsh-c06-m09",
2033
+ "strength": 0.98,
2034
+ "relation": "replaced_by"
2035
+ }
2036
+ ]
2037
+ },
2038
+ {
2039
+ "id": "gsh-c06-m09",
2040
+ "type": "semantic",
2041
+ "timestamp": "2026-05-06T17:00:00Z",
2042
+ "content": "Final resolved state for post-surgery walking plan: stairs allowed twice daily with handrail. This supersedes all earlier versions.",
2043
+ "tags": [
2044
+ "current-state",
2045
+ "final"
2046
+ ],
2047
+ "importance": 0.95,
2048
+ "metadata": {
2049
+ "role": "final",
2050
+ "source_turn": "t09"
2051
+ },
2052
+ "associations": [
2053
+ {
2054
+ "target_id": "gsh-c06-m08",
2055
+ "strength": 0.98,
2056
+ "relation": "replaces"
2057
+ },
2058
+ {
2059
+ "target_id": "gsh-c06-m07",
2060
+ "strength": 0.98,
2061
+ "relation": "replaces"
2062
+ }
2063
+ ]
2064
+ },
2065
+ {
2066
+ "id": "gsh-c06-m10",
2067
+ "type": "semantic",
2068
+ "timestamp": "2026-05-06T18:00:00Z",
2069
+ "content": "Distractor: Samira runs a similar-sounding effort with 15-minute morning bike, but it is not Samir's post-surgery walking plan.",
2070
+ "tags": [
2071
+ "near-entity",
2072
+ "distractor"
2073
+ ],
2074
+ "importance": 0.5,
2075
+ "metadata": {
2076
+ "role": "distractor",
2077
+ "source_turn": "t10"
2078
+ },
2079
+ "associations": []
2080
+ },
2081
+ {
2082
+ "id": "gsh-c06-m11",
2083
+ "type": "semantic",
2084
+ "timestamp": "2026-05-06T19:00:00Z",
2085
+ "content": "For Samir's post-surgery walking plan, the accountable owner is Dr. Lin Patel.",
2086
+ "tags": [
2087
+ "near-entity",
2088
+ "owner"
2089
+ ],
2090
+ "importance": 0.86,
2091
+ "metadata": {
2092
+ "role": "entity-answer",
2093
+ "source_turn": "t11"
2094
+ },
2095
+ "associations": [
2096
+ {
2097
+ "target_id": "gsh-c06-m02",
2098
+ "strength": 0.9,
2099
+ "relation": "owned_by"
2100
+ }
2101
+ ]
2102
+ },
2103
+ {
2104
+ "id": "gsh-c06-m12",
2105
+ "type": "semantic",
2106
+ "timestamp": "2026-05-06T20:00:00Z",
2107
+ "content": "Tempting gap: Samira's similar effort mentions pharmacy copay, but Samir's records do not state that value.",
2108
+ "tags": [
2109
+ "abstention",
2110
+ "tempting-distractor"
2111
+ ],
2112
+ "importance": 0.45,
2113
+ "metadata": {
2114
+ "role": "abstention-distractor",
2115
+ "source_turn": "t12"
2116
+ },
2117
+ "associations": []
2118
+ }
2119
+ ],
2120
+ "questions": [
2121
+ {
2122
+ "question_id": "gsh-c06-q01",
2123
+ "category": "multi_hop_association",
2124
+ "question": "What operational dependency is ultimately used by Samir's codename Quiet Mile?",
2125
+ "expected_answer": "10-minute evening walk",
2126
+ "required_memory_ids": [
2127
+ "gsh-c06-m01",
2128
+ "gsh-c06-m02",
2129
+ "gsh-c06-m03"
2130
+ ],
2131
+ "forbidden_memory_ids": [
2132
+ "gsh-c06-m10"
2133
+ ],
2134
+ "difficulty": "hard",
2135
+ "architecture_bias_risk": "intentional_graph_stress",
2136
+ "fairness_note": "Requires codename to project to bridge to dependency; no single memory contains both codename and answer."
2137
+ },
2138
+ {
2139
+ "question_id": "gsh-c06-q02",
2140
+ "category": "multi_hop_association",
2141
+ "question": "Which artifact is needed before the release approval meeting can be signed off for Samir's work?",
2142
+ "expected_answer": "pain-escalation checklist",
2143
+ "required_memory_ids": [
2144
+ "gsh-c06-m04",
2145
+ "gsh-c06-m05",
2146
+ "gsh-c06-m06"
2147
+ ],
2148
+ "forbidden_memory_ids": [
2149
+ "gsh-c06-m10"
2150
+ ],
2151
+ "difficulty": "hard",
2152
+ "architecture_bias_risk": "intentional_graph_stress",
2153
+ "fairness_note": "Uses weak lexical overlap between release approval meeting and the internal ritual name."
2154
+ },
2155
+ {
2156
+ "question_id": "gsh-c06-q03",
2157
+ "category": "contradiction_resolution",
2158
+ "question": "What is the current resolved state for post-surgery walking plan?",
2159
+ "expected_answer": "stairs allowed twice daily with handrail",
2160
+ "required_memory_ids": [
2161
+ "gsh-c06-m09"
2162
+ ],
2163
+ "forbidden_memory_ids": [
2164
+ "gsh-c06-m07",
2165
+ "gsh-c06-m08"
2166
+ ],
2167
+ "difficulty": "hard",
2168
+ "architecture_bias_risk": "intentional_graph_stress",
2169
+ "fairness_note": "Final memory must beat stale and interim states connected by replaces/replaced_by edges."
2170
+ },
2171
+ {
2172
+ "question_id": "gsh-c06-q04",
2173
+ "category": "graph_traversal",
2174
+ "question": "For the approval chain in Samir's project, which team owns the internal ritual?",
2175
+ "expected_answer": "care-transition team",
2176
+ "required_memory_ids": [
2177
+ "gsh-c06-m04",
2178
+ "gsh-c06-m05"
2179
+ ],
2180
+ "forbidden_memory_ids": [
2181
+ "gsh-c06-m10"
2182
+ ],
2183
+ "difficulty": "medium",
2184
+ "architecture_bias_risk": "intentional_graph_stress",
2185
+ "fairness_note": "Retrieval should cross from the approval phrasing to the internal ritual and owner team."
2186
+ },
2187
+ {
2188
+ "question_id": "gsh-c06-q05",
2189
+ "category": "entity_disambiguation",
2190
+ "question": "Who is the accountable owner for Samir's project, not Samira's similar effort?",
2191
+ "expected_answer": "Dr. Lin Patel",
2192
+ "required_memory_ids": [
2193
+ "gsh-c06-m11"
2194
+ ],
2195
+ "forbidden_memory_ids": [
2196
+ "gsh-c06-m10",
2197
+ "gsh-c06-m12"
2198
+ ],
2199
+ "difficulty": "medium",
2200
+ "architecture_bias_risk": "intentional_graph_stress",
2201
+ "fairness_note": "Near-name distractors are intentionally tempting but wrong."
2202
+ }
2203
+ ]
2204
+ },
2205
+ {
2206
+ "conversation_id": "gsh-c07",
2207
+ "agent_id": "agent-creative-studio",
2208
+ "domain": "creative",
2209
+ "memory_records": [
2210
+ {
2211
+ "id": "gsh-c07-m01",
2212
+ "type": "semantic",
2213
+ "timestamp": "2026-05-07T09:00:00Z",
2214
+ "content": "Iskandar uses codename Glass Orchard for the active workstream, but the codename itself does not state the operational dependency.",
2215
+ "tags": [
2216
+ "glass-orchard",
2217
+ "codename"
2218
+ ],
2219
+ "importance": 0.7,
2220
+ "metadata": {
2221
+ "role": "anchor",
2222
+ "source_turn": "t01"
2223
+ },
2224
+ "associations": [
2225
+ {
2226
+ "target_id": "gsh-c07-m02",
2227
+ "strength": 0.9,
2228
+ "relation": "alias_of"
2229
+ }
2230
+ ]
2231
+ },
2232
+ {
2233
+ "id": "gsh-c07-m02",
2234
+ "type": "semantic",
2235
+ "timestamp": "2026-05-07T10:00:00Z",
2236
+ "content": "Codename Glass Orchard refers to the ambient album launch, which is governed through the vinyl preorder campaign.",
2237
+ "tags": [
2238
+ "ambient",
2239
+ "project-link"
2240
+ ],
2241
+ "importance": 0.8,
2242
+ "metadata": {
2243
+ "role": "bridge",
2244
+ "source_turn": "t02"
2245
+ },
2246
+ "associations": [
2247
+ {
2248
+ "target_id": "gsh-c07-m01",
2249
+ "strength": 0.9,
2250
+ "relation": "alias_of"
2251
+ },
2252
+ {
2253
+ "target_id": "gsh-c07-m03",
2254
+ "strength": 0.9,
2255
+ "relation": "depends_on"
2256
+ }
2257
+ ]
2258
+ },
2259
+ {
2260
+ "id": "gsh-c07-m03",
2261
+ "type": "semantic",
2262
+ "timestamp": "2026-05-07T11:00:00Z",
2263
+ "content": "The vinyl preorder campaign depends on the Bandcamp private presale page; this dependency is not named in the codename discussion.",
2264
+ "tags": [
2265
+ "vinyl",
2266
+ "dependency"
2267
+ ],
2268
+ "importance": 0.9,
2269
+ "metadata": {
2270
+ "role": "answer",
2271
+ "source_turn": "t03"
2272
+ },
2273
+ "associations": [
2274
+ {
2275
+ "target_id": "gsh-c07-m02",
2276
+ "strength": 0.9,
2277
+ "relation": "supports"
2278
+ }
2279
+ ]
2280
+ },
2281
+ {
2282
+ "id": "gsh-c07-m04",
2283
+ "type": "episodic",
2284
+ "timestamp": "2026-05-07T12:00:00Z",
2285
+ "content": "Iskandar said the approval ritual is internally called Blue Room; outside notes describe it as the release approval meeting.",
2286
+ "tags": [
2287
+ "approval",
2288
+ "ritual"
2289
+ ],
2290
+ "importance": 0.75,
2291
+ "metadata": {
2292
+ "role": "weak-overlap-anchor",
2293
+ "source_turn": "t04"
2294
+ },
2295
+ "associations": [
2296
+ {
2297
+ "target_id": "gsh-c07-m05",
2298
+ "strength": 0.9,
2299
+ "relation": "owned_by"
2300
+ }
2301
+ ]
2302
+ },
2303
+ {
2304
+ "id": "gsh-c07-m05",
2305
+ "type": "semantic",
2306
+ "timestamp": "2026-05-07T13:00:00Z",
2307
+ "content": "Blue Room is owned by the mastering coordination team, not by the team whose name resembles the project codename.",
2308
+ "tags": [
2309
+ "approval",
2310
+ "team"
2311
+ ],
2312
+ "importance": 0.82,
2313
+ "metadata": {
2314
+ "role": "weak-overlap-bridge",
2315
+ "source_turn": "t05"
2316
+ },
2317
+ "associations": [
2318
+ {
2319
+ "target_id": "gsh-c07-m04",
2320
+ "strength": 0.9,
2321
+ "relation": "owns"
2322
+ },
2323
+ {
2324
+ "target_id": "gsh-c07-m06",
2325
+ "strength": 0.9,
2326
+ "relation": "requires"
2327
+ }
2328
+ ]
2329
+ },
2330
+ {
2331
+ "id": "gsh-c07-m06",
2332
+ "type": "procedural",
2333
+ "timestamp": "2026-05-07T14:00:00Z",
2334
+ "content": "Before mastering coordination team signs off, they require the final lacquer approval note.",
2335
+ "tags": [
2336
+ "approval",
2337
+ "artifact"
2338
+ ],
2339
+ "importance": 0.88,
2340
+ "metadata": {
2341
+ "role": "weak-overlap-answer",
2342
+ "source_turn": "t06"
2343
+ },
2344
+ "associations": [
2345
+ {
2346
+ "target_id": "gsh-c07-m05",
2347
+ "strength": 0.9,
2348
+ "relation": "required_by"
2349
+ }
2350
+ ]
2351
+ },
2352
+ {
2353
+ "id": "gsh-c07-m07",
2354
+ "type": "semantic",
2355
+ "timestamp": "2026-05-07T15:00:00Z",
2356
+ "content": "Initial state for ambient album launch: release set for July.",
2357
+ "tags": [
2358
+ "current-state",
2359
+ "stale"
2360
+ ],
2361
+ "importance": 0.55,
2362
+ "metadata": {
2363
+ "role": "stale",
2364
+ "source_turn": "t07"
2365
+ },
2366
+ "associations": [
2367
+ {
2368
+ "target_id": "gsh-c07-m08",
2369
+ "strength": 0.95,
2370
+ "relation": "replaced_by"
2371
+ }
2372
+ ]
2373
+ },
2374
+ {
2375
+ "id": "gsh-c07-m08",
2376
+ "type": "episodic",
2377
+ "timestamp": "2026-05-07T16:00:00Z",
2378
+ "content": "Interim update for ambient album launch: release moved to August, replacing the initial state.",
2379
+ "tags": [
2380
+ "current-state",
2381
+ "interim"
2382
+ ],
2383
+ "importance": 0.65,
2384
+ "metadata": {
2385
+ "role": "interim",
2386
+ "source_turn": "t08"
2387
+ },
2388
+ "associations": [
2389
+ {
2390
+ "target_id": "gsh-c07-m07",
2391
+ "strength": 0.95,
2392
+ "relation": "replaces"
2393
+ },
2394
+ {
2395
+ "target_id": "gsh-c07-m09",
2396
+ "strength": 0.98,
2397
+ "relation": "replaced_by"
2398
+ }
2399
+ ]
2400
+ },
2401
+ {
2402
+ "id": "gsh-c07-m09",
2403
+ "type": "semantic",
2404
+ "timestamp": "2026-05-07T17:00:00Z",
2405
+ "content": "Final resolved state for ambient album launch: release locked for September 6. This supersedes all earlier versions.",
2406
+ "tags": [
2407
+ "current-state",
2408
+ "final"
2409
+ ],
2410
+ "importance": 0.95,
2411
+ "metadata": {
2412
+ "role": "final",
2413
+ "source_turn": "t09"
2414
+ },
2415
+ "associations": [
2416
+ {
2417
+ "target_id": "gsh-c07-m08",
2418
+ "strength": 0.98,
2419
+ "relation": "replaces"
2420
+ },
2421
+ {
2422
+ "target_id": "gsh-c07-m07",
2423
+ "strength": 0.98,
2424
+ "relation": "replaces"
2425
+ }
2426
+ ]
2427
+ },
2428
+ {
2429
+ "id": "gsh-c07-m10",
2430
+ "type": "semantic",
2431
+ "timestamp": "2026-05-07T18:00:00Z",
2432
+ "content": "Distractor: Iskander runs a similar-sounding effort with Spotify canvas package, but it is not Iskandar's ambient album launch.",
2433
+ "tags": [
2434
+ "near-entity",
2435
+ "distractor"
2436
+ ],
2437
+ "importance": 0.5,
2438
+ "metadata": {
2439
+ "role": "distractor",
2440
+ "source_turn": "t10"
2441
+ },
2442
+ "associations": []
2443
+ },
2444
+ {
2445
+ "id": "gsh-c07-m11",
2446
+ "type": "semantic",
2447
+ "timestamp": "2026-05-07T19:00:00Z",
2448
+ "content": "For Iskandar's ambient album launch, the accountable owner is Marta Velasquez.",
2449
+ "tags": [
2450
+ "near-entity",
2451
+ "owner"
2452
+ ],
2453
+ "importance": 0.86,
2454
+ "metadata": {
2455
+ "role": "entity-answer",
2456
+ "source_turn": "t11"
2457
+ },
2458
+ "associations": [
2459
+ {
2460
+ "target_id": "gsh-c07-m02",
2461
+ "strength": 0.9,
2462
+ "relation": "owned_by"
2463
+ }
2464
+ ]
2465
+ },
2466
+ {
2467
+ "id": "gsh-c07-m12",
2468
+ "type": "semantic",
2469
+ "timestamp": "2026-05-07T20:00:00Z",
2470
+ "content": "Tempting gap: Iskander's similar effort mentions tour opener, but Iskandar's records do not state that value.",
2471
+ "tags": [
2472
+ "abstention",
2473
+ "tempting-distractor"
2474
+ ],
2475
+ "importance": 0.45,
2476
+ "metadata": {
2477
+ "role": "abstention-distractor",
2478
+ "source_turn": "t12"
2479
+ },
2480
+ "associations": []
2481
+ }
2482
+ ],
2483
+ "questions": [
2484
+ {
2485
+ "question_id": "gsh-c07-q01",
2486
+ "category": "multi_hop_association",
2487
+ "question": "What operational dependency is ultimately used by Iskandar's codename Glass Orchard?",
2488
+ "expected_answer": "Bandcamp private presale page",
2489
+ "required_memory_ids": [
2490
+ "gsh-c07-m01",
2491
+ "gsh-c07-m02",
2492
+ "gsh-c07-m03"
2493
+ ],
2494
+ "forbidden_memory_ids": [
2495
+ "gsh-c07-m10"
2496
+ ],
2497
+ "difficulty": "hard",
2498
+ "architecture_bias_risk": "intentional_graph_stress",
2499
+ "fairness_note": "Requires codename to project to bridge to dependency; no single memory contains both codename and answer."
2500
+ },
2501
+ {
2502
+ "question_id": "gsh-c07-q02",
2503
+ "category": "multi_hop_association",
2504
+ "question": "Which artifact is needed before the release approval meeting can be signed off for Iskandar's work?",
2505
+ "expected_answer": "final lacquer approval note",
2506
+ "required_memory_ids": [
2507
+ "gsh-c07-m04",
2508
+ "gsh-c07-m05",
2509
+ "gsh-c07-m06"
2510
+ ],
2511
+ "forbidden_memory_ids": [
2512
+ "gsh-c07-m10"
2513
+ ],
2514
+ "difficulty": "hard",
2515
+ "architecture_bias_risk": "intentional_graph_stress",
2516
+ "fairness_note": "Uses weak lexical overlap between release approval meeting and the internal ritual name."
2517
+ },
2518
+ {
2519
+ "question_id": "gsh-c07-q03",
2520
+ "category": "contradiction_resolution",
2521
+ "question": "What is the current resolved state for ambient album launch?",
2522
+ "expected_answer": "release locked for September 6",
2523
+ "required_memory_ids": [
2524
+ "gsh-c07-m09"
2525
+ ],
2526
+ "forbidden_memory_ids": [
2527
+ "gsh-c07-m07",
2528
+ "gsh-c07-m08"
2529
+ ],
2530
+ "difficulty": "hard",
2531
+ "architecture_bias_risk": "intentional_graph_stress",
2532
+ "fairness_note": "Final memory must beat stale and interim states connected by replaces/replaced_by edges."
2533
+ },
2534
+ {
2535
+ "question_id": "gsh-c07-q04",
2536
+ "category": "graph_traversal",
2537
+ "question": "For the approval chain in Iskandar's project, which team owns the internal ritual?",
2538
+ "expected_answer": "mastering coordination team",
2539
+ "required_memory_ids": [
2540
+ "gsh-c07-m04",
2541
+ "gsh-c07-m05"
2542
+ ],
2543
+ "forbidden_memory_ids": [
2544
+ "gsh-c07-m10"
2545
+ ],
2546
+ "difficulty": "medium",
2547
+ "architecture_bias_risk": "intentional_graph_stress",
2548
+ "fairness_note": "Retrieval should cross from the approval phrasing to the internal ritual and owner team."
2549
+ },
2550
+ {
2551
+ "question_id": "gsh-c07-q05",
2552
+ "category": "entity_disambiguation",
2553
+ "question": "Who is the accountable owner for Iskandar's project, not Iskander's similar effort?",
2554
+ "expected_answer": "Marta Velasquez",
2555
+ "required_memory_ids": [
2556
+ "gsh-c07-m11"
2557
+ ],
2558
+ "forbidden_memory_ids": [
2559
+ "gsh-c07-m10",
2560
+ "gsh-c07-m12"
2561
+ ],
2562
+ "difficulty": "medium",
2563
+ "architecture_bias_risk": "intentional_graph_stress",
2564
+ "fairness_note": "Near-name distractors are intentionally tempting but wrong."
2565
+ }
2566
+ ]
2567
+ },
2568
+ {
2569
+ "conversation_id": "gsh-c08",
2570
+ "agent_id": "agent-travel-planner",
2571
+ "domain": "personal_assistant",
2572
+ "memory_records": [
2573
+ {
2574
+ "id": "gsh-c08-m01",
2575
+ "type": "semantic",
2576
+ "timestamp": "2026-05-08T09:00:00Z",
2577
+ "content": "Raf uses codename Lighthouse for the active workstream, but the codename itself does not state the operational dependency.",
2578
+ "tags": [
2579
+ "lighthouse",
2580
+ "codename"
2581
+ ],
2582
+ "importance": 0.7,
2583
+ "metadata": {
2584
+ "role": "anchor",
2585
+ "source_turn": "t01"
2586
+ },
2587
+ "associations": [
2588
+ {
2589
+ "target_id": "gsh-c08-m02",
2590
+ "strength": 0.9,
2591
+ "relation": "alias_of"
2592
+ }
2593
+ ]
2594
+ },
2595
+ {
2596
+ "id": "gsh-c08-m02",
2597
+ "type": "semantic",
2598
+ "timestamp": "2026-05-08T10:00:00Z",
2599
+ "content": "Codename Lighthouse refers to the Lisbon family itinerary, which is governed through the museum-access plan.",
2600
+ "tags": [
2601
+ "lisbon",
2602
+ "project-link"
2603
+ ],
2604
+ "importance": 0.8,
2605
+ "metadata": {
2606
+ "role": "bridge",
2607
+ "source_turn": "t02"
2608
+ },
2609
+ "associations": [
2610
+ {
2611
+ "target_id": "gsh-c08-m01",
2612
+ "strength": 0.9,
2613
+ "relation": "alias_of"
2614
+ },
2615
+ {
2616
+ "target_id": "gsh-c08-m03",
2617
+ "strength": 0.9,
2618
+ "relation": "depends_on"
2619
+ }
2620
+ ]
2621
+ },
2622
+ {
2623
+ "id": "gsh-c08-m03",
2624
+ "type": "semantic",
2625
+ "timestamp": "2026-05-08T11:00:00Z",
2626
+ "content": "The museum-access plan depends on the Tile Museum quiet-entry slot; this dependency is not named in the codename discussion.",
2627
+ "tags": [
2628
+ "museum-access",
2629
+ "dependency"
2630
+ ],
2631
+ "importance": 0.9,
2632
+ "metadata": {
2633
+ "role": "answer",
2634
+ "source_turn": "t03"
2635
+ },
2636
+ "associations": [
2637
+ {
2638
+ "target_id": "gsh-c08-m02",
2639
+ "strength": 0.9,
2640
+ "relation": "supports"
2641
+ }
2642
+ ]
2643
+ },
2644
+ {
2645
+ "id": "gsh-c08-m04",
2646
+ "type": "episodic",
2647
+ "timestamp": "2026-05-08T12:00:00Z",
2648
+ "content": "Raf said the approval ritual is internally called Map Fold; outside notes describe it as the release approval meeting.",
2649
+ "tags": [
2650
+ "approval",
2651
+ "ritual"
2652
+ ],
2653
+ "importance": 0.75,
2654
+ "metadata": {
2655
+ "role": "weak-overlap-anchor",
2656
+ "source_turn": "t04"
2657
+ },
2658
+ "associations": [
2659
+ {
2660
+ "target_id": "gsh-c08-m05",
2661
+ "strength": 0.9,
2662
+ "relation": "owned_by"
2663
+ }
2664
+ ]
2665
+ },
2666
+ {
2667
+ "id": "gsh-c08-m05",
2668
+ "type": "semantic",
2669
+ "timestamp": "2026-05-08T13:00:00Z",
2670
+ "content": "Map Fold is owned by the reservation support team, not by the team whose name resembles the project codename.",
2671
+ "tags": [
2672
+ "approval",
2673
+ "team"
2674
+ ],
2675
+ "importance": 0.82,
2676
+ "metadata": {
2677
+ "role": "weak-overlap-bridge",
2678
+ "source_turn": "t05"
2679
+ },
2680
+ "associations": [
2681
+ {
2682
+ "target_id": "gsh-c08-m04",
2683
+ "strength": 0.9,
2684
+ "relation": "owns"
2685
+ },
2686
+ {
2687
+ "target_id": "gsh-c08-m06",
2688
+ "strength": 0.9,
2689
+ "relation": "requires"
2690
+ }
2691
+ ]
2692
+ },
2693
+ {
2694
+ "id": "gsh-c08-m06",
2695
+ "type": "procedural",
2696
+ "timestamp": "2026-05-08T14:00:00Z",
2697
+ "content": "Before reservation support team signs off, they require the accessibility confirmation email.",
2698
+ "tags": [
2699
+ "approval",
2700
+ "artifact"
2701
+ ],
2702
+ "importance": 0.88,
2703
+ "metadata": {
2704
+ "role": "weak-overlap-answer",
2705
+ "source_turn": "t06"
2706
+ },
2707
+ "associations": [
2708
+ {
2709
+ "target_id": "gsh-c08-m05",
2710
+ "strength": 0.9,
2711
+ "relation": "required_by"
2712
+ }
2713
+ ]
2714
+ },
2715
+ {
2716
+ "id": "gsh-c08-m07",
2717
+ "type": "semantic",
2718
+ "timestamp": "2026-05-08T15:00:00Z",
2719
+ "content": "Initial state for Lisbon family itinerary: hostel in Baixa.",
2720
+ "tags": [
2721
+ "current-state",
2722
+ "stale"
2723
+ ],
2724
+ "importance": 0.55,
2725
+ "metadata": {
2726
+ "role": "stale",
2727
+ "source_turn": "t07"
2728
+ },
2729
+ "associations": [
2730
+ {
2731
+ "target_id": "gsh-c08-m08",
2732
+ "strength": 0.95,
2733
+ "relation": "replaced_by"
2734
+ }
2735
+ ]
2736
+ },
2737
+ {
2738
+ "id": "gsh-c08-m08",
2739
+ "type": "episodic",
2740
+ "timestamp": "2026-05-08T16:00:00Z",
2741
+ "content": "Interim update for Lisbon family itinerary: apartment in Alfama, replacing the initial state.",
2742
+ "tags": [
2743
+ "current-state",
2744
+ "interim"
2745
+ ],
2746
+ "importance": 0.65,
2747
+ "metadata": {
2748
+ "role": "interim",
2749
+ "source_turn": "t08"
2750
+ },
2751
+ "associations": [
2752
+ {
2753
+ "target_id": "gsh-c08-m07",
2754
+ "strength": 0.95,
2755
+ "relation": "replaces"
2756
+ },
2757
+ {
2758
+ "target_id": "gsh-c08-m09",
2759
+ "strength": 0.98,
2760
+ "relation": "replaced_by"
2761
+ }
2762
+ ]
2763
+ },
2764
+ {
2765
+ "id": "gsh-c08-m09",
2766
+ "type": "semantic",
2767
+ "timestamp": "2026-05-08T17:00:00Z",
2768
+ "content": "Final resolved state for Lisbon family itinerary: boutique hotel in Chiado. This supersedes all earlier versions.",
2769
+ "tags": [
2770
+ "current-state",
2771
+ "final"
2772
+ ],
2773
+ "importance": 0.95,
2774
+ "metadata": {
2775
+ "role": "final",
2776
+ "source_turn": "t09"
2777
+ },
2778
+ "associations": [
2779
+ {
2780
+ "target_id": "gsh-c08-m08",
2781
+ "strength": 0.98,
2782
+ "relation": "replaces"
2783
+ },
2784
+ {
2785
+ "target_id": "gsh-c08-m07",
2786
+ "strength": 0.98,
2787
+ "relation": "replaces"
2788
+ }
2789
+ ]
2790
+ },
2791
+ {
2792
+ "id": "gsh-c08-m10",
2793
+ "type": "semantic",
2794
+ "timestamp": "2026-05-08T18:00:00Z",
2795
+ "content": "Distractor: Rafi runs a similar-sounding effort with tram pass kiosk, but it is not Raf's Lisbon family itinerary.",
2796
+ "tags": [
2797
+ "near-entity",
2798
+ "distractor"
2799
+ ],
2800
+ "importance": 0.5,
2801
+ "metadata": {
2802
+ "role": "distractor",
2803
+ "source_turn": "t10"
2804
+ },
2805
+ "associations": []
2806
+ },
2807
+ {
2808
+ "id": "gsh-c08-m11",
2809
+ "type": "semantic",
2810
+ "timestamp": "2026-05-08T19:00:00Z",
2811
+ "content": "For Raf's Lisbon family itinerary, the accountable owner is Tomas Freire.",
2812
+ "tags": [
2813
+ "near-entity",
2814
+ "owner"
2815
+ ],
2816
+ "importance": 0.86,
2817
+ "metadata": {
2818
+ "role": "entity-answer",
2819
+ "source_turn": "t11"
2820
+ },
2821
+ "associations": [
2822
+ {
2823
+ "target_id": "gsh-c08-m02",
2824
+ "strength": 0.9,
2825
+ "relation": "owned_by"
2826
+ }
2827
+ ]
2828
+ },
2829
+ {
2830
+ "id": "gsh-c08-m12",
2831
+ "type": "semantic",
2832
+ "timestamp": "2026-05-08T20:00:00Z",
2833
+ "content": "Tempting gap: Rafi's similar effort mentions airport transfer vendor, but Raf's records do not state that value.",
2834
+ "tags": [
2835
+ "abstention",
2836
+ "tempting-distractor"
2837
+ ],
2838
+ "importance": 0.45,
2839
+ "metadata": {
2840
+ "role": "abstention-distractor",
2841
+ "source_turn": "t12"
2842
+ },
2843
+ "associations": []
2844
+ }
2845
+ ],
2846
+ "questions": [
2847
+ {
2848
+ "question_id": "gsh-c08-q01",
2849
+ "category": "multi_hop_association",
2850
+ "question": "What operational dependency is ultimately used by Raf's codename Lighthouse?",
2851
+ "expected_answer": "Tile Museum quiet-entry slot",
2852
+ "required_memory_ids": [
2853
+ "gsh-c08-m01",
2854
+ "gsh-c08-m02",
2855
+ "gsh-c08-m03"
2856
+ ],
2857
+ "forbidden_memory_ids": [
2858
+ "gsh-c08-m10"
2859
+ ],
2860
+ "difficulty": "hard",
2861
+ "architecture_bias_risk": "intentional_graph_stress",
2862
+ "fairness_note": "Requires codename to project to bridge to dependency; no single memory contains both codename and answer."
2863
+ },
2864
+ {
2865
+ "question_id": "gsh-c08-q02",
2866
+ "category": "multi_hop_association",
2867
+ "question": "Which artifact is needed before the release approval meeting can be signed off for Raf's work?",
2868
+ "expected_answer": "accessibility confirmation email",
2869
+ "required_memory_ids": [
2870
+ "gsh-c08-m04",
2871
+ "gsh-c08-m05",
2872
+ "gsh-c08-m06"
2873
+ ],
2874
+ "forbidden_memory_ids": [
2875
+ "gsh-c08-m10"
2876
+ ],
2877
+ "difficulty": "hard",
2878
+ "architecture_bias_risk": "intentional_graph_stress",
2879
+ "fairness_note": "Uses weak lexical overlap between release approval meeting and the internal ritual name."
2880
+ },
2881
+ {
2882
+ "question_id": "gsh-c08-q03",
2883
+ "category": "contradiction_resolution",
2884
+ "question": "What is the current resolved state for Lisbon family itinerary?",
2885
+ "expected_answer": "boutique hotel in Chiado",
2886
+ "required_memory_ids": [
2887
+ "gsh-c08-m09"
2888
+ ],
2889
+ "forbidden_memory_ids": [
2890
+ "gsh-c08-m07",
2891
+ "gsh-c08-m08"
2892
+ ],
2893
+ "difficulty": "hard",
2894
+ "architecture_bias_risk": "intentional_graph_stress",
2895
+ "fairness_note": "Final memory must beat stale and interim states connected by replaces/replaced_by edges."
2896
+ },
2897
+ {
2898
+ "question_id": "gsh-c08-q04",
2899
+ "category": "graph_traversal",
2900
+ "question": "For the approval chain in Raf's project, which team owns the internal ritual?",
2901
+ "expected_answer": "reservation support team",
2902
+ "required_memory_ids": [
2903
+ "gsh-c08-m04",
2904
+ "gsh-c08-m05"
2905
+ ],
2906
+ "forbidden_memory_ids": [
2907
+ "gsh-c08-m10"
2908
+ ],
2909
+ "difficulty": "medium",
2910
+ "architecture_bias_risk": "intentional_graph_stress",
2911
+ "fairness_note": "Retrieval should cross from the approval phrasing to the internal ritual and owner team."
2912
+ },
2913
+ {
2914
+ "question_id": "gsh-c08-q05",
2915
+ "category": "abstention",
2916
+ "question": "What is the airport transfer vendor for Raf's project?",
2917
+ "expected_answer": "Not stated in the available memories for Raf's project.",
2918
+ "required_memory_ids": [],
2919
+ "forbidden_memory_ids": [
2920
+ "gsh-c08-m12",
2921
+ "gsh-c08-m10"
2922
+ ],
2923
+ "difficulty": "hard",
2924
+ "architecture_bias_risk": "intentional_graph_stress",
2925
+ "fairness_note": "The dataset contains a tempting value for a similar entity, but no answer for the target entity."
2926
+ }
2927
+ ]
2928
+ },
2929
+ {
2930
+ "conversation_id": "gsh-c09",
2931
+ "agent_id": "agent-security-review",
2932
+ "domain": "software",
2933
+ "memory_records": [
2934
+ {
2935
+ "id": "gsh-c09-m01",
2936
+ "type": "semantic",
2937
+ "timestamp": "2026-05-09T09:00:00Z",
2938
+ "content": "Nadia uses codename Iron Finch for the active workstream, but the codename itself does not state the operational dependency.",
2939
+ "tags": [
2940
+ "iron-finch",
2941
+ "codename"
2942
+ ],
2943
+ "importance": 0.7,
2944
+ "metadata": {
2945
+ "role": "anchor",
2946
+ "source_turn": "t01"
2947
+ },
2948
+ "associations": [
2949
+ {
2950
+ "target_id": "gsh-c09-m02",
2951
+ "strength": 0.9,
2952
+ "relation": "alias_of"
2953
+ }
2954
+ ]
2955
+ },
2956
+ {
2957
+ "id": "gsh-c09-m02",
2958
+ "type": "semantic",
2959
+ "timestamp": "2026-05-09T10:00:00Z",
2960
+ "content": "Codename Iron Finch refers to the API abuse detection, which is governed through the credential-stuffing response.",
2961
+ "tags": [
2962
+ "api",
2963
+ "project-link"
2964
+ ],
2965
+ "importance": 0.8,
2966
+ "metadata": {
2967
+ "role": "bridge",
2968
+ "source_turn": "t02"
2969
+ },
2970
+ "associations": [
2971
+ {
2972
+ "target_id": "gsh-c09-m01",
2973
+ "strength": 0.9,
2974
+ "relation": "alias_of"
2975
+ },
2976
+ {
2977
+ "target_id": "gsh-c09-m03",
2978
+ "strength": 0.9,
2979
+ "relation": "depends_on"
2980
+ }
2981
+ ]
2982
+ },
2983
+ {
2984
+ "id": "gsh-c09-m03",
2985
+ "type": "semantic",
2986
+ "timestamp": "2026-05-09T11:00:00Z",
2987
+ "content": "The credential-stuffing response depends on the edge rate-limit rule set; this dependency is not named in the codename discussion.",
2988
+ "tags": [
2989
+ "credential-stuffing",
2990
+ "dependency"
2991
+ ],
2992
+ "importance": 0.9,
2993
+ "metadata": {
2994
+ "role": "answer",
2995
+ "source_turn": "t03"
2996
+ },
2997
+ "associations": [
2998
+ {
2999
+ "target_id": "gsh-c09-m02",
3000
+ "strength": 0.9,
3001
+ "relation": "supports"
3002
+ }
3003
+ ]
3004
+ },
3005
+ {
3006
+ "id": "gsh-c09-m04",
3007
+ "type": "episodic",
3008
+ "timestamp": "2026-05-09T12:00:00Z",
3009
+ "content": "Nadia said the approval ritual is internally called Black Seal; outside notes describe it as the release approval meeting.",
3010
+ "tags": [
3011
+ "approval",
3012
+ "ritual"
3013
+ ],
3014
+ "importance": 0.75,
3015
+ "metadata": {
3016
+ "role": "weak-overlap-anchor",
3017
+ "source_turn": "t04"
3018
+ },
3019
+ "associations": [
3020
+ {
3021
+ "target_id": "gsh-c09-m05",
3022
+ "strength": 0.9,
3023
+ "relation": "owned_by"
3024
+ }
3025
+ ]
3026
+ },
3027
+ {
3028
+ "id": "gsh-c09-m05",
3029
+ "type": "semantic",
3030
+ "timestamp": "2026-05-09T13:00:00Z",
3031
+ "content": "Black Seal is owned by the incident readiness team, not by the team whose name resembles the project codename.",
3032
+ "tags": [
3033
+ "approval",
3034
+ "team"
3035
+ ],
3036
+ "importance": 0.82,
3037
+ "metadata": {
3038
+ "role": "weak-overlap-bridge",
3039
+ "source_turn": "t05"
3040
+ },
3041
+ "associations": [
3042
+ {
3043
+ "target_id": "gsh-c09-m04",
3044
+ "strength": 0.9,
3045
+ "relation": "owns"
3046
+ },
3047
+ {
3048
+ "target_id": "gsh-c09-m06",
3049
+ "strength": 0.9,
3050
+ "relation": "requires"
3051
+ }
3052
+ ]
3053
+ },
3054
+ {
3055
+ "id": "gsh-c09-m06",
3056
+ "type": "procedural",
3057
+ "timestamp": "2026-05-09T14:00:00Z",
3058
+ "content": "Before incident readiness team signs off, they require the pager escalation map.",
3059
+ "tags": [
3060
+ "approval",
3061
+ "artifact"
3062
+ ],
3063
+ "importance": 0.88,
3064
+ "metadata": {
3065
+ "role": "weak-overlap-answer",
3066
+ "source_turn": "t06"
3067
+ },
3068
+ "associations": [
3069
+ {
3070
+ "target_id": "gsh-c09-m05",
3071
+ "strength": 0.9,
3072
+ "relation": "required_by"
3073
+ }
3074
+ ]
3075
+ },
3076
+ {
3077
+ "id": "gsh-c09-m07",
3078
+ "type": "semantic",
3079
+ "timestamp": "2026-05-09T15:00:00Z",
3080
+ "content": "Initial state for API abuse detection: block after 20 failed attempts.",
3081
+ "tags": [
3082
+ "current-state",
3083
+ "stale"
3084
+ ],
3085
+ "importance": 0.55,
3086
+ "metadata": {
3087
+ "role": "stale",
3088
+ "source_turn": "t07"
3089
+ },
3090
+ "associations": [
3091
+ {
3092
+ "target_id": "gsh-c09-m08",
3093
+ "strength": 0.95,
3094
+ "relation": "replaced_by"
3095
+ }
3096
+ ]
3097
+ },
3098
+ {
3099
+ "id": "gsh-c09-m08",
3100
+ "type": "episodic",
3101
+ "timestamp": "2026-05-09T16:00:00Z",
3102
+ "content": "Interim update for API abuse detection: challenge after 15 failed attempts, replacing the initial state.",
3103
+ "tags": [
3104
+ "current-state",
3105
+ "interim"
3106
+ ],
3107
+ "importance": 0.65,
3108
+ "metadata": {
3109
+ "role": "interim",
3110
+ "source_turn": "t08"
3111
+ },
3112
+ "associations": [
3113
+ {
3114
+ "target_id": "gsh-c09-m07",
3115
+ "strength": 0.95,
3116
+ "relation": "replaces"
3117
+ },
3118
+ {
3119
+ "target_id": "gsh-c09-m09",
3120
+ "strength": 0.98,
3121
+ "relation": "replaced_by"
3122
+ }
3123
+ ]
3124
+ },
3125
+ {
3126
+ "id": "gsh-c09-m09",
3127
+ "type": "semantic",
3128
+ "timestamp": "2026-05-09T17:00:00Z",
3129
+ "content": "Final resolved state for API abuse detection: challenge after 8 failed attempts plus IP reputation. This supersedes all earlier versions.",
3130
+ "tags": [
3131
+ "current-state",
3132
+ "final"
3133
+ ],
3134
+ "importance": 0.95,
3135
+ "metadata": {
3136
+ "role": "final",
3137
+ "source_turn": "t09"
3138
+ },
3139
+ "associations": [
3140
+ {
3141
+ "target_id": "gsh-c09-m08",
3142
+ "strength": 0.98,
3143
+ "relation": "replaces"
3144
+ },
3145
+ {
3146
+ "target_id": "gsh-c09-m07",
3147
+ "strength": 0.98,
3148
+ "relation": "replaces"
3149
+ }
3150
+ ]
3151
+ },
3152
+ {
3153
+ "id": "gsh-c09-m10",
3154
+ "type": "semantic",
3155
+ "timestamp": "2026-05-09T18:00:00Z",
3156
+ "content": "Distractor: Nadya runs a similar-sounding effort with WAF geo-block list, but it is not Nadia's API abuse detection.",
3157
+ "tags": [
3158
+ "near-entity",
3159
+ "distractor"
3160
+ ],
3161
+ "importance": 0.5,
3162
+ "metadata": {
3163
+ "role": "distractor",
3164
+ "source_turn": "t10"
3165
+ },
3166
+ "associations": []
3167
+ },
3168
+ {
3169
+ "id": "gsh-c09-m11",
3170
+ "type": "semantic",
3171
+ "timestamp": "2026-05-09T19:00:00Z",
3172
+ "content": "For Nadia's API abuse detection, the accountable owner is Owen Hart.",
3173
+ "tags": [
3174
+ "near-entity",
3175
+ "owner"
3176
+ ],
3177
+ "importance": 0.86,
3178
+ "metadata": {
3179
+ "role": "entity-answer",
3180
+ "source_turn": "t11"
3181
+ },
3182
+ "associations": [
3183
+ {
3184
+ "target_id": "gsh-c09-m02",
3185
+ "strength": 0.9,
3186
+ "relation": "owned_by"
3187
+ }
3188
+ ]
3189
+ },
3190
+ {
3191
+ "id": "gsh-c09-m12",
3192
+ "type": "semantic",
3193
+ "timestamp": "2026-05-09T20:00:00Z",
3194
+ "content": "Tempting gap: Nadya's similar effort mentions annual security budget, but Nadia's records do not state that value.",
3195
+ "tags": [
3196
+ "abstention",
3197
+ "tempting-distractor"
3198
+ ],
3199
+ "importance": 0.45,
3200
+ "metadata": {
3201
+ "role": "abstention-distractor",
3202
+ "source_turn": "t12"
3203
+ },
3204
+ "associations": []
3205
+ }
3206
+ ],
3207
+ "questions": [
3208
+ {
3209
+ "question_id": "gsh-c09-q01",
3210
+ "category": "multi_hop_association",
3211
+ "question": "What operational dependency is ultimately used by Nadia's codename Iron Finch?",
3212
+ "expected_answer": "edge rate-limit rule set",
3213
+ "required_memory_ids": [
3214
+ "gsh-c09-m01",
3215
+ "gsh-c09-m02",
3216
+ "gsh-c09-m03"
3217
+ ],
3218
+ "forbidden_memory_ids": [
3219
+ "gsh-c09-m10"
3220
+ ],
3221
+ "difficulty": "hard",
3222
+ "architecture_bias_risk": "intentional_graph_stress",
3223
+ "fairness_note": "Requires codename to project to bridge to dependency; no single memory contains both codename and answer."
3224
+ },
3225
+ {
3226
+ "question_id": "gsh-c09-q02",
3227
+ "category": "multi_hop_association",
3228
+ "question": "Which artifact is needed before the release approval meeting can be signed off for Nadia's work?",
3229
+ "expected_answer": "pager escalation map",
3230
+ "required_memory_ids": [
3231
+ "gsh-c09-m04",
3232
+ "gsh-c09-m05",
3233
+ "gsh-c09-m06"
3234
+ ],
3235
+ "forbidden_memory_ids": [
3236
+ "gsh-c09-m10"
3237
+ ],
3238
+ "difficulty": "hard",
3239
+ "architecture_bias_risk": "intentional_graph_stress",
3240
+ "fairness_note": "Uses weak lexical overlap between release approval meeting and the internal ritual name."
3241
+ },
3242
+ {
3243
+ "question_id": "gsh-c09-q03",
3244
+ "category": "contradiction_resolution",
3245
+ "question": "What is the current resolved state for API abuse detection?",
3246
+ "expected_answer": "challenge after 8 failed attempts plus IP reputation",
3247
+ "required_memory_ids": [
3248
+ "gsh-c09-m09"
3249
+ ],
3250
+ "forbidden_memory_ids": [
3251
+ "gsh-c09-m07",
3252
+ "gsh-c09-m08"
3253
+ ],
3254
+ "difficulty": "hard",
3255
+ "architecture_bias_risk": "intentional_graph_stress",
3256
+ "fairness_note": "Final memory must beat stale and interim states connected by replaces/replaced_by edges."
3257
+ },
3258
+ {
3259
+ "question_id": "gsh-c09-q04",
3260
+ "category": "graph_traversal",
3261
+ "question": "For the approval chain in Nadia's project, which team owns the internal ritual?",
3262
+ "expected_answer": "incident readiness team",
3263
+ "required_memory_ids": [
3264
+ "gsh-c09-m04",
3265
+ "gsh-c09-m05"
3266
+ ],
3267
+ "forbidden_memory_ids": [
3268
+ "gsh-c09-m10"
3269
+ ],
3270
+ "difficulty": "medium",
3271
+ "architecture_bias_risk": "intentional_graph_stress",
3272
+ "fairness_note": "Retrieval should cross from the approval phrasing to the internal ritual and owner team."
3273
+ },
3274
+ {
3275
+ "question_id": "gsh-c09-q05",
3276
+ "category": "abstention",
3277
+ "question": "What is the annual security budget for Nadia's project?",
3278
+ "expected_answer": "Not stated in the available memories for Nadia's project.",
3279
+ "required_memory_ids": [],
3280
+ "forbidden_memory_ids": [
3281
+ "gsh-c09-m12",
3282
+ "gsh-c09-m10"
3283
+ ],
3284
+ "difficulty": "hard",
3285
+ "architecture_bias_risk": "intentional_graph_stress",
3286
+ "fairness_note": "The dataset contains a tempting value for a similar entity, but no answer for the target entity."
3287
+ }
3288
+ ]
3289
+ },
3290
+ {
3291
+ "conversation_id": "gsh-c10",
3292
+ "agent_id": "agent-supply-chain",
3293
+ "domain": "operations",
3294
+ "memory_records": [
3295
+ {
3296
+ "id": "gsh-c10-m01",
3297
+ "type": "semantic",
3298
+ "timestamp": "2026-05-10T09:00:00Z",
3299
+ "content": "Jonas uses codename Cedar Path for the active workstream, but the codename itself does not state the operational dependency.",
3300
+ "tags": [
3301
+ "cedar-path",
3302
+ "codename"
3303
+ ],
3304
+ "importance": 0.7,
3305
+ "metadata": {
3306
+ "role": "anchor",
3307
+ "source_turn": "t01"
3308
+ },
3309
+ "associations": [
3310
+ {
3311
+ "target_id": "gsh-c10-m02",
3312
+ "strength": 0.9,
3313
+ "relation": "alias_of"
3314
+ }
3315
+ ]
3316
+ },
3317
+ {
3318
+ "id": "gsh-c10-m02",
3319
+ "type": "semantic",
3320
+ "timestamp": "2026-05-10T10:00:00Z",
3321
+ "content": "Codename Cedar Path refers to the warehouse slotting refresh, which is governed through the cold-chain pick route.",
3322
+ "tags": [
3323
+ "warehouse",
3324
+ "project-link"
3325
+ ],
3326
+ "importance": 0.8,
3327
+ "metadata": {
3328
+ "role": "bridge",
3329
+ "source_turn": "t02"
3330
+ },
3331
+ "associations": [
3332
+ {
3333
+ "target_id": "gsh-c10-m01",
3334
+ "strength": 0.9,
3335
+ "relation": "alias_of"
3336
+ },
3337
+ {
3338
+ "target_id": "gsh-c10-m03",
3339
+ "strength": 0.9,
3340
+ "relation": "depends_on"
3341
+ }
3342
+ ]
3343
+ },
3344
+ {
3345
+ "id": "gsh-c10-m03",
3346
+ "type": "semantic",
3347
+ "timestamp": "2026-05-10T11:00:00Z",
3348
+ "content": "The cold-chain pick route depends on the zone C thermal scanner; this dependency is not named in the codename discussion.",
3349
+ "tags": [
3350
+ "cold-chain",
3351
+ "dependency"
3352
+ ],
3353
+ "importance": 0.9,
3354
+ "metadata": {
3355
+ "role": "answer",
3356
+ "source_turn": "t03"
3357
+ },
3358
+ "associations": [
3359
+ {
3360
+ "target_id": "gsh-c10-m02",
3361
+ "strength": 0.9,
3362
+ "relation": "supports"
3363
+ }
3364
+ ]
3365
+ },
3366
+ {
3367
+ "id": "gsh-c10-m04",
3368
+ "type": "episodic",
3369
+ "timestamp": "2026-05-10T12:00:00Z",
3370
+ "content": "Jonas said the approval ritual is internally called Orange Tag; outside notes describe it as the release approval meeting.",
3371
+ "tags": [
3372
+ "approval",
3373
+ "ritual"
3374
+ ],
3375
+ "importance": 0.75,
3376
+ "metadata": {
3377
+ "role": "weak-overlap-anchor",
3378
+ "source_turn": "t04"
3379
+ },
3380
+ "associations": [
3381
+ {
3382
+ "target_id": "gsh-c10-m05",
3383
+ "strength": 0.9,
3384
+ "relation": "owned_by"
3385
+ }
3386
+ ]
3387
+ },
3388
+ {
3389
+ "id": "gsh-c10-m05",
3390
+ "type": "semantic",
3391
+ "timestamp": "2026-05-10T13:00:00Z",
3392
+ "content": "Orange Tag is owned by the inventory audit team, not by the team whose name resembles the project codename.",
3393
+ "tags": [
3394
+ "approval",
3395
+ "team"
3396
+ ],
3397
+ "importance": 0.82,
3398
+ "metadata": {
3399
+ "role": "weak-overlap-bridge",
3400
+ "source_turn": "t05"
3401
+ },
3402
+ "associations": [
3403
+ {
3404
+ "target_id": "gsh-c10-m04",
3405
+ "strength": 0.9,
3406
+ "relation": "owns"
3407
+ },
3408
+ {
3409
+ "target_id": "gsh-c10-m06",
3410
+ "strength": 0.9,
3411
+ "relation": "requires"
3412
+ }
3413
+ ]
3414
+ },
3415
+ {
3416
+ "id": "gsh-c10-m06",
3417
+ "type": "procedural",
3418
+ "timestamp": "2026-05-10T14:00:00Z",
3419
+ "content": "Before inventory audit team signs off, they require the lot-expiry reconciliation sheet.",
3420
+ "tags": [
3421
+ "approval",
3422
+ "artifact"
3423
+ ],
3424
+ "importance": 0.88,
3425
+ "metadata": {
3426
+ "role": "weak-overlap-answer",
3427
+ "source_turn": "t06"
3428
+ },
3429
+ "associations": [
3430
+ {
3431
+ "target_id": "gsh-c10-m05",
3432
+ "strength": 0.9,
3433
+ "relation": "required_by"
3434
+ }
3435
+ ]
3436
+ },
3437
+ {
3438
+ "id": "gsh-c10-m07",
3439
+ "type": "semantic",
3440
+ "timestamp": "2026-05-10T15:00:00Z",
3441
+ "content": "Initial state for warehouse slotting refresh: refresh every quarter.",
3442
+ "tags": [
3443
+ "current-state",
3444
+ "stale"
3445
+ ],
3446
+ "importance": 0.55,
3447
+ "metadata": {
3448
+ "role": "stale",
3449
+ "source_turn": "t07"
3450
+ },
3451
+ "associations": [
3452
+ {
3453
+ "target_id": "gsh-c10-m08",
3454
+ "strength": 0.95,
3455
+ "relation": "replaced_by"
3456
+ }
3457
+ ]
3458
+ },
3459
+ {
3460
+ "id": "gsh-c10-m08",
3461
+ "type": "episodic",
3462
+ "timestamp": "2026-05-10T16:00:00Z",
3463
+ "content": "Interim update for warehouse slotting refresh: refresh every two months, replacing the initial state.",
3464
+ "tags": [
3465
+ "current-state",
3466
+ "interim"
3467
+ ],
3468
+ "importance": 0.65,
3469
+ "metadata": {
3470
+ "role": "interim",
3471
+ "source_turn": "t08"
3472
+ },
3473
+ "associations": [
3474
+ {
3475
+ "target_id": "gsh-c10-m07",
3476
+ "strength": 0.95,
3477
+ "relation": "replaces"
3478
+ },
3479
+ {
3480
+ "target_id": "gsh-c10-m09",
3481
+ "strength": 0.98,
3482
+ "relation": "replaced_by"
3483
+ }
3484
+ ]
3485
+ },
3486
+ {
3487
+ "id": "gsh-c10-m09",
3488
+ "type": "semantic",
3489
+ "timestamp": "2026-05-10T17:00:00Z",
3490
+ "content": "Final resolved state for warehouse slotting refresh: refresh every six weeks during peak season. This supersedes all earlier versions.",
3491
+ "tags": [
3492
+ "current-state",
3493
+ "final"
3494
+ ],
3495
+ "importance": 0.95,
3496
+ "metadata": {
3497
+ "role": "final",
3498
+ "source_turn": "t09"
3499
+ },
3500
+ "associations": [
3501
+ {
3502
+ "target_id": "gsh-c10-m08",
3503
+ "strength": 0.98,
3504
+ "relation": "replaces"
3505
+ },
3506
+ {
3507
+ "target_id": "gsh-c10-m07",
3508
+ "strength": 0.98,
3509
+ "relation": "replaces"
3510
+ }
3511
+ ]
3512
+ },
3513
+ {
3514
+ "id": "gsh-c10-m10",
3515
+ "type": "semantic",
3516
+ "timestamp": "2026-05-10T18:00:00Z",
3517
+ "content": "Distractor: Jona runs a similar-sounding effort with zone A RFID gate, but it is not Jonas's warehouse slotting refresh.",
3518
+ "tags": [
3519
+ "near-entity",
3520
+ "distractor"
3521
+ ],
3522
+ "importance": 0.5,
3523
+ "metadata": {
3524
+ "role": "distractor",
3525
+ "source_turn": "t10"
3526
+ },
3527
+ "associations": []
3528
+ },
3529
+ {
3530
+ "id": "gsh-c10-m11",
3531
+ "type": "semantic",
3532
+ "timestamp": "2026-05-10T19:00:00Z",
3533
+ "content": "For Jonas's warehouse slotting refresh, the accountable owner is Mina Park.",
3534
+ "tags": [
3535
+ "near-entity",
3536
+ "owner"
3537
+ ],
3538
+ "importance": 0.86,
3539
+ "metadata": {
3540
+ "role": "entity-answer",
3541
+ "source_turn": "t11"
3542
+ },
3543
+ "associations": [
3544
+ {
3545
+ "target_id": "gsh-c10-m02",
3546
+ "strength": 0.9,
3547
+ "relation": "owned_by"
3548
+ }
3549
+ ]
3550
+ },
3551
+ {
3552
+ "id": "gsh-c10-m12",
3553
+ "type": "semantic",
3554
+ "timestamp": "2026-05-10T20:00:00Z",
3555
+ "content": "Tempting gap: Jona's similar effort mentions forklift maintenance vendor, but Jonas's records do not state that value.",
3556
+ "tags": [
3557
+ "abstention",
3558
+ "tempting-distractor"
3559
+ ],
3560
+ "importance": 0.45,
3561
+ "metadata": {
3562
+ "role": "abstention-distractor",
3563
+ "source_turn": "t12"
3564
+ },
3565
+ "associations": []
3566
+ }
3567
+ ],
3568
+ "questions": [
3569
+ {
3570
+ "question_id": "gsh-c10-q01",
3571
+ "category": "multi_hop_association",
3572
+ "question": "What operational dependency is ultimately used by Jonas's codename Cedar Path?",
3573
+ "expected_answer": "zone C thermal scanner",
3574
+ "required_memory_ids": [
3575
+ "gsh-c10-m01",
3576
+ "gsh-c10-m02",
3577
+ "gsh-c10-m03"
3578
+ ],
3579
+ "forbidden_memory_ids": [
3580
+ "gsh-c10-m10"
3581
+ ],
3582
+ "difficulty": "hard",
3583
+ "architecture_bias_risk": "intentional_graph_stress",
3584
+ "fairness_note": "Requires codename to project to bridge to dependency; no single memory contains both codename and answer."
3585
+ },
3586
+ {
3587
+ "question_id": "gsh-c10-q02",
3588
+ "category": "multi_hop_association",
3589
+ "question": "Which artifact is needed before the release approval meeting can be signed off for Jonas's work?",
3590
+ "expected_answer": "lot-expiry reconciliation sheet",
3591
+ "required_memory_ids": [
3592
+ "gsh-c10-m04",
3593
+ "gsh-c10-m05",
3594
+ "gsh-c10-m06"
3595
+ ],
3596
+ "forbidden_memory_ids": [
3597
+ "gsh-c10-m10"
3598
+ ],
3599
+ "difficulty": "hard",
3600
+ "architecture_bias_risk": "intentional_graph_stress",
3601
+ "fairness_note": "Uses weak lexical overlap between release approval meeting and the internal ritual name."
3602
+ },
3603
+ {
3604
+ "question_id": "gsh-c10-q03",
3605
+ "category": "contradiction_resolution",
3606
+ "question": "What is the current resolved state for warehouse slotting refresh?",
3607
+ "expected_answer": "refresh every six weeks during peak season",
3608
+ "required_memory_ids": [
3609
+ "gsh-c10-m09"
3610
+ ],
3611
+ "forbidden_memory_ids": [
3612
+ "gsh-c10-m07",
3613
+ "gsh-c10-m08"
3614
+ ],
3615
+ "difficulty": "hard",
3616
+ "architecture_bias_risk": "intentional_graph_stress",
3617
+ "fairness_note": "Final memory must beat stale and interim states connected by replaces/replaced_by edges."
3618
+ },
3619
+ {
3620
+ "question_id": "gsh-c10-q04",
3621
+ "category": "graph_traversal",
3622
+ "question": "For the approval chain in Jonas's project, which team owns the internal ritual?",
3623
+ "expected_answer": "inventory audit team",
3624
+ "required_memory_ids": [
3625
+ "gsh-c10-m04",
3626
+ "gsh-c10-m05"
3627
+ ],
3628
+ "forbidden_memory_ids": [
3629
+ "gsh-c10-m10"
3630
+ ],
3631
+ "difficulty": "medium",
3632
+ "architecture_bias_risk": "intentional_graph_stress",
3633
+ "fairness_note": "Retrieval should cross from the approval phrasing to the internal ritual and owner team."
3634
+ },
3635
+ {
3636
+ "question_id": "gsh-c10-q05",
3637
+ "category": "abstention",
3638
+ "question": "What is the forklift maintenance vendor for Jonas's project?",
3639
+ "expected_answer": "Not stated in the available memories for Jonas's project.",
3640
+ "required_memory_ids": [],
3641
+ "forbidden_memory_ids": [
3642
+ "gsh-c10-m12",
3643
+ "gsh-c10-m10"
3644
+ ],
3645
+ "difficulty": "hard",
3646
+ "architecture_bias_risk": "intentional_graph_stress",
3647
+ "fairness_note": "The dataset contains a tempting value for a similar entity, but no answer for the target entity."
3648
+ }
3649
+ ]
3650
+ },
3651
+ {
3652
+ "conversation_id": "gsh-c11",
3653
+ "agent_id": "agent-civic-data",
3654
+ "domain": "research",
3655
+ "memory_records": [
3656
+ {
3657
+ "id": "gsh-c11-m01",
3658
+ "type": "semantic",
3659
+ "timestamp": "2026-05-11T09:00:00Z",
3660
+ "content": "Yara uses codename Civic Loom for the active workstream, but the codename itself does not state the operational dependency.",
3661
+ "tags": [
3662
+ "civic-loom",
3663
+ "codename"
3664
+ ],
3665
+ "importance": 0.7,
3666
+ "metadata": {
3667
+ "role": "anchor",
3668
+ "source_turn": "t01"
3669
+ },
3670
+ "associations": [
3671
+ {
3672
+ "target_id": "gsh-c11-m02",
3673
+ "strength": 0.9,
3674
+ "relation": "alias_of"
3675
+ }
3676
+ ]
3677
+ },
3678
+ {
3679
+ "id": "gsh-c11-m02",
3680
+ "type": "semantic",
3681
+ "timestamp": "2026-05-11T10:00:00Z",
3682
+ "content": "Codename Civic Loom refers to the public-comment clustering, which is governed through the transportation hearing archive.",
3683
+ "tags": [
3684
+ "public-comment",
3685
+ "project-link"
3686
+ ],
3687
+ "importance": 0.8,
3688
+ "metadata": {
3689
+ "role": "bridge",
3690
+ "source_turn": "t02"
3691
+ },
3692
+ "associations": [
3693
+ {
3694
+ "target_id": "gsh-c11-m01",
3695
+ "strength": 0.9,
3696
+ "relation": "alias_of"
3697
+ },
3698
+ {
3699
+ "target_id": "gsh-c11-m03",
3700
+ "strength": 0.9,
3701
+ "relation": "depends_on"
3702
+ }
3703
+ ]
3704
+ },
3705
+ {
3706
+ "id": "gsh-c11-m03",
3707
+ "type": "semantic",
3708
+ "timestamp": "2026-05-11T11:00:00Z",
3709
+ "content": "The transportation hearing archive depends on the speaker-topic matrix; this dependency is not named in the codename discussion.",
3710
+ "tags": [
3711
+ "transportation",
3712
+ "dependency"
3713
+ ],
3714
+ "importance": 0.9,
3715
+ "metadata": {
3716
+ "role": "answer",
3717
+ "source_turn": "t03"
3718
+ },
3719
+ "associations": [
3720
+ {
3721
+ "target_id": "gsh-c11-m02",
3722
+ "strength": 0.9,
3723
+ "relation": "supports"
3724
+ }
3725
+ ]
3726
+ },
3727
+ {
3728
+ "id": "gsh-c11-m04",
3729
+ "type": "episodic",
3730
+ "timestamp": "2026-05-11T12:00:00Z",
3731
+ "content": "Yara said the approval ritual is internally called Grey Index; outside notes describe it as the release approval meeting.",
3732
+ "tags": [
3733
+ "approval",
3734
+ "ritual"
3735
+ ],
3736
+ "importance": 0.75,
3737
+ "metadata": {
3738
+ "role": "weak-overlap-anchor",
3739
+ "source_turn": "t04"
3740
+ },
3741
+ "associations": [
3742
+ {
3743
+ "target_id": "gsh-c11-m05",
3744
+ "strength": 0.9,
3745
+ "relation": "owned_by"
3746
+ }
3747
+ ]
3748
+ },
3749
+ {
3750
+ "id": "gsh-c11-m05",
3751
+ "type": "semantic",
3752
+ "timestamp": "2026-05-11T13:00:00Z",
3753
+ "content": "Grey Index is owned by the open-records team, not by the team whose name resembles the project codename.",
3754
+ "tags": [
3755
+ "approval",
3756
+ "team"
3757
+ ],
3758
+ "importance": 0.82,
3759
+ "metadata": {
3760
+ "role": "weak-overlap-bridge",
3761
+ "source_turn": "t05"
3762
+ },
3763
+ "associations": [
3764
+ {
3765
+ "target_id": "gsh-c11-m04",
3766
+ "strength": 0.9,
3767
+ "relation": "owns"
3768
+ },
3769
+ {
3770
+ "target_id": "gsh-c11-m06",
3771
+ "strength": 0.9,
3772
+ "relation": "requires"
3773
+ }
3774
+ ]
3775
+ },
3776
+ {
3777
+ "id": "gsh-c11-m06",
3778
+ "type": "procedural",
3779
+ "timestamp": "2026-05-11T14:00:00Z",
3780
+ "content": "Before open-records team signs off, they require the redaction exception log.",
3781
+ "tags": [
3782
+ "approval",
3783
+ "artifact"
3784
+ ],
3785
+ "importance": 0.88,
3786
+ "metadata": {
3787
+ "role": "weak-overlap-answer",
3788
+ "source_turn": "t06"
3789
+ },
3790
+ "associations": [
3791
+ {
3792
+ "target_id": "gsh-c11-m05",
3793
+ "strength": 0.9,
3794
+ "relation": "required_by"
3795
+ }
3796
+ ]
3797
+ },
3798
+ {
3799
+ "id": "gsh-c11-m07",
3800
+ "type": "semantic",
3801
+ "timestamp": "2026-05-11T15:00:00Z",
3802
+ "content": "Initial state for public-comment clustering: archive limited to 2024 hearings.",
3803
+ "tags": [
3804
+ "current-state",
3805
+ "stale"
3806
+ ],
3807
+ "importance": 0.55,
3808
+ "metadata": {
3809
+ "role": "stale",
3810
+ "source_turn": "t07"
3811
+ },
3812
+ "associations": [
3813
+ {
3814
+ "target_id": "gsh-c11-m08",
3815
+ "strength": 0.95,
3816
+ "relation": "replaced_by"
3817
+ }
3818
+ ]
3819
+ },
3820
+ {
3821
+ "id": "gsh-c11-m08",
3822
+ "type": "episodic",
3823
+ "timestamp": "2026-05-11T16:00:00Z",
3824
+ "content": "Interim update for public-comment clustering: archive expanded to 2025 hearings, replacing the initial state.",
3825
+ "tags": [
3826
+ "current-state",
3827
+ "interim"
3828
+ ],
3829
+ "importance": 0.65,
3830
+ "metadata": {
3831
+ "role": "interim",
3832
+ "source_turn": "t08"
3833
+ },
3834
+ "associations": [
3835
+ {
3836
+ "target_id": "gsh-c11-m07",
3837
+ "strength": 0.95,
3838
+ "relation": "replaces"
3839
+ },
3840
+ {
3841
+ "target_id": "gsh-c11-m09",
3842
+ "strength": 0.98,
3843
+ "relation": "replaced_by"
3844
+ }
3845
+ ]
3846
+ },
3847
+ {
3848
+ "id": "gsh-c11-m09",
3849
+ "type": "semantic",
3850
+ "timestamp": "2026-05-11T17:00:00Z",
3851
+ "content": "Final resolved state for public-comment clustering: archive covers 2023 through 2026 hearings. This supersedes all earlier versions.",
3852
+ "tags": [
3853
+ "current-state",
3854
+ "final"
3855
+ ],
3856
+ "importance": 0.95,
3857
+ "metadata": {
3858
+ "role": "final",
3859
+ "source_turn": "t09"
3860
+ },
3861
+ "associations": [
3862
+ {
3863
+ "target_id": "gsh-c11-m08",
3864
+ "strength": 0.98,
3865
+ "relation": "replaces"
3866
+ },
3867
+ {
3868
+ "target_id": "gsh-c11-m07",
3869
+ "strength": 0.98,
3870
+ "relation": "replaces"
3871
+ }
3872
+ ]
3873
+ },
3874
+ {
3875
+ "id": "gsh-c11-m10",
3876
+ "type": "semantic",
3877
+ "timestamp": "2026-05-11T18:00:00Z",
3878
+ "content": "Distractor: Yarra runs a similar-sounding effort with council-vote matrix, but it is not Yara's public-comment clustering.",
3879
+ "tags": [
3880
+ "near-entity",
3881
+ "distractor"
3882
+ ],
3883
+ "importance": 0.5,
3884
+ "metadata": {
3885
+ "role": "distractor",
3886
+ "source_turn": "t10"
3887
+ },
3888
+ "associations": []
3889
+ },
3890
+ {
3891
+ "id": "gsh-c11-m11",
3892
+ "type": "semantic",
3893
+ "timestamp": "2026-05-11T19:00:00Z",
3894
+ "content": "For Yara's public-comment clustering, the accountable owner is Hugo Stern.",
3895
+ "tags": [
3896
+ "near-entity",
3897
+ "owner"
3898
+ ],
3899
+ "importance": 0.86,
3900
+ "metadata": {
3901
+ "role": "entity-answer",
3902
+ "source_turn": "t11"
3903
+ },
3904
+ "associations": [
3905
+ {
3906
+ "target_id": "gsh-c11-m02",
3907
+ "strength": 0.9,
3908
+ "relation": "owned_by"
3909
+ }
3910
+ ]
3911
+ },
3912
+ {
3913
+ "id": "gsh-c11-m12",
3914
+ "type": "semantic",
3915
+ "timestamp": "2026-05-11T20:00:00Z",
3916
+ "content": "Tempting gap: Yarra's similar effort mentions grant renewal date, but Yara's records do not state that value.",
3917
+ "tags": [
3918
+ "abstention",
3919
+ "tempting-distractor"
3920
+ ],
3921
+ "importance": 0.45,
3922
+ "metadata": {
3923
+ "role": "abstention-distractor",
3924
+ "source_turn": "t12"
3925
+ },
3926
+ "associations": []
3927
+ }
3928
+ ],
3929
+ "questions": [
3930
+ {
3931
+ "question_id": "gsh-c11-q01",
3932
+ "category": "multi_hop_association",
3933
+ "question": "What operational dependency is ultimately used by Yara's codename Civic Loom?",
3934
+ "expected_answer": "speaker-topic matrix",
3935
+ "required_memory_ids": [
3936
+ "gsh-c11-m01",
3937
+ "gsh-c11-m02",
3938
+ "gsh-c11-m03"
3939
+ ],
3940
+ "forbidden_memory_ids": [
3941
+ "gsh-c11-m10"
3942
+ ],
3943
+ "difficulty": "hard",
3944
+ "architecture_bias_risk": "intentional_graph_stress",
3945
+ "fairness_note": "Requires codename to project to bridge to dependency; no single memory contains both codename and answer."
3946
+ },
3947
+ {
3948
+ "question_id": "gsh-c11-q02",
3949
+ "category": "multi_hop_association",
3950
+ "question": "Which artifact is needed before the release approval meeting can be signed off for Yara's work?",
3951
+ "expected_answer": "redaction exception log",
3952
+ "required_memory_ids": [
3953
+ "gsh-c11-m04",
3954
+ "gsh-c11-m05",
3955
+ "gsh-c11-m06"
3956
+ ],
3957
+ "forbidden_memory_ids": [
3958
+ "gsh-c11-m10"
3959
+ ],
3960
+ "difficulty": "hard",
3961
+ "architecture_bias_risk": "intentional_graph_stress",
3962
+ "fairness_note": "Uses weak lexical overlap between release approval meeting and the internal ritual name."
3963
+ },
3964
+ {
3965
+ "question_id": "gsh-c11-q03",
3966
+ "category": "contradiction_resolution",
3967
+ "question": "What is the current resolved state for public-comment clustering?",
3968
+ "expected_answer": "archive covers 2023 through 2026 hearings",
3969
+ "required_memory_ids": [
3970
+ "gsh-c11-m09"
3971
+ ],
3972
+ "forbidden_memory_ids": [
3973
+ "gsh-c11-m07",
3974
+ "gsh-c11-m08"
3975
+ ],
3976
+ "difficulty": "hard",
3977
+ "architecture_bias_risk": "intentional_graph_stress",
3978
+ "fairness_note": "Final memory must beat stale and interim states connected by replaces/replaced_by edges."
3979
+ },
3980
+ {
3981
+ "question_id": "gsh-c11-q04",
3982
+ "category": "graph_traversal",
3983
+ "question": "For the approval chain in Yara's project, which team owns the internal ritual?",
3984
+ "expected_answer": "open-records team",
3985
+ "required_memory_ids": [
3986
+ "gsh-c11-m04",
3987
+ "gsh-c11-m05"
3988
+ ],
3989
+ "forbidden_memory_ids": [
3990
+ "gsh-c11-m10"
3991
+ ],
3992
+ "difficulty": "medium",
3993
+ "architecture_bias_risk": "intentional_graph_stress",
3994
+ "fairness_note": "Retrieval should cross from the approval phrasing to the internal ritual and owner team."
3995
+ },
3996
+ {
3997
+ "question_id": "gsh-c11-q05",
3998
+ "category": "abstention",
3999
+ "question": "What is the grant renewal date for Yara's project?",
4000
+ "expected_answer": "Not stated in the available memories for Yara's project.",
4001
+ "required_memory_ids": [],
4002
+ "forbidden_memory_ids": [
4003
+ "gsh-c11-m12",
4004
+ "gsh-c11-m10"
4005
+ ],
4006
+ "difficulty": "hard",
4007
+ "architecture_bias_risk": "intentional_graph_stress",
4008
+ "fairness_note": "The dataset contains a tempting value for a similar entity, but no answer for the target entity."
4009
+ }
4010
+ ]
4011
+ },
4012
+ {
4013
+ "conversation_id": "gsh-c12",
4014
+ "agent_id": "agent-product-analytics",
4015
+ "domain": "software",
4016
+ "memory_records": [
4017
+ {
4018
+ "id": "gsh-c12-m01",
4019
+ "type": "semantic",
4020
+ "timestamp": "2026-05-12T09:00:00Z",
4021
+ "content": "Talia uses codename Prism Desk for the active workstream, but the codename itself does not state the operational dependency.",
4022
+ "tags": [
4023
+ "prism-desk",
4024
+ "codename"
4025
+ ],
4026
+ "importance": 0.7,
4027
+ "metadata": {
4028
+ "role": "anchor",
4029
+ "source_turn": "t01"
4030
+ },
4031
+ "associations": [
4032
+ {
4033
+ "target_id": "gsh-c12-m02",
4034
+ "strength": 0.9,
4035
+ "relation": "alias_of"
4036
+ }
4037
+ ]
4038
+ },
4039
+ {
4040
+ "id": "gsh-c12-m02",
4041
+ "type": "semantic",
4042
+ "timestamp": "2026-05-12T10:00:00Z",
4043
+ "content": "Codename Prism Desk refers to the support-ticket insight panel, which is governed through the agent-escalation taxonomy.",
4044
+ "tags": [
4045
+ "support-ticket",
4046
+ "project-link"
4047
+ ],
4048
+ "importance": 0.8,
4049
+ "metadata": {
4050
+ "role": "bridge",
4051
+ "source_turn": "t02"
4052
+ },
4053
+ "associations": [
4054
+ {
4055
+ "target_id": "gsh-c12-m01",
4056
+ "strength": 0.9,
4057
+ "relation": "alias_of"
4058
+ },
4059
+ {
4060
+ "target_id": "gsh-c12-m03",
4061
+ "strength": 0.9,
4062
+ "relation": "depends_on"
4063
+ }
4064
+ ]
4065
+ },
4066
+ {
4067
+ "id": "gsh-c12-m03",
4068
+ "type": "semantic",
4069
+ "timestamp": "2026-05-12T11:00:00Z",
4070
+ "content": "The agent-escalation taxonomy depends on the DuckDB local analytics store; this dependency is not named in the codename discussion.",
4071
+ "tags": [
4072
+ "agent-escalation",
4073
+ "dependency"
4074
+ ],
4075
+ "importance": 0.9,
4076
+ "metadata": {
4077
+ "role": "answer",
4078
+ "source_turn": "t03"
4079
+ },
4080
+ "associations": [
4081
+ {
4082
+ "target_id": "gsh-c12-m02",
4083
+ "strength": 0.9,
4084
+ "relation": "supports"
4085
+ }
4086
+ ]
4087
+ },
4088
+ {
4089
+ "id": "gsh-c12-m04",
4090
+ "type": "episodic",
4091
+ "timestamp": "2026-05-12T12:00:00Z",
4092
+ "content": "Talia said the approval ritual is internally called Crimson Pass; outside notes describe it as the release approval meeting.",
4093
+ "tags": [
4094
+ "approval",
4095
+ "ritual"
4096
+ ],
4097
+ "importance": 0.75,
4098
+ "metadata": {
4099
+ "role": "weak-overlap-anchor",
4100
+ "source_turn": "t04"
4101
+ },
4102
+ "associations": [
4103
+ {
4104
+ "target_id": "gsh-c12-m05",
4105
+ "strength": 0.9,
4106
+ "relation": "owned_by"
4107
+ }
4108
+ ]
4109
+ },
4110
+ {
4111
+ "id": "gsh-c12-m05",
4112
+ "type": "semantic",
4113
+ "timestamp": "2026-05-12T13:00:00Z",
4114
+ "content": "Crimson Pass is owned by the quality review team, not by the team whose name resembles the project codename.",
4115
+ "tags": [
4116
+ "approval",
4117
+ "team"
4118
+ ],
4119
+ "importance": 0.82,
4120
+ "metadata": {
4121
+ "role": "weak-overlap-bridge",
4122
+ "source_turn": "t05"
4123
+ },
4124
+ "associations": [
4125
+ {
4126
+ "target_id": "gsh-c12-m04",
4127
+ "strength": 0.9,
4128
+ "relation": "owns"
4129
+ },
4130
+ {
4131
+ "target_id": "gsh-c12-m06",
4132
+ "strength": 0.9,
4133
+ "relation": "requires"
4134
+ }
4135
+ ]
4136
+ },
4137
+ {
4138
+ "id": "gsh-c12-m06",
4139
+ "type": "procedural",
4140
+ "timestamp": "2026-05-12T14:00:00Z",
4141
+ "content": "Before quality review team signs off, they require the escalation-label audit file.",
4142
+ "tags": [
4143
+ "approval",
4144
+ "artifact"
4145
+ ],
4146
+ "importance": 0.88,
4147
+ "metadata": {
4148
+ "role": "weak-overlap-answer",
4149
+ "source_turn": "t06"
4150
+ },
4151
+ "associations": [
4152
+ {
4153
+ "target_id": "gsh-c12-m05",
4154
+ "strength": 0.9,
4155
+ "relation": "required_by"
4156
+ }
4157
+ ]
4158
+ },
4159
+ {
4160
+ "id": "gsh-c12-m07",
4161
+ "type": "semantic",
4162
+ "timestamp": "2026-05-12T15:00:00Z",
4163
+ "content": "Initial state for support-ticket insight panel: charts refreshed daily.",
4164
+ "tags": [
4165
+ "current-state",
4166
+ "stale"
4167
+ ],
4168
+ "importance": 0.55,
4169
+ "metadata": {
4170
+ "role": "stale",
4171
+ "source_turn": "t07"
4172
+ },
4173
+ "associations": [
4174
+ {
4175
+ "target_id": "gsh-c12-m08",
4176
+ "strength": 0.95,
4177
+ "relation": "replaced_by"
4178
+ }
4179
+ ]
4180
+ },
4181
+ {
4182
+ "id": "gsh-c12-m08",
4183
+ "type": "episodic",
4184
+ "timestamp": "2026-05-12T16:00:00Z",
4185
+ "content": "Interim update for support-ticket insight panel: charts refreshed hourly, replacing the initial state.",
4186
+ "tags": [
4187
+ "current-state",
4188
+ "interim"
4189
+ ],
4190
+ "importance": 0.65,
4191
+ "metadata": {
4192
+ "role": "interim",
4193
+ "source_turn": "t08"
4194
+ },
4195
+ "associations": [
4196
+ {
4197
+ "target_id": "gsh-c12-m07",
4198
+ "strength": 0.95,
4199
+ "relation": "replaces"
4200
+ },
4201
+ {
4202
+ "target_id": "gsh-c12-m09",
4203
+ "strength": 0.98,
4204
+ "relation": "replaced_by"
4205
+ }
4206
+ ]
4207
+ },
4208
+ {
4209
+ "id": "gsh-c12-m09",
4210
+ "type": "semantic",
4211
+ "timestamp": "2026-05-12T17:00:00Z",
4212
+ "content": "Final resolved state for support-ticket insight panel: charts refreshed every fifteen minutes. This supersedes all earlier versions.",
4213
+ "tags": [
4214
+ "current-state",
4215
+ "final"
4216
+ ],
4217
+ "importance": 0.95,
4218
+ "metadata": {
4219
+ "role": "final",
4220
+ "source_turn": "t09"
4221
+ },
4222
+ "associations": [
4223
+ {
4224
+ "target_id": "gsh-c12-m08",
4225
+ "strength": 0.98,
4226
+ "relation": "replaces"
4227
+ },
4228
+ {
4229
+ "target_id": "gsh-c12-m07",
4230
+ "strength": 0.98,
4231
+ "relation": "replaces"
4232
+ }
4233
+ ]
4234
+ },
4235
+ {
4236
+ "id": "gsh-c12-m10",
4237
+ "type": "semantic",
4238
+ "timestamp": "2026-05-12T18:00:00Z",
4239
+ "content": "Distractor: Thalia runs a similar-sounding effort with ClickHouse metrics cluster, but it is not Talia's support-ticket insight panel.",
4240
+ "tags": [
4241
+ "near-entity",
4242
+ "distractor"
4243
+ ],
4244
+ "importance": 0.5,
4245
+ "metadata": {
4246
+ "role": "distractor",
4247
+ "source_turn": "t10"
4248
+ },
4249
+ "associations": []
4250
+ },
4251
+ {
4252
+ "id": "gsh-c12-m11",
4253
+ "type": "semantic",
4254
+ "timestamp": "2026-05-12T19:00:00Z",
4255
+ "content": "For Talia's support-ticket insight panel, the accountable owner is Irene Moss.",
4256
+ "tags": [
4257
+ "near-entity",
4258
+ "owner"
4259
+ ],
4260
+ "importance": 0.86,
4261
+ "metadata": {
4262
+ "role": "entity-answer",
4263
+ "source_turn": "t11"
4264
+ },
4265
+ "associations": [
4266
+ {
4267
+ "target_id": "gsh-c12-m02",
4268
+ "strength": 0.9,
4269
+ "relation": "owned_by"
4270
+ }
4271
+ ]
4272
+ },
4273
+ {
4274
+ "id": "gsh-c12-m12",
4275
+ "type": "semantic",
4276
+ "timestamp": "2026-05-12T20:00:00Z",
4277
+ "content": "Tempting gap: Thalia's similar effort mentions customer advisory board schedule, but Talia's records do not state that value.",
4278
+ "tags": [
4279
+ "abstention",
4280
+ "tempting-distractor"
4281
+ ],
4282
+ "importance": 0.45,
4283
+ "metadata": {
4284
+ "role": "abstention-distractor",
4285
+ "source_turn": "t12"
4286
+ },
4287
+ "associations": []
4288
+ }
4289
+ ],
4290
+ "questions": [
4291
+ {
4292
+ "question_id": "gsh-c12-q01",
4293
+ "category": "multi_hop_association",
4294
+ "question": "What operational dependency is ultimately used by Talia's codename Prism Desk?",
4295
+ "expected_answer": "DuckDB local analytics store",
4296
+ "required_memory_ids": [
4297
+ "gsh-c12-m01",
4298
+ "gsh-c12-m02",
4299
+ "gsh-c12-m03"
4300
+ ],
4301
+ "forbidden_memory_ids": [
4302
+ "gsh-c12-m10"
4303
+ ],
4304
+ "difficulty": "hard",
4305
+ "architecture_bias_risk": "intentional_graph_stress",
4306
+ "fairness_note": "Requires codename to project to bridge to dependency; no single memory contains both codename and answer."
4307
+ },
4308
+ {
4309
+ "question_id": "gsh-c12-q02",
4310
+ "category": "multi_hop_association",
4311
+ "question": "Which artifact is needed before the release approval meeting can be signed off for Talia's work?",
4312
+ "expected_answer": "escalation-label audit file",
4313
+ "required_memory_ids": [
4314
+ "gsh-c12-m04",
4315
+ "gsh-c12-m05",
4316
+ "gsh-c12-m06"
4317
+ ],
4318
+ "forbidden_memory_ids": [
4319
+ "gsh-c12-m10"
4320
+ ],
4321
+ "difficulty": "hard",
4322
+ "architecture_bias_risk": "intentional_graph_stress",
4323
+ "fairness_note": "Uses weak lexical overlap between release approval meeting and the internal ritual name."
4324
+ },
4325
+ {
4326
+ "question_id": "gsh-c12-q03",
4327
+ "category": "contradiction_resolution",
4328
+ "question": "What is the current resolved state for support-ticket insight panel?",
4329
+ "expected_answer": "charts refreshed every fifteen minutes",
4330
+ "required_memory_ids": [
4331
+ "gsh-c12-m09"
4332
+ ],
4333
+ "forbidden_memory_ids": [
4334
+ "gsh-c12-m07",
4335
+ "gsh-c12-m08"
4336
+ ],
4337
+ "difficulty": "hard",
4338
+ "architecture_bias_risk": "intentional_graph_stress",
4339
+ "fairness_note": "Final memory must beat stale and interim states connected by replaces/replaced_by edges."
4340
+ },
4341
+ {
4342
+ "question_id": "gsh-c12-q04",
4343
+ "category": "graph_traversal",
4344
+ "question": "For the approval chain in Talia's project, which team owns the internal ritual?",
4345
+ "expected_answer": "quality review team",
4346
+ "required_memory_ids": [
4347
+ "gsh-c12-m04",
4348
+ "gsh-c12-m05"
4349
+ ],
4350
+ "forbidden_memory_ids": [
4351
+ "gsh-c12-m10"
4352
+ ],
4353
+ "difficulty": "medium",
4354
+ "architecture_bias_risk": "intentional_graph_stress",
4355
+ "fairness_note": "Retrieval should cross from the approval phrasing to the internal ritual and owner team."
4356
+ },
4357
+ {
4358
+ "question_id": "gsh-c12-q05",
4359
+ "category": "abstention",
4360
+ "question": "What is the customer advisory board schedule for Talia's project?",
4361
+ "expected_answer": "Not stated in the available memories for Talia's project.",
4362
+ "required_memory_ids": [],
4363
+ "forbidden_memory_ids": [
4364
+ "gsh-c12-m12",
4365
+ "gsh-c12-m10"
4366
+ ],
4367
+ "difficulty": "hard",
4368
+ "architecture_bias_risk": "intentional_graph_stress",
4369
+ "fairness_note": "The dataset contains a tempting value for a similar entity, but no answer for the target entity."
4370
+ }
4371
+ ]
4372
+ }
4373
+ ]
4374
+ }