@1mbrain/benchmarks 0.1.1

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (69)
  1. package/README.md +85 -0
  2. package/fixtures/1mbrain-focused-mini/1mbrain-focused-mini.json +928 -0
  3. package/fixtures/1mbrain-focused-mini/README.md +45 -0
  4. package/fixtures/adversarial-memory/dataset_claude_adversarial.json +3333 -0
  5. package/fixtures/adversarial-memory/dataset_gemini_adversarial_memory.json +2984 -0
  6. package/fixtures/balanced-mini/dataset_claude_balanced_mini.json +2077 -0
  7. package/fixtures/balanced-mini/dataset_gemini_balanced_mini.json +1995 -0
  8. package/fixtures/generate_datasets.js +1741 -0
  9. package/fixtures/graph-stress-hard/README.md +43 -0
  10. package/fixtures/graph-stress-hard/dataset_graph_stress_hard.json +4374 -0
  11. package/fixtures/graph-stress-hard/generate_graph_stress_hard.js +526 -0
  12. package/fixtures/realistic-medium/dataset_claude_realistic_medium.json +7462 -0
  13. package/fixtures/realistic-medium/dataset_gemini_realistic_medium.json +7277 -0
  14. package/fixtures/realistic-medium/gen_claude_medium.js +600 -0
  15. package/package.json +22 -0
  16. package/reports/benchmark_report.md +48 -0
  17. package/reports/benchmark_report_claude_adversarial.md +42 -0
  18. package/reports/benchmark_report_claude_adversarial_adaptive.md +42 -0
  19. package/reports/benchmark_report_claude_adversarial_adaptive2_fast.md +42 -0
  20. package/reports/benchmark_report_claude_adversarial_adaptive_fast.md +42 -0
  21. package/reports/benchmark_report_claude_adversarial_rerank.md +42 -0
  22. package/reports/benchmark_report_claude_balanced_mini.md +42 -0
  23. package/reports/benchmark_report_claude_balanced_mini_adaptive.md +42 -0
  24. package/reports/benchmark_report_claude_balanced_mini_adaptive2_fast.md +42 -0
  25. package/reports/benchmark_report_claude_balanced_mini_adaptive_fast.md +42 -0
  26. package/reports/benchmark_report_claude_balanced_mini_rerank.md +42 -0
  27. package/reports/benchmark_report_claude_realistic_medium.md +42 -0
  28. package/reports/benchmark_report_claude_realistic_medium_adaptive.md +42 -0
  29. package/reports/benchmark_report_claude_realistic_medium_adaptive2_fast.md +42 -0
  30. package/reports/benchmark_report_claude_realistic_medium_adaptive_fast.md +42 -0
  31. package/reports/benchmark_report_claude_realistic_medium_evidence_rerank_local.md +42 -0
  32. package/reports/benchmark_report_claude_realistic_medium_openai_evidence_rerank.md +41 -0
  33. package/reports/benchmark_report_claude_realistic_medium_openai_multi_signal.md +41 -0
  34. package/reports/benchmark_report_claude_realistic_medium_openai_multi_signal_scoped.md +41 -0
  35. package/reports/benchmark_report_claude_realistic_medium_openai_phase8_no_judge.md +42 -0
  36. package/reports/benchmark_report_claude_realistic_medium_openai_rankingpolicy.md +41 -0
  37. package/reports/benchmark_report_claude_realistic_medium_openai_stale_filter.md +41 -0
  38. package/reports/benchmark_report_claude_realistic_medium_openai_stale_filter_absence_fix.md +41 -0
  39. package/reports/benchmark_report_claude_realistic_medium_openai_write_time_invalidation.md +41 -0
  40. package/reports/benchmark_report_claude_realistic_medium_rerank.md +42 -0
  41. package/reports/benchmark_report_claude_realistic_medium_stale_filter_local.md +42 -0
  42. package/reports/benchmark_report_graph_stress_hard.md +42 -0
  43. package/reports/benchmark_report_graph_stress_hard_absence_fix.md +42 -0
  44. package/reports/benchmark_report_graph_stress_hard_adaptive.md +42 -0
  45. package/reports/benchmark_report_graph_stress_hard_evidence_rerank.md +42 -0
  46. package/reports/benchmark_report_graph_stress_hard_multi_signal_current_guardrail.md +42 -0
  47. package/reports/benchmark_report_graph_stress_hard_multi_signal_guardrail_fixed.md +42 -0
  48. package/reports/benchmark_report_graph_stress_hard_multi_signal_local.md +42 -0
  49. package/reports/benchmark_report_graph_stress_hard_multi_signal_scoped_guardrail.md +42 -0
  50. package/reports/benchmark_report_graph_stress_hard_multi_signal_vector_pure_guardrail.md +42 -0
  51. package/reports/benchmark_report_graph_stress_hard_phase8_sdk_guardrail.md +42 -0
  52. package/reports/benchmark_report_graph_stress_hard_rerank.md +42 -0
  53. package/reports/benchmark_report_graph_stress_hard_stale_filter.md +42 -0
  54. package/reports/benchmark_report_graph_stress_hard_write_time_invalidation.md +42 -0
  55. package/results/.gitignore +2 -0
  56. package/src/adapters/1mbrain.ts +317 -0
  57. package/src/adapters/keyword-embedding.ts +48 -0
  58. package/src/adapters/mem0.ts +124 -0
  59. package/src/adapters/qdrant.ts +214 -0
  60. package/src/adapters/unavailable.ts +49 -0
  61. package/src/adapters/vector-baseline.ts +149 -0
  62. package/src/datasets/focused-mini.ts +158 -0
  63. package/src/datasets/synthetic-agent-memory.ts +532 -0
  64. package/src/llm-evaluator.ts +262 -0
  65. package/src/metrics.ts +482 -0
  66. package/src/provider.ts +151 -0
  67. package/src/runner.ts +635 -0
  68. package/tsconfig.json +10 -0
  69. package/tsconfig.tsbuildinfo +1 -0
@@ -0,0 +1,526 @@
1
+ import fs from 'node:fs';
2
+ import path from 'node:path';
3
+ import { fileURLToPath } from 'node:url';
4
+
5
// ES modules do not define __dirname, so rebuild it from this module's own URL.
// It is used at the bottom of the script to write the dataset next to this file.
const __dirname = path.dirname(fileURLToPath(import.meta.url));
6
+
7
/**
 * Scenario templates; buildConversation expands each entry into one conversation.
 *
 * Field usage (as read by buildConversation):
 * - id: conversation id and prefix for every memory (`-mNN`) and question (`-qNN`) id.
 * - agent / domain: copied to the conversation's agent_id / domain.
 * - person, codename, project, bridge, answer: the codename -> project -> bridge ->
 *   dependency chain (memories 1-3, question 1).
 * - ritual, team, artifact: the approval chain (memories 4-6, questions 2 and 4).
 * - stale, middle, final: initial, interim and final project state (memories 7-9, question 3).
 * - distractorName, distractorAnswer: the near-name distractor (memory 10).
 * - entityAnswer: accountable owner (memory 11, entity-disambiguation question).
 * - missingTopic: topic mentioned only for the distractor (memory 12, abstention question).
 *
 * The array order matters: the position of an entry sets the day used in its timestamps
 * and decides which variant of question 5 it receives.
 */
const scenarios = [
  {
    id: 'gsh-c01',
    agent: 'agent-launch-ops',
    domain: 'software',
    person: 'Rina',
    codename: 'Aurora',
    project: 'customer-retention project',
    bridge: 'renewal-risk initiative',
    answer: 'Postgres event pipeline',
    ritual: 'Copper Gate',
    team: 'release assurance team',
    artifact: 'signed rollback matrix',
    stale: 'email-only weekly digest',
    middle: 'Slack digest with email fallback',
    final: 'in-app digest with Slack fallback',
    distractorName: 'Aurelia',
    distractorAnswer: 'Redis queue',
    entityAnswer: 'Sofia Marin',
    missingTopic: 'budget cap',
  },
  {
    id: 'gsh-c02',
    agent: 'agent-clinical-research',
    domain: 'research',
    person: 'Devon',
    codename: 'Blue Lantern',
    project: 'rural asthma cohort',
    bridge: 'inhaler-adherence study',
    answer: 'FHIR observation export',
    ritual: 'Silver File',
    team: 'ethics submission team',
    artifact: 'redacted consent ledger',
    stale: 'manual spreadsheet tracking',
    middle: 'Airtable tracking',
    final: 'REDCap tracking with nightly export',
    distractorName: 'Devan',
    distractorAnswer: 'HL7 batch import',
    entityAnswer: 'Priya Shah',
    missingTopic: 'participant compensation amount',
  },
  {
    id: 'gsh-c03',
    agent: 'agent-finance-ops',
    domain: 'finance',
    person: 'Mika',
    codename: 'Northstar',
    project: 'invoice anomaly monitor',
    bridge: 'vendor-risk workflow',
    answer: 'BigQuery audit table',
    ritual: 'Pine Review',
    team: 'controls validation team',
    artifact: 'variance exception register',
    stale: 'threshold at USD 500',
    middle: 'threshold at USD 750',
    final: 'threshold at USD 1,200 with CFO override',
    distractorName: 'Mica',
    distractorAnswer: 'Snowflake staging table',
    entityAnswer: 'Elena Costa',
    missingTopic: 'insurance carrier',
  },
  {
    id: 'gsh-c04',
    agent: 'agent-legal-casework',
    domain: 'legal',
    person: 'Amina',
    codename: 'Harbor',
    project: 'lease-renewal dispute',
    bridge: 'tenant-notice timeline',
    answer: 'Clause 14 cure-period memo',
    ritual: 'Green Tab',
    team: 'paralegal review team',
    artifact: 'service affidavit packet',
    stale: 'hearing expected in March',
    middle: 'hearing moved to April',
    final: 'hearing stayed pending mediation',
    distractorName: 'Ameena',
    distractorAnswer: 'Clause 9 insurance memo',
    entityAnswer: 'Marco Bell',
    missingTopic: 'settlement floor',
  },
  {
    id: 'gsh-c05',
    agent: 'agent-education-design',
    domain: 'education',
    person: 'Leo',
    codename: 'Riverglass',
    project: 'ninth-grade climate unit',
    bridge: 'field-data practicum',
    answer: 'watershed sensor dataset',
    ritual: 'Lantern Check',
    team: 'curriculum alignment team',
    artifact: 'rubric calibration sheet',
    stale: 'final project as a poster',
    middle: 'final project as a slide deck',
    final: 'final project as a data story notebook',
    distractorName: 'Lio',
    distractorAnswer: 'weather-station photo set',
    entityAnswer: 'Nora Iqbal',
    missingTopic: 'bus pickup location',
  },
  {
    id: 'gsh-c06',
    agent: 'agent-health-coach',
    domain: 'healthcare',
    person: 'Samir',
    codename: 'Quiet Mile',
    project: 'post-surgery walking plan',
    bridge: 'mobility milestone tracker',
    answer: '10-minute evening walk',
    ritual: 'White Card',
    team: 'care-transition team',
    artifact: 'pain-escalation checklist',
    stale: 'stairs avoided completely',
    middle: 'stairs allowed with supervision',
    final: 'stairs allowed twice daily with handrail',
    distractorName: 'Samira',
    distractorAnswer: '15-minute morning bike',
    entityAnswer: 'Dr. Lin Patel',
    missingTopic: 'pharmacy copay',
  },
  {
    id: 'gsh-c07',
    agent: 'agent-creative-studio',
    domain: 'creative',
    person: 'Iskandar',
    codename: 'Glass Orchard',
    project: 'ambient album launch',
    bridge: 'vinyl preorder campaign',
    answer: 'Bandcamp private presale page',
    ritual: 'Blue Room',
    team: 'mastering coordination team',
    artifact: 'final lacquer approval note',
    stale: 'release set for July',
    middle: 'release moved to August',
    final: 'release locked for September 6',
    distractorName: 'Iskander',
    distractorAnswer: 'Spotify canvas package',
    entityAnswer: 'Marta Velasquez',
    missingTopic: 'tour opener',
  },
  {
    id: 'gsh-c08',
    agent: 'agent-travel-planner',
    domain: 'personal_assistant',
    person: 'Raf',
    codename: 'Lighthouse',
    project: 'Lisbon family itinerary',
    bridge: 'museum-access plan',
    answer: 'Tile Museum quiet-entry slot',
    ritual: 'Map Fold',
    team: 'reservation support team',
    artifact: 'accessibility confirmation email',
    stale: 'hostel in Baixa',
    middle: 'apartment in Alfama',
    final: 'boutique hotel in Chiado',
    distractorName: 'Rafi',
    distractorAnswer: 'tram pass kiosk',
    entityAnswer: 'Tomas Freire',
    missingTopic: 'airport transfer vendor',
  },
  {
    id: 'gsh-c09',
    agent: 'agent-security-review',
    domain: 'software',
    person: 'Nadia',
    codename: 'Iron Finch',
    project: 'API abuse detection',
    bridge: 'credential-stuffing response',
    answer: 'edge rate-limit rule set',
    ritual: 'Black Seal',
    team: 'incident readiness team',
    artifact: 'pager escalation map',
    stale: 'block after 20 failed attempts',
    middle: 'challenge after 15 failed attempts',
    final: 'challenge after 8 failed attempts plus IP reputation',
    distractorName: 'Nadya',
    distractorAnswer: 'WAF geo-block list',
    entityAnswer: 'Owen Hart',
    missingTopic: 'annual security budget',
  },
  {
    id: 'gsh-c10',
    agent: 'agent-supply-chain',
    domain: 'operations',
    person: 'Jonas',
    codename: 'Cedar Path',
    project: 'warehouse slotting refresh',
    bridge: 'cold-chain pick route',
    answer: 'zone C thermal scanner',
    ritual: 'Orange Tag',
    team: 'inventory audit team',
    artifact: 'lot-expiry reconciliation sheet',
    stale: 'refresh every quarter',
    middle: 'refresh every two months',
    final: 'refresh every six weeks during peak season',
    distractorName: 'Jona',
    distractorAnswer: 'zone A RFID gate',
    entityAnswer: 'Mina Park',
    missingTopic: 'forklift maintenance vendor',
  },
  {
    id: 'gsh-c11',
    agent: 'agent-civic-data',
    domain: 'research',
    person: 'Yara',
    codename: 'Civic Loom',
    project: 'public-comment clustering',
    bridge: 'transportation hearing archive',
    answer: 'speaker-topic matrix',
    ritual: 'Grey Index',
    team: 'open-records team',
    artifact: 'redaction exception log',
    stale: 'archive limited to 2024 hearings',
    middle: 'archive expanded to 2025 hearings',
    final: 'archive covers 2023 through 2026 hearings',
    distractorName: 'Yarra',
    distractorAnswer: 'council-vote matrix',
    entityAnswer: 'Hugo Stern',
    missingTopic: 'grant renewal date',
  },
  {
    id: 'gsh-c12',
    agent: 'agent-product-analytics',
    domain: 'software',
    person: 'Talia',
    codename: 'Prism Desk',
    project: 'support-ticket insight panel',
    bridge: 'agent-escalation taxonomy',
    answer: 'DuckDB local analytics store',
    ritual: 'Crimson Pass',
    team: 'quality review team',
    artifact: 'escalation-label audit file',
    stale: 'charts refreshed daily',
    middle: 'charts refreshed hourly',
    final: 'charts refreshed every fifteen minutes',
    distractorName: 'Thalia',
    distractorAnswer: 'ClickHouse metrics cluster',
    entityAnswer: 'Irene Moss',
    missingTopic: 'customer advisory board schedule',
  },
];
249
+
250
/**
 * Creates one association edge pointing from a memory record to another memory.
 *
 * Properties are assigned in the order target_id, strength, relation so the
 * serialized JSON keeps a stable key order.
 *
 * @param {string} targetId - Id of the memory the edge points at.
 * @param {string} relation - Edge label, e.g. 'alias_of' or 'replaces'.
 * @param {number} [strength=0.9] - Edge weight; defaults to 0.9 when omitted.
 * @returns {{target_id: string, strength: number, relation: string}} The edge object.
 */
function assoc(targetId, relation, strength = 0.9) {
  const edge = {};
  edge.target_id = targetId;
  edge.strength = strength;
  edge.relation = relation;
  return edge;
}
253
+
254
/**
 * Assembles a single memory record for the dataset.
 *
 * Note that the output places `metadata` before `associations`, which is the
 * reverse of the parameter order; that key order is kept for the serialized JSON.
 *
 * @param {string} id - Memory id.
 * @param {string} type - Memory type, e.g. 'semantic', 'episodic' or 'procedural'.
 * @param {string} timestamp - Timestamp string stored on the record.
 * @param {string} content - Memory text.
 * @param {string[]} tags - Tags attached to the record.
 * @param {number} importance - Importance score stored on the record.
 * @param {object[]} [associations=[]] - Outgoing association edges.
 * @param {object} [metadata={}] - Free-form metadata for the record.
 * @returns {object} The memory record.
 */
function mem(id, type, timestamp, content, tags, importance, associations = [], metadata = {}) {
  const identity = { id, type, timestamp };
  const payload = { content, tags, importance };
  const links = { metadata, associations };
  // Spread order fixes the key order of the resulting record.
  return { ...identity, ...payload, ...links };
}
266
+
267
/**
 * Assembles a single benchmark question record.
 *
 * @param {string} id - Question id.
 * @param {string} category - Question category; 'atomic_fact_recall' is the only
 *   category labelled with a 'low' architecture bias risk.
 * @param {string} text - The question text.
 * @param {string} expected - Expected answer.
 * @param {string[]} required - Ids of memories needed to answer.
 * @param {string[]} [forbidden=[]] - Ids of memories that must not be used.
 * @param {string} [difficulty='hard'] - Difficulty label.
 * @param {string} [note=''] - Fairness note stored with the question.
 * @returns {object} The question record.
 */
function question(id, category, text, expected, required, forbidden = [], difficulty = 'hard', note = '') {
  let biasRisk = 'intentional_graph_stress';
  if (category === 'atomic_fact_recall') {
    biasRisk = 'low';
  }

  const record = {
    question_id: id,
    category,
    question: text,
    expected_answer: expected,
    required_memory_ids: required,
    forbidden_memory_ids: forbidden,
    difficulty,
    architecture_bias_risk: biasRisk,
    fairness_note: note,
  };
  return record;
}
280
+
281
/**
 * Expands one scenario template into a conversation with 12 memory records and
 * 5 questions.
 *
 * Memory n is stamped at hour n + 8 (09:00 through 20:00) on day `index + 1` of
 * May 2026 and carries `source_turn` 'tNN'. The fifth question is an
 * entity-disambiguation question for the first seven scenarios (index < 7) and
 * an abstention question for the rest.
 *
 * @param {object} s - Scenario template (one entry of `scenarios`).
 * @param {number} index - Zero-based position of the scenario.
 * @returns {{conversation_id: string, agent_id: string, domain: string,
 *   memory_records: object[], questions: object[]}} The conversation.
 */
function buildConversation(s, index) {
  const {
    id: prefix,
    person,
    codename,
    project,
    bridge,
    answer,
    ritual,
    team,
    artifact,
    stale,
    middle,
    final: finalState,
    distractorName,
    distractorAnswer,
    entityAnswer,
    missingTopic,
  } = s;

  const pad2 = (value) => String(value).padStart(2, '0');
  const m = (n) => `${prefix}-m${pad2(n)}`;
  const q = (n) => `${prefix}-q${pad2(n)}`;
  const day = pad2(index + 1);
  const ts = (hour) => `2026-05-${day}T${pad2(hour)}:00:00Z`;

  // Memory n always sits at hour n + 8 and source turn tNN, so derive both from n.
  const record = (n, type, content, tags, importance, associations, role) =>
    mem(m(n), type, ts(n + 8), content, tags, importance, associations, {
      role,
      source_turn: `t${pad2(n)}`,
    });

  const codenameTag = codename.toLowerCase().replace(/\s+/g, '-');
  const projectTag = project.split(' ')[0].toLowerCase();
  const bridgeTag = bridge.split(' ')[0].toLowerCase();

  const memories = [
    // 1-3: codename -> project -> bridge -> dependency chain.
    record(
      1,
      'semantic',
      `${person} uses codename ${codename} for the active workstream, but the codename itself does not state the operational dependency.`,
      [codenameTag, 'codename'],
      0.7,
      [assoc(m(2), 'alias_of')],
      'anchor',
    ),
    record(
      2,
      'semantic',
      `Codename ${codename} refers to the ${project}, which is governed through the ${bridge}.`,
      [projectTag, 'project-link'],
      0.8,
      [assoc(m(1), 'alias_of'), assoc(m(3), 'depends_on')],
      'bridge',
    ),
    record(
      3,
      'semantic',
      `The ${bridge} depends on the ${answer}; this dependency is not named in the codename discussion.`,
      [bridgeTag, 'dependency'],
      0.9,
      [assoc(m(2), 'supports')],
      'answer',
    ),
    // 4-6: approval ritual -> owning team -> required artifact.
    record(
      4,
      'episodic',
      `${person} said the approval ritual is internally called ${ritual}; outside notes describe it as the release approval meeting.`,
      ['approval', 'ritual'],
      0.75,
      [assoc(m(5), 'owned_by')],
      'weak-overlap-anchor',
    ),
    record(
      5,
      'semantic',
      `${ritual} is owned by the ${team}, not by the team whose name resembles the project codename.`,
      ['approval', 'team'],
      0.82,
      [assoc(m(4), 'owns'), assoc(m(6), 'requires')],
      'weak-overlap-bridge',
    ),
    record(
      6,
      'procedural',
      `Before ${team} signs off, they require the ${artifact}.`,
      ['approval', 'artifact'],
      0.88,
      [assoc(m(5), 'required_by')],
      'weak-overlap-answer',
    ),
    // 7-9: stale -> interim -> final state, linked by replaces/replaced_by.
    record(
      7,
      'semantic',
      `Initial state for ${project}: ${stale}.`,
      ['current-state', 'stale'],
      0.55,
      [assoc(m(8), 'replaced_by', 0.95)],
      'stale',
    ),
    record(
      8,
      'episodic',
      `Interim update for ${project}: ${middle}, replacing the initial state.`,
      ['current-state', 'interim'],
      0.65,
      [assoc(m(7), 'replaces', 0.95), assoc(m(9), 'replaced_by', 0.98)],
      'interim',
    ),
    record(
      9,
      'semantic',
      `Final resolved state for ${project}: ${finalState}. This supersedes all earlier versions.`,
      ['current-state', 'final'],
      0.95,
      [assoc(m(8), 'replaces', 0.98), assoc(m(7), 'replaces', 0.98)],
      'final',
    ),
    // 10-12: near-name distractor, real owner, and the tempting gap.
    record(
      10,
      'semantic',
      `Distractor: ${distractorName} runs a similar-sounding effort with ${distractorAnswer}, but it is not ${person}'s ${project}.`,
      ['near-entity', 'distractor'],
      0.5,
      [],
      'distractor',
    ),
    record(
      11,
      'semantic',
      `For ${person}'s ${project}, the accountable owner is ${entityAnswer}.`,
      ['near-entity', 'owner'],
      0.86,
      [assoc(m(2), 'owned_by')],
      'entity-answer',
    ),
    record(
      12,
      'semantic',
      `Tempting gap: ${distractorName}'s similar effort mentions ${missingTopic}, but ${person}'s records do not state that value.`,
      ['abstention', 'tempting-distractor'],
      0.45,
      [],
      'abstention-distractor',
    ),
  ];

  const questions = [
    question(
      q(1),
      'multi_hop_association',
      `What operational dependency is ultimately used by ${person}'s codename ${codename}?`,
      answer,
      [m(1), m(2), m(3)],
      [m(10)],
      'hard',
      'Requires codename to project to bridge to dependency; no single memory contains both codename and answer.',
    ),
    question(
      q(2),
      'multi_hop_association',
      `Which artifact is needed before the release approval meeting can be signed off for ${person}'s work?`,
      artifact,
      [m(4), m(5), m(6)],
      [m(10)],
      'hard',
      'Uses weak lexical overlap between release approval meeting and the internal ritual name.',
    ),
    question(
      q(3),
      'contradiction_resolution',
      `What is the current resolved state for ${project}?`,
      finalState,
      [m(9)],
      [m(7), m(8)],
      'hard',
      'Final memory must beat stale and interim states connected by replaces/replaced_by edges.',
    ),
    question(
      q(4),
      'graph_traversal',
      `For the approval chain in ${person}'s project, which team owns the internal ritual?`,
      team,
      [m(4), m(5)],
      [m(10)],
      'medium',
      'Retrieval should cross from the approval phrasing to the internal ritual and owner team.',
    ),
  ];

  // Question 5 varies by position: the first seven scenarios get the
  // entity-disambiguation variant, the remaining ones the abstention variant.
  if (index < 7) {
    questions.push(
      question(
        q(5),
        'entity_disambiguation',
        `Who is the accountable owner for ${person}'s project, not ${distractorName}'s similar effort?`,
        entityAnswer,
        [m(11)],
        [m(10), m(12)],
        'medium',
        'Near-name distractors are intentionally tempting but wrong.',
      ),
    );
  } else {
    questions.push(
      question(
        q(5),
        'abstention',
        `What is the ${missingTopic} for ${person}'s project?`,
        `Not stated in the available memories for ${person}'s project.`,
        [],
        [m(12), m(10)],
        'hard',
        'The dataset contains a tempting value for a similar entity, but no answer for the target entity.',
      ),
    );
  }

  return {
    conversation_id: prefix,
    agent_id: s.agent,
    domain: s.domain,
    memory_records: memories,
    questions,
  };
}
483
+
484
// Expand every scenario template into a full conversation; the array position
// is forwarded because buildConversation uses it for dates and question variants.
const conversations = scenarios.map((scenario, position) => buildConversation(scenario, position));

// Tally totals and per-category question counts for the dataset's stats section.
const questionCategories = {};
let memoryCount = 0;
let questionCount = 0;
conversations.forEach(({ memory_records: records, questions }) => {
  memoryCount += records.length;
  questionCount += questions.length;
  questions.forEach(({ category }) => {
    questionCategories[category] = (questionCategories[category] ?? 0) + 1;
  });
});

const stats = {
  conversations: conversations.length,
  memory_records: memoryCount,
  questions: questionCount,
  question_categories: questionCategories,
};

const fairnessNotes = [
  'This is intentionally not a provider-neutral public leaderboard fixture; it is a graph-stress diagnostic for tuning association traversal and conflict semantics.',
  'Many questions require two or three required memories, and no single memory contains both the query anchor and final answer.',
  'Conflict questions include stale, interim, and final states linked by replaces/replaced_by associations.',
  'Weak-overlap graph traversal questions use different surface wording for the query anchor and the target memory.',
  'Abstention questions contain tempting distractor facts for similar entities but no answer for the target entity.',
];

// Key order here is the key order of the emitted JSON file.
const dataset = {
  name: 'graph-stress-hard',
  description:
    'Diagnostic benchmark for multi-hop association, conflict resolution, graph traversal with weak lexical overlap, near-entity distractors, and abstention under tempting distractors.',
  generated_at: '2026-06-19',
  generated_by: 'deterministic-template',
  version: '1.0.0',
  fairness_notes: fairnessNotes,
  stats,
  conversations,
};

// Write the fixture next to this script, pretty-printed with a trailing newline.
const outputPath = path.join(__dirname, 'dataset_graph_stress_hard.json');
const serialized = `${JSON.stringify(dataset, null, 2)}\n`;
fs.writeFileSync(outputPath, serialized);

console.log(
  `Generated graph-stress-hard: ${conversations.length} conversations, ${memoryCount} memories, ${questionCount} questions.`,
);