@1mbrain/benchmarks 0.1.1

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (69) hide show
  1. package/README.md +85 -0
  2. package/fixtures/1mbrain-focused-mini/1mbrain-focused-mini.json +928 -0
  3. package/fixtures/1mbrain-focused-mini/README.md +45 -0
  4. package/fixtures/adversarial-memory/dataset_claude_adversarial.json +3333 -0
  5. package/fixtures/adversarial-memory/dataset_gemini_adversarial_memory.json +2984 -0
  6. package/fixtures/balanced-mini/dataset_claude_balanced_mini.json +2077 -0
  7. package/fixtures/balanced-mini/dataset_gemini_balanced_mini.json +1995 -0
  8. package/fixtures/generate_datasets.js +1741 -0
  9. package/fixtures/graph-stress-hard/README.md +43 -0
  10. package/fixtures/graph-stress-hard/dataset_graph_stress_hard.json +4374 -0
  11. package/fixtures/graph-stress-hard/generate_graph_stress_hard.js +526 -0
  12. package/fixtures/realistic-medium/dataset_claude_realistic_medium.json +7462 -0
  13. package/fixtures/realistic-medium/dataset_gemini_realistic_medium.json +7277 -0
  14. package/fixtures/realistic-medium/gen_claude_medium.js +600 -0
  15. package/package.json +22 -0
  16. package/reports/benchmark_report.md +48 -0
  17. package/reports/benchmark_report_claude_adversarial.md +42 -0
  18. package/reports/benchmark_report_claude_adversarial_adaptive.md +42 -0
  19. package/reports/benchmark_report_claude_adversarial_adaptive2_fast.md +42 -0
  20. package/reports/benchmark_report_claude_adversarial_adaptive_fast.md +42 -0
  21. package/reports/benchmark_report_claude_adversarial_rerank.md +42 -0
  22. package/reports/benchmark_report_claude_balanced_mini.md +42 -0
  23. package/reports/benchmark_report_claude_balanced_mini_adaptive.md +42 -0
  24. package/reports/benchmark_report_claude_balanced_mini_adaptive2_fast.md +42 -0
  25. package/reports/benchmark_report_claude_balanced_mini_adaptive_fast.md +42 -0
  26. package/reports/benchmark_report_claude_balanced_mini_rerank.md +42 -0
  27. package/reports/benchmark_report_claude_realistic_medium.md +42 -0
  28. package/reports/benchmark_report_claude_realistic_medium_adaptive.md +42 -0
  29. package/reports/benchmark_report_claude_realistic_medium_adaptive2_fast.md +42 -0
  30. package/reports/benchmark_report_claude_realistic_medium_adaptive_fast.md +42 -0
  31. package/reports/benchmark_report_claude_realistic_medium_evidence_rerank_local.md +42 -0
  32. package/reports/benchmark_report_claude_realistic_medium_openai_evidence_rerank.md +41 -0
  33. package/reports/benchmark_report_claude_realistic_medium_openai_multi_signal.md +41 -0
  34. package/reports/benchmark_report_claude_realistic_medium_openai_multi_signal_scoped.md +41 -0
  35. package/reports/benchmark_report_claude_realistic_medium_openai_phase8_no_judge.md +42 -0
  36. package/reports/benchmark_report_claude_realistic_medium_openai_rankingpolicy.md +41 -0
  37. package/reports/benchmark_report_claude_realistic_medium_openai_stale_filter.md +41 -0
  38. package/reports/benchmark_report_claude_realistic_medium_openai_stale_filter_absence_fix.md +41 -0
  39. package/reports/benchmark_report_claude_realistic_medium_openai_write_time_invalidation.md +41 -0
  40. package/reports/benchmark_report_claude_realistic_medium_rerank.md +42 -0
  41. package/reports/benchmark_report_claude_realistic_medium_stale_filter_local.md +42 -0
  42. package/reports/benchmark_report_graph_stress_hard.md +42 -0
  43. package/reports/benchmark_report_graph_stress_hard_absence_fix.md +42 -0
  44. package/reports/benchmark_report_graph_stress_hard_adaptive.md +42 -0
  45. package/reports/benchmark_report_graph_stress_hard_evidence_rerank.md +42 -0
  46. package/reports/benchmark_report_graph_stress_hard_multi_signal_current_guardrail.md +42 -0
  47. package/reports/benchmark_report_graph_stress_hard_multi_signal_guardrail_fixed.md +42 -0
  48. package/reports/benchmark_report_graph_stress_hard_multi_signal_local.md +42 -0
  49. package/reports/benchmark_report_graph_stress_hard_multi_signal_scoped_guardrail.md +42 -0
  50. package/reports/benchmark_report_graph_stress_hard_multi_signal_vector_pure_guardrail.md +42 -0
  51. package/reports/benchmark_report_graph_stress_hard_phase8_sdk_guardrail.md +42 -0
  52. package/reports/benchmark_report_graph_stress_hard_rerank.md +42 -0
  53. package/reports/benchmark_report_graph_stress_hard_stale_filter.md +42 -0
  54. package/reports/benchmark_report_graph_stress_hard_write_time_invalidation.md +42 -0
  55. package/results/.gitignore +2 -0
  56. package/src/adapters/1mbrain.ts +317 -0
  57. package/src/adapters/keyword-embedding.ts +48 -0
  58. package/src/adapters/mem0.ts +124 -0
  59. package/src/adapters/qdrant.ts +214 -0
  60. package/src/adapters/unavailable.ts +49 -0
  61. package/src/adapters/vector-baseline.ts +149 -0
  62. package/src/datasets/focused-mini.ts +158 -0
  63. package/src/datasets/synthetic-agent-memory.ts +532 -0
  64. package/src/llm-evaluator.ts +262 -0
  65. package/src/metrics.ts +482 -0
  66. package/src/provider.ts +151 -0
  67. package/src/runner.ts +635 -0
  68. package/tsconfig.json +10 -0
  69. package/tsconfig.tsbuildinfo +1 -0
@@ -0,0 +1,928 @@
1
+ {
2
+ "name": "1mbrain-focused-mini",
3
+ "version": "0.1.0",
4
+ "generated_at": "2026-06-19T00:00:00.000Z",
5
+ "license": "MIT",
6
+ "description": "Small directed benchmark dataset for evaluating 1MBrain memory behavior with low API usage.",
7
+ "api_budget_profile": {
8
+ "recommended_mode": "retrieval_only",
9
+ "llm_judge_required": false,
10
+ "estimated_conversations": 5,
11
+ "estimated_turns": 40,
12
+ "estimated_memory_records": 41,
13
+ "estimated_questions": 23,
14
+ "notes": "Expected answers and required memory IDs are included, so retrieval quality can be scored without answer generation."
15
+ },
16
+ "schema": {
17
+ "conversation_turns": "Human-readable source transcript.",
18
+ "memory_records": "Canonical memories to ingest. Each record is intentionally atomic and timestamped.",
19
+ "questions": "Evaluation prompts with expected answers, required IDs, forbidden IDs, and category labels."
20
+ },
21
+ "conversations": [
22
+ {
23
+ "conversation_id": "conv_kreasa_firebase",
24
+ "title": "Kreasa Firebase Migration",
25
+ "agent_id": "focused_kreasa_agent",
26
+ "turns": [
27
+ {
28
+ "turn_id": "kf_t01",
29
+ "timestamp": "2026-06-01T09:00:00Z",
30
+ "speaker": "User",
31
+ "text": "Kreasa is my Android AI mentor app. It stores user learning plans in Firestore."
32
+ },
33
+ {
34
+ "turn_id": "kf_t02",
35
+ "timestamp": "2026-06-01T09:03:00Z",
36
+ "speaker": "User",
37
+ "text": "The current client-side Gemini calls must move behind Firebase Callable Functions before release."
38
+ },
39
+ {
40
+ "turn_id": "kf_t03",
41
+ "timestamp": "2026-06-02T10:20:00Z",
42
+ "speaker": "User",
43
+ "text": "Firestore rules were locked down and emulator tests passed locally."
44
+ },
45
+ {
46
+ "turn_id": "kf_t04",
47
+ "timestamp": "2026-06-03T11:00:00Z",
48
+ "speaker": "User",
49
+ "text": "Deployment is blocked because Functions v2 and Secret Manager require Blaze, but the Firebase project is still on Spark."
50
+ },
51
+ {
52
+ "turn_id": "kf_t05",
53
+ "timestamp": "2026-06-04T12:10:00Z",
54
+ "speaker": "User",
55
+ "text": "When explaining cloud blockers, separate local verification from cloud deployment status."
56
+ },
57
+ {
58
+ "turn_id": "kf_t06",
59
+ "timestamp": "2026-06-05T13:25:00Z",
60
+ "speaker": "Assistant",
61
+ "text": "Local Callable Function tests passed on Node 22, but production deploy remains blocked by billing."
62
+ },
63
+ {
64
+ "turn_id": "kf_t07",
65
+ "timestamp": "2026-06-07T15:30:00Z",
66
+ "speaker": "User",
67
+ "text": "Earlier I thought Spark might be enough, but the current decision is to wait for Blaze before deploying Functions v2."
68
+ },
69
+ {
70
+ "turn_id": "kf_t08",
71
+ "timestamp": "2026-06-08T16:40:00Z",
72
+ "speaker": "User",
73
+ "text": "The next safe step is to keep backend migration code ready and document the Blaze requirement."
74
+ }
75
+ ],
76
+ "memory_records": [
77
+ {
78
+ "id": "kf_identity",
79
+ "type": "semantic",
80
+ "timestamp": "2026-06-01T09:00:00Z",
81
+ "content": "Kreasa is the user's Android AI mentor app and stores user learning plans in Firestore.",
82
+ "tags": ["kreasa", "android", "firestore"],
83
+ "importance": 0.84,
84
+ "metadata": { "source_turn_id": "kf_t01" }
85
+ },
86
+ {
87
+ "id": "kf_gemini_backend",
88
+ "type": "procedural",
89
+ "timestamp": "2026-06-01T09:03:00Z",
90
+ "content": "Kreasa must move client-side Gemini calls behind Firebase Callable Functions before release.",
91
+ "tags": ["kreasa", "gemini", "firebase-functions"],
92
+ "importance": 0.9,
93
+ "metadata": { "source_turn_id": "kf_t02" },
94
+ "associations": [{ "target_id": "kf_blaze_blocker", "strength": 0.92 }]
95
+ },
96
+ {
97
+ "id": "kf_rules_verified",
98
+ "type": "episodic",
99
+ "timestamp": "2026-06-02T10:20:00Z",
100
+ "content": "Kreasa Firestore rules were locked down and emulator tests passed locally.",
101
+ "tags": ["kreasa", "firestore", "local-verification"],
102
+ "importance": 0.78,
103
+ "metadata": { "source_turn_id": "kf_t03" }
104
+ },
105
+ {
106
+ "id": "kf_blaze_blocker",
107
+ "type": "semantic",
108
+ "timestamp": "2026-06-03T11:00:00Z",
109
+ "content": "Kreasa production deployment is blocked because Firebase Functions v2 and Secret Manager require Blaze, while the project is still on Spark.",
110
+ "tags": ["kreasa", "blaze", "spark", "deployment-blocker"],
111
+ "importance": 0.96,
112
+ "metadata": { "source_turn_id": "kf_t04" }
113
+ },
114
+ {
115
+ "id": "kf_reporting_preference",
116
+ "type": "semantic",
117
+ "timestamp": "2026-06-04T12:10:00Z",
118
+ "content": "When reporting Kreasa blockers, separate local verification status from cloud deployment status.",
119
+ "tags": ["kreasa", "reporting", "preference"],
120
+ "importance": 0.86,
121
+ "metadata": { "source_turn_id": "kf_t05" }
122
+ },
123
+ {
124
+ "id": "kf_local_functions_verified",
125
+ "type": "episodic",
126
+ "timestamp": "2026-06-05T13:25:00Z",
127
+ "content": "Kreasa Callable Function tests passed locally on Node 22, but production deploy remains blocked by billing.",
128
+ "tags": ["kreasa", "node22", "callable-functions"],
129
+ "importance": 0.82,
130
+ "metadata": { "source_turn_id": "kf_t06" },
131
+ "associations": [{ "target_id": "kf_blaze_blocker", "strength": 0.91 }]
132
+ },
133
+ {
134
+ "id": "kf_current_billing_decision",
135
+ "type": "semantic",
136
+ "timestamp": "2026-06-07T15:30:00Z",
137
+ "content": "Current Kreasa deployment decision: wait for Blaze before deploying Firebase Functions v2.",
138
+ "tags": ["kreasa", "current-decision", "blaze"],
139
+ "importance": 0.94,
140
+ "metadata": { "source_turn_id": "kf_t07" }
141
+ },
142
+ {
143
+ "id": "kf_next_safe_step",
144
+ "type": "procedural",
145
+ "timestamp": "2026-06-08T16:40:00Z",
146
+ "content": "Next safe Kreasa step: keep backend migration code ready and document the Blaze requirement.",
147
+ "tags": ["kreasa", "next-step", "documentation"],
148
+ "importance": 0.88,
149
+ "metadata": { "source_turn_id": "kf_t08" }
150
+ }
151
+ ],
152
+ "questions": [
153
+ {
154
+ "question_id": "kf_q01",
155
+ "category": "multi_hop_association",
156
+ "question": "Why is Kreasa's production deployment blocked?",
157
+ "expected_answer": "Firebase Functions v2 and Secret Manager require Blaze, but the project is still on Spark.",
158
+ "required_memory_ids": ["kf_gemini_backend", "kf_blaze_blocker"],
159
+ "forbidden_memory_ids": [],
160
+ "cutoffs": [5, 10],
161
+ "rationale": "The answer should connect the backend migration to the billing blocker."
162
+ },
163
+ {
164
+ "question_id": "kf_q02",
165
+ "category": "contradiction_resolution",
166
+ "question": "Should we deploy Kreasa Functions v2 while the Firebase project is still on Spark?",
167
+ "expected_answer": "No. The current decision is to wait for Blaze before deploying Functions v2.",
168
+ "required_memory_ids": ["kf_current_billing_decision", "kf_blaze_blocker"],
169
+ "forbidden_memory_ids": [],
170
+ "cutoffs": [5, 10],
171
+ "rationale": "The latest decision should override the earlier uncertainty about Spark."
172
+ },
173
+ {
174
+ "question_id": "kf_q03",
175
+ "category": "context_injection",
176
+ "question": "How should a status report describe Kreasa's progress?",
177
+ "expected_answer": "It should say local verification passed, while cloud deployment is still blocked by Blaze billing.",
178
+ "required_memory_ids": ["kf_rules_verified", "kf_local_functions_verified", "kf_reporting_preference", "kf_blaze_blocker"],
179
+ "forbidden_memory_ids": [],
180
+ "cutoffs": [5, 10],
181
+ "rationale": "A good context block must include both verified local work and the cloud blocker."
182
+ },
183
+ {
184
+ "question_id": "kf_q04",
185
+ "category": "procedural_recall",
186
+ "question": "What is the next safe step for Kreasa?",
187
+ "expected_answer": "Keep backend migration code ready and document the Blaze requirement.",
188
+ "required_memory_ids": ["kf_next_safe_step"],
189
+ "forbidden_memory_ids": [],
190
+ "cutoffs": [5, 10],
191
+ "rationale": "The procedural memory should be retrieved directly."
192
+ }
193
+ ]
194
+ },
195
+ {
196
+ "conversation_id": "conv_onemillionbrain_benchmark",
197
+ "title": "1MBrain Benchmark Diagnosis",
198
+ "agent_id": "focused_1mbrain_agent",
199
+ "turns": [
200
+ {
201
+ "turn_id": "omb_t01",
202
+ "timestamp": "2026-06-18T08:00:00Z",
203
+ "speaker": "User",
204
+ "text": "The 1MBrain LOCOMO benchmark used DeepSeek V4 Flash for answerer and judge."
205
+ },
206
+ {
207
+ "turn_id": "omb_t02",
208
+ "timestamp": "2026-06-18T08:05:00Z",
209
+ "speaker": "User",
210
+ "text": "The run used Postgres and Redis in Docker, with local-keyword embeddings."
211
+ },
212
+ {
213
+ "turn_id": "omb_t03",
214
+ "timestamp": "2026-06-18T21:47:00Z",
215
+ "speaker": "Assistant",
216
+ "text": "The LOCOMO result was top_10 24.61%, top_20 27.66%, and top_50 27.21% over 1540 questions."
217
+ },
218
+ {
219
+ "turn_id": "omb_t04",
220
+ "timestamp": "2026-06-19T07:00:00Z",
221
+ "speaker": "User",
222
+ "text": "We should not treat that as final product quality because the compat endpoint stored raw turns without fact extraction."
223
+ },
224
+ {
225
+ "turn_id": "omb_t05",
226
+ "timestamp": "2026-06-19T07:10:00Z",
227
+ "speaker": "Assistant",
228
+ "text": "A threshold of 0.3 was too strict for local-keyword embeddings; many evidence turns fell below it."
229
+ },
230
+ {
231
+ "turn_id": "omb_t06",
232
+ "timestamp": "2026-06-19T07:15:00Z",
233
+ "speaker": "User",
234
+ "text": "For future fair benchmarks, use semantic embeddings and add date-aware fact extraction before retrieval."
235
+ },
236
+ {
237
+ "turn_id": "omb_t07",
238
+ "timestamp": "2026-06-19T07:20:00Z",
239
+ "speaker": "User",
240
+ "text": "Current model preference for benchmark spending is DeepSeek or local models, not OpenAI API calls."
241
+ },
242
+ {
243
+ "turn_id": "omb_t08",
244
+ "timestamp": "2026-06-19T07:25:00Z",
245
+ "speaker": "Assistant",
246
+ "text": "The Mem0 comparison should be paused if it starts consuming too much API budget."
247
+ },
248
+ {
249
+ "turn_id": "omb_t09",
250
+ "timestamp": "2026-06-19T07:30:00Z",
251
+ "speaker": "User",
252
+ "text": "The useful public message is that 1MBrain needs a focused benchmark before a broad LOCOMO comparison."
253
+ }
254
+ ],
255
+ "memory_records": [
256
+ {
257
+ "id": "omb_deepseek_models",
258
+ "type": "semantic",
259
+ "timestamp": "2026-06-18T08:00:00Z",
260
+ "content": "The 1MBrain LOCOMO benchmark used DeepSeek V4 Flash for answerer and judge.",
261
+ "tags": ["1mbrain", "locomo", "deepseek"],
262
+ "importance": 0.82,
263
+ "metadata": { "source_turn_id": "omb_t01" }
264
+ },
265
+ {
266
+ "id": "omb_local_keyword_setup",
267
+ "type": "semantic",
268
+ "timestamp": "2026-06-18T08:05:00Z",
269
+ "content": "The 1MBrain LOCOMO run used Postgres and Redis in Docker with local-keyword embeddings.",
270
+ "tags": ["1mbrain", "postgres", "redis", "local-keyword"],
271
+ "importance": 0.9,
272
+ "metadata": { "source_turn_id": "omb_t02" }
273
+ },
274
+ {
275
+ "id": "omb_locomo_score",
276
+ "type": "episodic",
277
+ "timestamp": "2026-06-18T21:47:00Z",
278
+ "content": "1MBrain LOCOMO result: top_10 24.61%, top_20 27.66%, top_50 27.21% over 1540 questions.",
279
+ "tags": ["1mbrain", "locomo", "benchmark-score"],
280
+ "importance": 0.93,
281
+ "metadata": { "source_turn_id": "omb_t03" }
282
+ },
283
+ {
284
+ "id": "omb_raw_turn_limitation",
285
+ "type": "semantic",
286
+ "timestamp": "2026-06-19T07:00:00Z",
287
+ "content": "The 1MBrain LOCOMO result should not be treated as final product quality because the compat endpoint stored raw turns without fact extraction.",
288
+ "tags": ["1mbrain", "benchmark-limitation", "fact-extraction"],
289
+ "importance": 0.95,
290
+ "metadata": { "source_turn_id": "omb_t04" },
291
+ "associations": [{ "target_id": "omb_local_keyword_threshold", "strength": 0.88 }]
292
+ },
293
+ {
294
+ "id": "omb_local_keyword_threshold",
295
+ "type": "semantic",
296
+ "timestamp": "2026-06-19T07:10:00Z",
297
+ "content": "A recall threshold of 0.3 was too strict for local-keyword embeddings; many evidence turns fell below it.",
298
+ "tags": ["1mbrain", "threshold", "local-keyword"],
299
+ "importance": 0.91,
300
+ "metadata": { "source_turn_id": "omb_t05" }
301
+ },
302
+ {
303
+ "id": "omb_benchmark_fix",
304
+ "type": "procedural",
305
+ "timestamp": "2026-06-19T07:15:00Z",
306
+ "content": "For a fair 1MBrain benchmark, use semantic embeddings and add date-aware fact extraction before retrieval.",
307
+ "tags": ["1mbrain", "benchmark-plan", "semantic-embedding"],
308
+ "importance": 0.95,
309
+ "metadata": { "source_turn_id": "omb_t06" }
310
+ },
311
+ {
312
+ "id": "omb_current_model_preference",
313
+ "type": "semantic",
314
+ "timestamp": "2026-06-19T07:20:00Z",
315
+ "content": "Current benchmark spending preference: use DeepSeek or local models, not OpenAI API calls.",
316
+ "tags": ["preference", "deepseek", "local-models", "api-budget"],
317
+ "importance": 0.94,
318
+ "metadata": { "source_turn_id": "omb_t07" }
319
+ },
320
+ {
321
+ "id": "omb_pause_mem0_budget",
322
+ "type": "procedural",
323
+ "timestamp": "2026-06-19T07:25:00Z",
324
+ "content": "Pause Mem0 comparison runs if they start consuming too much API budget.",
325
+ "tags": ["mem0", "api-budget", "procedure"],
326
+ "importance": 0.86,
327
+ "metadata": { "source_turn_id": "omb_t08" }
328
+ },
329
+ {
330
+ "id": "omb_public_message",
331
+ "type": "semantic",
332
+ "timestamp": "2026-06-19T07:30:00Z",
333
+ "content": "Useful public benchmark message: 1MBrain needs a focused benchmark before broad LOCOMO comparison.",
334
+ "tags": ["1mbrain", "github", "benchmark-positioning"],
335
+ "importance": 0.88,
336
+ "metadata": { "source_turn_id": "omb_t09" }
337
+ }
338
+ ],
339
+ "questions": [
340
+ {
341
+ "question_id": "omb_q01",
342
+ "category": "atomic_fact_recall",
343
+ "question": "What scores did 1MBrain get on the LOCOMO run?",
344
+ "expected_answer": "top_10 24.61%, top_20 27.66%, top_50 27.21% over 1540 questions.",
345
+ "required_memory_ids": ["omb_locomo_score"],
346
+ "forbidden_memory_ids": [],
347
+ "cutoffs": [5, 10],
348
+ "rationale": "The score memory is atomic and should be directly retrieved."
349
+ },
350
+ {
351
+ "question_id": "omb_q02",
352
+ "category": "root_cause_recall",
353
+ "question": "Why should the low LOCOMO score not be treated as final 1MBrain quality?",
354
+ "expected_answer": "Because the benchmark stored raw turns without fact extraction and used weak local-keyword retrieval with a strict threshold.",
355
+ "required_memory_ids": ["omb_raw_turn_limitation", "omb_local_keyword_threshold"],
356
+ "forbidden_memory_ids": [],
357
+ "cutoffs": [5, 10],
358
+ "rationale": "The answer requires combining two diagnostic memories."
359
+ },
360
+ {
361
+ "question_id": "omb_q03",
362
+ "category": "procedural_recall",
363
+ "question": "What should change before a fair public benchmark?",
364
+ "expected_answer": "Use semantic embeddings and date-aware fact extraction before retrieval.",
365
+ "required_memory_ids": ["omb_benchmark_fix"],
366
+ "forbidden_memory_ids": [],
367
+ "cutoffs": [5, 10],
368
+ "rationale": "The procedure is stored as a dedicated memory."
369
+ },
370
+ {
371
+ "question_id": "omb_q04",
372
+ "category": "current_preference",
373
+ "question": "Should a benchmark run assume OpenAI API calls are acceptable?",
374
+ "expected_answer": "No. The current preference is DeepSeek or local models, not OpenAI API calls.",
375
+ "required_memory_ids": ["omb_current_model_preference"],
376
+ "forbidden_memory_ids": [],
377
+ "cutoffs": [5, 10],
378
+ "rationale": "The latest explicit preference should control the answer."
379
+ },
380
+ {
381
+ "question_id": "omb_q05",
382
+ "category": "noise_resistance",
383
+ "question": "What is the public message about broad LOCOMO comparison?",
384
+ "expected_answer": "1MBrain needs a focused benchmark before broad LOCOMO comparison.",
385
+ "required_memory_ids": ["omb_public_message"],
386
+ "forbidden_memory_ids": ["omb_locomo_score"],
387
+ "cutoffs": [5, 10],
388
+ "rationale": "The benchmark should not confuse score reporting with positioning."
389
+ }
390
+ ]
391
+ },
392
+ {
393
+ "conversation_id": "conv_hermes_sdk",
394
+ "title": "Hermes Adapter and SDK Workflow",
395
+ "agent_id": "focused_hermes_agent",
396
+ "turns": [
397
+ {
398
+ "turn_id": "ha_t01",
399
+ "timestamp": "2026-06-10T10:00:00Z",
400
+ "speaker": "User",
401
+ "text": "Hermes should remember Q&A turns as episodic memories."
402
+ },
403
+ {
404
+ "turn_id": "ha_t02",
405
+ "timestamp": "2026-06-10T10:05:00Z",
406
+ "speaker": "User",
407
+ "text": "Hermes preferences should be semantic memories with high importance."
408
+ },
409
+ {
410
+ "turn_id": "ha_t03",
411
+ "timestamp": "2026-06-10T10:10:00Z",
412
+ "speaker": "User",
413
+ "text": "Procedures like release steps should be procedural memories."
414
+ },
415
+ {
416
+ "turn_id": "ha_t04",
417
+ "timestamp": "2026-06-10T10:15:00Z",
418
+ "speaker": "User",
419
+ "text": "The TypeScript SDK exports a Hermes adapter from @1mbrain/sdk/hermes."
420
+ },
421
+ {
422
+ "turn_id": "ha_t05",
423
+ "timestamp": "2026-06-10T10:20:00Z",
424
+ "speaker": "User",
425
+ "text": "The adapter can build an LLM context block from recalled memory."
426
+ },
427
+ {
428
+ "turn_id": "ha_t06",
429
+ "timestamp": "2026-06-10T10:25:00Z",
430
+ "speaker": "User",
431
+ "text": "When using Hermes, recall history for chat continuity, facts for preferences, and procedures for repeatable workflows."
432
+ },
433
+ {
434
+ "turn_id": "ha_t07",
435
+ "timestamp": "2026-06-12T09:00:00Z",
436
+ "speaker": "User",
437
+ "text": "Earlier the SDK package was publish-pending; currently it is locally implemented and tested, but npm publish still needs credentials."
438
+ },
439
+ {
440
+ "turn_id": "ha_t08",
441
+ "timestamp": "2026-06-12T09:20:00Z",
442
+ "speaker": "User",
443
+ "text": "The quickstart should help users integrate 1MBrain into an agent in under 30 minutes."
444
+ }
445
+ ],
446
+ "memory_records": [
447
+ {
448
+ "id": "ha_episodic_turns",
449
+ "type": "semantic",
450
+ "timestamp": "2026-06-10T10:00:00Z",
451
+ "content": "Hermes should store Q&A turns as episodic memories.",
452
+ "tags": ["hermes", "episodic", "adapter"],
453
+ "importance": 0.86,
454
+ "metadata": { "source_turn_id": "ha_t01" }
455
+ },
456
+ {
457
+ "id": "ha_preference_semantic",
458
+ "type": "semantic",
459
+ "timestamp": "2026-06-10T10:05:00Z",
460
+ "content": "Hermes preferences should be semantic memories with high importance.",
461
+ "tags": ["hermes", "semantic", "preferences"],
462
+ "importance": 0.9,
463
+ "metadata": { "source_turn_id": "ha_t02" }
464
+ },
465
+ {
466
+ "id": "ha_procedural_workflows",
467
+ "type": "procedural",
468
+ "timestamp": "2026-06-10T10:10:00Z",
469
+ "content": "Repeatable Hermes workflows, such as release steps, should be stored as procedural memories.",
470
+ "tags": ["hermes", "procedural", "workflow"],
471
+ "importance": 0.88,
472
+ "metadata": { "source_turn_id": "ha_t03" }
473
+ },
474
+ {
475
+ "id": "ha_sdk_entrypoint",
476
+ "type": "semantic",
477
+ "timestamp": "2026-06-10T10:15:00Z",
478
+ "content": "The TypeScript SDK exports the Hermes adapter from @1mbrain/sdk/hermes.",
479
+ "tags": ["hermes", "typescript-sdk", "entrypoint"],
480
+ "importance": 0.87,
481
+ "metadata": { "source_turn_id": "ha_t04" }
482
+ },
483
+ {
484
+ "id": "ha_context_builder",
485
+ "type": "procedural",
486
+ "timestamp": "2026-06-10T10:20:00Z",
487
+ "content": "The Hermes adapter can build an LLM context block from recalled memory.",
488
+ "tags": ["hermes", "context-injection"],
489
+ "importance": 0.89,
490
+ "metadata": { "source_turn_id": "ha_t05" }
491
+ },
492
+ {
493
+ "id": "ha_recall_routing",
494
+ "type": "procedural",
495
+ "timestamp": "2026-06-10T10:25:00Z",
496
+ "content": "Hermes recall routing: use history for chat continuity, facts for preferences, and procedures for repeatable workflows.",
497
+ "tags": ["hermes", "recall-routing"],
498
+ "importance": 0.92,
499
+ "metadata": { "source_turn_id": "ha_t06" },
500
+ "associations": [
501
+ { "target_id": "ha_episodic_turns", "strength": 0.9 },
502
+ { "target_id": "ha_preference_semantic", "strength": 0.9 },
503
+ { "target_id": "ha_procedural_workflows", "strength": 0.9 }
504
+ ]
505
+ },
506
+ {
507
+ "id": "ha_publish_status",
508
+ "type": "semantic",
509
+ "timestamp": "2026-06-12T09:00:00Z",
510
+ "content": "The TypeScript SDK is locally implemented and tested, but npm publish still needs credentials.",
511
+ "tags": ["typescript-sdk", "publish", "credentials"],
512
+ "importance": 0.85,
513
+ "metadata": { "source_turn_id": "ha_t07" }
514
+ },
515
+ {
516
+ "id": "ha_quickstart_goal",
517
+ "type": "semantic",
518
+ "timestamp": "2026-06-12T09:20:00Z",
519
+ "content": "The SDK quickstart goal is to help users integrate 1MBrain into an agent in under 30 minutes.",
520
+ "tags": ["sdk", "quickstart", "integration"],
521
+ "importance": 0.82,
522
+ "metadata": { "source_turn_id": "ha_t08" }
523
+ }
524
+ ],
525
+ "questions": [
526
+ {
527
+ "question_id": "ha_q01",
528
+ "category": "procedural_recall",
529
+ "question": "How should Hermes route different recall needs?",
530
+ "expected_answer": "Use history for chat continuity, facts for preferences, and procedures for repeatable workflows.",
531
+ "required_memory_ids": ["ha_recall_routing"],
532
+ "forbidden_memory_ids": [],
533
+ "cutoffs": [5, 10],
534
+ "rationale": "This tests exact procedural recall."
535
+ },
536
+ {
537
+ "question_id": "ha_q02",
538
+ "category": "multi_hop_association",
539
+ "question": "Which memory types should Hermes use for turns, preferences, and workflows?",
540
+ "expected_answer": "Q&A turns are episodic, preferences are semantic, and repeatable workflows are procedural.",
541
+ "required_memory_ids": ["ha_recall_routing", "ha_episodic_turns", "ha_preference_semantic", "ha_procedural_workflows"],
542
+ "forbidden_memory_ids": [],
543
+ "cutoffs": [5, 10],
544
+ "rationale": "The routing memory should activate its associated type memories."
545
+ },
546
+ {
547
+ "question_id": "ha_q03",
548
+ "category": "atomic_fact_recall",
549
+ "question": "What package entry point exports the Hermes adapter?",
550
+ "expected_answer": "@1mbrain/sdk/hermes",
551
+ "required_memory_ids": ["ha_sdk_entrypoint"],
552
+ "forbidden_memory_ids": [],
553
+ "cutoffs": [5, 10],
554
+ "rationale": "This is a direct fact lookup."
555
+ },
556
+ {
557
+ "question_id": "ha_q04",
558
+ "category": "temporal_update",
559
+ "question": "Is the TypeScript SDK already published to npm?",
560
+ "expected_answer": "No. It is locally implemented and tested, but npm publish still needs credentials.",
561
+ "required_memory_ids": ["ha_publish_status"],
562
+ "forbidden_memory_ids": [],
563
+ "cutoffs": [5, 10],
564
+ "rationale": "The answer should reflect current publish status."
565
+ }
566
+ ]
567
+ },
568
+ {
569
+ "conversation_id": "conv_preferences_and_noise",
570
+ "title": "User Preferences With Distractors",
571
+ "agent_id": "focused_preferences_agent",
572
+ "turns": [
573
+ {
574
+ "turn_id": "pn_t01",
575
+ "timestamp": "2026-06-01T08:00:00Z",
576
+ "speaker": "User",
577
+ "text": "For coding tasks, move slowly and robustly: verify one milestone before the next."
578
+ },
579
+ {
580
+ "turn_id": "pn_t02",
581
+ "timestamp": "2026-06-01T08:05:00Z",
582
+ "speaker": "User",
583
+ "text": "Do not blur local verification with cloud deployment blockers."
584
+ },
585
+ {
586
+ "turn_id": "pn_t03",
587
+ "timestamp": "2026-06-01T08:10:00Z",
588
+ "speaker": "User",
589
+ "text": "When asked for a review, lead with concrete findings and file references."
590
+ },
591
+ {
592
+ "turn_id": "pn_t04",
593
+ "timestamp": "2026-06-02T09:00:00Z",
594
+ "speaker": "Assistant",
595
+ "text": "Noise note: The dashboard theme used dark visual styling and D3 graph rendering."
596
+ },
597
+ {
598
+ "turn_id": "pn_t05",
599
+ "timestamp": "2026-06-02T09:05:00Z",
600
+ "speaker": "Assistant",
601
+ "text": "Noise note: A test fixture mentioned purple buttons, but that was unrelated to benchmark reporting."
602
+ },
603
+ {
604
+ "turn_id": "pn_t06",
605
+ "timestamp": "2026-06-03T10:00:00Z",
606
+ "speaker": "User",
607
+ "text": "The project checkpoint that future sessions should read first is task.md."
608
+ },
609
+ {
610
+ "turn_id": "pn_t07",
611
+ "timestamp": "2026-06-04T11:00:00Z",
612
+ "speaker": "User",
613
+ "text": "If a root build stalls, switch to changed workspaces instead of insisting on the broad build."
614
+ },
615
+ {
616
+ "turn_id": "pn_t08",
617
+ "timestamp": "2026-06-05T12:00:00Z",
618
+ "speaker": "User",
619
+ "text": "Older preference: long exploratory reports were acceptable. Current preference: concise verified summaries."
620
+ },
621
+ {
622
+ "turn_id": "pn_t09",
623
+ "timestamp": "2026-06-06T13:00:00Z",
624
+ "speaker": "User",
625
+ "text": "Use repo task files as the durable handoff artifact when asked to preserve progress."
626
+ }
627
+ ],
628
+ "memory_records": [
629
+ {
630
+ "id": "pn_slow_robust",
631
+ "type": "semantic",
632
+ "timestamp": "2026-06-01T08:00:00Z",
633
+ "content": "For coding tasks, the user prefers moving slowly and robustly, verifying one milestone before the next.",
634
+ "tags": ["preference", "coding", "verification"],
635
+ "importance": 0.9,
636
+ "metadata": { "source_turn_id": "pn_t01" }
637
+ },
638
+ {
639
+ "id": "pn_cloud_boundary",
640
+ "type": "semantic",
641
+ "timestamp": "2026-06-01T08:05:00Z",
642
+ "content": "Do not blur local verification with cloud deployment blockers.",
643
+ "tags": ["preference", "verification", "cloud"],
644
+ "importance": 0.9,
645
+ "metadata": { "source_turn_id": "pn_t02" }
646
+ },
647
+ {
648
+ "id": "pn_review_style",
649
+ "type": "semantic",
650
+ "timestamp": "2026-06-01T08:10:00Z",
651
+ "content": "For code reviews, lead with concrete findings and file references.",
652
+ "tags": ["preference", "code-review"],
653
+ "importance": 0.86,
654
+ "metadata": { "source_turn_id": "pn_t03" }
655
+ },
656
+ {
657
+ "id": "pn_dashboard_noise",
658
+ "type": "episodic",
659
+ "timestamp": "2026-06-02T09:00:00Z",
660
+ "content": "Noise: the dashboard theme used dark visual styling and D3 graph rendering.",
661
+ "tags": ["noise", "dashboard"],
662
+ "importance": 0.3,
663
+ "metadata": { "source_turn_id": "pn_t04" }
664
+ },
665
+ {
666
+ "id": "pn_purple_noise",
667
+ "type": "episodic",
668
+ "timestamp": "2026-06-02T09:05:00Z",
669
+ "content": "Noise: a test fixture mentioned purple buttons, unrelated to benchmark reporting.",
670
+ "tags": ["noise", "ui"],
671
+ "importance": 0.2,
672
+ "metadata": { "source_turn_id": "pn_t05" }
673
+ },
674
+ {
675
+ "id": "pn_task_checkpoint",
676
+ "type": "procedural",
677
+ "timestamp": "2026-06-03T10:00:00Z",
678
+ "content": "In this repo, future sessions should read task.md first as the project checkpoint.",
679
+ "tags": ["procedure", "task-md", "checkpoint"],
680
+ "importance": 0.92,
681
+ "metadata": { "source_turn_id": "pn_t06" }
682
+ },
683
+ {
684
+ "id": "pn_workspace_builds",
685
+ "type": "procedural",
686
+ "timestamp": "2026-06-04T11:00:00Z",
687
+ "content": "If a root build stalls, switch to verifying changed workspaces instead of insisting on the broad build.",
688
+ "tags": ["procedure", "build", "workspace"],
689
+ "importance": 0.84,
690
+ "metadata": { "source_turn_id": "pn_t07" }
691
+ },
692
+ {
693
+ "id": "pn_old_verbose",
694
+ "type": "semantic",
695
+ "timestamp": "2026-06-01T07:00:00Z",
696
+ "content": "Older preference: long exploratory reports were acceptable.",
697
+ "tags": ["preference", "old", "reporting"],
698
+ "importance": 0.35,
699
+ "metadata": { "source_turn_id": "pn_t08", "superseded_by": "pn_current_concise" }
700
+ },
701
+ {
702
+ "id": "pn_current_concise",
703
+ "type": "semantic",
704
+ "timestamp": "2026-06-05T12:00:00Z",
705
+ "content": "Current reporting preference: concise verified summaries.",
706
+ "tags": ["preference", "current", "reporting"],
707
+ "importance": 0.94,
708
+ "metadata": { "source_turn_id": "pn_t08" }
709
+ },
710
+ {
711
+ "id": "pn_handoff_artifact",
712
+ "type": "procedural",
713
+ "timestamp": "2026-06-06T13:00:00Z",
714
+ "content": "Use repo task files as the durable handoff artifact when asked to preserve progress.",
715
+ "tags": ["procedure", "handoff", "task-file"],
716
+ "importance": 0.9,
717
+ "metadata": { "source_turn_id": "pn_t09" }
718
+ }
719
+ ],
720
+ "questions": [
721
+ {
722
+ "question_id": "pn_q01",
723
+ "category": "current_preference",
724
+ "question": "How should reports be written now?",
725
+ "expected_answer": "Use concise verified summaries.",
726
+ "required_memory_ids": ["pn_current_concise"],
727
+ "forbidden_memory_ids": ["pn_old_verbose"],
728
+ "cutoffs": [5, 10],
729
+ "rationale": "The current preference should outrank the older one."
730
+ },
731
+ {
732
+ "question_id": "pn_q02",
733
+ "category": "procedural_recall",
734
+ "question": "What file should future sessions read first in this repo?",
735
+ "expected_answer": "task.md",
736
+ "required_memory_ids": ["pn_task_checkpoint"],
737
+ "forbidden_memory_ids": [],
738
+ "cutoffs": [5, 10],
739
+ "rationale": "This tests durable checkpoint recall."
740
+ },
741
+ {
742
+ "question_id": "pn_q03",
743
+ "category": "noise_resistance",
744
+ "question": "What should a blocker report avoid mixing together?",
745
+ "expected_answer": "It should not blur local verification with cloud deployment blockers.",
746
+ "required_memory_ids": ["pn_cloud_boundary"],
747
+ "forbidden_memory_ids": ["pn_dashboard_noise", "pn_purple_noise"],
748
+ "cutoffs": [5, 10],
749
+ "rationale": "Noise records share unrelated UI/reporting language and should not dominate."
750
+ },
751
+ {
752
+ "question_id": "pn_q04",
753
+ "category": "procedural_recall",
754
+ "question": "What should happen if a root build stalls?",
755
+ "expected_answer": "Switch to verifying changed workspaces instead of insisting on the broad build.",
756
+ "required_memory_ids": ["pn_workspace_builds"],
757
+ "forbidden_memory_ids": [],
758
+ "cutoffs": [5, 10],
759
+ "rationale": "This is an operational procedure."
760
+ },
761
+ {
762
+ "question_id": "pn_q05",
763
+ "category": "review_behavior",
764
+ "question": "How should a code review response be structured?",
765
+ "expected_answer": "Lead with concrete findings and file references.",
766
+ "required_memory_ids": ["pn_review_style"],
767
+ "forbidden_memory_ids": [],
768
+ "cutoffs": [5, 10],
769
+ "rationale": "The memory should influence agent behavior, not only retrieval."
770
+ }
771
+ ]
772
+ },
773
+ {
774
+ "conversation_id": "conv_passport_recovery",
775
+ "title": "Passport Export and Recovery",
776
+ "agent_id": "focused_passport_agent",
777
+ "turns": [
778
+ {
779
+ "turn_id": "pr_t01",
780
+ "timestamp": "2026-06-11T08:00:00Z",
781
+ "speaker": "User",
782
+ "text": "A Memory Passport must preserve raw memory content and association graph edges."
783
+ },
784
+ {
785
+ "turn_id": "pr_t02",
786
+ "timestamp": "2026-06-11T08:05:00Z",
787
+ "speaker": "User",
788
+ "text": "Exports should strip raw vectors because embeddings can be regenerated after import."
789
+ },
790
+ {
791
+ "turn_id": "pr_t03",
792
+ "timestamp": "2026-06-11T08:10:00Z",
793
+ "speaker": "User",
794
+ "text": "Encrypted exports use gzip plus AES-256-GCM."
795
+ },
796
+ {
797
+ "turn_id": "pr_t04",
798
+ "timestamp": "2026-06-11T08:15:00Z",
799
+ "speaker": "User",
800
+ "text": "After import into a new agent namespace, linked project context should still answer multi-hop questions."
801
+ },
802
+ {
803
+ "turn_id": "pr_t05",
804
+ "timestamp": "2026-06-11T08:20:00Z",
805
+ "speaker": "Assistant",
806
+ "text": "The graph edge Project Atlas -> backup workflow -> encryption method should survive export and import."
807
+ },
808
+ {
809
+ "turn_id": "pr_t06",
810
+ "timestamp": "2026-06-11T08:25:00Z",
811
+ "speaker": "User",
812
+ "text": "If an import sees duplicate content, the conservative default is skip duplicates."
813
+ }
814
+ ],
815
+ "memory_records": [
816
+ {
817
+ "id": "pr_passport_preserves_content",
818
+ "type": "semantic",
819
+ "timestamp": "2026-06-11T08:00:00Z",
820
+ "content": "A Memory Passport must preserve raw memory content and association graph edges.",
821
+ "tags": ["passport", "associations"],
822
+ "importance": 0.92,
823
+ "metadata": { "source_turn_id": "pr_t01" },
824
+ "associations": [{ "target_id": "pr_import_multihop", "strength": 0.86 }]
825
+ },
826
+ {
827
+ "id": "pr_strip_vectors",
828
+ "type": "procedural",
829
+ "timestamp": "2026-06-11T08:05:00Z",
830
+ "content": "Memory Passport exports should strip raw vectors because embeddings can be regenerated after import.",
831
+ "tags": ["passport", "vectors", "import"],
832
+ "importance": 0.88,
833
+ "metadata": { "source_turn_id": "pr_t02" }
834
+ },
835
+ {
836
+ "id": "pr_encryption_method",
837
+ "type": "semantic",
838
+ "timestamp": "2026-06-11T08:10:00Z",
839
+ "content": "Encrypted Memory Passport exports use gzip plus AES-256-GCM.",
840
+ "tags": ["passport", "encryption", "gzip"],
841
+ "importance": 0.9,
842
+ "metadata": { "source_turn_id": "pr_t03" }
843
+ },
844
+ {
845
+ "id": "pr_import_multihop",
846
+ "type": "semantic",
847
+ "timestamp": "2026-06-11T08:15:00Z",
848
+ "content": "After Memory Passport import into a new agent namespace, linked project context should still answer multi-hop questions.",
849
+ "tags": ["passport", "import", "multi-hop"],
850
+ "importance": 0.94,
851
+ "metadata": { "source_turn_id": "pr_t04" },
852
+ "associations": [{ "target_id": "pr_atlas_graph_edge", "strength": 0.88 }]
853
+ },
854
+ {
855
+ "id": "pr_atlas_graph_edge",
856
+ "type": "semantic",
857
+ "timestamp": "2026-06-11T08:20:00Z",
858
+ "content": "The graph edge Project Atlas -> backup workflow -> encryption method should survive export and import.",
859
+ "tags": ["passport", "atlas", "graph"],
860
+ "importance": 0.89,
861
+ "metadata": { "source_turn_id": "pr_t05" },
862
+ "associations": [{ "target_id": "pr_encryption_method", "strength": 0.9 }]
863
+ },
864
+ {
865
+ "id": "pr_duplicate_strategy",
866
+ "type": "procedural",
867
+ "timestamp": "2026-06-11T08:25:00Z",
868
+ "content": "If Memory Passport import sees duplicate content, the conservative default is to skip duplicates.",
869
+ "tags": ["passport", "duplicates", "import"],
870
+ "importance": 0.84,
871
+ "metadata": { "source_turn_id": "pr_t06" }
872
+ }
873
+ ],
874
+ "questions": [
875
+ {
876
+ "question_id": "pr_q01",
877
+ "category": "portability",
878
+ "question": "What must a Memory Passport preserve?",
879
+ "expected_answer": "Raw memory content and association graph edges.",
880
+ "required_memory_ids": ["pr_passport_preserves_content"],
881
+ "forbidden_memory_ids": [],
882
+ "cutoffs": [5, 10],
883
+ "rationale": "This tests direct portability semantics."
884
+ },
885
+ {
886
+ "question_id": "pr_q02",
887
+ "category": "portability",
888
+ "question": "Should Memory Passport exports keep raw vectors?",
889
+ "expected_answer": "No. Raw vectors should be stripped because embeddings can be regenerated after import.",
890
+ "required_memory_ids": ["pr_strip_vectors"],
891
+ "forbidden_memory_ids": [],
892
+ "cutoffs": [5, 10],
893
+ "rationale": "The correct answer should avoid preserving embedding vectors."
894
+ },
895
+ {
896
+ "question_id": "pr_q03",
897
+ "category": "multi_hop_association",
898
+ "question": "For Project Atlas, what encryption method should survive export and import through the graph?",
899
+ "expected_answer": "gzip plus AES-256-GCM.",
900
+ "required_memory_ids": ["pr_atlas_graph_edge", "pr_encryption_method"],
901
+ "forbidden_memory_ids": [],
902
+ "cutoffs": [5, 10],
903
+ "rationale": "The answer should traverse Atlas -> graph edge -> encryption method."
904
+ },
905
+ {
906
+ "question_id": "pr_q04",
907
+ "category": "procedural_recall",
908
+ "question": "What should import do with duplicate content by default?",
909
+ "expected_answer": "Skip duplicates.",
910
+ "required_memory_ids": ["pr_duplicate_strategy"],
911
+ "forbidden_memory_ids": [],
912
+ "cutoffs": [5, 10],
913
+ "rationale": "This tests conflict strategy recall."
914
+ },
915
+ {
916
+ "question_id": "pr_q05",
917
+ "category": "portability",
918
+ "question": "After importing into a new namespace, what should still work?",
919
+ "expected_answer": "Linked project context should still answer multi-hop questions.",
920
+ "required_memory_ids": ["pr_import_multihop", "pr_passport_preserves_content"],
921
+ "forbidden_memory_ids": [],
922
+ "cutoffs": [5, 10],
923
+ "rationale": "This question is meant for export/import preservation scoring."
924
+ }
925
+ ]
926
+ }
927
+ ]
928
+ }